mirror of
https://github.com/RWejlgaard/pez-infra.git
synced 2026-05-06 04:14:43 +00:00
Grafana 12 requires explicit relativeTimeRange on all alert rule data
queries. Without it, queries default to {from: 0, to: 0} which is
rejected as invalid, causing Grafana to crash on startup during
alerting provisioning.
Added relativeTimeRange to all data entries:
- Prometheus queries: {from: 600, to: 0} (10 min lookback)
- Expression refs: {from: 0, to: 0}
This was preventing Grafana from starting on london-a, which meant
alerts (including Host Down for copenhagen-a) could never auto-resolve.
413 lines
12 KiB
YAML
413 lines
12 KiB
YAML
---
|
|
apiVersion: 1
|
|
|
|
# Tier 1 — Critical alerts. These page PagerDuty.
|
|
# Datasource UID: bezqqznn81wqof (Prometheus on london-a)
|
|
# All alerts use reduce+threshold (not classic_conditions) so $labels.* and $values.<refId>.Value are available in annotations ($value itself is a formatted string, not a number).
|
|
|
|
groups:
|
|
# Group "critical-availability": host reachability, disk, memory and SMART
# health rules, evaluated every minute. Every rule follows the same pipeline:
#   A = instant Prometheus query, B = reduce(A, last), C = threshold(B).
- orgId: 1
  name: critical-availability
  folder: Alerting
  interval: 1m
  rules:
    # Host Down: condition holds when up{job="node_exporter"} < 1; fires after
    # 2 minutes. Missing data and query errors are treated as alerting too.
    - uid: cff6uy1tufj0ge
      title: Host Down
      condition: C
      data:
        # A: instant Prometheus query with a 600 s (10 min) lookback.
        - refId: A
          datasourceUid: bezqqznn81wqof
          relativeTimeRange:
            from: 600
            to: 0
          model:
            refId: A
            expr: 'up{job="node_exporter"}'
            instant: true
            intervalMs: 1000
            maxDataPoints: 43200
        # B: reduce each series of A to its last value.
        - refId: B
          datasourceUid: __expr__
          relativeTimeRange:
            from: 0
            to: 0
          model:
            refId: B
            type: reduce
            datasource:
              type: __expr__
              uid: __expr__
            expression: A
            reducer: last
            settings:
              mode: ''
        # C: threshold on B; condition holds when B < 1.
        - refId: C
          datasourceUid: __expr__
          relativeTimeRange:
            from: 0
            to: 0
          model:
            refId: C
            type: threshold
            datasource:
              type: __expr__
              uid: __expr__
            expression: B
            conditions:
              - type: query
                evaluator:
                  type: lt
                  params: [1]
                operator:
                  type: and
                query:
                  params: [C]
                reducer:
                  type: last
                  params: []
      noDataState: Alerting
      execErrState: Alerting
      for: 2m
      annotations:
        summary: 'Host {{ $labels.server }} is down'
        description: 'Node exporter on {{ $labels.server }} ({{ $labels.instance }}) has been unreachable for 2+ minutes.'
      labels:
        severity: critical
      isPaused: false

    # Disk Usage Critical: used-space percentage per filesystem (pseudo
    # filesystems excluded by fstype) above 95 for 5 minutes.
    - uid: aff6uy1vxchdse
      title: Disk Usage Critical (>95%)
      condition: C
      data:
        # A: (size - avail) / size * 100, instant query, 600 s lookback.
        - refId: A
          datasourceUid: bezqqznn81wqof
          relativeTimeRange:
            from: 600
            to: 0
          model:
            refId: A
            expr: |
              (
                node_filesystem_size_bytes{job="node_exporter", fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
                - node_filesystem_avail_bytes{job="node_exporter", fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
              )
              / node_filesystem_size_bytes{job="node_exporter", fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
              * 100
            instant: true
            intervalMs: 1000
            maxDataPoints: 43200
        # B: reduce each series of A to its last value.
        - refId: B
          datasourceUid: __expr__
          relativeTimeRange:
            from: 0
            to: 0
          model:
            refId: B
            type: reduce
            datasource:
              type: __expr__
              uid: __expr__
            expression: A
            reducer: last
            settings:
              mode: ''
        # C: threshold on B; condition holds when B > 95.
        - refId: C
          datasourceUid: __expr__
          relativeTimeRange:
            from: 0
            to: 0
          model:
            refId: C
            type: threshold
            datasource:
              type: __expr__
              uid: __expr__
            expression: B
            conditions:
              - type: query
                evaluator:
                  type: gt
                  params: [95]
                operator:
                  type: and
                query:
                  params: [C]
                reducer:
                  type: last
                  params: []
      noDataState: NoData
      execErrState: Error
      for: 5m
      annotations:
        summary: 'Disk critically full on {{ $labels.server }}'
        # For Grafana-managed rules $value is a formatted string listing every
        # query/expression value, so piping it into printf "%.1f" cannot render
        # a number. Read the float from the reduce step (refId B) instead.
        description: 'Filesystem {{ $labels.mountpoint }} on {{ $labels.server }} is over 95% full (currently {{ printf "%.1f" $values.B.Value }}%).'
      labels:
        severity: critical
      isPaused: false

    # Memory Usage Critical: (1 - MemAvailable / MemTotal) * 100 above 95 for
    # 5 minutes.
    - uid: aff6uy1xq9udca
      title: Memory Usage Critical (>95%)
      condition: C
      data:
        # A: used-memory percentage, instant query, 600 s lookback.
        - refId: A
          datasourceUid: bezqqznn81wqof
          relativeTimeRange:
            from: 600
            to: 0
          model:
            refId: A
            expr: |
              (1 - (node_memory_MemAvailable_bytes{job="node_exporter"} / node_memory_MemTotal_bytes{job="node_exporter"})) * 100
            instant: true
            intervalMs: 1000
            maxDataPoints: 43200
        # B: reduce each series of A to its last value.
        - refId: B
          datasourceUid: __expr__
          relativeTimeRange:
            from: 0
            to: 0
          model:
            refId: B
            type: reduce
            datasource:
              type: __expr__
              uid: __expr__
            expression: A
            reducer: last
            settings:
              mode: ''
        # C: threshold on B; condition holds when B > 95.
        - refId: C
          datasourceUid: __expr__
          relativeTimeRange:
            from: 0
            to: 0
          model:
            refId: C
            type: threshold
            datasource:
              type: __expr__
              uid: __expr__
            expression: B
            conditions:
              - type: query
                evaluator:
                  type: gt
                  params: [95]
                operator:
                  type: and
                query:
                  params: [C]
                reducer:
                  type: last
                  params: []
      noDataState: NoData
      execErrState: Error
      for: 5m
      annotations:
        summary: 'Memory critically low on {{ $labels.server }}'
        description: 'Memory usage on {{ $labels.server }} ({{ $labels.instance }}) is above 95% for 5+ minutes.'
      labels:
        severity: critical
      isPaused: false

    # SMART Disk Health Failure: condition holds when
    # smartctl_device_smart_status < 1 (presumably 1 = healthy; confirm against
    # the exporter). `for: 0m` means there is no pending period.
    - uid: fff6uy219mo00e
      title: SMART Disk Health Failure (london-b)
      condition: C
      data:
        # A: instant Prometheus query with a 600 s lookback.
        - refId: A
          datasourceUid: bezqqznn81wqof
          relativeTimeRange:
            from: 600
            to: 0
          model:
            refId: A
            expr: 'smartctl_device_smart_status{job="smartmontools"}'
            instant: true
            intervalMs: 1000
            maxDataPoints: 43200
        # B: reduce each series of A to its last value.
        - refId: B
          datasourceUid: __expr__
          relativeTimeRange:
            from: 0
            to: 0
          model:
            refId: B
            type: reduce
            datasource:
              type: __expr__
              uid: __expr__
            expression: A
            reducer: last
            settings:
              mode: ''
        # C: threshold on B; condition holds when B < 1.
        - refId: C
          datasourceUid: __expr__
          relativeTimeRange:
            from: 0
            to: 0
          model:
            refId: C
            type: threshold
            datasource:
              type: __expr__
              uid: __expr__
            expression: B
            conditions:
              - type: query
                evaluator:
                  type: lt
                  params: [1]
                operator:
                  type: and
                query:
                  params: [C]
                reducer:
                  type: last
                  params: []
      noDataState: NoData
      execErrState: Error
      for: 0m
      annotations:
        summary: 'Disk SMART health failure on london-b'
        description: 'Drive {{ $labels.device }} on london-b reports SMART health failure. Check immediately.'
      labels:
        severity: critical
      isPaused: false
|
|
|
|
# Group "critical-caddy": reachability of the Caddy scrape target, evaluated
# every minute. Pipeline: A = instant Prometheus query, B = reduce(A, last),
# C = threshold(B).
- orgId: 1
  name: critical-caddy
  folder: Alerting
  interval: 1m
  rules:
    # Caddy Down: condition holds when up{job="caddy"} < 1; fires after
    # 1 minute. Missing data and query errors are treated as alerting too.
    - uid: fff6uy1zgpb0gd
      title: Caddy Down (helsinki-a)
      condition: C
      data:
        # A: instant Prometheus query with a 600 s (10 min) lookback.
        - refId: A
          datasourceUid: bezqqznn81wqof
          relativeTimeRange:
            from: 600
            to: 0
          model:
            refId: A
            expr: 'up{job="caddy"}'
            instant: true
            intervalMs: 1000
            maxDataPoints: 43200
        # B: reduce each series of A to its last value.
        - refId: B
          datasourceUid: __expr__
          relativeTimeRange:
            from: 0
            to: 0
          model:
            refId: B
            type: reduce
            datasource:
              type: __expr__
              uid: __expr__
            expression: A
            reducer: last
            settings:
              mode: ''
        # C: threshold on B; condition holds when B < 1.
        - refId: C
          datasourceUid: __expr__
          relativeTimeRange:
            from: 0
            to: 0
          model:
            refId: C
            type: threshold
            datasource:
              type: __expr__
              uid: __expr__
            expression: B
            conditions:
              - type: query
                evaluator:
                  type: lt
                  params: [1]
                operator:
                  type: and
                query:
                  params: [C]
                reducer:
                  type: last
                  params: []
      noDataState: Alerting
      execErrState: Alerting
      for: 1m
      annotations:
        summary: 'Caddy is down on helsinki-a'
        description: 'Caddy (main reverse proxy) on helsinki-a unreachable. External services likely down.'
      labels:
        severity: critical
      isPaused: false
|
|
|
|
# Group "critical-services": reachability of application exporters, evaluated
# every minute. Pipeline: A = instant Prometheus query, B = reduce(A, last),
# C = threshold(B).
- orgId: 1
  name: critical-services
  folder: Alerting
  interval: 1m
  rules:
    # Plex Down: condition holds when up{job="plex"} < 1; fires after
    # 5 minutes. Missing data and query errors are treated as alerting too.
    - uid: bff6uy2a2rrwgb
      title: Plex Down (london-b)
      condition: C
      data:
        # A: instant Prometheus query with a 600 s (10 min) lookback.
        - refId: A
          datasourceUid: bezqqznn81wqof
          relativeTimeRange:
            from: 600
            to: 0
          model:
            refId: A
            expr: 'up{job="plex"}'
            instant: true
            intervalMs: 1000
            maxDataPoints: 43200
        # B: reduce each series of A to its last value.
        - refId: B
          datasourceUid: __expr__
          relativeTimeRange:
            from: 0
            to: 0
          model:
            refId: B
            type: reduce
            datasource:
              type: __expr__
              uid: __expr__
            expression: A
            reducer: last
            settings:
              mode: ''
        # C: threshold on B; condition holds when B < 1.
        - refId: C
          datasourceUid: __expr__
          relativeTimeRange:
            from: 0
            to: 0
          model:
            refId: C
            type: threshold
            datasource:
              type: __expr__
              uid: __expr__
            expression: B
            conditions:
              - type: query
                evaluator:
                  type: lt
                  params: [1]
                operator:
                  type: and
                query:
                  params: [C]
                reducer:
                  type: last
                  params: []
      noDataState: Alerting
      execErrState: Alerting
      for: 5m
      annotations:
        summary: 'Plex is down on london-b'
        description: 'The Plex exporter on london-b has been unreachable for 5+ minutes.'
      labels:
        severity: critical
      isPaused: false
|