pez-infra/ansible/services/grafana/provisioning/alerting/rules-critical.yml
Rasmus Wejlgaard dc198eea81 fix more yaml document-start and comment indentation
- add missing --- to 13 more yml files
- fix comment indentation in prometheus.yml
2026-03-28 13:15:46 +00:00

359 lines
10 KiB
YAML

---
# Grafana alerting provisioning file: rule groups are listed under `groups`.
apiVersion: 1
# Tier 1 — Critical alerts. These page PagerDuty.
# Datasource UID: bezqqznn81wqof (Prometheus on london-a)
# All alerts use reduce+threshold (not classic_conditions) so $labels.* and
# $value work in annotations.
# NOTE(review): Grafana documents `$value` as a string summarising every
# query/expression result; numeric formatting (printf, humanize) needs
# `$values.<refId>.Value` instead — confirm against the deployed version.
# Every rule in this file has the same three-step shape: refId A queries
# Prometheus (instant), refId B reduces A with `last`, and refId C is the
# threshold expression that the rule uses as its `condition`.
groups:
# Group "critical-availability": host-level checks (reachability, disk,
# memory, SMART) with a 1m group interval. In every rule the condition is
# refId C, a threshold applied to B, which is the `last` reduction of query A.
- orgId: 1
  name: critical-availability
  folder: Alerting
  interval: 1m
  rules:
  # Host Down: `up` for the node_exporter job is below 1 for 2m.
  # Missing data and query errors both count as alerting.
  - uid: cff6uy1tufj0ge
    title: 'Host Down'
    condition: C
    for: 2m
    noDataState: Alerting
    execErrState: Alerting
    isPaused: false
    labels:
      severity: critical
    annotations:
      summary: 'Host {{ $labels.server }} is down'
      description: >-
        Node exporter on {{ $labels.server }} ({{ $labels.instance }})
        has been unreachable for 2+ minutes.
    data:
    - refId: A
      datasourceUid: bezqqznn81wqof
      model:
        refId: A
        expr: 'up{job="node_exporter"}'
        instant: true
        intervalMs: 1000
        maxDataPoints: 43200
    - refId: B
      datasourceUid: __expr__
      model:
        refId: B
        type: reduce
        expression: A
        reducer: last
        settings:
          mode: ''
        datasource:
          type: __expr__
          uid: __expr__
    - refId: C
      datasourceUid: __expr__
      model:
        refId: C
        type: threshold
        expression: B
        datasource:
          type: __expr__
          uid: __expr__
        conditions:
        # Alert when B < 1.
        - type: query
          evaluator:
            type: lt
            params:
            - 1
          operator:
            type: and
          query:
            params:
            - C
          reducer:
            type: last
            params: []

  # Disk Usage Critical: used percentage of a filesystem is above 95 for 5m.
  # tmpfs/overlay/squashfs/devtmpfs filesystems are excluded by the query.
  - uid: aff6uy1vxchdse
    title: 'Disk Usage Critical (>95%)'
    condition: C
    for: 5m
    noDataState: NoData
    execErrState: Error
    isPaused: false
    labels:
      severity: critical
    annotations:
      summary: 'Disk critically full on {{ $labels.server }}'
      # The number is read from $values.B.Value (the reduced result of
      # refId B). `$value` is a string in Grafana alert templates, so piping
      # it into printf "%.1f" cannot produce a formatted percentage.
      description: >-
        Filesystem {{ $labels.mountpoint }} on {{ $labels.server }} is over 95% full
        (currently {{ printf "%.1f" $values.B.Value }}%).
    data:
    - refId: A
      datasourceUid: bezqqznn81wqof
      model:
        refId: A
        # (size - avail) / size * 100 = percentage of the filesystem in use.
        expr: |
          (
            node_filesystem_size_bytes{job="node_exporter", fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
            - node_filesystem_avail_bytes{job="node_exporter", fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
          )
          / node_filesystem_size_bytes{job="node_exporter", fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
          * 100
        instant: true
        intervalMs: 1000
        maxDataPoints: 43200
    - refId: B
      datasourceUid: __expr__
      model:
        refId: B
        type: reduce
        expression: A
        reducer: last
        settings:
          mode: ''
        datasource:
          type: __expr__
          uid: __expr__
    - refId: C
      datasourceUid: __expr__
      model:
        refId: C
        type: threshold
        expression: B
        datasource:
          type: __expr__
          uid: __expr__
        conditions:
        # Alert when B > 95.
        - type: query
          evaluator:
            type: gt
            params:
            - 95
          operator:
            type: and
          query:
            params:
            - C
          reducer:
            type: last
            params: []

  # Memory Usage Critical: (1 - MemAvailable / MemTotal) * 100 is above 95
  # for 5m.
  - uid: aff6uy1xq9udca
    title: 'Memory Usage Critical (>95%)'
    condition: C
    for: 5m
    noDataState: NoData
    execErrState: Error
    isPaused: false
    labels:
      severity: critical
    annotations:
      summary: 'Memory critically low on {{ $labels.server }}'
      description: >-
        Memory usage on {{ $labels.server }} ({{ $labels.instance }})
        is above 95% for 5+ minutes.
    data:
    - refId: A
      datasourceUid: bezqqznn81wqof
      model:
        refId: A
        expr: |
          (1 - (node_memory_MemAvailable_bytes{job="node_exporter"} / node_memory_MemTotal_bytes{job="node_exporter"})) * 100
        instant: true
        intervalMs: 1000
        maxDataPoints: 43200
    - refId: B
      datasourceUid: __expr__
      model:
        refId: B
        type: reduce
        expression: A
        reducer: last
        settings:
          mode: ''
        datasource:
          type: __expr__
          uid: __expr__
    - refId: C
      datasourceUid: __expr__
      model:
        refId: C
        type: threshold
        expression: B
        datasource:
          type: __expr__
          uid: __expr__
        conditions:
        # Alert when B > 95.
        - type: query
          evaluator:
            type: gt
            params:
            - 95
          operator:
            type: and
          query:
            params:
            - C
          reducer:
            type: last
            params: []

  # SMART Disk Health Failure: smartctl_device_smart_status is below 1.
  # `for: 0m` means there is no pending period before the rule fires.
  # NOTE(review): the query selects only on job="smartmontools" while the
  # title and annotations hard-code london-b — confirm that job is scraped
  # from london-b only.
  - uid: fff6uy219mo00e
    title: 'SMART Disk Health Failure (london-b)'
    condition: C
    for: 0m
    noDataState: NoData
    execErrState: Error
    isPaused: false
    labels:
      severity: critical
    annotations:
      summary: 'Disk SMART health failure on london-b'
      description: >-
        Drive {{ $labels.device }} on london-b reports SMART health failure.
        Check immediately.
    data:
    - refId: A
      datasourceUid: bezqqznn81wqof
      model:
        refId: A
        expr: 'smartctl_device_smart_status{job="smartmontools"}'
        instant: true
        intervalMs: 1000
        maxDataPoints: 43200
    - refId: B
      datasourceUid: __expr__
      model:
        refId: B
        type: reduce
        expression: A
        reducer: last
        settings:
          mode: ''
        datasource:
          type: __expr__
          uid: __expr__
    - refId: C
      datasourceUid: __expr__
      model:
        refId: C
        type: threshold
        expression: B
        datasource:
          type: __expr__
          uid: __expr__
        conditions:
        # Alert when B < 1.
        - type: query
          evaluator:
            type: lt
            params:
            - 1
          operator:
            type: and
          query:
            params:
            - C
          reducer:
            type: last
            params: []
# Group "critical-caddy": a single rule watching the `up` series of the
# caddy scrape job, with a 1m group interval.
- orgId: 1
  name: critical-caddy
  folder: Alerting
  interval: 1m
  rules:
  # Caddy Down: up{job="caddy"} is below 1 for 1m. Missing data and query
  # errors both count as alerting.
  # NOTE(review): the query selects only on job="caddy" while the title and
  # annotations hard-code helsinki-a — confirm that job has no other targets.
  - uid: fff6uy1zgpb0gd
    title: 'Caddy Down (helsinki-a)'
    condition: C
    for: 1m
    noDataState: Alerting
    execErrState: Alerting
    isPaused: false
    labels:
      severity: critical
    annotations:
      summary: 'Caddy is down on helsinki-a'
      description: >-
        Caddy (main reverse proxy) on helsinki-a unreachable.
        External services likely down.
    data:
    # A: instant Prometheus query.
    - refId: A
      datasourceUid: bezqqznn81wqof
      model:
        refId: A
        expr: 'up{job="caddy"}'
        instant: true
        intervalMs: 1000
        maxDataPoints: 43200
    # B: reduce A to its last value.
    - refId: B
      datasourceUid: __expr__
      model:
        refId: B
        type: reduce
        expression: A
        reducer: last
        settings:
          mode: ''
        datasource:
          type: __expr__
          uid: __expr__
    # C: threshold on B; alert when B < 1.
    - refId: C
      datasourceUid: __expr__
      model:
        refId: C
        type: threshold
        expression: B
        datasource:
          type: __expr__
          uid: __expr__
        conditions:
        - type: query
          evaluator:
            type: lt
            params:
            - 1
          operator:
            type: and
          query:
            params:
            - C
          reducer:
            type: last
            params: []
# Group "critical-services": a single rule watching the `up` series of the
# plex scrape job, with a 1m group interval.
- orgId: 1
  name: critical-services
  folder: Alerting
  interval: 1m
  rules:
  # Plex Down: up{job="plex"} is below 1 for 5m. Missing data and query
  # errors both count as alerting.
  # NOTE(review): the query selects only on job="plex" while the title and
  # annotations hard-code london-b — confirm that job has no other targets.
  - uid: bff6uy2a2rrwgb
    title: 'Plex Down (london-b)'
    condition: C
    for: 5m
    noDataState: Alerting
    execErrState: Alerting
    isPaused: false
    labels:
      severity: critical
    annotations:
      summary: 'Plex is down on london-b'
      description: >-
        The Plex exporter on london-b has been unreachable
        for 5+ minutes.
    data:
    # A: instant Prometheus query.
    - refId: A
      datasourceUid: bezqqznn81wqof
      model:
        refId: A
        expr: 'up{job="plex"}'
        instant: true
        intervalMs: 1000
        maxDataPoints: 43200
    # B: reduce A to its last value.
    - refId: B
      datasourceUid: __expr__
      model:
        refId: B
        type: reduce
        expression: A
        reducer: last
        settings:
          mode: ''
        datasource:
          type: __expr__
          uid: __expr__
    # C: threshold on B; alert when B < 1.
    - refId: C
      datasourceUid: __expr__
      model:
        refId: C
        type: threshold
        expression: B
        datasource:
          type: __expr__
          uid: __expr__
        conditions:
        - type: query
          evaluator:
            type: lt
            params:
            - 1
          operator:
            type: and
          query:
            params:
            - C
          reducer:
            type: last
            params: []