pez-infra/ansible/services/grafana/provisioning/alerting/rules-warning.yml
Rasmus Wejlgaard dc198eea81 fix more yaml document-start and comment indentation
- add missing --- to 13 more yml files
- fix comment indentation in prometheus.yml
2026-03-28 13:15:46 +00:00

242 lines
7.4 KiB
YAML

---
apiVersion: 1
# Tier 2 — Warning alerts. These send email only (non-paging).
# Datasource UID: bezqqznn81wqof (Prometheus on london-a)
# All alerts use reduce+threshold (not classic_conditions) so that per-series
# $labels.* and $values.<refId> are available in annotations.
# NOTE: in Grafana-managed rules $value is a preformatted string, not a number;
# use $values.<refId>.Value when a reading must be passed to printf/humanize.
groups:
  # Single rule group: host resource warnings, stored in the "Alerting" folder
  # with a 2m group interval.
  - orgId: 1
    name: warning-resources
    folder: Alerting
    interval: 2m
    rules:
# Disk usage warning: fires when a filesystem reported by node_exporter
# (excluding tmpfs/overlay/squashfs/devtmpfs) stays above 80% used for 10m.
- uid: cff6uy23024n4c
  title: Disk Usage Warning (>80%)
  condition: C
  data:
    # A: used-space percentage per filesystem, (size - avail) / size * 100.
    - refId: A
      datasourceUid: bezqqznn81wqof
      model:
        expr: |
          (
            node_filesystem_size_bytes{job="node_exporter", fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
            - node_filesystem_avail_bytes{job="node_exporter", fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
          )
          / node_filesystem_size_bytes{job="node_exporter", fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
          * 100
        instant: true
        intervalMs: 1000
        maxDataPoints: 43200
        refId: A
    # B: reduce each series from A to its last value.
    - refId: B
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        reducer: last
        settings:
          mode: ""
        refId: B
        type: reduce
    # C: threshold on B; this is the alert condition (B > 80).
    - refId: C
      datasourceUid: __expr__
      model:
        conditions:
          - evaluator:
              params: [80]
              type: gt
            operator:
              type: and
            query:
              params: [C]
            reducer:
              params: []
              type: last
            type: query
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        refId: C
        type: threshold
  noDataState: NoData
  execErrState: Error
  for: 10m
  annotations:
    summary: 'Disk usage high on {{ $labels.server }}'
    # $value is a preformatted string in Grafana-managed rules, so piping it
    # into printf "%.1f" renders "%!f(string=...)". The numeric reading is
    # taken from the reduce step instead.
    description: 'Filesystem {{ $labels.mountpoint }} on {{ $labels.server }} is over 80% full (currently {{ $values.B.Value | printf "%.1f" }}%).'
  labels:
    severity: warning
  isPaused: false
# Memory usage warning: fires when used memory (1 - MemAvailable/MemTotal)
# stays above 85% for 10m.
- uid: dff6uy24szhmod
  title: Memory Usage Warning (>85%)
  condition: C
  data:
    # A: used-memory percentage per node_exporter target.
    - refId: A
      datasourceUid: bezqqznn81wqof
      model:
        expr: |
          (1 - (node_memory_MemAvailable_bytes{job="node_exporter"} / node_memory_MemTotal_bytes{job="node_exporter"})) * 100
        instant: true
        intervalMs: 1000
        maxDataPoints: 43200
        refId: A
    # B: reduce each series from A to its last value.
    - refId: B
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        reducer: last
        settings:
          mode: ""
        refId: B
        type: reduce
    # C: threshold on B; this is the alert condition (B > 85).
    - refId: C
      datasourceUid: __expr__
      model:
        conditions:
          - evaluator:
              params: [85]
              type: gt
            operator:
              type: and
            query:
              params: [C]
            reducer:
              params: []
              type: last
            type: query
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        refId: C
        type: threshold
  noDataState: NoData
  execErrState: Error
  for: 10m
  annotations:
    summary: 'Memory usage high on {{ $labels.server }}'
    # Include the current reading, as the other resource warnings in this
    # group do. The number comes from the reduce step ($values.B.Value)
    # because $value is a preformatted string in Grafana-managed rules.
    description: 'Memory usage on {{ $labels.server }} ({{ $labels.instance }}) is above 85% for 10+ minutes (currently {{ $values.B.Value | printf "%.1f" }}%).'
  labels:
    severity: warning
  isPaused: false
# CPU usage warning: fires when non-idle CPU, averaged per (server, instance)
# over a 5m rate window, stays above 85% for 15m.
- uid: cff6uy26jey9sd
  title: CPU Usage High (>85%)
  condition: C
  data:
    # A: 100 minus the average idle percentage across each host's CPUs.
    - refId: A
      datasourceUid: bezqqznn81wqof
      model:
        expr: |
          100 - (avg by (server, instance) (rate(node_cpu_seconds_total{job="node_exporter", mode="idle"}[5m])) * 100)
        instant: true
        intervalMs: 1000
        maxDataPoints: 43200
        refId: A
    # B: reduce each series from A to its last value.
    - refId: B
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        reducer: last
        settings:
          mode: ""
        refId: B
        type: reduce
    # C: threshold on B; this is the alert condition (B > 85).
    - refId: C
      datasourceUid: __expr__
      model:
        conditions:
          - evaluator:
              params: [85]
              type: gt
            operator:
              type: and
            query:
              params: [C]
            reducer:
              params: []
              type: last
            type: query
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        refId: C
        type: threshold
  noDataState: NoData
  execErrState: Error
  for: 15m
  annotations:
    summary: 'CPU usage sustained high on {{ $labels.server }}'
    # $value is a preformatted string in Grafana-managed rules, so piping it
    # into printf "%.1f" renders "%!f(string=...)". The numeric reading is
    # taken from the reduce step instead.
    description: 'CPU on {{ $labels.server }} has been above 85% for 15+ minutes (currently {{ $values.B.Value | printf "%.1f" }}%).'
  labels:
    severity: warning
  isPaused: false
# System load warning: fires when the 15-minute load average stays above
# 2x the host's CPU count for 15m.
- uid: eff6uy289uewwb
  title: System Load High (>2x CPUs)
  condition: C
  data:
    - refId: A
      datasourceUid: bezqqznn81wqof
      model:
        # Compare 15-minute load against number of CPUs.
        # The CPU count is the number of mode="idle" series per instance;
        # group_left() keeps node_load15's own labels (e.g. server) on the
        # result so they remain usable in annotations.
        expr: |
          node_load15{job="node_exporter"} / on(instance) group_left() count by (instance) (node_cpu_seconds_total{job="node_exporter", mode="idle"})
        instant: true
        intervalMs: 1000
        maxDataPoints: 43200
        refId: A
    # B: reduce each series from A to its last value.
    - refId: B
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        reducer: last
        settings:
          mode: ""
        refId: B
        type: reduce
    # C: threshold on B; this is the alert condition (B > 2).
    - refId: C
      datasourceUid: __expr__
      model:
        conditions:
          - evaluator:
              params: [2]
              type: gt
            operator:
              type: and
            query:
              params: [C]
            reducer:
              params: []
              type: last
            type: query
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        refId: C
        type: threshold
  noDataState: NoData
  execErrState: Error
  for: 15m
  annotations:
    summary: 'High system load on {{ $labels.server }}'
    # $value is a preformatted string in Grafana-managed rules, so piping it
    # into printf "%.2f" renders "%!f(string=...)". The numeric ratio is
    # taken from the reduce step instead.
    description: '15-minute load average on {{ $labels.server }} is {{ $values.B.Value | printf "%.2f" }}x the CPU count (threshold: 2x).'
  labels:
    severity: warning
  isPaused: false