---
apiVersion: 1

# Tier 2 — Warning alerts. These send email only (non-paging).
# Datasource UID: bezqqznn81wqof (Prometheus on london-a)
#
# Every rule below has the same three-step shape:
#   A - instant Prometheus query (one sample per series)
#   B - reduce(A, last)          (one number per series)
#   C - threshold on B           (the alert condition)
# Using reduce+threshold (not classic_conditions) keeps the per-series labels,
# so $labels.* resolve in annotations. Numeric readings are taken from
# $values.B.Value: for Grafana-managed rules $value is a descriptive string
# ("[ var='B' labels={...} value=... ]"), so piping it into printf "%.1f"
# renders "%!f(string=...)" instead of a number.

groups:
  - orgId: 1
    name: warning-resources
    folder: Alerting
    # Evaluation interval shared by every rule in this group.
    interval: 2m
    rules:
      # ---------------------------------------------------------------------
      # Disk: used-space percentage above 80, pending for 10m.
      # ---------------------------------------------------------------------
      - uid: cff6uy23024n4c
        title: Disk Usage Warning (>80%)
        condition: C
        data:
          # A: used space as a percentage of filesystem size; the fstype
          # matcher excludes tmpfs, overlay, squashfs and devtmpfs.
          - refId: A
            datasourceUid: bezqqznn81wqof
            relativeTimeRange:
              from: 600
              to: 0
            model:
              expr: |
                (
                  node_filesystem_size_bytes{job="node_exporter", fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
                  - node_filesystem_avail_bytes{job="node_exporter", fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
                )
                / node_filesystem_size_bytes{job="node_exporter", fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
                * 100
              instant: true
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          # B: last value of each series returned by A.
          - refId: B
            datasourceUid: __expr__
            relativeTimeRange:
              from: 0
              to: 0
            model:
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              reducer: last
              settings:
                mode: ""
              refId: B
              type: reduce
          # C: condition is met when B > 80.
          - refId: C
            datasourceUid: __expr__
            relativeTimeRange:
              from: 0
              to: 0
            model:
              conditions:
                - evaluator:
                    params: [80]
                    type: gt
                  operator:
                    type: and
                  query:
                    params: [C]
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: B
              refId: C
              type: threshold
        noDataState: NoData
        execErrState: Error
        for: 10m
        annotations:
          summary: "Disk usage high on {{ $labels.server }}"
          description: >-
            Filesystem {{ $labels.mountpoint }} on {{ $labels.server }} is over 80% full
            (currently {{ printf "%.1f" $values.B.Value }}%).
        labels:
          severity: warning
        isPaused: false

      # ---------------------------------------------------------------------
      # Memory: (1 - MemAvailable/MemTotal) * 100 above 85, pending for 10m.
      # ---------------------------------------------------------------------
      - uid: dff6uy24szhmod
        title: Memory Usage Warning (>85%)
        condition: C
        data:
          # A: percentage of memory that is not reported as available.
          - refId: A
            datasourceUid: bezqqznn81wqof
            relativeTimeRange:
              from: 600
              to: 0
            model:
              expr: |
                (1 - (node_memory_MemAvailable_bytes{job="node_exporter"} / node_memory_MemTotal_bytes{job="node_exporter"})) * 100
              instant: true
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          # B: last value of each series returned by A.
          - refId: B
            datasourceUid: __expr__
            relativeTimeRange:
              from: 0
              to: 0
            model:
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              reducer: last
              settings:
                mode: ""
              refId: B
              type: reduce
          # C: condition is met when B > 85.
          - refId: C
            datasourceUid: __expr__
            relativeTimeRange:
              from: 0
              to: 0
            model:
              conditions:
                - evaluator:
                    params: [85]
                    type: gt
                  operator:
                    type: and
                  query:
                    params: [C]
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: B
              refId: C
              type: threshold
        noDataState: NoData
        execErrState: Error
        for: 10m
        annotations:
          summary: "Memory usage high on {{ $labels.server }}"
          description: >-
            Memory usage on {{ $labels.server }} ({{ $labels.instance }}) is above 85%
            for 10+ minutes.
        labels:
          severity: warning
        isPaused: false

      # ---------------------------------------------------------------------
      # CPU: 100 minus the average idle rate (5m window) above 85, pending 15m.
      # ---------------------------------------------------------------------
      - uid: cff6uy26jey9sd
        title: CPU Usage High (>85%)
        condition: C
        data:
          # A: non-idle CPU percentage, averaged per (server, instance).
          - refId: A
            datasourceUid: bezqqznn81wqof
            relativeTimeRange:
              from: 600
              to: 0
            model:
              expr: |
                100 - (avg by (server, instance) (rate(node_cpu_seconds_total{job="node_exporter", mode="idle"}[5m])) * 100)
              instant: true
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          # B: last value of each series returned by A.
          - refId: B
            datasourceUid: __expr__
            relativeTimeRange:
              from: 0
              to: 0
            model:
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              reducer: last
              settings:
                mode: ""
              refId: B
              type: reduce
          # C: condition is met when B > 85.
          - refId: C
            datasourceUid: __expr__
            relativeTimeRange:
              from: 0
              to: 0
            model:
              conditions:
                - evaluator:
                    params: [85]
                    type: gt
                  operator:
                    type: and
                  query:
                    params: [C]
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: B
              refId: C
              type: threshold
        noDataState: NoData
        execErrState: Error
        for: 15m
        annotations:
          summary: "CPU usage sustained high on {{ $labels.server }}"
          description: >-
            CPU on {{ $labels.server }} has been above 85% for 15+ minutes
            (currently {{ printf "%.1f" $values.B.Value }}%).
        labels:
          severity: warning
        isPaused: false

      # ---------------------------------------------------------------------
      # Load: node_load15 divided by CPU count above 2, pending for 15m.
      # ---------------------------------------------------------------------
      - uid: eff6uy289uewwb
        title: System Load High (>2x CPUs)
        condition: C
        data:
          - refId: A
            datasourceUid: bezqqznn81wqof
            relativeTimeRange:
              from: 600
              to: 0
            model:
              # Compare 15-minute load against number of CPUs
              # (CPU count = number of mode="idle" series per instance;
              # group_left() keeps the labels of the node_load15 side).
              expr: |
                node_load15{job="node_exporter"}
                / on(instance) group_left()
                count by (instance) (node_cpu_seconds_total{job="node_exporter", mode="idle"})
              instant: true
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          # B: last value of each series returned by A.
          - refId: B
            datasourceUid: __expr__
            relativeTimeRange:
              from: 0
              to: 0
            model:
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              reducer: last
              settings:
                mode: ""
              refId: B
              type: reduce
          # C: condition is met when B > 2.
          - refId: C
            datasourceUid: __expr__
            relativeTimeRange:
              from: 0
              to: 0
            model:
              conditions:
                - evaluator:
                    params: [2]
                    type: gt
                  operator:
                    type: and
                  query:
                    params: [C]
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: B
              refId: C
              type: threshold
        noDataState: NoData
        execErrState: Error
        for: 15m
        annotations:
          summary: "High system load on {{ $labels.server }}"
          description: >-
            15-minute load average on {{ $labels.server }} is
            {{ printf "%.2f" $values.B.Value }}x the CPU count (threshold: 2x).
        labels:
          severity: warning
        isPaused: false