# Tier 1 — Critical alerts. These page PagerDuty.
# Datasource UID: bezqqznn81wqof (Prometheus on london-a)
# All alerts use reduce+threshold (not classic_conditions) so $labels.* and
# $value work in annotations.
apiVersion: 1

groups:
  # ---------------------------------------------------------------------------
  # Host / resource availability across all node_exporter targets.
  # ---------------------------------------------------------------------------
  - orgId: 1
    name: critical-availability
    folder: Alerting
    interval: 1m
    rules:
      # Pages when a node_exporter target stops answering scrapes.
      # NOTE(review): annotations reference $labels.server — assumes a custom
      # `server` label added via relabeling; confirm it exists on these targets.
      - uid: cff6uy1tufj0ge
        title: Host Down
        condition: C
        data:
          # A: raw instant query (1 = up, 0 = down).
          - refId: A
            datasourceUid: bezqqznn81wqof
            model:
              expr: up{job="node_exporter"}
              instant: true
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          # B: reduce A to one value per series so $value is usable.
          - refId: B
            datasourceUid: __expr__
            model:
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              reducer: last
              settings:
                mode: ""
              refId: B
              type: reduce
          # C: fire when the reduced value is below 1 (target down).
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params: [1]
                    type: lt
                  operator:
                    type: and
                  query:
                    params: [C]
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: B
              refId: C
              type: threshold
        # A vanished target is exactly what this alert must catch, so NoData
        # and exec errors both page.
        noDataState: Alerting
        execErrState: Alerting
        for: 2m
        annotations:
          summary: "Host {{ $labels.server }} is down"
          description: "Node exporter on {{ $labels.server }} ({{ $labels.instance }}) has been unreachable for 2+ minutes."
        labels:
          severity: critical
        isPaused: false

      # Filesystem usage above 95% on any real (non-virtual) mount.
      - uid: aff6uy1vxchdse
        title: Disk Usage Critical (>95%)
        condition: C
        data:
          # A: used-space percentage per filesystem, excluding tmpfs/overlay/
          # squashfs/devtmpfs pseudo-filesystems.
          - refId: A
            datasourceUid: bezqqznn81wqof
            model:
              expr: |
                (
                  node_filesystem_size_bytes{job="node_exporter", fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
                  - node_filesystem_avail_bytes{job="node_exporter", fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
                )
                / node_filesystem_size_bytes{job="node_exporter", fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
                * 100
              instant: true
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          # B: reduce A to one value per series.
          - refId: B
            datasourceUid: __expr__
            model:
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              reducer: last
              settings:
                mode: ""
              refId: B
              type: reduce
          # C: fire above the 95% threshold.
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params: [95]
                    type: gt
                  operator:
                    type: and
                  query:
                    params: [C]
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: B
              refId: C
              type: threshold
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          summary: "Disk critically full on {{ $labels.server }}"
          description: "Filesystem {{ $labels.mountpoint }} on {{ $labels.server }} is over 95% full (currently {{ $value | printf \"%.1f\" }}%)."
        labels:
          severity: critical
        isPaused: false

      # Memory usage above 95% (based on MemAvailable, not MemFree).
      - uid: aff6uy1xq9udca
        title: Memory Usage Critical (>95%)
        condition: C
        data:
          # A: percentage of memory NOT available for new allocations.
          - refId: A
            datasourceUid: bezqqznn81wqof
            model:
              expr: |
                (1 - (node_memory_MemAvailable_bytes{job="node_exporter"} / node_memory_MemTotal_bytes{job="node_exporter"})) * 100
              instant: true
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          # B: reduce A to one value per series.
          - refId: B
            datasourceUid: __expr__
            model:
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              reducer: last
              settings:
                mode: ""
              refId: B
              type: reduce
          # C: fire above the 95% threshold.
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params: [95]
                    type: gt
                  operator:
                    type: and
                  query:
                    params: [C]
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: B
              refId: C
              type: threshold
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          summary: "Memory critically low on {{ $labels.server }}"
          description: "Memory usage on {{ $labels.server }} ({{ $labels.instance }}) is above 95% for 5+ minutes."
        labels:
          severity: critical
        isPaused: false

      # SMART health bit from smartctl_exporter (1 = healthy).
      # NOTE(review): title/annotations say london-b but the query matches every
      # smartmontools target — confirm that job is only scraped on london-b,
      # or add an instance/server matcher.
      - uid: fff6uy219mo00e
        title: SMART Disk Health Failure (london-b)
        condition: C
        data:
          # A: raw SMART status per device.
          - refId: A
            datasourceUid: bezqqznn81wqof
            model:
              expr: smartctl_device_smart_status{job="smartmontools"}
              instant: true
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          # B: reduce A to one value per series.
          - refId: B
            datasourceUid: __expr__
            model:
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              reducer: last
              settings:
                mode: ""
              refId: B
              type: reduce
          # C: fire when status drops below 1 (health failure).
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params: [1]
                    type: lt
                  operator:
                    type: and
                  query:
                    params: [C]
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: B
              refId: C
              type: threshold
        noDataState: NoData
        execErrState: Error
        # Page immediately — a failing disk does not warrant a grace period.
        for: 0m
        annotations:
          summary: "Disk SMART health failure on london-b"
          description: "Drive {{ $labels.device }} on london-b reports SMART health failure. Check immediately."
        labels:
          severity: critical
        isPaused: false

  # ---------------------------------------------------------------------------
  # Edge / reverse proxy availability.
  # ---------------------------------------------------------------------------
  - orgId: 1
    name: critical-caddy
    folder: Alerting
    interval: 1m
    rules:
      # Caddy is the main ingress; if it is down, everything external is down,
      # so the tolerance window is only 1m and NoData also pages.
      - uid: fff6uy1zgpb0gd
        title: Caddy Down (helsinki-a)
        condition: C
        data:
          # A: raw instant query (1 = up, 0 = down).
          - refId: A
            datasourceUid: bezqqznn81wqof
            model:
              expr: up{job="caddy"}
              instant: true
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          # B: reduce A to one value per series.
          - refId: B
            datasourceUid: __expr__
            model:
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              reducer: last
              settings:
                mode: ""
              refId: B
              type: reduce
          # C: fire when the reduced value drops below 1.
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params: [1]
                    type: lt
                  operator:
                    type: and
                  query:
                    params: [C]
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: B
              refId: C
              type: threshold
        noDataState: Alerting
        execErrState: Alerting
        for: 1m
        annotations:
          summary: "Caddy is down on helsinki-a"
          description: "Caddy (main reverse proxy) on helsinki-a unreachable. External services likely down."
        labels:
          severity: critical
        isPaused: false

  # ---------------------------------------------------------------------------
  # End-user service availability.
  # ---------------------------------------------------------------------------
  - orgId: 1
    name: critical-services
    folder: Alerting
    interval: 1m
    rules:
      # Plex exporter scrape health on london-b.
      - uid: bff6uy2a2rrwgb
        title: Plex Down (london-b)
        condition: C
        data:
          # A: raw instant query (1 = up, 0 = down).
          - refId: A
            datasourceUid: bezqqznn81wqof
            model:
              expr: up{job="plex"}
              instant: true
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          # B: reduce A to one value per series.
          - refId: B
            datasourceUid: __expr__
            model:
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              reducer: last
              settings:
                mode: ""
              refId: B
              type: reduce
          # C: fire when the reduced value drops below 1.
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params: [1]
                    type: lt
                  operator:
                    type: and
                  query:
                    params: [C]
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: B
              refId: C
              type: threshold
        noDataState: Alerting
        execErrState: Alerting
        for: 5m
        annotations:
          summary: "Plex is down on london-b"
          description: "The Plex exporter on london-b has been unreachable for 5+ minutes."
        labels:
          severity: critical
        isPaused: false