# Tier 1 — Critical alerts. These page PagerDuty.
# Datasource UID: bezqqznn81wqof (Prometheus on london-a)
# All alerts use reduce+threshold (not classic_conditions) so $labels.* and
# $value work in annotations.
apiVersion: 1

groups:
  # ---------------------------------------------------------------------------
  # Host / resource availability across all node_exporter targets.
  # ---------------------------------------------------------------------------
  - orgId: 1
    name: critical-availability
    folder: Alerting
    interval: 1m
    rules:
      # Pages when a node_exporter target stops answering scrapes.
      # NOTE(review): annotations reference $labels.server — assumes a custom
      # `server` label added via relabeling; confirm it exists on these targets.
      - uid: cff6uy1tufj0ge
        title: Host Down
        condition: C
        data:
          # A: raw instant query (1 = up, 0 = down).
          - refId: A
            datasourceUid: bezqqznn81wqof
            model:
              expr: up{job="node_exporter"}
              instant: true
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          # B: reduce A to one value per series so $value is usable.
          - refId: B
            datasourceUid: __expr__
            model:
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              reducer: last
              settings:
                mode: ""
              refId: B
              type: reduce
          # C: fire when the reduced value is below 1 (target down).
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params: [1]
                    type: lt
                  operator:
                    type: and
                  query:
                    params: [C]
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: B
              refId: C
              type: threshold
        # A vanished target is exactly what this alert must catch, so NoData
        # and exec errors both page.
        noDataState: Alerting
        execErrState: Alerting
        for: 2m
        annotations:
          summary: "Host {{ $labels.server }} is down"
          description: "Node exporter on {{ $labels.server }} ({{ $labels.instance }}) has been unreachable for 2+ minutes."
        labels:
          severity: critical
        isPaused: false

      # Filesystem usage above 95% on any real (non-virtual) mount.
      - uid: aff6uy1vxchdse
        title: Disk Usage Critical (>95%)
        condition: C
        data:
          # A: used-space percentage per filesystem, excluding tmpfs/overlay/
          # squashfs/devtmpfs pseudo-filesystems.
          - refId: A
            datasourceUid: bezqqznn81wqof
            model:
              expr: |
                (
                  node_filesystem_size_bytes{job="node_exporter", fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
                  - node_filesystem_avail_bytes{job="node_exporter", fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
                )
                / node_filesystem_size_bytes{job="node_exporter", fstype!~"tmpfs|overlay|squashfs|devtmpfs"}
                * 100
              instant: true
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          # B: reduce A to one value per series.
          - refId: B
            datasourceUid: __expr__
            model:
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              reducer: last
              settings:
                mode: ""
              refId: B
              type: reduce
          # C: fire above the 95% threshold.
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params: [95]
                    type: gt
                  operator:
                    type: and
                  query:
                    params: [C]
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: B
              refId: C
              type: threshold
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          summary: "Disk critically full on {{ $labels.server }}"
          description: "Filesystem {{ $labels.mountpoint }} on {{ $labels.server }} is over 95% full (currently {{ $value | printf \"%.1f\" }}%)."
        labels:
          severity: critical
        isPaused: false

      # Memory usage above 95% (based on MemAvailable, not MemFree).
      - uid: aff6uy1xq9udca
        title: Memory Usage Critical (>95%)
        condition: C
        data:
          # A: percentage of memory NOT available for new allocations.
          - refId: A
            datasourceUid: bezqqznn81wqof
            model:
              expr: |
                (1 - (node_memory_MemAvailable_bytes{job="node_exporter"} / node_memory_MemTotal_bytes{job="node_exporter"})) * 100
              instant: true
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          # B: reduce A to one value per series.
          - refId: B
            datasourceUid: __expr__
            model:
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              reducer: last
              settings:
                mode: ""
              refId: B
              type: reduce
          # C: fire above the 95% threshold.
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params: [95]
                    type: gt
                  operator:
                    type: and
                  query:
                    params: [C]
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: B
              refId: C
              type: threshold
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          summary: "Memory critically low on {{ $labels.server }}"
          description: "Memory usage on {{ $labels.server }} ({{ $labels.instance }}) is above 95% for 5+ minutes."
        labels:
          severity: critical
        isPaused: false

      # SMART health bit from smartctl_exporter (1 = healthy).
      # NOTE(review): title/annotations say london-b but the query matches every
      # smartmontools target — confirm that job is only scraped on london-b,
      # or add an instance/server matcher.
      - uid: fff6uy219mo00e
        title: SMART Disk Health Failure (london-b)
        condition: C
        data:
          # A: raw SMART status per device.
          - refId: A
            datasourceUid: bezqqznn81wqof
            model:
              expr: smartctl_device_smart_status{job="smartmontools"}
              instant: true
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          # B: reduce A to one value per series.
          - refId: B
            datasourceUid: __expr__
            model:
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              reducer: last
              settings:
                mode: ""
              refId: B
              type: reduce
          # C: fire when status drops below 1 (health failure).
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params: [1]
                    type: lt
                  operator:
                    type: and
                  query:
                    params: [C]
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: B
              refId: C
              type: threshold
        noDataState: NoData
        execErrState: Error
        # Page immediately — a failing disk does not warrant a grace period.
        for: 0m
        annotations:
          summary: "Disk SMART health failure on london-b"
          description: "Drive {{ $labels.device }} on london-b reports SMART health failure. Check immediately."
        labels:
          severity: critical
        isPaused: false

  # ---------------------------------------------------------------------------
  # Edge / reverse proxy availability.
  # ---------------------------------------------------------------------------
  - orgId: 1
    name: critical-caddy
    folder: Alerting
    interval: 1m
    rules:
      # Caddy is the main ingress; if it is down, everything external is down,
      # so the tolerance window is only 1m and NoData also pages.
      - uid: fff6uy1zgpb0gd
        title: Caddy Down (helsinki-a)
        condition: C
        data:
          # A: raw instant query (1 = up, 0 = down).
          - refId: A
            datasourceUid: bezqqznn81wqof
            model:
              expr: up{job="caddy"}
              instant: true
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          # B: reduce A to one value per series.
          - refId: B
            datasourceUid: __expr__
            model:
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              reducer: last
              settings:
                mode: ""
              refId: B
              type: reduce
          # C: fire when the reduced value drops below 1.
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params: [1]
                    type: lt
                  operator:
                    type: and
                  query:
                    params: [C]
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: B
              refId: C
              type: threshold
        noDataState: Alerting
        execErrState: Alerting
        for: 1m
        annotations:
          summary: "Caddy is down on helsinki-a"
          description: "Caddy (main reverse proxy) on helsinki-a unreachable. External services likely down."
        labels:
          severity: critical
        isPaused: false

  # ---------------------------------------------------------------------------
  # End-user service availability.
  # ---------------------------------------------------------------------------
  - orgId: 1
    name: critical-services
    folder: Alerting
    interval: 1m
    rules:
      # Plex exporter scrape health on london-b.
      - uid: bff6uy2a2rrwgb
        title: Plex Down (london-b)
        condition: C
        data:
          # A: raw instant query (1 = up, 0 = down).
          - refId: A
            datasourceUid: bezqqznn81wqof
            model:
              expr: up{job="plex"}
              instant: true
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          # B: reduce A to one value per series.
          - refId: B
            datasourceUid: __expr__
            model:
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              reducer: last
              settings:
                mode: ""
              refId: B
              type: reduce
          # C: fire when the reduced value drops below 1.
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params: [1]
                    type: lt
                  operator:
                    type: and
                  query:
                    params: [C]
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: B
              refId: C
              type: threshold
        noDataState: Alerting
        execErrState: Alerting
        for: 5m
        annotations:
          summary: "Plex is down on london-b"
          description: "The Plex exporter on london-b has been unreachable for 5+ minutes."
        labels:
          severity: critical
        isPaused: false