mirror of
https://github.com/RWejlgaard/pez-infra.git
synced 2026-05-06 04:14:43 +00:00
- New zfs role with cron-based scrub scheduling for Linux and FreeBSD
- Weekly Sunday scrubs at noon (matching existing manual crons)
- Add zfs_hosts inventory group with london-a and london-b
- Configure zfs_pools per host: zroot (london-a), hdd (london-b)
- Add Prometheus alert rules for degraded/faulted/offline pools
- Add zfs.yml playbook for targeted deploys

Captures the previously untracked scrub cron on london-a and re-enables the commented-out scrub on london-b.

Refs: PESO-93
29 lines
1.1 KiB
Text
29 lines
1.1 KiB
Text
---
# Prometheus alerting rules for ZFS pool state.
#
# Every rule selects the node_zfs_zpool_state series for a single pool state
# and fires once that series has been equal to 1 for the rule's `for` period.
# The `zpool` and `instance` labels of the matching series are interpolated
# into the summary and description annotations.
#
# NOTE(review): only the degraded, faulted and offline states are alerted on.
# If the exporter can report further states (e.g. unavail, suspended,
# removed), confirm that leaving them uncovered is intentional.
# NOTE(review): these rules stay silent when the metric is absent. Confirm
# that the exporter on every ZFS host (including any FreeBSD host) actually
# exports node_zfs_zpool_state.
groups:
  - name: zfs
    rules:
      # Degraded pool: warning once the state has held for 5 minutes.
      - alert: ZfsPoolDegraded
        expr: node_zfs_zpool_state{state="degraded"} == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "ZFS pool {{ $labels.zpool }} is degraded on {{ $labels.instance }}"
          description: "Pool {{ $labels.zpool }} on {{ $labels.instance }} has entered a degraded state. Check disk health immediately."

      # Faulted pool: critical, with a shorter 1 minute hold than the
      # warning-level rules.
      - alert: ZfsPoolFaulted
        expr: node_zfs_zpool_state{state="faulted"} == 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "ZFS pool {{ $labels.zpool }} is FAULTED on {{ $labels.instance }}"
          description: "Pool {{ $labels.zpool }} on {{ $labels.instance }} is faulted. Data may be at risk."

      # Offline pool: warning once the state has held for 5 minutes.
      - alert: ZfsPoolOffline
        expr: node_zfs_zpool_state{state="offline"} == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "ZFS pool {{ $labels.zpool }} is offline on {{ $labels.instance }}"
          description: "Pool {{ $labels.zpool }} on {{ $labels.instance }} is offline."