Add ZFS management role: scrub scheduling and pool monitoring (#18)

- New zfs role with cron-based scrub scheduling for Linux and FreeBSD
- Weekly Sunday scrubs at noon (matching existing manual crons)
- Add zfs_hosts inventory group with london-a and london-b
- Configure zfs_pools per host: zroot (london-a), hdd (london-b)
- Add Prometheus alert rules for degraded/faulted/offline pools
- Add zfs.yml playbook for targeted deploys

Captures the previously untracked scrub cron on london-a and
re-enables the commented-out scrub on london-b.

Refs: PESO-93
This commit is contained in:
Rasmus Wejlgaard 2026-03-29 19:12:42 +01:00 committed by GitHub
parent 3d8fb84d1f
commit 69918c8619
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 92 additions and 0 deletions

View file

@ -4,3 +4,6 @@ host_description: "Monitoring stack (Prometheus, Grafana)"
host_location: "London"
ansible_python_interpreter: /usr/local/bin/python3
grafana_provisioning_dir: /usr/local/share/grafana/conf/provisioning
# ZFS pools on this host to schedule scrubs for (overrides the zfs role default of []).
zfs_pools:
- zroot

View file

@ -12,6 +12,9 @@ docker_services:
- smartctl-exporter
- plex-exporter
# ZFS pools on this host to schedule scrubs for (overrides the zfs role default of []).
zfs_pools:
- hdd
common_ufw_allowed_ports:
- {port: 32400, proto: tcp, comment: "Plex Media Server"}
- {port: 6881, proto: tcp, comment: "BitTorrent"}

View file

@ -13,6 +13,10 @@ nuremberg-a ansible_host=100.117.235.28
[freebsd]
london-a ansible_host=100.122.219.41
# Hosts targeted by playbooks/zfs.yml (its play uses "hosts: zfs_hosts").
[zfs_hosts]
london-a
london-b
[docker_hosts]
helsinki-a
london-b

View file

@ -0,0 +1,9 @@
---
# Playbook: apply the zfs role to every host in the zfs_hosts inventory group.
#
# Usage:
#   ansible-playbook playbooks/zfs.yml
#   ansible-playbook playbooks/zfs.yml --check --diff
- name: "ZFS management"
  hosts: zfs_hosts
  roles:
    - role: zfs

View file

@ -0,0 +1,15 @@
---
# Defaults for the zfs role.

# Master switch: scrub scheduling is only applied when this is true.
zfs_scrub_enabled: true

# Pools whose scrubs are scheduled. Empty by default; set per host in host_vars.
zfs_pools: []

# When the scrub runs, expressed as cron fields.
# Default: every Sunday at 12:00.
zfs_scrub_minute: "0"
zfs_scrub_hour: "12"
zfs_scrub_weekday: "0"  # 0 = Sunday

View file

@ -0,0 +1,29 @@
---
# ZFS management: scrub scheduling.
#
# Installs one root crontab entry per pool listed in zfs_pools. The task is
# the same on Linux and FreeBSD (same module, same /sbin/zpool path), so a
# single task serves both OS families.
#
# Variables read:
#   zfs_pools          - list of pool names to schedule scrubs for
#   zfs_scrub_enabled  - when false, the task is skipped
#   zfs_scrub_minute, zfs_scrub_hour, zfs_scrub_weekday - cron schedule fields
#
# NOTE: setting zfs_scrub_enabled to false only skips this task; it does not
# remove cron entries installed by an earlier run.
- name: Install ZFS scrub cron jobs
  # "| bool" so a string such as "false" (e.g. passed with -e) is not truthy.
  when: zfs_scrub_enabled | bool
  ansible.builtin.cron:
    # The cron module identifies the entry by this name; keep it stable so
    # re-runs update the existing entry instead of adding a duplicate.
    name: "ZFS scrub {{ item }}"
    minute: "{{ zfs_scrub_minute }}"
    hour: "{{ zfs_scrub_hour }}"
    weekday: "{{ zfs_scrub_weekday }}"
    job: "/sbin/zpool scrub {{ item }}"
    user: root
  loop: "{{ zfs_pools }}"

View file

@ -0,0 +1,29 @@
# Alerting rules for ZFS pool health: one rule per unhealthy pool state.
# Each rule selects node_zfs_zpool_state series by their "state" label and
# fires once a matching series has held the value 1 for the "for" duration.
# NOTE(review): presumably this metric comes from node_exporter's zfs
# collector; confirm it is exposed on every host these rules should cover.
groups:
  - name: zfs
    rules:
      # Pool reports state "degraded" for 5 minutes.
      - alert: ZfsPoolDegraded
        expr: node_zfs_zpool_state{state="degraded"} == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "ZFS pool {{ $labels.zpool }} is degraded on {{ $labels.instance }}"
          description: >-
            Pool {{ $labels.zpool }} on {{ $labels.instance }} has entered a
            degraded state. Check disk health immediately.

      # Pool reports state "faulted" for 1 minute; the only critical rule here.
      - alert: ZfsPoolFaulted
        expr: node_zfs_zpool_state{state="faulted"} == 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "ZFS pool {{ $labels.zpool }} is FAULTED on {{ $labels.instance }}"
          description: >-
            Pool {{ $labels.zpool }} on {{ $labels.instance }} is faulted.
            Data may be at risk.

      # Pool reports state "offline" for 5 minutes.
      - alert: ZfsPoolOffline
        expr: node_zfs_zpool_state{state="offline"} == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "ZFS pool {{ $labels.zpool }} is offline on {{ $labels.instance }}"
          description: >-
            Pool {{ $labels.zpool }} on {{ $labels.instance }} is offline.