From 69918c861952f625a6e7a027bd88c7ae574c0c97 Mon Sep 17 00:00:00 2001
From: "Rasmus \"Pez\" Wejlgaard"
Date: Sun, 29 Mar 2026 19:12:42 +0100
Subject: [PATCH] Add ZFS management role: scrub scheduling and pool monitoring (#18)

- New zfs role with cron-based scrub scheduling for Linux and FreeBSD
- Weekly Sunday scrubs at noon (matching existing manual crons)
- Scrub cron entries are removed when zfs_scrub_enabled is false
- Add zfs_hosts inventory group with london-a and london-b
- Configure zfs_pools per host: zroot (london-a), hdd (london-b)
- Add Prometheus alert rules for degraded/faulted/offline pools
- Add zfs.yml playbook for targeted deploys

Captures the previously untracked scrub cron on london-a and re-enables
the commented-out scrub on london-b.

Refs: PESO-93
---
Review note: ansible/roles/zfs/tasks/main.yml was revised during review
(one task for both platforms; cron entries are removed when
zfs_scrub_enabled is false). The blob id on that file's index line
predates the revision.

 ansible/inventory/host_vars/london-a.yml    |  3 +++
 ansible/inventory/host_vars/london-b.yml    |  3 +++
 ansible/inventory/hosts.ini                 |  4 +++
 ansible/playbooks/zfs.yml                   |  9 +++++++
 ansible/roles/zfs/defaults/main.yml         | 15 +++++++++++
 ansible/roles/zfs/tasks/main.yml            | 17 ++++++++++++
 ansible/services/prometheus/rules/zfs.rules | 29 +++++++++++++++++++++
 7 files changed, 80 insertions(+)
 create mode 100644 ansible/playbooks/zfs.yml
 create mode 100644 ansible/roles/zfs/defaults/main.yml
 create mode 100644 ansible/roles/zfs/tasks/main.yml
 create mode 100644 ansible/services/prometheus/rules/zfs.rules

diff --git a/ansible/inventory/host_vars/london-a.yml b/ansible/inventory/host_vars/london-a.yml
index f5d4c67..f67f2bc 100644
--- a/ansible/inventory/host_vars/london-a.yml
+++ b/ansible/inventory/host_vars/london-a.yml
@@ -4,3 +4,6 @@ host_description: "Monitoring stack (Prometheus, Grafana)"
 host_location: "London"
 ansible_python_interpreter: /usr/local/bin/python3
 grafana_provisioning_dir: /usr/local/share/grafana/conf/provisioning
+
+zfs_pools:
+  - zroot
diff --git a/ansible/inventory/host_vars/london-b.yml b/ansible/inventory/host_vars/london-b.yml
index ce11ce2..fa7c949 100644
--- a/ansible/inventory/host_vars/london-b.yml
+++ b/ansible/inventory/host_vars/london-b.yml
@@ -12,6 +12,9 @@ docker_services:
   - smartctl-exporter
   - plex-exporter
 
+zfs_pools:
+  - hdd
+
 common_ufw_allowed_ports:
   - {port: 32400, proto: tcp, comment: "Plex Media Server"}
   - {port: 6881, proto: tcp, comment: "BitTorrent"}
diff --git a/ansible/inventory/hosts.ini b/ansible/inventory/hosts.ini
index 5af353a..6b38541 100644
--- a/ansible/inventory/hosts.ini
+++ b/ansible/inventory/hosts.ini
@@ -13,6 +13,10 @@ nuremberg-a ansible_host=100.117.235.28
 [freebsd]
 london-a ansible_host=100.122.219.41
 
+[zfs_hosts]
+london-a
+london-b
+
 [docker_hosts]
 helsinki-a
 london-b
diff --git a/ansible/playbooks/zfs.yml b/ansible/playbooks/zfs.yml
new file mode 100644
index 0000000..e76056d
--- /dev/null
+++ b/ansible/playbooks/zfs.yml
@@ -0,0 +1,9 @@
+---
+# Deploy ZFS management (scrub scheduling, monitoring).
+# Usage: ansible-playbook playbooks/zfs.yml
+#        ansible-playbook playbooks/zfs.yml --check --diff
+
+- name: "ZFS management"
+  hosts: zfs_hosts
+  roles:
+    - zfs
diff --git a/ansible/roles/zfs/defaults/main.yml b/ansible/roles/zfs/defaults/main.yml
new file mode 100644
index 0000000..30f39fb
--- /dev/null
+++ b/ansible/roles/zfs/defaults/main.yml
@@ -0,0 +1,15 @@
+---
+# ZFS management defaults
+
+# List of ZFS pools to manage scrubs for.
+# Override per-host in host_vars.
+zfs_pools: []
+
+# Scrub schedule (cron format).
+# Default: weekly on Sunday at noon.
+zfs_scrub_weekday: "0"  # 0 = Sunday
+zfs_scrub_hour: "12"
+zfs_scrub_minute: "0"
+
+# Whether to enable ZFS scrub scheduling.
+zfs_scrub_enabled: true
diff --git a/ansible/roles/zfs/tasks/main.yml b/ansible/roles/zfs/tasks/main.yml
new file mode 100644
index 0000000..a3c4010
--- /dev/null
+++ b/ansible/roles/zfs/tasks/main.yml
@@ -0,0 +1,17 @@
+---
+# ZFS management: scrub scheduling.
+# Linux and FreeBSD hosts take exactly the same cron module arguments, so a
+# single task serves both platforms.
+
+# One root crontab entry per pool in zfs_pools. The entry is removed
+# (state: absent) when zfs_scrub_enabled is false, so the toggle stops scrubs.
+- name: "Manage ZFS scrub cron jobs"
+  ansible.builtin.cron:
+    name: "ZFS scrub {{ item }}"
+    minute: "{{ zfs_scrub_minute }}"
+    hour: "{{ zfs_scrub_hour }}"
+    weekday: "{{ zfs_scrub_weekday }}"
+    job: "/sbin/zpool scrub {{ item }}"
+    user: root
+    state: "{{ 'present' if zfs_scrub_enabled | bool else 'absent' }}"
+  loop: "{{ zfs_pools }}"
diff --git a/ansible/services/prometheus/rules/zfs.rules b/ansible/services/prometheus/rules/zfs.rules
new file mode 100644
index 0000000..1405aa8
--- /dev/null
+++ b/ansible/services/prometheus/rules/zfs.rules
@@ -0,0 +1,29 @@
+groups:
+- name: zfs
+  rules:
+  - alert: ZfsPoolDegraded
+    expr: node_zfs_zpool_state{state="degraded"} == 1
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "ZFS pool {{ $labels.zpool }} is degraded on {{ $labels.instance }}"
+      description: "Pool {{ $labels.zpool }} on {{ $labels.instance }} has entered a degraded state. Check disk health immediately."
+
+  - alert: ZfsPoolFaulted
+    expr: node_zfs_zpool_state{state="faulted"} == 1
+    for: 1m
+    labels:
+      severity: critical
+    annotations:
+      summary: "ZFS pool {{ $labels.zpool }} is FAULTED on {{ $labels.instance }}"
+      description: "Pool {{ $labels.zpool }} on {{ $labels.instance }} is faulted. Data may be at risk."
+
+  - alert: ZfsPoolOffline
+    expr: node_zfs_zpool_state{state="offline"} == 1
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "ZFS pool {{ $labels.zpool }} is offline on {{ $labels.instance }}"
+      description: "Pool {{ $labels.zpool }} on {{ $labels.instance }} is offline."