diff --git a/ansible/inventory/host_vars/london-a.yml b/ansible/inventory/host_vars/london-a.yml
index f5d4c67..f67f2bc 100644
--- a/ansible/inventory/host_vars/london-a.yml
+++ b/ansible/inventory/host_vars/london-a.yml
@@ -4,3 +4,6 @@ host_description: "Monitoring stack (Prometheus, Grafana)"
 host_location: "London"
 ansible_python_interpreter: /usr/local/bin/python3
 grafana_provisioning_dir: /usr/local/share/grafana/conf/provisioning
+
+zfs_pools:
+  - zroot
diff --git a/ansible/inventory/host_vars/london-b.yml b/ansible/inventory/host_vars/london-b.yml
index 8855e67..fbb267b 100644
--- a/ansible/inventory/host_vars/london-b.yml
+++ b/ansible/inventory/host_vars/london-b.yml
@@ -12,6 +12,9 @@ docker_services:
   - smartctl-exporter
   - plex-exporter
 
+zfs_pools:
+  - hdd
+
 common_ufw_allowed_ports:
   - {port: 32400, proto: tcp, comment: "Plex Media Server"}
   - {port: 6881, proto: tcp, comment: "BitTorrent"}
diff --git a/ansible/inventory/hosts.ini b/ansible/inventory/hosts.ini
index 7f1581e..9ad3aa9 100644
--- a/ansible/inventory/hosts.ini
+++ b/ansible/inventory/hosts.ini
@@ -13,6 +13,10 @@ nuremberg-a ansible_host=100.117.235.28
 [freebsd]
 london-a ansible_host=100.122.219.41
 
+[zfs_hosts]
+london-a
+london-b
+
 [docker_hosts]
 london-b
 nuremberg-a
diff --git a/ansible/playbooks/zfs.yml b/ansible/playbooks/zfs.yml
new file mode 100644
index 0000000..e76056d
--- /dev/null
+++ b/ansible/playbooks/zfs.yml
@@ -0,0 +1,9 @@
+---
+# Deploy ZFS management (scrub scheduling, monitoring).
+# Usage: ansible-playbook playbooks/zfs.yml
+#        ansible-playbook playbooks/zfs.yml --check --diff
+
+- name: "ZFS management"
+  hosts: zfs_hosts
+  roles:
+    - zfs
diff --git a/ansible/roles/zfs/defaults/main.yml b/ansible/roles/zfs/defaults/main.yml
new file mode 100644
index 0000000..30f39fb
--- /dev/null
+++ b/ansible/roles/zfs/defaults/main.yml
@@ -0,0 +1,15 @@
+---
+# ZFS management defaults
+
+# List of ZFS pools to manage scrubs for.
+# Override per-host in host_vars.
+zfs_pools: []
+
+# Scrub schedule (cron format).
+# Default: weekly on Sunday at noon.
+zfs_scrub_weekday: "0"  # 0 = Sunday
+zfs_scrub_hour: "12"
+zfs_scrub_minute: "0"
+
+# Whether to enable ZFS scrub scheduling.
+zfs_scrub_enabled: true
diff --git a/ansible/roles/zfs/tasks/main.yml b/ansible/roles/zfs/tasks/main.yml
new file mode 100644
index 0000000..a3c4010
--- /dev/null
+++ b/ansible/roles/zfs/tasks/main.yml
@@ -0,0 +1,15 @@
+---
+# ZFS management: scrub scheduling, pool monitoring.
+# ansible.builtin.cron manages the root crontab on both Linux and
+# FreeBSD, so a single unconditional task covers both platforms.
+
+- name: Install ZFS scrub cron jobs
+  ansible.builtin.cron:
+    name: "ZFS scrub {{ item }}"
+    minute: "{{ zfs_scrub_minute }}"
+    hour: "{{ zfs_scrub_hour }}"
+    weekday: "{{ zfs_scrub_weekday }}"
+    job: "/sbin/zpool scrub {{ item }}"
+    user: root
+  loop: "{{ zfs_pools }}"
+  when: zfs_scrub_enabled
diff --git a/ansible/services/prometheus/rules/zfs.rules b/ansible/services/prometheus/rules/zfs.rules
new file mode 100644
index 0000000..1405aa8
--- /dev/null
+++ b/ansible/services/prometheus/rules/zfs.rules
@@ -0,0 +1,29 @@
+groups:
+- name: zfs
+  rules:
+  - alert: ZfsPoolDegraded
+    expr: node_zfs_zpool_state{state="degraded"} == 1
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "ZFS pool {{ $labels.zpool }} is degraded on {{ $labels.instance }}"
+      description: "Pool {{ $labels.zpool }} on {{ $labels.instance }} has entered a degraded state. Check disk health immediately."
+
+  - alert: ZfsPoolFaulted
+    expr: node_zfs_zpool_state{state="faulted"} == 1
+    for: 1m
+    labels:
+      severity: critical
+    annotations:
+      summary: "ZFS pool {{ $labels.zpool }} is FAULTED on {{ $labels.instance }}"
+      description: "Pool {{ $labels.zpool }} on {{ $labels.instance }} is faulted. Data may be at risk."
+
+  - alert: ZfsPoolOffline
+    expr: node_zfs_zpool_state{state="offline"} == 1
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "ZFS pool {{ $labels.zpool }} is offline on {{ $labels.instance }}"
+      description: "Pool {{ $labels.zpool }} on {{ $labels.instance }} is offline."