mirror of
https://github.com/RWejlgaard/pez-infra.git
synced 2026-05-06 04:14:43 +00:00
Add ZFS management role: scrub scheduling and pool monitoring (#18)
- New zfs role with cron-based scrub scheduling for Linux and FreeBSD
- Weekly Sunday scrubs at noon (matching existing manual crons)
- Add zfs_hosts inventory group with london-a and london-b
- Configure zfs_pools per host: zroot (london-a), hdd (london-b)
- Add Prometheus alert rules for degraded/faulted/offline pools
- Add zfs.yml playbook for targeted deploys

Captures the previously untracked scrub cron on london-a and re-enables the commented-out scrub on london-b.

Refs: PESO-93
This commit is contained in:
parent
3d8fb84d1f
commit
69918c8619
7 changed files with 92 additions and 0 deletions
|
|
@ -4,3 +4,6 @@ host_description: "Monitoring stack (Prometheus, Grafana)"
|
||||||
host_location: "London"
|
host_location: "London"
|
||||||
ansible_python_interpreter: /usr/local/bin/python3
|
ansible_python_interpreter: /usr/local/bin/python3
|
||||||
grafana_provisioning_dir: /usr/local/share/grafana/conf/provisioning
|
grafana_provisioning_dir: /usr/local/share/grafana/conf/provisioning
|
||||||
|
|
||||||
|
zfs_pools:
|
||||||
|
- zroot
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,9 @@ docker_services:
|
||||||
- smartctl-exporter
|
- smartctl-exporter
|
||||||
- plex-exporter
|
- plex-exporter
|
||||||
|
|
||||||
|
zfs_pools:
|
||||||
|
- hdd
|
||||||
|
|
||||||
common_ufw_allowed_ports:
|
common_ufw_allowed_ports:
|
||||||
- {port: 32400, proto: tcp, comment: "Plex Media Server"}
|
- {port: 32400, proto: tcp, comment: "Plex Media Server"}
|
||||||
- {port: 6881, proto: tcp, comment: "BitTorrent"}
|
- {port: 6881, proto: tcp, comment: "BitTorrent"}
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,10 @@ nuremberg-a ansible_host=100.117.235.28
|
||||||
[freebsd]
|
[freebsd]
|
||||||
london-a ansible_host=100.122.219.41
|
london-a ansible_host=100.122.219.41
|
||||||
|
|
||||||
|
[zfs_hosts]
|
||||||
|
london-a
|
||||||
|
london-b
|
||||||
|
|
||||||
[docker_hosts]
|
[docker_hosts]
|
||||||
helsinki-a
|
helsinki-a
|
||||||
london-b
|
london-b
|
||||||
|
|
|
||||||
9
ansible/playbooks/zfs.yml
Normal file
9
ansible/playbooks/zfs.yml
Normal file
|
|
@ -0,0 +1,9 @@
|
||||||
|
---
# Playbook: apply the zfs role to every host in the zfs_hosts inventory group.
#
# Apply:
#   ansible-playbook playbooks/zfs.yml
# Dry run, showing what would change:
#   ansible-playbook playbooks/zfs.yml --check --diff

- name: "ZFS management"
  hosts: zfs_hosts
  roles:
    - role: zfs
|
||||||
15
ansible/roles/zfs/defaults/main.yml
Normal file
15
ansible/roles/zfs/defaults/main.yml
Normal file
|
|
@ -0,0 +1,15 @@
|
||||||
|
---
# Default variables for the zfs role.

# Master switch for scrub scheduling.
zfs_scrub_enabled: true

# Names of the ZFS pools whose scrubs this role schedules.
# Empty here; set the real list per host in host_vars.
zfs_pools: []

# Cron fields for the scrub schedule.
# As shipped: once a week, Sunday at 12:00.
zfs_scrub_minute: "0"
zfs_scrub_hour: "12"
zfs_scrub_weekday: "0"  # 0 = Sunday
|
||||||
29
ansible/roles/zfs/tasks/main.yml
Normal file
29
ansible/roles/zfs/tasks/main.yml
Normal file
|
|
@ -0,0 +1,29 @@
|
||||||
|
---
# ZFS management: scrub scheduling via cron.
#
# Installs one entry per pool in zfs_pools into root's crontab. The schedule
# is taken from zfs_scrub_minute / zfs_scrub_hour / zfs_scrub_weekday.
#
# Linux and FreeBSD hosts share this single task: the cron module arguments
# and the /sbin/zpool path are the same on both, so no per-OS branch (and no
# gathered ansible_os_family fact) is needed.
#
# When zfs_scrub_enabled is false the task is skipped; cron entries that
# already exist are left in place, not removed.

- name: Install ZFS scrub cron jobs
  ansible.builtin.cron:
    # The name identifies the managed crontab entry, so it must stay stable
    # per pool for reruns to update the entry instead of adding a new one.
    name: "ZFS scrub {{ item }}"
    minute: "{{ zfs_scrub_minute }}"
    hour: "{{ zfs_scrub_hour }}"
    weekday: "{{ zfs_scrub_weekday }}"
    job: "/sbin/zpool scrub {{ item }}"
    user: root
  loop: "{{ zfs_pools }}"
  # `| bool` so a string such as "false" passed with -e is not treated as truthy.
  when: zfs_scrub_enabled | bool
|
||||||
29
ansible/services/prometheus/rules/zfs.rules
Normal file
29
ansible/services/prometheus/rules/zfs.rules
Normal file
|
|
@ -0,0 +1,29 @@
|
||||||
|
# ZFS pool health alerts, evaluated against the node_zfs_zpool_state metric.
# Each rule fires once the series for the selected `state` label has equalled
# 1 for the whole `for` window. The `zpool` and `instance` labels are used to
# name the affected pool and host in the annotations.
groups:
  - name: zfs
    rules:
      # Pool is still usable but has lost redundancy.
      - alert: ZfsPoolDegraded
        expr: node_zfs_zpool_state{state="degraded"} == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "ZFS pool {{ $labels.zpool }} is degraded on {{ $labels.instance }}"
          description: "Pool {{ $labels.zpool }} on {{ $labels.instance }} has entered a degraded state. Check disk health immediately."

      # Shorter window than the warnings: a faulted pool is treated as critical.
      - alert: ZfsPoolFaulted
        expr: node_zfs_zpool_state{state="faulted"} == 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "ZFS pool {{ $labels.zpool }} is FAULTED on {{ $labels.instance }}"
          description: "Pool {{ $labels.zpool }} on {{ $labels.instance }} is faulted. Data may be at risk."

      - alert: ZfsPoolOffline
        expr: node_zfs_zpool_state{state="offline"} == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "ZFS pool {{ $labels.zpool }} is offline on {{ $labels.instance }}"
          description: "Pool {{ $labels.zpool }} on {{ $labels.instance }} is offline."

      # Covers the remaining states in which the pool cannot serve I/O.
      # NOTE(review): state names taken from node_exporter's Linux zfs
      # collector; confirm the exporter on each host reports them.
      - alert: ZfsPoolUnavailable
        expr: node_zfs_zpool_state{state=~"unavail|suspended"} == 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "ZFS pool {{ $labels.zpool }} is {{ $labels.state }} on {{ $labels.instance }}"
          description: "Pool {{ $labels.zpool }} on {{ $labels.instance }} is {{ $labels.state }} and cannot serve I/O. Data may be at risk."
|
||||||
Loading…
Add table
Reference in a new issue