mirror of
https://github.com/RWejlgaard/pez-infra.git
synced 2026-05-06 04:14:43 +00:00
Add ZFS management role: scrub scheduling and pool monitoring (#18)
- New zfs role with cron-based scrub scheduling for Linux and FreeBSD
- Weekly Sunday scrubs at noon (matching existing manual crons)
- Add zfs_hosts inventory group with london-a and london-b
- Configure zfs_pools per host: zroot (london-a), hdd (london-b)
- Add Prometheus alert rules for degraded/faulted/offline pools
- Add zfs.yml playbook for targeted deploys

Captures the previously untracked scrub cron on london-a and re-enables the commented-out scrub on london-b.

Refs: PESO-93
This commit is contained in:
parent
3d8fb84d1f
commit
69918c8619
7 changed files with 92 additions and 0 deletions
|
|
@ -4,3 +4,6 @@ host_description: "Monitoring stack (Prometheus, Grafana)"
|
|||
host_location: "London"
|
||||
ansible_python_interpreter: /usr/local/bin/python3
|
||||
grafana_provisioning_dir: /usr/local/share/grafana/conf/provisioning
|
||||
|
||||
zfs_pools:
|
||||
- zroot
|
||||
|
|
|
|||
|
|
@ -12,6 +12,9 @@ docker_services:
|
|||
- smartctl-exporter
|
||||
- plex-exporter
|
||||
|
||||
zfs_pools:
|
||||
- hdd
|
||||
|
||||
common_ufw_allowed_ports:
|
||||
- {port: 32400, proto: tcp, comment: "Plex Media Server"}
|
||||
- {port: 6881, proto: tcp, comment: "BitTorrent"}
|
||||
|
|
|
|||
|
|
@ -13,6 +13,10 @@ nuremberg-a ansible_host=100.117.235.28
|
|||
[freebsd]
|
||||
london-a ansible_host=100.122.219.41
|
||||
|
||||
[zfs_hosts]
|
||||
london-a
|
||||
london-b
|
||||
|
||||
[docker_hosts]
|
||||
helsinki-a
|
||||
london-b
|
||||
|
|
|
|||
9
ansible/playbooks/zfs.yml
Normal file
9
ansible/playbooks/zfs.yml
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
---
# Playbook: apply the zfs role to every host in the zfs_hosts inventory group.
#
# Run it:   ansible-playbook playbooks/zfs.yml
# Dry run:  ansible-playbook playbooks/zfs.yml --check --diff

- name: ZFS management
  hosts: zfs_hosts
  roles:
    - role: zfs
|
||||
15
ansible/roles/zfs/defaults/main.yml
Normal file
15
ansible/roles/zfs/defaults/main.yml
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
---
# Defaults for the zfs role.

# Toggle for ZFS scrub scheduling.
zfs_scrub_enabled: true

# Pools that get a scheduled scrub. Empty here; set per host in host_vars.
zfs_pools: []

# When the scrub runs, expressed as cron fields (kept as strings).
# Defaults to every Sunday at 12:00.
zfs_scrub_minute: "0"
zfs_scrub_hour: "12"
zfs_scrub_weekday: "0"  # cron weekday 0 is Sunday
|
||||
29
ansible/roles/zfs/tasks/main.yml
Normal file
29
ansible/roles/zfs/tasks/main.yml
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
---
# ZFS management: scrub scheduling.
#
# One cron entry per pool in zfs_pools is written to root's crontab. A single
# task serves both Linux and FreeBSD: the cron module arguments and the zpool
# invocation are the same on both, so no per-OS branching is needed.
#
# Variables read:
#   zfs_pools           list of pool names to schedule scrubs for
#   zfs_scrub_enabled   true  -> the cron entries are present
#                       false -> the cron entries are removed
#   zfs_scrub_minute, zfs_scrub_hour, zfs_scrub_weekday
#                       cron schedule fields
#   zfs_zpool_path      optional path to the zpool binary; /sbin/zpool if unset

- name: Manage ZFS scrub cron jobs
  ansible.builtin.cron:
    # The entry name is the idempotency key, so it stays "ZFS scrub <pool>"
    # to keep matching entries installed by earlier runs.
    name: "ZFS scrub {{ item }}"
    minute: "{{ zfs_scrub_minute }}"
    hour: "{{ zfs_scrub_hour }}"
    weekday: "{{ zfs_scrub_weekday }}"
    job: "{{ zfs_zpool_path | default('/sbin/zpool') }} scrub {{ item }}"
    user: root
    # Removing the entry when scheduling is switched off makes the toggle
    # effective on hosts that already have the job installed.
    state: "{{ 'present' if zfs_scrub_enabled | bool else 'absent' }}"
  loop: "{{ zfs_pools }}"
|
||||
29
ansible/services/prometheus/rules/zfs.rules
Normal file
29
ansible/services/prometheus/rules/zfs.rules
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
# Alerting rules for ZFS pool health.
#
# Every rule fires when a node_zfs_zpool_state series with the given `state`
# label has value 1 for the configured duration. Annotations use the `zpool`
# and `instance` labels of the matching series.
# NOTE(review): confirm that the exporter on every ZFS host (including the
# FreeBSD one) actually exposes node_zfs_zpool_state.
groups:
  - name: zfs
    rules:
      - alert: ZfsPoolDegraded
        expr: node_zfs_zpool_state{state="degraded"} == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "ZFS pool {{ $labels.zpool }} is degraded on {{ $labels.instance }}"
          description: "Pool {{ $labels.zpool }} on {{ $labels.instance }} has entered a degraded state. Check disk health immediately."

      - alert: ZfsPoolFaulted
        expr: node_zfs_zpool_state{state="faulted"} == 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "ZFS pool {{ $labels.zpool }} is FAULTED on {{ $labels.instance }}"
          description: "Pool {{ $labels.zpool }} on {{ $labels.instance }} is faulted. Data may be at risk."

      - alert: ZfsPoolOffline
        expr: node_zfs_zpool_state{state="offline"} == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "ZFS pool {{ $labels.zpool }} is offline on {{ $labels.instance }}"
          description: "Pool {{ $labels.zpool }} on {{ $labels.instance }} is offline."

      # Remaining non-online states that the rules above do not match.
      # Without this rule a pool in one of them raises no alert at all.
      - alert: ZfsPoolUnavailable
        expr: node_zfs_zpool_state{state=~"unavail|suspended|removed"} == 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "ZFS pool {{ $labels.zpool }} is {{ $labels.state }} on {{ $labels.instance }}"
          description: "Pool {{ $labels.zpool }} on {{ $labels.instance }} reports state {{ $labels.state }}."
|
||||
Loading…
Add table
Reference in a new issue