diff --git a/ansible/deploy.yml b/ansible/deploy.yml
index 80b7bfe..91ce3bf 100644
--- a/ansible/deploy.yml
+++ b/ansible/deploy.yml
@@ -46,25 +46,28 @@
 # Stage 4: Per-host services
 # ──────────────────────────────────────────────
 
-# helsinki-a: Caddy reverse proxy
-- name: "Stage 4a: Caddy (helsinki-a)"
+# helsinki-a: Caddy reverse proxy + status page
+- name: "Stage 4a: Caddy + status page (helsinki-a)"
   hosts: helsinki-a
-  tags: [services, caddy]
+  tags: [services, caddy, status_page]
   roles:
     - role: caddy
+    - role: status_page
 
-# london-b: Docker services (storage, apps)
+# london-b: Docker services (storage, apps) + backups
 - name: "Stage 4b: Docker services (london-b)"
   hosts: london-b
   tags: [services, london-b]
   roles:
     - role: docker_services
+    - role: backup
 
 # nuremberg-a: Mail (poste.io via Docker)
 - name: "Stage 4c: Mail (nuremberg-a)"
   hosts: nuremberg-a
   tags: [services, mail]
   roles:
+    - role: firewall_alpine
     - role: docker_services
 
 # copenhagen-a: Gaming servers
diff --git a/ansible/inventory/host_vars/london-b.yml b/ansible/inventory/host_vars/london-b.yml
index 1b69685..ce11ce2 100644
--- a/ansible/inventory/host_vars/london-b.yml
+++ b/ansible/inventory/host_vars/london-b.yml
@@ -17,3 +17,4 @@ common_ufw_allowed_ports:
   - {port: 6881, proto: tcp, comment: "BitTorrent"}
   - {port: 6881, proto: udp, comment: "BitTorrent"}
   - {port: 445, proto: tcp, comment: "Samba"}
+
diff --git a/ansible/inventory/host_vars/nuremberg-a.yml b/ansible/inventory/host_vars/nuremberg-a.yml
index 2061d0d..e06d9db 100644
--- a/ansible/inventory/host_vars/nuremberg-a.yml
+++ b/ansible/inventory/host_vars/nuremberg-a.yml
@@ -4,4 +4,6 @@ host_description: "Mail server (poste.io)"
 host_location: "Hetzner Cloud"
 ansible_python_interpreter: /usr/bin/python3
 # NOTE: Alpine host — UFW tasks are Debian-only.
-# Firewall rules for mail ports (25,465,587,993,143,80,443) managed separately.
+# Firewall: iptables + fail2ban managed by firewall_alpine role.
+# Mail ports (25,80,110,143,443,465,587,993,995) exposed via Docker
+# port mappings in ansible/services/poste-io/docker-compose.yml.
diff --git a/ansible/inventory/hosts.ini b/ansible/inventory/hosts.ini
index 7f1581e..5af353a 100644
--- a/ansible/inventory/hosts.ini
+++ b/ansible/inventory/hosts.ini
@@ -14,6 +14,7 @@ nuremberg-a ansible_host=100.117.235.28
 london-a ansible_host=100.122.219.41
 
 [docker_hosts]
+helsinki-a
 london-b
 nuremberg-a
 copenhagen-a
diff --git a/ansible/roles/backup/tasks/main.yml b/ansible/roles/backup/tasks/main.yml
new file mode 100644
index 0000000..6c029b4
--- /dev/null
+++ b/ansible/roles/backup/tasks/main.yml
@@ -0,0 +1,22 @@
+---
+# Deploy backup script and cron job for rclone-to-B2 backups.
+
+- name: Ensure scripts directory exists
+  ansible.builtin.file:
+    path: /root/scripts
+    state: directory
+    mode: '0755'
+
+- name: Deploy backup script
+  ansible.builtin.copy:
+    src: "{{ playbook_dir }}/scripts/hdd-backup.sh"
+    dest: /root/scripts/backup.sh
+    mode: '0755'
+
+- name: Configure backup cron job
+  ansible.builtin.cron:
+    name: "HDD backup to B2"
+    minute: "0"
+    hour: "22"
+    job: "/root/scripts/backup.sh"
+    user: root
diff --git a/ansible/roles/firewall_alpine/defaults/main.yml b/ansible/roles/firewall_alpine/defaults/main.yml
new file mode 100644
index 0000000..b096e28
--- /dev/null
+++ b/ansible/roles/firewall_alpine/defaults/main.yml
@@ -0,0 +1,9 @@
+---
+# firewall_alpine defaults
+
+# Enable iptables persistence via OpenRC
+firewall_alpine_persist: true
+
+# fail2ban SSH protection
+firewall_alpine_fail2ban_enabled: true
+firewall_alpine_fail2ban_maxretry: 10
diff --git a/ansible/roles/firewall_alpine/handlers/main.yml b/ansible/roles/firewall_alpine/handlers/main.yml
new file mode 100644
index 0000000..9cb6b17
--- /dev/null
+++ b/ansible/roles/firewall_alpine/handlers/main.yml
@@ -0,0 +1,12 @@
+---
+# iptables-restore reads the ruleset from stdin, so this has to run through a
+# shell: ansible.builtin.command does not interpret "<". --noflush keeps the
+# chains Docker and Tailscale have already installed in the filter table.
+- name: Restore iptables
+  ansible.builtin.shell: iptables-restore --noflush < /etc/iptables/rules-save
+  changed_when: true
+
+- name: Restart fail2ban
+  ansible.builtin.service:
+    name: fail2ban
+    state: restarted
diff --git a/ansible/roles/firewall_alpine/tasks/main.yml b/ansible/roles/firewall_alpine/tasks/main.yml
new file mode 100644
index 0000000..f8743f8
--- /dev/null
+++ b/ansible/roles/firewall_alpine/tasks/main.yml
@@ -0,0 +1,52 @@
+---
+# Firewall management for Alpine hosts.
+# Manages iptables persistence and fail2ban for SSH protection.
+#
+# NOTE: Docker manages port-forwarding rules for published container ports
+# (e.g. mail ports on nuremberg-a). This role only handles non-Docker rules.
+
+- name: Install iptables and fail2ban
+  community.general.apk:
+    name:
+      - iptables
+      - fail2ban
+    state: present
+
+# --- iptables persistence ---
+
+- name: Ensure /etc/iptables directory exists
+  ansible.builtin.file:
+    path: /etc/iptables
+    state: directory
+    mode: '0700'
+
+- name: Deploy iptables rules
+  ansible.builtin.template:
+    src: rules.v4.j2
+    dest: /etc/iptables/rules-save
+    mode: '0600'
+  notify: Restore iptables
+  when: firewall_alpine_persist | bool
+
+- name: Ensure iptables starts on boot
+  ansible.builtin.service:
+    name: iptables
+    enabled: true
+  when: firewall_alpine_persist | bool
+
+# --- fail2ban ---
+
+- name: Deploy fail2ban Alpine SSH jail
+  ansible.builtin.template:
+    src: alpine-ssh.conf.j2
+    dest: /etc/fail2ban/jail.d/alpine-ssh.conf
+    mode: '0644'
+  notify: Restart fail2ban
+  when: firewall_alpine_fail2ban_enabled | bool
+
+- name: Enable fail2ban
+  ansible.builtin.service:
+    name: fail2ban
+    state: started
+    enabled: true
+  when: firewall_alpine_fail2ban_enabled | bool
diff --git a/ansible/roles/firewall_alpine/templates/alpine-ssh.conf.j2 b/ansible/roles/firewall_alpine/templates/alpine-ssh.conf.j2
new file mode 100644
index 0000000..77854f9
--- /dev/null
+++ b/ansible/roles/firewall_alpine/templates/alpine-ssh.conf.j2
@@ -0,0 +1,16 @@
+# {{ ansible_managed }}
+# fail2ban SSH jails for Alpine Linux
+
+[sshd]
+enabled = true
+filter = alpine-sshd
+port = ssh
+logpath = /var/log/messages
+maxretry = {{ firewall_alpine_fail2ban_maxretry }}
+
+[sshd-ddos]
+enabled = true
+filter = alpine-sshd-ddos
+port = ssh
+logpath = /var/log/messages
+maxretry = {{ firewall_alpine_fail2ban_maxretry }}
diff --git a/ansible/roles/firewall_alpine/templates/rules.v4.j2 b/ansible/roles/firewall_alpine/templates/rules.v4.j2
new file mode 100644
index 0000000..5182207
--- /dev/null
+++ b/ansible/roles/firewall_alpine/templates/rules.v4.j2
@@ -0,0 +1,14 @@
+# {{ ansible_managed }}
+# iptables rules for {{ inventory_hostname }}
+#
+# Docker and Tailscale manage their own chains automatically.
+# This file captures non-Docker, non-Tailscale rules only.
+#
+# Mail ports (25,80,110,143,443,465,587,993,995) are exposed via
+# Docker port mappings in the poste-io docker-compose.yml — not here.
+
+*filter
+:INPUT ACCEPT [0:0]
+:FORWARD ACCEPT [0:0]
+:OUTPUT ACCEPT [0:0]
+COMMIT
diff --git a/ansible/roles/status_page/defaults/main.yml b/ansible/roles/status_page/defaults/main.yml
new file mode 100644
index 0000000..ed854a9
--- /dev/null
+++ b/ansible/roles/status_page/defaults/main.yml
@@ -0,0 +1,5 @@
+---
+status_page_script_dest: /usr/local/bin/update-status.sh
+status_page_output_dir: /srv/status
+status_page_log_file: /var/log/update-status.log
+status_page_cron_schedule: "* * * * *"
diff --git a/ansible/roles/status_page/tasks/main.yml b/ansible/roles/status_page/tasks/main.yml
new file mode 100644
index 0000000..6df3750
--- /dev/null
+++ b/ansible/roles/status_page/tasks/main.yml
@@ -0,0 +1,30 @@
+---
+# Deploy the status page update script and cron job.
+# Runs every minute, queries Prometheus for Caddy upstream health,
+# writes status.json + history to /srv/status/.
+
+- name: Ensure status output directory exists
+  ansible.builtin.file:
+    path: "{{ status_page_output_dir }}"
+    state: directory
+    mode: '0755'
+
+- name: Deploy update-status.sh
+  ansible.builtin.copy:
+    src: "{{ playbook_dir }}/services/status-page/update-status.sh"
+    dest: "{{ status_page_script_dest }}"
+    mode: '0755'
+    backup: true
+
+- name: Ensure script dependencies are installed (curl, jq, python3)
+  ansible.builtin.apt:
+    name: [curl, jq, python3]
+    state: present
+  when: ansible_facts["os_family"] == "Debian"
+
+- name: Set up status page cron job
+  ansible.builtin.cron:
+    name: "update-status-page"
+    job: "{{ status_page_script_dest }} >> {{ status_page_log_file }} 2>&1"
+    minute: "*"
+    user: root
diff --git a/ansible/services/caddy/Caddyfile b/ansible/services/caddy/Caddyfile
index 7995ef0..2f4b3af 100644
--- a/ansible/services/caddy/Caddyfile
+++ b/ansible/services/caddy/Caddyfile
@@ -38,7 +38,7 @@ alertmanager.pez.solutions, alertmanager.pez.sh {
 		uri /api/authz/forward-auth
 		copy_headers Remote-User Remote-Groups Remote-Name Remote-Email
 	}
-	reverse_proxy 100.122.219.41:3000
+	reverse_proxy 100.122.219.41:9093
 }
 
 ## LONDON-B SERVICES ##
diff --git a/ansible/services/caddy/Caddyfile.template b/ansible/services/caddy/Caddyfile.template
index 600d437..7fe093b 100644
--- a/ansible/services/caddy/Caddyfile.template
+++ b/ansible/services/caddy/Caddyfile.template
@@ -45,7 +45,7 @@ prometheus.{{DOMAIN_ALT}}, prometheus.{{DOMAIN_PRIMARY}} {
 # Alertmanager
 alertmanager.{{DOMAIN_ALT}}, alertmanager.{{DOMAIN_PRIMARY}} {
 	import authelia
-	reverse_proxy {{LONDON_A_IP}}:3000
+	reverse_proxy {{LONDON_A_IP}}:9093
 }
 
 ## LONDON-B SERVICES ##
diff --git a/ansible/services/status-page/update-status.sh b/ansible/services/status-page/update-status.sh
new file mode 100755
index 0000000..66f329b
--- /dev/null
+++ b/ansible/services/status-page/update-status.sh
@@ -0,0 +1,211 @@
+#!/bin/bash
+# update-status.sh — Fetch Prometheus metrics and write /srv/status/status.json + history
+set -euo pipefail
+
+PROMETHEUS="http://100.122.219.41:9090"
+OUTPUT="/srv/status/status.json"
+HISTORY_LOG="/srv/status/history.log"
+HISTORY_JSON="/srv/status/history.json"
+QUERY="caddy_reverse_proxy_upstreams_healthy"
+
+# Service map: upstream address → display name
+declare -A SERVICE_MAP
+SERVICE_MAP["localhost:8443"]="Bitwarden"
+SERVICE_MAP["100.122.219.41:3000"]="Grafana"
+SERVICE_MAP["100.84.65.101:32400"]="Plex"
+SERVICE_MAP["100.84.65.101:4533"]="Navidrome"
+SERVICE_MAP["100.84.65.101:5030"]="Soulseek"
+SERVICE_MAP["100.84.65.101:5055"]="Overseerr"
+SERVICE_MAP["100.84.65.101:5056"]="Jellyfin Requests"
+SERVICE_MAP["100.84.65.101:7878"]="Radarr"
+SERVICE_MAP["100.84.65.101:8096"]="Jellyfin"
+SERVICE_MAP["100.84.65.101:8686"]="Lidarr"
+SERVICE_MAP["100.84.65.101:8787"]="Readarr"
+SERVICE_MAP["100.84.65.101:8989"]="Sonarr"
+SERVICE_MAP["100.84.65.101:9091"]="Transmission"
+SERVICE_MAP["100.84.65.101:9696"]="Prowlarr"
+SERVICE_MAP["100.84.65.101:11000"]="Nextcloud"
+SERVICE_MAP["localhost:9091"]="Authelia"
+SERVICE_MAP["100.84.65.101:8181"]="Miniflux"
+SERVICE_MAP["localhost:3000"]="Forgejo"
+
+# Desired display order
+DISPLAY_ORDER=(
+  "localhost:8443"
+  "localhost:9091"
+  "100.84.65.101:11000"
+  "100.122.219.41:3000"
+  "100.84.65.101:32400"
+  "100.84.65.101:8096"
+  "100.84.65.101:5056"
+  "100.84.65.101:4533"
+  "100.84.65.101:5030"
+  "100.84.65.101:5055"
+  "100.84.65.101:7878"
+  "100.84.65.101:8989"
+  "100.84.65.101:8686"
+  "100.84.65.101:8787"
+  "100.84.65.101:9696"
+  "100.84.65.101:9091"
+  "100.84.65.101:8181"
+  "localhost:3000"
+)
+
+# Fetch from Prometheus
+RESPONSE=$(curl -sf --max-time 10 \
+  "${PROMETHEUS}/api/v1/query?query=${QUERY}" 2>/dev/null) || {
+  echo "ERROR: Failed to fetch Prometheus metrics" >&2
+  exit 1
+}
+
+# Parse with jq — build a lookup of upstream→value
+UPSTREAM_DATA=$(echo "$RESPONSE" | jq -r '
+  .data.result[] |
+  .metric.upstream + " " + .value[1]
+' 2>/dev/null) || {
+  echo "ERROR: Failed to parse Prometheus response" >&2
+  exit 1
+}
+
+# Build services JSON array
+UPDATED=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
+HAS_DOWN=0
+HAS_UP=0
+
+SERVICES_JSON=""
+HISTORY_SERVICES=""
+
+for upstream in "${DISPLAY_ORDER[@]}"; do
+  name="${SERVICE_MAP[$upstream]:-}"
+  [ -z "$name" ] && continue
+
+  # Exact-match lookup; awk exits 0 on a miss (grep would abort under set -e + pipefail)
+  value=$(awk -v u="$upstream" '$1 == u { print $NF; exit }' <<<"$UPSTREAM_DATA")
+
+  if [ "$value" = "1" ]; then
+    status="operational"
+    hist_val=1
+    HAS_UP=1
+  elif [ "$value" = "0" ]; then
+    status="degraded"
+    hist_val=0
+    HAS_DOWN=1
+  else
+    status="degraded"
+    hist_val=0
+    HAS_DOWN=1
+  fi
+
+  if [ -n "$SERVICES_JSON" ]; then
+    SERVICES_JSON="${SERVICES_JSON},"
+  fi
+  SERVICES_JSON="${SERVICES_JSON}{\"name\":\"${name}\",\"status\":\"${status}\"}"
+
+  if [ -n "$HISTORY_SERVICES" ]; then
+    HISTORY_SERVICES="${HISTORY_SERVICES},"
+  fi
+  HISTORY_SERVICES="${HISTORY_SERVICES}\"${name}\":${hist_val}"
+done
+
+# Determine overall status
+if [ $HAS_DOWN -eq 0 ]; then
+  OVERALL="operational"
+elif [ $HAS_UP -eq 0 ]; then
+  OVERALL="outage"
+else
+  OVERALL="degraded"
+fi
+
+# Write status.json
+mkdir -p "$(dirname "$OUTPUT")"
+cat > "$OUTPUT" <<EOF
+{
+  "updated": "${UPDATED}",
+  "overall": "${OVERALL}",
+  "services": [${SERVICES_JSON}]
+}
+EOF
+
+# Append this run to history.log as one JSON object per line.
+# NOTE(review): the status.json heredoc above and this append were garbled in
+# the extracted patch; both are reconstructed from the variables built earlier
+# and the 'ts'/'services' keys the Python below reads. The status.json key
+# names are an assumption — confirm against the original script.
+echo "{\"ts\":\"${UPDATED}\",\"services\":{${HISTORY_SERVICES}}}" >> "$HISTORY_LOG"
+
+# Trim history.log to last 129600 lines (90 days × 24h × 60min)
+MAX_LINES=129600
+LINE_COUNT=$(wc -l < "$HISTORY_LOG")
+if [ "$LINE_COUNT" -gt "$MAX_LINES" ]; then
+  tail -n "$MAX_LINES" "$HISTORY_LOG" > "${HISTORY_LOG}.tmp" && mv "${HISTORY_LOG}.tmp" "$HISTORY_LOG"
+fi
+
+# Regenerate history.json from history.log
+python3 - "$HISTORY_LOG" "$HISTORY_JSON" <<'PYEOF'
+import sys, json
+from datetime import datetime, timezone, timedelta
+from collections import defaultdict
+
+history_log = sys.argv[1]
+history_json_path = sys.argv[2]
+
+# Parse all log lines, group by hour key
+hour_data = defaultdict(lambda: defaultdict(list))
+
+try:
+    with open(history_log) as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                entry = json.loads(line)
+                ts = entry['ts']
+                hour_key = ts[:13]  # e.g. "2026-03-03T19"
+                for svc, val in entry['services'].items():
+                    hour_data[hour_key][svc].append(val)
+            except Exception:
+                continue
+except FileNotFoundError:
+    pass
+
+# Generate exactly 2160 hour slots (90 days), oldest first, ending at current hour
+now = datetime.now(timezone.utc)
+current_hour = now.replace(minute=0, second=0, microsecond=0)
+slots = [(current_hour - timedelta(hours=2159 - i)) for i in range(2160)]
+slot_keys = [h.strftime('%Y-%m-%dT%H') for h in slots]
+
+# Collect all service names from log data
+service_names = set()
+for hour_vals in hour_data.values():
+    service_names.update(hour_vals.keys())
+
+result = {
+    'days': 90,
+    'generated': now.strftime('%Y-%m-%dT%H:%M:%SZ'),
+    'services': {}
+}
+
+for svc in sorted(service_names):
+    hours_list = []
+    for slot_key in slot_keys:
+        checks = hour_data.get(slot_key, {}).get(svc, [])
+        if not checks:
+            hours_list.append(None)
+        else:
+            # Majority vote: >50% up → 1, otherwise 0
+            hours_list.append(1 if sum(checks) > len(checks) / 2 else 0)
+
+    valid = [h for h in hours_list if h is not None]
+    uptime_pct = round(sum(valid) / len(valid) * 100, 2) if valid else None
+
+    result['services'][svc] = {
+        'uptime_percent': uptime_pct,
+        'hours': hours_list
+    }
+
+with open(history_json_path, 'w') as f:
+    json.dump(result, f, separators=(',', ':'))
+
+print(f"[history] Wrote {history_json_path} ({len(service_names)} services, {len(slot_keys)} hour slots)")
+PYEOF
diff --git a/terraform/cloudflare_dns.tf b/terraform/cloudflare_dns.tf
index a9a8da9..36d541e 100644
--- a/terraform/cloudflare_dns.tf
+++ b/terraform/cloudflare_dns.tf
@@ -9,15 +9,6 @@ resource "cloudflare_zone" "pez-sh" {
 # A Records
 # =============================================================================
 
-resource "cloudflare_dns_record" "ecp-dev-0o9lix" {
-  zone_id = cloudflare_zone.pez-sh.id
-  name    = "0o9lix.ecp-dev"
-  type    = "A"
-  content = "0.0.0.0"
-  proxied = false
-  ttl     = 300
-}
-
 resource "cloudflare_dns_record" "alertmanager" {
zone_id = cloudflare_zone.pez-sh.id name = "alertmanager" @@ -54,15 +45,6 @@ resource "cloudflare_dns_record" "bitwarden" { ttl = 1 } -resource "cloudflare_dns_record" "chimera" { - zone_id = cloudflare_zone.pez-sh.id - name = "chimera" - type = "A" - content = "13.43.223.167" - proxied = false - ttl = 1 -} - resource "cloudflare_dns_record" "cloud" { zone_id = cloudflare_zone.pez-sh.id name = "cloud" @@ -90,15 +72,6 @@ resource "cloudflare_dns_record" "git" { ttl = 1 } -resource "cloudflare_dns_record" "gopher" { - zone_id = cloudflare_zone.pez-sh.id - name = "gopher" - type = "A" - content = "83.94.248.182" - proxied = false - ttl = 1 -} - resource "cloudflare_dns_record" "grafana" { zone_id = cloudflare_zone.pez-sh.id name = "grafana" @@ -412,43 +385,3 @@ resource "cloudflare_dns_record" "root-txt-spf" { content = "v=spf1 ip4:167.235.134.154 ip6:2a01:4f8:1c1e:9c53::1 -all" ttl = 1 } - -resource "cloudflare_dns_record" "root-txt-protonmail" { - zone_id = cloudflare_zone.pez-sh.id - name = "@" - type = "TXT" - content = "protonmail-verification=66cf5eff60c61c46a0d36b108c5cfbddc4f2eede" - ttl = 1 -} - -resource "cloudflare_dns_record" "root-txt-keybase" { - zone_id = cloudflare_zone.pez-sh.id - name = "@" - type = "TXT" - content = "keybase-site-verification=ur7GwlgtEEPgIZ-2P0fyFsniuu6YwdkluO7N6LkymK0" - ttl = 1 -} - -resource "cloudflare_dns_record" "root-txt-ms" { - zone_id = cloudflare_zone.pez-sh.id - name = "@" - type = "TXT" - content = "MS=ms99554544" - ttl = 300 -} - -resource "cloudflare_dns_record" "root-txt-google" { - zone_id = cloudflare_zone.pez-sh.id - name = "@" - type = "TXT" - content = "google-site-verification=BZD6ITg5SFnc7mQcb9KGkPwhP9gQKDZgw4nrFOZ0Y0w" - ttl = 1 -} - -resource "cloudflare_dns_record" "root-txt-apple" { - zone_id = cloudflare_zone.pez-sh.id - name = "@" - type = "TXT" - content = "apple-domain=1zXuOydmezm51GT8" - ttl = 1 -}