capture helsinki-a status page cron in repo

Add a status_page role that deploys update-status.sh and its cron job.
The script queries Prometheus for Caddy upstream health and writes
status.json + history to /srv/status/ every minute.

refs: PESO-94
This commit is contained in:
Rasmus Wejlgaard 2026-03-29 14:38:00 +00:00
parent 42eba42522
commit 01b6220eb8
4 changed files with 250 additions and 3 deletions

View file

@ -46,12 +46,13 @@
# Stage 4: Per-host services # Stage 4: Per-host services
# ────────────────────────────────────────────── # ──────────────────────────────────────────────
# helsinki-a: Caddy reverse proxy # helsinki-a: Caddy reverse proxy + status page
- name: "Stage 4a: Caddy (helsinki-a)" - name: "Stage 4a: Caddy + status page (helsinki-a)"
hosts: helsinki-a hosts: helsinki-a
tags: [services, caddy] tags: [services, caddy, status_page]
roles: roles:
- role: caddy - role: caddy
- role: status_page
# london-b: Docker services (storage, apps) + backups # london-b: Docker services (storage, apps) + backups
- name: "Stage 4b: Docker services (london-b)" - name: "Stage 4b: Docker services (london-b)"

View file

@ -0,0 +1,5 @@
---
# Absolute path where update-status.sh is installed on the host.
status_page_script_dest: /usr/local/bin/update-status.sh
# Directory the status page is served from (status.json / history.json).
status_page_output_dir: /srv/status
# Cron job stdout/stderr is appended to this log file.
status_page_log_file: /var/log/update-status.log
# Five-field cron expression for how often the update script runs
# (default: every minute).
status_page_cron_schedule: "* * * * *"

View file

@ -0,0 +1,30 @@
---
# Deploy the status page update script and its cron job.
# The script queries Prometheus for Caddy upstream health every minute and
# writes status.json + history files into the status output directory.

- name: Ensure status output directory exists
  ansible.builtin.file:
    path: "{{ status_page_output_dir }}"
    state: directory
    mode: '0755'

- name: Deploy update-status.sh
  ansible.builtin.copy:
    src: "{{ playbook_dir }}/services/status-page/update-status.sh"
    dest: "{{ status_page_script_dest }}"
    mode: '0755'
    backup: true

# update-status.sh needs curl (Prometheus HTTP API), jq (JSON parsing) and
# python3 (history.json generation) — ensure all three, not just python3.
- name: Ensure script dependencies are installed (curl, jq, python3)
  ansible.builtin.apt:
    name:
      - python3
      - curl
      - jq
    state: present
  when: ansible_facts["os_family"] == "Debian"

# The cron module takes individual time fields, so split the five-field
# status_page_cron_schedule expression; the default "* * * * *" keeps the
# original every-minute behavior.
- name: Set up status page cron job
  ansible.builtin.cron:
    name: "update-status-page"
    job: "{{ status_page_script_dest }} >> {{ status_page_log_file }} 2>&1"
    minute: "{{ (status_page_cron_schedule.split())[0] }}"
    hour: "{{ (status_page_cron_schedule.split())[1] }}"
    day: "{{ (status_page_cron_schedule.split())[2] }}"
    month: "{{ (status_page_cron_schedule.split())[3] }}"
    weekday: "{{ (status_page_cron_schedule.split())[4] }}"
    user: root

View file

@ -0,0 +1,211 @@
#!/bin/bash
# update-status.sh — Fetch Prometheus metrics and write /srv/status/status.json + history
set -euo pipefail

# Endpoints and output locations.
PROMETHEUS="http://100.122.219.41:9090"
OUTPUT="/srv/status/status.json"
HISTORY_LOG="/srv/status/history.log"
HISTORY_JSON="/srv/status/history.json"
QUERY="caddy_reverse_proxy_upstreams_healthy"

# Service map: upstream address → display name.
# Populated from the pipe-separated table below for easier maintenance.
declare -A SERVICE_MAP
while IFS='|' read -r addr label; do
  SERVICE_MAP["$addr"]="$label"
done <<'MAP'
localhost:8443|Bitwarden
100.122.219.41:3000|Grafana
100.84.65.101:32400|Plex
100.84.65.101:4533|Navidrome
100.84.65.101:5030|Soulseek
100.84.65.101:5055|Overseerr
100.84.65.101:5056|Jellyfin Requests
100.84.65.101:7878|Radarr
100.84.65.101:8096|Jellyfin
100.84.65.101:8686|Lidarr
100.84.65.101:8787|Readarr
100.84.65.101:8989|Sonarr
100.84.65.101:9091|Transmission
100.84.65.101:9696|Prowlarr
100.84.65.101:11000|Nextcloud
localhost:9091|Authelia
100.84.65.101:8181|Miniflux
localhost:3000|Forgejo
MAP
# Desired display order on the page, grouped by role/host.
DISPLAY_ORDER=()
# Auth + core services (helsinki-a)
DISPLAY_ORDER+=("localhost:8443" "localhost:9091")
# Storage / monitoring
DISPLAY_ORDER+=("100.84.65.101:11000" "100.122.219.41:3000")
# Media servers
DISPLAY_ORDER+=("100.84.65.101:32400" "100.84.65.101:8096" "100.84.65.101:5056")
DISPLAY_ORDER+=("100.84.65.101:4533" "100.84.65.101:5030" "100.84.65.101:5055")
# *arr stack + downloads
DISPLAY_ORDER+=("100.84.65.101:7878" "100.84.65.101:8989" "100.84.65.101:8686")
DISPLAY_ORDER+=("100.84.65.101:8787" "100.84.65.101:9696" "100.84.65.101:9091")
# Misc apps
DISPLAY_ORDER+=("100.84.65.101:8181" "localhost:3000")
# Fetch current upstream health from Prometheus.
if ! RESPONSE=$(curl -sf --max-time 10 \
    "${PROMETHEUS}/api/v1/query?query=${QUERY}" 2>/dev/null); then
  echo "ERROR: Failed to fetch Prometheus metrics" >&2
  exit 1
fi

# Flatten the query result into "upstream value" lines for easy lookup.
if ! UPSTREAM_DATA=$(jq -r '
    .data.result[] |
    .metric.upstream + " " + .value[1]
    ' <<<"$RESPONSE" 2>/dev/null); then
  echo "ERROR: Failed to parse Prometheus response" >&2
  exit 1
fi
# Build services JSON array and the per-check history key/value pairs.
UPDATED=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
HAS_DOWN=0
HAS_UP=0
SERVICES_JSON=""
HISTORY_SERVICES=""

# lookup_health ADDRESS DATA — print the health value ("1"/"0") for an exact
# upstream address from the "upstream value" lines in DATA; prints nothing
# when the upstream is absent, and always exits 0.
# Fixes a bug in the previous grep|awk|head pipeline: when an upstream was
# missing from the Prometheus data, grep exited 1, which — under
# `set -euo pipefail` — made the command substitution fail and killed the
# whole script before the unknown→degraded fallback could run. The exact
# $1 comparison also avoids accidental substring matches.
lookup_health() {
  local addr=$1 data=$2
  awk -v u="$addr" '$1 == u { print $2; exit }' <<<"$data"
}

for upstream in "${DISPLAY_ORDER[@]}"; do
  name="${SERVICE_MAP[$upstream]:-}"
  [ -z "$name" ] && continue
  value=$(lookup_health "$upstream" "$UPSTREAM_DATA")
  if [ "$value" = "1" ]; then
    status="operational"
    hist_val=1
    HAS_UP=1
  else
    # Both an explicit 0 and a missing/unknown value count as degraded.
    status="degraded"
    hist_val=0
    HAS_DOWN=1
  fi
  if [ -n "$SERVICES_JSON" ]; then
    SERVICES_JSON="${SERVICES_JSON},"
  fi
  SERVICES_JSON="${SERVICES_JSON}{\"name\":\"${name}\",\"status\":\"${status}\"}"
  if [ -n "$HISTORY_SERVICES" ]; then
    HISTORY_SERVICES="${HISTORY_SERVICES},"
  fi
  HISTORY_SERVICES="${HISTORY_SERVICES}\"${name}\":${hist_val}"
done
# Derive the overall banner state from the per-service flags:
# nothing down → operational; nothing up → outage; mixed → degraded.
if (( HAS_DOWN == 0 )); then
  OVERALL="operational"
elif (( HAS_UP == 0 )); then
  OVERALL="outage"
else
  OVERALL="degraded"
fi

# Write status.json atomically enough for a one-writer setup.
mkdir -p "$(dirname "$OUTPUT")"
{
  printf '{\n'
  printf '  "updated": "%s",\n' "$UPDATED"
  printf '  "overall": "%s",\n' "$OVERALL"
  printf '  "services": [%s]\n' "$SERVICES_JSON"
  printf '}\n'
} > "$OUTPUT"
printf '[%s] Status written to %s (overall: %s)\n' "$UPDATED" "$OUTPUT" "$OVERALL"
# ===== History tracking =====
# Append this check as one JSON line, then cap the log at 90 days of
# minutely samples (90 days × 24 h × 60 min = 129600 lines).
printf '{"ts":"%s","services":{%s}}\n' "$UPDATED" "$HISTORY_SERVICES" >> "$HISTORY_LOG"
MAX_LINES=129600
if [ "$(wc -l < "$HISTORY_LOG")" -gt "$MAX_LINES" ]; then
  tail -n "$MAX_LINES" "$HISTORY_LOG" > "${HISTORY_LOG}.tmp" && mv "${HISTORY_LOG}.tmp" "$HISTORY_LOG"
fi
# Regenerate history.json from history.log
python3 - "$HISTORY_LOG" "$HISTORY_JSON" <<'PYEOF'
import sys, json
from datetime import datetime, timezone, timedelta
from collections import defaultdict

# argv[1]: newline-delimited JSON check log (one entry per minute-check)
# argv[2]: destination path for the aggregated history.json
history_log = sys.argv[1]
history_json_path = sys.argv[2]

# Parse all log lines, group by hour key.
# hour_data[hour_key][service_name] -> list of 0/1 samples seen in that hour.
hour_data = defaultdict(lambda: defaultdict(list))
try:
    with open(history_log) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
                ts = entry['ts']
                hour_key = ts[:13]  # e.g. "2026-03-03T19"
                for svc, val in entry['services'].items():
                    hour_data[hour_key][svc].append(val)
            except Exception:
                # Best-effort: skip malformed or truncated log lines rather
                # than failing the whole regeneration.
                continue
except FileNotFoundError:
    # First run, no history yet: the grid below is emitted all-empty.
    pass

# Generate exactly 2160 hour slots (90 days), oldest first, ending at current hour
now = datetime.now(timezone.utc)
current_hour = now.replace(minute=0, second=0, microsecond=0)
slots = [(current_hour - timedelta(hours=2159 - i)) for i in range(2160)]
slot_keys = [h.strftime('%Y-%m-%dT%H') for h in slots]

# Collect all service names from log data
service_names = set()
for hour_vals in hour_data.values():
    service_names.update(hour_vals.keys())

result = {
    'days': 90,
    'generated': now.strftime('%Y-%m-%dT%H:%M:%SZ'),
    'services': {}
}
# For each service emit one entry per hour slot:
#   None = no samples recorded that hour; 1 = majority up; 0 = otherwise.
for svc in sorted(service_names):
    hours_list = []
    for slot_key in slot_keys:
        checks = hour_data.get(slot_key, {}).get(svc, [])
        if not checks:
            hours_list.append(None)
        else:
            # Majority vote: >50% up → 1, otherwise 0 (a tie counts as down)
            hours_list.append(1 if sum(checks) > len(checks) / 2 else 0)
    # Uptime percentage over hours that actually have data; None if no data.
    valid = [h for h in hours_list if h is not None]
    uptime_pct = round(sum(valid) / len(valid) * 100, 2) if valid else None
    result['services'][svc] = {
        'uptime_percent': uptime_pct,
        'hours': hours_list
    }

# Compact separators keep the 2160-slot-per-service file small.
with open(history_json_path, 'w') as f:
    json.dump(result, f, separators=(',', ':'))
print(f"[history] Wrote {history_json_path} ({len(service_names)} services, {len(slot_keys)} hour slots)")
PYEOF