mirror of https://github.com/RWejlgaard/pez-infra.git
capture helsinki-a status page cron in repo (#17)
add status_page role that deploys update-status.sh and its cron job. script queries prometheus for caddy upstream health and writes status.json + history to /srv/status/ every minute. refs: PESO-94
parent: 42eba42522
commit: b0acdb72e3
4 changed files with 250 additions and 3 deletions
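For context on what the script consumes: the health data is a single Prometheus instant query, which can be checked by hand. The endpoint and metric name come from update-status.sh below; the response excerpt is illustrative and abridged:

    curl -s "http://100.122.219.41:9090/api/v1/query?query=caddy_reverse_proxy_upstreams_healthy" | jq .
    # {
    #   "status": "success",
    #   "data": {
    #     "resultType": "vector",
    #     "result": [
    #       {
    #         "metric": { "__name__": "caddy_reverse_proxy_upstreams_healthy",
    #                     "upstream": "localhost:8443" },
    #         "value": [ 1771700000.000, "1" ]
    #       }
    #     ]
    #   }
    # }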
@@ -46,12 +46,13 @@
 # Stage 4: Per-host services
 # ──────────────────────────────────────────────
 
-# helsinki-a: Caddy reverse proxy
-- name: "Stage 4a: Caddy (helsinki-a)"
+# helsinki-a: Caddy reverse proxy + status page
+- name: "Stage 4a: Caddy + status page (helsinki-a)"
   hosts: helsinki-a
-  tags: [services, caddy]
+  tags: [services, caddy, status_page]
   roles:
     - role: caddy
+    - role: status_page
 
 # london-b: Docker services (storage, apps) + backups
 - name: "Stage 4b: Docker services (london-b)"
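With the new tag on the play, the role can be pushed on its own; something like this should work (the playbook filename is an assumption, it is not shown in this hunk):

    ansible-playbook site.yml --tags status_page --limit helsinki-a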
ansible/roles/status_page/defaults/main.yml (new file, 5 lines)
---
status_page_script_dest: /usr/local/bin/update-status.sh
status_page_output_dir: /srv/status
status_page_log_file: /var/log/update-status.log
status_page_cron_schedule: "* * * * *"
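These are role defaults, so they can be overridden per host or ad hoc; a hypothetical one-off run writing somewhere else:

    ansible-playbook site.yml --tags status_page -e status_page_output_dir=/srv/www/status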
ansible/roles/status_page/tasks/main.yml (new file, 30 lines)
---
# Deploy the status page update script and cron job.
# Runs every minute, queries Prometheus for Caddy upstream health,
# writes status.json + history to /srv/status/.

- name: Ensure status output directory exists
  ansible.builtin.file:
    path: "{{ status_page_output_dir }}"
    state: directory
    mode: '0755'

- name: Deploy update-status.sh
  ansible.builtin.copy:
    src: "{{ playbook_dir }}/services/status-page/update-status.sh"
    dest: "{{ status_page_script_dest }}"
    mode: '0755'
    backup: true

- name: Ensure python3 is installed (for history generation)
  ansible.builtin.apt:
    name: python3
    state: present
  when: ansible_facts["os_family"] == "Debian"

- name: Set up status page cron job
  ansible.builtin.cron:
    name: "update-status-page"
    job: "{{ status_page_script_dest }} >> {{ status_page_log_file }} 2>&1"
    minute: "*"
    user: root
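For reference, the cron task above should render roughly this entry in root's crontab (the #Ansible marker line comes from the task's name field):

    #Ansible: update-status-page
    * * * * * /usr/local/bin/update-status.sh >> /var/log/update-status.log 2>&1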
ansible/services/status-page/update-status.sh (new executable file, 211 lines)
#!/bin/bash
# update-status.sh — Fetch Prometheus metrics and write /srv/status/status.json + history
set -euo pipefail

PROMETHEUS="http://100.122.219.41:9090"
OUTPUT="/srv/status/status.json"
HISTORY_LOG="/srv/status/history.log"
HISTORY_JSON="/srv/status/history.json"
QUERY="caddy_reverse_proxy_upstreams_healthy"

# Service map: upstream address → display name
declare -A SERVICE_MAP
SERVICE_MAP["localhost:8443"]="Bitwarden"
SERVICE_MAP["100.122.219.41:3000"]="Grafana"
SERVICE_MAP["100.84.65.101:32400"]="Plex"
SERVICE_MAP["100.84.65.101:4533"]="Navidrome"
SERVICE_MAP["100.84.65.101:5030"]="Soulseek"
SERVICE_MAP["100.84.65.101:5055"]="Overseerr"
SERVICE_MAP["100.84.65.101:5056"]="Jellyfin Requests"
SERVICE_MAP["100.84.65.101:7878"]="Radarr"
SERVICE_MAP["100.84.65.101:8096"]="Jellyfin"
SERVICE_MAP["100.84.65.101:8686"]="Lidarr"
SERVICE_MAP["100.84.65.101:8787"]="Readarr"
SERVICE_MAP["100.84.65.101:8989"]="Sonarr"
SERVICE_MAP["100.84.65.101:9091"]="Transmission"
SERVICE_MAP["100.84.65.101:9696"]="Prowlarr"
SERVICE_MAP["100.84.65.101:11000"]="Nextcloud"
SERVICE_MAP["localhost:9091"]="Authelia"
SERVICE_MAP["100.84.65.101:8181"]="Miniflux"
SERVICE_MAP["localhost:3000"]="Forgejo"

# Desired display order
DISPLAY_ORDER=(
    "localhost:8443"
    "localhost:9091"
    "100.84.65.101:11000"
    "100.122.219.41:3000"
    "100.84.65.101:32400"
    "100.84.65.101:8096"
    "100.84.65.101:5056"
    "100.84.65.101:4533"
    "100.84.65.101:5030"
    "100.84.65.101:5055"
    "100.84.65.101:7878"
    "100.84.65.101:8989"
    "100.84.65.101:8686"
    "100.84.65.101:8787"
    "100.84.65.101:9696"
    "100.84.65.101:9091"
    "100.84.65.101:8181"
    "localhost:3000"
)

# Fetch from Prometheus
RESPONSE=$(curl -sf --max-time 10 \
    "${PROMETHEUS}/api/v1/query?query=${QUERY}" 2>/dev/null) || {
    echo "ERROR: Failed to fetch Prometheus metrics" >&2
    exit 1
}

# Parse with jq — build a lookup of upstream→value
UPSTREAM_DATA=$(echo "$RESPONSE" | jq -r '
    .data.result[] |
    .metric.upstream + " " + .value[1]
' 2>/dev/null) || {
    echo "ERROR: Failed to parse Prometheus response" >&2
    exit 1
}
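
# At this point UPSTREAM_DATA holds one "upstream value" pair per line,
# e.g. (values illustrative):
#   localhost:8443 1
#   100.84.65.101:32400 0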

# Build services JSON array
UPDATED=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
HAS_DOWN=0
HAS_UP=0

SERVICES_JSON=""
HISTORY_SERVICES=""

for upstream in "${DISPLAY_ORDER[@]}"; do
    name="${SERVICE_MAP[$upstream]:-}"
    [ -z "$name" ] && continue

    # Look up health value for this upstream
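    # (the trailing space in the grep pattern forces a whole-address match:
    #  a hypothetical "localhost:300" entry would otherwise also match "localhost:3000")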
    value=$(echo "$UPSTREAM_DATA" | grep -F "$upstream " | awk '{print $NF}' | head -1)

    if [ "$value" = "1" ]; then
        status="operational"
        hist_val=1
        HAS_UP=1
    elif [ "$value" = "0" ]; then
        # upstream explicitly reported unhealthy
        status="degraded"
        hist_val=0
        HAS_DOWN=1
    else
        # upstream missing from the Prometheus response; treat as down
        status="degraded"
        hist_val=0
        HAS_DOWN=1
    fi

    if [ -n "$SERVICES_JSON" ]; then
        SERVICES_JSON="${SERVICES_JSON},"
    fi
    SERVICES_JSON="${SERVICES_JSON}{\"name\":\"${name}\",\"status\":\"${status}\"}"

    if [ -n "$HISTORY_SERVICES" ]; then
        HISTORY_SERVICES="${HISTORY_SERVICES},"
    fi
    HISTORY_SERVICES="${HISTORY_SERVICES}\"${name}\":${hist_val}"
done

# Determine overall status
if [ $HAS_DOWN -eq 0 ]; then
    OVERALL="operational"
elif [ $HAS_UP -eq 0 ]; then
    OVERALL="outage"
else
    OVERALL="degraded"
fi

# Write status.json
mkdir -p "$(dirname "$OUTPUT")"
cat > "$OUTPUT" <<EOF
{
  "updated": "${UPDATED}",
  "overall": "${OVERALL}",
  "services": [${SERVICES_JSON}]
}
EOF

echo "[$UPDATED] Status written to $OUTPUT (overall: $OVERALL)"

# ===== History tracking =====

# Append current check to history.log
echo "{\"ts\":\"${UPDATED}\",\"services\":{${HISTORY_SERVICES}}}" >> "$HISTORY_LOG"

# Trim history.log to last 129600 lines (90 days × 24h × 60min)
MAX_LINES=129600
LINE_COUNT=$(wc -l < "$HISTORY_LOG")
if [ "$LINE_COUNT" -gt "$MAX_LINES" ]; then
    tail -n "$MAX_LINES" "$HISTORY_LOG" > "${HISTORY_LOG}.tmp" && mv "${HISTORY_LOG}.tmp" "$HISTORY_LOG"
fi

# Regenerate history.json from history.log
python3 - "$HISTORY_LOG" "$HISTORY_JSON" <<'PYEOF'
import sys, json
from datetime import datetime, timezone, timedelta
from collections import defaultdict

history_log = sys.argv[1]
history_json_path = sys.argv[2]

# Parse all log lines, group by hour key
hour_data = defaultdict(lambda: defaultdict(list))

try:
    with open(history_log) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
                ts = entry['ts']
                hour_key = ts[:13]  # e.g. "2026-03-03T19"
                for svc, val in entry['services'].items():
                    hour_data[hour_key][svc].append(val)
            except Exception:
                continue
except FileNotFoundError:
    pass

# Generate exactly 2160 hour slots (90 days), oldest first, ending at current hour
now = datetime.now(timezone.utc)
current_hour = now.replace(minute=0, second=0, microsecond=0)
slots = [(current_hour - timedelta(hours=2159 - i)) for i in range(2160)]
slot_keys = [h.strftime('%Y-%m-%dT%H') for h in slots]

# Collect all service names from log data
service_names = set()
for hour_vals in hour_data.values():
    service_names.update(hour_vals.keys())

result = {
    'days': 90,
    'generated': now.strftime('%Y-%m-%dT%H:%M:%SZ'),
    'services': {}
}

for svc in sorted(service_names):
    hours_list = []
    for slot_key in slot_keys:
        checks = hour_data.get(slot_key, {}).get(svc, [])
        if not checks:
            hours_list.append(None)
        else:
            # Majority vote: >50% up → 1, otherwise 0
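            # e.g. 31/60 checks up → 1; 30/60 → 0 (a tie counts as down)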
            hours_list.append(1 if sum(checks) > len(checks) / 2 else 0)

    valid = [h for h in hours_list if h is not None]
    uptime_pct = round(sum(valid) / len(valid) * 100, 2) if valid else None

    result['services'][svc] = {
        'uptime_percent': uptime_pct,
        'hours': hours_list
    }

with open(history_json_path, 'w') as f:
    json.dump(result, f, separators=(',', ':'))

print(f"[history] Wrote {history_json_path} ({len(service_names)} services, {len(slot_keys)} hour slots)")
PYEOF
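Putting it together, each run leaves three files in /srv/status/, shaped roughly like this (timestamps, values, and service lists are illustrative and abridged):

status.json:
    {"updated":"2026-03-03T19:04:01Z","overall":"operational","services":[{"name":"Bitwarden","status":"operational"}, ...]}

history.log (one JSON line per minute, trimmed to 90 days):
    {"ts":"2026-03-03T19:04:01Z","services":{"Bitwarden":1,"Grafana":1, ...}}

history.json (2160 hourly slots per service; null means no data for that hour):
    {"days":90,"generated":"2026-03-03T19:04:01Z","services":{"Bitwarden":{"uptime_percent":99.97,"hours":[null,1,1,0, ...]}, ...}}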