mirror of https://github.com/RWejlgaard/pez-infra.git
synced 2026-05-06 04:14:43 +00:00

capture helsinki-a status page cron in repo (#17)

Add a status_page role that deploys update-status.sh and its cron job. The script queries Prometheus for Caddy upstream health and writes status.json plus history files to /srv/status/ every minute. refs: PESO-94

parent 42eba42522
commit b0acdb72e3

4 changed files with 250 additions and 3 deletions
@@ -46,12 +46,13 @@
 # Stage 4: Per-host services
 # ──────────────────────────────────────────────
 
-# helsinki-a: Caddy reverse proxy
-- name: "Stage 4a: Caddy (helsinki-a)"
+# helsinki-a: Caddy reverse proxy + status page
+- name: "Stage 4a: Caddy + status page (helsinki-a)"
   hosts: helsinki-a
-  tags: [services, caddy]
+  tags: [services, caddy, status_page]
   roles:
     - role: caddy
+    - role: status_page
 
 # london-b: Docker services (storage, apps) + backups
 - name: "Stage 4b: Docker services (london-b)"
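To apply only this change, the new tag can be targeted on its own. A minimal sketch, assuming the hunk above comes from a top-level site.yml (the playbook's filename is not visible in this capture):

# Hypothetical invocation; "site.yml" is an assumed name, not shown in the diff.
ansible-playbook site.yml --limit helsinki-a --tags status_page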
ansible/roles/status_page/defaults/main.yml (new file, 5 lines)

@@ -0,0 +1,5 @@
+---
+status_page_script_dest: /usr/local/bin/update-status.sh
+status_page_output_dir: /srv/status
+status_page_log_file: /var/log/update-status.log
+status_page_cron_schedule: "* * * * *"
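These defaults can be overridden per host or per run. Two caveats are visible in the rest of the diff: the cron task pins minute: "*" rather than consuming status_page_cron_schedule, and update-status.sh hardcodes its /srv/status paths internally, so the log destination is the safest knob to turn. An illustrative override, reusing the assumed playbook name from above:

# Illustrative: redirect the cron job's log output for this role.
ansible-playbook site.yml --limit helsinki-a --tags status_page \
  -e status_page_log_file=/var/log/status-page.log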
ansible/roles/status_page/tasks/main.yml (new file, 30 lines)

@@ -0,0 +1,30 @@
+---
+# Deploy the status page update script and cron job.
+# Runs every minute, queries Prometheus for Caddy upstream health,
+# writes status.json + history to /srv/status/.
+
+- name: Ensure status output directory exists
+  ansible.builtin.file:
+    path: "{{ status_page_output_dir }}"
+    state: directory
+    mode: '0755'
+
+- name: Deploy update-status.sh
+  ansible.builtin.copy:
+    src: "{{ playbook_dir }}/services/status-page/update-status.sh"
+    dest: "{{ status_page_script_dest }}"
+    mode: '0755'
+    backup: true
+
+- name: Ensure python3 is installed (for history generation)
+  ansible.builtin.apt:
+    name: python3
+    state: present
+  when: ansible_facts["os_family"] == "Debian"
+
+- name: Set up status page cron job
+  ansible.builtin.cron:
+    name: "update-status-page"
+    job: "{{ status_page_script_dest }} >> {{ status_page_log_file }} 2>&1"
+    minute: "*"
+    user: root
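With the defaults above, the cron task should leave root's crontab looking roughly like this; the "#Ansible: update-status-page" marker line is how ansible.builtin.cron finds and updates its own entry on later runs:

#Ansible: update-status-page
* * * * * /usr/local/bin/update-status.sh >> /var/log/update-status.log 2>&1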
ansible/services/status-page/update-status.sh (new executable file, 211 lines)

@@ -0,0 +1,211 @@
+#!/bin/bash
+# update-status.sh — Fetch Prometheus metrics and write /srv/status/status.json + history
+set -euo pipefail
+
+PROMETHEUS="http://100.122.219.41:9090"
+OUTPUT="/srv/status/status.json"
+HISTORY_LOG="/srv/status/history.log"
+HISTORY_JSON="/srv/status/history.json"
+QUERY="caddy_reverse_proxy_upstreams_healthy"
+
+# Service map: upstream address → display name
+declare -A SERVICE_MAP
+SERVICE_MAP["localhost:8443"]="Bitwarden"
+SERVICE_MAP["100.122.219.41:3000"]="Grafana"
+SERVICE_MAP["100.84.65.101:32400"]="Plex"
+SERVICE_MAP["100.84.65.101:4533"]="Navidrome"
+SERVICE_MAP["100.84.65.101:5030"]="Soulseek"
+SERVICE_MAP["100.84.65.101:5055"]="Overseerr"
+SERVICE_MAP["100.84.65.101:5056"]="Jellyfin Requests"
+SERVICE_MAP["100.84.65.101:7878"]="Radarr"
+SERVICE_MAP["100.84.65.101:8096"]="Jellyfin"
+SERVICE_MAP["100.84.65.101:8686"]="Lidarr"
+SERVICE_MAP["100.84.65.101:8787"]="Readarr"
+SERVICE_MAP["100.84.65.101:8989"]="Sonarr"
+SERVICE_MAP["100.84.65.101:9091"]="Transmission"
+SERVICE_MAP["100.84.65.101:9696"]="Prowlarr"
+SERVICE_MAP["100.84.65.101:11000"]="Nextcloud"
+SERVICE_MAP["localhost:9091"]="Authelia"
+SERVICE_MAP["100.84.65.101:8181"]="Miniflux"
+SERVICE_MAP["localhost:3000"]="Forgejo"
+
+# Desired display order
+DISPLAY_ORDER=(
+  "localhost:8443"
+  "localhost:9091"
+  "100.84.65.101:11000"
+  "100.122.219.41:3000"
+  "100.84.65.101:32400"
+  "100.84.65.101:8096"
+  "100.84.65.101:5056"
+  "100.84.65.101:4533"
+  "100.84.65.101:5030"
+  "100.84.65.101:5055"
+  "100.84.65.101:7878"
+  "100.84.65.101:8989"
+  "100.84.65.101:8686"
+  "100.84.65.101:8787"
+  "100.84.65.101:9696"
+  "100.84.65.101:9091"
+  "100.84.65.101:8181"
+  "localhost:3000"
+)
+
+# Fetch from Prometheus
+RESPONSE=$(curl -sf --max-time 10 \
+  "${PROMETHEUS}/api/v1/query?query=${QUERY}" 2>/dev/null) || {
+  echo "ERROR: Failed to fetch Prometheus metrics" >&2
+  exit 1
+}
+
+# Parse with jq — build a lookup of upstream→value
+UPSTREAM_DATA=$(echo "$RESPONSE" | jq -r '
+  .data.result[] |
+  .metric.upstream + " " + .value[1]
+' 2>/dev/null) || {
+  echo "ERROR: Failed to parse Prometheus response" >&2
+  exit 1
+}
+
+# Build services JSON array
+UPDATED=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
+HAS_DOWN=0
+HAS_UP=0
+
+SERVICES_JSON=""
+HISTORY_SERVICES=""
+
+for upstream in "${DISPLAY_ORDER[@]}"; do
+  name="${SERVICE_MAP[$upstream]:-}"
+  [ -z "$name" ] && continue
+
+  # Look up health value for this upstream ("|| true" keeps pipefail from aborting when the metric is absent)
+  value=$(echo "$UPSTREAM_DATA" | grep -F "$upstream " | awk '{print $NF}' | head -1 || true)
+
+  if [ "$value" = "1" ]; then
+    status="operational"
+    hist_val=1
+    HAS_UP=1
+  elif [ "$value" = "0" ]; then
+    status="degraded"
+    hist_val=0
+    HAS_DOWN=1
+  else
+    status="degraded"
+    hist_val=0
+    HAS_DOWN=1
+  fi
+
+  if [ -n "$SERVICES_JSON" ]; then
+    SERVICES_JSON="${SERVICES_JSON},"
+  fi
+  SERVICES_JSON="${SERVICES_JSON}{\"name\":\"${name}\",\"status\":\"${status}\"}"
+
+  if [ -n "$HISTORY_SERVICES" ]; then
+    HISTORY_SERVICES="${HISTORY_SERVICES},"
+  fi
+  HISTORY_SERVICES="${HISTORY_SERVICES}\"${name}\":${hist_val}"
+done
+
+# Determine overall status
+if [ $HAS_DOWN -eq 0 ]; then
+  OVERALL="operational"
+elif [ $HAS_UP -eq 0 ]; then
+  OVERALL="outage"
+else
+  OVERALL="degraded"
+fi
+
+# Write status.json
+mkdir -p "$(dirname "$OUTPUT")"
+cat > "$OUTPUT" <<EOF
+{
+  "updated": "${UPDATED}",
+  "overall": "${OVERALL}",
+  "services": [${SERVICES_JSON}]
+}
+EOF
+
+echo "[$UPDATED] Status written to $OUTPUT (overall: $OVERALL)"
+
+# ===== History tracking =====
+
+# Append current check to history.log
+echo "{\"ts\":\"${UPDATED}\",\"services\":{${HISTORY_SERVICES}}}" >> "$HISTORY_LOG"
+
+# Trim history.log to last 129600 lines (90 days × 24h × 60min)
+MAX_LINES=129600
+LINE_COUNT=$(wc -l < "$HISTORY_LOG")
+if [ "$LINE_COUNT" -gt "$MAX_LINES" ]; then
+  tail -n "$MAX_LINES" "$HISTORY_LOG" > "${HISTORY_LOG}.tmp" && mv "${HISTORY_LOG}.tmp" "$HISTORY_LOG"
+fi
+
+# Regenerate history.json from history.log
+python3 - "$HISTORY_LOG" "$HISTORY_JSON" <<'PYEOF'
+import sys, json
+from datetime import datetime, timezone, timedelta
+from collections import defaultdict
+
+history_log = sys.argv[1]
+history_json_path = sys.argv[2]
+
+# Parse all log lines, group by hour key
+hour_data = defaultdict(lambda: defaultdict(list))
+
+try:
+    with open(history_log) as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                entry = json.loads(line)
+                ts = entry['ts']
+                hour_key = ts[:13]  # e.g. "2026-03-03T19"
+                for svc, val in entry['services'].items():
+                    hour_data[hour_key][svc].append(val)
+            except Exception:
+                continue
+except FileNotFoundError:
+    pass
+
+# Generate exactly 2160 hour slots (90 days), oldest first, ending at current hour
+now = datetime.now(timezone.utc)
+current_hour = now.replace(minute=0, second=0, microsecond=0)
+slots = [(current_hour - timedelta(hours=2159 - i)) for i in range(2160)]
+slot_keys = [h.strftime('%Y-%m-%dT%H') for h in slots]
+
+# Collect all service names from log data
+service_names = set()
+for hour_vals in hour_data.values():
+    service_names.update(hour_vals.keys())
+
+result = {
+    'days': 90,
+    'generated': now.strftime('%Y-%m-%dT%H:%M:%SZ'),
+    'services': {}
+}
+
+for svc in sorted(service_names):
+    hours_list = []
+    for slot_key in slot_keys:
+        checks = hour_data.get(slot_key, {}).get(svc, [])
+        if not checks:
+            hours_list.append(None)
+        else:
+            # Majority vote: >50% up → 1, otherwise 0
+            hours_list.append(1 if sum(checks) > len(checks) / 2 else 0)
+
+    valid = [h for h in hours_list if h is not None]
+    uptime_pct = round(sum(valid) / len(valid) * 100, 2) if valid else None
+
+    result['services'][svc] = {
+        'uptime_percent': uptime_pct,
+        'hours': hours_list
+    }
+
+with open(history_json_path, 'w') as f:
+    json.dump(result, f, separators=(',', ':'))
+
+print(f"[history] Wrote {history_json_path} ({len(service_names)} services, {len(slot_keys)} hour slots)")
+PYEOF
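The jq filter in the script assumes the standard Prometheus instant-query envelope. A manual sanity check of what it consumes (the address and metric are taken from the script; the response body is a trimmed illustration of the API's shape, not captured output):

curl -s 'http://100.122.219.41:9090/api/v1/query?query=caddy_reverse_proxy_upstreams_healthy' | jq .
# {
#   "status": "success",
#   "data": {
#     "resultType": "vector",
#     "result": [
#       {"metric": {"upstream": "localhost:8443"}, "value": [1746500083, "1"]}
#     ]
#   }
# }
# The script's filter flattens each result to one "upstream value" line, e.g. "localhost:8443 1".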
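For reference, the three artifacts the script maintains under /srv/status/, sketched with made-up values and truncated arrays so the shapes are easy to read:

# status.json: current snapshot, rewritten every minute
# {"updated":"2026-05-06T04:14:00Z","overall":"degraded",
#  "services":[{"name":"Bitwarden","status":"operational"},{"name":"Authelia","status":"degraded"}]}
#
# history.log: one JSON line appended per check, trimmed to 129600 lines (90 days of minutes)
# {"ts":"2026-05-06T04:14:00Z","services":{"Bitwarden":1,"Authelia":0}}
#
# history.json: per-service hourly majority vote across 2160 slots (90 days × 24 h),
# null where an hour has no checks
# {"days":90,"generated":"2026-05-06T04:14:00Z",
#  "services":{"Authelia":{"uptime_percent":99.12,"hours":[null,1,1,0]}}}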