mirror of
https://github.com/RWejlgaard/pez-infra.git
synced 2026-07-04 15:46:16 +00:00
Compare commits
2 commits
81efa1b717
...
3945b8cafc
| Author | SHA1 | Date | |
|---|---|---|---|
| 3945b8cafc | |||
| 9ac179dbec |
7 changed files with 63 additions and 59 deletions
|
|
@ -8,7 +8,6 @@ docker_services:
|
||||||
- jellyseerr
|
- jellyseerr
|
||||||
- navidrome
|
- navidrome
|
||||||
- slskd
|
- slskd
|
||||||
- miniflux
|
|
||||||
- smartctl-exporter
|
- smartctl-exporter
|
||||||
- plex-exporter
|
- plex-exporter
|
||||||
- bookshelf
|
- bookshelf
|
||||||
|
|
|
||||||
|
|
@ -11,3 +11,9 @@
|
||||||
- name: Reload systemd daemon
|
- name: Reload systemd daemon
|
||||||
ansible.builtin.systemd:
|
ansible.builtin.systemd:
|
||||||
daemon_reload: true
|
daemon_reload: true
|
||||||
|
|
||||||
|
# Restart Grafana Alloy. daemon_reload makes systemd re-read unit files and
# drop-ins before the restart, so freshly written overrides take effect.
- name: Restart alloy
  ansible.builtin.systemd:
    daemon_reload: true
    name: alloy
    state: restarted
|
||||||
|
|
|
||||||
|
|
@ -162,3 +162,58 @@
|
||||||
name: cloudflared
|
name: cloudflared
|
||||||
state: absent
|
state: absent
|
||||||
purge: true
|
purge: true
|
||||||
|
|
||||||
|
# --- Grafana Alloy: restart resilience (PESO-149) ---
|
||||||
|
# Every host runs alloy.service to ship metrics/logs to Grafana Cloud. On
|
||||||
|
# 2026-05-20 a transient TLS failure to Grafana fleet-management tripped
|
||||||
|
# systemd's default start rate-limit on copenhagen-c; systemd then gave up and
|
||||||
|
# the host went silently unmonitored for ~2.5 weeks. Disable the rate limit so
|
||||||
|
# Alloy keeps retrying indefinitely, backing off 30s between attempts, instead
|
||||||
|
# of dying permanently on a momentary blip.
|
||||||
|
# Only touch hosts where the packaged Alloy unit is actually installed.
- name: Check for Alloy unit
  ansible.builtin.stat:
    path: /lib/systemd/system/alloy.service
  register: common_alloy_unit

# ansible.builtin.copy does not create missing parent directories, so the
# drop-in directory has to exist before the override file is written.
- name: Ensure Alloy systemd drop-in directory exists
  ansible.builtin.file:
    path: /etc/systemd/system/alloy.service.d
    state: directory
    mode: '0755'
  when: common_alloy_unit.stat.exists | default(false)

# StartLimitIntervalSec=0 disables systemd's start rate-limit; Restart=always
# with RestartSec=30 keeps Alloy retrying every 30s instead of giving up.
- name: Install Alloy systemd resilience drop-in
  ansible.builtin.copy:
    dest: /etc/systemd/system/alloy.service.d/10-resilience.conf
    mode: '0644'
    content: |
      # Managed by Ansible (pez-infra) — common role, PESO-149.
      # Keep Alloy retrying through transient upstream/TLS failures instead of
      # hitting systemd's start rate-limit and giving up.
      [Unit]
      StartLimitIntervalSec=0

      [Service]
      Restart=always
      RestartSec=30
  when: common_alloy_unit.stat.exists | default(false)
  notify: Restart alloy
|
||||||
|
|
||||||
|
# --- Cleanup: leftover self-hosted Grafana (PESO-149) ---
|
||||||
|
# Monitoring moved fully to Grafana Cloud, but the old Grafana OSS package was
|
||||||
|
# never removed from copenhagen-c, where grafana-server.service sits enabled
|
||||||
|
# and failing. Remove it wherever it lingers (no-op on every other host).
|
||||||
|
# Probe for the packaged unit file; the removal block below is skipped on
# hosts where it is absent.
- name: Detect leftover grafana-server unit
  ansible.builtin.stat:
    path: /lib/systemd/system/grafana-server.service
  register: common_grafana_unit

- name: Remove leftover self-hosted Grafana
  when: common_grafana_unit.stat.exists | default(false)
  block:
    # Best effort: a failure to stop/disable is ignored so the purge still runs.
    - name: Stop and disable grafana-server
      ansible.builtin.systemd:
        name: grafana-server
        enabled: false
        state: stopped
      failed_when: false

    # Purge removes the package along with its configuration files.
    - name: Uninstall grafana package
      ansible.builtin.apt:
        name: grafana
        purge: true
        state: absent
      notify: Reload systemd daemon
|
||||||
|
|
|
||||||
|
|
@ -244,18 +244,6 @@ status.pez.sh {
|
||||||
file_server
|
file_server
|
||||||
}
|
}
|
||||||
|
|
||||||
# Miniflux RSS
|
|
||||||
rss.pez.sh {
|
|
||||||
tracing {
|
|
||||||
span miniflux
|
|
||||||
}
|
|
||||||
forward_auth localhost:9091 {
|
|
||||||
uri /api/authz/forward-auth
|
|
||||||
copy_headers Remote-User Remote-Groups Remote-Name Remote-Email
|
|
||||||
}
|
|
||||||
reverse_proxy 100.84.65.101:8181
|
|
||||||
}
|
|
||||||
|
|
||||||
# Forgejo Git Server (auth handled by Forgejo itself)
|
# Forgejo Git Server (auth handled by Forgejo itself)
|
||||||
git.pez.sh {
|
git.pez.sh {
|
||||||
tracing {
|
tracing {
|
||||||
|
|
|
||||||
|
|
@ -1,10 +0,0 @@
|
||||||
# Miniflux
|
|
||||||
|
|
||||||
Lightweight RSS reader.
|
|
||||||
|
|
||||||
- **Host:** london-b
|
|
||||||
- **URL:** https://rss.pez.sh
|
|
||||||
- **Database:** PostgreSQL 15 (Alpine)
|
|
||||||
- **Bind address:** Tailscale IP only (100.84.65.101:8181)
|
|
||||||
- **Data:** Docker volume (`miniflux-db`)
|
|
||||||
- **Note:** Passwords templatized — set `MINIFLUX_DB_PASSWORD` and `MINIFLUX_ADMIN_PASSWORD` env vars before deploying
|
|
||||||
|
|
@ -1,36 +0,0 @@
|
||||||
---
|
|
||||||
# Miniflux - RSS reader
|
|
||||||
# Host: london-b (100.84.65.101)
|
|
||||||
# Data: Docker volume (miniflux-db)
|
|
||||||
# Access: https://rss.pez.sh (via Caddy reverse proxy on helsinki-a)
|
|
||||||
|
|
||||||
services:
|
|
||||||
miniflux-db:
|
|
||||||
image: postgres:15-alpine
|
|
||||||
container_name: miniflux-db
|
|
||||||
restart: unless-stopped
|
|
||||||
volumes:
|
|
||||||
- miniflux-db:/var/lib/postgresql/data
|
|
||||||
environment:
|
|
||||||
POSTGRES_DB: miniflux
|
|
||||||
POSTGRES_USER: miniflux
|
|
||||||
POSTGRES_PASSWORD: "${MINIFLUX_DB_PASSWORD}"
|
|
||||||
|
|
||||||
miniflux:
|
|
||||||
image: miniflux/miniflux:latest
|
|
||||||
container_name: miniflux
|
|
||||||
restart: unless-stopped
|
|
||||||
depends_on:
|
|
||||||
- miniflux-db
|
|
||||||
ports:
|
|
||||||
- "100.84.65.101:8181:8080"
|
|
||||||
environment:
|
|
||||||
DATABASE_URL: "postgres://miniflux:${MINIFLUX_DB_PASSWORD}@miniflux-db/miniflux?sslmode=disable"
|
|
||||||
RUN_MIGRATIONS: "1"
|
|
||||||
CREATE_ADMIN: "1"
|
|
||||||
ADMIN_USERNAME: pez
|
|
||||||
ADMIN_PASSWORD: "${MINIFLUX_ADMIN_PASSWORD}"
|
|
||||||
BASE_URL: https://rss.pez.sh
|
|
||||||
|
|
||||||
volumes:
|
|
||||||
miniflux-db:
|
|
||||||
|
|
@ -30,6 +30,8 @@ Alloy runs as `alloy.service` on every host in the inventory. Each host is regis
|
||||||
|
|
||||||
Pipelines (what to scrape, how to relabel, where to ship) live in `terraform/grafana/fleet_pipelines/` and are pushed to Grafana Cloud as a `grafana_fleet_management_pipeline` resource. The Alloy daemons on each host pull their config from Fleet Management.
|
Pipelines (what to scrape, how to relabel, where to ship) live in `terraform/grafana/fleet_pipelines/` and are pushed to Grafana Cloud as a `grafana_fleet_management_pipeline` resource. The Alloy daemons on each host pull their config from Fleet Management.
|
||||||
|
|
||||||
|
The `common` role drops a `10-resilience.conf` systemd override onto every host where the `alloy.service` unit is installed (`StartLimitIntervalSec=0`, `Restart=always`, `RestartSec=30`) so a transient upstream/TLS failure can't trip systemd's start rate-limit and permanently kill the collector — it keeps retrying every 30 seconds until Grafana Cloud is reachable again. (Added after copenhagen-c sat unmonitored for ~2.5 weeks following one such blip — PESO-149.)
|
||||||
|
|
||||||
### Local exporters scraped by Alloy
|
### Local exporters scraped by Alloy
|
||||||
|
|
||||||
| Exporter | Hosts | What |
|
| Exporter | Hosts | What |
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue