From 3702584856746b896ae762f0d865396ecebc22b6 Mon Sep 17 00:00:00 2001
From: Rasmus Wejlgaard
Date: Sun, 7 Jun 2026 14:06:03 +0100
Subject: [PATCH] Make Alloy resilient to transient failures; remove leftover
 Grafana (PESO-149)

copenhagen-c stopped reporting to Grafana Cloud on 2026-05-20: a
transient TLS failure to fleet-management tripped systemd's default
start rate-limit, systemd gave up, and the host sat silently unmonitored
for ~2.5 weeks.

Add a 10-resilience.conf systemd drop-in for alloy.service on every host
(StartLimitIntervalSec=0, Restart=always, RestartSec=30) so a momentary
upstream/TLS blip can no longer permanently kill the collector. The
drop-in directory is created explicitly first, because
ansible.builtin.copy does not create missing parent directories.

Also drop the old self-hosted Grafana package that was left enabled and
failing on copenhagen-c after the move to Grafana Cloud.
---
 ansible/roles/common/handlers/main.yml |  6 +++
 ansible/roles/common/tasks/main.yml    | 64 ++++++++++++++++++++++++++
 docs/monitoring.md                     |  2 +
 3 files changed, 72 insertions(+)

diff --git a/ansible/roles/common/handlers/main.yml b/ansible/roles/common/handlers/main.yml
index 259efaf..43bc533 100644
--- a/ansible/roles/common/handlers/main.yml
+++ b/ansible/roles/common/handlers/main.yml
@@ -11,3 +11,9 @@
 - name: Reload systemd daemon
   ansible.builtin.systemd:
     daemon_reload: true
+
+- name: Restart alloy
+  ansible.builtin.systemd:
+    name: alloy
+    state: restarted
+    daemon_reload: true
diff --git a/ansible/roles/common/tasks/main.yml b/ansible/roles/common/tasks/main.yml
index 5e3dee4..1db7e9b 100644
--- a/ansible/roles/common/tasks/main.yml
+++ b/ansible/roles/common/tasks/main.yml
@@ -162,3 +162,67 @@
     name: cloudflared
     state: absent
     purge: true
+
+# --- Grafana Alloy: restart resilience (PESO-149) ---
+# Every host runs alloy.service to ship metrics/logs to Grafana Cloud. On
+# 2026-05-20 a transient TLS failure to Grafana fleet-management tripped
+# systemd's default start rate-limit on copenhagen-c; systemd then gave up and
+# the host went silently unmonitored for ~2.5 weeks. Disable the rate limit so
+# Alloy keeps retrying indefinitely, backing off 30s between attempts, instead
+# of dying permanently on a momentary blip.
+- name: Check for Alloy unit
+  ansible.builtin.stat:
+    path: /lib/systemd/system/alloy.service
+  register: common_alloy_unit
+
+# ansible.builtin.copy does not create missing parent directories, so the
+# drop-in directory has to exist before the override file is written.
+- name: Ensure Alloy systemd drop-in directory exists
+  ansible.builtin.file:
+    path: /etc/systemd/system/alloy.service.d
+    state: directory
+    mode: '0755'
+  when: common_alloy_unit.stat.exists | default(false)
+
+- name: Install Alloy systemd resilience drop-in
+  ansible.builtin.copy:
+    dest: /etc/systemd/system/alloy.service.d/10-resilience.conf
+    mode: '0644'
+    content: |
+      # Managed by Ansible (pez-infra) — common role, PESO-149.
+      # Keep Alloy retrying through transient upstream/TLS failures instead of
+      # hitting systemd's start rate-limit and giving up.
+      [Unit]
+      StartLimitIntervalSec=0
+
+      [Service]
+      Restart=always
+      RestartSec=30
+  when: common_alloy_unit.stat.exists | default(false)
+  notify: Restart alloy
+
+# --- Cleanup: leftover self-hosted Grafana (PESO-149) ---
+# Monitoring moved fully to Grafana Cloud, but the old Grafana OSS package was
+# never removed from copenhagen-c, where grafana-server.service sits enabled
+# and failing. Remove it wherever it lingers (no-op on every other host).
+- name: Detect leftover grafana-server unit
+  ansible.builtin.stat:
+    path: /lib/systemd/system/grafana-server.service
+  register: common_grafana_unit
+
+- name: Remove leftover self-hosted Grafana
+  when: common_grafana_unit.stat.exists | default(false)
+  block:
+    - name: Stop and disable grafana-server
+      ansible.builtin.systemd:
+        name: grafana-server
+        state: stopped
+        enabled: false
+      failed_when: false
+
+    - name: Uninstall grafana package
+      ansible.builtin.apt:
+        name: grafana
+        state: absent
+        purge: true
+      notify: Reload systemd daemon
diff --git a/docs/monitoring.md b/docs/monitoring.md
index cfce62a..6ed56e8 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -30,6 +30,8 @@ Alloy runs as `alloy.service` on every host in the inventory. Each host is regis
 
 Pipelines (what to scrape, how to relabel, where to ship) live in `terraform/grafana/fleet_pipelines/` and are pushed to Grafana Cloud as a `grafana_fleet_management_pipeline` resource. The Alloy daemons on each host pull their config from Fleet Management.
 
+The `common` role drops a `10-resilience.conf` systemd override onto every host (`StartLimitIntervalSec=0`, `Restart=always`, `RestartSec=30`) so a transient upstream/TLS failure can't trip systemd's start rate-limit and permanently kill the collector — it keeps retrying until Grafana Cloud is reachable again. (Added after copenhagen-c sat unmonitored for ~2.5 weeks following one such blip — PESO-149.)
+
 ### Local exporters scraped by Alloy
 
 | Exporter | Hosts | What |