From 3702584856746b896ae762f0d865396ecebc22b6 Mon Sep 17 00:00:00 2001
From: Rasmus Wejlgaard <pez@pez.sh>
Date: Sun, 7 Jun 2026 14:06:03 +0100
Subject: [PATCH] Make Alloy resilient to transient failures; remove leftover
 Grafana (PESO-149)

copenhagen-c stopped reporting to Grafana Cloud on 2026-05-20: a transient
TLS failure to fleet-management tripped systemd's default start rate-limit,
systemd gave up, and the host sat silently unmonitored for ~2.5 weeks.

Add a 10-resilience.conf systemd drop-in for alloy.service on every host
(StartLimitIntervalSec=0, Restart=always, RestartSec=30) so a momentary
upstream/TLS blip can no longer permanently kill the collector.

Also drop the old self-hosted Grafana package that was left enabled and
failing on copenhagen-c after the move to Grafana Cloud.
---
 ansible/roles/common/handlers/main.yml |  6 +++
 ansible/roles/common/tasks/main.yml    | 64 ++++++++++++++++++++++++++
 docs/monitoring.md                     |  2 +
 3 files changed, 72 insertions(+)

diff --git a/ansible/roles/common/handlers/main.yml b/ansible/roles/common/handlers/main.yml
index 259efaf..43bc533 100644
--- a/ansible/roles/common/handlers/main.yml
+++ b/ansible/roles/common/handlers/main.yml
@@ -11,3 +11,9 @@
 - name: Reload systemd daemon
   ansible.builtin.systemd:
     daemon_reload: true
+
+- name: Restart alloy
+  ansible.builtin.systemd:
+    name: alloy
+    state: restarted
+    daemon_reload: true  # re-read unit files first so a new drop-in applies
diff --git a/ansible/roles/common/tasks/main.yml b/ansible/roles/common/tasks/main.yml
index 5e3dee4..1db7e9b 100644
--- a/ansible/roles/common/tasks/main.yml
+++ b/ansible/roles/common/tasks/main.yml
@@ -162,3 +162,67 @@
         name: cloudflared
         state: absent
         purge: true
+
+# --- Grafana Alloy: restart resilience (PESO-149) ---
+# Every host runs alloy.service to ship metrics/logs to Grafana Cloud. On
+# 2026-05-20 a transient TLS failure to Grafana fleet-management tripped
+# systemd's default start rate-limit on copenhagen-c; systemd then gave up and
+# the host went silently unmonitored for ~2.5 weeks. Disable the rate limit so
+# Alloy keeps retrying indefinitely, backing off 30s between attempts, instead
+# of dying permanently on a momentary blip.
+- name: Check for Alloy unit
+  ansible.builtin.stat:
+    path: /lib/systemd/system/alloy.service
+  register: common_alloy_unit
+
+# ansible.builtin.copy does not create missing parent directories, so make
+# sure the drop-in directory exists before writing into it.
+- name: Ensure Alloy systemd drop-in directory exists
+  ansible.builtin.file:
+    path: /etc/systemd/system/alloy.service.d
+    state: directory
+    mode: '0755'
+  when: common_alloy_unit.stat.exists | default(false)
+
+- name: Install Alloy systemd resilience drop-in
+  ansible.builtin.copy:
+    dest: /etc/systemd/system/alloy.service.d/10-resilience.conf
+    mode: '0644'
+    content: |
+      # Managed by Ansible (pez-infra) — common role, PESO-149.
+      # Keep Alloy retrying through transient upstream/TLS failures instead of
+      # hitting systemd's start rate-limit and giving up.
+      [Unit]
+      StartLimitIntervalSec=0
+
+      [Service]
+      Restart=always
+      RestartSec=30
+  when: common_alloy_unit.stat.exists | default(false)
+  notify: Restart alloy
+
+# --- Cleanup: leftover self-hosted Grafana (PESO-149) ---
+# Monitoring has moved fully to Grafana Cloud, yet copenhagen-c still carries
+# the old Grafana OSS package, with grafana-server.service enabled and
+# failing. Purge it wherever the unit file is found; other hosts skip this.
+- name: Detect leftover grafana-server unit
+  register: common_grafana_unit
+  ansible.builtin.stat:
+    path: /lib/systemd/system/grafana-server.service
+
+- name: Remove leftover self-hosted Grafana
+  when: common_grafana_unit.stat.exists | default(false)
+  block:
+    - name: Stop and disable grafana-server
+      failed_when: false  # best-effort: a failure here must not block the purge
+      ansible.builtin.systemd:
+        name: grafana-server
+        enabled: false
+        state: stopped
+
+    - name: Uninstall grafana package
+      notify: Reload systemd daemon
+      ansible.builtin.apt:
+        name: grafana
+        purge: true
+        state: absent
diff --git a/docs/monitoring.md b/docs/monitoring.md
index cfce62a..6ed56e8 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -30,6 +30,8 @@ Alloy runs as `alloy.service` on every host in the inventory. Each host is regis
 
 Pipelines (what to scrape, how to relabel, where to ship) live in `terraform/grafana/fleet_pipelines/` and are pushed to Grafana Cloud as a `grafana_fleet_management_pipeline` resource. The Alloy daemons on each host pull their config from Fleet Management.
 
+The `common` role installs a systemd drop-in, `/etc/systemd/system/alloy.service.d/10-resilience.conf`, on every host where the `alloy.service` unit is present (`StartLimitIntervalSec=0`, `Restart=always`, `RestartSec=30`), so a transient upstream/TLS failure can't trip systemd's start rate-limit and permanently kill the collector — systemd restarts it 30 seconds after each exit until Grafana Cloud is reachable again. (Added after copenhagen-c sat unmonitored for ~2.5 weeks following one such blip — PESO-149.)
+
 ### Local exporters scraped by Alloy
 
 | Exporter | Hosts | What |