pez-infra/ansible/roles/common/tasks/main.yml
Rasmus "Pez" Wejlgaard 9ac179dbec
Make Alloy resilient to transient failures; remove leftover Grafana (PESO-149) (#126)
copenhagen-c stopped reporting to Grafana Cloud on 2026-05-20: a transient
TLS failure to fleet-management tripped systemd's default start rate-limit,
systemd gave up, and the host sat silently unmonitored for ~2.5 weeks.

Add a 10-resilience.conf systemd drop-in for alloy.service on every host
(StartLimitIntervalSec=0, Restart=always, RestartSec=30) so a momentary
upstream/TLS blip can no longer permanently kill the collector.

Also drop the old self-hosted Grafana package that was left enabled and
failing on copenhagen-c after the move to Grafana Cloud.
2026-06-07 14:30:08 +01:00

219 lines
6.5 KiB
YAML

---
# Common baseline for all Linux hosts.
# Installs core packages, configures SSH, sets up the shell environment.
# Refresh the package index before anything is installed. The cache is
# treated as fresh for 3600 seconds, so back-to-back runs skip the refresh.
- name: Update apt cache
  ansible.builtin.apt:
    cache_valid_time: 3600
    update_cache: true
# Baseline toolset expected on every host. Installed in a single apt
# transaction; packages already present are left untouched.
- name: Install baseline packages
  ansible.builtin.apt:
    state: present
    name:
      # Fetching and version control
      - curl
      - wget
      - git
      # Interactive tooling
      - htop
      - tmux
      - vim
      - jq
      - unzip
      - fish
      - rsync
      # Host protection (configured further down in this file)
      - fail2ban
      - ufw
# Resolve the installed fish binary so the following task can set it as the
# login shell. This is a read-only lookup, so it never reports a change.
# london-a is excluded.
- name: Get fish shell path
  when: inventory_hostname != 'london-a'
  ansible.builtin.command:
    cmd: which fish
  register: common_fish_path
  changed_when: false
# Point root's login shell at the path registered by "Get fish shell path".
# Shares that task's london-a exclusion, so the registered value is always
# populated when this runs.
- name: Set fish as default shell
  when: inventory_hostname != 'london-a'
  ansible.builtin.user:
    name: root
    shell: "{{ common_fish_path.stdout }}"
# root's ~/.ssh must exist and be private (0700).
- name: Ensure SSH directory exists
  ansible.builtin.file:
    state: directory
    path: /root/.ssh
    mode: '0700'
# Pin three sshd_config directives. Each regexp also matches the commented-out
# stock line ("#Directive ..."), so that line is replaced in place instead of
# a duplicate being appended. Any change notifies the sshd restart handler.
- name: Harden SSH config
  ansible.builtin.lineinfile:
    path: /etc/ssh/sshd_config
    state: present
    regexp: "{{ item.regexp }}"
    line: "{{ item.line }}"
  loop:
    # Root may log in with keys only.
    - regexp: '^#?PermitRootLogin'
      line: 'PermitRootLogin prohibit-password'
    # No password logins for anyone.
    - regexp: '^#?PasswordAuthentication'
      line: 'PasswordAuthentication no'
    # No X11 forwarding.
    - regexp: '^#?X11Forwarding'
      line: 'X11Forwarding no'
  notify: Restart sshd
# fail2ban is installed with the baseline packages; make sure it is running
# now and starts on boot.
- name: Enable fail2ban
  ansible.builtin.service:
    name: fail2ban
    enabled: true
    state: started
# --- UFW firewall ---
# Default policy: drop inbound traffic unless a later rule allows it.
# Only applied when common_ufw_enabled is truthy.
- name: Set UFW default deny incoming
  when: common_ufw_enabled | bool
  community.general.ufw:
    default: deny
    direction: incoming
  notify: Reload ufw
# Default policy: outbound traffic is unrestricted.
# Only applied when common_ufw_enabled is truthy.
- name: Set UFW default allow outgoing
  when: common_ufw_enabled | bool
  community.general.ufw:
    default: allow
    direction: outgoing
  notify: Reload ufw
# Accept everything arriving on tailscale0, regardless of port or protocol.
- name: Allow all traffic on Tailscale interface
  when: common_ufw_enabled | bool
  community.general.ufw:
    rule: allow
    interface: tailscale0
    direction: in
    comment: "Tailscale mesh - allow all"
  notify: Reload ufw
# Always allow inbound SSH on 22/tcp from any source, independent of the
# per-host port list, so the firewall cannot lock out SSH access.
- name: Allow SSH (safety net)
  when: common_ufw_enabled | bool
  community.general.ufw:
    rule: allow
    proto: tcp
    port: '22'
    comment: "SSH"
  notify: Reload ufw
# Open the ports listed in common_ufw_allowed_ports. Each entry supplies
# `port`, and may supply `proto` (tcp when absent), `from_ip` (any source
# when absent) and `comment`.
- name: Allow host-specific ports
  when:
    - common_ufw_enabled | bool
    - common_ufw_allowed_ports | length > 0
  community.general.ufw:
    rule: allow
    port: "{{ item.port | string }}"
    proto: "{{ item.proto | default('tcp') }}"
    from_ip: "{{ item.from_ip | default(omit) }}"
    comment: "{{ item.comment | default(omit) }}"
  loop: "{{ common_ufw_allowed_ports }}"
  notify: Reload ufw
# When a port is restricted to a source (from_ip), make sure the older
# unrestricted "allow from anywhere" variant of the same rule isn't left
# lingering on the host — UFW keeps it otherwise, which would defeat the
# source restriction. Deleting an absent rule is a no-op, so this is safe
# on hosts that never had the broad rule.
# Runs only over entries that define from_ip, and deletes the matching
# source-less allow rule for the same port/proto.
# NOTE(review): if 22/tcp is ever listed with from_ip, this presumably also
# matches the rule added by "Allow SSH (safety net)" — confirm that is never
# the case, or exclude port 22 here.
- name: Remove superseded world-open rules for source-restricted ports
  when: common_ufw_enabled | bool
  community.general.ufw:
    delete: true
    rule: allow
    port: "{{ item.port | string }}"
    proto: "{{ item.proto | default('tcp') }}"
  loop: "{{ common_ufw_allowed_ports | selectattr('from_ip', 'defined') | list }}"
  notify: Reload ufw
# Switch the firewall on last, after every allow rule above has been declared.
- name: Enable UFW
  when: common_ufw_enabled | bool
  community.general.ufw:
    state: enabled
# --- Cleanup: orphaned cloudflared (PESO-138) ---
# Cloudflare Tunnels were retired in favour of Caddy + Authelia (PESO-134, #56),
# which removed cloudflared from ansible config. copenhagen-a was unreachable at
# the time, so its cloudflared.service was never actually stopped and is still
# running. Remove it wherever the unit lingers. copenhagen-c legitimately runs a
# hand-configured cloudflared tunnel — never touch it.
# Probe for a leftover cloudflared unit file. copenhagen-c is never probed,
# so the cleanup below cannot act on it.
- name: Detect lingering cloudflared unit
  when: inventory_hostname != 'copenhagen-c'
  ansible.builtin.stat:
    path: /etc/systemd/system/cloudflared.service
  register: common_cloudflared_unit
# Tear down cloudflared where the unit file was found: stop the service,
# delete the unit, then purge the package. The hostname guard is repeated so
# copenhagen-c is excluded even though its stat result was never registered.
- name: Remove orphaned cloudflared
  when:
    - inventory_hostname != 'copenhagen-c'
    - common_cloudflared_unit.stat.exists | default(false)
  block:
    # Best effort: errors from this step are ignored and the removal below
    # proceeds regardless.
    - name: Stop and disable cloudflared
      ansible.builtin.systemd:
        name: cloudflared
        enabled: false
        state: stopped
      failed_when: false

    # systemd must re-read its units once the file is gone.
    - name: Remove cloudflared systemd unit
      ansible.builtin.file:
        state: absent
        path: /etc/systemd/system/cloudflared.service
      notify: Reload systemd daemon

    - name: Uninstall cloudflared package
      ansible.builtin.apt:
        name: cloudflared
        purge: true
        state: absent
# --- Grafana Alloy: restart resilience (PESO-149) ---
# Every host runs alloy.service to ship metrics/logs to Grafana Cloud. On
# 2026-05-20 a transient TLS failure to Grafana fleet-management tripped
# systemd's default start rate-limit on copenhagen-c; systemd then gave up and
# the host went silently unmonitored for ~2.5 weeks. Disable the rate limit so
# Alloy keeps retrying indefinitely, backing off 30s between attempts, instead
# of dying permanently on a momentary blip.
# Record whether the alloy.service unit file is present at this path; the
# drop-in task is gated on the result.
- name: Check for Alloy unit
  ansible.builtin.stat:
    path: /lib/systemd/system/alloy.service
  register: common_alloy_unit
# ansible.builtin.copy does not create missing parent directories for a
# single-file dest, so the drop-in directory has to exist before the copy
# below or that task fails on hosts that have never had an alloy.service.d.
- name: Ensure Alloy systemd drop-in directory exists
  when: common_alloy_unit.stat.exists | default(false)
  ansible.builtin.file:
    path: /etc/systemd/system/alloy.service.d
    state: directory
    mode: '0755'

# Write the drop-in that disables the start rate-limit and restarts Alloy
# 30s after any exit. Only on hosts where the Alloy unit file was found.
# systemd does not load a new or changed drop-in until a daemon-reload, so
# the reload handler is notified as well as the restart.
# NOTE(review): handlers run in the order they are defined in the handlers
# file, not the order listed here — confirm "Reload systemd daemon" is
# defined before "Restart alloy".
- name: Install Alloy systemd resilience drop-in
  when: common_alloy_unit.stat.exists | default(false)
  ansible.builtin.copy:
    dest: /etc/systemd/system/alloy.service.d/10-resilience.conf
    mode: '0644'
    content: |
      # Managed by Ansible (pez-infra) — common role, PESO-149.
      # Keep Alloy retrying through transient upstream/TLS failures instead of
      # hitting systemd's start rate-limit and giving up.
      [Unit]
      StartLimitIntervalSec=0
      [Service]
      Restart=always
      RestartSec=30
  notify:
    - Reload systemd daemon
    - Restart alloy
# --- Cleanup: leftover self-hosted Grafana (PESO-149) ---
# Monitoring moved fully to Grafana Cloud, but the old Grafana OSS package was
# never removed from copenhagen-c, where grafana-server.service sits enabled
# and failing. Remove it wherever it lingers (no-op on every other host).
# Probe for the packaged grafana-server unit file; the removal block below
# runs only where it is found.
- name: Detect leftover grafana-server unit
  ansible.builtin.stat:
    path: /lib/systemd/system/grafana-server.service
  register: common_grafana_unit
# Where the grafana-server unit file was found: stop and disable the
# service, then purge the grafana package.
- name: Remove leftover self-hosted Grafana
  when: common_grafana_unit.stat.exists | default(false)
  block:
    # Best effort: errors from this step are ignored so the purge below
    # still runs.
    - name: Stop and disable grafana-server
      ansible.builtin.systemd:
        name: grafana-server
        enabled: false
        state: stopped
      failed_when: false

    # A change here triggers a systemd daemon reload via the handler.
    - name: Uninstall grafana package
      ansible.builtin.apt:
        name: grafana
        purge: true
        state: absent
      notify: Reload systemd daemon