mirror of
https://github.com/RWejlgaard/pez-infra.git
synced 2026-07-04 15:46:16 +00:00
copenhagen-c stopped reporting to Grafana Cloud on 2026-05-20: a transient TLS failure to fleet-management tripped systemd's default start rate-limit, systemd gave up, and the host sat silently unmonitored for ~2.5 weeks. Add a 10-resilience.conf systemd drop-in for alloy.service on every host (StartLimitIntervalSec=0, Restart=always, RestartSec=30) so a momentary upstream/TLS blip can no longer permanently kill the collector. Also drop the old self-hosted Grafana package that was left enabled and failing on copenhagen-c after the move to Grafana Cloud.
219 lines
6.5 KiB
YAML
219 lines
6.5 KiB
YAML
---
|
|
# Common baseline for all Linux hosts.
|
|
# Installs core packages, configures SSH, sets up the shell environment.
|
|
|
|
# Refresh the apt package index, reusing a cache younger than one hour.
- name: Update apt cache
  ansible.builtin.apt:
    cache_valid_time: 3600  # seconds
    update_cache: true
|
|
|
|
# Baseline toolset for every host. fish, fail2ban and ufw are configured by
# later tasks in this file, so they must be installed here first.
- name: Install baseline packages
  ansible.builtin.apt:
    state: present
    name:
      - curl
      - wget
      - git
      - htop
      - tmux
      - vim
      - jq
      - unzip
      - fish
      - rsync
      - fail2ban
      - ufw
|
|
|
|
# Look up where fish is installed; the result feeds the default-shell task.
# Skipped on london-a.
- name: Get fish shell path
  when: inventory_hostname != 'london-a'
  ansible.builtin.command:
    cmd: which fish
  register: common_fish_path
  changed_when: false  # pure lookup, never reports a change
|
|
|
|
# Point root's login shell at the fish binary registered in common_fish_path.
# Skipped on london-a, matching the lookup task.
- name: Set fish as default shell
  when: inventory_hostname != 'london-a'
  ansible.builtin.user:
    shell: "{{ common_fish_path.stdout }}"
    name: root
|
|
|
|
# Make sure /root/.ssh exists and is accessible to its owner only.
- name: Ensure SSH directory exists
  ansible.builtin.file:
    state: directory
    path: /root/.ssh
    mode: "0700"
|
|
|
|
# Enforce three sshd directives. Each loop entry rewrites the matching
# directive (commented-out or not), or appends the line when nothing matches.
# sshd is restarted through the handler only when a line actually changed.
- name: Harden SSH config
  ansible.builtin.lineinfile:
    path: /etc/ssh/sshd_config
    state: present
    regexp: "{{ item.regexp }}"
    line: "{{ item.line }}"
  loop:
    - regexp: '^#?PermitRootLogin'
      line: 'PermitRootLogin prohibit-password'
    - regexp: '^#?PasswordAuthentication'
      line: 'PasswordAuthentication no'
    - regexp: '^#?X11Forwarding'
      line: 'X11Forwarding no'
  notify: Restart sshd
|
|
|
|
# Start fail2ban now and enable it at boot.
- name: Enable fail2ban
  ansible.builtin.service:
    name: fail2ban
    enabled: true
    state: started
|
|
|
|
# --- UFW firewall ---
|
|
|
|
# Default policy: drop inbound traffic unless a later rule allows it.
# Only applied when the host opts in through common_ufw_enabled.
- name: Set UFW default deny incoming
  when: common_ufw_enabled | bool
  community.general.ufw:
    default: deny
    direction: incoming
  notify: Reload ufw
|
|
|
|
# Default policy: outbound traffic is allowed.
# Only applied when the host opts in through common_ufw_enabled.
- name: Set UFW default allow outgoing
  when: common_ufw_enabled | bool
  community.general.ufw:
    default: allow
    direction: outgoing
  notify: Reload ufw
|
|
|
|
# Accept every inbound packet that arrives on the tailscale0 interface.
- name: Allow all traffic on Tailscale interface
  when: common_ufw_enabled | bool
  community.general.ufw:
    interface: tailscale0
    direction: in
    rule: allow
    comment: "Tailscale mesh - allow all"
  notify: Reload ufw
|
|
|
|
# Always permit TCP/22, so SSH stays reachable once the deny-by-default
# policy is active.
- name: Allow SSH (safety net)
  when: common_ufw_enabled | bool
  community.general.ufw:
    rule: allow
    proto: tcp
    port: '22'
    comment: "SSH"
  notify: Reload ufw
|
|
|
|
# Open each port listed in common_ufw_allowed_ports. Per entry:
#   port     - required, coerced to a string
#   proto    - optional, falls back to tcp
#   from_ip  - optional source restriction, omitted when unset
#   comment  - optional rule comment, omitted when unset
- name: Allow host-specific ports
  when:
    - common_ufw_enabled | bool
    - common_ufw_allowed_ports | length > 0
  loop: "{{ common_ufw_allowed_ports }}"
  community.general.ufw:
    rule: allow
    proto: "{{ item.proto | default('tcp') }}"
    port: "{{ item.port | string }}"
    from_ip: "{{ item.from_ip | default(omit) }}"
    comment: "{{ item.comment | default(omit) }}"
  notify: Reload ufw
|
|
|
|
# When a port is restricted to a source (from_ip), make sure the older
|
|
# unrestricted "allow from anywhere" variant of the same rule isn't left
|
|
# lingering on the host — UFW keeps it otherwise, which would defeat the
|
|
# source restriction. Deleting an absent rule is a no-op, so this is safe
|
|
# on hosts that never had the broad rule.
|
|
# Loops only over entries that define from_ip and deletes the matching
# port/proto allow rule that carries no source restriction.
- name: Remove superseded world-open rules for source-restricted ports
  when: common_ufw_enabled | bool
  loop: "{{ common_ufw_allowed_ports | selectattr('from_ip', 'defined') | list }}"
  community.general.ufw:
    delete: true
    rule: allow
    proto: "{{ item.proto | default('tcp') }}"
    port: "{{ item.port | string }}"
  notify: Reload ufw
|
|
|
|
# Turn the firewall on, after every allow rule above has been declared.
- name: Enable UFW
  when: common_ufw_enabled | bool
  community.general.ufw:
    state: enabled
|
|
|
|
# --- Cleanup: orphaned cloudflared (PESO-138) ---
|
|
# Cloudflare Tunnels were retired in favour of Caddy + Authelia (PESO-134, #56),
|
|
# which removed cloudflared from ansible config. copenhagen-a was unreachable at
|
|
# the time, so its cloudflared.service was never actually stopped and is still
|
|
# running. Remove it wherever the unit lingers. copenhagen-c legitimately runs a
|
|
# hand-configured cloudflared tunnel — never touch it.
|
|
# Check whether a cloudflared unit file is still on disk. Skipped on
# copenhagen-c, so common_cloudflared_unit carries no stat data there.
- name: Detect lingering cloudflared unit
  when: inventory_hostname != 'copenhagen-c'
  ansible.builtin.stat:
    path: /etc/systemd/system/cloudflared.service
  register: common_cloudflared_unit
|
|
|
|
# Tear down cloudflared where the unit file was found: stop the service,
# delete the unit, purge the package. Never runs on copenhagen-c.
- name: Remove orphaned cloudflared
  when:
    - inventory_hostname != 'copenhagen-c'
    # default(false) covers the case where the detect task was skipped.
    - common_cloudflared_unit.stat.exists | default(false)
  block:
    # Best effort: failed_when false keeps the cleanup going even if the
    # stop/disable step errors.
    - name: Stop and disable cloudflared
      ansible.builtin.systemd:
        name: cloudflared
        enabled: false
        state: stopped
      failed_when: false

    # Deleting the unit file triggers the systemd reload handler.
    - name: Remove cloudflared systemd unit
      ansible.builtin.file:
        state: absent
        path: /etc/systemd/system/cloudflared.service
      notify: Reload systemd daemon

    # purge also removes the package's configuration files.
    - name: Uninstall cloudflared package
      ansible.builtin.apt:
        name: cloudflared
        purge: true
        state: absent
|
|
|
|
# --- Grafana Alloy: restart resilience (PESO-149) ---
|
|
# Every host runs alloy.service to ship metrics/logs to Grafana Cloud. On
|
|
# 2026-05-20 a transient TLS failure to Grafana fleet-management tripped
|
|
# systemd's default start rate-limit on copenhagen-c; systemd then gave up and
|
|
# the host went silently unmonitored for ~2.5 weeks. Disable the rate limit so
|
|
# Alloy keeps retrying indefinitely, backing off 30s between attempts, instead
|
|
# of dying permanently on a momentary blip.
|
|
# Record whether the alloy.service unit is present; the drop-in task is
# gated on this result.
- name: Check for Alloy unit
  register: common_alloy_unit
  ansible.builtin.stat:
    path: /lib/systemd/system/alloy.service
|
|
|
|
# The drop-in lives in alloy.service.d/, and ansible.builtin.copy does not
# create missing parent directories. Create the directory first so the copy
# below cannot fail on a host that has never had an Alloy override.
- name: Ensure Alloy systemd drop-in directory exists
  ansible.builtin.file:
    path: /etc/systemd/system/alloy.service.d
    state: directory
    mode: '0755'
  when: common_alloy_unit.stat.exists | default(false)

# Write the override that disables the start rate-limit and restarts Alloy
# 30 seconds after any exit. Only runs where the Alloy unit was detected.
# NOTE(review): a new or changed drop-in takes effect only after a systemd
# daemon-reload. Confirm the "Restart alloy" handler (defined outside this
# file) reloads systemd before restarting the service.
- name: Install Alloy systemd resilience drop-in
  ansible.builtin.copy:
    dest: /etc/systemd/system/alloy.service.d/10-resilience.conf
    mode: '0644'
    content: |
      # Managed by Ansible (pez-infra) — common role, PESO-149.
      # Keep Alloy retrying through transient upstream/TLS failures instead of
      # hitting systemd's start rate-limit and giving up.
      [Unit]
      StartLimitIntervalSec=0

      [Service]
      Restart=always
      RestartSec=30
  when: common_alloy_unit.stat.exists | default(false)
  notify: Restart alloy
|
|
|
|
# --- Cleanup: leftover self-hosted Grafana (PESO-149) ---
|
|
# Monitoring moved fully to Grafana Cloud, but the old Grafana OSS package was
|
|
# never removed from copenhagen-c, where grafana-server.service sits enabled
|
|
# and failing. Remove it wherever it lingers (no-op on every other host).
|
|
# Record whether the grafana-server unit file is present; the removal block
# is gated on this result.
- name: Detect leftover grafana-server unit
  register: common_grafana_unit
  ansible.builtin.stat:
    path: /lib/systemd/system/grafana-server.service
|
|
|
|
# Stop the old grafana-server service and purge its package, only on hosts
# where the unit file was detected.
- name: Remove leftover self-hosted Grafana
  when: common_grafana_unit.stat.exists | default(false)
  block:
    # Best effort: failed_when false lets the purge proceed even if the
    # stop/disable step errors.
    - name: Stop and disable grafana-server
      ansible.builtin.systemd:
        name: grafana-server
        enabled: false
        state: stopped
      failed_when: false

    # Purge the package and its configuration, then reload systemd through
    # the handler.
    - name: Uninstall grafana package
      ansible.builtin.apt:
        name: grafana
        purge: true
        state: absent
      notify: Reload systemd daemon
|