---
# Common baseline for all Linux hosts.
# Installs core packages, configures SSH, sets up the shell environment,
# applies the UFW firewall policy, makes Grafana Alloy restart-resilient and
# cleans up retired services (cloudflared, self-hosted Grafana).

- name: Update apt cache
  ansible.builtin.apt:
    update_cache: true
    cache_valid_time: 3600

- name: Install baseline packages
  ansible.builtin.apt:
    name:
      - curl
      - wget
      - git
      - htop
      - tmux
      - vim
      - jq
      - unzip
      - fish
      - rsync
      - fail2ban
      - ufw
    state: present

# london-a is excluded from the fish default-shell change; the reason is not
# recorded in this file.
- name: Get fish shell path
  ansible.builtin.command: which fish
  changed_when: false
  register: common_fish_path
  when: inventory_hostname != 'london-a'

- name: Set fish as default shell
  ansible.builtin.user:
    name: root
    shell: "{{ common_fish_path.stdout }}"
  when: inventory_hostname != 'london-a'

- name: Ensure SSH directory exists
  ansible.builtin.file:
    path: /root/.ssh
    state: directory
    mode: '0700'

# Each regexp also matches the commented-out stock default, so the shipped
# "#Option ..." line is replaced in place rather than a duplicate appended.
- name: Harden SSH config
  ansible.builtin.lineinfile:
    path: /etc/ssh/sshd_config
    regexp: "{{ item.regexp }}"
    line: "{{ item.line }}"
    state: present
  loop:
    - regexp: '^#?PermitRootLogin'
      line: 'PermitRootLogin prohibit-password'
    - regexp: '^#?PasswordAuthentication'
      line: 'PasswordAuthentication no'
    - regexp: '^#?X11Forwarding'
      line: 'X11Forwarding no'
  notify: Restart sshd

- name: Enable fail2ban
  ansible.builtin.service:
    name: fail2ban
    state: started
    enabled: true

# --- UFW firewall ---
# Every firewall task is gated on common_ufw_enabled. Rules are laid down
# first and UFW is only switched on at the end, after the SSH allow rule
# exists.

- name: Set UFW default deny incoming
  community.general.ufw:
    direction: incoming
    default: deny
  when: common_ufw_enabled | bool
  notify: Reload ufw

- name: Set UFW default allow outgoing
  community.general.ufw:
    direction: outgoing
    default: allow
  when: common_ufw_enabled | bool
  notify: Reload ufw

- name: Allow all traffic on Tailscale interface
  community.general.ufw:
    rule: allow
    direction: in
    interface: tailscale0
    comment: "Tailscale mesh - allow all"
  when: common_ufw_enabled | bool
  notify: Reload ufw

- name: Allow SSH (safety net)
  community.general.ufw:
    rule: allow
    port: '22'
    proto: tcp
    comment: "SSH"
  when: common_ufw_enabled | bool
  notify: Reload ufw

# Each item needs `port`; `proto` defaults to tcp, and `from_ip` / `comment`
# are omitted from the module call when not set on the item.
- name: Allow host-specific ports
  community.general.ufw:
    rule: allow
    port: "{{ item.port | string }}"
    proto: "{{ item.proto | default('tcp') }}"
    from_ip: "{{ item.from_ip | default(omit) }}"
    comment: "{{ item.comment | default(omit) }}"
  loop: "{{ common_ufw_allowed_ports }}"
  when:
    - common_ufw_enabled | bool
    - common_ufw_allowed_ports | length > 0
  notify: Reload ufw

# When a port is restricted to a source (from_ip), make sure the older
# unrestricted "allow from anywhere" variant of the same rule isn't left
# lingering on the host — UFW keeps it otherwise, which would defeat the
# source restriction. Deleting an absent rule is a no-op, so this is safe
# on hosts that never had the broad rule.
- name: Remove superseded world-open rules for source-restricted ports
  community.general.ufw:
    rule: allow
    port: "{{ item.port | string }}"
    proto: "{{ item.proto | default('tcp') }}"
    delete: true
  loop: "{{ common_ufw_allowed_ports | selectattr('from_ip', 'defined') | list }}"
  when: common_ufw_enabled | bool
  notify: Reload ufw

- name: Enable UFW
  community.general.ufw:
    state: enabled
  when: common_ufw_enabled | bool

# --- Cleanup: orphaned cloudflared (PESO-138) ---
# Cloudflare Tunnels were retired in favour of Caddy + Authelia (PESO-134, #56),
# which removed cloudflared from ansible config. copenhagen-a was unreachable at
# the time, so its cloudflared.service was never actually stopped and is still
# running. Remove it wherever the unit lingers. copenhagen-c legitimately runs a
# hand-configured cloudflared tunnel — never touch it.

- name: Detect lingering cloudflared unit
  ansible.builtin.stat:
    path: /etc/systemd/system/cloudflared.service
  register: common_cloudflared_unit
  when: inventory_hostname != 'copenhagen-c'

- name: Remove orphaned cloudflared
  when:
    - inventory_hostname != 'copenhagen-c'
    - common_cloudflared_unit.stat.exists | default(false)
  block:
    # Best effort: failures are ignored so the removal steps below still run.
    - name: Stop and disable cloudflared
      ansible.builtin.systemd:
        name: cloudflared
        state: stopped
        enabled: false
      failed_when: false

    - name: Remove cloudflared systemd unit
      ansible.builtin.file:
        path: /etc/systemd/system/cloudflared.service
        state: absent
      notify: Reload systemd daemon

    - name: Uninstall cloudflared package
      ansible.builtin.apt:
        name: cloudflared
        state: absent
        purge: true

# --- Grafana Alloy: restart resilience (PESO-149) ---
# Every host runs alloy.service to ship metrics/logs to Grafana Cloud. On
# 2026-05-20 a transient TLS failure to Grafana fleet-management tripped
# systemd's default start rate-limit on copenhagen-c; systemd then gave up and
# the host went silently unmonitored for ~2.5 weeks. Disable the rate limit so
# Alloy keeps retrying indefinitely, backing off 30s between attempts, instead
# of dying permanently on a momentary blip.

- name: Check for Alloy unit
  ansible.builtin.stat:
    path: /lib/systemd/system/alloy.service
  register: common_alloy_unit

# ansible.builtin.copy does not create missing parent directories, so the
# drop-in directory has to exist before the file is written into it.
- name: Ensure Alloy drop-in directory exists
  ansible.builtin.file:
    path: /etc/systemd/system/alloy.service.d
    state: directory
    mode: '0755'
  when: common_alloy_unit.stat.exists | default(false)

# A new or changed drop-in only takes effect after a daemon-reload, so both
# handlers are notified.
# NOTE(review): handlers run in the order they are defined in the handlers
# file — confirm "Reload systemd daemon" is defined before "Restart alloy"
# (or that "Restart alloy" itself sets daemon_reload: true).
- name: Install Alloy systemd resilience drop-in
  ansible.builtin.copy:
    dest: /etc/systemd/system/alloy.service.d/10-resilience.conf
    mode: '0644'
    content: |
      # Managed by Ansible (pez-infra) — common role, PESO-149.
      # Keep Alloy retrying through transient upstream/TLS failures instead of
      # hitting systemd's start rate-limit and giving up.
      [Unit]
      StartLimitIntervalSec=0

      [Service]
      Restart=always
      RestartSec=30
  when: common_alloy_unit.stat.exists | default(false)
  notify:
    - Reload systemd daemon
    - Restart alloy

# --- Cleanup: leftover self-hosted Grafana (PESO-149) ---
# Monitoring moved fully to Grafana Cloud, but the old Grafana OSS package was
# never removed from copenhagen-c, where grafana-server.service sits enabled
# and failing. Remove it wherever it lingers (no-op on every other host).

- name: Detect leftover grafana-server unit
  ansible.builtin.stat:
    path: /lib/systemd/system/grafana-server.service
  register: common_grafana_unit

- name: Remove leftover self-hosted Grafana
  when: common_grafana_unit.stat.exists | default(false)
  block:
    # Best effort: the unit is already failing, so a stop error must not
    # block the package purge below.
    - name: Stop and disable grafana-server
      ansible.builtin.systemd:
        name: grafana-server
        state: stopped
        enabled: false
      failed_when: false

    - name: Uninstall grafana package
      ansible.builtin.apt:
        name: grafana
        state: absent
        purge: true
      notify: Reload systemd daemon