From 737d6e0bc1a50e2acd56ff8697291e948537872d Mon Sep 17 00:00:00 2001 From: Rasmus Wejlgaard Date: Sat, 28 Mar 2026 12:30:17 +0000 Subject: [PATCH] initial commit --- .github/workflows/deploy-on-merge.yml | 56 + .github/workflows/deploy.yml | 82 + .github/workflows/lint-ansible.yml | 34 + .github/workflows/lint-docker-compose.yml | 32 + .github/workflows/terraform.yml | 113 + .github/workflows/validate-caddyfile.yml | 35 + .github/workflows/validate-terraform.yml | 54 + .gitignore | 53 + .sops.yaml | 3 + Makefile | 9 + README.md | 106 + ansible/Makefile | 41 + ansible/README.md | 73 + ansible/ansible.cfg | 12 + ansible/deploy.yml | 194 + ansible/dotfiles/Dockerfile-alpine | 9 + ansible/dotfiles/Dockerfile-archlinux | 10 + ansible/dotfiles/Dockerfile-fedora | 10 + ansible/dotfiles/Dockerfile-ubuntu | 9 + ansible/dotfiles/Makefile | 15 + ansible/dotfiles/README.md | 70 + .../dotfiles/config/fish/conf.d/aliases.fish | 42 + .../dotfiles/config/fish/conf.d/envvars.fish | 7 + .../config/fish/conf.d/functions.fish | 20 + ansible/dotfiles/config/fish/config.fish | 3 + ansible/dotfiles/config/git/gitconfig | 17 + ansible/dotfiles/config/kitty/kitty.conf | 22 + ansible/dotfiles/config/nvim/init.lua | 281 + ansible/dotfiles/config/tmux/tmux.conf | 88 + .../install-scripts/01-install-packages.sh | 73 + .../dotfiles/install-scripts/02-move-files.sh | 28 + .../install-scripts/03-fisher-install.fish | 4 + .../install-scripts/04-fish-plugins.fish | 15 + .../install-scripts/05-tmux-plugins.fish | 9 + .../install-scripts/06-vim-setup.fish | 4 + .../install-scripts/07-last-touches.sh | 29 + ansible/dotfiles/install.sh | 97 + ansible/dotfiles/pr-test.yml | 30 + .../scripts/gentoo-kernel-upgrade-openrc | 399 + .../scripts/gentoo-kernel-upgrade-systemd | 397 + ansible/group_vars/all/secrets.enc.yaml | 16 + ansible/inventory/host_vars/copenhagen-a.yml | 12 + ansible/inventory/host_vars/copenhagen-c.yml | 4 + ansible/inventory/host_vars/helsinki-a.yml | 6 + 
ansible/inventory/host_vars/london-a.yml | 6 + ansible/inventory/host_vars/london-b.yml | 16 + ansible/inventory/host_vars/nuremberg-a.yml | 5 + ansible/inventory/hosts.ini | 25 + ansible/playbooks/docker-status.yml | 16 + ansible/playbooks/monitoring.yml | 95 + ansible/playbooks/reboot.yml | 68 + ansible/playbooks/update-all.yml | 64 + ansible/playbooks/update-freebsd.yml | 24 + ansible/playbooks/update-linux.yml | 46 + ansible/requirements.yml | 11 + ansible/roles/caddy/handlers/main.yml | 5 + ansible/roles/caddy/tasks/main.yml | 31 + ansible/roles/common/handlers/main.yml | 5 + ansible/roles/common/tasks/main.yml | 102 + ansible/roles/docker/tasks/main.yml | 31 + ansible/roles/docker_services/tasks/main.yml | 32 + ansible/roles/dotfiles/tasks/main.yml | 24 + ansible/roles/node_exporter/tasks/main.yml | 48 + .../roles/systemd_services/handlers/main.yml | 4 + ansible/roles/systemd_services/tasks/main.yml | 22 + ansible/scripts/docker-log-cleanup.sh | 15 + ansible/scripts/hdd-backup.sh | 39 + ansible/services/README.md | 58 + ansible/services/authelia/README.md | 13 + .../services/authelia/config.enc.yml.example | 10 + ansible/services/authelia/docker-compose.yml | 77 + ansible/services/bitwarden/README.md | 11 + ansible/services/bitwarden/docker-compose.yml | 33 + ansible/services/caddy/Caddyfile | 246 + ansible/services/caddy/Caddyfile.template | 198 + ansible/services/caddy/README.md | 129 + ansible/services/forgejo/README.md | 9 + ansible/services/forgejo/docker-compose.yml | 26 + ansible/services/grafana/README.md | 62 + .../grafana/dashboards/infrastructure.json | 1034 + .../dashboards/living-room-display.json | 1011 + .../dashboards/node-exporter-full.json | 15726 ++++++++++++++++ .../grafana/dashboards/traffic-slo.json | 587 + .../provisioning/alerting/contact-points.yml | 23 + .../alerting/notification-policy.yml | 31 + .../provisioning/alerting/rules-critical.yml | 358 + .../provisioning/alerting/rules-warning.yml | 242 + 
.../provisioning/dashboards/dashboards.yml | 15 + .../provisioning/datasources/datasources.json | 18 + ansible/services/jellyseerr/README.md | 8 + .../services/jellyseerr/docker-compose.yml | 17 + .../mangos-realmd/mangos-realmd.service | 16 + .../mangos-world/mangos-world.service | 16 + ansible/services/minecraft/README.md | 10 + ansible/services/minecraft/docker-compose.yml | 19 + ansible/services/miniflux/README.md | 10 + ansible/services/miniflux/docker-compose.yml | 35 + ansible/services/navidrome/README.md | 9 + ansible/services/navidrome/docker-compose.yml | 17 + ansible/services/nextcloud-aio/README.md | 10 + .../services/nextcloud-aio/docker-compose.yml | 32 + ansible/services/plex-exporter/README.md | 9 + .../services/plex-exporter/docker-compose.yml | 14 + ansible/services/poste-io/README.md | 9 + ansible/services/poste-io/docker-compose.yml | 24 + ansible/services/prometheus/README.md | 55 + ansible/services/prometheus/prometheus.yml | 71 + .../prometheus/rules/node-exporter.rules | 9 + ansible/services/rc.d/london-a/rc.conf | 25 + ansible/services/slskd/README.md | 9 + ansible/services/slskd/docker-compose.yml | 19 + ansible/services/smartctl-exporter/README.md | 8 + .../smartctl-exporter/docker-compose.yml | 12 + .../systemd/copenhagen-a/cloudflared.service | 13 + .../services/systemd/helsinki-a/caddy.service | 25 + .../thiswebsitedoesnotexist.service | 17 + docs/README.md | 23 + docs/architecture.md | 110 + docs/getting-started.md | 157 + docs/hosts/copenhagen-a.md | 59 + docs/hosts/copenhagen-c.md | 21 + docs/hosts/helsinki-a.md | 38 + docs/hosts/london-a.md | 43 + docs/hosts/london-b.md | 75 + docs/hosts/nuremberg-a.md | 34 + docs/monitoring.md | 124 + docs/networking.md | 152 + docs/secrets.md | 152 + docs/services.md | 109 + terraform/.gitignore | 5 + terraform/Makefile | 28 + terraform/README.md | 22 + terraform/cloudflare_account.tf | 3 + terraform/cloudflare_dns.tf | 477 + terraform/providers.tf | 24 + terraform/secrets.enc.yaml | 20 + 
terraform/vars.tf | 3 + 137 files changed, 25471 insertions(+) create mode 100644 .github/workflows/deploy-on-merge.yml create mode 100644 .github/workflows/deploy.yml create mode 100644 .github/workflows/lint-ansible.yml create mode 100644 .github/workflows/lint-docker-compose.yml create mode 100644 .github/workflows/terraform.yml create mode 100644 .github/workflows/validate-caddyfile.yml create mode 100644 .github/workflows/validate-terraform.yml create mode 100644 .gitignore create mode 100644 .sops.yaml create mode 100644 Makefile create mode 100644 README.md create mode 100644 ansible/Makefile create mode 100644 ansible/README.md create mode 100644 ansible/ansible.cfg create mode 100644 ansible/deploy.yml create mode 100644 ansible/dotfiles/Dockerfile-alpine create mode 100644 ansible/dotfiles/Dockerfile-archlinux create mode 100644 ansible/dotfiles/Dockerfile-fedora create mode 100644 ansible/dotfiles/Dockerfile-ubuntu create mode 100644 ansible/dotfiles/Makefile create mode 100644 ansible/dotfiles/README.md create mode 100644 ansible/dotfiles/config/fish/conf.d/aliases.fish create mode 100644 ansible/dotfiles/config/fish/conf.d/envvars.fish create mode 100644 ansible/dotfiles/config/fish/conf.d/functions.fish create mode 100644 ansible/dotfiles/config/fish/config.fish create mode 100644 ansible/dotfiles/config/git/gitconfig create mode 100644 ansible/dotfiles/config/kitty/kitty.conf create mode 100644 ansible/dotfiles/config/nvim/init.lua create mode 100644 ansible/dotfiles/config/tmux/tmux.conf create mode 100755 ansible/dotfiles/install-scripts/01-install-packages.sh create mode 100755 ansible/dotfiles/install-scripts/02-move-files.sh create mode 100755 ansible/dotfiles/install-scripts/03-fisher-install.fish create mode 100755 ansible/dotfiles/install-scripts/04-fish-plugins.fish create mode 100755 ansible/dotfiles/install-scripts/05-tmux-plugins.fish create mode 100755 ansible/dotfiles/install-scripts/06-vim-setup.fish create mode 100755 
ansible/dotfiles/install-scripts/07-last-touches.sh create mode 100755 ansible/dotfiles/install.sh create mode 100644 ansible/dotfiles/pr-test.yml create mode 100755 ansible/dotfiles/scripts/gentoo-kernel-upgrade-openrc create mode 100755 ansible/dotfiles/scripts/gentoo-kernel-upgrade-systemd create mode 100644 ansible/group_vars/all/secrets.enc.yaml create mode 100644 ansible/inventory/host_vars/copenhagen-a.yml create mode 100644 ansible/inventory/host_vars/copenhagen-c.yml create mode 100644 ansible/inventory/host_vars/helsinki-a.yml create mode 100644 ansible/inventory/host_vars/london-a.yml create mode 100644 ansible/inventory/host_vars/london-b.yml create mode 100644 ansible/inventory/host_vars/nuremberg-a.yml create mode 100644 ansible/inventory/hosts.ini create mode 100644 ansible/playbooks/docker-status.yml create mode 100644 ansible/playbooks/monitoring.yml create mode 100644 ansible/playbooks/reboot.yml create mode 100644 ansible/playbooks/update-all.yml create mode 100644 ansible/playbooks/update-freebsd.yml create mode 100644 ansible/playbooks/update-linux.yml create mode 100644 ansible/requirements.yml create mode 100644 ansible/roles/caddy/handlers/main.yml create mode 100644 ansible/roles/caddy/tasks/main.yml create mode 100644 ansible/roles/common/handlers/main.yml create mode 100644 ansible/roles/common/tasks/main.yml create mode 100644 ansible/roles/docker/tasks/main.yml create mode 100644 ansible/roles/docker_services/tasks/main.yml create mode 100644 ansible/roles/dotfiles/tasks/main.yml create mode 100644 ansible/roles/node_exporter/tasks/main.yml create mode 100644 ansible/roles/systemd_services/handlers/main.yml create mode 100644 ansible/roles/systemd_services/tasks/main.yml create mode 100755 ansible/scripts/docker-log-cleanup.sh create mode 100755 ansible/scripts/hdd-backup.sh create mode 100644 ansible/services/README.md create mode 100644 ansible/services/authelia/README.md create mode 100644 
ansible/services/authelia/config.enc.yml.example create mode 100644 ansible/services/authelia/docker-compose.yml create mode 100644 ansible/services/bitwarden/README.md create mode 100644 ansible/services/bitwarden/docker-compose.yml create mode 100644 ansible/services/caddy/Caddyfile create mode 100644 ansible/services/caddy/Caddyfile.template create mode 100644 ansible/services/caddy/README.md create mode 100644 ansible/services/forgejo/README.md create mode 100644 ansible/services/forgejo/docker-compose.yml create mode 100644 ansible/services/grafana/README.md create mode 100644 ansible/services/grafana/dashboards/infrastructure.json create mode 100644 ansible/services/grafana/dashboards/living-room-display.json create mode 100644 ansible/services/grafana/dashboards/node-exporter-full.json create mode 100644 ansible/services/grafana/dashboards/traffic-slo.json create mode 100644 ansible/services/grafana/provisioning/alerting/contact-points.yml create mode 100644 ansible/services/grafana/provisioning/alerting/notification-policy.yml create mode 100644 ansible/services/grafana/provisioning/alerting/rules-critical.yml create mode 100644 ansible/services/grafana/provisioning/alerting/rules-warning.yml create mode 100644 ansible/services/grafana/provisioning/dashboards/dashboards.yml create mode 100644 ansible/services/grafana/provisioning/datasources/datasources.json create mode 100644 ansible/services/jellyseerr/README.md create mode 100644 ansible/services/jellyseerr/docker-compose.yml create mode 100644 ansible/services/mangos-realmd/mangos-realmd.service create mode 100644 ansible/services/mangos-world/mangos-world.service create mode 100644 ansible/services/minecraft/README.md create mode 100644 ansible/services/minecraft/docker-compose.yml create mode 100644 ansible/services/miniflux/README.md create mode 100644 ansible/services/miniflux/docker-compose.yml create mode 100644 ansible/services/navidrome/README.md create mode 100644 
ansible/services/navidrome/docker-compose.yml create mode 100644 ansible/services/nextcloud-aio/README.md create mode 100644 ansible/services/nextcloud-aio/docker-compose.yml create mode 100644 ansible/services/plex-exporter/README.md create mode 100644 ansible/services/plex-exporter/docker-compose.yml create mode 100644 ansible/services/poste-io/README.md create mode 100644 ansible/services/poste-io/docker-compose.yml create mode 100644 ansible/services/prometheus/README.md create mode 100644 ansible/services/prometheus/prometheus.yml create mode 100644 ansible/services/prometheus/rules/node-exporter.rules create mode 100644 ansible/services/rc.d/london-a/rc.conf create mode 100644 ansible/services/slskd/README.md create mode 100644 ansible/services/slskd/docker-compose.yml create mode 100644 ansible/services/smartctl-exporter/README.md create mode 100644 ansible/services/smartctl-exporter/docker-compose.yml create mode 100644 ansible/services/systemd/copenhagen-a/cloudflared.service create mode 100644 ansible/services/systemd/helsinki-a/caddy.service create mode 100644 ansible/services/systemd/helsinki-a/thiswebsitedoesnotexist.service create mode 100644 docs/README.md create mode 100644 docs/architecture.md create mode 100644 docs/getting-started.md create mode 100644 docs/hosts/copenhagen-a.md create mode 100644 docs/hosts/copenhagen-c.md create mode 100644 docs/hosts/helsinki-a.md create mode 100644 docs/hosts/london-a.md create mode 100644 docs/hosts/london-b.md create mode 100644 docs/hosts/nuremberg-a.md create mode 100644 docs/monitoring.md create mode 100644 docs/networking.md create mode 100644 docs/secrets.md create mode 100644 docs/services.md create mode 100644 terraform/.gitignore create mode 100644 terraform/Makefile create mode 100644 terraform/README.md create mode 100644 terraform/cloudflare_account.tf create mode 100644 terraform/cloudflare_dns.tf create mode 100644 terraform/providers.tf create mode 100644 terraform/secrets.enc.yaml create mode 
100644 terraform/vars.tf diff --git a/.github/workflows/deploy-on-merge.yml b/.github/workflows/deploy-on-merge.yml new file mode 100644 index 0000000..c759817 --- /dev/null +++ b/.github/workflows/deploy-on-merge.yml @@ -0,0 +1,56 @@ +name: Deploy (on merge) + +on: + push: + branches: + - main + paths-ignore: + - 'terraform/**' + +# Requires these repository secrets: +# TAILSCALE_AUTHKEY — Tailscale auth key for mesh access +# SSH_PRIVATE_KEY — SSH key authorized on target hosts +# AGE_SECRET_KEY — age private key for SOPS decryption + +jobs: + deploy: + name: Deploy to all + runs-on: ubuntu-latest + environment: production + steps: + - uses: actions/checkout@v4 + + - name: Set up Tailscale + uses: tailscale/github-action@v3 + with: + authkey: ${{ secrets.TAILSCALE_AUTHKEY }} + + - name: Set up SSH key + run: | + mkdir -p ~/.ssh + echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519 + chmod 600 ~/.ssh/id_ed25519 + ssh-keyscan -H 100.67.6.27 100.84.65.101 100.122.219.41 100.117.235.28 100.89.206.60 100.115.45.53 >> ~/.ssh/known_hosts 2>/dev/null || true + + - name: Install tools + run: | + pip install ansible + wget -qO /tmp/sops.deb https://github.com/getsops/sops/releases/download/v3.9.4/sops_3.9.4_amd64.deb + sudo dpkg -i /tmp/sops.deb + + - name: Decrypt secrets + env: + SOPS_AGE_KEY: ${{ secrets.AGE_SECRET_KEY }} + run: | + # Decrypt all .enc. files to their plaintext counterparts + find . 
-name '*.enc.yml' -o -name '*.enc.yaml' -o -name '*.enc.env' | while read f; do + out="${f/.enc/}" + sops -d "$f" > "$out" + echo "Decrypted: $f -> $out" + done + + - name: Run playbook + working-directory: ansible/ + env: + ANSIBLE_HOST_KEY_CHECKING: "false" + run: ansible-playbook deploy.yml diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml new file mode 100644 index 0000000..f35a07b --- /dev/null +++ b/.github/workflows/deploy.yml @@ -0,0 +1,82 @@ +name: Deploy (manual) + +on: + workflow_dispatch: + inputs: + target: + description: 'Target host (e.g. helsinki-a, london-b, all)' + required: true + type: string + playbook: + description: 'Ansible playbook to run (e.g. deploy.yml, update-all.yml)' + required: true + type: string + dry_run: + description: 'Dry run (--check mode)' + required: false + type: boolean + default: true + +# Requires these repository secrets: +# TAILSCALE_AUTHKEY — Tailscale auth key for mesh access +# SSH_PRIVATE_KEY — SSH key authorized on target hosts +# AGE_SECRET_KEY — age private key for SOPS decryption + +jobs: + deploy: + name: Deploy to ${{ inputs.target }} + runs-on: ubuntu-latest + environment: production # requires manual approval in repo settings + steps: + - uses: actions/checkout@v4 + + - name: Set up Tailscale + uses: tailscale/github-action@v3 + with: + authkey: ${{ secrets.TAILSCALE_AUTHKEY }} + + - name: Set up SSH key + run: | + mkdir -p ~/.ssh + echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519 + chmod 600 ~/.ssh/id_ed25519 + ssh-keyscan -H 100.67.6.27 100.84.65.101 100.122.219.41 100.117.235.28 100.89.206.60 100.115.45.53 >> ~/.ssh/known_hosts 2>/dev/null || true + + - name: Install tools + run: | + pip install ansible + wget -qO /tmp/sops.deb https://github.com/getsops/sops/releases/download/v3.9.4/sops_3.9.4_amd64.deb + sudo dpkg -i /tmp/sops.deb + + - name: Decrypt secrets + env: + SOPS_AGE_KEY: ${{ secrets.AGE_SECRET_KEY }} + run: | + # Decrypt all .enc.
files to their plaintext counterparts + find . -name '*.enc.yml' -o -name '*.enc.yaml' -o -name '*.enc.env' | while read f; do + out="${f/.enc/}" + sops -d "$f" > "$out" + echo "Decrypted: $f -> $out" + done + + - name: Run playbook + working-directory: ansible/ + env: + ANSIBLE_HOST_KEY_CHECKING: "false" + # Inputs go through env so they are never spliced into the script text + PLAYBOOK: ${{ inputs.playbook }} + TARGET: ${{ inputs.target }} + DRY_RUN: ${{ inputs.dry_run }} + run: | + # Normalize: strip prefix/suffix, then re-add as needed + PLAYBOOK="${PLAYBOOK#playbooks/}" + PLAYBOOK="${PLAYBOOK%.yml}.yml" + if [ "$PLAYBOOK" != "deploy.yml" ]; then + PLAYBOOK="playbooks/$PLAYBOOK" + fi + + # Array keeps the target a single --limit argument even if it contains spaces + ARGS=() + if [ "$TARGET" != "all" ]; then ARGS+=(--limit "$TARGET"); fi + if [ "$DRY_RUN" = "true" ]; then ARGS+=(--check --diff); fi + ansible-playbook "$PLAYBOOK" "${ARGS[@]}" diff --git a/.github/workflows/lint-ansible.yml b/.github/workflows/lint-ansible.yml new file mode 100644 index 0000000..46804d3 --- /dev/null +++ b/.github/workflows/lint-ansible.yml @@ -0,0 +1,34 @@ +name: Lint Ansible + +on: + push: + paths: + - 'ansible/**' + - '.github/workflows/lint-ansible.yml' + pull_request: + paths: + - 'ansible/**' + - '.github/workflows/lint-ansible.yml' + +jobs: + ansible-lint: + name: ansible-lint + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Check for Ansible files + id: check + run: | + if find ansible/ -name '*.yml' -o -name '*.yaml' | grep -q .; then + echo "has_files=true" >> "$GITHUB_OUTPUT" + else + echo "has_files=false" >> "$GITHUB_OUTPUT" + echo "No Ansible YAML files found — skipping."
+ fi + + - name: Run ansible-lint + if: steps.check.outputs.has_files == 'true' + uses: ansible/ansible-lint@v25 + with: + working_directory: ansible/ diff --git a/.github/workflows/lint-docker-compose.yml b/.github/workflows/lint-docker-compose.yml new file mode 100644 index 0000000..9e38115 --- /dev/null +++ b/.github/workflows/lint-docker-compose.yml @@ -0,0 +1,32 @@ +name: Lint Docker Compose + +on: + push: + paths: + - 'ansible/services/**' + - '.github/workflows/lint-docker-compose.yml' + pull_request: + paths: + - 'ansible/services/**' + - '.github/workflows/lint-docker-compose.yml' + +jobs: + compose-lint: + name: docker compose config + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Validate Compose files + run: | + found=0; failed=0 # failed flips to 1 when any file is rejected; the job fails after all files are checked + shopt -s globstar nullglob + for f in ansible/services/**/docker-compose.yml ansible/services/**/docker-compose.yaml ansible/services/**/compose.yml ansible/services/**/compose.yaml; do + echo "::group::Validating $f" + docker compose -f "$f" config --quiet 2>&1 || { echo "::error file=$f::docker compose config failed"; failed=1; } + echo "::endgroup::" + found=1 + done + if [ "$failed" -ne 0 ]; then exit 1; elif [ "$found" -eq 0 ]; then + echo "No Compose files found — skipping."
+ fi diff --git a/.github/workflows/terraform.yml b/.github/workflows/terraform.yml new file mode 100644 index 0000000..dfc3be1 --- /dev/null +++ b/.github/workflows/terraform.yml @@ -0,0 +1,113 @@ +name: Terraform + +on: + push: + branches: [main] + paths: + - 'terraform/**' + - '.github/workflows/terraform.yml' + pull_request: + paths: + - 'terraform/**' + - '.github/workflows/terraform.yml' + +# Requires these repository secrets: +# AGE_SECRET_KEY — age private key for SOPS decryption + +jobs: + plan: + name: Plan + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install OpenTofu + uses: opentofu/setup-opentofu@v1 + with: + tofu_version: latest + + - name: Install SOPS + run: | + wget -qO /tmp/sops.deb https://github.com/getsops/sops/releases/download/v3.9.4/sops_3.9.4_amd64.deb + sudo dpkg -i /tmp/sops.deb + + - name: Decrypt secrets + env: + SOPS_AGE_KEY: ${{ secrets.AGE_SECRET_KEY }} + run: | + find . -name '*.enc.yml' -o -name '*.enc.yaml' | while read f; do + out="${f/.enc/}" + sops -d "$f" > "$out" + echo "Decrypted: $f -> $out" + done + + - name: Set backend credentials + working-directory: terraform/ + run: | + echo "AWS_ACCESS_KEY_ID=$(yq '.backblaze_keyID' secrets.yaml)" >> "$GITHUB_ENV" + echo "AWS_SECRET_ACCESS_KEY=$(yq '.backblaze_applicationKey' secrets.yaml)" >> "$GITHUB_ENV" + + - name: tofu init + working-directory: terraform/ + run: tofu init + + - name: tofu plan + working-directory: terraform/ + run: tofu plan -out=tfplan + + - name: Upload plan + if: github.event_name == 'push' && github.ref == 'refs/heads/main' + uses: actions/upload-artifact@v4 + with: + name: tfplan + path: terraform/tfplan + retention-days: 1 + + apply: + name: Apply + needs: plan + if: github.event_name == 'push' && github.ref == 'refs/heads/main' + runs-on: ubuntu-latest + environment: production + steps: + - uses: actions/checkout@v4 + + - name: Install OpenTofu + uses: opentofu/setup-opentofu@v1 + with: + tofu_version: latest + + - name: 
Install SOPS + run: | + wget -qO /tmp/sops.deb https://github.com/getsops/sops/releases/download/v3.9.4/sops_3.9.4_amd64.deb + sudo dpkg -i /tmp/sops.deb + + - name: Decrypt secrets + env: + SOPS_AGE_KEY: ${{ secrets.AGE_SECRET_KEY }} + run: | + find . -name '*.enc.yml' -o -name '*.enc.yaml' | while read f; do + out="${f/.enc/}" + sops -d "$f" > "$out" + echo "Decrypted: $f -> $out" + done + + - name: Set backend credentials + working-directory: terraform/ + run: | + echo "AWS_ACCESS_KEY_ID=$(yq '.backblaze_keyID' secrets.yaml)" >> "$GITHUB_ENV" + echo "AWS_SECRET_ACCESS_KEY=$(yq '.backblaze_applicationKey' secrets.yaml)" >> "$GITHUB_ENV" + + - name: tofu init + working-directory: terraform/ + run: tofu init + + - name: Download plan + uses: actions/download-artifact@v4 + with: + name: tfplan + path: terraform/ + + - name: tofu apply + working-directory: terraform/ + run: tofu apply -auto-approve tfplan diff --git a/.github/workflows/validate-caddyfile.yml b/.github/workflows/validate-caddyfile.yml new file mode 100644 index 0000000..8d7fbba --- /dev/null +++ b/.github/workflows/validate-caddyfile.yml @@ -0,0 +1,35 @@ +name: Validate Caddyfile + +on: + push: + paths: + - 'ansible/services/caddy/**' + - '.github/workflows/validate-caddyfile.yml' + pull_request: + paths: + - 'ansible/services/caddy/**' + - '.github/workflows/validate-caddyfile.yml' + +jobs: + caddy-validate: + name: caddy validate + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Check for Caddyfile + id: check + run: | + if [ -f ansible/services/caddy/Caddyfile ]; then + echo "has_file=true" >> "$GITHUB_OUTPUT" + else + echo "has_file=false" >> "$GITHUB_OUTPUT" + echo "No Caddyfile found — skipping." 
+ fi + + - name: Validate Caddyfile + if: steps.check.outputs.has_file == 'true' + run: | + curl -sL "https://github.com/caddyserver/caddy/releases/latest/download/caddy_$(curl -sL https://api.github.com/repos/caddyserver/caddy/releases/latest | jq -r .tag_name | tr -d v)_linux_amd64.tar.gz" | tar xz caddy + chmod +x caddy + ./caddy validate --config ansible/services/caddy/Caddyfile --adapter caddyfile diff --git a/.github/workflows/validate-terraform.yml b/.github/workflows/validate-terraform.yml new file mode 100644 index 0000000..5dfb946 --- /dev/null +++ b/.github/workflows/validate-terraform.yml @@ -0,0 +1,54 @@ +name: Validate Terraform + +on: + push: + paths: + - 'terraform/**' + - '.github/workflows/validate-terraform.yml' + pull_request: + paths: + - 'terraform/**' + - '.github/workflows/validate-terraform.yml' + +jobs: + tofu-validate: + name: tofu validate + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install OpenTofu + uses: opentofu/setup-opentofu@v1 + with: + tofu_version: latest + + - name: Install SOPS + run: | + wget -qO /tmp/sops.deb https://github.com/getsops/sops/releases/download/v3.9.4/sops_3.9.4_amd64.deb + sudo dpkg -i /tmp/sops.deb + + - name: Decrypt secrets + env: + SOPS_AGE_KEY: ${{ secrets.AGE_SECRET_KEY }} + run: | + find . -name '*.enc.yml' -o -name '*.enc.yaml' | while read f; do + out="${f/.enc/}" + sops -d "$f" > "$out" + echo "Decrypted: $f -> $out" + done + + - name: Find and validate Terraform roots + run: | + found=0 + for dir in $(find terraform/ -name '*.tf' -printf '%h\n' | sort -u); do + echo "::group::Validating $dir" + cd "$dir" + tofu init -backend=false + tofu validate + cd "$GITHUB_WORKSPACE" + echo "::endgroup::" + found=1 + done + if [ "$found" -eq 0 ]; then + echo "No .tf files found — skipping validation." 
+ fi diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..af6f335 --- /dev/null +++ b/.gitignore @@ -0,0 +1,53 @@ +# Terraform +*.tfstate +*.tfstate.backup +*.tfstate.*.backup +.terraform/ +.terraform.lock.hcl +crash.log +override.tf +override.tf.json +*_override.tf +*_override.tf.json + +# Ansible +*.retry + +# Secrets — never commit plaintext secrets +*.key +*.pem +*.crt +*.p12 +*.pfx +.vault_pass +.vault-password +secrets.yml +secrets.yaml +vault.yml +vault.yaml +**/secret.env +**/.env.secret + +# SOPS (encrypted files are OK, but age keys are not) +keys.txt +*.agekey + +# Editor / OS +.vscode/ +.idea/ +*.swp +*.swo +*~ +.DS_Store +Thumbs.db + +# Python +__pycache__/ +*.pyc +.venv/ +venv/ + +# Misc +*.log +*.bak +*.tmp diff --git a/.sops.yaml b/.sops.yaml new file mode 100644 index 0000000..3a42ef5 --- /dev/null +++ b/.sops.yaml @@ -0,0 +1,3 @@ +creation_rules: + - path_regex: '\.enc\.(yml|yaml|env)$' + age: age1r8uh2w2qad2z5sgq9q7l73962q2sp8zz9hdnh6sjuvanxl565vmswn8squ diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..cdc8613 --- /dev/null +++ b/Makefile @@ -0,0 +1,9 @@ +.PHONY: decrypt + +# Decrypt all SOPS-encrypted files (*.enc.* -> *.*) +decrypt: + @find . -name "*.enc.*" ! -name "*.example" -not -path "./.git/*" | while read f; do \ + out=$$(echo "$$f" | sed 's/\.enc\././'); \ + echo "Decrypting $$f -> $$out"; \ + sops -d "$$f" > "$$out"; \ + done diff --git a/README.md b/README.md new file mode 100644 index 0000000..fbfa35e --- /dev/null +++ b/README.md @@ -0,0 +1,106 @@ +# pez-infra + +Infrastructure-as-code monorepo for Pez's homelab and cloud fleet. Everything needed to rebuild, configure, and maintain the server infrastructure from scratch. 
+ +## Architecture Overview + +``` + ┌─────────────┐ + │ Cloudflare │ + │ DNS + CDN │ + └──────┬──────┘ + │ + ┌──────▼──────┐ + │ helsinki-a │ Hetzner Cloud + │ Caddy proxy│ Reverse proxy + TLS + └──────┬──────┘ + │ + ┌────────────┼────────────┐ + │ Tailscale mesh │ + │ │ + ┌─────────▼──┐ ┌──────▼──────┐ ┌─▼───────────┐ + │ london-b │ │ london-a │ │ copenhagen-a │ + │ Storage │ │ Monitoring │ │ Gaming │ + │ Docker │ │ Prometheus │ │ Minecraft │ + │ services │ │ Grafana │ │ WoW (MaNGOS)│ + └────────────┘ └─────────────┘ └──────────────┘ + │ + ┌─────────▼──┐ ┌─────────────┐ + │ nuremberg-a│ │copenhagen-c │ + │ Mail │ │ (idle) │ + │ poste.io │ │ │ + └────────────┘ └─────────────┘ +``` + +### Hosts + +| Host | Location | OS | Tailscale IP | Role | +|------|----------|-----|-------------|------| +| helsinki-a | Hetzner Cloud | Linux | 100.67.6.27 | Reverse proxy (Caddy), main traffic gateway | +| london-b | London | Linux | 100.84.65.101 | Primary storage (ZFS), Docker services | +| london-a | London | FreeBSD | 100.122.219.41 | Monitoring (Prometheus, Grafana) | +| nuremberg-a | Hetzner Cloud | Alpine Linux | 100.117.235.28 | Mail server (poste.io) | +| copenhagen-a | Copenhagen | Linux | 100.89.206.60 | Gaming servers (Minecraft, WoW/MaNGOS) | +| copenhagen-c | Copenhagen | Linux | 100.115.45.53 | Idle/available | + +### Traffic Flow + +1. DNS managed by Cloudflare (Terraform) +2. Traffic routes to helsinki-a (Caddy reverse proxy) +3. Caddy forwards to backend services over Tailscale mesh +4. Auth handled by Authelia with LLDAP backend (on london-b) + +## Directory Structure + +``` +pez-infra/ +├── ansible/ # Ansible playbooks, roles, inventory, and all managed files +│ ├── roles/ # Ansible roles (caddy, docker, dotfiles, etc.) +│ ├── services/ # Docker Compose definitions and service configs +│ ├── dotfiles/ # Shell config (fish, nvim, tmux, git, etc.) +│ └── scripts/ # Utility and maintenance scripts +└── terraform/ # Terraform/OpenTofu for Cloudflare, DNS, etc. 
+``` + +## Getting Started + +### Prerequisites + +- SSH access to hosts via Tailscale +- `ansible` for configuration management +- `tofu` (OpenTofu) or `terraform` for infrastructure provisioning +- `gh` CLI for GitHub operations + +### Working with this repo + +1. **Clone:** `git clone git@github.com:RWejlgaard/pez-infra.git` +2. **Services:** Each service has its own directory under `ansible/services/` with a `docker-compose.yml` and config files +3. **Deploy:** Ansible playbooks in `ansible/` handle deployment (see individual playbook docs) +4. **Infrastructure:** Terraform configs in `terraform/` manage DNS, tunnels, and access policies + +### Secrets + +Secrets are encrypted in-repo using [SOPS](https://github.com/getsops/sops) + [age](https://github.com/FiloSottile/age). Encrypted files use `.enc.` in their extension (e.g. `secrets.enc.yaml`). See **[Secrets Management](docs/secrets.md)** for full setup and usage instructions. + +Quick start: `./ansible/scripts/sops-setup.sh` + +## Documentation + +Comprehensive documentation lives in [`docs/`](docs/): + +- **[Architecture](docs/architecture.md)** — Network topology, traffic flow, design principles +- **[Networking](docs/networking.md)** — Tailscale mesh, DNS flow, physical networking +- **[Services](docs/services.md)** — Complete service map with ports, auth, and deployment info +- **[Monitoring](docs/monitoring.md)** — Prometheus, Grafana, exporters, status page +- **[Getting Started](docs/getting-started.md)** — How to work with this repo + +## Consolidated Repos + +This monorepo replaces several standalone repos: + +- `pez-ansible` → `ansible/` +- `pez-terraform` → `terraform/` +- `pez-grafana` → `ansible/services/grafana/` +- `pez-proxy` → `ansible/services/caddy/` +- `pez-docs` → `docs/` and per-host documentation +- `server-scripts` → `ansible/scripts/` and `ansible/` diff --git a/ansible/Makefile b/ansible/Makefile new file mode 100644 index 0000000..d3a0048 --- /dev/null +++ b/ansible/Makefile @@ -0,0 +1,41 @@ +.PHONY:
deploy deploy-check deploy-host update-all update-linux update-freebsd docker-status reboot ping deps + +# Full fleet deploy +deploy: + ansible-playbook deploy.yml + +# Dry run (check + diff, no changes) +deploy-check: + ansible-playbook deploy.yml --check --diff + +# Deploy single host: make deploy-host HOST=helsinki-a +deploy-host: + ansible-playbook deploy.yml --limit $(HOST) + +# Update all packages across the fleet +update-all: + ansible-playbook playbooks/update-all.yml + +# Update Linux hosts only (apt + apk) +update-linux: + ansible-playbook playbooks/update-linux.yml + +# Update FreeBSD hosts only (pkg) +update-freebsd: + ansible-playbook playbooks/update-freebsd.yml + +# Show Docker container status +docker-status: + ansible-playbook playbooks/docker-status.yml + +# Reboot a specific host: make reboot HOST=copenhagen-c +reboot: + ansible-playbook playbooks/reboot.yml --limit $(HOST) + +# Ping all hosts +ping: + ansible all -m ping + +# Install Ansible Galaxy dependencies +deps: + ansible-galaxy install -r requirements.yml diff --git a/ansible/README.md b/ansible/README.md new file mode 100644 index 0000000..db63bec --- /dev/null +++ b/ansible/README.md @@ -0,0 +1,73 @@ +# Ansible — Deploy & Maintain + +One-command deploy playbook for rebuilding hosts from repo state. 
+ +## Quick Start + +```bash +cd ansible/ + +# Install dependencies +make deps + +# Dry run — see what would change +make deploy-check + +# Deploy everything +make deploy + +# Deploy a single host +make deploy-host HOST=helsinki-a +``` + +## Playbooks + +| Playbook | Purpose | Usage | +|----------|---------|-------| +| `deploy.yml` | Full host rebuild from repo | `make deploy` or `--limit <host>` | +| `playbooks/update-all.yml` | OS package updates (all hosts) | `make update-all` | +| `playbooks/update-linux.yml` | Linux-only updates (apt + apk) | `make update-linux` | +| `playbooks/update-freebsd.yml` | FreeBSD-only updates (pkg) | `make update-freebsd` | +| `playbooks/docker-status.yml` | Show running containers | `make docker-status` | +| `playbooks/reboot.yml` | Safe reboot with pre-flight | `make reboot HOST=<host>` | + +## Deploy Stages + +The deploy playbook runs in stages, each independently taggable: + +1. **common** — Baseline packages, SSH hardening, fish shell +2. **docker** — Docker engine on container hosts +3. **node_exporter** — Prometheus monitoring agent on all hosts +4. **services** — Per-host service deployment: + - `helsinki-a`: Caddy reverse proxy + - `london-b`: Docker Compose services (Nextcloud, Jellyseerr, etc.) + - `nuremberg-a`: poste.io mail + - `copenhagen-a`: Minecraft + MaNGOS systemd services + - `london-a`: Prometheus + Grafana (FreeBSD) +5. **verify** — Post-deploy health check + +Run a single stage: `ansible-playbook deploy.yml --tags docker` + +## Roles + +| Role | Description | +|------|-------------| +| `common` | Base packages, SSH hardening, fish shell | +| `docker` | Docker engine install and setup | +| `docker_services` | Deploy compose files from `services/` | +| `dotfiles` | Shell config from `dotfiles/` | +| `caddy` | Caddy reverse proxy (helsinki-a) | +| `node_exporter` | Prometheus node_exporter | +| `systemd_services` | Custom systemd units from `services/` | + +## Inventory + +Hosts are grouped by OS and role. 
All use Tailscale IPs, SSH as root. +Per-host variables in `inventory/host_vars/<hostname>.yml`. + +## Safety Notes + +- **london-b**: Reboot playbook requires interactive confirmation (critical storage) +- **copenhagen-a**: Reboot includes netplan pre-flight check (static IP verification) +- All playbooks use `ignore_unreachable: true` for fleet operations +- `--check --diff` is your friend — always dry-run first on production diff --git a/ansible/ansible.cfg b/ansible/ansible.cfg new file mode 100644 index 0000000..c4669fa --- /dev/null +++ b/ansible/ansible.cfg @@ -0,0 +1,12 @@ +[defaults] +inventory = inventory/hosts.ini +roles_path = roles +remote_user = root +host_key_checking = False +pipelining = True +gathering = implicit +retry_files_enabled = False +callback_result_format = yaml + +[ssh_connection] +ssh_args = -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null diff --git a/ansible/deploy.yml b/ansible/deploy.yml new file mode 100644 index 0000000..80b7bfe --- /dev/null +++ b/ansible/deploy.yml @@ -0,0 +1,199 @@ +--- +# deploy.yml — One-command host rebuild +# +# Rebuilds a host from bare metal to fully configured using repo state. +# Assumes: SSH access via Tailscale, root user, host is in inventory. 
+# +# Usage: +# Full fleet: ansible-playbook deploy.yml +# Single host: ansible-playbook deploy.yml --limit helsinki-a +# Dry run: ansible-playbook deploy.yml --check --diff +# +# Prerequisites: +# - Target host has SSH access via Tailscale +# - Target host has a base OS installed (Debian/Alpine/FreeBSD) +# - ansible-galaxy install -r requirements.yml + +# ────────────────────────────────────────────── +# Stage 1: Common baseline — all hosts +# ────────────────────────────────────────────── +- name: "Stage 1: Common baseline" + hosts: all + tags: [common, baseline] + roles: + - role: common + - role: dotfiles + +# ────────────────────────────────────────────── +# Stage 2: Docker engine — hosts that run containers +# ────────────────────────────────────────────── +- name: "Stage 2: Docker engine" + hosts: docker_hosts + tags: [docker] + roles: + - role: docker + +# ────────────────────────────────────────────── +# Stage 3: Monitoring agent — all hosts +# ────────────────────────────────────────────── +- name: "Stage 3: Node exporter" + hosts: all + tags: [monitoring, node_exporter] + roles: + - role: node_exporter + +# ────────────────────────────────────────────── +# Stage 4: Per-host services +# ────────────────────────────────────────────── + +# helsinki-a: Caddy reverse proxy +- name: "Stage 4a: Caddy (helsinki-a)" + hosts: helsinki-a + tags: [services, caddy] + roles: + - role: caddy + +# london-b: Docker services (storage, apps) +- name: "Stage 4b: Docker services (london-b)" + hosts: london-b + tags: [services, london-b] + roles: + - role: docker_services + +# nuremberg-a: Mail (poste.io via Docker) +- name: "Stage 4c: Mail (nuremberg-a)" + hosts: nuremberg-a + tags: [services, mail] + roles: + - role: docker_services + +# copenhagen-a: Gaming servers +- name: "Stage 4d: Gaming servers (copenhagen-a)" + hosts: copenhagen-a + tags: [services, gaming] + roles: + - role: docker_services + - role: systemd_services + +# london-a: Monitoring stack (FreeBSD — 
Prometheus, Grafana) +# Note: london-a uses FreeBSD; monitoring roles handle this via conditionals. +- name: "Stage 4e: Monitoring stack (london-a)" + hosts: london-a + tags: [services, monitoring] + tasks: + - name: Check for Prometheus config + delegate_to: localhost + ansible.builtin.stat: + path: "{{ playbook_dir }}/services/prometheus/prometheus.yml" + register: prometheus_config + + - name: Deploy Prometheus config + ansible.builtin.copy: + src: "{{ playbook_dir }}/services/prometheus/prometheus.yml" + dest: /usr/local/etc/prometheus.yml + mode: '0644' + backup: true + when: prometheus_config.stat.exists + notify: Restart prometheus + + - name: Deploy Prometheus alerting rules + ansible.builtin.copy: + src: "{{ playbook_dir }}/services/prometheus/rules/" + dest: /usr/local/etc/prometheus/rules/ + mode: '0644' + failed_when: false + notify: Restart prometheus + + - name: Ensure unified_alerting section exists in Grafana config + ansible.builtin.lineinfile: + path: /usr/local/etc/grafana/grafana.ini + regexp: '^\[unified_alerting\]' + line: '[unified_alerting]' + notify: Restart grafana + + - name: Allow provenance status change in Grafana + ansible.builtin.lineinfile: + path: /usr/local/etc/grafana/grafana.ini + regexp: '^allow_prov_status_change' + insertafter: '^\[unified_alerting\]' + line: 'allow_prov_status_change = true' + notify: Restart grafana + + - name: Deploy Grafana dashboards + ansible.posix.synchronize: + src: "{{ playbook_dir }}/services/grafana/dashboards/" + dest: /usr/local/etc/grafana/dashboards/ + failed_when: false + + - name: Ensure provisioning dir exists + ansible.builtin.file: + path: "{{ grafana_provisioning_dir }}" + state: directory + mode: '0755' + + - name: Ensure alerting dir exists + ansible.builtin.file: + path: "{{ grafana_provisioning_dir }}/alerting" + state: directory + mode: '0755' + + - name: Deploy Grafana provisioning + ansible.posix.synchronize: + src: "{{ playbook_dir }}/services/grafana/provisioning/" + dest: "{{ 
grafana_provisioning_dir }}/" + # contact-points.yml is rendered by the template task below; excluding it keeps rsync from overwriting the rendered file with the raw template on every run + rsync_opts: + - "--exclude=/alerting/contact-points.yml" + failed_when: false + + - name: Template contact points with PagerDuty key + ansible.builtin.template: + src: "{{ playbook_dir }}/services/grafana/provisioning/alerting/contact-points.yml" + dest: "{{ grafana_provisioning_dir }}/alerting/contact-points.yml" + mode: '0640' + owner: root + group: grafana + no_log: true + notify: Restart grafana + + handlers: + - name: Restart prometheus + ansible.builtin.service: + name: prometheus + state: restarted + + - name: Restart grafana + ansible.builtin.service: + name: grafana + state: restarted + +# ────────────────────────────────────────────── +# Stage 5: Verification +# ────────────────────────────────────────────── +- name: "Stage 5: Post-deploy verification" + hosts: all + tags: [verify] + tasks: + - name: Check SSH is working + ansible.builtin.ping: + + - name: Gather uptime + ansible.builtin.command: uptime + check_mode: false # read-only probe; run it under --check so the report is populated + changed_when: false + register: uptime_result + + - name: Check Docker containers (where applicable) + ansible.builtin.command: docker ps --format "table {{ '{{' }}.Names{{ '}}' }}\t{{ '{{' }}.Status{{ '}}' }}" + check_mode: false # read-only probe; run it under --check so the report is populated + changed_when: false + register: docker_status + when: "'docker_hosts' in group_names" + failed_when: false + + - name: Report host status + ansible.builtin.debug: + msg: | + Host: {{ inventory_hostname }} ({{ host_description | default('no description') }}) + Uptime: {{ uptime_result.stdout }} + Docker: {{ docker_status.stdout_lines | default(['N/A']) | join('\n') }} diff --git a/ansible/dotfiles/Dockerfile-alpine b/ansible/dotfiles/Dockerfile-alpine new file mode 100644 index 
0000000..321d742 --- /dev/null +++ b/ansible/dotfiles/Dockerfile-alpine @@ -0,0 +1,9 @@ +FROM alpine:latest + +COPY . 
/workspace + +WORKDIR /workspace + +RUN apk add make shadow bash sudo + +RUN make \ No newline at end of file diff --git a/ansible/dotfiles/Dockerfile-archlinux b/ansible/dotfiles/Dockerfile-archlinux new file mode 100644 index 0000000..d752835 --- /dev/null +++ b/ansible/dotfiles/Dockerfile-archlinux @@ -0,0 +1,10 @@ +FROM archlinux:latest + +COPY ../. /workspace + +WORKDIR /workspace + +RUN pacman -Syy +RUN pacman -S --noconfirm make sudo git which + +RUN make \ No newline at end of file diff --git a/ansible/dotfiles/Dockerfile-fedora b/ansible/dotfiles/Dockerfile-fedora new file mode 100644 index 0000000..cdbfafe --- /dev/null +++ b/ansible/dotfiles/Dockerfile-fedora @@ -0,0 +1,10 @@ +FROM fedora:latest + +COPY ../. /workspace + +WORKDIR /workspace + +RUN dnf -y update +RUN dnf -y install make sudo + +RUN make \ No newline at end of file diff --git a/ansible/dotfiles/Dockerfile-ubuntu b/ansible/dotfiles/Dockerfile-ubuntu new file mode 100644 index 0000000..75c5885 --- /dev/null +++ b/ansible/dotfiles/Dockerfile-ubuntu @@ -0,0 +1,9 @@ +FROM ubuntu:latest + +COPY ../. 
/workspace + +WORKDIR /workspace + +RUN apt install --update -y make sudo + +RUN make \ No newline at end of file diff --git a/ansible/dotfiles/Makefile b/ansible/dotfiles/Makefile new file mode 100644 index 0000000..0954bc9 --- /dev/null +++ b/ansible/dotfiles/Makefile @@ -0,0 +1,15 @@ +full-install: + @bash ./install-scripts/01-install-packages.sh + @bash ./install-scripts/02-move-files.sh + @fish ./install-scripts/03-fisher-install.fish + @fish ./install-scripts/04-fish-plugins.fish + @fish ./install-scripts/05-tmux-plugins.fish + @fish ./install-scripts/06-vim-setup.fish + @bash ./install-scripts/07-last-touches.sh + @exec fish + +refresh: + @bash ./install-scripts/02-move-files.sh + @exec fish + +.PHONY: full-install refresh diff --git a/ansible/dotfiles/README.md b/ansible/dotfiles/README.md new file mode 100644 index 0000000..dbeaaca --- /dev/null +++ b/ansible/dotfiles/README.md @@ -0,0 +1,70 @@ +# Dotfiles + +Shell configuration, editor setup, and terminal config — consolidated from the standalone [dotfiles](https://github.com/RWejlgaard/dotfiles) repo. + +## What's here + +``` +dotfiles/ +├── config/ +│ ├── fish/ # Fish shell config +│ │ ├── config.fish # Main config (greeting, editor, TERM) +│ │ └── conf.d/ # Auto-sourced by fish +│ │ ├── aliases.fish # OS-aware package manager aliases, k8s shortcuts +│ │ ├── envvars.fish # PATH and env vars +│ │ └── functions.fish # !! 
expansion, cheat, gitissue +│ ├── tmux/ +│ │ └── tmux.conf # Prefix C-a, Alt keybindings, mouse, TPM plugins +│ ├── nvim/ +│ │ └── init.lua # Lazy.nvim, LSP (Mason), Copilot, Neo-tree, Treesitter +│ ├── kitty/ +│ │ └── kitty.conf # Color scheme, TERM fix +│ └── git/ +│ └── gitconfig # user.name/email, gh credential helper +├── install-scripts/ # Numbered install scripts (from upstream dotfiles repo) +│ ├── 01-install-packages.sh # OS-aware package install +│ ├── 02-move-files.sh # Legacy copy-based deploy (use install.sh instead) +│ ├── 03-fisher-install.fish # Fisher plugin manager +│ ├── 04-fish-plugins.fish # Tide prompt +│ ├── 05-tmux-plugins.fish # TPM + plugins +│ ├── 06-vim-setup.fish # Lazy.nvim bootstrap +│ └── 07-last-touches.sh # Set fish as default shell, ~/bin +├── scripts/ # Utility scripts (Gentoo kernel upgrade helpers) +├── install.sh # Main install: symlinks + packages + plugins +└── Makefile # Legacy `make` target (calls install-scripts directly) +``` + +## Quick start + +### Symlinks only (no package install) + +```bash +./install.sh --link +``` + +This creates symlinks from the config files in this directory to their expected locations (`~/.config/fish/`, `~/.tmux.conf`, etc.). Existing files are backed up to `~/.dotfiles-backup//`. + +### Full install (packages + plugins + shell change) + +```bash +./install.sh +``` + +Runs package installation (OS-aware), symlinks configs, installs Fish/Tmux/Neovim plugins, and sets Fish as the default shell. + +## Fleet notes + +Most servers run Fish as root shell. 
Current state captured from live fleet (2026-03-22): + +| Host | Shell | Git configured | Dotfiles deployed | +|------|-------|----------------|-------------------| +| helsinki-a | fish | Yes (pez@pez.sh) | Yes (full) | +| london-b | fish | Yes (pez@pez.sh) | Partial (fish default, tmux custom) | +| nuremberg-a | fish | No | No | +| london-a | sh (FreeBSD) | No | No | +| copenhagen-a | fish | No (SpigotMC default) | No | +| copenhagen-c | fish | No | No | + +## Relationship to upstream + +This is a copy of [RWejlgaard/dotfiles](https://github.com/RWejlgaard/dotfiles) consolidated into the monorepo. The upstream repo can be archived once this is verified working. Key difference: `install.sh` here uses **symlinks** instead of copies, so editing configs in the repo takes effect immediately. diff --git a/ansible/dotfiles/config/fish/conf.d/aliases.fish b/ansible/dotfiles/config/fish/conf.d/aliases.fish new file mode 100644 index 0000000..1416482 --- /dev/null +++ b/ansible/dotfiles/config/fish/conf.d/aliases.fish @@ -0,0 +1,42 @@ +# aliases + +# Package manager aliases (first matching OS wins; uname must be run via command substitution) +if [ (uname) = "Darwin" ] + alias get="brew install" + alias search="brew search" +else if [ -f /etc/arch-release ] + alias get="sudo pacman -S" + alias search="pacman -Ss" +else if [ -f /etc/debian_version ] + alias get="sudo apt install" + alias search="apt search" +else if [ -f /etc/alpine-release ] + alias get="apk add" + alias search="apk search" +else if [ (uname) = "FreeBSD" ] + alias get="sudo pkg install -y" + alias search="pkg search" +else if [ -f /etc/gentoo-release ] + alias get="sudo emerge" + alias search="emerge --search" +end + +alias vim=nvim +alias cat="bat -Pp" + +alias k="kubectl" +alias kp="kubectl get pods -A" +alias kc="kubectx" + +# Gentoo +alias gentoo-check-update="sudo emerge --sync; and sudo emerge -avuDNp @world | genlop -p" +alias gentoo-upgrade="sudo emerge -avuDN @world" + +function gentoo-package-use + sudo 
vim /etc/portage/package.use/$argv +end + +# Volume control (pipewire) 
+function vol + wpctl set-volume @DEFAULT_SINK@ $argv% 2>&1 > /dev/null +end \ No newline at end of file diff --git a/ansible/dotfiles/config/fish/conf.d/envvars.fish b/ansible/dotfiles/config/fish/conf.d/envvars.fish new file mode 100644 index 0000000..6551553 --- /dev/null +++ b/ansible/dotfiles/config/fish/conf.d/envvars.fish @@ -0,0 +1,7 @@ +# Environment variables for fish shell + +# PATH additions +export PATH="$HOME/bin:$PATH" +export PATH="$HOME/.local/bin:$PATH" + +# Other environment variables goes here diff --git a/ansible/dotfiles/config/fish/conf.d/functions.fish b/ansible/dotfiles/config/fish/conf.d/functions.fish new file mode 100644 index 0000000..41626ba --- /dev/null +++ b/ansible/dotfiles/config/fish/conf.d/functions.fish @@ -0,0 +1,20 @@ +# Replicate the behavior of `!!` in bash +function last_history_item + echo $history[1] +end +abbr -a !! --position anywhere --function last_history_item + +# lookup various commands/syntax in a pinch +function cheat --description "help " + set args (echo $argv[2..-1] | tr ' ' '+') + curl "cht.sh/$argv[1]/$args" +end + +# update master and create a branch with value: $1 +function gitissue + git reset --hard + git checkout master + git pull origin master + git branch $argv[1] + git checkout $argv[1] +end diff --git a/ansible/dotfiles/config/fish/config.fish b/ansible/dotfiles/config/fish/config.fish new file mode 100644 index 0000000..fcd35f1 --- /dev/null +++ b/ansible/dotfiles/config/fish/config.fish @@ -0,0 +1,3 @@ +export fish_greeting="" # Silence welcome message +export EDITOR=nvim +export TERM=xterm \ No newline at end of file diff --git a/ansible/dotfiles/config/git/gitconfig b/ansible/dotfiles/config/git/gitconfig new file mode 100644 index 0000000..d8afbfa --- /dev/null +++ b/ansible/dotfiles/config/git/gitconfig @@ -0,0 +1,17 @@ +[user] + name = Rasmus Wejlgaard + email = pez@pez.sh + +[credential "https://github.com"] + helper = + helper = !/usr/bin/gh auth git-credential + +[credential 
"https://gist.github.com"] + helper = + helper = !/usr/bin/gh auth git-credential + +[init] + defaultBranch = main + +[pull] + rebase = false diff --git a/ansible/dotfiles/config/kitty/kitty.conf b/ansible/dotfiles/config/kitty/kitty.conf new file mode 100644 index 0000000..135d820 --- /dev/null +++ b/ansible/dotfiles/config/kitty/kitty.conf @@ -0,0 +1,22 @@ +# Kitty config file + +# Preferred color scheme +color0 #000000 +color8 #555555 +color1 #ff0000 +color9 #ff0000 +color2 #00ff00 +color10 #00ff00 +color3 #ffff00 +color11 #ffff00 +color4 #5555ff +color12 #5555ff +color5 #ff00ff +color13 #ff00ff +color6 #00ffff +color14 #00ffff +color7 #ffffff +color15 #ffffff + +# For some reason kitty likes using "xterm-kitty" as TERM (this breaks a lot of stuff) so let's set this to xterm +term xterm \ No newline at end of file diff --git a/ansible/dotfiles/config/nvim/init.lua b/ansible/dotfiles/config/nvim/init.lua new file mode 100644 index 0000000..cc1110e --- /dev/null +++ b/ansible/dotfiles/config/nvim/init.lua @@ -0,0 +1,281 @@ +-- Bootstrap packer, if it's not installed (first run) +local fn = vim.fn +local install_path = fn.stdpath('data') .. '/site/pack/packer/start/packer.nvim' +if fn.empty(fn.glob(install_path)) > 0 then + Packer_bootstrap = fn.system({ 'git', 'clone', '--depth', '1', 'https://github.com/wbthomason/packer.nvim', + install_path }) +end + +local lazypath = vim.fn.stdpath 'data' .. '/lazy/lazy.nvim' +if not (vim.uv or vim.loop).fs_stat(lazypath) then + local lazyrepo = 'https://github.com/folke/lazy.nvim.git' + local out = vim.fn.system { 'git', 'clone', '--filter=blob:none', '--branch=stable', lazyrepo, lazypath } + if vim.v.shell_error ~= 0 then + error('Error cloning lazy.nvim:\n' .. 
out) + end +end + +---@type vim.Option +local rtp = vim.opt.rtp +rtp:prepend(lazypath) + +require('lazy').setup({ + { 'airblade/vim-gitgutter' }, -- show git changes in the gutter + { 'hashivim/vim-terraform' }, -- terraform syntax highlighting + { + 'junegunn/fzf', + build = ':call fzf#install()' -- lazy.nvim build hook (packer's `run` key is not a lazy.nvim field) + }, + {'junegunn/fzf.vim'}, + { 'EdenEast/nightfox.nvim' }, -- nightfox theme + { 'nvim-treesitter/nvim-treesitter' }, -- treesitter, makes syntax highlighting better + { 'scrooloose/nerdcommenter' }, -- easy commenting + { 'tpope/vim-fugitive' }, -- git integration with :G{git cmd} + { 'itchyny/lightline.vim' }, -- statusline + { 'wookayin/fzf-ripgrep.vim' }, -- fzf ripgrep integration, for "/" + { 'yuki-yano/fzf-preview.vim' }, -- fzf preview + { 'wbthomason/packer.nvim' }, -- package manager + { 'fatih/vim-go' }, -- go syntax highlighting + { "ellisonleao/glow.nvim" }, -- markdown preview using :Glow + { 'rhysd/git-messenger.vim' }, -- Show git messages under cursor + { 'onsails/lspkind.nvim' }, -- lsp kind, makes autocomplete look better + { 'zbirenbaum/copilot.lua' }, -- copilot + { 'hrsh7th/vim-vsnip' }, + { -- copilot addon for cmp + "zbirenbaum/copilot-cmp", + dependencies = { "zbirenbaum/copilot.lua" }, -- load order is expressed via dependencies in lazy.nvim (packer's `after` is not a lazy.nvim field) + config = function() + require("copilot_cmp").setup() + end + }, + { 'nvim-lua/plenary.nvim' }, -- lua utility functions + { 'CopilotC-Nvim/CopilotChat.nvim' }, -- copilot chat + { -- adds file bars along the top similar to vscode + 'romgrk/barbar.nvim', + dependencies = { 'kyazdani42/nvim-web-devicons' } + }, + { -- adds a file explorer similar to vscode + "nvim-neo-tree/neo-tree.nvim", + branch = "v3.x", + dependencies = { + "nvim-lua/plenary.nvim", + "nvim-tree/nvim-web-devicons", -- not strictly required, but recommended + "MunifTanjim/nui.nvim", + } + }, + { -- adds diagnostics for 
files + "folke/trouble.nvim", + dependencies = "kyazdani42/nvim-web-devicons", + config = function() + require("trouble").setup {} + end + }, + { -- better terminal + 's1n7ax/nvim-terminal', + config = 
function() + vim.o.hidden = true + require('nvim-terminal').setup() + end + }, + { -- mason, easy download and install of LSPs + "williamboman/mason.nvim", + "williamboman/mason-lspconfig.nvim", + "neovim/nvim-lspconfig" + }, + { -- LSP + 'VonHeikemen/lsp-zero.nvim', + dependencies = { -- LSP Support + { 'neovim/nvim-lspconfig' }, { 'williamboman/nvim-lsp-installer' }, -- Autocompletion + { 'hrsh7th/nvim-cmp' }, { 'hrsh7th/cmp-buffer' }, { 'hrsh7th/cmp-path' }, { 'saadparwaiz1/cmp_luasnip' }, + { 'hrsh7th/cmp-nvim-lsp' }, { 'hrsh7th/cmp-nvim-lua' }, -- Snippets + { 'L3MON4D3/LuaSnip' }, { 'rafamadriz/friendly-snippets' } } + }, +}) + +-- settings +vim.opt.termguicolors = true +vim.opt.guifont = 'FiraCode Nerd Font:h12' +vim.opt.number = true +vim.opt.smarttab = true +vim.opt.tabstop = 8 +vim.opt.softtabstop = 0 +vim.opt.expandtab = true +vim.opt.shiftwidth = 4 +vim.opt.backspace = '2' +vim.opt.laststatus = 2 +vim.opt.mouse = 'a' +vim.opt.clipboard = 'unnamed' +vim.opt.scrolloff = 17 +vim.cmd('set tabstop=8 softtabstop=0 expandtab shiftwidth=4 smarttab') +vim.opt.number = true +vim.opt.colorcolumn = '80' +vim.g.terraform_fmt_on_save = true + +-- keybindings +local opts = { + noremap = true, + silent = true +} +vim.api.nvim_set_keymap('n', 'nt', ':Neotree toggle', opts) +vim.api.nvim_set_keymap('n', 'qqq', ':qall', opts) +vim.api.nvim_set_keymap('n', '', ':Files', opts) +vim.api.nvim_set_keymap('n', '/', ':Rg', opts) + +vim.api.nvim_set_keymap('n', 'v', ':vsplit', opts) +vim.api.nvim_set_keymap('n', 'h', ':split', opts) +vim.api.nvim_set_keymap('n', 'b', ':Buffers', opts) +vim.api.nvim_set_keymap('t', '', '', opts) +vim.api.nvim_set_keymap('n', 'd', ':Trouble diagnostics toggle', opts) +vim.api.nvim_set_keymap('n', 'g', ':GitMessenger', opts) +vim.api.nvim_set_keymap('n', 'd', '"_d', opts) +vim.api.nvim_set_keymap('v', 'd', '"_d', opts) +vim.api.nvim_set_keymap('n', 'c', '"_c', opts) +vim.api.nvim_set_keymap('v', 'c', '"_c', opts) + +-- plugins setup 
+require("CopilotChat").setup {} + +require("copilot").setup({ + suggestion = { + enabled = false + }, + panel = { + enabled = false + } +}) + +require("nvim-lsp-installer").setup {} + +require("neo-tree").setup { + close_on_open = false, + close_if_last_window = true, + window = { + width = 40, + side = "left", + auto_resize = true, + mappings = { + ["o"] = "open" + } + }, + filesystem = { + hijack_netrw_behavior = "open_current" + } +} + +-- Open NeoTree on startup when no file is specified +--vim.api.nvim_create_augroup('NeoTreeOnStartup', { clear = true }) +--vim.api.nvim_create_autocmd('VimEnter', { + --group = 'NeoTreeOnStartup', + --callback = function() + --if vim.fn.argc() == 0 then + --vim.cmd('Neotree toggle') + --end + --end +--}) + +-- Language Server +local lsp_zero = require('lsp-zero') +local cmp = require('cmp') + +lsp_zero.on_attach(function(_, bufnr) + lsp_zero.default_keymaps({ + buffer = bufnr + }) +end) + +require('mason').setup({}) +require('mason-lspconfig').setup({ + ensure_installed = { + 'bashls', + 'dockerls', + 'gopls', + 'jsonls', + 'yamlls', + 'pyright', + }, + handlers = { + lsp_zero.default_setup, + lua_ls = function() + local lua_opts = lsp_zero.nvim_lua_ls() + require('lspconfig').lua_ls.setup(lua_opts) + end + } +}) + +local lspkind = require('lspkind') +lspkind.init({ + symbol_map = { + Copilot = "" + } +}) +vim.api.nvim_set_hl(0, "CmpItemKindCopilot", { -- + fg = "#6CC644" +}) + +cmp.setup({ + formatting = { + format = lspkind.cmp_format({ + mode = 'symbol', -- show only symbol annotations + maxwidth = 70, -- prevent the popup from showing more than provided characters (e.g 50 will not show more than 50 characters) + -- can also be a function to dynamically calculate max width such as + -- maxwidth = function() return math.floor(0.45 * vim.o.columns) end, + ellipsis_char = '...', -- when popup menu exceed maxwidth, the truncated part would show ellipsis_char instead (must define maxwidth first) + show_labelDetails = true, -- 
show labelDetails in menu. Disabled by default + + -- The function below will be called before any actual modifications from lspkind + -- so that you can provide more controls on popup customization. (See [#30](https://github.com/onsails/lspkind-nvim/pull/30)) + before = function(_, vim_item) + -- do some customizations ... + return vim_item + end + }) + }, + snippet = { + -- REQUIRED - you must specify a snippet engine + expand = function(args) + vim.fn["vsnip#anonymous"](args.body) -- For `vsnip` users. + -- require('luasnip').lsp_expand(args.body) -- For `luasnip` users. + -- require('snippy').expand_snippet(args.body) -- For `snippy` users. + -- vim.fn["UltiSnips#Anon"](args.body) -- For `ultisnips` users. + -- vim.snippet.expand(args.body) -- For native neovim snippets (Neovim v0.10+) + end + }, + window = { + -- completion = cmp.config.window.bordered(), + -- documentation = cmp.config.window.bordered(), + }, + mapping = cmp.mapping.preset.insert({ + ['<C-b>'] = cmp.mapping.scroll_docs(-4), + ['<C-f>'] = cmp.mapping.scroll_docs(4), + ['<C-Space>'] = cmp.mapping.complete(), + ['<C-e>'] = cmp.mapping.abort(), + ['<CR>'] = cmp.mapping.confirm({ + select = true + }) -- Accept currently selected item. Set `select` to `false` to only confirm explicitly selected items. + }), + sources = cmp.config.sources({ { + name = 'copilot' + }, { + name = 'nvim_lsp' + }, { + name = 'nvim_lua' + }, { + name = 'path' + } -- For vsnip users. + -- { name = 'luasnip' }, -- For luasnip users. + -- { name = 'ultisnips' }, -- For ultisnips users. + -- { name = 'snippy' }, -- For snippy users. 
+ }, { { + name = 'buffer' + } }) +}) + +-- enable diagnostics for showing in-line +vim.g.diagnostics_active = true +vim.diagnostic.config { + virtual_text = true, + signs = true, + underline = true +} + +vim.cmd('colorscheme murphy') diff --git a/ansible/dotfiles/config/tmux/tmux.conf b/ansible/dotfiles/config/tmux/tmux.conf new file mode 100644 index 0000000..a0f6e56 --- /dev/null +++ b/ansible/dotfiles/config/tmux/tmux.conf @@ -0,0 +1,88 @@ +set -g prefix C-a +unbind-key C-b +bind-key C-a send-prefix + +bind -n M-_ split-window -h -c "#{pane_current_path}" +bind -n M-- split-window -v -c "#{pane_current_path}" +unbind '"' +unbind % + +bind -n M-Left select-pane -L +bind -n M-Right select-pane -R +bind -n M-Up select-pane -U +bind -n M-Down select-pane -D + +bind -n M-[ previous-window +bind -n M-] next-window + +bind -n M-\{ swap-pane -U +bind -n M-\} swap-pane -D + +set -g mouse on + +set-option -g allow-rename off + +set -g base-index 1 +setw -g pane-base-index 1 + +bind-key -n M-S-Up resize-pane -U 5 +bind-key -n M-S-Down resize-pane -D 5 +bind-key -n M-S-Left resize-pane -L 5 +bind-key -n M-S-Right resize-pane -R 5 + +bind -n M-q kill-pane +bind -n M-S-q kill-window + +bind -n M-1 select-window -t 1 +bind -n M-2 select-window -t 2 +bind -n M-3 select-window -t 3 +bind -n M-4 select-window -t 4 +bind -n M-5 select-window -t 5 +bind -n M-6 select-window -t 6 +bind -n M-7 select-window -t 7 +bind -n M-8 select-window -t 8 +bind -n M-9 select-window -t 9 + +bind -n M-Enter new-window + +setw -g monitor-activity on +set -g visual-activity on + +set -sg escape-time 0 + +# Plugins +set -g @plugin 'tmux-plugins/tpm' +set -g @plugin 'nhdaly/tmux-better-mouse-mode' +set -g @plugin 'tmux-plugins/tmux-cpu' + +# pane borders +set -g pane-border-style 'fg=colour8' +set -g pane-active-border-style 'fg=colour7' + +# statusbar +set -g status-position bottom # status bar at the bottom +set -g status-justify centre # center window buttons +set -g status-style 'fg=colour7' +set 
-g status-left ' #H ' # Hostname on the left +set -g @cpu_percentage_format "%3.0f%%" +set -g @ram_percentage_format "%3.0f%%" + +# Set CPU and RAM to be shown in the status bar +set -g status-right '#[fg=colour7 bg=colour234] CPU: #(~/.tmux/plugins/tmux-cpu/scripts/cpu_percentage.sh)% | RAM: #(~/.tmux/plugins/tmux-cpu/scripts/ram_percentage.sh) | %H:%M:%S ' + +# Tmux sections needs a preset length +set -g status-right-length 50 +set -g status-left-length 30 + +# window tabs +setw -g window-status-current-style 'fg=colour0 bg=colour7' +setw -g window-status-current-format ' #I #W ' # Window ID, Window Name + +setw -g window-status-style 'fg=colour8' +setw -g window-status-format ' #I #W ' # Window ID, Window Name + +setw -g window-status-bell-style 'fg=colour234 bg=colour0 bold' + +# plugins runs +set-option -g status-interval 5 +run -b '~/.tmux/plugins/tpm/tpm' diff --git a/ansible/dotfiles/install-scripts/01-install-packages.sh b/ansible/dotfiles/install-scripts/01-install-packages.sh new file mode 100755 index 0000000..8911b90 --- /dev/null +++ b/ansible/dotfiles/install-scripts/01-install-packages.sh @@ -0,0 +1,73 @@ +#!/bin/bash +set -e # exit on error + +PACKAGES=( + "tmux" + "neovim" + "git" + "fish" + "curl" + "bat" + "go" + "ripgrep" +) + +# if MacOS install Homebrew +if [ "$(uname)" == "Darwin" ]; then + if [ ! -x "$(which brew)" ]; then + /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" + fi + # install packages + brew install "${PACKAGES[@]}" +fi + +# if Arch install +if [ -f /etc/arch-release ]; then + # install yay + if [ -z "$(which yay)" ] && [ "$EUID" -ne 0 ]; then + sudo pacman -S --noconfirm base-devel + git clone https://aur.archlinux.org/yay.git + cd yay + makepkg -si --noconfirm + cd .. 
+ rm -rf yay + fi + + # install packages + sudo pacman -S --noconfirm "${PACKAGES[@]}" +fi + +# if debian or ubuntu install +if [ -f /etc/debian_version ]; then + # replace "go" with "golang" for debian + PACKAGES=("${PACKAGES[@]/go/golang}") + + # install packages + export DEBIAN_FRONTEND=noninteractive + sudo apt update + sudo apt install -y "${PACKAGES[@]}" +fi + +# if Alpine install +if [ -f /etc/alpine-release ]; then + # install packages + sudo apk add "${PACKAGES[@]}" +fi + +# if freebsd install +if [ "$(uname)" == "FreeBSD" ]; then + # install packages + sudo pkg install -y "${PACKAGES[@]}" +fi + +# if RHEL/CentOS/Fedora install +if [ -f /etc/redhat-release ]; then + # install packages + sudo dnf install -y "${PACKAGES[@]}" +fi + +# if Gentoo +if [ -f /etc/gentoo-release ]; then + PACKAGES=("${PACKAGES[@]/git/dev-vcs\/git}") + sudo emerge "${PACKAGES[@]}" +fi \ No newline at end of file diff --git a/ansible/dotfiles/install-scripts/02-move-files.sh b/ansible/dotfiles/install-scripts/02-move-files.sh new file mode 100755 index 0000000..f58366a --- /dev/null +++ b/ansible/dotfiles/install-scripts/02-move-files.sh @@ -0,0 +1,28 @@ +#!/bin/bash +set -e # exit on error + +# create directories +mkdir -p ~/.config/nvim +mkdir -p ~/.config/fish +mkdir -p ~/.config/fish/conf.d +mkdir -p ~/.config/kitty + +# vim (source lives in config/nvim/) +cp config/nvim/init.lua ~/.config/nvim/ + +# fish (conf.d sources live in config/fish/conf.d/) +cp config/fish/config.fish ~/.config/fish/ +cp config/fish/conf.d/aliases.fish ~/.config/fish/conf.d/ +cp config/fish/conf.d/functions.fish ~/.config/fish/conf.d/ + +# Only copy envvars.fish if it doesn't exist. +# This way we can override it with our own configs. +if [ ! 
-f ~/.config/fish/conf.d/envvars.fish ]; then + cp config/fish/conf.d/envvars.fish ~/.config/fish/conf.d/ +fi + +# tmux +cp config/tmux/tmux.conf ~/.tmux.conf + +# kitty +cp config/kitty/kitty.conf ~/.config/kitty/ \ No newline at end of file diff --git a/ansible/dotfiles/install-scripts/03-fisher-install.fish b/ansible/dotfiles/install-scripts/03-fisher-install.fish new file mode 100755 index 0000000..4b30225 --- /dev/null +++ b/ansible/dotfiles/install-scripts/03-fisher-install.fish @@ -0,0 +1,4 @@ +#!/usr/bin/env fish + +# install fisher +curl -sL https://git.io/fisher | source && fisher install jorgebucaran/fisher \ No newline at end of file diff --git a/ansible/dotfiles/install-scripts/04-fish-plugins.fish b/ansible/dotfiles/install-scripts/04-fish-plugins.fish new file mode 100755 index 0000000..a6bd7a7 --- /dev/null +++ b/ansible/dotfiles/install-scripts/04-fish-plugins.fish @@ -0,0 +1,15 @@ +#!/usr/bin/env fish + +# tide +fisher install IlanCosman/tide + +# setup tide +tide configure \ + --auto \ + --style=Lean \ + --prompt_colors='True color' \ + --show_time='24-hour format' \ + --lean_prompt_height='One line' \ + --prompt_spacing=Compact \ + --icons='Few icons' \ + --transient=No diff --git a/ansible/dotfiles/install-scripts/05-tmux-plugins.fish b/ansible/dotfiles/install-scripts/05-tmux-plugins.fish new file mode 100755 index 0000000..8653c7e --- /dev/null +++ b/ansible/dotfiles/install-scripts/05-tmux-plugins.fish @@ -0,0 +1,9 @@ +#!/usr/bin/env fish + +# install tmux plugin manager if not installed +if not test -e ~/.tmux/plugins/tpm + git clone https://github.com/tmux-plugins/tpm ~/.tmux/plugins/tpm +end + +# install/update tmux plugins +env TMUX_PLUGIN_MANAGER_PATH=~/.tmux/plugins ~/.tmux/plugins/tpm/bin/install_plugins \ No newline at end of file diff --git a/ansible/dotfiles/install-scripts/06-vim-setup.fish b/ansible/dotfiles/install-scripts/06-vim-setup.fish new file mode 100755 index 0000000..d6b45ef --- /dev/null +++ 
b/ansible/dotfiles/install-scripts/06-vim-setup.fish
@@ -0,0 +1,4 @@
+#!/usr/bin/env fish
+
+# With Lazy we don't need any setup, leaving this file here for the future
+
diff --git a/ansible/dotfiles/install-scripts/07-last-touches.sh b/ansible/dotfiles/install-scripts/07-last-touches.sh
new file mode 100755
index 0000000..acb5ab0
--- /dev/null
+++ b/ansible/dotfiles/install-scripts/07-last-touches.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+# ensure fish is in shells (exact, literal line match rather than a regex substring match)
+if ! grep -qxF "$(command -v fish)" /etc/shells; then
+    command -v fish | sudo tee -a /etc/shells
+fi
+
+# change shell to fish (quoted so paths containing spaces do not word-split)
+if ! [ "$(basename "$SHELL")" == "fish" ]; then
+    chsh -s "$(command -v fish)"
+fi
+
+# create local bin directory
+mkdir -p ~/bin
+
+# Gentoo specific kernel script (source path is resolved relative to this script, not the cwd)
+if [ -f /etc/gentoo-release ]; then
+    init_system=openrc
+    echo "Gentoo detected, need clarification on which init system is used."
+    echo -e "Which init system?\n\n1> openrc\n2> systemd"
+    read -r -p "[1]/2: " -n 1;
+    echo
+    if [ "$REPLY" == "2" ]; then
+        init_system=systemd
+    fi
+
+    sudo cp "$(dirname "$0")/../scripts/gentoo-kernel-upgrade-$init_system" /usr/bin/gentoo-kernel-upgrade
+
+fi
\ No newline at end of file
diff --git a/ansible/dotfiles/install.sh b/ansible/dotfiles/install.sh
new file mode 100755
index 0000000..4ad28d7
--- /dev/null
+++ b/ansible/dotfiles/install.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+#
+# install.sh — Symlink dotfiles to their expected locations.
+#
+# Usage:
+#   ./install.sh          Full install (packages + symlinks + plugins)
+#   ./install.sh --link   Symlinks only (no package install, no plugin setup)
+#
+# Safe to re-run: existing files are backed up to ~/.dotfiles-backup/
+#
+set -euo pipefail
+
+DOTFILES_DIR="$(cd "$(dirname "$0")" && pwd)"
+BACKUP_DIR="$HOME/.dotfiles-backup/$(date +%Y%m%d-%H%M%S)"
+LINK_ONLY=false
+
+if [[ "${1:-}" == "--link" ]]; then
+    LINK_ONLY=true
+fi
+
+# ── helpers ──────────────────────────────────────────────────────────
+
+backup_and_link() {
+    local src="$1"
+    local dst="$2"
+
+    # Create parent directory if needed
+    mkdir -p "$(dirname "$dst")"
+
+    # Back up anything at dst that isn't already the right symlink; a failed backup aborts (set -e)
+    if [ -e "$dst" ] || [ -L "$dst" ]; then
+        if [ -L "$dst" ] && [ "$(readlink "$dst")" = "$src" ]; then
+            return 0 # already correct
+        fi
+        mkdir -p "$BACKUP_DIR"
+        mv "$dst" "$BACKUP_DIR/"
+        echo " backed up: $dst → $BACKUP_DIR/"
+    fi
+
+    ln -sf "$src" "$dst"
+    echo " linked: $dst → $src"
+}
+
+# ── symlinks ─────────────────────────────────────────────────────────
+
+echo "Linking dotfiles..."
+
+# Fish shell
+backup_and_link "$DOTFILES_DIR/config/fish/config.fish" "$HOME/.config/fish/config.fish"
+backup_and_link "$DOTFILES_DIR/config/fish/conf.d/aliases.fish" "$HOME/.config/fish/conf.d/aliases.fish"
+backup_and_link "$DOTFILES_DIR/config/fish/conf.d/envvars.fish" "$HOME/.config/fish/conf.d/envvars.fish"
+backup_and_link "$DOTFILES_DIR/config/fish/conf.d/functions.fish" "$HOME/.config/fish/conf.d/functions.fish"
+
+# Tmux
+backup_and_link "$DOTFILES_DIR/config/tmux/tmux.conf" "$HOME/.tmux.conf"
+
+# Neovim
+backup_and_link "$DOTFILES_DIR/config/nvim/init.lua" "$HOME/.config/nvim/init.lua"
+
+# Kitty
+backup_and_link "$DOTFILES_DIR/config/kitty/kitty.conf" "$HOME/.config/kitty/kitty.conf"
+
+# Git
+backup_and_link "$DOTFILES_DIR/config/git/gitconfig" "$HOME/.gitconfig"
+
+echo "Done linking."
+
+if [ "$LINK_ONLY" = true ]; then
+    echo "Symlinks only — skipping packages and plugins."
+    exit 0
+fi
+
+# ── packages ─────────────────────────────────────────────────────────
+
+echo ""
+echo "Installing packages..."
+bash "$DOTFILES_DIR/install-scripts/01-install-packages.sh"
+
+# ── plugins ──────────────────────────────────────────────────────────
+
+if command -v fish &>/dev/null; then
+    echo ""
+    echo "Setting up Fish plugins..."
+    fish "$DOTFILES_DIR/install-scripts/03-fisher-install.fish"
+    fish "$DOTFILES_DIR/install-scripts/04-fish-plugins.fish"
+    fish "$DOTFILES_DIR/install-scripts/05-tmux-plugins.fish"
+    fish "$DOTFILES_DIR/install-scripts/06-vim-setup.fish"
+fi
+
+# ── final touches ────────────────────────────────────────────────────
+
+echo ""
+echo "Running final setup..."
+bash "$DOTFILES_DIR/install-scripts/07-last-touches.sh"
+
+echo ""
+echo "All done. Restart your terminal or run: exec fish"
diff --git a/ansible/dotfiles/pr-test.yml b/ansible/dotfiles/pr-test.yml
new file mode 100644
index 0000000..ea73335
--- /dev/null
+++ b/ansible/dotfiles/pr-test.yml
@@ -0,0 +1,30 @@
+name: PR Test
+
+on:
+  pull_request:
+    branches:
+      - master
+      - main
+
+jobs:
+  test:
+    strategy:
+      matrix:
+        distro:
+          - alpine
+          - archlinux
+          - fedora
+          - ubuntu
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Only build
+        run: |
+          docker buildx build --platform linux/amd64 --output "type=image,push=false" -f tests/Dockerfile-${{ matrix.distro }} .
\ No newline at end of file diff --git a/ansible/dotfiles/scripts/gentoo-kernel-upgrade-openrc b/ansible/dotfiles/scripts/gentoo-kernel-upgrade-openrc new file mode 100755 index 0000000..1eeeb95 --- /dev/null +++ b/ansible/dotfiles/scripts/gentoo-kernel-upgrade-openrc @@ -0,0 +1,399 @@ +#!/bin/bash +# +# Gentoo Kernel Upgrade Script +# This script helps upgrade the gentoo-kernel-bin package safely. +# +# IMPORTANT: This script modifies boot files. A failed upgrade could +# prevent your system from booting. Ensure you have a backup plan. +# + +set -euo pipefail + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +BOLD='\033[1m' +NC='\033[0m' # No Color + +EFI_DIR="/boot/efi" +BOOT_DIR="/boot" + +# Print functions +info() { echo -e "${BLUE}==>${NC} ${BOLD}$1${NC}"; } +warn() { echo -e "${YELLOW}WARNING:${NC} $1"; } +error() { echo -e "${RED}ERROR:${NC} $1" >&2; } +success() { echo -e "${GREEN}==>${NC} $1"; } + +# Check if running as root +check_root() { + if [[ $EUID -ne 0 ]]; then + error "This script must be run as root" + exit 1 + fi +} + +# Prompt user for confirmation +confirm() { + local prompt="$1" + local response + echo -e "${YELLOW}${prompt}${NC}" + read -p "Type 'yes' to confirm: " response + [[ "$response" == "yes" ]] +} + +# Get the currently running kernel version +get_current_version() { + uname -r +} + +# Get the latest installed kernel version from /usr/src +get_latest_installed_version() { + local latest + latest=$(ls -1d /usr/src/linux-*-gentoo-dist 2>/dev/null | sort -V | tail -1 | sed 's|.*/linux-||') + echo "$latest" +} + +# Check for available updates using emerge +check_for_updates() { + info "Checking for gentoo-kernel-bin updates..." 
+ echo + + # Use emerge to check for updates (pretend mode) + if emerge -pvu gentoo-kernel-bin 2>/dev/null | grep -q "gentoo-kernel-bin"; then + local update_info + update_info=$(emerge -pvu gentoo-kernel-bin 2>/dev/null | grep -E "gentoo-kernel-bin|ebuild") + echo "$update_info" + return 0 + else + return 1 + fi +} + +# Install the kernel update +install_kernel_update() { + info "Installing gentoo-kernel-bin update..." + echo + + echo -e "${BOLD}The following command will be executed:${NC}" + echo -e " ${BLUE}emerge gentoo-kernel-bin${NC}" + echo + + if ! confirm "Proceed with kernel package installation?"; then + warn "Installation cancelled by user" + return 1 + fi + + echo + emerge gentoo-kernel-bin + + success "Kernel package installed successfully" +} + +# Select the new kernel with eselect +select_kernel() { + local new_version="$1" + + info "Available kernel versions:" + eselect kernel list + echo + + # Find the number corresponding to the new version + local kernel_num + kernel_num=$(eselect kernel list | grep "$new_version" | sed 's/.*\[\([0-9]*\)\].*/\1/') + + if [[ -z "$kernel_num" ]]; then + error "Could not find kernel version $new_version in eselect list" + echo "Please select the kernel manually:" + eselect kernel list + read -p "Enter the number to select: " kernel_num + fi + + echo -e "${BOLD}The following command will be executed:${NC}" + echo -e " ${BLUE}eselect kernel set $kernel_num${NC}" + echo + + if ! confirm "Set kernel $new_version as active?"; then + warn "Kernel selection cancelled by user" + return 1 + fi + + eselect kernel set "$kernel_num" + + # Verify + info "Current kernel symlink:" + eselect kernel list | grep '\*' + echo +} + +# Backup current boot files +backup_boot_files() { + info "Backing up current boot files..." 
+ + echo -e "${BOLD}The following operations will be performed in ${EFI_DIR}:${NC}" + echo -e " ${BLUE}mv vmlinuz.efi -> vmlinuz.efi.bak${NC}" + echo -e " ${BLUE}mv initramfs.img -> initramfs.img.bak${NC}" + echo + + if [[ -f "${EFI_DIR}/vmlinuz.efi.bak" ]] || [[ -f "${EFI_DIR}/initramfs.img.bak" ]]; then + warn "Existing backup files will be overwritten!" + fi + + if ! confirm "Backup current boot files?"; then + warn "Backup cancelled by user" + return 1 + fi + + cd "${EFI_DIR}" + + if [[ -f "vmlinuz.efi" ]]; then + mv vmlinuz.efi vmlinuz.efi.bak + success "Backed up vmlinuz.efi" + fi + + if [[ -f "initramfs.img" ]]; then + mv initramfs.img initramfs.img.bak + success "Backed up initramfs.img" + fi + + echo +} + +# Generate new initramfs +generate_initramfs() { + local version="$1" + + info "Generating new initramfs for kernel ${version}..." + echo + + echo -e "${BOLD}The following command will be executed:${NC}" + echo -e " ${BLUE}${EFI_DIR}/generate_initramfs.sh ${version} ${EFI_DIR}/initramfs.img${NC}" + echo + + if ! confirm "Generate initramfs?"; then + warn "Initramfs generation cancelled by user" + return 1 + fi + + bash "${EFI_DIR}/generate_initramfs.sh" "${version}" "${EFI_DIR}/initramfs.img" + + success "Initramfs generated successfully" + echo +} + +# Copy new kernel to EFI partition +copy_kernel() { + local version="$1" + local kernel_source="${BOOT_DIR}/vmlinuz-${version}" + + info "Copying new kernel to EFI partition..." + echo + + if [[ ! -f "$kernel_source" ]]; then + error "Kernel file not found: $kernel_source" + return 1 + fi + + echo -e "${BOLD}The following command will be executed:${NC}" + echo -e " ${BLUE}cp ${kernel_source} ${EFI_DIR}/vmlinuz.efi${NC}" + echo + + if ! 
confirm "Copy new kernel to EFI partition?"; then + warn "Kernel copy cancelled by user" + return 1 + fi + + cp "$kernel_source" "${EFI_DIR}/vmlinuz.efi" + + success "Kernel copied successfully" + echo +} + +# Cleanup old kernel versions from /boot +cleanup_old_versions() { + info "Checking for old kernel versions to clean up..." + echo + + # Find old initramfs and kernel files (excluding current version) + local initramfs_to_delete=() + local kernels_to_delete=() + + + + while IFS= read -r f; do + initramfs_to_delete+=("$f") + done < <(ls ${BOOT_DIR}/initramfs-*.img 2>/dev/null | grep -v "$current_version" || true) + + while IFS= read -r f; do + kernels_to_delete+=("$f") + done < <(ls ${BOOT_DIR}/vmlinuz-* 2>/dev/null | grep -v "$current_version" || true) + + # Check if there's anything to delete + if [[ ${#initramfs_to_delete[@]} -eq 0 ]] && [[ ${#kernels_to_delete[@]} -eq 0 ]]; then + success "No old kernel versions to clean up" + echo + return 0 + fi + + echo -e "${BOLD}The following files will be DELETED:${NC}" + echo + for f in "${initramfs_to_delete[@]}" "${kernels_to_delete[@]}"; do + echo -e " ${RED}$f${NC}" + done + echo + + warn "This action cannot be undone!" + echo + + if ! 
confirm "Delete these old kernel files?"; then + warn "Cleanup cancelled by user" + return 1 + fi + + echo + for f in "${initramfs_to_delete[@]}" "${kernels_to_delete[@]}"; do + rm -v "$f" + done + + echo + success "Old kernel versions cleaned up" + echo +} + +# Print summary of what will be done +print_summary() { + local current="$1" + local new="$2" + + echo + echo -e "${BOLD}╔════════════════════════════════════════════════════════════════╗${NC}" + echo -e "${BOLD}║ GENTOO KERNEL UPGRADE SUMMARY ║${NC}" + echo -e "${BOLD}╚════════════════════════════════════════════════════════════════╝${NC}" + echo + echo -e " Current running kernel: ${YELLOW}${current}${NC}" + echo -e " New kernel version: ${GREEN}${new}${NC}" + echo + echo -e "${BOLD}The following steps will be performed:${NC}" + echo " 1. Backup current vmlinuz.efi and initramfs.img" + echo " 2. Generate new initramfs using dracut" + echo " 3. Copy new kernel to EFI partition" + echo + echo -e "${RED}${BOLD}WARNING: This process modifies boot files!${NC}" + echo -e "${RED}If something goes wrong, your system may not boot.${NC}" + echo -e "${RED}Make sure you have a way to recover (live USB, backup, etc.)${NC}" + echo +} + +# Main function +main() { + echo + echo -e "${BOLD}╔════════════════════════════════════════════════════════════════╗${NC}" + echo -e "${BOLD}║ GENTOO KERNEL UPGRADE HELPER ║${NC}" + echo -e "${BOLD}╚════════════════════════════════════════════════════════════════╝${NC}" + echo + + check_root + + local current_version + current_version=$(get_current_version) + info "Currently running kernel: ${current_version}" + echo + + # Check for updates + if ! 
check_for_updates; then + success "No updates available for gentoo-kernel-bin" + echo + + local latest_installed + latest_installed=$(get_latest_installed_version) + + if [[ "$latest_installed" != "$current_version" ]] && [[ -n "$latest_installed" ]]; then + # Installed kernel differs from running - offer to set up boot files + warn "Installed kernel ($latest_installed) differs from running kernel ($current_version)" + echo + if confirm "Would you like to set up boot files for $latest_installed?"; then + print_summary "$current_version" "$latest_installed" + + if ! confirm "Proceed with the upgrade process?"; then + echo "Upgrade cancelled." + exit 0 + fi + + select_kernel "$latest_installed" + backup_boot_files + generate_initramfs "$latest_installed" + copy_kernel "$latest_installed" + + echo + success "Kernel upgrade complete!" + echo + echo -e "${BOLD}Next steps:${NC}" + echo " 1. Reboot your system" + echo " 2. Verify the new kernel is running: uname -r" + echo " 3. Run this script again to clean up old versions" + echo + fi + else + # Running kernel is the newest - offer cleanup + cleanup_old_versions + fi + exit 0 + fi + + echo + if ! confirm "Would you like to install this kernel update?"; then + echo "Update cancelled." + exit 0 + fi + + echo + install_kernel_update + + # Get the new version after installation + local new_version + new_version=$(get_latest_installed_version) + + if [[ -z "$new_version" ]]; then + error "Could not determine new kernel version" + exit 1 + fi + + print_summary "$current_version" "$new_version" + + if ! confirm "Proceed with the upgrade process?"; then + echo "Upgrade process cancelled. Kernel package is installed but boot files unchanged." + exit 0 + fi + + echo + select_kernel "$new_version" + backup_boot_files + generate_initramfs "$new_version" + copy_kernel "$new_version" + + echo + success "Kernel upgrade complete!" 
+    echo
+    echo -e "${BOLD}╔════════════════════════════════════════════════════════════════╗${NC}"
+    echo -e "${BOLD}║ NEXT STEPS ║${NC}"
+    echo -e "${BOLD}╚════════════════════════════════════════════════════════════════╝${NC}"
+    echo
+    echo -e " 1. ${YELLOW}Reboot your system${NC}"
+    echo " 2. Verify the new kernel is running:"
+    echo " uname -r"
+    echo -e " Expected: ${GREEN}${new_version}${NC}"
+    echo
+    echo " 3. If the new kernel works correctly, run this script again"
+    echo " to clean up old versions"
+    echo
+    echo -e " 4. ${RED}If the system fails to boot:${NC}"
+    echo " - Boot from a live USB"
+    echo " - Mount your EFI partition"
+    echo " - Restore backups:"
+    echo " mv vmlinuz.efi.bak vmlinuz.efi"
+    echo " mv initramfs.img.bak initramfs.img"
+    echo
+}
+
+main "$@"
diff --git a/ansible/dotfiles/scripts/gentoo-kernel-upgrade-systemd b/ansible/dotfiles/scripts/gentoo-kernel-upgrade-systemd
new file mode 100755
index 0000000..fcc1c63
--- /dev/null
+++ b/ansible/dotfiles/scripts/gentoo-kernel-upgrade-systemd
@@ -0,0 +1,397 @@
+#!/bin/bash
+#
+# Gentoo Kernel Upgrade Script
+# This script helps upgrade the gentoo-kernel-bin package safely.
+#
+# IMPORTANT: This script modifies boot files. A failed upgrade could
+# prevent your system from booting. Ensure you have a backup plan.
+# + +set -euo pipefail + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +BOLD='\033[1m' +NC='\033[0m' # No Color + +EFI_DIR="/boot/efi" +BOOT_DIR="/boot" + +# Print functions +info() { echo -e "${BLUE}==>${NC} ${BOLD}$1${NC}"; } +warn() { echo -e "${YELLOW}WARNING:${NC} $1"; } +error() { echo -e "${RED}ERROR:${NC} $1" >&2; } +success() { echo -e "${GREEN}==>${NC} $1"; } + +# Check if running as root +check_root() { + if [[ $EUID -ne 0 ]]; then + error "This script must be run as root" + exit 1 + fi +} + +# Prompt user for confirmation +confirm() { + local prompt="$1" + local response + echo -e "${YELLOW}${prompt}${NC}" + read -p "Type 'yes' to confirm: " response + [[ "$response" == "yes" ]] +} + +# Get the currently running kernel version +get_current_version() { + uname -r +} + +# Get the latest installed kernel version from /usr/src +get_latest_installed_version() { + local latest + latest=$(ls -1d /usr/src/linux-*-gentoo-dist 2>/dev/null | sort -V | tail -1 | sed 's|.*/linux-||') + echo "$latest" +} + +# Check for available updates using emerge +check_for_updates() { + info "Checking for gentoo-kernel-bin updates..." + echo + + # Use emerge to check for updates (pretend mode) + if emerge -pvu gentoo-kernel-bin 2>/dev/null | grep -q "gentoo-kernel-bin"; then + local update_info + update_info=$(emerge -pvu gentoo-kernel-bin 2>/dev/null | grep -E "gentoo-kernel-bin|ebuild") + echo "$update_info" + return 0 + else + return 1 + fi +} + +# Install the kernel update +install_kernel_update() { + info "Installing gentoo-kernel-bin update..." + echo + + echo -e "${BOLD}The following command will be executed:${NC}" + echo -e " ${BLUE}emerge gentoo-kernel-bin${NC}" + echo + + if ! 
confirm "Proceed with kernel package installation?"; then + warn "Installation cancelled by user" + return 1 + fi + + echo + emerge gentoo-kernel-bin + + success "Kernel package installed successfully" +} + +# Select the new kernel with eselect +select_kernel() { + local new_version="$1" + + info "Available kernel versions:" + eselect kernel list + echo + + # Find the number corresponding to the new version + local kernel_num + kernel_num=$(eselect kernel list | grep "$new_version" | sed 's/.*\[\([0-9]*\)\].*/\1/') + + if [[ -z "$kernel_num" ]]; then + error "Could not find kernel version $new_version in eselect list" + echo "Please select the kernel manually:" + eselect kernel list + read -p "Enter the number to select: " kernel_num + fi + + echo -e "${BOLD}The following command will be executed:${NC}" + echo -e " ${BLUE}eselect kernel set $kernel_num${NC}" + echo + + if ! confirm "Set kernel $new_version as active?"; then + warn "Kernel selection cancelled by user" + return 1 + fi + + eselect kernel set "$kernel_num" + + # Verify + info "Current kernel symlink:" + eselect kernel list | grep '\*' + echo +} + +# Backup current boot files +backup_boot_files() { + info "Backing up current boot files..." + + echo -e "${BOLD}The following operations will be performed in ${EFI_DIR}:${NC}" + echo -e " ${BLUE}mv vmlinuz.efi -> vmlinuz.efi.bak${NC}" + echo -e " ${BLUE}mv initramfs.img -> initramfs.img.bak${NC}" + echo + + if [[ -f "${EFI_DIR}/vmlinuz.efi.bak" ]] || [[ -f "${EFI_DIR}/initramfs.img.bak" ]]; then + warn "Existing backup files will be overwritten!" + fi + + if ! 
confirm "Backup current boot files?"; then + warn "Backup cancelled by user" + return 1 + fi + + cd "${EFI_DIR}" + + if [[ -f "vmlinuz.efi" ]]; then + mv vmlinuz.efi vmlinuz.efi.bak + success "Backed up vmlinuz.efi" + fi + + if [[ -f "initramfs.img" ]]; then + mv initramfs.img initramfs.img.bak + success "Backed up initramfs.img" + fi + + echo +} + +# Generate new initramfs +generate_initramfs() { + local version="$1" + + info "Generating new initramfs for kernel ${version}..." + echo + + echo -e "${BOLD}The following command will be executed:${NC}" + echo -e " ${BLUE}${EFI_DIR}/generate_initramfs.sh ${version} ${EFI_DIR}/initramfs.img${NC}" + echo + + if ! confirm "Generate initramfs?"; then + warn "Initramfs generation cancelled by user" + return 1 + fi + + bash "${EFI_DIR}/generate_initramfs.sh" "${version}" "${EFI_DIR}/initramfs.img" + + success "Initramfs generated successfully" + echo +} + +# Copy new kernel to EFI partition +copy_kernel() { + local version="$1" + local kernel_source="${BOOT_DIR}/kernel-${version}" + + info "Copying new kernel to EFI partition..." + echo + + if [[ ! -f "$kernel_source" ]]; then + error "Kernel file not found: $kernel_source" + return 1 + fi + + echo -e "${BOLD}The following command will be executed:${NC}" + echo -e " ${BLUE}cp ${kernel_source} ${EFI_DIR}/vmlinuz.efi${NC}" + echo + + if ! confirm "Copy new kernel to EFI partition?"; then + warn "Kernel copy cancelled by user" + return 1 + fi + + cp "$kernel_source" "${EFI_DIR}/vmlinuz.efi" + + success "Kernel copied successfully" + echo +} + +# Cleanup old kernel versions from /boot +cleanup_old_versions() { + info "Checking for old kernel versions to clean up..." 
+ echo + + # Find old initramfs and kernel files (excluding current version) + local initramfs_to_delete=() + local kernels_to_delete=() + + while IFS= read -r f; do + initramfs_to_delete+=("$f") + done < <(ls ${BOOT_DIR}/initramfs-*.img 2>/dev/null | grep -v "$current_version" || true) + + while IFS= read -r f; do + kernels_to_delete+=("$f") + done < <(ls ${BOOT_DIR}/kernel-* 2>/dev/null | grep -v "$current_version" || true) + + # Check if there's anything to delete + if [[ ${#initramfs_to_delete[@]} -eq 0 ]] && [[ ${#kernels_to_delete[@]} -eq 0 ]]; then + success "No old kernel versions to clean up" + echo + return 0 + fi + + echo -e "${BOLD}The following files will be DELETED:${NC}" + echo + for f in "${initramfs_to_delete[@]}" "${kernels_to_delete[@]}"; do + echo -e " ${RED}$f${NC}" + done + echo + + warn "This action cannot be undone!" + echo + + if ! confirm "Delete these old kernel files?"; then + warn "Cleanup cancelled by user" + return 1 + fi + + echo + for f in "${initramfs_to_delete[@]}" "${kernels_to_delete[@]}"; do + rm -v "$f" + done + + echo + success "Old kernel versions cleaned up" + echo +} + +# Print summary of what will be done +print_summary() { + local current="$1" + local new="$2" + + echo + echo -e "${BOLD}╔════════════════════════════════════════════════════════════════╗${NC}" + echo -e "${BOLD}║ GENTOO KERNEL UPGRADE SUMMARY ║${NC}" + echo -e "${BOLD}╚════════════════════════════════════════════════════════════════╝${NC}" + echo + echo -e " Current running kernel: ${YELLOW}${current}${NC}" + echo -e " New kernel version: ${GREEN}${new}${NC}" + echo + echo -e "${BOLD}The following steps will be performed:${NC}" + echo " 1. Backup current vmlinuz.efi and initramfs.img" + echo " 2. Generate new initramfs using dracut" + echo " 3. 
Copy new kernel to EFI partition" + echo + echo -e "${RED}${BOLD}WARNING: This process modifies boot files!${NC}" + echo -e "${RED}If something goes wrong, your system may not boot.${NC}" + echo -e "${RED}Make sure you have a way to recover (live USB, backup, etc.)${NC}" + echo +} + +# Main function +main() { + echo + echo -e "${BOLD}╔════════════════════════════════════════════════════════════════╗${NC}" + echo -e "${BOLD}║ GENTOO KERNEL UPGRADE HELPER ║${NC}" + echo -e "${BOLD}╚════════════════════════════════════════════════════════════════╝${NC}" + echo + + check_root + + local current_version + current_version=$(get_current_version) + info "Currently running kernel: ${current_version}" + echo + + # Check for updates + if ! check_for_updates; then + success "No updates available for gentoo-kernel-bin" + echo + + local latest_installed + latest_installed=$(get_latest_installed_version) + + if [[ "$latest_installed" != "$current_version" ]] && [[ -n "$latest_installed" ]]; then + # Installed kernel differs from running - offer to set up boot files + warn "Installed kernel ($latest_installed) differs from running kernel ($current_version)" + echo + if confirm "Would you like to set up boot files for $latest_installed?"; then + print_summary "$current_version" "$latest_installed" + + if ! confirm "Proceed with the upgrade process?"; then + echo "Upgrade cancelled." + exit 0 + fi + + select_kernel "$latest_installed" + backup_boot_files + generate_initramfs "$latest_installed" + copy_kernel "$latest_installed" + + echo + success "Kernel upgrade complete!" + echo + echo -e "${BOLD}Next steps:${NC}" + echo " 1. Reboot your system" + echo " 2. Verify the new kernel is running: uname -r" + echo " 3. Run this script again to clean up old versions" + echo + fi + else + # Running kernel is the newest - offer cleanup + cleanup_old_versions + fi + exit 0 + fi + + echo + if ! confirm "Would you like to install this kernel update?"; then + echo "Update cancelled." 
+        exit 0
+    fi
+
+    echo
+    install_kernel_update
+
+    # Get the new version after installation
+    local new_version
+    new_version=$(get_latest_installed_version)
+
+    if [[ -z "$new_version" ]]; then
+        error "Could not determine new kernel version"
+        exit 1
+    fi
+
+    print_summary "$current_version" "$new_version"
+
+    if ! confirm "Proceed with the upgrade process?"; then
+        echo "Upgrade process cancelled. Kernel package is installed but boot files unchanged."
+        exit 0
+    fi
+
+    echo
+    select_kernel "$new_version"
+    backup_boot_files
+    generate_initramfs "$new_version"
+    copy_kernel "$new_version"
+
+    echo
+    success "Kernel upgrade complete!"
+    echo
+    echo -e "${BOLD}╔════════════════════════════════════════════════════════════════╗${NC}"
+    echo -e "${BOLD}║ NEXT STEPS ║${NC}"
+    echo -e "${BOLD}╚════════════════════════════════════════════════════════════════╝${NC}"
+    echo
+    echo -e " 1. ${YELLOW}Reboot your system${NC}"
+    echo " 2. Verify the new kernel is running:"
+    echo " uname -r"
+    echo -e " Expected: ${GREEN}${new_version}${NC}"
+    echo
+    echo " 3. If the new kernel works correctly, run this script again"
+    echo " to clean up old versions"
+    echo
+    echo -e " 4. ${RED}If the system fails to boot:${NC}"
+    echo " - Boot from a live USB"
+    echo " - Mount your EFI partition"
+    echo " - Restore backups:"
+    echo " mv vmlinuz.efi.bak vmlinuz.efi"
+    echo " mv initramfs.img.bak initramfs.img"
+    echo
+}
+
+main "$@"
diff --git a/ansible/group_vars/all/secrets.enc.yaml b/ansible/group_vars/all/secrets.enc.yaml
new file mode 100644
index 0000000..1863201
--- /dev/null
+++ b/ansible/group_vars/all/secrets.enc.yaml
@@ -0,0 +1,16 @@
+grafana_pagerduty_integration_key: ENC[AES256_GCM,data:eXfaIsRwfqZm5ROIIFpeyuyk4/4wi3M02Bmgl7/SoRk=,iv:4HcB2WTDTrDADwE/ZVK84l6aIxayzz3e7VFZpVNY3Pg=,tag:4mTmfCP9GV9rGY2ALxWhgQ==,type:str]
+sops:
+    age:
+        - recipient: age1r8uh2w2qad2z5sgq9q7l73962q2sp8zz9hdnh6sjuvanxl565vmswn8squ
+          enc: |
+            -----BEGIN AGE ENCRYPTED FILE-----
+            YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSAvSFY2QWdpZ3lUUTZDYk5F
+            MVFyMy9rOG1ZejlrQUxPWjFpTlZpS2w1dHlVCmg4VkpYTmhYUk42ZytpOUJ5czg5
+            VmJUQ2VJKzNtdWhiUGhTTVhndmplWG8KLS0tIHhKWkpWVG5CcnI0NkNwWGRCMDM5
+            QVRncWVjR25sR0ViY1pCWkFXYVRURFUKyknDYVq9Bzo3Bdi4/dl3Ryj3qkLiGhrW
+            hlRDiPyTWfMPwffF3SmKCrI60b+Y0QhkZ+/Wym4JuuW1TKQERv4lhw==
+            -----END AGE ENCRYPTED FILE-----
+    lastmodified: "2026-03-25T19:50:39Z"
+    mac: ENC[AES256_GCM,data:EFguQ9aY0I+y0WEkJCAA19uuRtquZVHdRBTjjGr2BoeVrq16iu1dIeZOH3eMj/b1MvI1NF7DoLJEUsxKZDS2YeTsBF46oOmc0VErR7w1BIlET8FhQANhqtPF0OglwYD5gCsxXjbx25TLfOpP4iHPwLY440Gk3BtWR1TMObD4C+c=,iv:/ncyoPfWMjeCSWI4Hfv1Tm5rRH0+ytT3OBlKNyy7k+I=,tag:VYyCwmBTfBHWF9ICZz7kSg==,type:str]
+    unencrypted_suffix: _unencrypted
+    version: 3.12.2
diff --git a/ansible/inventory/host_vars/copenhagen-a.yml b/ansible/inventory/host_vars/copenhagen-a.yml
new file mode 100644
index 0000000..13cf262
--- /dev/null
+++ b/ansible/inventory/host_vars/copenhagen-a.yml
@@ -0,0 +1,12 @@
+---
+host_role: gaming
+host_description: "Gaming servers (Minecraft, WoW/MaNGOS)"
+host_location: "Copenhagen"
+
+docker_services:
+  - minecraft
+
+# MaNGOS runs as systemd services, not Docker
+systemd_services:
+  - mangos-realmd
+  -
mangos-world diff --git a/ansible/inventory/host_vars/copenhagen-c.yml b/ansible/inventory/host_vars/copenhagen-c.yml new file mode 100644 index 0000000..f1911d1 --- /dev/null +++ b/ansible/inventory/host_vars/copenhagen-c.yml @@ -0,0 +1,4 @@ +--- +host_role: idle +host_description: "Idle/available" +host_location: "Copenhagen" diff --git a/ansible/inventory/host_vars/helsinki-a.yml b/ansible/inventory/host_vars/helsinki-a.yml new file mode 100644 index 0000000..5f88512 --- /dev/null +++ b/ansible/inventory/host_vars/helsinki-a.yml @@ -0,0 +1,6 @@ +--- +host_role: proxy +host_description: "Reverse proxy (Caddy), main traffic gateway" +host_location: "Hetzner Cloud" + +caddy_config_src: "{{ playbook_dir }}/services/caddy/Caddyfile" diff --git a/ansible/inventory/host_vars/london-a.yml b/ansible/inventory/host_vars/london-a.yml new file mode 100644 index 0000000..f5d4c67 --- /dev/null +++ b/ansible/inventory/host_vars/london-a.yml @@ -0,0 +1,6 @@ +--- +host_role: monitoring +host_description: "Monitoring stack (Prometheus, Grafana)" +host_location: "London" +ansible_python_interpreter: /usr/local/bin/python3 +grafana_provisioning_dir: /usr/local/share/grafana/conf/provisioning diff --git a/ansible/inventory/host_vars/london-b.yml b/ansible/inventory/host_vars/london-b.yml new file mode 100644 index 0000000..086e2d7 --- /dev/null +++ b/ansible/inventory/host_vars/london-b.yml @@ -0,0 +1,16 @@ +--- +host_role: storage +host_description: "Primary storage (ZFS), Docker services" +host_location: "London" + +docker_services: + - nextcloud-aio + - jellyseerr + - navidrome + - slskd + - authelia + - forgejo + - bitwarden + - miniflux + - smartctl-exporter + - plex-exporter diff --git a/ansible/inventory/host_vars/nuremberg-a.yml b/ansible/inventory/host_vars/nuremberg-a.yml new file mode 100644 index 0000000..07f69bd --- /dev/null +++ b/ansible/inventory/host_vars/nuremberg-a.yml @@ -0,0 +1,5 @@ +--- +host_role: mail +host_description: "Mail server (poste.io)" 
+host_location: "Hetzner Cloud" +ansible_python_interpreter: /usr/bin/python3 diff --git a/ansible/inventory/hosts.ini b/ansible/inventory/hosts.ini new file mode 100644 index 0000000..7f1581e --- /dev/null +++ b/ansible/inventory/hosts.ini @@ -0,0 +1,25 @@ +# pez-infra fleet inventory +# All hosts accessible via Tailscale IPs over SSH as root. + +[linux] +helsinki-a ansible_host=100.67.6.27 +london-b ansible_host=100.84.65.101 +copenhagen-a ansible_host=100.89.206.60 +copenhagen-c ansible_host=100.115.45.53 + +[alpine] +nuremberg-a ansible_host=100.117.235.28 + +[freebsd] +london-a ansible_host=100.122.219.41 + +[docker_hosts] +london-b +nuremberg-a +copenhagen-a + +[monitoring] +london-a + +[all:vars] +ansible_user=root diff --git a/ansible/playbooks/docker-status.yml b/ansible/playbooks/docker-status.yml new file mode 100644 index 0000000..3a11b00 --- /dev/null +++ b/ansible/playbooks/docker-status.yml @@ -0,0 +1,16 @@ +--- +# Show running Docker containers on all docker hosts. +# Usage: ansible-playbook playbooks/docker-status.yml + +- name: Docker container status + hosts: docker_hosts + ignore_unreachable: true + tasks: + - name: Run docker ps + ansible.builtin.command: docker ps --format "table {{ '{{' }}.Names{{ '}}' }}\t{{ '{{' }}.Status{{ '}}' }}\t{{ '{{' }}.Ports{{ '}}' }}" + register: docker_ps + changed_when: false + + - name: Show containers on {{ inventory_hostname }} + ansible.builtin.debug: + msg: "{{ docker_ps.stdout_lines }}" diff --git a/ansible/playbooks/monitoring.yml b/ansible/playbooks/monitoring.yml new file mode 100644 index 0000000..ffbc146 --- /dev/null +++ b/ansible/playbooks/monitoring.yml @@ -0,0 +1,95 @@ +--- +# Deploy monitoring stack to london-a (Prometheus + Grafana). 
+# Usage: ansible-playbook playbooks/monitoring.yml
+#        ansible-playbook playbooks/monitoring.yml --check --diff
+
+- name: "Monitoring stack (london-a)"
+  hosts: london-a
+  pre_tasks:
+    - name: Load secrets
+      ansible.builtin.include_vars:
+        file: "{{ playbook_dir }}/../group_vars/all/secrets.yaml"  # presumably the decrypted copy of secrets.enc.yaml — TODO confirm
+      no_log: true
+  tasks:
+    - name: Check for Prometheus config
+      delegate_to: localhost
+      ansible.builtin.stat:
+        path: "{{ playbook_dir }}/../services/prometheus/prometheus.yml"
+      register: prometheus_config
+
+    - name: Deploy Prometheus config
+      ansible.builtin.copy:
+        src: "{{ playbook_dir }}/../services/prometheus/prometheus.yml"
+        dest: /usr/local/etc/prometheus.yml
+        mode: '0644'
+        backup: true
+      when: prometheus_config.stat.exists
+      notify: Restart prometheus
+
+    - name: Deploy Prometheus alerting rules
+      ansible.builtin.copy:
+        src: "{{ playbook_dir }}/../services/prometheus/rules/"
+        dest: /usr/local/etc/prometheus/rules/
+        mode: '0644'
+      failed_when: false  # best-effort: errors from this copy are ignored
+      notify: Restart prometheus
+
+    - name: Ensure unified_alerting section exists in Grafana config
+      ansible.builtin.lineinfile:
+        path: /usr/local/etc/grafana/grafana.ini
+        regexp: '^\[unified_alerting\]'
+        line: '[unified_alerting]'
+      notify: Restart grafana
+
+    - name: Allow provenance status change in Grafana
+      ansible.builtin.lineinfile:
+        path: /usr/local/etc/grafana/grafana.ini
+        regexp: '^allow_prov_status_change'
+        insertafter: '^\[unified_alerting\]'
+        line: 'allow_prov_status_change = true'
+      notify: Restart grafana
+
+    - name: Deploy Grafana dashboards
+      ansible.posix.synchronize:
+        src: "{{ playbook_dir }}/../services/grafana/dashboards/"
+        dest: /usr/local/etc/grafana/dashboards/
+      failed_when: false  # best-effort: errors from this sync are ignored
+
+    - name: Ensure provisioning dir exists
+      ansible.builtin.file:
+        path: "{{ grafana_provisioning_dir }}"
+        state: directory
+        mode: '0755'
+
+    - name: Ensure alerting dir exists
+      ansible.builtin.file:
+        path: "{{ grafana_provisioning_dir }}/alerting"
+        state: directory
+        mode: '0755'
+
+    - name: Deploy Grafana provisioning
+      ansible.posix.synchronize:
+        src: "{{ playbook_dir }}/../services/grafana/provisioning/"
+        dest: "{{ grafana_provisioning_dir }}/"
+      failed_when: false  # NOTE(review): this tree includes alerting/contact-points.yml, which the template task below rewrites — confirm the double write is intended
+
+    - name: Template contact points with PagerDuty key
+      ansible.builtin.template:
+        src: "{{ playbook_dir }}/../services/grafana/provisioning/alerting/contact-points.yml"
+        dest: "{{ grafana_provisioning_dir }}/alerting/contact-points.yml"
+        mode: '0640'
+        owner: root
+        group: grafana
+      no_log: true
+      notify: Restart grafana
+
+  handlers:
+    - name: Restart prometheus
+      ansible.builtin.service:
+        name: prometheus
+        state: restarted
+
+    - name: Restart grafana
+      ansible.builtin.service:
+        name: grafana
+        state: restarted
diff --git a/ansible/playbooks/reboot.yml b/ansible/playbooks/reboot.yml
new file mode 100644
index 0000000..f165bfb
--- /dev/null
+++ b/ansible/playbooks/reboot.yml
@@ -0,0 +1,68 @@
+---
+# Reboot a specific host and wait for it to come back.
+# Usage: ansible-playbook playbooks/reboot.yml --limit <host>
+#
+# Safety: copenhagen-a has netplan pre-flight check.
+#         london-b should NOT be rebooted without manual approval.
+
+- name: Reboot host safely
+  hosts: all
+  ignore_unreachable: true
+  tasks:
+    - name: SAFETY — refuse unscoped reboot
+      ansible.builtin.fail:
+        msg: >
+          ABORT: You must use --limit to reboot a specific host.
+          Running against all hosts is not allowed.
+      when: ansible_play_hosts_all | length > 1  # count every targeted host, including unreachable ones
+
+    - name: SAFETY — london-b requires manual approval
+      ansible.builtin.pause:
+        prompt: >
+          WARNING: london-b is the primary storage server. Rebooting will
+          take down ZFS pools and all Docker services. Type 'yes' to confirm.
+      register: london_b_confirm
+      when: inventory_hostname == 'london-b'
+
+    - name: Abort if london-b not confirmed
+      ansible.builtin.fail:
+        msg: "Reboot of london-b was not confirmed."
+ when: inventory_hostname == 'london-b' and london_b_confirm.user_input != 'yes' + + - name: Check netplan config (copenhagen-a) + ansible.builtin.command: netplan get all + register: netplan_config + failed_when: false + changed_when: false + when: inventory_hostname == 'copenhagen-a' + + - name: Verify copenhagen-a static IP + ansible.builtin.assert: + that: + - "'192.168.0.251' in netplan_config.stdout" + fail_msg: > + ABORT: copenhagen-a netplan doesn't show expected static IP + 192.168.0.251. Check netplan config before rebooting. + success_msg: "copenhagen-a netplan OK — static IP present." + when: inventory_hostname == 'copenhagen-a' + + - name: Reboot + ansible.builtin.reboot: + reboot_timeout: 300 + connect_timeout: 10 + pre_reboot_delay: 5 + post_reboot_delay: 15 + test_command: uptime + + - name: Verify SSH is back + ansible.builtin.wait_for_connection: + timeout: 120 + + - name: Show uptime after reboot + ansible.builtin.command: uptime + register: uptime_result + changed_when: false + + - name: Post-reboot uptime + ansible.builtin.debug: + msg: "{{ inventory_hostname }} is back: {{ uptime_result.stdout }}" diff --git a/ansible/playbooks/update-all.yml b/ansible/playbooks/update-all.yml new file mode 100644 index 0000000..f79565f --- /dev/null +++ b/ansible/playbooks/update-all.yml @@ -0,0 +1,64 @@ +--- +# Update all hosts (apt for Linux, pkg for FreeBSD, apk for Alpine). 
+# Usage: ansible-playbook playbooks/update-all.yml + +- name: Update Linux hosts (apt) + hosts: linux + ignore_unreachable: true + tags: [update, linux] + tasks: + - name: Apt update + upgrade + autoremove + ansible.builtin.apt: + update_cache: true + upgrade: dist + autoremove: true + autoclean: true + register: apt_result + + - name: Show upgrade summary + ansible.builtin.debug: + msg: "{{ apt_result.stdout_lines | default(['No output']) }}" + + - name: Check if reboot is required + ansible.builtin.stat: + path: /var/run/reboot-required + register: reboot_required + + - name: Notify if reboot needed + ansible.builtin.debug: + msg: "WARNING: REBOOT REQUIRED on {{ inventory_hostname }}" + when: reboot_required.stat.exists + +- name: Update Alpine hosts (apk) + hosts: alpine + ignore_unreachable: true + tags: [update, alpine] + tasks: + - name: Apk update + upgrade + community.general.apk: + update_cache: true + upgrade: true + register: apk_result + + - name: Show upgrade summary + ansible.builtin.debug: + msg: "{{ apk_result.stdout_lines | default(['No output']) }}" + +- name: Update FreeBSD hosts (pkg) + hosts: freebsd + ignore_unreachable: true + tags: [update, freebsd] + tasks: + - name: Pkg update + ansible.builtin.command: pkg update -f + register: pkg_update_result + changed_when: "'Updating' in pkg_update_result.stdout" + + - name: Pkg upgrade + ansible.builtin.command: pkg upgrade -y + register: pkg_upgrade_result + changed_when: "'upgraded' in pkg_upgrade_result.stdout or 'installed' in pkg_upgrade_result.stdout" + + - name: Show upgrade summary + ansible.builtin.debug: + msg: "{{ pkg_upgrade_result.stdout_lines | default(['No output']) }}" diff --git a/ansible/playbooks/update-freebsd.yml b/ansible/playbooks/update-freebsd.yml new file mode 100644 index 0000000..ffbcca3 --- /dev/null +++ b/ansible/playbooks/update-freebsd.yml @@ -0,0 +1,24 @@ +--- +# Update FreeBSD hosts only (pkg). 
+# Usage: ansible-playbook playbooks/update-freebsd.yml +# +# Equivalent to: ansible-playbook playbooks/update-all.yml --tags freebsd +# This is a convenience wrapper for when you only want FreeBSD hosts. + +- name: Update FreeBSD hosts (pkg) + hosts: freebsd + ignore_unreachable: true + tasks: + - name: Pkg update + ansible.builtin.command: pkg update -f + register: pkg_update_result + changed_when: "'Updating' in pkg_update_result.stdout" + + - name: Pkg upgrade + ansible.builtin.command: pkg upgrade -y + register: pkg_upgrade_result + changed_when: "'upgraded' in pkg_upgrade_result.stdout or 'installed' in pkg_upgrade_result.stdout" + + - name: Show upgrade summary + ansible.builtin.debug: + msg: "{{ pkg_upgrade_result.stdout_lines | default(['No output']) }}" diff --git a/ansible/playbooks/update-linux.yml b/ansible/playbooks/update-linux.yml new file mode 100644 index 0000000..97b1ffe --- /dev/null +++ b/ansible/playbooks/update-linux.yml @@ -0,0 +1,46 @@ +--- +# Update Linux hosts only (apt + Alpine apk). +# Usage: ansible-playbook playbooks/update-linux.yml +# +# Equivalent to: ansible-playbook playbooks/update-all.yml --tags linux,alpine +# This is a convenience wrapper for when you only want Linux hosts. 
+ +- name: Update Linux hosts (apt) + hosts: linux + ignore_unreachable: true + tasks: + - name: Apt update + upgrade + autoremove + ansible.builtin.apt: + update_cache: true + upgrade: dist + autoremove: true + autoclean: true + register: apt_result + + - name: Show upgrade summary + ansible.builtin.debug: + msg: "{{ apt_result.stdout_lines | default(['No output']) }}" + + - name: Check if reboot is required + ansible.builtin.stat: + path: /var/run/reboot-required + register: reboot_required + + - name: Notify if reboot needed + ansible.builtin.debug: + msg: "WARNING: REBOOT REQUIRED on {{ inventory_hostname }}" + when: reboot_required.stat.exists + +- name: Update Alpine hosts (apk) + hosts: alpine + ignore_unreachable: true + tasks: + - name: Apk update + upgrade + community.general.apk: + update_cache: true + upgrade: true + register: apk_result + + - name: Show upgrade summary + ansible.builtin.debug: + msg: "{{ apk_result.stdout_lines | default(['No output']) }}" diff --git a/ansible/requirements.yml b/ansible/requirements.yml new file mode 100644 index 0000000..2673df4 --- /dev/null +++ b/ansible/requirements.yml @@ -0,0 +1,11 @@ +--- +# Ansible Galaxy requirements +# Install: ansible-galaxy install -r requirements.yml + +collections: + - name: community.docker + version: ">=3.0.0" + - name: community.general + version: ">=7.0.0" + - name: ansible.posix + version: ">=1.5.0" diff --git a/ansible/roles/caddy/handlers/main.yml b/ansible/roles/caddy/handlers/main.yml new file mode 100644 index 0000000..a233e60 --- /dev/null +++ b/ansible/roles/caddy/handlers/main.yml @@ -0,0 +1,5 @@ +--- +- name: Reload caddy + ansible.builtin.service: + name: caddy + state: reloaded diff --git a/ansible/roles/caddy/tasks/main.yml b/ansible/roles/caddy/tasks/main.yml new file mode 100644 index 0000000..c9ad589 --- /dev/null +++ b/ansible/roles/caddy/tasks/main.yml @@ -0,0 +1,31 @@ +--- +# Deploy Caddy reverse proxy on helsinki-a. +# Expects Caddyfile in services/caddy/. 
+ +- name: Install Caddy + ansible.builtin.apt: + name: caddy + state: present + when: ansible_facts["os_family"] == "Debian" + +- name: Check for Caddyfile in repo + delegate_to: localhost + ansible.builtin.stat: + path: "{{ playbook_dir }}/services/caddy/Caddyfile" + register: caddy_caddyfile_src + +- name: Deploy Caddyfile + ansible.builtin.copy: + src: "{{ playbook_dir }}/services/caddy/Caddyfile" + dest: /etc/caddy/Caddyfile + mode: '0644' + backup: true + validate: "caddy validate --config %s --adapter caddyfile" + notify: Reload caddy + when: caddy_caddyfile_src.stat.exists + +- name: Enable and start Caddy + ansible.builtin.service: + name: caddy + state: started + enabled: true diff --git a/ansible/roles/common/handlers/main.yml b/ansible/roles/common/handlers/main.yml new file mode 100644 index 0000000..6998953 --- /dev/null +++ b/ansible/roles/common/handlers/main.yml @@ -0,0 +1,5 @@ +--- +- name: Restart sshd + ansible.builtin.service: + name: sshd + state: restarted diff --git a/ansible/roles/common/tasks/main.yml b/ansible/roles/common/tasks/main.yml new file mode 100644 index 0000000..b5b6139 --- /dev/null +++ b/ansible/roles/common/tasks/main.yml @@ -0,0 +1,102 @@ +--- +# Common baseline for all Linux hosts. +# Installs core packages, configures SSH, sets up the shell environment. 
+
+- name: Update apt cache
+  ansible.builtin.apt:
+    update_cache: true
+    cache_valid_time: 3600
+  when: ansible_facts["os_family"] == "Debian"
+
+- name: Install baseline packages (Debian)
+  ansible.builtin.apt:
+    name:
+      - curl
+      - wget
+      - git
+      - htop
+      - tmux
+      - vim
+      - jq
+      - unzip
+      - fish
+      - rsync
+      - fail2ban
+      - ufw
+    state: present
+  when: ansible_facts["os_family"] == "Debian"
+
+- name: Install baseline packages (Alpine)
+  community.general.apk:
+    name:
+      - curl
+      - wget
+      - git
+      - htop
+      - tmux
+      - vim
+      - jq
+      - fish
+      - rsync
+      - shadow
+      - py3-requests
+    state: present
+  when: ansible_facts["os_family"] == "Alpine"
+
+- name: Install baseline packages (FreeBSD)
+  community.general.pkgng:
+    name:
+      - curl
+      - wget
+      - git
+      - htop
+      - tmux
+      - vim
+      - jq
+      - rsync
+    state: present
+  when: ansible_facts["os_family"] == "FreeBSD"
+
+- name: Install fish shell
+  ansible.builtin.package:
+    name: fish
+    state: present
+  when: inventory_hostname != 'london-a'
+
+- name: Get fish shell path
+  ansible.builtin.command: which fish
+  changed_when: false
+  register: common_fish_path
+  when: inventory_hostname != 'london-a'
+
+- name: Set fish as default shell
+  ansible.builtin.user:
+    name: root
+    shell: "{{ common_fish_path.stdout }}"
+  when: inventory_hostname != 'london-a'
+
+- name: Ensure SSH directory exists
+  ansible.builtin.file:
+    path: /root/.ssh
+    state: directory
+    mode: '0700'
+
+- name: Harden SSH config
+  ansible.builtin.lineinfile:
+    path: /etc/ssh/sshd_config
+    regexp: "{{ item.regexp }}"
+    line: "{{ item.line }}"
+    state: present
+  loop:
+    - { regexp: '^#?PermitRootLogin', line: 'PermitRootLogin prohibit-password' }
+    - { regexp: '^#?PasswordAuthentication', line: 'PasswordAuthentication no' }
+    - { regexp: '^#?X11Forwarding', line: 'X11Forwarding no' }
+  notify: Restart sshd
+  when: ansible_facts["os_family"] != "FreeBSD"
+
+- name: Enable fail2ban (Debian)
+  ansible.builtin.service:
+    name: fail2ban
+    state: started
+    enabled: true
+  when: ansible_facts["os_family"] == "Debian"
diff --git a/ansible/roles/docker/tasks/main.yml b/ansible/roles/docker/tasks/main.yml
new file mode 100644
index 0000000..f2dec33
--- /dev/null
+++ b/ansible/roles/docker/tasks/main.yml
@@ -0,0 +1,31 @@
+---
+# Install Docker and docker-compose, start the daemon.
+
+- name: Install Docker (Debian)
+  ansible.builtin.apt:
+    name:
+      - docker.io
+      - docker-compose  # NOTE(review): docker_services uses community.docker.docker_compose_v2 — confirm this package ships the Compose v2 CLI plugin
+    state: present
+  when: ansible_facts["os_family"] == "Debian"
+
+- name: Install Docker (Alpine)
+  community.general.apk:
+    name:
+      - docker
+      - docker-cli-compose
+    state: present
+  when: ansible_facts["os_family"] == "Alpine"
+
+- name: Enable and start Docker
+  ansible.builtin.service:
+    name: docker
+    state: started
+    enabled: true
+
+- name: Create docker compose project directories
+  ansible.builtin.file:
+    path: "/opt/docker/{{ item }}"
+    state: directory
+    mode: '0755'
+  loop: "{{ docker_services | default([]) }}"
diff --git a/ansible/roles/docker_services/tasks/main.yml b/ansible/roles/docker_services/tasks/main.yml
new file mode 100644
index 0000000..5d9555e
--- /dev/null
+++ b/ansible/roles/docker_services/tasks/main.yml
@@ -0,0 +1,32 @@
+---
+# Deploy Docker Compose services from the repo's services/ directory.
+# Expects docker_services list in host_vars and compose files in services/<service>/.
+
+- name: Copy docker-compose files
+  ansible.builtin.copy:
+    src: "{{ playbook_dir }}/services/{{ item }}/docker-compose.yml"
+    dest: "/opt/docker/{{ item }}/docker-compose.yml"
+    mode: '0644'
+  loop: "{{ docker_services | default([]) }}"
+  register: docker_services_compose_files  # per-item results drive the compose task below
+  failed_when: false
+
+- name: Copy service config files
+  ansible.posix.synchronize:
+    src: "{{ playbook_dir }}/services/{{ item }}/"
+    dest: "/opt/docker/{{ item }}/"
+    rsync_opts:
+      - "--exclude=docker-compose.yml"
+      - "--exclude=README.md"
+      - "--exclude=.gitkeep"
+  loop: "{{ docker_services | default([]) }}"
+  failed_when: false
+
+- name: Start/update docker compose services
+  community.docker.docker_compose_v2:
+    project_src: "/opt/docker/{{ item.item }}"
+    state: present
+    pull: policy
+  loop: "{{ docker_services_compose_files.results | default([]) }}"
+  when: item is not failed and item is changed  # only acts when the compose file itself changed; NOTE(review): the copy sets failed_when: false, so confirm "is not failed" can ever filter anything
+  failed_when: false
diff --git a/ansible/roles/dotfiles/tasks/main.yml b/ansible/roles/dotfiles/tasks/main.yml
new file mode 100644
index 0000000..50be56f
--- /dev/null
+++ b/ansible/roles/dotfiles/tasks/main.yml
@@ -0,0 +1,24 @@
+---
+# Deploy dotfiles from the repo's dotfiles/ directory.
+# Copies config files into root's home directory, keeping backups.
+
+- name: Check for dotfiles source
+  delegate_to: localhost
+  ansible.builtin.stat:
+    path: "{{ playbook_dir }}/dotfiles"
+  register: dotfiles_dir
+
+- name: Copy dotfiles
+  ansible.builtin.copy:
+    src: "{{ playbook_dir }}/dotfiles/{{ item.src }}"
+    dest: "{{ item.dest }}"
+    mode: '0644'
+    backup: true
+  loop:
+    - { src: 'config/tmux/tmux.conf', dest: '/root/.tmux.conf' }
+    - { src: 'config/fish/config.fish', dest: '/root/.config/fish/config.fish' }
+    - { src: 'config/git/gitconfig', dest: '/root/.gitconfig' }
+  failed_when: false  # NOTE(review): nothing here creates /root/.config/fish; confirm a missing parent dir is not being silently ignored
+  when:
+    - dotfiles_dir.stat.exists
+    - not (inventory_hostname == 'london-a' and item.src is search('fish'))
diff --git a/ansible/roles/node_exporter/tasks/main.yml b/ansible/roles/node_exporter/tasks/main.yml
new file mode 100644
index 0000000..8209461
--- /dev/null
+++ b/ansible/roles/node_exporter/tasks/main.yml
@@ -0,0 +1,48 @@
+---
+# Install node_exporter for Prometheus monitoring.
+# Uses system packages on Linux, pkg on FreeBSD.
+
+- name: Install prometheus-node-exporter (Debian)
+  ansible.builtin.apt:
+    name: prometheus-node-exporter
+    state: present
+  when: ansible_facts["os_family"] == "Debian"
+
+- name: Install prometheus-node-exporter (Alpine)
+  community.general.apk:
+    name: prometheus-node-exporter
+    state: present
+  when: ansible_facts["os_family"] == "Alpine"
+
+- name: Enable and start node-exporter (Debian)
+  ansible.builtin.service:
+    name: prometheus-node-exporter
+    state: started
+    enabled: true
+  when: ansible_facts["os_family"] == "Debian"
+
+- name: Enable and start node-exporter (Alpine)
+  ansible.builtin.service:
+    name: node-exporter
+    state: started
+    enabled: true
+  when: ansible_facts["os_family"] == "Alpine"
+
+- name: Install node_exporter (FreeBSD)
+  community.general.pkgng:
+    name: node_exporter
+    state: present
+  when: ansible_facts["os_family"] == "FreeBSD"
+
+- name: Enable node_exporter (FreeBSD)
+  ansible.builtin.lineinfile:
+    path: /etc/rc.conf
+    regexp: '^node_exporter_enable='
+    line: 'node_exporter_enable="YES"'
+  when: ansible_facts["os_family"] == "FreeBSD"
+
+- name: Start node_exporter (FreeBSD)
+  ansible.builtin.service:
+    name: node_exporter
+    state: started
+  when: ansible_facts["os_family"] == "FreeBSD"
diff --git a/ansible/roles/systemd_services/handlers/main.yml b/ansible/roles/systemd_services/handlers/main.yml
new file mode 100644
index 0000000..16b0951
--- /dev/null
+++ b/ansible/roles/systemd_services/handlers/main.yml
@@ -0,0 +1,4 @@
+---
+- name: Reload systemd daemon
+  ansible.builtin.systemd:
+    daemon_reload: true
diff --git a/ansible/roles/systemd_services/tasks/main.yml b/ansible/roles/systemd_services/tasks/main.yml
new file mode 100644
index 0000000..bbb1b6c
--- /dev/null
+++ b/ansible/roles/systemd_services/tasks/main.yml
@@ -0,0 +1,23 @@
+---
+# Deploy custom systemd unit files from the repo.
+# Looks for unit files in services/<service>/<service>.service
+
+- name: Copy systemd unit files
+  ansible.builtin.copy:
+    src: "{{ playbook_dir }}/services/{{ item }}/{{ item }}.service"
+    dest: "/etc/systemd/system/{{ item }}.service"
+    mode: '0644'
+  loop: "{{ systemd_services | default([]) }}"
+  register: systemd_services_unit_files
+  failed_when: false
+  notify: Reload systemd daemon
+
+- name: Enable and start systemd services
+  ansible.builtin.systemd:
+    name: "{{ item.item }}"
+    state: started
+    enabled: true
+    daemon_reload: true  # the notified handler only runs after this task, so reload here to pick up the copied units
+  loop: "{{ systemd_services_unit_files.results | default([]) }}"
+  when: item is not failed
+  failed_when: false
diff --git a/ansible/scripts/docker-log-cleanup.sh b/ansible/scripts/docker-log-cleanup.sh
new file mode 100755
index 0000000..3bb18bd
--- /dev/null
+++ b/ansible/scripts/docker-log-cleanup.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# Truncate large Docker container log files
+# Deployed on: nuremberg-a
+# Cron: 0 3 1 * * /usr/local/bin/docker-log-cleanup.sh
+
+LOG_DIR=/var/lib/docker/containers
+MAX_SIZE_MB=100
+
+find "$LOG_DIR" -name '*-json.log' | while IFS= read -r logfile; do
+    size_mb=$(du -m "$logfile" | cut -f1)
+    if [ "$size_mb" -gt "$MAX_SIZE_MB" ]; then
+        echo "$(date): Truncating $logfile (${size_mb}MB)" >> /var/log/docker-log-cleanup.log
+        truncate -s 0 "$logfile"
+    fi
+done
diff --git a/ansible/scripts/hdd-backup.sh b/ansible/scripts/hdd-backup.sh
new file mode 100755
index 0000000..48aec04
--- /dev/null
+++ b/ansible/scripts/hdd-backup.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# Sync each /hdd/<dir> to the B2 bucket with rclone, then mail a summary via mutt.
+
+set -euo pipefail
+
+BUCKET="b2:london-b-hdd"
+DIRS=(archive backups stash syncthing ftp)
+EMAIL="pez@pez.sh"
+SUBJECT="HDD Backup Report - $(date '+%Y-%m-%d %H:%M')"
+
+failures=()
+report=""
+
+for dir in "${DIRS[@]}"; do
+    src="/hdd/$dir"
+    dst="$BUCKET/$dir"
+    echo "Syncing $src -> $dst"
+
+    # `|| rc=$?` keeps a failed sync from tripping `set -e`, so the remaining
+    # directories are still synced and the failure ends up in the mailed report.
+    rc=0
+    output=$(rclone sync "$src" "$dst" -v 2>&1) || rc=$?
+    # grep exits 1 when it filters out every line; that is not an error here.
+    output=$(grep -v "Can't follow symlink without -L/--copy-links" <<< "$output" || true)
+    [[ $rc -ne 0 ]] && failures+=("$dir")
+
+    report+="=== $dir ===\n$output\n\n"
+done
+
+# Get bucket storage usage
+bucket_usage=$(rclone size "$BUCKET" 2>&1) || bucket_usage="(failed to retrieve bucket size)"
+
+if [[ ${#failures[@]} -gt 0 ]]; then
+    failure_summary="FAILURES: ${failures[*]}"
+else
+    failure_summary="All syncs completed successfully."
+fi
+
+{
+    echo -e "Backup completed: $(date '+%Y-%m-%d %H:%M:%S')"
+    echo -e "$failure_summary\n"
+    echo -e "=== Bucket Usage ===\n$bucket_usage\n"
+    #echo -e "=== Sync Output ===\n$report"
+} | mutt -s "$SUBJECT" "$EMAIL"
diff --git a/ansible/services/README.md b/ansible/services/README.md
new file mode 100644
index 0000000..f2d186e
--- /dev/null
+++ b/ansible/services/README.md
@@ -0,0 +1,58 @@
+# Services
+
+Version-controlled service definitions across the fleet.
+ +## Directory Structure + +``` +services/ +├── systemd/ # systemd unit files (Linux hosts) +│ ├── copenhagen-a/ +│ │ ├── mangos-realmd.service # MaNGOS Zero realm server +│ │ ├── mangos-world.service # MaNGOS Zero world server +│ │ └── cloudflared.service # Cloudflare tunnel (token redacted) +│ └── helsinki-a/ +│ ├── caddy.service # Caddy reverse proxy (stock unit) +│ └── thiswebsitedoesnotexist.service # Node.js app on port 3721 +└── rc.d/ # FreeBSD rc.conf and rc.d scripts + └── london-a/ + └── rc.conf # /etc/rc.conf — all enabled services +``` + +## Notes + +### copenhagen-a (Linux) + +| Service | Unit | Status | Notes | +|---------|------|--------|-------| +| MaNGOS realmd | `mangos-realmd.service` | enabled, custom | Realm server for WoW private server. Depends on MariaDB. | +| MaNGOS world | `mangos-world.service` | enabled, custom | World server. Depends on MariaDB and realmd. | +| cloudflared | `cloudflared.service` | enabled, custom | Cloudflare tunnel. **Token redacted** — replace `${CLOUDFLARED_TOKEN}` with the real token on deploy. | + +### helsinki-a (Linux) + +| Service | Unit | Status | Notes | +|---------|------|--------|-------| +| Caddy | `caddy.service` | enabled, stock | Installed via package manager. Config at `/etc/caddy/Caddyfile`. | +| thiswebsitedoesnotexist | `thiswebsitedoesnotexist.service` | enabled, custom | Node.js app. Env vars in `/opt/thiswebsitedoesnotexist/.env`. | + +### london-a (FreeBSD) + +No custom rc.d scripts — all services installed via `pkg`. 
The `rc.conf` captures all enabled services: + +| Service | rc.conf variable | Notes | +|---------|-----------------|-------| +| Grafana | `grafana_enable="YES"` | Monitoring dashboards | +| Prometheus | `prometheus_enable="YES"` | Metrics collection | +| node_exporter | `node_exporter_enable="YES"` | Host metrics exporter | +| Tailscale | `tailscaled_enable="YES"` | Mesh VPN | +| cloudflared | `cloudflared_enable="YES"` | Cloudflare tunnel (tunnel ID in rc.conf) | +| InfluxDB | `influxd_enable="YES"` | Time-series database | +| libvirtd | `libvirtd_enable="YES"` | Virtualisation daemon | +| Redis | `redis_enable="YES"` | In-memory data store | +| PostgreSQL | `postgresql_enable="YES"` | Relational database | + +## Security + +- The cloudflared token on copenhagen-a has been **redacted** in the committed unit file. The live service uses the real token. +- The cloudflare tunnel ID on london-a is committed as-is (it's not a secret — the tunnel token is separate). diff --git a/ansible/services/authelia/README.md b/ansible/services/authelia/README.md new file mode 100644 index 0000000..e6b522f --- /dev/null +++ b/ansible/services/authelia/README.md @@ -0,0 +1,13 @@ +# Authelia + +SSO authentication portal with LLDAP directory and MariaDB backend. 
+ +- **Host:** helsinki-a +- **URL:** https://auth.pez.sh (integrated via Caddy forward_auth) +- **Components:** + - **Authelia** — SSO portal (port 9091, localhost only) + - **LLDAP** — Lightweight LDAP directory (port 3890 LDAP, port 17170 web UI) + - **MariaDB 11** — Session/config storage +- **Config:** `/root/authelia/config/` +- **Secrets:** `/root/authelia/secrets/` (JWT, session, encryption keys, passwords) +- **LDAP base DN:** `dc=pez,dc=sh` diff --git a/ansible/services/authelia/config.enc.yml.example b/ansible/services/authelia/config.enc.yml.example new file mode 100644 index 0000000..b759e95 --- /dev/null +++ b/ansible/services/authelia/config.enc.yml.example @@ -0,0 +1,10 @@ +# Example: services/authelia/config.enc.yml +# Authelia secrets — encrypt with: sops -e -i config.enc.yml +--- +jwt_secret: CHANGEME +session_secret: CHANGEME +storage_encryption_key: CHANGEME +lldap_admin_password: CHANGEME +mariadb_root_password: CHANGEME +mariadb_authelia_password: CHANGEME +oidc_hmac_secret: CHANGEME diff --git a/ansible/services/authelia/docker-compose.yml b/ansible/services/authelia/docker-compose.yml new file mode 100644 index 0000000..45af2d0 --- /dev/null +++ b/ansible/services/authelia/docker-compose.yml @@ -0,0 +1,77 @@ +# Authelia - SSO/authentication portal with LLDAP + MariaDB +# Host: helsinki-a (100.67.6.27) +# Data: /root/authelia/ +# Access: https://auth.pez.sh (via Caddy forward_auth) + +services: + authelia: + container_name: authelia + image: docker.io/authelia/authelia:latest + restart: unless-stopped + ports: + - '127.0.0.1:9091:9091' + environment: + AUTHELIA_IDENTITY_VALIDATION_RESET_PASSWORD_JWT_SECRET_FILE: /secrets/JWT_SECRET + AUTHELIA_SESSION_SECRET_FILE: /secrets/SESSION_SECRET + AUTHELIA_STORAGE_ENCRYPTION_KEY_FILE: /secrets/STORAGE_ENCRYPTION_KEY + AUTHELIA_STORAGE_MYSQL_PASSWORD_FILE: /secrets/MYSQL_PASSWORD + TZ: UTC + volumes: + - /root/authelia/config:/config + - /root/authelia/secrets:/secrets + depends_on: + mariadb: + 
condition: service_healthy + lldap: + condition: service_started + networks: + - authelia + + mariadb: + container_name: authelia-mariadb + image: docker.io/library/mariadb:11 + restart: unless-stopped + environment: + MYSQL_ROOT_PASSWORD_FILE: /run/secrets/MYSQL_ROOT_PASSWORD + MYSQL_DATABASE: authelia + MYSQL_USER: authelia + MYSQL_PASSWORD_FILE: /run/secrets/MYSQL_PASSWORD + TZ: UTC + volumes: + - /root/authelia/mariadb:/var/lib/mysql + - /root/authelia/secrets/MYSQL_ROOT_PASSWORD:/run/secrets/MYSQL_ROOT_PASSWORD:ro + - /root/authelia/secrets/MYSQL_PASSWORD:/run/secrets/MYSQL_PASSWORD:ro + networks: + - authelia + healthcheck: + test: ["CMD", "healthcheck.sh", "--connect", "--innodb_initialized"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 30s + + lldap: + container_name: authelia-lldap + image: docker.io/lldap/lldap:latest + restart: unless-stopped + ports: + - '17170:17170' # Web UI + - '3890:3890' # LDAP + environment: + UID: '1000' + GID: '1000' + TZ: UTC + LLDAP_LDAP_BASE_DN: dc=pez,dc=sh + LLDAP_LDAP_USER_DN: admin + LLDAP_LDAP_USER_PASS_FILE: /secrets/LLDAP_ADMIN_PASSWORD + LLDAP_JWT_SECRET_FILE: /secrets/LLDAP_JWT_SECRET + volumes: + - /root/authelia/lldap:/data + - /root/authelia/secrets/LLDAP_ADMIN_PASSWORD:/secrets/LLDAP_ADMIN_PASSWORD:ro + - /root/authelia/secrets/LLDAP_JWT_SECRET:/secrets/LLDAP_JWT_SECRET:ro + networks: + - authelia + +networks: + authelia: + driver: bridge diff --git a/ansible/services/bitwarden/README.md b/ansible/services/bitwarden/README.md new file mode 100644 index 0000000..65af996 --- /dev/null +++ b/ansible/services/bitwarden/README.md @@ -0,0 +1,11 @@ +# Bitwarden + +Self-hosted password manager (unified deployment). 
+ +- **Host:** helsinki-a +- **URL:** https://bitwarden.pez.sh +- **Image:** `ghcr.io/bitwarden/self-host:beta` (unified container) +- **Database:** MariaDB 10 +- **Admin:** pez@pez.sh +- **Config:** `settings.env` (env file, not committed — contains secrets) +- **Data:** Docker volumes (`bitwarden`, `data`) diff --git a/ansible/services/bitwarden/docker-compose.yml b/ansible/services/bitwarden/docker-compose.yml new file mode 100644 index 0000000..378c28d --- /dev/null +++ b/ansible/services/bitwarden/docker-compose.yml @@ -0,0 +1,33 @@ +# Bitwarden - Self-hosted password manager +# Host: helsinki-a (100.67.6.27) +# Data: Docker volume (bitwarden) +# Access: https://bitwarden.pez.sh (via Caddy reverse proxy) + +services: + bitwarden: + image: ghcr.io/bitwarden/self-host:beta + restart: always + depends_on: + - db + env_file: + - settings.env + ports: + - "8080:8080" + - "8443:8443" + volumes: + - bitwarden:/etc/bitwarden + + db: + image: mariadb:10 + restart: always + environment: + MARIADB_USER: bitwarden + MARIADB_PASSWORD: "${BW_DB_PASSWORD}" + MARIADB_DATABASE: bitwarden_vault + MARIADB_RANDOM_ROOT_PASSWORD: "true" + volumes: + - data:/var/lib/mysql + +volumes: + bitwarden: + data: diff --git a/ansible/services/caddy/Caddyfile b/ansible/services/caddy/Caddyfile new file mode 100644 index 0000000..7995ef0 --- /dev/null +++ b/ansible/services/caddy/Caddyfile @@ -0,0 +1,246 @@ +# _ __ ___ ____ _ __ _ __ _____ ___ _ +# | '_ \ / _ \_ /____| '_ \| '__/ _ \ \/ / | | | +# | |_) | __// /_____| |_) | | | (_) > <| |_| | +# | .__/ \___/___| | .__/|_| \___/_/\_\\__, | +# |_| |_| |___/ +# + +{ + admin 100.67.6.27:2019 + metrics { + per_host + } +} + +## LONDON-A SERVICES ## + +# Grafana +grafana.pez.solutions, grafana.pez.sh { + forward_auth localhost:9091 { + uri /api/authz/forward-auth + copy_headers Remote-User Remote-Groups Remote-Name Remote-Email + } + reverse_proxy 100.122.219.41:3000 +} + +# Prometheus +prometheus.pez.solutions, prometheus.pez.sh { + forward_auth 
localhost:9091 { + uri /api/authz/forward-auth + copy_headers Remote-User Remote-Groups Remote-Name Remote-Email + } + reverse_proxy 100.122.219.41:9090 +} + +# Alertmanager +alertmanager.pez.solutions, alertmanager.pez.sh { + forward_auth localhost:9091 { + uri /api/authz/forward-auth + copy_headers Remote-User Remote-Groups Remote-Name Remote-Email + } + reverse_proxy 100.122.219.41:3000 +} + +## LONDON-B SERVICES ## + +# Jellyfin +jellyfin.pez.solutions, jellyfin.pez.sh { + reverse_proxy 100.84.65.101:8096 +} + +# Plex +plex.pez.solutions, plex.pez.sh { + reverse_proxy 100.84.65.101:32400 +} + +# Radarr +radarr.pez.solutions, radarr.pez.sh { + forward_auth localhost:9091 { + uri /api/authz/forward-auth + copy_headers Remote-User Remote-Groups Remote-Name Remote-Email + } + reverse_proxy 100.84.65.101:7878 +} + +# Sonarr +sonarr.pez.solutions, sonarr.pez.sh { + forward_auth localhost:9091 { + uri /api/authz/forward-auth + copy_headers Remote-User Remote-Groups Remote-Name Remote-Email + } + reverse_proxy 100.84.65.101:8989 +} + +# Lidarr +lidarr.pez.solutions, lidarr.pez.sh { + forward_auth localhost:9091 { + uri /api/authz/forward-auth + copy_headers Remote-User Remote-Groups Remote-Name Remote-Email + } + reverse_proxy 100.84.65.101:8686 +} + +# Readarr +readarr.pez.solutions, readarr.pez.sh { + forward_auth localhost:9091 { + uri /api/authz/forward-auth + copy_headers Remote-User Remote-Groups Remote-Name Remote-Email + } + reverse_proxy 100.84.65.101:8787 +} + +# slskd +soulseek.pez.solutions, soulseek.pez.sh { + forward_auth localhost:9091 { + uri /api/authz/forward-auth + copy_headers Remote-User Remote-Groups Remote-Name Remote-Email + } + reverse_proxy 100.84.65.101:5030 +} + +# Prowlarr +prowlarr.pez.solutions, prowlarr.pez.sh { + forward_auth localhost:9091 { + uri /api/authz/forward-auth + copy_headers Remote-User Remote-Groups Remote-Name Remote-Email + } + reverse_proxy 100.84.65.101:9696 +} + +# Transmission +download.pez.solutions, download.pez.sh 
{ + forward_auth localhost:9091 { + uri /api/authz/forward-auth + copy_headers Remote-User Remote-Groups Remote-Name Remote-Email + } + reverse_proxy 100.84.65.101:9091 +} + +# Overseerr +request.pez.solutions, request.pez.sh { + reverse_proxy 100.84.65.101:5055 +} + +# Jellyfin Requests +jellyfin-requests.pez.solutions, jellyfin-requests.pez.sh { + reverse_proxy 100.84.65.101:5056 +} + +#WebDAV +#https://cloud.pez.sh { +# reverse_proxy 100.84.65.101:8080 +#} + +https://cloud.pez.sh:443 { + reverse_proxy 100.84.65.101:11000 # Adjust to match APACHE_PORT and APACHE_IP_BINDING. See https://github.com/nextcloud/all-in-one/blob/main/reverse-proxy.md#adapting-the-sample-web-server-configurations-below +} + +music.pez.sh { + reverse_proxy 100.84.65.101:4533 +} + +## LONDON-C SERVICES ## + +## COPENHAGEN-A SERVICES ## + +## COPENHAGEN-B SERVICES ## + +## COPENHAGEN-C SERVICES ## + +## NUREMBERG-A SERVICES ## + +## HELSINKI-A SERVICES ## + +# Bitwarden (requires HTTPS tweaking) +https://bitwarden.pez.sh { + reverse_proxy localhost:8443 { + transport http { + tls_insecure_skip_verify + } + } +} + +# Authelia (requires HTTPS tweaking) +auth.pez.solutions, auth.pez.sh { + reverse_proxy localhost:9091 +} + +ldap.pez.sh { + reverse_proxy 127.0.0.1:17170 +} +#https://auth.pez.sh { +# reverse_proxy 127.0.0.1:9091 { +# transport http { +# tls_insecure_skip_verify +# } +# } +#} + +# Apps dashboard +apps.pez.solutions, apps.pez.sh { + root * /srv/apps + forward_auth localhost:9091 { + uri /api/authz/forward-auth + copy_headers Remote-User Remote-Groups Remote-Name Remote-Email + } + file_server +} + +# Pez.solutions +pez.solutions { + root * /srv/pez.solutions + file_server +} + +# Pez.solutions +cloud.pez.solutions { + root * /srv/cloud.pez.solutions + file_server +} + +# Pez.sh +pez.sh { + root * /srv/pez.sh + file_server +} + +# Pez-signup +signup.pez.solutions { + root * /srv/pez-signup + file_server +} + +# Naveen +naveen.pez.sh { + root * /srv/naveen + file_server +} + +## 
HELSINKI-A SERVICES ## + +# Status page +status.pez.sh { + root * /srv/status + file_server +} + +# Miniflux RSS +rss.pez.sh { + forward_auth localhost:9091 { + uri /api/authz/forward-auth + copy_headers Remote-User Remote-Groups Remote-Name Remote-Email + } + reverse_proxy 100.84.65.101:8181 +} + + +# Forgejo Git Server (auth handled by Forgejo itself) +git.pez.sh { + reverse_proxy localhost:3000 +} + + +# This Website Does Not Exist +thiswebsitedoesnotexist.pez.sh { + reverse_proxy localhost:3721 +} diff --git a/ansible/services/caddy/Caddyfile.template b/ansible/services/caddy/Caddyfile.template new file mode 100644 index 0000000..600d437 --- /dev/null +++ b/ansible/services/caddy/Caddyfile.template @@ -0,0 +1,198 @@ +# Caddyfile template for helsinki-a reverse proxy +# +# Variables (replace before deploying): +# {{HELSINKI_A_IP}} - helsinki-a Tailscale IP (currently 100.67.6.27) +# {{LONDON_A_IP}} - london-a Tailscale IP (currently 100.122.219.41) +# {{LONDON_B_IP}} - london-b Tailscale IP (currently 100.84.65.101) +# {{AUTHELIA_PORT}} - Authelia port (currently 9091) +# {{DOMAIN_PRIMARY}} - Primary domain (currently pez.sh) +# {{DOMAIN_ALT}} - Alternate domain (currently pez.solutions) +# +# Authelia forward_auth pattern: see README.md for details. + +{ + admin {{HELSINKI_A_IP}}:2019 + metrics { + per_host + } +} + +# ============================================================ +# Snippet: Authelia forward_auth +# Include with `import authelia` inside any site block. 
+# ============================================================ +(authelia) { + forward_auth localhost:{{AUTHELIA_PORT}} { + uri /api/authz/forward-auth + copy_headers Remote-User Remote-Groups Remote-Name Remote-Email + } +} + +## LONDON-A SERVICES ## + +# Grafana +grafana.{{DOMAIN_ALT}}, grafana.{{DOMAIN_PRIMARY}} { + import authelia + reverse_proxy {{LONDON_A_IP}}:3000 +} + +# Prometheus +prometheus.{{DOMAIN_ALT}}, prometheus.{{DOMAIN_PRIMARY}} { + import authelia + reverse_proxy {{LONDON_A_IP}}:9090 +} + +# Alertmanager +alertmanager.{{DOMAIN_ALT}}, alertmanager.{{DOMAIN_PRIMARY}} { + import authelia + reverse_proxy {{LONDON_A_IP}}:3000 +} + +## LONDON-B SERVICES ## + +# Jellyfin (no auth — has its own login) +jellyfin.{{DOMAIN_ALT}}, jellyfin.{{DOMAIN_PRIMARY}} { + reverse_proxy {{LONDON_B_IP}}:8096 +} + +# Plex (no auth — has its own login) +plex.{{DOMAIN_ALT}}, plex.{{DOMAIN_PRIMARY}} { + reverse_proxy {{LONDON_B_IP}}:32400 +} + +# Radarr +radarr.{{DOMAIN_ALT}}, radarr.{{DOMAIN_PRIMARY}} { + import authelia + reverse_proxy {{LONDON_B_IP}}:7878 +} + +# Sonarr +sonarr.{{DOMAIN_ALT}}, sonarr.{{DOMAIN_PRIMARY}} { + import authelia + reverse_proxy {{LONDON_B_IP}}:8989 +} + +# Lidarr +lidarr.{{DOMAIN_ALT}}, lidarr.{{DOMAIN_PRIMARY}} { + import authelia + reverse_proxy {{LONDON_B_IP}}:8686 +} + +# Readarr +readarr.{{DOMAIN_ALT}}, readarr.{{DOMAIN_PRIMARY}} { + import authelia + reverse_proxy {{LONDON_B_IP}}:8787 +} + +# slskd (Soulseek) +soulseek.{{DOMAIN_ALT}}, soulseek.{{DOMAIN_PRIMARY}} { + import authelia + reverse_proxy {{LONDON_B_IP}}:5030 +} + +# Prowlarr +prowlarr.{{DOMAIN_ALT}}, prowlarr.{{DOMAIN_PRIMARY}} { + import authelia + reverse_proxy {{LONDON_B_IP}}:9696 +} + +# Transmission +download.{{DOMAIN_ALT}}, download.{{DOMAIN_PRIMARY}} { + import authelia + reverse_proxy {{LONDON_B_IP}}:9091 +} + +# Overseerr (no auth — has its own login) +request.{{DOMAIN_ALT}}, request.{{DOMAIN_PRIMARY}} { + reverse_proxy {{LONDON_B_IP}}:5055 +} + +# Jellyseerr (no auth 
— has its own login) +jellyfin-requests.{{DOMAIN_ALT}}, jellyfin-requests.{{DOMAIN_PRIMARY}} { + reverse_proxy {{LONDON_B_IP}}:5056 +} + +# Nextcloud AIO +https://cloud.{{DOMAIN_PRIMARY}}:443 { + reverse_proxy {{LONDON_B_IP}}:11000 +} + +# Navidrome +music.{{DOMAIN_PRIMARY}} { + reverse_proxy {{LONDON_B_IP}}:4533 +} + +# Miniflux RSS +rss.{{DOMAIN_PRIMARY}} { + import authelia + reverse_proxy {{LONDON_B_IP}}:8181 +} + +## HELSINKI-A SERVICES (localhost) ## + +# Bitwarden +https://bitwarden.{{DOMAIN_PRIMARY}} { + reverse_proxy localhost:8443 { + transport http { + tls_insecure_skip_verify + } + } +} + +# Authelia portal +auth.{{DOMAIN_ALT}}, auth.{{DOMAIN_PRIMARY}} { + reverse_proxy localhost:{{AUTHELIA_PORT}} +} + +# LLDAP web UI +ldap.{{DOMAIN_PRIMARY}} { + reverse_proxy 127.0.0.1:17170 +} + +# Apps dashboard +apps.{{DOMAIN_ALT}}, apps.{{DOMAIN_PRIMARY}} { + root * /srv/apps + import authelia + file_server +} + +# Static sites +{{DOMAIN_ALT}} { + root * /srv/pez.solutions + file_server +} + +cloud.{{DOMAIN_ALT}} { + root * /srv/cloud.pez.solutions + file_server +} + +{{DOMAIN_PRIMARY}} { + root * /srv/pez.sh + file_server +} + +signup.{{DOMAIN_ALT}} { + root * /srv/pez-signup + file_server +} + +naveen.{{DOMAIN_PRIMARY}} { + root * /srv/naveen + file_server +} + +status.{{DOMAIN_PRIMARY}} { + root * /srv/status + file_server +} + +# Forgejo (auth handled by Forgejo itself) +git.{{DOMAIN_PRIMARY}} { + reverse_proxy localhost:3000 +} + +# This Website Does Not Exist +thiswebsitedoesnotexist.{{DOMAIN_PRIMARY}} { + reverse_proxy localhost:3721 +} diff --git a/ansible/services/caddy/README.md b/ansible/services/caddy/README.md new file mode 100644 index 0000000..770e48c --- /dev/null +++ b/ansible/services/caddy/README.md @@ -0,0 +1,129 @@ +# Caddy + +Reverse proxy and TLS termination for all homelab services. +Runs on **helsinki-a** (`100.67.6.27`) as a system service (not Docker). + +Replaces the standalone `pez-proxy` repo. 
+ +## Structure + +``` +services/caddy/ +├── Caddyfile # Live config captured from helsinki-a +├── Caddyfile.template # Templatized version with variable placeholders +└── README.md +``` + +## How It Works + +Helsinki-a sits behind Cloudflare DNS and acts as the single entry point for all +`*.pez.sh` and `*.pez.solutions` traffic. Caddy handles automatic TLS via +Let's Encrypt/ZeroSSL, then reverse-proxies to backend services over the +Tailscale mesh. + +### Traffic Flow + +``` +Internet → Cloudflare → helsinki-a (Caddy) → Tailscale → backend host:port +``` + +### Admin API + +Caddy's admin API listens on `100.67.6.27:2019` (Tailscale-only, not +publicly exposed). `caddy reload` uses it to apply config changes without downtime: + +```bash +caddy reload --config /etc/caddy/Caddyfile +# inspect the currently loaded config via the API (read-only, does not reload): +curl http://100.67.6.27:2019/config/ +``` + +### Metrics + +Caddy exposes Prometheus metrics with `per_host` granularity. Scraped by +Prometheus on london-a. + +## Authelia Forward Auth Pattern + +Most admin-facing services are protected by [Authelia](https://www.authelia.com/) +SSO. Authelia runs on helsinki-a itself (`localhost:9091`) alongside an LLDAP +directory and MariaDB backend (see `services/authelia/`). + +### How forward_auth Works + +Caddy's `forward_auth` directive intercepts every request before it reaches +the upstream. It sends a subrequest to Authelia's verification endpoint: + +``` +forward_auth localhost:9091 { + uri /api/authz/forward-auth + copy_headers Remote-User Remote-Groups Remote-Name Remote-Email +} +``` + +**Flow:** + +1. Client requests `https://grafana.pez.sh/some/page` +2. Caddy sends a verification subrequest to `localhost:9091/api/authz/forward-auth` +3. 
Authelia checks the session cookie: + - **Valid session** → returns 200; Caddy copies identity headers (`Remote-User`, + `Remote-Groups`, `Remote-Name`, `Remote-Email`) and forwards to the upstream + - **No/expired session** → returns a redirect (or a bare 401 to non-browser clients); Caddy relays that response, sending the user to + `auth.pez.sh` to log in via Authelia's portal +4. After login, Authelia sets a session cookie and redirects back to the + original URL + +### Which Services Use Authelia + +| Service | Auth | Reason | +|---------|------|--------| +| Grafana, Prometheus, Alertmanager | Authelia | Admin dashboards | +| Radarr, Sonarr, Lidarr, Readarr | Authelia | Media management | +| Prowlarr, Transmission (download) | Authelia | Download tools | +| slskd (Soulseek) | Authelia | P2P client | +| Miniflux (RSS) | Authelia | RSS reader | +| Apps dashboard | Authelia | Internal apps page | +| Jellyfin, Plex | Own auth | Have built-in user management | +| Overseerr, Jellyseerr | Own auth | Have built-in user management | +| Nextcloud | Own auth | Has built-in user management | +| Navidrome (music) | No Authelia | Proxied directly, without `forward_auth` | +| Bitwarden | Own auth | Has built-in vault auth | +| Forgejo (git) | Own auth | Has built-in user management | +| Authelia portal | N/A | Is the auth system itself | +| LLDAP web UI | N/A | Admin directory management | + +### Template Snippet + +The template file uses a Caddy snippet to DRY up the auth block: + +``` +(authelia) { + forward_auth localhost:{{AUTHELIA_PORT}} { + uri /api/authz/forward-auth + copy_headers Remote-User Remote-Groups Remote-Name Remote-Email + } +} +``` + +Usage in a site block: `import authelia` + +## Template Variables + +The `Caddyfile.template` replaces hardcoded values with placeholders: + +| Variable | Current Value | Description | +|----------|--------------|-------------| +| `{{HELSINKI_A_IP}}` | `100.67.6.27` | helsinki-a Tailscale IP | +| `{{LONDON_A_IP}}` | `100.122.219.41` | london-a Tailscale IP | +| `{{LONDON_B_IP}}` | `100.84.65.101` | london-b 
Tailscale IP | +| `{{AUTHELIA_PORT}}` | `9091` | Authelia verification port | +| `{{DOMAIN_PRIMARY}}` | `pez.sh` | Primary domain | +| `{{DOMAIN_ALT}}` | `pez.solutions` | Alternate domain | + +## Notes + +- The live Caddyfile on helsinki-a is at `/etc/caddy/Caddyfile` +- Caddy auto-provisions TLS certificates for all listed domains +- The Alertmanager proxy currently points to port 3000 (same as Grafana) — this may be intentional (Grafana's built-in alerting UI) or a copy-paste issue worth checking +- Commented-out WebDAV block was replaced by the Nextcloud AIO reverse proxy +- Static sites (`pez.sh`, `pez.solutions`, etc.) are served from `/srv/` on helsinki-a diff --git a/ansible/services/forgejo/README.md b/ansible/services/forgejo/README.md new file mode 100644 index 0000000..92fc65a --- /dev/null +++ b/ansible/services/forgejo/README.md @@ -0,0 +1,9 @@ +# Forgejo + +Self-hosted Git forge (Gitea fork). + +- **Host:** helsinki-a +- **URL:** https://git.pez.sh +- **SSH:** git.pez.sh:2222 +- **Data:** `/srv/forgejo/data` +- **Registration:** Disabled (private instance) diff --git a/ansible/services/forgejo/docker-compose.yml b/ansible/services/forgejo/docker-compose.yml new file mode 100644 index 0000000..b21563c --- /dev/null +++ b/ansible/services/forgejo/docker-compose.yml @@ -0,0 +1,26 @@ +# Forgejo - Self-hosted Git forge +# Host: helsinki-a (100.67.6.27) +# Data: /srv/forgejo/data +# Access: https://git.pez.sh (via Caddy reverse proxy) +# SSH: git.pez.sh:2222 (host port 2222 -> container port 22) + +services: + forgejo: + image: codeberg.org/forgejo/forgejo:10 + container_name: forgejo + restart: unless-stopped + environment: + - USER_UID=1000 + - USER_GID=1000 + - FORGEJO__server__ROOT_URL=https://git.pez.sh + - FORGEJO__server__SSH_DOMAIN=git.pez.sh + - FORGEJO__server__SSH_PORT=2222 # port advertised in SSH clone URLs; must match the host side of the 2222:22 mapping below + - FORGEJO__server__HTTP_PORT=3000 + - FORGEJO__server__DISABLE_SSH=false + - FORGEJO__service__DISABLE_REGISTRATION=true + - FORGEJO__service__REQUIRE_SIGNIN_VIEW=false + ports: + - '127.0.0.1:3000:3000' + - 
'0.0.0.0:2222:22' + volumes: + - /srv/forgejo/data:/data diff --git a/ansible/services/grafana/README.md b/ansible/services/grafana/README.md new file mode 100644 index 0000000..c5fce0d --- /dev/null +++ b/ansible/services/grafana/README.md @@ -0,0 +1,62 @@ +# Grafana + +Grafana dashboards, alerting rules, and provisioning config for the homelab/cloud stack. +Runs on **london-a** (FreeBSD, `100.122.219.41`) as a native service (not Docker). + +Migrated from the standalone `pez-grafana` repo. + +## Structure + +``` +services/grafana/ +├── dashboards/ # Dashboard JSON files +│ ├── infrastructure.json # Infrastructure overview (linux hosts) +│ ├── living-room-display.json # Kiosk/TV dashboard +│ ├── node-exporter-full.json # Full node exporter metrics +│ └── traffic-slo.json # Traffic / SLO tracking +└── provisioning/ # Grafana provisioning files + ├── alerting/ + │ ├── contact-points.yml # Alert receivers (PagerDuty, email) + │ ├── notification-policy.yml # Routing: critical → PagerDuty, warning → email + │ ├── rules-critical.yml # Tier 1: pages PagerDuty immediately + │ └── rules-warning.yml # Tier 2: email only + ├── dashboards/ + │ └── dashboards.yml # Dashboard file provider config + └── datasources/ + └── datasources.json # Prometheus datasource (localhost:9090) +``` + +## Alert Tiers + +| Tier | Routing | Examples | +|----------|------------|--------------------------------------------| +| Critical | PagerDuty | Host down, disk >95%, memory >95% | +| Warning | Email | Disk >80%, memory >85%, high load/CPU | + +## Deployment + +Deployed via the monorepo's `ansible/deploy.yml` (Stage 4e: Monitoring stack). + +```bash +cd ansible +ansible-playbook deploy.yml --limit london-a --tags monitoring +``` + +Provisioning files are synced to `/usr/local/etc/grafana/provisioning/` and dashboards +to `/usr/local/etc/grafana/dashboards/` on london-a. Grafana is restarted after changes. 
+ +### Notes + +- The old `pez-grafana` repo deployed provisioning to `/usr/local/share/grafana/conf/provisioning/`. + The monorepo uses `/usr/local/etc/grafana/` — verify the correct path on london-a before first deploy. +- PagerDuty integration key is referenced via `${PAGERDUTY_INTEGRATION_KEY}` env var (not stored in repo). +- Grafana password is not committed; pass via `--extra-vars` or env. + +## Importing Dashboards Manually + +```bash +curl -X POST -H "Content-Type: application/json" \ + -u admin:password \ + -d "{\"dashboard\": $(cat dashboards/infrastructure.json), \"overwrite\": true}" \ + http://localhost:3000/api/dashboards/db +``` diff --git a/ansible/services/grafana/dashboards/infrastructure.json b/ansible/services/grafana/dashboards/infrastructure.json new file mode 100644 index 0000000..ddab5b0 --- /dev/null +++ b/ansible/services/grafana/dashboards/infrastructure.json @@ -0,0 +1,1034 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [ + { + "icon": "external link", + "tags": [], + "targetBlank": true, + "title": "GitHub", + "type": "link", + "url": "https://github.com/rfmoz/grafana-dashboards" + }, + { + "icon": "external link", + "tags": [], + "targetBlank": true, + "title": "Grafana", + "type": "link", + "url": "https://grafana.com/grafana/dashboards/1860" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 261, + "panels": [], + "repeat": "nodename", + "title": "", + "type": "row" + }, + { + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 
1 + }, + "id": 324, + "options": { + "code": { + "language": "plaintext", + "showLineNumbers": false, + "showMiniMap": false + }, + "content": "\n \n

${nodename}

", + "mode": "html" + }, + "pluginVersion": "12.1.0", + "title": "", + "type": "text" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Resource pressure via PSI", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "links": [], + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "dark-yellow", + "value": 70 + }, + { + "color": "dark-red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 3, + "y": 1 + }, + "id": 323, + "options": { + "displayMode": "basic", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 300, + "minVizHeight": 10, + "minVizWidth": 0, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "text": {}, + "valueMode": "color" + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "irate(node_pressure_cpu_waiting_seconds_total{server=\"${nodename}\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "instant": true, + "legendFormat": "CPU", + "range": false, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "exemplar": false, + "expr": "irate(node_pressure_memory_waiting_seconds_total{server=\"${nodename}\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "instant": true, + "legendFormat": "Mem", + "range": false, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "exemplar": false, + "expr": "irate(node_pressure_io_waiting_seconds_total{server=\"${nodename}\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "instant": true, + "legendFormat": "I/O", + 
"range": false, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "exemplar": false, + "expr": "irate(node_pressure_irq_stalled_seconds_total{server=\"${nodename}\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "instant": true, + "legendFormat": "Irq", + "range": false, + "refId": "D", + "step": 240 + } + ], + "title": "Pressure", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Overall CPU busy percentage (averaged across all cores)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": 0 + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 85 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 6, + "y": 1 + }, + "id": 20, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", server=\"${nodename}\"}[$__rate_interval])))", + "instant": true, + "legendFormat": "", + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "CPU Busy", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "System load over all CPU cores together", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, 
+ "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": 0 + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 85 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 9, + "y": 1 + }, + "id": 155, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "scalar(node_load1{server=\"${nodename}\",job=\"$job\"}) * 100 / count(count(node_cpu_seconds_total{server=\"${nodename}\",job=\"$job\"}) by (cpu))", + "format": "time_series", + "instant": true, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "Sys Load", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Real RAM usage excluding cache and reclaimable memory", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": 0 + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 80 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 12, + "y": 1 + }, + "id": 16, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": 
false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "clamp_min((1 - (node_memory_MemAvailable_bytes{server=\"${nodename}\", job=\"$job\"} / node_memory_MemTotal_bytes{server=\"${nodename}\", job=\"$job\"})) * 100, 0)", + "format": "time_series", + "instant": true, + "range": false, + "refId": "B", + "step": 240 + } + ], + "title": "RAM Used", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Used Root FS", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": 0 + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 80 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 15, + "y": 1 + }, + "id": 154, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "(\n (node_filesystem_size_bytes{server=\"${nodename}\", job=\"$job\", mountpoint=\"/\", fstype!=\"rootfs\"}\n - node_filesystem_avail_bytes{server=\"${nodename}\", job=\"$job\", mountpoint=\"/\", fstype!=\"rootfs\"})\n / node_filesystem_size_bytes{server=\"${nodename}\", job=\"$job\", mountpoint=\"/\", fstype!=\"rootfs\"}\n) * 100\n", + "format": "time_series", + "instant": true, + "range": 
false, + "refId": "A", + "step": 240 + } + ], + "title": "Root FS Used", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 18, + "y": 1 + }, + "id": 14, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "count(count(node_cpu_seconds_total{server=\"${nodename}\",job=\"$job\"}) by (cpu))", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "CPU Cores", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 20, + "y": 1 + }, + "id": 75, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": 
"horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "node_memory_MemTotal_bytes{server=\"${nodename}\",job=\"$job\"}", + "instant": true, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "RAM Total", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 22, + "y": 1 + }, + "id": 18, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "node_memory_SwapTotal_bytes{server=\"${nodename}\",job=\"$job\"}", + "instant": true, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "SWAP Total", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + 
"result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 18, + "y": 3 + }, + "id": 15, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "node_time_seconds{server=\"${nodename}\",job=\"$job\"} - node_boot_time_seconds{server=\"${nodename}\",job=\"$job\"}", + "instant": true, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "Uptime", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": 0 + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 70 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 22, + "y": 3 + }, + "id": 23, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, 
+ "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "node_filesystem_size_bytes{server=\"${nodename}\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"}", + "format": "time_series", + "instant": true, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "RootFS Total", + "type": "stat" + } + ], + "preload": false, + "refresh": "1m", + "schemaVersion": 41, + "tags": [ + "linux" + ], + "templating": { + "list": [ + { + "current": { + "text": "prometheus", + "value": "bezqqznn81wqof" + }, + "includeAll": false, + "label": "Datasource", + "name": "ds_prometheus", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "current": { + "text": "node_exporter", + "value": "node_exporter" + }, + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "definition": "", + "includeAll": false, + "label": "Job", + "name": "job", + "options": [], + "query": { + "query": "label_values(node_uname_info, job)", + "refId": "Prometheus-job-Variable-Query" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": { + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "definition": "label_values(node_uname_info{job=\"$job\"}, nodename)", + "includeAll": true, + "label": "Nodename", + "name": "nodename", + "options": [], + "query": { + "query": "label_values(node_uname_info{job=\"$job\"}, nodename)", + "refId": "Prometheus-nodename-Variable-Query" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": { + "text": "", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "definition": "label_values(node_uname_info{job=\"$job\", nodename=\"$nodename\"}, instance)", + "includeAll": false, + "label": "Instance", + "name": "node", + 
"options": [], + "query": { + "query": "label_values(node_uname_info{job=\"$job\", nodename=\"$nodename\"}, instance)", + "refId": "Prometheus-node-Variable-Query" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "nowDelay": "1m" + }, + "timezone": "browser", + "title": "Infrastructure", + "uid": "rYdddlPWkd" +} diff --git a/ansible/services/grafana/dashboards/living-room-display.json b/ansible/services/grafana/dashboards/living-room-display.json new file mode 100644 index 0000000..1025d64 --- /dev/null +++ b/ansible/services/grafana/dashboards/living-room-display.json @@ -0,0 +1,1011 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "liveNow": true, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 5, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "editorMode": "code", 
+ "expr": "100 * (1 - avg(rate(node_cpu_seconds_total{server=\"london-b\", mode=\"idle\"}[$__rate_interval])))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "CPU", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 5, + "y": 0 + }, + "id": 8, + "options": { + "calculate": false, + "calculation": { + "yBuckets": { + "mode": "size", + "scale": { + "type": "linear" + }, + "value": "" + } + }, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "max": 80, + "min": 35, + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdYlGn", + "steps": 3 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-09 + }, + "legend": { + "show": false + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisLabel": "Temp", + "axisPlacement": "left", + "decimals": 1, + "max": "100", + "min": 0, + "reverse": false, + "unit": "celsius" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "sum(smartctl_device_temperature) by (device)", + "instant": false, + "interval": "10m", + "legendFormat": "{{device}}", + "range": true, + "refId": "A" + } + ], + "title": "Drive Temperature", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [], + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + 
"h": 10, + "w": 7, + "x": 11, + "y": 0 + }, + "id": 4, + "options": { + "displayLabels": [], + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "pieType": "donut", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "expr": "node_filesystem_avail_bytes{server=\"london-b\",mountpoint=\"/hdd\"}", + "legendFormat": "Available", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "editorMode": "code", + "expr": "node_filesystem_size_bytes{server=\"london-b\",mountpoint=\"/hdd\"} - node_filesystem_avail_bytes{server=\"london-b\",mountpoint=\"/hdd\"}", + "hide": false, + "legendFormat": "Used", + "range": true, + "refId": "B" + } + ], + "title": "HDD Space", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "/.*/", + "values": false + }, + "showPercentChange": false, + "textMode": "name", + "wideLayout": true + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "node_zfs_zpool_state{server=\"london-b\",zpool=\"hdd\", state=\"online\"} > 0", + "instant": true, + "legendFormat": "Online", + "range": false, + "refId": "A" + 
}, + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "editorMode": "code", + "exemplar": false, + "expr": "node_zfs_zpool_state{server=\"london-b\",zpool=\"hdd\", state=\"degraded\"} > 0", + "hide": false, + "instant": true, + "legendFormat": "Degraded", + "range": false, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "editorMode": "code", + "exemplar": false, + "expr": "node_zfs_zpool_state{server=\"london-b\",zpool=\"hdd\", state=\"faulted\"} > 0", + "hide": false, + "instant": true, + "legendFormat": "Faulted", + "range": false, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "editorMode": "code", + "exemplar": false, + "expr": "node_zfs_zpool_state{server=\"london-b\",zpool=\"hdd\", state=\"offline\"} > 0", + "hide": false, + "instant": true, + "legendFormat": "Offline", + "range": false, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "editorMode": "code", + "exemplar": false, + "expr": "node_zfs_zpool_state{server=\"london-b\",zpool=\"hdd\", state=\"removed\"} > 0", + "hide": false, + "instant": true, + "legendFormat": "Removed", + "range": false, + "refId": "E" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "editorMode": "code", + "exemplar": false, + "expr": "node_zfs_zpool_state{server=\"london-b\",zpool=\"hdd\", state=\"suspended\"} > 0", + "hide": false, + "instant": true, + "legendFormat": "Suspended", + "range": false, + "refId": "F" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "editorMode": "code", + "exemplar": false, + "expr": "node_zfs_zpool_state{server=\"london-b\",zpool=\"hdd\", state=\"unavail\"} > 0", + "hide": false, + "instant": true, + "legendFormat": "Unavailable", + "range": false, + "refId": "G" + } + ], + "title": "HDD State", + "transformations": [ + { + "id": "filterFieldsByName", 
+ "options": { + "include": { + "names": [ + "Online" + ] + } + } + } + ], + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "#EAB839", + "value": 80 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 5, + "x": 0, + "y": 5 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "editorMode": "code", + "expr": "clamp_min((1 - (node_memory_MemAvailable_bytes{server=\"london-b\"} / node_memory_MemTotal_bytes{server=\"london-b\"})) * 100, 0)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Memory", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-RdYlGr" + }, + "custom": { + "axisPlacement": "auto", + "fillOpacity": 70, + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineWidth": 0, + "spanNulls": false + }, + "decimals": 0, + "mappings": [], + "max": 1, + "min": 0, + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 5, + "y": 5 + }, + "id": 11, 
+ "options": { + "alignValue": "left", + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "mergeValues": false, + "rowHeight": 0.9, + "showValue": "auto", + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "expr": "smartctl_device_smart_status", + "interval": "10m", + "legendFormat": "{{device}}", + "range": true, + "refId": "A" + } + ], + "title": "Drive Health", + "type": "state-timeline" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 18, + "y": 5 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "editorMode": "code", + "expr": "node_filesystem_avail_bytes{server=\"london-b\",mountpoint=\"/hdd\"}", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Available Space", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + 
"x": 21, + "y": 5 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "editorMode": "code", + "expr": "node_filesystem_size_bytes{server=\"london-b\",mountpoint=\"/hdd\"} - node_filesystem_avail_bytes{server=\"london-b\",mountpoint=\"/hdd\"}", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Used Space", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false + }, + "mappings": [ + { + "options": { + "Mak999": { + "index": 4, + "text": "Amar" + }, + "Malene Wejlgaard Knudsen": { + "index": 5, + "text": "Malene" + }, + "d.han81": { + "index": 2, + "text": "Han" + }, + "er1227": { + "index": 1, + "text": "Erik" + }, + "guykeren437": { + "index": 15, + "text": "Guy" + }, + "isab579": { + "index": 3, + "text": "Scoulers Daughter" + }, + "naveen.629": { + "index": 6, + "text": "Naveen" + }, + "pe423": { + "index": 13, + "text": "Living Room" + }, + "praczyk.": { + "index": 7, + "text": "Trevor" + }, + "pravee63": { + "index": 8, + "text": "Praveen" + }, + "scou210": { + "index": 9, + "text": "Scouler" + }, + "sorghumc": { + "index": 10, + "text": "Anton" + }, + "theonet5": { + "index": 11, + "text": "Trevor" + }, + "theonetb": { + "index": 12, + "text": "Trevor" + }, + "wooley_82": { + "index": 0, + "text": "Wooly" + }, + "yp2xc": { + "index": 14, + "text": "Trevor" + } + }, + "type": "value" + } + ], + 
"thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 20, + "x": 0, + "y": 10 + }, + "id": 12, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "plays_total{user!=\"Rasmus\"}", + "format": "table", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "User", + "range": false, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(plays_total) by (title)", + "format": "table", + "hide": true, + "instant": true, + "legendFormat": "Title", + "range": false, + "refId": "B" + } + ], + "title": "Current Activity", + "transformations": [ + { + "id": "filterFieldsByName", + "options": { + "include": { + "names": [ + "child_title", + "stream_resolution", + "stream_type", + "title", + "user", + "grandchild_title" + ] + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "includeByName": {}, + "indexByName": { + "child_title": 2, + "grandchild_title": 3, + "stream_resolution": 5, + "stream_type": 4, + "title": 1, + "user": 0 + }, + "renameByName": { + "child_title": "Season", + "grandchild_title": "Episode Title", + "stream_resolution": "Resolution", + "stream_type": "Stream", + "title": "Title", + "user": "User" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "fieldMinMax": false, + "mappings": [ + { + "options": { + "wooley_82": { + "index": 0, + "text": 
"Wooly" + } + }, + "type": "value" + } + ], + "min": 0, + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 4, + "x": 20, + "y": 10 + }, + "id": 13, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "count(plays_total{user!=\"Rasmus\"})", + "format": "table", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A", + "useBackend": false + } + ], + "title": "Active Streams", + "type": "stat" + } + ], + "preload": false, + "refresh": "30s", + "schemaVersion": 41, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": { + "nowDelay": "0m" + }, + "timezone": "browser", + "title": "Living Room Display", + "uid": "a68bd259-c836-4fad-b33e-98f1a52a5eb9" +} diff --git a/ansible/services/grafana/dashboards/node-exporter-full.json b/ansible/services/grafana/dashboards/node-exporter-full.json new file mode 100644 index 0000000..f0270d4 --- /dev/null +++ b/ansible/services/grafana/dashboards/node-exporter-full.json @@ -0,0 +1,15726 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } 
+ ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": 1860, + "graphTooltip": 1, + "links": [ + { + "icon": "external link", + "tags": [], + "targetBlank": true, + "title": "GitHub", + "type": "link", + "url": "https://github.com/rfmoz/grafana-dashboards" + }, + { + "icon": "external link", + "tags": [], + "targetBlank": true, + "title": "Grafana", + "type": "link", + "url": "https://grafana.com/grafana/dashboards/1860" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 261, + "panels": [], + "title": "Quick CPU / Mem / Disk", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Resource pressure via PSI", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "links": [], + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green" + }, + { + "color": "dark-yellow", + "value": 70 + }, + { + "color": "dark-red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 1 + }, + "id": 323, + "options": { + "displayMode": "basic", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 300, + "minVizHeight": 10, + "minVizWidth": 0, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "text": {}, + "valueMode": "color" + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "irate(node_pressure_cpu_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "instant": true, + "legendFormat": "CPU", + "range": false, + "refId": "A", + "step": 240 + }, 
+ { + "editorMode": "code", + "exemplar": false, + "expr": "irate(node_pressure_memory_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "instant": true, + "legendFormat": "Mem", + "range": false, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "exemplar": false, + "expr": "irate(node_pressure_io_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "instant": true, + "legendFormat": "I/O", + "range": false, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "exemplar": false, + "expr": "irate(node_pressure_irq_stalled_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "instant": true, + "legendFormat": "Irq", + "range": false, + "refId": "D", + "step": 240 + } + ], + "title": "Pressure", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Overall CPU busy percentage (averaged across all cores)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 85 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 3, + "y": 1 + }, + "id": 20, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + 
"exemplar": false, + "expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", instance=\"$node\"}[$__rate_interval])))", + "instant": true, + "legendFormat": "", + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "CPU Busy", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "System load over all CPU cores together", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 85 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 6, + "y": 1 + }, + "id": 155, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "scalar(node_load1{instance=\"$node\",job=\"$job\"}) * 100 / count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))", + "format": "time_series", + "instant": true, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "Sys Load", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Real RAM usage excluding cache and reclaimable memory", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + 
"steps": [ + { + "color": "rgba(50, 172, 45, 0.97)" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 80 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 9, + "y": 1 + }, + "id": 16, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "clamp_min((1 - (node_memory_MemAvailable_bytes{instance=\"$node\", job=\"$job\"} / node_memory_MemTotal_bytes{instance=\"$node\", job=\"$job\"})) * 100, 0)", + "format": "time_series", + "instant": true, + "range": false, + "refId": "B", + "step": 240 + } + ], + "title": "RAM Used", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Percentage of swap space currently used by the system", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 10 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 25 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 12, + "y": 1 + }, + "id": 21, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + 
"pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "((node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"}) / (node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"})) * 100", + "instant": true, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "SWAP Used", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Used Root FS", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 80 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 15, + "y": 1 + }, + "id": 154, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "(\n (node_filesystem_size_bytes{instance=\"$node\", job=\"$job\", mountpoint=\"/\", fstype!=\"rootfs\"}\n - node_filesystem_avail_bytes{instance=\"$node\", job=\"$job\", mountpoint=\"/\", fstype!=\"rootfs\"})\n / node_filesystem_size_bytes{instance=\"$node\", job=\"$job\", mountpoint=\"/\", fstype!=\"rootfs\"}\n) * 100\n", + "format": "time_series", + "instant": true, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "Root FS Used", + "type": "gauge" + }, + { + "datasource": { + 
"type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 18, + "y": 1 + }, + "id": 14, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "CPU Cores", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 20, + "y": 1 + }, + "id": 75, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + 
"showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"}", + "instant": true, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "RAM Total", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 22, + "y": 1 + }, + "id": 18, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"}", + "instant": true, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "SWAP Total", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)" + }, + { + "color": 
"rgba(237, 129, 40, 0.89)", + "value": 70 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 18, + "y": 3 + }, + "id": 23, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"}", + "format": "time_series", + "instant": true, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "RootFS Total", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 20, + "y": 3 + }, + "id": 15, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": 
"node_time_seconds{instance=\"$node\",job=\"$job\"} - node_boot_time_seconds{instance=\"$node\",job=\"$job\"}", + "instant": true, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "Uptime", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 + }, + "id": 263, + "panels": [], + "title": "Basic CPU / Mem / Net / Disk", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "CPU time spent busy vs idle, split by activity type", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "percent" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Busy Iowait" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#890F02", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Idle" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Busy System" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + 
"matcher": { + "id": "byName", + "options": "Busy User" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A437C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Busy Other" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 6 + }, + "id": 77, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true, + "width": 250 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"system\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "instant": false, + "legendFormat": "Busy System", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"user\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "legendFormat": "Busy User", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"iowait\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "legendFormat": "Busy Iowait", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=~\".*irq\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by 
(cpu)))", + "format": "time_series", + "legendFormat": "Busy IRQs", + "range": true, + "refId": "D", + "step": 240 + }, + { + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode!='idle',mode!='user',mode!='system',mode!='iowait',mode!='irq',mode!='softirq'}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "legendFormat": "Busy Other", + "range": true, + "refId": "E", + "step": 240 + }, + { + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"idle\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "legendFormat": "Idle", + "range": true, + "refId": "F", + "step": 240 + } + ], + "title": "CPU Basic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "RAM and swap usage overview, including caches", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + 
"options": "Swap used" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.stacking", + "value": { + "group": false, + "mode": "normal" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache + Buffer" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 6 + }, + "id": 78, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Total", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"} - (node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} + node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} + node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "legendFormat": "Used", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} + node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} + 
node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Cache + Buffer", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Free", + "range": true, + "refId": "D", + "step": 240 + }, + { + "editorMode": "code", + "expr": "(node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "legendFormat": "Swap used", + "range": true, + "refId": "E", + "step": 240 + } + ], + "title": "Memory Basic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Per-interface network traffic (receive and transmit) in bits per second", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Tx.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 13 + }, + "id": 74, + 
"options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "rate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", + "format": "time_series", + "legendFormat": "Rx {{device}}", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "rate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", + "format": "time_series", + "legendFormat": "Tx {{device}} ", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Network Traffic Basic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Percentage of filesystem space used for each mounted device", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 13 + }, + "id": 152, + "options": { + "legend": { + "calcs": [], + 
"displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "((node_filesystem_size_bytes{instance=\"$node\", job=\"$job\", device!~\"rootfs\"} - node_filesystem_avail_bytes{instance=\"$node\", job=\"$job\", device!~\"rootfs\"}) / node_filesystem_size_bytes{instance=\"$node\", job=\"$job\", device!~\"rootfs\"}) * 100", + "format": "time_series", + "legendFormat": "{{mountpoint}}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Disk Space Used Basic", + "type": "timeseries" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 265, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "CPU time usage split by state, normalized across all CPU cores", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 70, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "percent" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Idle - Waiting for something to happen" + }, + "properties": [ + { + "id": "color", + "value": { + 
"fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Iowait - Waiting for I/O to complete" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Irq - Servicing interrupts" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Nice - Niced processes executing in user mode" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C15C17", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Softirq - Servicing softirqs" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Steal - Time spent in other operating systems when running in a virtualized environment" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#FCE2DE", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "System - Processes executing in kernel mode" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "User - Normal processes executing in user mode" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#5195CE", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Guest CPU usage" + }, + "properties": [ + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + }, + { + "id": "custom.stacking", + "value": { + "group": "A", + "mode": "none" + } + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 21 + }, + "id": 3, + 
"options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 250 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{mode=\"system\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "interval": "", + "legendFormat": "System - Processes executing in kernel mode", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{mode=\"user\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "legendFormat": "User - Normal processes executing in user mode", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{mode=\"nice\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "legendFormat": "Nice - Niced processes executing in user mode", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{mode=\"iowait\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "legendFormat": "Iowait - Waiting for I/O to complete", + "range": true, + "refId": "D", + "step": 240 + }, + { + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{mode=\"irq\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / 
scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "legendFormat": "Irq - Servicing interrupts", + "range": true, + "refId": "E", + "step": 240 + }, + { + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{mode=\"softirq\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "legendFormat": "Softirq - Servicing softirqs", + "range": true, + "refId": "F", + "step": 240 + }, + { + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{mode=\"steal\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "legendFormat": "Steal - Time spent in other operating systems when running in a virtualized environment", + "range": true, + "refId": "G", + "step": 240 + }, + { + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{mode=\"idle\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "legendFormat": "Idle - Waiting for something to happen", + "range": true, + "refId": "H", + "step": 240 + }, + { + "editorMode": "code", + "expr": "sum by(instance) (irate(node_cpu_guest_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])) / on(instance) group_left sum by (instance)((irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]))) > 0", + "format": "time_series", + "legendFormat": "Guest CPU usage", + "range": true, + "refId": "I", + "step": 240 + } + ], + "title": "CPU", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Breakdown of physical memory and swap usage. 
Hardware-detected memory errors are also displayed", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Apps" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#629E51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Buffers" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cached" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Committed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A437C", + "mode": "fixed" + } + } + ] + 
}, + { + "matcher": { + "id": "byName", + "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#CFFAFF", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PageTables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Page_Tables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "RAM_Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#806EB7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap - Swap space used" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C15C17", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Free" + }, + "properties": [ + { + "id":
"color", + "value": { + "fixedColor": "#2F575E", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Unused" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Unused - Free memory unassigned" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*Hardware Corrupted - .*/" + }, + "properties": [ + { + "id": "custom.stacking", + "value": { + "group": false, + "mode": "normal" + } + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 21 + }, + "id": 24, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Slab_bytes{instance=\"$node\",job=\"$job\"} - node_memory_PageTables_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapCached_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Apps - Memory used by user-space applications", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_PageTables_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "PageTables - Memory used to map between virtual and physical memory addresses", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr":
"node_memory_SwapCached_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "SwapCache - Memory that keeps track of pages that have been fetched from swap but not yet been modified", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_Slab_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Slab - Memory used by the kernel to cache data structures for its own use (caches like inode, dentry, etc)", + "range": true, + "refId": "D", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Cache - Parked file data (file content) cache", + "range": true, + "refId": "E", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Buffers - Block device (e.g. harddisk) cache", + "range": true, + "refId": "F", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Unused - Free memory unassigned", + "range": true, + "refId": "G", + "step": 240 + }, + { + "editorMode": "code", + "expr": "(node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "legendFormat": "Swap - Swap space used", + "range": true, + "refId": "H", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_HardwareCorrupted_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working", + "range": true, + "refId": "I", + "step": 240 + } + ], + "title": "Memory", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"${ds_prometheus}" + }, + "description": "Incoming and outgoing network traffic per interface", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 433 + }, + "id": 84, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "rate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", + "format": "time_series", + "legendFormat": "{{device}} - Rx in", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "rate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", + "format": "time_series", + "legendFormat": "{{device}} - Tx out", + "range": true, + "refId": "B", + "step": 240 + } 
+ ], + "title": "Network Traffic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Network interface utilization as a percentage of its maximum capacity", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 433 + }, + "id": 338, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "rate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])\n / ignoring(speed) node_network_speed_bytes{instance=\"$node\",job=\"$job\", speed!=\"-1\"}", + "format": "time_series", + "legendFormat": "{{device}} - Rx in", + "range": true, + "refId": "A", + "step": 240 + }, + { + 
"editorMode": "code", + "expr": "(rate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])\n / ignoring(speed) node_network_speed_bytes{instance=\"$node\",job=\"$job\", speed!=\"-1\"})", + "format": "time_series", + "legendFormat": "{{device}} - Tx out", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Network Saturation", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Disk I/O operations per second for each device", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "read (-) / write (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "iops" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Read.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 445 + }, + "id": 229, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": 
"irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\",device=~\"[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+\"}[$__rate_interval])", + "legendFormat": "{{device}} - Read", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\",device=~\"[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+\"}[$__rate_interval])", + "legendFormat": "{{device}} - Write", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Disk IOps", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Disk I/O throughput per device", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "read (-) / write (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "Bps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Read.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 445 + }, + "id": 42, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { +
"hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_disk_read_bytes_total{instance=\"$node\",job=\"$job\",device=~\"[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Read", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_disk_written_bytes_total{instance=\"$node\",job=\"$job\",device=~\"[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Write", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Disk Throughput", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Amount of available disk space per mounted filesystem, excluding rootfs. Based on block availability to non-root users", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 457 + }, + "id": 43, + "options": { + "legend": { + "calcs": [ + "min", + 
"mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "legendFormat": "{{mountpoint}}", + "metric": "", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_filesystem_free_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "hide": true, + "legendFormat": "{{mountpoint}} - Free", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "hide": true, + "legendFormat": "{{mountpoint}} - Size", + "range": true, + "refId": "C", + "step": 240 + } + ], + "title": "Filesystem Space Available", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Disk usage (used = total - available) per mountpoint", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": 
"absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 457 + }, + "id": 156, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'} - node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "legendFormat": "{{mountpoint}}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Filesystem Used", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Percentage of time the disk was actively processing I/O operations", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 469 + }, + "id": 127, + "options": { + "legend": { 
+ "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_disk_io_time_seconds_total{instance=\"$node\",job=\"$job\",device=~\"[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+\"} [$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "{{device}}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Disk I/O Utilization", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "How often tasks experience CPU, memory, or I/O delays. \u201cSome\u201d indicates partial slowdown; \u201cFull\u201d indicates all tasks are stalled. Based on Linux PSI metrics:\nhttps://docs.kernel.org/accounting/psi.html", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "some (-) / full (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Some.*/" + }, + "properties": [ + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + }, + { + 
"matcher": { + "id": "byRegexp", + "options": "/.*Some.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 469 + }, + "id": 322, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "rate(node_pressure_cpu_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "CPU - Some", + "range": true, + "refId": "CPU some", + "step": 240 + }, + { + "editorMode": "code", + "expr": "rate(node_pressure_memory_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "Memory - Some", + "range": true, + "refId": "Memory some", + "step": 240 + }, + { + "editorMode": "code", + "expr": "rate(node_pressure_memory_stalled_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "Memory - Full", + "range": true, + "refId": "Memory full", + "step": 240 + }, + { + "editorMode": "code", + "expr": "rate(node_pressure_io_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "I/O - Some", + "range": true, + "refId": "I/O some", + "step": 240 + }, + { + "editorMode": "code", + "expr": "rate(node_pressure_io_stalled_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "I/O - Full", + "range": true, + "refId": "I/O full", + "step": 240 + }, + { + "editorMode": "code", + "expr": "rate(node_pressure_irq_stalled_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "IRQ - 
Full", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Pressure Stall Information", + "type": "timeseries" + } + ], + "title": "CPU / Memory / Net / Disk", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 21 + }, + "id": 266, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Displays committed memory usage versus the system's commit limit. Exceeding the limit is allowed under Linux overcommit policies but may increase OOM risks under high load", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*CommitLimit - .*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 732 + }, + "id": 135, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "hideZeros": false, + 
"mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_memory_Committed_AS_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Committed_AS \u2013 Memory promised to processes (not necessarily used)", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_CommitLimit_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "CommitLimit - Max allowable committed memory", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Memory Committed", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Memory currently dirty (modified but not yet written to disk), being actively written back, or held by writeback buffers. High dirty or writeback memory may indicate disk I/O pressure or delayed flushing", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 732 + }, + "id": 130, + "options": { + "legend": { + "calcs": [ + "min", + 
"mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_memory_Writeback_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Writeback \u2013 Memory currently being flushed to disk", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_WritebackTmp_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "WritebackTmp \u2013 FUSE temporary writeback buffers", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_Dirty_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Dirty \u2013 Memory marked dirty (pending write to disk)", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_NFS_Unstable_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "NFS Unstable \u2013 Pages sent to NFS server, awaiting storage commit", + "range": true, + "refId": "D", + "step": 240 + } + ], + "title": "Memory Writeback and Dirty", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Kernel slab memory usage, separated into reclaimable and non-reclaimable categories. 
Reclaimable memory can be freed under memory pressure (e.g., caches), while unreclaimable memory is locked by the kernel for core functions", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 932 + }, + "id": 131, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_memory_SUnreclaim_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "SUnreclaim \u2013 Non-reclaimable slab memory (kernel objects)", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "SReclaimable \u2013 Potentially reclaimable slab memory (e.g., inode cache)", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Memory Slab", + "type": "timeseries" + }, + { + 
"datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Memory used for mapped files (such as libraries) and shared memory (shmem and tmpfs), including variants backed by huge pages", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 932 + }, + "id": 138, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_memory_Mapped_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Mapped \u2013 Memory mapped from files (e.g., libraries, mmap)", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_Shmem_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Shmem \u2013 Shared memory used by processes and tmpfs", + "range": true, + "refId": "B", + "step": 240 + }, + { + 
"editorMode": "code", + "expr": "node_memory_ShmemHugePages_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "ShmemHugePages \u2013 Shared memory (shmem/tmpfs) allocated with HugePages", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_ShmemPmdMapped_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "PMD Mapped \u2013 Shmem/tmpfs backed by Transparent HugePages (PMD)", + "range": true, + "refId": "D", + "step": 240 + } + ], + "title": "Memory Shared and Mapped", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Proportion of memory pages in the kernel's active and inactive LRU lists relative to total RAM. Active pages have been recently used, while inactive pages are less recently accessed but still resident in memory", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Active.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": 
"green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*Inactive.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 942 + }, + "id": 136, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "(node_memory_Inactive_bytes{instance=\"$node\",job=\"$job\"}) \n/ \n(node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "legendFormat": "Inactive \u2013 Less recently used memory, more likely to be reclaimed", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "(node_memory_Active_bytes{instance=\"$node\",job=\"$job\"}) \n/ \n(node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"})\n", + "format": "time_series", + "legendFormat": "Active \u2013 Recently used memory, retained unless under pressure", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Memory LRU Active / Inactive (%)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Breakdown of memory pages in the kernel's active and inactive LRU lists, separated by anonymous (heap, tmpfs) and file-backed (caches, mmap) pages.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + 
"tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 942 + }, + "id": 191, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_memory_Inactive_file_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Inactive_file - File-backed memory on inactive LRU list", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_Inactive_anon_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Inactive_anon \u2013 Anonymous memory on inactive LRU (incl. tmpfs & swap cache)", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_Active_file_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Active_file - File-backed memory on active LRU list", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_Active_anon_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Active_anon \u2013 Anonymous memory on active LRU (incl. 
tmpfs & swap cache)", + "range": true, + "refId": "D", + "step": 240 + } + ], + "title": "Memory LRU Active / Inactive Detail", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Tracks kernel memory used for CPU-local structures, per-thread stacks, and bounce buffers used for I/O on DMA-limited devices. These areas are typically small but critical for low-level operations", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 952 + }, + "id": 160, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_memory_KernelStack_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "KernelStack \u2013 Kernel stack memory (per-thread, non-reclaimable)", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": 
"code", + "expr": "node_memory_Percpu_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "PerCPU \u2013 Dynamically allocated per-CPU memory (used by kernel modules)", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_Bounce_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Bounce Memory \u2013 I/O buffer for DMA-limited devices", + "range": true, + "refId": "C", + "step": 240 + } + ], + "title": "Memory Kernel / CPU / IO", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Usage of the kernel's vmalloc area, which provides virtual memory allocations for kernel modules and drivers. Includes total, used, and largest free block sizes", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Total.*/" + }, + "properties": [ + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + }, + 
{ + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 952 + }, + "id": 70, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_memory_VmallocChunk_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Vmalloc Free Chunk \u2013 Largest available block in vmalloc area", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_VmallocTotal_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Vmalloc Total \u2013 Total size of the vmalloc memory area", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_VmallocUsed_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Vmalloc Used \u2013 Portion of vmalloc area currently in use", + "range": true, + "refId": "C", + "step": 240 + } + ], + "title": "Memory Vmalloc", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Memory used by anonymous pages (not backed by files), including standard and huge page allocations. 
Includes heap, stack, and memory-mapped anonymous regions", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 962 + }, + "id": 129, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_memory_AnonHugePages_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "AnonHugePages \u2013 Anonymous memory using HugePages", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_AnonPages_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "AnonPages \u2013 Anonymous memory (non-file-backed)", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Memory Anonymous", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Memory that is locked in 
RAM and cannot be swapped out. Includes both kernel-unevictable memory and user-level memory locked with mlock()", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#CFFAFF", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 962 + }, + "id": 137, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_memory_Unevictable_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Unevictable \u2013 Kernel-pinned memory (not swappable)", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_Mlocked_bytes{instance=\"$node\",job=\"$job\"}", + 
"format": "time_series", + "legendFormat": "Mlocked \u2013 Application-locked memory via mlock()", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Memory Unevictable and MLocked", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "How much memory is directly mapped in the kernel using different page sizes (4K, 2M, 1G). Helps monitor large page utilization in the direct map region", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Active" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#99440A", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Buffers" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#58140C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cached" + }, + 
"properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Committed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Dirty" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#B7DBAB", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Mapped" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PageTables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Page_Tables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C15C17", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total" + }, + "properties": [ + { + 
"id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total RAM" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total RAM + Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "VmallocUsed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 972 + }, + "id": 128, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_memory_DirectMap1G_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "DirectMap 1G \u2013 Memory mapped with 1GB pages", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_DirectMap2M_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "DirectMap 2M \u2013 Memory mapped with 2MB pages", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_DirectMap4k_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "DirectMap 4K \u2013 Memory mapped with 4KB pages", + "range": true, + "refId": "C", + "step": 240 + } + ], + "title": "Memory DirectMap", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Displays HugePages memory usage in bytes, 
including allocated, free, reserved, and surplus memory. All values are calculated based on the number of huge pages multiplied by their configured size", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 972 + }, + "id": 140, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_memory_HugePages_Free{instance=\"$node\",job=\"$job\"} * node_memory_Hugepagesize_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "HugePages Free \u2013 Available in the pool, not yet allocated", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_HugePages_Rsvd{instance=\"$node\",job=\"$job\"} * node_memory_Hugepagesize_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "HugePages Reserved \u2013 Promised but unused", + "range": true, + "refId": "B", +
"step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_HugePages_Surp{instance=\"$node\",job=\"$job\"} * node_memory_Hugepagesize_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "HugePages Surplus \u2013 Dynamic pool extension", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_HugePages_Total{instance=\"$node\",job=\"$job\"} * node_memory_Hugepagesize_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "HugePages Total \u2013 Reserved memory", + "range": true, + "refId": "D", + "step": 240 + } + ], + "title": "Memory HugePages", + "type": "timeseries" + } + ], + "title": "Memory Meminfo", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 22 + }, + "id": 267, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of memory pages being read from or written to disk (page-in and page-out operations). 
High page-out may indicate memory pressure or swapping activity", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "ops" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 733 + }, + "id": 176, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_vmstat_pgpgin{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "Pagesin - Page in ops", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_vmstat_pgpgout{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "Pagesout - Page out ops", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Memory Pages In / Out", + "type": 
"timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate at which memory pages are being swapped in from or out to disk. High swap-out activity may indicate memory pressure", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "ops" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 733 + }, + "id": 22, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_vmstat_pswpin{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "Pswpin - Pages swapped in", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_vmstat_pswpout{instance=\"$node\",job=\"$job\"}[$__rate_interval])", 
+ "format": "time_series", + "legendFormat": "Pswpout - Pages swapped out", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Memory Pages Swap In / Out", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of memory page faults, split into total, major (disk-backed), and derived minor (non-disk) faults. High major fault rates may indicate memory pressure or insufficient RAM", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "ops" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Pgfault - Page major and minor fault ops" + }, + "properties": [ + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.stacking", + "value": { + "group": false, + "mode": "none" + } + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + }, + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 913 + }, + "id": 175, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + 
"placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_vmstat_pgfault{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "Pgfault - Page major and minor fault ops", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_vmstat_pgmajfault{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "Pgmajfault - Major page fault ops", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_vmstat_pgfault{instance=\"$node\",job=\"$job\"}[$__rate_interval]) - irate(node_vmstat_pgmajfault{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "Pgminfault - Minor page fault ops", + "range": true, + "refId": "C", + "step": 240 + } + ], + "title": "Memory Page Faults", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of Out-of-Memory (OOM) kill events. 
A non-zero value indicates the kernel has terminated one or more processes due to memory exhaustion", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "ops" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "OOM Kills" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 913 + }, + "id": 307, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_vmstat_oom_kill{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "OOM Kills", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "OOM Killer", + "type": "timeseries" + } + ], + "title": "Memory Vmstat", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 23 + }, + "id": 293, + "panels": [ + { 
+ "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Tracks the system clock's estimated and maximum error, as well as its offset from the reference clock (e.g., via NTP). Useful for detecting synchronization drift", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 734 + }, + "id": 260, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_timex_estimated_error_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Estimated error", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_timex_offset_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Offset local vs reference", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + 
"expr": "node_timex_maxerror_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Maximum error", + "range": true, + "refId": "C", + "step": 240 + } + ], + "title": "Time Synchronized Drift", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "NTP phase-locked loop (PLL) time constant used by the kernel to control time adjustments. Lower values mean faster correction but less stability", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 734 + }, + "id": 291, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_timex_loop_time_constant{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "PLL Time Constant", + "range": true, + "refId": "A", + "step": 240 + } + ], + 
"title": "Time PLL Adjust", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Shows whether the system clock is synchronized to a reliable time source, and the current frequency correction ratio applied by the kernel to maintain synchronization", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 884 + }, + "id": 168, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_timex_sync_status{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Sync status (1 = ok)", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_timex_frequency_adjustment_ratio{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Frequency Adjustment", + "range": 
true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_timex_tick_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "interval": "", + "legendFormat": "Tick Interval", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_timex_tai_offset_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "interval": "", + "legendFormat": "TAI Offset", + "range": true, + "refId": "D", + "step": 240 + } + ], + "title": "Time Synchronized Status", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Displays the PPS signal's frequency offset and stability (jitter) in hertz. Useful for monitoring high-precision time sources like GPS or atomic clocks", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "rothz" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 884 + }, + "id": 333, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + 
"mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_timex_pps_frequency_hertz{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "PPS Frequency Offset", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_timex_pps_stability_hertz{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "PPS Frequency Stability", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "PPS Frequency / Stability", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Tracks PPS signal timing jitter and shift compared to system clock", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 894 + }, + "id": 334, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + 
"pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_timex_pps_jitter_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "PPS Jitter", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_timex_pps_shift_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "PPS Shift", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "PPS Time Accuracy", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of PPS synchronization diagnostics including calibration events, jitter violations, errors, and frequency stability exceedances", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 894 + }, + "id": 335, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", 
+ "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_timex_pps_calibration_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "PPS Calibrations/sec", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_timex_pps_error_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "PPS Errors/sec", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_timex_pps_stability_exceeded_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "PPS Stability Exceeded/sec", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_timex_pps_jitter_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "PPS Jitter Events/sec", + "range": true, + "refId": "D", + "step": 240 + } + ], + "title": "PPS Sync Events", + "type": "timeseries" + } + ], + "title": "System Timesync", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 312, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Processes currently in runnable or blocked states. 
Helps identify CPU contention or I/O wait bottlenecks.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 735 + }, + "id": 62, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_procs_blocked{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Blocked (I/O Wait)", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_procs_running{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Runnable (Ready for CPU)", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Processes Status", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Current number of processes in each state (e.g., running, sleeping, 
zombie). Requires --collector.processes to be enabled in node_exporter", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "D" + }, + "properties": [ + { + "id": "displayName", + "value": "Uninterruptible Sleeping" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "I" + }, + "properties": [ + { + "id": "displayName", + "value": "Idle Kernel Thread" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "R" + }, + "properties": [ + { + "id": "displayName", + "value": "Running" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "S" + }, + "properties": [ + { + "id": "displayName", + "value": "Interruptible Sleeping" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "T" + }, + "properties": [ + { + "id": "displayName", + "value": "Stopped" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "X" + }, + "properties": [ + { + "id": "displayName", + "value": "Dead" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Z" + }, + "properties": [ + { + "id": "displayName", + "value": "Zombie" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + 
"w": 12, + "x": 12, + "y": 735 + }, + "id": 315, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_processes_state{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{ state }}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Processes Detailed States", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of new processes being created on the system (forks/sec).", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 765 + }, + "id": 148, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": 
"11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_forks_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "Process Forks per second", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Processes Forks", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Shows CPU saturation per core, calculated as the proportion of time spent waiting to run relative to total time demanded (running + waiting).", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*waiting.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 765 + }, + "id": 305, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + 
"targets": [ + { + "editorMode": "code", + "expr": "irate(node_schedstat_running_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": true, + "interval": "", + "legendFormat": "CPU {{ cpu }} - Running", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_schedstat_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": true, + "interval": "", + "legendFormat": "CPU {{cpu}} - Waiting Queue", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_schedstat_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])\n/\n(irate(node_schedstat_running_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]) + irate(node_schedstat_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]))\n", + "format": "time_series", + "interval": "", + "legendFormat": "CPU {{cpu}}", + "range": true, + "refId": "C", + "step": 240 + } + ], + "title": "CPU Saturation per Core", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of active PIDs on the system and the configured maximum allowed. Useful for detecting PID exhaustion risk. 
Requires --collector.processes in node_exporter", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "PIDs limit" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F2495C", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 775 + }, + "id": 313, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_processes_pids{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Number of PIDs", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_processes_max_processes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", 
+ "legendFormat": "PIDs limit", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "PIDs Number and Limit", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of active threads on the system and the configured thread limit. Useful for monitoring thread pressure. Requires --collector.processes in node_exporter", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Threads limit" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F2495C", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 775 + }, + "id": 314, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + 
"editorMode": "code", + "expr": "node_processes_threads{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Allocated threads", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_processes_max_threads{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Threads limit", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Threads Number and Limit", + "type": "timeseries" + } + ], + "title": "System Processes", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 25 + }, + "id": 269, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Per-second rate of context switches and hardware interrupts. High values may indicate intense CPU or I/O activity", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 816 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + 
"showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_context_switches_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "Context switches", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_intr_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "Interrupts", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Context Switches / Interrupts", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "System load average over 1, 5, and 15 minutes. Reflects the number of active or waiting processes. Values above CPU core count may indicate overload", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "CPU Core Count" + }, + "properties": [ + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.lineStyle", + 
"value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + }, + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 816 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_load1{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Load 1m", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_load5{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Load 5m", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_load15{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Load 15m", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": "count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))", + "format": "time_series", + "legendFormat": "CPU Core Count", + "range": true, + "refId": "D", + "step": 240 + } + ], + "title": "System Load", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Real-time CPU frequency scaling per core, including average minimum and maximum allowed scaling frequencies", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + 
"viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "hertz" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Max" + }, + "properties": [ + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + }, + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + }, + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": false, + "viz": false + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Min" + }, + "properties": [ + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + }, + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + }, + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": false, + "viz": false + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 826 + }, + "id": 321, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_cpu_scaling_frequency_hertz{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "CPU {{ cpu }}", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "avg(node_cpu_scaling_frequency_max_hertz{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "interval": "", + 
"legendFormat": "Max", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "avg(node_cpu_scaling_frequency_min_hertz{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "interval": "", + "legendFormat": "Min", + "range": true, + "refId": "C", + "step": 240 + } + ], + "title": "CPU Frequency Scaling", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of scheduling timeslices executed per CPU. Reflects how frequently the scheduler switches tasks on each core", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 826 + }, + "id": 306, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_schedstat_timeslices_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + 
"legendFormat": "CPU {{ cpu }}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "CPU Schedule Timeslices", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Breaks down hardware interrupts by type and device. Useful for diagnosing IRQ load on network, disk, or CPU interfaces. Requires --collector.interrupts to be enabled in node_exporter", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 836 + }, + "id": 259, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_interrupts_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "{{ type }} - {{ info }}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "IRQ Detail", + "type": "timeseries" + }, + { + 
"datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of bits of entropy currently available to the system's random number generators (e.g., /dev/random). Low values may indicate that random number generation could block or degrade performance of cryptographic operations", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "decbits" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Entropy pool max" + }, + "properties": [ + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + }, + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 836 + }, + "id": 151, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": 
"node_entropy_available_bits{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Entropy available", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_entropy_pool_size_bits{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Entropy pool max", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Entropy", + "type": "timeseries" + } + ], + "title": "System Misc", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 304, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Monitors hardware sensor temperatures and critical thresholds as exposed by Linux hwmon. Includes CPU, GPU, and motherboard sensors where available", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "celsius" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Critical*./" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + 
"gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 737 + }, + "id": 158, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_hwmon_temp_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{ chip_name }} {{ sensor }}", + "range": true, + "refId": "A", + "step": 240 + }, + { + "expr": "node_hwmon_temp_crit_alarm_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "interval": "", + "legendFormat": "{{ chip_name }} {{ sensor }} Critical Alarm", + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_hwmon_temp_crit_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{ chip_name }} {{ sensor }} Critical", + "range": true, + "refId": "C", + "step": 240 + }, + { + "expr": "node_hwmon_temp_crit_hyst_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "interval": "", + "legendFormat": "{{ chip_name }} {{ sensor }} Critical Hysteresis", + "refId": "D", + "step": 240 + }, + { + "expr": "node_hwmon_temp_max_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "interval": "", + "legendFormat": "{{ chip_name }} {{ sensor }} Max", + "refId": "E", 
+ "step": 240 + } + ], + "title": "Hardware Temperature Monitor", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Shows how hard each cooling device (fan/throttle) is working relative to its maximum capacity", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percent" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Max*./" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 737 + }, + "id": 300, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "100 * node_cooling_device_cur_state{instance=\"$node\",job=\"$job\"} / node_cooling_device_max_state{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{ 
name }} - {{ type }} ", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Cooling Device Utilization", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Shows the online status of power supplies (e.g., AC, battery). A value of 1-Yes indicates the power supply is active/online", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bool_yes_no" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 747 + }, + "id": 302, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_power_supply_online{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{ power_supply }} online", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Power Supply", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + 
"uid": "${ds_prometheus}" + }, + "description": "Displays the current fan speeds (RPM) from hardware sensors via the hwmon interface", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "rotrpm" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 747 + }, + "id": 325, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_hwmon_fan_rpm{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{ chip_name }} {{ sensor }}", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_hwmon_fan_min_rpm{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "interval": "", + "legendFormat": "{{ chip_name }} {{ sensor }} 
rpm min", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Hardware Fan Speed", + "type": "timeseries" + } + ], + "title": "Hardware Misc", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 27 + }, + "id": 296, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Current number of systemd units in each operational state, such as active, failed, inactive, or transitioning", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Failed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F2495C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Active" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#73BF69", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Activating" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C8F2C2", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Deactivating" + }, + "properties": [ + 
{ + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 4228 + }, + "id": 298, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"activating\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Activating", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"active\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Active", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"deactivating\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Deactivating", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"failed\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Failed", + "range": true, + "refId": "D", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"inactive\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Inactive", + "range": true, + "refId": "E", + "step": 240 + } + ], + "title": "Systemd Units State", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": 
"Current number of active connections per systemd socket, as reported by the Node Exporter systemd collector", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 4228 + }, + "id": 331, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_systemd_socket_current_connections{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{ name }}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Systemd Sockets Current", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of accepted connections per second for each systemd socket", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": 
"text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "eps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 4238 + }, + "id": 297, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_systemd_socket_accepted_connections_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "{{ name }}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Systemd Sockets Accepted", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of systemd socket connection refusals per second, typically due to service unavailability or backlog overflow", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": 
false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "eps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 4238 + }, + "id": 332, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_systemd_socket_refused_connections_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "{{ name }}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Systemd Sockets Refused", + "type": "timeseries" + } + ], + "title": "Systemd", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 28 + }, + "id": 270, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of I/O operations completed per second for the device (after merges), including both reads and writes", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "read (\u2013) / write (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": 
false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "iops" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Read.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/sda.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 29 + }, + "id": 9, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "legendFormat": "{{device}} - Read", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "legendFormat": "{{device}} - Write", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Disk Read/Write IOps", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of bytes read from or written to the device per second", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, 
+ "axisColorMode": "text", + "axisLabel": "read (\u2013) / write (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "Bps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Read.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/sda.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 29 + }, + "id": 33, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_disk_read_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Read", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "exemplar": false, + "expr": "irate(node_disk_written_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "legendFormat": "{{device}} - Write", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Disk 
Read/Write Data", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Average time for requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "read (\u2013) / write (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Read.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/sda.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 389 + }, + "id": 37, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": 
"irate(node_disk_read_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]) / irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{device}} - Read", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_disk_write_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]) / irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{device}} - Write", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Disk Average Wait Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Average queue length of the requests that were issued to the device", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/sda_*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 389 + 
}, + "id": 35, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_disk_io_time_weighted_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{device}}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Average Queue Size", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of read and write requests merged per second that were queued to the device", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "read (\u2013) / write (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "iops" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Read.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/sda.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + 
} + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 399 + }, + "id": 133, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_disk_reads_merged_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "legendFormat": "{{device}} - Read", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_disk_writes_merged_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "legendFormat": "{{device}} - Write", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Disk R/W Merged", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Percentage of time the disk spent actively processing I/O operations, including general I/O, discards (TRIM), and write cache flushes", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + 
"matcher": { + "id": "byRegexp", + "options": "/sda.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 399 + }, + "id": 36, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_disk_io_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{device}} - General IO", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_disk_discard_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{device}} - Discard/TRIM", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_disk_flush_requests_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{device}} - Flush (write cache)", + "range": true, + "refId": "C", + "step": 240 + } + ], + "title": "Time Spent Doing I/Os", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Per-second rate of discard (TRIM) and flush (write cache) operations. 
Useful for monitoring low-level disk activity on SSDs and advanced storage", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "ops" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/sda.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 409 + }, + "id": 301, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_disk_discards_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{device}} - Discards completed", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_disk_discards_merged_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{device}} - Discards merged", + "range": true, + "refId": "B", + "step": 240 + }, + { + 
"editorMode": "code", + "expr": "irate(node_disk_flush_requests_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{device}} - Flush", + "range": true, + "refId": "C", + "step": 240 + } + ], + "title": "Disk Ops Discards / Flush", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Shows how many disk sectors are discarded (TRIMed) per second. Useful for monitoring SSD behavior and storage efficiency", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/sda.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 409 + }, + "id": 326, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": 
"irate(node_disk_discarded_sectors_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{device}}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Disk Sectors Discarded Successfully", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of in-progress I/O requests at the time of sampling (active requests in the disk queue)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/sda.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 419 + }, + "id": 34, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_disk_io_now{instance=\"$node\",job=\"$job\"}", + "interval": 
"", + "legendFormat": "{{device}}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Instantaneous Queue Size", + "type": "timeseries" + } + ], + "title": "Storage Disk", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 29 + }, + "id": 271, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of file descriptors currently allocated system-wide versus the system limit. Important for detecting descriptor exhaustion risks", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Max.*/" + }, + "properties": [ + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + }, + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 30 + }, + "id": 28, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": 
true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_filefd_maximum{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Max open files", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_filefd_allocated{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Open files", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "File Descriptor", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of free file nodes (inodes) available per mounted filesystem. A low count may prevent file creation even if disk space is available", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 30 + }, + "id": 41, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": 
false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_filesystem_files_free{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "legendFormat": "{{mountpoint}}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "File Nodes Free", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Indicates filesystems mounted in read-only mode or reporting device-level I/O errors.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bool_yes_no" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 370 + }, + "id": 44, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_filesystem_readonly{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "legendFormat": 
"{{mountpoint}} - ReadOnly", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_filesystem_device_error{instance=\"$node\",job=\"$job\",device!~'rootfs',fstype!~'tmpfs'}", + "format": "time_series", + "interval": "", + "legendFormat": "{{mountpoint}} - Device error", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Filesystem in ReadOnly / Error", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of file nodes (inodes) available per mounted filesystem. Reflects maximum file capacity regardless of disk size", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 370 + }, + "id": 219, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": 
"node_filesystem_files{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "legendFormat": "{{mountpoint}}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "File Nodes Size", + "type": "timeseries" + } + ], + "title": "Storage Filesystem", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 272, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of network packets received and transmitted per second, by interface.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 31 + }, + "id": 60, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + 
"targets": [ + { + "editorMode": "code", + "expr": "rate(node_network_receive_packets_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "{{device}} - Rx in", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "rate(node_network_transmit_packets_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "{{device}} - Tx out", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Network Traffic by Packets", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of packet-level errors for each network interface. Receive errors may indicate physical or driver issues; transmit errors may reflect collisions or hardware faults", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 31 
+ }, + "id": 142, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "rate(node_network_receive_errs_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Rx in", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "rate(node_network_transmit_errs_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Tx out", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Network Traffic Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of dropped packets per network interface. 
Receive drops can indicate buffer overflow or driver issues; transmit drops may result from outbound congestion or queuing limits", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 251 + }, + "id": 143, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "rate(node_network_receive_drop_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Rx in", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "rate(node_network_transmit_drop_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Tx out", + 
"range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Network Traffic Drop", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of compressed network packets received and transmitted per interface. These are common in low-bandwidth or special interfaces like PPP or SLIP", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 251 + }, + "id": 141, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "rate(node_network_receive_compressed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Rx in", + "range": true, 
+ "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "rate(node_network_transmit_compressed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Tx out", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Network Traffic Compressed", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of incoming multicast packets received per network interface. Multicast is used by protocols such as mDNS, SSDP, and some streaming or cluster services", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 261 + }, + "id": 146, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + 
"pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "rate(node_network_receive_multicast_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Rx in", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Network Traffic Multicast", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of received packets that could not be processed due to missing protocol or handler in the kernel. May indicate unsupported traffic or misconfiguration", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 261 + }, + "id": 327, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + 
"pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "rate(node_network_receive_nohandler_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Rx in", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Network Traffic NoHandler", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of frame errors on received packets, typically caused by physical layer issues such as bad cables, duplex mismatches, or hardware problems", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 271 + }, + "id": 145, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + 
"pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "rate(node_network_receive_frame_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Rx in", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Network Traffic Frame", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Tracks FIFO buffer overrun errors on network interfaces. These occur when incoming or outgoing packets are dropped due to queue or buffer overflows, often indicating congestion or hardware limits", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 271 + }, + "id": 144, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": 
"multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "rate(node_network_receive_fifo_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Rx in", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "rate(node_network_transmit_fifo_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Tx out", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Network Traffic Fifo", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of packet collisions detected during transmission. Mostly relevant on half-duplex or legacy Ethernet networks", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 281 + }, + "id": 232, + "options": { + "legend": { + 
"calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "rate(node_network_transmit_colls_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Tx out", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Network Traffic Collision", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of carrier errors during transmission. These typically indicate physical layer issues like faulty cabling or duplex mismatches", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 281 + }, + "id": 231, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": 
"none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "rate(node_network_transmit_carrier_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Tx out", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Network Traffic Carrier Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of ARP entries per interface. Useful for detecting excessive ARP traffic or table growth due to scanning or misconfiguration", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 291 + }, + "id": 230, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_arp_entries{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": 
"{{ device }} ARP Table", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "ARP Entries", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Current and maximum connection tracking entries used by Netfilter (nf_conntrack). High usage approaching the limit may cause packet drops or connection issues", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "NF conntrack limit" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 291 + }, + "id": 61, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": 
"code", + "expr": "node_nf_conntrack_entries{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "NF conntrack entries", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_nf_conntrack_entries_limit{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "NF conntrack limit", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "NF Conntrack", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Operational and physical link status of each network interface. Values are Yes for 'up' or link present, and No for 'down' or no carrier.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bool_yes_no" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 301 + }, + "id": 309, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": 
"code", + "expr": "node_network_up{operstate=\"up\",instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "legendFormat": "{{interface}} - Operational state UP", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_network_carrier{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "instant": false, + "legendFormat": "{{device}} - Physical link", + "refId": "B" + } + ], + "title": "Network Operational Status", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Maximum speed of each network interface as reported by the operating system. This is a static hardware capability, not current throughput", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "fieldMinMax": false, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 6, + "x": 12, + "y": 301 + }, + "id": 280, + "options": { + "displayMode": "basic", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 30, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "manual", + "valueMode": "color" + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_network_speed_bytes{instance=\"$node\",job=\"$job\"} * 8", + "format": "time_series", + "legendFormat": "{{ device }}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Speed", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "MTU (Maximum 
Transmission Unit) in bytes for each network interface. Affects packet size and transmission efficiency", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 6, + "x": 18, + "y": 301 + }, + "id": 288, + "options": { + "displayMode": "basic", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 30, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "manual", + "valueMode": "color" + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_network_mtu_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "{{ device }}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "MTU", + "type": "bargauge" + } + ], + "title": "Network Traffic", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 273, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Tracks TCP socket usage and memory per node", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + 
"pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 63, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_sockstat_TCP_alloc{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Allocated Sockets", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_sockstat_TCP_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "In-Use Sockets", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_sockstat_TCP_orphan{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Orphaned Sockets", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_sockstat_TCP_tw{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "TIME_WAIT Sockets", + "range": true, + "refId": "D", + "step": 240 + } + ], + "title": "Sockstat TCP", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of UDP and UDPLite sockets currently in use", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + 
"axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 124, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_sockstat_UDPLITE_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "UDPLite - In-Use Sockets", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_sockstat_UDP_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "UDP - In-Use Sockets", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Sockstat UDP", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Total number of sockets currently in use across all protocols (TCP, UDP, UNIX, etc.), as reported by /proc/net/sockstat", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, 
+ "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 42 + }, + "id": 126, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_sockstat_sockets_used{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Total sockets", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Sockstat Used", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of FRAG and RAW sockets currently in use. 
RAW sockets are used for custom protocols or tools like ping; FRAG sockets are used internally for IP packet defragmentation", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 42 + }, + "id": 125, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_sockstat_FRAG_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "FRAG - In-Use Sockets", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_sockstat_RAW_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "RAW - In-Use Sockets", + "range": true, + "refId": "C", + "step": 240 + } + ], + "title": "Sockstat FRAG / RAW", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + 
}, + "description": "Kernel memory used by TCP, UDP, and IP fragmentation buffers", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 52 + }, + "id": 220, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_sockstat_TCP_mem_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "TCP", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_sockstat_UDP_mem_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "UDP", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_sockstat_FRAG_memory{instance=\"$node\",job=\"$job\"}", + "interval": "", + "legendFormat": "Fragmentation", + "range": true, + "refId": "C" + } + ], + "title": 
"Sockstat Memory Size", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Average memory used per socket (TCP/UDP). Helps tune net.ipv4.tcp_rmem / tcp_wmem", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 52 + }, + "id": 339, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_sockstat_TCP_mem_bytes{instance=\"$node\",job=\"$job\"} / node_sockstat_TCP_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "TCP", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_sockstat_UDP_mem_bytes{instance=\"$node\",job=\"$job\"} / node_sockstat_UDP_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": 
"UDP", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Sockstat Average Socket Memory", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "TCP/UDP socket memory usage in kernel (in pages)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 62 + }, + "id": 336, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_sockstat_TCP_mem{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "TCP", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_sockstat_UDP_mem{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "UDP", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "TCP/UDP 
Kernel Buffer Memory Pages", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Packets processed and dropped by the softnet network stack per CPU. Drops may indicate CPU saturation or network driver limitations", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "drop (-) / process (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Dropped.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 62 + }, + "id": 290, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_softnet_processed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "CPU {{cpu}} - Processed", + "range": true, + "refId": "A", + "step": 240 + }, + { + 
"editorMode": "code", + "expr": "irate(node_softnet_dropped_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "CPU {{cpu}} - Dropped", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Softnet Packets", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "How often the kernel was unable to process all packets in the softnet queue before time ran out. Frequent squeezes may indicate CPU contention or driver inefficiency", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "eps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 72 + }, + "id": 310, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": 
"irate(node_softnet_times_squeezed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "CPU {{cpu}} - Times Squeezed", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Softnet Out of Quota", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Tracks the number of packets processed or dropped by Receive Packet Steering (RPS), a mechanism to distribute packet processing across CPUs", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Dropped.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + }, + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 72 + }, + "id": 330, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + 
"pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_softnet_received_rps_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "CPU {{cpu}} - Processed", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_softnet_flow_limit_count_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "CPU {{cpu}} - Dropped", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Softnet RPS", + "type": "timeseries" + } + ], + "title": "Network Sockstat", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 274, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of octets sent and received at the IP layer, as reported by /proc/net/netstat", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "Bps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", 
+ "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 163 + }, + "id": 221, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_netstat_IpExt_InOctets{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "IP Rx in", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_IpExt_OutOctets{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "IP Tx out", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Netstat IP In / Out Octets", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of TCP segments sent and received per second, including data and control segments", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + 
}, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*Snd.*/" + }, + "properties": [] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 163 + }, + "id": 299, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_netstat_Tcp_InSegs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "TCP Rx in", + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_Tcp_OutSegs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "TCP Tx out", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "TCP In / Out", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of UDP datagrams sent and received per second, based on /proc/net/netstat", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + 
"type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 193 + }, + "id": 55, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_netstat_Udp_InDatagrams{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "UDP Rx in", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_Udp_OutDatagrams{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "UDP Tx out", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "UDP In / Out", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of ICMP messages sent and received per second, including error and control messages", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": 
false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 193 + }, + "id": 115, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_netstat_Icmp_InMsgs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "ICMP Rx in", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_Icmp_OutMsgs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "ICMP Tx out", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "ICMP In / Out", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Tracks various TCP error and congestion-related events, including retransmissions, timeouts, dropped connections, and buffer issues", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": 
false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 203 + }, + "id": 104, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_netstat_TcpExt_ListenOverflows{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "Listen Overflows", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_TcpExt_ListenDrops{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "Listen Drops", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_TcpExt_TCPSynRetrans{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "SYN Retransmits", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": 
"irate(node_netstat_Tcp_RetransSegs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "Segment Retransmits", + "range": true, + "refId": "D" + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_Tcp_InErrs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "Receive Errors", + "range": true, + "refId": "E" + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_Tcp_OutRsts{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "RST Sent", + "range": true, + "refId": "F" + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_TcpExt_TCPRcvQDrop{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "Receive Queue Drops", + "range": true, + "refId": "G" + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_TcpExt_TCPOFOQueue{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "Out-of-order Queued", + "range": true, + "refId": "H" + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_TcpExt_TCPTimeouts{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "TCP Timeouts", + "range": true, + "refId": "I" + } + ], + "title": "TCP Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of UDP and UDPLite datagram delivery errors, including missing listeners, buffer overflows, and protocol-specific issues", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": 
false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 203 + }, + "id": 109, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_netstat_Udp_InErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "UDP Rx in Errors", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_Udp_NoPorts{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "UDP No Listener", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_UdpLite_InErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "UDPLite Rx in Errors", + "range": true, + "refId": "C" + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_Udp_RcvbufErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "UDP Rx in Buffer Errors", + "range": true, + "refId": "D", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_Udp_SndbufErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": 
"time_series", + "interval": "", + "legendFormat": "UDP Tx out Buffer Errors", + "range": true, + "refId": "E", + "step": 240 + } + ], + "title": "UDP Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of incoming ICMP messages that contained protocol-specific errors, such as bad checksums or invalid lengths", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 213 + }, + "id": 50, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_netstat_Icmp_InErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "ICMP Rx 
In", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "ICMP Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of TCP SYN cookies sent, validated, and failed. These are used to protect against SYN flood attacks and manage TCP handshake resources under load", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "eps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Failed.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 213 + }, + "id": 91, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_netstat_TcpExt_SyncookiesFailed{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "SYN Cookies Failed", 
+ "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_TcpExt_SyncookiesRecv{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "SYN Cookies Validated", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_TcpExt_SyncookiesSent{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "SYN Cookies Sent", + "range": true, + "refId": "C", + "step": 240 + } + ], + "title": "TCP SynCookie", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of currently established TCP connections and the system's max supported limit. On Linux, MaxConn may return -1 to indicate a dynamic/unlimited configuration", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Max.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#890F02", + "mode": "fixed" + } + }, + { + "id":
"custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 223 + }, + "id": 85, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_netstat_Tcp_CurrEstab{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Current Connections", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_netstat_Tcp_MaxConn{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Max Connections", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "TCP Connections", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of UDP packets currently queued in the receive (RX) and transmit (TX) buffers. 
A growing queue may indicate a bottleneck", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 223 + }, + "id": 337, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_udp_queues{instance=\"$node\",job=\"$job\",ip=\"v4\",queue=\"rx\"}", + "format": "time_series", + "interval": "", + "legendFormat": "UDP Rx in Queue", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_udp_queues{instance=\"$node\",job=\"$job\",ip=\"v4\",queue=\"tx\"}", + "format": "time_series", + "interval": "", + "legendFormat": "UDP Tx out Queue", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "UDP Queue", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of TCP connection initiations per second. 
'Active' opens are initiated by this host. 'Passive' opens are accepted from incoming connections", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "eps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 233 + }, + "id": 82, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_netstat_Tcp_ActiveOpens{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "Active Opens", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_Tcp_PassiveOpens{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "Passive Opens", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "TCP Direct Transition", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + 
}, + "description": "Number of TCP sockets in key connection states. Requires the --collector.tcpstat flag on node_exporter", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 233 + }, + "id": 320, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_tcp_connection_states{state=\"established\",instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Established", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_tcp_connection_states{state=\"fin_wait2\",instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "FIN_WAIT2", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": 
"node_tcp_connection_states{state=\"listen\",instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Listen", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_tcp_connection_states{state=\"time_wait\",instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "TIME_WAIT", + "range": true, + "refId": "D", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_tcp_connection_states{state=\"close_wait\", instance=\"$node\", job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "CLOSE_WAIT", + "range": true, + "refId": "E", + "step": 240 + } + ], + "title": "TCP Stat", + "type": "timeseries" + } + ], + "title": "Network Netstat", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 279, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Duration of each individual collector executed during a Node Exporter scrape. 
Useful for identifying slow or failing collectors", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 164 + }, + "id": 40, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_scrape_collector_duration_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{collector}}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Node Exporter Scrape Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of CPU time used by the process exposing this metric (user + system mode)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + 
"axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 164 + }, + "id": 308, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(process_cpu_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "Process CPU Usage", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Exporter Process CPU Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Tracks the memory usage of the process exposing this metric (e.g., node_exporter), including current virtual memory and maximum virtual memory limit", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + 
"legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Virtual Memory Limit" + }, + "properties": [ + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + }, + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + }, + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "Virtual Memory" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 10, + "x": 0, + "y": 174 + }, + "id": 149, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "process_virtual_memory_bytes{instance=\"$node\",job=\"$job\"}", + "interval": "", + "legendFormat": "Virtual Memory", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "process_virtual_memory_max_bytes{instance=\"$node\",job=\"$job\"}", + "interval": "", + "legendFormat": "Virtual Memory Limit", + "range": true, + "refId": "B", + "step": 240 + } + ], + 
"title": "Exporter Processes Memory", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of file descriptors used by the exporter process versus its configured limit", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Max.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#890F02", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + } + ] + }, + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "Open file descriptors" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 10, + "x": 10, + "y": 174 + }, + "id": 64, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", +
"placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "process_max_fds{instance=\"$node\",job=\"$job\"}", + "interval": "", + "legendFormat": "Maximum open file descriptors", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "process_open_fds{instance=\"$node\",job=\"$job\"}", + "interval": "", + "legendFormat": "Open file descriptors", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Exporter File Descriptor Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Shows whether each Node Exporter collector scraped successfully (1 = success, 0 = failure), and whether the textfile collector returned an error.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "dark-red", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bool" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 4, + "x": 20, + "y": 174 + }, + "id": 157, + "options": { + "displayMode": "basic", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_scrape_collector_success{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{collector}}", + "range": true, 
+ "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "1 - node_textfile_scrape_error{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "textfile", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Node Exporter Scrape", + "type": "bargauge" + } + ], + "title": "Node Exporter", + "type": "row" + } + ], + "refresh": "1m", + "schemaVersion": 41, + "tags": [ + "linux" + ], + "templating": { + "list": [ + { + "current": {}, + "includeAll": false, + "label": "Datasource", + "name": "ds_prometheus", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "definition": "", + "includeAll": false, + "label": "Job", + "name": "job", + "options": [], + "query": { + "query": "label_values(node_uname_info, job)", + "refId": "Prometheus-job-Variable-Query" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "definition": "label_values(node_uname_info{job=\"$job\"}, nodename)", + "includeAll": false, + "label": "Nodename", + "name": "nodename", + "options": [], + "query": { + "query": "label_values(node_uname_info{job=\"$job\"}, nodename)", + "refId": "Prometheus-nodename-Variable-Query" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "definition": "label_values(node_uname_info{job=\"$job\", nodename=\"$nodename\"}, instance)", + "includeAll": false, + "label": "Instance", + "name": "node", + "options": [], + "query": { + "query": "label_values(node_uname_info{job=\"$job\", nodename=\"$nodename\"}, instance)", + "refId": "Prometheus-node-Variable-Query" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + 
} + ] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Node Exporter Full", + "uid": "rYdddlPWk", + "weekStart": "" +} diff --git a/ansible/services/grafana/dashboards/traffic-slo.json b/ansible/services/grafana/dashboards/traffic-slo.json new file mode 100644 index 0000000..27df056 --- /dev/null +++ b/ansible/services/grafana/dashboards/traffic-slo.json @@ -0,0 +1,587 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 3, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "yellow", + "value": 99.9 + }, + { + "color": "green", + "value": 99.99 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "code", + "expr": "clamp_max(\n (sum(caddy_http_response_size_bytes_count{host=~\".*(pez.solutions|pez.sh)\", code!~\"5.*\"}) / (sum(caddy_http_response_size_bytes_count{host=~\".*(pez.solutions|pez.sh)\"}))) * 100,\n 99.999\n)", + "fullMetaSearch": false, + "hide": false, + 
"includeNullMetadata": true, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "SLI", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 3, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "yellow", + "value": 99.9 + }, + { + "color": "green", + "value": 99.99 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 20, + "x": 4, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "code", + "expr": "clamp_max(\n (\n sum(\n label_replace(\n caddy_http_response_size_bytes_count{host=~\".*(pez.solutions|pez.sh)\", host!~\"(pez.sh|pez.solutions)\", code!~\"5.*\"},\n \"host_prefix\",\n \"$1\",\n \"host\",\n \"([^.]+)\\\\..*\"\n )\n ) by (host_prefix)\n /\n sum(\n label_replace(\n caddy_http_response_size_bytes_count{host=~\".*(pez.solutions|pez.sh)\", host!~\"(pez.sh|pez.solutions)\"},\n \"host_prefix\",\n \"$1\",\n \"host\",\n \"([^.]+)\\\\..*\"\n )\n ) by (host_prefix)\n ) * 100,\n 99.999\n)", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "SLI by Host", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + 
"axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 19, + "x": 0, + "y": 10 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum(\n label_replace(\n rate(caddy_http_requests_total{handler!=\"metrics\", host=~\".*(pez.solutions|pez.sh)\"}[$__rate_interval]),\n \"host_prefix\",\n \"$1\",\n \"host\",\n \"([^.]+)\\\\..*\"\n )\n) by (host_prefix)", + "fullMetaSearch": false, + "includeNullMetadata": false, + "legendFormat": "{{host}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Traffic Rate by Service", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": 
{ + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 5, + "x": 19, + "y": 10 + }, + "id": 6, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "req/s" + } + ] + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "sum(\n label_replace(\n rate(caddy_http_requests_total{handler!=\"metrics\", host=~\".*(pez.solutions|pez.sh)\", host!~\"(pez.sh|pez.solutions)\"}[$__rate_interval]),\n \"host_prefix\",\n \"$1\",\n \"host\",\n \"([^.]+)\\\\..*\"\n )\n) by (host_prefix) > 0", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Active Services", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "includeByName": {}, + "indexByName": {}, + "renameByName": { + "Value": "req/s", + "host_prefix": "Service" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + 
"mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 21 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum(rate(caddy_http_response_duration_seconds_count{code!~\"5.*\"}[$__rate_interval]))", + "fullMetaSearch": false, + "includeNullMetadata": false, + "legendFormat": "Good", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Response Codes (Good)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 
11, + "w": 12, + "x": 12, + "y": 21 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "bezqqznn81wqof" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum(rate(caddy_http_response_duration_seconds_count{code=~\"5.*\"}[$__rate_interval])) by (code, host) > 0", + "fullMetaSearch": false, + "includeNullMetadata": false, + "legendFormat": "{{code}} - {{host}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Response Codes (Bad)", + "type": "timeseries" + } + ], + "preload": false, + "refresh": "5s", + "schemaVersion": 41, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Traffic / SLO", + "uid": "384f28fe-2435-480f-a0f0-723ccdcf8b3b" +} diff --git a/ansible/services/grafana/provisioning/alerting/contact-points.yml b/ansible/services/grafana/provisioning/alerting/contact-points.yml new file mode 100644 index 0000000..7257635 --- /dev/null +++ b/ansible/services/grafana/provisioning/alerting/contact-points.yml @@ -0,0 +1,23 @@ +apiVersion: 1 + +# Contact points — defines where alerts are sent. +# PagerDuty key is managed via Grafana UI / environment variable; do not commit secrets here. 
+ +contactPoints: + - orgId: 1 + name: PagerDuty + receivers: + - uid: bf0ukmhpefshsc + type: pagerduty + settings: + integrationKey: "{{ grafana_pagerduty_integration_key }}" + disableResolveMessage: false + + - orgId: 1 + name: email + receivers: + - uid: email-receiver + type: email + settings: + addresses: pez@pez.sh + disableResolveMessage: false diff --git a/ansible/services/grafana/provisioning/alerting/notification-policy.yml b/ansible/services/grafana/provisioning/alerting/notification-policy.yml new file mode 100644 index 0000000..9b4a6d4 --- /dev/null +++ b/ansible/services/grafana/provisioning/alerting/notification-policy.yml @@ -0,0 +1,31 @@ +apiVersion: 1 + +# Notification routing policy. +# Critical alerts (severity=critical) → PagerDuty. +# Warning alerts (severity=warning) → email. + +policies: + - orgId: 1 + receiver: PagerDuty + group_by: + - alertname + - server + group_wait: 30s + group_interval: 5m + repeat_interval: 4h + routes: + - receiver: PagerDuty + matchers: + - severity = critical + group_wait: 0s + group_interval: 1m + repeat_interval: 1h + continue: false + + - receiver: email + matchers: + - severity = warning + group_wait: 2m + group_interval: 10m + repeat_interval: 8h + continue: false diff --git a/ansible/services/grafana/provisioning/alerting/rules-critical.yml b/ansible/services/grafana/provisioning/alerting/rules-critical.yml new file mode 100644 index 0000000..66c6c90 --- /dev/null +++ b/ansible/services/grafana/provisioning/alerting/rules-critical.yml @@ -0,0 +1,358 @@ +apiVersion: 1 + +# Tier 1 — Critical alerts. These page PagerDuty. +# Datasource UID: bezqqznn81wqof (Prometheus on london-a) +# All alerts use reduce+threshold (not classic_conditions) so $labels.* and $value work in annotations. 
+ +groups: + - orgId: 1 + name: critical-availability + folder: Alerting + interval: 1m + rules: + - uid: cff6uy1tufj0ge + title: Host Down + condition: C + data: + - refId: A + datasourceUid: bezqqznn81wqof + model: + expr: up{job="node_exporter"} + instant: true + intervalMs: 1000 + maxDataPoints: 43200 + refId: A + - refId: B + datasourceUid: __expr__ + model: + datasource: + type: __expr__ + uid: __expr__ + expression: A + reducer: last + settings: + mode: "" + refId: B + type: reduce + - refId: C + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [1] + type: lt + operator: + type: and + query: + params: [C] + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + refId: C + type: threshold + noDataState: Alerting + execErrState: Alerting + for: 2m + annotations: + summary: "Host {{ $labels.server }} is down" + description: "Node exporter on {{ $labels.server }} ({{ $labels.instance }}) has been unreachable for 2+ minutes." 
+ labels: + severity: critical + isPaused: false + + - uid: aff6uy1vxchdse + title: Disk Usage Critical (>95%) + condition: C + data: + - refId: A + datasourceUid: bezqqznn81wqof + model: + expr: | + ( + node_filesystem_size_bytes{job="node_exporter", fstype!~"tmpfs|overlay|squashfs|devtmpfs"} + - node_filesystem_avail_bytes{job="node_exporter", fstype!~"tmpfs|overlay|squashfs|devtmpfs"} + ) + / node_filesystem_size_bytes{job="node_exporter", fstype!~"tmpfs|overlay|squashfs|devtmpfs"} + * 100 + instant: true + intervalMs: 1000 + maxDataPoints: 43200 + refId: A + - refId: B + datasourceUid: __expr__ + model: + datasource: + type: __expr__ + uid: __expr__ + expression: A + reducer: last + settings: + mode: "" + refId: B + type: reduce + - refId: C + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [95] + type: gt + operator: + type: and + query: + params: [C] + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + refId: C + type: threshold + noDataState: NoData + execErrState: Error + for: 5m + annotations: + summary: "Disk critically full on {{ $labels.server }}" + description: "Filesystem {{ $labels.mountpoint }} on {{ $labels.server }} is over 95% full (currently {{ $values.B.Value | printf \"%.1f\" }}%)."
+ labels: + severity: critical + isPaused: false + + - uid: aff6uy1xq9udca + title: Memory Usage Critical (>95%) + condition: C + data: + - refId: A + datasourceUid: bezqqznn81wqof + model: + expr: | + (1 - (node_memory_MemAvailable_bytes{job="node_exporter"} / node_memory_MemTotal_bytes{job="node_exporter"})) * 100 + instant: true + intervalMs: 1000 + maxDataPoints: 43200 + refId: A + - refId: B + datasourceUid: __expr__ + model: + datasource: + type: __expr__ + uid: __expr__ + expression: A + reducer: last + settings: + mode: "" + refId: B + type: reduce + - refId: C + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [95] + type: gt + operator: + type: and + query: + params: [C] + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + refId: C + type: threshold + noDataState: NoData + execErrState: Error + for: 5m + annotations: + summary: "Memory critically low on {{ $labels.server }}" + description: "Memory usage on {{ $labels.server }} ({{ $labels.instance }}) is above 95% for 5+ minutes." 
+ labels: + severity: critical + isPaused: false + + - uid: fff6uy219mo00e + title: SMART Disk Health Failure (london-b) + condition: C + data: + - refId: A + datasourceUid: bezqqznn81wqof + model: + expr: smartctl_device_smart_status{job="smartmontools"} + instant: true + intervalMs: 1000 + maxDataPoints: 43200 + refId: A + - refId: B + datasourceUid: __expr__ + model: + datasource: + type: __expr__ + uid: __expr__ + expression: A + reducer: last + settings: + mode: "" + refId: B + type: reduce + - refId: C + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [1] + type: lt + operator: + type: and + query: + params: [C] + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + refId: C + type: threshold + noDataState: NoData + execErrState: Error + for: 0m + annotations: + summary: "Disk SMART health failure on london-b" + description: "Drive {{ $labels.device }} on london-b reports SMART health failure. Check immediately." 
+ labels: + severity: critical + isPaused: false + + - orgId: 1 + name: critical-caddy + folder: Alerting + interval: 1m + rules: + - uid: fff6uy1zgpb0gd + title: Caddy Down (helsinki-a) + condition: C + data: + - refId: A + datasourceUid: bezqqznn81wqof + model: + expr: up{job="caddy"} + instant: true + intervalMs: 1000 + maxDataPoints: 43200 + refId: A + - refId: B + datasourceUid: __expr__ + model: + datasource: + type: __expr__ + uid: __expr__ + expression: A + reducer: last + settings: + mode: "" + refId: B + type: reduce + - refId: C + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [1] + type: lt + operator: + type: and + query: + params: [C] + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + refId: C + type: threshold + noDataState: Alerting + execErrState: Alerting + for: 1m + annotations: + summary: "Caddy is down on helsinki-a" + description: "Caddy (main reverse proxy) on helsinki-a unreachable. External services likely down." 
+ labels: + severity: critical + isPaused: false + + - orgId: 1 + name: critical-services + folder: Alerting + interval: 1m + rules: + - uid: bff6uy2a2rrwgb + title: Plex Down (london-b) + condition: C + data: + - refId: A + datasourceUid: bezqqznn81wqof + model: + expr: up{job="plex"} + instant: true + intervalMs: 1000 + maxDataPoints: 43200 + refId: A + - refId: B + datasourceUid: __expr__ + model: + datasource: + type: __expr__ + uid: __expr__ + expression: A + reducer: last + settings: + mode: "" + refId: B + type: reduce + - refId: C + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [1] + type: lt + operator: + type: and + query: + params: [C] + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + refId: C + type: threshold + noDataState: Alerting + execErrState: Alerting + for: 5m + annotations: + summary: "Plex is down on london-b" + description: "The Plex exporter on london-b has been unreachable for 5+ minutes." + labels: + severity: critical + isPaused: false diff --git a/ansible/services/grafana/provisioning/alerting/rules-warning.yml b/ansible/services/grafana/provisioning/alerting/rules-warning.yml new file mode 100644 index 0000000..94bfb16 --- /dev/null +++ b/ansible/services/grafana/provisioning/alerting/rules-warning.yml @@ -0,0 +1,242 @@ +apiVersion: 1 + +# Tier 2 — Warning alerts. These send email only (non-paging). +# Datasource UID: bezqqznn81wqof (Prometheus on london-a) +# All alerts use reduce+threshold (not classic_conditions) so $labels.* and $value work in annotations. 
+ +groups: + - orgId: 1 + name: warning-resources + folder: Alerting + interval: 2m + rules: + - uid: cff6uy23024n4c + title: Disk Usage Warning (>80%) + condition: C + data: + - refId: A + datasourceUid: bezqqznn81wqof + model: + expr: | + ( + node_filesystem_size_bytes{job="node_exporter", fstype!~"tmpfs|overlay|squashfs|devtmpfs"} + - node_filesystem_avail_bytes{job="node_exporter", fstype!~"tmpfs|overlay|squashfs|devtmpfs"} + ) + / node_filesystem_size_bytes{job="node_exporter", fstype!~"tmpfs|overlay|squashfs|devtmpfs"} + * 100 + instant: true + intervalMs: 1000 + maxDataPoints: 43200 + refId: A + - refId: B + datasourceUid: __expr__ + model: + datasource: + type: __expr__ + uid: __expr__ + expression: A + reducer: last + settings: + mode: "" + refId: B + type: reduce + - refId: C + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [80] + type: gt + operator: + type: and + query: + params: [C] + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + refId: C + type: threshold + noDataState: NoData + execErrState: Error + for: 10m + annotations: + summary: "Disk usage high on {{ $labels.server }}" + description: "Filesystem {{ $labels.mountpoint }} on {{ $labels.server }} is over 80% full (currently {{ $values.B.Value | printf \"%.1f\" }}%)."
+ labels: + severity: warning + isPaused: false + + - uid: dff6uy24szhmod + title: Memory Usage Warning (>85%) + condition: C + data: + - refId: A + datasourceUid: bezqqznn81wqof + model: + expr: | + (1 - (node_memory_MemAvailable_bytes{job="node_exporter"} / node_memory_MemTotal_bytes{job="node_exporter"})) * 100 + instant: true + intervalMs: 1000 + maxDataPoints: 43200 + refId: A + - refId: B + datasourceUid: __expr__ + model: + datasource: + type: __expr__ + uid: __expr__ + expression: A + reducer: last + settings: + mode: "" + refId: B + type: reduce + - refId: C + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [85] + type: gt + operator: + type: and + query: + params: [C] + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + refId: C + type: threshold + noDataState: NoData + execErrState: Error + for: 10m + annotations: + summary: "Memory usage high on {{ $labels.server }}" + description: "Memory usage on {{ $labels.server }} ({{ $labels.instance }}) is above 85% for 10+ minutes." 
+ labels: + severity: warning + isPaused: false + + - uid: cff6uy26jey9sd + title: CPU Usage High (>85%) + condition: C + data: + - refId: A + datasourceUid: bezqqznn81wqof + model: + expr: | + 100 - (avg by (server, instance) (rate(node_cpu_seconds_total{job="node_exporter", mode="idle"}[5m])) * 100) + instant: true + intervalMs: 1000 + maxDataPoints: 43200 + refId: A + - refId: B + datasourceUid: __expr__ + model: + datasource: + type: __expr__ + uid: __expr__ + expression: A + reducer: last + settings: + mode: "" + refId: B + type: reduce + - refId: C + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [85] + type: gt + operator: + type: and + query: + params: [C] + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + refId: C + type: threshold + noDataState: NoData + execErrState: Error + for: 15m + annotations: + summary: "CPU usage sustained high on {{ $labels.server }}" + description: "CPU on {{ $labels.server }} has been above 85% for 15+ minutes (currently {{ $values.B.Value | printf \"%.1f\" }}%)."
+ labels: + severity: warning + isPaused: false + + - uid: eff6uy289uewwb + title: System Load High (>2x CPUs) + condition: C + data: + - refId: A + datasourceUid: bezqqznn81wqof + model: + # Compare 15-minute load against number of CPUs + expr: | + node_load15{job="node_exporter"} / on(instance) group_left() count by (instance) (node_cpu_seconds_total{job="node_exporter", mode="idle"}) + instant: true + intervalMs: 1000 + maxDataPoints: 43200 + refId: A + - refId: B + datasourceUid: __expr__ + model: + datasource: + type: __expr__ + uid: __expr__ + expression: A + reducer: last + settings: + mode: "" + refId: B + type: reduce + - refId: C + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [2] + type: gt + operator: + type: and + query: + params: [C] + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + refId: C + type: threshold + noDataState: NoData + execErrState: Error + for: 15m + annotations: + summary: "High system load on {{ $labels.server }}" + description: "15-minute load average on {{ $labels.server }} is {{ $values.B.Value | printf \"%.2f\" }}x the CPU count (threshold: 2x)." + labels: + severity: warning + isPaused: false + diff --git a/ansible/services/grafana/provisioning/dashboards/dashboards.yml b/ansible/services/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 0000000..37da2ec --- /dev/null +++ b/ansible/services/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,15 @@ +apiVersion: 1 + +# Dashboard provisioning — tells Grafana where to find dashboard JSON files. +# The path below is an absolute path on the Grafana host, london-a (FreeBSD).
+ +providers: + - name: default + orgId: 1 + folder: "" + type: file + disableDeletion: false + updateIntervalSeconds: 30 + options: + path: /usr/local/etc/grafana/dashboards + foldersFromFilesStructure: false diff --git a/ansible/services/grafana/provisioning/datasources/datasources.json b/ansible/services/grafana/provisioning/datasources/datasources.json new file mode 100644 index 0000000..f7c5bf4 --- /dev/null +++ b/ansible/services/grafana/provisioning/datasources/datasources.json @@ -0,0 +1,18 @@ +[ + { + "uid": "bezqqznn81wqof", + "name": "prometheus", + "type": "prometheus", + "typeName": "Prometheus", + "typeLogoUrl": "public/plugins/prometheus/img/prometheus_logo.svg", + "access": "proxy", + "url": "http://localhost:9090", + "user": "", + "database": "", + "basicAuth": false, + "isDefault": true, + "jsonData": { + "pdcInjected": false + } + } +] diff --git a/ansible/services/jellyseerr/README.md b/ansible/services/jellyseerr/README.md new file mode 100644 index 0000000..2287282 --- /dev/null +++ b/ansible/services/jellyseerr/README.md @@ -0,0 +1,8 @@ +# Jellyseerr + +Media request management (Overseerr fork for Jellyfin/Plex). 
+ +- **Host:** london-b +- **URL:** https://requests.pez.sh +- **Port:** 5056 (host) → 5055 (container) +- **Config:** `/var/share/jellyseer/` diff --git a/ansible/services/jellyseerr/docker-compose.yml b/ansible/services/jellyseerr/docker-compose.yml new file mode 100644 index 0000000..fa3d215 --- /dev/null +++ b/ansible/services/jellyseerr/docker-compose.yml @@ -0,0 +1,17 @@ +# Jellyseerr - Media request management +# Host: london-b (100.84.65.101) +# Data: /var/share/jellyseer +# Access: https://requests.pez.sh (via Caddy reverse proxy on helsinki-a) + +services: + jellyseerr: + image: fallenbagel/jellyseerr:latest + container_name: jellyseer + restart: always + ports: + - "5056:5055" + environment: + LOG_LEVEL: debug + TZ: Europe/London + volumes: + - /var/share/jellyseer:/app/config diff --git a/ansible/services/mangos-realmd/mangos-realmd.service b/ansible/services/mangos-realmd/mangos-realmd.service new file mode 100644 index 0000000..8c17ac1 --- /dev/null +++ b/ansible/services/mangos-realmd/mangos-realmd.service @@ -0,0 +1,16 @@ +[Unit] +Description=MaNGOS Zero Realm Server +After=network.target mariadb.service +Requires=mariadb.service + +[Service] +Type=simple +User=mangos +Group=mangos +WorkingDirectory=/home/mangos/mangos/zero/bin +ExecStart=/home/mangos/mangos/zero/bin/realmd +Restart=on-failure +RestartSec=5 + +[Install] +WantedBy=multi-user.target \ No newline at end of file diff --git a/ansible/services/mangos-world/mangos-world.service b/ansible/services/mangos-world/mangos-world.service new file mode 100644 index 0000000..8c17ac1 --- /dev/null +++ b/ansible/services/mangos-world/mangos-world.service @@ -0,0 +1,16 @@ +[Unit] +Description=MaNGOS Zero World Server +After=network.target mariadb.service +Requires=mariadb.service + +[Service] +Type=simple +User=mangos +Group=mangos +WorkingDirectory=/home/mangos/mangos/zero/bin +ExecStart=/home/mangos/mangos/zero/bin/mangosd +Restart=on-failure +RestartSec=5 + +[Install] +WantedBy=multi-user.target \ No
newline at end of file diff --git a/ansible/services/minecraft/README.md b/ansible/services/minecraft/README.md new file mode 100644 index 0000000..66b4d68 --- /dev/null +++ b/ansible/services/minecraft/README.md @@ -0,0 +1,10 @@ +# Minecraft + +PaperMC server. + +- **Host:** copenhagen-a +- **Port:** 25565 +- **Memory:** 3GB +- **Data:** Docker volume (`minecraft_minecraftserver`) +- **Java:** OpenJDK 21 (bundled in image) +- **Note:** copenhagen-a also runs a WoW server (MaNGOS Zero) as system services, not Docker diff --git a/ansible/services/minecraft/docker-compose.yml b/ansible/services/minecraft/docker-compose.yml new file mode 100644 index 0000000..4b808f2 --- /dev/null +++ b/ansible/services/minecraft/docker-compose.yml @@ -0,0 +1,19 @@ +# Minecraft - PaperMC server +# Host: copenhagen-a (100.89.206.60) +# Data: Docker volume (minecraft_minecraftserver) + +services: + minecraft: + image: marctv/minecraft-papermc-server:latest + container_name: mcserver + restart: always + ports: + - "25565:25565" + environment: + MEMORYSIZE: 3G + volumes: + - minecraft_data:/data + +volumes: + minecraft_data: + name: minecraft_minecraftserver diff --git a/ansible/services/miniflux/README.md b/ansible/services/miniflux/README.md new file mode 100644 index 0000000..4e8ab89 --- /dev/null +++ b/ansible/services/miniflux/README.md @@ -0,0 +1,10 @@ +# Miniflux + +Lightweight RSS reader. 
+ +- **Host:** london-b +- **URL:** https://rss.pez.sh +- **Database:** PostgreSQL 15 (Alpine) +- **Bind address:** Tailscale IP only (100.84.65.101:8181) +- **Data:** Docker volume (`miniflux-db`) +- **Note:** Passwords templatized — set `MINIFLUX_DB_PASSWORD` and `MINIFLUX_ADMIN_PASSWORD` env vars before deploying diff --git a/ansible/services/miniflux/docker-compose.yml b/ansible/services/miniflux/docker-compose.yml new file mode 100644 index 0000000..ac4fdbd --- /dev/null +++ b/ansible/services/miniflux/docker-compose.yml @@ -0,0 +1,35 @@ +# Miniflux - RSS reader +# Host: london-b (100.84.65.101) +# Data: Docker volume (miniflux-db) +# Access: https://rss.pez.sh (via Caddy reverse proxy on helsinki-a) + +services: + miniflux-db: + image: postgres:15-alpine + container_name: miniflux-db + restart: unless-stopped + volumes: + - miniflux-db:/var/lib/postgresql/data + environment: + POSTGRES_DB: miniflux + POSTGRES_USER: miniflux + POSTGRES_PASSWORD: "${MINIFLUX_DB_PASSWORD}" + + miniflux: + image: miniflux/miniflux:latest + container_name: miniflux + restart: unless-stopped + depends_on: + - miniflux-db + ports: + - "100.84.65.101:8181:8080" + environment: + DATABASE_URL: "postgres://miniflux:${MINIFLUX_DB_PASSWORD}@miniflux-db/miniflux?sslmode=disable" + RUN_MIGRATIONS: "1" + CREATE_ADMIN: "1" + ADMIN_USERNAME: pez + ADMIN_PASSWORD: "${MINIFLUX_ADMIN_PASSWORD}" + BASE_URL: https://rss.pez.sh + +volumes: + miniflux-db: diff --git a/ansible/services/navidrome/README.md b/ansible/services/navidrome/README.md new file mode 100644 index 0000000..29a7db6 --- /dev/null +++ b/ansible/services/navidrome/README.md @@ -0,0 +1,9 @@ +# Navidrome + +Personal music streaming server (Subsonic-compatible). 
+ +- **Host:** london-b +- **URL:** https://music.pez.sh +- **Port:** 4533 +- **Config:** `/root/navidrome/` (includes `navidrome.toml`) +- **Music library:** `/hdd/music` (on ZFS pool) diff --git a/ansible/services/navidrome/docker-compose.yml b/ansible/services/navidrome/docker-compose.yml new file mode 100644 index 0000000..593c103 --- /dev/null +++ b/ansible/services/navidrome/docker-compose.yml @@ -0,0 +1,17 @@ +# Navidrome - Music streaming server +# Host: london-b (100.84.65.101) +# Data: /root/navidrome (config), /hdd/music (library) +# Access: https://music.pez.sh (via Caddy reverse proxy on helsinki-a) + +services: + navidrome: + image: deluan/navidrome:latest + container_name: navidrome + restart: unless-stopped + ports: + - "4533:4533" + environment: + ND_LOGLEVEL: info + volumes: + - /root/navidrome:/data + - /hdd/music:/music diff --git a/ansible/services/nextcloud-aio/README.md b/ansible/services/nextcloud-aio/README.md new file mode 100644 index 0000000..6e4ffef --- /dev/null +++ b/ansible/services/nextcloud-aio/README.md @@ -0,0 +1,10 @@ +# Nextcloud AIO + +All-in-one Nextcloud deployment (self-managed containers). + +- **Host:** london-b +- **URL:** https://cloud.pez.sh +- **Admin port:** 8080 (mastercontainer management UI) +- **Apache port:** 11000 (proxied by Caddy on helsinki-a) +- **Data:** Docker volume (`nextcloud_aio_mastercontainer`) +- **Note:** The mastercontainer spawns and manages its own sub-containers (database, redis, apache, etc.) 
diff --git a/ansible/services/nextcloud-aio/docker-compose.yml b/ansible/services/nextcloud-aio/docker-compose.yml new file mode 100644 index 0000000..58b08be --- /dev/null +++ b/ansible/services/nextcloud-aio/docker-compose.yml @@ -0,0 +1,32 @@ +# Nextcloud AIO - All-in-one Nextcloud deployment +# Host: london-b (100.84.65.101) +# Data: Docker volume (nextcloud_aio_mastercontainer) +# Access: https://cloud.pez.sh (via Caddy reverse proxy on helsinki-a) +# +# Note: Nextcloud AIO manages its own sub-containers (apache, database, +# redis, etc.) via the mastercontainer. This compose file only defines +# the mastercontainer itself. + +services: + nextcloud-aio-mastercontainer: + image: ghcr.io/nextcloud-releases/all-in-one:latest + container_name: nextcloud-aio-mastercontainer + restart: always + ports: + - "8080:8080" + environment: + SKIP_DOMAIN_VALIDATION: "false" + APACHE_PORT: 11000 + APACHE_IP_BINDING: "0.0.0.0" + volumes: + - nextcloud_aio_mastercontainer:/mnt/docker-aio-config + - /var/run/docker.sock:/var/run/docker.sock:ro + networks: + - nextcloud-aio + +networks: + nextcloud-aio: + driver: bridge + +volumes: + nextcloud_aio_mastercontainer: diff --git a/ansible/services/plex-exporter/README.md b/ansible/services/plex-exporter/README.md new file mode 100644 index 0000000..99fd057 --- /dev/null +++ b/ansible/services/plex-exporter/README.md @@ -0,0 +1,9 @@ +# Plex Exporter + +Prometheus exporter for Plex Media Server metrics. 
+ +- **Host:** london-b +- **Port:** 9000 (host) → 9594 (container) +- **Plex server:** 192.168.1.253:32400 (local network) +- **Note:** Requires `PLEX_TOKEN` env var +- **Scraped by:** Prometheus on london-a diff --git a/ansible/services/plex-exporter/docker-compose.yml b/ansible/services/plex-exporter/docker-compose.yml new file mode 100644 index 0000000..6afc5ed --- /dev/null +++ b/ansible/services/plex-exporter/docker-compose.yml @@ -0,0 +1,14 @@ +# Plex exporter - Plex metrics for Prometheus +# Host: london-b (100.84.65.101) +# Access: http://london-b:9000/metrics + +services: + plex-exporter: + image: ghcr.io/axsuul/plex-media-server-exporter + container_name: prom-plex-exporter + restart: unless-stopped + ports: + - "9000:9594" + environment: + PLEX_ADDR: "http://192.168.1.253:32400" + PLEX_TOKEN: "${PLEX_TOKEN}" diff --git a/ansible/services/poste-io/README.md b/ansible/services/poste-io/README.md new file mode 100644 index 0000000..adf3ada --- /dev/null +++ b/ansible/services/poste-io/README.md @@ -0,0 +1,9 @@ +# Poste.io + +Self-hosted mail server (SMTP, IMAP, POP3, webmail). 
+ +- **Host:** nuremberg-a (Hetzner cloud) +- **URL:** https://mail.pez.sh +- **Domain:** pez.sh +- **Data:** `/root/postio/data` +- **Ports:** 25, 80, 110, 143, 443, 465, 587, 993, 995 diff --git a/ansible/services/poste-io/docker-compose.yml b/ansible/services/poste-io/docker-compose.yml new file mode 100644 index 0000000..bd4346d --- /dev/null +++ b/ansible/services/poste-io/docker-compose.yml @@ -0,0 +1,24 @@ +# Poste.io - Self-hosted mail server +# Host: nuremberg-a (100.117.235.28) +# Data: /root/postio/data +# Access: https://mail.pez.sh + +services: + posteio: + image: analogic/poste.io + container_name: posteio + restart: unless-stopped + environment: + TZ: Europe/London + ports: + - "25:25" # SMTP + - "80:80" # HTTP (web UI) + - "110:110" # POP3 + - "143:143" # IMAP + - "443:443" # HTTPS (web UI) + - "465:465" # SMTPS + - "587:587" # Submission + - "993:993" # IMAPS + - "995:995" # POP3S + volumes: + - /root/postio/data:/data diff --git a/ansible/services/prometheus/README.md b/ansible/services/prometheus/README.md new file mode 100644 index 0000000..77ca1bb --- /dev/null +++ b/ansible/services/prometheus/README.md @@ -0,0 +1,55 @@ +# Prometheus + +Runs on **london-a** (FreeBSD, 100.122.219.41). 
+ +## Service Details + +- **Binary:** `/usr/local/bin/prometheus` +- **Config:** `/usr/local/etc/prometheus.yml` +- **Data:** `/var/db/prometheus` +- **Web UI:** `http://london-a:9090` +- **Runs as:** `prometheus` user via daemon(8) + +## Scrape Targets + +| Job | Target | Host | Port | What it scrapes | +|-----|--------|------|------|-----------------| +| `prometheus` | localhost:9090 | london-a | 9090 | Prometheus self-metrics | +| `node_exporter` | 192.168.1.254:9100 | london-a | 9100 | OS metrics (FreeBSD) | +| `node_exporter` | 192.168.1.253:9100 | london-b | 9100 | OS metrics (Linux) | +| `node_exporter` | 100.89.206.60:9100 | copenhagen-a | 9100 | OS metrics (Linux) | +| `node_exporter` | 100.115.45.53:9100 | copenhagen-c | 9100 | OS metrics (Linux) | +| `node_exporter` | 100.117.235.28:9100 | nuremberg-a | 9100 | OS metrics (Alpine) | +| `node_exporter` | 100.67.6.27:9100 | helsinki-a | 9100 | OS metrics (Linux) | +| `smartmontools` | 192.168.1.253:9633 | london-b | 9633 | SMART disk health (smartctl_exporter) | +| `plex` | 192.168.1.253:9000 | london-b | 9000 | Plex media server metrics | +| `caddy` | 100.67.6.27:2019 | helsinki-a | 2019 | Caddy admin API / metrics | + +### Network Notes + +- London hosts (london-a, london-b) use **LAN IPs** (192.168.1.x) since Prometheus runs locally in the London rack +- Remote hosts (copenhagen, nuremberg, helsinki) use **Tailscale IPs** (100.x.x.x) + +## Alerting Rules + +### `rules/node-exporter.rules` + +Sourced from pez-ansible. Currently all rules are **commented out** — only a placeholder `ServerRunningBtrfs` alert exists (disabled). No active alerting rules or Alertmanager configured. + +## What's Not Configured + +- **Alertmanager** — target is commented out; no alerting pipeline active +- **Rule files** — referenced lines in `prometheus.yml` are commented out (rules exist in `rules/` but aren't loaded) +- **Recording rules** — none + +## Deployment + +Config is managed manually on london-a. 
To deploy changes: + +```bash +# Copy config to london-a +scp prometheus.yml root@100.122.219.41:/usr/local/etc/prometheus.yml + +# Reload (graceful, no restart needed) +ssh root@100.122.219.41 "pkill -HUP prometheus" +``` diff --git a/ansible/services/prometheus/prometheus.yml b/ansible/services/prometheus/prometheus.yml new file mode 100644 index 0000000..e5dc8c3 --- /dev/null +++ b/ansible/services/prometheus/prometheus.yml @@ -0,0 +1,71 @@ +# Prometheus configuration — extracted from london-a (FreeBSD) +# Config file location on london-a: /usr/local/etc/prometheus.yml +# Prometheus runs as: /usr/local/bin/prometheus --config.file=/usr/local/etc/prometheus.yml +# Data directory: /var/db/prometheus + +global: + scrape_interval: 15s + evaluation_interval: 15s + +alerting: + alertmanagers: + - static_configs: + - targets: + # - alertmanager:9093 + +rule_files: + # - "first_rules.yml" + # - "second_rules.yml" + +scrape_configs: + - job_name: "prometheus" + static_configs: + - targets: ["localhost:9090"] + + - job_name: "node_exporter" + static_configs: + - targets: ["192.168.1.254:9100"] + labels: + location: london + server: london-a + - targets: ["192.168.1.253:9100"] + labels: + location: london + server: london-b + - targets: ["100.89.206.60:9100"] + labels: + location: copenhagen + server: copenhagen-a + - targets: ["100.115.45.53:9100"] + labels: + location: copenhagen + server: copenhagen-c + - targets: ["100.117.235.28:9100"] + labels: + location: cloud + server: nuremberg-a + - targets: ["100.67.6.27:9100"] + labels: + location: cloud + server: helsinki-a + + - job_name: "smartmontools" + static_configs: + - targets: ["192.168.1.253:9633"] + labels: + location: london + server: london-b + + - job_name: "plex" + static_configs: + - targets: ["192.168.1.253:9000"] + labels: + location: london + server: london-b + + - job_name: "caddy" + static_configs: + - targets: ["100.67.6.27:2019"] + labels: + location: cloud + server: helsinki-a diff --git
a/ansible/services/prometheus/rules/node-exporter.rules b/ansible/services/prometheus/rules/node-exporter.rules new file mode 100644 index 0000000..c9798fb --- /dev/null +++ b/ansible/services/prometheus/rules/node-exporter.rules @@ -0,0 +1,9 @@ +groups: +- name: alerts + rules: +# - alert: ServerRunningBtrfs +# expr: node_btrfs_info > 0 +# for: 5m +# annotations: +# summary: "Server {{ $labels.job }} is running btrfs" +# description: "{{ $labels.job }} has uuid of '{{ $labels.uuid }}'" \ No newline at end of file diff --git a/ansible/services/rc.d/london-a/rc.conf b/ansible/services/rc.d/london-a/rc.conf new file mode 100644 index 0000000..6a37790 --- /dev/null +++ b/ansible/services/rc.d/london-a/rc.conf @@ -0,0 +1,25 @@ +# /etc/rc.conf — london-a (FreeBSD) +# Captured 2026-03-22 + +clear_tmp_enable="YES" +hostname="london-a" +ifconfig_em0="inet 192.168.1.254 netmask 255.255.255.0" +defaultrouter="192.168.1.1" +ifconfig_em0_ipv6="inet6 accept_rtadv" +sshd_enable="YES" +ntpd_enable="YES" +powerd_enable="YES" +moused_nondefault_enable="NO" +# Set dumpdev to "AUTO" to enable crash dumps, "NO" to disable +dumpdev="AUTO" +zfs_enable="YES" +grafana_enable="YES" +prometheus_enable="YES" +node_exporter_enable="YES" +tailscaled_enable="YES" +cloudflared_enable="YES" +cf_tunnel_id="168eccae-2497-48e2-a1e2-c12cd3448d9a" +influxd_enable="YES" +libvirtd_enable="YES" +redis_enable="YES" +postgresql_enable="YES" diff --git a/ansible/services/slskd/README.md b/ansible/services/slskd/README.md new file mode 100644 index 0000000..5b8581c --- /dev/null +++ b/ansible/services/slskd/README.md @@ -0,0 +1,9 @@ +# slskd + +Soulseek client with web UI. 
+ +- **Host:** london-b +- **Ports:** 5030 (HTTP), 5031 (HTTPS), 50300 (Soulseek) +- **Config:** `/root/slskd/` +- **Downloads:** `/hdd/music/slskd/` (on ZFS pool) +- **Status:** Currently stopped (was not set to auto-restart) diff --git a/ansible/services/slskd/docker-compose.yml b/ansible/services/slskd/docker-compose.yml new file mode 100644 index 0000000..c6dc9d9 --- /dev/null +++ b/ansible/services/slskd/docker-compose.yml @@ -0,0 +1,19 @@ +# slskd - Soulseek client (web UI) +# Host: london-b (100.84.65.101) +# Data: /root/slskd (app), /hdd/music/slskd (downloads) +# Status: Currently stopped (not set to auto-restart) + +services: + slskd: + image: slskd/slskd:latest + container_name: slskd + restart: unless-stopped + ports: + - "5030:5030" # HTTP + - "5031:5031" # HTTPS + - "50300:50300" # Soulseek listen port + environment: + SLSKD_REMOTE_CONFIGURATION: "true" + volumes: + - /root/slskd:/app + - /hdd/music/slskd:/app/downloads diff --git a/ansible/services/smartctl-exporter/README.md b/ansible/services/smartctl-exporter/README.md new file mode 100644 index 0000000..7f850d2 --- /dev/null +++ b/ansible/services/smartctl-exporter/README.md @@ -0,0 +1,8 @@ +# smartctl-exporter + +Prometheus exporter for SMART disk health metrics. 
+ +- **Host:** london-b +- **Port:** 9633 +- **Note:** Runs privileged for direct disk access (required for smartctl) +- **Scraped by:** Prometheus on london-a diff --git a/ansible/services/smartctl-exporter/docker-compose.yml b/ansible/services/smartctl-exporter/docker-compose.yml new file mode 100644 index 0000000..d83c9ec --- /dev/null +++ b/ansible/services/smartctl-exporter/docker-compose.yml @@ -0,0 +1,12 @@ +# smartctl-exporter - SMART disk metrics for Prometheus +# Host: london-b (100.84.65.101) +# Access: http://london-b:9633/metrics + +services: + smartctl-exporter: + image: prometheuscommunity/smartctl-exporter + container_name: smartctl_exporter + restart: unless-stopped + privileged: true # Required for direct disk access + ports: + - "9633:9633" diff --git a/ansible/services/systemd/copenhagen-a/cloudflared.service b/ansible/services/systemd/copenhagen-a/cloudflared.service new file mode 100644 index 0000000..1a66780 --- /dev/null +++ b/ansible/services/systemd/copenhagen-a/cloudflared.service @@ -0,0 +1,13 @@ +[Unit] +Description=cloudflared +After=network.target + +[Service] +TimeoutStartSec=0 +Type=notify +ExecStart=/usr/bin/cloudflared --no-autoupdate tunnel run --token ${CLOUDFLARED_TOKEN} +Restart=on-failure +RestartSec=5s + +[Install] +WantedBy=multi-user.target diff --git a/ansible/services/systemd/helsinki-a/caddy.service b/ansible/services/systemd/helsinki-a/caddy.service new file mode 100644 index 0000000..8fd094c --- /dev/null +++ b/ansible/services/systemd/helsinki-a/caddy.service @@ -0,0 +1,25 @@ +# Stock Caddy service unit (installed via package manager) +# Included for reference — not a custom unit +# +# Config file: /etc/caddy/Caddyfile + +[Unit] +Description=Caddy +Documentation=https://caddyserver.com/docs/ +After=network.target network-online.target +Requires=network-online.target + +[Service] +Type=notify +User=caddy +Group=caddy +ExecStart=/usr/bin/caddy run --environ --config /etc/caddy/Caddyfile +ExecReload=/usr/bin/caddy 
reload --config /etc/caddy/Caddyfile --force +TimeoutStopSec=5s +LimitNOFILE=1048576 +PrivateTmp=true +ProtectSystem=full +AmbientCapabilities=CAP_NET_ADMIN CAP_NET_BIND_SERVICE + +[Install] +WantedBy=multi-user.target diff --git a/ansible/services/systemd/helsinki-a/thiswebsitedoesnotexist.service b/ansible/services/systemd/helsinki-a/thiswebsitedoesnotexist.service new file mode 100644 index 0000000..c6baf77 --- /dev/null +++ b/ansible/services/systemd/helsinki-a/thiswebsitedoesnotexist.service @@ -0,0 +1,17 @@ +[Unit] +Description=This Website Does Not Exist +After=network.target + +[Service] +Type=simple +User=root +WorkingDirectory=/opt/thiswebsitedoesnotexist +ExecStart=/usr/bin/node app.js +Restart=always +RestartSec=5 +Environment=NODE_ENV=production +Environment=PORT=3721 +EnvironmentFile=/opt/thiswebsitedoesnotexist/.env + +[Install] +WantedBy=multi-user.target diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..fbe54c0 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,23 @@ +# Documentation + +Everything you need to understand how this infrastructure works. 
+ +## Contents + +- **[Architecture](architecture.md)** — High-level overview, network topology, traffic flow diagrams +- **[Networking](networking.md)** — Tailscale mesh, physical networking, DNS and proxy flow +- **[Services](services.md)** — Complete service map: what runs where, ports, auth +- **[Monitoring](monitoring.md)** — Prometheus, Grafana, exporters, alerting, status page +- **[Secrets](secrets.md)** — SOPS + age encryption: setup, usage, CI integration +- **[Getting Started](getting-started.md)** — How to work with this repo, deploy changes, add services + +## Quick Reference + +| Host | Tailscale IP | Location | Role | +|------|-------------|----------|------| +| helsinki-a | 100.67.6.27 | Hetzner Cloud | Reverse proxy, SSO, Bitwarden | +| london-b | 100.84.65.101 | London | Storage, media, Docker services | +| london-a | 100.122.219.41 | London | Prometheus + Grafana | +| nuremberg-a | 100.117.235.28 | Hetzner Cloud | Mail (poste.io) | +| copenhagen-a | 100.89.206.60 | Copenhagen | Minecraft, WoW | +| copenhagen-c | 100.115.45.53 | Copenhagen | Idle | diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 0000000..038c39a --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,110 @@ +# Architecture + +## Overview + +The infrastructure spans four physical locations connected by a Tailscale mesh network. All public traffic enters through a single Hetzner Cloud VPS (helsinki-a) running Caddy as a reverse proxy, which forwards requests over Tailscale to backend services running on physical servers in London and Copenhagen. + +The setup is entirely self-hosted (with the exception of Hetzner Cloud VPSs and Cloudflare for DNS/CDN). Servers are old personal computers repurposed into server duty — cheaper than cloud, and I get a rack cabinet that doubles as a bedroom white noise machine. 
+ +## Network Topology + +``` + ┌──────────────┐ + │ Cloudflare │ + │ DNS + CDN │ + │ *.pez.sh │ + └──────┬───────┘ + │ + │ HTTPS + │ + ┌────────────▼────────────┐ + │ helsinki-a │ + │ Hetzner Cloud VPS │ + │ │ + │ Caddy (reverse proxy) │ + │ Authelia (SSO) │ + │ Bitwarden │ + │ LLDAP │ + └────────────┬────────────┘ + │ + ┌───────────────┼───────────────┐ + │ Tailscale Mesh │ + │ (WireGuard-based VPN) │ + └───┬───────┬───────┬───────┬───┘ + │ │ │ │ + ┌────────▼──┐ ┌──▼────────┐ ┌────▼───────┐ ┌──▼──────────┐ + │ london-b │ │ london-a │ │nuremberg-a │ │copenhagen-a │ + │ │ │ │ │ │ │ │ + │ Storage │ │ Monitoring│ │ Mail │ │ Gaming │ + │ Media │ │ Prometheus│ │ poste.io │ │ Minecraft │ + │ Docker │ │ Grafana │ │ │ │ WoW/MaNGOS │ + │ services │ │ │ │ │ │ │ + │ (46T ZFS) │ │ (FreeBSD) │ │ (Alpine) │ │ (Ubuntu) │ + └───────────┘ └───────────┘ └────────────┘ └─────────────┘ + + ┌─────────────┐ + │copenhagen-c │ + │ (idle) │ + └─────────────┘ +``` + +## Traffic Flow + +All public-facing services follow the same pattern: + +``` +User → Cloudflare (DNS + TLS) → helsinki-a (Caddy) → Backend (over Tailscale) +``` + +1. DNS for `*.pez.sh` is managed by Cloudflare (provisioned via Terraform) +2. Cloudflare proxies traffic to helsinki-a +3. Caddy on helsinki-a terminates TLS and routes to the correct backend +4. For protected services, Caddy calls Authelia first (`forward_auth`) +5. 
If authenticated (or no auth required), traffic is proxied over Tailscale to the backend + +``` + ┌─────────────────────────────────────────────┐ + │ helsinki-a (Caddy) │ + │ │ + radarr.pez.sh ──► │ forward_auth → Authelia ──► london-b:7878 │ + │ │ + jellyfin.pez.sh ─►│ (no auth) ───────────────► london-b:8096 │ + │ │ + grafana.pez.sh ──►│ forward_auth → Authelia ──► london-a:3000 │ + │ │ + auth.pez.sh ─────►│ (local) ────────────────► localhost:9091 │ + └─────────────────────────────────────────────┘ +``` + +## Auth Architecture + +``` + ┌──────────┐ + │ Caddy │ + │ │ + │ forward_ │ + │ auth │ + └────┬─────┘ + │ + ┌────▼─────┐ + │ Authelia │ auth.pez.sh + │ (SSO) │ + └────┬─────┘ + │ + ┌────▼─────┐ + │ LLDAP │ User directory + │ │ + └──────────┘ +``` + +Authelia authenticates against LLDAP (both on helsinki-a). One place to manage users — add or remove someone in LDAP and it propagates to all protected services. + +Services with their own auth (Bitwarden, Jellyfin, Plex, Nextcloud, Navidrome, Jellyseerr) are not behind Authelia. + +## Design Principles + +- **Self-hosted first.** Cloud VPSs only where it makes sense (public gateway, mail with clean IP reputation). Everything else runs on physical hardware I own. +- **Tailscale as the backbone.** No ports exposed on residential IPs. All inter-server communication goes over the mesh. +- **Ansible for everything.** If a server dies, reinstall the OS, install Tailscale, run Ansible. 30 minutes to full recovery. +- **Terraform for DNS.** All Cloudflare records are in code. No clicking around in dashboards. +- **Cattle, not pets (as much as possible).** The servers are technically pets — old hardware in specific locations — but the configs are cattle. Everything is reproducible from this repo. 
diff --git a/docs/getting-started.md b/docs/getting-started.md new file mode 100644 index 0000000..a4357df --- /dev/null +++ b/docs/getting-started.md @@ -0,0 +1,157 @@ +# Getting Started + +How to work with this repo, deploy changes, and not break things. + +## Prerequisites + +You'll need: + +- **Tailscale** — installed and connected to the tailnet. All SSH access goes through Tailscale. No servers have SSH exposed on the public internet. +- **SSH keys** — set up for each host you need to access +- **Ansible** — for configuration management and deployments +- **OpenTofu** (or Terraform) — for managing Cloudflare DNS and infrastructure +- **Docker** — helpful to understand, since most services are containerised +- **SOPS + age** — for secrets encryption/decryption (run `./scripts/sops-setup.sh`) +- **Git** — obviously +- **gh CLI** — for GitHub operations (PRs, issues, etc.) + +## Clone the repo + +```bash +git clone git@github.com:RWejlgaard/pez-infra.git +cd pez-infra +``` + +## Repo Structure + +``` +pez-infra/ +├── docs/ # You are here +├── ansible/ # Ansible playbooks, roles, inventory, and all managed files +│ ├── roles/ # Ansible roles (caddy, docker, dotfiles, etc.) +│ ├── services/ # Docker Compose definitions and service configs +│ ├── dotfiles/ # Shell config (fish, nvim, tmux, git, etc.) +│ └── scripts/ # Utility and maintenance scripts +└── terraform/ # Terraform/OpenTofu for Cloudflare, DNS, etc. +``` + +## Connecting to hosts + +All access is via Tailscale. Once you're on the tailnet, SSH using the Tailscale IP or hostname: + +```bash +ssh root@helsinki-a # or ssh root@100.67.6.27 +ssh root@london-b # or ssh root@100.84.65.101 +ssh root@london-a # FreeBSD — might need a different user +ssh root@copenhagen-a # or ssh root@100.89.206.60 +``` + +## Common Tasks + +### Deploying configuration changes + +Ansible handles deployments. Playbooks are in `ansible/` and are structured by host/role. 
+ +```bash +# Run the full site playbook +cd ansible +ansible-playbook site.yml + +# Target a specific host +ansible-playbook site.yml --limit london-b + +# Dry run first +ansible-playbook site.yml --check --diff +``` + +Ansible also runs automatically via GitHub Actions on commits to the main branch — so a quick commit from your phone can fix a misconfiguration when you're out. + +### Managing DNS + +DNS records are managed via Terraform in the `terraform/` directory: + +```bash +cd terraform +tofu plan # see what would change +tofu apply # apply the changes +``` + +All Cloudflare DNS records, pages, and access policies are defined here. Don't click around in the Cloudflare dashboard — if it's not in Terraform, it doesn't exist. + +### Adding a new service + +1. **Create a Docker Compose file** in `ansible/services/<service>/docker-compose.yml` +2. **Add the Caddy route** — if it needs a public subdomain, add a block to the Caddyfile in `ansible/services/caddy/` +3. **Add a DNS record** — add the subdomain to `terraform/` and run `tofu apply` +4. **Add Ansible deployment** — create or update the relevant role in `ansible/` so the service gets deployed automatically +5. **Add monitoring** — if the service has a metrics endpoint, add it as a Prometheus scrape target +6. **Update docs** — add the service to `docs/services.md` + +### Adding a new server + +1. Install the OS (Ubuntu preferred — see below) +2. Set up SSH keys +3. Install Tailscale and join the tailnet +4. Add the host to the Ansible inventory in `ansible/` +5. Assign roles (at minimum: node_exporter for monitoring) +6. Run `ansible-playbook site.yml --limit <hostname>` +7. Update `docs/services.md` and `docs/architecture.md` + +That's it. Ansible takes care of installing node_exporter, configuring the system, and deploying any assigned services.
+ +### Working with ZFS (london-b) + +```bash +# Check pool status +zpool status hdd + +# Check usage +zfs list + +# Scrub status (runs weekly on Sundays) +zpool status hdd | grep scan +``` + +ZFS is set up with 3× RAIDZ1 vdevs across 8 drives. Tolerates one drive failure per vdev. + +## OS Choice + +Ubuntu is the preferred OS for new servers. Not because I love it — Alpine is faster and leaner — but because Ansible support is vastly better. The lack of GNU binaries and systemd on Alpine caused enough headaches that the switch to Ubuntu was worth it. + +FreeBSD is used on london-a (monitoring) and works well for that single-purpose role. + +## Secrets + +Secrets are encrypted in-repo using **SOPS + age**. Encrypted files have `.enc.` in their extension (e.g. `secrets.enc.yml`). + +```bash +# First-time setup +./scripts/sops-setup.sh + +# Edit an encrypted file +sops ansible/services/authelia/config.enc.yml + +# Decrypt to stdout +sops -d ansible/services/authelia/config.enc.yml +``` + +Full documentation: [docs/secrets.md](secrets.md) + +## Branching + +- `main` is the production branch. Ansible runs from main via GitHub Actions. +- Feature branches for changes, PRs for review. +- Branch naming: `<type>/PESO-<number>-<description>` for Jira-tracked work. + +## Consolidated Repos + +This monorepo replaces several standalone repos: + +| Old repo | Now lives in | +|----------|-------------| +| pez-ansible | `ansible/` | +| pez-terraform | `terraform/` | +| pez-grafana | `ansible/services/grafana/` | +| pez-proxy | `ansible/services/caddy/` | +| pez-docs | `docs/` | +| server-scripts | `scripts/` and `ansible/` | diff --git a/docs/hosts/copenhagen-a.md b/docs/hosts/copenhagen-a.md new file mode 100644 index 0000000..a1e326b --- /dev/null +++ b/docs/hosts/copenhagen-a.md @@ -0,0 +1,59 @@ +# copenhagen-a + +Game servers. Located at my dad's place in Copenhagen as an off-site location.
+ +## Overview + +| | | +|---|---| +| **Location** | Copenhagen | +| **OS** | Ubuntu 22.04 | +| **Tailscale IP** | 100.89.206.60 | +| **Role** | Gaming servers (Minecraft, WoW) | +| **Form factor** | Lenovo "tiny" desktop (lunchbox-sized) | + +## Hardware + +| Component | Spec | +|---|---| +| CPU | Intel i5-4570T (4 threads) | +| Memory | 16 GB | +| Boot disk | 500 GB (26% used) | + +Compact Lenovo desktop — powered by a standard ThinkPad charging brick. Small, quiet, and draws minimal power. + +## Services + +### Minecraft + +| | | +|---|---| +| Image | `marctv/minecraft-papermc-server` | +| Port | 25565 | +| Deployment | Docker | + +PaperMC for better performance than vanilla. Not proxied through Caddy — accessed directly via Tailscale or the host's IP. + +### World of Warcraft (MaNGOS Zero) + +WoW 1.12 (Vanilla) private server using the MaNGOS Zero emulator. Runs natively — not in Docker. + +| Service | Port | Managed by | +|---------|------|-----------| +| mangos-realmd | 3724 | systemd | +| mangos-world | 8085 | systemd | +| MariaDB | 3306 | systemd | + +- Runs as the `mangos` user +- Install path: `/home/mangos/mangos/zero/` +- MariaDB hosts the character, world, and auth databases locally + +Both `mangos-realmd` and `mangos-world` start automatically on boot via systemd. + +## Networking + +Connected directly to the ISP router's built-in switch. Symmetrical 500 Mbit connection — more than enough for game servers. + +## Notes + +Copenhagen-a has a static IP, which is needed for game servers that require direct client connections (WoW realm list, Minecraft server list). diff --git a/docs/hosts/copenhagen-c.md b/docs/hosts/copenhagen-c.md new file mode 100644 index 0000000..47fe460 --- /dev/null +++ b/docs/hosts/copenhagen-c.md @@ -0,0 +1,21 @@ +# copenhagen-c + +General purpose box. Currently idle. 
+ +## Overview + +| | | +|---|---| +| **Location** | Copenhagen | +| **OS** | Debian 12 | +| **Tailscale IP** | 100.115.45.53 | +| **Role** | Idle / available | +| **Disk** | 117 GB (15% used) | + +## Status + +No active workloads. Connected to Tailscale and available for future use. Has node_exporter running for monitoring. + +## Notes + +Part of the Copenhagen off-site setup at my dad's place. Available if I need to spin up something that benefits from a Copenhagen location or just need another box. diff --git a/docs/hosts/helsinki-a.md b/docs/hosts/helsinki-a.md new file mode 100644 index 0000000..f1b3364 --- /dev/null +++ b/docs/hosts/helsinki-a.md @@ -0,0 +1,38 @@ +# helsinki-a + +Public-facing traffic gateway. Everything exposed to the internet goes through this box. + +## Overview + +| | | +|---|---| +| **Location** | Hetzner Cloud (Helsinki) | +| **OS** | Linux (Ubuntu/Debian) | +| **Tailscale IP** | 100.67.6.27 | +| **Role** | Reverse proxy, SSO, Bitwarden, LDAP | +| **Provider** | Hetzner Cloud VPS | + +## What it does + +This is the front door. All public subdomains (*.pez.sh) terminate here via Caddy, which proxies traffic to the appropriate backend over Tailscale. + +It also runs the auth stack — Authelia for SSO and LLDAP for user management. Having auth on the same box as the proxy keeps latency low for the `forward_auth` check. + +Bitwarden (Vaultwarden) lives here too, because password management needs to be available even if the London servers are having a moment. 
+ +## Services + +| Service | Port | Deployment | Notes | +|---------|------|-----------|-------| +| Caddy | 80, 443 | Docker | Reverse proxy + TLS termination | +| Authelia | 9091 | Docker | SSO, accessible at auth.pez.sh | +| Bitwarden (Vaultwarden) | 8443 | Docker | bitwarden.pez.sh, own auth | +| LLDAP | 3890/17170 | Docker | User directory for Authelia | + +Also serves static content: +- **status.pez.sh** → `/srv/status` (public status page) +- **apps.pez.sh** → `/srv/apps` (behind Authelia) + +## Why Hetzner Cloud + +Public-facing services need a stable public IP and good uptime. Residential IPs are dynamic and unreliable for this purpose. Hetzner Cloud is cheap, reliable, and has good European connectivity. diff --git a/docs/hosts/london-a.md b/docs/hosts/london-a.md new file mode 100644 index 0000000..e835edc --- /dev/null +++ b/docs/hosts/london-a.md @@ -0,0 +1,43 @@ +# london-a + +Dedicated monitoring server. Runs Prometheus and Grafana, nothing else. + +## Overview + +| | | +|---|---| +| **Location** | London (NW9) | +| **OS** | FreeBSD 14.3 | +| **Tailscale IP** | 100.122.219.41 | +| **Role** | Monitoring (Prometheus + Grafana) | + +## Hardware + +| Component | Spec | +|---|---| +| CPU | Intel i7-4790K (8 threads) | +| Memory | 32 GB | +| Boot disk | 1 TB | + +Old gaming PC, now perfectly happy as a monitoring host. Very lightly loaded — disk at ~6%. + +## Services + +| Service | Port | URL | +|---------|------|-----| +| Prometheus | 9090 | prometheus.pez.sh | +| Grafana | 3000 | grafana.pez.sh | + +Both are behind Authelia (auth handled by Caddy on helsinki-a). + +## Why FreeBSD + +This one runs FreeBSD instead of Ubuntu. For a single-purpose monitoring host it works well. No particular reason to change it — it's stable and does its job. + +## Networking + +Connected via Cat 5 to the Ubiquiti switch alongside london-b. + +## Notes + +Prometheus scrapes all hosts over Tailscale. 
See [monitoring.md](../monitoring.md) for scrape targets and dashboard details. diff --git a/docs/hosts/london-b.md b/docs/hosts/london-b.md new file mode 100644 index 0000000..8c97a68 --- /dev/null +++ b/docs/hosts/london-b.md @@ -0,0 +1,75 @@ +# london-b + +Primary storage and media server. The workhorse of the fleet. + +## Overview + +| | | +|---|---| +| **Location** | London (NW9) | +| **OS** | Ubuntu 24.04 | +| **Tailscale IP** | 100.84.65.101 | +| **Role** | Storage, media serving, Docker services | + +## Hardware + +| Component | Spec | +|---|---| +| CPU | AMD Threadripper 3970X (64 threads) | +| Memory | 64 GB | +| GPU | Nvidia GTX 980 | +| Boot disk | 500 GB | +| Storage pool | ~64 TB (ZFS) | + +This machine is ridiculously overpowered as a media server. It's my old gaming/workstation PC repurposed into server duty. The GPU helps with Plex transcoding but the CPU can handle it fine on its own. + +## Storage + +ZFS pool `hdd`: 3× RAIDZ1 vdevs, 8 drives total. + +| Metric | Value | +|---|---| +| Used | 46 TB | +| Free | 18 TB | +| Total | ~64 TB | +| Usage | 72% | +| Scrub | Weekly (Sundays) | + +RAIDZ1 tolerates one drive failure per vdev. With this many drives and this much data, ZFS checksumming is essential — silent data corruption on spinning disks is real and you don't want to find out about it years later. 
+ +## Services + +### Media Servers + +| Service | Port | URL | +|---------|------|-----| +| Plex | 32400 | plex.pez.sh | +| Jellyfin | 8096 | jellyfin.pez.sh | +| Navidrome | 4533 | music.pez.sh | + +### Media Automation + +| Service | Port | URL | +|---------|------|-----| +| Radarr | 7878 | radarr.pez.sh | +| Sonarr | 8989 | sonarr.pez.sh | +| Lidarr | 8686 | lidarr.pez.sh | +| Readarr | 8787 | readarr.pez.sh | +| Prowlarr | 9696 | prowlarr.pez.sh | +| Transmission | 9091 | download.pez.sh | +| Jellyseerr | 5055 | request.pez.sh | + +### Other + +| Service | Port | URL | +|---------|------|-----| +| Nextcloud AIO | 11000 | cloud.pez.sh | +| slskd (Soulseek) | 5030 | soulseek.pez.sh | +| smartctl_exporter | 9633 | (Prometheus scrape) | +| prom-plex-exporter | — | (Prometheus scrape) | + +All services run in Docker. Media is served directly from the ZFS pool. + +## Networking + +Connected via Cat 5 to the Ubiquiti switch in the utility closet. 1 Gbit LAN connection. diff --git a/docs/hosts/nuremberg-a.md b/docs/hosts/nuremberg-a.md new file mode 100644 index 0000000..d366740 --- /dev/null +++ b/docs/hosts/nuremberg-a.md @@ -0,0 +1,34 @@ +# nuremberg-a + +Dedicated mail server. One job, does it well. + +## Overview + +| | | +|---|---| +| **Location** | Hetzner Cloud (Nuremberg) | +| **OS** | Alpine Linux | +| **Tailscale IP** | 100.117.235.28 | +| **Role** | Mail server (poste.io) | +| **Provider** | Hetzner Cloud VPS | + +## Services + +| Service | Ports | Deployment | +|---------|-------|-----------| +| poste.io | 25, 587, 993, 443 | Docker | + +poste.io is a batteries-included mail server that bundles postfix, dovecot, rspamd, and webmail into a single Docker container. No juggling separate containers for each mail component. + +## Why a separate server + +Mail lives on its own VPS to isolate its IP reputation. If the IP gets flagged for any reason, it doesn't affect the rest of the infrastructure. 
And if something else gets flagged, it doesn't affect mail deliverability. + +## DNS + +Mail-related DNS records are managed via Cloudflare (Terraform): + +- **MX** record for inbound mail routing +- **SPF** for sender verification +- **DKIM** for message signing +- **DMARC** for policy enforcement diff --git a/docs/monitoring.md b/docs/monitoring.md new file mode 100644 index 0000000..b58c657 --- /dev/null +++ b/docs/monitoring.md @@ -0,0 +1,124 @@ +# Monitoring + +## Stack Overview + +``` + ┌─────────────────────────────────────────────────┐ + │ london-a │ + │ (FreeBSD) │ + │ │ + │ ┌────────────┐ ┌──────────┐ │ + │ │ Prometheus │────────►│ Grafana │ │ + │ │ :9090 │ query │ :3000 │ │ + │ └─────┬──────┘ └──────────┘ │ + │ │ scrape │ + └────────┼────────────────────────────────────────┘ + │ + │ Tailscale + │ + ┌─────┼──────────────────────────────────┐ + │ │ │ + │ ▼ ▼ ▼ │ + │ node_exporter smartctl plex │ + │ (all hosts) exporter exporter │ + │ (london-b) (london-b) │ + └────────────────────────────────────────┘ +``` + +Both Prometheus and Grafana are accessible via: +- **grafana.pez.sh** (behind Authelia) +- **prometheus.pez.sh** (behind Authelia) + +## Prometheus + +Prometheus runs on london-a and scrapes metrics from exporters across the fleet. All scrape targets are reached over Tailscale — no ports need to be exposed on the public internet. + +### Scrape Targets + +| Target | Host | Port | What | +|--------|------|------|------| +| node_exporter | All hosts | 9100 | System metrics (CPU, memory, disk, network) | +| smartctl_exporter | london-b | 9633 | Disk SMART health data | +| prom-plex-exporter | london-b | (varies) | Plex streaming activity | + +node_exporter is deployed to every host via Ansible. It's one of the first things that gets installed on a new server. + +### Adding a scrape target + +1. Deploy the exporter to the host (via Ansible or Docker) +2. Add the target to the Prometheus config in `services/prometheus/` +3. 
Deploy the updated config (Ansible or manual restart) +4. Verify it shows up in Prometheus targets page + +## Grafana + +Grafana reads from Prometheus and provides dashboards for everything worth watching. + +### Dashboards + +| Dashboard | What it shows | +|-----------|--------------| +| Server Health | CPU, memory, disk usage, network I/O across all hosts | +| ZFS | Pool status, usage, scrub results for london-b | +| SMART | Disk health metrics, temperature, error counts | +| Plex | Active streams, transcoding status, library stats | + +### Adding a dashboard + +Dashboards are defined in `services/grafana/`. Export as JSON from Grafana and commit to the repo to keep them in version control. + +## Exporters + +### node_exporter + +Standard Prometheus node exporter. Deployed on every host. Provides system-level metrics: +- CPU usage and load averages +- Memory usage +- Disk space and I/O +- Network traffic +- System uptime + +Installed via Ansible as part of the base server setup. + +### smartctl_exporter + +Runs on london-b (the ZFS storage server with 8 spinning disks). Exposes SMART data from all drives: +- Temperature +- Reallocated sectors +- Read/write error rates +- Power-on hours +- Overall health assessment + +Critical for catching dying drives before they take out a RAIDZ1 vdev. + +### prom-plex-exporter + +Runs on london-b. Scrapes the Plex API and exposes metrics about: +- Active streams +- Transcode sessions +- Library size +- User activity + +Mostly for fun — it's satisfying to see the Plex dashboard light up when people are streaming. + +## Status Page + +**status.pez.sh** is a lightweight public status page that shows service availability. 
+ +- Pulls availability data from Prometheus +- Shows 90-day uptime history +- Hosted on helsinki-a at `/srv/status` +- Source: [RWejlgaard/pez-status](https://github.com/RWejlgaard/pez-status) +- Not behind Authelia — it's public by design + +## Alerting + +Prometheus alerting rules can be configured in the Prometheus config. Alert conditions worth monitoring: + +- Host down (node_exporter unreachable) +- Disk space critical (>90% usage) +- ZFS scrub errors +- SMART drive failures +- High memory usage + +Grafana can also be configured with alert channels (email, webhooks, etc.) for dashboard-based alerts. diff --git a/docs/networking.md b/docs/networking.md new file mode 100644 index 0000000..1565726 --- /dev/null +++ b/docs/networking.md @@ -0,0 +1,152 @@ +# Networking + +## Tailscale Mesh + +Tailscale is the backbone of the whole setup. It's a WireGuard-based mesh VPN that connects all servers regardless of where they physically are. Every server can reach every other server directly — no port forwarding, no NAT traversal, no exposed SSH ports. 
+ +All inter-server communication uses Tailscale IPs: + +| Host | Tailscale IP | +|------|-------------| +| helsinki-a | 100.67.6.27 | +| london-b | 100.84.65.101 | +| london-a | 100.122.219.41 | +| nuremberg-a | 100.117.235.28 | +| copenhagen-a | 100.89.206.60 | +| copenhagen-c | 100.115.45.53 | + +### What Tailscale is used for + +- **Reverse proxying:** Caddy on helsinki-a forwards traffic to backends via Tailscale IPs +- **Monitoring:** Prometheus on london-a scrapes exporters on all hosts via Tailscale +- **SSH access:** All SSH is done over Tailscale — no SSH ports exposed to the internet +- **Ansible deployments:** GitHub Actions runs Ansible over Tailscale SSH connections +- **Exit nodes:** Servers can act as VPN endpoints — useful for accessing UK content from Copenhagen or vice versa + +### Mesh Diagram + +``` + helsinki-a ◄──────────────────────────► london-b + ▲ ▲ ▲ ▲ + │ │ │ │ + │ └──────────► london-a ◄──────────┘ │ + │ ▲ │ + │ │ │ + ▼ │ ▼ + nuremberg-a copenhagen-a ◄────► copenhagen-c + + Every node can reach every other node directly. + Connections shown are illustrative — the mesh is fully connected. +``` + +## Physical Networking + +### London + +The London setup is in a rack cabinet in the bedroom (great white noise machine, honestly). + +- **Router:** Ubiquiti Dream Machine Special Edition — overkill for a home setup but gives excellent routing performance vs an ISP router +- **ISP:** BT, 1 Gbit down / 300 Mbit up, ~£90/month +- **Cabling:** Cat 5 in the walls, patch panel in the utility closet, connected to a Ubiquiti switch +- **Servers:** london-a and london-b connected via Ethernet to the switch + +### Copenhagen + +A stack of servers at my dad's place — acts as an off-site location. 
+ +- **Router:** ISP-provided (not my house, can't exactly install a Ubiquiti rack) +- **ISP:** Symmetrical 500 Mbit — plenty for what's running there +- **Servers:** copenhagen-a and copenhagen-c connected directly to the ISP router's built-in switch + +### Helsinki / Nuremberg (Hetzner Cloud) + +- Standard Hetzner Cloud VPS networking +- Public IPv4 addresses +- helsinki-a is the only server that receives public web (HTTP/HTTPS) traffic +- nuremberg-a receives mail directly from the internet (ports 25, 587, 993) + +## DNS Flow + +All DNS is managed by Cloudflare, provisioned via Terraform. + +### Domain: pez.sh + +The domain is registered on Hover.com with nameservers pointed to Cloudflare. + +### How a request reaches a service + +``` +1. Browser requests radarr.pez.sh + │ +2. Cloudflare resolves DNS (proxied record → Cloudflare IP) + │ +3. Cloudflare terminates external TLS, forwards to helsinki-a + │ +4. Caddy on helsinki-a receives the request + │ +5. Caddy checks: does this subdomain require auth? + │ + ├── YES: forward_auth to Authelia (localhost:9091) + │ │ + │ ├── Authenticated → proceed to step 6 + │ └── Not authenticated → redirect to auth.pez.sh + │ + └── NO: proceed to step 6 + │ +6. Caddy reverse-proxies to the backend over Tailscale + (e.g., london-b:7878 for Radarr) + │ +7.
Response flows back: backend → Caddy → Cloudflare → browser +``` + +### Public Subdomains + +All subdomains are Cloudflare-proxied and terminate at helsinki-a: + +| Subdomain | Backend | Auth | +|---|---|---| +| auth.pez.sh | helsinki-a:9091 | — | +| bitwarden.pez.sh | helsinki-a:8443 | — | +| status.pez.sh | helsinki-a:/srv/status | — | +| apps.pez.sh | helsinki-a:/srv/apps | Authelia | +| grafana.pez.sh | london-a:3000 | Authelia | +| prometheus.pez.sh | london-a:9090 | Authelia | +| jellyfin.pez.sh | london-b:8096 | — | +| plex.pez.sh | london-b:32400 | — | +| request.pez.sh | london-b:5055 | — | +| cloud.pez.sh | london-b:11000 | — | +| music.pez.sh | london-b:4533 | — | +| radarr.pez.sh | london-b:7878 | Authelia | +| sonarr.pez.sh | london-b:8989 | Authelia | +| lidarr.pez.sh | london-b:8686 | Authelia | +| readarr.pez.sh | london-b:8787 | Authelia | +| prowlarr.pez.sh | london-b:9696 | Authelia | +| soulseek.pez.sh | london-b:5030 | Authelia | +| download.pez.sh | london-b:9091 | Authelia | + +### Mail DNS + +nuremberg-a handles mail for pez.sh. DNS records managed via Cloudflare: + +- **MX** record pointing to nuremberg-a +- **SPF** record for sender verification +- **DKIM** record for message signing +- **DMARC** record for policy enforcement + +### Caddy TLS + +Caddy handles TLS termination for the Cloudflare-to-origin connection. Certificates are obtained and renewed automatically via ACME (Let's Encrypt). No manual cert management, no cron jobs, no renewals to think about. + +Example Caddyfile block for a protected service: + +``` +radarr.pez.sh { + forward_auth helsinki-a:9091 { + uri /api/verify?rd=https://auth.pez.sh + copy_headers Remote-User Remote-Groups Remote-Name Remote-Email + } + reverse_proxy london-b:7878 +} +``` + +Compare that to the equivalent Nginx config — about 4 lines vs 20. This is why I use Caddy. 
diff --git a/docs/secrets.md b/docs/secrets.md new file mode 100644 index 0000000..3f72d05 --- /dev/null +++ b/docs/secrets.md @@ -0,0 +1,152 @@ +# Secrets Management + +This repo uses [SOPS](https://github.com/getsops/sops) with [age](https://github.com/FiloSottile/age) encryption for secrets. Encrypted files live in the repo alongside the configs they belong to — only the secret values are encrypted, so diffs remain useful. + +## Why SOPS + age? + +- **age over GPG**: No key expiry, no keyservers, no UID headaches. A single static public key per recipient. +- **SOPS over git-crypt**: Encrypts values, not whole files. You can see the structure of a secrets file without decrypting it. Works with YAML, JSON, ENV, and INI. +- **SOPS over Ansible Vault**: Ansible Vault only works with Ansible. SOPS works everywhere — Terraform (via `terraform-provider-sops`), Docker env files, CI pipelines, scripts. + +## File naming convention + +Encrypted files use `.enc.` in their extension: + +``` +services/authelia/config.enc.yml # encrypted YAML +services/miniflux/miniflux.enc.env # encrypted env file +terraform/secrets.enc.yaml # encrypted Terraform vars +ansible/group_vars/all/secrets.enc.yaml +``` + +Plaintext files MUST NOT contain secrets. The `.gitignore` blocks common secret filenames (`secrets.yml`, `vault.yml`, `secret.env`, etc.) as a safety net. + +## Setup (one-time) + +### Install tools + +```bash +# macOS +brew install sops age + +# Debian/Ubuntu +apt install age +# SOPS: download from https://github.com/getsops/sops/releases +wget https://github.com/getsops/sops/releases/download/v3.9.4/sops_3.9.4_amd64.deb +dpkg -i sops_3.9.4_amd64.deb + +# FreeBSD +pkg install age sops +``` + +### Generate your age key + +```bash +mkdir -p ~/.config/sops/age && age-keygen -o ~/.config/sops/age/keys.txt +# Output: public key: age1abc123... +``` + +This file is your private key. **Never commit it.** The `.gitignore` already blocks `keys.txt` and `*.agekey`.
+ +SOPS automatically looks for keys in `~/.config/sops/age/keys.txt` on Linux. On macOS the default location is `~/Library/Application Support/sops/age/keys.txt`, so either keep the key there or set `SOPS_AGE_KEY_FILE` to point at the file. + +### Add your public key to `.sops.yaml` + +Replace the `age1TODO_PEZ_PUBLIC_KEY` placeholder in `.sops.yaml` with your actual public key. Commit the updated `.sops.yaml`. + +## Day-to-day usage + +### Create a new encrypted file + +```bash +# SOPS picks the right age keys from .sops.yaml based on file path +sops services/authelia/config.enc.yml +# Opens your $EDITOR with a decrypted view. Save and quit to encrypt. +``` + +### Edit an existing encrypted file + +```bash +sops services/authelia/config.enc.yml +``` + +### Decrypt to stdout (for scripts/debugging) + +```bash +sops -d services/authelia/config.enc.yml +``` + +### Encrypt an existing plaintext file + +```bash +# If you have a plaintext file you want to encrypt in-place: +sops -e -i services/miniflux/miniflux.enc.env +``` + +### Add a new recipient + +When someone new needs access (or a new CI key is generated): + +1. Get their age public key +2. Add it to the relevant `creation_rules` in `.sops.yaml` +3. Re-encrypt all affected files: + +```bash +# Update keys on all encrypted files +find . -name '*.enc.*' -exec sops updatekeys {} \; +``` + +## CI / GitHub Actions + +The CI runner needs to decrypt secrets during deploys. Store the age secret key as a GitHub Actions secret: + +1. Generate a CI-specific age key: `age-keygen` +2. Add the **private key** (the `AGE-SECRET-KEY-1...` line) as a GitHub repository secret named `AGE_SECRET_KEY` +3. Add the **public key** to `.sops.yaml` (the CI recipient) + +In the workflow: + +```yaml +- name: Decrypt secrets + env: + SOPS_AGE_KEY: ${{ secrets.AGE_SECRET_KEY }} + run: | + sops -d ansible/group_vars/all/secrets.enc.yaml > ansible/group_vars/all/secrets.yml +``` + +The existing `ANSIBLE_VAULT_PASS` secret can be retired once migration to SOPS is complete.
+ +## Terraform integration + +Use the [terraform-provider-sops](https://github.com/carlpett/terraform-provider-sops) to read encrypted values directly: + +```hcl +provider "sops" {} + +data "sops_file" "secrets" { + source_file = "secrets.enc.yaml" +} + +# Use decrypted values +resource "cloudflare_dns_record" "example" { + content = data.sops_file.secrets.data["cloudflare_api_token"] +} +``` + +## What gets encrypted + +These are the types of secrets expected in this repo: + +| Category | Example | Location | +|----------|---------|----------| +| Ansible vault vars | SSH keys, API tokens, passwords | `ansible/group_vars/*/secrets.enc.yaml` | +| Docker env files | DB passwords, app secrets | `services/*/service.enc.env` | +| Terraform vars | Cloudflare API token, Azure creds | `terraform/secrets.enc.yaml` | +| Service configs | Authelia JWT secret, LLDAP admin pass | `services/*/config.enc.yml` | + +## Security notes + +- **Never commit `keys.txt`** or any file containing `AGE-SECRET-KEY`. The `.gitignore` blocks these. +- **Rotate keys** if a machine is compromised: generate new key, update `.sops.yaml`, re-encrypt all files, revoke the old key from `.sops.yaml`. +- **CI key is separate** from personal keys so it can be rotated independently. +- SOPS encrypted files contain metadata about which keys can decrypt them — this is intentional and not a secret. diff --git a/docs/services.md b/docs/services.md new file mode 100644 index 0000000..ead7dc0 --- /dev/null +++ b/docs/services.md @@ -0,0 +1,109 @@ +# Services + +Complete map of every service in the fleet — what it does, where it runs, how it's deployed, and whether it's behind auth.
+ +## helsinki-a — Gateway & Auth + +| Service | Port | Deployment | Auth | URL | +|---------|------|-----------|------|-----| +| Caddy | 80, 443 | Docker | — | (reverse proxy, no direct URL) | +| Authelia | 9091 | Docker | — | auth.pez.sh | +| Bitwarden (Vaultwarden) | 8443 | Docker | Own auth | bitwarden.pez.sh | +| LLDAP | 3890/17170 | Docker | — | (internal, used by Authelia) | + +Caddy is the single entry point for all public traffic. Authelia and LLDAP provide SSO. Bitwarden is on helsinki-a for availability — it needs to be reachable even if the London servers are down. + +## london-b — Storage & Media + +The workhorse. Threadripper 3970X, 64GB RAM, 64TB ZFS storage. Everything media-related lives here. + +### Media Servers + +| Service | Port | Deployment | Auth | URL | +|---------|------|-----------|------|-----| +| Plex | 32400 | Docker | Own auth | plex.pez.sh | +| Jellyfin | 8096 | Docker | Own auth | jellyfin.pez.sh | +| Navidrome | 4533 | Docker | Own auth | music.pez.sh | + +I run both Plex and Jellyfin — some clients work better with one than the other. Media is served directly from the ZFS pool. + +### Media Automation (Arr Stack) + +| Service | Port | Deployment | Auth | URL | +|---------|------|-----------|------|-----| +| Radarr | 7878 | Docker | Authelia | radarr.pez.sh | +| Sonarr | 8989 | Docker | Authelia | sonarr.pez.sh | +| Lidarr | 8686 | Docker | Authelia | lidarr.pez.sh | +| Readarr | 8787 | Docker | Authelia | readarr.pez.sh | +| Prowlarr | 9696 | Docker | Authelia | prowlarr.pez.sh | +| Transmission | 9091 | Docker | Authelia | download.pez.sh | +| Jellyseerr | 5055 | Docker | Own auth | request.pez.sh | + +The arr stack pipeline: Jellyseerr accepts requests → Radarr/Sonarr/Lidarr/Readarr search via Prowlarr → sends to Transmission → downloaded content is moved to the library → Plex and Jellyfin pick it up automatically. 
+ +### Other + +| Service | Port | Deployment | Auth | URL | +|---------|------|-----------|------|-----| +| Nextcloud AIO | 11000 | Docker | Own auth | cloud.pez.sh | +| slskd (Soulseek) | 5030 | Docker | Authelia | soulseek.pez.sh | +| smartctl exporter | 9633 | Docker | — | (scraped by Prometheus) | +| prom-plex-exporter | — | Docker | — | (scraped by Prometheus) | + +## london-a — Monitoring + +Dedicated monitoring host running FreeBSD. Very lightly loaded. + +| Service | Port | Deployment | Auth | URL | +|---------|------|-----------|------|-----| +| Prometheus | 9090 | Native | Authelia | prometheus.pez.sh | +| Grafana | 3000 | Native | Authelia | grafana.pez.sh | + +See [monitoring.md](monitoring.md) for details on scrape targets, dashboards, and exporters. + +## nuremberg-a — Mail + +Dedicated mail server on Hetzner Cloud. Isolated to protect IP reputation. + +| Service | Port | Deployment | Auth | URL | +|---------|------|-----------|------|-----| +| poste.io | 25, 587, 993, 443 | Docker | Own auth | (webmail via direct access) | + +poste.io bundles everything — postfix, dovecot, rspamd, webmail — into a single container. Makes updates straightforward. + +## copenhagen-a — Gaming + +Game servers. Not publicly exposed via Caddy — accessed directly or over Tailscale. + +| Service | Port | Deployment | Auth | URL | +|---------|------|-----------|------|-----| +| Minecraft (PaperMC) | 25565 | Docker | — | (direct connection) | +| MaNGOS realmd | 3724 | Native (systemd) | — | (direct connection) | +| MaNGOS world | 8085 | Native (systemd) | — | (direct connection) | +| MariaDB | 3306 | Native | — | (local, used by MaNGOS) | + +MaNGOS Zero is a WoW 1.12 (Vanilla) private server. Runs natively under systemd as the `mangos` user from `/home/mangos/mangos/zero/`. Not containerised — it predates the Docker setup on this host. + +## copenhagen-c — Idle + +No active services. Available for future use. 
+ +## Exporters (Monitoring) + +These run on various hosts and are scraped by Prometheus: + +| Exporter | Host | What it monitors | +|----------|------|-----------------| +| node_exporter | All hosts | CPU, memory, disk, network | +| smartctl_exporter | london-b | Disk SMART health data | +| prom-plex-exporter | london-b | Plex activity metrics | + +## Auth Summary + +Services fall into two categories: + +**Behind Authelia** (SSO via Caddy forward_auth): +- Grafana, Prometheus, Radarr, Sonarr, Lidarr, Readarr, Prowlarr, Transmission, Soulseek, apps.pez.sh + +**Own auth** (handle login themselves): +- Bitwarden, Plex, Jellyfin, Nextcloud, Navidrome, Jellyseerr, poste.io diff --git a/terraform/.gitignore b/terraform/.gitignore new file mode 100644 index 0000000..89afbcb --- /dev/null +++ b/terraform/.gitignore @@ -0,0 +1,5 @@ +.terraform* +.terraform.lock* +secrets/*.hcl +secrets/*.yml +secrets/*.yaml diff --git a/terraform/Makefile b/terraform/Makefile new file mode 100644 index 0000000..26a228c --- /dev/null +++ b/terraform/Makefile @@ -0,0 +1,34 @@ +SECRETS_ENC := secrets.enc.yaml +SECRETS := secrets.yaml + +# Delete a target whose recipe failed, so a failed decrypt is retried +# instead of leaving an empty $(SECRETS) behind that looks up to date. +.DELETE_ON_ERROR: + +# Plaintext copy of the secrets. NOTE(review): confirm secrets.yaml is git-ignored; terraform/.gitignore only lists secrets/*. +$(SECRETS): $(SECRETS_ENC) + sops -d $< > $@ + +# Both values are read with sops while the Makefile is parsed, i.e. on every invocation (fmt and clean included). +AWS_ACCESS_KEY_ID := $(shell sops -d --extract '["backblaze_keyID"]' $(SECRETS_ENC)) +AWS_SECRET_ACCESS_KEY := $(shell sops -d --extract '["backblaze_applicationKey"]' $(SECRETS_ENC)) + +export AWS_ACCESS_KEY_ID +export AWS_SECRET_ACCESS_KEY + +.PHONY: init plan apply fmt clean + +init: $(SECRETS) + tofu init + +plan: init + tofu plan + +apply: init + tofu apply + +fmt: + tofu fmt + +clean: + rm -f $(SECRETS) diff --git a/terraform/README.md b/terraform/README.md new file mode 100644 index 0000000..7ee79a9 --- /dev/null +++ b/terraform/README.md @@ -0,0 +1,22 @@ +# Terraform + +Infrastructure-as-code for cloud and edge services. Uses [OpenTofu](https://opentofu.org/) (drop-in Terraform replacement).
+ +## What's managed + +- **Cloudflare DNS** — All `pez.sh` records (A, CNAME, MX, TXT) + +## CI/CD + +The original GitHub Actions workflow (`apply.yml`) ran plan on push to master, then applied with manual approval via a `prod` environment gate. This workflow lived in the standalone `pez-terraform` repo and would need adapting for the monorepo structure (e.g., path-filtered triggers). + +## Provider versions + +| Provider | Source | Version | +|----------|--------|---------| +| Cloudflare | `cloudflare/cloudflare` | `~> 5.18` | +| OpenTofu | — | `>= 1.6.0` | + +## Migrated from + +This directory replaces the standalone [`pez-terraform`](https://github.com/RWejlgaard/pez-terraform) repo. diff --git a/terraform/cloudflare_account.tf b/terraform/cloudflare_account.tf new file mode 100644 index 0000000..293ee13 --- /dev/null +++ b/terraform/cloudflare_account.tf @@ -0,0 +1,3 @@ +resource "cloudflare_account" "this" { + name = "Pez Solutions" +} diff --git a/terraform/cloudflare_dns.tf b/terraform/cloudflare_dns.tf new file mode 100644 index 0000000..23bc04b --- /dev/null +++ b/terraform/cloudflare_dns.tf @@ -0,0 +1,477 @@ +resource "cloudflare_zone" "pez-sh" { + account = { + id = cloudflare_account.this.id + } + name = "pez.sh" +} + +# ============================================================================= +# A Records +# ============================================================================= + +resource "cloudflare_dns_record" "ecp-dev-0o9lix" { + zone_id = cloudflare_zone.pez-sh.id + name = "0o9lix.ecp-dev" + type = "A" + content = "0.0.0.0" + proxied = false + ttl = 300 +} + +resource "cloudflare_dns_record" "alertmanager" { + zone_id = cloudflare_zone.pez-sh.id + name = "alertmanager" + type = "A" + content = "65.108.48.44" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "apps" { + zone_id = cloudflare_zone.pez-sh.id + name = "apps" + type = "A" + content = "65.108.48.44" + proxied = false + ttl = 1 +} + +resource 
"cloudflare_dns_record" "auth" { + zone_id = cloudflare_zone.pez-sh.id + name = "auth" + type = "A" + content = "65.108.48.44" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "bitwarden" { + zone_id = cloudflare_zone.pez-sh.id + name = "bitwarden" + type = "A" + content = "65.108.48.44" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "chimera" { + zone_id = cloudflare_zone.pez-sh.id + name = "chimera" + type = "A" + content = "13.43.223.167" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "cloud" { + zone_id = cloudflare_zone.pez-sh.id + name = "cloud" + type = "A" + content = "65.108.48.44" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "download" { + zone_id = cloudflare_zone.pez-sh.id + name = "download" + type = "A" + content = "65.108.48.44" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "git" { + zone_id = cloudflare_zone.pez-sh.id + name = "git" + type = "A" + content = "65.108.48.44" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "gopher" { + zone_id = cloudflare_zone.pez-sh.id + name = "gopher" + type = "A" + content = "83.94.248.182" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "grafana" { + zone_id = cloudflare_zone.pez-sh.id + name = "grafana" + type = "A" + content = "65.108.48.44" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "helsinki-a" { + zone_id = cloudflare_zone.pez-sh.id + name = "helsinki-a" + type = "A" + content = "65.108.48.44" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "jellyfin" { + zone_id = cloudflare_zone.pez-sh.id + name = "jellyfin" + type = "A" + content = "65.108.48.44" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "jellyfin-requests" { + zone_id = cloudflare_zone.pez-sh.id + name = "jellyfin-requests" + type = "A" + content = "65.108.48.44" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "ldap" { + zone_id = 
cloudflare_zone.pez-sh.id + name = "ldap" + type = "A" + content = "65.108.48.44" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "lidarr" { + zone_id = cloudflare_zone.pez-sh.id + name = "lidarr" + type = "A" + content = "65.108.48.44" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "mail-a" { + zone_id = cloudflare_zone.pez-sh.id + name = "mail" + type = "A" + content = "167.235.134.154" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "minecraft" { + zone_id = cloudflare_zone.pez-sh.id + name = "minecraft" + type = "A" + content = "83.94.248.182" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "music" { + zone_id = cloudflare_zone.pez-sh.id + name = "music" + type = "A" + content = "65.108.48.44" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "naveen" { + zone_id = cloudflare_zone.pez-sh.id + name = "naveen" + type = "A" + content = "65.108.48.44" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "root" { + zone_id = cloudflare_zone.pez-sh.id + name = "@" + type = "A" + content = "65.108.48.44" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "plex" { + zone_id = cloudflare_zone.pez-sh.id + name = "plex" + type = "A" + content = "65.108.48.44" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "prometheus" { + zone_id = cloudflare_zone.pez-sh.id + name = "prometheus" + type = "A" + content = "65.108.48.44" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "prowlarr" { + zone_id = cloudflare_zone.pez-sh.id + name = "prowlarr" + type = "A" + content = "65.108.48.44" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "radarr" { + zone_id = cloudflare_zone.pez-sh.id + name = "radarr" + type = "A" + content = "65.108.48.44" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "readarr" { + zone_id = cloudflare_zone.pez-sh.id + name = "readarr" + type = "A" + content = 
"65.108.48.44" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "request" { + zone_id = cloudflare_zone.pez-sh.id + name = "request" + type = "A" + content = "65.108.48.44" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "rss" { + zone_id = cloudflare_zone.pez-sh.id + name = "rss" + type = "A" + content = "65.108.48.44" + proxied = true + ttl = 1 +} + +resource "cloudflare_dns_record" "satisfactory" { + zone_id = cloudflare_zone.pez-sh.id + name = "satisfactory" + type = "A" + content = "162.55.55.2" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "sonarr" { + zone_id = cloudflare_zone.pez-sh.id + name = "sonarr" + type = "A" + content = "65.108.48.44" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "soulseek" { + zone_id = cloudflare_zone.pez-sh.id + name = "soulseek" + type = "A" + content = "65.108.48.44" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "status" { + zone_id = cloudflare_zone.pez-sh.id + name = "status" + type = "A" + content = "65.108.48.44" + proxied = true + ttl = 1 +} + +resource "cloudflare_dns_record" "thiswebsitedoesnotexist" { + zone_id = cloudflare_zone.pez-sh.id + name = "thiswebsitedoesnotexist" + type = "A" + content = "65.108.48.44" + proxied = true + ttl = 1 +} + +resource "cloudflare_dns_record" "webdav" { + zone_id = cloudflare_zone.pez-sh.id + name = "webdav" + type = "A" + content = "65.108.48.44" + proxied = false + ttl = 1 +} + +resource "cloudflare_dns_record" "wow" { + zone_id = cloudflare_zone.pez-sh.id + name = "wow" + type = "A" + content = "83.94.248.182" + proxied = false + ttl = 1 +} + +# ============================================================================= +# AAAA Records +# ============================================================================= + +resource "cloudflare_dns_record" "mail-aaaa" { + zone_id = cloudflare_zone.pez-sh.id + name = "mail" + type = "AAAA" + content = "2a01:4f8:1c1e:9c53::1" + proxied = 
false + ttl = 1 +} + +# ============================================================================= +# CNAME Records +# ============================================================================= + +resource "cloudflare_dns_record" "public" { + zone_id = cloudflare_zone.pez-sh.id + name = "public" + type = "CNAME" + content = "public.r2.dev" + proxied = true + ttl = 1 +} + +# ============================================================================= +# HTTPS Records +# ============================================================================= + +resource "cloudflare_dns_record" "status-https" { + zone_id = cloudflare_zone.pez-sh.id + name = "status" # same DNS name as the proxied "status" A record + type = "HTTPS" + data = { + priority = 100 + target = "https://pezsolutions.statuspage.io." # NOTE(review): HTTPS/SVCB targets are normally bare host names, not URLs; confirm this value is intended + value = "ipv6hint=\"::1\"" # NOTE(review): ::1 is the IPv6 loopback address; confirm + } + ttl = 1 +} + +# ============================================================================= +# MX Records +# ============================================================================= + +resource "cloudflare_dns_record" "root-mx-10" { + zone_id = cloudflare_zone.pez-sh.id + name = "@" + type = "MX" + content = "mail.pez.sh" + priority = 10 + ttl = 1 +} + +resource "cloudflare_dns_record" "root-mx-20" { + zone_id = cloudflare_zone.pez-sh.id + name = "@" + type = "MX" + content = "mail.pez.sh" + priority = 20 # NOTE(review): same target as root-mx-10; only the priority differs + ttl = 1 +} + +# ============================================================================= +# PTR Records +# ============================================================================= + +resource "cloudflare_dns_record" "ptr-83-94-248-182" { + zone_id = cloudflare_zone.pez-sh.id + name = "83.94.248.182" # NOTE(review): PTR created inside the forward zone and named after an IPv4 address; confirm it is needed + type = "PTR" + content = "mail.pez.sh" + ttl = 1 +} + +resource "cloudflare_dns_record" "mail-ptr" { + zone_id = cloudflare_zone.pez-sh.id + name = "mail" + type = "PTR" + content = "154.134.235.167.in-addr.arpa" # NOTE(review): this is the reverse-lookup name of 167.235.134.154 (the "mail" A record) used as the PTR target; looks inverted, confirm + ttl = 1 +} + +# ============================================================================= +# TXT Records +# 
============================================================================= + +resource "cloudflare_dns_record" "dkim" { + zone_id = cloudflare_zone.pez-sh.id + name = "dkim._domainkey" # DKIM selector "dkim" + type = "TXT" + content = "v=DKIM1;k=rsa;t=s;s=email;p=MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAmT/TGkPkfbjleqRYuQoI67/xvM0J5gGmdlzo2jO5qTABz5+nzOS+PefrXkeEZ0IZrpLPKqLyi7K469Ql+HG5wDFDxQRRG7lHJkWJ4tnZgjZWgeszFPhoME74lT6i+j3x29WyxhyzNg0f3NhSwttOe5knmS4zsOb+JK4jShoF9zZkOUCHAZ/vKvYtJdV+8qpmU8wfgyrzN1OWxjHIjzPP8iMD4g0iCfobbvSvWXHYBveCS7b/Nr3jw3E8twtEAUEGYNGd4h0wKNbNagYUsb5My8tMxQQwZf6imKHgCeYC7buH8TvaJHATReeea4Dzj9UzdPgwdbFLiMB/HXlN0GPhlQIDAQAB" + ttl = 1 +} + +resource "cloudflare_dns_record" "dmarc" { + zone_id = cloudflare_zone.pez-sh.id + name = "_dmarc" + type = "TXT" + content = "v=DMARC1; p=none; rua=mailto:pez@pez.sh" # p=none is report-only: failures are reported to rua but not rejected or quarantined + ttl = 1 +} + +resource "cloudflare_dns_record" "root-txt-spf" { + zone_id = cloudflare_zone.pez-sh.id + name = "@" + type = "TXT" + content = "v=spf1 include:_spf.protonmail.ch ~all" # NOTE(review): only Proton Mail senders are authorised, while the MX records point at mail.pez.sh; confirm mail.pez.sh never sends for this domain + ttl = 1 +} + +resource "cloudflare_dns_record" "root-txt-protonmail" { + zone_id = cloudflare_zone.pez-sh.id + name = "@" + type = "TXT" + content = "protonmail-verification=66cf5eff60c61c46a0d36b108c5cfbddc4f2eede" + ttl = 1 +} + +resource "cloudflare_dns_record" "root-txt-keybase" { + zone_id = cloudflare_zone.pez-sh.id + name = "@" + type = "TXT" + content = "keybase-site-verification=ur7GwlgtEEPgIZ-2P0fyFsniuu6YwdkluO7N6LkymK0" + ttl = 1 +} + +resource "cloudflare_dns_record" "root-txt-ms" { + zone_id = cloudflare_zone.pez-sh.id + name = "@" + type = "TXT" + content = "MS=ms99554544" + ttl = 300 # the one explicit TTL among the TXT records here; the others use 1 +} + +resource "cloudflare_dns_record" "root-txt-google" { + zone_id = cloudflare_zone.pez-sh.id + name = "@" + type = "TXT" + content = "google-site-verification=BZD6ITg5SFnc7mQcb9KGkPwhP9gQKDZgw4nrFOZ0Y0w" + ttl = 1 +} + +resource "cloudflare_dns_record" "root-txt-apple" { + zone_id = cloudflare_zone.pez-sh.id + name = "@" + type = "TXT" + content = 
"apple-domain=1zXuOydmezm51GT8" + ttl = 1 +} diff --git a/terraform/providers.tf b/terraform/providers.tf new file mode 100644 index 0000000..3d6ac60 --- /dev/null +++ b/terraform/providers.tf @@ -0,0 +1,24 @@ +terraform { + required_version = ">= 1.6.0" + + required_providers { + cloudflare = { + source = "cloudflare/cloudflare" + } + } + + backend "s3" { + bucket = "pez-infra-tfstate" + key = "tfstate/terraform.tfstate" + endpoints = { s3 = "s3.eu-central-003.backblazeb2.com" } + region = "eu-central-003" + skip_credentials_validation = true + skip_region_validation = true + # Credentials read from AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY env vars + } +} + +provider "cloudflare" { + email = local.secrets["cloudflare_email"] + api_token = local.secrets["cloudflare_api_key"] +} diff --git a/terraform/secrets.enc.yaml b/terraform/secrets.enc.yaml new file mode 100644 index 0000000..9b65c39 --- /dev/null +++ b/terraform/secrets.enc.yaml @@ -0,0 +1,20 @@ +cloudflare_email: ENC[AES256_GCM,data:IOxyqjzQbw+9zg==,iv:bvMQ3JncMf2suPpshwsgtRm5h1UlQ6kAEm7cB/ExM3w=,tag:R9ZcED/RaW16wnqG99ym8A==,type:str] +cloudflare_api_key: ENC[AES256_GCM,data:z1NWHsh4jJ+QAGILfJuKgkrBjjGKoEh2mlSER3LL8vnG8gMDbVsm9O3hkuMfsMxPsY+zbXs=,iv:sw1+gfPIf8auqdDZO3VTtSOhoi0XNsSca0EbEFWZJuI=,tag:hT9Wjls99sE2jdNVSNQtkQ==,type:str] +backblaze_keyID: ENC[AES256_GCM,data:YneBYL27E8lmSULI9w/HLtizqMrk5nDu2Q==,iv:/gNeG2yy4Em/SIjh7i2tGV+8+KYk/d4/UHceDBM6II8=,tag:pfN0ghvcUDQxYKZdIrWUfQ==,type:str] +backblaze_keyName: ENC[AES256_GCM,data:9tKnmmQWDTO3FHZ3D01Isvo=,iv:wLdbiPj5rgIn9Yeu5w+tOnJ2PdRtCFQLP4rncZHxN6w=,tag:ADwSi5oz613meQjPa3kshw==,type:str] +backblaze_applicationKey: ENC[AES256_GCM,data:veIMwboFDx414vVp+kKw2uYRraayZ1DUswTKQMjfsg==,iv:dYdDd71uNPURiPGuieastA4/TtskVNq6uwsDM6Dl1JQ=,tag:jMnV0ydgTrq3zl6F6V5PPQ==,type:str] +sops: + age: + - recipient: age1r8uh2w2qad2z5sgq9q7l73962q2sp8zz9hdnh6sjuvanxl565vmswn8squ + enc: | + -----BEGIN AGE ENCRYPTED FILE----- + 
YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSAyOHlsaHZKRzJLUjhTamha + cmtFN0J3eEFNaERDNDFlbUd0dWIxV25tMVRBClJIZU55N1lLTFYxblRXd3dma0pX + UnZzeGoyMHR0UWxkM3RaNmloUTBFUHMKLS0tIHB5TmdIWEY4dWJUQWNZcVUwV1or + ekhtYkVLZ1hBbEZEakhXeUh0UW94QTgKdEY6mwWVQpMtaAYn+tnXFUvBk9QvzFX4 + ai91WDaO/iRtHluOSp5HxRVh2BNO4uH4opXQEthUIkQzLGtDTUN1uw== + -----END AGE ENCRYPTED FILE----- + lastmodified: "2026-03-22T21:04:06Z" + mac: ENC[AES256_GCM,data:6nWRb9Ne7YlcgAiJQAPx7zO51Fb2qAIup5qUG72b3s+XHbutTO5KGefWEx4/flmQx+ctbQ8fRWPOxBHECnB2xVkU0OgehGWAxKXpalnDSMp3cSjXE/Zjisd6H3U5gm8ilRysfCQE1SL8RvZCWWsKI3v89acP+ADYcU9NNOHswbc=,iv:qiWX7JFgsNgwjRPTYNNORDRUj96HRaVopN69qTAD+pM=,tag:qHw27PIvY2hhcYTLY4VPnQ==,type:str] + unencrypted_suffix: _unencrypted + version: 3.12.2 diff --git a/terraform/vars.tf b/terraform/vars.tf new file mode 100644 index 0000000..a5994bb --- /dev/null +++ b/terraform/vars.tf @@ -0,0 +1,3 @@ +locals { # local.secrets: the YAML secrets file decoded into a map; the cloudflare provider block reads its keys + secrets = yamldecode(file("${path.module}/secrets.yaml")) # NOTE(review): reads secrets.yaml, while the file added next to this one is secrets.enc.yaml; presumably it is decrypted with sops before plan/apply, confirm +}