From c1580a06e24657781a0a2d36006a4bb74720a499 Mon Sep 17 00:00:00 2001 From: Rasmus Wejlgaard Date: Sat, 4 Apr 2026 08:54:58 +0000 Subject: [PATCH] fix grafana alert rules missing relativeTimeRange Grafana 12 requires explicit relativeTimeRange on all alert rule data queries. Without it, queries default to {from: 0, to: 0} which is rejected as invalid, causing Grafana to crash on startup during alerting provisioning. Added relativeTimeRange to all data entries: - Prometheus queries: {from: 600, to: 0} (10 min lookback) - Expression refs: {from: 0, to: 0} This was preventing Grafana from starting on london-a, which meant alerts (including Host Down for copenhagen-a) could never auto-resolve. --- .../provisioning/alerting/rules-critical.yml | 54 +++++++++++++++++++ .../provisioning/alerting/rules-warning.yml | 36 +++++++++++++ 2 files changed, 90 insertions(+) diff --git a/ansible/services/grafana/provisioning/alerting/rules-critical.yml b/ansible/services/grafana/provisioning/alerting/rules-critical.yml index 3216b01..c0bc714 100644 --- a/ansible/services/grafana/provisioning/alerting/rules-critical.yml +++ b/ansible/services/grafana/provisioning/alerting/rules-critical.yml @@ -17,6 +17,9 @@ groups: data: - refId: A datasourceUid: bezqqznn81wqof + relativeTimeRange: + from: 600 + to: 0 model: expr: up{job="node_exporter"} instant: true @@ -25,6 +28,9 @@ groups: refId: A - refId: B datasourceUid: __expr__ + relativeTimeRange: + from: 0 + to: 0 model: datasource: type: __expr__ @@ -37,6 +43,9 @@ groups: type: reduce - refId: C datasourceUid: __expr__ + relativeTimeRange: + from: 0 + to: 0 model: conditions: - evaluator: @@ -72,6 +81,9 @@ groups: data: - refId: A datasourceUid: bezqqznn81wqof + relativeTimeRange: + from: 600 + to: 0 model: expr: | ( @@ -86,6 +98,9 @@ groups: refId: A - refId: B datasourceUid: __expr__ + relativeTimeRange: + from: 0 + to: 0 model: datasource: type: __expr__ @@ -98,6 +113,9 @@ groups: type: reduce - refId: C datasourceUid: __expr__ + relativeTimeRange: + from: 0 + to: 0 model: conditions: - evaluator: @@ -133,6 +151,9 @@ groups: data: - refId: A datasourceUid: bezqqznn81wqof + relativeTimeRange: + from: 600 + to: 0 model: expr: | (1 - (node_memory_MemAvailable_bytes{job="node_exporter"} / node_memory_MemTotal_bytes{job="node_exporter"})) * 100 @@ -142,6 +163,9 @@ groups: refId: A - refId: B datasourceUid: __expr__ + relativeTimeRange: + from: 0 + to: 0 model: datasource: type: __expr__ @@ -154,6 +178,9 @@ groups: type: reduce - refId: C datasourceUid: __expr__ + relativeTimeRange: + from: 0 + to: 0 model: conditions: - evaluator: @@ -189,6 +216,9 @@ groups: data: - refId: A datasourceUid: bezqqznn81wqof + relativeTimeRange: + from: 600 + to: 0 model: expr: smartctl_device_smart_status{job="smartmontools"} instant: true @@ -197,6 +227,9 @@ groups: refId: A - refId: B datasourceUid: __expr__ + relativeTimeRange: + from: 0 + to: 0 model: datasource: type: __expr__ @@ -209,6 +242,9 @@ groups: type: reduce - refId: C datasourceUid: __expr__ + relativeTimeRange: + from: 0 + to: 0 model: conditions: - evaluator: @@ -249,6 +285,9 @@ groups: data: - refId: A datasourceUid: bezqqznn81wqof + relativeTimeRange: + from: 600 + to: 0 model: expr: up{job="caddy"} instant: true @@ -257,6 +296,9 @@ groups: refId: A - refId: B datasourceUid: __expr__ + relativeTimeRange: + from: 0 + to: 0 model: datasource: type: __expr__ @@ -269,6 +311,9 @@ groups: type: reduce - refId: C datasourceUid: __expr__ + relativeTimeRange: + from: 0 + to: 0 model: conditions: - evaluator: @@ -309,6 +354,9 @@ groups: data: - refId: A datasourceUid: bezqqznn81wqof + relativeTimeRange: + from: 600 + to: 0 model: expr: up{job="plex"} instant: true @@ -317,6 +365,9 @@ groups: refId: A - refId: B datasourceUid: __expr__ + relativeTimeRange: + from: 0 + to: 0 model: datasource: type: __expr__ @@ -329,6 +380,9 @@ groups: type: reduce - refId: C datasourceUid: __expr__ + relativeTimeRange: + from: 0 + to: 0 model: conditions: - evaluator: diff --git a/ansible/services/grafana/provisioning/alerting/rules-warning.yml b/ansible/services/grafana/provisioning/alerting/rules-warning.yml index 58155f2..602532f 100644 --- a/ansible/services/grafana/provisioning/alerting/rules-warning.yml +++ b/ansible/services/grafana/provisioning/alerting/rules-warning.yml @@ -17,6 +17,9 @@ groups: data: - refId: A datasourceUid: bezqqznn81wqof + relativeTimeRange: + from: 600 + to: 0 model: expr: | ( @@ -31,6 +34,9 @@ groups: refId: A - refId: B datasourceUid: __expr__ + relativeTimeRange: + from: 0 + to: 0 model: datasource: type: __expr__ @@ -43,6 +49,9 @@ groups: type: reduce - refId: C datasourceUid: __expr__ + relativeTimeRange: + from: 0 + to: 0 model: conditions: - evaluator: @@ -78,6 +87,9 @@ groups: data: - refId: A datasourceUid: bezqqznn81wqof + relativeTimeRange: + from: 600 + to: 0 model: expr: | (1 - (node_memory_MemAvailable_bytes{job="node_exporter"} / node_memory_MemTotal_bytes{job="node_exporter"})) * 100 @@ -87,6 +99,9 @@ groups: refId: A - refId: B datasourceUid: __expr__ + relativeTimeRange: + from: 0 + to: 0 model: datasource: type: __expr__ @@ -99,6 +114,9 @@ groups: type: reduce - refId: C datasourceUid: __expr__ + relativeTimeRange: + from: 0 + to: 0 model: conditions: - evaluator: @@ -134,6 +152,9 @@ groups: data: - refId: A datasourceUid: bezqqznn81wqof + relativeTimeRange: + from: 600 + to: 0 model: expr: | 100 - (avg by (server, instance) (rate(node_cpu_seconds_total{job="node_exporter", mode="idle"}[5m])) * 100) @@ -143,6 +164,9 @@ groups: refId: A - refId: B datasourceUid: __expr__ + relativeTimeRange: + from: 0 + to: 0 model: datasource: type: __expr__ @@ -155,6 +179,9 @@ groups: type: reduce - refId: C datasourceUid: __expr__ + relativeTimeRange: + from: 0 + to: 0 model: conditions: - evaluator: @@ -190,6 +217,9 @@ groups: data: - refId: A datasourceUid: bezqqznn81wqof + relativeTimeRange: + from: 600 + to: 0 model: # Compare 15-minute load against number of CPUs expr: | @@ -200,6 +230,9 @@ groups: refId: A - refId: B datasourceUid: __expr__ + relativeTimeRange: + from: 0 + to: 0 model: datasource: type: __expr__ @@ -212,6 +245,9 @@ groups: type: reduce - refId: C datasourceUid: __expr__ + relativeTimeRange: + from: 0 + to: 0 model: conditions: - evaluator: