-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathloki-vl-proxy-prometheusrule.yaml
More file actions
302 lines (288 loc) · 17.4 KB
/
loki-vl-proxy-prometheusrule.yaml
File metadata and controls
302 lines (288 loc) · 17.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
# Alert rule group for loki-vl-proxy. The {{ ... }} expressions below are Helm
# Go-template directives resolved at chart render time; {{` ... `}} spans are
# emitted verbatim so Prometheus/vmalert can expand $labels/$value at runtime.
# NOTE(review): `groupName` is not a standard PrometheusRule field — presumably
# the consuming template maps it onto `spec.groups[].name`; confirm against the
# chart's prometheusrule template before renaming.
groupName: loki-vl-proxy
interval: 30s
rules:
# Availability: the scrape target for this release has reported up == 0 for 1m.
- alert: LokiVLProxyDown
  expr: 'up{job="{{ include "loki-vl-proxy.fullname" . }}"} == 0'
  for: 1m
  labels:
    severity: critical
    service: loki-vl-proxy
    component: availability
    category: platform
    # Standard routing labels, overridable via values with sane defaults.
    team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
    owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
    source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
    managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
  annotations:
    summary: "Loki-VL-proxy is down"
    description: 'Instance {{`{{ $labels.instance }}`}} has been down for >1m.'
    impact: "All Loki-compatible read paths through this instance are unavailable."
    action: "Check pod health, readiness endpoint, and backend connectivity."
    runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-down.md'
# Reliability: percentage of 5xx responses across all proxy requests > 5% for 5m.
- alert: LokiVLProxyHighErrorRate
  expr: |
    100 * sum(rate(loki_vl_proxy_requests_total{job="{{ include "loki-vl-proxy.fullname" . }}",status=~"5.."}[5m]))
    / sum(rate(loki_vl_proxy_requests_total{job="{{ include "loki-vl-proxy.fullname" . }}"}[5m])) > 5
  for: 5m
  labels:
    severity: warning
    service: loki-vl-proxy
    component: request-path
    category: reliability
    # Standard routing labels, overridable via values with sane defaults.
    team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
    owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
    source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
    managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
  annotations:
    summary: 'High 5xx error rate ({{`{{ $value | printf "%.1f" }}`}}%)'
    description: "Proxy 5xx response ratio is above 5% for 5 minutes."
    impact: "Grafana users may see intermittent query failures."
    action: "Inspect proxy logs and backend status for failing endpoints."
    runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-vl-proxy/blob/main/docs/runbooks" | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-high-error-rate.md'
# Performance: p99 latency of the query_range endpoint exceeds 10s for 5m.
- alert: LokiVLProxyHighLatency
  expr: |
    histogram_quantile(0.99,
      sum(rate(loki_vl_proxy_request_duration_seconds_bucket{job="{{ include "loki-vl-proxy.fullname" . }}",endpoint="query_range"}[5m])) by (le)
    ) > 10
  for: 5m
  labels:
    severity: warning
    service: loki-vl-proxy
    component: query-range
    category: performance
    # Standard routing labels, overridable via values with sane defaults.
    team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
    owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
    source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
    managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
  annotations:
    summary: "P99 query_range latency >10s"
    description: "P99 query_range latency has exceeded 10s for 5 minutes."
    impact: "Dashboards and Explore become slow for most users."
    action: "Check backend latency, cache hit ratio, and query volume spikes."
    runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-high-latency.md'
# Dependency: p95 backend (VictoriaLogs) latency per endpoint exceeds 5s for 5m.
- alert: LokiVLProxyBackendHighLatency
  expr: |
    histogram_quantile(0.95,
      sum(rate(loki_vl_proxy_backend_duration_seconds_bucket{job="{{ include "loki-vl-proxy.fullname" . }}"}[5m])) by (le, endpoint)
    ) > 5
  for: 5m
  labels:
    severity: warning
    service: loki-vl-proxy
    component: backend
    category: dependency
    # Standard routing labels, overridable via values with sane defaults.
    team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
    owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
    source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
    managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
  annotations:
    summary: "Backend p95 latency >5s"
    description: "VictoriaLogs backend p95 latency is above 5s for 5 minutes."
    impact: "Query endpoints may remain up but user-facing latency degrades."
    action: "Identify slow backend endpoints and correlate with backend CPU/IO pressure."
    runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-backend-high-latency.md'
# Dependency: sustained rate of 502 responses (>0.5/s) indicates the backend
# cannot be reached at all, so this pages at critical after only 2m.
- alert: LokiVLProxyBackendUnreachable
  expr: |
    sum(rate(loki_vl_proxy_requests_total{job="{{ include "loki-vl-proxy.fullname" . }}",status="502"}[5m])) > 0.5
  for: 2m
  labels:
    severity: critical
    service: loki-vl-proxy
    component: backend
    category: dependency
    # Standard routing labels, overridable via values with sane defaults.
    team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
    owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
    source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
    managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
  annotations:
    summary: "VictoriaLogs backend unreachable"
    description: "Proxy sees sustained 502 responses while querying VictoriaLogs."
    impact: "Read-path compatibility is degraded or unavailable."
    action: "Check VictoriaLogs health, network paths, and auth headers."
    runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-backend-unreachable.md'
# Resiliency: the proxy's circuit breaker gauge reports state == 1 (open).
# NOTE(review): assumes 1 encodes "open" in this metric — confirm against the
# proxy's metric docs (half-open states sometimes use a separate value).
- alert: LokiVLProxyCircuitBreakerOpen
  expr: 'loki_vl_proxy_circuit_breaker_state{job="{{ include "loki-vl-proxy.fullname" . }}"} == 1'
  for: 1m
  labels:
    severity: critical
    service: loki-vl-proxy
    component: resiliency
    category: dependency
    # Standard routing labels, overridable via values with sane defaults.
    team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
    owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
    source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
    managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
  annotations:
    summary: "Circuit breaker is open"
    description: "Circuit breaker has opened due to repeated upstream failures."
    impact: "Requests are temporarily short-circuited to protect the backend."
    action: "Investigate backend stability and recent 5xx bursts."
    runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-circuit-breaker-open.md'
# Multitenancy: per-tenant 5xx ratio above 10% for 5m (one alert per tenant).
# NOTE(review): unlike the other rules, this expr has no job selector, so it
# matches loki_vl_proxy_tenant_requests_total from any job — confirm intended.
- alert: LokiVLProxyTenantHighErrorRate
  expr: |
    100 * sum(rate(loki_vl_proxy_tenant_requests_total{status=~"5.."}[5m])) by (tenant)
    / sum(rate(loki_vl_proxy_tenant_requests_total[5m])) by (tenant) > 10
  for: 5m
  labels:
    severity: warning
    service: loki-vl-proxy
    component: multitenancy
    category: tenant
    # Standard routing labels, overridable via values with sane defaults.
    team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
    owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
    source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
    managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
  annotations:
    summary: 'Tenant {{`{{ $labels.tenant }}`}} high error rate'
    description: 'Tenant {{`{{ $labels.tenant }}`}} exceeded 10% 5xx ratio for 5 minutes.'
    impact: "Tenant-scoped dashboards and queries are failing."
    action: "Inspect tenant-specific query patterns and backend limits."
    runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-tenant-high-error-rate.md'
# Traffic protection: sustained rate-limited client errors above 0.5/s for 5m.
# NOTE(review): no job selector here (unlike the job-scoped rules above) —
# confirm this is intentional.
- alert: LokiVLProxyRateLimiting
  expr: 'sum(rate(loki_vl_proxy_client_errors_total{reason="rate_limited"}[5m])) > 0.5'
  for: 5m
  labels:
    severity: warning
    service: loki-vl-proxy
    component: protection
    category: traffic
    # Standard routing labels, overridable via values with sane defaults.
    team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
    owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
    source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
    managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
  annotations:
    summary: "Rate limiting active"
    description: "Rate-limited client errors are sustained above threshold."
    impact: "Some clients are throttled and may observe partial failures."
    action: "Review client request bursts and adjust rate limiter settings if needed."
    runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-rate-limiting.md'
# Traffic: per-endpoint bad_request errors above 2/s for 10m (one alert per endpoint).
- alert: LokiVLProxyClientBadRequestBurst
  expr: |
    sum(rate(loki_vl_proxy_client_errors_total{reason="bad_request"}[5m])) by (endpoint) > 2
  for: 10m
  labels:
    severity: warning
    service: loki-vl-proxy
    component: client-errors
    category: traffic
    # Standard routing labels, overridable via values with sane defaults.
    team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
    owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
    source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
    managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
  annotations:
    summary: "Client bad_request burst on {{`{{ $labels.endpoint }}`}}"
    description: "Bad request errors exceed 2 req/s for 10 minutes on endpoint {{`{{ $labels.endpoint }}`}}."
    impact: "Clients experience hard failures and retry storms can increase platform load."
    action: "Identify top offending clients and inspect rejected query patterns."
    runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-client-bad-request-burst.md'
# Compatibility contract: any response tuple mode outside the allowed set
# (default_2tuple, categorize_labels_3tuple) is a contract violation.
- alert: LokiVLProxyUnexpectedTupleMode
  expr: 'sum(rate(loki_vl_proxy_response_tuple_mode_total{mode!~"default_2tuple|categorize_labels_3tuple"}[10m])) > 0'
  for: 5m
  labels:
    severity: critical
    service: loki-vl-proxy
    component: tuple-contract
    category: compatibility
    # Standard routing labels, overridable via values with sane defaults.
    team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
    owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
    source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
    managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
  annotations:
    summary: "Unexpected tuple mode emitted"
    description: "Proxy emitted tuple modes outside the strict contract set: default_2tuple and categorize_labels_3tuple."
    impact: "Client tuple-shape compatibility may regress and decode failures can follow."
    action: "Validate tuple contract smoke script and inspect recent query/merge/stream code changes."
    runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-grafana-tuple-contract.md'
# Compatibility coverage: tuple-mode traffic is flowing, but none of it used
# the default_2tuple mode for 10m — the strict 2-tuple path is untested.
- alert: LokiVLProxyDefault2TupleMissing
  expr: |
    sum(rate(loki_vl_proxy_response_tuple_mode_total{mode=~"default_2tuple|categorize_labels_3tuple"}[10m])) > 0
    and
    sum(rate(loki_vl_proxy_response_tuple_mode_total{mode="default_2tuple"}[10m])) == 0
  for: 10m
  labels:
    severity: warning
    service: loki-vl-proxy
    component: tuple-contract
    category: compatibility
    # Standard routing labels, overridable via values with sane defaults.
    team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
    owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
    source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
    managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
  annotations:
    summary: "Default 2-tuple emissions missing"
    description: "Tuple-mode traffic exists but mode=default_2tuple stayed at zero for 10 minutes."
    impact: "All traffic may be forcing categorize-labels, reducing strict 2-tuple compatibility coverage."
    action: "Run tuple smoke canary and verify callers only send categorize-labels where 3-tuples are expected."
    runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-grafana-tuple-contract.md'
# Observability meta-alert: the system-resource metric family has disappeared,
# which blinds all of the capacity/pressure rules below.
- alert: LokiVLProxySystemMetricsMissing
  expr: 'absent(process_memory_usage_ratio{job="{{ include "loki-vl-proxy.fullname" . }}"})'
  for: 15m
  labels:
    severity: warning
    service: loki-vl-proxy
    component: system-metrics
    category: observability
    # Standard routing labels, overridable via values with sane defaults.
    team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
    owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
    source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
    managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
  annotations:
    summary: "System resource metrics missing"
    description: "process_memory_usage_ratio is missing for loki-vl-proxy for 15 minutes."
    impact: "CPU/memory/disk/network/pressure visibility is degraded and related alerts cannot fire."
    action: "Check startup diagnostics, ensure /metrics exposure, and if running in Kubernetes enable host /proc mount plus -proc-root=/host/proc."
    runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-system-resources.md'
# Capacity: peak memory-usage ratio over any 10m window stayed above 90%.
- alert: LokiVLProxySystemMemoryHigh
  expr: 'max_over_time(process_memory_usage_ratio{job="{{ include "loki-vl-proxy.fullname" . }}"}[10m]) > 0.90'
  for: 10m
  labels:
    severity: warning
    service: loki-vl-proxy
    component: system-memory
    category: capacity
    # Standard routing labels, overridable via values with sane defaults.
    team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
    owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
    source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
    managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
  annotations:
    summary: "System memory usage > 90%"
    description: "process_memory_usage_ratio stayed above 90% for 10 minutes."
    impact: "Query latency and backend request failures may increase under memory pressure."
    action: "Correlate with process RSS/open FDs and node pressure, then reduce query load or increase pod/node capacity."
    runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-system-resources.md'
# Performance: CPU PSI "some" ratio (60s window series) peaked above 30% for 10m.
- alert: LokiVLProxySystemCPUPressureHigh
  expr: 'max_over_time(process_pressure_cpu_some_ratio{job="{{ include "loki-vl-proxy.fullname" . }}",window="60s"}[10m]) > 0.30'
  for: 10m
  labels:
    severity: warning
    service: loki-vl-proxy
    component: system-cpu
    category: performance
    # Standard routing labels, overridable via values with sane defaults.
    team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
    owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
    source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
    managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
  annotations:
    summary: "CPU pressure elevated"
    description: "CPU PSI some ratio (60s window) stayed above 30% for 10 minutes."
    impact: "Scheduler contention can cause request tail latency and timeout amplification."
    action: "Check CPU mode split, top endpoints/tenants, and scale replicas or tune expensive queries."
    runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-system-resources.md'
# Performance: I/O PSI "some" ratio (60s window series) peaked above 20% for 10m.
- alert: LokiVLProxySystemIOPressureHigh
  expr: 'max_over_time(process_pressure_io_some_ratio{job="{{ include "loki-vl-proxy.fullname" . }}",window="60s"}[10m]) > 0.20'
  for: 10m
  labels:
    severity: warning
    service: loki-vl-proxy
    component: system-io
    category: performance
    # Standard routing labels, overridable via values with sane defaults.
    team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
    owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
    source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
    managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
  annotations:
    summary: "I/O pressure elevated"
    description: "I/O PSI some ratio (60s window) stayed above 20% for 10 minutes."
    impact: "Disk/network stalls can slow backend and cache operations, increasing user-facing latency."
    action: "Check disk and network throughput trends, cache path storage performance, and backend saturation."
    runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-system-resources.md'