-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathloki-vl-proxy-prometheusrule.yaml
More file actions
302 lines (288 loc) · 17.4 KB
/
loki-vl-proxy-prometheusrule.yaml
File metadata and controls
302 lines (288 loc) · 17.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
groupName: loki-vl-proxy
interval: 30s
rules:
- alert: LokiVLProxyDown
expr: 'up{job="{{ include "loki-vl-proxy.fullname" . }}"} == 0'
for: 1m
labels:
severity: critical
service: loki-vl-proxy
component: availability
category: platform
team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
annotations:
summary: "Loki-VL-proxy is down"
description: 'Instance {{`{{ $labels.instance }}`}} has been down for >1m.'
impact: "All Loki-compatible read paths through this instance are unavailable."
action: "Check pod health, readiness endpoint, and backend connectivity."
runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-down.md'
- alert: LokiVLProxyHighErrorRate
expr: |
100 * sum(rate(loki_vl_proxy_requests_total{job="{{ include "loki-vl-proxy.fullname" . }}",status=~"5.."}[5m]))
/ sum(rate(loki_vl_proxy_requests_total{job="{{ include "loki-vl-proxy.fullname" . }}"}[5m])) > 5
for: 5m
labels:
severity: warning
service: loki-vl-proxy
component: request-path
category: reliability
team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
annotations:
summary: 'High 5xx error rate ({{`{{ $value | printf "%.1f" }}`}}%)'
description: "Proxy 5xx response ratio is above 5% for 5 minutes."
impact: "Grafana users may see intermittent query failures."
action: "Inspect proxy logs and backend status for failing endpoints."
runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-high-error-rate.md'
- alert: LokiVLProxyHighLatency
expr: |
histogram_quantile(0.99,
sum(rate(loki_vl_proxy_request_duration_seconds_bucket{job="{{ include "loki-vl-proxy.fullname" . }}",endpoint="query_range"}[5m])) by (le)
) > 10
for: 5m
labels:
severity: warning
service: loki-vl-proxy
component: query-range
category: performance
team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
annotations:
summary: "P99 query_range latency >10s"
description: "P99 query_range latency has exceeded 10s for 5 minutes."
impact: "Dashboards and Explore become slow for most users."
action: "Check backend latency, cache hit ratio, and query volume spikes."
runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-high-latency.md'
- alert: LokiVLProxyBackendHighLatency
expr: |
histogram_quantile(0.95,
sum(rate(loki_vl_proxy_backend_duration_seconds_bucket{job="{{ include "loki-vl-proxy.fullname" . }}"}[5m])) by (le, endpoint)
) > 5
for: 5m
labels:
severity: warning
service: loki-vl-proxy
component: backend
category: dependency
team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
annotations:
summary: "Backend p95 latency >5s"
description: "VictoriaLogs backend p95 latency is above 5s for 5 minutes."
impact: "Query endpoints may remain up but user-facing latency degrades."
action: "Identify slow backend endpoints and correlate with backend CPU/IO pressure."
runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-backend-high-latency.md'
- alert: LokiVLProxyBackendUnreachable
expr: |
sum(rate(loki_vl_proxy_requests_total{job="{{ include "loki-vl-proxy.fullname" . }}",status="502"}[5m])) > 0.5
for: 2m
labels:
severity: critical
service: loki-vl-proxy
component: backend
category: dependency
team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
annotations:
summary: "VictoriaLogs backend unreachable"
description: "Proxy sees sustained 502 responses while querying VictoriaLogs."
impact: "Read-path compatibility is degraded or unavailable."
action: "Check VictoriaLogs health, network paths, and auth headers."
runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-backend-unreachable.md'
- alert: LokiVLProxyCircuitBreakerOpen
expr: loki_vl_proxy_circuit_breaker_state{job="{{ include "loki-vl-proxy.fullname" . }}"} == 1
for: 1m
labels:
severity: critical
service: loki-vl-proxy
component: resiliency
category: dependency
team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
annotations:
summary: "Circuit breaker is open"
description: "Circuit breaker has opened due to repeated upstream failures."
impact: "Requests are temporarily short-circuited to protect the backend."
action: "Investigate backend stability and recent 5xx bursts."
runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-circuit-breaker-open.md'
- alert: LokiVLProxyTenantHighErrorRate
expr: |
100 * sum(rate(loki_vl_proxy_tenant_requests_total{status=~"5.."}[5m])) by (tenant)
/ sum(rate(loki_vl_proxy_tenant_requests_total[5m])) by (tenant) > 10
for: 5m
labels:
severity: warning
service: loki-vl-proxy
component: multitenancy
category: tenant
team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
annotations:
summary: 'Tenant {{`{{ $labels.tenant }}`}} high error rate'
description: 'Tenant {{`{{ $labels.tenant }}`}} exceeded 10% 5xx ratio for 5 minutes.'
impact: "Tenant-scoped dashboards and queries are failing."
action: "Inspect tenant-specific query patterns and backend limits."
runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-tenant-high-error-rate.md'
- alert: LokiVLProxyRateLimiting
expr: sum(rate(loki_vl_proxy_client_errors_total{reason="rate_limited"}[5m])) > 0.5
for: 5m
labels:
severity: warning
service: loki-vl-proxy
component: protection
category: traffic
team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
annotations:
summary: "Rate limiting active"
description: "Rate-limited client errors are sustained above threshold."
impact: "Some clients are throttled and may observe partial failures."
action: "Review client request bursts and adjust rate limiter settings if needed."
runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-rate-limiting.md'
- alert: LokiVLProxyClientBadRequestBurst
expr: |
sum(rate(loki_vl_proxy_client_errors_total{reason="bad_request"}[5m])) by (endpoint) > 2
for: 10m
labels:
severity: warning
service: loki-vl-proxy
component: client-errors
category: traffic
team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
annotations:
summary: "Client bad_request burst on {{`{{ $labels.endpoint }}`}}"
description: "Bad request errors exceed 2 req/s for 10 minutes on endpoint {{`{{ $labels.endpoint }}`}}."
impact: "Clients experience hard failures and retry storms can increase platform load."
action: "Identify top offending clients and inspect rejected query patterns."
runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-client-bad-request-burst.md'
- alert: LokiVLProxyUnexpectedTupleMode
expr: sum(rate(loki_vl_proxy_response_tuple_mode_total{mode!~"default_2tuple|categorize_labels_3tuple"}[10m])) > 0
for: 5m
labels:
severity: critical
service: loki-vl-proxy
component: tuple-contract
category: compatibility
team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
annotations:
summary: "Unexpected tuple mode emitted"
description: "Proxy emitted tuple modes outside the strict contract set: default_2tuple and categorize_labels_3tuple."
impact: "Client tuple-shape compatibility may regress and decode failures can follow."
action: "Validate tuple contract smoke script and inspect recent query/merge/stream code changes."
runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-grafana-tuple-contract.md'
- alert: LokiVLProxyDefault2TupleMissing
expr: |
sum(rate(loki_vl_proxy_response_tuple_mode_total{mode=~"default_2tuple|categorize_labels_3tuple"}[10m])) > 0
and
sum(rate(loki_vl_proxy_response_tuple_mode_total{mode="default_2tuple"}[10m])) == 0
for: 10m
labels:
severity: warning
service: loki-vl-proxy
component: tuple-contract
category: compatibility
team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
annotations:
summary: "Default 2-tuple emissions missing"
description: "Tuple-mode traffic exists but mode=default_2tuple stayed at zero for 10 minutes."
impact: "All traffic may be forcing categorize-labels, reducing strict 2-tuple compatibility coverage."
action: "Run tuple smoke canary and verify callers only send categorize-labels where 3-tuples are expected."
runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-grafana-tuple-contract.md'
- alert: LokiVLProxySystemMetricsMissing
expr: absent(process_memory_usage_ratio{job="{{ include "loki-vl-proxy.fullname" . }}"})
for: 15m
labels:
severity: warning
service: loki-vl-proxy
component: system-metrics
category: observability
team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
annotations:
summary: "System resource metrics missing"
description: "process_memory_usage_ratio is missing for loki-vl-proxy for 15 minutes."
impact: "CPU/memory/disk/network/pressure visibility is degraded and related alerts cannot fire."
action: "Check startup diagnostics, ensure /metrics exposure, and if running in Kubernetes enable host /proc mount plus -proc-root=/host/proc."
runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-system-resources.md'
- alert: LokiVLProxySystemMemoryHigh
expr: max_over_time(process_memory_usage_ratio{job="{{ include "loki-vl-proxy.fullname" . }}"}[10m]) > 0.90
for: 10m
labels:
severity: warning
service: loki-vl-proxy
component: system-memory
category: capacity
team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
annotations:
summary: "System memory usage > 90%"
description: "process_memory_usage_ratio stayed above 90% for 10 minutes."
impact: "Query latency and backend request failures may increase under memory pressure."
action: "Correlate with process RSS/open FDs and node pressure, then reduce query load or increase pod/node capacity."
runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-system-resources.md'
- alert: LokiVLProxySystemCPUPressureHigh
expr: max_over_time(process_pressure_cpu_some_ratio{job="{{ include "loki-vl-proxy.fullname" . }}",window="60s"}[10m]) > 0.30
for: 10m
labels:
severity: warning
service: loki-vl-proxy
component: system-cpu
category: performance
team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
annotations:
summary: "CPU pressure elevated"
description: "CPU PSI some ratio (60s window) stayed above 30% for 10 minutes."
impact: "Scheduler contention can cause request tail latency and timeout amplification."
action: "Check CPU mode split, top endpoints/tenants, and scale replicas or tune expensive queries."
runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-system-resources.md'
- alert: LokiVLProxySystemIOPressureHigh
expr: max_over_time(process_pressure_io_some_ratio{job="{{ include "loki-vl-proxy.fullname" . }}",window="60s"}[10m]) > 0.20
for: 10m
labels:
severity: warning
service: loki-vl-proxy
component: system-io
category: performance
team: '{{ .Values.prometheusRule.standardLabels.team | default "sre" }}'
owner: '{{ .Values.prometheusRule.standardLabels.owner | default "reliablyobserve" }}'
source: '{{ .Values.prometheusRule.standardLabels.source | default "vmalert" }}'
managed_by: '{{ .Values.prometheusRule.standardLabels.managedBy | default "helm" }}'
annotations:
summary: "I/O pressure elevated"
description: "I/O PSI some ratio (60s window) stayed above 20% for 10 minutes."
impact: "Disk/network stalls can slow backend and cache operations, increasing user-facing latency."
action: "Check disk and network throughput trends, cache path storage performance, and backend saturation."
runbook_url: '{{ .Values.prometheusRule.runbookBaseUrl | default "https://github.com/ReliablyObserve/Loki-VL-proxy/blob/main/docs/runbooks" }}/loki-vl-proxy-system-resources.md'