Skip to content

Commit 1269a1d

Browse files
authored
cache: harden helper cache tiers and discovery (#219)
* cache: add tiered helper cache controls
* changelog: restore Unreleased entry for PR gate
* proxy: harden helper cache tiers and discovery
* cache: compress peer transports and trim snapshot rewrites
* cache: fix compat regressions and memoize helper keys
* drilldown: relax empty detected field value lookups
1 parent 6abee7c commit 1269a1d

23 files changed

Lines changed: 1626 additions & 231 deletions

CHANGELOG.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
### Bug Fixes
11+
12+
- cache/tiering: move helper/read caches onto shared fresh reads with local-memory plus local-disk persistence, keep stale fallback local-first, and expose per-tier cache lookup metrics.
13+
- cache/keys: canonicalize helper/read cache keys across query-param ordering and alias pairs such as `from`/`start`, `to`/`end`, and `q`/`search`, plus normalize effective detected-field limits so Grafana refreshes can reuse the same helper cache entries instead of churning near-identical keys.
14+
- drilldown/discovery: stop relaxing helper discovery queries after a successful empty primary result for label names, label values, native field values, and detected-label scans; successful empty strict detected-field value resolution now stays strict instead of broadening into relaxed query data, and `service_name` metadata lookup stays on metadata endpoints instead of spilling into streams/scans when metadata is sufficient.
15+
- metrics/cache: promote cache-tier stats into the shared metrics pipeline so `/metrics` and OTLP now export the same L1/L2/L3 request, hit, miss, stale-hit, backend-fallthrough, object, and byte series instead of keeping them as proxy-local text-only metrics.
16+
- peer/persistence: advertise peer write-through compression support on existing GET/hot responses, opportunistically compress owner write-through pushes only when the remote peer has confirmed support, accept compressed peer cache POST bodies, request compressed peer snapshot warm responses, and skip periodic snapshot rewrites when the on-disk patterns or label-values payload is unchanged.
17+
18+
### Tests
19+
20+
- cache/tiering: add regression coverage for TTL-aware disk fresh/stale reads, shared L2 promotion into L1, and helper cache locality.
21+
- discovery/keys: add regression coverage for canonical helper cache keys, for stopping relaxed discovery fallback after a successful empty primary result, and for OTLP/Prometheus cache-tier metric export.
22+
- peer/persistence: add regression coverage for compressed peer write-through/set round trips, compressed peer snapshot warm fetches, and skipping unchanged periodic snapshot rewrites.
23+
1024
## [1.9.6] - 2026-04-20
1125

1226
### Bug Fixes

charts/loki-vl-proxy/dashboards/loki-vl-proxy.json

Lines changed: 59 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -831,13 +831,67 @@
831831
{
832832
"title": "Fanout & Internal Ops",
833833
"type": "row",
834-
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 137},
834+
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 145},
835835
"collapsed": false
836836
},
837+
{
838+
"title": "Helper Cache Tier Requests / Hits",
839+
"type": "timeseries",
840+
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 137},
841+
"targets": [
842+
{
843+
"expr": "sum(rate(loki_vl_proxy_cache_tier_requests_total{job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}[$__rate_interval])) by (tier) or vector(0)",
844+
"legendFormat": "{{tier}} requests/s"
845+
},
846+
{
847+
"expr": "sum(rate(loki_vl_proxy_cache_tier_hits_total{job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}[$__rate_interval])) by (tier) or vector(0)",
848+
"legendFormat": "{{tier}} hits/s"
849+
}
850+
],
851+
"fieldConfig": {
852+
"defaults": {"unit": "reqps", "custom": {"fillOpacity": 10}}
853+
}
854+
},
855+
{
856+
"title": "Helper Cache Stale / Backend Fallthrough",
857+
"type": "timeseries",
858+
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 137},
859+
"targets": [
860+
{
861+
"expr": "sum(rate(loki_vl_proxy_cache_tier_stale_hits_total{job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}[$__rate_interval])) by (tier) or vector(0)",
862+
"legendFormat": "{{tier}} stale/s"
863+
},
864+
{
865+
"expr": "sum(rate(loki_vl_proxy_cache_backend_fallthrough_total{job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}[$__rate_interval])) or vector(0)",
866+
"legendFormat": "backend fallthrough/s"
867+
}
868+
],
869+
"fieldConfig": {
870+
"defaults": {"unit": "reqps", "custom": {"fillOpacity": 10}}
871+
}
872+
},
873+
{
874+
"title": "Cache Objects / Bytes by Tier",
875+
"type": "timeseries",
876+
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 137},
877+
"targets": [
878+
{
879+
"expr": "sum(loki_vl_proxy_cache_objects{job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}) by (tier) or vector(0)",
880+
"legendFormat": "{{tier}} objects"
881+
},
882+
{
883+
"expr": "sum(loki_vl_proxy_cache_bytes{job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}) by (tier) or vector(0)",
884+
"legendFormat": "{{tier}} bytes"
885+
}
886+
],
887+
"fieldConfig": {
888+
"defaults": {"unit": "decbytes", "custom": {"fillOpacity": 10}}
889+
}
890+
},
837891
{
838892
"title": "VL Child P99 Latency by Upstream Endpoint",
839893
"type": "timeseries",
840-
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 138},
894+
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 146},
841895
"targets": [
842896
{
843897
"expr": "histogram_quantile(0.99, sum(rate(loki_vl_proxy_backend_duration_seconds_bucket{system=\"vl\",direction=\"upstream\",job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}[$__rate_interval])) by (endpoint, le))",
@@ -851,7 +905,7 @@
851905
{
852906
"title": "P95 Upstream Calls per Downstream Request",
853907
"type": "timeseries",
854-
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 138},
908+
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 146},
855909
"targets": [
856910
{
857911
"expr": "histogram_quantile(0.95, sum(rate(loki_vl_proxy_upstream_calls_per_request_bucket{system=\"loki\",direction=\"downstream\",job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}[$__rate_interval])) by (endpoint, le))",
@@ -865,7 +919,7 @@
865919
{
866920
"title": "Internal Operations by Outcome",
867921
"type": "timeseries",
868-
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 138},
922+
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 146},
869923
"targets": [
870924
{
871925
"expr": "sum(rate(loki_vl_proxy_internal_operation_total{job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}[$__rate_interval])) by (operation, outcome)",
@@ -972,5 +1026,5 @@
9721026
"timezone": "browser",
9731027
"title": "Loki-VL-proxy Operations",
9741028
"uid": "loki-vl-proxy-metrics",
975-
"version": 13
1029+
"version": 14
9761030
}

dashboard/loki-vl-proxy.json

Lines changed: 59 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -831,13 +831,67 @@
831831
{
832832
"title": "Fanout & Internal Ops",
833833
"type": "row",
834-
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 137},
834+
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 145},
835835
"collapsed": false
836836
},
837+
{
838+
"title": "Helper Cache Tier Requests / Hits",
839+
"type": "timeseries",
840+
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 137},
841+
"targets": [
842+
{
843+
"expr": "sum(rate(loki_vl_proxy_cache_tier_requests_total{job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}[$__rate_interval])) by (tier) or vector(0)",
844+
"legendFormat": "{{tier}} requests/s"
845+
},
846+
{
847+
"expr": "sum(rate(loki_vl_proxy_cache_tier_hits_total{job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}[$__rate_interval])) by (tier) or vector(0)",
848+
"legendFormat": "{{tier}} hits/s"
849+
}
850+
],
851+
"fieldConfig": {
852+
"defaults": {"unit": "reqps", "custom": {"fillOpacity": 10}}
853+
}
854+
},
855+
{
856+
"title": "Helper Cache Stale / Backend Fallthrough",
857+
"type": "timeseries",
858+
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 137},
859+
"targets": [
860+
{
861+
"expr": "sum(rate(loki_vl_proxy_cache_tier_stale_hits_total{job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}[$__rate_interval])) by (tier) or vector(0)",
862+
"legendFormat": "{{tier}} stale/s"
863+
},
864+
{
865+
"expr": "sum(rate(loki_vl_proxy_cache_backend_fallthrough_total{job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}[$__rate_interval])) or vector(0)",
866+
"legendFormat": "backend fallthrough/s"
867+
}
868+
],
869+
"fieldConfig": {
870+
"defaults": {"unit": "reqps", "custom": {"fillOpacity": 10}}
871+
}
872+
},
873+
{
874+
"title": "Cache Objects / Bytes by Tier",
875+
"type": "timeseries",
876+
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 137},
877+
"targets": [
878+
{
879+
"expr": "sum(loki_vl_proxy_cache_objects{job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}) by (tier) or vector(0)",
880+
"legendFormat": "{{tier}} objects"
881+
},
882+
{
883+
"expr": "sum(loki_vl_proxy_cache_bytes{job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}) by (tier) or vector(0)",
884+
"legendFormat": "{{tier}} bytes"
885+
}
886+
],
887+
"fieldConfig": {
888+
"defaults": {"unit": "decbytes", "custom": {"fillOpacity": 10}}
889+
}
890+
},
837891
{
838892
"title": "VL Child P99 Latency by Upstream Endpoint",
839893
"type": "timeseries",
840-
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 138},
894+
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 146},
841895
"targets": [
842896
{
843897
"expr": "histogram_quantile(0.99, sum(rate(loki_vl_proxy_backend_duration_seconds_bucket{system=\"vl\",direction=\"upstream\",job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}[$__rate_interval])) by (endpoint, le))",
@@ -851,7 +905,7 @@
851905
{
852906
"title": "P95 Upstream Calls per Downstream Request",
853907
"type": "timeseries",
854-
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 138},
908+
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 146},
855909
"targets": [
856910
{
857911
"expr": "histogram_quantile(0.95, sum(rate(loki_vl_proxy_upstream_calls_per_request_bucket{system=\"loki\",direction=\"downstream\",job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}[$__rate_interval])) by (endpoint, le))",
@@ -865,7 +919,7 @@
865919
{
866920
"title": "Internal Operations by Outcome",
867921
"type": "timeseries",
868-
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 138},
922+
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 146},
869923
"targets": [
870924
{
871925
"expr": "sum(rate(loki_vl_proxy_internal_operation_total{job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}[$__rate_interval])) by (operation, outcome)",
@@ -972,5 +1026,5 @@
9721026
"timezone": "browser",
9731027
"title": "Loki-VL-proxy Operations",
9741028
"uid": "loki-vl-proxy-metrics",
975-
"version": 13
1029+
"version": 14
9761030
}

0 commit comments

Comments
 (0)