ReliablyObserve
diff --git a/CHANGELOG.md
Lines changed: 14 additions & 0 deletions b/CHANGELOG.md
Lines changed: 14 additions & 0 deletions
diff --git a/charts/loki-vl-proxy/dashboards/loki-vl-proxy.json
Lines changed: 59 additions & 5 deletions b/charts/loki-vl-proxy/dashboards/loki-vl-proxy.json
Lines changed: 59 additions & 5 deletions
diff --git a/dashboard/loki-vl-proxy.json
Lines changed: 59 additions & 5 deletions b/dashboard/loki-vl-proxy.json
Lines changed: 59 additions & 5 deletions
@@ -7,6 +7,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Bug Fixes
+
+- cache/tiering: move helper/read caches onto shared fresh reads with local-memory plus local-disk persistence, keep stale fallback local-first, and expose per-tier cache lookup metrics.
+- cache/keys: canonicalize helper/read cache keys so they no longer depend on query-param ordering or on alias pairs such as `from`/`start`, `to`/`end`, and `q`/`search`, and normalize effective detected-field limits, so Grafana refreshes can reuse the same helper cache entries instead of churning near-identical keys.
+- drilldown/discovery: stop relaxing helper discovery queries when the primary query succeeds but returns an empty result, for label names, label values, native field values, and detected-label scans. Strict detected-field value resolution that succeeds with an empty result now stays strict instead of broadening into relaxed query data, and `service_name` metadata lookup stays on metadata endpoints instead of spilling into streams/scans when metadata is sufficient.
+- metrics/cache: promote cache-tier stats into the shared metrics pipeline so `/metrics` and OTLP now export the same L1/L2/L3 request, hit, miss, stale-hit, backend-fallthrough, object, and byte series instead of keeping them as proxy-local text-only metrics.
+- peer/persistence: advertise peer write-through compression support on existing GET/hot responses, opportunistically compress owner write-through pushes only when the remote peer has confirmed support, accept compressed peer cache POST bodies, request compressed peer snapshot warm responses, and skip periodic snapshot rewrites when the on-disk patterns or label-values payload is unchanged.
+
+### Tests
+
+- cache/tiering: add regression coverage for TTL-aware disk fresh/stale reads, shared L2 promotion into L1, and helper cache locality.
+- discovery/keys: add regression coverage for canonical helper cache keys, for stopping relaxed discovery fallback when the primary query succeeds with an empty result, and for OTLP/Prometheus cache-tier metric export.
+- peer/persistence: add regression coverage for compressed peer write-through/set round trips, compressed peer snapshot warm fetches, and skipping unchanged periodic snapshot rewrites.
+
 ## [1.9.6] - 2026-04-20
 
 ### Bug Fixes
 
@@ -831,13 +831,67 @@
     {
       "title": "Fanout & Internal Ops",
       "type": "row",
-      "gridPos": {"h": 1, "w": 24, "x": 0, "y": 137},
+      "gridPos": {"h": 1, "w": 24, "x": 0, "y": 145},
       "collapsed": false
     },
+    {
+      "title": "Helper Cache Tier Requests / Hits",
+      "type": "timeseries",
+      "gridPos": {"h": 8, "w": 8, "x": 0, "y": 137},
+      "targets": [
+        {
+          "expr": "sum(rate(loki_vl_proxy_cache_tier_requests_total{job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}[$__rate_interval])) by (tier) or vector(0)",
+          "legendFormat": "{{tier}} requests/s"
+        },
+        {
+          "expr": "sum(rate(loki_vl_proxy_cache_tier_hits_total{job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}[$__rate_interval])) by (tier) or vector(0)",
+          "legendFormat": "{{tier}} hits/s"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {"unit": "reqps", "custom": {"fillOpacity": 10}}
+      }
+    },
+    {
+      "title": "Helper Cache Stale / Backend Fallthrough",
+      "type": "timeseries",
+      "gridPos": {"h": 8, "w": 8, "x": 8, "y": 137},
+      "targets": [
+        {
+          "expr": "sum(rate(loki_vl_proxy_cache_tier_stale_hits_total{job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}[$__rate_interval])) by (tier) or vector(0)",
+          "legendFormat": "{{tier}} stale/s"
+        },
+        {
+          "expr": "sum(rate(loki_vl_proxy_cache_backend_fallthrough_total{job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}[$__rate_interval])) or vector(0)",
+          "legendFormat": "backend fallthrough/s"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {"unit": "reqps", "custom": {"fillOpacity": 10}}
+      }
+    },
+    {
+      "title": "Cache Objects / Bytes by Tier",
+      "type": "timeseries",
+      "gridPos": {"h": 8, "w": 8, "x": 16, "y": 137},
+      "targets": [
+        {
+          "expr": "sum(loki_vl_proxy_cache_objects{job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}) by (tier) or vector(0)",
+          "legendFormat": "{{tier}} objects"
+        },
+        {
+          "expr": "sum(loki_vl_proxy_cache_bytes{job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}) by (tier) or vector(0)",
+          "legendFormat": "{{tier}} bytes"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {"unit": "decbytes", "custom": {"fillOpacity": 10}}
+      }
+    },
     {
       "title": "VL Child P99 Latency by Upstream Endpoint",
       "type": "timeseries",
-      "gridPos": {"h": 8, "w": 8, "x": 0, "y": 138},
+      "gridPos": {"h": 8, "w": 8, "x": 0, "y": 146},
       "targets": [
         {
           "expr": "histogram_quantile(0.99, sum(rate(loki_vl_proxy_backend_duration_seconds_bucket{system=\"vl\",direction=\"upstream\",job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}[$__rate_interval])) by (endpoint, le))",
@@ -851,7 +905,7 @@
     {
       "title": "P95 Upstream Calls per Downstream Request",
       "type": "timeseries",
-      "gridPos": {"h": 8, "w": 8, "x": 8, "y": 138},
+      "gridPos": {"h": 8, "w": 8, "x": 8, "y": 146},
       "targets": [
         {
           "expr": "histogram_quantile(0.95, sum(rate(loki_vl_proxy_upstream_calls_per_request_bucket{system=\"loki\",direction=\"downstream\",job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}[$__rate_interval])) by (endpoint, le))",
@@ -865,7 +919,7 @@
     {
       "title": "Internal Operations by Outcome",
       "type": "timeseries",
-      "gridPos": {"h": 8, "w": 8, "x": 16, "y": 138},
+      "gridPos": {"h": 8, "w": 8, "x": 16, "y": 146},
       "targets": [
         {
           "expr": "sum(rate(loki_vl_proxy_internal_operation_total{job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}[$__rate_interval])) by (operation, outcome)",
@@ -972,5 +1026,5 @@
   "timezone": "browser",
   "title": "Loki-VL-proxy Operations",
   "uid": "loki-vl-proxy-metrics",
-  "version": 13
+  "version": 14
 }
@@ -831,13 +831,67 @@
     {
       "title": "Fanout & Internal Ops",
       "type": "row",
-      "gridPos": {"h": 1, "w": 24, "x": 0, "y": 137},
+      "gridPos": {"h": 1, "w": 24, "x": 0, "y": 145},
       "collapsed": false
     },
+    {
+      "title": "Helper Cache Tier Requests / Hits",
+      "type": "timeseries",
+      "gridPos": {"h": 8, "w": 8, "x": 0, "y": 137},
+      "targets": [
+        {
+          "expr": "sum(rate(loki_vl_proxy_cache_tier_requests_total{job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}[$__rate_interval])) by (tier) or vector(0)",
+          "legendFormat": "{{tier}} requests/s"
+        },
+        {
+          "expr": "sum(rate(loki_vl_proxy_cache_tier_hits_total{job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}[$__rate_interval])) by (tier) or vector(0)",
+          "legendFormat": "{{tier}} hits/s"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {"unit": "reqps", "custom": {"fillOpacity": 10}}
+      }
+    },
+    {
+      "title": "Helper Cache Stale / Backend Fallthrough",
+      "type": "timeseries",
+      "gridPos": {"h": 8, "w": 8, "x": 8, "y": 137},
+      "targets": [
+        {
+          "expr": "sum(rate(loki_vl_proxy_cache_tier_stale_hits_total{job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}[$__rate_interval])) by (tier) or vector(0)",
+          "legendFormat": "{{tier}} stale/s"
+        },
+        {
+          "expr": "sum(rate(loki_vl_proxy_cache_backend_fallthrough_total{job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}[$__rate_interval])) or vector(0)",
+          "legendFormat": "backend fallthrough/s"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {"unit": "reqps", "custom": {"fillOpacity": 10}}
+      }
+    },
+    {
+      "title": "Cache Objects / Bytes by Tier",
+      "type": "timeseries",
+      "gridPos": {"h": 8, "w": 8, "x": 16, "y": 137},
+      "targets": [
+        {
+          "expr": "sum(loki_vl_proxy_cache_objects{job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}) by (tier) or vector(0)",
+          "legendFormat": "{{tier}} objects"
+        },
+        {
+          "expr": "sum(loki_vl_proxy_cache_bytes{job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}) by (tier) or vector(0)",
+          "legendFormat": "{{tier}} bytes"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {"unit": "decbytes", "custom": {"fillOpacity": 10}}
+      }
+    },
     {
       "title": "VL Child P99 Latency by Upstream Endpoint",
       "type": "timeseries",
-      "gridPos": {"h": 8, "w": 8, "x": 0, "y": 138},
+      "gridPos": {"h": 8, "w": 8, "x": 0, "y": 146},
       "targets": [
         {
           "expr": "histogram_quantile(0.99, sum(rate(loki_vl_proxy_backend_duration_seconds_bucket{system=\"vl\",direction=\"upstream\",job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}[$__rate_interval])) by (endpoint, le))",
@@ -851,7 +905,7 @@
     {
       "title": "P95 Upstream Calls per Downstream Request",
       "type": "timeseries",
-      "gridPos": {"h": 8, "w": 8, "x": 8, "y": 138},
+      "gridPos": {"h": 8, "w": 8, "x": 8, "y": 146},
       "targets": [
         {
           "expr": "histogram_quantile(0.95, sum(rate(loki_vl_proxy_upstream_calls_per_request_bucket{system=\"loki\",direction=\"downstream\",job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}[$__rate_interval])) by (endpoint, le))",
@@ -865,7 +919,7 @@
     {
       "title": "Internal Operations by Outcome",
       "type": "timeseries",
-      "gridPos": {"h": 8, "w": 8, "x": 16, "y": 138},
+      "gridPos": {"h": 8, "w": 8, "x": 16, "y": 146},
       "targets": [
         {
           "expr": "sum(rate(loki_vl_proxy_internal_operation_total{job=~\"${job:regex}|${job:regex}-headless\",cluster=~\"${cluster:regex}\",env=~\"${env:regex}\",namespace=~\"${namespace:regex}|loki-vl-proxy\",service=~\"${service:regex}|${service:regex}-headless|\",pod=~\"${pod:regex}\"}[$__rate_interval])) by (operation, outcome)",
@@ -972,5 +1026,5 @@
   "timezone": "browser",
   "title": "Loki-VL-proxy Operations",
   "uid": "loki-vl-proxy-metrics",
-  "version": 13
+  "version": 14
 }