Skip to content

Commit 10e3f34

Browse files
authored
fix: serve stale read data on backend errors (#217)
1 parent d7a54e2 commit 10e3f34

7 files changed

Lines changed: 313 additions & 48 deletions

File tree

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
### Bug Fixes
11+
12+
- read-path/hardening: stop converting backend failures on Drilldown `detected_fields`, `detected_labels`, detected field values, `/index/volume`, and `/index/volume_range` into empty success payloads; these handlers now serve stale last-good cache entries when available and otherwise return real upstream-style errors, and volume helpers now reject non-success `/select/logsql/hits` responses instead of silently parsing them as empty data.
13+
14+
### Tests
15+
16+
- drilldown/cache: add regression coverage for stale-on-error recovery on detected fields, detected labels, detected field values, and near-now volume refreshes, plus cache coverage for reusing expired L1 entries as last-good fallback data.
17+
1018
## [1.9.5] - 2026-04-20
1119

1220
### Bug Fixes

internal/cache/cache.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,21 @@ func (c *Cache) GetWithTTL(key string) ([]byte, time.Duration, bool) {
148148
return nil, 0, false
149149
}
150150

151+
// GetStaleWithTTL returns a locally retained value even if its TTL has expired.
152+
// The returned TTL may be negative when the entry is stale. This never falls
153+
// back to L2/L3 because stale-on-error is only intended to reuse the serving
154+
// replica's last known-good payload.
155+
func (c *Cache) GetStaleWithTTL(key string) ([]byte, time.Duration, bool) {
156+
c.mu.RLock()
157+
defer c.mu.RUnlock()
158+
159+
e, ok := c.entries[key]
160+
if !ok {
161+
return nil, 0, false
162+
}
163+
return e.value, time.Until(e.expiresAt), true
164+
}
165+
151166
func (c *Cache) Get(key string) ([]byte, bool) {
152167
c.mu.Lock()
153168
e, ok := c.entries[key]

internal/cache/cache_coverage_test.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,25 @@ func TestGetWithTTL_Miss(t *testing.T) {
4848
}
4949
}
5050

51+
func TestGetStaleWithTTL_ReturnsExpiredEntry(t *testing.T) {
52+
c := NewWithMaxBytes(1*time.Millisecond, 100, 1024*1024)
53+
defer c.Close()
54+
55+
c.Set("stale", []byte("payload"))
56+
time.Sleep(5 * time.Millisecond)
57+
58+
val, ttl, ok := c.GetStaleWithTTL("stale")
59+
if !ok {
60+
t.Fatal("expected stale entry to be returned")
61+
}
62+
if string(val) != "payload" {
63+
t.Fatalf("unexpected stale payload %q", string(val))
64+
}
65+
if ttl >= 0 {
66+
t.Fatalf("expected negative remaining TTL for stale entry, got %v", ttl)
67+
}
68+
}
69+
5170
func TestCache_MaxEntrySizeBytes(t *testing.T) {
5271
c := NewWithMaxBytes(10*time.Second, 100, 1000)
5372
defer c.Close()

internal/proxy/drilldown_compat_test.go

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1301,6 +1301,123 @@ func TestDrilldown_DetectedFieldValues_IgnoreZeroHitNativeValues(t *testing.T) {
13011301
}
13021302
}
13031303

1304+
func TestDrilldown_DetectedFields_ServesStaleCacheOnBackendError(t *testing.T) {
1305+
vlBackend := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
1306+
http.Error(w, "backend unavailable", http.StatusBadGateway)
1307+
}))
1308+
defer vlBackend.Close()
1309+
1310+
p := newGapTestProxy(t, vlBackend.URL)
1311+
cacheKey := "detected_fields::query=%7Bservice_name%3D%22cached-svc%22%7D"
1312+
p.setJSONCacheWithTTL(cacheKey, time.Millisecond, map[string]interface{}{
1313+
"status": "success",
1314+
"data": []map[string]interface{}{
1315+
{"label": "cached_field", "type": "string", "cardinality": 1},
1316+
},
1317+
"fields": []map[string]interface{}{
1318+
{"label": "cached_field", "type": "string", "cardinality": 1},
1319+
},
1320+
"limit": 1000,
1321+
})
1322+
time.Sleep(5 * time.Millisecond)
1323+
1324+
w := httptest.NewRecorder()
1325+
r := httptest.NewRequest("GET", "/loki/api/v1/detected_fields?query=%7Bservice_name%3D%22cached-svc%22%7D", nil)
1326+
p.handleDetectedFields(w, r)
1327+
1328+
if w.Code != http.StatusOK {
1329+
t.Fatalf("expected stale cached detected_fields response, got %d body=%s", w.Code, w.Body.String())
1330+
}
1331+
var resp map[string]interface{}
1332+
mustUnmarshal(t, w.Body.Bytes(), &resp)
1333+
fields := resp["fields"].([]interface{})
1334+
if len(fields) != 1 || fields[0].(map[string]interface{})["label"] != "cached_field" {
1335+
t.Fatalf("expected stale cached detected_fields payload, got %v", resp)
1336+
}
1337+
}
1338+
1339+
func TestDrilldown_DetectedFields_ReturnsErrorWithoutCacheOnBackendFailure(t *testing.T) {
1340+
vlBackend := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
1341+
http.Error(w, "backend unavailable", http.StatusBadGateway)
1342+
}))
1343+
defer vlBackend.Close()
1344+
1345+
p := newGapTestProxy(t, vlBackend.URL)
1346+
w := httptest.NewRecorder()
1347+
r := httptest.NewRequest("GET", "/loki/api/v1/detected_fields?query=%7Bservice_name%3D%22svc%22%7D", nil)
1348+
p.handleDetectedFields(w, r)
1349+
1350+
if w.Code != http.StatusBadGateway {
1351+
t.Fatalf("expected 502 when detected_fields has no cache fallback, got %d body=%s", w.Code, w.Body.String())
1352+
}
1353+
}
1354+
1355+
func TestDrilldown_DetectedLabels_ServesStaleCacheOnBackendError(t *testing.T) {
1356+
vlBackend := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
1357+
http.Error(w, "backend unavailable", http.StatusBadGateway)
1358+
}))
1359+
defer vlBackend.Close()
1360+
1361+
p := newGapTestProxy(t, vlBackend.URL)
1362+
cacheKey := "detected_labels::query=%7Bk8s_cluster_name%3D%22ops-sand%22%7D"
1363+
p.setJSONCacheWithTTL(cacheKey, time.Millisecond, map[string]interface{}{
1364+
"status": "success",
1365+
"data": []map[string]interface{}{
1366+
{"label": "service_name", "cardinality": 1},
1367+
},
1368+
"detectedLabels": []map[string]interface{}{
1369+
{"label": "service_name", "cardinality": 1},
1370+
},
1371+
"limit": 1000,
1372+
})
1373+
time.Sleep(5 * time.Millisecond)
1374+
1375+
w := httptest.NewRecorder()
1376+
r := httptest.NewRequest("GET", "/loki/api/v1/detected_labels?query=%7Bk8s_cluster_name%3D%22ops-sand%22%7D", nil)
1377+
p.handleDetectedLabels(w, r)
1378+
1379+
if w.Code != http.StatusOK {
1380+
t.Fatalf("expected stale cached detected_labels response, got %d body=%s", w.Code, w.Body.String())
1381+
}
1382+
var resp map[string]interface{}
1383+
mustUnmarshal(t, w.Body.Bytes(), &resp)
1384+
items := resp["detectedLabels"].([]interface{})
1385+
if len(items) != 1 || items[0].(map[string]interface{})["label"] != "service_name" {
1386+
t.Fatalf("expected stale cached detected_labels payload, got %v", resp)
1387+
}
1388+
}
1389+
1390+
func TestDrilldown_DetectedFieldValues_ServesStaleCacheOnBackendError(t *testing.T) {
1391+
vlBackend := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
1392+
http.Error(w, "backend unavailable", http.StatusBadGateway)
1393+
}))
1394+
defer vlBackend.Close()
1395+
1396+
p := newGapTestProxy(t, vlBackend.URL)
1397+
cacheKey := "detected_field_values::service_name:query=%7Bservice_name%3D%22cached-svc%22%7D"
1398+
p.setJSONCacheWithTTL(cacheKey, time.Millisecond, map[string]interface{}{
1399+
"status": "success",
1400+
"data": []string{"cached-svc"},
1401+
"values": []string{"cached-svc"},
1402+
"limit": 1000,
1403+
})
1404+
time.Sleep(5 * time.Millisecond)
1405+
1406+
w := httptest.NewRecorder()
1407+
r := httptest.NewRequest("GET", "/loki/api/v1/detected_field/service_name/values?query=%7Bservice_name%3D%22cached-svc%22%7D", nil)
1408+
p.handleDetectedFieldValues(w, r)
1409+
1410+
if w.Code != http.StatusOK {
1411+
t.Fatalf("expected stale cached detected_field_values response, got %d body=%s", w.Code, w.Body.String())
1412+
}
1413+
var resp map[string]interface{}
1414+
mustUnmarshal(t, w.Body.Bytes(), &resp)
1415+
values := resp["values"].([]interface{})
1416+
if len(values) != 1 || values[0].(string) != "cached-svc" {
1417+
t.Fatalf("expected stale cached detected_field_values payload, got %v", resp)
1418+
}
1419+
}
1420+
13041421
func TestDetectedLabelValuesForField_ResolvesKnownUnderscoreAlias(t *testing.T) {
13051422
backend := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
13061423
switch r.URL.Path {

internal/proxy/perf_hardening_test.go

Lines changed: 7 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ func TestHandleDetectedLabelsReuseCachedScan(t *testing.T) {
8989
}
9090
}
9191

92-
func TestHandleDetectedLabels_BackendErrorReturnsEmptySuccess(t *testing.T) {
92+
func TestHandleDetectedLabels_BackendErrorReturnsGatewayError(t *testing.T) {
9393
backend := httptest.NewServer(http.NotFoundHandler())
9494
backendURL := backend.URL
9595
backend.Close()
@@ -100,20 +100,8 @@ func TestHandleDetectedLabels_BackendErrorReturnsEmptySuccess(t *testing.T) {
100100

101101
p.handleDetectedLabels(w, r)
102102

103-
if w.Code != http.StatusOK {
104-
t.Fatalf("expected 200, got %d body=%s", w.Code, w.Body.String())
105-
}
106-
var resp struct {
107-
Status string `json:"status"`
108-
Data []interface{} `json:"data"`
109-
DetectedLabels []interface{} `json:"detectedLabels"`
110-
Limit int `json:"limit"`
111-
}
112-
if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
113-
t.Fatalf("decode response: %v", err)
114-
}
115-
if resp.Status != "success" || len(resp.Data) != 0 || len(resp.DetectedLabels) != 0 || resp.Limit != 17 {
116-
t.Fatalf("expected empty detected_labels success response, got %#v", resp)
103+
if w.Code != http.StatusBadGateway {
104+
t.Fatalf("expected 502, got %d body=%s", w.Code, w.Body.String())
117105
}
118106
}
119107

@@ -144,8 +132,8 @@ func TestHandleDetectedLabels_DoesNotCacheTransientErrorFallback(t *testing.T) {
144132
w1 := httptest.NewRecorder()
145133
r1 := httptest.NewRequest(http.MethodGet, reqPath, nil)
146134
p.handleDetectedLabels(w1, r1)
147-
if w1.Code != http.StatusOK {
148-
t.Fatalf("first call expected 200, got %d body=%s", w1.Code, w1.Body.String())
135+
if w1.Code != http.StatusBadGateway {
136+
t.Fatalf("first call expected 502, got %d body=%s", w1.Code, w1.Body.String())
149137
}
150138

151139
fail.Store(false)
@@ -208,8 +196,8 @@ func TestHandleDetectedFields_DoesNotCacheTransientErrorFallback(t *testing.T) {
208196
w1 := httptest.NewRecorder()
209197
r1 := httptest.NewRequest(http.MethodGet, reqPath, nil)
210198
p.handleDetectedFields(w1, r1)
211-
if w1.Code != http.StatusOK {
212-
t.Fatalf("first call expected 200, got %d body=%s", w1.Code, w1.Body.String())
199+
if w1.Code != http.StatusBadGateway {
200+
t.Fatalf("first call expected 502, got %d body=%s", w1.Code, w1.Body.String())
213201
}
214202

215203
fail.Store(false)

0 commit comments

Comments
 (0)