Skip to content

Commit b02d556

Browse files
committed
refactor: consolidate zerodep modules into _vendor/
Move _structlog.py and _sparse_search.py into _vendor/ directory and update all vendored modules to latest via zerodep CLI: - yaml: 0.3.0 → 0.3.1 - sparse_search: 0.2.2 → 0.3.2 - structlog: 0.3.0 (already latest) - jsonc: 0.3.0 (already latest)
1 parent c54d75e commit b02d556

File tree

7 files changed

+363
-326
lines changed

7 files changed

+363
-326
lines changed
Lines changed: 138 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
# /// zerodep
2-
# version = "0.2.2"
2+
# version = "0.3.2"
33
# deps = []
44
# tier = "medium"
55
# category = "utility"
6+
# note = "Install/update via `zerodep add sparse_search`"
67
# ///
78
"""Zero-dependency sparse text search with BM25 family and TF-IDF ranking.
89
@@ -568,156 +569,180 @@ def _idf(self, term: str) -> float:
568569
return 0.0
569570
return math.log((n - df + 0.5) / (df + 0.5) + 1.0)
570571

572+
def _weighted_term_freq(
573+
self,
574+
field_tfs: dict[str, int],
575+
field_weights: dict[str, float],
576+
) -> float:
577+
"""Compute BM25F weighted pseudo term frequency across fields."""
578+
return sum(field_weights.get(fn, 1.0) * tf for fn, tf in field_tfs.items())
579+
580+
def _weighted_doc_length(
581+
self,
582+
doc: _DocRecord,
583+
field_weights: dict[str, float],
584+
) -> float:
585+
"""Compute weighted document length across fields."""
586+
return sum(
587+
field_weights.get(fn, 1.0) * dl for fn, dl in doc.field_lengths.items()
588+
)
589+
590+
def _weighted_avg_doc_length(self, field_weights: dict[str, float]) -> float:
591+
"""Compute weighted average document length across fields."""
592+
avgdl = sum(w * self._avg_field_length(fn) for fn, w in field_weights.items())
593+
return avgdl if avgdl != 0 else 1.0
594+
595+
def _bm25_tf_norm(
596+
self,
597+
weighted_tf: float,
598+
weighted_dl: float,
599+
weighted_avgdl: float,
600+
) -> float:
601+
"""Compute BM25/BM25+/BM25L TF normalization factor."""
602+
k1, b, delta = self.k1, self.b, self.delta
603+
if self.variant == "bm25l":
604+
ctf = weighted_tf / (1.0 - b + b * weighted_dl / weighted_avgdl)
605+
return ((k1 + 1.0) * (ctf + delta)) / (k1 + ctf + delta)
606+
tf_norm = (weighted_tf * (k1 + 1.0)) / (
607+
weighted_tf + k1 * (1.0 - b + b * weighted_dl / weighted_avgdl)
608+
)
609+
return tf_norm + delta
610+
611+
def _calibrate_bm25_scores(
    self,
    raw_scores: dict[str, float],
    query_tokens: list[str],
    field_weights: dict[str, float],
) -> dict[str, float]:
    """Convert raw BM25 scores to calibrated probabilities via Bayesian BM25.

    Requires the fitted calibration parameters ``self._alpha``,
    ``self._beta`` and ``self._base_rate`` (callers check for None first).
    """
    # Invariant across documents: compute the weighted average length once.
    weighted_avgdl = self._weighted_avg_doc_length(field_weights)
    alpha = self._alpha
    beta = self._beta
    base_rate = self._base_rate

    calibrated: dict[str, float] = {}
    for doc_id, raw_score in raw_scores.items():
        record = self._docs[doc_id]
        # Total weighted TF of all query tokens within this document.
        total_tf = self._total_query_tf(doc_id, query_tokens, field_weights)
        # Length of this document relative to the collection average.
        doc_len_ratio = (
            self._weighted_doc_length(record, field_weights) / weighted_avgdl
        )
        calibrated[doc_id] = _score_to_probability(
            raw_score,
            total_tf,
            doc_len_ratio,
            alpha,
            beta,
            base_rate,  # type: ignore[arg-type]
        )
    return calibrated
638+
639+
def _total_query_tf(
640+
self,
641+
doc_id: str,
642+
query_tokens: list[str],
643+
field_weights: dict[str, float],
644+
) -> float:
645+
"""Sum weighted term frequencies for all query tokens in a document."""
646+
total = 0.0
647+
for token in query_tokens:
648+
postings = self._index.get(token)
649+
if postings is not None and doc_id in postings:
650+
total += self._weighted_term_freq(postings[doc_id], field_weights)
651+
return total
652+
571653
def _score_bm25(self, query_tokens: list[str]) -> dict[str, float]:
572654
"""Score documents using BM25 / BM25+ / BM25L / BM25F."""
573655
scores: dict[str, float] = defaultdict(float)
574-
k1 = self.k1
575-
b = self.b
576-
delta = self.delta
577656
field_weights = self.field_weights or {"_default": 1.0}
578-
is_bm25l = self.variant == "bm25l"
579657

580658
for token in query_tokens:
581659
if token not in self._index:
582660
continue
583661

584662
idf = self._idf(token)
585-
postings = self._index[token]
586-
587-
for doc_id, field_tfs in postings.items():
588-
# BM25F: weighted pseudo term frequency and document length
589-
weighted_tf = 0.0
590-
weighted_dl = 0.0
663+
weighted_avgdl = self._weighted_avg_doc_length(field_weights)
591664

665+
for doc_id, field_tfs in self._index[token].items():
592666
doc = self._docs[doc_id]
593-
for field_name, tf in field_tfs.items():
594-
w = field_weights.get(field_name, 1.0)
595-
weighted_tf += w * tf
596-
weighted_dl += w * doc.field_lengths.get(field_name, 0)
597-
598-
# Weighted average document length
599-
weighted_avgdl = 0.0
600-
for field_name, w in field_weights.items():
601-
weighted_avgdl += w * self._avg_field_length(field_name)
602-
603-
if weighted_avgdl == 0:
604-
weighted_avgdl = 1.0
605-
606-
if is_bm25l:
607-
# BM25L: adjusted TF normalization
608-
ctf = weighted_tf / (1.0 - b + b * weighted_dl / weighted_avgdl)
609-
tf_norm = ((k1 + 1.0) * (ctf + delta)) / (k1 + ctf + delta)
610-
else:
611-
# BM25 / BM25+
612-
tf_norm = (weighted_tf * (k1 + 1.0)) / (
613-
weighted_tf + k1 * (1.0 - b + b * weighted_dl / weighted_avgdl)
614-
)
615-
tf_norm += delta
616-
617-
scores[doc_id] += idf * tf_norm
667+
w_tf = self._weighted_term_freq(field_tfs, field_weights)
668+
w_dl = self._weighted_doc_length(doc, field_weights)
669+
scores[doc_id] += idf * self._bm25_tf_norm(w_tf, w_dl, weighted_avgdl)
618670

619671
if not self.calibrated or self._alpha is None or self._beta is None:
620672
return dict(scores)
621673

622-
# Bayesian calibration: convert raw scores to probabilities
623-
alpha = self._alpha
624-
beta = self._beta
625-
base_rate = self._base_rate
626-
calibrated_scores: dict[str, float] = {}
627-
for doc_id, raw_score in scores.items():
628-
doc = self._docs[doc_id]
629-
# Weighted TF across query tokens
630-
total_tf = 0.0
631-
for token in query_tokens:
632-
if token in self._index and doc_id in self._index[token]:
633-
for fn, tf in self._index[token][doc_id].items():
634-
total_tf += field_weights.get(fn, 1.0) * tf
635-
# Doc-length ratio
636-
weighted_dl = sum(
637-
field_weights.get(fn, 1.0) * dl for fn, dl in doc.field_lengths.items()
638-
)
639-
weighted_avgdl = (
640-
sum(
641-
field_weights.get(fn, 1.0) * self._avg_field_length(fn)
642-
for fn in field_weights
643-
)
644-
or 1.0
645-
)
646-
doc_len_ratio = weighted_dl / weighted_avgdl
647-
648-
calibrated_scores[doc_id] = _score_to_probability(
649-
raw_score,
650-
total_tf,
651-
doc_len_ratio,
652-
alpha,
653-
beta,
654-
base_rate,
655-
)
656-
657-
return calibrated_scores
674+
return self._calibrate_bm25_scores(dict(scores), query_tokens, field_weights)
658675

659676
# -- internal: TF-IDF scoring --------------------------------------------
660677

661-
def _score_tfidf(self, query_tokens: list[str]) -> dict[str, float]:
662-
"""Score documents using TF-IDF with cosine similarity."""
663-
n = len(self._docs)
664-
if n == 0:
665-
return {}
666-
667-
field_weights = self.field_weights or {"_default": 1.0}
678+
def _tfidf_idf(self, term: str, n: int) -> float:
679+
"""Compute smoothed IDF for TF-IDF scoring."""
680+
df = self._df.get(term, 0)
681+
if df == 0:
682+
return 0.0
683+
return math.log(n / df) + 1.0
668684

669-
# Build query TF-IDF vector (term -> weight)
685+
def _build_query_tfidf_vec(
686+
self, query_tokens: list[str], n: int
687+
) -> dict[str, float]:
688+
"""Build TF-IDF weighted vector for query terms."""
670689
query_tf: dict[str, int] = defaultdict(int)
671690
for token in query_tokens:
672691
query_tf[token] += 1
673692

674-
query_vec: dict[str, float] = {}
693+
vec: dict[str, float] = {}
675694
for term, tf in query_tf.items():
676-
df = self._df.get(term, 0)
677-
if df == 0:
695+
idf = self._tfidf_idf(term, n)
696+
if idf > 0:
697+
vec[term] = (1.0 + math.log(tf)) * idf
698+
return vec
699+
700+
def _doc_tfidf_vec(
701+
self,
702+
doc_id: str,
703+
n: int,
704+
field_weights: dict[str, float],
705+
) -> dict[str, float]:
706+
"""Build TF-IDF weighted vector for a document."""
707+
vec: dict[str, float] = {}
708+
for term in self._doc_terms.get(doc_id, ()):
709+
postings = self._index.get(term)
710+
if postings is None or doc_id not in postings:
711+
continue
712+
w_tf = self._weighted_term_freq(postings[doc_id], field_weights)
713+
if w_tf <= 0:
678714
continue
679-
idf = math.log(n / df) + 1.0 # smoothed IDF
680-
query_vec[term] = (1.0 + math.log(tf)) * idf
715+
idf = self._tfidf_idf(term, n)
716+
if idf > 0:
717+
vec[term] = (1.0 + math.log(w_tf)) * idf
718+
return vec
681719

720+
def _score_tfidf(self, query_tokens: list[str]) -> dict[str, float]:
721+
"""Score documents using TF-IDF with cosine similarity."""
722+
n = len(self._docs)
723+
if n == 0:
724+
return {}
725+
726+
field_weights = self.field_weights or {"_default": 1.0}
727+
query_vec = self._build_query_tfidf_vec(query_tokens, n)
682728
if not query_vec:
683729
return {}
684730

685731
query_norm = math.sqrt(sum(v * v for v in query_vec.values()))
686732

687-
# Score each candidate document
688-
scores: dict[str, float] = {}
733+
# Collect candidate documents that contain at least one query term
689734
candidates: set[str] = set()
690735
for term in query_vec:
691736
if term in self._index:
692737
candidates.update(self._index[term].keys())
693738

739+
scores: dict[str, float] = {}
694740
for doc_id in candidates:
695-
dot_product = 0.0
696-
doc_norm_sq = 0.0
697-
698-
# Collect all terms in this document for norm calculation
699-
doc_terms: dict[str, float] = {}
700-
for term in self._index:
701-
if doc_id not in self._index[term]:
702-
continue
703-
704-
field_tfs = self._index[term][doc_id]
705-
weighted_tf = sum(
706-
field_weights.get(fn, 1.0) * tf for fn, tf in field_tfs.items()
707-
)
708-
if weighted_tf <= 0:
709-
continue
710-
711-
df = self._df.get(term, 1)
712-
idf = math.log(n / df) + 1.0
713-
tfidf = (1.0 + math.log(weighted_tf)) * idf
714-
doc_terms[term] = tfidf
715-
716-
for term, tfidf in doc_terms.items():
717-
doc_norm_sq += tfidf * tfidf
718-
if term in query_vec:
719-
dot_product += tfidf * query_vec[term]
720-
741+
doc_vec = self._doc_tfidf_vec(doc_id, n, field_weights)
742+
dot_product = sum(
743+
w * query_vec[t] for t, w in doc_vec.items() if t in query_vec
744+
)
745+
doc_norm_sq = sum(w * w for w in doc_vec.values())
721746
if doc_norm_sq > 0 and dot_product > 0:
722747
scores[doc_id] = dot_product / (query_norm * math.sqrt(doc_norm_sq))
723748

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
# deps = []
44
# tier = "medium"
55
# category = "utility"
6+
# note = "Install/update via `zerodep add structlog`"
67
# ///
78

89
"""Zero-dependency structured logging with pretty console output.

0 commit comments

Comments
 (0)