|
1 | 1 | # /// zerodep |
2 | | -# version = "0.2.2" |
| 2 | +# version = "0.3.2" |
3 | 3 | # deps = [] |
4 | 4 | # tier = "medium" |
5 | 5 | # category = "utility" |
| 6 | +# note = "Install/update via `zerodep add sparse_search`" |
6 | 7 | # /// |
7 | 8 | """Zero-dependency sparse text search with BM25 family and TF-IDF ranking. |
8 | 9 |
|
@@ -568,156 +569,180 @@ def _idf(self, term: str) -> float: |
568 | 569 | return 0.0 |
569 | 570 | return math.log((n - df + 0.5) / (df + 0.5) + 1.0) |
570 | 571 |
|
| 572 | + def _weighted_term_freq( |
| 573 | + self, |
| 574 | + field_tfs: dict[str, int], |
| 575 | + field_weights: dict[str, float], |
| 576 | + ) -> float: |
| 577 | + """Compute BM25F weighted pseudo term frequency across fields.""" |
| 578 | + return sum(field_weights.get(fn, 1.0) * tf for fn, tf in field_tfs.items()) |
| 579 | + |
| 580 | + def _weighted_doc_length( |
| 581 | + self, |
| 582 | + doc: _DocRecord, |
| 583 | + field_weights: dict[str, float], |
| 584 | + ) -> float: |
| 585 | + """Compute weighted document length across fields.""" |
| 586 | + return sum( |
| 587 | + field_weights.get(fn, 1.0) * dl for fn, dl in doc.field_lengths.items() |
| 588 | + ) |
| 589 | + |
| 590 | + def _weighted_avg_doc_length(self, field_weights: dict[str, float]) -> float: |
| 591 | + """Compute weighted average document length across fields.""" |
| 592 | + avgdl = sum(w * self._avg_field_length(fn) for fn, w in field_weights.items()) |
| 593 | + return avgdl if avgdl != 0 else 1.0 |
| 594 | + |
| 595 | + def _bm25_tf_norm( |
| 596 | + self, |
| 597 | + weighted_tf: float, |
| 598 | + weighted_dl: float, |
| 599 | + weighted_avgdl: float, |
| 600 | + ) -> float: |
| 601 | + """Compute BM25/BM25+/BM25L TF normalization factor.""" |
| 602 | + k1, b, delta = self.k1, self.b, self.delta |
| 603 | + if self.variant == "bm25l": |
| 604 | + ctf = weighted_tf / (1.0 - b + b * weighted_dl / weighted_avgdl) |
| 605 | + return ((k1 + 1.0) * (ctf + delta)) / (k1 + ctf + delta) |
| 606 | + tf_norm = (weighted_tf * (k1 + 1.0)) / ( |
| 607 | + weighted_tf + k1 * (1.0 - b + b * weighted_dl / weighted_avgdl) |
| 608 | + ) |
| 609 | + return tf_norm + delta |
| 610 | + |
def _calibrate_bm25_scores(
    self,
    raw_scores: dict[str, float],
    query_tokens: list[str],
    field_weights: dict[str, float],
) -> dict[str, float]:
    """Map each raw BM25 score through ``_score_to_probability``.

    For every scored document this gathers the summed weighted query-term
    frequency and the ratio of the document's weighted length to the
    weighted average length, then passes them with ``self._alpha``,
    ``self._beta`` and ``self._base_rate`` to ``_score_to_probability``.
    """
    alpha, beta, base_rate = self._alpha, self._beta, self._base_rate
    avgdl = self._weighted_avg_doc_length(field_weights)

    result: dict[str, float] = {}
    for doc_id, raw in raw_scores.items():
        record = self._docs[doc_id]
        tf_sum = self._total_query_tf(doc_id, query_tokens, field_weights)
        length_ratio = self._weighted_doc_length(record, field_weights) / avgdl
        result[doc_id] = _score_to_probability(
            raw,
            tf_sum,
            length_ratio,
            alpha,
            beta,
            base_rate,  # type: ignore[arg-type]
        )
    return result
| 638 | + |
| 639 | + def _total_query_tf( |
| 640 | + self, |
| 641 | + doc_id: str, |
| 642 | + query_tokens: list[str], |
| 643 | + field_weights: dict[str, float], |
| 644 | + ) -> float: |
| 645 | + """Sum weighted term frequencies for all query tokens in a document.""" |
| 646 | + total = 0.0 |
| 647 | + for token in query_tokens: |
| 648 | + postings = self._index.get(token) |
| 649 | + if postings is not None and doc_id in postings: |
| 650 | + total += self._weighted_term_freq(postings[doc_id], field_weights) |
| 651 | + return total |
| 652 | + |
571 | 653 | def _score_bm25(self, query_tokens: list[str]) -> dict[str, float]: |
572 | 654 | """Score documents using BM25 / BM25+ / BM25L / BM25F.""" |
573 | 655 | scores: dict[str, float] = defaultdict(float) |
574 | | - k1 = self.k1 |
575 | | - b = self.b |
576 | | - delta = self.delta |
577 | 656 | field_weights = self.field_weights or {"_default": 1.0} |
578 | | - is_bm25l = self.variant == "bm25l" |
579 | 657 |
|
580 | 658 | for token in query_tokens: |
581 | 659 | if token not in self._index: |
582 | 660 | continue |
583 | 661 |
|
584 | 662 | idf = self._idf(token) |
585 | | - postings = self._index[token] |
586 | | - |
587 | | - for doc_id, field_tfs in postings.items(): |
588 | | - # BM25F: weighted pseudo term frequency and document length |
589 | | - weighted_tf = 0.0 |
590 | | - weighted_dl = 0.0 |
| 663 | + weighted_avgdl = self._weighted_avg_doc_length(field_weights) |
591 | 664 |
|
| 665 | + for doc_id, field_tfs in self._index[token].items(): |
592 | 666 | doc = self._docs[doc_id] |
593 | | - for field_name, tf in field_tfs.items(): |
594 | | - w = field_weights.get(field_name, 1.0) |
595 | | - weighted_tf += w * tf |
596 | | - weighted_dl += w * doc.field_lengths.get(field_name, 0) |
597 | | - |
598 | | - # Weighted average document length |
599 | | - weighted_avgdl = 0.0 |
600 | | - for field_name, w in field_weights.items(): |
601 | | - weighted_avgdl += w * self._avg_field_length(field_name) |
602 | | - |
603 | | - if weighted_avgdl == 0: |
604 | | - weighted_avgdl = 1.0 |
605 | | - |
606 | | - if is_bm25l: |
607 | | - # BM25L: adjusted TF normalization |
608 | | - ctf = weighted_tf / (1.0 - b + b * weighted_dl / weighted_avgdl) |
609 | | - tf_norm = ((k1 + 1.0) * (ctf + delta)) / (k1 + ctf + delta) |
610 | | - else: |
611 | | - # BM25 / BM25+ |
612 | | - tf_norm = (weighted_tf * (k1 + 1.0)) / ( |
613 | | - weighted_tf + k1 * (1.0 - b + b * weighted_dl / weighted_avgdl) |
614 | | - ) |
615 | | - tf_norm += delta |
616 | | - |
617 | | - scores[doc_id] += idf * tf_norm |
| 667 | + w_tf = self._weighted_term_freq(field_tfs, field_weights) |
| 668 | + w_dl = self._weighted_doc_length(doc, field_weights) |
| 669 | + scores[doc_id] += idf * self._bm25_tf_norm(w_tf, w_dl, weighted_avgdl) |
618 | 670 |
|
619 | 671 | if not self.calibrated or self._alpha is None or self._beta is None: |
620 | 672 | return dict(scores) |
621 | 673 |
|
622 | | - # Bayesian calibration: convert raw scores to probabilities |
623 | | - alpha = self._alpha |
624 | | - beta = self._beta |
625 | | - base_rate = self._base_rate |
626 | | - calibrated_scores: dict[str, float] = {} |
627 | | - for doc_id, raw_score in scores.items(): |
628 | | - doc = self._docs[doc_id] |
629 | | - # Weighted TF across query tokens |
630 | | - total_tf = 0.0 |
631 | | - for token in query_tokens: |
632 | | - if token in self._index and doc_id in self._index[token]: |
633 | | - for fn, tf in self._index[token][doc_id].items(): |
634 | | - total_tf += field_weights.get(fn, 1.0) * tf |
635 | | - # Doc-length ratio |
636 | | - weighted_dl = sum( |
637 | | - field_weights.get(fn, 1.0) * dl for fn, dl in doc.field_lengths.items() |
638 | | - ) |
639 | | - weighted_avgdl = ( |
640 | | - sum( |
641 | | - field_weights.get(fn, 1.0) * self._avg_field_length(fn) |
642 | | - for fn in field_weights |
643 | | - ) |
644 | | - or 1.0 |
645 | | - ) |
646 | | - doc_len_ratio = weighted_dl / weighted_avgdl |
647 | | - |
648 | | - calibrated_scores[doc_id] = _score_to_probability( |
649 | | - raw_score, |
650 | | - total_tf, |
651 | | - doc_len_ratio, |
652 | | - alpha, |
653 | | - beta, |
654 | | - base_rate, |
655 | | - ) |
656 | | - |
657 | | - return calibrated_scores |
| 674 | + return self._calibrate_bm25_scores(dict(scores), query_tokens, field_weights) |
658 | 675 |
|
659 | 676 | # -- internal: TF-IDF scoring -------------------------------------------- |
660 | 677 |
|
661 | | - def _score_tfidf(self, query_tokens: list[str]) -> dict[str, float]: |
662 | | - """Score documents using TF-IDF with cosine similarity.""" |
663 | | - n = len(self._docs) |
664 | | - if n == 0: |
665 | | - return {} |
666 | | - |
667 | | - field_weights = self.field_weights or {"_default": 1.0} |
| 678 | + def _tfidf_idf(self, term: str, n: int) -> float: |
| 679 | + """Compute smoothed IDF for TF-IDF scoring.""" |
| 680 | + df = self._df.get(term, 0) |
| 681 | + if df == 0: |
| 682 | + return 0.0 |
| 683 | + return math.log(n / df) + 1.0 |
668 | 684 |
|
669 | | - # Build query TF-IDF vector (term -> weight) |
| 685 | + def _build_query_tfidf_vec( |
| 686 | + self, query_tokens: list[str], n: int |
| 687 | + ) -> dict[str, float]: |
| 688 | + """Build TF-IDF weighted vector for query terms.""" |
670 | 689 | query_tf: dict[str, int] = defaultdict(int) |
671 | 690 | for token in query_tokens: |
672 | 691 | query_tf[token] += 1 |
673 | 692 |
|
674 | | - query_vec: dict[str, float] = {} |
| 693 | + vec: dict[str, float] = {} |
675 | 694 | for term, tf in query_tf.items(): |
676 | | - df = self._df.get(term, 0) |
677 | | - if df == 0: |
| 695 | + idf = self._tfidf_idf(term, n) |
| 696 | + if idf > 0: |
| 697 | + vec[term] = (1.0 + math.log(tf)) * idf |
| 698 | + return vec |
| 699 | + |
| 700 | + def _doc_tfidf_vec( |
| 701 | + self, |
| 702 | + doc_id: str, |
| 703 | + n: int, |
| 704 | + field_weights: dict[str, float], |
| 705 | + ) -> dict[str, float]: |
| 706 | + """Build TF-IDF weighted vector for a document.""" |
| 707 | + vec: dict[str, float] = {} |
| 708 | + for term in self._doc_terms.get(doc_id, ()): |
| 709 | + postings = self._index.get(term) |
| 710 | + if postings is None or doc_id not in postings: |
| 711 | + continue |
| 712 | + w_tf = self._weighted_term_freq(postings[doc_id], field_weights) |
| 713 | + if w_tf <= 0: |
678 | 714 | continue |
679 | | - idf = math.log(n / df) + 1.0 # smoothed IDF |
680 | | - query_vec[term] = (1.0 + math.log(tf)) * idf |
| 715 | + idf = self._tfidf_idf(term, n) |
| 716 | + if idf > 0: |
| 717 | + vec[term] = (1.0 + math.log(w_tf)) * idf |
| 718 | + return vec |
681 | 719 |
|
| 720 | + def _score_tfidf(self, query_tokens: list[str]) -> dict[str, float]: |
| 721 | + """Score documents using TF-IDF with cosine similarity.""" |
| 722 | + n = len(self._docs) |
| 723 | + if n == 0: |
| 724 | + return {} |
| 725 | + |
| 726 | + field_weights = self.field_weights or {"_default": 1.0} |
| 727 | + query_vec = self._build_query_tfidf_vec(query_tokens, n) |
682 | 728 | if not query_vec: |
683 | 729 | return {} |
684 | 730 |
|
685 | 731 | query_norm = math.sqrt(sum(v * v for v in query_vec.values())) |
686 | 732 |
|
687 | | - # Score each candidate document |
688 | | - scores: dict[str, float] = {} |
| 733 | + # Collect candidate documents that contain at least one query term |
689 | 734 | candidates: set[str] = set() |
690 | 735 | for term in query_vec: |
691 | 736 | if term in self._index: |
692 | 737 | candidates.update(self._index[term].keys()) |
693 | 738 |
|
| 739 | + scores: dict[str, float] = {} |
694 | 740 | for doc_id in candidates: |
695 | | - dot_product = 0.0 |
696 | | - doc_norm_sq = 0.0 |
697 | | - |
698 | | - # Collect all terms in this document for norm calculation |
699 | | - doc_terms: dict[str, float] = {} |
700 | | - for term in self._index: |
701 | | - if doc_id not in self._index[term]: |
702 | | - continue |
703 | | - |
704 | | - field_tfs = self._index[term][doc_id] |
705 | | - weighted_tf = sum( |
706 | | - field_weights.get(fn, 1.0) * tf for fn, tf in field_tfs.items() |
707 | | - ) |
708 | | - if weighted_tf <= 0: |
709 | | - continue |
710 | | - |
711 | | - df = self._df.get(term, 1) |
712 | | - idf = math.log(n / df) + 1.0 |
713 | | - tfidf = (1.0 + math.log(weighted_tf)) * idf |
714 | | - doc_terms[term] = tfidf |
715 | | - |
716 | | - for term, tfidf in doc_terms.items(): |
717 | | - doc_norm_sq += tfidf * tfidf |
718 | | - if term in query_vec: |
719 | | - dot_product += tfidf * query_vec[term] |
720 | | - |
| 741 | + doc_vec = self._doc_tfidf_vec(doc_id, n, field_weights) |
| 742 | + dot_product = sum( |
| 743 | + w * query_vec[t] for t, w in doc_vec.items() if t in query_vec |
| 744 | + ) |
| 745 | + doc_norm_sq = sum(w * w for w in doc_vec.values()) |
721 | 746 | if doc_norm_sq > 0 and dot_product > 0: |
722 | 747 | scores[doc_id] = dot_product / (query_norm * math.sqrt(doc_norm_sq)) |
723 | 748 |
|
|
0 commit comments