06. Sparse Retrieval BM25
Chapter 6 of 24 · 15 min
BM25 (Best Matching 25) is a probabilistic retrieval model that ranks documents based on term frequency and document length normalization.
Term frequency saturation prevents common terms from dominating. At a certain frequency, additional occurrences add minimal value. The parameter k1 controls saturation rate—typical values range from 1.2 to 2.0.
Document length normalization prevents bias toward longer documents. A term appearing once in a 10-word abstract is more significant than the same term appearing once in a 500-page book.
import math
from collections import Counter, defaultdict
class BM25Retriever:
def __init__(self, k1: float = 1.5, b: float = 0.75):
self.k1 = k1
self.b = b
self.doc_lengths = []
self.avg_doc_length = 0
self.doc_term_freqs = []
self.idf = {}
self.doc_count = 0
def index(self, documents: List[str]):
"""Build BM25 index from documents."""
self.doc_count = len(documents)
self.doc_lengths = []
self.doc_term_freqs = []
# Count documents containing each term
doc_freq = defaultdict(int)
for doc in documents:
tokens = self._tokenize(doc)
self.doc_lengths.append(len(tokens))
term_freq = Counter(tokens)
self.doc_term_freqs.append(term_freq)
for term in set(tokens):
doc_freq[term] += 1
self.avg_doc_length = sum(self.doc_lengths) / self.doc_count
# Calculate IDF for each term
for term, df in doc_freq.items():
self.idf[term] = math.log((self.doc_count - df + 0.5) / (df + 0.5) + 1)
def search(self, query: str, top_k: int = 10) -> List[dict]:
"""Retrieve documents by BM25 score."""
query_tokens = self._tokenize(query)
scores = []
for idx, term_freq in enumerate(self.doc_term_freqs):
score = 0
doc_length = self.doc_lengths[idx]
for term in query_tokens:
if term not in self.idf:
continue
tf = term_freq.get(term, 0)
idf = self.idf[term]
# BM25 formula
numerator = tf * (self.k1 + 1)
denominator = tf + self.k1 * (1 - self.b +
self.b * doc_length / self.avg_doc_length)
score += idf * (numerator / denominator)
scores.append((idx, score))
# Sort by score descending
scores.sort(key=lambda x: x[1], reverse=True)
return [{'index': idx, 'score': score} for idx, score in scores[:top_k]]
def _tokenize(self, text: str) -> List[str]:
"""Simple tokenization: lowercase, alphanumeric only."""
return re.findall(r'\b[a-z0-9]+\b', text.lower())
EXERCISE
Implement BM25 and evaluate against a naive TF-IDF baseline on a document ranking task. Measure Spearman correlation between methods.