07. Semantic Chunking
Semantic chunking groups text by meaning rather than by token count. Sentences that discuss the same topic stay together. This produces chunks that are internally coherent even when their token counts vary.
The sentence boundary problem
Before semantic chunking, you need sentence segmentation. Naive approaches (splitting on periods) fail with abbreviations (Dr., U.S.A.) and decimal numbers.
import re
# Title-style abbreviations whose trailing period does not end a sentence.
_TITLE_ABBREVIATIONS = frozenset({
    "dr", "mr", "mrs", "ms", "prof", "sr", "jr", "st",
    "gen", "col", "lt", "sgt", "capt", "rev", "hon", "vs",
})
# Dotted initialisms such as "U.S.A." or "e.g." (two or more letter-dot pairs).
_INITIALISM = re.compile(r"(?:[A-Za-z]\.){2,}")
# Candidate boundary: sentence punctuation, whitespace, then a capital letter.
# Decimals such as "3.14" never match because no whitespace follows the dot.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+(?=[A-Z])")


def _ends_with_abbreviation(fragment: str) -> bool:
    """Return True if the last word of *fragment* is a known abbreviation."""
    words = fragment.split()
    if not words:
        return False
    last = words[-1].lstrip("\"'([{")
    if not last.endswith("."):
        return False
    if _INITIALISM.fullmatch(last):
        return True
    return last[:-1].lower() in _TITLE_ABBREVIATIONS


def split_sentences(text: str) -> list[str]:
    """Split text into sentences, handling edge cases.

    A split happens at whitespace that follows ``.``, ``!`` or ``?`` and
    precedes a capital letter, unless the text before the whitespace ends in
    a title abbreviation (``Dr.``, ``Mr.``, ...) or a dotted initialism
    (``U.S.A.``).

    This is a heuristic: an initialism that really does end a sentence will
    not produce a split there.

    Args:
        text: Raw text to segment.

    Returns:
        Stripped, non-empty sentences in their original order.
    """
    sentences = []
    start = 0
    for boundary in _SENTENCE_BOUNDARY.finditer(text):
        candidate = text[start:boundary.start()]
        # Keep scanning: this period belongs to an abbreviation, not a sentence end.
        if _ends_with_abbreviation(candidate):
            continue
        candidate = candidate.strip()
        if candidate:
            sentences.append(candidate)
        start = boundary.end()

    tail = text[start:].strip()
    if tail:
        sentences.append(tail)
    return sentences
Better yet, use a proper NLP library:
import spacy
# Loaded spaCy pipelines keyed by model name, so each model is loaded only once.
_SPACY_PIPELINES: dict = {}


def sentence_split_nlp(text: str, model_name: str = "en_core_web_sm") -> list[str]:
    """Use spaCy for accurate sentence segmentation.

    The pipeline is loaded on first use and reused on later calls, because
    reloading it on every call is needlessly slow.

    Args:
        text: Raw text to segment.
        model_name: Name of the spaCy pipeline to load.

    Returns:
        The text of each sentence span reported by the pipeline, in order.
    """
    nlp = _SPACY_PIPELINES.get(model_name)
    if nlp is None:
        nlp = spacy.load(model_name)
        _SPACY_PIPELINES[model_name] = nlp
    return [sent.text for sent in nlp(text).sents]
Semantic similarity-based chunking
The core idea: accumulate sentences into a chunk until adding the next sentence would either exceed the token limit or drop the chunk's semantic coherence below a threshold.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
def semantic_chunk(
    sentences: list[str],
    max_tokens: int = 512,
    similarity_threshold: float = 0.3
) -> list[str]:
    """
    Chunk sentences based on semantic similarity.

    Sentences are added to the current chunk in order. A new chunk is started
    when adding the next sentence would push the chunk past ``max_tokens``,
    or when the sentence's mean TF-IDF cosine similarity to the sentences
    already in the chunk falls below ``similarity_threshold``.

    A single sentence longer than ``max_tokens`` is not split; it becomes a
    chunk on its own.

    Args:
        sentences: List of sentences
        max_tokens: Maximum tokens per chunk
        similarity_threshold: Minimum similarity to keep adding to chunk

    Returns:
        List of semantically coherent chunks (sentences joined by a space)
    """
    if not sentences:
        return []

    # NOTE(review): tiktoken is not imported in this section; presumably it is
    # imported earlier in the file - confirm.
    encoder = tiktoken.get_encoding("cl100k_base")

    # Using TF-IDF as a fast proxy (use proper embeddings for production)
    sentence_vectors = TfidfVectorizer().fit(sentences).transform(sentences)

    chunks = []
    current_chunk = []
    current_token_count = 0

    for i, sentence in enumerate(sentences):
        sentence_tokens = len(encoder.encode(sentence))

        start_new_chunk = False
        if current_chunk:
            if current_token_count + sentence_tokens > max_tokens:
                start_new_chunk = True
            else:
                # Chunks are contiguous runs of sentences, so the current
                # chunk occupies the rows immediately before row i.
                chunk_start = i - len(current_chunk)
                avg_similarity = cosine_similarity(
                    sentence_vectors[i:i + 1],
                    sentence_vectors[chunk_start:i],
                ).mean()
                start_new_chunk = avg_similarity < similarity_threshold

        if start_new_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_token_count = 0

        current_chunk.append(sentence)
        current_token_count += sentence_tokens

    # Don't forget last chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
Embedding-based semantic chunking
For production systems, use actual embeddings instead of TF-IDF. This produces better semantic clustering.
from sentence_transformers import SentenceTransformer
import numpy as np
class SemanticChunker:
    """Group sentences into chunks using sentence-embedding similarity."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        # NOTE(review): tiktoken is not imported in this section; presumably
        # it is imported earlier in the file - confirm.
        self.encoder = SentenceTransformer(model_name)
        self.encoder_tiktoken = tiktoken.get_encoding("cl100k_base")

    def chunk(
        self,
        text: str,
        max_tokens: int = 512,
        similarity_threshold: float = 0.5
    ) -> list[dict]:
        """Semantic chunking using sentence embeddings.

        The text is segmented with ``sentence_split_nlp``. Each sentence
        joins the open chunk unless that would exceed ``max_tokens``, or its
        cosine similarity to the mean embedding of the open chunk is below
        ``similarity_threshold``.

        Returns:
            One dict per chunk with keys ``text``, ``sentence_count``,
            ``start_idx`` and ``end_idx``. The indices refer to positions in
            the sentence list; ``end_idx`` is exclusive.
        """
        sentences = sentence_split_nlp(text)
        if not sentences:
            return []

        embeddings = self.encoder.encode(sentences)

        def build_record(group: list[str], end: int) -> dict:
            # The group is always the run of sentences ending just before `end`.
            return {
                "text": " ".join(group),
                "sentence_count": len(group),
                "start_idx": end - len(group),
                "end_idx": end
            }

        results = []
        pending = []
        pending_tokens = 0

        for idx, sentence in enumerate(sentences):
            n_tokens = len(self.encoder_tiktoken.encode(sentence))

            if pending:
                if pending_tokens + n_tokens > max_tokens:
                    # Token budget exhausted; similarity is not consulted.
                    split_here = True
                else:
                    centroid = embeddings[idx - len(pending):idx].mean(axis=0)
                    score = cosine_similarity(
                        [embeddings[idx]],
                        [centroid]
                    )[0][0]
                    split_here = score < similarity_threshold

                if split_here:
                    results.append(build_record(pending, idx))
                    pending = []
                    pending_tokens = 0

            pending.append(sentence)
            pending_tokens += n_tokens

        # Emit whatever is still open once the input is exhausted.
        if pending:
            results.append(build_record(pending, len(sentences)))

        return results
Evaluating chunk quality
Semantic chunking quality depends on the similarity threshold. Set it too high and nearly every sentence becomes its own chunk. Set it too low and unrelated sentences are merged, so chunks grow toward the token limit and lose coherence.
def evaluate_chunks(chunks: list[str]) -> dict:
    """Compute quality metrics for a set of chunks.

    Args:
        chunks: Chunk texts to measure.

    Returns:
        A dict with ``num_chunks``, ``avg_tokens``, ``min_tokens``,
        ``max_tokens``, ``std_tokens`` and ``token_range``. All values are
        zero when ``chunks`` is empty.
    """
    # min()/max() raise on an empty sequence and np.mean([]) is NaN, so
    # report zeroed metrics instead of failing.
    if not chunks:
        return {
            "num_chunks": 0,
            "avg_tokens": 0.0,
            "min_tokens": 0,
            "max_tokens": 0,
            "std_tokens": 0.0,
            "token_range": 0
        }

    # NOTE(review): tiktoken is not imported in this section; presumably it is
    # imported earlier in the file - confirm.
    encoder = tiktoken.get_encoding("cl100k_base")
    token_counts = [len(encoder.encode(c)) for c in chunks]
    smallest = min(token_counts)
    largest = max(token_counts)
    return {
        "num_chunks": len(chunks),
        "avg_tokens": np.mean(token_counts),
        "min_tokens": smallest,
        "max_tokens": largest,
        "std_tokens": np.std(token_counts),
        "token_range": largest - smallest
    }
# Run both strategies on the same text and print their token statistics.
# fixed_size_chunk and sample_text are defined outside this section.
fixed_chunks = fixed_size_chunk(sample_text, chunk_size=512)
semantic_chunks = [record["text"] for record in SemanticChunker().chunk(sample_text)]
for label, chunk_set in (("Fixed-size:", fixed_chunks), ("Semantic:", semantic_chunks)):
    print(label, evaluate_chunks(chunk_set))
Exercise: Take three paragraphs on different topics (e.g., sports, technology, food) and combine them into one document. Run semantic chunking with similarity_threshold=0.3 and again with 0.7, and show how the threshold affects which sentences are grouped together. For each threshold, calculate a coherence score using a simple word overlap metric, for example the average Jaccard overlap between the word sets of adjacent sentences within each chunk.