Semantic Chunking — RAG Systems: Part 1 (Chapter 7)

Semantic chunking groups text by meaning rather than by token count. Sentences that discuss the same topic stay together. This produces chunks that are internally coherent even when their token counts vary.

The sentence boundary problem

Before semantic chunking, you need sentence segmentation. Naive approaches (splitting on periods) fail with abbreviations (Dr., U.S.A.) and decimal numbers.

import re

def split_sentences(text: str) -> list[str]:
    """Split text into sentences with a regex heuristic.

    A sentence boundary is a run of whitespace that follows ``.``, ``!`` or
    ``?`` and precedes an uppercase ASCII letter. Decimal numbers such as
    ``3.14`` are never split because no whitespace follows the period.
    Boundaries directly after a common title (``Dr.``, ``Mr.``, ...) or a
    dotted abbreviation (``U.S.A.``, ``p.m.``) are skipped so the abbreviation
    stays attached to the text that follows it.

    Limitation: a sentence that genuinely ends with a dotted abbreviation
    ("I live in the U.S.A. It is big.") is merged with the next sentence.
    Use an NLP library when that matters.

    Args:
        text: Raw input text.

    Returns:
        The non-empty sentences, each stripped of surrounding whitespace.
    """
    boundary = re.compile(r'(?<=[.!?])\s+(?=[A-Z])')
    # Text ending in one of these is an abbreviation, not a sentence end.
    non_terminal = re.compile(
        r'(?:\b(?:Dr|Mr|Mrs|Ms|Prof|Gen|Col|Lt|Sgt|Capt|Rev|Hon|Gov|Sen|Rep)\.'
        r'|\b(?:[A-Za-z]\.){2,})$'
    )

    sentences = []
    start = 0
    for match in boundary.finditer(text):
        candidate = text[start:match.start()]
        if non_terminal.search(candidate):
            # Keep scanning: the period belongs to an abbreviation.
            continue
        candidate = candidate.strip()
        if candidate:
            sentences.append(candidate)
        start = match.end()

    tail = text[start:].strip()
    if tail:
        sentences.append(tail)

    return sentences

Better yet, use a proper NLP library:

import spacy

# Loaded spaCy pipelines keyed by model name, so a model is loaded once per
# process instead of on every call.
_SPACY_PIPELINES: dict = {}


def sentence_split_nlp(text: str) -> list[str]:
    """Use spaCy for accurate sentence segmentation.

    The ``en_core_web_sm`` pipeline is loaded on first use and reused on
    later calls, because loading it is far more expensive than running it on
    a single document.

    Args:
        text: Raw input text.

    Returns:
        The text of each sentence spaCy detects, in document order.
    """
    model_name = "en_core_web_sm"
    nlp = _SPACY_PIPELINES.get(model_name)
    if nlp is None:
        nlp = _SPACY_PIPELINES[model_name] = spacy.load(model_name)

    doc = nlp(text)
    return [sent.text for sent in doc.sents]

Semantic similarity-based chunking

The core idea: accumulate sentences into a chunk until the chunk would exceed its token budget or adding the next sentence would reduce semantic coherence below a threshold.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def semantic_chunk(
    sentences: list[str],
    max_tokens: int = 512,
    similarity_threshold: float = 0.3
) -> list[str]:
    """
    Chunk sentences based on semantic similarity.

    Sentences are consumed in order. A sentence joins the current chunk
    unless doing so would push the chunk past ``max_tokens``, or its average
    TF-IDF cosine similarity to the sentences already in the chunk is below
    ``similarity_threshold``; in either case the current chunk is closed and
    the sentence starts a new one. A single sentence longer than
    ``max_tokens`` becomes a chunk of its own.

    TF-IDF is a fast stand-in for real embeddings (use proper embeddings for
    production).

    Args:
        sentences: List of sentences
        max_tokens: Maximum tokens per chunk
        similarity_threshold: Minimum similarity to keep adding to chunk

    Returns:
        List of semantically coherent chunks, each the space-joined text of
        its sentences. Empty input yields an empty list.
    """
    if not sentences:
        return []

    encoder = tiktoken.get_encoding("cl100k_base")
    sentence_vectors = TfidfVectorizer().fit_transform(sentences)

    chunks = []
    current_chunk = []
    current_token_count = 0

    for i, sentence in enumerate(sentences):
        sentence_tokens = len(encoder.encode(sentence))

        if current_chunk:
            if current_token_count + sentence_tokens > max_tokens:
                start_new_chunk = True
            else:
                # The current chunk always holds the sentences immediately
                # preceding index i, so their vectors are rows [start, i).
                start = i - len(current_chunk)
                avg_similarity = cosine_similarity(
                    sentence_vectors[i:i + 1],
                    sentence_vectors[start:i]
                ).mean()
                start_new_chunk = avg_similarity < similarity_threshold

            if start_new_chunk:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_token_count = 0

        current_chunk.append(sentence)
        current_token_count += sentence_tokens

    # Don't forget last chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

Embedding-based semantic chunking

For production systems, use actual embeddings instead of TF-IDF. This produces better semantic clustering.

from sentence_transformers import SentenceTransformer
import numpy as np

class SemanticChunker:
    """Group consecutive sentences into chunks using sentence embeddings.

    A sentence is added to the current chunk while the chunk stays within the
    token budget and the sentence is similar enough to the mean embedding of
    the sentences already in the chunk.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Load the sentence-embedding model and the token counter.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
        """
        self.encoder = SentenceTransformer(model_name)
        self.encoder_tiktoken = tiktoken.get_encoding("cl100k_base")

    @staticmethod
    def _build_chunk(chunk_sentences: list[str], end_idx: int) -> dict:
        """Build the chunk record for sentences ending just before ``end_idx``."""
        return {
            "text": " ".join(chunk_sentences),
            "sentence_count": len(chunk_sentences),
            "start_idx": end_idx - len(chunk_sentences),
            "end_idx": end_idx
        }

    def chunk(
        self,
        text: str,
        max_tokens: int = 512,
        similarity_threshold: float = 0.5
    ) -> list[dict]:
        """Semantic chunking using sentence embeddings.

        Args:
            text: Raw text; split into sentences with ``sentence_split_nlp``.
            max_tokens: Token budget per chunk. A single sentence longer than
                this still becomes a chunk of its own.
            similarity_threshold: Minimum cosine similarity between a
                sentence and the mean embedding of the current chunk for the
                sentence to join that chunk.

        Returns:
            One dict per chunk with keys ``text`` (space-joined sentences),
            ``sentence_count``, ``start_idx`` and ``end_idx``. The indices
            refer to positions in the sentence list and ``end_idx`` is
            exclusive. Returns an empty list when no sentences are found.
        """
        sentences = sentence_split_nlp(text)

        if not sentences:
            return []

        # Encode all sentences up front; rows are indexed by sentence position.
        embeddings = self.encoder.encode(sentences)

        chunks = []
        current_chunk_sentences = []
        current_chunk_tokens = 0

        for i, sentence in enumerate(sentences):
            sentence_tokens = len(self.encoder_tiktoken.encode(sentence))

            if current_chunk_sentences:
                if current_chunk_tokens + sentence_tokens > max_tokens:
                    start_new_chunk = True
                else:
                    # The current chunk holds the sentences right before i.
                    chunk_embedding = embeddings[
                        i - len(current_chunk_sentences):i
                    ].mean(axis=0)
                    similarity = cosine_similarity(
                        [embeddings[i]],
                        [chunk_embedding]
                    )[0][0]
                    start_new_chunk = similarity < similarity_threshold

                if start_new_chunk:
                    chunks.append(self._build_chunk(current_chunk_sentences, i))
                    current_chunk_sentences = []
                    current_chunk_tokens = 0

            current_chunk_sentences.append(sentence)
            current_chunk_tokens += sentence_tokens

        # Last chunk
        if current_chunk_sentences:
            chunks.append(
                self._build_chunk(current_chunk_sentences, len(sentences))
            )

        return chunks

Evaluating chunk quality

Semantic chunking quality depends on the similarity threshold. Set it too high and every sentence becomes its own chunk. Set it too low and the similarity check rarely triggers, so chunks simply fill up to the token limit and mix unrelated topics.

def evaluate_chunks(chunks: list[str]) -> dict:
    """Compute token-count statistics for a set of chunks.

    Args:
        chunks: Chunk texts to measure.

    Returns:
        A dict with ``num_chunks``, ``avg_tokens``, ``min_tokens``,
        ``max_tokens``, ``std_tokens`` and ``token_range`` (max minus min).
        For an empty list every metric is zero, so callers never hit a
        ``min()``/``max()`` error or a NaN average.
    """
    if not chunks:
        return {
            "num_chunks": 0,
            "avg_tokens": 0.0,
            "min_tokens": 0,
            "max_tokens": 0,
            "std_tokens": 0.0,
            "token_range": 0
        }

    encoder = tiktoken.get_encoding("cl100k_base")
    token_counts = [len(encoder.encode(c)) for c in chunks]
    smallest = min(token_counts)
    largest = max(token_counts)

    return {
        "num_chunks": len(chunks),
        "avg_tokens": float(np.mean(token_counts)),
        "min_tokens": smallest,
        "max_tokens": largest,
        "std_tokens": float(np.std(token_counts)),
        "token_range": largest - smallest
    }

# Run both chunkers over the same sample text and print their token
# statistics one after the other.
fixed_chunks = fixed_size_chunk(sample_text, chunk_size=512)
semantic_chunks = [piece["text"] for piece in SemanticChunker().chunk(sample_text)]

for _label, _chunk_set in (
    ("Fixed-size:", fixed_chunks),
    ("Semantic:", semantic_chunks),
):
    print(_label, evaluate_chunks(_chunk_set))