Recursive Character Splitter — RAG Systems: Part 1 (Chapter 8)

The RecursiveCharacterTextSplitter from LangChain is the most practical chunking strategy for general documents. It tries a list of separators in order of preference, falling back to the next, finer-grained separator whenever a piece is still larger than the chunk size.

How it works

The splitter tries these separators in order:

1. Double newlines (\n\n) - paragraph boundaries
2. Single newlines (\n) - line boundaries
3. Spaces (" ") - word boundaries
4. Empty string ("") - individual characters

If splitting at \n\n would create a chunk larger than chunk_size, it tries \n. If that still exceeds the limit, it tries a single space (" "). If a single word still exceeds the limit, it falls back to splitting between individual characters.

This approach respects document structure while guaranteeing chunks stay within size limits.

Implementation from scratch

from typing import Callable

class RecursiveCharacterSplitter:
    """Split text into size-bounded chunks along the coarsest usable separator.

    The text is first broken into *pieces*: each separator is tried in order
    and a finer one is only used for pieces that are still longer than
    ``chunk_size``. The pieces are then packed greedily into chunks. Each
    separator stays attached to the end of the piece that precedes it, so
    with ``chunk_overlap=0`` the chunks concatenate back to the input (apart
    from whitespace-only chunks, which are dropped).

    Overlap is made of whole trailing pieces of the previous chunk and counts
    towards ``chunk_size``; a trailing piece larger than ``chunk_overlap`` is
    therefore not repeated.
    """

    def __init__(
        self,
        separators: list[str] = None,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        length_function: Callable[[str], int] = None
    ):
        """Configure the splitter.

        Args:
            separators: Separators ordered from coarsest to finest. Defaults
                to paragraphs, lines, words, then single characters.
            chunk_size: Maximum chunk length as measured by ``length_function``.
            chunk_overlap: Maximum length of text repeated from the end of
                the previous chunk at the start of the next one.
            length_function: Measures a string; defaults to ``len``.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
                is negative.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if chunk_overlap < 0:
            raise ValueError("chunk_overlap must not be negative")

        # Copy so later mutation of the caller's list cannot affect us.
        self.separators = list(separators) if separators else ["\n\n", "\n", " ", ""]
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.length_function = length_function or len

    def split_text(self, text: str) -> list[str]:
        """Split text recursively using character separators.

        Returns:
            Chunks in document order. Whitespace-only chunks are omitted and
            an empty input yields an empty list.
        """
        if not text:
            return []
        pieces = self._split_recursive(text, self.separators)
        return self._merge_pieces(pieces)

    def _split_recursive(self, text: str, separators: list[str]) -> list[str]:
        """Break text into pieces that each fit in chunk_size, coarsest separator first."""
        if self.length_function(text) <= self.chunk_size:
            return [text]

        for depth, separator in enumerate(separators):
            # The empty string is handled by the per-character fallback below.
            if separator and separator in text:
                parts = text.split(separator)
                # Keep the separator on the piece it terminates so no text is lost.
                pieces = [part + separator for part in parts[:-1]]
                if parts[-1]:
                    pieces.append(parts[-1])

                finer_separators = separators[depth + 1:]
                result = []
                for piece in pieces:
                    result.extend(self._split_recursive(piece, finer_separators))
                return result

        # No separator applies: fall back to single characters, which the
        # merge step packs back together up to chunk_size.
        return list(text)

    def _merge_pieces(self, pieces: list[str]) -> list[str]:
        """Greedily pack consecutive pieces into chunks, carrying overlap forward."""
        sizes = [self.length_function(piece) for piece in pieces]
        chunks = []
        start = 0          # index of the first piece in the chunk being built
        window_size = 0    # summed size of pieces[start:end]

        for end, size in enumerate(sizes):
            if end > start and window_size + size > self.chunk_size:
                chunk = "".join(pieces[start:end])
                if chunk.strip():
                    chunks.append(chunk)
                # Drop pieces from the front until what remains is a valid
                # overlap AND still leaves room for the incoming piece.
                while start < end and (
                    window_size > self.chunk_overlap
                    or window_size + size > self.chunk_size
                ):
                    window_size -= sizes[start]
                    start += 1
            window_size += size

        last_chunk = "".join(pieces[start:])
        if last_chunk.strip():
            chunks.append(last_chunk)
        return chunks

    def split_documents(self, documents: list[dict]) -> list[dict]:
        """Split multiple documents, preserving metadata.

        Each input dict is read through its ``"text"`` and ``"metadata"``
        keys (both optional). Every output dict carries the chunk text plus
        the original metadata extended with ``chunk_index`` and
        ``total_chunks``.
        """
        all_chunks = []

        for doc in documents:
            # `or` also covers an explicit None value for either key.
            metadata = doc.get("metadata") or {}
            doc_chunks = self.split_text(doc.get("text") or "")
            total = len(doc_chunks)

            all_chunks.extend(
                {
                    "text": chunk,
                    "metadata": {
                        **metadata,
                        "chunk_index": i,
                        "total_chunks": total,
                    },
                }
                for i, chunk in enumerate(doc_chunks)
            )

        return all_chunks

Better version with token awareness

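The previous version measures chunk size in characters. LLM context windows and embedding-model limits are measured in tokens, so for LLM applications you need to count tokens instead. This version uses the tiktoken library (import tiktoken) to do the counting.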

class TokenAwareRecursiveSplitter:
    """Recursive separator-based splitter whose size limits are measured in tokens.

    Text is broken into pieces along ``self.separators`` (coarsest first);
    a finer separator is only used for pieces that still exceed
    ``chunk_size`` tokens, and text without any usable separator is halved
    until it fits. Pieces are then packed greedily into chunks.

    Overlap consists of whole trailing pieces of the previous chunk and
    counts towards ``chunk_size``. Token counts are summed per piece, so the
    count of a merged chunk is an estimate: re-encoding the joined text can
    give a slightly different number.
    """

    def __init__(
        self,
        chunk_size: int = 512,
        chunk_overlap: int = 50,
        encoding_name: str = "cl100k_base"
    ):
        """Configure the splitter.

        Args:
            chunk_size: Maximum number of tokens per chunk.
            chunk_overlap: Maximum number of tokens repeated from the end of
                the previous chunk at the start of the next one.
            encoding_name: Name passed to ``tiktoken.get_encoding``.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
                is negative.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if chunk_overlap < 0:
            raise ValueError("chunk_overlap must not be negative")

        # Imported here so the class works even when the surrounding module
        # has no top-level tiktoken import.
        import tiktoken

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.encoder = tiktoken.get_encoding(encoding_name)
        self.separators = ["\n\n", "\n", " ", ""]

    def _get_token_count(self, text: str) -> int:
        """Return the number of tokens the encoder produces for text."""
        return len(self.encoder.encode(text))

    def split_text(self, text: str) -> list[str]:
        """Split text by tokens using recursive separator approach.

        Returns:
            Chunks in document order. Text that already fits is returned as
            a single chunk; empty or whitespace-only text yields ``[]``.
        """
        if not text.strip():
            return []
        pieces = self._split_recursive(text, self.separators)
        return self._merge_pieces(pieces)

    def _split_recursive(
        self, text: str, separators: list[str]
    ) -> list[tuple[str, int]]:
        """Break text into (piece, token_count) pairs that each fit in chunk_size."""
        tokens = self._get_token_count(text)
        if tokens <= self.chunk_size:
            return [(text, tokens)]

        for depth, separator in enumerate(separators):
            # The empty string is handled by the halving fallback below.
            if separator and separator in text:
                parts = text.split(separator)
                # Keep the separator on the piece it terminates so no text is lost.
                pieces = [part + separator for part in parts[:-1]]
                if parts[-1]:
                    pieces.append(parts[-1])

                finer_separators = separators[depth + 1:]
                result = []
                for piece in pieces:
                    result.extend(self._split_recursive(piece, finer_separators))
                return result

        return self._split_in_half(text)

    def _split_in_half(self, text: str) -> list[tuple[str, int]]:
        """Halve separator-free text until every part fits (or is one character)."""
        tokens = self._get_token_count(text)
        if tokens <= self.chunk_size or len(text) <= 1:
            return [(text, tokens)]
        middle = len(text) // 2
        return self._split_in_half(text[:middle]) + self._split_in_half(text[middle:])

    def _merge_pieces(self, pieces: list[tuple[str, int]]) -> list[str]:
        """Greedily pack consecutive pieces into chunks, carrying overlap forward."""
        chunks = []
        start = 0            # index of the first piece in the chunk being built
        window_tokens = 0    # summed token count of pieces[start:end]

        for end, (_, tokens) in enumerate(pieces):
            if end > start and window_tokens + tokens > self.chunk_size:
                chunks.append("".join(piece for piece, _ in pieces[start:end]))
                # Drop pieces from the front until what remains is a valid
                # overlap AND still leaves room for the incoming piece.
                while start < end and (
                    window_tokens > self.chunk_overlap
                    or window_tokens + tokens > self.chunk_size
                ):
                    window_tokens -= pieces[start][1]
                    start += 1
            window_tokens += tokens

        chunks.append("".join(piece for piece, _ in pieces[start:]))
        return [c for c in chunks if c.strip()]

Testing the splitter

def test_splitter():
    """Run the token-aware splitter over sample inputs and print per-chunk sizes."""
    # Standard paragraph text
    two_paragraphs = "This is paragraph one.\n\nThis is paragraph two."
    # Code and prose
    prose_with_code = "# Heading\n\nSome text here.\n\n```python\nprint('code')\n```"
    # Long single word
    unbroken_word = "a" * 1000
    # Multiple short paragraphs
    short_paragraphs = "\n\n".join(
        [f"Paragraph {i} with some content." for i in range(10)]
    )
    samples = (two_paragraphs, prose_with_code, unbroken_word, short_paragraphs)

    splitter = TokenAwareRecursiveSplitter(chunk_size=100, chunk_overlap=20)
    encode = splitter.encoder.encode

    for i, sample in enumerate(samples):
        chunks = splitter.split_text(sample)
        print(f"Test {i}: {len(chunks)} chunks")
        for j, chunk in enumerate(chunks):
            tokens = len(encode(chunk))
            print(f"  Chunk {j}: {tokens} tokens, {len(chunk)} chars")

Handling special characters in code

Code blocks often contain characters that look like separators but should not trigger splits. Handle this by replacing each code block with a placeholder before splitting, then restoring the original code in the resulting chunks.

import re

def protect_code_blocks(text: str) -> tuple[str, list[str]]:
    """Replace code blocks with placeholders to protect them from splitting.

    Fenced blocks (triple backticks) and inline spans (single backticks) are
    each swapped for ``__CODE_BLOCK_<n>__``, where ``n`` is the block's index
    in the returned list.

    Args:
        text: Source text that may contain code.

    Returns:
        A ``(protected_text, code_blocks)`` tuple; ``code_blocks[n]`` is the
        original text behind placeholder ``n``, in order of appearance.
    """
    pattern = r'```[\s\S]*?```|`[^`]+`'
    code_blocks: list[str] = []

    def _stash(match: re.Match) -> str:
        placeholder = f"__CODE_BLOCK_{len(code_blocks)}__"
        code_blocks.append(match.group())
        return placeholder

    # Substitute by match position, not by content: a content-based
    # str.replace would also rewrite identical text elsewhere (for example
    # an inline span repeated inside a fenced block), leaving that block
    # unprotected.
    protected = re.sub(pattern, _stash, text)
    return protected, code_blocks

def restore_code_blocks(chunks: list[str], code_blocks: list[str]) -> list[str]:
    """Restore code blocks in chunks.

    Args:
        chunks: Text chunks that may contain ``__CODE_BLOCK_<n>__`` placeholders.
        code_blocks: Original code; index ``n`` replaces placeholder ``n``.

    Returns:
        A new list of chunks with every known placeholder replaced.
        Placeholders whose index has no entry in ``code_blocks`` are left
        untouched.
    """
    # Canonical decimal index only (no leading zeros), i.e. exactly the
    # strings that f"__CODE_BLOCK_{n}__" can produce.
    placeholder = re.compile(r"__CODE_BLOCK_(0|[1-9][0-9]*)__")

    def _lookup(match: re.Match) -> str:
        index = int(match.group(1))
        return code_blocks[index] if index < len(code_blocks) else match.group()

    # One pass per chunk: restored code is never rescanned, so code that
    # itself contains placeholder-looking text is not substituted again.
    return [placeholder.sub(_lookup, chunk) for chunk in chunks]