08. Recursive Character Splitter
The RecursiveCharacterTextSplitter from LangChain is the most practical chunking strategy for general documents. It tries to split on a list of separators in order of preference, falling back to the next separator when splitting on the current one would still leave a chunk that is too large.
How it works
The splitter tries these separators in order:
- Double newlines (`\n\n`) - paragraph boundaries
- Single newlines (`\n`) - line boundaries
- Spaces (` `) - word boundaries
- Characters - individual characters
If splitting at \n\n would create a chunk larger than chunk_size, it tries \n. If that still exceeds the limit, it tries a single space (" "). If splitting at spaces would still exceed the limit, it falls back to individual characters.
This approach respects document structure while guaranteeing chunks stay within size limits.
Implementation from scratch
from typing import Callable
class RecursiveCharacterSplitter:
    """Split text into chunks, cutting at the coarsest separator available.

    The text is scanned left to right in windows of at most ``chunk_size``
    characters. Each window is cut after the last occurrence of the first
    separator (in preference order) that appears past the window's first
    character; if none qualifies, the window is emitted whole (hard split).
    A remainder that already fits in one window is emitted as-is.

    Chunk size is measured in characters. ``length_function`` is only used
    to decide whether a stripped chunk is empty and should be dropped.
    Overlap is prepended after splitting, so a chunk may be up to
    ``chunk_size + chunk_overlap`` characters long.
    """

    def __init__(
        self,
        separators: list[str] = None,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        length_function: Callable[[str], int] = None
    ):
        """Configure the splitter.

        Args:
            separators: Separators in order of preference. ``None`` or an
                empty list selects ``["\\n\\n", "\\n", " ", ""]``. An empty
                string means "stop looking and hard-split the window".
            chunk_size: Maximum number of characters per chunk before
                overlap is added. Must be positive.
            chunk_overlap: Number of trailing characters of the previous
                chunk to prepend to each following chunk. Values <= 0
                disable overlap.
            length_function: Callable used to test whether a stripped chunk
                is empty. Defaults to ``len``.

        Raises:
            ValueError: If ``chunk_size`` is not positive (the scan could
                never advance).
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        self.separators = separators or ["\n\n", "\n", " ", ""]
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.length_function = length_function or len

    def split_text(self, text: str) -> list[str]:
        """Split text recursively using character separators.

        Args:
            text: The text to split.

        Returns:
            The chunks in document order. Whitespace-only chunks are
            dropped. Every chunk after the first starts with the last
            ``chunk_overlap`` characters of the preceding chunk.
        """
        chunks = []
        current_pos = 0
        text_len = len(text)

        while current_pos < text_len:
            window = text[current_pos:current_pos + self.chunk_size]
            if current_pos + len(window) >= text_len:
                # The remainder fits in one window: no reason to cut it.
                cut = len(window)
            else:
                cut = self._find_cut(window)

            piece = window[:cut]
            if self.length_function(piece.strip()) > 0:
                chunks.append(piece)
            # cut is always >= 1, so the scan is guaranteed to terminate.
            current_pos += cut

        return self._apply_overlap(chunks)

    def _find_cut(self, window: str) -> int:
        """Return how many leading characters of ``window`` form the chunk."""
        for separator in self.separators:
            if separator == "":
                # Finest level reached: fall through to a hard split.
                break
            last_sep = window.rfind(separator)
            # A separator at index 0 would yield a separator-only chunk,
            # so it does not count as a usable cut point.
            if last_sep > 0:
                return last_sep + len(separator)
        return len(window)

    def _apply_overlap(self, chunks: list[str]) -> list[str]:
        """Prefix each chunk after the first with the previous chunk's tail."""
        if self.chunk_overlap <= 0 or len(chunks) < 2:
            return chunks
        overlapped = [chunks[0]]
        for prev_chunk, chunk in zip(chunks, chunks[1:]):
            overlapped.append(prev_chunk[-self.chunk_overlap:] + chunk)
        return overlapped

    def split_documents(self, documents: list[dict]) -> list[dict]:
        """Split multiple documents, preserving metadata.

        Args:
            documents: Dicts with an optional ``"text"`` string and an
                optional ``"metadata"`` dict.

        Returns:
            One dict per chunk with ``"text"`` and ``"metadata"``; the
            metadata is the document's metadata plus ``"chunk_index"`` and
            ``"total_chunks"`` for that document.
        """
        all_chunks = []
        for doc in documents:
            # `or` also covers an explicit None value for either key.
            metadata = doc.get("metadata") or {}
            doc_chunks = self.split_text(doc.get("text") or "")
            total = len(doc_chunks)
            all_chunks.extend(
                {
                    "text": chunk,
                    "metadata": {
                        **metadata,
                        "chunk_index": i,
                        "total_chunks": total,
                    },
                }
                for i, chunk in enumerate(doc_chunks)
            )
        return all_chunks
Better version with token awareness
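The previous version uses character count. For LLM applications, you need token count. The version below counts tokens with the tiktoken package, so it needs `import tiktoken` at the top of the file.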
class TokenAwareRecursiveSplitter:
    """Split text into chunks whose size is measured in tokens.

    Text is split on ``self.separators`` in order. Parts that fit are
    merged greedily up to ``chunk_size`` tokens; a part that is still too
    large is split again with the next, finer separator. When the
    separators are exhausted (or the empty-string separator is reached)
    the text is hard-split on token boundaries.

    Overlap is prepended after splitting, so a chunk may hold up to
    ``chunk_size + chunk_overlap`` tokens.
    """

    def __init__(
        self,
        chunk_size: int = 512,
        chunk_overlap: int = 50,
        encoding_name: str = "cl100k_base"
    ):
        """Configure the splitter.

        Args:
            chunk_size: Maximum tokens per chunk before overlap is added.
                Must be positive.
            chunk_overlap: Number of trailing tokens of the previous chunk
                to prepend to each following chunk. Values <= 0 disable
                overlap.
            encoding_name: Name passed to ``tiktoken.get_encoding``.

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.encoder = tiktoken.get_encoding(encoding_name)
        self.separators = ["\n\n", "\n", " ", ""]

    def _get_token_count(self, text: str) -> int:
        """Return the number of tokens the encoder produces for ``text``."""
        return len(self.encoder.encode(text))

    def split_text(self, text: str) -> list[str]:
        """Split text by tokens using recursive separator approach.

        Args:
            text: The text to split.

        Returns:
            The chunks in document order, without whitespace-only chunks.
            Text that already fits in ``chunk_size`` tokens is returned
            unchanged as a single chunk.
        """
        if not text.strip():
            return []

        chunks = self._split_recursive(text, self.separators)
        # Drop blank chunks before overlapping so that every overlap
        # prefix comes from real content.
        chunks = [c for c in chunks if c.strip()]

        if self.chunk_overlap > 0:
            chunks = self._add_overlap(chunks)
        return chunks

    def _split_recursive(self, text: str, separators: list[str]) -> list[str]:
        """Split ``text`` on the first separator, recursing on large parts."""
        if self._get_token_count(text) <= self.chunk_size:
            return [text]
        if not separators or separators[0] == "":
            return self._split_by_tokens(text)

        separator, finer = separators[0], separators[1:]
        # The separator is re-inserted when parts are joined, so its tokens
        # count towards the budget. The sum of per-part counts is an
        # estimate: a tokenizer may encode the joined text differently.
        sep_tokens = self._get_token_count(separator)

        chunks = []
        pending = []
        pending_tokens = 0

        for part in text.split(separator):
            part_tokens = self._get_token_count(part)

            if part_tokens > self.chunk_size:
                # Flush what has been collected, then split the oversized
                # part with the finer separators. `pending` is reset so the
                # flushed parts are never emitted a second time.
                if pending:
                    chunks.append(separator.join(pending))
                    pending, pending_tokens = [], 0
                chunks.extend(self._split_recursive(part, finer))
                continue

            if pending:
                joined_tokens = pending_tokens + sep_tokens + part_tokens
            else:
                joined_tokens = part_tokens

            if joined_tokens > self.chunk_size:
                # Only reachable with a non-empty `pending`, because the
                # part on its own fits.
                chunks.append(separator.join(pending))
                pending, pending_tokens = [part], part_tokens
            else:
                pending.append(part)
                pending_tokens = joined_tokens

        if pending:
            chunks.append(separator.join(pending))
        return chunks

    def _split_by_tokens(self, text: str) -> list[str]:
        """Hard-split ``text`` into runs of at most ``chunk_size`` tokens."""
        tokens = self.encoder.encode(text)
        # NOTE(review): a token boundary may fall inside a multi-byte
        # character; confirm how the encoder's decode handles that case.
        return [
            self.encoder.decode(tokens[start:start + self.chunk_size])
            for start in range(0, len(tokens), self.chunk_size)
        ]

    def _add_overlap(self, chunks: list[str]) -> list[str]:
        """Add overlap between consecutive chunks.

        Each chunk after the first is prefixed with the last
        ``chunk_overlap`` tokens of the chunk before it (or with the whole
        previous chunk when it is shorter than that). The separator that
        originally sat between the two chunks is not re-inserted.

        Args:
            chunks: Chunks without overlap, in document order.

        Returns:
            A new list with overlap applied, or ``chunks`` itself when
            there is nothing to overlap.
        """
        if len(chunks) <= 1 or self.chunk_overlap <= 0:
            return chunks

        overlapped = [chunks[0]]
        for prev_chunk, chunk in zip(chunks, chunks[1:]):
            prev_tokens = self.encoder.encode(prev_chunk)
            if len(prev_tokens) > self.chunk_overlap:
                overlap_text = self.encoder.decode(
                    prev_tokens[-self.chunk_overlap:]
                )
            else:
                overlap_text = prev_chunk
            overlapped.append(overlap_text + chunk)
        return overlapped
Testing the splitter
def test_splitter():
    """Run the token-aware splitter over sample documents and print stats.

    For each sample this prints the number of chunks produced, followed by
    the token count and character count of every chunk.
    """
    many_paragraphs = "\n\n".join(
        [f"Paragraph {i} with some content." for i in range(10)]
    )
    samples = (
        # Two ordinary paragraphs
        "This is paragraph one.\n\nThis is paragraph two.",
        # Markdown heading, prose and a fenced code block
        "# Heading\n\nSome text here.\n\n```python\nprint('code')\n```",
        # One very long word with no separators at all
        "a" * 1000,
        # Several short paragraphs
        many_paragraphs,
    )

    token_splitter = TokenAwareRecursiveSplitter(chunk_size=100, chunk_overlap=20)

    for case_no, sample in enumerate(samples):
        pieces = token_splitter.split_text(sample)
        print(f"Test {case_no}: {len(pieces)} chunks")
        for piece_no, piece in enumerate(pieces):
            token_total = len(token_splitter.encoder.encode(piece))
            print(f" Chunk {piece_no}: {token_total} tokens, {len(piece)} chars")
Handling special characters in code
Code blocks often contain characters that look like separators but should not trigger splits. Handle this by protecting code blocks.
import re
def protect_code_blocks(text: str) -> tuple[str, list[str]]:
    """Replace code blocks with placeholders to protect them from splitting.

    Fenced blocks (triple backticks) and inline code spans (single
    backticks) are swapped for ``__CODE_BLOCK_<n>__`` markers, numbered in
    order of appearance.

    Args:
        text: The text to protect.

    Returns:
        A ``(protected, code_blocks)`` tuple: the text with placeholders,
        and the original code spans such that ``code_blocks[n]`` is the
        span behind ``__CODE_BLOCK_<n>__``.
    """
    pattern = r'```[\s\S]*?```|`[^`]+`'
    code_blocks = []

    def _stash(match):
        """Record the matched span and return its placeholder."""
        code_blocks.append(match.group())
        return f"__CODE_BLOCK_{len(code_blocks) - 1}__"

    # re.sub substitutes each match at its own position. Replacing by value
    # (str.replace) would also rewrite identical text elsewhere, including
    # inside a later fenced block, which would then no longer be found.
    protected = re.sub(pattern, _stash, text)
    return protected, code_blocks
def restore_code_blocks(chunks: list[str], code_blocks: list[str]) -> list[str]:
    """Restore code blocks in chunks.

    Args:
        chunks: Chunks that may contain ``__CODE_BLOCK_<n>__`` placeholders.
        code_blocks: The original code spans; ``code_blocks[n]`` replaces
            ``__CODE_BLOCK_<n>__``.

    Returns:
        A new list of chunks with every known placeholder replaced.
        Placeholders with no matching entry in ``code_blocks`` are left
        untouched.
    """
    lookup = {
        f"__CODE_BLOCK_{i}__": block for i, block in enumerate(code_blocks)
    }

    def _restore(match):
        """Return the code behind a placeholder, or the text unchanged."""
        return lookup.get(match.group(), match.group())

    # One pass per chunk: restored code is never re-scanned, so a code
    # block that literally contains placeholder-looking text stays intact.
    return [re.sub(r"__CODE_BLOCK_\d+__", _restore, chunk) for chunk in chunks]
Create a document with two paragraphs, each containing 5 sentences. The first paragraph should be about cars, the second about food. Use the recursive splitter with chunk_size=100 tokens and chunk_overlap=0 (otherwise the overlap copies text from the first topic into the next chunk). Verify that no chunk contains sentences from both topics. Then increase chunk_size to 400 and verify that a single chunk can now contain both topics.