11. Storing Embeddings in ChromaDB
ChromaDB is a purpose-built vector database optimized for RAG workloads. It stores embeddings alongside metadata and supports fast similarity search. It runs locally without setup, making it ideal for development and production.
Installation and setup
pip install chromadb
Verify:
# Confirm the package imports cleanly and show which release is installed.
import chromadb
print(f"ChromaDB version: {chromadb.__version__}")
ChromaDB architecture
ChromaDB has four key concepts:
Client: Connection to the database. Can be in-memory (ephemeral) or persistent (disk-based).
Collection: A named group of vectors, like a table. Each collection has a name and an optional distance function.
Embeddings: The vector data. Stored as lists of floats.
IDs: Unique identifiers for each embedding. You provide these; ChromaDB does not auto-generate them.
Basic operations
import chromadb
from chromadb.config import Settings
# Persistent client (data is written under ./chroma_db and survives restarts).
# PersistentClient is the explicit on-disk client; passing only
# Settings(persist_directory=...) to chromadb.Client() does not turn
# persistence on in releases that also ship EphemeralClient.
client = chromadb.PersistentClient(path="./chroma_db")
# Ephemeral client (in-memory; data lost on restart)
ephemeral_client = chromadb.EphemeralClient()
# Create or get a collection
collection = client.get_or_create_collection(
    name="documents",
    metadata={"description": "Document chunks for RAG"},
)
# Add embeddings. The four lists are parallel: position i of each list
# describes the same record.
collection.add(
    ids=["doc1", "doc2", "doc3"],
    embeddings=[[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]],
    documents=["First document text", "Second document text", "Third document text"],
    metadatas=[{"source": "a.txt"}, {"source": "b.txt"}, {"source": "c.txt"}],
)
# Query
results = collection.query(
    query_embeddings=[[0.15, 0.25]],  # Query vector
    n_results=2,  # Return top 2
)
print(results["documents"])  # Retrieved documents (one inner list per query vector)
print(results["distances"])  # Distances, not similarities: lower means closer
Batch insertion with progress
For large datasets, batch insertion prevents memory issues.
def batch_insert_chunks(
    collection,
    chunks: "list[EmbeddedChunk]",
    batch_size: int = 100,
):
    """Insert chunks into a collection in fixed-size batches.

    Args:
        collection: Target collection. Only its ``add`` method is used.
        chunks: Chunks to store. Each one must expose ``chunk_id``,
            ``embedding``, ``text`` and ``metadata`` attributes.
        batch_size: Maximum number of chunks sent per ``add`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Validate up front: range() fails with an unrelated message for a zero
    # step and yields nothing for a negative one, which would silently skip
    # every chunk.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    total = len(chunks)
    for start in range(0, total, batch_size):
        batch = chunks[start:start + batch_size]
        collection.add(
            ids=[chunk.chunk_id for chunk in batch],
            embeddings=[chunk.embedding for chunk in batch],
            documents=[chunk.text for chunk in batch],
            metadatas=[chunk.metadata for chunk in batch],
        )
        # The last batch may be short, so cap the progress counter at total.
        print(f"Inserted {min(start + batch_size, total)}/{total} chunks")
Query with filtering
Metadata filtering narrows results to relevant documents.
# Filter by source (exact match on a single metadata field)
results = collection.query(
    query_embeddings=[query_embedding],
    where={"source": "policies.pdf"},
    n_results=5,
)
# Filter by year (comparison operator on a numeric field)
results = collection.query(
    query_embeddings=[query_embedding],
    where={"year": {"$gte": 2024}},
    n_results=5,
)
# Filter by multiple conditions.
# A `where` dict may contain only one top-level key, so several conditions
# have to be wrapped in a logical operator such as $and (or $or).
results = collection.query(
    query_embeddings=[query_embedding],
    where={
        "$and": [
            {"source": {"$in": ["a.pdf", "b.pdf"]}},
            {"section": "electronics"},
        ]
    },
    n_results=10,
)
Distance functions
ChromaDB supports three distance functions:
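Cosine (`cosine`): Measures the angle between vectors, reported as 1 − cosine similarity. Range 0-2. Lower is better.
Euclidean (`l2`, the default): Squared straight-line distance. Range 0+. Lower is better.
IP (Inner Product, `ip`): Based on the dot product and reported as 1 − dot product, so lower is better here too; for normalized vectors it ranks results the same way as cosine.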
# Specify distance function at collection creation.
# The metric is selected through the "hnsw:space" key of the collection metadata.
collection = client.get_or_create_collection(
name="documents_cosine",
metadata={"hnsw:space": "cosine"} # or "l2" or "ip"
)
Indexing and performance
ChromaDB uses HNSW (Hierarchical Navigable Small World) for approximate nearest neighbor search. This provides fast retrieval even with millions of vectors.
# Collection metadata controls indexing behavior.
# The HNSW tuning keys are spelled with an "_ef" suffix; other "hnsw:*"
# spellings are not recognized by Chroma.
# NOTE(review): get_or_create_collection returns an existing collection
# as-is, so these values presumably only take effect when "documents" is
# first created - confirm against the Chroma version in use.
collection = client.get_or_create_collection(
    name="documents",
    metadata={
        "hnsw:space": "cosine",  # Distance metric
        "hnsw:construction_ef": 40,  # Build quality (higher = better, slower)
        "hnsw:search_ef": 40,  # Query quality (higher = better, slower)
    },
)
Default values work for most use cases. Increase them only if retrieval quality (recall) is too low; higher values make index builds and queries slower.
Collection management
# List all collections
# NOTE(review): some Chroma releases return plain collection names from
# list_collections() instead of Collection objects - confirm for your version.
collections = client.list_collections()
for col in collections:
    # count is a method; it has to be called to get the number of records
    print(col.name, col.count())
# Get collection stats
collection = client.get_collection("documents")
print(f"Embedding count: {collection.count()}")
# Delete collection
client.delete_collection("documents")
# Reset database (delete all data).
# reset() is refused unless the client was created with
# Settings(allow_reset=True); it is destructive and cannot be undone.
client.reset()
Persistence and loading
import chromadb
# Save to disk: a persistent client stores everything under ./chroma_data
client = chromadb.PersistentClient(path="./chroma_data")
# Make sure the collection exists in this directory so the reload below
# has something to open (get_collection fails for an unknown name).
client.get_or_create_collection("documents")
# Reload from disk: a new client pointed at the same path sees the same data
loaded_client = chromadb.PersistentClient(path="./chroma_data")
collection = loaded_client.get_collection("documents")
# Check data
print(f"Loaded {collection.count()} embeddings")
Complete ingestion pipeline
def ingest_to_chroma(
    documents: list[dict],
    collection_name: str = "documents",
    embedding_model: str = "all-MiniLM-L6-v2",
    persist_directory: str = "./chroma_db",
) -> chromadb.Collection:
    """Run the complete pipeline: documents -> chunks -> embeddings -> ChromaDB.

    Args:
        documents: Raw documents, passed unchanged to
            ``DocumentEmbeddingPipeline.process_documents``.
        collection_name: Collection to create or reuse.
        embedding_model: Model name handed to ``DocumentEmbeddingPipeline``.
        persist_directory: Directory in which the database is stored.

    Returns:
        The collection the chunks were inserted into.
    """
    # Initialize. PersistentClient is used so the data is actually written to
    # disk; Client(Settings(persist_directory=...)) alone does not persist.
    client = chromadb.PersistentClient(path=persist_directory)
    embedding_pipeline = DocumentEmbeddingPipeline(embedding_model)

    # Get or create collection
    collection = client.get_or_create_collection(
        name=collection_name,
        metadata={"description": "RAG document store"},
    )

    # Process documents
    print("Processing documents...")
    all_chunks = embedding_pipeline.process_documents(documents)
    print(f"Created {len(all_chunks)} chunks")

    # Store in ChromaDB
    print("Storing embeddings in ChromaDB...")
    batch_insert_chunks(collection, all_chunks, batch_size=100)
    print(f"Done. Collection has {collection.count()} embeddings.")
    return collection
Query and retrieval
def query_chroma(
    collection,
    query_text: str,
    embedding_model,
    top_k: int = 5,
    filters: dict = None,
) -> list[dict]:
    """Query the vector store and return results with sources.

    Args:
        collection: Collection to search; only its ``query`` method is used.
        query_text: Natural-language query to embed and search for.
        embedding_model: Object whose ``encode`` accepts a list of strings
            and returns items that provide ``tolist()``.
        top_k: Number of results to request.
        filters: Optional metadata filter passed as ``where``. ``None`` or an
            empty dict means no filtering.

    Returns:
        One dict per hit with the keys ``"text"``, ``"metadata"`` and
        ``"distance"``, in the order returned by the collection.
    """
    # Embed query
    query_embedding = embedding_model.encode([query_text])[0].tolist()

    # Search
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
        # Chroma rejects an empty filter dict, so treat {} like "no filter".
        where=filters or None,
        include=["documents", "metadatas", "distances"],
    )

    # Format results. Exactly one query vector was sent, so the hits for it
    # are the first (and only) inner list of each field.
    documents = results["documents"][0]
    metadatas = results["metadatas"][0]
    distances = results["distances"][0]
    return [
        {"text": text, "metadata": metadata, "distance": distance}
        for text, metadata, distance in zip(documents, metadatas, distances)
    ]
# Usage: ask one question, restrict hits to 2024, and show the three
# closest chunks with their distance and source file.
model = SentenceTransformer("all-MiniLM-L6-v2")
results = query_chroma(
    collection,
    "What is the return policy for electronics?",
    model,
    top_k=3,
    filters={"year": 2024},
)
for hit in results:
    # Distance first, then a preview limited to the first 100 characters
    score, preview = hit["distance"], hit["text"][:100]
    print(f"[{score:.3f}] {preview}...")
    origin = hit["metadata"]["source"]
    print(f" Source: {origin}")
Debugging common issues
# Check what's in the collection
all_data = collection.get(include=["embeddings", "documents", "metadatas"])
print(f"Total: {len(all_data['ids'])}")
# Verify embeddings exist.
# Test for None / zero length explicitly: the embeddings may come back as a
# NumPy array, and `not array` raises for arrays with more than one element.
embeddings = all_data["embeddings"]
if embeddings is None or len(embeddings) == 0:
    print("Warning: No embeddings found!")
else:
    # Check for issues: print the dimensionality of the first three vectors
    for chunk_id, emb in zip(all_data["ids"], embeddings[:3]):
        print(f"Chunk {chunk_id}: {len(emb)} dimensions")
Backup and export
import json
def export_collection(collection, filepath: str):
    """Export collection data to JSON.

    Writes a single JSON object with the keys ``"ids"``, ``"documents"``,
    ``"embeddings"`` and ``"metadatas"`` to ``filepath`` and prints how many
    records were exported.

    Args:
        collection: Collection to export; only its ``get`` method is used.
        filepath: Destination path. An existing file is overwritten.
    """
    data = collection.get(include=["embeddings", "documents", "metadatas"])

    # Embeddings may be returned as NumPy arrays, which json cannot
    # serialize, so convert every vector to a plain list of floats.
    raw_embeddings = data["embeddings"]
    if raw_embeddings is None:
        embeddings = None
    else:
        embeddings = [[float(value) for value in vector] for vector in raw_embeddings]

    export_data = {
        "ids": data["ids"],
        "documents": data["documents"],
        "embeddings": embeddings,
        "metadatas": data["metadatas"],
    }
    # Fix the encoding so the export does not depend on the platform default.
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(export_data, f)
    print(f"Exported {len(data['ids'])} records to {filepath}")
Create a Python script that ingests three documents (use the Markdown files you created in Chapter 5 exercises). Embed them and store in ChromaDB. Then query with three different questions and print the retrieved chunks with their sources. Verify that the correct chunks are returned for each query.