11. Storing Embeddings in ChromaDB

Chapter 11 of 22 · 30 min

ChromaDB is a purpose-built vector database optimized for RAG workloads. It stores embeddings alongside metadata and supports fast similarity search. It runs locally without setup, making it ideal for development and production.

Installation and setup

pip install chromadb

Verify:

import chromadb
# Print the installed ChromaDB version to confirm the import succeeded.
print(f"ChromaDB version: {chromadb.__version__}")

ChromaDB architecture

ChromaDB has four key concepts:

  1. Client: Connection to the database. Can be in-memory (ephemeral) or persistent (disk-based).

  2. Collection: A named group of vectors, like a table. Each collection has a name and an optional distance function.

  3. Embeddings: The vector data. Stored as lists of floats.

  4. IDs: Unique identifiers for each embedding. You provide these; ChromaDB does not auto-generate them.

Basic operations

import chromadb
from chromadb.config import Settings

# Persistent client (data survives restarts).
# PersistentClient is the entry point that writes to disk; a plain
# chromadb.Client(Settings(persist_directory=...)) stays in memory on
# ChromaDB >= 0.4 unless persistence is explicitly switched on.
client = chromadb.PersistentClient(path="./chroma_db")

# Ephemeral client (data lost on restart)
ephemeral_client = chromadb.EphemeralClient()

# Create or get a collection
collection = client.get_or_create_collection(
    name="documents",
    metadata={"description": "Document chunks for RAG"}
)

# Add embeddings. The four lists are parallel: position i of each list
# describes the same record.
collection.add(
    ids=["doc1", "doc2", "doc3"],
    embeddings=[[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]],
    documents=["First document text", "Second document text", "Third document text"],
    metadatas=[{"source": "a.txt"}, {"source": "b.txt"}, {"source": "c.txt"}]
)

# Query
results = collection.query(
    query_embeddings=[[0.15, 0.25]],  # Query vector (same dimension as the stored vectors)
    n_results=2  # Return top 2
)

print(results["documents"])  # Retrieved documents (one inner list per query vector)
print(results["distances"])  # Distances to the query vector (lower = more similar)

Batch insertion with progress

For large datasets, batch insertion prevents memory issues.

def batch_insert_chunks(
    collection,
    chunks: list[EmbeddedChunk],
    batch_size: int = 100
):
    """Insert embedded chunks into a collection in fixed-size batches.

    Args:
        collection: Target collection. It must expose
            ``add(ids=..., embeddings=..., documents=..., metadatas=...)``.
        chunks: Chunks to store. Each one must provide ``chunk_id``,
            ``embedding``, ``text`` and ``metadata`` attributes.
        batch_size: Maximum number of chunks sent per ``add`` call.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    # range() rejects a zero step with an unrelated message, and a negative
    # step would silently insert nothing, so validate up front.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    total = len(chunks)

    for start in range(0, total, batch_size):
        batch = chunks[start:start + batch_size]

        collection.add(
            ids=[chunk.chunk_id for chunk in batch],
            embeddings=[chunk.embedding for chunk in batch],
            documents=[chunk.text for chunk in batch],
            metadatas=[chunk.metadata for chunk in batch]
        )

        # The last batch may be short, so cap the progress counter at total.
        print(f"Inserted {min(start + batch_size, total)}/{total} chunks")

Query with filtering

Metadata filtering narrows results to relevant documents.

# Filter by source (exact match on the "source" metadata field)
results = collection.query(
    query_embeddings=[query_embedding],
    where={"source": "policies.pdf"},
    n_results=5
)

# Filter by year (numeric comparison: year >= 2024)
results = collection.query(
    query_embeddings=[query_embedding],
    where={"year": {"$gte": 2024}},
    n_results=5
)

# Filter by multiple conditions.
# A `where` dict may contain only one top-level key, so separate conditions
# must be combined explicitly with "$and" (or "$or").
results = collection.query(
    query_embeddings=[query_embedding],
    where={
        "$and": [
            {"source": {"$in": ["a.pdf", "b.pdf"]}},
            {"section": "electronics"},
        ]
    },
    n_results=10
)

Distance functions

ChromaDB supports three distance functions:

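  1. Cosine: Measures the angle between vectors (1 − cosine similarity). Range 0-2. Lower is better.

  2. Euclidean (L2, default): Squared straight-line distance. Range 0+. Lower is better.

  3. IP (Inner Product): Based on the dot product, reported as 1 − dot product, so lower is better. For normalized vectors it ranks results the same way as cosine.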

# Specify distance function at collection creation.
# The "hnsw:space" metadata key selects the distance metric for this
# collection; the values shown here are "cosine", "l2" and "ip".
collection = client.get_or_create_collection(
    name="documents_cosine",
    metadata={"hnsw:space": "cosine"}  # or "l2" or "ip"
)

Indexing and performance

ChromaDB uses HNSW (Hierarchical Navigable Small World) for approximate nearest neighbor search. This provides fast retrieval even with millions of vectors.

# Collection metadata controls indexing behavior.
# The HNSW tuning keys carry an "_ef" suffix; "hnsw:construction" and
# "hnsw:search" are not recognised parameter names.
collection = client.get_or_create_collection(
    name="documents",
    metadata={
        "hnsw:space": "cosine",  # Distance metric
        "hnsw:construction_ef": 40,  # Build quality (higher = better, slower)
        "hnsw:search_ef": 40  # Query quality (higher = better, slower)
    }
)

Default values work for most use cases. Increase them only if retrieval quality (recall) is too low: higher values improve accuracy at the cost of slower indexing and slower queries.

Collection management

# List all collections
collections = client.list_collections()
for col in collections:
    # count is a method; without the call this prints the bound method
    # object instead of the number of stored embeddings.
    print(col.name, col.count())

# Get collection stats
collection = client.get_collection("documents")
print(f"Embedding count: {collection.count()}")

# Delete collection
client.delete_collection("documents")

# Reset database (delete all data).
# reset() is disabled by default: it raises unless the client was created
# with Settings(allow_reset=True).
client.reset()

Persistence and loading

import chromadb

# Save to disk: PersistentClient stores everything under the given path.
client = chromadb.PersistentClient(path="./chroma_data")

# Reload from disk: a new client pointed at the same path sees the same data.
loaded_client = chromadb.PersistentClient(path="./chroma_data")
# get_collection raises if "documents" was never created in this directory.
collection = loaded_client.get_collection("documents")

# Check data
print(f"Loaded {collection.count()} embeddings")

Complete ingestion pipeline

def ingest_to_chroma(
    documents: list[dict],
    collection_name: str = "documents",
    embedding_model: str = "all-MiniLM-L6-v2",
    persist_directory: str = "./chroma_db"
) -> chromadb.Collection:
    """
    Complete pipeline: documents -> chunks -> embeddings -> ChromaDB.

    Args:
        documents: Raw documents, passed unchanged to
            ``DocumentEmbeddingPipeline.process_documents``.
        collection_name: Name of the collection to create or reuse.
        embedding_model: Model name handed to ``DocumentEmbeddingPipeline``.
        persist_directory: Directory where ChromaDB stores its data.

    Returns:
        The collection that now holds the embedded chunks.
    """
    # Initialize. PersistentClient is required for on-disk storage; a plain
    # chromadb.Client(Settings(persist_directory=...)) stays in memory on
    # ChromaDB >= 0.4, so ingested data would be lost when the process exits.
    client = chromadb.PersistentClient(path=persist_directory)
    embedding_pipeline = DocumentEmbeddingPipeline(embedding_model)

    # Get or create collection
    collection = client.get_or_create_collection(
        name=collection_name,
        metadata={"description": "RAG document store"}
    )

    # Process documents
    print("Processing documents...")
    all_chunks = embedding_pipeline.process_documents(documents)
    print(f"Created {len(all_chunks)} chunks")

    # Store in ChromaDB
    print("Storing embeddings in ChromaDB...")
    batch_insert_chunks(collection, all_chunks, batch_size=100)

    print(f"Done. Collection has {collection.count()} embeddings.")
    return collection

Query and retrieval

def query_chroma(
    collection,
    query_text: str,
    embedding_model,
    top_k: int = 5,
    filters: dict = None
) -> list[dict]:
    """Query the vector store and return results with sources.

    Args:
        collection: Collection to search; must expose ``query(...)``.
        query_text: Natural-language query to embed and search for.
        embedding_model: Object whose ``encode(list_of_texts)`` returns one
            vector per text, each supporting ``.tolist()``.
        top_k: Maximum number of results to return.
        filters: Optional metadata filter forwarded as ``where``. ``None``
            or an empty dict means "no filtering".

    Returns:
        One dict per hit with ``"text"``, ``"metadata"`` and ``"distance"``
        keys, in the order the collection returned them.
    """
    # Embed query
    query_embedding = embedding_model.encode([query_text])[0].tolist()

    # Search. An empty filter dict is rejected by ChromaDB's `where`
    # validation, so normalise it to None (no filter).
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
        where=filters or None,
        include=["documents", "metadatas", "distances"]
    )

    # Format results. Index 0 selects the hits for our single query vector.
    return [
        {"text": text, "metadata": metadata, "distance": distance}
        for text, metadata, distance in zip(
            results["documents"][0],
            results["metadatas"][0],
            results["distances"][0],
        )
    ]

# Usage
# NOTE(review): SentenceTransformer is not imported in this snippet —
# presumably `from sentence_transformers import SentenceTransformer`; confirm.
model = SentenceTransformer("all-MiniLM-L6-v2")
# Ask for the 3 closest chunks, restricted to chunks whose "year" metadata is 2024.
results = query_chroma(
    collection,
    "What is the return policy for electronics?",
    model,
    top_k=3,
    filters={"year": 2024}
)

# Each result is a dict with "text", "metadata" and "distance" keys.
for r in results:
    # Distance (3 decimals) followed by the first 100 characters of the chunk.
    print(f"[{r['distance']:.3f}] {r['text'][:100]}...")
    # Assumes every chunk's metadata has a "source" key — verify against ingestion.
    print(f"  Source: {r['metadata']['source']}")

Debugging common issues

# Check what's in the collection
all_data = collection.get(include=["embeddings", "documents", "metadatas"])
print(f"Total: {len(all_data['ids'])}")

# Verify embeddings exist.
# Test for None / zero length explicitly: recent ChromaDB releases return
# embeddings as a NumPy array, and `not array` raises for arrays with more
# than one element.
embeddings = all_data['embeddings']
if embeddings is None or len(embeddings) == 0:
    print("Warning: No embeddings found!")
else:
    # Check for issues: the first few embeddings should all report the same
    # number of dimensions.
    for chunk_id, emb in zip(all_data['ids'], embeddings[:3]):
        print(f"Chunk {chunk_id}: {len(emb)} dimensions")

Backup and export

import json

def export_collection(collection, filepath: str):
    """Export collection data to JSON.

    Writes a single JSON object with ``"ids"``, ``"documents"``,
    ``"embeddings"`` and ``"metadatas"`` keys to ``filepath`` and prints a
    one-line summary.

    Args:
        collection: Collection to export; must expose ``get(include=...)``.
        filepath: Destination path. An existing file is overwritten.
    """
    data = collection.get(include=["embeddings", "documents", "metadatas"])

    # Embeddings may come back as a NumPy array (or a list of arrays), which
    # json cannot serialise; convert anything array-like to plain lists.
    embeddings = data["embeddings"]
    if embeddings is not None:
        if hasattr(embeddings, "tolist"):
            embeddings = embeddings.tolist()
        else:
            embeddings = [
                vector.tolist() if hasattr(vector, "tolist") else vector
                for vector in embeddings
            ]

    export_data = {
        "ids": data["ids"],
        "documents": data["documents"],
        "embeddings": embeddings,
        "metadatas": data["metadatas"]
    }

    # Explicit encoding keeps the export independent of the platform default.
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(export_data, f)

    print(f"Exported {len(data['ids'])} records to {filepath}")
EXERCISE

Create a Python script that ingests three documents (use the Markdown files you created in Chapter 5 exercises). Embed them and store in ChromaDB. Then query with three different questions and print the retrieved chunks with their sources. Verify that the correct chunks are returned for each query.