15. Embedding Caching

Chapter 15 of 18 · 20 min

KEY INSIGHT

Cache embeddings to avoid recomputing them when documents haven't changed—this dramatically speeds up re-indexing.

Embedding computation is the expensive part. If you re-index the same documents, regenerating embeddings wastes time.

### Hash-Based Cache

```python
import hashlib
import json
import os
import re
from pathlib import Path

import numpy as np


class EmbeddingCache:
    """On-disk cache of document embeddings, keyed by document ID.

    Each embedding is stored as a ``.npy`` file inside ``cache_dir``.
    ``index.json`` maps every document ID to the SHA-256 hash of the text the
    stored embedding was computed from, so a changed document is detected and
    re-embedded on the next ``get``.
    """

    # IDs made only of these characters are used verbatim as file names;
    # anything else (path separators, "..", spaces, ...) is hashed instead.
    _SAFE_ID = re.compile(r"[A-Za-z0-9_-][A-Za-z0-9_.-]*")

    def __init__(self, cache_dir: str = ".embedding_cache"):
        self.cache_dir = Path(cache_dir)
        # parents=True so a nested cache location works on first use.
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.index_file = self.cache_dir / "index.json"
        self._load_index()

    def _load_index(self):
        """Load the ID -> hash index; start empty if it is missing or unreadable."""
        try:
            with open(self.index_file, encoding="utf-8") as f:
                loaded = json.load(f)
        except (FileNotFoundError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError:
            # a damaged index only costs recomputation, so it is not fatal.
            loaded = {}
        self.index = loaded if isinstance(loaded, dict) else {}

    def _save_index(self):
        """Persist the index via a temporary file that is then swapped in."""
        tmp_file = self.index_file.with_suffix(".json.tmp")
        with open(tmp_file, "w", encoding="utf-8") as f:
            json.dump(self.index, f, indent=2)
        # Replace in one step so an interrupted write cannot truncate the index.
        os.replace(tmp_file, self.index_file)

    def _compute_hash(self, text: str) -> str:
        """Hash document text to create the cache key."""
        return hashlib.sha256(text.encode("utf-8")).hexdigest()

    def _get_cache_path(self, doc_id: str) -> Path:
        """Return the ``.npy`` path for ``doc_id``, always directly inside ``cache_dir``."""
        doc_id = str(doc_id)
        if self._SAFE_ID.fullmatch(doc_id):
            name = doc_id
        else:
            # Unsafe IDs get a stable hashed file name instead of a raw path.
            name = "id-" + hashlib.sha256(doc_id.encode("utf-8")).hexdigest()
        return self.cache_dir / f"{name}.npy"

    def get(self, doc_id: str, text: str, embedding_model) -> list:
        """Get the embedding for ``text`` from the cache, or compute and cache it.

        Args:
            doc_id: Identifier the embedding is stored under.
            text: Current document text; its hash decides hit or miss.
            embedding_model: Object whose ``encode([text])`` returns a sequence
                of array-like embeddings supporting ``.tolist()``.

        Returns:
            The embedding as a plain list.
        """
        text_hash = self._compute_hash(text)
        cache_path = self._get_cache_path(doc_id)

        entry = self.index.get(doc_id)
        if isinstance(entry, dict) and entry.get("hash") == text_hash:
            # Cache hit - unless the file is gone or damaged, then recompute.
            try:
                return np.load(cache_path).tolist()
            except (OSError, ValueError, EOFError):
                pass

        # Cache miss - compute embedding
        embedding = embedding_model.encode([text])[0].tolist()

        # Save to cache
        np.save(cache_path, np.array(embedding))
        self.index[doc_id] = {"hash": text_hash}
        self._save_index()
        return embedding

    def invalidate(self, doc_id: str):
        """Remove a document's index entry and its cached embedding file."""
        if doc_id in self.index:
            del self.index[doc_id]
            cache_path = self._get_cache_path(doc_id)
            if cache_path.exists():
                cache_path.unlink()
            self._save_index()

    def clear(self):
        """Clear the entire cache: every ``.npy`` file and the index."""
        for path in self.cache_dir.glob("*.npy"):
            path.unlink()
        self.index = {}
        self._save_index()
```
### Usage with Search Engine

```python
from typing import Optional


class CachedSemanticSearchEngine(SemanticSearchEngine):
    """Search engine that takes its embeddings from an ``EmbeddingCache``."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        super().__init__(model_name)
        self.cache = EmbeddingCache()

    def index_documents(self, documents, ids=None, metadatas=None):
        """Add documents to the collection, reusing cached embeddings.

        Args:
            documents: Document texts to index.
            ids: One ID per document; defaults to ``doc_0``, ``doc_1``, ...
            metadatas: Optional metadata passed through to the collection.

        Returns:
            The number of documents passed in.

        Raises:
            ValueError: If ``ids`` and ``documents`` differ in length.
        """
        if ids is None:
            ids = [f"doc_{i}" for i in range(len(documents))]
        elif len(ids) != len(documents):
            # zip() below would otherwise drop the unmatched tail silently.
            raise ValueError(
                f"got {len(ids)} ids for {len(documents)} documents"
            )

        # Get embeddings (from cache or computed)
        embeddings = [
            self.cache.get(doc_id, doc, self.model)
            for doc_id, doc in zip(ids, documents)
        ]

        # NOTE(review): `self.collection` comes from SemanticSearchEngine, which
        # is not shown here. If its `add` does not overwrite an existing ID,
        # re-indexing needs an update/upsert call instead - confirm.
        self.collection.add(
            documents=documents,
            embeddings=embeddings,
            ids=ids,
            metadatas=metadatas,
        )
        return len(documents)

    def reindex_document(
        self, doc_id: str, document: str, metadata: Optional[dict] = None
    ):
        """Re-index a single document, dropping its cached embedding first.

        Returns:
            The result of ``index_documents`` for the one document.
        """
        self.cache.invalidate(doc_id)
        # Pass None rather than [None] when the caller supplied no metadata.
        metadatas = None if metadata is None else [metadata]
        return self.index_documents([document], ids=[doc_id], metadatas=metadatas)
```

### Cache Statistics

```python
def cache_stats(cache: EmbeddingCache) -> dict:
    """Get statistics about cache usage.

    Returns:
        A dict with the number of cached ``.npy`` files, their combined size
        in megabytes, and the number of entries in the cache index.
    """
    cache_files = list(cache.cache_dir.glob("*.npy"))
    total_size = sum(f.stat().st_size for f in cache_files)
    return {
        "cached_documents": len(cache_files),
        "total_size_mb": total_size / (1024 * 1024),
        "index_entries": len(cache.index),
    }
```

EXERCISE

Index 1000 documents with the cache. Delete 10 documents. Re-index all 1000 documents. Verify the second run is significantly faster (cache hits).