KEY INSIGHT
Users ask the same questions repeatedly, often in different words; caching by semantic similarity instead of exact string match turns more of those repeats into cache hits.
### Exact Caching Is Too Restrictive
Queries such as "how does consensus work" and "explain Raft consensus" are semantically equivalent but differ token by token, so exact-match caching misses these hits. Semantic caching instead returns a stored result when the cosine similarity between the new query's embedding and a cached query's embedding meets or exceeds a threshold.
```python
import hashlib
from datetime import datetime, timedelta
from typing import Any
class SemanticCache:
    """In-memory cache that serves exact and near-duplicate queries.

    Entries are keyed by the SHA-256 hex digest of the query text. A lookup
    tries that exact key first, then falls back to comparing the query
    embedding against every cached embedding and accepts the *most similar*
    entry whose cosine similarity reaches ``similarityThreshold``.
    """

    def __init__(self, similarityThreshold: float = 0.95):
        """Create an empty cache.

        Args:
            similarityThreshold: Minimum cosine similarity for a cached
                entry to count as a semantic hit.
        """
        # query_hash -> {"query", "embedding", "result", "timestamp"}
        self.cache = {}
        self.similarityThreshold = similarityThreshold

    def _hashQuery(self, query: str) -> str:
        """Return the SHA-256 hex digest of ``query``, used as the cache key."""
        return hashlib.sha256(query.encode()).hexdigest()

    def get(self, query: str, queryEmbedding: list[float]) -> tuple[Any, bool]:
        """Look up a cached result for ``query``.

        Args:
            query: Raw query text; its hash is tried as an exact key first.
            queryEmbedding: Embedding of ``query`` for the semantic fallback.

        Returns:
            ``(result, True)`` on an exact or semantic hit, otherwise
            ``(None, False)``. A semantic hit also prints a log line.
        """
        token_hash = self._hashQuery(query)
        exact_entry = self.cache.get(token_hash)
        if exact_entry is not None:
            return exact_entry["result"], True

        # Nothing cached yet, so there is nothing to compare against.
        if not self.cache:
            return None, False

        # Convert the query embedding once instead of once per cached entry.
        query_vec = np.array(queryEmbedding)

        # Scan every entry and keep the best qualifying match; stopping at
        # the first entry above the threshold would make the answer depend
        # on insertion order rather than on similarity.
        best_key = None
        best_entry = None
        best_sim = float("-inf")
        for cached_key, cached_entry in self.cache.items():
            sim = cosineSimilarity(query_vec, np.array(cached_entry["embedding"]))
            if sim >= self.similarityThreshold and sim > best_sim:
                best_key, best_entry, best_sim = cached_key, cached_entry, sim

        if best_entry is None:
            return None, False

        # Entries written without the query text fall back to the hash key.
        matched_query = best_entry.get("query", best_key)
        print(f"[CACHE HIT] Semantic match: '{matched_query}' -> this query")
        return best_entry["result"], True

    def set(self, query: str, queryEmbedding: list[float], result: Any) -> None:
        """Store ``result`` for ``query``, replacing any entry with the same text.

        The original query text is kept alongside the embedding so that
        semantic hits can be reported in readable form.
        """
        self.cache[self._hashQuery(query)] = {
            "query": query,
            "embedding": queryEmbedding,
            "result": result,
            "timestamp": datetime.utcnow(),
        }

    def evictStale(self, max_age_seconds: int = 3600) -> int:
        """Delete entries older than ``max_age_seconds``.

        Returns:
            The number of entries removed.
        """
        now = datetime.utcnow()
        max_age = timedelta(seconds=max_age_seconds)
        # Collect keys first; deleting while iterating the dict would raise.
        stale_keys = [
            key
            for key, entry in self.cache.items()
            if now - entry["timestamp"] > max_age
        ]
        for key in stale_keys:
            del self.cache[key]
        return len(stale_keys)
```
### Cache Warming
On service startup, pre-load the cache with the high-traffic queries identified from query logs:
```python
def warmCache(cache: SemanticCache, hotQueries: list[tuple[str, any]]) -> None:
    """Seed ``cache`` with known query/result pairs.

    Each query in ``hotQueries`` is embedded on its own through
    ``embed_texts`` and stored with its paired result via ``cache.set``.
    A one-line summary with the number of stored entries is printed at
    the end.

    Args:
        cache: Cache instance to populate.
        hotQueries: ``(query, result)`` pairs to insert.
    """
    loaded = 0
    for loaded, (text, answer) in enumerate(hotQueries, start=1):
        # embed_texts takes a list and returns a sequence; take its only item.
        vector = embed_texts([text])[0]
        cache.set(text, vector, answer)
    print(f"[CACHE] Warmed with {loaded} entries")
```
### TTL and Eviction Policies
```python
class LRUCache:
    """Bounded least-recently-used cache whose entries expire after a TTL.

    ``self.cache`` maps key -> {"result", "timestamp"} and also records
    recency: dicts preserve insertion order, and every successful read or
    write moves the key to the end, so the first key is always the next
    eviction candidate. This keeps hits and evictions O(1) and makes it
    impossible for the recency record and the stored entries to disagree.
    """

    def __init__(self, maxEntries: int = 1000, ttlSeconds: int = 3600):
        """Create an empty cache.

        Args:
            maxEntries: Maximum number of entries kept; a value of zero or
                less disables storage entirely.
            ttlSeconds: Age in seconds at which an entry stops being served.
        """
        self.maxEntries = maxEntries
        self.ttlSeconds = ttlSeconds
        self.cache = {}

    @property
    def accessOrder(self) -> list[str]:
        """Snapshot of keys from least to most recently used."""
        return list(self.cache)

    def get(self, key: str) -> Any:
        """Return the cached result for ``key``, or ``None`` if absent or expired.

        A fresh hit marks the key as most recently used; an expired entry
        is deleted on the spot.
        """
        entry = self.cache.get(key)
        if entry is None:
            return None
        if not self._isFresh(entry["timestamp"]):
            del self.cache[key]
            return None
        self._markAccessed(key)
        return entry["result"]

    def set(self, key: str, result: Any) -> None:
        """Store ``result`` under ``key`` as the most recently used entry.

        Overwriting an existing key replaces it in place: it never evicts
        another entry and never leaves a duplicate recency record behind.
        """
        if self.maxEntries <= 0:
            # A cache with no capacity stores nothing instead of failing.
            return
        # Remove any previous entry first so it neither counts toward the
        # capacity check nor keeps its old position in the recency order.
        self.cache.pop(key, None)
        while len(self.cache) >= self.maxEntries:
            # The first key in iteration order is the least recently used.
            del self.cache[next(iter(self.cache))]
        self.cache[key] = {"result": result, "timestamp": datetime.utcnow()}

    def _isFresh(self, timestamp: datetime) -> bool:
        """Return True while ``timestamp`` is younger than ``ttlSeconds``."""
        age_seconds = (datetime.utcnow() - timestamp).total_seconds()
        return age_seconds < self.ttlSeconds

    def _markAccessed(self, key: str) -> None:
        """Move ``key`` to the most-recently-used end of the ordering."""
        # Re-inserting a popped key places it last in dict iteration order.
        self.cache[key] = self.cache.pop(key)
```
### Failure Modes
A semantic cache with a low similarity threshold (<0.90) can return results cached for queries that are lexically similar but semantically different, which degrades generation quality. Use a high threshold (0.95+) and validate it against logged queries. Cache invalidation on corpus updates is tricky—clearing the entire cache on any index update is safe but wasteful; incremental invalidation by versioned corpus segments is more efficient but complex to implement.