17. Filtering and Hybrid Search
Pure vector search ignores metadata. Production systems need to combine semantic similarity with structured filtering—finding "similar products in category X" or "restaurants near me that match this taste profile."
Filter Classification
Pre-filtering: Apply filters before vector search. This can hurt recall if the filtered set is small.
Post-filtering: Run vector search, then filter results. Fast but may return fewer than k results.
Indexed filtering: Maintain separate indexes per filter value. Best performance for high-cardinality filters.
from dataclasses import dataclass
from typing import Optional, Callable
import numpy as np
@dataclass
class FilterSpec:
field: str
operator: str # 'eq', 'ne', 'gt', 'lt', 'in', 'range'
value: any
class HybridSearchIndex:
"""
Vector index with metadata filtering support.
Uses post-filtering strategy for flexibility.
"""
def __init__(self, vectors: np.ndarray, metadata: list[dict]):
self.vectors = vectors
self.metadata = metadata
self.n = len(vectors)
# Build inverted indices for common filter patterns
self._build_inverted_indices()
def _build_inverted_indices(self):
"""Pre-compute inverted indices for string fields with low cardinality."""
self.inverted_indices = {}
for i, doc in enumerate(self.metadata):
for key, value in doc.items():
if isinstance(value, str) and len(set(d.get(key) for d in self.metadata)) < 1000:
if key not in self.inverted_indices:
self.inverted_indices[key] = {}
if value not in self.inverted_indices[key]:
self.inverted_indices[key][value] = set()
self.inverted_indices[key][value].add(i)
def _evaluate_filter(self, filter_spec: FilterSpec, candidate_ids: set) -> set:
"""Evaluate filter against candidate IDs."""
if filter_spec.field in self.inverted_indices:
index = self.inverted_indices[filter_spec.field]
if filter_spec.operator == 'eq':
return candidate_ids & index.get(filter_spec.value, set())
elif filter_spec.operator == 'ne':
return candidate_ids - index.get(filter_spec.value, set())
elif filter_spec.operator == 'in':
matching = set()
for val in filter_spec.value:
matching |= index.get(val, set())
return candidate_ids & matching
# Fallback: scan candidates
matching = set()
for cid in candidate_ids:
if self._matches_filter(cid, filter_spec):
matching.add(cid)
return matching
def _matches_filter(self, doc_id: int, filter_spec: FilterSpec) -> bool:
"""Check if a single document matches filter spec."""
value = self.metadata[doc_id].get(filter_spec.field)
if filter_spec.operator == 'eq':
return value == filter_spec.value
elif filter_spec.operator == 'ne':
return value != filter_spec.value
elif filter_spec.operator == 'gt':
return value > filter_spec.value
elif filter_spec.operator == 'lt':
return value < filter_spec.value
elif filter_spec.operator == 'in':
return value in filter_spec.value
return False
def search(
self,
query: np.ndarray,
k: int,
filter_spec: Optional[FilterSpec] = None,
ef_search: int = 100
) -> tuple[np.ndarray, np.ndarray]:
"""
Hybrid search: vector search with optional filtering.
Returns top-k matching results.
"""
# Determine candidates
if filter_spec:
# Start with filtered subset
if filter_spec.field in self.inverted_indices:
candidate_ids = self.inverted_indices[filter_spec.field].get(
filter_spec.value, set()
)
else:
candidate_ids = set(range(self.n))
candidate_ids = self._evaluate_filter(filter_spec, candidate_ids)
else:
candidate_ids = set(range(self.n))
if not candidate_ids:
return np.array([]), np.array([])
# Run vector search on all candidates (simplified; real impl uses graph)
candidate_list = list(candidate_ids)
candidate_vectors = self.vectors[candidate_list]
# Compute distances
distances = np.linalg.norm(candidate_vectors - query, axis=1)
# Get top-k
top_k_local = np.argpartition(distances, min(k, len(distances)-1))[:k]
top_k_local = top_k_local[np.argsort(distances[top_k_local])]
# Map back to global IDs
top_k_global = np.array(candidate_list)[top_k_local]
return distances[top_k_local], top_k_global
Optimized Pre-Filtering with Labeled HNSW
For high-selectivity filters, combine vector and scalar indexes:
class LabeledHNSW:
"""
HNSW variant that supports efficient filtered search.
Each layer maintains labels for navigation.
"""
def search_filtered(
self,
query: np.ndarray,
filter_labels: set[str],
k: int,
ef_search: int = 100
) -> tuple[np.ndarray, np.ndarray]:
"""
Search with filter-aware navigation.
Skips nodes that don't match filter during graph traversal.
"""
candidates = [] # (distance, node_id)
visited = set()
# Start from entry point
entry = self.entry_point
current_distance = self._distance(query, self.vectors[entry])
# Layer-by-layer search
for layer in range(self.max_layer, 0, -1):
neighbors = self._layer_neighbors(entry, layer)
for neighbor in neighbors:
dist = self._distance(query, self.vectors[neighbor])
if dist < current_distance:
entry = neighbor
current_distance = dist
# Bottom layer: collect candidates respecting filter
candidates = self._beam_search(
query, ef_search, filter_labels=filter_labels
)
return self._top_k(candidates, k)
def _beam_search(self, query, ef, filter_labels, candidates=None, visited=None):
"""Beam search with filter constraint."""
if candidates is None:
candidates = []
if visited is None:
visited = set()
# Priority queue: (negative_distance, node_id)
pq = [(0, self.entry_point)] # max-heap via negated distance
while pq:
neg_dist, node_id = heappop(pq)
if node_id in visited:
continue
visited.add(node_id)
# Check filter
if self.labels[node_id] not in filter_labels:
continue
heappush(candidates, (neg_dist, node_id))
if len(candidates) > ef:
heappop(candidates) # Remove worst
# Explore neighbors
for neighbor in self.layer_neighbors[node_id][0]:
if neighbor not in visited:
dist = -self._distance(query, self.vectors[neighbor])
heappush(pq, (dist, neighbor))
return candidates
Failure Modes
Filter explosion: Highly selective filters may return empty results. Always handle the empty case and provide helpful feedback.
Cross-talk between filters: Multiple filters may interact unexpectedly. Test all filter combinations during development.
Filter-aware recall degradation: Graph traversal skips nodes that don't match filters, potentially missing relevant results. Validate recall with synthetic filtered queries.
Generate 100K vectors with synthetic metadata (category, price_range, region). Measure query latency and recall for different filter selectivities (0.1%, 1%, 10%, 50% of data). Compare pre-filter, post-filter, and indexed approaches.
# Expected: pre-filter fastest at 0.1%, indexed best at 10-50%
# Plot the crossover point for your implementation