Filtering and Hybrid Search — Vector Database Internals (Chapter 17)

Pure vector search ignores metadata. Production systems need to combine semantic similarity with structured filtering—finding "similar products in category X" or "restaurants near me that match this taste profile."

Filter Classification

Pre-filtering: Apply filters before vector search. This can hurt recall if the filtered set is small.

Post-filtering: Run vector search, then filter results. Fast but may return fewer than k results.

Indexed filtering: Maintain separate indexes per filter value. Best performance for high-cardinality filters.

from dataclasses import dataclass
from typing import Optional, Callable
import numpy as np

@dataclass
class FilterSpec:
    field: str
    operator: str  # 'eq', 'ne', 'gt', 'lt', 'in', 'range'
    value: any

class HybridSearchIndex:
    """
    Vector index with metadata filtering support.
    Uses post-filtering strategy for flexibility.
    """
    
    def __init__(self, vectors: np.ndarray, metadata: list[dict]):
        self.vectors = vectors
        self.metadata = metadata
        self.n = len(vectors)
        
        # Build inverted indices for common filter patterns
        self._build_inverted_indices()
    
    def _build_inverted_indices(self):
        """Pre-compute inverted indices for string fields with low cardinality."""
        self.inverted_indices = {}
        
        for i, doc in enumerate(self.metadata):
            for key, value in doc.items():
                if isinstance(value, str) and len(set(d.get(key) for d in self.metadata)) < 1000:
                    if key not in self.inverted_indices:
                        self.inverted_indices[key] = {}
                    if value not in self.inverted_indices[key]:
                        self.inverted_indices[key][value] = set()
                    self.inverted_indices[key][value].add(i)
    
    def _evaluate_filter(self, filter_spec: FilterSpec, candidate_ids: set) -> set:
        """Evaluate filter against candidate IDs."""
        if filter_spec.field in self.inverted_indices:
            index = self.inverted_indices[filter_spec.field]
            
            if filter_spec.operator == 'eq':
                return candidate_ids & index.get(filter_spec.value, set())
            elif filter_spec.operator == 'ne':
                return candidate_ids - index.get(filter_spec.value, set())
            elif filter_spec.operator == 'in':
                matching = set()
                for val in filter_spec.value:
                    matching |= index.get(val, set())
                return candidate_ids & matching
        
        # Fallback: scan candidates
        matching = set()
        for cid in candidate_ids:
            if self._matches_filter(cid, filter_spec):
                matching.add(cid)
        return matching
    
    def _matches_filter(self, doc_id: int, filter_spec: FilterSpec) -> bool:
        """Check if a single document matches filter spec."""
        value = self.metadata[doc_id].get(filter_spec.field)
        
        if filter_spec.operator == 'eq':
            return value == filter_spec.value
        elif filter_spec.operator == 'ne':
            return value != filter_spec.value
        elif filter_spec.operator == 'gt':
            return value > filter_spec.value
        elif filter_spec.operator == 'lt':
            return value < filter_spec.value
        elif filter_spec.operator == 'in':
            return value in filter_spec.value
        
        return False
    
    def search(
        self,
        query: np.ndarray,
        k: int,
        filter_spec: Optional[FilterSpec] = None,
        ef_search: int = 100
    ) -> tuple[np.ndarray, np.ndarray]:
        """
        Hybrid search: vector search with optional filtering.
        Returns top-k matching results.
        """
        # Determine candidates
        if filter_spec:
            # Start with filtered subset
            if filter_spec.field in self.inverted_indices:
                candidate_ids = self.inverted_indices[filter_spec.field].get(
                    filter_spec.value, set()
                )
            else:
                candidate_ids = set(range(self.n))
                candidate_ids = self._evaluate_filter(filter_spec, candidate_ids)
        else:
            candidate_ids = set(range(self.n))
        
        if not candidate_ids:
            return np.array([]), np.array([])
        
        # Run vector search on all candidates (simplified; real impl uses graph)
        candidate_list = list(candidate_ids)
        candidate_vectors = self.vectors[candidate_list]
        
        # Compute distances
        distances = np.linalg.norm(candidate_vectors - query, axis=1)
        
        # Get top-k
        top_k_local = np.argpartition(distances, min(k, len(distances)-1))[:k]
        top_k_local = top_k_local[np.argsort(distances[top_k_local])]
        
        # Map back to global IDs
        top_k_global = np.array(candidate_list)[top_k_local]
        
        return distances[top_k_local], top_k_global

Optimized Pre-Filtering with Labeled HNSW

For high-selectivity filters, combine vector and scalar indexes:

class LabeledHNSW:
    """
    HNSW variant that supports efficient filtered search.
    Each layer maintains labels for navigation.
    """
    
    def search_filtered(
        self,
        query: np.ndarray,
        filter_labels: set[str],
        k: int,
        ef_search: int = 100
    ) -> tuple[np.ndarray, np.ndarray]:
        """
        Search with filter-aware navigation.
        Skips nodes that don't match filter during graph traversal.
        """
        candidates = []  # (distance, node_id)
        visited = set()
        
        # Start from entry point
        entry = self.entry_point
        current_distance = self._distance(query, self.vectors[entry])
        
        # Layer-by-layer search
        for layer in range(self.max_layer, 0, -1):
            neighbors = self._layer_neighbors(entry, layer)
            
            for neighbor in neighbors:
                dist = self._distance(query, self.vectors[neighbor])
                if dist < current_distance:
                    entry = neighbor
                    current_distance = dist
        
        # Bottom layer: collect candidates respecting filter
        candidates = self._beam_search(
            query, ef_search, filter_labels=filter_labels
        )
        
        return self._top_k(candidates, k)
    
    def _beam_search(self, query, ef, filter_labels, candidates=None, visited=None):
        """Beam search with filter constraint."""
        if candidates is None:
            candidates = []
        if visited is None:
            visited = set()
        
        # Priority queue: (negative_distance, node_id)
        pq = [(0, self.entry_point)]  # max-heap via negated distance
        
        while pq:
            neg_dist, node_id = heappop(pq)
            if node_id in visited:
                continue
            visited.add(node_id)
            
            # Check filter
            if self.labels[node_id] not in filter_labels:
                continue
            
            heappush(candidates, (neg_dist, node_id))
            
            if len(candidates) > ef:
                heappop(candidates)  # Remove worst
            
            # Explore neighbors
            for neighbor in self.layer_neighbors[node_id][0]:
                if neighbor not in visited:
                    dist = -self._distance(query, self.vectors[neighbor])
                    heappush(pq, (dist, neighbor))
        
        return candidates

Failure Modes

Filter explosion: Highly selective filters may return empty results. Always handle the empty case and provide helpful feedback.

Cross-talk between filters: Multiple filters may interact unexpectedly. Test all filter combinations during development.

Filter-aware recall degradation: Graph traversal skips nodes that don't match filters, potentially missing relevant results. Validate recall with synthetic filtered queries.