03. Brute Force Search
Chapter 3 of 18 · 15 min
Brute force is the baseline. Every advanced index is optimizing away comparisons, so understanding brute force's behavior tells you where the pain points are.
EXERCISE
Measure brute force latency as a function of dataset size. Plot the curve and identify where an index becomes worthwhile for your latency budget.
import numpy as np
import time
def benchmark_bruteforce(sizes, dim=128, n_queries=50):
results = {}
for n in sizes:
db = np.random.rand(n, dim).astype('float32')
queries = np.random.rand(n_queries, dim).astype('float32')
start = time.time()
for q in queries:
diff = db - q
dists = np.sqrt(np.sum(diff ** 2, axis=1))
np.argsort(dists)[:10]
elapsed = (time.time() - start) / n_queries
results[n] = elapsed * 1000 # ms
return results
sizes = [1000, 5000, 10000, 50000, 100000, 500000]
timings = benchmark_bruteforce(sizes)
for n, ms in timings.items():
print(f"{n:>8,}: {ms:.2f}ms")