12. Query Classification
Query classification determines retrieval strategy and can influence chunking and prompting downstream. Classifying queries before retrieval enables targeted system behavior.
Classification dimensions include intent (informational vs. navigational), complexity (simple fact vs. multi-hop reasoning), domain (technical vs. general), and language (English vs. multilingual).
Rule-based classification uses pattern matching on the query text. This approach suits scenarios where high precision matters, but the patterns require ongoing maintenance as vocabulary evolves.
import re
from enum import Enum
from typing import Callable
class QueryIntent(Enum):
    """Intent categories a query can be classified into.

    Each member's value is the lowercase string form of its name, so a member
    can be recovered from its string with ``QueryIntent("factual")``.
    """

    # Lookup of a specific fact (dates, people, quantities, prices).
    FACTUAL = "factual"
    # Request for the meaning or definition of a term.
    DEFINITIONAL = "definitional"
    # Request for steps or instructions to accomplish something.
    PROCEDURAL = "procedural"
    # Open-ended "why" / comparison style questions.
    EXPLORATORY = "exploratory"
    # Polite or chatty phrasing rather than a pure information need.
    CONVERSATIONAL = "conversational"
class RuleBasedClassifier:
    """Classify query intent by counting regex pattern hits per intent.

    Every intent owns a list of case-insensitive patterns. A query earns one
    point for each pattern of an intent that matches anywhere in the text, and
    the intent with the most points wins.

    Attributes:
        patterns: Raw regex strings keyed by ``QueryIntent``.
        compiled_patterns: The same patterns compiled with ``re.IGNORECASE``,
            in the same key order as ``patterns``.
    """

    def __init__(self):
        # Pattern definitions for each intent. Declaration order matters:
        # ``classify`` resolves score ties in favour of the earlier intent.
        self.patterns = {
            QueryIntent.FACTUAL: [
                # ``who`` deliberately excludes "who is": that phrase has its
                # own DEFINITIONAL rule, which could otherwise never win
                # because FACTUAL is declared first and would take the tie.
                r'\b(when|where|who(?! is\b)|how many|what year|what amount)\b',
                r'\b\d{4}\b',  # Years
                r'\b(price|cost|number|percentage|rate)\b',
            ],
            QueryIntent.DEFINITIONAL: [
                r'\b(what is|what are|define|definition of|meaning of)\b',
                r'\bwho is\b',
            ],
            QueryIntent.PROCEDURAL: [
                r'\b(how do|how to|steps?|instructions?|guide|tutorial)\b',
                r'\b(configure|install|set up|build|create)\b',
            ],
            QueryIntent.EXPLORATORY: [
                r'\b(why|explain|difference between|compare|relationship)\b',
                r'\?(?!\s*$)',  # Question mark not at end
            ],
            QueryIntent.CONVERSATIONAL: [
                r'\b(thanks|thank you|please|could you|would you)\b',
            ],
        }
        # Compile once up front so classify() only has to run searches.
        self.compiled_patterns = {
            intent: [re.compile(p, re.IGNORECASE) for p in patterns]
            for intent, patterns in self.patterns.items()
        }

    def classify(self, query: str) -> "QueryIntent":
        """Classify query intent using pattern matching.

        Args:
            query: The raw query text to inspect.

        Returns:
            The intent whose pattern list has the most matches in ``query``.
            When several intents share the top score, the one declared first
            in ``compiled_patterns`` is returned. When no pattern matches at
            all, ``QueryIntent.EXPLORATORY`` is returned as the default.
        """
        scores = {
            intent: sum(1 for pattern in patterns if pattern.search(query))
            for intent, patterns in self.compiled_patterns.items()
        }
        # max() yields the first maximal key in iteration order, which is what
        # makes the declaration order of the pattern table the tie-breaker.
        best_intent = max(scores, key=scores.get)
        if scores[best_intent] == 0:
            return QueryIntent.EXPLORATORY  # Default
        return best_intent

    def get_retrieval_config(self, query: str) -> dict:
        """Get retrieval configuration based on classified intent.

        Args:
            query: The raw query text; it is classified with ``classify``.

        Returns:
            A dict with the keys ``'strategies'`` (list of strategy names),
            ``'top_k'`` (int) and ``'chunk_size_preference'`` (str). The table
            is rebuilt on every call, so the caller may mutate the result.
        """
        intent = self.classify(query)
        configs = {
            QueryIntent.FACTUAL: {
                'strategies': ['sparse', 'keyword'],
                'top_k': 20,
                'chunk_size_preference': 'small',
            },
            QueryIntent.DEFINITIONAL: {
                'strategies': ['dense'],
                'top_k': 5,
                'chunk_size_preference': 'medium',
            },
            QueryIntent.PROCEDURAL: {
                'strategies': ['dense', 'sparse'],
                'top_k': 15,
                'chunk_size_preference': 'large',
            },
            QueryIntent.EXPLORATORY: {
                'strategies': ['dense', 'sparse', 'keyword'],
                'top_k': 30,
                'chunk_size_preference': 'medium',
            },
            QueryIntent.CONVERSATIONAL: {
                'strategies': ['dense'],
                'top_k': 10,
                'chunk_size_preference': 'medium',
            },
        }
        # Fall back to the exploratory profile if classify() ever yields an
        # intent that has no entry in the table above.
        return configs.get(intent, configs[QueryIntent.EXPLORATORY])
Evaluate classifier accuracy on a held-out set of 200 queries with manual labels. Report per-class precision, recall, and F1. Continue to Chapters 13-24 for advanced reranking techniques, evaluation frameworks, production deployment patterns, and monitoring strategies.