Query Classification — Advanced RAG — Chunking, Retrieval, Re-ranking (Chapter 12)

Query classification determines retrieval strategy and can influence chunking and prompting downstream. Classifying queries before retrieval enables targeted system behavior.

Queries can be classified along several dimensions: intent (e.g., informational vs. navigational), complexity (simple fact vs. multi-hop reasoning), domain (technical vs. general), and language (English vs. multilingual).

Rule-based classification uses pattern matching on the query text. This approach works well when high precision is required, but the patterns must be maintained as vocabulary evolves.

import re
from enum import Enum
from typing import Callable

class QueryIntent(Enum):
    """Intent labels that a query can be assigned.

    Each member's value is the lowercase string form of its name.
    """
    # Query cued by words such as "when", "where", "how many", or "price".
    FACTUAL = "factual"
    # Query cued by phrases such as "what is", "define", or "meaning of".
    DEFINITIONAL = "definitional"
    # Query cued by phrases such as "how to", "steps", or "install".
    PROCEDURAL = "procedural"
    # Query cued by words such as "why", "explain", or "compare".
    EXPLORATORY = "exploratory"
    # Query cued by politeness phrases such as "thanks" or "could you".
    CONVERSATIONAL = "conversational"

class RuleBasedClassifier:
    def __init__(self):
        # Pattern definitions for each intent
        self.patterns = {
            QueryIntent.FACTUAL: [
                r'\b(when|where|who|how many|what year|what amount)\b',
                r'\b\d{4}\b',  # Years
                r'\b(price|cost|number|percentage|rate)\b'
            ],
            QueryIntent.DEFINITIONAL: [
                r'\b(what is|what are|define|definition of|meaning of)\b',
                r'\bwho is\b'
            ],
            QueryIntent.PROCEDURAL: [
                r'\b(how do|how to|steps?|instructions?|guide|tutorial)\b',
                r'\b(configure|install|set up|build|create)\b'
            ],
            QueryIntent.EXPLORATORY: [
                r'\b(why|explain|difference between|compare|relationship)\b',
                r'\?(?!\s*$)'  # Question mark not at end
            ],
            QueryIntent.CONVERSATIONAL: [
                r'\b(thanks|thank you|please|could you|would you)\b'
            ]
        }
        
        # Compile patterns
        self.compiled_patterns = {
            intent: [re.compile(p, re.IGNORECASE) for p in patterns]
            for intent, patterns in self.patterns.items()
        }
    
    def classify(self, query: str) -> QueryIntent:
        """Classify query intent using pattern matching."""
        scores = {}
        
        for intent, patterns in self.compiled_patterns.items():
            matches = sum(1 for p in patterns if p.search(query))
            scores[intent] = matches
        
        if max(scores.values()) == 0:
            return QueryIntent.EXPLORATORY  # Default
        
        return max(scores, key=scores.get)
    
    def get_retrieval_config(self, query: str) -> dict:
        """Get retrieval configuration based on classified intent."""
        intent = self.classify(query)
        
        configs = {
            QueryIntent.FACTUAL: {
                'strategies': ['sparse', 'keyword'],
                'top_k': 20,
                'chunk_size_preference': 'small'
            },
            QueryIntent.DEFINITIONAL: {
                'strategies': ['dense'],
                'top_k': 5,
                'chunk_size_preference': 'medium'
            },
            QueryIntent.PROCEDURAL: {
                'strategies': ['dense', 'sparse'],
                'top_k': 15,
                'chunk_size_preference': 'large'
            },
            QueryIntent.EXPLORATORY: {
                'strategies': ['dense', 'sparse', 'keyword'],
                'top_k': 30,
                'chunk_size_preference': 'medium'
            },
            QueryIntent.CONVERSATIONAL: {
                'strategies': ['dense'],
                'top_k': 10,
                'chunk_size_preference': 'medium'
            }
        }
        
        return configs.get(intent, configs[QueryIntent.EXPLORATORY])