18. Reasoning Application Project
Chapter 18 of 18 · 30 min
This chapter integrates the course content into a concrete project: building a reasoning-augmented document analysis system that handles complex multi-document queries.
Project Overview
Build a system that:
- Accepts queries about documents users provide
- Uses multi-step reasoning to break down complex queries
- Verifies each reasoning step against the source documents it cites
- Handles queries exceeding single-document scope through tool-based retrieval
- Reports confidence scores based on verification results
Project Architecture
┌─────────────────────────────────────────────────────────────────┐
│ Query Router │
│ Simple queries → Direct response │
│ Complex queries → Reasoning pipeline │
└─────────────────────────────────────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────────┐
│ Reasoning Engine │
│ - Chain-of-thought generation │
│ - Step-based verification │
│ - Self-correction loop │
└─────────────────────────────────────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────────┐
│ Tool Integration │
│ - Document search │
│ - Cross-reference extraction │
│ - Numeric calculations (if needed) │
└─────────────────────────────────────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────────┐
│ Response Builder │
│ - Synthesize verified reasoning │
│ - Attach confidence scores │
│ - Include verification evidence │
└─────────────────────────────────────────────────────────────────┘
Implementation
import json
from dataclasses import dataclass
from typing import List, Dict, Optional
@dataclass
class ReasoningStep:
    """One claim in a reasoning chain, together with its verification state."""

    # 1-based position of the step within its chain (see the sample output).
    step_number: int
    # The statement this step asserts.
    claim: str
    # Reference to the evidence cited for the claim; None when nothing is cited.
    evidence_source: Optional[str] = None
    # Starts as "pending"; ReasoningDocumentAnalyzer._verify_step later assigns
    # "verified", "failed", or "unverified" (no evidence cited).
    verification_status: str = "pending"
@dataclass
class AnalysisQuery:
    """A question together with the documents it should be answered from."""

    # Natural-language question to answer.
    question: str
    # Raw document texts the answer must be grounded in.
    documents: List[str]
    # Optional caller-supplied hints. The default stays None (not an empty
    # list), so the annotation is Optional to match; none of the code in this
    # listing reads the field.
    complexity_hints: Optional[List[str]] = None
class ReasoningDocumentAnalyzer:
    """
    Answer document queries with a verify-as-you-go reasoning pipeline.

    Simple queries take a fast path; complex ones go through chain-of-thought
    generation, per-step verification against the supplied documents, and one
    self-correction attempt for every step that fails verification.

    NOTE: this listing calls helpers it does not define (_fast_response,
    _extract_evidence, _parse_reasoning_chain, _parse_single_step,
    _find_document, _check_claim_support, _calculate_confidence and
    _extract_answer). They must be provided before the class runs end to end.
    """

    # Lower-case stems searched for in the question text. Stems rather than
    # whole words so that inflected forms ("comparing", "comparison",
    # "synthesis", "implication") are detected as well as the base keywords.
    _COMPLEXITY_STEMS = (
        'compar', 'relationship', 'between',
        'multiple', 'synthes', 'implication',
    )

    def __init__(self, model_client, document_store):
        """
        Store the collaborators used by the pipeline.

        Args:
            model_client: Object exposing ``generate(prompt)``; used to produce
                the reasoning chain and self-corrections.
            document_store: Kept on ``self.documents``; the methods shown here
                read documents from the query instead.
        """
        self.model = model_client
        self.documents = document_store
        # Not consulted by any method in this listing.
        self.verification_cache = {}

    def analyze(self, query: AnalysisQuery) -> Dict:
        """
        Main entry point: analyze documents and answer complex queries.

        Returns whatever ``_fast_response`` produces for simple queries, and
        the dict built by ``_build_response`` for complex ones.
        """
        # Step 1: Complexity assessment
        if not self._assess_complexity(query):
            return self._fast_response(query)

        # Step 2: Multi-step reasoning with verification
        reasoning_steps = self._generate_verified_reasoning(query)

        # Step 3: Build response from verified reasoning
        return self._build_response(query, reasoning_steps)

    def _assess_complexity(self, query: AnalysisQuery) -> bool:
        """
        Determine if the query requires the full reasoning pipeline.

        A query is complex when its question contains any complexity stem
        (comparison, synthesis, relationship wording, ...) or when it comes
        with more than three documents.
        """
        question_lower = query.question.lower()
        if any(stem in question_lower for stem in self._COMPLEXITY_STEMS):
            return True
        return len(query.documents) > 3

    def _generate_verified_reasoning(self, query: AnalysisQuery) -> List[ReasoningStep]:
        """
        Generate a reasoning chain and verify every step of it.

        Returns the steps in chain order. When a failed step is successfully
        corrected, the corrected step is inserted immediately before the
        failed original; both carry the same step number.
        """
        steps = []
        evidence = self._extract_evidence(query.documents)

        # Generate initial reasoning chain
        reasoning_prompt = (
            "\n"
            f"Question: {query.question}\n"
            f"Evidence: {evidence}\n"
            "Generate a step-by-step reasoning chain. For each step:\n"
            "1. State the claim clearly\n"
            "2. Identify which evidence supports this claim\n"
            "3. Note any logical dependencies on previous steps\n"
        )
        initial_chain = self.model.generate(reasoning_prompt)
        parsed_steps = self._parse_reasoning_chain(initial_chain)

        # Verify each step
        for step in parsed_steps:
            step.verification_status = self._verify_step(step, query.documents)
            if step.verification_status == "failed":
                # Attempt self-correction once
                corrected = self._self_correct_step(step, query.documents)
                if corrected:
                    steps.append(corrected)
            # Failed steps are kept as well, for transparency.
            steps.append(step)
        return steps

    def _verify_step(self, step: ReasoningStep, documents: List[str]) -> str:
        """
        Verify a reasoning step against the source documents.

        Returns:
            "unverified" when the step cites no evidence, "failed" when the
            cited evidence cannot be found or does not support the claim, and
            "verified" otherwise.
        """
        if not step.evidence_source:
            # No evidence cited: flag for review rather than fail outright.
            return "unverified"

        relevant_doc = self._find_document(step.evidence_source, documents)
        if not relevant_doc:
            return "failed"  # Cited evidence not found

        # Verify the evidence actually supports the claim
        claim_supported = self._check_claim_support(step.claim, relevant_doc)
        return "verified" if claim_supported else "failed"

    def _self_correct_step(self, step: ReasoningStep, documents: List[str]) -> Optional[ReasoningStep]:
        """
        Attempt to correct a failed reasoning step.

        Returns the corrected step only if it passes verification; returns
        None when no correction could be parsed or the correction fails too.
        """
        correction_prompt = (
            "\n"
            "The following reasoning step failed verification:\n"
            f"Claim: {step.claim}\n"
            f"Evidence source: {step.evidence_source}\n"
            f"Documents available: {documents}\n"
            "Generate a corrected version of this step or identify that\n"
            "the original claim was incorrect.\n"
        )
        corrected = self.model.generate(correction_prompt)

        new_step = self._parse_single_step(corrected)
        if new_step is None:
            # Nothing usable came back; treat it as "no correction available"
            # instead of failing on the attribute assignments below.
            return None

        # The correction replaces the original, so it keeps the same position.
        new_step.step_number = step.step_number
        new_step.verification_status = "pending"

        # Re-verify the correction
        verification = self._verify_step(new_step, documents)
        new_step.verification_status = verification
        return new_step if verification == "verified" else None

    def _build_response(self, query: AnalysisQuery, steps: List[ReasoningStep]) -> Dict:
        """
        Build the final response with verification transparency.

        Steps whose status is "unverified" count toward ``total_steps`` but
        appear in neither the ``verified`` nor the ``failed`` tally.
        """
        verified_steps = [s for s in steps if s.verification_status == "verified"]
        failed_steps = [s for s in steps if s.verification_status == "failed"]

        # Determine confidence based on verification rate (0 for an empty chain)
        verification_rate = len(verified_steps) / len(steps) if steps else 0
        confidence = self._calculate_confidence(verification_rate, len(failed_steps))

        # The answer is derived from verified steps only
        answer = self._extract_answer(verified_steps, query.question)

        return {
            'answer': answer,
            'confidence': confidence,
            'reasoning_chain': [
                {
                    'step': s.step_number,
                    'claim': s.claim,
                    'status': s.verification_status,
                    'source': s.evidence_source,
                }
                for s in steps
            ],
            'verification_summary': {
                'total_steps': len(steps),
                'verified': len(verified_steps),
                'failed': len(failed_steps),
                'verification_rate': verification_rate,
            },
        }
Running the Project
# Example usage
def demo():
    """Run the analyzer once over a small set of quarterly metrics and print the result."""
    quarterly_facts = [
        "Q3 Revenue: $4.2M, up 15% from Q2",
        "Q3 Customer count: 12,000, up 8% from Q2",
        "Q3 Churn rate: 3.2%, down from 4.1% in Q2",
        "Q2 Revenue: $3.65M, Customer count: 11,111",
    ]
    question = (
        "How did customer growth and revenue growth compare in Q3, "
        "and what does this suggest about revenue per customer?"
    )

    # deepseek_client is not defined in this listing; it has to exist in the
    # enclosing scope before demo() is called.
    analyzer = ReasoningDocumentAnalyzer(
        model_client=deepseek_client,
        document_store=quarterly_facts,
    )
    outcome = analyzer.analyze(
        AnalysisQuery(question=question, documents=quarterly_facts)
    )

    print(f"Answer: {outcome['answer']}")
    print(f"Confidence: {outcome['confidence']}")
    summary = outcome['verification_summary']
    print(f"Verified steps: {summary['verified']}/{summary['total_steps']}")


# Execute the demo
demo()
Expected Output Structure
# Typical verified response: the dict shape produced by _build_response
# (answer, confidence, per-step reasoning chain, verification summary).
{
    'answer': 'Customer growth was 8% while revenue growth was 15%, suggesting '
    'revenue per customer increased from approximately $328 to $350, '
    'indicating either higher-value customer acquisition or existing '
    'customer monetization improvement.',
    'confidence': 0.85,
    'reasoning_chain': [
        {
            'step': 1,
            'claim': 'Q3 revenue was $4.2M, a 15% increase from Q2',
            'status': 'verified',
            'source': 'Doc 1'
        },
        {
            'step': 2,
            'claim': 'Q3 customers were 12,000, an 8% increase from Q2',
            'status': 'verified',
            'source': 'Doc 2'
        },
        {
            'step': 3,
            'claim': 'Revenue per customer increased from ~$328 to ~$350',
            # NOTE(review): _verify_step as listed returns 'unverified' when a
            # step has no evidence source, so a derived step like this one
            # presumably needs extra handling to be reported as 'verified' -
            # confirm against the full implementation.
            'status': 'verified',
            'source': None # Calculated from steps 1 and 2
        }
    ],
    'verification_summary': {
        'total_steps': 3,
        'verified': 3,
        'failed': 0,
        'verification_rate': 1.0
    }
}
Testing and Validation
def test_reasoning_pipeline(analyzer=None):
    """
    Test the reasoning pipeline with various query types.

    Args:
        analyzer: Object whose ``analyze(query)`` returns a dict containing a
            ``verification_summary``. When omitted, a module-level name
            ``analyzer`` is used, as the original listing implicitly required.

    Returns:
        ``{'passed': bool, 'failures': [str, ...]}``.

    Raises:
        NameError: if no analyzer is passed and none exists at module level.
    """
    if analyzer is None:
        try:
            analyzer = globals()['analyzer']
        except KeyError:
            raise NameError(
                "name 'analyzer' is not defined; pass an analyzer or define "
                "one at module level"
            ) from None

    test_cases = [
        {
            'name': 'Simple retrieval',
            'query': AnalysisQuery(
                question="What was Q3 revenue?",
                documents=["Q3 Revenue: $4.2M"]
            ),
            'expected_verification_rate': 1.0,
            'expected_complexity': False
        },
        {
            'name': 'Multi-document comparison',
            'query': AnalysisQuery(
                question="Comparing Q2 and Q3, which quarter had higher churn?",
                documents=["Q2 Churn: 4.1%", "Q3 Churn: 3.2%"]
            ),
            'expected_verification_rate': 1.0,
            'expected_complexity': True
        },
        {
            # NOTE(review): _assess_complexity as listed keys on comparison /
            # synthesis wording or more than three documents, neither of which
            # this query has - confirm the complexity expectation below.
            'name': 'Numeric inference',
            'query': AnalysisQuery(
                question="What percentage did the stock price change?",
                documents=["Starting: $50, Ending: $47"]
            ),
            'expected_verification_rate': 0.8,  # Some numeric errors possible
            'expected_complexity': True
        }
    ]

    failures = []
    for test in test_cases:
        result = analyzer.analyze(test['query'])
        # Assumes the fast path also returns a 'verification_summary' -
        # TODO confirm against _fast_response.
        summary = result['verification_summary']

        # More than one reasoning step is used as the observable sign that the
        # full pipeline (rather than the fast path) handled the query.
        if test['expected_complexity'] != (summary['total_steps'] > 1):
            failures.append(f"{test['name']}: complexity detection failed")

        if summary['verification_rate'] < test['expected_verification_rate']:
            failures.append(
                f"{test['name']}: verification rate "
                f"{summary['verification_rate']} below expected"
            )

    return {'passed': len(failures) == 0, 'failures': failures}
Extension Ideas
After completing the base project:
- Add tool integration: Connect external data sources for real-time verification
- Implement confidence calibration: Track reported confidence against observed accuracy over time
- Add explanation generation: Automatically generate natural language summaries of reasoning chains
- Build audit logging: Track all reasoning steps for compliance and debugging
- Add self-correction with examples: Provide few-shot examples to improve self-correction accuracy
EXERCISE
Extend the project by adding one new capability: (1) tool-based live data retrieval, (2) cross-document conflict detection, or (3) numeric uncertainty propagation. Document how your extension changes the verification pipeline and what new failure modes it introduces.