21. Advanced RAG Evaluation
Chapter 21 of 22 · 20 min
Beyond basic precision and recall, advanced evaluation measures whether retrieved context actually helps answer questions correctly.
Context Utilization Analysis
def analyze_context_utilization(answer: str,
                                context: str,
                                question: str) -> dict:
    """Analyze how well the context was used in generating the answer.

    Args:
        answer: The generated answer text.
        context: The retrieved context the answer was generated from.
        question: The user question. Currently unused; kept so existing
            callers do not break.

    Returns:
        A dict with three keys:
            "entity_recall": fraction of the *distinct* context entities
                that also appear among the answer entities (0.0 when the
                context yields no entities).
            "unsupported_claims": whatever ``check_claims`` returns for
                this context/answer pair.
            "citation_accuracy": whatever ``verify_citations`` returns for
                this answer/context pair.
    """
    # De-duplicate once so numerator and denominator count the same thing.
    # Dividing the unique overlap by the raw (possibly repeated) entity
    # count would understate recall whenever the context repeats an entity.
    context_entities = set(extract_entities(context))
    answer_entities = set(extract_entities(answer))

    if context_entities:
        shared = context_entities & answer_entities
        entity_recall = len(shared) / len(context_entities)
    else:
        entity_recall = 0.0

    # Claims made in the answer that the context does not support.
    unsupported_claims = check_claims(context, answer)

    # Citation check is delegated entirely to verify_citations.
    citations_correct = verify_citations(answer, context)

    return {
        "entity_recall": entity_recall,
        "unsupported_claims": unsupported_claims,
        "citation_accuracy": citations_correct,
    }
def check_claims(context: str, answer: str) -> List[str]:
    """Return the claims from ``answer`` that ``context`` does not support.

    The answer is broken into claims by ``parse_claims``; each claim is then
    tested with ``is_entailed`` against the context, and the ones that fail
    are returned in their original order.
    """
    return [
        statement
        for statement in parse_claims(answer)
        if not is_entailed(statement, context)
    ]
# Lazily-created NLI classifier shared by every is_entailed() call.
_NLI_PIPELINE = None


def _get_nli_pipeline():
    """Return the shared NLI classifier, loading the model on first use."""
    global _NLI_PIPELINE
    if _NLI_PIPELINE is None:
        # bart-large-mnli is a sequence-classification (NLI) checkpoint, so it
        # must be served by the text-classification pipeline to get a
        # {"label": ..., "score": ...} prediction back.
        _NLI_PIPELINE = pipeline("text-classification",
                                 model="facebook/bart-large-mnli")
    return _NLI_PIPELINE


def is_entailed(claim: str, context: str, *, strict: bool = False) -> bool:
    """Check whether ``claim`` is supported by ``context`` using an NLI model.

    Args:
        claim: The hypothesis to test.
        context: The premise the claim is tested against.
        strict: When False (default), both "entailment" and "neutral"
            predictions count as supported, i.e. only a contradiction is
            rejected. When True, only "entailment" counts.

    Returns:
        True if the predicted NLI label is in the accepted set, else False.
    """
    # Premise/hypothesis are passed as a real sentence pair so the tokenizer
    # inserts the separator the model expects; truncation keeps long
    # contexts within the model's input limit.
    result = _get_nli_pipeline()(
        {"text": context, "text_pair": claim},
        truncation=True,
    )
    # The pipeline may return a single prediction dict or a one-element list.
    prediction = result[0] if isinstance(result, list) else result
    label = str(prediction["label"]).lower()

    accepted = ("entailment",) if strict else ("entailment", "neutral")
    return label in accepted
Hallucination Detection
class HallucinationDetector:
    """Flag answer claims that an NLI model says contradict the context."""

    def __init__(self):
        # bart-large-mnli is an NLI sequence classifier; it needs the
        # text-classification pipeline to return {"label", "score"} dicts.
        self.entailment_model = pipeline("text-classification",
                                         model="facebook/bart-large-mnli")

    def detect(self, context: str, answer: str) -> dict:
        """Detect hallucinations in the generated answer.

        Args:
            context: Retrieved context used as the NLI premise.
            answer: Generated answer; it is split into claims that are each
                checked against the context.

        Returns:
            A dict with:
                "total_claims": number of claims found in the answer.
                "hallucinated_claims": number of claims flagged.
                "hallucination_rate": flagged / total (0.0 if no claims).
                "hallucinated_claims_list": the flagged claims, in order.
                "per_claim_results": one {"claim", "hallucinated"} dict per
                    claim, in order.
        """
        claims = self._split_into_claims(answer)

        results = [
            {"claim": claim, "hallucinated": self._is_hallucinated(claim, context)}
            for claim in claims
        ]
        hallucinated = [r["claim"] for r in results if r["hallucinated"]]

        return {
            "total_claims": len(claims),
            "hallucinated_claims": len(hallucinated),
            "hallucination_rate": len(hallucinated) / len(claims) if claims else 0.0,
            "hallucinated_claims_list": hallucinated,
            "per_claim_results": results,
        }

    def _split_into_claims(self, answer: str) -> list:
        """Split an answer into sentence-level claims.

        This is a deliberately simple splitter: it breaks on whitespace that
        follows '.', '!' or '?' and drops empty pieces.
        """
        import re  # local import keeps this block self-contained

        pieces = re.split(r"(?<=[.!?])\s+", answer.strip())
        return [piece.strip() for piece in pieces if piece.strip()]

    def _is_hallucinated(self, claim: str, context: str) -> bool:
        """Return True when the NLI model labels the claim a contradiction."""
        # Pass premise and hypothesis as a sentence pair; truncate so long
        # contexts stay within the model's input limit.
        result = self.entailment_model(
            {"text": context, "text_pair": claim},
            truncation=True,
        )
        # The pipeline may return a single prediction dict or a one-element list.
        prediction = result[0] if isinstance(result, list) else result
        return str(prediction["label"]).lower() == "contradiction"
Retrieval-Answer Consistency
def measure_rag_accuracy(test_set: list) -> dict:
    """Measure RAG accuracy on a test set with ground truth.

    Args:
        test_set: A list of dicts, each with the keys "question",
            "expected_answer" and "retrieved_context".

    Returns:
        A dict with:
            "accuracy": fraction of items whose generated answer was judged
                correct by ``check_answer_correctness``.
            "context_sufficiency": fraction of items for which
                ``check_context_helps`` returned a truthy value.
            "per_question_results": one dict per item with the keys
                "question", "correct" and "context_sufficient".
        For an empty ``test_set`` both fractions are 0.0 and the per-question
        list is empty.
    """
    results = []
    for item in test_set:
        question = item["question"]
        expected_answer = item["expected_answer"]
        retrieved_context = item["retrieved_context"]

        # Generate an answer from the retrieved context, then grade it.
        generated = generate_answer(question, retrieved_context)
        is_correct = check_answer_correctness(generated, expected_answer)

        # Independently judge whether the context was sufficient.
        context_helps = check_context_helps(question, retrieved_context)

        results.append({
            "question": question,
            "correct": is_correct,
            "context_sufficient": context_helps,
        })

    # Guard the averages: an empty test set would otherwise divide by zero.
    total = len(results)
    if total:
        accuracy = sum(r["correct"] for r in results) / total
        context_sufficiency = sum(r["context_sufficient"] for r in results) / total
    else:
        accuracy = 0.0
        context_sufficiency = 0.0

    return {
        "accuracy": accuracy,
        "context_sufficiency": context_sufficiency,
        "per_question_results": results,
    }
EXERCISE
Implement hallucination detection on your RAG outputs. Calculate hallucination rate across 50 queries and identify common hallucination patterns.