KEY INSIGHT
Multi-modal evaluation requires measuring alignment between visual understanding and generated text, grounding accuracy in image evidence, and consistency across different phrasings of the same query.
Standard NLP metrics (BLEU, ROUGE) capture visual grounding accuracy poorly. Multi-modal evaluation needs metrics that verify that claims made about an image are actually supported by the image content.
```python
import base64
import re
from dataclasses import dataclass
from typing import Optional
@dataclass
class MultiModalMetrics:
    """Scores describing how well a generated answer matches an image.

    Field order is part of the interface: instances may be built positionally.
    """

    # Can claims be verified in image? (fraction of extracted claims verified)
    grounding_score: float
    # Do semantically equivalent queries get same answer?
    consistency_score: float
    # Are key visual elements mentioned?
    completeness_score: float
    # What fraction of claims cannot be verified?
    hallucination_rate: float
class MultiModalEvaluator:
    """Evaluate image-grounded answers by using a multimodal model as judge.

    ``client`` must expose an awaitable
    ``messages.create(model=..., messages=...)`` whose result has a
    ``content`` sequence of blocks; the first block carrying a ``text``
    attribute is treated as the model's reply.
    """

    DEFAULT_MODEL = "gemini-2.0-flash-thinking"

    # Verdict must be a standalone word: a plain substring test would find
    # "YES" inside "EYES" and mark a rejected claim as verified.
    _VERDICT_RE = re.compile(r"\b(YES|NO|PARTIAL(?:LY)?)\b")
    # "1. claim" / "2) claim"; whitespace is required after the marker so a
    # line starting with a decimal such as "1.5 m tall" is not split.
    _NUMBERED_RE = re.compile(r"\d+[.)]\s+(.*)")
    # First number in the reply, with an optional percent sign.
    _SCORE_RE = re.compile(r"(\d+(?:\.\d+)?|\.\d+)\s*(%)?")

    def __init__(self, client, model: str = DEFAULT_MODEL):
        """Store the API client and the model id used for every judge call."""
        self.client = client
        self.model = model

    async def evaluate_vqa_response(
        self,
        image_path: str,
        question: str,
        answer: str
    ) -> "MultiModalMetrics":
        """Evaluate whether a generated answer is grounded in the image.

        The model first extracts verifiable claims from ``answer``; each claim
        is then checked against the image. Only a YES verdict counts as
        verified, so PARTIAL and NO both count towards the hallucination rate.

        Args:
            image_path: Path of the image file; read once and sent as base64.
            question: The question the answer responds to.
            answer: The generated answer under evaluation.

        Returns:
            MultiModalMetrics with grounding, completeness and hallucination
            filled in. ``consistency_score`` is always 0.0 here; use
            ``evaluate_consistency`` for that measurement. When no claims are
            extracted, grounding and hallucination are both 0.0.
        """
        img_b64 = self._encode_image(image_path)

        # Extract verifiable claims from answer
        claims_prompt = (
            "\n"
            "Extract factual claims from this answer that can be verified by looking at the image.\n"
            f"Answer: {answer}\n"
            "List each claim as a bullet point. Be specific.\n"
        )
        claims_response = await self._ask(img_b64, claims_prompt)
        claims = self._parse_claims(self._response_text(claims_response))

        # Verify each claim against image
        verified_count = 0
        for claim in claims:
            verification = await self._verify_claim(image_path, img_b64, claim)
            if verification["verified"]:
                verified_count += 1

        if claims:
            grounding_score = verified_count / len(claims)
            hallucination_rate = (len(claims) - verified_count) / len(claims)
        else:
            grounding_score = 0.0
            hallucination_rate = 0.0

        # Evaluate completeness
        completeness = await self._evaluate_completeness(
            image_path, img_b64, question, answer
        )

        return MultiModalMetrics(
            grounding_score=grounding_score,
            consistency_score=0.0,  # Requires separate consistency evaluation
            completeness_score=completeness,
            hallucination_rate=hallucination_rate
        )

    async def evaluate_consistency(
        self,
        image_path: str,
        queries: list[str]
    ) -> float:
        """Evaluate whether semantically equivalent queries get consistent answers.

        Each query is asked about the same image and the normalized replies
        are compared as exact strings.

        Returns:
            ``1.0 - (distinct_answers - 1) / len(queries)``: 1.0 when every
            reply normalizes to the same string, ``1 / len(queries)`` when all
            differ. An empty ``queries`` list returns 1.0 (no disagreement
            observed) without reading the image.
        """
        if not queries:
            return 1.0

        img_b64 = self._encode_image(image_path)
        answers = []
        for query in queries:
            response = await self._ask(img_b64, query)
            answers.append(self._normalize(self._response_text(response)))

        # Measure agreement between normalized answers
        distinct = len(set(answers))
        return 1.0 - (distinct - 1) / len(answers)

    async def _verify_claim(
        self,
        image_path: str,
        img_b64: str,
        claim: str
    ) -> dict:
        """Verify a single claim against image evidence.

        ``image_path`` is not used; it is kept so existing callers still work.

        Returns:
            A dict with ``"verified"`` (True only for a YES verdict),
            ``"partial"`` (True only for a PARTIAL verdict) and ``"response"``
            (the raw reply text). A reply without a recognizable verdict is
            treated as neither verified nor partial.
        """
        verification_prompt = (
            "\n"
            "Can this claim be verified by looking at the image?\n"
            f"Claim: {claim}\n"
            "Answer YES if the claim is clearly supported by the image.\n"
            "Answer NO if the claim contradicts or cannot be verified from the image.\n"
            "Answer PARTIAL if the claim is partially supported.\n"
        )
        response = await self._ask(img_b64, verification_prompt)
        reply = self._response_text(response)
        verdict = self._parse_verdict(reply)
        return {
            "verified": verdict == "YES",
            "partial": verdict == "PARTIAL",
            "response": reply
        }

    async def _evaluate_completeness(
        self,
        image_path: str,
        img_b64: str,
        question: str,
        answer: str
    ) -> float:
        """Ask the model to rate how completely the answer covers the image.

        ``image_path`` is not used; the signature mirrors ``_verify_claim``.

        Returns:
            A score in [0.0, 1.0]; 0.0 when the reply contains no number.
        """
        completeness_prompt = (
            "\n"
            "Rate how completely this answer covers the visual elements of the image "
            "that are relevant to the question.\n"
            f"Question: {question}\n"
            f"Answer: {answer}\n"
            "Reply with a single number between 0.0 (covers nothing relevant) "
            "and 1.0 (covers everything relevant).\n"
        )
        response = await self._ask(img_b64, completeness_prompt)
        return self._parse_score(self._response_text(response))

    async def _ask(self, img_b64: str, prompt: str):
        """Send one user turn containing the base64 image followed by ``prompt``."""
        return await self.client.messages.create(
            model=self.model,
            messages=[{
                "role": "user",
                "content": [
                    {"type": "image", "source": {"type": "base64", "data": img_b64}},
                    {"type": "text", "text": prompt}
                ]
            }]
        )

    @staticmethod
    def _encode_image(image_path: str) -> str:
        """Read the file at ``image_path`` and return its bytes as base64 text."""
        with open(image_path, "rb") as f:
            return base64.b64encode(f.read()).decode()

    @staticmethod
    def _response_text(response) -> str:
        """Return the text of the first content block that has one, else ""."""
        for block in response.content:
            text = getattr(block, "text", None)
            if text is not None:
                return text
        return ""

    @classmethod
    def _parse_verdict(cls, text: str) -> Optional[str]:
        """Return the first standalone YES / NO / PARTIAL in ``text``, or None."""
        match = cls._VERDICT_RE.search(text.strip().upper())
        if match is None:
            return None
        word = match.group(1)
        # "PARTIALLY" is reported as the PARTIAL verdict.
        return "PARTIAL" if word.startswith("PARTIAL") else word

    @classmethod
    def _parse_score(cls, text: str) -> float:
        """Parse the first number in ``text`` into a score clamped to [0, 1].

        A value followed by ``%`` is divided by 100. Returns 0.0 when the
        text contains no number.
        """
        match = cls._SCORE_RE.search(text)
        if match is None:
            return 0.0
        value = float(match.group(1))
        if match.group(2):
            value /= 100
        return min(1.0, max(0.0, value))

    def _parse_claims(self, text: str) -> list[str]:
        """Extract claims from model response.

        Recognizes lines starting with ``-``, ``*`` or ``•`` and numbered
        items such as ``1. ...`` / ``2) ...``. Lines that are empty once the
        marker is removed (a lone ``-``, a ``---`` rule) are skipped so they
        do not count as claims.
        """
        claims = []
        for raw_line in text.split("\n"):
            line = raw_line.strip()
            if line.startswith(("-", "*", "\u2022")):
                claim = line.lstrip("-*\u2022 ").strip()
            else:
                numbered = self._NUMBERED_RE.match(line)
                if numbered is None:
                    continue
                claim = numbered.group(1).strip()
            if claim:
                claims.append(claim)
        return claims

    def _normalize(self, text: str) -> str:
        """Normalize text for consistency comparison.

        Lower-cases, removes every character that is neither a word character
        nor whitespace, collapses whitespace runs to one space, and trims.
        """
        text = text.lower()
        text = re.sub(r"[^\w\s]", "", text)
        text = re.sub(r"\s+", " ", text)
        return text.strip()
```
**Failure Modes:**
- The grounding evaluator may hallucinate a verification when the image is ambiguous. Use conservative thresholds.
- Consistency false negatives occur when a query legitimately admits multiple interpretations, or when equivalent answers differ only in wording. Compare answers by semantic equivalence rather than by exact format.
- Incomplete evaluation when answer omits commonly tested visual elements. Compare against ground truth element list.