17. Evaluation Metrics
Chapter 17 of 18 · 20 min
Rigorous evaluation separates working NLP systems from wishful thinking. Different tasks require different metrics, and understanding metric properties guides both development and reporting.
Text Generation Metrics
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModel
import torch
import numpy as np
from collections import Counter
class NLPEvaluator:
    """Lightweight reference implementations of text-generation metrics.

    All lexical metrics (BLEU, ROUGE) lowercase their inputs and tokenize on
    whitespace. BERTScore lazily loads a Hugging Face model on first use.
    """

    def __init__(self):
        # Populated lazily by bertscore() so constructing the evaluator is cheap.
        self.bertscore_model = None
        self.bertscore_tokenizer = None
        self._bertscore_model_name = None

    def bleu_score(self, reference: str, hypothesis: str, n: int = 4) -> float:
        """Compute a sentence-level BLEU score against a single reference.

        Args:
            reference: Gold text.
            hypothesis: System output to score.
            n: Highest n-gram order included in the geometric mean.

        Returns:
            Brevity penalty times the geometric mean of the clipped n-gram
            precisions for orders 1..n, in the range [0, 1]. Returns 0.0 when
            either text is empty, when ``n < 1``, or when no n-gram of any
            order matches.
        """
        ref_tokens = reference.lower().split()
        hyp_tokens = hypothesis.lower().split()
        if not ref_tokens or not hyp_tokens:
            return 0.0

        precisions = []
        for order in range(1, n + 1):
            ref_counts = Counter(self._ngrams(ref_tokens, order))
            hyp_counts = Counter(self._ngrams(hyp_tokens, order))
            # Counter intersection keeps min(count) per n-gram, i.e. clipped matches.
            matched = sum((ref_counts & hyp_counts).values())
            total = sum(hyp_counts.values())
            precisions.append(matched / total if total else 0.0)

        if not precisions or all(p == 0 for p in precisions):
            return 0.0

        # Geometric mean in log space. Zero precisions are floored (not shifted)
        # so that a perfect match yields exactly 1.0 rather than slightly more.
        floor = 1e-10
        mean_log_precision = sum(np.log(max(p, floor)) for p in precisions) / len(precisions)
        precision_score = np.exp(mean_log_precision)

        # Brevity penalty: only hypotheses shorter than the reference are penalised.
        ref_len = len(ref_tokens)
        hyp_len = len(hyp_tokens)
        brevity_penalty = np.exp(1 - ref_len / hyp_len) if hyp_len < ref_len else 1.0

        return float(brevity_penalty * precision_score)

    def _ngrams(self, tokens: list[str], n: int) -> list[tuple]:
        """Return every contiguous n-gram of ``tokens`` as a hashable tuple.

        Yields an empty list when ``tokens`` has fewer than ``n`` items.
        """
        # Each n-gram must be a tuple (not a list slice) so Counter can hash it.
        return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

    @staticmethod
    def _lcs_length(ref: list[str], hyp: list[str]) -> int:
        """Return the length of the longest common subsequence of two token lists."""
        previous = [0] * (len(hyp) + 1)
        for ref_token in ref:
            current = [0]
            for j, hyp_token in enumerate(hyp, start=1):
                if ref_token == hyp_token:
                    current.append(previous[j - 1] + 1)
                else:
                    current.append(max(previous[j], current[j - 1]))
            previous = current
        return previous[-1]

    def rouge_scores(self, reference: str, hypothesis: str) -> dict:
        """Compute ROUGE-1, ROUGE-2, and ROUGE-L recall.

        ROUGE is recall-oriented: each score is the amount of overlap divided
        by the size of the *reference* (n-gram count for ROUGE-N, token count
        for ROUGE-L).

        Args:
            reference: Gold text.
            hypothesis: System output to score.

        Returns:
            Dict with float values under the keys ``"rouge_1"``, ``"rouge_2"``
            and ``"rouge_l"``. A score is 0.0 when the reference has no
            n-grams of that order or the hypothesis is empty.
        """
        ref_tokens = reference.lower().split()
        hyp_tokens = hypothesis.lower().split()

        def rouge_n(order: int) -> float:
            ref_counts = Counter(self._ngrams(ref_tokens, order))
            hyp_counts = Counter(self._ngrams(hyp_tokens, order))
            ref_total = sum(ref_counts.values())
            if ref_total == 0:
                return 0.0
            return sum((ref_counts & hyp_counts).values()) / ref_total

        def rouge_l() -> float:
            if not ref_tokens or not hyp_tokens:
                return 0.0
            return self._lcs_length(ref_tokens, hyp_tokens) / len(ref_tokens)

        return {
            "rouge_1": rouge_n(1),
            "rouge_2": rouge_n(2),
            "rouge_l": rouge_l(),
        }

    async def bertscore(self, reference: str, hypothesis: str,
                        model_name: str = "roberta-large") -> float:
        """Compute BERTScore F1 using contextual token embeddings.

        Each text is encoded separately (truncated to 512 tokens). Token
        vectors are L2-normalised so their dot products are cosine
        similarities, then greedily matched in both directions.

        Args:
            reference: Gold text.
            hypothesis: System output to score.
            model_name: Hugging Face model identifier. The model and tokenizer
                are cached on the instance and reloaded when a different name
                is requested.

        Returns:
            Harmonic mean of greedy-matching precision and recall.
        """
        loaded_name = getattr(self, "_bertscore_model_name", None)
        needs_load = (
            self.bertscore_model is None
            or getattr(self, "bertscore_tokenizer", None) is None
            or (loaded_name is not None and loaded_name != model_name)
        )
        if needs_load:
            self.bertscore_model = AutoModel.from_pretrained(model_name)
            self.bertscore_tokenizer = AutoTokenizer.from_pretrained(model_name)
            self._bertscore_model_name = model_name

        ref_inputs = self.bertscore_tokenizer(reference, return_tensors="pt",
                                              truncation=True, max_length=512)
        hyp_inputs = self.bertscore_tokenizer(hypothesis, return_tensors="pt",
                                              truncation=True, max_length=512)

        with torch.no_grad():
            # [0] drops the batch axis, leaving one row per token.
            ref_embeddings = self.bertscore_model(**ref_inputs).last_hidden_state[0]
            hyp_embeddings = self.bertscore_model(**hyp_inputs).last_hidden_state[0]

        # Normalise each token vector along the hidden axis; normalising along
        # the token axis would not produce cosine similarities.
        ref_unit = ref_embeddings / ref_embeddings.norm(dim=-1, keepdim=True).clamp_min(1e-12)
        hyp_unit = hyp_embeddings / hyp_embeddings.norm(dim=-1, keepdim=True).clamp_min(1e-12)
        similarity = torch.mm(ref_unit, hyp_unit.T)  # rows: reference, cols: hypothesis

        # Greedy matching: recall takes the best hypothesis match for each
        # reference token; precision takes the best reference match for each
        # hypothesis token.
        recall = similarity.max(dim=1)[0].mean().item()
        precision = similarity.max(dim=0)[0].mean().item()

        return 2 * precision * recall / (precision + recall + 1e-10)
Classification Metrics
class ClassificationEvaluator:
    """Confusion-matrix and precision/recall/F1 helpers for single-label classification."""

    @staticmethod
    def confusion_matrix(y_true: list, y_pred: list, labels: list[str] = None) -> dict:
        """Compute a confusion matrix.

        Args:
            y_true: Gold labels.
            y_pred: Predicted labels, aligned with ``y_true``.
            labels: Optional explicit label order. When omitted (or empty) the
                sorted union of labels seen in ``y_true`` and ``y_pred`` is used.

        Returns:
            Dict with ``"matrix"`` (rows indexed by true label, columns by
            predicted label), ``"labels"`` (the label order used) and
            ``"dimensions"`` (number of labels).

        Raises:
            ValueError: If ``y_true`` and ``y_pred`` differ in length.
            KeyError: If ``labels`` is given and a sample uses a label not in it.
        """
        if len(y_true) != len(y_pred):
            raise ValueError(
                f"y_true and y_pred must have the same length "
                f"({len(y_true)} != {len(y_pred)})"
            )

        label_set = list(labels) if labels else sorted(set(y_true) | set(y_pred))
        label_to_idx = {label: idx for idx, label in enumerate(label_set)}
        n = len(label_set)

        matrix = [[0] * n for _ in range(n)]
        for true, pred in zip(y_true, y_pred):
            matrix[label_to_idx[true]][label_to_idx[pred]] += 1

        return {
            "matrix": matrix,
            "labels": label_set,
            "dimensions": n
        }

    @staticmethod
    def precision_recall_f1(y_true: list, y_pred: list, average: str = "macro") -> dict:
        """Compute per-class and averaged precision, recall, and F1.

        Args:
            y_true: Gold labels.
            y_pred: Predicted labels, aligned with ``y_true``.
            average: ``"macro"`` (unweighted mean over classes), ``"weighted"``
                (mean over classes weighted by support) or ``"micro"`` (from
                counts pooled over all classes). Any other value returns only
                the per-class breakdown.

        Returns:
            Dict with ``"per_class"`` mapping each label to its ``precision``,
            ``recall``, ``f1`` and ``support``; plus top-level ``"precision"``,
            ``"recall"`` and ``"f1"`` when a supported ``average`` is given.
            Undefined ratios (zero denominator, including empty input) are
            reported as 0.

        Raises:
            ValueError: If ``y_true`` and ``y_pred`` differ in length.
        """
        if len(y_true) != len(y_pred):
            raise ValueError(
                f"y_true and y_pred must have the same length "
                f"({len(y_true)} != {len(y_pred)})"
            )

        def ratio(numerator, denominator):
            return numerator / denominator if denominator > 0 else 0

        def harmonic(precision, recall):
            return ratio(2 * precision * recall, precision + recall)

        label_set = sorted(set(y_true) | set(y_pred))

        # Single pass over the data instead of three scans per label.
        support = Counter(y_true)
        true_pos = Counter()
        false_pos = Counter()
        for true, pred in zip(y_true, y_pred):
            if true == pred:
                true_pos[true] += 1
            else:
                false_pos[pred] += 1

        per_class = {}
        for label in label_set:
            tp = true_pos[label]
            fp = false_pos[label]
            # Every gold instance of a label is either a hit or a miss.
            fn = support[label] - tp
            precision = ratio(tp, tp + fp)
            recall = ratio(tp, tp + fn)
            per_class[label] = {
                "precision": precision,
                "recall": recall,
                "f1": harmonic(precision, recall),
                "support": support[label]
            }

        metric_names = ("precision", "recall", "f1")

        if average == "macro":
            n = len(label_set)
            summary = {
                name: (sum(stats[name] for stats in per_class.values()) / n if n else 0.0)
                for name in metric_names
            }
        elif average == "weighted":
            total_support = sum(stats["support"] for stats in per_class.values())
            summary = {
                name: (
                    sum(stats[name] * stats["support"] for stats in per_class.values())
                    / total_support
                    if total_support else 0.0
                )
                for name in metric_names
            }
        elif average == "micro":
            total_tp = sum(true_pos.values())
            total_fp = sum(false_pos.values())
            total_fn = len(y_true) - total_tp
            micro_precision = ratio(total_tp, total_tp + total_fp)
            micro_recall = ratio(total_tp, total_tp + total_fn)
            summary = {
                "precision": micro_precision,
                "recall": micro_recall,
                "f1": harmonic(micro_precision, micro_recall)
            }
        else:
            return {"per_class": per_class}

        summary["per_class"] = per_class
        return summary
EXERCISE
Implement a thorough evaluation dashboard that tracks metrics over model versions. Include statistical significance testing to determine whether performance differences between versions are meaningful.