Reward Model Evaluation — RLHF, DPO, and PPO (Chapter 8)

Evaluating reward models is tricky because you don't have ground-truth preferences—only more noisy approximations. Standard classification metrics (accuracy, F1) don't capture what you care about: whether high-reward responses are actually better than low-reward ones.

Kendall's Tau correlation: Measures rank correlation between predicted rewards and true preferences. A tau of 0.5 means the model correctly orders half of all possible pairs.

from scipy.stats import kendalltau
import numpy as np

def evaluate_reward_model(reward_model, test_pairs):
    """
    test_pairs: list of {"prompt": str, "chosen": str, "rejected": str}
    """
    predicted_rewards_chosen = []
    predicted_rewards_rejected = []
    
    for pair in test_pairs:
        # Tokenize and get rewards
        chosen_reward = reward_model(pair["prompt"], pair["chosen"])
        rejected_reward = reward_model(pair["prompt"], pair["rejected"])
        predicted_rewards_chosen.append(chosen_reward)
        predicted_rewards_rejected.append(rejected_reward)
    
    # Calculate accuracy: how often is chosen > rejected?
    accuracy = np.mean([
        c > r for c, r in zip(predicted_rewards_chosen, predicted_rewards_rejected)
    ])
    
    # Kendall's Tau: rank correlation
    # Create binary labels (1 if chosen > rejected, 0 otherwise)
    # Perfect model would have tau = 1.0
    all_rewards = predicted_rewards_chosen + predicted_rewards_rejected
    labels = [1] * len(predicted_rewards_chosen) + [0] * len(predicted_rewards_rejected)
    tau, p_value = kendalltau(all_rewards, labels)
    
    return {"accuracy": accuracy, "kendall_tau": tau, "p_value": p_value}

Reward model tests for systematic biases:

def test_length_bias(reward_model, base_prompt):
    """Test if reward model prefers longer/shorter responses."""
    short_response = "The answer is 42."
    long_response = "The answer is 42. This is based on deep analysis of the problem. " * 5
    
    short_reward = reward_model(base_prompt, short_response)
    long_reward = reward_model(base_prompt, long_response)
    
    print(f"Short response reward: {short_reward:.3f}")
    print(f"Long response reward: {long_reward:.3f}")
    print(f"Length bias: {'Present' if abs(short_reward - long_reward) > 0.1 else 'Minimal'}")

def test_sybil_attack(reward_model, prompt, good_response, injected_response):
    """Test if reward model is fooled by sybil attacks (flattery, manipulation)."""
    # A good response followed by sycophantic flattery
    sycophant_response = good_response + " You are absolutely right, and your intelligence is remarkable."
    
    good_reward = reward_model(prompt, good_response)
    sycophant_reward = reward_model(prompt, sycophant_response)
    
    print(f"Good response reward: {good_reward:.3f}")
    print(f"Sycophant response reward: {sycophant_reward:.3f}")
    print(f"Sycophancy detected: {'Yes' if sycophant_reward > good_reward else 'No'}")

Human preference prediction: The ultimate test is whether the reward model's preferences match human preferences. Sample responses at the extremes of the reward distribution and have humans evaluate them.