CI/CD for RAG — RAG Evaluation and Metrics (Chapter 15)

Automating RAG evaluation in continuous integration catches regressions before deployment. Pipeline failures triggered by metric degradation create accountability and prevent quality drift across code changes.

GitHub Actions Pipeline

# Workflow that runs the RAG evaluation suite.
name: RAG Evaluation Pipeline

on:
  # Pushes trigger a run only when RAG code or this workflow file changes.
  push:
    paths:
      - "rag/**"
      - ".github/workflows/rag_eval.yml"
  # Pull requests targeting main trigger a run.
  pull_request:
    branches:
      - main

jobs:
  # Runs the retrieval/generation tests, the batch evaluation, and the
  # threshold check, then publishes the raw scores as a build artifact.
  evaluate:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          pip install langchain ragas pandas pytest

      - name: Run evaluation
        run: |
          pytest tests/test_retrieval.py -v
          pytest tests/test_generation.py -v
          python scripts/batch_evaluate.py \
            --dataset tests/test_data.jsonl \
            --output test_results.json

      - name: Check thresholds
        run: |
          python scripts/check_thresholds.py \
            --results test_results.json \
            --min-context-precision 0.80 \
            --min-answer-relevance 0.75

      - name: Upload results
        # Steps are skipped after a failure by default; upload regardless so
        # the scores behind a threshold violation can still be inspected.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: eval-results
          path: test_results.json

  # Fails whenever the evaluate job did not succeed, so the workflow can be
  # used as a blocking status check.
  gate:
    needs: evaluate
    # A job whose dependency failed is skipped by default, which would mean
    # the check below never runs. `always()` forces the gate to execute.
    if: always()
    runs-on: ubuntu-latest
    steps:
      - name: Blocking check
        run: |
          # Quote the expression so an empty value cannot break the test, and
          # block on any non-success result (failure, cancelled, skipped).
          if [ "${{ needs.evaluate.result }}" != "success" ]; then
            echo "Evaluation result: ${{ needs.evaluate.result }}"
            echo "Evaluation failed: blocking deployment"
            exit 1
          fi

Retrieval and Generation Testing

import pytest
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_precision
)
from ragas import evaluate
from datasets import Dataset

class TestRetrievalPipeline:
    """Test suite for the retrieval component.

    The ``retrieval_pipeline`` and ``generator`` fixtures are not defined in
    this class; they are presumably provided by a ``conftest.py`` (confirm).
    """

    @pytest.fixture
    def eval_dataset(self):
        """Load the evaluation examples from the JSON Lines test file."""
        # Hugging Face ``Dataset`` has no ``from_jsonl`` constructor;
        # ``from_json`` is the loader that reads JSON Lines files.
        return Dataset.from_json("tests/test_data.jsonl")

    def test_top_k_retrieval_recall(
        self,
        eval_dataset,
        retrieval_pipeline
    ):
        """At least 80% of ground truth contexts should appear in top-5."""
        recall_scores = []

        for example in eval_dataset:
            retrieved = retrieval_pipeline(
                example["query"],
                top_k=5
            )
            retrieved_ids = {doc["id"] for doc in retrieved}
            ground_truth_ids = set(example["relevant_doc_ids"])

            # Recall is undefined for examples without ground truth, so
            # those examples do not contribute to the average.
            if ground_truth_ids:
                recall = len(retrieved_ids & ground_truth_ids) / len(ground_truth_ids)
                recall_scores.append(recall)

        # Fail with a clear message instead of a ZeroDivisionError when the
        # dataset is empty or carries no ground-truth document IDs.
        assert recall_scores, "No examples with ground-truth document IDs to score"

        avg_recall = sum(recall_scores) / len(recall_scores)
        assert avg_recall >= 0.80, f"Recall {avg_recall:.2f} below threshold"

    def test_context_precision(
        self,
        eval_dataset,
        retrieval_pipeline,
        generator
    ):
        """Top retrieved chunks should be the most relevant."""
        # ``evaluate`` takes the dataset itself, not a pandas DataFrame.
        # NOTE(review): assumes the dataset already carries the columns the
        # metric requires -- confirm against the installed ragas version.
        result = evaluate(
            eval_dataset,
            metrics=[context_precision],
            raise_exceptions=False
        )

        # Average over the per-sample table rather than indexing the result
        # object, whose item type is not a pandas Series.
        avg_precision = result.to_pandas()["context_precision"].mean()
        assert avg_precision >= 0.75, \
            f"Context precision {avg_precision:.2f} below threshold"

class TestGenerationPipeline:
    """Test suite for the generation component.

    The ``rag_pipeline`` fixture is not defined in this class; it is
    presumably provided by a ``conftest.py`` (confirm).
    """

    @pytest.fixture
    def eval_dataset(self):
        """Load the evaluation examples from the JSON Lines test file."""
        # A fixture defined inside another test class is not visible here,
        # so this class needs its own copy of the dataset fixture.
        return Dataset.from_json("tests/test_data.jsonl")

    def test_faithfulness_threshold(
        self,
        eval_dataset,
        rag_pipeline
    ):
        """Generated answers should not contradict retrieved context."""
        # ``evaluate`` takes the dataset itself, not a pandas DataFrame.
        # NOTE(review): assumes the dataset already carries the columns the
        # metric requires -- confirm against the installed ragas version.
        result = evaluate(
            eval_dataset,
            metrics=[faithfulness],
            raise_exceptions=False
        )

        # The worst single answer must clear the bar, not just the average.
        # pandas skips NaN rows (samples whose scoring failed) when taking
        # the minimum; if every row is NaN the comparison below fails.
        min_faithfulness = result.to_pandas()["faithfulness"].min()
        assert min_faithfulness >= 0.70, \
            f"Faithfulness score {min_faithfulness:.2f} below threshold"

Threshold Configuration

# scripts/check_thresholds.py
import json
import sys

# Minimum acceptable score per metric; a run fails when any reported metric
# falls below its entry here (or below a caller-supplied override).
DEFAULT_THRESHOLDS = {
    "context_precision": 0.80,
    "context_recall": 0.85,
    "answer_relevancy": 0.75,
    "faithfulness": 0.70,
    "answer_correctness": 0.80
}


def check_thresholds(results_file: str, **overrides):
    """Validate evaluation results against thresholds and exit.

    Args:
        results_file: Path to a JSON file mapping metric names to scores.
        **overrides: Minimum scores keyed by metric name. Each replaces the
            matching default; a value of ``None`` is ignored so the default
            stays in effect.

    Raises:
        SystemExit: Always. The exit code is 1 when any reported metric is
            below its threshold or is not a usable number, and 0 otherwise.
    """
    import math  # Local import keeps this block self-contained.

    with open(results_file, encoding="utf-8") as results_fh:
        results = json.load(results_fh)

    # Drop unset overrides so a ``None`` never replaces a real threshold and
    # then blows up in the comparison below.
    active_overrides = {
        metric: value for metric, value in overrides.items() if value is not None
    }
    thresholds = {**DEFAULT_THRESHOLDS, **active_overrides}
    failures = []
    skipped = []

    for metric, threshold in thresholds.items():
        if metric not in results:
            skipped.append(metric)
            continue

        score = results[metric]
        # NaN compares False against everything, so an unscored metric would
        # otherwise slip through the gate; null/non-numeric values would
        # raise TypeError. Both are reported as violations instead.
        is_number = isinstance(score, (int, float))
        if not is_number or (isinstance(score, float) and math.isnan(score)):
            failures.append(f"{metric}: invalid score {score!r}")
        elif score < threshold:
            failures.append(f"{metric}: {score:.3f} < {threshold}")

    if skipped:
        # Missing metrics are still tolerated, but say so: a silent skip
        # looks identical to a pass in the CI log.
        print(
            "Metrics not present in results (not checked): " + ", ".join(skipped),
            file=sys.stderr,
        )

    if failures:
        print("Threshold violations:")
        for failure in failures:
            print(f"  - {failure}")
        sys.exit(1)

    print("All thresholds passed")
    sys.exit(0)