15. CI/CD for RAG
Chapter 15 of 18 · 20 min
Automating RAG evaluation in continuous integration catches regressions before deployment. Pipeline failures triggered by metric degradation create accountability and prevent quality drift across code changes.
GitHub Actions Pipeline
# Workflow: run the RAG test suites and batch evaluation, enforce metric
# thresholds, and expose a single "gate" job that fails unless evaluation
# succeeded.
name: RAG Evaluation Pipeline

on:
  push:
    paths:
      - 'rag/**'
      - '.github/workflows/rag_eval.yml'
  pull_request:
    branches: [main]

jobs:
  evaluate:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install dependencies
        # `datasets` is imported directly by the tests, so install it
        # explicitly instead of relying on it arriving transitively.
        run: |
          pip install langchain ragas datasets pandas pytest
      - name: Run evaluation
        run: |
          pytest tests/test_retrieval.py -v
          pytest tests/test_generation.py -v
          python scripts/batch_evaluate.py \
            --dataset tests/test_data.jsonl \
            --output test_results.json
      - name: Check thresholds
        run: |
          python scripts/check_thresholds.py \
            --results test_results.json \
            --min-context-precision 0.80 \
            --min-answer-relevance 0.75
      - name: Upload results
        # Upload even when an earlier step failed: the results file is most
        # useful for debugging precisely when a threshold was violated.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: eval-results
          path: test_results.json

  gate:
    needs: evaluate
    # Without `always()` this job is skipped whenever `evaluate` fails, so the
    # blocking check below would never run in the one case it exists for.
    if: always()
    runs-on: ubuntu-latest
    steps:
      - name: Blocking check
        # Anything other than "success" (failure, cancelled, skipped) blocks.
        run: |
          if [ "${{ needs.evaluate.result }}" != "success" ]; then
            echo "Evaluation failed: blocking deployment"
            exit 1
          fi
Semantic Chunking and Retrieval Testing
import pytest
from ragas.metrics import (
answer_relevancy,
faithfulness,
context_precision
)
from ragas import evaluate
from datasets import Dataset
class TestRetrievalPipeline:
    """Test suite for the retrieval component.

    The `retrieval_pipeline` and `generator` fixtures are not defined in this
    class; they are expected to be provided elsewhere (e.g. a conftest.py).
    """

    @pytest.fixture
    def eval_dataset(self):
        """Load the evaluation examples from the JSON Lines test file."""
        # `datasets.Dataset` has no `from_jsonl`; `from_json` reads JSON Lines.
        return Dataset.from_json("tests/test_data.jsonl")

    def test_top_k_retrieval_recall(
        self,
        eval_dataset,
        retrieval_pipeline
    ):
        """At least 80% of ground truth contexts should appear in top-5."""
        recall_scores = []
        for example in eval_dataset:
            retrieved = retrieval_pipeline(
                example["query"],
                top_k=5
            )
            retrieved_ids = {doc["id"] for doc in retrieved}
            ground_truth_ids = set(example["relevant_doc_ids"])
            # Examples without ground truth cannot contribute a recall value.
            if ground_truth_ids:
                recall = len(retrieved_ids & ground_truth_ids) / len(ground_truth_ids)
                recall_scores.append(recall)

        # Fail with a clear message instead of a ZeroDivisionError when the
        # dataset has no example with ground-truth document IDs.
        assert recall_scores, "No examples with ground-truth document IDs to evaluate"
        avg_recall = sum(recall_scores) / len(recall_scores)
        assert avg_recall >= 0.80, f"Recall {avg_recall:.2f} below threshold"

    def test_context_precision(
        self,
        eval_dataset,
        retrieval_pipeline,
        generator
    ):
        """Top retrieved chunks should be the most relevant."""
        # NOTE(review): `retrieval_pipeline` and `generator` are requested but
        # not used, so this scores the contexts stored in the dataset rather
        # than fresh pipeline output -- confirm that is intended.
        # NOTE(review): assumes the dataset carries the columns ragas needs
        # for context_precision -- confirm against tests/test_data.jsonl.
        # ragas `evaluate` takes the Dataset itself, not a pandas DataFrame.
        scores = evaluate(
            eval_dataset,
            metrics=[context_precision],
            raise_exceptions=False
        )
        # Per-row scores live in the result's DataFrame view; rows that failed
        # (NaN under raise_exceptions=False) are skipped by pandas' mean().
        avg_precision = scores.to_pandas()["context_precision"].mean()
        assert avg_precision >= 0.75, \
            f"Context precision {avg_precision:.2f} below threshold"
class TestGenerationPipeline:
    """Test suite for the generation component.

    The `rag_pipeline` fixture is not defined in this class; it is expected to
    be provided elsewhere (e.g. a conftest.py).
    """

    @pytest.fixture
    def eval_dataset(self):
        """Load the evaluation examples from the JSON Lines test file."""
        # Fixtures defined as methods of another test class are not visible
        # here, so this class needs its own `eval_dataset`.
        return Dataset.from_json("tests/test_data.jsonl")

    def test_faithfulness_threshold(
        self,
        eval_dataset,
        rag_pipeline
    ):
        """Generated answers should not contradict retrieved context."""
        # NOTE(review): `rag_pipeline` is requested but not used, so this
        # scores the answers stored in the dataset rather than fresh pipeline
        # output -- confirm that is intended.
        # ragas `evaluate` takes the Dataset itself, not a pandas DataFrame.
        scores = evaluate(
            eval_dataset,
            metrics=[faithfulness],
            raise_exceptions=False
        )
        # Use the per-row scores so `.min()` is the worst single example, not
        # an aggregate value.
        min_faithfulness = scores.to_pandas()["faithfulness"].min()
        assert min_faithfulness >= 0.70, \
            f"Faithfulness score {min_faithfulness:.2f} below threshold"
Threshold Configuration
# scripts/check_thresholds.py
import json
import sys
# Minimum acceptable score per metric; callers may override individual entries.
DEFAULT_THRESHOLDS = {
    "context_precision": 0.80,
    "context_recall": 0.85,
    "answer_relevancy": 0.75,
    "faithfulness": 0.70,
    "answer_correctness": 0.80,
}


def check_thresholds(results_file: str, **overrides):
    """Validate evaluation results against minimum thresholds, then exit.

    Args:
        results_file: Path to a JSON file, read as a mapping of metric name
            to score.
        **overrides: Per-metric minimums that replace entries of
            DEFAULT_THRESHOLDS. Overrides whose value is None are ignored so
            that unset command-line options fall back to the defaults.

    Raises:
        SystemExit: Always. Exit code 0 when every checked metric meets its
            threshold; 1 when any score is too low or not a valid number, or
            when none of the thresholded metrics appear in the results.
    """
    import math  # local import keeps the file's top-level imports untouched

    with open(results_file, encoding="utf-8") as results_fh:
        results = json.load(results_fh)

    active_overrides = {
        name: minimum for name, minimum in overrides.items() if minimum is not None
    }
    thresholds = {**DEFAULT_THRESHOLDS, **active_overrides}

    failures = []
    skipped = []
    checked = 0
    for metric, threshold in thresholds.items():
        if metric not in results:
            # Results may legitimately cover only a subset of the metrics.
            skipped.append(metric)
            continue
        checked += 1
        score = results[metric]
        is_number = isinstance(score, (int, float)) and not isinstance(score, bool)
        if not is_number or math.isnan(score):
            # NaN compares False against everything, so without this check a
            # failed evaluation would slip through the gate.
            failures.append(f"{metric}: invalid score {score!r}")
        elif score < threshold:
            failures.append(f"{metric}: {score:.3f} < {threshold}")

    if skipped:
        print(f"Metrics not present in results (skipped): {', '.join(skipped)}")
    if checked == 0:
        # A gate that checked nothing must not report success.
        failures.append("no thresholded metrics found in results")

    if failures:
        print("Threshold violations:")
        for failure in failures:
            print(f"  - {failure}")
        sys.exit(1)
    print("All thresholds passed")
    sys.exit(0)


def main(argv=None):
    """Command-line entry point.

    Parses `--results`, `--min-context-precision` and `--min-answer-relevance`
    and forwards them to check_thresholds.

    Args:
        argv: Argument list to parse; None lets argparse read sys.argv[1:].
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Fail when evaluation metrics fall below their thresholds."
    )
    parser.add_argument("--results", required=True,
                        help="Path to the JSON results file")
    parser.add_argument("--min-context-precision", type=float, default=None,
                        help="Override the context_precision minimum")
    parser.add_argument("--min-answer-relevance", type=float, default=None,
                        help="Override the answer_relevancy minimum")
    args = parser.parse_args(argv)
    check_thresholds(
        args.results,
        context_precision=args.min_context_precision,
        answer_relevancy=args.min_answer_relevance,
    )


if __name__ == "__main__":
    main()
EXERCISE
Implement the GitHub Actions pipeline for an existing RAG project. Define realistic thresholds based on your current baseline metrics, and verify the pipeline catches at least one known regression scenario.