Evaluation and Testing Patterns
Testing LLM applications differs from traditional software testing. You can’t just check that the output equals a specific string—LLM outputs vary. This lesson covers evaluating quality, building golden datasets, regression testing, and using models as judges to automate evaluation.
The Challenge of Testing LLMs
Traditional testing is deterministic. Same input always produces same output. LLMs are non-deterministic—same prompt might produce different responses at temperature > 0.
This makes testing hard:
- You can’t use simple equality checks
- Some variation is acceptable
- Quality is subjective
- You need multiple test cases to catch issues
Golden Datasets
Start with a golden dataset: carefully curated input/output pairs representing ideal behavior.
import re
from dataclasses import dataclass
from typing import Optional
@dataclass
class TestCase:
    """A single test case with expected output.

    NOTE(review): the field name `input` shadows the builtin of the same
    name; harmless here since it is only used as a field/keyword, but the
    builtin is unavailable under that name inside this class body.
    """
    # Prompt sent to the model under test.
    input: str
    # Reference answer describing ideal behavior (a quality yardstick for
    # the judge, not an exact-match target).
    expected_output: str
    # Free-form grouping label, e.g. "science" or "technology".
    category: str
    # Acceptable response length bounds, in characters.
    min_length: int = 10
    max_length: int = 500
    # Keywords a passing response must contain; None means no requirement.
    required_keywords: Optional[list[str]] = None
class GoldenDataset:
    """Manage a golden dataset of curated test cases.

    Supports incremental building (add_test_case / add_batch), category
    filtering, and JSON round-tripping via save() / load().
    """

    def __init__(self):
        # All registered test cases, in insertion order.
        self.test_cases: list["TestCase"] = []

    def add_test_case(self, test_case: "TestCase"):
        """Add a single test case."""
        self.test_cases.append(test_case)

    def add_batch(self, test_cases: list["TestCase"]):
        """Add multiple test cases at once."""
        self.test_cases.extend(test_cases)

    def get_by_category(self, category: str) -> list["TestCase"]:
        """Return all test cases whose category matches exactly."""
        return [tc for tc in self.test_cases if tc.category == category]

    def save(self, filepath: str):
        """Serialize the dataset to a JSON file (UTF-8, indented)."""
        import json
        from dataclasses import asdict

        # asdict() mirrors the TestCase fields automatically, so adding a
        # field to the dataclass cannot silently desync the saved format
        # (the previous hand-written dict literal could).
        data = [asdict(tc) for tc in self.test_cases]
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)

    def load(self, filepath: str):
        """Replace the current test cases with those from a JSON file."""
        import json
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        self.test_cases = [TestCase(**item) for item in data]
# Build a small golden dataset with one case per category.
dataset = GoldenDataset()

science_case = TestCase(
    input="What is photosynthesis?",
    expected_output="Process where plants convert light into chemical energy",
    category="science",
    required_keywords=["plants", "light", "energy"],
)
technology_case = TestCase(
    input="Explain machine learning",
    expected_output="Subset of AI where systems learn from data",
    category="technology",
    required_keywords=["learning", "data"],
)
entertainment_case = TestCase(
    input="Tell a joke",
    expected_output="Humorous statement",
    category="entertainment",
    min_length=20,
)

dataset.add_batch([science_case, technology_case, entertainment_case])
print(f"Dataset has {len(dataset.test_cases)} test cases")
LLM-as-Judge Evaluation
Use a model to evaluate another model’s output. This automates quality assessment:
from openai import OpenAI
class LLMJudge:
    """Use an LLM to judge quality of another LLM's output.

    The judge model is instructed to reply in the form
    ``SCORE: [1-5] REASON: [explanation]``; the reply is parsed with a
    regex so that single-line and multi-line replies both work.
    """

    def __init__(self, judge_model: str = "gpt-4-turbo"):
        self.client = OpenAI()
        self.judge_model = judge_model

    @staticmethod
    def _parse_score(response_text: str) -> tuple[int, str]:
        """Extract (score, reason) from a judge reply.

        Returns (0, "Could not parse response") when no score is found.
        Bug fix: the previous line-based parse did
        ``int(line.split(':')[1])``, which raised ValueError whenever the
        judge answered on one line ("SCORE: 4 REASON: ...") — the exact
        format the system prompt asks for — so valid scores became 0.
        """
        score_match = re.search(r"SCORE:\s*\[?([1-5])\]?", response_text)
        if not score_match:
            return 0, "Could not parse response"
        score = int(score_match.group(1))
        reason_match = re.search(r"REASON:\s*(.*)", response_text, re.DOTALL)
        reason = reason_match.group(1).strip() if reason_match else ""
        return score, reason

    def score_output(self, prompt: str, output: str, rubric: str) -> tuple[int, str]:
        """Score output on a 1-5 scale using a rubric."""
        response = self.client.chat.completions.create(
            model=self.judge_model,
            messages=[
                {
                    "role": "system",
                    "content": f"""You are an expert evaluator. Score the response 1-5.
Rubric: {rubric}
Return: SCORE: [1-5] REASON: [explanation]"""
                },
                {
                    "role": "user",
                    "content": f"Prompt: {prompt}\n\nResponse: {output}"
                }
            ],
            temperature=0  # deterministic judging
        )
        return self._parse_score(response.choices[0].message.content)

    def evaluate_test_case(self, test_case: "TestCase", output: str) -> dict:
        """Judge `output` against a test case; passing means score >= 3."""
        rubric = f"""
- Does it answer the question? (required)
- Does it match expected quality level? (required)
- Required keywords present: {test_case.required_keywords}
- Length between {test_case.min_length}-{test_case.max_length} characters
"""
        score, reason = self.score_output(test_case.input, output, rubric)
        return {
            "input": test_case.input,
            "output": output,
            "expected": test_case.expected_output,
            "score": score,
            "reason": reason,
            "passed": score >= 3
        }
# Usage: judge one hand-written answer against a test case.
judge = LLMJudge()
test_case = TestCase(
    input="What is AI?",
    expected_output="Field of computer science focused on intelligent systems",
    category="tech",
)
actual_output = "AI means artificial intelligence and is used for many things"

result = judge.evaluate_test_case(test_case, actual_output)
print("Score: {}/5".format(result["score"]))
print("Reason: {}".format(result["reason"]))
print("Passed: {}".format(result["passed"]))
Regression Testing
Track how model quality changes over time. Detect regressions (degradation):
from datetime import datetime
class RegressionTest:
    """Track test performance over time and flag regressions.

    The thresholds that define a regression are configurable; defaults
    match the previously hard-coded values (0.5 score drop, 10% pass-rate
    drop), so existing callers are unaffected.
    """

    def __init__(self, score_drop_threshold: float = 0.5,
                 pass_rate_drop_threshold: float = 0.1):
        # Chronological list of summaries produced by run_tests().
        self.runs: list[dict] = []
        # Drops larger than these flag a regression in detect_regression().
        self.score_drop_threshold = score_drop_threshold
        self.pass_rate_drop_threshold = pass_rate_drop_threshold

    def run_tests(self, test_cases: list["TestCase"], model_func) -> dict:
        """Run every test through model_func, judge the outputs, and record a summary.

        model_func: callable taking the prompt string and returning the
        model's output string. A failing call is recorded as a zero-score
        result rather than aborting the run.
        """
        judge = LLMJudge()
        results = []
        scores = []
        for test_case in test_cases:
            try:
                output = model_func(test_case.input)
                result = judge.evaluate_test_case(test_case, output)
                results.append(result)
                scores.append(result['score'])
            except Exception as e:
                results.append({
                    "input": test_case.input,
                    "error": str(e),
                    "score": 0
                })
                scores.append(0)
        avg_score = sum(scores) / len(scores) if scores else 0
        # Pass = score >= 3, mirroring LLMJudge.evaluate_test_case.
        pass_rate = sum(1 for s in scores if s >= 3) / len(scores) if scores else 0
        run = {
            "timestamp": datetime.now().isoformat(),
            "results": results,
            "avg_score": avg_score,
            "pass_rate": pass_rate,
            "total_tests": len(test_cases)
        }
        self.runs.append(run)
        return run

    def detect_regression(self) -> Optional[str]:
        """Compare the two most recent runs; return a message if quality dropped.

        Returns None when there are fewer than two runs or no regression.
        """
        if len(self.runs) < 2:
            return None
        latest = self.runs[-1]
        previous = self.runs[-2]
        avg_diff = latest["avg_score"] - previous["avg_score"]
        rate_diff = latest["pass_rate"] - previous["pass_rate"]
        if avg_diff < -self.score_drop_threshold:
            return f"Quality regressed: avg score dropped {abs(avg_diff):.2f}"
        if rate_diff < -self.pass_rate_drop_threshold:
            return f"Pass rate regressed: {rate_diff*100:.1f}% fewer tests passing"
        return None

    def get_history(self) -> list[dict]:
        """Summaries (timestamp, avg score, pass rate) of all recorded runs."""
        return [
            {
                "timestamp": run["timestamp"],
                "avg_score": run["avg_score"],
                "pass_rate": run["pass_rate"]
            }
            for run in self.runs
        ]
# Usage: two identical runs, then check whether quality regressed between them.
tester = RegressionTest()

def mock_model(prompt: str) -> str:
    # Deterministic stand-in for a real model call.
    return f"Response to: {prompt}"

# Run 1
test_cases = [TestCase(f"Q{i}", "Expected", f"cat{i}") for i in (1, 2)]
run1 = tester.run_tests(test_cases, mock_model)
print(f"Run 1 - Avg Score: {run1['avg_score']:.2f}, Pass Rate: {run1['pass_rate']:.1%}")

# Run 2
run2 = tester.run_tests(test_cases, mock_model)
regression = tester.detect_regression()
if regression:
    print(f"⚠️ {regression}")
A/B Testing
Compare two models or prompts on the same test set:
class ABTest:
    """Compare two models/prompts on the same test set."""

    def __init__(self):
        # Parallel result lists: a_results[i] and b_results[i] come from
        # the same test case, which makes head-to-head win counting valid.
        self.a_results = []
        self.b_results = []
        self.judge = LLMJudge()

    def _run_single(self, model_func, test_case: "TestCase", results: list):
        """Run one model on one case and append the judged result.

        Errors are recorded as zero-score entries so one failing call does
        not abort the whole comparison. (Factored out: the original `run`
        duplicated this try/except verbatim for A and B.)
        """
        try:
            output = model_func(test_case.input)
            results.append(self.judge.evaluate_test_case(test_case, output))
        except Exception as e:
            results.append({"error": str(e), "score": 0})

    def run(self, test_cases: list["TestCase"], model_a, model_b):
        """Run both models against test cases."""
        for test_case in test_cases:
            self._run_single(model_a, test_case, self.a_results)
            self._run_single(model_b, test_case, self.b_results)

    def get_comparison(self) -> dict:
        """Aggregate scores: per-model averages, head-to-head wins, winner."""
        scores_a = [r.get("score", 0) for r in self.a_results]
        scores_b = [r.get("score", 0) for r in self.b_results]
        avg_a = sum(scores_a) / len(scores_a) if scores_a else 0
        avg_b = sum(scores_b) / len(scores_b) if scores_b else 0
        wins_a = sum(1 for a, b in zip(scores_a, scores_b) if a > b)
        wins_b = sum(1 for a, b in zip(scores_a, scores_b) if b > a)
        ties = sum(1 for a, b in zip(scores_a, scores_b) if a == b)
        return {
            "model_a_avg_score": avg_a,
            "model_b_avg_score": avg_b,
            "difference": avg_b - avg_a,
            "model_a_wins": wins_a,
            "model_b_wins": wins_b,
            "ties": ties,
            "winner": "B" if avg_b > avg_a else ("A" if avg_a > avg_b else "Tie")
        }
# Usage: A/B comparison of two stub models over a shared test set.
ab_test = ABTest()

def model_a(prompt):
    return f"Response A to: {prompt}"

def model_b(prompt):
    return f"Response B to: {prompt}"

test_cases = [TestCase(f"Q{i}", "Expected", "cat") for i in range(5)]
ab_test.run(test_cases, model_a, model_b)

comparison = ab_test.get_comparison()
print(f"Winner: {comparison['winner']}")
print(f"Margin: {comparison['difference']:.2f} points")
Human-in-the-Loop Evaluation
For nuanced quality judgments, involve humans:
class HumanEvaluationSession:
    """Manage human evaluation of model outputs."""

    def __init__(self, session_id: str):
        self.session_id = session_id
        # One dict per created task; "feedback" is filled in by
        # record_evaluation() once a human has rated the outputs.
        self.evaluations: list[dict] = []

    def create_evaluation_task(self, test_case: "TestCase",
                               outputs: list[tuple[str, str]]) -> dict:
        """Create and register a task for human evaluation.

        outputs: list of (model_name, output_text) pairs.
        """
        task = {
            "task_id": len(self.evaluations),
            "prompt": test_case.input,
            "category": test_case.category,
            "outputs": [
                {"model": name, "text": text}
                for name, text in outputs
            ],
            "feedback": None
        }
        # Bug fix: the task must actually be stored. Previously it was
        # only returned, so record_evaluation() (which bounds-checks
        # against self.evaluations) silently dropped every rating and
        # get_agreement_rate() always reported 0.
        self.evaluations.append(task)
        return task

    def record_evaluation(self, task_id: int, evaluation: dict):
        """Record a human's evaluation for an existing task (no-op if unknown id)."""
        # Also reject negative ids, which would otherwise index from the end.
        if 0 <= task_id < len(self.evaluations):
            self.evaluations[task_id]["feedback"] = evaluation

    def get_agreement_rate(self) -> float:
        """Fraction of tasks that have received feedback.

        Simplified proxy — NOT true inter-rater agreement. In practice,
        use Cohen's kappa or a similar statistic over multiple raters.
        """
        if not self.evaluations:
            return 0.0
        rated = sum(1 for e in self.evaluations if e.get("feedback"))
        return rated / len(self.evaluations)
# Usage: collect a human rating for two competing outputs.
session = HumanEvaluationSession("eval_run_001")
ml_case = TestCase("What is ML?", "Expected", "tech")
candidate_outputs = [
    ("GPT-3.5", "ML is a subset of AI"),
    ("GPT-4", "ML enables systems to learn from data"),
]

task = session.create_evaluation_task(ml_case, candidate_outputs)
print(f"Created evaluation task: {task['task_id']}")

# After a human rates the outputs:
session.record_evaluation(0, {
    "better_model": "GPT-4",
    "quality": 4,
    "notes": "More comprehensive explanation"
})
print(f"Agreement rate: {session.get_agreement_rate():.1%}")
Key Takeaway
Testing LLMs requires different approaches than traditional software. Build golden datasets with curated examples. Use LLMs as judges to automate evaluation. Track performance over time to catch regressions. A/B test competing approaches. Supplement with human evaluation for nuanced judgments. Combine multiple evaluation methods for robust quality assessment.
Exercises
- Golden dataset: Create a golden dataset for a specific task (e.g., summarization). Include 10+ test cases with expected outputs.
- LLM-as-judge: Build an LLM judge that scores outputs on a 1-5 scale. Test with multiple outputs.
- Regression testing: Run tests at two different times. Detect if quality regressed.
- A/B testing: Compare two models or prompt templates on the same test set. Identify which performs better.
- Hybrid evaluation: Combine automated scoring with human evaluation. Compare results.