LLMOps: From Prototype to Production
LLMOps encompasses the practices, tools, and infrastructure needed to develop, deploy, and manage large language model applications at scale. Unlike traditional MLOps, LLMOps addresses unique challenges: prompt versioning, model selection, cost management, and behavioral consistency across versions.
From Prototype to Production
The journey from a notebook prototype to a production system involves multiple stages:
Stage 1: Experimentation
# Prototype: Ad-hoc exploration
import anthropic

client = anthropic.Anthropic()
response = client.messages.create(
    model="claude-3-5-sonnet-20241022",
    max_tokens=1024,
    messages=[
        {"role": "user", "content": "Summarize this document: ..."}
    ]
)
print(response.content[0].text)
Stage 2: Codification
Move from notebooks to modular, tested code:
from dataclasses import dataclass

import anthropic


@dataclass
class SummarizationConfig:
    """Configuration for summarization task."""
    model: str = "claude-3-5-sonnet-20241022"
    max_tokens: int = 1000
    temperature: float = 0.7
    system_prompt: str = "You are a helpful assistant that creates concise summaries."


class DocumentSummarizer:
    """Production-ready document summarization."""

    def __init__(self, config: SummarizationConfig):
        self.client = anthropic.Anthropic()
        self.config = config

    def summarize(self, document: str) -> str:
        """Summarize a document."""
        response = self.client.messages.create(
            model=self.config.model,
            max_tokens=self.config.max_tokens,
            temperature=self.config.temperature,
            system=self.config.system_prompt,
            messages=[
                {"role": "user", "content": f"Summarize: {document}"}
            ]
        )
        return response.content[0].text


# Usage
config = SummarizationConfig()
summarizer = DocumentSummarizer(config)
result = summarizer.summarize("Long document here...")
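The summarize call above has no failure handling, while production services usually retry transient API errors. A minimal sketch, assuming the SDK's RateLimitError and APIConnectionError exception types and an arbitrary exponential backoff (the summarize_with_retry helper is illustrative, not part of the class above):

import time


def summarize_with_retry(summarizer: DocumentSummarizer, document: str, max_retries: int = 3) -> str:
    """Retry transient API failures with simple exponential backoff (illustrative sketch)."""
    for attempt in range(max_retries):
        try:
            return summarizer.summarize(document)
        except (anthropic.RateLimitError, anthropic.APIConnectionError):
            if attempt == max_retries - 1:
                raise  # give up after the final attempt
            time.sleep(2 ** attempt)  # wait 1s, then 2s, between attempts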
Stage 3: Evaluation and Testing
from typing import List
from dataclasses import dataclass


@dataclass
class EvaluationResult:
    """Results of evaluation."""
    test_case: str
    output: str
    expected: str
    score: float


class SummarizationEvaluator:
    """Evaluate summarization quality."""

    def __init__(self, summarizer: DocumentSummarizer):
        self.summarizer = summarizer

    def evaluate_rouge(self, generated: str, reference: str) -> float:
        """Calculate ROUGE-1 F1 score."""
        from rouge_score import rouge_scorer
        scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
        scores = scorer.score(reference, generated)
        return scores["rouge1"].fmeasure

    def evaluate_dataset(self, test_cases: List[dict]) -> List[EvaluationResult]:
        """Evaluate on test dataset."""
        results = []
        for test in test_cases:
            output = self.summarizer.summarize(test["document"])
            score = self.evaluate_rouge(output, test["expected_summary"])
            results.append(EvaluationResult(
                test_case=test["id"],
                output=output,
                expected=test["expected_summary"],
                score=score
            ))
        return results

    def get_average_score(self, results: List[EvaluationResult]) -> float:
        """Calculate average score."""
        if not results:
            return 0.0
        return sum(r.score for r in results) / len(results)


# Test the system
test_data = [
    {
        "id": "test_1",
        "document": "Long text...",
        "expected_summary": "Brief summary..."
    }
]
evaluator = SummarizationEvaluator(summarizer)
results = evaluator.evaluate_dataset(test_data)
print(f"Average ROUGE score: {evaluator.get_average_score(results)}")
Prompt Versioning and Management
Prompts are code and should be versioned like any software artifact:
from datetime import datetime
from typing import Dict, List


class PromptVersion:
    """Represents a versioned prompt."""

    def __init__(self, version: str, content: str, metadata: Dict = None):
        self.version = version
        self.content = content
        self.metadata = metadata or {}
        self.created_at = datetime.utcnow().isoformat()
        self.performance_metrics = {}

    def to_dict(self) -> Dict:
        """Serialize to dictionary."""
        return {
            "version": self.version,
            "content": self.content,
            "metadata": self.metadata,
            "created_at": self.created_at,
            "performance_metrics": self.performance_metrics
        }


class PromptRegistry:
    """Central registry for prompts."""

    def __init__(self):
        self.prompts: Dict[str, List[PromptVersion]] = {}

    def register_prompt(self, name: str, prompt: PromptVersion):
        """Register a new prompt version."""
        if name not in self.prompts:
            self.prompts[name] = []
        self.prompts[name].append(prompt)
        print(f"Registered {name}@{prompt.version}")

    def get_prompt(self, name: str, version: str = "latest") -> PromptVersion:
        """Retrieve a prompt version."""
        if name not in self.prompts:
            raise ValueError(f"Prompt {name} not found")
        versions = self.prompts[name]
        if version == "latest":
            return versions[-1]
        for p in versions:
            if p.version == version:
                return p
        raise ValueError(f"Prompt version {name}@{version} not found")

    def compare_prompts(self, name: str, v1: str, v2: str) -> Dict:
        """Compare two prompt versions."""
        p1 = self.get_prompt(name, v1)
        p2 = self.get_prompt(name, v2)
        return {
            "v1": {"version": p1.version, "content": p1.content},
            "v2": {"version": p2.version, "content": p2.content},
            "created_v1": p1.created_at,
            "created_v2": p2.created_at
        }


# Usage
registry = PromptRegistry()
v1 = PromptVersion(
    version="1.0.0",
    content="Summarize the following: {document}",
    metadata={"author": "team_a"}
)
v2 = PromptVersion(
    version="1.1.0",
    content="Create a concise summary of the document below, focusing on key points:\n\n{document}",
    metadata={"author": "team_b", "improvement": "added focus instruction"}
)
registry.register_prompt("summarize", v1)
registry.register_prompt("summarize", v2)

# Use latest version
latest = registry.get_prompt("summarize")
print(latest.content)
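The registry above lives only in memory. A minimal persistence sketch (the save_registry/load_registry helpers and the prompts.json filename are illustrative assumptions, not part of the registry itself) writes every version to JSON so prompts can be reviewed and diffed alongside application code:

import json


def save_registry(registry: PromptRegistry, path: str = "prompts.json") -> None:
    """Serialize every registered prompt version to a JSON file."""
    payload = {
        name: [version.to_dict() for version in versions]
        for name, versions in registry.prompts.items()
    }
    with open(path, "w") as f:
        json.dump(payload, f, indent=2)


def load_registry(path: str = "prompts.json") -> PromptRegistry:
    """Rebuild a PromptRegistry from a file written by save_registry."""
    registry = PromptRegistry()
    with open(path) as f:
        payload = json.load(f)
    for name, versions in payload.items():
        for data in versions:
            version = PromptVersion(
                version=data["version"],
                content=data["content"],
                metadata=data["metadata"],
            )
            # Restore the original timestamp and metrics rather than regenerating them
            version.created_at = data["created_at"]
            version.performance_metrics = data["performance_metrics"]
            registry.register_prompt(name, version)
    return registry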
Model Selection and A/B Testing
import random
from enum import Enum
from typing import Dict


class Model(Enum):
    """Available models."""
    CLAUDE_OPUS = "claude-3-opus-20240229"
    CLAUDE_SONNET = "claude-3-5-sonnet-20241022"
    CLAUDE_HAIKU = "claude-3-5-haiku-20241022"


class ModelRouter:
    """Route requests to different models."""

    def __init__(self):
        self.traffic_split = {
            Model.CLAUDE_OPUS: 0.1,
            Model.CLAUDE_SONNET: 0.8,
            Model.CLAUDE_HAIKU: 0.1
        }
        self.model_performance = {}

    def select_model(self) -> Model:
        """Select model based on traffic split."""
        models = list(self.traffic_split.keys())
        weights = list(self.traffic_split.values())
        return random.choices(models, weights=weights)[0]

    def record_performance(self, model: Model, latency: float, quality_score: float):
        """Record model performance."""
        if model not in self.model_performance:
            self.model_performance[model] = {"latencies": [], "quality_scores": []}
        self.model_performance[model]["latencies"].append(latency)
        self.model_performance[model]["quality_scores"].append(quality_score)

    def get_performance_summary(self) -> Dict:
        """Get performance summary for all models."""
        summary = {}
        for model, metrics in self.model_performance.items():
            if metrics["latencies"]:
                avg_latency = sum(metrics["latencies"]) / len(metrics["latencies"])
                avg_quality = sum(metrics["quality_scores"]) / len(metrics["quality_scores"])
                summary[model.value] = {
                    "avg_latency": avg_latency,
                    "avg_quality": avg_quality,
                    "num_requests": len(metrics["latencies"])
                }
        return summary


class ABTestingFramework:
    """A/B testing infrastructure."""

    def __init__(self):
        self.router = ModelRouter()
        self.experiments = {}

    def create_experiment(self, experiment_id: str, control_model: Model, variant_model: Model):
        """Create an A/B test experiment."""
        self.experiments[experiment_id] = {
            "control": control_model,
            "variant": variant_model,
            "control_results": [],
            "variant_results": []
        }

    def run_experiment(self, experiment_id: str, request_data: Dict):
        """Run a single experiment iteration."""
        exp = self.experiments[experiment_id]

        # Run on control
        control_model = exp["control"]
        control_result = self._execute_model(control_model, request_data)
        exp["control_results"].append(control_result)

        # Run on variant
        variant_model = exp["variant"]
        variant_result = self._execute_model(variant_model, request_data)
        exp["variant_results"].append(variant_result)

        return {
            "control": control_result,
            "variant": variant_result
        }

    def analyze_experiment(self, experiment_id: str) -> Dict:
        """Analyze A/B test results."""
        exp = self.experiments[experiment_id]
        control_scores = [r["score"] for r in exp["control_results"]]
        variant_scores = [r["score"] for r in exp["variant_results"]]
        avg_control = sum(control_scores) / len(control_scores) if control_scores else 0
        avg_variant = sum(variant_scores) / len(variant_scores) if variant_scores else 0
        return {
            "control_avg_score": avg_control,
            "variant_avg_score": avg_variant,
            "improvement": ((avg_variant - avg_control) / avg_control * 100) if avg_control > 0 else 0,
            "control_count": len(control_scores),
            "variant_count": len(variant_scores)
        }

    def _execute_model(self, model: Model, request_data: Dict) -> Dict:
        """Execute model and return result."""
        # Simulated execution
        return {"score": 0.85}
Cost Management
from typing import Dict, List, Tuple


class TokenCounter:
    """Estimate token usage and costs."""

    # Approximate tokens per character
    TOKENS_PER_CHAR = 0.25

    # Pricing (USD per 1M tokens)
    PRICING = {
        "claude-3-opus-20240229": {"input": 15.00, "output": 75.00},
        "claude-3-5-sonnet-20241022": {"input": 3.00, "output": 15.00},
        "claude-3-5-haiku-20241022": {"input": 0.80, "output": 4.00}
    }

    @classmethod
    def estimate_tokens(cls, text: str) -> int:
        """Estimate token count."""
        return int(len(text) * cls.TOKENS_PER_CHAR)

    @classmethod
    def estimate_cost(
        cls,
        model: str,
        input_text: str,
        output_estimate: str = "medium"
    ) -> float:
        """Estimate API call cost."""
        input_tokens = cls.estimate_tokens(input_text)

        # Estimate output tokens
        if output_estimate == "small":
            output_tokens = 100
        elif output_estimate == "medium":
            output_tokens = 500
        else:
            output_tokens = 2000

        prices = cls.PRICING.get(model)
        if not prices:
            return 0.0
        input_cost = (input_tokens / 1_000_000) * prices["input"]
        output_cost = (output_tokens / 1_000_000) * prices["output"]
        return input_cost + output_cost


class CostTracker:
    """Track and manage costs."""

    def __init__(self, monthly_budget: float):
        self.monthly_budget = monthly_budget
        self.current_spending = 0.0
        self.call_history: List[Tuple[str, float]] = []

    def log_call(self, model: str, cost: float):
        """Log API call cost."""
        self.call_history.append((model, cost))
        self.current_spending += cost
        if self.is_over_budget():
            self.trigger_alert()

    def is_over_budget(self) -> bool:
        """Check if over budget."""
        return self.current_spending > self.monthly_budget

    def get_budget_remaining(self) -> float:
        """Get remaining budget."""
        return max(0, self.monthly_budget - self.current_spending)

    def trigger_alert(self):
        """Alert when over budget."""
        print(f"WARNING: Over budget! Spent ${self.current_spending:.2f} of ${self.monthly_budget:.2f}")

    def get_cost_breakdown(self) -> Dict[str, float]:
        """Get breakdown by model."""
        breakdown = {}
        for model, cost in self.call_history:
            breakdown[model] = breakdown.get(model, 0) + cost
        return breakdown


# Usage
tracker = CostTracker(monthly_budget=1000.0)
cost = TokenCounter.estimate_cost(
    "claude-3-5-sonnet-20241022",
    "Long input text...",
    output_estimate="medium"
)
tracker.log_call("claude-3-5-sonnet-20241022", cost)
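One way to put the estimate to work is to gate calls on the remaining budget. The safe_call helper below is an illustrative sketch (not part of the classes above), assuming you simply refuse new calls once the monthly budget is exhausted:

def safe_call(tracker: CostTracker, summarizer: DocumentSummarizer, document: str) -> str:
    """Estimate the cost up front and refuse the call if the budget is exhausted."""
    estimated = TokenCounter.estimate_cost(summarizer.config.model, document)
    if estimated > tracker.get_budget_remaining():
        raise RuntimeError("Monthly budget exhausted; call refused")
    result = summarizer.summarize(document)
    tracker.log_call(summarizer.config.model, estimated)
    return result


print(tracker.get_cost_breakdown())  # spending so far, keyed by model
print(f"Remaining budget: ${tracker.get_budget_remaining():.2f}")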
Deployment Pipeline
from enum import Enum
from typing import Callable, Dict


class Environment(Enum):
    """Deployment environments."""
    DEV = "development"
    STAGING = "staging"
    PRODUCTION = "production"


class DeploymentPipeline:
    """CI/CD pipeline for LLM applications."""

    def __init__(self):
        self.stages = []

    def add_stage(self, name: str, check_func: Callable[[Dict], bool]):
        """Add a validation stage."""
        self.stages.append({"name": name, "check": check_func})

    def run_pipeline(self, artifact: Dict) -> bool:
        """Run the full pipeline, stopping at the first failing stage."""
        for stage in self.stages:
            print(f"Running: {stage['name']}")
            if not stage["check"](artifact):
                print(f"FAILED: {stage['name']}")
                return False
            print(f"PASSED: {stage['name']}")
        return True


# Example pipeline
pipeline = DeploymentPipeline()
pipeline.add_stage(
    "Syntax Check",
    lambda a: True  # Check prompt syntax
)
pipeline.add_stage(
    "Unit Tests",
    lambda a: True  # Run unit tests
)
pipeline.add_stage(
    "Integration Tests",
    lambda a: True  # Run integration tests
)
pipeline.add_stage(
    "Performance Tests",
    lambda a: True  # Benchmark latency/quality
)
pipeline.add_stage(
    "Security Review",
    lambda a: True  # Check for vulnerabilities
)
pipeline.run_pipeline({})
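The lambdas above are placeholders that always pass. As one illustrative sketch of a real stage (the {document} placeholder convention and the artifact layout are assumptions carried over from the prompt registry example), a check could verify that every prompt version being deployed still contains the template variable it is formatted with:

def check_prompt_templates(artifact: Dict) -> bool:
    """Fail the stage if any prompt version lacks the expected placeholder."""
    for name, versions in artifact.get("prompts", {}).items():
        for version in versions:
            if "{document}" not in version["content"]:
                print(f"Prompt {name}@{version['version']} is missing {{document}}")
                return False
    return True


pipeline.add_stage("Prompt Template Check", check_prompt_templates)
pipeline.run_pipeline({"prompts": {"summarize": [v1.to_dict(), v2.to_dict()]}})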
Key Takeaway
LLMOps bridges the gap between experimental AI development and production systems. Proper versioning, evaluation, cost management, and deployment infrastructure ensure that AI applications remain maintainable, cost-effective, and reliable at scale.
Exercises
- Prompt Versioning: Build a prompt registry that tracks versions, metadata, and performance metrics.
- A/B Testing: Implement an A/B testing framework comparing two different model configurations.
- Cost Tracking: Create a cost tracker that monitors spending across different models and alerts on budget overruns.
- Evaluation Pipeline: Build an evaluation suite that runs quality metrics on model outputs.
- Deployment Pipeline: Implement a CI/CD pipeline with stages for testing, validation, and deployment.
- Analytics Dashboard: Create dashboards to visualize cost, performance, and quality metrics over time.