LLMOps: From Prototype to Production
LLMOps encompasses the practices, tools, and infrastructure needed to develop, deploy, and manage large language model applications at scale. Unlike traditional MLOps, LLMOps addresses unique challenges: prompt versioning, model selection, cost management, and behavioral consistency across versions.
From Prototype to Production
The journey from a notebook prototype to a production system involves multiple stages:
Stage 1: Experimentation
# Prototype: Ad-hoc exploration
import anthropic

client = anthropic.Anthropic()
response = client.messages.create(
    model="claude-3-5-sonnet-20241022",
    max_tokens=1024,
    messages=[
        {"role": "user", "content": "Summarize this document: ..."}
    ]
)
print(response.content[0].text)
Stage 2: Codification
Move from notebooks to modular, tested code:
from dataclasses import dataclass

import anthropic


@dataclass
class SummarizationConfig:
    """Configuration for summarization task."""
    model: str = "claude-3-5-sonnet-20241022"
    max_tokens: int = 1000
    temperature: float = 0.7
    system_prompt: str = "You are a helpful assistant that creates concise summaries."


class DocumentSummarizer:
    """Production-ready document summarization."""

    def __init__(self, config: SummarizationConfig):
        self.client = anthropic.Anthropic()
        self.config = config

    def summarize(self, document: str) -> str:
        """Summarize a document."""
        response = self.client.messages.create(
            model=self.config.model,
            max_tokens=self.config.max_tokens,
            temperature=self.config.temperature,
            system=self.config.system_prompt,
            messages=[
                {"role": "user", "content": f"Summarize: {document}"}
            ]
        )
        return response.content[0].text


# Usage
config = SummarizationConfig()
summarizer = DocumentSummarizer(config)
result = summarizer.summarize("Long document here...")
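The summarize call above has no failure handling, while production services usually retry transient API errors. A minimal sketch, assuming the SDK's RateLimitError and APIConnectionError exception types and an arbitrary exponential backoff (the summarize_with_retry helper is illustrative, not part of the class above):

import time


def summarize_with_retry(summarizer: DocumentSummarizer, document: str, max_retries: int = 3) -> str:
    """Retry transient API failures with simple exponential backoff (illustrative sketch)."""
    for attempt in range(max_retries):
        try:
            return summarizer.summarize(document)
        except (anthropic.RateLimitError, anthropic.APIConnectionError):
            if attempt == max_retries - 1:
                raise  # give up after the final attempt
            time.sleep(2 ** attempt)  # wait 1s, then 2s, between attempts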
Stage 3: Evaluation and Testing
from typing import List
from dataclasses import dataclass


@dataclass
class EvaluationResult:
    """Results of evaluation."""
    test_case: str
    output: str
    expected: str
    score: float


class SummarizationEvaluator:
    """Evaluate summarization quality."""

    def __init__(self, summarizer: DocumentSummarizer):
        self.summarizer = summarizer

    def evaluate_rouge(self, generated: str, reference: str) -> float:
        """Calculate ROUGE-1 F1 score."""
        from rouge_score import rouge_scorer
        scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
        scores = scorer.score(reference, generated)
        return scores["rouge1"].fmeasure

    def evaluate_dataset(self, test_cases: List[dict]) -> List[EvaluationResult]:
        """Evaluate on test dataset."""
        results = []
        for test in test_cases:
            output = self.summarizer.summarize(test["document"])
            score = self.evaluate_rouge(output, test["expected_summary"])
            results.append(EvaluationResult(
                test_case=test["id"],
                output=output,
                expected=test["expected_summary"],
                score=score
            ))
        return results

    def get_average_score(self, results: List[EvaluationResult]) -> float:
        """Calculate average score."""
        if not results:
            return 0.0
        return sum(r.score for r in results) / len(results)


# Test the system
test_data = [
    {
        "id": "test_1",
        "document": "Long text...",
        "expected_summary": "Brief summary..."
    }
]
evaluator = SummarizationEvaluator(summarizer)
results = evaluator.evaluate_dataset(test_data)
print(f"Average ROUGE score: {evaluator.get_average_score(results)}")
Prompt Versioning and Management
Prompts are code and should be versioned like any software artifact:
from datetime import datetime
from typing import Dict, List


class PromptVersion:
    """Represents a versioned prompt."""

    def __init__(self, version: str, content: str, metadata: Dict = None):
        self.version = version
        self.content = content
        self.metadata = metadata or {}
        self.created_at = datetime.utcnow().isoformat()
        self.performance_metrics = {}

    def to_dict(self) -> Dict:
        """Serialize to dictionary."""
        return {
            "version": self.version,
            "content": self.content,
            "metadata": self.metadata,
            "created_at": self.created_at,
            "performance_metrics": self.performance_metrics
        }


class PromptRegistry:
    """Central registry for prompts."""

    def __init__(self):
        self.prompts: Dict[str, List[PromptVersion]] = {}

    def register_prompt(self, name: str, prompt: PromptVersion):
        """Register a new prompt version."""
        if name not in self.prompts:
            self.prompts[name] = []
        self.prompts[name].append(prompt)
        print(f"Registered {name}@{prompt.version}")

    def get_prompt(self, name: str, version: str = "latest") -> PromptVersion:
        """Retrieve a prompt version."""
        if name not in self.prompts:
            raise ValueError(f"Prompt {name} not found")
        versions = self.prompts[name]
        if version == "latest":
            return versions[-1]
        for p in versions:
            if p.version == version:
                return p
        raise ValueError(f"Prompt version {name}@{version} not found")

    def compare_prompts(self, name: str, v1: str, v2: str) -> Dict:
        """Compare two prompt versions."""
        p1 = self.get_prompt(name, v1)
        p2 = self.get_prompt(name, v2)
        return {
            "v1": {"version": p1.version, "content": p1.content},
            "v2": {"version": p2.version, "content": p2.content},
            "created_v1": p1.created_at,
            "created_v2": p2.created_at
        }


# Usage
registry = PromptRegistry()
v1 = PromptVersion(
    version="1.0.0",
    content="Summarize the following: {document}",
    metadata={"author": "team_a"}
)
v2 = PromptVersion(
    version="1.1.0",
    content="Create a concise summary of the document below, focusing on key points:\n\n{document}",
    metadata={"author": "team_b", "improvement": "added focus instruction"}
)
registry.register_prompt("summarize", v1)
registry.register_prompt("summarize", v2)

# Use latest version
latest = registry.get_prompt("summarize")
print(latest.content)
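The registry above lives only in memory. A minimal persistence sketch (the save_registry/load_registry helpers and the prompts.json filename are illustrative assumptions, not part of the registry itself) writes every version to JSON so prompts can be reviewed and diffed alongside application code:

import json


def save_registry(registry: PromptRegistry, path: str = "prompts.json") -> None:
    """Serialize every registered prompt version to a JSON file."""
    payload = {
        name: [version.to_dict() for version in versions]
        for name, versions in registry.prompts.items()
    }
    with open(path, "w") as f:
        json.dump(payload, f, indent=2)


def load_registry(path: str = "prompts.json") -> PromptRegistry:
    """Rebuild a PromptRegistry from a file written by save_registry."""
    registry = PromptRegistry()
    with open(path) as f:
        payload = json.load(f)
    for name, versions in payload.items():
        for data in versions:
            version = PromptVersion(
                version=data["version"],
                content=data["content"],
                metadata=data["metadata"],
            )
            # Restore the original timestamp and metrics rather than regenerating them
            version.created_at = data["created_at"]
            version.performance_metrics = data["performance_metrics"]
            registry.register_prompt(name, version)
    return registry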
Model Selection and A/B Testing
import random
from enum import Enum
from typing import Dict


class Model(Enum):
    """Available models."""
    CLAUDE_OPUS = "claude-3-opus-20240229"
    CLAUDE_SONNET = "claude-3-5-sonnet-20241022"
    CLAUDE_HAIKU = "claude-3-5-haiku-20241022"


class ModelRouter:
    """Route requests to different models."""

    def __init__(self):
        self.traffic_split = {
            Model.CLAUDE_OPUS: 0.1,
            Model.CLAUDE_SONNET: 0.8,
            Model.CLAUDE_HAIKU: 0.1
        }
        self.model_performance = {}

    def select_model(self) -> Model:
        """Select model based on traffic split."""
        models = list(self.traffic_split.keys())
        weights = list(self.traffic_split.values())
        return random.choices(models, weights=weights)[0]

    def record_performance(self, model: Model, latency: float, quality_score: float):
        """Record model performance."""
        if model not in self.model_performance:
            self.model_performance[model] = {"latencies": [], "quality_scores": []}
        self.model_performance[model]["latencies"].append(latency)
        self.model_performance[model]["quality_scores"].append(quality_score)

    def get_performance_summary(self) -> Dict:
        """Get performance summary for all models."""
        summary = {}
        for model, metrics in self.model_performance.items():
            if metrics["latencies"]:
                avg_latency = sum(metrics["latencies"]) / len(metrics["latencies"])
                avg_quality = sum(metrics["quality_scores"]) / len(metrics["quality_scores"])
                summary[model.value] = {
                    "avg_latency": avg_latency,
                    "avg_quality": avg_quality,
                    "num_requests": len(metrics["latencies"])
                }
        return summary


class ABTestingFramework:
    """A/B testing infrastructure."""

    def __init__(self):
        self.router = ModelRouter()
        self.experiments = {}

    def create_experiment(self, experiment_id: str, control_model: Model, variant_model: Model):
        """Create an A/B test experiment."""
        self.experiments[experiment_id] = {
            "control": control_model,
            "variant": variant_model,
            "control_results": [],
            "variant_results": []
        }

    def run_experiment(self, experiment_id: str, request_data: Dict):
        """Run a single experiment iteration."""
        exp = self.experiments[experiment_id]

        # Run on control
        control_model = exp["control"]
        control_result = self._execute_model(control_model, request_data)
        exp["control_results"].append(control_result)

        # Run on variant
        variant_model = exp["variant"]
        variant_result = self._execute_model(variant_model, request_data)
        exp["variant_results"].append(variant_result)

        return {
            "control": control_result,
            "variant": variant_result
        }

    def analyze_experiment(self, experiment_id: str) -> Dict:
        """Analyze A/B test results."""
        exp = self.experiments[experiment_id]
        control_scores = [r["score"] for r in exp["control_results"]]
        variant_scores = [r["score"] for r in exp["variant_results"]]
        avg_control = sum(control_scores) / len(control_scores) if control_scores else 0
        avg_variant = sum(variant_scores) / len(variant_scores) if variant_scores else 0
        return {
            "control_avg_score": avg_control,
            "variant_avg_score": avg_variant,
            "improvement": ((avg_variant - avg_control) / avg_control * 100) if avg_control > 0 else 0,
            "control_count": len(control_scores),
            "variant_count": len(variant_scores)
        }

    def _execute_model(self, model: Model, request_data: Dict) -> Dict:
        """Execute model and return result."""
        # Simulated execution
        return {"score": 0.85}
Cost Management
from typing import Dict, List, Tuple


class TokenCounter:
    """Estimate token usage and costs."""

    # Approximate tokens per character
    TOKENS_PER_CHAR = 0.25

    # Pricing (USD per 1M tokens)
    PRICING = {
        "claude-3-opus-20240229": {"input": 15.00, "output": 75.00},
        "claude-3-5-sonnet-20241022": {"input": 3.00, "output": 15.00},
        "claude-3-5-haiku-20241022": {"input": 0.80, "output": 4.00}
    }

    @classmethod
    def estimate_tokens(cls, text: str) -> int:
        """Estimate token count."""
        return int(len(text) * cls.TOKENS_PER_CHAR)

    @classmethod
    def estimate_cost(
        cls,
        model: str,
        input_text: str,
        output_estimate: str = "medium"
    ) -> float:
        """Estimate API call cost."""
        input_tokens = cls.estimate_tokens(input_text)

        # Estimate output tokens
        if output_estimate == "small":
            output_tokens = 100
        elif output_estimate == "medium":
            output_tokens = 500
        else:
            output_tokens = 2000

        prices = cls.PRICING.get(model)
        if not prices:
            return 0.0
        input_cost = (input_tokens / 1_000_000) * prices["input"]
        output_cost = (output_tokens / 1_000_000) * prices["output"]
        return input_cost + output_cost


class CostTracker:
    """Track and manage costs."""

    def __init__(self, monthly_budget: float):
        self.monthly_budget = monthly_budget
        self.current_spending = 0.0
        self.call_history: List[Tuple[str, float]] = []

    def log_call(self, model: str, cost: float):
        """Log API call cost."""
        self.call_history.append((model, cost))
        self.current_spending += cost
        if self.is_over_budget():
            self.trigger_alert()

    def is_over_budget(self) -> bool:
        """Check if over budget."""
        return self.current_spending > self.monthly_budget

    def get_budget_remaining(self) -> float:
        """Get remaining budget."""
        return max(0, self.monthly_budget - self.current_spending)

    def trigger_alert(self):
        """Alert when over budget."""
        print(f"WARNING: Over budget! Spent ${self.current_spending:.2f} of ${self.monthly_budget:.2f}")

    def get_cost_breakdown(self) -> Dict[str, float]:
        """Get breakdown by model."""
        breakdown = {}
        for model, cost in self.call_history:
            breakdown[model] = breakdown.get(model, 0) + cost
        return breakdown


# Usage
tracker = CostTracker(monthly_budget=1000.0)
cost = TokenCounter.estimate_cost(
    "claude-3-5-sonnet-20241022",
    "Long input text...",
    output_estimate="medium"
)
tracker.log_call("claude-3-5-sonnet-20241022", cost)
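One way to put the estimate to work is to gate calls on the remaining budget. The safe_call helper below is an illustrative sketch (not part of the classes above), assuming you simply refuse new calls once the monthly budget is exhausted:

def safe_call(tracker: CostTracker, summarizer: DocumentSummarizer, document: str) -> str:
    """Estimate the cost up front and refuse the call if the budget is exhausted."""
    estimated = TokenCounter.estimate_cost(summarizer.config.model, document)
    if estimated > tracker.get_budget_remaining():
        raise RuntimeError("Monthly budget exhausted; call refused")
    result = summarizer.summarize(document)
    tracker.log_call(summarizer.config.model, estimated)
    return result


print(tracker.get_cost_breakdown())  # spending so far, keyed by model
print(f"Remaining budget: ${tracker.get_budget_remaining():.2f}")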
Deployment Pipeline
from enum import Enum
from typing import Callable, Dict


class Environment(Enum):
    """Deployment environments."""
    DEV = "development"
    STAGING = "staging"
    PRODUCTION = "production"


class DeploymentPipeline:
    """CI/CD pipeline for LLM applications."""

    def __init__(self):
        self.stages = []

    def add_stage(self, name: str, check_func: Callable[[Dict], bool]):
        """Add a validation stage."""
        self.stages.append({"name": name, "check": check_func})

    def run_pipeline(self, artifact: Dict) -> bool:
        """Run the full pipeline, stopping at the first failing stage."""
        for stage in self.stages:
            print(f"Running: {stage['name']}")
            if not stage["check"](artifact):
                print(f"FAILED: {stage['name']}")
                return False
            print(f"PASSED: {stage['name']}")
        return True


# Example pipeline
pipeline = DeploymentPipeline()
pipeline.add_stage(
    "Syntax Check",
    lambda a: True  # Check prompt syntax
)
pipeline.add_stage(
    "Unit Tests",
    lambda a: True  # Run unit tests
)
pipeline.add_stage(
    "Integration Tests",
    lambda a: True  # Run integration tests
)
pipeline.add_stage(
    "Performance Tests",
    lambda a: True  # Benchmark latency/quality
)
pipeline.add_stage(
    "Security Review",
    lambda a: True  # Check for vulnerabilities
)
pipeline.run_pipeline({})
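The lambdas above are placeholders that always pass. As one illustrative sketch of a real stage (the {document} placeholder convention and the artifact layout are assumptions carried over from the prompt registry example), a check could verify that every prompt version being deployed still contains the template variable it is formatted with:

def check_prompt_templates(artifact: Dict) -> bool:
    """Fail the stage if any prompt version lacks the expected placeholder."""
    for name, versions in artifact.get("prompts", {}).items():
        for version in versions:
            if "{document}" not in version["content"]:
                print(f"Prompt {name}@{version['version']} is missing {{document}}")
                return False
    return True


pipeline.add_stage("Prompt Template Check", check_prompt_templates)
pipeline.run_pipeline({"prompts": {"summarize": [v1.to_dict(), v2.to_dict()]}})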
Key Takeaway
LLMOps bridges the gap between experimental AI development and production systems. Proper versioning, evaluation, cost management, and deployment infrastructure ensure that AI applications remain maintainable, cost-effective, and reliable at scale.
Exercises
- Prompt Versioning: Build a prompt registry that tracks versions, metadata, and performance metrics.
- A/B Testing: Implement an A/B testing framework comparing two different model configurations.
- Cost Tracking: Create a cost tracker that monitors spending across different models and alerts on budget overruns.
- Evaluation Pipeline: Build an evaluation suite that runs quality metrics on model outputs.
- Deployment Pipeline: Implement a CI/CD pipeline with stages for testing, validation, and deployment.
- Analytics Dashboard: Create dashboards to visualize cost, performance, and quality metrics over time.