Building Trustworthy AI Systems
Trustworthy AI requires explainability, fairness, robustness, and human oversight. Users and stakeholders must be able to understand how decisions are made, trust that predictions are fair, and see clearly where the system's limitations lie.
Explainability and Interpretability
Make model decisions understandable to non-technical stakeholders.
from typing import Any, Dict, List

import anthropic

class ModelExplainer:
    """Generate human-readable explanations for model decisions."""

    def __init__(self):
        self.client = anthropic.Anthropic()

    def explain_prediction(
        self,
        input_data: dict,
        prediction: str,
        model_type: str = "classification"
    ) -> str:
        """Generate an explanation for a prediction."""
        context = f"""
Model Type: {model_type}
Input: {input_data}
Prediction: {prediction}

Explain this prediction in simple terms suitable for non-technical users.
Focus on the most important factors.
"""
        response = self.client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=500,
            system="You are an AI explainability expert. Provide clear, non-technical explanations.",
            messages=[{"role": "user", "content": context}]
        )
        return response.content[0].text

    def feature_importance(
        self,
        model_output: dict,
        top_k: int = 5
    ) -> List[Dict[str, Any]]:
        """Extract the top contributing features by absolute score."""
        if "feature_scores" in model_output:
            scores = model_output["feature_scores"]
            sorted_features = sorted(
                scores.items(),
                key=lambda x: abs(x[1]),
                reverse=True
            )
            return [
                {"feature": name, "importance": score}
                for name, score in sorted_features[:top_k]
            ]
        return []

    def counterfactual_explanation(
        self,
        original_input: dict,
        prediction: str,
        target_prediction: str
    ) -> dict:
        """Generate a counterfactual explanation."""
        prompt = f"""
Original Input: {original_input}
Current Prediction: {prediction}
Target Prediction: {target_prediction}

What minimal changes to the input would change the prediction to the target?
"""
        response = self.client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=300,
            messages=[{"role": "user", "content": prompt}]
        )
        return {
            "original": original_input,
            "current_prediction": prediction,
            "target_prediction": target_prediction,
            "counterfactual": response.content[0].text
        }
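A quick usage sketch. The loan-style input, feature names, and scores below are invented for illustration, and the explain_prediction call assumes an ANTHROPIC_API_KEY is configured:

explainer = ModelExplainer()

# Hypothetical input and prediction; real feature names would come from your model.
explanation = explainer.explain_prediction(
    input_data={"income": 45000, "credit_history_years": 2},
    prediction="rejected"
)

# feature_importance is pure Python and needs no API call.
top_features = explainer.feature_importance(
    model_output={"feature_scores": {"income": -0.7, "credit_history_years": -0.4, "age": 0.1}},
    top_k=2
)
# -> [{'feature': 'income', 'importance': -0.7},
#     {'feature': 'credit_history_years', 'importance': -0.4}]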
Fairness and Bias Testing
Systematically detect and mitigate biases across demographic groups.
from dataclasses import dataclass
from typing import Dict, List

@dataclass
class FairnessMetrics:
    """Fairness evaluation metrics for one demographic group."""
    demographic_group: str
    accuracy: float
    precision: float
    recall: float
    f1_score: float

class FairnessTester:
    """Test model fairness across demographic groups."""

    def __init__(self):
        self.test_results: Dict[str, FairnessMetrics] = {}

    def evaluate_fairness(
        self,
        predictions: List[str],
        ground_truth: List[str],
        demographic_groups: List[str]
    ) -> dict:
        """Evaluate fairness across demographic groups."""
        group_indices = self._partition_by_demographic(demographic_groups)
        fairness_scores = {}
        for group_name, indices in group_indices.items():
            group_preds = [predictions[i] for i in indices]
            group_truth = [ground_truth[i] for i in indices]
            metrics = self._compute_metrics(group_preds, group_truth, group_name)
            fairness_scores[group_name] = metrics
            self.test_results[group_name] = metrics
        return {
            "fairness_scores": fairness_scores,
            "disparities": self._compute_disparities(fairness_scores),
            "fairness_alert": self._check_fairness_violations(fairness_scores)
        }

    def _partition_by_demographic(
        self,
        demographic_groups: List[str]
    ) -> dict:
        """Map each demographic group to the indices of its examples."""
        groups: Dict[str, List[int]] = {}
        for i, group in enumerate(demographic_groups):
            if group not in groups:
                groups[group] = []
            groups[group].append(i)
        return groups

    def _compute_metrics(
        self,
        predictions: List[str],
        ground_truth: List[str],
        group_name: str
    ) -> FairnessMetrics:
        """Compute per-group classification metrics."""
        # Imported lazily so the class is usable without scikit-learn installed.
        from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
        return FairnessMetrics(
            demographic_group=group_name,
            accuracy=accuracy_score(ground_truth, predictions),
            precision=precision_score(
                ground_truth, predictions, average="weighted", zero_division=0
            ),
            recall=recall_score(
                ground_truth, predictions, average="weighted", zero_division=0
            ),
            f1_score=f1_score(
                ground_truth, predictions, average="weighted", zero_division=0
            )
        )

    def _compute_disparities(self, scores: dict) -> dict:
        """Compute performance disparities between groups."""
        disparities = {}
        accuracies = {g: s.accuracy for g, s in scores.items()}
        if len(accuracies) > 1:
            max_acc = max(accuracies.values())
            min_acc = min(accuracies.values())
            disparities["max_accuracy_gap"] = max_acc - min_acc
        return disparities

    def _check_fairness_violations(self, scores: dict) -> bool:
        """Check whether the accuracy gap exceeds the fairness threshold."""
        fairness_threshold = 0.1  # flag gaps larger than 10 percentage points
        disparities = self._compute_disparities(scores)
        gap = disparities.get("max_accuracy_gap", 0)
        return gap > fairness_threshold

class BiasDetector:
    """Detect specific types of bias."""

    def detect_gender_bias(
        self,
        predictions: List[str],
        demographics: List[dict]
    ) -> dict:
        """Compare accuracy between male and female subgroups."""
        male_accuracy = sum(
            1 for i, pred in enumerate(predictions)
            if demographics[i].get("gender") == "male" and pred == demographics[i].get("label")
        ) / max(1, sum(1 for d in demographics if d.get("gender") == "male"))
        female_accuracy = sum(
            1 for i, pred in enumerate(predictions)
            if demographics[i].get("gender") == "female" and pred == demographics[i].get("label")
        ) / max(1, sum(1 for d in demographics if d.get("gender") == "female"))
        return {
            "male_accuracy": male_accuracy,
            "female_accuracy": female_accuracy,
            "gender_bias_gap": abs(male_accuracy - female_accuracy),
            "biased": abs(male_accuracy - female_accuracy) > 0.05  # 5-point gap threshold
        }
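A minimal sketch of running the tester, assuming scikit-learn is installed. The data is invented, with group_b deliberately given worse predictions:

tester = FairnessTester()

# Six invented examples split across two demographic groups.
report = tester.evaluate_fairness(
    predictions=["approve", "reject", "approve", "reject", "reject", "reject"],
    ground_truth=["approve", "reject", "approve", "approve", "approve", "reject"],
    demographic_groups=["group_a", "group_a", "group_a", "group_b", "group_b", "group_b"]
)

print(report["disparities"])     # {'max_accuracy_gap': 0.666...} (group_a: 1.0, group_b: 0.333)
print(report["fairness_alert"])  # True: the gap exceeds the 10-point threshold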
Robustness Testing
Test model robustness to adversarial and edge case inputs.
from typing import Callable, List

import anthropic

class RobustnessTester:
    """Test model robustness to adversarial inputs."""

    def __init__(self):
        self.client = anthropic.Anthropic()

    def test_adversarial_robustness(
        self,
        original_input: str,
        model_predict_fn: Callable[[str], str]
    ) -> dict:
        """Test robustness to adversarial perturbations."""
        adversarial_inputs = self._generate_adversarial_inputs(original_input)
        results = {
            "original_input": original_input,
            "original_prediction": model_predict_fn(original_input),
            "adversarial_tests": []
        }
        for adv_input in adversarial_inputs:
            prediction = model_predict_fn(adv_input)
            results["adversarial_tests"].append({
                "input": adv_input,
                "prediction": prediction,
                "robust": prediction == results["original_prediction"]
            })
        return results

    def _generate_adversarial_inputs(self, text: str) -> List[str]:
        """Generate adversarial variations of the input."""
        adversarial = [
            text.lower(),                            # case change
            text + " " * 50,                         # trailing whitespace
            text[::-1] if len(text) > 10 else text,  # character reversal
            text.replace("a", "@"),                  # character substitution
            " ".join(text.split()[::-1])             # word-order reversal
        ]
        return adversarial

    def test_edge_cases(
        self,
        model_predict_fn: Callable[[str], str],
        edge_cases: List[str]
    ) -> dict:
        """Test the model on edge cases, recording any errors."""
        results = []
        for case in edge_cases:
            try:
                prediction = model_predict_fn(case)
                results.append({
                    "input": case,
                    "prediction": prediction,
                    "error": None
                })
            except Exception as e:
                results.append({
                    "input": case,
                    "prediction": None,
                    "error": str(e)
                })
        return {
            "total_cases": len(results),
            "successful": sum(1 for r in results if r["error"] is None),
            "failed": sum(1 for r in results if r["error"] is not None),
            "results": results
        }
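For illustration, a toy keyword predictor stands in for a real model below. Note that constructing RobustnessTester builds an Anthropic client, so this sketch assumes an ANTHROPIC_API_KEY is set even though this test path never calls the API:

def toy_sentiment_model(text: str) -> str:
    # Stand-in for a real model call; keyword matching only.
    return "positive" if "good" in text.lower() else "negative"

tester = RobustnessTester()
report = tester.test_adversarial_robustness("This product is good", toy_sentiment_model)

print(report["original_prediction"])                       # "positive"
print([t["robust"] for t in report["adversarial_tests"]])  # character reversal flips the label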
Human Oversight and Appeals
Build mechanisms for human review and appeals.
from dataclasses import dataclass
from enum import Enum
from typing import Dict, Optional

class DecisionStatus(Enum):
    AUTO_APPROVED = "auto_approved"
    AUTO_REJECTED = "auto_rejected"
    HUMAN_REVIEW = "human_review"
    APPEALED = "appealed"

@dataclass
class Decision:
    """A decision that may require human review."""
    decision_id: str
    prediction: str
    confidence: float
    status: DecisionStatus
    reviewer_id: Optional[str] = None
    review_notes: Optional[str] = None

class HumanOversightSystem:
    """Route decisions for human review."""

    def __init__(self, high_stakes_threshold: float = 0.6):
        self.high_stakes_threshold = high_stakes_threshold
        self.decisions: Dict[str, Decision] = {}

    def route_decision(
        self,
        decision_id: str,
        prediction: str,
        confidence: float
    ) -> DecisionStatus:
        """Route a decision based on model confidence."""
        if confidence < self.high_stakes_threshold:
            return DecisionStatus.HUMAN_REVIEW
        if confidence > 0.95:
            return DecisionStatus.AUTO_APPROVED
        # Mid-range confidence still goes to a human reviewer.
        return DecisionStatus.HUMAN_REVIEW

    def submit_for_review(
        self,
        decision_id: str,
        prediction: str,
        confidence: float
    ) -> Decision:
        """Record a decision and route it for review."""
        status = self.route_decision(decision_id, prediction, confidence)
        decision = Decision(
            decision_id=decision_id,
            prediction=prediction,
            confidence=confidence,
            status=status
        )
        self.decisions[decision_id] = decision
        return decision

    def handle_appeal(
        self,
        decision_id: str,
        appeal_reason: str,
        reviewer_id: str
    ) -> dict:
        """Process an appeal of a decision."""
        if decision_id not in self.decisions:
            return {"error": "Decision not found"}
        decision = self.decisions[decision_id]
        decision.status = DecisionStatus.APPEALED
        decision.reviewer_id = reviewer_id
        decision.review_notes = appeal_reason
        return {
            "appeal_id": decision_id,
            "original_prediction": decision.prediction,
            "appeal_reason": appeal_reason,
            "status": "under_review"
        }
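A short usage sketch with invented decision IDs and confidence values:

oversight = HumanOversightSystem(high_stakes_threshold=0.6)

d1 = oversight.submit_for_review("dec-001", "approve", confidence=0.98)
print(d1.status)  # DecisionStatus.AUTO_APPROVED

d2 = oversight.submit_for_review("dec-002", "reject", confidence=0.45)
print(d2.status)  # DecisionStatus.HUMAN_REVIEW

appeal = oversight.handle_appeal("dec-002", "New supporting documents", reviewer_id="reviewer-7")
print(appeal["status"])  # "under_review"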
Key Takeaway
Trustworthy AI requires explainability, fairness testing, robustness verification, and human oversight. Implement model cards, test for bias, generate counterfactual explanations, and provide appeal mechanisms.
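The takeaway mentions model cards, which the code above does not cover; a minimal sketch of one, with every field invented for illustration, might look like:

from dataclasses import dataclass, field
from typing import List

@dataclass
class ModelCard:
    """Minimal model card documenting intended use and known limitations."""
    model_name: str
    intended_use: str
    training_data_summary: str
    known_limitations: List[str] = field(default_factory=list)
    fairness_notes: List[str] = field(default_factory=list)

# All values below are hypothetical.
card = ModelCard(
    model_name="loan-approval-v2",
    intended_use="Pre-screening applications for human review, not final decisions",
    training_data_summary="Anonymized historical application records",
    known_limitations=["Not validated on applicants outside the training population"],
    fairness_notes=["Report the max accuracy gap across groups alongside each release"]
)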
Exercises
- Generate explanations for 10 predictions
- Test fairness across 3 demographic groups
- Implement counterfactual explanations
- Test robustness to 20 adversarial inputs
- Build human oversight approval flow
- Create fairness report with visualizations
- Implement appeal handling system