Security-First AI Design
Building Security Into Architecture
Most security problems exist because security was added as an afterthought. This lesson teaches you to design AI systems with security built in from the start—not bolted on later.
Threat Modeling for AI Systems
Start every project with threat modeling:
class AISystemThreatModel:
    """Threat-model scaffold for an AI system: assets, actors, paths, mitigations."""

    def __init__(self, system_name):
        # The model starts empty; the identify_* methods enumerate example content.
        self.system_name = system_name
        self.assets = []
        self.threat_actors = []
        self.attack_paths = []
        self.mitigations = []

    def identify_assets(self):
        """Return the assets worth protecting, grouped into three categories."""
        data = [
            'Customer PII',
            'Training data',
            'Model weights',
            'API keys/credentials',
        ]
        functional = [
            'Model availability',
            'Model accuracy',
            'System performance',
            'User trust',
        ]
        business = [
            'Revenue',
            'Reputation',
            'Regulatory compliance',
            'Intellectual property',
        ]
        return {
            'data_assets': data,
            'functional_assets': functional,
            'business_assets': business,
        }

    def identify_threats(self):
        """Return the modelled threat scenarios keyed by name.

        Each entry records the likely actor, the impact, and a coarse
        likelihood rating (VERY_HIGH / HIGH / MEDIUM).
        """
        def scenario(actor, impact, likelihood):
            # Small builder keeps each threat entry on one readable line.
            return {'actor': actor, 'impact': impact, 'likelihood': likelihood}

        return {
            'prompt_injection': scenario(
                'Opportunistic attacker', 'Data extraction, harmful content', 'VERY_HIGH'),
            'data_leakage': scenario(
                'Curious user, insider', 'Privacy violation, regulatory fine', 'HIGH'),
            'model_poisoning': scenario(
                'Competitor, nation-state', 'Biased outputs, backdoors', 'MEDIUM'),
            'supply_chain_attack': scenario(
                'Attacker targeting dependencies', 'Compromised system', 'MEDIUM'),
            'denial_of_service': scenario(
                'Attacker seeking disruption', 'Service unavailability', 'HIGH'),
        }

    def create_attack_tree(self, target_goal):
        """Map how attackers could achieve *target_goal*.

        Only 'Extract Customer Data' is modelled; any other goal yields
        None (same as the original implicit fall-through).
        """
        if target_goal != 'Extract Customer Data':
            return None  # no tree modelled for other goals
        sub_goals = [
            {
                'name': 'Direct Extraction',
                'methods': ['Prompt injection', 'Direct API call'],
                'difficulty': 'EASY',
            },
            {
                'name': 'Indirect Extraction',
                'methods': ['Context window leakage', 'Inference-time exposure'],
                'difficulty': 'MEDIUM',
            },
            {
                'name': 'Supply Chain',
                'methods': ['Compromised dependency', 'Model poisoning'],
                'difficulty': 'HARD',
            },
        ]
        return {'goal': 'Extract Customer Data', 'sub_goals': sub_goals}
The Principles of Secure AI Design
Principle 1: Least Privilege
Give each component only the permissions it absolutely needs:
class LeastPrivilegeArchitecture:
    """Illustrates least privilege: every component gets only what it needs."""

    def __init__(self):
        # Component name -> the minimal set of operations it may perform.
        self.permissions = {
            'input_validator': ['read_input'],  # nothing else
            'llm_processor': ['read_input', 'write_to_cache'],
            'output_filter': ['read_llm_output'],  # read-only view of the output
            'database_connector': ['read_customer_data'],  # no write access
            'logging_service': ['append_logs'],  # append-only; cannot read old logs
        }

    def grant_minimum_access(self, component, required_operations):
        """Return the minimal permission profile for *component*.

        Unknown components receive an empty profile (deny by default).
        NOTE(review): *required_operations* is currently unused — the grant
        comes from a fixed table; confirm whether it should filter the profile.
        """
        profiles = {
            'input_validator': {
                'operations': ['read_input'],
                'data_access': [],  # touches no stored data at all
                'network_access': False,
                'file_system_access': False,
            },
            'llm_processor': {
                'operations': ['read_input', 'write_to_cache'],
                'data_access': ['input_only'],  # cannot reach other data
                'network_access': True,  # only to the LLM API
                'file_system_access': False,
            },
            'database_connector': {
                'operations': ['read_data'],  # no write, no delete
                'data_access': ['customer_data'],  # only this table
                'row_level_security': True,  # only the current customer's rows
                'network_access': True,  # only to the database
            },
        }
        return profiles.get(component, {})
Principle 2: Defense in Depth
Multiple layers so if one fails, others protect you:
class DefenseInDepthArchitecture:
    """Layered defenses: if one layer fails, the others still protect."""

    def __init__(self):
        def layer(name, purpose, examples):
            # Builder keeps each layer definition compact and uniform.
            return {'name': name, 'purpose': purpose, 'examples': examples}

        # Ordered outermost-first: validation at the edge, response at the core.
        self.layers = [
            layer('Input Validation',
                  'Reject obviously malicious input',
                  ['Detect injection patterns', 'Size limits', 'Format validation']),
            layer('Instruction Hardening',
                  'Make system instructions immutable',
                  ['Instruction hierarchy', 'Repeated constraints', 'Meta-instructions']),
            layer('Access Control',
                  'Limit what AI can do',
                  ['Function whitelisting', 'Permission checking', 'Rate limiting']),
            layer('Output Filtering',
                  'Prevent bad outputs from leaving',
                  ['PII redaction', 'Harmful content detection', 'Format validation']),
            layer('Monitoring & Alerting',
                  'Detect attacks in progress',
                  ['Anomaly detection', 'Rate limiting', 'Suspicious pattern detection']),
            layer('Incident Response',
                  'Respond quickly if attack succeeds',
                  ['Automatic rate limiting', 'Service degradation', 'Human escalation']),
        ]

    def get_layers_for_threat(self, threat):
        """Return the layer names defending against *threat* ([] if unknown)."""
        coverage = {
            'Prompt Injection': ['Input Validation', 'Instruction Hardening', 'Output Filtering'],
            'Data Leakage': ['Access Control', 'Output Filtering', 'Monitoring'],
            'DoS': ['Access Control', 'Rate Limiting', 'Incident Response'],
        }
        return coverage.get(threat, [])
Principle 3: Secure Defaults
Assume the worst and require explicit authorization for exceptions:
class SecureDefaults:
    """Deny-by-default policy set: exceptions require explicit authorization."""

    def __init__(self):
        # Every capability starts denied; any grant must be made explicitly.
        self.default_policies = {
            'data_access': 'DENY',
            'function_calls': 'DENY',
            'external_requests': 'DENY',
            'file_access': 'DENY',
            'model_updates': 'DENY',
        }

    def example_execution(self, request):
        """Walk a request through the secure-default checks.

        *request* is a dict with 'user' and 'data' keys. Returns the ordered
        list of (step_name, details) tuples — only 'admin' is authorized.
        """
        user = request['user']
        data = request['data']
        verdict = 'authorized' if user == 'admin' else 'denied'

        steps = []
        # Step 1: authorize before anything else is even inspected.
        steps.append(('check_authorization', {
            'user': user,
            'data': data,
            'operation': 'read',
            'result': verdict,
        }))
        # Step 2: customer data is treated as high sensitivity by default.
        steps.append(('check_sensitivity', {
            'data': data,
            'sensitivity': 'HIGH',
            'requires_encryption': True,
        }))
        # Step 3: PII access must always be logged, and the logs encrypted.
        steps.append(('check_audit', {
            'operation': 'accessing PII',
            'must_log': True,
            'must_encrypt_logs': True,
        }))
        # Step 4: execution is time-boxed, read-only, and narrowly scoped.
        steps.append(('execute', {
            'permissions': ['read_only'],
            'duration': '1_hour',
            'data_scope': 'current_customer_only',
            'logging': 'detailed',
        }))
        return steps
Principle 4: Fail Safely
When something goes wrong, fail in a way that doesn’t create security problems:
class FailSafeDesign:
    """Fail-safe processing: any failure degrades to a safe, generic result."""

    def process_with_failsafes(self, input_data):
        """Validate, process, and filter *input_data* without leaking internals.

        Returns a status dict:
        - {'status': 'REJECTED', ...} when input validation fails,
        - {'status': 'ERROR', ...} when the output is unsafe or an exception occurs,
        - {'status': 'SUCCESS', 'data': ...} with the filtered response otherwise.

        Relies on collaborators set elsewhere: self.validate, self.llm,
        self.filter_output, self.log_error (not visible in this snippet).
        """
        try:
            validated_input = self.validate(input_data)
            if not validated_input['is_safe']:
                # Fail safe: reject rather than process something suspect.
                return {'status': 'REJECTED', 'reason': 'Input validation failed'}
            response = self.llm.process(validated_input)
            filtered_response = self.filter_output(response)
            if not filtered_response['is_safe']:
                # Fail safe: generic message, never the unfiltered output.
                return {'status': 'ERROR', 'message': 'Unable to process request'}
            return {'status': 'SUCCESS', 'data': filtered_response}
        except Exception as e:
            # Fail safe: log for debugging but expose no internal error details.
            self.log_error(e)
            return {'status': 'ERROR', 'message': 'An error occurred'}

    def fail_safe_on_external_service_failure(self):
        """Call the external service; on failure return a known-safe default.

        BUG FIX: the original assigned the successful result to a local and
        then fell off the end of the method, so the success path implicitly
        returned None. The result is now returned to the caller.
        """
        try:
            return self.external_api.call()
        except Exception:
            # Don't surface the error; degrade to the safe default response.
            return self.get_safe_default_response()
Architectural Patterns
Pattern 1: Sandboxed Execution
Isolate AI processing:
┌─────────────────────────────────────┐
│ User Request │
└──────────────┬──────────────────────┘
│
┌──────────────▼──────────────────────┐
│ Input Validation & Sanitization │
└──────────────┬──────────────────────┘
│
┌──────────────▼──────────────────────┐
│ SANDBOX │
│ ┌─────────────────────────────────┐ │
│ │ LLM Processing │ │
│ │ (No access to: files, DB, │ │
│ │ network, credentials, etc.) │ │
│ └─────────────────────────────────┘ │
└──────────────┬──────────────────────┘
│
┌──────────────▼──────────────────────┐
│ Output Filtering & Validation │
└──────────────┬──────────────────────┘
│
┌──────────────▼──────────────────────┐
│ Encrypted Response to User │
└─────────────────────────────────────┘
Pattern 2: Staged Processing
Process data through multiple isolated stages:
class StagedProcessingArchitecture:
    """Five isolated stages: intake -> sanitize -> process -> filter -> respond."""

    def process(self, input_data):
        """Drive *input_data* through every stage, aborting on invalid input."""
        intake = self.stage_intake(input_data)
        if not intake['valid']:
            return {'error': 'Invalid input'}
        sanitized = self.stage_sanitize(intake['data'])
        processed = self.stage_process(sanitized)
        filtered = self.stage_filter(processed)
        return self.stage_respond(filtered)

    def stage_intake(self, data):
        """Validate and parse input (restricted environment: validate only).

        Placeholder implementation: accepts everything unchanged.
        """
        return {'valid': True, 'data': data}

    def stage_sanitize(self, data):
        """Remove sensitive/dangerous content (isolated; no original-data access).

        Placeholder implementation: passes data through unchanged.
        """
        return data

    def stage_process(self, data):
        """Main processing (sandboxed: no files, DB, or network except LLM API).

        Placeholder implementation: passes data through unchanged.
        """
        return data

    def stage_filter(self, result):
        """Filter output (isolated; sees only the output, never the input).

        Placeholder implementation: passes the result through unchanged.
        """
        return result

    def stage_respond(self, data):
        """Return the response to the user (encrypts before returning).

        Placeholder implementation: returns data unchanged.
        """
        return data
Pattern 3: Capability-Based Security
Give functions only the capabilities they need:
class CapabilityBasedSecurity:
    """Functions run inside a context that holds only their granted capabilities."""

    def execute_with_capabilities(self, function, required_capabilities):
        """Run *function* inside a context limited to its verified capabilities.

        Raises PermissionError when any requested capability is not granted.
        NOTE(review): depends on self.verify_capabilities, which is not
        defined in this snippet — presumably provided elsewhere; confirm.
        """
        granted = self.verify_capabilities(function, required_capabilities)
        if len(granted) < len(required_capabilities):
            raise PermissionError("Function requested unauthorized capabilities")
        # Build the restricted execution context and run the function in it.
        restricted = self.create_context(granted)
        return function(restricted)

    def create_context(self, capabilities):
        """Build a restricted LLM wrapper whose flags mirror *capabilities*."""
        held = set(capabilities)
        context = {
            'can_read_input': 'read_input' in held,
            'can_call_function': 'call_function' in held,
            'can_access_data': 'access_data' in held,
            'can_make_requests': 'make_requests' in held,
            # ... more capabilities would be listed here
        }

        class RestrictedLLM:
            """LLM facade that enforces the capability flags in its context."""

            def __init__(self, context):
                self.context = context

            def process(self, input_text):
                if not self.context['can_read_input']:
                    raise PermissionError("Cannot read input")
                # Process... (placeholder body; returns None like the original)

        return RestrictedLLM(context)
Key Takeaway
Key Takeaway: Security-first design means starting every project with threat modeling, applying security principles (least privilege, defense in depth, secure defaults, fail-safe), and using architectural patterns (sandboxing, staging, capability-based) to make security the foundation, not an afterthought.
Exercise: Design a Secure Architecture
- Create a threat model for a system you want to build
- Identify assets and threats
- Design layered defenses for each threat
- Implement principles: least privilege, defense in depth, secure defaults, fail-safe
- Choose architectural patterns (sandboxing, staging, capability-based)
- Document your design with diagrams and rationale
Next Lesson: Securing RAG Architectures—protecting systems that retrieve and generate.