Security-First AI Design
Building Security Into Architecture
Most security problems exist because security was added as an afterthought. This lesson teaches you to design AI systems with security built in from the start—not bolted on later.
Threat Modeling for AI Systems
Start every project with threat modeling:
class AISystemThreatModel:
    """Threat-model scaffold for an AI system: assets, actors, paths, mitigations."""

    def __init__(self, system_name):
        # The model starts empty; the identify_* methods enumerate example content.
        self.system_name = system_name
        self.assets = []
        self.threat_actors = []
        self.attack_paths = []
        self.mitigations = []

    def identify_assets(self):
        """Return the assets worth protecting, grouped into three categories."""
        data = [
            'Customer PII',
            'Training data',
            'Model weights',
            'API keys/credentials',
        ]
        functional = [
            'Model availability',
            'Model accuracy',
            'System performance',
            'User trust',
        ]
        business = [
            'Revenue',
            'Reputation',
            'Regulatory compliance',
            'Intellectual property',
        ]
        return {
            'data_assets': data,
            'functional_assets': functional,
            'business_assets': business,
        }

    def identify_threats(self):
        """Return the modelled threat scenarios keyed by name.

        Each entry records the likely actor, the impact, and a coarse
        likelihood rating (VERY_HIGH / HIGH / MEDIUM).
        """
        def scenario(actor, impact, likelihood):
            # Small builder keeps each threat entry on one readable line.
            return {'actor': actor, 'impact': impact, 'likelihood': likelihood}

        return {
            'prompt_injection': scenario(
                'Opportunistic attacker', 'Data extraction, harmful content', 'VERY_HIGH'),
            'data_leakage': scenario(
                'Curious user, insider', 'Privacy violation, regulatory fine', 'HIGH'),
            'model_poisoning': scenario(
                'Competitor, nation-state', 'Biased outputs, backdoors', 'MEDIUM'),
            'supply_chain_attack': scenario(
                'Attacker targeting dependencies', 'Compromised system', 'MEDIUM'),
            'denial_of_service': scenario(
                'Attacker seeking disruption', 'Service unavailability', 'HIGH'),
        }

    def create_attack_tree(self, target_goal):
        """Map how attackers could achieve *target_goal*.

        Only 'Extract Customer Data' is modelled; any other goal yields
        None (same as the original implicit fall-through).
        """
        if target_goal != 'Extract Customer Data':
            return None  # no tree modelled for other goals
        sub_goals = [
            {
                'name': 'Direct Extraction',
                'methods': ['Prompt injection', 'Direct API call'],
                'difficulty': 'EASY',
            },
            {
                'name': 'Indirect Extraction',
                'methods': ['Context window leakage', 'Inference-time exposure'],
                'difficulty': 'MEDIUM',
            },
            {
                'name': 'Supply Chain',
                'methods': ['Compromised dependency', 'Model poisoning'],
                'difficulty': 'HARD',
            },
        ]
        return {'goal': 'Extract Customer Data', 'sub_goals': sub_goals}
The Principles of Secure AI Design
Principle 1: Least Privilege
Give each component only the permissions it absolutely needs:
class LeastPrivilegeArchitecture:
    """Illustrates least privilege: every component gets only what it needs."""

    def __init__(self):
        # Component name -> the minimal set of operations it may perform.
        self.permissions = {
            'input_validator': ['read_input'],  # nothing else
            'llm_processor': ['read_input', 'write_to_cache'],
            'output_filter': ['read_llm_output'],  # read-only view of the output
            'database_connector': ['read_customer_data'],  # no write access
            'logging_service': ['append_logs'],  # append-only; cannot read old logs
        }

    def grant_minimum_access(self, component, required_operations):
        """Return the minimal permission profile for *component*.

        Unknown components receive an empty profile (deny by default).
        NOTE(review): *required_operations* is currently unused — the grant
        comes from a fixed table; confirm whether it should filter the profile.
        """
        profiles = {
            'input_validator': {
                'operations': ['read_input'],
                'data_access': [],  # touches no stored data at all
                'network_access': False,
                'file_system_access': False,
            },
            'llm_processor': {
                'operations': ['read_input', 'write_to_cache'],
                'data_access': ['input_only'],  # cannot reach other data
                'network_access': True,  # only to the LLM API
                'file_system_access': False,
            },
            'database_connector': {
                'operations': ['read_data'],  # no write, no delete
                'data_access': ['customer_data'],  # only this table
                'row_level_security': True,  # only the current customer's rows
                'network_access': True,  # only to the database
            },
        }
        return profiles.get(component, {})
Principle 2: Defense in Depth
Multiple layers so if one fails, others protect you:
class DefenseInDepthArchitecture:
    """Layered defenses: if one layer fails, the others still protect."""

    def __init__(self):
        def layer(name, purpose, examples):
            # Builder keeps each layer definition compact and uniform.
            return {'name': name, 'purpose': purpose, 'examples': examples}

        # Ordered outermost-first: validation at the edge, response at the core.
        self.layers = [
            layer('Input Validation',
                  'Reject obviously malicious input',
                  ['Detect injection patterns', 'Size limits', 'Format validation']),
            layer('Instruction Hardening',
                  'Make system instructions immutable',
                  ['Instruction hierarchy', 'Repeated constraints', 'Meta-instructions']),
            layer('Access Control',
                  'Limit what AI can do',
                  ['Function whitelisting', 'Permission checking', 'Rate limiting']),
            layer('Output Filtering',
                  'Prevent bad outputs from leaving',
                  ['PII redaction', 'Harmful content detection', 'Format validation']),
            layer('Monitoring & Alerting',
                  'Detect attacks in progress',
                  ['Anomaly detection', 'Rate limiting', 'Suspicious pattern detection']),
            layer('Incident Response',
                  'Respond quickly if attack succeeds',
                  ['Automatic rate limiting', 'Service degradation', 'Human escalation']),
        ]

    def get_layers_for_threat(self, threat):
        """Return the layer names defending against *threat* ([] if unknown)."""
        coverage = {
            'Prompt Injection': ['Input Validation', 'Instruction Hardening', 'Output Filtering'],
            'Data Leakage': ['Access Control', 'Output Filtering', 'Monitoring'],
            'DoS': ['Access Control', 'Rate Limiting', 'Incident Response'],
        }
        return coverage.get(threat, [])
Principle 3: Secure Defaults
Assume the worst and require explicit authorization for exceptions:
class SecureDefaults:
    """Deny-by-default policy set: exceptions require explicit authorization."""

    def __init__(self):
        # Every capability starts denied; any grant must be made explicitly.
        self.default_policies = {
            'data_access': 'DENY',
            'function_calls': 'DENY',
            'external_requests': 'DENY',
            'file_access': 'DENY',
            'model_updates': 'DENY',
        }

    def example_execution(self, request):
        """Walk a request through the secure-default checks.

        *request* is a dict with 'user' and 'data' keys. Returns the ordered
        list of (step_name, details) tuples — only 'admin' is authorized.
        """
        user = request['user']
        data = request['data']
        verdict = 'authorized' if user == 'admin' else 'denied'

        steps = []
        # Step 1: authorize before anything else is even inspected.
        steps.append(('check_authorization', {
            'user': user,
            'data': data,
            'operation': 'read',
            'result': verdict,
        }))
        # Step 2: customer data is treated as high sensitivity by default.
        steps.append(('check_sensitivity', {
            'data': data,
            'sensitivity': 'HIGH',
            'requires_encryption': True,
        }))
        # Step 3: PII access must always be logged, and the logs encrypted.
        steps.append(('check_audit', {
            'operation': 'accessing PII',
            'must_log': True,
            'must_encrypt_logs': True,
        }))
        # Step 4: execution is time-boxed, read-only, and narrowly scoped.
        steps.append(('execute', {
            'permissions': ['read_only'],
            'duration': '1_hour',
            'data_scope': 'current_customer_only',
            'logging': 'detailed',
        }))
        return steps
Principle 4: Fail Safely
When something goes wrong, fail in a way that doesn’t create security problems:
class FailSafeDesign:
    """Fail-safe processing: any failure degrades to a safe, generic result."""

    def process_with_failsafes(self, input_data):
        """Validate, process, and filter *input_data* without leaking internals.

        Returns a status dict:
        - {'status': 'REJECTED', ...} when input validation fails,
        - {'status': 'ERROR', ...} when the output is unsafe or an exception occurs,
        - {'status': 'SUCCESS', 'data': ...} with the filtered response otherwise.

        Relies on collaborators set elsewhere: self.validate, self.llm,
        self.filter_output, self.log_error (not visible in this snippet).
        """
        try:
            validated_input = self.validate(input_data)
            if not validated_input['is_safe']:
                # Fail safe: reject rather than process something suspect.
                return {'status': 'REJECTED', 'reason': 'Input validation failed'}
            response = self.llm.process(validated_input)
            filtered_response = self.filter_output(response)
            if not filtered_response['is_safe']:
                # Fail safe: generic message, never the unfiltered output.
                return {'status': 'ERROR', 'message': 'Unable to process request'}
            return {'status': 'SUCCESS', 'data': filtered_response}
        except Exception as e:
            # Fail safe: log for debugging but expose no internal error details.
            self.log_error(e)
            return {'status': 'ERROR', 'message': 'An error occurred'}

    def fail_safe_on_external_service_failure(self):
        """Call the external service; on failure return a known-safe default.

        BUG FIX: the original assigned the successful result to a local and
        then fell off the end of the method, so the success path implicitly
        returned None. The result is now returned to the caller.
        """
        try:
            return self.external_api.call()
        except Exception:
            # Don't surface the error; degrade to the safe default response.
            return self.get_safe_default_response()
Architectural Patterns
Pattern 1: Sandboxed Execution
Isolate AI processing:
┌─────────────────────────────────────┐
│ User Request │
└──────────────┬──────────────────────┘
│
┌──────────────▼──────────────────────┐
│ Input Validation & Sanitization │
└──────────────┬──────────────────────┘
│
┌──────────────▼──────────────────────┐
│ SANDBOX │
│ ┌─────────────────────────────────┐ │
│ │ LLM Processing │ │
│ │ (No access to: files, DB, │ │
│ │ network, credentials, etc.) │ │
│ └─────────────────────────────────┘ │
└──────────────┬──────────────────────┘
│
┌──────────────▼──────────────────────┐
│ Output Filtering & Validation │
└──────────────┬──────────────────────┘
│
┌──────────────▼──────────────────────┐
│ Encrypted Response to User │
└─────────────────────────────────────┘
Pattern 2: Staged Processing
Process data through multiple isolated stages:
class StagedProcessingArchitecture:
    """Five isolated stages: intake -> sanitize -> process -> filter -> respond."""

    def process(self, input_data):
        """Drive *input_data* through every stage, aborting on invalid input."""
        intake = self.stage_intake(input_data)
        if not intake['valid']:
            return {'error': 'Invalid input'}
        sanitized = self.stage_sanitize(intake['data'])
        processed = self.stage_process(sanitized)
        filtered = self.stage_filter(processed)
        return self.stage_respond(filtered)

    def stage_intake(self, data):
        """Validate and parse input (restricted environment: validate only).

        Placeholder implementation: accepts everything unchanged.
        """
        return {'valid': True, 'data': data}

    def stage_sanitize(self, data):
        """Remove sensitive/dangerous content (isolated; no original-data access).

        Placeholder implementation: passes data through unchanged.
        """
        return data

    def stage_process(self, data):
        """Main processing (sandboxed: no files, DB, or network except LLM API).

        Placeholder implementation: passes data through unchanged.
        """
        return data

    def stage_filter(self, result):
        """Filter output (isolated; sees only the output, never the input).

        Placeholder implementation: passes the result through unchanged.
        """
        return result

    def stage_respond(self, data):
        """Return the response to the user (encrypts before returning).

        Placeholder implementation: returns data unchanged.
        """
        return data
Pattern 3: Capability-Based Security
Give functions only the capabilities they need:
class CapabilityBasedSecurity:
    """Functions run inside a context that holds only their granted capabilities."""

    def execute_with_capabilities(self, function, required_capabilities):
        """Run *function* inside a context limited to its verified capabilities.

        Raises PermissionError when any requested capability is not granted.
        NOTE(review): depends on self.verify_capabilities, which is not
        defined in this snippet — presumably provided elsewhere; confirm.
        """
        granted = self.verify_capabilities(function, required_capabilities)
        if len(granted) < len(required_capabilities):
            raise PermissionError("Function requested unauthorized capabilities")
        # Build the restricted execution context and run the function in it.
        restricted = self.create_context(granted)
        return function(restricted)

    def create_context(self, capabilities):
        """Build a restricted LLM wrapper whose flags mirror *capabilities*."""
        held = set(capabilities)
        context = {
            'can_read_input': 'read_input' in held,
            'can_call_function': 'call_function' in held,
            'can_access_data': 'access_data' in held,
            'can_make_requests': 'make_requests' in held,
            # ... more capabilities would be listed here
        }

        class RestrictedLLM:
            """LLM facade that enforces the capability flags in its context."""

            def __init__(self, context):
                self.context = context

            def process(self, input_text):
                if not self.context['can_read_input']:
                    raise PermissionError("Cannot read input")
                # Process... (placeholder body; returns None like the original)

        return RestrictedLLM(context)
Key Takeaway
Key Takeaway: Security-first design means starting every project with threat modeling, applying security principles (least privilege, defense in depth, secure defaults, fail-safe), and using architectural patterns (sandboxing, staging, capability-based) to make security the foundation, not an afterthought.
Exercise: Design a Secure Architecture
- Create a threat model for a system you want to build
- Identify assets and threats
- Design layered defenses for each threat
- Implement principles: least privilege, defense in depth, secure defaults, fail-safe
- Choose architectural patterns (sandboxing, staging, capability-based)
- Document your design with diagrams and rationale
Next Lesson: Securing RAG Architectures—protecting systems that retrieve and generate.