Intermediate
Model Provenance and Integrity
Tracking Model Lineage and Authenticity
Where did your model come from? How was it trained? Who modified it? Provenance (lineage tracking) and integrity verification answer these questions.
Model Provenance
Model cards document model origin and training:
class ModelCard:
    """Structured documentation of a model's origin, training, evaluation,
    intended use, ethical considerations, and maintenance policy.

    The full card is exposed as the ``card`` attribute (a plain dict),
    suitable for serialization or publication alongside the model.
    """

    def __init__(self):
        # Build each section as a named local first, then assemble the
        # card — keeps every section small and individually readable.
        training_data = {
            'sources': ['internal_conversations', 'public_datasets'],
            'size': '10M examples',
            'preprocessing': 'PII redaction, deduplication',
        }
        training_procedure = {
            'method': 'SFT + RLHF',
            'duration': '2 weeks',
            'compute': '8x A100 GPUs',
        }
        evaluation = {
            'benchmarks': ['MMLU', 'HumanEval'],
            'results': {'MMLU': 0.92, 'HumanEval': 0.78},
            'limitations': [
                'Performs poorly on non-English text',
                'May have biases from training data',
            ],
        }
        intended_use = {
            'primary': 'Customer support chatbot',
            'secondary': ['Documentation generation'],
            'out_of_scope': ['Medical diagnosis', 'Legal advice'],
        }
        ethical_considerations = {
            'risks': ['Bias against minorities', 'Tendency to be overly helpful'],
            'mitigations': ['Regular bias audits', 'Human review of outputs'],
        }
        maintenance = {
            'versioning': 'semver',
            'update_frequency': 'monthly',
            'last_updated': '2024-03-20',
        }

        # Complete card, identical in content to the published example.
        self.card = {
            'model_name': 'SafeChat-v2',
            'created_date': '2024-03-15',
            'creators': ['Team A', 'Team B'],
            'license': 'Apache 2.0',
            'training': {
                'base_model': 'claude-2',
                'training_data': training_data,
                'training_procedure': training_procedure,
            },
            'evaluation': evaluation,
            'intended_use': intended_use,
            'ethical_considerations': ethical_considerations,
            'maintenance': maintenance,
        }
Model Integrity
Verify models haven’t been tampered with:
import hashlib
class SecurityError(Exception):
    """Raised when model integrity or authenticity verification fails."""


class ModelIntegrityVerification:
    """Verify model artifacts with SHA-256 digests and RSA-PSS signatures.

    ``trusted_hashes`` maps a model identifier to its known-good SHA-256
    hex digest; ``verify_model`` fails closed for unknown identifiers.
    """

    # Bytes per read while hashing; keeps memory flat for multi-GB models.
    _CHUNK_SIZE = 65536

    def __init__(self):
        # Known-good SHA-256 hex digests, keyed by model identifier.
        self.trusted_hashes = {
            'gpt-4-v1': 'abc123def456...',
            'claude-2-v1': 'xyz789uvw012...',
        }

    def verify_model(self, model_path, model_id):
        """Verify model integrity with cryptographic hash.

        Args:
            model_path: Path to the model file on disk.
            model_id: Key into ``trusted_hashes``.

        Returns:
            True if the file's SHA-256 digest matches the trusted value.

        Raises:
            SecurityError: If ``model_id`` has no trusted hash recorded,
                or if the computed digest does not match.
        """
        import hmac

        expected_hash = self.trusted_hashes.get(model_id)
        if expected_hash is None:
            # Fail closed: an unknown model must never silently pass.
            raise SecurityError(f"No trusted hash recorded for model '{model_id}'")

        # Hash in chunks so arbitrarily large files never load fully into memory.
        sha256_hash = hashlib.sha256()
        with open(model_path, 'rb') as f:
            for byte_block in iter(lambda: f.read(self._CHUNK_SIZE), b''):
                sha256_hash.update(byte_block)
        actual_hash = sha256_hash.hexdigest()

        # Constant-time comparison avoids leaking digest info via timing.
        if not hmac.compare_digest(actual_hash, expected_hash):
            raise SecurityError(f"Model hash mismatch! Expected {expected_hash}, got {actual_hash}")
        return True

    def sign_model(self, model_path, signing_key):
        """Cryptographically sign model for authenticity.

        Args:
            model_path: Path to the model file on disk.
            signing_key: An RSA private key (``cryptography`` library).

        Returns:
            The RSA-PSS/SHA-256 signature bytes.
        """
        from cryptography.hazmat.primitives import hashes
        from cryptography.hazmat.primitives.asymmetric import padding

        with open(model_path, 'rb') as f:
            model_data = f.read()

        return signing_key.sign(
            model_data,
            padding.PSS(
                mgf=padding.MGF1(hashes.SHA256()),
                salt_length=padding.PSS.MAX_LENGTH,
            ),
            hashes.SHA256(),
        )

    def verify_signature(self, model_path, signature, public_key):
        """Verify model signature.

        Returns True if ``signature`` is a valid RSA-PSS/SHA-256 signature
        over the file's contents under ``public_key``, else False.
        """
        from cryptography.hazmat.primitives import hashes
        from cryptography.hazmat.primitives.asymmetric import padding

        with open(model_path, 'rb') as f:
            model_data = f.read()

        try:
            public_key.verify(
                signature,
                model_data,
                padding.PSS(
                    mgf=padding.MGF1(hashes.SHA256()),
                    salt_length=padding.PSS.MAX_LENGTH,
                ),
                hashes.SHA256(),
            )
            return True
        except Exception:
            # Narrowed from a bare `except:` (which also swallowed
            # KeyboardInterrupt/SystemExit). Any verification failure,
            # including cryptography's InvalidSignature, returns False.
            return False
Model Lineage Tracking
Track how models evolve:
class ModelLineageTracking:
    """Record and query the version history (lineage) of models.

    Versions are stored in ``self.versions`` keyed by ``"<model_id>_v<version>"``.
    """

    def __init__(self):
        # Maps "<model_id>_v<version>" -> metadata dict for that version.
        self.versions = {}

    @staticmethod
    def _version_sort_key(version):
        """Natural-order key so version 10 sorts after version 2.

        Splits dotted versions into components; numeric components compare
        as integers, non-numeric as strings (tagged to avoid int/str mixing).
        """
        return tuple(
            (0, int(part)) if part.isdigit() else (1, part)
            for part in str(version).split('.')
        )

    def record_model_version(self, model_id, version, metadata):
        """Record model version in lineage.

        Args:
            model_id: Stable identifier of the model family.
            version: Version label (int or dotted string, e.g. 2 or "1.2.0").
            metadata: Dict with optional keys: base_model, fine_tuning,
                modifications, hash, trainer, validation.
        """
        # Local import: the surrounding example file only imports hashlib,
        # so bring datetime into scope here to avoid a NameError.
        from datetime import datetime

        self.versions[f"{model_id}_v{version}"] = {
            # Store the exact model_id so lineage queries never rely on
            # fragile key-prefix matching.
            'model_id': model_id,
            'version': version,
            'created_date': datetime.now(),
            'base_model': metadata.get('base_model'),
            'fine_tuning': metadata.get('fine_tuning'),
            'modifications': metadata.get('modifications'),
            'hash': metadata.get('hash'),
            'trainer': metadata.get('trainer'),
            'validation_results': metadata.get('validation'),
        }

    def trace_lineage(self, model_id):
        """Trace model's evolution from base to current.

        Returns a list of {version, date, base, changes} dicts in natural
        version order. Only exact matches on ``model_id`` are included
        (previously a prefix match could pull in 'gpt-4' when tracing 'gpt').
        """
        entries = [
            v for v in self.versions.values()
            if v.get('model_id') == model_id
        ]
        # Natural version ordering: previously a lexicographic sort of the
        # dict keys placed v10 before v2.
        entries.sort(key=lambda e: self._version_sort_key(e['version']))
        return [
            {
                'version': e['version'],
                'date': e['created_date'],
                'base': e['base_model'],
                'changes': e['modifications'],
            }
            for e in entries
        ]
SBOM for AI Models
Software Bill of Materials (SBOM) extended to AI:
class AISoftwareBillOfMaterials:
    """Software Bill of Materials (SBOM) extended to AI systems.

    Tracks models, training data, software dependencies, known
    vulnerabilities, and licenses in a single serializable document
    exposed as the ``sbom`` attribute.
    """

    def __init__(self):
        # Local import: the surrounding example file only imports hashlib,
        # so bring datetime into scope here to avoid a NameError.
        from datetime import datetime

        self.sbom = {
            'spec_version': '1.0',
            # ISO-8601 string so the document is directly JSON-serializable.
            'creation_date': datetime.now().isoformat(),
            'components': {
                'models': [
                    {
                        'type': 'base-model',
                        'name': 'GPT-2',
                        'version': 'base',
                        'source': 'huggingface.co',
                        'hash': 'xyz789...',
                    }
                ],
                'training_data': [
                    {
                        'name': 'Common Crawl',
                        'version': '2024-01',
                        'size': '5TB',
                    }
                ],
                'dependencies': [
                    {
                        'name': 'transformers',
                        'version': '4.30.0',
                        'supplier': 'huggingface',
                    }
                ],
            },
            'known_vulnerabilities': [],
            'licenses': [
                {
                    'component': 'GPT-2',
                    'license': 'MIT',
                }
            ],
        }

    def generate_sbom(self):
        """Generate SBOM for compliance.

        Returns:
            The SBOM as a pretty-printed JSON string.
        """
        # Local import mirrors __init__: json was never imported at file level.
        import json

        return json.dumps(self.sbom, indent=2)
Key Takeaway
Key Takeaway: Document model provenance with model cards, verify integrity with cryptographic hashing and signing, track lineage to understand evolution, and generate SBOMs for transparency and compliance.
Exercise: Create Model Provenance Documentation
- Write model card for a model you use
- Calculate and store hashes for model verification
- Cryptographically sign models with your key
- Track model lineage as you fine-tune and update
- Generate SBOM documenting all components
- Verify integrity of downloaded models
Next Lesson: Secure AI CI/CD Pipelines — security gates in deployment.