Advanced
Multimodal Applications and Deployment
Combining vision, language, and audio enables rich applications like multimodal RAG, document AI, and cross-modal retrieval. This lesson covers building end-to-end systems, routing between modalities, and deploying at scale.
Core Concepts
Multimodal RAG
Retrieve documents based on images, text, or both:
Query (image/text) → Retrieve relevant documents → Generate answer
Document AI
Extract information from documents with mixed content:
Document (images + text) → Layout understanding → Information extraction
Cross-Modal Routing
Route queries to appropriate modality:
Query → Determine modality → Select appropriate model → Process
Practical Implementation
Multimodal RAG System
import faiss
import numpy as np
from transformers import CLIPProcessor, CLIPModel
class MultimodalRAG:
    """CLIP-based retrieval over a mixed corpus of text and image documents.

    Documents are embedded into CLIP's shared text/image space and indexed
    with a FAISS L2 index, so a query in either modality can retrieve
    documents of either modality.
    """

    def __init__(self, index_path=None):
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.documents = []
        self.index = None
        if index_path:
            self.load_index(index_path)

    def load_index(self, index_path):
        """Load a previously saved FAISS index from disk.

        The original referenced this method without defining it, so passing
        index_path raised AttributeError.
        """
        self.index = faiss.read_index(index_path)

    def _embed(self, content, content_type):
        """Return a detached numpy embedding for one text or image item.

        Raises ValueError for any content_type other than 'text'/'image'.
        """
        if content_type == "text":
            inputs = self.processor(text=content, return_tensors="pt")
            features = self.model.get_text_features(**inputs)
        elif content_type == "image":
            inputs = self.processor(images=content, return_tensors="pt")
            features = self.model.get_image_features(**inputs)
        else:
            raise ValueError(f"unsupported content type: {content_type!r}")
        return features.detach().cpu().numpy()

    def index_documents(self, docs, doc_type="mixed"):
        """Embed docs and (re)build the FAISS index over them.

        Each doc is a dict with a "content" key; when doc_type is "mixed"
        (the default), each doc must also carry a "type" key of 'text' or
        'image'. The original default of "mixed" matched neither branch, so
        the first document raised NameError on an unbound embedding.
        """
        embeddings = []
        for doc in docs:
            # Per-document type wins under "mixed"; otherwise the batch type.
            dtype = doc.get("type", doc_type) if doc_type == "mixed" else doc_type
            embeddings.append(self._embed(doc["content"], dtype))
            self.documents.append(doc)
        if not embeddings:
            return  # nothing to index; keep any existing index untouched
        matrix = np.concatenate(embeddings).astype('float32')
        self.index = faiss.IndexFlatL2(matrix.shape[1])
        self.index.add(matrix)

    def retrieve(self, query, query_type="text", k=5):
        """Return up to k documents nearest to the query (text or image)."""
        if self.index is None:
            raise RuntimeError("no documents indexed; call index_documents first")
        # Any non-"text" query_type was treated as an image in the original.
        modality = "text" if query_type == "text" else "image"
        query_emb = self._embed(query, modality).astype('float32')
        distances, indices = self.index.search(query_emb, k)
        # FAISS pads the result with -1 when fewer than k items are indexed.
        return [self.documents[i] for i in indices[0] if i >= 0]
Document AI with Multimodal Models
class DocumentProcessor:
    """Extracts text, visual features, and Q&A-style understanding from a document image."""

    def __init__(self):
        self.vit_model = ViT()        # vision transformer for layout/visual features
        self.ocr_model = PaddleOCR()  # OCR engine for raw text extraction
        self.vlm_model = LLaVA()      # vision-language model for content Q&A

    def process_document(self, image_path):
        """Run OCR, visual encoding, and VQA on the document at image_path.

        Returns a dict with keys "text", "visual_features", and
        "understanding" (question -> VLM answer).
        """
        page = Image.open(image_path)

        # OCR: flatten the recognized lines into newline-separated text.
        # NOTE(review): assumes PaddleOCR's [page][line] -> (box, (text, score))
        # result layout — confirm against the installed PaddleOCR version.
        recognized = self.ocr_model.ocr(image_path)
        text_content = "\n".join(entry[1][0] for entry in recognized[0])

        # Visual/layout features from the vision transformer.
        layout_features = self.vit_model(page)

        # Probe the VLM with a fixed set of comprehension questions.
        probes = (
            "What is the title?",
            "What is the main content?",
            "Are there tables or figures?",
        )
        understanding = {question: self.vlm_model(page, question) for question in probes}

        return {
            "text": text_content,
            "visual_features": layout_features,
            "understanding": understanding,
        }
Modal Routing
class ModalityRouter:
    """Scores a query against each modality and dispatches it to the matching model."""

    def __init__(self):
        # One classifier per modality; each is called as classifier(query)
        # and returns a score.
        self.text_classifier = TextClassifier()
        self.image_classifier = ImageClassifier()
        self.audio_classifier = AudioClassifier()

    def route(self, query):
        """Score the query against every modality.

        Returns (primary_modality, scores) where primary_modality is the
        name of the highest-scoring modality.
        """
        modality_scores = {
            'text': self.text_classifier(query),
            'image': self.image_classifier(query),
            'audio': self.audio_classifier(query),
        }
        primary_modality = max(modality_scores, key=modality_scores.get)
        return primary_modality, modality_scores

    def process(self, query):
        """Route the query, then run it through the chosen modality's model.

        NOTE(review): self.text_model / self.image_model / self.audio_model
        are never assigned in __init__ — they must be attached externally
        before process() is called. TODO confirm the intended wiring.
        """
        modality, _scores = self.route(query)
        # getattr dispatch replaces the original if/elif chain; fail with a
        # clear message instead of a bare AttributeError when the model for
        # the chosen modality was never attached.
        model = getattr(self, f"{modality}_model", None)
        if model is None:
            raise AttributeError(f"no model attached for modality {modality!r}")
        return model(query)
Advanced Techniques
Ensemble Methods
class MultimodalEnsemble:
    """Combines predictions from several models with confidence weighting."""

    def __init__(self, models):
        # models: iterable of objects exposing
        # predict(query) -> (prediction, confidence).
        self.models = models

    def predict(self, query):
        """Return the confidence-weighted ensemble prediction for query.

        Raises ValueError when the ensemble is empty; falls back to uniform
        weights when every confidence is zero (the original divided by zero
        in that case).
        """
        predictions = []
        confidences = []
        for model in self.models:
            pred, conf = model.predict(query)
            predictions.append(pred)
            confidences.append(conf)
        if not predictions:
            raise ValueError("ensemble has no models")
        total = float(sum(confidences))
        if total > 0:
            weights = np.array(confidences) / total
        else:
            # All-zero confidences: weight every model equally instead of
            # producing NaNs via division by zero.
            weights = np.full(len(confidences), 1.0 / len(confidences))
        # NOTE(review): weighted_vote is defined elsewhere — assumed to take
        # (predictions, weights) and return the winning prediction. Confirm.
        final_pred = weighted_vote(predictions, weights)
        return final_pred
Production Considerations
Caching and Optimization
from functools import lru_cache
import hashlib
class OptimizedMultimodalSystem:
    """Embedding pipeline with a per-instance cache and batched encoding."""

    def __init__(self):
        # Maps a content hash to its previously computed embedding.
        self.embedding_cache = {}
        # Number of items encoded per model call.
        self.batch_size = 32

    def get_embedding(self, content_hash):
        """Return the cached embedding for content_hash, or None on a miss.

        The original wrapped this in @lru_cache, which (a) keys on self and
        keeps instances alive for the cache's lifetime (ruff B019), and
        (b) permanently memoized the None returned for a hash that was only
        inserted into embedding_cache later. A plain dict lookup is already
        O(1) and has neither problem.
        """
        return self.embedding_cache.get(content_hash)

    def batch_embed(self, contents):
        """Embed contents in chunks of batch_size and return a flat list.

        NOTE(review): self.model is never assigned in __init__ — it must be
        attached externally before batch_embed is called. TODO confirm the
        intended wiring.
        """
        embeddings = []
        for start in range(0, len(contents), self.batch_size):
            batch = contents[start:start + self.batch_size]
            embeddings.extend(self.model.embed_batch(batch))
        return embeddings
Key Takeaway
Multimodal systems combine vision, language, and audio to unlock rich applications from document AI to cross-modal retrieval. Effective routing, caching, and ensemble methods enable scalable, accurate multimodal applications.
Practical Exercise
Task: Build a multimodal search engine supporting image, text, and voice queries.
Requirements:
- Index 10K documents (mixed modalities)
- Support query in image/text/audio
- Route to appropriate retrieval model
- Rank results by relevance
- Deploy with FastAPI
Evaluation:
- Mean Reciprocal Rank > 0.7
- Multi-modal query handling
- Latency < 500ms
- Scalability test
- User satisfaction