Text Classification and Sentiment Analysis
Text classification is a fundamental NLP task with wide applications: sentiment analysis, spam detection, topic categorization, and intent classification. This lesson covers fine-tuning BERT for classification, handling multi-label problems, data augmentation, and addressing class imbalance.
Core Concepts
Classification Problem Formulation
Single-label classification: Each document belongs to exactly one class
Input: "This movie is amazing!"
Output: positive
Multi-label classification: Document can belong to multiple classes
Input: "Great action and comedy film"
Output: [action, comedy]
Imbalanced classification: Classes have unequal distribution
Classes: negative (10%), neutral (20%), positive (70%)
Challenge: Model biased toward majority class
Loss Functions for Classification
Binary Cross-Entropy (BCE):
L = -Σ(y_i * log(ŷ_i) + (1-y_i) * log(1-ŷ_i))
Used for multi-label (sigmoid outputs per label)
Categorical Cross-Entropy:
L = -Σ y_i * log(ŷ_i)
Single-label with softmax outputs
Focal Loss: Addresses class imbalance by down-weighting easy examples
L = -Σ (1-p_t)^γ * log(p_t)
Where γ is focusing parameter (typically 2)
Data Augmentation for Text
Techniques to create synthetic training data:
- EDA (Easy Data Augmentation): synonym replacement, random insertion/swap/deletion
- Back-translation: Translate to another language and back
- Paraphrasing: Use T5 or other models to paraphrase
- Mixup: Blend embeddings of two samples
Practical Implementation
BERT Fine-tuning for Sentiment Analysis
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
TrainingArguments,
Trainer,
)
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
import numpy as np
# Load the SST-2 sentiment dataset from the GLUE benchmark: binary
# positive/negative sentence classification.
dataset = load_dataset('glue', 'sst2')
# Initialize tokenizer and model
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Sequence-classification head on top of BERT; 'single_label_classification'
# makes the model apply CrossEntropyLoss internally when labels are passed.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,  # positive, negative
    problem_type='single_label_classification'
)
# Tokenization
def preprocess_function(examples):
    """Tokenize a batch of SST-2 examples into fixed-length (128) inputs.

    Relies on the module-level ``tokenizer``; every sequence is padded to
    ``max_length`` so batches stack without a data collator.
    """
    sentences = examples['sentence']
    return tokenizer(sentences, padding='max_length', truncation=True, max_length=128)
# Tokenize every split; batched=True passes lists of examples per call for speed.
tokenized_datasets = dataset.map(preprocess_function, batched=True)
# Evaluation metrics
def compute_metrics(eval_pred):
    """Compute accuracy plus weighted precision/recall/F1 for the Trainer.

    `eval_pred` is a (logits, labels) pair of numpy arrays; the weighted
    average accounts for any class imbalance in the evaluation split.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    prec, rec, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    metrics = {'accuracy': accuracy_score(labels, preds)}
    metrics['precision'] = prec
    metrics['recall'] = rec
    metrics['f1'] = f1
    return metrics
# Training arguments
training_args = TrainingArguments(
    output_dir='./sentiment_model',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,              # linear LR warmup before decay
    weight_decay=0.01,
    logging_steps=100,
    # NOTE(review): `evaluation_strategy` was renamed to `eval_strategy` in
    # recent transformers releases — confirm against the installed version.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,   # restore the checkpoint with the best F1
    metric_for_best_model='f1',
)
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics,
)
# Train
trainer.train()
# Save model and tokenizer together so the directory is directly loadable
# with from_pretrained().
model.save_pretrained('./sentiment_model')
tokenizer.save_pretrained('./sentiment_model')
Multi-Label Classification
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel
class MultiLabelDataset(Dataset):
    """Torch dataset pairing raw texts with multi-hot label vectors.

    Each item is tokenized on the fly to a fixed length; labels are returned
    as float tensors, as required by ``BCEWithLogitsLoss``.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # squeeze() drops the batch dim added by return_tensors='pt'
        item = {key: encoded[key].squeeze() for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item
class MultiLabelBERT(nn.Module):
    """BERT encoder with a per-label (multi-label) classification head.

    NOTE(review): relies on ``AutoModel`` from transformers, which is not
    imported in this snippet's import block — confirm the import exists.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = AutoModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(768, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return dict with 'logits' and, when labels are given, 'loss'."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(encoded.pooler_output)
        logits = self.classifier(pooled)
        if labels is None:
            return {'loss': None, 'logits': logits}
        # BCEWithLogitsLoss applies an independent sigmoid per label —
        # the correct loss for multi-label targets.
        loss = nn.BCEWithLogitsLoss()(logits, labels)
        return {'loss': loss, 'logits': logits}
# Training loop
# NOTE(review): `train_loader` is assumed to be a DataLoader over a
# MultiLabelDataset — it is not defined in this snippet; confirm upstream.
model = MultiLabelBERT(num_labels=6)
# 2e-5 is a standard learning rate for BERT fine-tuning
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
for epoch in range(3):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, labels)
        loss = outputs['loss']
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss for the epoch
    print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}')
# Inference with thresholding
@torch.no_grad()
def predict_multilabel(texts, threshold=0.5):
    """Predict multi-hot label vectors: sigmoid probability > `threshold`.

    Uses the module-level ``model``, ``tokenizer`` and ``device``. Dummy
    all-zero labels are supplied because MultiLabelDataset requires them;
    they are never read at inference time.
    """
    model.eval()
    dummy_labels = [[0] * 6] * len(texts)
    loader = DataLoader(MultiLabelDataset(texts, dummy_labels, tokenizer), batch_size=32)
    results = []
    for batch in loader:
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        probs = torch.sigmoid(model(ids, mask)['logits'])
        results.extend((probs > threshold).int().cpu().numpy())
    return results
Data Augmentation with EDA
import random
from nltk.corpus import wordnet
def get_synonyms(word):
    """Return WordNet synonyms of `word` (excluding the word itself).

    Multi-word lemmas like 'motion_picture' are returned with spaces.
    """
    found = {
        lemma.name().replace('_', ' ')
        for syn in wordnet.synsets(word)
        for lemma in syn.lemmas()
        if lemma.name() != word
    }
    return list(found)
def random_insertion(sentence, n=2):
    """Return `sentence` with up to `n` random WordNet synonyms inserted."""
    tokens = sentence.split()
    inserted = 0
    while inserted < n:
        add_word(tokens)
        inserted += 1
    return ' '.join(tokens)
def add_word(words):
    """Insert a random synonym of some word into `words` at a random index.

    Mutates `words` in place; no-op when no word has any WordNet synonym.
    """
    pool = []
    for token in words:
        pool.extend(get_synonyms(token))
    if not pool:
        return
    chosen = random.choice(pool)
    position = random.randint(0, len(words) - 1)
    words.insert(position, chosen)
def random_swap(sentence, n=2):
    """Return `sentence` with `n` random pairs of words swapped."""
    tokens = sentence.split()
    for _ in range(n):
        swap_word(tokens)
    return ' '.join(tokens)
def swap_word(words):
    """Swap two distinct random positions of `words` in place.

    Fix: the original looped forever when `words` had fewer than two
    elements, because the second index could never differ from the first.
    Lists of length 0 or 1 are now a no-op.
    """
    if len(words) < 2:
        return
    random_idx_1 = random.randint(0, len(words) - 1)
    random_idx_2 = random_idx_1
    # Resample until the two indices differ (guaranteed to terminate now
    # that len(words) >= 2).
    while random_idx_2 == random_idx_1:
        random_idx_2 = random.randint(0, len(words) - 1)
    words[random_idx_1], words[random_idx_2] = words[random_idx_2], words[random_idx_1]
def random_deletion(sentence, p=0.1):
    """Drop each word of `sentence` independently with probability `p`.

    A single-word sentence is returned unchanged. If every word happens to
    be dropped, one word is picked at random so the result is never empty.
    """
    if len(sentence.split()) == 1:
        return sentence
    words = sentence.split()
    kept = [w for w in words if random.uniform(0, 1) > p]
    if not kept:
        return random.choice(words)
    return ' '.join(kept)
def eda(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=4):
    """Easy Data Augmentation (Wei & Zou, 2019).

    Produces `num_aug` augmented variants of `sentence` by chaining synonym
    replacement, random insertion, random swap, and random deletion. The
    alpha_* parameters scale how many words each operation touches; p_rd is
    the per-word deletion probability.

    Fix: the original accepted `alpha_sr` but never performed synonym
    replacement — EDA's defining operation. It is now applied first, as in
    the paper. With small alpha_sr and short sentences the replacement count
    rounds to 0 and output matches the original behavior.
    """
    n_words = len(sentence.split())
    augmented_sentences = []
    for _ in range(num_aug):
        a = _synonym_replacement(sentence, int(alpha_sr * n_words))
        a = random_insertion(a, int(alpha_ri * n_words))
        a = random_swap(a, int(alpha_rs * len(a.split())))
        a = random_deletion(a, p_rd)
        augmented_sentences.append(a)
    return augmented_sentences

def _synonym_replacement(sentence, n):
    """Replace up to `n` distinct words of `sentence` with random synonyms."""
    if n <= 0:
        return sentence
    words = sentence.split()
    indices = list(range(len(words)))
    random.shuffle(indices)
    replaced = 0
    for idx in indices:
        candidates = get_synonyms(words[idx])
        if candidates:
            words[idx] = random.choice(candidates)
            replaced += 1
        if replaced >= n:
            break
    return ' '.join(words)
# Usage: generate four augmented variants of one sentence.
original = "This movie is absolutely wonderful"
augmented = eda(original, num_aug=4)
print("Original:", original)
# Each variant differs randomly from the original.
for i, aug in enumerate(augmented):
    print(f"Augmented {i+1}:", aug)
Handling Class Imbalance
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
# Compute class weights
# NOTE(review): `train_labels` must be defined upstream (sequence of integer
# class ids for the training split) — it is not shown in this snippet.
class_weights = compute_class_weight(
    'balanced',  # weight_c = n_samples / (n_classes * count(c))
    classes=np.unique(train_labels),
    y=train_labels
)
class_weights = torch.tensor(class_weights, dtype=torch.float)
# Use in loss function: minority classes contribute more to the loss.
criterion = nn.CrossEntropyLoss(weight=class_weights)
# Alternative: Focal Loss
class FocalLoss(nn.Module):
    """Focal loss (Lin et al., 2017) for imbalanced classification.

    Per example: alpha * (1 - p_t)^gamma * CE, where p_t = exp(-CE) is the
    probability assigned to the true class. Well-classified examples
    (p_t near 1) are down-weighted; returns the batch mean.
    """

    def __init__(self, alpha=1.0, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        per_example_ce = nn.functional.cross_entropy(inputs, targets, reduction='none')
        true_class_prob = torch.exp(-per_example_ce)
        modulating = (1 - true_class_prob) ** self.gamma
        return (self.alpha * modulating * per_example_ce).mean()
# gamma=2 is the standard focusing parameter from the focal loss paper.
focal_loss = FocalLoss(gamma=2.0)
# In training loop
loss = focal_loss(logits, labels)
Advanced Techniques
Ensemble Methods
def ensemble_predictions(texts, models, tokenizer, device):
    """Average the logits of several classifiers and argmax the result.

    Returns (predicted class ids, averaged logits). Each model is switched
    to eval mode and run under torch.no_grad().
    """
    collected = []
    for clf in models:
        clf.eval()
        encoded = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
        with torch.no_grad():
            collected.append(clf(**encoded).logits)
    # Mean over the model axis, then pick the highest-scoring class.
    avg_logits = torch.stack(collected).mean(dim=0)
    return torch.argmax(avg_logits, dim=-1), avg_logits
Attention Visualization
import matplotlib.pyplot as plt
import seaborn as sns
def visualize_attention(text, model, tokenizer, layer=11, head=0):
    """Plot a heatmap of one BERT self-attention head for `text`.

    NOTE(review): `outputs[-1]` assumes the model returns a tuple whose last
    element is the attention tensors — confirm for the model config in use.
    """
    token_ids = tokenizer.encode(text, return_tensors='pt')
    outputs = model(token_ids, output_attentions=True)
    attentions = outputs[-1]
    # Select one (layer, head) slice for the single batch element.
    weights = attentions[layer][0, head].detach().cpu().numpy()
    tick_labels = tokenizer.convert_ids_to_tokens(token_ids[0])
    plt.figure(figsize=(10, 8))
    sns.heatmap(weights, xticklabels=tick_labels, yticklabels=tick_labels, cmap='viridis')
    plt.title(f'Attention Layer {layer}, Head {head}')
    plt.show()
Production Considerations
Model Serving
from fastapi import FastAPI
from pydantic import BaseModel
app = FastAPI()
# Load the fine-tuned checkpoint saved by the training script above.
model = AutoModelForSequenceClassification.from_pretrained('./sentiment_model')
tokenizer = AutoTokenizer.from_pretrained('./sentiment_model')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# eval() disables dropout so inference is deterministic.
model.to(device).eval()
class TextInput(BaseModel):
    # Request body for /classify: the raw text to score.
    text: str
@app.post('/classify')
async def classify(input: TextInput):
    """Classify `input.text` as positive/negative with softmax confidences.

    NOTE(review): model inference here blocks the event loop; under load,
    consider a sync endpoint or `run_in_executor` — confirm requirements.
    """
    encoded = tokenizer(input.text, return_tensors='pt', padding=True, truncation=True)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
    with torch.no_grad():
        logits = model(**encoded).logits
    probabilities = torch.softmax(logits, dim=-1)
    predicted_class = torch.argmax(probabilities, dim=-1).item()
    labels = {0: 'negative', 1: 'positive'}
    return {
        'label': labels[predicted_class],
        'confidence': probabilities[0, predicted_class].item(),
        'all_scores': {labels[i]: p.item() for i, p in enumerate(probabilities[0])}
    }
Key Takeaway
Text classification fundamentals apply across domains, but mastering fine-tuning strategies, multi-label handling, augmentation, and imbalance solutions distinguishes production systems from prototypes. Invest in data quality and careful evaluation metric selection.
Practical Exercise
Task: Build a toxicity classifier for online comments with multi-label support.
Requirements:
- Load Jigsaw Toxic Comments dataset (6 labels)
- Implement multi-label BERT classifier
- Apply class-weighted focal loss
- Augment training data with EDA
- Evaluate with per-label metrics
- Deploy with FastAPI endpoint
Evaluation:
- Achieve per-label F1 > 0.75
- Analyze false positives (non-toxic comments incorrectly flagged as toxic)
- Compare single-label vs. multi-label approach
- Benchmark inference latency
- Test edge cases and adversarial examples