Text Classification and Sentiment Analysis
Text classification is a fundamental NLP task with wide applications: sentiment analysis, spam detection, topic categorization, and intent classification. This lesson covers fine-tuning BERT for classification, handling multi-label problems, data augmentation, and addressing class imbalance.
Core Concepts
Classification Problem Formulation
Single-label classification: Each document belongs to exactly one class
Input: "This movie is amazing!"
Output: positive
Multi-label classification: Document can belong to multiple classes
Input: "Great action and comedy film"
Output: [action, comedy]
Imbalanced classification: Classes have unequal distribution
Classes: negative (10%), neutral (20%), positive (70%)
Challenge: Model biased toward majority class
Loss Functions for Classification
Binary Cross-Entropy (BCE):
L = -Σ(y_i * log(ŷ_i) + (1-y_i) * log(1-ŷ_i))
Used for multi-label (sigmoid outputs per label)
Categorical Cross-Entropy:
L = -Σ y_i * log(ŷ_i)
Single-label with softmax outputs
Focal Loss: Addresses class imbalance by down-weighting easy examples
L = -Σ (1-p_t)^γ * log(p_t)
Where γ is focusing parameter (typically 2)
Data Augmentation for Text
Techniques to create synthetic training data:
- EDA (Easy Data Augmentation): synonym replacement, random insertion/swap/deletion
- Back-translation: Translate to another language and back
- Paraphrasing: Use T5 or other models to paraphrase
- Mixup: Blend embeddings of two samples
Practical Implementation
BERT Fine-tuning for Sentiment Analysis
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
TrainingArguments,
Trainer,
)
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
import numpy as np
# Load the SST-2 sentiment dataset from the GLUE benchmark: binary
# positive/negative sentence classification.
dataset = load_dataset('glue', 'sst2')
# Initialize tokenizer and model
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Sequence-classification head on top of BERT; 'single_label_classification'
# makes the model apply CrossEntropyLoss internally when labels are passed.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,  # positive, negative
    problem_type='single_label_classification'
)
# Tokenization
def preprocess_function(examples):
    """Tokenize a batch of SST-2 examples into fixed-length (128) inputs.

    Relies on the module-level ``tokenizer``; every sequence is padded to
    ``max_length`` so batches stack without a data collator.
    """
    sentences = examples['sentence']
    return tokenizer(sentences, padding='max_length', truncation=True, max_length=128)
# Tokenize every split; batched=True passes lists of examples per call for speed.
tokenized_datasets = dataset.map(preprocess_function, batched=True)
# Evaluation metrics
def compute_metrics(eval_pred):
    """Compute accuracy plus weighted precision/recall/F1 for the Trainer.

    `eval_pred` is a (logits, labels) pair of numpy arrays; the weighted
    average accounts for any class imbalance in the evaluation split.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    prec, rec, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    metrics = {'accuracy': accuracy_score(labels, preds)}
    metrics['precision'] = prec
    metrics['recall'] = rec
    metrics['f1'] = f1
    return metrics
# Training arguments
training_args = TrainingArguments(
    output_dir='./sentiment_model',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,              # linear LR warmup before decay
    weight_decay=0.01,
    logging_steps=100,
    # NOTE(review): `evaluation_strategy` was renamed to `eval_strategy` in
    # recent transformers releases — confirm against the installed version.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,   # restore the checkpoint with the best F1
    metric_for_best_model='f1',
)
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics,
)
# Train
trainer.train()
# Save model and tokenizer together so the directory is directly loadable
# with from_pretrained().
model.save_pretrained('./sentiment_model')
tokenizer.save_pretrained('./sentiment_model')
Multi-Label Classification
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel
class MultiLabelDataset(Dataset):
    """Torch dataset pairing raw texts with multi-hot label vectors.

    Each item is tokenized on the fly to a fixed length; labels are returned
    as float tensors, as required by ``BCEWithLogitsLoss``.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # squeeze() drops the batch dim added by return_tensors='pt'
        item = {key: encoded[key].squeeze() for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item
class MultiLabelBERT(nn.Module):
    """BERT encoder with a per-label (multi-label) classification head.

    NOTE(review): relies on ``AutoModel`` from transformers, which is not
    imported in this snippet's import block — confirm the import exists.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = AutoModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(768, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return dict with 'logits' and, when labels are given, 'loss'."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(encoded.pooler_output)
        logits = self.classifier(pooled)
        if labels is None:
            return {'loss': None, 'logits': logits}
        # BCEWithLogitsLoss applies an independent sigmoid per label —
        # the correct loss for multi-label targets.
        loss = nn.BCEWithLogitsLoss()(logits, labels)
        return {'loss': loss, 'logits': logits}
# Training loop
# NOTE(review): `train_loader` is assumed to be a DataLoader over a
# MultiLabelDataset — it is not defined in this snippet; confirm upstream.
model = MultiLabelBERT(num_labels=6)
# 2e-5 is a standard learning rate for BERT fine-tuning
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
for epoch in range(3):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, labels)
        loss = outputs['loss']
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss for the epoch
    print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}')
# Inference with thresholding
@torch.no_grad()
def predict_multilabel(texts, threshold=0.5):
    """Predict multi-hot label vectors: sigmoid probability > `threshold`.

    Uses the module-level ``model``, ``tokenizer`` and ``device``. Dummy
    all-zero labels are supplied because MultiLabelDataset requires them;
    they are never read at inference time.
    """
    model.eval()
    dummy_labels = [[0] * 6] * len(texts)
    loader = DataLoader(MultiLabelDataset(texts, dummy_labels, tokenizer), batch_size=32)
    results = []
    for batch in loader:
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        probs = torch.sigmoid(model(ids, mask)['logits'])
        results.extend((probs > threshold).int().cpu().numpy())
    return results
Data Augmentation with EDA
import random
from nltk.corpus import wordnet
def get_synonyms(word):
    """Return WordNet synonyms of `word` (excluding the word itself).

    Multi-word lemmas like 'motion_picture' are returned with spaces.
    """
    found = {
        lemma.name().replace('_', ' ')
        for syn in wordnet.synsets(word)
        for lemma in syn.lemmas()
        if lemma.name() != word
    }
    return list(found)
def random_insertion(sentence, n=2):
    """Return `sentence` with up to `n` random WordNet synonyms inserted."""
    tokens = sentence.split()
    inserted = 0
    while inserted < n:
        add_word(tokens)
        inserted += 1
    return ' '.join(tokens)
def add_word(words):
    """Insert a random synonym of some word into `words` at a random index.

    Mutates `words` in place; no-op when no word has any WordNet synonym.
    """
    pool = []
    for token in words:
        pool.extend(get_synonyms(token))
    if not pool:
        return
    chosen = random.choice(pool)
    position = random.randint(0, len(words) - 1)
    words.insert(position, chosen)
def random_swap(sentence, n=2):
    """Return `sentence` with `n` random pairs of words swapped."""
    tokens = sentence.split()
    for _ in range(n):
        swap_word(tokens)
    return ' '.join(tokens)
def swap_word(words):
    """Swap two distinct random positions of `words` in place.

    Fix: the original looped forever when `words` had fewer than two
    elements, because the second index could never differ from the first.
    Lists of length 0 or 1 are now a no-op.
    """
    if len(words) < 2:
        return
    random_idx_1 = random.randint(0, len(words) - 1)
    random_idx_2 = random_idx_1
    # Resample until the two indices differ (guaranteed to terminate now
    # that len(words) >= 2).
    while random_idx_2 == random_idx_1:
        random_idx_2 = random.randint(0, len(words) - 1)
    words[random_idx_1], words[random_idx_2] = words[random_idx_2], words[random_idx_1]
def random_deletion(sentence, p=0.1):
    """Drop each word of `sentence` independently with probability `p`.

    A single-word sentence is returned unchanged. If every word happens to
    be dropped, one word is picked at random so the result is never empty.
    """
    if len(sentence.split()) == 1:
        return sentence
    words = sentence.split()
    kept = [w for w in words if random.uniform(0, 1) > p]
    if not kept:
        return random.choice(words)
    return ' '.join(kept)
def eda(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=4):
    """Easy Data Augmentation (Wei & Zou, 2019).

    Produces `num_aug` augmented variants of `sentence` by chaining synonym
    replacement, random insertion, random swap, and random deletion. The
    alpha_* parameters scale how many words each operation touches; p_rd is
    the per-word deletion probability.

    Fix: the original accepted `alpha_sr` but never performed synonym
    replacement — EDA's defining operation. It is now applied first, as in
    the paper. With small alpha_sr and short sentences the replacement count
    rounds to 0 and output matches the original behavior.
    """
    n_words = len(sentence.split())
    augmented_sentences = []
    for _ in range(num_aug):
        a = _synonym_replacement(sentence, int(alpha_sr * n_words))
        a = random_insertion(a, int(alpha_ri * n_words))
        a = random_swap(a, int(alpha_rs * len(a.split())))
        a = random_deletion(a, p_rd)
        augmented_sentences.append(a)
    return augmented_sentences

def _synonym_replacement(sentence, n):
    """Replace up to `n` distinct words of `sentence` with random synonyms."""
    if n <= 0:
        return sentence
    words = sentence.split()
    indices = list(range(len(words)))
    random.shuffle(indices)
    replaced = 0
    for idx in indices:
        candidates = get_synonyms(words[idx])
        if candidates:
            words[idx] = random.choice(candidates)
            replaced += 1
        if replaced >= n:
            break
    return ' '.join(words)
# Usage: generate four augmented variants of one sentence.
original = "This movie is absolutely wonderful"
augmented = eda(original, num_aug=4)
print("Original:", original)
# Each variant differs randomly from the original.
for i, aug in enumerate(augmented):
    print(f"Augmented {i+1}:", aug)
Handling Class Imbalance
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
# Compute class weights
# NOTE(review): `train_labels` must be defined upstream (sequence of integer
# class ids for the training split) — it is not shown in this snippet.
class_weights = compute_class_weight(
    'balanced',  # weight_c = n_samples / (n_classes * count(c))
    classes=np.unique(train_labels),
    y=train_labels
)
class_weights = torch.tensor(class_weights, dtype=torch.float)
# Use in loss function: minority classes contribute more to the loss.
criterion = nn.CrossEntropyLoss(weight=class_weights)
# Alternative: Focal Loss
class FocalLoss(nn.Module):
    """Focal loss (Lin et al., 2017) for imbalanced classification.

    Per example: alpha * (1 - p_t)^gamma * CE, where p_t = exp(-CE) is the
    probability assigned to the true class. Well-classified examples
    (p_t near 1) are down-weighted; returns the batch mean.
    """

    def __init__(self, alpha=1.0, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        per_example_ce = nn.functional.cross_entropy(inputs, targets, reduction='none')
        true_class_prob = torch.exp(-per_example_ce)
        modulating = (1 - true_class_prob) ** self.gamma
        return (self.alpha * modulating * per_example_ce).mean()
# gamma=2 is the standard focusing parameter from the focal loss paper.
focal_loss = FocalLoss(gamma=2.0)
# In training loop
loss = focal_loss(logits, labels)
Advanced Techniques
Ensemble Methods
def ensemble_predictions(texts, models, tokenizer, device):
    """Average the logits of several classifiers and argmax the result.

    Returns (predicted class ids, averaged logits). Each model is switched
    to eval mode and run under torch.no_grad().
    """
    collected = []
    for clf in models:
        clf.eval()
        encoded = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
        with torch.no_grad():
            collected.append(clf(**encoded).logits)
    # Mean over the model axis, then pick the highest-scoring class.
    avg_logits = torch.stack(collected).mean(dim=0)
    return torch.argmax(avg_logits, dim=-1), avg_logits
Attention Visualization
import matplotlib.pyplot as plt
import seaborn as sns
def visualize_attention(text, model, tokenizer, layer=11, head=0):
    """Plot a heatmap of one BERT self-attention head for `text`.

    NOTE(review): `outputs[-1]` assumes the model returns a tuple whose last
    element is the attention tensors — confirm for the model config in use.
    """
    token_ids = tokenizer.encode(text, return_tensors='pt')
    outputs = model(token_ids, output_attentions=True)
    attentions = outputs[-1]
    # Select one (layer, head) slice for the single batch element.
    weights = attentions[layer][0, head].detach().cpu().numpy()
    tick_labels = tokenizer.convert_ids_to_tokens(token_ids[0])
    plt.figure(figsize=(10, 8))
    sns.heatmap(weights, xticklabels=tick_labels, yticklabels=tick_labels, cmap='viridis')
    plt.title(f'Attention Layer {layer}, Head {head}')
    plt.show()
Production Considerations
Model Serving
from fastapi import FastAPI
from pydantic import BaseModel
app = FastAPI()
# Load the fine-tuned checkpoint saved by the training script above.
model = AutoModelForSequenceClassification.from_pretrained('./sentiment_model')
tokenizer = AutoTokenizer.from_pretrained('./sentiment_model')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# eval() disables dropout so inference is deterministic.
model.to(device).eval()
class TextInput(BaseModel):
    # Request body for /classify: the raw text to score.
    text: str
@app.post('/classify')
async def classify(input: TextInput):
    """Classify `input.text` as positive/negative with softmax confidences.

    NOTE(review): model inference here blocks the event loop; under load,
    consider a sync endpoint or `run_in_executor` — confirm requirements.
    """
    encoded = tokenizer(input.text, return_tensors='pt', padding=True, truncation=True)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
    with torch.no_grad():
        logits = model(**encoded).logits
    probabilities = torch.softmax(logits, dim=-1)
    predicted_class = torch.argmax(probabilities, dim=-1).item()
    labels = {0: 'negative', 1: 'positive'}
    return {
        'label': labels[predicted_class],
        'confidence': probabilities[0, predicted_class].item(),
        'all_scores': {labels[i]: p.item() for i, p in enumerate(probabilities[0])}
    }
Key Takeaway
Text classification fundamentals apply across domains, but mastering fine-tuning strategies, multi-label handling, augmentation, and imbalance solutions distinguishes production systems from prototypes. Invest in data quality and careful evaluation metric selection.
Practical Exercise
Task: Build a toxicity classifier for online comments with multi-label support.
Requirements:
- Load Jigsaw Toxic Comments dataset (6 labels)
- Implement multi-label BERT classifier
- Apply class-weighted focal loss
- Augment training data with EDA
- Evaluate with per-label metrics
- Deploy with FastAPI endpoint
Evaluation:
- Achieve per-label F1 > 0.75
- Analyze false positives (non-toxic comments incorrectly flagged as toxic)
- Compare single-label vs. multi-label approach
- Benchmark inference latency
- Test edge cases and adversarial examples