Intermediate

Text Processing and Tokenization

Lesson 1 of 4 — Estimated Time: 50 min

Text Processing and Tokenization

Tokenization splits text into meaningful units, and the strategy chosen impacts model performance: word-level tokens lose subword information, character-level tokenization captures too much noise, while subword methods such as BPE and WordPiece offer a practical balance between the two.

Tokenization Approaches

from typing import List, Dict
import re
from collections import Counter

class WordTokenizer:
    """Simple word-level tokenizer with a capped vocabulary.

    ID 0 is reserved for the unknown token so that out-of-vocabulary
    words do not collide with a real word's ID.
    """

    UNK_TOKEN = '<unk>'  # always assigned ID 0

    def __init__(self, lowercase: bool = True):
        self.lowercase = lowercase  # fold case before tokenizing
        self.vocab = {}             # word -> corpus frequency
        self.word_to_id = {}        # word -> integer ID (UNK_TOKEN -> 0)
        self.id_to_word = {}        # integer ID -> word

    def tokenize(self, text: str) -> List[str]:
        """Split text into word tokens (runs of word characters)."""
        if self.lowercase:
            text = text.lower()
        return re.findall(r'\b\w+\b', text)

    def build_vocab(self, texts: List[str], vocab_size: int = 10000):
        """Build the vocabulary from the `vocab_size` most frequent words."""
        word_counts = Counter()
        for text in texts:
            word_counts.update(self.tokenize(text))

        common = word_counts.most_common(vocab_size)
        self.vocab = dict(common)

        # Reserve ID 0 for the unknown token; real words start at 1.
        # (Previously unknown words were encoded as 0, colliding with
        # the most frequent vocabulary word.)
        self.word_to_id = {self.UNK_TOKEN: 0}
        for word, _ in common:
            self.word_to_id[word] = len(self.word_to_id)
        self.id_to_word = {i: word for word, i in self.word_to_id.items()}

    def encode(self, text: str) -> List[int]:
        """Convert text to token IDs; unknown words map to 0 (UNK)."""
        return [self.word_to_id.get(token, 0) for token in self.tokenize(text)]

    def decode(self, ids: List[int]) -> str:
        """Convert IDs back to a space-joined string.

        Unknown IDs become the UNK token instead of the empty string,
        which previously produced stray spaces in the output.
        """
        return ' '.join(self.id_to_word.get(id_, self.UNK_TOKEN) for id_ in ids)

class BytePairEncoding:
    """Byte Pair Encoding tokenization (Sennrich et al., 2016).

    Text is split into whitespace-delimited words; each word starts as a
    sequence of characters terminated by '</w>'. Training repeatedly
    merges the most frequent adjacent symbol pair.
    """

    def __init__(self, vocab_size: int = 1000):
        self.vocab_size = vocab_size  # maximum number of merge operations
        self.bpe_merges = []          # learned merges, in training order
        self.vocab = {}               # final symbol string -> frequency

    def get_stats(self, vocab: Dict) -> Dict:
        """Count frequency of adjacent symbol pairs, weighted by word frequency."""
        pairs = Counter()
        for word, freq in vocab.items():
            symbols = word.split()
            for left, right in zip(symbols, symbols[1:]):
                pairs[left, right] += freq
        return pairs

    def merge_vocab(self, pair: tuple, vocab: Dict) -> Dict:
        """Merge every occurrence of `pair` into a single symbol.

        Uses whitespace-boundary lookarounds so only whole symbols match.
        (A bare str.replace could match across symbol boundaries, e.g.
        merging ('b', 'c') would corrupt 'ab c' into 'abc'.)
        """
        bigram = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
        replacement = ''.join(pair)
        return {bigram.sub(replacement, word): freq for word, freq in vocab.items()}

    def train(self, texts: List[str]):
        """Learn BPE merges from texts.

        Texts are split on whitespace first so spaces never become merge
        symbols (previously a whole text was treated as one pseudo-word).
        """
        vocab = Counter()
        for text in texts:
            for word in text.split():
                vocab[' '.join(word) + ' </w>'] += 1
        vocab = dict(vocab)

        for _ in range(self.vocab_size):
            pairs = self.get_stats(vocab)
            if not pairs:
                break  # every word is a single symbol; nothing left to merge
            best = max(pairs, key=pairs.get)
            vocab = self.merge_vocab(best, vocab)
            self.bpe_merges.append(best)

        self.vocab = vocab

    def encode(self, text: str) -> List[str]:
        """Encode text by replaying the learned merges on each word."""
        tokens = []
        for word in text.split():
            symbols = ' '.join(word) + ' </w>'
            for pair in self.bpe_merges:
                pattern = r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)'
                symbols = re.sub(pattern, ''.join(pair), symbols)
            tokens.extend(symbols.split())
        return tokens

class WordPieceTokenizer:
    """WordPiece tokenization (BERT-style greedy longest-match-first)."""

    UNK_TOKEN = "[UNK]"  # placeholder for untokenizable words

    def __init__(self, vocab_size: int = 30522):
        self.vocab_size = vocab_size  # was accepted but never stored
        self.vocab = {}               # token -> ID
        self.subword_prefix = "##"    # marks word-internal subwords

    def train_from_words(self, words: List[str], vocab_size=None):
        """Train the WordPiece vocabulary by iterative pair merging.

        `vocab_size` defaults to the value given at construction.
        """
        if vocab_size is None:
            vocab_size = self.vocab_size

        # Seed with word-initial characters and '##'-prefixed continuations.
        vocab = set()
        for word in words:
            if not word:
                continue
            vocab.add(word[0])
            for ch in word[1:]:
                vocab.add(f"{self.subword_prefix}{ch}")

        # Greedily add the merge of the most frequent adjacent token pair.
        while len(vocab) < vocab_size:
            pairs = Counter()
            for word in words:
                tokens = self._tokenize_word(word, vocab)
                for left, right in zip(tokens, tokens[1:]):
                    pairs[left, right] += 1

            if not pairs:
                break  # every word is one token; no merges possible

            left, right = max(pairs, key=pairs.get)
            new_token = left + right.replace(self.subword_prefix, '', 1)
            if new_token in vocab:
                # Vocab unchanged => tokenization unchanged => identical
                # pairs next iteration. (Previously this looped forever.)
                break
            vocab.add(new_token)

        self.vocab = {token: i for i, token in enumerate(sorted(vocab))}

    def _tokenize_word(self, word: str, vocab: set) -> List[str]:
        """Greedy longest-match tokenization of a single word.

        Returns [UNK_TOKEN] when some character has no match at all —
        the previous version never advanced `start` and spun forever.
        Non-initial pieces are matched in their '##'-prefixed form
        (previously the raw substring was tested but the prefixed token
        appended, which could emit tokens absent from the vocab).
        """
        tokens = []
        start = 0
        while start < len(word):
            end = len(word)
            match = None
            while end > start:
                piece = word[start:end]
                if start > 0:
                    piece = f"{self.subword_prefix}{piece}"
                if piece in vocab:
                    match = piece
                    break
                end -= 1
            if match is None:
                return [self.UNK_TOKEN]  # whole word unknown (BERT behavior)
            tokens.append(match)
            start = end
        return tokens

    def encode(self, text: str) -> List[int]:
        """Encode lowercased, whitespace-split text to token IDs.

        Tokens not in the vocabulary (including UNK_TOKEN) map to ID 0.
        """
        vocab_set = set(self.vocab)  # hoisted out of the per-word loop
        ids = []
        for word in text.lower().split():
            for token in self._tokenize_word(word, vocab_set):
                ids.append(self.vocab.get(token, 0))
        return ids

SentencePiece Tokenization

Language-agnostic tokenization that treats raw text, including whitespace, as a sequence of characters, so it needs no language-specific pre-tokenization.

class SentencePieceTokenizer:
    """Placeholder wrapper for SentencePiece tokenization.

    This is a stub for teaching purposes: training and inference are
    delegated to the google/sentencepiece library in production.
    """

    def __init__(self, vocab_size: int = 8000):
        # Target vocabulary size for the (not yet trained) model.
        self.vocab_size = vocab_size
        # Trained SentencePiece model; remains None in this stub.
        self.model = None

    def train(self, texts: List[str]):
        """Train a SentencePiece model (no-op in this stub).

        In production use the google/sentencepiece library:
            from sentencepiece import SentencePieceProcessor
            sp = SentencePieceProcessor()
            sp.Train(model_type='unigram', input=..., vocab_size=vocab_size)
        """
        pass

    def encode(self, text: str) -> List[int]:
        """Encode text to piece IDs (stub: always empty).

        Real implementation: self.model.encode_as_ids(text)
        """
        return []

    def decode(self, ids: List[int]) -> str:
        """Decode piece IDs back to text (stub: always empty string).

        Real implementation: self.model.decode_ids(ids)
        """
        return ""

Handling Special Tokens and Edge Cases

class SpecialTokenManager:
    """Manage special tokens and their IDs (BERT-style defaults)."""

    def __init__(self):
        # Default special-token -> ID mapping.
        self.special_tokens = {
            "[PAD]": 0,
            "[UNK]": 1,
            "[CLS]": 2,
            "[SEP]": 3,
            "[MASK]": 4
        }

    def add_special_token(self, token: str, id_: int):
        """Register an additional special token (or remap an existing one)."""
        self.special_tokens[token] = id_

    def is_special(self, token: str) -> bool:
        """Return True if `token` is a registered special token."""
        return token in self.special_tokens

    def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
        """Map tokens to IDs; non-special tokens map to the [UNK] ID.

        The unknown ID is looked up from the table (falling back to 1)
        rather than hard-coded, so a remapped [UNK] is respected.
        """
        unk_id = self.special_tokens.get("[UNK]", 1)
        return [self.special_tokens.get(token, unk_id) for token in tokens]

Key Takeaway

Tokenization choice impacts model performance. BPE and WordPiece balance vocabulary size and expressiveness. Special tokens such as [CLS], [SEP], and [MASK] give models structural signals like sequence boundaries and masked positions. Proper tokenization is crucial for NLP success.

Exercises

  1. Implement word-level tokenizer
  2. Train BPE on 100K words
  3. Implement WordPiece tokenization
  4. Compare token counts across methods
  5. Handle edge cases (punctuation, numbers)
  6. Build vocabulary from corpus
  7. Analyze token distribution