Intermediate
Text Processing and Tokenization
Tokenization splits text into meaningful units. Different tokenization strategies impact model performance: word-level tokens lose subword information, character-level captures too much noise, while subword methods like BPE and WordPiece offer optimal balance.
Tokenization Approaches
from typing import List, Dict
import re
from collections import Counter
class WordTokenizer:
    """Simple word-level tokenizer with a fixed-size vocabulary.

    Unknown words map to a reserved ``<unk>`` token at ID 0, so they can
    never collide with a real vocabulary word. (Previously unknown words
    shared ID 0 with the most frequent word.)
    """

    UNK_TOKEN = "<unk>"

    def __init__(self, lowercase: bool = True):
        # Whether to case-fold text before tokenizing.
        self.lowercase = lowercase
        self.vocab = {}        # word -> corpus frequency
        self.word_to_id = {}   # word -> integer ID (0 reserved for <unk>)
        self.id_to_word = {}   # inverse of word_to_id

    def tokenize(self, text: str) -> List[str]:
        """Split text into word tokens (runs of word characters)."""
        if self.lowercase:
            text = text.lower()
        return re.findall(r'\b\w+\b', text)

    def build_vocab(self, texts: List[str], vocab_size: int = 10000):
        """Build the vocabulary from the most frequent words in ``texts``.

        Args:
            texts: Corpus to count words over.
            vocab_size: Maximum number of real words to keep (excluding <unk>).
        """
        word_counts = Counter()
        for text in texts:
            word_counts.update(self.tokenize(text))
        common = word_counts.most_common(vocab_size)
        self.vocab = dict(common)
        # Reserve ID 0 for the unknown token; real words start at 1.
        self.word_to_id = {self.UNK_TOKEN: 0}
        for i, (word, _) in enumerate(common, start=1):
            self.word_to_id[word] = i
        self.id_to_word = {i: word for word, i in self.word_to_id.items()}

    def encode(self, text: str) -> List[int]:
        """Convert text to token IDs; unknown words become ID 0 (<unk>)."""
        return [self.word_to_id.get(token, 0) for token in self.tokenize(text)]

    def decode(self, ids: List[int]) -> str:
        """Convert IDs back to a space-joined string (unknown IDs -> '')."""
        return ' '.join(self.id_to_word.get(id_, '') for id_ in ids)
class BytePairEncoding:
    """Byte Pair Encoding (BPE) tokenization.

    Learns an ordered list of symbol-pair merges from a corpus; encoding
    replays the merges in order. Merges are applied only on whole-symbol
    boundaries (a plain ``str.replace`` would corrupt symbols that merely
    end/start with the pair's characters).
    """

    def __init__(self, vocab_size: int = 1000):
        # Maximum number of merge operations to learn.
        self.vocab_size = vocab_size
        self.bpe_merges = []  # ordered list of (left, right) merges
        self.vocab = {}       # final symbol-sequence -> frequency map

    def get_stats(self, vocab: Dict) -> Dict:
        """Count frequencies of adjacent symbol pairs across the vocab."""
        pairs = Counter()
        for word, freq in vocab.items():
            symbols = word.split()
            for left, right in zip(symbols, symbols[1:]):
                pairs[left, right] += freq
        return pairs

    def _merge_pattern(self, pair: tuple):
        """Compile a regex matching ``pair`` only at whole-symbol boundaries."""
        bigram = re.escape(' '.join(pair))
        # (?<!\S)/(?!\S): the pair must not be glued to another symbol.
        # Fix: plain str.replace for ('a','b') would turn "xa b" into "xab",
        # wrongly splitting the symbol 'xa'.
        return re.compile(r'(?<!\S)' + bigram + r'(?!\S)')

    def merge_vocab(self, pair: tuple, vocab: Dict) -> Dict:
        """Apply one merge to every word in the vocabulary."""
        pattern = self._merge_pattern(pair)
        replacement = ''.join(pair)
        return {pattern.sub(replacement, word): freq
                for word, freq in vocab.items()}

    def train(self, texts: List[str]):
        """Learn up to ``vocab_size`` merges from ``texts``.

        Each text is split into characters with a ``</w>`` end-of-word
        marker; the most frequent adjacent pair is merged repeatedly.
        """
        vocab = {}
        for text in texts:
            word = ' '.join(list(text)) + ' </w>'
            vocab[word] = vocab.get(word, 0) + 1
        for _ in range(self.vocab_size):
            pairs = self.get_stats(vocab)
            if not pairs:
                break  # nothing left to merge
            best = max(pairs, key=pairs.get)
            vocab = self.merge_vocab(best, vocab)
            self.bpe_merges.append(best)
        self.vocab = vocab

    def encode(self, text: str) -> List[str]:
        """Encode text by replaying the learned merges in training order."""
        word = ' '.join(list(text)) + ' </w>'
        for pair in self.bpe_merges:
            word = self._merge_pattern(pair).sub(''.join(pair), word)
        return word.split()
class WordPieceTokenizer:
    """WordPiece tokenization (BERT-style greedy longest-match-first)."""

    UNK_TOKEN = "[UNK]"

    def __init__(self, vocab_size: int = 30522):
        # Fix: vocab_size was previously accepted but never stored.
        self.vocab_size = vocab_size
        self.vocab = {}               # token -> ID
        self.subword_prefix = "##"    # marks word-internal continuations

    def train_from_words(self, words: List[str], vocab_size: int):
        """Grow a subword vocabulary until ``vocab_size`` (or no pairs remain).

        Starts from single characters (continuations prefixed with ##) and
        repeatedly promotes the most frequent adjacent token pair.
        """
        vocab = set()
        for word in words:
            if not word:
                continue  # empty strings have no characters to seed
            vocab.add(word[0])
            for ch in word[1:]:
                vocab.add(f"{self.subword_prefix}{ch}")
        while len(vocab) < vocab_size:
            pairs = Counter()
            for word in words:
                tokens = self._tokenize_word(word, vocab)
                for left, right in zip(tokens, tokens[1:]):
                    pairs[left, right] += 1
            if not pairs:
                break  # every word is already a single token
            best_left, best_right = max(pairs, key=pairs.get)
            # Joining drops the continuation prefix from the right piece.
            vocab.add(best_left + best_right.replace(self.subword_prefix, ''))
        self.vocab = {token: i for i, token in enumerate(sorted(vocab))}

    def _tokenize_word(self, word: str, vocab: set) -> List[str]:
        """Greedy longest-match tokenization of a single word.

        Returns ``["[UNK]"]`` when some position has no matching subword.
        (Fixes an infinite loop in the original when a character was not in
        the vocab, and a bug where an unprefixed substring match could emit
        a ##-prefixed token that is not actually in the vocab.)
        """
        tokens = []
        start = 0
        while start < len(word):
            end = len(word)
            match = None
            while start < end:
                piece = word[start:end]
                if start > 0:
                    piece = f"{self.subword_prefix}{piece}"
                if piece in vocab:
                    match = piece
                    break
                end -= 1
            if match is None:
                # No subword fits at this position: whole word is unknown.
                return [self.UNK_TOKEN]
            tokens.append(match)
            start = end
        return tokens

    def encode(self, text: str) -> List[int]:
        """Encode text as vocabulary IDs (unmapped pieces fall back to 0)."""
        vocab_set = set(self.vocab)  # hoisted: was rebuilt for every word
        ids = []
        for word in text.lower().split():
            for piece in self._tokenize_word(word, vocab_set):
                ids.append(self.vocab.get(piece, 0))
        return ids
SentencePiece Tokenization
Language-agnostic tokenization treating text as sequences of characters.
class SentencePieceTokenizer:
    """Placeholder SentencePiece tokenizer.

    Real training and inference would be delegated to the
    google/sentencepiece library in production; this stub only sketches
    the interface (train is a no-op, encode/decode return empty results).
    """

    def __init__(self, vocab_size: int = 8000):
        self.vocab_size = vocab_size  # target size for the unigram model
        self.model = None             # would hold a SentencePieceProcessor

    def train(self, texts: List[str]):
        """Train a SentencePiece model (no-op stub).

        Production code would call e.g.::

            from sentencepiece import SentencePieceProcessor
            sp = SentencePieceProcessor()
            sp.Train(model_type='unigram', input=..., vocab_size=vocab_size)
        """
        pass

    def encode(self, text: str) -> List[int]:
        """Encode text to piece IDs (stub: always empty).

        Real implementation: ``self.model.encode_as_ids(text)``.
        """
        return []

    def decode(self, ids: List[int]) -> str:
        """Decode piece IDs back to text (stub: always empty string).

        Real implementation: ``self.model.decode_ids(ids)``.
        """
        return ""
Handling Special Tokens and Edge Cases
class SpecialTokenManager:
    """Manage special tokens (padding, unknown, separators, masking)."""

    def __init__(self):
        # Default BERT-style special token IDs.
        self.special_tokens = {
            "[PAD]": 0,
            "[UNK]": 1,
            "[CLS]": 2,
            "[SEP]": 3,
            "[MASK]": 4
        }

    def add_special_token(self, token: str, id_: int):
        """Register (or remap) a special token."""
        self.special_tokens[token] = id_

    def is_special(self, token: str) -> bool:
        """Return True if ``token`` is a registered special token."""
        return token in self.special_tokens

    def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
        """Map tokens to IDs; non-special tokens fall back to the [UNK] ID.

        Fix: the fallback looks up the current "[UNK]" mapping instead of a
        hard-coded 1, so it stays correct if [UNK] is remapped via
        add_special_token (1 remains the default if [UNK] was removed).
        """
        unk_id = self.special_tokens.get("[UNK]", 1)
        return [self.special_tokens.get(token, unk_id) for token in tokens]
Key Takeaway
Tokenization choice impacts model performance. BPE and WordPiece balance vocabulary size and expressiveness. Special tokens enable model instructions. Proper tokenization is crucial for NLP success.
Exercises
- Implement word-level tokenizer
- Train BPE on 100K words
- Implement WordPiece tokenization
- Compare token counts across methods
- Handle edge cases (punctuation, numbers)
- Build vocabulary from corpus
- Analyze token distribution