Working with Document Loaders and Text Splitters
Real AI systems don’t just answer questions from a single prompt—they need to process documents. LangChain provides loaders for reading documents and splitters for breaking them into manageable chunks. This lesson covers loading various document types and splitting them intelligently.
Document Loaders
Loaders read content from various sources and convert it to LangChain Document objects. A Document has page_content (text) and metadata (info about the source):
from langchain_core.documents import Document

# A Document pairs the raw text with metadata describing its origin.
document = Document(
    page_content="This is the main text content",
    metadata={"source": "file.txt", "page": 1},
)
print(document.page_content)
print(document.metadata)
Loading Text Files
from langchain_community.document_loaders import TextLoader

# Each loaded file becomes one Document with its path stored in metadata.
loader = TextLoader("myfile.txt")
docs = loader.load()
for doc in docs:
    print(f"Content length: {len(doc.page_content)}")
    print(f"Source: {doc.metadata['source']}")
Loading Multiple Files
from pathlib import Path
from langchain_community.document_loaders import TextLoader

# Load all files in a directory: build one loader per *.txt file, then
# flatten the per-file results into a single list of Documents.
directory = Path("./documents")
loaders = [
    TextLoader(str(file))
    for file in directory.glob("*.txt")
]
docs = []
for loader in loaders:
    docs.extend(loader.load())
print(f"Loaded {len(docs)} documents")
Loading from Web
from langchain_community.document_loaders import WebBaseLoader

# Load a single webpage
loader = WebBaseLoader("https://example.com/article")
docs = loader.load()

# Load multiple pages by fetching each URL in turn.
urls = [
    "https://example.com/page1",
    "https://example.com/page2",
]
docs = []
for url in urls:
    loader = WebBaseLoader(url)
    docs.extend(loader.load())
Loading PDFs
from langchain_community.document_loaders import PyPDFLoader

# Load PDF
loader = PyPDFLoader("document.pdf")
docs = loader.load()
# Each page becomes a separate document with its own metadata.
for i, doc in enumerate(docs):
    print(f"Page {i}: {len(doc.page_content)} chars")
    print(f"Metadata: {doc.metadata}")
Loading CSV Files
from langchain_community.document_loaders import CSVLoader

# Each row of the CSV becomes its own document.
loader = CSVLoader("data.csv")
docs = loader.load()
for doc in docs:
    print(doc.page_content)
Loading JSON Files
from langchain_community.document_loaders import JSONLoader

# jq_schema is a JQ-style path pointing at the field that holds the text.
loader = JSONLoader(
    file_path="data.json",
    jq_schema=".data[].content",
)
docs = loader.load()
Custom Loaders
Create a loader for custom file formats:
from langchain_core.document_loaders import BaseLoader


class CustomMarkdownLoader(BaseLoader):
    """Load a Markdown file and emit one Document per H1 section.

    The file is split on ``\\n# `` boundaries; section 0 is any preamble
    before the first H1 header.
    """

    def __init__(self, file_path: str):
        self.file_path = file_path

    def load(self) -> 'list[Document]':
        """Read the file and return a Document for each H1 section."""
        # Explicit encoding avoids platform-dependent default codecs.
        with open(self.file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        # Split by H1 headers. str.split() consumes the '\n# ' marker, so
        # it is restored for every section after the preamble to keep the
        # header text intact.
        sections = content.split('\n# ')
        docs = []
        for i, section in enumerate(sections):
            text = section if i == 0 else f"# {section}"
            docs.append(
                Document(
                    page_content=text,
                    metadata={
                        "source": self.file_path,
                        "section": i,
                    },
                )
            )
        return docs
# Use custom loader: each H1 section of document.md becomes one Document.
loader = CustomMarkdownLoader("document.md")
docs = loader.load()
Text Splitters
Documents are often too large for context windows. Splitters break them into chunks:
from langchain_text_splitters import CharacterTextSplitter

# chunk_overlap repeats trailing context at the start of the next chunk
# so that content cut at a boundary is not lost.
splitter = CharacterTextSplitter(
    separator="\n\n",    # Split on double newlines (paragraph breaks)
    chunk_size=1000,     # Max 1000 characters per chunk
    chunk_overlap=200    # 200 chars overlap between chunks
)
text = "Long document text here..."
chunks = splitter.split_text(text)
for i, chunk in enumerate(chunks):
    print(f"Chunk {i}: {len(chunk)} chars")
Recursive Character Splitter
More intelligent splitting that tries different separators:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Separators are tried in order: paragraph breaks first, then single
# newlines, then spaces, and finally individual characters.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = splitter.split_text(text)
This tries splitting by paragraph, then sentence, then word, then character—preserving document structure.
Document-Based Splitting
Keep metadata intact when splitting:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

# Load documents
loader = TextLoader("myfile.txt")
docs = loader.load()

# split_documents() copies each source Document's metadata onto every
# chunk, so chunks still know which file they came from.
split_docs = splitter.split_documents(docs)
for doc in split_docs:
    print(f"Content: {doc.page_content[:50]}...")
    print(f"Source: {doc.metadata['source']}")
Language-Specific Splitters
Split code intelligently by language:
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

# from_language() preselects separators suited to the language's syntax
# (e.g. class/def boundaries for Python).
splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=1000,
    chunk_overlap=200
)
# Example code must be valid Python (indentation matters) for the
# language-aware separators to be meaningful.
python_code = """
def fibonacci(n):
    if n <= 1:
        return n
    return fibonacci(n-1) + fibonacci(n-2)
"""
chunks = splitter.split_text(python_code)
Supported languages: PYTHON, JAVA, JAVASCRIPT, TYPESCRIPT, GOLANG, RUST, CPP, KOTLIN, SQL, MARKDOWN, LATEX, HTML, SOL.
Chunk Size Optimization
Choosing good chunk sizes is important:
class ChunkSizeAnalyzer:
    """Analyze the impact of chunk size on splitting results."""

    @staticmethod
    def analyze_chunk_sizes(
        text: str,
        sizes: 'list[int] | None' = None,
    ) -> dict:
        """Split *text* at several chunk sizes and report statistics.

        Args:
            text: The document text to split.
            sizes: Chunk sizes (in characters) to test; defaults to
                256/512/1024/2048.

        Returns:
            Mapping of chunk size to stats: ``num_chunks``,
            ``avg_chunk_size``, ``largest_chunk``, ``smallest_chunk``.
            Sizes that produce no chunks (e.g. empty text) are skipped
            to avoid a division by zero.
        """
        # Avoid a mutable default argument; build the default per call.
        if sizes is None:
            sizes = [256, 512, 1024, 2048]
        results = {}
        for size in sizes:
            splitter = RecursiveCharacterTextSplitter(
                chunk_size=size,
                # Overlap scales with chunk size (10%).
                chunk_overlap=int(size * 0.1)
            )
            chunks = splitter.split_text(text)
            if not chunks:
                continue
            lengths = [len(c) for c in chunks]
            results[size] = {
                "num_chunks": len(chunks),
                "avg_chunk_size": sum(lengths) / len(lengths),
                "largest_chunk": max(lengths),
                "smallest_chunk": min(lengths)
            }
        return results
# Analyze a document across the default chunk sizes.
text = "Long document..."
analysis = ChunkSizeAnalyzer.analyze_chunk_sizes(text)
for size, stats in analysis.items():
    print(f"Size {size}: {stats['num_chunks']} chunks")
Metadata Preservation
Keep important metadata through the pipeline:
from langchain_core.documents import Document
def add_source_metadata(loader, source_name: str):
    """Load documents and tag each with a custom source name.

    Args:
        loader: Any object exposing a ``load()`` method that returns
            Documents (objects with a mutable ``metadata`` dict).
        source_name: Value stored under the ``custom_source`` metadata key.

    Returns:
        The loaded documents, with metadata mutated in place.
    """
    docs = loader.load()
    for doc in docs:
        doc.metadata["custom_source"] = source_name
        # Add timestamp, user, etc. here as needed.
    return docs
# Usage: tag everything loaded from file.txt as a user upload.
loader = TextLoader("file.txt")
docs = add_source_metadata(loader, "user_upload")
for doc in docs:
    print(f"Source: {doc.metadata['custom_source']}")
Loading and Splitting Pipeline
Full pipeline from loading to splitting:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
class DocumentPipeline:
    """End-to-end document loading and splitting."""

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        # One splitter instance is shared across every processed file.
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )

    def process_file(self, filepath: str) -> 'list[Document]':
        """Load one text file and return its split chunks.

        Each chunk keeps the loader's metadata and is additionally
        tagged with ``processed=True``.
        """
        loader = TextLoader(filepath)
        docs = loader.load()
        split_docs = self.splitter.split_documents(docs)
        # Mark chunks so downstream stages can tell raw and processed apart.
        for doc in split_docs:
            doc.metadata["processed"] = True
        return split_docs

    def process_directory(self, directory: str) -> 'list[Document]':
        """Load and split every ``*.txt`` file in *directory*.

        Note: the glob is non-recursive; only top-level files are read.
        """
        from pathlib import Path

        all_docs = []
        for filepath in Path(directory).glob("*.txt"):
            all_docs.extend(self.process_file(str(filepath)))
        return all_docs
# Usage: turn every .txt file under ./documents into tagged chunks.
doc_pipeline = DocumentPipeline(chunk_size=1000, chunk_overlap=200)
docs = doc_pipeline.process_directory("./documents")
print(f"Processed {len(docs)} chunks")
Handling Edge Cases
Deal with real-world document issues:
import json
class RobustDocumentLoader:
    """Load documents with per-file error handling.

    Failures never raise; they are recorded internally and can be
    retrieved afterwards via ``get_errors()``.
    """

    def __init__(self):
        # Human-readable descriptions of every failure seen so far.
        self.errors = []

    def load_file_safely(self, filepath: str) -> 'list[Document]':
        """Load *filepath* based on its extension.

        Returns the loaded documents, or an empty list when loading
        fails or the extension is unsupported (the problem is recorded
        in ``self.errors``).
        """
        docs = []
        try:
            # Try loading as text
            if filepath.endswith('.txt'):
                from langchain_community.document_loaders import TextLoader
                loader = TextLoader(filepath)
                docs = loader.load()
            elif filepath.endswith('.pdf'):
                from langchain_community.document_loaders import PyPDFLoader
                loader = PyPDFLoader(filepath)
                docs = loader.load()
            elif filepath.endswith('.json'):
                # Explicit encoding keeps behavior consistent across platforms.
                with open(filepath, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                # Record the source, consistent with the other loaders.
                docs.append(
                    Document(
                        page_content=json.dumps(data),
                        metadata={"source": filepath},
                    )
                )
            else:
                # Previously unknown extensions were silently skipped;
                # record them so callers can see why nothing was loaded.
                self.errors.append(f"Unsupported file type: {filepath}")
        except UnicodeDecodeError:
            self.errors.append(f"Encoding error in {filepath}")
        except Exception as e:
            # Boundary handler: record and continue rather than crash.
            self.errors.append(f"Error loading {filepath}: {e}")
        return docs

    def get_errors(self):
        """Return the list of error messages accumulated so far."""
        return self.errors
# Usage: attempt a load, then report any recorded problems.
loader = RobustDocumentLoader()
docs = loader.load_file_safely("problematic.txt")
if loader.get_errors():
    for error in loader.get_errors():
        print(f"⚠️ {error}")
Key Takeaway
Document loaders read content from various sources and convert them to LangChain Documents. Text splitters break large documents into manageable chunks while preserving structure through metadata. Choose chunk sizes carefully—typically 512-2048 characters. Use RecursiveCharacterTextSplitter to preserve document structure. Maintain metadata through the entire pipeline.
Exercises
- Load various formats: Load documents in .txt, .pdf, and .csv formats. Inspect the loaded content and metadata.
- Split optimization: Load a large document. Test different chunk sizes (256, 512, 1024, 2048). Analyze which works best.
- Custom loader: Create a loader for a custom format (YAML, TOML, or your choice).
- Metadata enrichment: Load documents and add custom metadata (document type, processing date, user ID).
- Error handling: Create a loader that handles encoding errors, missing files, and malformed data gracefully.
- Pipeline: Build an end-to-end pipeline that loads all documents from a directory, splits them, and prepares them for use.