Memory, Chat History, and Callbacks
Conversations get better when models remember context. LangChain provides memory implementations that manage conversation history. Additionally, callbacks let you hook into the chain execution to log, debug, or customize behavior.
Memory Types
Buffer Memory
Keeps all messages in memory—simple but can exceed context limits:
from langchain.memory import ConversationBufferMemory
from langchain_openai import ChatOpenAI
from langchain.chains import LLMChain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# Create memory (the default memory_key is "history")
memory = ConversationBufferMemory(return_messages=True)

# Create the chain; the MessagesPlaceholder marks where history is injected
model = ChatOpenAI()
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant"),
    MessagesPlaceholder(variable_name="history"),
    ("user", "{input}"),
])
chain = LLMChain(llm=model, prompt=prompt, memory=memory)
# First turn
response1 = chain.run(input="My name is Alice")
print(response1)
# Second turn - model remembers Alice's name
response2 = chain.run(input="What is my name?")
print(response2)
# Check memory
print(memory.buffer) # See full history
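Newer LangChain releases deprecate these memory classes in favor of wrapping a chain with RunnableWithMessageHistory. A minimal sketch, assuming an in-memory session store (the store dict and session id are illustrative):

from langchain_core.chat_history import InMemoryChatMessageHistory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_openai import ChatOpenAI

# Illustrative session store: one history object per session id
store = {}

def get_history(session_id: str):
    if session_id not in store:
        store[session_id] = InMemoryChatMessageHistory()
    return store[session_id]

prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant"),
    MessagesPlaceholder(variable_name="history"),
    ("user", "{input}"),
])
chain = RunnableWithMessageHistory(
    prompt | ChatOpenAI(),
    get_history,
    input_messages_key="input",
    history_messages_key="history",
)
config = {"configurable": {"session_id": "alice"}}
print(chain.invoke({"input": "My name is Alice"}, config=config).content)
print(chain.invoke({"input": "What is my name?"}, config=config).content)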
Summary Memory
Maintains a running summary of the conversation instead of raw messages, keeping context small:

from langchain.memory import ConversationSummaryMemory
from langchain_openai import ChatOpenAI

model = ChatOpenAI()
memory = ConversationSummaryMemory(llm=model)
# Add messages
memory.save_context(
{"input": "What is machine learning?"},
{"output": "ML is a subset of AI..."}
)
memory.save_context(
{"input": "Give an example"},
{"output": "A spam detector learns from emails..."}
)
# The buffer now holds a running summary of the conversation
print(memory.buffer)
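A related class, ConversationSummaryBufferMemory, keeps the most recent turns verbatim and folds older ones into the summary once a token limit is exceeded:

from langchain.memory import ConversationSummaryBufferMemory

memory = ConversationSummaryBufferMemory(llm=model, max_token_limit=200)
memory.save_context(
    {"input": "What is machine learning?"},
    {"output": "ML is a subset of AI..."}
)

# Turns that no longer fit under the limit are summarized into moving_summary_buffer
print(memory.moving_summary_buffer)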
Token Buffer Memory
Keeps messages up to a token limit (not character limit):
from langchain.memory import ConversationTokenBufferMemory
from langchain_openai import ChatOpenAI
model = ChatOpenAI()
memory = ConversationTokenBufferMemory(
llm=model,
max_token_limit=1000 # Keep ~1000 tokens
)
# Add messages
memory.save_context(
{"input": "Tell me about AI"},
{"output": "AI is..."}
)
# Memory tracks tokens, not characters
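Once the limit is exceeded, the oldest messages are pruned first. A quick way to observe this (the loop content is just filler):

for i in range(50):
    memory.save_context(
        {"input": f"Question {i}"},
        {"output": f"A reasonably long answer to question {i}..."}
    )

# Only the most recent turns that fit under max_token_limit remain
print(memory.load_memory_variables({})["history"])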
Entity Memory
Tracks specific entities and facts about them:
from langchain.memory import ConversationEntityMemory
from langchain_openai import ChatOpenAI
model = ChatOpenAI()
memory = ConversationEntityMemory(llm=model)
# Add context with entities
memory.save_context(
{"input": "Alice works at Google as an engineer"},
{"output": "Thanks for sharing"}
)
# Memory extracts entities and stores facts about them
print(memory.entity_store.store)  # dict mapping entity name -> summary
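Entity facts are looked up based on entities mentioned in the current input. A hedged sketch (the exact returned keys may vary by version):

# load_memory_variables scans the input for known entities
variables = memory.load_memory_variables({"input": "Where does Alice work?"})
print(variables["entities"])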
Building a Conversation Handler
Create a reusable conversation manager:
from langchain_openai import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from typing import Optional

class ConversationHandler:
    """Manage a multi-turn conversation."""

    def __init__(self, system_prompt: str = "You are a helpful assistant"):
        self.model = ChatOpenAI()
        self.memory = ConversationBufferMemory(return_messages=True)
        self.system_prompt = system_prompt

    def chat(self, user_message: str, callbacks: Optional[list] = None) -> str:
        """Send a message and get a response."""
        from langchain.chains import LLMChain
        from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

        # History is injected via the placeholder; the memory object records
        # each new turn automatically after the chain runs.
        prompt = ChatPromptTemplate.from_messages([
            ("system", self.system_prompt),
            MessagesPlaceholder(variable_name="history"),
            ("user", "{input}"),
        ])
        chain = LLMChain(llm=self.model, prompt=prompt, memory=self.memory)
        return chain.run(input=user_message, callbacks=callbacks)

    def get_history(self) -> str:
        """Get conversation history as a readable string."""
        return self.memory.buffer_as_str

    def clear(self):
        """Clear conversation history."""
        self.memory.clear()
# Usage
conversation = ConversationHandler(
system_prompt="You are a Python programming assistant"
)
response1 = conversation.chat("What's the best way to iterate over a list?")
print(f"Assistant: {response1}")
response2 = conversation.chat("How about with an index?") # Remembers context
print(f"Assistant: {response2}")
print(f"History:\n{conversation.get_history()}")
Callbacks
Callbacks hook into chain execution at specific points:
from langchain.callbacks.base import BaseCallbackHandler
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

class LoggingCallback(BaseCallbackHandler):
    """Log chain execution events."""

    def on_llm_start(self, serialized, prompts, **kwargs):
        print(f"LLM called with {len(prompts)} prompt(s)")

    def on_llm_end(self, response, **kwargs):
        print(f"LLM returned {len(response.generations)} generation(s)")

    def on_llm_error(self, error, **kwargs):
        print(f"LLM error: {error}")

    def on_chain_start(self, serialized, inputs, **kwargs):
        # With LCEL, inputs may be any value, not always a dict
        keys = list(inputs.keys()) if isinstance(inputs, dict) else type(inputs).__name__
        print(f"Chain started with inputs: {keys}")

    def on_chain_end(self, outputs, **kwargs):
        keys = list(outputs.keys()) if isinstance(outputs, dict) else type(outputs).__name__
        print(f"Chain finished with outputs: {keys}")
# Use callback
callback = LoggingCallback()
model = ChatOpenAI()
prompt = ChatPromptTemplate.from_template("What is {topic}?")
chain = prompt | model | StrOutputParser()
result = chain.invoke(
{"topic": "machine learning"},
config={"callbacks": [callback]}
)
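Rather than passing callbacks on every call, you can bind them to the chain once with with_config:

# Bind the callback once; every invocation of logged_chain now reports events
logged_chain = chain.with_config(callbacks=[callback])
result = logged_chain.invoke({"topic": "deep learning"})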
Multiple Callbacks
Combine multiple callbacks:
class CountingCallback(BaseCallbackHandler):
    """Count LLM calls and (where reported) tokens."""

    def __init__(self):
        self.token_count = 0
        self.call_count = 0

    def on_llm_start(self, serialized, prompts, **kwargs):
        self.call_count += 1

    def on_llm_end(self, response, **kwargs):
        # Token usage is provider-dependent; OpenAI reports it in llm_output
        usage = (response.llm_output or {}).get("token_usage", {})
        self.token_count += usage.get("total_tokens", 0)
# Use multiple callbacks
callbacks = [LoggingCallback(), CountingCallback()]
result = chain.invoke(
{"topic": "AI"},
config={"callbacks": callbacks}
)
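After the run, read the counters off the handler:

counting = callbacks[1]
print(f"LLM calls: {counting.call_count}, total tokens: {counting.token_count}")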
Tracing with LangSmith
LangSmith provides detailed tracing of chain execution:
import os

# Enable tracing and set the API key
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGSMITH_API_KEY"] = "your-api-key"

# With tracing enabled, LangSmith traces chains automatically;
# just run your chain normally
result = chain.invoke({"topic": "machine learning"})

# View traces at https://smith.langchain.com
The dashboard shows a detailed trace of each step: the prompts sent, the outputs received, token counts, and latencies.
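Traces are easier to find when runs are named and tagged; run_name and tags are standard RunnableConfig fields:

result = chain.invoke(
    {"topic": "machine learning"},
    config={"run_name": "topic-explainer", "tags": ["demo", "docs"]}
)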
Custom Callbacks for Monitoring
Track metrics about your chains:
import time
from langchain.callbacks.base import BaseCallbackHandler
class PerformanceCallback(BaseCallbackHandler):
    """Track performance metrics."""

    def __init__(self):
        self.start_time = None
        self.metrics = {
            "total_calls": 0,
            "total_time": 0.0,
            "errors": 0,
        }

    def on_llm_start(self, serialized, prompts, **kwargs):
        self.start_time = time.time()

    def on_llm_end(self, response, **kwargs):
        elapsed = time.time() - self.start_time
        self.metrics["total_calls"] += 1
        self.metrics["total_time"] += elapsed

    def on_llm_error(self, error, **kwargs):
        self.metrics["errors"] += 1

    def get_metrics(self):
        if self.metrics["total_calls"] == 0:
            return self.metrics
        return {
            **self.metrics,
            "avg_time": self.metrics["total_time"] / self.metrics["total_calls"],
        }
# Usage
perf_callback = PerformanceCallback()
for i in range(5):
    chain.invoke(
        {"topic": f"topic{i}"},
        config={"callbacks": [perf_callback]}
    )
metrics = perf_callback.get_metrics()
print(f"Avg time per call: {metrics['avg_time']:.2f}s")
Memory + Callbacks Together
Combine memory and callbacks for full observability:
class ConversationMonitor(BaseCallbackHandler):
    """Monitor conversation quality."""

    def __init__(self):
        self.conversation_turns = 0
        self.error_count = 0

    def on_chain_end(self, outputs, **kwargs):
        self.conversation_turns += 1

    def on_chain_error(self, error, **kwargs):
        self.error_count += 1

# Use in conversation; the monitor only fires if it is registered with
# each run, here via the callbacks parameter that chat() forwards
monitor = ConversationMonitor()
conversation = ConversationHandler()
for i in range(3):
    user_input = f"Question {i}"
    response = conversation.chat(user_input, callbacks=[monitor])
print(f"Turns completed: {monitor.conversation_turns}")
print(f"Errors: {monitor.error_count}")
Streaming Callbacks
Handle streaming output in real-time:
class StreamingCallback(BaseCallbackHandler):
    """Handle streaming tokens."""

    def on_llm_new_token(self, token: str, **kwargs):
        """Print each token as it arrives."""
        print(token, end="", flush=True)
# Enable streaming
model = ChatOpenAI(streaming=True)
chain = prompt | model | StrOutputParser()
result = chain.invoke(
{"topic": "AI"},
config={"callbacks": [StreamingCallback()]}
)
print() # Newline
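With LCEL chains you can also stream without a callback by iterating over chain.stream():

# Each chunk is a piece of the parsed string output
for chunk in chain.stream({"topic": "AI"}):
    print(chunk, end="", flush=True)
print()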
Key Takeaway
Memory keeps conversation context alive across multiple turns. LangChain provides several memory types: ConversationBufferMemory for storing all messages, ConversationSummaryMemory for compression, ConversationTokenBufferMemory for token limits, and ConversationEntityMemory for tracking facts about entities. Callbacks hook into chain execution for logging, monitoring, and tracing. Combine memory and callbacks to build observable, context-aware conversational systems, and use LangSmith for production tracing and debugging.
Exercises
- Buffer memory: Build a multi-turn conversation using ConversationBufferMemory. Verify it maintains context.
- Memory types: Compare different memory types (Buffer, Summary, TokenBuffer) on the same conversation. Observe the differences.
- Custom callback: Create a callback that logs all prompts and responses to a file.
- Performance monitoring: Build a callback that tracks latency and error rates. Calculate statistics.
- Conversation handler: Create a reusable ConversationHandler class that manages memory, handles errors, and maintains history.
- LangSmith tracing: Set up LangSmith and trace a complex chain. Examine the trace for insights.