Context Engineering for AI Agents: Memory and State Management
Context is the lifeblood of AI agents. While human conversations flow naturally with shared understanding, AI agents must explicitly manage what they remember, how they access information, and when to forget. This article explores memory architectures and context optimization techniques that make agents truly intelligent.
The Context Challenge
LLMs have finite context windows—typically 4K to 128K tokens. For long-running conversations or complex tasks, this limitation becomes critical. Agents need strategic approaches to memory management that go beyond stuffing everything into the prompt.
Memory Architecture Patterns
1. Episodic Memory: Conversation History
Episodic memory stores chronological conversation history, similar to how humans remember specific interactions.
import hashlib
import json
from collections import deque
from datetime import datetime, timedelta
class EpisodicMemory:
    """Chronological conversation memory.

    Buffers user/agent exchanges into an open "current episode"; finalized
    episodes are archived in a bounded deque so the oldest are evicted
    automatically.
    """

    def __init__(self, max_episodes: int = 100):
        # deque(maxlen=...) silently drops the oldest episode once full.
        self.episodes = deque(maxlen=max_episodes)
        self.current_episode = []

    def add_interaction(self, user_input: str, agent_response: str):
        """Record one user/agent exchange in the open episode."""
        # NOTE(review): context_length is a character count used as a token
        # proxy — confirm whether a real tokenizer should be used here.
        interaction = {
            'timestamp': datetime.now().isoformat(),
            'user': user_input,
            'agent': agent_response,
            'context_length': len(user_input + agent_response)
        }
        self.current_episode.append(interaction)

    def finalize_episode(self, summary: str = None):
        """Close the open episode and archive it with an optional summary."""
        episode = {
            'interactions': list(self.current_episode),
            'summary': summary,
            'total_tokens': sum(i['context_length'] for i in self.current_episode)
        }
        self.episodes.append(episode)
        self.current_episode.clear()

    def get_relevant_history(self, query: str, max_tokens: int = 2000) -> str:
        """Return formatted relevant episodes, capped at max_tokens characters."""
        # Retrieve most relevant episodes based on semantic similarity.
        # _rank_episodes_by_relevance / _format_episode are not defined in
        # this file — presumably provided elsewhere; TODO confirm.
        relevant_episodes = self._rank_episodes_by_relevance(query)
        history = []
        used = 0
        for episode in relevant_episodes:
            formatted = self._format_episode(episode)
            if used + len(formatted) > max_tokens:
                break
            history.append(formatted)
            used += len(formatted)
        return "\n".join(history)

# 2. Semantic Memory: Knowledge Base
Semantic memory stores factual information and learned concepts, implemented through vector databases.
import chromadb
from sentence_transformers import SentenceTransformer
class SemanticMemory:
    """Vector-store-backed factual memory.

    Extracts discrete facts from content with an LLM and indexes them in a
    ChromaDB collection for semantic retrieval.
    """

    def __init__(self, collection_name: str = "agent_knowledge", llm=None):
        self.client = chromadb.PersistentClient()
        self.collection = self.client.get_or_create_collection(collection_name)
        self.encoder = SentenceTransformer('all-MiniLM-L6-v2')
        # Fix: _extract_facts used self.llm, but it was never assigned.
        # Accepted here as a backward-compatible keyword with a default.
        self.llm = llm

    def store_knowledge(self, content: str, metadata: dict = None):
        """Split content into discrete facts and index each one."""
        facts = self._extract_facts(content)
        for i, fact in enumerate(facts):
            embedding = self.encoder.encode([fact])[0]
            # Fix: builtin hash() is salted per process (PYTHONHASHSEED),
            # so ids would differ between runs and duplicate entries would
            # accumulate; a content digest is stable across runs.
            fact_digest = hashlib.sha1(fact.encode('utf-8')).hexdigest()
            self.collection.add(
                embeddings=[embedding.tolist()],
                documents=[fact],
                metadatas=[metadata or {}],
                ids=[f"fact_{fact_digest}_{i}"]
            )

    def retrieve_knowledge(self, query: str, top_k: int = 5) -> list:
        """Return the top_k stored facts most similar to the query."""
        query_embedding = self.encoder.encode([query])[0]
        results = self.collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=top_k
        )
        return results['documents'][0]

    def _extract_facts(self, content: str) -> list:
        """Ask the LLM to decompose content into one fact per line."""
        prompt = f"""
Extract key facts from this content as a list:
{content}
Return only factual statements, one per line.
"""
        # Implementation depends on your LLM setup.
        # Fix: drop blank/whitespace-only lines so empty strings are never
        # embedded and indexed.
        lines = self.llm.invoke(prompt).strip().split('\n')
        return [line.strip() for line in lines if line.strip()]

# 3. Working Memory: Active Context
Working memory manages the current task context—what the agent is actively thinking about.
class WorkingMemory:
    """Priority-ordered active context with a bounded size budget."""

    def __init__(self, capacity: int = 8000):  # tokens
        # NOTE(review): capacity is enforced in characters, used as a rough
        # token proxy — confirm whether real token counts are intended.
        self.capacity = capacity
        self.active_context = {}
        self.priorities = {}

    def set_context(self, key: str, value: str, priority: int = 1):
        """Add or update context with priority"""
        self.active_context[key] = value
        self.priorities[key] = priority
        self._enforce_capacity()

    def get_context(self) -> str:
        """Return formatted active context"""
        by_priority_desc = sorted(
            self.active_context,
            key=self.priorities.get,
            reverse=True,
        )
        return "\n".join(
            f"{key}: {self.active_context[key]}" for key in by_priority_desc
        )

    def _enforce_capacity(self):
        """Remove lowest priority items if over capacity"""
        used = sum(len(text) for text in self.active_context.values())
        while used > self.capacity and self.active_context:
            # Evict the lowest-priority entry first.
            victim = min(self.priorities, key=self.priorities.get)
            used -= len(self.active_context.pop(victim))
            del self.priorities[victim]

# Context Compression Techniques
1. Hierarchical Summarization
Break long conversations into digestible summaries at multiple levels.
class HierarchicalSummarizer:
    """Recursively compress a long conversation via chunked LLM summaries."""

    def __init__(self, llm):
        self.llm = llm
        # Budget per chunk, measured in characters as a token proxy.
        self.chunk_size = 1000  # tokens

    def compress_conversation(self, conversation: str) -> str:
        """Return the conversation compressed toward chunk_size.

        Splits the text into chunks, summarizes each, and recurses on the
        combined summaries while they keep shrinking.
        """
        if len(conversation) <= self.chunk_size:
            return conversation
        # _split_into_chunks is not defined in this file — TODO confirm.
        chunks = self._split_into_chunks(conversation)
        summaries = [self._summarize_chunk(chunk) for chunk in chunks]
        combined_summaries = "\n".join(summaries)
        # Fix: only recurse when summarization actually shrank the text;
        # otherwise an LLM that fails to compress would recurse forever.
        if self.chunk_size < len(combined_summaries) < len(conversation):
            return self.compress_conversation(combined_summaries)
        return combined_summaries

    def _summarize_chunk(self, chunk: str) -> str:
        """Summarize one chunk, preserving decisions, facts, and context."""
        prompt = f"""
Summarize this conversation chunk, preserving:
- Key decisions made
- Important facts mentioned
- Context needed for future reference
Chunk:
{chunk}
Summary:
"""
        return self.llm.invoke(prompt).strip()

# 2. Importance-Based Filtering
Keep only the most relevant information based on task importance.
class ImportanceFilter:
    """Rank context items for the current task and keep the best few."""

    def filter_context(self, context_items: list, current_task: str,
                       max_items: int = 10) -> list:
        """Return up to max_items items, best first, scored by a weighted
        blend of relevance (0.5), recency (0.3), and importance (0.2)."""
        def blended_score(item):
            # Scoring helpers are not defined in this file — TODO confirm.
            return (self._calculate_relevance(item, current_task) * 0.5
                    + self._calculate_recency(item) * 0.3
                    + self._calculate_importance(item) * 0.2)
        ranked = sorted(context_items, key=blended_score, reverse=True)
        return ranked[:max_items]

# Retrieval-Augmented Generation (RAG) Integration
Combine memory systems with external knowledge retrieval.
class RAGEnabledAgent:
    """Agent that answers queries by combining working, episodic, semantic,
    and external knowledge contexts, then writes back to its memories."""

    def __init__(self, llm=None):
        self.episodic_memory = EpisodicMemory()
        self.semantic_memory = SemanticMemory()
        self.working_memory = WorkingMemory()
        # ExternalKnowledgeBase is not defined in this file — TODO confirm.
        self.external_knowledge = ExternalKnowledgeBase()
        # Fix: generate_response used self.llm, but it was never assigned.
        # Accepted as a backward-compatible keyword with a default.
        self.llm = llm

    def generate_response(self, query: str) -> str:
        """Generate an answer to query using all memory tiers, then record
        the interaction and any extracted knowledge."""
        # 1. Update working memory with current query
        self.working_memory.set_context("current_query", query, priority=10)
        # 2. Retrieve relevant episodic memories
        relevant_history = self.episodic_memory.get_relevant_history(query)
        # 3. Query semantic memory
        relevant_knowledge = self.semantic_memory.retrieve_knowledge(query)
        # 4. External knowledge retrieval (if needed)
        external_context = self.external_knowledge.search(query)
        # 5. Combine contexts with priority weighting
        # (_combine_contexts is not defined in this file — TODO confirm.)
        context = self._combine_contexts(
            working_memory=self.working_memory.get_context(),
            history=relevant_history,
            knowledge=relevant_knowledge,
            external=external_context
        )
        # 6. Generate response
        response = self.llm.generate(query, context)
        # 7. Update memories
        self.episodic_memory.add_interaction(query, response)
        self._extract_and_store_knowledge(query, response)
        return response

# State Persistence Strategies
Database-Backed Persistence
import sqlite3
import pickle
class PersistentMemoryManager:
    """SQLite-backed persistence for serialized agent memory state.

    WARNING: state is (de)serialized with pickle — pickle.loads can execute
    arbitrary code, so only open databases from trusted sources.
    """

    def __init__(self, db_path: str = "agent_memory.db"):
        self.conn = sqlite3.connect(db_path)
        self._initialize_tables()

    def _initialize_tables(self):
        """Create the memory_states table if needed.

        Fix: this method was called from __init__ but never defined; the
        schema is inferred from the INSERT/SELECT statements below.
        """
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS memory_states (
                agent_id TEXT PRIMARY KEY,
                state BLOB,
                updated_at TEXT
            )
        """)
        self.conn.commit()

    def save_memory_state(self, agent_id: str, memory_state: dict):
        """Upsert the pickled memory state for agent_id."""
        serialized_state = pickle.dumps(memory_state)
        self.conn.execute("""
            INSERT OR REPLACE INTO memory_states (agent_id, state, updated_at)
            VALUES (?, ?, datetime('now'))
        """, (agent_id, serialized_state))
        self.conn.commit()

    def load_memory_state(self, agent_id: str) -> dict:
        """Return the stored state for agent_id, or {} if none exists."""
        cursor = self.conn.execute("""
            SELECT state FROM memory_states WHERE agent_id = ?
        """, (agent_id,))
        row = cursor.fetchone()
        return pickle.loads(row[0]) if row else {}

    def close(self):
        """Release the underlying database connection."""
        self.conn.close()

# Performance Optimization
1. Lazy Loading
class LazyMemoryLoader:
    """Cache that loads each memory entry from storage on first access only."""

    def __init__(self):
        self._memory_cache = {}
        self._loaded_keys = set()

    def get_memory(self, key: str):
        """Return the value for key, loading it on the first request."""
        if key in self._loaded_keys:
            return self._memory_cache[key]
        # _load_from_storage is not defined in this file — TODO confirm.
        value = self._load_from_storage(key)
        self._memory_cache[key] = value
        self._loaded_keys.add(key)
        return value

# 2. Memory Pruning
class MemoryPruner:
    """Archive-then-delete pruning of memories past a retention window."""

    def prune_old_memories(self, memory_store, retention_days: int = 30):
        """Compress memories older than retention_days into an archive on
        memory_store, then delete the originals."""
        cutoff_date = datetime.now() - timedelta(days=retention_days)
        # Archive old memories instead of deleting them outright.
        stale = memory_store.get_memories_before(cutoff_date)
        # _create_compressed_archive is not defined in this file — TODO confirm.
        archive = self._create_compressed_archive(stale)
        memory_store.store_archive(archive)
        memory_store.delete_memories_before(cutoff_date)

# Best Practices
- Design for Forgetting: Not all information deserves permanent storage. Implement decay functions for less important memories.
- Context Budgeting: Allocate token budgets across memory types based on task requirements.
- Incremental Updates: Update memory systems incrementally rather than reprocessing everything.
- Memory Validation: Regularly validate that stored information remains accurate and relevant.
Next Steps
Effective context engineering transforms simple chatbots into intelligent agents with persistent knowledge and learning capabilities. The next article will explore how to scale these memory-enabled agents across distributed systems while maintaining consistency and performance.
Remember: the goal isn’t perfect memory—it’s useful memory that enhances agent performance without overwhelming the system.