All Articles

Context Engineering for AI Agents: Memory and State Management

Context is the lifeblood of AI agents. While human conversations flow naturally with shared understanding, AI agents must explicitly manage what they remember, how they access information, and when to forget. This article explores memory architectures and context optimization techniques that make agents truly intelligent.

The Context Challenge

LLMs have finite context windows—typically 4K to 128K tokens. For long-running conversations or complex tasks, this limitation becomes critical. Agents need strategic approaches to memory management that go beyond stuffing everything into the prompt.

Memory Architecture Patterns

1. Episodic Memory: Conversation History

Episodic memory stores chronological conversation history, similar to how humans remember specific interactions.

import hashlib
import json
from collections import deque
from datetime import datetime, timedelta

class EpisodicMemory:
    """Chronological conversation memory with bounded capacity.

    Finalized "episodes" (groups of user/agent interactions) are kept in a
    deque so the oldest episodes are evicted automatically once
    ``max_episodes`` is reached.
    """

    def __init__(self, max_episodes: int = 100):
        # deque(maxlen=...) drops the oldest episode once full.
        self.episodes = deque(maxlen=max_episodes)
        self.current_episode = []

    def add_interaction(self, user_input: str, agent_response: str) -> None:
        """Append one user/agent exchange to the episode in progress."""
        interaction = {
            'timestamp': datetime.now().isoformat(),
            'user': user_input,
            'agent': agent_response,
            # Character count is used as a cheap proxy for token count.
            'context_length': len(user_input + agent_response)
        }
        self.current_episode.append(interaction)

    def finalize_episode(self, summary: str = None) -> None:
        """Close the current episode and archive it with an optional summary."""
        episode = {
            'interactions': list(self.current_episode),
            'summary': summary,
            # NOTE: sum of character lengths, not true tokens (see above).
            'total_tokens': sum(i['context_length'] for i in self.current_episode)
        }
        self.episodes.append(episode)
        self.current_episode.clear()

    def get_relevant_history(self, query: str, max_tokens: int = 2000) -> str:
        """Return formatted text of the most relevant episodes within a budget.

        Episodes are greedily appended in relevance order until the next one
        would exceed ``max_tokens`` (character-based budget).
        """
        relevant_episodes = self._rank_episodes_by_relevance(query)

        history = []
        token_count = 0
        for episode in relevant_episodes:
            episode_text = self._format_episode(episode)
            if token_count + len(episode_text) > max_tokens:
                break
            history.append(episode_text)
            token_count += len(episode_text)

        return "\n".join(history)

    def _rank_episodes_by_relevance(self, query: str) -> list:
        """Rank archived episodes by keyword overlap with *query*, best first.

        This is a lightweight lexical stand-in for semantic similarity: each
        episode is scored by how many distinct query words appear in its
        summary and interaction text.
        """
        query_words = set(query.lower().split())

        def overlap(episode):
            text = episode['summary'] or ""
            for interaction in episode['interactions']:
                text += " " + interaction['user'] + " " + interaction['agent']
            return len(query_words & set(text.lower().split()))

        return sorted(self.episodes, key=overlap, reverse=True)

    def _format_episode(self, episode: dict) -> str:
        """Render one episode as plain text, preferring its summary if set."""
        if episode.get('summary'):
            return f"Summary: {episode['summary']}"
        return "\n".join(
            f"User: {i['user']}\nAgent: {i['agent']}"
            for i in episode['interactions']
        )

2. Semantic Memory: Knowledge Base

Semantic memory stores factual information and learned concepts, implemented through vector databases.

import chromadb
from sentence_transformers import SentenceTransformer

class SemanticMemory:
    """Durable fact store backed by a persistent Chroma vector collection.

    Facts are extracted from raw content by an LLM, embedded with a
    sentence-transformer model, and retrieved by semantic similarity.
    """

    def __init__(self, collection_name: str = "agent_knowledge", llm=None):
        self.client = chromadb.PersistentClient()
        self.collection = self.client.get_or_create_collection(collection_name)
        self.encoder = SentenceTransformer('all-MiniLM-L6-v2')
        # Fix: self.llm was read by _extract_facts() but never assigned.
        # Must expose .invoke(prompt) -> str; required by store_knowledge().
        self.llm = llm

    def store_knowledge(self, content: str, metadata: dict = None):
        """Extract discrete facts from *content* and index each one.

        Raises:
            RuntimeError: if no LLM was supplied for fact extraction.
        """
        facts = self._extract_facts(content)

        for i, fact in enumerate(facts):
            embedding = self.encoder.encode([fact])[0]
            self.collection.add(
                embeddings=[embedding.tolist()],
                documents=[fact],
                metadatas=[metadata or {}],
                # Fix: sha1 digests are stable across runs; the builtin
                # hash() is salted per process (PYTHONHASHSEED), so the same
                # fact would be re-added under a new id after every restart.
                ids=[f"fact_{hashlib.sha1(fact.encode('utf-8')).hexdigest()}_{i}"]
            )

    def retrieve_knowledge(self, query: str, top_k: int = 5) -> list:
        """Return the *top_k* stored facts most similar to *query*."""
        query_embedding = self.encoder.encode([query])[0]
        results = self.collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=top_k
        )
        # Chroma returns one document list per query embedding; we sent one.
        return results['documents'][0]

    def _extract_facts(self, content: str) -> list:
        """Ask the LLM to break *content* into one-line factual statements."""
        if self.llm is None:
            raise RuntimeError(
                "SemanticMemory requires an LLM for fact extraction; "
                "pass llm= to __init__"
            )
        prompt = f"""
        Extract key facts from this content as a list:
        {content}
        
        Return only factual statements, one per line.
        """
        # Drop blank lines the model may emit between facts.
        lines = self.llm.invoke(prompt).strip().split('\n')
        return [line for line in lines if line.strip()]

3. Working Memory: Active Context

Working memory manages the current task context—what the agent is actively thinking about.

class WorkingMemory:
    """Priority-ordered scratchpad holding the agent's active task context."""

    def __init__(self, capacity: int = 8000):  # tokens
        self.capacity = capacity
        self.active_context = {}
        self.priorities = {}

    def set_context(self, key: str, value: str, priority: int = 1):
        """Store or overwrite one context entry, then evict if over budget."""
        self.active_context[key] = value
        self.priorities[key] = priority
        self._enforce_capacity()

    def get_context(self) -> str:
        """Render every entry as a "key: value" line, highest priority first."""
        ordered = sorted(
            self.active_context,
            key=lambda name: -self.priorities[name],
        )
        lines = [f"{name}: {self.active_context[name]}" for name in ordered]
        return "\n".join(lines)

    def _enforce_capacity(self):
        """Drop the lowest-priority entries until total size fits the budget."""
        used = sum(len(text) for text in self.active_context.values())

        while used > self.capacity and self.active_context:
            # Evict the entry with the smallest priority value.
            victim = min(self.priorities, key=self.priorities.get)
            used -= len(self.active_context.pop(victim))
            del self.priorities[victim]

Context Compression Techniques

1. Hierarchical Summarization

Break long conversations into digestible summaries at multiple levels.

class HierarchicalSummarizer:
    """Recursively compresses a long conversation into layered summaries."""

    def __init__(self, llm):
        # llm must expose .invoke(prompt) -> str.
        self.llm = llm
        # Budget per chunk; measured in characters here as a token proxy.
        self.chunk_size = 1000  # tokens

    def compress_conversation(self, conversation: str) -> str:
        """Return *conversation* unchanged if short, else a recursive summary.

        Each pass summarizes chunk-by-chunk; if the combined summaries are
        still over budget, they are fed back through the same process.
        """
        if len(conversation) <= self.chunk_size:
            return conversation

        # Split into chunks
        chunks = self._split_into_chunks(conversation)

        # Summarize each chunk
        summaries = [self._summarize_chunk(chunk) for chunk in chunks]

        # If summaries are still too long, recursively compress
        combined_summaries = "\n".join(summaries)
        if len(combined_summaries) > self.chunk_size:
            return self.compress_conversation(combined_summaries)

        return combined_summaries

    def _split_into_chunks(self, text: str) -> list:
        """Split *text* on line boundaries into pieces of <= chunk_size chars.

        Fix: this helper was called but never defined. A single line longer
        than chunk_size becomes its own oversized chunk rather than being cut
        mid-line, so no content is ever lost.
        """
        chunks, current = [], ""
        for line in text.splitlines():
            # +1 accounts for the newline that joins the line into `current`.
            if current and len(current) + len(line) + 1 > self.chunk_size:
                chunks.append(current)
                current = line
            else:
                current = f"{current}\n{line}" if current else line
        if current:
            chunks.append(current)
        return chunks

    def _summarize_chunk(self, chunk: str) -> str:
        """Summarize one chunk via the LLM, preserving key decisions/facts."""
        prompt = f"""
        Summarize this conversation chunk, preserving:
        - Key decisions made
        - Important facts mentioned
        - Context needed for future reference
        
        Chunk:
        {chunk}
        
        Summary:
        """
        return self.llm.invoke(prompt).strip()

2. Importance-Based Filtering

Keep only the most relevant information based on task importance.

class ImportanceFilter:
    """Selects the context items that matter most for the current task."""

    def filter_context(self, context_items: list, current_task: str,
                      max_items: int = 10) -> list:
        """Rank *context_items* by a weighted composite score and keep the best.

        The composite weights relevance (0.5), recency (0.3), and importance
        (0.2); the per-dimension scoring helpers are supplied elsewhere.
        Returns at most *max_items* items, best first.
        """
        def composite(item):
            return (self._calculate_relevance(item, current_task) * 0.5 +
                    self._calculate_recency(item) * 0.3 +
                    self._calculate_importance(item) * 0.2)

        ranked = sorted(context_items, key=composite, reverse=True)
        return ranked[:max_items]

Retrieval-Augmented Generation (RAG) Integration

Combine memory systems with external knowledge retrieval.

class RAGEnabledAgent:
    """Agent that fuses working, episodic, and semantic memory with external RAG."""

    def __init__(self, llm=None):
        # Fix: self.llm was read in generate_response() but never assigned.
        # It must expose .generate(query, context) -> str.
        self.llm = llm
        self.episodic_memory = EpisodicMemory()
        self.semantic_memory = SemanticMemory()
        self.working_memory = WorkingMemory()
        # NOTE(review): ExternalKnowledgeBase is not defined in this snippet;
        # assumed to be provided elsewhere in the project -- confirm.
        self.external_knowledge = ExternalKnowledgeBase()

    def generate_response(self, query: str) -> str:
        """Answer *query* using all memory tiers, then record the exchange."""
        # 1. Update working memory with current query (highest priority).
        self.working_memory.set_context("current_query", query, priority=10)

        # 2. Retrieve relevant episodic memories
        relevant_history = self.episodic_memory.get_relevant_history(query)

        # 3. Query semantic memory
        relevant_knowledge = self.semantic_memory.retrieve_knowledge(query)

        # 4. External knowledge retrieval (if needed)
        external_context = self.external_knowledge.search(query)

        # 5. Combine contexts with priority weighting
        context = self._combine_contexts(
            working_memory=self.working_memory.get_context(),
            history=relevant_history,
            knowledge=relevant_knowledge,
            external=external_context
        )

        # 6. Generate response
        response = self.llm.generate(query, context)

        # 7. Update memories
        self.episodic_memory.add_interaction(query, response)
        self._extract_and_store_knowledge(query, response)

        return response

    def _combine_contexts(self, **sections) -> str:
        """Join non-empty context sections under labeled headers.

        Fix: this helper was called but never defined. List-valued sections
        (e.g. retrieved knowledge) are flattened one item per line.
        """
        parts = []
        for name, content in sections.items():
            if isinstance(content, list):
                content = "\n".join(str(item) for item in content)
            if content:
                parts.append(f"[{name}]\n{content}")
        return "\n\n".join(parts)

    def _extract_and_store_knowledge(self, query: str, response: str) -> None:
        """Persist facts from the latest exchange into semantic memory.

        Fix: this helper was called but never defined; it delegates fact
        extraction and indexing to SemanticMemory.store_knowledge().
        """
        self.semantic_memory.store_knowledge(f"Q: {query}\nA: {response}")

State Persistence Strategies

Database-Backed Persistence

import sqlite3
import pickle

class PersistentMemoryManager:
    """Saves and restores per-agent memory snapshots in a SQLite database."""

    def __init__(self, db_path: str = "agent_memory.db"):
        self.conn = sqlite3.connect(db_path)
        self._initialize_tables()

    def _initialize_tables(self) -> None:
        """Create the backing table on first use (idempotent).

        Fix: this method was called from __init__ but never defined.
        """
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS memory_states (
                agent_id TEXT PRIMARY KEY,
                state BLOB NOT NULL,
                updated_at TEXT NOT NULL
            )
        """)
        self.conn.commit()

    def save_memory_state(self, agent_id: str, memory_state: dict) -> None:
        """Upsert the pickled *memory_state* for *agent_id*."""
        serialized_state = pickle.dumps(memory_state)
        self.conn.execute("""
            INSERT OR REPLACE INTO memory_states (agent_id, state, updated_at)
            VALUES (?, ?, datetime('now'))
        """, (agent_id, serialized_state))
        self.conn.commit()

    def load_memory_state(self, agent_id: str) -> dict:
        """Return the last saved state for *agent_id*, or {} if none exists.

        SECURITY: pickle.loads executes arbitrary code embedded in the blob;
        only load databases that this process itself wrote.
        """
        cursor = self.conn.execute("""
            SELECT state FROM memory_states WHERE agent_id = ?
        """, (agent_id,))
        row = cursor.fetchone()
        return pickle.loads(row[0]) if row else {}

Performance Optimization

1. Lazy Loading

class LazyMemoryLoader:
    """Loads memory segments from storage on first access and caches them."""

    def __init__(self):
        # Fix: the original kept a separate _loaded_keys set that merely
        # mirrored the cache's key set -- redundant bookkeeping that could
        # drift out of sync. The cache itself records what has been loaded:
        # a key is loaded iff it is present in the dict.
        self._memory_cache = {}

    def get_memory(self, key: str):
        """Return the memory stored under *key*, loading it exactly once.

        The loaded value is cached, so _load_from_storage is never invoked
        twice for the same key (even if it returned None).
        """
        if key not in self._memory_cache:
            self._memory_cache[key] = self._load_from_storage(key)
        return self._memory_cache[key]

2. Memory Pruning

class MemoryPruner:
    """Moves memories older than a retention window into a compressed archive."""

    def prune_old_memories(self, memory_store, retention_days: int = 30):
        """Archive (not delete) every memory older than *retention_days* days.

        NOTE(review): relies on `datetime`/`timedelta` being imported at
        module level and on `self._create_compressed_archive`, which is not
        defined in this snippet -- confirm both exist in the full project.
        `memory_store` is duck-typed: it must provide get_memories_before,
        store_archive, and delete_memories_before.
        """
        cutoff_date = datetime.now() - timedelta(days=retention_days)
        
        # Archive old memories instead of deleting
        old_memories = memory_store.get_memories_before(cutoff_date)
        archived = self._create_compressed_archive(old_memories)
        
        # Deletion happens only after the archive has been stored, so a
        # failure in store_archive leaves the originals intact.
        memory_store.store_archive(archived)
        memory_store.delete_memories_before(cutoff_date)

Best Practices

  1. Design for Forgetting: Not all information deserves permanent storage. Implement decay functions for less important memories.

  2. Context Budgeting: Allocate token budgets across memory types based on task requirements.

  3. Incremental Updates: Update memory systems incrementally rather than reprocessing everything.

  4. Memory Validation: Regularly validate that stored information remains accurate and relevant.

Next Steps

Effective context engineering transforms simple chatbots into intelligent agents with persistent knowledge and learning capabilities. The next article will explore how to scale these memory-enabled agents across distributed systems while maintaining consistency and performance.

Remember: the goal isn’t perfect memory—it’s useful memory that enhances agent performance without overwhelming the system.