上下文优化 Token压缩 Context Budget OpenClaw
世界上有两种上下文——一种装得下所有信息,一种装不下。凌晨1点42分,当我的Agent第10次因为上下文溢出而报错时,我终于明白:不是上下文不够大,是我不会用。就像冰箱塞不下不是因为冰箱小,是你不会整理。
上下文优化是Agent性能的关键瓶颈。本教程将系统讲解如何最大化Token利用率,通过智能裁剪、优先级调度、压缩算法让有限的上下文窗口发挥最大价值。
┌─────────────────────────────────────────┐
│ Level 4: 智能压缩                       │ ← 最高级:语义压缩、摘要
├─────────────────────────────────────────┤
│ Level 3: 优先级调度                     │ ← 动态选择最重要的内容
├─────────────────────────────────────────┤
│ Level 2: 智能裁剪                       │ ← 按规则裁剪不重要的内容
├─────────────────────────────────────────┤
│ Level 1: 基础管理                       │ ← Token计数、窗口监控
└─────────────────────────────────────────┘
from dataclasses import dataclass
from typing import List, Dict, Optional
from enum import Enum
import tiktoken
class Priority(Enum):
    """Retention priority of a context item.

    Members carry integer values from 1 to 5; code elsewhere in this file
    sorts on ``.value`` so that larger values are kept first.
    """

    CRITICAL = 5  # must be kept
    HIGH = 4  # high priority
    MEDIUM = 3  # medium priority
    LOW = 2  # low priority
    TRIVIAL = 1  # may be discarded
@dataclass
class ContextItem:
    """One piece of context together with its bookkeeping data.

    Attributes:
        id: Caller-chosen identifier, used in log output.
        content: The text of the item.
        priority: Retention priority of the item.
        token_count: Token size of ``content``. ``0`` means "not counted
            yet" and triggers counting with the ``cl100k_base`` encoding.
        timestamp: Caller-supplied time value; defaults to ``0.0``.
        tags: Free-form labels. ``None`` is replaced by a fresh empty list.
    """

    id: str
    content: str
    priority: Priority
    token_count: int = 0
    timestamp: float = 0.0
    # None is the "not given" sentinel; each instance then gets its own list.
    tags: Optional[List[str]] = None

    def __post_init__(self):
        """Fill in the defaults that cannot be expressed as field defaults."""
        if self.tags is None:
            self.tags = []
        # Empty content has zero tokens by definition, so the tokenizer is
        # only loaded when there is actually something to count.
        if self.token_count == 0 and self.content:
            enc = tiktoken.get_encoding("cl100k_base")
            self.token_count = len(enc.encode(self.content))
class ContextBudgetManager:
    """Collect context items and fit them into a token budget.

    The usable budget is ``max_tokens - reserved_tokens``. ``reserved_tokens``
    starts at 0 and is a plain attribute that callers may set to keep room
    for the model's output.
    """

    def __init__(self, max_tokens: int = 4096):
        """Create an empty manager with a total window of *max_tokens*."""
        self.max_tokens = max_tokens
        self.items: List[ContextItem] = []
        self.reserved_tokens = 0  # tokens held back for the output

    def add_item(self, item: ContextItem):
        """Append *item* to the managed items and log the addition."""
        self.items.append(item)
        print(f"📝 添加上下文: {item.id} ({item.token_count} tokens, 优先级: {item.priority.name})")

    def get_total_tokens(self) -> int:
        """Return the sum of ``token_count`` over all managed items."""
        return sum(item.token_count for item in self.items)

    def optimize(self) -> List[ContextItem]:
        """Return the items that fit into the usable budget.

        When everything already fits, ``self.items`` itself is returned
        (same list object, original order). Otherwise a new list is built
        greedily: items are visited by descending priority, then descending
        timestamp; an item that does not fit in the remaining budget is
        truncated to fit if possible and dropped otherwise. ``self.items``
        is not modified.
        """
        current_tokens = self.get_total_tokens()
        available_tokens = self.max_tokens - self.reserved_tokens
        if current_tokens <= available_tokens:
            print(f"✅ 上下文在预算内 ({current_tokens}/{available_tokens} tokens)")
            return self.items

        print(f"⚠️ 上下文超限 ({current_tokens}/{available_tokens} tokens),开始优化...")
        # Highest priority first; newer timestamp wins within a priority.
        sorted_items = sorted(self.items, key=lambda x: (-x.priority.value, -x.timestamp))

        optimized = []
        token_sum = 0
        for item in sorted_items:
            # _compress_item hands the item back untouched when it already
            # fits, so one call covers both the "fits" and "shrink" cases.
            fitted = self._compress_item(item, available_tokens - token_sum)
            if fitted is None:
                print(f"🗑️ 丢弃: {item.id} (优先级: {item.priority.name})")
                continue
            optimized.append(fitted)
            token_sum += fitted.token_count

        print(f"✅ 优化完成: {len(optimized)}/{len(self.items)} 条保留, {token_sum}/{available_tokens} tokens")
        return optimized

    def _compress_item(self, item: ContextItem, max_tokens: int) -> Optional[ContextItem]:
        """Shrink *item* so that it occupies at most *max_tokens* tokens.

        Returns:
            *item* itself when its ``token_count`` already fits; a truncated
            copy (id suffixed with ``_compressed``, tag ``"compressed"``
            added, truncation marker appended) when the text can be cut to
            fit; ``None`` when there is no room even for the marker.
        """
        if item.token_count <= max_tokens:
            return item
        if max_tokens <= 0:
            # Nothing left to spend; a negative slice bound below would
            # otherwise keep almost the whole text instead of cutting it.
            return None

        enc = tiktoken.get_encoding("cl100k_base")
        marker = "...[截断]"
        tokens = enc.encode(item.content)
        keep = max_tokens - len(enc.encode(marker))
        while keep > 0:
            truncated_content = enc.decode(tokens[:keep]) + marker
            # Re-count instead of assuming: decoding a cut token sequence
            # and appending the marker can re-tokenize differently.
            actual_tokens = len(enc.encode(truncated_content))
            if actual_tokens <= max_tokens:
                return ContextItem(
                    id=f"{item.id}_compressed",
                    content=truncated_content,
                    priority=item.priority,
                    token_count=actual_tokens,
                    timestamp=item.timestamp,
                    tags=item.tags + ["compressed"],
                )
            keep -= actual_tokens - max_tokens
        return None
class SmartContextTrimmer:
    """Adjust the items of a budget manager, then let it optimize.

    Every ``trim_*`` method changes state on the wrapped manager (item
    priorities or the item list itself) and returns ``manager.optimize()``.
    """

    def __init__(self, budget_manager: ContextBudgetManager):
        """Wrap *budget_manager*; its ``items`` are modified in place."""
        self.manager = budget_manager

    def trim_by_relevance(self, query: str) -> List[ContextItem]:
        """Raise the priority of items similar to *query*, then optimize.

        Similarity is the TF-IDF cosine similarity between *query* and each
        item's content. An item's priority value is increased by
        ``int(similarity * 2)`` and clamped to the range 1..5. The new
        priority is written back to the item, so repeated calls accumulate.
        """
        items = self.manager.items
        if not items:
            # With no documents there is nothing to score, and the
            # similarity call cannot handle an empty matrix.
            return self.manager.optimize()

        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.metrics.pairwise import cosine_similarity

        # Row 0 is the query; rows 1.. line up with ``items``.
        texts = [query] + [item.content for item in items]
        tfidf_matrix = TfidfVectorizer().fit_transform(texts)
        similarities = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1:])[0]

        for item, similarity in zip(items, similarities):
            relevance_boost = similarity * 2
            boosted = int(item.priority.value + relevance_boost)
            item.priority = Priority(max(1, min(5, boosted)))
        return self.manager.optimize()

    def trim_by_recency(self, keep_last_n: int = 10) -> List[ContextItem]:
        """Promote the *keep_last_n* newest items to at least HIGH, then optimize.

        "Newest" means largest ``timestamp``. Priorities are only raised,
        never lowered, and the change is written back to the items.
        """
        sorted_by_time = sorted(self.manager.items, key=lambda x: x.timestamp, reverse=True)
        # max(..., 0) keeps a negative count meaning "promote nothing".
        for item in sorted_by_time[:max(keep_last_n, 0)]:
            item.priority = Priority(max(item.priority.value, Priority.HIGH.value))
        return self.manager.optimize()

    def trim_by_redundancy(self) -> List[ContextItem]:
        """Drop near-duplicate items, then optimize.

        Two items are duplicates when ``difflib.SequenceMatcher`` rates their
        contents above 0.8. Of a duplicate pair the higher-priority item is
        kept (the earlier one on a tie), in the earlier item's position.
        ``manager.items`` is replaced by the de-duplicated list.
        """
        from difflib import SequenceMatcher

        unique_items = []
        for item in self.manager.items:
            for index, existing in enumerate(unique_items):
                similarity = SequenceMatcher(None, item.content, existing.content).ratio()
                if similarity > 0.8:  # more than 80% similar counts as redundant
                    print(f"🔄 发现冗余: {item.id} ≈ {existing.id} ({similarity:.0%})")
                    # Never lose a more important item to a less important copy.
                    if item.priority.value > existing.priority.value:
                        unique_items[index] = item
                    break
            else:
                unique_items.append(item)

        self.manager.items = unique_items
        return self.manager.optimize()
class ContextCompressor:
    """Shrink context items by asking an LLM to rewrite them shorter.

    *llm_client* must provide an awaitable ``complete(prompt)`` that returns
    the completion text.
    """

    def __init__(self, llm_client):
        """Store *llm_client* for later ``complete`` calls."""
        self.llm = llm_client

    async def compress(self, items: List[ContextItem], target_ratio: float = 0.5) -> List[ContextItem]:
        """Compress *items* towards ``target_ratio`` of their total tokens.

        CRITICAL items are returned unchanged and their tokens are deducted
        from the budget before anything else is processed, so they cannot be
        crowded out by earlier, less important items. Every other item is
        kept as is if it fits the remaining budget, replaced by an LLM
        summary if one fits, and omitted from the result otherwise. The
        input order is preserved.
        """
        total_tokens = sum(item.token_count for item in items)
        target_tokens = int(total_tokens * target_ratio)
        print(f"🗜️ 开始压缩: {total_tokens} → {target_tokens} tokens (目标: {target_ratio:.0%})")

        critical_tokens = sum(
            item.token_count for item in items if item.priority == Priority.CRITICAL
        )
        token_budget = target_tokens - critical_tokens

        compressed_items = []
        for item in items:
            if item.priority == Priority.CRITICAL:
                # Already paid for above; critical content is never rewritten.
                compressed_items.append(item)
                continue
            compressed = await self._semantic_compress(item, token_budget)
            if compressed is not None:
                compressed_items.append(compressed)
                token_budget -= compressed.token_count
        return compressed_items

    async def _semantic_compress(self, item: ContextItem, max_tokens: int) -> Optional[ContextItem]:
        """Return a version of *item* that uses at most *max_tokens* tokens.

        Returns:
            *item* itself when it already fits; a new item (id suffixed with
            ``_semantic_compressed``, tag ``"semantic_compressed"`` added)
            holding the LLM's shorter text when that text fits; ``None``
            when there is no budget, the LLM call fails, or the result is
            empty or still too long.
        """
        if item.token_count <= max_tokens:
            return item
        if max_tokens <= 0:
            # No budget left: skip the LLM call instead of asking for a
            # summary of zero or negative length.
            return None

        prompt = (
            f"\n请将以下内容压缩到{max_tokens}个token以内,保留所有关键信息:\n"
            "原始内容:\n"
            f"{item.content}\n"
            "压缩后(只输出压缩内容,不要解释):\n"
        )
        try:
            compressed_content = await self.llm.complete(prompt)
            enc = tiktoken.get_encoding("cl100k_base")
            compressed_tokens = len(enc.encode(compressed_content))
            # An empty completion carries no information, so it is treated
            # like any other failed compression.
            if 0 < compressed_tokens <= max_tokens:
                return ContextItem(
                    id=f"{item.id}_semantic_compressed",
                    content=compressed_content,
                    priority=item.priority,
                    token_count=compressed_tokens,  # reuse the count just made
                    timestamp=item.timestamp,
                    tags=item.tags + ["semantic_compressed"],
                )
        except Exception as e:
            # Deliberate best effort: a failed LLM call only loses this item.
            print(f"⚠️ 语义压缩失败: {e}")
        return None
import time
# 1. Create the context manager and hold back 512 tokens for the output
manager = ContextBudgetManager(max_tokens=2048)
manager.reserved_tokens = 512

# 2. Register the demo context items, in this order
for item_kwargs in (
    dict(
        id="system_prompt",
        content="You are a helpful AI assistant specialized in coding.",
        priority=Priority.CRITICAL,
        token_count=100,
    ),
    dict(
        id="user_query",
        content="帮我优化这段Python代码的性能",
        priority=Priority.CRITICAL,
        token_count=50,
    ),
    dict(
        id="code_snippet",
        content="def slow_function():\n result = []\n for i in range(10000):\n for j in range(10000):\n result.append(i*j)\n return result",
        priority=Priority.HIGH,
        token_count=200,
    ),
    dict(
        id="chat_history_1",
        content="User: 你好\nAssistant: 你好!有什么可以帮你?",
        priority=Priority.LOW,
        token_count=80,
        timestamp=time.time(),
    ),
    dict(
        id="chat_history_2",
        content="User: 帮我写个排序算法\nAssistant: 好的,这是快速排序...",
        priority=Priority.MEDIUM,
        token_count=150,
        timestamp=time.time(),
    ),
):
    manager.add_item(ContextItem(**item_kwargs))

# 3. Basic optimization
optimized = manager.optimize()
print(f"优化后保留 {len(optimized)} 条上下文")

# 4. Smart trimming by relevance to the query
trimmer = SmartContextTrimmer(manager)
optimized = trimmer.trim_by_relevance("优化Python代码性能")
print(f"相关性裁剪后: {len(optimized)} 条")

# 5. Remove redundant items
optimized = trimmer.trim_by_redundancy()
print(f"去冗余后: {len(optimized)} 条")

# 6. Show the final context
print("\n最终上下文:")
for item in optimized:
    print(f" - {item.id}: {item.token_count} tokens ({item.priority.name})")
| 组件 | 建议占比 | 说明 |
|---|---|---|
| System Prompt | 10-15% | 固定成本,必须保留 |
| 用户查询 | 5-10% | 核心输入,最高优先级 |
| 工具结果 | 30-40% | 动态内容,按需裁剪 |
| 对话历史 | 20-30% | 按相关性/时间裁剪 |
| 输出预留 | 15-20% | 保证输出空间 |
凌晨3点,我的上下文优化器帮我把4096个token的上下文压缩到了2048个,而且关键信息一个没丢。我看着优化后的结果,突然笑了——原来"少即是多"不是鸡汤,是数学。当你的上下文窗口从满到半满,Agent的智商好像突然提高了20%。这就是优化的魅力:不是做更多,是做得更聪明。