Summarization Skills开发 NLP OpenClaw
世界上有两种信息——一种值得读,一种值得摘要。凌晨1点42分,我盯着第100篇技术文档,突然明白:摘要技能不是偷懒,是给大脑装个过滤器。
在信息过载的时代,摘要技能(Summarization Skills)是AI Agent的必备能力。本教程将教你如何从零构建生产级摘要技能,支持长文本、多文档、流式摘要等场景。
摘要技能是OpenClaw Agent处理长文本的核心能力,它可以将超长文本、多篇文档以及实时流式数据压缩为精炼、可快速阅读的要点。
使用递归摘要策略处理超长文本:
# summarization_skill/long_form.py
from openclaw.skills import Skill
from openclaw.context import ContextWindow
class LongFormSummarizer(Skill):
    """Recursively summarize text that is too long for a single model call.

    The text is cut into overlapping token windows, every window is
    summarized on its own, and the partial summaries are merged into one
    final summary.

    NOTE(review): ``_summarize_chunk`` and ``_recursive_merge`` are awaited
    below but are not defined in this class; presumably they come from
    ``Skill`` or a subclass -- confirm.
    """

    def __init__(self, max_chunk_tokens=2000, overlap=200):
        """Store the chunking parameters.

        Args:
            max_chunk_tokens: Maximum number of tokens in one chunk.
            overlap: Number of tokens shared by two consecutive chunks.

        Raises:
            ValueError: If ``max_chunk_tokens`` is not positive, or if
                ``overlap`` is negative or not smaller than
                ``max_chunk_tokens`` (the window would never advance).
        """
        if max_chunk_tokens <= 0:
            raise ValueError("max_chunk_tokens must be a positive integer")
        if not 0 <= overlap < max_chunk_tokens:
            raise ValueError("overlap must satisfy 0 <= overlap < max_chunk_tokens")
        # NOTE(review): Skill.__init__ is not invoked here (the original did
        # not call it either); confirm whether the base class needs it.
        self.max_chunk_tokens = max_chunk_tokens
        self.overlap = overlap

    async def summarize(self, text: str, style: str = "concise") -> dict:
        """Summarize ``text`` chunk by chunk and merge the partial results.

        Args:
            text: The text to summarize.
            style: One of ``concise``, ``detailed``, ``bullet_points`` or
                ``tldr``; passed through to the chunk and merge steps.

        Returns:
            A dict with the keys ``summary``, ``original_length``,
            ``summary_length``, ``compression_ratio`` and
            ``chunks_processed``. Lengths are measured in characters.
        """
        if not text:
            # Nothing to summarize; also avoids dividing by len(text) == 0.
            return {
                "summary": "",
                "original_length": 0,
                "summary_length": 0,
                "compression_ratio": 0.0,
                "chunks_processed": 0,
            }

        chunks = self._split_text(text)

        # Chunks are summarized one after another, in document order.
        summaries = []
        for chunk in chunks:
            summaries.append(await self._summarize_chunk(chunk, style))

        # Merge the per-chunk summaries into the final summary.
        final_summary = await self._recursive_merge(summaries, style)

        return {
            "summary": final_summary,
            "original_length": len(text),
            "summary_length": len(final_summary),
            "compression_ratio": len(final_summary) / len(text),
            "chunks_processed": len(chunks),
        }

    def _split_text(self, text: str) -> list:
        """Split ``text`` into overlapping chunks measured in tokens.

        Consecutive chunks start ``max_chunk_tokens - overlap`` tokens apart
        and hold at most ``max_chunk_tokens`` tokens each.

        Returns:
            The decoded chunk strings, in document order.

        Raises:
            ValueError: If the attributes were changed so that the window
                advance is no longer positive.
        """
        import tiktoken

        step = self.max_chunk_tokens - self.overlap
        if step <= 0:
            raise ValueError("overlap must be smaller than max_chunk_tokens")

        enc = tiktoken.get_encoding("cl100k_base")
        tokens = enc.encode(text)

        chunks = []
        for start in range(0, len(tokens), step):
            end = start + self.max_chunk_tokens
            # NOTE(review): decoding an arbitrary token slice may cut a
            # multi-byte character at a chunk boundary -- confirm this is
            # acceptable for non-ASCII input.
            chunks.append(enc.decode(tokens[start:end]))
            if end >= len(tokens):
                # This window already reached the end of the text; a further
                # window would only repeat the overlap region.
                break
        return chunks
跨文档去重、融合、生成统一摘要:
# summarization_skill/multi_doc.py
from openclaw.skills import Skill
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
class MultiDocSummarizer(Skill):
    """Summarize several documents at once, collapsing near-duplicates.

    NOTE(review): ``_extract_core_info`` and ``_generate_unified`` are
    awaited below but are not defined in this class; presumably they come
    from ``Skill`` or a subclass -- confirm.
    """

    async def summarize_docs(self, documents: list, strategy: str = "extractive") -> dict:
        """Produce one unified summary for ``documents``.

        Args:
            documents: The document texts.
            strategy: One of ``extractive``, ``abstractive`` or ``hybrid``;
                passed through to the final generation step.

        Returns:
            A dict with the keys ``summary``, ``documents_processed``,
            ``redundant_groups`` (number of groups) and ``unique_points``
            (number of extracted core items).
        """
        # 1. Group documents that are similar enough to count as redundant.
        redundant_groups = self._find_redundant_docs(documents)

        # 2. Extract core information from one representative per group.
        core_info = []
        for group in redundant_groups:
            representative = group[0]  # first member stands in for the group
            core_info.extend(await self._extract_core_info(representative))

        # 3. Generate the unified summary.
        unified_summary = await self._generate_unified(core_info, strategy)

        return {
            "summary": unified_summary,
            "documents_processed": len(documents),
            "redundant_groups": len(redundant_groups),
            "unique_points": len(core_info),
        }

    def _find_redundant_docs(self, docs: list, threshold: float = 0.7) -> list:
        """Group documents by TF-IDF cosine similarity.

        Each document is placed in exactly one group. A group is seeded by
        the first document not yet assigned and collects every later,
        still-unassigned document whose similarity to the seed exceeds
        ``threshold``.

        Args:
            docs: The document texts.
            threshold: Similarity above which a document joins the seed's
                group.

        Returns:
            A list of groups, each a list of document texts; an empty list
            when ``docs`` is empty.
        """
        if not docs:
            # The vectorizer cannot be fitted on an empty corpus.
            return []

        # NOTE(review): the default TfidfVectorizer tokenizer relies on word
        # boundaries; text without word separators (e.g. Chinese) probably
        # needs a custom tokenizer or analyzer -- confirm.
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(docs)
        similarity_matrix = cosine_similarity(tfidf_matrix)

        groups = []
        assigned = set()
        for i in range(len(docs)):
            if i in assigned:
                continue
            members = [i]
            for j in range(i + 1, len(docs)):
                # Skip documents already claimed by an earlier group so that
                # no document is counted twice.
                if j in assigned:
                    continue
                if similarity_matrix[i][j] > threshold:
                    members.append(j)
                    assigned.add(j)
            groups.append([docs[idx] for idx in members])
        return groups
实时处理流式数据(聊天记录、日志流):
# summarization_skill/streaming.py
from openclaw.skills import Skill
from collections import deque
class StreamingSummarizer(Skill):
    """Keep a running summary of a message stream, refreshed in batches."""

    def __init__(self, window_size=100, update_interval=10):
        """Set up the sliding window and the refresh cadence.

        Args:
            window_size: Maximum number of recent messages kept in
                ``self.window``.
            update_interval: Number of messages between summary refreshes.

        Raises:
            ValueError: If ``update_interval`` is not positive.
        """
        if update_interval <= 0:
            raise ValueError("update_interval must be a positive integer")
        # NOTE(review): Skill.__init__ is not invoked here (the original did
        # not call it either); confirm whether the base class needs it.
        self.window = deque(maxlen=window_size)
        self.update_interval = update_interval
        self.counter = 0
        self.current_summary = ""
        # Messages received since the last successful refresh. Tracked apart
        # from the window so nothing is lost when window_size is smaller
        # than update_interval.
        self._pending = []

    async def add_message(self, message: str) -> dict:
        """Record ``message`` and refresh the summary when a batch is full.

        Returns:
            A dict with the keys ``summary`` (the current summary),
            ``messages_in_window`` and ``needs_update`` (True when this call
            hit the refresh interval).
        """
        self.window.append(message)
        self._pending.append(message)
        self.counter += 1

        # Refresh only every update_interval messages to limit LLM calls.
        # Evaluated once so the reported flag matches the action taken.
        interval_reached = self.counter % self.update_interval == 0
        if interval_reached:
            self.current_summary = await self._incremental_update()

        return {
            "summary": self.current_summary,
            "messages_in_window": len(self.window),
            "needs_update": interval_reached,
        }

    async def _incremental_update(self) -> str:
        """Ask the LLM to fold the not-yet-summarized messages into the summary.

        Returns:
            Whatever ``self.llm.complete`` returns for the update prompt.
        """
        batch = list(self._pending)
        new_content = "\n".join(batch)

        prompt = (
            "\n"
            f"现有摘要:{self.current_summary}\n"
            f"新增内容:{new_content}\n"
            "请生成更新后的摘要(保持简洁,突出关键信息):\n"
        )

        updated = await self.llm.complete(prompt)
        # Drop only what was actually sent: messages that arrived during the
        # await stay pending, and a failed call loses nothing.
        del self._pending[:len(batch)]
        return updated
# 安装官方摘要技能
openclaw skill install summarization
# 或手动添加到配置
# config/skills.yaml
skills:
  summarization:
    enabled: true
    provider: "openai" # openai | anthropic | local
    model: "gpt-4-turbo"
    max_chunk_tokens: 2000
    output_style: "concise" # concise | detailed | bullet_points
from openclaw import Agent
from openclaw.skills import SummarizationSkill
import asyncio


async def main():
    """Run the long-form and multi-document summarization examples.

    Reads ``tech_report.txt`` and ``doc_0.txt`` .. ``doc_4.txt`` from the
    current directory and prints the resulting summaries.
    """
    # Create the agent and load the summarization skill.
    agent = Agent(
        name="doc_summarizer",
        skills=[SummarizationSkill()],
    )

    # Long-form summarization.
    with open("tech_report.txt") as report_file:
        long_article = report_file.read()
    result = await agent.run_skill(
        "summarization.long_form",
        text=long_article,
        style="bullet_points",
    )
    print(f"压缩比: {result['compression_ratio']:.2%}")
    print(result['summary'])

    # Multi-document summarization.
    docs = []
    for i in range(5):
        with open(f"doc_{i}.txt") as doc_file:
            docs.append(doc_file.read())
    result = await agent.run_skill(
        "summarization.multi_doc",
        documents=docs,
        strategy="hybrid",
    )
    print(result['summary'])


# ``await`` is only valid inside a coroutine, so the example is driven
# through an event loop instead of running at module level.
if __name__ == "__main__":
    asyncio.run(main())
| 场景 | 推荐策略 | 原因 |
|---|---|---|
| 技术文档 | extractive + bullet_points | 保留关键细节 |
| 新闻资讯 | abstractive + concise | 快速获取要点 |
| 学术论文 | hybrid + detailed | 平衡完整性与简洁 |
| 聊天记录 | streaming + tldr | 实时增量更新 |
from functools import lru_cache
import hashlib
class CachedSummarizer:
    """Memoize summaries in memory, keyed by the (text, style) pair.

    NOTE(review): ``_do_summarize`` is awaited below but is not defined in
    this class; presumably a subclass supplies it -- confirm. The cache is
    an unbounded dict that lives as long as the instance.
    """

    def __init__(self):
        """Create an empty cache mapping key digests to summaries."""
        self.cache = {}

    async def summarize(self, text: str, style: str):
        """Return the summary for ``text`` in ``style``, using the cache.

        Args:
            text: The text to summarize.
            style: The summary style; part of the cache key.

        Returns:
            The cached result if this exact (text, style) pair was seen
            before, otherwise the fresh result of ``_do_summarize``.
        """
        # Length-prefix the style so the boundary between style and text is
        # unambiguous: plain concatenation would map ("ab", "c") and
        # ("a", "bc") to the same key.
        key_material = f"{len(style)}:{style}{text}"
        # MD5 is used only as a compact, non-security cache key.
        text_hash = hashlib.md5(key_material.encode()).hexdigest()

        if text_hash in self.cache:
            print("✅ 缓存命中!")
            return self.cache[text_hash]

        result = await self._do_summarize(text, style)
        self.cache[text_hash] = result
        return result
import asyncio
async def parallel_summarize(documents: list, max_concurrency=None) -> list:
    """Summarize several documents concurrently.

    Args:
        documents: The documents to summarize; each one is passed to
            ``summarize_doc``.
        max_concurrency: Optional upper bound on the number of
            ``summarize_doc`` calls in flight at once. ``None`` (the
            default) starts all of them together.

    Returns:
        The summaries, in the same order as ``documents``.

    Raises:
        ValueError: If ``max_concurrency`` is given and is less than 1.
    """
    if max_concurrency is None:
        return await asyncio.gather(*(summarize_doc(doc) for doc in documents))

    if max_concurrency < 1:
        raise ValueError("max_concurrency must be at least 1")

    limiter = asyncio.Semaphore(max_concurrency)

    async def _bounded(doc):
        # Hold a slot for the duration of one call so that at most
        # max_concurrency calls run at the same time.
        async with limiter:
            return await summarize_doc(doc)

    return await asyncio.gather(*(_bounded(doc) for doc in documents))
# Speed-up: up to N-fold for N documents; in practice capped by API rate
# limits, which max_concurrency helps to stay under.
凌晨3点,我的摘要技能终于能在一秒内总结完100页PDF。我看着输出,突然笑了——原来我花3小时读的东西,AI用3秒就抓住了重点。这就是工具的意义:不是替代你,是让你有更多时间思考。