世界上有两种Agent:一种会犯错,另一种会通过A/B测试找到最优解。
A/B测试框架是OpenClaw Agent优化的「科学实验室」。它允许你同时运行多个版本的Agent(不同的prompt、工具配置、决策策略),通过控制变量和统计分析,找出哪个版本表现最好。这就像让Agent参加「盲品大赛」,不看名字,只看结果。
# Create the A/B test experiment.
# The experiment name doubles as the test id used by every later command.
TEST_ID="prompt-optimization-v2"

openclaw ab-test create \
  --name "$TEST_ID" \
  --description "优化代码生成prompt" \
  --metric "code_quality_score" \
  --duration "7d" \
  --traffic-split 50 50

# Add the experiment variants: the control first, then the enhanced prompt.
# Each variant reads its settings from config/<variant>-prompt.yaml.
for variant in control enhanced; do
  openclaw ab-test variant add \
    --test-id "$TEST_ID" \
    --variant-name "$variant" \
    --config "config/${variant}-prompt.yaml"
done

# Start the experiment.
openclaw ab-test start --test-id "$TEST_ID"
# traffic-config.yaml
# Traffic allocation for the experiment. `strategy` names the allocation mode;
# the sibling sections below hold per-mode settings.
# NOTE(review): nesting was reconstructed from the key names after the
# indentation was lost -- confirm against the actual config schema.
traffic:
  strategy: "weighted"  # weighted | random | contextual | sequential

  weighted:
    variants:
      control: 40    # 40% of traffic
      variant-a: 30  # 30% of traffic
      variant-b: 30  # 30% of traffic

  contextual:
    rules:
      - condition: "user_type == 'premium'"
        weights:
          control: 20
          variant-a: 50
          variant-b: 30
      - condition: "task_type == 'code_generation'"
        weights:
          control: 30
          variant-a: 40
          variant-b: 30

  sequential:
    variants: ["control", "variant-a", "variant-b"]
    cycle: "24h"  # switch variants every 24 hours
# metrics-config.yaml
# Metrics collected for the experiment: one primary metric, a list of
# secondary metrics, and a list of custom metrics.
# NOTE(review): nesting was reconstructed from the key names after the
# indentation was lost -- confirm against the actual config schema.
metrics:
  primary:
    name: "success_rate"
    type: "binary"  # binary | continuous | count
    aggregation: "mean"
    threshold: 0.95

  secondary:
    - name: "response_time"
      type: "continuous"
      aggregation: "p95"
      target: "< 2s"
    - name: "token_usage"
      type: "continuous"
      aggregation: "mean"
      target: "< 1000"
    - name: "user_satisfaction"
      type: "binary"
      aggregation: "mean"
      target: "> 0.8"

  custom:
    - name: "code_quality"
      type: "continuous"
      aggregation: "mean"
      source: "code_quality_analyzer"
      weight: 1.5
| 原则 | 说明 |
|---|---|
| 单一变量 | 每次只测试一个变量,避免混淆 |
| 足够样本量 | 确保统计显著性,避免随机误差 |
| 明确目标 | 定义清楚的成功指标 |
| 控制组 | 必须有对照组(当前版本) |
const { OpenClabABTest } = require('@openclab/abtest');
/**
 * Starts the `prompt-optimization-v2` A/B experiment and then polls its
 * status every 60 seconds. Once the status reports completion, polling stops,
 * the results are analyzed, and the winner is deployed if it is `variantA`.
 *
 * The returned promise settles as soon as the experiment has been started and
 * the poller has been scheduled; it does not wait for the experiment to end.
 * Errors raised inside a poll are logged rather than thrown, because nothing
 * awaits the timer callback and a rejection there would go unhandled.
 *
 * @returns {Promise<void>} Resolves after `start()` resolves; rejects if
 *   constructing or starting the experiment fails.
 */
async function runExperiment() {
  const abTest = new OpenClabABTest({
    experimentId: 'prompt-optimization-v2',
    config: {
      trafficSplit: { control: 0.4, variantA: 0.3, variantB: 0.3 },
      metrics: ['success_rate', 'response_time', 'token_usage'],
      duration: '7d'
    }
  });

  // Start the experiment.
  await abTest.start();

  // True while a poll is still awaiting the experiment API. A tick that fires
  // in the meantime is skipped so completion is never handled twice.
  let isPolling = false;

  // Monitor experiment progress.
  const monitor = setInterval(async () => {
    if (isPolling) {
      return;
    }
    isPolling = true;

    try {
      const status = await abTest.getStatus();
      console.log(`实验进度: ${status.progress}%`);
      console.log(`样本数: ${status.samples.total}`);

      if (!status.isComplete) {
        return;
      }

      // Stop polling before the slow analyze/deploy steps run.
      clearInterval(monitor);
      const result = await abTest.analyze();
      console.log('实验结果:', result);

      // Deploy automatically only when variantA wins.
      if (result.winner === 'variantA') {
        await abTest.deployWinner();
        console.log('🎉 变体A已部署!');
      }
    } catch (error) {
      // A failed status poll is retried on the next tick; a failure after
      // completion is only reported, since the timer is already cleared.
      console.error('实验监控出错:', error);
    } finally {
      isPolling = false;
    }
  }, 60000); // check once per minute
}
/**
 * Runs a two-factor experiment (prompt style x tool-selection mode).
 *
 * The factors are expanded through `createFactorialDesign`, every resulting
 * design is passed to `runSingleVariant` (all started before any is awaited),
 * and the collected results are handed to the analysis helpers.
 *
 * @returns {Promise<{mainEffects: *, interactions: *, recommendations: *}>}
 *   The outputs of `calculateMainEffects`, the `interactions` field of
 *   `analyzeInteractionEffects`, and `generateRecommendations`.
 */
async function multivariateTest() {
  const variant = (id, config) => ({ id, config });

  const promptFactor = {
    id: 'prompt-v1',
    variants: [
      variant('control', { prompt: 'basic' }),
      variant('enhanced', { prompt: 'detailed' })
    ]
  };
  const toolFactor = {
    id: 'tool-selection',
    variants: [
      variant('auto', { toolSelection: 'auto' }),
      variant('manual', { toolSelection: 'manual' })
    ]
  };

  // Build the factorial design from both factors.
  const cells = createFactorialDesign([promptFactor, toolFactor]);

  // Kick off every cell, then wait for all of them together.
  const pendingRuns = cells.map((cell) => runSingleVariant(cell));
  const outcomes = await Promise.all(pendingRuns);

  // Analyze interaction effects first, then derive the summary fields.
  const interactionReport = analyzeInteractionEffects(outcomes);
  const mainEffects = calculateMainEffects(outcomes);
  const { interactions } = interactionReport;
  const recommendations = generateRecommendations(interactionReport);

  return { mainEffects, interactions, recommendations };
}
/**
 * Expands a list of experiments into a full factorial design: one entry for
 * every way of picking exactly one variant from each experiment.
 *
 * Combinations are produced with the first experiment varying slowest. The
 * cross product is computed inline so the function does not depend on the
 * return shape of an external helper.
 *
 * @param {Array<{id: string, variants: Array<{id: string, config: *}>}>} experiments
 *   Experiments to cross. Each variant's `config` is referenced, not copied.
 * @returns {Array<{id: string, config: Object, name: string}>} One entry per
 *   combination. `id` joins `"<experimentId>:<variantId>"` parts with `-`;
 *   `config` maps each experiment id to the chosen variant's config. Returns
 *   an empty array when `experiments` is empty or any experiment has no
 *   variants.
 */
function createFactorialDesign(experiments) {
  if (experiments.length === 0) {
    return [];
  }

  // Each combination is a list of [experiment, variant] pairs, so the chosen
  // variant's config is at hand without looking it up again by id.
  let combinations = [[]];
  for (const experiment of experiments) {
    const extended = [];
    for (const partial of combinations) {
      for (const variant of experiment.variants) {
        extended.push([...partial, [experiment, variant]]);
      }
    }
    combinations = extended;
  }

  return combinations.map((combination) => {
    const config = {};
    const labels = [];
    for (const [experiment, variant] of combination) {
      config[experiment.id] = variant.config;
      labels.push(`${experiment.id}:${variant.id}`);
    }

    const id = labels.join('-');
    return {
      id,
      config,
      name: `实验组合 ${id}`
    };
  });
}
📅 更新时间:2026-05-11 | 📖 更多OpenClaw教程请访问 工具教程索引