📊 Agent 监控与可观测性

生产环境稳定运行指南 (2026版)

📅 更新:2026年5月20日 | ⏱️ 阅读时间:14分钟 | 🏷️ 难度:中高级
OpenClaw教程 Agent监控 可观测性 日志管理 成本分析

🎬 开篇:凌晨3点的报警

凌晨3点12分,手机炸了。

Prometheus Alertmanager 发了17条告警。Agent的错误率从0.1%飙升到23%。CPU冲到95%。成本以每分钟$2的速度在烧。

我边穿裤子边打开终端,3分钟内定位到问题——一个第三方API改了返回格式,没有版本号变更。Agent每次调用都拿不到数据,反复重试,死了又起。

这就是为什么可观测性不是可选项,是必选项。

📖 三层可观测体系

99.97%
Uptime 月可用率
<3s
MTTA 告警响应时间
12
关键指标数
$0.17
单次会话费用

好的可观测性 = Logs(日志) + Metrics(指标) + Traces(链路)

📝 1. 日志管理

结构化日志

// ❌ Unstructured log: values are interleaved with free text instead of being emitted as named fields
// NOTE(review): userId and status are not defined in this snippet — presumably provided by the surrounding handler.
console.log('处理用户请求,ID:', userId, '状态:', status);

// ✅ Structured logging (one JSON object per line)
/**
 * Minimal structured logger that writes one JSON document per call.
 *
 * Every record carries `timestamp` (ISO 8601) and `app`. Fields the logger
 * sets itself (`level`, `event`, `error`, `metric_name`, `metric_value`,
 * `timestamp`, `app`) always win over same-named keys in the caller's payload,
 * so a payload can never disguise a record's level or event name.
 *
 * - info(event, data)        -> console.log
 * - error(event, data, err)  -> console.error; `err` is optional
 * - metrics(name, value, tags) -> console.log with level "metric"
 */
const logger = (() => {
  const APP_NAME = 'my-agent';

  // Reserved fields are spread twice: first so they keep their leading position
  // in the serialized line, last so their values override the payload's.
  const buildRecord = (reserved, payload) => ({
    ...reserved,
    ...payload,
    ...reserved,
    timestamp: new Date().toISOString(),
    app: APP_NAME,
  });

  // Only the first line of the stack is kept to keep log lines short.
  // Non-Error values (e.g. a thrown string) are stringified instead of being lost.
  const describeError = (err) => {
    if (err == null) return null;
    if (err instanceof Error) {
      return { message: err.message, stack: err.stack?.split('\n')[0] };
    }
    return { message: String(err) };
  };

  return {
    info: (event, data) => {
      console.log(JSON.stringify(buildRecord({ level: 'info', event }, data)));
    },
    error: (event, data, err) => {
      console.error(JSON.stringify(
        buildRecord({ level: 'error', event, error: describeError(err) }, data),
      ));
    },
    metrics: (name, value, tags) => {
      console.log(JSON.stringify(
        buildRecord(
          { level: 'metric', event: 'metric', metric_name: name, metric_value: value },
          tags,
        ),
      ));
    },
  };
})();

// Usage
// NOTE(review): userId, sessionId and err are not defined in this snippet — presumably supplied by the surrounding code.
logger.info('session_start', { userId, sessionId });
logger.info('tool_called', { tool: 'web_search', duration: 123 });
logger.error('api_failed', { api: 'openai', retries: 3 }, err);
logger.metrics('cost_per_session', 0.17, { model: 'gpt-4o' });

日志级别最佳实践

// Give every level a clearly defined use case.
// Levels are listed from most to least urgent; the array index becomes the
// numeric priority (0 = most urgent).
const LOG_LEVELS = Object.fromEntries(
  [
    ['error', '系统错误,需要立即处理'],
    ['warn', '异常情况,但系统仍在运行'],
    ['info', '重要业务事件'],
    ['debug', '开发调试信息'],
    ['trace', '详细的执行链路'],
  ].map(([level, desc], priority) => [level, { priority, desc }]),
);

// 日志不要记录的信息:
// ❌ 密码、API Key、Token、身份证号、手机号
// ✅ 用户ID(脱敏后的)、Session ID、事件类型、执行时间

// Log sanitization example
/**
 * Returns a sanitized copy of a log payload in which the value of every
 * sensitive key is replaced by '***'. The input is never mutated.
 *
 * Key matching ignores case and the separators `-` / `_`, so `Authorization`,
 * `api_key` and `API-Key` are all redacted. Matching is on the whole key, not
 * a substring, so fields such as `tokenCount` are left intact.
 * Nested plain objects and arrays are sanitized recursively; other values
 * (Date, Error, class instances, primitives) are passed through untouched.
 *
 * @param {object} [data] - Log payload; null/undefined yields an empty object.
 * @returns {object} Sanitized copy of the payload.
 */
function sanitizeLog(data) {
  const sensitiveKeys = new Set(['password', 'token', 'apikey', 'secret', 'authorization']);
  const normalizeKey = (key) => key.toLowerCase().replace(/[-_]/g, '');

  // Objects on the current recursion path, used to cut reference cycles.
  const ancestors = new WeakSet();

  const isPlainObject = (value) => {
    if (value === null || typeof value !== 'object') return false;
    const proto = Object.getPrototypeOf(value);
    return proto === Object.prototype || proto === null;
  };

  const scrub = (value) => {
    const isList = Array.isArray(value);
    if (!isList && !isPlainObject(value)) return value;
    if (ancestors.has(value)) return '[Circular]';

    ancestors.add(value);
    const cleaned = isList
      ? value.map(scrub)
      : Object.fromEntries(
        Object.entries(value).map(([key, inner]) => [
          key,
          sensitiveKeys.has(normalizeKey(key)) ? '***' : scrub(inner),
        ]),
      );
    ancestors.delete(value);
    return cleaned;
  };

  return scrub({ ...data });
}

📈 2. 关键指标

需要监控的指标

// metrics-collector.js
/**
 * In-memory metrics aggregator.
 *
 * Counters are stored under `<name>:<tags-json>` and timings under
 * `timing:<name>` (or `timing:<name>:<tags-json>` when tags are given).
 * `flush()` returns everything collected so far and resets the collector.
 */
class MetricsCollector {
  constructor() {
    this.metrics = new Map();
  }

  /**
   * Serializes tags with their keys sorted, so `{a, b}` and `{b, a}` address
   * the same series.
   * @param {object} [tags]
   * @returns {string} JSON text; `{}` for missing/empty tags.
   */
  static tagsKey(tags) {
    const source = tags ?? {};
    const ordered = Object.keys(source).sort().map((name) => [name, source[name]]);
    return JSON.stringify(Object.fromEntries(ordered));
  }

  /**
   * Adds to a counter.
   *
   * Accepted call shapes:
   *   increment(name)
   *   increment(name, tags)
   *   increment(name, tags, value)
   *   increment(name, value)        // numeric second argument, no tags
   *
   * @param {string} name - Counter name.
   * @param {object|number} [tags={}] - Tags, or the amount when a number is passed.
   * @param {number} [value=1] - Amount to add.
   */
  increment(name, tags = {}, value = 1) {
    const amountOnly = typeof tags === 'number';
    const amount = amountOnly ? tags : value;
    const labels = amountOnly ? {} : tags;

    const key = `${name}:${MetricsCollector.tagsKey(labels)}`;
    this.metrics.set(key, (this.metrics.get(key) || 0) + amount);
  }

  /**
   * Records one duration sample and updates count/sum/min/max.
   * Samples with different tags are kept in separate series; untagged samples
   * keep the plain `timing:<name>` key.
   *
   * @param {string} name - Timing name.
   * @param {number} durationMs - Duration in milliseconds.
   * @param {object} [tags={}] - Optional tags.
   */
  timing(name, durationMs, tags = {}) {
    const hasTags = tags != null && Object.keys(tags).length > 0;
    const key = hasTags
      ? `timing:${name}:${MetricsCollector.tagsKey(tags)}`
      : `timing:${name}`;

    let stats = this.metrics.get(key);
    if (!stats) {
      stats = { count: 0, sum: 0, min: Infinity, max: -Infinity };
      this.metrics.set(key, stats);
    }
    stats.count += 1;
    stats.sum += durationMs;
    stats.min = Math.min(stats.min, durationMs);
    stats.max = Math.max(stats.max, durationMs);
  }

  /**
   * Returns all collected metrics as `{ metric, value }` entries (suitable for
   * JSON logging) and clears the collector.
   * @returns {Array<{metric: string, value: number|object}>}
   */
  flush() {
    const output = Array.from(this.metrics, ([metric, value]) => ({ metric, value }));
    this.metrics.clear();
    return output;
  }
}

const metrics = new MetricsCollector();

// Key business metrics
// NOTE(review): sessionDuration, tokenCount and responseTime are not defined in this snippet — presumably supplied by the surrounding code.
metrics.increment('agent.sessions.total');
metrics.increment('agent.tools.called', { tool: 'web_search' });
metrics.timing('agent.session.duration', sessionDuration);
metrics.increment('agent.errors', { type: 'api_timeout' });

// Token usage and cost
// NOTE(review): the first call below passes a third argument and the last one passes a number
// where the other calls pass a tags object — confirm MetricsCollector.increment supports both shapes.
metrics.increment('llm.tokens.total', { model: 'gpt-4' }, tokenCount);
metrics.timing('llm.response_time', responseTime);
metrics.increment('cost.total', 0.17); // US dollars

Prometheus 集成

# Metrics are exposed through the OpenClaw /metrics endpoint
# openclaw.yaml configuration
# NOTE(review): 9090 is also the default listen port of the Prometheus server itself —
# confirm there is no clash when both run on the same host.
monitoring:
  prometheus:
    enabled: true
    path: /metrics
    port: 9090

# Sample of the exposed metrics (excerpt)
# HELP openclaw_sessions_total Total sessions
# TYPE openclaw_sessions_total counter
openclaw_sessions_total{status="success"} 1234
openclaw_sessions_total{status="error"} 23

# HELP openclaw_skill_execution_duration Skill execution duration
# TYPE openclaw_skill_execution_duration histogram
openclaw_skill_execution_duration_bucket{skill="fetch-data",le="0.1"} 100
openclaw_skill_execution_duration_bucket{skill="fetch-data",le="0.5"} 800

🚨 3. 告警设置

告警规则示例

# alerts/prometheus-rules.yaml
groups:
  - name: openclaw-agent
    rules:
    # P0: high error rate — share of failed requests over the last 5 minutes
    - alert: HighErrorRate
      expr: rate(openclaw_errors_total[5m]) / rate(openclaw_requests_total[5m]) > 0.05
      for: 3m
      labels:
        severity: critical
        pagerduty: openclaw-prod
      annotations:
        summary: "OpenClaw Agent 错误率超过5%"
        description: "当前错误率 {{ $value | humanizePercentage }}"

    # P1: slow responses — 95th percentile latency above 2 seconds
    - alert: HighLatency
      expr: histogram_quantile(0.95, rate(openclaw_request_duration_seconds_bucket[5m])) > 2
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Agent响应时间P95超过2秒"

    # P2: cost anomaly — more than $50 spent within the last hour.
    # increase() returns the growth of the counter over the whole window, which is
    # the per-hour amount the summary reports. rate() would return a per-second
    # average, so "> 50" would only fire at $50 per second.
    # Assumes openclaw_cost_total is a counter measured in USD.
    - alert: CostSpike
      expr: increase(openclaw_cost_total[1h]) > 50
      for: 30m
      labels:
        severity: warning
      annotations:
        summary: "每小时成本超过$50,当前: {{ $value | humanize }}/h"

    # P3: too many agent retries (per-second retry rate over 5 minutes)
    - alert: TooManyRetries
      expr: rate(openclaw_retries_total[5m]) > 100
      for: 5m
      labels:
        severity: info
      annotations:
        summary: "Agent重试频繁,可能有API问题"

💰 4. 成本分析

Token级成本追踪

// cost-tracker.js
/**
 * Tracks LLM spend per calendar day (UTC, because the day key is derived from
 * `Date#toISOString`) and per model.
 */
class CostTracker {
  constructor() {
    // Model prices in USD per 1M tokens
    this.MODEL_PRICING = {
      'gpt-4o':           { input: 2.50,  output: 10.00 },
      'gpt-4o-mini':      { input: 0.15,  output: 0.60 },
      'claude-sonnet-4':  { input: 3.00,  output: 15.00 },
    };

    // Map of 'YYYY-MM-DD' -> { total, models: { [model]: cost } }
    this.dailyCosts = new Map();
  }

  /**
   * Records the cost of one model call.
   *
   * Calls that cannot be priced (unknown model, or token counts that are not
   * finite non-negative numbers) are not recorded; a structured warning is
   * logged so the missing spend is visible instead of silently dropped.
   *
   * @param {string} model - Key of MODEL_PRICING.
   * @param {number} inputTokens - Prompt tokens consumed.
   * @param {number} outputTokens - Completion tokens produced.
   * @returns {{cost: number}|undefined} Cost of this call in USD rounded to
   *   5 decimals, or undefined when the call was not recorded.
   */
  track(model, inputTokens, outputTokens) {
    // Own-property check so names like "constructor" are not treated as models.
    const isKnownModel = Object.prototype.hasOwnProperty.call(this.MODEL_PRICING, model);
    if (!isKnownModel) {
      console.warn(JSON.stringify({
        level: 'warn',
        event: 'cost_tracker_unknown_model',
        model,
      }));
      return undefined;
    }

    const isValidCount = (count) => Number.isFinite(count) && count >= 0;
    if (!isValidCount(inputTokens) || !isValidCount(outputTokens)) {
      // A NaN here would poison the running daily total for the rest of the day.
      console.warn(JSON.stringify({
        level: 'warn',
        event: 'cost_tracker_invalid_tokens',
        model,
      }));
      return undefined;
    }

    const pricing = this.MODEL_PRICING[model];
    const cost = (inputTokens / 1_000_000 * pricing.input) +
                 (outputTokens / 1_000_000 * pricing.output);

    const date = new Date().toISOString().split('T')[0];
    if (!this.dailyCosts.has(date)) {
      this.dailyCosts.set(date, { total: 0, models: {} });
    }

    const day = this.dailyCosts.get(date);
    day.total += cost;
    day.models[model] = (day.models[model] || 0) + cost;

    return { cost: Math.round(cost * 100000) / 100000 };
  }

  /**
   * Logs today's cost summary as one JSON line and returns today's raw entry.
   *
   * @param {number} [sessions] - Session count to include in the report.
   *   When omitted, an ambient `sessionCount` variable is used if one exists,
   *   otherwise 0 — the class itself does not track sessions.
   * @returns {Promise<{total: number, models: object}|undefined>} Today's
   *   entry, or undefined when nothing was tracked today.
   */
  async report(sessions = typeof sessionCount === 'undefined' ? 0 : sessionCount) {
    const today = new Date().toISOString().split('T')[0];
    const data = this.dailyCosts.get(today);

    console.log(JSON.stringify({
      level: 'metric',
      event: 'daily_cost_report',
      date: today,
      total: `$${(data?.total ?? 0).toFixed(2)}`,
      by_model: data?.models ?? {},
      sessions,
    }));

    return data;
  }
}

// Usage
const costTracker = new CostTracker();

// Record usage after every model call: (model, input tokens, output tokens)
costTracker.track('gpt-4o', 1200, 450);
costTracker.track('gpt-4o-mini', 3500, 1200);

// Daily report
await costTracker.report();
🎯 成本优化技巧:
1. 简单任务用 gpt-4o-mini(按上文价格表约便宜16倍),复杂任务用 gpt-4o
2. 启用上下文压缩,减少重复Token消耗
3. 缓存常见请求的结果(减少重复调用)
4. 设置每日成本上限(硬限制):MAX_DAILY_COST = 10
5. 非高峰时段用更便宜的模型(如夜间任务)

📋 Agent健康检查脚本

#!/bin/bash
# health-check.sh - OpenClaw agent health check.
#
# Prints a report covering process state, the HTTP health endpoint, recent
# error logs, resource usage and the TLS certificate expiry date.
# Exit status: 0 when the health endpoint answers 200, 1 otherwise, so cron,
# systemd or CI wrappers can act on the result.

echo "=== OpenClaw Agent Health Check ==="
echo "时间: $(date '+%Y-%m-%d %H:%M:%S')"
echo ""

# 1. Process check
echo "1. 进程检查"
# Resolve the PIDs once (comma-separated) so the resource section inspects
# exactly these processes. Grepping ps/top output instead would also count the
# grep command itself and miss processes whose short command name differs.
# NOTE(review): `pgrep -f` matches full command lines — if this script lives
# under a path containing "openclaw" it may match itself; confirm.
PIDS=$(pgrep -d, -f "openclaw")
if [ -n "$PIDS" ]; then
  echo "   ✅ OpenClaw 进程运行中"
  echo "   进程数: $(echo "$PIDS" | tr ',' '\n' | wc -l)"
else
  echo "   ❌ OpenClaw 进程未运行!"
fi

# 2. HTTP health check
echo ""
echo "2. HTTP健康检查"
# --max-time keeps the check from hanging when the agent accepts the
# connection but never answers.
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://localhost:3000/health 2>/dev/null)
if [ "$HTTP_CODE" = "200" ]; then
  echo "   ✅ Health endpoint OK (200)"
else
  echo "   ❌ Health endpoint 异常: $HTTP_CODE"
fi

# 3. Recent error logs
echo ""
echo "3. 最近5条错误日志"
journalctl -u openclaw --since "1 hour ago" -p err 2>/dev/null | tail -5

# 4. Resource usage (summed over all matched processes)
echo ""
echo "4. 资源使用"
if [ -n "$PIDS" ]; then
  # Column 9 is %CPU in top's default layout; only rows starting with a PID are summed.
  # NOTE(review): top accepts at most 20 PIDs with -p — confirm this is enough.
  CPU_USAGE=$(top -bn1 -p "$PIDS" 2>/dev/null | awk '$1 ~ /^[0-9]+$/ {sum+=$9} END {printf "%.1f%%", sum}')
  # rss is reported in KiB.
  MEM_USAGE=$(ps -o rss= -p "$PIDS" 2>/dev/null | awk '{sum+=$1} END {printf "%.1fMB", sum/1024}')
else
  CPU_USAGE="N/A"
  MEM_USAGE="N/A"
fi
echo "   CPU: $CPU_USAGE"
echo "   内存: $MEM_USAGE"
echo "   磁盘: $(df -h /var/lib/openclaw 2>/dev/null | tail -1 | awk '{print $5}')"

# 5. TLS certificate check
echo ""
echo "5. SSL证书检查"
CERT_EXPIRY=$(echo | openssl s_client -connect localhost:443 -servername miaoquai.com 2>/dev/null | openssl x509 -noout -enddate 2>/dev/null)
echo "   $CERT_EXPIRY"

echo ""
echo "=== 检查完成 ==="
if [ "$HTTP_CODE" = "200" ]; then
  echo "状态: ✅ 正常"
  exit 0
else
  echo "状态: ❌ 异常"
  exit 1
fi

🔗 相关资源

🎭 结语

凌晨3点12分的那个电话之后,我花了一周建了这套可观测体系。

现在凌晨3点我再也不会被吵醒了——因为我比bug先知道它要来。

这就是监控的力量:不是发现问题,而是在问题变成灾难前发现它。