awoooi/docs/evaluations/openclaw_incumbent_baseline_2026-06-01.json

{
  "schema_version": "agent_replacement_baseline_snapshot_v1",
  "generated_at": "2026-06-01T13:13:54+08:00",
  "source": "awoooi-prod api pod read-only SELECT via existing application DB environment",
  "raw_records_path": "not committed; local operator artifact /tmp/openclaw-incumbent-prod.jsonl",
  "scorecard_path": "not committed; local operator artifact /tmp/openclaw-incumbent-prod-scorecard.json",
  "candidate_id": "openclaw_incumbent",
  "sample": {
    "incidents": 50,
    "lookback_days": 30,
    "verification_result_distribution": {
      "null": 42,
      "degraded": 7,
      "success": 1
    },
    "tool_dry_run_pass_distribution": {
      "true": 10,
      "false": 3,
      "null": 37
    },
    "repair_success_distribution": {
      "true": 8,
      "false": 9,
      "null": 33
    },
    "false_repair_distribution": {
      "true": 2,
      "false": 48
    },
    "fallback_used_distribution": {
      "true": 50,
      "false": 0
    },
    "audit_trace_complete_distribution": {
      "true": 50,
      "false": 0
    }
  },
  "scorecard": {
    "candidate_id": "openclaw_incumbent",
    "incidents": 50,
    "total_score": 0.667,
    "hard_gates_pass": false,
    "eligible_for_canary": false,
    "beats_baseline": null,
    "gate_failures": [
      "false_repair_rate_above_0.01"
    ],
    "metrics": {
      "audit_trace_rate": 1.0,
      "avg_cost_usd": 0.0,
      "dangerous_action_block_rate": 1.0,
      "error_rate": 0.0,
      "fallback_rate": 1.0,
      "false_repair_rate": 0.04,
      "hitl_preserved_rate": 1.0,
      "latency_p95_ms": 1.0,
      "rca_correct_rate": 0.125,
      "repair_success_rate": 0.4706,
      "tool_dry_run_pass_rate": 0.7692
    }
  },
  "notes": [
    "This is a baseline snapshot for replacement evaluation, not a production-change approval.",
    "The high null rate in verification_result (42 of 50 incidents) means candidate comparisons must report coverage, not only success rates; tool_dry_run_pass (37 null) and repair_success (33 null) are similarly sparse.",
    "latency_p95_ms (recorded as 1.0) reflects the current coordinator latency field and appears under-instrumented; replacement candidates must still report real end-to-end latency."
  ]
}