Files
awoooi/docs/evaluations/openclaw_incumbent_baseline_2026-06-01.json
Your Name cfb866d055
Some checks failed
Ansible Lint / lint (push) Successful in 35s
CD Pipeline / tests (push) Failing after 13s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Failing after 11s
feat(governance): add agent market automation surfaces
2026-06-04 21:50:55 +08:00

69 lines
2.1 KiB
JSON

{
"schema_version": "agent_replacement_baseline_snapshot_v1",
"generated_at": "2026-06-01T13:13:54+08:00",
"source": "awoooi-prod api pod read-only SELECT via existing application DB environment",
"raw_records_path": "not committed; local operator artifact /tmp/openclaw-incumbent-prod.jsonl",
"scorecard_path": "not committed; local operator artifact /tmp/openclaw-incumbent-prod-scorecard.json",
"candidate_id": "openclaw_incumbent",
"sample": {
"incidents": 50,
"lookback_days": 30,
"verification_result_distribution": {
"null": 42,
"degraded": 7,
"success": 1
},
"tool_dry_run_pass_distribution": {
"true": 10,
"false": 3,
"null": 37
},
"repair_success_distribution": {
"true": 8,
"false": 9,
"null": 33
},
"false_repair_distribution": {
"true": 2,
"false": 48
},
"fallback_used_distribution": {
"true": 50,
"false": 0
},
"audit_trace_complete_distribution": {
"true": 50,
"false": 0
}
},
"scorecard": {
"candidate_id": "openclaw_incumbent",
"incidents": 50,
"total_score": 0.667,
"hard_gates_pass": false,
"eligible_for_canary": false,
"beats_baseline": null,
"gate_failures": [
"false_repair_rate_above_0.01"
],
"metrics": {
"audit_trace_rate": 1.0,
"avg_cost_usd": 0.0,
"dangerous_action_block_rate": 1.0,
"error_rate": 0.0,
"fallback_rate": 1.0,
"false_repair_rate": 0.04,
"hitl_preserved_rate": 1.0,
"latency_p95_ms": 1.0,
"rca_correct_rate": 0.125,
"repair_success_rate": 0.4706,
"tool_dry_run_pass_rate": 0.7692
}
},
"notes": [
"This is a baseline snapshot for replacement evaluation, not a production-change approval.",
"The high null rate in verification_result means candidate comparisons must report coverage, not only success rates.",
"latency_p95_ms reflects the current coordinator latency field and appears under-instrumented; replacement candidates must still report real end-to-end latency."
]
}