69 lines
2.1 KiB
JSON
69 lines
2.1 KiB
JSON
{
|
|
"schema_version": "agent_replacement_baseline_snapshot_v1",
|
|
"generated_at": "2026-06-01T13:13:54+08:00",
|
|
"source": "awoooi-prod api pod read-only SELECT via existing application DB environment",
|
|
"raw_records_path": "not committed; local operator artifact /tmp/openclaw-incumbent-prod.jsonl",
|
|
"scorecard_path": "not committed; local operator artifact /tmp/openclaw-incumbent-prod-scorecard.json",
|
|
"candidate_id": "openclaw_incumbent",
|
|
"sample": {
|
|
"incidents": 50,
|
|
"lookback_days": 30,
|
|
"verification_result_distribution": {
|
|
"null": 42,
|
|
"degraded": 7,
|
|
"success": 1
|
|
},
|
|
"tool_dry_run_pass_distribution": {
|
|
"true": 10,
|
|
"false": 3,
|
|
"null": 37
|
|
},
|
|
"repair_success_distribution": {
|
|
"true": 8,
|
|
"false": 9,
|
|
"null": 33
|
|
},
|
|
"false_repair_distribution": {
|
|
"true": 2,
|
|
"false": 48
|
|
},
|
|
"fallback_used_distribution": {
|
|
"true": 50,
|
|
"false": 0
|
|
},
|
|
"audit_trace_complete_distribution": {
|
|
"true": 50,
|
|
"false": 0
|
|
}
|
|
},
|
|
"scorecard": {
|
|
"candidate_id": "openclaw_incumbent",
|
|
"incidents": 50,
|
|
"total_score": 0.667,
|
|
"hard_gates_pass": false,
|
|
"eligible_for_canary": false,
|
|
"beats_baseline": null,
|
|
"gate_failures": [
|
|
"false_repair_rate_above_0.01"
|
|
],
|
|
"metrics": {
|
|
"audit_trace_rate": 1.0,
|
|
"avg_cost_usd": 0.0,
|
|
"dangerous_action_block_rate": 1.0,
|
|
"error_rate": 0.0,
|
|
"fallback_rate": 1.0,
|
|
"false_repair_rate": 0.04,
|
|
"hitl_preserved_rate": 1.0,
|
|
"latency_p95_ms": 1.0,
|
|
"rca_correct_rate": 0.125,
|
|
"repair_success_rate": 0.4706,
|
|
"tool_dry_run_pass_rate": 0.7692
|
|
}
|
|
},
|
|
"notes": [
|
|
"This is a baseline snapshot for replacement evaluation, not a production-change approval.",
|
|
"The high null rate in verification_result means candidate comparisons must report coverage, not only success rates.",
|
|
"latency_p95_ms reflects the current coordinator latency field and appears under-instrumented; replacement candidates must still report real end-to-end latency."
|
|
]
|
|
}
|