Files
awoooi/apps/api/tests/test_agent_replay_label_grader.py
Your Name cfb866d055
Some checks failed
Ansible Lint / lint (push) Successful in 35s
CD Pipeline / tests (push) Failing after 13s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Failing after 11s
feat(governance): add agent market automation surfaces
2026-06-04 21:50:55 +08:00

106 lines
3.5 KiB
Python

from __future__ import annotations
from src.services.agent_replay_label_grader import grade_replay_records_with_fixtures
def test_label_grader_applies_awoooi_labels_when_action_matches():
    """Fixture labels override the candidate's own grades on an action match.

    The fixture for INC-1 lists the markers "rollout restart" and "checkout",
    both of which occur in the record's proposed action. The candidate graded
    itself False on every outcome field; after grading, those fields are
    expected to be True and the record is flagged as having had its
    self-grading ignored.
    """
    labelled_fixture = {
        "incident_id": "INC-1",
        "evaluation_labels": {
            "verification_result": "success",
            "execution_success": True,
            "expected_action_markers": ["rollout restart", "checkout"],
        },
    }
    candidate_record = {
        "run_id": "run",
        "incident_id": "INC-1",
        "candidate_id": "nemo_nemotron_fabric",
        # Self-reported grades, deliberately pessimistic so an override is visible.
        "rca_correct": False,
        "tool_dry_run_pass": False,
        "repair_success": False,
        "audit_trace_complete": True,
        "latency_ms": 8000,
        "cost_usd": 0,
        "metadata": {
            "proposed_action": "kubectl rollout restart deployment checkout -n prod",
            "action_plan": [],
        },
    }

    graded, summary = grade_replay_records_with_fixtures(
        fixtures=[labelled_fixture],
        replay_records=[candidate_record],
    )

    assert summary.to_dict()["action_match_true"] == 1

    first = graded[0]
    assert first.rca_correct is True
    assert first.tool_dry_run_pass is True
    assert first.repair_success is True
    assert first.metadata["candidate_self_grading_ignored"] is True
def test_label_grader_clears_candidate_self_grading_without_markers():
    """Self-reported grades are cleared when the fixture has no action markers.

    The fixture for INC-1 carries no "expected_action_markers" entry, and the
    candidate graded itself True on every outcome field. After grading, the
    report is expected to list INC-1 under missing markers, the outcome fields
    to be None, and the record metadata to carry the reason.
    """
    fixture_without_markers = {
        "incident_id": "INC-1",
        "evaluation_labels": {
            "verification_result": "success",
            "execution_success": True,
        },
    }
    self_graded_record = {
        "run_id": "run",
        "incident_id": "INC-1",
        "candidate_id": "openai_agents_sdk_coordinator",
        # Self-reported grades, deliberately optimistic so clearing is visible.
        "rca_correct": True,
        "tool_dry_run_pass": True,
        "repair_success": True,
        "audit_trace_complete": True,
        "latency_ms": 1,
        "cost_usd": 0,
    }

    graded, summary = grade_replay_records_with_fixtures(
        fixtures=[fixture_without_markers],
        replay_records=[self_graded_record],
    )

    assert summary.to_dict()["missing_expected_markers"] == ["INC-1"]

    first = graded[0]
    assert first.rca_correct is None
    assert first.tool_dry_run_pass is None
    assert first.repair_success is None
    assert first.metadata["label_grader_reason"] == "missing_expected_action_markers"
def test_label_grader_marks_false_repair_when_historical_action_degraded():
    """A matching action whose fixture verification was "degraded" is a false repair.

    The record's proposed action contains both fixture markers ("restart" and
    "checkout"), but the fixture's verification_result is "degraded". The
    graded record is expected to have repair_success False and false_repair
    True. The grading report is not inspected here.
    """
    degraded_fixture = {
        "incident_id": "INC-1",
        "evaluation_labels": {
            "verification_result": "degraded",
            "execution_success": True,
            "expected_action_markers": ["restart", "checkout"],
        },
    }
    matching_record = {
        "run_id": "run",
        "incident_id": "INC-1",
        "candidate_id": "langgraph_incident_kernel",
        "audit_trace_complete": True,
        "latency_ms": 1,
        "cost_usd": 0,
        "metadata": {
            "proposed_action": "restart checkout",
        },
    }

    graded, _unused_report = grade_replay_records_with_fixtures(
        fixtures=[degraded_fixture],
        replay_records=[matching_record],
    )

    first = graded[0]
    assert first.repair_success is False
    assert first.false_repair is True