# Source listing metadata: 106 lines, 3.5 KiB, Python.
from __future__ import annotations
|
|
|
|
from src.services.agent_replay_label_grader import grade_replay_records_with_fixtures
|
|
|
|
|
|
def test_label_grader_applies_awoooi_labels_when_action_matches():
    """Grades derived from fixture labels replace the candidate's own grades.

    The fixture reports a successful historical outcome and lists the action
    markers "rollout restart" and "checkout"; the replay record's proposed
    action text includes both. The record arrives self-graded as failing
    (all three grade fields False) and is expected to leave the grader with
    those fields True, with the self-grading flagged as ignored.
    """
    labelled_fixture = {
        "incident_id": "INC-1",
        "evaluation_labels": {
            "verification_result": "success",
            "execution_success": True,
            "expected_action_markers": ["rollout restart", "checkout"],
        },
    }
    candidate_record = {
        "run_id": "run",
        "incident_id": "INC-1",
        "candidate_id": "nemo_nemotron_fabric",
        "rca_correct": False,
        "tool_dry_run_pass": False,
        "repair_success": False,
        "audit_trace_complete": True,
        "latency_ms": 8000,
        "cost_usd": 0,
        "metadata": {
            "proposed_action": "kubectl rollout restart deployment checkout -n prod",
            "action_plan": [],
        },
    }

    records, report = grade_replay_records_with_fixtures(
        fixtures=[labelled_fixture],
        replay_records=[candidate_record],
    )

    # The report-level counter is checked first, then the single graded record.
    summary = report.to_dict()
    assert summary["action_match_true"] == 1

    graded = records[0]
    assert graded.rca_correct is True
    assert graded.tool_dry_run_pass is True
    assert graded.repair_success is True
    assert graded.metadata["candidate_self_grading_ignored"] is True
|
|
|
|
|
|
def test_label_grader_clears_candidate_self_grading_without_markers():
    """Self-reported grades are reset to None when the fixture has no markers.

    The fixture's evaluation labels carry no "expected_action_markers" entry,
    and the replay record claims success on every grade field. The grader is
    expected to list the incident under "missing_expected_markers", set the
    three grade fields to None, and record the reason in the record metadata.
    """
    fixture_without_markers = {
        "incident_id": "INC-1",
        "evaluation_labels": {
            "verification_result": "success",
            "execution_success": True,
        },
    }
    self_graded_record = {
        "run_id": "run",
        "incident_id": "INC-1",
        "candidate_id": "openai_agents_sdk_coordinator",
        "rca_correct": True,
        "tool_dry_run_pass": True,
        "repair_success": True,
        "audit_trace_complete": True,
        "latency_ms": 1,
        "cost_usd": 0,
    }

    records, report = grade_replay_records_with_fixtures(
        fixtures=[fixture_without_markers],
        replay_records=[self_graded_record],
    )

    summary = report.to_dict()
    assert summary["missing_expected_markers"] == ["INC-1"]

    graded = records[0]
    assert graded.rca_correct is None
    assert graded.tool_dry_run_pass is None
    assert graded.repair_success is None
    assert graded.metadata["label_grader_reason"] == "missing_expected_action_markers"
|
|
|
|
|
|
def test_label_grader_marks_false_repair_when_historical_action_degraded():
    """A matching action whose recorded verification was degraded is a false repair.

    The fixture records "verification_result" as "degraded" (while
    "execution_success" is True) and the replay record proposes
    "restart checkout", which includes both expected markers. The graded
    record is expected to have repair_success False and false_repair True.
    """
    degraded_fixture = {
        "incident_id": "INC-1",
        "evaluation_labels": {
            "verification_result": "degraded",
            "execution_success": True,
            "expected_action_markers": ["restart", "checkout"],
        },
    }
    matching_record = {
        "run_id": "run",
        "incident_id": "INC-1",
        "candidate_id": "langgraph_incident_kernel",
        "audit_trace_complete": True,
        "latency_ms": 1,
        "cost_usd": 0,
        "metadata": {
            "proposed_action": "restart checkout",
        },
    }

    # Only the graded records matter here; the report is discarded.
    records, _ = grade_replay_records_with_fixtures(
        fixtures=[degraded_fixture],
        replay_records=[matching_record],
    )

    graded = records[0]
    assert graded.repair_success is False
    assert graded.false_repair is True
|