# awoooi/apps/api/tests/test_agent_replay_label_grader.py

from __future__ import annotations

from src.services.agent_replay_label_grader import grade_replay_records_with_fixtures


def test_label_grader_applies_awoooi_labels_when_action_matches():
    """Fixture labels replace the candidate's own flags when the action matches.

    The replay record reports ``False`` for ``rca_correct``,
    ``tool_dry_run_pass`` and ``repair_success``, while its proposed action
    contains both expected markers from the fixture. The graded record is
    expected to carry ``True`` for all three and to be flagged as having had
    its self-grading ignored.
    """
    labelled_fixture = {
        "incident_id": "INC-1",
        "evaluation_labels": {
            "verification_result": "success",
            "execution_success": True,
            "expected_action_markers": ["rollout restart", "checkout"],
        },
    }
    candidate_record = {
        "run_id": "run",
        "incident_id": "INC-1",
        "candidate_id": "nemo_nemotron_fabric",
        # Self-reported outcomes; the assertions below expect these to be overridden.
        "rca_correct": False,
        "tool_dry_run_pass": False,
        "repair_success": False,
        "audit_trace_complete": True,
        "latency_ms": 8000,
        "cost_usd": 0,
        "metadata": {
            "proposed_action": "kubectl rollout restart deployment checkout -n prod",
            "action_plan": [],
        },
    }

    graded_records, grading_report = grade_replay_records_with_fixtures(
        fixtures=[labelled_fixture],
        replay_records=[candidate_record],
    )

    assert grading_report.to_dict()["action_match_true"] == 1

    graded = graded_records[0]
    assert graded.rca_correct is True
    assert graded.tool_dry_run_pass is True
    assert graded.repair_success is True
    assert graded.metadata["candidate_self_grading_ignored"] is True


def test_label_grader_clears_candidate_self_grading_without_markers():
    """Self-reported flags are cleared when the fixture has no action markers.

    The fixture's labels omit ``expected_action_markers`` and the replay record
    claims ``True`` for every outcome flag. The graded record is expected to
    hold ``None`` for those flags, record the reason in its metadata, and the
    report is expected to list the incident under ``missing_expected_markers``.
    """
    fixture_without_markers = {
        "incident_id": "INC-1",
        "evaluation_labels": {
            "verification_result": "success",
            "execution_success": True,
        },
    }
    self_graded_record = {
        "run_id": "run",
        "incident_id": "INC-1",
        "candidate_id": "openai_agents_sdk_coordinator",
        # Candidate-supplied outcomes that the grader is expected to discard.
        "rca_correct": True,
        "tool_dry_run_pass": True,
        "repair_success": True,
        "audit_trace_complete": True,
        "latency_ms": 1,
        "cost_usd": 0,
    }

    graded_records, grading_report = grade_replay_records_with_fixtures(
        fixtures=[fixture_without_markers],
        replay_records=[self_graded_record],
    )

    assert grading_report.to_dict()["missing_expected_markers"] == ["INC-1"]

    graded = graded_records[0]
    assert graded.rca_correct is None
    assert graded.tool_dry_run_pass is None
    assert graded.repair_success is None
    assert graded.metadata["label_grader_reason"] == "missing_expected_action_markers"


def test_label_grader_marks_false_repair_when_historical_action_degraded():
    """A matching action with a degraded verification counts as a false repair.

    The fixture labels the verification result as ``"degraded"`` and the
    record's proposed action contains both expected markers. The graded record
    is expected to have ``repair_success`` False and ``false_repair`` True.
    """
    degraded_fixture = {
        "incident_id": "INC-1",
        "evaluation_labels": {
            "verification_result": "degraded",
            "execution_success": True,
            "expected_action_markers": ["restart", "checkout"],
        },
    }
    candidate_record = {
        "run_id": "run",
        "incident_id": "INC-1",
        "candidate_id": "langgraph_incident_kernel",
        "audit_trace_complete": True,
        "latency_ms": 1,
        "cost_usd": 0,
        "metadata": {
            "proposed_action": "restart checkout",
        },
    }

    # The report is not inspected in this test.
    graded_records, _ = grade_replay_records_with_fixtures(
        fixtures=[degraded_fixture],
        replay_records=[candidate_record],
    )

    graded = graded_records[0]
    assert graded.repair_success is False
    assert graded.false_repair is True