# awoooi/apps/api/src/services/agent_replay_label_grader.py

"""
Agent Replay Label Grader
=========================

Applies AWOOOI-owned fixture labels to normalized candidate replay records.

Candidate adapters must not provide RCA / dry-run / repair success grades. This
module joins internal fixtures with normalized candidate outputs after replay and
fills scorecard fields only when AWOOOI has enough label evidence.
"""

from __future__ import annotations

import json
from dataclasses import dataclass, field, replace
from typing import Any

from src.services.agent_replacement_evaluator import AgentReplayRecord


@dataclass(frozen=True)
class AgentReplayGradingReport:
    """Immutable coverage summary produced by one label-grading pass."""

    # Total number of replay records that were processed.
    records: int
    # Number of records that received an action-match verdict.
    graded_records: int
    # Incident ids of records for which no fixture was found.
    missing_fixtures: list[str] = field(default_factory=list)
    # Incident ids whose fixture carried no usable expected action markers.
    missing_expected_markers: list[str] = field(default_factory=list)
    # Count of graded records whose action matched the expected markers.
    action_match_true: int = 0
    # Count of graded records whose action did not match the expected markers.
    action_match_false: int = 0

    def to_dict(self) -> dict[str, Any]:
        """Return the report as a plain dict tagged with its schema version.

        The list-valued fields are shallow-copied so callers can mutate the
        returned payload without touching the lists held by this report.
        """
        payload: dict[str, Any] = {
            "schema_version": "agent_replay_grading_report_v1",
        }
        payload["records"] = self.records
        payload["graded_records"] = self.graded_records
        payload["missing_fixtures"] = [*self.missing_fixtures]
        payload["missing_expected_markers"] = [*self.missing_expected_markers]
        payload["action_match_true"] = self.action_match_true
        payload["action_match_false"] = self.action_match_false
        return payload


def grade_replay_records_with_fixtures(
    *,
    fixtures: list[dict[str, Any]],
    replay_records: list[AgentReplayRecord | dict[str, Any]],
) -> tuple[list[AgentReplayRecord], AgentReplayGradingReport]:
    """Apply fixture evaluation labels to normalized replay records.

    Each replay record is joined to a fixture by incident id. Records without
    a fixture, or whose fixture has no expected action markers, have their
    grade fields cleared and are listed in the report; all other records are
    graded from the fixture's ``evaluation_labels``.

    Args:
        fixtures: Fixture dicts; each is looked up by its ``incident_id`` and
            may carry an ``evaluation_labels`` mapping.
        replay_records: Replay records, either as ``AgentReplayRecord``
            instances or as dicts accepted by ``AgentReplayRecord.from_dict``.

    Returns:
        A ``(graded_records, report)`` tuple. ``graded_records`` has one entry
        per input record, in input order.
    """
    fixture_index = _index_fixtures(fixtures)
    normalized = [
        record if isinstance(record, AgentReplayRecord) else AgentReplayRecord.from_dict(record)
        for record in replay_records
    ]

    graded: list[AgentReplayRecord] = []
    missing_fixtures: list[str] = []
    missing_expected_markers: list[str] = []
    action_match_true = 0
    action_match_false = 0

    for record in normalized:
        # The fixture index is keyed by the stringified, whitespace-stripped
        # incident id, so the lookup key must be normalized the same way or a
        # non-string / padded id on the record would never find its fixture.
        raw_incident_id = record.incident_id
        lookup_key = "" if raw_incident_id is None else str(raw_incident_id).strip()
        fixture = fixture_index.get(lookup_key)
        if fixture is None:
            missing_fixtures.append(record.incident_id)
            graded.append(_clear_candidate_self_grades(record, reason="missing_fixture"))
            continue

        labels = dict(fixture.get("evaluation_labels") or {})
        markers = _expected_action_markers(labels)
        if not markers:
            # Without expected markers there is nothing to compare the
            # candidate action against, so no grade is assigned.
            missing_expected_markers.append(record.incident_id)
            graded.append(
                _clear_candidate_self_grades(
                    record,
                    reason="missing_expected_action_markers",
                    labels=labels,
                )
            )
            continue

        action_match = _action_matches(record, markers)
        if action_match:
            action_match_true += 1
        else:
            action_match_false += 1
        graded.append(_grade_record(record, labels=labels, action_match=action_match))

    report = AgentReplayGradingReport(
        records=len(normalized),
        graded_records=action_match_true + action_match_false,
        missing_fixtures=missing_fixtures,
        missing_expected_markers=missing_expected_markers,
        action_match_true=action_match_true,
        action_match_false=action_match_false,
    )
    return graded, report


def _grade_record(
    record: AgentReplayRecord,
    *,
    labels: dict[str, Any],
    action_match: bool,
) -> AgentReplayRecord:
    """Return a copy of ``record`` with grade fields derived from ``labels``.

    When the candidate action matched, ``rca_correct`` and ``repair_success``
    take the verification outcome and ``tool_dry_run_pass`` takes the
    execution outcome (each may be ``None`` when the label is absent). When it
    did not match, all three are ``False``. ``false_repair`` is set only for a
    matched action that executed successfully but failed verification.
    Grader details are merged into the record's metadata.
    """
    verified = _verification_success(labels)
    executed = _optional_bool(labels.get("execution_success"))

    if action_match:
        rca_grade = verified
        repair_grade = verified
        dry_run_grade = executed
        is_false_repair = executed is True and verified is False
    else:
        rca_grade = False
        repair_grade = False
        dry_run_grade = False
        is_false_repair = False

    grader_metadata = {
        **record.metadata,
        "candidate_self_grading_ignored": True,
        "label_grader": "agent_replay_label_grader_v1",
        "label_grader_action_match": action_match,
        "label_grader_expected_markers": _expected_action_markers(labels),
        "label_grader_verification_result": labels.get("verification_result"),
        "label_grader_execution_success": executed,
    }
    return replace(
        record,
        rca_correct=rca_grade,
        tool_dry_run_pass=dry_run_grade,
        repair_success=repair_grade,
        false_repair=is_false_repair,
        metadata=grader_metadata,
    )


def _clear_candidate_self_grades(
    record: AgentReplayRecord,
    *,
    reason: str,
    labels: dict[str, Any] | None = None,
) -> AgentReplayRecord:
    return replace(
        record,
        rca_correct=None,
        tool_dry_run_pass=None,
        repair_success=None,
        false_repair=False,
        metadata={
            **record.metadata,
            "candidate_self_grading_ignored": True,
            "label_grader": "agent_replay_label_grader_v1",
            "label_grader_reason": reason,
            "label_grader_verification_result": (labels or {}).get("verification_result"),
        },
    )


def _index_fixtures(fixtures: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
    indexed: dict[str, dict[str, Any]] = {}
    for fixture in fixtures:
        incident_id = str(fixture.get("incident_id", "")).strip()
        if incident_id:
            indexed[incident_id] = fixture
    return indexed


def _expected_action_markers(labels: dict[str, Any]) -> list[str]:
    raw = labels.get("expected_action_markers") or []
    if isinstance(raw, str):
        raw = [raw]
    if not isinstance(raw, list):
        return []
    return [
        marker.strip().lower()
        for marker in (str(item) for item in raw)
        if marker.strip()
    ]


def _action_matches(record: AgentReplayRecord, markers: list[str]) -> bool:
    action_bundle = json.dumps(
        {
            "proposed_action": record.metadata.get("proposed_action"),
            "action_plan": record.metadata.get("action_plan"),
        },
        ensure_ascii=False,
        sort_keys=True,
    ).lower()
    return all(marker in action_bundle for marker in markers)


def _verification_success(labels: dict[str, Any]) -> bool | None:
    value = labels.get("verification_result")
    if value is None:
        return None
    return str(value).lower() == "success"


def _optional_bool(value: Any) -> bool | None:
    if value is None:
        return None
    return bool(value)