awoooi/scripts/agents/grade-agent-replay-results.py

#!/usr/bin/env python3
"""
Apply AWOOOI fixture labels to normalized candidate replay JSONL.

This is a local evaluator step. It does not call candidate agents or execute
tools, and it ignores any candidate-supplied self-grading fields.
"""

from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path
from typing import Any


# Third ancestor of this file's resolved path: parents[0] is the script's own
# directory, parents[2] is two levels above that (presumably the repository
# root -- confirm against the checkout layout).
ROOT = Path(__file__).resolve().parents[2]
API_SRC = ROOT / "apps" / "api"
# Put apps/api at the front of sys.path so the `src.services...` imports that
# follow resolve when this file is run directly as a script.
sys.path.insert(0, str(API_SRC))

from src.services.agent_replay_label_grader import (  # noqa: E402
    grade_replay_records_with_fixtures,
)
from src.services.agent_replacement_evaluator import AgentReplayRecord  # noqa: E402


def main() -> int:
    """Run the grading CLI.

    Reads the fixture JSONL and the normalized replay JSONL, grades the replay
    records against the fixtures, writes the graded records to ``--output``,
    optionally writes an indented report to ``--report``, and always prints the
    report as a single JSON line on stdout.

    Returns:
        ``0`` once grading and all writes have completed.
    """
    cli = argparse.ArgumentParser(
        description="Grade normalized candidate replay records with fixture labels."
    )
    required_options = (
        ("--fixtures", "agent_replay_fixture_v1 JSONL"),
        ("--input", "normalized replay JSONL"),
        ("--output", "graded replay JSONL"),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, required=True, help=help_text)
    cli.add_argument("--report", help="grading report JSON")
    options = cli.parse_args()

    # Fixtures are loaded first, then the candidate records.
    fixture_rows = _read_jsonl(Path(options.fixtures))
    candidate_records = _read_replay_jsonl(Path(options.input))
    graded, summary = grade_replay_records_with_fixtures(
        fixtures=fixture_rows,
        replay_records=candidate_records,
    )
    _write_replay_jsonl(Path(options.output), graded)

    summary_payload = summary.to_dict()
    if options.report:
        # Human-readable copy on disk: indented, with a trailing newline.
        pretty = json.dumps(
            summary_payload, ensure_ascii=False, indent=2, sort_keys=True
        )
        Path(options.report).write_text(pretty + "\n", encoding="utf-8")

    # Compact one-line copy on stdout.
    print(json.dumps(summary_payload, ensure_ascii=False, sort_keys=True))
    return 0


def _read_jsonl(path: Path) -> list[dict[str, Any]]:
    """Load a JSONL file into a list of JSON objects.

    Blank lines and lines whose first non-whitespace character is ``#`` are
    skipped. The file is decoded as UTF-8, and a leading byte-order mark is
    tolerated.

    Args:
        path: File to read.

    Returns:
        One dict per non-skipped line, in file order.

    Raises:
        SystemExit: If a line is not valid JSON, or is valid JSON but not a
            JSON object. The message starts with ``path:line_number``.
    """
    records: list[dict[str, Any]] = []
    # utf-8-sig decodes BOM-less UTF-8 identically but drops a leading BOM,
    # which json.loads would otherwise reject on the first line.
    with path.open(encoding="utf-8-sig") as handle:
        for line_number, raw_line in enumerate(handle, start=1):
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            try:
                payload = json.loads(line)
            except (ValueError, RecursionError) as exc:
                # JSONDecodeError is a ValueError; RecursionError covers
                # pathologically nested input.
                raise SystemExit(f"{path}:{line_number}: invalid JSONL: {exc}") from exc
            if not isinstance(payload, dict):
                # Callers expect mappings; fail here, where the line number is
                # still known, rather than deep inside a consumer.
                raise SystemExit(
                    f"{path}:{line_number}: invalid JSONL: expected a JSON object, "
                    f"got {type(payload).__name__}"
                )
            records.append(payload)
    return records


def _read_replay_jsonl(path: Path) -> list[AgentReplayRecord]:
    """Read *path* as JSONL and build one ``AgentReplayRecord`` per entry.

    Each parsed line is passed to ``AgentReplayRecord.from_dict``; the
    resulting records are returned in file order.
    """
    parsed: list[AgentReplayRecord] = []
    for entry in _read_jsonl(path):
        parsed.append(AgentReplayRecord.from_dict(entry))
    return parsed


def _write_replay_jsonl(path: Path, records: list[AgentReplayRecord]) -> None:
    """Write *records* to *path* as UTF-8 JSONL, replacing any existing file.

    Each record contributes one line: its instance attribute dict
    (``__dict__``) dumped as JSON with sorted keys and non-ASCII characters
    left unescaped.
    """
    with path.open("w", encoding="utf-8") as sink:
        for item in records:
            serialized = json.dumps(item.__dict__, ensure_ascii=False, sort_keys=True)
            sink.write(serialized + "\n")


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())