#!/usr/bin/env python3
"""
|
|
Apply AWOOOI fixture labels to normalized candidate replay JSONL.
|
|
|
|
This is a local evaluator step. It does not call candidate agents or execute
|
|
tools, and it ignores any candidate-supplied self-grading fields.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
|
|
ROOT = Path(__file__).resolve().parents[2]
|
|
API_SRC = ROOT / "apps" / "api"
|
|
sys.path.insert(0, str(API_SRC))
|
|
|
|
from src.services.agent_replay_label_grader import ( # noqa: E402
|
|
grade_replay_records_with_fixtures,
|
|
)
|
|
from src.services.agent_replacement_evaluator import AgentReplayRecord # noqa: E402
|
|
|
|
|
|
def main() -> int:
    """Run the fixture-label grading CLI and return the process exit code.

    Reads the fixture JSONL and the normalized replay JSONL named on the
    command line, passes both to ``grade_replay_records_with_fixtures``,
    writes the graded records to ``--output``, optionally writes the report
    as indented JSON to ``--report``, and prints the report as a single JSON
    line on stdout.

    Returns:
        ``0`` once all outputs have been written.
    """
    cli = argparse.ArgumentParser(
        description="Grade normalized candidate replay records with fixture labels."
    )
    required_options = (
        ("--fixtures", "agent_replay_fixture_v1 JSONL"),
        ("--input", "normalized replay JSONL"),
        ("--output", "graded replay JSONL"),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, required=True, help=help_text)
    cli.add_argument("--report", help="grading report JSON")
    options = cli.parse_args()

    # Fixtures are loaded before the replay records, matching the order in
    # which read errors would surface to the user.
    fixture_rows = _read_jsonl(Path(options.fixtures))
    candidate_records = _read_replay_jsonl(Path(options.input))
    graded, grading_report = grade_replay_records_with_fixtures(
        fixtures=fixture_rows,
        replay_records=candidate_records,
    )
    _write_replay_jsonl(Path(options.output), graded)

    summary = grading_report.to_dict()
    if options.report:
        pretty = json.dumps(summary, ensure_ascii=False, indent=2, sort_keys=True)
        Path(options.report).write_text(pretty + "\n", encoding="utf-8")

    # The compact report always goes to stdout, with or without --report.
    print(json.dumps(summary, ensure_ascii=False, sort_keys=True))
    return 0
|
|
|
|
|
|
def _read_jsonl(path: Path) -> list[dict[str, Any]]:
|
|
records: list[dict[str, Any]] = []
|
|
with path.open(encoding="utf-8") as handle:
|
|
for line_number, line in enumerate(handle, start=1):
|
|
line = line.strip()
|
|
if not line or line.startswith("#"):
|
|
continue
|
|
try:
|
|
records.append(json.loads(line))
|
|
except Exception as exc:
|
|
raise SystemExit(f"{path}:{line_number}: invalid JSONL: {exc}") from exc
|
|
return records
|
|
|
|
|
|
def _read_replay_jsonl(path: Path) -> list[AgentReplayRecord]:
    """Load a JSONL file and convert each entry to an ``AgentReplayRecord``.

    Args:
        path: Location of the replay JSONL file.

    Returns:
        One record per entry returned by ``_read_jsonl``, in file order,
        each built with ``AgentReplayRecord.from_dict``.
    """
    replay_records: list[AgentReplayRecord] = []
    for payload in _read_jsonl(path):
        replay_records.append(AgentReplayRecord.from_dict(payload))
    return replay_records
|
|
|
|
|
|
def _write_replay_jsonl(path: Path, records: list[AgentReplayRecord]) -> None:
|
|
with path.open("w", encoding="utf-8") as handle:
|
|
for record in records:
|
|
handle.write(json.dumps(record.__dict__, ensure_ascii=False, sort_keys=True))
|
|
handle.write("\n")
|
|
|
|
|
|
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())
|