#!/usr/bin/env python3 """ Run the AWOOOI Agent replacement replay pipeline for one candidate. Pipeline: candidate input JSONL + candidate raw result JSONL -> contract validation -> normalized candidate replay JSONL -> OpenClaw baseline + candidate scorecard """ from __future__ import annotations import argparse import json import sys from pathlib import Path from typing import Any ROOT = Path(__file__).resolve().parents[2] API_SRC = ROOT / "apps" / "api" sys.path.insert(0, str(API_SRC)) from src.services.agent_replay_contract import ( # noqa: E402 validate_candidate_replay_contract, ) from src.services.agent_replay_label_grader import ( # noqa: E402 grade_replay_records_with_fixtures, ) from src.services.agent_replay_normalizer import ( # noqa: E402 CandidateReplayResult, normalize_candidate_result, ) from src.services.agent_replacement_evaluator import ( # noqa: E402 BASELINE_CANDIDATE_ID, MIN_INCIDENTS_FOR_CANARY, AgentReplayRecord, score_replay_records, ) def main() -> int: parser = argparse.ArgumentParser( description="Validate, normalize, and score one Agent replacement candidate." ) parser.add_argument("--inputs", required=True, help="candidate input JSONL") parser.add_argument("--results", required=True, help="candidate raw result JSONL") parser.add_argument("--baseline", required=True, help="OpenClaw baseline replay JSONL") parser.add_argument("--candidate-id", required=True, help="Expected candidate_id") parser.add_argument("--normalized-output", required=True, help="Normalized candidate JSONL") parser.add_argument("--fixtures", help="Optional internal fixture JSONL for local grading") parser.add_argument("--graded-output", help="Graded candidate replay JSONL") parser.add_argument("--grading-report", help="Local grading report JSON") parser.add_argument("--contract-report", required=True, help="Contract report JSON") parser.add_argument("--scorecard", required=True, help="Scorecard JSON") parser.add_argument("--summary", help="Pipeline summary JSON") parser.add_argument( "--baseline-id", default=BASELINE_CANDIDATE_ID, help=f"Baseline candidate id (default: {BASELINE_CANDIDATE_ID})", ) parser.add_argument( "--min-incidents", type=int, default=MIN_INCIDENTS_FOR_CANARY, help=f"Minimum incidents required for canary (default: {MIN_INCIDENTS_FOR_CANARY})", ) args = parser.parse_args() candidate_inputs = _read_jsonl(Path(args.inputs)) candidate_results = _read_jsonl(Path(args.results)) contract_report = validate_candidate_replay_contract( candidate_inputs=candidate_inputs, candidate_results=candidate_results, expected_candidate_id=args.candidate_id, ).to_dict() _write_json(Path(args.contract_report), contract_report) if not contract_report["valid"]: summary = _summary( args=args, contract_report=contract_report, normalized_records=0, scorecard_written=False, ) if args.summary: _write_json(Path(args.summary), summary) print(json.dumps(summary, ensure_ascii=False, sort_keys=True)) return 2 normalized_records = [ normalize_candidate_result(CandidateReplayResult.from_dict(payload)) for payload in candidate_results ] _write_replay_jsonl(Path(args.normalized_output), normalized_records) score_records = normalized_records grading_report: dict[str, Any] | None = None graded_records = 0 if args.fixtures: score_records, report = grade_replay_records_with_fixtures( fixtures=_read_jsonl(Path(args.fixtures)), replay_records=normalized_records, ) grading_report = report.to_dict() graded_records = len(score_records) if args.graded_output: _write_replay_jsonl(Path(args.graded_output), score_records) if args.grading_report: _write_json(Path(args.grading_report), grading_report) baseline_records = _read_replay_jsonl(Path(args.baseline)) report = score_replay_records( baseline_records + score_records, baseline_candidate_id=args.baseline_id, min_incidents_for_canary=args.min_incidents, ).to_dict() _write_json(Path(args.scorecard), report) summary = _summary( args=args, contract_report=contract_report, normalized_records=len(normalized_records), graded_records=graded_records, grading_report=grading_report, scorecard_written=True, ) if args.summary: _write_json(Path(args.summary), summary) print(json.dumps(summary, ensure_ascii=False, sort_keys=True)) return 0 def _summary( *, args, contract_report: dict[str, Any], normalized_records: int, scorecard_written: bool, graded_records: int = 0, grading_report: dict[str, Any] | None = None, ) -> dict[str, Any]: return { "schema_version": "agent_replay_pipeline_report_v1", "candidate_id": args.candidate_id, "inputs": args.inputs, "results": args.results, "baseline": args.baseline, "contract_report": args.contract_report, "normalized_output": args.normalized_output, "fixtures": args.fixtures, "graded_output": args.graded_output, "grading_report": args.grading_report, "scorecard": args.scorecard, "contract_valid": bool(contract_report.get("valid")), "input_records": int(contract_report.get("inputs", 0)), "result_records": int(contract_report.get("results", 0)), "normalized_records": normalized_records, "graded_records": graded_records, "label_grading_applied": bool(grading_report), "scorecard_written": scorecard_written, } def _read_jsonl(path: Path) -> list[dict[str, Any]]: records: list[dict[str, Any]] = [] with path.open(encoding="utf-8") as handle: for line_number, line in enumerate(handle, start=1): line = line.strip() if not line or line.startswith("#"): continue try: records.append(json.loads(line)) except Exception as exc: raise SystemExit(f"{path}:{line_number}: invalid JSONL: {exc}") from exc return records def _read_replay_jsonl(path: Path) -> list[AgentReplayRecord]: return [AgentReplayRecord.from_dict(payload) for payload in _read_jsonl(path)] def _write_replay_jsonl(path: Path, records: list[AgentReplayRecord]) -> None: with path.open("w", encoding="utf-8") as handle: for record in records: handle.write(json.dumps(record.__dict__, ensure_ascii=False, sort_keys=True)) handle.write("\n") def _write_json(path: Path, payload: dict[str, Any]) -> None: path.write_text( json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=True) + "\n", encoding="utf-8", ) if __name__ == "__main__": raise SystemExit(main())