# awoooi/apps/api/tests/test_agent_nemotron_external_runner_readiness.py

from __future__ import annotations

from src.services.agent_nemotron_external_runner_readiness import (
    evaluate_nemotron_external_runner_readiness,
)


def test_readiness_accepts_sanitized_ready_pack():
    """The unmodified fixture trio must be reported as ready for approval."""
    outcome = evaluate_nemotron_external_runner_readiness(
        manifest=_manifest(),
        sanitize_report=_sanitize_report(),
        sanitized_preflight=_preflight(),
    )
    summary = outcome.to_dict()

    # Overall verdict.
    assert summary["ready"] is True
    assert summary["decision"] == "ready_for_approval"

    # Supporting details exposed alongside the verdict.
    gates = summary["gates"]
    assert gates["external_execution_still_requires_approval"] is True
    manifest_counts = summary["counts"]["manifest"]
    assert manifest_counts["requests"] == 50
    safety = summary["safety"]
    assert safety["raw_artifacts_committed"] is False


def test_readiness_blocks_unsanitized_or_invalid_preflight():
    """An invalid preflight that still reports sensitive markers must block readiness."""
    tainted_preflight = _preflight()
    # Flip the preflight into a failed state that flags four sensitive records.
    tainted_preflight.update(
        {
            "valid": False,
            "failures": ["sensitive_marker_present_in_context:4"],
            "sensitive_marker_present_in_context": True,
            "sensitive_marker_records": 4,
        }
    )

    outcome = evaluate_nemotron_external_runner_readiness(
        manifest=_manifest(),
        sanitize_report=_sanitize_report(),
        sanitized_preflight=tainted_preflight,
    )
    summary = outcome.to_dict()

    assert summary["ready"] is False
    assert summary["decision"] == "blocked"

    reported_failures = summary["failures"]
    for expected_code in (
        "sanitized_preflight_invalid",
        "sensitive_context_markers_present",
    ):
        assert expected_code in reported_failures


def test_readiness_blocks_count_drift_and_external_call_drift():
    """A manifest with 49-record counts and a performed-external-call flag must block."""
    drifted_manifest = _manifest()
    # The other two reports still say 50, so these counts no longer agree.
    drifted_manifest["request_pack"]["records"] = 49
    drifted_manifest["external_runner_output"]["required_records"] = 49
    # Claim that external calls were already made.
    drifted_manifest["external_calls_performed_by_codex"] = True

    outcome = evaluate_nemotron_external_runner_readiness(
        manifest=drifted_manifest,
        sanitize_report=_sanitize_report(),
        sanitized_preflight=_preflight(),
    )
    summary = outcome.to_dict()

    assert summary["ready"] is False

    reported_failures = summary["failures"]
    for expected_code in (
        "external_calls_already_performed_by_codex",
        "record_counts_mismatch",
    ):
        assert expected_code in reported_failures

    assert summary["gates"]["counts_match_across_reports"] is False


def _manifest() -> dict:
    return {
        "schema_version": "agent_nemotron_external_runner_manifest_v1",
        "candidate_id": "nemo_nemotron_fabric",
        "run_id": "nemotron-replay-prod-20260601165413",
        "status": "ready_for_approved_external_offline_runner_with_sanitized_pack",
        "external_calls_performed_by_codex": False,
        "approval_required_before_external_execution": True,
        "raw_artifacts_committed": False,
        "sanitize_report": "docs/evaluations/sanitize.json",
        "external_runner_preflight_report_sanitized": "docs/evaluations/preflight.json",
        "request_pack": {
            "local_path": "/tmp/run-sanitized-nemotron-requests.jsonl",
            "source_unsanitized_path": "/tmp/run-nemotron-requests.local.jsonl",
            "records": 50,
            "request_only_records": 50,
            "not_replacement_evidence_records": 50,
            "label_leak_records": 0,
            "sensitive_marker_records": 0,
        },
        "candidate_inputs": {
            "local_path": "/tmp/run-sanitized-candidate-inputs.jsonl",
            "source_unsanitized_path": "/tmp/run-candidate-inputs.jsonl",
            "records": 50,
            "label_leak_records": 0,
        },
        "fixtures": {
            "local_path": "/tmp/run-sanitized-fixtures.jsonl",
            "source_unsanitized_path": "/tmp/run-fixtures.jsonl",
            "records": 50,
            "expected_action_marker_records": 17,
            "operator_only": True,
        },
        "external_runner_output": {
            "required_path": "/tmp/run-external-results.jsonl",
            "schema": "docs/schemas/agent_nemotron_external_result_v1.schema.json",
            "required_records": 50,
            "one_result_per_request": True,
            "forbidden_model_output_fields": [
                "evaluation_labels",
                "verification_result",
                "execution_success",
                "execution_error",
                "self_healing_score",
                "rca_correct",
                "tool_dry_run_pass",
                "repair_success",
                "false_repair",
            ],
        },
        "preferred_post_external_run_command": (
            "apps/api/.venv/bin/python scripts/agents/nemotron-finalize-replay.py"
        ),
    }


def _sanitize_report() -> dict:
    return {
        "schema_version": "agent_nemotron_request_pack_sanitize_report_v1",
        "fixtures": 50,
        "candidate_inputs": 50,
        "requests": 50,
        "valid": True,
        "changed_fixture_records": 50,
        "sensitive_marker_records_before": 4,
        "sensitive_marker_records_after": 0,
        "marker_distribution_before": {"secret": 4},
        "marker_distribution_after": {},
        "preflight_valid": True,
        "preflight_failures": [],
        "failures": [],
    }


def _preflight() -> dict:
    return {
        "schema_version": "agent_nemotron_external_runner_preflight_v1",
        "candidate_id": "nemo_nemotron_fabric",
        "fixtures": 50,
        "candidate_inputs": 50,
        "requests": 50,
        "valid": True,
        "failures": [],
        "duplicate_fixtures": [],
        "duplicate_candidate_inputs": [],
        "duplicate_requests": [],
        "missing_candidate_inputs": [],
        "missing_requests": [],
        "unexpected_candidate_inputs": [],
        "unexpected_requests": [],
        "candidate_input_label_leak_records": 0,
        "request_context_label_leak_records": 0,
        "request_only_records": 50,
        "not_replacement_evidence_records": 50,
        "expected_action_marker_records": 17,
        "sensitive_marker_present_in_context": False,
        "sensitive_marker_records": 0,
        "sensitive_marker_distribution": {},
    }