158 lines
5.7 KiB
Python
158 lines
5.7 KiB
Python
from __future__ import annotations
|
|
|
|
from src.services.agent_nemotron_external_runner_readiness import (
|
|
evaluate_nemotron_external_runner_readiness,
|
|
)
|
|
|
|
|
|
def test_readiness_accepts_sanitized_ready_pack():
    """Unmodified baseline manifest, sanitize report and preflight are ready."""
    report = evaluate_nemotron_external_runner_readiness(
        manifest=_manifest(),
        sanitize_report=_sanitize_report(),
        sanitized_preflight=_preflight(),
    ).to_dict()

    assert report["ready"] is True
    assert report["decision"] == "ready_for_approval"
    # Being "ready" must not lift the approval gate for external execution.
    assert report["gates"]["external_execution_still_requires_approval"] is True
    assert report["counts"]["manifest"]["requests"] == 50
    assert report["safety"]["raw_artifacts_committed"] is False
|
|
|
|
|
|
def test_readiness_blocks_unsanitized_or_invalid_preflight():
    """An invalid preflight that still reports sensitive markers is blocked."""
    # Start from the baseline preflight and flip it to a failing state that
    # reports four records still carrying sensitive markers.
    preflight = _preflight()
    preflight["valid"] = False
    preflight["failures"] = ["sensitive_marker_present_in_context:4"]
    preflight["sensitive_marker_present_in_context"] = True
    preflight["sensitive_marker_records"] = 4

    report = evaluate_nemotron_external_runner_readiness(
        manifest=_manifest(),
        sanitize_report=_sanitize_report(),
        sanitized_preflight=preflight,
    ).to_dict()

    assert report["ready"] is False
    assert report["decision"] == "blocked"
    assert "sanitized_preflight_invalid" in report["failures"]
    assert "sensitive_context_markers_present" in report["failures"]
|
|
|
|
|
|
def test_readiness_blocks_count_drift_and_external_call_drift():
    """Count drift and an already-performed external call both block readiness."""
    manifest = _manifest()
    # Drop the manifest to 49 records while the sanitize report and preflight
    # helpers still report 50, so the counts no longer agree across reports.
    manifest["request_pack"]["records"] = 49
    manifest["external_runner_output"]["required_records"] = 49
    # Claim that the external calls have already been performed.
    manifest["external_calls_performed_by_codex"] = True

    report = evaluate_nemotron_external_runner_readiness(
        manifest=manifest,
        sanitize_report=_sanitize_report(),
        sanitized_preflight=_preflight(),
    ).to_dict()

    assert report["ready"] is False
    assert "external_calls_already_performed_by_codex" in report["failures"]
    assert "record_counts_mismatch" in report["failures"]
    assert report["gates"]["counts_match_across_reports"] is False
|
|
|
|
|
|
def _manifest() -> dict:
    """Return a fresh baseline external-runner manifest for the tests.

    A new dict (including new nested dicts and lists) is built on every call,
    so a test may mutate the result without affecting other tests. All record
    counts in the baseline are 50.
    """
    return {
        "schema_version": "agent_nemotron_external_runner_manifest_v1",
        "candidate_id": "nemo_nemotron_fabric",
        "run_id": "nemotron-replay-prod-20260601165413",
        "status": "ready_for_approved_external_offline_runner_with_sanitized_pack",
        "external_calls_performed_by_codex": False,
        "approval_required_before_external_execution": True,
        "raw_artifacts_committed": False,
        "sanitize_report": "docs/evaluations/sanitize.json",
        "external_runner_preflight_report_sanitized": "docs/evaluations/preflight.json",
        "request_pack": {
            "local_path": "/tmp/run-sanitized-nemotron-requests.jsonl",
            "source_unsanitized_path": "/tmp/run-nemotron-requests.local.jsonl",
            "records": 50,
            "request_only_records": 50,
            "not_replacement_evidence_records": 50,
            "label_leak_records": 0,
            "sensitive_marker_records": 0,
        },
        "candidate_inputs": {
            "local_path": "/tmp/run-sanitized-candidate-inputs.jsonl",
            "source_unsanitized_path": "/tmp/run-candidate-inputs.jsonl",
            "records": 50,
            "label_leak_records": 0,
        },
        "fixtures": {
            "local_path": "/tmp/run-sanitized-fixtures.jsonl",
            "source_unsanitized_path": "/tmp/run-fixtures.jsonl",
            "records": 50,
            "expected_action_marker_records": 17,
            "operator_only": True,
        },
        "external_runner_output": {
            "required_path": "/tmp/run-external-results.jsonl",
            "schema": "docs/schemas/agent_nemotron_external_result_v1.schema.json",
            "required_records": 50,
            "one_result_per_request": True,
            "forbidden_model_output_fields": [
                "evaluation_labels",
                "verification_result",
                "execution_success",
                "execution_error",
                "self_healing_score",
                "rca_correct",
                "tool_dry_run_pass",
                "repair_success",
                "false_repair",
            ],
        },
        "preferred_post_external_run_command": (
            "apps/api/.venv/bin/python scripts/agents/nemotron-finalize-replay.py"
        ),
    }
|
|
|
|
|
|
def _sanitize_report() -> dict:
    """Return a fresh baseline sanitize report for the tests.

    The baseline is marked valid, lists no failures, and records four
    sensitive-marker records before sanitization and zero after. A new dict is
    built on every call so callers may mutate it freely.
    """
    return {
        "schema_version": "agent_nemotron_request_pack_sanitize_report_v1",
        "fixtures": 50,
        "candidate_inputs": 50,
        "requests": 50,
        "valid": True,
        "changed_fixture_records": 50,
        "sensitive_marker_records_before": 4,
        "sensitive_marker_records_after": 0,
        "marker_distribution_before": {"secret": 4},
        "marker_distribution_after": {},
        "preflight_valid": True,
        "preflight_failures": [],
        "failures": [],
    }
|
|
|
|
|
|
def _preflight() -> dict:
    """Return a fresh baseline sanitized-preflight report for the tests.

    The baseline is marked valid, lists no failures, duplicates, missing or
    unexpected entries, and reports no sensitive markers. A new dict is built
    on every call so a test can flip individual fields without leaking state
    into other tests.
    """
    return {
        "schema_version": "agent_nemotron_external_runner_preflight_v1",
        "candidate_id": "nemo_nemotron_fabric",
        "fixtures": 50,
        "candidate_inputs": 50,
        "requests": 50,
        "valid": True,
        "failures": [],
        "duplicate_fixtures": [],
        "duplicate_candidate_inputs": [],
        "duplicate_requests": [],
        "missing_candidate_inputs": [],
        "missing_requests": [],
        "unexpected_candidate_inputs": [],
        "unexpected_requests": [],
        "candidate_input_label_leak_records": 0,
        "request_context_label_leak_records": 0,
        "request_only_records": 50,
        "not_replacement_evidence_records": 50,
        "expected_action_marker_records": 17,
        "sensitive_marker_present_in_context": False,
        "sensitive_marker_records": 0,
        "sensitive_marker_distribution": {},
    }
|