Files
awoooi/apps/api/tests/test_agent_nemotron_external_runner_readiness.py
Your Name cfb866d055
Some checks failed
Ansible Lint / lint (push) Successful in 35s
CD Pipeline / tests (push) Failing after 13s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Failing after 11s
feat(governance): add agent market automation surfaces
2026-06-04 21:50:55 +08:00

158 lines
5.7 KiB
Python

from __future__ import annotations
from src.services.agent_nemotron_external_runner_readiness import (
evaluate_nemotron_external_runner_readiness,
)
def test_readiness_accepts_sanitized_ready_pack():
    """The unmodified baseline fixtures are reported as ready for approval.

    All three inputs come straight from the module's fixture builders, whose
    record counts agree at 50 and whose sensitive-marker counts are zero.
    """
    report = evaluate_nemotron_external_runner_readiness(
        manifest=_manifest(),
        sanitize_report=_sanitize_report(),
        sanitized_preflight=_preflight(),
    ).to_dict()

    assert report["ready"] is True
    assert report["decision"] == "ready_for_approval"
    # Being ready must not waive the approval gate: the decision is
    # "ready_for_approval", not "approved".
    assert report["gates"]["external_execution_still_requires_approval"] is True
    assert report["counts"]["manifest"]["requests"] == 50
    assert report["safety"]["raw_artifacts_committed"] is False
def test_readiness_blocks_unsanitized_or_invalid_preflight():
    """An invalid preflight that still reports sensitive markers blocks readiness.

    Only the preflight is altered; the manifest and sanitize report stay at
    their baseline values so the failures can be attributed to the preflight.
    """
    preflight = _preflight()
    # Flip the baseline preflight into a failed, marker-carrying state.
    preflight.update(
        {
            "valid": False,
            "failures": ["sensitive_marker_present_in_context:4"],
            "sensitive_marker_present_in_context": True,
            "sensitive_marker_records": 4,
        }
    )

    report = evaluate_nemotron_external_runner_readiness(
        manifest=_manifest(),
        sanitize_report=_sanitize_report(),
        sanitized_preflight=preflight,
    ).to_dict()

    assert report["ready"] is False
    assert report["decision"] == "blocked"
    # The invalid flag and the marker presence are reported as separate failures.
    assert "sanitized_preflight_invalid" in report["failures"]
    assert "sensitive_context_markers_present" in report["failures"]
def test_readiness_blocks_count_drift_and_external_call_drift():
    """Manifest count drift and an already-performed external call both fail.

    The manifest is changed to claim 49 records while the sanitize report and
    preflight still say 50, and it is flagged as having already made external
    calls.
    """
    manifest = _manifest()
    # 49 disagrees with the 50 reported by the sanitize report and preflight.
    manifest["request_pack"]["records"] = 49
    manifest["external_runner_output"]["required_records"] = 49
    manifest["external_calls_performed_by_codex"] = True

    report = evaluate_nemotron_external_runner_readiness(
        manifest=manifest,
        sanitize_report=_sanitize_report(),
        sanitized_preflight=_preflight(),
    ).to_dict()

    assert report["ready"] is False
    assert "external_calls_already_performed_by_codex" in report["failures"]
    assert "record_counts_mismatch" in report["failures"]
    assert report["gates"]["counts_match_across_reports"] is False
def _manifest() -> dict:
    """Build the baseline external-runner manifest used by the tests.

    Every record count in the returned manifest is 50 and every leak or
    sensitive-marker count is 0. A new dict (including new nested dicts) is
    built on each call, so a test can mutate its copy without affecting
    other tests.
    """
    return {
        "schema_version": "agent_nemotron_external_runner_manifest_v1",
        "candidate_id": "nemo_nemotron_fabric",
        "run_id": "nemotron-replay-prod-20260601165413",
        "status": "ready_for_approved_external_offline_runner_with_sanitized_pack",
        "external_calls_performed_by_codex": False,
        "approval_required_before_external_execution": True,
        "raw_artifacts_committed": False,
        "sanitize_report": "docs/evaluations/sanitize.json",
        "external_runner_preflight_report_sanitized": "docs/evaluations/preflight.json",
        "request_pack": {
            "local_path": "/tmp/run-sanitized-nemotron-requests.jsonl",
            "source_unsanitized_path": "/tmp/run-nemotron-requests.local.jsonl",
            "records": 50,
            "request_only_records": 50,
            "not_replacement_evidence_records": 50,
            "label_leak_records": 0,
            "sensitive_marker_records": 0,
        },
        "candidate_inputs": {
            "local_path": "/tmp/run-sanitized-candidate-inputs.jsonl",
            "source_unsanitized_path": "/tmp/run-candidate-inputs.jsonl",
            "records": 50,
            "label_leak_records": 0,
        },
        "fixtures": {
            "local_path": "/tmp/run-sanitized-fixtures.jsonl",
            "source_unsanitized_path": "/tmp/run-fixtures.jsonl",
            "records": 50,
            "expected_action_marker_records": 17,
            "operator_only": True,
        },
        "external_runner_output": {
            "required_path": "/tmp/run-external-results.jsonl",
            "schema": "docs/schemas/agent_nemotron_external_result_v1.schema.json",
            "required_records": 50,
            "one_result_per_request": True,
            "forbidden_model_output_fields": [
                "evaluation_labels",
                "verification_result",
                "execution_success",
                "execution_error",
                "self_healing_score",
                "rca_correct",
                "tool_dry_run_pass",
                "repair_success",
                "false_repair",
            ],
        },
        "preferred_post_external_run_command": (
            "apps/api/.venv/bin/python scripts/agents/nemotron-finalize-replay.py"
        ),
    }
def _sanitize_report() -> dict:
    """Build the baseline sanitize report used by the tests.

    The report records 4 sensitive-marker records before sanitizing and 0
    after, with fixture, candidate-input and request counts all at 50. A new
    dict is built on each call.
    """
    return {
        "schema_version": "agent_nemotron_request_pack_sanitize_report_v1",
        "fixtures": 50,
        "candidate_inputs": 50,
        "requests": 50,
        "valid": True,
        "changed_fixture_records": 50,
        "sensitive_marker_records_before": 4,
        "sensitive_marker_records_after": 0,
        "marker_distribution_before": {"secret": 4},
        "marker_distribution_after": {},
        "preflight_valid": True,
        "preflight_failures": [],
        "failures": [],
    }
def _preflight() -> dict:
    """Build the baseline sanitized preflight report used by the tests.

    The report is marked valid, lists no failures, duplicates, missing or
    unexpected entries, and reports zero label-leak and sensitive-marker
    records. A new dict is built on each call, so a test can mutate its copy.
    """
    return {
        "schema_version": "agent_nemotron_external_runner_preflight_v1",
        "candidate_id": "nemo_nemotron_fabric",
        "fixtures": 50,
        "candidate_inputs": 50,
        "requests": 50,
        "valid": True,
        "failures": [],
        "duplicate_fixtures": [],
        "duplicate_candidate_inputs": [],
        "duplicate_requests": [],
        "missing_candidate_inputs": [],
        "missing_requests": [],
        "unexpected_candidate_inputs": [],
        "unexpected_requests": [],
        "candidate_input_label_leak_records": 0,
        "request_context_label_leak_records": 0,
        "request_only_records": 50,
        "not_replacement_evidence_records": 50,
        "expected_action_marker_records": 17,
        "sensitive_marker_present_in_context": False,
        "sensitive_marker_records": 0,
        "sensitive_marker_distribution": {},
    }