418 lines
15 KiB
Python
418 lines
15 KiB
Python
"""
|
|
NeMo/Nemotron External Runner Readiness Gate
|
|
============================================
|
|
|
|
Combines the external-runner manifest, sanitize report, and sanitized preflight
|
|
report into one pre-execution decision. This module is local and deterministic:
|
|
it does not call NIM, NVIDIA APIs, tools, production systems, or LLMs.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass, field
|
|
from typing import Any
|
|
|
|
from src.services.agent_nemotron_replay_adapter import NEMOTRON_CANDIDATE_ID
|
|
|
|
# Schema identifier written under "schema_version" by
# NemotronExternalRunnerReadinessReport.to_dict().
READINESS_SCHEMA_VERSION = "agent_nemotron_external_runner_readiness_v1"
# Expected "schema_version" values of the three input documents (manifest,
# sanitize report, sanitized preflight); the readiness evaluation compares each
# input against its constant and fails the matching *_schema_valid gate otherwise.
MANIFEST_SCHEMA_VERSION = "agent_nemotron_external_runner_manifest_v1"
SANITIZE_SCHEMA_VERSION = "agent_nemotron_request_pack_sanitize_report_v1"
PREFLIGHT_SCHEMA_VERSION = "agent_nemotron_external_runner_preflight_v1"
# The only manifest "status" value accepted by the
# "manifest_status_sanitized_ready" gate.
READY_MANIFEST_STATUS = "ready_for_approved_external_offline_runner_with_sanitized_pack"
# Default for the ``minimum_records`` parameter of
# evaluate_nemotron_external_runner_readiness().
DEFAULT_MINIMUM_RECORDS = 50

# Field names that must all appear in the manifest's
# external_runner_output["forbidden_model_output_fields"] for the
# "external_output_contract_declared" gate to pass.
_SELF_GRADING_FIELDS = {
    "evaluation_labels",
    "verification_result",
    "execution_success",
    "execution_error",
    "self_healing_score",
    "rca_correct",
    "tool_dry_run_pass",
    "repair_success",
    "false_repair",
}
|
|
|
|
|
|
@dataclass(frozen=True)
class NemotronExternalRunnerReadinessReport:
    """Immutable outcome of the external-runner readiness evaluation.

    Holds the overall decision together with the per-gate results, the
    failure codes, and the count/artifact/safety summaries that were
    collected while evaluating the gates.
    """

    candidate_id: str
    run_id: str
    ready: bool
    decision: str
    minimum_records: int
    gates: dict[str, bool] = field(default_factory=dict)
    failures: list[str] = field(default_factory=list)
    counts: dict[str, Any] = field(default_factory=dict)
    artifacts: dict[str, Any] = field(default_factory=dict)
    safety: dict[str, Any] = field(default_factory=dict)
    next_actions: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return the report as a plain dict tagged with the readiness schema version.

        Container fields are shallow-copied, so replacing a top-level entry
        of the returned dict does not affect this report.
        """
        payload: dict[str, Any] = {"schema_version": READINESS_SCHEMA_VERSION}
        for scalar_name in (
            "candidate_id",
            "run_id",
            "ready",
            "decision",
            "minimum_records",
        ):
            payload[scalar_name] = getattr(self, scalar_name)
        payload["gates"] = dict(self.gates)
        payload["failures"] = list(self.failures)
        payload["counts"] = dict(self.counts)
        payload["artifacts"] = dict(self.artifacts)
        payload["safety"] = dict(self.safety)
        payload["next_actions"] = list(self.next_actions)
        return payload
|
|
|
|
|
|
def evaluate_nemotron_external_runner_readiness(
    *,
    manifest: dict[str, Any],
    sanitize_report: dict[str, Any],
    sanitized_preflight: dict[str, Any],
    minimum_records: int = DEFAULT_MINIMUM_RECORDS,
) -> NemotronExternalRunnerReadinessReport:
    """Evaluate whether the sanitized request pack is ready for approval.

    Every gate is evaluated (there is no early exit), so the returned report
    lists the pass/fail state of all gates and one failure code per failed
    gate, in evaluation order.

    Args:
        manifest: External-runner manifest document.
        sanitize_report: Report produced by the request-pack sanitizer.
        sanitized_preflight: Preflight report for the sanitized artifacts.
        minimum_records: Minimum "requests" count that the manifest, the
            sanitize report, and the preflight report must each reach.

    Returns:
        A report whose ``ready`` flag is True only when no gate failed;
        ``decision`` is "ready_for_approval" in that case and "blocked"
        otherwise.
    """
    failures: list[str] = []
    gates: dict[str, bool] = {}

    def gate(name: str, passed: bool, failure: str | None = None) -> None:
        gates[name] = bool(passed)
        if not passed:
            failures.append(failure or name)

    def is_zero(value: Any) -> bool:
        # A plain ``== 0`` would also accept ``False``; counts must be real
        # integers, matching how _coerce_int treats every other count.
        return _coerce_int(value) == 0

    def covers_all(subset: Any, total: Any) -> bool:
        # Both sides must be present integers. Comparing the raw values would
        # pass vacuously when both keys are missing (None == None).
        subset_count = _coerce_int(subset)
        return subset_count is not None and subset_count == _coerce_int(total)

    candidate_id = str(manifest.get("candidate_id") or "")
    run_id = str(manifest.get("run_id") or "")
    request_pack = _manifest_request_pack(manifest)
    candidate_inputs = _manifest_candidate_inputs(manifest)
    manifest_counts = _manifest_counts(manifest)
    sanitize_counts = _report_counts(sanitize_report)
    preflight_counts = _report_counts(sanitized_preflight)

    # --- Manifest identity and safety flags ---
    gate(
        "manifest_schema_valid",
        manifest.get("schema_version") == MANIFEST_SCHEMA_VERSION,
        "manifest_schema_mismatch",
    )
    gate(
        "candidate_is_nemotron_fabric",
        candidate_id == NEMOTRON_CANDIDATE_ID,
        "manifest_candidate_mismatch",
    )
    gate("run_id_present", bool(run_id.strip()), "manifest_run_id_missing")
    gate(
        "manifest_status_sanitized_ready",
        manifest.get("status") == READY_MANIFEST_STATUS,
        "manifest_status_not_sanitized_ready",
    )
    # The three flags below use identity checks on purpose: a missing or
    # non-boolean value must fail the gate rather than count as "safe".
    gate(
        "external_calls_not_performed_by_codex",
        manifest.get("external_calls_performed_by_codex") is False,
        "external_calls_already_performed_by_codex",
    )
    gate(
        "external_execution_still_requires_approval",
        manifest.get("approval_required_before_external_execution") is True,
        "approval_required_flag_missing",
    )
    gate(
        "raw_artifacts_not_committed",
        manifest.get("raw_artifacts_committed") is False,
        "raw_artifacts_committed_or_unknown",
    )

    # --- Sanitize report ---
    gate(
        "sanitize_report_schema_valid",
        sanitize_report.get("schema_version") == SANITIZE_SCHEMA_VERSION,
        "sanitize_report_schema_mismatch",
    )
    gate(
        "sanitize_report_valid",
        sanitize_report.get("valid") is True,
        "sanitize_report_invalid",
    )
    gate(
        "sanitize_preflight_valid",
        sanitize_report.get("preflight_valid") is True,
        "sanitize_report_preflight_invalid",
    )
    gate(
        "sanitize_failures_empty",
        not (sanitize_report.get("failures") or [])
        and not (sanitize_report.get("preflight_failures") or []),
        "sanitize_report_has_failures",
    )
    gate(
        "sanitize_sensitive_markers_removed",
        is_zero(sanitize_report.get("sensitive_marker_records_after")),
        "sanitize_sensitive_markers_remaining",
    )

    # --- Sanitized preflight report ---
    gate(
        "sanitized_preflight_schema_valid",
        sanitized_preflight.get("schema_version") == PREFLIGHT_SCHEMA_VERSION,
        "sanitized_preflight_schema_mismatch",
    )
    gate(
        "sanitized_preflight_candidate_valid",
        sanitized_preflight.get("candidate_id") == NEMOTRON_CANDIDATE_ID,
        "sanitized_preflight_candidate_mismatch",
    )
    gate(
        "sanitized_preflight_valid",
        sanitized_preflight.get("valid") is True,
        "sanitized_preflight_invalid",
    )
    gate(
        "sanitized_preflight_failures_empty",
        not sanitized_preflight.get("failures"),
        "sanitized_preflight_has_failures",
    )
    gate(
        "no_missing_extra_or_duplicate_records",
        _preflight_record_sets_clean(sanitized_preflight),
        "sanitized_preflight_record_set_not_clean",
    )

    # --- Leak / sensitivity checks across preflight and manifest ---
    gate(
        "no_label_leaks",
        is_zero(sanitized_preflight.get("candidate_input_label_leak_records"))
        and is_zero(sanitized_preflight.get("request_context_label_leak_records"))
        and is_zero(request_pack.get("label_leak_records"))
        and is_zero(candidate_inputs.get("label_leak_records")),
        "label_leak_records_present",
    )
    gate(
        "no_sensitive_context_markers",
        sanitized_preflight.get("sensitive_marker_present_in_context") is False
        and is_zero(sanitized_preflight.get("sensitive_marker_records"))
        and is_zero(request_pack.get("sensitive_marker_records")),
        "sensitive_context_markers_present",
    )
    gate(
        "request_pack_is_request_only",
        covers_all(
            sanitized_preflight.get("request_only_records"),
            sanitized_preflight.get("requests"),
        )
        and covers_all(
            request_pack.get("request_only_records"),
            request_pack.get("records"),
        ),
        "request_pack_not_fully_request_only",
    )
    gate(
        "request_pack_not_replacement_evidence",
        covers_all(
            sanitized_preflight.get("not_replacement_evidence_records"),
            sanitized_preflight.get("requests"),
        )
        and covers_all(
            request_pack.get("not_replacement_evidence_records"),
            request_pack.get("records"),
        ),
        "request_pack_contains_replacement_evidence",
    )

    # --- Record counts ---
    gate(
        "counts_match_across_reports",
        _counts_match(manifest_counts, sanitize_counts, preflight_counts),
        "record_counts_mismatch",
    )
    gate(
        "minimum_records_met",
        _count_value(manifest_counts, "requests") >= minimum_records
        and _count_value(sanitize_counts, "requests") >= minimum_records
        and _count_value(preflight_counts, "requests") >= minimum_records,
        "minimum_records_not_met",
    )

    # --- Artifact locations and the post-run contract ---
    gate(
        "manifest_uses_sanitized_tmp_artifacts",
        _uses_sanitized_tmp_artifacts(manifest),
        "manifest_not_pointing_to_sanitized_tmp_artifacts",
    )
    gate(
        "external_output_contract_declared",
        _external_output_contract_declared(
            manifest,
            expected_records=_count_value(manifest_counts, "requests"),
        ),
        "external_output_contract_incomplete",
    )
    gate(
        "post_external_finalizer_declared",
        bool(str(manifest.get("preferred_post_external_run_command") or "").strip()),
        "preferred_post_external_run_command_missing",
    )

    ready = not failures
    return NemotronExternalRunnerReadinessReport(
        candidate_id=candidate_id,
        run_id=run_id,
        ready=ready,
        decision="ready_for_approval" if ready else "blocked",
        minimum_records=minimum_records,
        gates=gates,
        failures=failures,
        counts={
            "manifest": manifest_counts,
            "sanitize_report": sanitize_counts,
            "sanitized_preflight": preflight_counts,
        },
        artifacts=_artifacts(manifest),
        safety=_safety(manifest, sanitized_preflight),
        next_actions=_next_actions(manifest, ready=ready),
    )
|
|
|
|
|
|
def _manifest_counts(manifest: dict[str, Any]) -> dict[str, Any]:
    """Collect the record counts declared by the manifest's artifact nodes.

    Values are returned as found (possibly None) under the same keys that
    _report_counts uses, so the results can be compared directly.
    """
    fixtures_node = _manifest_fixtures(manifest)
    candidate_inputs_node = _manifest_candidate_inputs(manifest)
    request_pack_node = _manifest_request_pack(manifest)
    return {
        "fixtures": fixtures_node.get("records"),
        "candidate_inputs": candidate_inputs_node.get("records"),
        "requests": request_pack_node.get("records"),
        "expected_action_marker_records": fixtures_node.get(
            "expected_action_marker_records"
        ),
    }
|
|
|
|
|
|
def _report_counts(report: dict[str, Any]) -> dict[str, Any]:
|
|
return {
|
|
"fixtures": report.get("fixtures"),
|
|
"candidate_inputs": report.get("candidate_inputs"),
|
|
"requests": report.get("requests"),
|
|
"expected_action_marker_records": report.get("expected_action_marker_records"),
|
|
}
|
|
|
|
|
|
def _counts_match(*counts: dict[str, Any]) -> bool:
    """Return True when all count dicts agree with each other.

    "fixtures", "candidate_inputs" and "requests" must be present as real
    integers in every dict and be identical across them.
    "expected_action_marker_records" is optional: dicts that omit it (or
    carry None) are ignored, but every value that *is* supplied must be a
    real integer and all supplied values must agree.
    """
    for key in ("fixtures", "candidate_inputs", "requests"):
        values = [_coerce_int(count.get(key)) for count in counts]
        if None in values or len(set(values)) != 1:
            return False

    supplied_markers = [
        count.get("expected_action_marker_records")
        for count in counts
        if count.get("expected_action_marker_records") is not None
    ]
    marker_values = [_coerce_int(raw) for raw in supplied_markers]
    # A supplied but non-integer marker count (e.g. "3" or True) coerces to
    # None. Treat that as a mismatch; otherwise several differing malformed
    # values would collapse into {None} and be reported as matching.
    if None in marker_values:
        return False
    return len(set(marker_values)) <= 1
|
|
|
|
|
|
def _count_value(counts: dict[str, Any], key: str) -> int:
    """Return counts[key] as an int, or 0 when it is missing or not a plain int."""
    coerced = _coerce_int(counts.get(key))
    return coerced if coerced else 0
|
|
|
|
|
|
def _coerce_int(value: Any) -> int | None:
|
|
if isinstance(value, bool):
|
|
return None
|
|
if isinstance(value, int):
|
|
return value
|
|
return None
|
|
|
|
|
|
def _preflight_record_sets_clean(preflight: dict[str, Any]) -> bool:
|
|
fields = (
|
|
"duplicate_fixtures",
|
|
"duplicate_candidate_inputs",
|
|
"duplicate_requests",
|
|
"missing_candidate_inputs",
|
|
"missing_requests",
|
|
"unexpected_candidate_inputs",
|
|
"unexpected_requests",
|
|
)
|
|
return all(not preflight.get(field) for field in fields)
|
|
|
|
|
|
def _uses_sanitized_tmp_artifacts(manifest: dict[str, Any]) -> bool:
    """Return True when fixtures, candidate inputs and request pack all point
    at sanitized artifacts under /tmp/.

    For each of the three manifest nodes, ``local_path`` must:

    * resolve (after collapsing ``.``/``..`` segments) to a location below
      ``/tmp/``, so ``/tmp/../etc/sanitized`` is rejected;
    * contain the marker "sanitized" somewhere other than inside the word
      "unsanitized" (``/tmp/unsanitized_requests.jsonl`` is rejected even
      though it contains the substring "sanitized");
    * differ from the node's ``source_unsanitized_path`` when one is given.
    """
    # Function-scope import keeps this fix self-contained.
    import posixpath

    nodes = (
        _manifest_fixtures(manifest),
        _manifest_candidate_inputs(manifest),
        _manifest_request_pack(manifest),
    )
    for node in nodes:
        raw_path = str(node.get("local_path") or "")
        if not raw_path:
            return False
        path = posixpath.normpath(raw_path)
        if not path.startswith("/tmp/"):
            return False
        # Keep the original case-sensitive marker requirement, and also
        # require that the marker survives once every (case-insensitive)
        # "unsanitized" is removed from the path.
        if "sanitized" not in path:
            return False
        if "sanitized" not in path.lower().replace("unsanitized", ""):
            return False
        source_path = str(node.get("source_unsanitized_path") or "")
        if source_path and posixpath.normpath(source_path) == path:
            return False
    return True
|
|
|
|
|
|
def _external_output_contract_declared(
    manifest: dict[str, Any],
    *,
    expected_records: int,
) -> bool:
    """Check that the manifest fully declares the external runner's output contract.

    The ``external_runner_output`` node must name a required path under
    /tmp/, reference the external-result JSON schema, require exactly
    ``expected_records`` records with one result per request, and forbid
    every self-grading field in model output.
    """
    contract = dict(manifest.get("external_runner_output") or {})
    declared_forbidden = {
        str(name) for name in contract.get("forbidden_model_output_fields") or []
    }

    required_path = str(contract.get("required_path") or "")
    if not required_path.startswith("/tmp/"):
        return False
    if contract.get("schema") != "docs/schemas/agent_nemotron_external_result_v1.schema.json":
        return False
    if not contract.get("required_records") == expected_records:
        return False
    if contract.get("one_result_per_request") is not True:
        return False
    return _SELF_GRADING_FIELDS.issubset(declared_forbidden)
|
|
|
|
|
|
def _artifacts(manifest: dict[str, Any]) -> dict[str, Any]:
    """Summarize the artifact references found in the manifest for the report."""
    runner_output = dict(manifest.get("external_runner_output") or {})
    summary: dict[str, Any] = {
        "request_pack": _manifest_request_pack(manifest),
        "candidate_inputs": _manifest_candidate_inputs(manifest),
        "fixtures": _manifest_fixtures(manifest),
    }
    summary["sanitize_report"] = manifest.get("sanitize_report")
    summary["sanitized_preflight_report"] = manifest.get(
        "external_runner_preflight_report_sanitized"
    )
    summary["external_results_required_path"] = runner_output.get("required_path")
    summary["preferred_post_external_run_command"] = manifest.get(
        "preferred_post_external_run_command"
    )
    return summary
|
|
|
|
|
|
def _safety(
|
|
manifest: dict[str, Any],
|
|
preflight: dict[str, Any],
|
|
) -> dict[str, Any]:
|
|
return {
|
|
"external_calls_performed_by_codex": manifest.get(
|
|
"external_calls_performed_by_codex"
|
|
),
|
|
"approval_required_before_external_execution": manifest.get(
|
|
"approval_required_before_external_execution"
|
|
),
|
|
"raw_artifacts_committed": manifest.get("raw_artifacts_committed"),
|
|
"sensitive_marker_records": preflight.get("sensitive_marker_records"),
|
|
"candidate_input_label_leak_records": preflight.get(
|
|
"candidate_input_label_leak_records"
|
|
),
|
|
"request_context_label_leak_records": preflight.get(
|
|
"request_context_label_leak_records"
|
|
),
|
|
"request_only_records": preflight.get("request_only_records"),
|
|
"not_replacement_evidence_records": preflight.get(
|
|
"not_replacement_evidence_records"
|
|
),
|
|
}
|
|
|
|
|
|
def _next_actions(manifest: dict[str, Any], *, ready: bool) -> list[str]:
|
|
if not ready:
|
|
return [
|
|
"Fix the readiness failures.",
|
|
"Regenerate sanitized fixtures, candidate inputs, and requests if needed.",
|
|
"Rerun sanitized preflight and readiness before any external execution.",
|
|
]
|
|
return [
|
|
"Obtain explicit commander approval before external execution.",
|
|
"Run the approved offline NeMo/NIM/Nemotron runner against the sanitized request pack only.",
|
|
"Write external results to "
|
|
f"{(manifest.get('external_runner_output') or {}).get('required_path')}.",
|
|
"Run the preferred post-external finalizer command.",
|
|
]
|
|
|
|
|
|
def _manifest_request_pack(manifest: dict[str, Any]) -> dict[str, Any]:
|
|
return dict(manifest.get("request_pack") or {})
|
|
|
|
|
|
def _manifest_candidate_inputs(manifest: dict[str, Any]) -> dict[str, Any]:
|
|
return dict(manifest.get("candidate_inputs") or {})
|
|
|
|
|
|
def _manifest_fixtures(manifest: dict[str, Any]) -> dict[str, Any]:
|
|
return dict(manifest.get("fixtures") or {})
|