Files
awoooi/docs/evaluations/nemotron_external_runner_manifest_2026-06-01.json
Your Name cfb866d055
Some checks failed
Ansible Lint / lint (push) Successful in 35s
CD Pipeline / tests (push) Failing after 13s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Failing after 11s
feat(governance): add agent market automation surfaces
2026-06-04 21:50:55 +08:00

123 lines
10 KiB
JSON

{
"schema_version": "agent_nemotron_external_runner_manifest_v1",
"generated_at": "2026-06-01T17:35:00+08:00",
"candidate_id": "nemo_nemotron_fabric",
"run_id": "nemotron-replay-prod-20260601165413",
"status": "external_replay_completed_blocked_failure_analyzed",
"external_replay_status": "completed_blocked_failure_analyzed",
"external_calls_performed_by_codex": true,
"approval_required_before_external_execution": true,
"raw_artifacts_committed": false,
"request_pack_smoke_report": "docs/evaluations/agent_nemotron_replay_request_pack_smoke_2026-06-01.json",
"external_runner_preflight_report_original": "docs/evaluations/agent_nemotron_external_runner_preflight_2026-06-01.json",
"sanitize_report": "docs/evaluations/agent_nemotron_request_pack_sanitize_2026-06-01.json",
"external_runner_preflight_report_sanitized": "docs/evaluations/agent_nemotron_external_runner_preflight_sanitized_2026-06-01.json",
"external_runner_readiness_report": "docs/evaluations/agent_nemotron_external_runner_readiness_2026-06-01.json",
"required_pre_external_run_command": "apps/api/.venv/bin/python scripts/agents/nemotron-external-runner-preflight.py --fixtures /tmp/nemotron-replay-prod-20260601165413-sanitized-fixtures.jsonl --inputs /tmp/nemotron-replay-prod-20260601165413-sanitized-candidate-inputs.jsonl --requests /tmp/nemotron-replay-prod-20260601165413-sanitized-nemotron-requests.jsonl --output /tmp/nemotron-replay-prod-20260601165413-sanitized-preflight.json",
"required_readiness_command": "apps/api/.venv/bin/python scripts/agents/nemotron-external-runner-readiness.py --manifest docs/evaluations/nemotron_external_runner_manifest_2026-06-01.json --sanitize-report docs/evaluations/agent_nemotron_request_pack_sanitize_2026-06-01.json --sanitized-preflight docs/evaluations/agent_nemotron_external_runner_preflight_sanitized_2026-06-01.json --output docs/evaluations/agent_nemotron_external_runner_readiness_2026-06-01.json",
"sanitize_command": "apps/api/.venv/bin/python scripts/agents/nemotron-sanitize-request-pack.py --fixtures /tmp/nemotron-replay-prod-20260601165413-fixtures.jsonl --output-fixtures /tmp/nemotron-replay-prod-20260601165413-sanitized-fixtures.jsonl --output-inputs /tmp/nemotron-replay-prod-20260601165413-sanitized-candidate-inputs.jsonl --output-requests /tmp/nemotron-replay-prod-20260601165413-sanitized-nemotron-requests.jsonl --report /tmp/nemotron-replay-prod-20260601165413-sanitize-report.json",
"request_pack": {
"local_path": "/tmp/nemotron-replay-prod-20260601165413-sanitized-nemotron-requests.jsonl",
"source_unsanitized_path": "/tmp/nemotron-replay-prod-20260601165413-nemotron-requests.local.jsonl",
"schema": "docs/schemas/agent_nemotron_replay_request_v1.schema.json",
"records": 50,
"request_only_records": 50,
"not_replacement_evidence_records": 50,
"label_leak_records": 0,
"sensitive_marker_records": 0
},
"candidate_inputs": {
"local_path": "/tmp/nemotron-replay-prod-20260601165413-sanitized-candidate-inputs.jsonl",
"source_unsanitized_path": "/tmp/nemotron-replay-prod-20260601165413-candidate-inputs.jsonl",
"schema": "docs/schemas/agent_replay_candidate_input_v1.schema.json",
"records": 50,
"label_leak_records": 0
},
"fixtures": {
"local_path": "/tmp/nemotron-replay-prod-20260601165413-sanitized-fixtures.jsonl",
"source_unsanitized_path": "/tmp/nemotron-replay-prod-20260601165413-fixtures.jsonl",
"schema": "docs/schemas/agent_replay_fixture_v1.schema.json",
"records": 50,
"expected_action_marker_records": 17,
"operator_only": true
},
"baseline_raw": {
"required_before_scoring": true,
"local_path": "/tmp/openclaw-incumbent.jsonl",
"schema": "docs/schemas/agent_replacement_replay_v1.schema.json",
"export_command": "apps/api/.venv/bin/python scripts/export-openclaw-incumbent-replay.py --output /tmp/openclaw-incumbent.jsonl --limit 50 --days 30",
"aggregate_snapshot": "docs/evaluations/openclaw_incumbent_baseline_2026-06-01.json"
},
"external_runner_output": {
"required_path": "/tmp/nemotron-replay-prod-20260601165413-external-results.jsonl",
"schema": "docs/schemas/agent_nemotron_external_result_v1.schema.json",
"required_records": 50,
"one_result_per_request": true,
"forbidden_model_output_fields": [
"evaluation_labels",
"verification_result",
"execution_success",
"execution_error",
"self_healing_score",
"rca_correct",
"tool_dry_run_pass",
"repair_success",
"false_repair"
],
"allowed_model_output_fields": [
"proposed_action",
"action_plan",
"risk_level",
"requires_human_approval",
"blocked_by_policy"
]
},
"external_runner_command": "apps/api/.venv/bin/python scripts/agents/nemotron-run-external-offline.py --readiness docs/evaluations/agent_nemotron_external_runner_readiness_2026-06-01.json --requests /tmp/nemotron-replay-prod-20260601165413-sanitized-nemotron-requests.jsonl --output /tmp/nemotron-replay-prod-20260601165413-external-results.jsonl --report /tmp/nemotron-replay-prod-20260601165413-external-runner-report.json",
"external_runner_report": {
"local_path": "/tmp/nemotron-replay-prod-20260601165413-external-runner-report.json",
"schema": "docs/schemas/agent_nemotron_external_runner_report_v1.schema.json",
"aggregate_snapshot": "docs/evaluations/agent_nemotron_external_runner_report_2026-06-01.json"
},
"external_replay_result": {
"decision": "blocked",
"finalizer_report": "docs/evaluations/agent_nemotron_replay_finalizer_prod_2026-06-01.json",
"scorecard": "docs/evaluations/agent_nemotron_replay_scorecard_2026-06-01.json",
"failure_analysis": "docs/evaluations/agent_nemotron_replay_failure_analysis_2026-06-01.json",
"runner_valid": false,
"external_error_records": 11,
"output_contract_incomplete_records": 11,
"unsafe_hitl_records": 7,
"candidate_total_score": 0.3076,
"openclaw_total_score": 0.7001,
"candidate_beats_baseline": false,
"promotion_gate_approved": false,
"next_variant_id": "nemo_nemotron_fabric_contract_tuned_v1"
},
"follow_up_variant_manifest": "docs/evaluations/nemotron_contract_tuned_runner_manifest_2026-06-01.json",
"safety_constraints": [
"The external runner may read only the NeMo request pack, not fixture labels.",
"The pre-external-run preflight must pass before the request pack is sent outside AWOOOI.",
"The unsanitized 50-record request pack was blocked because 4 records contained sensitive-context markers such as redacted htpasswd/pgpass/secret paths.",
"The sanitized 50-record request pack passed preflight with sensitive_marker_records=0.",
"The external runner readiness gate must pass with decision=ready_for_approval before approval is requested.",
"The external runner must not execute tools, mutate production systems, send Telegram messages, or open write credentials.",
"The external runner must return JSONL only; AWOOOI will apply hidden labels locally after import.",
"The request pack is not replacement evidence until import, contract validation, normalization, grading, scoring, and promotion gate all pass."
],
"preferred_post_external_run_command": "apps/api/.venv/bin/python scripts/agents/nemotron-finalize-replay.py --requests /tmp/nemotron-replay-prod-20260601165413-sanitized-nemotron-requests.jsonl --external-results /tmp/nemotron-replay-prod-20260601165413-external-results.jsonl --inputs /tmp/nemotron-replay-prod-20260601165413-sanitized-candidate-inputs.jsonl --fixtures /tmp/nemotron-replay-prod-20260601165413-sanitized-fixtures.jsonl --baseline /tmp/openclaw-incumbent.jsonl --output-prefix /tmp/nemotron-replay-prod-20260601165413 --target-stage shadow",
"preferred_failure_analysis_command": "apps/api/.venv/bin/python scripts/agents/analyze-nemotron-replay-failure.py --external-results /tmp/nemotron-replay-prod-20260601165413-external-results.jsonl --external-runner-report docs/evaluations/agent_nemotron_external_runner_report_2026-06-01.json --finalizer-report docs/evaluations/agent_nemotron_replay_finalizer_prod_2026-06-01.json --scorecard docs/evaluations/agent_nemotron_replay_scorecard_2026-06-01.json --output docs/evaluations/agent_nemotron_replay_failure_analysis_2026-06-01.json",
"manual_post_external_run_commands": [
"apps/api/.venv/bin/python scripts/agents/nemotron-import-replay-results.py --requests /tmp/nemotron-replay-prod-20260601165413-sanitized-nemotron-requests.jsonl --external-results /tmp/nemotron-replay-prod-20260601165413-external-results.jsonl --output /tmp/nemotron-replay-prod-20260601165413-candidate-raw.jsonl --report /tmp/nemotron-replay-prod-20260601165413-import-report.json",
"apps/api/.venv/bin/python scripts/agents/run-agent-replacement-replay.py --inputs /tmp/nemotron-replay-prod-20260601165413-sanitized-candidate-inputs.jsonl --results /tmp/nemotron-replay-prod-20260601165413-candidate-raw.jsonl --baseline /tmp/openclaw-incumbent.jsonl --candidate-id nemo_nemotron_fabric --fixtures /tmp/nemotron-replay-prod-20260601165413-sanitized-fixtures.jsonl --contract-report /tmp/nemotron-replay-prod-20260601165413-contract-report.json --normalized-output /tmp/nemotron-replay-prod-20260601165413-candidate-normalized.jsonl --graded-output /tmp/nemotron-replay-prod-20260601165413-candidate-graded.jsonl --grading-report /tmp/nemotron-replay-prod-20260601165413-grading-report.json --scorecard /tmp/nemotron-replay-prod-20260601165413-scorecard.json --summary /tmp/nemotron-replay-prod-20260601165413-pipeline-report.json",
"apps/api/.venv/bin/python scripts/agents/evaluate-agent-promotion-gate.py --candidate-id nemo_nemotron_fabric --scorecard /tmp/nemotron-replay-prod-20260601165413-scorecard.json --contract-report /tmp/nemotron-replay-prod-20260601165413-contract-report.json --raw-results /tmp/nemotron-replay-prod-20260601165413-candidate-raw.jsonl --import-report /tmp/nemotron-replay-prod-20260601165413-import-report.json --target-stage shadow --output /tmp/nemotron-replay-prod-20260601165413-promotion-gate.json"
],
"promotion_requires": [
"external_runner_preflight.valid=true before external execution",
"import_report.valid=true",
"contract_report.valid=true",
"grading_report.graded_records>0",
"scorecard.beats_baseline=true",
"promotion_gate.approved=true"
]
}