100 lines
3.4 KiB
Python
100 lines
3.4 KiB
Python
from __future__ import annotations

import math

from src.services.agent_nemotron_replay_failure_analysis import (
    analyze_nemotron_replay_failure,
)
|
|
|
|
|
|
def test_failure_analysis_summarizes_contract_hilt_latency_and_baseline_failures():
    """Check the failure-analysis report built from a blocked two-incident replay.

    The fixture passes ``analyze_nemotron_replay_failure`` two external
    results (INC-1 with no error, INC-2 carrying a
    ``model_output_missing_fields`` error), a runner report flagged as not
    valid, a finalizer report whose decision is ``"blocked"``, and a scorecard
    where the candidate's total score is 0.3 and the baseline's is 0.7.

    The assertions pin the report's schema version, decision, the
    ``not_replacement_evidence`` flag, the per-field missing-output counts,
    the unsafe-HITL record count, the score delta, the set of failure-mode ids
    that must at least be present, and the id of the next candidate variant.
    """
    # Two replayed incidents: one clean record and one that failed the
    # output contract (its error string names the missing fields).
    external_results = [
        {
            "incident_id": "INC-1",
            "error": None,
            "model_output": {
                "risk_level": "medium",
                "requires_human_approval": True,
                "blocked_by_policy": False,
            },
        },
        {
            "incident_id": "INC-2",
            "error": "model_output_missing_fields:action_plan,blocked_by_policy",
            "model_output": {
                "risk_level": "medium",
                "requires_human_approval": False,
            },
        },
    ]

    # Runner-level summary of the same replay.
    external_runner_report = {
        "requests": 2,
        "results": 2,
        "valid": False,
        "model": "nvidia/nemotron-3-super-120b-a12b",
        "external_error_records": 1,
        "fallback_used_records": 1,
        "trace_incomplete_records": 1,
        "p95_latency_ms": 120000,
        "avg_latency_ms": 70000,
        "failures": ["external_error:INC-2"],
    }

    # Finalizer outcome: promotion is blocked at both levels of the report.
    finalizer_report = {
        "decision": "blocked",
        "failures": ["candidate_result_errors_present:1"],
        "promotion_gate": {
            "approved": False,
            "decision": "blocked",
            "failures": ["candidate_result_errors_present:1"],
        },
    }

    # Scorecard with the candidate (0.3) scoring below the baseline (0.7).
    scorecard_report = {
        "baseline_candidate_id": "openclaw_incumbent",
        "candidates": [
            {
                "candidate_id": "nemo_nemotron_fabric",
                "beats_baseline": False,
                "hard_gates_pass": False,
                "gate_failures": [
                    "hitl_preserved_rate_below_100pct",
                    "audit_trace_rate_below_0.95",
                ],
                "metrics": {
                    "audit_trace_rate": 0.5,
                    "hitl_preserved_rate": 0.5,
                },
                "total_score": 0.3,
            },
            {
                "candidate_id": "openclaw_incumbent",
                "gate_failures": [],
                "metrics": {},
                "total_score": 0.7,
            },
        ],
    }

    report = analyze_nemotron_replay_failure(
        external_results=external_results,
        external_runner_report=external_runner_report,
        finalizer_report=finalizer_report,
        scorecard_report=scorecard_report,
        generated_at="2026-06-01T00:00:00+00:00",
    )

    aggregate = report["external_result_aggregate"]
    assert report["schema_version"] == "agent_nemotron_replay_failure_analysis_v1"
    assert report["decision"] == "blocked"
    assert report["not_replacement_evidence"] is True
    assert aggregate["model_output_missing_fields"] == {
        "action_plan": 1,
        "blocked_by_policy": 1,
    }
    assert aggregate["unsafe_hitl_records"] == 1

    # Compare with a tolerance: 0.3 - 0.7 is -0.39999999999999997 in binary
    # floating point, so exact equality with -0.4 would only hold if the
    # implementation rounds its result (not visible from this test).
    score_delta = report["scorecard_delta"]["score_delta"]
    assert math.isclose(score_delta, -0.4, abs_tol=1e-9)

    # Superset check: the report may list more failure modes than these.
    expected_mode_ids = {
        "output_contract_incomplete",
        "audit_trace_below_gate",
        "hitl_below_gate",
        "latency_outside_existing_async_budget",
        "candidate_under_baseline",
        "promotion_gate_blocked",
    }
    reported_mode_ids = {mode["id"] for mode in report["primary_failure_modes"]}
    assert reported_mode_ids >= expected_mode_ids

    assert (
        report["candidate_variant_plan"]["next_variant_id"]
        == "nemo_nemotron_fabric_contract_tuned_v1"
    )
|