# Tests for evaluate_agent_replay_promotion_gate (src.services.agent_replay_promotion_gate).
from __future__ import annotations
|
|
|
|
from src.services.agent_replay_promotion_gate import (
|
|
evaluate_agent_replay_promotion_gate,
|
|
)
|
|
|
|
|
|
def test_promotion_gate_blocks_contract_probe_even_with_valid_contract():
    """A contract-probe result must block promotion despite a valid contract.

    The only raw result carries an adapter error, an ``adapter_mode`` of
    ``contract_probe`` and the ``not_replacement_evidence`` flag, and no
    import report is passed.  The contract report and the scorecard entry
    both look healthy, yet the gate is expected to return a blocked decision
    listing every one of those problems.
    """
    candidate = "nemo_nemotron_fabric"
    contract = {
        "candidate_id": candidate,
        "valid": True,
        "inputs": 50,
        "results": 50,
    }
    probe_result = {
        "candidate_id": candidate,
        "error": "external_candidate_adapter_not_configured",
        "metadata": {
            "adapter_mode": "contract_probe",
            "not_replacement_evidence": True,
        },
    }
    scorecard_entry = {
        "candidate_id": candidate,
        "incidents": 50,
        "hard_gates_pass": True,
        "eligible_for_canary": True,
        "beats_baseline": True,
        "gate_failures": [],
        "total_score": 0.9,
    }

    gate_result = evaluate_agent_replay_promotion_gate(
        candidate_id=candidate,
        contract_report=contract,
        raw_results=[probe_result],
        scorecard_report={"candidates": [scorecard_entry]},
    )
    outcome = gate_result.to_dict()

    assert outcome["approved"] is False
    assert outcome["decision"] == "blocked"

    # Each problem planted in the fixtures must surface as its own failure.
    expected_failures = (
        "not_replacement_evidence_present:1",
        "contract_probe_result_present:1",
        "candidate_result_errors_present:1",
        "nemotron_import_report_missing",
    )
    for failure in expected_failures:
        assert failure in outcome["failures"]
|
|
|
|
|
|
def test_promotion_gate_approves_real_replay_when_all_gates_pass():
    """A clean real offline replay with a passing scorecard is approved.

    The raw result has no error and an ``adapter_mode`` of
    ``real_offline_replay``; the gate is expected to approve with an empty
    failure list.
    """
    candidate = "langgraph_incident_kernel"
    contract = {
        "candidate_id": candidate,
        "valid": True,
        "inputs": 50,
        "results": 50,
    }
    replay_result = {
        "candidate_id": candidate,
        "error": None,
        "metadata": {"adapter_mode": "real_offline_replay"},
    }
    scorecard_entry = {
        "candidate_id": candidate,
        "incidents": 50,
        "hard_gates_pass": True,
        "eligible_for_canary": True,
        "beats_baseline": True,
        "gate_failures": [],
        "total_score": 0.9,
    }

    gate_result = evaluate_agent_replay_promotion_gate(
        candidate_id=candidate,
        contract_report=contract,
        raw_results=[replay_result],
        scorecard_report={"candidates": [scorecard_entry]},
    )
    outcome = gate_result.to_dict()

    assert outcome["approved"] is True
    assert outcome["decision"] == "approved"
    assert outcome["failures"] == []
|
|
|
|
|
|
def test_promotion_gate_blocks_small_sample_and_missing_scorecard():
    """A candidate absent from the scorecard must not be approved.

    The contract report omits ``inputs``/``results``, the single raw result
    holds only a ``candidate_id``, and the scorecard lists no candidates.

    NOTE(review): the test name mentions a small sample, but only the
    ``scorecard_candidate_missing`` failure is asserted here -- confirm
    whether a sample-size failure should be checked as well.
    """
    candidate = "openai_agents_sdk_coordinator"
    contract = {
        "candidate_id": candidate,
        "valid": True,
    }
    bare_result = {"candidate_id": candidate}
    empty_scorecard = {"candidates": []}

    gate_result = evaluate_agent_replay_promotion_gate(
        candidate_id=candidate,
        contract_report=contract,
        raw_results=[bare_result],
        scorecard_report=empty_scorecard,
    )
    outcome = gate_result.to_dict()

    assert outcome["approved"] is False
    assert "scorecard_candidate_missing" in outcome["failures"]
|
|
|
|
|
|
def test_promotion_gate_requires_nemotron_import_report():
    """The nemotron candidate is blocked when no import report is supplied.

    The contract, raw result and scorecard are all clean, but
    ``import_report`` is not passed.  The gate is expected to refuse
    approval, report ``nemotron_import_report_missing`` and record the
    import-report evidence as ``{"provided": False}``.
    """
    candidate = "nemo_nemotron_fabric"
    contract = {
        "candidate_id": candidate,
        "valid": True,
        "inputs": 50,
        "results": 50,
    }
    replay_result = {
        "candidate_id": candidate,
        "error": None,
        "metadata": {"adapter_mode": "real_offline_replay"},
    }
    scorecard_entry = {
        "candidate_id": candidate,
        "incidents": 50,
        "hard_gates_pass": True,
        "eligible_for_canary": True,
        "beats_baseline": True,
        "gate_failures": [],
        "total_score": 0.9,
    }

    gate_result = evaluate_agent_replay_promotion_gate(
        candidate_id=candidate,
        contract_report=contract,
        raw_results=[replay_result],
        scorecard_report={"candidates": [scorecard_entry]},
    )
    outcome = gate_result.to_dict()

    assert outcome["approved"] is False
    assert "nemotron_import_report_missing" in outcome["failures"]
    assert outcome["evidence"]["import_report"] == {"provided": False}
|
|
|
|
|
|
def test_promotion_gate_accepts_valid_nemotron_import_report():
    """The nemotron candidate is approved once a clean import report is given.

    The import report is flagged valid, has no failures or duplicate /
    missing / unexpected results, and its counts (1 request, 1 imported
    result) match the contract report.  The gate is expected to approve and
    to echo ``provided`` and ``valid`` as true in the import-report evidence.
    """
    candidate = "nemo_nemotron_fabric"
    contract = {
        "candidate_id": candidate,
        "valid": True,
        "inputs": 1,
        "results": 1,
    }
    replay_result = {
        "candidate_id": candidate,
        "error": None,
        "metadata": {"adapter_mode": "real_offline_replay"},
    }
    clean_import = {
        "schema_version": "agent_nemotron_import_report_v1",
        "candidate_id": candidate,
        "external_results": 1,
        "imported_results": 1,
        "requests": 1,
        "valid": True,
        "failures": [],
        "duplicate_results": [],
        "missing_results": [],
        "unexpected_results": [],
        "external_error_records": 0,
        "fallback_used_records": 0,
        "incomplete_trace_records": 0,
        "total_cost_usd": 0,
        "avg_latency_ms": 1000,
        "p95_latency_ms": 1000,
    }
    scorecard_entry = {
        "candidate_id": candidate,
        "incidents": 50,
        "hard_gates_pass": True,
        "eligible_for_canary": True,
        "beats_baseline": True,
        "gate_failures": [],
        "total_score": 0.9,
    }

    gate_result = evaluate_agent_replay_promotion_gate(
        candidate_id=candidate,
        contract_report=contract,
        raw_results=[replay_result],
        import_report=clean_import,
        scorecard_report={"candidates": [scorecard_entry]},
    )
    outcome = gate_result.to_dict()

    assert outcome["approved"] is True
    assert outcome["evidence"]["import_report"]["provided"] is True
    assert outcome["evidence"]["import_report"]["valid"] is True
|
|
|
|
|
|
def test_promotion_gate_blocks_bad_import_report_counts():
    """An invalid import report whose counts disagree with the contract blocks.

    The contract report declares 2 inputs and 2 results while the import
    report covers 1 request and 1 imported result, is flagged invalid, lists
    one missing result and records one external error.  Each discrepancy is
    expected to appear as a separate failure.
    """
    candidate = "nemo_nemotron_fabric"
    contract = {
        "candidate_id": candidate,
        "valid": True,
        "inputs": 2,
        "results": 2,
    }
    replay_result = {
        "candidate_id": candidate,
        "error": None,
        "metadata": {"adapter_mode": "real_offline_replay"},
    }
    broken_import = {
        "schema_version": "agent_nemotron_import_report_v1",
        "candidate_id": candidate,
        "external_results": 1,
        "imported_results": 1,
        "requests": 1,
        "valid": False,
        "failures": ["missing_external_results:run::INC-2"],
        "duplicate_results": [],
        "missing_results": ["run::INC-2"],
        "unexpected_results": [],
        "external_error_records": 1,
        "fallback_used_records": 0,
        "incomplete_trace_records": 0,
    }
    scorecard_entry = {
        "candidate_id": candidate,
        "incidents": 50,
        "hard_gates_pass": True,
        "eligible_for_canary": True,
        "beats_baseline": True,
        "gate_failures": [],
        "total_score": 0.9,
    }

    gate_result = evaluate_agent_replay_promotion_gate(
        candidate_id=candidate,
        contract_report=contract,
        raw_results=[replay_result],
        import_report=broken_import,
        scorecard_report={"candidates": [scorecard_entry]},
    )
    outcome = gate_result.to_dict()

    assert outcome["approved"] is False

    # One failure per discrepancy planted in the import report above.
    expected_failures = (
        "import_report_invalid",
        "import_report_contract_result_count_mismatch:imported=1;contract=2",
        "import_report_contract_input_count_mismatch:requests=1;contract=2",
        "import_report_missing_results_present:1",
        "import_report_external_errors_present:1",
    )
    for failure in expected_failures:
        assert failure in outcome["failures"]
|