# File viewer metadata: 194 lines, 6.5 KiB, Python.
from __future__ import annotations
|
|
|
|
import pytest
|
|
|
|
from src.services.agent_nemotron_external_runner import (
|
|
NemotronExternalRunnerConfig,
|
|
run_nemotron_external_replay,
|
|
)
|
|
from src.services.agent_nemotron_replay_adapter import (
|
|
NEMOTRON_CONTRACT_TUNED_VARIANT_ID,
|
|
)
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_external_runner_writes_valid_result_from_json_response():
    """A well-formed JSON completion should yield one valid, error-free result.

    The fake client returns a single chat-completion payload whose message
    content is a JSON object carrying all five contract fields, plus token
    usage that the test expects to find in the first trace event.
    """
    # Adjacent literals are concatenated into one JSON document.
    model_reply = (
        '{"proposed_action":"rollout restart checkout",'
        '"action_plan":["inspect deployment","restart"],'
        '"risk_level":"medium",'
        '"requires_human_approval":true,'
        '"blocked_by_policy":false}'
    )
    token_usage = {"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30}
    fake_client = _FakeClient(
        {
            "choices": [{"message": {"content": model_reply}}],
            "usage": token_usage,
        }
    )

    results, report = await run_nemotron_external_replay(
        requests=[_request()],
        config=NemotronExternalRunnerConfig(api_key="test-key"),
        client=fake_client,
    )

    # Report-level expectations.
    assert report.valid is True
    assert report.results == 1

    # Record-level expectations for the single replayed request.
    record = results[0]
    assert record["schema_version"] == "agent_nemotron_external_result_v1"
    assert record["model_output"]["risk_level"] == "medium"
    assert record["model_output"]["requires_human_approval"] is True
    assert record["error"] is None
    assert record["trace_events"][0]["usage"]["total_tokens"] == 30
    assert record["retry_used"] is False
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_external_runner_fails_closed_on_invalid_model_output():
    """A non-JSON completion should produce a conservative fallback record.

    The test expects the report to be invalid with one external error record,
    and the stored model output to be marked as blocked by policy and as
    requiring human approval.
    """
    unparseable_reply = {"choices": [{"message": {"content": "not json"}}]}
    fake_client = _FakeClient(unparseable_reply)

    results, report = await run_nemotron_external_replay(
        requests=[_request()],
        config=NemotronExternalRunnerConfig(api_key="test-key"),
        client=fake_client,
    )

    assert report.valid is False
    assert report.external_error_records == 1

    record = results[0]
    assert record["fallback_used"] is True
    assert record["trace_complete"] is False

    fallback_output = record["model_output"]
    assert fallback_output["blocked_by_policy"] is True
    assert fallback_output["requires_human_approval"] is True
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_contract_tuned_runner_retries_missing_fields_once():
    """The contract-tuned variant should retry once after an incomplete reply.

    The fake client is scripted with two responses: the first contains only
    ``proposed_action``; the second contains every field listed in the
    request's ``response_contract``. The test expects exactly two calls, a
    corrective second prompt, and a final record without an error.
    """
    # Request flagged as the contract-tuned variant with an explicit contract.
    request = _request()
    request["metadata"]["candidate_variant_id"] = NEMOTRON_CONTRACT_TUNED_VARIANT_ID
    request["metadata"]["prompt_profile"] = "contract_tuned_v1"
    request["response_contract"] = {
        "required": [
            "proposed_action",
            "action_plan",
            "risk_level",
            "requires_human_approval",
            "blocked_by_policy",
        ],
    }

    # First scripted reply: four of the five required fields are missing.
    incomplete_reply = {
        "choices": [
            {"message": {"content": '{"proposed_action":"restart checkout"}'}},
        ],
        "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15},
    }

    # Second scripted reply: all required fields are present.
    complete_content = (
        '{"proposed_action":"collect diagnostics",'
        '"action_plan":["inspect logs"],'
        '"risk_level":"medium",'
        '"requires_human_approval":true,'
        '"blocked_by_policy":false}'
    )
    complete_reply = {
        "choices": [{"message": {"content": complete_content}}],
        "usage": {"prompt_tokens": 20, "completion_tokens": 30, "total_tokens": 50},
    }

    client = _FakeClient([incomplete_reply, complete_reply])

    results, report = await run_nemotron_external_replay(
        requests=[request],
        config=NemotronExternalRunnerConfig(api_key="test-key"),
        client=client,
    )

    # Report-level expectations.
    assert report.valid is True
    assert report.retry_used_records == 1
    assert report.candidate_variant_id == NEMOTRON_CONTRACT_TUNED_VARIANT_ID

    # Exactly one retry, and the second prompt mentions the earlier failure.
    assert client.calls == 2
    first_call, second_call = client.payloads[0], client.payloads[1]
    assert "EXACT JSON CONTRACT" in first_call["json"]["messages"][1]["content"]
    assert (
        "Previous model output was invalid"
        in second_call["json"]["messages"][1]["content"]
    )

    # Record-level expectations.
    record = results[0]
    assert record["candidate_variant_id"] == NEMOTRON_CONTRACT_TUNED_VARIANT_ID
    assert record["retry_used"] is True
    assert record["first_error"].startswith("model_output_missing_fields:")
    assert record["error"] is None
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_external_runner_blocks_missing_key_before_network_call():
    """An empty API key should abort the replay without calling the client.

    The test expects no results, an invalid report listing
    ``api_key_missing``, and a fake client whose call counter is still zero.
    """
    client = _FakeClient({})
    keyless_config = NemotronExternalRunnerConfig(api_key="")

    results, report = await run_nemotron_external_replay(
        requests=[_request()],
        config=keyless_config,
        client=client,
    )

    assert results == []
    assert report.valid is False
    assert "api_key_missing" in report.failures
    # The fake's counter only moves when post() is awaited.
    assert client.calls == 0
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_external_runner_rejects_self_grading_request_leak():
    """A request carrying evaluation labels in its context should be rejected.

    The test injects ``evaluation_labels`` into ``incident_context`` and
    expects no results plus a failure entry containing
    ``request_self_grading_leak``.
    """
    leaky_request = _request()
    leaky_request["incident_context"]["evaluation_labels"] = {"repair_success": True}

    results, report = await run_nemotron_external_replay(
        requests=[leaky_request],
        config=NemotronExternalRunnerConfig(api_key="test-key"),
        client=_FakeClient({}),
    )

    assert results == []
    assert report.valid is False

    # Substring match: the failure entry may carry extra detail around the code.
    matching = [msg for msg in report.failures if "request_self_grading_leak" in msg]
    assert matching
|
|
|
|
|
|
class _FakeResponse:
|
|
def __init__(self, payload: dict):
|
|
self.payload = payload
|
|
|
|
def raise_for_status(self) -> None:
|
|
return None
|
|
|
|
def json(self) -> dict:
|
|
return self.payload
|
|
|
|
|
|
class _FakeClient:
|
|
def __init__(self, payload: dict | list[dict]):
|
|
self.payload = payload
|
|
self.payloads: list[dict] = []
|
|
self.calls = 0
|
|
|
|
async def post(self, *_args, **kwargs) -> _FakeResponse:
|
|
self.calls += 1
|
|
self.payloads.append(kwargs)
|
|
if isinstance(self.payload, list):
|
|
return _FakeResponse(self.payload[self.calls - 1])
|
|
return _FakeResponse(self.payload)
|
|
|
|
|
|
def _request() -> dict:
|
|
return {
|
|
"schema_version": "agent_nemotron_replay_request_v1",
|
|
"run_id": "run",
|
|
"incident_id": "INC-1",
|
|
"candidate_id": "nemo_nemotron_fabric",
|
|
"system_prompt": "Return JSON.",
|
|
"user_prompt": "Incident context",
|
|
"incident_context": {"alertname": "PodCrashLooping"},
|
|
"source_metadata": {"source": "test"},
|
|
"metadata": {
|
|
"request_only": True,
|
|
"not_replacement_evidence": True,
|
|
},
|
|
}
|