527 lines
18 KiB
Python
527 lines
18 KiB
Python
"""
NeMo/Nemotron External Offline Runner
=====================================

Runs an already-approved sanitized request pack through NVIDIA NIM/Nemotron and
writes AWOOOI's external result contract. This service never executes tools,
never mutates production systems, and never reads fixture labels.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import json
|
|
import time
|
|
from dataclasses import dataclass, field
|
|
from typing import Any, Protocol
|
|
|
|
import httpx
|
|
|
|
from src.services.agent_nemotron_replay_adapter import (
|
|
EXTERNAL_RESULT_SCHEMA_VERSION,
|
|
NEMOTRON_CANDIDATE_ID,
|
|
NEMOTRON_CONTRACT_TUNED_VARIANT_ID,
|
|
REQUEST_SCHEMA_VERSION,
|
|
)
|
|
|
|
EXTERNAL_RUNNER_REPORT_SCHEMA_VERSION = "agent_nemotron_external_runner_report_v1"
|
|
DEFAULT_NVIDIA_CHAT_COMPLETIONS_URL = "https://integrate.api.nvidia.com/v1/chat/completions"
|
|
DEFAULT_NEMOTRON_MODEL = "nvidia/nemotron-mini-4b-instruct"
|
|
DEFAULT_TIMEOUT_SECONDS = 60.0
|
|
DEFAULT_MAX_TOKENS = 900
|
|
DEFAULT_CONCURRENCY = 1
|
|
|
|
_RISK_LEVELS = {"low", "medium", "high", "critical"}
|
|
_REQUIRED_MODEL_FIELDS = {
|
|
"proposed_action",
|
|
"action_plan",
|
|
"risk_level",
|
|
"requires_human_approval",
|
|
"blocked_by_policy",
|
|
}
|
|
_SELF_GRADING_FIELDS = {
|
|
"evaluation_labels",
|
|
"verification_result",
|
|
"execution_success",
|
|
"execution_error",
|
|
"self_healing_score",
|
|
"rca_correct",
|
|
"tool_dry_run_pass",
|
|
"repair_success",
|
|
"false_repair",
|
|
}
|
|
|
|
|
|
class AsyncChatClient(Protocol):
    """Structural type for the async HTTP client the runner posts through.

    Any object with a compatible awaitable ``post`` satisfies it, which lets
    tests substitute a stub for the real httpx client.
    """

    async def post(
        self,
        url: str,
        *,
        headers: dict[str, str],
        json: dict[str, Any],
    ) -> Any:
        """Send ``json`` to ``url`` with ``headers`` and return the client's response object."""
|
|
|
|
|
|
@dataclass(frozen=True)
class NemotronExternalRunnerConfig:
    """Immutable NVIDIA/NIM request configuration for one replay batch."""

    # Bearer token sent in the Authorization header. Excluded from the
    # generated repr so the secret cannot leak into logs or tracebacks.
    api_key: str = field(repr=False)
    # Chat-completions endpoint the runner posts to.
    base_url: str = DEFAULT_NVIDIA_CHAT_COMPLETIONS_URL
    # Model identifier sent in each request body and echoed in results.
    model: str = DEFAULT_NEMOTRON_MODEL
    # Overall timeout for the HTTP client the runner creates itself.
    timeout_seconds: float = DEFAULT_TIMEOUT_SECONDS
    # ``max_tokens`` and ``temperature`` are forwarded verbatim in the payload.
    max_tokens: int = DEFAULT_MAX_TOKENS
    temperature: float = 0.0
    # Upper bound on in-flight requests; values below 1 are treated as 1.
    concurrency: int = DEFAULT_CONCURRENCY
|
|
|
|
|
|
@dataclass(frozen=True)
class NemotronExternalRunnerReport:
    """Aggregate outcome of one external NeMo/Nemotron replay batch."""

    # Number of requests submitted and number of result records produced.
    requests: int
    results: int
    # Overall verdict for the batch.
    valid: bool
    # Model identifier the batch was run against.
    model: str
    # Failure codes collected for the batch.
    failures: list[str] = field(default_factory=list)
    # Per-record counters.
    external_error_records: int = 0
    fallback_used_records: int = 0
    trace_incomplete_records: int = 0
    retry_used_records: int = 0
    # Cost and latency aggregates.
    total_cost_usd: float = 0.0
    avg_latency_ms: float = 0.0
    p95_latency_ms: float = 0.0
    # Variant identifier for the batch, when there is one.
    candidate_variant_id: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Serialize the report to a plain dict.

        Floats are rounded (cost to 6 places, latencies to 4) and
        ``candidate_variant_id`` is only emitted when it is set.
        """
        summary: dict[str, Any] = {
            "schema_version": EXTERNAL_RUNNER_REPORT_SCHEMA_VERSION,
            "candidate_id": NEMOTRON_CANDIDATE_ID,
        }
        for name in ("requests", "results", "valid", "model"):
            summary[name] = getattr(self, name)
        # Copy so callers cannot mutate the report's own list through the dict.
        summary["failures"] = list(self.failures)
        for name in (
            "external_error_records",
            "fallback_used_records",
            "trace_incomplete_records",
            "retry_used_records",
        ):
            summary[name] = getattr(self, name)
        summary["total_cost_usd"] = round(self.total_cost_usd, 6)
        summary["avg_latency_ms"] = round(self.avg_latency_ms, 4)
        summary["p95_latency_ms"] = round(self.p95_latency_ms, 4)
        if not self.candidate_variant_id:
            return summary
        summary["candidate_variant_id"] = self.candidate_variant_id
        return summary
|
|
|
|
|
|
async def run_nemotron_external_replay(
    *,
    requests: list[dict[str, Any]],
    config: NemotronExternalRunnerConfig,
    client: AsyncChatClient | None = None,
) -> tuple[list[dict[str, Any]], NemotronExternalRunnerReport]:
    """Run sanitized NeMo replay requests through NVIDIA NIM/Nemotron.

    Args:
        requests: Request records to replay; each is validated before any
            network call is made.
        config: Endpoint, model, credential and concurrency settings.
        client: Optional pre-built async client. When omitted, an
            ``httpx.AsyncClient`` is created for this call and closed before
            returning.

    Returns:
        ``(results, report)``. If input validation fails or the API key is
        blank, no request is sent and the result list is empty with an
        invalid report that lists the failures.
    """
    failures: list[str] = []
    _validate_runner_inputs(requests, failures)
    if not config.api_key.strip():
        failures.append("api_key_missing")
    if failures:
        return [], NemotronExternalRunnerReport(
            requests=len(requests),
            results=0,
            valid=False,
            model=config.model,
            failures=failures,
        )

    concurrency = max(1, config.concurrency)
    # Decide ownership and client selection with the same ``is None`` test.
    # A truthiness test could discard a falsy injected client and create an
    # httpx client that is never closed.
    owns_client = client is None
    if client is None:
        active_client: Any = httpx.AsyncClient(
            timeout=httpx.Timeout(config.timeout_seconds, connect=10.0),
            limits=httpx.Limits(max_connections=concurrency),
        )
    else:
        active_client = client
    semaphore = asyncio.Semaphore(concurrency)
    try:
        tasks = [
            _run_one_request(
                request=request,
                config=config,
                client=active_client,
                semaphore=semaphore,
                line_number=index,
            )
            for index, request in enumerate(requests, start=1)
        ]
        results = await asyncio.gather(*tasks)
    finally:
        # Only close clients created here; injected clients belong to the caller.
        if owns_client and hasattr(active_client, "aclose"):
            await active_client.aclose()

    errored = [result for result in results if result.get("error")]
    runner_failures = [f"external_error:{result['incident_id']}" for result in errored]
    latencies = [float(result.get("latency_ms", 0.0) or 0.0) for result in results]
    total_cost = sum(float(result.get("cost_usd", 0.0) or 0.0) for result in results)
    report = NemotronExternalRunnerReport(
        requests=len(requests),
        results=len(results),
        valid=not runner_failures and len(results) == len(requests),
        model=config.model,
        failures=runner_failures,
        external_error_records=len(errored),
        fallback_used_records=sum(1 for result in results if result.get("fallback_used")),
        trace_incomplete_records=sum(
            1 for result in results if result.get("trace_complete") is not True
        ),
        retry_used_records=sum(1 for result in results if result.get("retry_used")),
        total_cost_usd=total_cost,
        avg_latency_ms=(sum(latencies) / len(latencies)) if latencies else 0.0,
        p95_latency_ms=_percentile(latencies, 0.95),
        candidate_variant_id=_common_candidate_variant_id(requests),
    )
    return results, report
|
|
|
|
|
|
async def _run_one_request(
    *,
    request: dict[str, Any],
    config: NemotronExternalRunnerConfig,
    client: AsyncChatClient,
    semaphore: asyncio.Semaphore,
    line_number: int,
) -> dict[str, Any]:
    """Replay one request and build its external result record.

    Any exception from the call or from output validation is converted into a
    blocked fallback result instead of being raised. Requests tagged with the
    contract-tuned variant get one repair retry when the first response fails
    validation.

    Args:
        request: The request record to replay.
        config: Endpoint, model and credential settings.
        client: Async client used for the HTTP call.
        semaphore: Shared limiter bounding concurrent calls.
        line_number: 1-based position of the request, recorded in the trace.

    Returns:
        The result record for this request.
    """
    run_id = str(request.get("run_id", ""))
    incident_id = str(request.get("incident_id", ""))
    candidate_variant_id = _candidate_variant_id(request)
    retry_used = False
    first_error: str | None = None
    async with semaphore:
        # Start timing only once a slot is held. Starting before the acquire
        # would add the time spent queued behind other requests to
        # ``latency_ms`` and inflate the batch average and p95.
        started = time.perf_counter()
        try:
            payload, content = await _call_chat_completion(
                request=request,
                config=config,
                client=client,
            )
            try:
                model_output = _normalize_model_output(_extract_json_object(content))
            except Exception as exc:
                # Only the contract-tuned variant is allowed a repair attempt.
                if candidate_variant_id != NEMOTRON_CONTRACT_TUNED_VARIANT_ID:
                    raise
                retry_used = True
                first_error = _safe_error_text(exc)
                payload, content = await _call_chat_completion(
                    request=request,
                    config=config,
                    client=client,
                    repair_error=first_error,
                    invalid_content=content,
                )
                model_output = _normalize_model_output(_extract_json_object(content))
            error = None
            fallback_used = False
            trace_complete = True
        except Exception as exc:
            model_output = _safe_blocked_model_output(str(exc))
            # Some exceptions stringify to "" (for example a bare
            # TimeoutError()); fall back to the class name so the failure is
            # still truthy for downstream error accounting.
            error = _safe_error_text(exc) or type(exc).__name__
            fallback_used = True
            trace_complete = False
            payload = {}
        latency_ms = (time.perf_counter() - started) * 1000

    # Tolerate a missing or malformed ``usage`` section rather than raising
    # outside the try block, which would abort the whole batch.
    raw_usage = payload.get("usage") if isinstance(payload, dict) else None
    usage = raw_usage if isinstance(raw_usage, dict) else {}
    result = {
        "schema_version": EXTERNAL_RESULT_SCHEMA_VERSION,
        "run_id": run_id,
        "incident_id": incident_id,
        "model": config.model,
        "model_output": model_output,
        "latency_ms": latency_ms,
        "cost_usd": 0.0,
        "fallback_used": fallback_used,
        "trace_complete": trace_complete,
        "retry_used": retry_used,
        "trace_events": [
            {
                "type": "nemotron_external_offline_runner",
                "line_number": line_number,
                "model": config.model,
                "candidate_variant_id": candidate_variant_id,
                "retry_used": retry_used,
                "first_error": first_error,
                "usage": {
                    "prompt_tokens": usage.get("prompt_tokens", 0),
                    "completion_tokens": usage.get("completion_tokens", 0),
                    "total_tokens": usage.get("total_tokens", 0),
                },
            }
        ],
        "error": error,
    }
    if candidate_variant_id:
        result["candidate_variant_id"] = candidate_variant_id
    if first_error:
        result["first_error"] = first_error
    return result
|
|
|
|
|
|
async def _call_chat_completion(
    *,
    request: dict[str, Any],
    config: NemotronExternalRunnerConfig,
    client: AsyncChatClient,
    repair_error: str | None = None,
    invalid_content: str | None = None,
) -> tuple[dict[str, Any], str]:
    """Post one chat-completion request and return ``(payload, message_text)``.

    ``repair_error`` and ``invalid_content`` are passed through to the payload
    builder. Errors from ``raise_for_status`` and from a missing message
    content propagate to the caller.
    """
    endpoint = config.base_url
    auth_headers = {
        "Authorization": f"Bearer {config.api_key}",
        "Content-Type": "application/json",
    }
    body = _chat_payload(
        request,
        config=config,
        repair_error=repair_error,
        invalid_content=invalid_content,
    )
    response = await client.post(endpoint, headers=auth_headers, json=body)

    # Stub clients may return a bare payload without the response methods.
    if hasattr(response, "raise_for_status"):
        response.raise_for_status()
    if hasattr(response, "json"):
        payload = response.json()
    else:
        payload = response
    return payload, _message_content(payload)
|
|
|
|
|
|
def _validate_runner_inputs(requests: list[dict[str, Any]], failures: list[str]) -> None:
    """Append a ``<code>:line_<n>`` entry to ``failures`` for each violated rule.

    Checks every request for schema version, candidate id, the two required
    metadata flags, an unrecognized variant id, and self-grading markers in
    the model-visible fields. Nothing is returned; ``failures`` is mutated.
    """
    for position, candidate in enumerate(requests, start=1):
        suffix = f":line_{position}"

        if candidate.get("schema_version") != REQUEST_SCHEMA_VERSION:
            failures.append(f"request_schema_mismatch{suffix}")
        if candidate.get("candidate_id") != NEMOTRON_CANDIDATE_ID:
            failures.append(f"request_candidate_mismatch{suffix}")

        meta = dict(candidate.get("metadata") or {})
        # Both flags must be the literal ``True``; truthy stand-ins are rejected.
        if meta.get("request_only") is not True:
            failures.append(f"request_not_request_only{suffix}")
        if meta.get("not_replacement_evidence") is not True:
            failures.append(f"request_missing_not_replacement_evidence{suffix}")

        # An absent or blank variant is allowed; any other value must be the tuned one.
        declared_variant = str(meta.get("candidate_variant_id") or "").strip()
        if declared_variant not in ("", NEMOTRON_CONTRACT_TUNED_VARIANT_ID):
            failures.append(f"request_unknown_candidate_variant{suffix}")

        if _request_contains_self_grading_field(candidate):
            failures.append(f"request_self_grading_leak{suffix}")
|
|
|
|
|
|
def _chat_payload(
    request: dict[str, Any],
    *,
    config: NemotronExternalRunnerConfig,
    repair_error: str | None = None,
    invalid_content: str | None = None,
) -> dict[str, Any]:
    """Build the chat-completions request body for ``request``.

    Requests tagged with the contract-tuned variant are delegated to the
    contract-tuned builder, which receives the repair arguments. All other
    requests get the baseline prompt, which does not use them.
    """
    is_contract_tuned = _candidate_variant_id(request) == NEMOTRON_CONTRACT_TUNED_VARIANT_ID
    if is_contract_tuned:
        return _contract_tuned_chat_payload(
            request,
            config=config,
            repair_error=repair_error,
            invalid_content=invalid_content,
        )

    # Baseline prompt: the caller's prompt followed by the output-format rules.
    user_prompt = (
        f"{request.get('user_prompt') or ''}\n\n"
        "Return JSON only. Required JSON fields:\n"
        "- proposed_action: string\n"
        "- action_plan: array of strings\n"
        "- risk_level: one of low, medium, high, critical\n"
        "- requires_human_approval: boolean\n"
        "- blocked_by_policy: boolean\n"
        "Do not include evaluation labels or self-grading fields."
    )
    system_message = {"role": "system", "content": str(request.get("system_prompt") or "")}
    user_message = {"role": "user", "content": user_prompt}
    return {
        "model": config.model,
        "messages": [system_message, user_message],
        "temperature": config.temperature,
        "max_tokens": config.max_tokens,
        "response_format": {"type": "json_object"},
    }
|
|
|
|
|
|
def _contract_tuned_chat_payload(
|
|
request: dict[str, Any],
|
|
*,
|
|
config: NemotronExternalRunnerConfig,
|
|
repair_error: str | None,
|
|
invalid_content: str | None,
|
|
) -> dict[str, Any]:
|
|
response_contract = dict(request.get("response_contract") or {})
|
|
response_contract.pop("forbidden", None)
|
|
contract = json.dumps(
|
|
response_contract,
|
|
ensure_ascii=False,
|
|
sort_keys=True,
|
|
)
|
|
example = json.dumps(
|
|
{
|
|
"proposed_action": "NO_ACTION: collect read-only diagnostics first",
|
|
"action_plan": [
|
|
"Review current alert context and evidence",
|
|
"Run read-only diagnostics only",
|
|
"Escalate to human approval before any production write",
|
|
],
|
|
"risk_level": "medium",
|
|
"requires_human_approval": True,
|
|
"blocked_by_policy": True,
|
|
},
|
|
ensure_ascii=False,
|
|
sort_keys=True,
|
|
)
|
|
repair_block = ""
|
|
if repair_error:
|
|
repair_block = (
|
|
"\n\nPrevious model output was invalid. "
|
|
f"Validation error: {repair_error}. "
|
|
"Return a corrected JSON object now. Do not omit required keys."
|
|
)
|
|
if invalid_content:
|
|
repair_block += (
|
|
"\nPrevious invalid output excerpt:\n"
|
|
f"{invalid_content[:1200]}"
|
|
)
|
|
user_prompt = (
|
|
"EXACT JSON CONTRACT. All required fields are mandatory:\n"
|
|
f"{contract}\n\n"
|
|
"VALID EXAMPLE SHAPE:\n"
|
|
f"{example}\n\n"
|
|
"HITL POLICY: requires_human_approval must be true for medium/high/critical "
|
|
"risk, restart/scale/delete/write/deploy proposals, or insufficient evidence.\n"
|
|
"Return JSON only. Do not use markdown. Do not add self-grading fields.\n\n"
|
|
f"{request.get('user_prompt') or ''}"
|
|
f"{repair_block}"
|
|
)
|
|
return {
|
|
"model": config.model,
|
|
"messages": [
|
|
{"role": "system", "content": str(request.get("system_prompt") or "")},
|
|
{"role": "user", "content": user_prompt},
|
|
],
|
|
"temperature": config.temperature,
|
|
"max_tokens": config.max_tokens,
|
|
"response_format": {"type": "json_object"},
|
|
}
|
|
|
|
|
|
def _message_content(payload: dict[str, Any]) -> str:
|
|
try:
|
|
return str(payload["choices"][0]["message"]["content"] or "")
|
|
except Exception as exc:
|
|
raise ValueError("chat_completion_content_missing") from exc
|
|
|
|
|
|
def _extract_json_object(content: str) -> dict[str, Any]:
|
|
stripped = content.strip()
|
|
if stripped.startswith("```"):
|
|
lines = stripped.splitlines()
|
|
if lines and lines[0].startswith("```"):
|
|
lines = lines[1:]
|
|
if lines and lines[-1].startswith("```"):
|
|
lines = lines[:-1]
|
|
stripped = "\n".join(lines).strip()
|
|
try:
|
|
payload = json.loads(stripped)
|
|
except json.JSONDecodeError:
|
|
start = stripped.find("{")
|
|
end = stripped.rfind("}")
|
|
if start < 0 or end <= start:
|
|
raise
|
|
payload = json.loads(stripped[start : end + 1])
|
|
if not isinstance(payload, dict):
|
|
raise ValueError("model_output_not_object")
|
|
return payload
|
|
|
|
|
|
def _normalize_model_output(payload: dict[str, Any]) -> dict[str, Any]:
    """Validate a decoded model response and reduce it to the five contract fields.

    Raises:
        ValueError: The payload contains a self-grading marker, lacks a
            required field, has an unknown risk level, or has an
            ``action_plan`` that is neither a string nor a list.
    """
    if _contains_self_grading_field(payload):
        raise ValueError("model_output_contains_self_grading_field")

    absent = sorted(name for name in _REQUIRED_MODEL_FIELDS if name not in payload)
    if absent:
        raise ValueError(f"model_output_missing_fields:{','.join(absent)}")

    risk_level = str(payload.get("risk_level") or "").strip().lower()
    if risk_level not in _RISK_LEVELS:
        raise ValueError(f"invalid_risk_level:{risk_level}")

    steps = payload.get("action_plan")
    if isinstance(steps, str):
        # A bare string is accepted as a one-step plan.
        steps = [steps]
    elif not isinstance(steps, list):
        raise ValueError("action_plan_not_list")

    # Stringify each step and drop the ones that are blank after trimming.
    cleaned_steps = []
    for step in steps:
        text = str(step).strip()
        if text:
            cleaned_steps.append(text)

    proposed_action = str(payload.get("proposed_action") or "").strip()
    needs_approval = bool(payload.get("requires_human_approval"))
    policy_blocked = bool(payload.get("blocked_by_policy"))
    return {
        "proposed_action": proposed_action,
        "action_plan": cleaned_steps,
        "risk_level": risk_level,
        "requires_human_approval": needs_approval,
        "blocked_by_policy": policy_blocked,
    }
|
|
|
|
|
|
def _safe_blocked_model_output(reason: str) -> dict[str, Any]:
|
|
return {
|
|
"proposed_action": "NO_ACTION",
|
|
"action_plan": [
|
|
"External replay runner failed to produce a valid candidate response.",
|
|
"Keep the incident in human review.",
|
|
],
|
|
"risk_level": "high",
|
|
"requires_human_approval": True,
|
|
"blocked_by_policy": True,
|
|
"runner_error": reason[:200],
|
|
}
|
|
|
|
|
|
def _contains_self_grading_field(payload: Any) -> bool:
    """Report whether any self-grading marker occurs anywhere in ``payload``.

    The payload is serialized to lower-cased JSON and searched as text, so a
    marker matches whether it appears as a key or inside a string value.
    """
    haystack = json.dumps(payload, ensure_ascii=False, sort_keys=True).lower()
    for marker in _SELF_GRADING_FIELDS:
        if marker in haystack:
            return True
    return False
|
|
|
|
|
|
def _request_contains_self_grading_field(request: dict[str, Any]) -> bool:
    """Check three request fields for self-grading markers.

    Only ``incident_context``, ``source_metadata`` and ``user_prompt`` are
    inspected; other request keys are ignored.
    """
    context = request.get("incident_context") or {}
    source_metadata = request.get("source_metadata") or {}
    prompt = request.get("user_prompt") or ""
    return _contains_self_grading_field(
        {
            "incident_context": context,
            "source_metadata": source_metadata,
            "user_prompt": prompt,
        }
    )
|
|
|
|
|
|
def _candidate_variant_id(request: dict[str, Any]) -> str | None:
|
|
metadata = dict(request.get("metadata") or {})
|
|
value = str(metadata.get("candidate_variant_id") or "").strip()
|
|
return value or None
|
|
|
|
|
|
def _common_candidate_variant_id(requests: list[dict[str, Any]]) -> str | None:
    """Summarize the variant ids across ``requests``.

    Returns the single shared id, ``"mixed"`` when more than one distinct id
    is present, or ``None`` when no request declares one.
    """
    distinct = {
        variant
        for variant in map(_candidate_variant_id, requests)
        if variant is not None
    }
    if not distinct:
        return None
    if len(distinct) > 1:
        return "mixed"
    (only_variant,) = distinct
    return only_variant
|
|
|
|
|
|
def _safe_error_text(exc: Exception) -> str:
|
|
return str(exc).replace("\n", " ")[:300]
|
|
|
|
|
|
def _percentile(values: list[float], percentile: float) -> float:
|
|
if not values:
|
|
return 0.0
|
|
ordered = sorted(values)
|
|
index = min(len(ordered) - 1, max(0, int(round((len(ordered) - 1) * percentile))))
|
|
return ordered[index]
|