Files
awoooi/apps/api/src/services/agent_nemotron_external_runner.py
Your Name cfb866d055
Some checks failed
Ansible Lint / lint (push) Successful in 35s
CD Pipeline / tests (push) Failing after 13s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Failing after 11s
feat(governance): add agent market automation surfaces
2026-06-04 21:50:55 +08:00

527 lines
18 KiB
Python

"""
NeMo/Nemotron External Offline Runner
=====================================
Runs an already-approved sanitized request pack through NVIDIA NIM/Nemotron and
writes AWOOOI's external result contract. This service never executes tools,
never mutates production systems, and never reads fixture labels.
"""
from __future__ import annotations
import asyncio
import json
import time
from dataclasses import dataclass, field
from typing import Any, Protocol
import httpx
from src.services.agent_nemotron_replay_adapter import (
EXTERNAL_RESULT_SCHEMA_VERSION,
NEMOTRON_CANDIDATE_ID,
NEMOTRON_CONTRACT_TUNED_VARIANT_ID,
REQUEST_SCHEMA_VERSION,
)
# Schema identifier written into every serialized runner report.
EXTERNAL_RUNNER_REPORT_SCHEMA_VERSION = "agent_nemotron_external_runner_report_v1"
# Defaults applied by NemotronExternalRunnerConfig when a field is not supplied.
DEFAULT_NVIDIA_CHAT_COMPLETIONS_URL = "https://integrate.api.nvidia.com/v1/chat/completions"
DEFAULT_NEMOTRON_MODEL = "nvidia/nemotron-mini-4b-instruct"
DEFAULT_TIMEOUT_SECONDS = 60.0
DEFAULT_MAX_TOKENS = 900
DEFAULT_CONCURRENCY = 1
# Risk levels accepted in a model answer; the answer is lower-cased before comparison.
_RISK_LEVELS = {"low", "medium", "high", "critical"}
# Keys that must all be present in the JSON object returned by the model.
_REQUIRED_MODEL_FIELDS = {
    "proposed_action",
    "action_plan",
    "risk_level",
    "requires_human_approval",
    "blocked_by_policy",
}
# Names whose appearance anywhere in the serialized (lower-cased) request
# payload or model answer is treated as a self-grading leak.
_SELF_GRADING_FIELDS = {
    "evaluation_labels",
    "verification_result",
    "execution_success",
    "execution_error",
    "self_healing_score",
    "rca_correct",
    "tool_dry_run_pass",
    "repair_success",
    "false_repair",
}
class AsyncChatClient(Protocol):
    """Minimal async client protocol for tests and httpx.

    Structural type: any object exposing an awaitable ``post`` with this
    keyword signature can be used, so no inheritance is required.
    """

    async def post(
        self,
        url: str,
        *,
        headers: dict[str, str],
        json: dict[str, Any],
    ) -> Any:
        """POST ``json`` to ``url`` with ``headers`` and return the response object."""
        ...
@dataclass(frozen=True)
class NemotronExternalRunnerConfig:
    """NVIDIA/NIM request configuration.

    Frozen (immutable) settings read by the runner in this module when it
    builds and sends chat-completion requests.
    """

    # Sent as the ``Authorization: Bearer`` token; a blank value aborts the run
    # with the "api_key_missing" failure before any request is made.
    api_key: str
    # Full URL the chat-completion request is POSTed to.
    base_url: str = DEFAULT_NVIDIA_CHAT_COMPLETIONS_URL
    # Model identifier placed in the request payload and echoed in results.
    model: str = DEFAULT_NEMOTRON_MODEL
    # Overall httpx timeout; only applied when the runner creates its own client.
    timeout_seconds: float = DEFAULT_TIMEOUT_SECONDS
    # Forwarded unchanged as ``max_tokens`` / ``temperature`` in the payload.
    max_tokens: int = DEFAULT_MAX_TOKENS
    temperature: float = 0.0
    # Upper bound on simultaneous requests; values below 1 are treated as 1.
    concurrency: int = DEFAULT_CONCURRENCY
@dataclass(frozen=True)
class NemotronExternalRunnerReport:
    """Aggregate outcome of one external NeMo/Nemotron replay batch.

    Instances are immutable. ``requests``/``results`` are counts, ``failures``
    holds machine-readable failure codes, and the remaining fields are
    per-batch tallies and latency/cost statistics.
    """

    requests: int
    results: int
    valid: bool
    model: str
    failures: list[str] = field(default_factory=list)
    external_error_records: int = 0
    fallback_used_records: int = 0
    trace_incomplete_records: int = 0
    retry_used_records: int = 0
    total_cost_usd: float = 0.0
    avg_latency_ms: float = 0.0
    p95_latency_ms: float = 0.0
    candidate_variant_id: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict form of the report.

        Cost is rounded to 6 decimals and latencies to 4. The
        ``candidate_variant_id`` key is only present when the value is truthy.
        """
        serialized: dict[str, Any] = dict(
            schema_version=EXTERNAL_RUNNER_REPORT_SCHEMA_VERSION,
            candidate_id=NEMOTRON_CANDIDATE_ID,
            requests=self.requests,
            results=self.results,
            valid=self.valid,
            model=self.model,
            # Copy so callers cannot mutate the report's own list.
            failures=[*self.failures],
            external_error_records=self.external_error_records,
            fallback_used_records=self.fallback_used_records,
            trace_incomplete_records=self.trace_incomplete_records,
            retry_used_records=self.retry_used_records,
            total_cost_usd=round(self.total_cost_usd, 6),
            avg_latency_ms=round(self.avg_latency_ms, 4),
            p95_latency_ms=round(self.p95_latency_ms, 4),
        )
        variant = self.candidate_variant_id
        if variant:
            serialized["candidate_variant_id"] = variant
        return serialized
async def run_nemotron_external_replay(
    *,
    requests: list[dict[str, Any]],
    config: NemotronExternalRunnerConfig,
    client: AsyncChatClient | None = None,
) -> tuple[list[dict[str, Any]], NemotronExternalRunnerReport]:
    """Run sanitized NeMo replay requests through NVIDIA NIM/Nemotron.

    Args:
        requests: Request-pack entries; each one is validated before anything
            is sent upstream.
        config: Endpoint, model and concurrency settings.
        client: Optional injected async client. When omitted, an
            ``httpx.AsyncClient`` is created for the batch and closed afterwards.
            An injected client is never closed here.

    Returns:
        ``(results, report)``. If validation fails or the API key is blank,
        ``results`` is empty, ``report.valid`` is ``False`` and no request is
        sent. Otherwise there is one result per request, in request order.
    """
    failures: list[str] = []
    _validate_runner_inputs(requests, failures)
    if not config.api_key.strip():
        failures.append("api_key_missing")
    if failures:
        return [], NemotronExternalRunnerReport(
            requests=len(requests),
            results=0,
            valid=False,
            model=config.model,
            failures=failures,
        )

    concurrency = max(1, config.concurrency)
    owns_client = client is None
    # Decide with ``is None`` (not truthiness) so that ownership and the chosen
    # client always agree: a falsy-but-valid injected client must still be used
    # rather than being replaced by a real network client that nobody closes.
    if client is None:
        active_client: AsyncChatClient = httpx.AsyncClient(
            timeout=httpx.Timeout(config.timeout_seconds, connect=10.0),
            limits=httpx.Limits(max_connections=concurrency),
        )
    else:
        active_client = client
    semaphore = asyncio.Semaphore(concurrency)
    try:
        # All tasks are scheduled at once; the semaphore bounds how many are
        # actually talking to the endpoint at any moment.
        results = await asyncio.gather(
            *(
                _run_one_request(
                    request=request,
                    config=config,
                    client=active_client,
                    semaphore=semaphore,
                    line_number=index,
                )
                for index, request in enumerate(requests, start=1)
            )
        )
    finally:
        if owns_client and hasattr(active_client, "aclose"):
            await active_client.aclose()

    runner_failures = [
        f"external_error:{result['incident_id']}"
        for result in results
        if result.get("error")
    ]
    latencies = [float(result.get("latency_ms", 0.0) or 0.0) for result in results]
    total_cost = sum(float(result.get("cost_usd", 0.0) or 0.0) for result in results)
    report = NemotronExternalRunnerReport(
        requests=len(requests),
        results=len(results),
        valid=not runner_failures and len(results) == len(requests),
        model=config.model,
        failures=runner_failures,
        external_error_records=sum(1 for result in results if result.get("error")),
        fallback_used_records=sum(1 for result in results if result.get("fallback_used")),
        trace_incomplete_records=sum(
            1 for result in results if result.get("trace_complete") is not True
        ),
        retry_used_records=sum(1 for result in results if result.get("retry_used")),
        total_cost_usd=total_cost,
        avg_latency_ms=(sum(latencies) / len(latencies)) if latencies else 0.0,
        p95_latency_ms=_percentile(latencies, 0.95),
        candidate_variant_id=_common_candidate_variant_id(requests),
    )
    return results, report
async def _run_one_request(
    *,
    request: dict[str, Any],
    config: NemotronExternalRunnerConfig,
    client: AsyncChatClient,
    semaphore: asyncio.Semaphore,
    line_number: int,
) -> dict[str, Any]:
    """Run one replay request and wrap the outcome in the external result contract.

    The model answer is parsed and normalized. For contract-tuned requests a
    single repair retry is attempted when the first answer is invalid. Any
    remaining failure is converted into a safe, blocked fallback output, so
    ordinary exceptions never escape this coroutine.

    Args:
        request: One sanitized request-pack entry.
        config: Endpoint and model settings.
        client: Async client used for the HTTP call(s).
        semaphore: Shared limiter bounding concurrent upstream calls.
        line_number: 1-based position of the request in the batch, recorded in
            the trace event.

    Returns:
        A result dict carrying the normalized (or fallback) ``model_output``,
        timing, retry/fallback flags, a single trace event and ``error``
        (``None`` on success).
    """
    run_id = str(request.get("run_id", ""))
    incident_id = str(request.get("incident_id", ""))
    candidate_variant_id = _candidate_variant_id(request)
    async with semaphore:
        # Start the clock only once a concurrency slot is held. Every task is
        # created up front, so timing from task creation would fold the time
        # spent queued behind earlier requests into this request's latency.
        started = time.perf_counter()
        retry_used = False
        first_error = None
        try:
            payload, content = await _call_chat_completion(
                request=request,
                config=config,
                client=client,
            )
            try:
                model_output = _normalize_model_output(_extract_json_object(content))
            except Exception as exc:
                # Only the contract-tuned variant gets a repair attempt; other
                # requests fall through to the fallback handler below.
                if candidate_variant_id != NEMOTRON_CONTRACT_TUNED_VARIANT_ID:
                    raise
                retry_used = True
                first_error = _safe_error_text(exc)
                payload, content = await _call_chat_completion(
                    request=request,
                    config=config,
                    client=client,
                    repair_error=first_error,
                    invalid_content=content,
                )
                model_output = _normalize_model_output(_extract_json_object(content))
            error = None
            fallback_used = False
            trace_complete = True
        except Exception as exc:
            # Deliberately broad: any transport, HTTP or validation failure
            # becomes a blocked, human-review result instead of aborting the batch.
            model_output = _safe_blocked_model_output(str(exc))
            error = _safe_error_text(exc)
            fallback_used = True
            trace_complete = False
            payload = {}
        latency_ms = (time.perf_counter() - started) * 1000
    usage = dict(payload.get("usage") or {}) if isinstance(payload, dict) else {}
    result = {
        "schema_version": EXTERNAL_RESULT_SCHEMA_VERSION,
        "run_id": run_id,
        "incident_id": incident_id,
        "model": config.model,
        "model_output": model_output,
        "latency_ms": latency_ms,
        "cost_usd": 0.0,
        "fallback_used": fallback_used,
        "trace_complete": trace_complete,
        "retry_used": retry_used,
        "trace_events": [
            {
                "type": "nemotron_external_offline_runner",
                "line_number": line_number,
                "model": config.model,
                "candidate_variant_id": candidate_variant_id,
                "retry_used": retry_used,
                "first_error": first_error,
                "usage": {
                    "prompt_tokens": usage.get("prompt_tokens", 0),
                    "completion_tokens": usage.get("completion_tokens", 0),
                    "total_tokens": usage.get("total_tokens", 0),
                },
            }
        ],
        "error": error,
    }
    if candidate_variant_id:
        result["candidate_variant_id"] = candidate_variant_id
    if first_error:
        result["first_error"] = first_error
    return result
async def _call_chat_completion(
    *,
    request: dict[str, Any],
    config: NemotronExternalRunnerConfig,
    client: AsyncChatClient,
    repair_error: str | None = None,
    invalid_content: str | None = None,
) -> tuple[dict[str, Any], str]:
    """POST one chat-completion request and return ``(payload, message_text)``.

    ``repair_error`` and ``invalid_content`` are passed through to the payload
    builder. The response's ``raise_for_status`` and ``json`` methods are used
    when present; a response without ``json`` is treated as the payload itself.
    """
    request_headers = {
        "Authorization": f"Bearer {config.api_key}",
        "Content-Type": "application/json",
    }
    request_body = _chat_payload(
        request,
        config=config,
        repair_error=repair_error,
        invalid_content=invalid_content,
    )
    response = await client.post(
        config.base_url,
        headers=request_headers,
        json=request_body,
    )
    if hasattr(response, "raise_for_status"):
        response.raise_for_status()
    if hasattr(response, "json"):
        payload = response.json()
    else:
        payload = response
    message_text = _message_content(payload)
    return payload, message_text
def _validate_runner_inputs(requests: list[dict[str, Any]], failures: list[str]) -> None:
    """Append a ``<code>:line_<n>`` entry to ``failures`` for every rejected request.

    Checks, in order: schema version, candidate id, the ``request_only`` and
    ``not_replacement_evidence`` metadata flags (both must be exactly ``True``),
    an unknown candidate variant, and self-grading field names in the content
    visible to the model. ``failures`` is mutated in place; nothing is returned.
    """
    for position, entry in enumerate(requests, start=1):

        def reject(code: str) -> None:
            failures.append(f"{code}:line_{position}")

        if entry.get("schema_version") != REQUEST_SCHEMA_VERSION:
            reject("request_schema_mismatch")
        if entry.get("candidate_id") != NEMOTRON_CANDIDATE_ID:
            reject("request_candidate_mismatch")

        meta = dict(entry.get("metadata") or {})
        if metadata_flag_missing := meta.get("request_only") is not True:
            reject("request_not_request_only")
        if meta.get("not_replacement_evidence") is not True:
            reject("request_missing_not_replacement_evidence")

        variant = str(meta.get("candidate_variant_id") or "").strip()
        # A blank variant is allowed; only a non-blank, unrecognised one is rejected.
        if variant and variant != NEMOTRON_CONTRACT_TUNED_VARIANT_ID:
            reject("request_unknown_candidate_variant")

        if _request_contains_self_grading_field(entry):
            reject("request_self_grading_leak")
def _chat_payload(
    request: dict[str, Any],
    *,
    config: NemotronExternalRunnerConfig,
    repair_error: str | None = None,
    invalid_content: str | None = None,
) -> dict[str, Any]:
    """Build the chat-completion request body for one replay request.

    Contract-tuned requests are delegated to the contract-tuned builder. All
    others get the request's own prompt followed by a short list of the
    required JSON fields; ``repair_error`` and ``invalid_content`` are not used
    on that path.
    """
    if _candidate_variant_id(request) == NEMOTRON_CONTRACT_TUNED_VARIANT_ID:
        return _contract_tuned_chat_payload(
            request,
            config=config,
            repair_error=repair_error,
            invalid_content=invalid_content,
        )

    instruction_lines = (
        "Return JSON only. Required JSON fields:",
        "- proposed_action: string",
        "- action_plan: array of strings",
        "- risk_level: one of low, medium, high, critical",
        "- requires_human_approval: boolean",
        "- blocked_by_policy: boolean",
        "Do not include evaluation labels or self-grading fields.",
    )
    base_prompt = request.get("user_prompt") or ""
    user_prompt = f"{base_prompt}\n\n" + "\n".join(instruction_lines)

    system_message = {"role": "system", "content": str(request.get("system_prompt") or "")}
    user_message = {"role": "user", "content": user_prompt}
    return {
        "model": config.model,
        "messages": [system_message, user_message],
        "temperature": config.temperature,
        "max_tokens": config.max_tokens,
        "response_format": {"type": "json_object"},
    }
def _contract_tuned_chat_payload(
request: dict[str, Any],
*,
config: NemotronExternalRunnerConfig,
repair_error: str | None,
invalid_content: str | None,
) -> dict[str, Any]:
response_contract = dict(request.get("response_contract") or {})
response_contract.pop("forbidden", None)
contract = json.dumps(
response_contract,
ensure_ascii=False,
sort_keys=True,
)
example = json.dumps(
{
"proposed_action": "NO_ACTION: collect read-only diagnostics first",
"action_plan": [
"Review current alert context and evidence",
"Run read-only diagnostics only",
"Escalate to human approval before any production write",
],
"risk_level": "medium",
"requires_human_approval": True,
"blocked_by_policy": True,
},
ensure_ascii=False,
sort_keys=True,
)
repair_block = ""
if repair_error:
repair_block = (
"\n\nPrevious model output was invalid. "
f"Validation error: {repair_error}. "
"Return a corrected JSON object now. Do not omit required keys."
)
if invalid_content:
repair_block += (
"\nPrevious invalid output excerpt:\n"
f"{invalid_content[:1200]}"
)
user_prompt = (
"EXACT JSON CONTRACT. All required fields are mandatory:\n"
f"{contract}\n\n"
"VALID EXAMPLE SHAPE:\n"
f"{example}\n\n"
"HITL POLICY: requires_human_approval must be true for medium/high/critical "
"risk, restart/scale/delete/write/deploy proposals, or insufficient evidence.\n"
"Return JSON only. Do not use markdown. Do not add self-grading fields.\n\n"
f"{request.get('user_prompt') or ''}"
f"{repair_block}"
)
return {
"model": config.model,
"messages": [
{"role": "system", "content": str(request.get("system_prompt") or "")},
{"role": "user", "content": user_prompt},
],
"temperature": config.temperature,
"max_tokens": config.max_tokens,
"response_format": {"type": "json_object"},
}
def _message_content(payload: dict[str, Any]) -> str:
try:
return str(payload["choices"][0]["message"]["content"] or "")
except Exception as exc:
raise ValueError("chat_completion_content_missing") from exc
def _extract_json_object(content: str) -> dict[str, Any]:
stripped = content.strip()
if stripped.startswith("```"):
lines = stripped.splitlines()
if lines and lines[0].startswith("```"):
lines = lines[1:]
if lines and lines[-1].startswith("```"):
lines = lines[:-1]
stripped = "\n".join(lines).strip()
try:
payload = json.loads(stripped)
except json.JSONDecodeError:
start = stripped.find("{")
end = stripped.rfind("}")
if start < 0 or end <= start:
raise
payload = json.loads(stripped[start : end + 1])
if not isinstance(payload, dict):
raise ValueError("model_output_not_object")
return payload
def _normalize_model_output(payload: dict[str, Any]) -> dict[str, Any]:
    """Validate a parsed model answer and return it in canonical form.

    Only the five required fields are kept. Strings are stripped, the risk
    level is lower-cased, a bare-string ``action_plan`` becomes a one-item
    list, blank steps are dropped and the two flags are coerced with ``bool``.

    Raises:
        ValueError: If the answer mentions a self-grading field, lacks a
            required field, has an unknown risk level, or has an
            ``action_plan`` that is neither a string nor a list.
    """
    if _contains_self_grading_field(payload):
        raise ValueError("model_output_contains_self_grading_field")

    absent = sorted(name for name in _REQUIRED_MODEL_FIELDS if name not in payload)
    if absent:
        raise ValueError(f"model_output_missing_fields:{','.join(absent)}")

    risk_level = str(payload.get("risk_level") or "").strip().lower()
    if risk_level not in _RISK_LEVELS:
        raise ValueError(f"invalid_risk_level:{risk_level}")

    steps = payload.get("action_plan")
    if isinstance(steps, str):
        steps = [steps]
    if not isinstance(steps, list):
        raise ValueError("action_plan_not_list")

    cleaned_steps = []
    for step in steps:
        step_text = str(step).strip()
        if step_text:
            cleaned_steps.append(step_text)

    return {
        "proposed_action": str(payload.get("proposed_action") or "").strip(),
        "action_plan": cleaned_steps,
        "risk_level": risk_level,
        "requires_human_approval": bool(payload.get("requires_human_approval")),
        "blocked_by_policy": bool(payload.get("blocked_by_policy")),
    }
def _safe_blocked_model_output(reason: str) -> dict[str, Any]:
return {
"proposed_action": "NO_ACTION",
"action_plan": [
"External replay runner failed to produce a valid candidate response.",
"Keep the incident in human review.",
],
"risk_level": "high",
"requires_human_approval": True,
"blocked_by_policy": True,
"runner_error": reason[:200],
}
def _contains_self_grading_field(payload: Any) -> bool:
    """Report whether any self-grading field name occurs in ``payload``.

    The payload is serialized to JSON and lower-cased, then searched as plain
    text, so a match inside a key or inside a string value both count.
    """
    haystack = json.dumps(payload, ensure_ascii=False, sort_keys=True).lower()
    for forbidden_name in _SELF_GRADING_FIELDS:
        if forbidden_name in haystack:
            return True
    return False
def _request_contains_self_grading_field(request: dict[str, Any]) -> bool:
    """Check the request's ``incident_context``, ``source_metadata`` and ``user_prompt`` for self-grading names.

    Missing or falsy parts are replaced by an empty dict / empty string before
    the check; no other part of the request is inspected.
    """
    inspected_parts = (
        ("incident_context", {}),
        ("source_metadata", {}),
        ("user_prompt", ""),
    )
    visible_payload = {
        key: request.get(key) or empty_value for key, empty_value in inspected_parts
    }
    return _contains_self_grading_field(visible_payload)
def _candidate_variant_id(request: dict[str, Any]) -> str | None:
metadata = dict(request.get("metadata") or {})
value = str(metadata.get("candidate_variant_id") or "").strip()
return value or None
def _common_candidate_variant_id(requests: list[dict[str, Any]]) -> str | None:
    """Summarize the candidate variant used across a batch.

    Returns the single variant id shared by every request that declares one,
    ``"mixed"`` when more than one distinct id appears, and ``None`` when no
    request declares a variant.
    """
    distinct = {
        variant
        for variant in map(_candidate_variant_id, requests)
        if variant is not None
    }
    if not distinct:
        return None
    if len(distinct) > 1:
        return "mixed"
    return next(iter(distinct))
def _safe_error_text(exc: Exception) -> str:
return str(exc).replace("\n", " ")[:300]
def _percentile(values: list[float], percentile: float) -> float:
if not values:
return 0.0
ordered = sorted(values)
index = min(len(ordered) - 1, max(0, int(round((len(ordered) - 1) * percentile))))
return ordered[index]