# awoooi/apps/api/src/services/agent_nemotron_external_runner.py

"""
NeMo/Nemotron External Offline Runner
=====================================

Runs an already-approved sanitized request pack through NVIDIA NIM/Nemotron and
writes AWOOOI's external result contract. This service never executes tools,
never mutates production systems, and never reads fixture labels.
"""

from __future__ import annotations

import asyncio
import json
import time
from dataclasses import dataclass, field
from typing import Any, Protocol

import httpx

from src.services.agent_nemotron_replay_adapter import (
    EXTERNAL_RESULT_SCHEMA_VERSION,
    NEMOTRON_CANDIDATE_ID,
    NEMOTRON_CONTRACT_TUNED_VARIANT_ID,
    REQUEST_SCHEMA_VERSION,
)

# Schema tag written into the dict produced by NemotronExternalRunnerReport.to_dict().
EXTERNAL_RUNNER_REPORT_SCHEMA_VERSION = "agent_nemotron_external_runner_report_v1"
# Defaults for NemotronExternalRunnerConfig: endpoint URL, model id, httpx timeout
# (seconds), "max_tokens" request value, and semaphore/connection-pool size.
DEFAULT_NVIDIA_CHAT_COMPLETIONS_URL = "https://integrate.api.nvidia.com/v1/chat/completions"
DEFAULT_NEMOTRON_MODEL = "nvidia/nemotron-mini-4b-instruct"
DEFAULT_TIMEOUT_SECONDS = 60.0
DEFAULT_MAX_TOKENS = 900
DEFAULT_CONCURRENCY = 1

# Accepted "risk_level" values; model output is stripped and lower-cased before comparison.
_RISK_LEVELS = {"low", "medium", "high", "critical"}
# Keys that must all be present in a model response for it to be accepted.
_REQUIRED_MODEL_FIELDS = {
    "proposed_action",
    "action_plan",
    "risk_level",
    "requires_human_approval",
    "blocked_by_policy",
}
# Names that must not appear anywhere (keys or values) in the serialized,
# lower-cased request-visible payload or model response; matching is by substring.
_SELF_GRADING_FIELDS = {
    "evaluation_labels",
    "verification_result",
    "execution_success",
    "execution_error",
    "self_healing_score",
    "rca_correct",
    "tool_dry_run_pass",
    "repair_success",
    "false_repair",
}


class AsyncChatClient(Protocol):
    """Minimal async client protocol for tests and httpx.

    The runner only ever awaits ``post``. The returned object may expose
    ``raise_for_status()`` and ``json()`` (both are used when present);
    an object without ``json`` is treated as the decoded payload itself.
    """

    async def post(
        self,
        url: str,
        *,
        headers: dict[str, str],
        json: dict[str, Any],
    ) -> Any:
        """Send ``json`` to ``url`` with ``headers`` and return the response."""
        ...


@dataclass(frozen=True)
class NemotronExternalRunnerConfig:
    """NVIDIA/NIM request configuration.

    Immutable settings applied to every request of one replay batch.
    """

    # Bearer token placed in the Authorization header; a blank value aborts the run.
    api_key: str
    # URL that each chat-completion request is POSTed to.
    base_url: str = DEFAULT_NVIDIA_CHAT_COMPLETIONS_URL
    # Model identifier sent in the request body and echoed in results and the report.
    model: str = DEFAULT_NEMOTRON_MODEL
    # Timeout in seconds for the runner-created httpx client (connect timeout is set separately).
    timeout_seconds: float = DEFAULT_TIMEOUT_SECONDS
    # Sent as "max_tokens" in the request body.
    max_tokens: int = DEFAULT_MAX_TOKENS
    # Sent as "temperature" in the request body.
    temperature: float = 0.0
    # Upper bound on simultaneously running requests; values below 1 are treated as 1.
    concurrency: int = DEFAULT_CONCURRENCY


@dataclass(frozen=True)
class NemotronExternalRunnerReport:
    """Aggregate outcome of one external NeMo/Nemotron replay batch.

    Holds request/result counts, the overall validity flag, per-category
    record counters, and cost/latency statistics for the batch.
    """

    requests: int
    results: int
    valid: bool
    model: str
    failures: list[str] = field(default_factory=list)
    external_error_records: int = 0
    fallback_used_records: int = 0
    trace_incomplete_records: int = 0
    retry_used_records: int = 0
    total_cost_usd: float = 0.0
    avg_latency_ms: float = 0.0
    p95_latency_ms: float = 0.0
    candidate_variant_id: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return the report as a plain dict tagged with schema and candidate ids.

        Cost is rounded to 6 decimals and latencies to 4. The
        ``candidate_variant_id`` key is emitted only when a variant is set.
        """
        variant_entry: dict[str, Any] = {}
        if self.candidate_variant_id:
            variant_entry["candidate_variant_id"] = self.candidate_variant_id
        return {
            "schema_version": EXTERNAL_RUNNER_REPORT_SCHEMA_VERSION,
            "candidate_id": NEMOTRON_CANDIDATE_ID,
            "requests": self.requests,
            "results": self.results,
            "valid": self.valid,
            "model": self.model,
            # Copy so callers cannot mutate the report's own list through the dict.
            "failures": [*self.failures],
            "external_error_records": self.external_error_records,
            "fallback_used_records": self.fallback_used_records,
            "trace_incomplete_records": self.trace_incomplete_records,
            "retry_used_records": self.retry_used_records,
            "total_cost_usd": round(self.total_cost_usd, 6),
            "avg_latency_ms": round(self.avg_latency_ms, 4),
            "p95_latency_ms": round(self.p95_latency_ms, 4),
            **variant_entry,
        }


async def run_nemotron_external_replay(
    *,
    requests: list[dict[str, Any]],
    config: NemotronExternalRunnerConfig,
    client: AsyncChatClient | None = None,
) -> tuple[list[dict[str, Any]], NemotronExternalRunnerReport]:
    """Run sanitized NeMo replay requests through NVIDIA NIM/Nemotron.

    Args:
        requests: Request records to replay. All of them are validated
            up front; a single validation failure aborts the whole batch
            before any network call is made.
        config: Endpoint, model, credential and concurrency settings.
        client: Optional injected client. When omitted, an
            ``httpx.AsyncClient`` is created for the batch and closed
            before returning; an injected client is never closed here.

    Returns:
        ``(results, report)``. ``results`` holds one record per request in
        input order (empty when validation fails). ``report`` summarizes
        counts, failures, cost and latency for the batch.
    """
    failures: list[str] = []
    _validate_runner_inputs(requests, failures)
    if not config.api_key.strip():
        failures.append("api_key_missing")
    if failures:
        return [], NemotronExternalRunnerReport(
            requests=len(requests),
            results=0,
            valid=False,
            model=config.model,
            failures=failures,
        )

    concurrency = max(1, config.concurrency)
    owns_client = client is None
    # Select by identity, not truthiness: an injected client that evaluates
    # as falsy must still be used, otherwise a real network client would be
    # created that ``owns_client`` says we do not own and therefore never close.
    if client is None:
        active_client = httpx.AsyncClient(
            timeout=httpx.Timeout(config.timeout_seconds, connect=10.0),
            limits=httpx.Limits(max_connections=concurrency),
        )
    else:
        active_client = client
    semaphore = asyncio.Semaphore(concurrency)
    try:
        tasks = [
            _run_one_request(
                request=request,
                config=config,
                client=active_client,
                semaphore=semaphore,
                line_number=index,
            )
            for index, request in enumerate(requests, start=1)
        ]
        # gather preserves input order, so results[i] corresponds to requests[i].
        results = await asyncio.gather(*tasks)
    finally:
        if owns_client and hasattr(active_client, "aclose"):
            await active_client.aclose()

    runner_failures = [
        f"external_error:{result['incident_id']}"
        for result in results
        if result.get("error")
    ]
    latencies = [float(result.get("latency_ms", 0.0) or 0.0) for result in results]
    total_cost = sum(float(result.get("cost_usd", 0.0) or 0.0) for result in results)
    report = NemotronExternalRunnerReport(
        requests=len(requests),
        results=len(results),
        valid=not runner_failures and len(results) == len(requests),
        model=config.model,
        failures=runner_failures,
        external_error_records=sum(1 for result in results if result.get("error")),
        fallback_used_records=sum(1 for result in results if result.get("fallback_used")),
        # Anything other than an explicit True counts as an incomplete trace.
        trace_incomplete_records=sum(
            1 for result in results if result.get("trace_complete") is not True
        ),
        retry_used_records=sum(1 for result in results if result.get("retry_used")),
        total_cost_usd=total_cost,
        avg_latency_ms=(sum(latencies) / len(latencies)) if latencies else 0.0,
        p95_latency_ms=_percentile(latencies, 0.95),
        candidate_variant_id=_common_candidate_variant_id(requests),
    )
    return results, report


async def _run_one_request(
    *,
    request: dict[str, Any],
    config: NemotronExternalRunnerConfig,
    client: AsyncChatClient,
    semaphore: asyncio.Semaphore,
    line_number: int,
) -> dict[str, Any]:
    """Execute one replay request and wrap the outcome in a result record.

    The model is called once. If its output cannot be parsed or
    normalized and the request uses the contract-tuned variant, a single
    repair call is made that quotes the validation error and the invalid
    output. Any remaining failure is converted into a blocked,
    human-review fallback output instead of being raised.

    Args:
        request: The replay request record.
        config: Endpoint/model settings for the call.
        client: Client used to POST the chat completion.
        semaphore: Limits how many requests run at the same time.
        line_number: 1-based position of the request in the batch,
            recorded in the trace event.

    Returns:
        A result dict carrying the normalized (or fallback) model output,
        latency, retry/fallback flags, a single trace event with token
        usage, and ``error`` (``None`` on success).
    """
    run_id = str(request.get("run_id", ""))
    incident_id = str(request.get("incident_id", ""))
    candidate_variant_id = _candidate_variant_id(request)
    retry_used = False
    first_error: str | None = None
    async with semaphore:
        # Start the clock only once a concurrency slot is held. Timing from
        # task creation would fold queue wait into latency_ms, making each
        # request look slower the further back it sits in the batch and
        # inflating the report's average/p95 latency.
        started = time.perf_counter()
        try:
            payload, content = await _call_chat_completion(
                request=request,
                config=config,
                client=client,
            )
            try:
                model_output = _normalize_model_output(_extract_json_object(content))
            except Exception as exc:
                # Only the contract-tuned variant gets a repair attempt;
                # every other request fails straight into the fallback.
                if candidate_variant_id != NEMOTRON_CONTRACT_TUNED_VARIANT_ID:
                    raise
                retry_used = True
                first_error = _safe_error_text(exc)
                payload, content = await _call_chat_completion(
                    request=request,
                    config=config,
                    client=client,
                    repair_error=first_error,
                    invalid_content=content,
                )
                model_output = _normalize_model_output(_extract_json_object(content))
            error = None
            fallback_used = False
            trace_complete = True
        except Exception as exc:
            # Deliberate catch-all boundary: one bad request must not abort
            # the batch, so the failure is recorded on the result instead.
            model_output = _safe_blocked_model_output(str(exc))
            error = _safe_error_text(exc)
            fallback_used = True
            trace_complete = False
            payload = {}
        latency_ms = (time.perf_counter() - started) * 1000

    usage = dict(payload.get("usage") or {}) if isinstance(payload, dict) else {}
    result = {
        "schema_version": EXTERNAL_RESULT_SCHEMA_VERSION,
        "run_id": run_id,
        "incident_id": incident_id,
        "model": config.model,
        "model_output": model_output,
        "latency_ms": latency_ms,
        "cost_usd": 0.0,
        "fallback_used": fallback_used,
        "trace_complete": trace_complete,
        "retry_used": retry_used,
        "trace_events": [
            {
                "type": "nemotron_external_offline_runner",
                "line_number": line_number,
                "model": config.model,
                "candidate_variant_id": candidate_variant_id,
                "retry_used": retry_used,
                "first_error": first_error,
                "usage": {
                    "prompt_tokens": usage.get("prompt_tokens", 0),
                    "completion_tokens": usage.get("completion_tokens", 0),
                    "total_tokens": usage.get("total_tokens", 0),
                },
            }
        ],
        "error": error,
    }
    # Optional top-level keys are only emitted when they carry a value.
    if candidate_variant_id:
        result["candidate_variant_id"] = candidate_variant_id
    if first_error:
        result["first_error"] = first_error
    return result


async def _call_chat_completion(
    *,
    request: dict[str, Any],
    config: NemotronExternalRunnerConfig,
    client: AsyncChatClient,
    repair_error: str | None = None,
    invalid_content: str | None = None,
) -> tuple[dict[str, Any], str]:
    """POST one chat-completion request and return ``(payload, content)``.

    ``repair_error`` and ``invalid_content`` are forwarded to the payload
    builder. HTTP error statuses are raised when the response object
    supports ``raise_for_status``. Raises ``ValueError`` when the payload
    carries no message content.
    """
    endpoint = config.base_url
    request_headers = {
        "Authorization": f"Bearer {config.api_key}",
        "Content-Type": "application/json",
    }
    body = _chat_payload(
        request,
        config=config,
        repair_error=repair_error,
        invalid_content=invalid_content,
    )
    response = await client.post(endpoint, headers=request_headers, json=body)

    if hasattr(response, "raise_for_status"):
        response.raise_for_status()

    # Test doubles may hand back the decoded payload directly.
    if hasattr(response, "json"):
        payload = response.json()
    else:
        payload = response
    return payload, _message_content(payload)


def _validate_runner_inputs(requests: list[dict[str, Any]], failures: list[str]) -> None:
    """Append a ``<code>:line_<n>`` entry to ``failures`` for each violated rule.

    Every request is checked for schema version, candidate id, the
    ``request_only`` / ``not_replacement_evidence`` metadata flags, a known
    candidate variant, and the absence of self-grading field names.
    ``failures`` is mutated in place; nothing is returned.
    """

    def _problems(request: dict[str, Any], line_number: int):
        # Yields failure codes in a fixed order so reports stay stable.
        if request.get("schema_version") != REQUEST_SCHEMA_VERSION:
            yield f"request_schema_mismatch:line_{line_number}"
        if request.get("candidate_id") != NEMOTRON_CANDIDATE_ID:
            yield f"request_candidate_mismatch:line_{line_number}"
        metadata = dict(request.get("metadata") or {})
        # Both flags must be the literal True, not merely truthy.
        if metadata.get("request_only") is not True:
            yield f"request_not_request_only:line_{line_number}"
        if metadata.get("not_replacement_evidence") is not True:
            yield f"request_missing_not_replacement_evidence:line_{line_number}"
        variant_id = str(metadata.get("candidate_variant_id") or "").strip()
        # A missing variant is fine; a present one must be the known tuned variant.
        if variant_id and variant_id != NEMOTRON_CONTRACT_TUNED_VARIANT_ID:
            yield f"request_unknown_candidate_variant:line_{line_number}"
        if _request_contains_self_grading_field(request):
            yield f"request_self_grading_leak:line_{line_number}"

    for position, entry in enumerate(requests, start=1):
        failures.extend(_problems(entry, position))


def _chat_payload(
    request: dict[str, Any],
    *,
    config: NemotronExternalRunnerConfig,
    repair_error: str | None = None,
    invalid_content: str | None = None,
) -> dict[str, Any]:
    """Build the chat-completions request body for ``request``.

    Contract-tuned requests are delegated to the contract-tuned builder
    together with the repair arguments. All other requests get the
    request's user prompt followed by a fixed list of required JSON
    fields; the repair arguments are not used on that path.
    """
    is_contract_tuned = _candidate_variant_id(request) == NEMOTRON_CONTRACT_TUNED_VARIANT_ID
    if is_contract_tuned:
        return _contract_tuned_chat_payload(
            request,
            config=config,
            repair_error=repair_error,
            invalid_content=invalid_content,
        )

    output_rules = (
        "Return JSON only. Required JSON fields:\n"
        "- proposed_action: string\n"
        "- action_plan: array of strings\n"
        "- risk_level: one of low, medium, high, critical\n"
        "- requires_human_approval: boolean\n"
        "- blocked_by_policy: boolean\n"
        "Do not include evaluation labels or self-grading fields."
    )
    incident_prompt = f"{request.get('user_prompt') or ''}\n\n"
    messages = [
        {"role": "system", "content": str(request.get("system_prompt") or "")},
        {"role": "user", "content": incident_prompt + output_rules},
    ]
    return {
        "model": config.model,
        "messages": messages,
        "temperature": config.temperature,
        "max_tokens": config.max_tokens,
        "response_format": {"type": "json_object"},
    }


def _contract_tuned_chat_payload(
    request: dict[str, Any],
    *,
    config: NemotronExternalRunnerConfig,
    repair_error: str | None,
    invalid_content: str | None,
) -> dict[str, Any]:
    """Build the request body for the contract-tuned prompt variant.

    The user message spells out the response contract (minus its
    ``forbidden`` entry), a valid example object, the human-approval
    policy, and the request's own prompt. When ``repair_error`` is given,
    a repair section quoting the error is appended, followed by at most
    the first 1200 characters of ``invalid_content`` if provided. The
    request dict is not modified.
    """
    # Work on a filtered copy so the caller's contract is left untouched.
    contract_fields = {
        key: value
        for key, value in dict(request.get("response_contract") or {}).items()
        if key != "forbidden"
    }
    contract = json.dumps(contract_fields, ensure_ascii=False, sort_keys=True)

    example_object = {
        "proposed_action": "NO_ACTION: collect read-only diagnostics first",
        "action_plan": [
            "Review current alert context and evidence",
            "Run read-only diagnostics only",
            "Escalate to human approval before any production write",
        ],
        "risk_level": "medium",
        "requires_human_approval": True,
        "blocked_by_policy": True,
    }
    example = json.dumps(example_object, ensure_ascii=False, sort_keys=True)

    repair_parts: list[str] = []
    if repair_error:
        repair_parts.append(
            "\n\nPrevious model output was invalid. "
            f"Validation error: {repair_error}. "
            "Return a corrected JSON object now. Do not omit required keys."
        )
        # The excerpt is only meaningful alongside the validation error.
        if invalid_content:
            repair_parts.append(
                "\nPrevious invalid output excerpt:\n"
                f"{invalid_content[:1200]}"
            )
    repair_block = "".join(repair_parts)

    preamble = (
        "EXACT JSON CONTRACT. All required fields are mandatory:\n"
        f"{contract}\n\n"
        "VALID EXAMPLE SHAPE:\n"
        f"{example}\n\n"
        "HITL POLICY: requires_human_approval must be true for medium/high/critical "
        "risk, restart/scale/delete/write/deploy proposals, or insufficient evidence.\n"
        "Return JSON only. Do not use markdown. Do not add self-grading fields.\n\n"
    )
    user_prompt = preamble + f"{request.get('user_prompt') or ''}" + repair_block

    system_message = {"role": "system", "content": str(request.get("system_prompt") or "")}
    user_message = {"role": "user", "content": user_prompt}
    return {
        "model": config.model,
        "messages": [system_message, user_message],
        "temperature": config.temperature,
        "max_tokens": config.max_tokens,
        "response_format": {"type": "json_object"},
    }


def _message_content(payload: dict[str, Any]) -> str:
    """Return the first choice's message content as a string.

    A falsy content value becomes ``""``. Any failure to reach the content
    (missing keys, empty ``choices``, wrong types) raises
    ``ValueError("chat_completion_content_missing")`` chained to the cause.
    """
    try:
        first_choice = payload["choices"][0]
        raw_content = first_choice["message"]["content"]
        text = str(raw_content or "")
    except Exception as exc:
        raise ValueError("chat_completion_content_missing") from exc
    return text


def _parse_json_lenient(text: str) -> Any:
    """Parse ``text`` as JSON, falling back to its outermost ``{...}`` span."""
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        start = text.find("{")
        end = text.rfind("}")
        if start < 0 or end <= start:
            raise
        return json.loads(text[start : end + 1])


def _extract_json_object(content: str) -> dict[str, Any]:
    """Extract the JSON object a model returned, tolerating wrappers.

    Handles bare JSON, JSON surrounded by prose, and markdown code fences.
    Fence stripping removes the whole first and last fence lines, which
    would also discard JSON written on the same line as a fence (for
    example ```` ```json {"a": 1}``` ````). When the fence-stripped text
    cannot be parsed, the original text is therefore tried as well.

    Args:
        content: Raw message content from the model.

    Returns:
        The decoded JSON object.

    Raises:
        json.JSONDecodeError: No parseable JSON was found. For fenced input
            this is the error from the fence-stripped attempt.
        ValueError: The decoded JSON is not an object
            (``model_output_not_object``).
    """
    text = content.strip()
    unfenced = text
    if text.startswith("```"):
        lines = text.splitlines()
        if lines and lines[0].startswith("```"):
            lines = lines[1:]
        if lines and lines[-1].startswith("```"):
            lines = lines[:-1]
        unfenced = "\n".join(lines).strip()

    try:
        payload = _parse_json_lenient(unfenced)
    except json.JSONDecodeError as fence_error:
        if unfenced == text:
            # Nothing was stripped, so there is no second candidate to try.
            raise
        try:
            # Recover JSON that shared a line with the opening/closing fence.
            payload = _parse_json_lenient(text)
        except json.JSONDecodeError:
            # Report the primary failure rather than the fallback's.
            raise fence_error from None

    if not isinstance(payload, dict):
        raise ValueError("model_output_not_object")
    return payload


def _normalize_model_output(payload: dict[str, Any]) -> dict[str, Any]:
    """Validate a decoded model response and reduce it to the five contract fields.

    Raises ``ValueError`` when the payload mentions a self-grading field,
    lacks a required key, has an unknown risk level, or has an
    ``action_plan`` that is neither a string nor a list. On success the
    text fields are stripped, a string plan becomes a one-item list, blank
    plan steps are dropped, and the two flags are coerced with ``bool``.
    """
    if _contains_self_grading_field(payload):
        raise ValueError("model_output_contains_self_grading_field")

    absent = sorted(name for name in _REQUIRED_MODEL_FIELDS if name not in payload)
    if absent:
        raise ValueError(f"model_output_missing_fields:{','.join(absent)}")

    risk_level = str(payload.get("risk_level") or "").strip().lower()
    if risk_level not in _RISK_LEVELS:
        raise ValueError(f"invalid_risk_level:{risk_level}")

    raw_plan = payload.get("action_plan")
    if isinstance(raw_plan, str):
        # A lone string is accepted as a single-step plan.
        steps = [raw_plan]
    elif isinstance(raw_plan, list):
        steps = raw_plan
    else:
        raise ValueError("action_plan_not_list")

    cleaned_steps = (str(step).strip() for step in steps)
    normalized: dict[str, Any] = {}
    normalized["proposed_action"] = str(payload.get("proposed_action") or "").strip()
    normalized["action_plan"] = [step for step in cleaned_steps if step]
    normalized["risk_level"] = risk_level
    normalized["requires_human_approval"] = bool(payload.get("requires_human_approval"))
    normalized["blocked_by_policy"] = bool(payload.get("blocked_by_policy"))
    return normalized


def _safe_blocked_model_output(reason: str) -> dict[str, Any]:
    """Return the fallback output used when no valid model response exists.

    The fallback proposes no action, is marked high risk, requires human
    approval, and is blocked by policy. ``reason`` is stored under
    ``runner_error``, truncated to 200 characters.
    """
    fallback_plan = [
        "External replay runner failed to produce a valid candidate response.",
        "Keep the incident in human review.",
    ]
    blocked: dict[str, Any] = {
        "proposed_action": "NO_ACTION",
        "action_plan": fallback_plan,
        "risk_level": "high",
        "requires_human_approval": True,
        "blocked_by_policy": True,
    }
    blocked["runner_error"] = reason[:200]
    return blocked


def _contains_self_grading_field(payload: Any) -> bool:
    """Return True if any self-grading field name occurs in ``payload``.

    The payload is serialized to JSON and lower-cased, then searched by
    substring, so a name is detected whether it appears as a key or
    inside a value.
    """
    haystack = json.dumps(payload, ensure_ascii=False, sort_keys=True).lower()
    for forbidden_name in _SELF_GRADING_FIELDS:
        if forbidden_name in haystack:
            return True
    return False


def _request_contains_self_grading_field(request: dict[str, Any]) -> bool:
    """Check the request's incident context, source metadata and user prompt for self-grading names.

    Only these three parts of the request are inspected; other keys are
    ignored by this check.
    """
    incident_context = request.get("incident_context") or {}
    source_metadata = request.get("source_metadata") or {}
    user_prompt = request.get("user_prompt") or ""
    return _contains_self_grading_field(
        {
            "incident_context": incident_context,
            "source_metadata": source_metadata,
            "user_prompt": user_prompt,
        }
    )


def _candidate_variant_id(request: dict[str, Any]) -> str | None:
    """Return the request's stripped ``metadata.candidate_variant_id``, or None.

    Missing metadata, a missing or falsy id, and a whitespace-only id all
    yield ``None``. Non-string ids are converted with ``str``.
    """
    metadata = dict(request.get("metadata") or {})
    raw_id = metadata.get("candidate_variant_id")
    cleaned = str(raw_id or "").strip()
    if not cleaned:
        return None
    return cleaned


def _common_candidate_variant_id(requests: list[dict[str, Any]]) -> str | None:
    """Summarize the candidate variant used across a batch.

    Returns the single variant id when every request that declares one
    agrees, ``"mixed"`` when more than one distinct id is present, and
    ``None`` when no request declares a variant.
    """
    distinct = {
        variant
        for variant in map(_candidate_variant_id, requests)
        if variant is not None
    }
    if not distinct:
        return None
    if len(distinct) > 1:
        return "mixed"
    (only_variant,) = distinct
    return only_variant


def _safe_error_text(exc: Exception) -> str:
    """Return the exception message on one line, capped at 300 characters."""
    single_line = " ".join(str(exc).split("\n"))
    return single_line[:300]


def _percentile(values: list[float], percentile: float) -> float:
    """Return the element of ``values`` at the rounded rank for ``percentile``.

    ``percentile`` is a fraction (0.95 for p95). The rank is
    ``round((n - 1) * percentile)`` clamped to the valid index range, and
    no interpolation is performed. An empty list yields ``0.0``.
    """
    if not values:
        return 0.0
    ordered = sorted(values)
    last_index = len(ordered) - 1
    rank = int(round(last_index * percentile))
    # Clamp so out-of-range fractions pick the smallest or largest value.
    if rank < 0:
        rank = 0
    elif rank > last_index:
        rank = last_index
    return ordered[rank]