awoooi/apps/api/src/services/agent_langgraph_adapter.py

"""
LangGraph Incident Kernel Replay Adapter
========================================

Deterministic offline adapter for the `langgraph_incident_kernel` market
candidate. The real LangGraph SDK is not installed in this repo environment, so
this adapter models the expected state-machine boundary without adding a new
dependency or calling external services.

It never executes tools, never writes production systems, never sends messages,
and never reads fixture labels.
"""

from __future__ import annotations

import json
import time
from dataclasses import dataclass
from typing import Any

from src.services.agent_market_candidate_adapter import get_market_candidate_spec
from src.services.agent_replay_input import assert_no_evaluation_label_leak

# Candidate identifier passed to get_market_candidate_spec() when building a
# replay result for this adapter.
LANGGRAPH_CANDIDATE_ID = "langgraph_incident_kernel"


@dataclass(frozen=True)
class LangGraphKernelDecision:
    """Frozen wrapper around one candidate replay result payload.

    Only attribute rebinding is blocked by ``frozen=True``; the payload dict
    itself is stored by reference and is not copied on construction.
    """

    # Result mapping produced by the LangGraph-shaped kernel.
    payload: dict[str, Any]

    def to_dict(self) -> dict[str, Any]:
        """Return a fresh top-level dict holding the payload's items.

        The copy is shallow: nested values are shared with ``payload``.
        """
        snapshot: dict[str, Any] = {}
        snapshot.update(self.payload)
        return snapshot


def build_langgraph_candidate_result(
    candidate_input: dict[str, Any],
) -> LangGraphKernelDecision:
    """Build one offline LangGraph incident-kernel replay result.

    Args:
        candidate_input: Replay input mapping. ``incident_id`` and ``run_id``
            are required. ``incident_context`` is optional; when missing or
            falsy it is treated as an empty mapping, otherwise it is
            shallow-copied before use.

    Returns:
        A ``LangGraphKernelDecision`` whose payload carries the
        ``agent_candidate_replay_result_v1`` schema version, the selected
        plan, the derived risk level, and the trace events.

    Raises:
        ValueError: If ``incident_id`` or ``run_id`` is missing, ``None``, or
            blank after whitespace is stripped.
    """
    started = time.perf_counter()
    # Checked before any field of the input is read.
    # NOTE(review): presumably raises when evaluation labels are present in the
    # input; confirm against agent_replay_input.
    assert_no_evaluation_label_leak(candidate_input)
    spec = get_market_candidate_spec(LANGGRAPH_CANDIDATE_ID)

    # An explicit None (e.g. a JSON null) must count as missing. Passing it
    # through str() would yield the text "None" and slip past validation.
    raw_incident_id = candidate_input.get("incident_id")
    raw_run_id = candidate_input.get("run_id")
    incident_id = "" if raw_incident_id is None else str(raw_incident_id).strip()
    run_id = "" if raw_run_id is None else str(raw_run_id).strip()
    if not incident_id or not run_id:
        raise ValueError("candidate input must include incident_id and run_id")

    context = dict(candidate_input.get("incident_context") or {})
    state = _build_state(context)
    plan = _plan_from_state(state)
    risk_level = _risk_level(state, plan)
    requires_human_approval = _requires_human_approval(risk_level, plan)
    trace_events = _trace_events(state, plan, risk_level, requires_human_approval)
    # Wall-clock time spent in this function so far, in milliseconds.
    latency_ms = (time.perf_counter() - started) * 1000

    return LangGraphKernelDecision(
        payload={
            "schema_version": "agent_candidate_replay_result_v1",
            "run_id": run_id,
            "incident_id": incident_id,
            "candidate_id": spec.candidate_id,
            "candidate_role": spec.candidate_role,
            "proposed_action": plan["proposed_action"],
            "action_plan": plan["action_plan"],
            "risk_level": risk_level,
            "requires_human_approval": requires_human_approval,
            "blocked_by_policy": plan["blocked_by_policy"],
            "fallback_used": False,
            "trace_complete": True,
            "trace_events": trace_events,
            # Outcome fields are left unset by this adapter.
            "rca_correct": None,
            "tool_dry_run_pass": None,
            "repair_success": None,
            "false_repair": False,
            "latency_ms": latency_ms,
            "cost_usd": 0,
            "error": None,
            "metadata": {
                "adapter_mode": "deterministic_offline_workflow_kernel",
                "candidate_framework": "langgraph",
                "sdk_dependency": "langgraph_python_package_not_installed",
                "new_dependency_added": False,
                "state_nodes": [event["type"] for event in trace_events],
                "workflow_kernel": "awoooi_langgraph_incident_kernel_v1",
                "source": "langgraph_incident_kernel_offline_adapter",
            },
        }
    )


def build_langgraph_candidate_results(
    candidate_inputs: list[dict[str, Any]],
) -> list[LangGraphKernelDecision]:
    """Run the offline kernel over each input, preserving input order.

    Inputs are processed one at a time; an exception raised for any input
    propagates to the caller and no partial list is returned.
    """
    decisions: list[LangGraphKernelDecision] = []
    for single_input in candidate_inputs:
        decisions.append(build_langgraph_candidate_result(single_input))
    return decisions


def _build_state(context: dict[str, Any]) -> dict[str, Any]:
    """Flatten an incident context into the state mapping used for planning.

    Text fields are stripped; category and status are lower-cased and severity
    is upper-cased, with "general" and "P3" as fallbacks. The ``is_*`` flags
    come from substring checks against a lower-cased JSON dump of the whole
    context, so both keys and values can trigger a match.
    """
    searchable = json.dumps(context, ensure_ascii=False, sort_keys=True).lower()

    def mentions(*markers: str) -> bool:
        """Report whether any marker occurs in the serialized context."""
        for marker in markers:
            if marker in searchable:
                return True
        return False

    def text_of(key: str, fallback: str = "") -> str:
        """Read a context field as stripped text, using fallback when falsy."""
        return str(context.get(key) or fallback).strip()

    state: dict[str, Any] = {
        "alertname": text_of("alertname"),
        "category": text_of("alert_category", "general").lower(),
        "severity": text_of("severity", "P3").upper(),
        "status": text_of("status").lower(),
        "service": _primary_service(context),
        "namespace": _namespace(context),
        "haystack": searchable,
    }
    state["is_resolved"] = state["status"] == "resolved"
    state["is_backup"] = mentions("backup")
    state["is_postgres"] = mentions("postgres", "deadlock")
    state["is_host"] = mentions("host", "disk", "coldstart", "cold-start")
    state["is_container"] = mentions(
        "docker", "container", "cadvisor", "memory", "cpu", "unhealthy"
    )
    state["is_flywheel"] = mentions("flywheel", "awooop")
    return state


def _plan_from_state(state: dict[str, Any]) -> dict[str, Any]:
    """Pick the plan builder for a classified state.

    A resolved incident always yields an observe-only plan. Otherwise the
    first set flag, in the precedence order below, decides the plan; with no
    flag set the general observe-only plan is returned.
    """
    if state["is_resolved"]:
        return _observe_plan(state, "incident already resolved; preserve evidence")

    # Ordered by precedence: earlier entries win when several flags are set.
    routes = (
        ("is_backup", _backup_plan),
        ("is_postgres", _postgres_plan),
        ("is_flywheel", _flywheel_plan),
        ("is_host", _host_plan),
        ("is_container", _container_plan),
    )
    for flag, builder in routes:
        if state[flag]:
            return builder(state)

    return _observe_plan(state, "general incident requires read-only triage first")


def _observe_plan(state: dict[str, Any], reason: str) -> dict[str, Any]:
    """Build the no-action plan: classify, observe, and hand off to a human.

    ``reason`` is embedded in the proposed action text. The plan is marked as
    blocked by policy.
    """
    alertname = state["alertname"]
    service = state["service"]
    summary = f"NO_ACTION: {reason}; keep monitoring {alertname} for {service}"

    steps = [
        _step("classify", "policy", [state["category"], state["severity"]]),
        _step("observe", "awoooi", ["timeline", alertname, service]),
        _step("handoff", "human", ["review-if-recurs"]),
    ]
    return {
        "proposed_action": summary,
        "blocked_by_policy": True,
        "action_plan": steps,
    }


def _backup_plan(state: dict[str, Any]) -> dict[str, Any]:
    """Build the read-only backup diagnosis plan for the state's service.

    The steps describe cronjob and job listings, a log tail for the service's
    deployment in the state's namespace, and a backup timestamp metric.
    """
    service = state["service"]
    summary = (
        "READ_ONLY_BACKUP_DIAGNOSE: inspect backup job, freshness, logs, and "
        f"storage evidence for {service}; do not delete or rotate backups"
    )

    log_args = ["logs", f"deployment/{service}", "-n", state["namespace"], "--tail=200"]
    steps = [
        _step("inspect-cronjob", "kubectl", ["get", "cronjob", "-A"]),
        _step("inspect-jobs", "kubectl", ["get", "jobs", "-A"]),
        _step("read-logs", "kubectl", log_args),
        _step("verify-textfile", "prometheus", ["backup_last_success_timestamp"]),
    ]
    return {
        "proposed_action": summary,
        "blocked_by_policy": False,
        "action_plan": steps,
    }


def _postgres_plan(state: dict[str, Any]) -> dict[str, Any]:
    """Build the read-only Postgres diagnosis plan.

    The plan is fixed; ``state`` is accepted for a uniform builder signature
    but is not read.
    """
    summary = (
        "READ_ONLY_POSTGRES_DIAGNOSE: inspect pg_stat_activity, locks, and deadlocks; "
        "do not terminate sessions without approval"
    )
    checks = (
        ("inspect-activity", "postgres", ["select", "pg_stat_activity"]),
        ("inspect-locks", "postgres", ["select", "pg_locks"]),
        ("inspect-deadlocks", "prometheus", ["postgres_deadlocks_total"]),
    )
    steps = [_step(name, tool, args) for name, tool, args in checks]
    return {
        "proposed_action": summary,
        "blocked_by_policy": False,
        "action_plan": steps,
    }


def _flywheel_plan(state: dict[str, Any]) -> dict[str, Any]:
    """Build the read-only flywheel diagnosis plan.

    The plan is fixed; ``state`` is accepted for a uniform builder signature
    but is not read.
    """
    summary = (
        "READ_ONLY_FLYWHEEL_DIAGNOSE: inspect stuck incidents, agent sessions, "
        "approval queue, and timeline gaps before any repair"
    )
    checks = (
        ("inspect-incidents", "awoooi-api", ["GET", "/api/v1/incidents"]),
        ("inspect-agent-sessions", "database", ["select", "agent_sessions"]),
        ("inspect-approvals", "database", ["select", "approval_records"]),
    )
    steps = [_step(name, tool, args) for name, tool, args in checks]
    return {
        "proposed_action": summary,
        "blocked_by_policy": False,
        "action_plan": steps,
    }


def _host_plan(state: dict[str, Any]) -> dict[str, Any]:
    """Build the host diagnosis plan for the state's service.

    The steps describe disk usage, recent journal entries, the service's
    systemd status, and a filesystem metric tagged with the alert name.
    """
    service = state["service"]
    summary = (
        f"SSH_DIAGNOSE: run read-only host resource checks for {service} "
        "including df, journalctl, systemctl status, and cold-start gate evidence"
    )

    steps = [
        _step("disk", "ssh", ["df", "-h"]),
        _step("journal", "ssh", ["journalctl", "--no-pager", "-n", "200"]),
        _step("systemd", "ssh", ["systemctl", "status", service]),
        _step("prometheus", "prometheus", ["node_filesystem_avail_bytes", state["alertname"]]),
    ]
    return {
        "proposed_action": summary,
        "blocked_by_policy": False,
        "action_plan": steps,
    }


def _container_plan(state: dict[str, Any]) -> dict[str, Any]:
    """Build the read-only container diagnosis plan for the state's service.

    The steps describe the deployment, tail its logs, reference container
    CPU and memory metrics, and end with a human approval gate.
    """
    service = state["service"]
    summary = (
        "READ_ONLY_CONTAINER_DIAGNOSE: inspect docker/kubernetes resource signals for "
        f"{service}; require approval before restart, scale, deploy, or write"
    )

    namespace = state["namespace"]
    describe_args = ["describe", "deployment", service, "-n", namespace]
    log_args = ["logs", f"deployment/{service}", "-n", namespace, "--tail=200"]
    metric_args = ["docker_container_cpu_cores", "docker_container_memory_usage_bytes"]
    steps = [
        _step("kubectl-describe", "kubectl", describe_args),
        _step("kubectl-logs", "kubectl", log_args),
        _step("docker-stats", "prometheus", metric_args),
        _step("approval-gate", "human", ["approve-before-restart-or-scale"]),
    ]
    return {
        "proposed_action": summary,
        "blocked_by_policy": False,
        "action_plan": steps,
    }


def _risk_level(state: dict[str, Any], plan: dict[str, Any]) -> str:
    if state["severity"] == "P0":
        return "critical"
    if state["severity"] == "P1":
        return "high"
    action = json.dumps(plan, ensure_ascii=False).lower()
    if any(marker in action for marker in ("restart", "scale", "deploy", "write", "terminate")):
        return "medium"
    if state["severity"] == "P2":
        return "medium"
    return "low"


def _requires_human_approval(risk_level: str, plan: dict[str, Any]) -> bool:
    action = json.dumps(plan, ensure_ascii=False).lower()
    return risk_level in {"medium", "high", "critical"} or any(
        marker in action for marker in ("restart", "scale", "deploy", "write", "terminate")
    )


def _trace_events(
    state: dict[str, Any],
    plan: dict[str, Any],
    risk_level: str,
    requires_human_approval: bool,
) -> list[dict[str, Any]]:
    return [
        {"type": "input_loaded", "alertname": state["alertname"]},
        {"type": "state_classified", "category": state["category"], "severity": state["severity"]},
        {"type": "evidence_gate", "labels_visible_only": True},
        {"type": "plan_selected", "step_count": len(plan["action_plan"])},
        {
            "type": "safety_review",
            "risk_level": risk_level,
            "requires_human_approval": requires_human_approval,
            "blocked_by_policy": plan["blocked_by_policy"],
        },
        {"type": "finalized", "writes_executed": False, "tools_executed": False},
    ]


def _step(step: str, tool: str, args: list[str]) -> dict[str, Any]:
    return {"step": step, "tool": tool, "args": args, "mode": "read_only"}


def _primary_service(context: dict[str, Any]) -> str:
    """Pick the service name the plan should target.

    The first entry of ``affected_services`` wins when that list is non-empty.
    Otherwise signals are scanned in order and the first truthy label among
    the priority keys is used, keeping only the text before the first ":" and
    before the first "-". Falls back to "unknown".

    NOTE(review): the "-" cut also shortens hyphenated names (for example
    "my-service" becomes "my") - confirm this is intended.
    """
    declared = context.get("affected_services") or []
    if declared:
        return _resource_name(str(declared[0]))

    label_priority = ("deployment", "service", "container", "app", "pod", "instance")
    for signal in context.get("signals") or []:
        labels = signal.get("labels") or {}
        chosen = next((key for key in label_priority if labels.get(key)), None)
        if chosen is None:
            continue
        without_port, _, _ = str(labels[chosen]).partition(":")
        stem, _, _ = without_port.partition("-")
        return _resource_name(stem)

    return "unknown"


def _namespace(context: dict[str, Any]) -> str:
    """Return the first truthy ``namespace`` label found across the signals.

    The value is cleaned into a resource name; "default" is returned when no
    signal carries a namespace label.
    """
    signals = context.get("signals") or []
    for signal in signals:
        labels = signal.get("labels") or {}
        if not labels.get("namespace"):
            continue
        return _resource_name(str(labels["namespace"]))
    return "default"


def _resource_name(value: str) -> str:
    cleaned = "".join(
        char.lower()
        for char in value
        if char.isalnum() or char in {"-", "."}
    ).strip("-.")
    return cleaned or "unknown"