""" LangGraph Incident Kernel Replay Adapter ======================================= Deterministic offline adapter for the `langgraph_incident_kernel` market candidate. The real LangGraph SDK is not installed in this repo environment, so this adapter models the expected state-machine boundary without adding a new dependency or calling external services. It never executes tools, never writes production systems, never sends messages, and never reads fixture labels. """ from __future__ import annotations import json import time from dataclasses import dataclass from typing import Any from src.services.agent_market_candidate_adapter import get_market_candidate_spec from src.services.agent_replay_input import assert_no_evaluation_label_leak LANGGRAPH_CANDIDATE_ID = "langgraph_incident_kernel" @dataclass(frozen=True) class LangGraphKernelDecision: """Candidate replay result produced by the LangGraph-shaped kernel.""" payload: dict[str, Any] def to_dict(self) -> dict[str, Any]: return dict(self.payload) def build_langgraph_candidate_result( candidate_input: dict[str, Any], ) -> LangGraphKernelDecision: """Build one offline LangGraph incident-kernel replay result.""" started = time.perf_counter() assert_no_evaluation_label_leak(candidate_input) spec = get_market_candidate_spec(LANGGRAPH_CANDIDATE_ID) incident_id = str(candidate_input.get("incident_id", "")).strip() run_id = str(candidate_input.get("run_id", "")).strip() if not incident_id or not run_id: raise ValueError("candidate input must include incident_id and run_id") context = dict(candidate_input.get("incident_context") or {}) state = _build_state(context) plan = _plan_from_state(state) risk_level = _risk_level(state, plan) requires_human_approval = _requires_human_approval(risk_level, plan) trace_events = _trace_events(state, plan, risk_level, requires_human_approval) latency_ms = (time.perf_counter() - started) * 1000 return LangGraphKernelDecision( payload={ "schema_version": "agent_candidate_replay_result_v1", "run_id": run_id, "incident_id": incident_id, "candidate_id": spec.candidate_id, "candidate_role": spec.candidate_role, "proposed_action": plan["proposed_action"], "action_plan": plan["action_plan"], "risk_level": risk_level, "requires_human_approval": requires_human_approval, "blocked_by_policy": plan["blocked_by_policy"], "fallback_used": False, "trace_complete": True, "trace_events": trace_events, "rca_correct": None, "tool_dry_run_pass": None, "repair_success": None, "false_repair": False, "latency_ms": latency_ms, "cost_usd": 0, "error": None, "metadata": { "adapter_mode": "deterministic_offline_workflow_kernel", "candidate_framework": "langgraph", "sdk_dependency": "langgraph_python_package_not_installed", "new_dependency_added": False, "state_nodes": [event["type"] for event in trace_events], "workflow_kernel": "awoooi_langgraph_incident_kernel_v1", "source": "langgraph_incident_kernel_offline_adapter", }, } ) def build_langgraph_candidate_results( candidate_inputs: list[dict[str, Any]], ) -> list[LangGraphKernelDecision]: """Build many LangGraph incident-kernel replay results.""" return [build_langgraph_candidate_result(candidate_input) for candidate_input in candidate_inputs] def _build_state(context: dict[str, Any]) -> dict[str, Any]: haystack = json.dumps(context, ensure_ascii=False, sort_keys=True).lower() alertname = str(context.get("alertname") or "").strip() category = str(context.get("alert_category") or "general").strip().lower() severity = str(context.get("severity") or "P3").strip().upper() status = str(context.get("status") or "").strip().lower() service = _primary_service(context) namespace = _namespace(context) return { "alertname": alertname, "category": category, "severity": severity, "status": status, "service": service, "namespace": namespace, "haystack": haystack, "is_resolved": status == "resolved", "is_backup": "backup" in haystack, "is_postgres": any(marker in haystack for marker in ("postgres", "deadlock")), "is_host": any(marker in haystack for marker in ("host", "disk", "coldstart", "cold-start")), "is_container": any( marker in haystack for marker in ("docker", "container", "cadvisor", "memory", "cpu", "unhealthy") ), "is_flywheel": any(marker in haystack for marker in ("flywheel", "awooop")), } def _plan_from_state(state: dict[str, Any]) -> dict[str, Any]: if state["is_resolved"]: return _observe_plan(state, "incident already resolved; preserve evidence") if state["is_backup"]: return _backup_plan(state) if state["is_postgres"]: return _postgres_plan(state) if state["is_flywheel"]: return _flywheel_plan(state) if state["is_host"]: return _host_plan(state) if state["is_container"]: return _container_plan(state) return _observe_plan(state, "general incident requires read-only triage first") def _observe_plan(state: dict[str, Any], reason: str) -> dict[str, Any]: return { "proposed_action": ( f"NO_ACTION: {reason}; keep monitoring {state['alertname']} for {state['service']}" ), "blocked_by_policy": True, "action_plan": [ _step("classify", "policy", [state["category"], state["severity"]]), _step("observe", "awoooi", ["timeline", state["alertname"], state["service"]]), _step("handoff", "human", ["review-if-recurs"]), ], } def _backup_plan(state: dict[str, Any]) -> dict[str, Any]: return { "proposed_action": ( "READ_ONLY_BACKUP_DIAGNOSE: inspect backup job, freshness, logs, and " f"storage evidence for {state['service']}; do not delete or rotate backups" ), "blocked_by_policy": False, "action_plan": [ _step("inspect-cronjob", "kubectl", ["get", "cronjob", "-A"]), _step("inspect-jobs", "kubectl", ["get", "jobs", "-A"]), _step("read-logs", "kubectl", ["logs", f"deployment/{state['service']}", "-n", state["namespace"], "--tail=200"]), _step("verify-textfile", "prometheus", ["backup_last_success_timestamp"]), ], } def _postgres_plan(state: dict[str, Any]) -> dict[str, Any]: return { "proposed_action": ( "READ_ONLY_POSTGRES_DIAGNOSE: inspect pg_stat_activity, locks, and deadlocks; " "do not terminate sessions without approval" ), "blocked_by_policy": False, "action_plan": [ _step("inspect-activity", "postgres", ["select", "pg_stat_activity"]), _step("inspect-locks", "postgres", ["select", "pg_locks"]), _step("inspect-deadlocks", "prometheus", ["postgres_deadlocks_total"]), ], } def _flywheel_plan(state: dict[str, Any]) -> dict[str, Any]: return { "proposed_action": ( "READ_ONLY_FLYWHEEL_DIAGNOSE: inspect stuck incidents, agent sessions, " "approval queue, and timeline gaps before any repair" ), "blocked_by_policy": False, "action_plan": [ _step("inspect-incidents", "awoooi-api", ["GET", "/api/v1/incidents"]), _step("inspect-agent-sessions", "database", ["select", "agent_sessions"]), _step("inspect-approvals", "database", ["select", "approval_records"]), ], } def _host_plan(state: dict[str, Any]) -> dict[str, Any]: return { "proposed_action": ( f"SSH_DIAGNOSE: run read-only host resource checks for {state['service']} " "including df, journalctl, systemctl status, and cold-start gate evidence" ), "blocked_by_policy": False, "action_plan": [ _step("disk", "ssh", ["df", "-h"]), _step("journal", "ssh", ["journalctl", "--no-pager", "-n", "200"]), _step("systemd", "ssh", ["systemctl", "status", state["service"]]), _step("prometheus", "prometheus", ["node_filesystem_avail_bytes", state["alertname"]]), ], } def _container_plan(state: dict[str, Any]) -> dict[str, Any]: return { "proposed_action": ( f"READ_ONLY_CONTAINER_DIAGNOSE: inspect docker/kubernetes resource signals for " f"{state['service']}; require approval before restart, scale, deploy, or write" ), "blocked_by_policy": False, "action_plan": [ _step("kubectl-describe", "kubectl", ["describe", "deployment", state["service"], "-n", state["namespace"]]), _step("kubectl-logs", "kubectl", ["logs", f"deployment/{state['service']}", "-n", state["namespace"], "--tail=200"]), _step("docker-stats", "prometheus", ["docker_container_cpu_cores", "docker_container_memory_usage_bytes"]), _step("approval-gate", "human", ["approve-before-restart-or-scale"]), ], } def _risk_level(state: dict[str, Any], plan: dict[str, Any]) -> str: if state["severity"] == "P0": return "critical" if state["severity"] == "P1": return "high" action = json.dumps(plan, ensure_ascii=False).lower() if any(marker in action for marker in ("restart", "scale", "deploy", "write", "terminate")): return "medium" if state["severity"] == "P2": return "medium" return "low" def _requires_human_approval(risk_level: str, plan: dict[str, Any]) -> bool: action = json.dumps(plan, ensure_ascii=False).lower() return risk_level in {"medium", "high", "critical"} or any( marker in action for marker in ("restart", "scale", "deploy", "write", "terminate") ) def _trace_events( state: dict[str, Any], plan: dict[str, Any], risk_level: str, requires_human_approval: bool, ) -> list[dict[str, Any]]: return [ {"type": "input_loaded", "alertname": state["alertname"]}, {"type": "state_classified", "category": state["category"], "severity": state["severity"]}, {"type": "evidence_gate", "labels_visible_only": True}, {"type": "plan_selected", "step_count": len(plan["action_plan"])}, { "type": "safety_review", "risk_level": risk_level, "requires_human_approval": requires_human_approval, "blocked_by_policy": plan["blocked_by_policy"], }, {"type": "finalized", "writes_executed": False, "tools_executed": False}, ] def _step(step: str, tool: str, args: list[str]) -> dict[str, Any]: return {"step": step, "tool": tool, "args": args, "mode": "read_only"} def _primary_service(context: dict[str, Any]) -> str: services = context.get("affected_services") or [] if services: return _resource_name(str(services[0])) for signal in context.get("signals") or []: labels = signal.get("labels") or {} for key in ("deployment", "service", "container", "app", "pod", "instance"): if labels.get(key): return _resource_name(str(labels[key]).split(":")[0].split("-")[0]) return "unknown" def _namespace(context: dict[str, Any]) -> str: for signal in context.get("signals") or []: labels = signal.get("labels") or {} if labels.get("namespace"): return _resource_name(str(labels["namespace"])) return "default" def _resource_name(value: str) -> str: cleaned = "".join( char.lower() for char in value if char.isalnum() or char in {"-", "."} ).strip("-.") return cleaned or "unknown"