307 lines
12 KiB
Python
307 lines
12 KiB
Python
"""
LangGraph Incident Kernel Replay Adapter
=======================================

Deterministic offline adapter for the `langgraph_incident_kernel` market
candidate. The real LangGraph SDK is not installed in this repo environment, so
this adapter models the expected state-machine boundary without adding a new
dependency or calling external services.

It never executes tools, never writes production systems, never sends messages,
and never reads fixture labels.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import time
|
|
from dataclasses import dataclass
|
|
from typing import Any
|
|
|
|
from src.services.agent_market_candidate_adapter import get_market_candidate_spec
|
|
from src.services.agent_replay_input import assert_no_evaluation_label_leak
|
|
|
|
# Candidate identifier passed to get_market_candidate_spec() when a replay
# result is built.
LANGGRAPH_CANDIDATE_ID = "langgraph_incident_kernel"
|
|
|
|
|
|
@dataclass(frozen=True)
class LangGraphKernelDecision:
    """Candidate replay result produced by the LangGraph-shaped kernel.

    The dataclass is frozen, so ``payload`` cannot be rebound, but the dict it
    holds (and the lists/dicts nested inside it) are still mutable objects.
    ``to_dict`` therefore hands out an independent copy.
    """

    # Replay result fields keyed by field name.
    payload: dict[str, Any]

    def to_dict(self) -> dict[str, Any]:
        """Return the payload as a plain ``dict`` that shares nothing with it.

        A shallow copy would still alias nested containers, letting a caller
        that edits the returned structure silently change this frozen
        instance; a deep copy keeps the two fully separate.
        """
        # Function-scope import so the block needs no file-level change.
        from copy import deepcopy

        return deepcopy(dict(self.payload))
|
|
|
|
|
|
def build_langgraph_candidate_result(
    candidate_input: dict[str, Any],
) -> LangGraphKernelDecision:
    """Build one offline LangGraph incident-kernel replay result.

    Args:
        candidate_input: Replay input mapping. ``incident_id`` and ``run_id``
            are required; ``incident_context`` is optional and is treated as
            an empty mapping when missing or falsy.

    Returns:
        A ``LangGraphKernelDecision`` whose payload carries the
        ``agent_candidate_replay_result_v1`` fields.

    Raises:
        ValueError: If ``incident_id`` or ``run_id`` is absent, ``None``, or
            blank after stripping whitespace.
    """
    started = time.perf_counter()
    # NOTE(review): presumably rejects inputs that carry evaluation labels --
    # confirm against src.services.agent_replay_input.
    assert_no_evaluation_label_leak(candidate_input)
    spec = get_market_candidate_spec(LANGGRAPH_CANDIDATE_ID)

    # str(None) is the non-empty text "None", so an explicit null would pass
    # the emptiness check below unless it is mapped to "" first.
    identifiers: dict[str, str] = {}
    for key in ("incident_id", "run_id"):
        raw = candidate_input.get(key)
        identifiers[key] = "" if raw is None else str(raw).strip()
    if not all(identifiers.values()):
        raise ValueError("candidate input must include incident_id and run_id")
    incident_id = identifiers["incident_id"]
    run_id = identifiers["run_id"]

    # Work on a copy so the caller's context mapping is never handed onward.
    context = dict(candidate_input.get("incident_context") or {})
    state = _build_state(context)
    plan = _plan_from_state(state)
    risk_level = _risk_level(state, plan)
    requires_human_approval = _requires_human_approval(risk_level, plan)
    trace_events = _trace_events(state, plan, risk_level, requires_human_approval)
    # Wall-clock time spent inside this function, in milliseconds.
    latency_ms = (time.perf_counter() - started) * 1000

    return LangGraphKernelDecision(
        payload={
            "schema_version": "agent_candidate_replay_result_v1",
            "run_id": run_id,
            "incident_id": incident_id,
            "candidate_id": spec.candidate_id,
            "candidate_role": spec.candidate_role,
            "proposed_action": plan["proposed_action"],
            "action_plan": plan["action_plan"],
            "risk_level": risk_level,
            "requires_human_approval": requires_human_approval,
            "blocked_by_policy": plan["blocked_by_policy"],
            "fallback_used": False,
            "trace_complete": True,
            "trace_events": trace_events,
            # Outcome fields are left unset by this adapter.
            "rca_correct": None,
            "tool_dry_run_pass": None,
            "repair_success": None,
            "false_repair": False,
            "latency_ms": latency_ms,
            "cost_usd": 0,
            "error": None,
            "metadata": {
                "adapter_mode": "deterministic_offline_workflow_kernel",
                "candidate_framework": "langgraph",
                "sdk_dependency": "langgraph_python_package_not_installed",
                "new_dependency_added": False,
                # One node name per trace event, in emission order.
                "state_nodes": [event["type"] for event in trace_events],
                "workflow_kernel": "awoooi_langgraph_incident_kernel_v1",
                "source": "langgraph_incident_kernel_offline_adapter",
            },
        }
    )
|
|
|
|
|
|
def build_langgraph_candidate_results(
    candidate_inputs: list[dict[str, Any]],
) -> list[LangGraphKernelDecision]:
    """Build one replay result per candidate input, preserving input order."""
    decisions: list[LangGraphKernelDecision] = []
    for single_input in candidate_inputs:
        decisions.append(build_langgraph_candidate_result(single_input))
    return decisions
|
|
|
|
|
|
def _build_state(context: dict[str, Any]) -> dict[str, Any]:
    """Derive the classification state for one incident context.

    The whole context is serialized to lowercase JSON and every ``is_*`` flag
    except ``is_resolved`` is a substring test against that text, so a marker
    matches wherever it occurs in the serialized context (keys included).
    """
    searchable = json.dumps(context, ensure_ascii=False, sort_keys=True).lower()

    def field(key: str, fallback: str = "") -> str:
        # Falsy values (missing, None, "") collapse to the fallback.
        return str(context.get(key) or fallback).strip()

    def mentions(*markers: str) -> bool:
        return any(marker in searchable for marker in markers)

    alert = field("alertname")
    category = field("alert_category", "general").lower()
    severity = field("severity", "P3").upper()
    status = field("status").lower()
    service = _primary_service(context)
    namespace = _namespace(context)

    state: dict[str, Any] = {
        "alertname": alert,
        "category": category,
        "severity": severity,
        "status": status,
        "service": service,
        "namespace": namespace,
        "haystack": searchable,
        "is_resolved": status == "resolved",
        "is_backup": "backup" in searchable,
        "is_postgres": mentions("postgres", "deadlock"),
        "is_host": mentions("host", "disk", "coldstart", "cold-start"),
        "is_container": mentions(
            "docker", "container", "cadvisor", "memory", "cpu", "unhealthy"
        ),
        "is_flywheel": mentions("flywheel", "awooop"),
    }
    return state
|
|
|
|
|
|
def _plan_from_state(state: dict[str, Any]) -> dict[str, Any]:
    """Pick the plan builder for ``state``.

    A resolved incident always gets the observe-only plan. Otherwise the
    first truthy flag in the ordered route table wins, and an incident that
    matches no flag also falls back to the observe-only plan.
    """
    if state["is_resolved"]:
        return _observe_plan(state, "incident already resolved; preserve evidence")

    # Order is the precedence: earlier entries shadow later ones.
    routes = (
        ("is_backup", _backup_plan),
        ("is_postgres", _postgres_plan),
        ("is_flywheel", _flywheel_plan),
        ("is_host", _host_plan),
        ("is_container", _container_plan),
    )
    for flag, builder in routes:
        if state[flag]:
            return builder(state)

    return _observe_plan(state, "general incident requires read-only triage first")
|
|
|
|
|
|
def _observe_plan(state: dict[str, Any], reason: str) -> dict[str, Any]:
    """Build the no-action plan: classify, observe, and hand off to a human.

    ``reason`` is embedded in the proposed-action text. This is the only plan
    in this module that sets ``blocked_by_policy`` to ``True``.
    """
    alert = state["alertname"]
    service = state["service"]
    summary = f"NO_ACTION: {reason}; keep monitoring {alert} for {service}"
    steps = [
        _step("classify", "policy", [state["category"], state["severity"]]),
        _step("observe", "awoooi", ["timeline", alert, service]),
        _step("handoff", "human", ["review-if-recurs"]),
    ]
    return {
        "proposed_action": summary,
        "blocked_by_policy": True,
        "action_plan": steps,
    }
|
|
|
|
|
|
def _backup_plan(state: dict[str, Any]) -> dict[str, Any]:
    """Build the backup diagnosis plan for the service in ``state``.

    Steps list cronjobs and jobs across namespaces, read the service's
    deployment logs, and check the backup success-timestamp metric.
    """
    service = state["service"]
    summary = (
        "READ_ONLY_BACKUP_DIAGNOSE: inspect backup job, freshness, logs, and "
        f"storage evidence for {service}; do not delete or rotate backups"
    )
    log_args = ["logs", f"deployment/{service}", "-n", state["namespace"], "--tail=200"]
    steps = [
        _step("inspect-cronjob", "kubectl", ["get", "cronjob", "-A"]),
        _step("inspect-jobs", "kubectl", ["get", "jobs", "-A"]),
        _step("read-logs", "kubectl", log_args),
        _step("verify-textfile", "prometheus", ["backup_last_success_timestamp"]),
    ]
    return {
        "proposed_action": summary,
        "blocked_by_policy": False,
        "action_plan": steps,
    }
|
|
|
|
|
|
def _postgres_plan(state: dict[str, Any]) -> dict[str, Any]:
    """Build the Postgres diagnosis plan.

    ``state`` is accepted for signature parity with the other plan builders
    but is not read; the plan content is fixed.
    """
    summary = (
        "READ_ONLY_POSTGRES_DIAGNOSE: inspect pg_stat_activity, locks, and deadlocks; "
        "do not terminate sessions without approval"
    )
    steps = [
        _step("inspect-activity", "postgres", ["select", "pg_stat_activity"]),
        _step("inspect-locks", "postgres", ["select", "pg_locks"]),
        _step("inspect-deadlocks", "prometheus", ["postgres_deadlocks_total"]),
    ]
    return {
        "proposed_action": summary,
        "blocked_by_policy": False,
        "action_plan": steps,
    }
|
|
|
|
|
|
def _flywheel_plan(state: dict[str, Any]) -> dict[str, Any]:
    """Build the flywheel diagnosis plan.

    ``state`` is accepted for signature parity with the other plan builders
    but is not read; the plan content is fixed.
    """
    summary = (
        "READ_ONLY_FLYWHEEL_DIAGNOSE: inspect stuck incidents, agent sessions, "
        "approval queue, and timeline gaps before any repair"
    )
    steps = [
        _step("inspect-incidents", "awoooi-api", ["GET", "/api/v1/incidents"]),
        _step("inspect-agent-sessions", "database", ["select", "agent_sessions"]),
        _step("inspect-approvals", "database", ["select", "approval_records"]),
    ]
    return {
        "proposed_action": summary,
        "blocked_by_policy": False,
        "action_plan": steps,
    }
|
|
|
|
|
|
def _host_plan(state: dict[str, Any]) -> dict[str, Any]:
    """Build the host diagnosis plan for the service in ``state``.

    Steps cover disk usage, the journal tail, the service's systemd status,
    and a filesystem-availability metric paired with the alert name.
    """
    service = state["service"]
    summary = (
        f"SSH_DIAGNOSE: run read-only host resource checks for {service} "
        "including df, journalctl, systemctl status, and cold-start gate evidence"
    )
    steps = [
        _step("disk", "ssh", ["df", "-h"]),
        _step("journal", "ssh", ["journalctl", "--no-pager", "-n", "200"]),
        _step("systemd", "ssh", ["systemctl", "status", service]),
        _step("prometheus", "prometheus", ["node_filesystem_avail_bytes", state["alertname"]]),
    ]
    return {
        "proposed_action": summary,
        "blocked_by_policy": False,
        "action_plan": steps,
    }
|
|
|
|
|
|
def _container_plan(state: dict[str, Any]) -> dict[str, Any]:
    """Build the container diagnosis plan for the service in ``state``.

    Steps describe the deployment, read its logs, check container CPU/memory
    metrics, and end with a human approval gate.
    """
    service = state["service"]
    namespace = state["namespace"]
    summary = (
        "READ_ONLY_CONTAINER_DIAGNOSE: inspect docker/kubernetes resource signals for "
        f"{service}; require approval before restart, scale, deploy, or write"
    )
    describe_args = ["describe", "deployment", service, "-n", namespace]
    log_args = ["logs", f"deployment/{service}", "-n", namespace, "--tail=200"]
    metric_args = ["docker_container_cpu_cores", "docker_container_memory_usage_bytes"]
    steps = [
        _step("kubectl-describe", "kubectl", describe_args),
        _step("kubectl-logs", "kubectl", log_args),
        _step("docker-stats", "prometheus", metric_args),
        _step("approval-gate", "human", ["approve-before-restart-or-scale"]),
    ]
    return {
        "proposed_action": summary,
        "blocked_by_policy": False,
        "action_plan": steps,
    }
|
|
|
|
|
|
def _risk_level(state: dict[str, Any], plan: dict[str, Any]) -> str:
|
|
if state["severity"] == "P0":
|
|
return "critical"
|
|
if state["severity"] == "P1":
|
|
return "high"
|
|
action = json.dumps(plan, ensure_ascii=False).lower()
|
|
if any(marker in action for marker in ("restart", "scale", "deploy", "write", "terminate")):
|
|
return "medium"
|
|
if state["severity"] == "P2":
|
|
return "medium"
|
|
return "low"
|
|
|
|
|
|
def _requires_human_approval(risk_level: str, plan: dict[str, Any]) -> bool:
|
|
action = json.dumps(plan, ensure_ascii=False).lower()
|
|
return risk_level in {"medium", "high", "critical"} or any(
|
|
marker in action for marker in ("restart", "scale", "deploy", "write", "terminate")
|
|
)
|
|
|
|
|
|
def _trace_events(
|
|
state: dict[str, Any],
|
|
plan: dict[str, Any],
|
|
risk_level: str,
|
|
requires_human_approval: bool,
|
|
) -> list[dict[str, Any]]:
|
|
return [
|
|
{"type": "input_loaded", "alertname": state["alertname"]},
|
|
{"type": "state_classified", "category": state["category"], "severity": state["severity"]},
|
|
{"type": "evidence_gate", "labels_visible_only": True},
|
|
{"type": "plan_selected", "step_count": len(plan["action_plan"])},
|
|
{
|
|
"type": "safety_review",
|
|
"risk_level": risk_level,
|
|
"requires_human_approval": requires_human_approval,
|
|
"blocked_by_policy": plan["blocked_by_policy"],
|
|
},
|
|
{"type": "finalized", "writes_executed": False, "tools_executed": False},
|
|
]
|
|
|
|
|
|
def _step(step: str, tool: str, args: list[str]) -> dict[str, Any]:
|
|
return {"step": step, "tool": tool, "args": args, "mode": "read_only"}
|
|
|
|
|
|
def _primary_service(context: dict[str, Any]) -> str:
    """Pick the service name the plans should target.

    The first entry of ``affected_services`` wins when that value is
    non-empty. Otherwise signals are scanned in order and the first truthy
    label among deployment/service/container/app/pod/instance is used, keeping
    only the text before its first ``:`` and first ``-``. Returns ``"unknown"``
    when nothing matches.
    """
    declared = context.get("affected_services") or []
    if declared:
        return _resource_name(str(declared[0]))

    label_priority = ("deployment", "service", "container", "app", "pod", "instance")
    for entry in context.get("signals") or []:
        labels = entry.get("labels") or {}
        hit = next((labels[key] for key in label_priority if labels.get(key)), None)
        if hit is not None:
            # Keep only the leading segment before any ":" or "-".
            head = str(hit).partition(":")[0].partition("-")[0]
            return _resource_name(head)
    return "unknown"
|
|
|
|
|
|
def _namespace(context: dict[str, Any]) -> str:
    """Return the first truthy ``namespace`` label among the signals.

    The value is normalized through ``_resource_name``; ``"default"`` is
    returned when no signal carries a namespace label.
    """
    signals = context.get("signals") or []
    for entry in signals:
        candidate = (entry.get("labels") or {}).get("namespace")
        if candidate:
            return _resource_name(str(candidate))
    return "default"
|
|
|
|
|
|
def _resource_name(value: str) -> str:
|
|
cleaned = "".join(
|
|
char.lower()
|
|
for char in value
|
|
if char.isalnum() or char in {"-", "."}
|
|
).strip("-.")
|
|
return cleaned or "unknown"
|