Files
awoooi/apps/api/src/services/agent_langgraph_adapter.py
Your Name cfb866d055
Some checks failed
Ansible Lint / lint (push) Successful in 35s
CD Pipeline / tests (push) Failing after 13s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Failing after 11s
feat(governance): add agent market automation surfaces
2026-06-04 21:50:55 +08:00

307 lines
12 KiB
Python

"""
LangGraph Incident Kernel Replay Adapter
=======================================
Deterministic offline adapter for the `langgraph_incident_kernel` market
candidate. The real LangGraph SDK is not installed in this repo environment, so
this adapter models the expected state-machine boundary without adding a new
dependency or calling external services.
It never executes tools, never writes production systems, never sends messages,
and never reads fixture labels.
"""
from __future__ import annotations
import json
import time
from dataclasses import dataclass
from typing import Any
from src.services.agent_market_candidate_adapter import get_market_candidate_spec
from src.services.agent_replay_input import assert_no_evaluation_label_leak
# Market-candidate identifier; passed to get_market_candidate_spec() when a
# replay result is built by this adapter.
LANGGRAPH_CANDIDATE_ID = "langgraph_incident_kernel"
@dataclass(frozen=True)
class LangGraphKernelDecision:
    """Candidate replay result produced by the LangGraph-shaped kernel.

    The dataclass is frozen, which only prevents rebinding ``payload``; the
    mapping itself (and the lists/dicts nested inside it) stays mutable. For
    that reason ``to_dict`` hands out deep copies.
    """

    # Replay result mapping for one incident.
    payload: dict[str, Any]

    def to_dict(self) -> dict[str, Any]:
        """Return an independent copy of the payload as a plain ``dict``.

        A shallow copy would share nested containers with the stored payload,
        letting a caller that edits the returned value silently change this
        frozen decision. The deep copy keeps the stored result stable.
        """
        # Local import keeps this block self-contained.
        from copy import deepcopy

        return deepcopy(dict(self.payload))
def build_langgraph_candidate_result(
    candidate_input: dict[str, Any],
) -> LangGraphKernelDecision:
    """Build one offline LangGraph incident-kernel replay result.

    Args:
        candidate_input: Replay input mapping. ``incident_id`` and ``run_id``
            are required; ``incident_context`` is optional and defaults to an
            empty mapping.

    Returns:
        A ``LangGraphKernelDecision`` whose payload carries the proposed
        action, action plan, risk assessment, trace events and metadata.

    Raises:
        ValueError: If ``incident_id`` or ``run_id`` is missing, ``None`` or
            blank after stripping whitespace.

    NOTE(review): ``assert_no_evaluation_label_leak`` is defined elsewhere and
    presumably raises when evaluation labels are present -- confirm.
    """
    started = time.perf_counter()
    assert_no_evaluation_label_leak(candidate_input)
    spec = get_market_candidate_spec(LANGGRAPH_CANDIDATE_ID)

    def _identifier(key: str) -> str:
        # str(None) is the non-empty string "None", which would slip through
        # the required-field check below; treat None as missing instead.
        value = candidate_input.get(key)
        return "" if value is None else str(value).strip()

    incident_id = _identifier("incident_id")
    run_id = _identifier("run_id")
    if not incident_id or not run_id:
        raise ValueError("candidate input must include incident_id and run_id")

    # Copy the context so downstream helpers never see the caller's object.
    context = dict(candidate_input.get("incident_context") or {})
    state = _build_state(context)
    plan = _plan_from_state(state)
    risk_level = _risk_level(state, plan)
    requires_human_approval = _requires_human_approval(risk_level, plan)
    trace_events = _trace_events(state, plan, risk_level, requires_human_approval)
    latency_ms = (time.perf_counter() - started) * 1000

    return LangGraphKernelDecision(
        payload={
            "schema_version": "agent_candidate_replay_result_v1",
            "run_id": run_id,
            "incident_id": incident_id,
            "candidate_id": spec.candidate_id,
            "candidate_role": spec.candidate_role,
            "proposed_action": plan["proposed_action"],
            "action_plan": plan["action_plan"],
            "risk_level": risk_level,
            "requires_human_approval": requires_human_approval,
            "blocked_by_policy": plan["blocked_by_policy"],
            "fallback_used": False,
            "trace_complete": True,
            "trace_events": trace_events,
            # Outcome fields are left unset by this adapter.
            "rca_correct": None,
            "tool_dry_run_pass": None,
            "repair_success": None,
            "false_repair": False,
            "latency_ms": latency_ms,
            "cost_usd": 0,
            "error": None,
            "metadata": {
                "adapter_mode": "deterministic_offline_workflow_kernel",
                "candidate_framework": "langgraph",
                "sdk_dependency": "langgraph_python_package_not_installed",
                "new_dependency_added": False,
                "state_nodes": [event["type"] for event in trace_events],
                "workflow_kernel": "awoooi_langgraph_incident_kernel_v1",
                "source": "langgraph_incident_kernel_offline_adapter",
            },
        }
    )
def build_langgraph_candidate_results(
    candidate_inputs: list[dict[str, Any]],
) -> list[LangGraphKernelDecision]:
    """Build one replay result per input, preserving input order.

    Any exception raised while building a single result propagates to the
    caller; no partial list is returned.
    """
    return list(map(build_langgraph_candidate_result, candidate_inputs))
def _build_state(context: dict[str, Any]) -> dict[str, Any]:
    """Derive the kernel state (normalized fields plus keyword flags) from context.

    The whole context is serialized to lower-cased JSON (the "haystack") and
    the ``is_*`` flags are plain substring checks against that text, so both
    keys and values of the context can trigger a flag.
    """
    haystack = json.dumps(context, ensure_ascii=False, sort_keys=True).lower()

    def _text(key: str, fallback: str = "") -> str:
        # Missing or falsy values collapse to the fallback before stripping.
        return str(context.get(key) or fallback).strip()

    def _mentions(*markers: str) -> bool:
        return any(marker in haystack for marker in markers)

    status = _text("status").lower()
    return {
        "alertname": _text("alertname"),
        "category": _text("alert_category", "general").lower(),
        "severity": _text("severity", "P3").upper(),
        "status": status,
        "service": _primary_service(context),
        "namespace": _namespace(context),
        "haystack": haystack,
        "is_resolved": status == "resolved",
        "is_backup": _mentions("backup"),
        "is_postgres": _mentions("postgres", "deadlock"),
        "is_host": _mentions("host", "disk", "coldstart", "cold-start"),
        "is_container": _mentions(
            "docker", "container", "cadvisor", "memory", "cpu", "unhealthy"
        ),
        "is_flywheel": _mentions("flywheel", "awooop"),
    }
def _plan_from_state(state: dict[str, Any]) -> dict[str, Any]:
    """Select the plan builder for the first matching state flag.

    Resolved incidents always get an observe-only plan. Otherwise the flags
    are checked in a fixed priority order (backup, postgres, flywheel, host,
    container); with no match the incident falls back to observe-only triage.
    """
    if state["is_resolved"]:
        return _observe_plan(state, "incident already resolved; preserve evidence")

    # Order matters: the first truthy flag wins.
    prioritized_planners = (
        ("is_backup", _backup_plan),
        ("is_postgres", _postgres_plan),
        ("is_flywheel", _flywheel_plan),
        ("is_host", _host_plan),
        ("is_container", _container_plan),
    )
    for flag, planner in prioritized_planners:
        if state[flag]:
            return planner(state)

    return _observe_plan(state, "general incident requires read-only triage first")
def _observe_plan(state: dict[str, Any], reason: str) -> dict[str, Any]:
    """Build the NO_ACTION plan: classify, observe, and hand off to a human.

    ``reason`` is embedded verbatim in the proposed-action text. The plan is
    marked ``blocked_by_policy``.
    """
    alertname = state["alertname"]
    service = state["service"]
    summary = f"NO_ACTION: {reason}; keep monitoring {alertname} for {service}"
    steps = [
        _step("classify", "policy", [state["category"], state["severity"]]),
        _step("observe", "awoooi", ["timeline", alertname, service]),
        _step("handoff", "human", ["review-if-recurs"]),
    ]
    return {
        "proposed_action": summary,
        "blocked_by_policy": True,
        "action_plan": steps,
    }
def _backup_plan(state: dict[str, Any]) -> dict[str, Any]:
    """Build the read-only backup diagnosis plan for the state's service."""
    service = state["service"]
    namespace = state["namespace"]
    summary = (
        "READ_ONLY_BACKUP_DIAGNOSE: inspect backup job, freshness, logs, and "
        f"storage evidence for {service}; do not delete or rotate backups"
    )
    log_args = ["logs", f"deployment/{service}", "-n", namespace, "--tail=200"]
    steps = [
        _step("inspect-cronjob", "kubectl", ["get", "cronjob", "-A"]),
        _step("inspect-jobs", "kubectl", ["get", "jobs", "-A"]),
        _step("read-logs", "kubectl", log_args),
        _step("verify-textfile", "prometheus", ["backup_last_success_timestamp"]),
    ]
    return {
        "proposed_action": summary,
        "blocked_by_policy": False,
        "action_plan": steps,
    }
def _postgres_plan(state: dict[str, Any]) -> dict[str, Any]:
    """Build the read-only Postgres diagnosis plan.

    ``state`` is accepted for signature parity with the other plan builders;
    the plan content does not depend on it.
    """
    summary = (
        "READ_ONLY_POSTGRES_DIAGNOSE: inspect pg_stat_activity, locks, and deadlocks; "
        "do not terminate sessions without approval"
    )
    inspections = (
        ("inspect-activity", "postgres", ["select", "pg_stat_activity"]),
        ("inspect-locks", "postgres", ["select", "pg_locks"]),
        ("inspect-deadlocks", "prometheus", ["postgres_deadlocks_total"]),
    )
    return {
        "proposed_action": summary,
        "blocked_by_policy": False,
        "action_plan": [_step(name, tool, args) for name, tool, args in inspections],
    }
def _flywheel_plan(state: dict[str, Any]) -> dict[str, Any]:
    """Build the read-only flywheel diagnosis plan.

    ``state`` is accepted for signature parity with the other plan builders;
    the plan content does not depend on it.
    """
    summary = (
        "READ_ONLY_FLYWHEEL_DIAGNOSE: inspect stuck incidents, agent sessions, "
        "approval queue, and timeline gaps before any repair"
    )
    inspections = (
        ("inspect-incidents", "awoooi-api", ["GET", "/api/v1/incidents"]),
        ("inspect-agent-sessions", "database", ["select", "agent_sessions"]),
        ("inspect-approvals", "database", ["select", "approval_records"]),
    )
    return {
        "proposed_action": summary,
        "blocked_by_policy": False,
        "action_plan": [_step(name, tool, args) for name, tool, args in inspections],
    }
def _host_plan(state: dict[str, Any]) -> dict[str, Any]:
    """Build the read-only host diagnosis plan (disk, journal, systemd, metrics)."""
    service = state["service"]
    alertname = state["alertname"]
    summary = (
        f"SSH_DIAGNOSE: run read-only host resource checks for {service} "
        "including df, journalctl, systemctl status, and cold-start gate evidence"
    )
    checks = [
        _step("disk", "ssh", ["df", "-h"]),
        _step("journal", "ssh", ["journalctl", "--no-pager", "-n", "200"]),
        _step("systemd", "ssh", ["systemctl", "status", service]),
        _step("prometheus", "prometheus", ["node_filesystem_avail_bytes", alertname]),
    ]
    return {
        "proposed_action": summary,
        "blocked_by_policy": False,
        "action_plan": checks,
    }
def _container_plan(state: dict[str, Any]) -> dict[str, Any]:
    """Build the read-only container diagnosis plan, ending in an approval gate."""
    service = state["service"]
    namespace = state["namespace"]
    summary = (
        "READ_ONLY_CONTAINER_DIAGNOSE: inspect docker/kubernetes resource signals for "
        f"{service}; require approval before restart, scale, deploy, or write"
    )
    describe_args = ["describe", "deployment", service, "-n", namespace]
    log_args = ["logs", f"deployment/{service}", "-n", namespace, "--tail=200"]
    metric_names = ["docker_container_cpu_cores", "docker_container_memory_usage_bytes"]
    steps = [
        _step("kubectl-describe", "kubectl", describe_args),
        _step("kubectl-logs", "kubectl", log_args),
        _step("docker-stats", "prometheus", metric_names),
        _step("approval-gate", "human", ["approve-before-restart-or-scale"]),
    ]
    return {
        "proposed_action": summary,
        "blocked_by_policy": False,
        "action_plan": steps,
    }
def _risk_level(state: dict[str, Any], plan: dict[str, Any]) -> str:
if state["severity"] == "P0":
return "critical"
if state["severity"] == "P1":
return "high"
action = json.dumps(plan, ensure_ascii=False).lower()
if any(marker in action for marker in ("restart", "scale", "deploy", "write", "terminate")):
return "medium"
if state["severity"] == "P2":
return "medium"
return "low"
def _requires_human_approval(risk_level: str, plan: dict[str, Any]) -> bool:
action = json.dumps(plan, ensure_ascii=False).lower()
return risk_level in {"medium", "high", "critical"} or any(
marker in action for marker in ("restart", "scale", "deploy", "write", "terminate")
)
def _trace_events(
state: dict[str, Any],
plan: dict[str, Any],
risk_level: str,
requires_human_approval: bool,
) -> list[dict[str, Any]]:
return [
{"type": "input_loaded", "alertname": state["alertname"]},
{"type": "state_classified", "category": state["category"], "severity": state["severity"]},
{"type": "evidence_gate", "labels_visible_only": True},
{"type": "plan_selected", "step_count": len(plan["action_plan"])},
{
"type": "safety_review",
"risk_level": risk_level,
"requires_human_approval": requires_human_approval,
"blocked_by_policy": plan["blocked_by_policy"],
},
{"type": "finalized", "writes_executed": False, "tools_executed": False},
]
def _step(step: str, tool: str, args: list[str]) -> dict[str, Any]:
return {"step": step, "tool": tool, "args": args, "mode": "read_only"}
def _primary_service(context: dict[str, Any]) -> str:
    """Pick the service name that plans should target.

    Resolution order:
      1. The first entry of ``affected_services``. A bare string is treated as
         a one-element list rather than being indexed per character.
      2. The first truthy label among deployment / service / container / app /
         pod / instance, scanning signals in order. The label value is cut at
         the first ":" and then at the first "-" before sanitizing.
      3. The literal ``"unknown"``.

    Signals or ``labels`` values that are not mappings are skipped instead of
    raising ``AttributeError``.
    """
    # Local import keeps this block self-contained.
    from collections.abc import Mapping

    services = context.get("affected_services") or []
    if isinstance(services, str):
        # "api"[0] would otherwise yield just "a".
        services = [services]
    if isinstance(services, (list, tuple)) and services:
        return _resource_name(str(services[0]))

    label_priority = ("deployment", "service", "container", "app", "pod", "instance")
    for signal in context.get("signals") or []:
        labels = signal.get("labels") if isinstance(signal, Mapping) else None
        if not isinstance(labels, Mapping):
            continue
        for key in label_priority:
            if labels.get(key):
                # NOTE(review): cutting at the first "-" also shortens
                # hyphenated deployment/service names -- confirm this is intended.
                return _resource_name(str(labels[key]).split(":")[0].split("-")[0])
    return "unknown"
def _namespace(context: dict[str, Any]) -> str:
    """Return the sanitized namespace from the first signal that labels one.

    Falls back to ``"default"`` when no signal carries a truthy ``namespace``
    label. Signals or ``labels`` values that are not mappings are skipped
    instead of raising ``AttributeError``.
    """
    # Local import keeps this block self-contained.
    from collections.abc import Mapping

    for signal in context.get("signals") or []:
        labels = signal.get("labels") if isinstance(signal, Mapping) else None
        if not isinstance(labels, Mapping):
            continue
        if labels.get("namespace"):
            return _resource_name(str(labels["namespace"]))
    return "default"
def _resource_name(value: str) -> str:
cleaned = "".join(
char.lower()
for char in value
if char.isalnum() or char in {"-", "."}
).strip("-.")
return cleaned or "unknown"