439 lines
17 KiB
Python
439 lines
17 KiB
Python
"""
LLM Playbook Generator - ADR-104 T1/T2/T6
=========================================
從成功修復案例生成可治理的 Playbook 草稿。

設計重點:
- 只用 local/provider pool 順序(GCP-A -> 111 local),避免新增雲端成本。
- LLM 產出必須經 Pydantic + action_parser 安全收斂。
- 不直接 APPROVED;先 DRAFT/REVIEW,再交治理 job 晉級。
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import json
|
||
import re
|
||
from collections.abc import Awaitable, Callable
|
||
from dataclasses import dataclass
|
||
from typing import Any
|
||
|
||
import structlog
|
||
from pydantic import BaseModel, Field, field_validator
|
||
|
||
from src.models.incident import Incident, IncidentStatus
|
||
from src.models.playbook import (
|
||
ActionType,
|
||
Playbook,
|
||
PlaybookSource,
|
||
PlaybookStatus,
|
||
RepairStep,
|
||
RiskLevel,
|
||
SymptomPattern,
|
||
)
|
||
from src.services.action_parser import kubectl_safety_reason
|
||
|
||
logger = structlog.get_logger(__name__)
|
||
|
||
LLMCallable = Callable[[str, dict[str, Any]], Awaitable[tuple[str, str, bool]]]
|
||
|
||
|
||
class GeneratedRepairStep(BaseModel):
    """Shape of a single repair step as emitted by the LLM.

    Every field has a default, so a partially filled step still validates.
    ``risk_level`` is normalised by :meth:`normalize_risk` before validation.
    """

    action_type: str = Field(default="manual")
    command: str = Field(default="")
    expected_result: str | None = None
    rollback_command: str | None = None
    risk_level: str = Field(default="MEDIUM")

    @field_validator("risk_level", mode="before")
    @classmethod
    def normalize_risk(cls, value: object) -> str:
        """Coerce the raw risk value to LOW / MEDIUM / HIGH / CRITICAL.

        Falsy input and unrecognised labels collapse to ``"MEDIUM"``.
        Surrounding whitespace is stripped first so that a padded label such
        as ``" high "`` keeps its level instead of being downgraded.
        """
        risk = str(value or "MEDIUM").strip().upper()
        return risk if risk in {"LOW", "MEDIUM", "HIGH", "CRITICAL"} else "MEDIUM"
|
||
|
||
|
||
class GeneratedPlaybookPayload(BaseModel):
    """JSON contract the local LLM response must satisfy.

    Only ``name`` and ``description`` are required; every other field falls
    back to the default declared below when the model omits it.
    """

    # Required, length-bounded text fields.
    name: str = Field(..., min_length=1, max_length=256)
    description: str = Field(..., min_length=1, max_length=2000)

    # Symptom-matching hints.
    alert_names: list[str] = Field(default_factory=list)
    affected_services: list[str] = Field(default_factory=list)
    severity_range: list[str] = Field(default_factory=lambda: ["P2"])
    keywords: list[str] = Field(default_factory=list)

    # Proposed remediation.
    repair_steps: list[GeneratedRepairStep] = Field(default_factory=list)
    estimated_duration_minutes: int = Field(5, ge=1, le=480)

    # Self-reported confidence, bounded to [0, 1].
    confidence: float = Field(0.5, ge=0.0, le=1.0)

    # Free-form metadata.
    tags: list[str] = Field(default_factory=list)
    notes: str | None = None
|
||
|
||
|
||
@dataclass
class PlaybookGenerationResult:
    """Outcome of one generation attempt, with provenance for timeline/KM/metrics."""

    # The generated playbook, or None when nothing was produced.
    playbook: Playbook | None
    # Outcome label chosen by the generator (e.g. "success", "fallback", "skipped").
    outcome: str
    # Name of the provider that answered, as reported by the LLM call.
    provider: str
    # Optional machine-readable explanation; empty when not needed.
    reason: str = ""
|
||
|
||
|
||
def _extract_json_object(text: str) -> dict[str, Any] | None:
|
||
"""Parse a JSON object from an LLM response."""
|
||
text = (text or "").strip()
|
||
if not text:
|
||
return None
|
||
try:
|
||
data = json.loads(text)
|
||
return data if isinstance(data, dict) else None
|
||
except json.JSONDecodeError:
|
||
pass
|
||
|
||
fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
|
||
if fenced:
|
||
try:
|
||
data = json.loads(fenced.group(1))
|
||
return data if isinstance(data, dict) else None
|
||
except json.JSONDecodeError:
|
||
return None
|
||
|
||
start = text.find("{")
|
||
end = text.rfind("}")
|
||
if start >= 0 and end > start:
|
||
try:
|
||
data = json.loads(text[start : end + 1])
|
||
return data if isinstance(data, dict) else None
|
||
except json.JSONDecodeError:
|
||
return None
|
||
return None
|
||
|
||
|
||
def _safe_risk(value: str) -> RiskLevel:
    """Convert a risk label to ``RiskLevel``, using MEDIUM for unknown labels."""
    label = value.upper()
    try:
        level = RiskLevel(label)
    except ValueError:
        # Label is not a RiskLevel member; settle on the middle level.
        level = RiskLevel.MEDIUM
    return level
|
||
|
||
|
||
def _manual_step(step_number: int, command: str, reason: str) -> RepairStep:
    """Wrap an LLM-suggested command in a manual, approval-gated HIGH-risk step.

    The command is stripped and cut to 240 characters for the step text;
    ``reason`` is stored as the step's expected result.
    """
    preview = command.strip()[:240]
    if not preview:
        preview = "未提供命令"
    return RepairStep(
        step_number=step_number,
        action_type=ActionType.MANUAL,
        command="人工審核 LLM 建議: " + preview,
        expected_result=reason,
        requires_approval=True,
        risk_level=RiskLevel.HIGH,
    )
|
||
|
||
|
||
class LLMPlaybookGenerator:
    """Generate Playbook drafts from resolved incidents using local AI.

    The generator prompts a local LLM, validates the answer against
    ``GeneratedPlaybookPayload``, sanitises every proposed step, and falls
    back to a deterministic single-step playbook when the answer is unusable.
    Generated playbooks are only ever created in DRAFT or REVIEW status.
    """

    def __init__(
        self,
        playbook_service: Any | None = None,
        llm_callable: LLMCallable | None = None,
    ) -> None:
        """Store optional collaborators.

        Args:
            playbook_service: Persistence service; resolved lazily through
                ``get_playbook_service()`` when omitted.
            llm_callable: Async ``(prompt, context) -> (raw, provider, success)``
                override for the LLM call; the AI executor is used when omitted.
        """
        self._playbook_service = playbook_service
        self._llm_callable = llm_callable

    async def generate_from_incident(
        self,
        incident: Incident,
        action: str | None = None,
        persist: bool = True,
    ) -> PlaybookGenerationResult:
        """Generate and optionally persist a governed Playbook draft.

        Args:
            incident: Source incident. It must be RESOLVED or CLOSED and its
                outcome must report ``execution_success is True``.
            action: The action that fixed the incident, if known.
            persist: When true, store the playbook through the playbook service.

        Returns:
            A result whose ``outcome`` is ``"skipped"`` (preconditions not met),
            ``"fallback"`` (LLM answer unusable; ``playbook`` may be ``None``
            when no deterministic step could be derived) or ``"success"``.
        """
        if incident.status not in (IncidentStatus.RESOLVED, IncidentStatus.CLOSED):
            return self._record(None, "skipped", "none", "incident_not_resolved")
        if not incident.outcome or incident.outcome.execution_success is not True:
            return self._record(None, "skipped", "none", "execution_not_successful")

        prompt = self._build_prompt(incident, action)
        context = {
            "incident_id": incident.incident_id,
            "intent_hint": "playbook_generation",
            "task_type": "force_local",
            "alert_type": self._first_alert_name(incident),
            "target_resource": ",".join(incident.affected_services or []),
        }

        raw, provider, success = await self._call_local_llm(prompt, context)
        payload = self._parse_payload(raw) if success else None
        if payload is None:
            fallback = self._deterministic_playbook(incident, action)
            # Compare against None explicitly; truthiness of a model object is
            # not a reliable "was one produced" signal.
            if fallback is not None and persist:
                fallback = await self._service().create(fallback)
            return self._record(fallback, "fallback", provider, "llm_payload_invalid")

        playbook = self._build_playbook(incident, payload, provider)
        if not playbook.repair_steps:
            # The LLM produced no usable step: reuse the known-good action.
            playbook.repair_steps = self._deterministic_steps(incident, action)
            if not playbook.repair_steps:
                playbook.repair_steps = [
                    _manual_step(1, action or "未提供修復動作", "LLM 未產生可執行安全步驟")
                ]
            playbook.status = PlaybookStatus.DRAFT

        if persist:
            playbook = await self._persist_with_lineage(playbook)

        return self._record(playbook, "success", provider, "")

    async def _persist_with_lineage(self, playbook: Playbook) -> Playbook:
        """Store ``playbook``, as a new version of a close match when one exists.

        The service is asked for the single best recommendation for the
        playbook's symptom pattern (RAG disabled). With a similarity score of
        at least 0.85 a new version of that playbook is created. Any failure
        along that path is logged and the playbook is created standalone.

        NOTE(review): no status check is made on the matched playbook here;
        presumably ``get_recommendations`` only returns approved ones - confirm.
        """
        try:
            recommendations = await self._service().get_recommendations(
                symptoms=playbook.symptom_pattern,
                top_k=1,
                use_rag=False,
            )
            if recommendations and recommendations[0].similarity_score >= 0.85:
                base = recommendations[0].playbook
                created = await self._service().create_new_version(
                    base_playbook_id=base.playbook_id,
                    candidate=playbook,
                    reason="ADR-104 local LLM generated improved Playbook from successful incident",
                )
                if created is not None:
                    return created
        except Exception as exc:
            logger.warning("playbook_generation_lineage_fallback", error=str(exc))
        return await self._service().create(playbook)

    async def _call_local_llm(
        self,
        prompt: str,
        context: dict[str, Any],
    ) -> tuple[str, str, bool]:
        """Call the injected LLM callable or the local AI executor.

        Returns:
            ``(raw_response, provider, success)``. Any exception, from either
            the injected callable or the executor, is logged and reported as
            ``("", "local_ai_error", False)`` so the caller can fall back.
        """
        try:
            if self._llm_callable is not None:
                return await self._llm_callable(prompt, context)

            from src.services.ai_router import get_ai_executor

            executor = get_ai_executor()
            result = await executor.execute(
                prompt=prompt,
                provider_order=["ollama", "ollama_local"],
                context=context,
                cache_ttl=86400,
                require_local=True,
            )
            return result.raw_response, result.provider, result.success
        except Exception as exc:
            logger.warning("playbook_generation_llm_failed", error=str(exc))
            return "", "local_ai_error", False

    def _parse_payload(self, raw: str) -> GeneratedPlaybookPayload | None:
        """Extract and validate the LLM payload; return ``None`` when unusable."""
        data = _extract_json_object(raw)
        if data is None:
            return None
        try:
            return GeneratedPlaybookPayload.model_validate(data)
        except Exception as exc:
            logger.warning("playbook_generation_payload_invalid", error=str(exc))
            return None

    def _build_playbook(
        self,
        incident: Incident,
        payload: GeneratedPlaybookPayload,
        provider: str,
    ) -> Playbook:
        """Turn a validated payload into a ``Playbook``.

        Status is REVIEW only when confidence is at least 0.75 and at least
        one sanitised step survived; otherwise DRAFT. Fields the payload left
        empty are filled from the incident.
        """
        steps = self._sanitize_steps(payload.repair_steps)
        confidence = payload.confidence
        status = PlaybookStatus.REVIEW if confidence >= 0.75 and steps else PlaybookStatus.DRAFT

        alert_names = payload.alert_names or [self._first_alert_name(incident)]
        affected = payload.affected_services or list(incident.affected_services or [])
        severity = payload.severity_range or ([incident.severity.value] if incident.severity else ["P2"])

        notes = payload.notes or ""
        provenance = f"Generated by {provider} from {incident.incident_id}"
        notes = f"{notes}\n{provenance}".strip()

        # Drop empty entries so a blank provider does not become an empty tag.
        tags = [tag for tag in (*payload.tags[:8], "llm_generated", provider) if tag]

        return Playbook(
            name=payload.name,
            description=payload.description,
            status=status,
            source=PlaybookSource.LLM_GENERATED,
            symptom_pattern=SymptomPattern(
                alert_names=[x for x in alert_names if x],
                affected_services=affected,
                severity_range=severity,
                keywords=payload.keywords[:10],
            ),
            repair_steps=steps,
            estimated_duration_minutes=payload.estimated_duration_minutes,
            source_incident_ids=[incident.incident_id],
            ai_confidence=confidence,
            trust_score=0.3,
            tags=tags,
            notes=notes,
        )

    @staticmethod
    def _is_kubectl_command(command: str) -> bool:
        """Return True when the first token of ``command`` is the kubectl binary."""
        tokens = command.split(None, 1)
        return bool(tokens) and tokens[0].rsplit("/", 1)[-1] == "kubectl"

    def _rollback_needs_review(self, rollback: str | None) -> bool:
        """Return True when an LLM-supplied rollback command is not known-safe.

        An empty rollback needs no review. A kubectl rollback is vetted with
        ``kubectl_safety_reason``; any other command cannot be vetted here
        and therefore needs review.
        """
        candidate = (rollback or "").strip()
        if not candidate:
            return False
        if not self._is_kubectl_command(candidate):
            return True
        return kubectl_safety_reason(candidate) is not None

    def _sanitize_steps(self, steps: list[GeneratedRepairStep]) -> list[RepairStep]:
        """Convert raw LLM steps into vetted ``RepairStep`` objects.

        Only the first eight raw steps are considered and steps without a
        command are dropped. Handling per step:

        * kubectl (by command prefix or declared action type): rejected
          commands, and commands that do not invoke kubectl, become manual
          steps. Accepted ones require approval for HIGH/CRITICAL risk or
          when the rollback command needs review.
        * ssh: always requires approval, risk raised to at least MEDIUM.
        * anything else: manual step.
        """
        # Assumes RiskLevel members are declared from lowest to highest - TODO confirm.
        risk_order = list(RiskLevel)
        sanitized: list[RepairStep] = []
        for raw_step in steps[:8]:
            command = raw_step.command.strip()
            if not command:
                continue
            step_number = len(sanitized) + 1
            action_type = raw_step.action_type.strip().lower()
            risk = _safe_risk(raw_step.risk_level)

            if command.startswith("kubectl") or action_type == "kubectl":
                safety_reason = kubectl_safety_reason(command)
                if safety_reason is None and not self._is_kubectl_command(command):
                    # Declared as kubectl but runs something else: never
                    # auto-execute it on the strength of the label alone.
                    safety_reason = "kubectl_action_without_kubectl_command"
                if safety_reason is not None:
                    sanitized.append(_manual_step(step_number, command, safety_reason))
                    continue
                sanitized.append(
                    RepairStep(
                        step_number=step_number,
                        action_type=ActionType.KUBECTL,
                        command=command,
                        expected_result=raw_step.expected_result,
                        rollback_command=raw_step.rollback_command,
                        requires_approval=(
                            risk in (RiskLevel.HIGH, RiskLevel.CRITICAL)
                            or self._rollback_needs_review(raw_step.rollback_command)
                        ),
                        risk_level=risk,
                    )
                )
                continue

            if action_type == "ssh_command" or command.startswith("ssh "):
                sanitized.append(
                    RepairStep(
                        step_number=step_number,
                        action_type=ActionType.SSH_COMMAND,
                        command=command,
                        expected_result=raw_step.expected_result,
                        rollback_command=raw_step.rollback_command,
                        requires_approval=True,
                        risk_level=max(risk, RiskLevel.MEDIUM, key=risk_order.index),
                    )
                )
                continue

            sanitized.append(_manual_step(step_number, command, "non_kubectl_step_requires_review"))
        return sanitized

    def _deterministic_playbook(self, incident: Incident, action: str | None) -> Playbook | None:
        """Build a conservative DRAFT playbook without the LLM.

        Returns ``None`` when no step can be derived from ``action`` or the
        incident's learning notes.
        """
        steps = self._deterministic_steps(incident, action)
        if not steps:
            return None
        alert_name = self._first_alert_name(incident) or "Unknown"
        return Playbook(
            name=f"{alert_name} - AI 生成 fallback Playbook",
            description="LLM 產出不可解析時,從成功執行動作建立的保守 Playbook 草稿",
            status=PlaybookStatus.DRAFT,
            source=PlaybookSource.LLM_GENERATED,
            symptom_pattern=SymptomPattern(
                alert_names=[alert_name] if alert_name else [],
                affected_services=list(incident.affected_services or []),
                severity_range=[incident.severity.value] if incident.severity else ["P2"],
            ),
            repair_steps=steps,
            source_incident_ids=[incident.incident_id],
            ai_confidence=0.45,
            tags=["llm_generated", "fallback"],
            notes=f"Generated deterministically after local LLM parse failure for {incident.incident_id}",
        )

    def _deterministic_steps(self, incident: Incident, action: str | None) -> list[RepairStep]:
        """Derive at most one step from the successful action.

        ``action`` is used when given, otherwise the outcome's learning notes.
        A kubectl command accepted by ``kubectl_safety_reason`` becomes an
        executable step; a rejected one becomes a manual step. An ``ssh``
        command becomes an approval-gated step; anything else is manual.
        """
        command = (action or "").strip()
        if not command and incident.outcome and incident.outcome.learning_notes:
            command = incident.outcome.learning_notes.strip()
        if not command:
            return []
        if command.startswith("kubectl"):
            safety_reason = kubectl_safety_reason(command)
            if safety_reason is None:
                return [
                    RepairStep(
                        step_number=1,
                        action_type=ActionType.KUBECTL,
                        command=command,
                        requires_approval=False,
                        risk_level=RiskLevel.MEDIUM,
                    )
                ]
            return [_manual_step(1, command, safety_reason)]
        if command.startswith("ssh "):
            return [
                RepairStep(
                    step_number=1,
                    action_type=ActionType.SSH_COMMAND,
                    command=command,
                    requires_approval=True,
                    risk_level=RiskLevel.MEDIUM,
                )
            ]
        return [_manual_step(1, command, "unknown_action_type")]

    def _build_prompt(self, incident: Incident, action: str | None) -> str:
        """Render the LLM prompt: fixed instructions plus JSON incident context.

        At most five signals are included. A missing severity (on the incident
        or a signal) and missing signals are tolerated, matching the guards
        used elsewhere in this class.
        """
        signals = [
            {
                "alert_name": signal.alert_name,
                "severity": signal.severity.value if signal.severity else None,
                "labels": signal.labels,
                "annotations": signal.annotations,
            }
            for signal in (incident.signals or [])[:5]
        ]
        context = {
            "incident_id": incident.incident_id,
            "severity": incident.severity.value if incident.severity else None,
            "affected_services": incident.affected_services,
            "signals": signals,
            "hypothesis": incident.decision_chain.hypothesis if incident.decision_chain else "",
            "reasoning_steps": incident.decision_chain.reasoning_steps if incident.decision_chain else [],
            "successful_action": action or (incident.outcome.learning_notes if incident.outcome else ""),
            "effectiveness_score": incident.outcome.effectiveness_score if incident.outcome else None,
        }
        return (
            "你是 AWOOOI ADR-104 Playbook Generator,由 OpenClaw/Hermes/NemoTron/ElephantAlpha 的角色視角共同產出。"
            "請只輸出 JSON object,不要 markdown。任何破壞性命令必須改成 manual 步驟。\n"
            "JSON schema: {name, description, alert_names, affected_services, severity_range, keywords, "
            "repair_steps:[{action_type, command, expected_result, rollback_command, risk_level}], "
            "estimated_duration_minutes, confidence, tags, notes}.\n"
            f"Incident context:\n{json.dumps(context, ensure_ascii=False, default=str)}"
        )

    def _first_alert_name(self, incident: Incident) -> str:
        """Return the first signal's alert name, or ``""`` without signals."""
        return incident.signals[0].alert_name if incident.signals else ""

    def _service(self) -> Any:
        """Return the playbook service, resolving and caching the default lazily."""
        if self._playbook_service is None:
            from src.services.playbook_service import get_playbook_service

            self._playbook_service = get_playbook_service()
        return self._playbook_service

    def _record(
        self,
        playbook: Playbook | None,
        outcome: str,
        provider: str,
        reason: str,
    ) -> PlaybookGenerationResult:
        """Emit generation metrics (best effort) and build the result object.

        Metric failures are logged at debug level and never propagate.
        """
        try:
            from src.core.metrics import observe_playbook_status, record_playbook_generation

            source = provider or "none"
            record_playbook_generation(outcome=outcome, source=source)
            if playbook is not None:
                observe_playbook_status(status=playbook.status.value, source=source)
        except Exception as exc:
            logger.debug("playbook_generation_metric_failed", error=str(exc))
        return PlaybookGenerationResult(playbook=playbook, outcome=outcome, provider=provider, reason=reason)
|
||
|
||
|
||
# Process-wide generator instance, created on first use.
_generator: LLMPlaybookGenerator | None = None


def get_playbook_generator() -> LLMPlaybookGenerator:
    """Return the shared ``LLMPlaybookGenerator``, creating it on the first call."""
    global _generator
    if _generator is not None:
        return _generator
    _generator = LLMPlaybookGenerator()
    return _generator
|