Files
awoooi/apps/api/src/services/playbook_generator.py
Your Name 4111ea4f9f
All checks were successful
Code Review / ai-code-review (push) Successful in 12s
CD Pipeline / tests (push) Successful in 1m13s
CD Pipeline / build-and-deploy (push) Successful in 3m36s
CD Pipeline / post-deploy-checks (push) Successful in 1m20s
fix(ai): remove 188 ollama provider
2026-05-06 14:34:48 +08:00

439 lines
17 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
LLM Playbook Generator - ADR-104 T1/T2/T6
=========================================
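Generate governable Playbook drafts from successfully remediated incidents.
Design notes:
- Use only the local/provider pool, in the order GCP-A -> 111 local, to avoid adding cloud cost.
- LLM output must be narrowed to a safe form via Pydantic + action_parser.
- Never go straight to APPROVED: start at DRAFT/REVIEW, then let the governance job promote it.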
"""
from __future__ import annotations
import json
import re
from collections.abc import Awaitable, Callable
from dataclasses import dataclass
from typing import Any
import structlog
from pydantic import BaseModel, Field, field_validator
from src.models.incident import Incident, IncidentStatus
from src.models.playbook import (
ActionType,
Playbook,
PlaybookSource,
PlaybookStatus,
RepairStep,
RiskLevel,
SymptomPattern,
)
from src.services.action_parser import kubectl_safety_reason
# Module-level structured logger, named after this module.
logger = structlog.get_logger(__name__)
# Signature of an injectable LLM backend: awaited with (prompt, context) and
# expected to yield a (raw_response, provider, success) tuple.
LLMCallable = Callable[[str, dict[str, Any]], Awaitable[tuple[str, str, bool]]]
class GeneratedRepairStep(BaseModel):
    """Schema for a single repair step as emitted by the LLM.

    Every field has a default, so a partially filled step still validates.
    All values are untrusted free text at this point.
    """

    # Free-form action label supplied by the LLM (defaults to "manual").
    action_type: str = Field(default="manual")
    # Command text proposed by the LLM; empty when none was given.
    command: str = Field(default="")
    expected_result: str | None = None
    rollback_command: str | None = None
    # After validation this is always one of LOW / MEDIUM / HIGH / CRITICAL.
    risk_level: str = Field(default="MEDIUM")

    @field_validator("risk_level", mode="before")
    @classmethod
    def normalize_risk(cls, value: object) -> str:
        """Coerce any incoming risk value to a known upper-case level.

        Args:
            value: Raw value from the LLM payload (any type, possibly None).

        Returns:
            ``"LOW"``, ``"MEDIUM"``, ``"HIGH"`` or ``"CRITICAL"``; anything
            missing or unrecognized collapses to ``"MEDIUM"``.
        """
        # Strip before comparing: a padded value such as " high " must not be
        # silently downgraded to MEDIUM just because of stray whitespace.
        risk = str(value or "MEDIUM").strip().upper()
        return risk if risk in {"LOW", "MEDIUM", "HIGH", "CRITICAL"} else "MEDIUM"
class GeneratedPlaybookPayload(BaseModel):
    """Strict JSON shape expected from the local LLM.

    Only ``name`` and ``description`` are required; every other field falls
    back to the default declared below when the LLM omits it.
    """

    # Required, non-empty, length-bounded text fields.
    name: str = Field(min_length=1, max_length=256)
    description: str = Field(min_length=1, max_length=2000)
    # Symptom-matching hints; each defaults to an empty list.
    alert_names: list[str] = Field(default_factory=list)
    affected_services: list[str] = Field(default_factory=list)
    # Defaults to a single "P2" entry when the LLM gives no severity range.
    severity_range: list[str] = Field(default_factory=lambda: ["P2"])
    keywords: list[str] = Field(default_factory=list)
    # Raw, not-yet-sanitized steps proposed by the LLM.
    repair_steps: list[GeneratedRepairStep] = Field(default_factory=list)
    # Bounded to 1..480 minutes.
    estimated_duration_minutes: int = Field(default=5, ge=1, le=480)
    # LLM self-reported confidence, bounded to 0.0..1.0.
    confidence: float = Field(default=0.5, ge=0.0, le=1.0)
    tags: list[str] = Field(default_factory=list)
    notes: str | None = None
@dataclass
class PlaybookGenerationResult:
    """Generator result plus provenance for timeline/KM/metrics."""

    # Generated Playbook, or None when nothing was produced.
    playbook: Playbook | None
    # Outcome label; this module emits "skipped", "fallback" or "success".
    outcome: str
    # Provider name reported by the LLM call ("none" when no call was made).
    provider: str
    # Short reason code for non-success outcomes; empty string on success.
    reason: str = ""
def _extract_json_object(text: str) -> dict[str, Any] | None:
    """Parse a JSON object out of an LLM response.

    Three strategies are tried in order:

    1. The whole (stripped) response is JSON. A valid non-object document
       (list, string, number, ...) is rejected with ``None``.
    2. A Markdown code fence (optionally tagged ``json``) wraps an object.
    3. Every ``{`` in the text is tried as the start of an embedded object,
       so surrounding prose or stray braces do not defeat the parse.

    Args:
        text: Raw LLM response; ``None`` or empty input is tolerated.

    Returns:
        The first JSON object found as a ``dict``, or ``None`` when the
        response contains no parseable object.
    """
    text = (text or "").strip()
    if not text:
        return None

    # Strategy 1: the response is pure JSON.
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        pass
    else:
        return data if isinstance(data, dict) else None

    # Strategy 2: an object inside a fenced block. A malformed fence is not
    # fatal: fall through to the scan below instead of giving up.
    fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
    if fenced:
        try:
            data = json.loads(fenced.group(1))
        except json.JSONDecodeError:
            pass
        else:
            if isinstance(data, dict):
                return data

    # Strategy 3: decode from each "{" in turn. raw_decode stops at the end
    # of the first complete document, so trailing prose (even prose that
    # contains braces) no longer breaks extraction.
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start >= 0:
        try:
            data, _end = decoder.raw_decode(text[start:])
        except json.JSONDecodeError:
            pass
        else:
            if isinstance(data, dict):
                return data
        start = text.find("{", start + 1)
    return None
def _safe_risk(value: str) -> RiskLevel:
    """Map a risk string onto ``RiskLevel``, defaulting to MEDIUM.

    The input is upper-cased before lookup; a value that ``RiskLevel``
    rejects with ``ValueError`` yields ``RiskLevel.MEDIUM``.
    """
    normalized = value.upper()
    try:
        level = RiskLevel(normalized)
    except ValueError:
        level = RiskLevel.MEDIUM
    return level
def _manual_step(step_number: int, command: str, reason: str) -> RepairStep:
    """Build a manual-review step that wraps an LLM-suggested command.

    The command is only quoted (trimmed to 240 characters) inside the step
    text. The resulting step is MANUAL, HIGH risk and always requires
    approval; ``reason`` is stored as the step's expected result.
    """
    preview = command.strip()[:240]
    if not preview:
        # Placeholder shown when the LLM gave no command at all.
        preview = "未提供命令"
    wrapped_command = f"人工審核 LLM 建議: {preview}"
    return RepairStep(
        step_number=step_number,
        action_type=ActionType.MANUAL,
        command=wrapped_command,
        expected_result=reason,
        requires_approval=True,
        risk_level=RiskLevel.HIGH,
    )
class LLMPlaybookGenerator:
    """Generate Playbook drafts from resolved incidents using local AI.

    The generator asks a local LLM for a JSON Playbook, validates it against
    ``GeneratedPlaybookPayload``, sanitizes every proposed step, and falls
    back to a deterministic draft built from the successful action when the
    LLM output cannot be used.
    """

    def __init__(
        self,
        playbook_service: Any | None = None,
        llm_callable: LLMCallable | None = None,
    ) -> None:
        """Store optional collaborators.

        Args:
            playbook_service: Persistence service; resolved lazily via
                ``get_playbook_service()`` on first use when omitted.
            llm_callable: Optional replacement for the default AI executor.
        """
        self._playbook_service = playbook_service
        self._llm_callable = llm_callable

    async def generate_from_incident(
        self,
        incident: Incident,
        action: str | None = None,
        persist: bool = True,
    ) -> PlaybookGenerationResult:
        """Generate and optionally persist a governed Playbook draft.

        Args:
            incident: Incident to learn from. It must be RESOLVED or CLOSED
                and its outcome must report ``execution_success is True``.
            action: The action that fixed the incident, if known.
            persist: When true, store the resulting Playbook via the service.

        Returns:
            A result whose ``outcome`` is ``"skipped"`` (preconditions not
            met), ``"fallback"`` (LLM output unusable) or ``"success"``.
        """
        if incident.status not in (IncidentStatus.RESOLVED, IncidentStatus.CLOSED):
            return self._record(None, "skipped", "none", "incident_not_resolved")
        if not incident.outcome or incident.outcome.execution_success is not True:
            return self._record(None, "skipped", "none", "execution_not_successful")

        prompt = self._build_prompt(incident, action)
        context = {
            "incident_id": incident.incident_id,
            "intent_hint": "playbook_generation",
            "task_type": "force_local",
            "alert_type": self._first_alert_name(incident),
            "target_resource": ",".join(incident.affected_services or []),
        }
        raw, provider, success = await self._call_local_llm(prompt, context)
        payload = self._parse_payload(raw) if success else None
        if payload is None:
            # LLM failed or returned something unusable: build a conservative
            # draft from the known-good action instead (may itself be None).
            fallback = self._deterministic_playbook(incident, action)
            if fallback and persist:
                fallback = await self._service().create(fallback)
            return self._record(fallback, "fallback", provider, "llm_payload_invalid")

        playbook = self._build_playbook(incident, payload, provider)
        if not playbook.repair_steps:
            playbook.repair_steps = self._deterministic_steps(incident, action)
            if not playbook.repair_steps:
                playbook.repair_steps = [
                    _manual_step(1, action or "未提供修復動作", "LLM 未產生可執行安全步驟")
                ]
            # Steps that did not come from the LLM never qualify for REVIEW.
            playbook.status = PlaybookStatus.DRAFT
        if persist:
            playbook = await self._persist_with_lineage(playbook)
        return self._record(playbook, "success", provider, "")

    async def _persist_with_lineage(self, playbook: Playbook) -> Playbook:
        """Create a new lineage version when a close approved Playbook exists.

        Asks the service for the single best recommendation for the
        candidate's symptom pattern. If its similarity score is at least
        0.85, the candidate is stored as a new version of that Playbook.
        In every other case, including any error during the lineage
        lookup, the candidate is stored as a standalone Playbook.
        """
        try:
            recommendations = await self._service().get_recommendations(
                symptoms=playbook.symptom_pattern,
                top_k=1,
                use_rag=False,
            )
            if recommendations and recommendations[0].similarity_score >= 0.85:
                base = recommendations[0].playbook
                created = await self._service().create_new_version(
                    base_playbook_id=base.playbook_id,
                    candidate=playbook,
                    reason="ADR-104 local LLM generated improved Playbook from successful incident",
                )
                if created is not None:
                    return created
        except Exception as exc:
            # Best effort: lineage is optional, plain creation below is not.
            logger.warning("playbook_generation_lineage_fallback", error=str(exc))
        return await self._service().create(playbook)

    async def _call_local_llm(
        self,
        prompt: str,
        context: dict[str, Any],
    ) -> tuple[str, str, bool]:
        """Run the prompt and return ``(raw_response, provider, success)``.

        An injected ``llm_callable`` takes precedence and its exceptions
        propagate to the caller. Otherwise the AI executor is used, and any
        failure there is logged and reported as
        ``("", "local_ai_error", False)``.
        """
        if self._llm_callable is not None:
            return await self._llm_callable(prompt, context)
        try:
            # Imported lazily, inside the try, so an import failure is also
            # reported as a failed LLM call.
            from src.services.ai_router import get_ai_executor

            executor = get_ai_executor()
            result = await executor.execute(
                prompt=prompt,
                provider_order=["ollama", "ollama_local"],
                context=context,
                cache_ttl=86400,
                require_local=True,
            )
            return result.raw_response, result.provider, result.success
        except Exception as exc:
            logger.warning("playbook_generation_llm_failed", error=str(exc))
            return "", "local_ai_error", False

    def _parse_payload(self, raw: str) -> GeneratedPlaybookPayload | None:
        """Extract and validate the LLM payload; ``None`` when unusable."""
        data = _extract_json_object(raw)
        if data is None:
            return None
        try:
            return GeneratedPlaybookPayload.model_validate(data)
        except Exception as exc:
            logger.warning("playbook_generation_payload_invalid", error=str(exc))
            return None

    def _build_playbook(
        self,
        incident: Incident,
        payload: GeneratedPlaybookPayload,
        provider: str,
    ) -> Playbook:
        """Turn a validated payload into a Playbook with sanitized steps.

        The Playbook starts in REVIEW only when the LLM confidence is at
        least 0.75 and at least one step survived sanitization; otherwise
        it starts in DRAFT. Fields the LLM left empty are filled from the
        incident.
        """
        steps = self._sanitize_steps(payload.repair_steps)
        confidence = payload.confidence
        status = PlaybookStatus.REVIEW if confidence >= 0.75 and steps else PlaybookStatus.DRAFT
        alert_names = payload.alert_names or [self._first_alert_name(incident)]
        affected = payload.affected_services or list(incident.affected_services or [])
        severity = payload.severity_range or ([incident.severity.value] if incident.severity else ["P2"])
        notes = payload.notes or ""
        provenance = f"Generated by {provider} from {incident.incident_id}"
        notes = f"{notes}\n{provenance}".strip()
        return Playbook(
            name=payload.name,
            description=payload.description,
            status=status,
            source=PlaybookSource.LLM_GENERATED,
            symptom_pattern=SymptomPattern(
                # Drop empty names (e.g. an incident without signals).
                alert_names=[x for x in alert_names if x],
                affected_services=affected,
                severity_range=severity,
                keywords=payload.keywords[:10],
            ),
            repair_steps=steps,
            estimated_duration_minutes=payload.estimated_duration_minutes,
            source_incident_ids=[incident.incident_id],
            ai_confidence=confidence,
            trust_score=0.3,
            tags=[*payload.tags[:8], "llm_generated", provider],
            notes=notes,
        )

    @staticmethod
    def _rollback_safety_reason(rollback_command: str | None) -> str | None:
        """Return why a kubectl rollback command is unsafe, or ``None``.

        Only rollback commands that start with ``kubectl`` are checked.
        NOTE(review): other rollback text is passed through unvalidated,
        as before — confirm how rollback commands are executed downstream.
        """
        rollback = (rollback_command or "").strip()
        if not rollback.startswith("kubectl"):
            return None
        reason = kubectl_safety_reason(rollback)
        return None if reason is None else f"rollback_command_unsafe: {reason}"

    def _sanitize_steps(self, steps: list[GeneratedRepairStep]) -> list[RepairStep]:
        """Convert LLM steps into governed ``RepairStep`` objects.

        At most the first eight steps are considered and steps without a
        command are dropped. kubectl steps pass through only when both the
        command and any kubectl rollback command clear
        ``kubectl_safety_reason``; ssh steps always require approval;
        everything else becomes a manual-review step.
        """
        sanitized: list[RepairStep] = []
        for raw_step in steps[:8]:
            command = raw_step.command.strip()
            if not command:
                continue
            # Number by position in the output so dropped steps leave no gap.
            step_number = len(sanitized) + 1
            action_type = raw_step.action_type.strip().lower()

            if command.startswith("kubectl") or action_type == "kubectl":
                safety_reason = kubectl_safety_reason(command)
                if safety_reason is None:
                    # The rollback is LLM-authored too and must clear the
                    # same gate before it is stored on an executable step.
                    safety_reason = self._rollback_safety_reason(raw_step.rollback_command)
                if safety_reason is not None:
                    sanitized.append(_manual_step(step_number, command, safety_reason))
                    continue
                risk = _safe_risk(raw_step.risk_level)
                sanitized.append(
                    RepairStep(
                        step_number=step_number,
                        action_type=ActionType.KUBECTL,
                        command=command,
                        expected_result=raw_step.expected_result,
                        rollback_command=raw_step.rollback_command,
                        requires_approval=risk in (RiskLevel.HIGH, RiskLevel.CRITICAL),
                        risk_level=risk,
                    )
                )
                continue

            if action_type == "ssh_command" or command.startswith("ssh "):
                sanitized.append(
                    RepairStep(
                        step_number=step_number,
                        action_type=ActionType.SSH_COMMAND,
                        command=command,
                        expected_result=raw_step.expected_result,
                        rollback_command=raw_step.rollback_command,
                        requires_approval=True,
                        # Floor the risk at MEDIUM. Assumes RiskLevel members
                        # are declared in ascending severity — TODO confirm.
                        risk_level=max(
                            _safe_risk(raw_step.risk_level),
                            RiskLevel.MEDIUM,
                            key=lambda r: list(RiskLevel).index(r),
                        ),
                    )
                )
                continue

            sanitized.append(_manual_step(step_number, command, "non_kubectl_step_requires_review"))
        return sanitized

    def _deterministic_playbook(self, incident: Incident, action: str | None) -> Playbook | None:
        """Build a conservative DRAFT Playbook without any LLM output.

        Returns ``None`` when no step can be derived from the action or
        the incident's learning notes.
        """
        steps = self._deterministic_steps(incident, action)
        if not steps:
            return None
        alert_name = self._first_alert_name(incident) or "Unknown"
        return Playbook(
            name=f"{alert_name} - AI 生成 fallback Playbook",
            description="LLM 產出不可解析時,從成功執行動作建立的保守 Playbook 草稿",
            status=PlaybookStatus.DRAFT,
            source=PlaybookSource.LLM_GENERATED,
            symptom_pattern=SymptomPattern(
                alert_names=[alert_name] if alert_name else [],
                affected_services=list(incident.affected_services or []),
                severity_range=[incident.severity.value] if incident.severity else ["P2"],
            ),
            repair_steps=steps,
            source_incident_ids=[incident.incident_id],
            ai_confidence=0.45,
            tags=["llm_generated", "fallback"],
            notes=f"Generated deterministically after local LLM parse failure for {incident.incident_id}",
        )

    def _deterministic_steps(self, incident: Incident, action: str | None) -> list[RepairStep]:
        """Derive at most one step from the action or the learning notes.

        A kubectl command that clears the safety check becomes an
        executable step, an ssh command becomes an approval-gated step,
        and anything else is wrapped as a manual-review step. Returns an
        empty list when there is no text to work from.
        """
        command = (action or "").strip()
        if not command and incident.outcome and incident.outcome.learning_notes:
            command = incident.outcome.learning_notes.strip()
        if not command:
            return []
        if command.startswith("kubectl"):
            safety_reason = kubectl_safety_reason(command)
            if safety_reason is None:
                return [
                    RepairStep(
                        step_number=1,
                        action_type=ActionType.KUBECTL,
                        command=command,
                        requires_approval=False,
                        risk_level=RiskLevel.MEDIUM,
                    )
                ]
            return [_manual_step(1, command, safety_reason)]
        if command.startswith("ssh "):
            return [
                RepairStep(
                    step_number=1,
                    action_type=ActionType.SSH_COMMAND,
                    command=command,
                    requires_approval=True,
                    risk_level=RiskLevel.MEDIUM,
                )
            ]
        return [_manual_step(1, command, "unknown_action_type")]

    def _build_prompt(self, incident: Incident, action: str | None) -> str:
        """Render the generation prompt with the incident context as JSON.

        Only the first five signals are included. A missing incident
        severity, signal severity or signal list is tolerated (rendered
        as ``null`` or an empty list), matching the None guards used by
        the other builders in this class.
        """
        signals = [
            {
                "alert_name": signal.alert_name,
                "severity": signal.severity.value if signal.severity else None,
                "labels": signal.labels,
                "annotations": signal.annotations,
            }
            for signal in (incident.signals or [])[:5]
        ]
        context = {
            "incident_id": incident.incident_id,
            "severity": incident.severity.value if incident.severity else None,
            "affected_services": incident.affected_services,
            "signals": signals,
            "hypothesis": incident.decision_chain.hypothesis if incident.decision_chain else "",
            "reasoning_steps": incident.decision_chain.reasoning_steps if incident.decision_chain else [],
            "successful_action": action or (incident.outcome.learning_notes if incident.outcome else ""),
            "effectiveness_score": incident.outcome.effectiveness_score if incident.outcome else None,
        }
        return (
            "你是 AWOOOI ADR-104 Playbook Generator由 OpenClaw/Hermes/NemoTron/ElephantAlpha 的角色視角共同產出。"
            "請只輸出 JSON object不要 markdown。任何破壞性命令必須改成 manual 步驟。\n"
            "JSON schema: {name, description, alert_names, affected_services, severity_range, keywords, "
            "repair_steps:[{action_type, command, expected_result, rollback_command, risk_level}], "
            "estimated_duration_minutes, confidence, tags, notes}.\n"
            f"Incident context:\n{json.dumps(context, ensure_ascii=False, default=str)}"
        )

    def _first_alert_name(self, incident: Incident) -> str:
        """Return the first signal's alert name, or ``""`` without signals."""
        return incident.signals[0].alert_name if incident.signals else ""

    def _service(self) -> Any:
        """Return the playbook service, resolving the default on first use."""
        if self._playbook_service is None:
            # Lazy import: only needed when no service was injected.
            from src.services.playbook_service import get_playbook_service

            self._playbook_service = get_playbook_service()
        return self._playbook_service

    def _record(
        self,
        playbook: Playbook | None,
        outcome: str,
        provider: str,
        reason: str,
    ) -> PlaybookGenerationResult:
        """Emit generation metrics (best effort) and wrap the result.

        Metric failures are logged at debug level and never affect the
        returned result.
        """
        try:
            from src.core.metrics import observe_playbook_status, record_playbook_generation

            source = provider or "none"
            record_playbook_generation(outcome=outcome, source=source)
            if playbook is not None:
                observe_playbook_status(status=playbook.status.value, source=source)
        except Exception as exc:
            logger.debug("playbook_generation_metric_failed", error=str(exc))
        return PlaybookGenerationResult(playbook=playbook, outcome=outcome, provider=provider, reason=reason)
# Process-wide generator instance, created on first request.
_generator: LLMPlaybookGenerator | None = None


def get_playbook_generator() -> LLMPlaybookGenerator:
    """Return the shared LLM Playbook generator, creating it on first call."""
    global _generator
    instance = _generator
    if instance is None:
        instance = LLMPlaybookGenerator()
        _generator = instance
    return instance