feat(playbook): generate drafts with local llm
Some checks failed
CD Pipeline / tests (push) Successful in 1m28s
Code Review / ai-code-review (push) Successful in 29s
Type Sync Check / check-type-sync (push) Failing after 2m41s
CD Pipeline / build-and-deploy (push) Successful in 8m40s
CD Pipeline / post-deploy-checks (push) Successful in 3m10s
Some checks failed
CD Pipeline / tests (push) Successful in 1m28s
Code Review / ai-code-review (push) Successful in 29s
Type Sync Check / check-type-sync (push) Failing after 2m41s
CD Pipeline / build-and-deploy (push) Successful in 8m40s
CD Pipeline / post-deploy-checks (push) Successful in 3m10s
This commit is contained in:
@@ -132,6 +132,26 @@ class Settings(BaseSettings):
|
||||
description="W2 PR-L1: 同 symptom_pattern_hash 累積幾條 KM 後觸發 Playbook review_required 標記(預設 N=5)",
|
||||
)
|
||||
|
||||
# ==========================================================================
|
||||
# ADR-104: LLM Playbook Generator
|
||||
# 成功修復且未命中既有 Playbook 時,用本地 LLM 生成 DRAFT/REVIEW Playbook。
|
||||
# 成本護欄:實作層只走 local provider(Ollama 111 → Ollama 188),不新增雲端 fallback。
|
||||
# 回滾指令: kubectl set env deployment/awoooi-api ENABLE_LLM_PLAYBOOK_GENERATION=false
|
||||
# ==========================================================================
|
||||
ENABLE_LLM_PLAYBOOK_GENERATION: bool = Field(
|
||||
default=True,
|
||||
description="ADR-104 T1: True=成功修復無 matched_playbook_id 時啟動本地 LLM 生成 Playbook, False=只用 deterministic extraction",
|
||||
)
|
||||
ENABLE_PLAYBOOK_DRAFT_GOVERNANCE_JOB: bool = Field(
|
||||
default=True,
|
||||
description="ADR-104 T2: True=定期治理 LLM Playbook DRAFT/REVIEW 晉級, False=停用",
|
||||
)
|
||||
PLAYBOOK_DRAFT_GOVERNANCE_INTERVAL_SECONDS: int = Field(
|
||||
default=3600,
|
||||
ge=60,
|
||||
description="ADR-104 T2: Playbook DRAFT governance job interval seconds",
|
||||
)
|
||||
|
||||
# ==========================================================================
|
||||
# aider-watch v2 integration (2026-04-20 ADR-091)
|
||||
# 整合 Mac aider CLI 監控進 awoooi 飛輪(events → incident → ai_router feedback)
|
||||
|
||||
@@ -253,6 +253,32 @@ RESOURCE_RESOLVE_TOTAL = Counter(
|
||||
["result"], # hit / miss / suggestion / error
|
||||
)
|
||||
|
||||
# =============================================================================
|
||||
# ADR-100 / ADR-104 Flywheel Emitter Metrics
|
||||
# =============================================================================
|
||||
|
||||
# Outcome counter for Playbook generation and governance, labelled by
# ``outcome`` and ``source``.
PLAYBOOK_GENERATION_TOTAL = Counter(
    "playbook_generation_total",
    "LLM Playbook generation and governance outcomes",
    ["outcome", "source"],
)

# Lifecycle-status gauge, labelled by ``status`` and ``source``.
# NOTE(review): the only writer visible here sets the value to 1 and never
# resets it, so a series marks "this status was observed", not a live count.
PLAYBOOK_STATUS_TOTAL = Gauge(
    "playbook_status_total",
    "Playbook lifecycle status observations from generation/governance",
    ["status", "source"],
)
|
||||
|
||||
|
||||
def record_playbook_generation(outcome: str, source: str) -> None:
    """Increment the Playbook generation/governance outcome counter by one.

    Args:
        outcome: Value for the ``outcome`` label.
        source: Value for the ``source`` label.
    """
    labelled_counter = PLAYBOOK_GENERATION_TOTAL.labels(outcome=outcome, source=source)
    labelled_counter.inc()
|
||||
|
||||
|
||||
def observe_playbook_status(status: str, source: str) -> None:
    """Mark a Playbook lifecycle status as observed.

    Sets the gauge series for ``(status, source)`` to 1.

    Args:
        status: Value for the ``status`` label.
        source: Value for the ``source`` label.
    """
    labelled_gauge = PLAYBOOK_STATUS_TOTAL.labels(status=status, source=source)
    labelled_gauge.set(1)
|
||||
|
||||
# =============================================================================
|
||||
# Solver MCP Registry Metrics (H2, 2026-04-27 台北時區)
|
||||
# 建立者: Claude Sonnet 4.6 (fullstack-engineer, B1 Fix Round)
|
||||
|
||||
125
apps/api/src/jobs/playbook_generation_governance_job.py
Normal file
125
apps/api/src/jobs/playbook_generation_governance_job.py
Normal file
@@ -0,0 +1,125 @@
|
||||
"""
|
||||
Playbook Generation Governance Job - ADR-104 T2/T6
|
||||
==================================================
|
||||
定期處理 LLM 生成 Playbook 的 DRAFT 黑洞:
|
||||
- DRAFT + confidence >= 0.75 + 安全步驟 → REVIEW
|
||||
- REVIEW + confidence >= 0.9 + 安全步驟 → APPROVED
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from dataclasses import dataclass
|
||||
|
||||
import structlog
|
||||
|
||||
from src.core.config import settings
|
||||
from src.models.playbook import ActionType, Playbook, PlaybookSource, PlaybookStatus
|
||||
from src.services.action_parser import is_safe_kubectl_action
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
class PlaybookGovernanceReport:
    """Summary of a single governance pass."""

    # Playbooks promoted DRAFT -> REVIEW during the pass.
    reviewed_count: int = 0
    # Playbooks promoted REVIEW -> APPROVED during the pass.
    approved_count: int = 0
    # Playbooks inspected but left unchanged.
    skipped_count: int = 0
    # One "<playbook_id>:<error>" entry per failed update; None becomes [].
    errors: list[str] | None = None

    def __post_init__(self) -> None:
        """Give every report its own error list when none was supplied."""
        self.errors = [] if self.errors is None else self.errors
|
||||
|
||||
|
||||
def _is_generated_candidate(playbook: Playbook) -> bool:
    """Return True when the playbook's source is eligible for governance.

    Only LLM-generated and auto-extracted playbooks are considered.
    """
    governed_sources = (PlaybookSource.LLM_GENERATED, PlaybookSource.EXTRACTED)
    return playbook.source in governed_sources
|
||||
|
||||
|
||||
def _has_safe_steps(playbook: Playbook) -> bool:
    """Return True when the playbook has steps and every step is acceptable.

    A step is acceptable when:
    - it is a KUBECTL step whose command passes ``is_safe_kubectl_action``;
    - it is a MANUAL step; or
    - it is any other action type and is flagged ``requires_approval``.

    A playbook without repair steps is never considered safe.
    """
    steps = playbook.repair_steps
    if not steps:
        return False

    def _acceptable(step) -> bool:
        if step.action_type == ActionType.KUBECTL:
            return bool(is_safe_kubectl_action(step.command))
        if step.action_type == ActionType.MANUAL:
            return True
        # Remaining action types are only tolerated behind an approval gate.
        return bool(step.requires_approval)

    return all(_acceptable(step) for step in steps)
|
||||
|
||||
|
||||
async def run_playbook_generation_governance_once(force: bool = False) -> PlaybookGovernanceReport:
    """Run one DRAFT/REVIEW governance pass.

    Promotion rules, applied only to generated candidates whose steps are safe:
    - DRAFT with ``ai_confidence >= 0.75`` is moved to REVIEW;
    - REVIEW with ``ai_confidence >= 0.9`` is moved to APPROVED.

    A playbook advances at most one stage per pass.

    Args:
        force: Run even when ``settings.ENABLE_PLAYBOOK_DRAFT_GOVERNANCE_JOB``
            is disabled.

    Returns:
        A report with the promotion/skip counters and one
        ``"<playbook_id>:<error>"`` entry per failed update. An empty report
        is returned when the job is disabled and ``force`` is False.
    """
    if not force and not settings.ENABLE_PLAYBOOK_DRAFT_GOVERNANCE_JOB:
        return PlaybookGovernanceReport()

    # Resolved at call time rather than at module import.
    from src.services.playbook_service import get_playbook_service

    review_threshold = 0.75
    approve_threshold = 0.9

    service = get_playbook_service()
    report = PlaybookGovernanceReport()

    # REVIEW is processed before DRAFT: otherwise a DRAFT promoted to REVIEW
    # would be listed again by the REVIEW query and approved in the same pass,
    # skipping the REVIEW stage entirely.
    for status in (PlaybookStatus.REVIEW, PlaybookStatus.DRAFT):
        # NOTE(review): only the first 200 playbooks per status are examined.
        playbooks, _total = await service.list_playbooks(status=status, limit=200, offset=0)
        for playbook in playbooks:
            if not _is_generated_candidate(playbook) or not _has_safe_steps(playbook):
                report.skipped_count += 1
                continue

            # A missing confidence is treated as 0.0 (never promoted) instead
            # of raising TypeError and being logged as an update failure.
            confidence = playbook.ai_confidence or 0.0
            try:
                if playbook.status == PlaybookStatus.DRAFT and confidence >= review_threshold:
                    await service.update_with_validation(
                        playbook.playbook_id,
                        {
                            "status": PlaybookStatus.REVIEW.value,
                            "notes": (playbook.notes or "") + "\n[Governance: DRAFT -> REVIEW]",
                        },
                    )
                    report.reviewed_count += 1
                    _record_governance("reviewed")
                elif playbook.status == PlaybookStatus.REVIEW and confidence >= approve_threshold:
                    await service.update_with_validation(
                        playbook.playbook_id,
                        {
                            "status": PlaybookStatus.APPROVED.value,
                            "approved_by": "playbook_generation_governance",
                            "notes": (playbook.notes or "") + "\n[Governance: REVIEW -> APPROVED]",
                        },
                    )
                    report.approved_count += 1
                    _record_governance("approved")
                else:
                    report.skipped_count += 1
            except Exception as exc:
                # One failing playbook must not abort the rest of the pass.
                report.errors.append(f"{playbook.playbook_id}:{exc}")
                logger.warning("playbook_governance_update_failed", playbook_id=playbook.playbook_id, error=str(exc))

    logger.info(
        "playbook_generation_governance_done",
        reviewed=report.reviewed_count,
        approved=report.approved_count,
        skipped=report.skipped_count,
        errors=len(report.errors),
    )
    return report
|
||||
|
||||
|
||||
async def run_playbook_generation_governance_loop() -> None:
    """Run governance passes forever, sleeping the configured interval between them.

    An exception raised by a pass is logged with its traceback and does not
    stop the loop. The interval is re-read from settings on every iteration.
    """
    while True:
        try:
            await run_playbook_generation_governance_once()
        except Exception:
            logger.exception("playbook_generation_governance_fatal")
        pause_seconds = settings.PLAYBOOK_DRAFT_GOVERNANCE_INTERVAL_SECONDS
        await asyncio.sleep(pause_seconds)
|
||||
|
||||
|
||||
def _record_governance(outcome: str) -> None:
    """Emit metrics for a governance promotion on a best-effort basis.

    Increments the outcome counter with ``source="governance"`` and, for the
    ``"reviewed"`` / ``"approved"`` outcomes, records the resulting lifecycle
    status. Metric failures never propagate to the governance pass; they are
    logged at debug level instead of being silently discarded.

    Args:
        outcome: Promotion outcome, e.g. ``"reviewed"`` or ``"approved"``.
    """
    try:
        from src.core.metrics import observe_playbook_status, record_playbook_generation

        record_playbook_generation(outcome=outcome, source="governance")
        promoted_status = {
            "reviewed": PlaybookStatus.REVIEW,
            "approved": PlaybookStatus.APPROVED,
        }.get(outcome)
        if promoted_status is not None:
            observe_playbook_status(status=promoted_status.value, source="governance")
    except Exception as exc:
        logger.debug("playbook_governance_metric_failed", outcome=outcome, error=str(exc))
|
||||
@@ -500,6 +500,17 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
|
||||
except Exception as e:
|
||||
logger.warning("evolver_loop_schedule_failed", error=str(e))
|
||||
|
||||
# ADR-104 T2: LLM Playbook DRAFT governance(每小時)
|
||||
try:
|
||||
from src.jobs.playbook_generation_governance_job import run_playbook_generation_governance_loop
|
||||
asyncio.create_task(run_playbook_generation_governance_loop())
|
||||
logger.info(
|
||||
"playbook_generation_governance_loop_scheduled",
|
||||
interval_sec=settings.PLAYBOOK_DRAFT_GOVERNANCE_INTERVAL_SECONDS,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning("playbook_generation_governance_loop_schedule_failed", error=str(e))
|
||||
|
||||
# ADR-083 Phase 3: 知識遺忘 Job(每日)— 30d 未引用 KB entry 標記 archived
|
||||
# 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 3 初始建立
|
||||
try:
|
||||
@@ -997,4 +1008,3 @@ if __name__ == "__main__":
|
||||
reload=settings.DEBUG,
|
||||
log_level=settings.LOG_LEVEL.lower(),
|
||||
)
|
||||
|
||||
|
||||
@@ -30,6 +30,7 @@ class PlaybookStatus(str, Enum):
|
||||
"""Playbook 狀態"""
|
||||
|
||||
DRAFT = "draft" # AI 萃取,待人工審核
|
||||
REVIEW = "review" # AI 生成且安全檢查通過,等待治理晉級
|
||||
APPROVED = "approved" # 人工核准,可用於推薦
|
||||
DEPRECATED = "deprecated" # 已棄用 (有更好方案)
|
||||
|
||||
@@ -38,6 +39,7 @@ class PlaybookSource(str, Enum):
|
||||
"""Playbook 來源"""
|
||||
|
||||
EXTRACTED = "extracted" # 從 Incident 自動萃取
|
||||
LLM_GENERATED = "llm_generated" # ADR-104: LLM 從成功案例生成
|
||||
MANUAL = "manual" # 人工建立
|
||||
YAML_RULE = "yaml_rule" # 從 alert_rules.yaml 匯入(2026-04-15 ogt)
|
||||
|
||||
|
||||
@@ -439,6 +439,7 @@ class LearningService:
|
||||
try:
|
||||
from src.repositories.incident_repository import get_incident_repository
|
||||
from src.services.playbook_service import get_playbook_service
|
||||
from src.core.config import settings
|
||||
|
||||
# 取得 Incident
|
||||
repo = get_incident_repository()
|
||||
@@ -451,6 +452,26 @@ class LearningService:
|
||||
if incident.status not in [IncidentStatus.RESOLVED, IncidentStatus.CLOSED]:
|
||||
return None
|
||||
|
||||
# ADR-104: 優先用本地 LLM 生成可治理 Playbook。失敗時服務內會降級成
|
||||
# deterministic fallback;feature flag 關閉才回到舊萃取路徑。
|
||||
if settings.ENABLE_LLM_PLAYBOOK_GENERATION:
|
||||
from src.services.playbook_generator import get_playbook_generator
|
||||
|
||||
generated = await get_playbook_generator().generate_from_incident(
|
||||
incident=incident,
|
||||
action=action,
|
||||
persist=True,
|
||||
)
|
||||
if generated.playbook:
|
||||
logger.info(
|
||||
"playbook_llm_generated",
|
||||
incident_id=incident_id,
|
||||
playbook_id=generated.playbook.playbook_id,
|
||||
outcome=generated.outcome,
|
||||
provider=generated.provider,
|
||||
)
|
||||
return generated.playbook.playbook_id
|
||||
|
||||
# 萃取 Playbook
|
||||
service = get_playbook_service()
|
||||
playbook = await service.extract_from_incident(
|
||||
|
||||
418
apps/api/src/services/playbook_generator.py
Normal file
418
apps/api/src/services/playbook_generator.py
Normal file
@@ -0,0 +1,418 @@
|
||||
"""
|
||||
LLM Playbook Generator - ADR-104 T1/T2/T6
|
||||
=========================================
|
||||
從成功修復案例生成可治理的 Playbook 草稿。
|
||||
|
||||
設計重點:
|
||||
- 只用 local provider 順序(Ollama 111 -> Ollama 188),避免新增雲端成本。
|
||||
- LLM 產出必須經 Pydantic + action_parser 安全收斂。
|
||||
- 不直接 APPROVED;先 DRAFT/REVIEW,再交治理 job 晉級。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from collections.abc import Awaitable, Callable
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
import structlog
|
||||
from pydantic import BaseModel, Field, field_validator
|
||||
|
||||
from src.models.incident import Incident, IncidentStatus
|
||||
from src.models.playbook import (
|
||||
ActionType,
|
||||
Playbook,
|
||||
PlaybookSource,
|
||||
PlaybookStatus,
|
||||
RepairStep,
|
||||
RiskLevel,
|
||||
SymptomPattern,
|
||||
)
|
||||
from src.services.action_parser import is_safe_kubectl_action
|
||||
from src.services.action_parser import kubectl_safety_reason
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
LLMCallable = Callable[[str, dict[str, Any]], Awaitable[tuple[str, str, bool]]]
|
||||
|
||||
|
||||
class GeneratedRepairStep(BaseModel):
    """LLM repair step contract."""

    # Raw action kind as emitted by the LLM; interpreted later during sanitizing.
    action_type: str = "manual"
    # Command text; an empty command is skipped during sanitizing.
    command: str = ""
    expected_result: str | None = None
    rollback_command: str | None = None
    # Upper-cased by the validator; unknown values collapse to "MEDIUM".
    risk_level: str = "MEDIUM"

    @field_validator("risk_level", mode="before")
    @classmethod
    def normalize_risk(cls, value: object) -> str:
        """Upper-case the raw risk value and fall back to MEDIUM when unknown."""
        candidate = str(value or "MEDIUM").upper()
        if candidate in {"LOW", "MEDIUM", "HIGH", "CRITICAL"}:
            return candidate
        return "MEDIUM"
|
||||
|
||||
|
||||
class GeneratedPlaybookPayload(BaseModel):
    """Strict JSON shape expected from the local LLM.

    List fields default to empty so the generator can tell "not provided by
    the LLM" apart from a real value and substitute data from the incident.
    """

    name: str = Field(min_length=1, max_length=256)
    description: str = Field(min_length=1, max_length=2000)
    alert_names: list[str] = Field(default_factory=list)
    affected_services: list[str] = Field(default_factory=list)
    # Defaults to empty (was ["P2"]): a non-empty default made the generator's
    # fallback to the incident's own severity unreachable, so every payload
    # that omitted the field was stamped P2 regardless of the incident.
    severity_range: list[str] = Field(default_factory=list)
    keywords: list[str] = Field(default_factory=list)
    repair_steps: list[GeneratedRepairStep] = Field(default_factory=list)
    estimated_duration_minutes: int = Field(default=5, ge=1, le=480)
    confidence: float = Field(default=0.5, ge=0.0, le=1.0)
    tags: list[str] = Field(default_factory=list)
    notes: str | None = None
|
||||
|
||||
|
||||
@dataclass
class PlaybookGenerationResult:
    """Generator result plus provenance for timeline/KM/metrics."""

    # The generated playbook, or None when nothing was produced.
    playbook: Playbook | None
    # Outcome label, e.g. "success", "fallback" or "skipped".
    outcome: str
    # Name of the provider that answered, or a placeholder such as "none".
    provider: str
    # Short machine-readable explanation; empty on success.
    reason: str = ""
|
||||
|
||||
|
||||
def _extract_json_object(text: str) -> dict[str, Any] | None:
|
||||
"""Parse a JSON object from an LLM response."""
|
||||
text = (text or "").strip()
|
||||
if not text:
|
||||
return None
|
||||
try:
|
||||
data = json.loads(text)
|
||||
return data if isinstance(data, dict) else None
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
|
||||
if fenced:
|
||||
try:
|
||||
data = json.loads(fenced.group(1))
|
||||
return data if isinstance(data, dict) else None
|
||||
except json.JSONDecodeError:
|
||||
return None
|
||||
|
||||
start = text.find("{")
|
||||
end = text.rfind("}")
|
||||
if start >= 0 and end > start:
|
||||
try:
|
||||
data = json.loads(text[start : end + 1])
|
||||
return data if isinstance(data, dict) else None
|
||||
except json.JSONDecodeError:
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
def _safe_risk(value: str) -> RiskLevel:
    """Convert a risk string to ``RiskLevel``, defaulting to MEDIUM.

    The value is upper-cased before lookup; anything that is not a valid
    ``RiskLevel`` value yields ``RiskLevel.MEDIUM``.
    """
    normalized = value.upper()
    try:
        level = RiskLevel(normalized)
    except ValueError:
        level = RiskLevel.MEDIUM
    return level
|
||||
|
||||
|
||||
def _manual_step(step_number: int, command: str, reason: str) -> RepairStep:
    """Wrap a command in a HIGH-risk manual step that requires approval.

    Args:
        step_number: Position of the step inside the playbook.
        command: The suggested command; only its first 240 stripped characters
            are embedded in the step text.
        reason: Why the command was downgraded; stored as ``expected_result``.
    """
    trimmed = command.strip()[:240]
    preview = trimmed if trimmed else "未提供命令"
    return RepairStep(
        step_number=step_number,
        action_type=ActionType.MANUAL,
        command=f"人工審核 LLM 建議: {preview}",
        expected_result=reason,
        requires_approval=True,
        risk_level=RiskLevel.HIGH,
    )
|
||||
|
||||
|
||||
class LLMPlaybookGenerator:
    """Generate Playbook drafts from resolved incidents using local AI.

    The generator asks a local LLM for a JSON playbook, validates and
    sanitizes the result, and falls back to a conservative playbook built
    from the executed action when the LLM output is unusable.
    """

    def __init__(
        self,
        playbook_service: Any | None = None,
        llm_callable: LLMCallable | None = None,
    ) -> None:
        """Store optional collaborators.

        Args:
            playbook_service: Persistence service; resolved lazily when None.
            llm_callable: Replacement for the AI executor, called as
                ``await llm_callable(prompt, context)`` and returning
                ``(raw_response, provider, success)``.
        """
        self._playbook_service = playbook_service
        self._llm_callable = llm_callable

    async def generate_from_incident(
        self,
        incident: Incident,
        action: str | None = None,
        persist: bool = True,
    ) -> PlaybookGenerationResult:
        """Generate and optionally persist a governed Playbook draft.

        Args:
            incident: Incident to learn from; must be RESOLVED or CLOSED with
                ``outcome.execution_success is True``.
            action: The repair action that was executed, if known.
            persist: When True, the playbook is stored through the playbook
                service.

        Returns:
            A result whose ``outcome`` is one of:
            - ``"skipped"``: the incident is not eligible;
            - ``"success"``: a playbook was built from the LLM payload;
            - ``"fallback"``: the LLM was unusable and a deterministic
              playbook was built instead;
            - ``"failed"``: the LLM was unusable and no fallback playbook
              could be built (``playbook`` is None).
            For the last two, ``reason`` is ``"llm_call_failed"`` or
            ``"llm_payload_invalid"``.
        """
        if incident.status not in (IncidentStatus.RESOLVED, IncidentStatus.CLOSED):
            return self._record(None, "skipped", "none", "incident_not_resolved")
        if not incident.outcome or incident.outcome.execution_success is not True:
            return self._record(None, "skipped", "none", "execution_not_successful")

        prompt = self._build_prompt(incident, action)
        context = {
            "incident_id": incident.incident_id,
            "intent_hint": "playbook_generation",
            "task_type": "force_local",
            "alert_type": self._first_alert_name(incident),
            "target_resource": ",".join(incident.affected_services or []),
        }

        raw, provider, success = await self._call_local_llm(prompt, context)
        payload = self._parse_payload(raw) if success else None
        if payload is None:
            # Distinguish "the LLM never answered" from "the answer was unusable".
            reason = "llm_payload_invalid" if success else "llm_call_failed"
            fallback = self._deterministic_playbook(incident, action)
            if fallback is None:
                # Nothing was produced, so this must not be counted as a fallback.
                return self._record(None, "failed", provider, reason)
            if persist:
                fallback = await self._service().create(fallback)
            return self._record(fallback, "fallback", provider, reason)

        playbook = self._build_playbook(incident, payload, provider)
        if not playbook.repair_steps:
            playbook.repair_steps = self._deterministic_steps(incident, action)
        if not playbook.repair_steps:
            # Neither the LLM nor the executed action produced a usable step.
            playbook.repair_steps = [
                _manual_step(1, action or "未提供修復動作", "LLM 未產生可執行安全步驟")
            ]
            playbook.status = PlaybookStatus.DRAFT

        if persist:
            playbook = await self._service().create(playbook)

        return self._record(playbook, "success", provider, "")

    async def _call_local_llm(
        self,
        prompt: str,
        context: dict[str, Any],
    ) -> tuple[str, str, bool]:
        """Call the local LLM and return ``(raw_response, provider, success)``.

        Uses the injected callable when present; otherwise the AI executor is
        invoked with the provider order ``ollama`` then ``ollama_188`` and
        ``require_local=True``. Executor errors are logged and reported as
        ``("", "local_ai_error", False)``.
        """
        if self._llm_callable is not None:
            return await self._llm_callable(prompt, context)

        try:
            from src.services.ai_router import get_ai_executor

            executor = get_ai_executor()
            result = await executor.execute(
                prompt=prompt,
                provider_order=["ollama", "ollama_188"],
                context=context,
                cache_ttl=86400,
                require_local=True,
            )
            return result.raw_response, result.provider, result.success
        except Exception as exc:
            logger.warning("playbook_generation_llm_failed", error=str(exc))
            return "", "local_ai_error", False

    def _parse_payload(self, raw: str) -> GeneratedPlaybookPayload | None:
        """Extract and validate the LLM payload; return None when unusable."""
        data = _extract_json_object(raw)
        if data is None:
            return None
        try:
            return GeneratedPlaybookPayload.model_validate(data)
        except Exception as exc:
            logger.warning("playbook_generation_payload_invalid", error=str(exc))
            return None

    def _build_playbook(
        self,
        incident: Incident,
        payload: GeneratedPlaybookPayload,
        provider: str,
    ) -> Playbook:
        """Turn a validated payload into a Playbook.

        The playbook starts in REVIEW when confidence is at least 0.75 and at
        least one sanitized step exists; otherwise it starts in DRAFT. Alert
        names, affected services and severity fall back to the incident's
        values when the payload leaves them empty.
        """
        steps = self._sanitize_steps(payload.repair_steps)
        confidence = payload.confidence
        status = PlaybookStatus.REVIEW if confidence >= 0.75 and steps else PlaybookStatus.DRAFT

        alert_names = payload.alert_names or [self._first_alert_name(incident)]
        affected = payload.affected_services or list(incident.affected_services or [])
        severity = payload.severity_range or ([incident.severity.value] if incident.severity else ["P2"])

        notes = payload.notes or ""
        provenance = f"Generated by {provider} from {incident.incident_id}"
        notes = f"{notes}\n{provenance}".strip()

        return Playbook(
            name=payload.name,
            description=payload.description,
            status=status,
            source=PlaybookSource.LLM_GENERATED,
            symptom_pattern=SymptomPattern(
                alert_names=[x for x in alert_names if x],
                affected_services=affected,
                severity_range=severity,
                keywords=payload.keywords[:10],
            ),
            repair_steps=steps,
            estimated_duration_minutes=payload.estimated_duration_minutes,
            source_incident_ids=[incident.incident_id],
            ai_confidence=confidence,
            trust_score=0.3,
            tags=[*payload.tags[:8], "llm_generated", provider],
            notes=notes,
        )

    def _sanitize_steps(self, steps: list[GeneratedRepairStep]) -> list[RepairStep]:
        """Convert at most eight LLM steps into vetted repair steps.

        - Empty commands are dropped.
        - kubectl steps are kept only when ``kubectl_safety_reason`` returns
          None; otherwise they become manual review steps carrying the reason.
        - ssh steps always require approval and are rated at least MEDIUM.
        - Everything else becomes a manual review step.
        """
        risk_order = list(RiskLevel)
        sanitized: list[RepairStep] = []
        for raw_step in steps[:8]:
            command = raw_step.command.strip()
            if not command:
                continue
            step_number = len(sanitized) + 1
            action_type = raw_step.action_type.strip().lower()
            if command.startswith("kubectl") or action_type == "kubectl":
                safety_reason = kubectl_safety_reason(command)
                if safety_reason is not None:
                    sanitized.append(_manual_step(step_number, command, safety_reason))
                    continue
                risk = _safe_risk(raw_step.risk_level)
                sanitized.append(
                    RepairStep(
                        step_number=step_number,
                        action_type=ActionType.KUBECTL,
                        command=command,
                        expected_result=raw_step.expected_result,
                        rollback_command=raw_step.rollback_command,
                        requires_approval=risk in (RiskLevel.HIGH, RiskLevel.CRITICAL),
                        risk_level=risk,
                    )
                )
                continue
            if action_type == "ssh_command" or command.startswith("ssh "):
                risk = _safe_risk(raw_step.risk_level)
                sanitized.append(
                    RepairStep(
                        step_number=step_number,
                        action_type=ActionType.SSH_COMMAND,
                        command=command,
                        expected_result=raw_step.expected_result,
                        rollback_command=raw_step.rollback_command,
                        requires_approval=True,
                        # Never rate an ssh step below MEDIUM; ordering follows
                        # the declaration order of RiskLevel.
                        risk_level=max(risk, RiskLevel.MEDIUM, key=risk_order.index),
                    )
                )
                continue
            sanitized.append(_manual_step(step_number, command, "non_kubectl_step_requires_review"))
        return sanitized

    def _deterministic_playbook(self, incident: Incident, action: str | None) -> Playbook | None:
        """Build a conservative DRAFT playbook from the executed action.

        Returns None when no step can be derived from ``action`` or from the
        incident's learning notes.
        """
        steps = self._deterministic_steps(incident, action)
        if not steps:
            return None
        alert_name = self._first_alert_name(incident) or "Unknown"
        return Playbook(
            name=f"{alert_name} - AI 生成 fallback Playbook",
            description="LLM 產出不可解析時,從成功執行動作建立的保守 Playbook 草稿",
            status=PlaybookStatus.DRAFT,
            source=PlaybookSource.LLM_GENERATED,
            symptom_pattern=SymptomPattern(
                alert_names=[alert_name] if alert_name else [],
                affected_services=list(incident.affected_services or []),
                severity_range=[incident.severity.value] if incident.severity else ["P2"],
            ),
            repair_steps=steps,
            source_incident_ids=[incident.incident_id],
            ai_confidence=0.45,
            tags=["llm_generated", "fallback"],
            notes=f"Generated deterministically after local LLM parse failure for {incident.incident_id}",
        )

    def _deterministic_steps(self, incident: Incident, action: str | None) -> list[RepairStep]:
        """Derive a single repair step from the action or the learning notes.

        Safe kubectl commands become automatic KUBECTL steps, ssh commands
        become approval-gated SSH steps, and anything else (including unsafe
        kubectl) becomes a manual review step. Returns an empty list when
        there is no text to work from.
        """
        command = (action or "").strip()
        if not command and incident.outcome and incident.outcome.learning_notes:
            command = incident.outcome.learning_notes.strip()
        if not command:
            return []
        if command.startswith("kubectl"):
            safety_reason = kubectl_safety_reason(command)
            if safety_reason is None:
                return [
                    RepairStep(
                        step_number=1,
                        action_type=ActionType.KUBECTL,
                        command=command,
                        requires_approval=False,
                        risk_level=RiskLevel.MEDIUM,
                    )
                ]
            return [_manual_step(1, command, safety_reason)]
        if command.startswith("ssh "):
            return [
                RepairStep(
                    step_number=1,
                    action_type=ActionType.SSH_COMMAND,
                    command=command,
                    requires_approval=True,
                    risk_level=RiskLevel.MEDIUM,
                )
            ]
        return [_manual_step(1, command, "unknown_action_type")]

    def _build_prompt(self, incident: Incident, action: str | None) -> str:
        """Render the LLM prompt: instructions plus a JSON incident summary.

        At most five signals are included. A missing severity, on the incident
        or on a signal, is serialized as ``null`` instead of raising, matching
        how the other methods of this class treat severity as optional.
        """
        signals = [
            {
                "alert_name": signal.alert_name,
                "severity": signal.severity.value if signal.severity else None,
                "labels": signal.labels,
                "annotations": signal.annotations,
            }
            for signal in incident.signals[:5]
        ]
        context = {
            "incident_id": incident.incident_id,
            "severity": incident.severity.value if incident.severity else None,
            "affected_services": incident.affected_services,
            "signals": signals,
            "hypothesis": incident.decision_chain.hypothesis if incident.decision_chain else "",
            "reasoning_steps": incident.decision_chain.reasoning_steps if incident.decision_chain else [],
            "successful_action": action or (incident.outcome.learning_notes if incident.outcome else ""),
            "effectiveness_score": incident.outcome.effectiveness_score if incident.outcome else None,
        }
        return (
            "你是 AWOOOI ADR-104 Playbook Generator,由 OpenClaw/Hermes/NemoTron/ElephantAlpha 的角色視角共同產出。"
            "請只輸出 JSON object,不要 markdown。任何破壞性命令必須改成 manual 步驟。\n"
            "JSON schema: {name, description, alert_names, affected_services, severity_range, keywords, "
            "repair_steps:[{action_type, command, expected_result, rollback_command, risk_level}], "
            "estimated_duration_minutes, confidence, tags, notes}.\n"
            f"Incident context:\n{json.dumps(context, ensure_ascii=False, default=str)}"
        )

    def _first_alert_name(self, incident: Incident) -> str:
        """Return the first signal's alert name, or "" when there are no signals."""
        return incident.signals[0].alert_name if incident.signals else ""

    def _service(self) -> Any:
        """Return the playbook service, resolving the global one on first use."""
        if self._playbook_service is None:
            from src.services.playbook_service import get_playbook_service

            self._playbook_service = get_playbook_service()
        return self._playbook_service

    def _record(
        self,
        playbook: Playbook | None,
        outcome: str,
        provider: str,
        reason: str,
    ) -> PlaybookGenerationResult:
        """Emit metrics (best effort) and wrap the outcome in a result object.

        Metric failures are logged at debug level and never affect the result.
        """
        try:
            from src.core.metrics import observe_playbook_status, record_playbook_generation

            source = provider or "none"
            record_playbook_generation(outcome=outcome, source=source)
            if playbook is not None:
                observe_playbook_status(status=playbook.status.value, source=source)
        except Exception as exc:
            logger.debug("playbook_generation_metric_failed", error=str(exc))
        return PlaybookGenerationResult(playbook=playbook, outcome=outcome, provider=provider, reason=reason)
|
||||
|
||||
|
||||
# Process-wide generator instance, created on first request.
_generator: LLMPlaybookGenerator | None = None


def get_playbook_generator() -> LLMPlaybookGenerator:
    """Return the global LLM Playbook generator, creating it on first use."""
    global _generator
    if _generator is not None:
        return _generator
    _generator = LLMPlaybookGenerator()
    return _generator
|
||||
155
apps/api/tests/test_playbook_generator.py
Normal file
155
apps/api/tests/test_playbook_generator.py
Normal file
@@ -0,0 +1,155 @@
|
||||
from src.jobs.playbook_generation_governance_job import run_playbook_generation_governance_once
|
||||
from src.models.incident import Incident, IncidentOutcome, IncidentStatus, Severity, Signal
|
||||
from src.models.playbook import ActionType, PlaybookStatus, RepairStep, RiskLevel
|
||||
from src.services.playbook_generator import LLMPlaybookGenerator
|
||||
from src.utils.timezone import now_taipei
|
||||
|
||||
|
||||
class InMemoryPlaybookService:
    """Dictionary-backed stand-in for the playbook service used by these tests."""

    def __init__(self):
        # playbook_id -> playbook, in insertion order
        self.items = {}

    async def create(self, playbook):
        """Store the playbook under its id and return it unchanged."""
        self.items[playbook.playbook_id] = playbook
        return playbook

    async def list_playbooks(self, status=None, tags=None, limit=20, offset=0):
        """Return ``(page, total)`` for playbooks matching ``status``.

        ``tags`` is accepted for signature compatibility and ignored.
        """
        matching = [pb for pb in self.items.values() if status is None or pb.status == status]
        page = matching[offset : offset + limit]
        return page, len(matching)

    async def update_with_validation(self, playbook_id, update_data):
        """Apply ``update_data`` attribute by attribute and return the playbook.

        A string ``status`` is converted to ``PlaybookStatus`` first.
        """
        target = self.items[playbook_id]
        for field_name, raw_value in update_data.items():
            if field_name == "status" and isinstance(raw_value, str):
                raw_value = PlaybookStatus(raw_value)
            setattr(target, field_name, raw_value)
        self.items[playbook_id] = target
        return target
|
||||
|
||||
|
||||
def make_resolved_incident(action: str = "kubectl rollout restart deployment/awoooi-api -n awoooi-prod") -> Incident:
    """Build a RESOLVED P2 incident whose outcome records a successful repair.

    ``action`` is stored as the outcome's learning notes.
    """
    alert = Signal(
        alert_name="ApiErrorRateHigh",
        severity=Severity.P2,
        source="alertmanager",
        fired_at=now_taipei(),
        labels={"namespace": "awoooi-prod", "deployment": "awoooi-api"},
        annotations={"summary": "API error rate high"},
    )
    successful_outcome = IncidentOutcome(
        proposal_executed=True,
        execution_success=True,
        effectiveness_score=5,
        learning_notes=action,
    )
    return Incident(
        incident_id="INC-20260430-PLAYBK",
        status=IncidentStatus.RESOLVED,
        severity=Severity.P2,
        signals=[alert],
        affected_services=["awoooi-api"],
        outcome=successful_outcome,
    )
|
||||
|
||||
|
||||
async def local_llm_ok(_prompt, _context):
    """Fake local LLM: a valid payload with one safe kubectl step, confidence 0.86."""
    raw_response = """
    {
      "name": "API error rate recovery",
      "description": "Restart the affected API deployment after error-rate alert confirmation.",
      "alert_names": ["ApiErrorRateHigh"],
      "affected_services": ["awoooi-api"],
      "severity_range": ["P2"],
      "keywords": ["error rate", "api"],
      "repair_steps": [
        {
          "action_type": "kubectl",
          "command": "kubectl rollout restart deployment/awoooi-api -n awoooi-prod",
          "expected_result": "new pods become ready",
          "risk_level": "MEDIUM"
        }
      ],
      "estimated_duration_minutes": 5,
      "confidence": 0.86,
      "tags": ["api", "rollout"]
    }
    """
    return raw_response, "ollama", True
|
||||
|
||||
|
||||
async def local_llm_unsafe(_prompt, _context):
    """Fake local LLM: a high-confidence payload whose only step deletes a namespace."""
    raw_response = """
    {
      "name": "Unsafe namespace cleanup",
      "description": "Bad suggestion should be gated.",
      "alert_names": ["ApiErrorRateHigh"],
      "affected_services": ["awoooi-api"],
      "repair_steps": [
        {
          "action_type": "kubectl",
          "command": "kubectl delete namespace awoooi-prod",
          "risk_level": "CRITICAL"
        }
      ],
      "confidence": 0.95,
      "tags": ["unsafe"]
    }
    """
    return raw_response, "ollama", True
|
||||
|
||||
|
||||
async def test_llm_playbook_generator_creates_review_playbook():
    """A valid, safe, high-confidence LLM payload yields a REVIEW playbook."""
    store = InMemoryPlaybookService()
    generator = LLMPlaybookGenerator(playbook_service=store, llm_callable=local_llm_ok)

    result = await generator.generate_from_incident(make_resolved_incident())

    assert result.outcome == "success"
    assert result.playbook is not None
    generated = result.playbook
    assert generated.status == PlaybookStatus.REVIEW
    assert generated.source.value == "llm_generated"
    first_step = generated.repair_steps[0]
    assert first_step.action_type == ActionType.KUBECTL
    assert first_step.command == "kubectl rollout restart deployment/awoooi-api -n awoooi-prod"
|
||||
|
||||
|
||||
async def test_llm_playbook_generator_downgrades_unsafe_kubectl_to_manual():
    """An unsafe kubectl suggestion is replaced by a manual, approval-gated step."""
    store = InMemoryPlaybookService()
    generator = LLMPlaybookGenerator(playbook_service=store, llm_callable=local_llm_unsafe)

    result = await generator.generate_from_incident(make_resolved_incident())

    assert result.playbook is not None
    downgraded = result.playbook.repair_steps[0]
    assert downgraded.action_type == ActionType.MANUAL
    assert downgraded.requires_approval is True
    assert downgraded.risk_level == RiskLevel.HIGH
    # The original command is kept inside the manual step text for the reviewer.
    assert "namespace" in downgraded.command
|
||||
|
||||
|
||||
async def test_playbook_generation_governance_promotes_review_to_approved(monkeypatch):
    """A safe REVIEW playbook with confidence >= 0.9 is approved by governance."""
    import src.jobs.playbook_generation_governance_job as job

    class FakeSettings:
        ENABLE_PLAYBOOK_DRAFT_GOVERNANCE_JOB = True

    store = InMemoryPlaybookService()
    generator = LLMPlaybookGenerator(playbook_service=store, llm_callable=local_llm_ok)
    result = await generator.generate_from_incident(make_resolved_incident())
    assert result.playbook is not None
    # Raise confidence above the approval threshold; the stored object is the same instance.
    result.playbook.ai_confidence = 0.93

    monkeypatch.setattr(job, "settings", FakeSettings())
    monkeypatch.setattr("src.services.playbook_service.get_playbook_service", lambda: store)

    report = await run_playbook_generation_governance_once()

    assert report.approved_count == 1
    stored = store.items[result.playbook.playbook_id]
    assert stored.status == PlaybookStatus.APPROVED
|
||||
@@ -6,6 +6,21 @@
|
||||
|
||||
---
|
||||
|
||||
## 2026-04-30 | ADR-104 LLM Playbook Generator 第一段落地
|
||||
|
||||
承接統帥 AI 自動化目標中「自動建立 Playbook」最低分缺口,先把成功修復後的 learn 階段從 deterministic extraction 擴成 local LLM Playbook generation。
|
||||
|
||||
### 完成
|
||||
- 新增 `LLMPlaybookGenerator`:成功修復且未命中既有 Playbook 時,用本地 provider 順序 `ollama -> ollama_188` 生成 Playbook JSON,不新增 Gemini/Claude 雲端成本 fallback。
|
||||
- 新增 `PlaybookStatus.REVIEW` 與 `PlaybookSource.LLM_GENERATED`,LLM 產物先進 DRAFT/REVIEW,不直接 APPROVED。
|
||||
- LLM 產出的 kubectl command 必須通過 `action_parser`;危險命令自動降級為 manual review step。
|
||||
- 新增 `playbook_generation_governance_job`:定期處理 DRAFT 黑洞,安全且高信心度的 LLM Playbook 可 DRAFT→REVIEW→APPROVED。
|
||||
- 補 `playbook_generation_total{outcome,source}` 與 `playbook_status_total{status,source}` emitter。
|
||||
|
||||
### 驗證
|
||||
- `python3 -m py_compile` 針對 generator / governance / model / config / metrics / learning / main 通過。
|
||||
- `pytest apps/api/tests/test_playbook_generator.py apps/api/tests/test_playbook_service.py apps/api/tests/test_learning_service.py apps/api/tests/test_action_parser_safety.py -q` → 56 passed, 2 skipped。
|
||||
|
||||
## 2026-04-30 | Telegram 告警收件人全面切到 SRE 戰情室
|
||||
|
||||
統帥指示所有發到 @tsenyangbot 個人通道的告警訊息,完整轉移到「AwoooI SRE戰情室」Telegram 群組,個人 DM 不再作為正式告警收件通道。
|
||||
|
||||
Reference in New Issue
Block a user