diff --git a/apps/api/src/core/config.py b/apps/api/src/core/config.py index d207b8ac..f9137a4a 100644 --- a/apps/api/src/core/config.py +++ b/apps/api/src/core/config.py @@ -132,6 +132,26 @@ class Settings(BaseSettings): description="W2 PR-L1: 同 symptom_pattern_hash 累積幾條 KM 後觸發 Playbook review_required 標記(預設 N=5)", ) + # ========================================================================== + # ADR-104: LLM Playbook Generator + # 成功修復且未命中既有 Playbook 時,用本地 LLM 生成 DRAFT/REVIEW Playbook。 + # 成本護欄:實作層只走 local provider(Ollama 111 → Ollama 188),不新增雲端 fallback。 + # 回滾指令: kubectl set env deployment/awoooi-api ENABLE_LLM_PLAYBOOK_GENERATION=false + # ========================================================================== + ENABLE_LLM_PLAYBOOK_GENERATION: bool = Field( + default=True, + description="ADR-104 T1: True=成功修復無 matched_playbook_id 時啟動本地 LLM 生成 Playbook, False=只用 deterministic extraction", + ) + ENABLE_PLAYBOOK_DRAFT_GOVERNANCE_JOB: bool = Field( + default=True, + description="ADR-104 T2: True=定期治理 LLM Playbook DRAFT/REVIEW 晉級, False=停用", + ) + PLAYBOOK_DRAFT_GOVERNANCE_INTERVAL_SECONDS: int = Field( + default=3600, + ge=60, + description="ADR-104 T2: Playbook DRAFT governance job interval seconds", + ) + # ========================================================================== # aider-watch v2 integration (2026-04-20 ADR-091) # 整合 Mac aider CLI 監控進 awoooi 飛輪(events → incident → ai_router feedback) diff --git a/apps/api/src/core/metrics.py b/apps/api/src/core/metrics.py index f4af2b96..260adc67 100644 --- a/apps/api/src/core/metrics.py +++ b/apps/api/src/core/metrics.py @@ -253,6 +253,32 @@ RESOURCE_RESOLVE_TOTAL = Counter( ["result"], # hit / miss / suggestion / error ) +# ============================================================================= +# ADR-100 / ADR-104 Flywheel Emitter Metrics +# ============================================================================= + +PLAYBOOK_GENERATION_TOTAL = Counter( + "playbook_generation_total", + "LLM 
@dataclass
class PlaybookGovernanceReport:
    """Counters and error messages collected during one governance pass."""

    # Playbooks promoted DRAFT -> REVIEW in this pass.
    reviewed_count: int = 0
    # Playbooks promoted REVIEW -> APPROVED in this pass.
    approved_count: int = 0
    # Playbooks inspected but left unchanged.
    skipped_count: int = 0
    # "<playbook_id>:<error>" strings; None is replaced by a fresh list below.
    errors: list[str] | None = None

    def __post_init__(self) -> None:
        # Give every report its own list so instances never share error state.
        self.errors = [] if self.errors is None else self.errors
def _has_safe_steps(playbook: Playbook) -> bool:
    """Return True when the playbook has steps and every one passes the gate.

    A KUBECTL step passes when ``is_safe_kubectl_action`` accepts its command.
    Any other step passes when it is MANUAL or has ``requires_approval`` set.
    A playbook without repair steps never passes.
    """
    steps = playbook.repair_steps
    if not steps:
        return False

    def _step_passes(step) -> bool:
        kind = step.action_type
        if kind == ActionType.KUBECTL:
            return bool(is_safe_kubectl_action(step.command))
        # Non-kubectl automation is tolerated only behind an approval gate.
        return kind == ActionType.MANUAL or bool(step.requires_approval)

    return all(_step_passes(step) for step in steps)
def _record_governance(outcome: str) -> None:
    """Emit metrics for one governance promotion on a best-effort basis.

    Args:
        outcome: ``"reviewed"`` (DRAFT -> REVIEW) or ``"approved"``
            (REVIEW -> APPROVED). Any other value only increments the
            outcome counter and sets no status gauge.

    Metric failures never propagate, so a broken emitter cannot abort a
    promotion that has already been persisted.
    """
    try:
        from src.core.metrics import observe_playbook_status, record_playbook_generation

        record_playbook_generation(outcome=outcome, source="governance")
        if outcome == "reviewed":
            observe_playbook_status(status=PlaybookStatus.REVIEW.value, source="governance")
        elif outcome == "approved":
            observe_playbook_status(status=PlaybookStatus.APPROVED.value, source="governance")
    except Exception as exc:
        # Still best-effort, but leave a trace instead of swallowing silently,
        # so a broken metrics import or label mismatch can be diagnosed.
        logger.debug("playbook_governance_metric_failed", error=str(exc))
4.6(亞太): Phase 3 初始建立 try: @@ -997,4 +1008,3 @@ if __name__ == "__main__": reload=settings.DEBUG, log_level=settings.LOG_LEVEL.lower(), ) - diff --git a/apps/api/src/models/playbook.py b/apps/api/src/models/playbook.py index 34b02186..ddf9eb49 100644 --- a/apps/api/src/models/playbook.py +++ b/apps/api/src/models/playbook.py @@ -30,6 +30,7 @@ class PlaybookStatus(str, Enum): """Playbook 狀態""" DRAFT = "draft" # AI 萃取,待人工審核 + REVIEW = "review" # AI 生成且安全檢查通過,等待治理晉級 APPROVED = "approved" # 人工核准,可用於推薦 DEPRECATED = "deprecated" # 已棄用 (有更好方案) @@ -38,6 +39,7 @@ class PlaybookSource(str, Enum): """Playbook 來源""" EXTRACTED = "extracted" # 從 Incident 自動萃取 + LLM_GENERATED = "llm_generated" # ADR-104: LLM 從成功案例生成 MANUAL = "manual" # 人工建立 YAML_RULE = "yaml_rule" # 從 alert_rules.yaml 匯入(2026-04-15 ogt) diff --git a/apps/api/src/services/learning_service.py b/apps/api/src/services/learning_service.py index fd337fda..2defdc1d 100644 --- a/apps/api/src/services/learning_service.py +++ b/apps/api/src/services/learning_service.py @@ -439,6 +439,7 @@ class LearningService: try: from src.repositories.incident_repository import get_incident_repository from src.services.playbook_service import get_playbook_service + from src.core.config import settings # 取得 Incident repo = get_incident_repository() @@ -451,6 +452,26 @@ class LearningService: if incident.status not in [IncidentStatus.RESOLVED, IncidentStatus.CLOSED]: return None + # ADR-104: 優先用本地 LLM 生成可治理 Playbook。失敗時服務內會降級成 + # deterministic fallback;feature flag 關閉才回到舊萃取路徑。 + if settings.ENABLE_LLM_PLAYBOOK_GENERATION: + from src.services.playbook_generator import get_playbook_generator + + generated = await get_playbook_generator().generate_from_incident( + incident=incident, + action=action, + persist=True, + ) + if generated.playbook: + logger.info( + "playbook_llm_generated", + incident_id=incident_id, + playbook_id=generated.playbook.playbook_id, + outcome=generated.outcome, + provider=generated.provider, + ) + return 
class GeneratedRepairStep(BaseModel):
    """Shape of a single repair step as returned by the LLM.

    Every field has a default, so a step with missing keys still validates.
    ``risk_level`` is normalised to one of LOW / MEDIUM / HIGH / CRITICAL.
    """

    action_type: str = Field(default="manual")
    command: str = Field(default="")
    expected_result: str | None = None
    rollback_command: str | None = None
    risk_level: str = Field(default="MEDIUM")

    @field_validator("risk_level", mode="before")
    @classmethod
    def normalize_risk(cls, value: object) -> str:
        """Upper-case the raw value; falsy or unknown values become MEDIUM."""
        candidate = str(value if value else "MEDIUM").upper()
        if candidate in {"LOW", "MEDIUM", "HIGH", "CRITICAL"}:
            return candidate
        return "MEDIUM"
max_length=256) + description: str = Field(min_length=1, max_length=2000) + alert_names: list[str] = Field(default_factory=list) + affected_services: list[str] = Field(default_factory=list) + severity_range: list[str] = Field(default_factory=lambda: ["P2"]) + keywords: list[str] = Field(default_factory=list) + repair_steps: list[GeneratedRepairStep] = Field(default_factory=list) + estimated_duration_minutes: int = Field(default=5, ge=1, le=480) + confidence: float = Field(default=0.5, ge=0.0, le=1.0) + tags: list[str] = Field(default_factory=list) + notes: str | None = None + + +@dataclass +class PlaybookGenerationResult: + """Generator result plus provenance for timeline/KM/metrics.""" + + playbook: Playbook | None + outcome: str + provider: str + reason: str = "" + + +def _extract_json_object(text: str) -> dict[str, Any] | None: + """Parse a JSON object from an LLM response.""" + text = (text or "").strip() + if not text: + return None + try: + data = json.loads(text) + return data if isinstance(data, dict) else None + except json.JSONDecodeError: + pass + + fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL) + if fenced: + try: + data = json.loads(fenced.group(1)) + return data if isinstance(data, dict) else None + except json.JSONDecodeError: + return None + + start = text.find("{") + end = text.rfind("}") + if start >= 0 and end > start: + try: + data = json.loads(text[start : end + 1]) + return data if isinstance(data, dict) else None + except json.JSONDecodeError: + return None + return None + + +def _safe_risk(value: str) -> RiskLevel: + try: + return RiskLevel(value.upper()) + except ValueError: + return RiskLevel.MEDIUM + + +def _manual_step(step_number: int, command: str, reason: str) -> RepairStep: + command_preview = command.strip()[:240] or "未提供命令" + return RepairStep( + step_number=step_number, + action_type=ActionType.MANUAL, + command=f"人工審核 LLM 建議: {command_preview}", + expected_result=reason, + requires_approval=True, + 
    async def generate_from_incident(
        self,
        incident: Incident,
        action: str | None = None,
        persist: bool = True,
    ) -> PlaybookGenerationResult:
        """Generate and optionally persist a governed Playbook draft.

        Args:
            incident: The incident to learn from. Only RESOLVED or CLOSED
                incidents whose outcome has ``execution_success is True``
                are processed; anything else is recorded as "skipped".
            action: The repair action that was executed, if known. It is fed
                to the prompt and to the deterministic fallback.
            persist: When True the resulting playbook is stored through the
                playbook service's ``create``.

        Returns:
            A ``PlaybookGenerationResult`` whose ``outcome`` is "skipped",
            "fallback" or "success". ``playbook`` is None for skipped
            incidents and for a fallback that produced no steps.
        """
        # Guard clauses: learn only from incidents that were actually fixed.
        if incident.status not in (IncidentStatus.RESOLVED, IncidentStatus.CLOSED):
            return self._record(None, "skipped", "none", "incident_not_resolved")
        if not incident.outcome or incident.outcome.execution_success is not True:
            return self._record(None, "skipped", "none", "execution_not_successful")

        prompt = self._build_prompt(incident, action)
        # Routing hints handed to the LLM call alongside the prompt.
        context = {
            "incident_id": incident.incident_id,
            "intent_hint": "playbook_generation",
            "task_type": "force_local",
            "alert_type": self._first_alert_name(incident),
            "target_resource": ",".join(incident.affected_services or []),
        }

        raw, provider, success = await self._call_local_llm(prompt, context)
        payload = self._parse_payload(raw) if success else None
        if payload is None:
            # LLM call failed or its output did not validate: fall back to a
            # conservative playbook built from the executed action.
            # NOTE(review): the reason is "llm_payload_invalid" even when the
            # call itself failed (success is False) - confirm this is intended.
            fallback = self._deterministic_playbook(incident, action)
            if fallback and persist:
                fallback = await self._service().create(fallback)
            return self._record(fallback, "fallback", provider, "llm_payload_invalid")

        playbook = self._build_playbook(incident, payload, provider)
        if not playbook.repair_steps:
            # No usable step survived sanitising: reuse the deterministic
            # steps, or as a last resort a single manual-review step.
            playbook.repair_steps = self._deterministic_steps(incident, action)
            if not playbook.repair_steps:
                playbook.repair_steps = [
                    _manual_step(1, action or "未提供修復動作", "LLM 未產生可執行安全步驟")
                ]
            # Steps that did not come from the LLM payload keep the playbook in DRAFT.
            playbook.status = PlaybookStatus.DRAFT

        if persist:
            playbook = await self._service().create(playbook)

        return self._record(playbook, "success", provider, "")
_call_local_llm( + self, + prompt: str, + context: dict[str, Any], + ) -> tuple[str, str, bool]: + if self._llm_callable is not None: + return await self._llm_callable(prompt, context) + + try: + from src.services.ai_router import get_ai_executor + + executor = get_ai_executor() + result = await executor.execute( + prompt=prompt, + provider_order=["ollama", "ollama_188"], + context=context, + cache_ttl=86400, + require_local=True, + ) + return result.raw_response, result.provider, result.success + except Exception as exc: + logger.warning("playbook_generation_llm_failed", error=str(exc)) + return "", "local_ai_error", False + + def _parse_payload(self, raw: str) -> GeneratedPlaybookPayload | None: + data = _extract_json_object(raw) + if data is None: + return None + try: + return GeneratedPlaybookPayload.model_validate(data) + except Exception as exc: + logger.warning("playbook_generation_payload_invalid", error=str(exc)) + return None + + def _build_playbook( + self, + incident: Incident, + payload: GeneratedPlaybookPayload, + provider: str, + ) -> Playbook: + steps = self._sanitize_steps(payload.repair_steps) + confidence = payload.confidence + status = PlaybookStatus.REVIEW if confidence >= 0.75 and steps else PlaybookStatus.DRAFT + + alert_names = payload.alert_names or [self._first_alert_name(incident)] + affected = payload.affected_services or list(incident.affected_services or []) + severity = payload.severity_range or ([incident.severity.value] if incident.severity else ["P2"]) + + notes = payload.notes or "" + provenance = f"Generated by {provider} from {incident.incident_id}" + notes = f"{notes}\n{provenance}".strip() + + return Playbook( + name=payload.name, + description=payload.description, + status=status, + source=PlaybookSource.LLM_GENERATED, + symptom_pattern=SymptomPattern( + alert_names=[x for x in alert_names if x], + affected_services=affected, + severity_range=severity, + keywords=payload.keywords[:10], + ), + repair_steps=steps, + 
estimated_duration_minutes=payload.estimated_duration_minutes, + source_incident_ids=[incident.incident_id], + ai_confidence=confidence, + trust_score=0.3, + tags=[*payload.tags[:8], "llm_generated", provider], + notes=notes, + ) + + def _sanitize_steps(self, steps: list[GeneratedRepairStep]) -> list[RepairStep]: + sanitized: list[RepairStep] = [] + for raw_step in steps[:8]: + command = raw_step.command.strip() + if not command: + continue + step_number = len(sanitized) + 1 + action_type = raw_step.action_type.strip().lower() + if command.startswith("kubectl") or action_type == "kubectl": + safety_reason = kubectl_safety_reason(command) + if safety_reason is not None: + sanitized.append(_manual_step(step_number, command, safety_reason)) + continue + sanitized.append( + RepairStep( + step_number=step_number, + action_type=ActionType.KUBECTL, + command=command, + expected_result=raw_step.expected_result, + rollback_command=raw_step.rollback_command, + requires_approval=_safe_risk(raw_step.risk_level) in (RiskLevel.HIGH, RiskLevel.CRITICAL), + risk_level=_safe_risk(raw_step.risk_level), + ) + ) + continue + if action_type == "ssh_command" or command.startswith("ssh "): + sanitized.append( + RepairStep( + step_number=step_number, + action_type=ActionType.SSH_COMMAND, + command=command, + expected_result=raw_step.expected_result, + rollback_command=raw_step.rollback_command, + requires_approval=True, + risk_level=max(_safe_risk(raw_step.risk_level), RiskLevel.MEDIUM, key=lambda r: list(RiskLevel).index(r)), + ) + ) + continue + sanitized.append(_manual_step(step_number, command, "non_kubectl_step_requires_review")) + return sanitized + + def _deterministic_playbook(self, incident: Incident, action: str | None) -> Playbook | None: + steps = self._deterministic_steps(incident, action) + if not steps: + return None + alert_name = self._first_alert_name(incident) or "Unknown" + return Playbook( + name=f"{alert_name} - AI 生成 fallback Playbook", + description="LLM 
產出不可解析時,從成功執行動作建立的保守 Playbook 草稿", + status=PlaybookStatus.DRAFT, + source=PlaybookSource.LLM_GENERATED, + symptom_pattern=SymptomPattern( + alert_names=[alert_name] if alert_name else [], + affected_services=list(incident.affected_services or []), + severity_range=[incident.severity.value] if incident.severity else ["P2"], + ), + repair_steps=steps, + source_incident_ids=[incident.incident_id], + ai_confidence=0.45, + tags=["llm_generated", "fallback"], + notes=f"Generated deterministically after local LLM parse failure for {incident.incident_id}", + ) + + def _deterministic_steps(self, incident: Incident, action: str | None) -> list[RepairStep]: + command = (action or "").strip() + if not command and incident.outcome and incident.outcome.learning_notes: + command = incident.outcome.learning_notes.strip() + if not command: + return [] + if command.startswith("kubectl"): + safety_reason = kubectl_safety_reason(command) + if safety_reason is None: + return [ + RepairStep( + step_number=1, + action_type=ActionType.KUBECTL, + command=command, + requires_approval=False, + risk_level=RiskLevel.MEDIUM, + ) + ] + return [_manual_step(1, command, safety_reason)] + if command.startswith("ssh "): + return [ + RepairStep( + step_number=1, + action_type=ActionType.SSH_COMMAND, + command=command, + requires_approval=True, + risk_level=RiskLevel.MEDIUM, + ) + ] + return [_manual_step(1, command, "unknown_action_type")] + + def _build_prompt(self, incident: Incident, action: str | None) -> str: + signals = [ + { + "alert_name": signal.alert_name, + "severity": signal.severity.value, + "labels": signal.labels, + "annotations": signal.annotations, + } + for signal in incident.signals[:5] + ] + context = { + "incident_id": incident.incident_id, + "severity": incident.severity.value, + "affected_services": incident.affected_services, + "signals": signals, + "hypothesis": incident.decision_chain.hypothesis if incident.decision_chain else "", + "reasoning_steps": 
incident.decision_chain.reasoning_steps if incident.decision_chain else [], + "successful_action": action or (incident.outcome.learning_notes if incident.outcome else ""), + "effectiveness_score": incident.outcome.effectiveness_score if incident.outcome else None, + } + return ( + "你是 AWOOOI ADR-104 Playbook Generator,由 OpenClaw/Hermes/NemoTron/ElephantAlpha 的角色視角共同產出。" + "請只輸出 JSON object,不要 markdown。任何破壞性命令必須改成 manual 步驟。\n" + "JSON schema: {name, description, alert_names, affected_services, severity_range, keywords, " + "repair_steps:[{action_type, command, expected_result, rollback_command, risk_level}], " + "estimated_duration_minutes, confidence, tags, notes}.\n" + f"Incident context:\n{json.dumps(context, ensure_ascii=False, default=str)}" + ) + + def _first_alert_name(self, incident: Incident) -> str: + return incident.signals[0].alert_name if incident.signals else "" + + def _service(self) -> Any: + if self._playbook_service is None: + from src.services.playbook_service import get_playbook_service + + self._playbook_service = get_playbook_service() + return self._playbook_service + + def _record( + self, + playbook: Playbook | None, + outcome: str, + provider: str, + reason: str, + ) -> PlaybookGenerationResult: + try: + from src.core.metrics import observe_playbook_status, record_playbook_generation + + source = provider or "none" + record_playbook_generation(outcome=outcome, source=source) + if playbook is not None: + observe_playbook_status(status=playbook.status.value, source=source) + except Exception as exc: + logger.debug("playbook_generation_metric_failed", error=str(exc)) + return PlaybookGenerationResult(playbook=playbook, outcome=outcome, provider=provider, reason=reason) + + +_generator: LLMPlaybookGenerator | None = None + + +def get_playbook_generator() -> LLMPlaybookGenerator: + """Return global LLM Playbook generator.""" + global _generator + if _generator is None: + _generator = LLMPlaybookGenerator() + return _generator diff --git 
class InMemoryPlaybookService:
    """Dict-backed stand-in for the playbook service used by these tests."""

    def __init__(self):
        # playbook_id -> playbook object
        self.items = {}

    async def create(self, playbook):
        """Store the playbook under its id and hand it back."""
        self.items[playbook.playbook_id] = playbook
        return playbook

    async def list_playbooks(self, status=None, tags=None, limit=20, offset=0):
        """Return (page, total), optionally filtered by status; tags are ignored."""
        matching = [
            pb for pb in self.items.values() if status is None or pb.status == status
        ]
        page = matching[offset : offset + limit]
        return page, len(matching)

    async def update_with_validation(self, playbook_id, update_data):
        """Apply attribute updates in place; a string status becomes a PlaybookStatus."""
        target = self.items[playbook_id]
        for field_name, new_value in update_data.items():
            if field_name == "status" and isinstance(new_value, str):
                new_value = PlaybookStatus(new_value)
            setattr(target, field_name, new_value)
        self.items[playbook_id] = target
        return target
async def test_llm_playbook_generator_creates_review_playbook():
    """A safe kubectl step at confidence 0.86 yields a REVIEW playbook."""
    store = InMemoryPlaybookService()
    generator = LLMPlaybookGenerator(playbook_service=store, llm_callable=local_llm_ok)

    result = await generator.generate_from_incident(make_resolved_incident())

    assert result.outcome == "success"
    assert result.playbook is not None
    generated = result.playbook
    assert generated.status == PlaybookStatus.REVIEW
    assert generated.source.value == "llm_generated"
    first_step = generated.repair_steps[0]
    assert first_step.action_type == ActionType.KUBECTL
    assert first_step.command == "kubectl rollout restart deployment/awoooi-api -n awoooi-prod"
async def test_playbook_generation_governance_promotes_review_to_approved(monkeypatch):
    """A REVIEW playbook raised to confidence 0.93 is promoted to APPROVED."""
    import src.jobs.playbook_generation_governance_job as job

    class FakeSettings:
        ENABLE_PLAYBOOK_DRAFT_GOVERNANCE_JOB = True

    store = InMemoryPlaybookService()
    generator = LLMPlaybookGenerator(playbook_service=store, llm_callable=local_llm_ok)
    result = await generator.generate_from_incident(make_resolved_incident())
    assert result.playbook is not None
    # The in-memory store holds this same object, so the bump is visible to the job.
    result.playbook.ai_confidence = 0.93

    monkeypatch.setattr(job, "settings", FakeSettings())
    monkeypatch.setattr("src.services.playbook_service.get_playbook_service", lambda: store)

    report = await run_playbook_generation_governance_once()

    assert report.approved_count == 1
    stored = store.items[result.playbook.playbook_id]
    assert stored.status == PlaybookStatus.APPROVED
`playbook_status_total{status,source}` emitter。 + +### 驗證 +- `python3 -m py_compile` 針對 generator / governance / model / config / metrics / learning / main 通過。 +- `pytest apps/api/tests/test_playbook_generator.py apps/api/tests/test_playbook_service.py apps/api/tests/test_learning_service.py apps/api/tests/test_action_parser_safety.py -q` → 56 passed, 2 skipped。 + ## 2026-04-30 | Telegram 告警收件人全面切到 SRE 戰情室 統帥指示所有發到 @tsenyangbot 個人通道的告警訊息,完整轉移到「AwoooI SRE戰情室」Telegram 群組,個人 DM 不再作為正式告警收件通道。