feat(mcp-phase4c): AI 動態規則生成 — 新 alertname 自動產 Playbook 草稿

_generate_playbook_draft_if_new():
- Playbook 無命中時非同步觸發（不阻塞決策主流程）
- 先用 semantic_search(threshold=0.92) 確認 KM 無同名 Playbook
- 呼叫 qwen2.5:7b-instruct (Ollama 188) 生成五段結構化草稿 (症狀/根因/診斷步驟/修復動作/驗收條件)
- 寫入 KnowledgeEntry(type=PLAYBOOK, status=DRAFT, source=AI_EXTRACTED)
- 寫入 AlertOperationLog PLAYBOOK_DRAFT_CREATED 事件
- 失敗靜默 debug log

完成 MCP Phase 4 全三項:
- 4a NemoClaw second opinion (信心 < 0.7)
- 4b K8s 狀態快照 k8s_state_after
- 4c AI 動態 Playbook 草稿生成

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-11 09:16:39 +08:00
parent 0fa3b35a1c
commit 7eb49f9c20
1 changed files with 101 additions and 0 deletions
--- a/apps/api/src/services/decision_manager.py
+++ b/apps/api/src/services/decision_manager.py
@@ -326,6 +326,104 @@ async def _nemoclaw_second_opinion(incident: "Incident", primary_result: dict) -
        return None


+async def _generate_playbook_draft_if_new(incident: "Incident") -> None:
+    """
+    MCP Phase 4c: when no Playbook matched, generate an AI draft Playbook and store it in the KM
+    =====================================================================
+    - Skips when semantic_search(alertname, threshold=0.92) already returns a KM entry (avoids duplicates)
+    - Generates a structured Playbook draft with qwen2.5:7b-instruct through the Ollama /api/generate endpoint
+    - Stores a KnowledgeEntry with status=DRAFT, meant to be promoted to APPROVED after human review
+    - Appends a PLAYBOOK_DRAFT_CREATED event to the alert operation log
+
+    2026-04-11 Claude Sonnet 4.6 Asia/Taipei
+    """
+    try:
+        import httpx as _httpx
+        from src.core.config import settings
+        from src.models.knowledge import (
+            EntrySource, EntryStatus, EntryType, KnowledgeEntryCreate,
+        )
+        from src.repositories.alert_operation_log_repository import get_alert_operation_log_repository
+        from src.services.knowledge_service import get_knowledge_service
+
+        alertname = ""
+        if incident.signals:
+            alertname = incident.signals[0].labels.get("alertname", "")
+        if not alertname:
+            return  # no alertname label on the first signal: nothing to key a draft on
+
+        # Skip if the KM already has an entry matching this alertname
+        knowledge_svc = get_knowledge_service()
+        existing = await knowledge_svc.semantic_search(alertname, limit=1, threshold=0.92)
+        if existing:
+            return
+
+        # Generate the Playbook draft with qwen2.5:7b-instruct
+        severity = incident.signals[0].labels.get("severity", "warning") if incident.signals else "warning"
+        services = ", ".join(incident.affected_services or ["unknown"])
+        prompt = (
+            f"你是資深 SRE，請為以下告警生成一份結構化 Playbook 草稿（繁體中文）。\n"
+            f"告警名稱: {alertname}\n"
+            f"嚴重度: {severity}\n"
+            f"受影響服務: {services}\n\n"
+            f"請按以下格式輸出（不超過 300 字）:\n"
+            f"## 症狀\n（描述此告警代表什麼）\n"
+            f"## 根因假設\n（最常見的 2-3 個原因）\n"
+            f"## 診斷步驟\n（kubectl 或 shell 指令）\n"
+            f"## 修復動作\n（具體修復指令，含 kubectl rollout restart 等）\n"
+            f"## 驗收條件\n（如何確認修復成功）"
+        )
+
+        ollama_url = getattr(settings, "OLLAMA_URL", "http://192.168.0.188:11434")  # hard-coded fallback host when settings has no OLLAMA_URL
+        async with _httpx.AsyncClient(timeout=45.0) as client:
+            resp = await client.post(
+                f"{ollama_url}/api/generate",
+                json={"model": "qwen2.5:7b-instruct", "prompt": prompt, "stream": False},
+            )
+            resp.raise_for_status()
+            content = resp.json().get("response", "").strip()
+
+        if not content or len(content) < 50:  # an empty or very short reply is not stored
+            return
+
+        # Store in the KM with status=DRAFT
+        entry = await knowledge_svc.create_entry(
+            KnowledgeEntryCreate(
+                title=f"[AI草稿] {alertname} Playbook",
+                content=content,
+                entry_type=EntryType.PLAYBOOK,
+                category="auto_generated",
+                tags=[alertname, severity, "ai_draft", "mcp_phase4c"],
+                source=EntrySource.AI_EXTRACTED,
+                status=EntryStatus.DRAFT,
+                related_incident_id=incident.incident_id,
+            )
+        )
+
+        # Append to the operation log
+        op_repo = get_alert_operation_log_repository()
+        await op_repo.append(
+            event_type="PLAYBOOK_DRAFT_CREATED",
+            incident_id=incident.incident_id,
+            actor="mcp_phase4c",
+            action_detail=f"AI 草稿 Playbook: {entry.entry_id}",
+            success=True,
+            context={"alertname": alertname, "km_entry_id": entry.entry_id},
+        )
+
+        import structlog as _sl
+        _sl.get_logger(__name__).info(
+            "playbook_draft_created",
+            incident_id=incident.incident_id,
+            alertname=alertname,
+            entry_id=entry.entry_id,
+        )
+
+    except Exception as e:  # best-effort: nothing propagates to the caller; failures are logged at debug level only
+        import structlog as _sl
+        _sl.get_logger(__name__).debug("playbook_draft_failed", error=str(e))
+
+
 async def _fetch_metrics_snapshot(incident: Incident) -> dict:
    """
    ADR-071-I: 從 Prometheus 抓取與此 incident 相關的指標快照
@@ -1052,6 +1150,9 @@ class DecisionManager:
        if playbook_result:
            return playbook_result

+        # MCP Phase 4c: Playbook 無命中 → 非同步產生 AI 草稿 Playbook (2026-04-11 Claude Sonnet 4.6)
+        asyncio.create_task(_generate_playbook_draft_if_new(incident))
+
        # Expert System 同步執行 (立即可用)
        expert_result = expert_analyze(incident)