# Provenance: added to apps/api/src/services/decision_manager.py by commit
# 7eb49f9c ("feat(mcp-phase4c)", 2026-04-11). It completes MCP Phase 4:
#   4a  NemoClaw second opinion (confidence < 0.7)
#   4b  K8s state snapshot (k8s_state_after)
#   4c  AI-generated Playbook drafts (this function)
#
# Call site (DecisionManager, same commit): when no Playbook matches, the
# decision flow schedules this coroutine with
#     asyncio.create_task(_generate_playbook_draft_if_new(incident))
# and then continues with the synchronous expert-system analysis, so draft
# generation never blocks the main decision path.
# NOTE(review): that call site discards the Task object. The event loop keeps
# only weak references to tasks, so consider holding a reference until the
# task finishes (e.g. a module-level set plus add_done_callback(set.discard)).


def _build_playbook_draft_prompt(alertname: str, severity: str, services: str) -> str:
    """Return the LLM prompt that asks for a five-section Playbook draft."""
    # The prompt text is runtime data sent to the model; keep it verbatim.
    return (
        "你是資深 SRE,請為以下告警生成一份結構化 Playbook 草稿(繁體中文)。\n"
        f"告警名稱: {alertname}\n"
        f"嚴重度: {severity}\n"
        f"受影響服務: {services}\n\n"
        "請按以下格式輸出(不超過 300 字):\n"
        "## 症狀\n(描述此告警代表什麼)\n"
        "## 根因假設\n(最常見的 2-3 個原因)\n"
        "## 診斷步驟\n(kubectl 或 shell 指令)\n"
        "## 修復動作\n(具體修復指令,含 kubectl rollout restart 等)\n"
        "## 驗收條件\n(如何確認修復成功)"
    )


async def _generate_playbook_draft_if_new(incident: "Incident") -> None:
    """
    MCP Phase 4c: when no Playbook matched, generate an AI draft Playbook in KM.

    Steps:
    - Read ``alertname`` (and ``severity``) from the labels of the incident's
      first signal; do nothing when there is no alertname.
    - Skip when ``semantic_search(alertname, limit=1, threshold=0.92)`` returns
      anything, to avoid duplicate drafts for an alert that is already covered.
    - Ask ``qwen2.5:7b-instruct`` through the Ollama ``/api/generate`` endpoint
      for a structured draft (symptoms / root-cause hypotheses / diagnostic
      steps / remediation / acceptance criteria).
    - Store the draft as a ``KnowledgeEntry`` with type PLAYBOOK, status DRAFT
      and source AI_EXTRACTED. Per the original author's note, a human is
      expected to review it before it is promoted to APPROVED.
    - Append a ``PLAYBOOK_DRAFT_CREATED`` event to the alert operation log.

    Args:
        incident: The incident being decided on. The attributes read here are
            ``signals``, ``affected_services`` and ``incident_id``.

    Returns:
        None. This is a best-effort background job: every ``Exception`` is
        swallowed and only reported at debug level so it can never disturb the
        decision flow that scheduled it.

    2026-04-11 Claude Sonnet 4.6 Asia/Taipei
    """
    try:
        # Cheap guard first: without an alertname there is nothing to draft,
        # so bail out before paying for the lazy imports below.
        signals = incident.signals
        labels = (signals[0].labels or {}) if signals else {}
        alertname = labels.get("alertname", "")
        if not alertname:
            return

        # Imported lazily, as in the original patch, so they are only loaded
        # when a draft might actually be generated.
        import httpx as _httpx
        import structlog as _sl
        from src.core.config import settings
        from src.models.knowledge import (
            EntrySource,
            EntryStatus,
            EntryType,
            KnowledgeEntryCreate,
        )
        from src.repositories.alert_operation_log_repository import (
            get_alert_operation_log_repository,
        )
        from src.services.knowledge_service import get_knowledge_service

        # Skip when KM already holds a close match for this alertname.
        knowledge_svc = get_knowledge_service()
        existing = await knowledge_svc.semantic_search(alertname, limit=1, threshold=0.92)
        if existing:
            return

        severity = labels.get("severity", "warning")
        services = ", ".join(incident.affected_services or ["unknown"])
        prompt = _build_playbook_draft_prompt(alertname, severity, services)

        # Strip a trailing slash so a configured "http://host:11434/" does not
        # turn into "//api/generate"; fall back to the default when unset/empty.
        ollama_url = str(
            getattr(settings, "OLLAMA_URL", None) or "http://192.168.0.188:11434"
        ).rstrip("/")
        async with _httpx.AsyncClient(timeout=45.0) as client:
            resp = await client.post(
                f"{ollama_url}/api/generate",
                json={"model": "qwen2.5:7b-instruct", "prompt": prompt, "stream": False},
            )
            resp.raise_for_status()
            # "response" may be missing or null; treat both as empty output.
            content = (resp.json().get("response") or "").strip()

        # Discard empty or trivially short generations (fewer than 50 chars).
        if len(content) < 50:
            return

        # Persist to KM with status=DRAFT.
        entry = await knowledge_svc.create_entry(
            KnowledgeEntryCreate(
                title=f"[AI草稿] {alertname} Playbook",
                content=content,
                entry_type=EntryType.PLAYBOOK,
                category="auto_generated",
                tags=[alertname, severity, "ai_draft", "mcp_phase4c"],
                source=EntrySource.AI_EXTRACTED,
                status=EntryStatus.DRAFT,
                related_incident_id=incident.incident_id,
            )
        )

        # Record the creation in the operation log.
        op_repo = get_alert_operation_log_repository()
        await op_repo.append(
            event_type="PLAYBOOK_DRAFT_CREATED",
            incident_id=incident.incident_id,
            actor="mcp_phase4c",
            action_detail=f"AI 草稿 Playbook: {entry.entry_id}",
            success=True,
            context={"alertname": alertname, "km_entry_id": entry.entry_id},
        )

        _sl.get_logger(__name__).info(
            "playbook_draft_created",
            incident_id=incident.incident_id,
            alertname=alertname,
            entry_id=entry.entry_id,
        )

    except Exception as e:
        # Deliberately quiet (debug level). The exception type is logged as
        # well because str(e) can be empty, e.g. for timeout errors.
        import structlog as _sl

        _sl.get_logger(__name__).debug(
            "playbook_draft_failed",
            incident_id=getattr(incident, "incident_id", None),
            error_type=type(e).__name__,
            error=str(e),
        )