"""
Runbook Generator - Phase 25 P1 Knowledge Auto-Harvesting
==========================================================
After a repair attempt, automatically generate a Runbook (on success) or an
Anti-Pattern (on failure). Content is generated through Nemotron NIM and the
result is stored in the KM knowledge base.

Design principles:
- Non-blocking: invoked via asyncio.create_task(); must never affect the
  AutoRepair main flow
- Fail silently: a generation failure is only logged, never raised
- DRAFT/PUBLISHED: success -> DRAFT (needs human review),
  failure -> PUBLISHED (blocks directly)

Version: v1.1
Created: 2026-04-04 (Taipei timezone)
Author: ogt (chief architect, design) + Claude Code (implementation)
Related design: docs/superpowers/specs/2026-04-04-nemotron-active-defense-design.md (direction 1)

Changelog:
| Version | Date | Author | Change |
|---------|------|--------|--------|
| v1.0 | 2026-04-04 | Claude Code | Initial placeholder (used generate(), but that interface did not exist) |
| v1.1 | 2026-04-04 | ogt (chief architect) | Switched to the correct nvidia.chat() interface; added minimal fallback |
"""

from __future__ import annotations

import asyncio
import time
from typing import TYPE_CHECKING

import structlog

from src.models.knowledge import EntrySource, EntryStatus, EntryType, KnowledgeEntryCreate

if TYPE_CHECKING:
    from src.models.incident import Incident
    from src.models.playbook import Playbook
    from src.services.auto_repair_service import AutoRepairResult

logger = structlog.get_logger(__name__)


class NemotronRunbookGenerator:
    """
    Nemotron-driven automatic Runbook generator.

    Responsibilities:
    - Successful repair -> AUTO_RUNBOOK (DRAFT) + Telegram review card
    - Failed repair     -> ANTI_PATTERN (PUBLISHED) + Telegram notification

    leWOOOgo modularization:
    - Goes through KnowledgeService (never writes to the DB directly)
    - Calls NvidiaProvider.chat() (not AIRouter; a Runbook is a knowledge
      side effect)
    """

    # System prompts are runtime text sent to the model; keep them verbatim.
    _RUNBOOK_SYSTEM = (
        "你是 AWOOOI 平台的 SRE Runbook 撰寫專家。"
        "根據提供的 Incident 與修復結果,用繁體中文生成完整結構化 Runbook。"
    )
    _ANTI_PATTERN_SYSTEM = (
        "你是 AWOOOI 平台的故障分析專家。"
        "根據失敗的修復嘗試,用繁體中文生成失敗案例記錄,幫助未來避免重蹈覆轍。"
    )

    async def generate_runbook(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
        symptoms_hash: str,
    ) -> None:
        """
        Generate an AUTO_RUNBOOK after a successful repair.

        Intended to be fire-and-forget: every exception is caught and logged
        here, so nothing propagates to the caller.

        Args:
            incident: The incident that triggered the repair.
            playbook: The playbook that was executed.
            result: The execution result (success=True).
            symptoms_hash: Hash produced by SymptomPattern.compute_hash().
        """
        try:
            content = await self._call_nemotron_for_runbook(incident, playbook, result)
            if not content:
                return

            from src.services.knowledge_service import get_knowledge_service

            ks = get_knowledge_service()
            entry_data = KnowledgeEntryCreate(
                title=f"[AUTO] {incident.incident_id} — {playbook.name}",
                content=content,
                entry_type=EntryType.AUTO_RUNBOOK,
                category="auto_generated",
                tags=list(incident.affected_services or []) + ["auto_runbook", "nemotron"],
                source=EntrySource.AI_EXTRACTED,
                status=EntryStatus.DRAFT,  # stays a draft until a human reviews it
                related_incident_id=incident.incident_id,
                related_playbook_id=playbook.playbook_id,
                symptoms_hash=symptoms_hash,
                created_by="nemotron_runbook_generator",
            )
            entry = await ks.create_entry(entry_data)
            logger.info(
                "auto_runbook_created",
                incident_id=incident.incident_id,
                entry_id=entry.id,
                playbook_id=playbook.playbook_id,
            )
            # Only the first 200 characters are sent as the preview.
            await self._push_runbook_review_card(incident, entry.id, content[:200])
        except Exception as e:
            # Top-level boundary of a background task: log and swallow.
            # error_type is logged because str() of a timeout error is empty.
            logger.error(
                "runbook_generation_failed",
                incident_id=incident.incident_id,
                error=str(e),
                error_type=type(e).__name__,
            )

    async def generate_anti_pattern(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
        symptoms_hash: str,
    ) -> None:
        """
        Generate an ANTI_PATTERN after a failed repair.

        Intended to be fire-and-forget: every exception is caught and logged
        here. The entry is created directly as PUBLISHED (no review step).

        Args:
            incident: The incident that triggered the repair.
            playbook: The playbook that was attempted.
            result: The execution result (success=False).
            symptoms_hash: Hash produced by SymptomPattern.compute_hash().
        """
        try:
            content = await self._call_nemotron_for_anti_pattern(incident, playbook, result)
            if not content:
                return

            from src.services.knowledge_service import get_knowledge_service

            ks = get_knowledge_service()
            title = f"[FAIL] {incident.incident_id} — {playbook.name}"
            entry_data = KnowledgeEntryCreate(
                title=title,
                content=content,
                entry_type=EntryType.ANTI_PATTERN,
                category="failure_cases",
                tags=list(incident.affected_services or []) + ["anti_pattern", "failure"],
                source=EntrySource.AI_EXTRACTED,
                status=EntryStatus.PUBLISHED,  # published immediately, no review needed
                related_incident_id=incident.incident_id,
                related_playbook_id=playbook.playbook_id,
                symptoms_hash=symptoms_hash,
                created_by="nemotron_runbook_generator",
            )
            entry = await ks.create_entry(entry_data)
            logger.info(
                "anti_pattern_created",
                incident_id=incident.incident_id,
                entry_id=entry.id,
                symptoms_hash=symptoms_hash,
            )
            await self._push_anti_pattern_notification(incident, title)
        except Exception as e:
            # Top-level boundary of a background task: log and swallow.
            logger.error(
                "anti_pattern_generation_failed",
                incident_id=incident.incident_id,
                error=str(e),
                error_type=type(e).__name__,
            )

    # =========================================================================
    # Private
    # =========================================================================

    async def _chat_with_timeout(
        self,
        system_prompt: str,
        prompt: str,
        event_prefix: str,
    ) -> str | None:
        """
        Call NvidiaProvider.chat() under a timeout and return the usable text.

        Shared by the runbook and anti-pattern paths so both get identical
        timeout handling and logging.

        Args:
            system_prompt: System instruction, prepended as "[SYSTEM]...".
            prompt: The user prompt body.
            event_prefix: Prefix for log event names, e.g. "runbook" yields
                "runbook_nemotron_call_ok" / "runbook_nemotron_call_failed".

        Returns:
            The response text when the call succeeded and returned non-empty
            text; otherwise None (the caller falls back to a minimal document).
        """
        from src.core.config import get_settings
        from src.services.nvidia_provider import get_nvidia_provider

        # Kept outside the try on purpose: a broken configuration propagates
        # to the public entry point's handler, exactly as before.
        settings = get_settings()

        try:
            nvidia = get_nvidia_provider()
            # monotonic() is immune to wall-clock adjustments, unlike time().
            started = time.monotonic()
            # chat() returns (response_text, success, total_tokens, cost_usd)
            response_text, success, _tokens, _cost = await asyncio.wait_for(
                nvidia.chat(prompt=f"[SYSTEM]{system_prompt}\n\n{prompt}"),
                timeout=settings.NEMOTRON_TIMEOUT_SECONDS,
            )
            latency_ms = round((time.monotonic() - started) * 1000, 1)
        except Exception as e:
            # str(TimeoutError()) is "", so the exception type is logged too;
            # otherwise a timeout would produce an empty error field.
            logger.warning(
                f"{event_prefix}_nemotron_call_failed",
                error=str(e),
                error_type=type(e).__name__,
            )
            return None

        if success and response_text:
            logger.info(f"{event_prefix}_nemotron_call_ok", latency_ms=latency_ms)
            return response_text

        # The call returned, but the provider flagged failure or sent no text.
        logger.warning(
            f"{event_prefix}_nemotron_call_unusable",
            success=bool(success),
            latency_ms=latency_ms,
        )
        return None

    async def _call_nemotron_for_runbook(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
    ) -> str:
        """
        Ask Nemotron for a 9-section Runbook and return it as Markdown.

        Falls back to _build_minimal_runbook() when the call fails, times out,
        or yields no usable text.
        """
        services = ", ".join(incident.affected_services or [])
        severity = incident.severity.value if incident.severity else "unknown"
        # Only the first five executed steps go into the prompt; a missing
        # step list is treated as empty instead of raising.
        step_lines = "\n".join(f" {s}" for s in (result.executed_steps or [])[:5])

        prompt = (
            f"## Incident 資訊\n"
            f"- ID: {incident.incident_id}\n"
            f"- 受影響服務: {services}\n"
            f"- 嚴重度: {severity}\n\n"
            f"## 執行的 Playbook\n"
            f"- 名稱: {playbook.name}\n"
            f"- 執行步驟:\n"
            + step_lines
            + f"\n\n## 執行結果\n- 狀態: 成功,耗時 {result.execution_time_ms}ms\n\n"
            "請生成包含以下 9 段的 Runbook(Markdown 格式):\n"
            "1. ## 症狀描述\n2. ## 根因分析\n3. ## 執行步驟\n"
            "4. ## 驗證步驟\n5. ## 注意事項\n6. ## 影響範圍\n"
            "7. ## 相關 Incident\n8. ## 下次預防建議\n9. ## 適用條件"
        )

        generated = await self._chat_with_timeout(self._RUNBOOK_SYSTEM, prompt, "runbook")
        if generated:
            return generated

        # Fallback: assemble a basic Runbook locally.
        return self._build_minimal_runbook(incident, playbook, result)

    async def _call_nemotron_for_anti_pattern(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
    ) -> str:
        """
        Ask Nemotron for a failure-case record and return it as Markdown.

        Falls back to _build_minimal_anti_pattern() when the call fails, times
        out, or yields no usable text.
        """
        services = ", ".join(incident.affected_services or [])
        failure_reason = result.error or "執行中發生未知異常"

        prompt = (
            f"## Incident 資訊\n"
            f"- ID: {incident.incident_id}\n"
            f"- 受影響服務: {services}\n\n"
            f"## 嘗試的 Playbook\n- 名稱: {playbook.name}\n\n"
            f"## 失敗原因\n{failure_reason}\n\n"
            "請生成失敗案例文件(Markdown 格式),包含:\n"
            "## 症狀描述\n## 嘗試的修復方案\n## 失敗原因分析\n"
            "## 已知不適用條件\n## 替代方案建議"
        )

        generated = await self._chat_with_timeout(
            self._ANTI_PATTERN_SYSTEM, prompt, "anti_pattern"
        )
        if generated:
            return generated

        return self._build_minimal_anti_pattern(incident, playbook, result)

    def _build_minimal_runbook(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
    ) -> str:
        """
        Build the basic fallback Runbook used when Nemotron is unavailable.

        Lists every executed step (not just the first five); a missing step
        list is rendered as an empty section. `playbook` is accepted for
        signature parity with the anti-pattern builder but is not used.
        """
        steps = "\n".join(f"- {s}" for s in (result.executed_steps or []))
        services = ", ".join(incident.affected_services or [])
        return (
            f"## 症狀描述\nIncident {incident.incident_id},"
            f"受影響服務:{services}\n\n"
            f"## 執行步驟\n{steps}\n\n"
            f"## 執行結果\n成功,耗時 {result.execution_time_ms}ms\n\n"
            "*本文件由系統自動生成(Nemotron fallback),建議人工補充完善。*"
        )

    def _build_minimal_anti_pattern(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
    ) -> str:
        """Build the basic fallback Anti-Pattern used when Nemotron is unavailable."""
        services = ", ".join(incident.affected_services or [])
        failure_reason = result.error or "執行中發生異常"
        return (
            f"## 症狀描述\nIncident {incident.incident_id},"
            f"受影響服務:{services}\n\n"
            f"## 失敗原因\n{failure_reason}\n\n"
            f"## 已知不適用條件\nPlaybook '{playbook.name}' 在此症狀下失敗,請勿自動重試。\n\n"
            "*本文件由系統自動生成(Nemotron fallback)。*"
        )

    async def _push_runbook_review_card(
        self,
        incident: "Incident",
        entry_id: str,
        content_preview: str,
    ) -> None:
        """
        Send the Runbook review message to Telegram (best effort).

        A delivery failure is logged as a warning and never raised. Note that
        "..." is always appended after the preview, whether or not the caller
        actually truncated the content.
        """
        try:
            from src.services.telegram_gateway import get_telegram_gateway

            tg = get_telegram_gateway()
            await tg.send_text(
                f"📄 Auto Runbook 待審核\n"
                f"Incident: {incident.incident_id}\n"
                f"Entry ID: {entry_id}\n\n"
                f"{content_preview}...\n\n"
                f"請至知識庫審核並發布。"
            )
        except Exception as e:
            logger.warning(
                "runbook_review_card_failed",
                error=str(e),
                error_type=type(e).__name__,
            )

    async def _push_anti_pattern_notification(
        self,
        incident: "Incident",
        title: str,
    ) -> None:
        """
        Send the "anti-pattern recorded" notification to Telegram (best effort).

        A delivery failure is logged as a warning and never raised.
        """
        try:
            from src.services.telegram_gateway import get_telegram_gateway

            tg = get_telegram_gateway()
            await tg.send_text(
                f"⚠️ 已記錄失敗案例\n"
                f"Incident: {incident.incident_id}\n"
                f"標題: {title}\n\n"
                f"相同症狀的後續告警將阻斷自動修復,要求人工介入。"
            )
        except Exception as e:
            logger.warning(
                "anti_pattern_notification_failed",
                error=str(e),
                error_type=type(e).__name__,
            )


# =============================================================================
# Singleton management
# =============================================================================

_generator: NemotronRunbookGenerator | None = None


def get_runbook_generator() -> NemotronRunbookGenerator:
    """Return the module-wide NemotronRunbookGenerator, creating it on first use."""
    global _generator
    if _generator is None:
        _generator = NemotronRunbookGenerator()
    return _generator