""" Runbook Generator - Phase 25 P1 Knowledge Auto-Harvesting ========================================================== 修復後自動生成 Runbook(成功)或 Anti-Pattern(失敗) 透過 Nemotron NIM 生成,結果沉澱至 KM 知識庫 設計原則: - 非阻塞:asyncio.create_task() 呼叫,絕不影響 AutoRepair 主流程 - 失敗靜默:生成失敗只記 log,不拋例外 - DRAFT/PUBLISHED:成功 → DRAFT(需人工審核),失敗 → PUBLISHED(直接封鎖) 版本: v1.1 建立: 2026-04-04 (台北時區) 建立者: ogt (首席架構師設計) + Claude Code (實作) 關聯設計: docs/superpowers/specs/2026-04-04-nemotron-active-defense-design.md 方向一 變更紀錄: | 版本 | 日期 | 執行者 | 變更內容 | |------|------|--------|----------| | v1.0 | 2026-04-04 | Claude Code | 初始佔位(使用 generate() 但介面不存在) | | v1.1 | 2026-04-04 | ogt (首席架構師) | 改用正確的 nvidia.chat() 介面;新增 Minimal fallback | """ from __future__ import annotations import asyncio import html import re import time from typing import TYPE_CHECKING import structlog from src.models.knowledge import ( EntrySource, EntryStatus, EntryType, KnowledgeEntryCreate, ) if TYPE_CHECKING: from src.models.incident import Incident from src.models.playbook import Playbook from src.services.auto_repair_service import AutoRepairResult logger = structlog.get_logger(__name__) _CARD_MAX_LEN = 3600 _SECTION_RE = re.compile(r"^#{1,6}\s+(?P.+?)\s*$") _BULLET_RE = re.compile(r"^\s*(?:[-*]|\d+[.)])\s*") def _html(text: object) -> str: return html.escape(str(text), quote=False) def _shorten(text: object, limit: int = 120) -> str: compact = " ".join(str(text or "").split()) if len(compact) <= limit: return compact return compact[: max(0, limit - 1)].rstrip() + "…" def _clean_preview_line(line: str) -> str: line = _SECTION_RE.sub("", line.strip()) line = _BULLET_RE.sub("", line).strip() line = line.replace("`", "") return " ".join(line.split()) def _section_preview(content: str, title_keyword: str, *, fallback: str) -> str: """從 Markdown 內容抽一行可讀摘要,避免把整段 Runbook 原文丟進 Telegram。""" lines = str(content or "").splitlines() in_section = False for raw_line in lines: line = raw_line.strip() if not line: continue heading = _SECTION_RE.match(line) if heading: in_section = title_keyword in heading.group("title") continue if not in_section: continue preview = _clean_preview_line(line) if preview: return _shorten(preview, 120) return fallback def _step_preview(content: str) -> str: preview = _section_preview(content, "執行", fallback="待審核 Runbook 執行步驟") if any(token in preview for token in ("{host}", "{target}", "Unsupported scheme", "Invalid component name")): return "含 placeholder 或不支援的執行步驟,需人工修正後才能發布" return _shorten(preview, 120) def format_runbook_review_card( incident: object, entry_id: str, content: str, ) -> str: """格式化 Telegram Runbook 審核卡片。 2026-05-07 Codex — 將純文字 Markdown preview 改成治理卡片,讓 SRE 能快速判斷知識狀態、受影響服務與審核重點。 """ incident_id = getattr(incident, "incident_id", "unknown") services = ", ".join(getattr(incident, "affected_services", None) or []) or "unknown" symptom = _section_preview(content, "症狀", fallback=f"Incident {incident_id} 的修復知識待審核") step = _step_preview(content) message = ( "📄 <b>RUNBOOK REVIEW|待審核</b>\n" "──────────────────────\n" f"📋 Incident:<code>{_html(incident_id)}</code>\n" f"🧩 受影響服務:<code>{_html(services)}</code>\n" "🧠 知識狀態:<b>DRAFT|需人工審核</b>\n" f"🗂️ Entry ID:<code>{_html(entry_id)}</code>\n\n" "🧾 <b>內容摘要</b>\n" f"├ 症狀:{_html(symptom)}\n" f"└ 執行:{_html(step)}\n\n" "✅ <b>審核重點</b>\n" "1. 確認步驟可重跑,且不含 placeholder / 不支援 scheme\n" "2. 補齊適用條件、rollback 與驗證方式\n\n" "🔎 AwoooP:知識庫 / Runbook Review" ) return message[:_CARD_MAX_LEN] class NemotronRunbookGenerator: """ Nemotron 驅動的 Runbook 自動生成器 職責: - 成功修復 → AUTO_RUNBOOK (DRAFT) + Telegram 審核 card - 失敗修復 → ANTI_PATTERN (PUBLISHED) + Telegram 通知 leWOOOgo 積木化: - 呼叫 KnowledgeService(不直接存 DB) - 呼叫 NvidiaProvider.chat()(非 AIRouter,Runbook 是知識副作用) """ _RUNBOOK_SYSTEM = ( "你是 AWOOOI 平台的 SRE Runbook 撰寫專家。" "根據提供的 Incident 與修復結果,用繁體中文生成完整結構化 Runbook。" ) _ANTI_PATTERN_SYSTEM = ( "你是 AWOOOI 平台的故障分析專家。" "根據失敗的修復嘗試,用繁體中文生成失敗案例記錄,幫助未來避免重蹈覆轍。" ) async def generate_runbook( self, incident: "Incident", playbook: "Playbook", result: "AutoRepairResult", symptoms_hash: str, ) -> None: """ 成功修復後生成 AUTO_RUNBOOK(fire-and-forget,呼叫方不等待) Args: incident: 觸發的 Incident playbook: 執行的 Playbook result: 執行結果(success=True) symptoms_hash: SymptomPattern.compute_hash() 的 hash """ try: content = await self._call_nemotron_for_runbook(incident, playbook, result) if not content: return from src.services.knowledge_service import get_knowledge_service ks = get_knowledge_service() entry_data = KnowledgeEntryCreate( title=f"[AUTO] {incident.incident_id} — {playbook.name}", content=content, entry_type=EntryType.AUTO_RUNBOOK, category="ai_system", tags=list(incident.affected_services or []) + ["auto_runbook", "nemotron"], source=EntrySource.AI_EXTRACTED, status=EntryStatus.DRAFT, related_incident_id=incident.incident_id, related_playbook_id=playbook.playbook_id, symptoms_hash=symptoms_hash, created_by="nemotron_runbook_generator", ) entry = await ks.create_entry(entry_data) logger.info( "auto_runbook_created", incident_id=incident.incident_id, entry_id=entry.id, playbook_id=playbook.playbook_id, ) await self._push_runbook_review_card(incident, entry.id, content) except Exception as e: logger.error( "runbook_generation_failed", incident_id=incident.incident_id, error=str(e), ) async def generate_anti_pattern( self, incident: "Incident", playbook: "Playbook", result: "AutoRepairResult", symptoms_hash: str, ) -> None: """ 失敗修復後生成 ANTI_PATTERN(fire-and-forget,直接 PUBLISHED) Args: incident: 觸發的 Incident playbook: 嘗試執行的 Playbook result: 執行結果(success=False) symptoms_hash: SymptomPattern.compute_hash() 的 hash """ try: content = await self._call_nemotron_for_anti_pattern(incident, playbook, result) if not content: return from src.services.knowledge_service import get_knowledge_service ks = get_knowledge_service() title = f"[FAIL] {incident.incident_id} — {playbook.name}" entry_data = KnowledgeEntryCreate( title=title, content=content, entry_type=EntryType.ANTI_PATTERN, category="failure_cases", tags=list(incident.affected_services or []) + ["anti_pattern", "failure"], source=EntrySource.AI_EXTRACTED, status=EntryStatus.PUBLISHED, # 直接發布,無需審核 related_incident_id=incident.incident_id, related_playbook_id=playbook.playbook_id, symptoms_hash=symptoms_hash, created_by="nemotron_runbook_generator", ) entry = await ks.create_entry(entry_data) logger.info( "anti_pattern_created", incident_id=incident.incident_id, entry_id=entry.id, symptoms_hash=symptoms_hash, ) await self._push_anti_pattern_notification(incident, title) except Exception as e: logger.error( "anti_pattern_generation_failed", incident_id=incident.incident_id, error=str(e), ) # ========================================================================= # Private # ========================================================================= async def _call_nemotron_for_runbook( self, incident: "Incident", playbook: "Playbook", result: "AutoRepairResult", ) -> str: """呼叫 Nemotron chat() 生成 9 段 Runbook,回傳 Markdown 字串""" from src.core.config import get_settings from src.services.nvidia_provider import get_nvidia_provider settings = get_settings() prompt = ( f"## Incident 資訊\n" f"- ID: {incident.incident_id}\n" f"- 受影響服務: {', '.join(incident.affected_services or [])}\n" f"- 嚴重度: {incident.severity.value if incident.severity else 'unknown'}\n\n" f"## 執行的 Playbook\n" f"- 名稱: {playbook.name}\n" f"- 執行步驟:\n" + "\n".join(f" {s}" for s in result.executed_steps[:5]) + f"\n\n## 執行結果\n- 狀態: 成功,耗時 {result.execution_time_ms}ms\n\n" "請生成包含以下 9 段的 Runbook(Markdown 格式):\n" "1. ## 症狀描述\n2. ## 根因分析\n3. ## 執行步驟\n" "4. ## 驗證步驟\n5. ## 注意事項\n6. ## 影響範圍\n" "7. ## 相關 Incident\n8. ## 下次預防建議\n9. ## 適用條件" ) try: nvidia = get_nvidia_provider() start = time.time() # chat() 回傳 (response_text, success, total_tokens, cost_usd) response_text, success, _tokens, _cost = await asyncio.wait_for( nvidia.chat(prompt=f"[SYSTEM]{self._RUNBOOK_SYSTEM}\n\n{prompt}"), timeout=settings.NEMOTRON_TIMEOUT_SECONDS, ) latency_ms = (time.time() - start) * 1000 logger.info("runbook_nemotron_call_ok", latency_ms=round(latency_ms, 1)) if success and response_text: return response_text except Exception as e: logger.warning("runbook_nemotron_call_failed", error=str(e)) # Fallback:組裝基本 Runbook return self._build_minimal_runbook(incident, playbook, result) async def _call_nemotron_for_anti_pattern( self, incident: "Incident", playbook: "Playbook", result: "AutoRepairResult", ) -> str: """呼叫 Nemotron chat() 生成失敗案例記錄,回傳 Markdown 字串""" from src.core.config import get_settings from src.services.nvidia_provider import get_nvidia_provider settings = get_settings() prompt = ( f"## Incident 資訊\n" f"- ID: {incident.incident_id}\n" f"- 受影響服務: {', '.join(incident.affected_services or [])}\n\n" f"## 嘗試的 Playbook\n- 名稱: {playbook.name}\n\n" f"## 失敗原因\n{result.error or '執行中發生未知異常'}\n\n" "請生成失敗案例文件(Markdown 格式),包含:\n" "## 症狀描述\n## 嘗試的修復方案\n## 失敗原因分析\n" "## 已知不適用條件\n## 替代方案建議" ) try: nvidia = get_nvidia_provider() response_text, success, _tokens, _cost = await asyncio.wait_for( nvidia.chat(prompt=f"[SYSTEM]{self._ANTI_PATTERN_SYSTEM}\n\n{prompt}"), timeout=settings.NEMOTRON_TIMEOUT_SECONDS, ) if success and response_text: return response_text except Exception as e: logger.warning("anti_pattern_nemotron_call_failed", error=str(e)) return self._build_minimal_anti_pattern(incident, playbook, result) def _build_minimal_runbook( self, incident: "Incident", playbook: "Playbook", result: "AutoRepairResult", ) -> str: """Nemotron 超時/失敗時的基本 Runbook fallback""" steps = "\n".join(f"- {s}" for s in result.executed_steps) return ( f"## 症狀描述\nIncident {incident.incident_id}," f"受影響服務:{', '.join(incident.affected_services or [])}\n\n" f"## 執行步驟\n{steps}\n\n" f"## 執行結果\n成功,耗時 {result.execution_time_ms}ms\n\n" "*本文件由系統自動生成(Nemotron fallback),建議人工補充完善。*" ) def _build_minimal_anti_pattern( self, incident: "Incident", playbook: "Playbook", result: "AutoRepairResult", ) -> str: """Nemotron 超時/失敗時的基本 Anti-Pattern fallback""" return ( f"## 症狀描述\nIncident {incident.incident_id}," f"受影響服務:{', '.join(incident.affected_services or [])}\n\n" f"## 失敗原因\n{result.error or '執行中發生異常'}\n\n" f"## 已知不適用條件\nPlaybook '{playbook.name}' 在此症狀下失敗,請勿自動重試。\n\n" "*本文件由系統自動生成(Nemotron fallback)。*" ) async def _push_runbook_review_card( self, incident: "Incident", entry_id: str, content_preview: str, ) -> None: """推送 Runbook 審核 card 到 Telegram""" try: from src.services.telegram_gateway import get_telegram_gateway tg = get_telegram_gateway() await tg.send_text(format_runbook_review_card(incident, entry_id, content_preview)) except Exception as e: logger.warning("runbook_review_card_failed", error=str(e)) async def _push_anti_pattern_notification( self, incident: "Incident", title: str, ) -> None: """推送 Anti-Pattern 已記錄通知到 Telegram""" try: from src.services.telegram_gateway import get_telegram_gateway tg = get_telegram_gateway() await tg.send_text( f"⚠️ <b>已記錄失敗案例</b>\n" f"Incident: <code>{incident.incident_id}</code>\n" f"標題: {title}\n\n" f"相同症狀的後續告警將阻斷自動修復,要求人工介入。" ) except Exception as e: logger.warning("anti_pattern_notification_failed", error=str(e)) # ============================================================================= # 單例管理 # ============================================================================= _generator: NemotronRunbookGenerator | None = None def get_runbook_generator() -> NemotronRunbookGenerator: global _generator if _generator is None: _generator = NemotronRunbookGenerator() return _generator