"""
Runbook Generator - Phase 25 P1 Knowledge Auto-Harvesting
==========================================================
修復後自動生成 Runbook(成功)或 Anti-Pattern(失敗)
透過 Nemotron NIM 生成,結果沉澱至 KM 知識庫
設計原則:
- 非阻塞:asyncio.create_task() 呼叫,絕不影響 AutoRepair 主流程
- 失敗靜默:生成失敗只記 log,不拋例外
- DRAFT/PUBLISHED:成功 → DRAFT(需人工審核),失敗 → PUBLISHED(直接封鎖)
版本: v1.1
建立: 2026-04-04 (台北時區)
建立者: ogt (首席架構師設計) + Claude Code (實作)
關聯設計: docs/superpowers/specs/2026-04-04-nemotron-active-defense-design.md 方向一
變更紀錄:
| 版本 | 日期 | 執行者 | 變更內容 |
|------|------|--------|----------|
| v1.0 | 2026-04-04 | Claude Code | 初始佔位(使用 generate() 但介面不存在) |
| v1.1 | 2026-04-04 | ogt (首席架構師) | 改用正確的 nvidia.chat() 介面;新增 Minimal fallback |
"""
from __future__ import annotations
import asyncio
import time
from typing import TYPE_CHECKING
import structlog
from src.models.knowledge import EntrySource, EntryStatus, EntryType, KnowledgeEntryCreate
if TYPE_CHECKING:
from src.models.incident import Incident
from src.models.playbook import Playbook
from src.services.auto_repair_service import AutoRepairResult
logger = structlog.get_logger(__name__)
class NemotronRunbookGenerator:
    """Generate knowledge-base entries from auto-repair outcomes via Nemotron.

    Responsibilities:
    - Successful repair -> AUTO_RUNBOOK entry (DRAFT) plus a Telegram review card.
    - Failed repair -> ANTI_PATTERN entry (PUBLISHED) plus a Telegram notification.

    Collaborators (imported lazily inside the methods that use them):
    - KnowledgeService persists the entries; this class does not write to the
      database directly.
    - NvidiaProvider.chat() produces the text (not AIRouter: runbook generation
      is a knowledge side effect of a repair).

    Both public coroutines catch and log every ``Exception`` instead of raising,
    so a generation failure never propagates to the caller.
    """

    _RUNBOOK_SYSTEM = (
        "你是 AWOOOI 平台的 SRE Runbook 撰寫專家。"
        "根據提供的 Incident 與修復結果,用繁體中文生成完整結構化 Runbook。"
    )
    _ANTI_PATTERN_SYSTEM = (
        "你是 AWOOOI 平台的故障分析專家。"
        "根據失敗的修復嘗試,用繁體中文生成失敗案例記錄,幫助未來避免重蹈覆轍。"
    )
    # Number of executed steps included in the runbook prompt.
    _PROMPT_STEP_LIMIT = 5
    # Number of content characters shown in the Telegram review card.
    _PREVIEW_CHARS = 200

    async def generate_runbook(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
        symptoms_hash: str,
    ) -> None:
        """Create a DRAFT AUTO_RUNBOOK entry after a successful repair.

        Intended to be scheduled fire-and-forget; every ``Exception`` is logged
        as ``runbook_generation_failed`` and swallowed.

        Args:
            incident: Incident that triggered the repair.
            playbook: Playbook that was executed.
            result: Execution result (expected to describe a success).
            symptoms_hash: Symptom-pattern hash stored on the entry (per the
                original note, the value of SymptomPattern.compute_hash()).
        """
        try:
            content = await self._call_nemotron_for_runbook(incident, playbook, result)
            if not content:
                return
            # Lazy import, as in the rest of this class.
            from src.services.knowledge_service import get_knowledge_service

            ks = get_knowledge_service()
            entry_data = KnowledgeEntryCreate(
                title=f"[AUTO] {incident.incident_id} — {playbook.name}",
                content=content,
                entry_type=EntryType.AUTO_RUNBOOK,
                category="auto_generated",
                tags=list(incident.affected_services or []) + ["auto_runbook", "nemotron"],
                source=EntrySource.AI_EXTRACTED,
                status=EntryStatus.DRAFT,
                related_incident_id=incident.incident_id,
                related_playbook_id=playbook.playbook_id,
                symptoms_hash=symptoms_hash,
                created_by="nemotron_runbook_generator",
            )
            entry = await ks.create_entry(entry_data)
            logger.info(
                "auto_runbook_created",
                incident_id=incident.incident_id,
                entry_id=entry.id,
                playbook_id=playbook.playbook_id,
            )
            await self._push_runbook_review_card(
                incident, entry.id, content[: self._PREVIEW_CHARS]
            )
        except Exception as e:
            # error_type is logged because str(e) is empty for some exceptions.
            logger.error(
                "runbook_generation_failed",
                incident_id=incident.incident_id,
                error=str(e),
                error_type=type(e).__name__,
            )

    async def generate_anti_pattern(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
        symptoms_hash: str,
    ) -> None:
        """Create a PUBLISHED ANTI_PATTERN entry after a failed repair.

        Intended to be scheduled fire-and-forget; every ``Exception`` is logged
        as ``anti_pattern_generation_failed`` and swallowed.

        Args:
            incident: Incident that triggered the repair.
            playbook: Playbook that was attempted.
            result: Execution result (expected to describe a failure).
            symptoms_hash: Symptom-pattern hash stored on the entry (per the
                original note, the value of SymptomPattern.compute_hash()).
        """
        try:
            content = await self._call_nemotron_for_anti_pattern(incident, playbook, result)
            if not content:
                return
            from src.services.knowledge_service import get_knowledge_service

            ks = get_knowledge_service()
            title = f"[FAIL] {incident.incident_id} — {playbook.name}"
            entry_data = KnowledgeEntryCreate(
                title=title,
                content=content,
                entry_type=EntryType.ANTI_PATTERN,
                category="failure_cases",
                tags=list(incident.affected_services or []) + ["anti_pattern", "failure"],
                source=EntrySource.AI_EXTRACTED,
                status=EntryStatus.PUBLISHED,  # published directly, no review step
                related_incident_id=incident.incident_id,
                related_playbook_id=playbook.playbook_id,
                symptoms_hash=symptoms_hash,
                created_by="nemotron_runbook_generator",
            )
            entry = await ks.create_entry(entry_data)
            logger.info(
                "anti_pattern_created",
                incident_id=incident.incident_id,
                entry_id=entry.id,
                symptoms_hash=symptoms_hash,
            )
            await self._push_anti_pattern_notification(incident, title)
        except Exception as e:
            logger.error(
                "anti_pattern_generation_failed",
                incident_id=incident.incident_id,
                error=str(e),
                error_type=type(e).__name__,
            )

    # =========================================================================
    # Private
    # =========================================================================

    async def _ask_nemotron(self, system_prompt: str, prompt: str, label: str) -> str:
        """Send one prompt to Nemotron and return its text, or "" on failure.

        Args:
            system_prompt: System instruction, prefixed to the prompt as
                ``[SYSTEM]...``.
            prompt: User prompt body.
            label: Prefix for the log event names, which become
                ``<label>_nemotron_call_ok``, ``<label>_nemotron_call_unsuccessful``
                and ``<label>_nemotron_call_failed``.

        Returns:
            The response text when the call succeeded with non-empty text;
            otherwise an empty string so the caller can fall back.
        """
        from src.core.config import get_settings
        from src.services.nvidia_provider import get_nvidia_provider

        settings = get_settings()
        try:
            nvidia = get_nvidia_provider()
            # Monotonic clock: unaffected by wall-clock adjustments.
            started = time.perf_counter()
            # chat() returns (response_text, success, total_tokens, cost_usd)
            response_text, success, _tokens, _cost = await asyncio.wait_for(
                nvidia.chat(prompt=f"[SYSTEM]{system_prompt}\n\n{prompt}"),
                timeout=settings.NEMOTRON_TIMEOUT_SECONDS,
            )
            latency_ms = round((time.perf_counter() - started) * 1000, 1)
        except Exception as e:
            # str(e) is "" for a wait_for timeout, so the type name is logged too.
            logger.warning(
                f"{label}_nemotron_call_failed",
                error=str(e),
                error_type=type(e).__name__,
            )
            return ""

        if success and response_text:
            logger.info(f"{label}_nemotron_call_ok", latency_ms=latency_ms)
            return response_text

        # The call returned but reported failure or empty text.
        logger.warning(
            f"{label}_nemotron_call_unsuccessful",
            latency_ms=latency_ms,
            success=bool(success),
        )
        return ""

    def _build_runbook_prompt(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
    ) -> str:
        """Build the prompt asking Nemotron for a nine-section runbook."""
        # Tolerate a missing step list, like affected_services below.
        steps = "\n".join(
            f" {s}" for s in (result.executed_steps or [])[: self._PROMPT_STEP_LIMIT]
        )
        return (
            f"## Incident 資訊\n"
            f"- ID: {incident.incident_id}\n"
            f"- 受影響服務: {', '.join(incident.affected_services or [])}\n"
            f"- 嚴重度: {incident.severity.value if incident.severity else 'unknown'}\n\n"
            f"## 執行的 Playbook\n"
            f"- 名稱: {playbook.name}\n"
            f"- 執行步驟:\n"
            + steps
            + f"\n\n## 執行結果\n- 狀態: 成功,耗時 {result.execution_time_ms}ms\n\n"
            "請生成包含以下 9 段的 Runbook(Markdown 格式):\n"
            "1. ## 症狀描述\n2. ## 根因分析\n3. ## 執行步驟\n"
            "4. ## 驗證步驟\n5. ## 注意事項\n6. ## 影響範圍\n"
            "7. ## 相關 Incident\n8. ## 下次預防建議\n9. ## 適用條件"
        )

    def _build_anti_pattern_prompt(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
    ) -> str:
        """Build the prompt asking Nemotron for a failure-case document."""
        return (
            f"## Incident 資訊\n"
            f"- ID: {incident.incident_id}\n"
            f"- 受影響服務: {', '.join(incident.affected_services or [])}\n\n"
            f"## 嘗試的 Playbook\n- 名稱: {playbook.name}\n\n"
            f"## 失敗原因\n{result.error or '執行中發生未知異常'}\n\n"
            "請生成失敗案例文件(Markdown 格式),包含:\n"
            "## 症狀描述\n## 嘗試的修復方案\n## 失敗原因分析\n"
            "## 已知不適用條件\n## 替代方案建議"
        )

    async def _call_nemotron_for_runbook(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
    ) -> str:
        """Return runbook Markdown from Nemotron, or the minimal fallback.

        The fallback is used when the chat call raises, times out, reports
        failure, or returns empty text.
        """
        prompt = self._build_runbook_prompt(incident, playbook, result)
        text = await self._ask_nemotron(self._RUNBOOK_SYSTEM, prompt, "runbook")
        return text or self._build_minimal_runbook(incident, playbook, result)

    async def _call_nemotron_for_anti_pattern(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
    ) -> str:
        """Return failure-case Markdown from Nemotron, or the minimal fallback.

        The fallback is used when the chat call raises, times out, reports
        failure, or returns empty text.
        """
        prompt = self._build_anti_pattern_prompt(incident, playbook, result)
        text = await self._ask_nemotron(self._ANTI_PATTERN_SYSTEM, prompt, "anti_pattern")
        return text or self._build_minimal_anti_pattern(incident, playbook, result)

    def _build_minimal_runbook(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
    ) -> str:
        """Assemble a basic runbook without the model (Nemotron fallback).

        ``playbook`` is accepted for signature parity with the other builders
        but is not used in the text.
        """
        # Tolerate a missing step list so the fallback itself cannot fail on it.
        steps = "\n".join(f"- {s}" for s in (result.executed_steps or []))
        return (
            f"## 症狀描述\nIncident {incident.incident_id},"
            f"受影響服務:{', '.join(incident.affected_services or [])}\n\n"
            f"## 執行步驟\n{steps}\n\n"
            f"## 執行結果\n成功,耗時 {result.execution_time_ms}ms\n\n"
            "*本文件由系統自動生成(Nemotron fallback),建議人工補充完善。*"
        )

    def _build_minimal_anti_pattern(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
    ) -> str:
        """Assemble a basic failure-case record without the model (fallback)."""
        return (
            f"## 症狀描述\nIncident {incident.incident_id},"
            f"受影響服務:{', '.join(incident.affected_services or [])}\n\n"
            f"## 失敗原因\n{result.error or '執行中發生異常'}\n\n"
            f"## 已知不適用條件\nPlaybook '{playbook.name}' 在此症狀下失敗,請勿自動重試。\n\n"
            "*本文件由系統自動生成(Nemotron fallback)。*"
        )

    async def _push_runbook_review_card(
        self,
        incident: "Incident",
        entry_id: str,
        content_preview: str,
    ) -> None:
        """Send the runbook review message to Telegram; failures are only logged."""
        try:
            from src.services.telegram_gateway import get_telegram_gateway

            tg = get_telegram_gateway()
            await tg.send_text(
                f"📄 Auto Runbook 待審核\n"
                f"Incident: {incident.incident_id}\n"
                f"Entry ID: {entry_id}\n\n"
                f"{content_preview}...\n\n"
                f"請至知識庫審核並發布。"
            )
        except Exception as e:
            logger.warning(
                "runbook_review_card_failed",
                error=str(e),
                error_type=type(e).__name__,
            )

    async def _push_anti_pattern_notification(
        self,
        incident: "Incident",
        title: str,
    ) -> None:
        """Send the anti-pattern-recorded message to Telegram; failures are only logged."""
        try:
            from src.services.telegram_gateway import get_telegram_gateway

            tg = get_telegram_gateway()
            await tg.send_text(
                f"⚠️ 已記錄失敗案例\n"
                f"Incident: {incident.incident_id}\n"
                f"標題: {title}\n\n"
                f"相同症狀的後續告警將阻斷自動修復,要求人工介入。"
            )
        except Exception as e:
            logger.warning(
                "anti_pattern_notification_failed",
                error=str(e),
                error_type=type(e).__name__,
            )
# =============================================================================
# 單例管理
# =============================================================================
_generator: NemotronRunbookGenerator | None = None


def get_runbook_generator() -> NemotronRunbookGenerator:
    """Return the module-level NemotronRunbookGenerator, creating it on first call."""
    global _generator
    if _generator is not None:
        return _generator
    # First call: build the instance and cache it for later calls.
    _generator = NemotronRunbookGenerator()
    return _generator