427 lines
16 KiB
Python
427 lines
16 KiB
Python
"""
Runbook Generator - Phase 25 P1 Knowledge Auto-Harvesting
==========================================================
修復後自動生成 Runbook(成功)或 Anti-Pattern(失敗)
透過 Nemotron NIM 生成,結果沉澱至 KM 知識庫

設計原則:
- 非阻塞:asyncio.create_task() 呼叫,絕不影響 AutoRepair 主流程
- 失敗靜默:生成失敗只記 log,不拋例外
- DRAFT/PUBLISHED:成功 → DRAFT(需人工審核),失敗 → PUBLISHED(直接封鎖)

版本: v1.1
建立: 2026-04-04 (台北時區)
建立者: ogt (首席架構師設計) + Claude Code (實作)
關聯設計: docs/superpowers/specs/2026-04-04-nemotron-active-defense-design.md 方向一

變更紀錄:
| 版本 | 日期 | 執行者 | 變更內容 |
|------|------|--------|----------|
| v1.0 | 2026-04-04 | Claude Code | 初始佔位(使用 generate() 但介面不存在) |
| v1.1 | 2026-04-04 | ogt (首席架構師) | 改用正確的 nvidia.chat() 介面;新增 Minimal fallback |
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
import html
|
||
import re
|
||
import time
|
||
from typing import TYPE_CHECKING
|
||
|
||
import structlog
|
||
|
||
from src.models.knowledge import (
|
||
EntrySource,
|
||
EntryStatus,
|
||
EntryType,
|
||
KnowledgeEntryCreate,
|
||
)
|
||
|
||
if TYPE_CHECKING:
|
||
from src.models.incident import Incident
|
||
from src.models.playbook import Playbook
|
||
from src.services.auto_repair_service import AutoRepairResult
|
||
|
||
logger = structlog.get_logger(__name__)
|
||
|
||
# Upper bound, in characters, applied to the rendered review card.
# NOTE(review): presumably chosen to stay below Telegram's message size
# limit — confirm against the gateway before changing.
_CARD_MAX_LEN = 3600

# A whole Markdown ATX heading line: one to six "#", whitespace, then the
# heading text, which is captured in the "title" group.
_SECTION_RE = re.compile(r"^#{1,6}\s+(?P<title>.+?)\s*$")

# A leading list marker: "-", "*", or digits followed by "." or ")",
# together with the whitespace around it.
_BULLET_RE = re.compile(r"^\s*(?:[-*]|\d+[.)])\s*")
|
||
|
||
|
||
def _html(text: object) -> str:
    """Return ``text`` as a string that is safe inside HTML element content.

    The value is stringified first. ``&``, ``<`` and ``>`` are replaced by
    their entities; quote characters are left untouched.
    """
    as_text = str(text)
    return html.escape(as_text, quote=False)
|
||
|
||
|
||
def _shorten(text: object, limit: int = 120) -> str:
    """Collapse whitespace in ``text`` and cap the result at ``limit`` characters.

    Args:
        text: Any value; falsy values are treated as an empty string,
            everything else is stringified.
        limit: Maximum length of the returned string before an ellipsis is
            needed.

    Returns:
        The text with every whitespace run reduced to a single space. When
        that is longer than ``limit`` it is cut to ``limit - 1`` characters
        (never fewer than zero), trailing whitespace is dropped, and "…" is
        appended.
    """
    normalized = " ".join(str(text or "").split())
    if len(normalized) > limit:
        keep = max(0, limit - 1)
        return normalized[:keep].rstrip() + "…"
    return normalized
|
||
|
||
|
||
def _clean_preview_line(line: str) -> str:
    """Strip Markdown decoration from one line so it reads as plain text.

    A line that is entirely a Markdown heading is removed in full (the
    heading pattern matches the whole line). Otherwise a leading list marker
    is dropped, backticks are deleted, and whitespace runs are collapsed to
    single spaces.
    """
    trimmed = line.strip()
    without_heading = _SECTION_RE.sub("", trimmed)
    without_bullet = _BULLET_RE.sub("", without_heading).strip()
    plain = without_bullet.replace("`", "")
    return " ".join(plain.split())
|
||
|
||
|
||
def _section_preview(content: str, title_keyword: str, *, fallback: str) -> str:
    """Extract a one-line readable summary of a Markdown section.

    This keeps the Telegram card short instead of dumping the raw runbook.

    Args:
        content: Markdown text to scan; falsy values are treated as empty.
        title_keyword: Substring that selects the section: a heading whose
            title contains it opens the section, any other heading closes it.
        fallback: Text returned when no usable line is found.

    Returns:
        The first non-empty cleaned line inside a matching section, shortened
        to 120 characters, or ``fallback``.
    """
    inside_target = False
    for candidate in (raw.strip() for raw in str(content or "").splitlines()):
        if not candidate:
            continue
        heading_match = _SECTION_RE.match(candidate)
        if heading_match is not None:
            # Every heading re-evaluates whether we are in the wanted section.
            inside_target = title_keyword in heading_match.group("title")
        elif inside_target:
            cleaned = _clean_preview_line(candidate)
            if cleaned:
                return _shorten(cleaned, 120)
    return fallback
|
||
|
||
|
||
def _step_preview(content: str) -> str:
    """Summarise the execution section of a runbook for the review card.

    When the extracted line still contains an unresolved placeholder or one
    of the known error markers, a fixed warning is returned instead, telling
    the reviewer that the steps must be corrected before publishing.
    """
    suspicious_tokens = ("{host}", "{target}", "Unsupported scheme", "Invalid component name")
    summary = _section_preview(content, "執行", fallback="待審核 Runbook 執行步驟")
    for token in suspicious_tokens:
        if token in summary:
            return "含 placeholder 或不支援的執行步驟,需人工修正後才能發布"
    return _shorten(summary, 120)
|
||
|
||
|
||
def format_runbook_review_card(
    incident: object,
    entry_id: str,
    content: str,
) -> str:
    """Render the Telegram review card for a runbook awaiting approval.

    Instead of a plain Markdown preview, the card is a governance summary
    that lets an SRE judge the knowledge status, the affected services and
    the review focus at a glance (introduced 2026-05-07 by Codex).

    Args:
        incident: Object describing the incident. ``incident_id`` and
            ``affected_services`` are read with ``getattr``; each falls back
            to ``"unknown"`` when missing or empty.
        entry_id: Identifier of the knowledge entry to review.
        content: Markdown body of the runbook; only one-line previews of its
            symptom and execution sections appear on the card.

    Returns:
        The HTML-formatted card, cut to at most ``_CARD_MAX_LEN`` characters.
        Every dynamic value is HTML-escaped before insertion.
    """
    incident_id = getattr(incident, "incident_id", "unknown")
    affected = getattr(incident, "affected_services", None) or []
    services = ", ".join(affected) or "unknown"
    symptom = _section_preview(content, "症狀", fallback=f"Incident {incident_id} 的修復知識待審核")
    step = _step_preview(content)

    # One list item per rendered line; empty items produce the blank
    # separator lines between the card's sections.
    card_lines = [
        "📄 <b>RUNBOOK REVIEW|待審核</b>",
        "──────────────────────",
        f"📋 Incident:<code>{_html(incident_id)}</code>",
        f"🧩 受影響服務:<code>{_html(services)}</code>",
        "🧠 知識狀態:<b>DRAFT|需人工審核</b>",
        f"🗂️ Entry ID:<code>{_html(entry_id)}</code>",
        "",
        "🧾 <b>內容摘要</b>",
        f"├ 症狀:{_html(symptom)}",
        f"└ 執行:{_html(step)}",
        "",
        "✅ <b>審核重點</b>",
        "1. 確認步驟可重跑,且不含 placeholder / 不支援 scheme",
        "2. 補齊適用條件、rollback 與驗證方式",
        "",
        "🔎 AwoooP:知識庫 / Runbook Review",
    ]
    return "\n".join(card_lines)[:_CARD_MAX_LEN]
|
||
|
||
|
||
class NemotronRunbookGenerator:
    """Turn auto-repair outcomes into knowledge-base entries using Nemotron.

    Responsibilities:
        - Successful repair -> ``AUTO_RUNBOOK`` entry stored as ``DRAFT`` plus
          a Telegram review card.
        - Failed repair -> ``ANTI_PATTERN`` entry stored as ``PUBLISHED`` plus
          a Telegram notification.

    Building-block rules:
        - Entries are stored through the knowledge service, never written to
          the database directly.
        - Text is generated with ``NvidiaProvider.chat()`` rather than the AI
          router, because runbook generation is a knowledge side effect.

    Both public coroutines log and swallow every exception, so they can be
    scheduled as fire-and-forget tasks.
    """

    # System prompt used when documenting a successful repair.
    _RUNBOOK_SYSTEM = (
        "你是 AWOOOI 平台的 SRE Runbook 撰寫專家。"
        "根據提供的 Incident 與修復結果,用繁體中文生成完整結構化 Runbook。"
    )

    # System prompt used when documenting a failed repair attempt.
    _ANTI_PATTERN_SYSTEM = (
        "你是 AWOOOI 平台的故障分析專家。"
        "根據失敗的修復嘗試,用繁體中文生成失敗案例記錄,幫助未來避免重蹈覆轍。"
    )

    async def generate_runbook(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
        symptoms_hash: str,
    ) -> None:
        """Create an ``AUTO_RUNBOOK`` draft after a successful repair.

        Intended to be fire-and-forget: the caller does not await a result
        and no exception escapes; failures are only logged.

        Args:
            incident: Incident that triggered the repair.
            playbook: Playbook that was executed.
            result: Execution result (expected ``success=True``).
            symptoms_hash: Hash produced by ``SymptomPattern.compute_hash()``.
        """
        try:
            content = await self._call_nemotron_for_runbook(incident, playbook, result)
            if not content:
                return

            from src.services.knowledge_service import get_knowledge_service

            ks = get_knowledge_service()

            entry_data = KnowledgeEntryCreate(
                title=f"[AUTO] {incident.incident_id} — {playbook.name}",
                content=content,
                entry_type=EntryType.AUTO_RUNBOOK,
                category="ai_system",
                tags=list(incident.affected_services or []) + ["auto_runbook", "nemotron"],
                source=EntrySource.AI_EXTRACTED,
                status=EntryStatus.DRAFT,
                related_incident_id=incident.incident_id,
                related_playbook_id=playbook.playbook_id,
                symptoms_hash=symptoms_hash,
                created_by="nemotron_runbook_generator",
            )

            entry = await ks.create_entry(entry_data)

            logger.info(
                "auto_runbook_created",
                incident_id=incident.incident_id,
                entry_id=entry.id,
                playbook_id=playbook.playbook_id,
            )

            await self._push_runbook_review_card(incident, entry.id, content)

        except Exception as e:
            # getattr keeps the handler itself from raising on a malformed
            # incident; error_type is logged because str(e) is empty for
            # some exceptions (e.g. timeouts).
            logger.error(
                "runbook_generation_failed",
                incident_id=getattr(incident, "incident_id", "unknown"),
                error=str(e),
                error_type=type(e).__name__,
            )

    async def generate_anti_pattern(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
        symptoms_hash: str,
    ) -> None:
        """Record an ``ANTI_PATTERN`` entry after a failed repair.

        Fire-and-forget like :meth:`generate_runbook`, but the entry is
        stored as ``PUBLISHED`` straight away.

        Args:
            incident: Incident that triggered the repair.
            playbook: Playbook that was attempted.
            result: Execution result (expected ``success=False``).
            symptoms_hash: Hash produced by ``SymptomPattern.compute_hash()``.
        """
        try:
            content = await self._call_nemotron_for_anti_pattern(incident, playbook, result)
            if not content:
                return

            from src.services.knowledge_service import get_knowledge_service

            ks = get_knowledge_service()

            title = f"[FAIL] {incident.incident_id} — {playbook.name}"
            entry_data = KnowledgeEntryCreate(
                title=title,
                content=content,
                entry_type=EntryType.ANTI_PATTERN,
                category="failure_cases",
                tags=list(incident.affected_services or []) + ["anti_pattern", "failure"],
                source=EntrySource.AI_EXTRACTED,
                status=EntryStatus.PUBLISHED,  # published immediately, no human review
                related_incident_id=incident.incident_id,
                related_playbook_id=playbook.playbook_id,
                symptoms_hash=symptoms_hash,
                created_by="nemotron_runbook_generator",
            )

            entry = await ks.create_entry(entry_data)

            logger.info(
                "anti_pattern_created",
                incident_id=incident.incident_id,
                entry_id=entry.id,
                symptoms_hash=symptoms_hash,
            )

            await self._push_anti_pattern_notification(incident, title)

        except Exception as e:
            logger.error(
                "anti_pattern_generation_failed",
                incident_id=getattr(incident, "incident_id", "unknown"),
                error=str(e),
                error_type=type(e).__name__,
            )

    # =========================================================================
    # Private
    # =========================================================================

    async def _call_nemotron_for_runbook(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
    ) -> str:
        """Ask Nemotron for a nine-section runbook and return it as Markdown.

        Falls back to :meth:`_build_minimal_runbook` when the call raises,
        times out, or returns an unsuccessful or empty response.
        """
        from src.core.config import get_settings
        from src.services.nvidia_provider import get_nvidia_provider

        settings = get_settings()
        # Only the first five steps go into the prompt; ``or []`` tolerates a
        # result that carries no recorded steps.
        step_lines = "\n".join(f" {s}" for s in (result.executed_steps or [])[:5])
        prompt = (
            f"## Incident 資訊\n"
            f"- ID: {incident.incident_id}\n"
            f"- 受影響服務: {', '.join(incident.affected_services or [])}\n"
            f"- 嚴重度: {incident.severity.value if incident.severity else 'unknown'}\n\n"
            f"## 執行的 Playbook\n"
            f"- 名稱: {playbook.name}\n"
            f"- 執行步驟:\n"
            f"{step_lines}"
            f"\n\n## 執行結果\n- 狀態: 成功,耗時 {result.execution_time_ms}ms\n\n"
            "請生成包含以下 9 段的 Runbook(Markdown 格式):\n"
            "1. ## 症狀描述\n2. ## 根因分析\n3. ## 執行步驟\n"
            "4. ## 驗證步驟\n5. ## 注意事項\n6. ## 影響範圍\n"
            "7. ## 相關 Incident\n8. ## 下次預防建議\n9. ## 適用條件"
        )

        try:
            nvidia = get_nvidia_provider()
            # Monotonic clock: latency must not be skewed by wall-clock jumps.
            start = time.monotonic()
            # chat() returns (response_text, success, total_tokens, cost_usd)
            response_text, success, _tokens, _cost = await asyncio.wait_for(
                nvidia.chat(prompt=f"[SYSTEM]{self._RUNBOOK_SYSTEM}\n\n{prompt}"),
                timeout=settings.NEMOTRON_TIMEOUT_SECONDS,
            )
            latency_ms = round((time.monotonic() - start) * 1000, 1)
            if success and response_text:
                logger.info("runbook_nemotron_call_ok", latency_ms=latency_ms)
                return response_text
            # The call returned but is unusable: make the fallback visible.
            logger.warning("runbook_nemotron_call_unsuccessful", latency_ms=latency_ms)
        except Exception as e:
            logger.warning(
                "runbook_nemotron_call_failed",
                error=str(e),
                error_type=type(e).__name__,
            )

        # Fallback: assemble a basic runbook locally.
        return self._build_minimal_runbook(incident, playbook, result)

    async def _call_nemotron_for_anti_pattern(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
    ) -> str:
        """Ask Nemotron for a failure-case record and return it as Markdown.

        Falls back to :meth:`_build_minimal_anti_pattern` when the call
        raises, times out, or returns an unsuccessful or empty response.
        """
        from src.core.config import get_settings
        from src.services.nvidia_provider import get_nvidia_provider

        settings = get_settings()
        prompt = (
            f"## Incident 資訊\n"
            f"- ID: {incident.incident_id}\n"
            f"- 受影響服務: {', '.join(incident.affected_services or [])}\n\n"
            f"## 嘗試的 Playbook\n- 名稱: {playbook.name}\n\n"
            f"## 失敗原因\n{result.error or '執行中發生未知異常'}\n\n"
            "請生成失敗案例文件(Markdown 格式),包含:\n"
            "## 症狀描述\n## 嘗試的修復方案\n## 失敗原因分析\n"
            "## 已知不適用條件\n## 替代方案建議"
        )

        try:
            nvidia = get_nvidia_provider()
            response_text, success, _tokens, _cost = await asyncio.wait_for(
                nvidia.chat(prompt=f"[SYSTEM]{self._ANTI_PATTERN_SYSTEM}\n\n{prompt}"),
                timeout=settings.NEMOTRON_TIMEOUT_SECONDS,
            )
            if success and response_text:
                return response_text
            # The call returned but is unusable: make the fallback visible.
            logger.warning("anti_pattern_nemotron_call_unsuccessful")
        except Exception as e:
            logger.warning(
                "anti_pattern_nemotron_call_failed",
                error=str(e),
                error_type=type(e).__name__,
            )

        return self._build_minimal_anti_pattern(incident, playbook, result)

    def _build_minimal_runbook(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
    ) -> str:
        """Build the basic runbook used when Nemotron times out or fails.

        The document lists the incident, every executed step and the
        execution time. ``playbook`` is accepted for signature parity with
        the other builders but is not used here.
        """
        # ``or []`` keeps the fallback usable when no steps were recorded.
        steps = "\n".join(f"- {s}" for s in (result.executed_steps or []))
        return (
            f"## 症狀描述\nIncident {incident.incident_id},"
            f"受影響服務:{', '.join(incident.affected_services or [])}\n\n"
            f"## 執行步驟\n{steps}\n\n"
            f"## 執行結果\n成功,耗時 {result.execution_time_ms}ms\n\n"
            "*本文件由系統自動生成(Nemotron fallback),建議人工補充完善。*"
        )

    def _build_minimal_anti_pattern(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
    ) -> str:
        """Build the basic anti-pattern used when Nemotron times out or fails.

        The document names the incident, the recorded error (or a generic
        placeholder when none is set) and the playbook that must not be
        retried automatically.
        """
        return (
            f"## 症狀描述\nIncident {incident.incident_id},"
            f"受影響服務:{', '.join(incident.affected_services or [])}\n\n"
            f"## 失敗原因\n{result.error or '執行中發生異常'}\n\n"
            f"## 已知不適用條件\nPlaybook '{playbook.name}' 在此症狀下失敗,請勿自動重試。\n\n"
            "*本文件由系統自動生成(Nemotron fallback)。*"
        )

    async def _push_runbook_review_card(
        self,
        incident: "Incident",
        entry_id: str,
        content_preview: str,
    ) -> None:
        """Send the runbook review card to Telegram; failures are only logged."""
        try:
            from src.services.telegram_gateway import get_telegram_gateway

            tg = get_telegram_gateway()
            await tg.send_text(format_runbook_review_card(incident, entry_id, content_preview))
        except Exception as e:
            logger.warning(
                "runbook_review_card_failed",
                error=str(e),
                error_type=type(e).__name__,
            )

    async def _push_anti_pattern_notification(
        self,
        incident: "Incident",
        title: str,
    ) -> None:
        """Tell Telegram that an anti-pattern was recorded; failures are only logged."""
        try:
            from src.services.telegram_gateway import get_telegram_gateway

            tg = get_telegram_gateway()
            # The message is HTML-formatted, so dynamic values are escaped the
            # same way the review card escapes them; the title embeds the
            # playbook name, which may contain "<" or "&".
            await tg.send_text(
                f"⚠️ <b>已記錄失敗案例</b>\n"
                f"Incident: <code>{_html(incident.incident_id)}</code>\n"
                f"標題: {_html(title)}\n\n"
                f"相同症狀的後續告警將阻斷自動修復,要求人工介入。"
            )
        except Exception as e:
            logger.warning(
                "anti_pattern_notification_failed",
                error=str(e),
                error_type=type(e).__name__,
            )
|
||
|
||
|
||
# =============================================================================
|
||
# 單例管理
|
||
# =============================================================================
|
||
|
||
# Process-wide generator instance, created on first request.
_generator: NemotronRunbookGenerator | None = None


def get_runbook_generator() -> NemotronRunbookGenerator:
    """Return the shared ``NemotronRunbookGenerator``, creating it on first use."""
    global _generator
    instance = _generator
    if instance is None:
        instance = NemotronRunbookGenerator()
        _generator = instance
    return instance
|