Files
awoooi/apps/api/src/services/runbook_generator.py
Your Name 341c3b6523
All checks were successful
Code Review / ai-code-review (push) Successful in 10s
CD Pipeline / tests (push) Successful in 1m13s
CD Pipeline / build-and-deploy (push) Successful in 3m22s
CD Pipeline / post-deploy-checks (push) Successful in 1m28s
fix(telegram): format governance and runbook alerts
2026-05-07 00:58:20 +08:00

427 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Runbook Generator - Phase 25 P1 Knowledge Auto-Harvesting
==========================================================
修復後自動生成 Runbook成功或 Anti-Pattern失敗
透過 Nemotron NIM 生成,結果沉澱至 KM 知識庫
設計原則:
- 非阻塞asyncio.create_task() 呼叫,絕不影響 AutoRepair 主流程
- 失敗靜默:生成失敗只記 log不拋例外
- DRAFT/PUBLISHED成功 → DRAFT需人工審核失敗 → PUBLISHED直接封鎖
版本: v1.1
建立: 2026-04-04 (台北時區)
建立者: ogt (首席架構師設計) + Claude Code (實作)
關聯設計: docs/superpowers/specs/2026-04-04-nemotron-active-defense-design.md 方向一
變更紀錄:
| 版本 | 日期 | 執行者 | 變更內容 |
|------|------|--------|----------|
| v1.0 | 2026-04-04 | Claude Code | 初始佔位(使用 generate() 但介面不存在) |
| v1.1 | 2026-04-04 | ogt (首席架構師) | 改用正確的 nvidia.chat() 介面;新增 Minimal fallback |
"""
from __future__ import annotations
import asyncio
import html
import re
import time
from typing import TYPE_CHECKING
import structlog
from src.models.knowledge import (
EntrySource,
EntryStatus,
EntryType,
KnowledgeEntryCreate,
)
if TYPE_CHECKING:
from src.models.incident import Incident
from src.models.playbook import Playbook
from src.services.auto_repair_service import AutoRepairResult
# Module-level structured logger, keyed by this module's import name.
logger = structlog.get_logger(__name__)
# Upper bound (in characters) applied to the Telegram review card text before
# it is returned by format_runbook_review_card().
# NOTE(review): presumably headroom below Telegram's message size limit - confirm.
_CARD_MAX_LEN = 3600
_SECTION_RE = re.compile(r"^#{1,6}\s+(?P<title>.+?)\s*$")
_BULLET_RE = re.compile(r"^\s*(?:[-*]|\d+[.)])\s*")
def _html(text: object) -> str:
return html.escape(str(text), quote=False)
def _shorten(text: object, limit: int = 120) -> str:
compact = " ".join(str(text or "").split())
if len(compact) <= limit:
return compact
return compact[: max(0, limit - 1)].rstrip() + ""
def _clean_preview_line(line: str) -> str:
    """Reduce one Markdown line to plain preview text.

    A line that is a Markdown heading is blanked out entirely; otherwise a
    leading list marker is removed, backticks are dropped, and whitespace is
    collapsed to single spaces.
    """
    trimmed = line.strip()
    without_heading = _SECTION_RE.sub("", trimmed)
    without_marker = _BULLET_RE.sub("", without_heading).strip()
    words = without_marker.replace("`", "").split()
    return " ".join(words)
def _section_preview(content: str, title_keyword: str, *, fallback: str) -> str:
    """Extract a one-line readable summary from a section of Markdown content.

    This avoids dumping the entire Runbook text into Telegram. The content is
    scanned line by line; every heading switches the "inside target section"
    flag depending on whether its title contains ``title_keyword``. The first
    line inside a matching section that still has text after cleaning is
    returned, shortened to 120 characters.

    Args:
        content: Markdown text (falsy values are treated as empty).
        title_keyword: Substring a heading title must contain to be selected.
        fallback: Value returned when no usable line is found.
    """
    inside_target = False
    for raw in str(content or "").splitlines():
        candidate = raw.strip()
        if candidate == "":
            continue
        heading_match = _SECTION_RE.match(candidate)
        if heading_match is not None:
            # Every heading re-evaluates whether we are in a wanted section.
            inside_target = title_keyword in heading_match.group("title")
        elif inside_target:
            summary = _clean_preview_line(candidate)
            if summary:
                return _shorten(summary, 120)
    return fallback
def _step_preview(content: str) -> str:
    """Return a one-line preview of the Runbook's execution section.

    If the preview contains a template placeholder or one of the known error
    fragments, a fixed warning is returned instead, telling the reviewer the
    steps need manual correction before publishing.
    """
    summary = _section_preview(content, "執行", fallback="待審核 Runbook 執行步驟")
    suspicious_tokens = ("{host}", "{target}", "Unsupported scheme", "Invalid component name")
    for token in suspicious_tokens:
        if token in summary:
            return "含 placeholder 或不支援的執行步驟,需人工修正後才能發布"
    return _shorten(summary, 120)
def format_runbook_review_card(
    incident: object,
    entry_id: str,
    content: str,
) -> str:
    """Format the Telegram Runbook review card.

    2026-05-07 Codex - replaced the plain-text Markdown preview with a
    governance card so that SREs can quickly judge knowledge status, affected
    services and review focus points.

    Args:
        incident: Object read via ``getattr`` for ``incident_id`` and
            ``affected_services``; missing values fall back to "unknown".
        entry_id: Identifier of the knowledge entry under review.
        content: Runbook Markdown from which the symptom/step previews are taken.

    Returns:
        HTML-tagged card text, cut to ``_CARD_MAX_LEN`` characters.
    """
    incident_id = getattr(incident, "incident_id", "unknown")
    service_names = getattr(incident, "affected_services", None) or []
    services = ", ".join(service_names) or "unknown"
    symptom = _section_preview(content, "症狀", fallback=f"Incident {incident_id} 的修復知識待審核")
    step = _step_preview(content)

    # Escape every dynamic value before it is embedded in the HTML markup.
    safe_incident = _html(incident_id)
    safe_services = _html(services)
    safe_entry = _html(entry_id)
    safe_symptom = _html(symptom)
    safe_step = _html(step)

    parts = [
        "📄 <b>RUNBOOK REVIEW待審核</b>\n",
        "──────────────────────\n",
        f"📋 Incident<code>{safe_incident}</code>\n",
        f"🧩 受影響服務:<code>{safe_services}</code>\n",
        "🧠 知識狀態:<b>DRAFT需人工審核</b>\n",
        f"🗂️ Entry ID<code>{safe_entry}</code>\n\n",
        "🧾 <b>內容摘要</b>\n",
        f"├ 症狀:{safe_symptom}\n",
        f"└ 執行:{safe_step}\n\n",
        "✅ <b>審核重點</b>\n",
        "1. 確認步驟可重跑,且不含 placeholder / 不支援 scheme\n",
        "2. 補齊適用條件、rollback 與驗證方式\n\n",
        "🔎 AwoooP知識庫 / Runbook Review",
    ]
    card = "".join(parts)
    return card[:_CARD_MAX_LEN]
class NemotronRunbookGenerator:
    """Generate knowledge-base entries from auto-repair outcomes via Nemotron.

    Responsibilities:
    - Successful repair -> AUTO_RUNBOOK entry (DRAFT) + Telegram review card
    - Failed repair -> ANTI_PATTERN entry (PUBLISHED) + Telegram notification

    leWOOOgo building-block conventions:
    - Persists through KnowledgeService; never writes to the DB directly
    - Calls NvidiaProvider.chat() directly (not AIRouter), because the Runbook
      is a knowledge side effect
    """

    # System instructions prepended to the prompts sent to Nemotron.
    _RUNBOOK_SYSTEM = (
        "你是 AWOOOI 平台的 SRE Runbook 撰寫專家。"
        "根據提供的 Incident 與修復結果,用繁體中文生成完整結構化 Runbook。"
    )
    _ANTI_PATTERN_SYSTEM = (
        "你是 AWOOOI 平台的故障分析專家。"
        "根據失敗的修復嘗試,用繁體中文生成失敗案例記錄,幫助未來避免重蹈覆轍。"
    )

    async def generate_runbook(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
        symptoms_hash: str,
    ) -> None:
        """Generate an AUTO_RUNBOOK entry after a successful repair.

        Meant to be scheduled fire-and-forget; the caller does not wait for it.
        Any exception is logged and swallowed so the repair flow is unaffected.

        Args:
            incident: The incident that triggered the repair.
            playbook: The playbook that was executed.
            result: Execution result (success=True).
            symptoms_hash: Hash from SymptomPattern.compute_hash().
        """
        try:
            content = await self._call_nemotron_for_runbook(incident, playbook, result)
            if not content:
                return
            from src.services.knowledge_service import get_knowledge_service

            ks = get_knowledge_service()
            entry_data = KnowledgeEntryCreate(
                # NOTE(review): incident id and playbook name are joined without
                # a separator - confirm whether that is intended.
                title=f"[AUTO] {incident.incident_id}{playbook.name}",
                content=content,
                entry_type=EntryType.AUTO_RUNBOOK,
                category="ai_system",
                tags=list(incident.affected_services or []) + ["auto_runbook", "nemotron"],
                source=EntrySource.AI_EXTRACTED,
                status=EntryStatus.DRAFT,
                related_incident_id=incident.incident_id,
                related_playbook_id=playbook.playbook_id,
                symptoms_hash=symptoms_hash,
                created_by="nemotron_runbook_generator",
            )
            entry = await ks.create_entry(entry_data)
            logger.info(
                "auto_runbook_created",
                incident_id=incident.incident_id,
                entry_id=entry.id,
                playbook_id=playbook.playbook_id,
            )
            await self._push_runbook_review_card(incident, entry.id, content)
        except Exception as e:
            # Deliberate best-effort boundary: log only, never propagate.
            logger.error(
                "runbook_generation_failed",
                incident_id=incident.incident_id,
                error=str(e),
            )

    async def generate_anti_pattern(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
        symptoms_hash: str,
    ) -> None:
        """Generate an ANTI_PATTERN entry after a failed repair.

        Meant to be scheduled fire-and-forget; the entry is stored directly as
        PUBLISHED. Any exception is logged and swallowed.

        Args:
            incident: The incident that triggered the repair.
            playbook: The playbook that was attempted.
            result: Execution result (success=False).
            symptoms_hash: Hash from SymptomPattern.compute_hash().
        """
        try:
            content = await self._call_nemotron_for_anti_pattern(incident, playbook, result)
            if not content:
                return
            from src.services.knowledge_service import get_knowledge_service

            ks = get_knowledge_service()
            # NOTE(review): incident id and playbook name are joined without a
            # separator - confirm whether that is intended.
            title = f"[FAIL] {incident.incident_id}{playbook.name}"
            entry_data = KnowledgeEntryCreate(
                title=title,
                content=content,
                entry_type=EntryType.ANTI_PATTERN,
                category="failure_cases",
                tags=list(incident.affected_services or []) + ["anti_pattern", "failure"],
                source=EntrySource.AI_EXTRACTED,
                status=EntryStatus.PUBLISHED,  # published immediately, no review step
                related_incident_id=incident.incident_id,
                related_playbook_id=playbook.playbook_id,
                symptoms_hash=symptoms_hash,
                created_by="nemotron_runbook_generator",
            )
            entry = await ks.create_entry(entry_data)
            logger.info(
                "anti_pattern_created",
                incident_id=incident.incident_id,
                entry_id=entry.id,
                symptoms_hash=symptoms_hash,
            )
            await self._push_anti_pattern_notification(incident, title)
        except Exception as e:
            # Deliberate best-effort boundary: log only, never propagate.
            logger.error(
                "anti_pattern_generation_failed",
                incident_id=incident.incident_id,
                error=str(e),
            )

    # =========================================================================
    # Private
    # =========================================================================
    async def _call_nemotron_for_runbook(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
    ) -> str:
        """Ask Nemotron chat() for a 9-section Runbook and return it as Markdown.

        Falls back to the minimal locally-built Runbook when the call raises,
        times out, or reports no usable response.
        """
        from src.core.config import get_settings
        from src.services.nvidia_provider import get_nvidia_provider

        settings = get_settings()
        prompt = (
            f"## Incident 資訊\n"
            f"- ID: {incident.incident_id}\n"
            f"- 受影響服務: {', '.join(incident.affected_services or [])}\n"
            f"- 嚴重度: {incident.severity.value if incident.severity else 'unknown'}\n\n"
            f"## 執行的 Playbook\n"
            f"- 名稱: {playbook.name}\n"
            f"- 執行步驟:\n"
            # Only the first five executed steps are included in the prompt.
            + "\n".join(f" {s}" for s in result.executed_steps[:5])
            + f"\n\n## 執行結果\n- 狀態: 成功,耗時 {result.execution_time_ms}ms\n\n"
            "請生成包含以下 9 段的 RunbookMarkdown 格式):\n"
            "1. ## 症狀描述\n2. ## 根因分析\n3. ## 執行步驟\n"
            "4. ## 驗證步驟\n5. ## 注意事項\n6. ## 影響範圍\n"
            "7. ## 相關 Incident\n8. ## 下次預防建議\n9. ## 適用條件"
        )
        try:
            nvidia = get_nvidia_provider()
            # Monotonic clock: the elapsed time must not be affected by
            # wall-clock adjustments.
            start = time.monotonic()
            # chat() returns (response_text, success, total_tokens, cost_usd)
            response_text, success, _tokens, _cost = await asyncio.wait_for(
                nvidia.chat(prompt=f"[SYSTEM]{self._RUNBOOK_SYSTEM}\n\n{prompt}"),
                timeout=settings.NEMOTRON_TIMEOUT_SECONDS,
            )
            latency_ms = (time.monotonic() - start) * 1000
            logger.info("runbook_nemotron_call_ok", latency_ms=round(latency_ms, 1))
            if success and response_text:
                return response_text
        except Exception as e:
            logger.warning("runbook_nemotron_call_failed", error=str(e))
        # Fallback: assemble a basic Runbook locally.
        return self._build_minimal_runbook(incident, playbook, result)

    async def _call_nemotron_for_anti_pattern(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
    ) -> str:
        """Ask Nemotron chat() for a failure-case record and return it as Markdown.

        Falls back to the minimal locally-built record when the call raises,
        times out, or reports no usable response.
        """
        from src.core.config import get_settings
        from src.services.nvidia_provider import get_nvidia_provider

        settings = get_settings()
        prompt = (
            f"## Incident 資訊\n"
            f"- ID: {incident.incident_id}\n"
            f"- 受影響服務: {', '.join(incident.affected_services or [])}\n\n"
            f"## 嘗試的 Playbook\n- 名稱: {playbook.name}\n\n"
            f"## 失敗原因\n{result.error or '執行中發生未知異常'}\n\n"
            "請生成失敗案例文件Markdown 格式),包含:\n"
            "## 症狀描述\n## 嘗試的修復方案\n## 失敗原因分析\n"
            "## 已知不適用條件\n## 替代方案建議"
        )
        try:
            nvidia = get_nvidia_provider()
            response_text, success, _tokens, _cost = await asyncio.wait_for(
                nvidia.chat(prompt=f"[SYSTEM]{self._ANTI_PATTERN_SYSTEM}\n\n{prompt}"),
                timeout=settings.NEMOTRON_TIMEOUT_SECONDS,
            )
            if success and response_text:
                return response_text
        except Exception as e:
            logger.warning("anti_pattern_nemotron_call_failed", error=str(e))
        return self._build_minimal_anti_pattern(incident, playbook, result)

    def _build_minimal_runbook(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
    ) -> str:
        """Build the basic Runbook used when the Nemotron call times out or fails."""
        steps = "\n".join(f"- {s}" for s in result.executed_steps)
        return (
            f"## 症狀描述\nIncident {incident.incident_id}"
            f"受影響服務:{', '.join(incident.affected_services or [])}\n\n"
            f"## 執行步驟\n{steps}\n\n"
            f"## 執行結果\n成功,耗時 {result.execution_time_ms}ms\n\n"
            "*本文件由系統自動生成Nemotron fallback建議人工補充完善。*"
        )

    def _build_minimal_anti_pattern(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
    ) -> str:
        """Build the basic Anti-Pattern used when the Nemotron call times out or fails."""
        return (
            f"## 症狀描述\nIncident {incident.incident_id}"
            f"受影響服務:{', '.join(incident.affected_services or [])}\n\n"
            f"## 失敗原因\n{result.error or '執行中發生異常'}\n\n"
            f"## 已知不適用條件\nPlaybook '{playbook.name}' 在此症狀下失敗,請勿自動重試。\n\n"
            "*本文件由系統自動生成Nemotron fallback。*"
        )

    async def _push_runbook_review_card(
        self,
        incident: "Incident",
        entry_id: str,
        content_preview: str,
    ) -> None:
        """Send the Runbook review card to Telegram; failures are only logged."""
        try:
            from src.services.telegram_gateway import get_telegram_gateway

            tg = get_telegram_gateway()
            await tg.send_text(format_runbook_review_card(incident, entry_id, content_preview))
        except Exception as e:
            logger.warning("runbook_review_card_failed", error=str(e))

    @staticmethod
    def _format_anti_pattern_notification(incident: "Incident", title: str) -> str:
        """Build the HTML text of the "failure case recorded" Telegram message.

        The incident id and the title (which embeds the playbook name) are
        escaped before being placed inside the markup, so characters such as
        ``<``, ``>`` or ``&`` cannot break the message's HTML.
        """
        safe_incident_id = html.escape(str(incident.incident_id), quote=False)
        safe_title = html.escape(str(title), quote=False)
        return (
            f"⚠️ <b>已記錄失敗案例</b>\n"
            f"Incident: <code>{safe_incident_id}</code>\n"
            f"標題: {safe_title}\n\n"
            f"相同症狀的後續告警將阻斷自動修復,要求人工介入。"
        )

    async def _push_anti_pattern_notification(
        self,
        incident: "Incident",
        title: str,
    ) -> None:
        """Send the "Anti-Pattern recorded" notice to Telegram; failures are only logged."""
        try:
            from src.services.telegram_gateway import get_telegram_gateway

            tg = get_telegram_gateway()
            await tg.send_text(self._format_anti_pattern_notification(incident, title))
        except Exception as e:
            logger.warning("anti_pattern_notification_failed", error=str(e))
# =============================================================================
# Singleton management
# =============================================================================
# Holds the shared generator once it has been created; None until first use.
_generator: NemotronRunbookGenerator | None = None


def get_runbook_generator() -> NemotronRunbookGenerator:
    """Return the module-wide NemotronRunbookGenerator, creating it on first call."""
    global _generator
    instance = _generator
    if instance is None:
        instance = NemotronRunbookGenerator()
        _generator = instance
    return instance