P0 - DIAGNOSE Privacy-First Routing: - ai_router.py: _local_fallback_chain [NEMOTRON→OLLAMA→REJECT] - DIAGNOSE 意圖 override 改為 NEMOTRON (原 OLLAMA) - DIAGNOSE fallback 使用 local-only 鏈,不觸碰雲端 - 全部失敗時 REJECT + Telegram 通知 - config.py: NEMOTRON_DIAGNOSE_TIMEOUT_SECONDS=30, OLLAMA_DIAGNOSE_TIMEOUT_SECONDS=60 - nemotron.py: 根據 context[task_type] 選擇 timeout P1 - Knowledge Auto-Harvesting: - models/knowledge.py: EntryType.AUTO_RUNBOOK + ANTI_PATTERN + symptoms_hash - EntryStatus.PUBLISHED (ANTI_PATTERN 直接發布,無需審核) - models/playbook.py: SymptomPattern.compute_hash() (16字元確定性 hash) - services/runbook_generator.py: NemotronRunbookGenerator (v1.1) - generate_runbook() → AUTO_RUNBOOK (DRAFT) + Telegram 審核 card - generate_anti_pattern() → ANTI_PATTERN (PUBLISHED) + Telegram 通知 - 使用 nvidia.chat() (正確介面),Nemotron 超時時 Minimal fallback - knowledge_service.py: check_anti_pattern(symptoms_hash, days=7) - db/models.py: symptoms_hash VARCHAR(16) + ix_knowledge_symptoms_hash - repositories/knowledge_repository.py: create() 支援 symptoms_hash + status - auto_repair_service.py: anti_pattern_gate 在 decide() + runbook hook 在 execute() - migrations/phase8_symptoms_hash.sql: ALTER TABLE + partial index + PUBLISHED constraint P2 - Config Drift Detection: - models/drift.py: DriftItem/DriftReport/DriftLevel/DriftIntent/DriftStatus - services/drift_detector.py: GitStateReader + K8sStateReader + DriftDetector - services/drift_analyzer.py: 白名單過濾 + DriftLevel 分級 - services/drift_interpreter.py: NemotronDriftInterpreter(意圖分析,不生成修復指令) - services/drift_remediator.py: rollback(kubectl apply) + adopt(git push gitea) - api/v1/drift.py: POST /scan, GET /reports, POST /rollback, POST /adopt - migrations/phase9_drift_reports.sql: drift_reports 表 - k8s/drift-cronjob.yaml: 每小時自動掃描 CronJob Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
344 lines
13 KiB
Python
344 lines
13 KiB
Python
"""
|
||
Runbook Generator - Phase 25 P1 Knowledge Auto-Harvesting
|
||
==========================================================
|
||
修復後自動生成 Runbook(成功)或 Anti-Pattern(失敗)
|
||
透過 Nemotron NIM 生成,結果沉澱至 KM 知識庫
|
||
|
||
設計原則:
|
||
- 非阻塞:asyncio.create_task() 呼叫,絕不影響 AutoRepair 主流程
|
||
- 失敗靜默:生成失敗只記 log,不拋例外
|
||
- DRAFT/PUBLISHED:成功 → DRAFT(需人工審核),失敗 → PUBLISHED(直接封鎖)
|
||
|
||
版本: v1.1
|
||
建立: 2026-04-04 (台北時區)
|
||
建立者: ogt (首席架構師設計) + Claude Code (實作)
|
||
關聯設計: docs/superpowers/specs/2026-04-04-nemotron-active-defense-design.md 方向一
|
||
|
||
變更紀錄:
|
||
| 版本 | 日期 | 執行者 | 變更內容 |
|
||
|------|------|--------|----------|
|
||
| v1.0 | 2026-04-04 | Claude Code | 初始佔位(使用 generate() 但介面不存在) |
|
||
| v1.1 | 2026-04-04 | ogt (首席架構師) | 改用正確的 nvidia.chat() 介面;新增 Minimal fallback |
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
import time
|
||
from typing import TYPE_CHECKING
|
||
|
||
import structlog
|
||
|
||
from src.models.knowledge import EntrySource, EntryStatus, EntryType, KnowledgeEntryCreate
|
||
|
||
if TYPE_CHECKING:
|
||
from src.models.incident import Incident
|
||
from src.models.playbook import Playbook
|
||
from src.services.auto_repair_service import AutoRepairResult
|
||
|
||
# Module-level structlog logger, named after this module.
logger = structlog.get_logger(__name__)
|
||
|
||
|
||
class NemotronRunbookGenerator:
    """
    Nemotron-driven generator that turns auto-repair outcomes into knowledge entries.

    Responsibilities:
    - Successful repair -> AUTO_RUNBOOK entry (DRAFT) + Telegram review card
    - Failed repair     -> ANTI_PATTERN entry (PUBLISHED) + Telegram notification

    leWOOOgo building-block rules:
    - Persist through KnowledgeService (never write to the DB directly)
    - Call NvidiaProvider.chat() rather than AIRouter, because runbook
      generation is a knowledge side effect
    """

    _RUNBOOK_SYSTEM = (
        "你是 AWOOOI 平台的 SRE Runbook 撰寫專家。"
        "根據提供的 Incident 與修復結果,用繁體中文生成完整結構化 Runbook。"
    )

    _ANTI_PATTERN_SYSTEM = (
        "你是 AWOOOI 平台的故障分析專家。"
        "根據失敗的修復嘗試,用繁體中文生成失敗案例記錄,幫助未來避免重蹈覆轍。"
    )

    async def generate_runbook(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
        symptoms_hash: str,
    ) -> None:
        """
        Create an AUTO_RUNBOOK knowledge entry after a successful repair.

        Intended to be run fire-and-forget: every exception is logged and
        swallowed so the caller's main flow is never affected.

        Args:
            incident: The incident that triggered the repair.
            playbook: The playbook that was executed.
            result: The execution result (expected success=True).
            symptoms_hash: Hash produced by SymptomPattern.compute_hash().
        """
        try:
            content = await self._call_nemotron_for_runbook(incident, playbook, result)
            if not content:
                return

            from src.services.knowledge_service import get_knowledge_service

            ks = get_knowledge_service()

            entry_data = KnowledgeEntryCreate(
                title=f"[AUTO] {incident.incident_id} — {playbook.name}",
                content=content,
                entry_type=EntryType.AUTO_RUNBOOK,
                category="auto_generated",
                tags=list(incident.affected_services or []) + ["auto_runbook", "nemotron"],
                source=EntrySource.AI_EXTRACTED,
                status=EntryStatus.DRAFT,
                related_incident_id=incident.incident_id,
                related_playbook_id=playbook.playbook_id,
                symptoms_hash=symptoms_hash,
                created_by="nemotron_runbook_generator",
            )

            entry = await ks.create_entry(entry_data)

            logger.info(
                "auto_runbook_created",
                incident_id=incident.incident_id,
                entry_id=entry.id,
                playbook_id=playbook.playbook_id,
            )

            # Only the first 200 characters are shown in the review card.
            await self._push_runbook_review_card(incident, entry.id, content[:200])

        except Exception as e:
            # Deliberate best-effort: generation failures are logged, never raised.
            logger.error(
                "runbook_generation_failed",
                incident_id=incident.incident_id,
                error=str(e),
            )

    async def generate_anti_pattern(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
        symptoms_hash: str,
    ) -> None:
        """
        Create an ANTI_PATTERN knowledge entry after a failed repair.

        The entry is stored as PUBLISHED straight away. Intended to be run
        fire-and-forget: every exception is logged and swallowed.

        Args:
            incident: The incident that triggered the repair.
            playbook: The playbook that was attempted.
            result: The execution result (expected success=False).
            symptoms_hash: Hash produced by SymptomPattern.compute_hash().
        """
        try:
            content = await self._call_nemotron_for_anti_pattern(incident, playbook, result)
            if not content:
                return

            from src.services.knowledge_service import get_knowledge_service

            ks = get_knowledge_service()

            title = f"[FAIL] {incident.incident_id} — {playbook.name}"
            entry_data = KnowledgeEntryCreate(
                title=title,
                content=content,
                entry_type=EntryType.ANTI_PATTERN,
                category="failure_cases",
                tags=list(incident.affected_services or []) + ["anti_pattern", "failure"],
                source=EntrySource.AI_EXTRACTED,
                status=EntryStatus.PUBLISHED,  # published immediately, no review step
                related_incident_id=incident.incident_id,
                related_playbook_id=playbook.playbook_id,
                symptoms_hash=symptoms_hash,
                created_by="nemotron_runbook_generator",
            )

            entry = await ks.create_entry(entry_data)

            logger.info(
                "anti_pattern_created",
                incident_id=incident.incident_id,
                entry_id=entry.id,
                symptoms_hash=symptoms_hash,
            )

            await self._push_anti_pattern_notification(incident, title)

        except Exception as e:
            # Deliberate best-effort: generation failures are logged, never raised.
            logger.error(
                "anti_pattern_generation_failed",
                incident_id=incident.incident_id,
                error=str(e),
            )

    # =========================================================================
    # Private
    # =========================================================================

    async def _call_nemotron_for_runbook(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
    ) -> str:
        """
        Ask Nemotron (via chat()) for a 9-section runbook and return it as Markdown.

        Falls back to _build_minimal_runbook() when the call raises, times out,
        or returns an unsuccessful/empty response.
        """
        from src.core.config import get_settings
        from src.services.nvidia_provider import get_nvidia_provider

        settings = get_settings()
        # Only the first five executed steps are included in the prompt.
        steps_text = "\n".join(f"  {s}" for s in (result.executed_steps or [])[:5])
        prompt = (
            f"## Incident 資訊\n"
            f"- ID: {incident.incident_id}\n"
            f"- 受影響服務: {', '.join(incident.affected_services or [])}\n"
            f"- 嚴重度: {incident.severity.value if incident.severity else 'unknown'}\n\n"
            f"## 執行的 Playbook\n"
            f"- 名稱: {playbook.name}\n"
            f"- 執行步驟:\n"
            f"{steps_text}"
            f"\n\n## 執行結果\n- 狀態: 成功,耗時 {result.execution_time_ms}ms\n\n"
            "請生成包含以下 9 段的 Runbook(Markdown 格式):\n"
            "1. ## 症狀描述\n2. ## 根因分析\n3. ## 執行步驟\n"
            "4. ## 驗證步驟\n5. ## 注意事項\n6. ## 影響範圍\n"
            "7. ## 相關 Incident\n8. ## 下次預防建議\n9. ## 適用條件"
        )

        try:
            nvidia = get_nvidia_provider()
            # perf_counter() is monotonic, so the latency cannot be skewed by
            # wall-clock adjustments.
            started = time.perf_counter()
            # chat() returns (response_text, success, total_tokens, cost_usd)
            response_text, success, _tokens, _cost = await asyncio.wait_for(
                nvidia.chat(prompt=f"[SYSTEM]{self._RUNBOOK_SYSTEM}\n\n{prompt}"),
                timeout=settings.NEMOTRON_TIMEOUT_SECONDS,
            )
            latency_ms = round((time.perf_counter() - started) * 1000, 1)
            if success and response_text:
                logger.info("runbook_nemotron_call_ok", latency_ms=latency_ms)
                return response_text
            # The call completed but produced nothing usable: report it as a
            # failure instead of an "ok".
            logger.warning(
                "runbook_nemotron_call_failed",
                error="unsuccessful or empty response",
                latency_ms=latency_ms,
            )
        except Exception as e:
            logger.warning("runbook_nemotron_call_failed", error=str(e))

        # Fallback: assemble a basic runbook locally.
        return self._build_minimal_runbook(incident, playbook, result)

    async def _call_nemotron_for_anti_pattern(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
    ) -> str:
        """
        Ask Nemotron (via chat()) for a failure-case record and return it as Markdown.

        Falls back to _build_minimal_anti_pattern() when the call raises, times
        out, or returns an unsuccessful/empty response.
        """
        from src.core.config import get_settings
        from src.services.nvidia_provider import get_nvidia_provider

        settings = get_settings()
        prompt = (
            f"## Incident 資訊\n"
            f"- ID: {incident.incident_id}\n"
            f"- 受影響服務: {', '.join(incident.affected_services or [])}\n\n"
            f"## 嘗試的 Playbook\n- 名稱: {playbook.name}\n\n"
            f"## 失敗原因\n{result.error or '執行中發生未知異常'}\n\n"
            "請生成失敗案例文件(Markdown 格式),包含:\n"
            "## 症狀描述\n## 嘗試的修復方案\n## 失敗原因分析\n"
            "## 已知不適用條件\n## 替代方案建議"
        )

        try:
            nvidia = get_nvidia_provider()
            response_text, success, _tokens, _cost = await asyncio.wait_for(
                nvidia.chat(prompt=f"[SYSTEM]{self._ANTI_PATTERN_SYSTEM}\n\n{prompt}"),
                timeout=settings.NEMOTRON_TIMEOUT_SECONDS,
            )
            if success and response_text:
                return response_text
            logger.warning(
                "anti_pattern_nemotron_call_failed",
                error="unsuccessful or empty response",
            )
        except Exception as e:
            logger.warning("anti_pattern_nemotron_call_failed", error=str(e))

        return self._build_minimal_anti_pattern(incident, playbook, result)

    def _build_minimal_runbook(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
    ) -> str:
        """Build the basic Markdown runbook used when the Nemotron call fails or times out."""
        steps = "\n".join(f"- {s}" for s in (result.executed_steps or []))
        return (
            f"## 症狀描述\nIncident {incident.incident_id},"
            f"受影響服務:{', '.join(incident.affected_services or [])}\n\n"
            f"## 執行步驟\n{steps}\n\n"
            f"## 執行結果\n成功,耗時 {result.execution_time_ms}ms\n\n"
            "*本文件由系統自動生成(Nemotron fallback),建議人工補充完善。*"
        )

    def _build_minimal_anti_pattern(
        self,
        incident: "Incident",
        playbook: "Playbook",
        result: "AutoRepairResult",
    ) -> str:
        """Build the basic Markdown anti-pattern used when the Nemotron call fails or times out."""
        return (
            f"## 症狀描述\nIncident {incident.incident_id},"
            f"受影響服務:{', '.join(incident.affected_services or [])}\n\n"
            f"## 失敗原因\n{result.error or '執行中發生異常'}\n\n"
            f"## 已知不適用條件\nPlaybook '{playbook.name}' 在此症狀下失敗,請勿自動重試。\n\n"
            "*本文件由系統自動生成(Nemotron fallback)。*"
        )

    @staticmethod
    def _escape_html(value: object) -> str:
        """Return *value* as text with ``&``, ``<`` and ``>`` escaped for HTML markup."""
        import html

        return html.escape(str(value), quote=False)

    @classmethod
    def _format_runbook_review_card(
        cls,
        incident_id: object,
        entry_id: object,
        content_preview: str,
    ) -> str:
        """Build the HTML text of the runbook review card; dynamic values are escaped."""
        esc = cls._escape_html
        return (
            f"📄 <b>Auto Runbook 待審核</b>\n"
            f"Incident: <code>{esc(incident_id)}</code>\n"
            f"Entry ID: <code>{esc(entry_id)}</code>\n\n"
            f"<i>{esc(content_preview)}...</i>\n\n"
            f"請至知識庫審核並發布。"
        )

    @classmethod
    def _format_anti_pattern_notification(cls, incident_id: object, title: str) -> str:
        """Build the HTML text of the anti-pattern notification; dynamic values are escaped."""
        esc = cls._escape_html
        return (
            f"⚠️ <b>已記錄失敗案例</b>\n"
            f"Incident: <code>{esc(incident_id)}</code>\n"
            f"標題: {esc(title)}\n\n"
            f"相同症狀的後續告警將阻斷自動修復,要求人工介入。"
        )

    async def _push_runbook_review_card(
        self,
        incident: "Incident",
        entry_id: str,
        content_preview: str,
    ) -> None:
        """
        Send the runbook review card to Telegram (best-effort).

        Args:
            incident: The incident the runbook was generated for.
            entry_id: ID of the newly created knowledge entry.
            content_preview: Leading excerpt of the generated runbook.
        """
        try:
            from src.services.telegram_gateway import get_telegram_gateway

            tg = get_telegram_gateway()
            # NOTE(review): assumes send_text() uses Telegram HTML parse mode,
            # as the <b>/<code>/<i> markup suggests — confirm. The preview is
            # LLM-generated Markdown, so it is escaped to keep the markup valid.
            await tg.send_text(
                self._format_runbook_review_card(
                    incident.incident_id, entry_id, content_preview
                )
            )
        except Exception as e:
            logger.warning("runbook_review_card_failed", error=str(e))

    async def _push_anti_pattern_notification(
        self,
        incident: "Incident",
        title: str,
    ) -> None:
        """
        Send the "anti-pattern recorded" notification to Telegram (best-effort).

        Args:
            incident: The incident the anti-pattern was generated for.
            title: Title of the stored anti-pattern entry.
        """
        try:
            from src.services.telegram_gateway import get_telegram_gateway

            tg = get_telegram_gateway()
            # NOTE(review): assumes send_text() uses Telegram HTML parse mode — confirm.
            await tg.send_text(
                self._format_anti_pattern_notification(incident.incident_id, title)
            )
        except Exception as e:
            logger.warning("anti_pattern_notification_failed", error=str(e))
|
||
|
||
|
||
# =============================================================================
# Singleton management
# =============================================================================

_generator: NemotronRunbookGenerator | None = None


def get_runbook_generator() -> NemotronRunbookGenerator:
    """Return the shared NemotronRunbookGenerator, creating it on first use."""
    global _generator
    instance = _generator
    if instance is None:
        # First call: build the generator and cache it in the module global.
        instance = _generator = NemotronRunbookGenerator()
    return instance
|