feat(auto-repair): 完整自動修復閉環 + KM 沉澱串接
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled

1. DB Migration: playbooks 資料表 (phase7_playbooks_table.sql)
   - 這是自動修復無法啟動的根本原因 — table 從未建立
   - 5 個索引: status/tags/alert_names/source_incidents/created_at
   - 已在 prod DB 執行

2. playbook_service: 萃取後自動沉澱 KM
   - extract_from_incident() 完成後 fire-and-forget _write_to_km()
   - 內容含症狀模式、修復步驟、信心度、來源 Incident

3. approval_execution: 執行結果沉澱 KM
   - _trigger_learning() 後 fire-and-forget _write_execution_result_to_km()
   - 成功/失敗記錄都寫入,category=execution_result

完整閉環:
告警 → AI分析 → 查Playbook → 決策 → 執行 → 結果寫KM
                                              ↓
                              Incident解決 → KM(knowledge_extractor)
                                          → Playbook萃取 → KM

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-04 11:54:15 +08:00
parent 429d81d29b
commit 72d7536ead
3 changed files with 176 additions and 0 deletions

View File

@@ -0,0 +1,58 @@
-- Phase 7: Playbook extraction feature — playbooks table
-- Created: 2026-04-04 (Taipei time zone)
-- Author: Claude Code (Phase 7 migration backfill)
-- Design reference: memory/project_playbook_design.md
-- Corresponding model: apps/api/src/models/playbook.py
--
-- Idempotent migration: the table and every index use IF NOT EXISTS, so the
-- script can be re-run safely; it will NOT alter a table that already exists.
CREATE TABLE IF NOT EXISTS playbooks (
-- Identity
-- NOTE(review): playbook_id is UNIQUE NOT NULL but no PRIMARY KEY is declared
-- on this table — confirm that is intentional.
playbook_id VARCHAR(32) UNIQUE NOT NULL,
-- Metadata
name VARCHAR(256) NOT NULL,
description TEXT NOT NULL DEFAULT '',
status VARCHAR(32) NOT NULL DEFAULT 'draft', -- draft|approved|deprecated
source VARCHAR(32) NOT NULL DEFAULT 'extracted', -- extracted|manual
-- Symptom pattern (SymptomPattern JSON)
symptom_pattern JSONB NOT NULL DEFAULT '{}',
-- Repair steps (list[RepairStep] JSON)
repair_steps JSONB NOT NULL DEFAULT '[]',
estimated_duration_minutes INT NOT NULL DEFAULT 5,
-- Source traceability
source_incident_ids TEXT[] NOT NULL DEFAULT '{}',
-- Stored with three decimal places (DECIMAL(4,3)).
ai_confidence DECIMAL(4,3) NOT NULL DEFAULT 0.0,
-- Usage statistics
success_count INT NOT NULL DEFAULT 0,
failure_count INT NOT NULL DEFAULT 0,
last_used_at TIMESTAMPTZ,
-- Manual review annotations
approved_by VARCHAR(128),
approved_at TIMESTAMPTZ,
tags TEXT[] NOT NULL DEFAULT '{}',
notes TEXT,
-- Timeline
-- NOTE(review): updated_at only has a DEFAULT; nothing in this script keeps it
-- current on UPDATE — presumably the application sets it; confirm.
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Indexes
-- B-tree index on status.
CREATE INDEX IF NOT EXISTS idx_playbooks_status
ON playbooks(status);
-- GIN index over the tags text array.
CREATE INDEX IF NOT EXISTS idx_playbooks_tags
ON playbooks USING GIN(tags);
-- GIN expression index over the 'alert_names' member of symptom_pattern.
CREATE INDEX IF NOT EXISTS idx_playbooks_alert_names
ON playbooks USING GIN((symptom_pattern->'alert_names'));
-- GIN index over the source_incident_ids text array.
CREATE INDEX IF NOT EXISTS idx_playbooks_source_incidents
ON playbooks USING GIN(source_incident_ids);
-- Descending index on created_at.
CREATE INDEX IF NOT EXISTS idx_playbooks_created_at
ON playbooks(created_at DESC);

View File

@@ -239,6 +239,13 @@ class ApprovalExecutionService:
result=result,
)
# 2026-04-04 ogt: 執行結果沉澱到 KM
# 統帥鐵律: 所有異常與自動修復紀錄必須回寫 KM
import asyncio
asyncio.create_task(
self._write_execution_result_to_km(approval, success, error_message)
)
except Exception as e:
# 學習失敗不影響主流程
logger.warning(
@@ -247,6 +254,57 @@ class ApprovalExecutionService:
error=str(e),
)
async def _write_execution_result_to_km(
    self,
    approval: "ApprovalRequest",
    success: bool,
    error_message: str | None,
) -> None:
    """Record the outcome of an executed approval as a Knowledge Base (KM) entry.

    2026-04-04 ogt: project rule — both successful and failed executions
    must be written back to KM.

    The entry is a Markdown document listing the approval id, the related
    incident id, the outcome text, the risk level and the approval's
    description. Any exception raised while building or saving the entry is
    caught and logged as a warning; nothing propagates to the caller.

    Args:
        approval: The approval request that was executed. Its ``action``,
            ``id``, ``incident_id``, ``risk_level`` and ``description``
            attributes are read.
        success: Whether the execution succeeded.
        error_message: Failure reason, used only when ``success`` is false.
    """
    try:
        # Imported locally, as in the rest of this fire-and-forget path.
        from src.models.knowledge import EntrySource, EntryType, KnowledgeEntryCreate
        from src.services.knowledge_service import get_knowledge_service

        # NOTE(review): both branches are the empty string in this source —
        # presumably success/failure emoji were lost somewhere; confirm.
        status_icon = "" if success else ""

        if success:
            outcome = "成功"
        else:
            outcome = f"失敗: {error_message or '未知原因'}"

        # Assemble the Markdown body section by section.
        sections = [
            f"# {status_icon} 執行記錄: {approval.action[:80]}\n\n",
            f"**Approval ID**: {approval.id}\n",
            f"**Incident ID**: {approval.incident_id or '未關聯'}\n",
            f"**執行結果**: {outcome}\n",
            f"**風險等級**: {approval.risk_level.value if approval.risk_level else '未知'}\n\n",
            f"## 操作內容\n{approval.description or '無描述'}\n",
        ]
        body = "".join(sections)

        # The second tag distinguishes successful repairs from failed runs.
        result_tag = "auto_repair" if success else "execution_failed"

        new_entry = KnowledgeEntryCreate(
            title=f"[執行記錄] {status_icon} {approval.action[:60]}",
            content=body,
            entry_type=EntryType.INCIDENT_CASE,
            category="execution_result",
            tags=["execution", result_tag],
            source=EntrySource.AI_EXTRACTED,
            related_incident_id=approval.incident_id,
            created_by="approval_execution",
        )

        knowledge_service = get_knowledge_service()
        await knowledge_service.create_entry(new_entry)

        logger.debug(
            "execution_result_written_to_km",
            approval_id=str(approval.id),
            success=success,
        )
    except Exception as exc:
        # Best-effort write: a KM failure must not break the execution flow.
        logger.warning(
            "execution_result_km_write_failed",
            approval_id=str(approval.id),
            error=str(exc),
        )
async def _send_execution_notification(
self,
approval: ApprovalRequest,

View File

@@ -176,6 +176,10 @@ class PlaybookService:
import asyncio
asyncio.create_task(self._index_playbook_async(playbook))
# 9. 2026-04-04 ogt: 沉澱到 KM (Knowledge Base)
# 統帥鐵律: 所有異常與自動修復紀錄必須回寫 KM
asyncio.create_task(self._write_to_km(playbook, incident))
logger.info(
"playbook_extracted",
playbook_id=playbook.playbook_id,
@@ -186,6 +190,62 @@ class PlaybookService:
return playbook
async def _write_to_km(self, playbook: Playbook, incident: Incident) -> None:
    """Store a freshly extracted playbook as a Knowledge Base (KM) entry.

    2026-04-04 ogt: project rule — incidents and auto-repair records must be
    written back to KM. Intended to run fire-and-forget: every exception is
    caught and logged as a warning so a failure never affects the main flow.

    The entry is a Markdown document containing the source incident ids, the
    AI confidence, the playbook status, the symptom pattern (alert names and
    affected services), the numbered repair steps and the description.

    Args:
        playbook: The extracted playbook to summarize.
        incident: The incident the playbook was extracted from; its
            ``incident_id`` is stored as the entry's related incident.
    """
    try:
        from src.models.knowledge import EntrySource, EntryType, KnowledgeEntryCreate
        from src.services.knowledge_service import get_knowledge_service

        # Numbered summary of the repair steps, with a placeholder when empty.
        numbered_steps = [
            f"{number}. [{step.action_type}] {step.command}"
            for number, step in enumerate(playbook.repair_steps, start=1)
        ]
        steps_summary = "\n".join(numbered_steps) or "(無明確修復步驟)"

        alert_summary = ", ".join(playbook.symptom_pattern.alert_names) or "未知"
        service_summary = ", ".join(playbook.symptom_pattern.affected_services) or "未知"

        # Assemble the Markdown body section by section.
        sections = [
            f"# Playbook: {playbook.name}\n\n",
            f"**來源 Incident**: {', '.join(playbook.source_incident_ids)}\n",
            f"**AI 信心度**: {playbook.ai_confidence:.0%}\n",
            f"**狀態**: {playbook.status.value}\n\n",
            f"## 症狀模式\n",
            f"- 告警: {alert_summary}\n",
            f"- 受影響服務: {service_summary}\n\n",
            f"## 修復步驟\n{steps_summary}\n\n",
            f"## 描述\n{playbook.description}",
        ]
        body = "".join(sections)

        # Playbook's own tags first, then the fixed markers and its status.
        entry_tags = list(playbook.tags) + [
            "playbook",
            "auto_extracted",
            playbook.status.value,
        ]

        new_entry = KnowledgeEntryCreate(
            title=f"[Playbook] {playbook.name}",
            content=body,
            entry_type=EntryType.INCIDENT_CASE,
            category="auto_repair",
            tags=entry_tags,
            source=EntrySource.AI_EXTRACTED,
            related_incident_id=incident.incident_id,
            created_by="playbook_service",
        )

        knowledge_service = get_knowledge_service()
        await knowledge_service.create_entry(new_entry)

        logger.info(
            "playbook_written_to_km",
            playbook_id=playbook.playbook_id,
            incident_id=incident.incident_id,
        )
    except Exception as exc:
        # Best-effort write: a KM failure must not break playbook extraction.
        logger.warning(
            "playbook_km_write_failed",
            playbook_id=playbook.playbook_id,
            error=str(exc),
        )
async def _index_playbook_async(self, playbook: Playbook) -> None:
"""非同步建立 Playbook 向量索引 (ADR-030 Phase 3)"""
try: