feat(auto-repair): 完整自動修復閉環 + KM 沉澱串接
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
1. DB Migration: playbooks 資料表 (phase7_playbooks_table.sql)
- 這是自動修復無法啟動的根本原因 — table 從未建立
- 5 個索引: status/tags/alert_names/source_incidents/created_at
- 已在 prod DB 執行
2. playbook_service: 萃取後自動沉澱 KM
- extract_from_incident() 完成後 fire-and-forget _write_to_km()
- 內容含症狀模式、修復步驟、信心度、來源 Incident
3. approval_execution: 執行結果沉澱 KM
- _trigger_learning() 後 fire-and-forget _write_execution_result_to_km()
- 成功/失敗記錄都寫入,category=execution_result
完整閉環:
告警 → AI分析 → 查Playbook → 決策 → 執行 → 結果寫KM
↓
Incident解決 → KM(knowledge_extractor)
→ Playbook萃取 → KM
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
58
apps/api/migrations/phase7_playbooks_table.sql
Normal file
58
apps/api/migrations/phase7_playbooks_table.sql
Normal file
@@ -0,0 +1,58 @@
|
||||
-- Phase 7: Playbook 萃取功能 — playbooks 資料表
|
||||
-- 建立時間: 2026-04-04 (台北時區)
|
||||
-- 建立者: Claude Code (Phase 7 補齊 migration)
|
||||
-- 對應設計: memory/project_playbook_design.md
|
||||
-- 對應模型: apps/api/src/models/playbook.py
|
||||
|
||||
-- playbooks: one row per repair playbook (extracted or manually authored).
CREATE TABLE IF NOT EXISTS playbooks (
    -- Identity
    playbook_id                 VARCHAR(32)   NOT NULL UNIQUE,

    -- Metadata
    name                        VARCHAR(256)  NOT NULL,
    description                 TEXT          NOT NULL DEFAULT '',
    status                      VARCHAR(32)   NOT NULL DEFAULT 'draft',      -- draft | approved | deprecated
    source                      VARCHAR(32)   NOT NULL DEFAULT 'extracted',  -- extracted | manual

    -- Symptom pattern (SymptomPattern stored as JSON)
    symptom_pattern             JSONB         NOT NULL DEFAULT '{}'::jsonb,

    -- Repair steps (list[RepairStep] stored as JSON)
    repair_steps                JSONB         NOT NULL DEFAULT '[]'::jsonb,
    estimated_duration_minutes  INTEGER       NOT NULL DEFAULT 5,

    -- Provenance
    source_incident_ids         TEXT[]        NOT NULL DEFAULT '{}'::text[],
    ai_confidence               NUMERIC(4,3)  NOT NULL DEFAULT 0.0,

    -- Usage statistics
    success_count               INTEGER       NOT NULL DEFAULT 0,
    failure_count               INTEGER       NOT NULL DEFAULT 0,
    last_used_at                TIMESTAMPTZ,

    -- Human review markers
    approved_by                 VARCHAR(128),
    approved_at                 TIMESTAMPTZ,
    tags                        TEXT[]        NOT NULL DEFAULT '{}'::text[],
    notes                       TEXT,

    -- Timeline
    created_at                  TIMESTAMPTZ   NOT NULL DEFAULT NOW(),
    updated_at                  TIMESTAMPTZ   NOT NULL DEFAULT NOW()
);
|
||||
|
||||
-- 索引
|
||||
-- B-tree index on the status column.
CREATE INDEX IF NOT EXISTS idx_playbooks_status ON playbooks (status);

-- GIN index on the tags text array.
CREATE INDEX IF NOT EXISTS idx_playbooks_tags ON playbooks USING gin (tags);

-- GIN expression index on the "alert_names" member of the symptom_pattern JSON.
CREATE INDEX IF NOT EXISTS idx_playbooks_alert_names ON playbooks USING gin ((symptom_pattern -> 'alert_names'));

-- GIN index on the source incident id array.
CREATE INDEX IF NOT EXISTS idx_playbooks_source_incidents ON playbooks USING gin (source_incident_ids);

-- B-tree index on creation time, stored newest first.
CREATE INDEX IF NOT EXISTS idx_playbooks_created_at ON playbooks (created_at DESC);
|
||||
@@ -239,6 +239,13 @@ class ApprovalExecutionService:
|
||||
result=result,
|
||||
)
|
||||
|
||||
# 2026-04-04 ogt: 執行結果沉澱到 KM
|
||||
# 統帥鐵律: 所有異常與自動修復紀錄必須回寫 KM
|
||||
import asyncio
|
||||
asyncio.create_task(
|
||||
self._write_execution_result_to_km(approval, success, error_message)
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
# 學習失敗不影響主流程
|
||||
logger.warning(
|
||||
@@ -247,6 +254,57 @@ class ApprovalExecutionService:
|
||||
error=str(e),
|
||||
)
|
||||
|
||||
async def _write_execution_result_to_km(
    self,
    approval: "ApprovalRequest",
    success: bool,
    error_message: str | None,
) -> None:
    """
    Record the outcome of an executed approval as a knowledge-base (KM) entry.

    Both successful and failed executions are written, under the
    ``execution_result`` category. Any ``Exception`` raised while building
    or saving the entry is caught and logged as a warning instead of
    propagating.

    Args:
        approval: The executed approval; its ``id``, ``action``,
            ``incident_id``, ``risk_level`` and ``description`` are read.
        success: Whether the execution succeeded.
        error_message: Failure detail; only used when ``success`` is false.
    """
    try:
        # Imported at call time rather than at module level
        # (presumably to avoid a circular import -- TODO confirm).
        from src.models.knowledge import EntrySource, EntryType, KnowledgeEntryCreate
        from src.services.knowledge_service import get_knowledge_service

        # Pick the icon, result text and result tag for this outcome.
        if success:
            icon = "✅"
            outcome = "成功"
            result_tag = "auto_repair"
        else:
            icon = "❌"
            outcome = f"失敗: {error_message or '未知原因'}"
            result_tag = "execution_failed"

        # Markdown body: heading, metadata lines, then the action description.
        sections = [
            f"# {icon} 執行記錄: {approval.action[:80]}\n\n",
            f"**Approval ID**: {approval.id}\n",
            f"**Incident ID**: {approval.incident_id or '未關聯'}\n",
            f"**執行結果**: {outcome}\n",
            f"**風險等級**: {approval.risk_level.value if approval.risk_level else '未知'}\n\n",
            f"## 操作內容\n{approval.description or '無描述'}\n",
        ]
        markdown = "".join(sections)

        new_entry = KnowledgeEntryCreate(
            title=f"[執行記錄] {icon} {approval.action[:60]}",
            content=markdown,
            entry_type=EntryType.INCIDENT_CASE,
            category="execution_result",
            tags=["execution", result_tag],
            source=EntrySource.AI_EXTRACTED,
            related_incident_id=approval.incident_id,
            created_by="approval_execution",
        )

        knowledge_service = get_knowledge_service()
        await knowledge_service.create_entry(new_entry)

        logger.debug(
            "execution_result_written_to_km",
            approval_id=str(approval.id),
            success=success,
        )
    except Exception as exc:
        # Best-effort write: a KM failure is logged and otherwise ignored.
        logger.warning(
            "execution_result_km_write_failed",
            approval_id=str(approval.id),
            error=str(exc),
        )
|
||||
|
||||
async def _send_execution_notification(
|
||||
self,
|
||||
approval: ApprovalRequest,
|
||||
|
||||
@@ -176,6 +176,10 @@ class PlaybookService:
|
||||
import asyncio
|
||||
asyncio.create_task(self._index_playbook_async(playbook))
|
||||
|
||||
# 9. 2026-04-04 ogt: 沉澱到 KM (Knowledge Base)
|
||||
# 統帥鐵律: 所有異常與自動修復紀錄必須回寫 KM
|
||||
asyncio.create_task(self._write_to_km(playbook, incident))
|
||||
|
||||
logger.info(
|
||||
"playbook_extracted",
|
||||
playbook_id=playbook.playbook_id,
|
||||
@@ -186,6 +190,62 @@ class PlaybookService:
|
||||
|
||||
return playbook
|
||||
|
||||
async def _write_to_km(self, playbook: Playbook, incident: Incident) -> None:
    """
    Store a freshly extracted playbook as a knowledge-base (KM) entry.

    The entry body summarises the playbook's source incidents, AI
    confidence, status, symptom pattern, repair steps and description.
    Any ``Exception`` raised while building or saving the entry is caught
    and logged as a warning instead of propagating.

    Args:
        playbook: The playbook to summarise.
        incident: The incident linked to the entry via its ``incident_id``.
    """
    try:
        # Imported at call time rather than at module level
        # (presumably to avoid a circular import -- TODO confirm).
        from src.models.knowledge import EntrySource, EntryType, KnowledgeEntryCreate
        from src.services.knowledge_service import get_knowledge_service

        # Numbered list of repair steps, with a placeholder when there are none.
        step_lines = []
        for number, step in enumerate(playbook.repair_steps, start=1):
            step_lines.append(f"{number}. [{step.action_type}] {step.command}")
        steps_text = "\n".join(step_lines)
        if not steps_text:
            steps_text = "(無明確修復步驟)"

        pattern = playbook.symptom_pattern
        alert_names = ", ".join(pattern.alert_names)
        if not alert_names:
            alert_names = "未知"
        services = ", ".join(pattern.affected_services)
        if not services:
            services = "未知"

        # Markdown body: header fields, symptom pattern, steps, description.
        sections = [
            f"# Playbook: {playbook.name}\n\n",
            f"**來源 Incident**: {', '.join(playbook.source_incident_ids)}\n",
            f"**AI 信心度**: {playbook.ai_confidence:.0%}\n",
            f"**狀態**: {playbook.status.value}\n\n",
            "## 症狀模式\n",
            f"- 告警: {alert_names}\n",
            f"- 受影響服務: {services}\n\n",
            f"## 修復步驟\n{steps_text}\n\n",
            f"## 描述\n{playbook.description}",
        ]
        markdown = "".join(sections)

        new_entry = KnowledgeEntryCreate(
            title=f"[Playbook] {playbook.name}",
            content=markdown,
            entry_type=EntryType.INCIDENT_CASE,
            category="auto_repair",
            tags=list(playbook.tags) + ["playbook", "auto_extracted", playbook.status.value],
            source=EntrySource.AI_EXTRACTED,
            related_incident_id=incident.incident_id,
            created_by="playbook_service",
        )

        knowledge_service = get_knowledge_service()
        await knowledge_service.create_entry(new_entry)

        logger.info(
            "playbook_written_to_km",
            playbook_id=playbook.playbook_id,
            incident_id=incident.incident_id,
        )
    except Exception as exc:
        # Best-effort write: a KM failure is logged and otherwise ignored.
        logger.warning(
            "playbook_km_write_failed",
            playbook_id=playbook.playbook_id,
            error=str(exc),
        )
|
||||
|
||||
async def _index_playbook_async(self, playbook: Playbook) -> None:
|
||||
"""非同步建立 Playbook 向量索引 (ADR-030 Phase 3)"""
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user