fix(auto-repair): 首席架構師 Review — 4 Critical/Important 修復
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 7m2s

Critical #1: KM write task 移出 try/except
- _trigger_learning 的 KM 寫入原在 try 內,learning 失敗時不寫 KM
- 移至 except 後確保成功/失敗都寫入
- 移除冗餘 import asyncio(已在頂層 import)
- Minor: approval.incident_id or None,避免空字串寫入 related_incident_id

Important #2: migration 加 PRIMARY KEY
- playbook_id 從 UNIQUE 升為 PRIMARY KEY
- prod DB 已執行 ALTER TABLE ADD PRIMARY KEY

Important #3: s.sequence→s.step_number, s.description→s.command
- embed_playbook() 使用不存在的欄位名,RAG 向量索引靜默失敗
- RepairStep 正確欄位: step_number, command

Important #1: PlaybookService._get_rag_service 不再於 Service 層快取 RAG 實例
- 改為每次呼叫工廠 get_playbook_rag_service()
- 避免舊實例繞過工廠的 is_closed 重建邏輯

冷啟動修復 (首席架構師建議B+C):
- _trigger_playbook_extraction 執行成功後自動設定
  execution_success=True, effectiveness_score=4, status=RESOLVED
- skip 路徑 logger.debug → logger.info 提升可觀測性

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-04 12:02:03 +08:00
parent 902443f376
commit df3ef9006c
4 changed files with 48 additions and 42 deletions

View File

@@ -6,7 +6,8 @@
CREATE TABLE IF NOT EXISTS playbooks (
-- 識別
playbook_id VARCHAR(32) UNIQUE NOT NULL,
-- 2026-04-04 ogt: 首席架構師 Review — 加 PRIMARY KEY,移除多餘 UNIQUE
playbook_id VARCHAR(32) PRIMARY KEY,
-- 元資料
name VARCHAR(256) NOT NULL,

View File

@@ -239,13 +239,6 @@ class ApprovalExecutionService:
result=result,
)
# 2026-04-04 ogt: 執行結果沉澱到 KM
# 統帥鐵律: 所有異常與自動修復紀錄必須回寫 KM
import asyncio
asyncio.create_task(
self._write_execution_result_to_km(approval, success, error_message)
)
except Exception as e:
# 學習失敗不影響主流程
logger.warning(
@@ -254,6 +247,12 @@ class ApprovalExecutionService:
error=str(e),
)
# 2026-04-04 ogt: 執行結果沉澱到 KM — 移出 try/except 確保 learning 失敗也寫入
# 統帥鐵律: 所有異常與自動修復紀錄必須回寫 KM
asyncio.create_task(
self._write_execution_result_to_km(approval, success, error_message)
)
async def _write_execution_result_to_km(
self,
approval: "ApprovalRequest",
@@ -288,7 +287,7 @@ class ApprovalExecutionService:
category="execution_result",
tags=["execution", "auto_repair" if success else "execution_failed"],
source=EntrySource.AI_EXTRACTED,
related_incident_id=approval.incident_id,
related_incident_id=approval.incident_id or None,
created_by="approval_execution",
)
await get_knowledge_service().create_entry(entry_data)
@@ -395,8 +394,8 @@ class ApprovalExecutionService:
# 暫時從 description 或 action 解析
incident_id = self._extract_incident_id_from_approval(approval)
if not incident_id:
logger.debug(
"playbook_extraction_skip",
logger.info(
"playbook_extraction_skipped",
approval_id=str(approval.id),
reason="No incident_id found",
)
@@ -409,43 +408,46 @@ class ApprovalExecutionService:
incident = await incident_service.get_incident(incident_id)
if not incident:
logger.debug(
"playbook_extraction_skip",
logger.info(
"playbook_extraction_skipped",
approval_id=str(approval.id),
incident_id=incident_id,
reason="Incident not found",
)
return
# 3. 檢查 Incident 狀態
from src.models.incident import IncidentStatus
# 3. 執行成功後自動設定 outcome (冷啟動關鍵)
# 2026-04-04 ogt: 首席架構師 Review — 補上 execution_success + effectiveness_score
# 確保 Playbook 萃取前置條件能成立,不再依賴人工填分
from src.models.incident import IncidentOutcome, IncidentStatus
from src.utils.timezone import now_taipei
if incident.outcome is None:
incident.outcome = IncidentOutcome()
if not incident.outcome.execution_success:
incident.outcome.execution_success = True
if incident.outcome.effectiveness_score is None or incident.outcome.effectiveness_score < 4:
incident.outcome.effectiveness_score = 4 # 系統判斷K8s 執行成功 = 有效
if incident.status not in [IncidentStatus.RESOLVED, IncidentStatus.CLOSED]:
logger.debug(
"playbook_extraction_skip",
approval_id=str(approval.id),
incident_id=incident_id,
incident_status=incident.status.value,
reason="Incident not resolved",
)
return
incident.status = IncidentStatus.RESOLVED
incident.resolved_at = now_taipei()
# 4. 檢查 effectiveness_score
effectiveness = incident.outcome.effectiveness_score if incident.outcome else 0
if effectiveness < 4:
logger.debug(
"playbook_extraction_skip",
approval_id=str(approval.id),
incident_id=incident_id,
effectiveness=effectiveness,
reason="Low effectiveness score",
)
return
# 回存 Incident(fire-and-forget 路徑,失敗不影響主流程)
await incident_service.save_to_working_memory(incident)
# 5. 觸發萃取
logger.info(
"playbook_extraction_incident_updated",
approval_id=str(approval.id),
incident_id=incident_id,
effectiveness_score=incident.outcome.effectiveness_score,
status=incident.status.value,
)
# 4. 觸發萃取(effectiveness 已保證 >= 4)
from src.services.playbook_service import get_playbook_service
playbook_service = get_playbook_service()
effectiveness = incident.outcome.effectiveness_score or 4
playbook = await playbook_service.extract_from_incident(
incident=incident,
auto_approve=effectiveness >= 5, # 滿分自動核准

View File

@@ -239,9 +239,10 @@ class PlaybookRAGService:
text_parts.append(f"描述: {playbook.description}")
# 修復步驟
# 2026-04-04 ogt: 修正欄位名稱 s.sequence→s.step_number, s.description→s.command
if playbook.repair_steps:
steps_text = "; ".join(
f"{s.sequence}. {s.description}"
f"{s.step_number}. {s.command}"
for s in playbook.repair_steps[:5] # 最多 5 步
)
text_parts.append(f"步驟: {steps_text}")

View File

@@ -87,14 +87,16 @@ class PlaybookService:
def __init__(self, repository: IPlaybookRepository | None = None):
self._repository = repository or get_playbook_repository()
# 2026-03-27 ogt: RAG Service 改為 lazy initialization (async factory)
self._rag_service = None
async def _get_rag_service(self):
"""Lazy initialization for RAG service (2026-03-27 async factory)"""
if self._rag_service is None:
self._rag_service = await get_playbook_rag_service()
return self._rag_service
"""
取得 RAG Service — 每次走工廠,不在 Service 層快取
2026-04-04 ogt: 首席架構師 Review — 移除 Service 層快取
原因: PlaybookService 快取舊實例會繞過工廠的 is_closed 重建邏輯
由 get_playbook_rag_service() 工廠統一管理生命週期
"""
return await get_playbook_rag_service()
# === Core Operations ===