From df3ef9006cdd21389e5f3ebfd8ec2c5d5e1dbc89 Mon Sep 17 00:00:00 2001 From: OG T Date: Sat, 4 Apr 2026 12:02:03 +0800 Subject: [PATCH] =?UTF-8?q?fix(auto-repair):=20=E9=A6=96=E5=B8=AD=E6=9E=B6?= =?UTF-8?q?=E6=A7=8B=E5=B8=AB=20Review=20=E2=80=94=204=20Critical/Importan?= =?UTF-8?q?t=20=E4=BF=AE=E5=BE=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Critical #1: KM write task 移出 try/except - _trigger_learning 的 KM 寫入原在 try 內,learning 失敗時不寫 KM - 移至 except 後確保成功/失敗都寫入 - 移除冗餘 import asyncio(已在頂層 import) - Minor: approval.incident_id or None 防空字串 Important #2: migration 加 PRIMARY KEY - playbook_id 從 UNIQUE 升為 PRIMARY KEY - prod DB 已執行 ALTER TABLE ADD PRIMARY KEY Important #3: s.sequence→s.step_number, s.description→s.command - embed_playbook() 使用不存在的欄位名,RAG 向量索引靜默失敗 - RepairStep 正確欄位: step_number, command Important #1: PlaybookService._get_rag_service 不再 Service 層快取 - 改為每次呼叫工廠 get_playbook_rag_service() - 避免舊實例繞過工廠的 is_closed 重建邏輯 冷啟動修復 (首席架構師建議B+C): - _trigger_playbook_extraction 執行成功後自動設定 execution_success=True, effectiveness_score=4, status=RESOLVED - skip 路徑 logger.debug → logger.info 提升可觀測性 Co-Authored-By: Claude Sonnet 4.6 --- .../api/migrations/phase7_playbooks_table.sql | 3 +- apps/api/src/services/approval_execution.py | 70 ++++++++++--------- apps/api/src/services/playbook_rag.py | 3 +- apps/api/src/services/playbook_service.py | 14 ++-- 4 files changed, 48 insertions(+), 42 deletions(-) diff --git a/apps/api/migrations/phase7_playbooks_table.sql b/apps/api/migrations/phase7_playbooks_table.sql index 61bc1c85..0376567a 100644 --- a/apps/api/migrations/phase7_playbooks_table.sql +++ b/apps/api/migrations/phase7_playbooks_table.sql @@ -6,7 +6,8 @@ CREATE TABLE IF NOT EXISTS playbooks ( -- 識別 - playbook_id VARCHAR(32) UNIQUE NOT NULL, + -- 2026-04-04 ogt: 首席架構師 Review — 加 PRIMARY KEY,移除多餘 UNIQUE + playbook_id VARCHAR(32) PRIMARY KEY, -- 元資料 name VARCHAR(256) NOT NULL, diff --git a/apps/api/src/services/approval_execution.py b/apps/api/src/services/approval_execution.py index f584a4f9..87c81c06 100644 --- a/apps/api/src/services/approval_execution.py +++ b/apps/api/src/services/approval_execution.py @@ -239,13 +239,6 @@ class ApprovalExecutionService: result=result, ) - # 2026-04-04 ogt: 執行結果沉澱到 KM - # 統帥鐵律: 所有異常與自動修復紀錄必須回寫 KM - import asyncio - asyncio.create_task( - self._write_execution_result_to_km(approval, success, error_message) - ) - except Exception as e: # 學習失敗不影響主流程 logger.warning( @@ -254,6 +247,12 @@ class ApprovalExecutionService: error=str(e), ) + # 2026-04-04 ogt: 執行結果沉澱到 KM — 移出 try/except 確保 learning 失敗也寫入 + # 統帥鐵律: 所有異常與自動修復紀錄必須回寫 KM + asyncio.create_task( + self._write_execution_result_to_km(approval, success, error_message) + ) + async def _write_execution_result_to_km( self, approval: "ApprovalRequest", @@ -288,7 +287,7 @@ class ApprovalExecutionService: category="execution_result", tags=["execution", "auto_repair" if success else "execution_failed"], source=EntrySource.AI_EXTRACTED, - related_incident_id=approval.incident_id, + related_incident_id=approval.incident_id or None, created_by="approval_execution", ) await get_knowledge_service().create_entry(entry_data) @@ -395,8 +394,8 @@ class ApprovalExecutionService: # 暫時從 description 或 action 解析 incident_id = self._extract_incident_id_from_approval(approval) if not incident_id: - logger.debug( - "playbook_extraction_skip", + logger.info( + "playbook_extraction_skipped", approval_id=str(approval.id), reason="No incident_id found", ) @@ -409,43 +408,46 @@ class ApprovalExecutionService: incident = await incident_service.get_incident(incident_id) if not incident: - logger.debug( - "playbook_extraction_skip", + logger.info( + "playbook_extraction_skipped", approval_id=str(approval.id), incident_id=incident_id, reason="Incident not found", ) return - # 3. 檢查 Incident 狀態 - from src.models.incident import IncidentStatus + # 3. 執行成功後自動設定 outcome (冷啟動關鍵) + # 2026-04-04 ogt: 首席架構師 Review — 補上 execution_success + effectiveness_score + # 確保 Playbook 萃取前置條件能成立,不再依賴人工填分 + from src.models.incident import IncidentOutcome, IncidentStatus + from src.utils.timezone import now_taipei + if incident.outcome is None: + incident.outcome = IncidentOutcome() + if not incident.outcome.execution_success: + incident.outcome.execution_success = True + if incident.outcome.effectiveness_score is None or incident.outcome.effectiveness_score < 4: + incident.outcome.effectiveness_score = 4 # 系統判斷:K8s 執行成功 = 有效 if incident.status not in [IncidentStatus.RESOLVED, IncidentStatus.CLOSED]: - logger.debug( - "playbook_extraction_skip", - approval_id=str(approval.id), - incident_id=incident_id, - incident_status=incident.status.value, - reason="Incident not resolved", - ) - return + incident.status = IncidentStatus.RESOLVED + incident.resolved_at = now_taipei() - # 4. 檢查 effectiveness_score - effectiveness = incident.outcome.effectiveness_score if incident.outcome else 0 - if effectiveness < 4: - logger.debug( - "playbook_extraction_skip", - approval_id=str(approval.id), - incident_id=incident_id, - effectiveness=effectiveness, - reason="Low effectiveness score", - ) - return + # 回存 Incident(fire-and-forget 路徑,失敗不影響主流程) + await incident_service.save_to_working_memory(incident) - # 5. 觸發萃取 + logger.info( + "playbook_extraction_incident_updated", + approval_id=str(approval.id), + incident_id=incident_id, + effectiveness_score=incident.outcome.effectiveness_score, + status=incident.status.value, + ) + + # 4. 觸發萃取(effectiveness 已保證 >= 4) from src.services.playbook_service import get_playbook_service playbook_service = get_playbook_service() + effectiveness = incident.outcome.effectiveness_score or 4 playbook = await playbook_service.extract_from_incident( incident=incident, auto_approve=effectiveness >= 5, # 滿分自動核准 diff --git a/apps/api/src/services/playbook_rag.py b/apps/api/src/services/playbook_rag.py index 86b9059e..41a0c9f9 100644 --- a/apps/api/src/services/playbook_rag.py +++ b/apps/api/src/services/playbook_rag.py @@ -239,9 +239,10 @@ class PlaybookRAGService: text_parts.append(f"描述: {playbook.description}") # 修復步驟 + # 2026-04-04 ogt: 修正欄位名稱 s.sequence→s.step_number, s.description→s.command if playbook.repair_steps: steps_text = "; ".join( - f"{s.sequence}. {s.description}" + f"{s.step_number}. {s.command}" for s in playbook.repair_steps[:5] # 最多 5 步 ) text_parts.append(f"步驟: {steps_text}") diff --git a/apps/api/src/services/playbook_service.py b/apps/api/src/services/playbook_service.py index 1b2ba267..d51a40a5 100644 --- a/apps/api/src/services/playbook_service.py +++ b/apps/api/src/services/playbook_service.py @@ -87,14 +87,16 @@ class PlaybookService: def __init__(self, repository: IPlaybookRepository | None = None): self._repository = repository or get_playbook_repository() - # 2026-03-27 ogt: RAG Service 改為 lazy initialization (async factory) - self._rag_service = None async def _get_rag_service(self): - """Lazy initialization for RAG service (2026-03-27 async factory)""" - if self._rag_service is None: - self._rag_service = await get_playbook_rag_service() - return self._rag_service + """ + 取得 RAG Service — 每次走工廠,不在 Service 層快取 + + 2026-04-04 ogt: 首席架構師 Review — 移除 Service 層快取 + 原因: PlaybookService 快取舊實例會繞過工廠的 is_closed 重建邏輯 + 由 get_playbook_rag_service() 工廠統一管理生命週期 + """ + return await get_playbook_rag_service() # === Core Operations ===