fix(auto-repair): 首席架構師 Review — 4 Critical/Important 修復

Critical #1: KM write task 移出 try/except
- _trigger_learning 的 KM 寫入原在 try 內，learning 失敗時不寫 KM
- 移至 except 後確保成功/失敗都寫入
- 移除冗餘 import asyncio（已在頂層 import）
- Minor: approval.incident_id or None 防空字串

Important #2: migration 加 PRIMARY KEY
- playbook_id 從 UNIQUE 升為 PRIMARY KEY
- prod DB 已執行 ALTER TABLE ADD PRIMARY KEY

Important #3: s.sequence→s.step_number, s.description→s.command
- embed_playbook() 使用不存在的欄位名，RAG 向量索引靜默失敗
- RepairStep 正確欄位: step_number, command

Important #1: PlaybookService._get_rag_service 不再於 Service 層快取
- 改為每次呼叫工廠 get_playbook_rag_service()
- 避免舊實例繞過工廠的 is_closed 重建邏輯

冷啟動修復 (首席架構師建議B+C):
- _trigger_playbook_extraction 執行成功後自動設定 execution_success=True, effectiveness_score=4, status=RESOLVED
- skip 路徑 logger.debug → logger.info 提升可觀測性

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-04 12:02:03 +08:00
parent 902443f376
commit df3ef9006c
4 changed files with 48 additions and 42 deletions
--- a/apps/api/migrations/phase7_playbooks_table.sql
+++ b/apps/api/migrations/phase7_playbooks_table.sql
@@ -6,7 +6,8 @@

 CREATE TABLE IF NOT EXISTS playbooks (
    -- 識別
-    playbook_id         VARCHAR(32)  UNIQUE NOT NULL,
+    -- 2026-04-04 ogt: 首席架構師 Review — 加 PRIMARY KEY，移除多餘 UNIQUE
+    playbook_id         VARCHAR(32)  PRIMARY KEY,

    -- 元資料
    name                VARCHAR(256) NOT NULL,
--- a/apps/api/src/services/approval_execution.py
+++ b/apps/api/src/services/approval_execution.py
@@ -239,13 +239,6 @@ class ApprovalExecutionService:
                result=result,
            )

-            # 2026-04-04 ogt: 執行結果沉澱到 KM
-            # 統帥鐵律: 所有異常與自動修復紀錄必須回寫 KM
-            import asyncio
-            asyncio.create_task(
-                self._write_execution_result_to_km(approval, success, error_message)
-            )
-
        except Exception as e:
            # 學習失敗不影響主流程
            logger.warning(
@@ -254,6 +247,12 @@ class ApprovalExecutionService:
                error=str(e),
            )

+        # 2026-04-04 ogt: 執行結果沉澱到 KM — 移出 try/except 確保 learning 失敗也寫入
+        # 統帥鐵律: 所有異常與自動修復紀錄必須回寫 KM
+        asyncio.create_task(
+            self._write_execution_result_to_km(approval, success, error_message)
+        )
+
    async def _write_execution_result_to_km(
        self,
        approval: "ApprovalRequest",
@@ -288,7 +287,7 @@ class ApprovalExecutionService:
                category="execution_result",
                tags=["execution", "auto_repair" if success else "execution_failed"],
                source=EntrySource.AI_EXTRACTED,
-                related_incident_id=approval.incident_id,
+                related_incident_id=approval.incident_id or None,
                created_by="approval_execution",
            )
            await get_knowledge_service().create_entry(entry_data)
@@ -395,8 +394,8 @@ class ApprovalExecutionService:
            # 暫時從 description 或 action 解析
            incident_id = self._extract_incident_id_from_approval(approval)
            if not incident_id:
-                logger.debug(
-                    "playbook_extraction_skip",
+                logger.info(
+                    "playbook_extraction_skipped",
                    approval_id=str(approval.id),
                    reason="No incident_id found",
                )
@@ -409,43 +408,46 @@ class ApprovalExecutionService:
            incident = await incident_service.get_incident(incident_id)

            if not incident:
-                logger.debug(
-                    "playbook_extraction_skip",
+                logger.info(
+                    "playbook_extraction_skipped",
                    approval_id=str(approval.id),
                    incident_id=incident_id,
                    reason="Incident not found",
                )
                return

-            # 3. 檢查 Incident 狀態
-            from src.models.incident import IncidentStatus
+            # 3. 執行成功後自動設定 outcome (冷啟動關鍵)
+            # 2026-04-04 ogt: 首席架構師 Review — 補上 execution_success + effectiveness_score
+            # 確保 Playbook 萃取前置條件能成立，不再依賴人工填分
+            from src.models.incident import IncidentOutcome, IncidentStatus
+            from src.utils.timezone import now_taipei

+            if incident.outcome is None:
+                incident.outcome = IncidentOutcome()
+            if not incident.outcome.execution_success:
+                incident.outcome.execution_success = True
+            if incident.outcome.effectiveness_score is None or incident.outcome.effectiveness_score < 4:
+                incident.outcome.effectiveness_score = 4  # 系統判斷：K8s 執行成功 = 有效
            if incident.status not in [IncidentStatus.RESOLVED, IncidentStatus.CLOSED]:
-                logger.debug(
-                    "playbook_extraction_skip",
-                    approval_id=str(approval.id),
-                    incident_id=incident_id,
-                    incident_status=incident.status.value,
-                    reason="Incident not resolved",
-                )
-                return
+                incident.status = IncidentStatus.RESOLVED
+                incident.resolved_at = now_taipei()

-            # 4. 檢查 effectiveness_score
-            effectiveness = incident.outcome.effectiveness_score if incident.outcome else 0
-            if effectiveness < 4:
-                logger.debug(
-                    "playbook_extraction_skip",
-                    approval_id=str(approval.id),
-                    incident_id=incident_id,
-                    effectiveness=effectiveness,
-                    reason="Low effectiveness score",
-                )
-                return
+            # 回存 Incident（fire-and-forget 路徑，失敗不影響主流程）
+            await incident_service.save_to_working_memory(incident)

-            # 5. 觸發萃取
+            logger.info(
+                "playbook_extraction_incident_updated",
+                approval_id=str(approval.id),
+                incident_id=incident_id,
+                effectiveness_score=incident.outcome.effectiveness_score,
+                status=incident.status.value,
+            )
+
+            # 4. 觸發萃取（effectiveness 已保證 >= 4）
            from src.services.playbook_service import get_playbook_service

            playbook_service = get_playbook_service()
+            effectiveness = incident.outcome.effectiveness_score or 4
            playbook = await playbook_service.extract_from_incident(
                incident=incident,
                auto_approve=effectiveness >= 5,  # 滿分自動核准
--- a/apps/api/src/services/playbook_rag.py
+++ b/apps/api/src/services/playbook_rag.py
@@ -239,9 +239,10 @@ class PlaybookRAGService:
            text_parts.append(f"描述: {playbook.description}")

        # 修復步驟
+        # 2026-04-04 ogt: 修正欄位名稱 s.sequence→s.step_number, s.description→s.command
        if playbook.repair_steps:
            steps_text = "; ".join(
-                f"{s.sequence}. {s.description}"
+                f"{s.step_number}. {s.command}"
                for s in playbook.repair_steps[:5]  # 最多 5 步
            )
            text_parts.append(f"步驟: {steps_text}")
--- a/apps/api/src/services/playbook_service.py
+++ b/apps/api/src/services/playbook_service.py
@@ -87,14 +87,16 @@ class PlaybookService:

    def __init__(self, repository: IPlaybookRepository | None = None):
        self._repository = repository or get_playbook_repository()
-        # 2026-03-27 ogt: RAG Service 改為 lazy initialization (async factory)
-        self._rag_service = None

    async def _get_rag_service(self):
-        """Lazy initialization for RAG service (2026-03-27 async factory)"""
-        if self._rag_service is None:
-            self._rag_service = await get_playbook_rag_service()
-        return self._rag_service
+        """
+        取得 RAG Service — 每次走工廠，不在 Service 層快取
+
+        2026-04-04 ogt: 首席架構師 Review — 移除 Service 層快取
+        原因: PlaybookService 快取舊實例會繞過工廠的 is_closed 重建邏輯
+        由 get_playbook_rag_service() 工廠統一管理生命週期
+        """
+        return await get_playbook_rag_service()

    # === Core Operations ===