From 658337ec18b9181bf7e46647123f886b94dbb004 Mon Sep 17 00:00:00 2001 From: OG T Date: Mon, 6 Apr 2026 11:46:05 +0800 Subject: [PATCH] =?UTF-8?q?fix(phase26):=20=E6=89=93=E9=80=9A=20Incident?= =?UTF-8?q?=E2=86=92DB=E2=86=92KM=20=E5=AE=8C=E6=95=B4=E9=8F=88=E8=B7=AF?= =?UTF-8?q?=20+=20namespace=20=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 問題根因: 1. create_incident_for_approval 只存 Redis,不存 PostgreSQL → TTL 7天後消失,Playbook 萃取永遠找不到 Incident 2. ApprovalRecord 無 incident_id 欄位 → _trigger_playbook_extraction 靠 regex 掃中文文字找 INC-,永遠失敗 3. operation_parser namespace fallback 是 "default" → 所有 deployment 在 awoooi-prod,203 次執行全失敗 修復: - Incident 同時寫入 Redis + PostgreSQL (save_to_episodic_memory) - ApprovalRecord 加入 incident_id 欄位 (model + ORM + migration) - alertmanager_webhook 建立 Approval 後回寫 incident_id - _trigger_playbook_extraction 直接用 approval.incident_id - operation_parser DEFAULT_NAMESPACE = "awoooi-prod" Co-Authored-By: Claude Sonnet 4.6 --- .../phase26_incident_km_integration.sql | 30 ++++++++++++++++ apps/api/src/api/v1/webhooks.py | 26 +++++++++++++- apps/api/src/db/models.py | 9 +++++ apps/api/src/models/approval.py | 2 ++ apps/api/src/services/approval_db.py | 12 +++++++ apps/api/src/services/approval_execution.py | 12 ++++--- apps/api/src/services/operation_parser.py | 34 +++++++++++-------- 7 files changed, 104 insertions(+), 21 deletions(-) create mode 100644 apps/api/migrations/phase26_incident_km_integration.sql diff --git a/apps/api/migrations/phase26_incident_km_integration.sql b/apps/api/migrations/phase26_incident_km_integration.sql new file mode 100644 index 00000000..49d3c155 --- /dev/null +++ b/apps/api/migrations/phase26_incident_km_integration.sql @@ -0,0 +1,30 @@ +-- ============================================================================= +-- Phase 26: Incident → KM 完整鏈路補全 +-- 2026-04-06 ogt: 修復三重死鎖 — 告警必須寫入 DB 並建立 KM +-- ============================================================================= + +-- 1. approval_records 加入 incident_id 欄位 +ALTER TABLE approval_records + ADD COLUMN IF NOT EXISTS incident_id TEXT; + +CREATE INDEX IF NOT EXISTS idx_approval_records_incident_id + ON approval_records (incident_id) + WHERE incident_id IS NOT NULL; + +-- 2. incidents 表確保有 source 欄位 (alertmanager / manual 等) +ALTER TABLE incidents + ADD COLUMN IF NOT EXISTS source TEXT DEFAULT 'alertmanager'; + +-- 3. knowledge_entries 確保有 related_approval_id 欄位 +ALTER TABLE knowledge_entries + ADD COLUMN IF NOT EXISTS related_approval_id TEXT; + +CREATE INDEX IF NOT EXISTS idx_knowledge_entries_related_approval + ON knowledge_entries (related_approval_id) + WHERE related_approval_id IS NOT NULL; + +-- 完成確認 +DO $$ +BEGIN + RAISE NOTICE 'Phase 26 migration completed: incident_id + source + related_approval_id'; +END $$; diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py index bf7b8a18..b98795a7 100644 --- a/apps/api/src/api/v1/webhooks.py +++ b/apps/api/src/api/v1/webhooks.py @@ -132,9 +132,20 @@ async def create_incident_for_approval( proposal_ids=[UUID(approval_id)], ) - # Phase 17 P0: 透過 Service 存入 Working Memory + # Phase 17 P0: 透過 Service 存入 Working Memory (Redis) await incident_service.save_to_working_memory(incident) + # 2026-04-06 ogt: Phase 26 — 同時寫入 Episodic Memory (PostgreSQL) + # 原本只存 Redis,TTL 7天後消失,Playbook 萃取和 KM 永遠找不到 incident + try: + await incident_service.save_to_episodic_memory(incident) + except Exception as _pg_err: + logger.warning( + "incident_episodic_memory_failed", + incident_id=incident.incident_id, + error=str(_pg_err), + ) + logger.info( "incident_created_for_approval", incident_id=incident.incident_id, @@ -1207,6 +1218,19 @@ async def alertmanager_webhook( source="alertmanager", ) + # 2026-04-06 ogt: Phase 26 — 回寫 incident_id 到 Approval + # 這樣 Playbook 萃取和 KM 寫入才能找到對應的 Incident + try: + await service.update_incident_id(approval.id, incident_id) + approval.incident_id = incident_id + except Exception as _meta_err: + logger.warning( + "approval_incident_id_update_failed", + approval_id=str(approval.id), + incident_id=incident_id, + error=str(_meta_err), + ) + root_cause = analysis_result.description or message estimated_downtime = blast.estimated_downtime if blast else "~30s" primary_responsibility = analysis_result.primary_responsibility or "COLLAB" diff --git a/apps/api/src/db/models.py b/apps/api/src/db/models.py index 7c2ff5b3..22a0ea9f 100644 --- a/apps/api/src/db/models.py +++ b/apps/api/src/db/models.py @@ -124,6 +124,15 @@ class ApprovalRecord(Base): comment="Last time this alert pattern was seen", ) + # 2026-04-06 ogt: Phase 26 — 關聯 Incident ID + # Playbook 萃取和 KM 寫入必須知道 incident_id,不能靠文字解析 + incident_id: Mapped[str | None] = mapped_column( + String(64), + nullable=True, + index=True, + comment="Associated Incident ID (INC-YYYYMMDD-XXXXXX)", + ) + # Timestamps created_at: Mapped[datetime] = mapped_column( DateTime(timezone=True), diff --git a/apps/api/src/models/approval.py b/apps/api/src/models/approval.py index ea63e87c..8a37abc5 100644 --- a/apps/api/src/models/approval.py +++ b/apps/api/src/models/approval.py @@ -161,6 +161,8 @@ class ApprovalRequest(ApprovalRequestBase): fingerprint: str | None = Field(default=None, description="告警指紋 Hash") hit_count: int = Field(default=1, description="聚合觸發次數") last_seen_at: datetime = Field(default_factory=lambda: datetime.now(UTC), description="最後觸發時間") + # 2026-04-06 ogt: 關聯 Incident — 萃取 Playbook 與 KM 寫入必須知道 incident_id + incident_id: str | None = Field(default=None, description="關聯的 Incident ID") @property def current_signatures(self) -> int: diff --git a/apps/api/src/services/approval_db.py b/apps/api/src/services/approval_db.py index dc418e0c..a3065650 100644 --- a/apps/api/src/services/approval_db.py +++ b/apps/api/src/services/approval_db.py @@ -598,6 +598,18 @@ class ApprovalDBService: success=success, ) + async def update_incident_id(self, approval_id: UUID, incident_id: str) -> None: + """ + 2026-04-06 ogt: Phase 26 — 回寫 incident_id 到 approval_records + 讓 Playbook 萃取和 KM 寫入能找到對應的 Incident + """ + async with get_db_context() as db: + await db.execute( + update(ApprovalRecord) + .where(ApprovalRecord.id == str(approval_id)) + .values(incident_id=incident_id) + ) + # ========================================================================= # Phase 6.4h: Proposals API 支援方法 # ========================================================================= diff --git a/apps/api/src/services/approval_execution.py b/apps/api/src/services/approval_execution.py index 87c81c06..cd8c8d26 100644 --- a/apps/api/src/services/approval_execution.py +++ b/apps/api/src/services/approval_execution.py @@ -389,15 +389,17 @@ class ApprovalExecutionService: 此函數為 fire-and-forget,失敗不影響主流程 """ try: - # 1. 從 approval 取得關聯的 incident_id - # approval.requested_by 可能包含 incident 資訊,或從 metadata 取得 - # 暫時從 description 或 action 解析 - incident_id = self._extract_incident_id_from_approval(approval) + # 1. 從 approval.incident_id 直接取得 (Phase 26 修復) + # 原本靠 regex 掃文字找 INC- 前綴,中文 action 完全找不到 + incident_id = getattr(approval, "incident_id", None) + if not incident_id: + # Fallback: 嘗試文字解析 (向後兼容舊資料) + incident_id = self._extract_incident_id_from_approval(approval) if not incident_id: logger.info( "playbook_extraction_skipped", approval_id=str(approval.id), - reason="No incident_id found", + reason="No incident_id found in approval.incident_id or text", ) return diff --git a/apps/api/src/services/operation_parser.py b/apps/api/src/services/operation_parser.py index a87cac6c..c455fa87 100644 --- a/apps/api/src/services/operation_parser.py +++ b/apps/api/src/services/operation_parser.py @@ -19,6 +19,10 @@ from dataclasses import dataclass from src.services.executor import OperationType +# 2026-04-06 ogt: Phase 26 — 預設 namespace 改為 awoooi-prod +# 原本 "default" 導致 203 次執行全失敗(deployment 全在 awoooi-prod) +DEFAULT_NAMESPACE = "awoooi-prod" + @dataclass class ParsedOperation: @@ -28,7 +32,7 @@ class ParsedOperation: Attributes: operation_type: K8s 操作類型 (RESTART_DEPLOYMENT, DELETE_POD, etc.) resource_name: 目標資源名稱 - namespace: K8s Namespace (預設 "default") + namespace: K8s Namespace (預設 "awoooi-prod") Note: 支援 tuple 解包以向後兼容: @@ -81,7 +85,7 @@ def parse_operation_from_action(action: str) -> ParsedOperation: if kubectl_restart_match: deploy_name = kubectl_restart_match.group(1) ns_match = re.search(r"-n\s+(\S+)", action_lower) - namespace = ns_match.group(1) if ns_match else "default" + namespace = ns_match.group(1) if ns_match else DEFAULT_NAMESPACE return ParsedOperation(OperationType.RESTART_DEPLOYMENT, deploy_name, namespace) # Pattern: kubectl delete pod @@ -91,14 +95,14 @@ def parse_operation_from_action(action: str) -> ParsedOperation: if delete_pod_match: pod_name = delete_pod_match.group(1) ns_match = re.search(r"-n\s+(\S+)", action_lower) - namespace = ns_match.group(1) if ns_match else "default" + namespace = ns_match.group(1) if ns_match else DEFAULT_NAMESPACE return ParsedOperation(OperationType.DELETE_POD, pod_name, namespace) # Pattern: 刪除 Pod (Chinese delete) chinese_delete_match = re.search(r"刪除\s*[Pp]od\s+([a-z0-9][\w.-]*)", action) if chinese_delete_match: pod_name = chinese_delete_match.group(1) - return ParsedOperation(OperationType.DELETE_POD, pod_name, "default") + return ParsedOperation(OperationType.DELETE_POD, pod_name, DEFAULT_NAMESPACE) # Pattern: restart deployment (English - with explicit "deployment") restart_deploy_match = re.search( @@ -107,7 +111,7 @@ def parse_operation_from_action(action: str) -> ParsedOperation: if restart_deploy_match: deploy_name = restart_deploy_match.group(1) ns_match = re.search(r"-n\s+(\S+)", action_lower) - namespace = ns_match.group(1) if ns_match else "default" + namespace = ns_match.group(1) if ns_match else DEFAULT_NAMESPACE return ParsedOperation(OperationType.RESTART_DEPLOYMENT, deploy_name, namespace) # Pattern: restart (English - without "deployment" keyword) @@ -117,7 +121,7 @@ def parse_operation_from_action(action: str) -> ParsedOperation: # Skip if captured word is "deployment" (handled above) if deploy_name != "deployment": ns_match = re.search(r"-n\s+(\S+)", action_lower) - namespace = ns_match.group(1) if ns_match else "default" + namespace = ns_match.group(1) if ns_match else DEFAULT_NAMESPACE return ParsedOperation( OperationType.RESTART_DEPLOYMENT, deploy_name, namespace ) @@ -128,7 +132,7 @@ def parse_operation_from_action(action: str) -> ParsedOperation: ) if chinese_restart_deploy_match: deploy_name = chinese_restart_deploy_match.group(1) - return ParsedOperation(OperationType.RESTART_DEPLOYMENT, deploy_name, "default") + return ParsedOperation(OperationType.RESTART_DEPLOYMENT, deploy_name, DEFAULT_NAMESPACE) # Pattern: 重新啟動 服務 (Chinese) chinese_restart_match = re.search(r"重新啟動\s+([a-z0-9][\w.-]*)\s*服務", action) @@ -136,9 +140,9 @@ def parse_operation_from_action(action: str) -> ParsedOperation: resource_name = chinese_restart_match.group(1) # StatefulSet Pod 格式: name-N (如 postgres-primary-0) if re.match(r".*-\d+$", resource_name): - return ParsedOperation(OperationType.DELETE_POD, resource_name, "default") + return ParsedOperation(OperationType.DELETE_POD, resource_name, DEFAULT_NAMESPACE) return ParsedOperation( - OperationType.RESTART_DEPLOYMENT, resource_name, "default" + OperationType.RESTART_DEPLOYMENT, resource_name, DEFAULT_NAMESPACE ) # Pattern: scale deployment @@ -148,14 +152,14 @@ def parse_operation_from_action(action: str) -> ParsedOperation: if scale_match: deploy_name = scale_match.group(1) ns_match = re.search(r"-n\s+(\S+)", action_lower) - namespace = ns_match.group(1) if ns_match else "default" + namespace = ns_match.group(1) if ns_match else DEFAULT_NAMESPACE return ParsedOperation(OperationType.SCALE_DEPLOYMENT, deploy_name, namespace) # Pattern: 擴容 (Chinese scale) chinese_scale_match = re.search(r"擴容\s+([a-z0-9][\w.-]*)", action) if chinese_scale_match: deploy_name = chinese_scale_match.group(1) - return ParsedOperation(OperationType.SCALE_DEPLOYMENT, deploy_name, "default") + return ParsedOperation(OperationType.SCALE_DEPLOYMENT, deploy_name, DEFAULT_NAMESPACE) # Pattern: 擴展 副本數 (Chinese scale variant) chinese_scale2_match = re.search(r"擴展\s+([a-z0-9][\w.-]*)\s*副本", action) @@ -163,7 +167,7 @@ def parse_operation_from_action(action: str) -> ParsedOperation: deploy_name = chinese_scale2_match.group(1) # 移除常見的後綴如 -deployment deploy_name = re.sub(r"-deployment$", "", deploy_name) - return ParsedOperation(OperationType.SCALE_DEPLOYMENT, deploy_name, "default") + return ParsedOperation(OperationType.SCALE_DEPLOYMENT, deploy_name, DEFAULT_NAMESPACE) # Pattern: 重新啟動 (Chinese restart without 服務) chinese_restart2_match = re.search(r"重新啟動\s+([a-z0-9][\w.-]*)", action) @@ -171,11 +175,11 @@ def parse_operation_from_action(action: str) -> ParsedOperation: resource_name = chinese_restart2_match.group(1) # StatefulSet Pod 格式: name-N (如 postgres-primary-0) if re.match(r".*-\d+$", resource_name): - return ParsedOperation(OperationType.DELETE_POD, resource_name, "default") + return ParsedOperation(OperationType.DELETE_POD, resource_name, DEFAULT_NAMESPACE) # 移除常見的後綴 deploy_name = re.sub(r"-deployment$", "", resource_name) return ParsedOperation( - OperationType.RESTART_DEPLOYMENT, deploy_name, "default" + OperationType.RESTART_DEPLOYMENT, deploy_name, DEFAULT_NAMESPACE ) - return ParsedOperation(None, None, "default") + return ParsedOperation(None, None, DEFAULT_NAMESPACE)