fix(phase26): 打通 Incident→DB→KM 完整鏈路 + namespace 修正
Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 1m29s
Type Sync Check / check-type-sync (push) Failing after 52s

問題根因:
1. create_incident_for_approval 只存 Redis,不存 PostgreSQL
   → TTL 7天後消失,Playbook 萃取永遠找不到 Incident
2. ApprovalRecord 無 incident_id 欄位
   → _trigger_playbook_extraction 靠 regex 掃中文文字找 INC-,永遠失敗
3. operation_parser namespace fallback 是 "default"
   → 所有 deployment 在 awoooi-prod,203 次執行全失敗

修復:
- Incident 同時寫入 Redis + PostgreSQL (save_to_episodic_memory)
- ApprovalRecord 加入 incident_id 欄位 (model + ORM + migration)
- alertmanager_webhook 建立 Approval 後回寫 incident_id
- _trigger_playbook_extraction 直接用 approval.incident_id
- operation_parser DEFAULT_NAMESPACE = "awoooi-prod"

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-06 11:46:05 +08:00
parent 286a96d1aa
commit 658337ec18
7 changed files with 104 additions and 21 deletions

View File

@@ -0,0 +1,30 @@
-- =============================================================================
-- Phase 26: complete the Incident → KM (knowledge management) chain
-- 2026-04-06 ogt: fix the triple deadlock — alerts must be written to the DB
--                 and must produce a KM entry
--
-- Every DDL statement below uses IF NOT EXISTS, so re-running this script
-- does not fail on columns or indexes that are already present.
-- =============================================================================
-- 1. Add the incident_id column to approval_records.
-- NOTE(review): the ORM model declares this column as String(64) with
-- index=True, while this script uses TEXT and its own partial index name —
-- confirm the two definitions are meant to differ.
ALTER TABLE approval_records
ADD COLUMN IF NOT EXISTS incident_id TEXT;
-- Partial index: only rows that actually carry an incident_id are indexed.
CREATE INDEX IF NOT EXISTS idx_approval_records_incident_id
ON approval_records (incident_id)
WHERE incident_id IS NOT NULL;
-- 2. Make sure incidents has a source column (alertmanager / manual / ...).
-- NOTE(review): adding the column with a DEFAULT also gives pre-existing rows
-- the value 'alertmanager' — confirm that is intended for incidents that were
-- not created by Alertmanager.
ALTER TABLE incidents
ADD COLUMN IF NOT EXISTS source TEXT DEFAULT 'alertmanager';
-- 3. Make sure knowledge_entries has a related_approval_id column.
ALTER TABLE knowledge_entries
ADD COLUMN IF NOT EXISTS related_approval_id TEXT;
-- Partial index: only rows linked to an approval are indexed.
CREATE INDEX IF NOT EXISTS idx_knowledge_entries_related_approval
ON knowledge_entries (related_approval_id)
WHERE related_approval_id IS NOT NULL;
-- Completion notice: emits a NOTICE so the migration log shows the script
-- ran to the end.
DO $$
BEGIN
RAISE NOTICE 'Phase 26 migration completed: incident_id + source + related_approval_id';
END $$;

View File

@@ -132,9 +132,20 @@ async def create_incident_for_approval(
proposal_ids=[UUID(approval_id)],
)
# Phase 17 P0: 透過 Service 存入 Working Memory
# Phase 17 P0: 透過 Service 存入 Working Memory (Redis)
await incident_service.save_to_working_memory(incident)
# 2026-04-06 ogt: Phase 26 — 同時寫入 Episodic Memory (PostgreSQL)
# 原本只存 Redis,TTL 7 天後消失,Playbook 萃取和 KM 永遠找不到 incident
try:
await incident_service.save_to_episodic_memory(incident)
except Exception as _pg_err:
logger.warning(
"incident_episodic_memory_failed",
incident_id=incident.incident_id,
error=str(_pg_err),
)
logger.info(
"incident_created_for_approval",
incident_id=incident.incident_id,
@@ -1207,6 +1218,19 @@ async def alertmanager_webhook(
source="alertmanager",
)
# 2026-04-06 ogt: Phase 26 — 回寫 incident_id 到 Approval
# 這樣 Playbook 萃取和 KM 寫入才能找到對應的 Incident
try:
await service.update_incident_id(approval.id, incident_id)
approval.incident_id = incident_id
except Exception as _meta_err:
logger.warning(
"approval_incident_id_update_failed",
approval_id=str(approval.id),
incident_id=incident_id,
error=str(_meta_err),
)
root_cause = analysis_result.description or message
estimated_downtime = blast.estimated_downtime if blast else "~30s"
primary_responsibility = analysis_result.primary_responsibility or "COLLAB"

View File

@@ -124,6 +124,15 @@ class ApprovalRecord(Base):
comment="Last time this alert pattern was seen",
)
# 2026-04-06 ogt: Phase 26 — 關聯 Incident ID
# Playbook 萃取和 KM 寫入必須知道 incident_id,不能靠文字解析
incident_id: Mapped[str | None] = mapped_column(
String(64),
nullable=True,
index=True,
comment="Associated Incident ID (INC-YYYYMMDD-XXXXXX)",
)
# Timestamps
created_at: Mapped[datetime] = mapped_column(
DateTime(timezone=True),

View File

@@ -161,6 +161,8 @@ class ApprovalRequest(ApprovalRequestBase):
fingerprint: str | None = Field(default=None, description="告警指紋 Hash")
hit_count: int = Field(default=1, description="聚合觸發次數")
last_seen_at: datetime = Field(default_factory=lambda: datetime.now(UTC), description="最後觸發時間")
# 2026-04-06 ogt: 關聯 Incident — 萃取 Playbook 與 KM 寫入必須知道 incident_id
incident_id: str | None = Field(default=None, description="關聯的 Incident ID")
@property
def current_signatures(self) -> int:

View File

@@ -598,6 +598,18 @@ class ApprovalDBService:
success=success,
)
async def update_incident_id(self, approval_id: UUID, incident_id: str) -> None:
    """Write the associated incident ID back onto an approval record.

    Added in Phase 26 (2026-04-06, ogt) so that Playbook extraction and KM
    writes can find the Incident that belongs to an approval.

    Args:
        approval_id: ID of the approval to update; it is converted to ``str``
            before being compared with ``ApprovalRecord.id``.
        incident_id: Incident ID to store in the record's ``incident_id``
            column.
    """
    record_key = str(approval_id)
    stmt = (
        update(ApprovalRecord)
        .where(ApprovalRecord.id == record_key)
        .values(incident_id=incident_id)
    )
    # NOTE(review): there is no explicit commit here — presumably
    # get_db_context() commits when the context exits; confirm.
    async with get_db_context() as session:
        await session.execute(stmt)
# =========================================================================
# Phase 6.4h: Proposals API 支援方法
# =========================================================================

View File

@@ -389,15 +389,17 @@ class ApprovalExecutionService:
此函數為 fire-and-forget,失敗不影響主流程
"""
try:
# 1. 從 approval 取得關聯的 incident_id
# approval.requested_by 可能包含 incident 資訊,或從 metadata 取得
# 暫時從 description 或 action 解析
incident_id = self._extract_incident_id_from_approval(approval)
# 1. 從 approval.incident_id 直接取得 (Phase 26 修復)
# 原本靠 regex 掃文字找 INC- 前綴,中文 action 完全找不到
incident_id = getattr(approval, "incident_id", None)
if not incident_id:
# Fallback: 嘗試文字解析 (向後兼容舊資料)
incident_id = self._extract_incident_id_from_approval(approval)
if not incident_id:
logger.info(
"playbook_extraction_skipped",
approval_id=str(approval.id),
reason="No incident_id found",
reason="No incident_id found in approval.incident_id or text",
)
return

View File

@@ -19,6 +19,10 @@ from dataclasses import dataclass
from src.services.executor import OperationType
# 2026-04-06 ogt: Phase 26 — 預設 namespace 改為 awoooi-prod
# 原本 "default" 導致 203 次執行全失敗,deployment 全在 awoooi-prod
DEFAULT_NAMESPACE = "awoooi-prod"
@dataclass
class ParsedOperation:
@@ -28,7 +32,7 @@ class ParsedOperation:
Attributes:
operation_type: K8s 操作類型 (RESTART_DEPLOYMENT, DELETE_POD, etc.)
resource_name: 目標資源名稱
namespace: K8s Namespace (預設 "default")
namespace: K8s Namespace (預設 "awoooi-prod")
Note:
支援 tuple 解包以向後兼容:
@@ -81,7 +85,7 @@ def parse_operation_from_action(action: str) -> ParsedOperation:
if kubectl_restart_match:
deploy_name = kubectl_restart_match.group(1)
ns_match = re.search(r"-n\s+(\S+)", action_lower)
namespace = ns_match.group(1) if ns_match else "default"
namespace = ns_match.group(1) if ns_match else DEFAULT_NAMESPACE
return ParsedOperation(OperationType.RESTART_DEPLOYMENT, deploy_name, namespace)
# Pattern: kubectl delete pod <name>
@@ -91,14 +95,14 @@ def parse_operation_from_action(action: str) -> ParsedOperation:
if delete_pod_match:
pod_name = delete_pod_match.group(1)
ns_match = re.search(r"-n\s+(\S+)", action_lower)
namespace = ns_match.group(1) if ns_match else "default"
namespace = ns_match.group(1) if ns_match else DEFAULT_NAMESPACE
return ParsedOperation(OperationType.DELETE_POD, pod_name, namespace)
# Pattern: 刪除 Pod <name> (Chinese delete)
chinese_delete_match = re.search(r"刪除\s*[Pp]od\s+([a-z0-9][\w.-]*)", action)
if chinese_delete_match:
pod_name = chinese_delete_match.group(1)
return ParsedOperation(OperationType.DELETE_POD, pod_name, "default")
return ParsedOperation(OperationType.DELETE_POD, pod_name, DEFAULT_NAMESPACE)
# Pattern: restart deployment <name> (English - with explicit "deployment")
restart_deploy_match = re.search(
@@ -107,7 +111,7 @@ def parse_operation_from_action(action: str) -> ParsedOperation:
if restart_deploy_match:
deploy_name = restart_deploy_match.group(1)
ns_match = re.search(r"-n\s+(\S+)", action_lower)
namespace = ns_match.group(1) if ns_match else "default"
namespace = ns_match.group(1) if ns_match else DEFAULT_NAMESPACE
return ParsedOperation(OperationType.RESTART_DEPLOYMENT, deploy_name, namespace)
# Pattern: restart <name> (English - without "deployment" keyword)
@@ -117,7 +121,7 @@ def parse_operation_from_action(action: str) -> ParsedOperation:
# Skip if captured word is "deployment" (handled above)
if deploy_name != "deployment":
ns_match = re.search(r"-n\s+(\S+)", action_lower)
namespace = ns_match.group(1) if ns_match else "default"
namespace = ns_match.group(1) if ns_match else DEFAULT_NAMESPACE
return ParsedOperation(
OperationType.RESTART_DEPLOYMENT, deploy_name, namespace
)
@@ -128,7 +132,7 @@ def parse_operation_from_action(action: str) -> ParsedOperation:
)
if chinese_restart_deploy_match:
deploy_name = chinese_restart_deploy_match.group(1)
return ParsedOperation(OperationType.RESTART_DEPLOYMENT, deploy_name, "default")
return ParsedOperation(OperationType.RESTART_DEPLOYMENT, deploy_name, DEFAULT_NAMESPACE)
# Pattern: 重新啟動 <name> 服務 (Chinese)
chinese_restart_match = re.search(r"重新啟動\s+([a-z0-9][\w.-]*)\s*服務", action)
@@ -136,9 +140,9 @@ def parse_operation_from_action(action: str) -> ParsedOperation:
resource_name = chinese_restart_match.group(1)
# StatefulSet Pod 格式: name-N (如 postgres-primary-0)
if re.match(r".*-\d+$", resource_name):
return ParsedOperation(OperationType.DELETE_POD, resource_name, "default")
return ParsedOperation(OperationType.DELETE_POD, resource_name, DEFAULT_NAMESPACE)
return ParsedOperation(
OperationType.RESTART_DEPLOYMENT, resource_name, "default"
OperationType.RESTART_DEPLOYMENT, resource_name, DEFAULT_NAMESPACE
)
# Pattern: scale deployment <name>
@@ -148,14 +152,14 @@ def parse_operation_from_action(action: str) -> ParsedOperation:
if scale_match:
deploy_name = scale_match.group(1)
ns_match = re.search(r"-n\s+(\S+)", action_lower)
namespace = ns_match.group(1) if ns_match else "default"
namespace = ns_match.group(1) if ns_match else DEFAULT_NAMESPACE
return ParsedOperation(OperationType.SCALE_DEPLOYMENT, deploy_name, namespace)
# Pattern: 擴容 <name> (Chinese scale)
chinese_scale_match = re.search(r"擴容\s+([a-z0-9][\w.-]*)", action)
if chinese_scale_match:
deploy_name = chinese_scale_match.group(1)
return ParsedOperation(OperationType.SCALE_DEPLOYMENT, deploy_name, "default")
return ParsedOperation(OperationType.SCALE_DEPLOYMENT, deploy_name, DEFAULT_NAMESPACE)
# Pattern: 擴展 <name> 副本數 (Chinese scale variant)
chinese_scale2_match = re.search(r"擴展\s+([a-z0-9][\w.-]*)\s*副本", action)
@@ -163,7 +167,7 @@ def parse_operation_from_action(action: str) -> ParsedOperation:
deploy_name = chinese_scale2_match.group(1)
# 移除常見的後綴如 -deployment
deploy_name = re.sub(r"-deployment$", "", deploy_name)
return ParsedOperation(OperationType.SCALE_DEPLOYMENT, deploy_name, "default")
return ParsedOperation(OperationType.SCALE_DEPLOYMENT, deploy_name, DEFAULT_NAMESPACE)
# Pattern: 重新啟動 <name> (Chinese restart without 服務)
chinese_restart2_match = re.search(r"重新啟動\s+([a-z0-9][\w.-]*)", action)
@@ -171,11 +175,11 @@ def parse_operation_from_action(action: str) -> ParsedOperation:
resource_name = chinese_restart2_match.group(1)
# StatefulSet Pod 格式: name-N (如 postgres-primary-0)
if re.match(r".*-\d+$", resource_name):
return ParsedOperation(OperationType.DELETE_POD, resource_name, "default")
return ParsedOperation(OperationType.DELETE_POD, resource_name, DEFAULT_NAMESPACE)
# 移除常見的後綴
deploy_name = re.sub(r"-deployment$", "", resource_name)
return ParsedOperation(
OperationType.RESTART_DEPLOYMENT, deploy_name, "default"
OperationType.RESTART_DEPLOYMENT, deploy_name, DEFAULT_NAMESPACE
)
return ParsedOperation(None, None, "default")
return ParsedOperation(None, None, DEFAULT_NAMESPACE)