fix(phase26): 打通 Incident→DB→KM 完整鏈路 + namespace 修正
問題根因:
1. create_incident_for_approval 只存 Redis,不存 PostgreSQL → TTL 7天後消失,Playbook 萃取永遠找不到 Incident
2. ApprovalRecord 無 incident_id 欄位 → _trigger_playbook_extraction 靠 regex 掃中文文字找 INC-,永遠失敗
3. operation_parser namespace fallback 是 "default" → 所有 deployment 在 awoooi-prod,203 次執行全失敗

修復:
- Incident 同時寫入 Redis + PostgreSQL (save_to_episodic_memory)
- ApprovalRecord 加入 incident_id 欄位 (model + ORM + migration)
- alertmanager_webhook 建立 Approval 後回寫 incident_id
- _trigger_playbook_extraction 直接用 approval.incident_id
- operation_parser DEFAULT_NAMESPACE = "awoooi-prod"

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
30
apps/api/migrations/phase26_incident_km_integration.sql
Normal file
30
apps/api/migrations/phase26_incident_km_integration.sql
Normal file
@@ -0,0 +1,30 @@
|
||||
-- =============================================================================
-- Phase 26: complete the Incident -> KM linkage
-- 2026-04-06 ogt: fix the triple deadlock -- alerts must be written to the DB
--                 and produce a KM entry
-- =============================================================================

-- 1. Add the incident_id column to approval_records.
--    VARCHAR(64) mirrors the ORM mapping (ApprovalRecord.incident_id is
--    declared as String(64)) so the schema and the model do not drift apart.
ALTER TABLE approval_records
    ADD COLUMN IF NOT EXISTS incident_id VARCHAR(64);

-- Partial index: only rows that actually reference an incident are indexed.
CREATE INDEX IF NOT EXISTS idx_approval_records_incident_id
    ON approval_records (incident_id)
    WHERE incident_id IS NOT NULL;

-- 2. Make sure incidents has a source column (alertmanager / manual / ...).
--    NOTE(review): rows that already exist receive the 'alertmanager' default;
--    confirm that is the intended backfill for manually created incidents.
ALTER TABLE incidents
    ADD COLUMN IF NOT EXISTS source TEXT DEFAULT 'alertmanager';

-- 3. Make sure knowledge_entries has a related_approval_id column.
ALTER TABLE knowledge_entries
    ADD COLUMN IF NOT EXISTS related_approval_id TEXT;

CREATE INDEX IF NOT EXISTS idx_knowledge_entries_related_approval
    ON knowledge_entries (related_approval_id)
    WHERE related_approval_id IS NOT NULL;

-- Completion notice (every statement above is idempotent via IF NOT EXISTS,
-- so the script can be re-run safely).
DO $$
BEGIN
    RAISE NOTICE 'Phase 26 migration completed: incident_id + source + related_approval_id';
END $$;
|
||||
@@ -132,9 +132,20 @@ async def create_incident_for_approval(
|
||||
proposal_ids=[UUID(approval_id)],
|
||||
)
|
||||
|
||||
# Phase 17 P0: 透過 Service 存入 Working Memory
|
||||
# Phase 17 P0: 透過 Service 存入 Working Memory (Redis)
|
||||
await incident_service.save_to_working_memory(incident)
|
||||
|
||||
# 2026-04-06 ogt: Phase 26 — 同時寫入 Episodic Memory (PostgreSQL)
|
||||
# 原本只存 Redis,TTL 7天後消失,Playbook 萃取和 KM 永遠找不到 incident
|
||||
try:
|
||||
await incident_service.save_to_episodic_memory(incident)
|
||||
except Exception as _pg_err:
|
||||
logger.warning(
|
||||
"incident_episodic_memory_failed",
|
||||
incident_id=incident.incident_id,
|
||||
error=str(_pg_err),
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"incident_created_for_approval",
|
||||
incident_id=incident.incident_id,
|
||||
@@ -1207,6 +1218,19 @@ async def alertmanager_webhook(
|
||||
source="alertmanager",
|
||||
)
|
||||
|
||||
# 2026-04-06 ogt: Phase 26 — 回寫 incident_id 到 Approval
|
||||
# 這樣 Playbook 萃取和 KM 寫入才能找到對應的 Incident
|
||||
try:
|
||||
await service.update_incident_id(approval.id, incident_id)
|
||||
approval.incident_id = incident_id
|
||||
except Exception as _meta_err:
|
||||
logger.warning(
|
||||
"approval_incident_id_update_failed",
|
||||
approval_id=str(approval.id),
|
||||
incident_id=incident_id,
|
||||
error=str(_meta_err),
|
||||
)
|
||||
|
||||
root_cause = analysis_result.description or message
|
||||
estimated_downtime = blast.estimated_downtime if blast else "~30s"
|
||||
primary_responsibility = analysis_result.primary_responsibility or "COLLAB"
|
||||
|
||||
@@ -124,6 +124,15 @@ class ApprovalRecord(Base):
|
||||
comment="Last time this alert pattern was seen",
|
||||
)
|
||||
|
||||
# 2026-04-06 ogt: Phase 26 — 關聯 Incident ID
|
||||
# Playbook 萃取和 KM 寫入必須知道 incident_id,不能靠文字解析
|
||||
incident_id: Mapped[str | None] = mapped_column(
|
||||
String(64),
|
||||
nullable=True,
|
||||
index=True,
|
||||
comment="Associated Incident ID (INC-YYYYMMDD-XXXXXX)",
|
||||
)
|
||||
|
||||
# Timestamps
|
||||
created_at: Mapped[datetime] = mapped_column(
|
||||
DateTime(timezone=True),
|
||||
|
||||
@@ -161,6 +161,8 @@ class ApprovalRequest(ApprovalRequestBase):
|
||||
fingerprint: str | None = Field(default=None, description="告警指紋 Hash")
|
||||
hit_count: int = Field(default=1, description="聚合觸發次數")
|
||||
last_seen_at: datetime = Field(default_factory=lambda: datetime.now(UTC), description="最後觸發時間")
|
||||
# 2026-04-06 ogt: 關聯 Incident — 萃取 Playbook 與 KM 寫入必須知道 incident_id
|
||||
incident_id: str | None = Field(default=None, description="關聯的 Incident ID")
|
||||
|
||||
@property
|
||||
def current_signatures(self) -> int:
|
||||
|
||||
@@ -598,6 +598,18 @@ class ApprovalDBService:
|
||||
success=success,
|
||||
)
|
||||
|
||||
async def update_incident_id(self, approval_id: UUID, incident_id: str) -> None:
    """Write the linked incident ID back onto an approval record.

    Added in Phase 26 (2026-04-06 ogt) so that Playbook extraction and KM
    writes can locate the Incident that belongs to an approval.

    Args:
        approval_id: ID of the approval; compared against
            ``ApprovalRecord.id`` in its string form.
        incident_id: Incident identifier to store in the ``incident_id``
            column of ``approval_records``.
    """
    # Build the UPDATE first so the DB context only wraps the execution.
    stmt = (
        update(ApprovalRecord)
        .where(ApprovalRecord.id == str(approval_id))
        .values(incident_id=incident_id)
    )
    # NOTE(review): there is no explicit commit here -- presumably
    # get_db_context() commits on exit; confirm against its definition.
    async with get_db_context() as session:
        await session.execute(stmt)
|
||||
|
||||
# =========================================================================
|
||||
# Phase 6.4h: Proposals API 支援方法
|
||||
# =========================================================================
|
||||
|
||||
@@ -389,15 +389,17 @@ class ApprovalExecutionService:
|
||||
此函數為 fire-and-forget,失敗不影響主流程
|
||||
"""
|
||||
try:
|
||||
# 1. 從 approval 取得關聯的 incident_id
|
||||
# approval.requested_by 可能包含 incident 資訊,或從 metadata 取得
|
||||
# 暫時從 description 或 action 解析
|
||||
incident_id = self._extract_incident_id_from_approval(approval)
|
||||
# 1. 從 approval.incident_id 直接取得 (Phase 26 修復)
|
||||
# 原本靠 regex 掃文字找 INC- 前綴,中文 action 完全找不到
|
||||
incident_id = getattr(approval, "incident_id", None)
|
||||
if not incident_id:
|
||||
# Fallback: 嘗試文字解析 (向後兼容舊資料)
|
||||
incident_id = self._extract_incident_id_from_approval(approval)
|
||||
if not incident_id:
|
||||
logger.info(
|
||||
"playbook_extraction_skipped",
|
||||
approval_id=str(approval.id),
|
||||
reason="No incident_id found",
|
||||
reason="No incident_id found in approval.incident_id or text",
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@@ -19,6 +19,10 @@ from dataclasses import dataclass
|
||||
|
||||
from src.services.executor import OperationType
|
||||
|
||||
# 2026-04-06 ogt: Phase 26 — 預設 namespace 改為 awoooi-prod
|
||||
# 原本 "default" 導致 203 次執行全失敗(deployment 全在 awoooi-prod)
|
||||
# Namespace the action parser falls back to whenever it does not extract one
# from a "-n <namespace>" flag in the action text.
DEFAULT_NAMESPACE: str = "awoooi-prod"
|
||||
|
||||
|
||||
@dataclass
|
||||
class ParsedOperation:
|
||||
@@ -28,7 +32,7 @@ class ParsedOperation:
|
||||
Attributes:
|
||||
operation_type: K8s 操作類型 (RESTART_DEPLOYMENT, DELETE_POD, etc.)
|
||||
resource_name: 目標資源名稱
|
||||
namespace: K8s Namespace (預設 "default")
|
||||
namespace: K8s Namespace (預設 "awoooi-prod")
|
||||
|
||||
Note:
|
||||
支援 tuple 解包以向後兼容:
|
||||
@@ -81,7 +85,7 @@ def parse_operation_from_action(action: str) -> ParsedOperation:
|
||||
if kubectl_restart_match:
|
||||
deploy_name = kubectl_restart_match.group(1)
|
||||
ns_match = re.search(r"-n\s+(\S+)", action_lower)
|
||||
namespace = ns_match.group(1) if ns_match else "default"
|
||||
namespace = ns_match.group(1) if ns_match else DEFAULT_NAMESPACE
|
||||
return ParsedOperation(OperationType.RESTART_DEPLOYMENT, deploy_name, namespace)
|
||||
|
||||
# Pattern: kubectl delete pod <name>
|
||||
@@ -91,14 +95,14 @@ def parse_operation_from_action(action: str) -> ParsedOperation:
|
||||
if delete_pod_match:
|
||||
pod_name = delete_pod_match.group(1)
|
||||
ns_match = re.search(r"-n\s+(\S+)", action_lower)
|
||||
namespace = ns_match.group(1) if ns_match else "default"
|
||||
namespace = ns_match.group(1) if ns_match else DEFAULT_NAMESPACE
|
||||
return ParsedOperation(OperationType.DELETE_POD, pod_name, namespace)
|
||||
|
||||
# Pattern: 刪除 Pod <name> (Chinese delete)
|
||||
chinese_delete_match = re.search(r"刪除\s*[Pp]od\s+([a-z0-9][\w.-]*)", action)
|
||||
if chinese_delete_match:
|
||||
pod_name = chinese_delete_match.group(1)
|
||||
return ParsedOperation(OperationType.DELETE_POD, pod_name, "default")
|
||||
return ParsedOperation(OperationType.DELETE_POD, pod_name, DEFAULT_NAMESPACE)
|
||||
|
||||
# Pattern: restart deployment <name> (English - with explicit "deployment")
|
||||
restart_deploy_match = re.search(
|
||||
@@ -107,7 +111,7 @@ def parse_operation_from_action(action: str) -> ParsedOperation:
|
||||
if restart_deploy_match:
|
||||
deploy_name = restart_deploy_match.group(1)
|
||||
ns_match = re.search(r"-n\s+(\S+)", action_lower)
|
||||
namespace = ns_match.group(1) if ns_match else "default"
|
||||
namespace = ns_match.group(1) if ns_match else DEFAULT_NAMESPACE
|
||||
return ParsedOperation(OperationType.RESTART_DEPLOYMENT, deploy_name, namespace)
|
||||
|
||||
# Pattern: restart <name> (English - without "deployment" keyword)
|
||||
@@ -117,7 +121,7 @@ def parse_operation_from_action(action: str) -> ParsedOperation:
|
||||
# Skip if captured word is "deployment" (handled above)
|
||||
if deploy_name != "deployment":
|
||||
ns_match = re.search(r"-n\s+(\S+)", action_lower)
|
||||
namespace = ns_match.group(1) if ns_match else "default"
|
||||
namespace = ns_match.group(1) if ns_match else DEFAULT_NAMESPACE
|
||||
return ParsedOperation(
|
||||
OperationType.RESTART_DEPLOYMENT, deploy_name, namespace
|
||||
)
|
||||
@@ -128,7 +132,7 @@ def parse_operation_from_action(action: str) -> ParsedOperation:
|
||||
)
|
||||
if chinese_restart_deploy_match:
|
||||
deploy_name = chinese_restart_deploy_match.group(1)
|
||||
return ParsedOperation(OperationType.RESTART_DEPLOYMENT, deploy_name, "default")
|
||||
return ParsedOperation(OperationType.RESTART_DEPLOYMENT, deploy_name, DEFAULT_NAMESPACE)
|
||||
|
||||
# Pattern: 重新啟動 <name> 服務 (Chinese)
|
||||
chinese_restart_match = re.search(r"重新啟動\s+([a-z0-9][\w.-]*)\s*服務", action)
|
||||
@@ -136,9 +140,9 @@ def parse_operation_from_action(action: str) -> ParsedOperation:
|
||||
resource_name = chinese_restart_match.group(1)
|
||||
# StatefulSet Pod 格式: name-N (如 postgres-primary-0)
|
||||
if re.match(r".*-\d+$", resource_name):
|
||||
return ParsedOperation(OperationType.DELETE_POD, resource_name, "default")
|
||||
return ParsedOperation(OperationType.DELETE_POD, resource_name, DEFAULT_NAMESPACE)
|
||||
return ParsedOperation(
|
||||
OperationType.RESTART_DEPLOYMENT, resource_name, "default"
|
||||
OperationType.RESTART_DEPLOYMENT, resource_name, DEFAULT_NAMESPACE
|
||||
)
|
||||
|
||||
# Pattern: scale deployment <name>
|
||||
@@ -148,14 +152,14 @@ def parse_operation_from_action(action: str) -> ParsedOperation:
|
||||
if scale_match:
|
||||
deploy_name = scale_match.group(1)
|
||||
ns_match = re.search(r"-n\s+(\S+)", action_lower)
|
||||
namespace = ns_match.group(1) if ns_match else "default"
|
||||
namespace = ns_match.group(1) if ns_match else DEFAULT_NAMESPACE
|
||||
return ParsedOperation(OperationType.SCALE_DEPLOYMENT, deploy_name, namespace)
|
||||
|
||||
# Pattern: 擴容 <name> (Chinese scale)
|
||||
chinese_scale_match = re.search(r"擴容\s+([a-z0-9][\w.-]*)", action)
|
||||
if chinese_scale_match:
|
||||
deploy_name = chinese_scale_match.group(1)
|
||||
return ParsedOperation(OperationType.SCALE_DEPLOYMENT, deploy_name, "default")
|
||||
return ParsedOperation(OperationType.SCALE_DEPLOYMENT, deploy_name, DEFAULT_NAMESPACE)
|
||||
|
||||
# Pattern: 擴展 <name> 副本數 (Chinese scale variant)
|
||||
chinese_scale2_match = re.search(r"擴展\s+([a-z0-9][\w.-]*)\s*副本", action)
|
||||
@@ -163,7 +167,7 @@ def parse_operation_from_action(action: str) -> ParsedOperation:
|
||||
deploy_name = chinese_scale2_match.group(1)
|
||||
# 移除常見的後綴如 -deployment
|
||||
deploy_name = re.sub(r"-deployment$", "", deploy_name)
|
||||
return ParsedOperation(OperationType.SCALE_DEPLOYMENT, deploy_name, "default")
|
||||
return ParsedOperation(OperationType.SCALE_DEPLOYMENT, deploy_name, DEFAULT_NAMESPACE)
|
||||
|
||||
# Pattern: 重新啟動 <name> (Chinese restart without 服務)
|
||||
chinese_restart2_match = re.search(r"重新啟動\s+([a-z0-9][\w.-]*)", action)
|
||||
@@ -171,11 +175,11 @@ def parse_operation_from_action(action: str) -> ParsedOperation:
|
||||
resource_name = chinese_restart2_match.group(1)
|
||||
# StatefulSet Pod 格式: name-N (如 postgres-primary-0)
|
||||
if re.match(r".*-\d+$", resource_name):
|
||||
return ParsedOperation(OperationType.DELETE_POD, resource_name, "default")
|
||||
return ParsedOperation(OperationType.DELETE_POD, resource_name, DEFAULT_NAMESPACE)
|
||||
# 移除常見的後綴
|
||||
deploy_name = re.sub(r"-deployment$", "", resource_name)
|
||||
return ParsedOperation(
|
||||
OperationType.RESTART_DEPLOYMENT, deploy_name, "default"
|
||||
OperationType.RESTART_DEPLOYMENT, deploy_name, DEFAULT_NAMESPACE
|
||||
)
|
||||
|
||||
return ParsedOperation(None, None, "default")
|
||||
return ParsedOperation(None, None, DEFAULT_NAMESPACE)
|
||||
|
||||
Reference in New Issue
Block a user