-- ADR-071-A: four alert-notification types + full-lifecycle DB records
-- Created: 2026-04-11 (Asia/Taipei)
-- Author:  Claude Sonnet 4.6 — ADR-071, batch 1
--
-- Design notes:
--   * Columns are added to the existing incidents table; no new table is created.
--   * The new enum labels must be committed before any statement uses them.
--     PostgreSQL < 12 rejects ALTER TYPE ... ADD VALUE inside a transaction
--     block; PostgreSQL >= 12 accepts it, but the new label cannot be used
--     until that transaction has committed. Run this file in autocommit mode
--     (no surrounding BEGIN/COMMIT) so both cases are covered.
--
-- Execution order:
--   Step 1: add 5 labels to the alert_event_type enum
--   Step 2: add 8 columns to incidents
--   Step 3: verification queries

-- ============================================================================
-- Step 1: add 5 labels to the alert_event_type enum
--   ADD VALUE IF NOT EXISTS is idempotent, so re-running is safe.
-- ============================================================================

-- Notification was classified (TYPE-1/2/3/4/4D)
ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'NOTIFICATION_CLASSIFIED';

-- Manual fix steps were recorded
ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'MANUAL_FIX_RECORDED';

-- Incident was converted to a KM entry
ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'KM_CONVERTED';

-- Playbook draft was created
ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'PLAYBOOK_DRAFT_CREATED';

-- State-machine guard blocked a callback
ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'STATE_GUARD_BLOCKED';

-- ============================================================================
-- Step 2: add 8 columns to incidents
--   ADD COLUMN IF NOT EXISTS is idempotent, so re-running is safe.
--   All columns are nullable without defaults. They are added in a single
--   ALTER TABLE so the table lock is taken once instead of eight times.
-- ============================================================================

ALTER TABLE incidents
    -- Notification type (TYPE-1/2/3/4/4D)
    ADD COLUMN IF NOT EXISTS notification_type   VARCHAR(10),
    -- Alert category (selects the TYPE-3 button set)
    ADD COLUMN IF NOT EXISTS alert_category      VARCHAR(50),
    -- MCP context snapshot taken before execution
    ADD COLUMN IF NOT EXISTS context_bundle      JSONB,
    -- Metrics snapshot before execution (ADR-071-I)
    ADD COLUMN IF NOT EXISTS metrics_before      JSONB,
    -- Metrics snapshot after execution (ADR-071-I)
    ADD COLUMN IF NOT EXISTS metrics_after       JSONB,
    -- Execution verification result (ADR-071-J)
    ADD COLUMN IF NOT EXISTS verification_result JSONB,
    -- Manual fix steps entered by the user (TYPE-4)
    ADD COLUMN IF NOT EXISTS manual_fix_steps    TEXT,
    -- Who recorded the manual fix
    ADD COLUMN IF NOT EXISTS manual_fix_by       VARCHAR(100);

-- ============================================================================
-- Step 3: verification queries (run after the migration)
-- ============================================================================

-- Expect 8 rows. Restricted to schemas on the search_path so that a
-- same-named table in an unrelated schema cannot produce a false positive.
SELECT column_name, data_type
FROM information_schema.columns
WHERE table_schema::text = ANY (current_schemas(false)::text[])
  AND table_name = 'incidents'
  AND column_name IN (
      'notification_type', 'alert_category', 'context_bundle',
      'metrics_before', 'metrics_after', 'verification_result',
      'manual_fix_steps', 'manual_fix_by'
  )
ORDER BY column_name;

-- Expect 5 rows. The regtype cast resolves the enum through the search_path,
-- i.e. the same type the ALTER TYPE statements above modified.
SELECT enumlabel
FROM pg_enum
WHERE enumtypid = 'alert_event_type'::regtype
  AND enumlabel IN (
      'NOTIFICATION_CLASSIFIED', 'MANUAL_FIX_RECORDED',
      'KM_CONVERTED', 'PLAYBOOK_DRAFT_CREATED', 'STATE_GUARD_BLOCKED'
  )
ORDER BY enumlabel;
"""
KM Conversion Service — ADR-071-G
=================================
Turn an Incident into a KnowledgeEntry of type INCIDENT_CASE.

What this module does:
- Maps the incident's ``notification_type`` to the KM entry status and source.
- Builds a markdown body (symptoms, root cause, action, verification,
  manual fix steps) from the incident's fields.
- Appends a ``KM_CONVERTED`` record to the alert operation log (best effort).

Created: 2026-04-11 (Asia/Taipei)
Author: Claude Sonnet 4.6 — ADR-071-G

Layering: this service talks to KnowledgeService and the alert operation log
repository only; it does not open DB sessions itself.
"""

import structlog

from src.models.knowledge import (
    EntrySource,
    EntryStatus,
    EntryType,
    KnowledgeEntryCreate,
)
from src.repositories.alert_operation_log_repository import (
    ALERT_EVENT_TYPES,
    get_alert_operation_log_repository,
)
from src.services.knowledge_service import get_knowledge_service

logger = structlog.get_logger(__name__)

# Register the ADR-071 event types so the repository's validation accepts them.
# NOTE: this mutates the repository's set at import time (kept from the
# original; other code may rely on importing this module for that effect).
ALERT_EVENT_TYPES.update({
    "KM_CONVERTED",
    "NOTIFICATION_CLASSIFIED",
    "MANUAL_FIX_RECORDED",
    "PLAYBOOK_DRAFT_CREATED",
    "STATE_GUARD_BLOCKED",
})

# notification_type -> status of the created KM entry (None = do not convert).
_TYPE_TO_STATUS = {
    "TYPE-2": EntryStatus.APPROVED,  # auto-remediated successfully
    "TYPE-3": EntryStatus.REVIEW,    # executed after human approval
    "TYPE-4": EntryStatus.DRAFT,     # AI could not decide
    "TYPE-4D": EntryStatus.DRAFT,    # config drift
    "TYPE-1": None,                  # informational only, never converted
}

# notification_type -> source recorded on the KM entry.
_TYPE_TO_SOURCE = {
    "TYPE-2": EntrySource.AI_EXTRACTED,
    "TYPE-3": EntrySource.AI_EXTRACTED,
    "TYPE-4": EntrySource.HUMAN,
    "TYPE-4D": EntrySource.AI_EXTRACTED,
}


class KMConversionService:
    """Convert an Incident into a KM entry and log the conversion."""

    def __init__(self) -> None:
        self._knowledge_svc = get_knowledge_service()
        self._op_log_repo = get_alert_operation_log_repository()

    async def convert(self, incident) -> dict | None:
        """
        Create a KnowledgeEntry from ``incident``.

        Args:
            incident: incident object; reads ``incident_id``, ``signals``,
                ``affected_services``, ``title``, ``created_at``,
                ``resolved_at`` and, when present, the ADR-071 fields.

        Returns:
            ``{"km_entry_id": ..., "quality_level": ...}`` on success, or
            ``None`` when the notification type maps to no KM status
            (TYPE-1 or an unrecognized value).

        Raises:
            Whatever ``KnowledgeService.create_entry`` raises; only the
            operation-log write is best effort.
        """
        # Missing/empty type falls back to TYPE-3 (human-reviewed).
        notification_type = getattr(incident, "notification_type", None) or "TYPE-3"

        target_status = _TYPE_TO_STATUS.get(notification_type)
        if target_status is None:
            # Distinguish the deliberate TYPE-1 skip from an unmapped value so
            # the log does not claim "TYPE-1" for something else.
            logger.debug(
                "km_conversion_skipped",
                incident_id=incident.incident_id,
                notification_type=notification_type,
                reason=(
                    "TYPE-1 純資訊,不轉 KM"
                    if notification_type == "TYPE-1"
                    else "unrecognized notification_type"
                ),
            )
            return None

        entry_source = _TYPE_TO_SOURCE.get(notification_type, EntrySource.AI_EXTRACTED)
        alert_category = getattr(incident, "alert_category", None) or "unknown"

        # Labels come from the first signal; tolerate a signal whose labels
        # are None.
        labels = (incident.signals[0].labels if incident.signals else None) or {}
        alertname = labels.get("alertname", "unknown")
        severity = labels.get("severity", "unknown")

        services = list(incident.affected_services or ["unknown"])
        affected_services = ", ".join(services)

        # Time to resolve, in whole seconds; left empty when it cannot be
        # computed (e.g. naive vs. aware datetimes, or non-datetime values).
        resolution_time = ""
        if incident.resolved_at and incident.created_at:
            try:
                delta = incident.resolved_at - incident.created_at
            except TypeError as exc:
                logger.debug(
                    "km_resolution_time_unavailable",
                    incident_id=incident.incident_id,
                    error=str(exc),
                )
            else:
                resolution_time = f"{int(delta.total_seconds())}s"

        content = self._build_content(
            incident=incident,
            alertname=alertname,
            affected_services=affected_services,
            severity=severity,
            resolution_time=resolution_time,
        )

        incident_title = incident.title[:60] if incident.title else "未知"
        title = f"{alertname} @ {affected_services[:40]} — {incident_title}"

        # One tag per service (a comma-joined string is useless as a tag),
        # de-duplicated while keeping order.
        tags = [alertname, *services, severity, notification_type]
        if alert_category != "unknown":
            tags.append(alert_category)
        tags = list(dict.fromkeys(t for t in tags if t))

        km_entry = await self._knowledge_svc.create_entry(
            KnowledgeEntryCreate(
                title=title[:255],
                content=content,
                entry_type=EntryType.INCIDENT_CASE,
                category=alert_category,
                tags=tags,
                source=entry_source,
                status=target_status,
                related_incident_id=incident.incident_id,
            )
        )

        # Audit trail is best effort: the KM entry already exists, so a log
        # failure is reported but not raised.
        try:
            await self._op_log_repo.append(
                event_type="KM_CONVERTED",
                incident_id=incident.incident_id,
                actor="km_conversion_service",
                action_detail=f"KM entry created: {km_entry.entry_id}",
                success=True,
                context={
                    "km_entry_id": km_entry.entry_id,
                    "quality_level": target_status.value,
                    "notification_type": notification_type,
                },
            )
        except Exception as _e:
            logger.warning("km_op_log_failed", incident_id=incident.incident_id, error=str(_e))

        logger.info(
            "km_converted",
            incident_id=incident.incident_id,
            km_entry_id=km_entry.entry_id,
            quality_level=target_status.value,
            notification_type=notification_type,
        )

        return {
            "km_entry_id": km_entry.entry_id,
            "quality_level": target_status.value,
        }

    def _build_content(
        self,
        incident,
        alertname: str,
        affected_services: str,
        severity: str,
        resolution_time: str,
    ) -> str:
        """
        Render the markdown body of the KM entry.

        Sections: symptoms, root cause, action (only if the decision chain
        provides one), effect verification (only if metrics or a verification
        result exist), manual fix steps (only if recorded).

        The ADR-071 fields are read with ``getattr`` so incidents that predate
        those fields render instead of raising AttributeError.
        """
        created_at_str = str(incident.created_at) if incident.created_at else "未知"
        resolved_at_str = str(incident.resolved_at) if incident.resolved_at else "未知"

        context_bundle = getattr(incident, "context_bundle", None)
        context_summary = ""
        if isinstance(context_bundle, dict):
            context_summary = str(context_bundle.get("summary", ""))

        # Decision-chain details, when the incident carries them as a dict.
        decision_chain = getattr(incident, "decision_chain", None)
        root_cause = ""
        action_type = ""
        action_command = ""
        if decision_chain and isinstance(decision_chain, dict):
            root_cause = decision_chain.get("root_cause", "")
            action_type = decision_chain.get("action_type", "")
            action_command = decision_chain.get("action", "")

        # Effect verification: metrics snapshots and/or verification result
        # share one header, so the verification line never dangles under an
        # unrelated section.
        metrics_before = getattr(incident, "metrics_before", None)
        metrics_after = getattr(incident, "metrics_after", None)
        verification_result = getattr(incident, "verification_result", None)
        effect_lines = []
        if metrics_before or metrics_after:
            mb = metrics_before or {}
            ma = metrics_after or {}
            effect_lines.append(f"- 執行前: {mb}")
            effect_lines.append(f"- 執行後: {ma}")
            effect_lines.append(f"- 恢復耗時: {resolution_time}")
        if verification_result:
            effect_lines.append(f"- 驗證方式: {verification_result}")
        effect_section = ""
        if effect_lines:
            effect_section = "\n## 效果驗證\n" + "\n".join(effect_lines) + "\n"

        manual_fix_steps = getattr(incident, "manual_fix_steps", None)
        manual_section = ""
        if manual_fix_steps:
            manual_fix_by = getattr(incident, "manual_fix_by", None) or "未知"
            manual_section = (
                f"\n## 手動修復步驟\n"
                f"- 執行者: {manual_fix_by}\n"
                f"```\n{manual_fix_steps}\n```\n"
            )

        parts = [
            f"## 症狀\n"
            f"- 告警: {alertname}\n"
            f"- 服務: {affected_services}\n"
            f"- 嚴重度: {severity}\n"
            f"- 觸發時間: {created_at_str}\n"
            f"- 解決時間: {resolved_at_str}\n",
        ]
        if context_summary:
            parts.append(f"- 即時情境: {context_summary}\n")
        # Fall back to the incident title when no root cause was recorded.
        parts.append(f"\n## 根因分析\n{root_cause or incident.title or '未知'}\n")
        if action_type or action_command:
            parts.append(
                f"\n## 執行動作\n"
                f"- 類型: {action_type}\n"
                f"- 指令: {action_command}\n"
            )
        parts.append(effect_section)
        parts.append(manual_section)
        return "".join(parts)


# Module-level singleton, created lazily on first use.
_km_conversion_service: KMConversionService | None = None


def get_km_conversion_service() -> KMConversionService:
    """Return the shared KMConversionService, creating it on first call."""
    global _km_conversion_service
    if _km_conversion_service is None:
        _km_conversion_service = KMConversionService()
    return _km_conversion_service
# =============================================================================
# ADR-071-B: four-type alert notification classifier
# =============================================================================

from enum import Enum


class NotificationType(str, Enum):
    """Kind of Telegram card to send for an incident (str-valued)."""

    TYPE_1 = "TYPE-1"         # informational, no buttons
    TYPE_2 = "TYPE-2"         # already auto-remediated
    TYPE_3 = "TYPE-3"         # needs human approval (default)
    TYPE_4 = "TYPE-4"         # AI could not decide
    TYPE_4_DRIFT = "TYPE-4D"  # config drift


# Title keywords that mark an info-severity alert as a success report.
_INFO_SUCCESS_KEYWORDS = ("success", "完成", "completed")
# Alert-name prefixes of backup jobs that are informational at info severity.
_INFO_BACKUP_PREFIXES = ("Backup.", "VeleroBackup")
# Alert names that are always informational.
_ALWAYS_INFO_ALERTS = frozenset({"AlertChainHealthy", "AutoRepairHighSuccessRate"})


def classify_notification(
    incident,
    confidence: float,
    auto_executed: bool,
    mcp_all_failed: bool = False,
    decision_state: str = "",
) -> NotificationType:
    """
    Decide which type of Telegram card an incident gets.

    Precedence: TYPE-4D > TYPE-1 > TYPE-2 > TYPE-4 > TYPE-3 (default).

    Args:
        incident: object with ``signals`` (first signal's ``labels`` dict is
            read) and ``title``.
        confidence: AI decision confidence; values below 0.5 (or ``None``)
            mean the AI could not decide.
        auto_executed: whether auto-remediation ran.
        mcp_all_failed: whether every MCP provider failed.
        decision_state: decision state string, e.g. "COMPLETED" / "ERROR".

    Returns:
        The matching NotificationType.
    """
    # Tolerate incidents without signals and signals whose labels are None.
    signals = getattr(incident, "signals", None) or []
    labels = (signals[0].labels if signals else None) or {}
    alertname = labels.get("alertname") or ""
    # Severity is compared case-insensitively so "INFO" and "info" agree.
    label_severity = (labels.get("severity") or "").lower()

    # TYPE-4D: config drift wins over everything else.
    if alertname == "ConfigDrift":
        return NotificationType.TYPE_4_DRIFT

    # TYPE-1: informational alerts.
    is_info = label_severity == "info"
    title_lower = (getattr(incident, "title", None) or "").lower()
    if is_info and any(kw in title_lower for kw in _INFO_SUCCESS_KEYWORDS):
        return NotificationType.TYPE_1
    if is_info and alertname.startswith(_INFO_BACKUP_PREFIXES):
        return NotificationType.TYPE_1
    if alertname in _ALWAYS_INFO_ALERTS:
        return NotificationType.TYPE_1

    # TYPE-2: auto-remediation ran to completion.
    if auto_executed and decision_state == "COMPLETED":
        return NotificationType.TYPE_2

    # TYPE-4: low/unknown confidence, all MCP providers failed, or the
    # decision errored. A missing confidence counts as "could not decide"
    # instead of raising on the comparison.
    if confidence is None or confidence < 0.5 or mcp_all_failed or decision_state == "ERROR":
        return NotificationType.TYPE_4

    # TYPE-3: default, needs human approval.
    return NotificationType.TYPE_3
ADR-071-E: TYPE-3 動態按鈕 (2026-04-11 Claude Sonnet 4.6) + alert_category: str = "", + notification_type: str = "", ) -> dict: """ - 建立 Inline Keyboard (ADR-050 v2.0 六鍵佈局) + 建立 Inline Keyboard - 2026-04-01 Claude Code (ADR-050): 重組為 6 鍵 + 可選自動調優 - - 第一行: [✅ 批准] [❌ 拒絕] [🔕 靜默] ← nonce 防重放 - - 第二行: [📋 詳情] [🔄 重診] [📊 歷史] ← incident_id format (read-only) - - 第三行: [⚡ 自動調優] (可選) + ADR-050 v2.0 (2026-04-01): 六鍵佈局 + ADR-071-E (2026-04-11): TYPE-3 依 alert_category 動態組合操作按鈕 + + TYPE-3 按鈕對應 alert_category: + k8s_workload → [重啟] [擴容] [縮容] [回滾] + database → [終止慢查詢] [清連線池] + host_resource → [查程序] [重啟服務] [清 Log] + network → [重載 Nginx] [查 Port] + devops_tool → [重啟服務] [查 Log] + ai_system → [切換 Provider] + ssl_cert → [更新憑證] + (其他) → [批准] [拒絕] (舊版通用鍵) Args: approval_id: 簽核單 ID (用於 nonce 生成) include_auto_tuning: 是否包含自動調優按鈕 auto_tuning_command: kubectl 調優指令 incident_id: 關聯 Incident ID (用於 detail/reanalyze/history 按鈕) - - Returns: - dict: Telegram InlineKeyboardMarkup + alert_category: 告警類別 (ADR-071-E: 決定 TYPE-3 按鈕組合) + notification_type: 通知類型 (TYPE-1/2/3/4/4D) """ + # TYPE-3 動態操作按鈕 (ADR-071-E) + _CATEGORY_BUTTONS: dict[str, list[tuple[str, str]]] = { + "k8s_workload": [ + ("🔄 重啟", f"action:restart:{incident_id}"), + ("📈 擴容", f"action:scale_up:{incident_id}"), + ("📉 縮容", f"action:scale_down:{incident_id}"), + ("⏪ 回滾", f"action:rollback:{incident_id}"), + ], + "database": [ + ("🛑 終止慢查詢", f"action:kill_slow_query:{incident_id}"), + ("🔄 清連線池", f"action:clear_conn_pool:{incident_id}"), + ], + "host_resource": [ + ("🔍 查程序", f"action:check_process:{incident_id}"), + ("🔄 重啟服務", f"action:restart_service:{incident_id}"), + ("🗑 清 Log", f"action:clear_log:{incident_id}"), + ], + "network": [ + ("🔄 重載 Nginx", f"action:reload_nginx:{incident_id}"), + ("🔌 查 Port", f"action:check_port:{incident_id}"), + ], + "devops_tool": [ + ("🔄 重啟服務", f"action:restart_service:{incident_id}"), + ("📋 查 Log", f"action:check_log:{incident_id}"), + ], + "ai_system": [ + ("🔀 切換 Provider", 
f"action:switch_provider:{incident_id}"), + ], + "ssl_cert": [ + ("🔐 更新憑證", f"action:renew_cert:{incident_id}"), + ], + } + # 產生 Nonce (防重放,用於寫操作) approve_nonce = self._security.generate_callback_nonce(approval_id, "approve") reject_nonce = self._security.generate_callback_nonce(approval_id, "reject") silence_nonce = self._security.generate_callback_nonce(approval_id, "silence") - # 第一行: 主要簽核操作 (nonce 保護) - buttons = [ - [ - {"text": "✅ 批准", "callback_data": approve_nonce}, - {"text": "❌ 拒絕", "callback_data": reject_nonce}, - {"text": "🔕 靜默", "callback_data": silence_nonce}, - ], - ] + is_type3 = notification_type in ("TYPE-3", NotificationType.TYPE_3, "") - # 第二行: 資訊查詢按鈕 (ADR-050: read-only, format: action:incident_id) - if incident_id: - buttons.append([ + if is_type3 and alert_category and alert_category in _CATEGORY_BUTTONS: + # TYPE-3 動態操作按鈕:第一行為類別專屬操作 + category_btns = [ + {"text": text, "callback_data": cb_data} + for text, cb_data in _CATEGORY_BUTTONS[alert_category] + ] + # 每行最多 3 個,超過換行 + rows = [category_btns[i:i+3] for i in range(0, len(category_btns), 3)] + # 通用操作:[查看詳情] [忽略] + rows.append([ {"text": "📋 詳情", "callback_data": f"detail:{incident_id}"}, - {"text": "🔄 重診", "callback_data": f"reanalyze:{incident_id}"}, - {"text": "📊 歷史", "callback_data": f"history:{incident_id}"}, + {"text": "🔕 忽略", "callback_data": silence_nonce}, ]) + buttons = rows + else: + # 舊版通用鍵(向下相容) + buttons = [ + [ + {"text": "✅ 批准", "callback_data": approve_nonce}, + {"text": "❌ 拒絕", "callback_data": reject_nonce}, + {"text": "🔕 靜默", "callback_data": silence_nonce}, + ], + ] + # 第二行: 資訊查詢按鈕 (ADR-050: read-only, format: action:incident_id) + if incident_id: + buttons.append([ + {"text": "📋 詳情", "callback_data": f"detail:{incident_id}"}, + {"text": "🔄 重診", "callback_data": f"reanalyze:{incident_id}"}, + {"text": "📊 歷史", "callback_data": f"history:{incident_id}"}, + ]) - # 第三行: 自動調優按鈕 (v7.0) + # 自動調優按鈕 (v7.0) if include_auto_tuning and auto_tuning_command: tuning_nonce = 
# =========================================================================
# ADR-071-C: TYPE-1 informational notification (TelegramGateway method)
# =========================================================================

async def send_info_notification(
    self,
    incident_id: str,
    title: str,
    message: str,
    alertname: str = "",
    severity: str = "info",
) -> dict:
    """
    Send a TYPE-1 informational message: plain text, no inline keyboard.

    Args:
        incident_id: incident identifier shown in the message.
        title: headline; truncated to 256 characters.
        message: body text; truncated to 3500 characters.
        alertname: optional alert name line.
        severity: "info" or "success" (selects the leading emoji; anything
            else falls back to the info emoji).

    Returns:
        Whatever ``self._make_request`` returns for ``sendMessage``.
    """
    # Telegram rejects sendMessage texts longer than 4096 characters, which
    # would drop the notification entirely. Truncate the raw strings (before
    # escaping, so no HTML entity is cut in half) and leave headroom for the
    # fixed header and footer lines.
    max_title_chars = 256
    max_body_chars = 3500
    safe_title = title or ""
    if len(safe_title) > max_title_chars:
        safe_title = safe_title[: max_title_chars - 1] + "…"
    body = message or ""
    if len(body) > max_body_chars:
        body = body[: max_body_chars - 1] + "…"

    severity_emoji = {"info": "ℹ️", "success": "✅"}.get(severity, "ℹ️")
    lines = [
        f"{severity_emoji} {html.escape(safe_title)}",
        "━━━━━━━━━━━━━━━━━━━",
        f"📋 {html.escape(incident_id)}",
    ]
    if alertname:
        lines.append(f"🔔 告警: {html.escape(alertname)}")
    lines += ["", html.escape(body), "", "此為純資訊通知,無需操作。"]

    # NOTE(review): this method calls self._make_request while the ADR-071
    # guard/manual-fix methods call self._send_request — confirm both exist.
    return await self._make_request(
        "sendMessage",
        {
            "chat_id": settings.OPENCLAW_TG_CHAT_ID,
            "text": "\n".join(lines),
            "parse_mode": "HTML",
        },
    )
# =========================================================================
# ADR-071-F: TYPE-4D config-drift card (TelegramGateway method)
# =========================================================================

async def send_drift_card(
    self,
    incident_id: str,
    approval_id: str,
    resource_name: str,
    diff_summary: str,
    detected_at: str = "",
) -> dict:
    """
    Send the TYPE-4D config-drift card with its fixed four-button keyboard.

    Buttons: view diff / adopt change / revert / ignore. A diff of up to
    500 characters is inlined as preformatted text; a longer one is replaced
    by a link to the web UI so the message stays short.

    Args:
        incident_id: incident identifier (shown, and used in the diff URL).
        approval_id: approval identifier used to generate callback nonces.
        resource_name: drifted resource; first 50 characters are shown.
        diff_summary: diff text.
        detected_at: optional detection time string.

    Returns:
        Whatever ``self._make_request`` returns for ``sendMessage``.
    """
    from urllib.parse import quote

    diff_text = diff_summary or ""
    if not diff_text:
        diff_block = ""
    elif len(diff_text) <= 500:
        # Short diff: inline, escaped, monospaced.
        diff_block = f"\n<pre>{html.escape(diff_text)}</pre>"
    else:
        # Long diff: link to the web UI. The id is URL-encoded for the path
        # and the URL is HTML-escaped for the href attribute.
        web_url = (
            "https://aiops.wooo.work/incidents/"
            f"{quote(str(incident_id), safe='')}/drift-diff"
        )
        diff_block = (
            f"\n⚠️ 差異過大({len(diff_text)} 字)\n"
            f'🔗 <a href="{html.escape(web_url, quote=True)}">查看完整 Diff</a>'
        )

    text = (
        f"⚙️ Config Drift 偵測\n"
        f"━━━━━━━━━━━━━━━━━━━\n"
        f"📋 {html.escape(incident_id)}\n"
        f"🎯 資源: {html.escape(resource_name[:50])}\n"
    )
    if detected_at:
        text += f"🕐 偵測時間: {html.escape(detected_at)}\n"
    text += diff_block

    # Fixed TYPE-4D keyboard; each button carries its own nonce.
    view_nonce = self._security.generate_callback_nonce(approval_id, "drift_view")
    adopt_nonce = self._security.generate_callback_nonce(approval_id, "drift_adopt")
    revert_nonce = self._security.generate_callback_nonce(approval_id, "drift_revert")
    ignore_nonce = self._security.generate_callback_nonce(approval_id, "silence")

    keyboard = {
        "inline_keyboard": [
            [
                {"text": "🔍 查看 Diff", "callback_data": view_nonce},
                {"text": "✅ 採納變更", "callback_data": adopt_nonce},
            ],
            [
                {"text": "⏪ 回滾", "callback_data": revert_nonce},
                {"text": "🔕 忽略", "callback_data": ignore_nonce},
            ],
        ]
    }

    return await self._make_request(
        "sendMessage",
        {
            "chat_id": settings.OPENCLAW_TG_CHAT_ID,
            "text": text,
            "parse_mode": "HTML",
            "reply_markup": keyboard,
        },
    )
async def _check_incident_state_guard(
    self,
    approval_id: str,
    callback_query_id: str,
    message_id: int,
    original_text: str,
) -> dict | None:
    """
    ADR-071-D state-machine guard for callback buttons (TelegramGateway method).

    Looks up the incident linked to ``approval_id`` and decides whether the
    callback may proceed:
    - RESOLVED / CLOSED: block, stamp the card as resolved, remove its buttons.
    - MITIGATING: block and tell the user a fix is already running.
    - anything else, or the state cannot be determined: return None so the
      main flow continues.

    The lookup fails open (an error while reading state does not block the
    user). Once a blocking state is known, the Telegram feedback calls are
    best effort: a failure there is logged but the callback stays blocked.

    Returns:
        ``{"blocked": True, "reason": ..., "approval_id": ...}`` when blocked,
        otherwise None.
    """
    # ---- 1. Determine the incident state (fail open on lookup errors) ----
    try:
        from uuid import UUID
        from src.services.approval_db import get_approval_service
        from src.repositories.incident_repository import get_incident_repository
        from src.models.incident import IncidentStatus

        try:
            approval_uuid = UUID(approval_id)
        except (ValueError, TypeError, AttributeError):
            return None  # malformed approval_id: let the main flow handle it

        approval = await get_approval_service().get_approval_by_id(approval_uuid)
        if not approval or not approval.incident_id:
            return None  # no linked incident: nothing to guard

        incident = await get_incident_repository().get_by_id(approval.incident_id)
        if not incident:
            return None

        status = incident.status
        is_resolved = status in (IncidentStatus.RESOLVED, IncidentStatus.CLOSED)
        is_mitigating = status == IncidentStatus.MITIGATING
    except Exception as e:
        logger.warning("state_guard_error", approval_id=approval_id, error=str(e))
        return None

    if not (is_resolved or is_mitigating):
        return None

    incident_id = approval.incident_id
    # Works whether status is an Enum member or already a plain value.
    status_value = getattr(status, "value", status)

    # ---- 2a. Already executing: block and notify ----
    if is_mitigating:
        try:
            await self._answer_callback(
                callback_query_id,
                "blocked",
                text="⏳ 正在修復中,請稍候...",
            )
        except Exception as e:
            logger.warning("state_guard_feedback_failed", approval_id=approval_id, error=str(e))
        logger.info(
            "state_guard_blocked_mitigating",
            approval_id=approval_id,
            incident_id=incident_id,
        )
        return {"blocked": True, "reason": "already_executing", "approval_id": approval_id}

    # ---- 2b. Already resolved/closed: block, stamp the card, drop buttons ----
    try:
        await self._answer_callback(
            callback_query_id,
            "blocked",
            text="✅ 此事件已解決",
        )
    except Exception as e:
        logger.warning("state_guard_feedback_failed", approval_id=approval_id, error=str(e))

    try:
        resolved_at = (
            incident.resolved_at.strftime("%Y-%m-%d %H:%M")
            if incident.resolved_at
            else "未知時間"
        )
        separator = "──────────────"
        safe_original = html.escape(original_text) if original_text else ""
        stamp = f"✅ 此事件已於 {resolved_at} 解決"
        await self._send_request("editMessageText", {
            "chat_id": self.chat_id,
            "message_id": message_id,
            "text": f"{safe_original}\n{separator}\n{stamp}" if safe_original else stamp,
            "parse_mode": "HTML",
            "reply_markup": {"inline_keyboard": []},
            "disable_web_page_preview": True,
        })
    except Exception as edit_error:
        # Fallback: at least remove the buttons so the card cannot be re-used.
        logger.warning("state_guard_edit_failed", approval_id=approval_id, error=str(edit_error))
        try:
            await self._send_request("editMessageReplyMarkup", {
                "chat_id": self.chat_id,
                "message_id": message_id,
                "reply_markup": {"inline_keyboard": []},
            })
        except Exception as markup_error:
            logger.warning(
                "state_guard_remove_buttons_failed",
                approval_id=approval_id,
                error=str(markup_error),
            )

    logger.info(
        "state_guard_blocked_resolved",
        approval_id=approval_id,
        incident_id=incident_id,
        incident_status=status_value,
    )
    return {"blocked": True, "reason": "already_resolved", "approval_id": approval_id}
async def handle_manual_fix_done(
    self,
    user_id: int,
    username: str,
    fix_steps: str,
) -> dict:
    """
    ADR-071-H: persist the manual fix steps a user submitted with /done
    (TelegramGateway method).

    Flow:
    1. Read the pending approval_id for this user from Redis.
    2. Resolve the approval and its incident.
    3. Store ``manual_fix_steps`` / ``manual_fix_by`` on the incident row.
    4. Append a MANUAL_FIX_RECORDED operation-log record (best effort).
    5. Start KM conversion in the background.
    6. Send a confirmation message.

    The pending key is removed only after the steps are stored, or when the
    approval/incident cannot be found (a retry could not succeed). On an
    unexpected error the key is kept so the user can send /done again
    within its TTL.

    Args:
        user_id: Telegram user id (part of the Redis key).
        username: Telegram username; falls back to ``user_id`` when empty.
        fix_steps: the steps the user typed.

    Returns:
        ``{"success": True, "incident_id": ...}`` on success, otherwise
        ``{"success": False, "reason" | "error": ...}``.
    """
    try:
        from src.core.redis_client import get_redis as _get_redis
        redis = _get_redis()

        pending_key = f"manual_fix_pending:{user_id}"
        approval_id_raw = await redis.get(pending_key)
        if not approval_id_raw:
            await self._send_request("sendMessage", {
                "chat_id": settings.OPENCLAW_TG_CHAT_ID,
                "text": "⚠️ 找不到待記錄的修復任務,或已逾時。",
                "parse_mode": "HTML",
            })
            return {"success": False, "reason": "no_pending_task"}

        approval_id = (
            approval_id_raw.decode()
            if isinstance(approval_id_raw, bytes)
            else str(approval_id_raw)
        )

        # Refuse empty input and keep the pending key so the user can retry.
        steps = (fix_steps or "").strip()
        if not steps:
            await self._send_request("sendMessage", {
                "chat_id": settings.OPENCLAW_TG_CHAT_ID,
                "text": "⚠️ 修復步驟不可為空,請重新輸入後傳送 /done",
                "parse_mode": "HTML",
            })
            return {"success": False, "reason": "empty_fix_steps"}

        recorder = username or str(user_id)

        # Approval -> incident.
        # NOTE(review): the state guard resolves these through
        # get_approval_service()/get_incident_repository(); this method
        # instantiates the DB repositories directly — confirm both are valid.
        from src.repositories.incident_repository import IncidentDBRepository
        from src.repositories.approval_repository import ApprovalDBRepository

        approval_repo = ApprovalDBRepository()
        approval = await approval_repo.get_by_approval_id(approval_id)
        if not approval:
            await redis.delete(pending_key)
            await self._send_request("sendMessage", {
                "chat_id": settings.OPENCLAW_TG_CHAT_ID,
                "text": f"⚠️ 找不到簽核單 {html.escape(approval_id)}",
                "parse_mode": "HTML",
            })
            return {"success": False, "reason": "approval_not_found"}

        incident_id = approval.incident_id
        incident_repo = IncidentDBRepository()
        incident = await incident_repo.get_by_id(incident_id)
        if not incident:
            await redis.delete(pending_key)
            await self._send_request("sendMessage", {
                "chat_id": settings.OPENCLAW_TG_CHAT_ID,
                "text": f"⚠️ 找不到事件 {html.escape(str(incident_id))}",
                "parse_mode": "HTML",
            })
            return {"success": False, "reason": "incident_not_found"}

        # Store the steps on the incident row.
        from src.db.base import get_db_context
        from src.db.models import Incident as IncidentORM
        from sqlalchemy import update

        async with get_db_context() as db:
            await db.execute(
                update(IncidentORM)
                .where(IncidentORM.incident_id == incident_id)
                .values(
                    manual_fix_steps=steps,
                    manual_fix_by=recorder,
                )
            )
            await db.commit()

        # The steps are stored: the pending marker has served its purpose.
        await redis.delete(pending_key)

        # Audit record is best effort; the steps are already committed.
        try:
            from src.repositories.alert_operation_log_repository import (
                get_alert_operation_log_repository,
            )
            op_log_repo = get_alert_operation_log_repository()
            await op_log_repo.append(
                event_type="MANUAL_FIX_RECORDED",
                incident_id=incident_id,
                approval_id=approval_id,
                actor=recorder,
                action_detail=steps[:500],
                success=True,
            )
        except Exception as log_error:
            logger.warning(
                "manual_fix_op_log_failed",
                incident_id=incident_id,
                error=str(log_error),
            )

        # Kick off KM conversion with the freshly stored incident.
        incident_updated = await incident_repo.get_by_id(incident_id)
        if incident_updated:
            from src.services.km_conversion_service import get_km_conversion_service
            import asyncio as _asyncio

            km_svc = get_km_conversion_service()
            task = _asyncio.create_task(km_svc.convert(incident_updated))

            # The event loop holds only weak references to tasks; keep a
            # strong one until completion and surface failures in the log.
            km_tasks = getattr(self, "_km_tasks", None)
            if km_tasks is None:
                km_tasks = self._km_tasks = set()
            km_tasks.add(task)

            def _on_km_done(done, _tasks=km_tasks, _incident_id=incident_id):
                _tasks.discard(done)
                if done.cancelled():
                    return
                error = done.exception()
                if error is not None:
                    logger.warning(
                        "km_conversion_failed",
                        incident_id=_incident_id,
                        error=str(error),
                    )

            task.add_done_callback(_on_km_done)

        # Confirmation to the chat.
        await self._send_request("sendMessage", {
            "chat_id": settings.OPENCLAW_TG_CHAT_ID,
            "text": (
                f"✅ 手動修復步驟已記錄\n"
                f"━━━━━━━━━━━━━━━━━━━\n"
                f"📋 事件: {html.escape(incident_id)}\n"
                f"👤 記錄者: @{html.escape(recorder)}\n\n"
                f"正在建立草稿 Playbook,請至 AWOOOI 審核後生效。"
            ),
            "parse_mode": "HTML",
        })

        logger.info(
            "manual_fix_recorded",
            incident_id=incident_id,
            user=username,
        )
        return {"success": True, "incident_id": incident_id}

    except Exception as e:
        logger.error("handle_manual_fix_done_failed", error=str(e))
        return {"success": False, "error": str(e)}