async def _process_new_alert_background(
    alert_context: dict,
    alert_id: str,
    fingerprint: str,
    target_resource: str,
    namespace: str,
    alert_type: str,
    message: str,
    alertname: str,
    severity: str,
    alert_labels: dict,
    notification_type: str,
    alert_category: str,
    can_auto_repair: bool,
) -> None:
    """Run LLM analysis for a new alert, then create the Approval/Incident and notify.

    Background task introduced by ADR-089 (2026-04-17): the Alertmanager
    webhook enqueues this coroutine instead of awaiting the LLM itself, so a
    slow analysis can no longer make Alertmanager time out and re-send.

    Flow:
      1. Ask OpenClaw to analyze ``alert_context``.
      2. If a decision came back: build an approval request from it, create
         the approval (keyed by ``fingerprint``) and its incident, write the
         incident id back onto the approval, run the auto-repair step (or
         record why it was blocked), push the Telegram card and record chain
         success.
      3. If no decision came back: create an ``OBSERVE`` approval and its
         incident with default values and push a Telegram card.

    Args:
        alert_context: Payload handed to ``openclaw.analyze_alert``.
        alert_id: Alert identifier; only used when logging a failure.
        fingerprint: Passed to ``create_approval_with_fingerprint``.
        target_resource: Resource name used for the incident and the card.
        namespace: Namespace used for the incident, auto-repair and the card.
        alert_type: Alert type forwarded to the incident and auto-repair.
        message: Alert message; also the root-cause text when the analysis
            provides no description.
        alertname: Alert name; also decides whether this is a heartbeat alert.
        severity: Accepted for interface compatibility; not read here.
        alert_labels: Labels forwarded to the incident.
        notification_type: Forwarded to the incident and the Telegram push.
        alert_category: Forwarded to the incident and the Telegram push.
        can_auto_repair: When false, auto-repair is not attempted.

    Returns:
        None. Exceptions are logged here and never propagate to the caller.
    """
    try:
        service = get_approval_service()
        openclaw = get_openclaw()

        (
            analysis_result,
            ai_provider,
            _raw_response,  # returned by analyze_alert but not needed here
            signoz_metrics,
            signoz_trace_url,
            ai_tokens,
            ai_cost,
        ) = await openclaw.analyze_alert(alert_context)

        if analysis_result:
            # NOTE(review): there is no "high" entry, so any other value falls
            # back to MEDIUM — confirm that this is intended.
            risk_mapping = {
                "low": RiskLevel.LOW,
                "medium": RiskLevel.MEDIUM,
                "critical": RiskLevel.CRITICAL,
            }
            risk_level = risk_mapping.get(analysis_result.risk_level.value, RiskLevel.MEDIUM)

            blast = analysis_result.blast_radius
            impact_mapping = {
                "NONE": DataImpact.NONE,
                "READ_ONLY": DataImpact.READ_ONLY,
                "WRITE": DataImpact.WRITE,
                "DESTRUCTIVE": DataImpact.DESTRUCTIVE,
            }
            data_impact = impact_mapping.get(blast.data_impact.value, DataImpact.NONE) if blast else DataImpact.NONE

            # De-duplicate while keeping first-seen order so the stored list
            # is deterministic; tolerate either source being None.
            blast_services = (blast.related_services if blast else None) or []
            affected_services = analysis_result.affected_services or []
            related_services = list(dict.fromkeys([*blast_services, *affected_services]))

            approval_create = ApprovalRequestCreate(
                action=f"{analysis_result.action_title} | {analysis_result.kubectl_command}",
                description=f"[AI: {ai_provider}] {analysis_result.description}",
                risk_level=risk_level,
                blast_radius=BlastRadius(
                    affected_pods=blast.affected_pods if blast else 1,
                    estimated_downtime=blast.estimated_downtime if blast else "~30s",
                    related_services=related_services,
                    data_impact=data_impact,
                ),
                dry_run_checks=[
                    DryRunCheck(name="AI 信心度", passed=analysis_result.confidence >= 0.7, message=f"{analysis_result.confidence:.0%}"),
                    DryRunCheck(name="來源", passed=True, message="alertmanager"),
                ],
                requested_by=f"OpenClaw ({ai_provider})",
            )

            approval = await service.create_approval_with_fingerprint(
                request=approval_create,
                fingerprint=fingerprint,
            )

            incident_id = await create_incident_for_approval(
                approval_id=str(approval.id),
                risk_level=risk_level.value,
                target_resource=target_resource,
                namespace=namespace,
                alert_type=alert_type,
                message=message,
                source="alertmanager",
                alertname=alertname,
                alert_labels=alert_labels,
                notification_type=notification_type,
                alert_category=alert_category,
            )

            # Best effort: a failed write-back must not stop the notification.
            try:
                await service.update_incident_id(approval.id, incident_id)
                approval.incident_id = incident_id
            except Exception as _meta_err:
                logger.warning(
                    "approval_incident_id_update_failed",
                    approval_id=str(approval.id),
                    incident_id=incident_id,
                    error=str(_meta_err),
                )

            root_cause = analysis_result.description or message
            estimated_downtime = blast.estimated_downtime if blast else "~30s"
            primary_responsibility = analysis_result.primary_responsibility or "COLLAB"
            confidence = analysis_result.confidence

            _is_heartbeat = is_heartbeat_alertname(alertname)
            if _is_heartbeat:
                logger.info(
                    "auto_repair_skipped_heartbeat",
                    incident_id=incident_id,
                    alertname=alertname,
                )

            # Isolated from the Telegram push below: the approval and incident
            # already exist, so a failure here must not leave them un-notified.
            try:
                if can_auto_repair and not _is_heartbeat:
                    await _try_auto_repair_background(
                        incident_id=incident_id,
                        approval_id=str(approval.id),
                        alert_type=alert_type,
                        target_resource=target_resource,
                        namespace=namespace,
                    )
                else:
                    from src.repositories.alert_operation_log_repository import get_alert_operation_log_repository

                    # Record the real reason: this branch is also reached when
                    # the rule allows auto-repair but the alert is a heartbeat.
                    if can_auto_repair:
                        blocked_detail = f"心跳/看門狗告警不觸發自動修復,強制人工審核: {alertname}"
                    else:
                        blocked_detail = f"Prometheus rule 設定 auto_repair=false,強制人工審核: {alertname}"

                    _op_log_rule = get_alert_operation_log_repository()
                    await _op_log_rule.append(
                        "GUARDRAIL_BLOCKED",
                        incident_id=incident_id,
                        approval_id=str(approval.id),
                        actor="prometheus-rule",
                        action_detail=blocked_detail,
                        success=False,
                        context={"alertname": alertname, "auto_repair_flag": can_auto_repair},
                    )
            except Exception as _repair_err:
                logger.warning(
                    "auto_repair_step_failed",
                    approval_id=str(approval.id),
                    incident_id=incident_id,
                    alertname=alertname,
                    error=str(_repair_err),
                )

            await _push_to_telegram_background(
                approval_id=str(approval.id),
                risk_level=risk_level.value,
                resource_name=target_resource,
                root_cause=root_cause,
                # Prefer the concrete kubectl command; otherwise the action enum value.
                suggested_action=(analysis_result.kubectl_command or "").strip() or analysis_result.suggested_action.value,
                estimated_downtime=estimated_downtime,
                hit_count=1,
                primary_responsibility=primary_responsibility,
                confidence=confidence,
                namespace=namespace,
                signoz_rps=signoz_metrics.rps if signoz_metrics else 0,
                signoz_rps_trend=signoz_metrics.rps_trend if signoz_metrics else "stable",
                signoz_error_rate=signoz_metrics.error_rate if signoz_metrics else 0,
                signoz_p99_latency=signoz_metrics.p99_latency_ms if signoz_metrics else 0,
                signoz_latency_trend=signoz_metrics.latency_trend if signoz_metrics else "stable",
                signoz_trace_url=signoz_trace_url or "",
                ai_tokens=ai_tokens,
                ai_cost=ai_cost,
                ai_provider=ai_provider,
                incident_id=incident_id,
                notification_type=notification_type,
                alert_category=alert_category,
            )

            # NOTE(review): the webhook's enqueue path also appears to call
            # record_alert_chain_success("alertmanager") before returning —
            # confirm the metric is not counted twice per alert.
            record_alert_chain_success("alertmanager")

        else:
            # The LLM produced no decision: fall back to default values.
            fallback_create = ApprovalRequestCreate(
                action="OBSERVE",
                description=f"[LLM Failed] {message}",
                risk_level=RiskLevel.MEDIUM,
                blast_radius=BlastRadius(
                    affected_pods=1,
                    estimated_downtime="unknown",
                    related_services=[],
                    data_impact=DataImpact.NONE,
                ),
                dry_run_checks=[],
                requested_by="OpenClaw (fallback)",
            )

            approval = await service.create_approval_with_fingerprint(
                request=fallback_create,
                fingerprint=fingerprint,
            )

            # NOTE(review): unlike the analyzed path, the incident id is not
            # written back onto the approval here — confirm this is intended.
            fallback_incident_id = await create_incident_for_approval(
                approval_id=str(approval.id),
                risk_level="medium",
                target_resource=target_resource,
                namespace=namespace,
                alert_type=alert_type,
                message=message,
                source="alertmanager",
                alertname=alertname,
                alert_labels=alert_labels,
                notification_type=notification_type,
                alert_category=alert_category,
            )

            await _push_to_telegram_background(
                approval_id=str(approval.id),
                risk_level="medium",
                resource_name=target_resource,
                root_cause=message,
                suggested_action="OBSERVE",
                estimated_downtime="unknown",
                hit_count=1,
                primary_responsibility="HUMAN",
                confidence=0.0,
                namespace=namespace,
                incident_id=fallback_incident_id,
                notification_type=notification_type,
                alert_category=alert_category,
            )

    except Exception as e:
        # Nothing awaits this task's result, so this log is the only trace of a
        # failure; include the traceback.
        logger.error(
            "process_new_alert_background_error",
            alert_id=alert_id,
            alertname=alertname,
            error=str(e),
            exc_info=True,
        )
Fallback 風暴 # ========================================================================== - # 2026-04-16 ogt + Claude Sonnet 4.6: 修復 — alertname 置頂,LLM 才能知道是什麼告警 - # 舊版 alertname 埋在 labels 中,alert_type 永遠是 "custom" - # → LLM 全部輸出「重啟 AWOOOI 服務」(見 INC-20260416-C365D0 postgres 磁碟告警事故) alert_context = { "alertname": alertname, # 主要識別符 — LLM 必讀 "alert_category": alert_category, # kubernetes/database/storage/host_resource/ssl_cert @@ -1394,241 +1606,30 @@ async def alertmanager_webhook( "labels": alert.labels, } - # 2026-03-29 ogt: 加入 Token/Cost 追蹤 - openclaw = get_openclaw() - analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await openclaw.analyze_alert(alert_context) + background_tasks.add_task( + _process_new_alert_background, + alert_context=alert_context, + alert_id=alert_id, + fingerprint=fingerprint, + target_resource=target_resource, + namespace=namespace, + alert_type=alert_type, + message=message, + alertname=alertname, + severity=severity, + alert_labels=alert.labels, + notification_type=notification_type, + alert_category=alert_category, + can_auto_repair=_can_auto_repair_by_rule, + ) - if analysis_result: - # analysis_result 是 OpenClawDecision Pydantic 模型 - # 轉換風險等級 - risk_mapping = { - "low": RiskLevel.LOW, - "medium": RiskLevel.MEDIUM, - "critical": RiskLevel.CRITICAL, - } - risk_level = risk_mapping.get(analysis_result.risk_level.value, RiskLevel.MEDIUM) - - # 提取爆炸半徑 - blast = analysis_result.blast_radius - impact_mapping = { - "NONE": DataImpact.NONE, - "READ_ONLY": DataImpact.READ_ONLY, - "WRITE": DataImpact.WRITE, - "DESTRUCTIVE": DataImpact.DESTRUCTIVE, - } - data_impact = impact_mapping.get(blast.data_impact.value, DataImpact.NONE) if blast else DataImpact.NONE - - # 建立 ApprovalRequestCreate (同 /alerts 流程) - approval_create = ApprovalRequestCreate( - action=f"{analysis_result.action_title} | {analysis_result.kubectl_command}", - description=f"[AI: {ai_provider}] {analysis_result.description}", - 
risk_level=risk_level, - blast_radius=BlastRadius( - affected_pods=blast.affected_pods if blast else 1, - estimated_downtime=blast.estimated_downtime if blast else "~30s", - related_services=list(set((blast.related_services if blast else []) + analysis_result.affected_services)), - data_impact=data_impact, - ), - dry_run_checks=[ - DryRunCheck(name="AI 信心度", passed=analysis_result.confidence >= 0.7, message=f"{analysis_result.confidence:.0%}"), - DryRunCheck(name="來源", passed=True, message="alertmanager"), - ], - requested_by=f"OpenClaw ({ai_provider})", - ) - - # 使用 create_approval_with_fingerprint (同 /alerts) - approval = await service.create_approval_with_fingerprint( - request=approval_create, - fingerprint=fingerprint, - ) - - # ================================================================ - # Incident-Approval 同步 (鐵律: 必須同時創建) - # ================================================================ - incident_id = await create_incident_for_approval( - approval_id=str(approval.id), - risk_level=risk_level.value, - target_resource=target_resource, - namespace=namespace, - alert_type=alert_type, - message=message, - source="alertmanager", - alertname=alertname, - alert_labels=alert.labels, # Phase 1: 完整 labels 供 _extract_affected_services - notification_type=notification_type, # ADR-073 Phase 2-2 - alert_category=alert_category, # ADR-073 Phase 2-2 - ) - - # 2026-04-06 ogt: Phase 26 — 回寫 incident_id 到 Approval - # 這樣 Playbook 萃取和 KM 寫入才能找到對應的 Incident - try: - await service.update_incident_id(approval.id, incident_id) - approval.incident_id = incident_id - except Exception as _meta_err: - logger.warning( - "approval_incident_id_update_failed", - approval_id=str(approval.id), - incident_id=incident_id, - error=str(_meta_err), - ) - - root_cause = analysis_result.description or message - estimated_downtime = blast.estimated_downtime if blast else "~30s" - primary_responsibility = analysis_result.primary_responsibility or "COLLAB" - confidence = 
analysis_result.confidence - - # ================================================================ - # 2026-04-05 ogt: 自動修復評估 (ADR-058 閉環) - # Incident 建立後立即評估是否可自動修復 - # P2 以下 + 高品質 Playbook + 低風險 → 背景自動執行 - # Sprint 5.1 Q9: auto_repair=false 旗標 → 強制 HITL,不觸發背景任務 - # (2026-04-08 Claude Sonnet 4.6 Asia/Taipei,ADR-062) - # ================================================================ - # 2026-04-10 Claude Sonnet 4.6 Asia/Taipei: 心跳/看門狗告警不進飛輪 - # NoAlertsReceived2Hours 等代表監控系統狀態,不是服務故障 - _is_heartbeat = is_heartbeat_alertname(alertname) - if _is_heartbeat: - logger.info( - "auto_repair_skipped_heartbeat", - incident_id=incident_id, - alertname=alertname, - ) - - if _can_auto_repair_by_rule and not _is_heartbeat: - background_tasks.add_task( - _try_auto_repair_background, - incident_id=incident_id, - approval_id=str(approval.id), - alert_type=alert_type, - target_resource=target_resource, - namespace=namespace, - ) - else: - # auto_repair=false → 記錄 GUARDRAIL_BLOCKED,不觸發自動修復 - from src.repositories.alert_operation_log_repository import get_alert_operation_log_repository - _op_log_rule = get_alert_operation_log_repository() - background_tasks.add_task( - _op_log_rule.append, - "GUARDRAIL_BLOCKED", - incident_id=incident_id, - approval_id=str(approval.id), - actor="prometheus-rule", - action_detail=f"Prometheus rule 設定 auto_repair=false,強制人工審核: {alertname}", - success=False, - context={"alertname": alertname, "auto_repair_flag": False}, - ) - - # 推送 Telegram - background_tasks.add_task( - _push_to_telegram_background, - approval_id=str(approval.id), - risk_level=risk_level.value, - resource_name=target_resource, - root_cause=root_cause, - suggested_action=(analysis_result.kubectl_command or "").strip() or analysis_result.suggested_action.value, - estimated_downtime=estimated_downtime, - hit_count=1, - primary_responsibility=primary_responsibility, - confidence=confidence, - namespace=namespace, - signoz_rps=signoz_metrics.rps if signoz_metrics else 0, - 
signoz_rps_trend=signoz_metrics.rps_trend if signoz_metrics else "stable", - signoz_error_rate=signoz_metrics.error_rate if signoz_metrics else 0, - signoz_p99_latency=signoz_metrics.p99_latency_ms if signoz_metrics else 0, - signoz_latency_trend=signoz_metrics.latency_trend if signoz_metrics else "stable", - signoz_trace_url=signoz_trace_url or "", - # 2026-03-29 ogt: AI Token/Cost 追蹤 - ai_tokens=ai_tokens, - ai_cost=ai_cost, - ai_provider=ai_provider, - # 2026-04-08 ogt: 補傳 incident_id 以啟用詳情/重診/歷史按鈕 - incident_id=incident_id, - # ADR-073: 路由 TYPE-4D → send_drift_card - notification_type=notification_type, - # ADR-075 斷點 E 修復: 路由 TYPE-8M → send_meta_alert - alert_category=alert_category, - ) - - record_alert_chain_success("alertmanager") - return AlertResponse( - success=True, - message=f"✅ LLM 分析完成 (via {ai_provider})", - alert_id=alert_id, - approval_created=True, - approval_id=str(approval.id), - risk_level=risk_level.value, - suggested_action=approval_create.action, - hit_count=1, - converged=False, - ) - else: - # LLM 失敗 - 使用預設值 - fallback_create = ApprovalRequestCreate( - action="OBSERVE", - description=f"[LLM Failed] {message}", - risk_level=RiskLevel.MEDIUM, - blast_radius=BlastRadius( - affected_pods=1, - estimated_downtime="unknown", - related_services=[], - data_impact=DataImpact.NONE, - ), - dry_run_checks=[], - requested_by="OpenClaw (fallback)", - ) - - approval = await service.create_approval_with_fingerprint( - request=fallback_create, - fingerprint=fingerprint, - ) - - # ================================================================ - # Incident-Approval 同步 (鐵律: 即使 LLM 失敗也必須創建) - # ================================================================ - fallback_incident_id = await create_incident_for_approval( - approval_id=str(approval.id), - risk_level="medium", - target_resource=target_resource, - namespace=namespace, - alert_type=alert_type, - message=message, - source="alertmanager", - alertname=alertname, - alert_labels=alert.labels, # Phase 1: 
完整 labels - notification_type=notification_type, # ADR-073 Phase 2-2 - alert_category=alert_category, # ADR-073 Phase 2-2 - ) - - background_tasks.add_task( - _push_to_telegram_background, - approval_id=str(approval.id), - risk_level="medium", - resource_name=target_resource, - root_cause=message, - suggested_action="OBSERVE", - estimated_downtime="unknown", - hit_count=1, - primary_responsibility="HUMAN", - confidence=0.0, - namespace=namespace, - incident_id=fallback_incident_id, - # ADR-073: 路由 TYPE-4D → send_drift_card - notification_type=notification_type, - # ADR-075 斷點 E 修復: 路由 TYPE-8M → send_meta_alert - alert_category=alert_category, - ) - - return AlertResponse( - success=True, - message="⚠️ LLM 分析失敗,使用預設值", - alert_id=alert_id, - approval_created=True, - approval_id=str(approval.id), - risk_level="medium", - suggested_action="OBSERVE", - hit_count=1, - converged=False, - ) + record_alert_chain_success("alertmanager") + return AlertResponse( + success=True, + message="✅ 告警已排入背景分析 (202 Accepted)", + alert_id=alert_id, + approval_created=False, + ) except Exception as e: logger.error("alertmanager_error", error=str(e)) diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index bc213274..c218603d 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -408,7 +408,11 @@ async def _push_decision_to_telegram( risk_level=risk_level, resource_name=target[:50], root_cause=_card_root_cause, - suggested_action=action[:120] if action else (description[:120] if description else "待分析"), + # 2026-04-17 ogt + Claude Sonnet 4.6(亞太): 修復超時降級髒資料 + # 舊:action="" 時 fallback 到 description,而 description 可能是「待分析」或診斷摘要 + # 這導致 description 中的診斷文字(如「根因:...」)出現在「建議修復動作」欄位 + # 新:action="" 時固定顯示「待分析」,禁止 description 流進 suggested_action + suggested_action=action[:120] if action else "待分析", estimated_downtime="5-15 min", primary_responsibility="INFRA", confidence=confidence, @@ -1085,8 +1089,10 
@@ def _package_to_proposal_data(package: Any) -> dict[str, Any]: if plan and getattr(plan, "top_candidate", None): c = plan.top_candidate desc_parts.append(f"方案:{c.action[:100]}") - if package.blocked_reason: - desc_parts.append(f"備注:{package.blocked_reason[:100]}") + # blocked_reason 是系統內部診斷,不能放進 description(Telegram 卡片顯示用) + # 2026-04-17 ogt + Claude Sonnet 4.6(亞太): 修復超時髒資料污染卡片 + # 舊:blocked_reason → desc_parts → description → suggested_action 欄位顯示「備注:全局超時 > 90.0s」 + # 新:blocked_reason 只寫入 proposal_data["blocked_reason"],供下游閘門邏輯用,禁止進卡片顯示 description = ";".join(desc_parts) if desc_parts else (action[:200] if action else "待分析") return {