fix(webhook+decision): ADR-089 async webhook + 超時髒資料修復
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 10m16s

P0 — Webhook async (ADR-089):
- Alertmanager 收到告警立即回 202,不再同步等 90s LLM
- 新增 _process_new_alert_background():LLM 分析/Approval/Incident/Telegram 全進背景
- 根治 Alertmanager Fallback 風暴(超時 → 重送 → 指數退避風暴)

P1 — 超時髒資料 (decision_manager):
- _package_to_proposal_data: blocked_reason 禁止進 desc_parts(禁進卡片)
- _push_decision_to_telegram: suggested_action fallback 改「待分析」,禁止 description 流入

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-17 16:29:15 +08:00
parent f2ac5d01c6
commit c759b4eeab
2 changed files with 248 additions and 241 deletions

View File

@@ -1055,6 +1055,219 @@ def is_internal_ip(client_ip: str) -> bool:
return False
async def _process_new_alert_background(
    alert_context: dict,
    alert_id: str,
    fingerprint: str,
    target_resource: str,
    namespace: str,
    alert_type: str,
    message: str,
    alertname: str,
    severity: str,
    alert_labels: dict,
    notification_type: str,
    alert_category: str,
    can_auto_repair: bool,
) -> None:
    """Analyze a new alert in the background and fan out the results.

    ADR-089 (2026-04-17 ogt + Claude Sonnet 4.6): the Alertmanager webhook
    schedules this coroutine and responds right away, so the LLM analysis no
    longer runs inside the request/response cycle.

    Steps performed here:
      1. Run ``openclaw.analyze_alert`` on ``alert_context``.
      2. Create an Approval (keyed by ``fingerprint``) and its Incident.
      3. Either attempt auto repair or record a ``GUARDRAIL_BLOCKED`` entry.
      4. Push the decision card to Telegram.

    When the analysis returns a falsy result, or raises, a default "OBSERVE"
    Approval/Incident/Telegram card is created instead, so the alert is never
    dropped silently.

    Args:
        alert_context: Payload handed to ``openclaw.analyze_alert``.
        alert_id: Alert identifier; only used for error logging here.
        fingerprint: Fingerprint passed to ``create_approval_with_fingerprint``.
        target_resource: Resource name forwarded to the Incident and the card.
        namespace: Namespace forwarded to the Incident and the card.
        alert_type: Alert type forwarded to the Incident and auto repair.
        message: Raw alert message; also the root-cause text when the
            analysis provides no description.
        alertname: Alert name; also decides whether this is a heartbeat alert.
        severity: Accepted for caller compatibility; not read by this function.
        alert_labels: Full label set forwarded to the Incident.
        notification_type: Forwarded to the Incident and the Telegram push.
        alert_category: Forwarded to the Incident and the Telegram push.
        can_auto_repair: Rule-level flag; auto repair is attempted only when
            this is true and the alert is not a heartbeat alert.

    Returns:
        None. Any ``Exception`` is logged and swallowed, because nothing
        awaits the outcome of this background task.
    """
    try:
        service = get_approval_service()
        openclaw = get_openclaw()

        # An exception from the analysis is treated exactly like "analysis
        # returned nothing": the webhook has already answered, so nobody will
        # retry, and the fallback branch is the only way to surface the alert.
        try:
            (
                analysis_result,
                ai_provider,
                _raw_response,
                signoz_metrics,
                signoz_trace_url,
                ai_tokens,
                ai_cost,
            ) = await openclaw.analyze_alert(alert_context)
        except Exception as _analysis_err:
            logger.error(
                "alert_analysis_exception_using_fallback",
                alert_id=alert_id,
                alertname=alertname,
                error=str(_analysis_err),
                exc_info=True,
            )
            analysis_result = None

        if analysis_result:
            # Map the analysis risk level onto the approval enum; unknown
            # values default to MEDIUM.
            risk_mapping = {
                "low": RiskLevel.LOW,
                "medium": RiskLevel.MEDIUM,
                "critical": RiskLevel.CRITICAL,
            }
            risk_level = risk_mapping.get(analysis_result.risk_level.value, RiskLevel.MEDIUM)

            # Blast radius is optional on the analysis result.
            blast = analysis_result.blast_radius
            impact_mapping = {
                "NONE": DataImpact.NONE,
                "READ_ONLY": DataImpact.READ_ONLY,
                "WRITE": DataImpact.WRITE,
                "DESTRUCTIVE": DataImpact.DESTRUCTIVE,
            }
            data_impact = impact_mapping.get(blast.data_impact.value, DataImpact.NONE) if blast else DataImpact.NONE

            approval_create = ApprovalRequestCreate(
                action=f"{analysis_result.action_title} | {analysis_result.kubectl_command}",
                description=f"[AI: {ai_provider}] {analysis_result.description}",
                risk_level=risk_level,
                blast_radius=BlastRadius(
                    affected_pods=blast.affected_pods if blast else 1,
                    estimated_downtime=blast.estimated_downtime if blast else "~30s",
                    related_services=list(set((blast.related_services if blast else []) + analysis_result.affected_services)),
                    data_impact=data_impact,
                ),
                dry_run_checks=[
                    DryRunCheck(name="AI 信心度", passed=analysis_result.confidence >= 0.7, message=f"{analysis_result.confidence:.0%}"),
                    DryRunCheck(name="來源", passed=True, message="alertmanager"),
                ],
                requested_by=f"OpenClaw ({ai_provider})",
            )
            approval = await service.create_approval_with_fingerprint(
                request=approval_create,
                fingerprint=fingerprint,
            )

            # Incident and Approval are always created together.
            incident_id = await create_incident_for_approval(
                approval_id=str(approval.id),
                risk_level=risk_level.value,
                target_resource=target_resource,
                namespace=namespace,
                alert_type=alert_type,
                message=message,
                source="alertmanager",
                alertname=alertname,
                alert_labels=alert_labels,
                notification_type=notification_type,
                alert_category=alert_category,
            )

            # Write the incident id back onto the approval; best effort only.
            try:
                await service.update_incident_id(approval.id, incident_id)
                approval.incident_id = incident_id
            except Exception as _meta_err:
                logger.warning(
                    "approval_incident_id_update_failed",
                    approval_id=str(approval.id),
                    incident_id=incident_id,
                    error=str(_meta_err),
                )

            root_cause = analysis_result.description or message
            estimated_downtime = blast.estimated_downtime if blast else "~30s"
            primary_responsibility = analysis_result.primary_responsibility or "COLLAB"
            confidence = analysis_result.confidence

            # Heartbeat alerts never go through auto repair.
            _is_heartbeat = is_heartbeat_alertname(alertname)
            if _is_heartbeat:
                logger.info(
                    "auto_repair_skipped_heartbeat",
                    incident_id=incident_id,
                    alertname=alertname,
                )

            if can_auto_repair and not _is_heartbeat:
                await _try_auto_repair_background(
                    incident_id=incident_id,
                    approval_id=str(approval.id),
                    alert_type=alert_type,
                    target_resource=target_resource,
                    namespace=namespace,
                )
            else:
                # NOTE(review): this branch is also taken for heartbeat alerts
                # whose rule allows auto repair, yet the entry always records
                # auto_repair=false — confirm whether that is intended.
                # The audit entry is best effort: failing to write it must not
                # prevent the Telegram card below from reaching a human.
                try:
                    from src.repositories.alert_operation_log_repository import get_alert_operation_log_repository
                    _op_log_rule = get_alert_operation_log_repository()
                    await _op_log_rule.append(
                        "GUARDRAIL_BLOCKED",
                        incident_id=incident_id,
                        approval_id=str(approval.id),
                        actor="prometheus-rule",
                        action_detail=f"Prometheus rule 設定 auto_repair=false強制人工審核: {alertname}",
                        success=False,
                        context={"alertname": alertname, "auto_repair_flag": False},
                    )
                except Exception as _op_log_err:
                    logger.warning(
                        "guardrail_blocked_log_failed",
                        approval_id=str(approval.id),
                        incident_id=incident_id,
                        error=str(_op_log_err),
                    )

            await _push_to_telegram_background(
                approval_id=str(approval.id),
                risk_level=risk_level.value,
                resource_name=target_resource,
                root_cause=root_cause,
                # Prefer the concrete kubectl command; otherwise the action enum value.
                suggested_action=(analysis_result.kubectl_command or "").strip() or analysis_result.suggested_action.value,
                estimated_downtime=estimated_downtime,
                hit_count=1,
                primary_responsibility=primary_responsibility,
                confidence=confidence,
                namespace=namespace,
                signoz_rps=signoz_metrics.rps if signoz_metrics else 0,
                signoz_rps_trend=signoz_metrics.rps_trend if signoz_metrics else "stable",
                signoz_error_rate=signoz_metrics.error_rate if signoz_metrics else 0,
                signoz_p99_latency=signoz_metrics.p99_latency_ms if signoz_metrics else 0,
                signoz_latency_trend=signoz_metrics.latency_trend if signoz_metrics else "stable",
                signoz_trace_url=signoz_trace_url or "",
                ai_tokens=ai_tokens,
                ai_cost=ai_cost,
                ai_provider=ai_provider,
                incident_id=incident_id,
                notification_type=notification_type,
                alert_category=alert_category,
            )
            record_alert_chain_success("alertmanager")
        else:
            # LLM analysis failed or returned nothing: create the default
            # "OBSERVE" Approval/Incident so a human still sees the alert.
            fallback_create = ApprovalRequestCreate(
                action="OBSERVE",
                description=f"[LLM Failed] {message}",
                risk_level=RiskLevel.MEDIUM,
                blast_radius=BlastRadius(
                    affected_pods=1,
                    estimated_downtime="unknown",
                    related_services=[],
                    data_impact=DataImpact.NONE,
                ),
                dry_run_checks=[],
                requested_by="OpenClaw (fallback)",
            )
            approval = await service.create_approval_with_fingerprint(
                request=fallback_create,
                fingerprint=fingerprint,
            )
            fallback_incident_id = await create_incident_for_approval(
                approval_id=str(approval.id),
                risk_level="medium",
                target_resource=target_resource,
                namespace=namespace,
                alert_type=alert_type,
                message=message,
                source="alertmanager",
                alertname=alertname,
                alert_labels=alert_labels,
                notification_type=notification_type,
                alert_category=alert_category,
            )
            await _push_to_telegram_background(
                approval_id=str(approval.id),
                risk_level="medium",
                resource_name=target_resource,
                root_cause=message,
                suggested_action="OBSERVE",
                estimated_downtime="unknown",
                hit_count=1,
                primary_responsibility="HUMAN",
                confidence=0.0,
                namespace=namespace,
                incident_id=fallback_incident_id,
                notification_type=notification_type,
                alert_category=alert_category,
            )
    except Exception as e:
        # Top-level boundary of the background task: log with traceback and
        # swallow, since there is no caller left to handle the error.
        logger.error(
            "process_new_alert_background_error",
            alert_id=alert_id,
            alertname=alertname,
            error=str(e),
            exc_info=True,
        )
@router.post(
"/alertmanager",
response_model=AlertResponse,
@@ -1375,11 +1588,10 @@ async def alertmanager_webhook(
)
# ==========================================================================
# 新告警 - LLM 分析
# ADR-089 (2026-04-17 ogt + Claude Sonnet 4.6): 新告警 — 背景 LLM 分析
# 立即回傳 202,AI 辯證在背景非同步執行
# 修復根因: 同步等待 90s LLM → Alertmanager 超時 → Fallback 風暴
# ==========================================================================
# 2026-04-16 ogt + Claude Sonnet 4.6: 修復 — alertname 置頂LLM 才能知道是什麼告警
# 舊版 alertname 埋在 labels 中alert_type 永遠是 "custom"
# → LLM 全部輸出「重啟 AWOOOI 服務」(見 INC-20260416-C365D0 postgres 磁碟告警事故)
alert_context = {
"alertname": alertname, # 主要識別符 — LLM 必讀
"alert_category": alert_category, # kubernetes/database/storage/host_resource/ssl_cert
@@ -1394,241 +1606,30 @@ async def alertmanager_webhook(
"labels": alert.labels,
}
# 2026-03-29 ogt: 加入 Token/Cost 追蹤
openclaw = get_openclaw()
analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await openclaw.analyze_alert(alert_context)
background_tasks.add_task(
_process_new_alert_background,
alert_context=alert_context,
alert_id=alert_id,
fingerprint=fingerprint,
target_resource=target_resource,
namespace=namespace,
alert_type=alert_type,
message=message,
alertname=alertname,
severity=severity,
alert_labels=alert.labels,
notification_type=notification_type,
alert_category=alert_category,
can_auto_repair=_can_auto_repair_by_rule,
)
if analysis_result:
# analysis_result 是 OpenClawDecision Pydantic 模型
# 轉換風險等級
risk_mapping = {
"low": RiskLevel.LOW,
"medium": RiskLevel.MEDIUM,
"critical": RiskLevel.CRITICAL,
}
risk_level = risk_mapping.get(analysis_result.risk_level.value, RiskLevel.MEDIUM)
# 提取爆炸半徑
blast = analysis_result.blast_radius
impact_mapping = {
"NONE": DataImpact.NONE,
"READ_ONLY": DataImpact.READ_ONLY,
"WRITE": DataImpact.WRITE,
"DESTRUCTIVE": DataImpact.DESTRUCTIVE,
}
data_impact = impact_mapping.get(blast.data_impact.value, DataImpact.NONE) if blast else DataImpact.NONE
# 建立 ApprovalRequestCreate (同 /alerts 流程)
approval_create = ApprovalRequestCreate(
action=f"{analysis_result.action_title} | {analysis_result.kubectl_command}",
description=f"[AI: {ai_provider}] {analysis_result.description}",
risk_level=risk_level,
blast_radius=BlastRadius(
affected_pods=blast.affected_pods if blast else 1,
estimated_downtime=blast.estimated_downtime if blast else "~30s",
related_services=list(set((blast.related_services if blast else []) + analysis_result.affected_services)),
data_impact=data_impact,
),
dry_run_checks=[
DryRunCheck(name="AI 信心度", passed=analysis_result.confidence >= 0.7, message=f"{analysis_result.confidence:.0%}"),
DryRunCheck(name="來源", passed=True, message="alertmanager"),
],
requested_by=f"OpenClaw ({ai_provider})",
)
# 使用 create_approval_with_fingerprint (同 /alerts)
approval = await service.create_approval_with_fingerprint(
request=approval_create,
fingerprint=fingerprint,
)
# ================================================================
# Incident-Approval 同步 (鐵律: 必須同時創建)
# ================================================================
incident_id = await create_incident_for_approval(
approval_id=str(approval.id),
risk_level=risk_level.value,
target_resource=target_resource,
namespace=namespace,
alert_type=alert_type,
message=message,
source="alertmanager",
alertname=alertname,
alert_labels=alert.labels, # Phase 1: 完整 labels 供 _extract_affected_services
notification_type=notification_type, # ADR-073 Phase 2-2
alert_category=alert_category, # ADR-073 Phase 2-2
)
# 2026-04-06 ogt: Phase 26 — 回寫 incident_id 到 Approval
# 這樣 Playbook 萃取和 KM 寫入才能找到對應的 Incident
try:
await service.update_incident_id(approval.id, incident_id)
approval.incident_id = incident_id
except Exception as _meta_err:
logger.warning(
"approval_incident_id_update_failed",
approval_id=str(approval.id),
incident_id=incident_id,
error=str(_meta_err),
)
root_cause = analysis_result.description or message
estimated_downtime = blast.estimated_downtime if blast else "~30s"
primary_responsibility = analysis_result.primary_responsibility or "COLLAB"
confidence = analysis_result.confidence
# ================================================================
# 2026-04-05 ogt: 自動修復評估 (ADR-058 閉環)
# Incident 建立後立即評估是否可自動修復
# P2 以下 + 高品質 Playbook + 低風險 → 背景自動執行
# Sprint 5.1 Q9: auto_repair=false 旗標 → 強制 HITL不觸發背景任務
# (2026-04-08 Claude Sonnet 4.6 Asia/TaipeiADR-062)
# ================================================================
# 2026-04-10 Claude Sonnet 4.6 Asia/Taipei: 心跳/看門狗告警不進飛輪
# NoAlertsReceived2Hours 等代表監控系統狀態,不是服務故障
_is_heartbeat = is_heartbeat_alertname(alertname)
if _is_heartbeat:
logger.info(
"auto_repair_skipped_heartbeat",
incident_id=incident_id,
alertname=alertname,
)
if _can_auto_repair_by_rule and not _is_heartbeat:
background_tasks.add_task(
_try_auto_repair_background,
incident_id=incident_id,
approval_id=str(approval.id),
alert_type=alert_type,
target_resource=target_resource,
namespace=namespace,
)
else:
# auto_repair=false → 記錄 GUARDRAIL_BLOCKED不觸發自動修復
from src.repositories.alert_operation_log_repository import get_alert_operation_log_repository
_op_log_rule = get_alert_operation_log_repository()
background_tasks.add_task(
_op_log_rule.append,
"GUARDRAIL_BLOCKED",
incident_id=incident_id,
approval_id=str(approval.id),
actor="prometheus-rule",
action_detail=f"Prometheus rule 設定 auto_repair=false強制人工審核: {alertname}",
success=False,
context={"alertname": alertname, "auto_repair_flag": False},
)
# 推送 Telegram
background_tasks.add_task(
_push_to_telegram_background,
approval_id=str(approval.id),
risk_level=risk_level.value,
resource_name=target_resource,
root_cause=root_cause,
suggested_action=(analysis_result.kubectl_command or "").strip() or analysis_result.suggested_action.value,
estimated_downtime=estimated_downtime,
hit_count=1,
primary_responsibility=primary_responsibility,
confidence=confidence,
namespace=namespace,
signoz_rps=signoz_metrics.rps if signoz_metrics else 0,
signoz_rps_trend=signoz_metrics.rps_trend if signoz_metrics else "stable",
signoz_error_rate=signoz_metrics.error_rate if signoz_metrics else 0,
signoz_p99_latency=signoz_metrics.p99_latency_ms if signoz_metrics else 0,
signoz_latency_trend=signoz_metrics.latency_trend if signoz_metrics else "stable",
signoz_trace_url=signoz_trace_url or "",
# 2026-03-29 ogt: AI Token/Cost 追蹤
ai_tokens=ai_tokens,
ai_cost=ai_cost,
ai_provider=ai_provider,
# 2026-04-08 ogt: 補傳 incident_id 以啟用詳情/重診/歷史按鈕
incident_id=incident_id,
# ADR-073: 路由 TYPE-4D → send_drift_card
notification_type=notification_type,
# ADR-075 斷點 E 修復: 路由 TYPE-8M → send_meta_alert
alert_category=alert_category,
)
record_alert_chain_success("alertmanager")
return AlertResponse(
success=True,
message=f"✅ LLM 分析完成 (via {ai_provider})",
alert_id=alert_id,
approval_created=True,
approval_id=str(approval.id),
risk_level=risk_level.value,
suggested_action=approval_create.action,
hit_count=1,
converged=False,
)
else:
# LLM 失敗 - 使用預設值
fallback_create = ApprovalRequestCreate(
action="OBSERVE",
description=f"[LLM Failed] {message}",
risk_level=RiskLevel.MEDIUM,
blast_radius=BlastRadius(
affected_pods=1,
estimated_downtime="unknown",
related_services=[],
data_impact=DataImpact.NONE,
),
dry_run_checks=[],
requested_by="OpenClaw (fallback)",
)
approval = await service.create_approval_with_fingerprint(
request=fallback_create,
fingerprint=fingerprint,
)
# ================================================================
# Incident-Approval 同步 (鐵律: 即使 LLM 失敗也必須創建)
# ================================================================
fallback_incident_id = await create_incident_for_approval(
approval_id=str(approval.id),
risk_level="medium",
target_resource=target_resource,
namespace=namespace,
alert_type=alert_type,
message=message,
source="alertmanager",
alertname=alertname,
alert_labels=alert.labels, # Phase 1: 完整 labels
notification_type=notification_type, # ADR-073 Phase 2-2
alert_category=alert_category, # ADR-073 Phase 2-2
)
background_tasks.add_task(
_push_to_telegram_background,
approval_id=str(approval.id),
risk_level="medium",
resource_name=target_resource,
root_cause=message,
suggested_action="OBSERVE",
estimated_downtime="unknown",
hit_count=1,
primary_responsibility="HUMAN",
confidence=0.0,
namespace=namespace,
incident_id=fallback_incident_id,
# ADR-073: 路由 TYPE-4D → send_drift_card
notification_type=notification_type,
# ADR-075 斷點 E 修復: 路由 TYPE-8M → send_meta_alert
alert_category=alert_category,
)
return AlertResponse(
success=True,
message="⚠️ LLM 分析失敗,使用預設值",
alert_id=alert_id,
approval_created=True,
approval_id=str(approval.id),
risk_level="medium",
suggested_action="OBSERVE",
hit_count=1,
converged=False,
)
record_alert_chain_success("alertmanager")
return AlertResponse(
success=True,
message="✅ 告警已排入背景分析 (202 Accepted)",
alert_id=alert_id,
approval_created=False,
)
except Exception as e:
logger.error("alertmanager_error", error=str(e))

View File

@@ -408,7 +408,11 @@ async def _push_decision_to_telegram(
risk_level=risk_level,
resource_name=target[:50],
root_cause=_card_root_cause,
suggested_action=action[:120] if action else (description[:120] if description else "待分析"),
# 2026-04-17 ogt + Claude Sonnet 4.6(亞太): 修復超時降級髒資料
# 舊action="" 時 fallback 到 description而 description 可能是「待分析」或診斷摘要
# 這導致 description 中的診斷文字(如「根因:...」)出現在「建議修復動作」欄位
# 新action="" 時固定顯示「待分析」,禁止 description 流進 suggested_action
suggested_action=action[:120] if action else "待分析",
estimated_downtime="5-15 min",
primary_responsibility="INFRA",
confidence=confidence,
@@ -1085,8 +1089,10 @@ def _package_to_proposal_data(package: Any) -> dict[str, Any]:
if plan and getattr(plan, "top_candidate", None):
c = plan.top_candidate
desc_parts.append(f"方案:{c.action[:100]}")
if package.blocked_reason:
desc_parts.append(f"備注:{package.blocked_reason[:100]}")
# blocked_reason 是系統內部診斷,不能放進 description(Telegram 卡片顯示用)
# 2026-04-17 ogt + Claude Sonnet 4.6(亞太): 修復超時髒資料污染卡片
# 舊:blocked_reason → desc_parts → description → suggested_action 欄位顯示「備注:全局超時 > 90.0s」
# 新:blocked_reason 只寫入 proposal_data["blocked_reason"],供下游閘門邏輯用,禁止進卡片顯示
description = "".join(desc_parts) if desc_parts else (action[:200] if action else "待分析")
return {