From d835b666cfb81a64ff5124efa6ea1e93f8f33e1b Mon Sep 17 00:00:00 2001
From: Your Name
Date: Thu, 14 May 2026 00:06:34 +0800
Subject: [PATCH] fix(alertmanager): keep auto repair moving on ai fallback

---
 apps/api/src/api/v1/webhooks.py               | 87 ++++++++++++++++++-
 .../tests/test_alertmanager_rule_bypass.py    | 41 +++++++++
 2 files changed, 125 insertions(+), 3 deletions(-)

diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py
index 2beafd26..e9d23ae4 100644
--- a/apps/api/src/api/v1/webhooks.py
+++ b/apps/api/src/api/v1/webhooks.py
@@ -139,6 +139,38 @@ def _should_use_alertmanager_rule_first(
     )
 
 
+async def _analyze_alertmanager_with_timeout(
+    openclaw,
+    alert_context: dict,
+    *,
+    alert_id: str,
+    alertname: str,
+) -> tuple:
+    """Run Alertmanager AI analysis without letting it block the workflow forever."""
+
+    try:
+        return await asyncio.wait_for(
+            openclaw.analyze_alert(alert_context),
+            timeout=ALERTMANAGER_BACKGROUND_AI_TIMEOUT_SECONDS,
+        )
+    except asyncio.TimeoutError:  # builtin TimeoutError only aliases this on Python 3.11+
+        logger.warning(
+            "alertmanager_openclaw_timeout_fallback",
+            alert_id=alert_id,
+            alertname=alertname,
+            timeout_sec=ALERTMANAGER_BACKGROUND_AI_TIMEOUT_SECONDS,
+        )
+        return None, "fallback_timeout", "", None, "", 0, 0.0
+    except Exception as exc:
+        logger.warning(
+            "alertmanager_openclaw_failed_fallback",
+            alert_id=alert_id,
+            alertname=alertname,
+            error=str(exc),
+        )
+        return None, "fallback_error", "", None, "", 0, 0.0
+
+
 async def _escalate_auto_repair_unavailable(
     *,
     incident_id: str,
@@ -796,6 +828,7 @@ async def verify_webhook_signature(
 
 # 戰略 B: 滑動時間窗 (ADR-073: 5 分鐘改 30 分鐘,防同一問題反覆重建 Incident,2026-04-12 ogt)
 DEBOUNCE_WINDOW_MINUTES = 30
+ALERTMANAGER_BACKGROUND_AI_TIMEOUT_SECONDS = 90.0
 
 
 # =============================================================================
@@ -1109,7 +1142,12 @@ async def receive_alert(
         # 呼叫 OpenClaw LLM 分析 (v7.0 含 SignOz 整合)
         # 2026-03-29 ogt: 加入 Token/Cost 追蹤
         openclaw = get_openclaw()
-        analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await openclaw.analyze_alert(alert_context)
+        analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await _analyze_alertmanager_with_timeout(
+            openclaw,
+            alert_context,
+            alert_id=alert_id,
+            alertname=alert.alert_type,
+        )
 
         if analysis_result:
             # LLM 分析成功
@@ -1815,7 +1853,12 @@ async def _process_new_alert_background(
             record_alert_chain_success("alertmanager")
             return
 
-        analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await openclaw.analyze_alert(alert_context)
+        analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await _analyze_alertmanager_with_timeout(
+            openclaw,
+            alert_context,
+            alert_id=alert_id,
+            alertname=alertname,
+        )
 
         if analysis_result:
             risk_mapping = {
@@ -2115,11 +2158,17 @@ async def _process_new_alert_background(
         else:
             # LLM 失敗 - 使用預設值
             # 2026-04-27 Claude Sonnet 4.6: shadow-run Step1 — 補 metadata kwarg,讓 extra_metadata 可觀測
+            _matched_playbook_id_cs4 = await resolve_playbook_id_for_alert(
+                rule_id=str(rule_response.get("rule_id", "")),
+                alertname=alertname,
+                affected_services=[target_resource] if target_resource else [],
+                severity="medium",
+            )
             _approval_metadata_cs4 = {
                 "source": "fallback",
                 "confidence_score": None,
                 "is_rule_based": False,
-                "playbook_id": None,
+                "playbook_id": _matched_playbook_id_cs4,
             }
             fallback_create = ApprovalRequestCreate(
                 action="OBSERVE",
@@ -2134,6 +2183,7 @@ async def _process_new_alert_background(
                 dry_run_checks=[],
                 requested_by="OpenClaw (fallback)",
                 metadata=_approval_metadata_cs4,
+                matched_playbook_id=_matched_playbook_id_cs4,
             )
 
             approval = await service.create_approval_with_fingerprint(
@@ -2205,6 +2255,37 @@ async def _process_new_alert_background(
                 annotations=alert_context.get("annotations", {}),
             )
 
+            _is_heartbeat = is_heartbeat_alertname(alertname)
+            if can_auto_repair and not _is_heartbeat:
+                await _try_auto_repair_background(
+                    incident_id=fallback_incident_id,
+                    approval_id=str(approval.id),
+                    alert_type=alert_type,
+                    target_resource=target_resource,
+                    namespace=namespace,
+                )
+            elif not can_auto_repair and not _is_heartbeat:
+                from src.repositories.alert_operation_log_repository import get_alert_operation_log_repository
+                _op_log_fallback = get_alert_operation_log_repository()
+                await _op_log_fallback.append(
+                    "GUARDRAIL_BLOCKED",
+                    incident_id=fallback_incident_id,
+                    approval_id=str(approval.id),
+                    actor="prometheus-rule",
+                    action_detail=f"Prometheus rule 設定 auto_repair=false,fallback 轉人工: {alertname}",
+                    success=False,
+                    context={"alertname": alertname, "auto_repair_flag": False},
+                )
+                await _escalate_auto_repair_unavailable(
+                    incident_id=fallback_incident_id,
+                    approval_id=str(approval.id),
+                    alert_type=alert_type,
+                    target_resource=target_resource,
+                    namespace=namespace,
+                    failure_reason="Prometheus rule auto_repair=false,fallback 未進入自動修復評估",
+                    attempted_actions="llm_fallback -> guardrail:auto_repair_false -> emergency_intervention",
+                )
+
             await _push_to_telegram_background(
                 approval_id=str(approval.id),
                 risk_level="medium",
diff --git a/apps/api/tests/test_alertmanager_rule_bypass.py b/apps/api/tests/test_alertmanager_rule_bypass.py
index 49052fb3..742c71d2 100644
--- a/apps/api/tests/test_alertmanager_rule_bypass.py
+++ b/apps/api/tests/test_alertmanager_rule_bypass.py
@@ -1,6 +1,10 @@
+import asyncio
 from datetime import datetime
 
+import pytest
+
 from src.api.v1.webhooks import (
+    _analyze_alertmanager_with_timeout,
     _should_bypass_alertmanager_llm,
     _should_use_alertmanager_rule_first,
 )
@@ -111,6 +115,43 @@ def test_alertmanager_llm_inflight_lock_key_is_fingerprint_scoped():
     assert ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS == 600
 
 
+@pytest.mark.asyncio
+async def test_alertmanager_analysis_timeout_returns_fallback(monkeypatch):
+    from src.api.v1 import webhooks as webhooks_module
+
+    class SlowOpenClaw:
+        async def analyze_alert(self, alert_context):
+            await asyncio.sleep(1)
+            return "unexpected"
+
+    monkeypatch.setattr(webhooks_module, "ALERTMANAGER_BACKGROUND_AI_TIMEOUT_SECONDS", 0.01)
+
+    result = await _analyze_alertmanager_with_timeout(
+        SlowOpenClaw(),
+        {"alertname": "AwoooPTimeoutCanary"},
+        alert_id="alert-timeout",
+        alertname="AwoooPTimeoutCanary",
+    )
+
+    assert result == (None, "fallback_timeout", "", None, "", 0, 0.0)
+
+
+@pytest.mark.asyncio
+async def test_alertmanager_analysis_error_returns_fallback():
+    class BrokenOpenClaw:
+        async def analyze_alert(self, alert_context):
+            raise RuntimeError("provider chain failed")
+
+    result = await _analyze_alertmanager_with_timeout(
+        BrokenOpenClaw(),
+        {"alertname": "AwoooPErrorCanary"},
+        alert_id="alert-error",
+        alertname="AwoooPErrorCanary",
+    )
+
+    assert result == (None, "fallback_error", "", None, "", 0, 0.0)
+
+
 def test_resolved_guard_stamp_without_timestamp_is_clean():
     assert _format_resolved_guard_stamp(None) == "✅ 此事件已解決"
 