fix(alertmanager): keep auto repair moving on ai fallback

2026-05-14 00:06:34 +08:00
parent 39581ab824
commit d835b666cf
2 changed files with 125 additions and 3 deletions
--- a/apps/api/src/api/v1/webhooks.py
+++ b/apps/api/src/api/v1/webhooks.py
@@ -139,6 +139,38 @@ def _should_use_alertmanager_rule_first(
    )


+async def _analyze_alertmanager_with_timeout(
+    openclaw,
+    alert_context: dict,
+    *,
+    alert_id: str,
+    alertname: str,
+) -> tuple:
+    """Run Alertmanager AI analysis without letting it block the workflow forever.

+    Awaits ``openclaw.analyze_alert(alert_context)`` bounded by
+    ``ALERTMANAGER_BACKGROUND_AI_TIMEOUT_SECONDS``.

+    Args:
+        openclaw: Client object exposing an awaitable ``analyze_alert``.
+        alert_context: Alert payload forwarded unchanged to the analyzer.
+        alert_id: Alert identifier; only used as a log field.
+        alertname: Alert name; only used as a log field.

+    Returns:
+        The analyzer's own return value on success. On timeout or any other
+        ``Exception`` a 7-item fallback tuple is returned whose first item is
+        ``None`` and whose second item is ``"fallback_timeout"`` or
+        ``"fallback_error"``. Callers unpack it as (analysis_result,
+        ai_provider, raw_response, signoz_metrics, signoz_trace_url,
+        ai_tokens, ai_cost).
+    """

+    try:
+        return await asyncio.wait_for(
+            openclaw.analyze_alert(alert_context),
+            timeout=ALERTMANAGER_BACKGROUND_AI_TIMEOUT_SECONDS,
+        )
+    # asyncio.wait_for raises asyncio.TimeoutError. Before Python 3.11 that is
+    # NOT the builtin TimeoutError, so catching the builtin would let a timeout
+    # fall through to the generic handler and be reported as "fallback_error".
+    # From 3.11 on the two names are the same class, so this is safe everywhere.
+    except asyncio.TimeoutError:
+        logger.warning(
+            "alertmanager_openclaw_timeout_fallback",
+            alert_id=alert_id,
+            alertname=alertname,
+            timeout_sec=ALERTMANAGER_BACKGROUND_AI_TIMEOUT_SECONDS,
+        )
+        return None, "fallback_timeout", "", None, "", 0, 0.0
+    # Deliberately broad: any analyzer failure must degrade to the fallback path
+    # instead of aborting alert processing. Cancellation is a BaseException on
+    # Python 3.8+ and therefore still propagates.
+    except Exception as exc:
+        logger.warning(
+            "alertmanager_openclaw_failed_fallback",
+            alert_id=alert_id,
+            alertname=alertname,
+            error=str(exc),
+            # str(exc) can be empty; the class name keeps the log actionable.
+            error_type=type(exc).__name__,
+        )
+        return None, "fallback_error", "", None, "", 0, 0.0
+
+
 async def _escalate_auto_repair_unavailable(
    *,
    incident_id: str,
@@ -796,6 +828,7 @@ async def verify_webhook_signature(

 # 戰略 B: 滑動時間窗 (ADR-073: 5 分鐘改 30 分鐘，防同一問題反覆重建 Incident，2026-04-12 ogt)
 DEBOUNCE_WINDOW_MINUTES = 30
+ALERTMANAGER_BACKGROUND_AI_TIMEOUT_SECONDS = 90.0


 # =============================================================================
@@ -1109,7 +1142,12 @@ async def receive_alert(
        # 呼叫 OpenClaw LLM 分析 (v7.0 含 SignOz 整合)
        # 2026-03-29 ogt: 加入 Token/Cost 追蹤
        openclaw = get_openclaw()
-        analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await openclaw.analyze_alert(alert_context)
+        analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await _analyze_alertmanager_with_timeout(
+            openclaw,
+            alert_context,
+            alert_id=alert_id,
+            alertname=alert.alert_type,
+        )

        if analysis_result:
            # LLM 分析成功
@@ -1815,7 +1853,12 @@ async def _process_new_alert_background(
            record_alert_chain_success("alertmanager")
            return

-        analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await openclaw.analyze_alert(alert_context)
+        analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, ai_tokens, ai_cost = await _analyze_alertmanager_with_timeout(
+            openclaw,
+            alert_context,
+            alert_id=alert_id,
+            alertname=alertname,
+        )

        if analysis_result:
            risk_mapping = {
@@ -2115,11 +2158,17 @@ async def _process_new_alert_background(
        else:
            # LLM 失敗 - 使用預設值
            # 2026-04-27 Claude Sonnet 4.6: shadow-run Step1 — 補 metadata kwarg，讓 extra_metadata 可觀測
+            _matched_playbook_id_cs4 = await resolve_playbook_id_for_alert(
+                rule_id=str(rule_response.get("rule_id", "")),
+                alertname=alertname,
+                affected_services=[target_resource] if target_resource else [],
+                severity="medium",
+            )
            _approval_metadata_cs4 = {
                "source": "fallback",
                "confidence_score": None,
                "is_rule_based": False,
-                "playbook_id": None,
+                "playbook_id": _matched_playbook_id_cs4,
            }
            fallback_create = ApprovalRequestCreate(
                action="OBSERVE",
@@ -2134,6 +2183,7 @@ async def _process_new_alert_background(
                dry_run_checks=[],
                requested_by="OpenClaw (fallback)",
                metadata=_approval_metadata_cs4,
+                matched_playbook_id=_matched_playbook_id_cs4,
            )

            approval = await service.create_approval_with_fingerprint(
@@ -2205,6 +2255,37 @@ async def _process_new_alert_background(
                annotations=alert_context.get("annotations", {}),
            )

+            _is_heartbeat = is_heartbeat_alertname(alertname)
+            if can_auto_repair and not _is_heartbeat:
+                await _try_auto_repair_background(
+                    incident_id=fallback_incident_id,
+                    approval_id=str(approval.id),
+                    alert_type=alert_type,
+                    target_resource=target_resource,
+                    namespace=namespace,
+                )
+            elif not can_auto_repair and not _is_heartbeat:
+                from src.repositories.alert_operation_log_repository import get_alert_operation_log_repository
+                _op_log_fallback = get_alert_operation_log_repository()
+                await _op_log_fallback.append(
+                    "GUARDRAIL_BLOCKED",
+                    incident_id=fallback_incident_id,
+                    approval_id=str(approval.id),
+                    actor="prometheus-rule",
+                    action_detail=f"Prometheus rule 設定 auto_repair=false，fallback 轉人工: {alertname}",
+                    success=False,
+                    context={"alertname": alertname, "auto_repair_flag": False},
+                )
+                await _escalate_auto_repair_unavailable(
+                    incident_id=fallback_incident_id,
+                    approval_id=str(approval.id),
+                    alert_type=alert_type,
+                    target_resource=target_resource,
+                    namespace=namespace,
+                    failure_reason="Prometheus rule auto_repair=false，fallback 未進入自動修復評估",
+                    attempted_actions="llm_fallback -> guardrail:auto_repair_false -> emergency_intervention",
+                )
+
            await _push_to_telegram_background(
                approval_id=str(approval.id),
                risk_level="medium",
--- a/apps/api/tests/test_alertmanager_rule_bypass.py
+++ b/apps/api/tests/test_alertmanager_rule_bypass.py
@@ -1,6 +1,10 @@
+import asyncio
 from datetime import datetime

+import pytest
+
 from src.api.v1.webhooks import (
+    _analyze_alertmanager_with_timeout,
    _should_bypass_alertmanager_llm,
    _should_use_alertmanager_rule_first,
 )
@@ -111,6 +115,43 @@ def test_alertmanager_llm_inflight_lock_key_is_fingerprint_scoped():
    assert ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS == 600


+@pytest.mark.asyncio
+async def test_alertmanager_analysis_timeout_returns_fallback(monkeypatch):
+    """An analyzer slower than the configured timeout yields the timeout fallback tuple."""
+    from src.api.v1 import webhooks as webhooks_module

+    canary = "AwoooPTimeoutCanary"
+    expected = (None, "fallback_timeout", "", None, "", 0, 0.0)

+    class StalledProvider:
+        async def analyze_alert(self, alert_context):
+            # Sleeps far longer than the patched timeout below.
+            await asyncio.sleep(1)
+            return "unexpected"

+    # Shrink the module-level timeout so the test completes quickly.
+    monkeypatch.setattr(webhooks_module, "ALERTMANAGER_BACKGROUND_AI_TIMEOUT_SECONDS", 0.01)

+    outcome = await _analyze_alertmanager_with_timeout(
+        StalledProvider(),
+        {"alertname": canary},
+        alert_id="alert-timeout",
+        alertname=canary,
+    )

+    assert outcome == expected
+
+
+@pytest.mark.asyncio
+async def test_alertmanager_analysis_error_returns_fallback():
+    """An analyzer that raises is converted into the error fallback tuple."""
+    canary = "AwoooPErrorCanary"

+    class RaisingProvider:
+        async def analyze_alert(self, alert_context):
+            raise RuntimeError("provider chain failed")

+    outcome = await _analyze_alertmanager_with_timeout(
+        RaisingProvider(),
+        {"alertname": canary},
+        alert_id="alert-error",
+        alertname=canary,
+    )

+    expected = (None, "fallback_error", "", None, "", 0, 0.0)
+    assert outcome == expected
+
+
 def test_resolved_guard_stamp_without_timestamp_is_clean():
    assert _format_resolved_guard_stamp(None) == "✅ 此事件已解決"