fix(aiops): suppress repeated llm alert loops

2026-05-01 13:02:07 +08:00
parent 3691402561
commit 9db87f177e
9 changed files with 244 additions and 24 deletions
--- a/.agents/skills/03-openclaw-cognitive-expert.md
+++ b/.agents/skills/03-openclaw-cognitive-expert.md
@@ -10,11 +10,11 @@

 | 欄位 | 值 |
 |------|-----|
-| **版本** | v1.7 |
+| **版本** | v1.8 |
 | **建立日期** | 2026-03-20 (台北) |
 | **建立者** | Claude Code |
-| **最後修改** | 2026-03-31 18:00 (台北) |
-| **修改者** | Claude Code (首席架構師) |
+| **最後修改** | 2026-05-01 15:30 (台北) |
+| **修改者** | Codex |

 ### 變更紀錄

@@ -28,6 +28,7 @@
 | v1.5 | 2026-03-27 | Claude Code | Stream Key 統一 + 告警去重機制 |
 | v1.6 | 2026-03-27 | Claude Code | **P1 優化: 稍後/靜默按鈕** |
 | v1.7 | 2026-03-31 | Claude Code | **Phase 22: OpenClaw + Nemotron 協作 (ADR-044)** |
+| v1.8 | 2026-05-01 | Codex | **LLM 鬼循環治理: stable alert cache key + no裸奔重試** |

 ---

@@ -115,6 +116,18 @@ async def analyze_with_ai(context: str) -> str:
 response = await _call_ollama(context)
 ```

+#### 2.1 告警快取鍵必須使用穩定維度
+
+告警分析的 prompt 會包含 annotations、SignOz 即時數值、MCP evidence 等動態資料；不得把完整 prompt 當成同一告警的唯一 cache key，否則 firing 告警每 20 秒都會 miss cache。
+
+正確維度：
+
+```
+prompt_family + alertname + alert_category + namespace + target_resource + severity + fingerprint
+```
+
+禁止把 `annotations.description`、`message`、即時 metrics 數值、trace URL 當成重複告警 cache key 的必要組成。需要重新分析時，應由 fingerprint 變化、人工刷新、Playbook/KM 版本變化、或明確 TTL 到期觸發。
+
 ### 3. Multi-Sig 動作必須 Dry-Run

 ```python
--- a/.agents/skills/08-model-router-expert.md
+++ b/.agents/skills/08-model-router-expert.md
@@ -1,8 +1,8 @@
 # Skill 08: Model Router Expert

-> 版本: v1.1
+> 版本: v1.2
 > 建立: 2026-03-26 (台北時區)
-> 更新: 2026-03-29 (加入 NVIDIA Nemotron 整合)
+> 更新: 2026-05-01 (加入 LLM ghost-loop 成本治理)
 > 管轄: Phase 13.3 智能路由、複雜度評估、意圖分類、Tool Calling 路由

 ---
@@ -138,6 +138,20 @@ alerts:
    action: notify_admin
 ```

+### Provider 成本治理鐵律
+
+外部 AI 費用不是第一層問題。當同一告警形成鬼循環時，任何 provider 都會被放大；先修 dedupe/cache/retry，再調 provider。
+
+| 狀態 | Router 行為 |
+|------|-------------|
+| 同 fingerprint 10 分鐘內重複 delivery | 命中 Alertmanager in-flight lock / DB convergence，不進 provider routing |
+| 同告警 annotations 或 metrics 變動 | 命中 stable LLM cache，不因動態 prompt 重新計費 |
+| provider timeout / 500 | 走 circuit breaker + fallback，但 webhook 不得回 500 造成 Alertmanager retry storm |
+| 高複雜度且本地模型信心不足 | 才允許 Gemini/Groq/Claude/OpenRouter 等外部 capped fallback |
+| 訂閱方案評估 | 以「新問題數」估算，不以 retry storm 的 delivery 數估算 |
+
+健康飛輪下，外部 provider 用量應接近每天新告警/新 incident 數，而不是 Alertmanager 重送次數。Gemini/Groq/Claude 只能補專業度與 fallback 韌性，不能拿來遮住收斂失效。
+
 ---

 ## Fallback 策略 (ADR-006 v1.3 + ADR-036)
--- a/apps/api/src/api/v1/webhooks.py
+++ b/apps/api/src/api/v1/webhooks.py
@@ -53,6 +53,10 @@ from src.models.approval import (
 # [首席架構師] 移除 generate_alert_fingerprint 直接 import，改用 AlertAnalyzer.generate_fingerprint v1.2 2026-04-01 Asia/Taipei
 from src.models.webhook import AlertPayload, AlertResponse
 from src.services.alert_analyzer_service import AlertAnalyzer
+from src.services.alertmanager_llm_guard import (
+    ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS,
+    try_acquire_alertmanager_llm_lock,
+)
 from src.services.approval_db import get_approval_service

 # Phase 17 P0: Service 層 (消除 Router 直接存取 Redis)
@@ -2150,12 +2154,22 @@ async def alertmanager_webhook(
    # 2026-04-14 Claude Haiku 4.5 Asia/Taipei
    # 位置：指紋生成後、LLM 分析前（短路子告警）
    # ==========================================================================
-    grouping_result = await get_alert_grouping_service().evaluate(
-        alertname=alertname,
-        namespace=namespace,
-        fingerprint=fingerprint,
-    )
-    if grouping_result.is_grouped:
+    try:
+        grouping_result = await get_alert_grouping_service().evaluate(
+            alertname=alertname,
+            namespace=namespace,
+            fingerprint=fingerprint,
+        )
+    except Exception as e:
+        grouping_result = None
+        logger.warning(
+            "alertmanager_grouping_failed_fail_open",
+            alert_id=alert_id,
+            fingerprint=fingerprint,
+            error=str(e),
+        )
+
+    if grouping_result and grouping_result.is_grouped:
        logger.info(
            "alertmanager_grouped_skip",
            alert_id=alert_id,
@@ -2258,6 +2272,21 @@ async def alertmanager_webhook(
                approval_created=False,
            )

+        if not await try_acquire_alertmanager_llm_lock(fingerprint, alert_id):
+            logger.info(
+                "alertmanager_llm_inflight_suppressed",
+                alert_id=alert_id,
+                fingerprint=fingerprint,
+                ttl_seconds=ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS,
+            )
+            return AlertResponse(
+                success=True,
+                message="🛡️ 告警已由同指紋背景 AI 分析處理中，跳過重複 LLM 呼叫",
+                alert_id=alert_id,
+                approval_created=False,
+                converged=True,
+            )
+
        # ==========================================================================
        # ADR-089 (2026-04-17 ogt + Claude Sonnet 4.6): 新告警 — 背景 LLM 分析
        # 立即回傳 202，AI 辯證在背景非同步執行
@@ -2271,6 +2300,7 @@ async def alertmanager_webhook(
            "source": "alertmanager",
            "target_resource": target_resource,
            "namespace": namespace,
+            "fingerprint": fingerprint,
            "message": message,
            "annotations": dict(alert.annotations) if alert.annotations else {},
            "metrics": {},
@@ -2303,11 +2333,18 @@ async def alertmanager_webhook(
        )

    except Exception as e:
-        logger.error("alertmanager_error", error=str(e))
-        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail=f"Failed to process alert: {str(e)}",
-        ) from e
+        logger.error(
+            "alertmanager_degraded_accepted_no_retry",
+            alert_id=alert_id,
+            fingerprint=fingerprint,
+            error=str(e),
+        )
+        return AlertResponse(
+            success=False,
+            message="⚠️ 告警已接收但處理降級，避免 Alertmanager retry storm；已交由背景治理/人工介入追蹤",
+            alert_id=alert_id,
+            approval_created=False,
+        )


@router.get(
--- a/apps/api/src/services/alertmanager_llm_guard.py
+++ b/apps/api/src/services/alertmanager_llm_guard.py
@@ -0,0 +1,45 @@
+"""Alertmanager LLM storm guards.
+
+Service-layer Redis helpers used by webhook routers to avoid spawning duplicate
+LLM analysis tasks for the same Alertmanager fingerprint.
+"""
+
+from src.core.logging import get_logger
+from src.core.redis_client import get_redis
+
+logger = get_logger("awoooi.alertmanager_llm_guard")
+
+ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS = 600
+
+
+def alertmanager_llm_inflight_key(fingerprint: str) -> str:
+    """Return the Redis lock key for one Alertmanager fingerprint entering AI analysis."""
+
+    return f"alertmanager:llm_inflight:{fingerprint}"  # one key per fingerprint
+
+
+async def try_acquire_alertmanager_llm_lock(
+    fingerprint: str,
+    alert_id: str,
+    *,
+    ttl_seconds: int = ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS,
+) -> bool:
+    """Claim the per-fingerprint LLM lock for ttl_seconds; True if claimed or if Redis fails."""
+
+    try:
+        redis = get_redis()
+        acquired = await redis.set(
+            alertmanager_llm_inflight_key(fingerprint),
+            alert_id,  # stored as the lock value: records which delivery claimed the lock
+            ex=ttl_seconds,  # the lock is never released here; it only lapses via this expiry
+            nx=True,  # NOTE(review): presumably redis-py SET NX (set only if key is absent)
+        )
+        return bool(acquired)  # falsy reply (presumably: key already exists) => not acquired
+    except Exception as exc:
+        logger.warning(
+            "alertmanager_llm_inflight_lock_failed_fail_open",
+            fingerprint=fingerprint,
+            alert_id=alert_id,
+            error=str(exc),
+        )
+        return True  # fail open: a lock/Redis error must not block analysis of the alert
--- a/apps/api/src/services/openclaw.py
+++ b/apps/api/src/services/openclaw.py
@@ -124,6 +124,21 @@ def _backfill_kubectl_command(proposal: dict, tools: list) -> None:
 # OpenClaw Service
 # =============================================================================

+def _build_alert_cache_context_hash(alert_context: dict | None) -> str:
+    """Join the alert's stable identity fields into a ':'-separated LLM cache scope string."""
+
+    if not alert_context:
+        return ""  # empty scope: _generate_cache_key then hashes the full prompt instead
+    # alertname is the primary identifier; alert_type is used only when alertname is empty.
+    alertname = alert_context.get("alertname") or alert_context.get("alert_type", "")
+    category = alert_context.get("alert_category", "")
+    namespace = alert_context.get("namespace", "")
+    target = alert_context.get("target_resource", "")
+    severity = alert_context.get("severity", "")
+    fingerprint = alert_context.get("fingerprint", "")
+    return f"{alertname}:{category}:{namespace}:{target}:{severity}:{fingerprint}"
+
+
 class OpenClawService:
    """
    OpenClaw AI 決策服務 - True LLM + SignOz Integration
@@ -727,9 +742,19 @@ class OpenClawService:
        """
        生成 LLM 快取鍵

-        使用 prompt 內容的 SHA256 作為快取鍵，確保相同問題不重複呼叫 LLM
+        有告警上下文時，使用 prompt family + 穩定告警維度，避免 annotations /
+        SignOz 即時數值讓同一告警每 20 秒打穿快取；沒有上下文時仍用完整 prompt。
        """
-        content = f"{prompt}:{context_hash}"
+        if context_hash:
+            prompt_family_source = (
+                "openclaw_alert_analysis"
+                if "## Alert Data:" in prompt
+                else prompt[:512]
+            )
+            prompt_family = hashlib.sha256(prompt_family_source.encode()).hexdigest()[:8]
+            content = f"{prompt_family}:{context_hash}"
+        else:
+            content = prompt
        hash_digest = hashlib.sha256(content.encode()).hexdigest()[:16]
        return f"llm_cache:{hash_digest}"

@@ -760,12 +785,7 @@ class OpenClawService:
        # 2026-04-16 ogt + Claude Sonnet 4.6: 修復 — alertname 才是主要識別符
        # 舊版用 alert_type:target_resource → 不同告警 (e.g. PostgreSQLDiskGrowth vs PodCrashLoop)
        # 在 alert_type="custom" 時共用同一快取鍵 → 全部回傳相同 LLM 結果
-        context_hash = ""
-        if alert_context:
-            # alertname 優先；無 alertname 時 fallback 到 alert_type
-            _alertname = alert_context.get("alertname") or alert_context.get("alert_type", "")
-            _target = alert_context.get("target_resource", "")
-            context_hash = f"{_alertname}:{_target}"
+        context_hash = _build_alert_cache_context_hash(alert_context)

        cache_key = self._generate_cache_key(prompt, context_hash)

--- a/apps/api/tests/test_alertmanager_rule_bypass.py
+++ b/apps/api/tests/test_alertmanager_rule_bypass.py
@@ -4,6 +4,10 @@ from src.api.v1.webhooks import (
    _should_bypass_alertmanager_llm,
    _should_use_alertmanager_rule_first,
 )
+from src.services.alertmanager_llm_guard import (
+    ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS,
+    alertmanager_llm_inflight_key,
+)
 from src.services.decision_manager import (
    _is_host_layer_ssh_category,
    _is_non_k8s_host_category,
@@ -100,6 +104,13 @@ def test_backup_failure_blocks_k8s_auto_execute():
    assert _is_non_k8s_host_category("infrastructure") is False


+def test_alertmanager_llm_inflight_lock_key_is_fingerprint_scoped():
+    fingerprint = "abc123"  # arbitrary sample fingerprint
+    # Pins the exact Redis key format and the default lock TTL.
+    assert alertmanager_llm_inflight_key(fingerprint) == "alertmanager:llm_inflight:abc123"
+    assert ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS == 600  # seconds, i.e. 10 minutes
+
+
 def test_resolved_guard_stamp_without_timestamp_is_clean():
    assert _format_resolved_guard_stamp(None) == "✅ 此事件已解決"

--- a/apps/api/tests/test_openclaw_cache_key.py
+++ b/apps/api/tests/test_openclaw_cache_key.py
@@ -0,0 +1,42 @@
+from src.services.openclaw import OpenClawService, _build_alert_cache_context_hash
+
+
+def test_openclaw_cache_key_uses_stable_alert_scope_when_context_hash_exists():
+    service = object.__new__(OpenClawService)  # bare instance: __init__ is not run
+    context_hash = "HostBackupFailed:backup_failure:awoooi-prod:awoooi-frequent:critical:fp-1"
+    prompt_a = "System prompt and instructions\n\n## Alert Data:\ncurrent_value=1"
+    prompt_b = "System prompt and instructions\n\n## Alert Data:\ncurrent_value=2"
+    # Prompts differ only in the volatile value; the shared context_hash must give one key.
+    assert service._generate_cache_key(prompt_a, context_hash) == service._generate_cache_key(
+        prompt_b,
+        context_hash,
+    )
+
+
+def test_openclaw_cache_key_keeps_full_prompt_specificity_without_context_hash():
+    service = object.__new__(OpenClawService)  # bare instance: __init__ is not run
+    prompt_a = "System prompt and instructions\n\n## Alert Data:\ncurrent_value=1"
+    prompt_b = "System prompt and instructions\n\n## Alert Data:\ncurrent_value=2"
+    # No context_hash is passed, so keys for two different prompts must differ.
+    assert service._generate_cache_key(prompt_a) != service._generate_cache_key(prompt_b)
+
+
+def test_openclaw_alert_cache_context_hash_ignores_dynamic_annotations():
+    base_context = {
+        "alertname": "HostBackupFailed",
+        "alert_category": "backup_failure",
+        "namespace": "awoooi-prod",
+        "target_resource": "awoooi-frequent",
+        "severity": "critical",
+        "fingerprint": "fp-1",
+        "annotations": {"description": "failed at 14:00"},  # volatile field
+    }
+    next_context = {
+        **base_context,
+        "annotations": {"description": "failed at 14:01"},  # changed volatile field
+        "message": "new volatile message",  # extra volatile field
+    }
+    # Only annotations/message changed, so the cache scope string must be identical.
+    assert _build_alert_cache_context_hash(base_context) == _build_alert_cache_context_hash(
+        next_context
+    )
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -6,6 +6,21 @@

 ---

+## 2026-05-01 | LLM 鬼循環治理 — in-flight lock + stable cache + no-retry 2xx
+
+Claude Code 成本評估指出真正瓶頸不是外部 AI 費用，而是同一告警 0 秒重入、20 秒週期反覆呼叫 LLM、以及 HTTP 500 讓 Alertmanager 立即重試。結論：先修飛輪，再談 Gemini/Groq/Claude 訂閱；健康狀態下外部 provider 只應作為 capped fallback。
+
+### 完成
+- Alertmanager 同指紋新告警在排入背景 LLM 前先拿 Redis in-flight lock，TTL 10 分鐘；同秒或短時間重複 delivery 不再各自 spawn LLM task。
+- Alertmanager grouping 失敗改 fail-open 並留 log，不再因聚合服務小故障回 500 造成 Alertmanager retry storm。
+- Alertmanager 內部處理例外改成已接收的降級 2xx 回應，避免外部 retry 把同一事件打成 LLM 風暴；安全拒絕如外網來源仍維持 403。
+- OpenClaw cache key 改成 `prompt_family + alertname/category/namespace/target/severity/fingerprint`；annotations、message、SignOz 即時數值變動不再讓同一告警每次 miss cache。
+- 補 LLM cache / Alertmanager in-flight lock 單元測試，鎖住重複告警不得打穿 cache 的行為。
+
+### 驗證
+- `python3 -m py_compile apps/api/src/api/v1/webhooks.py apps/api/src/services/openclaw.py` 通過。
+- `cd apps/api && pytest tests/test_alertmanager_rule_bypass.py tests/test_openclaw_cache_key.py tests/test_callback_dispatcher.py tests/test_telegram_button_consistency.py -q` → 60 passed。
+
 ## 2026-05-01 | HostBackupFailed rule-first e2e 補洞

 Live e2e 用 `HostBackupFailed` 打 Alertmanager 後發現 aged backup 告警會被分類成 `backup_failure`，未命中原本只允許 `host_resource` 的 rule-first gate，導致又進 OpenClaw LLM。
--- a/docs/adr/ADR-058-host-auto-repair-ssh-whitelist.md
+++ b/docs/adr/ADR-058-host-auto-repair-ssh-whitelist.md
@@ -182,6 +182,29 @@ If SSH MCP fails, the incident must not silently become a manual approval card;

 ---

+## Appendix C — LLM Ghost Loop Controls (2026-05-01)
+
+Alertmanager 重複 delivery、修復失敗後告警仍 firing、以及 provider timeout/500 會形成「告警 → LLM → 修復失敗 → 20 秒後再 LLM」的鬼循環。這不是 AI 訂閱費用問題，而是飛輪收斂問題。
+
+### 必須維持的控制
+
+| 控制 | 行為 |
+|------|------|
+| Alertmanager in-flight lock | 同一 fingerprint 排入背景 LLM 前必須取得 Redis `alertmanager:llm_inflight:{fingerprint}`，TTL 10 分鐘 |
+| Stable LLM cache key | 有 alert context 時，OpenClaw cache key 不得使用完整動態 prompt；必須使用 prompt family + stable alert dimensions |
+| No retry storm | Alertmanager webhook 已完成來源驗證後，內部處理錯誤應回 2xx degraded accepted，不回 500 觸發 Alertmanager 重試 |
+| Learning after blocked automation | executor 後成功/失敗必須寫 KM；executor 前 guard block 也應優先沉澱成 KM/AOL/timeline，避免下一次同告警重跑完整 LLM |
+| Paid provider policy | Gemini/Groq/Claude/OpenRouter 只能作為 rate-limited fallback 或高複雜度專家路徑；不得用付費 provider 掩蓋 dedupe/cache/retry 問題 |
+
+### 健康狀態判準
+
+- 同一 fingerprint 10 分鐘內最多一個背景 LLM analysis task（Redis 不可用時 in-flight lock 採 fail-open，此上限不保證）。
+- 重複告警應命中 DB convergence、Redis in-flight lock、rule-first、Playbook/KM/RAG、或 LLM cache 其中一層。
+- 外部 provider 用量應接近「新問題數」而不是「告警 delivery 數」。
+- HTTP 500 比例不得由已接收告警的後處理錯誤造成；若有 500，需先確認是否在來源驗證/反序列化前失敗。
+
+---
+
 ## 首席架構師 Review 記錄 (2026-04-05)

 評分：**72/100 → 修正後 88/100**