diff --git a/.agents/skills/03-openclaw-cognitive-expert.md b/.agents/skills/03-openclaw-cognitive-expert.md index 1770d7a0..8830ea70 100644 --- a/.agents/skills/03-openclaw-cognitive-expert.md +++ b/.agents/skills/03-openclaw-cognitive-expert.md @@ -10,11 +10,11 @@ | 欄位 | 值 | |------|-----| -| **版本** | v1.7 | +| **版本** | v1.8 | | **建立日期** | 2026-03-20 (台北) | | **建立者** | Claude Code | -| **最後修改** | 2026-03-31 18:00 (台北) | -| **修改者** | Claude Code (首席架構師) | +| **最後修改** | 2026-05-01 15:30 (台北) | +| **修改者** | Codex | ### 變更紀錄 @@ -28,6 +28,7 @@ | v1.5 | 2026-03-27 | Claude Code | Stream Key 統一 + 告警去重機制 | | v1.6 | 2026-03-27 | Claude Code | **P1 優化: 稍後/靜默按鈕** | | v1.7 | 2026-03-31 | Claude Code | **Phase 22: OpenClaw + Nemotron 協作 (ADR-044)** | +| v1.8 | 2026-05-01 | Codex | **LLM 鬼循環治理: stable alert cache key + no裸奔重試** | --- @@ -115,6 +116,18 @@ async def analyze_with_ai(context: str) -> str: response = await _call_ollama(context) ``` +#### 2.1 告警快取鍵必須使用穩定維度 + +告警分析的 prompt 會包含 annotations、SignOz 即時數值、MCP evidence 等動態資料;不得把完整 prompt 當成同一告警的唯一 cache key,否則 firing 告警每 20 秒都會 miss cache。 + +正確維度: + +``` +prompt_family + alertname + alert_category + namespace + target_resource + severity + fingerprint +``` + +禁止把 `annotations.description`、`message`、即時 metrics 數值、trace URL 當成重複告警 cache key 的必要組成。需要重新分析時,應由 fingerprint 變化、人工刷新、Playbook/KM 版本變化、或明確 TTL 到期觸發。 + ### 3. 
Multi-Sig 動作必須 Dry-Run ```python diff --git a/.agents/skills/08-model-router-expert.md b/.agents/skills/08-model-router-expert.md index a74897b2..e7ad0bdb 100644 --- a/.agents/skills/08-model-router-expert.md +++ b/.agents/skills/08-model-router-expert.md @@ -1,8 +1,8 @@ # Skill 08: Model Router Expert -> 版本: v1.1 +> 版本: v1.2 > 建立: 2026-03-26 (台北時區) -> 更新: 2026-03-29 (加入 NVIDIA Nemotron 整合) +> 更新: 2026-05-01 (加入 LLM ghost-loop 成本治理) > 管轄: Phase 13.3 智能路由、複雜度評估、意圖分類、Tool Calling 路由 --- @@ -138,6 +138,20 @@ alerts: action: notify_admin ``` +### Provider 成本治理鐵律 + +外部 AI 費用不是第一層問題。當同一告警形成鬼循環時,任何 provider 都會被放大;先修 dedupe/cache/retry,再調 provider。 + +| 狀態 | Router 行為 | +|------|-------------| +| 同 fingerprint 10 分鐘內重複 delivery | 命中 Alertmanager in-flight lock / DB convergence,不進 provider routing | +| 同告警 annotations 或 metrics 變動 | 命中 stable LLM cache,不因動態 prompt 重新計費 | +| provider timeout / 500 | 走 circuit breaker + fallback,但 webhook 不得回 500 造成 Alertmanager retry storm | +| 高複雜度且本地模型信心不足 | 才允許 Gemini/Groq/Claude/OpenRouter 等外部 capped fallback | +| 訂閱方案評估 | 以「新問題數」估算,不以 retry storm 的 delivery 數估算 | + +健康飛輪下,外部 provider 用量應接近每天新告警/新 incident 數,而不是 Alertmanager 重送次數。Gemini/Groq/Claude 只能補專業度與 fallback 韌性,不能拿來遮住收斂失效。 + --- ## Fallback 策略 (ADR-006 v1.3 + ADR-036) diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py index 81e6c746..ef312579 100644 --- a/apps/api/src/api/v1/webhooks.py +++ b/apps/api/src/api/v1/webhooks.py @@ -53,6 +53,10 @@ from src.models.approval import ( # [首席架構師] 移除 generate_alert_fingerprint 直接 import,改用 AlertAnalyzer.generate_fingerprint v1.2 2026-04-01 Asia/Taipei from src.models.webhook import AlertPayload, AlertResponse from src.services.alert_analyzer_service import AlertAnalyzer +from src.services.alertmanager_llm_guard import ( + ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS, + try_acquire_alertmanager_llm_lock, +) from src.services.approval_db import get_approval_service # Phase 17 P0: Service 層 (消除 Router 直接存取 Redis) @@ 
-2150,12 +2154,22 @@ async def alertmanager_webhook( # 2026-04-14 Claude Haiku 4.5 Asia/Taipei # 位置:指紋生成後、LLM 分析前(短路子告警) # ========================================================================== - grouping_result = await get_alert_grouping_service().evaluate( - alertname=alertname, - namespace=namespace, - fingerprint=fingerprint, - ) - if grouping_result.is_grouped: + try: + grouping_result = await get_alert_grouping_service().evaluate( + alertname=alertname, + namespace=namespace, + fingerprint=fingerprint, + ) + except Exception as e: + grouping_result = None + logger.warning( + "alertmanager_grouping_failed_fail_open", + alert_id=alert_id, + fingerprint=fingerprint, + error=str(e), + ) + + if grouping_result and grouping_result.is_grouped: logger.info( "alertmanager_grouped_skip", alert_id=alert_id, @@ -2258,6 +2272,21 @@ async def alertmanager_webhook( approval_created=False, ) + if not await try_acquire_alertmanager_llm_lock(fingerprint, alert_id): + logger.info( + "alertmanager_llm_inflight_suppressed", + alert_id=alert_id, + fingerprint=fingerprint, + ttl_seconds=ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS, + ) + return AlertResponse( + success=True, + message="🛡️ 告警已由同指紋背景 AI 分析處理中,跳過重複 LLM 呼叫", + alert_id=alert_id, + approval_created=False, + converged=True, + ) + # ========================================================================== # ADR-089 (2026-04-17 ogt + Claude Sonnet 4.6): 新告警 — 背景 LLM 分析 # 立即回傳 202,AI 辯證在背景非同步執行 @@ -2271,6 +2300,7 @@ async def alertmanager_webhook( "source": "alertmanager", "target_resource": target_resource, "namespace": namespace, + "fingerprint": fingerprint, "message": message, "annotations": dict(alert.annotations) if alert.annotations else {}, "metrics": {}, @@ -2303,11 +2333,18 @@ async def alertmanager_webhook( ) except Exception as e: - logger.error("alertmanager_error", error=str(e)) - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"Failed to process alert: {str(e)}", - ) 
from e + logger.error( + "alertmanager_degraded_accepted_no_retry", + alert_id=alert_id, + fingerprint=fingerprint, + error=str(e), + ) + return AlertResponse( + success=False, + message="⚠️ 告警已接收但處理降級,避免 Alertmanager retry storm;已交由背景治理/人工介入追蹤", + alert_id=alert_id, + approval_created=False, + ) @router.get( diff --git a/apps/api/src/services/alertmanager_llm_guard.py b/apps/api/src/services/alertmanager_llm_guard.py new file mode 100644 index 00000000..72b0a713 --- /dev/null +++ b/apps/api/src/services/alertmanager_llm_guard.py @@ -0,0 +1,45 @@ +"""Alertmanager LLM storm guards. + +Service-layer Redis helpers used by webhook routers to avoid spawning duplicate +LLM analysis tasks for the same Alertmanager fingerprint. +""" + +from src.core.logging import get_logger +from src.core.redis_client import get_redis + +logger = get_logger("awoooi.alertmanager_llm_guard") + +ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS = 600 + + +def alertmanager_llm_inflight_key(fingerprint: str) -> str: + """Return the Redis lock key for one Alertmanager fingerprint entering AI analysis.""" + + return f"alertmanager:llm_inflight:{fingerprint}" + + +async def try_acquire_alertmanager_llm_lock( + fingerprint: str, + alert_id: str, + *, + ttl_seconds: int = ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS, +) -> bool: + """Acquire the per-fingerprint in-flight lock (held for ttl_seconds); fail open on Redis errors.""" + + try: + redis = get_redis() + acquired = await redis.set( + alertmanager_llm_inflight_key(fingerprint), + alert_id, + ex=ttl_seconds, + nx=True, + ) + return bool(acquired) + except Exception as exc: + logger.warning( + "alertmanager_llm_inflight_lock_failed_fail_open", + fingerprint=fingerprint, + alert_id=alert_id, + error=str(exc), + ) + return True diff --git a/apps/api/src/services/openclaw.py b/apps/api/src/services/openclaw.py index eddcf518..c472ca92 100644 --- a/apps/api/src/services/openclaw.py +++ b/apps/api/src/services/openclaw.py @@ -124,6 +124,21 @@ def 
_backfill_kubectl_command(proposal: dict, tools: list) -> None: # OpenClaw Service # ============================================================================= +def _build_alert_cache_context_hash(alert_context: dict | None) -> str: + """Build a stable LLM cache scope for repeat alerts without dynamic annotations.""" + + if not alert_context: + return "" + + alertname = alert_context.get("alertname") or alert_context.get("alert_type", "") + category = alert_context.get("alert_category", "") + namespace = alert_context.get("namespace", "") + target = alert_context.get("target_resource", "") + severity = alert_context.get("severity", "") + fingerprint = alert_context.get("fingerprint", "") + return f"{alertname}:{category}:{namespace}:{target}:{severity}:{fingerprint}" + + class OpenClawService: """ OpenClaw AI 決策服務 - True LLM + SignOz Integration @@ -727,9 +742,19 @@ class OpenClawService: """ 生成 LLM 快取鍵 - 使用 prompt 內容的 SHA256 作為快取鍵,確保相同問題不重複呼叫 LLM + 有告警上下文時,使用 prompt family + 穩定告警維度,避免 annotations / + SignOz 即時數值讓同一告警每 20 秒打穿快取;沒有上下文時仍用完整 prompt。 """ - content = f"{prompt}:{context_hash}" + if context_hash: + prompt_family_source = ( + "openclaw_alert_analysis" + if "## Alert Data:" in prompt + else prompt[:512] + ) + prompt_family = hashlib.sha256(prompt_family_source.encode()).hexdigest()[:8] + content = f"{prompt_family}:{context_hash}" + else: + content = prompt hash_digest = hashlib.sha256(content.encode()).hexdigest()[:16] return f"llm_cache:{hash_digest}" @@ -760,12 +785,7 @@ class OpenClawService: # 2026-04-16 ogt + Claude Sonnet 4.6: 修復 — alertname 才是主要識別符 # 舊版用 alert_type:target_resource → 不同告警 (e.g. 
PostgreSQLDiskGrowth vs PodCrashLoop) # 在 alert_type="custom" 時共用同一快取鍵 → 全部回傳相同 LLM 結果 - context_hash = "" - if alert_context: - # alertname 優先;無 alertname 時 fallback 到 alert_type - _alertname = alert_context.get("alertname") or alert_context.get("alert_type", "") - _target = alert_context.get("target_resource", "") - context_hash = f"{_alertname}:{_target}" + context_hash = _build_alert_cache_context_hash(alert_context) cache_key = self._generate_cache_key(prompt, context_hash) diff --git a/apps/api/tests/test_alertmanager_rule_bypass.py b/apps/api/tests/test_alertmanager_rule_bypass.py index 80cd892d..49052fb3 100644 --- a/apps/api/tests/test_alertmanager_rule_bypass.py +++ b/apps/api/tests/test_alertmanager_rule_bypass.py @@ -4,6 +4,10 @@ from src.api.v1.webhooks import ( _should_bypass_alertmanager_llm, _should_use_alertmanager_rule_first, ) +from src.services.alertmanager_llm_guard import ( + ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS, + alertmanager_llm_inflight_key, +) from src.services.decision_manager import ( _is_host_layer_ssh_category, _is_non_k8s_host_category, @@ -100,6 +104,13 @@ def test_backup_failure_blocks_k8s_auto_execute(): assert _is_non_k8s_host_category("infrastructure") is False +def test_alertmanager_llm_inflight_lock_key_is_fingerprint_scoped(): + fingerprint = "abc123" + + assert alertmanager_llm_inflight_key(fingerprint) == "alertmanager:llm_inflight:abc123" + assert ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS == 600 + + def test_resolved_guard_stamp_without_timestamp_is_clean(): assert _format_resolved_guard_stamp(None) == "✅ 此事件已解決" diff --git a/apps/api/tests/test_openclaw_cache_key.py b/apps/api/tests/test_openclaw_cache_key.py new file mode 100644 index 00000000..814dfb9d --- /dev/null +++ b/apps/api/tests/test_openclaw_cache_key.py @@ -0,0 +1,42 @@ +from src.services.openclaw import OpenClawService, _build_alert_cache_context_hash + + +def test_openclaw_cache_key_uses_stable_alert_scope_when_context_hash_exists(): + service = 
object.__new__(OpenClawService) + context_hash = "HostBackupFailed:backup_failure:awoooi-prod:awoooi-frequent:critical:fp-1" + prompt_a = "System prompt and instructions\n\n## Alert Data:\ncurrent_value=1" + prompt_b = "System prompt and instructions\n\n## Alert Data:\ncurrent_value=2" + + assert service._generate_cache_key(prompt_a, context_hash) == service._generate_cache_key( + prompt_b, + context_hash, + ) + + +def test_openclaw_cache_key_keeps_full_prompt_specificity_without_context_hash(): + service = object.__new__(OpenClawService) + prompt_a = "System prompt and instructions\n\n## Alert Data:\ncurrent_value=1" + prompt_b = "System prompt and instructions\n\n## Alert Data:\ncurrent_value=2" + + assert service._generate_cache_key(prompt_a) != service._generate_cache_key(prompt_b) + + +def test_openclaw_alert_cache_context_hash_ignores_dynamic_annotations(): + base_context = { + "alertname": "HostBackupFailed", + "alert_category": "backup_failure", + "namespace": "awoooi-prod", + "target_resource": "awoooi-frequent", + "severity": "critical", + "fingerprint": "fp-1", + "annotations": {"description": "failed at 14:00"}, + } + next_context = { + **base_context, + "annotations": {"description": "failed at 14:01"}, + "message": "new volatile message", + } + + assert _build_alert_cache_context_hash(base_context) == _build_alert_cache_context_hash( + next_context + ) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 84a9ab10..e155d206 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -6,6 +6,21 @@ --- +## 2026-05-01 | LLM 鬼循環治理 — in-flight lock + stable cache + no-retry 2xx + +Claude Code 成本評估指出真正瓶頸不是外部 AI 費用,而是同一告警 0 秒重入、20 秒週期反覆呼叫 LLM、以及 HTTP 500 讓 Alertmanager 立即重試。結論:先修飛輪,再談 Gemini/Groq/Claude 訂閱;健康狀態下外部 provider 只應作為 capped fallback。 + +### 完成 +- Alertmanager 同指紋新告警在排入背景 LLM 前先拿 Redis in-flight lock,TTL 10 分鐘;同秒或短時間重複 delivery 不再各自 spawn LLM task。 +- Alertmanager grouping 失敗改 fail-open 並留 log,不再因聚合服務小故障回 500 造成 Alertmanager retry storm。 +- Alertmanager 
內部處理例外改成已接收的降級 2xx 回應,避免外部 retry 把同一事件打成 LLM 風暴;安全拒絕如外網來源仍維持 403。 +- OpenClaw cache key 改成 `prompt_family + alertname/category/namespace/target/severity/fingerprint`;annotations、message、SignOz 即時數值變動不再讓同一告警每次 miss cache。 +- 補 LLM cache / Alertmanager in-flight lock 單元測試,鎖住重複告警不得打穿 cache 的行為。 + +### 驗證 +- `python3 -m py_compile apps/api/src/api/v1/webhooks.py apps/api/src/services/openclaw.py` 通過。 +- `cd apps/api && pytest tests/test_alertmanager_rule_bypass.py tests/test_openclaw_cache_key.py tests/test_callback_dispatcher.py tests/test_telegram_button_consistency.py -q` → 60 passed。 + ## 2026-05-01 | HostBackupFailed rule-first e2e 補洞 Live e2e 用 `HostBackupFailed` 打 Alertmanager 後發現 aged backup 告警會被分類成 `backup_failure`,未命中原本只允許 `host_resource` 的 rule-first gate,導致又進 OpenClaw LLM。 diff --git a/docs/adr/ADR-058-host-auto-repair-ssh-whitelist.md b/docs/adr/ADR-058-host-auto-repair-ssh-whitelist.md index faa883d6..f59da313 100644 --- a/docs/adr/ADR-058-host-auto-repair-ssh-whitelist.md +++ b/docs/adr/ADR-058-host-auto-repair-ssh-whitelist.md @@ -182,6 +182,29 @@ If SSH MCP fails, the incident must not silently become a manual approval card; --- +## Appendix C — LLM Ghost Loop Controls (2026-05-01) + +Alertmanager 重複 delivery、修復失敗後告警仍 firing、以及 provider timeout/500 會形成「告警 → LLM → 修復失敗 → 20 秒後再 LLM」的鬼循環。這不是 AI 訂閱費用問題,而是飛輪收斂問題。 + +### 必須維持的控制 + +| 控制 | 行為 | +|------|------| +| Alertmanager in-flight lock | 同一 fingerprint 排入背景 LLM 前必須取得 Redis `alertmanager:llm_inflight:{fingerprint}`,TTL 10 分鐘 | +| Stable LLM cache key | 有 alert context 時,OpenClaw cache key 不得使用完整動態 prompt;必須使用 prompt family + stable alert dimensions | +| No retry storm | Alertmanager webhook 已完成來源驗證後,內部處理錯誤應回 2xx degraded accepted,不回 500 觸發 Alertmanager 重試 | +| Learning after blocked automation | executor 後成功/失敗必須寫 KM;executor 前 guard block 也應優先沉澱成 KM/AOL/timeline,避免下一次同告警重跑完整 LLM | +| Paid provider policy | Gemini/Groq/Claude/OpenRouter 只能作為 rate-limited fallback 或高複雜度專家路徑;不得用付費 provider 掩蓋 
dedupe/cache/retry 問題 | + +### 健康狀態判準 + +- Redis 可用時,同一 fingerprint 10 分鐘內最多一個背景 LLM analysis task(in-flight lock 在 Redis 故障時採 fail-open,此時不在本保證範圍內)。 +- 重複告警應命中 DB convergence、Redis in-flight lock、rule-first、Playbook/KM/RAG、或 LLM cache 其中一層。 +- 外部 provider 用量應接近「新問題數」而不是「告警 delivery 數」。 +- HTTP 500 比例不得由已接收告警的後處理錯誤造成;若有 500,需先確認是否在來源驗證/反序列化前失敗。 + +--- + ## 首席架構師 Review 記錄 (2026-04-05) 評分:**72/100 → 修正後 88/100**