fix(aiops): suppress repeated llm alert loops
This commit is contained in:
@@ -10,11 +10,11 @@
|
||||
|
||||
| 欄位 | 值 |
|
||||
|------|-----|
|
||||
| **版本** | v1.7 |
|
||||
| **版本** | v1.8 |
|
||||
| **建立日期** | 2026-03-20 (台北) |
|
||||
| **建立者** | Claude Code |
|
||||
| **最後修改** | 2026-03-31 18:00 (台北) |
|
||||
| **修改者** | Claude Code (首席架構師) |
|
||||
| **最後修改** | 2026-05-01 15:30 (台北) |
|
||||
| **修改者** | Codex |
|
||||
|
||||
### 變更紀錄
|
||||
|
||||
@@ -28,6 +28,7 @@
|
||||
| v1.5 | 2026-03-27 | Claude Code | Stream Key 統一 + 告警去重機制 |
|
||||
| v1.6 | 2026-03-27 | Claude Code | **P1 優化: 稍後/靜默按鈕** |
|
||||
| v1.7 | 2026-03-31 | Claude Code | **Phase 22: OpenClaw + Nemotron 協作 (ADR-044)** |
|
||||
| v1.8 | 2026-05-01 | Codex | **LLM 鬼循環治理: stable alert cache key + 禁止無防護重試 (no unguarded retry)** |
|
||||
|
||||
---
|
||||
|
||||
@@ -115,6 +116,18 @@ async def analyze_with_ai(context: str) -> str:
|
||||
response = await _call_ollama(context)
|
||||
```
|
||||
|
||||
#### 2.1 告警快取鍵必須使用穩定維度
|
||||
|
||||
告警分析的 prompt 會包含 annotations、SignOz 即時數值、MCP evidence 等動態資料;不得把完整 prompt 當成同一告警的唯一 cache key,否則 firing 告警每 20 秒都會 miss cache。
|
||||
|
||||
正確維度:
|
||||
|
||||
```
|
||||
prompt_family + alertname + alert_category + namespace + target_resource + severity + fingerprint
|
||||
```
|
||||
|
||||
禁止把 `annotations.description`、`message`、即時 metrics 數值、trace URL 當成重複告警 cache key 的必要組成。需要重新分析時,應由 fingerprint 變化、人工刷新、Playbook/KM 版本變化、或明確 TTL 到期觸發。
|
||||
|
||||
### 3. Multi-Sig 動作必須 Dry-Run
|
||||
|
||||
```python
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
# Skill 08: Model Router Expert
|
||||
|
||||
> 版本: v1.1
|
||||
> 版本: v1.2
|
||||
> 建立: 2026-03-26 (台北時區)
|
||||
> 更新: 2026-03-29 (加入 NVIDIA Nemotron 整合)
|
||||
> 更新: 2026-05-01 (加入 LLM ghost-loop 成本治理)
|
||||
> 管轄: Phase 13.3 智能路由、複雜度評估、意圖分類、Tool Calling 路由
|
||||
|
||||
---
|
||||
@@ -138,6 +138,20 @@ alerts:
|
||||
action: notify_admin
|
||||
```
|
||||
|
||||
### Provider 成本治理鐵律
|
||||
|
||||
外部 AI 費用不是第一層問題。當同一告警形成鬼循環時,任何 provider 都會被放大;先修 dedupe/cache/retry,再調 provider。
|
||||
|
||||
| 狀態 | Router 行為 |
|
||||
|------|-------------|
|
||||
| 同 fingerprint 10 分鐘內重複 delivery | 命中 Alertmanager in-flight lock / DB convergence,不進 provider routing |
|
||||
| 同告警 annotations 或 metrics 變動 | 命中 stable LLM cache,不因動態 prompt 重新計費 |
|
||||
| provider timeout / 500 | 走 circuit breaker + fallback,但 webhook 不得回 500 造成 Alertmanager retry storm |
|
||||
| 高複雜度且本地模型信心不足 | 才允許 Gemini/Groq/Claude/OpenRouter 等外部 capped fallback |
|
||||
| 訂閱方案評估 | 以「新問題數」估算,不以 retry storm 的 delivery 數估算 |
|
||||
|
||||
健康飛輪下,外部 provider 用量應接近每天新告警/新 incident 數,而不是 Alertmanager 重送次數。Gemini/Groq/Claude 只能補專業度與 fallback 韌性,不能拿來遮住收斂失效。
|
||||
|
||||
---
|
||||
|
||||
## Fallback 策略 (ADR-006 v1.3 + ADR-036)
|
||||
|
||||
@@ -53,6 +53,10 @@ from src.models.approval import (
|
||||
# [首席架構師] 移除 generate_alert_fingerprint 直接 import,改用 AlertAnalyzer.generate_fingerprint v1.2 2026-04-01 Asia/Taipei
|
||||
from src.models.webhook import AlertPayload, AlertResponse
|
||||
from src.services.alert_analyzer_service import AlertAnalyzer
|
||||
from src.services.alertmanager_llm_guard import (
|
||||
ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS,
|
||||
try_acquire_alertmanager_llm_lock,
|
||||
)
|
||||
from src.services.approval_db import get_approval_service
|
||||
|
||||
# Phase 17 P0: Service 層 (消除 Router 直接存取 Redis)
|
||||
@@ -2150,12 +2154,22 @@ async def alertmanager_webhook(
|
||||
# 2026-04-14 Claude Haiku 4.5 Asia/Taipei
|
||||
# 位置:指紋生成後、LLM 分析前(短路子告警)
|
||||
# ==========================================================================
|
||||
grouping_result = await get_alert_grouping_service().evaluate(
|
||||
alertname=alertname,
|
||||
namespace=namespace,
|
||||
fingerprint=fingerprint,
|
||||
)
|
||||
if grouping_result.is_grouped:
|
||||
try:
|
||||
grouping_result = await get_alert_grouping_service().evaluate(
|
||||
alertname=alertname,
|
||||
namespace=namespace,
|
||||
fingerprint=fingerprint,
|
||||
)
|
||||
except Exception as e:
|
||||
grouping_result = None
|
||||
logger.warning(
|
||||
"alertmanager_grouping_failed_fail_open",
|
||||
alert_id=alert_id,
|
||||
fingerprint=fingerprint,
|
||||
error=str(e),
|
||||
)
|
||||
|
||||
if grouping_result and grouping_result.is_grouped:
|
||||
logger.info(
|
||||
"alertmanager_grouped_skip",
|
||||
alert_id=alert_id,
|
||||
@@ -2258,6 +2272,21 @@ async def alertmanager_webhook(
|
||||
approval_created=False,
|
||||
)
|
||||
|
||||
if not await try_acquire_alertmanager_llm_lock(fingerprint, alert_id):
|
||||
logger.info(
|
||||
"alertmanager_llm_inflight_suppressed",
|
||||
alert_id=alert_id,
|
||||
fingerprint=fingerprint,
|
||||
ttl_seconds=ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS,
|
||||
)
|
||||
return AlertResponse(
|
||||
success=True,
|
||||
message="🛡️ 告警已由同指紋背景 AI 分析處理中,跳過重複 LLM 呼叫",
|
||||
alert_id=alert_id,
|
||||
approval_created=False,
|
||||
converged=True,
|
||||
)
|
||||
|
||||
# ==========================================================================
|
||||
# ADR-089 (2026-04-17 ogt + Claude Sonnet 4.6): 新告警 — 背景 LLM 分析
|
||||
# 立即回傳 202,AI 辯證在背景非同步執行
|
||||
@@ -2271,6 +2300,7 @@ async def alertmanager_webhook(
|
||||
"source": "alertmanager",
|
||||
"target_resource": target_resource,
|
||||
"namespace": namespace,
|
||||
"fingerprint": fingerprint,
|
||||
"message": message,
|
||||
"annotations": dict(alert.annotations) if alert.annotations else {},
|
||||
"metrics": {},
|
||||
@@ -2303,11 +2333,18 @@ async def alertmanager_webhook(
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error("alertmanager_error", error=str(e))
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=f"Failed to process alert: {str(e)}",
|
||||
) from e
|
||||
logger.error(
|
||||
"alertmanager_degraded_accepted_no_retry",
|
||||
alert_id=alert_id,
|
||||
fingerprint=fingerprint,
|
||||
error=str(e),
|
||||
)
|
||||
return AlertResponse(
|
||||
success=False,
|
||||
message="⚠️ 告警已接收但處理降級,避免 Alertmanager retry storm;已交由背景治理/人工介入追蹤",
|
||||
alert_id=alert_id,
|
||||
approval_created=False,
|
||||
)
|
||||
|
||||
|
||||
@router.get(
|
||||
|
||||
45
apps/api/src/services/alertmanager_llm_guard.py
Normal file
45
apps/api/src/services/alertmanager_llm_guard.py
Normal file
@@ -0,0 +1,45 @@
|
||||
"""Alertmanager LLM storm guards.
|
||||
|
||||
Service-layer Redis helpers used by webhook routers to avoid spawning duplicate
|
||||
LLM analysis tasks for the same Alertmanager fingerprint.
|
||||
"""
|
||||
|
||||
from src.core.logging import get_logger
|
||||
from src.core.redis_client import get_redis
|
||||
|
||||
logger = get_logger("awoooi.alertmanager_llm_guard")
|
||||
|
||||
ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS = 600
|
||||
|
||||
|
||||
def alertmanager_llm_inflight_key(fingerprint: str) -> str:
    """Build the Redis key that guards AI analysis for a single alert fingerprint.

    Args:
        fingerprint: Alertmanager fingerprint of the alert about to be analysed.

    Returns:
        The lock key, namespaced under ``alertmanager:llm_inflight:``.
    """
    lock_key = f"alertmanager:llm_inflight:{fingerprint}"
    return lock_key
|
||||
|
||||
|
||||
async def try_acquire_alertmanager_llm_lock(
    fingerprint: str,
    alert_id: str,
    *,
    ttl_seconds: int = ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS,
) -> bool:
    """Try to claim the in-flight LLM analysis lock for one alert fingerprint.

    Issues a Redis ``SET <key> <alert_id> NX EX <ttl_seconds>``. This function
    never releases the lock; it simply expires after ``ttl_seconds``, so repeat
    deliveries of the same fingerprint are suppressed for the whole TTL window,
    not only for same-second duplicates.

    The guard is deliberately fail-open: any situation in which the lock
    cannot be evaluated lets the caller proceed, because dropping an alert is
    worse than paying for one duplicate analysis.

    Args:
        fingerprint: Alertmanager fingerprint used to scope the lock.
        alert_id: Stored as the lock value so the holder can be identified.
        ttl_seconds: Lock lifetime in seconds.

    Returns:
        ``True`` when the caller should start LLM analysis (lock acquired, or
        the guard failed open); ``False`` when another delivery already holds
        the lock for this fingerprint.
    """
    if not fingerprint:
        # Without a fingerprint every alert would share the same Redis key and
        # unrelated alerts would suppress each other for the whole TTL.
        logger.warning(
            "alertmanager_llm_inflight_lock_skipped_empty_fingerprint",
            alert_id=alert_id,
        )
        return True

    try:
        redis = get_redis()
        acquired = await redis.set(
            alertmanager_llm_inflight_key(fingerprint),
            alert_id,
            ex=ttl_seconds,
            nx=True,
        )
        # SET ... NX yields a falsy result when the key already exists.
        return bool(acquired)
    except Exception as exc:
        # Boundary catch-all: a Redis failure must not block alert handling.
        logger.warning(
            "alertmanager_llm_inflight_lock_failed_fail_open",
            fingerprint=fingerprint,
            alert_id=alert_id,
            error=str(exc),
        )
        return True
|
||||
@@ -124,6 +124,21 @@ def _backfill_kubectl_command(proposal: dict, tools: list) -> None:
|
||||
# OpenClaw Service
|
||||
# =============================================================================
|
||||
|
||||
def _build_alert_cache_context_hash(alert_context: dict | None) -> str:
|
||||
"""Build a stable LLM cache scope for repeat alerts without dynamic annotations."""
|
||||
|
||||
if not alert_context:
|
||||
return ""
|
||||
|
||||
alertname = alert_context.get("alertname") or alert_context.get("alert_type", "")
|
||||
category = alert_context.get("alert_category", "")
|
||||
namespace = alert_context.get("namespace", "")
|
||||
target = alert_context.get("target_resource", "")
|
||||
severity = alert_context.get("severity", "")
|
||||
fingerprint = alert_context.get("fingerprint", "")
|
||||
return f"{alertname}:{category}:{namespace}:{target}:{severity}:{fingerprint}"
|
||||
|
||||
|
||||
class OpenClawService:
|
||||
"""
|
||||
OpenClaw AI 決策服務 - True LLM + SignOz Integration
|
||||
@@ -727,9 +742,19 @@ class OpenClawService:
|
||||
"""
|
||||
生成 LLM 快取鍵
|
||||
|
||||
使用 prompt 內容的 SHA256 作為快取鍵,確保相同問題不重複呼叫 LLM
|
||||
有告警上下文時,使用 prompt family + 穩定告警維度,避免 annotations /
|
||||
SignOz 即時數值讓同一告警每 20 秒打穿快取;沒有上下文時仍用完整 prompt。
|
||||
"""
|
||||
content = f"{prompt}:{context_hash}"
|
||||
if context_hash:
|
||||
prompt_family_source = (
|
||||
"openclaw_alert_analysis"
|
||||
if "## Alert Data:" in prompt
|
||||
else prompt[:512]
|
||||
)
|
||||
prompt_family = hashlib.sha256(prompt_family_source.encode()).hexdigest()[:8]
|
||||
content = f"{prompt_family}:{context_hash}"
|
||||
else:
|
||||
content = prompt
|
||||
hash_digest = hashlib.sha256(content.encode()).hexdigest()[:16]
|
||||
return f"llm_cache:{hash_digest}"
|
||||
|
||||
@@ -760,12 +785,7 @@ class OpenClawService:
|
||||
# 2026-04-16 ogt + Claude Sonnet 4.6: 修復 — alertname 才是主要識別符
|
||||
# 舊版用 alert_type:target_resource → 不同告警 (e.g. PostgreSQLDiskGrowth vs PodCrashLoop)
|
||||
# 在 alert_type="custom" 時共用同一快取鍵 → 全部回傳相同 LLM 結果
|
||||
context_hash = ""
|
||||
if alert_context:
|
||||
# alertname 優先;無 alertname 時 fallback 到 alert_type
|
||||
_alertname = alert_context.get("alertname") or alert_context.get("alert_type", "")
|
||||
_target = alert_context.get("target_resource", "")
|
||||
context_hash = f"{_alertname}:{_target}"
|
||||
context_hash = _build_alert_cache_context_hash(alert_context)
|
||||
|
||||
cache_key = self._generate_cache_key(prompt, context_hash)
|
||||
|
||||
|
||||
@@ -4,6 +4,10 @@ from src.api.v1.webhooks import (
|
||||
_should_bypass_alertmanager_llm,
|
||||
_should_use_alertmanager_rule_first,
|
||||
)
|
||||
from src.services.alertmanager_llm_guard import (
|
||||
ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS,
|
||||
alertmanager_llm_inflight_key,
|
||||
)
|
||||
from src.services.decision_manager import (
|
||||
_is_host_layer_ssh_category,
|
||||
_is_non_k8s_host_category,
|
||||
@@ -100,6 +104,13 @@ def test_backup_failure_blocks_k8s_auto_execute():
|
||||
assert _is_non_k8s_host_category("infrastructure") is False
|
||||
|
||||
|
||||
def test_alertmanager_llm_inflight_lock_key_is_fingerprint_scoped():
    """The lock key embeds the fingerprint and the lock TTL stays at 600 seconds."""
    lock_key = alertmanager_llm_inflight_key("abc123")
    expected_key = "alertmanager:llm_inflight:abc123"

    assert lock_key == expected_key
    assert ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS == 600
|
||||
|
||||
|
||||
def test_resolved_guard_stamp_without_timestamp_is_clean():
|
||||
assert _format_resolved_guard_stamp(None) == "✅ 此事件已解決"
|
||||
|
||||
|
||||
42
apps/api/tests/test_openclaw_cache_key.py
Normal file
42
apps/api/tests/test_openclaw_cache_key.py
Normal file
@@ -0,0 +1,42 @@
|
||||
from src.services.openclaw import OpenClawService, _build_alert_cache_context_hash
|
||||
|
||||
|
||||
def test_openclaw_cache_key_uses_stable_alert_scope_when_context_hash_exists():
    """Prompts that differ only in live values share a key when a context hash is given."""
    # Bypass __init__ so no real service dependencies are touched.
    svc = object.__new__(OpenClawService)
    scope = "HostBackupFailed:backup_failure:awoooi-prod:awoooi-frequent:critical:fp-1"
    first_prompt = "System prompt and instructions\n\n## Alert Data:\ncurrent_value=1"
    second_prompt = "System prompt and instructions\n\n## Alert Data:\ncurrent_value=2"

    first_key = svc._generate_cache_key(first_prompt, scope)
    second_key = svc._generate_cache_key(second_prompt, scope)

    assert first_key == second_key
|
||||
|
||||
|
||||
def test_openclaw_cache_key_keeps_full_prompt_specificity_without_context_hash():
    """Without a context hash, different prompts must map to different keys."""
    # Bypass __init__ so no real service dependencies are touched.
    svc = object.__new__(OpenClawService)
    first_prompt = "System prompt and instructions\n\n## Alert Data:\ncurrent_value=1"
    second_prompt = "System prompt and instructions\n\n## Alert Data:\ncurrent_value=2"

    first_key = svc._generate_cache_key(first_prompt)
    second_key = svc._generate_cache_key(second_prompt)

    assert first_key != second_key
|
||||
|
||||
|
||||
def test_openclaw_alert_cache_context_hash_ignores_dynamic_annotations():
    """Changing annotations or adding a message must not alter the cache scope."""
    original_alert = {
        "alertname": "HostBackupFailed",
        "alert_category": "backup_failure",
        "namespace": "awoooi-prod",
        "target_resource": "awoooi-frequent",
        "severity": "critical",
        "fingerprint": "fp-1",
        "annotations": {"description": "failed at 14:00"},
    }
    # Same alert one cycle later: only the volatile fields differ.
    refired_alert = dict(original_alert)
    refired_alert["annotations"] = {"description": "failed at 14:01"}
    refired_alert["message"] = "new volatile message"

    original_scope = _build_alert_cache_context_hash(original_alert)
    refired_scope = _build_alert_cache_context_hash(refired_alert)

    assert original_scope == refired_scope
|
||||
@@ -6,6 +6,21 @@
|
||||
|
||||
---
|
||||
|
||||
## 2026-05-01 | LLM 鬼循環治理 — in-flight lock + stable cache + no-retry 2xx
|
||||
|
||||
Claude Code 成本評估指出真正瓶頸不是外部 AI 費用,而是同一告警 0 秒重入、20 秒週期反覆呼叫 LLM、以及 HTTP 500 讓 Alertmanager 立即重試。結論:先修飛輪,再談 Gemini/Groq/Claude 訂閱;健康狀態下外部 provider 只應作為 capped fallback。
|
||||
|
||||
### 完成
|
||||
- Alertmanager 同指紋新告警在排入背景 LLM 前先拿 Redis in-flight lock,TTL 10 分鐘;同秒或短時間重複 delivery 不再各自 spawn LLM task。
|
||||
- Alertmanager grouping 失敗改 fail-open 並留 log,不再因聚合服務小故障回 500 造成 Alertmanager retry storm。
|
||||
- Alertmanager 內部處理例外改成已接收的降級 2xx 回應,避免外部 retry 把同一事件打成 LLM 風暴;安全拒絕如外網來源仍維持 403。
|
||||
- OpenClaw cache key 改成 `prompt_family + alertname/category/namespace/target/severity/fingerprint`;annotations、message、SignOz 即時數值變動不再讓同一告警每次 miss cache。
|
||||
- 補 LLM cache / Alertmanager in-flight lock 單元測試,鎖住重複告警不得打穿 cache 的行為。
|
||||
|
||||
### 驗證
|
||||
- `python3 -m py_compile apps/api/src/api/v1/webhooks.py apps/api/src/services/openclaw.py` 通過。
|
||||
- `cd apps/api && pytest tests/test_alertmanager_rule_bypass.py tests/test_openclaw_cache_key.py tests/test_callback_dispatcher.py tests/test_telegram_button_consistency.py -q` → 60 passed。
|
||||
|
||||
## 2026-05-01 | HostBackupFailed rule-first e2e 補洞
|
||||
|
||||
Live e2e 用 `HostBackupFailed` 打 Alertmanager 後發現 aged backup 告警會被分類成 `backup_failure`,未命中原本只允許 `host_resource` 的 rule-first gate,導致又進 OpenClaw LLM。
|
||||
|
||||
@@ -182,6 +182,29 @@ If SSH MCP fails, the incident must not silently become a manual approval card;
|
||||
|
||||
---
|
||||
|
||||
## Appendix C — LLM Ghost Loop Controls (2026-05-01)
|
||||
|
||||
Alertmanager 重複 delivery、修復失敗後告警仍 firing、以及 provider timeout/500 會形成「告警 → LLM → 修復失敗 → 20 秒後再 LLM」的鬼循環。這不是 AI 訂閱費用問題,而是飛輪收斂問題。
|
||||
|
||||
### 必須維持的控制
|
||||
|
||||
| 控制 | 行為 |
|
||||
|------|------|
|
||||
| Alertmanager in-flight lock | 同一 fingerprint 排入背景 LLM 前必須取得 Redis `alertmanager:llm_inflight:{fingerprint}`,TTL 10 分鐘 |
|
||||
| Stable LLM cache key | 有 alert context 時,OpenClaw cache key 不得使用完整動態 prompt;必須使用 prompt family + stable alert dimensions |
|
||||
| No retry storm | Alertmanager webhook 已完成來源驗證後,內部處理錯誤應回 2xx degraded accepted,不回 500 觸發 Alertmanager 重試 |
|
||||
| Learning after blocked automation | executor 後成功/失敗必須寫 KM;executor 前 guard block 也應優先沉澱成 KM/AOL/timeline,避免下一次同告警重跑完整 LLM |
|
||||
| Paid provider policy | Gemini/Groq/Claude/OpenRouter 只能作為 rate-limited fallback 或高複雜度專家路徑;不得用付費 provider 掩蓋 dedupe/cache/retry 問題 |
|
||||
|
||||
### 健康狀態判準
|
||||
|
||||
- 同一 fingerprint 10 分鐘內最多一個背景 LLM analysis task。
|
||||
- 重複告警應命中 DB convergence、Redis in-flight lock、rule-first、Playbook/KM/RAG、或 LLM cache 其中一層。
|
||||
- 外部 provider 用量應接近「新問題數」而不是「告警 delivery 數」。
|
||||
- 已接收告警的後處理錯誤不得產生 HTTP 500;若仍出現 500,需先確認失敗點是否發生在來源驗證/反序列化完成之前(即告警尚未被接收的階段)。
|
||||
|
||||
---
|
||||
|
||||
## 首席架構師 Review 記錄 (2026-04-05)
|
||||
|
||||
評分:**72/100 → 修正後 88/100**
|
||||
|
||||
Reference in New Issue
Block a user