From c4f40235f4574e0d6e0885664b2938365578754b Mon Sep 17 00:00:00 2001 From: OG T Date: Wed, 6 May 2026 13:45:33 +0800 Subject: [PATCH] fix(alertmanager): gate direct telegram to alertchain emergencies --- docs/LOGBOOK.md | 13 +++++++++- ...AI-AUTONOMOUS-FLYWHEEL-INTEGRATION-PLAN.md | 2 +- ops/alertmanager/alertmanager.yml | 24 +++++++++++-------- 3 files changed, 27 insertions(+), 12 deletions(-) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index c6dc74ab..74d929e2 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -3619,7 +3619,18 @@ Sentry consumers after reset # recent 45s logs: no OffsetOutOfRange / UNKNOWN_TABLE / ERROR markers ``` +### 第二段收斂:旁路改成 Emergency-only + +後續同一輪已再收斂 Alertmanager 路由: + +| 範圍 | 結果 | +|------|------| +| Direct route gate | `telegram-direct` 不再匹配所有 `severity=critical`,只匹配 `AWOOOIApiDown` / `AlertmanagerDown` / `AlertChainBroken_*` / `AlertChainUnhealthy` / `NoAlertsReceived2Hours` | +| Main route | 一般 critical(含 Docker/Sentry container restart)只走 `awoooi-webhook`,回到 AWOOOI API 去重、AI 分析、Approval 與 Audit 主鏈 | +| Live webhook URL | `/home/wooo/monitoring/alertmanager.yml` 從 `192.168.0.121:32334` 對齊 repo 的 VIP `192.168.0.125:32334` | +| Config check | `docker exec alertmanager amtool check-config /etc/alertmanager/alertmanager.yml` 成功,HUP reload 完成 | + ### 注意 - `DockerContainerRestartSpike` 使用 15 分鐘窗口,已發生的 restart spike 會在 Prometheus 窗口過去後退火;修復完成後短時間內 `ALERTS{alertname="DockerContainerRestartSpike"}` 仍可能暫時為 firing。 -- 目前 Alertmanager 的 `telegram-direct` 仍是 critical 旁路,不是真正「webhook failure conditional fallback」。後續應改成只由 AWOOOI API health / AlertChainDown 類告警觸發,避免 critical 告警在主鏈正常時雙路徑通知。 +- Alertmanager 本身不支援「webhook send failed 後再 fallback receiver」語義;因此 direct Telegram 只能以明確的 API/AlertChain 健康告警作為 emergency gate。 diff --git a/docs/awooop/AWOOOI-AWOOOP-AI-AUTONOMOUS-FLYWHEEL-INTEGRATION-PLAN.md b/docs/awooop/AWOOOI-AWOOOP-AI-AUTONOMOUS-FLYWHEEL-INTEGRATION-PLAN.md index 80195643..9760fcc0 100644 --- a/docs/awooop/AWOOOI-AWOOOP-AI-AUTONOMOUS-FLYWHEEL-INTEGRATION-PLAN.md +++ b/docs/awooop/AWOOOI-AWOOOP-AI-AUTONOMOUS-FLYWHEEL-INTEGRATION-PLAN.md @@ -293,7 +293,7 @@ For AI routing releases, also verify: ## 11. Immediate Next Items -1. Convert Alertmanager `telegram-direct` from "all critical bypass" to a true alert-chain/API-health-only emergency route; the 2026-05-06 hotfix moved it to `SRE_GROUP_CHAT_ID`, but it still duplicates critical alerts by design. +1. Make Alertmanager config deployment deterministic: the live `telegram-direct` route is now emergency-only, but the inject/deploy path still needs a checked script so the 110 config cannot drift from `ops/alertmanager/alertmanager.yml`. 2. Continue Wave 1 with MCP Gateway bypass and MCP audit completeness, because production callers can still route around the gateway. 3. Keep GCP-A/GCP-B/111 Ollama routing verification in every alert-path release until EffectivePolicy becomes authoritative. 4. Add a Sentry/Snuba post-reboot health gate: ClickHouse table existence, Snuba migration status, and Kafka consumer offsets must be part of cold-start validation. diff --git a/ops/alertmanager/alertmanager.yml b/ops/alertmanager/alertmanager.yml index e851e31a..fcfcf64f 100644 --- a/ops/alertmanager/alertmanager.yml +++ b/ops/alertmanager/alertmanager.yml @@ -4,8 +4,9 @@ # 修正後: http://192.168.0.121:32334/api/v1/webhooks/alertmanager (AWOOOI API,複數,正確) # 根據 feedback_alertmanager_awoooi_flow.md 鐵律 # 2026-04-09 Claude Sonnet 4.6 Asia/Taipei: 新增 Telegram Fallback (ADR-035) -# 架構: awoooi-webhook (主路徑) + telegram-direct (critical 旁路,獨立路由) -# critical 旁路必須送 AwoooI SRE 戰情室;OPENCLAW_TG_CHAT_ID 只允許作缺值時的 fail-soft fallback +# 架構: awoooi-webhook (主路徑) + telegram-direct (告警鏈路緊急旁路) +# telegram-direct 只允許處理 AWOOOI API / AlertChain 自身異常;一般 critical 必須走 AWOOOI API 治理鏈。 +# 旁路目的地必須是 AwoooI SRE 戰情室;OPENCLAW_TG_CHAT_ID 只允許作缺值時的 fail-soft fallback。 # ⚠️ bot_token/chat_id 部署時由 secrets 替換,此檔為模板 # # 2026-04-29 ogt + Claude Opus 4.7: P1-4 新版語法升級 + 因果抑制擴展 @@ -29,15 +30,18 @@ route: group_interval: 5m repeat_interval: 4h routes: + # Emergency-only direct Telegram route. + # Alertmanager cannot "fallback on webhook send failure", so this route is gated by + # explicit alert-chain/API health alertnames instead of severity=critical. - matchers: - - severity="critical" - receiver: 'awoooi-webhook' + - alertname=~"AWOOOIApiDown|AlertmanagerDown|AlertChainBroken_.*|AlertChainUnhealthy|NoAlertsReceived2Hours" + receiver: 'telegram-direct' group_wait: 10s - # continue:true 讓 critical 同時送 telegram-direct 旁路(SRE 群組) + repeat_interval: 30m continue: true - matchers: - severity="critical" - receiver: 'telegram-direct' + receiver: 'awoooi-webhook' group_wait: 10s - matchers: - severity="warning" @@ -59,8 +63,8 @@ receivers: - url: 'http://192.168.0.125:32334/api/v1/webhooks/alertmanager' send_resolved: true - # Fallback 路徑: AWOOOI API 掛掉時,critical 告警直接送 Telegram - # 只有 critical severity 走此路徑(避免 warning 雙重通知) + # Emergency 路徑: AWOOOI API / AlertChain 自身異常時直接送 Telegram 到 SRE 群組。 + # 一般 critical 不得走此 receiver,避免繞過 AWOOOI 去重、AI 分析、Approval 與 Audit。 # ⚠️ bot_token / chat_id 由 CD pipeline 在 deploy 時用 K8s Secret 注入 # feedback_telegram_secrets_injection.md 鐵律:禁止 PLACEHOLDER 上線 - name: 'telegram-direct' @@ -69,14 +73,14 @@ receivers: chat_id: SRE_GROUP_CHAT_ID_PLACEHOLDER parse_mode: 'HTML' message: | - 🚨 [Alertmanager Fallback] + 🚨 [Alertmanager Emergency] {{ range .Alerts }} ├ {{ .Labels.alertname }} ├ 嚴重度: {{ .Labels.severity }} ├ 主機: {{ .Labels.host }}{{ .Labels.instance }} └ {{ .Annotations.summary }} {{ end }} - ⚠️ AWOOOI API 可能離線,此為直接告警 + ⚠️ AWOOOI API / 告警鏈路可能異常,此為 SRE 戰情室緊急旁路 send_resolved: false inhibit_rules: