Files
awoooi/ops/alertmanager/alertmanager.yml
OG T c4f40235f4
All checks were successful
Code Review / ai-code-review (push) Successful in 10s
fix(alertmanager): gate direct telegram to alertchain emergencies
2026-05-06 13:45:33 +08:00

147 lines
6.2 KiB
YAML
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
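# AWOOOI Alertmanager configuration
# 2026-04-05 Claude Code: fixed the webhook URL
# Before: http://192.168.0.188:8088/api/v1/webhook/alertmanager (wrong: legacy OpenClaw system)
# After:  http://192.168.0.121:32334/api/v1/webhooks/alertmanager (correct: AWOOOI API, plural "webhooks")
# Per the hard rule in feedback_alertmanager_awoooi_flow.md
# 2026-04-09 Claude Sonnet 4.6 Asia/Taipei: added Telegram fallback (ADR-035)
# Architecture: awoooi-webhook (primary path) + telegram-direct (emergency bypass for the alert chain)
# telegram-direct may only handle failures of the AWOOOI API / AlertChain itself; ordinary critical alerts must go through the AWOOOI API governance chain.
# The bypass destination must be the AwoooI SRE war room; OPENCLAW_TG_CHAT_ID is allowed only as a fail-soft fallback when that value is missing.
# ⚠️ bot_token/chat_id are substituted from secrets at deploy time; this file is a template
#
# 2026-04-29 ogt + Claude Opus 4.7: P1-4 syntax upgrade + extended causal inhibition
# Changes:
# 1. match/match_re → matchers (deprecation warning in Alertmanager v0.27+)
# 2. source_match/target_match/target_match_re → source_matchers/target_matchers
# 3. group_by gains the team label (keeps 4 SLO alerts from firing separately in the same second; per the web-researcher notes)
# 4. PostgreSQLDown / RedisDown inhibit rules gain equal: ['instance'] (prevents namespace-wide over-inhibition)
# 5. Added three causal-inhibition groups: OllamaInstanceDown / KMConverterDown / SLO FastBurn
#    Root cause: the 4-SLO avalanche confirmed that Ollama 111 going down → AI inference chain broken → cascading SLO alerts with nothing gating them
# 6. Aligned with the naming rule in feedback_telegram_alert_format.md (labels team=ai/component/auto_repair)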
# Settings shared by the whole Alertmanager instance.
global:
  # Default time after which an alert that carries no end time and has not
  # been refreshed may be declared resolved.
  resolve_timeout: '5m'
# Root of the routing tree. Alerts that no child route claims are delivered to
# the AWOOOI webhook receiver.
route:
  receiver: awoooi-webhook
  # 2026-04-29: `team` added to the grouping key so SLO/AI PrometheusRules that
  # carry team=ai are grouped and merged independently.
  group_by:
  - team
  - alertname
  - severity
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  routes:
  # Emergency-only direct Telegram route.
  # Alertmanager has no "fall back when the webhook send fails" mechanism, so
  # this route is keyed on explicit alert-chain/API health alert names rather
  # than on severity=critical.
  - matchers:
    - 'alertname=~"AWOOOIApiDown|AlertmanagerDown|AlertChainBroken_.*|AlertChainUnhealthy|NoAlertsReceived2Hours"'
    receiver: telegram-direct
    group_wait: 10s
    repeat_interval: 30m
    # Matching carries on into the sibling routes below, so an emergency alert
    # that also matches one of them is delivered to the webhook as well.
    continue: true
  # Critical alerts: main webhook path with a shortened initial wait.
  - matchers:
    - 'severity="critical"'
    receiver: awoooi-webhook
    group_wait: 10s
  # Warning alerts: main webhook path with the inherited timings.
  - matchers:
    - 'severity="warning"'
    receiver: awoooi-webhook
  # NOTE(review): the two severity routes above do not set `continue`, so a
  # Zombie*/Container* alert labelled critical or warning is claimed by them
  # and never reaches this route; the 1m group_wait then only applies to such
  # alerts with another (or no) severity. Confirm that is intended; if not,
  # this route has to be placed ahead of the severity routes.
  - matchers:
    - 'alertname=~"Zombie.*|Container.*"'
    receiver: awoooi-webhook
    group_wait: 1m
receivers:
# Primary path: the AWOOOI API handles every alert (AI analysis + dedup +
# Telegram).
# 2026-04-16 ogt + Claude Sonnet 4.6: repointed to VIP 192.168.0.125.
# Root cause: 121:32334 returned Connection Refused; 120:32334 was refused too.
# Only VIP 125:32334 was reachable (kube-proxy NodePort routing is healthy).
# ⚠️ SPF-1 risk: VIP 125 is a single point of failure; if the VIP host goes
# down entirely, the primary chain is cut.
# Mitigation plan: see the critic SPF governance design, medium option
# (multiple webhook_configs URLs, round-robin).
- name: awoooi-webhook
  webhook_configs:
  - url: "http://192.168.0.125:32334/api/v1/webhooks/alertmanager"
    send_resolved: true
# Emergency path: when the AWOOOI API / AlertChain itself is unhealthy, send
# straight to the SRE Telegram group.
# Ordinary critical alerts must not use this receiver, so that AWOOOI dedup,
# AI analysis, Approval and Audit are not bypassed.
# ⚠️ bot_token / chat_id are injected from a K8s Secret by the CD pipeline at
# deploy time.
# Hard rule from feedback_telegram_secrets_injection.md: never ship the
# PLACEHOLDER values.
- name: telegram-direct
  telegram_configs:
  - bot_token: 'TELEGRAM_BOT_TOKEN_PLACEHOLDER'
    # Left unquoted on purpose: Alertmanager expects an integer chat id, so the
    # substituted value must not end up as a quoted string.
    chat_id: SRE_GROUP_CHAT_ID_PLACEHOLDER
    parse_mode: HTML
    # One entry per alert in the group: name, severity, host/instance, summary.
    message: |
      🚨 <b>[Alertmanager Emergency]</b>
      {{ range .Alerts }}
      ├ <b>{{ .Labels.alertname }}</b>
      ├ 嚴重度: {{ .Labels.severity }}
      ├ 主機: {{ .Labels.host }}{{ .Labels.instance }}
      └ {{ .Annotations.summary }}
      {{ end }}
      <i>⚠️ AWOOOI API / 告警鏈路可能異常,此為 SRE 戰情室緊急旁路</i>
    # Resolved notifications are not sent on the emergency path.
    send_resolved: false
inhibit_rules:
# === Baseline causal inhibition (pre-existing rules, rewritten in the new
# matcher syntax) ===
# A critical alert mutes the warning with the same alertname and instance.
- source_matchers:
  - 'severity="critical"'
  target_matchers:
  - 'severity="warning"'
  equal:
  - alertname
  - instance
# A host that is down mutes its own resource alerts.
- source_matchers:
  - 'alertname="HostDown"'
  target_matchers:
  - 'alertname=~"HostHighCpuLoad|HostOutOfMemory|HostOutOfDiskSpace"'
  equal:
  - host
# A NotReady node mutes the pod/deployment alerts for that node.
- source_matchers:
  - 'alertname="KubeNodeNotReady"'
  target_matchers:
  - 'alertname=~"KubePodCrashLooping|KubePodNotReady|KubeDeploymentReplicasMismatch"'
  equal:
  - node
# 2026-04-29: added equal on `instance`, which was missing before. PostgreSQL
# being down on instance A must not mute HighConnections on instance B
# (over-broad inhibition bug).
- source_matchers:
  - 'alertname="PostgreSQLDown"'
  target_matchers:
  - 'alertname="PostgreSQLHighConnections"'
  equal:
  - instance
- source_matchers:
  - 'alertname="RedisDown"'
  target_matchers:
  - 'alertname="RedisMemoryHigh"'
  equal:
  - instance
# === New: AI-chain causal inhibition (2026-04-29, ADR-035 causal inhibition
# extension) ===
# Root cause: the 4-SLO avalanche confirmed that Ollama 111 going down breaks
# the AI inference chain and makes 4 SLO alerts fire in the same second.
# Without this rule the false alarms drown out the real one (Ollama being down
# is the actual signal).
# Any Ollama instance down → mute all AI/SLO alerts.
# NOTE(review): the original note said "for 30 minutes", but no duration is
# configured here; the mute applies while OllamaInstanceDown is firing.
# 2026-04-29 ogt + Claude Opus 4.7: critic M4 fix — an empty `equal` was too
# broad and could mute alerts across clusters, so the match is constrained to
# the same `cluster`.
# Note: there is currently a single cluster; if the SLO rules also gain an
# `instance` label, this can be tightened further.
- source_matchers:
  - 'alertname="OllamaInstanceDown"'
  target_matchers:
  - 'alertname=~"SLO_.*|AI_.*"'
  equal:
  - cluster
# KM converter down → mute the KM Growth Rate SLO alerts, so that the KM write
# failure itself does not trip the SLO.
- source_matchers:
  - 'alertname="KMConverterDown"'
  target_matchers:
  - 'alertname=~"SLO_KMGrowthRate.*"'
  equal:
  - cluster
# Within one SLO, the more severe burn-rate alert mutes the milder ones:
# FastBurn inhibits MediumBurn / SlowBurn.
# Fix: the rule used `equal: ['alertname']`, which can never be satisfied: the
# source alertname ends in `_FastBurn` while the target ends in `_MediumBurn`
# or `_SlowBurn`, so the two values always differ and the rule never fired.
# The comparison is now made on a label that identifies the SLO itself.
# NOTE(review): assumes the SLO PrometheusRules attach the same `slo` label to
# every burn-rate alert of one SLO — confirm the label name. The `slo!=""`
# guards keep the rule inert (as it effectively was before) for alerts that do
# not carry that label; without them, alerts missing the label on both sides
# would compare as equal and a single FastBurn would mute every SLO.
- source_matchers:
  - 'alertname=~"SLO_.+_FastBurn"'
  - 'slo!=""'
  target_matchers:
  - 'alertname=~"SLO_.+_(Medium|Slow)Burn"'
  - 'slo!=""'
  equal:
  - cluster
  - slo