# AWOOOI Alertmanager 配置
# 2026-04-05 Claude Code: 修正 webhook URL
# 修正前: http://192.168.0.188:8088/api/v1/webhook/alertmanager (OpenClaw,舊系統,錯誤)
# 修正後: http://192.168.0.121:32334/api/v1/webhooks/alertmanager (AWOOOI API,複數,正確)
# (2026-04-16 起已改指向 VIP 192.168.0.125:32334,121:32334 Connection Refused;以 receivers 區段的 url 為準)
# 根據 feedback_alertmanager_awoooi_flow.md 鐵律
# 2026-04-09 Claude Sonnet 4.6 Asia/Taipei: 新增 Telegram Fallback (ADR-035)
# 架構: awoooi-webhook (主路徑) + telegram-direct (告警鏈路緊急旁路)
# telegram-direct 只允許處理 AWOOOI API / AlertChain 自身異常;一般 critical 必須走 AWOOOI API 治理鏈。
# 旁路目的地必須是 AwoooI SRE 戰情室;OPENCLAW_TG_CHAT_ID 只允許作缺值時的 fail-soft fallback。
# ⚠️ bot_token/chat_id 部署時由 secrets 替換,此檔為模板
#
# 2026-04-29 ogt + Claude Opus 4.7: P1-4 新版語法升級 + 因果抑制擴展
# 改動:
# 1. match/match_re → matchers (Alertmanager v0.27+ deprecated 警告)
# 2. source_match/target_match/target_match_re → source_matchers/target_matchers
# 3. group_by 加 team label(防 4 條 SLO 同秒爆,依 web-researcher 文件)
# 4. PostgreSQLDown / RedisDown inhibit 補 equal: ['instance'](防全 ns 爆炸抑制)
# 5. 新增 OllamaInstanceDown / KMConverterDown / SLO FastBurn 三組因果抑制
# 根因:本次 4 SLO 雪崩證實 Ollama 111 掛 → AI 推理鏈斷 → SLO 級聯爆炸無守門
# 6. 命名鐵律 feedback_telegram_alert_format.md 對齊(label team=ai/component/auto_repair)
global:
  # Default time after which an alert that carries no end time and has not
  # been refreshed is declared resolved.
  resolve_timeout: 5m
route:
  # Root route: every alert enters here and is sent to the AWOOOI API webhook
  # when no child route below claims it.
  receiver: 'awoooi-webhook'
  # 2026-04-29: 'team' added so SLO/AI PrometheusRules labelled team=ai are
  # grouped (and merged) separately from everything else.
  group_by:
    - 'team'
    - 'alertname'
    - 'severity'
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  routes:
    # Emergency-only direct Telegram route.
    # Alertmanager cannot "fall back on webhook send failure", so this route is
    # gated by explicit alert-chain / API health alertnames rather than by
    # severity=critical.
    - receiver: 'telegram-direct'
      matchers:
        - 'alertname=~"AWOOOIApiDown|AlertmanagerDown|AlertChainBroken_.*|AlertChainUnhealthy|NoAlertsReceived2Hours"'
      group_wait: 10s
      repeat_interval: 30m
      # Keep evaluating the sibling routes below, so the same alert can also
      # match one of the awoooi-webhook routes.
      continue: true
    # Critical alerts: same webhook, shorter initial wait.
    - receiver: 'awoooi-webhook'
      matchers:
        - 'severity="critical"'
      group_wait: 10s
    # Warning alerts: webhook with the inherited timings.
    - receiver: 'awoooi-webhook'
      matchers:
        - 'severity="warning"'
    # Zombie* / Container* alerts: webhook with a longer initial wait.
    # NOTE(review): the two severity routes above have no `continue`, so a
    # Zombie*/Container* alert with severity critical or warning stops there
    # and never gets this 1m group_wait. Confirm whether this route was meant
    # to be listed before the severity routes.
    - receiver: 'awoooi-webhook'
      matchers:
        - 'alertname=~"Zombie.*|Container.*"'
      group_wait: 1m
receivers:
  # Primary path: the AWOOOI API handles all alerts (AI analysis, de-dup,
  # Telegram delivery).
  # 2026-04-16 ogt + Claude Sonnet 4.6: repointed to VIP 192.168.0.125.
  # Root cause: 121:32334 and 120:32334 both refuse connections; only the VIP
  # 125:32334 is reachable (kube-proxy NodePort routing works there).
  # WARNING (SPF-1): VIP 125 is a single point of failure. If the VIP host goes
  # down the primary chain is cut. Mitigation plan: see the critic SPF
  # governance design (medium option: several webhook_configs URLs).
  - name: 'awoooi-webhook'
    webhook_configs:
      - url: 'http://192.168.0.125:32334/api/v1/webhooks/alertmanager'
        send_resolved: true
  # Emergency path: when the AWOOOI API or the alert chain itself is unhealthy,
  # post straight to the SRE Telegram group.
  # Ordinary critical alerts must NOT use this receiver, otherwise they bypass
  # AWOOOI de-dup, AI analysis, Approval and Audit.
  # WARNING: bot_token / chat_id are injected from a K8s Secret by the CD
  # pipeline at deploy time. Per feedback_telegram_secrets_injection.md, a
  # PLACEHOLDER value must never reach production.
  - name: 'telegram-direct'
    telegram_configs:
      - bot_token: 'TELEGRAM_BOT_TOKEN_PLACEHOLDER'
        # Left unquoted on purpose: the substituted value must be a numeric
        # chat id.
        chat_id: SRE_GROUP_CHAT_ID_PLACEHOLDER
        parse_mode: 'HTML'
        # The host/instance line inserts " / " only when both labels are set;
        # previously the two values were printed back to back with no
        # separator.
        message: |
          🚨 [Alertmanager Emergency]
          {{ range .Alerts }}
          ├ {{ .Labels.alertname }}
          ├ 嚴重度: {{ .Labels.severity }}
          ├ 主機: {{ .Labels.host }}{{ if and .Labels.host .Labels.instance }} / {{ end }}{{ .Labels.instance }}
          └ {{ .Annotations.summary }}
          {{ end }}
          ⚠️ AWOOOI API / 告警鏈路可能異常,此為 SRE 戰情室緊急旁路
        send_resolved: false
inhibit_rules:
  # === Baseline causal inhibition (original rules, rewritten in the
  # matchers syntax) ===
  # A critical alert mutes the warning with the same alertname and instance.
  - source_matchers:
      - severity="critical"
    target_matchers:
      - severity="warning"
    equal: ['alertname', 'instance']
  # A host that is down mutes that host's own resource alerts.
  - source_matchers:
      - alertname="HostDown"
    target_matchers:
      - alertname=~"HostHighCpuLoad|HostOutOfMemory|HostOutOfDiskSpace"
    equal: ['host']
  # A node that is not ready mutes the pod/deployment alerts for that node.
  - source_matchers:
      - alertname="KubeNodeNotReady"
    target_matchers:
      - alertname=~"KubePodCrashLooping|KubePodNotReady|KubeDeploymentReplicasMismatch"
    equal: ['node']
  # 2026-04-29: equal: ['instance'] added. Without it, PostgreSQL being down
  # on instance A also muted HighConnections on instance B (over-broad
  # inhibition). Same reasoning for Redis.
  - source_matchers:
      - alertname="PostgreSQLDown"
    target_matchers:
      - alertname="PostgreSQLHighConnections"
    equal: ['instance']
  - source_matchers:
      - alertname="RedisDown"
    target_matchers:
      - alertname="RedisMemoryHigh"
    equal: ['instance']
  # === AI-chain causal inhibition (2026-04-29, ADR-035 extension) ===
  # Root cause: the 4-SLO avalanche showed that Ollama 111 going down breaks
  # the AI inference chain and fires four SLO alerts in the same second.
  # Without this rule the derived alerts bury the real signal (Ollama down).
  # While OllamaInstanceDown is firing, SLO_* / AI_* alerts are muted. No
  # fixed duration is configured here; the inhibition lasts as long as the
  # source alert fires.
  # 2026-04-29 ogt + Claude Opus 4.7 (critic M4): an empty `equal` was too
  # broad and could mute across clusters, so it is scoped to the same cluster.
  # There is currently a single cluster; if the SLO rules also carry an
  # instance label this can be tightened further.
  - source_matchers:
      - alertname="OllamaInstanceDown"
    target_matchers:
      - alertname=~"SLO_.*|AI_.*"
    equal: ['cluster']
  # KM converter down mutes the KM growth-rate SLO, so that failed KM writes
  # do not trip the SLO by themselves.
  - source_matchers:
      - alertname="KMConverterDown"
    target_matchers:
      - alertname=~"SLO_KMGrowthRate.*"
    equal: ['cluster']
  # Within one SLO, the fast-burn alert mutes the medium/slow-burn alerts.
  # `equal` must not list 'alertname' here: the source (..._FastBurn) and the
  # targets (..._MediumBurn / ..._SlowBurn) never share an alertname, so with
  # equal: ['alertname'] this rule could never inhibit anything.
  # NOTE(review): assumes every SLO rule carries an `slo` label naming the
  # SLO; confirm against the PrometheusRule definitions. Labels missing on
  # both sides compare as equal, so without that label any FastBurn would
  # mute every Medium/Slow burn alert in the same cluster.
  - source_matchers:
      - alertname=~"SLO_.+_FastBurn"
    target_matchers:
      - alertname=~"SLO_.+_(Medium|Slow)Burn"
    equal: ['cluster', 'slo']