# ops/monitoring/ollama_health_rules.yaml
# AWOOOI Ollama 容災健康告警規則
# 2026-04-26 P2.3 by Claude Sonnet 4.6 (tool-expert) — Ollama 容災監控告警規則
# 2026-05-03 ogt: ADR-110 GCP 三層容災,更新健康規則 action 說明(GCP-A/B + Local)
# 部署目標: 與 alerts-unified.yml 一起部署到 192.168.0.110:/home/wooo/monitoring/alerts.yml
# 部署方式: 手動合併至 alerts-unified.yml,或 scripts/ops/deploy-alerts.sh 支援多檔時直接引用
#
# 標籤規範 (對齊 alerts-unified.yml):
#   layer: ai-provider
#   team: ai
#   auto_repair: "true" | "false"
#
# ⚠️ Backlog 指標(尚未在 API 暴露,需 Part 3 補完後才能啟用):
#   - OllamaSlowInference: ollama_inference_duration_seconds_bucket — BACKLOG
#   - GeminiQuotaApproaching: gemini_daily_call_count / gemini_daily_quota — 部分實作
#     (Redis key 存在,但 Prometheus Gauge 需 Part 3 手動刷新)
#   - AutoRepairVerificationFailureHigh: post_execution_verification_* — BACKLOG
# 以上規則已寫入但標記 # [BACKLOG],上線前需先確認 metric 已暴露

groups:

# ===========================================================================
# Ollama failover health (ollama_health)
# Rule group holding the Ollama failover / provider-health alerts.
# The group is evaluated every 30 seconds (see `interval`).
# ===========================================================================
  - name: ollama_health
    interval: 30s
    rules:

# -----------------------------------------------------------------------
# 🔴 [ACTIVE] An Ollama host is offline.
# Metric: up{job=~"ollama_gcp_a|ollama_gcp_b|ollama_local|ollama_111"}
# Prerequisite: the Prometheus scrape job names must line up with the
# ADR-110 provider pool (configured in
# ops/monitoring/generated/prometheus-scrape-generated.yaml).
# Fires once a matching scrape target has reported up == 0 for 2 minutes.
# -----------------------------------------------------------------------
      - alert: OllamaInstanceDown
        expr: 'up{job=~"ollama_gcp_a|ollama_gcp_b|ollama_local|ollama_111"} == 0'
        for: 2m
        labels:
          alert_category: 'ollama_failover'
          auto_repair: 'false'
          layer: ai-provider
          severity: critical
          team: ai
        annotations:
          summary: 'Ollama {{ $labels.job }} 離線 ({{ $labels.instance }})'
          description: 'Prometheus 探測 Ollama {{ $labels.job }} 失敗超過 2 分鐘。預期容災應已觸發,路由已切 Gemini。'
          runbook: 'docs/runbooks/RUNBOOK-OLLAMA-FAILOVER.md#ollama-instance-down'
          action: "curl http://34.143.170.20:11434/api/tags(GCP-A)或 curl http://34.21.145.224:11434/api/tags(GCP-B)或 ssh wooo@192.168.0.111 'systemctl status ollama'(Local 後備)"

# -----------------------------------------------------------------------
# 🟡 [ACTIVE] Failover is being triggered too often.
# Metric: ollama_failover_triggered_total{from_provider,to_provider}
# Exposed by apps/api/src/core/metrics.py (OLLAMA_FAILOVER_TRIGGERED_TOTAL).
#
# The threshold is "more than 5 failovers in the last hour", so the
# expression uses increase(): it returns the counter growth over the
# window. rate() returns a per-second average, which would only exceed 5
# at more than 5 failovers per second.
# The comparison is evaluated separately for each series of the counter
# (presumably one per from_provider/to_provider pair — confirm).
# -----------------------------------------------------------------------
      - alert: OllamaFailoverFrequent
        expr: increase(ollama_failover_triggered_total[1h]) > 5
        for: 10m
        labels:
          severity: warning
          layer: ai-provider
          team: ai
          auto_repair: "false"
          alert_category: "ollama_failover"
        annotations:
          summary: "Ollama 容災觸發頻率 > 5/h,主機可能不穩定"
          description: "過去 1 小時 Ollama failover 超過 5 次。建議檢查 111 主機穩定性。"
          runbook: "docs/runbooks/RUNBOOK-OLLAMA-FAILOVER.md#failover-frequent"
          action: "curl http://34.143.170.20:11434/api/tags 或 ssh wooo@192.168.0.111 'nvidia-smi && journalctl -u ollama -n 50'(GCP-A 掛才用 111)"

# -----------------------------------------------------------------------
# 🟡 [ACTIVE] Auto recovery is stuck (111 is healthy again but traffic
# still goes to Gemini).
# Metrics: ollama_health_status{host}        (Gauge, 0=offline, 1=healthy)
#          ollama_current_primary_is_ollama  (Gauge, 1=primary is ollama)
# Both metrics are supplied by Part 3.
#
# `and ignoring(host)` is required: a plain `and` only keeps left-hand
# series whose label set is identical on the right-hand side, and
# ollama_health_status carries a `host` label that
# ollama_current_primary_is_ollama is not documented to have, so the
# unqualified form could never match.
#
# NOTE(review): the banner marks this rule 🟡 but severity is `critical` —
# confirm which is intended.
# NOTE(review): `layer: systemd-188` differs from the `layer: ai-provider`
# label convention stated in this file's header — confirm against
# alerts-unified.yml before changing, since it may affect routing.
# -----------------------------------------------------------------------
      - alert: OllamaRecoveryStuck
        expr: |
          ollama_health_status{host="111"} == 1
          and ignoring(host)
          ollama_current_primary_is_ollama == 0
        for: 5m
        labels:
          severity: critical
          layer: systemd-188
          team: ai
          auto_repair: "false"
          alert_category: "ollama_failover"
        annotations:
          summary: "111 已 HEALTHY 但路由仍走 Gemini,auto recovery 可能停滯"
          description: "OllamaHealthMonitor 回報 111=HEALTHY 已超過 5 分鐘,但 primary 仍非 ollama。請確認 OllamaAutoRecoveryService 是否正常運行。"
          runbook: "docs/runbooks/RUNBOOK-OLLAMA-FAILOVER.md#recovery-stuck"
          action: "kubectl logs -n awoooi-prod deploy/api | grep ollama_auto_recovery | tail -20"

# -----------------------------------------------------------------------
# 🟡 [BACKLOG] P99 推理延遲過高
# metric: ollama_inference_duration_seconds_bucket — 尚未暴露,需 Part 3 補入
# -----------------------------------------------------------------------
# [BACKLOG] 等 ollama_inference_duration_seconds_bucket 暴露後啟用
# - alert: OllamaSlowInference
#   expr: |
#     histogram_quantile(0.99,
#       rate(ollama_inference_duration_seconds_bucket[5m])
#     ) > 30
#   for: 5m
#   labels:
#     severity: warning
#     team: ai
#   annotations:
#     summary: "Ollama P99 推理延遲 > 30s"
#     action: "curl http://34.143.170.20:11434/api/tags 或 ssh wooo@192.168.0.111 'nvidia-smi'(GCP-A 掛才用 111)"

# -----------------------------------------------------------------------
# 🟡 [PARTIAL] Gemini daily quota is close to exhaustion.
# Metrics: gemini_daily_call_count (Gauge)
#          gemini_daily_quota      (Gauge)
# The Redis key "ollama:gemini_daily_count:{date}" already exists; the
# Gauges still have to be filled in by Part 3 (read from Redis and set).
# Fires when the used fraction of the daily quota stays above 0.8 for
# 5 minutes.
#
# NOTE(review): `layer: systemd-188` differs from the `layer: ai-provider`
# label convention stated in this file's header — confirm it is intended.
# NOTE(review): if gemini_daily_quota can be exported as 0, the ratio
# becomes +Inf and the alert fires — confirm whether that is desired.
# -----------------------------------------------------------------------
      - alert: GeminiQuotaApproaching
        expr: 'gemini_daily_call_count / gemini_daily_quota > 0.8'
        for: 5m
        labels:
          alert_category: 'ollama_failover'
          auto_repair: 'false'
          layer: systemd-188
          severity: warning
          team: ai
        annotations:
          summary: 'Gemini 每日配額已用 >80%,即將觸發 failover'
          description: '每日 Gemini call 已超過配額 80%。當日剩餘配額不足時,路由將自動切至 188 CPU-only 備援。'
          runbook: 'docs/runbooks/RUNBOOK-OLLAMA-FAILOVER.md#gemini-quota'
          action: '確認 GEMINI_DAILY_QUOTA 設定值,考慮升級配額或提前切 Nemotron'

# -----------------------------------------------------------------------
# 🟡 [BACKLOG] Auto Repair Verifier 失敗率高(飛輪健康)
# metric: post_execution_verification_failed_total — 尚未暴露
#         post_execution_verification_total — 尚未暴露
# -----------------------------------------------------------------------
# [BACKLOG] 等 post_execution_verification_* 暴露後啟用
# - alert: AutoRepairVerificationFailureHigh
#   expr: |
#     sum(rate(post_execution_verification_failed_total[15m])) /
#     sum(rate(post_execution_verification_total[15m])) > 0.3
#   for: 10m
#   labels:
#     severity: warning
#     team: ai
#   annotations:
#     summary: "Auto Repair Verifier 失敗率 >30%(飛輪可能腐爛)"