# awoooi/ops/monitoring/ollama_health_rules.yaml

# ops/monitoring/ollama_health_rules.yaml
# AWOOOI Ollama 容災健康告警規則
# 2026-04-26 P2.3 by Claude Sonnet 4.6 (tool-expert) — Ollama 容災監控告警規則
# 2026-05-03 ogt: ADR-110 GCP 三層容災，更新健康規則 action 說明（GCP-A/B + Local）
# 部署目標: 與 alerts-unified.yml 一起部署到 192.168.0.110:/home/wooo/monitoring/alerts.yml
# 部署方式: 手動合併至 alerts-unified.yml，或 scripts/ops/deploy-alerts.sh 支援多檔時直接引用
#
# 標籤規範 (對齊 alerts-unified.yml):
#   layer: ai-provider
#   team: ai
#   auto_repair: "true" | "false"
#
# ⚠️ Backlog 指標（尚未在 API 暴露，需 Part 3 補完後才能啟用）：
#   - OllamaSlowInference: ollama_inference_duration_seconds_bucket — BACKLOG
#   - GeminiQuotaApproaching: gemini_daily_call_count / gemini_daily_quota — 部分實作
#     (Redis key 存在，但 Prometheus Gauge 需 Part 3 手動刷新)
#   - AutoRepairVerificationFailureHigh: post_execution_verification_* — BACKLOG
#   以上規則已寫入但標記 # [BACKLOG]，上線前需先確認 metric 已暴露

# Top-level list of Prometheus rule groups defined by this file.
groups:

  # ===========================================================================
  # Ollama disaster-recovery health (ollama_health)
  # ===========================================================================
  - name: ollama_health
    # Evaluation interval for every rule in this group.
    interval: 30s
    rules:

      # -----------------------------------------------------------------------
      # 🔴 [ACTIVE] Ollama instance down
      # Metric: up{job=~"ollama_gcp_a|ollama_gcp_b|ollama_local|ollama_111"}
      # Prerequisite: the Prometheus scrape job names must line up with the
      #   ADR-110 provider pool (configured in
      #   ops/monitoring/generated/prometheus-scrape-generated.yaml).
      # -----------------------------------------------------------------------
      - alert: OllamaInstanceDown
        # Selects any of the four Ollama scrape jobs whose `up` sample is 0.
        expr: 'up{job=~"ollama_gcp_a|ollama_gcp_b|ollama_local|ollama_111"} == 0'
        # The condition has to hold for 2 minutes before the alert fires.
        for: 2m
        labels:
          alert_category: ollama_failover
          auto_repair: 'false'  # string, not boolean, per the label convention
          layer: ai-provider
          severity: critical
          team: ai
        annotations:
          summary: 'Ollama {{ $labels.job }} 離線 ({{ $labels.instance }})'
          description: >-
            Prometheus 探測 Ollama {{ $labels.job }} 失敗超過 2 分鐘。預期容災應已觸發，路由已切 Gemini。
          runbook: 'docs/runbooks/RUNBOOK-OLLAMA-FAILOVER.md#ollama-instance-down'
          # One manual probe per tier: GCP-A, GCP-B, then the local fallback.
          action: >-
            curl http://34.143.170.20:11434/api/tags（GCP-A）或 curl http://34.21.145.224:11434/api/tags（GCP-B）或 ssh wooo@192.168.0.111 'systemctl status ollama'（Local 後備）

      # -----------------------------------------------------------------------
      # 🟡 [ACTIVE] Failover triggered too frequently
      # Metric: ollama_failover_triggered_total{from_provider,to_provider}
      # Exposed by OLLAMA_FAILOVER_TRIGGERED_TOTAL in
      # apps/api/src/core/metrics.py
      # -----------------------------------------------------------------------
      - alert: OllamaFailoverFrequent
        # increase() yields the number of failovers inside the 1h window.
        # The previous rate() form is a per-second average, so "> 5" meant
        # 5 failovers per second (18000/h) and the alert could never fire at
        # the "> 5/h" threshold stated in the summary.
        # sum() adds the per-(from_provider, to_provider) series together so
        # the threshold applies to the total number of failovers; as a
        # consequence the firing alert carries no metric labels.
        expr: sum(increase(ollama_failover_triggered_total[1h])) > 5
        for: 10m
        labels:
          severity: warning
          layer: ai-provider
          team: ai
          auto_repair: "false"
          alert_category: "ollama_failover"
        annotations:
          summary: "Ollama 容災觸發頻率 > 5/h，主機可能不穩定"
          description: "過去 1 小時 Ollama failover 超過 5 次。建議檢查 111 主機穩定性。"
          runbook: "docs/runbooks/RUNBOOK-OLLAMA-FAILOVER.md#failover-frequent"
          action: "curl http://34.143.170.20:11434/api/tags 或 ssh wooo@192.168.0.111 'nvidia-smi && journalctl -u ollama -n 50'（GCP-A 掛才用 111）"

      # -----------------------------------------------------------------------
      # 🟡 [ACTIVE] Auto recovery stuck (111 is healthy again but traffic still
      #            goes to Gemini)
      # Metrics: ollama_health_status{host} (Gauge, 0=offline, 1=healthy)
      #          ollama_current_primary_is_ollama (Gauge, 1=primary is ollama)
      # Both metrics are to be added by Part 3.
      # NOTE(review): the banner uses the yellow marker but the rule's severity
      # is `critical` — confirm which one is intended.
      # -----------------------------------------------------------------------
      - alert: OllamaRecoveryStuck
        # `and` only keeps left-hand samples that have a right-hand sample with
        # an identical label set. The left side carries a `host` label; per the
        # metric description above the right side does not, so a bare `and`
        # would never match. ignoring(host) drops that label from the match
        # while still pairing series from the same scrape target.
        # Assumes both gauges are exported by the same target (same
        # job/instance labels) — TODO confirm once Part 3 lands.
        expr: |
          ollama_health_status{host="111"} == 1
          and ignoring(host)
          ollama_current_primary_is_ollama == 0
        for: 5m
        labels:
          severity: critical
          # NOTE(review): the file header documents `layer: ai-provider` as the
          # label convention; `systemd-188` is kept unchanged here because it
          # may drive routing — confirm whether it is intentional.
          layer: systemd-188
          team: ai
          auto_repair: "false"
          alert_category: "ollama_failover"
        annotations:
          summary: "111 已 HEALTHY 但路由仍走 Gemini，auto recovery 可能停滯"
          description: "OllamaHealthMonitor 回報 111=HEALTHY 已超過 5 分鐘，但 primary 仍非 ollama。請確認 OllamaAutoRecoveryService 是否正常運行。"
          runbook: "docs/runbooks/RUNBOOK-OLLAMA-FAILOVER.md#recovery-stuck"
          action: "kubectl logs -n awoooi-prod deploy/api | grep ollama_auto_recovery | tail -20"

      # -----------------------------------------------------------------------
      # 🟡 [BACKLOG] P99 推理延遲過高
      # metric: ollama_inference_duration_seconds_bucket — 尚未暴露，需 Part 3 補入
      # -----------------------------------------------------------------------
      # [BACKLOG] 等 ollama_inference_duration_seconds_bucket 暴露後啟用
      # - alert: OllamaSlowInference
      #   expr: |
      #     histogram_quantile(0.99,
      #       rate(ollama_inference_duration_seconds_bucket[5m])
      #     ) > 30
      #   for: 5m
      #   labels:
      #     severity: warning
      #     team: ai
      #   annotations:
      #     summary: "Ollama P99 推理延遲 > 30s"
      #     action: "curl http://34.143.170.20:11434/api/tags 或 ssh wooo@192.168.0.111 'nvidia-smi'（GCP-A 掛才用 111）"

      # -----------------------------------------------------------------------
      # 🟡 [PARTIAL] Gemini daily quota close to exhaustion
      # Metrics: gemini_daily_call_count (Gauge)
      #          gemini_daily_quota (Gauge)
      # The Redis key "ollama:gemini_daily_count:{date}" already exists.
      # The gauges still have to be added by Part 3 (read from Redis and set).
      # -----------------------------------------------------------------------
      - alert: GeminiQuotaApproaching
        # Fires when more than 80% of the daily quota has been used.
        # `(gemini_daily_quota > 0)` filters out series whose quota is zero or
        # negative (e.g. a gauge that has not been populated yet); without it
        # the division yields +Inf and the alert fires spuriously.
        expr: gemini_daily_call_count / (gemini_daily_quota > 0) > 0.8
        for: 5m
        labels:
          severity: warning
          # NOTE(review): the file header documents `layer: ai-provider` as the
          # label convention; `systemd-188` is kept unchanged here because it
          # may drive routing — confirm whether it is intentional.
          layer: systemd-188
          team: ai
          auto_repair: "false"
          alert_category: "ollama_failover"
        annotations:
          summary: "Gemini 每日配額已用 >80%，即將觸發 failover"
          description: "每日 Gemini call 已超過配額 80%。當日剩餘配額不足時，路由將自動切至 188 CPU-only 備援。"
          runbook: "docs/runbooks/RUNBOOK-OLLAMA-FAILOVER.md#gemini-quota"
          action: "確認 GEMINI_DAILY_QUOTA 設定值，考慮升級配額或提前切 Nemotron"
      # -----------------------------------------------------------------------
      # 🟡 [BACKLOG] Auto Repair Verifier 失敗率高（飛輪健康）
      # metric: post_execution_verification_failed_total — 尚未暴露
      #         post_execution_verification_total       — 尚未暴露
      # -----------------------------------------------------------------------
      # [BACKLOG] 等 post_execution_verification_* 暴露後啟用
      # - alert: AutoRepairVerificationFailureHigh
      #   expr: |
      #     sum(rate(post_execution_verification_failed_total[15m])) /
      #     sum(rate(post_execution_verification_total[15m])) > 0.3
      #   for: 10m
      #   labels:
      #     severity: warning
      #     team: ai
      #   annotations:
      #     summary: "Auto Repair Verifier 失敗率 >30%（飛輪可能腐爛）"