# awoooi/k8s/monitoring/k3s-alerts.yaml

# =============================================================================
# K3s Infrastructure Alerts
# =============================================================================
# K-MON 2026-03-28: Alert rules for K3s infrastructure monitoring
# Deployed by: Claude Code (首席架構師)
# Deployed to: 192.168.0.188 Prometheus/Alertmanager
# =============================================================================
#
# 這個檔案是 Prometheus alert rules 的版本控制副本
# 實際部署位置: /home/ollama/momo-pro/monitoring/alerts.yml (188)
#
# =============================================================================

groups:
  # ===== K3s infrastructure alerts =====
  # Reachability alerts driven by the probe_success metric of the
  # blackbox-tcp / blackbox-icmp scrape jobs; every rule fires while the
  # probe result equals 0 for the whole `for:` window.
  - name: k3s_infrastructure
    rules:
      # K3s API VIP unreachable: TCP probe to 192.168.0.125:6443 failing for 30s.
      - alert: K3sVIPDown
        expr: 'probe_success{job="blackbox-tcp", instance="192.168.0.125:6443"} == 0'
        for: 30s
        labels:
          severity: critical
          team: ops
          component: k3s
        annotations:
          summary: '🔴 K3s VIP 無法連接'
          description: 'K3s API VIP 192.168.0.125:6443 已離線超過 30 秒'

      # K3s node unreachable: ICMP probe to 192.168.0.120 or 192.168.0.121
      # failing for 1m.
      # The dots are escaped because =~ takes a fully anchored regular
      # expression in which a bare "." matches any character. Inside the
      # double-quoted PromQL string "\\." is needed to produce the regex "\.";
      # the single-quoted YAML scalar keeps the backslashes literal.
      - alert: K3sNodeDown
        expr: 'probe_success{job="blackbox-icmp", instance=~"192\\.168\\.0\\.12[01]"} == 0'
        for: 1m
        labels:
          severity: critical
          team: ops
          component: k3s
        annotations:
          summary: '🔴 K3s 節點離線'
          description: 'K3s 節點 {{ $labels.instance }} 已離線超過 1 分鐘'

      # AWOOOI API unreachable: TCP probe to 192.168.0.125:32334 failing for 1m.
      - alert: AWOOOIApiDown
        expr: 'probe_success{job="blackbox-tcp", instance="192.168.0.125:32334"} == 0'
        for: 1m
        labels:
          severity: critical
          team: ops
          component: awoooi
        annotations:
          summary: '🔴 AWOOOI API 離線'
          description: 'AWOOOI API (192.168.0.125:32334) 已離線超過 1 分鐘'

      # AWOOOI Web unreachable: TCP probe to 192.168.0.125:32335 failing for 1m.
      # Rated warning, whereas the API rule above is critical.
      - alert: AWOOOIWebDown
        expr: 'probe_success{job="blackbox-tcp", instance="192.168.0.125:32335"} == 0'
        for: 1m
        labels:
          severity: warning
          team: ops
          component: awoooi
        annotations:
          summary: '⚠️ AWOOOI Web 離線'
          description: 'AWOOOI Web (192.168.0.125:32335) 已離線超過 1 分鐘'

  # ===== Velero backup alerts =====
  - name: velero_alerts
    rules:
      # A Velero backup failed: the failure counter increased within the last
      # 24 hours. There is no `for:` clause, so the alert fires on the first
      # evaluation where the expression returns a result.
      - alert: VeleroBackupFailed
        expr: 'increase(velero_backup_failure_total[24h]) > 0'
        labels:
          severity: warning
          team: ops
          component: velero
        annotations:
          summary: '⚠️ Velero 備份失敗'
          # increase() extrapolates over the range, so $value can be fractional
          # even for an integer counter; print it rounded to a whole number.
          description: '過去 24 小時有 {{ $value | printf "%.0f" }} 次備份失敗'

      # Velero backup overdue: the last successful backup of any schedule whose
      # name contains "awoooi" is older than 129600 s (36 h), and has stayed
      # that way for 1 h.
      - alert: VeleroBackupMissing
        expr: '(time() - velero_backup_last_successful_timestamp{schedule=~".*awoooi.*"}) > 129600'
        for: 1h
        labels:
          severity: critical
          team: ops
          component: velero
        annotations:
          summary: '🔴 Velero 備份超時'
          description: 'awoooi 排程備份已超過 36 小時未成功'

  # ===== Observability service alerts =====
  # Both rules watch a blackbox-tcp probe and fire after the probe result has
  # been 0 for two minutes.
  - name: observability_alerts
    rules:
      # SignOz offline (TCP probe to 192.168.0.188:3301).
      - alert: SignOzDown
        expr: 'probe_success{job="blackbox-tcp", instance="192.168.0.188:3301"} == 0'
        for: 2m
        labels:
          component: signoz
          severity: warning
          team: ops
        annotations:
          summary: '⚠️ SignOz 服務離線'
          description: 'SignOz (192.168.0.188:3301) 已離線超過 2 分鐘'

      # Sentry offline (TCP probe to 192.168.0.110:9000).
      - alert: SentryDown
        expr: 'probe_success{job="blackbox-tcp", instance="192.168.0.110:9000"} == 0'
        for: 2m
        labels:
          component: sentry
          severity: warning
          team: ops
        annotations:
          summary: '⚠️ Sentry 服務離線'
          description: 'Sentry (192.168.0.110:9000) 已離線超過 2 分鐘'

  # ===== kube-state-metrics alerts (Kubernetes object state) =====
  - name: kube_state_metrics_alerts
    rules:
      # A container in awoooi-prod restarted more than 3 times in the last hour.
      # No `for:` clause, so the alert fires on the first matching evaluation.
      - alert: PodRestartingTooMuch
        expr: 'increase(kube_pod_container_status_restarts_total{namespace="awoooi-prod"}[1h]) > 3'
        labels:
          severity: warning
          team: ops
          component: k8s
        annotations:
          summary: '⚠️ Pod 重啟過多'
          # increase() extrapolates over the range, so $value can be fractional
          # even for an integer counter; print it rounded to a whole number.
          description: '{{ $labels.namespace }}/{{ $labels.pod }} 過去 1 小時重啟 {{ $value | printf "%.0f" }} 次'

      # Deployment in awoooi-prod has a different number of available replicas
      # than its spec asks for, sustained for 5 minutes.
      # The desired count is deliberately on the LEFT of "!=": a filtering
      # comparison keeps the left-hand sample, so $value in the description is
      # the expected replica count, which is what the message text claims.
      - alert: DeploymentReplicasMismatch
        expr: >-
          kube_deployment_spec_replicas{namespace="awoooi-prod"}
          != kube_deployment_status_replicas_available{namespace="awoooi-prod"}
        for: 5m
        labels:
          severity: warning
          team: ops
          component: k8s
        annotations:
          summary: '⚠️ Deployment 副本不足'
          description: '{{ $labels.deployment }} 期望 {{ $value }} 副本但可用數不符'

      # Pod in awoooi-prod has been in the Pending phase for 10 minutes.
      - alert: PodPendingTooLong
        expr: 'kube_pod_status_phase{phase="Pending", namespace="awoooi-prod"} == 1'
        for: 10m
        labels:
          severity: warning
          team: ops
          component: k8s
        annotations:
          summary: '⚠️ Pod 卡在 Pending'
          description: '{{ $labels.pod }} 已處於 Pending 狀態超過 10 分鐘'

      # Node memory pressure (NPD integration): the MemoryPressure condition
      # has been true for 2 minutes.
      - alert: NodeMemoryPressure
        expr: 'kube_node_status_condition{condition="MemoryPressure", status="true"} == 1'
        for: 2m
        labels:
          severity: critical
          team: ops
          component: k8s
        annotations:
          summary: '🔴 節點記憶體壓力'
          description: '節點 {{ $labels.node }} 記憶體不足'

      # Node disk pressure (NPD integration): the DiskPressure condition has
      # been true for 2 minutes.
      - alert: NodeDiskPressure
        expr: 'kube_node_status_condition{condition="DiskPressure", status="true"} == 1'
        for: 2m
        labels:
          severity: critical
          team: ops
          component: k8s
        annotations:
          summary: '🔴 節點磁碟壓力'
          description: '節點 {{ $labels.node }} 磁碟空間不足'

      # Node PID pressure (NPD integration): the PIDPressure condition has been
      # true for 2 minutes. Rated warning, unlike the two pressure rules above.
      - alert: NodePIDPressure
        expr: 'kube_node_status_condition{condition="PIDPressure", status="true"} == 1'
        for: 2m
        labels:
          severity: warning
          team: ops
          component: k8s
        annotations:
          summary: '⚠️ 節點 PID 壓力'
          description: '節點 {{ $labels.node }} PID 資源不足'

      # Node not ready: the status="true" series of the Ready condition has
      # been 0 for 2 minutes.
      - alert: NodeNotReady
        expr: 'kube_node_status_condition{condition="Ready", status="true"} == 0'
        for: 2m
        labels:
          severity: critical
          team: ops
          component: k8s
        annotations:
          summary: '🔴 節點未就緒'
          description: '節點 {{ $labels.node }} 狀態不是 Ready'