P2 改進: - 新增 kube-state-metrics v2.10.1 (NodePort:30888) - 新增 7 條 kube-state-metrics 告警規則 (NPD 整合) P3 改進: - 修復 Kured 維護窗口時區 (18:00→02:00 台北時間) - Descheduler threshold 20%→30% (避免過度遷移) 首席架構師審查建議執行項目 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
204 lines
6.9 KiB
YAML
204 lines
6.9 KiB
YAML
# =============================================================================
# K3s Infrastructure Alerts
# =============================================================================
# K-MON 2026-03-28: Alert rules for K3s infrastructure monitoring
# Deployed by: Claude Code (Chief Architect)
# Deployed to: 192.168.0.188 Prometheus/Alertmanager
# =============================================================================
#
# This file is the version-controlled copy of the Prometheus alert rules.
# Actual deployment location: /home/ollama/momo-pro/monitoring/alerts.yml (188)
#
# =============================================================================

groups:
# ===== K3s infrastructure alerts =====
# All rules in this group evaluate blackbox probe results (probe_success == 0).
- name: k3s_infrastructure
  rules:
  # K3s VIP unreachable: TCP probe of the API VIP failing for 30 seconds.
  - alert: K3sVIPDown
    expr: 'probe_success{job="blackbox-tcp", instance="192.168.0.125:6443"} == 0'
    for: 30s
    labels:
      severity: critical
      team: ops
      component: k3s
    annotations:
      summary: "🔴 K3s VIP 無法連接"
      description: "K3s API VIP 192.168.0.125:6443 已離線超過 30 秒"

  # K3s node unreachable: ICMP probe failing for 1 minute.
  # The regex matches instances 192.168.0.120 and 192.168.0.121.
  - alert: K3sNodeDown
    expr: 'probe_success{job="blackbox-icmp", instance=~"192.168.0.12[01]"} == 0'
    for: 1m
    labels:
      severity: critical
      team: ops
      component: k3s
    annotations:
      summary: "🔴 K3s 節點離線"
      description: "K3s 節點 {{ $labels.instance }} 已離線超過 1 分鐘"

  # AWOOOI API unreachable: TCP probe of port 32334 failing for 1 minute.
  - alert: AWOOOIApiDown
    expr: 'probe_success{job="blackbox-tcp", instance="192.168.0.125:32334"} == 0'
    for: 1m
    labels:
      severity: critical
      team: ops
      component: awoooi
    annotations:
      summary: "🔴 AWOOOI API 離線"
      description: "AWOOOI API (192.168.0.125:32334) 已離線超過 1 分鐘"

  # AWOOOI Web unreachable: TCP probe of port 32335 failing for 1 minute.
  # Raised as a warning, whereas the API rule above is critical.
  - alert: AWOOOIWebDown
    expr: 'probe_success{job="blackbox-tcp", instance="192.168.0.125:32335"} == 0'
    for: 1m
    labels:
      severity: warning
      team: ops
      component: awoooi
    annotations:
      summary: "⚠️ AWOOOI Web 離線"
      description: "AWOOOI Web (192.168.0.125:32335) 已離線超過 1 分鐘"

# ===== Velero backup alerts =====
- name: velero_alerts
  rules:
  # Velero backup failed: the failure counter grew within the last 24 hours.
  # increase() extrapolates over the window and can return fractional values,
  # so the count is rendered with printf "%.0f" to read as a whole number.
  # No `for:` clause, so the alert fires on the first evaluation that matches.
  - alert: VeleroBackupFailed
    expr: 'increase(velero_backup_failure_total[24h]) > 0'
    labels:
      severity: warning
      team: ops
      component: velero
    annotations:
      summary: "⚠️ Velero 備份失敗"
      description: '過去 24 小時有 {{ printf "%.0f" $value }} 次備份失敗'

  # Velero backup overdue: the last successful backup of any schedule whose
  # name matches .*awoooi.* is older than 129600 s (36 h). The condition must
  # then hold for a further 1 h before the alert fires.
  - alert: VeleroBackupMissing
    expr: '(time() - velero_backup_last_successful_timestamp{schedule=~".*awoooi.*"}) > 129600'
    for: 1h
    labels:
      severity: critical
      team: ops
      component: velero
    annotations:
      summary: "🔴 Velero 備份超時"
      description: "awoooi 排程備份已超過 36 小時未成功"

# ===== Observability service alerts =====
# Both rules evaluate blackbox TCP probe results and wait 2 minutes before firing.
- name: observability_alerts
  rules:
  # SignOz offline: TCP probe of 192.168.0.188:3301 failing.
  - alert: SignOzDown
    expr: 'probe_success{job="blackbox-tcp", instance="192.168.0.188:3301"} == 0'
    for: 2m
    labels:
      severity: warning
      team: ops
      component: signoz
    annotations:
      summary: "⚠️ SignOz 服務離線"
      description: "SignOz (192.168.0.188:3301) 已離線超過 2 分鐘"

  # Sentry offline: TCP probe of 192.168.0.110:9000 failing.
  - alert: SentryDown
    expr: 'probe_success{job="blackbox-tcp", instance="192.168.0.110:9000"} == 0'
    for: 2m
    labels:
      severity: warning
      team: ops
      component: sentry
    annotations:
      summary: "⚠️ Sentry 服務離線"
      description: "Sentry (192.168.0.110:9000) 已離線超過 2 分鐘"

# ===== kube-state-metrics alerts (Kubernetes object state) =====
- name: kube_state_metrics_alerts
  rules:
  # Pod restarting too often: more than 3 container restarts within 1 hour in
  # the awoooi-prod namespace. increase() extrapolates and can return
  # fractional values, so the count is rendered with printf "%.0f".
  - alert: PodRestartingTooMuch
    expr: 'increase(kube_pod_container_status_restarts_total{namespace="awoooi-prod"}[1h]) > 3'
    labels:
      severity: warning
      team: ops
      component: k8s
    annotations:
      summary: "⚠️ Pod 重啟過多"
      description: '{{ $labels.namespace }}/{{ $labels.pod }} 過去 1 小時重啟 {{ printf "%.0f" $value }} 次'

  # Deployment replica mismatch: desired and available replica counts differ
  # for 5 minutes. A filtering comparison keeps the left-hand sample, so the
  # spec (desired) count is placed on the left; $value is then the expected
  # replica count, which is what the description text reports.
  - alert: DeploymentReplicasMismatch
    expr: >-
      kube_deployment_spec_replicas{namespace="awoooi-prod"}
      != kube_deployment_status_replicas_available{namespace="awoooi-prod"}
    for: 5m
    labels:
      severity: warning
      team: ops
      component: k8s
    annotations:
      summary: "⚠️ Deployment 副本不足"
      description: "{{ $labels.deployment }} 期望 {{ $value }} 副本但可用數不符"

  # Pod stuck in Pending for 10 minutes in the awoooi-prod namespace.
  - alert: PodPendingTooLong
    expr: 'kube_pod_status_phase{phase="Pending", namespace="awoooi-prod"} == 1'
    for: 10m
    labels:
      severity: warning
      team: ops
      component: k8s
    annotations:
      summary: "⚠️ Pod 卡在 Pending"
      description: "{{ $labels.pod }} 已處於 Pending 狀態超過 10 分鐘"

  # Node memory pressure (NPD integration): MemoryPressure condition true for 2 minutes.
  - alert: NodeMemoryPressure
    expr: 'kube_node_status_condition{condition="MemoryPressure", status="true"} == 1'
    for: 2m
    labels:
      severity: critical
      team: ops
      component: k8s
    annotations:
      summary: "🔴 節點記憶體壓力"
      description: "節點 {{ $labels.node }} 記憶體不足"

  # Node disk pressure (NPD integration): DiskPressure condition true for 2 minutes.
  - alert: NodeDiskPressure
    expr: 'kube_node_status_condition{condition="DiskPressure", status="true"} == 1'
    for: 2m
    labels:
      severity: critical
      team: ops
      component: k8s
    annotations:
      summary: "🔴 節點磁碟壓力"
      description: "節點 {{ $labels.node }} 磁碟空間不足"

  # Node PID pressure (NPD integration): PIDPressure condition true for 2 minutes.
  - alert: NodePIDPressure
    expr: 'kube_node_status_condition{condition="PIDPressure", status="true"} == 1'
    for: 2m
    labels:
      severity: warning
      team: ops
      component: k8s
    annotations:
      summary: "⚠️ 節點 PID 壓力"
      description: "節點 {{ $labels.node }} PID 資源不足"

  # Node not ready: the Ready condition's status="true" series is 0 for 2 minutes.
  - alert: NodeNotReady
    expr: 'kube_node_status_condition{condition="Ready", status="true"} == 0'
    for: 2m
    labels:
      severity: critical
      team: ops
      component: k8s
    annotations:
      summary: "🔴 節點未就緒"
      description: "節點 {{ $labels.node }} 狀態不是 Ready"