Files
awoooi/k8s/monitoring/k3s-alerts.yaml
OG T 0b68352fc2 feat(k3s): P2/P3 改進 - kube-state-metrics + Kured 時區修復 + Descheduler 調整
P2 改進:
- 新增 kube-state-metrics v2.10.1 (NodePort:30888)
- 新增 7 條 kube-state-metrics 告警規則 (NPD 整合)

P3 改進:
- 修復 Kured 維護窗口時區 (18:00→02:00 台北時間)
- Descheduler threshold 20%→30% (避免過度遷移)

首席架構師審查建議執行項目

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-28 22:23:42 +08:00

204 lines
6.9 KiB
YAML

# =============================================================================
# K3s Infrastructure Alerts
# =============================================================================
# K-MON 2026-03-28: Alert rules for K3s infrastructure monitoring
# Deployed by: Claude Code (首席架構師)
# Deployed to: 192.168.0.188 Prometheus/Alertmanager
# =============================================================================
#
# 這個檔案是 Prometheus alert rules 的版本控制副本
# 實際部署位置: /home/ollama/momo-pro/monitoring/alerts.yml (188)
#
# =============================================================================
groups:
# ===== K3s infrastructure alerts =====
# Reachability checks driven by blackbox probes: the K3s API VIP, the K3s
# nodes, and the AWOOOI API/Web endpoints published on the VIP address.
- name: k3s_infrastructure
  rules:
  # K3s API VIP unreachable (TCP probe to 192.168.0.125:6443 fails).
  - alert: K3sVIPDown
    expr: 'probe_success{job="blackbox-tcp", instance="192.168.0.125:6443"} == 0'
    for: 30s
    labels:
      severity: critical
      team: ops
      component: k3s
    annotations:
      summary: "🔴 K3s VIP 無法連接"
      description: "K3s API VIP 192.168.0.125:6443 已離線超過 30 秒"

  # K3s node unreachable (ICMP probe); the regex selects the instances
  # 192.168.0.120 and 192.168.0.121.
  - alert: K3sNodeDown
    expr: 'probe_success{job="blackbox-icmp", instance=~"192.168.0.12[01]"} == 0'
    for: 1m
    labels:
      severity: critical
      team: ops
      component: k3s
    annotations:
      summary: "🔴 K3s 節點離線"
      description: "K3s 節點 {{ $labels.instance }} 已離線超過 1 分鐘"

  # AWOOOI API unreachable (TCP probe to port 32334 on the VIP).
  - alert: AWOOOIApiDown
    expr: 'probe_success{job="blackbox-tcp", instance="192.168.0.125:32334"} == 0'
    for: 1m
    labels:
      severity: critical
      team: ops
      component: awoooi
    annotations:
      summary: "🔴 AWOOOI API 離線"
      description: "AWOOOI API (192.168.0.125:32334) 已離線超過 1 分鐘"

  # AWOOOI Web unreachable (TCP probe to port 32335 on the VIP).
  # Rated warning, whereas the API rule above is critical.
  - alert: AWOOOIWebDown
    expr: 'probe_success{job="blackbox-tcp", instance="192.168.0.125:32335"} == 0'
    for: 1m
    labels:
      severity: warning
      team: ops
      component: awoooi
    annotations:
      summary: "⚠️ AWOOOI Web 離線"
      description: "AWOOOI Web (192.168.0.125:32335) 已離線超過 1 分鐘"
# ===== Velero backup alerts =====
- name: velero_alerts
  rules:
  # At least one backup failure was counted during the last 24 hours.
  # There is no `for:` clause, so the alert fires on the first evaluation
  # where the increase is positive.
  - alert: VeleroBackupFailed
    expr: 'increase(velero_backup_failure_total[24h]) > 0'
    labels:
      severity: warning
      team: ops
      component: velero
    annotations:
      summary: "⚠️ Velero 備份失敗"
      description: "過去 24 小時有 {{ $value }} 次備份失敗"

  # A schedule whose name contains "awoooi" has had no successful backup
  # for more than 129600 seconds (36 hours), sustained for one hour.
  - alert: VeleroBackupMissing
    expr: '(time() - velero_backup_last_successful_timestamp{schedule=~".*awoooi.*"}) > 129600'
    for: 1h
    labels:
      severity: critical
      team: ops
      component: velero
    annotations:
      summary: "🔴 Velero 備份超時"
      description: "awoooi 排程備份已超過 36 小時未成功"
# ===== Observability service alerts =====
# TCP reachability of the observability backends, checked through the
# blackbox-tcp probe job.
- name: observability_alerts
  rules:
  # SignOz unreachable at 192.168.0.188:3301 for 2 minutes.
  - alert: SignOzDown
    expr: 'probe_success{job="blackbox-tcp", instance="192.168.0.188:3301"} == 0'
    for: 2m
    labels:
      severity: warning
      team: ops
      component: signoz
    annotations:
      summary: "⚠️ SignOz 服務離線"
      description: "SignOz (192.168.0.188:3301) 已離線超過 2 分鐘"

  # Sentry unreachable at 192.168.0.110:9000 for 2 minutes.
  - alert: SentryDown
    expr: 'probe_success{job="blackbox-tcp", instance="192.168.0.110:9000"} == 0'
    for: 2m
    labels:
      severity: warning
      team: ops
      component: sentry
    annotations:
      summary: "⚠️ Sentry 服務離線"
      description: "Sentry (192.168.0.110:9000) 已離線超過 2 分鐘"
# ===== kube-state-metrics alerts (Kubernetes object state) =====
# Pod/Deployment rules are scoped to the awoooi-prod namespace; the node
# condition rules apply to every node reported by kube-state-metrics.
- name: kube_state_metrics_alerts
  rules:
  # A container restarted more than 3 times within the last hour.
  - alert: PodRestartingTooMuch
    expr: 'increase(kube_pod_container_status_restarts_total{namespace="awoooi-prod"}[1h]) > 3'
    labels:
      severity: warning
      team: ops
      component: k8s
    annotations:
      summary: "⚠️ Pod 重啟過多"
      description: "{{ $labels.namespace }}/{{ $labels.pod }} 過去 1 小時重啟 {{ $value }} 次"

  # Available replicas differ from the desired replica count for 5 minutes.
  # A filtering comparison keeps the LEFT-hand sample as $value, so the
  # desired count (spec_replicas) is deliberately placed on the left: the
  # description renders $value as the *expected* number of replicas.
  - alert: DeploymentReplicasMismatch
    expr: >-
      kube_deployment_spec_replicas{namespace="awoooi-prod"}
      != kube_deployment_status_replicas_available{namespace="awoooi-prod"}
    for: 5m
    labels:
      severity: warning
      team: ops
      component: k8s
    annotations:
      summary: "⚠️ Deployment 副本不足"
      description: "{{ $labels.deployment }} 期望 {{ $value }} 副本但可用數不符"

  # Pod has been in the Pending phase for 10 minutes.
  - alert: PodPendingTooLong
    expr: 'kube_pod_status_phase{phase="Pending", namespace="awoooi-prod"} == 1'
    for: 10m
    labels:
      severity: warning
      team: ops
      component: k8s
    annotations:
      summary: "⚠️ Pod 卡在 Pending"
      description: "{{ $labels.pod }} 已處於 Pending 狀態超過 10 分鐘"

  # Node reports the MemoryPressure condition (NPD integration).
  - alert: NodeMemoryPressure
    expr: 'kube_node_status_condition{condition="MemoryPressure", status="true"} == 1'
    for: 2m
    labels:
      severity: critical
      team: ops
      component: k8s
    annotations:
      summary: "🔴 節點記憶體壓力"
      description: "節點 {{ $labels.node }} 記憶體不足"

  # Node reports the DiskPressure condition (NPD integration).
  - alert: NodeDiskPressure
    expr: 'kube_node_status_condition{condition="DiskPressure", status="true"} == 1'
    for: 2m
    labels:
      severity: critical
      team: ops
      component: k8s
    annotations:
      summary: "🔴 節點磁碟壓力"
      description: "節點 {{ $labels.node }} 磁碟空間不足"

  # Node reports the PIDPressure condition (NPD integration).
  - alert: NodePIDPressure
    expr: 'kube_node_status_condition{condition="PIDPressure", status="true"} == 1'
    for: 2m
    labels:
      severity: warning
      team: ops
      component: k8s
    annotations:
      summary: "⚠️ 節點 PID 壓力"
      description: "節點 {{ $labels.node }} PID 資源不足"

  # Node is not Ready: the Ready condition's status="true" series is 0.
  - alert: NodeNotReady
    expr: 'kube_node_status_condition{condition="Ready", status="true"} == 0'
    for: 2m
    labels:
      severity: critical
      team: ops
      component: k8s
    annotations:
      summary: "🔴 節點未就緒"
      description: "節點 {{ $labels.node }} 狀態不是 Ready"