---
# =============================================================================
# WOOO TECH - Momo Pro System
# Kube-Prometheus-Stack Values
#
# Helm values for the kube-prometheus-stack chart (UAT environment).
# Sections: Alertmanager routing/receivers, Prometheus, Grafana, node exporter,
# kube-state-metrics, disabled control-plane scrapers, Prometheus Operator.
# =============================================================================

# Alertmanager configuration
alertmanager:
  enabled: true
  config:
    global:
      resolve_timeout: 5m
    route:
      group_by: ['alertname', 'namespace']
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 4h
      # Silent by default: only alerts matched by an explicit route below are
      # delivered to a real receiver.
      receiver: 'null'
      routes:
        # ============================================
        # Silencing rules (these alerts send no notification)
        # ============================================
        # Built-in system alerts
        - match:
            alertname: InfoInhibitor
          receiver: 'null'
        - match:
            alertname: Watchdog
          receiver: 'null'
        # API error-budget alerts (frequent false positives)
        - match:
            alertname: KubeAPIErrorBudgetBurn
          receiver: 'null'
        - match:
            alertname: KubeAPILatencyHigh
          receiver: 'null'
        # Alerts with severity none / info
        - match:
            severity: 'none'
          receiver: 'null'
        - match:
            severity: info
          receiver: 'null'
        # Rancher system alerts
        - match:
            namespace: cattle-system
          receiver: 'null'
        - match:
            namespace: cattle-fleet-system
          receiver: 'null'
        # kube-system alerts
        - match:
            namespace: kube-system
          receiver: 'null'
        # CPU throttling alerts in the monitoring namespace (common)
        - match:
            namespace: monitoring
            alertname: CPUThrottlingHigh
          receiver: 'null'
        # ============================================
        # Delivery rules (only important alerts from the momo namespace)
        # ============================================
        - match:
            namespace: momo
            severity: critical
          receiver: 'telegram'
          repeat_interval: 1h
        - match:
            namespace: momo
            severity: warning
          receiver: 'telegram'
          repeat_interval: 4h
    receivers:
      # Receiver with no integrations: alerts routed here are dropped.
      - name: 'null'
      - name: 'telegram'
        telegram_configs:
          # NOTE(review): bot_token and chat_id are blank in this file;
          # presumably the real values are injected at deploy time (--set or a
          # private values file). Alertmanager needs non-empty values to load
          # this receiver - confirm before deploying this file as-is.
          - bot_token: ''
            chat_id: ''
            parse_mode: 'HTML'
            message: |
              {{ if eq .Status "firing" }}🚨🔥 告警觸發 🔥🚨{{ else }}✅💚 告警恢復 💚✅{{ end }}
              🏢 環境: 🟦 UAT (mo.wooo.work)
              📋 告警名稱: {{ .CommonLabels.alertname }}
              {{ if eq .CommonLabels.severity "critical" }}🔴{{ else if eq .CommonLabels.severity "warning" }}🟡{{ else }}🔵{{ end }} 嚴重程度: {{ .CommonLabels.severity }}
              📦 命名空間: {{ .CommonLabels.namespace }}
              {{ range .Alerts }}
              📝 摘要: {{ .Annotations.summary }}
              💬 詳情: {{ .Annotations.description }}
              ⏰ 時間: {{ .StartsAt.Local.Format "2006-01-02 15:04:05" }}
              {{ end }}
              🏷️ MOMO Pro K8s 監控系統 (UAT)
    inhibit_rules:
      # A firing critical alert suppresses the warning alert that shares the
      # same alertname and namespace.
      - source_match:
          severity: 'critical'
        target_match:
          severity: 'warning'
        equal: ['alertname', 'namespace']
  alertmanagerSpec:
    resources:
      requests:
        memory: 64Mi
        cpu: 10m
      limits:
        memory: 128Mi
        cpu: 100m
    storage:
      volumeClaimTemplate:
        spec:
          storageClassName: local-path
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 1Gi

# Prometheus configuration
prometheus:
  enabled: true
  prometheusSpec:
    retention: 7d
    resources:
      requests:
        memory: 256Mi
        cpu: 100m
      limits:
        memory: 1Gi
        cpu: 500m
    storageSpec:
      volumeClaimTemplate:
        spec:
          storageClassName: local-path
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 10Gi
    # Monitor the momo namespace: select ServiceMonitors/PodMonitors from every
    # namespace regardless of labels.
    # The *SelectorNilUsesHelmValues flags must be false; with the chart
    # default (true) an empty selector is replaced by a `release: <name>` label
    # selector, so monitors without that label would be ignored.
    serviceMonitorSelectorNilUsesHelmValues: false
    podMonitorSelectorNilUsesHelmValues: false
    serviceMonitorNamespaceSelector: {}
    serviceMonitorSelector: {}
    podMonitorNamespaceSelector: {}
    podMonitorSelector: {}

# Grafana configuration
grafana:
  enabled: true
  # NOTE(security): plaintext admin credential stored in this values file.
  # Prefer a pre-created Secret referenced through `admin.existingSecret`
  # (with `admin.userKey` / `admin.passwordKey`) and rotate this password.
  adminPassword: "Wooo_Grafana_2026"
  persistence:
    enabled: true
    storageClassName: local-path
    size: 2Gi
  resources:
    requests:
      memory: 128Mi
      cpu: 50m
    limits:
      memory: 256Mi
      cpu: 200m
  # Default dashboards
  defaultDashboardsEnabled: true
  defaultDashboardsTimezone: Asia/Taipei

# Node Exporter
# `nodeExporter` only toggles the subchart; its settings (including resources)
# are read from the `prometheus-node-exporter` key.
nodeExporter:
  enabled: true
prometheus-node-exporter:
  resources:
    requests:
      memory: 32Mi
      cpu: 10m
    limits:
      memory: 64Mi
      cpu: 100m

# Kube State Metrics
# `kubeStateMetrics` only toggles the subchart; its settings (including
# resources) are read from the `kube-state-metrics` key.
kubeStateMetrics:
  enabled: true
kube-state-metrics:
  resources:
    requests:
      memory: 32Mi
      cpu: 10m
    limits:
      memory: 128Mi
      cpu: 100m

# Disable components that are not needed (built into K3s)
kubeControllerManager:
  enabled: false
kubeScheduler:
  enabled: false
kubeProxy:
  enabled: false
kubeEtcd:
  enabled: false

# Prometheus Operator
prometheusOperator:
  resources:
    requests:
      memory: 64Mi
      cpu: 10m
    limits:
      memory: 256Mi
      cpu: 200m