Files
ewoooc/k8s/monitoring/values-prometheus.yaml
ogt 1b4f3a7bbe
Some checks failed
CD Pipeline / deploy (push) Failing after 59s
feat: EwoooC 初始化 — 完整專案推版至 Gitea
- 建立 Gitea Actions CD pipeline (.gitea/workflows/cd.yaml)
- 部署模式: rsync Python 檔案至 188 → docker restart (volume mount)
- Dockerfile/requirements 變動時自動重建 Docker image
- 部署通知: Telegram (開始/成功/失敗)
- 健康檢查: https://mo.wooo.work/health (最多 5 次重試)
- 同步最新 CLAUDE.md / ADR-008 / memory (2026-04-19)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-19 01:21:13 +08:00

209 lines
5.3 KiB
YAML

# =============================================================================
# WOOO TECH - Momo Pro System
# Kube-Prometheus-Stack Values
# =============================================================================
# Alertmanager configuration.
# Routing policy: the default receiver is 'null' (silent); a notification is
# only sent when an alert reaches a route whose receiver is 'telegram'.
# Routes are evaluated top to bottom and the first match wins, so the silence
# rules below take precedence over the notification rules.
#
# NOTE(review): `match`, `source_match` and `target_match` are deprecated in
# Alertmanager in favor of `matchers` / `source_matchers` / `target_matchers`.
# They are kept here unchanged to preserve the existing routing behavior.
alertmanager:
  enabled: true
  config:
    global:
      resolve_timeout: 5m
    route:
      group_by: ['alertname', 'namespace']
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 4h
      receiver: 'null'  # silent by default; only explicitly matched alerts notify
      routes:
        # ============================================
        # Silence rules (these alerts send no notification)
        # ============================================
        # Built-in system alerts
        - match:
            alertname: InfoInhibitor
          receiver: 'null'
        - match:
            alertname: Watchdog
          receiver: 'null'
        # API error-budget alerts (frequent false positives)
        - match:
            alertname: KubeAPIErrorBudgetBurn
          receiver: 'null'
        - match:
            alertname: KubeAPILatencyHigh
          receiver: 'null'
        # Alerts with severity info/none
        - match:
            severity: none
          receiver: 'null'
        - match:
            severity: info
          receiver: 'null'
        # Rancher system alerts
        - match:
            namespace: cattle-system
          receiver: 'null'
        - match:
            namespace: cattle-fleet-system
          receiver: 'null'
        # kube-system alerts
        - match:
            namespace: kube-system
          receiver: 'null'
        # CPU throttling alerts in the monitoring namespace (common)
        - match:
            namespace: monitoring
            alertname: CPUThrottlingHigh
          receiver: 'null'
        # ============================================
        # Notification rules (only important alerts from the momo namespace)
        # ============================================
        - match:
            namespace: momo
            severity: critical
          receiver: 'telegram'
          repeat_interval: 1h
        - match:
            namespace: momo
            severity: warning
          receiver: 'telegram'
          repeat_interval: 4h
    receivers:
      - name: 'null'
      - name: 'telegram'
        telegram_configs:
          # SECURITY: the bot token must not live in version control. It is
          # read from a file backed by the Kubernetes Secret listed under
          # `alertmanagerSpec.secrets` below. Create it before deploying, e.g.:
          #   kubectl -n <monitoring-namespace> create secret generic \
          #     alertmanager-telegram --from-literal=bot-token=<TOKEN>
          # The token that was previously committed in this file must be
          # treated as compromised and revoked via @BotFather.
          # NOTE(review): `bot_token_file` needs Alertmanager >= 0.26 —
          # confirm the deployed version.
          - bot_token_file: /etc/alertmanager/secrets/alertmanager-telegram/bot-token
            # Telegram expects a numeric chat id, so this stays unquoted.
            chat_id: 5619078117
            parse_mode: 'HTML'
            message: |
              {{ if eq .Status "firing" }}🚨🔥 <b>告警觸發</b> 🔥🚨{{ else }}✅💚 <b>告警恢復</b> 💚✅{{ end }}
              🏢 <b>環境:</b> 🟦 <code>UAT</code> (mo.wooo.work)
              📋 <b>告警名稱:</b> {{ .CommonLabels.alertname }}
              {{ if eq .CommonLabels.severity "critical" }}🔴{{ else if eq .CommonLabels.severity "warning" }}🟡{{ else }}🔵{{ end }} <b>嚴重程度:</b> {{ .CommonLabels.severity }}
              📦 <b>命名空間:</b> {{ .CommonLabels.namespace }}
              {{ range .Alerts }}
              📝 <b>摘要:</b> {{ .Annotations.summary }}
              💬 <b>詳情:</b> {{ .Annotations.description }}
              ⏰ <i>時間: {{ .StartsAt.Local.Format "2006-01-02 15:04:05" }}</i>
              {{ end }}
              🏷️ <i>MOMO Pro K8s 監控系統 (UAT)</i>
    # A firing critical alert suppresses the warning alert that shares the
    # same alertname and namespace.
    inhibit_rules:
      - source_match:
          severity: 'critical'
        target_match:
          severity: 'warning'
        equal: ['alertname', 'namespace']
  alertmanagerSpec:
    # Secrets listed here are mounted by the Prometheus Operator at
    # /etc/alertmanager/secrets/<secret-name>/ inside the alertmanager
    # container; the Telegram receiver reads its bot token from there.
    secrets:
      - alertmanager-telegram
    resources:
      requests:
        memory: 64Mi
        cpu: 10m
      limits:
        memory: 128Mi
        cpu: 100m
    storage:
      volumeClaimTemplate:
        spec:
          storageClassName: local-path
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 1Gi
# Prometheus configuration
prometheus:
  enabled: true
  prometheusSpec:
    retention: 7d
    resources:
      requests:
        memory: 256Mi
        cpu: 100m
      limits:
        memory: 1Gi
        cpu: 500m
    storageSpec:
      volumeClaimTemplate:
        spec:
          storageClassName: local-path
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 10Gi
    # Monitor the momo namespace (and any other namespace).
    # In kube-prometheus-stack an empty selector (`{}`) is treated as "unset",
    # and while the *SelectorNilUsesHelmValues flags keep their default of
    # `true` the chart falls back to matching only objects labelled
    # `release: <helm release name>`. Setting the flags to `false` makes the
    # empty selectors below actually mean "select every ServiceMonitor /
    # PodMonitor", which is what application monitors in `momo` rely on.
    serviceMonitorSelectorNilUsesHelmValues: false
    podMonitorSelectorNilUsesHelmValues: false
    serviceMonitorNamespaceSelector: {}
    serviceMonitorSelector: {}
    podMonitorNamespaceSelector: {}
    podMonitorSelector: {}
# Grafana configuration
grafana:
  enabled: true
  # SECURITY: admin credentials must not be committed in plain text. They are
  # read from a pre-created Kubernetes Secret instead, e.g.:
  #   kubectl -n <monitoring-namespace> create secret generic grafana-admin \
  #     --from-literal=admin-user=admin \
  #     --from-literal=admin-password=<PASSWORD>
  # The password that was previously committed in this file must be treated
  # as compromised and rotated.
  admin:
    existingSecret: grafana-admin
    userKey: admin-user
    passwordKey: admin-password
  persistence:
    enabled: true
    storageClassName: local-path
    size: 2Gi
  resources:
    requests:
      memory: 128Mi
      cpu: 50m
    limits:
      memory: 256Mi
      cpu: 200m
  # Default dashboards
  defaultDashboardsEnabled: true
  defaultDashboardsTimezone: Asia/Taipei
# Node Exporter
# `nodeExporter.enabled` toggles the component in kube-prometheus-stack.
nodeExporter:
  enabled: true
# Pod-level settings such as resources are read from the
# `prometheus-node-exporter` subchart key; placed under `nodeExporter`
# they are silently ignored by the chart.
prometheus-node-exporter:
  resources:
    requests:
      memory: 32Mi
      cpu: 10m
    limits:
      memory: 64Mi
      cpu: 100m
# Kube State Metrics
# `kubeStateMetrics.enabled` toggles the component in kube-prometheus-stack.
kubeStateMetrics:
  enabled: true
# Pod-level settings such as resources are read from the
# `kube-state-metrics` subchart key; placed under `kubeStateMetrics`
# they are silently ignored by the chart.
kube-state-metrics:
  resources:
    requests:
      memory: 32Mi
      cpu: 10m
    limits:
      memory: 128Mi
      cpu: 100m
# Disable scrape targets that are not needed (built into K3s)
kubeControllerManager:
  enabled: false

kubeScheduler:
  enabled: false

kubeProxy:
  enabled: false

kubeEtcd:
  enabled: false
# Prometheus Operator: resource requests and limits for the operator pod
prometheusOperator:
  resources:
    requests:
      memory: 64Mi
      cpu: 10m
    limits:
      memory: 256Mi
      cpu: 200m