diff --git a/k8s/monitoring/docker-compose-110.yml b/k8s/monitoring/docker-compose-110.yml
index 3116b227..6f23c1ca 100644
--- a/k8s/monitoring/docker-compose-110.yml
+++ b/k8s/monitoring/docker-compose-110.yml
@@ -1,6 +1,14 @@
 version: '3.8'
 
 services:
+  # ---------------------------------------------------------------------------
+  # cAdvisor 110 safety net (2026-04-19 early morning Taipei / Claude Opus 4.7 / Phase 7 blind-spot remediation)
+  # Why: cadvisor on 110 currently sits at 0% CPU (flags already trim its metrics) but has no mem_limit/cpus safety net.
+  #      Add quotas as a permanent L2 guard: if load ever explodes, the container is OOMKilled instead of dragging down host 110.
+  # Maps to: ADR-090 Layer 2 mandatory resource quotas
+  # Caution: this compose file can drift from the live one on 110 (/home/wooo/monitoring/docker-compose.yml);
+  #          no CD workflow syncs them yet -> tracked as tech debt, to be brought under Git + CD next session
+  # ---------------------------------------------------------------------------
   cadvisor:
     image: gcr.io/cadvisor/cadvisor:latest
     container_name: cadvisor
@@ -27,6 +35,8 @@ services:
       - /dev/kmsg
     networks:
       - monitoring
+    mem_limit: 512m
+    cpus: 1.0
 
   prometheus:
     image: prom/prometheus:latest
diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml
index bf261f3b..eeca4100 100644
--- a/ops/monitoring/alerts-unified.yml
+++ b/ops/monitoring/alerts-unified.yml
@@ -929,3 +929,175 @@ groups:
           summary: "主機 {{ $labels.instance }} 無法連通"
           description: "TCP probe 到 {{ $labels.instance }} 失敗超過 5 分鐘,可能發生網路分區。"
           runbook: "SSH 檢查路由和防火牆規則"
+
+  # =========================================================================
+  # Monitoring-tool self-monitoring (infra_self_monitoring) — ADR-090 Phase 7
+  # 2026-04-19 Claude Opus 4.7 / Iron rule: monitoring tools must themselves be monitored
+  # Design: no hard-coded CPU% or MB figures; judge dynamically from (share of quota) + (throttle signal)
+  #         Quotas are declared in docker-compose; alert condition = usage / quota > 0.8
+  #         Smarter than a fixed 80% — when the quota changes, the alert threshold follows automatically
+  # Guard:  memory ratios divide by (limit > 0). cAdvisor exports
+  #         container_spec_memory_limit_bytes = 0 for a container without mem_limit, and
+  #         usage / 0 = +Inf would satisfy "> 0.8" and fire the alert permanently.
+  # =========================================================================
+  - name: infra_self_monitoring
+    interval: 1m
+    rules:
+
+      # --- cadvisor self-monitoring ---
+      - alert: CadvisorDown
+        expr: up{job=~".*cadvisor.*"} == 0
+        for: 5m
+        labels:
+          severity: critical
+          layer: docker-110-188
+          component: cadvisor
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"
+        annotations:
+          summary: "cAdvisor ({{ $labels.instance }}) 停擺"
+          description: "主機 {{ $labels.instance }} 的 cadvisor 已停擺 5 分鐘,容器監控中斷。"
+          runbook: "SSH 主機 docker compose up -d cadvisor;檢查 OOMKill 訊號"
+
+      - alert: CadvisorMemoryPressure
+        expr: container_memory_usage_bytes{name="cadvisor"} / (container_spec_memory_limit_bytes{name="cadvisor"} > 0) > 0.8
+        for: 10m
+        labels:
+          severity: warning
+          component: cadvisor
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "cAdvisor 記憶體使用率 > 80% limit"
+          description: "cadvisor 記憶體用量 / mem_limit = {{ $value | humanizePercentage }},接近 OOMKill。"
+          runbook: "若頻繁觸發 → 檢查 cardinality 是否持續成長,考慮調整 --disable_metrics"
+
+      - alert: CadvisorCPUThrottled
+        expr: rate(container_cpu_cfs_throttled_seconds_total{name="cadvisor"}[5m]) > 0.5
+        for: 15m
+        labels:
+          severity: warning
+          component: cadvisor
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "cAdvisor CPU 被 throttle(配額不足)"
+          description: "cadvisor 每秒被 throttle {{ $value }} 秒,表示實際需求超過 cpus 配額。"
+          runbook: "調高 docker-compose cpus 設定,或檢查 scrape interval / cardinality"
+
+      # --- node-exporter self-monitoring ---
+      - alert: NodeExporterDown
+        expr: up{job=~"node-exporter.*|node_exporter.*"} == 0
+        for: 5m
+        labels:
+          severity: critical
+          component: node-exporter
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"
+        annotations:
+          summary: "node-exporter ({{ $labels.instance }}) 停擺"
+          description: "主機 {{ $labels.instance }} node-exporter 已停擺 5 分鐘,主機 metrics 中斷。"
+          runbook: "SSH 主機檢查 docker ps node-exporter;重啟 docker compose up -d node-exporter"
+
+      - alert: NodeExporterCPUThrottled
+        expr: rate(container_cpu_cfs_throttled_seconds_total{name="node-exporter"}[5m]) > 0.5
+        for: 15m
+        labels:
+          severity: warning
+          component: node-exporter
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "node-exporter CPU 被 throttle(配額不足)"
+          description: "node-exporter 每秒被 throttle {{ $value }} 秒。可能 collector 未適度 disable。"
+          runbook: "檢查 node-exporter --collector.* flags 是否該關掉閒置硬體 probe"
+
+      # --- Sentry self-hosted self-monitoring (110) ---
+      - alert: SentryClickHouseMemoryPressure
+        expr: container_memory_usage_bytes{name=~".*sentry.*clickhouse.*"} / (container_spec_memory_limit_bytes{name=~".*sentry.*clickhouse.*"} > 0) > 0.8
+        for: 10m
+        labels:
+          severity: warning
+          component: sentry-clickhouse
+          team: platform
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "Sentry ClickHouse 記憶體使用率 > 80% limit"
+          description: "sentry clickhouse 用量 / mem_limit = {{ $value | humanizePercentage }}。"
+          runbook: "檢查 Sentry 查詢壓力;調整 /opt/sentry/docker-compose.override.yml clickhouse mem_limit"
+
+      - alert: SentryClickHouseCPUThrottled
+        expr: rate(container_cpu_cfs_throttled_seconds_total{name=~".*sentry.*clickhouse.*"}[5m]) > 1.0
+        for: 15m
+        labels:
+          severity: warning
+          component: sentry-clickhouse
+          team: platform
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "Sentry ClickHouse CPU 持續被 throttle"
+          description: "每秒 throttle {{ $value }} 秒,配額 cpus=4.0 可能不足。"
+          runbook: "檢查 Sentry retention / query pattern;必要時調高 override.yml cpus"
+
+      # --- Gitea self-monitoring ---
+      - alert: GiteaMemoryPressure
+        expr: container_memory_usage_bytes{name="gitea"} / (container_spec_memory_limit_bytes{name="gitea"} > 0) > 0.8
+        for: 10m
+        labels:
+          severity: warning
+          component: gitea
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "Gitea 記憶體使用率 > 80% limit"
+          description: "gitea 用量 / mem_limit = {{ $value | humanizePercentage }}。"
+          runbook: "檢查 CI/CD 任務堆積;必要時調高 docker-compose mem_limit"
+
+      - alert: GiteaCPUThrottled
+        expr: rate(container_cpu_cfs_throttled_seconds_total{name=~"gitea|gitea-runner"}[5m]) > 1.0
+        for: 15m
+        labels:
+          severity: warning
+          component: gitea
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "Gitea / Runner CPU 持續被 throttle"
+          description: "{{ $labels.name }} 每秒 throttle {{ $value }} 秒,CD peak 可能卡關。"
+          runbook: "檢查 job 並行度;考慮縮減並行或調高 cpus"
+
+      # --- Meta layer: monitoring the monitor (Prometheus itself) ---
+      # NOTE(review): a Prometheus that is itself down cannot evaluate this rule; it only fires
+      #   if another Prometheus/ruler loads this file — confirm, or add an external dead-man's switch.
+      - alert: PrometheusDown
+        expr: up{job="prometheus"} == 0
+        for: 2m
+        labels:
+          severity: critical
+          component: prometheus
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"
+        annotations:
+          summary: "Prometheus ({{ $labels.instance }}) 停擺"
+          description: "Prometheus 自己停擺 → 所有其他告警失效"
+          runbook: "SSH 110 docker compose -f /home/wooo/monitoring/docker-compose.yml up -d prometheus"