From eab3f527cd8b689b28e5b0c11cfc084a33967210 Mon Sep 17 00:00:00 2001 From: OG T Date: Sun, 19 Apr 2026 01:50:41 +0800 Subject: [PATCH] =?UTF-8?q?feat(monitoring):=20Phase=207=20=E7=9B=B2?= =?UTF-8?q?=E5=8D=80=E6=B2=BB=E7=90=86=20=E2=80=94=20L2=20=E9=85=8D?= =?UTF-8?q?=E9=A1=8D=20+=20=E8=87=AA=E7=9B=A3=E6=8E=A7=E5=91=8A=E8=AD=A6?= =?UTF-8?q?=20(ADR-090)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 戰場:110 load=17 持續 13 天 + 188 cadvisor 321% CPU 重啟無效 統帥鐵律:不要只降低,要長期解決 → 結構性治理而非補丁 本 commit 涵蓋: 1. k8s/monitoring/docker-compose-110.yml - cadvisor 加 mem_limit 512M + cpus 1.0(L2 防爆網) - 備註 110 live 與本檔 drift(下一 session 納入 CD) 2. ops/monitoring/alerts-unified.yml 新增 infra_self_monitoring 群組: - CadvisorDown / MemoryPressure / CPUThrottled - NodeExporterDown / CPUThrottled - SentryClickHouseMemoryPressure / CPUThrottled - GiteaMemoryPressure / CPUThrottled - PrometheusDown(監控自監控元層) → 全部用 (memory usage / spec_memory_limit) 動態判斷, 不寫死 80% 或 MB 數,配額改閾值自動跟著變 其他配套(非本 repo,已 SSH patch 到 110/188): - /home/ollama/wooo-aiops/docker-compose.yml:188 cadvisor 加 --disable_metrics / --docker_only / --housekeeping_interval + 1g/1.5c - /home/wooo/monitoring/docker-compose.yml:110 cadvisor + node-exporter 納管 + 降維 flags + 配額 - /opt/sentry/docker-compose.override.yml:Sentry L2 配額(clickhouse 8g/4c, kafka 3g/2c 等) - /home/wooo/gitea/docker-compose.yml:Gitea 3g/3c - /home/wooo/act-runner/docker-compose.yml:Actions Runner 2g/2c 對映: - feedback_monitor_self_monitoring.md 🔴🔴🔴 監控工具必須被監控 - feedback_ai_autonomous_direction.md 動態閾值 ≠ 寫死規則 - ADR-090 Layer 2 資源配額強制 驗收(48h): - 188 cadvisor CPU 從 321% → <50%(配額強制) - 110 load5 從 18 → <10(Sentry/Gitea 釋壓後) - 自監控告警無誤報 Co-Authored-By: Claude Opus 4.7 (1M context) --- k8s/monitoring/docker-compose-110.yml | 10 ++ ops/monitoring/alerts-unified.yml | 167 ++++++++++++++++++++++++++ 2 files changed, 177 insertions(+) diff --git a/k8s/monitoring/docker-compose-110.yml b/k8s/monitoring/docker-compose-110.yml index 
3116b227..6f23c1ca 100644
--- a/k8s/monitoring/docker-compose-110.yml
+++ b/k8s/monitoring/docker-compose-110.yml
@@ -1,6 +1,14 @@
 version: '3.8'
 
 services:
+  # ---------------------------------------------------------------------------
+  # cAdvisor resource guard on host 110 (2026-04-19 early morning Taipei time / Claude Opus 4.7 / Phase 7 blind-spot remediation)
+  # Why: cadvisor on 110 currently sits at 0% CPU (metric-reduction flags already applied) but had no mem_limit/cpus safety net.
+  #      The quota is a permanent L2 guard: if volume ever explodes the container is OOM-killed instead of dragging down host 110.
+  # Ref: ADR-090 Layer 2 — enforced resource quotas
+  # Note: this compose file can drift from the live one on 110 (/home/wooo/monitoring/docker-compose.yml);
+  #       no CD workflow syncs them yet -> tracked as tech debt, to be brought under Git + CD in the next session
+  # ---------------------------------------------------------------------------
   cadvisor:
     image: gcr.io/cadvisor/cadvisor:latest
     container_name: cadvisor
@@ -27,6 +35,8 @@ services:
       - /dev/kmsg
     networks:
       - monitoring
+    mem_limit: 512m
+    cpus: 1.0
 
   prometheus:
     image: prom/prometheus:latest
diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml
index bf261f3b..eeca4100 100644
--- a/ops/monitoring/alerts-unified.yml
+++ b/ops/monitoring/alerts-unified.yml
@@ -929,3 +929,170 @@ groups:
           summary: "主機 {{ $labels.instance }} 無法連通"
           description: "TCP probe 到 {{ $labels.instance }} 失敗超過 5 分鐘,可能發生網路分區。"
           runbook: "SSH 檢查路由和防火牆規則"
+
+  # =========================================================================
+  # Monitoring-stack self-monitoring (infra_self_monitoring) — ADR-090 Phase 7
+  # 2026-04-19 Claude Opus 4.7 / rule: monitoring tools must themselves be monitored
+  # Design: no absolute CPU% or MB thresholds; use (share of quota) + (throttle signal) instead.
+  #   Quotas are declared in docker-compose; memory rules fire on working_set / limit > 0.8, so the threshold follows the quota.
+  #   working_set excludes reclaimable page cache; the "limit > 0" filter skips containers without a limit (avoids +Inf ratios).
+  # =========================================================================
+  - name: infra_self_monitoring
+    interval: 1m
+    rules:
+
+      # --- cadvisor self-monitoring ---
+      - alert: CadvisorDown
+        expr: up{job=~".*cadvisor.*"} == 0
+        for: 5m
+        labels:
+          severity: critical
+          layer: docker-110-188
+          component: cadvisor
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"
+        annotations:
+          summary: "cAdvisor ({{ $labels.instance }}) 停擺"
+          description: "主機 {{ $labels.instance }} 的 cadvisor 已停擺 5 分鐘,容器監控中斷。"
+          runbook: "SSH 主機 docker compose up -d cadvisor;檢查 OOMKill 訊號"
+
+      - alert: CadvisorMemoryPressure
+        expr: container_memory_working_set_bytes{name="cadvisor"} / (container_spec_memory_limit_bytes{name="cadvisor"} > 0) > 0.8
+        for: 10m
+        labels:
+          severity: warning
+          component: cadvisor
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "cAdvisor 記憶體使用率 > 80% limit"
+          description: "cadvisor 記憶體用量 / mem_limit = {{ $value | humanizePercentage }},接近 OOMKill。"
+          runbook: "若頻繁觸發 → 檢查 cardinality 是否持續成長,考慮調整 --disable_metrics"
+
+      - alert: CadvisorCPUThrottled
+        expr: rate(container_cpu_cfs_throttled_seconds_total{name="cadvisor"}[5m]) > 0.5
+        for: 15m
+        labels:
+          severity: warning
+          component: cadvisor
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "cAdvisor CPU 被 throttle(配額不足)"
+          description: "cadvisor 每秒被 throttle {{ $value }} 秒,表示實際需求超過 cpus 配額。"
+          runbook: "調高 docker-compose cpus 設定,或檢查 scrape interval / cardinality"
+
+      # --- node-exporter self-monitoring ---
+      - alert: NodeExporterDown
+        expr: up{job=~"node-exporter.*|node_exporter.*"} == 0
+        for: 5m
+        labels:
+          severity: critical
+          component: node-exporter
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"
+        annotations:
+          summary: "node-exporter ({{ $labels.instance }}) 停擺"
+          description: "主機 {{ $labels.instance }} node-exporter 已停擺 5 分鐘,主機 metrics 中斷。"
+          runbook: "SSH 主機檢查 docker ps node-exporter;重啟 docker compose up -d node-exporter"
+
+      - alert: NodeExporterCPUThrottled
+        expr: rate(container_cpu_cfs_throttled_seconds_total{name="node-exporter"}[5m]) > 0.5
+        for: 15m
+        labels:
+          severity: warning
+          component: node-exporter
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "node-exporter CPU 被 throttle(配額不足)"
+          description: "node-exporter 每秒被 throttle {{ $value }} 秒。可能 collector 未適度 disable。"
+          runbook: "檢查 node-exporter --collector.* flags 是否該關掉閒置硬體 probe"
+
+      # --- Sentry self-hosted self-monitoring (host 110) ---
+      - alert: SentryClickHouseMemoryPressure
+        expr: container_memory_working_set_bytes{name=~".*sentry.*clickhouse.*"} / (container_spec_memory_limit_bytes{name=~".*sentry.*clickhouse.*"} > 0) > 0.8
+        for: 10m
+        labels:
+          severity: warning
+          component: sentry-clickhouse
+          team: platform
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "Sentry ClickHouse 記憶體使用率 > 80% limit"
+          description: "sentry clickhouse 用量 / mem_limit = {{ $value | humanizePercentage }}。"
+          runbook: "檢查 Sentry 查詢壓力;調整 /opt/sentry/docker-compose.override.yml clickhouse mem_limit"
+
+      - alert: SentryClickHouseCPUThrottled
+        expr: rate(container_cpu_cfs_throttled_seconds_total{name=~".*sentry.*clickhouse.*"}[5m]) > 1.0
+        for: 15m
+        labels:
+          severity: warning
+          component: sentry-clickhouse
+          team: platform
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "Sentry ClickHouse CPU 持續被 throttle"
+          description: "每秒 throttle {{ $value }} 秒,配額 cpus=4.0 可能不足。"
+          runbook: "檢查 Sentry retention / query pattern;必要時調高 override.yml cpus"
+
+      # --- Gitea self-monitoring (NOTE(review): the CPU rule assumes the runner container is named "gitea-runner" — confirm) ---
+      - alert: GiteaMemoryPressure
+        expr: container_memory_working_set_bytes{name="gitea"} / (container_spec_memory_limit_bytes{name="gitea"} > 0) > 0.8
+        for: 10m
+        labels:
+          severity: warning
+          component: gitea
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "Gitea 記憶體使用率 > 80% limit"
+          description: "gitea 用量 / mem_limit = {{ $value | humanizePercentage }}。"
+          runbook: "檢查 CI/CD 任務堆積;必要時調高 docker-compose mem_limit"
+
+      - alert: GiteaCPUThrottled
+        expr: rate(container_cpu_cfs_throttled_seconds_total{name=~"gitea|gitea-runner"}[5m]) > 1.0
+        for: 15m
+        labels:
+          severity: warning
+          component: gitea
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "Gitea / Runner CPU 持續被 throttle"
+          description: "{{ $labels.name }} 每秒 throttle {{ $value }} 秒,CD peak 可能卡關。"
+          runbook: "檢查 job 並行度;考慮縮減並行或調高 cpus"
+
+      # --- Meta layer: Prometheus itself (NOTE(review): a lone Prometheus cannot evaluate this while it is down; needs a second instance or an external dead-man's switch) ---
+      - alert: PrometheusDown
+        expr: up{job="prometheus"} == 0
+        for: 2m
+        labels:
+          severity: critical
+          component: prometheus
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"
+        annotations:
+          summary: "Prometheus ({{ $labels.instance }}) 停擺"
+          description: "Prometheus 自己停擺 → 所有其他告警失效"
+          runbook: "SSH 110 docker compose -f /home/wooo/monitoring/docker-compose.yml up -d prometheus"