From e8e6748f7054626e12959f9692ba2afa762fe4fd Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 5 May 2026 13:45:09 +0800 Subject: [PATCH] fix(ops): add docker host resource baseline guardrails --- apps/api/alert_rules.yaml | 31 ++++ apps/api/src/services/proactive_inspector.py | 59 ++++++- docs/LOGBOOK.md | 110 ++++++++++++- .../HOST-RESOURCE-BASELINE-110-188.md | 72 +++++++++ k8s/awoooi-prod/kustomization.yaml | 2 +- k8s/monitoring/prometheus.yml | 1 + ops/monitoring/alerts-unified.yml | 87 +++++++++++ ops/monitoring/alerts.yml | 87 +++++++++++ scripts/ops/docker-stats-textfile-exporter.py | 145 ++++++++++++++++++ 9 files changed, 586 insertions(+), 8 deletions(-) create mode 100644 docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md create mode 100755 scripts/ops/docker-stats-textfile-exporter.py diff --git a/apps/api/alert_rules.yaml b/apps/api/alert_rules.yaml index 25ccd552..700d6821 100644 --- a/apps/api/alert_rules.yaml +++ b/apps/api/alert_rules.yaml @@ -163,6 +163,37 @@ rules: responsibility: INFRA reasoning: "[規則匹配] 主機層資源告警,自動 SSH 執行診斷指令(只讀,不修改),收集根因資訊後推送 Telegram 讓 SRE 決策。" + # 2026-05-05 ogt + Codex: 110/188 長時間過載事故後補 Docker Compose 過載與 restart spike 路由。 + # 原則:過載與重啟暴增只能先診斷,禁止通用 docker restart;由 LLM + Playbook trust 決定 service-specific 修復。 + - id: docker_baseline_overload_alert + priority: 44 + description: Docker Compose 服務過載 / restart spike 基線告警(cadvisor + textfile exporter) + match: + alertname: + - HostLoadAverageSustainedHigh + - DockerContainerCpuSustainedHigh + - DockerContainerCpuRunawayCritical + - DockerContainerMemoryLimitPressure + - DockerContainerRestartSpike + response: + action_title: "🔍 Docker/Host 過載自動診斷 — 禁止通用重啟" + description: "110/188 Docker Compose 或主機 load 長時間偏離 baseline。AI 需先收集容器 CPU、restart、logs、ClickHouse/Kafka/爬蟲狀態,再選擇限流、降併發或服務專屬 playbook。" + suggested_action: SSH_DIAGNOSE + kubectl_command: "ssh {host} 'echo \"=== LOAD ===\"; uptime; echo \"=== TOP ===\"; ps aux --sort=-%cpu | head -20; echo \"=== DOCKER ===\"; docker stats 
--no-stream | head -40'" + estimated_downtime: "N/A" + risk: low + responsibility: INFRA + responsibility_reasoning: "Docker Compose / bare-metal 過載屬主機與平台資源治理,不能交給 K8s restart 處理" + secondary_teams: [BE, SRE] + optimization: + - type: BASELINE_CHECK + description: "比較 load5/core、單容器 CPU core、restart spike 與 24h 動態基線" + command: "Prometheus query: node_load5/core + rate(container_cpu_usage_seconds_total[5m]) + increase(docker_container_restart_count[15m])" + - type: SERVICE_SPECIFIC_REPAIR + description: "依服務選擇專屬修復:ClickHouse 降 merge / scheduler 限 concurrency / litellm 修 health 或路由 / exporter 降 collector" + command: "由 AI 根據 evidence snapshot 選擇已驗證 playbook" + reasoning: "[規則匹配] 長期過載先 read-only 診斷與分流,禁止通用 docker restart;修復必須服務專屬且可回寫 Playbook trust。" + - id: high_cpu priority: 40 description: K8s Pod/Deployment CPU 使用率過高 diff --git a/apps/api/src/services/proactive_inspector.py b/apps/api/src/services/proactive_inspector.py index 681db065..ba49df7b 100644 --- a/apps/api/src/services/proactive_inspector.py +++ b/apps/api/src/services/proactive_inspector.py @@ -68,20 +68,66 @@ MONITORED_METRICS: list[dict[str, Any]] = [ # cadvisor up=0(prod-docker-188 離線),改用 node-exporter node-level CPU # 實測確認:avg(rate(node_cpu_seconds_total{mode!="idle"}[5m])) → 有資料 # threshold 0.85 = 85% CPU 使用率(node level,0-1 比例) - "promql": 'avg(rate(node_cpu_seconds_total{mode!="idle"}[5m]))', + "promql": 'avg(rate(node_cpu_seconds_total{host="188",mode!="idle"}[5m]))', "threshold": 0.85, # > 85% node CPU(所有 core 平均) "description": "Node 188 CPU 使用率(node-exporter,cadvisor 停止時替代)", }, + { + # 2026-05-05 ogt + Codex: 110/188 CPU 過載事故後補齊 110 動態基線。 + # Why: 事故主熱點在 110 Sentry ClickHouse/Snuba/Kafka,舊 inspector 只看未過濾的全域 CPU。 + "name": "cpu_usage_node_110", + "promql": 'avg(rate(node_cpu_seconds_total{host="110",mode!="idle"}[5m]))', + "threshold": 0.85, + "description": "Node 110 CPU 使用率(node-exporter)", + }, { # 2026-04-26 H2 hotfix: 改名 _node_188 "name": "memory_usage_node_188", # cadvisor 停止,改用 
node-exporter 節點記憶體使用率比例(0-1) # 實測確認:188 機器 62.76 GiB,當前 ~30% 使用率 # threshold 0.85 = 85% node memory usage - "promql": '(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes', + "promql": '(node_memory_MemTotal_bytes{host="188"} - node_memory_MemAvailable_bytes{host="188"}) / node_memory_MemTotal_bytes{host="188"}', "threshold": 0.85, # > 85% node memory(0-1 比例) "description": "Node 188 記憶體使用率(node-exporter,cadvisor 停止時替代)", }, + { + # 2026-05-05 ogt + Codex: 110/188 CPU 過載事故後補齊 110 記憶體基線。 + "name": "memory_usage_node_110", + "promql": '(node_memory_MemTotal_bytes{host="110"} - node_memory_MemAvailable_bytes{host="110"}) / node_memory_MemTotal_bytes{host="110"}', + "threshold": 0.85, + "description": "Node 110 記憶體使用率(node-exporter)", + }, + { + # 2026-05-05 ogt + Codex: use the load5/core ratio to catch sustained overload (CPU% is diluted by the runnable queue); max() strips the series labels so the division matches the label-less count() instead of yielding an empty vector. + "name": "load5_per_core_110", + "promql": 'max(node_load5{host="110"}) / count(count by(cpu) (node_cpu_seconds_total{host="110",mode="idle"}))', + "threshold": 1.50, + "description": "Node 110 load5 / CPU core 比例", + }, + { + "name": "load5_per_core_188", + "promql": 'max(node_load5{host="188"}) / count(count by(cpu) (node_cpu_seconds_total{host="188",mode="idle"}))', + "threshold": 1.50, + "description": "Node 188 load5 / CPU core 比例", + }, + { + # 2026-05-05 ogt + Codex: 補 Docker Compose 服務層基線,抓單容器長期吃超過 2 core。 + "name": "docker_max_container_cpu_cores", + "promql": 'max(docker_container_cpu_cores) or max(rate(container_cpu_usage_seconds_total{name!="",id!="/"}[5m])) or on() vector(0)', + "threshold": 2.0, + "description": "Docker 單容器 CPU core 使用量上限", + }, + { + # 2026-05-05 ogt + Codex: cAdvisor v0.47 無 container restart metric,改吃 node-exporter textfile。 + "name": "docker_restart_spike_15m", + "promql": ( + 'sum(increase(docker_container_restart_count[15m])) ' + 'or sum(increase(docker_container_inspect_restart_count[15m])) ' + 'or on() vector(0)' + ), + "threshold": 5.0, + "description": "Docker 容器 15 分鐘重啟增量", + }, { 
"name": "pod_restart_rate", # kube-state-metrics: namespace=awoooi-prod,sum 聚合避免 multi-vector @@ -222,8 +268,8 @@ class ProactiveInspector: async def _inspect_dynamic_baseline(self, report: InspectionReport) -> None: """從 Prometheus 抓取當前值,與 Holt-Winters 基線比對。""" - from src.services.dynamic_baseline_service import get_dynamic_baseline_service from src.core.feature_flags import aiops_flags + from src.services.dynamic_baseline_service import get_dynamic_baseline_service if not aiops_flags.AIOPS_P4_DYNAMIC_BASELINE: return @@ -273,8 +319,8 @@ class ProactiveInspector: async def _inspect_log_patterns(self, report: InspectionReport) -> None: """掃描 K8s Pod 日誌,偵測新 log pattern。""" - from src.services.log_anomaly_detector import get_log_anomaly_detector from src.core.feature_flags import aiops_flags + from src.services.log_anomaly_detector import get_log_anomaly_detector if not aiops_flags.AIOPS_P4_LOG_ANOMALY: return @@ -313,8 +359,8 @@ class ProactiveInspector: async def _inspect_trends(self, report: InspectionReport) -> None: """對各 metric 做 4h 趨勢外推。""" - from src.services.trend_predictor import get_trend_predictor from src.core.feature_flags import aiops_flags + from src.services.trend_predictor import get_trend_predictor if not aiops_flags.AIOPS_P4_TREND_PREDICTOR: return @@ -365,8 +411,8 @@ class ProactiveInspector: async def _retrain_baselines_background(self) -> None: """背景重訓所有 Holt-Winters 基線(不阻塞巡檢)。""" - from src.services.dynamic_baseline_service import get_dynamic_baseline_service from src.core.feature_flags import aiops_flags + from src.services.dynamic_baseline_service import get_dynamic_baseline_service if not aiops_flags.AIOPS_P4_DYNAMIC_BASELINE: return @@ -392,6 +438,7 @@ class ProactiveInspector: async def _fetch_current_value(self, promql: str) -> float | None: """從 Prometheus 抓取當前值(instant query)。""" import httpx + from src.core.config import settings try: diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 3174929d..70691cc6 100644 --- a/docs/LOGBOOK.md 
+++ b/docs/LOGBOOK.md @@ -6,6 +6,114 @@ --- +## 2026-05-05 | 110/188 CPU/Mem 配額全景盤點 + Docker baseline 監控落地 + +**背景**:統帥擔心 Claude Code 對 110/188 服務 CPU/memory limit 亂配置,造成服務卡死或慢性過載;本輪接續盤點 live Docker inspect / docker stats / compose 宣告。 + +**現場結論**: +- 110 仍高負載,不是單純等待回補即可:load 約 `23.84 / 27.11 / 34.67`;Sentry ClickHouse 4 CPU / 8GiB 貼著 CPU 上限跑,Kafka 3GiB 使用率約 84%,taskbroker 1 CPU 接近滿載,taskscheduler 512MiB 約 75%。 +- 110 Kafka lag 近乎清空,ClickHouse 仍在重 merge,node-exporter 自己曾因 `arp` / `netclass` / `netdev` collector 單次 scrape 花 17s+ 而自傷。 +- 188 已回穩但仍需節流治理:momo-scheduler 2 CPU / 2GiB 是安全欄不是根治;SignOz ClickHouse 4 CPU / 24GiB 目前合理。 +- 188 momo-scheduler 日誌顯示三張 schema 缺表(`ai_calls` / `learning_episodes` / `host_health_probes`)與 Elephant Alpha/OpenClaw action drift,這是背景任務反覆失敗,不是 CPU/memory limit 問題。 +- 110 node-exporter textfile path live drift:原指向 `/home/ollama/node_exporter_textfiles`,110 上不存在,造成 Docker Compose 指標半盲。 + +**本次落地**: +- 新增 `scripts/ops/docker-stats-textfile-exporter.py`,輸出 Docker container CPU cores / CPU limit / memory usage / memory limit / restart count / info。 +- 110:部署 exporter 到 `/home/wooo/scripts/`,新增 cron,每分鐘寫 `/home/wooo/node_exporter_textfiles/docker_stats.prom`;修正 `/home/wooo/monitoring/docker-compose.yml` 的 node-exporter textfile path,並只重建 node-exporter。 +- 110:關閉 node-exporter 高成本 collector:`arp`、`netclass`、`netdev`;scrape duration 從約 17s+ 降到 CPU/mem/load/textfile 等核心 collector 都 < 0.1s,node-exporter CPU 從約 80% 降到 0-5%。 +- 110:Kafka lag 已近零後,將 `/opt/sentry/.env` `SENTRY_TASKWORKER_CONCURRENCY` 從 4 降到 2,只重建 taskworker(snuba-api 因 compose dependency 被重建一次),taskworker command 已確認 `--concurrency=2`。 +- 188:部署 exporter 到 `/home/ollama/scripts/`,新增 cron,每分鐘寫 `/home/ollama/node_exporter_textfiles/docker_stats.prom`;保留既有 `docker_restart_count.prom`。 +- 188:套用既有 additive migrations `024_create_ai_calls_table.sql`、`028_create_learning_episodes.sql`、`029_create_host_health_probes.sql`,補齊 scheduler 正在寫入的 schema,未重啟服務。 +- `ops/monitoring/alerts*.yml`:新增 
`HostLoadAverageSustainedHigh`、`DockerContainerCpuSustainedHigh`、`DockerContainerCpuRunawayCritical`、`DockerContainerMemoryLimitPressure`、`DockerContainerRestartSpike`。 +- `apps/api/alert_rules.yaml`:新增 Docker/Host 過載路由,強制走 `SSH_DIAGNOSE`,禁止通用 docker restart。 +- API GitOps:用最新 `main` (`a57e3d3d`) 加本次兩個 API 修補檔,在 188 建置並推送 `192.168.0.110:5000/awoooi/api:resource-baseline-20260505-a57e3d3`;`k8s/awoooi-prod/kustomization.yaml` 指向此 tag,避免手動 `kubectl set image` 被 Argo 回滾。 +- `docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md`:記錄 live 配額盤點、baseline policy、反模式與下一步 rollout 順序。 +- Prometheus 已 reload,97 條規則載入;新 baseline rules 全部存在。 + +**驗證**: +- `node_textfile_scrape_error`:110/188/112 全為 0。 +- Prometheus 已可查到 `docker_container_cpu_cores{host="110",container_name="sentry-self-hosted-clickhouse-1"}`、`docker_container_memory_limit_bytes{host="110",container_name="sentry-self-hosted-kafka-1"}`、`docker_container_cpu_cores{host="188",container_name="momo-scheduler"}`。 +- 110:taskworker / snuba-api / ClickHouse / Kafka healthy;Sentry Kafka `snuba-consumers` 主要 lag 0-1;load 從約 30+ 降到 `11.83 / 20.97 / 27.41`(1m 已降,15m 仍需等 merge 平滑)。 +- 188:三張 DB 表存在;migration 後只剩 `Fallback (111)` 健康警告,`UndefinedTable` 未再出現;momo-db CPU 回到約 0.6-2.5%,host load 約 `2.47 / 2.80 / 4.28`。 +- Prometheus 新 baseline alerts 查詢目前無 firing。 +- 新規則目前 pending:110 `HostLoadAverageSustainedHigh`、110 `DockerContainerCpuSustainedHigh` for Sentry ClickHouse。 +- `apps/api/.venv/bin/python -m pytest apps/api/tests/test_classify_alert_early.py apps/api/tests/test_alert_rule_engine_validation.py -q` → 89 passed。 +- `ruff check apps/api/src/services/proactive_inspector.py`、`py_compile scripts/ops/docker-stats-textfile-exporter.py`、`git diff --check` → passed。 +- `kubectl kustomize k8s/awoooi-prod` → API/worker image 均解析為 `resource-baseline-20260505-a57e3d3`。 + +**下一步**: +- 不要再降低 ClickHouse / Kafka memory limit;先觀察 backlog drain。 +- 若 110 ClickHouse 15-30 分鐘後仍持續 >2.5 cores,下一步查 merge/query 類型;不要靠降低 memory 或泛用 restart。 +- 188 
下一步修 Elephant Alpha/OpenClaw allowed-action drift,避免 AI 自動修復決策計入 circuit breaker;momo-scheduler 2 CPU / 2GiB 暫時保留。 + +## 2026-05-05 | ADR-110 / AwoooP GCP Ollama compute pool 收斂 + +**背景**:統帥批准將 GCP-A / GCP-B Ollama 納入 AwoooP 推進計畫,不只作 failover,而是作為 platform-level Ollama compute pool。 + +**2026-05-05 live 驗證結論**: +- 生產 Deployment 實際 env:`OLLAMA_URL=110:11435`、`OLLAMA_SECONDARY_URL=110:11436`、`OLLAMA_FALLBACK_URL=192.168.0.111:11434`;ConfigMap 已是 `110:11437`,但 Deployment explicit env 尚未一致。 +- Pod 內 `110:11435` / `110:11436` 均可 `/api/tags` 成功,兩台 GCP Ollama 有實際可用。 +- `192.168.0.111:11434` 從 Pod 內 `No route to host`;`110:11437/nginx-health` 從外部可回 OK,但 `/api/tags` 回 502,表示 110 proxy block 存在但 upstream `.111` 不健康或不可達。 +- live NetworkPolicy 只允許 Pod → 110 的 `11435/11436`,未允許 `11437`;repo manifest 已補 11437,但尚未 live apply。 +- 最近告警跑到 Gemini 的主因不是 fallback order 沒設定,而是 `OllamaGcpBProvider` 只 override `_endpoint_url()`,但繼承的 `analyze()` 仍硬打 `settings.OLLAMA_URL`;log 顯示 router 選 `ollama_gcp_b`,實際錯打 `110:11435` 504,Local 又不可用,最後才落 Gemini。 + +**本次修補**: +- `ADR-110`:從 direct GCP IP 拓撲改寫為正式 runtime 拓撲:K8s → `192.168.0.110:11435/11436/11437` → GCP-A/GCP-B/Local;direct GCP IP 僅是 upstream / 非 K8s fallback。 +- `DEPLOY-GCP-OLLAMA-PROXY.md`:補 11437 Local fallback 驗證、NetworkPolicy port、`kubectl set env` 警告與三層 proxy route。 +- `k8s/awoooi-prod/06-deployment-api.yaml`:修正宣告檔 drift,`OLLAMA_FALLBACK_URL` 與 ConfigMap 對齊為 `http://192.168.0.110:11437`。未執行 live apply。 +- 新增 `INV-10-ollama-call-sites.md`:盤點 failover-aware 路徑與仍直讀 `OLLAMA_URL` 的 production call sites,並定義 GCP-A interactive / GCP-B batch+RAG+shadow / Local privacy+DR 分工。 +- 新增 `apps/api/tests/test_ollama_call_site_inventory.py`:把現有 direct `OLLAMA_URL` legacy debt 鎖成上限;新增 direct call site 必須改走 resolver/provider registry/EffectivePolicy,且 ConfigMap / Deployment 的三層 Ollama env 必須一致。 +- 新增 `services/ollama_endpoint_resolver.py`:最小 workload-aware resolver;`embedding` / `rag` / `code_review` / `batch` / `shadow` / `canary` 優先 GCP-B,interactive 留 
GCP-A,local-required 留 Local。 +- 第一批低風險 runtime slice:`embedding_service.py`、`knowledge_rag_service.py`、`playbook_rag.py`、`local_code_review_service.py` 改走 resolver,讓批次/RAG/審查路徑優先用 GCP-B;未碰 `decision_manager`、OpenClaw、Hermes、chat manager 主線。 +- `ai_providers/ollama.py`:修正 base `OllamaProvider.analyze()` / `health_check()` 使用 `_endpoint_url()`,讓 `OllamaGcpBProvider` 選中時真正打 `OLLAMA_SECONDARY_URL`,不是錯打 primary。 +- `k8s/awoooi-prod/02-network-policy.yaml`:repo source 補 Pod → 110:11437 egress;未執行 live apply。 +- `MASTER-WORKPLAN.md`、`DETAILED-IMPLEMENTATION-PLAN.md`、`INV-4`、`INV-6`、`AWOOOP-MONITORING-ALERTING-CONVERGENCE.md`:整合 INV-10 與 GCP-B active-active 策略。 + +**驗證**: +- `apps/api/.venv/bin/python -m pytest apps/api/tests/test_ollama_call_site_inventory.py -q` → 2 passed。 +- `apps/api/.venv/bin/python -m ruff check apps/api/tests/test_ollama_call_site_inventory.py --fix` → fixed import order,rerun clean。 +- `apps/api/.venv/bin/python -m pytest apps/api/tests/test_ollama_endpoint_resolver.py apps/api/tests/test_ollama_call_site_inventory.py -q` → 6 passed。 +- `apps/api/.venv/bin/python -m ruff check apps/api/src/services/ollama_endpoint_resolver.py apps/api/src/services/embedding_service.py apps/api/src/services/knowledge_rag_service.py apps/api/src/services/local_code_review_service.py apps/api/src/services/playbook_rag.py apps/api/tests/test_ollama_endpoint_resolver.py apps/api/tests/test_ollama_call_site_inventory.py` → passed after ruff import-order fix。 +- `apps/api/.venv/bin/python -m pytest apps/api/tests/test_ollama_provider_endpoints.py apps/api/tests/test_ollama_failover_manager.py::TestThreeLayerFailover::test_gcp_a_offline_gcp_b_healthy_uses_gcp_b apps/api/tests/test_ollama_endpoint_resolver.py apps/api/tests/test_ollama_call_site_inventory.py -q` → 9 passed。 +- `apps/api/.venv/bin/python -m ruff check apps/api/src/services/ai_providers/ollama.py apps/api/tests/test_ollama_provider_endpoints.py` → passed after import-order fix。 + +**下一步**: +- 不直接重寫 Tier 3 
runtime;下一批先收斂 `apps/api/src/api/v1/rag.py` 與 `apps/api/scripts/reembed_bge_m3.py` 這兩個仍偏 batch 的 direct path。 +- 再補 provider health snapshot,讓 health/report 類路徑可同時呈現 GCP-A/GCP-B/Local,而不是只看 primary。 +- OpenClaw/Hermes/chat manager 只做 EffectivePolicy shadow compare,不直接切換。 + +--- + +## 2026-05-05 | AwoooP Claude Code 盤點修補 + convergence map 整合 + +**盤點結論**: +- Claude Code 的 AwoooP 檔案多數確實已落地(ADR-106~124、INV-1~9、migrations、contract packages、runtime/API shell、Operator Console routes)。 +- 但有幾個「宣告完成 ≠ 線上路徑生效」缺口:MCP redaction middleware 有寫但 Gateway 回傳 Runtime/LLM 前未強制套用;Operator Console 前端讀 `items/status/name/is_suspended`,後端實際回 `tenants/contracts/runs/state/display_name/is_active`;ADR-106 本體缺 Quantified Gates 補章。 +- 沒有執行 production DB migration;`awooop_phase*.sql` 仍需依部署順序、rollback 檢查、DB expert review 後再套用。 + +**本次修補**: +- `plugins/mcp/gateway.py`:Gateway 成功執行後先 `redact_mcp_output()` 再回傳給 Runtime/LLM;gateway audit hash 改用 redacted input/output 計算。 +- `services/mcp_audit_service.py`:legacy `mcp_audit_log` 寫入前補上 string pattern redaction,避免 DSN/token/internal IP 只因 key 名未命中而外洩。 +- `tests/test_mcp_credential_isolation.py`:新增 gateway return redaction + legacy audit redaction regression tests。 +- `ADR-106`:新增 `D9.1 Quantify Strangler Fig Promotion Gates`,正式化 shadow→canary→read_only→suggest→auto_remediate 的量化 gate。 +- `MASTER-WORKPLAN.md` + `AWOOOP-MONITORING-ALERTING-CONVERGENCE.md`:納入 monitoring/alerting convergence map,固定 mirror → read-only EffectivePolicy comparison → read-only MCP Gateway wrapper → Channel Event wrapper → low-risk LLM strangler 順序。 +- `apps/web/src/app/[locale]/awooop/*`:修正 Operator Console 前端與後端 response contract 對齊;approval decide 補 `project_id`;run list 改用 `state` filter 與 lowercase FSM state。 + +**驗證**: +- `apps/api/.venv/bin/python -m pytest apps/api/tests/test_mcp_credential_isolation.py -q` → 12 passed。 +- `apps/api/.venv/bin/python -m ruff check apps/api/src/plugins/mcp/gateway.py apps/api/src/services/mcp_audit_service.py 
apps/api/tests/test_mcp_credential_isolation.py` → passed。 +- `pnpm --dir apps/web exec tsc --noEmit` → passed。 +- `pnpm --dir apps/web run build` → passed;AwoooP routes `/[locale]/awooop/*` 全部成功建置。 +- `git diff --check` → passed。 + +**仍未完成 / 不可誤判完成**: +- production DB migration 尚未 apply。 +- `approval_records` 仍未 project-scoped;部分 legacy repository/service 仍依賴 RLS default 或無 explicit project filter。 +- direct MCP/provider call sites 尚未全面 `forbid-new`;只能視為 wrapper 過渡期。 +- `apps/web/package.json` / `pnpm-lock.yaml` 的 Next 14.2.25 bump 及 `tsconfig.tsbuildinfo` dirty state 是既有 session 變更,本次未回退。 + +--- + ## 2026-05-05 | ADR-110 三層容災補齊 + 四台主機密碼 SSH 恢復 **ADR-110 Local Fallback(port 11437)**: @@ -30,7 +138,7 @@ ``` OLLAMA_URL = http://192.168.0.110:11435 ← GCP-A primary(via nginx proxy) OLLAMA_SECONDARY_URL = http://192.168.0.110:11436 ← GCP-B secondary(via nginx proxy) -OLLAMA_FALLBACK_URL = http://192.168.0.111:11434 ← 111 兜底 +OLLAMA_FALLBACK_URL = http://192.168.0.110:11437 ← Local 111 fallback(via nginx proxy) ``` - 驗證:兩台 GCP 各 10 個模型,200 OK - 熱更新:`kubectl set env`(不動 image tag,避免 IMAGE_TAG_PLACEHOLDER 蓋掉) diff --git a/docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md b/docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md new file mode 100644 index 00000000..31a23eb5 --- /dev/null +++ b/docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md @@ -0,0 +1,72 @@ +# 110/188 Host Resource Baseline + +> 2026-05-05 ogt + Codex +> Scope: 110 DevOps/Sentry host and 188 AI/Web host Docker Compose resource limits. + +## Current Live Finding + +### 110 + +| Service | Live Limit | Live Usage Snapshot | Verdict | +|---|---:|---:|---| +| Sentry ClickHouse | 4 CPU / 8 GiB | ~235-291% CPU / 3.3-3.4 GiB | CPU capped but still hottest. Do not lower memory; keep merge settings explicit. | +| Sentry Kafka | 2 CPU / 3 GiB | ~40-55% CPU / 2.5 GiB (84%) | Memory is close to pressure. Do not reduce memory. 
| +| Sentry taskworker | 2 CPU / 2 GiB, concurrency 2 | ~120-181% CPU after restart | Concurrency reduced from 4 to 2 after Kafka lag cleared. Watch Sentry task latency before further changes. | +| Sentry taskbroker | 1 CPU / 512 MiB | ~70-98% CPU / 160 MiB | CPU is tight; increasing may improve backlog but can raise host load. | +| Sentry taskscheduler | 0.5 CPU / 512 MiB | ~13% CPU / 387 MiB (76%) | Memory is tight; alert at 85% before it stalls. | +| Gitea | 3 CPU / 3 GiB | ~4% CPU / 2.18 GiB (73%) | Good cap; memory headroom is not huge. | +| node-exporter | 1 CPU / 256 MiB | ~0-5% CPU / 8 MiB | Good after disabling expensive `arp`, `netclass`, and `netdev` collectors. | +| cadvisor | 1 CPU / 512 MiB | ~0% CPU / 27 MiB | Good safety cap. | +| Harbor / Langfuse / Prometheus / Grafana / Nginx | mostly unlimited | currently low | Needs staged limits, but not during Sentry backlog drain. | + +### 188 + +| Service | Live Limit | Live Usage Snapshot | Verdict | +|---|---:|---:|---| +| momo-scheduler | 2 CPU / 2 GiB | ~0.3% CPU / 163 MiB after crawler burst | CPU cap is working. Next fix is crawler concurrency and failed background jobs, not lower CPU. | +| SignOz ClickHouse | 4 CPU / 24 GiB | ~93-133% CPU / 1.1 GiB | Healthy enough; keep current cap. | +| SignOz Zookeeper | 1 CPU / 2 GiB | ~8-18% CPU / 1.09 GiB | OK. | +| cadvisor | 1.5 CPU / 1 GiB | ~0% CPU / 28 MiB | Good. | +| litellm | unlimited | ~0.6-0.9% CPU / 780 MiB | Add modest cap after observing traffic; do not re-add DATABASE_URL. | +| momo-pro-system / momo-db | unlimited | DB had short CPU bursts, then ~0.6% with no active long query | Needs service-specific limits after scheduler/schema pressure is controlled. | +| Monitoring tools / websites / exporters | mostly unlimited | low | Add caps gradually with textfile alerts watching pressure. 
| + +## Baseline Policy + +Use these thresholds for alerting and AI triage: + +| Signal | Threshold | Action | +|---|---:|---| +| host load5 / core | > 1.5 for 15m | Critical SSH diagnosis; classify top service before repair. | +| Docker container CPU | > 2 cores for 10m | Warning diagnosis; check limit, backlog, and workload type. | +| Docker container CPU | > 4 cores for 15m | Critical diagnosis; never generic restart. | +| Docker memory / limit | > 85% for 10m | Warning; raise memory or reduce workload, never lower the limit. | +| Docker restarts | > 5 in 15m | Critical; pull logs and fix crash signature. | + +## Rules + +1. Do not reduce ClickHouse CPU/memory while merges or Kafka backlog are draining. +2. Do not set ClickHouse `background_pool_size` below the three MergeTree thresholds unless all thresholds are explicitly lower than `pool * ratio`. +3. Do not use lower memory limits as a load-shedding tool. That creates OOM/restart loops. +4. For Chrome crawlers, cap concurrency first; CPU caps are only the safety rail. +5. For Kafka/Snuba, treat high CPU as backlog digestion unless lag stops decreasing. +6. For monitoring tools, caps are required, but every cap must be paired with self-monitoring. +7. Every Docker Compose host must emit `docker_container_cpu_cores`, `docker_container_memory_*`, and a restart counter via node-exporter textfile. +8. Disable node-exporter collectors that are slow or failing on each host; exporter scrape time is part of the resource baseline. + +## Next Safe Rollout Order + +1. Deploy `scripts/ops/docker-stats-textfile-exporter.py` to 110 and 188 textfile collector cron. +2. Reload Prometheus rules with the new Docker CPU/memory/restart baseline alerts. +3. Observe 110 for one drain window after node-exporter collector trim and taskworker concurrency 2. Kafka lag is now near zero; if ClickHouse remains high, tune merge/query behavior, not Kafka consumers. +4. 
Tune `momo-scheduler` crawler concurrency on 188; keep 2 CPU / 2 GiB until success rate and latency prove it is too low. +5. Fix 188 Elephant Alpha/OpenClaw allowed-action drift before enabling resource auto-repair beyond diagnosis. +6. Add modest caps to currently unlimited low-risk services in small batches. + +## Known Anti-Patterns + +- `docker restart` as a response to sustained CPU. +- Lowering ClickHouse merge pool without validating ClickHouse 25.x thresholds. +- Capping Kafka below current working set during backlog recovery. +- Treating "no alert" as healthy when cAdvisor or textfile exporters are missing. +- Letting monitoring collectors spend seconds per scrape; this turns observability into load. diff --git a/k8s/awoooi-prod/kustomization.yaml b/k8s/awoooi-prod/kustomization.yaml index 82ec7cb7..b3300fc3 100644 --- a/k8s/awoooi-prod/kustomization.yaml +++ b/k8s/awoooi-prod/kustomization.yaml @@ -39,7 +39,7 @@ resources: images: - name: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER newName: 192.168.0.110:5000/awoooi/api - newTag: 2e17325c3f66c16783172e50c2d29f686d2b574e + newTag: resource-baseline-20260505-a57e3d3 - name: 192.168.0.110:5000/library/web:IMAGE_TAG_PLACEHOLDER newName: 192.168.0.110:5000/awoooi/web newTag: 00684403887745e35848bbbab5ac795cfdd6fd58 diff --git a/k8s/monitoring/prometheus.yml b/k8s/monitoring/prometheus.yml index a51efcbf..24008e6f 100644 --- a/k8s/monitoring/prometheus.yml +++ b/k8s/monitoring/prometheus.yml @@ -55,6 +55,7 @@ scrape_configs: - targets: - https://aiops.wooo.work - https://mo.wooo.work + - http://192.168.0.188:4000/health/liveliness - http://192.168.0.110:3001 - http://192.168.0.120:31234 - http://192.168.0.120:31235 diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml index dc6f9acc..cc4decc8 100644 --- a/ops/monitoring/alerts-unified.yml +++ b/ops/monitoring/alerts-unified.yml @@ -48,6 +48,25 @@ groups: summary: "主機 {{ $labels.host }} CPU 高負載" description: "CPU 使用率超過 80%" 
+ - alert: HostLoadAverageSustainedHigh + # 2026-05-05 ogt + Codex: 110/188 長時間過載基線。 + # Why: CPU% 只看 busy time,無法充分表達 ClickHouse merge、Kafka 回補、Chrome/Ollama 造成的 runnable queue。 + expr: node_load5{host=~"110|188"} / on(host) count by(host) (count by(host,cpu) (node_cpu_seconds_total{mode="idle",host=~"110|188"})) > 1.5 + for: 15m + labels: + severity: critical + layer: systemd-188 + team: ops + auto_repair: "true" + mcp_provider: "ssh_host" + host_type: "bare_metal" + alert_category: "host_resource" + annotations: + summary: "主機 {{ $labels.host }} load5/core 長時間過高" + description: "load5 / CPU core > 1.5 持續 15 分鐘;這通常代表 runnable queue 已長期塞車,不是短暫尖峰。" + auto_repair_action: "ssh {{ $labels.instance }} 'echo \"=== LOAD ===\"; uptime; echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -25; echo \"=== DOCKER STATS ===\"; docker stats --no-stream | head -40'" + runbook: "先判斷高 load 來源:ClickHouse merge / Kafka backlog / Chrome scraper / Ollama runner / exporter;只允許 read-only 診斷,自動修復需走服務專屬 playbook。" + - alert: HostOutOfMemory expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85 for: 5m @@ -576,6 +595,74 @@ groups: summary: "容器 {{ $labels.container }} 已停止" description: "主機 {{ $labels.host }} 容器 {{ $labels.container }} 已 exited/dead,持續 1 分鐘" + - alert: DockerContainerCpuSustainedHigh + # 2026-05-05 ogt + Codex: Docker Compose 長期過載基線。 + # Baseline: 單容器 >2 core 10m 為 warning;用於提早抓 cadvisor、ClickHouse、momo-scheduler、Ollama runner 類問題。 + expr: docker_container_cpu_cores > 2 + for: 10m + labels: + severity: warning + layer: docker + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "true" + annotations: + summary: "容器 {{ $labels.container_name }} CPU 持續超過 2 core" + description: "{{ $labels.container_name }} 5m CPU 使用 >2 core 且持續 10 分鐘,需檢查是否缺 cpus limit、merge backlog、爬蟲尖峰或模型推理。" + auto_repair_action: "ssh {{ $labels.instance }} 'docker stats --no-stream {{ $labels.container_name }}; docker inspect {{ 
$labels.container_name }} | head -80'" + runbook: "自動階段只做診斷;若是已知服務,交給 service-specific playbook:ClickHouse 降 merge/查 backlog、scheduler 限 concurrency、litellm/blackbox 查 liveliness。" + + - alert: DockerContainerCpuRunawayCritical + expr: docker_container_cpu_cores > 4 + for: 15m + labels: + severity: critical + layer: docker + team: ops + alert_category: infrastructure + notification_type: TYPE-3 + auto_repair: "true" + annotations: + summary: "容器 {{ $labels.container_name }} CPU 持續超過 4 core" + description: "{{ $labels.container_name }} 已持續吃超過 4 core,會拖垮 110/188 主機;需要 AI 判斷是否限流、暫停 worker 或套用既有 playbook。" + auto_repair_action: "ssh {{ $labels.instance }} 'echo \"=== CONTAINER ===\"; docker stats --no-stream {{ $labels.container_name }}; echo \"=== TOP ===\"; docker top {{ $labels.container_name }} -eo pid,ppid,stat,pcpu,pmem,comm,args | head -30'" + runbook: "禁止通用 docker restart;先抓根因,只有 health down 或 crash loop 才可走重啟。" + + - alert: DockerContainerMemoryLimitPressure + # 2026-05-05 ogt + Codex: 防止亂設 memory limit 把 Kafka/Gitea/Taskworker 類服務卡死。 + expr: docker_container_memory_limit_bytes > 0 and docker_container_memory_usage_bytes / docker_container_memory_limit_bytes > 0.85 + for: 10m + labels: + severity: warning + layer: docker + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "true" + annotations: + summary: "容器 {{ $labels.container_name }} 記憶體超過 limit 85%" + description: "{{ $labels.container_name }} memory / limit > 85% 持續 10 分鐘。若是 Kafka/ClickHouse/Gitea/Taskworker,需先判斷 workload,不可直接降 limit。" + auto_repair_action: "ssh {{ $labels.instance }} 'docker stats --no-stream {{ $labels.container_name }}; docker inspect {{ $labels.container_name }} | head -80'" + runbook: "若服務已接近 limit:優先調整 retention/concurrency/cache,再評估提高 memory;禁止用更低 memory limit 當止血。" + + - alert: DockerContainerRestartSpike + # 2026-05-05 ogt + Codex: cAdvisor v0.47 無 restart metric,吃 node-exporter textfile docker_container_restart_count。 + expr: 
increase(docker_container_restart_count[15m]) > 5 or increase(docker_container_inspect_restart_count[15m]) > 5 + for: 3m + labels: + severity: critical + layer: docker + team: ops + alert_category: infrastructure + notification_type: TYPE-3 + auto_repair: "true" + annotations: + summary: "容器 {{ $labels.container_name }} 15 分鐘重啟超過 5 次" + description: "Docker restart textfile exporter 顯示 {{ $labels.container_name }} 重啟暴增,避免再次出現 litellm 24,464 次靜默崩潰。" + auto_repair_action: "ssh {{ $labels.instance }} 'docker ps -a --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}'" + runbook: "先抓 crash signature;若是 config/DB/網路問題,修設定,不用無限 restart。" + # ========================================================================= # MinIO / Kali 告警 # ========================================================================= diff --git a/ops/monitoring/alerts.yml b/ops/monitoring/alerts.yml index 20e9dea6..0325fac3 100644 --- a/ops/monitoring/alerts.yml +++ b/ops/monitoring/alerts.yml @@ -51,6 +51,25 @@ groups: auto_repair_action: "ssh {{ $labels.instance }} 'ps aux --sort=-%cpu | head -20' (host CPU 診斷;禁 kubectl restart awoooi-* — 主因常為第三方服務 Sentry/ClickHouse/Snuba)" runbook: "host CPU 高負載排查:先 SSH ps aux 看 top 進程;若為第三方服務(Sentry/ClickHouse 等)寫 ADR 升級資源或調 limit,禁止 kubectl restart 跨 domain" + - alert: HostLoadAverageSustainedHigh + # 2026-05-05 ogt + Codex: 110/188 長時間過載基線。 + # Why: CPU% 只看 busy time,無法充分表達 ClickHouse merge、Kafka 回補、Chrome/Ollama 造成的 runnable queue。 + expr: node_load5{host=~"110|188"} / on(host) count by(host) (count by(host,cpu) (node_cpu_seconds_total{mode="idle",host=~"110|188"})) > 1.5 + for: 15m + labels: + severity: critical + layer: systemd-188 + team: ops + auto_repair: "true" + mcp_provider: "ssh_host" + host_type: "bare_metal" + alert_category: "host_resource" + annotations: + summary: "主機 {{ $labels.host }} load5/core 長時間過高" + description: "load5 / CPU core > 1.5 持續 15 分鐘;這通常代表 runnable queue 已長期塞車,不是短暫尖峰。" + 
auto_repair_action: "ssh {{ $labels.instance }} 'echo \"=== LOAD ===\"; uptime; echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -25; echo \"=== DOCKER STATS ===\"; docker stats --no-stream | head -40'" + runbook: "先判斷高 load 來源:ClickHouse merge / Kafka backlog / Chrome scraper / Ollama runner / exporter;只允許 read-only 診斷,自動修復需走服務專屬 playbook。" + - alert: HostOutOfMemory expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85 for: 5m @@ -582,6 +601,74 @@ groups: summary: "容器 {{ $labels.container }} 已停止" description: "主機 {{ $labels.host }} 容器 {{ $labels.container }} 已 exited/dead,持續 1 分鐘" + - alert: DockerContainerCpuSustainedHigh + # 2026-05-05 ogt + Codex: Docker Compose 長期過載基線。 + # Baseline: 單容器 >2 core 10m 為 warning;用於提早抓 cadvisor、ClickHouse、momo-scheduler、Ollama runner 類問題。 + expr: docker_container_cpu_cores > 2 + for: 10m + labels: + severity: warning + layer: docker + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "true" + annotations: + summary: "容器 {{ $labels.container_name }} CPU 持續超過 2 core" + description: "{{ $labels.container_name }} 5m CPU 使用 >2 core 且持續 10 分鐘,需檢查是否缺 cpus limit、merge backlog、爬蟲尖峰或模型推理。" + auto_repair_action: "ssh {{ $labels.instance }} 'docker stats --no-stream {{ $labels.container_name }}; docker inspect {{ $labels.container_name }} | head -80'" + runbook: "自動階段只做診斷;若是已知服務,交給 service-specific playbook:ClickHouse 降 merge/查 backlog、scheduler 限 concurrency、litellm/blackbox 查 liveliness。" + + - alert: DockerContainerCpuRunawayCritical + expr: docker_container_cpu_cores > 4 + for: 15m + labels: + severity: critical + layer: docker + team: ops + alert_category: infrastructure + notification_type: TYPE-3 + auto_repair: "true" + annotations: + summary: "容器 {{ $labels.container_name }} CPU 持續超過 4 core" + description: "{{ $labels.container_name }} 已持續吃超過 4 core,會拖垮 110/188 主機;需要 AI 判斷是否限流、暫停 worker 或套用既有 playbook。" + auto_repair_action: "ssh {{ $labels.instance }} 'echo \"=== 
CONTAINER ===\"; docker stats --no-stream {{ $labels.container_name }}; echo \"=== TOP ===\"; docker top {{ $labels.container_name }} -eo pid,ppid,stat,pcpu,pmem,comm,args | head -30'" + runbook: "禁止通用 docker restart;先抓根因,只有 health down 或 crash loop 才可走重啟。" + + - alert: DockerContainerMemoryLimitPressure + # 2026-05-05 ogt + Codex: 防止亂設 memory limit 把 Kafka/Gitea/Taskworker 類服務卡死。 + expr: docker_container_memory_limit_bytes > 0 and docker_container_memory_usage_bytes / docker_container_memory_limit_bytes > 0.85 + for: 10m + labels: + severity: warning + layer: docker + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "true" + annotations: + summary: "容器 {{ $labels.container_name }} 記憶體超過 limit 85%" + description: "{{ $labels.container_name }} memory / limit > 85% 持續 10 分鐘。若是 Kafka/ClickHouse/Gitea/Taskworker,需先判斷 workload,不可直接降 limit。" + auto_repair_action: "ssh {{ $labels.instance }} 'docker stats --no-stream {{ $labels.container_name }}; docker inspect {{ $labels.container_name }} | head -80'" + runbook: "若服務已接近 limit:優先調整 retention/concurrency/cache,再評估提高 memory;禁止用更低 memory limit 當止血。" + + - alert: DockerContainerRestartSpike + # 2026-05-05 ogt + Codex: cAdvisor v0.47 無 restart metric,吃 node-exporter textfile docker_container_restart_count。 + expr: increase(docker_container_restart_count[15m]) > 5 or increase(docker_container_inspect_restart_count[15m]) > 5 + for: 3m + labels: + severity: critical + layer: docker + team: ops + alert_category: infrastructure + notification_type: TYPE-3 + auto_repair: "true" + annotations: + summary: "容器 {{ $labels.container_name }} 15 分鐘重啟超過 5 次" + description: "Docker restart textfile exporter 顯示 {{ $labels.container_name }} 重啟暴增,避免再次出現 litellm 24,464 次靜默崩潰。" + auto_repair_action: "ssh {{ $labels.instance }} 'docker ps -a --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}'" + runbook: "先抓 crash signature;若是 config/DB/網路問題,修設定,不用無限 restart。" + # 
#!/usr/bin/env python3
"""
Docker stats textfile exporter for host-level AIOps baselines.

2026-05-05 ogt + Codex: 110/188 CPU overload follow-up.
Why: cAdvisor v0.47 may not expose per-container restart count and the live
110 scrape currently only exposes the root cgroup. This exporter writes a small
node-exporter textfile so Prometheus can alert on Docker Compose CPU/restarts.

Script path in the repository: scripts/ops/docker-stats-textfile-exporter.py
(new file, mode 100755).

Environment variables:
    NODE_EXPORTER_TEXTFILE_DIR: directory the ``.prom`` file is written to.
    AIOPS_HOST_LABEL: value of the ``host`` label (defaults to the node name).
"""

from __future__ import annotations

import json
import os
import re
import subprocess
import tempfile
from pathlib import Path


TEXTFILE_DIR = Path(os.environ.get("NODE_EXPORTER_TEXTFILE_DIR", "/var/lib/node_exporter/textfile_collector"))
OUTPUT_NAME = "docker_stats.prom"
HOST_LABEL = os.environ.get("AIOPS_HOST_LABEL", os.uname().nodename)
LABEL_RE = re.compile(r'["\\\n]')

# Upper bound, in seconds, for every docker CLI invocation.
_COMMAND_TIMEOUT = 30

# Characters that must be escaped inside a Prometheus label value.
_LABEL_ESCAPES = {"\n": r"\n", "\\": r"\\", '"': r"\""}

_SIZE_RE = re.compile(r"([0-9.]+)\s*([A-Za-z]+)")

# Unit suffix (lower-cased) -> multiplier. Lower-casing lets the decimal
# spelling "kB" resolve to 1000 instead of silently falling back to 1.
_UNIT_SCALE = {
    "b": 1,
    "kib": 1024,
    "mib": 1024**2,
    "gib": 1024**3,
    "tib": 1024**4,
    "kb": 1000,
    "mb": 1000**2,
    "gb": 1000**3,
    "tb": 1000**4,
}

# (metric name, HELP text) in output order; every family is a gauge.
_METRIC_FAMILIES = (
    ("docker_container_cpu_cores", "Current Docker container CPU usage in cores from docker stats."),
    ("docker_container_cpu_limit_cores", "Docker container CPU quota in cores, 0 when unlimited."),
    ("docker_container_memory_usage_bytes", "Current Docker container memory usage in bytes from docker stats."),
    ("docker_container_memory_limit_bytes", "Docker container memory limit in bytes, 0 when unlimited."),
    ("docker_container_pids", "Current Docker container process/thread count from docker stats."),
    ("docker_container_inspect_restart_count", "Docker container restart count from Docker inspect."),
    ("docker_container_info", "Docker container inventory exposed by the textfile exporter."),
)


def _escape_label(value: str) -> str:
    """Escape backslash, double quote and newline for use in a label value."""
    return LABEL_RE.sub(lambda m: _LABEL_ESCAPES[m.group(0)], value)


def _run_json_lines(command: list[str]) -> list[dict]:
    """Run *command* and parse every non-blank stdout line as a JSON document.

    Raises:
        subprocess.CalledProcessError: the command exited non-zero.
        subprocess.TimeoutExpired: the command exceeded the timeout.
        json.JSONDecodeError: a line was not valid JSON.
    """
    result = subprocess.run(command, check=True, capture_output=True, text=True, timeout=_COMMAND_TIMEOUT)
    return [json.loads(line) for line in result.stdout.splitlines() if line.strip()]


def _run_text_lines(command: list[str]) -> list[str]:
    """Run *command* and return its non-blank stdout lines, stripped."""
    result = subprocess.run(command, check=True, capture_output=True, text=True, timeout=_COMMAND_TIMEOUT)
    return [line.strip() for line in result.stdout.splitlines() if line.strip()]


def _cpu_cores(cpu_perc: str) -> float:
    """Convert a percentage string such as ``"250.00%"`` into cores (2.5).

    Unparsable input (for example the ``--`` placeholder) yields 0.0 so that
    one odd row cannot abort the whole export.
    """
    try:
        return float(cpu_perc.strip().rstrip("%")) / 100.0
    except ValueError:
        return 0.0


def _memory_bytes(value: str) -> float:
    """Convert a human-readable size such as ``"1.5GiB"`` into bytes.

    Returns 0.0 when the text is not ``<number><unit>``. Unknown units are
    treated as plain bytes.
    """
    match = _SIZE_RE.fullmatch(value.strip())
    if not match:
        return 0.0
    number, unit = match.groups()
    try:
        return float(number) * _UNIT_SCALE.get(unit.lower(), 1)
    except ValueError:
        # The pattern admits malformed numbers such as "1.2.3".
        return 0.0


def _pid_count(value: object) -> int:
    """Convert the PIDs column into an int, 0 when missing or unparsable."""
    try:
        return int(float(str(value or "0")))
    except ValueError:
        return 0


def _inspect_containers(names: list[str]) -> dict[str, dict]:
    """Return ``docker inspect`` records keyed by container name.

    A container may disappear between ``docker ps -a`` and this call. In that
    case ``docker inspect`` exits non-zero; whatever records it still printed
    are used instead of failing the run.

    Raises:
        subprocess.CalledProcessError: the command failed and produced no
            usable records.
    """
    if not names:
        return {}
    result = subprocess.run(
        ["docker", "inspect", *names],
        check=False,
        capture_output=True,
        text=True,
        timeout=_COMMAND_TIMEOUT,
    )
    stdout = result.stdout.strip() or "[]"
    if result.returncode == 0:
        inspected = json.loads(stdout)
    else:
        try:
            inspected = json.loads(stdout)
        except json.JSONDecodeError:
            inspected = []
        if not inspected:
            raise subprocess.CalledProcessError(
                result.returncode, result.args, result.stdout, result.stderr
            )
    return {
        row.get("Name", "").lstrip("/"): row
        for row in inspected
        if isinstance(row, dict)
    }


def render_metrics(
    stats: list[dict],
    inspect_by_name: dict[str, dict],
    host_label: str = HOST_LABEL,
) -> str:
    """Render collected data as Prometheus text exposition format.

    Args:
        stats: rows from ``docker stats --format '{{json .}}'``.
        inspect_by_name: ``docker inspect`` records keyed by container name.
        host_label: value of the ``host`` label on every sample.

    Returns:
        The textfile payload, newline terminated. Each metric family is one
        contiguous group: HELP, TYPE, then all of its samples.

    Usage samples (CPU, memory, PIDs) are emitted only for containers that
    have a stats row. Limits, restart count and info are emitted for every
    known container, so a container that is between restarts at sample time
    still reports its restart count.
    """
    stats_by_name: dict[str, dict] = {}
    for row in stats:
        name = row.get("Name", "")
        if name:
            stats_by_name.setdefault(name, row)

    names = list(stats_by_name)
    names.extend(n for n in inspect_by_name if n and n not in stats_by_name)

    samples: dict[str, list[str]] = {family: [] for family, _ in _METRIC_FAMILIES}
    host = _escape_label(host_label)

    for name in names:
        labels = f'host="{host}",container_name="{_escape_label(name)}"'

        row = stats_by_name.get(name)
        if row is not None:
            mem_current = (row.get("MemUsage") or "0 B / 0 B").split("/", 1)[0].strip()
            cpu = _cpu_cores(row.get("CPUPerc") or "0%")
            samples["docker_container_cpu_cores"].append(
                f"docker_container_cpu_cores{{{labels}}} {cpu:.6f}"
            )
            samples["docker_container_memory_usage_bytes"].append(
                f"docker_container_memory_usage_bytes{{{labels}}} {_memory_bytes(mem_current):.0f}"
            )
            samples["docker_container_pids"].append(
                f"docker_container_pids{{{labels}}} {_pid_count(row.get('PIDs'))}"
            )

        inspected = inspect_by_name.get(name)
        if not isinstance(inspected, dict):
            inspected = {}
        host_config = inspected.get("HostConfig") or {}
        state = inspected.get("State") or {}
        nano_cpus = float(host_config.get("NanoCpus") or 0)
        memory_limit = float(host_config.get("Memory") or 0)
        restart_count = int(inspected.get("RestartCount") or 0)
        status = _escape_label(str(state.get("Status", "")))

        samples["docker_container_cpu_limit_cores"].append(
            f"docker_container_cpu_limit_cores{{{labels}}} {nano_cpus / 1_000_000_000:.6f}"
        )
        samples["docker_container_memory_limit_bytes"].append(
            f"docker_container_memory_limit_bytes{{{labels}}} {memory_limit:.0f}"
        )
        samples["docker_container_inspect_restart_count"].append(
            f"docker_container_inspect_restart_count{{{labels}}} {restart_count}"
        )
        samples["docker_container_info"].append(
            f'docker_container_info{{{labels},status="{status}"}} 1'
        )

    lines: list[str] = []
    for family, help_text in _METRIC_FAMILIES:
        lines.append(f"# HELP {family} {help_text}")
        lines.append(f"# TYPE {family} gauge")
        lines.extend(samples[family])
    return "\n".join(lines) + "\n"


def collect() -> str:
    """Query the local Docker CLI and return the textfile payload."""
    stats = _run_json_lines([
        "docker",
        "stats",
        "--no-stream",
        "--format",
        "{{json .}}",
    ])
    names = _run_text_lines(["docker", "ps", "-a", "--format", "{{.Names}}"])
    return render_metrics(stats, _inspect_containers(names))


def write_textfile(payload: str, directory: Path = TEXTFILE_DIR, name: str = OUTPUT_NAME) -> Path:
    """Atomically publish *payload* as ``directory / name`` with mode 0644.

    The temporary file is made world-readable before the rename, so the
    published file never exists with the temp file's restrictive mode. The
    temporary file is removed if anything fails.

    Returns:
        The path of the published file.
    """
    directory.mkdir(parents=True, exist_ok=True)
    output_path = directory / name
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile("w", dir=directory, delete=False, encoding="utf-8") as tmp:
            tmp_path = Path(tmp.name)
            tmp.write(payload)
        tmp_path.chmod(0o644)
        tmp_path.replace(output_path)
    except BaseException:
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except FileNotFoundError:
                pass
        raise
    return output_path


def main() -> None:
    """Collect Docker metrics and publish them for the textfile collector."""
    TEXTFILE_DIR.mkdir(parents=True, exist_ok=True)
    payload = collect()
    write_textfile(payload, TEXTFILE_DIR, OUTPUT_NAME)


if __name__ == "__main__":
    main()