fix(ops): add docker host resource baseline guardrails

2026-05-05 13:45:09 +08:00
parent a57e3d3d75
commit e8e6748f70
9 changed files with 586 additions and 8 deletions
--- a/apps/api/alert_rules.yaml
+++ b/apps/api/alert_rules.yaml
@@ -163,6 +163,37 @@ rules:
      responsibility: INFRA
      reasoning: "[規則匹配] 主機層資源告警，自動 SSH 執行診斷指令（只讀，不修改），收集根因資訊後推送 Telegram 讓 SRE 決策。"

+  # 2026-05-05 ogt + Codex: 110/188 長時間過載事故後補 Docker Compose 過載與 restart spike 路由。
+  # 原則：過載與重啟暴增只能先診斷，禁止通用 docker restart；由 LLM + Playbook trust 決定 service-specific 修復。
+  - id: docker_baseline_overload_alert
+    priority: 44
+    description: Docker Compose 服務過載 / restart spike 基線告警（cadvisor + textfile exporter）
+    match:
+      alertname:
+        - HostLoadAverageSustainedHigh
+        - DockerContainerCpuSustainedHigh
+        - DockerContainerCpuRunawayCritical
+        - DockerContainerMemoryLimitPressure
+        - DockerContainerRestartSpike
+    response:
+      action_title: "🔍 Docker/Host 過載自動診斷 — 禁止通用重啟"
+      description: "110/188 Docker Compose 或主機 load 長時間偏離 baseline。AI 需先收集容器 CPU、restart、logs、ClickHouse/Kafka/爬蟲狀態，再選擇限流、降併發或服務專屬 playbook。"
+      suggested_action: SSH_DIAGNOSE
+      kubectl_command: "ssh {host} 'echo \"=== LOAD ===\"; uptime; echo \"=== TOP ===\"; ps aux --sort=-%cpu | head -20; echo \"=== DOCKER ===\"; docker stats --no-stream | head -40'"
+      estimated_downtime: "N/A"
+      risk: low
+      responsibility: INFRA
+      responsibility_reasoning: "Docker Compose / bare-metal 過載屬主機與平台資源治理，不能交給 K8s restart 處理"
+      secondary_teams: [BE, SRE]
+      optimization:
+        - type: BASELINE_CHECK
+          description: "比較 load5/core、單容器 CPU core、restart spike 與 24h 動態基線"
+          command: "Prometheus query: node_load5/core + rate(container_cpu_usage_seconds_total[5m]) + increase(docker_container_restart_count[15m])"
+        - type: SERVICE_SPECIFIC_REPAIR
+          description: "依服務選擇專屬修復：ClickHouse 降 merge / scheduler 限 concurrency / litellm 修 health 或路由 / exporter 降 collector"
+          command: "由 AI 根據 evidence snapshot 選擇已驗證 playbook"
+      reasoning: "[規則匹配] 長期過載先 read-only 診斷與分流，禁止通用 docker restart；修復必須服務專屬且可回寫 Playbook trust。"
+
  - id: high_cpu
    priority: 40
    description: K8s Pod/Deployment CPU 使用率過高
--- a/apps/api/src/services/proactive_inspector.py
+++ b/apps/api/src/services/proactive_inspector.py
@@ -68,20 +68,66 @@ MONITORED_METRICS: list[dict[str, Any]] = [
        # cadvisor up=0（prod-docker-188 離線），改用 node-exporter node-level CPU
        # 實測確認：avg(rate(node_cpu_seconds_total{mode!="idle"}[5m])) → 有資料
        # threshold 0.85 = 85% CPU 使用率（node level，0-1 比例）
-        "promql": 'avg(rate(node_cpu_seconds_total{mode!="idle"}[5m]))',
+        "promql": 'avg(rate(node_cpu_seconds_total{host="188",mode!="idle"}[5m]))',
        "threshold": 0.85,   # > 85% node CPU（所有 core 平均）
        "description": "Node 188 CPU 使用率（node-exporter，cadvisor 停止時替代）",
    },
+    {
+        # 2026-05-05 ogt + Codex: 110/188 CPU 過載事故後補齊 110 動態基線。
+        # Why: 事故主熱點在 110 Sentry ClickHouse/Snuba/Kafka，舊 inspector 只看未過濾的全域 CPU。
+        "name": "cpu_usage_node_110",
+        "promql": 'avg(rate(node_cpu_seconds_total{host="110",mode!="idle"}[5m]))',
+        "threshold": 0.85,
+        "description": "Node 110 CPU 使用率（node-exporter）",
+    },
    {
        # 2026-04-26 H2 hotfix: 改名 _node_188
        "name": "memory_usage_node_188",
        # cadvisor 停止，改用 node-exporter 節點記憶體使用率比例（0-1）
        # 實測確認：188 機器 62.76 GiB，當前 ~30% 使用率
        # threshold 0.85 = 85% node memory usage
-        "promql": '(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes',
+        "promql": '(node_memory_MemTotal_bytes{host="188"} - node_memory_MemAvailable_bytes{host="188"}) / node_memory_MemTotal_bytes{host="188"}',
        "threshold": 0.85,   # > 85% node memory（0-1 比例）
        "description": "Node 188 記憶體使用率（node-exporter，cadvisor 停止時替代）",
    },
+    {
+        # 2026-05-05 ogt + Codex: 110/188 CPU 過載事故後補齊 110 記憶體基線。
+        "name": "memory_usage_node_110",
+        "promql": '(node_memory_MemTotal_bytes{host="110"} - node_memory_MemAvailable_bytes{host="110"}) / node_memory_MemTotal_bytes{host="110"}',
+        "threshold": 0.85,
+        "description": "Node 110 記憶體使用率（node-exporter）",
+    },
+    {
+        # 2026-05-05 ogt + Codex: load5 / core ratio catches sustained overload that CPU% under-reports. on(host) group_left is required: node_load5 carries instance/job/host labels and never matches an unlabelled core count, which made the query return no data.
+        "name": "load5_per_core_110",
+        "promql": 'node_load5{host="110"} / on(host) group_left count by(host) (count by(host,cpu) (node_cpu_seconds_total{host="110",mode="idle"}))',
+        "threshold": 1.50,
+        "description": "Node 110 load5 / CPU core 比例",
+    },
+    {
+        "name": "load5_per_core_188",
+        "promql": 'node_load5{host="188"} / on(host) group_left count by(host) (count by(host,cpu) (node_cpu_seconds_total{host="188",mode="idle"}))',
+        "threshold": 1.50,
+        "description": "Node 188 load5 / CPU core 比例",
+    },
+    {
+        # 2026-05-05 ogt + Codex: 補 Docker Compose 服務層基線，抓單容器長期吃超過 2 core。
+        "name": "docker_max_container_cpu_cores",
+        "promql": 'max(docker_container_cpu_cores) or max(rate(container_cpu_usage_seconds_total{name!="",id!="/"}[5m])) or on() vector(0)',
+        "threshold": 2.0,
+        "description": "Docker 單容器 CPU core 使用量上限",
+    },
+    {
+        # 2026-05-05 ogt + Codex: cAdvisor v0.47 無 container restart metric，改吃 node-exporter textfile。
+        "name": "docker_restart_spike_15m",
+        "promql": (
+            'sum(increase(docker_container_restart_count[15m])) '
+            'or sum(increase(docker_container_inspect_restart_count[15m])) '
+            'or on() vector(0)'
+        ),
+        "threshold": 5.0,
+        "description": "Docker 容器 15 分鐘重啟增量",
+    },
    {
        "name": "pod_restart_rate",
        # kube-state-metrics: namespace=awoooi-prod，sum 聚合避免 multi-vector
@@ -222,8 +268,8 @@ class ProactiveInspector:

    async def _inspect_dynamic_baseline(self, report: InspectionReport) -> None:
        """從 Prometheus 抓取當前值，與 Holt-Winters 基線比對。"""
-        from src.services.dynamic_baseline_service import get_dynamic_baseline_service
        from src.core.feature_flags import aiops_flags
+        from src.services.dynamic_baseline_service import get_dynamic_baseline_service

        if not aiops_flags.AIOPS_P4_DYNAMIC_BASELINE:
            return
@@ -273,8 +319,8 @@ class ProactiveInspector:

    async def _inspect_log_patterns(self, report: InspectionReport) -> None:
        """掃描 K8s Pod 日誌，偵測新 log pattern。"""
-        from src.services.log_anomaly_detector import get_log_anomaly_detector
        from src.core.feature_flags import aiops_flags
+        from src.services.log_anomaly_detector import get_log_anomaly_detector

        if not aiops_flags.AIOPS_P4_LOG_ANOMALY:
            return
@@ -313,8 +359,8 @@ class ProactiveInspector:

    async def _inspect_trends(self, report: InspectionReport) -> None:
        """對各 metric 做 4h 趨勢外推。"""
-        from src.services.trend_predictor import get_trend_predictor
        from src.core.feature_flags import aiops_flags
+        from src.services.trend_predictor import get_trend_predictor

        if not aiops_flags.AIOPS_P4_TREND_PREDICTOR:
            return
@@ -365,8 +411,8 @@ class ProactiveInspector:

    async def _retrain_baselines_background(self) -> None:
        """背景重訓所有 Holt-Winters 基線（不阻塞巡檢）。"""
-        from src.services.dynamic_baseline_service import get_dynamic_baseline_service
        from src.core.feature_flags import aiops_flags
+        from src.services.dynamic_baseline_service import get_dynamic_baseline_service

        if not aiops_flags.AIOPS_P4_DYNAMIC_BASELINE:
            return
@@ -392,6 +438,7 @@ class ProactiveInspector:
    async def _fetch_current_value(self, promql: str) -> float | None:
        """從 Prometheus 抓取當前值（instant query）。"""
        import httpx
+
        from src.core.config import settings

        try:
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -6,6 +6,114 @@

 ---

+## 2026-05-05 | 110/188 CPU/Mem 配額全景盤點 + Docker baseline 監控落地
+
+**背景**：統帥擔心 Claude Code 對 110/188 服務 CPU/memory limit 亂配置，造成服務卡死或慢性過載；本輪接續盤點 live Docker inspect / docker stats / compose 宣告。
+
+**現場結論**：
+- 110 仍高負載，不是單純等待回補即可：load 約 `23.84 / 27.11 / 34.67`；Sentry ClickHouse 4 CPU / 8GiB 貼著 CPU 上限跑，Kafka 3GiB 使用率約 84%，taskbroker 1 CPU 接近滿載，taskscheduler 512MiB 約 75%。
+- 110 Kafka lag 近乎清空，ClickHouse 仍在重 merge，node-exporter 自己曾因 `arp` / `netclass` / `netdev` collector 單次 scrape 花 17s+ 而自傷。
+- 188 已回穩但仍需節流治理：momo-scheduler 2 CPU / 2GiB 是安全欄不是根治；SignOz ClickHouse 4 CPU / 24GiB 目前合理。
+- 188 momo-scheduler 日誌顯示三張 schema 缺表（`ai_calls` / `learning_episodes` / `host_health_probes`）與 Elephant Alpha/OpenClaw action drift，這是背景任務反覆失敗，不是 CPU/memory limit 問題。
+- 110 node-exporter textfile path live drift：原指向 `/home/ollama/node_exporter_textfiles`，110 上不存在，造成 Docker Compose 指標半盲。
+
+**本次落地**：
+- 新增 `scripts/ops/docker-stats-textfile-exporter.py`，輸出 Docker container CPU cores / CPU limit / memory usage / memory limit / restart count / info。
+- 110：部署 exporter 到 `/home/wooo/scripts/`，新增 cron，每分鐘寫 `/home/wooo/node_exporter_textfiles/docker_stats.prom`；修正 `/home/wooo/monitoring/docker-compose.yml` 的 node-exporter textfile path，並只重建 node-exporter。
+- 110：關閉 node-exporter 高成本 collector：`arp`、`netclass`、`netdev`；scrape duration 從約 17s+ 降到 CPU/mem/load/textfile 等核心 collector 都 < 0.1s，node-exporter CPU 從約 80% 降到 0-5%。
+- 110：Kafka lag 已近零後，將 `/opt/sentry/.env` `SENTRY_TASKWORKER_CONCURRENCY` 從 4 降到 2，只重建 taskworker（snuba-api 因 compose dependency 被重建一次），taskworker command 已確認 `--concurrency=2`。
+- 188：部署 exporter 到 `/home/ollama/scripts/`，新增 cron，每分鐘寫 `/home/ollama/node_exporter_textfiles/docker_stats.prom`；保留既有 `docker_restart_count.prom`。
+- 188：套用既有 additive migrations `024_create_ai_calls_table.sql`、`028_create_learning_episodes.sql`、`029_create_host_health_probes.sql`，補齊 scheduler 正在寫入的 schema，未重啟服務。
+- `ops/monitoring/alerts*.yml`：新增 `HostLoadAverageSustainedHigh`、`DockerContainerCpuSustainedHigh`、`DockerContainerCpuRunawayCritical`、`DockerContainerMemoryLimitPressure`、`DockerContainerRestartSpike`。
+- `apps/api/alert_rules.yaml`：新增 Docker/Host 過載路由，強制走 `SSH_DIAGNOSE`，禁止通用 docker restart。
+- API GitOps：用最新 `main` (`a57e3d3d`) 加本次兩個 API 修補檔，在 188 建置並推送 `192.168.0.110:5000/awoooi/api:resource-baseline-20260505-a57e3d3`；`k8s/awoooi-prod/kustomization.yaml` 指向此 tag，避免手動 `kubectl set image` 被 Argo 回滾。
+- `docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md`：記錄 live 配額盤點、baseline policy、反模式與下一步 rollout 順序。
+- Prometheus 已 reload，97 條規則載入；新 baseline rules 全部存在。
+
+**驗證**：
+- `node_textfile_scrape_error`：110/188/112 全為 0。
+- Prometheus 已可查到 `docker_container_cpu_cores{host="110",container_name="sentry-self-hosted-clickhouse-1"}`、`docker_container_memory_limit_bytes{host="110",container_name="sentry-self-hosted-kafka-1"}`、`docker_container_cpu_cores{host="188",container_name="momo-scheduler"}`。
+- 110：taskworker / snuba-api / ClickHouse / Kafka healthy；Sentry Kafka `snuba-consumers` 主要 lag 0-1；load 從約 30+ 降到 `11.83 / 20.97 / 27.41`（1m 已降，15m 仍需等 merge 平滑）。
+- 188：三張 DB 表存在；migration 後只剩 `Fallback (111)` 健康警告，`UndefinedTable` 未再出現；momo-db CPU 回到約 0.6-2.5%，host load 約 `2.47 / 2.80 / 4.28`。
+- Prometheus 新 baseline alerts 查詢目前無 firing。
+- 新規則目前 pending：110 `HostLoadAverageSustainedHigh`、110 `DockerContainerCpuSustainedHigh` for Sentry ClickHouse。
+- `apps/api/.venv/bin/python -m pytest apps/api/tests/test_classify_alert_early.py apps/api/tests/test_alert_rule_engine_validation.py -q` → 89 passed。
+- `ruff check apps/api/src/services/proactive_inspector.py`、`py_compile scripts/ops/docker-stats-textfile-exporter.py`、`git diff --check` → passed。
+- `kubectl kustomize k8s/awoooi-prod` → API/worker image 均解析為 `resource-baseline-20260505-a57e3d3`。
+
+**下一步**：
+- 不要再降低 ClickHouse / Kafka memory limit；先觀察 backlog drain。
+- 若 110 ClickHouse 15-30 分鐘後仍持續 >2.5 cores，下一步查 merge/query 類型；不要靠降低 memory 或泛用 restart。
+- 188 下一步修 Elephant Alpha/OpenClaw allowed-action drift，避免 AI 自動修復決策計入 circuit breaker；momo-scheduler 2 CPU / 2GiB 暫時保留。
+
+## 2026-05-05 | ADR-110 / AwoooP GCP Ollama compute pool 收斂
+
+**背景**：統帥批准將 GCP-A / GCP-B Ollama 納入 AwoooP 推進計畫，不只作 failover，而是作為 platform-level Ollama compute pool。
+
+**2026-05-05 live 驗證結論**：
+- 生產 Deployment 實際 env：`OLLAMA_URL=110:11435`、`OLLAMA_SECONDARY_URL=110:11436`、`OLLAMA_FALLBACK_URL=192.168.0.111:11434`；ConfigMap 已是 `110:11437`，但 Deployment explicit env 尚未一致。
+- Pod 內 `110:11435` / `110:11436` 均可 `/api/tags` 成功，兩台 GCP Ollama 有實際可用。
+- `192.168.0.111:11434` 從 Pod 內 `No route to host`；`110:11437/nginx-health` 從外部可回 OK，但 `/api/tags` 回 502，表示 110 proxy block 存在但 upstream `.111` 不健康或不可達。
+- live NetworkPolicy 只允許 Pod → 110 的 `11435/11436`，未允許 `11437`；repo manifest 已補 11437，但尚未 live apply。
+- 最近告警跑到 Gemini 的主因不是 fallback order 沒設定，而是 `OllamaGcpBProvider` 只 override `_endpoint_url()`，但繼承的 `analyze()` 仍硬打 `settings.OLLAMA_URL`；log 顯示 router 選 `ollama_gcp_b`，實際錯打 `110:11435` 504，Local 又不可用，最後才落 Gemini。
+
+**本次修補**：
+- `ADR-110`：從 direct GCP IP 拓撲改寫為正式 runtime 拓撲：K8s → `192.168.0.110:11435/11436/11437` → GCP-A/GCP-B/Local；direct GCP IP 僅是 upstream / 非 K8s fallback。
+- `DEPLOY-GCP-OLLAMA-PROXY.md`：補 11437 Local fallback 驗證、NetworkPolicy port、`kubectl set env` 警告與三層 proxy route。
+- `k8s/awoooi-prod/06-deployment-api.yaml`：修正宣告檔 drift，`OLLAMA_FALLBACK_URL` 與 ConfigMap 對齊為 `http://192.168.0.110:11437`。未執行 live apply。
+- 新增 `INV-10-ollama-call-sites.md`：盤點 failover-aware 路徑與仍直讀 `OLLAMA_URL` 的 production call sites，並定義 GCP-A interactive / GCP-B batch+RAG+shadow / Local privacy+DR 分工。
+- 新增 `apps/api/tests/test_ollama_call_site_inventory.py`：把現有 direct `OLLAMA_URL` legacy debt 鎖成上限；新增 direct call site 必須改走 resolver/provider registry/EffectivePolicy，且 ConfigMap / Deployment 的三層 Ollama env 必須一致。
+- 新增 `services/ollama_endpoint_resolver.py`：最小 workload-aware resolver；`embedding` / `rag` / `code_review` / `batch` / `shadow` / `canary` 優先 GCP-B，interactive 留 GCP-A，local-required 留 Local。
+- 第一批低風險 runtime slice：`embedding_service.py`、`knowledge_rag_service.py`、`playbook_rag.py`、`local_code_review_service.py` 改走 resolver，讓批次/RAG/審查路徑優先用 GCP-B；未碰 `decision_manager`、OpenClaw、Hermes、chat manager 主線。
+- `ai_providers/ollama.py`：修正 base `OllamaProvider.analyze()` / `health_check()` 使用 `_endpoint_url()`，讓 `OllamaGcpBProvider` 選中時真正打 `OLLAMA_SECONDARY_URL`，不是錯打 primary。
+- `k8s/awoooi-prod/02-network-policy.yaml`：repo source 補 Pod → 110:11437 egress；未執行 live apply。
+- `MASTER-WORKPLAN.md`、`DETAILED-IMPLEMENTATION-PLAN.md`、`INV-4`、`INV-6`、`AWOOOP-MONITORING-ALERTING-CONVERGENCE.md`：整合 INV-10 與 GCP-B active-active 策略。
+
+**驗證**：
+- `apps/api/.venv/bin/python -m pytest apps/api/tests/test_ollama_call_site_inventory.py -q` → 2 passed。
+- `apps/api/.venv/bin/python -m ruff check apps/api/tests/test_ollama_call_site_inventory.py --fix` → fixed import order，rerun clean。
+- `apps/api/.venv/bin/python -m pytest apps/api/tests/test_ollama_endpoint_resolver.py apps/api/tests/test_ollama_call_site_inventory.py -q` → 6 passed。
+- `apps/api/.venv/bin/python -m ruff check apps/api/src/services/ollama_endpoint_resolver.py apps/api/src/services/embedding_service.py apps/api/src/services/knowledge_rag_service.py apps/api/src/services/local_code_review_service.py apps/api/src/services/playbook_rag.py apps/api/tests/test_ollama_endpoint_resolver.py apps/api/tests/test_ollama_call_site_inventory.py` → passed after ruff import-order fix。
+- `apps/api/.venv/bin/python -m pytest apps/api/tests/test_ollama_provider_endpoints.py apps/api/tests/test_ollama_failover_manager.py::TestThreeLayerFailover::test_gcp_a_offline_gcp_b_healthy_uses_gcp_b apps/api/tests/test_ollama_endpoint_resolver.py apps/api/tests/test_ollama_call_site_inventory.py -q` → 9 passed。
+- `apps/api/.venv/bin/python -m ruff check apps/api/src/services/ai_providers/ollama.py apps/api/tests/test_ollama_provider_endpoints.py` → passed after import-order fix。
+
+**下一步**：
+- 不直接重寫 Tier 3 runtime；下一批先收斂 `apps/api/src/api/v1/rag.py` 與 `apps/api/scripts/reembed_bge_m3.py` 這兩個仍偏 batch 的 direct path。
+- 再補 provider health snapshot，讓 health/report 類路徑可同時呈現 GCP-A/GCP-B/Local，而不是只看 primary。
+- OpenClaw/Hermes/chat manager 只做 EffectivePolicy shadow compare，不直接切換。
+
+---
+
+## 2026-05-05 | AwoooP Claude Code 盤點修補 + convergence map 整合
+
+**盤點結論**：
+- Claude Code 的 AwoooP 檔案多數確實已落地（ADR-106~124、INV-1~9、migrations、contract packages、runtime/API shell、Operator Console routes）。
+- 但有幾個「宣告完成 ≠ 線上路徑生效」缺口：MCP redaction middleware 有寫但 Gateway 回傳 Runtime/LLM 前未強制套用；Operator Console 前端讀 `items/status/name/is_suspended`，後端實際回 `tenants/contracts/runs/state/display_name/is_active`；ADR-106 本體缺 Quantified Gates 補章。
+- 沒有執行 production DB migration；`awooop_phase*.sql` 仍需依部署順序、rollback 檢查、DB expert review 後再套用。
+
+**本次修補**：
+- `plugins/mcp/gateway.py`：Gateway 成功執行後先 `redact_mcp_output()` 再回傳給 Runtime/LLM；gateway audit hash 改用 redacted input/output 計算。
+- `services/mcp_audit_service.py`：legacy `mcp_audit_log` 寫入前補上 string pattern redaction，避免 DSN/token/internal IP 只因 key 名未命中而外洩。
+- `tests/test_mcp_credential_isolation.py`：新增 gateway return redaction + legacy audit redaction regression tests。
+- `ADR-106`：新增 `D9.1 Quantify Strangler Fig Promotion Gates`，正式化 shadow→canary→read_only→suggest→auto_remediate 的量化 gate。
+- `MASTER-WORKPLAN.md` + `AWOOOP-MONITORING-ALERTING-CONVERGENCE.md`：納入 monitoring/alerting convergence map，固定 mirror → read-only EffectivePolicy comparison → read-only MCP Gateway wrapper → Channel Event wrapper → low-risk LLM strangler 順序。
+- `apps/web/src/app/[locale]/awooop/*`：修正 Operator Console 前端與後端 response contract 對齊；approval decide 補 `project_id`；run list 改用 `state` filter 與 lowercase FSM state。
+
+**驗證**：
+- `apps/api/.venv/bin/python -m pytest apps/api/tests/test_mcp_credential_isolation.py -q` → 12 passed。
+- `apps/api/.venv/bin/python -m ruff check apps/api/src/plugins/mcp/gateway.py apps/api/src/services/mcp_audit_service.py apps/api/tests/test_mcp_credential_isolation.py` → passed。
+- `pnpm --dir apps/web exec tsc --noEmit` → passed。
+- `pnpm --dir apps/web run build` → passed；AwoooP routes `/[locale]/awooop/*` 全部成功建置。
+- `git diff --check` → passed。
+
+**仍未完成 / 不可誤判完成**：
+- production DB migration 尚未 apply。
+- `approval_records` 仍未 project-scoped；部分 legacy repository/service 仍依賴 RLS default 或無 explicit project filter。
+- direct MCP/provider call sites 尚未全面 `forbid-new`；只能視為 wrapper 過渡期。
+- `apps/web/package.json` / `pnpm-lock.yaml` 的 Next 14.2.25 bump 及 `tsconfig.tsbuildinfo` dirty state 是既有 session 變更，本次未回退。
+
+---
+
 ## 2026-05-05 | ADR-110 三層容災補齊 + 四台主機密碼 SSH 恢復

 **ADR-110 Local Fallback（port 11437）**：
@@ -30,7 +138,7 @@
 ```
 OLLAMA_URL           = http://192.168.0.110:11435  ← GCP-A primary（via nginx proxy）
 OLLAMA_SECONDARY_URL = http://192.168.0.110:11436  ← GCP-B secondary（via nginx proxy）
-OLLAMA_FALLBACK_URL  = http://192.168.0.111:11434  ← 111 兜底
+OLLAMA_FALLBACK_URL  = http://192.168.0.110:11437  ← Local 111 fallback（via nginx proxy）
 ```
 - 驗證：兩台 GCP 各 10 個模型，200 OK
 - 熱更新：`kubectl set env`（不動 image tag，避免 IMAGE_TAG_PLACEHOLDER 蓋掉）
--- a/docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md
+++ b/docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md
@@ -0,0 +1,72 @@
+# 110/188 Host Resource Baseline
+
+> 2026-05-05 ogt + Codex
+> Scope: 110 DevOps/Sentry host and 188 AI/Web host Docker Compose resource limits.
+
+## Current Live Finding
+
+### 110
+
+| Service | Live Limit | Live Usage Snapshot | Verdict |
+|---|---:|---:|---|
+| Sentry ClickHouse | 4 CPU / 8 GiB | ~235-291% CPU / 3.3-3.4 GiB | CPU capped but still hottest. Do not lower memory; keep merge settings explicit. |
+| Sentry Kafka | 2 CPU / 3 GiB | ~40-55% CPU / 2.5 GiB (84%) | Memory is close to pressure. Do not reduce memory. |
+| Sentry taskworker | 2 CPU / 2 GiB, concurrency 2 | ~120-181% CPU after restart | Concurrency reduced from 4 to 2 after Kafka lag cleared. Watch Sentry task latency before further changes. |
+| Sentry taskbroker | 1 CPU / 512 MiB | ~70-98% CPU / 160 MiB | CPU is tight; increasing may improve backlog but can raise host load. |
+| Sentry taskscheduler | 0.5 CPU / 512 MiB | ~13% CPU / 387 MiB (76%) | Memory is tight; alert at 85% before it stalls. |
+| Gitea | 3 CPU / 3 GiB | ~4% CPU / 2.18 GiB (73%) | Good cap; memory headroom is not huge. |
+| node-exporter | 1 CPU / 256 MiB | ~0-5% CPU / 8 MiB | Good after disabling expensive `arp`, `netclass`, and `netdev` collectors. |
+| cadvisor | 1 CPU / 512 MiB | ~0% CPU / 27 MiB | Good safety cap. |
+| Harbor / Langfuse / Prometheus / Grafana / Nginx | mostly unlimited | currently low | Needs staged limits, but not during Sentry backlog drain. |
+
+### 188
+
+| Service | Live Limit | Live Usage Snapshot | Verdict |
+|---|---:|---:|---|
+| momo-scheduler | 2 CPU / 2 GiB | ~0.3% CPU / 163 MiB after crawler burst | CPU cap is working. Next fix is crawler concurrency and failed background jobs, not lower CPU. |
+| SignOz ClickHouse | 4 CPU / 24 GiB | ~93-133% CPU / 1.1 GiB | Healthy enough; keep current cap. |
+| SignOz Zookeeper | 1 CPU / 2 GiB | ~8-18% CPU / 1.09 GiB | OK. |
+| cadvisor | 1.5 CPU / 1 GiB | ~0% CPU / 28 MiB | Good. |
+| litellm | unlimited | ~0.6-0.9% CPU / 780 MiB | Add modest cap after observing traffic; do not re-add DATABASE_URL. |
+| momo-pro-system / momo-db | unlimited | DB had short CPU bursts, then ~0.6% with no active long query | Needs service-specific limits after scheduler/schema pressure is controlled. |
+| Monitoring tools / websites / exporters | mostly unlimited | low | Add caps gradually with textfile alerts watching pressure. |
+
+## Baseline Policy
+
+Use these thresholds for alerting and AI triage:
+
+| Signal | Threshold | Action |
+|---|---:|---|
+| host load5 / core | > 1.5 for 15m | Critical SSH diagnosis; classify top service before repair. |
+| Docker container CPU | > 2 cores for 10m | Warning diagnosis; check limit, backlog, and workload type. |
+| Docker container CPU | > 4 cores for 15m | Critical diagnosis; never generic restart. |
+| Docker memory / limit | > 85% for 10m | Warning; raise memory or reduce workload, never lower the limit. |
+| Docker restarts | > 5 in 15m | Critical; pull logs and fix crash signature. |
+
+## Rules
+
+1. Do not reduce ClickHouse CPU/memory while merges or Kafka backlog are draining.
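+2. Do not set ClickHouse `background_pool_size` so low that `background_pool_size * background_merges_mutations_concurrency_ratio` falls below any of the three MergeTree `number_of_free_entries_in_pool_to_*` thresholds (`..._lower_max_size_of_merge`, `..._execute_mutation`, `..._execute_optimize_entire_partition`); lower those thresholds explicitly first.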
+3. Do not use lower memory limits as a load-shedding tool. That creates OOM/restart loops.
+4. For Chrome crawlers, cap concurrency first; CPU caps are only the safety rail.
+5. For Kafka/Snuba, treat high CPU as backlog digestion unless lag stops decreasing.
+6. For monitoring tools, caps are required, but every cap must be paired with self-monitoring.
+7. Every Docker Compose host must emit `docker_container_cpu_cores`, `docker_container_memory_*`, and a restart counter via node-exporter textfile.
+8. Disable node-exporter collectors that are slow or failing on each host; exporter scrape time is part of the resource baseline.
+
+## Next Safe Rollout Order
+
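+1. Deploy `scripts/ops/docker-stats-textfile-exporter.py` to 110 and 188 textfile collector cron. (Done 2026-05-05 on both hosts; repeat for any new Docker Compose host.)
+2. Reload Prometheus rules with the new Docker CPU/memory/restart baseline alerts. (Done 2026-05-05; 97 rules loaded.)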
+3. Observe 110 for one drain window after node-exporter collector trim and taskworker concurrency 2. Kafka lag is now near zero; if ClickHouse remains high, tune merge/query behavior, not Kafka consumers.
+4. Tune `momo-scheduler` crawler concurrency on 188; keep 2 CPU / 2 GiB until success rate and latency prove it is too low.
+5. Fix 188 Elephant Alpha/OpenClaw allowed-action drift before enabling resource auto-repair beyond diagnosis.
+6. Add modest caps to currently unlimited low-risk services in small batches.
+
+## Known Anti-Patterns
+
+- `docker restart` as a response to sustained CPU.
+- Lowering ClickHouse merge pool without validating ClickHouse 25.x thresholds.
+- Capping Kafka below current working set during backlog recovery.
+- Treating "no alert" as healthy when cAdvisor or textfile exporters are missing.
+- Letting monitoring collectors spend seconds per scrape; this turns observability into load.
--- a/k8s/awoooi-prod/kustomization.yaml
+++ b/k8s/awoooi-prod/kustomization.yaml
@@ -39,7 +39,7 @@ resources:
 images:
 - name: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
  newName: 192.168.0.110:5000/awoooi/api
-  newTag: 2e17325c3f66c16783172e50c2d29f686d2b574e
+  newTag: resource-baseline-20260505-a57e3d3
 - name: 192.168.0.110:5000/library/web:IMAGE_TAG_PLACEHOLDER
  newName: 192.168.0.110:5000/awoooi/web
  newTag: 00684403887745e35848bbbab5ac795cfdd6fd58
--- a/k8s/monitoring/prometheus.yml
+++ b/k8s/monitoring/prometheus.yml
@@ -55,6 +55,7 @@ scrape_configs:
      - targets:
          - https://aiops.wooo.work
          - https://mo.wooo.work
+          - http://192.168.0.188:4000/health/liveliness
          - http://192.168.0.110:3001
          - http://192.168.0.120:31234
          - http://192.168.0.120:31235
--- a/ops/monitoring/alerts-unified.yml
+++ b/ops/monitoring/alerts-unified.yml
@@ -48,6 +48,25 @@ groups:
          summary: "主機 {{ $labels.host }} CPU 高負載"
          description: "CPU 使用率超過 80%"

+      - alert: HostLoadAverageSustainedHigh
+        # 2026-05-05 ogt + Codex: sustained-overload baseline for hosts 110/188. CPU% only measures busy time and under-reports the runnable queue built up by ClickHouse merges, Kafka catch-up, and Chrome/Ollama.
+        # group_left keeps node_load5's own labels (instance, job); a plain one-to-one on(host) match keeps only `host`, which left $labels.instance empty in auto_repair_action.
+        expr: node_load5{host=~"110|188"} / on(host) group_left count by(host) (count by(host,cpu) (node_cpu_seconds_total{mode="idle",host=~"110|188"})) > 1.5
+        for: 15m
+        labels:
+          severity: critical
+          layer: systemd-188
+          team: ops
+          auto_repair: "true"
+          mcp_provider: "ssh_host"
+          host_type: "bare_metal"
+          alert_category: "host_resource"
+        annotations:
+          summary: "主機 {{ $labels.host }} load5/core 長時間過高"
+          description: "load5 / CPU core > 1.5 持續 15 分鐘；這通常代表 runnable queue 已長期塞車，不是短暫尖峰。"
+          auto_repair_action: "ssh {{ $labels.instance }} 'echo \"=== LOAD ===\"; uptime; echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -25; echo \"=== DOCKER STATS ===\"; docker stats --no-stream | head -40'"
+          runbook: "先判斷高 load 來源：ClickHouse merge / Kafka backlog / Chrome scraper / Ollama runner / exporter；只允許 read-only 診斷，自動修復需走服務專屬 playbook。"
+
      - alert: HostOutOfMemory
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 5m
@@ -576,6 +595,74 @@ groups:
          summary: "容器 {{ $labels.container }} 已停止"
          description: "主機 {{ $labels.host }} 容器 {{ $labels.container }} 已 exited/dead，持續 1 分鐘"

+      - alert: DockerContainerCpuSustainedHigh
+        # 2026-05-05 ogt + Codex: Docker Compose 長期過載基線。
+        # Baseline: 單容器 >2 core 10m 為 warning；用於提早抓 cadvisor、ClickHouse、momo-scheduler、Ollama runner 類問題。
+        expr: docker_container_cpu_cores > 2
+        for: 10m
+        labels:
+          severity: warning
+          layer: docker
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "true"
+        annotations:
+          summary: "容器 {{ $labels.container_name }} CPU 持續超過 2 core"
+          description: "{{ $labels.container_name }} 5m CPU 使用 >2 core 且持續 10 分鐘，需檢查是否缺 cpus limit、merge backlog、爬蟲尖峰或模型推理。"
+          auto_repair_action: "ssh {{ $labels.instance }} 'docker stats --no-stream {{ $labels.container_name }}; docker inspect {{ $labels.container_name }} | head -80'"
+          runbook: "自動階段只做診斷；若是已知服務，交給 service-specific playbook：ClickHouse 降 merge/查 backlog、scheduler 限 concurrency、litellm/blackbox 查 liveliness。"
+
+      - alert: DockerContainerCpuRunawayCritical
+        expr: docker_container_cpu_cores > 4
+        for: 15m
+        labels:
+          severity: critical
+          layer: docker
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "true"
+        annotations:
+          summary: "容器 {{ $labels.container_name }} CPU 持續超過 4 core"
+          description: "{{ $labels.container_name }} 已持續吃超過 4 core，會拖垮 110/188 主機；需要 AI 判斷是否限流、暫停 worker 或套用既有 playbook。"
+          auto_repair_action: "ssh {{ $labels.instance }} 'echo \"=== CONTAINER ===\"; docker stats --no-stream {{ $labels.container_name }}; echo \"=== TOP ===\"; docker top {{ $labels.container_name }} -eo pid,ppid,stat,pcpu,pmem,comm,args | head -30'"
+          runbook: "禁止通用 docker restart；先抓根因，只有 health down 或 crash loop 才可走重啟。"
+
+      - alert: DockerContainerMemoryLimitPressure
+        # 2026-05-05 ogt + Codex: 防止亂設 memory limit 把 Kafka/Gitea/Taskworker 類服務卡死。
+        expr: docker_container_memory_limit_bytes > 0 and docker_container_memory_usage_bytes / docker_container_memory_limit_bytes > 0.85
+        for: 10m
+        labels:
+          severity: warning
+          layer: docker
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "true"
+        annotations:
+          summary: "容器 {{ $labels.container_name }} 記憶體超過 limit 85%"
+          description: "{{ $labels.container_name }} memory / limit > 85% 持續 10 分鐘。若是 Kafka/ClickHouse/Gitea/Taskworker，需先判斷 workload，不可直接降 limit。"
+          auto_repair_action: "ssh {{ $labels.instance }} 'docker stats --no-stream {{ $labels.container_name }}; docker inspect {{ $labels.container_name }} | head -80'"
+          runbook: "若服務已接近 limit：優先調整 retention/concurrency/cache，再評估提高 memory；禁止用更低 memory limit 當止血。"
+
+      - alert: DockerContainerRestartSpike
+        # 2026-05-05 ogt + Codex: cAdvisor v0.47 無 restart metric，吃 node-exporter textfile docker_container_restart_count。
+        expr: increase(docker_container_restart_count[15m]) > 5 or increase(docker_container_inspect_restart_count[15m]) > 5
+        for: 3m
+        labels:
+          severity: critical
+          layer: docker
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "true"
+        annotations:
+          summary: "容器 {{ $labels.container_name }} 15 分鐘重啟超過 5 次"
+          description: "Docker restart textfile exporter 顯示 {{ $labels.container_name }} 重啟暴增，避免再次出現 litellm 24,464 次靜默崩潰。"
+          auto_repair_action: "ssh {{ $labels.instance }} 'docker ps -a --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}'"
+          runbook: "先抓 crash signature；若是 config/DB/網路問題，修設定，不用無限 restart。"
+
  # =========================================================================
  # MinIO / Kali 告警
  # =========================================================================
--- a/ops/monitoring/alerts.yml
+++ b/ops/monitoring/alerts.yml
@@ -51,6 +51,25 @@ groups:
          auto_repair_action: "ssh {{ $labels.instance }} 'ps aux --sort=-%cpu | head -20' (host CPU 診斷；禁 kubectl restart awoooi-* — 主因常為第三方服務 Sentry/ClickHouse/Snuba)"
          runbook: "host CPU 高負載排查：先 SSH ps aux 看 top 進程；若為第三方服務（Sentry/ClickHouse 等）寫 ADR 升級資源或調 limit，禁止 kubectl restart 跨 domain"

+      - alert: HostLoadAverageSustainedHigh
+        # 2026-05-05 ogt + Codex: sustained-overload baseline for hosts 110/188. CPU% only measures busy time and under-reports the runnable queue built up by ClickHouse merges, Kafka catch-up, and Chrome/Ollama.
+        # group_left keeps node_load5's own labels (instance, job); a plain one-to-one on(host) match keeps only `host`, which left $labels.instance empty in auto_repair_action.
+        expr: node_load5{host=~"110|188"} / on(host) group_left count by(host) (count by(host,cpu) (node_cpu_seconds_total{mode="idle",host=~"110|188"})) > 1.5
+        for: 15m
+        labels:
+          severity: critical
+          layer: systemd-188
+          team: ops
+          auto_repair: "true"
+          mcp_provider: "ssh_host"
+          host_type: "bare_metal"
+          alert_category: "host_resource"
+        annotations:
+          summary: "主機 {{ $labels.host }} load5/core 長時間過高"
+          description: "load5 / CPU core > 1.5 持續 15 分鐘；這通常代表 runnable queue 已長期塞車，不是短暫尖峰。"
+          auto_repair_action: "ssh {{ $labels.instance }} 'echo \"=== LOAD ===\"; uptime; echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -25; echo \"=== DOCKER STATS ===\"; docker stats --no-stream | head -40'"
+          runbook: "先判斷高 load 來源：ClickHouse merge / Kafka backlog / Chrome scraper / Ollama runner / exporter；只允許 read-only 診斷，自動修復需走服務專屬 playbook。"
+
      - alert: HostOutOfMemory
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 5m
@@ -582,6 +601,74 @@ groups:
          summary: "容器 {{ $labels.container }} 已停止"
          description: "主機 {{ $labels.host }} 容器 {{ $labels.container }} 已 exited/dead，持續 1 分鐘"

+      - alert: DockerContainerCpuSustainedHigh
+        # 2026-05-05 ogt + Codex: long-running overload baseline for Docker Compose services.
+        # Baseline: a single container above 2 cores for 10m is a warning; intended to catch
+        # cadvisor, ClickHouse, momo-scheduler and Ollama-runner style problems early.
+        # NOTE(review): the description below says "5m CPU", but this expression compares the
+        # raw docker_container_cpu_cores gauge with no rate/avg window — confirm the wording.
+        expr: docker_container_cpu_cores > 2
+        for: 10m
+        labels:
+          severity: warning
+          layer: docker
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "true"
+        annotations:
+          summary: "容器 {{ $labels.container_name }} CPU 持續超過 2 core"
+          description: "{{ $labels.container_name }} 5m CPU 使用 >2 core 且持續 10 分鐘，需檢查是否缺 cpus limit、merge backlog、爬蟲尖峰或模型推理。"
+          auto_repair_action: "ssh {{ $labels.instance }} 'docker stats --no-stream {{ $labels.container_name }}; docker inspect {{ $labels.container_name }} | head -80'"
+          runbook: "自動階段只做診斷；若是已知服務，交給 service-specific playbook：ClickHouse 降 merge/查 backlog、scheduler 限 concurrency、litellm/blackbox 查 liveliness。"
+
+      - alert: DockerContainerCpuRunawayCritical
+        # Critical tier of the per-container CPU rule: above 4 cores sustained for 15 minutes.
+        # The repair action is diagnostic only (docker stats / docker top); the runbook forbids
+        # a generic docker restart.
+        # NOTE(review): a container above 4 cores also satisfies the `> 2` warning rule, so both
+        # alerts can fire together — presumably deduplicated downstream; confirm.
+        expr: docker_container_cpu_cores > 4
+        for: 15m
+        labels:
+          severity: critical
+          layer: docker
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "true"
+        annotations:
+          summary: "容器 {{ $labels.container_name }} CPU 持續超過 4 core"
+          description: "{{ $labels.container_name }} 已持續吃超過 4 core，會拖垮 110/188 主機；需要 AI 判斷是否限流、暫停 worker 或套用既有 playbook。"
+          auto_repair_action: "ssh {{ $labels.instance }} 'echo \"=== CONTAINER ===\"; docker stats --no-stream {{ $labels.container_name }}; echo \"=== TOP ===\"; docker top {{ $labels.container_name }} -eo pid,ppid,stat,pcpu,pmem,comm,args | head -30'"
+          runbook: "禁止通用 docker restart；先抓根因，只有 health down 或 crash loop 才可走重啟。"
+
+      - alert: DockerContainerMemoryLimitPressure
+        # 2026-05-05 ogt + Codex: guards against carelessly chosen memory limits choking
+        # Kafka/Gitea/Taskworker style services.
+        # The ratio is the left operand of `and` so the alert value is usage/limit (what an
+        # operator wants to see), not the limit in bytes. The `limit > 0` filter skips
+        # containers without a memory limit, where the division would not be meaningful.
+        expr: (docker_container_memory_usage_bytes / docker_container_memory_limit_bytes > 0.85) and (docker_container_memory_limit_bytes > 0)
+        for: 10m
+        labels:
+          severity: warning
+          layer: docker
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "true"
+        annotations:
+          summary: "容器 {{ $labels.container_name }} 記憶體超過 limit 85%"
+          description: "{{ $labels.container_name }} memory / limit > 85% 持續 10 分鐘。若是 Kafka/ClickHouse/Gitea/Taskworker，需先判斷 workload，不可直接降 limit。"
+          auto_repair_action: "ssh {{ $labels.instance }} 'docker stats --no-stream {{ $labels.container_name }}; docker inspect {{ $labels.container_name }} | head -80'"
+          runbook: "若服務已接近 limit：優先調整 retention/concurrency/cache，再評估提高 memory；禁止用更低 memory limit 當止血。"
+
+      - alert: DockerContainerRestartSpike
+        # 2026-05-05 ogt + Codex: cAdvisor v0.47 has no restart metric, so this rule reads
+        # restart counts published through the node-exporter textfile collector.
+        # Either metric name can trigger the alert (joined with `or`).
+        # NOTE(review): scripts/ops/docker-stats-textfile-exporter.py in this commit writes
+        # docker_container_inspect_restart_count; confirm what produces
+        # docker_container_restart_count.
+        expr: increase(docker_container_restart_count[15m]) > 5 or increase(docker_container_inspect_restart_count[15m]) > 5
+        for: 3m
+        labels:
+          severity: critical
+          layer: docker
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "true"
+        annotations:
+          summary: "容器 {{ $labels.container_name }} 15 分鐘重啟超過 5 次"
+          description: "Docker restart textfile exporter 顯示 {{ $labels.container_name }} 重啟暴增，避免再次出現 litellm 24,464 次靜默崩潰。"
+          auto_repair_action: "ssh {{ $labels.instance }} 'docker ps -a --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}'"
+          runbook: "先抓 crash signature；若是 config/DB/網路問題，修設定，不用無限 restart。"
+
  # =========================================================================
  # MinIO / Kali 告警
  # =========================================================================
--- a/scripts/ops/docker-stats-textfile-exporter.py
+++ b/scripts/ops/docker-stats-textfile-exporter.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+"""
+Docker stats textfile exporter for host-level AIOps baselines.
+
+2026-05-05 ogt + Codex: 110/188 CPU overload follow-up.
+Why: cAdvisor v0.47 may not expose per-container restart count and the live
+110 scrape currently only exposes the root cgroup. This exporter writes a small
+node-exporter textfile so Prometheus can alert on Docker Compose CPU/restarts.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import re
+import subprocess
+import tempfile
+from pathlib import Path
+
+
+# Directory the metrics file is written to; override with NODE_EXPORTER_TEXTFILE_DIR.
+TEXTFILE_DIR = Path(os.environ.get("NODE_EXPORTER_TEXTFILE_DIR", "/var/lib/node_exporter/textfile_collector"))
+# File name of the metrics file created inside TEXTFILE_DIR.
+OUTPUT_NAME = "docker_stats.prom"
+# Value of the `host` label on every sample; defaults to the kernel node name.
+HOST_LABEL = os.environ.get("AIOPS_HOST_LABEL", os.uname().nodename)
+# Matches the characters escaped in label values: double quote, backslash, newline.
+LABEL_RE = re.compile(r'["\\\n]')
+
+
+def _escape_label(value: str) -> str:
+    """Return *value* with double quotes, backslashes and newlines backslash-escaped.
+
+    The result is safe to place between the double quotes of a label value.
+    """
+    replacements = {'"': r"\"", "\\": r"\\", "\n": r"\n"}
+
+    def _swap(found: re.Match) -> str:
+        # LABEL_RE only matches single characters that are keys of `replacements`.
+        return replacements[found.group(0)]
+
+    return LABEL_RE.sub(_swap, value)
+
+
+def _run_json_lines(command: list[str]) -> list[dict]:
+    """Run *command* and decode every non-blank stdout line as a JSON document.
+
+    Raises:
+        subprocess.CalledProcessError: the command exited non-zero.
+        subprocess.TimeoutExpired: the command ran longer than 30 seconds.
+        json.JSONDecodeError: a non-blank line is not valid JSON.
+    """
+    completed = subprocess.run(command, check=True, capture_output=True, text=True, timeout=30)
+    return [json.loads(raw) for raw in completed.stdout.splitlines() if raw.strip()]
+
+
+def _run_text_lines(command: list[str]) -> list[str]:
+    """Run *command* and return its stdout lines, whitespace-stripped, without blank lines.
+
+    Raises:
+        subprocess.CalledProcessError: the command exited non-zero.
+        subprocess.TimeoutExpired: the command ran longer than 30 seconds.
+    """
+    completed = subprocess.run(command, check=True, capture_output=True, text=True, timeout=30)
+    stripped = (raw.strip() for raw in completed.stdout.splitlines())
+    return [text for text in stripped if text]
+
+
+def _cpu_cores(cpu_perc: str) -> float:
+    """Convert a ``docker stats`` CPU percentage such as ``"153.20%"`` into cores.
+
+    100% equals one core, so ``"250%"`` becomes ``2.5``.
+
+    Placeholder or malformed input returns ``0.0`` instead of raising: docker prints
+    ``"--"`` when it has no sample for a container, and one such row must not abort
+    the export for every other container.
+    """
+    text = (cpu_perc or "").strip().rstrip("%")
+    try:
+        return float(text) / 100.0
+    except ValueError:
+        return 0.0
+
+
+def _memory_bytes(value: str) -> float:
+    """Convert a human-readable size such as ``"1.5GiB"`` or ``"10kB"`` into bytes.
+
+    Binary (KiB, MiB, GiB, TiB, PiB) and decimal (kB/KB, MB, GB, TB, PB) suffixes are
+    recognised case-insensitively, so Docker's lower-case ``kB`` is scaled by 1000
+    instead of silently being treated as plain bytes.
+
+    Returns ``0.0`` for input that is not ``<number><unit>`` (for example the ``"--"``
+    placeholder or a malformed number like ``"1.2.3MiB"``). A well-formed number with
+    an unrecognised unit is returned unscaled, as before.
+    """
+    raw = (value or "").strip()
+    # Accept "12", "12.", "12.5" and ".5" but reject multi-dot strings that float() rejects.
+    match = re.fullmatch(r"([0-9]+(?:\.[0-9]*)?|\.[0-9]+)\s*([A-Za-z]+)", raw)
+    if not match:
+        return 0.0
+    number, unit = match.groups()
+    scale = {
+        "b": 1,
+        "kib": 1024,
+        "mib": 1024**2,
+        "gib": 1024**3,
+        "tib": 1024**4,
+        "pib": 1024**5,
+        "kb": 1000,
+        "mb": 1000**2,
+        "gb": 1000**3,
+        "tb": 1000**4,
+        "pb": 1000**5,
+    }.get(unit.lower(), 1)
+    return float(number) * scale
+
+
+def _inspect_containers(names: list[str]) -> dict[str, dict]:
+    """Return ``docker inspect`` records keyed by container name (leading "/" removed).
+
+    Raises:
+        subprocess.CalledProcessError: docker failed and printed no usable output.
+        json.JSONDecodeError: docker exited 0 but stdout was not valid JSON.
+    """
+    if not names:
+        return {}
+    result = subprocess.run(
+        ["docker", "inspect", *names],
+        check=False,
+        capture_output=True,
+        text=True,
+        timeout=30,
+    )
+    # A container removed between `docker ps` and `docker inspect` makes the command exit
+    # non-zero. Use whatever records were still printed rather than dropping the whole
+    # scrape; only a run with no parseable output is treated as a failure.
+    if not result.stdout.strip():
+        result.check_returncode()
+        return {}
+    try:
+        records = json.loads(result.stdout)
+    except json.JSONDecodeError:
+        result.check_returncode()
+        raise
+    return {
+        record.get("Name", "").lstrip("/"): record
+        for record in records
+        if isinstance(record, dict)
+    }
+
+
+def _cpu_limit_cores(host_config: dict) -> float:
+    """Return the configured CPU limit in cores, or 0.0 when no limit is set.
+
+    Prefers ``NanoCpus``; falls back to ``CpuQuota / CpuPeriod`` (period defaults to
+    100000 microseconds) for containers limited through quota/period instead.
+    """
+    nano_cpus = float(host_config.get("NanoCpus") or 0)
+    if nano_cpus > 0:
+        return nano_cpus / 1_000_000_000
+    quota = float(host_config.get("CpuQuota") or 0)
+    if quota > 0:
+        period = float(host_config.get("CpuPeriod") or 0) or 100_000.0
+        return quota / period
+    return 0.0
+
+
+def collect() -> str:
+    """Build the textfile payload describing every Docker container on this host.
+
+    Usage and limit metrics are emitted for containers reported by ``docker stats``.
+    Restart count and the info metric are emitted for every container returned by
+    ``docker ps -a``, so exited or restarting containers keep a continuous restart
+    series. Samples are grouped per metric directly under that metric's HELP/TYPE lines.
+
+    Returns:
+        The complete metrics text, terminated by a newline.
+
+    Raises:
+        subprocess.CalledProcessError / subprocess.TimeoutExpired: a docker command failed.
+        json.JSONDecodeError: docker produced output that is not valid JSON.
+    """
+    stats = _run_json_lines([
+        "docker",
+        "stats",
+        "--no-stream",
+        "--format",
+        "{{json .}}",
+    ])
+    names = _run_text_lines(["docker", "ps", "-a", "--format", "{{.Names}}"])
+    inspect_by_name = _inspect_containers(names)
+    stats_by_name = {
+        row["Name"]: row for row in stats if isinstance(row, dict) and row.get("Name")
+    }
+
+    families = (
+        ("docker_container_cpu_cores", "Current Docker container CPU usage in cores from docker stats."),
+        ("docker_container_cpu_limit_cores", "Docker container CPU quota in cores, 0 when unlimited."),
+        ("docker_container_memory_usage_bytes", "Current Docker container memory usage in bytes from docker stats."),
+        ("docker_container_memory_limit_bytes", "Docker container memory limit in bytes, 0 when unlimited."),
+        ("docker_container_pids", "Current Docker container process/thread count from docker stats."),
+        ("docker_container_inspect_restart_count", "Docker container restart count from Docker inspect."),
+        ("docker_container_info", "Docker container inventory exposed by the textfile exporter."),
+    )
+    samples: dict[str, list[str]] = {metric: [] for metric, _ in families}
+
+    # Containers with stats first (original order), then those known only through inspect.
+    ordered_names = [
+        *stats_by_name,
+        *(name for name in inspect_by_name if name and name not in stats_by_name),
+    ]
+    for name in ordered_names:
+        row = stats_by_name.get(name)
+        inspected = inspect_by_name.get(name) or {}
+        host_config = inspected.get("HostConfig") or {}
+        state = inspected.get("State") or {}
+        labels = f'host="{_escape_label(HOST_LABEL)}",container_name="{_escape_label(name)}"'
+
+        if row is not None:
+            # docker prints "--" placeholders when it has no sample; report 0 instead of
+            # letting one unparsable row abort the export for every container.
+            try:
+                cpu_cores = _cpu_cores(row.get("CPUPerc") or "0%")
+            except ValueError:
+                cpu_cores = 0.0
+            try:
+                pids = int(float(row.get("PIDs") or "0"))
+            except ValueError:
+                pids = 0
+            mem_current = (row.get("MemUsage") or "0 B / 0 B").split("/", 1)[0].strip()
+            memory_limit = float(host_config.get("Memory") or 0)
+            samples["docker_container_cpu_cores"].append(
+                f"docker_container_cpu_cores{{{labels}}} {cpu_cores:.6f}"
+            )
+            samples["docker_container_cpu_limit_cores"].append(
+                f"docker_container_cpu_limit_cores{{{labels}}} {_cpu_limit_cores(host_config):.6f}"
+            )
+            samples["docker_container_memory_usage_bytes"].append(
+                f"docker_container_memory_usage_bytes{{{labels}}} {_memory_bytes(mem_current):.0f}"
+            )
+            samples["docker_container_memory_limit_bytes"].append(
+                f"docker_container_memory_limit_bytes{{{labels}}} {memory_limit:.0f}"
+            )
+            samples["docker_container_pids"].append(
+                f"docker_container_pids{{{labels}}} {pids}"
+            )
+
+        restart_count = int(inspected.get("RestartCount") or 0)
+        status = _escape_label(str(state.get("Status") or ""))
+        samples["docker_container_inspect_restart_count"].append(
+            f"docker_container_inspect_restart_count{{{labels}}} {restart_count}"
+        )
+        samples["docker_container_info"].append(
+            f'docker_container_info{{{labels},status="{status}"}} 1'
+        )
+
+    lines: list[str] = []
+    for metric, help_text in families:
+        lines.append(f"# HELP {metric} {help_text}")
+        lines.append(f"# TYPE {metric} gauge")
+        lines.extend(samples[metric])
+    return "\n".join(lines) + "\n"
+
+
+def main() -> None:
+    """Collect the metrics and atomically publish them as TEXTFILE_DIR / OUTPUT_NAME.
+
+    The payload is written to a temporary file in the same directory and renamed over
+    the target, so a reader never sees a half-written file.
+
+    Raises:
+        Whatever ``collect()`` or the filesystem calls raise; the temporary file is
+        removed before the error propagates.
+    """
+    TEXTFILE_DIR.mkdir(parents=True, exist_ok=True)
+    payload = collect()
+    output_path = TEXTFILE_DIR / OUTPUT_NAME
+    tmp_path = None
+    try:
+        with tempfile.NamedTemporaryFile("w", dir=TEXTFILE_DIR, delete=False, encoding="utf-8") as tmp:
+            tmp_path = Path(tmp.name)
+            tmp.write(payload)
+        # Temp files are created owner-only; widen the mode BEFORE the rename so the
+        # published file is never momentarily unreadable to the metrics scraper.
+        tmp_path.chmod(0o644)
+        tmp_path.replace(output_path)
+    finally:
+        # After a successful rename the temp name no longer exists and this is a no-op;
+        # on failure it stops stray temp files piling up in the collector directory.
+        if tmp_path is not None:
+            tmp_path.unlink(missing_ok=True)
+
+
+if __name__ == "__main__":
+    main()