fix(ops): add docker host resource baseline guardrails
Some checks failed
CD Pipeline / tests (push) Failing after 1m50s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Successful in 25s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 38s
Some checks failed
CD Pipeline / tests (push) Failing after 1m50s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Successful in 25s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 38s
This commit is contained in:
@@ -163,6 +163,37 @@ rules:
|
||||
responsibility: INFRA
|
||||
reasoning: "[規則匹配] 主機層資源告警,自動 SSH 執行診斷指令(只讀,不修改),收集根因資訊後推送 Telegram 讓 SRE 決策。"
|
||||
|
||||
# 2026-05-05 ogt + Codex: 110/188 長時間過載事故後補 Docker Compose 過載與 restart spike 路由。
|
||||
# 原則:過載與重啟暴增只能先診斷,禁止通用 docker restart;由 LLM + Playbook trust 決定 service-specific 修復。
|
||||
- id: docker_baseline_overload_alert
|
||||
priority: 44
|
||||
description: Docker Compose 服務過載 / restart spike 基線告警(cadvisor + textfile exporter)
|
||||
match:
|
||||
alertname:
|
||||
- HostLoadAverageSustainedHigh
|
||||
- DockerContainerCpuSustainedHigh
|
||||
- DockerContainerCpuRunawayCritical
|
||||
- DockerContainerMemoryLimitPressure
|
||||
- DockerContainerRestartSpike
|
||||
response:
|
||||
action_title: "🔍 Docker/Host 過載自動診斷 — 禁止通用重啟"
|
||||
description: "110/188 Docker Compose 或主機 load 長時間偏離 baseline。AI 需先收集容器 CPU、restart、logs、ClickHouse/Kafka/爬蟲狀態,再選擇限流、降併發或服務專屬 playbook。"
|
||||
suggested_action: SSH_DIAGNOSE
|
||||
kubectl_command: "ssh {host} 'echo \"=== LOAD ===\"; uptime; echo \"=== TOP ===\"; ps aux --sort=-%cpu | head -20; echo \"=== DOCKER ===\"; docker stats --no-stream | head -40'"
|
||||
estimated_downtime: "N/A"
|
||||
risk: low
|
||||
responsibility: INFRA
|
||||
responsibility_reasoning: "Docker Compose / bare-metal 過載屬主機與平台資源治理,不能交給 K8s restart 處理"
|
||||
secondary_teams: [BE, SRE]
|
||||
optimization:
|
||||
- type: BASELINE_CHECK
|
||||
description: "比較 load5/core、單容器 CPU core、restart spike 與 24h 動態基線"
|
||||
command: "Prometheus query: node_load5/core + rate(container_cpu_usage_seconds_total[5m]) + increase(docker_container_restart_count[15m])"
|
||||
- type: SERVICE_SPECIFIC_REPAIR
|
||||
description: "依服務選擇專屬修復:ClickHouse 降 merge / scheduler 限 concurrency / litellm 修 health 或路由 / exporter 降 collector"
|
||||
command: "由 AI 根據 evidence snapshot 選擇已驗證 playbook"
|
||||
reasoning: "[規則匹配] 長期過載先 read-only 診斷與分流,禁止通用 docker restart;修復必須服務專屬且可回寫 Playbook trust。"
|
||||
|
||||
- id: high_cpu
|
||||
priority: 40
|
||||
description: K8s Pod/Deployment CPU 使用率過高
|
||||
|
||||
@@ -68,20 +68,66 @@ MONITORED_METRICS: list[dict[str, Any]] = [
|
||||
# cadvisor up=0(prod-docker-188 離線),改用 node-exporter node-level CPU
|
||||
# 實測確認:avg(rate(node_cpu_seconds_total{mode!="idle"}[5m])) → 有資料
|
||||
# threshold 0.85 = 85% CPU 使用率(node level,0-1 比例)
|
||||
"promql": 'avg(rate(node_cpu_seconds_total{mode!="idle"}[5m]))',
|
||||
"promql": 'avg(rate(node_cpu_seconds_total{host="188",mode!="idle"}[5m]))',
|
||||
"threshold": 0.85, # > 85% node CPU(所有 core 平均)
|
||||
"description": "Node 188 CPU 使用率(node-exporter,cadvisor 停止時替代)",
|
||||
},
|
||||
{
|
||||
# 2026-05-05 ogt + Codex: 110/188 CPU 過載事故後補齊 110 動態基線。
|
||||
# Why: 事故主熱點在 110 Sentry ClickHouse/Snuba/Kafka,舊 inspector 只看未過濾的全域 CPU。
|
||||
"name": "cpu_usage_node_110",
|
||||
"promql": 'avg(rate(node_cpu_seconds_total{host="110",mode!="idle"}[5m]))',
|
||||
"threshold": 0.85,
|
||||
"description": "Node 110 CPU 使用率(node-exporter)",
|
||||
},
|
||||
{
|
||||
# 2026-04-26 H2 hotfix: 改名 _node_188
|
||||
"name": "memory_usage_node_188",
|
||||
# cadvisor 停止,改用 node-exporter 節點記憶體使用率比例(0-1)
|
||||
# 實測確認:188 機器 62.76 GiB,當前 ~30% 使用率
|
||||
# threshold 0.85 = 85% node memory usage
|
||||
"promql": '(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes',
|
||||
"promql": '(node_memory_MemTotal_bytes{host="188"} - node_memory_MemAvailable_bytes{host="188"}) / node_memory_MemTotal_bytes{host="188"}',
|
||||
"threshold": 0.85, # > 85% node memory(0-1 比例)
|
||||
"description": "Node 188 記憶體使用率(node-exporter,cadvisor 停止時替代)",
|
||||
},
|
||||
{
|
||||
# 2026-05-05 ogt + Codex: 110/188 CPU 過載事故後補齊 110 記憶體基線。
|
||||
"name": "memory_usage_node_110",
|
||||
"promql": '(node_memory_MemTotal_bytes{host="110"} - node_memory_MemAvailable_bytes{host="110"}) / node_memory_MemTotal_bytes{host="110"}',
|
||||
"threshold": 0.85,
|
||||
"description": "Node 110 記憶體使用率(node-exporter)",
|
||||
},
|
||||
{
|
||||
# 2026-05-05 ogt + Codex: 用 load/core 比例抓長時間過載,避免 CPU% 被 runnable queue 稀釋。
|
||||
"name": "load5_per_core_110",
|
||||
"promql": 'node_load5{host="110"} / on(host) count by(host) (count by(host,cpu) (node_cpu_seconds_total{host="110",mode="idle"}))',
|
||||
"threshold": 1.50,
|
||||
"description": "Node 110 load5 / CPU core 比例",
|
||||
},
|
||||
{
|
||||
"name": "load5_per_core_188",
|
||||
"promql": 'node_load5{host="188"} / on(host) count by(host) (count by(host,cpu) (node_cpu_seconds_total{host="188",mode="idle"}))',
|
||||
"threshold": 1.50,
|
||||
"description": "Node 188 load5 / CPU core 比例",
|
||||
},
|
||||
{
|
||||
# 2026-05-05 ogt + Codex: 補 Docker Compose 服務層基線,抓單容器長期吃超過 2 core。
|
||||
"name": "docker_max_container_cpu_cores",
|
||||
"promql": 'max(docker_container_cpu_cores) or max(rate(container_cpu_usage_seconds_total{name!="",id!="/"}[5m])) or on() vector(0)',
|
||||
"threshold": 2.0,
|
||||
"description": "Docker 單容器 CPU core 使用量上限",
|
||||
},
|
||||
{
|
||||
# 2026-05-05 ogt + Codex: cAdvisor v0.47 無 container restart metric,改吃 node-exporter textfile。
|
||||
"name": "docker_restart_spike_15m",
|
||||
"promql": (
|
||||
'sum(increase(docker_container_restart_count[15m])) '
|
||||
'or sum(increase(docker_container_inspect_restart_count[15m])) '
|
||||
'or on() vector(0)'
|
||||
),
|
||||
"threshold": 5.0,
|
||||
"description": "Docker 容器 15 分鐘重啟增量",
|
||||
},
|
||||
{
|
||||
"name": "pod_restart_rate",
|
||||
# kube-state-metrics: namespace=awoooi-prod,sum 聚合避免 multi-vector
|
||||
@@ -222,8 +268,8 @@ class ProactiveInspector:
|
||||
|
||||
async def _inspect_dynamic_baseline(self, report: InspectionReport) -> None:
|
||||
"""從 Prometheus 抓取當前值,與 Holt-Winters 基線比對。"""
|
||||
from src.services.dynamic_baseline_service import get_dynamic_baseline_service
|
||||
from src.core.feature_flags import aiops_flags
|
||||
from src.services.dynamic_baseline_service import get_dynamic_baseline_service
|
||||
|
||||
if not aiops_flags.AIOPS_P4_DYNAMIC_BASELINE:
|
||||
return
|
||||
@@ -273,8 +319,8 @@ class ProactiveInspector:
|
||||
|
||||
async def _inspect_log_patterns(self, report: InspectionReport) -> None:
|
||||
"""掃描 K8s Pod 日誌,偵測新 log pattern。"""
|
||||
from src.services.log_anomaly_detector import get_log_anomaly_detector
|
||||
from src.core.feature_flags import aiops_flags
|
||||
from src.services.log_anomaly_detector import get_log_anomaly_detector
|
||||
|
||||
if not aiops_flags.AIOPS_P4_LOG_ANOMALY:
|
||||
return
|
||||
@@ -313,8 +359,8 @@ class ProactiveInspector:
|
||||
|
||||
async def _inspect_trends(self, report: InspectionReport) -> None:
|
||||
"""對各 metric 做 4h 趨勢外推。"""
|
||||
from src.services.trend_predictor import get_trend_predictor
|
||||
from src.core.feature_flags import aiops_flags
|
||||
from src.services.trend_predictor import get_trend_predictor
|
||||
|
||||
if not aiops_flags.AIOPS_P4_TREND_PREDICTOR:
|
||||
return
|
||||
@@ -365,8 +411,8 @@ class ProactiveInspector:
|
||||
|
||||
async def _retrain_baselines_background(self) -> None:
|
||||
"""背景重訓所有 Holt-Winters 基線(不阻塞巡檢)。"""
|
||||
from src.services.dynamic_baseline_service import get_dynamic_baseline_service
|
||||
from src.core.feature_flags import aiops_flags
|
||||
from src.services.dynamic_baseline_service import get_dynamic_baseline_service
|
||||
|
||||
if not aiops_flags.AIOPS_P4_DYNAMIC_BASELINE:
|
||||
return
|
||||
@@ -392,6 +438,7 @@ class ProactiveInspector:
|
||||
async def _fetch_current_value(self, promql: str) -> float | None:
|
||||
"""從 Prometheus 抓取當前值(instant query)。"""
|
||||
import httpx
|
||||
|
||||
from src.core.config import settings
|
||||
|
||||
try:
|
||||
|
||||
110
docs/LOGBOOK.md
110
docs/LOGBOOK.md
@@ -6,6 +6,114 @@
|
||||
|
||||
---
|
||||
|
||||
## 2026-05-05 | 110/188 CPU/Mem 配額全景盤點 + Docker baseline 監控落地
|
||||
|
||||
**背景**:統帥擔心 Claude Code 對 110/188 服務 CPU/memory limit 亂配置,造成服務卡死或慢性過載;本輪接續盤點 live Docker inspect / docker stats / compose 宣告。
|
||||
|
||||
**現場結論**:
|
||||
- 110 仍高負載,不是單純等待回補即可:load 約 `23.84 / 27.11 / 34.67`;Sentry ClickHouse 4 CPU / 8GiB 貼著 CPU 上限跑,Kafka 3GiB 使用率約 84%,taskbroker 1 CPU 接近滿載,taskscheduler 512MiB 約 75%。
|
||||
- 110 Kafka lag 近乎清空,ClickHouse 仍在重 merge,node-exporter 自己曾因 `arp` / `netclass` / `netdev` collector 單次 scrape 花 17s+ 而自傷。
|
||||
- 188 已回穩但仍需節流治理:momo-scheduler 2 CPU / 2GiB 是安全欄不是根治;SignOz ClickHouse 4 CPU / 24GiB 目前合理。
|
||||
- 188 momo-scheduler 日誌顯示三張 schema 缺表(`ai_calls` / `learning_episodes` / `host_health_probes`)與 Elephant Alpha/OpenClaw action drift,這是背景任務反覆失敗,不是 CPU/memory limit 問題。
|
||||
- 110 node-exporter textfile path live drift:原指向 `/home/ollama/node_exporter_textfiles`,110 上不存在,造成 Docker Compose 指標半盲。
|
||||
|
||||
**本次落地**:
|
||||
- 新增 `scripts/ops/docker-stats-textfile-exporter.py`,輸出 Docker container CPU cores / CPU limit / memory usage / memory limit / restart count / info。
|
||||
- 110:部署 exporter 到 `/home/wooo/scripts/`,新增 cron,每分鐘寫 `/home/wooo/node_exporter_textfiles/docker_stats.prom`;修正 `/home/wooo/monitoring/docker-compose.yml` 的 node-exporter textfile path,並只重建 node-exporter。
|
||||
- 110:關閉 node-exporter 高成本 collector:`arp`、`netclass`、`netdev`;scrape duration 從約 17s+ 降到 CPU/mem/load/textfile 等核心 collector 都 < 0.1s,node-exporter CPU 從約 80% 降到 0-5%。
|
||||
- 110:Kafka lag 已近零後,將 `/opt/sentry/.env` `SENTRY_TASKWORKER_CONCURRENCY` 從 4 降到 2,只重建 taskworker(snuba-api 因 compose dependency 被重建一次),taskworker command 已確認 `--concurrency=2`。
|
||||
- 188:部署 exporter 到 `/home/ollama/scripts/`,新增 cron,每分鐘寫 `/home/ollama/node_exporter_textfiles/docker_stats.prom`;保留既有 `docker_restart_count.prom`。
|
||||
- 188:套用既有 additive migrations `024_create_ai_calls_table.sql`、`028_create_learning_episodes.sql`、`029_create_host_health_probes.sql`,補齊 scheduler 正在寫入的 schema,未重啟服務。
|
||||
- `ops/monitoring/alerts*.yml`:新增 `HostLoadAverageSustainedHigh`、`DockerContainerCpuSustainedHigh`、`DockerContainerCpuRunawayCritical`、`DockerContainerMemoryLimitPressure`、`DockerContainerRestartSpike`。
|
||||
- `apps/api/alert_rules.yaml`:新增 Docker/Host 過載路由,強制走 `SSH_DIAGNOSE`,禁止通用 docker restart。
|
||||
- API GitOps:用最新 `main` (`a57e3d3d`) 加本次兩個 API 修補檔,在 188 建置並推送 `192.168.0.110:5000/awoooi/api:resource-baseline-20260505-a57e3d3`;`k8s/awoooi-prod/kustomization.yaml` 指向此 tag,避免手動 `kubectl set image` 被 Argo 回滾。
|
||||
- `docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md`:記錄 live 配額盤點、baseline policy、反模式與下一步 rollout 順序。
|
||||
- Prometheus 已 reload,97 條規則載入;新 baseline rules 全部存在。
|
||||
|
||||
**驗證**:
|
||||
- `node_textfile_scrape_error`:110/188/112 全為 0。
|
||||
- Prometheus 已可查到 `docker_container_cpu_cores{host="110",container_name="sentry-self-hosted-clickhouse-1"}`、`docker_container_memory_limit_bytes{host="110",container_name="sentry-self-hosted-kafka-1"}`、`docker_container_cpu_cores{host="188",container_name="momo-scheduler"}`。
|
||||
- 110:taskworker / snuba-api / ClickHouse / Kafka healthy;Sentry Kafka `snuba-consumers` 主要 lag 0-1;load 從約 30+ 降到 `11.83 / 20.97 / 27.41`(1m 已降,15m 仍需等 merge 平滑)。
|
||||
- 188:三張 DB 表存在;migration 後只剩 `Fallback (111)` 健康警告,`UndefinedTable` 未再出現;momo-db CPU 回到約 0.6-2.5%,host load 約 `2.47 / 2.80 / 4.28`。
|
||||
- Prometheus 新 baseline alerts 查詢目前無 firing。
|
||||
- 新規則目前 pending:110 `HostLoadAverageSustainedHigh`、110 `DockerContainerCpuSustainedHigh` for Sentry ClickHouse。
|
||||
- `apps/api/.venv/bin/python -m pytest apps/api/tests/test_classify_alert_early.py apps/api/tests/test_alert_rule_engine_validation.py -q` → 89 passed。
|
||||
- `ruff check apps/api/src/services/proactive_inspector.py`、`py_compile scripts/ops/docker-stats-textfile-exporter.py`、`git diff --check` → passed。
|
||||
- `kubectl kustomize k8s/awoooi-prod` → API/worker image 均解析為 `resource-baseline-20260505-a57e3d3`。
|
||||
|
||||
**下一步**:
|
||||
- 不要再降低 ClickHouse / Kafka memory limit;先觀察 backlog drain。
|
||||
- 若 110 ClickHouse 15-30 分鐘後仍持續 >2.5 cores,下一步查 merge/query 類型;不要靠降低 memory 或泛用 restart。
|
||||
- 188 下一步修 Elephant Alpha/OpenClaw allowed-action drift,避免 AI 自動修復決策計入 circuit breaker;momo-scheduler 2 CPU / 2GiB 暫時保留。
|
||||
|
||||
## 2026-05-05 | ADR-110 / AwoooP GCP Ollama compute pool 收斂
|
||||
|
||||
**背景**:統帥批准將 GCP-A / GCP-B Ollama 納入 AwoooP 推進計畫,不只作 failover,而是作為 platform-level Ollama compute pool。
|
||||
|
||||
**2026-05-05 live 驗證結論**:
|
||||
- 生產 Deployment 實際 env:`OLLAMA_URL=110:11435`、`OLLAMA_SECONDARY_URL=110:11436`、`OLLAMA_FALLBACK_URL=192.168.0.111:11434`;ConfigMap 已是 `110:11437`,但 Deployment explicit env 尚未一致。
|
||||
- Pod 內 `110:11435` / `110:11436` 均可 `/api/tags` 成功,兩台 GCP Ollama 有實際可用。
|
||||
- `192.168.0.111:11434` 從 Pod 內 `No route to host`;`110:11437/nginx-health` 從外部可回 OK,但 `/api/tags` 回 502,表示 110 proxy block 存在但 upstream `.111` 不健康或不可達。
|
||||
- live NetworkPolicy 只允許 Pod → 110 的 `11435/11436`,未允許 `11437`;repo manifest 已補 11437,但尚未 live apply。
|
||||
- 最近告警跑到 Gemini 的主因不是 fallback order 沒設定,而是 `OllamaGcpBProvider` 只 override `_endpoint_url()`,但繼承的 `analyze()` 仍硬打 `settings.OLLAMA_URL`;log 顯示 router 選 `ollama_gcp_b`,實際錯打 `110:11435` 504,Local 又不可用,最後才落 Gemini。
|
||||
|
||||
**本次修補**:
|
||||
- `ADR-110`:從 direct GCP IP 拓撲改寫為正式 runtime 拓撲:K8s → `192.168.0.110:11435/11436/11437` → GCP-A/GCP-B/Local;direct GCP IP 僅是 upstream / 非 K8s fallback。
|
||||
- `DEPLOY-GCP-OLLAMA-PROXY.md`:補 11437 Local fallback 驗證、NetworkPolicy port、`kubectl set env` 警告與三層 proxy route。
|
||||
- `k8s/awoooi-prod/06-deployment-api.yaml`:修正宣告檔 drift,`OLLAMA_FALLBACK_URL` 與 ConfigMap 對齊為 `http://192.168.0.110:11437`。未執行 live apply。
|
||||
- 新增 `INV-10-ollama-call-sites.md`:盤點 failover-aware 路徑與仍直讀 `OLLAMA_URL` 的 production call sites,並定義 GCP-A interactive / GCP-B batch+RAG+shadow / Local privacy+DR 分工。
|
||||
- 新增 `apps/api/tests/test_ollama_call_site_inventory.py`:把現有 direct `OLLAMA_URL` legacy debt 鎖成上限;新增 direct call site 必須改走 resolver/provider registry/EffectivePolicy,且 ConfigMap / Deployment 的三層 Ollama env 必須一致。
|
||||
- 新增 `services/ollama_endpoint_resolver.py`:最小 workload-aware resolver;`embedding` / `rag` / `code_review` / `batch` / `shadow` / `canary` 優先 GCP-B,interactive 留 GCP-A,local-required 留 Local。
|
||||
- 第一批低風險 runtime slice:`embedding_service.py`、`knowledge_rag_service.py`、`playbook_rag.py`、`local_code_review_service.py` 改走 resolver,讓批次/RAG/審查路徑優先用 GCP-B;未碰 `decision_manager`、OpenClaw、Hermes、chat manager 主線。
|
||||
- `ai_providers/ollama.py`:修正 base `OllamaProvider.analyze()` / `health_check()` 使用 `_endpoint_url()`,讓 `OllamaGcpBProvider` 選中時真正打 `OLLAMA_SECONDARY_URL`,不是錯打 primary。
|
||||
- `k8s/awoooi-prod/02-network-policy.yaml`:repo source 補 Pod → 110:11437 egress;未執行 live apply。
|
||||
- `MASTER-WORKPLAN.md`、`DETAILED-IMPLEMENTATION-PLAN.md`、`INV-4`、`INV-6`、`AWOOOP-MONITORING-ALERTING-CONVERGENCE.md`:整合 INV-10 與 GCP-B active-active 策略。
|
||||
|
||||
**驗證**:
|
||||
- `apps/api/.venv/bin/python -m pytest apps/api/tests/test_ollama_call_site_inventory.py -q` → 2 passed。
|
||||
- `apps/api/.venv/bin/python -m ruff check apps/api/tests/test_ollama_call_site_inventory.py --fix` → fixed import order,rerun clean。
|
||||
- `apps/api/.venv/bin/python -m pytest apps/api/tests/test_ollama_endpoint_resolver.py apps/api/tests/test_ollama_call_site_inventory.py -q` → 6 passed。
|
||||
- `apps/api/.venv/bin/python -m ruff check apps/api/src/services/ollama_endpoint_resolver.py apps/api/src/services/embedding_service.py apps/api/src/services/knowledge_rag_service.py apps/api/src/services/local_code_review_service.py apps/api/src/services/playbook_rag.py apps/api/tests/test_ollama_endpoint_resolver.py apps/api/tests/test_ollama_call_site_inventory.py` → passed after ruff import-order fix。
|
||||
- `apps/api/.venv/bin/python -m pytest apps/api/tests/test_ollama_provider_endpoints.py apps/api/tests/test_ollama_failover_manager.py::TestThreeLayerFailover::test_gcp_a_offline_gcp_b_healthy_uses_gcp_b apps/api/tests/test_ollama_endpoint_resolver.py apps/api/tests/test_ollama_call_site_inventory.py -q` → 9 passed。
|
||||
- `apps/api/.venv/bin/python -m ruff check apps/api/src/services/ai_providers/ollama.py apps/api/tests/test_ollama_provider_endpoints.py` → passed after import-order fix。
|
||||
|
||||
**下一步**:
|
||||
- 不直接重寫 Tier 3 runtime;下一批先收斂 `apps/api/src/api/v1/rag.py` 與 `apps/api/scripts/reembed_bge_m3.py` 這兩個仍偏 batch 的 direct path。
|
||||
- 再補 provider health snapshot,讓 health/report 類路徑可同時呈現 GCP-A/GCP-B/Local,而不是只看 primary。
|
||||
- OpenClaw/Hermes/chat manager 只做 EffectivePolicy shadow compare,不直接切換。
|
||||
|
||||
---
|
||||
|
||||
## 2026-05-05 | AwoooP Claude Code 盤點修補 + convergence map 整合
|
||||
|
||||
**盤點結論**:
|
||||
- Claude Code 的 AwoooP 檔案多數確實已落地(ADR-106~124、INV-1~9、migrations、contract packages、runtime/API shell、Operator Console routes)。
|
||||
- 但有幾個「宣告完成 ≠ 線上路徑生效」缺口:MCP redaction middleware 有寫但 Gateway 回傳 Runtime/LLM 前未強制套用;Operator Console 前端讀 `items/status/name/is_suspended`,後端實際回 `tenants/contracts/runs/state/display_name/is_active`;ADR-106 本體缺 Quantified Gates 補章。
|
||||
- 沒有執行 production DB migration;`awooop_phase*.sql` 仍需依部署順序、rollback 檢查、DB expert review 後再套用。
|
||||
|
||||
**本次修補**:
|
||||
- `plugins/mcp/gateway.py`:Gateway 成功執行後先 `redact_mcp_output()` 再回傳給 Runtime/LLM;gateway audit hash 改用 redacted input/output 計算。
|
||||
- `services/mcp_audit_service.py`:legacy `mcp_audit_log` 寫入前補上 string pattern redaction,避免 DSN/token/internal IP 只因 key 名未命中而外洩。
|
||||
- `tests/test_mcp_credential_isolation.py`:新增 gateway return redaction + legacy audit redaction regression tests。
|
||||
- `ADR-106`:新增 `D9.1 Quantify Strangler Fig Promotion Gates`,正式化 shadow→canary→read_only→suggest→auto_remediate 的量化 gate。
|
||||
- `MASTER-WORKPLAN.md` + `AWOOOP-MONITORING-ALERTING-CONVERGENCE.md`:納入 monitoring/alerting convergence map,固定 mirror → read-only EffectivePolicy comparison → read-only MCP Gateway wrapper → Channel Event wrapper → low-risk LLM strangler 順序。
|
||||
- `apps/web/src/app/[locale]/awooop/*`:修正 Operator Console 前端與後端 response contract 對齊;approval decide 補 `project_id`;run list 改用 `state` filter 與 lowercase FSM state。
|
||||
|
||||
**驗證**:
|
||||
- `apps/api/.venv/bin/python -m pytest apps/api/tests/test_mcp_credential_isolation.py -q` → 12 passed。
|
||||
- `apps/api/.venv/bin/python -m ruff check apps/api/src/plugins/mcp/gateway.py apps/api/src/services/mcp_audit_service.py apps/api/tests/test_mcp_credential_isolation.py` → passed。
|
||||
- `pnpm --dir apps/web exec tsc --noEmit` → passed。
|
||||
- `pnpm --dir apps/web run build` → passed;AwoooP routes `/[locale]/awooop/*` 全部成功建置。
|
||||
- `git diff --check` → passed。
|
||||
|
||||
**仍未完成 / 不可誤判完成**:
|
||||
- production DB migration 尚未 apply。
|
||||
- `approval_records` 仍未 project-scoped;部分 legacy repository/service 仍依賴 RLS default 或無 explicit project filter。
|
||||
- direct MCP/provider call sites 尚未全面 `forbid-new`;只能視為 wrapper 過渡期。
|
||||
- `apps/web/package.json` / `pnpm-lock.yaml` 的 Next 14.2.25 bump 及 `tsconfig.tsbuildinfo` dirty state 是既有 session 變更,本次未回退。
|
||||
|
||||
---
|
||||
|
||||
## 2026-05-05 | ADR-110 三層容災補齊 + 四台主機密碼 SSH 恢復
|
||||
|
||||
**ADR-110 Local Fallback(port 11437)**:
|
||||
@@ -30,7 +138,7 @@
|
||||
```
|
||||
OLLAMA_URL = http://192.168.0.110:11435 ← GCP-A primary(via nginx proxy)
|
||||
OLLAMA_SECONDARY_URL = http://192.168.0.110:11436 ← GCP-B secondary(via nginx proxy)
|
||||
OLLAMA_FALLBACK_URL = http://192.168.0.111:11434 ← 111 兜底
|
||||
OLLAMA_FALLBACK_URL = http://192.168.0.110:11437 ← Local 111 fallback(via nginx proxy)
|
||||
```
|
||||
- 驗證:兩台 GCP 各 10 個模型,200 OK
|
||||
- 熱更新:`kubectl set env`(不動 image tag,避免 IMAGE_TAG_PLACEHOLDER 蓋掉)
|
||||
|
||||
72
docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md
Normal file
72
docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md
Normal file
@@ -0,0 +1,72 @@
|
||||
# 110/188 Host Resource Baseline
|
||||
|
||||
> 2026-05-05 ogt + Codex
|
||||
> Scope: 110 DevOps/Sentry host and 188 AI/Web host Docker Compose resource limits.
|
||||
|
||||
## Current Live Finding
|
||||
|
||||
### 110
|
||||
|
||||
| Service | Live Limit | Live Usage Snapshot | Verdict |
|
||||
|---|---:|---:|---|
|
||||
| Sentry ClickHouse | 4 CPU / 8 GiB | ~235-291% CPU / 3.3-3.4 GiB | CPU capped but still hottest. Do not lower memory; keep merge settings explicit. |
|
||||
| Sentry Kafka | 2 CPU / 3 GiB | ~40-55% CPU / 2.5 GiB (84%) | Memory is close to pressure. Do not reduce memory. |
|
||||
| Sentry taskworker | 2 CPU / 2 GiB, concurrency 2 | ~120-181% CPU after restart | Concurrency reduced from 4 to 2 after Kafka lag cleared. Watch Sentry task latency before further changes. |
|
||||
| Sentry taskbroker | 1 CPU / 512 MiB | ~70-98% CPU / 160 MiB | CPU is tight; increasing may improve backlog but can raise host load. |
|
||||
| Sentry taskscheduler | 0.5 CPU / 512 MiB | ~13% CPU / 387 MiB (76%) | Memory is tight; alert at 85% before it stalls. |
|
||||
| Gitea | 3 CPU / 3 GiB | ~4% CPU / 2.18 GiB (73%) | Good cap; memory headroom is not huge. |
|
||||
| node-exporter | 1 CPU / 256 MiB | ~0-5% CPU / 8 MiB | Good after disabling expensive `arp`, `netclass`, and `netdev` collectors. |
|
||||
| cadvisor | 1 CPU / 512 MiB | ~0% CPU / 27 MiB | Good safety cap. |
|
||||
| Harbor / Langfuse / Prometheus / Grafana / Nginx | mostly unlimited | currently low | Needs staged limits, but not during Sentry backlog drain. |
|
||||
|
||||
### 188
|
||||
|
||||
| Service | Live Limit | Live Usage Snapshot | Verdict |
|
||||
|---|---:|---:|---|
|
||||
| momo-scheduler | 2 CPU / 2 GiB | ~0.3% CPU / 163 MiB after crawler burst | CPU cap is working. Next fix is crawler concurrency and failed background jobs, not lower CPU. |
|
||||
| SignOz ClickHouse | 4 CPU / 24 GiB | ~93-133% CPU / 1.1 GiB | Healthy enough; keep current cap. |
|
||||
| SignOz Zookeeper | 1 CPU / 2 GiB | ~8-18% CPU / 1.09 GiB | OK. |
|
||||
| cadvisor | 1.5 CPU / 1 GiB | ~0% CPU / 28 MiB | Good. |
|
||||
| litellm | unlimited | ~0.6-0.9% CPU / 780 MiB | Add modest cap after observing traffic; do not re-add DATABASE_URL. |
|
||||
| momo-pro-system / momo-db | unlimited | DB had short CPU bursts, then ~0.6% with no active long query | Needs service-specific limits after scheduler/schema pressure is controlled. |
|
||||
| Monitoring tools / websites / exporters | mostly unlimited | low | Add caps gradually with textfile alerts watching pressure. |
|
||||
|
||||
## Baseline Policy
|
||||
|
||||
Use these thresholds for alerting and AI triage:
|
||||
|
||||
| Signal | Threshold | Action |
|
||||
|---|---:|---|
|
||||
| host load5 / core | > 1.5 for 15m | Critical SSH diagnosis; classify top service before repair. |
|
||||
| Docker container CPU | > 2 cores for 10m | Warning diagnosis; check limit, backlog, and workload type. |
|
||||
| Docker container CPU | > 4 cores for 15m | Critical diagnosis; never generic restart. |
|
||||
| Docker memory / limit | > 85% for 10m | Warning; raise memory or reduce workload, never lower the limit. |
|
||||
| Docker restarts | > 5 in 15m | Critical; pull logs and fix crash signature. |
|
||||
|
||||
## Rules
|
||||
|
||||
1. Do not reduce ClickHouse CPU/memory while merges or Kafka backlog are draining.
|
||||
2. Do not set ClickHouse `background_pool_size` below the three MergeTree thresholds unless all thresholds are explicitly lower than `pool * ratio`.
|
||||
3. Do not use lower memory limits as a load-shedding tool. That creates OOM/restart loops.
|
||||
4. For Chrome crawlers, cap concurrency first; CPU caps are only the safety rail.
|
||||
5. For Kafka/Snuba, treat high CPU as backlog digestion unless lag stops decreasing.
|
||||
6. For monitoring tools, caps are required, but every cap must be paired with self-monitoring.
|
||||
7. Every Docker Compose host must emit `docker_container_cpu_cores`, `docker_container_memory_*`, and a restart counter via node-exporter textfile.
|
||||
8. Disable node-exporter collectors that are slow or failing on each host; exporter scrape time is part of the resource baseline.
|
||||
|
||||
## Next Safe Rollout Order
|
||||
|
||||
1. Deploy `scripts/ops/docker-stats-textfile-exporter.py` to 110 and 188 textfile collector cron.
|
||||
2. Reload Prometheus rules with the new Docker CPU/memory/restart baseline alerts.
|
||||
3. Observe 110 for one drain window after node-exporter collector trim and taskworker concurrency 2. Kafka lag is now near zero; if ClickHouse remains high, tune merge/query behavior, not Kafka consumers.
|
||||
4. Tune `momo-scheduler` crawler concurrency on 188; keep 2 CPU / 2 GiB until success rate and latency prove it is too low.
|
||||
5. Fix 188 Elephant Alpha/OpenClaw allowed-action drift before enabling resource auto-repair beyond diagnosis.
|
||||
6. Add modest caps to currently unlimited low-risk services in small batches.
|
||||
|
||||
## Known Anti-Patterns
|
||||
|
||||
- `docker restart` as a response to sustained CPU.
|
||||
- Lowering ClickHouse merge pool without validating ClickHouse 25.x thresholds.
|
||||
- Capping Kafka below current working set during backlog recovery.
|
||||
- Treating "no alert" as healthy when cAdvisor or textfile exporters are missing.
|
||||
- Letting monitoring collectors spend seconds per scrape; this turns observability into load.
|
||||
@@ -39,7 +39,7 @@ resources:
|
||||
images:
|
||||
- name: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
|
||||
newName: 192.168.0.110:5000/awoooi/api
|
||||
newTag: 2e17325c3f66c16783172e50c2d29f686d2b574e
|
||||
newTag: resource-baseline-20260505-a57e3d3
|
||||
- name: 192.168.0.110:5000/library/web:IMAGE_TAG_PLACEHOLDER
|
||||
newName: 192.168.0.110:5000/awoooi/web
|
||||
newTag: 00684403887745e35848bbbab5ac795cfdd6fd58
|
||||
|
||||
@@ -55,6 +55,7 @@ scrape_configs:
|
||||
- targets:
|
||||
- https://aiops.wooo.work
|
||||
- https://mo.wooo.work
|
||||
- http://192.168.0.188:4000/health/liveliness
|
||||
- http://192.168.0.110:3001
|
||||
- http://192.168.0.120:31234
|
||||
- http://192.168.0.120:31235
|
||||
|
||||
@@ -48,6 +48,25 @@ groups:
|
||||
summary: "主機 {{ $labels.host }} CPU 高負載"
|
||||
description: "CPU 使用率超過 80%"
|
||||
|
||||
- alert: HostLoadAverageSustainedHigh
|
||||
# 2026-05-05 ogt + Codex: 110/188 長時間過載基線。
|
||||
# Why: CPU% 只看 busy time,無法充分表達 ClickHouse merge、Kafka 回補、Chrome/Ollama 造成的 runnable queue。
|
||||
expr: node_load5{host=~"110|188"} / on(host) group_left count by(host) (count by(host,cpu) (node_cpu_seconds_total{mode="idle",host=~"110|188"})) > 1.5
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
layer: systemd-188
|
||||
team: ops
|
||||
auto_repair: "true"
|
||||
mcp_provider: "ssh_host"
|
||||
host_type: "bare_metal"
|
||||
alert_category: "host_resource"
|
||||
annotations:
|
||||
summary: "主機 {{ $labels.host }} load5/core 長時間過高"
|
||||
description: "load5 / CPU core > 1.5 持續 15 分鐘;這通常代表 runnable queue 已長期塞車,不是短暫尖峰。"
|
||||
auto_repair_action: "ssh {{ $labels.instance }} 'echo \"=== LOAD ===\"; uptime; echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -25; echo \"=== DOCKER STATS ===\"; docker stats --no-stream | head -40'"
|
||||
runbook: "先判斷高 load 來源:ClickHouse merge / Kafka backlog / Chrome scraper / Ollama runner / exporter;只允許 read-only 診斷,自動修復需走服務專屬 playbook。"
|
||||
|
||||
- alert: HostOutOfMemory
|
||||
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
|
||||
for: 5m
|
||||
@@ -576,6 +595,74 @@ groups:
|
||||
summary: "容器 {{ $labels.container }} 已停止"
|
||||
description: "主機 {{ $labels.host }} 容器 {{ $labels.container }} 已 exited/dead,持續 1 分鐘"
|
||||
|
||||
- alert: DockerContainerCpuSustainedHigh
|
||||
# 2026-05-05 ogt + Codex: Docker Compose 長期過載基線。
|
||||
# Baseline: 單容器 >2 core 10m 為 warning;用於提早抓 cadvisor、ClickHouse、momo-scheduler、Ollama runner 類問題。
|
||||
expr: docker_container_cpu_cores > 2
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: docker
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "true"
|
||||
annotations:
|
||||
summary: "容器 {{ $labels.container_name }} CPU 持續超過 2 core"
|
||||
description: "{{ $labels.container_name }} 5m CPU 使用 >2 core 且持續 10 分鐘,需檢查是否缺 cpus limit、merge backlog、爬蟲尖峰或模型推理。"
|
||||
auto_repair_action: "ssh {{ $labels.instance }} 'docker stats --no-stream {{ $labels.container_name }}; docker inspect {{ $labels.container_name }} | head -80'"
|
||||
runbook: "自動階段只做診斷;若是已知服務,交給 service-specific playbook:ClickHouse 降 merge/查 backlog、scheduler 限 concurrency、litellm/blackbox 查 liveliness。"
|
||||
|
||||
- alert: DockerContainerCpuRunawayCritical
|
||||
expr: docker_container_cpu_cores > 4
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
layer: docker
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-3
|
||||
auto_repair: "true"
|
||||
annotations:
|
||||
summary: "容器 {{ $labels.container_name }} CPU 持續超過 4 core"
|
||||
description: "{{ $labels.container_name }} 已持續吃超過 4 core,會拖垮 110/188 主機;需要 AI 判斷是否限流、暫停 worker 或套用既有 playbook。"
|
||||
auto_repair_action: "ssh {{ $labels.instance }} 'echo \"=== CONTAINER ===\"; docker stats --no-stream {{ $labels.container_name }}; echo \"=== TOP ===\"; docker top {{ $labels.container_name }} -eo pid,ppid,stat,pcpu,pmem,comm,args | head -30'"
|
||||
runbook: "禁止通用 docker restart;先抓根因,只有 health down 或 crash loop 才可走重啟。"
|
||||
|
||||
- alert: DockerContainerMemoryLimitPressure
|
||||
# 2026-05-05 ogt + Codex: 防止亂設 memory limit 把 Kafka/Gitea/Taskworker 類服務卡死。
|
||||
expr: docker_container_memory_limit_bytes > 0 and docker_container_memory_usage_bytes / docker_container_memory_limit_bytes > 0.85
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: docker
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-1
|
||||
auto_repair: "true"
|
||||
annotations:
|
||||
summary: "容器 {{ $labels.container_name }} 記憶體超過 limit 85%"
|
||||
description: "{{ $labels.container_name }} memory / limit > 85% 持續 10 分鐘。若是 Kafka/ClickHouse/Gitea/Taskworker,需先判斷 workload,不可直接降 limit。"
|
||||
auto_repair_action: "ssh {{ $labels.instance }} 'docker stats --no-stream {{ $labels.container_name }}; docker inspect {{ $labels.container_name }} | head -80'"
|
||||
runbook: "若服務已接近 limit:優先調整 retention/concurrency/cache,再評估提高 memory;禁止用更低 memory limit 當止血。"
|
||||
|
||||
- alert: DockerContainerRestartSpike
|
||||
# 2026-05-05 ogt + Codex: cAdvisor v0.47 無 restart metric,吃 node-exporter textfile docker_container_restart_count。
|
||||
expr: increase(docker_container_restart_count[15m]) > 5 or increase(docker_container_inspect_restart_count[15m]) > 5
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
layer: docker
|
||||
team: ops
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-3
|
||||
auto_repair: "true"
|
||||
annotations:
|
||||
summary: "容器 {{ $labels.container_name }} 15 分鐘重啟超過 5 次"
|
||||
description: "Docker restart textfile exporter 顯示 {{ $labels.container_name }} 重啟暴增,避免再次出現 litellm 24,464 次靜默崩潰。"
|
||||
auto_repair_action: "ssh {{ $labels.instance }} 'docker ps -a --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}'"
|
||||
runbook: "先抓 crash signature;若是 config/DB/網路問題,修設定,不用無限 restart。"
|
||||
|
||||
# =========================================================================
|
||||
# MinIO / Kali 告警
|
||||
# =========================================================================
|
||||
|
||||
@@ -51,6 +51,25 @@ groups:
|
||||
auto_repair_action: "ssh {{ $labels.instance }} 'ps aux --sort=-%cpu | head -20' (host CPU 診斷;禁 kubectl restart awoooi-* — 主因常為第三方服務 Sentry/ClickHouse/Snuba)"
|
||||
runbook: "host CPU 高負載排查:先 SSH ps aux 看 top 進程;若為第三方服務(Sentry/ClickHouse 等)寫 ADR 升級資源或調 limit,禁止 kubectl restart 跨 domain"
|
||||
|
||||
- alert: HostLoadAverageSustainedHigh
  # 2026-05-05 ogt + Codex: sustained-overload baseline for hosts 110/188.
  # Why: CPU% only measures busy time and cannot fully express the runnable queue
  # built up by ClickHouse merges, Kafka backfill, or Chrome/Ollama.
  # The divisor counts distinct cpu series per host, i.e. the number of cores.
  # group_left keeps the node_load5 labels (including instance) on the result; a plain
  # one-to-one on(host) match leaves only the host label, so the instance label used by
  # auto_repair_action would render as an empty string.
  # NOTE(review): layer is "systemd-188" although the selector also covers host 110 - confirm.
  expr: node_load5{host=~"110|188"} / on(host) group_left count by(host) (count by(host,cpu) (node_cpu_seconds_total{mode="idle",host=~"110|188"})) > 1.5
  for: 15m
  labels:
    severity: critical
    layer: systemd-188
    team: ops
    auto_repair: "true"
    mcp_provider: "ssh_host"
    host_type: "bare_metal"
    alert_category: "host_resource"
  annotations:
    summary: "主機 {{ $labels.host }} load5/core 長時間過高"
    description: "load5 / CPU core > 1.5 持續 15 分鐘;這通常代表 runnable queue 已長期塞車,不是短暫尖峰。"
    # Read-only diagnosis: load average, top CPU processes, and a docker stats snapshot.
    auto_repair_action: "ssh {{ $labels.instance }} 'echo \"=== LOAD ===\"; uptime; echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -25; echo \"=== DOCKER STATS ===\"; docker stats --no-stream | head -40'"
    runbook: "先判斷高 load 來源:ClickHouse merge / Kafka backlog / Chrome scraper / Ollama runner / exporter;只允許 read-only 診斷,自動修復需走服務專屬 playbook。"
|
||||
|
||||
- alert: HostOutOfMemory
|
||||
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
|
||||
for: 5m
|
||||
@@ -582,6 +601,74 @@ groups:
|
||||
summary: "容器 {{ $labels.container }} 已停止"
|
||||
description: "主機 {{ $labels.host }} 容器 {{ $labels.container }} 已 exited/dead,持續 1 分鐘"
|
||||
|
||||
- alert: DockerContainerCpuSustainedHigh
  # 2026-05-05 ogt + Codex: baseline for sustained Docker Compose overload.
  # Baseline: a single container above 2 cores for 10m is a warning; meant to catch
  # cadvisor, ClickHouse, momo-scheduler and Ollama-runner style problems early.
  # NOTE(review): the description mentions a "5m" CPU figure, but expr compares the raw
  # gauge value; confirm what sampling window the exporter's value represents.
  expr: docker_container_cpu_cores > 2
  for: 10m
  labels:
    severity: warning
    layer: docker
    team: ops
    alert_category: infrastructure
    notification_type: TYPE-1
    auto_repair: "true"
  annotations:
    summary: "容器 {{ $labels.container_name }} CPU 持續超過 2 core"
    description: "{{ $labels.container_name }} 5m CPU 使用 >2 core 且持續 10 分鐘,需檢查是否缺 cpus limit、merge backlog、爬蟲尖峰或模型推理。"
    # Read-only diagnosis: one docker stats sample plus the first 80 lines of docker inspect.
    auto_repair_action: "ssh {{ $labels.instance }} 'docker stats --no-stream {{ $labels.container_name }}; docker inspect {{ $labels.container_name }} | head -80'"
    runbook: "自動階段只做診斷;若是已知服務,交給 service-specific playbook:ClickHouse 降 merge/查 backlog、scheduler 限 concurrency、litellm/blackbox 查 liveliness。"
|
||||
|
||||
- alert: DockerContainerCpuRunawayCritical
  # Critical tier of the container CPU baseline: a single container above 4 cores for 15m.
  expr: docker_container_cpu_cores > 4
  for: 15m
  labels:
    severity: critical
    layer: docker
    team: ops
    alert_category: infrastructure
    notification_type: TYPE-3
    auto_repair: "true"
  annotations:
    summary: "容器 {{ $labels.container_name }} CPU 持續超過 4 core"
    description: "{{ $labels.container_name }} 已持續吃超過 4 core,會拖垮 110/188 主機;需要 AI 判斷是否限流、暫停 worker 或套用既有 playbook。"
    # Read-only diagnosis: one docker stats sample plus the container's process list.
    auto_repair_action: "ssh {{ $labels.instance }} 'echo \"=== CONTAINER ===\"; docker stats --no-stream {{ $labels.container_name }}; echo \"=== TOP ===\"; docker top {{ $labels.container_name }} -eo pid,ppid,stat,pcpu,pmem,comm,args | head -30'"
    runbook: "禁止通用 docker restart;先抓根因,只有 health down 或 crash loop 才可走重啟。"
|
||||
|
||||
- alert: DockerContainerMemoryLimitPressure
  # 2026-05-05 ogt + Codex: guards against carelessly chosen memory limits choking
  # Kafka/Gitea/Taskworker style services.
  # The "limit > 0" operand excludes containers whose limit is reported as 0, which would
  # otherwise divide by zero. Because "and" keeps the left-hand sample, the alert value is
  # the limit in bytes, not the usage ratio.
  expr: docker_container_memory_limit_bytes > 0 and docker_container_memory_usage_bytes / docker_container_memory_limit_bytes > 0.85
  for: 10m
  labels:
    severity: warning
    layer: docker
    team: ops
    alert_category: infrastructure
    notification_type: TYPE-1
    auto_repair: "true"
  annotations:
    summary: "容器 {{ $labels.container_name }} 記憶體超過 limit 85%"
    description: "{{ $labels.container_name }} memory / limit > 85% 持續 10 分鐘。若是 Kafka/ClickHouse/Gitea/Taskworker,需先判斷 workload,不可直接降 limit。"
    # Read-only diagnosis: one docker stats sample plus the first 80 lines of docker inspect.
    auto_repair_action: "ssh {{ $labels.instance }} 'docker stats --no-stream {{ $labels.container_name }}; docker inspect {{ $labels.container_name }} | head -80'"
    runbook: "若服務已接近 limit:優先調整 retention/concurrency/cache,再評估提高 memory;禁止用更低 memory limit 當止血。"
|
||||
|
||||
- alert: DockerContainerRestartSpike
  # 2026-05-05 ogt + Codex: cAdvisor v0.47 has no restart metric, so this rule consumes
  # docker_container_restart_count from the node-exporter textfile collector.
  # Fires when either restart-count series grew by more than 5 within 15m and stays so for 3m.
  # NOTE(review): confirm which of the two metric names the deployed textfile exporter actually emits.
  expr: increase(docker_container_restart_count[15m]) > 5 or increase(docker_container_inspect_restart_count[15m]) > 5
  for: 3m
  labels:
    severity: critical
    layer: docker
    team: ops
    alert_category: infrastructure
    notification_type: TYPE-3
    auto_repair: "true"
  annotations:
    summary: "容器 {{ $labels.container_name }} 15 分鐘重啟超過 5 次"
    description: "Docker restart textfile exporter 顯示 {{ $labels.container_name }} 重啟暴增,避免再次出現 litellm 24,464 次靜默崩潰。"
    # Read-only diagnosis: list the container and tail its last 120 log lines.
    auto_repair_action: "ssh {{ $labels.instance }} 'docker ps -a --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}'"
    runbook: "先抓 crash signature;若是 config/DB/網路問題,修設定,不用無限 restart。"
|
||||
|
||||
# =========================================================================
|
||||
# MinIO / Kali 告警
|
||||
# =========================================================================
|
||||
|
||||
145
scripts/ops/docker-stats-textfile-exporter.py
Executable file
145
scripts/ops/docker-stats-textfile-exporter.py
Executable file
@@ -0,0 +1,145 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Docker stats textfile exporter for host-level AIOps baselines.
|
||||
|
||||
2026-05-05 ogt + Codex: 110/188 CPU overload follow-up.
|
||||
Why: cAdvisor v0.47 may not expose per-container restart count and the live
|
||||
110 scrape currently only exposes the root cgroup. This exporter writes a small
|
||||
node-exporter textfile so Prometheus can alert on Docker Compose CPU/restarts.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Directory the metrics file is written to; overridable via NODE_EXPORTER_TEXTFILE_DIR.
TEXTFILE_DIR = Path(os.environ.get("NODE_EXPORTER_TEXTFILE_DIR", "/var/lib/node_exporter/textfile_collector"))
# File name of the metrics file inside TEXTFILE_DIR.
OUTPUT_NAME = "docker_stats.prom"
# Value of the `host` label on every sample; defaults to the kernel node name.
HOST_LABEL = os.environ.get("AIOPS_HOST_LABEL", os.uname().nodename)
# Characters that get backslash-escaped inside a label value: double quote, backslash, newline.
LABEL_RE = re.compile(r'["\\\n]')
|
||||
|
||||
|
||||
def _escape_label(value: str) -> str:
    """Return *value* with backslash, double quote and newline backslash-escaped.

    The result is safe to place between the double quotes of a label value in
    the text exposition written by this script.
    """
    replacements = {"\n": r"\n", "\\": r"\\", '"': r"\""}

    def _swap(found):
        # LABEL_RE only matches the three characters that are keys of `replacements`.
        return replacements[found.group(0)]

    return LABEL_RE.sub(_swap, value)
|
||||
|
||||
|
||||
def _run_json_lines(command: list[str]) -> list[dict]:
    """Run *command* and decode every non-blank stdout line as one JSON document.

    Args:
        command: Argument vector passed to ``subprocess.run`` (no shell).

    Returns:
        The decoded documents in output order.

    Raises:
        subprocess.CalledProcessError: the command exited non-zero.
        subprocess.TimeoutExpired: the command ran longer than 30 seconds.
        json.JSONDecodeError: a non-blank line is not valid JSON.
    """
    completed = subprocess.run(command, check=True, capture_output=True, text=True, timeout=30)
    return [json.loads(raw) for raw in completed.stdout.splitlines() if raw.strip()]
|
||||
|
||||
|
||||
def _run_text_lines(command: list[str]) -> list[str]:
    """Run *command* and return its stdout lines, stripped, with blank lines dropped.

    Raises:
        subprocess.CalledProcessError: the command exited non-zero.
        subprocess.TimeoutExpired: the command ran longer than 30 seconds.
    """
    completed = subprocess.run(command, check=True, capture_output=True, text=True, timeout=30)
    stripped = (raw.strip() for raw in completed.stdout.splitlines())
    return [text for text in stripped if text]
|
||||
|
||||
|
||||
def _cpu_cores(cpu_perc: str) -> float:
    """Convert a CPU percentage string such as ``"150.25%"`` into cores (``1.5025``).

    Args:
        cpu_perc: Percentage text, optionally padded with whitespace and
            optionally suffixed with ``%``.

    Returns:
        The percentage divided by 100, or ``0.0`` when the text is not a
        number (for example a ``"--"`` placeholder or an empty string). This
        mirrors ``_memory_bytes``, which also maps unparseable input to 0.0,
        so one odd row cannot abort the whole export.
    """
    try:
        return float(cpu_perc.strip().rstrip("%")) / 100.0
    except ValueError:
        return 0.0
|
||||
|
||||
|
||||
def _memory_bytes(value: str) -> float:
    """Convert a human-readable size such as ``"1.5GiB"`` or ``"10kB"`` into bytes.

    Args:
        value: A number followed by a unit, with optional whitespace between
            them. Binary (KiB/MiB/GiB/TiB) and decimal (kB/MB/GB/TB) units
            are recognised regardless of letter case.

    Returns:
        The size in bytes. ``0.0`` when the text does not look like
        ``<number><unit>`` or the number is malformed (e.g. ``"1.2.3"``).
        An unrecognised unit is treated as bytes.
    """
    match = re.fullmatch(r"([0-9.]+)\s*([A-Za-z]+)", value.strip())
    if not match:
        return 0.0
    number, unit = match.groups()
    # Keys are lower-case so that "kB", "KB" and "kb" all resolve to 1000;
    # a case-sensitive lookup would silently scale "kB" by 1.
    scale = {
        "b": 1,
        "kib": 1024,
        "mib": 1024**2,
        "gib": 1024**3,
        "tib": 1024**4,
        "kb": 1000,
        "mb": 1000**2,
        "gb": 1000**3,
        "tb": 1000**4,
    }.get(unit.lower(), 1)
    try:
        return float(number) * scale
    except ValueError:
        # The character class also admits strings like "1.2.3" or "."
        return 0.0
|
||||
|
||||
|
||||
def collect() -> str:
    """Build the Prometheus text-format payload describing local Docker containers.

    Usage figures come from ``docker stats --no-stream``; limits, restart
    count and state come from ``docker inspect`` over every name listed by
    ``docker ps -a``. One sample per metric is rendered for each container
    that ``docker stats`` reported.

    Fields that are missing, null or not numeric in a stats row are rendered
    as 0 so that a single odd container cannot abort the whole export.

    Returns:
        The exposition text, terminated by a newline.

    Raises:
        subprocess.CalledProcessError: a docker command exited non-zero.
        subprocess.TimeoutExpired: a docker command exceeded 30 seconds.
        json.JSONDecodeError: docker produced output that is not valid JSON.
    """
    stats = _run_json_lines([
        "docker",
        "stats",
        "--no-stream",
        "--format",
        "{{json .}}",
    ])
    names = _run_text_lines(["docker", "ps", "-a", "--format", "{{.Names}}"])
    inspect_by_name: dict[str, dict] = {}
    if names:
        inspect_output = subprocess.run(
            ["docker", "inspect", *names],
            check=True,
            capture_output=True,
            text=True,
            timeout=30,
        ).stdout
        # Strip any leading "/" from the inspect name so keys match the names in the stats rows.
        inspect_by_name = {
            entry.get("Name", "").lstrip("/"): entry
            for entry in json.loads(inspect_output)
            if isinstance(entry, dict)
        }

    lines = [
        "# HELP docker_container_cpu_cores Current Docker container CPU usage in cores from docker stats.",
        "# TYPE docker_container_cpu_cores gauge",
        "# HELP docker_container_cpu_limit_cores Docker container CPU quota in cores, 0 when unlimited.",
        "# TYPE docker_container_cpu_limit_cores gauge",
        "# HELP docker_container_memory_usage_bytes Current Docker container memory usage in bytes from docker stats.",
        "# TYPE docker_container_memory_usage_bytes gauge",
        "# HELP docker_container_memory_limit_bytes Docker container memory limit in bytes, 0 when unlimited.",
        "# TYPE docker_container_memory_limit_bytes gauge",
        "# HELP docker_container_pids Current Docker container process/thread count from docker stats.",
        "# TYPE docker_container_pids gauge",
        "# HELP docker_container_inspect_restart_count Docker container restart count from Docker inspect.",
        "# TYPE docker_container_inspect_restart_count gauge",
        "# HELP docker_container_info Docker container inventory exposed by the textfile exporter.",
        "# TYPE docker_container_info gauge",
    ]

    for row in stats:
        name = row.get("Name", "")
        if not name:
            continue
        details = inspect_by_name.get(name, {})
        # `or {}` also covers keys that are present but null.
        host_config = details.get("HostConfig") or {}
        state = details.get("State") or {}
        nano_cpus = float(host_config.get("NanoCpus") or 0)
        memory_limit = float(host_config.get("Memory") or 0)
        restart_count = int(details.get("RestartCount") or 0)
        labels = f'host="{_escape_label(HOST_LABEL)}",container_name="{_escape_label(name)}"'
        # MemUsage looks like "<used> / <limit>"; only the used part is exported here.
        mem_current = (row.get("MemUsage") or "0 B / 0 B").split("/", 1)[0].strip()
        try:
            cpu_cores = _cpu_cores(row.get("CPUPerc") or "0%")
        except ValueError:
            cpu_cores = 0.0
        try:
            pid_count = int(float(row.get("PIDs") or "0"))
        except (ValueError, OverflowError):
            pid_count = 0
        status = state.get("Status") or ""
        lines.append(f"docker_container_cpu_cores{{{labels}}} {cpu_cores:.6f}")
        # NanoCpus is expressed in billionths of a core.
        lines.append(f"docker_container_cpu_limit_cores{{{labels}}} {nano_cpus / 1_000_000_000:.6f}")
        lines.append(f"docker_container_memory_usage_bytes{{{labels}}} {_memory_bytes(mem_current):.0f}")
        lines.append(f"docker_container_memory_limit_bytes{{{labels}}} {memory_limit:.0f}")
        lines.append(f"docker_container_pids{{{labels}}} {pid_count}")
        lines.append(f"docker_container_inspect_restart_count{{{labels}}} {restart_count}")
        lines.append(f'docker_container_info{{{labels},status="{_escape_label(status)}"}} 1')

    return "\n".join(lines) + "\n"
|
||||
|
||||
|
||||
def main() -> None:
    """Collect metrics once and atomically publish them as TEXTFILE_DIR / OUTPUT_NAME.

    The payload is written to a temporary file in the same directory and then
    renamed over the target, so a reader never observes a partially written
    file. Permissions are widened to 0644 *before* the rename: temporary files
    are created owner-only, and changing the mode afterwards would leave a
    window in which the published file cannot be read by another user.

    Raises:
        OSError: the directory, temporary file or rename could not be handled;
            the temporary file is removed before the error propagates.
        Anything raised by ``collect()``; nothing is written in that case.
    """
    TEXTFILE_DIR.mkdir(parents=True, exist_ok=True)
    payload = collect()
    output_path = TEXTFILE_DIR / OUTPUT_NAME
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile("w", dir=TEXTFILE_DIR, delete=False, encoding="utf-8") as tmp:
            tmp_path = Path(tmp.name)
            tmp.write(payload)
        tmp_path.chmod(0o644)
        tmp_path.replace(output_path)
    except OSError:
        # Do not leave stray temporary files behind in the collector directory.
        if tmp_path is not None:
            tmp_path.unlink(missing_ok=True)
        raise
|
||||
|
||||
|
||||
# Run a single collection pass when executed as a script; importing the module has no side effect.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user