diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index ce27489d..d85ecbef 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -45,6 +45,7 @@ **本次修補**: - 將 canonical `ops/monitoring/alerts-unified.yml` 補齊 SSH diagnosis action、host_resource category、`mcp_provider=ssh_host` 與 guarded disk-prune route,避免下次 deploy-alerts 覆蓋掉 live baseline。 +- Docker baseline 與 systemd runner baseline 告警也補 `mcp_provider=ssh_host` / `host_type=bare_metal`,避免 LLM 在 Docker/host 事故中猜錯執行域。 - 維持原則:host/Docker 高負載先只讀診斷;stateful DB/ClickHouse/Harbor/Sentry 不允許通用 restart。 ## 2026-05-05 | 重開機後排程與 startup baseline 修復 diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml index 921e89b6..27e68900 100644 --- a/ops/monitoring/alerts-unified.yml +++ b/ops/monitoring/alerts-unified.yml @@ -615,6 +615,8 @@ groups: alert_category: infrastructure notification_type: TYPE-1 auto_repair: "true" + mcp_provider: "ssh_host" + host_type: "bare_metal" annotations: summary: "容器 {{ $labels.container_name }} CPU 持續超過 2 core" description: "{{ $labels.container_name }} 5m CPU 使用 >2 core 且持續 10 分鐘,需檢查是否缺 cpus limit、merge backlog、爬蟲尖峰或模型推理。" @@ -631,6 +633,8 @@ groups: alert_category: infrastructure notification_type: TYPE-3 auto_repair: "true" + mcp_provider: "ssh_host" + host_type: "bare_metal" annotations: summary: "容器 {{ $labels.container_name }} CPU 持續超過 4 core" description: "{{ $labels.container_name }} 已持續吃超過 4 core,會拖垮 110/188 主機;需要 AI 判斷是否限流、暫停 worker 或套用既有 playbook。" @@ -648,6 +652,8 @@ groups: alert_category: infrastructure notification_type: TYPE-1 auto_repair: "true" + mcp_provider: "ssh_host" + host_type: "bare_metal" annotations: summary: "容器 {{ $labels.container_name }} 記憶體超過 limit 85%" description: "{{ $labels.container_name }} memory / limit > 85% 持續 10 分鐘。若是 Kafka/ClickHouse/Gitea/Taskworker,需先判斷 workload,不可直接降 limit。" @@ -665,6 +671,8 @@ groups: alert_category: infrastructure notification_type: TYPE-3 auto_repair: "true" + mcp_provider: "ssh_host" + host_type: "bare_metal" annotations: summary: "容器 {{ $labels.container_name }} 15 分鐘重啟超過 5 次" description: "Docker restart textfile exporter 顯示 {{ $labels.container_name }} 重啟暴增,避免再次出現 litellm 24,464 次靜默崩潰。" @@ -682,6 +690,8 @@ groups: alert_category: infrastructure notification_type: TYPE-1 auto_repair: "true" + mcp_provider: "ssh_host" + host_type: "bare_metal" annotations: summary: "容器 {{ $labels.container_name }} 缺 CPU 或 memory limit" description: "{{ $labels.container_name }} 在 Docker Host 上沒有完整 CPU/memory guardrail,長時間尖峰可能拖垮 110/188。" @@ -699,6 +709,8 @@ groups: alert_category: infrastructure notification_type: TYPE-1 auto_repair: "true" + mcp_provider: "ssh_host" + host_type: "bare_metal" annotations: summary: "Gitea Actions job {{ $labels.container_name }} 執行超過 20 分鐘" description: "{{ $labels.container_name }} 已超過 20 分鐘,110 曾出現 code-review/CD job timeout 失效而持續佔用 runner。" @@ -716,6 +728,8 @@ groups: alert_category: infrastructure notification_type: TYPE-3 auto_repair: "true" + mcp_provider: "ssh_host" + host_type: "bare_metal" annotations: summary: "Systemd runner {{ $labels.unit }} 15 分鐘重啟超過 2 次" description: "{{ $labels.unit }} 在 15 分鐘內重啟暴增;110 曾發生 WatchdogSec=5min 造成 runner 每 5 分鐘自殺重啟。" @@ -732,6 +746,8 @@ groups: alert_category: infrastructure notification_type: TYPE-1 auto_repair: "true" + mcp_provider: "ssh_host" + host_type: "bare_metal" annotations: summary: "Systemd runner {{ $labels.unit }} 啟用了 WatchdogSec" description: "{{ $labels.unit }} WatchdogSec={{ $value }} 秒。GitHub Actions runner service 不應被 systemd watchdog 週期性殺掉。" @@ -748,6 +764,8 @@ groups: alert_category: infrastructure notification_type: TYPE-1 auto_repair: "true" + mcp_provider: "ssh_host" + host_type: "bare_metal" annotations: summary: "Systemd runner {{ $labels.unit }} 缺 CPU 或 memory quota" description: "{{ $labels.unit }} 仍為 unlimited;CI runner 會與 Sentry/ClickHouse/Gitea 搶主機 CPU。"