fix(monitoring): route docker and systemd runner baseline alerts to ssh

2026-05-06 00:00:12 +08:00
parent 2f50c67f5c
commit 23932773ef
2 changed files with 19 additions and 0 deletions
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -45,6 +45,7 @@

 **本次修補**：
 - 將 canonical `ops/monitoring/alerts-unified.yml` 補齊 SSH diagnosis action、host_resource category、`mcp_provider=ssh_host` 與 guarded disk-prune route，避免下次 deploy-alerts 覆蓋掉 live baseline。
+- Docker baseline 與 systemd runner baseline 告警也補 `mcp_provider=ssh_host` / `host_type=bare_metal`，避免 LLM 在 Docker/host 事故中猜錯執行域。
 - 維持原則：host/Docker 高負載先只讀診斷；stateful DB/ClickHouse/Harbor/Sentry 不允許通用 restart。

 ## 2026-05-05 | 重開機後排程與 startup baseline 修復
--- a/ops/monitoring/alerts-unified.yml
+++ b/ops/monitoring/alerts-unified.yml
@@ -615,6 +615,8 @@ groups:
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "true"
+          mcp_provider: "ssh_host"
+          host_type: "bare_metal"
        annotations:
          summary: "容器 {{ $labels.container_name }} CPU 持續超過 2 core"
          description: "{{ $labels.container_name }} 5m CPU 使用 >2 core 且持續 10 分鐘，需檢查是否缺 cpus limit、merge backlog、爬蟲尖峰或模型推理。"
@@ -631,6 +633,8 @@ groups:
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "true"
+          mcp_provider: "ssh_host"
+          host_type: "bare_metal"
        annotations:
          summary: "容器 {{ $labels.container_name }} CPU 持續超過 4 core"
          description: "{{ $labels.container_name }} 已持續吃超過 4 core，會拖垮 110/188 主機；需要 AI 判斷是否限流、暫停 worker 或套用既有 playbook。"
@@ -648,6 +652,8 @@ groups:
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "true"
+          mcp_provider: "ssh_host"
+          host_type: "bare_metal"
        annotations:
          summary: "容器 {{ $labels.container_name }} 記憶體超過 limit 85%"
          description: "{{ $labels.container_name }} memory / limit > 85% 持續 10 分鐘。若是 Kafka/ClickHouse/Gitea/Taskworker，需先判斷 workload，不可直接降 limit。"
@@ -665,6 +671,8 @@ groups:
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "true"
+          mcp_provider: "ssh_host"
+          host_type: "bare_metal"
        annotations:
          summary: "容器 {{ $labels.container_name }} 15 分鐘重啟超過 5 次"
          description: "Docker restart textfile exporter 顯示 {{ $labels.container_name }} 重啟暴增，避免再次出現 litellm 24,464 次靜默崩潰。"
@@ -682,6 +690,8 @@ groups:
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "true"
+          mcp_provider: "ssh_host"
+          host_type: "bare_metal"
        annotations:
          summary: "容器 {{ $labels.container_name }} 缺 CPU 或 memory limit"
          description: "{{ $labels.container_name }} 在 Docker Host 上沒有完整 CPU/memory guardrail，長時間尖峰可能拖垮 110/188。"
@@ -699,6 +709,8 @@ groups:
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "true"
+          mcp_provider: "ssh_host"
+          host_type: "bare_metal"
        annotations:
          summary: "Gitea Actions job {{ $labels.container_name }} 執行超過 20 分鐘"
          description: "{{ $labels.container_name }} 已超過 20 分鐘，110 曾出現 code-review/CD job timeout 失效而持續佔用 runner。"
@@ -716,6 +728,8 @@ groups:
          alert_category: infrastructure
          notification_type: TYPE-3
          auto_repair: "true"
+          mcp_provider: "ssh_host"
+          host_type: "bare_metal"
        annotations:
          summary: "Systemd runner {{ $labels.unit }} 15 分鐘重啟超過 2 次"
          description: "{{ $labels.unit }} 在 15 分鐘內重啟暴增；110 曾發生 WatchdogSec=5min 造成 runner 每 5 分鐘自殺重啟。"
@@ -732,6 +746,8 @@ groups:
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "true"
+          mcp_provider: "ssh_host"
+          host_type: "bare_metal"
        annotations:
          summary: "Systemd runner {{ $labels.unit }} 啟用了 WatchdogSec"
          description: "{{ $labels.unit }} WatchdogSec={{ $value }} 秒。GitHub Actions runner service 不應被 systemd watchdog 週期性殺掉。"
@@ -748,6 +764,8 @@ groups:
          alert_category: infrastructure
          notification_type: TYPE-1
          auto_repair: "true"
+          mcp_provider: "ssh_host"
+          host_type: "bare_metal"
        annotations:
          summary: "Systemd runner {{ $labels.unit }} 缺 CPU 或 memory quota"
          description: "{{ $labels.unit }} 仍為 unlimited；CI runner 會與 Sentry/ClickHouse/Gitea 搶主機 CPU。"