From 1cc9de5722eb2fca8bab080077f792fa02c5d5fb Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 5 May 2026 15:25:37 +0800 Subject: [PATCH] fix(ops): point runner guardrail alerts to host script --- apps/api/alert_rules.yaml | 2 +- .../runbooks/HOST-RESOURCE-BASELINE-110-188.md | 18 +----------------- ops/monitoring/alerts-unified.yml | 6 +++--- ops/monitoring/alerts.yml | 6 +++--- 4 files changed, 8 insertions(+), 24 deletions(-) diff --git a/apps/api/alert_rules.yaml b/apps/api/alert_rules.yaml index b00f2319..641b4782 100644 --- a/apps/api/alert_rules.yaml +++ b/apps/api/alert_rules.yaml @@ -219,7 +219,7 @@ rules: optimization: - type: SYSTEMD_GUARDRAIL description: "人工批准後停用錯誤 watchdog drop-in,並為 runner 加 CPUQuota=200%、MemoryMax=2G" - command: "bash scripts/ops/apply-runner-systemd-guardrails.sh --apply" + command: "sudo /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply" - type: CI_CAPACITY description: "若 110 同時承載 Sentry/ClickHouse/Gitea,不應讓多個 runner 無限制並行" command: "檢查 active jobs、runner 數量與 Gitea Actions concurrency,必要時分流 runner" diff --git a/docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md b/docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md index 58cbd1c6..9402a9de 100644 --- a/docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md +++ b/docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md @@ -77,23 +77,7 @@ Use these thresholds for alerting and AI triage: 8. Fix 110 runner services with sudo-capable host maintenance: ```bash -unit=actions.runner.owenhytsai-awoooi.awoooi-110.service -sudo mv /etc/systemd/system/$unit.d/watchdog.conf /etc/systemd/system/$unit.d/watchdog.conf.disabled-20260505 - -for u in \ - actions.runner.owenhytsai-awoooi.awoooi-110.service \ - actions.runner.owenhytsai-awoooi.awoooi-110-3.service \ - actions.runner.owenhytsai-wooo-aiops.wooo-110-runner-2.service \ - actions.runner.owenhytsai-wooo-aiops.wooo-110-runner-3.service \ - actions.runner.owenhytsai-wooo-aiops.wooo-runner-110.service -do - sudo mkdir -p /etc/systemd/system/$u.d - printf "[Service]\nCPUAccounting=yes\nCPUQuota=200%%\nMemoryAccounting=yes\nMemoryMax=2G\n" \ - | sudo tee /etc/systemd/system/$u.d/resource-guard.conf -done - -sudo systemctl daemon-reload -sudo systemctl restart actions.runner.owenhytsai-awoooi.awoooi-110.service +sudo /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply ``` ## Known Anti-Patterns diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml index 7f962afb..d5dad3b7 100644 --- a/ops/monitoring/alerts-unified.yml +++ b/ops/monitoring/alerts-unified.yml @@ -712,7 +712,7 @@ groups: summary: "Systemd runner {{ $labels.unit }} 15 分鐘重啟超過 2 次" description: "{{ $labels.unit }} 在 15 分鐘內重啟暴增;110 曾發生 WatchdogSec=5min 造成 runner 每 5 分鐘自殺重啟。" auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'systemctl show {{ $labels.unit }} -p WatchdogUSec -p NRestarts -p DropInPaths -p CPUQuotaPerSecUSec -p MemoryMax; journalctl -u {{ $labels.unit }} --since \"20 minutes ago\" --no-pager | tail -120'" - runbook: "自動階段先診斷。若確認 WatchdogSec 對 GitHub runner 誤設,需人工或 sudo playbook 停用 watchdog drop-in,並加 CPUQuota/MemoryMax。" + runbook: "自動階段先診斷。若確認 WatchdogSec 對 GitHub runner 誤設,需人工或 sudo playbook 執行 /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply。" - alert: SystemdRunnerWatchdogEnabled expr: systemd_unit_watchdog_seconds{unit=~"actions\\.runner\\..*"} > 0 @@ -728,7 +728,7 @@ groups: summary: "Systemd runner {{ $labels.unit }} 啟用了 WatchdogSec" description: "{{ $labels.unit }} WatchdogSec={{ $value }} 秒。GitHub Actions runner service 不應被 systemd watchdog 週期性殺掉。" auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'systemctl show {{ $labels.unit }} -p WatchdogUSec -p DropInPaths -p NRestarts'" - runbook: "確認 drop-in 來源;需要 sudo 時由人工套用:移除 watchdog.conf、daemon-reload、restart service。" + runbook: "確認 drop-in 來源;需要 sudo 時由人工套用 /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply。" - alert: SystemdRunnerMissingResourceQuota expr: systemd_unit_cpu_quota_cores{unit=~"actions\\.runner\\..*"} == 0 or systemd_unit_memory_max_bytes{unit=~"actions\\.runner\\..*"} == 0 @@ -744,7 +744,7 @@ groups: summary: "Systemd runner {{ $labels.unit }} 缺 CPU 或 memory quota" description: "{{ $labels.unit }} 仍為 unlimited;CI runner 會與 Sentry/ClickHouse/Gitea 搶主機 CPU。" auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'systemctl show {{ $labels.unit }} -p CPUQuotaPerSecUSec -p MemoryMax -p ActiveState -p SubState'" - runbook: "建議 baseline:每個 runner CPUQuota=200%、MemoryMax=2G;若同時有多個 runner,需限制並行度或分流到非 Sentry 主機。" + runbook: "建議 baseline:每個 runner CPUQuota=200%、MemoryMax=2G;由 /home/wooo/scripts/apply-runner-systemd-guardrails.sh 套用,若仍過載再限制並行度或分流。" # ========================================================================= # MinIO / Kali 告警 diff --git a/ops/monitoring/alerts.yml b/ops/monitoring/alerts.yml index 928a00ec..19fb7afd 100644 --- a/ops/monitoring/alerts.yml +++ b/ops/monitoring/alerts.yml @@ -718,7 +718,7 @@ groups: summary: "Systemd runner {{ $labels.unit }} 15 分鐘重啟超過 2 次" description: "{{ $labels.unit }} 在 15 分鐘內重啟暴增;110 曾發生 WatchdogSec=5min 造成 runner 每 5 分鐘自殺重啟。" auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'systemctl show {{ $labels.unit }} -p WatchdogUSec -p NRestarts -p DropInPaths -p CPUQuotaPerSecUSec -p MemoryMax; journalctl -u {{ $labels.unit }} --since \"20 minutes ago\" --no-pager | tail -120'" - runbook: "自動階段先診斷。若確認 WatchdogSec 對 GitHub runner 誤設,需人工或 sudo playbook 停用 watchdog drop-in,並加 CPUQuota/MemoryMax。" + runbook: "自動階段先診斷。若確認 WatchdogSec 對 GitHub runner 誤設,需人工或 sudo playbook 執行 /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply。" - alert: SystemdRunnerWatchdogEnabled expr: systemd_unit_watchdog_seconds{unit=~"actions\\.runner\\..*"} > 0 @@ -734,7 +734,7 @@ groups: summary: "Systemd runner {{ $labels.unit }} 啟用了 WatchdogSec" description: "{{ $labels.unit }} WatchdogSec={{ $value }} 秒。GitHub Actions runner service 不應被 systemd watchdog 週期性殺掉。" auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'systemctl show {{ $labels.unit }} -p WatchdogUSec -p DropInPaths -p NRestarts'" - runbook: "確認 drop-in 來源;需要 sudo 時由人工套用:移除 watchdog.conf、daemon-reload、restart service。" + runbook: "確認 drop-in 來源;需要 sudo 時由人工套用 /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply。" - alert: SystemdRunnerMissingResourceQuota expr: systemd_unit_cpu_quota_cores{unit=~"actions\\.runner\\..*"} == 0 or systemd_unit_memory_max_bytes{unit=~"actions\\.runner\\..*"} == 0 @@ -750,7 +750,7 @@ groups: summary: "Systemd runner {{ $labels.unit }} 缺 CPU 或 memory quota" description: "{{ $labels.unit }} 仍為 unlimited;CI runner 會與 Sentry/ClickHouse/Gitea 搶主機 CPU。" auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'systemctl show {{ $labels.unit }} -p CPUQuotaPerSecUSec -p MemoryMax -p ActiveState -p SubState'" - runbook: "建議 baseline:每個 runner CPUQuota=200%、MemoryMax=2G;若同時有多個 runner,需限制並行度或分流到非 Sentry 主機。" + runbook: "建議 baseline:每個 runner CPUQuota=200%、MemoryMax=2G;由 /home/wooo/scripts/apply-runner-systemd-guardrails.sh 套用,若仍過載再限制並行度或分流。" # ========================================================================= # MinIO / Kali 告警