fix(ops): point runner guardrail alerts to host script
All checks were successful
CD Pipeline / tests (push) Successful in 5m31s
Code Review / ai-code-review (push) Successful in 30s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 37s
CD Pipeline / build-and-deploy (push) Successful in 7m45s
CD Pipeline / post-deploy-checks (push) Successful in 5m4s

This commit is contained in:
Your Name
2026-05-05 15:25:37 +08:00
parent 96c1ba20da
commit 1cc9de5722
4 changed files with 8 additions and 24 deletions

View File

@@ -219,7 +219,7 @@ rules:
optimization:
- type: SYSTEMD_GUARDRAIL
description: "人工批准後停用錯誤 watchdog drop-in，並為 runner 加 CPUQuota=200%、MemoryMax=2G"
command: "bash scripts/ops/apply-runner-systemd-guardrails.sh --apply"
command: "sudo /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply"
- type: CI_CAPACITY
description: "若 110 同時承載 Sentry/ClickHouse/Gitea，不應讓多個 runner 無限制並行"
command: "檢查 active jobs、runner 數量與 Gitea Actions concurrency，必要時分流 runner"

View File

@@ -77,23 +77,7 @@ Use these thresholds for alerting and AI triage:
8. Fix 110 runner services with sudo-capable host maintenance:
```bash
unit=actions.runner.owenhytsai-awoooi.awoooi-110.service
sudo mv /etc/systemd/system/$unit.d/watchdog.conf /etc/systemd/system/$unit.d/watchdog.conf.disabled-20260505
for u in \
actions.runner.owenhytsai-awoooi.awoooi-110.service \
actions.runner.owenhytsai-awoooi.awoooi-110-3.service \
actions.runner.owenhytsai-wooo-aiops.wooo-110-runner-2.service \
actions.runner.owenhytsai-wooo-aiops.wooo-110-runner-3.service \
actions.runner.owenhytsai-wooo-aiops.wooo-runner-110.service
do
sudo mkdir -p /etc/systemd/system/$u.d
printf "[Service]\nCPUAccounting=yes\nCPUQuota=200%%\nMemoryAccounting=yes\nMemoryMax=2G\n" \
| sudo tee /etc/systemd/system/$u.d/resource-guard.conf
done
sudo systemctl daemon-reload
sudo systemctl restart actions.runner.owenhytsai-awoooi.awoooi-110.service
sudo /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply
```
## Known Anti-Patterns

View File

@@ -712,7 +712,7 @@ groups:
summary: "Systemd runner {{ $labels.unit }} 15 分鐘重啟超過 2 次"
description: "{{ $labels.unit }} 在 15 分鐘內重啟暴增，110 曾發生 WatchdogSec=5min 造成 runner 每 5 分鐘自殺重啟。"
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'systemctl show {{ $labels.unit }} -p WatchdogUSec -p NRestarts -p DropInPaths -p CPUQuotaPerSecUSec -p MemoryMax; journalctl -u {{ $labels.unit }} --since \"20 minutes ago\" --no-pager | tail -120'"
runbook: "自動階段先診斷。若確認 WatchdogSec 對 GitHub runner 誤設，需人工或 sudo playbook 停用 watchdog drop-in，並加 CPUQuota/MemoryMax。"
runbook: "自動階段先診斷。若確認 WatchdogSec 對 GitHub runner 誤設,需人工或 sudo playbook 執行 /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply。"
- alert: SystemdRunnerWatchdogEnabled
expr: systemd_unit_watchdog_seconds{unit=~"actions\\.runner\\..*"} > 0
@@ -728,7 +728,7 @@ groups:
summary: "Systemd runner {{ $labels.unit }} 啟用了 WatchdogSec"
description: "{{ $labels.unit }} WatchdogSec={{ $value }} 秒。GitHub Actions runner service 不應被 systemd watchdog 週期性殺掉。"
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'systemctl show {{ $labels.unit }} -p WatchdogUSec -p DropInPaths -p NRestarts'"
runbook: "確認 drop-in 來源;需要 sudo 時由人工套用:移除 watchdog.conf、daemon-reload、restart service。"
runbook: "確認 drop-in 來源;需要 sudo 時由人工套用 /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply。"
- alert: SystemdRunnerMissingResourceQuota
expr: systemd_unit_cpu_quota_cores{unit=~"actions\\.runner\\..*"} == 0 or systemd_unit_memory_max_bytes{unit=~"actions\\.runner\\..*"} == 0
@@ -744,7 +744,7 @@ groups:
summary: "Systemd runner {{ $labels.unit }} 缺 CPU 或 memory quota"
description: "{{ $labels.unit }} 仍為 unlimited，CI runner 會與 Sentry/ClickHouse/Gitea 搶主機 CPU。"
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'systemctl show {{ $labels.unit }} -p CPUQuotaPerSecUSec -p MemoryMax -p ActiveState -p SubState'"
runbook: "建議 baseline：每個 runner CPUQuota=200%、MemoryMax=2G；若同時有多個 runner，需限制並行度或分流到非 Sentry 主機。"
runbook: "建議 baseline：每個 runner CPUQuota=200%、MemoryMax=2G，由 /home/wooo/scripts/apply-runner-systemd-guardrails.sh 套用，若仍過載再限制並行度或分流。"
# =========================================================================
# MinIO / Kali 告警

View File

@@ -718,7 +718,7 @@ groups:
summary: "Systemd runner {{ $labels.unit }} 15 分鐘重啟超過 2 次"
description: "{{ $labels.unit }} 在 15 分鐘內重啟暴增，110 曾發生 WatchdogSec=5min 造成 runner 每 5 分鐘自殺重啟。"
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'systemctl show {{ $labels.unit }} -p WatchdogUSec -p NRestarts -p DropInPaths -p CPUQuotaPerSecUSec -p MemoryMax; journalctl -u {{ $labels.unit }} --since \"20 minutes ago\" --no-pager | tail -120'"
runbook: "自動階段先診斷。若確認 WatchdogSec 對 GitHub runner 誤設，需人工或 sudo playbook 停用 watchdog drop-in，並加 CPUQuota/MemoryMax。"
runbook: "自動階段先診斷。若確認 WatchdogSec 對 GitHub runner 誤設,需人工或 sudo playbook 執行 /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply。"
- alert: SystemdRunnerWatchdogEnabled
expr: systemd_unit_watchdog_seconds{unit=~"actions\\.runner\\..*"} > 0
@@ -734,7 +734,7 @@ groups:
summary: "Systemd runner {{ $labels.unit }} 啟用了 WatchdogSec"
description: "{{ $labels.unit }} WatchdogSec={{ $value }} 秒。GitHub Actions runner service 不應被 systemd watchdog 週期性殺掉。"
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'systemctl show {{ $labels.unit }} -p WatchdogUSec -p DropInPaths -p NRestarts'"
runbook: "確認 drop-in 來源;需要 sudo 時由人工套用:移除 watchdog.conf、daemon-reload、restart service。"
runbook: "確認 drop-in 來源;需要 sudo 時由人工套用 /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply。"
- alert: SystemdRunnerMissingResourceQuota
expr: systemd_unit_cpu_quota_cores{unit=~"actions\\.runner\\..*"} == 0 or systemd_unit_memory_max_bytes{unit=~"actions\\.runner\\..*"} == 0
@@ -750,7 +750,7 @@ groups:
summary: "Systemd runner {{ $labels.unit }} 缺 CPU 或 memory quota"
description: "{{ $labels.unit }} 仍為 unlimited，CI runner 會與 Sentry/ClickHouse/Gitea 搶主機 CPU。"
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'systemctl show {{ $labels.unit }} -p CPUQuotaPerSecUSec -p MemoryMax -p ActiveState -p SubState'"
runbook: "建議 baseline：每個 runner CPUQuota=200%、MemoryMax=2G；若同時有多個 runner，需限制並行度或分流到非 Sentry 主機。"
runbook: "建議 baseline：每個 runner CPUQuota=200%、MemoryMax=2G，由 /home/wooo/scripts/apply-runner-systemd-guardrails.sh 套用，若仍過載再限制並行度或分流。"
# =========================================================================
# MinIO / Kali 告警