fix(ops): route systemd runner baseline alerts
This commit is contained in:
@@ -194,6 +194,35 @@ rules:
|
||||
command: "由 AI 根據 evidence snapshot 選擇已驗證 playbook"
|
||||
reasoning: "[規則匹配] 長期過載先 read-only 診斷與分流,禁止通用 docker restart;修復必須服務專屬且可回寫 Playbook trust。"
|
||||
|
||||
# 2026-05-05 ogt + Codex: the self-hosted runners on host 110 are systemd
# services, so they are outside Docker/cAdvisor coverage.
# Principle: the AI may automatically diagnose watchdog / quota / restart
# storms; applying a systemd drop-in needs sudo and must go through manual
# approval or a sudo playbook.
# NOTE(review): the nesting below was reconstructed from a flattened view of
# this rule; confirm that `reasoning` belongs under `response` like the
# neighbouring rules before merging.
- id: systemd_runner_baseline_alert
  priority: 43
  description: 110 self-hosted runner systemd watchdog / restart / quota 基線告警
  match:
    alertname:
      - SystemdRunnerRestartSpike
      - SystemdRunnerWatchdogEnabled
      - SystemdRunnerMissingResourceQuota
  response:
    action_title: "🔍 Systemd Runner 基線診斷 — 需要 sudo 才可修復"
    description: "110 self-hosted runner 發生 watchdog/restart storm 或缺 CPU/Memory quota。這會讓 CI 與 Sentry/ClickHouse/Gitea 搶主機資源,且 Docker/cAdvisor 看不到。"
    suggested_action: SSH_DIAGNOSE
    # Read-only diagnosis: dumps the unit's watchdog/restart/quota properties
    # and the tail of its recent journal. Nothing on the host is modified.
    kubectl_command: "ssh {host} 'systemctl show {unit} -p WatchdogUSec -p NRestarts -p DropInPaths -p CPUQuotaPerSecUSec -p MemoryMax -p ActiveState -p SubState; journalctl -u {unit} --since \"20 minutes ago\" --no-pager | tail -120'"
    estimated_downtime: "N/A"
    risk: low
    responsibility: INFRA
    responsibility_reasoning: "self-hosted runner 是 bare-metal systemd 資源治理,非 K8s 或 Docker workload"
    secondary_teams: [SRE]
    optimization:
      # Host-level change: requires sudo and explicit operator approval.
      - type: SYSTEMD_GUARDRAIL
        description: "人工批准後停用錯誤 watchdog drop-in,並為 runner 加 CPUQuota=200%、MemoryMax=2G"
        command: "bash scripts/ops/apply-runner-systemd-guardrails.sh --apply"
      - type: CI_CAPACITY
        description: "若 110 同時承載 Sentry/ClickHouse/Gitea,不應讓多個 runner 無限制並行"
        command: "檢查 active jobs、runner 數量與 Gitea Actions concurrency,必要時分流 runner"
    reasoning: "[規則匹配] systemd runner 過載先 read-only 診斷;改 systemd drop-in 需 sudo 與人工批准,避免 AI 擅自改 host unit。"
|
||||
|
||||
- id: high_cpu
|
||||
priority: 40
|
||||
description: K8s Pod/Deployment CPU 使用率過高
|
||||
|
||||
70
scripts/ops/apply-runner-systemd-guardrails.sh
Executable file
70
scripts/ops/apply-runner-systemd-guardrails.sh
Executable file
@@ -0,0 +1,70 @@
|
||||
#!/usr/bin/env bash
#
# 2026-05-05 ogt + Codex
# Apply resource guardrails to the self-hosted runner systemd services on
# host 110.
#
# This script intentionally requires sudo. Run it from 110 once the operator
# has approved host-level systemd changes:
#
#   bash scripts/ops/apply-runner-systemd-guardrails.sh --apply
#
# Without --apply it only prints the planned changes and exits 0.

set -euo pipefail

# Runner units that receive the CPU/memory drop-in.
readonly -a RUNNER_UNITS=(
  actions.runner.owenhytsai-awoooi.awoooi-110.service
  actions.runner.owenhytsai-awoooi.awoooi-110-3.service
  actions.runner.owenhytsai-wooo-aiops.wooo-110-runner-2.service
  actions.runner.owenhytsai-wooo-aiops.wooo-110-runner-3.service
  actions.runner.owenhytsai-wooo-aiops.wooo-runner-110.service
)

# The one unit carrying the bad watchdog drop-in; it is also the only unit
# this script restarts.
readonly WATCHDOG_UNIT="actions.runner.owenhytsai-awoooi.awoooi-110.service"
readonly WATCHDOG_DROPIN="/etc/systemd/system/${WATCHDOG_UNIT}.d/watchdog.conf"
readonly WATCHDOG_BACKUP="${WATCHDOG_DROPIN}.disabled-20260505"

#######################################
# Decide whether changes should be applied.
# Arguments: the script's own arguments; only $1 is inspected.
# Outputs:   "1" on stdout when $1 is exactly --apply, otherwise "0".
#            A warning goes to stderr when $1 is something else, so a typo
#            such as --aply is visible instead of silently dry-running.
#######################################
apply_requested() {
  local first=${1:-}
  if [[ "$first" == "--apply" ]]; then
    printf '1\n'
    return 0
  fi
  if [[ -n "$first" ]]; then
    printf 'warning: ignoring unrecognized argument %s; staying in dry-run mode\n' \
      "$first" >&2
  fi
  printf '0\n'
}

#######################################
# Emit the drop-in body written for every runner unit.
# Outputs: the [Service] section on stdout.
#######################################
render_resource_guard() {
  printf '%s\n' \
    '[Service]' \
    'CPUAccounting=yes' \
    'CPUQuota=200%' \
    'MemoryAccounting=yes' \
    'MemoryMax=2G'
}

#######################################
# Pick a backup path that does not clobber an earlier backup.
# Arguments: $1 - preferred backup path
# Outputs:   $1 if nothing exists there, else the first free "$1.N" (N>=1).
#######################################
choose_backup_path() {
  local preferred=$1
  local candidate=$preferred
  local n=1
  while [[ -e "$candidate" ]]; do
    candidate="${preferred}.${n}"
    n=$((n + 1))
  done
  printf '%s\n' "$candidate"
}

#######################################
# Print the human-readable change plan (shown in both modes).
#######################################
print_plan() {
  printf '%s\n' \
    "Runner systemd guardrail plan:" \
    "- Disable bad watchdog drop-in: ${WATCHDOG_DROPIN}" \
    "- Set CPUAccounting=yes, CPUQuota=200%, MemoryAccounting=yes, MemoryMax=2G" \
    "- Restart only ${WATCHDOG_UNIT}; other runner units pick up quotas on next restart"
}

#######################################
# Move the watchdog drop-in aside if it is present.
# The renamed file no longer ends in .conf, which is the suffix systemd
# loads from a unit's .d directory (see systemd.unit documentation).
#######################################
disable_watchdog_dropin() {
  local backup
  if [[ -f "$WATCHDOG_DROPIN" ]]; then
    backup=$(choose_backup_path "$WATCHDOG_BACKUP")
    sudo mv -- "$WATCHDOG_DROPIN" "$backup"
  fi
}

#######################################
# Write resource-guard.conf into each runner unit's drop-in directory.
#######################################
install_resource_guards() {
  local unit dropin_dir
  for unit in "${RUNNER_UNITS[@]}"; do
    dropin_dir="/etc/systemd/system/${unit}.d"
    sudo mkdir -p "$dropin_dir"
    render_resource_guard | sudo tee "${dropin_dir}/resource-guard.conf" >/dev/null
  done
}

#######################################
# Show the watchdog/restart/quota/state properties of every runner unit so
# the operator can confirm the result.
#######################################
report_units() {
  local unit
  for unit in "${RUNNER_UNITS[@]}"; do
    echo "=== ${unit} ==="
    systemctl show "$unit" \
      -p WatchdogUSec \
      -p NRestarts \
      -p DropInPaths \
      -p CPUQuotaPerSecUSec \
      -p MemoryMax \
      -p ActiveState \
      -p SubState
  done
}

#######################################
# Entry point: print the plan, then either stop (dry run) or apply it.
# Arguments: "$@" - script arguments; --apply as $1 enables changes.
# Returns:   0 on dry run or success; under `set -e` any failing step
#            aborts the script with that step's status.
#######################################
main() {
  local apply
  apply=$(apply_requested "$@")

  print_plan

  if [[ "$apply" != "1" ]]; then
    echo
    echo "Dry run only. Re-run with --apply to make changes."
    return 0
  fi

  disable_watchdog_dropin
  install_resource_guards

  sudo systemctl daemon-reload
  sudo systemctl restart "$WATCHDOG_UNIT"

  report_units
}

main "$@"
|
||||
Reference in New Issue
Block a user