From 34d1c76be92bae4d38b8a7a5e80dcf57bd28ad8e Mon Sep 17 00:00:00 2001
From: Your Name
Date: Tue, 5 May 2026 14:19:58 +0800
Subject: [PATCH] fix(ops): route systemd runner baseline alerts

---
 apps/api/alert_rules.yaml                     | 29 +++++++
 .../ops/apply-runner-systemd-guardrails.sh    | 85 +++++++++++++++++++
 2 files changed, 114 insertions(+)
 create mode 100755 scripts/ops/apply-runner-systemd-guardrails.sh

diff --git a/apps/api/alert_rules.yaml b/apps/api/alert_rules.yaml
index 700d6821..3d494b1b 100644
--- a/apps/api/alert_rules.yaml
+++ b/apps/api/alert_rules.yaml
@@ -194,6 +194,35 @@ rules:
           command: "由 AI 根據 evidence snapshot 選擇已驗證 playbook"
       reasoning: "[規則匹配] 長期過載先 read-only 診斷與分流,禁止通用 docker restart;修復必須服務專屬且可回寫 Playbook trust。"
 
+  # 2026-05-05 ogt + Codex: the 110 self-hosted runners are systemd services and are not covered by Docker/cAdvisor.
+  # Principle: AI may auto-diagnose watchdog/quota/restart storms; applying a systemd drop-in needs sudo, so it must go through manual approval or a sudo playbook.
+  - id: systemd_runner_baseline_alert
+    priority: 43
+    description: 110 self-hosted runner systemd watchdog / restart / quota 基線告警
+    match:
+      alertname:
+        - SystemdRunnerRestartSpike
+        - SystemdRunnerWatchdogEnabled
+        - SystemdRunnerMissingResourceQuota
+    response:
+      action_title: "🔍 Systemd Runner 基線診斷 — 需要 sudo 才可修復"
+      description: "110 self-hosted runner 發生 watchdog/restart storm 或缺 CPU/Memory quota。這會讓 CI 與 Sentry/ClickHouse/Gitea 搶主機資源,且 Docker/cAdvisor 看不到。"
+      suggested_action: SSH_DIAGNOSE
+      kubectl_command: "ssh {host} 'systemctl show {unit} -p WatchdogUSec -p NRestarts -p DropInPaths -p CPUQuotaPerSecUSec -p MemoryMax -p ActiveState -p SubState; journalctl -u {unit} --since \"20 minutes ago\" --no-pager | tail -120'"
+      estimated_downtime: "N/A"
+      risk: low
+      responsibility: INFRA
+      responsibility_reasoning: "self-hosted runner 是 bare-metal systemd 資源治理,非 K8s 或 Docker workload"
+      secondary_teams: [SRE]
+      optimization:
+        - type: SYSTEMD_GUARDRAIL
+          description: "人工批准後停用錯誤 watchdog drop-in,並為 runner 加 CPUQuota=200%、MemoryMax=2G"
+          command: "bash scripts/ops/apply-runner-systemd-guardrails.sh --apply"
+        - type: CI_CAPACITY
+          description: "若 110 同時承載 Sentry/ClickHouse/Gitea,不應讓多個 runner 無限制並行"
+          command: "檢查 active jobs、runner 數量與 Gitea Actions concurrency,必要時分流 runner"
+      reasoning: "[規則匹配] systemd runner 過載先 read-only 診斷;改 systemd drop-in 需 sudo 與人工批准,避免 AI 擅自改 host unit。"
+
   - id: high_cpu
     priority: 40
     description: K8s Pod/Deployment CPU 使用率過高
diff --git a/scripts/ops/apply-runner-systemd-guardrails.sh b/scripts/ops/apply-runner-systemd-guardrails.sh
new file mode 100755
index 00000000..0f77526b
--- /dev/null
+++ b/scripts/ops/apply-runner-systemd-guardrails.sh
@@ -0,0 +1,85 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# 2026-05-05 ogt + Codex
+# Apply resource guardrails to 110 self-hosted runner systemd services.
+#
+# This script intentionally requires sudo. Run it from 110 when the operator
+# approves host-level systemd changes:
+#
+#   bash scripts/ops/apply-runner-systemd-guardrails.sh --apply
+#
+# Without --apply it prints the exact changes and exits.
+
+APPLY=0
+if [[ "${1:-}" == "--apply" ]]; then
+  APPLY=1
+fi
+
+# Every runner unit that receives the resource-guard drop-in.
+RUNNER_UNITS=(
+  actions.runner.owenhytsai-awoooi.awoooi-110.service
+  actions.runner.owenhytsai-awoooi.awoooi-110-3.service
+  actions.runner.owenhytsai-wooo-aiops.wooo-110-runner-2.service
+  actions.runner.owenhytsai-wooo-aiops.wooo-110-runner-3.service
+  actions.runner.owenhytsai-wooo-aiops.wooo-runner-110.service
+)
+
+# The only unit whose watchdog drop-in is disabled and that gets restarted.
+WATCHDOG_UNIT="actions.runner.owenhytsai-awoooi.awoooi-110.service"
+WATCHDOG_DROPIN="/etc/systemd/system/${WATCHDOG_UNIT}.d/watchdog.conf"
+WATCHDOG_BACKUP="${WATCHDOG_DROPIN}.disabled-20260505"
+
+echo "Runner systemd guardrail plan:"
+echo "- Disable bad watchdog drop-in: ${WATCHDOG_DROPIN}"
+echo "- Set CPUAccounting=yes, CPUQuota=200%, MemoryAccounting=yes, MemoryMax=2G"
+echo "- Restart only ${WATCHDOG_UNIT}; other runner units pick up quotas on next restart"
+
+if [[ "$APPLY" != "1" ]]; then
+  echo
+  echo "Dry run only. Re-run with --apply to make changes."
+  exit 0
+fi
+
+# Pre-flight: the restart below fails when this unit is unknown to systemd
+# (e.g. the script was started on the wrong host). Check it first so that
+# this failure happens before any file under /etc/systemd/system is changed.
+if ! systemctl cat "$WATCHDOG_UNIT" >/dev/null 2>&1; then
+  echo "ERROR: ${WATCHDOG_UNIT} not found on this host; no changes made." >&2
+  exit 1
+fi
+
+# Move the watchdog drop-in aside instead of deleting it. --backup=numbered
+# keeps a backup from an earlier run rather than silently overwriting it.
+if [[ -f "$WATCHDOG_DROPIN" ]]; then
+  sudo mv --backup=numbered "$WATCHDOG_DROPIN" "$WATCHDOG_BACKUP"
+fi
+
+# Write the same resource-guard drop-in for every runner unit. The heredoc
+# delimiter is quoted so the body is written verbatim.
+for unit in "${RUNNER_UNITS[@]}"; do
+  sudo mkdir -p "/etc/systemd/system/${unit}.d"
+  sudo tee "/etc/systemd/system/${unit}.d/resource-guard.conf" >/dev/null <<'EOF'
+[Service]
+CPUAccounting=yes
+CPUQuota=200%
+MemoryAccounting=yes
+MemoryMax=2G
+EOF
+done
+
+sudo systemctl daemon-reload
+sudo systemctl restart "$WATCHDOG_UNIT"
+
+# Print the resulting state of each unit so the operator can verify it.
+for unit in "${RUNNER_UNITS[@]}"; do
+  echo "=== ${unit} ==="
+  systemctl show "$unit" \
+    -p WatchdogUSec \
+    -p NRestarts \
+    -p DropInPaths \
+    -p CPUQuotaPerSecUSec \
+    -p MemoryMax \
+    -p ActiveState \
+    -p SubState
+done