diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 8c16d683..e5d76e8c 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1606,13 +1606,15 @@ psql $DATABASE_URL -f apps/api/migrations/cleanup_duplicate_deprecated_playbooks | `ab0f0a8` | deploy API image `runner-classify-20260505-0e14935` | | `2e128f9` | Gitea Code Review stale-run guard,避免快速連推堆疊多個 runner job | | `3b73cc7` | CD paths 收斂,workflow-only commits 不再觸發完整 image build/deploy | -| `pending` | Docker textfile 補 `docker_container_started_seconds` + `DockerGiteaActionsJobStale` | +| `7d45f0c` | Docker textfile 補 `docker_container_started_seconds` + `DockerGiteaActionsJobStale` | +| `pending` | 110 stale Gitea Actions job dry-run cleanup script + runbook/alert annotation | ### 下一步 1. 在 110 以 sudo 執行 `bash scripts/ops/apply-runner-systemd-guardrails.sh --apply`。 2. 驗證 Prometheus 的 `SystemdRunnerWatchdogEnabled` / `SystemdRunnerMissingResourceQuota` 消失。 -3. 觀察 110 load5/core 是否穩定低於 1.5,若仍高再調 Sentry ingestion/ClickHouse parts。 +3. 部署 `/home/wooo/scripts/stop-stale-gitea-actions-jobs.sh`,讓 `DockerGiteaActionsJobStale` 先 dry-run、再人工/AI 審核後 `--apply`。 +4. 觀察 110 load5/core 是否穩定低於 1.5,若仍高再調 Sentry ingestion/ClickHouse parts。 --- diff --git a/docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md b/docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md index e48b99c3..1fae6a31 100644 --- a/docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md +++ b/docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md @@ -46,7 +46,7 @@ Use these thresholds for alerting and AI triage: | Systemd runner restarts | > 2 in 15m | Critical; inspect watchdog/drop-ins and active CI jobs. | | Systemd runner WatchdogSec | > 0 for 10m | Warning; GitHub Actions runner should not be killed by systemd watchdog. | | Systemd runner quota | CPU or memory unlimited for 30m | Warning; apply CPUQuota/MemoryMax or move CI away from Sentry host. | -| Gitea Actions job runtime | > 20m for 5m | Warning; inspect logs and stop stale job containers if they outlive workflow timeout. 
| +| Gitea Actions job runtime | > 20m for 5m | Warning; inspect logs and run `/home/wooo/scripts/stop-stale-gitea-actions-jobs.sh` dry-run before stopping stale job containers. | ## Rules @@ -61,6 +61,7 @@ Use these thresholds for alerting and AI triage: 9. Runner services are part of the resource baseline even when jobs run outside Docker. Export `systemd_unit_*` metrics for all `actions.runner.*` services. 10. CI runner limits must be set at the systemd unit level. Docker container limits do not protect host load from runner listener, worker, and `act` helper processes. 11. Workflow-only commits must not trigger full CD image builds; CD should run only when runtime inputs change. +12. Stale Gitea Actions cleanup must be dry-run first. Use `--apply` only when the container is older than the workflow timeout and has no recent useful logs. ## Next Safe Rollout Order @@ -70,7 +71,8 @@ Use these thresholds for alerting and AI triage: 4. Tune `momo-scheduler` crawler concurrency on 188; keep 2 CPU / 2 GiB until success rate and latency prove it is too low. 5. Fix 188 Elephant Alpha/OpenClaw allowed-action drift before enabling resource auto-repair beyond diagnosis. 6. Add modest caps to currently unlimited low-risk services in small batches. -7. Fix 110 runner services with sudo-capable host maintenance: +7. Deploy `scripts/ops/stop-stale-gitea-actions-jobs.sh` to 110 as `/home/wooo/scripts/stop-stale-gitea-actions-jobs.sh`; keep Prometheus auto action in dry-run mode. +8. 
Fix 110 runner services with sudo-capable host maintenance: ```bash unit=actions.runner.owenhytsai-awoooi.awoooi-110.service diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml index e0ac963f..d3c6763f 100644 --- a/ops/monitoring/alerts-unified.yml +++ b/ops/monitoring/alerts-unified.yml @@ -677,8 +677,8 @@ groups: annotations: summary: "Gitea Actions job {{ $labels.container_name }} 執行超過 20 分鐘" description: "{{ $labels.container_name }} 已超過 20 分鐘,110 曾出現 code-review/CD job timeout 失效而持續佔用 runner。" - auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker ps --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}'" - runbook: "先確認是否仍有輸出與是否為 CD 關鍵步驟;若 logs 空白且超過 workflow timeout,可人工停止 stale job container。" + auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker ps --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}; bash /home/wooo/scripts/stop-stale-gitea-actions-jobs.sh --min-age-seconds 1200'" + runbook: "先執行 dry-run;若 logs 空白且超過 workflow timeout,再用 --apply 停止 stale job container。不要停止仍在輸出或正在部署的 job。" - alert: SystemdRunnerRestartSpike # 2026-05-05 ogt + Codex: 110 GitHub Actions runner watchdog loop was outside Docker/cAdvisor coverage. 
diff --git a/ops/monitoring/alerts.yml b/ops/monitoring/alerts.yml index 6abb823d..b123aceb 100644 --- a/ops/monitoring/alerts.yml +++ b/ops/monitoring/alerts.yml @@ -683,8 +683,8 @@ groups: annotations: summary: "Gitea Actions job {{ $labels.container_name }} 執行超過 20 分鐘" description: "{{ $labels.container_name }} 已超過 20 分鐘,110 曾出現 code-review/CD job timeout 失效而持續佔用 runner。" - auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker ps --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}'" - runbook: "先確認是否仍有輸出與是否為 CD 關鍵步驟;若 logs 空白且超過 workflow timeout,可人工停止 stale job container。" + auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker ps --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}; bash /home/wooo/scripts/stop-stale-gitea-actions-jobs.sh --min-age-seconds 1200'" + runbook: "先執行 dry-run;若 logs 空白且超過 workflow timeout,再用 --apply 停止 stale job container。不要停止仍在輸出或正在部署的 job。" - alert: SystemdRunnerRestartSpike # 2026-05-05 ogt + Codex: 110 GitHub Actions runner watchdog loop was outside Docker/cAdvisor coverage. diff --git a/scripts/ops/stop-stale-gitea-actions-jobs.sh b/scripts/ops/stop-stale-gitea-actions-jobs.sh new file mode 100644 index 00000000..23a63f32 --- /dev/null +++ b/scripts/ops/stop-stale-gitea-actions-jobs.sh @@ -0,0 +1,76 @@ +#!/usr/bin/env bash +set -euo pipefail + +# 2026-05-05 ogt + Codex +# Find or stop stale Gitea/act job containers on the 110 host. +# +# Default is dry-run: +# bash scripts/ops/stop-stale-gitea-actions-jobs.sh +# +# Apply after reviewing candidates: +# bash scripts/ops/stop-stale-gitea-actions-jobs.sh --apply +# +# Safety rules: +# - Only touches Docker containers named GITEA-ACTIONS-*. +# - Defaults to containers older than 20 minutes. +# - Skips containers with recent log output unless --force is provided. 
# Defaults; main() overrides them from command-line flags.
MIN_AGE_SECONDS=1200 # minimum container age (seconds) before it is a candidate
APPLY=0              # 1 = actually stop candidates; 0 = dry-run (report only)
FORCE=0              # 1 = stop even when the container logged recently

#######################################
# Convert a container start timestamp to epoch seconds.
# Arguments: $1 - timestamp string as printed by `docker inspect`
# Outputs:   epoch seconds on stdout
# Returns:   0 on success; 1 when the value is empty, unparsable, or does not
#            yield a positive epoch. Callers must treat failure as "age
#            unknown" and never as "very old".
#######################################
started_epoch() {
  local raw=${1-} epoch
  # GNU date parses an empty string as "today 00:00", so reject it explicitly.
  [[ -n "$raw" ]] || return 1
  epoch="$(date -u -d "$raw" +%s 2>/dev/null)" || return 1
  # Rejects negative epochs (a leading '-' fails the regex) and any junk.
  [[ "$epoch" =~ ^[0-9]+$ ]] || return 1
  ((epoch > 0)) || return 1
  printf '%s\n' "$epoch"
}

#######################################
# Report (and with --apply, stop) running containers named GITEA-ACTIONS-*
# whose age is at least MIN_AGE_SECONDS.
# Globals:   MIN_AGE_SECONDS, APPLY, FORCE (written from flags, then read)
# Arguments: [--apply] [--force] [--min-age-seconds N]
# Outputs:   one "candidate ..." line per stale container on stdout, followed
#            by its recent log lines when present; diagnostics on stderr
# Returns:   0 on success; 1 if listing, inspecting, or stopping a container
#            failed. Exits 2 on invalid arguments.
#######################################
main() {
  local now names name started_raw started age log_tail has_recent_logs
  local found=0 failed=0

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --apply)
        APPLY=1
        shift
        ;;
      --force)
        FORCE=1
        shift
        ;;
      --min-age-seconds)
        MIN_AGE_SECONDS="${2:?--min-age-seconds requires a value}"
        # Operands of numeric comparisons are evaluated as arithmetic
        # expressions, so only plain digits are accepted here.
        if [[ ! "$MIN_AGE_SECONDS" =~ ^[0-9]+$ ]]; then
          echo "--min-age-seconds must be a non-negative integer: $MIN_AGE_SECONDS" >&2
          exit 2
        fi
        # Force base 10 so values such as 08 are not read as invalid octal.
        MIN_AGE_SECONDS=$((10#$MIN_AGE_SECONDS))
        shift 2
        ;;
      *)
        echo "Unknown argument: $1" >&2
        exit 2
        ;;
    esac
  done

  now="$(date +%s)"

  # List containers up front so a docker failure is reported instead of being
  # mistaken for "nothing is stale".
  if ! names="$(docker ps --format '{{.Names}}')"; then
    echo "docker ps failed; cannot list running containers" >&2
    return 1
  fi

  # Names are read from fd 3 so no command in the loop body can consume them
  # from stdin.
  while IFS= read -r name <&3; do
    [[ "$name" == GITEA-ACTIONS-* ]] || continue

    # The container may exit between listing and inspection; skip it rather
    # than aborting the whole sweep.
    if ! started_raw="$(docker inspect "$name" --format '{{.State.StartedAt}}')"; then
      echo "skip $name: docker inspect failed" >&2
      failed=1
      continue
    fi

    # Unknown age must never be treated as "stale": skip instead of assuming 0.
    if ! started="$(started_epoch "$started_raw")"; then
      echo "skip $name: cannot parse start time '$started_raw'" >&2
      failed=1
      continue
    fi

    age=$((now - started))
    ((age >= MIN_AGE_SECONDS)) || continue

    found=1
    # stderr is merged on purpose: if `docker logs` itself errors, the error
    # text counts as output and the container is not stopped without --force.
    log_tail="$(docker logs --since 5m --tail 5 "$name" 2>&1 || true)"
    has_recent_logs=0
    if [[ -n "$log_tail" ]]; then
      has_recent_logs=1
    fi

    printf 'candidate name=%s age_seconds=%s recent_logs=%s\n' "$name" "$age" "$has_recent_logs"
    if [[ "$has_recent_logs" == "1" ]]; then
      printf '%s\n' "$log_tail" | sed 's/^/ log: /'
    fi

    if [[ "$APPLY" == "1" ]]; then
      if [[ "$has_recent_logs" == "1" && "$FORCE" != "1" ]]; then
        echo "skip $name: recent logs exist; use --force only after manual review"
        continue
      fi
      # Keep going when one stop fails so the remaining candidates are handled.
      if ! docker stop "$name"; then
        echo "failed to stop $name" >&2
        failed=1
      fi
    fi
  done 3<<<"$names"

  if [[ "$found" == "0" ]]; then
    echo "No stale Gitea Actions containers older than ${MIN_AGE_SECONDS}s."
  fi

  return "$failed"
}

main "$@"