From 72d66e4ae6b1d067bdc9bb762b21ed574eab5ff3 Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 5 May 2026 14:54:17 +0800 Subject: [PATCH] fix(ops): align stale job cleanup thresholds --- docs/LOGBOOK.md | 3 +- .../HOST-RESOURCE-BASELINE-110-188.md | 2 +- ops/monitoring/alerts-unified.yml | 2 +- ops/monitoring/alerts.yml | 2 +- scripts/ops/stop-stale-gitea-actions-jobs.sh | 34 +++++++++++++++++-- 5 files changed, 36 insertions(+), 7 deletions(-) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index e5d76e8c..9be7d2ba 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1607,7 +1607,8 @@ psql $DATABASE_URL -f apps/api/migrations/cleanup_duplicate_deprecated_playbooks | `2e128f9` | Gitea Code Review stale-run guard,避免快速連推堆疊多個 runner job | | `3b73cc7` | CD paths 收斂,workflow-only commits 不再觸發完整 image build/deploy | | `7d45f0c` | Docker textfile 補 `docker_container_started_seconds` + `DockerGiteaActionsJobStale` | -| `pending` | 110 stale Gitea Actions job dry-run cleanup script + runbook/alert annotation | +| `5e625f7` | 110 stale Gitea Actions job dry-run cleanup script + runbook/alert annotation | +| `pending` | stale job cleanup policy thresholds aligned with workflow/job timeout buffers | ### 下一步 diff --git a/docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md b/docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md index 1fae6a31..9c79d074 100644 --- a/docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md +++ b/docs/runbooks/HOST-RESOURCE-BASELINE-110-188.md @@ -61,7 +61,7 @@ Use these thresholds for alerting and AI triage: 9. Runner services are part of the resource baseline even when jobs run outside Docker. Export `systemd_unit_*` metrics for all `actions.runner.*` services. 10. CI runner limits must be set at the systemd unit level. Docker container limits do not protect host load from runner listener, worker, and `act` helper processes. 11. Workflow-only commits must not trigger full CD image builds; CD should run only when runtime inputs change. -12. Stale Gitea Actions cleanup must be dry-run first. Use `--apply` only when the container is older than the workflow timeout and has no recent useful logs. +12. Stale Gitea Actions cleanup must be dry-run first. Use `--apply` only when the container is older than its workflow/job policy threshold and has no recent useful logs. ## Next Safe Rollout Order diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml index d3c6763f..1f9f28d9 100644 --- a/ops/monitoring/alerts-unified.yml +++ b/ops/monitoring/alerts-unified.yml @@ -678,7 +678,7 @@ groups: summary: "Gitea Actions job {{ $labels.container_name }} 執行超過 20 分鐘" description: "{{ $labels.container_name }} 已超過 20 分鐘,110 曾出現 code-review/CD job timeout 失效而持續佔用 runner。" auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker ps --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}; bash /home/wooo/scripts/stop-stale-gitea-actions-jobs.sh --min-age-seconds 1200'" - runbook: "先執行 dry-run;若 logs 空白且超過 workflow timeout,再用 --apply 停止 stale job container。不要停止仍在輸出或正在部署的 job。" + runbook: "先執行 dry-run;清理腳本會依 workflow/job 名稱套停止門檻。若 logs 空白且超過該 job timeout buffer,再用 --apply 停止 stale job container。不要停止仍在輸出或正在部署的 job。" - alert: SystemdRunnerRestartSpike # 2026-05-05 ogt + Codex: 110 GitHub Actions runner watchdog loop was outside Docker/cAdvisor coverage. diff --git a/ops/monitoring/alerts.yml b/ops/monitoring/alerts.yml index b123aceb..388dd581 100644 --- a/ops/monitoring/alerts.yml +++ b/ops/monitoring/alerts.yml @@ -684,7 +684,7 @@ groups: summary: "Gitea Actions job {{ $labels.container_name }} 執行超過 20 分鐘" description: "{{ $labels.container_name }} 已超過 20 分鐘,110 曾出現 code-review/CD job timeout 失效而持續佔用 runner。" auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker ps --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}; bash /home/wooo/scripts/stop-stale-gitea-actions-jobs.sh --min-age-seconds 1200'" - runbook: "先執行 dry-run;若 logs 空白且超過 workflow timeout,再用 --apply 停止 stale job container。不要停止仍在輸出或正在部署的 job。" + runbook: "先執行 dry-run;清理腳本會依 workflow/job 名稱套停止門檻。若 logs 空白且超過該 job timeout buffer,再用 --apply 停止 stale job container。不要停止仍在輸出或正在部署的 job。" - alert: SystemdRunnerRestartSpike # 2026-05-05 ogt + Codex: 110 GitHub Actions runner watchdog loop was outside Docker/cAdvisor coverage. diff --git a/scripts/ops/stop-stale-gitea-actions-jobs.sh b/scripts/ops/stop-stale-gitea-actions-jobs.sh index 23a63f32..181fede0 100644 --- a/scripts/ops/stop-stale-gitea-actions-jobs.sh +++ b/scripts/ops/stop-stale-gitea-actions-jobs.sh @@ -13,12 +13,37 @@ set -euo pipefail # Safety rules: # - Only touches Docker containers named GITEA-ACTIONS-*. # - Defaults to containers older than 20 minutes. +# - Known long-running workflows get a higher stop threshold than the alert threshold. # - Skips containers with recent log output unless --force is provided. MIN_AGE_SECONDS=1200 APPLY=0 FORCE=0 +threshold_for_name() { + local name="$1" + + case "$name" in + *WORKFLOW-CD-Pipeline_JOB-deploy*) + # .gitea/workflows/cd.yaml deploy job timeout is 60m. Give act/Gitea + # cleanup a buffer before treating the container as abandoned. + echo 4500 + ;; + *WORKFLOW-CD-Pipeline_JOB-tests*|*WORKFLOW-CD-Pipeline_JOB-post-deploy-checks*) + echo 2400 + ;; + *WORKFLOW-Code-Review_JOB-ai-code-review*) + echo 720 + ;; + *WORKFLOW-Deploy-Alert-Rules_JOB-deploy-alerts*) + echo 900 + ;; + *) + echo "$MIN_AGE_SECONDS" + ;; + esac +} + while [[ $# -gt 0 ]]; do case "$1" in --apply) @@ -48,7 +73,9 @@ while read -r name; do started_raw="$(docker inspect "$name" --format '{{.State.StartedAt}}')" started="$(date -u -d "$started_raw" +%s 2>/dev/null || echo 0)" age=$((now - started)) - [[ "$age" -ge "$MIN_AGE_SECONDS" ]] || continue + stop_threshold="$(threshold_for_name "$name")" + [[ "$stop_threshold" -ge "$MIN_AGE_SECONDS" ]] || stop_threshold="$MIN_AGE_SECONDS" + [[ "$age" -ge "$stop_threshold" ]] || continue found=1 log_tail="$(docker logs --since 5m --tail 5 "$name" 2>&1 || true)" @@ -57,7 +84,8 @@ while read -r name; do has_recent_logs=1 fi - printf 'candidate name=%s age_seconds=%s recent_logs=%s\n' "$name" "$age" "$has_recent_logs" + printf 'candidate name=%s age_seconds=%s stop_threshold_seconds=%s recent_logs=%s\n' \ + "$name" "$age" "$stop_threshold" "$has_recent_logs" if [[ "$has_recent_logs" == "1" ]]; then printf '%s\n' "$log_tail" | sed 's/^/ log: /' fi @@ -72,5 +100,5 @@ while read -r name; do done < <(docker ps --format '{{.Names}}' | grep '^GITEA-ACTIONS-' || true) if [[ "$found" == "0" ]]; then - echo "No stale Gitea Actions containers older than ${MIN_AGE_SECONDS}s." + echo "No stale Gitea Actions containers older than policy threshold (minimum ${MIN_AGE_SECONDS}s)." fi