fix(ops): align stale job cleanup thresholds
All checks were successful
Code Review / ai-code-review (push) Successful in 28s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 36s

This commit is contained in:
Your Name
2026-05-05 14:54:17 +08:00
parent 5e625f777d
commit 72d66e4ae6
5 changed files with 36 additions and 7 deletions

View File

@@ -1607,7 +1607,8 @@ psql $DATABASE_URL -f apps/api/migrations/cleanup_duplicate_deprecated_playbooks
| `2e128f9` | Gitea Code Review stale-run guard，避免快速連推堆疊多個 runner job |
| `3b73cc7` | CD paths 收斂，workflow-only commits 不再觸發完整 image build/deploy |
| `7d45f0c` | Docker textfile 補 `docker_container_started_seconds` + `DockerGiteaActionsJobStale` |
| `pending` | 110 stale Gitea Actions job dry-run cleanup script + runbook/alert annotation |
| `5e625f7` | 110 stale Gitea Actions job dry-run cleanup script + runbook/alert annotation |
| `pending` | stale job cleanup policy thresholds aligned with workflow/job timeout buffers |
### 下一步

View File

@@ -61,7 +61,7 @@ Use these thresholds for alerting and AI triage:
9. Runner services are part of the resource baseline even when jobs run outside Docker. Export `systemd_unit_*` metrics for all `actions.runner.*` services.
10. CI runner limits must be set at the systemd unit level. Docker container limits do not protect host load from runner listener, worker, and `act` helper processes.
11. Workflow-only commits must not trigger full CD image builds; CD should run only when runtime inputs change.
12. Stale Gitea Actions cleanup must be dry-run first. Use `--apply` only when the container is older than the workflow timeout and has no recent useful logs.
12. Stale Gitea Actions cleanup must be dry-run first. Use `--apply` only when the container is older than its workflow/job policy threshold and has no recent useful logs.
## Next Safe Rollout Order

View File

@@ -678,7 +678,7 @@ groups:
summary: "Gitea Actions job {{ $labels.container_name }} 執行超過 20 分鐘"
description: "{{ $labels.container_name }} 已超過 20 分鐘110 曾出現 code-review/CD job timeout 失效而持續佔用 runner。"
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker ps --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}; bash /home/wooo/scripts/stop-stale-gitea-actions-jobs.sh --min-age-seconds 1200'"
runbook: "先執行 dry-run若 logs 空白且超過 workflow timeout再用 --apply 停止 stale job container。不要停止仍在輸出或正在部署的 job。"
runbook: "先執行 dry-run清理腳本會依 workflow/job 名稱套停止門檻。若 logs 空白且超過該 job timeout buffer,再用 --apply 停止 stale job container。不要停止仍在輸出或正在部署的 job。"
- alert: SystemdRunnerRestartSpike
# 2026-05-05 ogt + Codex: 110 GitHub Actions runner watchdog loop was outside Docker/cAdvisor coverage.

View File

@@ -684,7 +684,7 @@ groups:
summary: "Gitea Actions job {{ $labels.container_name }} 執行超過 20 分鐘"
description: "{{ $labels.container_name }} 已超過 20 分鐘110 曾出現 code-review/CD job timeout 失效而持續佔用 runner。"
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker ps --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}; bash /home/wooo/scripts/stop-stale-gitea-actions-jobs.sh --min-age-seconds 1200'"
runbook: "先執行 dry-run若 logs 空白且超過 workflow timeout再用 --apply 停止 stale job container。不要停止仍在輸出或正在部署的 job。"
runbook: "先執行 dry-run清理腳本會依 workflow/job 名稱套停止門檻。若 logs 空白且超過該 job timeout buffer,再用 --apply 停止 stale job container。不要停止仍在輸出或正在部署的 job。"
- alert: SystemdRunnerRestartSpike
# 2026-05-05 ogt + Codex: 110 GitHub Actions runner watchdog loop was outside Docker/cAdvisor coverage.

View File

@@ -13,12 +13,37 @@ set -euo pipefail
# Safety rules:
# - Only touches Docker containers named GITEA-ACTIONS-*.
# - Defaults to containers older than 20 minutes.
# - Known long-running workflows get a higher stop threshold than the alert threshold.
# - Skips containers with recent log output unless --force is provided.
MIN_AGE_SECONDS=1200
APPLY=0
FORCE=0
# Print the stop threshold, in seconds, for a Gitea Actions job container.
#
# Globals:   MIN_AGE_SECONDS (read) - printed when no known workflow/job
#            pattern matches the container name.
# Arguments: $1 - container name to classify.
# Outputs:   the threshold in seconds on stdout.
#
# Patterns are tested in order and the first match wins. Some values are
# lower than the 1200s default; the function prints them unchanged.
threshold_for_name() {
  local container="$1"
  local seconds

  if [[ "$container" == *WORKFLOW-CD-Pipeline_JOB-deploy* ]]; then
    # The deploy job in .gitea/workflows/cd.yaml times out at 60m; leave
    # act/Gitea room to clean up before calling the container abandoned.
    seconds=4500
  elif [[ "$container" == *WORKFLOW-CD-Pipeline_JOB-tests* \
    || "$container" == *WORKFLOW-CD-Pipeline_JOB-post-deploy-checks* ]]; then
    seconds=2400
  elif [[ "$container" == *WORKFLOW-Code-Review_JOB-ai-code-review* ]]; then
    seconds=720
  elif [[ "$container" == *WORKFLOW-Deploy-Alert-Rules_JOB-deploy-alerts* ]]; then
    seconds=900
  else
    # Unknown workflow/job: fall back to the script-wide minimum age.
    seconds="$MIN_AGE_SECONDS"
  fi

  echo "$seconds"
}
while [[ $# -gt 0 ]]; do
case "$1" in
--apply)
@@ -48,7 +73,9 @@ while read -r name; do
started_raw="$(docker inspect "$name" --format '{{.State.StartedAt}}')"
started="$(date -u -d "$started_raw" +%s 2>/dev/null || echo 0)"
age=$((now - started))
[[ "$age" -ge "$MIN_AGE_SECONDS" ]] || continue
stop_threshold="$(threshold_for_name "$name")"
[[ "$stop_threshold" -ge "$MIN_AGE_SECONDS" ]] || stop_threshold="$MIN_AGE_SECONDS"
[[ "$age" -ge "$stop_threshold" ]] || continue
found=1
log_tail="$(docker logs --since 5m --tail 5 "$name" 2>&1 || true)"
@@ -57,7 +84,8 @@ while read -r name; do
has_recent_logs=1
fi
printf 'candidate name=%s age_seconds=%s recent_logs=%s\n' "$name" "$age" "$has_recent_logs"
printf 'candidate name=%s age_seconds=%s stop_threshold_seconds=%s recent_logs=%s\n' \
"$name" "$age" "$stop_threshold" "$has_recent_logs"
if [[ "$has_recent_logs" == "1" ]]; then
printf '%s\n' "$log_tail" | sed 's/^/ log: /'
fi
@@ -72,5 +100,5 @@ while read -r name; do
done < <(docker ps --format '{{.Names}}' | grep '^GITEA-ACTIONS-' || true)
if [[ "$found" == "0" ]]; then
echo "No stale Gitea Actions containers older than ${MIN_AGE_SECONDS}s."
echo "No stale Gitea Actions containers older than policy threshold (minimum ${MIN_AGE_SECONDS}s)."
fi