fix(ops): align stale job cleanup thresholds
This commit is contained in:
@@ -1607,7 +1607,8 @@ psql $DATABASE_URL -f apps/api/migrations/cleanup_duplicate_deprecated_playbooks
|
||||
| `2e128f9` | Gitea Code Review stale-run guard,避免快速連推堆疊多個 runner job |
|
||||
| `3b73cc7` | CD paths 收斂,workflow-only commits 不再觸發完整 image build/deploy |
|
||||
| `7d45f0c` | Docker textfile 補 `docker_container_started_seconds` + `DockerGiteaActionsJobStale` |
|
||||
| `pending` | 110 stale Gitea Actions job dry-run cleanup script + runbook/alert annotation |
|
||||
| `5e625f7` | 110 stale Gitea Actions job dry-run cleanup script + runbook/alert annotation |
|
||||
| `pending` | stale job cleanup policy thresholds aligned with workflow/job timeout buffers |
|
||||
|
||||
### 下一步
|
||||
|
||||
|
||||
@@ -61,7 +61,7 @@ Use these thresholds for alerting and AI triage:
|
||||
9. Runner services are part of the resource baseline even when jobs run outside Docker. Export `systemd_unit_*` metrics for all `actions.runner.*` services.
|
||||
10. CI runner limits must be set at the systemd unit level. Docker container limits do not protect host load from runner listener, worker, and `act` helper processes.
|
||||
11. Workflow-only commits must not trigger full CD image builds; CD should run only when runtime inputs change.
|
||||
12. Stale Gitea Actions cleanup must be dry-run first. Use `--apply` only when the container is older than the workflow timeout and has no recent useful logs.
|
||||
12. Stale Gitea Actions cleanup must be dry-run first. Use `--apply` only when the container is older than its workflow/job policy threshold and has no recent useful logs.
|
||||
|
||||
## Next Safe Rollout Order
|
||||
|
||||
|
||||
@@ -678,7 +678,7 @@ groups:
|
||||
summary: "Gitea Actions job {{ $labels.container_name }} 執行超過 20 分鐘"
|
||||
description: "{{ $labels.container_name }} 已超過 20 分鐘,110 曾出現 code-review/CD job timeout 失效而持續佔用 runner。"
|
||||
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker ps --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}; bash /home/wooo/scripts/stop-stale-gitea-actions-jobs.sh --min-age-seconds 1200'"
|
||||
runbook: "先執行 dry-run;若 logs 空白且超過 workflow timeout,再用 --apply 停止 stale job container。不要停止仍在輸出或正在部署的 job。"
|
||||
runbook: "先執行 dry-run;清理腳本會依 workflow/job 名稱套用對應的停止門檻。若 logs 空白且超過該 job timeout buffer,再用 --apply 停止 stale job container。不要停止仍在輸出或正在部署的 job。"
|
||||
|
||||
- alert: SystemdRunnerRestartSpike
|
||||
# 2026-05-05 ogt + Codex: 110 GitHub Actions runner watchdog loop was outside Docker/cAdvisor coverage.
|
||||
|
||||
@@ -684,7 +684,7 @@ groups:
|
||||
summary: "Gitea Actions job {{ $labels.container_name }} 執行超過 20 分鐘"
|
||||
description: "{{ $labels.container_name }} 已超過 20 分鐘,110 曾出現 code-review/CD job timeout 失效而持續佔用 runner。"
|
||||
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker ps --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}; bash /home/wooo/scripts/stop-stale-gitea-actions-jobs.sh --min-age-seconds 1200'"
|
||||
runbook: "先執行 dry-run;若 logs 空白且超過 workflow timeout,再用 --apply 停止 stale job container。不要停止仍在輸出或正在部署的 job。"
|
||||
runbook: "先執行 dry-run;清理腳本會依 workflow/job 名稱套用對應的停止門檻。若 logs 空白且超過該 job timeout buffer,再用 --apply 停止 stale job container。不要停止仍在輸出或正在部署的 job。"
|
||||
|
||||
- alert: SystemdRunnerRestartSpike
|
||||
# 2026-05-05 ogt + Codex: 110 GitHub Actions runner watchdog loop was outside Docker/cAdvisor coverage.
|
||||
|
||||
@@ -13,12 +13,37 @@ set -euo pipefail
|
||||
# Safety rules:
|
||||
# - Only touches Docker containers named GITEA-ACTIONS-*.
|
||||
# - Defaults to containers older than 20 minutes.
|
||||
# - Known long-running workflows get a higher stop threshold than the alert threshold.
|
||||
# - Skips containers with recent log output unless --force is provided.
|
||||
|
||||
MIN_AGE_SECONDS=1200
|
||||
APPLY=0
|
||||
FORCE=0
|
||||
|
||||
#######################################
# Print the stop threshold (in seconds) for a Gitea Actions job container,
# selected by workflow/job markers embedded in the container name.
#
# Some per-job values (720, 900) are lower than the default
# MIN_AGE_SECONDS of 1200; the main loop raises any result below
# MIN_AGE_SECONDS to that minimum, so they only take effect when the
# minimum has been lowered.
#
# Globals:   MIN_AGE_SECONDS (read) - fallback for unrecognised names
# Arguments: $1 - Docker container name (GITEA-ACTIONS-*)
# Outputs:   threshold in seconds on stdout
#######################################
threshold_for_name() {
  local name="$1"
  local threshold

  case "$name" in
    *WORKFLOW-CD-Pipeline_JOB-deploy*)
      # .gitea/workflows/cd.yaml deploy job timeout is 60m. Give act/Gitea
      # cleanup a buffer before treating the container as abandoned.
      threshold=4500  # 75 minutes
      ;;
    *WORKFLOW-CD-Pipeline_JOB-tests* | *WORKFLOW-CD-Pipeline_JOB-post-deploy-checks*)
      threshold=2400  # 40 minutes
      ;;
    *WORKFLOW-Code-Review_JOB-ai-code-review*)
      threshold=720   # 12 minutes
      ;;
    *WORKFLOW-Deploy-Alert-Rules_JOB-deploy-alerts*)
      threshold=900   # 15 minutes
      ;;
    *)
      # Unknown workflow/job: use the script-wide minimum age.
      threshold="$MIN_AGE_SECONDS"
      ;;
  esac

  printf '%s\n' "$threshold"
}
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case "$1" in
|
||||
--apply)
|
||||
@@ -48,7 +73,9 @@ while read -r name; do
|
||||
started_raw="$(docker inspect "$name" --format '{{.State.StartedAt}}')"
|
||||
started="$(date -u -d "$started_raw" +%s 2>/dev/null || echo 0)"
|
||||
age=$((now - started))
|
||||
[[ "$age" -ge "$MIN_AGE_SECONDS" ]] || continue
|
||||
stop_threshold="$(threshold_for_name "$name")"
|
||||
[[ "$stop_threshold" -ge "$MIN_AGE_SECONDS" ]] || stop_threshold="$MIN_AGE_SECONDS"
|
||||
[[ "$age" -ge "$stop_threshold" ]] || continue
|
||||
|
||||
found=1
|
||||
log_tail="$(docker logs --since 5m --tail 5 "$name" 2>&1 || true)"
|
||||
@@ -57,7 +84,8 @@ while read -r name; do
|
||||
has_recent_logs=1
|
||||
fi
|
||||
|
||||
printf 'candidate name=%s age_seconds=%s recent_logs=%s\n' "$name" "$age" "$has_recent_logs"
|
||||
printf 'candidate name=%s age_seconds=%s stop_threshold_seconds=%s recent_logs=%s\n' \
|
||||
"$name" "$age" "$stop_threshold" "$has_recent_logs"
|
||||
if [[ "$has_recent_logs" == "1" ]]; then
|
||||
printf '%s\n' "$log_tail" | sed 's/^/ log: /'
|
||||
fi
|
||||
@@ -72,5 +100,5 @@ while read -r name; do
|
||||
done < <(docker ps --format '{{.Names}}' | grep '^GITEA-ACTIONS-' || true)
|
||||
|
||||
if [[ "$found" == "0" ]]; then
|
||||
echo "No stale Gitea Actions containers older than ${MIN_AGE_SECONDS}s."
|
||||
echo "No stale Gitea Actions containers older than policy threshold (minimum ${MIN_AGE_SECONDS}s)."
|
||||
fi
|
||||
|
||||
Reference in New Issue
Block a user