fix(ops): add stale gitea job cleanup guard
This commit is contained in:
@@ -1606,13 +1606,15 @@ psql $DATABASE_URL -f apps/api/migrations/cleanup_duplicate_deprecated_playbooks
|
||||
| `ab0f0a8` | deploy API image `runner-classify-20260505-0e14935` |
|
||||
| `2e128f9` | Gitea Code Review stale-run guard,避免快速連推堆疊多個 runner job |
|
||||
| `3b73cc7` | CD paths 收斂,workflow-only commits 不再觸發完整 image build/deploy |
|
||||
| `pending` | Docker textfile 補 `docker_container_started_seconds` + `DockerGiteaActionsJobStale` |
|
||||
| `7d45f0c` | Docker textfile 補 `docker_container_started_seconds` + `DockerGiteaActionsJobStale` |
|
||||
| `pending` | 110 stale Gitea Actions job dry-run cleanup script + runbook/alert annotation |
|
||||
|
||||
### 下一步
|
||||
|
||||
1. 在 110 以 sudo 執行 `bash scripts/ops/apply-runner-systemd-guardrails.sh --apply`。
|
||||
2. 驗證 Prometheus 的 `SystemdRunnerWatchdogEnabled` / `SystemdRunnerMissingResourceQuota` 消失。
|
||||
3. 觀察 110 load5/core 是否穩定低於 1.5,若仍高再調 Sentry ingestion/ClickHouse parts。
|
||||
3. 部署 `/home/wooo/scripts/stop-stale-gitea-actions-jobs.sh`,讓 `DockerGiteaActionsJobStale` 先 dry-run、再人工/AI 審核後 `--apply`。
|
||||
4. 觀察 110 load5/core 是否穩定低於 1.5,若仍高再調 Sentry ingestion/ClickHouse parts。
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -46,7 +46,7 @@ Use these thresholds for alerting and AI triage:
|
||||
| Systemd runner restarts | > 2 in 15m | Critical; inspect watchdog/drop-ins and active CI jobs. |
|
||||
| Systemd runner WatchdogSec | > 0 for 10m | Warning; GitHub Actions runner should not be killed by systemd watchdog. |
|
||||
| Systemd runner quota | CPU or memory unlimited for 30m | Warning; apply CPUQuota/MemoryMax or move CI away from Sentry host. |
|
||||
| Gitea Actions job runtime | > 20m for 5m | Warning; inspect logs and stop stale job containers if they outlive workflow timeout. |
|
||||
| Gitea Actions job runtime | > 20m for 5m | Warning; inspect logs and run `/home/wooo/scripts/stop-stale-gitea-actions-jobs.sh` dry-run before stopping stale job containers. |
|
||||
|
||||
## Rules
|
||||
|
||||
@@ -61,6 +61,7 @@ Use these thresholds for alerting and AI triage:
|
||||
9. Runner services are part of the resource baseline even when jobs run outside Docker. Export `systemd_unit_*` metrics for all `actions.runner.*` services.
|
||||
10. CI runner limits must be set at the systemd unit level. Docker container limits do not protect host load from runner listener, worker, and `act` helper processes.
|
||||
11. Workflow-only commits must not trigger full CD image builds; CD should run only when runtime inputs change.
|
||||
12. Stale Gitea Actions cleanup must be dry-run first. Use `--apply` only when the container is older than the workflow timeout and has no recent useful logs.
|
||||
|
||||
## Next Safe Rollout Order
|
||||
|
||||
@@ -70,7 +71,8 @@ Use these thresholds for alerting and AI triage:
|
||||
4. Tune `momo-scheduler` crawler concurrency on 188; keep 2 CPU / 2 GiB until success rate and latency prove it is too low.
|
||||
5. Fix 188 Elephant Alpha/OpenClaw allowed-action drift before enabling resource auto-repair beyond diagnosis.
|
||||
6. Add modest caps to currently unlimited low-risk services in small batches.
|
||||
7. Fix 110 runner services with sudo-capable host maintenance:
|
||||
7. Deploy `scripts/ops/stop-stale-gitea-actions-jobs.sh` to 110 as `/home/wooo/scripts/stop-stale-gitea-actions-jobs.sh`; keep Prometheus auto action in dry-run mode.
|
||||
8. Fix 110 runner services with sudo-capable host maintenance:
|
||||
|
||||
```bash
|
||||
unit=actions.runner.owenhytsai-awoooi.awoooi-110.service
|
||||
|
||||
@@ -677,8 +677,8 @@ groups:
|
||||
annotations:
|
||||
summary: "Gitea Actions job {{ $labels.container_name }} 執行超過 20 分鐘"
|
||||
description: "{{ $labels.container_name }} 已超過 20 分鐘,110 曾出現 code-review/CD job timeout 失效而持續佔用 runner。"
|
||||
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker ps --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}'"
|
||||
runbook: "先確認是否仍有輸出與是否為 CD 關鍵步驟;若 logs 空白且超過 workflow timeout,可人工停止 stale job container。"
|
||||
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker ps --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}; bash /home/wooo/scripts/stop-stale-gitea-actions-jobs.sh --min-age-seconds 1200'"
|
||||
runbook: "先執行 dry-run;若 logs 空白且超過 workflow timeout,再用 --apply 停止 stale job container。不要停止仍在輸出或正在部署的 job。"
|
||||
|
||||
- alert: SystemdRunnerRestartSpike
|
||||
# 2026-05-05 ogt + Codex: 110 GitHub Actions runner watchdog loop was outside Docker/cAdvisor coverage.
|
||||
|
||||
@@ -683,8 +683,8 @@ groups:
|
||||
annotations:
|
||||
summary: "Gitea Actions job {{ $labels.container_name }} 執行超過 20 分鐘"
|
||||
description: "{{ $labels.container_name }} 已超過 20 分鐘,110 曾出現 code-review/CD job timeout 失效而持續佔用 runner。"
|
||||
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker ps --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}'"
|
||||
runbook: "先確認是否仍有輸出與是否為 CD 關鍵步驟;若 logs 空白且超過 workflow timeout,可人工停止 stale job container。"
|
||||
auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'docker ps --filter name={{ $labels.container_name }}; docker logs --tail=120 {{ $labels.container_name }}; bash /home/wooo/scripts/stop-stale-gitea-actions-jobs.sh --min-age-seconds 1200'"
|
||||
runbook: "先執行 dry-run;若 logs 空白且超過 workflow timeout,再用 --apply 停止 stale job container。不要停止仍在輸出或正在部署的 job。"
|
||||
|
||||
- alert: SystemdRunnerRestartSpike
|
||||
# 2026-05-05 ogt + Codex: 110 GitHub Actions runner watchdog loop was outside Docker/cAdvisor coverage.
|
||||
|
||||
76
scripts/ops/stop-stale-gitea-actions-jobs.sh
Normal file
76
scripts/ops/stop-stale-gitea-actions-jobs.sh
Normal file
@@ -0,0 +1,76 @@
|
||||
#!/usr/bin/env bash
set -euo pipefail

# 2026-05-05 ogt + Codex
# Find or stop stale Gitea/act job containers on the 110 host.
#
# Default is dry-run:
#   bash scripts/ops/stop-stale-gitea-actions-jobs.sh
#
# Apply after reviewing candidates:
#   bash scripts/ops/stop-stale-gitea-actions-jobs.sh --apply
#
# Options:
#   --apply                Stop the listed candidates (default: only list them).
#   --force                With --apply, also stop candidates that logged recently.
#   --min-age-seconds N    Minimum container age in seconds (default: 1200).
#
# Safety rules:
#   - Only touches Docker containers named GITEA-ACTIONS-*.
#   - Defaults to containers older than 20 minutes.
#   - Skips containers with recent log output unless --force is provided.
#   - Skips containers whose start time cannot be read or parsed, instead of
#     treating them as infinitely old.
#
# Exit status:
#   0  listing (and any requested stops) succeeded
#   1  `docker ps` failed, or at least one `docker stop` failed
#   2  usage error

MIN_AGE_SECONDS=1200
APPLY=0
FORCE=0

while [[ $# -gt 0 ]]; do
  case "$1" in
    --apply)
      APPLY=1
      shift
      ;;
    --force)
      FORCE=1
      shift
      ;;
    --min-age-seconds)
      MIN_AGE_SECONDS="${2:?--min-age-seconds requires a value}"
      shift 2
      ;;
    *)
      echo "Unknown argument: $1" >&2
      exit 2
      ;;
  esac
done

# The value is later used in arithmetic context, so reject anything that is
# not a plain decimal number before it gets evaluated.
if [[ ! "$MIN_AGE_SECONDS" =~ ^[0-9]+$ ]]; then
  echo "--min-age-seconds must be a non-negative integer: $MIN_AGE_SECONDS" >&2
  exit 2
fi
# Force base 10 so a value such as 0900 is not rejected as invalid octal.
MIN_AGE_SECONDS=$((10#$MIN_AGE_SECONDS))

now="$(date +%s)"
found=0
stop_failed=0

# List containers up front so an unreachable Docker daemon is reported as an
# error rather than as "no stale containers".
if ! all_names="$(docker ps --format '{{.Names}}')"; then
  echo "docker ps failed; cannot list Gitea Actions containers" >&2
  exit 1
fi

while IFS= read -r name; do
  # Prefix match only; this also drops the empty line an empty listing yields.
  [[ "$name" == GITEA-ACTIONS-* ]] || continue

  # The job may finish between `docker ps` and this call; skip it instead of
  # letting `set -e` abort the whole run.
  if ! started_raw="$(docker inspect --format '{{.State.StartedAt}}' "$name" 2>/dev/null)"; then
    echo "skip $name: container could not be inspected (already gone?)" >&2
    continue
  fi

  # An unparseable timestamp must never be mapped to epoch 0: that would make
  # the container look decades old and eligible for --apply.
  if ! started="$(date -u -d "$started_raw" +%s 2>/dev/null)" || ((started <= 0)); then
    echo "skip $name: cannot determine start time from '$started_raw'" >&2
    continue
  fi

  age=$((now - started))
  ((age >= MIN_AGE_SECONDS)) || continue

  found=1

  # stderr is captured too. If `docker logs` itself fails, its error text
  # lands in log_tail and the container is treated as having recent logs,
  # which errs on the side of not stopping it.
  log_tail="$(docker logs --since 5m --tail 5 "$name" 2>&1 || true)"
  has_recent_logs=0
  if [[ -n "$log_tail" ]]; then
    has_recent_logs=1
  fi

  printf 'candidate name=%s age_seconds=%s recent_logs=%s\n' "$name" "$age" "$has_recent_logs"
  if [[ "$has_recent_logs" == "1" ]]; then
    printf '%s\n' "$log_tail" | sed 's/^/ log: /'
  fi

  if [[ "$APPLY" == "1" ]]; then
    if [[ "$has_recent_logs" == "1" && "$FORCE" != "1" ]]; then
      echo "skip $name: recent logs exist; use --force only after manual review"
      continue
    fi
    # Keep going after a failed stop so one stuck container does not shield
    # the remaining stale ones; the failure is reflected in the exit status.
    if ! docker stop "$name"; then
      echo "failed to stop $name" >&2
      stop_failed=1
    fi
  fi
done <<<"$all_names"

if [[ "$found" == "0" ]]; then
  echo "No stale Gitea Actions containers older than ${MIN_AGE_SECONDS}s."
fi

if [[ "$stop_failed" == "1" ]]; then
  exit 1
fi
|
||||
Reference in New Issue
Block a user