From 85d5b5c8238e4b0afb88d134d384bcf1810119fe Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 5 May 2026 23:48:35 +0800 Subject: [PATCH] fix(cd): clear empty docker build locks --- .gitea/workflows/cd.yaml | 18 ++++++++++++++++-- docs/LOGBOOK.md | 8 ++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml index d263a20c..90f010e2 100644 --- a/.gitea/workflows/cd.yaml +++ b/.gitea/workflows/cd.yaml @@ -300,6 +300,7 @@ jobs: run: | LOCK_NAME="awoooi-cd-docker-build-lock" STALE_SECONDS=7200 + EMPTY_LOCK_SECONDS=300 WAIT_ATTEMPTS=180 for attempt in $(seq 1 "$WAIT_ATTEMPTS"); do @@ -323,9 +324,22 @@ jobs: python3 -c "import sys, datetime, re; ts = re.sub(r'\\.\d+', '', sys.argv[1]); ts = re.sub(r'\\s+[A-Z]{2,4}$', '', ts.strip()); print(int(datetime.datetime.strptime(ts, '%Y-%m-%d %H:%M:%S %z').timestamp()))" \ "$CREATED_AT" 2>/dev/null || echo 0) NOW_EPOCH=$(date +%s) + LOCK_AGE=$((NOW_EPOCH - CREATED_EPOCH)) + # 2026-05-05 Codex: a dirty reboot or a cancelled Actions run can leave the + # Docker-network lock behind with no build/push running; waiting out the full 30m CD timeout then only delays deploys, so + # a lock older than EMPTY_LOCK_SECONDS (5m) is removed when ps shows no docker build/push or buildx build process. + # NOTE(review): ps only lists processes visible to this step - confirm builds started by other jobs are visible here. 
+ ACTIVE_DOCKER_WORK=$(ps -eo args | grep -E 'docker (build|push)|buildx build' | grep -v grep || true) if [ "$CREATED_EPOCH" -gt 0 ] && \ - [ $((NOW_EPOCH - CREATED_EPOCH)) -gt "$STALE_SECONDS" ]; then - echo "⚠️ stale Docker build lock detected (age=$((NOW_EPOCH - CREATED_EPOCH))s > ${STALE_SECONDS}s), removing ${LOCK_NAME}" + [ "$LOCK_AGE" -gt "$EMPTY_LOCK_SECONDS" ] && \ + [ -z "$ACTIVE_DOCKER_WORK" ]; then + echo "⚠️ empty Docker build lock detected (age=${LOCK_AGE}s > ${EMPTY_LOCK_SECONDS}s, no active docker build/push), removing ${LOCK_NAME}" + docker network rm "$LOCK_NAME" >/dev/null 2>&1 || true + continue + fi + if [ "$CREATED_EPOCH" -gt 0 ] && \ + [ "$LOCK_AGE" -gt "$STALE_SECONDS" ]; then + echo "⚠️ stale Docker build lock detected (age=${LOCK_AGE}s > ${STALE_SECONDS}s), removing ${LOCK_NAME}" docker network rm "$LOCK_NAME" >/dev/null 2>&1 || true continue fi diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 2397f3bc..5e7a2cfc 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -31,6 +31,14 @@ **驗證**: - `kubectl kustomize k8s/awoooi-prod` 通過,build output 中 `drift-scanner` image 會被解析為目前 kustomization 的 `awoooi/api:c4854bb3...`。 +## 2026-05-05 | CD Docker build 空鎖自動清理 + +**背景**:重開機後 Gitea Actions 曾留下 `awoooi-cd-docker-build-lock` Docker network 空鎖;live host 無 `docker build/buildx/docker push` 進程,但後續 CD 仍會等滿 30 分鐘才 timeout。 + +**本次修補**: +- `.gitea/workflows/cd.yaml` 的 `Acquire Docker Build Lock` 新增 `EMPTY_LOCK_SECONDS=300`。 +- lock 超過 5 分鐘且 host 上沒有 active docker build/push 時,自動移除空鎖後重新嘗試取得 lock;真正超過 2 小時的 stale lock 仍保留原有強制清理邏輯。 + ## 2026-05-05 | 重開機後排程與 startup baseline 修復 **背景**:四台主機非預期重開機後,統帥要求確認所有服務、網站、工具、資料庫與排程都能正常恢復,不能只看容器 `healthy`。