ci(runner): drain legacy docker act runner safely
All checks were successful
Code Review / ai-code-review (push) Successful in 11s
All checks were successful
Code Review / ai-code-review (push) Successful in 11s
This commit is contained in:
@@ -217,6 +217,39 @@ AWOOI 的 Docker lock,會和 AWOOI Web image 內的 Next production build 疊
|
||||
長期方向仍是 runner 隔離或 build offload;此 gate 是在 shared runner 尚未
|
||||
拆分前,降低重型前端 build 互相踩踏的保守保護層。
|
||||
|
||||
### 第五層修復: legacy Docker runner drain
|
||||
|
||||
2026-05-21 再次確認 110 同時存在兩個 runner:
|
||||
|
||||
- host-level `gitea-act-runner-host.service`
|
||||
- Docker-wrapped `gitea-runner`
|
||||
|
||||
兩者使用同一份 labels/config,會同時接 `awoooi`、`stockplatform-v2`、
`ewoooc` 等 repo 的 job。這會讓 AWOOI CD 的 runner ownership 失真,也會
讓 shared Docker daemon 壓力無法預測。
|
||||
|
||||
正確處理不是在 task container 正在跑時直接 `docker stop gitea-runner`。
|
||||
`ops/runner/install-gitea-host-runner-service.sh` 現在採用 drain 流程:
|
||||
|
||||
1. `docker update --restart=no gitea-runner`
2. 若沒有 `GITEA-ACTIONS-TASK-*` container 正在執行,以長 timeout(預設 3700 秒)執行 `docker stop`
3. 若有 `GITEA-ACTIONS-TASK-*` container 正在執行,改送 `SIGINT` 給 `gitea-runner`,不執行 `docker stop`
4. act-runner 依 `shutdown_timeout: 1h` 停止接新 job,等待手上的 job 收尾
|
||||
|
||||
現場判讀:
|
||||
|
||||
```bash
docker inspect gitea-runner --format 'Restart={{.HostConfig.RestartPolicy.Name}} Status={{.State.Status}}'
docker ps --format '{{.Names}}' | grep '^GITEA-ACTIONS-TASK-' || true
docker logs --since 10m gitea-runner
```
|
||||
|
||||
目標狀態:
|
||||
|
||||
```text
Restart=no Status=exited
```
|
||||
|
||||
---
|
||||
版本: v2.0 | 更新: 2026-03-29 | 作者: Claude Code
|
||||
變更: v1.0→v2.0 序列建構取代 Job Concurrency Groups
|
||||
|
||||
@@ -5,11 +5,33 @@ RUNNER_DIR="${RUNNER_DIR:-/home/wooo/act-runner}"
|
||||
RUNNER_USER="${RUNNER_USER:-wooo}"
|
||||
SERVICE_NAME="${SERVICE_NAME:-gitea-act-runner-host.service}"
|
||||
SHUTDOWN_TIMEOUT="${SHUTDOWN_TIMEOUT:-1h}"
|
||||
LEGACY_RUNNER_STOP_TIMEOUT_SECONDS="${LEGACY_RUNNER_STOP_TIMEOUT_SECONDS:-3700}"
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
SYSTEM_SERVICE_SRC="${SCRIPT_DIR}/gitea-act-runner-host.service"
|
||||
USER_SERVICE_SRC="${SCRIPT_DIR}/gitea-act-runner-host.user.service"
|
||||
CONFIG_FILE="${RUNNER_DIR}/config.yaml"
|
||||
|
||||
#######################################
# Report whether any Gitea Actions task container is currently running.
# Arguments: none
# Returns:   0 when `docker ps` lists a container whose name starts with
#            GITEA-ACTIONS-TASK-; 1 otherwise, including when `docker ps`
#            itself fails.
#######################################
gitea_task_containers_running() {
  local running_names

  # Capture the listing first instead of piping straight into `grep -q`.
  # `grep -q` exits on the first match and closes the pipe; if the caller
  # runs with `set -o pipefail`, a SIGPIPE'd `docker ps` would turn a real
  # match into a non-zero status ("no tasks running").
  running_names="$(docker ps --format '{{.Names}}')" || return 1

  grep -q '^GITEA-ACTIONS-TASK-' <<<"${running_names}"
}
|
||||
|
||||
#######################################
# Drain and disable the legacy Docker-wrapped `gitea-runner` container.
#
# Does nothing when no container named exactly `gitea-runner` exists.
# Otherwise clears its restart policy, then either:
#   - sends SIGINT to the container when Gitea Actions task containers are
#     still running, and returns without stopping it, or
#   - stops the container with a long `docker stop -t` timeout.
#
# Globals:   LEGACY_RUNNER_STOP_TIMEOUT_SECONDS (read; falls back to 3700)
# Calls:     gitea_task_containers_running
# Outputs:   progress message on stdout; drain and warning notices on stderr
# Returns:   0 always; docker failures are treated as best-effort
#######################################
disable_legacy_docker_runner() {
  local all_names

  # Capture the listing before matching: piping into `grep -q` can SIGPIPE
  # `docker ps`, which under `set -o pipefail` would look like "container
  # not present" and skip the drain entirely.
  all_names="$(docker ps -a --format '{{.Names}}')" || return 0
  if ! grep -qx 'gitea-runner' <<<"${all_names}"; then
    return 0
  fi

  echo "Disabling legacy docker-wrapped gitea-runner container"

  # Still best-effort, but no longer silent: if the restart policy is not
  # cleared, the container may be restarted by Docker after it exits.
  if ! docker update --restart=no gitea-runner >/dev/null 2>&1; then
    echo "Warning: could not set restart policy 'no' on gitea-runner" >&2
  fi

  if gitea_task_containers_running; then
    echo "Active Gitea Actions task containers are running; sending SIGINT to drain gitea-runner" >&2
    docker kill --signal=SIGINT gitea-runner >/dev/null 2>&1 || true
    return 0
  fi

  docker stop -t "${LEGACY_RUNNER_STOP_TIMEOUT_SECONDS:-3700}" gitea-runner >/dev/null 2>&1 || true
}
|
||||
|
||||
if [ ! -x "${RUNNER_DIR}/act_runner" ]; then
|
||||
echo "act_runner binary not found: ${RUNNER_DIR}/act_runner" >&2
|
||||
exit 1
|
||||
@@ -69,11 +91,7 @@ if not found:
|
||||
path.write_text("\n".join(updated) + "\n")
|
||||
PY
|
||||
|
||||
if docker ps --format '{{.Names}}' | grep -qx 'gitea-runner'; then
|
||||
echo "Disabling legacy docker-wrapped gitea-runner container"
|
||||
docker update --restart=no gitea-runner >/dev/null 2>&1 || true
|
||||
docker stop gitea-runner >/dev/null 2>&1 || true
|
||||
fi
|
||||
disable_legacy_docker_runner
|
||||
|
||||
install_system_service() {
|
||||
sudo install -o root -g root -m 0644 "${SYSTEM_SERVICE_SRC}" "/etc/systemd/system/${SERVICE_NAME}"
|
||||
@@ -110,7 +128,7 @@ else
|
||||
fi
|
||||
|
||||
if [ "${RESTART_NOW:-0}" = "1" ]; then
|
||||
if docker ps --format '{{.Names}}' | grep -q '^GITEA-ACTIONS-TASK-'; then
|
||||
if gitea_task_containers_running; then
|
||||
echo "Refusing to restart: Gitea Actions task containers are running" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -37,6 +37,17 @@ cleanup_artifacts() {
|
||||
rm -rf node_modules apps/web/node_modules 2>/dev/null || true
|
||||
}
|
||||
|
||||
# Remove a Docker network, but only when no containers are attached to it.
#
# Arguments: $1 - name of the Docker network to inspect and possibly remove
# Returns:   0 always; this is best-effort cleanup. It is a no-op when the
#            name is missing or empty, when docker is not installed, when the
#            network cannot be inspected, or when `.Containers` is not
#            reported as the empty JSON object `{}`.
cleanup_empty_docker_network() {
  # `${1:-}` keeps a missing argument from aborting a caller that runs
  # under `set -u`.
  local network="${1:-}"
  local containers_json

  [ -n "$network" ] || return 0
  command -v docker >/dev/null 2>&1 || return 0

  # An inspect failure leaves containers_json empty, which falls through to
  # the "not empty" early return below.
  containers_json="$(docker network inspect "$network" --format '{{json .Containers}}' 2>/dev/null || true)"
  [ "$containers_json" = "{}" ] || return 0

  docker network rm "$network" >/dev/null 2>&1 || true
}
|
||||
|
||||
has_leftovers() {
|
||||
local root="$1"
|
||||
[ -d "$root/.pytest_cache" ] && return 0
|
||||
@@ -53,6 +64,7 @@ has_leftovers() {
|
||||
}
|
||||
|
||||
cleanup_artifacts "$ROOT"
|
||||
cleanup_empty_docker_network "b5-test-net"
|
||||
|
||||
if has_leftovers "$ROOT" && [ -n "$CLEANUP_IMAGE" ] && command -v docker >/dev/null 2>&1; then
|
||||
echo "host cleanup left root-owned artifacts; retrying through ${CLEANUP_IMAGE}"
|
||||
|
||||
@@ -228,8 +228,15 @@ PY
|
||||
fi
|
||||
|
||||
# The Docker-wrapped runner is retired; keep it from taking host-label jobs.
# If task containers are still running when this recovery script is run by
# hand, send SIGINT so act_runner drains: it stops accepting new jobs and
# waits for the jobs it already holds to finish.
docker update --restart=no gitea-runner >/dev/null 2>&1 || true
# No unconditional `docker stop` before this check: stopping the runner first
# would bypass the drain path chosen below while task containers are active.
if docker ps --format '{{.Names}}' | grep -q '^GITEA-ACTIONS-TASK-'; then
  log "⚠️ Gitea Actions task container still running; draining docker-wrapped gitea-runner"
  docker kill --signal=SIGINT gitea-runner >/dev/null 2>&1 || true
else
  # No task containers: stop the runner with a long (3700 s) timeout.
  docker stop -t 3700 gitea-runner >/dev/null 2>&1 || true
fi
|
||||
|
||||
sleep 15
|
||||
|
||||
|
||||
Reference in New Issue
Block a user