diff --git a/ops/runner/README.md b/ops/runner/README.md index 380476c7..c659fe0b 100644 --- a/ops/runner/README.md +++ b/ops/runner/README.md @@ -217,6 +217,39 @@ AWOOI 的 Docker lock,會和 AWOOI Web image 內的 Next production build 疊 長期方向仍是 runner 隔離或 build offload;此 gate 是在 shared runner 尚未 拆分前,降低重型前端 build 互相踩踏的保守保護層。 +### 第五層修復: legacy Docker runner drain + +2026-05-21 再次確認 110 同時存在兩個 runner: + +- host-level `gitea-act-runner-host.service` +- Docker-wrapped `gitea-runner` + +兩者使用同一份 labels/config,會同時接 `awoooi`、`stockplatform-v2`、 +`ewoooc` 等 repo 的 job。這會讓 AWOOI CD 的 runner ownership 失真,也會 +讓 shared Docker daemon 壓力無法預測。 + +正確處理不是在 task container 正在跑時直接 `docker stop gitea-runner`。 +`ops/runner/install-gitea-host-runner-service.sh` 現在採用 drain 流程: + +1. `docker update --restart=no gitea-runner` +2. 若沒有 `GITEA-ACTIONS-TASK-*`,用長 timeout 停止 container +3. 若仍有 `GITEA-ACTIONS-TASK-*`,送 `SIGINT` 給 `gitea-runner` +4. act-runner 依 `shutdown_timeout: 1h` 停止接新 job,等待手上的 job 收尾 + +現場判讀: + +```bash +docker inspect gitea-runner --format 'Restart={{.HostConfig.RestartPolicy.Name}} Status={{.State.Status}}' +docker ps --format '{{.Names}}' | grep '^GITEA-ACTIONS-TASK-' || true +docker logs --since 10m gitea-runner +``` + +目標狀態: + +```text +Restart=no Status=exited +``` + --- 版本: v2.0 | 更新: 2026-03-29 | 作者: Claude Code 變更: v1.0→v2.0 序列建構取代 Job Concurrency Groups diff --git a/ops/runner/install-gitea-host-runner-service.sh b/ops/runner/install-gitea-host-runner-service.sh index b72f901a..a00f5b77 100755 --- a/ops/runner/install-gitea-host-runner-service.sh +++ b/ops/runner/install-gitea-host-runner-service.sh @@ -5,11 +5,33 @@ RUNNER_DIR="${RUNNER_DIR:-/home/wooo/act-runner}" RUNNER_USER="${RUNNER_USER:-wooo}" SERVICE_NAME="${SERVICE_NAME:-gitea-act-runner-host.service}" SHUTDOWN_TIMEOUT="${SHUTDOWN_TIMEOUT:-1h}" +LEGACY_RUNNER_STOP_TIMEOUT_SECONDS="${LEGACY_RUNNER_STOP_TIMEOUT_SECONDS:-3700}" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SYSTEM_SERVICE_SRC="${SCRIPT_DIR}/gitea-act-runner-host.service" USER_SERVICE_SRC="${SCRIPT_DIR}/gitea-act-runner-host.user.service" CONFIG_FILE="${RUNNER_DIR}/config.yaml" +gitea_task_containers_running() { + docker ps --format '{{.Names}}' | grep -q '^GITEA-ACTIONS-TASK-' +} + +disable_legacy_docker_runner() { + if ! docker ps -a --format '{{.Names}}' | grep -qx 'gitea-runner'; then + return 0 + fi + + echo "Disabling legacy docker-wrapped gitea-runner container" + docker update --restart=no gitea-runner >/dev/null 2>&1 || true + + if gitea_task_containers_running; then + echo "Active Gitea Actions task containers are running; sending SIGINT to drain gitea-runner" >&2 + docker kill --signal=SIGINT gitea-runner >/dev/null 2>&1 || true + return 0 + fi + + docker stop -t "${LEGACY_RUNNER_STOP_TIMEOUT_SECONDS}" gitea-runner >/dev/null 2>&1 || true +} + if [ ! -x "${RUNNER_DIR}/act_runner" ]; then echo "act_runner binary not found: ${RUNNER_DIR}/act_runner" >&2 exit 1 @@ -69,11 +91,7 @@ if not found: path.write_text("\n".join(updated) + "\n") PY -if docker ps --format '{{.Names}}' | grep -qx 'gitea-runner'; then - echo "Disabling legacy docker-wrapped gitea-runner container" - docker update --restart=no gitea-runner >/dev/null 2>&1 || true - docker stop gitea-runner >/dev/null 2>&1 || true -fi +disable_legacy_docker_runner install_system_service() { sudo install -o root -g root -m 0644 "${SYSTEM_SERVICE_SRC}" "/etc/systemd/system/${SERVICE_NAME}" @@ -110,7 +128,7 @@ else fi if [ "${RESTART_NOW:-0}" = "1" ]; then - if docker ps --format '{{.Names}}' | grep -q '^GITEA-ACTIONS-TASK-'; then + if gitea_task_containers_running; then echo "Refusing to restart: Gitea Actions task containers are running" >&2 exit 1 fi diff --git a/scripts/ci/cleanup-host-runner-workspace.sh b/scripts/ci/cleanup-host-runner-workspace.sh index 6491f570..08e56bd9 100755 --- a/scripts/ci/cleanup-host-runner-workspace.sh +++ b/scripts/ci/cleanup-host-runner-workspace.sh @@ -37,6 +37,17 @@ cleanup_artifacts() { rm -rf node_modules apps/web/node_modules 2>/dev/null || true } +cleanup_empty_docker_network() { + local network="$1" + local containers_json + + command -v docker >/dev/null 2>&1 || return 0 + containers_json="$(docker network inspect "$network" --format '{{json .Containers}}' 2>/dev/null || true)" + [ "$containers_json" = "{}" ] || return 0 + + docker network rm "$network" >/dev/null 2>&1 || true +} + has_leftovers() { local root="$1" [ -d "$root/.pytest_cache" ] && return 0 @@ -53,6 +64,7 @@ has_leftovers() { } cleanup_artifacts "$ROOT" +cleanup_empty_docker_network "b5-test-net" if has_leftovers "$ROOT" && [ -n "$CLEANUP_IMAGE" ] && command -v docker >/dev/null 2>&1; then echo "host cleanup left root-owned artifacts; retrying through ${CLEANUP_IMAGE}" diff --git a/scripts/reboot-recovery/awoooi-startup-110.sh b/scripts/reboot-recovery/awoooi-startup-110.sh index fc3e897f..78e18093 100644 --- a/scripts/reboot-recovery/awoooi-startup-110.sh +++ b/scripts/reboot-recovery/awoooi-startup-110.sh @@ -228,8 +228,15 @@ PY fi # 已停用 Docker-wrapped runner;避免它搶走 host label job。 + # 若手動執行此 recovery script 時仍有 task container,送 SIGINT + # 讓 act_runner drain,不再接新 job,並等手上的 job 收尾。 docker update --restart=no gitea-runner >/dev/null 2>&1 || true - docker stop gitea-runner >/dev/null 2>&1 || true + if docker ps --format '{{.Names}}' | grep -q '^GITEA-ACTIONS-TASK-'; then + log "⚠️ Gitea Actions task container still running; draining docker-wrapped gitea-runner" + docker kill --signal=SIGINT gitea-runner >/dev/null 2>&1 || true + else + docker stop -t 3700 gitea-runner >/dev/null 2>&1 || true + fi sleep 15