diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml index 48272da8..1a51fbc6 100644 --- a/.gitea/workflows/cd.yaml +++ b/.gitea/workflows/cd.yaml @@ -842,11 +842,17 @@ jobs: exit 0 fi cat > /tmp/awoooi-b5-tests.sh <<'CI_SCRIPT' + set -euo pipefail cd apps/api # 安裝 psql client if ! command -v psql &>/dev/null; then apt-get install -y -q postgresql-client fi + if ! docker info >/dev/null 2>&1; then + echo "BLOCKER b5_docker_socket_unavailable" + echo "NEXT_ACTION ensure_b5_ci_container_runs_with_docker_socket_permission_then_retry_cd" + exit 65 + fi # 2026-04-19 ogt + Claude Opus 4.7 v3: 主動創 shared network # 之前 grep ACT_NET 在 c0f3509 run 沒 match → fallback bridge → container name DNS 失效 # 真因: default bridge 不支援 container name DNS,必須 user-defined network @@ -866,9 +872,20 @@ jobs: -e POSTGRES_PASSWORD=awoooi_test_2026 \ pgvector/pgvector:pg16 # 等待就緒(用 container name,最多 60 秒) + B5_DB_READY=0 for i in $(seq 1 30); do - PGPASSWORD=awoooi_test_2026 pg_isready -h pg-test-b5 -p 5432 -U awoooi && break || sleep 2 + if PGPASSWORD=awoooi_test_2026 pg_isready -h pg-test-b5 -p 5432 -U awoooi; then + B5_DB_READY=1 + break + fi + sleep 2 done + if [ "$B5_DB_READY" != "1" ]; then + echo "BLOCKER b5_pg_test_container_not_ready" + echo "NEXT_ACTION inspect_b5_test_network_and_docker_socket_then_retry_cd" + docker ps --filter name=pg-test-b5 --format 'b5_container={{.Names}} status={{.Status}}' || true + exit 66 + fi # 初始化 schema PGPASSWORD=awoooi_test_2026 psql \ -h pg-test-b5 -p 5432 -U awoooi -d awoooi_test \ @@ -890,6 +907,7 @@ jobs: CI_SCRIPT docker run --rm \ --name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-b5-tests" \ + --user 0:0 \ --cpus "2.0" \ --memory "2g" \ -v "$PWD:/workspace" \ diff --git a/.gitea/workflows/harbor-110-local-repair.yaml b/.gitea/workflows/harbor-110-local-repair.yaml index 75a8df94..1c244107 100644 --- a/.gitea/workflows/harbor-110-local-repair.yaml +++ b/.gitea/workflows/harbor-110-local-repair.yaml @@ -3,7 +3,8 @@ # Controlled runtime: # - workflow_dispatch + low-frequency schedule only # - no push / pull_request / pull_request_target trigger -# - runs only on the 110-local awoooi-host lane +# - runs on the non-110 controlled host lane, then reaches 110 only through a +# bounded SSH control channel # - no secret read, no Docker daemon restart, no reboot, no node drain # - runs the existing bounded recovery script, then verifies local and public /v2/ @@ -17,6 +18,7 @@ on: env: AWOOOI_HARBOR_110_LOCAL_REPAIR_ENABLED: "1" AWOOOI_110_EXPECTED_HOST_IP: 192.168.0.110 + AWOOOI_110_SSH_TARGET: wooo@192.168.0.110 AWOOOI_HARBOR_110_LOCAL_REPAIR_TRIGGER: ${{ github.event_name }} jobs: @@ -31,7 +33,7 @@ jobs: harbor-110-local-repair: if: ${{ env.AWOOOI_HARBOR_110_LOCAL_REPAIR_ENABLED == '1' }} - runs-on: awoooi-host + runs-on: awoooi-non110-host timeout-minutes: 15 steps: - uses: actions/checkout@v4 @@ -45,68 +47,84 @@ jobs: echo "operation_boundary_docker_daemon_restart_performed=false" echo "operation_boundary_host_reboot_performed=false" echo "operation_boundary_node_drain_performed=false" + echo "operation_boundary_remote_ssh_bounded=true" - - name: Run 110 local control path and Harbor repair + - name: Run 110 remote control path and Harbor repair run: | set -euo pipefail - if ! command -v ip >/dev/null 2>&1; then - echo "BLOCKED ip_command_missing_for_110_host_guard" - exit 65 - fi - if ! ip -o -4 addr show 2>/dev/null | grep -q " ${AWOOOI_110_EXPECTED_HOST_IP}/"; then - echo "BLOCKED expected_110_host_ip_not_present" + if ! command -v ssh >/dev/null 2>&1; then + echo "BLOCKED ssh_command_missing_for_110_control_channel" exit 65 fi - chmod +x scripts/reboot-recovery/repair-110-ssh-publickey-auth-local.sh - chmod +x scripts/reboot-recovery/harbor-watchdog.sh - chmod +x scripts/reboot-recovery/recover-110-control-path-and-harbor-local.sh + ssh_base=( + ssh + -o BatchMode=yes + -o ConnectTimeout=8 + -o ServerAliveInterval=5 + -o ServerAliveCountMax=2 + "${AWOOOI_110_SSH_TARGET}" + ) + + run_ssh() { + timeout 30 "${ssh_base[@]}" "$@" + } + + if ! run_ssh "expected_host_ip=${AWOOOI_110_EXPECTED_HOST_IP}; printf 'remote_host=%s\n' \"\$(hostname 2>/dev/null || echo unknown)\"; printf 'remote_user=%s\n' \"\$(id -un 2>/dev/null || echo unknown)\"; hostname -I 2>/dev/null | tr ' ' '\n' | grep -qx \"\${expected_host_ip}\""; then + echo "BLOCKED harbor_110_remote_control_channel_unavailable target=${AWOOOI_110_SSH_TARGET}" + echo "harbor_110_remote_ssh_reachable=false" + exit 65 + fi + echo "harbor_110_remote_ssh_reachable=true" + + if ! run_ssh 'test -x /usr/local/bin/recover-110-control-path-and-harbor-local.sh'; then + echo "BLOCKED harbor_110_local_recovery_script_missing path=/usr/local/bin/recover-110-control-path-and-harbor-local.sh" + exit 65 + fi run_recovery() { - sudo -n env \ + timeout 240 "${ssh_base[@]}" \ + sudo -n env \ TARGET_USER=wooo \ RELOAD_SSH=0 \ AWOOOI_110_EXPECTED_HOST_IP="${AWOOOI_110_EXPECTED_HOST_IP}" \ - AWOOOI_110_SSH_REPAIR_SCRIPT="${PWD}/scripts/reboot-recovery/repair-110-ssh-publickey-auth-local.sh" \ - AWOOOI_HARBOR_WATCHDOG_SCRIPT="${PWD}/scripts/reboot-recovery/harbor-watchdog.sh" \ - "${PWD}/scripts/reboot-recovery/recover-110-control-path-and-harbor-local.sh" \ + /usr/local/bin/recover-110-control-path-and-harbor-local.sh \ "$@" } - echo "harbor_110_local_repair_check_start=1" + echo "harbor_110_remote_repair_check_start=1" set +e check_output="$(run_recovery --check 2>&1)" check_rc=$? set -e printf '%s\n' "${check_output}" - echo "harbor_110_local_repair_check_rc=${check_rc}" + echo "harbor_110_remote_repair_check_rc=${check_rc}" trigger="${GITHUB_EVENT_NAME:-${AWOOOI_HARBOR_110_LOCAL_REPAIR_TRIGGER:-unknown}}" - echo "harbor_110_local_repair_trigger=${trigger}" + echo "harbor_110_remote_repair_trigger=${trigger}" if [ "${trigger}" != "workflow_dispatch" ] \ && printf '%s\n' "${check_output}" | grep -q "harbor_ready=true"; then - echo "harbor_110_local_repair_skipped=already_ready" + echo "harbor_110_remote_repair_skipped=already_ready" exit 0 fi - echo "harbor_110_local_repair_apply_all_start=1" + echo "harbor_110_remote_repair_apply_all_start=1" run_recovery --apply-all - - name: Verify Harbor registry routes after local repair - run: | - set -euo pipefail - local_status="$(curl --silent --show-error --output /dev/null --write-out "%{http_code}" --max-time 10 http://127.0.0.1:5000/v2/ || true)" + local_status="$( + run_ssh 'curl --silent --show-error --output /dev/null --write-out "%{http_code}" --max-time 10 http://127.0.0.1:5000/v2/ || true' + )" public_status="$(curl --silent --show-error --output /dev/null --write-out "%{http_code}" --max-time 10 https://registry.wooo.work/v2/ || true)" [ -n "${local_status}" ] || local_status="000" [ -n "${public_status}" ] || public_status="000" - echo "harbor_110_local_v2_http_status=${local_status}" + echo "harbor_110_remote_local_v2_http_status=${local_status}" echo "harbor_public_registry_v2_http_status=${public_status}" case "${local_status}" in 200|401) ;; - *) echo "BLOCKED harbor_local_registry_v2_unavailable status=${local_status}"; exit 1 ;; + *) echo "BLOCKED harbor_110_remote_local_registry_v2_unavailable status=${local_status}"; exit 1 ;; esac case "${public_status}" in 200|401) ;; *) echo "BLOCKED harbor_public_registry_v2_unavailable status=${public_status}"; exit 1 ;; esac - echo "harbor_110_local_repair_verified=true" + echo "harbor_110_remote_repair_verified=true" diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index f903c92c..93a90bdf 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,21 @@ +## 2026-07-01 — 09:18 non-110 runner controlled apply / B5 Docker socket fail-fast + +**照主線修正的問題**: +- Gitea main 已到 `4aaa95c44 fix(runner): keep non110 service alive after registration`,但最新 CD `#4208` 先因 `awoooi-non110-host` waiting 卡住;188 verifier 回 registration / config / binary ready、service inactive、keepalive timer active。 +- 執行 188 user-level controlled apply:`AWOOOI_NON110_ENABLE=1 ... install-awoooi-non110-runner-user-service.sh --enable`,讀回 service `ActiveState=active`、`MainPID=3380348`、keepalive timer active、`runner_token_read=false`、`raw_runner_registration_read=false`。 +- apply 後 verifier 回 `AWOOOI_NON110_RUNNER_READY=1`、`BLOCKER_COUNT=0`,CD `#4208` 從 Waiting 進 Running;之後 tests job 失敗點不是 runner,而是 B5 容器內 Docker socket 權限不足,`pg-test-b5` 無法啟動 / 解析,導致 `tests/integration/test_b5_core_flows.py` setup 全部 `socket.gaierror`。 +- `.gitea/workflows/cd.yaml` 的 B5 full profile 補上 `set -euo pipefail`、`docker info` fail-fast blocker、Postgres readiness gate 與 B5 test container `--user 0:0`,避免 Docker socket 不可用時繼續跑 pytest 產生模糊 DB socket error。 +- `.gitea/workflows/harbor-110-local-repair.yaml` 從已無 matching runner 的 `awoooi-host` 改為 `awoooi-non110-host`,再用 bounded SSH control channel 進 110 執行既有 `recover-110-control-path-and-harbor-local.sh`;若 110 SSH 不通,會明確回 `harbor_110_remote_control_channel_unavailable`,不再讓 queue 停在 no-matching runner。 +- `ops/runner/test_cd_controlled_runtime_profile.py` 補 B5 Docker socket / DB network fail-fast shape test,並把 Harbor repair workflow 測試對齊現行 non-110 bounded SSH control-channel source truth;未恢復 110 legacy / generic runner。 + +**驗證**: +- `python3.11 -m pytest ops/runner/test_cd_controlled_runtime_profile.py -q --tb=short -p no:cacheprovider` 通過(34 passed)。 +- `python3.11 ops/runner/guard-gitea-runner-pressure.py --root .` 通過,`auto_branch_events_on_110=0`、`generic_runner_labels=0`。 +- `node scripts/ci/check-gitea-step-env-secrets.js` 通過。 +- `git diff --check` 通過。 + +**邊界**:本輪只對 188 user-level non-110 runner service 做 allowlisted controlled apply,未讀 token / secret / `.env` / raw sessions / SQLite / auth,未讀 `.runner` 內容,未使用 GitHub / `gh` / GitHub API,未 workflow_dispatch,未重啟主機、未 restart Docker / Nginx / K3s / DB / firewall;110 Harbor 下一步是讓 non-110 repair lane 跑 bounded SSH control channel,若 SSH 仍不通則以 receipt 形式收斂 blocker。 + ## 2026-07-01 — 08:54 CD host pressure interrupted / Postgres recovery classifier **照主線修正的問題**: diff --git a/ops/runner/test_cd_controlled_runtime_profile.py b/ops/runner/test_cd_controlled_runtime_profile.py index a4539460..d5fef637 100644 --- a/ops/runner/test_cd_controlled_runtime_profile.py +++ b/ops/runner/test_cd_controlled_runtime_profile.py @@ -115,15 +115,22 @@ def test_harbor_110_local_repair_workflow_is_dispatch_only_and_bounded() -> None assert "push:" not in text assert "pull_request:" not in text assert "pull_request_target:" not in text - assert "runs-on: awoooi-host" in text assert "runs-on: awoooi-non110-host" in text + assert "runs-on: awoooi-host" not in text assert "guard-gitea-runner-pressure.py --root ." in text assert "recover-110-control-path-and-harbor-local.sh" in text assert "--check" in text assert "--apply-all" in text assert "sudo -n env" in text + assert "AWOOOI_110_SSH_TARGET" in text + assert "BatchMode=yes" in text + assert "ConnectTimeout=8" in text + assert "operation_boundary_remote_ssh_bounded=true" in text + assert "harbor_110_remote_control_channel_unavailable" in text + assert "harbor_110_remote_repair_check_start=1" in text + assert "harbor_110_remote_repair_apply_all_start=1" in text assert "GITHUB_EVENT_NAME" in text - assert "harbor_110_local_repair_skipped=already_ready" in text + assert "harbor_110_remote_repair_skipped=already_ready" in text assert "192.168.0.110" in text assert "http://127.0.0.1:5000/v2/" in text assert "https://registry.wooo.work/v2/" in text @@ -564,6 +571,25 @@ def test_controlled_runtime_skips_b5_before_docker_socket_use() -> None: assert controlled_gate < exit_zero < docker_socket +def test_b5_full_profile_fails_fast_when_docker_socket_or_db_network_is_unready() -> None: + text = _workflow_text() + b5_block = text.split("- name: Integration Tests (B5", 1)[1] + b5_block = b5_block.split("- name: Clean Test Workspace Artifacts", 1)[0] + + assert "set -euo pipefail" in b5_block + assert "docker info >/dev/null 2>&1" in b5_block + assert "BLOCKER b5_docker_socket_unavailable" in b5_block + assert "B5_DB_READY=0" in b5_block + assert "BLOCKER b5_pg_test_container_not_ready" in b5_block + assert "--user 0:0" in b5_block + assert b5_block.index("docker info >/dev/null 2>&1") < b5_block.index( + "docker run -d --name pg-test-b5" + ) + assert b5_block.index("BLOCKER b5_pg_test_container_not_ready") < b5_block.index( + "/opt/api-venv/bin/pytest tests/integration/test_b5_core_flows.py" + ) + + def test_controlled_runtime_pytest_paths_exist() -> None: text = _workflow_text() block = text.split("PYTHONFAULTHANDLER=1 python3.11 -m pytest", 1)[1]