fix(runner): route harbor repair through non110 lane
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 33s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 33s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
This commit is contained in:
@@ -842,11 +842,17 @@ jobs:
|
||||
exit 0
|
||||
fi
|
||||
cat > /tmp/awoooi-b5-tests.sh <<'CI_SCRIPT'
|
||||
set -euo pipefail
|
||||
cd apps/api
|
||||
# 安裝 psql client
|
||||
if ! command -v psql &>/dev/null; then
|
||||
apt-get install -y -q postgresql-client
|
||||
fi
|
||||
if ! docker info >/dev/null 2>&1; then
|
||||
echo "BLOCKER b5_docker_socket_unavailable"
|
||||
echo "NEXT_ACTION ensure_b5_ci_container_runs_with_docker_socket_permission_then_retry_cd"
|
||||
exit 65
|
||||
fi
|
||||
# 2026-04-19 ogt + Claude Opus 4.7 v3: 主動創 shared network
|
||||
# 之前 grep ACT_NET 在 c0f3509 run 沒 match → fallback bridge → container name DNS 失效
|
||||
# 真因: default bridge 不支援 container name DNS,必須 user-defined network
|
||||
@@ -866,9 +872,20 @@ jobs:
|
||||
-e POSTGRES_PASSWORD=awoooi_test_2026 \
|
||||
pgvector/pgvector:pg16
|
||||
# 等待就緒(用 container name,最多 60 秒)
|
||||
B5_DB_READY=0
|
||||
for i in $(seq 1 30); do
|
||||
PGPASSWORD=awoooi_test_2026 pg_isready -h pg-test-b5 -p 5432 -U awoooi && break || sleep 2
|
||||
if PGPASSWORD=awoooi_test_2026 pg_isready -h pg-test-b5 -p 5432 -U awoooi; then
|
||||
B5_DB_READY=1
|
||||
break
|
||||
fi
|
||||
sleep 2
|
||||
done
|
||||
if [ "$B5_DB_READY" != "1" ]; then
|
||||
echo "BLOCKER b5_pg_test_container_not_ready"
|
||||
echo "NEXT_ACTION inspect_b5_test_network_and_docker_socket_then_retry_cd"
|
||||
docker ps --filter name=pg-test-b5 --format 'b5_container={{.Names}} status={{.Status}}' || true
|
||||
exit 66
|
||||
fi
|
||||
# 初始化 schema
|
||||
PGPASSWORD=awoooi_test_2026 psql \
|
||||
-h pg-test-b5 -p 5432 -U awoooi -d awoooi_test \
|
||||
@@ -890,6 +907,7 @@ jobs:
|
||||
CI_SCRIPT
|
||||
docker run --rm \
|
||||
--name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-b5-tests" \
|
||||
--user 0:0 \
|
||||
--cpus "2.0" \
|
||||
--memory "2g" \
|
||||
-v "$PWD:/workspace" \
|
||||
|
||||
@@ -3,7 +3,8 @@
|
||||
# Controlled runtime:
|
||||
# - workflow_dispatch + low-frequency schedule only
|
||||
# - no push / pull_request / pull_request_target trigger
|
||||
# - runs only on the 110-local awoooi-host lane
|
||||
# - runs on the non-110 controlled host lane, then reaches 110 only through a
|
||||
# bounded SSH control channel
|
||||
# - no secret read, no Docker daemon restart, no reboot, no node drain
|
||||
# - runs the existing bounded recovery script, then verifies local and public /v2/
|
||||
|
||||
@@ -17,6 +18,7 @@ on:
|
||||
env:
|
||||
AWOOOI_HARBOR_110_LOCAL_REPAIR_ENABLED: "1"
|
||||
AWOOOI_110_EXPECTED_HOST_IP: 192.168.0.110
|
||||
AWOOOI_110_SSH_TARGET: wooo@192.168.0.110
|
||||
AWOOOI_HARBOR_110_LOCAL_REPAIR_TRIGGER: ${{ github.event_name }}
|
||||
|
||||
jobs:
|
||||
@@ -31,7 +33,7 @@ jobs:
|
||||
|
||||
harbor-110-local-repair:
|
||||
if: ${{ env.AWOOOI_HARBOR_110_LOCAL_REPAIR_ENABLED == '1' }}
|
||||
runs-on: awoooi-host
|
||||
runs-on: awoooi-non110-host
|
||||
timeout-minutes: 15
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
@@ -45,68 +47,84 @@ jobs:
|
||||
echo "operation_boundary_docker_daemon_restart_performed=false"
|
||||
echo "operation_boundary_host_reboot_performed=false"
|
||||
echo "operation_boundary_node_drain_performed=false"
|
||||
echo "operation_boundary_remote_ssh_bounded=true"
|
||||
|
||||
- name: Run 110 local control path and Harbor repair
|
||||
- name: Run 110 remote control path and Harbor repair
|
||||
run: |
|
||||
set -euo pipefail
|
||||
if ! command -v ip >/dev/null 2>&1; then
|
||||
echo "BLOCKED ip_command_missing_for_110_host_guard"
|
||||
exit 65
|
||||
fi
|
||||
if ! ip -o -4 addr show 2>/dev/null | grep -q " ${AWOOOI_110_EXPECTED_HOST_IP}/"; then
|
||||
echo "BLOCKED expected_110_host_ip_not_present"
|
||||
if ! command -v ssh >/dev/null 2>&1; then
|
||||
echo "BLOCKED ssh_command_missing_for_110_control_channel"
|
||||
exit 65
|
||||
fi
|
||||
|
||||
chmod +x scripts/reboot-recovery/repair-110-ssh-publickey-auth-local.sh
|
||||
chmod +x scripts/reboot-recovery/harbor-watchdog.sh
|
||||
chmod +x scripts/reboot-recovery/recover-110-control-path-and-harbor-local.sh
|
||||
ssh_base=(
|
||||
ssh
|
||||
-o BatchMode=yes
|
||||
-o ConnectTimeout=8
|
||||
-o ServerAliveInterval=5
|
||||
-o ServerAliveCountMax=2
|
||||
"${AWOOOI_110_SSH_TARGET}"
|
||||
)
|
||||
|
||||
run_ssh() {
|
||||
timeout 30 "${ssh_base[@]}" "$@"
|
||||
}
|
||||
|
||||
if ! run_ssh "expected_host_ip=${AWOOOI_110_EXPECTED_HOST_IP}; printf 'remote_host=%s\n' \"\$(hostname 2>/dev/null || echo unknown)\"; printf 'remote_user=%s\n' \"\$(id -un 2>/dev/null || echo unknown)\"; hostname -I 2>/dev/null | tr ' ' '\n' | grep -qx \"\${expected_host_ip}\""; then
|
||||
echo "BLOCKED harbor_110_remote_control_channel_unavailable target=${AWOOOI_110_SSH_TARGET}"
|
||||
echo "harbor_110_remote_ssh_reachable=false"
|
||||
exit 65
|
||||
fi
|
||||
echo "harbor_110_remote_ssh_reachable=true"
|
||||
|
||||
if ! run_ssh 'test -x /usr/local/bin/recover-110-control-path-and-harbor-local.sh'; then
|
||||
echo "BLOCKED harbor_110_local_recovery_script_missing path=/usr/local/bin/recover-110-control-path-and-harbor-local.sh"
|
||||
exit 65
|
||||
fi
|
||||
|
||||
run_recovery() {
|
||||
sudo -n env \
|
||||
timeout 240 "${ssh_base[@]}" \
|
||||
sudo -n env \
|
||||
TARGET_USER=wooo \
|
||||
RELOAD_SSH=0 \
|
||||
AWOOOI_110_EXPECTED_HOST_IP="${AWOOOI_110_EXPECTED_HOST_IP}" \
|
||||
AWOOOI_110_SSH_REPAIR_SCRIPT="${PWD}/scripts/reboot-recovery/repair-110-ssh-publickey-auth-local.sh" \
|
||||
AWOOOI_HARBOR_WATCHDOG_SCRIPT="${PWD}/scripts/reboot-recovery/harbor-watchdog.sh" \
|
||||
"${PWD}/scripts/reboot-recovery/recover-110-control-path-and-harbor-local.sh" \
|
||||
/usr/local/bin/recover-110-control-path-and-harbor-local.sh \
|
||||
"$@"
|
||||
}
|
||||
|
||||
echo "harbor_110_local_repair_check_start=1"
|
||||
echo "harbor_110_remote_repair_check_start=1"
|
||||
set +e
|
||||
check_output="$(run_recovery --check 2>&1)"
|
||||
check_rc=$?
|
||||
set -e
|
||||
printf '%s\n' "${check_output}"
|
||||
echo "harbor_110_local_repair_check_rc=${check_rc}"
|
||||
echo "harbor_110_remote_repair_check_rc=${check_rc}"
|
||||
|
||||
trigger="${GITHUB_EVENT_NAME:-${AWOOOI_HARBOR_110_LOCAL_REPAIR_TRIGGER:-unknown}}"
|
||||
echo "harbor_110_local_repair_trigger=${trigger}"
|
||||
echo "harbor_110_remote_repair_trigger=${trigger}"
|
||||
if [ "${trigger}" != "workflow_dispatch" ] \
|
||||
&& printf '%s\n' "${check_output}" | grep -q "harbor_ready=true"; then
|
||||
echo "harbor_110_local_repair_skipped=already_ready"
|
||||
echo "harbor_110_remote_repair_skipped=already_ready"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "harbor_110_local_repair_apply_all_start=1"
|
||||
echo "harbor_110_remote_repair_apply_all_start=1"
|
||||
run_recovery --apply-all
|
||||
|
||||
- name: Verify Harbor registry routes after local repair
|
||||
run: |
|
||||
set -euo pipefail
|
||||
local_status="$(curl --silent --show-error --output /dev/null --write-out "%{http_code}" --max-time 10 http://127.0.0.1:5000/v2/ || true)"
|
||||
# Probe the registry's loopback /v2/ endpoint on the 110 host over the bounded
# SSH control channel and capture the HTTP status code it prints.
# Each workflow step runs in its own shell, so the run_ssh helper and ssh_base
# array defined in the repair step are not available here; the bounded SSH
# invocation is therefore spelled out inline, using the workflow-level
# AWOOOI_110_SSH_TARGET environment variable.
# The trailing `|| true` keeps an SSH or timeout failure from aborting the step
# under `set -e`; an empty result then falls through to the "000" default and
# the BLOCKED status report that follow.
local_status="$(
  timeout 30 ssh \
    -o BatchMode=yes \
    -o ConnectTimeout=8 \
    -o ServerAliveInterval=5 \
    -o ServerAliveCountMax=2 \
    "${AWOOOI_110_SSH_TARGET}" \
    'curl --silent --show-error --output /dev/null --write-out "%{http_code}" --max-time 10 http://127.0.0.1:5000/v2/ || true' \
    || true
)"
|
||||
public_status="$(curl --silent --show-error --output /dev/null --write-out "%{http_code}" --max-time 10 https://registry.wooo.work/v2/ || true)"
|
||||
[ -n "${local_status}" ] || local_status="000"
|
||||
[ -n "${public_status}" ] || public_status="000"
|
||||
echo "harbor_110_local_v2_http_status=${local_status}"
|
||||
echo "harbor_110_remote_local_v2_http_status=${local_status}"
|
||||
echo "harbor_public_registry_v2_http_status=${public_status}"
|
||||
case "${local_status}" in
|
||||
200|401) ;;
|
||||
*) echo "BLOCKED harbor_local_registry_v2_unavailable status=${local_status}"; exit 1 ;;
|
||||
*) echo "BLOCKED harbor_110_remote_local_registry_v2_unavailable status=${local_status}"; exit 1 ;;
|
||||
esac
|
||||
case "${public_status}" in
|
||||
200|401) ;;
|
||||
*) echo "BLOCKED harbor_public_registry_v2_unavailable status=${public_status}"; exit 1 ;;
|
||||
esac
|
||||
echo "harbor_110_local_repair_verified=true"
|
||||
echo "harbor_110_remote_repair_verified=true"
|
||||
|
||||
@@ -1,3 +1,21 @@
|
||||
## 2026-07-01 — 09:18 non-110 runner controlled apply / B5 Docker socket fail-fast
|
||||
|
||||
**照主線修正的問題**:
|
||||
- Gitea main 已到 `4aaa95c44 fix(runner): keep non110 service alive after registration`,但最新 CD `#4208` 先因 `awoooi-non110-host` waiting 卡住;188 verifier 回 registration / config / binary ready、service inactive、keepalive timer active。
|
||||
- 執行 188 user-level controlled apply:`AWOOOI_NON110_ENABLE=1 ... install-awoooi-non110-runner-user-service.sh --enable`,讀回 service `ActiveState=active`、`MainPID=3380348`、keepalive timer active、`runner_token_read=false`、`raw_runner_registration_read=false`。
|
||||
- apply 後 verifier 回 `AWOOOI_NON110_RUNNER_READY=1`、`BLOCKER_COUNT=0`,CD `#4208` 從 Waiting 進 Running;之後 tests job 失敗點不是 runner,而是 B5 容器內 Docker socket 權限不足,`pg-test-b5` 無法啟動 / 解析,導致 `tests/integration/test_b5_core_flows.py` setup 全部 `socket.gaierror`。
|
||||
- `.gitea/workflows/cd.yaml` 的 B5 full profile 補上 `set -euo pipefail`、`docker info` fail-fast blocker、Postgres readiness gate 與 B5 test container `--user 0:0`,避免 Docker socket 不可用時繼續跑 pytest 產生模糊 DB socket error。
|
||||
- `.gitea/workflows/harbor-110-local-repair.yaml` 從已無 matching runner 的 `awoooi-host` 改為 `awoooi-non110-host`,再用 bounded SSH control channel 進 110 執行既有 `recover-110-control-path-and-harbor-local.sh`;若 110 SSH 不通,會明確回 `harbor_110_remote_control_channel_unavailable`,不再讓 queue 停在 no-matching runner。
|
||||
- `ops/runner/test_cd_controlled_runtime_profile.py` 補 B5 Docker socket / DB network fail-fast shape test,並把 Harbor repair workflow 測試對齊現行 non-110 bounded SSH control-channel source truth;未恢復 110 legacy / generic runner。
|
||||
|
||||
**驗證**:
|
||||
- `python3.11 -m pytest ops/runner/test_cd_controlled_runtime_profile.py -q --tb=short -p no:cacheprovider` 通過(34 passed)。
|
||||
- `python3.11 ops/runner/guard-gitea-runner-pressure.py --root .` 通過,`auto_branch_events_on_110=0`、`generic_runner_labels=0`。
|
||||
- `node scripts/ci/check-gitea-step-env-secrets.js` 通過。
|
||||
- `git diff --check` 通過。
|
||||
|
||||
**邊界**:本輪只對 188 user-level non-110 runner service 做 allowlisted controlled apply,未讀 token / secret / `.env` / raw sessions / SQLite / auth,未讀 `.runner` 內容,未使用 GitHub / `gh` / GitHub API,未 workflow_dispatch,未重啟主機、未 restart Docker / Nginx / K3s / DB / firewall;110 Harbor 下一步是讓 non-110 repair lane 跑 bounded SSH control channel,若 SSH 仍不通則以 receipt 形式收斂 blocker。
|
||||
|
||||
## 2026-07-01 — 08:54 CD host pressure interrupted / Postgres recovery classifier
|
||||
|
||||
**照主線修正的問題**:
|
||||
|
||||
@@ -115,15 +115,22 @@ def test_harbor_110_local_repair_workflow_is_dispatch_only_and_bounded() -> None
|
||||
assert "push:" not in text
|
||||
assert "pull_request:" not in text
|
||||
assert "pull_request_target:" not in text
|
||||
assert "runs-on: awoooi-host" in text
|
||||
assert "runs-on: awoooi-non110-host" in text
|
||||
assert "runs-on: awoooi-host" not in text
|
||||
assert "guard-gitea-runner-pressure.py --root ." in text
|
||||
assert "recover-110-control-path-and-harbor-local.sh" in text
|
||||
assert "--check" in text
|
||||
assert "--apply-all" in text
|
||||
assert "sudo -n env" in text
|
||||
assert "AWOOOI_110_SSH_TARGET" in text
|
||||
assert "BatchMode=yes" in text
|
||||
assert "ConnectTimeout=8" in text
|
||||
assert "operation_boundary_remote_ssh_bounded=true" in text
|
||||
assert "harbor_110_remote_control_channel_unavailable" in text
|
||||
assert "harbor_110_remote_repair_check_start=1" in text
|
||||
assert "harbor_110_remote_repair_apply_all_start=1" in text
|
||||
assert "GITHUB_EVENT_NAME" in text
|
||||
assert "harbor_110_local_repair_skipped=already_ready" in text
|
||||
assert "harbor_110_remote_repair_skipped=already_ready" in text
|
||||
assert "192.168.0.110" in text
|
||||
assert "http://127.0.0.1:5000/v2/" in text
|
||||
assert "https://registry.wooo.work/v2/" in text
|
||||
@@ -564,6 +571,25 @@ def test_controlled_runtime_skips_b5_before_docker_socket_use() -> None:
|
||||
assert controlled_gate < exit_zero < docker_socket
|
||||
|
||||
|
||||
def test_b5_full_profile_fails_fast_when_docker_socket_or_db_network_is_unready() -> None:
|
||||
text = _workflow_text()
|
||||
b5_block = text.split("- name: Integration Tests (B5", 1)[1]
|
||||
b5_block = b5_block.split("- name: Clean Test Workspace Artifacts", 1)[0]
|
||||
|
||||
assert "set -euo pipefail" in b5_block
|
||||
assert "docker info >/dev/null 2>&1" in b5_block
|
||||
assert "BLOCKER b5_docker_socket_unavailable" in b5_block
|
||||
assert "B5_DB_READY=0" in b5_block
|
||||
assert "BLOCKER b5_pg_test_container_not_ready" in b5_block
|
||||
assert "--user 0:0" in b5_block
|
||||
assert b5_block.index("docker info >/dev/null 2>&1") < b5_block.index(
|
||||
"docker run -d --name pg-test-b5"
|
||||
)
|
||||
assert b5_block.index("BLOCKER b5_pg_test_container_not_ready") < b5_block.index(
|
||||
"/opt/api-venv/bin/pytest tests/integration/test_b5_core_flows.py"
|
||||
)
|
||||
|
||||
|
||||
def test_controlled_runtime_pytest_paths_exist() -> None:
|
||||
text = _workflow_text()
|
||||
block = text.split("PYTHONFAULTHANDLER=1 python3.11 -m pytest", 1)[1]
|
||||
|
||||
Reference in New Issue
Block a user