fix(runner): route harbor repair through non110 lane
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 33s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled

This commit is contained in:
Your Name
2026-07-01 09:14:30 +08:00
parent 62820cabbc
commit e40d77dd9c
4 changed files with 111 additions and 31 deletions

View File

@@ -842,11 +842,17 @@ jobs:
exit 0
fi
cat > /tmp/awoooi-b5-tests.sh <<'CI_SCRIPT'
set -euo pipefail
cd apps/api
# 安裝 psql client
if ! command -v psql &>/dev/null; then
apt-get install -y -q postgresql-client
fi
if ! docker info >/dev/null 2>&1; then
echo "BLOCKER b5_docker_socket_unavailable"
echo "NEXT_ACTION ensure_b5_ci_container_runs_with_docker_socket_permission_then_retry_cd"
exit 65
fi
# 2026-04-19 ogt + Claude Opus 4.7 v3: 主動創 shared network
# 之前 grep ACT_NET 在 c0f3509 run 沒 match → fallback bridge → container name DNS 失效
# 真因: default bridge 不支援 container name DNS,必須 user-defined network
@@ -866,9 +872,20 @@ jobs:
-e POSTGRES_PASSWORD=awoooi_test_2026 \
pgvector/pgvector:pg16
# 等待就緒(用 container name,最多 60 秒)
B5_DB_READY=0
for i in $(seq 1 30); do
PGPASSWORD=awoooi_test_2026 pg_isready -h pg-test-b5 -p 5432 -U awoooi && break || sleep 2
if PGPASSWORD=awoooi_test_2026 pg_isready -h pg-test-b5 -p 5432 -U awoooi; then
B5_DB_READY=1
break
fi
sleep 2
done
if [ "$B5_DB_READY" != "1" ]; then
echo "BLOCKER b5_pg_test_container_not_ready"
echo "NEXT_ACTION inspect_b5_test_network_and_docker_socket_then_retry_cd"
docker ps --filter name=pg-test-b5 --format 'b5_container={{.Names}} status={{.Status}}' || true
exit 66
fi
# 初始化 schema
PGPASSWORD=awoooi_test_2026 psql \
-h pg-test-b5 -p 5432 -U awoooi -d awoooi_test \
@@ -890,6 +907,7 @@ jobs:
CI_SCRIPT
docker run --rm \
--name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-b5-tests" \
--user 0:0 \
--cpus "2.0" \
--memory "2g" \
-v "$PWD:/workspace" \

View File

@@ -3,7 +3,8 @@
# Controlled runtime:
# - workflow_dispatch + low-frequency schedule only
# - no push / pull_request / pull_request_target trigger
# - runs only on the 110-local awoooi-host lane
# - runs on the non-110 controlled host lane, then reaches 110 only through a
# bounded SSH control channel
# - no secret read, no Docker daemon restart, no reboot, no node drain
# - runs the existing bounded recovery script, then verifies local and public /v2/
@@ -17,6 +18,7 @@ on:
env:
AWOOOI_HARBOR_110_LOCAL_REPAIR_ENABLED: "1"
AWOOOI_110_EXPECTED_HOST_IP: 192.168.0.110
AWOOOI_110_SSH_TARGET: wooo@192.168.0.110
AWOOOI_HARBOR_110_LOCAL_REPAIR_TRIGGER: ${{ github.event_name }}
jobs:
@@ -31,7 +33,7 @@ jobs:
harbor-110-local-repair:
if: ${{ env.AWOOOI_HARBOR_110_LOCAL_REPAIR_ENABLED == '1' }}
runs-on: awoooi-host
runs-on: awoooi-non110-host
timeout-minutes: 15
steps:
- uses: actions/checkout@v4
@@ -45,68 +47,84 @@ jobs:
echo "operation_boundary_docker_daemon_restart_performed=false"
echo "operation_boundary_host_reboot_performed=false"
echo "operation_boundary_node_drain_performed=false"
echo "operation_boundary_remote_ssh_bounded=true"
- name: Run 110 local control path and Harbor repair
- name: Run 110 remote control path and Harbor repair
run: |
set -euo pipefail
if ! command -v ip >/dev/null 2>&1; then
echo "BLOCKED ip_command_missing_for_110_host_guard"
exit 65
fi
if ! ip -o -4 addr show 2>/dev/null | grep -q " ${AWOOOI_110_EXPECTED_HOST_IP}/"; then
echo "BLOCKED expected_110_host_ip_not_present"
if ! command -v ssh >/dev/null 2>&1; then
echo "BLOCKED ssh_command_missing_for_110_control_channel"
exit 65
fi
chmod +x scripts/reboot-recovery/repair-110-ssh-publickey-auth-local.sh
chmod +x scripts/reboot-recovery/harbor-watchdog.sh
chmod +x scripts/reboot-recovery/recover-110-control-path-and-harbor-local.sh
ssh_base=(
ssh
-o BatchMode=yes
-o ConnectTimeout=8
-o ServerAliveInterval=5
-o ServerAliveCountMax=2
"${AWOOOI_110_SSH_TARGET}"
)
run_ssh() {
timeout 30 "${ssh_base[@]}" "$@"
}
if ! run_ssh "expected_host_ip=${AWOOOI_110_EXPECTED_HOST_IP}; printf 'remote_host=%s\n' \"\$(hostname 2>/dev/null || echo unknown)\"; printf 'remote_user=%s\n' \"\$(id -un 2>/dev/null || echo unknown)\"; hostname -I 2>/dev/null | tr ' ' '\n' | grep -qx \"\${expected_host_ip}\""; then
echo "BLOCKED harbor_110_remote_control_channel_unavailable target=${AWOOOI_110_SSH_TARGET}"
echo "harbor_110_remote_ssh_reachable=false"
exit 65
fi
echo "harbor_110_remote_ssh_reachable=true"
if ! run_ssh 'test -x /usr/local/bin/recover-110-control-path-and-harbor-local.sh'; then
echo "BLOCKED harbor_110_local_recovery_script_missing path=/usr/local/bin/recover-110-control-path-and-harbor-local.sh"
exit 65
fi
run_recovery() {
sudo -n env \
timeout 240 "${ssh_base[@]}" \
sudo -n env \
TARGET_USER=wooo \
RELOAD_SSH=0 \
AWOOOI_110_EXPECTED_HOST_IP="${AWOOOI_110_EXPECTED_HOST_IP}" \
AWOOOI_110_SSH_REPAIR_SCRIPT="${PWD}/scripts/reboot-recovery/repair-110-ssh-publickey-auth-local.sh" \
AWOOOI_HARBOR_WATCHDOG_SCRIPT="${PWD}/scripts/reboot-recovery/harbor-watchdog.sh" \
"${PWD}/scripts/reboot-recovery/recover-110-control-path-and-harbor-local.sh" \
/usr/local/bin/recover-110-control-path-and-harbor-local.sh \
"$@"
}
echo "harbor_110_local_repair_check_start=1"
echo "harbor_110_remote_repair_check_start=1"
set +e
check_output="$(run_recovery --check 2>&1)"
check_rc=$?
set -e
printf '%s\n' "${check_output}"
echo "harbor_110_local_repair_check_rc=${check_rc}"
echo "harbor_110_remote_repair_check_rc=${check_rc}"
trigger="${GITHUB_EVENT_NAME:-${AWOOOI_HARBOR_110_LOCAL_REPAIR_TRIGGER:-unknown}}"
echo "harbor_110_local_repair_trigger=${trigger}"
echo "harbor_110_remote_repair_trigger=${trigger}"
if [ "${trigger}" != "workflow_dispatch" ] \
&& printf '%s\n' "${check_output}" | grep -q "harbor_ready=true"; then
echo "harbor_110_local_repair_skipped=already_ready"
echo "harbor_110_remote_repair_skipped=already_ready"
exit 0
fi
echo "harbor_110_local_repair_apply_all_start=1"
echo "harbor_110_remote_repair_apply_all_start=1"
run_recovery --apply-all
- name: Verify Harbor registry routes after local repair
run: |
# Probes the registry /v2/ endpoint on the 110 host's loopback and through
# the public hostname. HTTP 200 and 401 are accepted for each probe; any
# other status prints a BLOCKED line and ends the step with exit 1.
set -euo pipefail
# A step's run script executes in its own shell, so the ssh_base array and
# the run_ssh helper declared in the repair step are not defined here.
# Re-declare them with the same bounded options before using the channel.
ssh_base=(
ssh
-o BatchMode=yes
-o ConnectTimeout=8
-o ServerAliveInterval=5
-o ServerAliveCountMax=2
"${AWOOOI_110_SSH_TARGET}"
)
run_ssh() {
timeout 30 "${ssh_base[@]}" "$@"
}
local_status="$(curl --silent --show-error --output /dev/null --write-out "%{http_code}" --max-time 10 http://127.0.0.1:5000/v2/ || true)"
# The inner `|| true` only covers curl on the remote side. The outer one
# covers the SSH transport itself (timeout, refused, auth failure) so that
# `set -e` does not abort before the 000 fallback and BLOCKED line below.
local_status="$(
run_ssh 'curl --silent --show-error --output /dev/null --write-out "%{http_code}" --max-time 10 http://127.0.0.1:5000/v2/ || true' || true
)"
public_status="$(curl --silent --show-error --output /dev/null --write-out "%{http_code}" --max-time 10 https://registry.wooo.work/v2/ || true)"
# An empty capture means no status was written; report it as 000.
[ -n "${local_status}" ] || local_status="000"
[ -n "${public_status}" ] || public_status="000"
echo "harbor_110_local_v2_http_status=${local_status}"
echo "harbor_110_remote_local_v2_http_status=${local_status}"
echo "harbor_public_registry_v2_http_status=${public_status}"
case "${local_status}" in
200|401) ;;
*) echo "BLOCKED harbor_local_registry_v2_unavailable status=${local_status}"; exit 1 ;;
*) echo "BLOCKED harbor_110_remote_local_registry_v2_unavailable status=${local_status}"; exit 1 ;;
esac
case "${public_status}" in
200|401) ;;
*) echo "BLOCKED harbor_public_registry_v2_unavailable status=${public_status}"; exit 1 ;;
esac
echo "harbor_110_local_repair_verified=true"
echo "harbor_110_remote_repair_verified=true"

View File

@@ -1,3 +1,21 @@
## 2026-07-01 — 09:18 non-110 runner controlled apply / B5 Docker socket fail-fast
**照主線修正的問題**
- Gitea main 已到 `4aaa95c44 fix(runner): keep non110 service alive after registration`,但最新 CD `#4208` 先因 `awoooi-non110-host` waiting 卡住;188 verifier 回 registration / config / binary ready、service inactive、keepalive timer active。
- 執行 188 user-level controlled apply:`AWOOOI_NON110_ENABLE=1 ... install-awoooi-non110-runner-user-service.sh --enable`,讀回 service `ActiveState=active`、`MainPID=3380348`、keepalive timer active、`runner_token_read=false`、`raw_runner_registration_read=false`。
- apply 後 verifier 回 `AWOOOI_NON110_RUNNER_READY=1`、`BLOCKER_COUNT=0`,CD `#4208` 從 Waiting 進 Running;之後 tests job 失敗點不是 runner,而是 B5 容器內 Docker socket 權限不足,`pg-test-b5` 無法啟動 / 解析,導致 `tests/integration/test_b5_core_flows.py` setup 全部 `socket.gaierror`。
- `.gitea/workflows/cd.yaml` 的 B5 full profile 補上 `set -euo pipefail`、`docker info` fail-fast blocker、Postgres readiness gate 與 B5 test container `--user 0:0`,避免 Docker socket 不可用時繼續跑 pytest 產生模糊 DB socket error。
- `.gitea/workflows/harbor-110-local-repair.yaml` 從已無 matching runner 的 `awoooi-host` 改為 `awoooi-non110-host`,再用 bounded SSH control channel 進 110 執行既有 `recover-110-control-path-and-harbor-local.sh`;若 110 SSH 不通,會明確回 `harbor_110_remote_control_channel_unavailable`,不再讓 queue 停在 no-matching runner。
- `ops/runner/test_cd_controlled_runtime_profile.py` 補 B5 Docker socket / DB network fail-fast shape test,並把 Harbor repair workflow 測試對齊現行 non-110 bounded SSH control-channel source truth;未恢復 110 legacy / generic runner。
**驗證**
- `python3.11 -m pytest ops/runner/test_cd_controlled_runtime_profile.py -q --tb=short -p no:cacheprovider` 通過(34 passed)。
- `python3.11 ops/runner/guard-gitea-runner-pressure.py --root .` 通過,`auto_branch_events_on_110=0`、`generic_runner_labels=0`。
- `node scripts/ci/check-gitea-step-env-secrets.js` 通過。
- `git diff --check` 通過。
**邊界**:本輪只對 188 user-level non-110 runner service 做 allowlisted controlled apply;未讀 token / secret / `.env` / raw sessions / SQLite / auth,未讀 `.runner` 內容,未使用 GitHub / `gh` / GitHub API,未 workflow_dispatch,未重啟主機、未 restart Docker / Nginx / K3s / DB / firewall;110 Harbor 下一步是讓 non-110 repair lane 跑 bounded SSH control channel,若 SSH 仍不通則以 receipt 形式收斂 blocker。
## 2026-07-01 — 08:54 CD host pressure interrupted / Postgres recovery classifier
**照主線修正的問題**

View File

@@ -115,15 +115,22 @@ def test_harbor_110_local_repair_workflow_is_dispatch_only_and_bounded() -> None
assert "push:" not in text
assert "pull_request:" not in text
assert "pull_request_target:" not in text
assert "runs-on: awoooi-host" in text
assert "runs-on: awoooi-non110-host" in text
assert "runs-on: awoooi-host" not in text
assert "guard-gitea-runner-pressure.py --root ." in text
assert "recover-110-control-path-and-harbor-local.sh" in text
assert "--check" in text
assert "--apply-all" in text
assert "sudo -n env" in text
assert "AWOOOI_110_SSH_TARGET" in text
assert "BatchMode=yes" in text
assert "ConnectTimeout=8" in text
assert "operation_boundary_remote_ssh_bounded=true" in text
assert "harbor_110_remote_control_channel_unavailable" in text
assert "harbor_110_remote_repair_check_start=1" in text
assert "harbor_110_remote_repair_apply_all_start=1" in text
assert "GITHUB_EVENT_NAME" in text
assert "harbor_110_local_repair_skipped=already_ready" in text
assert "harbor_110_remote_repair_skipped=already_ready" in text
assert "192.168.0.110" in text
assert "http://127.0.0.1:5000/v2/" in text
assert "https://registry.wooo.work/v2/" in text
@@ -564,6 +571,25 @@ def test_controlled_runtime_skips_b5_before_docker_socket_use() -> None:
assert controlled_gate < exit_zero < docker_socket
def test_b5_full_profile_fails_fast_when_docker_socket_or_db_network_is_unready() -> None:
    """Pin the fail-fast guards inside the B5 integration-test step.

    Only the workflow text between the ``- name: Integration Tests (B5`` header
    and the ``- name: Clean Test Workspace Artifacts`` header is inspected. That
    slice must contain every guard marker, must probe the Docker socket before
    it starts the ``pg-test-b5`` container, and must carry the database
    readiness blocker ahead of the B5 pytest invocation.
    """
    workflow = _workflow_text()
    after_b5_header = workflow.split("- name: Integration Tests (B5", 1)[1]
    b5_step = after_b5_header.split("- name: Clean Test Workspace Artifacts", 1)[0]

    # Presence checks for each guard marker.
    for marker in (
        "set -euo pipefail",
        "docker info >/dev/null 2>&1",
        "BLOCKER b5_docker_socket_unavailable",
        "B5_DB_READY=0",
        "BLOCKER b5_pg_test_container_not_ready",
        "--user 0:0",
    ):
        assert marker in b5_step

    # Ordering checks. str.index raises ValueError when the anchor text is
    # missing, so an absent anchor fails the test as well.
    socket_probe_at = b5_step.index("docker info >/dev/null 2>&1")
    pg_container_start_at = b5_step.index("docker run -d --name pg-test-b5")
    assert socket_probe_at < pg_container_start_at

    readiness_blocker_at = b5_step.index("BLOCKER b5_pg_test_container_not_ready")
    b5_pytest_at = b5_step.index(
        "/opt/api-venv/bin/pytest tests/integration/test_b5_core_flows.py"
    )
    assert readiness_blocker_at < b5_pytest_at
def test_controlled_runtime_pytest_paths_exist() -> None:
text = _workflow_text()
block = text.split("PYTHONFAULTHANDLER=1 python3.11 -m pytest", 1)[1]