From f50e363a59a3a754c8dee7dd0dd27ce507a54ea4 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Wed, 1 Jul 2026 09:45:48 +0800
Subject: [PATCH] fix(runner): retry harbor repair ssh probe

---
 .gitea/workflows/harbor-110-local-repair.yaml | 25 ++++++++++++++++++++++++-
 docs/LOGBOOK.md                               | 14 ++++++++++++++
 .../test_cd_controlled_runtime_profile.py     |  6 ++++++
 3 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/.gitea/workflows/harbor-110-local-repair.yaml b/.gitea/workflows/harbor-110-local-repair.yaml
index 23dc5e99..760e5d7f 100644
--- a/.gitea/workflows/harbor-110-local-repair.yaml
+++ b/.gitea/workflows/harbor-110-local-repair.yaml
@@ -65,9 +65,32 @@ jobs:
             -o ServerAliveCountMax=2
             "${AWOOOI_110_SSH_TARGET}"
           )
+          SSH_PROBE_ATTEMPTS="${AWOOOI_110_SSH_PROBE_ATTEMPTS:-6}"
+          SSH_PROBE_SLEEP_SECONDS="${AWOOOI_110_SSH_PROBE_SLEEP_SECONDS:-10}"
 
           run_ssh() {
-            timeout 30 "${ssh_base[@]}" "$@"
+            # Run "$@" through ssh_base, each attempt bounded by `timeout 30`, retrying up
+            # to SSH_PROBE_ATTEMPTS times with SSH_PROBE_SLEEP_SECONDS between attempts.
+            # Returns 0 on first success, else the last attempt's status (1 if none ran).
+            local attempt rc
+            attempt=1
+            rc=1
+            while [ "${attempt}" -le "${SSH_PROBE_ATTEMPTS}" ]; do
+              if timeout 30 "${ssh_base[@]}" "$@"; then
+                echo "harbor_110_remote_ssh_probe_attempt=${attempt} result=success"
+                return 0
+              else
+                # Capture the status here: after `fi`, `$?` is 0 whenever the condition
+                # failed and no branch ran, which would report the failure as success.
+                rc=$?
+              fi
+              echo "harbor_110_remote_ssh_probe_attempt=${attempt} result=failure rc=${rc}"
+              if [ "${attempt}" -lt "${SSH_PROBE_ATTEMPTS}" ]; then
+                sleep "${SSH_PROBE_SLEEP_SECONDS}"
+              fi
+              attempt=$((attempt + 1))
+            done
+            return "${rc}"
           }
 
           diagnose_ssh_control_channel() {
diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md
index 272a1149..6bca7441 100644
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -1,3 +1,17 @@
+## 2026-07-01 — 09:44 Harbor repair SSH probe bounded retry
+
+**照主線修正的問題**:
+- 最新 live truth:CD `#4215` 仍因 Harbor public `/v2/` = `502` 失敗;Harbor repair `#4212` 的具體 blocker 是 `harbor_110_remote_control_channel_unavailable`。
+- 188 non-110 runner lane 讀回 ready、host pressure 正常;但 188 → 110 bounded SSH probe 呈現間歇性,一次 `true` 可成功,下一次 `recover-110-control-path-and-harbor-local.sh --check` 又 timeout。
+- `.gitea/workflows/harbor-110-local-repair.yaml` 對非寫入的 SSH probe / verifier 加 bounded retry:預設 `6` 次、每次仍受 `ConnectTimeout=8`、`ServerAlive*` 與外層 `timeout 30` 限制,並輸出 `harbor_110_remote_ssh_probe_attempt=...` receipt。`run_recovery --apply-all` 不自動 retry,避免半套用被重跑。
+
+**驗證**:
+- `python3.11 -m pytest ops/runner/test_cd_controlled_runtime_profile.py -q`:`35 passed`。
+- `DATABASE_URL=postgresql+asyncpg://test:test@localhost:5432/test PYTHONPATH=apps/api python3.11 -m pytest ops/runner/test_read_public_gitea_actions_queue.py ops/runner/test_cd_controlled_runtime_profile.py ops/runner/test_install_awoooi_non110_runner_user_service.py ops/runner/test_verify_awoooi_non110_cd_closure.py apps/api/tests/test_harbor_registry_controlled_recovery_receipt.py -q`:`96 passed`。
+- `python3.11 ops/runner/guard-gitea-runner-pressure.py --root .`、`node scripts/ci/check-gitea-step-env-secrets.js .gitea/workflows/harbor-110-local-repair.yaml`、YAML parse、`git diff --check`:通過。
+
+**邊界**:只改 Harbor repair workflow 的 bounded non-write SSH probe retry / tests / LOGBOOK;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未讀 `.runner` 內容;未使用 GitHub / `gh` / GitHub API;未 workflow_dispatch;未重啟主機、未 restart Docker / Nginx / K3s / DB / firewall。
+
 ## 2026-07-01 — 09:36 188 runner drain / 110 stale docker CPU metric correction
 
 **照主線修正的問題**:
diff --git a/ops/runner/test_cd_controlled_runtime_profile.py b/ops/runner/test_cd_controlled_runtime_profile.py
index 513e65b9..28b732ab 100644
--- a/ops/runner/test_cd_controlled_runtime_profile.py
+++ b/ops/runner/test_cd_controlled_runtime_profile.py
@@ -136,6 +136,12 @@ def test_harbor_110_local_repair_workflow_is_dispatch_only_and_bounded() -> None
     assert "AWOOOI_110_SSH_TARGET" in text
     assert "BatchMode=yes" in text
     assert "ConnectTimeout=8" in text
+    assert 'SSH_PROBE_ATTEMPTS="${AWOOOI_110_SSH_PROBE_ATTEMPTS:-6}"' in text
+    assert (
+        'SSH_PROBE_SLEEP_SECONDS="${AWOOOI_110_SSH_PROBE_SLEEP_SECONDS:-10}"'
+        in text
+    )
+    assert "harbor_110_remote_ssh_probe_attempt=" in text
     assert "operation_boundary_remote_ssh_bounded=true" in text
     assert "harbor_110_remote_control_channel_unavailable" in text
     assert "harbor_110_remote_repair_check_start=1" in text