diff --git a/.gitea/workflows/harbor-110-local-repair.yaml b/.gitea/workflows/harbor-110-local-repair.yaml index 35139577..337b31f6 100644 --- a/.gitea/workflows/harbor-110-local-repair.yaml +++ b/.gitea/workflows/harbor-110-local-repair.yaml @@ -131,7 +131,15 @@ jobs: else echo "harbor_110_remote_ssh_publickey_offered=false" fi - if printf '%s\n' "${diag_output}" | grep -q "we sent a publickey packet, wait for reply" \ + if printf '%s\n' "${diag_output}" | grep -q "Server accepts key" \ + && printf '%s\n' "${diag_output}" | grep -Eq "timed out|Timeout, server"; then + echo "harbor_110_remote_ssh_server_accepts_key_then_session_timeout=true" + echo "SSH_AUTH user=wooo mode=publickey rc=${diag_rc} classification=server_accepts_key_then_timeout" + else + echo "harbor_110_remote_ssh_server_accepts_key_then_session_timeout=false" + fi + if { printf '%s\n' "${diag_output}" | grep -q "we sent a publickey packet, wait for reply" \ + || printf '%s\n' "${diag_output}" | grep -q "Server accepts key"; } \ && printf '%s\n' "${diag_output}" | grep -Eq "timed out|Timeout, server"; then echo "harbor_110_remote_ssh_publickey_reply_timeout_seen=true" echo "harbor_110_remote_ssh_publickey_auth_stalled=true" diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index fe062dfd..b98ab023 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -12,6 +12,20 @@ **邊界**:只改 priority work order overlay 排序邏輯、測試與 LOGBOOK;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未讀 authorized_keys 內容或 `.runner` 內容;未使用 GitHub / `gh` / GitHub API;未 workflow_dispatch;未重啟主機、未 restart Docker / Nginx / K3s / DB / firewall;未執行 runtime apply。 +## 2026-07-01 — 12:50 Harbor repair workflow SSH session-timeout marker + +**照主線修正的問題**: +- `ce5bcab8b` 已讓 queue parser / AI Loop 能讀 `remote_ssh_server_accepts_key_then_session_timeout`,但 Harbor repair workflow 的 SSH 診斷尚未輸出該 marker,導致最新 queue 仍只能從舊 repair log 看到泛稱 `remote_ssh_publickey_auth_stalled`。 +- `.gitea/workflows/harbor-110-local-repair.yaml` 的 `diagnose_ssh_control_channel` 現在在 OpenSSH `Server accepts key` + timeout 時輸出 `harbor_110_remote_ssh_server_accepts_key_then_session_timeout=true` 與 metadata-only `SSH_AUTH user=wooo mode=publickey rc=... classification=server_accepts_key_then_timeout`。 +- publickey auth stalled 判定同步接受 `Server accepts key` + timeout,不再只依賴 `we sent a publickey packet, wait for reply`。 +- workflow 仍維持 hourly / workflow_dispatch、bounded SSH、no secret、no raw SSH log、no reboot / Docker restart / node drain。 + +**驗證**: +- `python3.11 -m pytest ops/runner/test_cd_controlled_runtime_profile.py ops/runner/test_read_public_gitea_actions_queue.py -q --tb=short -p no:cacheprovider`:通過。 +- `python3.11 ops/runner/guard-gitea-runner-pressure.py --root .`、`node scripts/ci/check-gitea-step-env-secrets.js .gitea/workflows/harbor-110-local-repair.yaml`、`git diff --check`:通過。 + +**邊界**:只改 Harbor repair workflow 診斷 marker、workflow profile test 與 LOGBOOK;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未讀 authorized_keys 內容或 `.runner` 內容;未使用 GitHub / `gh` / GitHub API;未 workflow_dispatch;未重啟主機、未 restart Docker / Nginx / K3s / DB / firewall;未執行 110 runtime apply。 + ## 2026-07-01 — 12:38 110 SSH local metadata receipt 擴充 **照主線修正的問題**: diff --git a/ops/runner/test_cd_controlled_runtime_profile.py b/ops/runner/test_cd_controlled_runtime_profile.py index d018c66a..500a9a9c 100644 --- a/ops/runner/test_cd_controlled_runtime_profile.py +++ b/ops/runner/test_cd_controlled_runtime_profile.py @@ -148,6 +148,9 @@ def test_harbor_110_local_repair_workflow_is_dispatch_only_and_bounded() -> None assert 'timeout "${AWOOOI_110_SSH_COMMAND_TIMEOUT_SECONDS}"' in text assert "operation_boundary_remote_ssh_bounded=true" in text assert "harbor_110_remote_control_channel_unavailable" in text + assert "harbor_110_remote_ssh_server_accepts_key_then_session_timeout" in text + assert "classification=server_accepts_key_then_timeout" in text + assert "harbor_110_remote_ssh_diag_raw_log_printed=false" in text assert "harbor_110_remote_repair_check_start=1" in text assert 'if [ "${check_rc}" -ne 0 ]; then' in text assert "BLOCKED harbor_110_remote_repair_check_unavailable" in text