fix(runner): diagnose harbor ssh auth stalls
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 36s
CD Pipeline / build-and-deploy (push) Failing after 28s
CD Pipeline / post-deploy-checks (push) Has been skipped

This commit is contained in:
Your Name
2026-07-01 09:45:46 +08:00
parent d1ec8c41aa
commit fa42099d85
3 changed files with 260 additions and 1 deletions

View File

@@ -70,7 +70,66 @@ jobs:
timeout 30 "${ssh_base[@]}" "$@"
}
diagnose_ssh_control_channel() {
  # Run one bounded, verbose, publickey-only SSH probe against
  # ${AWOOOI_110_SSH_TARGET} and print key=value markers describing which
  # handshake stages show up in the client's debug output.
  #
  # The captured -vvv log is only inspected, never echoed; the final marker
  # (harbor_110_remote_ssh_diag_raw_log_printed=false) states that explicitly.
  #
  # Side effects: sets the shell variables diag_output and diag_rc, and turns
  # errexit off for the probe and back on afterwards.
  set +e
  # `timeout 20` caps the whole probe; the -o options disable every
  # interactive/password path so ssh can only attempt publickey auth, and the
  # connect/keepalive options bound how long ssh itself waits.
  diag_output="$(
    timeout 20 ssh -vvv -4 \
      -o BatchMode=yes \
      -o PreferredAuthentications=publickey \
      -o PasswordAuthentication=no \
      -o KbdInteractiveAuthentication=no \
      -o GSSAPIAuthentication=no \
      -o NumberOfPasswordPrompts=0 \
      -o ConnectTimeout=8 \
      -o ConnectionAttempts=1 \
      -o ServerAliveInterval=3 \
      -o ServerAliveCountMax=1 \
      "${AWOOOI_110_SSH_TARGET}" \
      'true' 2>&1
  )"
  diag_rc=$?
  set -e
  echo "harbor_110_remote_ssh_diag_rc=${diag_rc}"

  # Each check below feeds grep through a here-string rather than a
  # `printf ... | grep -q` pipeline: there is no producer process that an
  # early `grep -q` exit could kill with SIGPIPE (which would turn a real
  # match into "false" if the calling script runs with `pipefail`), and no
  # extra fork per check. A here-string appends the same trailing newline
  # that `printf '%s\n'` did.
  if grep -q "Connection established." <<<"${diag_output}"; then
    echo "harbor_110_remote_ssh_tcp_connected=true"
  else
    echo "harbor_110_remote_ssh_tcp_connected=false"
  fi
  if grep -q "Remote protocol version" <<<"${diag_output}"; then
    echo "harbor_110_remote_ssh_banner_seen=true"
  else
    echo "harbor_110_remote_ssh_banner_seen=false"
  fi
  if grep -q "SSH2_MSG_SERVICE_ACCEPT received" <<<"${diag_output}"; then
    echo "harbor_110_remote_ssh_userauth_service_accept_seen=true"
  else
    echo "harbor_110_remote_ssh_userauth_service_accept_seen=false"
  fi
  if grep -q "Offering public key:" <<<"${diag_output}"; then
    echo "harbor_110_remote_ssh_publickey_offered=true"
  else
    echo "harbor_110_remote_ssh_publickey_offered=false"
  fi
  # "Stalled" means both: the client reported sending a publickey packet and
  # waiting for the reply, AND the log contains a timeout message.
  if grep -q "we sent a publickey packet, wait for reply" <<<"${diag_output}" \
    && grep -Eq "timed out|Timeout, server" <<<"${diag_output}"; then
    echo "harbor_110_remote_ssh_publickey_reply_timeout_seen=true"
    echo "harbor_110_remote_ssh_publickey_auth_stalled=true"
    echo "BLOCKED harbor_110_remote_ssh_publickey_auth_stalled target=${AWOOOI_110_SSH_TARGET}"
  else
    echo "harbor_110_remote_ssh_publickey_reply_timeout_seen=false"
    echo "harbor_110_remote_ssh_publickey_auth_stalled=false"
  fi
  if grep -q "Permission denied" <<<"${diag_output}"; then
    echo "harbor_110_remote_ssh_auth_permission_denied=true"
  else
    echo "harbor_110_remote_ssh_auth_permission_denied=false"
  fi
  echo "harbor_110_remote_ssh_diag_raw_log_printed=false"
}
if ! run_ssh "expected_host_ip=${AWOOOI_110_EXPECTED_HOST_IP}; printf 'remote_host=%s\n' \"\$(hostname 2>/dev/null || echo unknown)\"; printf 'remote_user=%s\n' \"\$(id -un 2>/dev/null || echo unknown)\"; hostname -I 2>/dev/null | tr ' ' '\n' | grep -qx \"\${expected_host_ip}\""; then
diagnose_ssh_control_channel || true
echo "BLOCKED harbor_110_remote_control_channel_unavailable target=${AWOOOI_110_SSH_TARGET}"
echo "harbor_110_remote_ssh_reachable=false"
exit 65

View File

@@ -85,6 +85,9 @@ _HARBOR_110_REMOTE_SSH_TIMEOUT_RE = re.compile(
r"(Connection to 192\.168\.0\.110 port 22 timed out|"
r"ssh: connect to host 192\.168\.0\.110 port 22: Operation timed out)"
)
# Regex source template for a "<name>=true" / "<name>=false" log marker.
# "{name}" is a str.format placeholder (filled with an escaped marker name by
# _last_bool_marker); the boolean text is captured in the named group "value".
# The pattern is not anchored, so it matches anywhere inside a line.
_HARBOR_110_REMOTE_SSH_BOOL_RE_TEMPLATE = (
r"{name}=(?P<value>true|false)"
)
# Matches a "harbor_110_remote_local_v2_http_status=NNN" marker and captures
# the three-digit value in the named group "status".
_HARBOR_110_REMOTE_LOCAL_V2_STATUS_RE = re.compile(
r"harbor_110_remote_local_v2_http_status=(?P<status>\d{3})"
)
@@ -630,6 +633,31 @@ def build_readback(
"latest_visible_harbor_110_repair_bounded_ssh_timeout_seen": (
harbor_110_repair_log_classifier["bounded_ssh_timeout_seen"]
),
"latest_visible_harbor_110_repair_remote_ssh_tcp_connected": (
harbor_110_repair_log_classifier["remote_ssh_tcp_connected"]
),
"latest_visible_harbor_110_repair_remote_ssh_banner_seen": (
harbor_110_repair_log_classifier["remote_ssh_banner_seen"]
),
"latest_visible_harbor_110_repair_remote_ssh_userauth_service_accept_seen": (
harbor_110_repair_log_classifier[
"remote_ssh_userauth_service_accept_seen"
]
),
"latest_visible_harbor_110_repair_remote_ssh_publickey_offered": (
harbor_110_repair_log_classifier["remote_ssh_publickey_offered"]
),
"latest_visible_harbor_110_repair_remote_ssh_publickey_reply_timeout_seen": (
harbor_110_repair_log_classifier[
"remote_ssh_publickey_reply_timeout_seen"
]
),
"latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled": (
harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"]
),
"latest_visible_harbor_110_repair_remote_ssh_auth_permission_denied": (
harbor_110_repair_log_classifier["remote_ssh_auth_permission_denied"]
),
"latest_visible_harbor_110_repair_local_registry_v2_status": (
harbor_110_repair_log_classifier["local_registry_v2_status"]
),
@@ -727,6 +755,8 @@ def build_readback(
if latest_cd_visible_blocked
else "blocked_current_cd_workflow_waiting_for_runner_or_queue"
if latest_cd_waiting
else "blocked_harbor_110_remote_ssh_publickey_auth_stalled"
if harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"]
else "blocked_harbor_110_remote_control_channel_unavailable"
if harbor_110_repair_log_classifier["remote_control_channel_unavailable"]
else "blocked_harbor_110_remote_local_registry_v2_unavailable"
@@ -874,6 +904,31 @@ def build_readback(
"harbor_110_repair_bounded_ssh_timeout_seen": (
harbor_110_repair_log_classifier["bounded_ssh_timeout_seen"]
),
"harbor_110_repair_remote_ssh_tcp_connected": (
harbor_110_repair_log_classifier["remote_ssh_tcp_connected"]
),
"harbor_110_repair_remote_ssh_banner_seen": (
harbor_110_repair_log_classifier["remote_ssh_banner_seen"]
),
"harbor_110_repair_remote_ssh_userauth_service_accept_seen": (
harbor_110_repair_log_classifier[
"remote_ssh_userauth_service_accept_seen"
]
),
"harbor_110_repair_remote_ssh_publickey_offered": (
harbor_110_repair_log_classifier["remote_ssh_publickey_offered"]
),
"harbor_110_repair_remote_ssh_publickey_reply_timeout_seen": (
harbor_110_repair_log_classifier[
"remote_ssh_publickey_reply_timeout_seen"
]
),
"harbor_110_repair_remote_ssh_publickey_auth_stalled": (
harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"]
),
"harbor_110_repair_remote_ssh_auth_permission_denied": (
harbor_110_repair_log_classifier["remote_ssh_auth_permission_denied"]
),
"harbor_110_repair_local_registry_v2_status": (
harbor_110_repair_log_classifier["local_registry_v2_status"]
),
@@ -992,9 +1047,43 @@ def classify_harbor_110_repair_log(text: str) -> dict[str, Any]:
)
bounded_ssh_timeout_seen = _HARBOR_110_REMOTE_SSH_TIMEOUT_RE.search(text) is not None
remote_ssh_tcp_connected = _last_bool_marker(
"harbor_110_remote_ssh_tcp_connected",
text,
)
remote_ssh_banner_seen = _last_bool_marker(
"harbor_110_remote_ssh_banner_seen",
text,
)
remote_ssh_userauth_service_accept_seen = _last_bool_marker(
"harbor_110_remote_ssh_userauth_service_accept_seen",
text,
)
remote_ssh_publickey_offered = _last_bool_marker(
"harbor_110_remote_ssh_publickey_offered",
text,
)
remote_ssh_publickey_reply_timeout_seen = _last_bool_marker(
"harbor_110_remote_ssh_publickey_reply_timeout_seen",
text,
)
remote_ssh_publickey_auth_stalled = (
"harbor_110_remote_ssh_publickey_auth_stalled=true" in text
or "BLOCKED harbor_110_remote_ssh_publickey_auth_stalled" in text
or (
remote_ssh_userauth_service_accept_seen is True
and remote_ssh_publickey_offered is True
and remote_ssh_publickey_reply_timeout_seen is True
)
)
remote_ssh_auth_permission_denied = _last_bool_marker(
"harbor_110_remote_ssh_auth_permission_denied",
text,
)
remote_control_channel_unavailable = (
"harbor_110_remote_control_channel_unavailable" in text
or (bounded_ssh_timeout_seen and remote_ssh_reachable is False)
or remote_ssh_publickey_auth_stalled
)
local_registry_v2_unavailable = (
_HARBOR_110_REMOTE_LOCAL_V2_BLOCKER_RE.search(text) is not None
@@ -1024,7 +1113,9 @@ def classify_harbor_110_repair_log(text: str) -> dict[str, Any]:
return {
"failure_classifier": (
"harbor_110_remote_control_channel_unavailable"
"harbor_110_remote_ssh_publickey_auth_stalled"
if remote_ssh_publickey_auth_stalled
else "harbor_110_remote_control_channel_unavailable"
if remote_control_channel_unavailable
else "harbor_110_remote_local_registry_v2_unavailable"
if local_registry_v2_unavailable
@@ -1035,6 +1126,17 @@ def classify_harbor_110_repair_log(text: str) -> dict[str, Any]:
"remote_control_channel_unavailable": remote_control_channel_unavailable,
"remote_ssh_reachable": remote_ssh_reachable,
"bounded_ssh_timeout_seen": bounded_ssh_timeout_seen,
"remote_ssh_tcp_connected": remote_ssh_tcp_connected,
"remote_ssh_banner_seen": remote_ssh_banner_seen,
"remote_ssh_userauth_service_accept_seen": (
remote_ssh_userauth_service_accept_seen
),
"remote_ssh_publickey_offered": remote_ssh_publickey_offered,
"remote_ssh_publickey_reply_timeout_seen": (
remote_ssh_publickey_reply_timeout_seen
),
"remote_ssh_publickey_auth_stalled": remote_ssh_publickey_auth_stalled,
"remote_ssh_auth_permission_denied": remote_ssh_auth_permission_denied,
"local_registry_v2_status": local_status,
"public_registry_v2_status": public_status,
"local_registry_v2_unavailable": local_registry_v2_unavailable,
@@ -1127,6 +1229,16 @@ def _last_named_match_group(pattern: re.Pattern[str], text: str, group: str) ->
return matches[-1].group(group) if matches else ""
def _last_bool_marker(name: str, text: str) -> bool | None:
    """Return the value of the last ``<name>=true|false`` marker in *text*.

    Args:
        name: Marker name; it is regex-escaped before being substituted into
            the module's bool-marker template.
        text: Log text to scan.

    Returns:
        ``True`` or ``False`` according to the final occurrence of the marker,
        or ``None`` when the marker never appears.
    """
    marker_re = re.compile(
        _HARBOR_110_REMOTE_SSH_BOOL_RE_TEMPLATE.format(name=re.escape(name))
    )
    # Walk every occurrence; whatever is bound when the loop ends is the last.
    final_match = None
    for final_match in marker_re.finditer(text):
        pass
    if final_match is None:
        return None
    return final_match.group("value") == "true"
def _read_text_file(path: Path) -> str:
return path.read_text(encoding="utf-8")
@@ -1194,6 +1306,14 @@ def _human_summary(payload: dict[str, Any]) -> str:
"LATEST_VISIBLE_HARBOR_110_REPAIR_REMOTE_SSH_REACHABLE="
f"{readback['latest_visible_harbor_110_repair_remote_ssh_reachable']}"
),
(
"LATEST_VISIBLE_HARBOR_110_REPAIR_REMOTE_SSH_PUBLICKEY_AUTH_STALLED="
f"{int(readback['latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled'])}"
),
(
"LATEST_VISIBLE_HARBOR_110_REPAIR_REMOTE_SSH_PUBLICKEY_REPLY_TIMEOUT_SEEN="
f"{readback['latest_visible_harbor_110_repair_remote_ssh_publickey_reply_timeout_seen']}"
),
(
"HARBOR_110_REPAIR_WAITING_AFTER_CD_HARBOR_BLOCKER="
f"{int(readback['harbor_110_repair_waiting_after_cd_harbor_blocker'])}"

View File

@@ -282,6 +282,28 @@ harbor_110_remote_ssh_reachable=false
"""
def _harbor_110_repair_publickey_auth_stalled_log() -> str:
return """
operation_boundary_secret_value_read=false
operation_boundary_docker_daemon_restart_performed=false
operation_boundary_host_reboot_performed=false
operation_boundary_node_drain_performed=false
operation_boundary_remote_ssh_bounded=true
harbor_110_remote_ssh_diag_rc=255
harbor_110_remote_ssh_tcp_connected=true
harbor_110_remote_ssh_banner_seen=true
harbor_110_remote_ssh_userauth_service_accept_seen=true
harbor_110_remote_ssh_publickey_offered=true
harbor_110_remote_ssh_publickey_reply_timeout_seen=true
harbor_110_remote_ssh_publickey_auth_stalled=true
BLOCKED harbor_110_remote_ssh_publickey_auth_stalled target=wooo@192.168.0.110
harbor_110_remote_ssh_auth_permission_denied=false
harbor_110_remote_ssh_diag_raw_log_printed=false
BLOCKED harbor_110_remote_control_channel_unavailable target=wooo@192.168.0.110
harbor_110_remote_ssh_reachable=false
"""
def _harbor_110_repair_success_jobs() -> dict:
return {
"total_count": 2,
@@ -697,6 +719,64 @@ def test_build_readback_classifies_harbor_repair_remote_control_unavailable() ->
assert payload["operation_boundaries"]["host_write_performed"] is False
def test_build_readback_classifies_harbor_repair_publickey_auth_stalled() -> None:
    """A repair log with stalled-publickey markers must yield the stalled status."""
    readback_module = _load_module()
    # Flip only the first "Waiting" tooltip in the actions HTML fixture to
    # "Failure".
    actions_page = _actions_html_cd_running_harbor_repair_waiting().replace(
        'data-tooltip-content="Waiting"',
        'data-tooltip-content="Failure"',
        1,
    )
    build_kwargs = {
        "actions_html": actions_page,
        "actions_list_http_status": 401,
        "actions_list_payload": {"message": "token is required"},
        "cd_jobs_http_status": 200,
        "cd_jobs_payload": {"jobs": [], "total_count": 0},
        "harbor_110_repair_jobs_http_status": 200,
        "harbor_110_repair_jobs_payload": _harbor_110_repair_stale_code_review_jobs(),
        "latest_cd_build_log_http_status": 200,
        "latest_cd_build_log_text": _harbor_blocked_log(),
        "latest_harbor_110_repair_log_http_status": 200,
        "latest_harbor_110_repair_log_text": (
            _harbor_110_repair_publickey_auth_stalled_log()
        ),
    }
    result = readback_module.build_readback(**build_kwargs)

    assert result["status"] == "blocked_harbor_110_remote_ssh_publickey_auth_stalled"

    readback = result["readback"]
    assert readback["latest_visible_harbor_110_repair_failure_classifier"] == (
        "harbor_110_remote_ssh_publickey_auth_stalled"
    )
    # Readback flags that must be exactly these bool objects (identity check).
    expected_flags = {
        "latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled": True,
        "latest_visible_harbor_110_repair_remote_ssh_publickey_reply_timeout_seen": True,
        "latest_visible_harbor_110_repair_remote_ssh_userauth_service_accept_seen": True,
        "latest_visible_harbor_110_repair_remote_ssh_auth_permission_denied": False,
    }
    for flag_name, expected in expected_flags.items():
        assert readback[flag_name] is expected

    assert (
        result["rollups"]["harbor_110_repair_remote_ssh_publickey_auth_stalled"]
        is True
    )
    boundaries = result["operation_boundaries"]
    assert boundaries["secret_or_runner_token_read"] is False
    assert boundaries["host_write_performed"] is False
def test_build_readback_surfaces_harbor_110_repair_no_matching_runner() -> None:
module = _load_module()
payload = module.build_readback(