fix(runner): diagnose harbor ssh auth stalls
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 36s
CD Pipeline / build-and-deploy (push) Failing after 28s
CD Pipeline / post-deploy-checks (push) Has been skipped
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 36s
CD Pipeline / build-and-deploy (push) Failing after 28s
CD Pipeline / post-deploy-checks (push) Has been skipped
This commit is contained in:
@@ -70,7 +70,66 @@ jobs:
|
||||
timeout 30 "${ssh_base[@]}" "$@"
|
||||
}
|
||||
|
||||
diagnose_ssh_control_channel() {
  # Run one bounded, verbose, key-only ssh probe against
  # ${AWOOOI_110_SSH_TARGET} and print a `harbor_110_remote_ssh_*=true|false`
  # marker for each handshake stage found in the client debug log.
  #
  # Reads:  AWOOOI_110_SSH_TARGET (ssh destination).
  # Prints: marker lines on stdout, plus a BLOCKED line when the public-key
  #         reply timed out. The raw `ssh -vvv` log is never printed.
  local diag_output diag_rc
  local restore_errexit=0
  case "$-" in
    *e*) restore_errexit=1 ;;
  esac

  # The probe is expected to fail; capture its status instead of aborting,
  # then put errexit back only if the caller had it enabled.
  set +e
  diag_output="$(
    timeout 20 ssh -vvv -4 \
      -o BatchMode=yes \
      -o PreferredAuthentications=publickey \
      -o PasswordAuthentication=no \
      -o KbdInteractiveAuthentication=no \
      -o GSSAPIAuthentication=no \
      -o NumberOfPasswordPrompts=0 \
      -o ConnectTimeout=8 \
      -o ConnectionAttempts=1 \
      -o ServerAliveInterval=3 \
      -o ServerAliveCountMax=1 \
      "${AWOOOI_110_SSH_TARGET}" \
      'true' 2>&1
  )"
  diag_rc=$?
  if [ "${restore_errexit}" -eq 1 ]; then
    set -e
  fi

  # Print "<marker>=true" when the captured log contains the fixed string $2,
  # "<marker>=false" otherwise. A here-string is used instead of
  # `printf | grep -q` so an early grep exit cannot fail the check through
  # SIGPIPE when `pipefail` is active; -F keeps "." in the needles literal.
  _diag_ssh_marker() {
    if grep -Fq -- "$2" <<<"${diag_output}"; then
      echo "$1=true"
    else
      echo "$1=false"
    fi
  }

  echo "harbor_110_remote_ssh_diag_rc=${diag_rc}"
  _diag_ssh_marker harbor_110_remote_ssh_tcp_connected "Connection established."
  _diag_ssh_marker harbor_110_remote_ssh_banner_seen "Remote protocol version"
  _diag_ssh_marker harbor_110_remote_ssh_userauth_service_accept_seen "SSH2_MSG_SERVICE_ACCEPT received"
  _diag_ssh_marker harbor_110_remote_ssh_publickey_offered "Offering public key:"

  # A stall is reported only when the client sent its public-key packet AND
  # the log also contains a timeout message.
  local publickey_reply_timed_out=false
  if grep -Fq -- "we sent a publickey packet, wait for reply" <<<"${diag_output}" \
    && grep -Eq -- "timed out|Timeout, server" <<<"${diag_output}"; then
    publickey_reply_timed_out=true
  fi
  echo "harbor_110_remote_ssh_publickey_reply_timeout_seen=${publickey_reply_timed_out}"
  echo "harbor_110_remote_ssh_publickey_auth_stalled=${publickey_reply_timed_out}"
  if [ "${publickey_reply_timed_out}" = true ]; then
    echo "BLOCKED harbor_110_remote_ssh_publickey_auth_stalled target=${AWOOOI_110_SSH_TARGET}"
  fi

  _diag_ssh_marker harbor_110_remote_ssh_auth_permission_denied "Permission denied"
  echo "harbor_110_remote_ssh_diag_raw_log_printed=false"
}
|
||||
|
||||
if ! run_ssh "expected_host_ip=${AWOOOI_110_EXPECTED_HOST_IP}; printf 'remote_host=%s\n' \"\$(hostname 2>/dev/null || echo unknown)\"; printf 'remote_user=%s\n' \"\$(id -un 2>/dev/null || echo unknown)\"; hostname -I 2>/dev/null | tr ' ' '\n' | grep -qx \"\${expected_host_ip}\""; then
|
||||
diagnose_ssh_control_channel || true
|
||||
echo "BLOCKED harbor_110_remote_control_channel_unavailable target=${AWOOOI_110_SSH_TARGET}"
|
||||
echo "harbor_110_remote_ssh_reachable=false"
|
||||
exit 65
|
||||
|
||||
@@ -85,6 +85,9 @@ _HARBOR_110_REMOTE_SSH_TIMEOUT_RE = re.compile(
|
||||
r"(Connection to 192\.168\.0\.110 port 22 timed out|"
|
||||
r"ssh: connect to host 192\.168\.0\.110 port 22: Operation timed out)"
|
||||
)
|
||||
# Regex template for `<marker>=true|false` log lines. `{name}` is filled in
# with str.format() before compiling; the matched literal is exposed as the
# named group `value`.
_HARBOR_110_REMOTE_SSH_BOOL_RE_TEMPLATE = r"{name}=(?P<value>true|false)"
|
||||
# Matches a `harbor_110_remote_local_v2_http_status=<NNN>` log marker and
# captures the three-digit code as the named group `status`.
_HARBOR_110_REMOTE_LOCAL_V2_STATUS_RE = re.compile(
    r"harbor_110_remote_local_v2_http_status=(?P<status>\d{3})"
)
|
||||
@@ -630,6 +633,31 @@ def build_readback(
|
||||
"latest_visible_harbor_110_repair_bounded_ssh_timeout_seen": (
|
||||
harbor_110_repair_log_classifier["bounded_ssh_timeout_seen"]
|
||||
),
|
||||
"latest_visible_harbor_110_repair_remote_ssh_tcp_connected": (
|
||||
harbor_110_repair_log_classifier["remote_ssh_tcp_connected"]
|
||||
),
|
||||
"latest_visible_harbor_110_repair_remote_ssh_banner_seen": (
|
||||
harbor_110_repair_log_classifier["remote_ssh_banner_seen"]
|
||||
),
|
||||
"latest_visible_harbor_110_repair_remote_ssh_userauth_service_accept_seen": (
|
||||
harbor_110_repair_log_classifier[
|
||||
"remote_ssh_userauth_service_accept_seen"
|
||||
]
|
||||
),
|
||||
"latest_visible_harbor_110_repair_remote_ssh_publickey_offered": (
|
||||
harbor_110_repair_log_classifier["remote_ssh_publickey_offered"]
|
||||
),
|
||||
"latest_visible_harbor_110_repair_remote_ssh_publickey_reply_timeout_seen": (
|
||||
harbor_110_repair_log_classifier[
|
||||
"remote_ssh_publickey_reply_timeout_seen"
|
||||
]
|
||||
),
|
||||
"latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled": (
|
||||
harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"]
|
||||
),
|
||||
"latest_visible_harbor_110_repair_remote_ssh_auth_permission_denied": (
|
||||
harbor_110_repair_log_classifier["remote_ssh_auth_permission_denied"]
|
||||
),
|
||||
"latest_visible_harbor_110_repair_local_registry_v2_status": (
|
||||
harbor_110_repair_log_classifier["local_registry_v2_status"]
|
||||
),
|
||||
@@ -727,6 +755,8 @@ def build_readback(
|
||||
if latest_cd_visible_blocked
|
||||
else "blocked_current_cd_workflow_waiting_for_runner_or_queue"
|
||||
if latest_cd_waiting
|
||||
else "blocked_harbor_110_remote_ssh_publickey_auth_stalled"
|
||||
if harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"]
|
||||
else "blocked_harbor_110_remote_control_channel_unavailable"
|
||||
if harbor_110_repair_log_classifier["remote_control_channel_unavailable"]
|
||||
else "blocked_harbor_110_remote_local_registry_v2_unavailable"
|
||||
@@ -874,6 +904,31 @@ def build_readback(
|
||||
"harbor_110_repair_bounded_ssh_timeout_seen": (
|
||||
harbor_110_repair_log_classifier["bounded_ssh_timeout_seen"]
|
||||
),
|
||||
"harbor_110_repair_remote_ssh_tcp_connected": (
|
||||
harbor_110_repair_log_classifier["remote_ssh_tcp_connected"]
|
||||
),
|
||||
"harbor_110_repair_remote_ssh_banner_seen": (
|
||||
harbor_110_repair_log_classifier["remote_ssh_banner_seen"]
|
||||
),
|
||||
"harbor_110_repair_remote_ssh_userauth_service_accept_seen": (
|
||||
harbor_110_repair_log_classifier[
|
||||
"remote_ssh_userauth_service_accept_seen"
|
||||
]
|
||||
),
|
||||
"harbor_110_repair_remote_ssh_publickey_offered": (
|
||||
harbor_110_repair_log_classifier["remote_ssh_publickey_offered"]
|
||||
),
|
||||
"harbor_110_repair_remote_ssh_publickey_reply_timeout_seen": (
|
||||
harbor_110_repair_log_classifier[
|
||||
"remote_ssh_publickey_reply_timeout_seen"
|
||||
]
|
||||
),
|
||||
"harbor_110_repair_remote_ssh_publickey_auth_stalled": (
|
||||
harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"]
|
||||
),
|
||||
"harbor_110_repair_remote_ssh_auth_permission_denied": (
|
||||
harbor_110_repair_log_classifier["remote_ssh_auth_permission_denied"]
|
||||
),
|
||||
"harbor_110_repair_local_registry_v2_status": (
|
||||
harbor_110_repair_log_classifier["local_registry_v2_status"]
|
||||
),
|
||||
@@ -992,9 +1047,43 @@ def classify_harbor_110_repair_log(text: str) -> dict[str, Any]:
|
||||
)
|
||||
|
||||
bounded_ssh_timeout_seen = _HARBOR_110_REMOTE_SSH_TIMEOUT_RE.search(text) is not None
|
||||
remote_ssh_tcp_connected = _last_bool_marker(
|
||||
"harbor_110_remote_ssh_tcp_connected",
|
||||
text,
|
||||
)
|
||||
remote_ssh_banner_seen = _last_bool_marker(
|
||||
"harbor_110_remote_ssh_banner_seen",
|
||||
text,
|
||||
)
|
||||
remote_ssh_userauth_service_accept_seen = _last_bool_marker(
|
||||
"harbor_110_remote_ssh_userauth_service_accept_seen",
|
||||
text,
|
||||
)
|
||||
remote_ssh_publickey_offered = _last_bool_marker(
|
||||
"harbor_110_remote_ssh_publickey_offered",
|
||||
text,
|
||||
)
|
||||
remote_ssh_publickey_reply_timeout_seen = _last_bool_marker(
|
||||
"harbor_110_remote_ssh_publickey_reply_timeout_seen",
|
||||
text,
|
||||
)
|
||||
remote_ssh_publickey_auth_stalled = (
|
||||
"harbor_110_remote_ssh_publickey_auth_stalled=true" in text
|
||||
or "BLOCKED harbor_110_remote_ssh_publickey_auth_stalled" in text
|
||||
or (
|
||||
remote_ssh_userauth_service_accept_seen is True
|
||||
and remote_ssh_publickey_offered is True
|
||||
and remote_ssh_publickey_reply_timeout_seen is True
|
||||
)
|
||||
)
|
||||
remote_ssh_auth_permission_denied = _last_bool_marker(
|
||||
"harbor_110_remote_ssh_auth_permission_denied",
|
||||
text,
|
||||
)
|
||||
remote_control_channel_unavailable = (
|
||||
"harbor_110_remote_control_channel_unavailable" in text
|
||||
or (bounded_ssh_timeout_seen and remote_ssh_reachable is False)
|
||||
or remote_ssh_publickey_auth_stalled
|
||||
)
|
||||
local_registry_v2_unavailable = (
|
||||
_HARBOR_110_REMOTE_LOCAL_V2_BLOCKER_RE.search(text) is not None
|
||||
@@ -1024,7 +1113,9 @@ def classify_harbor_110_repair_log(text: str) -> dict[str, Any]:
|
||||
|
||||
return {
|
||||
"failure_classifier": (
|
||||
"harbor_110_remote_control_channel_unavailable"
|
||||
"harbor_110_remote_ssh_publickey_auth_stalled"
|
||||
if remote_ssh_publickey_auth_stalled
|
||||
else "harbor_110_remote_control_channel_unavailable"
|
||||
if remote_control_channel_unavailable
|
||||
else "harbor_110_remote_local_registry_v2_unavailable"
|
||||
if local_registry_v2_unavailable
|
||||
@@ -1035,6 +1126,17 @@ def classify_harbor_110_repair_log(text: str) -> dict[str, Any]:
|
||||
"remote_control_channel_unavailable": remote_control_channel_unavailable,
|
||||
"remote_ssh_reachable": remote_ssh_reachable,
|
||||
"bounded_ssh_timeout_seen": bounded_ssh_timeout_seen,
|
||||
"remote_ssh_tcp_connected": remote_ssh_tcp_connected,
|
||||
"remote_ssh_banner_seen": remote_ssh_banner_seen,
|
||||
"remote_ssh_userauth_service_accept_seen": (
|
||||
remote_ssh_userauth_service_accept_seen
|
||||
),
|
||||
"remote_ssh_publickey_offered": remote_ssh_publickey_offered,
|
||||
"remote_ssh_publickey_reply_timeout_seen": (
|
||||
remote_ssh_publickey_reply_timeout_seen
|
||||
),
|
||||
"remote_ssh_publickey_auth_stalled": remote_ssh_publickey_auth_stalled,
|
||||
"remote_ssh_auth_permission_denied": remote_ssh_auth_permission_denied,
|
||||
"local_registry_v2_status": local_status,
|
||||
"public_registry_v2_status": public_status,
|
||||
"local_registry_v2_unavailable": local_registry_v2_unavailable,
|
||||
@@ -1127,6 +1229,16 @@ def _last_named_match_group(pattern: re.Pattern[str], text: str, group: str) ->
|
||||
return matches[-1].group(group) if matches else ""
|
||||
|
||||
|
||||
def _last_bool_marker(name: str, text: str) -> bool | None:
    """Return the value of the last ``<name>=true|false`` marker in *text*.

    Args:
        name: Marker name; it is regex-escaped before being matched.
        text: Log text to scan.

    Returns:
        ``True`` or ``False`` according to the final occurrence of the
        marker, or ``None`` when the marker does not appear at all.
    """
    pattern = re.compile(
        _HARBOR_110_REMOTE_SSH_BOOL_RE_TEMPLATE.format(name=re.escape(name))
    )
    # The pattern has a single group, so findall() yields the matched
    # "true"/"false" strings directly; only the last one is relevant.
    values = pattern.findall(text)
    if not values:
        return None
    return values[-1] == "true"
|
||||
|
||||
|
||||
def _read_text_file(path: Path) -> str:
    """Return the full contents of *path* decoded as UTF-8."""
    return path.read_text(encoding="utf-8")
|
||||
|
||||
@@ -1194,6 +1306,14 @@ def _human_summary(payload: dict[str, Any]) -> str:
|
||||
"LATEST_VISIBLE_HARBOR_110_REPAIR_REMOTE_SSH_REACHABLE="
|
||||
f"{readback['latest_visible_harbor_110_repair_remote_ssh_reachable']}"
|
||||
),
|
||||
(
|
||||
"LATEST_VISIBLE_HARBOR_110_REPAIR_REMOTE_SSH_PUBLICKEY_AUTH_STALLED="
|
||||
f"{int(readback['latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled'])}"
|
||||
),
|
||||
(
|
||||
"LATEST_VISIBLE_HARBOR_110_REPAIR_REMOTE_SSH_PUBLICKEY_REPLY_TIMEOUT_SEEN="
|
||||
f"{readback['latest_visible_harbor_110_repair_remote_ssh_publickey_reply_timeout_seen']}"
|
||||
),
|
||||
(
|
||||
"HARBOR_110_REPAIR_WAITING_AFTER_CD_HARBOR_BLOCKER="
|
||||
f"{int(readback['harbor_110_repair_waiting_after_cd_harbor_blocker'])}"
|
||||
|
||||
@@ -282,6 +282,28 @@ harbor_110_remote_ssh_reachable=false
|
||||
"""
|
||||
|
||||
|
||||
def _harbor_110_repair_publickey_auth_stalled_log() -> str:
    """Return a sample repair log in which SSH public-key auth stalled.

    The text carries the operation-boundary markers, the
    ``harbor_110_remote_ssh_*`` diagnostic markers (TCP connected, banner
    seen, userauth accepted, key offered, reply timeout seen, auth stalled)
    and both BLOCKED lines, ending with ``harbor_110_remote_ssh_reachable=false``.
    """
    return """
operation_boundary_secret_value_read=false
operation_boundary_docker_daemon_restart_performed=false
operation_boundary_host_reboot_performed=false
operation_boundary_node_drain_performed=false
operation_boundary_remote_ssh_bounded=true
harbor_110_remote_ssh_diag_rc=255
harbor_110_remote_ssh_tcp_connected=true
harbor_110_remote_ssh_banner_seen=true
harbor_110_remote_ssh_userauth_service_accept_seen=true
harbor_110_remote_ssh_publickey_offered=true
harbor_110_remote_ssh_publickey_reply_timeout_seen=true
harbor_110_remote_ssh_publickey_auth_stalled=true
BLOCKED harbor_110_remote_ssh_publickey_auth_stalled target=wooo@192.168.0.110
harbor_110_remote_ssh_auth_permission_denied=false
harbor_110_remote_ssh_diag_raw_log_printed=false
BLOCKED harbor_110_remote_control_channel_unavailable target=wooo@192.168.0.110
harbor_110_remote_ssh_reachable=false
"""
|
||||
|
||||
|
||||
def _harbor_110_repair_success_jobs() -> dict:
|
||||
return {
|
||||
"total_count": 2,
|
||||
@@ -697,6 +719,64 @@ def test_build_readback_classifies_harbor_repair_remote_control_unavailable() ->
|
||||
assert payload["operation_boundaries"]["host_write_performed"] is False
|
||||
|
||||
|
||||
def test_build_readback_classifies_harbor_repair_publickey_auth_stalled() -> None:
    """A stalled public-key handshake gets its own status and classifier.

    Feeds ``build_readback`` a repair log carrying the public-key auth stall
    markers and checks that the payload status, the failure classifier, the
    individual SSH diagnostic flags and the rollup all report the stall,
    while the operation boundaries stay read-only.
    """
    module = _load_module()
    payload = module.build_readback(
        # Flip only the first "Waiting" tooltip to "Failure".
        actions_html=_actions_html_cd_running_harbor_repair_waiting().replace(
            'data-tooltip-content="Waiting"',
            'data-tooltip-content="Failure"',
            1,
        ),
        actions_list_http_status=401,
        actions_list_payload={"message": "token is required"},
        cd_jobs_http_status=200,
        cd_jobs_payload={"jobs": [], "total_count": 0},
        harbor_110_repair_jobs_http_status=200,
        harbor_110_repair_jobs_payload=_harbor_110_repair_stale_code_review_jobs(),
        latest_cd_build_log_http_status=200,
        latest_cd_build_log_text=_harbor_blocked_log(),
        latest_harbor_110_repair_log_http_status=200,
        latest_harbor_110_repair_log_text=(
            _harbor_110_repair_publickey_auth_stalled_log()
        ),
    )

    assert payload["status"] == "blocked_harbor_110_remote_ssh_publickey_auth_stalled"
    assert payload["readback"]["latest_visible_harbor_110_repair_failure_classifier"] == (
        "harbor_110_remote_ssh_publickey_auth_stalled"
    )
    assert (
        payload["readback"][
            "latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled"
        ]
        is True
    )
    assert (
        payload["readback"][
            "latest_visible_harbor_110_repair_remote_ssh_publickey_reply_timeout_seen"
        ]
        is True
    )
    assert (
        payload["readback"][
            "latest_visible_harbor_110_repair_remote_ssh_userauth_service_accept_seen"
        ]
        is True
    )
    assert (
        payload["readback"][
            "latest_visible_harbor_110_repair_remote_ssh_auth_permission_denied"
        ]
        is False
    )
    assert (
        payload["rollups"]["harbor_110_repair_remote_ssh_publickey_auth_stalled"]
        is True
    )
    assert payload["operation_boundaries"]["secret_or_runner_token_read"] is False
    assert payload["operation_boundaries"]["host_write_performed"] is False
|
||||
|
||||
|
||||
def test_build_readback_surfaces_harbor_110_repair_no_matching_runner() -> None:
|
||||
module = _load_module()
|
||||
payload = module.build_readback(
|
||||
|
||||
Reference in New Issue
Block a user