fix(runner): block closure on harbor ssh auth stall
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 33s
CD Pipeline / build-and-deploy (push) Failing after 28s
CD Pipeline / post-deploy-checks (push) Has been skipped

This commit is contained in:
Your Name
2026-07-01 12:38:01 +08:00
parent a3dd6a3199
commit 398ff6f86a
2 changed files with 155 additions and 15 deletions

View File

@@ -24,18 +24,27 @@ def _load_module():
return module
def _queue(*, no_matching: bool, harbor_110_no_matching: bool = False) -> dict:
def _queue(
*,
no_matching: bool,
harbor_110_no_matching: bool = False,
harbor_110_remote_control_unavailable: bool = False,
harbor_110_publickey_auth_stalled: bool = False,
) -> dict:
status = (
"blocked_harbor_110_remote_ssh_publickey_auth_stalled"
if harbor_110_publickey_auth_stalled
else "blocked_harbor_110_remote_control_channel_unavailable"
if harbor_110_remote_control_unavailable
else "blocked_harbor_110_repair_no_matching_runner"
if harbor_110_no_matching
else "blocked_no_matching_online_runner"
if no_matching
else "no_matching_runner_not_visible"
)
return {
"schema_version": "awoooi_public_gitea_actions_queue_readback_v1",
"status": (
"blocked_harbor_110_repair_no_matching_runner"
if harbor_110_no_matching
else (
"blocked_no_matching_online_runner"
if no_matching
else "no_matching_runner_not_visible"
)
),
"status": status,
"readback": {
"no_matching_online_runner_visible": no_matching,
"latest_visible_no_matching_runner_label": (
@@ -48,6 +57,28 @@ def _queue(*, no_matching: bool, harbor_110_no_matching: bool = False) -> dict:
"latest_visible_harbor_110_repair_no_matching_runner_label": (
"awoooi-host" if harbor_110_no_matching else ""
),
"latest_visible_harbor_110_repair_remote_control_channel_unavailable": (
harbor_110_remote_control_unavailable
or harbor_110_publickey_auth_stalled
),
"latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled": (
harbor_110_publickey_auth_stalled
),
"latest_visible_harbor_110_repair_remote_ssh_publickey_reply_timeout_seen": (
harbor_110_publickey_auth_stalled
),
},
"rollups": {
"harbor_110_repair_remote_control_channel_unavailable": (
harbor_110_remote_control_unavailable
or harbor_110_publickey_auth_stalled
),
"harbor_110_repair_remote_ssh_publickey_auth_stalled": (
harbor_110_publickey_auth_stalled
),
"harbor_110_repair_remote_ssh_publickey_reply_timeout_seen": (
harbor_110_publickey_auth_stalled
),
},
"operation_boundaries": {
"workflow_dispatch_performed": False,
@@ -216,6 +247,47 @@ def test_closure_verifier_prioritizes_harbor_110_runner_label_blocker() -> None:
]
def test_closure_verifier_blocks_harbor_110_publickey_auth_stalled() -> None:
    """A queue reporting the SSH publickey auth stall must block closure.

    The verifier is expected to surface the stall as its status and blocker,
    mirror the related readback flags, mark the queue step as blocked, and
    point the operator at the local publickey repair check.
    """
    verifier = _load_module()
    inputs = {
        "readiness_text": _readiness(ready=True),
        "queue": _queue(no_matching=False, harbor_110_publickey_auth_stalled=True),
        "production_workbench": _workbench(
            image_current=True, governance_ready=True
        ),
    }
    result = verifier.build_closure_verifier(**inputs)

    blocker = "harbor_110_remote_ssh_publickey_auth_stalled"
    assert result["status"] == "blocked_harbor_110_remote_ssh_publickey_auth_stalled"
    assert blocker in result["blockers"]

    # A publickey stall implies the control channel is unavailable too, and
    # the reply-timeout marker travels with it.
    readback = result["readback"]
    for flag in (
        "harbor_110_remote_control_channel_unavailable",
        "harbor_110_remote_ssh_publickey_auth_stalled",
        "harbor_110_remote_ssh_publickey_reply_timeout_seen",
    ):
        assert readback[flag] is True

    progress = result["progress"]
    assert progress["next_blocked_step_id"] == "public_queue_runner_match"
    assert result["ordered_steps"][2]["status"] == "blocked"

    first_action = result["next_actions"][0]
    for fragment in (
        "repair_110_ssh_publickey_auth_local_check",
        "harbor_v2_readback",
    ):
        assert fragment in first_action
    assert (
        "repair_110_ssh_publickey_auth_local_check"
        in progress["next_blocked_step_action"]
    )
def test_closure_verifier_blocks_harbor_110_remote_control_unavailable() -> None:
    """An unavailable remote control channel alone must block closure.

    Only the control-channel flag is raised here, so the publickey-stall
    readback flag must stay False while the verifier still blocks on the
    queue step and recommends the local publickey repair check.
    """
    verifier = _load_module()
    inputs = {
        "readiness_text": _readiness(ready=True),
        "queue": _queue(
            no_matching=False, harbor_110_remote_control_unavailable=True
        ),
        "production_workbench": _workbench(
            image_current=True, governance_ready=True
        ),
    }
    result = verifier.build_closure_verifier(**inputs)

    expected_status = "blocked_harbor_110_remote_control_channel_unavailable"
    assert result["status"] == expected_status
    assert "harbor_110_remote_control_channel_unavailable" in result["blockers"]

    readback = result["readback"]
    assert readback["harbor_110_remote_control_channel_unavailable"] is True
    assert readback["harbor_110_remote_ssh_publickey_auth_stalled"] is False

    blocked_step = result["progress"]["next_blocked_step_id"]
    assert blocked_step == "public_queue_runner_match"

    first_action = result["next_actions"][0]
    assert "repair_110_ssh_publickey_auth_local_check" in first_action
def test_closure_verifier_uses_deploy_snapshot_when_readiness_file_missing() -> None:
module = _load_module()
payload = module.build_closure_verifier(

View File

@@ -175,7 +175,7 @@ def _production_image_tag_current(production: dict[str, Any]) -> bool:
def _build_ordered_steps(
*,
readiness: dict[str, Any],
no_matching_runner_visible: bool,
public_queue_closure_ready: bool,
queue_runner_match_next_action: str,
production_workbench_present: bool,
production_image_tag_matches_main: bool,
@@ -207,8 +207,11 @@ def _build_ordered_steps(
},
{
"id": "public_queue_runner_match",
"title": "public Gitea queue no longer shows no-matching-runner",
"evidence_ready": not no_matching_runner_visible,
"title": (
"public Gitea queue no longer shows controlled CD or Harbor "
"repair blockers"
),
"evidence_ready": public_queue_closure_ready,
"next_action": queue_runner_match_next_action,
},
{
@@ -302,6 +305,9 @@ def build_closure_verifier(
else {}
)
queue_status = str(queue.get("status") or "")
queue_rollups = (
queue.get("rollups") if isinstance(queue.get("rollups"), dict) else {}
)
production = _production_summary(production_workbench)
no_matching_runner_visible = (
@@ -315,12 +321,55 @@ def build_closure_verifier(
queue_status == "blocked_harbor_110_repair_no_matching_runner"
or bool(harbor_110_repair_no_matching_runner_label)
)
harbor_110_remote_control_channel_unavailable = (
queue_status == "blocked_harbor_110_remote_control_channel_unavailable"
or queue_readback.get(
"latest_visible_harbor_110_repair_remote_control_channel_unavailable"
)
is True
or queue_rollups.get("harbor_110_repair_remote_control_channel_unavailable")
is True
)
harbor_110_remote_ssh_publickey_auth_stalled = (
queue_status == "blocked_harbor_110_remote_ssh_publickey_auth_stalled"
or queue_readback.get(
"latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled"
)
is True
or queue_rollups.get("harbor_110_repair_remote_ssh_publickey_auth_stalled")
is True
)
harbor_110_remote_ssh_publickey_reply_timeout_seen = (
queue_readback.get(
"latest_visible_harbor_110_repair_remote_ssh_publickey_reply_timeout_seen"
)
is True
or queue_rollups.get(
"harbor_110_repair_remote_ssh_publickey_reply_timeout_seen"
)
is True
)
queue_runner_match_next_action = (
"run_sudo_usr_local_bin_repair_110_ssh_publickey_auth_local_check_on_110_"
"local_console_then_apply_if_metadata_only_then_rerun_public_queue_and_"
"harbor_v2_readback"
if harbor_110_remote_ssh_publickey_auth_stalled
else "run_sudo_usr_local_bin_repair_110_ssh_publickey_auth_local_check_on_110_"
"local_console_then_rerun_public_queue_and_harbor_v2_readback"
if harbor_110_remote_control_channel_unavailable
else (
"run_ops_runner_check_awoooi_110_controlled_cd_lane_readiness_on_110_"
"then_restore_awoooi_host_runner_control_path_without_legacy_or_generic_labels_"
"then_rerun_harbor_110_repair_queue_readback"
if harbor_110_repair_no_matching_runner
else "rerun_public_queue_readback_until_no_matching_runner_is_absent"
)
)
public_queue_closure_ready = not (
no_matching_runner_visible
or harbor_110_repair_no_matching_runner
or harbor_110_remote_control_channel_unavailable
or harbor_110_remote_ssh_publickey_auth_stalled
)
production_workbench_present = bool(production)
production_image_tag_matches_main = _production_image_tag_current(production)
@@ -337,6 +386,10 @@ def build_closure_verifier(
blockers.append("raw_runner_registration_read_not_allowed")
if harbor_110_repair_no_matching_runner:
blockers.append("harbor_110_repair_no_matching_runner")
elif harbor_110_remote_ssh_publickey_auth_stalled:
blockers.append("harbor_110_remote_ssh_publickey_auth_stalled")
elif harbor_110_remote_control_channel_unavailable:
blockers.append("harbor_110_remote_control_channel_unavailable")
elif no_matching_runner_visible:
blockers.append("public_queue_still_has_no_matching_online_runner")
if not production_workbench_present:
@@ -354,6 +407,10 @@ def build_closure_verifier(
status = "blocked_secret_boundary_violation"
elif "harbor_110_repair_no_matching_runner" in blockers:
status = "blocked_harbor_110_repair_no_matching_runner"
elif "harbor_110_remote_ssh_publickey_auth_stalled" in blockers:
status = "blocked_harbor_110_remote_ssh_publickey_auth_stalled"
elif "harbor_110_remote_control_channel_unavailable" in blockers:
status = "blocked_harbor_110_remote_control_channel_unavailable"
elif "public_queue_still_has_no_matching_online_runner" in blockers:
status = "blocked_no_matching_online_runner"
elif "production_workbench_readback_missing" in blockers:
@@ -367,7 +424,7 @@ def build_closure_verifier(
ordered_steps = _build_ordered_steps(
readiness=readiness,
no_matching_runner_visible=no_matching_runner_visible,
public_queue_closure_ready=public_queue_closure_ready,
queue_runner_match_next_action=queue_runner_match_next_action,
production_workbench_present=production_workbench_present,
production_image_tag_matches_main=production_image_tag_matches_main,
@@ -410,6 +467,15 @@ def build_closure_verifier(
"harbor_110_repair_no_matching_runner_label": (
harbor_110_repair_no_matching_runner_label
),
"harbor_110_remote_control_channel_unavailable": (
harbor_110_remote_control_channel_unavailable
),
"harbor_110_remote_ssh_publickey_auth_stalled": (
harbor_110_remote_ssh_publickey_auth_stalled
),
"harbor_110_remote_ssh_publickey_reply_timeout_seen": (
harbor_110_remote_ssh_publickey_reply_timeout_seen
),
"production_workbench_present": production_workbench_present,
"production_workbench_source_count": _int(production.get("source_count")),
"production_deploy_image_tag_matches_main": (
@@ -445,7 +511,9 @@ def build_closure_verifier(
readiness["safe_next_step"]
if not readiness["ready"]
else "",
queue_runner_match_next_action if no_matching_runner_visible else "",
queue_runner_match_next_action
if not public_queue_closure_ready
else "",
"read_production_delivery_workbench_after_deploy"
if not production_workbench_present
else "",