From 398ff6f86aa211b913474e034b0b5d7fced76e6a Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 1 Jul 2026 12:38:01 +0800 Subject: [PATCH] fix(runner): block closure on harbor ssh auth stall --- .../test_verify_awoooi_non110_cd_closure.py | 92 +++++++++++++++++-- ops/runner/verify-awoooi-non110-cd-closure.py | 78 +++++++++++++++- 2 files changed, 155 insertions(+), 15 deletions(-) diff --git a/ops/runner/test_verify_awoooi_non110_cd_closure.py b/ops/runner/test_verify_awoooi_non110_cd_closure.py index c8826100..10965156 100644 --- a/ops/runner/test_verify_awoooi_non110_cd_closure.py +++ b/ops/runner/test_verify_awoooi_non110_cd_closure.py @@ -24,18 +24,27 @@ def _load_module(): return module -def _queue(*, no_matching: bool, harbor_110_no_matching: bool = False) -> dict: +def _queue( + *, + no_matching: bool, + harbor_110_no_matching: bool = False, + harbor_110_remote_control_unavailable: bool = False, + harbor_110_publickey_auth_stalled: bool = False, +) -> dict: + status = ( + "blocked_harbor_110_remote_ssh_publickey_auth_stalled" + if harbor_110_publickey_auth_stalled + else "blocked_harbor_110_remote_control_channel_unavailable" + if harbor_110_remote_control_unavailable + else "blocked_harbor_110_repair_no_matching_runner" + if harbor_110_no_matching + else "blocked_no_matching_online_runner" + if no_matching + else "no_matching_runner_not_visible" + ) return { "schema_version": "awoooi_public_gitea_actions_queue_readback_v1", - "status": ( - "blocked_harbor_110_repair_no_matching_runner" - if harbor_110_no_matching - else ( - "blocked_no_matching_online_runner" - if no_matching - else "no_matching_runner_not_visible" - ) - ), + "status": status, "readback": { "no_matching_online_runner_visible": no_matching, "latest_visible_no_matching_runner_label": ( @@ -48,6 +57,28 @@ def _queue(*, no_matching: bool, harbor_110_no_matching: bool = False) -> dict: "latest_visible_harbor_110_repair_no_matching_runner_label": ( "awoooi-host" if harbor_110_no_matching else "" ), + "latest_visible_harbor_110_repair_remote_control_channel_unavailable": ( + harbor_110_remote_control_unavailable + or harbor_110_publickey_auth_stalled + ), + "latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled": ( + harbor_110_publickey_auth_stalled + ), + "latest_visible_harbor_110_repair_remote_ssh_publickey_reply_timeout_seen": ( + harbor_110_publickey_auth_stalled + ), + }, + "rollups": { + "harbor_110_repair_remote_control_channel_unavailable": ( + harbor_110_remote_control_unavailable + or harbor_110_publickey_auth_stalled + ), + "harbor_110_repair_remote_ssh_publickey_auth_stalled": ( + harbor_110_publickey_auth_stalled + ), + "harbor_110_repair_remote_ssh_publickey_reply_timeout_seen": ( + harbor_110_publickey_auth_stalled + ), }, "operation_boundaries": { "workflow_dispatch_performed": False, @@ -216,6 +247,47 @@ def test_closure_verifier_prioritizes_harbor_110_runner_label_blocker() -> None: ] +def test_closure_verifier_blocks_harbor_110_publickey_auth_stalled() -> None: + module = _load_module() + payload = module.build_closure_verifier( + readiness_text=_readiness(ready=True), + queue=_queue(no_matching=False, harbor_110_publickey_auth_stalled=True), + production_workbench=_workbench(image_current=True, governance_ready=True), + ) + + assert payload["status"] == "blocked_harbor_110_remote_ssh_publickey_auth_stalled" + assert "harbor_110_remote_ssh_publickey_auth_stalled" in payload["blockers"] + assert payload["readback"]["harbor_110_remote_control_channel_unavailable"] is True + assert payload["readback"]["harbor_110_remote_ssh_publickey_auth_stalled"] is True + assert ( + payload["readback"]["harbor_110_remote_ssh_publickey_reply_timeout_seen"] + is True + ) + assert payload["progress"]["next_blocked_step_id"] == "public_queue_runner_match" + assert payload["ordered_steps"][2]["status"] == "blocked" + assert "repair_110_ssh_publickey_auth_local_check" in payload["next_actions"][0] + assert "harbor_v2_readback" in payload["next_actions"][0] + assert "repair_110_ssh_publickey_auth_local_check" in payload["progress"][ + "next_blocked_step_action" + ] + + +def test_closure_verifier_blocks_harbor_110_remote_control_unavailable() -> None: + module = _load_module() + payload = module.build_closure_verifier( + readiness_text=_readiness(ready=True), + queue=_queue(no_matching=False, harbor_110_remote_control_unavailable=True), + production_workbench=_workbench(image_current=True, governance_ready=True), + ) + + assert payload["status"] == "blocked_harbor_110_remote_control_channel_unavailable" + assert "harbor_110_remote_control_channel_unavailable" in payload["blockers"] + assert payload["readback"]["harbor_110_remote_control_channel_unavailable"] is True + assert payload["readback"]["harbor_110_remote_ssh_publickey_auth_stalled"] is False + assert payload["progress"]["next_blocked_step_id"] == "public_queue_runner_match" + assert "repair_110_ssh_publickey_auth_local_check" in payload["next_actions"][0] + + def test_closure_verifier_uses_deploy_snapshot_when_readiness_file_missing() -> None: module = _load_module() payload = module.build_closure_verifier( diff --git a/ops/runner/verify-awoooi-non110-cd-closure.py b/ops/runner/verify-awoooi-non110-cd-closure.py index 2185fcf7..7392ee73 100755 --- a/ops/runner/verify-awoooi-non110-cd-closure.py +++ b/ops/runner/verify-awoooi-non110-cd-closure.py @@ -175,7 +175,7 @@ def _production_image_tag_current(production: dict[str, Any]) -> bool: def _build_ordered_steps( *, readiness: dict[str, Any], - no_matching_runner_visible: bool, + public_queue_closure_ready: bool, queue_runner_match_next_action: str, production_workbench_present: bool, production_image_tag_matches_main: bool, @@ -207,8 +207,11 @@ def _build_ordered_steps( }, { "id": "public_queue_runner_match", - "title": "public Gitea queue no longer shows no-matching-runner", - "evidence_ready": not no_matching_runner_visible, + "title": ( + "public Gitea queue no longer shows controlled CD or Harbor " + "repair blockers" + ), + "evidence_ready": public_queue_closure_ready, "next_action": queue_runner_match_next_action, }, { @@ -302,6 +305,9 @@ def build_closure_verifier( else {} ) queue_status = str(queue.get("status") or "") + queue_rollups = ( + queue.get("rollups") if isinstance(queue.get("rollups"), dict) else {} + ) production = _production_summary(production_workbench) no_matching_runner_visible = ( @@ -315,12 +321,55 @@ def build_closure_verifier( queue_status == "blocked_harbor_110_repair_no_matching_runner" or bool(harbor_110_repair_no_matching_runner_label) ) + harbor_110_remote_control_channel_unavailable = ( + queue_status == "blocked_harbor_110_remote_control_channel_unavailable" + or queue_readback.get( + "latest_visible_harbor_110_repair_remote_control_channel_unavailable" + ) + is True + or queue_rollups.get("harbor_110_repair_remote_control_channel_unavailable") + is True + ) + harbor_110_remote_ssh_publickey_auth_stalled = ( + queue_status == "blocked_harbor_110_remote_ssh_publickey_auth_stalled" + or queue_readback.get( + "latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled" + ) + is True + or queue_rollups.get("harbor_110_repair_remote_ssh_publickey_auth_stalled") + is True + ) + harbor_110_remote_ssh_publickey_reply_timeout_seen = ( + queue_readback.get( + "latest_visible_harbor_110_repair_remote_ssh_publickey_reply_timeout_seen" + ) + is True + or queue_rollups.get( + "harbor_110_repair_remote_ssh_publickey_reply_timeout_seen" + ) + is True + ) queue_runner_match_next_action = ( + "run_sudo_usr_local_bin_repair_110_ssh_publickey_auth_local_check_on_110_" + "local_console_then_apply_if_metadata_only_then_rerun_public_queue_and_" + "harbor_v2_readback" + if harbor_110_remote_ssh_publickey_auth_stalled + else "run_sudo_usr_local_bin_repair_110_ssh_publickey_auth_local_check_on_110_" + "local_console_then_rerun_public_queue_and_harbor_v2_readback" + if harbor_110_remote_control_channel_unavailable + else ( "run_ops_runner_check_awoooi_110_controlled_cd_lane_readiness_on_110_" "then_restore_awoooi_host_runner_control_path_without_legacy_or_generic_labels_" "then_rerun_harbor_110_repair_queue_readback" if harbor_110_repair_no_matching_runner else "rerun_public_queue_readback_until_no_matching_runner_is_absent" + ) + ) + public_queue_closure_ready = not ( + no_matching_runner_visible + or harbor_110_repair_no_matching_runner + or harbor_110_remote_control_channel_unavailable + or harbor_110_remote_ssh_publickey_auth_stalled ) production_workbench_present = bool(production) production_image_tag_matches_main = _production_image_tag_current(production) @@ -337,6 +386,10 @@ def build_closure_verifier( blockers.append("raw_runner_registration_read_not_allowed") if harbor_110_repair_no_matching_runner: blockers.append("harbor_110_repair_no_matching_runner") + elif harbor_110_remote_ssh_publickey_auth_stalled: + blockers.append("harbor_110_remote_ssh_publickey_auth_stalled") + elif harbor_110_remote_control_channel_unavailable: + blockers.append("harbor_110_remote_control_channel_unavailable") elif no_matching_runner_visible: blockers.append("public_queue_still_has_no_matching_online_runner") if not production_workbench_present: @@ -354,6 +407,10 @@ def build_closure_verifier( status = "blocked_secret_boundary_violation" elif "harbor_110_repair_no_matching_runner" in blockers: status = "blocked_harbor_110_repair_no_matching_runner" + elif "harbor_110_remote_ssh_publickey_auth_stalled" in blockers: + status = "blocked_harbor_110_remote_ssh_publickey_auth_stalled" + elif "harbor_110_remote_control_channel_unavailable" in blockers: + status = "blocked_harbor_110_remote_control_channel_unavailable" elif "public_queue_still_has_no_matching_online_runner" in blockers: status = "blocked_no_matching_online_runner" elif "production_workbench_readback_missing" in blockers: @@ -367,7 +424,7 @@ def build_closure_verifier( ordered_steps = _build_ordered_steps( readiness=readiness, - no_matching_runner_visible=no_matching_runner_visible, + public_queue_closure_ready=public_queue_closure_ready, queue_runner_match_next_action=queue_runner_match_next_action, production_workbench_present=production_workbench_present, production_image_tag_matches_main=production_image_tag_matches_main, @@ -410,6 +467,15 @@ def build_closure_verifier( "harbor_110_repair_no_matching_runner_label": ( harbor_110_repair_no_matching_runner_label ), + "harbor_110_remote_control_channel_unavailable": ( + harbor_110_remote_control_channel_unavailable + ), + "harbor_110_remote_ssh_publickey_auth_stalled": ( + harbor_110_remote_ssh_publickey_auth_stalled + ), + "harbor_110_remote_ssh_publickey_reply_timeout_seen": ( + harbor_110_remote_ssh_publickey_reply_timeout_seen + ), "production_workbench_present": production_workbench_present, "production_workbench_source_count": _int(production.get("source_count")), "production_deploy_image_tag_matches_main": ( @@ -445,7 +511,9 @@ def build_closure_verifier( readiness["safe_next_step"] if not readiness["ready"] else "", - queue_runner_match_next_action if no_matching_runner_visible else "", + queue_runner_match_next_action + if not public_queue_closure_ready + else "", "read_production_delivery_workbench_after_deploy" if not production_workbench_present else "",