From 1d97b3ffeaa425841cc9d34ddd53fba9df99f082 Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 30 Jun 2026 20:15:52 +0800 Subject: [PATCH] fix(recovery): orchestrate 110 harbor local repair --- .gitea/workflows/cd.yaml | 1 + .../test_cd_controlled_runtime_profile.py | 15 ++ scripts/reboot-recovery/deploy-to-110.sh | 6 + ...cover-110-control-path-and-harbor-local.sh | 187 ++++++++++++++++++ ...cover_110_control_path_and_harbor_local.py | 80 ++++++++ 5 files changed, 289 insertions(+) create mode 100644 scripts/reboot-recovery/recover-110-control-path-and-harbor-local.sh create mode 100644 scripts/reboot-recovery/tests/test_recover_110_control_path_and_harbor_local.py diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml index f5a3320a..e4279780 100644 --- a/.gitea/workflows/cd.yaml +++ b/.gitea/workflows/cd.yaml @@ -986,6 +986,7 @@ jobs: echo "BLOCKER harbor_registry_public_route_unavailable registry_v2_status=${registry_status}" echo "NEXT_ACTION run_on_110_local_console_or_restored_ssh: sudo /usr/local/bin/harbor-watchdog.sh --check" echo "NEXT_ACTION if_check_confirms_unhealthy_on_110: sudo /usr/local/bin/harbor-watchdog.sh --repair-once" + echo "NEXT_ACTION combined_110_control_path_then_harbor: sudo /usr/local/bin/recover-110-control-path-and-harbor-local.sh --apply-all" exit 1 fi diff --git a/ops/runner/test_cd_controlled_runtime_profile.py b/ops/runner/test_cd_controlled_runtime_profile.py index fdcdef33..66932f11 100644 --- a/ops/runner/test_cd_controlled_runtime_profile.py +++ b/ops/runner/test_cd_controlled_runtime_profile.py @@ -82,6 +82,10 @@ def test_harbor_login_has_public_route_retry_and_safe_secret_transport() -> None assert "BLOCKER harbor_registry_public_route_unavailable" in block assert "sudo /usr/local/bin/harbor-watchdog.sh --check" in block assert "sudo /usr/local/bin/harbor-watchdog.sh --repair-once" in block + assert ( + "sudo /usr/local/bin/recover-110-control-path-and-harbor-local.sh --apply-all" + in block + ) assert "sleep \"${LOGIN_SLEEP_SECONDS}\"" in block assert "${HARBOR_PASSWORD}" in block assert "--password " not in block @@ -117,6 +121,17 @@ def test_harbor_watchdog_exposes_controlled_check_and_one_shot_repair() -> None: assert "while true" in text +def test_deploy_to_110_syncs_local_control_path_recovery_helpers() -> None: + text = (ROOT / "scripts/reboot-recovery/deploy-to-110.sh").read_text( + encoding="utf-8" + ) + + assert "repair-110-ssh-publickey-auth-local.sh" in text + assert "recover-110-control-path-and-harbor-local.sh" in text + assert "/usr/local/bin/repair-110-ssh-publickey-auth-local.sh" in text + assert "/usr/local/bin/recover-110-control-path-and-harbor-local.sh" in text + + def test_onboarding_warning_step_template_stays_on_controlled_runtime_profile() -> None: text = _workflow_text() assert "onboarding warning-step workflow is" in text diff --git a/scripts/reboot-recovery/deploy-to-110.sh b/scripts/reboot-recovery/deploy-to-110.sh index 0d23949f..b887a352 100644 --- a/scripts/reboot-recovery/deploy-to-110.sh +++ b/scripts/reboot-recovery/deploy-to-110.sh @@ -16,6 +16,8 @@ echo "=== 部署 awoooi-startup-110 + harbor-watchdog 到 192.168.0.110 ===" echo "[1/5] 上傳啟動腳本..." scp "$SCRIPT_DIR/awoooi-startup-110.sh" "$HOST:/tmp/awoooi-startup-110.sh" scp "$SCRIPT_DIR/awoooi-startup-110.service" "$HOST:/tmp/awoooi-startup-110.service" +scp "$SCRIPT_DIR/repair-110-ssh-publickey-auth-local.sh" "$HOST:/tmp/repair-110-ssh-publickey-auth-local.sh" +scp "$SCRIPT_DIR/recover-110-control-path-and-harbor-local.sh" "$HOST:/tmp/recover-110-control-path-and-harbor-local.sh" # 2. 上傳 watchdog echo "[2/5] 上傳 harbor-watchdog..." @@ -26,6 +28,10 @@ scp "$SCRIPT_DIR/harbor-watchdog.service" "$HOST:/tmp/harbor-watchdog.service" echo "[3/5] 安裝 startup service..." ssh "$HOST" "sudo cp /tmp/awoooi-startup-110.sh /usr/local/bin/awoooi-startup-110.sh && \ sudo chmod +x /usr/local/bin/awoooi-startup-110.sh && \ + sudo cp /tmp/repair-110-ssh-publickey-auth-local.sh /usr/local/bin/repair-110-ssh-publickey-auth-local.sh && \ + sudo chmod +x /usr/local/bin/repair-110-ssh-publickey-auth-local.sh && \ + sudo cp /tmp/recover-110-control-path-and-harbor-local.sh /usr/local/bin/recover-110-control-path-and-harbor-local.sh && \ + sudo chmod +x /usr/local/bin/recover-110-control-path-and-harbor-local.sh && \ sudo cp /tmp/awoooi-startup-110.service /etc/systemd/system/awoooi-startup-110.service && \ sudo systemctl daemon-reload && \ sudo systemctl enable awoooi-startup-110.service && \ diff --git a/scripts/reboot-recovery/recover-110-control-path-and-harbor-local.sh b/scripts/reboot-recovery/recover-110-control-path-and-harbor-local.sh new file mode 100644 index 00000000..a4a6c97b --- /dev/null +++ b/scripts/reboot-recovery/recover-110-control-path-and-harbor-local.sh @@ -0,0 +1,187 @@ +#!/usr/bin/env bash +# Local-only orchestrator for the current P0 110 control path + Harbor blocker. +# +# Run on host 110 from a trusted local console or an already working root shell. +# Default mode is read-only. Apply modes do not read key material, do not create +# keys, do not restart the Docker daemon, and do not reboot the host. + +set -euo pipefail + +MODE="check" +TARGET_USER="${TARGET_USER:-wooo}" +EXPECTED_HOST_IP="${AWOOOI_110_EXPECTED_HOST_IP:-192.168.0.110}" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SSH_REPAIR_SCRIPT="${AWOOOI_110_SSH_REPAIR_SCRIPT:-}" +HARBOR_WATCHDOG_SCRIPT="${AWOOOI_HARBOR_WATCHDOG_SCRIPT:-}" +RELOAD_SSH="${RELOAD_SSH:-0}" + +usage() { + cat <<'USAGE' +Usage: recover-110-control-path-and-harbor-local.sh [--check|--apply-ssh-metadata|--repair-harbor-once|--apply-all] + +Modes: + --check Read-only checks for SSH metadata and Harbor readiness. + --apply-ssh-metadata Fix TARGET_USER home/.ssh/authorized_keys metadata only. + --repair-harbor-once Run one bounded Harbor watchdog repair cycle only. + --apply-all Apply SSH metadata repair, then one Harbor repair cycle. + +Environment: + TARGET_USER=wooo + RELOAD_SSH=0 + ALLOW_NON_110=0 + +Safety: + This script refuses to apply outside 192.168.0.110 unless ALLOW_NON_110=1. +USAGE +} + +while [ "$#" -gt 0 ]; do + case "$1" in + --check) + MODE="check" + ;; + --apply-ssh-metadata) + MODE="apply_ssh_metadata" + ;; + --repair-harbor-once) + MODE="repair_harbor_once" + ;; + --apply-all) + MODE="apply_all" + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown argument: $1" >&2 + usage >&2 + exit 64 + ;; + esac + shift +done + +log() { + printf '[%s] [recover-110] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" +} + +host_has_expected_ip() { + if command -v hostname >/dev/null 2>&1; then + hostname -I 2>/dev/null | tr ' ' '\n' | grep -qx "$EXPECTED_HOST_IP" && return 0 + fi + if command -v ip >/dev/null 2>&1; then + ip -o -4 addr show 2>/dev/null | grep -q " ${EXPECTED_HOST_IP}/" && return 0 + fi + return 1 +} + +require_expected_host_for_apply() { + if [ "${ALLOW_NON_110:-0}" = "1" ]; then + log "ALLOW_NON_110=1 set; expected host guard bypassed" + return 0 + fi + if host_has_expected_ip; then + return 0 + fi + echo "BLOCKED not running on ${EXPECTED_HOST_IP}; use 110 local console/root shell" >&2 + exit 65 +} + +resolve_ssh_repair_script() { + if [ -n "$SSH_REPAIR_SCRIPT" ] && [ -x "$SSH_REPAIR_SCRIPT" ]; then + printf '%s\n' "$SSH_REPAIR_SCRIPT" + return 0 + fi + if [ -x "$SCRIPT_DIR/repair-110-ssh-publickey-auth-local.sh" ]; then + printf '%s\n' "$SCRIPT_DIR/repair-110-ssh-publickey-auth-local.sh" + return 0 + fi + if [ -x "/usr/local/bin/repair-110-ssh-publickey-auth-local.sh" ]; then + printf '%s\n' "/usr/local/bin/repair-110-ssh-publickey-auth-local.sh" + return 0 + fi + return 1 +} + +resolve_harbor_watchdog_script() { + if [ -n "$HARBOR_WATCHDOG_SCRIPT" ] && [ -x "$HARBOR_WATCHDOG_SCRIPT" ]; then + printf '%s\n' "$HARBOR_WATCHDOG_SCRIPT" + return 0 + fi + if [ -x "/usr/local/bin/harbor-watchdog.sh" ]; then + printf '%s\n' "/usr/local/bin/harbor-watchdog.sh" + return 0 + fi + if [ -x "$SCRIPT_DIR/harbor-watchdog.sh" ]; then + printf '%s\n' "$SCRIPT_DIR/harbor-watchdog.sh" + return 0 + fi + return 1 +} + +run_ssh_check() { + local script + if ! script="$(resolve_ssh_repair_script)"; then + echo "SSH_REPAIR_SCRIPT_STATUS=missing" + return 1 + fi + TARGET_USER="$TARGET_USER" RELOAD_SSH=0 "$script" --check +} + +run_ssh_apply() { + local script + require_expected_host_for_apply + if ! script="$(resolve_ssh_repair_script)"; then + echo "SSH_REPAIR_SCRIPT_STATUS=missing" + return 1 + fi + TARGET_USER="$TARGET_USER" RELOAD_SSH="$RELOAD_SSH" "$script" --apply +} + +run_harbor_check() { + local script + if ! script="$(resolve_harbor_watchdog_script)"; then + echo "HARBOR_WATCHDOG_SCRIPT_STATUS=missing" + return 1 + fi + "$script" --check +} + +run_harbor_repair_once() { + local script + require_expected_host_for_apply + if ! script="$(resolve_harbor_watchdog_script)"; then + echo "HARBOR_WATCHDOG_SCRIPT_STATUS=missing" + return 1 + fi + "$script" --repair-once +} + +echo "AWOOOI_110_CONTROL_PATH_AND_HARBOR_LOCAL_RECOVERY mode=${MODE} target_user=${TARGET_USER}" +echo "expected_host_ip=${EXPECTED_HOST_IP}" +echo "operation_boundary_secret_value_read=false" +echo "operation_boundary_host_reboot_performed=false" +echo "operation_boundary_docker_daemon_restart_performed=false" +echo "operation_boundary_node_drain_performed=false" + +case "$MODE" in + check) + run_ssh_check || true + run_harbor_check || true + ;; + apply_ssh_metadata) + run_ssh_apply + ;; + repair_harbor_once) + run_harbor_repair_once + ;; + apply_all) + run_ssh_apply + run_harbor_repair_once + ;; + *) + echo "Unknown internal mode: $MODE" >&2 + exit 64 + ;; +esac diff --git a/scripts/reboot-recovery/tests/test_recover_110_control_path_and_harbor_local.py b/scripts/reboot-recovery/tests/test_recover_110_control_path_and_harbor_local.py new file mode 100644 index 00000000..ebb9fc86 --- /dev/null +++ b/scripts/reboot-recovery/tests/test_recover_110_control_path_and_harbor_local.py @@ -0,0 +1,80 @@ +from __future__ import annotations + +import os +import stat +import subprocess +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[3] +RECOVERY = ROOT / "scripts/reboot-recovery/recover-110-control-path-and-harbor-local.sh" + + +def test_recover_110_orchestrator_contracts() -> None: + text = RECOVERY.read_text(encoding="utf-8") + + assert "--check" in text + assert "--apply-ssh-metadata" in text + assert "--repair-harbor-once" in text + assert "--apply-all" in text + assert "operation_boundary_secret_value_read=false" in text + assert "operation_boundary_host_reboot_performed=false" in text + assert "operation_boundary_docker_daemon_restart_performed=false" in text + assert "repair-110-ssh-publickey-auth-local.sh" in text + assert "harbor-watchdog.sh" in text + assert "cat \"$home_dir/.ssh/authorized_keys\"" not in text + + forbidden = [ + "systemctl restart docker", + "service docker restart", + "\nreboot", + "\nsudo reboot", + "\nshutdown", + "\nsudo shutdown", + "docker system prune", + "docker volume rm", + ] + for pattern in forbidden: + assert pattern not in text + + +def test_recover_110_check_uses_fake_helpers_without_writes(tmp_path: Path) -> None: + ssh_helper = tmp_path / "ssh-helper.sh" + harbor_helper = tmp_path / "harbor-helper.sh" + ssh_helper.write_text( + "#!/usr/bin/env bash\n" + "echo SSH_HELPER_MODE=$1\n" + "echo SSH_METADATA_WRITE=false\n", + encoding="utf-8", + ) + harbor_helper.write_text( + "#!/usr/bin/env bash\n" + "echo HARBOR_HELPER_MODE=$1\n" + "echo HARBOR_RUNTIME_WRITE=false\n", + encoding="utf-8", + ) + for helper in (ssh_helper, harbor_helper): + helper.chmod(helper.stat().st_mode | stat.S_IXUSR) + + env = { + **os.environ, + "ALLOW_NON_110": "1", + "AWOOOI_110_SSH_REPAIR_SCRIPT": str(ssh_helper), + "AWOOOI_HARBOR_WATCHDOG_SCRIPT": str(harbor_helper), + } + result = subprocess.run( + ["bash", str(RECOVERY), "--check"], + check=False, + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + + assert result.returncode == 0, result.stdout + result.stderr + assert "AWOOOI_110_CONTROL_PATH_AND_HARBOR_LOCAL_RECOVERY mode=check" in result.stdout + assert "SSH_HELPER_MODE=--check" in result.stdout + assert "HARBOR_HELPER_MODE=--check" in result.stdout + assert "SSH_METADATA_WRITE=false" in result.stdout + assert "HARBOR_RUNTIME_WRITE=false" in result.stdout + assert "operation_boundary_secret_value_read=false" in result.stdout