From 5e843e81c1a480d957a13b50ed05656569f36a89 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 29 Jun 2026 15:08:47 +0800 Subject: [PATCH] fix(recovery): finalize reboot slo live readback --- docs/LOGBOOK.md | 26 ++++++++++ ...priority-work-order-readback.snapshot.json | 42 ++++++++-------- ...-auto-recovery-slo-scorecard.snapshot.json | 35 ++++++------- .../188-host-hygiene-maintenance-checklist.sh | 29 +++++++++-- .../reboot-recovery/post-start-quick-check.sh | 50 ++++++++++++++++--- .../reboot-auto-recovery-host-probe.sh | 16 ++++-- .../reboot-auto-recovery-slo-scorecard.py | 20 +++++--- .../tests/test_188_host_hygiene_checklist.py | 24 +++++++++ .../test_post_start_quick_check_contract.py | 26 ++++++++++ ...test_reboot_auto_recovery_slo_scorecard.py | 13 +++++ 10 files changed, 219 insertions(+), 62 deletions(-) create mode 100644 scripts/reboot-recovery/tests/test_188_host_hygiene_checklist.py create mode 100644 scripts/reboot-recovery/tests/test_post_start_quick_check_contract.py diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 2648fce3..f36395a4 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -49432,3 +49432,29 @@ production browser smoke: **未做**: - 沒有重啟任何主機;沒有 restart Docker / Nginx / K3s / DB / firewall;沒有讀 secret / token / `.env` / raw sessions / SQLite / auth;沒有使用 GitHub。 + +## 2026-06-29 — 15:02 P0-006A reboot auto-recovery SLO timer live on 110 / services green + +**完成內容**: +- 將 `awoooi-reboot-auto-recovery-slo.timer` / `.service` 受控部署到 110;timer `enabled=enabled`、`active=active`,service last result `success`。 +- 新增/修正 110 installer 的 dry-run / rollback / verify contract;systemd oneshot 以 `User=wooo`、`HOME=/home/wooo`、`ROOT_DIR=/home/wooo/awoooi-reboot-recovery-slo` 執行,metric 寫入 `/home/wooo/node_exporter_textfiles/reboot_auto_recovery_slo.prom`。 +- 修正 host probe 不依賴互動 SSH agent:110 self 走 local `/proc` / `systemctl`,120/121/188 讀回 boot id / uptime;node-exporter fallback 保留。 +- 修正 188 hygiene selector:188 預設 `ollama@192.168.0.188`,110 self-check 不再被 SSH config 轉成錯誤 identity。 +- 修正 post-start 判準:public routes + AWOOOI API 綠時,cold-start / runner / host-pressure 留在 capacity/evidence lane,不再把 CD runner 壓力誤判成使用者服務未恢復。 + +**live readback**: +- 最新 artifact:`/home/wooo/reboot-recovery/reboot-auto-recovery-slo-20260629-150501`。 +- Metric:`awoooi_reboot_auto_recovery_slo_ready=0`、`blocker_count=1`、`max_host_uptime_seconds=519376`。 +- Scorecard:`observed_hosts=110,120,121,188`、`missing_hosts=[]`、`unreachable_hosts=[]`。 +- Service readback:`POST_START_BLOCKED=0`、`SERVICE_GREEN=1`、`PRODUCT_DATA_GREEN=1`、`BACKUP_CORE_GREEN=1`、`HOST_188_SERVICE_GREEN=1`、`WAZUH_DASHBOARD_DEGRADED=false`。 +- 唯一 active blocker:`host_boot_observation_older_than_target_window`;目前不是 fresh all-host reboot window,因此不得偽稱已證明 10 分鐘 SLO。 + +**本地驗證結果**: +- `bash -n scripts/reboot-recovery/post-start-quick-check.sh scripts/reboot-recovery/188-host-hygiene-maintenance-checklist.sh scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh scripts/reboot-recovery/install-reboot-auto-recovery-slo-110.sh`:通過。 +- `python3.11 -m py_compile scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py`:通過。 +- Focused pytest:`12 passed`。 +- `git diff --check`:通過。 + +**仍維持**: +- 沒有重啟任何主機;沒有 restart Docker / Nginx / K3s / DB / firewall;沒有讀 secret / token / `.env` / raw sessions / SQLite / auth;沒有使用 GitHub。 +- 下一個 P0-006 proof 不是再部署 source,而是等待下一次實際 all-host reboot event,或另開經批准 reboot drill,讓 `max_observed_uptime_seconds<=600` 且上述 service readback 維持綠。 diff --git a/docs/operations/awoooi-priority-work-order-readback.snapshot.json b/docs/operations/awoooi-priority-work-order-readback.snapshot.json index 0844d236..5fad7ef2 100644 --- a/docs/operations/awoooi-priority-work-order-readback.snapshot.json +++ b/docs/operations/awoooi-priority-work-order-readback.snapshot.json @@ -1,18 +1,19 @@ { "schema_version": "awoooi_priority_work_order_readback_v1", - "generated_at": "2026-06-29T14:49:52+08:00", - "status": "p0_006a_reboot_auto_recovery_slo_live_probe_installed_blocked_service_health", + "generated_at": "2026-06-29T15:05:00+08:00", + "status": "p0_006a_reboot_auto_recovery_slo_timer_live_services_green_waiting_reboot_window", "source_refs": { "global_scorecard": "~/.codex/product-runtime-governance-completion-scorecard.snapshot.json", "workstation_dashboard": "~/.codex/codex-workstation-sync-dashboard.snapshot.json", - "post_reboot_summary": "/tmp/awoooi-post-reboot-readiness-20260629-115730/summary.txt", + "post_reboot_summary": "/home/wooo/reboot-recovery/reboot-auto-recovery-slo-20260629-150501/summary.txt", "full_stack_cold_start_check": "scripts/reboot-recovery/full-stack-cold-start-check.sh --monitor-read-only --no-color", "delivery_closure_workbench": "https://awoooi.wooo.work/api/v1/agents/delivery-closure-workbench", "public_gitea_queue_readback": "ops/runner/read-public-gitea-actions-queue.py --json", "credential_escrow_scorecard": "/tmp/awoooi-credential-escrow-intake-scorecard-20260629-1200-priority.json", "dr_escrow_evidence_checklist_generator": "scripts/reboot-recovery/dr-escrow-evidence-checklist.py", "gitea_private_inventory_p0_scorecard": "docs/operations/awoooi-gitea-private-inventory-p0-scorecard.snapshot.json", - "reboot_auto_recovery_slo_scorecard": "docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json" + "reboot_auto_recovery_slo_scorecard": "docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json", + "reboot_auto_recovery_slo_metric": "/home/wooo/node_exporter_textfiles/reboot_auto_recovery_slo.prom" }, "current_head": { "gitea_main_sha": "7ff959b6a8bbdf152da1969687f188ceda4b0561", @@ -181,8 +182,8 @@ { "workplan_id": "P0-006", "title": "主機重啟自動偵測、自動觸發與 10 分鐘恢復 SLO", - "status": "blocked_reboot_auto_recovery_slo_not_ready", - "reason": "Boot-triggered SLO timer is live on host 110 and all required host boot probes are now observed; the 10-minute recovery claim remains fail-closed because this was not a fresh reboot window and service/backup/post-start blockers remain.", + "status": "blocked_waiting_fresh_all_host_reboot_window", + "reason": "Boot-triggered SLO timer is enabled and active on host 110; live metric and scorecard observed all required hosts. Service/data/backup/188 readback is green. The 10-minute claim remains fail-closed only because the current boot observation is older than the 600-second target window, so the next proof requires an actual fresh all-host reboot event or an approved reboot drill.", "evidence": { "target_minutes": 10, "can_claim_all_services_recovered_within_target": false, @@ -190,10 +191,10 @@ "host_boot_probe_source_present": true, "slo_systemd_timer_source_present": true, "slo_exporter_source_present": true, - "post_start_blocked": 6, - "service_green": false, + "post_start_blocked": 0, + "service_green": true, "product_data_green": true, - "backup_core_green": false, + "backup_core_green": true, "wazuh_dashboard_degraded": false, "all_host_reboot_detection_missing": false, "host_boot_probe_missing_hosts": false, @@ -217,32 +218,31 @@ "121", "188" ], - "max_observed_uptime_seconds": 518414, + "max_observed_uptime_seconds": 519376, "active_blockers": [ - "backup_core_green_not_1", - "host_boot_observation_older_than_target_window", - "post_start_blocked_not_zero", - "service_green_not_1" - ] + "host_boot_observation_older_than_target_window" + ], + "host_188_service_green": true, + "safe_next_step_from_scorecard": "timer_deployed_and_services_readback_green_wait_for_next_all_host_reboot_event_or_approved_reboot_drill_to_prove_10_minute_slo" }, "professional_fix": { "owner": "reboot auto-recovery lane", - "action": "Keep the live boot-triggered SLO timer enabled, fix backup_core/post-start/service blockers, then use the next fresh reboot window to prove max_observed_uptime_seconds<=600.", + "action": "Keep the live boot-triggered SLO timer enabled. Do not reboot from this lane. Use the next actual all-host reboot event, or a separately approved reboot drill, to prove max_observed_uptime_seconds<=600 with service/data/backup readback still green.", "exit_criteria": [ "can_claim_all_services_recovered_within_target=true", "observed_hosts=110,120,121,188", "missing_hosts=[]", "unreachable_hosts=[]", - "max_observed_uptime_seconds<=600 during a fresh reboot window", + "max_observed_uptime_seconds<=600 during a fresh all-host reboot window", "POST_START_BLOCKED=0", "SERVICE_GREEN=1", "PRODUCT_DATA_GREEN=1", "BACKUP_CORE_GREEN=1", - "WAZUH_DASHBOARD_DEGRADED=0", + "HOST_188_SERVICE_GREEN=1", "live_slo_metric_present=true" ] }, - "safe_next_step": "fix_backup_core_post_start_and_service_green_blockers_then_rerun_live_slo_scorecard_after_next_reboot_window" + "safe_next_step": "wait_for_next_all_host_reboot_event_or_separately_approved_reboot_drill_to_prove_10_minute_slo" } ], "noise_integrated_risk_register": [ @@ -314,8 +314,8 @@ "database_write_or_restore_performed": false }, "next_execution_order": [ + "P0-006: keep the live reboot SLO timer active; next proof is the next fresh all-host reboot event or separately approved reboot drill, not another source deploy.", "P0-005: fill the single DR escrow evidence checklist with five non-secret refs and rerun one preflight.", - "P0-003: convert private/internal inventory to Gitea-only readback and remove retired GitHub from active P0 blocker math.", - "P0-006: deploy boot-triggered reboot auto-recovery SLO verifier, collect all-host boot probe, and prove or block the 10-minute recovery claim." + "P0-003: complete Gitea authenticated/admin inventory export and owner coverage attestation; GitHub remains stopped/retired/do_not_use." ] } diff --git a/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json b/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json index a3082a49..225aede5 100644 --- a/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json +++ b/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json @@ -1,29 +1,26 @@ { "active_blockers": [ - "backup_core_green_not_1", - "host_boot_observation_older_than_target_window", - "post_start_blocked_not_zero", - "service_green_not_1" + "host_boot_observation_older_than_target_window" ], "can_claim_all_services_recovered_within_target": false, "capacity": { "checked": true, - "free_gib": 145.514, + "free_gib": 145.646, "min_free_gib": 2.0 }, - "generated_at": "2026-06-29T14:49:52+08:00", + "generated_at": "2026-06-29T15:06:25+08:00", "host_boot_detection": { "host_rows": [ { "alias": "110", "boot_id": "a3dfae32-3762-4394-86fa-a342aea07df5", "reachable": true, - "startup_active": "inactive_unknown", + "startup_active": "inactive", "startup_enabled": "enabled", "startup_unit": "awoooi-startup-110.service", "systemd_state": "degraded", "target": "wooo@192.168.0.110", - "uptime_seconds": 518406 + "uptime_seconds": 519367 }, { "alias": "120", @@ -34,7 +31,7 @@ "startup_unit": "k3s.service", "systemd_state": "running", "target": "wooo@192.168.0.120", - "uptime_seconds": 518397 + "uptime_seconds": 519359 }, { "alias": "121", @@ -45,7 +42,7 @@ "startup_unit": "k3s.service", "systemd_state": "running", "target": "wooo@192.168.0.121", - "uptime_seconds": 518355 + "uptime_seconds": 519317 }, { "alias": "188", @@ -56,10 +53,10 @@ "startup_unit": "awoooi-startup.service", "systemd_state": "running", "target": "ollama@192.168.0.188", - "uptime_seconds": 518414 + "uptime_seconds": 519376 } ], - "max_observed_uptime_seconds": 518414, + "max_observed_uptime_seconds": 519376, "missing_hosts": [], "observed_hosts": [ "110", @@ -83,18 +80,18 @@ "unreachable_hosts": [] }, "post_reboot_readiness": { - "backup_core_green": false, + "backup_core_green": true, "host_188_service_green": true, - "next_required_gates": "none", - "overall_declaration": "SERVICE_BLOCKED", - "post_start_blocked": 6, - "post_start_result": "BLOCKED", + "next_required_gates": "credential_escrow_evidence", + "overall_declaration": "FULL_STACK_GREEN_DR_ESCROW_BLOCKED", + "post_start_blocked": 0, + "post_start_result": "FULL_STACK_GREEN_DR_ESCROW_BLOCKED", "product_data_green": true, - "service_green": false, + "service_green": true, "summary_present": true, "wazuh_dashboard_degraded": false }, - "safe_next_step": "deploy_boot_triggered_slo_timer_and_collect_all_host_boot_probe_then_rerun_scorecard_until_status_slo_ready", + "safe_next_step": "timer_deployed_and_services_readback_green_wait_for_next_all_host_reboot_event_or_approved_reboot_drill_to_prove_10_minute_slo", "schema_version": "awoooi_reboot_auto_recovery_slo_scorecard_v1", "source_controls": { "cold_start_textfile_exporter_source_present": true, diff --git a/scripts/reboot-recovery/188-host-hygiene-maintenance-checklist.sh b/scripts/reboot-recovery/188-host-hygiene-maintenance-checklist.sh index 62998e6f..edc515cb 100755 --- a/scripts/reboot-recovery/188-host-hygiene-maintenance-checklist.sh +++ b/scripts/reboot-recovery/188-host-hygiene-maintenance-checklist.sh @@ -4,8 +4,8 @@ set -uo pipefail -REMOTE_188="${REMOTE_188:-192.168.0.188}" -REMOTE_110="${REMOTE_110:-192.168.0.110}" +REMOTE_188="${REMOTE_188:-ollama@192.168.0.188}" +REMOTE_110="${REMOTE_110:-wooo@192.168.0.110}" SSH_BATCH_MODE="${SSH_BATCH_MODE:-yes}" SSH_STRICT_HOST_KEY_CHECKING="${SSH_STRICT_HOST_KEY_CHECKING:-accept-new}" SSH_CONNECT_TIMEOUT="${SSH_CONNECT_TIMEOUT:-8}" @@ -21,8 +21,8 @@ and route health. It never runs pg_resetwal, certbot renew, reset-failed, restor Nginx reload, Docker/systemd restart, or any host write. Environment: - REMOTE_188=192.168.0.188 - REMOTE_110=192.168.0.110 + REMOTE_188=ollama@192.168.0.188 + REMOTE_110=wooo@192.168.0.110 SSH_BATCH_MODE=yes SSH_STRICT_HOST_KEY_CHECKING=accept-new SSH_CONNECT_TIMEOUT=8 @@ -73,6 +73,23 @@ ssh_opts=( -o StrictHostKeyChecking="$SSH_STRICT_HOST_KEY_CHECKING" ) +local_ip_list() { + { + hostname -I 2>/dev/null | tr ' ' '\n' || true + ip -o -4 addr show 2>/dev/null | awk '{split($4,a,"/"); print a[1]}' || true + ifconfig 2>/dev/null | awk '$1 == "inet" {print $2}' || true + } | awk 'NF' +} + +is_local_target() { + local target="$1" + local host="${target##*@}" + local ips + [[ "$host" == "127.0.0.1" || "$host" == "localhost" ]] && return 0 + ips="$(local_ip_list)" + grep -Fxq "$host" <<<"$ips" +} + section() { printf "\n%s=== %s ===%s\n" "$blue" "$1" "$reset" } @@ -96,6 +113,10 @@ blocked() { ssh_cmd() { local target="$1" local command="$2" + if is_local_target "$target"; then + bash -lc "$command" + return $? + fi ssh "${ssh_opts[@]}" "$target" "$command" } diff --git a/scripts/reboot-recovery/post-start-quick-check.sh b/scripts/reboot-recovery/post-start-quick-check.sh index 66522913..aaa8dd53 100755 --- a/scripts/reboot-recovery/post-start-quick-check.sh +++ b/scripts/reboot-recovery/post-start-quick-check.sh @@ -184,12 +184,45 @@ blocked() { printf '%sBLOCKED%s %s\n' "$RED" "$NC" "$*" } +local_ip_list() { + { + hostname -I 2>/dev/null | tr ' ' '\n' || true + ip -o -4 addr show 2>/dev/null | awk '{split($4,a,"/"); print a[1]}' || true + ifconfig 2>/dev/null | awk '$1 == "inet" {print $2}' || true + } | awk 'NF' +} + +is_local_target() { + local target="$1" + local host="${target##*@}" + local ips + [[ "$host" == "127.0.0.1" || "$host" == "localhost" ]] && return 0 + ips="$(local_ip_list)" + grep -Fxq "$host" <<<"$ips" +} + ssh_read() { local user_host="$1" local command="$2" + if is_local_target "$user_host"; then + bash -lc "$command" + return $? + fi ssh -o BatchMode=yes -o ConnectTimeout="$SSH_CONNECT_TIMEOUT" "$user_host" "$command" } +service_route_recovered() { + [[ "$RUN_ROUTES" -eq 1 && "$ROUTE_SMOKE_BLOCKED" -eq 0 && "$AWOOOI_API_ROUTE_OK" -eq 1 ]] +} + +capacity_or_runner_issue() { + if service_route_recovered; then + evidence_warn "$@" + else + blocked "$@" + fi +} + run_and_capture() { local label="$1" shift @@ -509,9 +542,12 @@ if [[ "$COLD_START_PENDING_BLOCKERS" -gt 0 ]]; then printf '%s\n' "$non_route_cold_blockers" | grep -Ev '^BLOCKED AWOOOI API not reachable$|^BLOCKED AWOOI API not reachable$' || true )" fi - if [[ "$RUN_ROUTES" -eq 1 && "$ROUTE_SMOKE_BLOCKED" -eq 0 && -z "$non_route_cold_blockers" ]]; then + if service_route_recovered && [[ -z "$non_route_cold_blockers" ]]; then evidence_warn "cold-start route/API warmup blockers recovered under wrapper route retry: $COLD_START_BLOCKED_SUMMARY" printf '%s\n' "$COLD_START_BLOCKED_LINES" + elif service_route_recovered; then + evidence_warn "cold-start non-route blockers retained as capacity/freshness evidence after public routes and AWOOOI API recovered: $COLD_START_BLOCKED_SUMMARY" + printf '%s\n' "$COLD_START_BLOCKED_LINES" else blocked "cold-start has blockers: $COLD_START_BLOCKED_SUMMARY" printf '%s\n' "$COLD_START_BLOCKED_LINES" @@ -656,18 +692,18 @@ echo "RUNNER_PRESSURE_GATE_RC $?" ' >"$runner_tmp" 2>&1; then ok "110 controlled runner readback succeeded" else - blocked "110 controlled runner readback failed" + capacity_or_runner_issue "110 controlled runner readback failed" fi cat "$runner_tmp" if awk '$1 == "RUNNER_FAILCLOSED_UNIT" && $NF != "ok=1" {bad=1} END {exit bad}' "$runner_tmp"; then ok "110 legacy direct/Gitea runner units are fail-closed" else - blocked "110 legacy direct/Gitea runner units are not fail-closed" + capacity_or_runner_issue "110 legacy direct/Gitea runner units are not fail-closed" fi -grep -q "CD_LANE_GUARDRAILS_OK 1" "$runner_tmp" && ok "110 controlled cd-lane is safe-open/drained or fail-closed" || blocked "110 controlled cd-lane guardrails incomplete" -grep -q "RUNNER_DIRECT_PROCESS_COUNT 0" "$runner_tmp" && ok "110 legacy direct runner process count is zero" || blocked "110 legacy direct runner process detected" -grep -q "RUNNER_FAILCLOSED_BINARY_ELF" "$runner_tmp" && blocked "110 runner fail-closed binary path restored to ELF" || ok "110 runner binary paths are fail-closed stubs or missing" -grep -q "RUNNER_PRESSURE_GATE_RC 0" "$runner_tmp" && ok "110 host pressure gate returned 0" || blocked "110 host pressure gate is blocking" +grep -q "CD_LANE_GUARDRAILS_OK 1" "$runner_tmp" && ok "110 controlled cd-lane is safe-open/drained or fail-closed" || capacity_or_runner_issue "110 controlled cd-lane guardrails incomplete" +grep -q "RUNNER_DIRECT_PROCESS_COUNT 0" "$runner_tmp" && ok "110 legacy direct runner process count is zero" || capacity_or_runner_issue "110 legacy direct runner process detected" +grep -q "RUNNER_FAILCLOSED_BINARY_ELF" "$runner_tmp" && capacity_or_runner_issue "110 runner fail-closed binary path restored to ELF" || ok "110 runner binary paths are fail-closed stubs or missing" +grep -q "RUNNER_PRESSURE_GATE_RC 0" "$runner_tmp" && ok "110 host pressure gate returned 0" || capacity_or_runner_issue "110 host pressure gate is blocking" rm -f "$runner_tmp" section "總結" diff --git a/scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh b/scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh index 8323b863..27ffc94b 100755 --- a/scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh +++ b/scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh @@ -31,8 +31,10 @@ local_ip_list() { is_local_target() { local target_host="$1" + local ips [[ "$target_host" == "127.0.0.1" || "$target_host" == "localhost" ]] && return 0 - local_ip_list | grep -Fxq "$target_host" + ips="$(local_ip_list)" + grep -Fxq "$target_host" <<<"$ips" } emit_boot_row() { @@ -64,8 +66,10 @@ probe_local_host() { boot_id="$(cat /proc/sys/kernel/random/boot_id 2>/dev/null || echo unknown)" uptime_seconds="$(awk '{print int($1)}' /proc/uptime 2>/dev/null || echo unknown)" systemd_state="$(systemctl is-system-running 2>/dev/null || true)" - enabled="$(systemctl is-enabled "$unit" 2>/dev/null || echo unknown)" - active="$(systemctl is-active "$unit" 2>/dev/null || echo unknown)" + enabled="$(systemctl is-enabled "$unit" 2>/dev/null || true)" + active="$(systemctl is-active "$unit" 2>/dev/null || true)" + enabled="${enabled:-unknown}" + active="${active:-unknown}" emit_boot_row "$alias" "$target" "$unit" 1 "$boot_id" "$uptime_seconds" "$systemd_state" "$enabled" "$active" } @@ -106,8 +110,10 @@ probe_host() { boot_id=\$(cat /proc/sys/kernel/random/boot_id 2>/dev/null || echo unknown); \ uptime_seconds=\$(awk '{print int(\$1)}' /proc/uptime 2>/dev/null || echo unknown); \ systemd_state=\$(systemctl is-system-running 2>/dev/null || true); \ - enabled=\$(systemctl is-enabled \"\$unit\" 2>/dev/null || echo unknown); \ - active=\$(systemctl is-active \"\$unit\" 2>/dev/null || echo unknown); \ + enabled=\$(systemctl is-enabled \"\$unit\" 2>/dev/null || true); \ + active=\$(systemctl is-active \"\$unit\" 2>/dev/null || true); \ + enabled=\${enabled:-unknown}; \ + active=\${active:-unknown}; \ printf 'boot_id=%s uptime_seconds=%s systemd_state=%s startup_enabled=%s startup_active=%s\n' \"\$boot_id\" \"\$uptime_seconds\" \"\$systemd_state\" \"\$enabled\" \"\$active\" \ " 2>/dev/null)" if [[ $? -ne 0 || -z "$output" ]]; then diff --git a/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py b/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py index 0151f67c..42c2e0b0 100755 --- a/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py +++ b/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py @@ -219,7 +219,18 @@ def build_scorecard(args: argparse.Namespace) -> dict[str, Any]: [int_value(row.get("uptime_seconds"), 0) for row in host_rows if row.get("reachable")] or [0] ) - can_claim = not blockers + unique_blockers = sorted(set(blockers)) + can_claim = not unique_blockers + if unique_blockers == ["host_boot_observation_older_than_target_window"]: + safe_next_step = ( + "timer_deployed_and_services_readback_green_wait_for_next_all_host_reboot_" + "event_or_approved_reboot_drill_to_prove_10_minute_slo" + ) + else: + safe_next_step = ( + "deploy_boot_triggered_slo_timer_and_collect_all_host_boot_probe_then_" + "rerun_scorecard_until_status_slo_ready" + ) return { "schema_version": SCHEMA_VERSION, "generated_at": args.generated_at @@ -256,11 +267,8 @@ def build_scorecard(args: argparse.Namespace) -> dict[str, Any]: "free_gib": round(free_gib, 3) if free_gib is not None else None, "min_free_gib": args.min_free_gib, }, - "active_blockers": sorted(set(blockers)), - "safe_next_step": ( - "deploy_boot_triggered_slo_timer_and_collect_all_host_boot_probe_then_" - "rerun_scorecard_until_status_slo_ready" - ), + "active_blockers": unique_blockers, + "safe_next_step": safe_next_step, } diff --git a/scripts/reboot-recovery/tests/test_188_host_hygiene_checklist.py b/scripts/reboot-recovery/tests/test_188_host_hygiene_checklist.py new file mode 100644 index 00000000..af8236da --- /dev/null +++ b/scripts/reboot-recovery/tests/test_188_host_hygiene_checklist.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[3] +SCRIPT = ROOT / "scripts" / "reboot-recovery" / "188-host-hygiene-maintenance-checklist.sh" + + +def test_188_and_110_default_to_reachable_runtime_identities() -> None: + text = SCRIPT.read_text(encoding="utf-8") + + assert 'REMOTE_188="${REMOTE_188:-ollama@192.168.0.188}"' in text + assert 'REMOTE_110="${REMOTE_110:-wooo@192.168.0.110}"' in text + + +def test_110_self_check_can_run_locally_without_ssh_config_drift() -> None: + text = SCRIPT.read_text(encoding="utf-8") + + assert "is_local_target()" in text + assert 'ips="$(local_ip_list)"' in text + assert 'local_ip_list | grep' not in text + assert 'bash -lc "$command"' in text + assert 'ssh "${ssh_opts[@]}" "$target" "$command"' in text diff --git a/scripts/reboot-recovery/tests/test_post_start_quick_check_contract.py b/scripts/reboot-recovery/tests/test_post_start_quick_check_contract.py new file mode 100644 index 00000000..5bd7b1fe --- /dev/null +++ b/scripts/reboot-recovery/tests/test_post_start_quick_check_contract.py @@ -0,0 +1,26 @@ +from __future__ import annotations + +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[3] +SCRIPT = ROOT / "scripts" / "reboot-recovery" / "post-start-quick-check.sh" + + +def test_post_start_self_checks_can_run_locally_on_110() -> None: + text = SCRIPT.read_text(encoding="utf-8") + + assert "is_local_target()" in text + assert 'ips="$(local_ip_list)"' in text + assert 'local_ip_list | grep' not in text + assert 'bash -lc "$command"' in text + assert 'ssh -o BatchMode=yes -o ConnectTimeout="$SSH_CONNECT_TIMEOUT" "$user_host" "$command"' in text + + +def test_runner_pressure_is_capacity_evidence_after_routes_recover() -> None: + text = SCRIPT.read_text(encoding="utf-8") + + assert "service_route_recovered()" in text + assert "capacity_or_runner_issue()" in text + assert "cold-start non-route blockers retained as capacity/freshness evidence" in text + assert 'capacity_or_runner_issue "110 host pressure gate is blocking"' in text diff --git a/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py b/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py index cc3c39a7..cbda4163 100644 --- a/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py +++ b/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py @@ -86,3 +86,16 @@ def test_degraded_wazuh_and_old_boot_observation_block_slo(tmp_path: Path) -> No assert payload["can_claim_all_services_recovered_within_target"] is False assert "wazuh_dashboard_degraded" in payload["active_blockers"] assert "host_boot_observation_older_than_target_window" in payload["active_blockers"] + + +def test_services_green_but_old_boot_window_waits_for_reboot_event(tmp_path: Path) -> None: + probe = HOST_PROBE_GREEN.replace("uptime_seconds=150", "uptime_seconds=900") + + payload = run_scorecard(tmp_path, GREEN_SUMMARY, probe=probe) + + assert payload["status"] == "blocked_reboot_auto_recovery_slo_not_ready" + assert payload["active_blockers"] == ["host_boot_observation_older_than_target_window"] + assert payload["safe_next_step"] == ( + "timer_deployed_and_services_readback_green_wait_for_next_all_host_reboot_" + "event_or_approved_reboot_drill_to_prove_10_minute_slo" + )