diff --git a/docs/operations/awoooi-priority-work-order-readback.snapshot.json b/docs/operations/awoooi-priority-work-order-readback.snapshot.json index ee545028..84f16c6a 100644 --- a/docs/operations/awoooi-priority-work-order-readback.snapshot.json +++ b/docs/operations/awoooi-priority-work-order-readback.snapshot.json @@ -1,7 +1,7 @@ { "schema_version": "awoooi_priority_work_order_readback_v1", - "generated_at": "2026-06-29T14:27:32+08:00", - "status": "p0_006a_reboot_auto_recovery_slo_control_plane_added_blocked_until_live_probe", + "generated_at": "2026-06-29T14:49:52+08:00", + "status": "p0_006a_reboot_auto_recovery_slo_live_probe_installed_blocked_service_health", "source_refs": { "global_scorecard": "~/.codex/product-runtime-governance-completion-scorecard.snapshot.json", "workstation_dashboard": "~/.codex/codex-workstation-sync-dashboard.snapshot.json", @@ -15,9 +15,9 @@ "reboot_auto_recovery_slo_scorecard": "docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json" }, "current_head": { - "gitea_main_sha": "748ee37ca958df1e1e25453363de3d9f3f02a6c1", - "latest_successful_deploy_marker": "9362588ce chore(cd): deploy a423301 [skip ci]", - "latest_successful_deployed_source_sha": "a4233017ad5fd03977233f3db6a4bb45d71507ed", + "gitea_main_sha": "7ff959b6a8bbdf152da1969687f188ceda4b0561", + "latest_successful_deploy_marker": "15824e9ec chore(cd): deploy 57c1df1 [skip ci]", + "latest_successful_deployed_source_sha": "57c1df19fca580dafa5980d82d164819de0dbcd5", "latest_source_readiness_commit_sha": "0c8d4e88c39157b92322fa41a92e6b15c317ac49", "latest_source_readiness_cd_run_id": "3882", "latest_source_readiness_cd_run_status": "Success", @@ -187,7 +187,7 @@ "workplan_id": "P0-006", "title": "主機重啟自動偵測、自動觸發與 10 分鐘恢復 SLO", "status": "blocked_reboot_auto_recovery_slo_not_ready", - "reason": "The required target is automatic all-host reboot detection plus boot-triggered recovery verification. Current source now has the control-plane verifier, but live all-host boot probe has not been collected and Wazuh dashboard is still degraded.", + "reason": "Boot-triggered SLO timer is live on host 110 and all required host boot probes are now observed; the 10-minute recovery claim remains fail-closed because this was not a fresh reboot window and service/backup/post-start blockers remain.", "evidence": { "target_minutes": 10, "can_claim_all_services_recovered_within_target": false, @@ -195,31 +195,59 @@ "host_boot_probe_source_present": true, "slo_systemd_timer_source_present": true, "slo_exporter_source_present": true, - "post_start_blocked": 0, - "service_green": true, + "post_start_blocked": 6, + "service_green": false, "product_data_green": true, - "backup_core_green": true, - "wazuh_dashboard_degraded": true, - "all_host_reboot_detection_missing": true, - "host_boot_probe_missing_hosts": true, - "local_disk_free_gib_after_cleanup": 3.271 + "backup_core_green": false, + "wazuh_dashboard_degraded": false, + "all_host_reboot_detection_missing": false, + "host_boot_probe_missing_hosts": false, + "local_disk_free_gib_after_cleanup": 145.514, + "slo_installer_source_present": true, + "live_slo_timer_enabled": true, + "live_slo_timer_active": true, + "live_slo_service_last_result": "success", + "live_slo_metric_present": true, + "observed_hosts": [ + "110", + "120", + "121", + "188" + ], + "missing_hosts": [], + "unreachable_hosts": [], + "stale_hosts": [ + "110", + "120", + "121", + "188" + ], + "max_observed_uptime_seconds": 518414, + "active_blockers": [ + "backup_core_green_not_1", + "host_boot_observation_older_than_target_window", + "post_start_blocked_not_zero", + "service_green_not_1" + ] }, "professional_fix": { "owner": "reboot auto-recovery lane", - "action": "Deploy the boot-triggered SLO timer/exporter, collect all-host boot probes, and rerun the scorecard until it can prove all services recovered inside 10 minutes.", + "action": "Keep the live boot-triggered SLO timer enabled, fix backup_core/post-start/service blockers, then use the next fresh reboot window to prove max_observed_uptime_seconds<=600.", "exit_criteria": [ "can_claim_all_services_recovered_within_target=true", "observed_hosts=110,120,121,188", - "max_observed_uptime_seconds<=600", + "missing_hosts=[]", + "unreachable_hosts=[]", + "max_observed_uptime_seconds<=600 during a fresh reboot window", "POST_START_BLOCKED=0", "SERVICE_GREEN=1", "PRODUCT_DATA_GREEN=1", "BACKUP_CORE_GREEN=1", "WAZUH_DASHBOARD_DEGRADED=0", - "local_disk_free_gib>=2" + "live_slo_metric_present=true" ] }, - "safe_next_step": "deploy_boot_triggered_slo_timer_and_collect_all_host_boot_probe_then_rerun_scorecard" + "safe_next_step": "fix_backup_core_post_start_and_service_green_blockers_then_rerun_live_slo_scorecard_after_next_reboot_window" } ], "noise_integrated_risk_register": [ diff --git a/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json b/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json index 8660a63c..a3082a49 100644 --- a/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json +++ b/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json @@ -1,47 +1,98 @@ { "active_blockers": [ - "all_host_reboot_detection_missing", - "host_boot_probe_missing_hosts", - "wazuh_dashboard_degraded" + "backup_core_green_not_1", + "host_boot_observation_older_than_target_window", + "post_start_blocked_not_zero", + "service_green_not_1" ], "can_claim_all_services_recovered_within_target": false, "capacity": { "checked": true, - "free_gib": 2.707, + "free_gib": 145.514, "min_free_gib": 2.0 }, - "generated_at": "2026-06-29T14:27:32+08:00", + "generated_at": "2026-06-29T14:49:52+08:00", "host_boot_detection": { - "host_rows": [], - "max_observed_uptime_seconds": 0, - "missing_hosts": [ + "host_rows": [ + { + "alias": "110", + "boot_id": "a3dfae32-3762-4394-86fa-a342aea07df5", + "reachable": true, + "startup_active": "inactive_unknown", + "startup_enabled": "enabled", + "startup_unit": "awoooi-startup-110.service", + "systemd_state": "degraded", + "target": "wooo@192.168.0.110", + "uptime_seconds": 518406 + }, + { + "alias": "120", + "boot_id": "866d621e-bad0-4096-938e-b103db8e5e03", + "reachable": true, + "startup_active": "active", + "startup_enabled": "enabled", + "startup_unit": "k3s.service", + "systemd_state": "running", + "target": "wooo@192.168.0.120", + "uptime_seconds": 518397 + }, + { + "alias": "121", + "boot_id": "119c4ea7-8b49-45aa-b60e-bcd4b5dd0979", + "reachable": true, + "startup_active": "active", + "startup_enabled": "enabled", + "startup_unit": "k3s.service", + "systemd_state": "running", + "target": "wooo@192.168.0.121", + "uptime_seconds": 518355 + }, + { + "alias": "188", + "boot_id": "9cc1f1fc-b7cc-42c1-bc83-d495f0e3c863", + "reachable": true, + "startup_active": "inactive", + "startup_enabled": "enabled", + "startup_unit": "awoooi-startup.service", + "systemd_state": "running", + "target": "ollama@192.168.0.188", + "uptime_seconds": 518414 + } + ], + "max_observed_uptime_seconds": 518414, + "missing_hosts": [], + "observed_hosts": [ "110", "120", "121", "188" ], - "observed_hosts": [], "required_hosts": [ "110", "120", "121", "188" ], - "stale_hosts": [], + "stale_hosts": [ + "110", + "120", + "121", + "188" + ], "unknown_uptime_hosts": [], "unreachable_hosts": [] }, "post_reboot_readiness": { - "backup_core_green": true, + "backup_core_green": false, "host_188_service_green": true, - "next_required_gates": "credential_escrow_evidence", - "overall_declaration": "FULL_STACK_GREEN_DR_ESCROW_BLOCKED", - "post_start_blocked": 0, - "post_start_result": "FULL_STACK_GREEN_DR_ESCROW_BLOCKED", + "next_required_gates": "none", + "overall_declaration": "SERVICE_BLOCKED", + "post_start_blocked": 6, + "post_start_result": "BLOCKED", "product_data_green": true, - "service_green": true, + "service_green": false, "summary_present": true, - "wazuh_dashboard_degraded": true + "wazuh_dashboard_degraded": false }, "safe_next_step": "deploy_boot_triggered_slo_timer_and_collect_all_host_boot_probe_then_rerun_scorecard_until_status_slo_ready", "schema_version": "awoooi_reboot_auto_recovery_slo_scorecard_v1", @@ -52,6 +103,7 @@ "host_boot_probe_source_present": true, "post_reboot_summary_source_present": true, "slo_exporter_source_present": true, + "slo_installer_source_present": true, "slo_systemd_service_source_present": true, "slo_systemd_timer_source_present": true }, diff --git a/k8s/awoooi-prod/06-deployment-api.yaml b/k8s/awoooi-prod/06-deployment-api.yaml index 50d04c25..ffcda404 100644 --- a/k8s/awoooi-prod/06-deployment-api.yaml +++ b/k8s/awoooi-prod/06-deployment-api.yaml @@ -79,7 +79,7 @@ spec: - name: AWOOOI_BUILD_COMMIT_SHA # 2026-06-29 Codex: CD rewrites this to the deployed image tag so # production deploy readback does not rely on a stale static snapshot. - value: "57c1df19fca580dafa5980d82d164819de0dbcd5" + value: "7ff959b6a8bbdf152da1969687f188ceda4b0561" - name: USE_AI_ROUTER value: "true" - name: ENABLE_NEMOTRON_COLLABORATION diff --git a/k8s/awoooi-prod/kustomization.yaml b/k8s/awoooi-prod/kustomization.yaml index fd6c040c..9dab9403 100644 --- a/k8s/awoooi-prod/kustomization.yaml +++ b/k8s/awoooi-prod/kustomization.yaml @@ -41,7 +41,7 @@ resources: images: - name: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER newName: 192.168.0.110:5000/awoooi/api - newTag: 57c1df19fca580dafa5980d82d164819de0dbcd5 + newTag: 7ff959b6a8bbdf152da1969687f188ceda4b0561 - name: 192.168.0.110:5000/library/web:IMAGE_TAG_PLACEHOLDER newName: 192.168.0.110:5000/awoooi/web - newTag: 57c1df19fca580dafa5980d82d164819de0dbcd5 + newTag: 7ff959b6a8bbdf152da1969687f188ceda4b0561 diff --git a/scripts/reboot-recovery/awoooi-reboot-auto-recovery-slo.service b/scripts/reboot-recovery/awoooi-reboot-auto-recovery-slo.service index ae81c7b1..722b6d8f 100644 --- a/scripts/reboot-recovery/awoooi-reboot-auto-recovery-slo.service +++ b/scripts/reboot-recovery/awoooi-reboot-auto-recovery-slo.service @@ -5,7 +5,14 @@ Wants=network-online.target [Service] Type=oneshot -Environment=ROOT_DIR=/opt/awoooi +User=wooo +Group=wooo +WorkingDirectory=/home/wooo +Environment=HOME=/home/wooo +Environment=PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin +Environment=ROOT_DIR=/home/wooo/awoooi-reboot-recovery-slo +Environment=TEXTFILE_DIR=/home/wooo/node_exporter_textfiles +Environment=LOG_DIR=/home/wooo/reboot-recovery Environment=TARGET_MINUTES=10 ExecStart=/usr/local/bin/awoooi-reboot-auto-recovery-slo.sh TimeoutStartSec=600 diff --git a/scripts/reboot-recovery/install-reboot-auto-recovery-slo-110.sh b/scripts/reboot-recovery/install-reboot-auto-recovery-slo-110.sh new file mode 100644 index 00000000..a8e0741f --- /dev/null +++ b/scripts/reboot-recovery/install-reboot-auto-recovery-slo-110.sh @@ -0,0 +1,252 @@ +#!/usr/bin/env bash +# Install the read-only AWOOOI reboot auto-recovery SLO verifier on host 110. +# +# This installer only stages source scripts, enables a systemd timer, starts the +# verifier service once, and reads back metrics/status. It does not reboot hosts +# or restart Docker, Nginx, K3s, PostgreSQL, Redis, firewall, or product units. + +set -euo pipefail + +HOST="wooo@192.168.0.110" +REMOTE_ROOT="/home/wooo/awoooi-reboot-recovery-slo" +TEXTFILE_DIR="/home/wooo/node_exporter_textfiles" +LOG_DIR="/home/wooo/reboot-recovery" +MODE="install" +RUN_ONCE=1 +SSH_CONNECT_TIMEOUT_SECONDS="${SSH_CONNECT_TIMEOUT_SECONDS:-8}" + +usage() { + cat <<'USAGE' +Usage: install-reboot-auto-recovery-slo-110.sh [options] + +Installs the AWOOOI reboot auto-recovery 10-minute SLO verifier on host 110. + +Options: + --dry-run Print the controlled apply plan without writing host state. + --verify-only Read back timer/service/metric state without writing. + --rollback Disable/remove the verifier timer/service/wrapper. + --no-run-once Install and enable the timer without starting the verifier once. + --host HOST SSH target. Default: wooo@192.168.0.110 + --remote-root DIR Remote source root. Default: /home/wooo/awoooi-reboot-recovery-slo + --textfile-dir DIR Node exporter textfile dir. Default: /home/wooo/node_exporter_textfiles + --log-dir DIR Evidence log dir. Default: /home/wooo/reboot-recovery + -h, --help Show this help. + +Controlled apply boundaries: + - allowed: stage repo scripts, install verifier unit/timer, daemon-reload, + enable/start the verifier timer, start only awoooi-reboot-auto-recovery-slo.service + - forbidden here: reboot, node drain, firewall changes, destructive DB work, + service restarts for Docker/Nginx/K3s/PostgreSQL/Redis/product workloads, + secret/session/auth/.env reads, GitHub usage, force push +USAGE +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --dry-run) + MODE="dry-run" + ;; + --verify-only) + MODE="verify" + ;; + --rollback) + MODE="rollback" + ;; + --no-run-once) + RUN_ONCE=0 + ;; + --host) + HOST="${2:?--host requires a value}" + shift + ;; + --remote-root) + REMOTE_ROOT="${2:?--remote-root requires a value}" + shift + ;; + --textfile-dir) + TEXTFILE_DIR="${2:?--textfile-dir requires a value}" + shift + ;; + --log-dir) + LOG_DIR="${2:?--log-dir requires a value}" + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + printf 'Unknown argument: %s\n' "$1" >&2 + usage >&2 + exit 2 + ;; + esac + shift +done + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)" +SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout="$SSH_CONNECT_TIMEOUT_SECONDS") +UNIT_NAME="awoooi-reboot-auto-recovery-slo" +REMOTE_TARBALL="/tmp/${UNIT_NAME}-$(date '+%Y%m%d%H%M%S').tar.gz" + +quote() { + printf '%q' "$1" +} + +print_plan() { + cat </dev/null | sort | tail -n 1)" +echo "SLO_TIMER_ENABLED=$(systemctl is-enabled "${unit}.timer" 2>/dev/null || true)" +echo "SLO_TIMER_ACTIVE=$(systemctl is-active "${unit}.timer" 2>/dev/null || true)" +echo "SLO_SERVICE_LOAD=$(systemctl show "${unit}.service" -p LoadState --value 2>/dev/null || true)" +echo "SLO_SERVICE_RESULT=$(systemctl show "${unit}.service" -p Result --value 2>/dev/null || true)" +echo "SLO_SERVICE_LAST_STATUS=$(systemctl show "${unit}.service" -p ExecMainStatus --value 2>/dev/null || true)" +echo "SLO_METRIC_PRESENT=$([ -s "$metric" ] && echo 1 || echo 0)" +echo "SLO_METRIC_PATH=$metric" +if [ -s "$metric" ]; then + awk 'NF && $1 !~ /^#/ {print "SLO_METRIC_LINE " $0}' "$metric" | tail -20 +fi +echo "SLO_LATEST_ARTIFACT_DIR=${latest_dir:-none}" +if [ -n "${latest_dir:-}" ] && [ -s "$latest_dir/scorecard.json" ]; then + python3 - "$latest_dir/scorecard.json" <<'PY' +import json, sys +payload = json.load(open(sys.argv[1], encoding="utf-8")) +print("SLO_SCORECARD_STATUS=" + str(payload.get("status", "unknown"))) +print( + "SLO_CAN_CLAIM_ALL_SERVICES_RECOVERED_WITHIN_TARGET=" + + ("1" if payload.get("can_claim_all_services_recovered_within_target") else "0") +) +print("SLO_ACTIVE_BLOCKERS=" + ",".join(payload.get("active_blockers") or [])) +print( + "SLO_OBSERVED_HOSTS=" + + ",".join(payload.get("host_boot_detection", {}).get("observed_hosts") or []) +) +PY +fi +REMOTE +} + +ssh_remote() { + ssh "${SSH_OPTS[@]}" "$HOST" \ + "TEXTFILE_DIR=$(quote "$TEXTFILE_DIR") LOG_DIR=$(quote "$LOG_DIR") bash -s" +} + +verify_remote() { + remote_verify_command | ssh_remote +} + +rollback_remote() { + cat <<'REMOTE' | ssh_remote +set -euo pipefail +unit="awoooi-reboot-auto-recovery-slo" +sudo systemctl disable --now "${unit}.timer" >/dev/null 2>&1 || true +sudo systemctl stop "${unit}.service" >/dev/null 2>&1 || true +sudo rm -f "/etc/systemd/system/${unit}.timer" "/etc/systemd/system/${unit}.service" +sudo rm -f "/usr/local/bin/${unit}.sh" +sudo systemctl daemon-reload +echo "SLO_ROLLBACK_DONE=1" +REMOTE +} + +install_remote() { + local run_once_command=":" + if [[ "$RUN_ONCE" -eq 1 ]]; then + run_once_command="sudo systemctl start ${UNIT_NAME}.service" + fi + + cat <&2 + exit 2 + ;; +esac + +payload_dir="$(mktemp -d "${TMPDIR:-/tmp}/${UNIT_NAME}.XXXXXX")" +payload="${payload_dir}/payload.tar.gz" +trap 'rm -rf "$payload_dir"' EXIT +create_payload "$payload" +scp "${SSH_OPTS[@]}" "$payload" "$HOST:$REMOTE_TARBALL" +install_remote +verify_remote diff --git a/scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh b/scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh index b38fa992..8323b863 100755 --- a/scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh +++ b/scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh @@ -8,6 +8,8 @@ set -uo pipefail SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout="${SSH_CONNECT_TIMEOUT_SECONDS:-6}") +NODE_EXPORTER_PORT="${NODE_EXPORTER_PORT:-9100}" +NODE_EXPORTER_TIMEOUT_SECONDS="${NODE_EXPORTER_TIMEOUT_SECONDS:-4}" HOST_SPECS=( "110=wooo@192.168.0.110:awoooi-startup-110.service" "120=wooo@192.168.0.120:k3s.service" @@ -19,11 +21,86 @@ escape_value() { printf '%s' "$1" | tr ' \t\n' '___' } +local_ip_list() { + { + hostname -I 2>/dev/null | tr ' ' '\n' || true + ip -o -4 addr show 2>/dev/null | awk '{split($4,a,"/"); print a[1]}' || true + ifconfig 2>/dev/null | awk '$1 == "inet" {print $2}' || true + } | awk 'NF' +} + +is_local_target() { + local target_host="$1" + [[ "$target_host" == "127.0.0.1" || "$target_host" == "localhost" ]] && return 0 + local_ip_list | grep -Fxq "$target_host" +} + +emit_boot_row() { + local alias="$1" + local target="$2" + local unit="$3" + local reachable="$4" + local boot_id="$5" + local uptime_seconds="$6" + local systemd_state="$7" + local enabled="$8" + local active="$9" + + printf 'HOST_BOOT alias=%s target=%s startup_unit=%s reachable=%s boot_id=%s uptime_seconds=%s systemd_state=%s startup_enabled=%s startup_active=%s\n' \ + "$alias" "$target" "$unit" "$reachable" \ + "$(escape_value "${boot_id:-unknown}")" \ + "$(escape_value "${uptime_seconds:-unknown}")" \ + "$(escape_value "${systemd_state:-unknown}")" \ + "$(escape_value "${enabled:-unknown}")" \ + "$(escape_value "${active:-unknown}")" +} + +probe_local_host() { + local alias="$1" + local target="$2" + local unit="$3" + local boot_id uptime_seconds systemd_state enabled active + + boot_id="$(cat /proc/sys/kernel/random/boot_id 2>/dev/null || echo unknown)" + uptime_seconds="$(awk '{print int($1)}' /proc/uptime 2>/dev/null || echo unknown)" + systemd_state="$(systemctl is-system-running 2>/dev/null || true)" + enabled="$(systemctl is-enabled "$unit" 2>/dev/null || echo unknown)" + active="$(systemctl is-active "$unit" 2>/dev/null || echo unknown)" + emit_boot_row "$alias" "$target" "$unit" 1 "$boot_id" "$uptime_seconds" "$systemd_state" "$enabled" "$active" +} + +probe_node_exporter() { + local alias="$1" + local target="$2" + local unit="$3" + local target_host="${target##*@}" + local metrics boot_time now uptime_seconds + + if command -v timeout >/dev/null 2>&1; then + metrics="$(timeout "$NODE_EXPORTER_TIMEOUT_SECONDS" curl -fsS "http://${target_host}:${NODE_EXPORTER_PORT}/metrics" 2>/dev/null || true)" + else + metrics="$(curl --max-time "$NODE_EXPORTER_TIMEOUT_SECONDS" -fsS "http://${target_host}:${NODE_EXPORTER_PORT}/metrics" 2>/dev/null || true)" + fi + boot_time="$(awk '$1 == "node_boot_time_seconds" {printf "%d", $2; found=1; exit} END {if (!found) print ""}' <<<"$metrics")" + if [[ -z "$boot_time" ]]; then + return 1 + fi + now="$(date +%s)" + uptime_seconds=$((now - boot_time)) + emit_boot_row "$alias" "$target" "$unit" 1 "node_exporter_${boot_time}" "$uptime_seconds" "node_exporter" "unknown" "unknown" +} + probe_host() { local alias="$1" local target="$2" local unit="$3" local output boot_id uptime_seconds systemd_state enabled active + local target_host="${target##*@}" + + if is_local_target "$target_host"; then + probe_local_host "$alias" "$target" "$unit" + return 0 + fi output="$(ssh "${SSH_OPTS[@]}" "$target" "unit='$unit'; \ boot_id=\$(cat /proc/sys/kernel/random/boot_id 2>/dev/null || echo unknown); \ @@ -34,8 +111,10 @@ probe_host() { printf 'boot_id=%s uptime_seconds=%s systemd_state=%s startup_enabled=%s startup_active=%s\n' \"\$boot_id\" \"\$uptime_seconds\" \"\$systemd_state\" \"\$enabled\" \"\$active\" \ " 2>/dev/null)" if [[ $? -ne 0 || -z "$output" ]]; then - printf 'HOST_BOOT alias=%s target=%s startup_unit=%s reachable=0 boot_id=unknown uptime_seconds=unknown systemd_state=unknown startup_enabled=unknown startup_active=unknown\n' \ - "$alias" "$target" "$unit" + if probe_node_exporter "$alias" "$target" "$unit"; then + return 0 + fi + emit_boot_row "$alias" "$target" "$unit" 0 "unknown" "unknown" "unknown" "unknown" "unknown" return 0 fi @@ -45,13 +124,7 @@ probe_host() { enabled="$(sed -n 's/.*startup_enabled=\([^ ]*\).*/\1/p' <<<"$output")" active="$(sed -n 's/.*startup_active=\([^ ]*\).*/\1/p' <<<"$output")" - printf 'HOST_BOOT alias=%s target=%s startup_unit=%s reachable=1 boot_id=%s uptime_seconds=%s systemd_state=%s startup_enabled=%s startup_active=%s\n' \ - "$alias" "$target" "$unit" \ - "$(escape_value "${boot_id:-unknown}")" \ - "$(escape_value "${uptime_seconds:-unknown}")" \ - "$(escape_value "${systemd_state:-unknown}")" \ - "$(escape_value "${enabled:-unknown}")" \ - "$(escape_value "${active:-unknown}")" + emit_boot_row "$alias" "$target" "$unit" 1 "$boot_id" "$uptime_seconds" "$systemd_state" "$enabled" "$active" } echo "AWOOOI_REBOOT_AUTO_RECOVERY_HOST_PROBE=1" diff --git a/scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh b/scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh index ebca537f..c9064b15 100755 --- a/scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh +++ b/scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh @@ -12,14 +12,15 @@ LOG_DIR="${LOG_DIR:-/home/wooo/reboot-recovery}" OUTPUT_NAME="${OUTPUT_NAME:-reboot_auto_recovery_slo.prom}" TARGET_MINUTES="${TARGET_MINUTES:-10}" MIN_FREE_GIB="${MIN_FREE_GIB:-2}" -LOCK_FILE="${LOCK_FILE:-/tmp/awoooi-reboot-auto-recovery-slo.lock}" +LOCK_FILE="${LOCK_FILE:-${LOG_DIR}/reboot_auto_recovery_slo.lock}" + +mkdir -p "$TEXTFILE_DIR" "$LOG_DIR" if command -v flock >/dev/null 2>&1; then exec 9>"$LOCK_FILE" flock -n 9 || exit 0 fi -mkdir -p "$TEXTFILE_DIR" "$LOG_DIR" run_id="$(date '+%Y%m%d-%H%M%S')" artifact_dir="$LOG_DIR/reboot-auto-recovery-slo-$run_id" mkdir -p "$artifact_dir" diff --git a/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py b/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py index a2979ad1..0151f67c 100755 --- a/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py +++ b/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py @@ -119,6 +119,13 @@ def source_controls() -> dict[str, bool]: "OnBootSec=", "OnUnitActiveSec=", ), + "slo_installer_source_present": file_contains( + source_file("scripts/reboot-recovery/install-reboot-auto-recovery-slo-110.sh"), + "--dry-run", + "--verify-only", + "--rollback", + "systemctl enable --now", + ), "post_reboot_summary_source_present": source_file( "scripts/reboot-recovery/post-reboot-readiness-summary.sh" ).exists(), diff --git a/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_installer.py b/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_installer.py new file mode 100644 index 00000000..be5c66dc --- /dev/null +++ b/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_installer.py @@ -0,0 +1,79 @@ +from __future__ import annotations + +import subprocess +import re +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[3] +INSTALLER = ROOT / "scripts" / "reboot-recovery" / "install-reboot-auto-recovery-slo-110.sh" +SERVICE = ROOT / "scripts" / "reboot-recovery" / "awoooi-reboot-auto-recovery-slo.service" +EXPORTER = ROOT / "scripts" / "reboot-recovery" / "reboot-auto-recovery-slo-exporter.sh" + + +def test_installer_dry_run_exposes_apply_rollback_and_verify_contract() -> None: + result = subprocess.run( + ["bash", str(INSTALLER), "--dry-run"], + text=True, + capture_output=True, + check=True, + ) + + assert "DRY_RUN=1" in result.stdout + assert "target_selector=host_110_systemd_timer:awoooi-reboot-auto-recovery-slo.timer" in result.stdout + assert "rollback_command=bash scripts/reboot-recovery/install-reboot-auto-recovery-slo-110.sh --rollback" in result.stdout + assert "verify_command=bash scripts/reboot-recovery/install-reboot-auto-recovery-slo-110.sh --verify-only" in result.stdout + assert "would_enable_timer=awoooi-reboot-auto-recovery-slo.timer" in result.stdout + + +def test_installer_is_limited_to_verifier_timer_not_product_restarts() -> None: + text = INSTALLER.read_text(encoding="utf-8") + executable_text = "\n".join( + line for line in text.splitlines() if not line.lstrip().startswith("#") + ) + forbidden_fragments = [ + "shutdown", + "systemctl restart docker", + "systemctl restart nginx", + "systemctl restart k3s", + "systemctl restart postgresql", + "systemctl restart redis", + "docker restart", + "kubectl drain", + "iptables ", + "ufw ", + "DROP ", + "TRUNCATE ", + "gh ", + "github.com", + ] + + for fragment in forbidden_fragments: + assert fragment not in executable_text + assert re.search(r"(?m)^\s*(sudo\s+)?reboot(\s|$)", executable_text) is None + assert "systemctl enable --now" in text + assert "systemctl start ${UNIT_NAME}.service" in text + assert 'payload_dir="$(mktemp -d "${TMPDIR:-/tmp}/${UNIT_NAME}.XXXXXX")"' in text + assert 'payload="${payload_dir}/payload.tar.gz"' in text + + +def test_service_uses_deployed_source_root_and_bounded_oneshot() -> None: + text = SERVICE.read_text(encoding="utf-8") + + assert "Type=oneshot" in text + assert "User=wooo" in text + assert "Group=wooo" in text + assert "WorkingDirectory=/home/wooo" in text + assert "Environment=HOME=/home/wooo" in text + assert "Environment=ROOT_DIR=/home/wooo/awoooi-reboot-recovery-slo" in text + assert "Environment=TEXTFILE_DIR=/home/wooo/node_exporter_textfiles" in text + assert "Environment=LOG_DIR=/home/wooo/reboot-recovery" in text + assert "ExecStart=/usr/local/bin/awoooi-reboot-auto-recovery-slo.sh" in text + assert "TimeoutStartSec=600" in text + + +def test_exporter_uses_user_writable_lock_after_creating_log_dir() -> None: + text = EXPORTER.read_text(encoding="utf-8") + + assert 'LOCK_FILE="${LOCK_FILE:-${LOG_DIR}/reboot_auto_recovery_slo.lock}"' in text + assert text.index('mkdir -p "$TEXTFILE_DIR" "$LOG_DIR"') < text.index('exec 9>"$LOCK_FILE"')