Merge remote-tracking branch 'gitea-ssh/main' into codex/p0-product-manifest-standard-20260629
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"schema_version": "awoooi_priority_work_order_readback_v1",
|
||||
"generated_at": "2026-06-29T14:27:32+08:00",
|
||||
"status": "p0_006a_reboot_auto_recovery_slo_control_plane_added_blocked_until_live_probe",
|
||||
"generated_at": "2026-06-29T14:49:52+08:00",
|
||||
"status": "p0_006a_reboot_auto_recovery_slo_live_probe_installed_blocked_service_health",
|
||||
"source_refs": {
|
||||
"global_scorecard": "~/.codex/product-runtime-governance-completion-scorecard.snapshot.json",
|
||||
"workstation_dashboard": "~/.codex/codex-workstation-sync-dashboard.snapshot.json",
|
||||
@@ -15,9 +15,9 @@
|
||||
"reboot_auto_recovery_slo_scorecard": "docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json"
|
||||
},
|
||||
"current_head": {
|
||||
"gitea_main_sha": "748ee37ca958df1e1e25453363de3d9f3f02a6c1",
|
||||
"latest_successful_deploy_marker": "9362588ce chore(cd): deploy a423301 [skip ci]",
|
||||
"latest_successful_deployed_source_sha": "a4233017ad5fd03977233f3db6a4bb45d71507ed",
|
||||
"gitea_main_sha": "7ff959b6a8bbdf152da1969687f188ceda4b0561",
|
||||
"latest_successful_deploy_marker": "15824e9ec chore(cd): deploy 57c1df1 [skip ci]",
|
||||
"latest_successful_deployed_source_sha": "57c1df19fca580dafa5980d82d164819de0dbcd5",
|
||||
"latest_source_readiness_commit_sha": "0c8d4e88c39157b92322fa41a92e6b15c317ac49",
|
||||
"latest_source_readiness_cd_run_id": "3882",
|
||||
"latest_source_readiness_cd_run_status": "Success",
|
||||
@@ -187,7 +187,7 @@
|
||||
"workplan_id": "P0-006",
|
||||
"title": "主機重啟自動偵測、自動觸發與 10 分鐘恢復 SLO",
|
||||
"status": "blocked_reboot_auto_recovery_slo_not_ready",
|
||||
"reason": "The required target is automatic all-host reboot detection plus boot-triggered recovery verification. Current source now has the control-plane verifier, but live all-host boot probe has not been collected and Wazuh dashboard is still degraded.",
|
||||
"reason": "Boot-triggered SLO timer is live on host 110 and all required host boot probes are now observed; the 10-minute recovery claim remains fail-closed because this was not a fresh reboot window and service/backup/post-start blockers remain.",
|
||||
"evidence": {
|
||||
"target_minutes": 10,
|
||||
"can_claim_all_services_recovered_within_target": false,
|
||||
@@ -195,31 +195,59 @@
|
||||
"host_boot_probe_source_present": true,
|
||||
"slo_systemd_timer_source_present": true,
|
||||
"slo_exporter_source_present": true,
|
||||
"post_start_blocked": 0,
|
||||
"service_green": true,
|
||||
"post_start_blocked": 6,
|
||||
"service_green": false,
|
||||
"product_data_green": true,
|
||||
"backup_core_green": true,
|
||||
"wazuh_dashboard_degraded": true,
|
||||
"all_host_reboot_detection_missing": true,
|
||||
"host_boot_probe_missing_hosts": true,
|
||||
"local_disk_free_gib_after_cleanup": 3.271
|
||||
"backup_core_green": false,
|
||||
"wazuh_dashboard_degraded": false,
|
||||
"all_host_reboot_detection_missing": false,
|
||||
"host_boot_probe_missing_hosts": false,
|
||||
"local_disk_free_gib_after_cleanup": 145.514,
|
||||
"slo_installer_source_present": true,
|
||||
"live_slo_timer_enabled": true,
|
||||
"live_slo_timer_active": true,
|
||||
"live_slo_service_last_result": "success",
|
||||
"live_slo_metric_present": true,
|
||||
"observed_hosts": [
|
||||
"110",
|
||||
"120",
|
||||
"121",
|
||||
"188"
|
||||
],
|
||||
"missing_hosts": [],
|
||||
"unreachable_hosts": [],
|
||||
"stale_hosts": [
|
||||
"110",
|
||||
"120",
|
||||
"121",
|
||||
"188"
|
||||
],
|
||||
"max_observed_uptime_seconds": 518414,
|
||||
"active_blockers": [
|
||||
"backup_core_green_not_1",
|
||||
"host_boot_observation_older_than_target_window",
|
||||
"post_start_blocked_not_zero",
|
||||
"service_green_not_1"
|
||||
]
|
||||
},
|
||||
"professional_fix": {
|
||||
"owner": "reboot auto-recovery lane",
|
||||
"action": "Deploy the boot-triggered SLO timer/exporter, collect all-host boot probes, and rerun the scorecard until it can prove all services recovered inside 10 minutes.",
|
||||
"action": "Keep the live boot-triggered SLO timer enabled, fix backup_core/post-start/service blockers, then use the next fresh reboot window to prove max_observed_uptime_seconds<=600.",
|
||||
"exit_criteria": [
|
||||
"can_claim_all_services_recovered_within_target=true",
|
||||
"observed_hosts=110,120,121,188",
|
||||
"max_observed_uptime_seconds<=600",
|
||||
"missing_hosts=[]",
|
||||
"unreachable_hosts=[]",
|
||||
"max_observed_uptime_seconds<=600 during a fresh reboot window",
|
||||
"POST_START_BLOCKED=0",
|
||||
"SERVICE_GREEN=1",
|
||||
"PRODUCT_DATA_GREEN=1",
|
||||
"BACKUP_CORE_GREEN=1",
|
||||
"WAZUH_DASHBOARD_DEGRADED=0",
|
||||
"local_disk_free_gib>=2"
|
||||
"live_slo_metric_present=true"
|
||||
]
|
||||
},
|
||||
"safe_next_step": "deploy_boot_triggered_slo_timer_and_collect_all_host_boot_probe_then_rerun_scorecard"
|
||||
"safe_next_step": "fix_backup_core_post_start_and_service_green_blockers_then_rerun_live_slo_scorecard_after_next_reboot_window"
|
||||
}
|
||||
],
|
||||
"noise_integrated_risk_register": [
|
||||
|
||||
@@ -1,47 +1,98 @@
|
||||
{
|
||||
"active_blockers": [
|
||||
"all_host_reboot_detection_missing",
|
||||
"host_boot_probe_missing_hosts",
|
||||
"wazuh_dashboard_degraded"
|
||||
"backup_core_green_not_1",
|
||||
"host_boot_observation_older_than_target_window",
|
||||
"post_start_blocked_not_zero",
|
||||
"service_green_not_1"
|
||||
],
|
||||
"can_claim_all_services_recovered_within_target": false,
|
||||
"capacity": {
|
||||
"checked": true,
|
||||
"free_gib": 2.707,
|
||||
"free_gib": 145.514,
|
||||
"min_free_gib": 2.0
|
||||
},
|
||||
"generated_at": "2026-06-29T14:27:32+08:00",
|
||||
"generated_at": "2026-06-29T14:49:52+08:00",
|
||||
"host_boot_detection": {
|
||||
"host_rows": [],
|
||||
"max_observed_uptime_seconds": 0,
|
||||
"missing_hosts": [
|
||||
"host_rows": [
|
||||
{
|
||||
"alias": "110",
|
||||
"boot_id": "a3dfae32-3762-4394-86fa-a342aea07df5",
|
||||
"reachable": true,
|
||||
"startup_active": "inactive_unknown",
|
||||
"startup_enabled": "enabled",
|
||||
"startup_unit": "awoooi-startup-110.service",
|
||||
"systemd_state": "degraded",
|
||||
"target": "wooo@192.168.0.110",
|
||||
"uptime_seconds": 518406
|
||||
},
|
||||
{
|
||||
"alias": "120",
|
||||
"boot_id": "866d621e-bad0-4096-938e-b103db8e5e03",
|
||||
"reachable": true,
|
||||
"startup_active": "active",
|
||||
"startup_enabled": "enabled",
|
||||
"startup_unit": "k3s.service",
|
||||
"systemd_state": "running",
|
||||
"target": "wooo@192.168.0.120",
|
||||
"uptime_seconds": 518397
|
||||
},
|
||||
{
|
||||
"alias": "121",
|
||||
"boot_id": "119c4ea7-8b49-45aa-b60e-bcd4b5dd0979",
|
||||
"reachable": true,
|
||||
"startup_active": "active",
|
||||
"startup_enabled": "enabled",
|
||||
"startup_unit": "k3s.service",
|
||||
"systemd_state": "running",
|
||||
"target": "wooo@192.168.0.121",
|
||||
"uptime_seconds": 518355
|
||||
},
|
||||
{
|
||||
"alias": "188",
|
||||
"boot_id": "9cc1f1fc-b7cc-42c1-bc83-d495f0e3c863",
|
||||
"reachable": true,
|
||||
"startup_active": "inactive",
|
||||
"startup_enabled": "enabled",
|
||||
"startup_unit": "awoooi-startup.service",
|
||||
"systemd_state": "running",
|
||||
"target": "ollama@192.168.0.188",
|
||||
"uptime_seconds": 518414
|
||||
}
|
||||
],
|
||||
"max_observed_uptime_seconds": 518414,
|
||||
"missing_hosts": [],
|
||||
"observed_hosts": [
|
||||
"110",
|
||||
"120",
|
||||
"121",
|
||||
"188"
|
||||
],
|
||||
"observed_hosts": [],
|
||||
"required_hosts": [
|
||||
"110",
|
||||
"120",
|
||||
"121",
|
||||
"188"
|
||||
],
|
||||
"stale_hosts": [],
|
||||
"stale_hosts": [
|
||||
"110",
|
||||
"120",
|
||||
"121",
|
||||
"188"
|
||||
],
|
||||
"unknown_uptime_hosts": [],
|
||||
"unreachable_hosts": []
|
||||
},
|
||||
"post_reboot_readiness": {
|
||||
"backup_core_green": true,
|
||||
"backup_core_green": false,
|
||||
"host_188_service_green": true,
|
||||
"next_required_gates": "credential_escrow_evidence",
|
||||
"overall_declaration": "FULL_STACK_GREEN_DR_ESCROW_BLOCKED",
|
||||
"post_start_blocked": 0,
|
||||
"post_start_result": "FULL_STACK_GREEN_DR_ESCROW_BLOCKED",
|
||||
"next_required_gates": "none",
|
||||
"overall_declaration": "SERVICE_BLOCKED",
|
||||
"post_start_blocked": 6,
|
||||
"post_start_result": "BLOCKED",
|
||||
"product_data_green": true,
|
||||
"service_green": true,
|
||||
"service_green": false,
|
||||
"summary_present": true,
|
||||
"wazuh_dashboard_degraded": true
|
||||
"wazuh_dashboard_degraded": false
|
||||
},
|
||||
"safe_next_step": "deploy_boot_triggered_slo_timer_and_collect_all_host_boot_probe_then_rerun_scorecard_until_status_slo_ready",
|
||||
"schema_version": "awoooi_reboot_auto_recovery_slo_scorecard_v1",
|
||||
@@ -52,6 +103,7 @@
|
||||
"host_boot_probe_source_present": true,
|
||||
"post_reboot_summary_source_present": true,
|
||||
"slo_exporter_source_present": true,
|
||||
"slo_installer_source_present": true,
|
||||
"slo_systemd_service_source_present": true,
|
||||
"slo_systemd_timer_source_present": true
|
||||
},
|
||||
|
||||
@@ -79,7 +79,7 @@ spec:
|
||||
- name: AWOOOI_BUILD_COMMIT_SHA
|
||||
# 2026-06-29 Codex: CD rewrites this to the deployed image tag so
|
||||
# production deploy readback does not rely on a stale static snapshot.
|
||||
value: "57c1df19fca580dafa5980d82d164819de0dbcd5"
|
||||
value: "7ff959b6a8bbdf152da1969687f188ceda4b0561"
|
||||
- name: USE_AI_ROUTER
|
||||
value: "true"
|
||||
- name: ENABLE_NEMOTRON_COLLABORATION
|
||||
|
||||
@@ -41,7 +41,7 @@ resources:
|
||||
images:
|
||||
- name: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
|
||||
newName: 192.168.0.110:5000/awoooi/api
|
||||
newTag: 57c1df19fca580dafa5980d82d164819de0dbcd5
|
||||
newTag: 7ff959b6a8bbdf152da1969687f188ceda4b0561
|
||||
- name: 192.168.0.110:5000/library/web:IMAGE_TAG_PLACEHOLDER
|
||||
newName: 192.168.0.110:5000/awoooi/web
|
||||
newTag: 57c1df19fca580dafa5980d82d164819de0dbcd5
|
||||
newTag: 7ff959b6a8bbdf152da1969687f188ceda4b0561
|
||||
|
||||
@@ -5,7 +5,14 @@ Wants=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
Environment=ROOT_DIR=/opt/awoooi
|
||||
User=wooo
|
||||
Group=wooo
|
||||
WorkingDirectory=/home/wooo
|
||||
Environment=HOME=/home/wooo
|
||||
Environment=PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
|
||||
Environment=ROOT_DIR=/home/wooo/awoooi-reboot-recovery-slo
|
||||
Environment=TEXTFILE_DIR=/home/wooo/node_exporter_textfiles
|
||||
Environment=LOG_DIR=/home/wooo/reboot-recovery
|
||||
Environment=TARGET_MINUTES=10
|
||||
ExecStart=/usr/local/bin/awoooi-reboot-auto-recovery-slo.sh
|
||||
TimeoutStartSec=600
|
||||
|
||||
252
scripts/reboot-recovery/install-reboot-auto-recovery-slo-110.sh
Normal file
252
scripts/reboot-recovery/install-reboot-auto-recovery-slo-110.sh
Normal file
@@ -0,0 +1,252 @@
|
||||
#!/usr/bin/env bash
|
||||
# Install the read-only AWOOOI reboot auto-recovery SLO verifier on host 110.
|
||||
#
|
||||
# This installer only stages source scripts, enables a systemd timer, starts the
|
||||
# verifier service once, and reads back metrics/status. It does not reboot hosts
|
||||
# or restart Docker, Nginx, K3s, PostgreSQL, Redis, firewall, or product units.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
HOST="wooo@192.168.0.110"
|
||||
REMOTE_ROOT="/home/wooo/awoooi-reboot-recovery-slo"
|
||||
TEXTFILE_DIR="/home/wooo/node_exporter_textfiles"
|
||||
LOG_DIR="/home/wooo/reboot-recovery"
|
||||
MODE="install"
|
||||
RUN_ONCE=1
|
||||
SSH_CONNECT_TIMEOUT_SECONDS="${SSH_CONNECT_TIMEOUT_SECONDS:-8}"
|
||||
|
||||
usage() {
|
||||
cat <<'USAGE'
|
||||
Usage: install-reboot-auto-recovery-slo-110.sh [options]
|
||||
|
||||
Installs the AWOOOI reboot auto-recovery 10-minute SLO verifier on host 110.
|
||||
|
||||
Options:
|
||||
--dry-run Print the controlled apply plan without writing host state.
|
||||
--verify-only Read back timer/service/metric state without writing.
|
||||
--rollback Disable/remove the verifier timer/service/wrapper.
|
||||
--no-run-once Install and enable the timer without starting the verifier once.
|
||||
--host HOST SSH target. Default: wooo@192.168.0.110
|
||||
--remote-root DIR Remote source root. Default: /home/wooo/awoooi-reboot-recovery-slo
|
||||
--textfile-dir DIR Node exporter textfile dir. Default: /home/wooo/node_exporter_textfiles
|
||||
--log-dir DIR Evidence log dir. Default: /home/wooo/reboot-recovery
|
||||
-h, --help Show this help.
|
||||
|
||||
Controlled apply boundaries:
|
||||
- allowed: stage repo scripts, install verifier unit/timer, daemon-reload,
|
||||
enable/start the verifier timer, start only awoooi-reboot-auto-recovery-slo.service
|
||||
- forbidden here: reboot, node drain, firewall changes, destructive DB work,
|
||||
service restarts for Docker/Nginx/K3s/PostgreSQL/Redis/product workloads,
|
||||
secret/session/auth/.env reads, GitHub usage, force push
|
||||
USAGE
|
||||
}
|
||||
|
||||
# Command-line parsing. Flags only flip MODE/RUN_ONCE; options that take a
# value read it from "$2" and consume it with an extra shift inside the arm,
# while the shift at the bottom of the loop consumes the option itself.
while (( $# > 0 )); do
  case "$1" in
    --dry-run)      MODE="dry-run" ;;
    --verify-only)  MODE="verify" ;;
    --rollback)     MODE="rollback" ;;
    --no-run-once)  RUN_ONCE=0 ;;
    --host)         HOST="${2:?--host requires a value}"; shift ;;
    --remote-root)  REMOTE_ROOT="${2:?--remote-root requires a value}"; shift ;;
    --textfile-dir) TEXTFILE_DIR="${2:?--textfile-dir requires a value}"; shift ;;
    --log-dir)      LOG_DIR="${2:?--log-dir requires a value}"; shift ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      # Anything else is a usage error: report it on stderr and exit 2.
      printf 'Unknown argument: %s\n' "$1" >&2
      usage >&2
      exit 2
      ;;
  esac
  shift
done
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
ROOT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
|
||||
SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout="$SSH_CONNECT_TIMEOUT_SECONDS")
|
||||
UNIT_NAME="awoooi-reboot-auto-recovery-slo"
|
||||
REMOTE_TARBALL="/tmp/${UNIT_NAME}-$(date '+%Y%m%d%H%M%S').tar.gz"
|
||||
|
||||
# Print the single argument shell-escaped (printf %q) so it can be embedded in
# a command line that another shell will parse again.
quote() {
  local raw_value="$1"
  printf '%q' "$raw_value"
}
|
||||
|
||||
print_plan() {
|
||||
cat <<PLAN
|
||||
AWOOOI_REBOOT_AUTO_RECOVERY_SLO_INSTALL_PLAN=1
|
||||
mode=$MODE
|
||||
host=$HOST
|
||||
remote_root=$REMOTE_ROOT
|
||||
textfile_dir=$TEXTFILE_DIR
|
||||
log_dir=$LOG_DIR
|
||||
run_once=$RUN_ONCE
|
||||
source_of_truth=$ROOT_DIR
|
||||
target_selector=host_110_systemd_timer:${UNIT_NAME}.timer
|
||||
dry_run_command=bash scripts/reboot-recovery/install-reboot-auto-recovery-slo-110.sh --dry-run
|
||||
rollback_command=bash scripts/reboot-recovery/install-reboot-auto-recovery-slo-110.sh --rollback
|
||||
verify_command=bash scripts/reboot-recovery/install-reboot-auto-recovery-slo-110.sh --verify-only
|
||||
post_apply_verifier=systemctl_is_enabled_active_timer_and_reboot_auto_recovery_slo_metric_present
|
||||
PLAN
|
||||
}
|
||||
|
||||
remote_verify_command() {
|
||||
cat <<'REMOTE'
|
||||
set -uo pipefail
|
||||
unit="awoooi-reboot-auto-recovery-slo"
|
||||
metric="${TEXTFILE_DIR}/reboot_auto_recovery_slo.prom"
|
||||
latest_dir="$(find "${LOG_DIR}" -maxdepth 1 -type d -name 'reboot-auto-recovery-slo-*' 2>/dev/null | sort | tail -n 1)"
|
||||
echo "SLO_TIMER_ENABLED=$(systemctl is-enabled "${unit}.timer" 2>/dev/null || true)"
|
||||
echo "SLO_TIMER_ACTIVE=$(systemctl is-active "${unit}.timer" 2>/dev/null || true)"
|
||||
echo "SLO_SERVICE_LOAD=$(systemctl show "${unit}.service" -p LoadState --value 2>/dev/null || true)"
|
||||
echo "SLO_SERVICE_RESULT=$(systemctl show "${unit}.service" -p Result --value 2>/dev/null || true)"
|
||||
echo "SLO_SERVICE_LAST_STATUS=$(systemctl show "${unit}.service" -p ExecMainStatus --value 2>/dev/null || true)"
|
||||
echo "SLO_METRIC_PRESENT=$([ -s "$metric" ] && echo 1 || echo 0)"
|
||||
echo "SLO_METRIC_PATH=$metric"
|
||||
if [ -s "$metric" ]; then
|
||||
awk 'NF && $1 !~ /^#/ {print "SLO_METRIC_LINE " $0}' "$metric" | tail -20
|
||||
fi
|
||||
echo "SLO_LATEST_ARTIFACT_DIR=${latest_dir:-none}"
|
||||
if [ -n "${latest_dir:-}" ] && [ -s "$latest_dir/scorecard.json" ]; then
|
||||
python3 - "$latest_dir/scorecard.json" <<'PY'
|
||||
import json, sys
|
||||
payload = json.load(open(sys.argv[1], encoding="utf-8"))
|
||||
print("SLO_SCORECARD_STATUS=" + str(payload.get("status", "unknown")))
|
||||
print(
|
||||
"SLO_CAN_CLAIM_ALL_SERVICES_RECOVERED_WITHIN_TARGET="
|
||||
+ ("1" if payload.get("can_claim_all_services_recovered_within_target") else "0")
|
||||
)
|
||||
print("SLO_ACTIVE_BLOCKERS=" + ",".join(payload.get("active_blockers") or []))
|
||||
print(
|
||||
"SLO_OBSERVED_HOSTS="
|
||||
+ ",".join(payload.get("host_boot_detection", {}).get("observed_hosts") or [])
|
||||
)
|
||||
PY
|
||||
fi
|
||||
REMOTE
|
||||
}
|
||||
|
||||
# Run `bash -s` on $HOST over SSH, so the caller's stdin becomes the remote
# script. TEXTFILE_DIR and LOG_DIR are exported to that remote shell as
# shell-escaped environment assignments.
ssh_remote() {
  local textfile_quoted log_quoted remote_command
  textfile_quoted="$(quote "$TEXTFILE_DIR")"
  log_quoted="$(quote "$LOG_DIR")"
  remote_command="TEXTFILE_DIR=${textfile_quoted} LOG_DIR=${log_quoted} bash -s"
  ssh "${SSH_OPTS[@]}" "$HOST" "$remote_command"
}
|
||||
|
||||
# Pipe the script text printed by remote_verify_command into ssh_remote, which
# executes it on the target host; the remote script's output is passed through.
verify_remote() {
  remote_verify_command | ssh_remote
}
|
||||
|
||||
# Undo the install on the target host: disable and stop the verifier
# timer/service, delete both unit files and the /usr/local/bin wrapper, then
# reload systemd. The disable/stop steps are best-effort (|| true).
# The heredoc is quoted, so nothing in it is expanded locally.
rollback_remote() {
  ssh_remote <<'REMOTE'
set -euo pipefail
unit="awoooi-reboot-auto-recovery-slo"
sudo systemctl disable --now "${unit}.timer" >/dev/null 2>&1 || true
sudo systemctl stop "${unit}.service" >/dev/null 2>&1 || true
sudo rm -f "/etc/systemd/system/${unit}.timer" "/etc/systemd/system/${unit}.service"
sudo rm -f "/usr/local/bin/${unit}.sh"
sudo systemctl daemon-reload
echo "SLO_ROLLBACK_DONE=1"
REMOTE
}
|
||||
|
||||
# Run the install steps on the target host: unpack the uploaded tarball into
# the remote source root, install the exporter wrapper and the unit/timer
# files, rewrite the unit's ROOT_DIR/TEXTFILE_DIR/LOG_DIR environment lines to
# the configured paths, reload systemd, enable the timer and, when RUN_ONCE is
# 1, start the verifier service once.
#
# Reads: RUN_ONCE, UNIT_NAME, REMOTE_ROOT, REMOTE_TARBALL, TEXTFILE_DIR, LOG_DIR.
#
# The heredoc is intentionally unquoted: $(quote ...) and $run_once_command are
# expanded locally, while backslash-escaped references are left for the remote
# shell to expand.
install_remote() {
  local run_once_command=":"
  if [[ "$RUN_ONCE" -eq 1 ]]; then
    run_once_command="sudo systemctl start ${UNIT_NAME}.service"
  fi

  cat <<REMOTE | ssh_remote
set -euo pipefail
remote_root=$(quote "$REMOTE_ROOT")
tarball=$(quote "$REMOTE_TARBALL")
textfile_dir=$(quote "$TEXTFILE_DIR")
log_dir=$(quote "$LOG_DIR")
unit=$(quote "$UNIT_NAME")
# Remove the uploaded tarball on every exit path, so a step that fails under
# 'set -e' does not leave the payload behind in /tmp.
trap 'rm -f "\$tarball"' EXIT
mkdir -p "\$remote_root" "\$textfile_dir" "\$log_dir"
tar -xzf "\$tarball" -C "\$remote_root"
find "\$remote_root/scripts/reboot-recovery" "\$remote_root/scripts/security" -type f \( -name '*.sh' -o -name '*.py' -o -name '*.awk' \) -exec chmod 0755 {} \;
sudo install -m 0755 "\$remote_root/scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh" "/usr/local/bin/\${unit}.sh"
sudo install -m 0644 "\$remote_root/scripts/reboot-recovery/\${unit}.service" "/etc/systemd/system/\${unit}.service"
sudo install -m 0644 "\$remote_root/scripts/reboot-recovery/\${unit}.timer" "/etc/systemd/system/\${unit}.timer"
sudo sed -i "s|^Environment=ROOT_DIR=.*|Environment=ROOT_DIR=\$remote_root|" "/etc/systemd/system/\${unit}.service"
sudo sed -i "s|^Environment=TEXTFILE_DIR=.*|Environment=TEXTFILE_DIR=\$textfile_dir|" "/etc/systemd/system/\${unit}.service"
sudo sed -i "s|^Environment=LOG_DIR=.*|Environment=LOG_DIR=\$log_dir|" "/etc/systemd/system/\${unit}.service"
sudo systemctl daemon-reload
sudo systemctl enable --now "\${unit}.timer"
$run_once_command
echo "SLO_INSTALL_DONE=1"
REMOTE
}
|
||||
|
||||
# Build the gzip tarball at "$1" from scripts/reboot-recovery and
# scripts/security, with paths relative to $ROOT_DIR. Python caches are
# excluded, extended attributes are not stored, and COPYFILE_DISABLE=1 is
# exported before tar runs.
create_payload() {
  local tarball="$1"
  local -a tar_args=(
    --no-xattrs
    -C "$ROOT_DIR"
    --exclude='*/__pycache__'
    --exclude='*.pyc'
    --exclude='.pytest_cache'
    -czf "$tarball"
    scripts/reboot-recovery
    scripts/security
  )
  export COPYFILE_DISABLE=1
  tar "${tar_args[@]}"
}
|
||||
|
||||
print_plan
|
||||
|
||||
# Mode dispatch. dry-run, verify and rollback each finish with `exit 0`; only
# "install" falls through to the staging steps below. Any other mode value is
# rejected with exit status 2.
if [[ "$MODE" == "dry-run" ]]; then
  cat <<DRYRUN
DRY_RUN=1
would_create_tarball_from=$ROOT_DIR/scripts/reboot-recovery,$ROOT_DIR/scripts/security
would_copy_tarball_to=$HOST:$REMOTE_TARBALL
would_enable_timer=${UNIT_NAME}.timer
would_start_service_once=$RUN_ONCE
would_write_metric=$TEXTFILE_DIR/reboot_auto_recovery_slo.prom
DRYRUN
  exit 0
elif [[ "$MODE" == "verify" ]]; then
  verify_remote
  exit 0
elif [[ "$MODE" == "rollback" ]]; then
  rollback_remote
  verify_remote
  exit 0
elif [[ "$MODE" != "install" ]]; then
  printf 'Unsupported mode: %s\n' "$MODE" >&2
  exit 2
fi

# Install path: build the payload in a private temp dir (removed on exit by
# the trap), copy it to the host, run the remote install, then read back state.
payload_dir="$(mktemp -d "${TMPDIR:-/tmp}/${UNIT_NAME}.XXXXXX")"
payload="${payload_dir}/payload.tar.gz"
trap 'rm -rf "$payload_dir"' EXIT

create_payload "$payload"
scp "${SSH_OPTS[@]}" "$payload" "$HOST:$REMOTE_TARBALL"
install_remote
verify_remote
|
||||
@@ -8,6 +8,8 @@
|
||||
set -uo pipefail
|
||||
|
||||
SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout="${SSH_CONNECT_TIMEOUT_SECONDS:-6}")
|
||||
NODE_EXPORTER_PORT="${NODE_EXPORTER_PORT:-9100}"
|
||||
NODE_EXPORTER_TIMEOUT_SECONDS="${NODE_EXPORTER_TIMEOUT_SECONDS:-4}"
|
||||
HOST_SPECS=(
|
||||
"110=wooo@192.168.0.110:awoooi-startup-110.service"
|
||||
"120=wooo@192.168.0.120:k3s.service"
|
||||
@@ -19,11 +21,86 @@ escape_value() {
|
||||
printf '%s' "$1" | tr ' \t\n' '___'
|
||||
}
|
||||
|
||||
# Print candidate local IPv4 addresses, one per line, gathered from
# `hostname -I`, `ip -o -4 addr show` and `ifconfig`. Each source is
# best-effort: its stderr is discarded and a failure is ignored. Blank lines
# are dropped; duplicates are not removed.
local_ip_list() {
  (
    hostname -I 2>/dev/null | tr ' ' '\n' || true
    ip -o -4 addr show 2>/dev/null | awk '{split($4,a,"/"); print a[1]}' || true
    ifconfig 2>/dev/null | awk '$1 == "inet" {print $2}' || true
  ) | awk 'NF'
}
|
||||
|
||||
# Return 0 when "$1" (a host name or IPv4 address) refers to this machine:
# either a loopback name, or an exact whole-line match against local_ip_list.
# Returns non-zero otherwise.
is_local_target() {
  local target_host="$1"
  local known_ips

  if [[ "$target_host" == "127.0.0.1" || "$target_host" == "localhost" ]]; then
    return 0
  fi
  # An empty target can never be local; without this guard it would match the
  # empty line a here-string produces when no addresses are found.
  [[ -n "$target_host" ]] || return 1

  # Capture the list first instead of piping into `grep -q`: grep exits on the
  # first match, and under `set -o pipefail` a producer killed by SIGPIPE
  # would turn a successful match into a failing pipeline.
  known_ips="$(local_ip_list)"
  # `--` keeps a target that starts with '-' from being read as a grep option.
  grep -Fxq -- "$target_host" <<<"$known_ips"
}
|
||||
|
||||
# Print one HOST_BOOT key=value row.
#   $1 alias, $2 target, $3 startup unit, $4 reachable flag are printed as-is.
#   $5 boot_id, $6 uptime_seconds, $7 systemd_state, $8 startup_enabled,
#   $9 startup_active are passed through escape_value, with an empty value
#   replaced by "unknown".
emit_boot_row() {
  local alias="$1" target="$2" unit="$3" reachable="$4"
  local -a observed=("$5" "$6" "$7" "$8" "$9")
  local -a escaped=()
  local field

  for field in "${observed[@]}"; do
    escaped+=("$(escape_value "${field:-unknown}")")
  done

  printf 'HOST_BOOT alias=%s target=%s startup_unit=%s reachable=%s boot_id=%s uptime_seconds=%s systemd_state=%s startup_enabled=%s startup_active=%s\n' \
    "$alias" "$target" "$unit" "$reachable" "${escaped[@]}"
}
|
||||
|
||||
# Probe the machine this script runs on, without SSH, and emit its HOST_BOOT
# row with reachable=1.
#   $1 alias, $2 target label, $3 startup unit to inspect.
# boot_id and uptime come from /proc; the systemd values come from systemctl.
probe_local_host() {
  local alias="$1"
  local target="$2"
  local unit="$3"
  local boot_id uptime_seconds systemd_state enabled active

  boot_id="$(cat /proc/sys/kernel/random/boot_id 2>/dev/null || echo unknown)"
  uptime_seconds="$(awk '{print int($1)}' /proc/uptime 2>/dev/null || echo unknown)"
  systemd_state="$(systemctl is-system-running 2>/dev/null || true)"
  # is-enabled / is-active print the state (e.g. "disabled", "inactive") AND
  # exit non-zero for it. Chaining `|| echo unknown` appended a second line,
  # which escape_value then joined into values like "inactive_unknown". Keep
  # only systemctl's own output; fall back to "unknown" when it printed nothing.
  enabled="$(systemctl is-enabled "$unit" 2>/dev/null || true)"
  active="$(systemctl is-active "$unit" 2>/dev/null || true)"

  emit_boot_row "$alias" "$target" "$unit" 1 "$boot_id" "$uptime_seconds" \
    "$systemd_state" "${enabled:-unknown}" "${active:-unknown}"
}
|
||||
|
||||
# Derive uptime for a host from its node_exporter /metrics endpoint.
#   $1 alias, $2 target (user@host), $3 startup unit label.
# Fetches http://<host>:$NODE_EXPORTER_PORT/metrics, bounded by
# $NODE_EXPORTER_TIMEOUT_SECONDS (via `timeout` when available, otherwise
# curl --max-time), and reads the first node_boot_time_seconds sample.
# Returns 1 without output when no sample is found; otherwise emits a HOST_BOOT
# row with reachable=1, systemd_state "node_exporter" and unknown unit state.
probe_node_exporter() {
  local alias="$1" target="$2" unit="$3"
  local target_host="${target##*@}"
  local metrics boot_time now uptime_seconds
  local metrics_url="http://${target_host}:${NODE_EXPORTER_PORT}/metrics"
  local -a fetch_cmd

  if command -v timeout >/dev/null 2>&1; then
    fetch_cmd=(timeout "$NODE_EXPORTER_TIMEOUT_SECONDS" curl -fsS "$metrics_url")
  else
    fetch_cmd=(curl --max-time "$NODE_EXPORTER_TIMEOUT_SECONDS" -fsS "$metrics_url")
  fi
  metrics="$("${fetch_cmd[@]}" 2>/dev/null || true)"

  boot_time="$(awk '$1 == "node_boot_time_seconds" {printf "%d", $2; found=1; exit} END {if (!found) print ""}' <<<"$metrics")"
  [[ -n "$boot_time" ]] || return 1

  now="$(date +%s)"
  uptime_seconds=$((now - boot_time))
  emit_boot_row "$alias" "$target" "$unit" 1 "node_exporter_${boot_time}" "$uptime_seconds" "node_exporter" "unknown" "unknown"
}
|
||||
|
||||
probe_host() {
|
||||
local alias="$1"
|
||||
local target="$2"
|
||||
local unit="$3"
|
||||
local output boot_id uptime_seconds systemd_state enabled active
|
||||
local target_host="${target##*@}"
|
||||
|
||||
if is_local_target "$target_host"; then
|
||||
probe_local_host "$alias" "$target" "$unit"
|
||||
return 0
|
||||
fi
|
||||
|
||||
output="$(ssh "${SSH_OPTS[@]}" "$target" "unit='$unit'; \
|
||||
boot_id=\$(cat /proc/sys/kernel/random/boot_id 2>/dev/null || echo unknown); \
|
||||
@@ -34,8 +111,10 @@ probe_host() {
|
||||
printf 'boot_id=%s uptime_seconds=%s systemd_state=%s startup_enabled=%s startup_active=%s\n' \"\$boot_id\" \"\$uptime_seconds\" \"\$systemd_state\" \"\$enabled\" \"\$active\" \
|
||||
" 2>/dev/null)"
|
||||
if [[ $? -ne 0 || -z "$output" ]]; then
|
||||
printf 'HOST_BOOT alias=%s target=%s startup_unit=%s reachable=0 boot_id=unknown uptime_seconds=unknown systemd_state=unknown startup_enabled=unknown startup_active=unknown\n' \
|
||||
"$alias" "$target" "$unit"
|
||||
if probe_node_exporter "$alias" "$target" "$unit"; then
|
||||
return 0
|
||||
fi
|
||||
emit_boot_row "$alias" "$target" "$unit" 0 "unknown" "unknown" "unknown" "unknown" "unknown"
|
||||
return 0
|
||||
fi
|
||||
|
||||
@@ -45,13 +124,7 @@ probe_host() {
|
||||
enabled="$(sed -n 's/.*startup_enabled=\([^ ]*\).*/\1/p' <<<"$output")"
|
||||
active="$(sed -n 's/.*startup_active=\([^ ]*\).*/\1/p' <<<"$output")"
|
||||
|
||||
printf 'HOST_BOOT alias=%s target=%s startup_unit=%s reachable=1 boot_id=%s uptime_seconds=%s systemd_state=%s startup_enabled=%s startup_active=%s\n' \
|
||||
"$alias" "$target" "$unit" \
|
||||
"$(escape_value "${boot_id:-unknown}")" \
|
||||
"$(escape_value "${uptime_seconds:-unknown}")" \
|
||||
"$(escape_value "${systemd_state:-unknown}")" \
|
||||
"$(escape_value "${enabled:-unknown}")" \
|
||||
"$(escape_value "${active:-unknown}")"
|
||||
emit_boot_row "$alias" "$target" "$unit" 1 "$boot_id" "$uptime_seconds" "$systemd_state" "$enabled" "$active"
|
||||
}
|
||||
|
||||
echo "AWOOOI_REBOOT_AUTO_RECOVERY_HOST_PROBE=1"
|
||||
|
||||
@@ -12,14 +12,15 @@ LOG_DIR="${LOG_DIR:-/home/wooo/reboot-recovery}"
|
||||
OUTPUT_NAME="${OUTPUT_NAME:-reboot_auto_recovery_slo.prom}"
|
||||
TARGET_MINUTES="${TARGET_MINUTES:-10}"
|
||||
MIN_FREE_GIB="${MIN_FREE_GIB:-2}"
|
||||
LOCK_FILE="${LOCK_FILE:-/tmp/awoooi-reboot-auto-recovery-slo.lock}"
|
||||
LOCK_FILE="${LOCK_FILE:-${LOG_DIR}/reboot_auto_recovery_slo.lock}"
|
||||
|
||||
mkdir -p "$TEXTFILE_DIR" "$LOG_DIR"
|
||||
|
||||
if command -v flock >/dev/null 2>&1; then
|
||||
exec 9>"$LOCK_FILE"
|
||||
flock -n 9 || exit 0
|
||||
fi
|
||||
|
||||
mkdir -p "$TEXTFILE_DIR" "$LOG_DIR"
|
||||
run_id="$(date '+%Y%m%d-%H%M%S')"
|
||||
artifact_dir="$LOG_DIR/reboot-auto-recovery-slo-$run_id"
|
||||
mkdir -p "$artifact_dir"
|
||||
|
||||
@@ -119,6 +119,13 @@ def source_controls() -> dict[str, bool]:
|
||||
"OnBootSec=",
|
||||
"OnUnitActiveSec=",
|
||||
),
|
||||
"slo_installer_source_present": file_contains(
|
||||
source_file("scripts/reboot-recovery/install-reboot-auto-recovery-slo-110.sh"),
|
||||
"--dry-run",
|
||||
"--verify-only",
|
||||
"--rollback",
|
||||
"systemctl enable --now",
|
||||
),
|
||||
"post_reboot_summary_source_present": source_file(
|
||||
"scripts/reboot-recovery/post-reboot-readiness-summary.sh"
|
||||
).exists(),
|
||||
|
||||
@@ -0,0 +1,79 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[3]
|
||||
INSTALLER = ROOT / "scripts" / "reboot-recovery" / "install-reboot-auto-recovery-slo-110.sh"
|
||||
SERVICE = ROOT / "scripts" / "reboot-recovery" / "awoooi-reboot-auto-recovery-slo.service"
|
||||
EXPORTER = ROOT / "scripts" / "reboot-recovery" / "reboot-auto-recovery-slo-exporter.sh"
|
||||
|
||||
|
||||
def test_installer_dry_run_exposes_apply_rollback_and_verify_contract() -> None:
    """Run the installer with --dry-run and check the plan lines it prints.

    ``check=True`` makes a non-zero exit from the installer fail the test
    before any of the output assertions run.
    """
    completed = subprocess.run(
        ["bash", str(INSTALLER), "--dry-run"],
        text=True,
        capture_output=True,
        check=True,
    )

    expected_lines = (
        "DRY_RUN=1",
        "target_selector=host_110_systemd_timer:awoooi-reboot-auto-recovery-slo.timer",
        "rollback_command=bash scripts/reboot-recovery/install-reboot-auto-recovery-slo-110.sh --rollback",
        "verify_command=bash scripts/reboot-recovery/install-reboot-auto-recovery-slo-110.sh --verify-only",
        "would_enable_timer=awoooi-reboot-auto-recovery-slo.timer",
    )
    for expected in expected_lines:
        assert expected in completed.stdout
|
||||
|
||||
|
||||
def test_installer_is_limited_to_verifier_timer_not_product_restarts() -> None:
    """Static text check of the installer source.

    Lines whose first non-blank character is ``#`` are dropped before looking
    for forbidden fragments; the required fragments are looked up in the full
    text, comments included.
    """
    text = INSTALLER.read_text(encoding="utf-8")
    code_lines = [
        line for line in text.splitlines() if not line.lstrip().startswith("#")
    ]
    executable_text = "\n".join(code_lines)

    forbidden_fragments = (
        "shutdown",
        "systemctl restart docker",
        "systemctl restart nginx",
        "systemctl restart k3s",
        "systemctl restart postgresql",
        "systemctl restart redis",
        "docker restart",
        "kubectl drain",
        "iptables ",
        "ufw ",
        "DROP ",
        "TRUNCATE ",
        "gh ",
        "github.com",
    )
    for fragment in forbidden_fragments:
        assert fragment not in executable_text

    # A line that starts with `reboot` (optionally preceded by `sudo`) is
    # rejected by pattern, not by plain substring search.
    reboot_command = re.search(r"(?m)^\s*(sudo\s+)?reboot(\s|$)", executable_text)
    assert reboot_command is None

    required_fragments = (
        "systemctl enable --now",
        "systemctl start ${UNIT_NAME}.service",
        'payload_dir="$(mktemp -d "${TMPDIR:-/tmp}/${UNIT_NAME}.XXXXXX")"',
        'payload="${payload_dir}/payload.tar.gz"',
    )
    for fragment in required_fragments:
        assert fragment in text
|
||||
|
||||
|
||||
def test_service_uses_deployed_source_root_and_bounded_oneshot() -> None:
    """Check that the unit file contains each expected directive as literal text."""
    unit_text = SERVICE.read_text(encoding="utf-8")

    required_directives = (
        "Type=oneshot",
        "User=wooo",
        "Group=wooo",
        "WorkingDirectory=/home/wooo",
        "Environment=HOME=/home/wooo",
        "Environment=ROOT_DIR=/home/wooo/awoooi-reboot-recovery-slo",
        "Environment=TEXTFILE_DIR=/home/wooo/node_exporter_textfiles",
        "Environment=LOG_DIR=/home/wooo/reboot-recovery",
        "ExecStart=/usr/local/bin/awoooi-reboot-auto-recovery-slo.sh",
        "TimeoutStartSec=600",
    )
    for directive in required_directives:
        assert directive in unit_text
|
||||
|
||||
|
||||
def test_exporter_uses_user_writable_lock_after_creating_log_dir() -> None:
    """Check the exporter's lock-file default and statement order.

    The lock path must default to a file under LOG_DIR, and the first
    ``mkdir -p`` of the textfile/log directories must appear earlier in the
    script than the first ``exec 9>`` that opens the lock file.
    """
    script = EXPORTER.read_text(encoding="utf-8")

    lock_default = 'LOCK_FILE="${LOCK_FILE:-${LOG_DIR}/reboot_auto_recovery_slo.lock}"'
    assert lock_default in script

    # str.index raises ValueError if either statement is missing entirely.
    mkdir_offset = script.index('mkdir -p "$TEXTFILE_DIR" "$LOG_DIR"')
    lock_open_offset = script.index('exec 9>"$LOCK_FILE"')
    assert mkdir_offset < lock_open_offset
|
||||
Reference in New Issue
Block a user