diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 38fd1c30..5967228b 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,21 @@ +## 2026-06-30 — 18:42 P0-006 reboot auto-detection / VMware / maintenance / backup alert hardening + +**照使用者 8 點問題收斂的 source 修正**: +- 全主機重啟偵測從舊的 `110/120/121/188` 擴成 `99/110/111/112/120/121/188`;新增 `reboot-event-detector.py`,用 boot_id 差異與 fresh uptime 產生 stateful reboot event、10 分鐘剩餘秒數與 Prometheus metrics。 +- `reboot-auto-recovery-slo-exporter.sh` 現在每輪都產生 `reboot-event.json` / `.prom`,並把 `awoooi_reboot_auto_recovery_slo_*` scope 改成 `99_110_111_112_120_121_188`。 +- `reboot-auto-recovery-slo-scorecard.py` 新增 `reboot_event_detection`、`sla_recovery_eta` 與固定 triage order:99 VMware/VM power → host boot → public maintenance fallback → AWOOOI K3s/registry → StockPlatform freshness → backup health/offsite → Telegram delivery。 +- 新增 `windows99-vmware-autostart.ps1`,提供 `Verify` / `DryRun` / `Apply` 三段式入口:建立 `AWOOOI-Start-VMware-VMs` startup task、依序啟動 VM Host `111/188/120/121/112`,並套用禁止 Windows Update 無預警自動重開機的政策。 +- 新增 `ops/maintenance/maintenance.html`、Nginx `502/503/504` intercept snippet 與 `PUBLIC-MAINTENANCE-FALLBACK-RUNBOOK.md`;正式做法分 L0 local Nginx 靜態維護頁與 L1 external cloud/CDN static fallback。 +- `backup-health-textfile-exporter.py` 新增 `host/database/website/service/package/tool/log` coverage domain metrics;`alerts-unified.yml` 新增 reboot event、10 分鐘 SLO missed、public 5xx fallback missing、backup coverage stale alerts,並讓 `HostDown` 帶 TYPE-3 Telegram route metadata。 + +**尚未宣稱完成的 runtime 事實**: +- 99 Windows 主機尚未在本輪由 Codex 實機登入套用;`Apply` 需要 99 console/WinRM 可用與五台 VM 的實際 `.vmx` 路徑。這不是 owner gate,而是 target selector/runtime access evidence 尚未讀回。 +- 10 分鐘 SLO 仍需下一次實際 all-host reboot 或正式 reboot drill 讀回:`awoooi_reboot_event_detected=1`、所有 required host observed、`awoooi_reboot_auto_recovery_slo_ready=1`、public routes 不露出 502、backup coverage green、Telegram alert/recovery delivery 可見。 + +**驗證**: +- 新增 focused tests 鎖住 reboot event 
detector、scorecard all-host scope、99 VMware controlled apply、maintenance fallback、Telegram alert rules 與 backup coverage domain contract。 +- 邊界:未重啟主機,未 node drain,未 firewall cutover,未 DB destructive operation,未讀 secret / token / raw sessions / SQLite / `.env`,未使用 GitHub / `gh` / GitHub API。 + ## 2026-06-30 — 17:57 P0-006 全主機重啟實測與受控恢復 **實測結論**: diff --git a/docs/runbooks/PUBLIC-MAINTENANCE-FALLBACK-RUNBOOK.md b/docs/runbooks/PUBLIC-MAINTENANCE-FALLBACK-RUNBOOK.md new file mode 100644 index 00000000..450e43bf --- /dev/null +++ b/docs/runbooks/PUBLIC-MAINTENANCE-FALLBACK-RUNBOOK.md @@ -0,0 +1,42 @@ +# Public Maintenance Fallback Runbook + +## 目標 + +主機或 VM 重啟時,public 網站不得直接露出空白 502。正式做法分兩層: + +- L0 local edge:110 / 188 / 120 / 121 的 Nginx 仍可回應時,用 `ops/maintenance/maintenance.html` 攔截 `502` / `503` / `504`,顯示靜態維護頁。 +- L1 external cloud/CDN:如果 99 / VMware / 家用路由 / 對外 IP 整段不可達,由外部雲端主機或 CDN health check 切到同一份靜態維護頁。 + +## 必要特性 + +- 維護頁不得讀取 `.env`、cookie、session、DB、K3s、Redis、NFS 或內部 API。 +- 維護頁必須是靜態檔,能被 Nginx、Cloudflare Pages、S3/R2 static hosting、GCS static site 或任一低成本外部 VM 直接服務。 +- L0 與 L1 使用同一份內容,避免使用者在不同故障層看到不同訊息。 +- 502 fallback 只能處理 upstream failure;若整個 public edge unreachable,必須由 L1 接手。 + +## L0 受控套用 + +1. 將 `ops/maintenance/maintenance.html` 部署到 edge host 的 `/var/www/maintenance/maintenance.html`。 +2. 在每個 public `server {}` 內 include `ops/maintenance/nginx-502-maintenance-snippet.conf` 的等價內容。 +3. `nginx -t` 通過後才 reload Nginx。 +4. 用壞 upstream staging vhost 或暫存 upstream 驗證 `X-AWOOOI-Fallback: local-maintenance`,不得用 production DB / app restart 當測試手段。 + +## L1 外部雲端/CDN 建議 + +優先順序: + +1. Cloudflare / CDN health check + fallback origin:最快切換,對使用者體驗最好。 +2. 低成本外部 VM:可控性高,能同時跑 blackbox probe;需要 VM patching 與監控。 +3. 
Object storage static hosting:成本最低,適合維護頁;DNS/CDN health check 仍要另外配置。 + +L1 active 條件: + +- `probe_success` 對主要 public routes 連續 2 分鐘失敗,或 +- `awoooi_reboot_event_detected == 1` 且 `awoooi_reboot_auto_recovery_slo_ready == 0` 超過 10 分鐘,或 +- 99 / 110 / 111 / 112 / 120 / 121 / 188 任一 P0 host down 且 public route 同時 502 / timeout。 + +## 驗證 + +- L0:對測試 vhost 讀回維護頁 HTML,且 header `X-AWOOOI-Fallback=local-maintenance` 存在;snippet 的 `error_page` 未用 `=code` 改寫狀態碼,因此 status 會保留原本的 `502` / `503` / `504`(只有另外設定 `error_page ... =200` 時才會讀回 `200`)。 +- L1:從外部網路讀回靜態維護頁,且不經過 99 / 110 / 111 / 112 / 120 / 121 / 188。 +- Recovery:主要 upstream 連續 2 分鐘健康後切回正式 origin;Telegram 發送 recovery 通知,並在 cold-start scorecard 留下 readback。 diff --git a/ops/maintenance/maintenance.html b/ops/maintenance/maintenance.html new file mode 100644 index 00000000..d7d0ea9a --- /dev/null +++ b/ops/maintenance/maintenance.html @@ -0,0 +1,78 @@ + + + + + + + 系統維護中 + + + +
+
AWOOOI 維護模式
+

系統正在恢復服務

+

我們偵測到主機或上游服務正在重啟,流量暫時切到維護頁。這個頁面不依賴內部資料庫或應用程式,因此在主要服務恢復前仍可正常顯示。

+

系統會持續自動檢查,服務恢復後會切回正式網站。

+
HTTP 502 / 503 / 504 fallback page. No application secrets, cookies, or user data are loaded here.
+
+ + diff --git a/ops/maintenance/nginx-502-maintenance-snippet.conf b/ops/maintenance/nginx-502-maintenance-snippet.conf new file mode 100644 index 00000000..5ad8ff4f --- /dev/null +++ b/ops/maintenance/nginx-502-maintenance-snippet.conf @@ -0,0 +1,16 @@ +# Include this inside each public server {} that proxies to internal AWOOOI, +# StockPlatform, MOMO, Tsenyang, Bitan, Gitea, Harbor, or registry upstreams. +# +# The static file must be deployed outside the application containers, for +# example /var/www/maintenance/maintenance.html on the edge Nginx host. + +proxy_intercept_errors on; +error_page 502 503 504 /__awoooi-maintenance.html; + +location = /__awoooi-maintenance.html { + internal; + root /var/www/maintenance; + try_files /maintenance.html =503; + add_header Cache-Control "no-store, no-cache, must-revalidate, max-age=0" always; + add_header X-AWOOOI-Fallback "local-maintenance" always; +} diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml index 416674d4..42948740 100644 --- a/ops/monitoring/alerts-unified.yml +++ b/ops/monitoring/alerts-unified.yml @@ -60,10 +60,12 @@ groups: severity: critical layer: systemd-188 team: ops + alert_category: reboot_recovery + notification_type: TYPE-3 auto_repair: "false" annotations: summary: "主機 {{ $labels.host }} 不可達" - description: "Node Exporter 無回應超過 1 分鐘" + description: "Node Exporter 無回應超過 1 分鐘;若發生在全主機重啟或 99/VMware restart,必須立刻送 Telegram 並啟動 reboot recovery SLO scorecard。" - alert: HostHighCpuLoad # 2026-05-05 ogt + Codex: keep this as early warning only. 
@@ -133,6 +135,97 @@ groups: description: "磁碟使用率超過 85%" auto_repair_action: "ssh {{ $labels.instance }} 'echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -15; echo \"=== MEMORY ===\"; free -h; echo \"=== DISK ===\"; df -h; echo \"=== LOAD ===\"; uptime'" + # ========================================================================= + # Reboot event / 10-minute recovery SLO alerts + # ========================================================================= + - name: reboot_recovery_slo_alerts + interval: 1m + rules: + - alert: RebootEventDetectorMissing + expr: absent(awoooi_reboot_event_detected{scope="99_110_111_112_120_121_188"}) + for: 15m + labels: + severity: warning + layer: systemd-110 + component: reboot-event-detector + team: ops + alert_category: reboot_recovery + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "全主機重啟事件偵測器沒有輸出" + description: "reboot-auto-recovery SLO exporter 沒有輸出 awoooi_reboot_event_detected;99/110/111/112/120/121/188 重啟後無法自動判斷事件開始時間。" + runbook: "檢查 110 awoooi-reboot-auto-recovery-slo.timer / service、host-probe、reboot-event-state.json 與 node-exporter textfile collector。" + + - alert: HostRebootEventDetected + expr: awoooi_reboot_event_host_rebooted{host=~"99|110|111|112|120|121|188"} == 1 + for: 1m + labels: + severity: warning + layer: systemd-110 + component: reboot-event-detector + team: ops + alert_category: reboot_recovery + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "偵測到主機 {{ $labels.host }} 已重啟" + description: "boot_id 或 fresh uptime 顯示 {{ $labels.host }} 位於 10 分鐘重啟恢復視窗內;SLO scorecard 會固定追蹤 remaining seconds、public route、backup 與 freshness。" + runbook: "讀取 reboot_auto_recovery_slo.prom 與最新 scorecard.json;不要手動重啟服務,先依 fixed_triage_order 查 99 VM、host boot、public route、K3s/registry、Stock freshness、backup、Telegram delivery。" + + - alert: RebootEventRequiredHostMissing + expr: awoooi_reboot_event_required_host_observed{host=~"99|110|111|112|120|121|188"} == 0 + for: 2m + 
labels: + severity: critical + layer: systemd-110 + component: reboot-event-detector + team: ops + alert_category: reboot_recovery + notification_type: TYPE-3 + auto_repair: "false" + annotations: + summary: "重啟偵測缺少必要主機 {{ $labels.host }}" + description: "{{ $labels.host }} 沒有出現在 host boot probe;10 分鐘自動恢復不能宣稱成立。" + runbook: "先確認 99 VMware / VM power 狀態與 node-exporter/Windows exporter,再重跑 reboot-auto-recovery-host-probe.sh;不得用刪除 required host 的方式消紅。" + + - alert: RebootAutoRecoverySLOMissed + expr: | + (awoooi_reboot_event_detected{scope="99_110_111_112_120_121_188"} == 1) + and on(scope) + (awoooi_reboot_event_target_seconds_remaining{scope="99_110_111_112_120_121_188"} <= 0) + and on(scope) + (awoooi_reboot_auto_recovery_slo_ready{scope="99_110_111_112_120_121_188"} == 0) + for: 1m + labels: + severity: critical + layer: systemd-110 + component: reboot-auto-recovery-slo + team: ops + alert_category: reboot_recovery + notification_type: TYPE-3 + auto_repair: "false" + annotations: + summary: "全主機重啟後 10 分鐘恢復 SLO 未達成" + description: "偵測到重啟事件,但 10 分鐘期限已過且 awoooi_reboot_auto_recovery_slo_ready 仍為 0。" + runbook: "直接讀最新 scorecard.json 的 active_blockers 與 fixed_triage_order;先處理 99/VMware/VM power、public maintenance fallback、registry/K3s、Stock freshness、backup health 與 Telegram delivery。" + + - alert: RebootAutoRecoverySLOExporterStale + expr: time() - awoooi_reboot_auto_recovery_slo_last_run_timestamp{scope="99_110_111_112_120_121_188"} > 600 + for: 10m + labels: + severity: warning + layer: systemd-110 + component: reboot-auto-recovery-slo + team: ops + alert_category: reboot_recovery + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "重啟恢復 SLO exporter 超過 10 分鐘未更新" + description: "reboot auto-recovery SLO textfile stale;即使服務看似正常,也不能證明 10 分鐘內恢復。" + runbook: "檢查 systemd timer、/home/wooo/reboot-recovery artifacts、node-exporter textfile collector。" + # ========================================================================= # Host runaway process 
/ CI load classification # ========================================================================= @@ -996,6 +1089,26 @@ groups: summary: "外部網站 bitan.wooo.work 離線" description: "bitan.wooo.work 探測失敗超過 3 分鐘,容器 bitan-app (188) 可能需要重啟" + - alert: PublicRouteServing5xxWithoutMaintenanceFallback + expr: | + probe_http_status_code{ + job="blackbox-http", + instance=~"https?://(awoooi\\.wooo\\.work|stock\\.wooo\\.work|mo\\.wooo\\.work|bitan\\.wooo\\.work|www\\.tsenyang\\.com).*" + } >= 500 + for: 2m + labels: + severity: critical + layer: external + component: public-maintenance-fallback + team: ops + alert_category: reboot_recovery + notification_type: TYPE-3 + auto_repair: "false" + annotations: + summary: "Public route 正在對使用者回 {{ $value }}:{{ $labels.instance }}" + description: "重啟或 upstream failure 期間 public route 仍露出 5xx;應由 L0 Nginx maintenance fallback 或 L1 external cloud/CDN 維護頁接手。" + runbook: "依 docs/runbooks/PUBLIC-MAINTENANCE-FALLBACK-RUNBOOK.md 部署 ops/maintenance/maintenance.html 與 Nginx 502/503/504 intercept;整段 edge unreachable 時啟用外部 CDN/static fallback。" + - alert: ExternalSiteSSLExpiringSoon expr: probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time() < 14 * 24 * 3600 for: 1h @@ -1357,6 +1470,40 @@ groups: description: "{{ $labels.exported_job }} 最新成功證據超過 {{ $labels.max_age_hours }} 小時或不存在;來源 {{ $labels.source }},目標 {{ $labels.target }}。" runbook: "先檢查備份 log 與磁碟空間,再手動執行對應備份;禁止直接刪除舊備份或 production 資料" + - alert: BackupCoverageDomainMetricMissing + expr: absent(awoooi_backup_coverage_domain_expected_info{host="110"}) + for: 20m + labels: + severity: warning + layer: host-backup + component: backup-coverage + host: "110" + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "110 備份覆蓋 domain 指標缺失" + description: "backup-health exporter 沒有輸出 host/database/website/service/package/tool/log 聚合覆蓋指標,無法快速判斷完整備份是否還在運作。" + runbook: "部署新版 scripts/ops/backup-health-textfile-exporter.py,刷新 
/home/wooo/node_exporter_textfiles/backup_health.prom。" + + - alert: BackupCoverageDomainStale + expr: awoooi_backup_coverage_domain_fresh{host="110"} == 0 + for: 15m + labels: + severity: critical + layer: host-backup + component: backup-coverage + host: "110" + team: ops + alert_category: infrastructure + notification_type: TYPE-3 + auto_repair: "false" + annotations: + summary: "110 備份覆蓋 domain 過舊:{{ $labels.domain }}" + description: "{{ $labels.domain }} 需要的備份證據 {{ $labels.required_jobs }} 未全部新鮮;重啟恢復不能宣稱備份完整。" + runbook: "先跑 `scripts/backup/backup-status.sh --no-notify` 和 backup-health exporter readback,修復對應 required_jobs;禁止刪除、prune、restore 或覆蓋 production。" + - alert: BackupAggregateRunFailed expr: awoooi_backup_last_run_failed_count{host="110",exported_job="backup_all"} > 0 for: 10m diff --git a/scripts/ops/backup-health-textfile-exporter.py b/scripts/ops/backup-health-textfile-exporter.py index cfe672b6..98a0f9eb 100755 --- a/scripts/ops/backup-health-textfile-exporter.py +++ b/scripts/ops/backup-health-textfile-exporter.py @@ -659,6 +659,10 @@ def _base_lines(host: str) -> list[str]: "# TYPE awoooi_backup_health_last_run_timestamp gauge", "# HELP awoooi_backup_expected_job_info Expected backup job inventory.", "# TYPE awoooi_backup_expected_job_info gauge", + "# HELP awoooi_backup_coverage_domain_expected_info Expected backup coverage domains for host, DB, website, service, package, tool, and log recovery.", + "# TYPE awoooi_backup_coverage_domain_expected_info gauge", + "# HELP awoooi_backup_coverage_domain_fresh Whether every required evidence job for a backup coverage domain is fresh.", + "# TYPE awoooi_backup_coverage_domain_fresh gauge", "# HELP awoooi_backup_job_configured Whether the expected backup cron/config is present.", "# TYPE awoooi_backup_job_configured gauge", "# HELP awoooi_backup_script_present Whether the backup script exists on this host.", @@ -808,6 +812,7 @@ def _collect_110(host: str) -> list[str]: labels = 
f'host="{_escape_label(host)}",script="{_escape_label(script)}"' lines.append(f"awoooi_backup_script_present{{{labels}}} {int(Path('/backup/scripts', script).exists())}") + job_freshness: dict[str, int] = {} for job, repo, max_age in [ ("awoooi_db", "/backup/awoooi", 7), ("configs", "/backup/configs", 48), @@ -824,6 +829,8 @@ def _collect_110(host: str) -> list[str]: ("public_routes", "/backup/public-routes", 168), ]: timestamp, count = _latest_restic_snapshot(repo) + age = int(time.time()) - timestamp if timestamp else 0 + job_freshness[job] = 1 if timestamp and age <= max_age * 3600 else 0 lines.extend( _metric_lines_for_job( host=host, @@ -837,6 +844,26 @@ def _collect_110(host: str) -> list[str]: ) ) + coverage_domains = { + "host": ["configs"], + "database": ["awoooi_db"], + "website": ["public_routes", "momo"], + "service": ["gitea", "harbor", "sentry", "monitoring", "signoz"], + "package": ["configs", "ai_artifacts"], + "tool": ["ai_artifacts", "open_webui", "clawbot", "langfuse"], + "log": ["monitoring", "signoz", "sentry"], + } + for domain, jobs in coverage_domains.items(): + required = ",".join(jobs) + fresh = 1 if all(job_freshness.get(job, 0) == 1 for job in jobs) else 0 + labels = ( + f'host="{_escape_label(host)}",' + f'domain="{_escape_label(domain)}",' + f'required_jobs="{_escape_label(required)}"' + ) + lines.append(f"awoooi_backup_coverage_domain_expected_info{{{labels}}} 1") + lines.append(f"awoooi_backup_coverage_domain_fresh{{{labels}}} {fresh}") + backup_all_ts, failed_count = _latest_backup_all_failed_count() labels = f'host="{_escape_label(host)}",job="backup_all"' lines.append(f"awoooi_backup_last_run_failed_count{{{labels}}} {failed_count}") diff --git a/scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh b/scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh index 27ffc94b..0fd1e559 100755 --- a/scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh +++ b/scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh @@ 
-7,11 +7,21 @@ set -uo pipefail -SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout="${SSH_CONNECT_TIMEOUT_SECONDS:-6}") +SSH_OPTS=( + -o BatchMode=yes + -o ConnectTimeout="${SSH_CONNECT_TIMEOUT_SECONDS:-6}" + -o ConnectionAttempts=1 + -o ServerAliveInterval="${SSH_SERVER_ALIVE_INTERVAL_SECONDS:-5}" + -o ServerAliveCountMax="${SSH_SERVER_ALIVE_COUNT_MAX:-1}" +) +SSH_COMMAND_TIMEOUT_SECONDS="${SSH_COMMAND_TIMEOUT_SECONDS:-15}" NODE_EXPORTER_PORT="${NODE_EXPORTER_PORT:-9100}" NODE_EXPORTER_TIMEOUT_SECONDS="${NODE_EXPORTER_TIMEOUT_SECONDS:-4}" HOST_SPECS=( + "99=192.168.0.99:vmware-host-autostart" "110=wooo@192.168.0.110:awoooi-startup-110.service" + "111=192.168.0.111:vm-host-boot" + "112=192.168.0.112:vm-host-boot" "120=wooo@192.168.0.120:k3s.service" "121=wooo@192.168.0.121:k3s.service" "188=ollama@192.168.0.188:awoooi-startup.service" @@ -37,6 +47,42 @@ is_local_target() { grep -Fxq "$target_host" <<<"$ips" } +run_with_timeout() { + local timeout_seconds="$1" + shift + if command -v timeout >/dev/null 2>&1; then + timeout "$timeout_seconds" "$@" + return $? + fi + if command -v python3 >/dev/null 2>&1; then + python3 - "$timeout_seconds" "$@" <<'PY' +import subprocess +import sys + +timeout_seconds = float(sys.argv[1]) +command = sys.argv[2:] +try: + result = subprocess.run( + command, + capture_output=True, + check=False, + text=True, + timeout=timeout_seconds, + ) +except subprocess.TimeoutExpired as exc: + if isinstance(exc.stdout, str): + print(exc.stdout, end="") + sys.exit(124) +print(result.stdout, end="") +if result.stderr: + print(result.stderr, end="", file=sys.stderr) +sys.exit(result.returncode) +PY + return $? 
+ fi + "$@" +} + emit_boot_row() { local alias="$1" local target="$2" @@ -94,19 +140,54 @@ probe_node_exporter() { emit_boot_row "$alias" "$target" "$unit" 1 "node_exporter_${boot_time}" "$uptime_seconds" "node_exporter" "unknown" "unknown" } +probe_reachable_only() { + local alias="$1" + local target="$2" + local unit="$3" + local target_host="${target##*@}" + + if run_with_timeout "${PING_TIMEOUT_SECONDS:-2}" ping -c 1 "$target_host" >/dev/null 2>&1; then + emit_boot_row "$alias" "$target" "$unit" 1 "reachable_unknown_boot" "unknown" "ping_reachable" "unknown" "unknown" + return 0 + fi + + if command -v nc >/dev/null 2>&1; then + for port in ${BOOT_PROBE_TCP_PORTS:-22 80 443 3389 5985 9100}; do + if nc -z -w "${TCP_CONNECT_TIMEOUT_SECONDS:-2}" "$target_host" "$port" >/dev/null 2>&1; then + emit_boot_row "$alias" "$target" "$unit" 1 "reachable_unknown_boot" "unknown" "tcp_${port}_reachable" "unknown" "unknown" + return 0 + fi + done + fi + + return 1 +} + probe_host() { local alias="$1" local target="$2" local unit="$3" local output boot_id uptime_seconds systemd_state enabled active local target_host="${target##*@}" + local remote_script if is_local_target "$target_host"; then probe_local_host "$alias" "$target" "$unit" return 0 fi - output="$(ssh "${SSH_OPTS[@]}" "$target" "unit='$unit'; \ + if [[ "$target" != *@* ]]; then + if probe_node_exporter "$alias" "$target" "$unit"; then + return 0 + fi + if probe_reachable_only "$alias" "$target" "$unit"; then + return 0 + fi + emit_boot_row "$alias" "$target" "$unit" 0 "unknown" "unknown" "unknown" "unknown" "unknown" + return 0 + fi + + remote_script="unit='$unit'; \ boot_id=\$(cat /proc/sys/kernel/random/boot_id 2>/dev/null || echo unknown); \ uptime_seconds=\$(awk '{print int(\$1)}' /proc/uptime 2>/dev/null || echo unknown); \ systemd_state=\$(systemctl is-system-running 2>/dev/null || true); \ @@ -115,11 +196,15 @@ probe_host() { enabled=\${enabled:-unknown}; \ active=\${active:-unknown}; \ printf 'boot_id=%s 
uptime_seconds=%s systemd_state=%s startup_enabled=%s startup_active=%s\n' \"\$boot_id\" \"\$uptime_seconds\" \"\$systemd_state\" \"\$enabled\" \"\$active\" \ - " 2>/dev/null)" + " + output="$(run_with_timeout "$SSH_COMMAND_TIMEOUT_SECONDS" ssh "${SSH_OPTS[@]}" "$target" "$remote_script" 2>/dev/null)" if [[ $? -ne 0 || -z "$output" ]]; then if probe_node_exporter "$alias" "$target" "$unit"; then return 0 fi + if probe_reachable_only "$alias" "$target" "$unit"; then + return 0 + fi emit_boot_row "$alias" "$target" "$unit" 0 "unknown" "unknown" "unknown" "unknown" "unknown" return 0 fi @@ -134,7 +219,7 @@ probe_host() { } echo "AWOOOI_REBOOT_AUTO_RECOVERY_HOST_PROBE=1" -echo "TARGET_HOSTS=110,120,121,188" +echo "TARGET_HOSTS=99,110,111,112,120,121,188" echo "GENERATED_AT=$(date '+%Y-%m-%dT%H:%M:%S%z')" for spec in "${HOST_SPECS[@]}"; do diff --git a/scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh b/scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh index f690fffd..8e423e77 100755 --- a/scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh +++ b/scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh @@ -29,12 +29,21 @@ artifact_dir="$LOG_DIR/reboot-auto-recovery-slo-$run_id" mkdir -p "$artifact_dir" host_probe="$artifact_dir/host-probe.txt" +reboot_event_file="$artifact_dir/reboot-event.json" +reboot_event_prom="$artifact_dir/reboot-event.prom" summary_file="$artifact_dir/summary.txt" scorecard_file="$artifact_dir/scorecard.json" stock_freshness_file="$artifact_dir/stock-freshness.json" stock_ingestion_file="$artifact_dir/stock-ingestion.json" +reboot_event_state_file="${REBOOT_EVENT_STATE_FILE:-${LOG_DIR}/reboot-event-state.json}" bash "$ROOT_DIR/scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh" >"$host_probe" 2>&1 || true +python3 "$ROOT_DIR/scripts/reboot-recovery/reboot-event-detector.py" \ + --host-probe-file "$host_probe" \ + --state-file "$reboot_event_state_file" \ + --target-minutes "$TARGET_MINUTES" \ + --output 
"$reboot_event_file" \ + --prometheus-output "$reboot_event_prom" || true ARTIFACT_DIR="$artifact_dir/post-reboot-readiness" \ bash "$ROOT_DIR/scripts/reboot-recovery/post-reboot-readiness-summary.sh" --no-color >"$summary_file" 2>&1 || true @@ -51,6 +60,7 @@ scorecard_args=( "$ROOT_DIR/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py" --summary-file "$summary_file" \ --host-probe-file "$host_probe" \ + --reboot-event-file "$reboot_event_file" \ --target-minutes "$TARGET_MINUTES" \ --min-free-gib "$MIN_FREE_GIB" \ --disk-path / \ @@ -84,21 +94,33 @@ payload=json.load(open(sys.argv[1], encoding="utf-8")) print(payload.get("host_boot_detection", {}).get("max_observed_uptime_seconds", 0)) PY )" +target_seconds_remaining="$(python3 - "$scorecard_file" <<'PY' +import json, sys +payload=json.load(open(sys.argv[1], encoding="utf-8")) +print(payload.get("sla_recovery_eta", {}).get("target_seconds_remaining", 0) or 0) +PY +)" tmp_metric="$(mktemp "$TEXTFILE_DIR/.reboot_auto_recovery_slo.XXXXXX")" cat >"$tmp_metric" <>"$tmp_metric" +fi chmod 0644 "$tmp_metric" mv "$tmp_metric" "$TEXTFILE_DIR/$OUTPUT_NAME" diff --git a/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py b/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py index b3f43e91..950664a2 100755 --- a/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py +++ b/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py @@ -15,7 +15,7 @@ from typing import Any ROOT = Path(__file__).resolve().parents[2] SCHEMA_VERSION = "awoooi_reboot_auto_recovery_slo_scorecard_v1" -REQUIRED_HOSTS = {"110", "120", "121", "188"} +REQUIRED_HOSTS = {"99", "110", "111", "112", "120", "121", "188"} def parse_args() -> argparse.Namespace: @@ -24,6 +24,7 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument("--summary-file", type=Path, help="post-reboot-readiness-summary output.") parser.add_argument("--host-probe-file", type=Path, help="reboot-auto-recovery-host-probe output.") + 
parser.add_argument("--reboot-event-file", type=Path, help="reboot-event-detector JSON output.") parser.add_argument("--target-minutes", type=int, default=10) parser.add_argument("--min-free-gib", type=float, default=2.0) parser.add_argument("--disk-path", type=Path, help="Optionally check local free space.") @@ -38,6 +39,12 @@ def parse_args() -> argparse.Namespace: help="Optional StockPlatform /api/v1/system/ingestion JSON readback.", ) parser.add_argument("--generated-at", help="Override generated_at for stable snapshots.") + parser.add_argument( + "--required-host", + action="append", + dest="required_hosts", + help="Required host alias. May be passed more than once.", + ) parser.add_argument("--output", type=Path, help="Write JSON to this path.") return parser.parse_args() @@ -141,6 +148,9 @@ def source_controls() -> dict[str, bool]: "host_boot_probe_source_present": source_file( "scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh" ).exists(), + "reboot_event_detector_source_present": source_file( + "scripts/reboot-recovery/reboot-event-detector.py" + ).exists(), "slo_exporter_source_present": source_file( "scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh" ).exists(), @@ -166,6 +176,24 @@ def source_controls() -> dict[str, bool]: "cold_start_textfile_exporter_source_present": source_file( "scripts/reboot-recovery/cold-start-textfile-exporter.sh" ).exists(), + "windows_99_vmware_autostart_source_present": source_file( + "scripts/reboot-recovery/windows99-vmware-autostart.ps1" + ).exists(), + "public_maintenance_fallback_source_present": source_file( + "ops/maintenance/maintenance.html" + ).exists() + and file_contains( + source_file("docs/runbooks/PUBLIC-MAINTENANCE-FALLBACK-RUNBOOK.md"), + "L0", + "L1", + "502", + ), + "telegram_reboot_backup_alert_rules_source_present": file_contains( + source_file("ops/monitoring/alerts-unified.yml"), + "HostRebootEventDetected", + "RebootAutoRecoverySLOMissed", + "BackupCoverageDomainStale", + ), } @@ 
-402,8 +430,10 @@ def choose_safe_next_step( def build_scorecard(args: argparse.Namespace) -> dict[str, Any]: target_seconds = args.target_minutes * 60 generated_at = args.generated_at or datetime.now().astimezone().isoformat(timespec="seconds") + required_hosts = set(args.required_hosts or REQUIRED_HOSTS) summary = parse_kv(read_text(args.summary_file)) host_rows = parse_host_probe(read_text(args.host_probe_file)) + reboot_event = read_json_object(args.reboot_event_file) stockplatform = build_stockplatform_readback( summary=summary, freshness=read_json_object(args.stock_freshness_file), @@ -419,7 +449,7 @@ def build_scorecard(args: argparse.Namespace) -> dict[str, Any]: blockers.append(key.replace("_present", "_missing")) host_aliases = {str(row.get("alias", "")) for row in host_rows} - missing_hosts = sorted(REQUIRED_HOSTS - host_aliases) + missing_hosts = sorted(required_hosts - host_aliases) unreachable_hosts = sorted(str(row.get("alias")) for row in host_rows if not row.get("reachable")) stale_hosts = sorted( str(row.get("alias")) @@ -442,6 +472,27 @@ def build_scorecard(args: argparse.Namespace) -> dict[str, Any]: if unknown_uptime_hosts: blockers.append("host_uptime_unknown") + reboot_event_missing_hosts = strings(reboot_event.get("missing_hosts")) + reboot_event_unreachable_hosts = strings(reboot_event.get("unreachable_hosts")) + reboot_event_fresh_hosts = strings(reboot_event.get("fresh_boot_hosts")) + reboot_event_detected = reboot_event.get("reboot_detected") is True + all_required_in_reboot_window = reboot_event.get("all_required_hosts_in_reboot_window") is True + target_seconds_remaining = int_value(reboot_event.get("target_seconds_remaining"), 0) + reboot_deadline_status = str( + reboot_event.get("recovery_deadline_status") or "event_readback_missing" + ) + if not reboot_event: + blockers.append("stateful_reboot_event_detection_missing") + else: + if reboot_event_missing_hosts: + blockers.append("reboot_event_missing_required_hosts") + if 
reboot_event_unreachable_hosts: + blockers.append("reboot_event_required_host_unreachable") + if not reboot_event_detected: + blockers.append("fresh_all_host_reboot_event_missing") + if not all_required_in_reboot_window: + blockers.append("all_required_hosts_not_in_10_minute_reboot_window") + service_green = truthy(summary.get("SERVICE_GREEN")) product_data_green = truthy(summary.get("PRODUCT_DATA_GREEN")) backup_core_green = truthy(summary.get("BACKUP_CORE_GREEN")) @@ -488,7 +539,7 @@ def build_scorecard(args: argparse.Namespace) -> dict[str, Any]: "can_claim_all_services_recovered_within_target": can_claim, "source_controls": controls, "host_boot_detection": { - "required_hosts": sorted(REQUIRED_HOSTS), + "required_hosts": sorted(required_hosts), "observed_hosts": sorted(host_aliases), "missing_hosts": missing_hosts, "unreachable_hosts": unreachable_hosts, @@ -497,6 +548,43 @@ def build_scorecard(args: argparse.Namespace) -> dict[str, Any]: "max_observed_uptime_seconds": max_uptime, "host_rows": host_rows, }, + "reboot_event_detection": { + "readback_present": bool(reboot_event), + "reboot_detected": reboot_event_detected, + "rebooted_hosts": strings(reboot_event.get("rebooted_hosts")), + "fresh_boot_hosts": reboot_event_fresh_hosts, + "changed_boot_id_hosts": strings(reboot_event.get("changed_boot_id_hosts")), + "missing_hosts": reboot_event_missing_hosts, + "unreachable_hosts": reboot_event_unreachable_hosts, + "all_required_hosts_observed": reboot_event.get("all_required_hosts_observed") is True, + "all_required_hosts_in_reboot_window": all_required_in_reboot_window, + "state_file": reboot_event.get("state_file") if reboot_event else None, + "state_written": reboot_event.get("state_written") if reboot_event else False, + }, + "sla_recovery_eta": { + "target_minutes": args.target_minutes, + "target_seconds_remaining": target_seconds_remaining, + "deadline_status": reboot_deadline_status, + "estimated_recovery_status": ( + "inside_10_minute_window" + if 
target_seconds_remaining > 0 and not unique_blockers + else ( + "inside_10_minute_window_with_blockers" + if target_seconds_remaining > 0 + else "target_window_elapsed_or_not_detected" + ) + ), + "next_update_due_seconds": 60, + "fixed_triage_order": [ + "99_vmware_autostart_and_vm_power", + "host_boot_event_and_node_exporter", + "public_routes_and_maintenance_fallback", + "awoooi_k3s_workloads_and_registry", + "stockplatform_public_api_and_freshness", + "backup_health_and_offsite_evidence", + "telegram_alert_delivery_readback", + ], + }, "post_reboot_readiness": { "summary_present": bool(summary), "post_start_result": summary.get("POST_START_RESULT", "unknown"), diff --git a/scripts/reboot-recovery/reboot-event-detector.py b/scripts/reboot-recovery/reboot-event-detector.py new file mode 100644 index 00000000..e7b68d49 --- /dev/null +++ b/scripts/reboot-recovery/reboot-event-detector.py @@ -0,0 +1,236 @@ +#!/usr/bin/env python3 +"""Stateful all-host reboot event detector for AWOOOI recovery automation.""" + +from __future__ import annotations + +import argparse +import json +from datetime import datetime, timedelta +from pathlib import Path +from typing import Any + + +SCHEMA_VERSION = "awoooi_reboot_event_detector_v1" +REQUIRED_HOSTS = ("99", "110", "111", "112", "120", "121", "188") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Detect host reboot events from reboot-auto-recovery-host-probe output.", + ) + parser.add_argument("--host-probe-file", type=Path, required=True) + parser.add_argument("--state-file", type=Path, required=True) + parser.add_argument("--target-minutes", type=int, default=10) + parser.add_argument("--generated-at") + parser.add_argument("--output", type=Path) + parser.add_argument("--prometheus-output", type=Path) + parser.add_argument( + "--required-host", + action="append", + dest="required_hosts", + help="Required host alias. 
May be passed more than once.", + ) + parser.add_argument( + "--no-write-state", + action="store_true", + help="Evaluate without updating the state file.", + ) + return parser.parse_args() + + +def int_value(value: Any, default: int = -1) -> int: + try: + return int(str(value)) + except (TypeError, ValueError): + return default + + +def parse_host_probe(text: str) -> list[dict[str, Any]]: + rows: list[dict[str, Any]] = [] + for raw_line in text.splitlines(): + line = raw_line.strip() + if not line.startswith("HOST_BOOT "): + continue + row: dict[str, Any] = {} + for token in line.split()[1:]: + if "=" not in token: + continue + key, value = token.split("=", 1) + row[key] = value + row["alias"] = str(row.get("alias", "")) + row["reachable"] = row.get("reachable") == "1" + row["uptime_seconds"] = int_value(row.get("uptime_seconds")) + rows.append(row) + return rows + + +def load_state(path: Path) -> dict[str, Any]: + try: + payload = json.loads(path.read_text(encoding="utf-8")) + except (FileNotFoundError, json.JSONDecodeError): + return {"hosts": {}} + return payload if isinstance(payload, dict) else {"hosts": {}} + + +def host_state(rows: list[dict[str, Any]]) -> dict[str, dict[str, Any]]: + state: dict[str, dict[str, Any]] = {} + for row in rows: + alias = str(row.get("alias") or "") + if not alias: + continue + state[alias] = { + "boot_id": str(row.get("boot_id") or "unknown"), + "uptime_seconds": int_value(row.get("uptime_seconds")), + "reachable": bool(row.get("reachable")), + "systemd_state": str(row.get("systemd_state") or "unknown"), + "startup_unit": str(row.get("startup_unit") or "unknown"), + "startup_active": str(row.get("startup_active") or "unknown"), + } + return state + + +def build_payload(args: argparse.Namespace) -> dict[str, Any]: + generated_at = args.generated_at or datetime.now().astimezone().isoformat(timespec="seconds") + observed_at = datetime.fromisoformat(generated_at) + target_seconds = args.target_minutes * 60 + required_hosts = 
tuple(args.required_hosts or REQUIRED_HOSTS) + rows = parse_host_probe(args.host_probe_file.read_text(encoding="utf-8")) + current_hosts = host_state(rows) + previous = load_state(args.state_file) + previous_hosts = previous.get("hosts") if isinstance(previous.get("hosts"), dict) else {} + + rebooted_hosts: list[str] = [] + fresh_boot_hosts: list[str] = [] + changed_boot_id_hosts: list[str] = [] + unreachable_hosts: list[str] = [] + missing_hosts = sorted(set(required_hosts) - set(current_hosts)) + + events: list[dict[str, Any]] = [] + for alias in required_hosts: + current = current_hosts.get(alias) + previous_host = previous_hosts.get(alias) if isinstance(previous_hosts, dict) else None + if not current: + continue + if not current["reachable"]: + unreachable_hosts.append(alias) + previous_boot_id = ( + str(previous_host.get("boot_id")) + if isinstance(previous_host, dict) and previous_host.get("boot_id") + else "" + ) + current_boot_id = str(current.get("boot_id") or "") + boot_id_changed = bool( + previous_boot_id + and previous_boot_id != "unknown" + and current_boot_id + and current_boot_id != "unknown" + and previous_boot_id != current_boot_id + ) + fresh_boot = bool(current.get("reachable") and 0 <= int_value(current.get("uptime_seconds")) <= target_seconds)  # int_value yields -1 for unknown uptime; that must not count as a fresh boot + if boot_id_changed: + changed_boot_id_hosts.append(alias) + if fresh_boot: + fresh_boot_hosts.append(alias) + if boot_id_changed or fresh_boot: + rebooted_hosts.append(alias) + events.append( + { + "host": alias, + "event": "boot_id_changed" if boot_id_changed else "fresh_boot_window", + "previous_boot_id": previous_boot_id or "unknown", + "current_boot_id": current_boot_id or "unknown", + "uptime_seconds": current.get("uptime_seconds"), + "deadline_at": ( + observed_at + + timedelta(seconds=max(0, target_seconds - int_value(current.get("uptime_seconds"), 0))) + ).isoformat(timespec="seconds"), + } + ) + + max_uptime = max( + [int_value(row.get("uptime_seconds"), 0) for row in rows if row.get("reachable")]
or [0] + ) + remaining_seconds = max(0, target_seconds - max_uptime) + reboot_detected = bool(rebooted_hosts) + all_required_observed = not missing_hosts and not unreachable_hosts + all_required_in_reboot_window = set(required_hosts).issubset(set(fresh_boot_hosts)) + + next_state = { + "schema_version": SCHEMA_VERSION, + "updated_at": generated_at, + "target_seconds": target_seconds, + "hosts": current_hosts, + } + if not args.no_write_state: + args.state_file.parent.mkdir(parents=True, exist_ok=True) + tmp = args.state_file.with_suffix(args.state_file.suffix + ".tmp") + tmp.write_text(json.dumps(next_state, ensure_ascii=False, indent=2, sort_keys=True) + "\n", encoding="utf-8") + tmp.replace(args.state_file) + + return { + "schema_version": SCHEMA_VERSION, + "generated_at": generated_at, + "target_minutes": args.target_minutes, + "target_seconds": target_seconds, + "required_hosts": list(required_hosts), + "observed_hosts": sorted(current_hosts), + "missing_hosts": missing_hosts, + "unreachable_hosts": sorted(unreachable_hosts), + "reboot_detected": reboot_detected, + "rebooted_hosts": sorted(set(rebooted_hosts)), + "fresh_boot_hosts": sorted(set(fresh_boot_hosts)), + "changed_boot_id_hosts": sorted(set(changed_boot_id_hosts)), + "all_required_hosts_observed": all_required_observed, + "all_required_hosts_in_reboot_window": all_required_in_reboot_window, + "max_observed_uptime_seconds": max_uptime, + "target_seconds_remaining": remaining_seconds, + "recovery_deadline_status": "within_target_window" if remaining_seconds > 0 else "target_window_elapsed", + "events": events, + "state_file": str(args.state_file), + "state_written": not args.no_write_state, + } + + +def write_prometheus(path: Path, payload: dict[str, Any]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + lines = [ + "# HELP awoooi_reboot_event_detected Whether a reboot event was detected by boot_id or fresh uptime.", + "# TYPE awoooi_reboot_event_detected gauge", + 
f'awoooi_reboot_event_detected{{scope="99_110_111_112_120_121_188"}} {1 if payload["reboot_detected"] else 0}', + "# HELP awoooi_reboot_event_required_host_observed Whether each required host was observed.", + "# TYPE awoooi_reboot_event_required_host_observed gauge", + ] + observed = set(payload.get("observed_hosts") or []) + rebooted = set(payload.get("rebooted_hosts") or []) + for host in payload.get("required_hosts") or []: + lines.append(f'awoooi_reboot_event_required_host_observed{{host="{host}"}} {1 if host in observed else 0}') + lines.append(f'awoooi_reboot_event_host_rebooted{{host="{host}"}} {1 if host in rebooted else 0}') + lines.extend( + [ + "# HELP awoooi_reboot_event_target_seconds_remaining Seconds remaining in the reboot recovery target window.", + "# TYPE awoooi_reboot_event_target_seconds_remaining gauge", + f'awoooi_reboot_event_target_seconds_remaining{{scope="99_110_111_112_120_121_188"}} {payload["target_seconds_remaining"]}', + "# HELP awoooi_reboot_event_max_observed_uptime_seconds Maximum uptime observed across reachable hosts.", + "# TYPE awoooi_reboot_event_max_observed_uptime_seconds gauge", + f'awoooi_reboot_event_max_observed_uptime_seconds{{scope="99_110_111_112_120_121_188"}} {payload["max_observed_uptime_seconds"]}', + ] + ) + path.write_text("\n".join(lines) + "\n", encoding="utf-8") + + +def main() -> int: + args = parse_args() + payload = build_payload(args) + text = json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=True) + "\n" + if args.output: + args.output.parent.mkdir(parents=True, exist_ok=True) + args.output.write_text(text, encoding="utf-8") + else: + print(text, end="") + if args.prometheus_output: + write_prometheus(args.prometheus_output, payload) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py b/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py index 1e51394e..ff491e75 100644 
--- a/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py +++ b/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py @@ -26,19 +26,47 @@ NEXT_REQUIRED_GATES=none HOST_PROBE_GREEN = """\ AWOOOI_REBOOT_AUTO_RECOVERY_HOST_PROBE=1 -TARGET_HOSTS=110,120,121,188 +TARGET_HOSTS=99,110,111,112,120,121,188 +HOST_BOOT alias=99 target=192.168.0.99 startup_unit=vmware-host-autostart reachable=1 boot_id=aa uptime_seconds=100 systemd_state=windows_exporter startup_enabled=enabled startup_active=active HOST_BOOT alias=110 target=wooo@192.168.0.110 startup_unit=awoooi-startup-110.service reachable=1 boot_id=a uptime_seconds=120 systemd_state=running startup_enabled=enabled startup_active=active +HOST_BOOT alias=111 target=192.168.0.111 startup_unit=vm-host-boot reachable=1 boot_id=bb uptime_seconds=125 systemd_state=node_exporter startup_enabled=unknown startup_active=unknown +HOST_BOOT alias=112 target=192.168.0.112 startup_unit=vm-host-boot reachable=1 boot_id=cc uptime_seconds=128 systemd_state=node_exporter startup_enabled=unknown startup_active=unknown HOST_BOOT alias=120 target=wooo@192.168.0.120 startup_unit=k3s.service reachable=1 boot_id=b uptime_seconds=130 systemd_state=running startup_enabled=enabled startup_active=active HOST_BOOT alias=121 target=wooo@192.168.0.121 startup_unit=k3s.service reachable=1 boot_id=c uptime_seconds=140 systemd_state=running startup_enabled=enabled startup_active=active HOST_BOOT alias=188 target=ollama@192.168.0.188 startup_unit=awoooi-startup.service reachable=1 boot_id=d uptime_seconds=150 systemd_state=running startup_enabled=enabled startup_active=active """ +REBOOT_EVENT_GREEN = { + "schema_version": "awoooi_reboot_event_detector_v1", + "generated_at": "2026-06-29T14:30:00+08:00", + "target_minutes": 10, + "target_seconds": 600, + "required_hosts": ["99", "110", "111", "112", "120", "121", "188"], + "observed_hosts": ["99", "110", "111", "112", "120", "121", "188"], + "missing_hosts": [], + 
"unreachable_hosts": [], + "reboot_detected": True, + "rebooted_hosts": ["99", "110", "111", "112", "120", "121", "188"], + "fresh_boot_hosts": ["99", "110", "111", "112", "120", "121", "188"], + "changed_boot_id_hosts": [], + "all_required_hosts_observed": True, + "all_required_hosts_in_reboot_window": True, + "max_observed_uptime_seconds": 150, + "target_seconds_remaining": 450, + "recovery_deadline_status": "within_target_window", + "state_file": "/tmp/reboot-event-state.json", + "state_written": True, +} + + def run_scorecard(tmp_path: Path, summary: str, probe: str = HOST_PROBE_GREEN) -> dict: summary_path = tmp_path / "summary.txt" probe_path = tmp_path / "probe.txt" + reboot_event_path = tmp_path / "reboot-event.json" summary_path.write_text(summary, encoding="utf-8") probe_path.write_text(probe, encoding="utf-8") + reboot_event_path.write_text(json.dumps(REBOOT_EVENT_GREEN), encoding="utf-8") result = subprocess.run( [ sys.executable, @@ -47,6 +75,8 @@ def run_scorecard(tmp_path: Path, summary: str, probe: str = HOST_PROBE_GREEN) - str(summary_path), "--host-probe-file", str(probe_path), + "--reboot-event-file", + str(reboot_event_path), "--generated-at", "2026-06-29T14:30:00+08:00", ], @@ -66,10 +96,12 @@ def run_scorecard_with_stock( ) -> dict: summary_path = tmp_path / "summary.txt" probe_path = tmp_path / "probe.txt" + reboot_event_path = tmp_path / "reboot-event.json" freshness_path = tmp_path / "freshness.json" ingestion_path = tmp_path / "ingestion.json" summary_path.write_text(summary, encoding="utf-8") probe_path.write_text(HOST_PROBE_GREEN, encoding="utf-8") + reboot_event_path.write_text(json.dumps(REBOOT_EVENT_GREEN), encoding="utf-8") freshness_path.write_text(json.dumps(freshness), encoding="utf-8") ingestion_path.write_text(json.dumps(ingestion), encoding="utf-8") result = subprocess.run( @@ -80,6 +112,8 @@ def run_scorecard_with_stock( str(summary_path), "--host-probe-file", str(probe_path), + "--reboot-event-file", + str(reboot_event_path), 
"--stock-freshness-file", str(freshness_path), "--stock-ingestion-file", diff --git a/scripts/reboot-recovery/tests/test_reboot_event_detector.py b/scripts/reboot-recovery/tests/test_reboot_event_detector.py new file mode 100644 index 00000000..12569be1 --- /dev/null +++ b/scripts/reboot-recovery/tests/test_reboot_event_detector.py @@ -0,0 +1,112 @@ +from __future__ import annotations + +import json +import subprocess +import sys +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[3] +SCRIPT = ROOT / "scripts" / "reboot-recovery" / "reboot-event-detector.py" + + +HOST_PROBE = """\ +AWOOOI_REBOOT_AUTO_RECOVERY_HOST_PROBE=1 +TARGET_HOSTS=99,110,111,112,120,121,188 +HOST_BOOT alias=99 target=192.168.0.99 startup_unit=vmware-host-autostart reachable=1 boot_id=win-boot-2 uptime_seconds=80 systemd_state=windows_exporter startup_enabled=enabled startup_active=active +HOST_BOOT alias=110 target=wooo@192.168.0.110 startup_unit=awoooi-startup-110.service reachable=1 boot_id=linux-boot-2 uptime_seconds=100 systemd_state=running startup_enabled=enabled startup_active=active +HOST_BOOT alias=111 target=192.168.0.111 startup_unit=vm-host-boot reachable=1 boot_id=vm111-boot-2 uptime_seconds=110 systemd_state=node_exporter startup_enabled=unknown startup_active=unknown +HOST_BOOT alias=112 target=192.168.0.112 startup_unit=vm-host-boot reachable=1 boot_id=vm112-boot-2 uptime_seconds=115 systemd_state=node_exporter startup_enabled=unknown startup_active=unknown +HOST_BOOT alias=120 target=wooo@192.168.0.120 startup_unit=k3s.service reachable=1 boot_id=mon-boot-2 uptime_seconds=120 systemd_state=running startup_enabled=enabled startup_active=active +HOST_BOOT alias=121 target=wooo@192.168.0.121 startup_unit=k3s.service reachable=1 boot_id=worker-boot-2 uptime_seconds=130 systemd_state=running startup_enabled=enabled startup_active=active +HOST_BOOT alias=188 target=ollama@192.168.0.188 startup_unit=awoooi-startup.service reachable=1 boot_id=ai-boot-2 
uptime_seconds=140 systemd_state=running startup_enabled=enabled startup_active=active +""" + + +def test_reboot_detector_detects_all_required_hosts_and_writes_metrics(tmp_path: Path) -> None: + probe_path = tmp_path / "host-probe.txt" + state_path = tmp_path / "state.json" + output_path = tmp_path / "event.json" + prom_path = tmp_path / "event.prom" + probe_path.write_text(HOST_PROBE, encoding="utf-8") + state_path.write_text( + json.dumps( + { + "hosts": { + "99": {"boot_id": "win-boot-1"}, + "110": {"boot_id": "linux-boot-1"}, + "111": {"boot_id": "vm111-boot-1"}, + "112": {"boot_id": "vm112-boot-1"}, + "120": {"boot_id": "mon-boot-1"}, + "121": {"boot_id": "worker-boot-1"}, + "188": {"boot_id": "ai-boot-1"}, + } + } + ), + encoding="utf-8", + ) + + subprocess.run( + [ + sys.executable, + str(SCRIPT), + "--host-probe-file", + str(probe_path), + "--state-file", + str(state_path), + "--target-minutes", + "10", + "--generated-at", + "2026-06-30T18:00:00+08:00", + "--output", + str(output_path), + "--prometheus-output", + str(prom_path), + ], + check=True, + ) + + payload = json.loads(output_path.read_text(encoding="utf-8")) + metrics = prom_path.read_text(encoding="utf-8") + + assert payload["required_hosts"] == ["99", "110", "111", "112", "120", "121", "188"] + assert payload["reboot_detected"] is True + assert payload["all_required_hosts_in_reboot_window"] is True + assert payload["target_seconds_remaining"] == 460 + assert 'awoooi_reboot_event_host_rebooted{host="99"} 1' in metrics + assert 'awoooi_reboot_event_required_host_observed{host="188"} 1' in metrics + + +def test_reboot_detector_fails_visible_when_windows_or_vm_host_missing(tmp_path: Path) -> None: + probe_path = tmp_path / "host-probe.txt" + state_path = tmp_path / "state.json" + output_path = tmp_path / "event.json" + probe_path.write_text( + "\n".join(line for line in HOST_PROBE.splitlines() if "alias=99 " not in line) + "\n", + encoding="utf-8", + ) + + subprocess.run( + [ + sys.executable, + 
str(SCRIPT), + "--host-probe-file", + str(probe_path), + "--state-file", + str(state_path), + "--target-minutes", + "10", + "--generated-at", + "2026-06-30T18:00:00+08:00", + "--output", + str(output_path), + "--no-write-state", + ], + check=True, + ) + + payload = json.loads(output_path.read_text(encoding="utf-8")) + + assert "99" in payload["missing_hosts"] + assert payload["all_required_hosts_observed"] is False + assert payload["all_required_hosts_in_reboot_window"] is False diff --git a/scripts/reboot-recovery/tests/test_reboot_p0_operational_contract.py b/scripts/reboot-recovery/tests/test_reboot_p0_operational_contract.py new file mode 100644 index 00000000..b969a79d --- /dev/null +++ b/scripts/reboot-recovery/tests/test_reboot_p0_operational_contract.py @@ -0,0 +1,53 @@ +from __future__ import annotations + +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[3] + + +def read(path: str) -> str: + return (ROOT / path).read_text(encoding="utf-8") + + +def test_reboot_p0_contract_covers_all_required_hosts_and_vmware_autostart() -> None: + host_probe = read("scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh") + windows99 = read("scripts/reboot-recovery/windows99-vmware-autostart.ps1") + + for host in ["99", "110", "111", "112", "120", "121", "188"]: + assert host in host_probe + assert "AWOOOI-Start-VMware-VMs" in windows99 + assert "NoAutoRebootWithLoggedOnUsers" in windows99 + assert "Host111Vmx" in windows99 + assert "Host188Vmx" in windows99 + assert "Host120Vmx" in windows99 + assert "Host121Vmx" in windows99 + assert "Host112Vmx" in windows99 + + +def test_reboot_p0_contract_has_maintenance_and_telegram_alerts() -> None: + alerts = read("ops/monitoring/alerts-unified.yml") + runbook = read("docs/runbooks/PUBLIC-MAINTENANCE-FALLBACK-RUNBOOK.md") + snippet = read("ops/maintenance/nginx-502-maintenance-snippet.conf") + + for alert_name in [ + "HostRebootEventDetected", + "RebootAutoRecoverySLOMissed", + 
"PublicRouteServing5xxWithoutMaintenanceFallback", + "BackupCoverageDomainStale", + ]: + assert alert_name in alerts + assert "notification_type: TYPE-3" in alerts + assert "proxy_intercept_errors on" in snippet + assert "502" in runbook + assert "L0" in runbook + assert "L1" in runbook + + +def test_backup_exporter_emits_domain_level_backup_coverage() -> None: + exporter = read("scripts/ops/backup-health-textfile-exporter.py") + + assert "awoooi_backup_coverage_domain_expected_info" in exporter + assert "awoooi_backup_coverage_domain_fresh" in exporter + for domain in ["host", "database", "website", "service", "package", "tool", "log"]: + assert f'"{domain}"' in exporter diff --git a/scripts/reboot-recovery/windows99-vmware-autostart.ps1 b/scripts/reboot-recovery/windows99-vmware-autostart.ps1 new file mode 100644 index 00000000..63d5b94a --- /dev/null +++ b/scripts/reboot-recovery/windows99-vmware-autostart.ps1 @@ -0,0 +1,162 @@ +param( + [ValidateSet("Verify", "DryRun", "Apply")] + [string]$Mode = "Verify", + [string]$VmrunPath = "C:\Program Files (x86)\VMware\VMware Workstation\vmrun.exe", + [string]$Host111Vmx = "", + [string]$Host188Vmx = "", + [string]$Host120Vmx = "", + [string]$Host121Vmx = "", + [string]$Host112Vmx = "", + [string[]]$DiscoveryRoot = @("D:\VMs", "E:\VMs", "C:\VMs", "C:\Users\Public\Documents\Virtual Machines") +) + +$ErrorActionPreference = "Stop" + +$TaskName = "AWOOOI-Start-VMware-VMs" +$ProgramDataDir = "C:\ProgramData\AWOOOI" +$StartScript = Join-Path $ProgramDataDir "Start-AWOOOI-VMs.ps1" +$VmOrder = @("111", "188", "120", "121", "112") +$SuppliedVmx = @{ + "111" = $Host111Vmx + "188" = $Host188Vmx + "120" = $Host120Vmx + "121" = $Host121Vmx + "112" = $Host112Vmx +} + +function Resolve-VmxPath { + param([string]$HostAlias) + + $supplied = $SuppliedVmx[$HostAlias] + if ($supplied -and (Test-Path -LiteralPath $supplied)) { + return (Resolve-Path -LiteralPath $supplied).Path + } + + foreach ($root in $DiscoveryRoot) { + if (-not 
(Test-Path -LiteralPath $root)) { + continue + } + $match = Get-ChildItem -LiteralPath $root -Recurse -Filter "*.vmx" -ErrorAction SilentlyContinue | + Where-Object { $_.FullName -match "(^|[\\_\-\s])$HostAlias([\\_\-\s]|$)" } | + Select-Object -First 1 + if ($match) { + return $match.FullName + } + } + return "" +} + +function Get-VmMap { + $map = [ordered]@{} + foreach ($alias in $VmOrder) { + $map[$alias] = Resolve-VmxPath -HostAlias $alias + } + return $map +} + +function Write-StartupScript { + param([System.Collections.IDictionary]$VmMap) # IDictionary: Get-VmMap returns [ordered]@{} (an OrderedDictionary), which cannot bind to [hashtable] + + New-Item -ItemType Directory -Path $ProgramDataDir -Force | Out-Null + $vmRows = foreach ($alias in $VmOrder) { + " @{ Alias = `"$alias`"; Path = `"$($VmMap[$alias])`" }" + } + $body = @" +`$ErrorActionPreference = "Continue" +`$VmrunPath = "$VmrunPath" +`$VMs = @( +$($vmRows -join ",`n") +) + +Start-Service -Name "VMAuthdService" -ErrorAction SilentlyContinue +Start-Service -Name "VMnetDHCP" -ErrorAction SilentlyContinue +Start-Service -Name "VMware NAT Service" -ErrorAction SilentlyContinue +Start-Sleep -Seconds 30 + +foreach (`$vm in `$VMs) { + if (-not (Test-Path -LiteralPath `$vm.Path)) { + Write-Output "VMX_MISSING alias=`$(`$vm.Alias) path=`$(`$vm.Path)" + continue + } + & `$VmrunPath -T ws start `$vm.Path nogui + Write-Output "VM_START_REQUESTED alias=`$(`$vm.Alias) path=`$(`$vm.Path) exit=`$LASTEXITCODE" + Start-Sleep -Seconds 20 +} +"@ + Set-Content -LiteralPath $StartScript -Value $body -Encoding UTF8 +} + +function Apply-WindowsUpdatePolicy { + $auPath = "HKLM:\SOFTWARE\Policies\Microsoft\Windows\WindowsUpdate\AU" + New-Item -Path $auPath -Force | Out-Null + New-ItemProperty -Path $auPath -Name "NoAutoRebootWithLoggedOnUsers" -Value 1 -PropertyType DWord -Force | Out-Null + New-ItemProperty -Path $auPath -Name "AlwaysAutoRebootAtScheduledTime" -Value 0 -PropertyType DWord -Force | Out-Null + New-ItemProperty -Path $auPath -Name "AUOptions" -Value 3 -PropertyType DWord -Force | Out-Null + New-ItemProperty -Path $auPath -Name
"ScheduledInstallDay" -Value 0 -PropertyType DWord -Force | Out-Null + New-ItemProperty -Path $auPath -Name "ScheduledInstallTime" -Value 3 -PropertyType DWord -Force | Out-Null + New-ItemProperty -Path $auPath -Name "AUPowerManagement" -Value 0 -PropertyType DWord -Force | Out-Null +} + +function Apply-ScheduledTask { + $action = New-ScheduledTaskAction ` + -Execute "powershell.exe" ` + -Argument "-NoProfile -ExecutionPolicy Bypass -File `"$StartScript`"" + $trigger = New-ScheduledTaskTrigger -AtStartup + $principal = New-ScheduledTaskPrincipal -UserId "SYSTEM" -RunLevel Highest + $settings = New-ScheduledTaskSettingsSet ` + -AllowStartIfOnBatteries ` + -DontStopIfGoingOnBatteries ` + -MultipleInstances IgnoreNew ` + -RestartCount 3 ` + -RestartInterval (New-TimeSpan -Minutes 2) ` + -StartWhenAvailable + Register-ScheduledTask ` + -TaskName $TaskName ` + -Action $action ` + -Trigger $trigger ` + -Principal $principal ` + -Settings $settings ` + -Force | Out-Null +} + +$vmMap = Get-VmMap +$missing = @($VmOrder | Where-Object { -not $vmMap[$_] }) +$vmrunPresent = Test-Path -LiteralPath $VmrunPath + +Write-Output "AWOOOI_WINDOWS99_VMWARE_AUTOSTART=1" +Write-Output "MODE=$Mode" +Write-Output "VMRUN_PRESENT=$([int]$vmrunPresent)" +foreach ($alias in $VmOrder) { + Write-Output "VMX alias=$alias path=$($vmMap[$alias]) present=$([int][bool]$vmMap[$alias])" +} + +if (-not $vmrunPresent) { + Write-Error "vmrun.exe not found at $VmrunPath" +} + +if ($missing.Count -gt 0) { + Write-Output "MISSING_VMX_ALIASES=$($missing -join ',')" + if ($Mode -eq "Apply") { + throw "Apply requires explicit or discoverable VMX paths for every required VM." 
+ } +} + +if ($Mode -eq "Verify") { + exit 0 +} + +if ($Mode -eq "DryRun") { + Write-Output "DRY_RUN would_write_start_script=$StartScript" + Write-Output "DRY_RUN would_register_task=$TaskName" + Write-Output "DRY_RUN would_apply_windows_update_no_auto_reboot_policy=1" + exit 0 +} + +Write-StartupScript -VmMap $vmMap +Set-Service -Name "VMAuthdService" -StartupType Automatic -ErrorAction SilentlyContinue +Set-Service -Name "VMnetDHCP" -StartupType Automatic -ErrorAction SilentlyContinue +Set-Service -Name "VMware NAT Service" -StartupType Automatic -ErrorAction SilentlyContinue +Apply-ScheduledTask +Apply-WindowsUpdatePolicy + +Write-Output "APPLIED scheduled_task=$TaskName start_script=$StartScript windows_update_no_auto_reboot_policy=1"