From 4808995edabd45fff9bf0e0eff1f8380aeb096ff Mon Sep 17 00:00:00 2001
From: Your Name <you@example.com>
Date: Fri, 29 May 2026 12:38:58 +0800
Subject: [PATCH] fix(ops): harden reboot recovery and backup alerts

---
 docs/LOGBOOK.md                               |  26 +
 docs/runbooks/FULL-STACK-COLD-START-SOP.md    |  79 ++
 .../nginx/templates/188-all-sites.conf.j2     | 307 ++++--
 k8s/monitoring/prometheus.yml                 |   8 +-
 ops/monitoring/alerts-unified.yml             | 877 ++++++++++++++----
 .../full-stack-backup-baseline.yml            | 306 ++++++
 .../full-stack-cold-start-baseline.yml        | 499 ++++------
 .../ops/backup-alert-label-contract-check.py  | 260 ++++++
 .../ops/backup-alert-live-visibility-check.py | 242 +++++
 scripts/ops/prometheus-rule-drift-guard.sh    |  42 +-
 .../ops/recovery-scorecard-contract-check.py  | 148 +++
 .../cold-start-textfile-exporter.sh           |  52 +-
 .../full-stack-cold-start-check.sh            | 178 ++--
 13 files changed, 2353 insertions(+), 671 deletions(-)
 create mode 100644 ops/reboot-recovery/full-stack-backup-baseline.yml
 create mode 100755 scripts/ops/backup-alert-label-contract-check.py
 create mode 100755 scripts/ops/backup-alert-live-visibility-check.py
 create mode 100755 scripts/ops/recovery-scorecard-contract-check.py

diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md
index 402496ea..6dc12aa3 100644
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -22528,3 +22528,29 @@ production browser smoke:
 - 24h 完整自動修復 production claim：0%；目前仍不能宣稱真正 AI 自動修復閉環已達成。
 - 完整 AI 自動化管理產品化：約 99.3%，但「真正全自動 repair / approval / learning / KM writeback 閉環」
   仍需以 24h production evidence 補齊。
+
+## 2026-05-29 | 重開機恢復續修：aiops 入口、備份告警與 Ansible baseline 收斂
+
+**背景**：統帥要求確認所有主機重啟後，服務、網站、工具、資料庫、排程與備份都能快速恢復，且不能只停在人工熱修。前一輪已修正 AWOOOI/Flywheel stale incident 與成功率規則；本輪接著處理 cold-start gate 仍未綠燈的項目。
+
+**現場修復**：
+- 188 public gateway 的 `aiops.wooo.work` 原本仍反代到失聯的 `192.168.0.120:31234/31235`，導致 public route 502；已改為正式 VIP `192.168.0.125:32334/32335`，`/` 回 307 到 `/zh-TW`，`/api/v1/health` 回 `healthy`。
+- 188 `/etc/nginx/sites-enabled/` 中有舊備份檔仍被 Nginx include，造成新 vhost 被 `conflicting server name ... ignored`；已移到 `/etc/nginx/sites-disabled-codex/`，保留備份但不再載入。
+- 110 `fwupd.service` / `fwupd-refresh.service` 是 stale failed state；已 `reset-failed`，`systemctl --failed` 回 0。
+- Prometheus live `alerts.yml` 與 `alerts-unified.canonical.yml` 被縮水成舊版，缺完整備份、異地同步、credential escrow、cold-start scorecard 規則；已重新同步 repo 的 `ops/monitoring/alerts-unified.yml` 到兩個 live 檔並 reload Prometheus。
+- `prometheus-rule-drift-guard` 已確認 `missing_required_count=0`、`current_matches_canonical=1`，之後不會每 5 分鐘把完整備份規則拉回舊版。
+- Ansible `infra/ansible/roles/nginx/templates/188-all-sites.conf.j2` 已同步 188 live public gateway baseline，避免下一次跑 `nginx-sync.yml` 又把 aiops 指回單一 120 節點。
+
+**驗證**：
+- `https://aiops.wooo.work/` public route 與 TLS 已回 200/307 成功範圍；`https://aiops.wooo.work/api/v1/health` 回 `healthy prod`。
+- `bash /home/wooo/scripts/full-stack-cold-start-check.sh --monitor-read-only --no-color --watch --interval 1 --max-attempts 1`：public routes 全部通過，110 failed units = 0，momo scheduler 以 container health + 2h 內 task activity 判定正常，momo 當月 `daily_sales_snapshot`/`realtime_sales_monthly` 一致，結果為 `PASS=72 WARN=2 BLOCKED=3`。
+- `BLOCKED=3` 全部仍指向 120：`ping 192.168.0.120`、`ssh 192.168.0.120:22`、`ssh 120 k3s read-only check`。
+- Google Drive/rclone daily full sync 仍正常：`rclone-last-success` 與 `rclone-full-verify-last-success` 都是 2026-05-29，full repos 覆蓋 `awoooi configs gitea harbor momo langfuse monitoring signoz open-webui clawbot sentry ai-artifacts public-routes`。
+- 完整備份告警規則已載入：`BackupAggregateRunFailed`、`BackupConfigCapturePartial`、`BackupOffsiteCopyStale`、`BackupCredentialEscrowEvidenceMissing`、`awoooi_recovery_core_ready`、`ColdStartRecoveryBlocked` 全部存在；Prometheus rule count = 142。
+- 因 120 失聯，`BackupConfigCapturePartial{target="120-k3s-host-configs"}` 與 `BackupAggregateRunFailed` 會進入 pending/firing，這是正確訊號，不應消音。
+- `mo.wooo.work` 資料修復：momo 自動匯入 2026-05-29 11:55 已把 2026-05-01~2026-05-28 的 17,353 筆寫入 `daily_sales_snapshot`，但同步 `realtime_sales_monthly` 時 PostgreSQL index 內部錯誤 `posting list tuple ... cannot be split`，導致 5 月分析表為 0。已在 188 `momo-db` 執行 `REINDEX TABLE CONCURRENTLY public.realtime_sales_monthly`，再以同日期範圍從 `daily_sales_snapshot` idempotent 補同步；驗證 `daily_sales_snapshot=17,353`、`realtime_sales_monthly=17,353`、`realtime_sales_monthly` 總筆數 `774,111`，日期最大值到 `2026-05-28`，並清除 momo 應用 cache。
+
+**不可宣稱完成**：
+- 120 仍不可達，K3s node `mon` 是 `NotReady,SchedulingDisabled`；`mon1` 可承載 AWOOOI workloads，但 full cold-start done criteria 尚未達成。
+- 110 backup aggregate `failed_count=1` 是 120 config capture 無法完成；必須 120 回來後重跑 `/backup/scripts/backup-configs.sh` 或 `/backup/scripts/backup-all.sh`，再補跑 Google Drive/rclone full sync。
+- `SLO_KMGrowthRate_Low` 仍為 warning（24h KM 約 19/20），不是網站 outage，但需後續追 KM 產出。
diff --git a/docs/runbooks/FULL-STACK-COLD-START-SOP.md b/docs/runbooks/FULL-STACK-COLD-START-SOP.md
index ae46188f..2b834e5d 100644
--- a/docs/runbooks/FULL-STACK-COLD-START-SOP.md
+++ b/docs/runbooks/FULL-STACK-COLD-START-SOP.md
@@ -590,6 +590,84 @@ Prometheus rules in `ops/monitoring/alerts-unified.yml` alert when the monitor i
 4. Release high-load services only after `GREEN` and load/core stays below `1.0` for 15 minutes.
 5. Record the final output summary and any manual repair in `docs/LOGBOOK.md`.
 
+### 13.6 2026-05-29 補充：188 Public Gateway 與備份告警
+
+`aiops.wooo.work` 的 188 public gateway 不可再指向單一 `192.168.0.120:31234/31235`。120 失聯時這會讓 public route 直接 502。正式 baseline 必須走 K3s VIP：
+
+```nginx
+location /api/ {
+    proxy_pass http://192.168.0.125:32334/api/;
+}
+
+location /api/v1/ws {
+    proxy_pass http://192.168.0.125:32334/api/v1/ws;
+}
+
+location / {
+    proxy_pass http://192.168.0.125:32335;
+}
+```
+
+變更來源必須是 `infra/ansible/roles/nginx/templates/188-all-sites.conf.j2`，再用 `infra/ansible/playbooks/nginx-sync.yml` 收斂；禁止只改 188 live 檔而不回寫 Ansible baseline。
+
+備份告警有兩層，缺一不可：
+
+- `ops/monitoring/alerts-unified.yml` 是 repo canonical。
+- 110 live `/home/wooo/monitoring/alerts.yml` 與 `/home/wooo/monitoring/alerts-unified.canonical.yml` 必須一致，否則 `prometheus-rule-drift-guard` 可能把規則拉回舊版。
+
+重啟後必查：
+
+```bash
+curl -s http://127.0.0.1:9090/api/v1/rules \
+  | python3 -c 'import json,sys; d=json.load(sys.stdin); names=[r.get("name") for g in d["data"]["groups"] for r in g["rules"]]; print([n for n in ["BackupAggregateRunFailed","BackupConfigCapturePartial","BackupOffsiteCopyStale","BackupCredentialEscrowEvidenceMissing","ColdStartRecoveryBlocked"] if n not in names])'
+
+cat /home/wooo/node_exporter_textfiles/prometheus_rule_drift_guard.prom
+```
+
+若 120 尚未恢復，`BackupConfigCapturePartial{target="120-k3s-host-configs"}` 與 `ColdStartRecoveryBlocked`（cold-start blocked）是正確訊號，不可消音。120 恢復後再重跑：
+
+```bash
+/backup/scripts/backup-configs.sh
+/backup/scripts/backup-all.sh
+/backup/scripts/sync-offsite-backups.sh --mode sync
+/backup/scripts/verify-offsite-full-sync.sh --write-textfile --no-color
+```
+
+### 13.7 2026-05-29 補充：momo PostgreSQL Index 與資料同步
+
+`mo.wooo.work` 不能只看 `/health` 或首頁 200。重啟或 fsck 後，PostgreSQL index 可能損壞，使匯入流程表面完成，但 `daily_sales_snapshot` 未同步到 `realtime_sales_monthly`。本次症狀：
+
+- `daily_sales_snapshot` 已有 2026-05-01 到 2026-05-28 的 17,353 筆。
+- `realtime_sales_monthly` 同日期範圍為 0 筆。
+- momo-scheduler log 出現 PostgreSQL 內部錯誤 `posting list tuple ... cannot be split`。
+
+標準處理順序：
+
+```bash
+# 188 / momo-db，只重建索引，不刪資料
+docker exec -i momo-db bash -lc 'psql -U "$POSTGRES_USER" -d "$POSTGRES_DB" -v ON_ERROR_STOP=1' <<'SQL'
+REINDEX TABLE CONCURRENTLY public.realtime_sales_monthly;
+SQL
+```
+
+重建索引後，才可針對缺漏日期做 idempotent 補同步。正式作法必須先確認 `realtime_sales_monthly` 該日期範圍筆數，若非 0，需先保存查詢結果並確認是否重跑同範圍同步；不可整表 truncate、不可整庫 restore。補同步後至少驗證：
+
+```sql
+SELECT count(*), min(snapshot_date::date), max(snapshot_date::date)
+FROM daily_sales_snapshot
+WHERE snapshot_date::date BETWEEN DATE '2026-05-01' AND DATE '2026-05-28';
+
+SELECT count(*), min("日期"::date), max("日期"::date)
+FROM realtime_sales_monthly
+WHERE "日期"::date BETWEEN DATE '2026-05-01' AND DATE '2026-05-28';
+```
+
+兩張表同日期範圍筆數與日期上下界必須一致。完成後清除 momo 應用 cache：
+
+```bash
+docker exec momo-pro-system python -c 'from services.cache_service import clear_all_cache; clear_all_cache(); print("cache_cleared")'
+```
+
 ---
 
 ## 14. Done Criteria
@@ -604,6 +682,7 @@ All must be true:
 - AWOOOI API and Web reachable through NodePort/VIP.
 - Alertmanager E2E webhook succeeds.
 - cron/CronJob schedules are active, unsuspended, and verified.
+- momo `daily_sales_snapshot` 與 `realtime_sales_monthly` 在最新匯入日期範圍內筆數一致。
 - Sentry and SignOz are either healthy or explicitly in controlled backlog recovery.
 - High-load batch services are capped or delayed.
 - Runners are guarded and released last.
diff --git a/infra/ansible/roles/nginx/templates/188-all-sites.conf.j2 b/infra/ansible/roles/nginx/templates/188-all-sites.conf.j2
index a9936ea7..47687632 100644
--- a/infra/ansible/roles/nginx/templates/188-all-sites.conf.j2
+++ b/infra/ansible/roles/nginx/templates/188-all-sites.conf.j2
@@ -1,145 +1,268 @@
 # 188-all-sites.conf.j2
-# AWOOOI Nginx 全站設定 — 由 Ansible nginx-sync.yml playbook 管理
-# 禁止直接手改此檔案 → 請修改 roles/nginx/templates/188-all-sites.conf.j2
-# 部署指令: ansible-playbook -i inventory/hosts.yml playbooks/nginx-sync.yml --tags 188
-# 最後同步: {{ ansible_date_time.iso8601 }}
-
-# ============================================================
-# OpenClaw (port 8088)
-# ============================================================
+# AWOOOI 188 public gateway baseline managed by infra/ansible/playbooks/nginx-sync.yml.
+# 2026-05-29 Codex: synced from live 188 after reboot recovery; aiops.wooo.work
+# must use the K3s VIP 192.168.0.125:32334/32335 instead of a single 120 node.
+#
+# =============================================================================
+# AIOPS - aiops.wooo.work
+# =============================================================================
 server {
     listen 80;
-    server_name openclaw.awoooi.com;
+    server_name aiops.wooo.work;
+    return 301 https://$server_name$request_uri;
+}
 
-    location / {
-        proxy_pass http://127.0.0.1:8088;
+server {
+    listen 443 ssl http2;
+    server_name aiops.wooo.work;
+
+    ssl_certificate /etc/letsencrypt/live/aiops.wooo.work/fullchain.pem;
+    ssl_certificate_key /etc/letsencrypt/live/aiops.wooo.work/privkey.pem;
+
+    # API
+    location /api/ {
+        proxy_pass http://192.168.0.125:32334/api/;
+        proxy_http_version 1.1;
         proxy_set_header Host $host;
         proxy_set_header X-Real-IP $remote_addr;
         proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+    }
+
+    # WebSocket
+    location /api/v1/ws {
+        proxy_pass http://192.168.0.125:32334/api/v1/ws;
+        proxy_http_version 1.1;
+        proxy_set_header Upgrade $http_upgrade;
+        proxy_set_header Connection "upgrade";
+        proxy_set_header Host $host;
+    }
+
+    # Frontend
+    location / {
+        proxy_pass http://192.168.0.125:32335;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+    }
+}
+
+# =============================================================================
+# GitLab - gitlab.wooo.work (代理到 110)
+# =============================================================================
+server {
+    listen 80;
+    server_name gitlab.wooo.work;
+    return 301 https://$server_name$request_uri;
+}
+
+server {
+    listen 443 ssl http2;
+    server_name gitlab.wooo.work;
+
+    ssl_certificate /etc/letsencrypt/live/gitlab.wooo.work/fullchain.pem;
+    ssl_certificate_key /etc/letsencrypt/live/gitlab.wooo.work/privkey.pem;
+
+    client_max_body_size 500m;
+
+    location / {
+        proxy_pass http://192.168.0.110:8929;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
         proxy_read_timeout 300s;
+        proxy_connect_timeout 300s;
     }
 }
 
-# ============================================================
-# tsenyang (port 3000)
-# ============================================================
+# =============================================================================
+# SigNoz - signoz.wooo.work
+# =============================================================================
 server {
     listen 80;
-    server_name tsenyang.awoooi.com;
-
-    location / {
-        proxy_pass http://127.0.0.1:3000;
-        proxy_set_header Host $host;
-        proxy_set_header X-Real-IP $remote_addr;
-        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
-    }
-}
-
-# ============================================================
-# momo (port 5003)
-# ============================================================
-server {
-    listen 80;
-    server_name momo.awoooi.com;
-
-    location / {
-        proxy_pass http://127.0.0.1:5003;
-        proxy_set_header Host $host;
-        proxy_set_header X-Real-IP $remote_addr;
-        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
-    }
-}
-
-# ============================================================
-# SignOz (port 3301)
-# ============================================================
-server {
-    listen 80;
-    server_name signoz.awoooi.internal;
+    server_name signoz.wooo.work;
 
     location / {
         proxy_pass http://127.0.0.1:3301;
+        proxy_http_version 1.1;
         proxy_set_header Host $host;
         proxy_set_header X-Real-IP $remote_addr;
+    }
+}
+
+# =============================================================================
+# Tsenyang - www.tsenyang.com (待遷移；目前 proxy_pass 到本機 127.0.0.1:3000)
+# =============================================================================
+server {
+    listen 80;
+    server_name www.tsenyang.com tsenyang.com;
+    return 301 https://$server_name$request_uri;
+}
+
+server {
+    listen 443 ssl http2;
+    server_name www.tsenyang.com tsenyang.com;
+
+    ssl_certificate /etc/letsencrypt/live/www.tsenyang.com/fullchain.pem;
+    ssl_certificate_key /etc/letsencrypt/live/www.tsenyang.com/privkey.pem;
+
+    location / {
+        proxy_pass http://127.0.0.1:3000;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+    }
+}
+
+# =============================================================================
+# Stock Platform - stock.wooo.work
+# =============================================================================
+server {
+    listen 80;
+    server_name stock.wooo.work;
+
+    location /.well-known/acme-challenge/ {
+        root /var/www/html;
+    }
+
+    location / {
+        return 301 https://$server_name$request_uri;
+    }
+}
+
+server {
+    listen 443 ssl http2;
+    server_name stock.wooo.work;
+
+    ssl_certificate /etc/letsencrypt/live/stock.wooo.work/fullchain.pem;
+    ssl_certificate_key /etc/letsencrypt/live/stock.wooo.work/privkey.pem;
+
+    # 後台直接接收，不經由網站主站 Basic Auth
+    location = /admin {
+        return 301 /admin/;
+    }
+
+    location /admin/ {
+        auth_basic off;
+        proxy_pass http://192.168.0.110:31235;
+        proxy_http_version 1.1;
         proxy_set_header Upgrade $http_upgrade;
         proxy_set_header Connection "upgrade";
-    }
-}
-
-# ============================================================
-# MinIO (port 9000 API / 9001 Console)
-# ============================================================
-server {
-    listen 80;
-    server_name minio.awoooi.internal;
-
-    location / {
-        proxy_pass http://127.0.0.1:9001;
         proxy_set_header Host $host;
         proxy_set_header X-Real-IP $remote_addr;
-        client_max_body_size 500m;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+        proxy_buffering off;
+    }
+
+    # 前台主站
+    location / {
+        proxy_pass http://192.168.0.110:31235;
+        proxy_http_version 1.1;
+        proxy_set_header Upgrade $http_upgrade;
+        proxy_set_header Connection "upgrade";
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
     }
 }
 
-# ============================================================
-# LiteLLM (port 4000)
-# ============================================================
+# =============================================================================
+# MOMO PRO - mo.wooo.work (已上線；proxy_pass 到本機 127.0.0.1:5003)
+# =============================================================================
 server {
     listen 80;
-    server_name litellm.awoooi.internal;
+    server_name mo.wooo.work;
+    return 301 https://$server_name$request_uri;
+}
+
+server {
+    listen 443 ssl http2;
+    server_name mo.wooo.work;
+
+    ssl_certificate /etc/letsencrypt/live/mo.wooo.work/fullchain.pem;
+    ssl_certificate_key /etc/letsencrypt/live/mo.wooo.work/privkey.pem;
 
     location / {
-        proxy_pass http://127.0.0.1:4000;
+        proxy_pass http://127.0.0.1:5003;
+        proxy_http_version 1.1;
         proxy_set_header Host $host;
         proxy_set_header X-Real-IP $remote_addr;
         proxy_read_timeout 300s;
     }
 }
 
-# ============================================================
-# n8n (port 5678)
-# ============================================================
+# =============================================================================
+# Bitan 藥局 - bitan.wooo.work (待部署)
+# =============================================================================
 server {
     listen 80;
-    server_name n8n.awoooi.internal;
+    server_name bitan.wooo.work;
+    return 301 https://$server_name$request_uri;
+}
+
+server {
+    listen 443 ssl http2;
+    server_name bitan.wooo.work;
+
+    ssl_certificate /etc/letsencrypt/live/bitan.wooo.work/fullchain.pem;
+    ssl_certificate_key /etc/letsencrypt/live/bitan.wooo.work/privkey.pem;
+
+    client_max_body_size 25m;
 
     location / {
-        proxy_pass http://127.0.0.1:5678;
-        proxy_set_header Host $host;
-        proxy_set_header X-Real-IP $remote_addr;
+        proxy_pass http://192.168.0.110:3003;
+        proxy_http_version 1.1;
         proxy_set_header Upgrade $http_upgrade;
         proxy_set_header Connection "upgrade";
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
     }
 }
 
-# ============================================================
-# Open WebUI (port 3010)
-# ============================================================
+# =============================================================================
+# VTuber - vtuber.wooo.work
+# =============================================================================
 server {
-    listen 80;
-    server_name open-webui.awoooi.internal;
+    server_name vtuber.wooo.work;
+
+    location /.well-known/acme-challenge/ {
+        root /var/www/html;
+    }
 
     location / {
-        proxy_pass http://127.0.0.1:3010;
-        proxy_set_header Host $host;
-        proxy_set_header X-Real-IP $remote_addr;
+        proxy_pass https://192.168.0.110;
+        proxy_http_version 1.1;
         proxy_set_header Upgrade $http_upgrade;
         proxy_set_header Connection "upgrade";
-        proxy_read_timeout 300s;
-    }
-}
-
-# ============================================================
-# Docker Registry (port 5001)
-# ============================================================
-server {
-    listen 80;
-    server_name registry.awoooi.internal;
-
-    location / {
-        proxy_pass http://127.0.0.1:5001;
         proxy_set_header Host $host;
         proxy_set_header X-Real-IP $remote_addr;
-        client_max_body_size 2g;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
     }
+
+    listen 443 ssl; # managed by Certbot
+    ssl_certificate /etc/letsencrypt/live/vtuber.wooo.work/fullchain.pem; # managed by Certbot
+    ssl_certificate_key /etc/letsencrypt/live/vtuber.wooo.work/privkey.pem; # managed by Certbot
+    include /etc/letsencrypt/options-ssl-nginx.conf; # managed by Certbot
+    ssl_dhparam /etc/letsencrypt/ssl-dhparams.pem; # managed by Certbot
+
+}
+
+server {
+    if ($host = vtuber.wooo.work) {
+        return 301 https://$host$request_uri;
+    } # managed by Certbot
+
+
+    listen 80;
+    server_name vtuber.wooo.work;
+    return 404; # managed by Certbot
+
+
 }
diff --git a/k8s/monitoring/prometheus.yml b/k8s/monitoring/prometheus.yml
index 2452a9fd..14ad7764 100644
--- a/k8s/monitoring/prometheus.yml
+++ b/k8s/monitoring/prometheus.yml
@@ -57,8 +57,8 @@ scrape_configs:
           - https://mo.wooo.work
           - http://192.168.0.188:4000/health/liveliness
           - http://192.168.0.110:3001
-          - http://192.168.0.120:31234
-          - http://192.168.0.120:31235
+          - http://192.168.0.125:32334/api/v1/health
+          - http://192.168.0.125:32335
           - https://www.tsenyang.com
           - http://stock.wooo.work
           - https://bitan.wooo.work
@@ -93,8 +93,8 @@ scrape_configs:
           - 192.168.0.188:6380
           - 192.168.0.188:8089
           # K3s Worker
-          - 192.168.0.120:31234
-          - 192.168.0.120:31235
+          - 192.168.0.125:32334
+          - 192.168.0.125:32335
     relabel_configs:
       - source_labels: [__address__]
         target_label: __param_target
diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml
index 56163c6e..9521dec1 100644
--- a/ops/monitoring/alerts-unified.yml
+++ b/ops/monitoring/alerts-unified.yml
@@ -15,6 +15,39 @@
 
 groups:
 
+  # =========================================================================
+  # Full-stack recovery scorecard recording rules
+  # =========================================================================
+  - name: full_stack_recovery_scorecard_rules
+    interval: 60s
+    rules:
+      - record: awoooi_recovery_core_ready
+        expr: |
+          sum without(result) (
+            awoooi_cold_start_last_result{host="110",scope="110_120_121_188",result="green"} == bool 1
+          )
+          * on(host,scope) (
+            awoooi_cold_start_warn_gates{host="110",scope="110_120_121_188"} == bool 0
+          )
+          * on(host,scope) (
+            awoooi_cold_start_blocked_gates{host="110",scope="110_120_121_188"} == bool 0
+          )
+          * on(host,scope) (
+            (time() - awoooi_cold_start_last_green_timestamp{host="110",scope="110_120_121_188"}) < bool 3600
+          )
+
+      - record: awoooi_recovery_dr_offsite_ready
+        expr: |
+          max by(host) (
+            awoooi_backup_offsite_configured{host="110"} == bool 1
+          )
+          * on(host) max by(host) (
+            awoooi_backup_offsite_fresh{host="110"} == bool 1
+          )
+          * on(host) min by(host) (
+            awoooi_backup_credential_escrow_fresh{host="110"} == bool 1
+          )
+
   # =========================================================================
   # 主機層告警 (host_alerts)
   # =========================================================================
@@ -41,7 +74,7 @@ groups:
           severity: warning
           layer: systemd-188
           team: ops
-          auto_repair: "true"
+          auto_repair: "false"
           # MCP Phase 2a (ADR-071, 2026-04-11 Claude Sonnet 4.6): SSH MCP 路由標籤
           mcp_provider: "ssh_host"
           host_type: "bare_metal"
@@ -49,9 +82,6 @@ groups:
         annotations:
           summary: "主機 {{ $labels.host }} CPU 高負載"
           description: "CPU 使用率超過 90% 持續 10 分鐘；若 load5/core 未超過 1.5，先視為容量觀察與診斷，不直接修復。"
-          # 2026-05-02 ogt + Claude Sonnet 4.6: 引導 LLM 走 SSH 診斷而非 kubectl
-          auto_repair_action: "ssh {{ $labels.instance }} 'ps aux --sort=-%cpu | head -20' (host CPU 診斷；禁 kubectl restart awoooi-* — 主因常為第三方服務 Sentry/ClickHouse/Snuba)"
-          runbook: "host CPU 高負載排查：先 SSH ps aux 看 top 進程；若為第三方服務（Sentry/ClickHouse 等）寫 ADR 升級資源或調 limit，禁止 kubectl restart 跨 domain"
 
       - alert: HostLoadAverageSustainedHigh
         # 2026-05-05 ogt + Codex: 110/188 長時間過載基線。
@@ -86,9 +116,6 @@ groups:
         annotations:
           summary: "主機 {{ $labels.host }} 記憶體不足"
           description: "記憶體使用率超過 85%"
-          # 2026-05-02 ogt + Claude Sonnet 4.6: 引導 LLM 走 SSH 診斷
-          auto_repair_action: "ssh {{ $labels.instance }} 'ps aux --sort=-%mem | head -20' (host 記憶體診斷；禁 kubectl restart — 主因常為第三方服務)"
-          runbook: "host 記憶體不足排查：SSH 看 top 進程；若為第三方服務需擴容或調 limit"
 
       - alert: HostOutOfDiskSpace
         expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 85
@@ -173,7 +200,7 @@ groups:
           description: "過去 24 小時有備份失敗"
 
       - alert: VeleroBackupNotRun
-        expr: time() - velero_backup_last_successful_timestamp > 86400
+        expr: max by(host, namespace) (awoooi_velero_latest_completed_backup_fresh{host="110",namespace="velero"}) == 0
         for: 10m
         labels:
           severity: critical
@@ -183,7 +210,7 @@ groups:
           auto_repair: "false"
         annotations:
           summary: "Velero 超過 24 小時未成功備份"
-          description: "最後一次成功備份超過 24 小時"
+          description: "backup health exporter 顯示 latest Completed Velero backup 超過 25 小時或不存在。"
 
       # Sprint C-2 Host rsync 備份告警 (2026-04-11 Claude Sonnet 4.6)
       # backup-from-110.sh 成功時寫入 /var/run/backup-110.last_success
@@ -521,26 +548,8 @@ groups:
           team: platform
           auto_repair: "false"
         annotations:
-          summary: "Alertmanager 主鏈路 2 小時內未收到告警"
-          description: "Alertmanager 是固定主鏈路；Sentry/SignOz 沉默不代表鏈路故障，錯誤率另有 AlertChainBroken_* 規則監控。請執行 Smoke Test"
-
-      - alert: SourceProviderIngestionStale
-        expr: |
-          time() - max by (source) (
-            awoooi_alert_chain_last_success_timestamp{source=~"sentry|signoz"}
-          ) > 86400
-        for: 15m
-        labels:
-          severity: warning
-          layer: k8s
-          component: source-ingestion
-          team: platform
-          auto_repair: "false"
-          alert_category: "alertchain_provider_freshness"
-        annotations:
-          summary: "{{ $labels.source }} source ingestion 超過 24 小時未更新"
-          description: "{{ $labels.source }} webhook endpoint 可能仍健康，但 AwoooP source dossier 已超過 24 小時沒有新事件。這是 provider ingestion / upstream smoke / correlation freshness 缺口，不是 Alertmanager 主鏈路故障。"
-          runbook: "先查 /api/v1/webhooks/{{ $labels.source }}/health，再查 /api/v1/platform/events/dossier/coverage?provider={{ $labels.source }}；若 endpoint OK 但 latest stale，檢查上游 Sentry/SignOz notification channel 或排程 smoke。"
+          summary: "2 小時內未收到任何告警 ({{ $labels.source }})"
+          description: "可能是告警鏈路問題，請執行 Smoke Test"
 
       - alert: AlertChainUnhealthy
         expr: awoooi_alert_chain_healthy == 0
@@ -633,8 +642,6 @@ groups:
           alert_category: infrastructure
           notification_type: TYPE-1
           auto_repair: "true"
-          mcp_provider: "ssh_host"
-          host_type: "bare_metal"
         annotations:
           summary: "容器 {{ $labels.container_name }} CPU 持續超過 2 core"
           description: "{{ $labels.container_name }} 5m CPU 使用 >2 core 且持續 10 分鐘，需檢查是否缺 cpus limit、merge backlog、爬蟲尖峰或模型推理。"
@@ -651,8 +658,6 @@ groups:
           alert_category: infrastructure
           notification_type: TYPE-3
           auto_repair: "true"
-          mcp_provider: "ssh_host"
-          host_type: "bare_metal"
         annotations:
           summary: "容器 {{ $labels.container_name }} CPU 持續超過 4 core"
           description: "{{ $labels.container_name }} 已持續吃超過 4 core，會拖垮 110/188 主機；需要 AI 判斷是否限流、暫停 worker 或套用既有 playbook。"
@@ -670,8 +675,6 @@ groups:
           alert_category: infrastructure
           notification_type: TYPE-1
           auto_repair: "true"
-          mcp_provider: "ssh_host"
-          host_type: "bare_metal"
         annotations:
           summary: "容器 {{ $labels.container_name }} 記憶體超過 limit 85%"
           description: "{{ $labels.container_name }} memory / limit > 85% 持續 10 分鐘。若是 Kafka/ClickHouse/Gitea/Taskworker，需先判斷 workload，不可直接降 limit。"
@@ -689,8 +692,6 @@ groups:
           alert_category: infrastructure
           notification_type: TYPE-3
           auto_repair: "true"
-          mcp_provider: "ssh_host"
-          host_type: "bare_metal"
         annotations:
           summary: "容器 {{ $labels.container_name }} 15 分鐘重啟超過 5 次"
           description: "Docker restart textfile exporter 顯示 {{ $labels.container_name }} 重啟暴增，避免再次出現 litellm 24,464 次靜默崩潰。"
@@ -708,8 +709,6 @@ groups:
           alert_category: infrastructure
           notification_type: TYPE-1
           auto_repair: "true"
-          mcp_provider: "ssh_host"
-          host_type: "bare_metal"
         annotations:
           summary: "容器 {{ $labels.container_name }} 缺 CPU 或 memory limit"
           description: "{{ $labels.container_name }} 在 Docker Host 上沒有完整 CPU/memory guardrail，長時間尖峰可能拖垮 110/188。"
@@ -727,8 +726,6 @@ groups:
           alert_category: infrastructure
           notification_type: TYPE-1
           auto_repair: "true"
-          mcp_provider: "ssh_host"
-          host_type: "bare_metal"
         annotations:
           summary: "Gitea Actions job {{ $labels.container_name }} 執行超過 20 分鐘"
           description: "{{ $labels.container_name }} 已超過 20 分鐘，110 曾出現 code-review/CD job timeout 失效而持續佔用 runner。"
@@ -746,8 +743,6 @@ groups:
           alert_category: infrastructure
           notification_type: TYPE-3
           auto_repair: "true"
-          mcp_provider: "ssh_host"
-          host_type: "bare_metal"
         annotations:
           summary: "Systemd runner {{ $labels.unit }} 15 分鐘重啟超過 2 次"
           description: "{{ $labels.unit }} 在 15 分鐘內重啟暴增；110 曾發生 WatchdogSec=5min 造成 runner 每 5 分鐘自殺重啟。"
@@ -764,8 +759,6 @@ groups:
           alert_category: infrastructure
           notification_type: TYPE-1
           auto_repair: "true"
-          mcp_provider: "ssh_host"
-          host_type: "bare_metal"
         annotations:
           summary: "Systemd runner {{ $labels.unit }} 啟用了 WatchdogSec"
           description: "{{ $labels.unit }} WatchdogSec={{ $value }} 秒。GitHub Actions runner service 不應被 systemd watchdog 週期性殺掉。"
@@ -782,111 +775,12 @@ groups:
           alert_category: infrastructure
           notification_type: TYPE-1
           auto_repair: "true"
-          mcp_provider: "ssh_host"
-          host_type: "bare_metal"
         annotations:
           summary: "Systemd runner {{ $labels.unit }} 缺 CPU 或 memory quota"
           description: "{{ $labels.unit }} 仍為 unlimited；CI runner 會與 Sentry/ClickHouse/Gitea 搶主機 CPU。"
           auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'systemctl show {{ $labels.unit }} -p CPUQuotaPerSecUSec -p MemoryMax -p ActiveState -p SubState'"
           runbook: "建議 baseline：每個 runner CPUQuota=200%、MemoryMax=2G；由 /home/wooo/scripts/apply-runner-systemd-guardrails.sh 套用，若仍過載再限制並行度或分流。"
 
-  # =========================================================================
-  # Full-stack reboot/cold-start gate monitor
-  # =========================================================================
-  - name: cold_start_recovery_alerts
-    interval: 60s
-    rules:
-      - alert: ColdStartMonitorMissing
-        # 2026-05-06 ogt + Codex: full-stack reboot recovery must have a durable signal,
-        # not only a one-off terminal transcript.
-        expr: absent(awoooi_cold_start_monitor_up{host="110"})
-        for: 20m
-        labels:
-          severity: warning
-          layer: host-110
-          team: ops
-          alert_category: infrastructure
-          notification_type: TYPE-1
-          auto_repair: "true"
-          mcp_provider: "ssh_host"
-          host_type: "bare_metal"
-        annotations:
-          summary: "冷啟動 gate monitor 20 分鐘無指標"
-          description: "110 沒有暴露 awoooi_cold_start_monitor_up，代表 full-stack cold-start gate 沒有被 Prometheus 監控。"
-          auto_repair_action: "ssh 192.168.0.110 'crontab -l | sed -n \"/AWOOOI cold-start monitor start/,/AWOOOI cold-start monitor end/p\"; ls -l /home/wooo/node_exporter_textfiles/cold_start_recovery.prom /home/wooo/reboot-recovery/cold-start-last.log 2>/dev/null || true'"
-          runbook: "執行 scripts/reboot-recovery/install-cold-start-monitor-110.sh；只安裝 read-only textfile exporter，不需要 sudo。"
-
-      - alert: ColdStartMonitorStale
-        expr: time() - awoooi_cold_start_last_run_timestamp{host="110"} > 1800
-        for: 10m
-        labels:
-          severity: warning
-          layer: host-110
-          team: ops
-          alert_category: infrastructure
-          notification_type: TYPE-1
-          auto_repair: "true"
-          mcp_provider: "ssh_host"
-          host_type: "bare_metal"
-        annotations:
-          summary: "冷啟動 gate monitor 超過 30 分鐘未更新"
-          description: "cold_start_recovery.prom stale，無法確認 110/120/121/188 的重開機 gate 是否仍維持健康。"
-          auto_repair_action: "ssh 192.168.0.110 'tail -80 /tmp/awoooi-cold-start-monitor.cron.log 2>/dev/null || true; tail -120 /home/wooo/reboot-recovery/cold-start-last.log 2>/dev/null || true'"
-          runbook: "檢查 110 user cron、SSH key、/home/wooo/node_exporter_textfiles 權限；不要把 stale 當作服務可用。"
-
-      - alert: ColdStartRecoveryBlocked
-        expr: awoooi_cold_start_blocked_gates{host="110"} > 0 or awoooi_cold_start_last_result{host="110",result="blocked"} == 1
-        for: 5m
-        labels:
-          severity: critical
-          layer: full-stack
-          team: ops
-          alert_category: infrastructure
-          notification_type: TYPE-3
-          auto_repair: "true"
-          mcp_provider: "ssh_host"
-          host_type: "bare_metal"
-        annotations:
-          summary: "全棧冷啟動 gate 有 BLOCKED"
-          description: "full-stack cold-start check 偵測到 {{ $value }} 個 blocked gate。AI 自動修復只能先蒐證與通知，不可釋放 runner/CD 或重啟 stateful service。"
-          auto_repair_action: "ssh 192.168.0.110 'tail -220 /home/wooo/reboot-recovery/cold-start-last.log'"
-          runbook: "從第一個 BLOCKED gate 開始修；遵守 docs/runbooks/FULL-STACK-COLD-START-SOP.md 的 phase order。"
-
-      - alert: ColdStartRecoveryDegraded
-        expr: awoooi_cold_start_warn_gates{host="110"} > 0 or awoooi_cold_start_last_result{host="110",result="degraded"} == 1
-        for: 30m
-        labels:
-          severity: warning
-          layer: full-stack
-          team: ops
-          alert_category: infrastructure
-          notification_type: TYPE-1
-          auto_repair: "true"
-          mcp_provider: "ssh_host"
-          host_type: "bare_metal"
-        annotations:
-          summary: "全棧冷啟動 gate 持續 degraded"
-          description: "full-stack cold-start check 連續 30 分鐘有 WARN。此狀態不可宣告 reboot recovery 完成，也不可釋放高負載 runner/CD。"
-          auto_repair_action: "ssh 192.168.0.110 'tail -180 /home/wooo/reboot-recovery/cold-start-last.log'"
-          runbook: "清掉 WARN 後再執行 final gate：bash scripts/reboot-recovery/full-stack-cold-start-check.sh --watch --interval 60 --max-attempts 30 --send-alert-test。"
-
-      - alert: ColdStartLastGreenTooOld
-        expr: (time() - awoooi_cold_start_last_green_timestamp{host="110"} > 21600) and awoooi_cold_start_last_green_timestamp{host="110"} > 0
-        for: 30m
-        labels:
-          severity: warning
-          layer: full-stack
-          team: ops
-          alert_category: infrastructure
-          notification_type: TYPE-1
-          auto_repair: "false"
-          mcp_provider: "ssh_host"
-          host_type: "bare_metal"
-        annotations:
-          summary: "全棧 cold-start monitor 超過 6 小時沒有 GREEN"
-          description: "上次 GREEN 已超過 6 小時，表示冷啟動 baseline 長期沒有完整通過。"
-          runbook: "檢查 /home/wooo/reboot-recovery/cold-start-last.log；若僅因 read-only monitor 缺 final webhook POST，應修 monitor mode 而不是關告警。"
-
   # =========================================================================
   # MinIO / Kali 告警
   # =========================================================================
@@ -1152,10 +1046,10 @@ groups:
   # 備份還原告警 (awoooi_backup_restore) — 從主機補回 2026-04-12
   # =========================================================================
   - name: awoooi_backup_restore
-    interval: 1h
+    interval: 1m
     rules:
       - alert: BackupRestoreTestFailed
-        expr: awoooi_backup_restore_test_success == 0
+        expr: max by(host, namespace, cronjob) (awoooi_velero_restore_test_failed_jobs{host="110",namespace="velero",cronjob="backup-restore-test"}) > 0
         for: 5m
         labels:
           severity: critical
@@ -1164,11 +1058,37 @@ groups:
           auto_repair: "false"
         annotations:
           summary: "備份還原 dry-run 測試失敗"
-          description: "Velero restore dry-run 失敗，備份可能無法還原。立即人工驗證備份狀態。"
-          runbook: "執行 velero backup describe awoooi-daily 及 velero restore create --from-backup awoooi-daily --dry-run"
+          description: "velero namespace 中保留了失敗的 backup-restore-test Job，備份可能無法還原。立即人工驗證備份狀態。"
+          runbook: "先找最新 Completed Velero backup，再執行 restore dry-run；禁止在 production namespace 做真還原"
+
+      - alert: BackupRestoreTestMissing
+        expr: absent(awoooi_velero_restore_test_cron_present{host="110",namespace="velero",cronjob="backup-restore-test"})
+        for: 30m
+        labels:
+          severity: warning
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"
+        annotations:
+          summary: "備份還原 dry-run 監控指標缺失"
+          description: "Prometheus 沒有收到 awoooi_velero_restore_test_cron_present；110 backup health exporter 或 120 kubectl 查詢可能失效。"
+          runbook: "檢查 110 backup_health.prom、SSH 110→120、以及 velero namespace 的 backup-restore-test CronJob"
+
+      - alert: BackupRestoreTestCronMissing
+        expr: max by(host, namespace, cronjob) (awoooi_velero_restore_test_cron_present{host="110",namespace="velero",cronjob="backup-restore-test"}) == 0
+        for: 15m
+        labels:
+          severity: critical
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"
+        annotations:
+          summary: "備份還原 dry-run CronJob 缺失"
+          description: "velero namespace 找不到 backup-restore-test CronJob；備份可還原性沒有定期驗證。"
+          runbook: "kubectl apply k8s/awoooi-prod/17-configmap-backup-restore-scripts.yaml 與 16-cronjob-backup-restore-test.yaml"
 
       - alert: BackupRestoreTestStale
-        expr: (time() - awoooi_backup_restore_test_timestamp_seconds) > 691200
+        expr: max by(host, namespace, cronjob) (awoooi_velero_restore_test_last_success_fresh{host="110",namespace="velero",cronjob="backup-restore-test"}) == 0
         for: 10m
         labels:
           severity: warning
@@ -1177,9 +1097,375 @@ groups:
           auto_repair: "false"
         annotations:
           summary: "備份還原測試超過 8 天未執行"
-          description: "上次備份測試距今 {{ $value | humanizeDuration }}，週排程 CronJob 可能失效。"
+          description: "backup-restore-test CronJob 沒有 8 天內成功紀錄；週排程 CronJob 可能失效。"
           runbook: "檢查 velero namespace 中的 backup-restore-test CronJob 狀態"
 
+  # =========================================================================
+  # Host / service / config backup health
+  # =========================================================================
+  - name: full_stack_backup_health_alerts
+    interval: 1m
+    rules:
+      - alert: BackupHealthMonitorMissing110
+        expr: absent(awoooi_backup_health_monitor_up{host="110"})
+        for: 20m
+        labels:
+          severity: warning
+          layer: host-backup
+          component: backup-health-monitor
+          host: "110"
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "110 備份健康指標缺失"
+          description: "110 沒有輸出 backup_health.prom，無法確認資料庫、設定檔與服務備份是否新鮮。"
+          runbook: "用 Ansible `110-devops.yml --tags textfile_exporters` 或手動部署 scripts/ops/backup-health-textfile-exporter.py"
+
+      - alert: BackupHealthMonitorMissing188
+        expr: absent(awoooi_backup_health_monitor_up{host="188"})
+        for: 20m
+        labels:
+          severity: warning
+          layer: host-backup
+          component: backup-health-monitor
+          host: "188"
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "188 備份健康指標缺失"
+          description: "188 沒有輸出 backup_health.prom，無法確認 110 rsync 與 momo PostgreSQL 備份是否新鮮。"
+          runbook: "用 Ansible `188-ai-web.yml --tags textfile_exporters` 或手動部署 scripts/ops/backup-health-textfile-exporter.py"
+
+      - alert: BackupHealthMonitorStale
+        expr: time() - awoooi_backup_health_last_run_timestamp{host=~"110|188"} > 1800
+        for: 10m
+        labels:
+          severity: warning
+          layer: host-backup
+          component: backup-health-monitor
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "{{ $labels.host }} 備份健康 exporter 超過 30 分鐘未更新"
+          description: "backup health textfile exporter stale，備份狀態不可觀測。"
+          runbook: "SSH 主機檢查 cron、/tmp/awoooi-backup-health-textfile-exporter.cron.log 與 node-exporter textfile collector"
+
+      - alert: BackupExpectedJobMissing
+        expr: awoooi_backup_job_configured{host=~"110|188"} == 0
+        for: 15m
+        labels:
+          severity: critical
+          layer: host-backup
+          component: backup-cron
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"
+        annotations:
+          summary: "{{ $labels.host }} 備份排程缺失：{{ $labels.exported_job }}"
+          description: "預期備份 cron/config 不存在；下一次重開機後資料可能沒有可用還原點。"
+          runbook: "依 docs/runbooks/FULL-STACK-COLD-START-SOP.md 的備份章節補回 cron，先 dry-run 再執行"
+
+      - alert: BackupScheduleDuplicateActiveEntries
+        expr: awoooi_backup_cron_active_duplicate_count{host="110"} > 0
+        for: 15m
+        labels:
+          severity: warning
+          layer: host-backup
+          component: backup-cron
+          host: "110"
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "110 備份 crontab 有重複 active entries"
+          description: "110 crontab 目前有 {{ $value }} 個 exact duplicate active entry；可能造成 offsite sync、verifier 或 status job 重複執行。"
+          runbook: "SSH 110 執行 `crontab -l | awk 'NF && $0 !~ /^#/ {count[$0]++} END {for (line in count) if (count[line] > 1) print count[line], line}'`，只移除重複 active entry，不要刪除未理解的備份排程。"
+
+      - alert: BackupScheduleSingletonMismatch
+        expr: awoooi_backup_cron_singular_entry_ok{host="110"} == 0  # fires on the ok gauge being 0, so $value is always 0 here and is not an entry count
+        for: 15m
+        labels:
+          severity: warning
+          layer: host-backup
+          component: backup-cron
+          host: "110"
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "110 備份排程單一入口異常：{{ $labels.entry }}"
+          description: "{{ $labels.entry }} 應該剛好只有一個 active cron entry；目前 active entry 數量不是 1（可能為 0 個或多個），可能造成排程缺失或重複執行。"
+          runbook: "用 Ansible `110-devops.yml --tags backup_jobs` 收斂排程，並用 `scripts/reboot-recovery/reboot-recovery-readiness-audit.sh --live --no-color` 驗證。"
+
+      - alert: BackupScriptMissing
+        expr: awoooi_backup_script_present{host=~"110|188"} == 0
+        for: 15m
+        labels:
+          severity: critical
+          layer: host-backup
+          component: backup-script
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"
+        annotations:
+          summary: "{{ $labels.host }} 備份腳本缺失：{{ $labels.script }}"
+          description: "備份排程可能存在，但實際腳本不存在或路徑漂移。"
+          runbook: "從 repo 部署對應 scripts/backup 或 scripts/ops 腳本，確認權限 0755"
+
+      - alert: BackupJobStale
+        expr: awoooi_backup_job_fresh{host=~"110|188"} == 0
+        for: 15m
+        labels:
+          severity: critical
+          layer: host-backup
+          component: backup-freshness
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"
+        annotations:
+          summary: "{{ $labels.host }} 備份過舊：{{ $labels.exported_job }}"
+          description: "{{ $labels.exported_job }} 最新成功證據超過 {{ $labels.max_age_hours }} 小時或不存在；來源 {{ $labels.source }}，目標 {{ $labels.target }}。"
+          runbook: "先檢查備份 log 與磁碟空間，再手動執行對應備份；禁止直接刪除舊備份或 production 資料"
+
+      - alert: BackupAggregateRunFailed
+        expr: awoooi_backup_last_run_failed_count{host="110",exported_job="backup_all"} > 0
+        for: 10m
+        labels:
+          severity: warning
+          layer: host-backup
+          component: backup-all
+          host: "110"
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "110 全服務備份最近一次有 {{ $value }} 個失敗項目"
+          description: "backup-all.sh 最近一次 aggregate run 仍有失敗；即使個別 DB 備份已手動補跑，也要重跑 aggregate backup 清除紅燈。"
+          runbook: "SSH 110 檢查 /backup/logs/cron.log 與 /backup/logs/backup.log，修正後執行 /backup/scripts/backup-all.sh"
+
+      - alert: BackupConfigCapturePartial
+        expr: awoooi_backup_config_capture_ok{host="110",critical="true"} == 0
+        for: 10m
+        labels:
+          severity: warning
+          layer: host-backup
+          component: backup-config-capture
+          host: "110"
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "110 設定檔備份缺少關鍵目標：{{ $labels.target }}"
+          description: "configs restic snapshot 雖可能存在，但最新設定檔備份未成功捕捉 {{ $labels.target }}；source={{ $labels.source }}。這會影響下一次冷啟動或災難還原的設定還原完整性。"
+          runbook: "先修復對應主機或 K8s API 可達性，再執行 /backup/scripts/backup-configs.sh，確認 awoooi_backup_config_capture_ok 回到 1，最後補跑 Google Drive/rclone offsite sync。"
+
+      - alert: BackupConfigCaptureStatusStale
+        expr: absent(awoooi_backup_config_capture_status_timestamp{host="110"}) or (time() - awoooi_backup_config_capture_status_timestamp{host="110"} > 172800)
+        for: 30m
+        labels:
+          severity: warning
+          layer: host-backup
+          component: backup-config-capture
+          host: "110"
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "110 設定檔備份覆蓋率狀態缺失或過舊"
+          description: "backup-configs.sh 沒有新鮮的 capture status；無法判斷 110/120/121/188/K8s 設定檔是否真的被最新 snapshot 捕捉。"
+          runbook: "部署新版 /backup/scripts/backup-configs.sh 與 /home/wooo/scripts/backup-health-textfile-exporter.py，執行 /backup/scripts/backup-configs.sh 後刷新 textfile exporter。"
+
+      - alert: BackupIntegrityCheckMissingOrFailed
+        expr: absent(awoooi_backup_integrity_fresh{host="110",scope="restic_check"}) or awoooi_backup_integrity_fresh{host="110",scope="restic_check"} == 0
+        for: 30m
+        labels:
+          severity: critical
+          layer: host-backup
+          component: backup-integrity
+          host: "110"
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"
+        annotations:
+          summary: "110 備份倉庫完整性檢查缺失或失敗"
+          description: "每週 restic check 沒有成功證據，或有 repo 檢查失敗；目前不能假設備份可讀。"
+          runbook: "SSH 110 執行 `/backup/scripts/check-backup-integrity.sh --mode check`，先看 /backup/logs/backup-integrity.log；禁止刪 repo 或 prune 直到確認原因"
+
+      - alert: BackupRestoreDrillMissingOrFailed
+        expr: absent(awoooi_backup_integrity_fresh{host="110",scope="restore_drill"}) or awoooi_backup_integrity_fresh{host="110",scope="restore_drill"} == 0
+        for: 30m
+        labels:
+          severity: warning
+          layer: host-backup
+          component: backup-restore-drill
+          host: "110"
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "110 備份抽樣還原演練缺失或失敗"
+          description: "每月 restore drill 沒有成功證據，備份雖可能新鮮，但尚未驗證可讀取還原。"
+          runbook: "SSH 110 執行 `/backup/scripts/check-backup-integrity.sh --mode restore-drill`；只允許還原到隔離暫存目錄，不得覆蓋 production"
+
+      - alert: BackupOffsiteCopyNotConfigured
+        expr: sum by(host) (awoooi_backup_offsite_configured{host="110"}) == 0
+        for: 1m
+        labels:
+          severity: warning
+          layer: host-backup
+          component: backup-offsite
+          host: "110"
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "110 尚未配置離機備份 provider"
+          description: "backup health exporter 未偵測到 Google Drive/rclone 或其他 offsite provider 配置；本地 restic 全綠仍不等於異地可恢復。"
+          runbook: "在 110 以 `/backup/scripts/configure-offsite-rclone.sh --interactive` 建立 Google Drive remote，產生 `/backup/offsite/*last_success` 證據；不得把 provider token 寫入 repo、Telegram 或 Prometheus label。"
+
+      - alert: BackupOffsiteCopyStale
+        expr: |
+          (
+            (sum by(host) (awoooi_backup_offsite_configured{host="110"}) > 0)
+            and
+            (sum by(host) (awoooi_backup_offsite_fresh{host="110"}) == 0)
+          )
+          and
+          (
+            (sum by(host) (awoooi_backup_offsite_full_sync_enabled{host="110"}) == 0)
+            or
+            ((time() - max by(host) (awoooi_backup_offsite_full_sync_enabled_timestamp{host="110"})) > 30 * 3600)
+          )
+        for: 2h
+        labels:
+          severity: warning
+          layer: host-backup
+          component: backup-offsite
+          host: "110"
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "110 離機備份超過 48 小時未成功"
+          description: "已偵測到 offsite provider 配置，但沒有新鮮成功標記；本地備份可能無法抵抗整台 110 遺失。"
+          runbook: "SSH 110 檢查 Google Drive/rclone 同步 log 與 `/backup/offsite/*last_success`；full sync 需在 enable marker 與低負載門檻成立後由 `/backup/scripts/sync-offsite-backups.sh --mode sync` 鏡像本地 latest-only repo。"
+
+      - alert: BackupRetentionPolicyNotLatestOnly
+        expr: |
+          absent(awoooi_backup_retention_latest_only{host="110"})
+          or
+          awoooi_backup_retention_latest_only{host="110"} != 1
+          or
+          absent(awoooi_backup_retention_offsite_delete_old_enabled{host="110",provider="rclone"})
+          or
+          awoooi_backup_retention_offsite_delete_old_enabled{host="110",provider="rclone"} != 1
+        for: 15m
+        labels:
+          severity: warning
+          layer: host-backup
+          component: backup-retention
+          host: "110"
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "110 備份保留策略不是 latest-only"
+          description: "operator 要求所有備份只保留最新一份；本地 restic 必須 keep-last=1，Google Drive/rclone 必須在成功 mirror 後刪除遠端舊檔。"
+          runbook: "檢查 `/backup/scripts/common.sh` 的 BACKUP_RETENTION_MODE=latest、KEEP_LAST=1 與 OFFSITE_SYNC_DELETE_OLD=1，刷新 backup-health textfile；必要時在備份成功後執行 `/backup/scripts/enforce-latest-only-retention.sh`。"
+
+      - alert: BackupSnapshotRetentionExceeded
+        expr: awoooi_backup_job_snapshot_count{host="110",type="restic"} > 1
+        for: 30m
+        labels:
+          severity: warning
+          layer: host-backup
+          component: backup-retention
+          host: "110"
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "110 備份 repo {{ $labels.exported_job }} 保留超過 1 份 snapshot"
+          description: "{{ $labels.exported_job }} 目前有 {{ $value }} 份 restic snapshot；latest-only 策略要求每個 repo 全域只保留最新 1 份。"
+          runbook: "SSH 110 執行 `/backup/scripts/enforce-latest-only-retention.sh`；若仍未收斂，確認 `common.sh` 使用 `restic forget --group-by \"\" --keep-last 1 --prune`，避免 restic 依 path/tag 分組保留多份。"
+
+      - alert: BackupOffsiteFullVerifyFailed
+        expr: |
+          awoooi_backup_offsite_fresh{host="110",provider="rclone"} == 1
+          unless on(host, provider)
+          (awoooi_backup_offsite_remote_verify_ok{host="110",provider="rclone"} == 1)
+        for: 30m
+        labels:
+          severity: warning
+          layer: host-backup
+          component: backup-offsite
+          host: "110"
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "110 Google Drive full sync 完成但遠端驗證未通過"
+          description: "full offsite marker 已 fresh，但 verify-offsite-full-sync.sh 沒有證明 13 個 Google Drive repo 都可列出且符合 latest-only。"
+          runbook: "SSH 110 執行 `/backup/scripts/verify-offsite-full-sync.sh --write-textfile --no-color`，檢查 `/backup/logs/offsite-full-sync-verify.log` 與 `/home/wooo/node_exporter_textfiles/offsite_full_sync_verify.prom`。"
+
+      - alert: BackupOffsiteRemoteSnapshotRetentionExceeded
+        expr: |
+          (awoooi_backup_offsite_remote_snapshot_count{host="110",provider="rclone"} > 1)
+          and on(host, provider)
+          (awoooi_backup_offsite_fresh{host="110",provider="rclone"} == 1)
+        for: 30m
+        labels:
+          severity: warning
+          layer: host-backup
+          component: backup-retention
+          host: "110"
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "Google Drive repo {{ $labels.repo }} 保留超過 1 份 snapshot"
+          description: "{{ $labels.repo }} 在 Google Drive/rclone 遠端目前有 {{ $value }} 份 snapshot；latest-only 策略要求遠端也只保留最新一份。"
+          runbook: "確認 110 `/backup/scripts/sync-offsite-backups.sh --mode sync` 使用 `rclone sync`、`OFFSITE_SYNC_DELETE_OLD=1`、`RCLONE_DRIVE_USE_TRASH=false`，再於低峰重新執行 full sync 與 verifier。"
+
+      - alert: BackupCredentialEscrowEvidenceMissing
+        expr: awoooi_backup_credential_escrow_fresh{host="110"} == 0
+        for: 1m
+        labels:
+          severity: warning
+          layer: host-backup
+          component: credential-escrow
+          host: "110"
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "備份憑證金庫證據缺失或過期：{{ $labels.item }}"
+          description: "{{ $labels.item }} 沒有 31 天內人工驗證證據；重建時可能找不到 restic/offsite/break-glass/DNS/OAuth 復原材料。"
+          runbook: "在密碼管理器或離線加密金庫完成雙人覆核後，只建立不含 secret 的 `/backup/escrow-evidence/{{ $labels.item }}.last_verified` 時間戳證據。"
+
   # =========================================================================
   # 基礎設施詳細告警 (awoooi_infrastructure_detailed) — 從主機補回 2026-04-12
   # =========================================================================
@@ -1236,19 +1522,13 @@ groups:
         labels:
           severity: warning
           layer: systemd-188
-          alert_category: host_resource
+          alert_category: infrastructure
           notification_type: TYPE-3
-          # 2026-05-02 ogt + Claude Sonnet 4.6: ADR-068 飛輪 — disk full SOP
-          # auto_repair: false → true，路由到 ssh_host MCP Group B `ssh_docker_prune`
-          # 工具內含 >=75% 磁碟守衛，低於閾值 no-op，避免誤刪
-          auto_repair: "true"
-          mcp_provider: "ssh_host"
-          host_type: "bare_metal"
+          auto_repair: "false"
           supersedes: PostgreSQLDiskGrowthRate
         annotations:
           summary: "主機 {{ $labels.instance }} 根目錄磁碟使用率 {{ $value | humanize }}% (>80%)"
           description: "磁碟使用率持續 10 分鐘超過 80%, 需清理或擴容. 常見原因: PG WAL, 日誌, container images, 舊 build cache."
-          auto_repair_action: "ssh {{ $labels.instance }} docker prune (image+volume+builder; gated by 75% disk usage)"
           runbook: "SSH 該主機: df -h / && du -sh /var/lib/postgresql/*/pg_wal /var/log /var/lib/docker"
 
       - alert: HostDiskUsageCritical
@@ -1468,3 +1748,284 @@ groups:
           summary: "Prometheus ({{ $labels.instance }}) 停擺"
           description: "Prometheus 自己停擺 → 所有其他告警失效"
           runbook: "SSH 110 docker compose -f /home/wooo/monitoring/docker-compose.yml up -d prometheus"
+
+  # =========================================================================
+  # Full-stack cold-start recovery gate
+  # =========================================================================
+  - name: cold_start_recovery_alerts
+    rules:
+      - alert: PrometheusRuleDriftGuardFailed
+        expr: |
+          absent(awoooi_prometheus_rule_drift_guard_last_run_timestamp{host="110"})
+          or
+          (time() - max by(host) (awoooi_prometheus_rule_drift_guard_last_run_timestamp{host="110"}) > 900)
+          or
+          (awoooi_prometheus_rule_drift_guard_missing_required_count{host="110"} > 0)
+          or
+          (awoooi_prometheus_rule_drift_guard_current_matches_canonical{host="110"} == 0)
+        for: 10m
+        labels:
+          severity: critical
+          layer: systemd-110
+          component: prometheus-rule-drift-guard
+          host: "110"
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"
+        annotations:
+          summary: "Prometheus 規則漂移防護失效"
+          description: "110 Prometheus rule drift guard 沒有新鮮成功指標、required rules 缺失，或 active alerts.yml 不等於 canonical rules。"
+          runbook: "執行 `bash scripts/ops/deploy-alerts.sh` 重新部署 canonical rules 與 drift guard，等待 1-2 個 Prometheus evaluation cycle 後重跑 readiness audit。"
+
+      - alert: PrometheusRuleDriftAutoRepaired
+        expr: awoooi_prometheus_rule_drift_guard_repaired{host="110"} > 0
+        for: 1m
+        labels:
+          severity: warning
+          layer: systemd-110
+          component: prometheus-rule-drift-guard
+          host: "110"
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "Prometheus 規則漂移已被自動修復"
+          description: "110 drift guard 最近一次偵測到 active Prometheus rules 漂移，已回復 canonical rules 並 reload Prometheus。"
+          runbook: "檢查 `/home/wooo/logs/prometheus-rule-drift-guard.log` 與 `/home/wooo/monitoring/alerts.yml.guard.bak.*`，找出誰覆寫了 active rules。"
+
+      - alert: ColdStartMonitorMissing
+        expr: absent(awoooi_cold_start_monitor_up{host="110",scope="110_120_121_188"})
+        for: 15m
+        labels:
+          severity: warning
+          layer: systemd-110
+          component: cold-start-monitor
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "Cold-start monitor textfile metric missing"
+          description: "110 沒有輸出 awoooi_cold_start_monitor_up；重開機恢復 gate 目前不可觀測。"
+          runbook: "執行 scripts/reboot-recovery/install-cold-start-monitor-110.sh，確認 /home/wooo/node_exporter_textfiles/cold_start_recovery.prom"
+
+      - alert: ColdStartMonitorStale
+        expr: time() - awoooi_cold_start_last_run_timestamp{host="110",scope="110_120_121_188"} > 900
+        for: 10m
+        labels:
+          severity: warning
+          layer: systemd-110
+          component: cold-start-monitor
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "Cold-start monitor stale"
+          description: "cold-start monitor 超過 15 分鐘沒有更新，距離上次執行 {{ $value | humanizeDuration }}。"
+          runbook: "SSH 110 檢查 crontab、/tmp/awoooi-cold-start-monitor.cron.log、cold-start-last.log"
+
+      - alert: ColdStartRecoveryBlocked
+        expr: awoooi_cold_start_blocked_gates{host="110",scope="110_120_121_188"} > 0
+        for: 5m
+        labels:
+          severity: critical
+          layer: full-stack
+          component: cold-start-gate
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"
+        annotations:
+          summary: "Full-stack cold-start recovery BLOCKED"
+          description: "cold-start gate 有 {{ $value }} 個 BLOCKED gate。AI 修復需保持 observe-only，先處理第一個 blocked gate。"
+          runbook: "查看 /home/wooo/reboot-recovery/cold-start-last.log；依 docs/runbooks/FULL-STACK-COLD-START-SOP.md 的 P0→P2 順序修復"
+
+      - alert: K3sNodeFilesystemErrorGateBlocked
+        expr: awoooi_cold_start_blocker_reason{host="110",scope="110_120_121_188",reason="k3s_node_filesystem_error",target="120"} > 0
+        for: 5m
+        labels:
+          severity: critical
+          layer: k3s
+          component: node-filesystem
+          host: "120"
+          target_host: "120"
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"
+        annotations:
+          summary: "120 K3s 節點 filesystem error 阻擋重開機放行"
+          description: "cold-start log 偵測到 120 Node event 仍有 EXT4/I/O/deleted inode 類錯誤；即使 Pod Running、網站 200，也不可宣告下一次重開機安全。"
+          runbook: "查看 110 `/home/wooo/reboot-recovery/cold-start-last.log`，執行 `scripts/reboot-recovery/120-fsck-maintenance-checklist.sh` 做維護前只讀檢查；維護窗內用 console/rescue 對 120 root LV 執行 fsck，禁止 online fsck。"
+
+      - alert: ColdStartHost120Unreachable
+        expr: awoooi_cold_start_blocker_reason{host="110",scope="110_120_121_188",reason="host_unreachable",target="120"} > 0
+        for: 3m
+        labels:
+          severity: critical
+          layer: host
+          component: host-reachability
+          host: "120"
+          target_host: "120"
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"
+        annotations:
+          summary: "120 主機不可達，Full-stack cold-start 已阻擋"
+          description: "110 cold-start monitor 無法 ping/SSH 192.168.0.120；目前只能由 121/VIP 撐住 K3s，不能宣告所有主機重開機恢復完成。"
+          runbook: "查看 120 console。若停在 initramfs/manual fsck，先對 root LV 做離線 fsck；若主機關機或網卡異常，先恢復電源/網路，再重跑 full-stack cold-start gate。禁止從自動修復直接重啟其他服務掩蓋主機離線。"
+
+      - alert: ColdStartRecoveryDegraded
+        expr: awoooi_cold_start_warn_gates{host="110",scope="110_120_121_188"} > 0
+        for: 15m
+        labels:
+          severity: warning
+          layer: full-stack
+          component: cold-start-gate
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "Full-stack cold-start recovery DEGRADED"
+          description: "cold-start gate 有 {{ $value }} 個 WARN gate；核心可用但不應放行 runner/CD/AI auto-repair full execution。"
+          runbook: "查看 /home/wooo/reboot-recovery/cold-start-last.log，修到 WARN 與 BLOCKED 皆為 0（gate 整體為 green）"
+
+      - alert: ColdStartLastGreenTooOld
+        expr: time() - awoooi_cold_start_last_green_timestamp{host="110",scope="110_120_121_188"} > 3600
+        for: 15m
+        labels:
+          severity: warning
+          layer: full-stack
+          component: cold-start-gate
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "Full-stack cold-start gate has not been GREEN recently"
+          description: "距離上次 GREEN 已經過 {{ $value | humanizeDuration }}（門檻 1 小時）；需要確認 110/120/121/188 與排程/網站 gate。"
+          runbook: "執行 SSH_BATCH_MODE=yes bash scripts/reboot-recovery/full-stack-cold-start-check.sh --send-alert-test"
+
+  # =========================================================================
+  # Host storage health / dirty reboot evidence
+  # =========================================================================
+  - name: host_storage_health_alerts
+    rules:
+      - alert: Host110StorageHealthMonitorMissing
+        expr: absent(awoooi_host_storage_monitor_up{host="110"})
+        for: 15m
+        labels:
+          severity: warning
+          layer: systemd-110
+          component: storage-health-monitor
+          host: "110"
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "110 storage health textfile metric missing"
+          description: "110 沒有輸出 storage_health.prom；dirty reboot、root read-only 與 fsck 證據目前不可觀測。"
+          runbook: "用 Ansible `110-devops.yml --tags textfile_exporters` 或手動部署 scripts/ops/storage-health-textfile-exporter.py，確認 /home/wooo/node_exporter_textfiles/storage_health.prom"
+
+      - alert: Host188StorageHealthMonitorMissing
+        expr: absent(awoooi_host_storage_monitor_up{host="188"})
+        for: 15m
+        labels:
+          severity: warning
+          layer: systemd-188
+          component: storage-health-monitor
+          host: "188"
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "188 storage health textfile metric missing"
+          description: "188 沒有輸出 storage_health.prom；dirty reboot、root read-only 與 fsck 證據目前不可觀測。"
+          runbook: "用 Ansible `188-ai-web.yml --tags textfile_exporters` 或手動部署 scripts/ops/storage-health-textfile-exporter.py，確認 /home/ollama/node_exporter_textfiles/storage_health.prom"
+
+      - alert: HostStorageHealthMonitorStale
+        expr: time() - awoooi_host_storage_last_run_timestamp{host=~"110|188"} > 900
+        for: 10m
+        labels:
+          severity: warning
+          layer: host-storage
+          component: storage-health-monitor
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "主機 {{ $labels.host }} storage health textfile stale"
+          description: "storage health exporter 超過 15 分鐘沒有更新；重開機後檔案系統風險不可觀測。"
+          runbook: "SSH 主機檢查 cron、/tmp/awoooi-storage-health-textfile-exporter.cron.log 與 node-exporter textfile collector"
+
+      - alert: HostRootFilesystemReadOnly
+        expr: awoooi_host_root_filesystem_readonly{host=~"110|188",mountpoint="/"} > 0
+        for: 1m
+        labels:
+          severity: critical
+          layer: host-storage
+          component: root-filesystem
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"
+        annotations:
+          summary: "主機 {{ $labels.host }} root filesystem 已變成 read-only"
+          description: "root filesystem 被掛載為唯讀，服務可能仍暫時存活但寫入會失敗；禁止自動修復，先保全證據並規劃維護窗。"
+          runbook: "依 docs/runbooks/FULL-STACK-COLD-START-SOP.md §16：保全 journal/df/mount 證據，確認備份，再安排 console/offline fsck"
+
+      - alert: HostCurrentBootStorageErrorsDetected
+        expr: awoooi_host_storage_error_count{host=~"110|188",source="journalctl-kernel",boot="current"} > 0
+        for: 5m
+        labels:
+          severity: critical
+          layer: host-storage
+          component: kernel-storage
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"
+        annotations:
+          summary: "主機 {{ $labels.host }} current boot 有 storage/kernel 錯誤"
+          description: "目前開機週期已出現 filesystem、I/O 或 fsck 類錯誤；不可只重啟容器掩蓋問題。"
+          runbook: "先執行 read-only 診斷：journalctl -k -p warning..alert、mount、df、smartctl/raid 狀態；必要時進入維護窗處理"
+
+      - alert: HostPreviousBootStorageErrorsDetected
+        expr: awoooi_host_storage_error_count{host=~"110|188",source="journalctl-kernel",boot="previous"} > 0
+        for: 30m
+        labels:
+          severity: warning
+          layer: host-storage
+          component: dirty-reboot-evidence
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "主機 {{ $labels.host }} previous boot 保留 storage/fsck 錯誤證據"
+          description: "上一個開機週期留有 storage/fsck 錯誤，代表這次重開機事故需要完成 fsck、備份與容量後續檢查。"
+          runbook: "把證據寫入 docs/LOGBOOK.md，確認 full-stack cold-start gate 與 P3 gate；下一次維護窗補 offline fsck/SMART/RAID 檢查"
+
+      - alert: HostFsckLogErrorsDetected
+        expr: sum by(host) (awoooi_host_storage_error_count{host=~"110|188",boot="last-fsck-log"}) > 0
+        for: 30m
+        labels:
+          severity: warning
+          layer: host-storage
+          component: fsck-log
+          team: ops
+          alert_category: infrastructure
+          notification_type: TYPE-1
+          auto_repair: "false"
+        annotations:
+          summary: "主機 {{ $labels.host }} fsck log 保留錯誤證據"
+          description: "主機 fsck log 內仍有 inconsistency 或 I/O 類錯誤文字；這是事故後追蹤項，不應交給自動修復直接處理。"
+          runbook: "確認 /run/initramfs/fsck.log 與 /var/log/fsck/*，將結果納入重開機事故報告與下次維護窗檢查項"
diff --git a/ops/reboot-recovery/full-stack-backup-baseline.yml b/ops/reboot-recovery/full-stack-backup-baseline.yml
new file mode 100644
index 00000000..77514995
--- /dev/null
+++ b/ops/reboot-recovery/full-stack-backup-baseline.yml
@@ -0,0 +1,306 @@
+version: 2026-05-19.v7
+scope: "110/120/121/188 全服務、資料、設定與還原驗證備份基準"
+
+principles:
+  - "資料備份與設定備份分層：DB/PV/物件資料負責資料，configs 負責可啟動狀態。"
+  - "Secrets、TLS private keys、SSH host keys 可進加密 restic/Velero 備份，但不得印到 log、repo、Telegram。"
+  - "備份系統本身也要備份：restic repository health、password/key escrow、offsite copy、restore drill evidence 缺一不可。"
+  - "每個備份都必須有三個證據：排程存在、最近成功時間、還原或 dry-run 驗證。"
+  - "AI 自動修復在備份/還原領域預設 observe-only；禁止未經新成功備份證據與 baseline gate 的刪除、DROP DB、覆蓋 production namespace。"
+  - "2026-05-19 起備份保留策略為 latest-only：每個本地 restic repo、188 MOMO 檔案備份與 Google Drive/rclone 離機鏡像都只保留最新一份。"
+
+backup_domains:
+  - id: host_configs
+    owner_host: "110"
+    script: "/backup/scripts/backup-configs.sh"
+    repository: "/backup/configs"
+    schedule: "daily via /backup/scripts/backup-all.sh"
+    max_age_hours: 48
+    includes:
+      - "110/188/120/121: /etc/nginx, /etc/systemd/system, /etc/cron.d, /etc/crontab"
+      - "110/188/120/121: /etc/letsencrypt, /etc/ssh, /etc/fstab, /etc/hosts, /etc/netplan"
+      - "110: /opt/harbor, /opt/sentry, /home/wooo/monitoring, /home/wooo/scripts, /backup/scripts"
+      - "188: /opt/n8n, /opt/open-webui, /opt/litellm, /opt/signoz, /home/ollama/momo-pro, /home/ollama/bin"
+      - "120/121: /etc/rancher/k3s, K3s manifests, containerd/keepalived host config"
+      - "K8s: workloads, services, ingress, configmaps, secrets, RBAC, PV/PVC, CRDs, Velero schedules/backups"
+    restore_test: "抽樣 restic restore 到隔離目錄，確認 nginx/systemd/K8s YAML 可讀；不得直接覆蓋 production。"
+
+  - id: awoooi_databases
+    owner_host: "110"
+    scripts:
+      - "/backup/scripts/backup-awoooi.sh"
+      - "/backup/scripts/backup-awoooi-frequent.sh"
+    repository: "/backup/awoooi"
+    schedule: "daily 02:00 + high-frequency 08:00/14:00/20:00"
+    max_age_hours: 7
+    includes:
+      - "awoooi_prod"
+      - "awoooi_dev"
+      - "k3s_datastore if present"
+    restore_test: "pg_restore/psql 到隔離 DB，驗證 schema 與核心表筆數；不可覆蓋 production DB。"
+
+  - id: gitea_and_ci
+    owner_host: "110"
+    repository: "/backup/gitea"
+    schedule: "daily via backup-all"
+    max_age_hours: 48
+    includes:
+      - "Gitea DB"
+      - "Git repositories"
+      - "Gitea app.ini 與 runner registration/config evidence"
+      - "workflow definitions from repos"
+    restore_test: "抽樣 git fsck / git clone；Gitea DB dump 可讀。"
+
+  - id: harbor_registry
+    owner_host: "110"
+    repository: "/backup/harbor"
+    schedule: "daily via backup-all"
+    max_age_hours: 48
+    includes:
+      - "Harbor DB/config"
+      - "registry storage"
+      - "TLS/config state from configs backup"
+    restore_test: "抽樣 registry manifest/blobs 可讀；Harbor compose/config 可重建。"
+
+  - id: observability
+    owner_host: "110"
+    repositories:
+      - "/backup/monitoring"
+      - "/backup/signoz"
+    schedule: "daily via backup-all"
+    max_age_hours: 48
+    includes:
+      - "Prometheus TSDB"
+      - "Grafana dashboards/datasources"
+      - "Alertmanager config/state"
+      - "SignOz ClickHouse/SQLite/config"
+      - "blackbox/node-exporter textfile config"
+    restore_test: "Prometheus/Grafana/Alertmanager 設定 lint；SignOz dump 可列出表。"
+
+  - id: sentry
+    owner_host: "110"
+    coverage_status: "covered_by_backup_sentry_script"
+    script: "/backup/scripts/backup-sentry.sh"
+    repository: "/backup/sentry"
+    schedule: "daily via backup-all; config also covered by /backup/configs"
+    max_age_hours: 48
+    includes:
+      - "Sentry compose/.env/config"
+      - "Sentry Postgres logical dump"
+      - "Sentry ClickHouse volume snapshot and table inventory"
+      - "Sentry Kafka queue volume snapshot"
+      - "Sentry Redis / SeaweedFS / Taskbroker / Vroom / Symbolicator state"
+    restore_test: "先在隔離 compose stack 驗證 Postgres dump 可讀、ClickHouse volume 可掛載、web/symbolicator/snuba 可啟動。"
+
+  - id: credential_escrow
+    owner_host: "human-controlled"
+    coverage_status: "gap_p0_out_of_band_escrow_required"
+    repository: "不可放在同一個 restic repo；需放在密碼管理器或離線加密金庫"
+    schedule: "每次新增/輪替 Secret 後立即更新 escrow；每月人工抽查"
+    max_age_hours: 744
+    includes:
+      - "restic password files / repository keys / Google Drive rclone.conf / offsite provider credentials"
+      - "Cloud DNS / registrar / CDN / tunnel 管理帳號與 recovery codes"
+      - "Gitea/Harbor/Sentry/admin break-glass credentials"
+      - "Git deploy keys、runner registration tokens、K8s bootstrap/admin kubeconfig 的復原路徑"
+      - "Google Drive / OAuth / Telegram / AI provider tokens 的輪替與復原流程，不包含明文輸出"
+    restore_test: "用人工雙人覆核方式確認 key escrow 可找到、可解密、可用於列出 snapshots；不得把 Secret 值寫進 repo 或監控 label。"
+
+  - id: external_dns_and_public_routes
+    owner_host: "110"
+    coverage_status: "covered_by_public_route_evidence_backup; provider_zone_export_still_requires_credentials"
+    script: "/backup/scripts/backup-public-routes.sh"
+    repository: "/backup/public-routes"
+    schedule: "daily via backup-all; DNS/CDN provider zone export after every routing change when credentials are available"
+    max_age_hours: 168
+    includes:
+      - "wooo.work DNS answers；CDN/Cloudflare/registrar 設定匯出仍需 provider token"
+      - "public nginx route map、TLS renewal config、ACME account evidence"
+      - "blackbox public endpoint inventory 與 expected status codes"
+      - "VPN/tunnel/port-forward/HA VIP 對外路由設定"
+    restore_test: "從匯出檔重建 public route checklist，確認 awoooi/mo/registry/harbor/gitea 等 endpoint 對應正確；不得在測試中改正式 DNS。"
+
+  - id: backup_repositories_and_integrity
+    owner_host: "110/188/121/offsite"
+    coverage_status: "covered_locally_by_check_backup_integrity_script; offsite copy still depends on credentials"
+    scripts:
+      - "/backup/scripts/check-backup-integrity.sh"
+      - "/backup/scripts/configure-offsite-rclone.sh"
+      - "/backup/scripts/configure-offsite-b2.sh"
+      - "/backup/scripts/sync-offsite-backups.sh"
+      - "/backup/scripts/backup-offsite-readiness-gate.sh"
+      - "/backup/scripts/offsite-escrow-evidence-report.sh"
+      - "/backup/scripts/mark-credential-escrow-verified.sh"
+    repositories:
+      - "/backup/* restic repos"
+      - "/home/ollama/backup/110"
+      - "Google Drive/rclone/offsite remote when credentials are configured"
+    schedule: "daily freshness; daily 06:10 offsite status; daily 06:15 offsite escrow evidence report; weekly restic check; monthly sample restore drill"
+    max_age_hours: 168
+    includes:
+      - "restic snapshots metadata、repo config、locks/prune policy"
+      - "188 backup-from-110 rsync copy"
+      - "offsite copy status and retention policy"
+      - "restore drill logs with snapshot id and restored object counts"
+    restore_test: "每週 `restic check --read-data-subset=1%`；每月 `restic dump latest <sample>` 到 0700 暫存目錄驗證可讀。"
+    retention_policy: "latest-only；本地 restic repo 新 snapshot 成功後 --group-by \"\" --keep-last=1 + prune；188 MOMO 檔案備份只留最新一份；離機 Google Drive/rclone 以本地 repo 為準鏡像刪舊。"
+    offsite_sync_policy: "offsite-escrow-evidence-report.sh 先產出 redacted 證據與 NEXT_STEP；backup-offsite-readiness-gate.sh 再做 status / dry-run-small / pre-full-sync；sync-offsite-backups.sh 預設 status；dry-run 可隨時執行；Google Drive/rclone full sync 需選低峰窗口，成功後才寫 /backup/offsite/rclone-last-success，且 OFFSITE_SYNC_DELETE_OLD=1 時會刪除遠端舊檔。full sync 不得與本地備份程序重疊，且必須距離下一次備份排程至少 270 分鐘。"
+
+  - id: momo_web_and_data
+    owner_host: "188"
+    scripts:
+      - "/backup/scripts/backup-momo.sh on 110"
+      - "/home/ollama/bin/momo-pg-backup.sh on 188"
+    repositories:
+      - "/backup/momo"
+      - "/home/ollama/momo_backups"
+    schedule: "110 daily + 188 daily 02:00"
+    max_age_hours: 30
+    includes:
+      - "mo.wooo.work app DB"
+      - "momo uploads/files/config"
+      - "scheduler config and cron"
+    restore_test: "隔離 DB restore 後跑 app health check；確認 mo.wooo.work 需要的資料表與資料筆數。"
+
+  - id: ai_and_tooling
+    owner_host: "188"
+    coverage_status: "covered_by_backup_ai_artifacts_for_manifest_and_metadata; model_blobs_require_manual_classification"
+    script: "/backup/scripts/backup-ai-artifacts.sh"
+    repositories:
+      - "/backup/langfuse"
+      - "/backup/open-webui"
+      - "/backup/clawbot"
+      - "/backup/configs"
+      - "/backup/ai-artifacts"
+    schedule: "daily via backup-all"
+    max_age_hours: 48
+    includes:
+      - "Langfuse traces/evaluations"
+      - "Open-WebUI conversations/config"
+      - "LiteLLM config, model routing, provider state"
+      - "OpenClaw/ClawBot Redis or persistent state"
+      - "n8n workflows/credentials through encrypted config backup"
+      - "Ollama model manifest/tag list/Modelfile；自製或不可重新下載的 model/adapters 才備份 blobs"
+      - "KM/RAG/vector 狀態；若存在於 AWOOOI DB，隨 DB dump 還原；若是外部 vector store 必須有獨立 dump"
+    restore_test: "抽樣匯出 workflow/config；Redis dump 可讀；Langfuse/Open-WebUI DB dump 可讀；Ollama manifest tar 可列出模型 tags。"
+
+  - id: source_of_truth_and_ops_memory
+    owner_host: "110/Gitea"
+    coverage_status: "gap_p1_sanitized_operational_context"
+    repositories:
+      - "/backup/gitea"
+      - "/backup/configs"
+    schedule: "Gitea daily; configs daily; 每次事故後更新 docs/LOGBOOK.md 與 runbooks"
+    max_age_hours: 48
+    includes:
+      - "所有 Git repositories、Ansible roles/playbooks/inventory、K8s manifests、monitoring rules"
+      - "AGENTS/HARD_RULES/runbooks/LOGBOOK/ADR 等決策與啟動順序文件"
+      - "AI agent handoff summaries and operational memory exports after sanitization"
+      - "CI/CD workflow definitions、runner labels、deployment marker policy"
+    restore_test: "從 Gitea backup 抽樣 clone repo，跑 ansible/k8s/alerts YAML validation；不得備份含明文 token 的聊天或 shell transcript。"
+
+  - id: k3s_and_velero
+    owner_host: "120"
+    schedule: "Velero daily-awoooi-prod + weekly restore dry-run"
+    max_age_hours: 25
+    includes:
+      - "K8s manifests and CRDs"
+      - "Secrets/ConfigMaps/RBAC"
+      - "PVC/PV snapshots via Velero provider"
+      - "backup-restore-test CronJob and result metrics"
+    restore_test: "backup-restore-test CronJob 每週 dry-run 到 restore-test-dry namespace mapping。"
+
+  - id: offsite_and_dr
+    owner_host: "188/121"
+    schedule: "188 backup-from-110 daily 01:00; 121 DR drill monthly"
+    max_age_hours: 25
+    includes:
+      - "110 Harbor/Gitea/bitan rsync copy on 188"
+      - "DR drill evidence on 121"
+      - "Google Drive/rclone remote when credentials are configured"
+    restore_test: "121 DR drill dry-run finds latest Completed Velero backup; 188 backup-from-110 textfile fresh。"
+
+monitoring_contract:
+  textfile_metrics:
+    "110": "/home/wooo/node_exporter_textfiles/backup_health.prom"
+    "188": "/home/ollama/node_exporter_textfiles/backup_health.prom"
+    "120": "由 110 backup_health.prom 透過 120 kubectl 查詢 Velero/CronJob/Job 狀態"
+  offsite_and_escrow_metrics:
+    - "awoooi_backup_offsite_configured：只回報 Google Drive/rclone 或相容 provider 是否看起來已配置，不輸出 credential 值。"
+    - "awoooi_backup_offsite_fresh：由 /backup/offsite/*last_success 類 marker 判斷離機同步是否新鮮。"
+    - "awoooi_backup_offsite_partial_fresh：由小範圍 partial sync marker 判斷 Google Drive/rclone 寫入路徑是否已被證明。"
+    - "awoooi_backup_credential_escrow_fresh：由 /backup/escrow-evidence/*.last_verified 類 marker 判斷人工金庫覆核是否在 31 天內完成。"
+    - "awoooi_backup_dr_next_step_info：用 next_step label 告訴 AI 巡檢與 operator 下一個安全人工作業，不包含 secret。"
+    - "awoooi_backup_dr_credential_escrow_missing_count：金庫覆核尚缺的項目數。"
+    - "awoooi_backup_cron_active_duplicate_count：110 active crontab 中 exact duplicate entry 的數量。"
+    - "awoooi_backup_cron_singular_entry_ok：offsite/status/verifier/exporter 等單一入口排程是否剛好只有一條 active cron。"
+    - "awoooi_backup_config_capture_ok：最新 configs snapshot 是否實際捕捉 110/120/121/188 host config 與 K8s workloads/secrets，不輸出 secret。"
+    - "awoooi_backup_config_capture_critical_failed_count：最新設定檔備份缺少的 critical capture target 數量。"
+  prometheus_alerts:
+    - BackupHealthMonitorMissing110
+    - BackupHealthMonitorMissing188
+    - BackupHealthMonitorStale
+    - BackupExpectedJobMissing
+    - BackupScheduleDuplicateActiveEntries
+    - BackupScheduleSingletonMismatch
+    - BackupScriptMissing
+    - BackupJobStale
+    - BackupAggregateRunFailed
+    - BackupConfigCapturePartial
+    - BackupConfigCaptureStatusStale
+    - BackupIntegrityCheckMissingOrFailed
+    - BackupRestoreDrillMissingOrFailed
+    - BackupRestoreTestMissing
+    - BackupRestoreTestCronMissing
+    - BackupRestoreTestFailed
+    - BackupRestoreTestStale
+    - BackupOffsiteCopyNotConfigured
+    - BackupOffsiteCopyStale
+    - BackupCredentialEscrowEvidenceMissing
+    - BackupRetentionPolicyNotLatestOnly
+    - BackupSnapshotRetentionExceeded
+    - BackupOffsiteFullVerifyFailed
+    - BackupOffsiteRemoteSnapshotRetentionExceeded
+  live_visibility_checks:
+    - "如果 awoooi_backup_offsite_configured{host=\"110\"} 為 0，Prometheus 必須有 BackupOffsiteCopyNotConfigured firing，Alertmanager 必須有 active alert。"
+    - "如果 offsite provider 已配置、full marker 尚未 fresh，且 full sync enable marker 缺失或已超過 30 小時，Prometheus 與 Alertmanager 必須看得到 BackupOffsiteCopyStale。"
+    - "如果 awoooi_backup_credential_escrow_fresh{host=\"110\"} == 0，Prometheus 與 Alertmanager 必須依 item 看得到 BackupCredentialEscrowEvidenceMissing。"
+    - "如果 awoooi_backup_retention_latest_only{host=\"110\"} 或 awoooi_backup_retention_offsite_delete_old_enabled{host=\"110\",provider=\"rclone\"} 缺失/不為 1，Prometheus 與 Alertmanager 必須看得到 BackupRetentionPolicyNotLatestOnly。"
+    - "如果任一 awoooi_backup_job_snapshot_count{host=\"110\",type=\"restic\"} > 1，Prometheus 與 Alertmanager 必須看得到 BackupSnapshotRetentionExceeded。"
+    - "如果 full offsite marker fresh 但 awoooi_backup_offsite_remote_verify_ok{host=\"110\",provider=\"rclone\"} 不為 1 或缺失，Prometheus 必須看得到 BackupOffsiteFullVerifyFailed。"
+    - "如果 full offsite marker fresh 且任一 awoooi_backup_offsite_remote_snapshot_count{host=\"110\",provider=\"rclone\"} > 1，Prometheus 必須看得到 BackupOffsiteRemoteSnapshotRetentionExceeded。"
+    - "如果 awoooi_backup_cron_active_duplicate_count{host=\"110\"} > 0，Prometheus 與 Alertmanager 必須看得到 BackupScheduleDuplicateActiveEntries。"
+    - "如果任一 awoooi_backup_cron_singular_entry_ok{host=\"110\"} == 0，Prometheus 與 Alertmanager 必須看得到 BackupScheduleSingletonMismatch。"
+    - "如果任一 awoooi_backup_config_capture_ok{host=\"110\",critical=\"true\"} == 0，Prometheus 與 Alertmanager 必須看得到 BackupConfigCapturePartial，且 target label 必須指出缺哪個設定來源。"
+    - "如果 awoooi_backup_config_capture_status_timestamp 缺失或超過 48 小時，Prometheus 與 Alertmanager 必須看得到 BackupConfigCaptureStatusStale。"
+    - "live visibility check 只讀 Prometheus / Alertmanager API，不送測試告警、不改 silence、不改 route、不觸發修復。"
+  prometheus_recording_rules:
+    - awoooi_recovery_core_ready
+    - awoooi_recovery_dr_offsite_ready
+
+release_gate:
+  cold_start_script: "bash scripts/reboot-recovery/full-stack-cold-start-check.sh --monitor-read-only --no-color"
+  p3_script: "bash scripts/reboot-recovery/p3-controlled-release-gate.sh"
+  recovery_core_scorecard: "bash scripts/reboot-recovery/full-stack-recovery-scorecard.sh --require-core"
+  dr_offsite_operator_checklist: "bash scripts/reboot-recovery/dr-offsite-operator-checklist.sh"
+  dr_offsite_scorecard: "bash scripts/reboot-recovery/full-stack-recovery-scorecard.sh --require-dr"
+  dr_offsite_final_gate: "bash scripts/reboot-recovery/dr-offsite-operator-checklist.sh --require-dr"
+  dr_offsite_post_marker_wait: "bash scripts/reboot-recovery/wait-dr-offsite-ready.sh --timeout-seconds 900 --interval-seconds 30 --no-color"
+  required_green:
+    - "backup_health.prom fresh on 110/188"
+    - "awoooi_backup_job_fresh == 1 for every expected job"
+    - "Velero latest Completed backup < 25h"
+    - "backup-restore-test CronJob present and lastSuccessfulTime not stale"
+    - "weekly restic check successful"
+    - "monthly sample restore drill successful"
+  warning_until_human_escrow_ready:
+    - "offsite provider configured and latest offsite copy marker fresh"
+    - "credential escrow marker files refreshed after human verification; marker files must contain only timestamp/evidence id, never secret values"
+  strict_dr_exit_conditions:
+    - "Google Drive/rclone provider configured on 110 host-local rclone.conf; /backup/scripts/offsite.env keeps only non-secret remote/path with mode 0600"
+    - "credential escrow markers fresh for restic_repository_password, offsite_provider_credentials, break_glass_admin_credentials, dns_registrar_recovery, oauth_ai_provider_recovery"
+    - "full offsite marker /backup/offsite/rclone-last-success fresh after full 13 repo sync"
+    - "full-stack-recovery-scorecard.sh --require-dr exits 0"
+    - "recovery-scorecard-contract-check.py --expect-dr-ready exits 0 against 110 Prometheus"
+    - "dr-offsite-operator-checklist.sh --require-dr exits 0 after scorecard, Prometheus recording rule, and backup alert visibility contract agree"
+    - "wait-dr-offsite-ready.sh exits 0 after post-marker textfile, Prometheus, Alertmanager, and final checklist convergence"
diff --git a/ops/reboot-recovery/full-stack-cold-start-baseline.yml b/ops/reboot-recovery/full-stack-cold-start-baseline.yml
index d83d53a1..0db7dc47 100644
--- a/ops/reboot-recovery/full-stack-cold-start-baseline.yml
+++ b/ops/reboot-recovery/full-stack-cold-start-baseline.yml
@@ -1,337 +1,204 @@
-# AWOOOI full-stack cold-start dependency baseline.
-# This is the machine-readable companion to docs/runbooks/FULL-STACK-COLD-START-SOP.md.
-#
-# Intent:
-# - document the reboot startup order and service dependency graph
-# - define release gates for operators and AI automation
-# - keep stateful services out of generic auto-restart loops
-
-version: "2026-05-06"
-incident_reference: "2026-05-05 full-stack reboot recovery"
+version: 2026-05-06.v1
 scope:
-  managed_hosts:
-    "110":
-      address: "192.168.0.110"
-      ssh_user: "wooo"
-      roles:
-        - registry
-        - git
-        - observability
-        - sentry
-        - runners
-    "120":
-      address: "192.168.0.120"
-      ssh_user: "wooo"
-      roles:
-        - k3s_server
-        - keepalived_vip
-        - awoooi_nodeport
-    "121":
-      address: "192.168.0.121"
-      ssh_user: "wooo"
-      roles:
-        - k3s_node
-        - keepalived_peer
-        - dr_drill
-    "188":
-      address: "192.168.0.188"
-      ssh_user: "ollama"
-      roles:
-        - postgres_datastore
-        - redis
-        - momo
-        - signoz
-        - ai_proxy
-  intentionally_skipped:
-    "112":
-      role: "kali"
-      reason: "scanner host is not required for production cold-start release"
+  included_hosts:
+    "110": "DevOps, registry, observability, Sentry, runners"
+    "120": "K3s control plane and VIP"
+    "121": "K3s peer node and DR drill cron"
+    "188": "Data, AI, web, momo, SignOz, public nginx gateway"
+  excluded_hosts:
+    "112": "Kali security host; recorded but not part of cold-start release gate"
 
-global_policy:
-  startup_rule: "Recover the dependency chain before releasing high-load work."
-  runner_cd_rule: "Release runners and CD only after data, registry, K3s, workload, routes, schedules, and alert E2E gates are green."
-  ai_auto_repair_rule: "Observe-only until all green gates pass and host load stays below baseline."
-  destructive_state_rule: "No DROP, data directory deletion, volume recreation, pg_resetwal, fsck, or backup restore without explicit human approval."
-  no_generic_restart_rule: "Never run generic docker restart against all containers during cold start."
+principles:
+  - recover_dependency_chain_before_workloads
+  - keep_ai_auto_repair_observe_only_until_green
+  - never_generic_restart_stateful_services
+  - preserve_corrupt_parts_in_quarantine_not_delete
+  - release_runners_and_crawlers_last
 
 phases:
-  - id: "P0-NETWORK"
+  - id: P0-NETWORK
     order: 0
-    start_after: []
-    owns:
-      - "LAN reachability"
-      - "SSH reachability"
-      - "ARP evidence"
     gates:
-      - "ping 192.168.0.110/120/121/188 succeeds"
-      - "TCP 22 open on 192.168.0.110/120/121/188"
-      - "reboot evidence captured before repair"
-    blocks:
-      - "all other phases"
+      - ping_110_120_121_188
+      - ssh_port_110_120_121_188
+      - arp_evidence_or_monitor_mode_fallback
 
-  - id: "P0-188-DATA"
-    order: 1
-    start_after:
-      - "P0-NETWORK"
-    host: "188"
-    service_order:
-      - "containerd"
-      - "docker"
-      - "postgresql@14-main"
-      - "k3s_datastore.kine maintenance"
-      - "redis-server"
-      - "ollama or current AI proxy dependencies"
-      - "nginx"
-      - "Docker networks"
-      - "MinIO / OpenClaw / SignOz"
-      - "momo / litellm / batch services"
+  - id: P0-188-DATA
+    order: 10
+    required_before:
+      - P1-K3S
+      - P2-WORKLOAD-ALERTCHAIN
     gates:
-      - "PostgreSQL port 5432 open"
-      - "pg_isready reports accepting connections"
-      - "Redis replies PONG"
-      - "momo health endpoint returns 200"
-      - "SignOz HTTP route is reachable"
-    blocks:
-      - "120/121 K3s"
-      - "AWOOOI API database access"
-      - "Alertmanager webhook"
-      - "momo public site"
+      - containerd_docker_postgresql_redis_ollama_nginx_active
+      - postgresql_5432_accepting_connections
+      - redis_pong
+      - momo_db_not_restarting
+      - signoz_http_reachable
+      - momo_health_200
 
-  - id: "P0-110-REGISTRY-OBSERVABILITY"
-    order: 2
-    start_after:
-      - "P0-NETWORK"
-      - "P0-188-DATA"
-    host: "110"
-    service_order:
-      - "docker"
-      - "orphan Exited(128/137) cleanup if needed"
-      - "Harbor log"
-      - "Harbor registry stack"
-      - "Gitea"
-      - "Prometheus / Alertmanager / Grafana / exporters"
-      - "Langfuse"
-      - "SignOz or local observability companions"
-      - "Sentry DB layer"
-      - "Sentry web / worker / consumer layer"
-      - "Gitea host runner and actions runners"
+  - id: P0-110-REGISTRY-OBSERVABILITY
+    order: 20
+    required_before:
+      - P1-K3S
+      - P3-RUNNER-CD
     gates:
-      - "Harbor /v2/ returns 200 or 401"
-      - "Gitea returns 200 or 302"
-      - "Prometheus /-/ready returns 200"
-      - "Alertmanager /-/healthy returns 200"
-      - "Sentry HTTP returns 200, 302, or 400"
-      - "runner CPUQuota=200%, MemoryMax=2G, WatchdogUSec=0"
-    blocks:
-      - "K3s image pulls"
-      - "runtime CD"
-      - "alert rules deploy"
-      - "code-review runners"
+      - docker_active
+      - harbor_v2_200_or_401
+      - gitea_200_or_302
+      - prometheus_ready
+      - alertmanager_healthy
+      - sentry_http_reachable
+      - docker_containers_all_up
+      - runner_watchdog_disabled
+      - sentry_clickhouse_not_restarting
+      - cadvisor_image_v0_47_0
+      - cadvisor_cpu_cap_0_3
 
-  - id: "P1-K3S"
-    order: 3
-    start_after:
-      - "P0-188-DATA"
-      - "P0-110-REGISTRY-OBSERVABILITY"
-    hosts:
-      - "120"
-      - "121"
-    service_order:
-      - "120 k3s.service"
-      - "121 k3s-agent.service or live role"
-      - "CNI / kube-proxy"
-      - "nodes Ready"
-      - "core pods"
-      - "awoooi-prod pods"
-      - "keepalived VIP 192.168.0.125"
-      - "NodePorts 32334 and 32335"
+  - id: P1-K3S
+    order: 30
     gates:
-      - "120 can reach 188:5432"
-      - "K3s nodes show Ready"
-      - "VIP 192.168.0.125 is present"
-      - "awoooi-prod pods are Running or Completed"
-    blocks:
-      - "AWOOOI workload health"
-      - "public AWOOOI route"
-      - "Alertmanager webhook"
+      - 120_can_reach_188_postgres
+      - mon_and_mon1_ready
+      - no_non_running_non_succeeded_pods
+      - awoooi_dev_api_nodeport_200
+      - vip_192_168_0_125_present
 
-  - id: "P2-WORKLOAD-ALERTCHAIN"
-    order: 4
-    start_after:
-      - "P1-K3S"
-    owners:
-      - "AWOOOI API"
-      - "AWOOOI Web"
-      - "Alertmanager webhook"
-      - "Telegram delivery"
+  - id: P2-WORKLOAD-ALERTCHAIN
+    order: 40
     gates:
-      - "http://192.168.0.125:32334/api/v1/health returns 2xx/3xx"
-      - "http://192.168.0.125:32335/ returns 2xx/3xx"
-      - "Alertmanager webhook POST returns 2xx"
-      - "K8s Telegram secrets are present and non-placeholder"
-    blocks:
-      - "AI auto-remediation"
-      - "full alert confidence"
+      - awoooi_api_vip_health_2xx_or_3xx
+      - awoooi_web_vip_2xx_or_3xx
+      - alertmanager_webhook_e2e_2xx_when_release_gate
 
-  - id: "P2-PUBLIC-ROUTES"
-    order: 5
-    start_after:
-      - "P2-WORKLOAD-ALERTCHAIN"
+  - id: P2-PUBLIC-ROUTES
+    order: 50
+    public_https_routes:
+      - https://awoooi.wooo.work/api/v1/health
+      - https://awoooi.wooo.work/
+      - https://mo.wooo.work/
+      - https://mo.wooo.work/health
+      - https://gitea.wooo.work/
+      - https://harbor.wooo.work/
+      - https://registry.wooo.work/
+      - https://sentry.wooo.work/
+      - https://signoz.wooo.work/
+      - https://stock.wooo.work/
+      - https://langfuse.wooo.work/
+      - https://bitan.wooo.work/
+      - https://aiops.wooo.work/
+
+  - id: P2-SCHEDULES
+    order: 60
     gates:
-      - "https://awoooi.wooo.work/api/v1/health returns 2xx/3xx"
-      - "https://awoooi.wooo.work/ returns 2xx/3xx"
-      - "https://mo.wooo.work/ returns 2xx/3xx"
-      - "https://mo.wooo.work/health returns 2xx/3xx"
-    blocks:
-      - "external release complete"
+      - cron_active_188_110_120_121
+      - docker_restart_textfile_fresh_188
+      - docker_stats_textfile_fresh_188_110
+      - systemd_units_textfile_fresh_110
+      - backup_health_textfile_fresh_188_110
+      - backup_from_110_success_under_25h
+      - expected_backup_jobs_fresh_188_110
+      - host_service_config_backup_success_under_48h
+      - sentry_dedicated_backup_success_under_48h
+      - backup_integrity_check_success_under_8d
+      - backup_restore_drill_success_under_31d
+      - velero_schedule_present_and_latest_completed_under_25h
+      - velero_restore_test_cron_present
+      - momo_scheduler_registered_jobs
+      - k8s_cronjobs_unsuspended
+      - k8s_failed_jobs_zero
+      - dr_drill_cron_present_121
 
-  - id: "P2-SCHEDULES"
-    order: 6
-    start_after:
-      - "P2-PUBLIC-ROUTES"
-    gates:
-      - "110/120/121/188 cron services active"
-      - "188 backup-from-110 success age below 25h"
-      - "188 docker restart/stats textfiles fresh"
-      - "188 momo-scheduler container healthy and registration evidence present within 6h"
-      - "110 docker/systemd textfiles fresh"
-      - "120 awoooi-prod CronJobs present and unsuspended"
-      - "120 awoooi-prod has no failed Jobs"
-      - "121 DR drill cron present"
-    blocks:
-      - "done criteria"
-      - "AI auto-remediation release"
+  - id: P3-HIGH-LOAD-WORK
+    order: 70
+    release_after:
+      - P0-NETWORK
+      - P0-188-DATA
+      - P0-110-REGISTRY-OBSERVABILITY
+      - P1-K3S
+      - P2-WORKLOAD-ALERTCHAIN
+      - P2-PUBLIC-ROUTES
+      - P2-SCHEDULES
+    release_conditions:
+      - host_load_per_core_below_1_0_for_15m
+      - no_restart_storm
+      - clickhouse_merge_or_kafka_lag_not_increasing_two_checks
+    examples:
+      - sentry_snuba_consumers
+      - momo_scheduler_chrome_crawlers
+      - gitea_actions_jobs
 
-  - id: "P3-HIGH-LOAD-RELEASE"
-    order: 7
-    start_after:
-      - "P2-SCHEDULES"
-    release_last:
-      - "momo-scheduler / Chrome crawlers"
-      - "Sentry Snuba consumers"
-      - "SignOz ClickHouse merge-heavy work"
-      - "Gitea actions runners"
-      - "runtime CD jobs"
-    gates:
-      - "all prior gates green"
-      - "host load per CPU below 1.0 for 15 minutes before releasing batch/runner work"
-      - "ClickHouse/Kafka/Snuba backlog decreasing for two consecutive checks if backlog exists"
+  - id: P3-RUNNER-CD
+    order: 80
+    release_conditions:
+      - all_previous_gates_green
+      - runner_cpuquota_200_percent
+      - runner_memorymax_2g
+      - watchdogusec_0
+      - active_awoooi_cd_or_gitea_actions_task_containers_cpu_capped_during_cold_start
 
-baselines:
-  endpoints:
-    awoooi_vip_api_health: "http://192.168.0.125:32334/api/v1/health"
-    awoooi_vip_web: "http://192.168.0.125:32335/"
-    awoooi_public_api_health: "https://awoooi.wooo.work/api/v1/health"
-    awoooi_public_web: "https://awoooi.wooo.work/"
-    momo_public_web: "https://mo.wooo.work/"
-    momo_public_health: "https://mo.wooo.work/health"
-    harbor_registry: "http://127.0.0.1:5000/v2/"
-    gitea: "http://127.0.0.1:3001/"
-    prometheus_ready: "http://127.0.0.1:9090/-/ready"
-    alertmanager_healthy: "http://127.0.0.1:9093/-/healthy"
-    sentry: "http://127.0.0.1:9000/"
-  expected_codes:
-    harbor_registry:
-      - 200
-      - 401
-    gitea:
-      - 200
-      - 302
-    prometheus_ready:
-      - 200
-    alertmanager_healthy:
-      - 200
-    sentry:
-      - 200
-      - 302
-      - 400
-    workload_and_public:
-      - "2xx"
-      - "3xx"
-  runner_guardrails:
-    CPUQuotaPerSecUSec: "2s"
-    MemoryMax: "2147483648"
-    WatchdogUSec: "0"
-  freshness_seconds:
-    docker_textfiles: 300
-    systemd_textfiles: 300
-    backup_success: 90000
+automation_policy:
+  before_green:
+    ai_auto_repair: observe_only
+    alertmanager_smoke_test: manual_or_release_gate_only
+    stateful_service_actions: human_approval_required
+    generic_restart: forbidden
+  after_green:
+    ai_auto_repair: limited_execution_for_stateless_exporters_only
+    stateful_service_actions: human_in_the_loop
+    runner_cd: controlled_release
 
-stateful_services:
-  hard_block_auto_repair:
-    - "188 PostgreSQL data directory"
-    - "188 k3s_datastore"
-    - "188 momo database"
-    - "110 Harbor DB"
-    - "110 Sentry DB"
-    - "Sentry ClickHouse data"
-    - "SignOz ClickHouse data"
-    - "Kafka topic/log directories"
-  human_in_loop_required:
-    - "pg_resetwal"
-    - "ClickHouse clean-clone recovery"
-    - "Kafka checkpoint file quarantine"
-    - "backup restore"
-    - "filesystem repair"
+resource_guardrails:
+  "110":
+    cadvisor:
+      image: gcr.io/cadvisor/cadvisor:v0.47.0
+      cpus: 0.3
+      mem_limit: 512m
+    sentry_snuba_cold_start_consumers:
+      cpus: 0.5
+      persist_in: /opt/sentry/docker-compose.override.yml
+    sentry_self_hosted_memory_limits:
+      taskscheduler_mem_limit: 1g
+      relay_mem_limit: 2g
+      persist_in: /opt/sentry/docker-compose.override.yml
+      note: "taskscheduler/relay 不得回退到 512m/1g 造成長期 >85% memory-limit pressure；110 主機仍以 ClickHouse/Kafka/Snuba CPU caps 防止冷啟動過載。"
+    actions_runner_systemd:
+      cpu_quota: 200%
+      memory_max: 2G
+      watchdog: disabled
+  "188":
+    ollama_systemd:
+      cpu_quota: 300%
+      memory_high: 20G
+      memory_max: 24G
+      max_loaded_models: 1
+      num_parallel: 1
+      note: "188 本機 Ollama 是 cold-start 依賴與 Open-WebUI local endpoint；不得維持 disabled/inactive，也不得保留 700%/45G 無節制 guardrail。"
+    litellm:
+      cpus: 1.0
+      memory: 1G
+      mode: stateless
+    momo_scheduler:
+      cpus: 2.0
+      memory: 2G
+    signoz_clickhouse:
+      memory: 24G
+      note: do_not_lower_during_merge_backlog
 
-ai_automation_gate:
-  observe_only_until:
-    - "P0-NETWORK green"
-    - "P0-188-DATA green"
-    - "P0-110-REGISTRY-OBSERVABILITY green"
-    - "P1-K3S green"
-    - "P2-WORKLOAD-ALERTCHAIN green"
-    - "P2-PUBLIC-ROUTES green"
-    - "P2-SCHEDULES green"
-    - "no active restart storm"
-    - "host load per CPU below 1.0 for 15 minutes"
-  allowed_before_green:
-    - "diagnose"
-    - "collect evidence"
-    - "notify"
-  blocked_before_green:
-    - "stateful restart"
-    - "destructive repair"
-    - "runner/CD release"
-    - "generic container restart"
-
-persistent_monitoring:
-  host: "110"
-  install_command: "bash scripts/reboot-recovery/install-cold-start-monitor-110.sh"
-  schedule: "*/10 * * * *"
-  mode: "read_only"
-  send_alert_test: false
-  scripts:
-    check: "/home/wooo/scripts/full-stack-cold-start-check.sh"
-    exporter: "/home/wooo/scripts/cold-start-textfile-exporter.sh"
-  outputs:
-    textfile: "/home/wooo/node_exporter_textfiles/cold_start_recovery.prom"
-    last_log: "/home/wooo/reboot-recovery/cold-start-last.log"
-  metrics:
-    - "awoooi_cold_start_monitor_up"
-    - "awoooi_cold_start_pass_gates"
-    - "awoooi_cold_start_warn_gates"
-    - "awoooi_cold_start_blocked_gates"
-    - "awoooi_cold_start_last_run_timestamp"
-    - "awoooi_cold_start_last_green_timestamp"
-    - "awoooi_cold_start_last_result"
-  prometheus_alerts:
-    - "ColdStartMonitorMissing"
-    - "ColdStartMonitorStale"
-    - "ColdStartRecoveryBlocked"
-    - "ColdStartRecoveryDegraded"
-    - "ColdStartLastGreenTooOld"
-  ai_contract:
-    monitor_missing: "diagnose cron/textfile path only"
-    stale: "collect cron log and last check log"
-    degraded: "collect evidence, do not release high-load work"
-    blocked: "follow first BLOCKED gate in phase order"
-    forbidden: "generic restart, stateful restart, destructive repair"
-
-final_confirmation:
-  command: "bash scripts/reboot-recovery/full-stack-cold-start-check.sh --watch --interval 60 --max-attempts 30 --send-alert-test"
-  green_result:
-    PASS: "greater than 0"
-    WARN: 0
-    BLOCKED: 0
-    summary: "Result: GREEN"
+authoritative_checks:
+  read_only_monitor:
+    command: bash scripts/reboot-recovery/full-stack-cold-start-check.sh --monitor-read-only --no-color
+    expected_for_cron: PASS>0 WARN=0 BLOCKED=0
+  release_gate:
+    command: SSH_BATCH_MODE=yes bash scripts/reboot-recovery/full-stack-cold-start-check.sh --send-alert-test
+    expected: PASS=64 WARN=0 BLOCKED=0
+  textfile_metric:
+    path: /home/wooo/node_exporter_textfiles/cold_start_recovery.prom
+    green_metric: awoooi_cold_start_last_result{host="110",scope="110_120_121_188",result="green"} 1
+  backup_baseline:
+    path: ops/reboot-recovery/full-stack-backup-baseline.yml
+    required_metrics:
+      - awoooi_backup_health_monitor_up
+      - awoooi_backup_job_fresh
+      - awoooi_backup_integrity_fresh
+      - awoooi_velero_restore_test_cron_present
+      - awoooi_velero_restore_test_last_success_fresh
diff --git a/scripts/ops/backup-alert-label-contract-check.py b/scripts/ops/backup-alert-label-contract-check.py
new file mode 100755
index 00000000..ccfbc818
--- /dev/null
+++ b/scripts/ops/backup-alert-label-contract-check.py
@@ -0,0 +1,260 @@
+#!/usr/bin/env python3
+"""
+Validate the backup alert label contract.
+
+Node exporter textfile metrics use labels such as job="backup_all" locally, but
+Prometheus rewrites that metric label to exported_job because the scrape target
+already has job="node-exporter-110". Backup alerts must therefore use
+$labels.exported_job in user-facing text and exported_job="..." in expressions.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+import urllib.parse
+import urllib.request
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+
+DEFAULT_RULES = Path("ops/monitoring/alerts-unified.yml")  # default value of --rules
+DEFAULT_BASELINE = Path("ops/reboot-recovery/full-stack-backup-baseline.yml")  # default value of --baseline
+
+
+class ContractError(RuntimeError):
+    """Raised when a static or live backup alert contract check fails."""
+
+
+def _load_alerts(path: Path) -> dict[str, dict[str, Any]]:
+    """Map alert name -> rule for the rules YAML at *path*; rules without an "alert" value are skipped."""
+    document = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
+    return {
+        entry["alert"]: entry  # a later rule with the same alert name replaces the earlier one
+        for rule_group in document.get("groups") or []
+        for entry in rule_group.get("rules") or []
+        if entry.get("alert")
+    }
+
+
+def _annotation_text(rule: dict[str, Any]) -> str:
+    """Join every annotation value of *rule* (stringified) with newlines; "" when there are none."""
+    return "\n".join(map(str, (rule.get("annotations") or {}).values()))
+
+
+def _require_alert(alerts: dict[str, dict[str, Any]], name: str) -> dict[str, Any]:  # look up a mandatory alert rule
+    if name not in alerts:  # report absence as a ContractError instead of a bare KeyError
+        raise ContractError(f"missing alert: {name}")
+    return alerts[name]
+
+
+def _require_contains(value: str, expected: str, label: str) -> None:  # assert substring `expected` occurs in `value`
+    if expected not in value:
+        raise ContractError(f"{label} must contain {expected!r}")  # `label` names the checked field in the message
+
+
+def _require_not_contains(value: str, forbidden: str, label: str) -> None:  # assert substring `forbidden` is absent
+    if forbidden in value:
+        raise ContractError(f"{label} must not contain {forbidden!r}")  # `label` names the checked field in the message
+
+
+def _expected_backup_alerts(path: Path) -> list[str]:
+    """Return the alert names under monitoring_contract.prometheus_alerts in the YAML at *path*."""
+    contract = (yaml.safe_load(path.read_text(encoding="utf-8")) or {}).get("monitoring_contract") or {}
+    if not contract.get("prometheus_alerts"):  # `or {}` above also covers a null/empty monitoring_contract key
+        raise ContractError(f"missing monitoring_contract.prometheus_alerts in {path}")
+    return [str(alert) for alert in contract["prometheus_alerts"]]
+
+
+def static_check(path: Path, baseline_path: Path) -> list[str]:  # offline check of the rules file; returns OK lines
+    alerts = _load_alerts(path)
+    lines: list[str] = []
+
+    missing = sorted(set(_expected_backup_alerts(baseline_path)) - set(alerts))  # baseline alerts absent from the rules file
+    if missing:
+        raise ContractError(f"alerts-unified.yml missing baseline backup alerts: {missing}")
+    lines.append("OK alerts-unified.yml contains every baseline backup alert")
+
+    rule = _require_alert(alerts, "BackupExpectedJobMissing")
+    _require_contains(str(rule.get("expr", "")), "awoooi_backup_job_configured", "BackupExpectedJobMissing expr")
+    text = _annotation_text(rule)
+    _require_contains(text, "$labels.exported_job", "BackupExpectedJobMissing annotations")
+    _require_not_contains(text, "$labels.job", "BackupExpectedJobMissing annotations")  # job is the scrape target label (see module docstring)
+    lines.append("OK BackupExpectedJobMissing uses exported_job label")
+
+    rule = _require_alert(alerts, "BackupJobStale")
+    _require_contains(str(rule.get("expr", "")), "awoooi_backup_job_fresh", "BackupJobStale expr")
+    text = _annotation_text(rule)
+    _require_contains(text, "$labels.exported_job", "BackupJobStale annotations")
+    _require_not_contains(text, "$labels.job", "BackupJobStale annotations")
+    for required_label in ["$labels.max_age_hours", "$labels.source", "$labels.target"]:
+        _require_contains(text, required_label, "BackupJobStale annotations")
+    lines.append("OK BackupJobStale uses exported_job/source/target labels")
+
+    rule = _require_alert(alerts, "BackupAggregateRunFailed")
+    _require_contains(
+        str(rule.get("expr", "")),
+        'awoooi_backup_last_run_failed_count{host="110",exported_job="backup_all"}',  # exact selector text, order included
+        "BackupAggregateRunFailed expr",
+    )
+    lines.append("OK BackupAggregateRunFailed filters exported_job=backup_all")
+
+    rule = _require_alert(alerts, "BackupConfigCapturePartial")
+    _require_contains(str(rule.get("expr", "")), "awoooi_backup_config_capture_ok", "BackupConfigCapturePartial expr")
+    text = _annotation_text(rule)
+    for required_label in ["$labels.target", "$labels.source"]:
+        _require_contains(text, required_label, "BackupConfigCapturePartial annotations")
+    lines.append("OK BackupConfigCapturePartial uses target/source labels")
+
+    rule = _require_alert(alerts, "BackupConfigCaptureStatusStale")
+    _require_contains(
+        str(rule.get("expr", "")),
+        "awoooi_backup_config_capture_status_timestamp",
+        "BackupConfigCaptureStatusStale expr",
+    )
+    lines.append("OK BackupConfigCaptureStatusStale checks config capture status timestamp")
+
+    rule = _require_alert(alerts, "BackupScriptMissing")
+    _require_contains(_annotation_text(rule), "$labels.script", "BackupScriptMissing annotations")
+    lines.append("OK BackupScriptMissing uses script label")
+
+    rule = _require_alert(alerts, "BackupCredentialEscrowEvidenceMissing")
+    _require_contains(_annotation_text(rule), "$labels.item", "BackupCredentialEscrowEvidenceMissing annotations")
+    lines.append("OK BackupCredentialEscrowEvidenceMissing uses item label")
+
+    return lines  # reached only when no check above raised ContractError
+
+
+def _prom_query(base_url: str, expr: str) -> list[dict[str, Any]]:
+    """Run instant query *expr* against Prometheus at *base_url* and return its result list ([] if empty)."""
+    endpoint = base_url.rstrip("/") + "/api/v1/query?" + urllib.parse.urlencode({"query": expr})
+    with urllib.request.urlopen(endpoint, timeout=8) as reply:
+        body = json.loads(reply.read().decode("utf-8"))
+    if body.get("status") == "success":
+        return body.get("data", {}).get("result") or []
+    raise ContractError(f"Prometheus query failed for {expr}: {body}")
+
+
+def _prom_rules(base_url: str) -> list[dict[str, Any]]:
+    """Fetch /api/v1/rules and flatten it into name/health/state/group string records.
+
+    Rules carrying neither a "name" nor an "alert" value are dropped; absent
+    health/state/group fields become empty strings.
+    """
+    with urllib.request.urlopen(f"{base_url.rstrip('/')}/api/v1/rules", timeout=8) as reply:
+        body = json.loads(reply.read().decode("utf-8"))
+    if body.get("status") != "success":
+        raise ContractError(f"Prometheus rules query failed: {body}")
+    return [
+        {
+            "name": str(entry.get("name") or entry.get("alert")),
+            "health": str(entry.get("health", "")),
+            "state": str(entry.get("state", "")),
+            "group": str(rule_group.get("name", "")),
+        }
+        for rule_group in body.get("data", {}).get("groups") or []
+        for entry in rule_group.get("rules") or []
+        if entry.get("name") or entry.get("alert")
+    ]
+
+
+def _require_live_label(base_url: str, expr: str, labels: set[str]) -> str:  # checks labels on the first series of expr
+    series = _prom_query(base_url, expr)
+    if not series:
+        raise ContractError(f"Prometheus query returned no series: {expr}")
+    present = series[0].get("metric") or {}
+    absent = sorted(set(labels) - set(present))
+    if absent:
+        raise ContractError(f"{expr} missing labels {absent}; labels={sorted(present)}")
+    return f"OK live {expr} exposes labels {','.join(sorted(labels))}"
+
+
+def _require_live_rules(base_url: str, expected_alerts: list[str]) -> list[str]:
+    """Require every expected alert to be loaded in live Prometheus with ok (or unreported) health."""
+    rules = _prom_rules(base_url)
+    by_name = {rule["name"]: rule for rule in rules}
+    missing = sorted(set(expected_alerts) - set(by_name))
+    if missing:
+        raise ContractError(f"Prometheus missing loaded backup alert rules: {missing}")
+    # Scan the full rule list: by_name keeps only the last rule per name and would hide an unhealthy duplicate.
+    unhealthy = [
+        f"{rule['name']} health={rule['health']} group={rule['group']}"
+        for rule in rules
+        if rule["name"] in expected_alerts and rule["health"] not in {"", "ok"}
+    ]
+    if unhealthy:
+        raise ContractError(f"Prometheus backup alert rule health is not ok: {unhealthy}")
+    state_counts: dict[str, int] = {}
+    for name in expected_alerts:
+        state = by_name[name]["state"] or "unknown"
+        state_counts[state] = state_counts.get(state, 0) + 1
+    state_summary = ",".join(f"{key}={state_counts[key]}" for key in sorted(state_counts))
+    return [
+        f"OK live Prometheus loaded {len(expected_alerts)} baseline backup alert rules",
+        f"OK live Prometheus backup alert rule states {state_summary}",
+    ]
+
+
+def live_check(base_url: str, baseline_path: Path) -> list[str]:
+    """Probe live Prometheus for the backup metric labels, then for the loaded baseline alert rules.
+
+    Probes run in the listed order and the first failing one raises, so the
+    returned OK lines appear in that same order, followed by the rule check lines.
+    """
+    probes: list[tuple[str, set[str]]] = [
+        (
+            'awoooi_backup_job_configured{host="110"}',
+            {"exported_job", "host", "job"},
+        ),
+        (
+            'awoooi_backup_job_fresh{host="110"}',
+            {"exported_job", "host", "job", "source", "target", "max_age_hours"},
+        ),
+        (
+            'awoooi_backup_last_run_failed_count{host="110"}',
+            {"exported_job", "host", "job"},
+        ),
+        (
+            'awoooi_backup_dr_next_step_info{host="110"}',
+            {"host", "next_step"},
+        ),
+        (
+            'awoooi_backup_offsite_partial_fresh{host="110",provider="rclone"}',
+            {"host", "provider", "scope", "max_age_hours"},
+        ),
+        (
+            'awoooi_backup_config_capture_ok{host="110"}',
+            {"host", "target", "source", "critical"},
+        ),
+    ]
+    lines = [_require_live_label(base_url, expr, labels) for expr, labels in probes]
+    lines.extend(_require_live_rules(base_url, _expected_backup_alerts(baseline_path)))
+    return lines
+
+
+def main() -> int:
+    """CLI entry point: run the static check, then the live check when --prometheus-url is set.
+    Returns 0 after printing BACKUP_ALERT_LABEL_CONTRACT_OK, 1 after reporting a failure on stderr."""
+    cli = argparse.ArgumentParser(description=__doc__)
+    cli.add_argument("--rules", type=Path, default=DEFAULT_RULES)
+    cli.add_argument("--baseline", type=Path, default=DEFAULT_BASELINE)
+    cli.add_argument("--prometheus-url", default="")
+    opts = cli.parse_args()
+    try:
+        for message in static_check(opts.rules, opts.baseline):
+            print(message)
+        if opts.prometheus_url:
+            for message in live_check(opts.prometheus_url, opts.baseline):
+                print(message)
+    except (ContractError, OSError, yaml.YAMLError, json.JSONDecodeError) as failure:
+        print(f"BACKUP_ALERT_LABEL_CONTRACT_FAILED {failure}", file=sys.stderr)
+        return 1
+    print("BACKUP_ALERT_LABEL_CONTRACT_OK")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/ops/backup-alert-live-visibility-check.py b/scripts/ops/backup-alert-live-visibility-check.py
new file mode 100755
index 00000000..7ec765ec
--- /dev/null
+++ b/scripts/ops/backup-alert-live-visibility-check.py
@@ -0,0 +1,242 @@
+#!/usr/bin/env python3
+"""Verify live visibility for backup gap alerts.
+
+This read-only check closes the gap between "metrics exist" and "alerts are
+actually visible". If the offsite or credential-escrow gap metrics are present,
+the corresponding Prometheus firing alerts must be visible. When Alertmanager is
+provided, those same alerts must also be active there.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+import time
+import urllib.parse
+import urllib.request
+from dataclasses import dataclass
+from typing import Any
+
+
+class VisibilityError(RuntimeError):
+    pass
+
+
+@dataclass(frozen=True)
+class RequiredAlert:
+    name: str
+    labels: dict[str, str]
+
+
+COMMON_LABELS = {
+    "host": "110",
+    "auto_repair": "false",
+    "alert_category": "infrastructure",
+    "notification_type": "TYPE-1",
+    "severity": "warning",
+}
+
+
+def _json_get(url: str, timeout: int) -> Any:
+    with urllib.request.urlopen(url, timeout=timeout) as response:
+        return json.loads(response.read().decode("utf-8"))
+
+
+def _prom_query(base_url: str, expr: str, timeout: int) -> list[dict[str, Any]]:
+    query = urllib.parse.urlencode({"query": expr})
+    url = f"{base_url.rstrip('/')}/api/v1/query?{query}"
+    payload = _json_get(url, timeout)
+    if payload.get("status") != "success":
+        raise VisibilityError(f"Prometheus query failed for {expr}: {payload}")
+    return payload.get("data", {}).get("result") or []
+
+
+def _prom_alerts(base_url: str, timeout: int) -> list[dict[str, Any]]:
+    url = f"{base_url.rstrip('/')}/api/v1/alerts"
+    payload = _json_get(url, timeout)
+    if payload.get("status") != "success":
+        raise VisibilityError(f"Prometheus alerts query failed: {payload}")
+    return payload.get("data", {}).get("alerts") or []
+
+
+def _alertmanager_alerts(base_url: str, timeout: int) -> list[dict[str, Any]]:
+    url = f"{base_url.rstrip('/')}/api/v2/alerts"
+    payload = _json_get(url, timeout)
+    if not isinstance(payload, list):
+        raise VisibilityError(f"Alertmanager alerts query returned unexpected payload: {payload}")
+    return payload
+
+
+def _float_value(row: dict[str, Any], expr: str) -> float:
+    value = row.get("value")
+    if not isinstance(value, list) or len(value) < 2:
+        raise VisibilityError(f"Prometheus query returned unexpected value for {expr}: {row}")
+    try:
+        return float(value[1])
+    except (TypeError, ValueError) as exc:
+        raise VisibilityError(f"Prometheus query returned non-numeric value for {expr}: {row}") from exc
+
+
+def _metric_labels(row: dict[str, Any]) -> dict[str, str]:
+    metric = row.get("metric") or {}
+    return {str(key): str(value) for key, value in metric.items()}
+
+
+def _labels_match(actual: dict[str, str], expected: dict[str, str]) -> bool:
+    return all(actual.get(key) == value for key, value in expected.items())
+
+
+def _find_prom_alert(alerts: list[dict[str, Any]], required: RequiredAlert) -> dict[str, Any] | None:
+    expected = {"alertname": required.name, **required.labels}
+    for alert in alerts:
+        if str(alert.get("state", "")) != "firing":
+            continue
+        labels = {str(key): str(value) for key, value in (alert.get("labels") or {}).items()}
+        if _labels_match(labels, expected):
+            return alert
+    return None
+
+
+def _find_alertmanager_alert(alerts: list[dict[str, Any]], required: RequiredAlert) -> dict[str, Any] | None:
+    expected = {"alertname": required.name, **required.labels}
+    for alert in alerts:
+        status = alert.get("status") or {}
+        if str(status.get("state", "")) != "active":
+            continue
+        labels = {str(key): str(value) for key, value in (alert.get("labels") or {}).items()}
+        if _labels_match(labels, expected):
+            return alert
+    return None
+
+
+def _require_prom_alert(alerts: list[dict[str, Any]], required: RequiredAlert) -> None:
+    if _find_prom_alert(alerts, required) is None:
+        raise VisibilityError(
+            f"missing Prometheus firing alert {required.name} with labels {required.labels}"
+        )
+
+
+def _require_alertmanager_alert(alerts: list[dict[str, Any]], required: RequiredAlert) -> None:
+    if _find_alertmanager_alert(alerts, required) is None:
+        raise VisibilityError(
+            f"missing Alertmanager active alert {required.name} with labels {required.labels}"
+        )
+
+
+def _sum_query_values(prometheus_url: str, expr: str, timeout: int) -> float:
+    return sum(_float_value(row, expr) for row in _prom_query(prometheus_url, expr, timeout))
+
+
+def _max_query_value(prometheus_url: str, expr: str, timeout: int) -> float:
+    rows = _prom_query(prometheus_url, expr, timeout)
+    if not rows:
+        return 0
+    return max(_float_value(row, expr) for row in rows)
+
+
+def _offsite_required_alerts(prometheus_url: str, host: str, timeout: int) -> tuple[list[RequiredAlert], str]:
+    expr = f'awoooi_backup_offsite_configured{{host="{host}"}}'
+    rows = _prom_query(prometheus_url, expr, timeout)
+    if not rows:
+        raise VisibilityError(f"Prometheus query returned no offsite configured series: {expr}")
+    configured_total = sum(_float_value(row, expr) for row in rows)
+    if configured_total == 0:
+        return (
+            [RequiredAlert("BackupOffsiteCopyNotConfigured", {**COMMON_LABELS, "host": host})],
+            "OK offsite gap metric requires BackupOffsiteCopyNotConfigured visibility",
+        )
+
+    fresh_expr = f'awoooi_backup_offsite_fresh{{host="{host}"}}'
+    if _sum_query_values(prometheus_url, fresh_expr, timeout) > 0:
+        return [], "OK offsite full marker is fresh; no offsite gap alert required"
+
+    enabled_expr = f'awoooi_backup_offsite_full_sync_enabled{{host="{host}"}}'
+    enabled_total = _sum_query_values(prometheus_url, enabled_expr, timeout)
+    if enabled_total > 0:
+        timestamp_expr = f'awoooi_backup_offsite_full_sync_enabled_timestamp{{host="{host}"}}'
+        enabled_timestamp = _max_query_value(prometheus_url, timestamp_expr, timeout)
+        enabled_age = int(time.time() - enabled_timestamp) if enabled_timestamp else 0
+        if enabled_timestamp and enabled_age <= 30 * 3600:
+            return (
+                [],
+                f"OK offsite full sync enabled within grace window; BackupOffsiteCopyStale not required yet age_seconds={enabled_age}",
+            )
+
+    return (
+        [RequiredAlert("BackupOffsiteCopyStale", {**COMMON_LABELS, "host": host})],
+        "OK offsite full marker gap requires BackupOffsiteCopyStale visibility",
+    )
+
+
+def _escrow_required_alerts(prometheus_url: str, host: str, timeout: int) -> list[RequiredAlert]:
+    expr = f'awoooi_backup_credential_escrow_fresh{{host="{host}"}} == 0'
+    rows = _prom_query(prometheus_url, expr, timeout)
+    required: list[RequiredAlert] = []
+    for row in rows:
+        labels = _metric_labels(row)
+        item = labels.get("item")
+        if not item:
+            raise VisibilityError(f"Credential escrow gap metric missing item label: {row}")
+        required.append(
+            RequiredAlert(
+                "BackupCredentialEscrowEvidenceMissing",
+                {**COMMON_LABELS, "host": host, "item": item},
+            )
+        )
+    return sorted(required, key=lambda alert: alert.labels["item"])
+
+
+def live_check(prometheus_url: str, alertmanager_url: str, host: str, timeout: int) -> list[str]:
+    required_alerts: list[RequiredAlert] = []
+    lines: list[str] = []
+
+    offsite_alerts, offsite_line = _offsite_required_alerts(prometheus_url, host, timeout)
+    required_alerts.extend(offsite_alerts)
+    lines.append(offsite_line)
+
+    escrow_alerts = _escrow_required_alerts(prometheus_url, host, timeout)
+    required_alerts.extend(escrow_alerts)
+    if escrow_alerts:
+        escrow_items = ", ".join(alert.labels["item"] for alert in escrow_alerts)
+        lines.append(
+            f"OK credential escrow gap metrics require {len(escrow_alerts)} alert(s): {escrow_items}"
+        )
+    else:
+        lines.append("OK credential escrow markers are fresh; no escrow gap alert required")
+
+    prom_alerts = _prom_alerts(prometheus_url, timeout)
+    for required in required_alerts:
+        _require_prom_alert(prom_alerts, required)
+    lines.append(f"OK Prometheus exposes {len(required_alerts)} required backup gap firing alert(s)")
+
+    if alertmanager_url:
+        am_alerts = _alertmanager_alerts(alertmanager_url, timeout)
+        for required in required_alerts:
+            _require_alertmanager_alert(am_alerts, required)
+        lines.append(f"OK Alertmanager exposes {len(required_alerts)} required backup gap active alert(s)")
+
+    return lines
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--prometheus-url", required=True)
+    parser.add_argument("--alertmanager-url", default="")
+    parser.add_argument("--host", default="110")
+    parser.add_argument("--timeout", type=int, default=8)
+    args = parser.parse_args()
+
+    try:
+        for line in live_check(args.prometheus_url, args.alertmanager_url, args.host, args.timeout):
+            print(line)
+    except (VisibilityError, OSError, json.JSONDecodeError) as exc:
+        print(f"BACKUP_ALERT_LIVE_VISIBILITY_FAILED {exc}", file=sys.stderr)
+        return 1
+
+    print("BACKUP_ALERT_LIVE_VISIBILITY_OK")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/ops/prometheus-rule-drift-guard.sh b/scripts/ops/prometheus-rule-drift-guard.sh
index d83635dd..a8bf9b19 100755
--- a/scripts/ops/prometheus-rule-drift-guard.sh
+++ b/scripts/ops/prometheus-rule-drift-guard.sh
@@ -1,9 +1,9 @@
 #!/usr/bin/env bash
 # Guard 110 Prometheus alert rules against stale deploys.
 #
-# The canonical file is the source of truth. The guard restores active
-# alerts.yml only when the active file differs from canonical or when
-# Prometheus is missing rule names declared by canonical.
+# This script is intentionally narrow: it only restores the canonical alert
+# rules file when required recovery/backup rules disappear from live Prometheus
+# or when the active file differs from the canonical copy.
 
 set -uo pipefail
 
@@ -14,6 +14,14 @@ CANONICAL_RULES="${CANONICAL_RULES:-/home/wooo/monitoring/alerts-unified.canonic
 TEXTFILE="${TEXTFILE:-/home/wooo/node_exporter_textfiles/prometheus_rule_drift_guard.prom}"
 LOG_FILE="${LOG_FILE:-/home/wooo/logs/prometheus-rule-drift-guard.log}"
 
+REQUIRED_RULES=(
+  "BackupCredentialEscrowEvidenceMissing"
+  "BackupExpectedJobMissing"
+  "awoooi_recovery_core_ready"
+  "awoooi_recovery_dr_offsite_ready"
+  "ColdStartRecoveryBlocked"
+)
+
 log() {
   mkdir -p "$(dirname "$LOG_FILE")" 2>/dev/null || true
   printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >>"$LOG_FILE"
@@ -34,7 +42,7 @@ awoooi_prometheus_rule_drift_guard_last_run_timestamp{host="${HOST_LABEL}",statu
 # HELP awoooi_prometheus_rule_drift_guard_repaired Whether the guard restored canonical Prometheus rules on the last run.
 # TYPE awoooi_prometheus_rule_drift_guard_repaired gauge
 awoooi_prometheus_rule_drift_guard_repaired{host="${HOST_LABEL}"} ${repaired}
-# HELP awoooi_prometheus_rule_drift_guard_missing_required_count Number of canonical live rules missing after the last check.
+# HELP awoooi_prometheus_rule_drift_guard_missing_required_count Number of required live rules missing after the last check.
 # TYPE awoooi_prometheus_rule_drift_guard_missing_required_count gauge
 awoooi_prometheus_rule_drift_guard_missing_required_count{host="${HOST_LABEL}"} ${missing_count}
 # HELP awoooi_prometheus_rule_drift_guard_current_matches_canonical Whether active alerts.yml matches canonical copy.
@@ -46,27 +54,13 @@ EOF
 }
 
 rules_missing_count() {
-  python3 - "$PROMETHEUS_URL" "$CANONICAL_RULES" <<'PY'
+  python3 - "$PROMETHEUS_URL" "${REQUIRED_RULES[@]}" <<'PY'
 import json
-import re
 import sys
 import urllib.request
 
 base_url = sys.argv[1].rstrip("/")
-canonical_path = sys.argv[2]
-
-name_pattern = re.compile(r"^\s*-\s*(?:alert|record):\s*['\"]?([^'\"#]+?)['\"]?\s*(?:#.*)?$")
-required: set[str] = set()
-try:
-    with open(canonical_path, encoding="utf-8") as handle:
-        for line in handle:
-            match = name_pattern.match(line)
-            if match:
-                required.add(match.group(1).strip())
-except Exception as exc:
-    print(f"CANONICAL_PARSE_FAILED:{exc}")
-    raise SystemExit(0)
-
+required = set(sys.argv[2:])
 try:
     with urllib.request.urlopen(f"{base_url}/api/v1/rules", timeout=8) as response:
         payload = json.loads(response.read().decode("utf-8"))
@@ -115,8 +109,8 @@ main() {
   before_matches="$(matches_canonical)"
   repaired=0
 
-  if [[ "$missing" == QUERY_FAILED:* || "$missing" == CANONICAL_PARSE_FAILED:* ]]; then
-    log "Prometheus/canonical query failed: ${missing}"
+  if [[ "$missing" == QUERY_FAILED:* ]]; then
+    log "Prometheus query failed: ${missing}"
     write_textfile "query_failed" 0 999 "$before_matches"
     return 1
   fi
@@ -135,8 +129,8 @@ main() {
 
   after_missing="$(rules_missing_count)"
   after_matches="$(matches_canonical)"
-  if [[ "$after_missing" == QUERY_FAILED:* || "$after_missing" == CANONICAL_PARSE_FAILED:* ]]; then
-    log "post-restore Prometheus/canonical query failed: ${after_missing}"
+  if [[ "$after_missing" == QUERY_FAILED:* ]]; then
+    log "post-restore Prometheus query failed: ${after_missing}"
     write_textfile "post_query_failed" "$repaired" 999 "$after_matches"
     return 1
   fi
diff --git a/scripts/ops/recovery-scorecard-contract-check.py b/scripts/ops/recovery-scorecard-contract-check.py
new file mode 100755
index 00000000..e4f59f3c
--- /dev/null
+++ b/scripts/ops/recovery-scorecard-contract-check.py
@@ -0,0 +1,148 @@
+#!/usr/bin/env python3
+"""Validate recovery scorecard recording-rule contract."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+import urllib.parse
+import urllib.request
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+
+DEFAULT_RULES = Path("ops/monitoring/alerts-unified.yml")
+DEFAULT_BASELINE = Path("ops/reboot-recovery/full-stack-backup-baseline.yml")
+EXPECTED_CORE = 'awoooi_recovery_core_ready{host="110",scope="110_120_121_188"}'
+EXPECTED_DR = 'awoooi_recovery_dr_offsite_ready{host="110"}'
+
+
+class ContractError(RuntimeError):
+    pass
+
+
+def _rules(path: Path) -> list[dict[str, Any]]:
+    data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
+    rules: list[dict[str, Any]] = []
+    for group in data.get("groups") or []:
+        rules.extend(group.get("rules") or [])
+    return rules
+
+
+def _expected_recording_rules(path: Path) -> list[str]:
+    data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
+    rules = data.get("monitoring_contract", {}).get("prometheus_recording_rules") or []
+    if not rules:
+        raise ContractError(f"missing monitoring_contract.prometheus_recording_rules in {path}")
+    return [str(rule) for rule in rules]
+
+
+def static_check(rules_path: Path, baseline_path: Path) -> list[str]:
+    rules = _rules(rules_path)
+    by_record = {str(rule.get("record")): rule for rule in rules if rule.get("record")}
+    expected = _expected_recording_rules(baseline_path)
+    missing = sorted(set(expected) - set(by_record))
+    if missing:
+        raise ContractError(f"alerts-unified.yml missing recovery recording rules: {missing}")
+
+    core_expr = str(by_record["awoooi_recovery_core_ready"].get("expr", ""))
+    for required in [
+        "awoooi_cold_start_last_result",
+        "awoooi_cold_start_warn_gates",
+        "awoooi_cold_start_blocked_gates",
+        "awoooi_cold_start_last_green_timestamp",
+    ]:
+        if required not in core_expr:
+            raise ContractError(f"awoooi_recovery_core_ready expr missing {required}")
+
+    dr_expr = str(by_record["awoooi_recovery_dr_offsite_ready"].get("expr", ""))
+    for required in [
+        "awoooi_backup_offsite_configured",
+        "awoooi_backup_offsite_fresh",
+        "awoooi_backup_credential_escrow_fresh",
+    ]:
+        if required not in dr_expr:
+            raise ContractError(f"awoooi_recovery_dr_offsite_ready expr missing {required}")
+
+    return [
+        "OK alerts-unified.yml contains every recovery scorecard recording rule",
+        "OK recovery core rule depends on cold-start green/warn/blocked/last-green metrics",
+        "OK recovery DR rule depends on provider-neutral offsite freshness and credential escrow freshness",
+    ]
+
+
+def _prom_query(base_url: str, expr: str) -> list[dict[str, Any]]:
+    url = f"{base_url.rstrip('/')}/api/v1/query?" + urllib.parse.urlencode({"query": expr})
+    with urllib.request.urlopen(url, timeout=8) as response:
+        payload = json.loads(response.read().decode("utf-8"))
+    if payload.get("status") != "success":
+        raise ContractError(f"Prometheus query failed for {expr}: {payload}")
+    return payload.get("data", {}).get("result") or []
+
+
+def _single_value(base_url: str, expr: str) -> float:
+    rows = _prom_query(base_url, expr)
+    if len(rows) != 1:
+        raise ContractError(f"Prometheus query expected one series for {expr}, got {len(rows)}")
+    value = rows[0].get("value") or []
+    if len(value) < 2:
+        raise ContractError(f"Prometheus query returned malformed value for {expr}: {rows[0]}")
+    try:
+        number = float(value[1])
+    except (TypeError, ValueError) as exc:
+        raise ContractError(f"Prometheus query returned non-numeric value for {expr}: {rows[0]}") from exc
+    if number not in {0.0, 1.0}:
+        raise ContractError(f"Prometheus recovery scorecard metric must be 0 or 1: {expr}={number}")
+    return number
+
+
+def live_check(
+    base_url: str,
+    expect_core_ready: bool = False,
+    expect_dr_ready: bool = False,
+) -> list[str]:
+    core = _single_value(base_url, EXPECTED_CORE)
+    dr = _single_value(base_url, EXPECTED_DR)
+    lines = [
+        f"OK live {EXPECTED_CORE} value={int(core)}",
+        f"OK live {EXPECTED_DR} value={int(dr)}",
+    ]
+    if expect_core_ready and core != 1.0:
+        raise ContractError(f"expected core recovery ready, got {core}")
+    if expect_dr_ready and dr != 1.0:
+        raise ContractError(f"expected DR offsite ready, got {dr}")
+    return lines
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--rules", type=Path, default=DEFAULT_RULES)
+    parser.add_argument("--baseline", type=Path, default=DEFAULT_BASELINE)
+    parser.add_argument("--prometheus-url", default="")
+    parser.add_argument("--expect-core-ready", action="store_true")
+    parser.add_argument("--expect-dr-ready", action="store_true")
+    args = parser.parse_args()
+
+    try:
+        for line in static_check(args.rules, args.baseline):
+            print(line)
+        if args.prometheus_url:
+            for line in live_check(
+                args.prometheus_url,
+                args.expect_core_ready,
+                args.expect_dr_ready,
+            ):
+                print(line)
+    except (ContractError, OSError, yaml.YAMLError, json.JSONDecodeError) as exc:
+        print(f"RECOVERY_SCORECARD_CONTRACT_FAILED {exc}", file=sys.stderr)
+        return 1
+
+    print("RECOVERY_SCORECARD_CONTRACT_OK")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/reboot-recovery/cold-start-textfile-exporter.sh b/scripts/reboot-recovery/cold-start-textfile-exporter.sh
index 82262eea..2798f877 100755
--- a/scripts/reboot-recovery/cold-start-textfile-exporter.sh
+++ b/scripts/reboot-recovery/cold-start-textfile-exporter.sh
@@ -1,10 +1,8 @@
 #!/usr/bin/env bash
 # Export AWOOOI full-stack cold-start gate status as node-exporter textfile metrics.
 #
-# 2026-05-06 ogt + Codex: reboot recovery hardening.
-# Intent: give Prometheus and the AI incident flow a durable, read-only signal
-# for the 110/120/121/188 startup gates. This wrapper never sends the
-# Alertmanager smoke event and never writes remote state.
+# This wrapper is read-only: it never sends the Alertmanager smoke event and
+# never mutates remote host/service state.
 
 set -uo pipefail
 
@@ -13,6 +11,8 @@ TEXTFILE_DIR="${TEXTFILE_DIR:-${NODE_EXPORTER_TEXTFILE_DIR:-/home/wooo/node_expo
 OUTPUT_NAME="${OUTPUT_NAME:-cold_start_recovery.prom}"
 LOG_DIR="${LOG_DIR:-/home/wooo/reboot-recovery}"
 CHECK_TIMEOUT_SECONDS="${CHECK_TIMEOUT_SECONDS:-240}"
+CHECK_WATCH_INTERVAL_SECONDS="${CHECK_WATCH_INTERVAL_SECONDS:-10}"
+CHECK_WATCH_MAX_ATTEMPTS="${CHECK_WATCH_MAX_ATTEMPTS:-3}"
 HOST_LABEL="${AIOPS_HOST_LABEL:-110}"
 SCOPE_LABEL="${AIOPS_SCOPE_LABEL:-110_120_121_188}"
 LOCK_FILE="${LOCK_FILE:-/tmp/awoooi-cold-start-textfile-exporter.lock}"
@@ -35,6 +35,10 @@ write_metric_file() {
   local blocked_state="${11}"
   local check_failed="${12}"
   local last_green="${13}"
+  local k3s_node_fs_blocker="${14}"
+  local public_route_tls_blocker="${15}"
+  local host_120_unreachable_blocker="${16}"
+  local backup_health_blocker="${17}"
   local host scope
   host=$(escape_label "$HOST_LABEL")
   scope=$(escape_label "$SCOPE_LABEL")
@@ -70,10 +74,16 @@ awoooi_cold_start_last_result{host="$host",scope="$scope",result="green"} $green
 awoooi_cold_start_last_result{host="$host",scope="$scope",result="degraded"} $degraded
 awoooi_cold_start_last_result{host="$host",scope="$scope",result="blocked"} $blocked_state
 awoooi_cold_start_last_result{host="$host",scope="$scope",result="check_failed"} $check_failed
+# HELP awoooi_cold_start_blocker_reason Whether a known cold-start blocker reason was detected in the last log.
+# TYPE awoooi_cold_start_blocker_reason gauge
+awoooi_cold_start_blocker_reason{host="$host",scope="$scope",reason="k3s_node_filesystem_error",target="120"} $k3s_node_fs_blocker
+awoooi_cold_start_blocker_reason{host="$host",scope="$scope",reason="public_route_tls_failure",target="public_https"} $public_route_tls_blocker
+awoooi_cold_start_blocker_reason{host="$host",scope="$scope",reason="host_unreachable",target="120"} $host_120_unreachable_blocker
+awoooi_cold_start_blocker_reason{host="$host",scope="$scope",reason="backup_health_blocked",target="110"} $backup_health_blocker
 METRICS
 }
 
-if [ -n "${BASH_VERSION:-}" ] && command -v flock >/dev/null 2>&1; then
+if command -v flock >/dev/null 2>&1; then
   exec 9>"$LOCK_FILE"
   if ! flock -n 9; then
     exit 0
@@ -92,13 +102,19 @@ if [ ! -x "$CHECK_SCRIPT" ]; then
   tmp_metric=$(mktemp "$TEXTFILE_DIR/.cold_start_recovery.XXXXXX")
   last_green=$(cat "$state_file" 2>/dev/null || echo 0)
   printf 'CHECK_SCRIPT not executable: %s\n' "$CHECK_SCRIPT" >"$log_file"
-  write_metric_file "$tmp_metric" "$end_ts" "$((end_ts - start_ts))" 127 0 0 0 1 0 0 0 1 "$last_green"
+  write_metric_file "$tmp_metric" "$end_ts" "$((end_ts - start_ts))" 127 0 0 0 1 0 0 0 1 "$last_green" 0 0 0 0
   chmod 0644 "$tmp_metric"
   mv "$tmp_metric" "$TEXTFILE_DIR/$OUTPUT_NAME"
   exit 0
 fi
 
-timeout "$CHECK_TIMEOUT_SECONDS" bash "$CHECK_SCRIPT" --monitor-read-only --no-color >"$log_tmp" 2>&1
+timeout "$CHECK_TIMEOUT_SECONDS" bash "$CHECK_SCRIPT" \
+  --monitor-read-only \
+  --no-color \
+  --watch \
+  --interval "$CHECK_WATCH_INTERVAL_SECONDS" \
+  --max-attempts "$CHECK_WATCH_MAX_ATTEMPTS" \
+  >"$log_tmp" 2>&1
 exit_code=$?
 mv "$log_tmp" "$log_file"
 
@@ -111,6 +127,10 @@ green=0
 degraded=0
 blocked_state=0
 check_failed=0
+k3s_node_fs_blocker=0
+public_route_tls_blocker=0
+host_120_unreachable_blocker=0
+backup_health_blocker=0
 
 if [ -n "$summary_line" ]; then
   monitor_up=1
@@ -130,6 +150,22 @@ else
   check_failed=1
 fi
 
+if grep -Eq 'NODE_FS_ERROR_EVENTS[[:space:]]+[1-9][0-9]*|K3s node filesystem error events present' "$log_file"; then
+  k3s_node_fs_blocker=1
+fi
+
+if grep -Eq 'PUBLIC_ROUTE_TLS .*(000|5[0-9][0-9])|public route .* TLS certificate verification failed' "$log_file"; then
+  public_route_tls_blocker=1
+fi
+
+if grep -Eq 'BLOCKED (ping 192\.168\.0\.120|ssh port 192\.168\.0\.120:22|ssh 120 k3s read-only check)' "$log_file"; then
+  host_120_unreachable_blocker=1
+fi
+
+if grep -Eq 'BLOCKED 110 backup health has stale expected jobs' "$log_file"; then
+  backup_health_blocker=1
+fi
+
 end_ts=$(date +%s)
 if [ "$green" -eq 1 ]; then
   printf '%s\n' "$end_ts" >"$state_file"
@@ -137,6 +173,6 @@ fi
 last_green=$(cat "$state_file" 2>/dev/null || echo 0)
 
 tmp_metric=$(mktemp "$TEXTFILE_DIR/.cold_start_recovery.XXXXXX")
-write_metric_file "$tmp_metric" "$end_ts" "$((end_ts - start_ts))" "$exit_code" "$monitor_up" "$pass" "$warn" "$blocked" "$green" "$degraded" "$blocked_state" "$check_failed" "$last_green"
+write_metric_file "$tmp_metric" "$end_ts" "$((end_ts - start_ts))" "$exit_code" "$monitor_up" "$pass" "$warn" "$blocked" "$green" "$degraded" "$blocked_state" "$check_failed" "$last_green" "$k3s_node_fs_blocker" "$public_route_tls_blocker" "$host_120_unreachable_blocker" "$backup_health_blocker"
 chmod 0644 "$tmp_metric"
 mv "$tmp_metric" "$TEXTFILE_DIR/$OUTPUT_NAME"
diff --git a/scripts/reboot-recovery/full-stack-cold-start-check.sh b/scripts/reboot-recovery/full-stack-cold-start-check.sh
index a032104a..cca8e0b8 100755
--- a/scripts/reboot-recovery/full-stack-cold-start-check.sh
+++ b/scripts/reboot-recovery/full-stack-cold-start-check.sh
@@ -7,6 +7,7 @@ set -uo pipefail
 SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout=6)
 SEND_ALERT_TEST=0
 MONITOR_READ_ONLY=0
+NO_COLOR_FLAG=0
 WATCH_MODE=0
 WATCH_INTERVAL=60
 WATCH_MAX_ATTEMPTS=30
@@ -30,15 +31,17 @@ USAGE
 }
 
 while [ "$#" -gt 0 ]; do
-  case "$1" in
+  arg="$1"
+  case "$arg" in
     --send-alert-test)
       SEND_ALERT_TEST=1
       ;;
     --monitor-read-only)
       MONITOR_READ_ONLY=1
+      SEND_ALERT_TEST=0
       ;;
     --no-color)
-      NO_COLOR=1
+      NO_COLOR_FLAG=1
       ;;
     --watch)
       WATCH_MODE=1
@@ -64,7 +67,7 @@ while [ "$#" -gt 0 ]; do
       exit 0
       ;;
     *)
-      echo "Unknown argument: $1" >&2
+      echo "Unknown argument: $arg" >&2
       usage >&2
       exit 64
       ;;
@@ -72,7 +75,7 @@ while [ "$#" -gt 0 ]; do
   shift
 done
 
-if [ -n "${NO_COLOR:-}" ]; then
+if [ -n "${NO_COLOR:-}" ] || [ "$NO_COLOR_FLAG" -eq 1 ]; then
   RED=""
   GREEN=""
   YELLOW=""
@@ -90,12 +93,6 @@ PASS=0
 WARN=0
 FAIL=0
 
-reset_counters() {
-  PASS=0
-  WARN=0
-  FAIL=0
-}
-
 log_section() {
   printf "\n%s=== %s ===%s\n" "$BLUE" "$1" "$NC"
 }
@@ -198,6 +195,18 @@ probe_tcp() {
   nc -G 3 -z "$host" "$port" >/dev/null 2>&1 || nc -w 3 -z "$host" "$port" >/dev/null 2>&1
 }
 
+print_neighbor_rows() {
+  # Print ARP/neighbor table rows for the four stack hosts.
+  # Returns 0 when a tool printed at least one matching row, non-zero otherwise.
+  # Falls through to `ip neigh` when `arp` is missing or yields no matching row.
+  local pattern='192\.168\.0\.(110|120|121|188)'
+  if command -v arp >/dev/null 2>&1 && arp -an | grep -E "$pattern"; then
+    return 0
+  fi
+  command -v ip >/dev/null 2>&1 || return 1
+  ip neigh show | grep -E "$pattern"
+}
+
 print_header() {
   echo "AWOOOI full-stack cold-start check"
   date '+%Y-%m-%d %H:%M:%S %Z'
@@ -222,12 +231,12 @@ check_network() {
     fi
   done
 
-  if arp -an | grep -E '192\.168\.0\.(110|120|121|188)'; then
-    ok "ARP evidence printed"
+  if print_neighbor_rows; then
+    ok "neighbor evidence printed"
   elif [ "$MONITOR_READ_ONLY" -eq 1 ]; then
-    ok "ARP evidence unavailable in monitor mode; ping and TCP gates passed"
+    ok "neighbor evidence unavailable in monitor mode; ping and TCP gates provide primary signal"
   else
-    warn "no ARP rows printed for one or more hosts"
+    warn "no neighbor rows printed for one or more hosts"
   fi
 }
 
@@ -370,21 +379,34 @@ WEB_CODE $web_code"
 
 check_public_routes() {
   log_section "P2-PUBLIC-ROUTES"
-  local awoooi_api_code awoooi_web_code momo_code momo_health_code
-  awoooi_api_code=$(probe_http_code "https://awoooi.wooo.work/api/v1/health")
-  awoooi_web_code=$(probe_http_code "https://awoooi.wooo.work/")
-  momo_code=$(probe_http_code "https://mo.wooo.work/")
-  momo_health_code=$(probe_http_code "https://mo.wooo.work/health")
+  local item name url code tls_code
+  local routes=(
+    "awoooi_api|https://awoooi.wooo.work/api/v1/health"
+    "awoooi_web|https://awoooi.wooo.work/"
+    "momo_web|https://mo.wooo.work/"
+    "momo_health|https://mo.wooo.work/health"
+    "gitea|https://gitea.wooo.work/"
+    "harbor|https://harbor.wooo.work/"
+    "registry|https://registry.wooo.work/"
+    "sentry|https://sentry.wooo.work/"
+    "signoz|https://signoz.wooo.work/"
+    "stock|https://stock.wooo.work/"
+    "langfuse|https://langfuse.wooo.work/"
+    "bitan|https://bitan.wooo.work/"
+    "aiops|https://aiops.wooo.work/"
+  )
 
-  echo "AWOOOI_PUBLIC_API_CODE $awoooi_api_code"
-  echo "AWOOOI_PUBLIC_WEB_CODE $awoooi_web_code"
-  echo "MOMO_PUBLIC_CODE $momo_code"
-  echo "MOMO_PUBLIC_HEALTH_CODE $momo_health_code"
-
-  [[ "$awoooi_api_code" =~ ^[23] ]] && ok "AWOOOI public API reachable" || warn "AWOOOI public API not confirmed"
-  [[ "$awoooi_web_code" =~ ^[23] ]] && ok "AWOOOI public web reachable" || warn "AWOOOI public web not confirmed"
-  [[ "$momo_code" =~ ^[23] ]] && ok "momo public route reachable" || warn "momo public route not confirmed"
-  [[ "$momo_health_code" =~ ^[23] ]] && ok "momo public health reachable" || warn "momo public health not confirmed"
+  for item in "${routes[@]}"; do
+    name="${item%%|*}" url="${item#*|}"
+    # if/else instead of `A && B || C`: a non-zero ok() must not also run warn/fail (SC2015).
+    code=$(probe_http_code "$url")
+    echo "PUBLIC_ROUTE $name $code $url"
+    if [[ "$code" =~ ^[23] ]]; then ok "public route $name reachable"; else warn "public route $name not confirmed"; fi
+    tls_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 8 "$url" 2>/dev/null || true)
+    tls_code="${tls_code:-000}"
+    echo "PUBLIC_ROUTE_TLS $name $tls_code $url"
+    if [[ "$tls_code" =~ ^[23] ]]; then ok "public route $name TLS certificate verified"; else fail "public route $name TLS certificate verification failed"; fi
+  done
 }
 
 check_schedules() {
@@ -394,7 +416,7 @@ check_schedules() {
   if out=$(host_cmd "ollama@192.168.0.188" '
 now=$(date +%s)
 echo "CRON_188 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
-for f in /home/ollama/node_exporter_textfiles/backup.prom /home/ollama/node_exporter_textfiles/docker_restart_count.prom /home/ollama/node_exporter_textfiles/docker_stats.prom; do
+for f in /home/ollama/node_exporter_textfiles/backup.prom /home/ollama/node_exporter_textfiles/backup_health.prom /home/ollama/node_exporter_textfiles/docker_restart_count.prom /home/ollama/node_exporter_textfiles/docker_stats.prom /home/ollama/node_exporter_textfiles/storage_health.prom; do
   if [ -f "$f" ]; then
     mt=$(stat -c %Y "$f")
     echo "TEXTFILE_188 $(basename "$f") age=$((now - mt))"
@@ -405,17 +427,37 @@ done
 if [ -f /home/ollama/node_exporter_textfiles/backup.prom ]; then
   awk -v now="$now" "/^backup_110_last_success_timestamp / {printf \"BACKUP_110_AGE %d\\n\", now - int(\$2)}" /home/ollama/node_exporter_textfiles/backup.prom
 fi
-echo "SCHEDULER_STATE $(docker inspect -f "{{.State.Status}} {{if .State.Health}}{{.State.Health.Status}}{{end}}" momo-scheduler 2>/dev/null || true)"
-echo "SCHEDULER_REGISTERED $(docker logs --since 6h momo-scheduler 2>&1 | grep -c "全部排程任務已註冊" || true)"
+if [ -f /home/ollama/node_exporter_textfiles/backup_health.prom ]; then
+  awk "/^awoooi_backup_job_fresh/ {total++; if (int(\$2) == 0) stale++} /^awoooi_backup_job_configured/ {if (int(\$2) == 0) missing_cron++} /^awoooi_backup_script_present/ {if (int(\$2) == 0) missing_script++} END {printf \"BACKUP_HEALTH_188 total=%d stale=%d missing_cron=%d missing_script=%d\\n\", total+0, stale+0, missing_cron+0, missing_script+0}" /home/ollama/node_exporter_textfiles/backup_health.prom
+fi
+if [ -f /home/ollama/node_exporter_textfiles/storage_health.prom ]; then
+  awk "/^awoooi_host_storage_root_readonly/ {readonly=int(\$2)} /^awoooi_host_storage_current_boot_error_count/ {current=int(\$2)} END {printf \"STORAGE_HEALTH_188 root_readonly=%d current=%d\\n\", readonly+0, current+0}" /home/ollama/node_exporter_textfiles/storage_health.prom
+fi
+echo "SCHEDULER_CONTAINER_RUNNING $(docker inspect -f "{{.State.Running}}" momo-scheduler 2>/dev/null || true)"
+echo "SCHEDULER_CONTAINER_HEALTH $(docker inspect -f "{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}" momo-scheduler 2>/dev/null || true)"
+echo "SCHEDULER_REGISTERED $(docker logs --tail 200 momo-scheduler 2>&1 | grep -c "全部排程任務已註冊" || true)"
+echo "SCHEDULER_RECENT_ACTIVITY $(docker logs --since 2h momo-scheduler 2>&1 | grep -Ec "AutoImport|Meta-Analysis|Scheduler" || true)"
+momo_sync=$(docker exec momo-db sh -c "psql -U \"\$POSTGRES_USER\" -d \"\$POSTGRES_DB\" -Atc \"WITH scope AS (SELECT min(snapshot_date::date) dmin, max(snapshot_date::date) dmax, count(*) sc FROM daily_sales_snapshot WHERE snapshot_date::date >= make_date(extract(year from current_date)::int, extract(month from current_date)::int, 1)), monthly AS (SELECT count(*) mc, min(\\\"日期\\\"::date) mmin, max(\\\"日期\\\"::date) mmax FROM realtime_sales_monthly, scope WHERE scope.sc > 0 AND \\\"日期\\\"::date BETWEEN scope.dmin AND scope.dmax) SELECT coalesce(scope.sc,0)::text || chr(124) || coalesce(monthly.mc,0)::text || chr(124) || coalesce(scope.dmin::text,chr(45)) || chr(124) || coalesce(scope.dmax::text,chr(45)) || chr(124) || coalesce(monthly.mmin::text,chr(45)) || chr(124) || coalesce(monthly.mmax::text,chr(45)) FROM scope, monthly;\"" 2>/dev/null || true)
+echo "MOMO_MONTHLY_SYNC ${momo_sync:-unavailable}"
 ' 2>&1); then
     echo "$out"
     grep -q "CRON_188 active" <<<"$out" && ok "188 cron active" || warn "188 cron not confirmed"
     awk '/TEXTFILE_188 backup.prom age=/ {split($3,a,"="); exit !(a[2] < 90000)}' <<<"$out" && ok "188 backup textfile fresh enough" || warn "188 backup textfile stale or missing"
+    awk '/TEXTFILE_188 backup_health.prom age=/ {split($3,a,"="); exit !(a[2] < 900)}' <<<"$out" && ok "188 backup health exporter fresh" || warn "188 backup health exporter stale"
     awk '/TEXTFILE_188 docker_restart_count.prom age=/ {split($3,a,"="); exit !(a[2] < 300)}' <<<"$out" && ok "188 docker restart exporter fresh" || warn "188 docker restart exporter stale"
     awk '/TEXTFILE_188 docker_stats.prom age=/ {split($3,a,"="); exit !(a[2] < 300)}' <<<"$out" && ok "188 docker stats exporter fresh" || warn "188 docker stats exporter stale"
+    awk '/TEXTFILE_188 storage_health.prom age=/ {split($3,a,"="); exit !(a[2] < 300)}' <<<"$out" && ok "188 storage health exporter fresh" || warn "188 storage health exporter stale"
+    grep -q "STORAGE_HEALTH_188 root_readonly=0 current=0" <<<"$out" && ok "188 current boot storage health clean" || warn "188 storage health not clean"
     awk '/BACKUP_110_AGE / {exit !($2 < 90000)}' <<<"$out" && ok "188 backup-from-110 success within 25h" || warn "188 backup-from-110 success not confirmed"
-    grep -q "SCHEDULER_STATE running healthy" <<<"$out" && ok "188 momo scheduler container healthy" || warn "188 momo scheduler health not confirmed"
-    awk '/SCHEDULER_REGISTERED / {exit !($2 > 0)}' <<<"$out" && ok "188 momo scheduler registered jobs within 6h" || warn "188 momo scheduler registration not confirmed within 6h"
+    grep -q "BACKUP_HEALTH_188 total=" <<<"$out" && awk '/BACKUP_HEALTH_188/ {split($3,a,"="); split($4,b,"="); split($5,c,"="); exit !((a[2]+b[2]+c[2]) == 0)}' <<<"$out" && ok "188 backup health has no stale expected jobs" || warn "188 backup health has stale expected jobs"
+    if grep -q "SCHEDULER_CONTAINER_HEALTH healthy" <<<"$out" && awk '/SCHEDULER_RECENT_ACTIVITY / {exit !($2 > 0)}' <<<"$out"; then
+      ok "188 momo scheduler healthy with recent task activity"
+    elif awk '/SCHEDULER_REGISTERED / {exit !($2 > 0)}' <<<"$out"; then
+      ok "188 momo scheduler registered jobs"
+    else
+      warn "188 momo scheduler registration/activity not confirmed"
+    fi
+    awk '/MOMO_MONTHLY_SYNC / {split($2,a,"|"); exit !(a[1] > 0 && a[1] == a[2] && a[3] == a[5] && a[4] == a[6])}' <<<"$out" && ok "188 momo current-month snapshot and realtime tables match" || warn "188 momo current-month snapshot/realtime sync not confirmed"
   else
     warn "188 schedule check unavailable"
     echo "$out"
@@ -427,7 +469,7 @@ echo "CRON_110 $(systemctl is-active cron 2>/dev/null || systemctl is-active cro
 echo "FAILED_UNITS_110 $(systemctl --failed --no-legend --plain 2>/dev/null | wc -l)"
 echo "MOMO_STARTUP_ENABLED $(systemctl is-enabled momo-startup-complete.service 2>/dev/null || true)"
 echo "STAGGERED_STARTUP_ENABLED $(systemctl is-enabled wooo-staggered-startup.service 2>/dev/null || true)"
-for f in /home/wooo/node_exporter_textfiles/docker_stats.prom /home/wooo/node_exporter_textfiles/systemd_units.prom; do
+for f in /home/wooo/node_exporter_textfiles/docker_stats.prom /home/wooo/node_exporter_textfiles/systemd_units.prom /home/wooo/node_exporter_textfiles/storage_health.prom /home/wooo/node_exporter_textfiles/backup_health.prom; do
   if [ -f "$f" ]; then
     mt=$(stat -c %Y "$f")
     echo "TEXTFILE_110 $(basename "$f") age=$((now - mt))"
@@ -435,6 +477,12 @@ for f in /home/wooo/node_exporter_textfiles/docker_stats.prom /home/wooo/node_ex
     echo "TEXTFILE_110 $(basename "$f") missing"
   fi
 done
+if [ -f /home/wooo/node_exporter_textfiles/storage_health.prom ]; then
+  awk "/^awoooi_host_storage_root_readonly/ {readonly=int(\$2)} /^awoooi_host_storage_current_boot_error_count/ {current=int(\$2)} END {printf \"STORAGE_HEALTH_110 root_readonly=%d current=%d\\n\", readonly+0, current+0}" /home/wooo/node_exporter_textfiles/storage_health.prom
+fi
+if [ -f /home/wooo/node_exporter_textfiles/backup_health.prom ]; then
+  awk "/^awoooi_backup_job_fresh/ {total++; if (int(\$2) == 0) stale++} /^awoooi_backup_job_configured/ {if (int(\$2) == 0) missing_cron++} /^awoooi_backup_script_present/ {if (int(\$2) == 0) missing_script++} /^awoooi_backup_last_run_failed_count/ {if (\$0 ~ /(exported_job|job)=\"backup_all\"/) failed=int(\$2)} /^awoooi_backup_config_capture_critical_failed_count/ {config_failed=int(\$2)} /^awoooi_backup_integrity_fresh/ {integrity_total++; if (int(\$2) == 0) integrity_stale++} END {printf \"BACKUP_HEALTH_110 total=%d stale=%d missing_cron=%d missing_script=%d failed_count=%d config_failed=%d integrity_total=%d integrity_stale=%d\\n\", total+0, stale+0, missing_cron+0, missing_script+0, failed+0, config_failed+0, integrity_total+0, integrity_stale+0}" /home/wooo/node_exporter_textfiles/backup_health.prom
+fi
 ' 2>&1); then
     echo "$out"
     grep -q "CRON_110 active" <<<"$out" && ok "110 cron active" || warn "110 cron not confirmed"
@@ -443,6 +491,11 @@ done
     grep -q "STAGGERED_STARTUP_ENABLED disabled" <<<"$out" && ok "110 stale staggered startup unit disabled" || warn "110 stale staggered startup unit not disabled"
     awk '/TEXTFILE_110 docker_stats.prom age=/ {split($3,a,"="); exit !(a[2] < 300)}' <<<"$out" && ok "110 docker stats exporter fresh" || warn "110 docker stats exporter stale"
     awk '/TEXTFILE_110 systemd_units.prom age=/ {split($3,a,"="); exit !(a[2] < 300)}' <<<"$out" && ok "110 systemd units exporter fresh" || warn "110 systemd units exporter stale"
+    awk '/TEXTFILE_110 storage_health.prom age=/ {split($3,a,"="); exit !(a[2] < 300)}' <<<"$out" && ok "110 storage health exporter fresh" || warn "110 storage health exporter stale"
+    awk '/TEXTFILE_110 backup_health.prom age=/ {split($3,a,"="); exit !(a[2] < 900)}' <<<"$out" && ok "110 backup health exporter fresh" || warn "110 backup health exporter stale"
+    grep -q "STORAGE_HEALTH_110 root_readonly=0 current=0" <<<"$out" && ok "110 current boot storage health clean" || warn "110 storage health not clean"
+    grep -q "BACKUP_HEALTH_110 total=" <<<"$out" && awk '/BACKUP_HEALTH_110/ {split($3,a,"="); split($4,b,"="); split($5,c,"="); split($6,d,"="); split($7,e,"="); exit !((a[2]+b[2]+c[2]) == 0 && d[2] == 0 && e[2] == 0)}' <<<"$out" && ok "110 backup health has no stale expected jobs" || warn "110 latest aggregate/config backup had failed components; rerun backup-all after 120 recovers"
+    awk '/BACKUP_HEALTH_110/ {split($9,a,"="); exit !(a[2] == 0)}' <<<"$out" && ok "110 backup integrity and restore drill fresh" || warn "110 backup integrity or restore drill stale"
   else
     warn "110 schedule check unavailable"
     echo "$out"
@@ -494,54 +547,41 @@ summary() {
   echo "PASS=$PASS WARN=$WARN BLOCKED=$FAIL"
   if [ "$FAIL" -gt 0 ]; then
     echo "Result: BLOCKED. Fix the first blocked gate before releasing runner/CD/AI auto-remediation."
-    return 2
+    exit 2
   fi
   if [ "$WARN" -gt 0 ]; then
     echo "Result: DEGRADED. Core gates passed but warnings remain."
-    return 1
+    exit 1
   fi
   echo "Result: GREEN. Full stack is ready for controlled runner/CD release."
-  return 0
-}
-
-run_once() {
-  reset_counters
-  print_header
-  check_network
-  check_188
-  check_110
-  check_k3s
-  check_workload_and_alertchain
-  check_public_routes
-  check_schedules
-  summary
 }
 
 if [ "$WATCH_MODE" -eq 1 ]; then
   attempt=1
-  while :; do
-    if [ "$WATCH_MAX_ATTEMPTS" -eq 0 ]; then
-      printf "\nWatch attempt %s/unlimited\n" "$attempt"
-    else
-      printf "\nWatch attempt %s/%s\n" "$attempt" "$WATCH_MAX_ATTEMPTS"
-    fi
-
-    run_once
+  rc=2
+  while true; do
+    echo "WATCH_ATTEMPT=$attempt"
+    args=()
+    [ "$MONITOR_READ_ONLY" -eq 1 ] && args+=(--monitor-read-only)
+    [ "$NO_COLOR_FLAG" -eq 1 ] && args+=(--no-color)
+    [ "$SEND_ALERT_TEST" -eq 1 ] && args+=(--send-alert-test)
+    bash "$0" "${args[@]}"
     rc=$?
-    if [ "$rc" -eq 0 ]; then
-      exit 0
-    fi
-
-    if [ "$WATCH_MAX_ATTEMPTS" -ne 0 ] && [ "$attempt" -ge "$WATCH_MAX_ATTEMPTS" ]; then
-      echo "Watch stopped before GREEN. Last result code: $rc"
+    [ "$rc" -eq 0 ] && exit 0
+    if [ "$WATCH_MAX_ATTEMPTS" -gt 0 ] && [ "$attempt" -ge "$WATCH_MAX_ATTEMPTS" ]; then
       exit "$rc"
     fi
-
-    echo "Waiting ${WATCH_INTERVAL}s before the next cold-start gate check..."
-    sleep "$WATCH_INTERVAL"
     attempt=$((attempt + 1))
+    sleep "$WATCH_INTERVAL"
   done
 fi
 
-run_once
-exit $?
+print_header
+check_network
+check_188
+check_110
+check_k3s
+check_workload_and_alertchain
+check_public_routes
+check_schedules
+summary