fix(ops): harden cold-start schedule recovery

2026-05-05 22:14:54 +08:00
parent 10cd9fc025
commit 894174da5b
13 changed files with 1073 additions and 43 deletions
--- a/.gitea/workflows/cd-dev.yaml
+++ b/.gitea/workflows/cd-dev.yaml
@@ -108,7 +108,9 @@ jobs:
          mkdir -p ~/.ssh
          echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
          chmod 600 ~/.ssh/deploy_key
-          ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 << SECRETS
+          # 2026-05-05 Codex: kubectl runs on 120 control-plane. 121 is a
+          # worker and its local kubeconfig points at 127.0.0.1:6443.
+          ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 << SECRETS
          set -e
          export KUBECONFIG=/etc/rancher/k3s/k3s.yaml

@@ -138,10 +140,10 @@ jobs:
          SSH_PRIVATE_KEY: ${{ secrets.DEPLOY_SSH_KEY }}
        run: |
          cat k8s/awoooi-dev/02-configmap.yaml | \
-            ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 \
+            ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 \
            "export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -"

-          ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 << 'DEPLOY'
+          ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 << 'DEPLOY'
          set -e
          export KUBECONFIG=/etc/rancher/k3s/k3s.yaml

--- a/.gitea/workflows/cd.yaml
+++ b/.gitea/workflows/cd.yaml
@@ -406,8 +406,11 @@ jobs:
          mkdir -p ~/.ssh
          echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
          chmod 600 ~/.ssh/deploy_key
-          ssh-keyscan 192.168.0.121 >> ~/.ssh/known_hosts 2>/dev/null
-          ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 << SECRETS
+          # 2026-05-05 Codex: kubectl must run on the 120 control-plane.
+          # 121 is a worker after cold-start recovery; its kubeconfig points at
+          # 127.0.0.1:6443 and fails ADR-035 secret patching.
+          ssh-keyscan 192.168.0.120 >> ~/.ssh/known_hosts 2>/dev/null
+          ssh -i ~/.ssh/deploy_key wooo@192.168.0.120 << SECRETS
          set -e
          export KUBECONFIG=/etc/rancher/k3s/k3s.yaml

@@ -634,19 +637,21 @@ jobs:
          mkdir -p ~/.ssh
          echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
          chmod 600 ~/.ssh/deploy_key
-          ssh-keyscan 192.168.0.121 >> ~/.ssh/known_hosts 2>/dev/null
+          # 2026-05-05 Codex: deploy-side kubectl/ArgoCD operations run on 120
+          # control-plane, not 121 worker.
+          ssh-keyscan 192.168.0.120 >> ~/.ssh/known_hosts 2>/dev/null

          IMAGE_TAG="${{ github.sha }}"
          HARBOR=192.168.0.110:5000

          # ─── Step 1: Apply ConfigMap + ServiceRegistry (ArgoCD 管的是 Deployment，ConfigMap 仍直接 apply) ───
          cat k8s/awoooi-prod/04-configmap.yaml | \
-            ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 \
+            ssh -i ~/.ssh/deploy_key wooo@192.168.0.120 \
            "export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -"
          echo "✅ ConfigMap 已更新"

          cat k8s/awoooi-prod/15-service-registry-configmap.yaml | \
-            ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 \
+            ssh -i ~/.ssh/deploy_key wooo@192.168.0.120 \
            "export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -"
          echo "✅ Service Registry ConfigMap 已更新"

@@ -688,7 +693,7 @@ jobs:
          }

          # ─── Step 4: 等待 ArgoCD sync + rollout ───
-          ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 \
+          ssh -i ~/.ssh/deploy_key wooo@192.168.0.120 \
            "EXPECTED_REVISION='${DEPLOY_REVISION}' bash -s" << 'ARGOCD_WAIT'
          set -e
          export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
@@ -814,7 +819,7 @@ jobs:
      - name: Alert Chain Smoke Test
        id: alert_chain_smoke
        run: |
-          # 2026-04-05 Claude Code: 使用真實 API 地址（192.168.0.121:32334 NodePort）
+          # 2026-05-05 Codex: use the keepalived VIP instead of a fixed node.
          # Host runner launches the CI image explicitly to avoid act RWLayer=nil.
          if docker run --rm \
            --name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-alert-smoke" \
@@ -824,7 +829,7 @@ jobs:
            -v awoooi-api-venv-cache:/opt/api-venv \
            -w /workspace \
            "${{ env.CI_IMAGE }}" \
-            bash -lc 'source /opt/api-venv/bin/activate && python3 scripts/alert_chain_smoke_test.py --api-url http://192.168.0.121:32334 --json | tee /tmp/alert_chain_result.json'; then
+            bash -lc 'source /opt/api-venv/bin/activate && python3 scripts/alert_chain_smoke_test.py --api-url http://192.168.0.125:32334 --json | tee /tmp/alert_chain_result.json'; then
            echo "alert_chain_status=pass" >> $GITHUB_OUTPUT
          else
            echo "alert_chain_status=fail" >> $GITHUB_OUTPUT
--- a/apps/api/migrations/embedding_bge_m3_1024.sql
+++ b/apps/api/migrations/embedding_bge_m3_1024.sql
@@ -6,8 +6,9 @@
 --   bge-m3 產生 1024 維向量，現有 schema vector(768) 不相容，INSERT 會直接失敗
 --
 -- 影響範圍：
--   1. rag_chunks.embedding          vector(768) → vector(1024)
--   2. playbook_embeddings.embedding vector(768) → vector(1024)
+--   1. knowledge_entries.embedding   vector(768) → vector(1024)
+--   2. rag_chunks.embedding          vector(768) → vector(1024)
+--   3. playbook_embeddings.embedding vector(768) → vector(1024)
 --
 -- 遷移策略：清空現有向量資料，切換維度後由 re-embed script 重新嵌入
 -- 現有向量資料若要保留，需先 dump 用 nomic 格式備份（舊維度無法轉換）
@@ -21,7 +22,24 @@

 BEGIN;

-- 1. rag_chunks：清空向量資料，變更欄位維度
+-- 1. knowledge_entries：備份舊向量並清空，變更欄位維度
+CREATE TABLE IF NOT EXISTS knowledge_entries_embedding_backup_20260505 AS
+SELECT
+    id,
+    embedding::text AS embedding_768,
+    NOW() AS backed_up_at
+FROM knowledge_entries
+WHERE embedding IS NOT NULL;
+
+ALTER TABLE knowledge_entries
+    ALTER COLUMN embedding TYPE vector(1024)
+    USING NULL;  -- 清空現有 768 維向量（維度不可轉換）
+
+COMMENT ON COLUMN knowledge_entries.embedding IS
+    'bge-m3:latest 1024 維向量 — 遷移自 nomic-embed-text 768 維 (2026-05-05 ADR-110 follow-up)';
+
+
+-- 2. rag_chunks：清空向量資料，變更欄位維度
 --    ivfflat index 必須先 DROP 才能 ALTER COLUMN
 DROP INDEX IF EXISTS idx_rag_chunks_embedding;

@@ -39,7 +57,7 @@ COMMENT ON COLUMN rag_chunks.embedding IS
    'bge-m3:latest 1024 維向量 — 遷移自 nomic-embed-text 768 維 (2026-05-04 ADR-110)';


-- 2. playbook_embeddings：清空向量資料，變更欄位維度
+-- 3. playbook_embeddings：清空向量資料，變更欄位維度
 DROP INDEX IF EXISTS ix_playbook_embeddings_vec;

 ALTER TABLE playbook_embeddings
@@ -61,9 +79,15 @@ COMMENT ON TABLE playbook_embeddings IS
-- 3. 驗證遷移結果
+-- 4. 驗證遷移結果
 DO $$
 DECLARE
+    v_km_dim integer;
    v_rag_dim integer;
    v_pb_dim integer;
 BEGIN
+    SELECT atttypmod INTO v_km_dim
+    FROM pg_attribute
+    JOIN pg_class ON attrelid = pg_class.oid
+    WHERE relname = 'knowledge_entries' AND attname = 'embedding';
+
    SELECT atttypmod INTO v_rag_dim
    FROM pg_attribute
    JOIN pg_class ON attrelid = pg_class.oid
@@ -74,15 +98,18 @@ BEGIN
    JOIN pg_class ON attrelid = pg_class.oid
    WHERE relname = 'playbook_embeddings' AND attname = 'embedding';

-    -- atttypmod for vector(1024) = 1024 + 1 = 1025
-    IF v_rag_dim != 1025 THEN
-        RAISE EXCEPTION 'rag_chunks.embedding 維度驗證失敗：expected 1025, got %', v_rag_dim;
+    -- pgvector atttypmod stores the configured dimension.
+    IF v_km_dim != 1024 THEN
+        RAISE EXCEPTION 'knowledge_entries.embedding 維度驗證失敗：expected 1024, got %', v_km_dim;
    END IF;
-    IF v_pb_dim != 1025 THEN
-        RAISE EXCEPTION 'playbook_embeddings.embedding 維度驗證失敗：expected 1025, got %', v_pb_dim;
+    IF v_rag_dim != 1024 THEN
+        RAISE EXCEPTION 'rag_chunks.embedding 維度驗證失敗：expected 1024, got %', v_rag_dim;
+    END IF;
+    IF v_pb_dim != 1024 THEN
+        RAISE EXCEPTION 'playbook_embeddings.embedding 維度驗證失敗：expected 1024, got %', v_pb_dim;
    END IF;

-    RAISE NOTICE '✅ embedding 遷移驗證通過：rag_chunks 和 playbook_embeddings 均為 vector(1024)';
+    RAISE NOTICE '✅ embedding 遷移驗證通過：knowledge_entries、rag_chunks、playbook_embeddings 均為 vector(1024)';
 END $$;

 COMMIT;
--- a/apps/api/src/services/k3s_monitor_service.py
+++ b/apps/api/src/services/k3s_monitor_service.py
@@ -34,8 +34,12 @@ logger = structlog.get_logger(__name__)
 # 台北時區
 TZ_TAIPEI = ZoneInfo("Asia/Taipei")

-# Prometheus 端點
-PROMETHEUS_URL = "http://192.168.0.121:30090"
+# Prometheus endpoint.
+#
+# 2026-05-05 Codex: do not pin this report job to a K3s worker NodePort.
+# Production already injects PROMETHEUS_URL from ConfigMap, currently the
+# Docker Prometheus on 110. This keeps reboot recovery independent of 121.
+PROMETHEUS_URL = settings.PROMETHEUS_URL.rstrip("/")

 # kube-state-metrics 查詢
 PROM_QUERIES = {
@@ -215,7 +219,7 @@ class K3sMonitorService:

            # 發送訊息
            formatted = status.format()
-            result = await gateway.send_message(formatted)
+            result = await gateway.send_text(formatted)

            if result:
                logger.info("k3s_daily_report_sent", date=status.report_date)
--- a/apps/api/src/services/weekly_report_service.py
+++ b/apps/api/src/services/weekly_report_service.py
@@ -244,7 +244,7 @@ class WeeklyReportService:

            # 發送訊息
            formatted = report.format()
-            result = await gateway.send_message(formatted)
+            result = await gateway.send_text(formatted)

            if result:
                logger.info("weekly_report_sent", week=report.week_range)
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -6,6 +6,38 @@

 ---

+## 2026-05-05 | 重開機後排程與 startup baseline 修復
+
+**背景**：四台主機非預期重開機後，統帥要求確認所有服務、網站、工具、資料庫與排程都能正常恢復，不能只看容器 `healthy`。
+
+**本次排程/啟動鏈修補**：
+- 120/121 K3s 回到 Ready；CD workflow 目標從 121 改為 120，避免 121 worker kubeconfig `127.0.0.1:6443` 造成 Secrets patch 失敗；120 已驗證 limited sudo kubectl 可用。
+- K8s CronJob 修正：`k3s-status-report`、`weekly-report`、`km-vectorize` 改用存在的 service account、live API image、cluster service DNS；手動 job 驗證 drift/k3s/weekly 可完成，歷史 failed jobs 已清掉。
+- KM embedding schema 從 768/錯誤 typmod 修為 `vector(1024)`；原 embedding 已備份到 `knowledge_entries_embedding_backup_20260505`，正在以 `bge-m3:latest` 重建。
+- 188 momo backup script 修正 quote/validation/Telegram optional/error cleanup；成功產出 `/home/ollama/momo_backups/momo_analytics_20260505_212032.sql.gz`。
+- 188 `backup-from-110.sh` 因 SSH config 權限錯誤導致 `HostBackupFailed`；修正 `.ssh/config` 權限與 110 identity 設定後，以低優先權手動備份成功，Prometheus `backup_110_last_success_timestamp` 已更新。
+- 188 momo-scheduler 修正 dashboard URL：容器內改打 `http://momo-pro-system`，不再打 `127.0.0.1:5000`。
+- 188 Google Drive token 從 legacy pickle 轉為 JSON，scheduler 容器內 `GoogleDriveService().authenticate()` 通過。
+- 188 daily sales import 修正 Excel sheet 選擇，優先讀 `即時業績明細`；手動匯入成功 `19934` 筆，日期 `2026-04-01 ~ 2026-05-03`。
+- 188 import 尾端驗證修正：改比對本次匯入日期範圍，不再用全表筆數硬比；`daily_sales_snapshot` 與 `realtime_sales_monthly` 在該日期範圍皆 `19934` 筆且驗證通過。
+- 110 startup 修復：移除 `/etc/sysctl.conf` 中誤寫的非法敏感純文字行；`systemd-sysctl` 恢復成功。
+- 110 停用兩個過期 startup units：`momo-startup-complete.service`（指向不存在路徑/錯 host）與 `wooo-staggered-startup.service`（舊 GitLab 延遲啟動且會增加重開機負載）。
+- 110 `awoooi-startup-110.service` timeout 從 5 分鐘延長到 15 分鐘，重跑後 `ActiveState=active`、`SubState=exited`、`Result=success`，`systemctl --failed` 為 0。
+- 110 certbot timer 失敗追查：`grist.wooo.work` / `registry.wooo.work` public route 目前被導向 `aiops.wooo.work`，HTTP-01 無法從 110 成功；已將兩個 stale renewal config 移至 `/etc/letsencrypt/renewal-disabled-codex-*`，並 reset certbot failed state。憑證 archive 未刪除；後續需修 public route 或改 DNS-01。
+- `scripts/reboot-recovery/full-stack-cold-start-check.sh` 新增 `P2-SCHEDULES`，覆蓋 188/110/120/121 cron、textfile mtime、188 backup freshness、110 failed units、K8s CronJob/Job/Pod 狀態、121 DR drill cron。
+- `docs/runbooks/FULL-STACK-COLD-START-SOP.md` 新增排程驗證章節與 done criteria，要求排程真正可執行才算 reboot recovery 完成。
+
+**最終驗證**：
+- KM reembed 完成：`1774/1774` success、`0` failed；DB 目前 `knowledge_entries` total `1785`、embedded `1776`、vector dims `1024..1024`，舊 embedding backup `1691` rows。
+- 手動 `km-vectorize` CronJob `km-vectorize-codex-220715` 完成，回 `embed-all: 200 {"total":0,"success":0,"failed":0}`。
+- `bash scripts/reboot-recovery/full-stack-cold-start-check.sh --send-alert-test` → `PASS=50 WARN=0 BLOCKED=0`，包含 Alertmanager webhook E2E、public routes、cron/CronJob/textfile/systemd schedule checks。
+- Prometheus firing alerts 已從 `HostBackupFailed + FlywheelExecutionRateMissing` 收斂為僅剩 `FlywheelExecutionRateMissing`；HostBackupFailed 解除。
+- 188/110 負載回到低檔；K3s node CPU 約 3-6%，KM reembed 未造成主機過載。
+
+**下一步**：
+- 將本次 runtime hotfix 對應的 repo changes 走正式 deploy，避免下一版 image 覆蓋 hotfix。
+- 修 `grist.wooo.work` / `registry.wooo.work` public route 或改 DNS-01 renewal；目前舊 renewal config 已停用以避免 certbot timer 每次失敗。
+
 ## 2026-05-05 | 110 Sentry resource limits persistence gap closed

 **背景**：110 guardrail 告警已清，但主機 load 仍有長尾；統帥擔心 Claude Code 只做 live `docker update`，重建後配置又失效。
@@ -3066,3 +3098,42 @@ C1（evolver 加 YAML_RULE guard）+ C2（seeder SQL `AND status != 'deprecated'
 ```bash
 psql $DATABASE_URL -f apps/api/migrations/cleanup_duplicate_deprecated_playbooks.sql
 ```
+
+---
+
+## 2026-05-05（台北）— 四主機重開機後全站冷啟動救援
+
+**觸發**：110 / 120 / 121 / 188 同時重開機後，多數服務異常；統帥要求先恢復所有網站、主機、核心服務，並建立完整冷啟動 SOP。
+
+### 已恢復
+
+| 範圍 | 結果 |
+|------|------|
+| 188 host PostgreSQL | WAL checkpoint 損壞；已備份後 `pg_resetwal`，`k3s_datastore` `REINDEX` + `VACUUM ANALYZE` 完成 |
+| K3s datastore | 刪除並備份可重建的腐壞 HPA / VPA / VPA checkpoint / `mon1` node rows；120 / 121 重新 Ready |
+| AWOOI prod | `awoooi-api` / `awoooi-web` / `awoooi-worker` Running；VIP `192.168.0.125` 內網驗證 API 200 / Web 307 |
+| mo.wooo.work | `momo-db` WAL redo 損壞；備份後 `pg_resetwal`，`momo-pro-system` / scheduler / bot / DB 全部 healthy；公網 `/` 200、`/health` 200 |
+| 110 host overload | actions runner units 維持最後放行；Sentry ClickHouse/Kafka 已從 dirty-reboot 損壞中恢復，Sentry stack healthy |
+| 188 SignOz | SignOz ClickHouse volume 出現 filesystem corruption；已 clean-clone 可讀資料並保留原始 corrupt volume，SignOz HTTP 恢復 |
+| 冷啟動 SOP | 新增 `docs/runbooks/FULL-STACK-COLD-START-SOP.md` 與 `scripts/reboot-recovery/full-stack-cold-start-check.sh` |
+
+### 驗證
+
+```bash
+bash scripts/reboot-recovery/full-stack-cold-start-check.sh --send-alert-test
+# PASS=31 WARN=0 BLOCKED=0
+# Result: GREEN. Full stack is ready for controlled runner/CD release.
+```
+
+### Dirty reboot 資料保全
+
+- 110 Sentry ClickHouse：原始壞 volume 保留為 `/var/lib/docker/volumes/sentry-clickhouse/_data.corrupt-20260505-203346`；以 clean-clone 恢復可讀資料並加 `force_restore_data`。
+- 110 Sentry Kafka：malformed checkpoint 已備份至 `/var/backups/sentry-kafka-checkpoints-20260505-203942`，只重建 checkpoint，不刪 topic/log data。
+- 188 SignOz ClickHouse：原始壞 volume 保留為 `/var/lib/docker/volumes/signoz-clickhouse/_data.corrupt-20260505-203735`；以 clean-clone 恢復可讀資料。
+- 188 `momo-db`：WAL reset 前備份 `/var/backups/postgresql/momo-db-before-pg-resetwal-20260505-200834.tgz`。
+
+### 已知隔離 / 後續
+
+- 110 actions runner units 仍按策略最後放行：guardrail 已套用，`CPUQuota=200%`、`MemoryMax=2G`、`WatchdogUSec=0`；需在 load/core 穩定後逐步開啟。
+- `Bad message` / `Structure needs cleaning` 是 host filesystem 層訊號；線上 clean-clone 已恢復服務，但完整歷史資料追溯需安排離線 `fsck` 或備份驗證。
+- `drift-scanner-29633040-qrf8w` 為單次 CronJob Error，不阻斷主服務；後續可清理或調查。
--- a/docs/runbooks/FULL-STACK-COLD-START-SOP.md
+++ b/docs/runbooks/FULL-STACK-COLD-START-SOP.md
@@ -0,0 +1,497 @@
+# AWOOOI Full-Stack Cold Start SOP
+
+> Version: v1.0
+> Last updated: 2026-05-05 Asia/Taipei
+> Scope: 110 / 120 / 121 / 188 full-stack reboot recovery. 112 Kali is recorded as P3 optional and is not part of this recovery path.
+
+---
+
+## 0. When To Use This
+
+Use this SOP when any of these happen:
+
+- 110/120/121/188 reboot unexpectedly.
+- All services are abnormal after a power/network event.
+- K3s is stuck `activating`.
+- Host load remains high during startup and service health is mixed.
+- Monitoring, alerting, CD, AI auto-repair, and Docker Compose services disagree about the real state.
+
+The rule is simple: **recover the dependency chain, not the loudest symptom.**
+
+---
+
+## 1. Golden Startup Order
+
+```text
+0. Freeze automation and preserve evidence
+1. Physical/network layer
+2. 188 data layer
+3. 110 registry/observability layer
+4. 120/121 K3s layer
+5. AWOOOI workload layer
+6. Public routes and alert chain
+7. High-load batch/consumer/crawler services
+8. Runner/CD
+9. AI auto-remediation
+10. 112 Kali scanner, if needed
+```
+
+Never start runner/CD before 188 PostgreSQL, 110 Harbor, K3s nodes, and AWOOOI API are healthy.
+
+---
+
+## 2. Automation Freeze
+
+Cold start creates noisy metrics and partial failures. During P0/P1, keep automation in observe-only mode.
+
+| Item | Cold-start policy | Reason |
+|------|-------------------|--------|
+| Gitea/GitHub runners | Last | Build jobs can saturate 110 CPU/RAM. |
+| momo-scheduler / crawlers | Last | Chrome and batch work can saturate 188. |
+| Sentry/Snuba consumers | Controlled | Kafka backlog and ClickHouse merge can create temporary high load. |
+| Alertmanager outbound notification | Gate | Avoid alert storms before API webhook and Telegram are verified. |
+| AI auto-repair | Observe-only | Metrics, Redis, KM, and playbooks may be incomplete. |
+| Stateful DB restart | Human approval | PostgreSQL, Redis, ClickHouse, Harbor DB, Sentry DB are not generic restart targets. |
+
+---
+
+## 3. P0 Evidence And Network
+
+Run from any machine on the same LAN (the `nc -G 3` connect timeout below is macOS syntax; on Linux use `nc -w 3` instead):
+
+```bash
+for h in 110 120 121 188; do
+  ping -c 2 -W 2 192.168.0.$h >/dev/null && echo "PING_OK 192.168.0.$h" || echo "PING_FAIL 192.168.0.$h"
+done
+
+arp -an | grep -E '192\.168\.0\.(110|120|121|188)'
+for h in 110 120 121 188; do
+  nc -G 3 -z 192.168.0.$h 22 && echo "SSH_OK 192.168.0.$h" || echo "SSH_FAIL 192.168.0.$h"
+done
+```
+
+Then capture reboot evidence:
+
+```bash
+ssh ollama@192.168.0.188 'hostname; date; uptime; who -b; last -x reboot shutdown | head -20'
+ssh wooo@192.168.0.110 'hostname; date; uptime; who -b; last -x reboot shutdown | head -20'
+ssh wooo@192.168.0.120 'hostname; date; uptime; who -b; last -x reboot shutdown | head -20'
+ssh wooo@192.168.0.121 'hostname; date; uptime; who -b; last -x reboot shutdown | head -20'
+```
+
+If any host has ARP `incomplete` or SSH port down, stop here and fix physical/network first.
+
+---
+
+## 4. P0 188 Data Layer
+
+188 is the first real service dependency because K3s datastore and AWOOOI DB depend on PostgreSQL.
+
+### 4.1 Startup order
+
+1. `containerd`
+2. `docker`
+3. `postgresql@14-main`
+4. `k3s_datastore.kine` maintenance
+5. `redis-server` on `6380`
+6. `ollama` or current AI proxy dependencies
+7. `nginx`
+8. Docker networks
+9. MinIO / OpenClaw / SignOz
+10. momo / litellm / batch services after load is stable
+
+### 4.2 Read-only check
+
+```bash
+ssh ollama@192.168.0.188 '
+hostname; date; uptime; free -h
+systemctl is-active containerd docker postgresql@14-main redis-server ollama nginx || true
+pg_isready -h localhost -p 5432 || true
+redis-cli -p 6380 ping 2>/dev/null || redis-cli ping 2>/dev/null || true
+docker ps --format "{{.Names}}\t{{.Status}}\t{{.Ports}}" | head -120
+'
+```
+
+### 4.3 PostgreSQL WAL checkpoint damage
+
+Signature:
+
+```text
+PANIC: could not locate a valid checkpoint record
+invalid primary checkpoint record
+unexpected pageaddr ... in log segment ...
+```
+
+This blocks:
+
+- `188:5432`
+- K3s startup on 120/121
+- AWOOOI API DB access
+- Alertmanager webhook if API cannot start
+
+Human-approved recovery command on 188:
+
+```bash
+sudo systemctl stop postgresql@14-main
+sudo install -d -m 700 -o postgres -g postgres /var/backups/postgresql
+sudo tar -C /var/lib/postgresql/14 -czf /var/backups/postgresql/14-main-before-pg-resetwal-$(date +%Y%m%d-%H%M%S).tgz main
+sudo -u postgres /usr/lib/postgresql/14/bin/pg_resetwal -f /var/lib/postgresql/14/main
+sudo systemctl start postgresql@14-main
+pg_isready -h localhost -p 5432
+sudo -u postgres psql -d k3s_datastore -c "REINDEX DATABASE k3s_datastore;" -c "VACUUM ANALYZE kine;"
+```
+
+Do not run `DROP`, reinitialize the cluster, delete `/var/lib/postgresql`, or restore an old backup unless the commander explicitly approves it.
+
+---
+
+## 5. P0/P1 110 Registry And Observability
+
+110 must recover Harbor/Gitea/Monitoring early, but runners last.
+
+### 5.1 Startup order
+
+1. `docker`
+2. Remove `Exited (128)` / `Exited (137)` orphan containers
+3. Harbor `harbor-log`
+4. Harbor full stack
+5. Gitea
+6. Prometheus / Alertmanager / Grafana / exporters
+7. Langfuse
+8. SignOz
+9. Sentry DB layer
+10. Sentry web/worker/consumer layer
+11. Gitea host runner and actions runners
+
+### 5.2 Checks
+
+```bash
+ssh wooo@192.168.0.110 '
+hostname; date; uptime; free -h
+systemctl is-active docker || true
+curl -s -o /dev/null -w "harbor=%{http_code}\n" --max-time 5 http://127.0.0.1:5000/v2/ || true
+curl -s -o /dev/null -w "gitea=%{http_code}\n" --max-time 5 http://127.0.0.1:3001/ || true
+curl -s --max-time 5 http://127.0.0.1:9090/-/ready || true
+curl -s --max-time 5 http://127.0.0.1:9093/-/healthy || true
+curl -s -o /dev/null -w "sentry=%{http_code}\n" --max-time 10 http://127.0.0.1:9000/ || true
+docker ps --format "{{.Names}}\t{{.Status}}" | head -120
+'
+```
+
+Harbor healthy means `/v2/` returns `200` or `401`. Do not treat `401` as failure.
+
+### 5.3 Runner gate
+
+Runner may start only after all are true:
+
+- `188 PostgreSQL` ready
+- `110 Harbor` ready
+- `110 Gitea` ready
+- `120/121 K3s` nodes ready
+- AWOOOI API health passes
+- 110 load/core is below `1.0` for at least 15 minutes
+- runner systemd guardrails are active: `CPUQuota=200%`, `MemoryMax=2G`, `WatchdogUSec=0`
+
+Check:
+
+```bash
+ssh wooo@192.168.0.110 '
+for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain | awk "{print \$1}"); do
+  echo "=== $u ==="
+  systemctl show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts
+done
+'
+```
+
+If `WatchdogUSec` is not `0`, apply the guardrail script manually with sudo:
+
+```bash
+sudo /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply
+```
+
+---
+
+## 6. P1 120/121 K3s
+
+K3s must wait for 188 PostgreSQL and 110 Harbor.
+
+### 6.1 Startup order
+
+1. 120 `k3s.service`
+2. 121 `k3s-agent.service` or its live role
+3. CNI / kube-proxy
+4. Nodes Ready
+5. Core pods
+6. `awoooi-prod` pods
+7. keepalived VIP `192.168.0.125`
+8. NodePorts `32334` and `32335`
+
+### 6.2 Checks
+
+```bash
+ssh wooo@192.168.0.120 '
+hostname; uptime
+pg_isready -h 192.168.0.188 -p 5432 || true
+systemctl is-active k3s k3s-agent keepalived 2>/dev/null || true
+sudo kubectl get nodes -o wide 2>/dev/null || true
+sudo kubectl get pods -A 2>/dev/null | grep -v -E "Running|Completed" || true
+sudo kubectl get pods -n awoooi-prod -o wide 2>/dev/null || true
+ip addr show | grep 192.168.0.125 || true
+'
+
+ssh wooo@192.168.0.121 '
+hostname; uptime
+systemctl is-active k3s k3s-agent keepalived 2>/dev/null || true
+ip addr show | grep 192.168.0.125 || true
+'
+```
+
+If K3s is `activating` while 188 PostgreSQL is down, fix PostgreSQL first. Restarting K3s repeatedly will not solve it.
+
+---
+
+## 7. P2 AWOOOI Workloads
+
+Run after K3s nodes are Ready:
+
+```bash
+ssh wooo@192.168.0.120 '
+sudo kubectl get deploy -n awoooi-prod
+sudo kubectl get pods -n awoooi-prod -o wide
+sudo kubectl get svc -n awoooi-prod
+sudo kubectl get events -n awoooi-prod --sort-by=.lastTimestamp | tail -40
+'
+
+curl -s --max-time 8 http://192.168.0.125:32334/api/v1/health
+curl -s -o /dev/null -w "web=%{http_code}\n" --max-time 8 http://192.168.0.125:32335/
+```
+
+If pods are `ImagePullBackOff`, go back to 110 Harbor.
+
+If API health fails because DB/Redis is down, go back to 188.
+
+---
+
+## 8. P2 Alert Chain
+
+Current main path:
+
+```text
+Prometheus/Alertmanager on 110
+  -> http://192.168.0.125:32334/api/v1/webhooks/alertmanager
+  -> AWOOOI API
+  -> TelegramGateway
+  -> Telegram
+```
+
+Alertmanager health alone is not enough. Run E2E:
+
+```bash
+curl -s -X POST http://192.168.0.125:32334/api/v1/webhooks/alertmanager \
+  -H 'Content-Type: application/json' \
+  -d '{"receiver":"cold-start-test","status":"firing","alerts":[{"status":"firing","labels":{"alertname":"ColdStartE2ETest","severity":"info"},"annotations":{"summary":"Cold start E2E test, ignore"},"startsAt":"2026-05-05T11:00:00Z","endsAt":"0001-01-01T00:00:00Z","generatorURL":""}],"groupLabels":{},"commonLabels":{},"commonAnnotations":{},"externalURL":"","version":"4","groupKey":"cold-start-test"}'
+```
+
+Expected: API returns success and Telegram receives the test alert.
+
+---
+
+## 9. P2 Schedules And Delayed Work
+
+Do not mark the reboot complete until scheduled work is proven runnable. A container can be healthy while its cron path is broken.
+
+| Host / Layer | Required check | Success baseline |
+|--------------|----------------|------------------|
+| 188 cron | `systemctl is-active cron` and `crontab -l` | cron active; backup, restart exporter, stats exporter entries present |
+| 188 backup-from-110 | `backup_110_last_success_timestamp` in textfile/Prometheus | last success age `< 25h` |
+| 188 momo-scheduler | `docker logs momo-scheduler` | `全部排程任務已註冊`; Google Drive auth works; dashboard URLs use container-reachable hostnames |
+| 188 momo import | manual `run_auto_import_task()` after parser changes | selected sheet is `即時業績明細`; imported date range has matching rows in `daily_sales_snapshot` and `realtime_sales_monthly` |
+| 110 cron | `systemctl is-active cron` | cron active; Docker/systemd textfile exporters fresh |
+| 110 startup units | `systemctl --failed` | zero failed units; stale `momo-startup-complete` and `wooo-staggered-startup` disabled |
+| 120 K8s CronJobs | `kubectl get cronjobs -n awoooi-prod` | unsuspended; no failed Jobs remain after current validation |
+| 121 DR drill | `crontab -l` | DR drill cron present unless explicitly paused |
+
+Useful checks:
+
+```bash
+ssh ollama@192.168.0.188 'systemctl is-active cron; crontab -l; ls -l /home/ollama/node_exporter_textfiles/*.prom'
+ssh wooo@192.168.0.110 'systemctl --failed --no-pager; systemctl is-active cron; crontab -l'
+ssh wooo@192.168.0.120 'sudo kubectl get cronjobs,jobs -n awoooi-prod'
+ssh wooo@192.168.0.121 'systemctl is-active cron; crontab -l'
+```
+
+If a schedule succeeds but emits a false verification alert, fix the verification rule before releasing AI auto-remediation. False positives train operators to ignore real alarms.
+
+---
+
+## 10. P2/P3 Stateful Service Guardrails
+
+| Tier | Examples | Automation |
+|------|----------|------------|
+| BLOCK | PostgreSQL data dir, ClickHouse data dir, Harbor DB, Sentry DB | No automatic destructive action. Human approval only. |
+| CRITICAL_HITL | Redis, Kafka, MinIO, SignOz ClickHouse, Sentry ClickHouse | Human-in-the-loop restart/repair. |
+| STANDARD_HITL | API/Web/worker, OpenClaw, litellm | Restart only with evidence and blast-radius check. |
+| AUTO | Stateless exporters, blackbox, nginx exporter | Auto restart allowed after verification. |
+
+Never use generic `docker restart $(docker ps -q)` during cold start.
+
+### 10.1 Dirty-Reboot Storage Corruption
+
+Treat these log signatures as storage corruption, not ordinary service flakiness:
+
+- `Bad message`
+- `Structure needs cleaning`
+- `Unknown codec`
+- `PANIC: could not locate a valid checkpoint record`
+- Kafka `Malformed line` in checkpoint files
+- ClickHouse `broken and needs manual correction`
+
+Cold-start automation may stop a restart storm and collect evidence, but it must not delete the original data directory. If a filesystem returns `Bad message` or `Structure needs cleaning`, the real root cause is below the container layer. Online recovery can restore service from readable data, but complete historical recovery requires an offline filesystem check or backup restore.
+
+### 10.2 ClickHouse Clean-Clone Recovery Pattern
+
+Use this pattern for Sentry ClickHouse or SignOz ClickHouse when individual corrupted parts cannot be moved because the host filesystem rejects reads.
+
+```text
+1. Stop the compose stack or at least stop dependent consumers.
+2. Disable restart loops for the failing container.
+3. Save logs and build an exclude list from unreadable store paths.
+4. Preserve the original volume as _data.corrupt-YYYYMMDD-HHMMSS.
+5. Create a clean _data clone with readable files only.
+6. Add flags/force_restore_data.
+7. Start ClickHouse first, then web/API, then consumers.
+8. Verify HTTP, merge backlog, and restart count before releasing high-load services.
+```
+
+Do not replace this with `rm -rf store/...` unless the unreadable path is already backed up or the commander explicitly accepts data loss. The preferred incident artifact is:
+
+```text
+/var/lib/docker/volumes/<volume>/_data.corrupt-YYYYMMDD-HHMMSS
+/var/backups/<service>-<component>-YYYYMMDD-HHMMSS
+```
+
+### 10.3 Kafka Checkpoint Recovery Pattern
+
+If Kafka refuses to start with malformed checkpoint files after a dirty reboot, preserve and move only checkpoint files:
+
+```text
+log-start-offset-checkpoint
+recovery-point-offset-checkpoint
+replication-offset-checkpoint
+```
+
+Then start Kafka and confirm health before starting Snuba/Sentry consumers. Do not delete topic directories or Kafka logs during cold-start recovery.
+
+---
+
+## 11. P3 High-Load Services
+
+Only release these after P0/P1/P2 gates are green:
+
+| Host | Service | Release condition |
+|------|---------|-------------------|
+| 188 | momo-scheduler / crawler | load/core < 1.0 for 15 minutes and DB healthy |
+| 188 | SignOz ClickHouse | healthy and merge backlog trending down |
+| 188 | litellm | `/health/liveliness` good and provider route verified |
+| 110 | Sentry Snuba consumers | ClickHouse healthy and Kafka backlog decreasing |
+| 110 | Sentry uptime-checker | Sentry web/DB healthy |
+| 110 | runners | all previous gates green and load/core < 1.0 for 15 minutes |
+
+---
+
+## 12. Baseline And AI Auto-Remediation Gate
+
+### 12.1 Stable Runtime Baseline
+
+These are release gates after the first cold-start recovery pass:
+
+| Area | Baseline |
+|------|----------|
+| 188 host | PostgreSQL accepting, Redis PONG, momo `/health` 200, SignOz HTTP reachable, load/core < 1.0 sustained before crawlers |
+| 110 host | Harbor `/v2/` 200/401, Gitea 200/302, Prometheus ready, Alertmanager healthy, Sentry HTTP 200/302/400, no ClickHouse/Kafka restart loop |
+| K3s | 120/121 nodes Ready, VIP `192.168.0.125` present, AWOOOI API 2xx/3xx, Web 2xx/3xx |
+| Public routes | `https://awoooi.wooo.work/api/v1/health` 2xx/3xx, `https://mo.wooo.work/health` 2xx/3xx |
+| Guardrails | Docker/systemd textfile exporters fresh, runner `CPUQuota=200%`, `MemoryMax=2G`, `WatchdogUSec=0` |
+| Schedules | cron active on 110/188/120/121; K8s CronJobs unsuspended; no current failed Jobs; 188 backup success `< 25h` |
+| Backlog | ClickHouse merges and Kafka/Snuba lag trending down, not increasing for two consecutive checks |
+
+If service health is green but load average remains high, check live CPU and IO before changing memory limits. High load after Sentry/Snuba or ClickHouse startup can be backlog drain; high CPU from runners/builds/crawlers is a release-order problem.
+
+### 12.2 AI Auto-Remediation Gate
+
+AI auto-repair can move from observe-only to limited execution only after:
+
+- Prometheus rules are loaded.
+- docker/systemd textfile exporter files are fresh.
+- blackbox probes have stable results.
+- cron/CronJob schedule checks are green.
+- AWOOOI API `/api/v1/health` passes.
+- Alertmanager E2E webhook passes.
+- Redis/KM/playbook health is available.
+- No active restart storm.
+- Host load/core remains below `1.0` for 15 minutes.
+
+Until then:
+
+- diagnose only
+- notify only
+- require human approval for remediation
+- no DB/ClickHouse/Harbor/Sentry destructive action
+- no generic restart action against stateful services
+
+---
+
+## 13. One-Command Readiness Script
+
+Run:
+
+```bash
+bash scripts/reboot-recovery/full-stack-cold-start-check.sh
+```
+
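+The script is read-only by default; the optional `--send-alert-test` flag additionally runs the Alertmanager webhook E2E test, which sends a test alert. It reports gates: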
+
+- `P0-NETWORK`
+- `P0-188-DATA`
+- `P0-110-REGISTRY`
+- `P1-K3S`
+- `P2-WORKLOAD`
+- `P2-ALERTCHAIN`
+- `P2-PUBLIC-ROUTES`
+- `P2-SCHEDULES`
+- runner guardrail state inside `P0-110-REGISTRY-OBSERVABILITY`
+
+If it prints `BLOCKED`, fix the first blocked gate before moving forward.
+
+---
+
+## 14. Done Criteria
+
+All must be true:
+
+- Four hosts reachable by SSH.
+- 188 PostgreSQL and Redis healthy.
+- 110 Harbor, Gitea, Prometheus, Alertmanager healthy.
+- 120/121 K3s nodes Ready.
+- VIP `192.168.0.125` present.
+- AWOOOI API and Web reachable through NodePort/VIP.
+- Alertmanager E2E webhook succeeds.
+- cron/CronJob schedules are active, unsuspended, and verified.
+- Sentry and SignOz are either healthy or explicitly in controlled backlog recovery.
+- High-load batch services are capped or delayed.
+- Runners are guarded and released last.
+- AI auto-remediation is not in full execution mode until all gates are green.
+
+---
+
+## 15. Known Drift To Fix After Recovery
+
+These must be cleaned after the incident, not during P0:
+
+- `SERVICE-ENDPOINTS.md` still has old Prometheus/Alertmanager locations.
+- Audit older docs for direct node webhook targets; current main path should be VIP `192.168.0.125:32334`.
+- OpenClaw `8088` vs `8089` must be live-confirmed and normalized.
+- 188 compose paths drift between `/home/ollama/*` and Ansible `/opt/*`.
+- 110 runner docs still mention Docker runner in places; live startup prefers host `gitea-act-runner-host.service`.
+- `scripts/setup-runner-watchdog.sh` conflicts with the 2026-05-05 runner watchdog disablement guardrail.
+- `grist.wooo.work` / `registry.wooo.work` public HTTP/HTTPS currently route to `aiops.wooo.work`; their old 110 certbot renewal configs are disabled until public routing is corrected or DNS-01 renewal is configured.
--- a/k8s/awoooi-prod/13-cronjob-k3s-report.yaml
+++ b/k8s/awoooi-prod/13-cronjob-k3s-report.yaml
@@ -42,8 +42,11 @@ spec:
          restartPolicy: OnFailure
          containers:
            - name: k3s-report
-              image: 192.168.0.110:5000/awoooi-api:latest
-              imagePullPolicy: Always
+              # 2026-05-05 Codex: keep the API image placeholder so CD
+              # injects the same immutable tag used by API/worker. The old
+              # awoooi-api:latest repo returns 400 from Harbor after reboot.
+              image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
+              imagePullPolicy: IfNotPresent
              command:
                - python
                - -m
@@ -63,5 +66,7 @@ spec:
                limits:
                  cpu: "200m"
                  memory: "128Mi"
-          # 使用 API 的 ServiceAccount (需要 RBAC)
-          serviceAccountName: awoooi-api
+          # 2026-05-05 Codex: this report only calls Prometheus and Telegram.
+          # The old awoooi-api ServiceAccount does not exist, which prevented
+          # Job pods from being created after reboot.
+          serviceAccountName: default
--- a/k8s/awoooi-prod/14-cronjob-weekly-report.yaml
+++ b/k8s/awoooi-prod/14-cronjob-weekly-report.yaml
@@ -42,8 +42,11 @@ spec:
          restartPolicy: OnFailure
          containers:
            - name: weekly-report
-              image: 192.168.0.110:5000/awoooi-api:latest
-              imagePullPolicy: Always
+              # 2026-05-05 Codex: keep the API image placeholder so CD
+              # injects the same immutable tag used by API/worker. The old
+              # awoooi-api:latest repo returns 400 from Harbor after reboot.
+              image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
+              imagePullPolicy: IfNotPresent
              command:
                - python
                - -m
@@ -63,5 +66,7 @@ spec:
                limits:
                  cpu: "500m"
                  memory: "256Mi"
-          # 使用 API 的 ServiceAccount (需要 RBAC)
-          serviceAccountName: awoooi-api
+          # 2026-05-05 Codex: this report only calls app services, Prometheus,
+          # Git, and Telegram. The old awoooi-api ServiceAccount does not
+          # exist, which prevented Job pods from being created after reboot.
+          serviceAccountName: default
--- a/k8s/awoooi-prod/15-cronjob-km-vectorize.yaml
+++ b/k8s/awoooi-prod/15-cronjob-km-vectorize.yaml
@@ -27,7 +27,10 @@ spec:
  jobTemplate:
    spec:
      backoffLimit: 2
-      activeDeadlineSeconds: 300
+      # 2026-05-05 Codex: allow post-reboot/post-migration catch-up batches.
+      # The script now fails if the API reports failed rows, so this longer
+      # deadline does not hide partial vectorization.
+      activeDeadlineSeconds: 1800
      template:
        metadata:
          labels:
@@ -37,8 +40,11 @@ spec:
          restartPolicy: OnFailure
          containers:
            - name: km-vectorize
-              image: 192.168.0.110:5000/awoooi-api:latest
-              imagePullPolicy: Always
+              # 2026-05-05 Codex: keep the API image placeholder so CD
+              # injects the same immutable tag used by API/worker. The old
+              # awoooi-api:latest repo returns 400 from Harbor after reboot.
+              image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
+              imagePullPolicy: IfNotPresent
              command:
                - python
                - /app/scripts/cron_km_vectorize.py
@@ -46,7 +52,9 @@ spec:
                - name: TZ
                  value: "Asia/Taipei"
                - name: INTERNAL_API_URL
-                  value: "http://awoooi-api.awoooi-prod.svc.cluster.local:8000"
+                  # 2026-05-05 Codex: use the actual Service name; the old
+                  # awoooi-api DNS name does not exist in awoooi-prod.
+                  value: "http://awoooi-api-svc.awoooi-prod.svc.cluster.local:8000"
              resources:
                requests:
                  cpu: "50m"
@@ -54,4 +62,7 @@ spec:
                limits:
                  cpu: "200m"
                  memory: "128Mi"
-          serviceAccountName: awoooi-api
+          # 2026-05-05 Codex: this job only calls the internal API. The old
+          # awoooi-api ServiceAccount does not exist, which prevented Job pods
+          # from being created after reboot.
+          serviceAccountName: default
--- a/k8s/drift-cronjob.yaml
+++ b/k8s/drift-cronjob.yaml
@@ -63,10 +63,11 @@ spec:
                          print(f"status={r.status_code} body={r.text[:200]}")
                  asyncio.run(run())
              env:
-                # 2026-04-09 Claude Sonnet 4.6: ClusterIP 和 DNS 在 Job Pod 均不可達
-                # 改用 NodePort 直連 K3s worker node（同 K8s_API_SERVER_URL 解法）
+                # 2026-05-05 Codex: call the in-cluster Service instead of a
+                # fixed worker NodePort. After reboot, 121 can be unavailable
+                # while the Service and VIP are already healthy.
                - name: INTERNAL_API_URL
-                  value: "http://192.168.0.121:32334"
+                  value: "http://awoooi-api-svc.awoooi-prod.svc.cluster.local:8000"
                - name: DRIFT_SCAN_NAMESPACES
                  value: "awoooi-prod"
              resources:
--- a/scripts/cron_km_vectorize.py
+++ b/scripts/cron_km_vectorize.py
@@ -18,17 +18,21 @@ import httpx
 async def main() -> int:
    api_base = os.environ.get(
        "INTERNAL_API_URL",
-        "http://awoooi-api.awoooi-prod.svc.cluster.local:8000",
+        "http://awoooi-api-svc.awoooi-prod.svc.cluster.local:8000",
    )
    url = f"{api_base}/api/v1/knowledge/embed-all"

-    async with httpx.AsyncClient(timeout=120) as client:
+    async with httpx.AsyncClient(timeout=1800) as client:
        try:
            resp = await client.post(url)
            print(f"embed-all: {resp.status_code} {resp.text[:200]}")
            if resp.status_code >= 400:
                print(f"ERROR: embed-all returned {resp.status_code}", file=sys.stderr)
                return 1
+            result = resp.json()
+            if int(result.get("failed", 0)) > 0:
+                print(f"ERROR: embed-all failed rows: {result}", file=sys.stderr)
+                return 1
            return 0
        except httpx.RequestError as exc:
            print(f"ERROR: request failed — {exc}", file=sys.stderr)
--- a/scripts/reboot-recovery/full-stack-cold-start-check.sh
+++ b/scripts/reboot-recovery/full-stack-cold-start-check.sh
@@ -0,0 +1,398 @@
+#!/usr/bin/env bash
+# AWOOOI full-stack cold-start readiness check.
+# Read-only by design. It never restarts, deletes, repairs, or writes remote state.
+
+# Unset variables and pipeline failures are errors; `-e` is not set, so a
+# failing probe does not abort the script.
+set -uo pipefail
+
+# Options shared by every ssh invocation made through ssh_cmd.
+SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout=6)
+# Set to 1 by --send-alert-test; enables the Alertmanager webhook POST.
+SEND_ALERT_TEST=0
+
+# Argument parsing: one optional flag, a help text, and a hard error (exit 64)
+# for anything else.
+for arg in "$@"; do
+  if [ "$arg" = "--send-alert-test" ]; then
+    SEND_ALERT_TEST=1
+  elif [ "$arg" = "-h" ] || [ "$arg" = "--help" ]; then
+    cat <<'USAGE'
+Usage: bash scripts/reboot-recovery/full-stack-cold-start-check.sh [--send-alert-test]
+
+Default mode is read-only and does not POST an Alertmanager test event.
+Use --send-alert-test only after AWOOOI API is expected to be ready.
+USAGE
+    exit 0
+  else
+    echo "Unknown argument: $arg" >&2
+    exit 64
+  fi
+done
+
+# ANSI colour escapes used by the status printers; NC resets the colour.
+RED=$'\033[0;31m'
+GREEN=$'\033[0;32m'
+YELLOW=$'\033[1;33m'
+BLUE=$'\033[0;34m'
+NC=$'\033[0m'
+
+# Result counters, incremented by ok / warn / fail and read by summary.
+PASS=0
+WARN=0
+FAIL=0
+
+# Print a blank line followed by a blue "=== <title> ===" banner.
+# Arguments: $1 section title.
+log_section() {
+  local title="$1"
+  printf "\n%s=== %s ===%s\n" "$BLUE" "$title" "$NC"
+}
+
+# Report a passed check in green and count it in PASS.
+# Arguments: $1 message. Always ends with status 0, which the
+# `cond && ok ... || fail ...` call sites depend on.
+ok() {
+  local message="$1"
+  printf "%sOK%s %s\n" "$GREEN" "$NC" "$message"
+  : "$((PASS += 1))"
+}
+
+# Report a non-blocking finding in yellow and count it in WARN.
+# Arguments: $1 message. Always ends with status 0.
+warn() {
+  local message="$1"
+  printf "%sWARN%s %s\n" "$YELLOW" "$NC" "$message"
+  : "$((WARN += 1))"
+}
+
+fail() {
+  printf "%sBLOCKED%s %s\n" "$RED" "$NC" "$1"
+  FAIL=$((FAIL + 1))
+}
+
+run_local() {
+  local label="$1"
+  shift
+  if "$@" >/tmp/awoooi-cold-start-check.out 2>&1; then
+    ok "$label"
+    cat /tmp/awoooi-cold-start-check.out
+    return 0
+  fi
+  fail "$label"
+  cat /tmp/awoooi-cold-start-check.out
+  return 1
+}
+
+ssh_cmd() {
+  local user_host="$1"
+  local cmd="$2"
+  local prefix=""
+  if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
+    printf -v prefix 'REMOTE_SUDO_PASSWORD=%q ' "$REMOTE_SUDO_PASSWORD"
+  fi
+  ssh "${SSH_OPTS[@]}" "$user_host" "${prefix}${cmd}"
+}
+
+probe_http_code() {
+  local url="$1"
+  local code
+  code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "$url" 2>/dev/null || true)
+  echo "${code:-000}"
+}
+
+probe_tcp() {
+  local host="$1"
+  local port="$2"
+  nc -G 3 -z "$host" "$port" >/dev/null 2>&1 || nc -w 3 -z "$host" "$port" >/dev/null 2>&1
+}
+
+# Print the report banner: title, current local timestamp, and host scope.
+print_header() {
+  printf '%s\n' "AWOOOI full-stack cold-start check"
+  date '+%Y-%m-%d %H:%M:%S %Z'
+  printf '%s\n' "Scope: 110 / 120 / 121 / 188. 112 Kali is intentionally skipped."
+}
+
+check_network() {
+  log_section "P0-NETWORK"
+  local host
+  for host in 110 120 121 188; do
+    if ping -c 1 -W 2 "192.168.0.$host" >/dev/null 2>&1; then
+      ok "ping 192.168.0.$host"
+    else
+      fail "ping 192.168.0.$host"
+    fi
+
+    if probe_tcp "192.168.0.$host" 22; then
+      ok "ssh port 192.168.0.$host:22"
+    else
+      fail "ssh port 192.168.0.$host:22"
+    fi
+  done
+
+  arp -an | grep -E '192\.168\.0\.(110|120|121|188)' || warn "no ARP rows printed for one or more hosts"
+}
+
+# Gate P0-188-DATA: inspect host 192.168.0.188 over a single SSH session.
+# The remote script only prints labelled lines (HOST, MEM, SYSTEMD, PG, REDIS,
+# PORT5432, SIGNOZ_CODE, MOMO_HEALTH_CODE, DOCKER ...); the verdicts are then
+# derived locally by grepping that captured output.
+# BLOCKED: SSH failure, PostgreSQL port closed, PostgreSQL not accepting
+# connections. Everything else is reported as WARN when unconfirmed.
+check_188() {
+  log_section "P0-188-DATA"
+  local out
+  # stderr is folded into $out so a failed session still shows its error text.
+  if ! out=$(ssh_cmd "ollama@192.168.0.188" '
+echo "HOST $(hostname) $(uptime)"
+echo "MEM $(free -h | awk "/Mem:/ {print \$2,\$3,\$7}")"
+echo "SYSTEMD $(systemctl is-active containerd docker postgresql@14-main redis-server ollama nginx 2>/dev/null | tr "\n" " ")"
+echo "PG $(pg_isready -h localhost -p 5432 2>&1)"
+echo "REDIS $(redis-cli -p 6380 ping 2>/dev/null || redis-cli ping 2>/dev/null || true)"
+echo "PORT5432 $(nc -z -w 2 127.0.0.1 5432 >/dev/null 2>&1 && echo OPEN || echo CLOSED)"
+echo "SIGNOZ_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:3301/ || true)"
+echo "MOMO_HEALTH_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:5003/health || true)"
+docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -80
+' 2>&1); then
+    fail "ssh 188 read-only check"
+    echo "$out"
+    return
+  fi
+  echo "$out"
+
+  # Each `cond && ok || fail/warn` line relies on ok/warn returning 0.
+  grep -q "PORT5432 OPEN" <<<"$out" && ok "188 PostgreSQL port open" || fail "188 PostgreSQL port closed"
+  grep -q "accepting connections" <<<"$out" && ok "188 PostgreSQL accepting connections" || fail "188 PostgreSQL not accepting connections"
+  grep -q "REDIS PONG" <<<"$out" && ok "188 Redis PONG" || warn "188 Redis not confirmed"
+  # Inverted check: a match means the container is in a restart loop.
+  grep -q "momo-db.*Restarting" <<<"$out" && warn "188 momo-db restarting" || ok "188 momo-db not in visible restart loop"
+  grep -Eq "SIGNOZ_CODE (200|302|307)" <<<"$out" && ok "188 SignOz HTTP reachable" || warn "188 SignOz HTTP not confirmed"
+  grep -q "MOMO_HEALTH_CODE 200" <<<"$out" && ok "188 momo health reachable" || warn "188 momo health not confirmed"
+}
+
+# Gate P0-110-REGISTRY-OBSERVABILITY: inspect host 192.168.0.110 over a single
+# SSH session. The remote script prints HTTP status codes for the local
+# endpoints on ports 5000, 3001, 9090, 9093 and 9000 (labelled HARBOR, GITEA,
+# PROM, AM, SENTRY), selected systemd properties of every
+# "actions.runner.*" unit (prefixed RUNNER), and the docker container list.
+# BLOCKED: SSH failure, or a Harbor /v2/ code other than 200/401.
+# All other findings are WARN.
+check_110() {
+  log_section "P0-110-REGISTRY-OBSERVABILITY"
+  local out
+  if ! out=$(ssh_cmd "wooo@192.168.0.110" '
+echo "HOST $(hostname) $(uptime)"
+echo "MEM $(free -h | awk "/Mem:/ {print \$2,\$3,\$7}")"
+echo "DOCKER_SYSTEMD $(systemctl is-active docker 2>/dev/null || true)"
+echo "HARBOR_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:5000/v2/ || true)"
+echo "GITEA_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:3001/ || true)"
+echo "PROM_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:9090/-/ready || true)"
+echo "AM_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:9093/-/healthy || true)"
+echo "SENTRY_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 8 http://127.0.0.1:9000/ || true)"
+for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain 2>/dev/null | awk "{print \$1}"); do
+  systemctl show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts | sed "s/^/RUNNER $u /"
+done
+docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -120
+' 2>&1); then
+    fail "ssh 110 read-only check"
+    echo "$out"
+    return
+  fi
+  echo "$out"
+
+  grep -Eq "HARBOR_CODE (200|401)" <<<"$out" && ok "110 Harbor /v2 healthy code" || fail "110 Harbor not healthy"
+  grep -Eq "GITEA_CODE (200|302)" <<<"$out" && ok "110 Gitea reachable" || warn "110 Gitea not confirmed"
+  grep -q "PROM_CODE 200" <<<"$out" && ok "110 Prometheus ready" || warn "110 Prometheus not ready"
+  grep -q "AM_CODE 200" <<<"$out" && ok "110 Alertmanager healthy" || warn "110 Alertmanager not healthy"
+  grep -Eq "SENTRY_CODE (200|302|400)" <<<"$out" && ok "110 Sentry HTTP reachable" || warn "110 Sentry HTTP not confirmed"
+  # Passes as soon as any one RUNNER line shows WatchdogUSec=0; it does not
+  # verify every unit, and it warns when no runner unit was listed at all.
+  grep -q "WatchdogUSec=0" <<<"$out" && ok "runner watchdog disabled on at least one unit" || warn "runner watchdog state not confirmed"
+  # Inverted check: a match means the container is in a restart loop.
+  grep -q "sentry-self-hosted-clickhouse-1.*Restarting" <<<"$out" && warn "Sentry ClickHouse restarting" || ok "Sentry ClickHouse not visibly restarting"
+}
+
+# Gate P1-K3S: inspect 192.168.0.120 over SSH. The remote script reports
+# whether 188:5432 is reachable from 120, the systemd state of k3s, k3s-agent
+# and keepalived, the node list, the awoooi-prod pod list, and any interface
+# address line containing 192.168.0.125.
+# BLOCKED: SSH failure, 188 PostgreSQL unreachable from 120, or no " Ready "
+# text in either the remote output or the local kubectl fallback.
+# Note: a single Ready node satisfies the node check; it does not require
+# every node to be Ready.
+check_k3s() {
+  log_section "P1-K3S"
+  local out local_kubectl_out
+  # Remote kcmd helper: with REMOTE_SUDO_PASSWORD it feeds the password to
+  # `sudo -S`; otherwise it tries passwordless sudo, then plain kubectl.
+  if ! out=$(ssh_cmd "wooo@192.168.0.120" '
+echo "HOST $(hostname) $(uptime)"
+echo "PG188_PORT $(nc -z -w 2 192.168.0.188 5432 >/dev/null 2>&1 && echo OPEN || echo CLOSED)"
+echo "SYSTEMD $(systemctl is-active k3s k3s-agent keepalived 2>/dev/null | tr "\n" " ")"
+kcmd() {
+  if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
+    printf "%s\n" "$REMOTE_SUDO_PASSWORD" | sudo -S -p "" kubectl "$@"
+  else
+    sudo -n kubectl "$@" 2>/dev/null || kubectl "$@"
+  fi
+}
+kcmd get nodes -o wide 2>/dev/null || true
+kcmd get pods -n awoooi-prod -o wide 2>/dev/null || true
+ip addr show | grep 192.168.0.125 || true
+' 2>&1); then
+    fail "ssh 120 k3s read-only check"
+    echo "$out"
+    return
+  fi
+  echo "$out"
+
+  # When the remote output shows no Ready node, try the operator's local
+  # kubectl and print whatever it returns.
+  if ! grep -q " Ready " <<<"$out"; then
+    local_kubectl_out=$(kubectl get nodes -o wide 2>/dev/null || true)
+    if [ -n "$local_kubectl_out" ]; then
+      echo "LOCAL_KUBECTL_FALLBACK"
+      echo "$local_kubectl_out"
+    fi
+  else
+    local_kubectl_out=""
+  fi
+
+  grep -q "PG188_PORT OPEN" <<<"$out" && ok "120 can reach 188 PostgreSQL port" || fail "120 cannot reach 188 PostgreSQL"
+  # The Ready check accepts evidence from either source.
+  grep -q " Ready " <<<"$out$local_kubectl_out" && ok "K3s has Ready node output" || fail "K3s nodes not Ready or kubectl unavailable"
+  grep -q "192.168.0.125" <<<"$out" && ok "VIP 192.168.0.125 present on 120" || warn "VIP not confirmed on 120"
+}
+
+check_workload_and_alertchain() {
+  log_section "P2-WORKLOAD-ALERTCHAIN"
+  local api_code web_code alert_code
+  local out
+  if out=$(ssh_cmd "wooo@192.168.0.120" '
+api_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://192.168.0.125:32334/api/v1/health 2>/dev/null || true)
+web_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://192.168.0.125:32335/ 2>/dev/null || true)
+echo "API_CODE ${api_code:-000}"
+echo "WEB_CODE ${web_code:-000}"
+' 2>/dev/null); then
+    api_code=$(awk '/^API_CODE / {print $2}' <<<"$out")
+    web_code=$(awk '/^WEB_CODE / {print $2}' <<<"$out")
+  else
+    api_code=$(probe_http_code "http://192.168.0.125:32334/api/v1/health")
+    web_code=$(probe_http_code "http://192.168.0.125:32335/")
+    out="API_CODE $api_code
+WEB_CODE $web_code"
+  fi
+
+  echo "$out"
+
+  [[ "$api_code" =~ ^[23] ]] && ok "AWOOOI API reachable" || fail "AWOOOI API not reachable"
+  [[ "$web_code" =~ ^[23] ]] && ok "AWOOOI Web reachable" || warn "AWOOOI Web not confirmed"
+
+  if [ "$SEND_ALERT_TEST" -eq 1 ]; then
+    alert_code=$(ssh_cmd "wooo@192.168.0.120" 'curl -s -o /tmp/awoooi-alertchain.out -w "%{http_code}" --max-time 8 \
+      -X POST "http://192.168.0.125:32334/api/v1/webhooks/alertmanager" \
+      -H '"'"'Content-Type: application/json'"'"' \
+      -d '"'"'{"receiver":"cold-start-check","status":"firing","alerts":[{"status":"firing","labels":{"alertname":"ColdStartCheck","severity":"info"},"annotations":{"summary":"Cold start check"},"startsAt":"2026-05-05T11:00:00Z","endsAt":"0001-01-01T00:00:00Z","generatorURL":""}],"groupLabels":{},"commonLabels":{},"commonAnnotations":{},"externalURL":"","version":"4","groupKey":"cold-start-check"}'"'"' 2>/dev/null || echo "000"')
+    echo "ALERTCHAIN_CODE $alert_code"
+    [[ "$alert_code" =~ ^2 ]] && ok "Alertmanager webhook endpoint accepts POST" || warn "Alertmanager webhook E2E not confirmed"
+  else
+    warn "Alertmanager webhook POST skipped; rerun with --send-alert-test after API is ready"
+  fi
+}
+
+check_public_routes() {
+  log_section "P2-PUBLIC-ROUTES"
+  local awoooi_api_code awoooi_web_code momo_code momo_health_code
+  awoooi_api_code=$(probe_http_code "https://awoooi.wooo.work/api/v1/health")
+  awoooi_web_code=$(probe_http_code "https://awoooi.wooo.work/")
+  momo_code=$(probe_http_code "https://mo.wooo.work/")
+  momo_health_code=$(probe_http_code "https://mo.wooo.work/health")
+
+  echo "AWOOOI_PUBLIC_API_CODE $awoooi_api_code"
+  echo "AWOOOI_PUBLIC_WEB_CODE $awoooi_web_code"
+  echo "MOMO_PUBLIC_CODE $momo_code"
+  echo "MOMO_PUBLIC_HEALTH_CODE $momo_health_code"
+
+  [[ "$awoooi_api_code" =~ ^[23] ]] && ok "AWOOOI public API reachable" || warn "AWOOOI public API not confirmed"
+  [[ "$awoooi_web_code" =~ ^[23] ]] && ok "AWOOOI public web reachable" || warn "AWOOOI public web not confirmed"
+  [[ "$momo_code" =~ ^[23] ]] && ok "momo public route reachable" || warn "momo public route not confirmed"
+  [[ "$momo_health_code" =~ ^[23] ]] && ok "momo public health reachable" || warn "momo public health not confirmed"
+}
+
+check_schedules() {
+  log_section "P2-SCHEDULES"
+  local out
+
+  if out=$(ssh_cmd "ollama@192.168.0.188" '
+now=$(date +%s)
+echo "CRON_188 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
+for f in /home/ollama/node_exporter_textfiles/backup.prom /home/ollama/node_exporter_textfiles/docker_restart_count.prom /home/ollama/node_exporter_textfiles/docker_stats.prom; do
+  if [ -f "$f" ]; then
+    mt=$(stat -c %Y "$f")
+    echo "TEXTFILE_188 $(basename "$f") age=$((now - mt))"
+  else
+    echo "TEXTFILE_188 $(basename "$f") missing"
+  fi
+done
+if [ -f /home/ollama/node_exporter_textfiles/backup.prom ]; then
+  awk -v now="$now" "/^backup_110_last_success_timestamp / {printf \"BACKUP_110_AGE %d\\n\", now - int(\$2)}" /home/ollama/node_exporter_textfiles/backup.prom
+fi
+echo "SCHEDULER_REGISTERED $(docker logs --tail 200 momo-scheduler 2>&1 | grep -c "全部排程任務已註冊" || true)"
+' 2>&1); then
+    echo "$out"
+    grep -q "CRON_188 active" <<<"$out" && ok "188 cron active" || warn "188 cron not confirmed"
+    awk '/TEXTFILE_188 backup.prom age=/ {split($3,a,"="); exit !(a[2] < 90000)}' <<<"$out" && ok "188 backup textfile fresh enough" || warn "188 backup textfile stale or missing"
+    awk '/TEXTFILE_188 docker_restart_count.prom age=/ {split($3,a,"="); exit !(a[2] < 300)}' <<<"$out" && ok "188 docker restart exporter fresh" || warn "188 docker restart exporter stale"
+    awk '/TEXTFILE_188 docker_stats.prom age=/ {split($3,a,"="); exit !(a[2] < 300)}' <<<"$out" && ok "188 docker stats exporter fresh" || warn "188 docker stats exporter stale"
+    awk '/BACKUP_110_AGE / {exit !($2 < 90000)}' <<<"$out" && ok "188 backup-from-110 success within 25h" || warn "188 backup-from-110 success not confirmed"
+    awk '/SCHEDULER_REGISTERED / {exit !($2 > 0)}' <<<"$out" && ok "188 momo scheduler registered jobs" || warn "188 momo scheduler registration not confirmed"
+  else
+    warn "188 schedule check unavailable"
+    echo "$out"
+  fi
+
+  if out=$(ssh_cmd "wooo@192.168.0.110" '
+now=$(date +%s)
+echo "CRON_110 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
+echo "FAILED_UNITS_110 $(systemctl --failed --no-legend --plain 2>/dev/null | wc -l)"
+echo "MOMO_STARTUP_ENABLED $(systemctl is-enabled momo-startup-complete.service 2>/dev/null || true)"
+echo "STAGGERED_STARTUP_ENABLED $(systemctl is-enabled wooo-staggered-startup.service 2>/dev/null || true)"
+for f in /home/wooo/node_exporter_textfiles/docker_stats.prom /home/wooo/node_exporter_textfiles/systemd_units.prom; do
+  if [ -f "$f" ]; then
+    mt=$(stat -c %Y "$f")
+    echo "TEXTFILE_110 $(basename "$f") age=$((now - mt))"
+  else
+    echo "TEXTFILE_110 $(basename "$f") missing"
+  fi
+done
+' 2>&1); then
+    echo "$out"
+    grep -q "CRON_110 active" <<<"$out" && ok "110 cron active" || warn "110 cron not confirmed"
+    grep -q "FAILED_UNITS_110 0" <<<"$out" && ok "110 systemd has no failed units" || warn "110 systemd failed units remain"
+    grep -q "MOMO_STARTUP_ENABLED disabled" <<<"$out" && ok "110 stale momo startup unit disabled" || warn "110 stale momo startup unit not disabled"
+    grep -q "STAGGERED_STARTUP_ENABLED disabled" <<<"$out" && ok "110 stale staggered startup unit disabled" || warn "110 stale staggered startup unit not disabled"
+    awk '/TEXTFILE_110 docker_stats.prom age=/ {split($3,a,"="); exit !(a[2] < 300)}' <<<"$out" && ok "110 docker stats exporter fresh" || warn "110 docker stats exporter stale"
+    awk '/TEXTFILE_110 systemd_units.prom age=/ {split($3,a,"="); exit !(a[2] < 300)}' <<<"$out" && ok "110 systemd units exporter fresh" || warn "110 systemd units exporter stale"
+  else
+    warn "110 schedule check unavailable"
+    echo "$out"
+  fi
+
+  if out=$(ssh_cmd "wooo@192.168.0.120" '
+kcmd() {
+  if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
+    printf "%s\n" "$REMOTE_SUDO_PASSWORD" | sudo -S -p "" kubectl "$@"
+  else
+    sudo -n kubectl "$@" 2>/dev/null || kubectl "$@"
+  fi
+}
+echo "CRON_120 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
+kcmd get cronjobs -n awoooi-prod -o json | python3 -c "import json,sys; d=json.load(sys.stdin); items=d.get(\"items\", []); print(\"CRONJOB_COUNT\", len(items)); print(\"CRONJOB_SUSPENDED\", sum(1 for i in items if i.get(\"spec\",{}).get(\"suspend\")))"
+kcmd get jobs -n awoooi-prod -o json | python3 -c "import json,sys; d=json.load(sys.stdin); failed=0
+for j in d.get(\"items\", []):
+  if any(c.get(\"type\")==\"Failed\" and c.get(\"status\")==\"True\" for c in j.get(\"status\",{}).get(\"conditions\",[]) or []):
+    failed += 1
+print(\"FAILED_JOBS\", failed)"
+kcmd get pods -n awoooi-prod --no-headers 2>/dev/null | awk "\$3 !~ /^(Running|Completed)$/ {bad++} END {print \"BAD_PODS\", bad+0}"
+' 2>&1); then
+    echo "$out"
+    grep -q "CRON_120 active" <<<"$out" && ok "120 cron active" || warn "120 cron not confirmed"
+    awk '/CRONJOB_COUNT / {exit !($2 >= 4)}' <<<"$out" && ok "K8s AWOOOI CronJobs present" || warn "K8s AWOOOI CronJobs missing"
+    grep -q "CRONJOB_SUSPENDED 0" <<<"$out" && ok "K8s AWOOOI CronJobs unsuspended" || warn "K8s AWOOOI CronJob suspended"
+    grep -q "FAILED_JOBS 0" <<<"$out" && ok "K8s AWOOOI has no failed Jobs" || warn "K8s AWOOOI failed Jobs remain"
+    grep -q "BAD_PODS 0" <<<"$out" && ok "K8s AWOOOI pods Running/Completed only" || warn "K8s AWOOOI bad pod status remains"
+  else
+    warn "120 K8s schedule check unavailable"
+    echo "$out"
+  fi
+
+  if out=$(ssh_cmd "wooo@192.168.0.121" '
+echo "CRON_121 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
+crontab -l 2>/dev/null | grep -q "dr-drill.sh" && echo "DR_DRILL_CRON present" || echo "DR_DRILL_CRON missing"
+' 2>&1); then
+    echo "$out"
+    grep -q "CRON_121 active" <<<"$out" && ok "121 cron active" || warn "121 cron not confirmed"
+    grep -q "DR_DRILL_CRON present" <<<"$out" && ok "121 DR drill cron present" || warn "121 DR drill cron missing"
+  else
+    warn "121 schedule check unavailable"
+    echo "$out"
+  fi
+}
+
+summary() {
+  log_section "SUMMARY"
+  echo "PASS=$PASS WARN=$WARN BLOCKED=$FAIL"
+  if [ "$FAIL" -gt 0 ]; then
+    echo "Result: BLOCKED. Fix the first blocked gate before releasing runner/CD/AI auto-remediation."
+    exit 2
+  fi
+  if [ "$WARN" -gt 0 ]; then
+    echo "Result: DEGRADED. Core gates passed but warnings remain."
+    exit 1
+  fi
+  echo "Result: GREEN. Full stack is ready for controlled runner/CD release."
+}
+
+print_header
+check_network
+check_188
+check_110
+check_k3s
+check_workload_and_alertchain
+check_public_routes
+check_schedules
+summary