diff --git a/.gitea/workflows/cd-dev.yaml b/.gitea/workflows/cd-dev.yaml index ce1761b5..c1ccf2cd 100644 --- a/.gitea/workflows/cd-dev.yaml +++ b/.gitea/workflows/cd-dev.yaml @@ -108,7 +108,9 @@ jobs: mkdir -p ~/.ssh echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key chmod 600 ~/.ssh/deploy_key - ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 << SECRETS + # 2026-05-05 Codex: kubectl runs on 120 control-plane. 121 is a + # worker and its local kubeconfig points at 127.0.0.1:6443. + ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 << SECRETS set -e export KUBECONFIG=/etc/rancher/k3s/k3s.yaml @@ -138,10 +140,10 @@ jobs: SSH_PRIVATE_KEY: ${{ secrets.DEPLOY_SSH_KEY }} run: | cat k8s/awoooi-dev/02-configmap.yaml | \ - ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 \ + ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 \ "export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -" - ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 << 'DEPLOY' + ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 << 'DEPLOY' set -e export KUBECONFIG=/etc/rancher/k3s/k3s.yaml diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml index ec64d480..0eedc1db 100644 --- a/.gitea/workflows/cd.yaml +++ b/.gitea/workflows/cd.yaml @@ -406,8 +406,11 @@ jobs: mkdir -p ~/.ssh echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key chmod 600 ~/.ssh/deploy_key - ssh-keyscan 192.168.0.121 >> ~/.ssh/known_hosts 2>/dev/null - ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 << SECRETS + # 2026-05-05 Codex: kubectl must run on the 120 control-plane. + # 121 is a worker after cold-start recovery; its kubeconfig points at + # 127.0.0.1:6443 and fails ADR-035 secret patching. 
+ ssh-keyscan 192.168.0.120 >> ~/.ssh/known_hosts 2>/dev/null + ssh -i ~/.ssh/deploy_key wooo@192.168.0.120 << SECRETS set -e export KUBECONFIG=/etc/rancher/k3s/k3s.yaml @@ -634,19 +637,21 @@ jobs: mkdir -p ~/.ssh echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key chmod 600 ~/.ssh/deploy_key - ssh-keyscan 192.168.0.121 >> ~/.ssh/known_hosts 2>/dev/null + # 2026-05-05 Codex: deploy-side kubectl/ArgoCD operations run on 120 + # control-plane, not 121 worker. + ssh-keyscan 192.168.0.120 >> ~/.ssh/known_hosts 2>/dev/null IMAGE_TAG="${{ github.sha }}" HARBOR=192.168.0.110:5000 # ─── Step 1: Apply ConfigMap + ServiceRegistry (ArgoCD 管的是 Deployment,ConfigMap 仍直接 apply) ─── cat k8s/awoooi-prod/04-configmap.yaml | \ - ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 \ + ssh -i ~/.ssh/deploy_key wooo@192.168.0.120 \ "export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -" echo "✅ ConfigMap 已更新" cat k8s/awoooi-prod/15-service-registry-configmap.yaml | \ - ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 \ + ssh -i ~/.ssh/deploy_key wooo@192.168.0.120 \ "export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -" echo "✅ Service Registry ConfigMap 已更新" @@ -688,7 +693,7 @@ jobs: } # ─── Step 4: 等待 ArgoCD sync + rollout ─── - ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 \ + ssh -i ~/.ssh/deploy_key wooo@192.168.0.120 \ "EXPECTED_REVISION='${DEPLOY_REVISION}' bash -s" << 'ARGOCD_WAIT' set -e export KUBECONFIG=/etc/rancher/k3s/k3s.yaml @@ -814,7 +819,7 @@ jobs: - name: Alert Chain Smoke Test id: alert_chain_smoke run: | - # 2026-04-05 Claude Code: 使用真實 API 地址(192.168.0.121:32334 NodePort) + # 2026-05-05 Codex: use the keepalived VIP instead of a fixed node. # Host runner launches the CI image explicitly to avoid act RWLayer=nil. 
if docker run --rm \ --name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-alert-smoke" \ @@ -824,7 +829,7 @@ jobs: -v awoooi-api-venv-cache:/opt/api-venv \ -w /workspace \ "${{ env.CI_IMAGE }}" \ - bash -lc 'source /opt/api-venv/bin/activate && python3 scripts/alert_chain_smoke_test.py --api-url http://192.168.0.121:32334 --json | tee /tmp/alert_chain_result.json'; then + bash -lc 'source /opt/api-venv/bin/activate && python3 scripts/alert_chain_smoke_test.py --api-url http://192.168.0.125:32334 --json | tee /tmp/alert_chain_result.json'; then echo "alert_chain_status=pass" >> $GITHUB_OUTPUT else echo "alert_chain_status=fail" >> $GITHUB_OUTPUT diff --git a/apps/api/migrations/embedding_bge_m3_1024.sql b/apps/api/migrations/embedding_bge_m3_1024.sql index 935841db..9ea09246 100644 --- a/apps/api/migrations/embedding_bge_m3_1024.sql +++ b/apps/api/migrations/embedding_bge_m3_1024.sql @@ -6,8 +6,9 @@ -- bge-m3 產生 1024 維向量,現有 schema vector(768) 不相容,INSERT 會直接失敗 -- -- 影響範圍: --- 1. rag_chunks.embedding vector(768) → vector(1024) --- 2. playbook_embeddings.embedding vector(768) → vector(1024) +-- 1. knowledge_entries.embedding vector(768) → vector(1024) +-- 2. rag_chunks.embedding vector(768) → vector(1024) +-- 3. playbook_embeddings.embedding vector(768) → vector(1024) -- -- 遷移策略:清空現有向量資料,切換維度後由 re-embed script 重新嵌入 -- 現有向量資料若要保留,需先 dump 用 nomic 格式備份(舊維度無法轉換) @@ -21,7 +22,24 @@ BEGIN; --- 1. rag_chunks:清空向量資料,變更欄位維度 +-- 1. knowledge_entries:備份舊向量並清空,變更欄位維度 +CREATE TABLE IF NOT EXISTS knowledge_entries_embedding_backup_20260505 AS +SELECT + id, + embedding::text AS embedding_768, + NOW() AS backed_up_at +FROM knowledge_entries +WHERE embedding IS NOT NULL; + +ALTER TABLE knowledge_entries + ALTER COLUMN embedding TYPE vector(1024) + USING NULL; -- 清空現有 768 維向量(維度不可轉換) + +COMMENT ON COLUMN knowledge_entries.embedding IS + 'bge-m3:latest 1024 維向量 — 遷移自 nomic-embed-text 768 維 (2026-05-05 ADR-110 follow-up)'; + + +-- 2. 
rag_chunks:清空向量資料,變更欄位維度 -- ivfflat index 必須先 DROP 才能 ALTER COLUMN DROP INDEX IF EXISTS idx_rag_chunks_embedding; @@ -39,7 +57,7 @@ COMMENT ON COLUMN rag_chunks.embedding IS 'bge-m3:latest 1024 維向量 — 遷移自 nomic-embed-text 768 維 (2026-05-04 ADR-110)'; --- 2. playbook_embeddings:清空向量資料,變更欄位維度 +-- 3. playbook_embeddings:清空向量資料,變更欄位維度 DROP INDEX IF EXISTS ix_playbook_embeddings_vec; ALTER TABLE playbook_embeddings @@ -61,9 +79,15 @@ COMMENT ON TABLE playbook_embeddings IS -- 3. 驗證遷移結果 DO $$ DECLARE + v_km_dim integer; v_rag_dim integer; v_pb_dim integer; BEGIN + SELECT atttypmod INTO v_km_dim + FROM pg_attribute + JOIN pg_class ON attrelid = pg_class.oid + WHERE relname = 'knowledge_entries' AND attname = 'embedding'; + SELECT atttypmod INTO v_rag_dim FROM pg_attribute JOIN pg_class ON attrelid = pg_class.oid @@ -74,15 +98,18 @@ BEGIN JOIN pg_class ON attrelid = pg_class.oid WHERE relname = 'playbook_embeddings' AND attname = 'embedding'; - -- atttypmod for vector(1024) = 1024 + 1 = 1025 - IF v_rag_dim != 1025 THEN - RAISE EXCEPTION 'rag_chunks.embedding 維度驗證失敗:expected 1025, got %', v_rag_dim; + -- pgvector atttypmod stores the configured dimension. 
+ IF v_km_dim != 1024 THEN + RAISE EXCEPTION 'knowledge_entries.embedding 維度驗證失敗:expected 1024, got %', v_km_dim; END IF; - IF v_pb_dim != 1025 THEN - RAISE EXCEPTION 'playbook_embeddings.embedding 維度驗證失敗:expected 1025, got %', v_pb_dim; + IF v_rag_dim != 1024 THEN + RAISE EXCEPTION 'rag_chunks.embedding 維度驗證失敗:expected 1024, got %', v_rag_dim; + END IF; + IF v_pb_dim != 1024 THEN + RAISE EXCEPTION 'playbook_embeddings.embedding 維度驗證失敗:expected 1024, got %', v_pb_dim; END IF; - RAISE NOTICE '✅ embedding 遷移驗證通過:rag_chunks 和 playbook_embeddings 均為 vector(1024)'; + RAISE NOTICE '✅ embedding 遷移驗證通過:knowledge_entries、rag_chunks、playbook_embeddings 均為 vector(1024)'; END $$; COMMIT; diff --git a/apps/api/src/services/k3s_monitor_service.py b/apps/api/src/services/k3s_monitor_service.py index 8cdaea49..3f8d592c 100644 --- a/apps/api/src/services/k3s_monitor_service.py +++ b/apps/api/src/services/k3s_monitor_service.py @@ -34,8 +34,12 @@ logger = structlog.get_logger(__name__) # 台北時區 TZ_TAIPEI = ZoneInfo("Asia/Taipei") -# Prometheus 端點 -PROMETHEUS_URL = "http://192.168.0.121:30090" +# Prometheus endpoint. +# +# 2026-05-05 Codex: do not pin this report job to a K3s worker NodePort. +# Production already injects PROMETHEUS_URL from ConfigMap, currently the +# Docker Prometheus on 110. This keeps reboot recovery independent of 121. 
+PROMETHEUS_URL = settings.PROMETHEUS_URL.rstrip("/") # kube-state-metrics 查詢 PROM_QUERIES = { @@ -215,7 +219,7 @@ class K3sMonitorService: # 發送訊息 formatted = status.format() - result = await gateway.send_message(formatted) + result = await gateway.send_text(formatted) if result: logger.info("k3s_daily_report_sent", date=status.report_date) diff --git a/apps/api/src/services/weekly_report_service.py b/apps/api/src/services/weekly_report_service.py index d9d59ccb..568c5a45 100644 --- a/apps/api/src/services/weekly_report_service.py +++ b/apps/api/src/services/weekly_report_service.py @@ -244,7 +244,7 @@ class WeeklyReportService: # 發送訊息 formatted = report.format() - result = await gateway.send_message(formatted) + result = await gateway.send_text(formatted) if result: logger.info("weekly_report_sent", week=report.week_range) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index faffb4dd..b0f815d0 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -6,6 +6,38 @@ --- +## 2026-05-05 | 重開機後排程與 startup baseline 修復 + +**背景**:四台主機非預期重開機後,統帥要求確認所有服務、網站、工具、資料庫與排程都能正常恢復,不能只看容器 `healthy`。 + +**本次排程/啟動鏈修補**: +- 120/121 K3s 回到 Ready;CD workflow 目標從 121 改為 120,避免 121 worker kubeconfig `127.0.0.1:6443` 造成 Secrets patch 失敗;120 已驗證 limited sudo kubectl 可用。 +- K8s CronJob 修正:`k3s-status-report`、`weekly-report`、`km-vectorize` 改用存在的 service account、live API image、cluster service DNS;手動 job 驗證 drift/k3s/weekly 可完成,歷史 failed jobs 已清掉。 +- KM embedding schema 從 768/錯誤 typmod 修為 `vector(1024)`;原 embedding 已備份到 `knowledge_entries_embedding_backup_20260505`,正在以 `bge-m3:latest` 重建。 +- 188 momo backup script 修正 quote/validation/Telegram optional/error cleanup;成功產出 `/home/ollama/momo_backups/momo_analytics_20260505_212032.sql.gz`。 +- 188 `backup-from-110.sh` 因 SSH config 權限錯誤導致 `HostBackupFailed`;修正 `.ssh/config` 權限與 110 identity 設定後,以低優先權手動備份成功,Prometheus `backup_110_last_success_timestamp` 已更新。 +- 188 momo-scheduler 修正 dashboard URL:容器內改打 `http://momo-pro-system`,不再打 `127.0.0.1:5000`。 +- 188 
Google Drive token 從 legacy pickle 轉為 JSON,scheduler 容器內 `GoogleDriveService().authenticate()` 通過。 +- 188 daily sales import 修正 Excel sheet 選擇,優先讀 `即時業績明細`;手動匯入成功 `19934` 筆,日期 `2026-04-01 ~ 2026-05-03`。 +- 188 import 尾端驗證修正:改比對本次匯入日期範圍,不再用全表筆數硬比;`daily_sales_snapshot` 與 `realtime_sales_monthly` 在該日期範圍皆 `19934` 筆且驗證通過。 +- 110 startup 修復:移除 `/etc/sysctl.conf` 中誤寫的非法敏感純文字行;`systemd-sysctl` 恢復成功。 +- 110 停用兩個過期 startup units:`momo-startup-complete.service`(指向不存在路徑/錯 host)與 `wooo-staggered-startup.service`(舊 GitLab 延遲啟動且會增加重開機負載)。 +- 110 `awoooi-startup-110.service` timeout 從 5 分鐘延長到 15 分鐘,重跑後 `ActiveState=active`、`SubState=exited`、`Result=success`,`systemctl --failed` 為 0。 +- 110 certbot timer 失敗追查:`grist.wooo.work` / `registry.wooo.work` public route 目前被導向 `aiops.wooo.work`,HTTP-01 無法從 110 成功;已將兩個 stale renewal config 移至 `/etc/letsencrypt/renewal-disabled-codex-*`,並 reset certbot failed state。憑證 archive 未刪除;後續需修 public route 或改 DNS-01。 +- `scripts/reboot-recovery/full-stack-cold-start-check.sh` 新增 `P2-SCHEDULES`,覆蓋 188/110/120/121 cron、textfile mtime、188 backup freshness、110 failed units、K8s CronJob/Job/Pod 狀態、121 DR drill cron。 +- `docs/runbooks/FULL-STACK-COLD-START-SOP.md` 新增排程驗證章節與 done criteria,要求排程真正可執行才算 reboot recovery 完成。 + +**最終驗證**: +- KM reembed 完成:`1774/1774` success、`0` failed;DB 目前 `knowledge_entries` total `1785`、embedded `1776`、vector dims `1024..1024`,舊 embedding backup `1691` rows。 +- 手動 `km-vectorize` CronJob `km-vectorize-codex-220715` 完成,回 `embed-all: 200 {"total":0,"success":0,"failed":0}`。 +- `bash scripts/reboot-recovery/full-stack-cold-start-check.sh --send-alert-test` → `PASS=50 WARN=0 BLOCKED=0`,包含 Alertmanager webhook E2E、public routes、cron/CronJob/textfile/systemd schedule checks。 +- Prometheus firing alerts 已從 `HostBackupFailed + FlywheelExecutionRateMissing` 收斂為僅剩 `FlywheelExecutionRateMissing`;HostBackupFailed 解除。 +- 188/110 負載回到低檔;K3s node CPU 約 3-6%,KM reembed 未造成主機過載。 + +**下一步**: +- 將本次 runtime hotfix 對應的 repo changes 走正式 
deploy,避免下一版 image 覆蓋 hotfix。 +- 修 `grist.wooo.work` / `registry.wooo.work` public route 或改 DNS-01 renewal;目前舊 renewal config 已停用以避免 certbot timer 每次失敗。 + ## 2026-05-05 | 110 Sentry resource limits persistence gap closed **背景**:110 guardrail 告警已清,但主機 load 仍有長尾;統帥擔心 Claude Code 只做 live `docker update`,重建後配置又失效。 @@ -3066,3 +3098,42 @@ C1(evolver 加 YAML_RULE guard)+ C2(seeder SQL `AND status != 'deprecated' ```bash psql $DATABASE_URL -f apps/api/migrations/cleanup_duplicate_deprecated_playbooks.sql ``` + +--- + +## 2026-05-05(台北)— 四主機重開機後全站冷啟動救援 + +**觸發**:110 / 120 / 121 / 188 同時重開機後,多數服務異常;統帥要求先恢復所有網站、主機、核心服務,並建立完整冷啟動 SOP。 + +### 已恢復 + +| 範圍 | 結果 | +|------|------| +| 188 host PostgreSQL | WAL checkpoint 損壞;已備份後 `pg_resetwal`,`k3s_datastore` `REINDEX` + `VACUUM ANALYZE` 完成 | +| K3s datastore | 刪除並備份可重建的腐壞 HPA / VPA / VPA checkpoint / `mon1` node rows;120 / 121 重新 Ready | +| AWOOI prod | `awoooi-api` / `awoooi-web` / `awoooi-worker` Running;VIP `192.168.0.125` 內網驗證 API 200 / Web 307 | +| mo.wooo.work | `momo-db` WAL redo 損壞;備份後 `pg_resetwal`,`momo-pro-system` / scheduler / bot / DB 全部 healthy;公網 `/` 200、`/health` 200 | +| 110 host overload | actions runner units 維持最後放行;Sentry ClickHouse/Kafka 已從 dirty-reboot 損壞中恢復,Sentry stack healthy | +| 188 SignOz | SignOz ClickHouse volume 出現 filesystem corruption;已 clean-clone 可讀資料並保留原始 corrupt volume,SignOz HTTP 恢復 | +| 冷啟動 SOP | 新增 `docs/runbooks/FULL-STACK-COLD-START-SOP.md` 與 `scripts/reboot-recovery/full-stack-cold-start-check.sh` | + +### 驗證 + +```bash +bash scripts/reboot-recovery/full-stack-cold-start-check.sh --send-alert-test +# PASS=31 WARN=0 BLOCKED=0 +# Result: GREEN. Full stack is ready for controlled runner/CD release. 
+``` + +### Dirty reboot 資料保全 + +- 110 Sentry ClickHouse:原始壞 volume 保留為 `/var/lib/docker/volumes/sentry-clickhouse/_data.corrupt-20260505-203346`;以 clean-clone 恢復可讀資料並加 `force_restore_data`。 +- 110 Sentry Kafka:malformed checkpoint 已備份至 `/var/backups/sentry-kafka-checkpoints-20260505-203942`,只重建 checkpoint,不刪 topic/log data。 +- 188 SignOz ClickHouse:原始壞 volume 保留為 `/var/lib/docker/volumes/signoz-clickhouse/_data.corrupt-20260505-203735`;以 clean-clone 恢復可讀資料。 +- 188 `momo-db`:WAL reset 前備份 `/var/backups/postgresql/momo-db-before-pg-resetwal-20260505-200834.tgz`。 + +### 已知隔離 / 後續 + +- 110 actions runner units 仍按策略最後放行:guardrail 已套用,`CPUQuota=200%`、`MemoryMax=2G`、`WatchdogUSec=0`;需在 load/core 穩定後逐步開啟。 +- `Bad message` / `Structure needs cleaning` 是 host filesystem 層訊號;線上 clean-clone 已恢復服務,但完整歷史資料追溯需安排離線 `fsck` 或備份驗證。 +- `drift-scanner-29633040-qrf8w` 為單次 CronJob Error,不阻斷主服務;後續可清理或調查。 diff --git a/docs/runbooks/FULL-STACK-COLD-START-SOP.md b/docs/runbooks/FULL-STACK-COLD-START-SOP.md new file mode 100644 index 00000000..9c04d3a2 --- /dev/null +++ b/docs/runbooks/FULL-STACK-COLD-START-SOP.md @@ -0,0 +1,497 @@ +# AWOOOI Full-Stack Cold Start SOP + +> Version: v1.0 +> Last updated: 2026-05-05 Asia/Taipei +> Scope: 110 / 120 / 121 / 188 full-stack reboot recovery. 112 Kali is recorded as P3 optional and is not part of this recovery path. + +--- + +## 0. When To Use This + +Use this SOP when any of these happen: + +- 110/120/121/188 reboot unexpectedly. +- All services are abnormal after a power/network event. +- K3s is stuck `activating`. +- Host load remains high during startup and service health is mixed. +- Monitoring, alerting, CD, AI auto-repair, and Docker Compose services disagree about the real state. + +The rule is simple: **recover the dependency chain, not the loudest symptom.** + +--- + +## 1. Golden Startup Order + +```text +0. Freeze automation and preserve evidence +1. Physical/network layer +2. 188 data layer +3. 110 registry/observability layer +4. 
120/121 K3s layer +5. AWOOOI workload layer +6. Public routes and alert chain +7. High-load batch/consumer/crawler services +8. Runner/CD +9. AI auto-remediation +10. 112 Kali scanner, if needed +``` + +Never start runner/CD before 188 PostgreSQL, 110 Harbor, K3s nodes, and AWOOOI API are healthy. + +--- + +## 2. Automation Freeze + +Cold start creates noisy metrics and partial failures. During P0/P1, keep automation in observe-only mode. + +| Item | Cold-start policy | Reason | +|------|-------------------|--------| +| Gitea/GitHub runners | Last | Build jobs can saturate 110 CPU/RAM. | +| momo-scheduler / crawlers | Last | Chrome and batch work can saturate 188. | +| Sentry/Snuba consumers | Controlled | Kafka backlog and ClickHouse merge can create temporary high load. | +| Alertmanager outbound notification | Gate | Avoid alert storms before API webhook and Telegram are verified. | +| AI auto-repair | Observe-only | Metrics, Redis, KM, and playbooks may be incomplete. | +| Stateful DB restart | Human approval | PostgreSQL, Redis, ClickHouse, Harbor DB, Sentry DB are not generic restart targets. | + +--- + +## 3. 
P0 Evidence And Network + +Run from any machine on the same LAN (the `nc -G 3` connect-timeout flag below is macOS-specific; on Linux use `nc -w 3 -z`): + +```bash +for h in 110 120 121 188; do + ping -c 2 -W 2 192.168.0.$h >/dev/null && echo "PING_OK 192.168.0.$h" || echo "PING_FAIL 192.168.0.$h" +done + +arp -an | grep -E '192\.168\.0\.(110|120|121|188)' +for h in 110 120 121 188; do + nc -G 3 -z 192.168.0.$h 22 && echo "SSH_OK 192.168.0.$h" || echo "SSH_FAIL 192.168.0.$h" +done +``` + +Then capture reboot evidence: + +```bash +ssh ollama@192.168.0.188 'hostname; date; uptime; who -b; last -x reboot shutdown | head -20' +ssh wooo@192.168.0.110 'hostname; date; uptime; who -b; last -x reboot shutdown | head -20' +ssh wooo@192.168.0.120 'hostname; date; uptime; who -b; last -x reboot shutdown | head -20' +ssh wooo@192.168.0.121 'hostname; date; uptime; who -b; last -x reboot shutdown | head -20' +``` + +If any host has ARP `incomplete` or SSH port down, stop here and fix physical/network first. + +--- + +## 4. P0 188 Data Layer + +188 is the first real service dependency because K3s datastore and AWOOOI DB depend on PostgreSQL. + +### 4.1 Startup order + +1. `containerd` +2. `docker` +3. `postgresql@14-main` +4. `k3s_datastore.kine` maintenance +5. `redis-server` on `6380` +6. `ollama` or current AI proxy dependencies +7. `nginx` +8. Docker networks +9. MinIO / OpenClaw / SignOz +10. momo / litellm / batch services after load is stable + +### 4.2 Read-only check + +```bash +ssh ollama@192.168.0.188 ' +hostname; date; uptime; free -h +systemctl is-active containerd docker postgresql@14-main redis-server ollama nginx || true +pg_isready -h localhost -p 5432 || true +redis-cli -p 6380 ping 2>/dev/null || redis-cli ping 2>/dev/null || true +docker ps --format "{{.Names}}\t{{.Status}}\t{{.Ports}}" | head -120 +' +``` + +### 4.3 PostgreSQL WAL checkpoint damage + +Signature: + +```text +PANIC: could not locate a valid checkpoint record +invalid primary checkpoint record +unexpected pageaddr ... in log segment ... 
+``` + +This blocks: + +- `188:5432` +- K3s startup on 120/121 +- AWOOOI API DB access +- Alertmanager webhook if API cannot start + +Human-approved recovery command on 188: + +```bash +sudo systemctl stop postgresql@14-main +sudo install -d -m 700 -o postgres -g postgres /var/backups/postgresql +sudo tar -C /var/lib/postgresql/14 -czf /var/backups/postgresql/14-main-before-pg-resetwal-$(date +%Y%m%d-%H%M%S).tgz main +sudo -u postgres /usr/lib/postgresql/14/bin/pg_resetwal -f /var/lib/postgresql/14/main +sudo systemctl start postgresql@14-main +pg_isready -h localhost -p 5432 +sudo -u postgres psql -d k3s_datastore -c "VACUUM ANALYZE kine;" +``` + +Do not run `DROP`, reinitialize the cluster, delete `/var/lib/postgresql`, or restore an old backup unless the commander explicitly approves it. + +--- + +## 5. P0/P1 110 Registry And Observability + +110 must recover Harbor/Gitea/Monitoring early, but runners last. + +### 5.1 Startup order + +1. `docker` +2. Remove `Exited (128)` / `Exited (137)` orphan containers +3. Harbor `harbor-log` +4. Harbor full stack +5. Gitea +6. Prometheus / Alertmanager / Grafana / exporters +7. Langfuse +8. SignOz +9. Sentry DB layer +10. Sentry web/worker/consumer layer +11. Gitea host runner and actions runners + +### 5.2 Checks + +```bash +ssh wooo@192.168.0.110 ' +hostname; date; uptime; free -h +systemctl is-active docker || true +curl -s -o /dev/null -w "harbor=%{http_code}\n" --max-time 5 http://127.0.0.1:5000/v2/ || true +curl -s -o /dev/null -w "gitea=%{http_code}\n" --max-time 5 http://127.0.0.1:3001/ || true +curl -s --max-time 5 http://127.0.0.1:9090/-/ready || true +curl -s --max-time 5 http://127.0.0.1:9093/-/healthy || true +curl -s -o /dev/null -w "sentry=%{http_code}\n" --max-time 10 http://127.0.0.1:9000/ || true +docker ps --format "{{.Names}}\t{{.Status}}" | head -120 +' +``` + +Harbor healthy means `/v2/` returns `200` or `401`. Do not treat `401` as failure. 
+ +### 5.3 Runner gate + +Runner may start only after all are true: + +- `188 PostgreSQL` ready +- `110 Harbor` ready +- `110 Gitea` ready +- `120/121 K3s` nodes ready +- AWOOOI API health passes +- 110 load/core is below `1.0` for at least 15 minutes +- runner systemd guardrails are active: `CPUQuota=200%`, `MemoryMax=2G`, `WatchdogUSec=0` + +Check: + +```bash +ssh wooo@192.168.0.110 ' +for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain | awk "{print \$1}"); do + echo "=== $u ===" + systemctl show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts +done +' +``` + +If `WatchdogUSec` is not `0`, apply the guardrail script manually with sudo: + +```bash +sudo /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply +``` + +--- + +## 6. P1 120/121 K3s + +K3s must wait for 188 PostgreSQL and 110 Harbor. + +### 6.1 Startup order + +1. 120 `k3s.service` +2. 121 `k3s-agent.service` or its live role +3. CNI / kube-proxy +4. Nodes Ready +5. Core pods +6. `awoooi-prod` pods +7. keepalived VIP `192.168.0.125` +8. NodePorts `32334` and `32335` + +### 6.2 Checks + +```bash +ssh wooo@192.168.0.120 ' +hostname; uptime +pg_isready -h 192.168.0.188 -p 5432 || true +systemctl is-active k3s k3s-agent keepalived 2>/dev/null || true +kubectl get nodes -o wide 2>/dev/null || true +kubectl get pods -A 2>/dev/null | grep -v -E "Running|Completed" || true +kubectl get pods -n awoooi-prod -o wide 2>/dev/null || true +ip addr show | grep 192.168.0.125 || true +' + +ssh wooo@192.168.0.121 ' +hostname; uptime +systemctl is-active k3s k3s-agent keepalived 2>/dev/null || true +ip addr show | grep 192.168.0.125 || true +' +``` + +If K3s is `activating` while 188 PostgreSQL is down, fix PostgreSQL first. Restarting K3s repeatedly will not solve it. + +--- + +## 7. 
P2 AWOOOI Workloads + +Run after K3s nodes are Ready: + +```bash +ssh wooo@192.168.0.120 ' +kubectl get deploy -n awoooi-prod +kubectl get pods -n awoooi-prod -o wide +kubectl get svc -n awoooi-prod +kubectl get events -n awoooi-prod --sort-by=.lastTimestamp | tail -40 +' + +curl -s --max-time 8 http://192.168.0.125:32334/api/v1/health +curl -s -o /dev/null -w "web=%{http_code}\n" --max-time 8 http://192.168.0.125:32335/ +``` + +If pods are `ImagePullBackOff`, go back to 110 Harbor. + +If API health fails because DB/Redis is down, go back to 188. + +--- + +## 8. P2 Alert Chain + +Current main path: + +```text +Prometheus/Alertmanager on 110 + -> http://192.168.0.125:32334/api/v1/webhooks/alertmanager + -> AWOOOI API + -> TelegramGateway + -> Telegram +``` + +Alertmanager health alone is not enough. Run E2E: + +```bash +curl -s -X POST http://192.168.0.125:32334/api/v1/webhooks/alertmanager \ + -H 'Content-Type: application/json' \ + -d '{"receiver":"cold-start-test","status":"firing","alerts":[{"status":"firing","labels":{"alertname":"ColdStartE2ETest","severity":"info"},"annotations":{"summary":"Cold start E2E test, ignore"},"startsAt":"2026-05-05T11:00:00Z","endsAt":"0001-01-01T00:00:00Z","generatorURL":""}],"groupLabels":{},"commonLabels":{},"commonAnnotations":{},"externalURL":"","version":"4","groupKey":"cold-start-test"}' +``` + +Expected: API returns success and Telegram receives the test alert. + +--- + +## 9. P2 Schedules And Delayed Work + +Do not mark the reboot complete until scheduled work is proven runnable. A container can be healthy while its cron path is broken. 
+ +| Host / Layer | Required check | Success baseline | +|--------------|----------------|------------------| +| 188 cron | `systemctl is-active cron` and `crontab -l` | cron active; backup, restart exporter, stats exporter entries present | +| 188 backup-from-110 | `backup_110_last_success_timestamp` in textfile/Prometheus | last success age `< 25h` | +| 188 momo-scheduler | `docker logs momo-scheduler` | `全部排程任務已註冊`; Google Drive auth works; dashboard URLs use container-reachable hostnames | +| 188 momo import | manual `run_auto_import_task()` after parser changes | selected sheet is `即時業績明細`; imported date range has matching rows in `daily_sales_snapshot` and `realtime_sales_monthly` | +| 110 cron | `systemctl is-active cron` | cron active; Docker/systemd textfile exporters fresh | +| 110 startup units | `systemctl --failed` | zero failed units; stale `momo-startup-complete` and `wooo-staggered-startup` disabled | +| 120 K8s CronJobs | `kubectl get cronjobs -n awoooi-prod` | unsuspended; no failed Jobs remain after current validation | +| 121 DR drill | `crontab -l` | DR drill cron present unless explicitly paused | + +Useful checks: + +```bash +ssh ollama@192.168.0.188 'systemctl is-active cron; crontab -l; ls -l /home/ollama/node_exporter_textfiles/*.prom' +ssh wooo@192.168.0.110 'systemctl --failed --no-pager; systemctl is-active cron; crontab -l' +ssh wooo@192.168.0.120 'sudo kubectl get cronjobs,jobs -n awoooi-prod' +ssh wooo@192.168.0.121 'systemctl is-active cron; crontab -l' +``` + +If a schedule succeeds but emits a false verification alert, fix the verification rule before releasing AI auto-remediation. False positives train operators to ignore real alarms. + +--- + +## 10. P2/P3 Stateful Service Guardrails + +| Tier | Examples | Automation | +|------|----------|------------| +| BLOCK | PostgreSQL data dir, ClickHouse data dir, Harbor DB, Sentry DB | No automatic destructive action. Human approval only. 
| +| CRITICAL_HITL | Redis, Kafka, MinIO, SignOz ClickHouse, Sentry ClickHouse | Human-in-the-loop restart/repair. | +| STANDARD_HITL | API/Web/worker, OpenClaw, litellm | Restart only with evidence and blast-radius check. | +| AUTO | Stateless exporters, blackbox, nginx exporter | Auto restart allowed after verification. | + +Never use generic `docker restart $(docker ps -q)` during cold start. + +### 10.1 Dirty-Reboot Storage Corruption + +Treat these log signatures as storage corruption, not ordinary service flakiness: + +- `Bad message` +- `Structure needs cleaning` +- `Unknown codec` +- `PANIC: could not locate a valid checkpoint record` +- Kafka `Malformed line` in checkpoint files +- ClickHouse `broken and needs manual correction` + +Cold-start automation may stop a restart storm and collect evidence, but it must not delete the original data directory. If a filesystem returns `Bad message` or `Structure needs cleaning`, the real root cause is below the container layer. Online recovery can restore service from readable data, but complete historical recovery requires an offline filesystem check or backup restore. + +### 10.2 ClickHouse Clean-Clone Recovery Pattern + +Use this pattern for Sentry ClickHouse or SignOz ClickHouse when individual corrupted parts cannot be moved because the host filesystem rejects reads. + +```text +1. Stop the compose stack or at least stop dependent consumers. +2. Disable restart loops for the failing container. +3. Save logs and build an exclude list from unreadable store paths. +4. Preserve the original volume as _data.corrupt-YYYYMMDD-HHMMSS. +5. Create a clean _data clone with readable files only. +6. Add flags/force_restore_data. +7. Start ClickHouse first, then web/API, then consumers. +8. Verify HTTP, merge backlog, and restart count before releasing high-load services. +``` + +Do not replace this with `rm -rf store/...` unless the unreadable path is already backed up or the commander explicitly accepts data loss. 
The preferred incident artifact is: + +```text +/var/lib/docker/volumes/VOLUME_NAME/_data.corrupt-YYYYMMDD-HHMMSS +/var/backups/SERVICE-ARTIFACT-YYYYMMDD-HHMMSS +``` + +### 10.3 Kafka Checkpoint Recovery Pattern + +If Kafka refuses to start with malformed checkpoint files after a dirty reboot, preserve and move only checkpoint files: + +```text +log-start-offset-checkpoint +recovery-point-offset-checkpoint +replication-offset-checkpoint +``` + +Then start Kafka and confirm health before starting Snuba/Sentry consumers. Do not delete topic directories or Kafka logs during cold-start recovery. + +--- + +## 11. P3 High-Load Services + +Only release these after P0/P1/P2 gates are green: + +| Host | Service | Release condition | +|------|---------|-------------------| +| 188 | momo-scheduler / crawler | load/core < 1.0 for 15 minutes and DB healthy | +| 188 | SignOz ClickHouse | healthy and merge backlog trending down | +| 188 | litellm | `/health/liveliness` good and provider route verified | +| 110 | Sentry Snuba consumers | ClickHouse healthy and Kafka backlog decreasing | +| 110 | Sentry uptime-checker | Sentry web/DB healthy | +| 110 | runners | all previous gates green and load/core < 1.0 for 15 minutes | + +--- + +## 12. 
Baseline And AI Auto-Remediation Gate + +### 12.1 Stable Runtime Baseline + +These are release gates after the first cold-start recovery pass: + +| Area | Baseline | +|------|----------| +| 188 host | PostgreSQL accepting, Redis PONG, momo `/health` 200, SignOz HTTP reachable, load/core < 1.0 sustained before crawlers | +| 110 host | Harbor `/v2/` 200/401, Gitea 200/302, Prometheus ready, Alertmanager healthy, Sentry HTTP 200/302/400, no ClickHouse/Kafka restart loop | +| K3s | 120/121 nodes Ready, VIP `192.168.0.125` present, AWOOOI API 2xx/3xx, Web 2xx/3xx | +| Public routes | `https://awoooi.wooo.work/api/v1/health` 2xx/3xx, `https://mo.wooo.work/health` 2xx/3xx | +| Guardrails | Docker/systemd textfile exporters fresh, runner `CPUQuota=200%`, `MemoryMax=2G`, `WatchdogUSec=0` | +| Schedules | cron active on 110/188/120/121; K8s CronJobs unsuspended; no current failed Jobs; 188 backup success `< 25h` | +| Backlog | ClickHouse merges and Kafka/Snuba lag trending down, not increasing for two consecutive checks | + +If service health is green but load average remains high, check live CPU and IO before changing memory limits. High load after Sentry/Snuba or ClickHouse startup can be backlog drain; high CPU from runners/builds/crawlers is a release-order problem. + +### 12.2 AI Auto-Remediation Gate + +AI auto-repair can move from observe-only to limited execution only after: + +- Prometheus rules are loaded. +- docker/systemd textfile exporter files are fresh. +- blackbox probes have stable results. +- cron/CronJob schedule checks are green. +- AWOOOI API `/api/v1/health` passes. +- Alertmanager E2E webhook passes. +- Redis/KM/playbook health is available. +- No active restart storm. +- Host load/core remains below `1.0` for 15 minutes. + +Until then: + +- diagnose only +- notify only +- require human approval for remediation +- no DB/ClickHouse/Harbor/Sentry destructive action +- no generic restart action against stateful services + +--- + +## 13. 
One-Command Readiness Script + +Run: + +```bash +bash scripts/reboot-recovery/full-stack-cold-start-check.sh +``` + +The script is read-only by default; when run with `--send-alert-test` it also sends the Alertmanager webhook E2E test alert. It reports gates: + +- `P0-NETWORK` +- `P0-188-DATA` +- `P0-110-REGISTRY` +- `P1-K3S` +- `P2-WORKLOAD` +- `P2-ALERTCHAIN` +- `P2-PUBLIC-ROUTES` +- `P2-SCHEDULES` +- runner guardrail state inside `P0-110-REGISTRY-OBSERVABILITY` + +If it prints `BLOCKED`, fix the first blocked gate before moving forward. + +--- + +## 14. Done Criteria + +All must be true: + +- Four hosts reachable by SSH. +- 188 PostgreSQL and Redis healthy. +- 110 Harbor, Gitea, Prometheus, Alertmanager healthy. +- 120/121 K3s nodes Ready. +- VIP `192.168.0.125` present. +- AWOOOI API and Web reachable through NodePort/VIP. +- Alertmanager E2E webhook succeeds. +- cron/CronJob schedules are active, unsuspended, and verified. +- Sentry and SignOz are either healthy or explicitly in controlled backlog recovery. +- High-load batch services are capped or delayed. +- Runners are guarded and released last. +- AI auto-remediation is not in full execution mode until all gates are green. + +--- + +## 15. Known Drift To Fix After Recovery + +These must be cleaned after the incident, not during P0: + +- `SERVICE-ENDPOINTS.md` still has old Prometheus/Alertmanager locations. +- Audit older docs for direct node webhook targets; current main path should be VIP `192.168.0.125:32334`. +- OpenClaw `8088` vs `8089` must be live-confirmed and normalized. +- 188 compose paths drift between `/home/ollama/*` and Ansible `/opt/*`. +- 110 runner docs still mention Docker runner in places; live startup prefers host `gitea-act-runner-host.service`. +- `scripts/setup-runner-watchdog.sh` conflicts with the 2026-05-05 runner watchdog disablement guardrail. +- `grist.wooo.work` / `registry.wooo.work` public HTTP/HTTPS currently route to `aiops.wooo.work`; their old 110 certbot renewal configs are disabled until public routing is corrected or DNS-01 renewal is configured. 
diff --git a/k8s/awoooi-prod/13-cronjob-k3s-report.yaml b/k8s/awoooi-prod/13-cronjob-k3s-report.yaml index 403a3307..6588b82e 100644 --- a/k8s/awoooi-prod/13-cronjob-k3s-report.yaml +++ b/k8s/awoooi-prod/13-cronjob-k3s-report.yaml @@ -42,8 +42,11 @@ spec: restartPolicy: OnFailure containers: - name: k3s-report - image: 192.168.0.110:5000/awoooi-api:latest - imagePullPolicy: Always + # 2026-05-05 Codex: keep the API image placeholder so CD + # injects the same immutable tag used by API/worker. The old + # awoooi-api:latest repo returns 400 from Harbor after reboot. + image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER + imagePullPolicy: IfNotPresent command: - python - -m @@ -63,5 +66,7 @@ spec: limits: cpu: "200m" memory: "128Mi" - # 使用 API 的 ServiceAccount (需要 RBAC) - serviceAccountName: awoooi-api + # 2026-05-05 Codex: this report only calls Prometheus and Telegram. + # The old awoooi-api ServiceAccount does not exist, which prevented + # Job pods from being created after reboot. + serviceAccountName: default diff --git a/k8s/awoooi-prod/14-cronjob-weekly-report.yaml b/k8s/awoooi-prod/14-cronjob-weekly-report.yaml index cffcc039..d3677082 100644 --- a/k8s/awoooi-prod/14-cronjob-weekly-report.yaml +++ b/k8s/awoooi-prod/14-cronjob-weekly-report.yaml @@ -42,8 +42,11 @@ spec: restartPolicy: OnFailure containers: - name: weekly-report - image: 192.168.0.110:5000/awoooi-api:latest - imagePullPolicy: Always + # 2026-05-05 Codex: keep the API image placeholder so CD + # injects the same immutable tag used by API/worker. The old + # awoooi-api:latest repo returns 400 from Harbor after reboot. + image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER + imagePullPolicy: IfNotPresent command: - python - -m @@ -63,5 +66,7 @@ spec: limits: cpu: "500m" memory: "256Mi" - # 使用 API 的 ServiceAccount (需要 RBAC) - serviceAccountName: awoooi-api + # 2026-05-05 Codex: this report only calls app services, Prometheus, + # Git, and Telegram. 
The old awoooi-api ServiceAccount does not + # exist, which prevented Job pods from being created after reboot. + serviceAccountName: default diff --git a/k8s/awoooi-prod/15-cronjob-km-vectorize.yaml b/k8s/awoooi-prod/15-cronjob-km-vectorize.yaml index 77c97068..2c2e5e7c 100644 --- a/k8s/awoooi-prod/15-cronjob-km-vectorize.yaml +++ b/k8s/awoooi-prod/15-cronjob-km-vectorize.yaml @@ -27,7 +27,10 @@ spec: jobTemplate: spec: backoffLimit: 2 - activeDeadlineSeconds: 300 + # 2026-05-05 Codex: allow post-reboot/post-migration catch-up batches. + # The script now fails if the API reports failed rows, so this longer + # deadline does not hide partial vectorization. + activeDeadlineSeconds: 1800 template: metadata: labels: @@ -37,8 +40,11 @@ spec: restartPolicy: OnFailure containers: - name: km-vectorize - image: 192.168.0.110:5000/awoooi-api:latest - imagePullPolicy: Always + # 2026-05-05 Codex: keep the API image placeholder so CD + # injects the same immutable tag used by API/worker. The old + # awoooi-api:latest repo returns 400 from Harbor after reboot. + image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER + imagePullPolicy: IfNotPresent command: - python - /app/scripts/cron_km_vectorize.py @@ -46,7 +52,9 @@ spec: - name: TZ value: "Asia/Taipei" - name: INTERNAL_API_URL - value: "http://awoooi-api.awoooi-prod.svc.cluster.local:8000" + # 2026-05-05 Codex: use the actual Service name; the old + # awoooi-api DNS name does not exist in awoooi-prod. + value: "http://awoooi-api-svc.awoooi-prod.svc.cluster.local:8000" resources: requests: cpu: "50m" @@ -54,4 +62,7 @@ spec: limits: cpu: "200m" memory: "128Mi" - serviceAccountName: awoooi-api + # 2026-05-05 Codex: this job only calls the internal API. The old + # awoooi-api ServiceAccount does not exist, which prevented Job pods + # from being created after reboot. 
+ serviceAccountName: default diff --git a/k8s/drift-cronjob.yaml b/k8s/drift-cronjob.yaml index 9c17e4f7..7bf44a0f 100644 --- a/k8s/drift-cronjob.yaml +++ b/k8s/drift-cronjob.yaml @@ -63,10 +63,11 @@ spec: print(f"status={r.status_code} body={r.text[:200]}") asyncio.run(run()) env: - # 2026-04-09 Claude Sonnet 4.6: ClusterIP 和 DNS 在 Job Pod 均不可達 - # 改用 NodePort 直連 K3s worker node(同 K8s_API_SERVER_URL 解法) + # 2026-05-05 Codex: call the in-cluster Service instead of a + # fixed worker NodePort. After reboot, 121 can be unavailable + # while the Service and VIP are already healthy. - name: INTERNAL_API_URL - value: "http://192.168.0.121:32334" + value: "http://awoooi-api-svc.awoooi-prod.svc.cluster.local:8000" - name: DRIFT_SCAN_NAMESPACES value: "awoooi-prod" resources: diff --git a/scripts/cron_km_vectorize.py b/scripts/cron_km_vectorize.py index 0dd7f4a1..69d11247 100644 --- a/scripts/cron_km_vectorize.py +++ b/scripts/cron_km_vectorize.py @@ -18,17 +18,21 @@ import httpx async def main() -> int: api_base = os.environ.get( "INTERNAL_API_URL", - "http://awoooi-api.awoooi-prod.svc.cluster.local:8000", + "http://awoooi-api-svc.awoooi-prod.svc.cluster.local:8000", ) url = f"{api_base}/api/v1/knowledge/embed-all" - async with httpx.AsyncClient(timeout=120) as client: + async with httpx.AsyncClient(timeout=1800) as client: try: resp = await client.post(url) print(f"embed-all: {resp.status_code} {resp.text[:200]}") if resp.status_code >= 400: print(f"ERROR: embed-all returned {resp.status_code}", file=sys.stderr) return 1 + result = resp.json() + if int(result.get("failed", 0)) > 0: + print(f"ERROR: embed-all failed rows: {result}", file=sys.stderr) + return 1 return 0 except httpx.RequestError as exc: print(f"ERROR: request failed — {exc}", file=sys.stderr) diff --git a/scripts/reboot-recovery/full-stack-cold-start-check.sh b/scripts/reboot-recovery/full-stack-cold-start-check.sh new file mode 100755 index 00000000..86f40f7e --- /dev/null +++ 
#!/usr/bin/env bash
# AWOOOI full-stack cold-start readiness check.
# Read-only by design. It never restarts, deletes, repairs, or writes remote state.
#
# This part of the script parses the command line and defines the shared
# helpers (result counters, SSH wrapper, HTTP/TCP probes) used by the gate
# checks that follow.

# No `-e`: individual probe failures are recorded through fail()/warn()
# instead of aborting the whole run.
set -uo pipefail

SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout=6)
SEND_ALERT_TEST=0

for arg in "$@"; do
  case "$arg" in
    --send-alert-test)
      SEND_ALERT_TEST=1
      ;;
    -h|--help)
      cat <<'USAGE'
Usage: bash scripts/reboot-recovery/full-stack-cold-start-check.sh [--send-alert-test]

Default mode is read-only and does not POST an Alertmanager test event.
Use --send-alert-test only after AWOOOI API is expected to be ready.
USAGE
      exit 0
      ;;
    *)
      echo "Unknown argument: $arg" >&2
      exit 64
      ;;
  esac
done

# ANSI colour escapes used by the result printers below.
RED=$'\033[0;31m'
GREEN=$'\033[0;32m'
YELLOW=$'\033[1;33m'
BLUE=$'\033[0;34m'
NC=$'\033[0m'

# Result counters, incremented by ok()/warn()/fail().
PASS=0
WARN=0
FAIL=0

# Print a coloured "=== <title> ===" banner for a gate section.
log_section() {
  printf "\n%s=== %s ===%s\n" "$BLUE" "$1" "$NC"
}

# Record and print a passing check.
ok() {
  printf "%sOK%s %s\n" "$GREEN" "$NC" "$1"
  PASS=$((PASS + 1))
}

# Record and print a non-blocking warning.
warn() {
  printf "%sWARN%s %s\n" "$YELLOW" "$NC" "$1"
  WARN=$((WARN + 1))
}

# Record and print a blocking failure (shown as BLOCKED).
fail() {
  printf "%sBLOCKED%s %s\n" "$RED" "$NC" "$1"
  FAIL=$((FAIL + 1))
}

# Run a local command, report it under <label>, then print its combined
# stdout/stderr.
#   $1      label for the ok/fail line
#   $2...   command and arguments
# Returns 0 when the command succeeded, 1 otherwise.
# Output is captured in memory rather than in a fixed /tmp file, so
# concurrent runs cannot clobber each other and nothing is left on disk.
run_local() {
  local label="$1"
  shift
  local captured status=0
  if captured=$("$@" 2>&1); then
    ok "$label"
  else
    status=1
    fail "$label"
  fi
  if [ -n "$captured" ]; then
    printf '%s\n' "$captured"
  fi
  return "$status"
}

# Run <cmd> on <user@host> over SSH with the shared SSH_OPTS.
#   $1  user@host
#   $2  remote shell snippet
# When REMOTE_SUDO_PASSWORD is set locally it is shell-quoted and prepended
# as a variable assignment so the remote snippet can read it.
# NOTE(review): the password then travels as part of the remote command
# string — confirm this exposure is acceptable on the target hosts.
ssh_cmd() {
  local user_host="$1"
  local cmd="$2"
  local prefix=""
  if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
    printf -v prefix 'REMOTE_SUDO_PASSWORD=%q ' "$REMOTE_SUDO_PASSWORD"
  fi
  ssh "${SSH_OPTS[@]}" "$user_host" "${prefix}${cmd}"
}

# Print the HTTP status code for <url>, or 000 when curl yields nothing.
probe_http_code() {
  local url="$1"
  local code
  code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "$url" 2>/dev/null || true)
  echo "${code:-000}"
}

# Succeed when <host>:<port> accepts a TCP connection within ~3 seconds.
# Tries `nc -G` first and falls back to `nc -w` for nc builds without -G.
probe_tcp() {
  local host="$1"
  local port="$2"
  nc -G 3 -z "$host" "$port" >/dev/null 2>&1 || nc -w 3 -z "$host" "$port" >/dev/null 2>&1
}

# Print the banner, the current timestamp, and the hosts in scope.
print_header() {
  echo "AWOOOI full-stack cold-start check"
  date '+%Y-%m-%d %H:%M:%S %Z'
  echo "Scope: 110 / 120 / 121 / 188. 112 Kali is intentionally skipped."
}

# Gate P0-NETWORK: ping and SSH-port reachability for the four hosts,
# followed by the matching local ARP rows for reference.
check_network() {
  log_section "P0-NETWORK"
  local host
  for host in 110 120 121 188; do
    if ping -c 1 -W 2 "192.168.0.$host" >/dev/null 2>&1; then
      ok "ping 192.168.0.$host"
    else
      fail "ping 192.168.0.$host"
    fi

    if probe_tcp "192.168.0.$host" 22; then
      ok "ssh port 192.168.0.$host:22"
    else
      fail "ssh port 192.168.0.$host:22"
    fi
  done

  arp -an | grep -E '192\.168\.0\.(110|120|121|188)' || warn "no ARP rows printed for one or more hosts"
}

# Gate P0-188-DATA: collect PostgreSQL, Redis, SignOz, momo and container
# status from host 188 in a single SSH round trip, then grade the output.
check_188() {
  log_section "P0-188-DATA"
  local out
  if ! out=$(ssh_cmd "ollama@192.168.0.188" '
echo "HOST $(hostname) $(uptime)"
echo "MEM $(free -h | awk "/Mem:/ {print \$2,\$3,\$7}")"
echo "SYSTEMD $(systemctl is-active containerd docker postgresql@14-main redis-server ollama nginx 2>/dev/null | tr "\n" " ")"
echo "PG $(pg_isready -h localhost -p 5432 2>&1)"
echo "REDIS $(redis-cli -p 6380 ping 2>/dev/null || redis-cli ping 2>/dev/null || true)"
echo "PORT5432 $(nc -z -w 2 127.0.0.1 5432 >/dev/null 2>&1 && echo OPEN || echo CLOSED)"
echo "SIGNOZ_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:3301/ || true)"
echo "MOMO_HEALTH_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:5003/health || true)"
docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -80
' 2>&1); then
    fail "ssh 188 read-only check"
    echo "$out"
    return
  fi
  echo "$out"

  grep -q "PORT5432 OPEN" <<<"$out" && ok "188 PostgreSQL port open" || fail "188 PostgreSQL port closed"
  grep -q "accepting connections" <<<"$out" && ok "188 PostgreSQL accepting connections" || fail "188 PostgreSQL not accepting connections"
  grep -q "REDIS PONG" <<<"$out" && ok "188 Redis PONG" || warn "188 Redis not confirmed"
  grep -q "momo-db.*Restarting" <<<"$out" && warn "188 momo-db restarting" || ok "188 momo-db not in visible restart loop"
  grep -Eq "SIGNOZ_CODE (200|302|307)" <<<"$out" && ok "188 SignOz HTTP reachable" || warn "188 SignOz HTTP not confirmed"
  grep -q "MOMO_HEALTH_CODE 200" <<<"$out" && ok "188 momo health reachable" || warn "188 momo health not confirmed"
}

# Gate P0-110-REGISTRY-OBSERVABILITY: Harbor, Gitea, Prometheus,
# Alertmanager and Sentry HTTP codes on host 110, plus the resource
# properties of any "actions.runner.*" systemd units.
check_110() {
  log_section "P0-110-REGISTRY-OBSERVABILITY"
  local out
  if ! out=$(ssh_cmd "wooo@192.168.0.110" '
echo "HOST $(hostname) $(uptime)"
echo "MEM $(free -h | awk "/Mem:/ {print \$2,\$3,\$7}")"
echo "DOCKER_SYSTEMD $(systemctl is-active docker 2>/dev/null || true)"
echo "HARBOR_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:5000/v2/ || true)"
echo "GITEA_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:3001/ || true)"
echo "PROM_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:9090/-/ready || true)"
echo "AM_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:9093/-/healthy || true)"
echo "SENTRY_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 8 http://127.0.0.1:9000/ || true)"
for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain 2>/dev/null | awk "{print \$1}"); do
  systemctl show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts | sed "s/^/RUNNER $u /"
done
docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -120
' 2>&1); then
    fail "ssh 110 read-only check"
    echo "$out"
    return
  fi
  echo "$out"

  grep -Eq "HARBOR_CODE (200|401)" <<<"$out" && ok "110 Harbor /v2 healthy code" || fail "110 Harbor not healthy"
  grep -Eq "GITEA_CODE (200|302)" <<<"$out" && ok "110 Gitea reachable" || warn "110 Gitea not confirmed"
  grep -q "PROM_CODE 200" <<<"$out" && ok "110 Prometheus ready" || warn "110 Prometheus not ready"
  grep -q "AM_CODE 200" <<<"$out" && ok "110 Alertmanager healthy" || warn "110 Alertmanager not healthy"
  grep -Eq "SENTRY_CODE (200|302|400)" <<<"$out" && ok "110 Sentry HTTP reachable" || warn "110 Sentry HTTP not confirmed"
  grep -q "WatchdogUSec=0" <<<"$out" && ok "runner watchdog disabled on at least one unit" || warn "runner watchdog state not confirmed"
  grep -q "sentry-self-hosted-clickhouse-1.*Restarting" <<<"$out" && warn "Sentry ClickHouse restarting" || ok "Sentry ClickHouse not visibly restarting"
}

# Gate P1-K3S: node and pod listing via kubectl on host 120, reachability
# of the 188 PostgreSQL port from 120, and presence of the keepalived VIP.
# Falls back to a local `kubectl get nodes` when the remote output shows no
# Ready node.
check_k3s() {
  log_section "P1-K3S"
  local out local_kubectl_out
  if ! out=$(ssh_cmd "wooo@192.168.0.120" '
echo "HOST $(hostname) $(uptime)"
echo "PG188_PORT $(nc -z -w 2 192.168.0.188 5432 >/dev/null 2>&1 && echo OPEN || echo CLOSED)"
echo "SYSTEMD $(systemctl is-active k3s k3s-agent keepalived 2>/dev/null | tr "\n" " ")"
kcmd() {
  if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
    printf "%s\n" "$REMOTE_SUDO_PASSWORD" | sudo -S -p "" kubectl "$@"
  else
    sudo -n kubectl "$@" 2>/dev/null || kubectl "$@"
  fi
}
kcmd get nodes -o wide 2>/dev/null || true
kcmd get pods -n awoooi-prod -o wide 2>/dev/null || true
ip addr show | grep 192.168.0.125 || true
' 2>&1); then
    fail "ssh 120 k3s read-only check"
    echo "$out"
    return
  fi
  echo "$out"

  if ! grep -q " Ready " <<<"$out"; then
    local_kubectl_out=$(kubectl get nodes -o wide 2>/dev/null || true)
    if [ -n "$local_kubectl_out" ]; then
      echo "LOCAL_KUBECTL_FALLBACK"
      echo "$local_kubectl_out"
    fi
  else
    local_kubectl_out=""
  fi

  grep -q "PG188_PORT OPEN" <<<"$out" && ok "120 can reach 188 PostgreSQL port" || fail "120 cannot reach 188 PostgreSQL"
  grep -q " Ready " <<<"$out$local_kubectl_out" && ok "K3s has Ready node output" || fail "K3s nodes not Ready or kubectl unavailable"
  grep -q "192.168.0.125" <<<"$out" && ok "VIP 192.168.0.125 present on 120" || warn "VIP not confirmed on 120"
}

# Gate P2-WORKLOAD-ALERTCHAIN: AWOOOI API/Web health through the VIP
# (probed from host 120, or locally when SSH to 120 fails) and, only with
# --send-alert-test, one Alertmanager-style webhook POST.
check_workload_and_alertchain() {
  log_section "P2-WORKLOAD-ALERTCHAIN"
  local api_code web_code alert_code
  local out
  if out=$(ssh_cmd "wooo@192.168.0.120" '
api_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://192.168.0.125:32334/api/v1/health 2>/dev/null || true)
web_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://192.168.0.125:32335/ 2>/dev/null || true)
echo "API_CODE ${api_code:-000}"
echo "WEB_CODE ${web_code:-000}"
' 2>/dev/null); then
    api_code=$(awk '/^API_CODE / {print $2}' <<<"$out")
    web_code=$(awk '/^WEB_CODE / {print $2}' <<<"$out")
  else
    api_code=$(probe_http_code "http://192.168.0.125:32334/api/v1/health")
    web_code=$(probe_http_code "http://192.168.0.125:32335/")
    out="API_CODE $api_code
WEB_CODE $web_code"
  fi

  echo "$out"

  [[ "$api_code" =~ ^[23] ]] && ok "AWOOOI API reachable" || fail "AWOOOI API not reachable"
  [[ "$web_code" =~ ^[23] ]] && ok "AWOOOI Web reachable" || warn "AWOOOI Web not confirmed"

  if [ "$SEND_ALERT_TEST" -eq 1 ]; then
    # curl already prints 000 via -w when the request fails, so the remote
    # side only swallows the exit status; an empty result (ssh itself
    # failed) is normalised to 000 locally.
    alert_code=$(ssh_cmd "wooo@192.168.0.120" 'curl -s -o /tmp/awoooi-alertchain.out -w "%{http_code}" --max-time 8 \
  -X POST "http://192.168.0.125:32334/api/v1/webhooks/alertmanager" \
  -H '"'"'Content-Type: application/json'"'"' \
  -d '"'"'{"receiver":"cold-start-check","status":"firing","alerts":[{"status":"firing","labels":{"alertname":"ColdStartCheck","severity":"info"},"annotations":{"summary":"Cold start check"},"startsAt":"2026-05-05T11:00:00Z","endsAt":"0001-01-01T00:00:00Z","generatorURL":""}],"groupLabels":{},"commonLabels":{},"commonAnnotations":{},"externalURL":"","version":"4","groupKey":"cold-start-check"}'"'"' 2>/dev/null || true')
    alert_code="${alert_code:-000}"
    echo "ALERTCHAIN_CODE $alert_code"
    [[ "$alert_code" =~ ^2 ]] && ok "Alertmanager webhook endpoint accepts POST" || warn "Alertmanager webhook E2E not confirmed"
  else
    warn "Alertmanager webhook POST skipped; rerun with --send-alert-test after API is ready"
  fi
}

# Gate P2-PUBLIC-ROUTES: HTTP status of the public AWOOOI and momo routes,
# probed from the machine running this script.
check_public_routes() {
  log_section "P2-PUBLIC-ROUTES"
  local awoooi_api_code awoooi_web_code momo_code momo_health_code
  awoooi_api_code=$(probe_http_code "https://awoooi.wooo.work/api/v1/health")
  awoooi_web_code=$(probe_http_code "https://awoooi.wooo.work/")
  momo_code=$(probe_http_code "https://mo.wooo.work/")
  momo_health_code=$(probe_http_code "https://mo.wooo.work/health")

  echo "AWOOOI_PUBLIC_API_CODE $awoooi_api_code"
  echo "AWOOOI_PUBLIC_WEB_CODE $awoooi_web_code"
  echo "MOMO_PUBLIC_CODE $momo_code"
  echo "MOMO_PUBLIC_HEALTH_CODE $momo_health_code"

  [[ "$awoooi_api_code" =~ ^[23] ]] && ok "AWOOOI public API reachable" || warn "AWOOOI public API not confirmed"
  [[ "$awoooi_web_code" =~ ^[23] ]] && ok "AWOOOI public web reachable" || warn "AWOOOI public web not confirmed"
  [[ "$momo_code" =~ ^[23] ]] && ok "momo public route reachable" || warn "momo public route not confirmed"
  [[ "$momo_health_code" =~ ^[23] ]] && ok "momo public health reachable" || warn "momo public health not confirmed"
}

# Read report lines on stdin; succeed only when a
# "<tag> <file> age=<seconds>" row exists AND seconds < limit.
#   $1 tag (e.g. TEXTFILE_188)   $2 file basename   $3 limit in seconds
# A missing row ("... missing", or no output at all) is a failure, so an
# absent exporter file can never be graded as fresh.
_textfile_fresh() {
  awk -v tag="$1" -v file="$2" -v limit="$3" '
    $1 == tag && $2 == file && $3 ~ /^age=-?[0-9]+$/ {
      seen = 1
      fresh = (substr($3, 5) + 0 < limit + 0)
    }
    END { exit !(seen && fresh) }
  '
}

# Read report lines on stdin; succeed only when a "<key> <integer>" row
# exists AND the integer satisfies the comparison.
#   $1 key   $2 operator: lt | gt | ge   $3 bound
# A missing row (for example kubectl or python failed remotely) is a failure.
_metric_passes() {
  awk -v key="$1" -v op="$2" -v bound="$3" '
    $1 == key && $2 ~ /^-?[0-9]+$/ {
      seen = 1
      value = $2 + 0
      if (op == "lt") good = (value < bound + 0)
      else if (op == "gt") good = (value > bound + 0)
      else if (op == "ge") good = (value >= bound + 0)
      else good = 0
    }
    END { exit !(seen && good) }
  '
}

# Gate P2-SCHEDULES: cron state and exporter textfile freshness on 188 and
# 110, CronJob/Job/pod state via kubectl on 120, and cron plus the DR drill
# crontab entry on 121. Everything here is graded as a warning, never as
# BLOCKED.
check_schedules() {
  log_section "P2-SCHEDULES"
  local out

  if out=$(ssh_cmd "ollama@192.168.0.188" '
now=$(date +%s)
echo "CRON_188 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
for f in /home/ollama/node_exporter_textfiles/backup.prom /home/ollama/node_exporter_textfiles/docker_restart_count.prom /home/ollama/node_exporter_textfiles/docker_stats.prom; do
  if [ -f "$f" ]; then
    mt=$(stat -c %Y "$f")
    echo "TEXTFILE_188 $(basename "$f") age=$((now - mt))"
  else
    echo "TEXTFILE_188 $(basename "$f") missing"
  fi
done
if [ -f /home/ollama/node_exporter_textfiles/backup.prom ]; then
  awk -v now="$now" "/^backup_110_last_success_timestamp / {printf \"BACKUP_110_AGE %d\\n\", now - int(\$2)}" /home/ollama/node_exporter_textfiles/backup.prom
fi
echo "SCHEDULER_REGISTERED $(docker logs --tail 200 momo-scheduler 2>&1 | grep -c "全部排程任務已註冊" || true)"
' 2>&1); then
    echo "$out"
    grep -q "CRON_188 active" <<<"$out" && ok "188 cron active" || warn "188 cron not confirmed"
    # 90000 s = 25 h; 300 s = 5 min.
    _textfile_fresh TEXTFILE_188 backup.prom 90000 <<<"$out" && ok "188 backup textfile fresh enough" || warn "188 backup textfile stale or missing"
    _textfile_fresh TEXTFILE_188 docker_restart_count.prom 300 <<<"$out" && ok "188 docker restart exporter fresh" || warn "188 docker restart exporter stale"
    _textfile_fresh TEXTFILE_188 docker_stats.prom 300 <<<"$out" && ok "188 docker stats exporter fresh" || warn "188 docker stats exporter stale"
    _metric_passes BACKUP_110_AGE lt 90000 <<<"$out" && ok "188 backup-from-110 success within 25h" || warn "188 backup-from-110 success not confirmed"
    _metric_passes SCHEDULER_REGISTERED gt 0 <<<"$out" && ok "188 momo scheduler registered jobs" || warn "188 momo scheduler registration not confirmed"
  else
    warn "188 schedule check unavailable"
    echo "$out"
  fi

  if out=$(ssh_cmd "wooo@192.168.0.110" '
now=$(date +%s)
echo "CRON_110 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
echo "FAILED_UNITS_110 $(systemctl --failed --no-legend --plain 2>/dev/null | wc -l)"
echo "MOMO_STARTUP_ENABLED $(systemctl is-enabled momo-startup-complete.service 2>/dev/null || true)"
echo "STAGGERED_STARTUP_ENABLED $(systemctl is-enabled wooo-staggered-startup.service 2>/dev/null || true)"
for f in /home/wooo/node_exporter_textfiles/docker_stats.prom /home/wooo/node_exporter_textfiles/systemd_units.prom; do
  if [ -f "$f" ]; then
    mt=$(stat -c %Y "$f")
    echo "TEXTFILE_110 $(basename "$f") age=$((now - mt))"
  else
    echo "TEXTFILE_110 $(basename "$f") missing"
  fi
done
' 2>&1); then
    echo "$out"
    grep -q "CRON_110 active" <<<"$out" && ok "110 cron active" || warn "110 cron not confirmed"
    grep -q "FAILED_UNITS_110 0" <<<"$out" && ok "110 systemd has no failed units" || warn "110 systemd failed units remain"
    grep -q "MOMO_STARTUP_ENABLED disabled" <<<"$out" && ok "110 stale momo startup unit disabled" || warn "110 stale momo startup unit not disabled"
    grep -q "STAGGERED_STARTUP_ENABLED disabled" <<<"$out" && ok "110 stale staggered startup unit disabled" || warn "110 stale staggered startup unit not disabled"
    _textfile_fresh TEXTFILE_110 docker_stats.prom 300 <<<"$out" && ok "110 docker stats exporter fresh" || warn "110 docker stats exporter stale"
    _textfile_fresh TEXTFILE_110 systemd_units.prom 300 <<<"$out" && ok "110 systemd units exporter fresh" || warn "110 systemd units exporter stale"
  else
    warn "110 schedule check unavailable"
    echo "$out"
  fi

  if out=$(ssh_cmd "wooo@192.168.0.120" '
kcmd() {
  if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
    printf "%s\n" "$REMOTE_SUDO_PASSWORD" | sudo -S -p "" kubectl "$@"
  else
    sudo -n kubectl "$@" 2>/dev/null || kubectl "$@"
  fi
}
echo "CRON_120 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
kcmd get cronjobs -n awoooi-prod -o json | python3 -c "import json,sys; d=json.load(sys.stdin); items=d.get(\"items\", []); print(\"CRONJOB_COUNT\", len(items)); print(\"CRONJOB_SUSPENDED\", sum(1 for i in items if i.get(\"spec\",{}).get(\"suspend\")))"
kcmd get jobs -n awoooi-prod -o json | python3 -c "import json,sys; d=json.load(sys.stdin); failed=0
for j in d.get(\"items\", []):
    if any(c.get(\"type\")==\"Failed\" and c.get(\"status\")==\"True\" for c in j.get(\"status\",{}).get(\"conditions\",[]) or []):
        failed += 1
print(\"FAILED_JOBS\", failed)"
kcmd get pods -n awoooi-prod --no-headers 2>/dev/null | awk "\$3 !~ /^(Running|Completed)$/ {bad++} END {print \"BAD_PODS\", bad+0}"
' 2>&1); then
    echo "$out"
    grep -q "CRON_120 active" <<<"$out" && ok "120 cron active" || warn "120 cron not confirmed"
    _metric_passes CRONJOB_COUNT ge 4 <<<"$out" && ok "K8s AWOOOI CronJobs present" || warn "K8s AWOOOI CronJobs missing"
    grep -q "CRONJOB_SUSPENDED 0" <<<"$out" && ok "K8s AWOOOI CronJobs unsuspended" || warn "K8s AWOOOI CronJob suspended"
    grep -q "FAILED_JOBS 0" <<<"$out" && ok "K8s AWOOOI has no failed Jobs" || warn "K8s AWOOOI failed Jobs remain"
    grep -q "BAD_PODS 0" <<<"$out" && ok "K8s AWOOOI pods Running/Completed only" || warn "K8s AWOOOI bad pod status remains"
  else
    warn "120 K8s schedule check unavailable"
    echo "$out"
  fi

  if out=$(ssh_cmd "wooo@192.168.0.121" '
echo "CRON_121 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
crontab -l 2>/dev/null | grep -q "dr-drill.sh" && echo "DR_DRILL_CRON present" || echo "DR_DRILL_CRON missing"
' 2>&1); then
    echo "$out"
    grep -q "CRON_121 active" <<<"$out" && ok "121 cron active" || warn "121 cron not confirmed"
    grep -q "DR_DRILL_CRON present" <<<"$out" && ok "121 DR drill cron present" || warn "121 DR drill cron missing"
  else
    warn "121 schedule check unavailable"
    echo "$out"
  fi
}

# Print the counters and exit with the overall verdict:
#   2 = at least one BLOCKED gate, 1 = warnings only, 0 (fall-through) = green.
summary() {
  log_section "SUMMARY"
  echo "PASS=$PASS WARN=$WARN BLOCKED=$FAIL"
  if [ "$FAIL" -gt 0 ]; then
    echo "Result: BLOCKED. Fix the first blocked gate before releasing runner/CD/AI auto-remediation."
    exit 2
  fi
  if [ "$WARN" -gt 0 ]; then
    echo "Result: DEGRADED. Core gates passed but warnings remain."
    exit 1
  fi
  echo "Result: GREEN. Full stack is ready for controlled runner/CD release."
}

# Gates run in dependency order: network, data host, registry and
# observability host, cluster, workloads, public routes, schedules.
print_header
check_network
check_188
check_110
check_k3s
check_workload_and_alertchain
check_public_routes
check_schedules
summary