fix(ops): harden cold-start schedule recovery

This commit is contained in:
Your Name
2026-05-05 22:14:54 +08:00
parent 10cd9fc025
commit 894174da5b
13 changed files with 1073 additions and 43 deletions

View File

@@ -108,7 +108,9 @@ jobs:
mkdir -p ~/.ssh
echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
chmod 600 ~/.ssh/deploy_key
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 << SECRETS
# 2026-05-05 Codex: kubectl runs on 120 control-plane. 121 is a
# worker and its local kubeconfig points at 127.0.0.1:6443.
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 << SECRETS
set -e
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
@@ -138,10 +140,10 @@ jobs:
SSH_PRIVATE_KEY: ${{ secrets.DEPLOY_SSH_KEY }}
run: |
cat k8s/awoooi-dev/02-configmap.yaml | \
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 \
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 \
"export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -"
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 << 'DEPLOY'
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 << 'DEPLOY'
set -e
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml

View File

@@ -406,8 +406,11 @@ jobs:
mkdir -p ~/.ssh
echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
chmod 600 ~/.ssh/deploy_key
ssh-keyscan 192.168.0.121 >> ~/.ssh/known_hosts 2>/dev/null
ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 << SECRETS
# 2026-05-05 Codex: kubectl must run on the 120 control-plane.
# 121 is a worker after cold-start recovery; its kubeconfig points at
# 127.0.0.1:6443 and fails ADR-035 secret patching.
ssh-keyscan 192.168.0.120 >> ~/.ssh/known_hosts 2>/dev/null
ssh -i ~/.ssh/deploy_key wooo@192.168.0.120 << SECRETS
set -e
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
@@ -634,19 +637,21 @@ jobs:
mkdir -p ~/.ssh
echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
chmod 600 ~/.ssh/deploy_key
ssh-keyscan 192.168.0.121 >> ~/.ssh/known_hosts 2>/dev/null
# 2026-05-05 Codex: deploy-side kubectl/ArgoCD operations run on 120
# control-plane, not 121 worker.
ssh-keyscan 192.168.0.120 >> ~/.ssh/known_hosts 2>/dev/null
IMAGE_TAG="${{ github.sha }}"
HARBOR=192.168.0.110:5000
# ─── Step 1: Apply ConfigMap + ServiceRegistry (ArgoCD 管的是 Deployment;ConfigMap 仍直接 apply) ───
cat k8s/awoooi-prod/04-configmap.yaml | \
ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 \
ssh -i ~/.ssh/deploy_key wooo@192.168.0.120 \
"export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -"
echo "✅ ConfigMap 已更新"
cat k8s/awoooi-prod/15-service-registry-configmap.yaml | \
ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 \
ssh -i ~/.ssh/deploy_key wooo@192.168.0.120 \
"export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -"
echo "✅ Service Registry ConfigMap 已更新"
@@ -688,7 +693,7 @@ jobs:
}
# ─── Step 4: 等待 ArgoCD sync + rollout ───
ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 \
ssh -i ~/.ssh/deploy_key wooo@192.168.0.120 \
"EXPECTED_REVISION='${DEPLOY_REVISION}' bash -s" << 'ARGOCD_WAIT'
set -e
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
@@ -814,7 +819,7 @@ jobs:
- name: Alert Chain Smoke Test
id: alert_chain_smoke
run: |
# 2026-04-05 Claude Code: 使用真實 API 地址192.168.0.121:32334 NodePort
# 2026-05-05 Codex: use the keepalived VIP instead of a fixed node.
# Host runner launches the CI image explicitly to avoid act RWLayer=nil.
if docker run --rm \
--name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-alert-smoke" \
@@ -824,7 +829,7 @@ jobs:
-v awoooi-api-venv-cache:/opt/api-venv \
-w /workspace \
"${{ env.CI_IMAGE }}" \
bash -lc 'source /opt/api-venv/bin/activate && python3 scripts/alert_chain_smoke_test.py --api-url http://192.168.0.121:32334 --json | tee /tmp/alert_chain_result.json'; then
bash -lc 'source /opt/api-venv/bin/activate && python3 scripts/alert_chain_smoke_test.py --api-url http://192.168.0.125:32334 --json | tee /tmp/alert_chain_result.json'; then
echo "alert_chain_status=pass" >> $GITHUB_OUTPUT
else
echo "alert_chain_status=fail" >> $GITHUB_OUTPUT

View File

@@ -6,8 +6,9 @@
-- bge-m3 產生 1024 維向量,現有 schema vector(768) 不相容,INSERT 會直接失敗
--
-- 影響範圍:
-- 1. rag_chunks.embedding vector(768) → vector(1024)
-- 2. playbook_embeddings.embedding vector(768) → vector(1024)
-- 1. knowledge_entries.embedding vector(768) → vector(1024)
-- 2. rag_chunks.embedding vector(768) → vector(1024)
-- 3. playbook_embeddings.embedding vector(768) → vector(1024)
--
-- 遷移策略:清空現有向量資料,切換維度後由 re-embed script 重新嵌入
-- 現有向量資料若要保留,需先 dump 用 nomic 格式備份(舊維度無法轉換)
@@ -21,7 +22,24 @@
BEGIN;
-- 1. rag_chunks清空向量資料,變更欄位維度
-- 1. knowledge_entries備份舊向量並清空,變更欄位維度
-- Snapshot the current knowledge_entries embeddings before they are discarded
-- by the ALTER below. The vector is cast to text (embedding::text AS
-- embedding_768), so the backup table holds a plain text copy rather than a
-- vector(768) column. Only rows that actually have an embedding are copied.
-- IF NOT EXISTS: if the backup table is already present, this statement is
-- skipped with a notice and the existing snapshot is left unmodified.
CREATE TABLE IF NOT EXISTS knowledge_entries_embedding_backup_20260505 AS
SELECT
id,
embedding::text AS embedding_768,
NOW() AS backed_up_at
FROM knowledge_entries
WHERE embedding IS NOT NULL;
-- Change the column type to vector(1024). USING NULL rewrites every row's
-- embedding to NULL; the rows themselves are kept and must be re-embedded.
-- NOTE(review): on a second run the backup above is skipped while this ALTER
-- would null the column again, wiping freshly re-embedded 1024-dim vectors —
-- confirm the migration runner applies this file exactly once.
-- NOTE(review): the rag_chunks step in this file drops its ivfflat index before
-- ALTER COLUMN; no index is dropped here — confirm knowledge_entries has no
-- vector index on embedding.
ALTER TABLE knowledge_entries
ALTER COLUMN embedding TYPE vector(1024)
USING NULL; -- clear the existing 768-dim vectors (dimensions cannot be converted)
-- Record the embedding model and dimension on the column itself.
COMMENT ON COLUMN knowledge_entries.embedding IS
'bge-m3:latest 1024 維向量 — 遷移自 nomic-embed-text 768 維 (2026-05-05 ADR-110 follow-up)';
-- 2. rag_chunks清空向量資料變更欄位維度
-- ivfflat index 必須先 DROP 才能 ALTER COLUMN
DROP INDEX IF EXISTS idx_rag_chunks_embedding;
@@ -39,7 +57,7 @@ COMMENT ON COLUMN rag_chunks.embedding IS
'bge-m3:latest 1024 維向量 — 遷移自 nomic-embed-text 768 維 (2026-05-04 ADR-110)';
-- 2. playbook_embeddings清空向量資料變更欄位維度
-- 3. playbook_embeddings清空向量資料變更欄位維度
DROP INDEX IF EXISTS ix_playbook_embeddings_vec;
ALTER TABLE playbook_embeddings
@@ -61,9 +79,15 @@ COMMENT ON TABLE playbook_embeddings IS
-- 4. 驗證遷移結果
DO $$
DECLARE
v_km_dim integer;
v_rag_dim integer;
v_pb_dim integer;
BEGIN
SELECT atttypmod INTO v_km_dim
FROM pg_attribute
JOIN pg_class ON attrelid = pg_class.oid
WHERE relname = 'knowledge_entries' AND attname = 'embedding';
SELECT atttypmod INTO v_rag_dim
FROM pg_attribute
JOIN pg_class ON attrelid = pg_class.oid
@@ -74,15 +98,18 @@ BEGIN
JOIN pg_class ON attrelid = pg_class.oid
WHERE relname = 'playbook_embeddings' AND attname = 'embedding';
-- atttypmod for vector(1024) = 1024 + 1 = 1025
IF v_rag_dim != 1025 THEN
RAISE EXCEPTION 'rag_chunks.embedding 維度驗證失敗expected 1025, got %', v_rag_dim;
-- pgvector atttypmod stores the configured dimension.
IF v_km_dim != 1024 THEN
RAISE EXCEPTION 'knowledge_entries.embedding 維度驗證失敗expected 1024, got %', v_km_dim;
END IF;
IF v_pb_dim != 1025 THEN
RAISE EXCEPTION 'playbook_embeddings.embedding 維度驗證失敗expected 1025, got %', v_pb_dim;
IF v_rag_dim != 1024 THEN
RAISE EXCEPTION 'rag_chunks.embedding 維度驗證失敗expected 1024, got %', v_rag_dim;
END IF;
IF v_pb_dim != 1024 THEN
RAISE EXCEPTION 'playbook_embeddings.embedding 維度驗證失敗expected 1024, got %', v_pb_dim;
END IF;
RAISE NOTICE '✅ embedding 遷移驗證通過rag_chunksplaybook_embeddings 均為 vector(1024)';
RAISE NOTICE '✅ embedding 遷移驗證通過:knowledge_entries、rag_chunksplaybook_embeddings 均為 vector(1024)';
END $$;
COMMIT;

View File

@@ -34,8 +34,12 @@ logger = structlog.get_logger(__name__)
# 台北時區
TZ_TAIPEI = ZoneInfo("Asia/Taipei")
# Prometheus 端點
PROMETHEUS_URL = "http://192.168.0.121:30090"
# Prometheus endpoint.
#
# 2026-05-05 Codex: do not pin this report job to a K3s worker NodePort.
# Production already injects PROMETHEUS_URL from ConfigMap, currently the
# Docker Prometheus on 110. This keeps reboot recovery independent of 121.
PROMETHEUS_URL = settings.PROMETHEUS_URL.rstrip("/")
# kube-state-metrics 查詢
PROM_QUERIES = {
@@ -215,7 +219,7 @@ class K3sMonitorService:
# 發送訊息
formatted = status.format()
result = await gateway.send_message(formatted)
result = await gateway.send_text(formatted)
if result:
logger.info("k3s_daily_report_sent", date=status.report_date)

View File

@@ -244,7 +244,7 @@ class WeeklyReportService:
# 發送訊息
formatted = report.format()
result = await gateway.send_message(formatted)
result = await gateway.send_text(formatted)
if result:
logger.info("weekly_report_sent", week=report.week_range)

View File

@@ -6,6 +6,38 @@
---
## 2026-05-05 | 重開機後排程與 startup baseline 修復
**背景**:四台主機非預期重開機後,統帥要求確認所有服務、網站、工具、資料庫與排程都能正常恢復,不能只看容器 `healthy`
**本次排程/啟動鏈修補**
- 120/121 K3s 回到 Ready;CD workflow 目標從 121 改為 120,避免 121 worker kubeconfig `127.0.0.1:6443` 造成 Secrets patch 失敗;120 已驗證 limited sudo kubectl 可用。
- K8s CronJob 修正:`k3s-status-report`、`weekly-report`、`km-vectorize` 改用存在的 service account、live API image、cluster service DNS;手動 job 驗證 drift/k3s/weekly 可完成,歷史 failed jobs 已清掉。
- KM embedding schema 從 768/錯誤 typmod 修為 `vector(1024)`;原 embedding 已備份到 `knowledge_entries_embedding_backup_20260505`,正在以 `bge-m3:latest` 重建。
- 188 momo backup script 修正 quote/validation/Telegram optional/error cleanup;成功產出 `/home/ollama/momo_backups/momo_analytics_20260505_212032.sql.gz`。
- 188 `backup-from-110.sh` 因 SSH config 權限錯誤導致 `HostBackupFailed`;修正 `.ssh/config` 權限與 110 identity 設定後以低優先權手動備份成功,Prometheus `backup_110_last_success_timestamp` 已更新。
- 188 momo-scheduler 修正 dashboard URL:容器內改打 `http://momo-pro-system`,不再打 `127.0.0.1:5000`。
- 188 Google Drive token 從 legacy pickle 轉為 JSON;scheduler 容器內 `GoogleDriveService().authenticate()` 通過。
- 188 daily sales import 修正 Excel sheet 選擇,優先讀 `即時業績明細`;手動匯入成功 `19934` 筆,日期 `2026-04-01 ~ 2026-05-03`
- 188 import 尾端驗證修正:改比對本次匯入日期範圍,不再用全表筆數硬比;`daily_sales_snapshot``realtime_sales_monthly` 在該日期範圍皆 `19934` 筆且驗證通過。
- 110 startup 修復:移除 `/etc/sysctl.conf` 中誤寫的非法敏感純文字行;`systemd-sysctl` 恢復成功。
- 110 停用兩個過期 startup units:`momo-startup-complete.service`(指向不存在路徑/錯 host)、`wooo-staggered-startup.service`(舊 GitLab 延遲啟動且會增加重開機負載)。
- 110 `awoooi-startup-110.service` timeout 從 5 分鐘延長到 15 分鐘,重跑後 `ActiveState=active`、`SubState=exited`、`Result=success`,`systemctl --failed` 為 0。
- 110 certbot timer 失敗追查:`grist.wooo.work` / `registry.wooo.work` public route 目前被導向 `aiops.wooo.work`,HTTP-01 無法從 110 成功;已將兩個 stale renewal config 移至 `/etc/letsencrypt/renewal-disabled-codex-*`,並 reset certbot failed state。憑證 archive 未刪除;後續需修 public route 或改 DNS-01。
- `scripts/reboot-recovery/full-stack-cold-start-check.sh` 新增 `P2-SCHEDULES`,覆蓋 188/110/120/121 cron、textfile mtime、188 backup freshness、110 failed units、K8s CronJob/Job/Pod 狀態、121 DR drill cron。
- `docs/runbooks/FULL-STACK-COLD-START-SOP.md` 新增排程驗證章節與 done criteria要求排程真正可執行才算 reboot recovery 完成。
**最終驗證**
- KM reembed 完成:`1774/1774` success、`0` failed;DB 目前 `knowledge_entries` total `1785`、embedded `1776`、vector dims `1024..1024`,舊 embedding backup `1691` rows。
- 手動 `km-vectorize` CronJob `km-vectorize-codex-220715` 完成,回 `embed-all: 200 {"total":0,"success":0,"failed":0}`。
- `bash scripts/reboot-recovery/full-stack-cold-start-check.sh --send-alert-test`:`PASS=50 WARN=0 BLOCKED=0`,包含 Alertmanager webhook E2E、public routes、cron/CronJob/textfile/systemd schedule checks。
- Prometheus firing alerts 已從 `HostBackupFailed + FlywheelExecutionRateMissing` 收斂為僅剩 `FlywheelExecutionRateMissing`,HostBackupFailed 解除。
- 188/110 負載回到低檔;K3s node CPU 約 3-6%,KM reembed 未造成主機過載。
**下一步**
- 將本次 runtime hotfix 對應的 repo changes 走正式 deploy避免下一版 image 覆蓋 hotfix。
- 修 `grist.wooo.work` / `registry.wooo.work` public route 或改 DNS-01 renewal;目前舊 renewal config 已停用以避免 certbot timer 每次失敗。
## 2026-05-05 | 110 Sentry resource limits persistence gap closed
**背景**110 guardrail 告警已清,但主機 load 仍有長尾;統帥擔心 Claude Code 只做 live `docker update`,重建後配置又失效。
@@ -3066,3 +3098,42 @@ C1evolver 加 YAML_RULE guard+ C2seeder SQL `AND status != 'deprecated'
```bash
psql $DATABASE_URL -f apps/api/migrations/cleanup_duplicate_deprecated_playbooks.sql
```
---
## 2026-05-05(台北)— 四主機重開機後全站冷啟動救援
**觸發**:110 / 120 / 121 / 188 同時重開機後,多數服務異常;統帥要求先恢復所有網站、主機、核心服務,並建立完整冷啟動 SOP。
### 已恢復
| 範圍 | 結果 |
|------|------|
| 188 host PostgreSQL | WAL checkpoint 損壞;已備份後 `pg_resetwal`,`k3s_datastore` `REINDEX` + `VACUUM ANALYZE` 完成 |
| K3s datastore | 刪除並備份可重建的腐壞 HPA / VPA / VPA checkpoint / `mon1` node rows;120 / 121 重新 Ready |
| AWOOI prod | `awoooi-api` / `awoooi-web` / `awoooi-worker` Running;VIP `192.168.0.125` 內網驗證 API 200 / Web 307 |
| mo.wooo.work | `momo-db` WAL redo 損壞;備份後 `pg_resetwal`;`momo-pro-system` / scheduler / bot / DB 全部 healthy;公網 `/` 200、`/health` 200 |
| 110 host overload | actions runner units 維持最後放行;Sentry ClickHouse/Kafka 已從 dirty-reboot 損壞中恢復,Sentry stack healthy |
| 188 SignOz | SignOz ClickHouse volume 出現 filesystem corruption;已 clean-clone 可讀資料並保留原始 corrupt volume,SignOz HTTP 恢復 |
| 冷啟動 SOP | 新增 `docs/runbooks/FULL-STACK-COLD-START-SOP.md` 與 `scripts/reboot-recovery/full-stack-cold-start-check.sh` |
### 驗證
```bash
bash scripts/reboot-recovery/full-stack-cold-start-check.sh --send-alert-test
# PASS=31 WARN=0 BLOCKED=0
# Result: GREEN. Full stack is ready for controlled runner/CD release.
```
### Dirty reboot 資料保全
- 110 Sentry ClickHouse:原始壞 volume 保留為 `/var/lib/docker/volumes/sentry-clickhouse/_data.corrupt-20260505-203346`;以 clean-clone 恢復可讀資料並加 `force_restore_data`。
- 110 Sentry Kafka:malformed checkpoint 已備份至 `/var/backups/sentry-kafka-checkpoints-20260505-203942`,只重建 checkpoint,不刪 topic/log data。
- 188 SignOz ClickHouse:原始壞 volume 保留為 `/var/lib/docker/volumes/signoz-clickhouse/_data.corrupt-20260505-203735`;以 clean-clone 恢復可讀資料。
- 188 `momo-db`:WAL reset 前備份 `/var/backups/postgresql/momo-db-before-pg-resetwal-20260505-200834.tgz`。
### 已知隔離 / 後續
- 110 actions runner units 仍按策略最後放行;guardrail 已套用:`CPUQuota=200%`、`MemoryMax=2G`、`WatchdogUSec=0`;需在 load/core 穩定後逐步開啟。
- `Bad message` / `Structure needs cleaning` 是 host filesystem 層訊號;線上 clean-clone 已恢復服務,但完整歷史資料追溯需安排離線 `fsck` 或備份驗證。
- `drift-scanner-29633040-qrf8w` 為單次 CronJob Error,不阻斷主服務;後續可清理或調查。

View File

@@ -0,0 +1,497 @@
# AWOOOI Full-Stack Cold Start SOP
> Version: v1.0
> Last updated: 2026-05-05 Asia/Taipei
> Scope: 110 / 120 / 121 / 188 full-stack reboot recovery. 112 Kali is recorded as P3 optional and is not part of this recovery path.
---
## 0. When To Use This
Use this SOP when any of these happen:
- 110/120/121/188 reboot unexpectedly.
- All services are abnormal after a power/network event.
- K3s is stuck `activating`.
- Host load remains high during startup and service health is mixed.
- Monitoring, alerting, CD, AI auto-repair, and Docker Compose services disagree about the real state.
The rule is simple: **recover the dependency chain, not the loudest symptom.**
---
## 1. Golden Startup Order
```text
0. Freeze automation and preserve evidence
1. Physical/network layer
2. 188 data layer
3. 110 registry/observability layer
4. 120/121 K3s layer
5. AWOOOI workload layer
6. Public routes and alert chain
7. High-load batch/consumer/crawler services
8. Runner/CD
9. AI auto-remediation
10. 112 Kali scanner, if needed
```
Never start runner/CD before 188 PostgreSQL, 110 Harbor, K3s nodes, and AWOOOI API are healthy.
---
## 2. Automation Freeze
Cold start creates noisy metrics and partial failures. During P0/P1, keep automation in observe-only mode.
| Item | Cold-start policy | Reason |
|------|-------------------|--------|
| Gitea/GitHub runners | Last | Build jobs can saturate 110 CPU/RAM. |
| momo-scheduler / crawlers | Last | Chrome and batch work can saturate 188. |
| Sentry/Snuba consumers | Controlled | Kafka backlog and ClickHouse merge can create temporary high load. |
| Alertmanager outbound notification | Gate | Avoid alert storms before API webhook and Telegram are verified. |
| AI auto-repair | Observe-only | Metrics, Redis, KM, and playbooks may be incomplete. |
| Stateful DB restart | Human approval | PostgreSQL, Redis, ClickHouse, Harbor DB, Sentry DB are not generic restart targets. |
---
## 3. P0 Evidence And Network
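Run from any machine on the same LAN. The `nc -G 3` connect-timeout flag below is the macOS/BSD form; on Linux netcat use `nc -w 3` instead: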
```bash
for h in 110 120 121 188; do
ping -c 2 -W 2 192.168.0.$h >/dev/null && echo "PING_OK 192.168.0.$h" || echo "PING_FAIL 192.168.0.$h"
done
arp -an | grep -E '192\.168\.0\.(110|120|121|188)'
for h in 110 120 121 188; do
nc -G 3 -z 192.168.0.$h 22 && echo "SSH_OK 192.168.0.$h" || echo "SSH_FAIL 192.168.0.$h"
done
```
Then capture reboot evidence:
```bash
ssh ollama@192.168.0.188 'hostname; date; uptime; who -b; last -x reboot shutdown | head -20'
ssh wooo@192.168.0.110 'hostname; date; uptime; who -b; last -x reboot shutdown | head -20'
ssh wooo@192.168.0.120 'hostname; date; uptime; who -b; last -x reboot shutdown | head -20'
ssh wooo@192.168.0.121 'hostname; date; uptime; who -b; last -x reboot shutdown | head -20'
```
If any host has ARP `incomplete` or SSH port down, stop here and fix physical/network first.
---
## 4. P0 188 Data Layer
188 is the first real service dependency because K3s datastore and AWOOOI DB depend on PostgreSQL.
### 4.1 Startup order
1. `containerd`
2. `docker`
3. `postgresql@14-main`
4. `k3s_datastore.kine` maintenance
5. `redis-server` on `6380`
6. `ollama` or current AI proxy dependencies
7. `nginx`
8. Docker networks
9. MinIO / OpenClaw / SignOz
10. momo / litellm / batch services after load is stable
### 4.2 Read-only check
```bash
ssh ollama@192.168.0.188 '
hostname; date; uptime; free -h
systemctl is-active containerd docker postgresql@14-main redis-server ollama nginx || true
pg_isready -h localhost -p 5432 || true
redis-cli -p 6380 ping 2>/dev/null || redis-cli ping 2>/dev/null || true
docker ps --format "{{.Names}}\t{{.Status}}\t{{.Ports}}" | head -120
'
```
### 4.3 PostgreSQL WAL checkpoint damage
Signature:
```text
PANIC: could not locate a valid checkpoint record
invalid primary checkpoint record
unexpected pageaddr ... in log segment ...
```
This blocks:
- `188:5432`
- K3s startup on 120/121
- AWOOOI API DB access
- Alertmanager webhook if API cannot start
Human-approved recovery command on 188:
```bash
sudo systemctl stop postgresql@14-main
sudo install -d -m 700 -o postgres -g postgres /var/backups/postgresql
sudo tar -C /var/lib/postgresql/14 -czf /var/backups/postgresql/14-main-before-pg-resetwal-$(date +%Y%m%d-%H%M%S).tgz main
sudo -u postgres /usr/lib/postgresql/14/bin/pg_resetwal -f /var/lib/postgresql/14/main
sudo systemctl start postgresql@14-main
pg_isready -h localhost -p 5432
sudo -u postgres psql -d k3s_datastore -c "VACUUM ANALYZE kine;"
```
Do not run `DROP`, reinitialize the cluster, delete `/var/lib/postgresql`, or restore an old backup unless the commander explicitly approves it.
---
## 5. P0/P1 110 Registry And Observability
110 must recover Harbor/Gitea/Monitoring early, but runners last.
### 5.1 Startup order
1. `docker`
2. Remove `Exited (128)` / `Exited (137)` orphan containers
3. Harbor `harbor-log`
4. Harbor full stack
5. Gitea
6. Prometheus / Alertmanager / Grafana / exporters
7. Langfuse
8. SignOz
9. Sentry DB layer
10. Sentry web/worker/consumer layer
11. Gitea host runner and actions runners
### 5.2 Checks
```bash
ssh wooo@192.168.0.110 '
hostname; date; uptime; free -h
systemctl is-active docker || true
curl -s -o /dev/null -w "harbor=%{http_code}\n" --max-time 5 http://127.0.0.1:5000/v2/ || true
curl -s -o /dev/null -w "gitea=%{http_code}\n" --max-time 5 http://127.0.0.1:3001/ || true
curl -s --max-time 5 http://127.0.0.1:9090/-/ready || true
curl -s --max-time 5 http://127.0.0.1:9093/-/healthy || true
curl -s -o /dev/null -w "sentry=%{http_code}\n" --max-time 10 http://127.0.0.1:9000/ || true
docker ps --format "{{.Names}}\t{{.Status}}" | head -120
'
```
Harbor healthy means `/v2/` returns `200` or `401`. Do not treat `401` as failure.
### 5.3 Runner gate
Runner may start only after all are true:
- `188 PostgreSQL` ready
- `110 Harbor` ready
- `110 Gitea` ready
- `120/121 K3s` nodes ready
- AWOOOI API health passes
- 110 load/core is below `1.0` for at least 15 minutes
- runner systemd guardrails are active: `CPUQuota=200%`, `MemoryMax=2G`, `WatchdogUSec=0`
Check:
```bash
ssh wooo@192.168.0.110 '
for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain | awk "{print \$1}"); do
echo "=== $u ==="
systemctl show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts
done
'
```
If `WatchdogUSec` is not `0`, apply the guardrail script manually with sudo:
```bash
sudo /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply
```
---
## 6. P1 120/121 K3s
K3s must wait for 188 PostgreSQL and 110 Harbor.
### 6.1 Startup order
1. 120 `k3s.service`
2. 121 `k3s-agent.service` or its live role
3. CNI / kube-proxy
4. Nodes Ready
5. Core pods
6. `awoooi-prod` pods
7. keepalived VIP `192.168.0.125`
8. NodePorts `32334` and `32335`
### 6.2 Checks
```bash
ssh wooo@192.168.0.120 '
hostname; uptime
pg_isready -h 192.168.0.188 -p 5432 || true
systemctl is-active k3s k3s-agent keepalived 2>/dev/null || true
kubectl get nodes -o wide 2>/dev/null || true
kubectl get pods -A 2>/dev/null | grep -v -E "Running|Completed" || true
kubectl get pods -n awoooi-prod -o wide 2>/dev/null || true
ip addr show | grep 192.168.0.125 || true
'
ssh wooo@192.168.0.121 '
hostname; uptime
systemctl is-active k3s k3s-agent keepalived 2>/dev/null || true
ip addr show | grep 192.168.0.125 || true
'
```
If K3s is `activating` while 188 PostgreSQL is down, fix PostgreSQL first. Restarting K3s repeatedly will not solve it.
---
## 7. P2 AWOOOI Workloads
Run after K3s nodes are Ready:
```bash
ssh wooo@192.168.0.120 '
kubectl get deploy -n awoooi-prod
kubectl get pods -n awoooi-prod -o wide
kubectl get svc -n awoooi-prod
kubectl get events -n awoooi-prod --sort-by=.lastTimestamp | tail -40
'
curl -s --max-time 8 http://192.168.0.125:32334/api/v1/health
curl -s -o /dev/null -w "web=%{http_code}\n" --max-time 8 http://192.168.0.125:32335/
```
If pods are `ImagePullBackOff`, go back to 110 Harbor.
If API health fails because DB/Redis is down, go back to 188.
---
## 8. P2 Alert Chain
Current main path:
```text
Prometheus/Alertmanager on 110
-> http://192.168.0.125:32334/api/v1/webhooks/alertmanager
-> AWOOOI API
-> TelegramGateway
-> Telegram
```
Alertmanager health alone is not enough. Run E2E:
```bash
curl -s -X POST http://192.168.0.125:32334/api/v1/webhooks/alertmanager \
-H 'Content-Type: application/json' \
-d '{"receiver":"cold-start-test","status":"firing","alerts":[{"status":"firing","labels":{"alertname":"ColdStartE2ETest","severity":"info"},"annotations":{"summary":"Cold start E2E test, ignore"},"startsAt":"2026-05-05T11:00:00Z","endsAt":"0001-01-01T00:00:00Z","generatorURL":""}],"groupLabels":{},"commonLabels":{},"commonAnnotations":{},"externalURL":"","version":"4","groupKey":"cold-start-test"}'
```
Expected: API returns success and Telegram receives the test alert.
---
## 9. P2 Schedules And Delayed Work
Do not mark the reboot complete until scheduled work is proven runnable. A container can be healthy while its cron path is broken.
| Host / Layer | Required check | Success baseline |
|--------------|----------------|------------------|
| 188 cron | `systemctl is-active cron` and `crontab -l` | cron active; backup, restart exporter, stats exporter entries present |
| 188 backup-from-110 | `backup_110_last_success_timestamp` in textfile/Prometheus | last success age `< 25h` |
| 188 momo-scheduler | `docker logs momo-scheduler` | `全部排程任務已註冊`; Google Drive auth works; dashboard URLs use container-reachable hostnames |
| 188 momo import | manual `run_auto_import_task()` after parser changes | selected sheet is `即時業績明細`; imported date range has matching rows in `daily_sales_snapshot` and `realtime_sales_monthly` |
| 110 cron | `systemctl is-active cron` | cron active; Docker/systemd textfile exporters fresh |
| 110 startup units | `systemctl --failed` | zero failed units; stale `momo-startup-complete` and `wooo-staggered-startup` disabled |
| 120 K8s CronJobs | `kubectl get cronjobs -n awoooi-prod` | unsuspended; no failed Jobs remain after current validation |
| 121 DR drill | `crontab -l` | DR drill cron present unless explicitly paused |
Useful checks:
```bash
ssh ollama@192.168.0.188 'systemctl is-active cron; crontab -l; ls -l /home/ollama/node_exporter_textfiles/*.prom'
ssh wooo@192.168.0.110 'systemctl --failed --no-pager; systemctl is-active cron; crontab -l'
ssh wooo@192.168.0.120 'sudo kubectl get cronjobs,jobs -n awoooi-prod'
ssh wooo@192.168.0.121 'systemctl is-active cron; crontab -l'
```
If a schedule succeeds but emits a false verification alert, fix the verification rule before releasing AI auto-remediation. False positives train operators to ignore real alarms.
---
## 10. P2/P3 Stateful Service Guardrails
| Tier | Examples | Automation |
|------|----------|------------|
| BLOCK | PostgreSQL data dir, ClickHouse data dir, Harbor DB, Sentry DB | No automatic destructive action. Human approval only. |
| CRITICAL_HITL | Redis, Kafka, MinIO, SignOz ClickHouse, Sentry ClickHouse | Human-in-the-loop restart/repair. |
| STANDARD_HITL | API/Web/worker, OpenClaw, litellm | Restart only with evidence and blast-radius check. |
| AUTO | Stateless exporters, blackbox, nginx exporter | Auto restart allowed after verification. |
Never use generic `docker restart $(docker ps -q)` during cold start.
### 10.1 Dirty-Reboot Storage Corruption
Treat these log signatures as storage corruption, not ordinary service flakiness:
- `Bad message`
- `Structure needs cleaning`
- `Unknown codec`
- `PANIC: could not locate a valid checkpoint record`
- Kafka `Malformed line` in checkpoint files
- ClickHouse `broken and needs manual correction`
Cold-start automation may stop a restart storm and collect evidence, but it must not delete the original data directory. If a filesystem returns `Bad message` or `Structure needs cleaning`, the real root cause is below the container layer. Online recovery can restore service from readable data, but complete historical recovery requires an offline filesystem check or backup restore.
### 10.2 ClickHouse Clean-Clone Recovery Pattern
Use this pattern for Sentry ClickHouse or SignOz ClickHouse when individual corrupted parts cannot be moved because the host filesystem rejects reads.
```text
1. Stop the compose stack or at least stop dependent consumers.
2. Disable restart loops for the failing container.
3. Save logs and build an exclude list from unreadable store paths.
4. Preserve the original volume as _data.corrupt-YYYYMMDD-HHMMSS.
5. Create a clean _data clone with readable files only.
6. Add flags/force_restore_data.
7. Start ClickHouse first, then web/API, then consumers.
8. Verify HTTP, merge backlog, and restart count before releasing high-load services.
```
Do not replace this with `rm -rf store/...` unless the unreadable path is already backed up or the commander explicitly accepts data loss. The preferred incident artifact is:
```text
/var/lib/docker/volumes/<volume>/_data.corrupt-YYYYMMDD-HHMMSS
/var/backups/<service>-<component>-YYYYMMDD-HHMMSS
```
### 10.3 Kafka Checkpoint Recovery Pattern
If Kafka refuses to start with malformed checkpoint files after a dirty reboot, preserve and move only checkpoint files:
```text
log-start-offset-checkpoint
recovery-point-offset-checkpoint
replication-offset-checkpoint
```
Then start Kafka and confirm health before starting Snuba/Sentry consumers. Do not delete topic directories or Kafka logs during cold-start recovery.
---
## 11. P3 High-Load Services
Only release these after P0/P1/P2 gates are green:
| Host | Service | Release condition |
|------|---------|-------------------|
| 188 | momo-scheduler / crawler | load/core < 1.0 for 15 minutes and DB healthy |
| 188 | SignOz ClickHouse | healthy and merge backlog trending down |
| 188 | litellm | `/health/liveliness` good and provider route verified |
| 110 | Sentry Snuba consumers | ClickHouse healthy and Kafka backlog decreasing |
| 110 | Sentry uptime-checker | Sentry web/DB healthy |
| 110 | runners | all previous gates green and load/core < 1.0 for 15 minutes |
---
## 12. Baseline And AI Auto-Remediation Gate
### 12.1 Stable Runtime Baseline
These are release gates after the first cold-start recovery pass:
| Area | Baseline |
|------|----------|
| 188 host | PostgreSQL accepting, Redis PONG, momo `/health` 200, SignOz HTTP reachable, load/core < 1.0 sustained before crawlers |
| 110 host | Harbor `/v2/` 200/401, Gitea 200/302, Prometheus ready, Alertmanager healthy, Sentry HTTP 200/302/400, no ClickHouse/Kafka restart loop |
| K3s | 120/121 nodes Ready, VIP `192.168.0.125` present, AWOOOI API 2xx/3xx, Web 2xx/3xx |
| Public routes | `https://awoooi.wooo.work/api/v1/health` 2xx/3xx, `https://mo.wooo.work/health` 2xx/3xx |
| Guardrails | Docker/systemd textfile exporters fresh, runner `CPUQuota=200%`, `MemoryMax=2G`, `WatchdogUSec=0` |
| Schedules | cron active on 110/188/120/121; K8s CronJobs unsuspended; no current failed Jobs; 188 backup success `< 25h` |
| Backlog | ClickHouse merges and Kafka/Snuba lag trending down, not increasing for two consecutive checks |
If service health is green but load average remains high, check live CPU and IO before changing memory limits. High load after Sentry/Snuba or ClickHouse startup can be backlog drain; high CPU from runners/builds/crawlers is a release-order problem.
### 12.2 AI Auto-Remediation Gate
AI auto-repair can move from observe-only to limited execution only after:
- Prometheus rules are loaded.
- docker/systemd textfile exporter files are fresh.
- blackbox probes have stable results.
- cron/CronJob schedule checks are green.
- AWOOOI API `/api/v1/health` passes.
- Alertmanager E2E webhook passes.
- Redis/KM/playbook health is available.
- No active restart storm.
- Host load/core remains below `1.0` for 15 minutes.
Until then:
- diagnose only
- notify only
- require human approval for remediation
- no DB/ClickHouse/Harbor/Sentry destructive action
- no generic restart action against stateful services
---
## 13. One-Command Readiness Script
Run:
```bash
bash scripts/reboot-recovery/full-stack-cold-start-check.sh
```
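The script is read-only by default; when run with `--send-alert-test` it additionally sends a test alert through the Alertmanager webhook E2E path. It reports gates: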
- `P0-NETWORK`
- `P0-188-DATA`
- `P0-110-REGISTRY`
- `P1-K3S`
- `P2-WORKLOAD`
- `P2-ALERTCHAIN`
- `P2-PUBLIC-ROUTES`
- `P2-SCHEDULES`
- runner guardrail state inside `P0-110-REGISTRY-OBSERVABILITY`
If it prints `BLOCKED`, fix the first blocked gate before moving forward.
---
## 14. Done Criteria
All must be true:
- Four hosts reachable by SSH.
- 188 PostgreSQL and Redis healthy.
- 110 Harbor, Gitea, Prometheus, Alertmanager healthy.
- 120/121 K3s nodes Ready.
- VIP `192.168.0.125` present.
- AWOOOI API and Web reachable through NodePort/VIP.
- Alertmanager E2E webhook succeeds.
- cron/CronJob schedules are active, unsuspended, and verified.
- Sentry and SignOz are either healthy or explicitly in controlled backlog recovery.
- High-load batch services are capped or delayed.
- Runners are guarded and released last.
- AI auto-remediation is not in full execution mode until all gates are green.
---
## 15. Known Drift To Fix After Recovery
These must be cleaned after the incident, not during P0:
- `SERVICE-ENDPOINTS.md` still has old Prometheus/Alertmanager locations.
- Audit older docs for direct node webhook targets; current main path should be VIP `192.168.0.125:32334`.
- OpenClaw `8088` vs `8089` must be live-confirmed and normalized.
- 188 compose paths drift between `/home/ollama/*` and Ansible `/opt/*`.
- 110 runner docs still mention Docker runner in places; live startup prefers host `gitea-act-runner-host.service`.
- `scripts/setup-runner-watchdog.sh` conflicts with the 2026-05-05 runner watchdog disablement guardrail.
- `grist.wooo.work` / `registry.wooo.work` public HTTP/HTTPS currently route to `aiops.wooo.work`; their old 110 certbot renewal configs are disabled until public routing is corrected or DNS-01 renewal is configured.

View File

@@ -42,8 +42,11 @@ spec:
restartPolicy: OnFailure
containers:
- name: k3s-report
image: 192.168.0.110:5000/awoooi-api:latest
imagePullPolicy: Always
# 2026-05-05 Codex: keep the API image placeholder so CD
# injects the same immutable tag used by API/worker. The old
# awoooi-api:latest repo returns 400 from Harbor after reboot.
image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
imagePullPolicy: IfNotPresent
command:
- python
- -m
@@ -63,5 +66,7 @@ spec:
limits:
cpu: "200m"
memory: "128Mi"
# 使用 API 的 ServiceAccount (需要 RBAC)
serviceAccountName: awoooi-api
# 2026-05-05 Codex: this report only calls Prometheus and Telegram.
# The old awoooi-api ServiceAccount does not exist, which prevented
# Job pods from being created after reboot.
serviceAccountName: default

View File

@@ -42,8 +42,11 @@ spec:
restartPolicy: OnFailure
containers:
- name: weekly-report
image: 192.168.0.110:5000/awoooi-api:latest
imagePullPolicy: Always
# 2026-05-05 Codex: keep the API image placeholder so CD
# injects the same immutable tag used by API/worker. The old
# awoooi-api:latest repo returns 400 from Harbor after reboot.
image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
imagePullPolicy: IfNotPresent
command:
- python
- -m
@@ -63,5 +66,7 @@ spec:
limits:
cpu: "500m"
memory: "256Mi"
# 使用 API 的 ServiceAccount (需要 RBAC)
serviceAccountName: awoooi-api
# 2026-05-05 Codex: this report only calls app services, Prometheus,
# Git, and Telegram. The old awoooi-api ServiceAccount does not
# exist, which prevented Job pods from being created after reboot.
serviceAccountName: default

View File

@@ -27,7 +27,10 @@ spec:
jobTemplate:
spec:
backoffLimit: 2
activeDeadlineSeconds: 300
# 2026-05-05 Codex: allow post-reboot/post-migration catch-up batches.
# The script now fails if the API reports failed rows, so this longer
# deadline does not hide partial vectorization.
activeDeadlineSeconds: 1800
template:
metadata:
labels:
@@ -37,8 +40,11 @@ spec:
restartPolicy: OnFailure
containers:
- name: km-vectorize
image: 192.168.0.110:5000/awoooi-api:latest
imagePullPolicy: Always
# 2026-05-05 Codex: keep the API image placeholder so CD
# injects the same immutable tag used by API/worker. The old
# awoooi-api:latest repo returns 400 from Harbor after reboot.
image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
imagePullPolicy: IfNotPresent
command:
- python
- /app/scripts/cron_km_vectorize.py
@@ -46,7 +52,9 @@ spec:
- name: TZ
value: "Asia/Taipei"
- name: INTERNAL_API_URL
value: "http://awoooi-api.awoooi-prod.svc.cluster.local:8000"
# 2026-05-05 Codex: use the actual Service name; the old
# awoooi-api DNS name does not exist in awoooi-prod.
value: "http://awoooi-api-svc.awoooi-prod.svc.cluster.local:8000"
resources:
requests:
cpu: "50m"
@@ -54,4 +62,7 @@ spec:
limits:
cpu: "200m"
memory: "128Mi"
serviceAccountName: awoooi-api
# 2026-05-05 Codex: this job only calls the internal API. The old
# awoooi-api ServiceAccount does not exist, which prevented Job pods
# from being created after reboot.
serviceAccountName: default

View File

@@ -63,10 +63,11 @@ spec:
print(f"status={r.status_code} body={r.text[:200]}")
asyncio.run(run())
env:
# 2026-04-09 Claude Sonnet 4.6: ClusterIP 和 DNS 在 Job Pod 均不可達
# 改用 NodePort 直連 K3s worker node同 K8s_API_SERVER_URL 解法)
# 2026-05-05 Codex: call the in-cluster Service instead of a
# fixed worker NodePort. After reboot, 121 can be unavailable
# while the Service and VIP are already healthy.
- name: INTERNAL_API_URL
value: "http://192.168.0.121:32334"
value: "http://awoooi-api-svc.awoooi-prod.svc.cluster.local:8000"
- name: DRIFT_SCAN_NAMESPACES
value: "awoooi-prod"
resources:

View File

@@ -18,17 +18,21 @@ import httpx
async def main() -> int:
api_base = os.environ.get(
"INTERNAL_API_URL",
"http://awoooi-api.awoooi-prod.svc.cluster.local:8000",
"http://awoooi-api-svc.awoooi-prod.svc.cluster.local:8000",
)
url = f"{api_base}/api/v1/knowledge/embed-all"
async with httpx.AsyncClient(timeout=120) as client:
async with httpx.AsyncClient(timeout=1800) as client:
try:
resp = await client.post(url)
print(f"embed-all: {resp.status_code} {resp.text[:200]}")
if resp.status_code >= 400:
print(f"ERROR: embed-all returned {resp.status_code}", file=sys.stderr)
return 1
result = resp.json()
if int(result.get("failed", 0)) > 0:
print(f"ERROR: embed-all failed rows: {result}", file=sys.stderr)
return 1
return 0
except httpx.RequestError as exc:
print(f"ERROR: request failed — {exc}", file=sys.stderr)

View File

@@ -0,0 +1,398 @@
#!/usr/bin/env bash
# AWOOOI full-stack cold-start readiness check.
# Read-only by design. It never restarts, deletes, repairs, or writes remote state.
# Treat unset variables and failing pipeline stages as errors. errexit is not
# enabled, so a failing probe does not abort the script.
set -uo pipefail

# SSH options shared by every remote probe: never prompt, 6 s connect timeout.
SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout=6)

# 1 when --send-alert-test was given; enables the Alertmanager webhook POST.
SEND_ALERT_TEST=0

# Parse command-line flags; anything unrecognised exits with status 64.
while [ "$#" -gt 0 ]; do
  case "$1" in
    --send-alert-test)
      SEND_ALERT_TEST=1
      ;;
    -h|--help)
      cat <<'USAGE'
Usage: bash scripts/reboot-recovery/full-stack-cold-start-check.sh [--send-alert-test]
Default mode is read-only and does not POST an Alertmanager test event.
Use --send-alert-test only after AWOOOI API is expected to be ready.
USAGE
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      exit 64
      ;;
  esac
  shift
done

# ANSI colour escapes used by the result printers.
RED=$'\033[0;31m'
GREEN=$'\033[0;32m'
YELLOW=$'\033[1;33m'
BLUE=$'\033[0;34m'
NC=$'\033[0m'

# Running tallies updated by ok / warn / fail and read by summary.
PASS=0
WARN=0
FAIL=0
# Print a section banner ("=== TITLE ===") in blue, preceded by a blank line.
#   $1  section title
log_section() {
  local title="$1"
  printf '\n%s=== %s ===%s\n' "$BLUE" "$title" "$NC"
}
# Report a passed gate: print a green "OK" line and bump the PASS counter.
#   $1  message describing the gate
ok() {
  local message="$1"
  printf '%sOK%s %s\n' "$GREEN" "$NC" "$message"
  PASS=$((PASS + 1))
}
# Report a non-blocking finding: print a yellow "WARN" line and bump WARN.
#   $1  message describing the finding
warn() {
  local message="$1"
  printf '%sWARN%s %s\n' "$YELLOW" "$NC" "$message"
  WARN=$((WARN + 1))
}
# Report a blocking failure: print a red "BLOCKED" line and bump FAIL.
#   $1  message describing the failed gate
fail() {
  local message="$1"
  printf '%sBLOCKED%s %s\n' "$RED" "$NC" "$message"
  FAIL=$((FAIL + 1))
}
# Run a local command, record it as ok/fail under a label, and echo its output.
#   $1    label used in the OK / BLOCKED line
#   $2..  command and arguments to execute
# Returns 0 when the command succeeded, 1 otherwise.
# The combined stdout/stderr is captured in a variable rather than a fixed
# /tmp path, so concurrent runs cannot clobber each other and nothing is left
# behind on disk.
run_local() {
  local label="$1"
  shift
  local output status=0
  output=$("$@" 2>&1) || status=$?
  if [ "$status" -eq 0 ]; then
    ok "$label"
  else
    fail "$label"
  fi
  # Print captured output after the verdict line, as before.
  if [ -n "$output" ]; then
    printf '%s\n' "$output"
  fi
  [ "$status" -eq 0 ]
}
# Run a command string on a remote host over SSH using the shared SSH_OPTS.
#   $1  user@host target
#   $2  command string executed by the remote shell
# When REMOTE_SUDO_PASSWORD is set locally, a shell-quoted
# "REMOTE_SUDO_PASSWORD=..." assignment is prepended to the remote command.
# NOTE(review): the password then travels on the ssh command line, so it may
# be visible in process listings - confirm this is acceptable.
ssh_cmd() {
  local target="$1"
  local remote_script="$2"
  local env_prefix=""
  if [[ -n "${REMOTE_SUDO_PASSWORD:-}" ]]; then
    printf -v env_prefix 'REMOTE_SUDO_PASSWORD=%q ' "$REMOTE_SUDO_PASSWORD"
  fi
  ssh "${SSH_OPTS[@]}" "$target" "${env_prefix}${remote_script}"
}
# Print the HTTP status code returned for a URL (5 s limit per request).
#   $1  URL to fetch
# Prints "000" when curl yields no code at all; never fails the caller.
probe_http_code() {
  local target_url="$1"
  local status
  status=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "$target_url" 2>/dev/null || true)
  if [ -z "$status" ]; then
    status="000"
  fi
  echo "$status"
}
# Check whether a TCP port accepts connections.
#   $1  host
#   $2  port
# Tries "nc -G 3" first and falls back to "nc -w 3" when that fails
# (presumably to cover netcat variants that lack -G - confirm).
# Returns the status of the last nc attempt.
probe_tcp() {
  local target_host="$1"
  local target_port="$2"
  if nc -G 3 -z "$target_host" "$target_port" >/dev/null 2>&1; then
    return 0
  fi
  nc -w 3 -z "$target_host" "$target_port" >/dev/null 2>&1
}
# Print the report banner: title, current local timestamp, and host scope.
print_header() {
  printf '%s\n' "AWOOOI full-stack cold-start check"
  date '+%Y-%m-%d %H:%M:%S %Z'
  printf '%s\n' "Scope: 110 / 120 / 121 / 188. 112 Kali is intentionally skipped."
}
# P0 gate: ICMP reachability and SSH port for hosts 110 / 120 / 121 / 188,
# followed by an informational ARP table check.
# Ping and SSH-port failures are blocking (fail); missing ARP rows only warn.
check_network() {
  log_section "P0-NETWORK"
  local host ip ping_wait arp_rows missing_arp=""

  # ping -W takes seconds on Linux but milliseconds on macOS/FreeBSD, so a
  # bare "-W 2" would give those platforms only 2 ms to receive a reply.
  case "$(uname -s 2>/dev/null || true)" in
    Darwin|FreeBSD) ping_wait=2000 ;;
    *) ping_wait=2 ;;
  esac

  for host in 110 120 121 188; do
    ip="192.168.0.$host"
    if ping -c 1 -W "$ping_wait" "$ip" >/dev/null 2>&1; then
      ok "ping $ip"
    else
      fail "ping $ip"
    fi
    if probe_tcp "$ip" 22; then
      ok "ssh port $ip:22"
    else
      fail "ssh port $ip:22"
    fi
  done

  # Show the ARP rows for the scoped hosts, then warn once naming every host
  # that has no row (previously the warning fired only when none matched).
  arp_rows=$(arp -an 2>/dev/null | grep -E '192\.168\.0\.(110|120|121|188)' || true)
  if [ -n "$arp_rows" ]; then
    printf '%s\n' "$arp_rows"
  fi
  for host in 110 120 121 188; do
    if ! grep -Eq "192\.168\.0\.${host}([^0-9]|\$)" <<<"$arp_rows"; then
      missing_arp+=" 192.168.0.$host"
    fi
  done
  if [ -n "$missing_arp" ]; then
    warn "no ARP rows printed for one or more hosts:$missing_arp"
  fi
}
# P0 gate for host 188 (data tier): one SSH round-trip collects host, memory,
# systemd, PostgreSQL, Redis, SignOz, momo health and docker status lines,
# which are then graded locally. PostgreSQL problems are blocking; everything
# else is a warning. An SSH failure is blocking and skips the grading.
check_188() {
  log_section "P0-188-DATA"
  local report remote_probe
  remote_probe='
    echo "HOST $(hostname) $(uptime)"
    echo "MEM $(free -h | awk "/Mem:/ {print \$2,\$3,\$7}")"
    echo "SYSTEMD $(systemctl is-active containerd docker postgresql@14-main redis-server ollama nginx 2>/dev/null | tr "\n" " ")"
    echo "PG $(pg_isready -h localhost -p 5432 2>&1)"
    echo "REDIS $(redis-cli -p 6380 ping 2>/dev/null || redis-cli ping 2>/dev/null || true)"
    echo "PORT5432 $(nc -z -w 2 127.0.0.1 5432 >/dev/null 2>&1 && echo OPEN || echo CLOSED)"
    echo "SIGNOZ_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:3301/ || true)"
    echo "MOMO_HEALTH_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:5003/health || true)"
    docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -80
  '
  if ! report=$(ssh_cmd "ollama@192.168.0.188" "$remote_probe" 2>&1); then
    fail "ssh 188 read-only check"
    echo "$report"
    return
  fi
  echo "$report"

  if grep -q "PORT5432 OPEN" <<<"$report"; then
    ok "188 PostgreSQL port open"
  else
    fail "188 PostgreSQL port closed"
  fi
  if grep -q "accepting connections" <<<"$report"; then
    ok "188 PostgreSQL accepting connections"
  else
    fail "188 PostgreSQL not accepting connections"
  fi
  if grep -q "REDIS PONG" <<<"$report"; then
    ok "188 Redis PONG"
  else
    warn "188 Redis not confirmed"
  fi
  # Inverted gate: a "Restarting" momo-db row is the bad case.
  if grep -q "momo-db.*Restarting" <<<"$report"; then
    warn "188 momo-db restarting"
  else
    ok "188 momo-db not in visible restart loop"
  fi
  if grep -Eq "SIGNOZ_CODE (200|302|307)" <<<"$report"; then
    ok "188 SignOz HTTP reachable"
  else
    warn "188 SignOz HTTP not confirmed"
  fi
  if grep -q "MOMO_HEALTH_CODE 200" <<<"$report"; then
    ok "188 momo health reachable"
  else
    warn "188 momo health not confirmed"
  fi
}
# P0 gate for host 110 (registry / observability): one SSH round-trip collects
# HTTP codes for Harbor, Gitea, Prometheus, Alertmanager and Sentry, systemd
# properties of "actions.runner.*" units, and docker status lines. Only an
# unhealthy Harbor (or an SSH failure) is blocking; the rest are warnings.
# NOTE(review): only units named "actions.runner.*" are inspected - confirm
# that this matches the runner unit actually used on 110.
check_110() {
  log_section "P0-110-REGISTRY-OBSERVABILITY"
  local report remote_probe
  remote_probe='
    echo "HOST $(hostname) $(uptime)"
    echo "MEM $(free -h | awk "/Mem:/ {print \$2,\$3,\$7}")"
    echo "DOCKER_SYSTEMD $(systemctl is-active docker 2>/dev/null || true)"
    echo "HARBOR_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:5000/v2/ || true)"
    echo "GITEA_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:3001/ || true)"
    echo "PROM_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:9090/-/ready || true)"
    echo "AM_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:9093/-/healthy || true)"
    echo "SENTRY_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 8 http://127.0.0.1:9000/ || true)"
    for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain 2>/dev/null | awk "{print \$1}"); do
      systemctl show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts | sed "s/^/RUNNER $u /"
    done
    docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -120
  '
  if ! report=$(ssh_cmd "wooo@192.168.0.110" "$remote_probe" 2>&1); then
    fail "ssh 110 read-only check"
    echo "$report"
    return
  fi
  echo "$report"

  if grep -Eq "HARBOR_CODE (200|401)" <<<"$report"; then
    ok "110 Harbor /v2 healthy code"
  else
    fail "110 Harbor not healthy"
  fi
  if grep -Eq "GITEA_CODE (200|302)" <<<"$report"; then
    ok "110 Gitea reachable"
  else
    warn "110 Gitea not confirmed"
  fi
  if grep -q "PROM_CODE 200" <<<"$report"; then
    ok "110 Prometheus ready"
  else
    warn "110 Prometheus not ready"
  fi
  if grep -q "AM_CODE 200" <<<"$report"; then
    ok "110 Alertmanager healthy"
  else
    warn "110 Alertmanager not healthy"
  fi
  if grep -Eq "SENTRY_CODE (200|302|400)" <<<"$report"; then
    ok "110 Sentry HTTP reachable"
  else
    warn "110 Sentry HTTP not confirmed"
  fi
  if grep -q "WatchdogUSec=0" <<<"$report"; then
    ok "runner watchdog disabled on at least one unit"
  else
    warn "runner watchdog state not confirmed"
  fi
  # Inverted gate: a "Restarting" ClickHouse row is the bad case.
  if grep -q "sentry-self-hosted-clickhouse-1.*Restarting" <<<"$report"; then
    warn "Sentry ClickHouse restarting"
  else
    ok "Sentry ClickHouse not visibly restarting"
  fi
}
# P1 gate for K3s, probed from host 120: reachability of PostgreSQL on 188,
# node readiness, awoooi-prod pods, and presence of VIP 192.168.0.125.
# The remote kcmd helper feeds REMOTE_SUDO_PASSWORD to "sudo -S" when set,
# otherwise tries passwordless sudo and then plain kubectl.
# If the remote output shows no Ready node, a local "kubectl get nodes" is
# tried and its output is also accepted as evidence.
check_k3s() {
  log_section "P1-K3S"
  local report remote_probe
  local local_kubectl_out=""
  remote_probe='
    echo "HOST $(hostname) $(uptime)"
    echo "PG188_PORT $(nc -z -w 2 192.168.0.188 5432 >/dev/null 2>&1 && echo OPEN || echo CLOSED)"
    echo "SYSTEMD $(systemctl is-active k3s k3s-agent keepalived 2>/dev/null | tr "\n" " ")"
    kcmd() {
      if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
        printf "%s\n" "$REMOTE_SUDO_PASSWORD" | sudo -S -p "" kubectl "$@"
      else
        sudo -n kubectl "$@" 2>/dev/null || kubectl "$@"
      fi
    }
    kcmd get nodes -o wide 2>/dev/null || true
    kcmd get pods -n awoooi-prod -o wide 2>/dev/null || true
    ip addr show | grep 192.168.0.125 || true
  '
  if ! report=$(ssh_cmd "wooo@192.168.0.120" "$remote_probe" 2>&1); then
    fail "ssh 120 k3s read-only check"
    echo "$report"
    return
  fi
  echo "$report"

  # Fall back to the local kubeconfig only when 120 showed no Ready node.
  if ! grep -q " Ready " <<<"$report"; then
    local_kubectl_out=$(kubectl get nodes -o wide 2>/dev/null || true)
    if [ -n "$local_kubectl_out" ]; then
      echo "LOCAL_KUBECTL_FALLBACK"
      echo "$local_kubectl_out"
    fi
  fi

  if grep -q "PG188_PORT OPEN" <<<"$report"; then
    ok "120 can reach 188 PostgreSQL port"
  else
    fail "120 cannot reach 188 PostgreSQL"
  fi
  if grep -q " Ready " <<<"$report$local_kubectl_out"; then
    ok "K3s has Ready node output"
  else
    fail "K3s nodes not Ready or kubectl unavailable"
  fi
  if grep -q "192.168.0.125" <<<"$report"; then
    ok "VIP 192.168.0.125 present on 120"
  else
    warn "VIP not confirmed on 120"
  fi
}
# P2 gate: AWOOOI API and Web through VIP 192.168.0.125, plus an optional
# Alertmanager webhook POST.
# The HTTP codes are collected from host 120 over SSH; if that SSH call fails
# the same URLs are probed from the local machine instead.
# API unreachable is blocking; Web and the webhook are warnings.
# The webhook POST runs only when SEND_ALERT_TEST is 1. Its response body is
# discarded (-o /dev/null) so the check leaves no file on host 120, and a
# missing code (for example when SSH fails) is reported as 000.
check_workload_and_alertchain() {
  log_section "P2-WORKLOAD-ALERTCHAIN"
  local api_code web_code alert_code
  local out
  if out=$(ssh_cmd "wooo@192.168.0.120" '
    api_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://192.168.0.125:32334/api/v1/health 2>/dev/null || true)
    web_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://192.168.0.125:32335/ 2>/dev/null || true)
    echo "API_CODE ${api_code:-000}"
    echo "WEB_CODE ${web_code:-000}"
  ' 2>/dev/null); then
    api_code=$(awk '/^API_CODE / {print $2}' <<<"$out")
    web_code=$(awk '/^WEB_CODE / {print $2}' <<<"$out")
  else
    api_code=$(probe_http_code "http://192.168.0.125:32334/api/v1/health")
    web_code=$(probe_http_code "http://192.168.0.125:32335/")
    printf -v out 'API_CODE %s\nWEB_CODE %s' "$api_code" "$web_code"
  fi
  echo "$out"

  if [[ "$api_code" =~ ^[23] ]]; then
    ok "AWOOOI API reachable"
  else
    fail "AWOOOI API not reachable"
  fi
  if [[ "$web_code" =~ ^[23] ]]; then
    ok "AWOOOI Web reachable"
  else
    warn "AWOOOI Web not confirmed"
  fi

  if [ "$SEND_ALERT_TEST" -eq 1 ]; then
    # The payload contains no single quotes, so it can be single-quoted as-is
    # inside the remote command line.
    local payload remote_post
    payload='{"receiver":"cold-start-check","status":"firing","alerts":[{"status":"firing","labels":{"alertname":"ColdStartCheck","severity":"info"},"annotations":{"summary":"Cold start check"},"startsAt":"2026-05-05T11:00:00Z","endsAt":"0001-01-01T00:00:00Z","generatorURL":""}],"groupLabels":{},"commonLabels":{},"commonAnnotations":{},"externalURL":"","version":"4","groupKey":"cold-start-check"}'
    # curl -w already prints 000 when the request fails, so "|| true" only
    # swallows the exit status instead of appending a second "000".
    remote_post="curl -s -o /dev/null -w '%{http_code}' --max-time 8 -X POST 'http://192.168.0.125:32334/api/v1/webhooks/alertmanager' -H 'Content-Type: application/json' -d '${payload}' 2>/dev/null || true"
    alert_code=$(ssh_cmd "wooo@192.168.0.120" "$remote_post")
    alert_code="${alert_code:-000}"
    echo "ALERTCHAIN_CODE $alert_code"
    if [[ "$alert_code" =~ ^2 ]]; then
      ok "Alertmanager webhook endpoint accepts POST"
    else
      warn "Alertmanager webhook E2E not confirmed"
    fi
  else
    warn "Alertmanager webhook POST skipped; rerun with --send-alert-test after API is ready"
  fi
}
# P2 gate: public HTTPS routes for AWOOOI and momo, probed from the local
# machine. Any 2xx/3xx code counts as reachable; everything else is a warning.
check_public_routes() {
  log_section "P2-PUBLIC-ROUTES"
  local awoooi_api_code awoooi_web_code momo_code momo_health_code
  awoooi_api_code=$(probe_http_code "https://awoooi.wooo.work/api/v1/health")
  awoooi_web_code=$(probe_http_code "https://awoooi.wooo.work/")
  momo_code=$(probe_http_code "https://mo.wooo.work/")
  momo_health_code=$(probe_http_code "https://mo.wooo.work/health")

  # Raw codes first, verdicts afterwards.
  printf '%s\n' \
    "AWOOOI_PUBLIC_API_CODE $awoooi_api_code" \
    "AWOOOI_PUBLIC_WEB_CODE $awoooi_web_code" \
    "MOMO_PUBLIC_CODE $momo_code" \
    "MOMO_PUBLIC_HEALTH_CODE $momo_health_code"

  if [[ "$awoooi_api_code" =~ ^[23] ]]; then
    ok "AWOOOI public API reachable"
  else
    warn "AWOOOI public API not confirmed"
  fi
  if [[ "$awoooi_web_code" =~ ^[23] ]]; then
    ok "AWOOOI public web reachable"
  else
    warn "AWOOOI public web not confirmed"
  fi
  if [[ "$momo_code" =~ ^[23] ]]; then
    ok "momo public route reachable"
  else
    warn "momo public route not confirmed"
  fi
  if [[ "$momo_health_code" =~ ^[23] ]]; then
    ok "momo public health reachable"
  else
    warn "momo public health not confirmed"
  fi
}
# Succeed only when TEXT has a "<TAG> <FILE> age=<N>" line with N < MAX_AGE.
# A missing line (file reported as "missing", or no output at all) fails.
#   $1 TEXT   $2 TAG   $3 FILE   $4 MAX_AGE (seconds)
_sched_textfile_fresh() {
  awk -v tag="$2" -v name="$3" -v limit="$4" '
    $1 == tag && $2 == name && $3 ~ /^age=-?[0-9]+$/ {
      found = 1
      fresh = (substr($3, 5) + 0 < limit + 0)
      exit
    }
    END { exit !(found && fresh) }
  ' <<<"$1"
}

# Succeed only when TEXT has a "<KEY> <integer>" line whose value satisfies
# the comparison; a missing line fails instead of passing silently.
#   $1 TEXT   $2 KEY   $3 OP (lt | gt | ge)   $4 BOUND
_sched_metric_ok() {
  awk -v key="$2" -v op="$3" -v bound="$4" '
    $1 == key && $2 ~ /^-?[0-9]+$/ {
      found = 1
      v = $2 + 0
      b = bound + 0
      pass = (op == "lt") ? (v < b) : ((op == "gt") ? (v > b) : (v >= b))
      exit
    }
    END { exit !(found && pass) }
  ' <<<"$1"
}

# P2 gate: scheduled work on hosts 188, 110, 120 (incl. K8s CronJobs) and 121.
# Each host is probed with one SSH round-trip; every finding is a warning, and
# an unreachable host yields a single "unavailable" warning.
# Numeric gates require the expected line to be present, so a missing
# textfile, metric or kubectl result is reported as a warning, not as OK.
# Cron state is matched as a whole line so "activating" is not taken for
# "active".
check_schedules() {
  log_section "P2-SCHEDULES"
  local out

  # --- 188: cron, node_exporter textfiles, backup age, momo scheduler ---
  if out=$(ssh_cmd "ollama@192.168.0.188" '
    now=$(date +%s)
    echo "CRON_188 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
    for f in /home/ollama/node_exporter_textfiles/backup.prom /home/ollama/node_exporter_textfiles/docker_restart_count.prom /home/ollama/node_exporter_textfiles/docker_stats.prom; do
      if [ -f "$f" ]; then
        mt=$(stat -c %Y "$f")
        echo "TEXTFILE_188 $(basename "$f") age=$((now - mt))"
      else
        echo "TEXTFILE_188 $(basename "$f") missing"
      fi
    done
    if [ -f /home/ollama/node_exporter_textfiles/backup.prom ]; then
      awk -v now="$now" "/^backup_110_last_success_timestamp / {printf \"BACKUP_110_AGE %d\\n\", now - int(\$2)}" /home/ollama/node_exporter_textfiles/backup.prom
    fi
    echo "SCHEDULER_REGISTERED $(docker logs --tail 200 momo-scheduler 2>&1 | grep -c "全部排程任務已註冊" || true)"
  ' 2>&1); then
    echo "$out"
    if grep -qx "CRON_188 active" <<<"$out"; then
      ok "188 cron active"
    else
      warn "188 cron not confirmed"
    fi
    if _sched_textfile_fresh "$out" TEXTFILE_188 backup.prom 90000; then
      ok "188 backup textfile fresh enough"
    else
      warn "188 backup textfile stale or missing"
    fi
    if _sched_textfile_fresh "$out" TEXTFILE_188 docker_restart_count.prom 300; then
      ok "188 docker restart exporter fresh"
    else
      warn "188 docker restart exporter stale"
    fi
    if _sched_textfile_fresh "$out" TEXTFILE_188 docker_stats.prom 300; then
      ok "188 docker stats exporter fresh"
    else
      warn "188 docker stats exporter stale"
    fi
    if _sched_metric_ok "$out" BACKUP_110_AGE lt 90000; then
      ok "188 backup-from-110 success within 25h"
    else
      warn "188 backup-from-110 success not confirmed"
    fi
    if _sched_metric_ok "$out" SCHEDULER_REGISTERED gt 0; then
      ok "188 momo scheduler registered jobs"
    else
      warn "188 momo scheduler registration not confirmed"
    fi
  else
    warn "188 schedule check unavailable"
    echo "$out"
  fi

  # --- 110: cron, failed units, stale startup units, textfile exporters ---
  if out=$(ssh_cmd "wooo@192.168.0.110" '
    now=$(date +%s)
    echo "CRON_110 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
    echo "FAILED_UNITS_110 $(systemctl --failed --no-legend --plain 2>/dev/null | wc -l)"
    echo "MOMO_STARTUP_ENABLED $(systemctl is-enabled momo-startup-complete.service 2>/dev/null || true)"
    echo "STAGGERED_STARTUP_ENABLED $(systemctl is-enabled wooo-staggered-startup.service 2>/dev/null || true)"
    for f in /home/wooo/node_exporter_textfiles/docker_stats.prom /home/wooo/node_exporter_textfiles/systemd_units.prom; do
      if [ -f "$f" ]; then
        mt=$(stat -c %Y "$f")
        echo "TEXTFILE_110 $(basename "$f") age=$((now - mt))"
      else
        echo "TEXTFILE_110 $(basename "$f") missing"
      fi
    done
  ' 2>&1); then
    echo "$out"
    if grep -qx "CRON_110 active" <<<"$out"; then
      ok "110 cron active"
    else
      warn "110 cron not confirmed"
    fi
    if grep -q "FAILED_UNITS_110 0" <<<"$out"; then
      ok "110 systemd has no failed units"
    else
      warn "110 systemd failed units remain"
    fi
    if grep -q "MOMO_STARTUP_ENABLED disabled" <<<"$out"; then
      ok "110 stale momo startup unit disabled"
    else
      warn "110 stale momo startup unit not disabled"
    fi
    if grep -q "STAGGERED_STARTUP_ENABLED disabled" <<<"$out"; then
      ok "110 stale staggered startup unit disabled"
    else
      warn "110 stale staggered startup unit not disabled"
    fi
    if _sched_textfile_fresh "$out" TEXTFILE_110 docker_stats.prom 300; then
      ok "110 docker stats exporter fresh"
    else
      warn "110 docker stats exporter stale"
    fi
    if _sched_textfile_fresh "$out" TEXTFILE_110 systemd_units.prom 300; then
      ok "110 systemd units exporter fresh"
    else
      warn "110 systemd units exporter stale"
    fi
  else
    warn "110 schedule check unavailable"
    echo "$out"
  fi

  # --- 120: cron plus K8s CronJobs / Jobs / pods in awoooi-prod ---
  # The Python snippets are single-line on purpose so they do not depend on
  # how this script is indented.
  if out=$(ssh_cmd "wooo@192.168.0.120" '
    kcmd() {
      if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
        printf "%s\n" "$REMOTE_SUDO_PASSWORD" | sudo -S -p "" kubectl "$@"
      else
        sudo -n kubectl "$@" 2>/dev/null || kubectl "$@"
      fi
    }
    echo "CRON_120 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
    kcmd get cronjobs -n awoooi-prod -o json | python3 -c "import json,sys; d=json.load(sys.stdin); items=d.get(\"items\", []); print(\"CRONJOB_COUNT\", len(items)); print(\"CRONJOB_SUSPENDED\", sum(1 for i in items if i.get(\"spec\",{}).get(\"suspend\")))"
    kcmd get jobs -n awoooi-prod -o json | python3 -c "import json,sys; d=json.load(sys.stdin); print(\"FAILED_JOBS\", sum(1 for j in d.get(\"items\", []) if any(c.get(\"type\")==\"Failed\" and c.get(\"status\")==\"True\" for c in (j.get(\"status\",{}).get(\"conditions\",[]) or []))))"
    kcmd get pods -n awoooi-prod --no-headers 2>/dev/null | awk "\$3 !~ /^(Running|Completed)$/ {bad++} END {print \"BAD_PODS\", bad+0}"
  ' 2>&1); then
    echo "$out"
    if grep -qx "CRON_120 active" <<<"$out"; then
      ok "120 cron active"
    else
      warn "120 cron not confirmed"
    fi
    if _sched_metric_ok "$out" CRONJOB_COUNT ge 4; then
      ok "K8s AWOOOI CronJobs present"
    else
      warn "K8s AWOOOI CronJobs missing"
    fi
    if grep -q "CRONJOB_SUSPENDED 0" <<<"$out"; then
      ok "K8s AWOOOI CronJobs unsuspended"
    else
      warn "K8s AWOOOI CronJob suspended"
    fi
    if grep -q "FAILED_JOBS 0" <<<"$out"; then
      ok "K8s AWOOOI has no failed Jobs"
    else
      warn "K8s AWOOOI failed Jobs remain"
    fi
    if grep -q "BAD_PODS 0" <<<"$out"; then
      ok "K8s AWOOOI pods Running/Completed only"
    else
      warn "K8s AWOOOI bad pod status remains"
    fi
  else
    warn "120 K8s schedule check unavailable"
    echo "$out"
  fi

  # --- 121: cron and the DR drill crontab entry ---
  if out=$(ssh_cmd "wooo@192.168.0.121" '
    echo "CRON_121 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
    crontab -l 2>/dev/null | grep -q "dr-drill.sh" && echo "DR_DRILL_CRON present" || echo "DR_DRILL_CRON missing"
  ' 2>&1); then
    echo "$out"
    if grep -qx "CRON_121 active" <<<"$out"; then
      ok "121 cron active"
    else
      warn "121 cron not confirmed"
    fi
    if grep -q "DR_DRILL_CRON present" <<<"$out"; then
      ok "121 DR drill cron present"
    else
      warn "121 DR drill cron missing"
    fi
  else
    warn "121 schedule check unavailable"
    echo "$out"
  fi
}
# Print the final tallies and verdict, and set the script's exit status:
#   2  at least one BLOCKED gate
#   1  no BLOCKED gates but at least one warning (DEGRADED)
#   0  everything passed (GREEN; falls through without an explicit exit)
summary() {
  log_section "SUMMARY"
  printf 'PASS=%s WARN=%s BLOCKED=%s\n' "$PASS" "$WARN" "$FAIL"
  if (( FAIL > 0 )); then
    echo "Result: BLOCKED. Fix the first blocked gate before releasing runner/CD/AI auto-remediation."
    exit 2
  elif (( WARN > 0 )); then
    echo "Result: DEGRADED. Core gates passed but warnings remain."
    exit 1
  fi
  echo "Result: GREEN. Full stack is ready for controlled runner/CD release."
}
# Run every stage in order; summary comes last and decides the exit status.
for stage in \
  print_header \
  check_network \
  check_188 \
  check_110 \
  check_k3s \
  check_workload_and_alertchain \
  check_public_routes \
  check_schedules \
  summary
do
  "$stage"
done