fix(ops): harden cold-start schedule recovery
This commit is contained in:
@@ -108,7 +108,9 @@ jobs:
|
||||
mkdir -p ~/.ssh
|
||||
echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
|
||||
chmod 600 ~/.ssh/deploy_key
|
||||
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 << SECRETS
|
||||
# 2026-05-05 Codex: kubectl runs on 120 control-plane. 121 is a
|
||||
# worker and its local kubeconfig points at 127.0.0.1:6443.
|
||||
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 << SECRETS
|
||||
set -e
|
||||
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
|
||||
|
||||
@@ -138,10 +140,10 @@ jobs:
|
||||
SSH_PRIVATE_KEY: ${{ secrets.DEPLOY_SSH_KEY }}
|
||||
run: |
|
||||
cat k8s/awoooi-dev/02-configmap.yaml | \
|
||||
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 \
|
||||
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 \
|
||||
"export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -"
|
||||
|
||||
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 << 'DEPLOY'
|
||||
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.120 << 'DEPLOY'
|
||||
set -e
|
||||
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
|
||||
|
||||
|
||||
@@ -406,8 +406,11 @@ jobs:
|
||||
mkdir -p ~/.ssh
|
||||
echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
|
||||
chmod 600 ~/.ssh/deploy_key
|
||||
ssh-keyscan 192.168.0.121 >> ~/.ssh/known_hosts 2>/dev/null
|
||||
ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 << SECRETS
|
||||
# 2026-05-05 Codex: kubectl must run on the 120 control-plane.
|
||||
# 121 is a worker after cold-start recovery; its kubeconfig points at
|
||||
# 127.0.0.1:6443 and fails ADR-035 secret patching.
|
||||
ssh-keyscan 192.168.0.120 >> ~/.ssh/known_hosts 2>/dev/null
|
||||
ssh -i ~/.ssh/deploy_key wooo@192.168.0.120 << SECRETS
|
||||
set -e
|
||||
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
|
||||
|
||||
@@ -634,19 +637,21 @@ jobs:
|
||||
mkdir -p ~/.ssh
|
||||
echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
|
||||
chmod 600 ~/.ssh/deploy_key
|
||||
ssh-keyscan 192.168.0.121 >> ~/.ssh/known_hosts 2>/dev/null
|
||||
# 2026-05-05 Codex: deploy-side kubectl/ArgoCD operations run on 120
|
||||
# control-plane, not 121 worker.
|
||||
ssh-keyscan 192.168.0.120 >> ~/.ssh/known_hosts 2>/dev/null
|
||||
|
||||
IMAGE_TAG="${{ github.sha }}"
|
||||
HARBOR=192.168.0.110:5000
|
||||
|
||||
# ─── Step 1: Apply ConfigMap + ServiceRegistry (ArgoCD 管的是 Deployment,ConfigMap 仍直接 apply) ───
|
||||
cat k8s/awoooi-prod/04-configmap.yaml | \
|
||||
ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 \
|
||||
ssh -i ~/.ssh/deploy_key wooo@192.168.0.120 \
|
||||
"export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -"
|
||||
echo "✅ ConfigMap 已更新"
|
||||
|
||||
cat k8s/awoooi-prod/15-service-registry-configmap.yaml | \
|
||||
ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 \
|
||||
ssh -i ~/.ssh/deploy_key wooo@192.168.0.120 \
|
||||
"export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -"
|
||||
echo "✅ Service Registry ConfigMap 已更新"
|
||||
|
||||
@@ -688,7 +693,7 @@ jobs:
|
||||
}
|
||||
|
||||
# ─── Step 4: 等待 ArgoCD sync + rollout ───
|
||||
ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 \
|
||||
ssh -i ~/.ssh/deploy_key wooo@192.168.0.120 \
|
||||
"EXPECTED_REVISION='${DEPLOY_REVISION}' bash -s" << 'ARGOCD_WAIT'
|
||||
set -e
|
||||
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
|
||||
@@ -814,7 +819,7 @@ jobs:
|
||||
- name: Alert Chain Smoke Test
|
||||
id: alert_chain_smoke
|
||||
run: |
|
||||
# 2026-04-05 Claude Code: 使用真實 API 地址(192.168.0.121:32334 NodePort)
|
||||
# 2026-05-05 Codex: use the keepalived VIP instead of a fixed node.
|
||||
# Host runner launches the CI image explicitly to avoid act RWLayer=nil.
|
||||
if docker run --rm \
|
||||
--name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-alert-smoke" \
|
||||
@@ -824,7 +829,7 @@ jobs:
|
||||
-v awoooi-api-venv-cache:/opt/api-venv \
|
||||
-w /workspace \
|
||||
"${{ env.CI_IMAGE }}" \
|
||||
bash -lc 'source /opt/api-venv/bin/activate && python3 scripts/alert_chain_smoke_test.py --api-url http://192.168.0.121:32334 --json | tee /tmp/alert_chain_result.json'; then
|
||||
bash -lc 'source /opt/api-venv/bin/activate && python3 scripts/alert_chain_smoke_test.py --api-url http://192.168.0.125:32334 --json | tee /tmp/alert_chain_result.json'; then
|
||||
echo "alert_chain_status=pass" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "alert_chain_status=fail" >> $GITHUB_OUTPUT
|
||||
|
||||
@@ -6,8 +6,9 @@
|
||||
-- bge-m3 產生 1024 維向量,現有 schema vector(768) 不相容,INSERT 會直接失敗
|
||||
--
|
||||
-- 影響範圍:
|
||||
-- 1. rag_chunks.embedding vector(768) → vector(1024)
|
||||
-- 2. playbook_embeddings.embedding vector(768) → vector(1024)
|
||||
-- 1. knowledge_entries.embedding vector(768) → vector(1024)
|
||||
-- 2. rag_chunks.embedding vector(768) → vector(1024)
|
||||
-- 3. playbook_embeddings.embedding vector(768) → vector(1024)
|
||||
--
|
||||
-- 遷移策略:清空現有向量資料,切換維度後由 re-embed script 重新嵌入
|
||||
-- 現有向量資料若要保留,需先 dump 用 nomic 格式備份(舊維度無法轉換)
|
||||
@@ -21,7 +22,24 @@
|
||||
|
||||
BEGIN;
|
||||
|
||||
-- 1. rag_chunks:清空向量資料,變更欄位維度
|
||||
-- 1. knowledge_entries:備份舊向量並清空,變更欄位維度
|
||||
CREATE TABLE IF NOT EXISTS knowledge_entries_embedding_backup_20260505 AS
|
||||
SELECT
|
||||
id,
|
||||
embedding::text AS embedding_768,
|
||||
NOW() AS backed_up_at
|
||||
FROM knowledge_entries
|
||||
WHERE embedding IS NOT NULL;
|
||||
|
||||
ALTER TABLE knowledge_entries
|
||||
ALTER COLUMN embedding TYPE vector(1024)
|
||||
USING NULL; -- 清空現有 768 維向量(維度不可轉換)
|
||||
|
||||
COMMENT ON COLUMN knowledge_entries.embedding IS
|
||||
'bge-m3:latest 1024 維向量 — 遷移自 nomic-embed-text 768 維 (2026-05-05 ADR-110 follow-up)';
|
||||
|
||||
|
||||
-- 2. rag_chunks:清空向量資料,變更欄位維度
|
||||
-- ivfflat index 必須先 DROP 才能 ALTER COLUMN
|
||||
DROP INDEX IF EXISTS idx_rag_chunks_embedding;
|
||||
|
||||
@@ -39,7 +57,7 @@ COMMENT ON COLUMN rag_chunks.embedding IS
|
||||
'bge-m3:latest 1024 維向量 — 遷移自 nomic-embed-text 768 維 (2026-05-04 ADR-110)';
|
||||
|
||||
|
||||
-- 2. playbook_embeddings:清空向量資料,變更欄位維度
|
||||
-- 3. playbook_embeddings:清空向量資料,變更欄位維度
|
||||
DROP INDEX IF EXISTS ix_playbook_embeddings_vec;
|
||||
|
||||
ALTER TABLE playbook_embeddings
|
||||
@@ -61,9 +79,15 @@ COMMENT ON TABLE playbook_embeddings IS
|
||||
-- 4. 驗證遷移結果
|
||||
DO $$
|
||||
DECLARE
|
||||
v_km_dim integer;
|
||||
v_rag_dim integer;
|
||||
v_pb_dim integer;
|
||||
BEGIN
|
||||
SELECT atttypmod INTO v_km_dim
|
||||
FROM pg_attribute
|
||||
JOIN pg_class ON attrelid = pg_class.oid
|
||||
WHERE relname = 'knowledge_entries' AND attname = 'embedding';
|
||||
|
||||
SELECT atttypmod INTO v_rag_dim
|
||||
FROM pg_attribute
|
||||
JOIN pg_class ON attrelid = pg_class.oid
|
||||
@@ -74,15 +98,18 @@ BEGIN
|
||||
JOIN pg_class ON attrelid = pg_class.oid
|
||||
WHERE relname = 'playbook_embeddings' AND attname = 'embedding';
|
||||
|
||||
-- atttypmod for vector(1024) = 1024 + 1 = 1025
|
||||
IF v_rag_dim != 1025 THEN
|
||||
RAISE EXCEPTION 'rag_chunks.embedding 維度驗證失敗:expected 1025, got %', v_rag_dim;
|
||||
-- pgvector atttypmod stores the configured dimension.
|
||||
IF v_km_dim != 1024 THEN
|
||||
RAISE EXCEPTION 'knowledge_entries.embedding 維度驗證失敗:expected 1024, got %', v_km_dim;
|
||||
END IF;
|
||||
IF v_pb_dim != 1025 THEN
|
||||
RAISE EXCEPTION 'playbook_embeddings.embedding 維度驗證失敗:expected 1025, got %', v_pb_dim;
|
||||
IF v_rag_dim != 1024 THEN
|
||||
RAISE EXCEPTION 'rag_chunks.embedding 維度驗證失敗:expected 1024, got %', v_rag_dim;
|
||||
END IF;
|
||||
IF v_pb_dim != 1024 THEN
|
||||
RAISE EXCEPTION 'playbook_embeddings.embedding 維度驗證失敗:expected 1024, got %', v_pb_dim;
|
||||
END IF;
|
||||
|
||||
RAISE NOTICE '✅ embedding 遷移驗證通過:rag_chunks 和 playbook_embeddings 均為 vector(1024)';
|
||||
RAISE NOTICE '✅ embedding 遷移驗證通過:knowledge_entries、rag_chunks、playbook_embeddings 均為 vector(1024)';
|
||||
END $$;
|
||||
|
||||
COMMIT;
|
||||
|
||||
@@ -34,8 +34,12 @@ logger = structlog.get_logger(__name__)
|
||||
# 台北時區
|
||||
TZ_TAIPEI = ZoneInfo("Asia/Taipei")
|
||||
|
||||
# Prometheus 端點
|
||||
PROMETHEUS_URL = "http://192.168.0.121:30090"
|
||||
# Prometheus endpoint.
|
||||
#
|
||||
# 2026-05-05 Codex: do not pin this report job to a K3s worker NodePort.
|
||||
# Production already injects PROMETHEUS_URL from ConfigMap, currently the
|
||||
# Docker Prometheus on 110. This keeps reboot recovery independent of 121.
|
||||
PROMETHEUS_URL = settings.PROMETHEUS_URL.rstrip("/")
|
||||
|
||||
# kube-state-metrics 查詢
|
||||
PROM_QUERIES = {
|
||||
@@ -215,7 +219,7 @@ class K3sMonitorService:
|
||||
|
||||
# 發送訊息
|
||||
formatted = status.format()
|
||||
result = await gateway.send_message(formatted)
|
||||
result = await gateway.send_text(formatted)
|
||||
|
||||
if result:
|
||||
logger.info("k3s_daily_report_sent", date=status.report_date)
|
||||
|
||||
@@ -244,7 +244,7 @@ class WeeklyReportService:
|
||||
|
||||
# 發送訊息
|
||||
formatted = report.format()
|
||||
result = await gateway.send_message(formatted)
|
||||
result = await gateway.send_text(formatted)
|
||||
|
||||
if result:
|
||||
logger.info("weekly_report_sent", week=report.week_range)
|
||||
|
||||
@@ -6,6 +6,38 @@
|
||||
|
||||
---
|
||||
|
||||
## 2026-05-05 | 重開機後排程與 startup baseline 修復
|
||||
|
||||
**背景**:四台主機非預期重開機後,統帥要求確認所有服務、網站、工具、資料庫與排程都能正常恢復,不能只看容器 `healthy`。
|
||||
|
||||
**本次排程/啟動鏈修補**:
|
||||
- 120/121 K3s 回到 Ready;CD workflow 目標從 121 改為 120,避免 121 worker kubeconfig `127.0.0.1:6443` 造成 Secrets patch 失敗;120 已驗證 limited sudo kubectl 可用。
|
||||
- K8s CronJob 修正:`k3s-status-report`、`weekly-report`、`km-vectorize` 改用存在的 service account、live API image、cluster service DNS;手動 job 驗證 drift/k3s/weekly 可完成,歷史 failed jobs 已清掉。
|
||||
- KM embedding schema 從 768/錯誤 typmod 修為 `vector(1024)`;原 embedding 已備份到 `knowledge_entries_embedding_backup_20260505`,正在以 `bge-m3:latest` 重建。
|
||||
- 188 momo backup script 修正 quote/validation/Telegram optional/error cleanup;成功產出 `/home/ollama/momo_backups/momo_analytics_20260505_212032.sql.gz`。
|
||||
- 188 `backup-from-110.sh` 因 SSH config 權限錯誤導致 `HostBackupFailed`;修正 `.ssh/config` 權限與 110 identity 設定後,以低優先權手動備份成功,Prometheus `backup_110_last_success_timestamp` 已更新。
|
||||
- 188 momo-scheduler 修正 dashboard URL:容器內改打 `http://momo-pro-system`,不再打 `127.0.0.1:5000`。
|
||||
- 188 Google Drive token 從 legacy pickle 轉為 JSON,scheduler 容器內 `GoogleDriveService().authenticate()` 通過。
|
||||
- 188 daily sales import 修正 Excel sheet 選擇,優先讀 `即時業績明細`;手動匯入成功 `19934` 筆,日期 `2026-04-01 ~ 2026-05-03`。
|
||||
- 188 import 尾端驗證修正:改比對本次匯入日期範圍,不再用全表筆數硬比;`daily_sales_snapshot` 與 `realtime_sales_monthly` 在該日期範圍皆 `19934` 筆且驗證通過。
|
||||
- 110 startup 修復:移除 `/etc/sysctl.conf` 中誤寫的非法敏感純文字行;`systemd-sysctl` 恢復成功。
|
||||
- 110 停用兩個過期 startup units:`momo-startup-complete.service`(指向不存在路徑/錯 host)與 `wooo-staggered-startup.service`(舊 GitLab 延遲啟動且會增加重開機負載)。
|
||||
- 110 `awoooi-startup-110.service` timeout 從 5 分鐘延長到 15 分鐘,重跑後 `ActiveState=active`、`SubState=exited`、`Result=success`,`systemctl --failed` 為 0。
|
||||
- 110 certbot timer 失敗追查:`grist.wooo.work` / `registry.wooo.work` public route 目前被導向 `aiops.wooo.work`,HTTP-01 無法從 110 成功;已將兩個 stale renewal config 移至 `/etc/letsencrypt/renewal-disabled-codex-*`,並 reset certbot failed state。憑證 archive 未刪除;後續需修 public route 或改 DNS-01。
|
||||
- `scripts/reboot-recovery/full-stack-cold-start-check.sh` 新增 `P2-SCHEDULES`,覆蓋 188/110/120/121 cron、textfile mtime、188 backup freshness、110 failed units、K8s CronJob/Job/Pod 狀態、121 DR drill cron。
|
||||
- `docs/runbooks/FULL-STACK-COLD-START-SOP.md` 新增排程驗證章節與 done criteria,要求排程真正可執行才算 reboot recovery 完成。
|
||||
|
||||
**最終驗證**:
|
||||
- KM reembed 完成:`1774/1774` success、`0` failed;DB 目前 `knowledge_entries` total `1785`、embedded `1776`、vector dims `1024..1024`,舊 embedding backup `1691` rows。
|
||||
- 手動 `km-vectorize` CronJob `km-vectorize-codex-220715` 完成,回 `embed-all: 200 {"total":0,"success":0,"failed":0}`。
|
||||
- `bash scripts/reboot-recovery/full-stack-cold-start-check.sh --send-alert-test` → `PASS=50 WARN=0 BLOCKED=0`,包含 Alertmanager webhook E2E、public routes、cron/CronJob/textfile/systemd schedule checks。
|
||||
- Prometheus firing alerts 已從 `HostBackupFailed + FlywheelExecutionRateMissing` 收斂為僅剩 `FlywheelExecutionRateMissing`;HostBackupFailed 解除。
|
||||
- 188/110 負載回到低檔;K3s node CPU 約 3-6%,KM reembed 未造成主機過載。
|
||||
|
||||
**下一步**:
|
||||
- 將本次 runtime hotfix 對應的 repo changes 走正式 deploy,避免下一版 image 覆蓋 hotfix。
|
||||
- 修 `grist.wooo.work` / `registry.wooo.work` public route 或改 DNS-01 renewal;目前舊 renewal config 已停用以避免 certbot timer 每次失敗。
|
||||
|
||||
## 2026-05-05 | 110 Sentry resource limits persistence gap closed
|
||||
|
||||
**背景**:110 guardrail 告警已清,但主機 load 仍有長尾;統帥擔心 Claude Code 只做 live `docker update`,重建後配置又失效。
|
||||
@@ -3066,3 +3098,42 @@ C1(evolver 加 YAML_RULE guard)+ C2(seeder SQL `AND status != 'deprecated'
|
||||
```bash
|
||||
psql $DATABASE_URL -f apps/api/migrations/cleanup_duplicate_deprecated_playbooks.sql
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 2026-05-05(台北)— 四主機重開機後全站冷啟動救援
|
||||
|
||||
**觸發**:110 / 120 / 121 / 188 同時重開機後,多數服務異常;統帥要求先恢復所有網站、主機、核心服務,並建立完整冷啟動 SOP。
|
||||
|
||||
### 已恢復
|
||||
|
||||
| 範圍 | 結果 |
|
||||
|------|------|
|
||||
| 188 host PostgreSQL | WAL checkpoint 損壞;已備份後 `pg_resetwal`,`k3s_datastore` `REINDEX` + `VACUUM ANALYZE` 完成 |
|
||||
| K3s datastore | 刪除並備份可重建的腐壞 HPA / VPA / VPA checkpoint / `mon1` node rows;120 / 121 重新 Ready |
|
||||
| AWOOOI prod | `awoooi-api` / `awoooi-web` / `awoooi-worker` Running;VIP `192.168.0.125` 內網驗證 API 200 / Web 307 |
|
||||
| mo.wooo.work | `momo-db` WAL redo 損壞;備份後 `pg_resetwal`,`momo-pro-system` / scheduler / bot / DB 全部 healthy;公網 `/` 200、`/health` 200 |
|
||||
| 110 host overload | actions runner units 維持最後放行;Sentry ClickHouse/Kafka 已從 dirty-reboot 損壞中恢復,Sentry stack healthy |
|
||||
| 188 SignOz | SignOz ClickHouse volume 出現 filesystem corruption;已 clean-clone 可讀資料並保留原始 corrupt volume,SignOz HTTP 恢復 |
|
||||
| 冷啟動 SOP | 新增 `docs/runbooks/FULL-STACK-COLD-START-SOP.md` 與 `scripts/reboot-recovery/full-stack-cold-start-check.sh` |
|
||||
|
||||
### 驗證
|
||||
|
||||
```bash
|
||||
bash scripts/reboot-recovery/full-stack-cold-start-check.sh --send-alert-test
|
||||
# PASS=31 WARN=0 BLOCKED=0
|
||||
# Result: GREEN. Full stack is ready for controlled runner/CD release.
|
||||
```
|
||||
|
||||
### Dirty reboot 資料保全
|
||||
|
||||
- 110 Sentry ClickHouse:原始壞 volume 保留為 `/var/lib/docker/volumes/sentry-clickhouse/_data.corrupt-20260505-203346`;以 clean-clone 恢復可讀資料並加 `force_restore_data`。
|
||||
- 110 Sentry Kafka:malformed checkpoint 已備份至 `/var/backups/sentry-kafka-checkpoints-20260505-203942`,只重建 checkpoint,不刪 topic/log data。
|
||||
- 188 SignOz ClickHouse:原始壞 volume 保留為 `/var/lib/docker/volumes/signoz-clickhouse/_data.corrupt-20260505-203735`;以 clean-clone 恢復可讀資料。
|
||||
- 188 `momo-db`:WAL reset 前備份 `/var/backups/postgresql/momo-db-before-pg-resetwal-20260505-200834.tgz`。
|
||||
|
||||
### 已知隔離 / 後續
|
||||
|
||||
- 110 actions runner units 仍按策略最後放行:guardrail 已套用,`CPUQuota=200%`、`MemoryMax=2G`、`WatchdogUSec=0`;需在 load/core 穩定後逐步開啟。
|
||||
- `Bad message` / `Structure needs cleaning` 是 host filesystem 層訊號;線上 clean-clone 已恢復服務,但完整歷史資料追溯需安排離線 `fsck` 或備份驗證。
|
||||
- `drift-scanner-29633040-qrf8w` 為單次 CronJob Error,不阻斷主服務;後續可清理或調查。
|
||||
|
||||
497
docs/runbooks/FULL-STACK-COLD-START-SOP.md
Normal file
497
docs/runbooks/FULL-STACK-COLD-START-SOP.md
Normal file
@@ -0,0 +1,497 @@
|
||||
# AWOOOI Full-Stack Cold Start SOP
|
||||
|
||||
> Version: v1.0
|
||||
> Last updated: 2026-05-05 Asia/Taipei
|
||||
> Scope: 110 / 120 / 121 / 188 full-stack reboot recovery. 112 Kali is recorded as P3 optional and is not part of this recovery path.
|
||||
|
||||
---
|
||||
|
||||
## 0. When To Use This
|
||||
|
||||
Use this SOP when any of these happen:
|
||||
|
||||
- 110/120/121/188 reboot unexpectedly.
|
||||
- All services are abnormal after a power/network event.
|
||||
- K3s is stuck `activating`.
|
||||
- Host load remains high during startup and service health is mixed.
|
||||
- Monitoring, alerting, CD, AI auto-repair, and Docker Compose services disagree about the real state.
|
||||
|
||||
The rule is simple: **recover the dependency chain, not the loudest symptom.**
|
||||
|
||||
---
|
||||
|
||||
## 1. Golden Startup Order
|
||||
|
||||
```text
|
||||
0. Freeze automation and preserve evidence
|
||||
1. Physical/network layer
|
||||
2. 188 data layer
|
||||
3. 110 registry/observability layer
|
||||
4. 120/121 K3s layer
|
||||
5. AWOOOI workload layer
|
||||
6. Public routes and alert chain
|
||||
7. High-load batch/consumer/crawler services
|
||||
8. Runner/CD
|
||||
9. AI auto-remediation
|
||||
10. 112 Kali scanner, if needed
|
||||
```
|
||||
|
||||
Never start runner/CD before 188 PostgreSQL, 110 Harbor, K3s nodes, and AWOOOI API are healthy.
|
||||
|
||||
---
|
||||
|
||||
## 2. Automation Freeze
|
||||
|
||||
Cold start creates noisy metrics and partial failures. During P0/P1, keep automation in observe-only mode.
|
||||
|
||||
| Item | Cold-start policy | Reason |
|
||||
|------|-------------------|--------|
|
||||
| Gitea/GitHub runners | Last | Build jobs can saturate 110 CPU/RAM. |
|
||||
| momo-scheduler / crawlers | Last | Chrome and batch work can saturate 188. |
|
||||
| Sentry/Snuba consumers | Controlled | Kafka backlog and ClickHouse merge can create temporary high load. |
|
||||
| Alertmanager outbound notification | Gate | Avoid alert storms before API webhook and Telegram are verified. |
|
||||
| AI auto-repair | Observe-only | Metrics, Redis, KM, and playbooks may be incomplete. |
|
||||
| Stateful DB restart | Human approval | PostgreSQL, Redis, ClickHouse, Harbor DB, Sentry DB are not generic restart targets. |
|
||||
|
||||
---
|
||||
|
||||
## 3. P0 Evidence And Network
|
||||
|
||||
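Run from any machine on the same LAN. The `nc -G 3` connect-timeout flag below is the macOS/BSD netcat form; on a Linux host use `nc -w 3 -z` instead: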
|
||||
|
||||
```bash
|
||||
for h in 110 120 121 188; do
|
||||
ping -c 2 -W 2 192.168.0.$h >/dev/null && echo "PING_OK 192.168.0.$h" || echo "PING_FAIL 192.168.0.$h"
|
||||
done
|
||||
|
||||
arp -an | grep -E '192\.168\.0\.(110|120|121|188)'
|
||||
for h in 110 120 121 188; do
|
||||
nc -G 3 -z 192.168.0.$h 22 && echo "SSH_OK 192.168.0.$h" || echo "SSH_FAIL 192.168.0.$h"
|
||||
done
|
||||
```
|
||||
|
||||
Then capture reboot evidence:
|
||||
|
||||
```bash
|
||||
ssh ollama@192.168.0.188 'hostname; date; uptime; who -b; last -x reboot shutdown | head -20'
|
||||
ssh wooo@192.168.0.110 'hostname; date; uptime; who -b; last -x reboot shutdown | head -20'
|
||||
ssh wooo@192.168.0.120 'hostname; date; uptime; who -b; last -x reboot shutdown | head -20'
|
||||
ssh wooo@192.168.0.121 'hostname; date; uptime; who -b; last -x reboot shutdown | head -20'
|
||||
```
|
||||
|
||||
If any host has ARP `incomplete` or SSH port down, stop here and fix physical/network first.
|
||||
|
||||
---
|
||||
|
||||
## 4. P0 188 Data Layer
|
||||
|
||||
188 is the first real service dependency because K3s datastore and AWOOOI DB depend on PostgreSQL.
|
||||
|
||||
### 4.1 Startup order
|
||||
|
||||
1. `containerd`
|
||||
2. `docker`
|
||||
3. `postgresql@14-main`
|
||||
4. `k3s_datastore.kine` maintenance
|
||||
5. `redis-server` on `6380`
|
||||
6. `ollama` or current AI proxy dependencies
|
||||
7. `nginx`
|
||||
8. Docker networks
|
||||
9. MinIO / OpenClaw / SignOz
|
||||
10. momo / litellm / batch services after load is stable
|
||||
|
||||
### 4.2 Read-only check
|
||||
|
||||
```bash
|
||||
ssh ollama@192.168.0.188 '
|
||||
hostname; date; uptime; free -h
|
||||
systemctl is-active containerd docker postgresql@14-main redis-server ollama nginx || true
|
||||
pg_isready -h localhost -p 5432 || true
|
||||
redis-cli -p 6380 ping 2>/dev/null || redis-cli ping 2>/dev/null || true
|
||||
docker ps --format "{{.Names}}\t{{.Status}}\t{{.Ports}}" | head -120
|
||||
'
|
||||
```
|
||||
|
||||
### 4.3 PostgreSQL WAL checkpoint damage
|
||||
|
||||
Signature:
|
||||
|
||||
```text
|
||||
PANIC: could not locate a valid checkpoint record
|
||||
invalid primary checkpoint record
|
||||
unexpected pageaddr ... in log segment ...
|
||||
```
|
||||
|
||||
This blocks:
|
||||
|
||||
- `188:5432`
|
||||
- K3s startup on 120/121
|
||||
- AWOOOI API DB access
|
||||
- Alertmanager webhook if API cannot start
|
||||
|
||||
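Human-approved last-resort recovery command on 188. `pg_resetwal -f` discards the write-ahead log, so recently committed transactions can be lost and the data may be left inconsistent. Confirm the `tar` backup finished without errors before running `pg_resetwal`, and validate the databases afterwards: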
|
||||
|
||||
```bash
|
||||
sudo systemctl stop postgresql@14-main
|
||||
sudo install -d -m 700 -o postgres -g postgres /var/backups/postgresql
|
||||
sudo tar -C /var/lib/postgresql/14 -czf /var/backups/postgresql/14-main-before-pg-resetwal-$(date +%Y%m%d-%H%M%S).tgz main
|
||||
sudo -u postgres /usr/lib/postgresql/14/bin/pg_resetwal -f /var/lib/postgresql/14/main
|
||||
sudo systemctl start postgresql@14-main
|
||||
pg_isready -h localhost -p 5432
|
||||
sudo -u postgres psql -d k3s_datastore -c "VACUUM ANALYZE kine;"
|
||||
```
|
||||
|
||||
Do not run `DROP`, reinitialize the cluster, delete `/var/lib/postgresql`, or restore an old backup unless the commander explicitly approves it.
|
||||
|
||||
---
|
||||
|
||||
## 5. P0/P1 110 Registry And Observability
|
||||
|
||||
110 must recover Harbor/Gitea/Monitoring early, but runners last.
|
||||
|
||||
### 5.1 Startup order
|
||||
|
||||
1. `docker`
|
||||
2. Remove `Exited (128)` / `Exited (137)` orphan containers
|
||||
3. Harbor `harbor-log`
|
||||
4. Harbor full stack
|
||||
5. Gitea
|
||||
6. Prometheus / Alertmanager / Grafana / exporters
|
||||
7. Langfuse
|
||||
8. SignOz
|
||||
9. Sentry DB layer
|
||||
10. Sentry web/worker/consumer layer
|
||||
11. Gitea host runner and actions runners
|
||||
|
||||
### 5.2 Checks
|
||||
|
||||
```bash
|
||||
ssh wooo@192.168.0.110 '
|
||||
hostname; date; uptime; free -h
|
||||
systemctl is-active docker || true
|
||||
curl -s -o /dev/null -w "harbor=%{http_code}\n" --max-time 5 http://127.0.0.1:5000/v2/ || true
|
||||
curl -s -o /dev/null -w "gitea=%{http_code}\n" --max-time 5 http://127.0.0.1:3001/ || true
|
||||
curl -s --max-time 5 http://127.0.0.1:9090/-/ready || true
|
||||
curl -s --max-time 5 http://127.0.0.1:9093/-/healthy || true
|
||||
curl -s -o /dev/null -w "sentry=%{http_code}\n" --max-time 10 http://127.0.0.1:9000/ || true
|
||||
docker ps --format "{{.Names}}\t{{.Status}}" | head -120
|
||||
'
|
||||
```
|
||||
|
||||
Harbor healthy means `/v2/` returns `200` or `401`. Do not treat `401` as failure.
|
||||
|
||||
### 5.3 Runner gate
|
||||
|
||||
Runner may start only after all are true:
|
||||
|
||||
- `188 PostgreSQL` ready
|
||||
- `110 Harbor` ready
|
||||
- `110 Gitea` ready
|
||||
- `120/121 K3s` nodes ready
|
||||
- AWOOOI API health passes
|
||||
- 110 load/core is below `1.0` for at least 15 minutes
|
||||
- runner systemd guardrails are active: `CPUQuota=200%`, `MemoryMax=2G`, `WatchdogUSec=0`
|
||||
|
||||
Check (`systemctl show` reports `CPUQuota=200%` as `CPUQuotaPerSecUSec=2s`):
|
||||
|
||||
```bash
|
||||
ssh wooo@192.168.0.110 '
|
||||
for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain | awk "{print \$1}"); do
|
||||
echo "=== $u ==="
|
||||
systemctl show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts
|
||||
done
|
||||
'
|
||||
```
|
||||
|
||||
If `WatchdogUSec` is not `0`, apply the guardrail script manually with sudo:
|
||||
|
||||
```bash
|
||||
sudo /home/wooo/scripts/apply-runner-systemd-guardrails.sh --apply
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 6. P1 120/121 K3s
|
||||
|
||||
K3s must wait for 188 PostgreSQL and 110 Harbor.
|
||||
|
||||
### 6.1 Startup order
|
||||
|
||||
1. 120 `k3s.service`
|
||||
2. 121 `k3s-agent.service` or its live role
|
||||
3. CNI / kube-proxy
|
||||
4. Nodes Ready
|
||||
5. Core pods
|
||||
6. `awoooi-prod` pods
|
||||
7. keepalived VIP `192.168.0.125`
|
||||
8. NodePorts `32334` and `32335`
|
||||
|
||||
### 6.2 Checks
|
||||
|
||||
```bash
|
||||
ssh wooo@192.168.0.120 '
|
||||
hostname; uptime
|
||||
pg_isready -h 192.168.0.188 -p 5432 || true
|
||||
systemctl is-active k3s k3s-agent keepalived 2>/dev/null || true
|
||||
kubectl get nodes -o wide 2>/dev/null || true
|
||||
kubectl get pods -A 2>/dev/null | grep -v -E "Running|Completed" || true
|
||||
kubectl get pods -n awoooi-prod -o wide 2>/dev/null || true
|
||||
ip addr show | grep 192.168.0.125 || true
|
||||
'
|
||||
|
||||
ssh wooo@192.168.0.121 '
|
||||
hostname; uptime
|
||||
systemctl is-active k3s k3s-agent keepalived 2>/dev/null || true
|
||||
ip addr show | grep 192.168.0.125 || true
|
||||
'
|
||||
```
|
||||
|
||||
If K3s is `activating` while 188 PostgreSQL is down, fix PostgreSQL first. Restarting K3s repeatedly will not solve it.
|
||||
|
||||
---
|
||||
|
||||
## 7. P2 AWOOOI Workloads
|
||||
|
||||
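Run after K3s nodes are Ready (if plain `kubectl` is denied for `wooo` on 120, use `sudo kubectl` as in the schedule checks in section 9):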
|
||||
|
||||
```bash
|
||||
ssh wooo@192.168.0.120 '
|
||||
kubectl get deploy -n awoooi-prod
|
||||
kubectl get pods -n awoooi-prod -o wide
|
||||
kubectl get svc -n awoooi-prod
|
||||
kubectl get events -n awoooi-prod --sort-by=.lastTimestamp | tail -40
|
||||
'
|
||||
|
||||
curl -s --max-time 8 http://192.168.0.125:32334/api/v1/health
|
||||
curl -s -o /dev/null -w "web=%{http_code}\n" --max-time 8 http://192.168.0.125:32335/
|
||||
```
|
||||
|
||||
If pods are `ImagePullBackOff`, go back to 110 Harbor.
|
||||
|
||||
If API health fails because DB/Redis is down, go back to 188.
|
||||
|
||||
---
|
||||
|
||||
## 8. P2 Alert Chain
|
||||
|
||||
Current main path:
|
||||
|
||||
```text
|
||||
Prometheus/Alertmanager on 110
|
||||
-> http://192.168.0.125:32334/api/v1/webhooks/alertmanager
|
||||
-> AWOOOI API
|
||||
-> TelegramGateway
|
||||
-> Telegram
|
||||
```
|
||||
|
||||
Alertmanager health alone is not enough. Run E2E:
|
||||
|
||||
```bash
|
||||
curl -s -X POST http://192.168.0.125:32334/api/v1/webhooks/alertmanager \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{"receiver":"cold-start-test","status":"firing","alerts":[{"status":"firing","labels":{"alertname":"ColdStartE2ETest","severity":"info"},"annotations":{"summary":"Cold start E2E test, ignore"},"startsAt":"2026-05-05T11:00:00Z","endsAt":"0001-01-01T00:00:00Z","generatorURL":""}],"groupLabels":{},"commonLabels":{},"commonAnnotations":{},"externalURL":"","version":"4","groupKey":"cold-start-test"}'
|
||||
```
|
||||
|
||||
Expected: API returns success and Telegram receives the test alert.
|
||||
|
||||
---
|
||||
|
||||
## 9. P2 Schedules And Delayed Work
|
||||
|
||||
Do not mark the reboot complete until scheduled work is proven runnable. A container can be healthy while its cron path is broken.
|
||||
|
||||
| Host / Layer | Required check | Success baseline |
|
||||
|--------------|----------------|------------------|
|
||||
| 188 cron | `systemctl is-active cron` and `crontab -l` | cron active; backup, restart exporter, stats exporter entries present |
|
||||
| 188 backup-from-110 | `backup_110_last_success_timestamp` in textfile/Prometheus | last success age `< 25h` |
|
||||
| 188 momo-scheduler | `docker logs momo-scheduler` | `全部排程任務已註冊`; Google Drive auth works; dashboard URLs use container-reachable hostnames |
|
||||
| 188 momo import | manual `run_auto_import_task()` after parser changes | selected sheet is `即時業績明細`; imported date range has matching rows in `daily_sales_snapshot` and `realtime_sales_monthly` |
|
||||
| 110 cron | `systemctl is-active cron` | cron active; Docker/systemd textfile exporters fresh |
|
||||
| 110 startup units | `systemctl --failed` | zero failed units; stale `momo-startup-complete` and `wooo-staggered-startup` disabled |
|
||||
| 120 K8s CronJobs | `kubectl get cronjobs -n awoooi-prod` | unsuspended; no failed Jobs remain after current validation |
|
||||
| 121 DR drill | `crontab -l` | DR drill cron present unless explicitly paused |
|
||||
|
||||
Useful checks:
|
||||
|
||||
```bash
|
||||
ssh ollama@192.168.0.188 'systemctl is-active cron; crontab -l; ls -l /home/ollama/node_exporter_textfiles/*.prom'
|
||||
ssh wooo@192.168.0.110 'systemctl --failed --no-pager; systemctl is-active cron; crontab -l'
|
||||
ssh wooo@192.168.0.120 'sudo kubectl get cronjobs,jobs -n awoooi-prod'
|
||||
ssh wooo@192.168.0.121 'systemctl is-active cron; crontab -l'
|
||||
```
|
||||
|
||||
If a schedule succeeds but emits a false verification alert, fix the verification rule before releasing AI auto-remediation. False positives train operators to ignore real alarms.
|
||||
|
||||
---
|
||||
|
||||
## 10. P2/P3 Stateful Service Guardrails
|
||||
|
||||
| Tier | Examples | Automation |
|
||||
|------|----------|------------|
|
||||
| BLOCK | PostgreSQL data dir, ClickHouse data dir, Harbor DB, Sentry DB | No automatic destructive action. Human approval only. |
|
||||
| CRITICAL_HITL | Redis, Kafka, MinIO, SignOz ClickHouse, Sentry ClickHouse | Human-in-the-loop restart/repair. |
|
||||
| STANDARD_HITL | API/Web/worker, OpenClaw, litellm | Restart only with evidence and blast-radius check. |
|
||||
| AUTO | Stateless exporters, blackbox, nginx exporter | Auto restart allowed after verification. |
|
||||
|
||||
Never use generic `docker restart $(docker ps -q)` during cold start.
|
||||
|
||||
### 10.1 Dirty-Reboot Storage Corruption
|
||||
|
||||
Treat these log signatures as storage corruption, not ordinary service flakiness:
|
||||
|
||||
- `Bad message`
|
||||
- `Structure needs cleaning`
|
||||
- `Unknown codec`
|
||||
- `PANIC: could not locate a valid checkpoint record`
|
||||
- Kafka `Malformed line` in checkpoint files
|
||||
- ClickHouse `broken and needs manual correction`
|
||||
|
||||
Cold-start automation may stop a restart storm and collect evidence, but it must not delete the original data directory. If a filesystem returns `Bad message` or `Structure needs cleaning`, the real root cause is below the container layer. Online recovery can restore service from readable data, but complete historical recovery requires an offline filesystem check or backup restore.
|
||||
|
||||
### 10.2 ClickHouse Clean-Clone Recovery Pattern
|
||||
|
||||
Use this pattern for Sentry ClickHouse or SignOz ClickHouse when individual corrupted parts cannot be moved because the host filesystem rejects reads.
|
||||
|
||||
```text
|
||||
1. Stop the compose stack or at least stop dependent consumers.
|
||||
2. Disable restart loops for the failing container.
|
||||
3. Save logs and build an exclude list from unreadable store paths.
|
||||
4. Preserve the original volume as _data.corrupt-YYYYMMDD-HHMMSS.
|
||||
5. Create a clean _data clone with readable files only.
|
||||
6. Add flags/force_restore_data.
|
||||
7. Start ClickHouse first, then web/API, then consumers.
|
||||
8. Verify HTTP, merge backlog, and restart count before releasing high-load services.
|
||||
```
|
||||
|
||||
Do not replace this with `rm -rf store/...` unless the unreadable path is already backed up or the commander explicitly accepts data loss. The preferred incident artifact is:
|
||||
|
||||
```text
|
||||
/var/lib/docker/volumes/<volume>/_data.corrupt-YYYYMMDD-HHMMSS
|
||||
/var/backups/<service>-<component>-YYYYMMDD-HHMMSS
|
||||
```
|
||||
|
||||
### 10.3 Kafka Checkpoint Recovery Pattern
|
||||
|
||||
If Kafka refuses to start with malformed checkpoint files after a dirty reboot, preserve and move only checkpoint files:
|
||||
|
||||
```text
|
||||
log-start-offset-checkpoint
|
||||
recovery-point-offset-checkpoint
|
||||
replication-offset-checkpoint
|
||||
```
|
||||
|
||||
Then start Kafka and confirm health before starting Snuba/Sentry consumers. Do not delete topic directories or Kafka logs during cold-start recovery.
|
||||
|
||||
---
|
||||
|
||||
## 11. P3 High-Load Services
|
||||
|
||||
Only release these after P0/P1/P2 gates are green:
|
||||
|
||||
| Host | Service | Release condition |
|
||||
|------|---------|-------------------|
|
||||
| 188 | momo-scheduler / crawler | load/core < 1.0 for 15 minutes and DB healthy |
|
||||
| 188 | SignOz ClickHouse | healthy and merge backlog trending down |
|
||||
| 188 | litellm | `/health/liveliness` good and provider route verified |
|
||||
| 110 | Sentry Snuba consumers | ClickHouse healthy and Kafka backlog decreasing |
|
||||
| 110 | Sentry uptime-checker | Sentry web/DB healthy |
|
||||
| 110 | runners | all previous gates green and load/core < 1.0 for 15 minutes |
|
||||
|
||||
---
|
||||
|
||||
## 12. Baseline And AI Auto-Remediation Gate
|
||||
|
||||
### 12.1 Stable Runtime Baseline
|
||||
|
||||
These are release gates after the first cold-start recovery pass:
|
||||
|
||||
| Area | Baseline |
|
||||
|------|----------|
|
||||
| 188 host | PostgreSQL accepting, Redis PONG, momo `/health` 200, SignOz HTTP reachable, load/core < 1.0 sustained before crawlers |
|
||||
| 110 host | Harbor `/v2/` 200/401, Gitea 200/302, Prometheus ready, Alertmanager healthy, Sentry HTTP 200/302/400, no ClickHouse/Kafka restart loop |
|
||||
| K3s | 120/121 nodes Ready, VIP `192.168.0.125` present, AWOOOI API 2xx/3xx, Web 2xx/3xx |
|
||||
| Public routes | `https://awoooi.wooo.work/api/v1/health` 2xx/3xx, `https://mo.wooo.work/health` 2xx/3xx |
|
||||
| Guardrails | Docker/systemd textfile exporters fresh, runner `CPUQuota=200%`, `MemoryMax=2G`, `WatchdogUSec=0` |
|
||||
| Schedules | cron active on 110/188/120/121; K8s CronJobs unsuspended; no current failed Jobs; 188 backup success `< 25h` |
|
||||
| Backlog | ClickHouse merges and Kafka/Snuba lag trending down, not increasing for two consecutive checks |
|
||||
|
||||
If service health is green but load average remains high, check live CPU and IO before changing memory limits. High load after Sentry/Snuba or ClickHouse startup can be backlog drain; high CPU from runners/builds/crawlers is a release-order problem.
|
||||
|
||||
### 12.2 AI Auto-Remediation Gate
|
||||
|
||||
AI auto-repair can move from observe-only to limited execution only after:
|
||||
|
||||
- Prometheus rules are loaded.
|
||||
- docker/systemd textfile exporter files are fresh.
|
||||
- blackbox probes have stable results.
|
||||
- cron/CronJob schedule checks are green.
|
||||
- AWOOOI API `/api/v1/health` passes.
|
||||
- Alertmanager E2E webhook passes.
|
||||
- Redis/KM/playbook health is available.
|
||||
- No active restart storm.
|
||||
- Host load/core remains below `1.0` for 15 minutes.
|
||||
|
||||
Until then:
|
||||
|
||||
- diagnose only
|
||||
- notify only
|
||||
- require human approval for remediation
|
||||
- no DB/ClickHouse/Harbor/Sentry destructive action
|
||||
- no generic restart action against stateful services
|
||||
|
||||
---
|
||||
|
||||
## 13. One-Command Readiness Script
|
||||
|
||||
Run:
|
||||
|
||||
```bash
|
||||
bash scripts/reboot-recovery/full-stack-cold-start-check.sh
|
||||
```
|
||||
|
||||
The script is read-only by default; it POSTs a synthetic Alertmanager test event only when run with `--send-alert-test`. It reports gates:
|
||||
|
||||
- `P0-NETWORK`
|
||||
- `P0-188-DATA`
|
||||
- `P0-110-REGISTRY-OBSERVABILITY`
|
||||
- `P1-K3S`
|
||||
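- `P2-WORKLOAD-ALERTCHAIN` (AWOOOI API/Web reachability plus the optional Alertmanager webhook test; the script prints both under this one section)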
|
||||
- `P2-PUBLIC-ROUTES`
|
||||
- `P2-SCHEDULES`
|
||||
- runner guardrail state inside `P0-110-REGISTRY-OBSERVABILITY`
|
||||
|
||||
If it prints `BLOCKED`, fix the first blocked gate before moving forward.
|
||||
|
||||
---
|
||||
|
||||
## 14. Done Criteria
|
||||
|
||||
All must be true:
|
||||
|
||||
- Four hosts reachable by SSH.
|
||||
- 188 PostgreSQL and Redis healthy.
|
||||
- 110 Harbor, Gitea, Prometheus, Alertmanager healthy.
|
||||
- 120/121 K3s nodes Ready.
|
||||
- VIP `192.168.0.125` present.
|
||||
- AWOOOI API and Web reachable through NodePort/VIP.
|
||||
- Alertmanager E2E webhook succeeds.
|
||||
- cron/CronJob schedules are active, unsuspended, and verified.
|
||||
- Sentry and SignOz are either healthy or explicitly in controlled backlog recovery.
|
||||
- High-load batch services are capped or delayed.
|
||||
- Runners are guarded and released last.
|
||||
- AI auto-remediation is not in full execution mode until all gates are green.
|
||||
|
||||
---
|
||||
|
||||
## 15. Known Drift To Fix After Recovery
|
||||
|
||||
These must be cleaned after the incident, not during P0:
|
||||
|
||||
- `SERVICE-ENDPOINTS.md` still has old Prometheus/Alertmanager locations.
|
||||
- Audit older docs for direct node webhook targets; current main path should be VIP `192.168.0.125:32334`.
|
||||
- OpenClaw `8088` vs `8089` must be live-confirmed and normalized.
|
||||
- 188 compose paths drift between `/home/ollama/*` and Ansible `/opt/*`.
|
||||
- 110 runner docs still mention Docker runner in places; live startup prefers host `gitea-act-runner-host.service`.
|
||||
- `scripts/setup-runner-watchdog.sh` conflicts with the 2026-05-05 runner watchdog disablement guardrail.
|
||||
- `grist.wooo.work` / `registry.wooo.work` public HTTP/HTTPS currently route to `aiops.wooo.work`; their old 110 certbot renewal configs are disabled until public routing is corrected or DNS-01 renewal is configured.
|
||||
@@ -42,8 +42,11 @@ spec:
|
||||
restartPolicy: OnFailure
|
||||
containers:
|
||||
- name: k3s-report
|
||||
image: 192.168.0.110:5000/awoooi-api:latest
|
||||
imagePullPolicy: Always
|
||||
# 2026-05-05 Codex: keep the API image placeholder so CD
|
||||
# injects the same immutable tag used by API/worker. The old
|
||||
# awoooi-api:latest repo returns 400 from Harbor after reboot.
|
||||
image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
|
||||
imagePullPolicy: IfNotPresent
|
||||
command:
|
||||
- python
|
||||
- -m
|
||||
@@ -63,5 +66,7 @@ spec:
|
||||
limits:
|
||||
cpu: "200m"
|
||||
memory: "128Mi"
|
||||
# 使用 API 的 ServiceAccount (需要 RBAC)
|
||||
serviceAccountName: awoooi-api
|
||||
# 2026-05-05 Codex: this report only calls Prometheus and Telegram.
|
||||
# The old awoooi-api ServiceAccount does not exist, which prevented
|
||||
# Job pods from being created after reboot.
|
||||
serviceAccountName: default
|
||||
|
||||
@@ -42,8 +42,11 @@ spec:
|
||||
restartPolicy: OnFailure
|
||||
containers:
|
||||
- name: weekly-report
|
||||
image: 192.168.0.110:5000/awoooi-api:latest
|
||||
imagePullPolicy: Always
|
||||
# 2026-05-05 Codex: keep the API image placeholder so CD
|
||||
# injects the same immutable tag used by API/worker. The old
|
||||
# awoooi-api:latest repo returns 400 from Harbor after reboot.
|
||||
image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
|
||||
imagePullPolicy: IfNotPresent
|
||||
command:
|
||||
- python
|
||||
- -m
|
||||
@@ -63,5 +66,7 @@ spec:
|
||||
limits:
|
||||
cpu: "500m"
|
||||
memory: "256Mi"
|
||||
# 使用 API 的 ServiceAccount (需要 RBAC)
|
||||
serviceAccountName: awoooi-api
|
||||
# 2026-05-05 Codex: this report only calls app services, Prometheus,
|
||||
# Git, and Telegram. The old awoooi-api ServiceAccount does not
|
||||
# exist, which prevented Job pods from being created after reboot.
|
||||
serviceAccountName: default
|
||||
|
||||
@@ -27,7 +27,10 @@ spec:
|
||||
jobTemplate:
|
||||
spec:
|
||||
backoffLimit: 2
|
||||
activeDeadlineSeconds: 300
|
||||
# 2026-05-05 Codex: allow post-reboot/post-migration catch-up batches.
|
||||
# The script now fails if the API reports failed rows, so this longer
|
||||
# deadline does not hide partial vectorization.
|
||||
activeDeadlineSeconds: 1800
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
@@ -37,8 +40,11 @@ spec:
|
||||
restartPolicy: OnFailure
|
||||
containers:
|
||||
- name: km-vectorize
|
||||
image: 192.168.0.110:5000/awoooi-api:latest
|
||||
imagePullPolicy: Always
|
||||
# 2026-05-05 Codex: keep the API image placeholder so CD
|
||||
# injects the same immutable tag used by API/worker. The old
|
||||
# awoooi-api:latest repo returns 400 from Harbor after reboot.
|
||||
image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
|
||||
imagePullPolicy: IfNotPresent
|
||||
command:
|
||||
- python
|
||||
- /app/scripts/cron_km_vectorize.py
|
||||
@@ -46,7 +52,9 @@ spec:
|
||||
- name: TZ
|
||||
value: "Asia/Taipei"
|
||||
- name: INTERNAL_API_URL
|
||||
value: "http://awoooi-api.awoooi-prod.svc.cluster.local:8000"
|
||||
# 2026-05-05 Codex: use the actual Service name; the old
|
||||
# awoooi-api DNS name does not exist in awoooi-prod.
|
||||
value: "http://awoooi-api-svc.awoooi-prod.svc.cluster.local:8000"
|
||||
resources:
|
||||
requests:
|
||||
cpu: "50m"
|
||||
@@ -54,4 +62,7 @@ spec:
|
||||
limits:
|
||||
cpu: "200m"
|
||||
memory: "128Mi"
|
||||
serviceAccountName: awoooi-api
|
||||
# 2026-05-05 Codex: this job only calls the internal API. The old
|
||||
# awoooi-api ServiceAccount does not exist, which prevented Job pods
|
||||
# from being created after reboot.
|
||||
serviceAccountName: default
|
||||
|
||||
@@ -63,10 +63,11 @@ spec:
|
||||
print(f"status={r.status_code} body={r.text[:200]}")
|
||||
asyncio.run(run())
|
||||
env:
|
||||
# 2026-04-09 Claude Sonnet 4.6: ClusterIP 和 DNS 在 Job Pod 均不可達
|
||||
# 改用 NodePort 直連 K3s worker node(同 K8s_API_SERVER_URL 解法)
|
||||
# 2026-05-05 Codex: call the in-cluster Service instead of a
|
||||
# fixed worker NodePort. After reboot, 121 can be unavailable
|
||||
# while the Service and VIP are already healthy.
|
||||
- name: INTERNAL_API_URL
|
||||
value: "http://192.168.0.121:32334"
|
||||
value: "http://awoooi-api-svc.awoooi-prod.svc.cluster.local:8000"
|
||||
- name: DRIFT_SCAN_NAMESPACES
|
||||
value: "awoooi-prod"
|
||||
resources:
|
||||
|
||||
@@ -18,17 +18,21 @@ import httpx
|
||||
async def main() -> int:
|
||||
api_base = os.environ.get(
|
||||
"INTERNAL_API_URL",
|
||||
"http://awoooi-api.awoooi-prod.svc.cluster.local:8000",
|
||||
"http://awoooi-api-svc.awoooi-prod.svc.cluster.local:8000",
|
||||
)
|
||||
url = f"{api_base}/api/v1/knowledge/embed-all"
|
||||
|
||||
async with httpx.AsyncClient(timeout=120) as client:
|
||||
async with httpx.AsyncClient(timeout=1800) as client:
|
||||
try:
|
||||
resp = await client.post(url)
|
||||
print(f"embed-all: {resp.status_code} {resp.text[:200]}")
|
||||
if resp.status_code >= 400:
|
||||
print(f"ERROR: embed-all returned {resp.status_code}", file=sys.stderr)
|
||||
return 1
|
||||
result = resp.json()
|
||||
if int(result.get("failed", 0)) > 0:
|
||||
print(f"ERROR: embed-all failed rows: {result}", file=sys.stderr)
|
||||
return 1
|
||||
return 0
|
||||
except httpx.RequestError as exc:
|
||||
print(f"ERROR: request failed — {exc}", file=sys.stderr)
|
||||
|
||||
398
scripts/reboot-recovery/full-stack-cold-start-check.sh
Executable file
398
scripts/reboot-recovery/full-stack-cold-start-check.sh
Executable file
@@ -0,0 +1,398 @@
|
||||
#!/usr/bin/env bash
# AWOOOI full-stack cold-start readiness check.
#
# Default mode only reads state: it never restarts, deletes, or repairs
# anything. The opt-in --send-alert-test flag additionally POSTs one synthetic
# Alertmanager event to the AWOOOI webhook.

# Note: -e is not set, so a failing command does not abort the script.
set -uo pipefail

# Options passed to every ssh invocation made through ssh_cmd.
SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout=6)
# 1 when --send-alert-test was given, 0 otherwise.
SEND_ALERT_TEST=0

# Flag parsing: --help prints usage and exits 0; unknown flags exit 64.
for arg; do
  case "$arg" in
    -h|--help)
      cat <<'USAGE'
Usage: bash scripts/reboot-recovery/full-stack-cold-start-check.sh [--send-alert-test]

Default mode is read-only and does not POST an Alertmanager test event.
Use --send-alert-test only after AWOOOI API is expected to be ready.
USAGE
      exit 0
      ;;
    --send-alert-test)
      SEND_ALERT_TEST=1
      ;;
    *)
      echo "Unknown argument: $arg" >&2
      exit 64
      ;;
  esac
done

# ANSI colour escapes used by the reporting helpers.
RED=$'\033[0;31m'
GREEN=$'\033[0;32m'
YELLOW=$'\033[1;33m'
BLUE=$'\033[0;34m'
NC=$'\033[0m'

# Running tallies of OK / WARN / BLOCKED results.
PASS=0
WARN=0
FAIL=0
|
||||
|
||||
# Print a section banner: a blank line, then "=== <title> ===" wrapped in the
# $BLUE / $NC colour codes.
#   $1  section title
log_section() {
  local title="$1"
  printf "\n%s=== %s ===%s\n" "$BLUE" "$title" "$NC"
}
|
||||
|
||||
# Record a passed check: print "OK <message>" (OK in $GREEN) and add one to
# the PASS tally.
#   $1  message describing the check
ok() {
  local message="$1"
  printf '%s\n' "${GREEN}OK${NC} ${message}"
  PASS=$((PASS + 1))
}
|
||||
|
||||
# Record a non-blocking problem: print "WARN <message>" (WARN in $YELLOW) and
# add one to the WARN tally.
#   $1  message describing the check
warn() {
  local message="$1"
  printf '%s\n' "${YELLOW}WARN${NC} ${message}"
  WARN=$((WARN + 1))
}
|
||||
|
||||
# Record a blocking failure: print "BLOCKED <message>" (BLOCKED in $RED) and
# add one to the FAIL tally.
#   $1  message describing the check
fail() {
  local message="$1"
  printf '%s\n' "${RED}BLOCKED${NC} ${message}"
  FAIL=$((FAIL + 1))
}
|
||||
|
||||
# Run a local command, capture its combined stdout/stderr, and record the
# outcome under <label>.
#
#   $1     label printed on the OK / BLOCKED line
#   $2...  command and arguments to execute
#
# Returns 0 when the command succeeded (counted via ok) and 1 otherwise
# (counted via fail). The captured output is printed after the verdict line.
run_local() {
  local label="$1"
  shift
  local capture status=0

  # Use a private mktemp file instead of a fixed /tmp name: a predictable path
  # collides between concurrent runs and can be pre-created by another user.
  if ! capture=$(mktemp "${TMPDIR:-/tmp}/awoooi-cold-start-check.XXXXXX"); then
    fail "$label"
    echo "could not create a temporary capture file; command not run"
    return 1
  fi

  "$@" >"$capture" 2>&1 || status=1

  if [ "$status" -eq 0 ]; then
    ok "$label"
  else
    fail "$label"
  fi
  cat "$capture"
  rm -f "$capture"
  return "$status"
}
|
||||
|
||||
# Run a command string on a remote host with the shared SSH_OPTS.
#
#   $1  user@host target
#   $2  command string executed by the remote shell
#
# When REMOTE_SUDO_PASSWORD is set locally, it is shell-quoted (%q) and
# prepended as an environment assignment so the remote script can read it.
# NOTE(review): the password thereby becomes part of the remote command line
# and may be visible in process listings on the target — confirm acceptable.
# Returns the exit status of ssh.
ssh_cmd() {
  local target="$1"
  local remote_script="$2"
  local env_prefix=""
  [ -z "${REMOTE_SUDO_PASSWORD:-}" ] || printf -v env_prefix 'REMOTE_SUDO_PASSWORD=%q ' "$REMOTE_SUDO_PASSWORD"
  ssh "${SSH_OPTS[@]}" "$target" "${env_prefix}${remote_script}"
}
|
||||
|
||||
# Print the HTTP status code returned for a URL (5-second curl timeout).
#
#   $1  URL to request
#
# curl errors are swallowed; if curl produced no output at all, "000" is
# printed instead.
probe_http_code() {
  local target="$1"
  local status
  status="$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "$target" 2>/dev/null || true)"
  if [ -z "$status" ]; then
    status=000
  fi
  echo "$status"
}
|
||||
|
||||
# Succeed when a TCP connect to host:port works.
#
#   $1  host
#   $2  port
#
# Tries `nc -G 3` first and falls back to `nc -w 3` when that attempt fails,
# so either nc flag style can be used. All nc output is discarded.
probe_tcp() {
  local target_host="$1"
  local target_port="$2"
  if nc -G 3 -z "$target_host" "$target_port" >/dev/null 2>&1; then
    return 0
  fi
  nc -w 3 -z "$target_host" "$target_port" >/dev/null 2>&1
}
|
||||
|
||||
# Print the report banner: title, current local timestamp, and the host scope.
print_header() {
  printf '%s\n' "AWOOOI full-stack cold-start check"
  date '+%Y-%m-%d %H:%M:%S %Z'
  printf '%s\n' "Scope: 110 / 120 / 121 / 188. 112 Kali is intentionally skipped."
}
|
||||
|
||||
# Gate P0-NETWORK: basic reachability of hosts 110, 120, 121 and 188.
#
# For each host, a failed ping or a closed SSH port (22) is BLOCKED. Then the
# local ARP table is inspected per host; hosts without an ARP row produce a
# single WARN that names them. (Previously the warning fired only when no host
# at all had a row, contradicting its "one or more hosts" wording.)
check_network() {
  log_section "P0-NETWORK"
  local host ip arp_table missing=""

  for host in 110 120 121 188; do
    ip="192.168.0.$host"

    # NOTE(review): the unit of ping -W differs between platforms (seconds on
    # Linux iputils, milliseconds on macOS/BSD) — confirm 2 is intended here.
    if ping -c 1 -W 2 "$ip" >/dev/null 2>&1; then
      ok "ping $ip"
    else
      fail "ping $ip"
    fi

    if probe_tcp "$ip" 22; then
      ok "ssh port $ip:22"
    else
      fail "ssh port $ip:22"
    fi
  done

  # Read the table once; a missing arp binary simply yields an empty table.
  arp_table=$(arp -an 2>/dev/null || true)
  for host in 110 120 121 188; do
    ip="192.168.0.$host"
    # Anchor on non-address characters so .110 cannot match inside a longer
    # number; matching rows are printed as evidence.
    if ! grep -E "(^|[^0-9.])${ip//./\\.}([^0-9]|\$)" <<<"$arp_table"; then
      missing="$missing $ip"
    fi
  done

  if [ -n "$missing" ]; then
    warn "no ARP rows printed for one or more hosts:$missing"
  fi
}
|
||||
|
||||
# Gate P0-188-DATA: inspect host 192.168.0.188 over SSH as ollama@.
#
# One remote script prints tagged lines (HOST, MEM, SYSTEMD, PG, REDIS,
# PORT5432, SIGNOZ_CODE, MOMO_HEALTH_CODE, DOCKER ...) which are echoed and
# then graded locally. PostgreSQL port/accepting failures are BLOCKED; Redis,
# momo-db restart state, SignOz and momo health are WARN only. If the SSH
# call itself fails, one BLOCKED is recorded and grading is skipped.
check_188() {
  log_section "P0-188-DATA"
  local out
  local remote_script='
    echo "HOST $(hostname) $(uptime)"
    echo "MEM $(free -h | awk "/Mem:/ {print \$2,\$3,\$7}")"
    echo "SYSTEMD $(systemctl is-active containerd docker postgresql@14-main redis-server ollama nginx 2>/dev/null | tr "\n" " ")"
    echo "PG $(pg_isready -h localhost -p 5432 2>&1)"
    echo "REDIS $(redis-cli -p 6380 ping 2>/dev/null || redis-cli ping 2>/dev/null || true)"
    echo "PORT5432 $(nc -z -w 2 127.0.0.1 5432 >/dev/null 2>&1 && echo OPEN || echo CLOSED)"
    echo "SIGNOZ_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:3301/ || true)"
    echo "MOMO_HEALTH_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:5003/health || true)"
    docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -80
  '

  if ! out=$(ssh_cmd "ollama@192.168.0.188" "$remote_script" 2>&1); then
    fail "ssh 188 read-only check"
    echo "$out"
    return
  fi
  echo "$out"

  if grep -q "PORT5432 OPEN" <<<"$out"; then
    ok "188 PostgreSQL port open"
  else
    fail "188 PostgreSQL port closed"
  fi

  if grep -q "accepting connections" <<<"$out"; then
    ok "188 PostgreSQL accepting connections"
  else
    fail "188 PostgreSQL not accepting connections"
  fi

  if grep -q "REDIS PONG" <<<"$out"; then
    ok "188 Redis PONG"
  else
    warn "188 Redis not confirmed"
  fi

  # Inverted check: a match here is the bad case.
  if grep -q "momo-db.*Restarting" <<<"$out"; then
    warn "188 momo-db restarting"
  else
    ok "188 momo-db not in visible restart loop"
  fi

  if grep -Eq "SIGNOZ_CODE (200|302|307)" <<<"$out"; then
    ok "188 SignOz HTTP reachable"
  else
    warn "188 SignOz HTTP not confirmed"
  fi

  if grep -q "MOMO_HEALTH_CODE 200" <<<"$out"; then
    ok "188 momo health reachable"
  else
    warn "188 momo health not confirmed"
  fi
}
|
||||
|
||||
# Gate P0-110-REGISTRY-OBSERVABILITY: inspect host 192.168.0.110 as wooo@.
#
# One remote script prints HTTP codes for Harbor, Gitea, Prometheus,
# Alertmanager and Sentry, selected systemd properties for every
# "actions.runner.*" unit (prefixed "RUNNER <unit>"), and the docker ps list.
# Only an unhealthy Harbor code is BLOCKED; every other finding is a WARN.
# If the SSH call itself fails, one BLOCKED is recorded and grading is skipped.
check_110() {
  log_section "P0-110-REGISTRY-OBSERVABILITY"
  local out
  local remote_script='
    echo "HOST $(hostname) $(uptime)"
    echo "MEM $(free -h | awk "/Mem:/ {print \$2,\$3,\$7}")"
    echo "DOCKER_SYSTEMD $(systemctl is-active docker 2>/dev/null || true)"
    echo "HARBOR_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:5000/v2/ || true)"
    echo "GITEA_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:3001/ || true)"
    echo "PROM_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:9090/-/ready || true)"
    echo "AM_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:9093/-/healthy || true)"
    echo "SENTRY_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 8 http://127.0.0.1:9000/ || true)"
    for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain 2>/dev/null | awk "{print \$1}"); do
      systemctl show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts | sed "s/^/RUNNER $u /"
    done
    docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -120
  '

  if ! out=$(ssh_cmd "wooo@192.168.0.110" "$remote_script" 2>&1); then
    fail "ssh 110 read-only check"
    echo "$out"
    return
  fi
  echo "$out"

  if grep -Eq "HARBOR_CODE (200|401)" <<<"$out"; then
    ok "110 Harbor /v2 healthy code"
  else
    fail "110 Harbor not healthy"
  fi

  if grep -Eq "GITEA_CODE (200|302)" <<<"$out"; then
    ok "110 Gitea reachable"
  else
    warn "110 Gitea not confirmed"
  fi

  if grep -q "PROM_CODE 200" <<<"$out"; then
    ok "110 Prometheus ready"
  else
    warn "110 Prometheus not ready"
  fi

  if grep -q "AM_CODE 200" <<<"$out"; then
    ok "110 Alertmanager healthy"
  else
    warn "110 Alertmanager not healthy"
  fi

  if grep -Eq "SENTRY_CODE (200|302|400)" <<<"$out"; then
    ok "110 Sentry HTTP reachable"
  else
    warn "110 Sentry HTTP not confirmed"
  fi

  # Passes as soon as any one runner unit reports WatchdogUSec=0.
  if grep -q "WatchdogUSec=0" <<<"$out"; then
    ok "runner watchdog disabled on at least one unit"
  else
    warn "runner watchdog state not confirmed"
  fi

  # Inverted check: a match here is the bad case.
  if grep -q "sentry-self-hosted-clickhouse-1.*Restarting" <<<"$out"; then
    warn "Sentry ClickHouse restarting"
  else
    ok "Sentry ClickHouse not visibly restarting"
  fi
}
|
||||
|
||||
# Gate P1-K3S: inspect the K3s control-plane host 192.168.0.120 as wooo@.
#
# The remote script prints host info, whether 188:5432 is reachable from 120,
# systemd state for k3s/k3s-agent/keepalived, `kubectl get nodes` and
# `kubectl get pods -n awoooi-prod` (via sudo, using REMOTE_SUDO_PASSWORD when
# provided), and any interface line carrying the VIP 192.168.0.125.
# If the remote output shows no Ready node, a local `kubectl get nodes` is
# tried as a fallback.
#
# Verdicts:
#   BLOCKED  120 cannot reach 188 PostgreSQL
#   BLOCKED  no Ready node seen, or any node row reports NotReady
#   WARN     VIP not seen on 120
# If the SSH call itself fails, one BLOCKED is recorded and grading is skipped.
check_k3s() {
  log_section "P1-K3S"
  local out local_kubectl_out="" node_view not_ready
  if ! out=$(ssh_cmd "wooo@192.168.0.120" '
    echo "HOST $(hostname) $(uptime)"
    echo "PG188_PORT $(nc -z -w 2 192.168.0.188 5432 >/dev/null 2>&1 && echo OPEN || echo CLOSED)"
    echo "SYSTEMD $(systemctl is-active k3s k3s-agent keepalived 2>/dev/null | tr "\n" " ")"
    kcmd() {
      if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
        printf "%s\n" "$REMOTE_SUDO_PASSWORD" | sudo -S -p "" kubectl "$@"
      else
        sudo -n kubectl "$@" 2>/dev/null || kubectl "$@"
      fi
    }
    kcmd get nodes -o wide 2>/dev/null || true
    kcmd get pods -n awoooi-prod -o wide 2>/dev/null || true
    ip addr show | grep 192.168.0.125 || true
  ' 2>&1); then
    fail "ssh 120 k3s read-only check"
    echo "$out"
    return
  fi
  echo "$out"

  if ! grep -q " Ready " <<<"$out"; then
    local_kubectl_out=$(kubectl get nodes -o wide 2>/dev/null || true)
    if [ -n "$local_kubectl_out" ]; then
      echo "LOCAL_KUBECTL_FALLBACK"
      echo "$local_kubectl_out"
    fi
  fi

  # Join with a newline so the fallback table cannot fuse with the last
  # remote line.
  node_view="$out"
  if [ -n "$local_kubectl_out" ]; then
    node_view+=$'\n'"$local_kubectl_out"
  fi

  # Node rows carry the status in column 2; pod rows have "n/m" there, so
  # they never match.
  not_ready=$(awk '$2 ~ /^NotReady/ {printf "%s ", $1}' <<<"$node_view")

  if grep -q "PG188_PORT OPEN" <<<"$out"; then
    ok "120 can reach 188 PostgreSQL port"
  else
    fail "120 cannot reach 188 PostgreSQL"
  fi

  # One Ready node is not enough: the baseline requires 120 and 121 both
  # Ready, so any NotReady row blocks the gate.
  if ! grep -q " Ready " <<<"$node_view"; then
    fail "K3s nodes not Ready or kubectl unavailable"
  elif [ -n "$not_ready" ]; then
    fail "K3s has NotReady node(s): ${not_ready% }"
  else
    ok "K3s has Ready node output"
  fi

  if grep -q "192.168.0.125" <<<"$out"; then
    ok "VIP 192.168.0.125 present on 120"
  else
    warn "VIP not confirmed on 120"
  fi
}
|
||||
|
||||
# Gate P2-WORKLOAD-ALERTCHAIN: AWOOOI API/Web reachability via the VIP, plus
# an optional Alertmanager webhook POST.
#
# The API (:32334/api/v1/health) and Web (:32335/) are probed from host 120;
# if that SSH call fails, the same URLs are probed from the local machine.
# A 2xx/3xx API code is required (else BLOCKED); a non-2xx/3xx Web code is WARN.
#
# When SEND_ALERT_TEST is 1, a synthetic "ColdStartCheck" alert is POSTed from
# host 120 and a non-2xx answer is WARN. Otherwise the POST is skipped, which
# is itself reported as WARN.
# NOTE(review): in that mode the remote curl saves the response body to
# /tmp/awoooi-alertchain.out on 120, i.e. this path does write remote state.
check_workload_and_alertchain() {
  log_section "P2-WORKLOAD-ALERTCHAIN"
  local api_code web_code alert_code
  local out
  local probe_script='
    api_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://192.168.0.125:32334/api/v1/health 2>/dev/null || true)
    web_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://192.168.0.125:32335/ 2>/dev/null || true)
    echo "API_CODE ${api_code:-000}"
    echo "WEB_CODE ${web_code:-000}"
  '
  local alert_script='curl -s -o /tmp/awoooi-alertchain.out -w "%{http_code}" --max-time 8 \
    -X POST "http://192.168.0.125:32334/api/v1/webhooks/alertmanager" \
    -H '"'"'Content-Type: application/json'"'"' \
    -d '"'"'{"receiver":"cold-start-check","status":"firing","alerts":[{"status":"firing","labels":{"alertname":"ColdStartCheck","severity":"info"},"annotations":{"summary":"Cold start check"},"startsAt":"2026-05-05T11:00:00Z","endsAt":"0001-01-01T00:00:00Z","generatorURL":""}],"groupLabels":{},"commonLabels":{},"commonAnnotations":{},"externalURL":"","version":"4","groupKey":"cold-start-check"}'"'"' 2>/dev/null || echo "000"'

  if out=$(ssh_cmd "wooo@192.168.0.120" "$probe_script" 2>/dev/null); then
    api_code=$(awk '/^API_CODE / {print $2}' <<<"$out")
    web_code=$(awk '/^WEB_CODE / {print $2}' <<<"$out")
  else
    # Host 120 unreachable over SSH: probe the VIP from here instead.
    api_code=$(probe_http_code "http://192.168.0.125:32334/api/v1/health")
    web_code=$(probe_http_code "http://192.168.0.125:32335/")
    printf -v out 'API_CODE %s\nWEB_CODE %s' "$api_code" "$web_code"
  fi

  echo "$out"

  if [[ "$api_code" =~ ^[23] ]]; then
    ok "AWOOOI API reachable"
  else
    fail "AWOOOI API not reachable"
  fi

  if [[ "$web_code" =~ ^[23] ]]; then
    ok "AWOOOI Web reachable"
  else
    warn "AWOOOI Web not confirmed"
  fi

  if [ "$SEND_ALERT_TEST" -ne 1 ]; then
    warn "Alertmanager webhook POST skipped; rerun with --send-alert-test after API is ready"
    return
  fi

  alert_code=$(ssh_cmd "wooo@192.168.0.120" "$alert_script")
  echo "ALERTCHAIN_CODE $alert_code"
  if [[ "$alert_code" =~ ^2 ]]; then
    ok "Alertmanager webhook endpoint accepts POST"
  else
    warn "Alertmanager webhook E2E not confirmed"
  fi
}
|
||||
|
||||
# Gate P2-PUBLIC-ROUTES: probe the public HTTPS endpoints from this machine.
#
# Four URLs are requested (AWOOOI API health, AWOOOI web root, momo root,
# momo health); their codes are printed, then each is graded. A 2xx/3xx code
# is OK, anything else is WARN — this gate never produces BLOCKED.
check_public_routes() {
  log_section "P2-PUBLIC-ROUTES"
  local awoooi_api_code awoooi_web_code momo_code momo_health_code

  awoooi_api_code=$(probe_http_code "https://awoooi.wooo.work/api/v1/health")
  awoooi_web_code=$(probe_http_code "https://awoooi.wooo.work/")
  momo_code=$(probe_http_code "https://mo.wooo.work/")
  momo_health_code=$(probe_http_code "https://mo.wooo.work/health")

  printf '%s\n' \
    "AWOOOI_PUBLIC_API_CODE $awoooi_api_code" \
    "AWOOOI_PUBLIC_WEB_CODE $awoooi_web_code" \
    "MOMO_PUBLIC_CODE $momo_code" \
    "MOMO_PUBLIC_HEALTH_CODE $momo_health_code"

  if [[ "$awoooi_api_code" =~ ^[23] ]]; then
    ok "AWOOOI public API reachable"
  else
    warn "AWOOOI public API not confirmed"
  fi

  if [[ "$awoooi_web_code" =~ ^[23] ]]; then
    ok "AWOOOI public web reachable"
  else
    warn "AWOOOI public web not confirmed"
  fi

  if [[ "$momo_code" =~ ^[23] ]]; then
    ok "momo public route reachable"
  else
    warn "momo public route not confirmed"
  fi

  if [[ "$momo_health_code" =~ ^[23] ]]; then
    ok "momo public health reachable"
  else
    warn "momo public health not confirmed"
  fi
}
|
||||
|
||||
# Private helper for check_schedules. Reads probe output on stdin and succeeds
# only when a "<tag> <file> age=<seconds>" row exists AND seconds < limit.
# A "... missing" row, or no row at all, is a failure.
#   $1 tag (e.g. TEXTFILE_188)   $2 file name   $3 age limit in seconds
_cs_textfile_fresh() {
  awk -v tag="$1" -v name="$2" -v limit="$3" '
    $1 == tag && $2 == name && $3 ~ /^age=-?[0-9]+$/ {
      seen = 1
      fresh = (substr($3, 5) + 0 < limit + 0)
      exit
    }
    END { exit !(seen && fresh) }
  '
}

# Private helper for check_schedules. Reads probe output on stdin and succeeds
# only when a "<key> <number>" row exists AND the number satisfies the bound.
#   $1 key   $2 comparison: lt | gt | ge   $3 bound
_cs_metric_ok() {
  awk -v key="$1" -v op="$2" -v bound="$3" '
    $1 == key && NF >= 2 {
      seen = 1
      v = $2 + 0
      b = bound + 0
      good = (op == "lt" ? v < b : (op == "gt" ? v > b : v >= b))
      exit
    }
    END { exit !(seen && good) }
  '
}

# Gate P2-SCHEDULES: verify that cron, exporters and K8s CronJobs are alive.
#
# Four independent SSH probes are run (188, 110, 120, 121). Each prints tagged
# lines that are echoed and graded locally; every finding in this gate is a
# WARN, never BLOCKED. An SSH probe that fails yields a single
# "... schedule check unavailable" WARN for that host.
#
# Numeric checks go through the helpers above so that a row that is absent
# (file missing, metric not exported, kubectl/python failure) is graded as a
# WARN. The previous inline awk exited 0 when its pattern never matched and
# therefore reported such cases as OK.
check_schedules() {
  log_section "P2-SCHEDULES"
  local out

  # --- 188: cron, textfile exporter freshness, backup age, momo scheduler ---
  if out=$(ssh_cmd "ollama@192.168.0.188" '
    now=$(date +%s)
    echo "CRON_188 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
    for f in /home/ollama/node_exporter_textfiles/backup.prom /home/ollama/node_exporter_textfiles/docker_restart_count.prom /home/ollama/node_exporter_textfiles/docker_stats.prom; do
      if [ -f "$f" ]; then
        mt=$(stat -c %Y "$f")
        echo "TEXTFILE_188 $(basename "$f") age=$((now - mt))"
      else
        echo "TEXTFILE_188 $(basename "$f") missing"
      fi
    done
    if [ -f /home/ollama/node_exporter_textfiles/backup.prom ]; then
      awk -v now="$now" "/^backup_110_last_success_timestamp / {printf \"BACKUP_110_AGE %d\\n\", now - int(\$2)}" /home/ollama/node_exporter_textfiles/backup.prom
    fi
    echo "SCHEDULER_REGISTERED $(docker logs --tail 200 momo-scheduler 2>&1 | grep -c "全部排程任務已註冊" || true)"
  ' 2>&1); then
    echo "$out"
    grep -q "CRON_188 active" <<<"$out" && ok "188 cron active" || warn "188 cron not confirmed"
    # 90000 s = 25 h; 300 s = 5 min.
    _cs_textfile_fresh TEXTFILE_188 backup.prom 90000 <<<"$out" && ok "188 backup textfile fresh enough" || warn "188 backup textfile stale or missing"
    _cs_textfile_fresh TEXTFILE_188 docker_restart_count.prom 300 <<<"$out" && ok "188 docker restart exporter fresh" || warn "188 docker restart exporter stale"
    _cs_textfile_fresh TEXTFILE_188 docker_stats.prom 300 <<<"$out" && ok "188 docker stats exporter fresh" || warn "188 docker stats exporter stale"
    _cs_metric_ok BACKUP_110_AGE lt 90000 <<<"$out" && ok "188 backup-from-110 success within 25h" || warn "188 backup-from-110 success not confirmed"
    _cs_metric_ok SCHEDULER_REGISTERED gt 0 <<<"$out" && ok "188 momo scheduler registered jobs" || warn "188 momo scheduler registration not confirmed"
  else
    warn "188 schedule check unavailable"
    echo "$out"
  fi

  # --- 110: cron, failed units, stale startup units, exporter freshness ---
  if out=$(ssh_cmd "wooo@192.168.0.110" '
    now=$(date +%s)
    echo "CRON_110 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
    echo "FAILED_UNITS_110 $(systemctl --failed --no-legend --plain 2>/dev/null | wc -l)"
    echo "MOMO_STARTUP_ENABLED $(systemctl is-enabled momo-startup-complete.service 2>/dev/null || true)"
    echo "STAGGERED_STARTUP_ENABLED $(systemctl is-enabled wooo-staggered-startup.service 2>/dev/null || true)"
    for f in /home/wooo/node_exporter_textfiles/docker_stats.prom /home/wooo/node_exporter_textfiles/systemd_units.prom; do
      if [ -f "$f" ]; then
        mt=$(stat -c %Y "$f")
        echo "TEXTFILE_110 $(basename "$f") age=$((now - mt))"
      else
        echo "TEXTFILE_110 $(basename "$f") missing"
      fi
    done
  ' 2>&1); then
    echo "$out"
    grep -q "CRON_110 active" <<<"$out" && ok "110 cron active" || warn "110 cron not confirmed"
    grep -q "FAILED_UNITS_110 0" <<<"$out" && ok "110 systemd has no failed units" || warn "110 systemd failed units remain"
    grep -q "MOMO_STARTUP_ENABLED disabled" <<<"$out" && ok "110 stale momo startup unit disabled" || warn "110 stale momo startup unit not disabled"
    grep -q "STAGGERED_STARTUP_ENABLED disabled" <<<"$out" && ok "110 stale staggered startup unit disabled" || warn "110 stale staggered startup unit not disabled"
    _cs_textfile_fresh TEXTFILE_110 docker_stats.prom 300 <<<"$out" && ok "110 docker stats exporter fresh" || warn "110 docker stats exporter stale"
    _cs_textfile_fresh TEXTFILE_110 systemd_units.prom 300 <<<"$out" && ok "110 systemd units exporter fresh" || warn "110 systemd units exporter stale"
  else
    warn "110 schedule check unavailable"
    echo "$out"
  fi

  # --- 120: cron plus K8s CronJobs / Jobs / pods in awoooi-prod ---
  # The multi-line python snippet below must keep its statements at column 0:
  # it continues a -c string whose first statement starts right after the quote.
  if out=$(ssh_cmd "wooo@192.168.0.120" '
    kcmd() {
      if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
        printf "%s\n" "$REMOTE_SUDO_PASSWORD" | sudo -S -p "" kubectl "$@"
      else
        sudo -n kubectl "$@" 2>/dev/null || kubectl "$@"
      fi
    }
    echo "CRON_120 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
    kcmd get cronjobs -n awoooi-prod -o json | python3 -c "import json,sys; d=json.load(sys.stdin); items=d.get(\"items\", []); print(\"CRONJOB_COUNT\", len(items)); print(\"CRONJOB_SUSPENDED\", sum(1 for i in items if i.get(\"spec\",{}).get(\"suspend\")))"
    kcmd get jobs -n awoooi-prod -o json | python3 -c "import json,sys; d=json.load(sys.stdin); failed=0
for j in d.get(\"items\", []):
    if any(c.get(\"type\")==\"Failed\" and c.get(\"status\")==\"True\" for c in j.get(\"status\",{}).get(\"conditions\",[]) or []):
        failed += 1
print(\"FAILED_JOBS\", failed)"
    kcmd get pods -n awoooi-prod --no-headers 2>/dev/null | awk "\$3 !~ /^(Running|Completed)$/ {bad++} END {print \"BAD_PODS\", bad+0}"
  ' 2>&1); then
    echo "$out"
    grep -q "CRON_120 active" <<<"$out" && ok "120 cron active" || warn "120 cron not confirmed"
    _cs_metric_ok CRONJOB_COUNT ge 4 <<<"$out" && ok "K8s AWOOOI CronJobs present" || warn "K8s AWOOOI CronJobs missing"
    grep -q "CRONJOB_SUSPENDED 0" <<<"$out" && ok "K8s AWOOOI CronJobs unsuspended" || warn "K8s AWOOOI CronJob suspended"
    grep -q "FAILED_JOBS 0" <<<"$out" && ok "K8s AWOOOI has no failed Jobs" || warn "K8s AWOOOI failed Jobs remain"
    grep -q "BAD_PODS 0" <<<"$out" && ok "K8s AWOOOI pods Running/Completed only" || warn "K8s AWOOOI bad pod status remains"
  else
    warn "120 K8s schedule check unavailable"
    echo "$out"
  fi

  # --- 121: cron and the DR drill crontab entry ---
  if out=$(ssh_cmd "wooo@192.168.0.121" '
    echo "CRON_121 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
    crontab -l 2>/dev/null | grep -q "dr-drill.sh" && echo "DR_DRILL_CRON present" || echo "DR_DRILL_CRON missing"
  ' 2>&1); then
    echo "$out"
    grep -q "CRON_121 active" <<<"$out" && ok "121 cron active" || warn "121 cron not confirmed"
    grep -q "DR_DRILL_CRON present" <<<"$out" && ok "121 DR drill cron present" || warn "121 DR drill cron missing"
  else
    warn "121 schedule check unavailable"
    echo "$out"
  fi
}
|
||||
|
||||
# Print the final tallies and verdict, and set the script's exit status.
#
#   any BLOCKED  -> prints "Result: BLOCKED ..."  and exits 2
#   any WARN     -> prints "Result: DEGRADED ..." and exits 1
#   otherwise    -> prints "Result: GREEN ..."    and returns normally
summary() {
  log_section "SUMMARY"
  echo "PASS=$PASS WARN=$WARN BLOCKED=$FAIL"

  if [ "$FAIL" -gt 0 ]; then
    echo "Result: BLOCKED. Fix the first blocked gate before releasing runner/CD/AI auto-remediation."
    exit 2
  elif [ "$WARN" -gt 0 ]; then
    echo "Result: DEGRADED. Core gates passed but warnings remain."
    exit 1
  else
    echo "Result: GREEN. Full stack is ready for controlled runner/CD release."
  fi
}
|
||||
|
||||
# Entry point: print the banner, run every gate in P0 -> P1 -> P2 order, then
# print the verdict. summary exits 2 (BLOCKED) or 1 (DEGRADED) by itself;
# otherwise the script ends with status 0.
for gate in \
  print_header \
  check_network \
  check_188 \
  check_110 \
  check_k3s \
  check_workload_and_alertchain \
  check_public_routes \
  check_schedules \
  summary
do
  "$gate"
done
|
||||
Reference in New Issue
Block a user