From 4808995edabd45fff9bf0e0eff1f8380aeb096ff Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 29 May 2026 12:38:58 +0800 Subject: [PATCH] fix(ops): harden reboot recovery and backup alerts --- docs/LOGBOOK.md | 26 + docs/runbooks/FULL-STACK-COLD-START-SOP.md | 79 ++ .../nginx/templates/188-all-sites.conf.j2 | 307 ++++-- k8s/monitoring/prometheus.yml | 8 +- ops/monitoring/alerts-unified.yml | 877 ++++++++++++++---- .../full-stack-backup-baseline.yml | 306 ++++++ .../full-stack-cold-start-baseline.yml | 499 ++++------ .../ops/backup-alert-label-contract-check.py | 260 ++++++ .../ops/backup-alert-live-visibility-check.py | 242 +++++ scripts/ops/prometheus-rule-drift-guard.sh | 42 +- .../ops/recovery-scorecard-contract-check.py | 148 +++ .../cold-start-textfile-exporter.sh | 52 +- .../full-stack-cold-start-check.sh | 178 ++-- 13 files changed, 2353 insertions(+), 671 deletions(-) create mode 100644 ops/reboot-recovery/full-stack-backup-baseline.yml create mode 100755 scripts/ops/backup-alert-label-contract-check.py create mode 100755 scripts/ops/backup-alert-live-visibility-check.py create mode 100755 scripts/ops/recovery-scorecard-contract-check.py diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 402496ea..6dc12aa3 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -22528,3 +22528,29 @@ production browser smoke: - 24h 完整自動修復 production claim:0%;目前仍不能宣稱真正 AI 自動修復閉環已達成。 - 完整 AI 自動化管理產品化:約 99.3%,但「真正全自動 repair / approval / learning / KM writeback 閉環」 仍需以 24h production evidence 補齊。 + +## 2026-05-29 | 重開機恢復續修:aiops 入口、備份告警與 Ansible baseline 收斂 + +**背景**:統帥要求確認所有主機重啟後,服務、網站、工具、資料庫、排程與備份都能快速恢復,且不能只停在人工熱修。前一輪已修正 AWOOOI/Flywheel stale incident 與成功率規則;本輪接著處理 cold-start gate 仍未綠燈的項目。 + +**現場修復**: +- 188 public gateway 的 `aiops.wooo.work` 原本仍反代到失聯的 `192.168.0.120:31234/31235`,導致 public route 502;已改為正式 VIP `192.168.0.125:32334/32335`,`/` 回 307 到 `/zh-TW`,`/api/v1/health` 回 `healthy`。 +- 188 `/etc/nginx/sites-enabled/` 中有舊備份檔仍被 Nginx include,造成新 vhost 被 `conflicting 
server name ... ignored`;已移到 `/etc/nginx/sites-disabled-codex/`,保留備份但不再載入。 +- 110 `fwupd.service` / `fwupd-refresh.service` 是 stale failed state;已 `reset-failed`,`systemctl --failed` 回 0。 +- Prometheus live `alerts.yml` 與 `alerts-unified.canonical.yml` 被縮水成舊版,缺完整備份、異地同步、credential escrow、cold-start scorecard 規則;已重新同步 repo 的 `ops/monitoring/alerts-unified.yml` 到兩個 live 檔並 reload Prometheus。 +- `prometheus-rule-drift-guard` 已確認 `missing_required_count=0`、`current_matches_canonical=1`,之後不會每 5 分鐘把完整備份規則拉回舊版。 +- Ansible `infra/ansible/roles/nginx/templates/188-all-sites.conf.j2` 已同步 188 live public gateway baseline,避免下一次跑 `nginx-sync.yml` 又把 aiops 指回單一 120 節點。 + +**驗證**: +- `https://aiops.wooo.work/` public route 與 TLS 已回 200/307 成功範圍;`https://aiops.wooo.work/api/v1/health` 回 `healthy prod`。 +- `bash /home/wooo/scripts/full-stack-cold-start-check.sh --monitor-read-only --no-color --watch --interval 1 --max-attempts 1`:public routes 全部通過,110 failed units = 0,momo scheduler 以 container health + 2h 內 task activity 判定正常,momo 當月 `daily_sales_snapshot`/`realtime_sales_monthly` 一致,結果為 `PASS=72 WARN=2 BLOCKED=3`。 +- `BLOCKED=3` 全部仍指向 120:`ping 192.168.0.120`、`ssh 192.168.0.120:22`、`ssh 120 k3s read-only check`。 +- Google Drive/rclone daily full sync 仍正常:`rclone-last-success` 與 `rclone-full-verify-last-success` 都是 2026-05-29,full repos 覆蓋 `awoooi configs gitea harbor momo langfuse monitoring signoz open-webui clawbot sentry ai-artifacts public-routes`。 +- 完整備份告警規則已載入:`BackupAggregateRunFailed`、`BackupConfigCapturePartial`、`BackupOffsiteCopyStale`、`BackupCredentialEscrowEvidenceMissing`、`awoooi_recovery_core_ready`、`ColdStartRecoveryBlocked` 全部存在;Prometheus rule count = 142。 +- 因 120 失聯,`BackupConfigCapturePartial{target="120-k3s-host-configs"}` 與 `BackupAggregateRunFailed` 會進入 pending/firing,這是正確訊號,不應消音。 +- `mo.wooo.work` 資料修復:momo 自動匯入 2026-05-29 11:55 已把 2026-05-01~2026-05-28 的 17,353 筆寫入 `daily_sales_snapshot`,但同步 `realtime_sales_monthly` 時 PostgreSQL index 內部錯誤 `posting list 
tuple ... cannot be split`,導致 5 月分析表為 0。已在 188 `momo-db` 執行 `REINDEX TABLE CONCURRENTLY public.realtime_sales_monthly`,再以同日期範圍從 `daily_sales_snapshot` idempotent 補同步;驗證 `daily_sales_snapshot=17,353`、`realtime_sales_monthly=17,353`、`realtime_sales_monthly` 總筆數 `774,111`,日期最大值到 `2026-05-28`,並清除 momo 應用 cache。 + +**不可宣稱完成**: +- 120 仍不可達,K3s node `mon` 是 `NotReady,SchedulingDisabled`;`mon1` 可承載 AWOOOI workloads,但 full cold-start done criteria 尚未達成。 +- 110 backup aggregate `failed_count=1` 是 120 config capture 無法完成;必須 120 回來後重跑 `/backup/scripts/backup-configs.sh` 或 `/backup/scripts/backup-all.sh`,再補跑 Google Drive/rclone full sync。 +- `SLO_KMGrowthRate_Low` 仍為 warning(24h KM 約 19/20),不是網站 outage,但需後續追 KM 產出。 diff --git a/docs/runbooks/FULL-STACK-COLD-START-SOP.md b/docs/runbooks/FULL-STACK-COLD-START-SOP.md index ae46188f..2b834e5d 100644 --- a/docs/runbooks/FULL-STACK-COLD-START-SOP.md +++ b/docs/runbooks/FULL-STACK-COLD-START-SOP.md @@ -590,6 +590,84 @@ Prometheus rules in `ops/monitoring/alerts-unified.yml` alert when the monitor i 4. Release high-load services only after `GREEN` and load/core stays below `1.0` for 15 minutes. 5. Record the final output summary and any manual repair in `docs/LOGBOOK.md`. 
+### 13.6 2026-05-29 補充:188 Public Gateway 與備份告警 + +`aiops.wooo.work` 的 188 public gateway 不可再指向單一 `192.168.0.120:31234/31235`。120 失聯時這會讓 public route 直接 502。正式 baseline 必須走 K3s VIP: + +```nginx +location /api/ { + proxy_pass http://192.168.0.125:32334/api/; +} + +location /api/v1/ws { + proxy_pass http://192.168.0.125:32334/api/v1/ws; +} + +location / { + proxy_pass http://192.168.0.125:32335; +} +``` + +變更來源必須是 `infra/ansible/roles/nginx/templates/188-all-sites.conf.j2`,再用 `infra/ansible/playbooks/nginx-sync.yml` 收斂;禁止只改 188 live 檔而不回寫 Ansible baseline。 + +備份告警有兩層,缺一不可: + +- `ops/monitoring/alerts-unified.yml` 是 repo canonical。 +- 110 live `/home/wooo/monitoring/alerts.yml` 與 `/home/wooo/monitoring/alerts-unified.canonical.yml` 必須一致,否則 `prometheus-rule-drift-guard` 可能把規則拉回舊版。 + +重啟後必查: + +```bash +curl -s http://127.0.0.1:9090/api/v1/rules \ + | python3 -c 'import json,sys; d=json.load(sys.stdin); names=[r.get("name") for g in d["data"]["groups"] for r in g["rules"]]; print([n for n in ["BackupAggregateRunFailed","BackupConfigCapturePartial","BackupOffsiteCopyStale","BackupCredentialEscrowEvidenceMissing","ColdStartRecoveryBlocked"] if n not in names])' + +cat /home/wooo/node_exporter_textfiles/prometheus_rule_drift_guard.prom +``` + +若 120 尚未恢復,`BackupConfigCapturePartial{target="120-k3s-host-configs"}` 與 cold-start blocked 是正確訊號,不可消音。120 恢復後再重跑: + +```bash +/backup/scripts/backup-configs.sh +/backup/scripts/backup-all.sh +/backup/scripts/sync-offsite-backups.sh --mode sync +/backup/scripts/verify-offsite-full-sync.sh --write-textfile --no-color +``` + +### 13.7 2026-05-29 補充:momo PostgreSQL Index 與資料同步 + +`mo.wooo.work` 不能只看 `/health` 或首頁 200。重啟或 fsck 後,PostgreSQL index 可能讓匯入流程表面完成,但 `daily_sales_snapshot` 未同步到 `realtime_sales_monthly`。本次症狀: + +- `daily_sales_snapshot` 已有 2026-05-01 到 2026-05-28 的 17,353 筆。 +- `realtime_sales_monthly` 同日期範圍為 0 筆。 +- momo-scheduler log 出現 PostgreSQL 內部錯誤 `posting list tuple ... 
cannot be split`。 + +標準處理順序: + +```bash +# 188 / momo-db,只重建索引,不刪資料 +docker exec -i momo-db bash -lc 'psql -U "$POSTGRES_USER" -d "$POSTGRES_DB" -v ON_ERROR_STOP=1' <<'SQL' +REINDEX TABLE CONCURRENTLY public.realtime_sales_monthly; +SQL +``` + +重建索引後,才可針對缺漏日期做 idempotent 補同步。正式作法必須先確認 `realtime_sales_monthly` 該日期範圍筆數,若非 0,需先保存查詢結果並確認是否重跑同範圍同步;不可整表 truncate、不可整庫 restore。補同步後至少驗證: + +```sql +SELECT count(*), min(snapshot_date::date), max(snapshot_date::date) +FROM daily_sales_snapshot +WHERE snapshot_date::date BETWEEN DATE '2026-05-01' AND DATE '2026-05-28'; + +SELECT count(*), min("日期"::date), max("日期"::date) +FROM realtime_sales_monthly +WHERE "日期"::date BETWEEN DATE '2026-05-01' AND DATE '2026-05-28'; +``` + +兩張表同日期範圍筆數與日期上下界必須一致。完成後清除 momo 應用 cache: + +```bash +docker exec momo-pro-system python -c 'from services.cache_service import clear_all_cache; clear_all_cache(); print("cache_cleared")' +``` + --- ## 14. Done Criteria @@ -604,6 +682,7 @@ All must be true: - AWOOOI API and Web reachable through NodePort/VIP. - Alertmanager E2E webhook succeeds. - cron/CronJob schedules are active, unsuspended, and verified. +- momo `daily_sales_snapshot` 與 `realtime_sales_monthly` 在最新匯入日期範圍內筆數一致。 - Sentry and SignOz are either healthy or explicitly in controlled backlog recovery. - High-load batch services are capped or delayed. - Runners are guarded and released last. 
diff --git a/infra/ansible/roles/nginx/templates/188-all-sites.conf.j2 b/infra/ansible/roles/nginx/templates/188-all-sites.conf.j2 index a9936ea7..47687632 100644 --- a/infra/ansible/roles/nginx/templates/188-all-sites.conf.j2 +++ b/infra/ansible/roles/nginx/templates/188-all-sites.conf.j2 @@ -1,145 +1,268 @@ # 188-all-sites.conf.j2 -# AWOOOI Nginx 全站設定 — 由 Ansible nginx-sync.yml playbook 管理 -# 禁止直接手改此檔案 → 請修改 roles/nginx/templates/188-all-sites.conf.j2 -# 部署指令: ansible-playbook -i inventory/hosts.yml playbooks/nginx-sync.yml --tags 188 -# 最後同步: {{ ansible_date_time.iso8601 }} - -# ============================================================ -# OpenClaw (port 8088) -# ============================================================ +# AWOOOI 188 public gateway baseline managed by infra/ansible/playbooks/nginx-sync.yml. +# 2026-05-29 Codex: synced from live 188 after reboot recovery; aiops.wooo.work +# must use the K3s VIP 192.168.0.125:32334/32335 instead of a single 120 node. +# +# ============================================================================= +# AIOPS - aiops.wooo.work +# ============================================================================= server { listen 80; - server_name openclaw.awoooi.com; + server_name aiops.wooo.work; + return 301 https://$server_name$request_uri; +} - location / { - proxy_pass http://127.0.0.1:8088; +server { + listen 443 ssl http2; + server_name aiops.wooo.work; + + ssl_certificate /etc/letsencrypt/live/aiops.wooo.work/fullchain.pem; + ssl_certificate_key /etc/letsencrypt/live/aiops.wooo.work/privkey.pem; + + # API + location /api/ { + proxy_pass http://192.168.0.125:32334/api/; + proxy_http_version 1.1; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + + # WebSocket + location /api/v1/ws { + proxy_pass http://192.168.0.125:32334/api/v1/ws; + proxy_http_version 1.1; + proxy_set_header 
Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + proxy_set_header Host $host; + } + + # Frontend + location / { + proxy_pass http://192.168.0.125:32335; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } +} + +# ============================================================================= +# GitLab - gitlab.wooo.work (代理到 110) +# ============================================================================= +server { + listen 80; + server_name gitlab.wooo.work; + return 301 https://$server_name$request_uri; +} + +server { + listen 443 ssl http2; + server_name gitlab.wooo.work; + + ssl_certificate /etc/letsencrypt/live/gitlab.wooo.work/fullchain.pem; + ssl_certificate_key /etc/letsencrypt/live/gitlab.wooo.work/privkey.pem; + + client_max_body_size 500m; + + location / { + proxy_pass http://192.168.0.110:8929; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; proxy_read_timeout 300s; + proxy_connect_timeout 300s; } } -# ============================================================ -# tsenyang (port 3000) -# ============================================================ +# ============================================================================= +# SigNoz - signoz.wooo.work +# ============================================================================= server { listen 80; - server_name tsenyang.awoooi.com; - - location / { - proxy_pass http://127.0.0.1:3000; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - } -} - -# ============================================================ -# momo (port 5003) -# 
============================================================ -server { - listen 80; - server_name momo.awoooi.com; - - location / { - proxy_pass http://127.0.0.1:5003; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - } -} - -# ============================================================ -# SignOz (port 3301) -# ============================================================ -server { - listen 80; - server_name signoz.awoooi.internal; + server_name signoz.wooo.work; location / { proxy_pass http://127.0.0.1:3301; + proxy_http_version 1.1; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; + } +} + +# ============================================================================= +# Tsenyang - www.tsenyang.com (待遷移,暫時代理到 110) +# ============================================================================= +server { + listen 80; + server_name www.tsenyang.com tsenyang.com; + return 301 https://$server_name$request_uri; +} + +server { + listen 443 ssl http2; + server_name www.tsenyang.com tsenyang.com; + + ssl_certificate /etc/letsencrypt/live/www.tsenyang.com/fullchain.pem; + ssl_certificate_key /etc/letsencrypt/live/www.tsenyang.com/privkey.pem; + + location / { + proxy_pass http://127.0.0.1:3000; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + } +} + +# ============================================================================= +# Stock Platform - stock.wooo.work +# ============================================================================= +server { + listen 80; + server_name stock.wooo.work; + + location /.well-known/acme-challenge/ { + root /var/www/html; + } + + location / { + return 301 https://$server_name$request_uri; + } +} + +server { + listen 443 ssl http2; + server_name stock.wooo.work; + + ssl_certificate /etc/letsencrypt/live/stock.wooo.work/fullchain.pem; + ssl_certificate_key 
/etc/letsencrypt/live/stock.wooo.work/privkey.pem; + + # 後台直接接收,不經由網站主站 Basic Auth + location = /admin { + return 301 /admin/; + } + + location /admin/ { + auth_basic off; + proxy_pass http://192.168.0.110:31235; + proxy_http_version 1.1; proxy_set_header Upgrade $http_upgrade; proxy_set_header Connection "upgrade"; - } -} - -# ============================================================ -# MinIO (port 9000 API / 9001 Console) -# ============================================================ -server { - listen 80; - server_name minio.awoooi.internal; - - location / { - proxy_pass http://127.0.0.1:9001; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; - client_max_body_size 500m; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_buffering off; + } + + # 前台主站 + location / { + proxy_pass http://192.168.0.110:31235; + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; } } -# ============================================================ -# LiteLLM (port 4000) -# ============================================================ +# ============================================================================= +# MOMO PRO - mo.wooo.work (待部署) +# ============================================================================= server { listen 80; - server_name litellm.awoooi.internal; + server_name mo.wooo.work; + return 301 https://$server_name$request_uri; +} + +server { + listen 443 ssl http2; + server_name mo.wooo.work; + + ssl_certificate /etc/letsencrypt/live/mo.wooo.work/fullchain.pem; + ssl_certificate_key /etc/letsencrypt/live/mo.wooo.work/privkey.pem; location / { - proxy_pass http://127.0.0.1:4000; + proxy_pass http://127.0.0.1:5003; + proxy_http_version 
1.1; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; proxy_read_timeout 300s; } } -# ============================================================ -# n8n (port 5678) -# ============================================================ +# ============================================================================= +# Bitan 藥局 - bitan.wooo.work (待部署) +# ============================================================================= server { listen 80; - server_name n8n.awoooi.internal; + server_name bitan.wooo.work; + return 301 https://$server_name$request_uri; +} + +server { + listen 443 ssl http2; + server_name bitan.wooo.work; + + ssl_certificate /etc/letsencrypt/live/bitan.wooo.work/fullchain.pem; + ssl_certificate_key /etc/letsencrypt/live/bitan.wooo.work/privkey.pem; + + client_max_body_size 25m; location / { - proxy_pass http://127.0.0.1:5678; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; + proxy_pass http://192.168.0.110:3003; + proxy_http_version 1.1; proxy_set_header Upgrade $http_upgrade; proxy_set_header Connection "upgrade"; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; } } -# ============================================================ -# Open WebUI (port 3010) -# ============================================================ +# ============================================================================= +# VTuber - vtuber.wooo.work +# ============================================================================= server { - listen 80; - server_name open-webui.awoooi.internal; + server_name vtuber.wooo.work; + + location /.well-known/acme-challenge/ { + root /var/www/html; + } location / { - proxy_pass http://127.0.0.1:3010; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; + proxy_pass https://192.168.0.110; + proxy_http_version 1.1; proxy_set_header Upgrade $http_upgrade; proxy_set_header Connection "upgrade"; - proxy_read_timeout 300s; - } -} - -# 
============================================================ -# Docker Registry (port 5001) -# ============================================================ -server { - listen 80; - server_name registry.awoooi.internal; - - location / { - proxy_pass http://127.0.0.1:5001; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; - client_max_body_size 2g; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; } + + listen 443 ssl; # managed by Certbot + ssl_certificate /etc/letsencrypt/live/vtuber.wooo.work/fullchain.pem; # managed by Certbot + ssl_certificate_key /etc/letsencrypt/live/vtuber.wooo.work/privkey.pem; # managed by Certbot + include /etc/letsencrypt/options-ssl-nginx.conf; # managed by Certbot + ssl_dhparam /etc/letsencrypt/ssl-dhparams.pem; # managed by Certbot + +} + +server { + if ($host = vtuber.wooo.work) { + return 301 https://$host$request_uri; + } # managed by Certbot + + + listen 80; + server_name vtuber.wooo.work; + return 404; # managed by Certbot + + } diff --git a/k8s/monitoring/prometheus.yml b/k8s/monitoring/prometheus.yml index 2452a9fd..14ad7764 100644 --- a/k8s/monitoring/prometheus.yml +++ b/k8s/monitoring/prometheus.yml @@ -57,8 +57,8 @@ scrape_configs: - https://mo.wooo.work - http://192.168.0.188:4000/health/liveliness - http://192.168.0.110:3001 - - http://192.168.0.120:31234 - - http://192.168.0.120:31235 + - http://192.168.0.125:32334/api/v1/health + - http://192.168.0.125:32335 - https://www.tsenyang.com - http://stock.wooo.work - https://bitan.wooo.work @@ -93,8 +93,8 @@ scrape_configs: - 192.168.0.188:6380 - 192.168.0.188:8089 # K3s Worker - - 192.168.0.120:31234 - - 192.168.0.120:31235 + - 192.168.0.125:32334 + - 192.168.0.125:32335 relabel_configs: - source_labels: [__address__] target_label: __param_target diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml index 56163c6e..9521dec1 100644 --- a/ops/monitoring/alerts-unified.yml 
+++ b/ops/monitoring/alerts-unified.yml @@ -15,6 +15,39 @@ groups: + # ========================================================================= + # Full-stack recovery scorecard recording rules + # ========================================================================= + - name: full_stack_recovery_scorecard_rules + interval: 60s + rules: + - record: awoooi_recovery_core_ready + expr: | + sum without(result) ( + awoooi_cold_start_last_result{host="110",scope="110_120_121_188",result="green"} == bool 1 + ) + * on(host,scope) ( + awoooi_cold_start_warn_gates{host="110",scope="110_120_121_188"} == bool 0 + ) + * on(host,scope) ( + awoooi_cold_start_blocked_gates{host="110",scope="110_120_121_188"} == bool 0 + ) + * on(host,scope) ( + (time() - awoooi_cold_start_last_green_timestamp{host="110",scope="110_120_121_188"}) < bool 3600 + ) + + - record: awoooi_recovery_dr_offsite_ready + expr: | + max by(host) ( + awoooi_backup_offsite_configured{host="110"} == bool 1 + ) + * on(host) max by(host) ( + awoooi_backup_offsite_fresh{host="110"} == bool 1 + ) + * on(host) min by(host) ( + awoooi_backup_credential_escrow_fresh{host="110"} == bool 1 + ) + # ========================================================================= # 主機層告警 (host_alerts) # ========================================================================= @@ -41,7 +74,7 @@ groups: severity: warning layer: systemd-188 team: ops - auto_repair: "true" + auto_repair: "false" # MCP Phase 2a (ADR-071, 2026-04-11 Claude Sonnet 4.6): SSH MCP 路由標籤 mcp_provider: "ssh_host" host_type: "bare_metal" @@ -49,9 +82,6 @@ groups: annotations: summary: "主機 {{ $labels.host }} CPU 高負載" description: "CPU 使用率超過 90% 持續 10 分鐘;若 load5/core 未超過 1.5,先視為容量觀察與診斷,不直接修復。" - # 2026-05-02 ogt + Claude Sonnet 4.6: 引導 LLM 走 SSH 診斷而非 kubectl - auto_repair_action: "ssh {{ $labels.instance }} 'ps aux --sort=-%cpu | head -20' (host CPU 診斷;禁 kubectl restart awoooi-* — 主因常為第三方服務 Sentry/ClickHouse/Snuba)" - runbook: "host CPU 高負載排查:先 SSH ps aux 看 
top 進程;若為第三方服務(Sentry/ClickHouse 等)寫 ADR 升級資源或調 limit,禁止 kubectl restart 跨 domain" - alert: HostLoadAverageSustainedHigh # 2026-05-05 ogt + Codex: 110/188 長時間過載基線。 @@ -86,9 +116,6 @@ groups: annotations: summary: "主機 {{ $labels.host }} 記憶體不足" description: "記憶體使用率超過 85%" - # 2026-05-02 ogt + Claude Sonnet 4.6: 引導 LLM 走 SSH 診斷 - auto_repair_action: "ssh {{ $labels.instance }} 'ps aux --sort=-%mem | head -20' (host 記憶體診斷;禁 kubectl restart — 主因常為第三方服務)" - runbook: "host 記憶體不足排查:SSH 看 top 進程;若為第三方服務需擴容或調 limit" - alert: HostOutOfDiskSpace expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 85 @@ -173,7 +200,7 @@ groups: description: "過去 24 小時有備份失敗" - alert: VeleroBackupNotRun - expr: time() - velero_backup_last_successful_timestamp > 86400 + expr: max by(host, namespace) (awoooi_velero_latest_completed_backup_fresh{host="110",namespace="velero"}) == 0 for: 10m labels: severity: critical @@ -183,7 +210,7 @@ groups: auto_repair: "false" annotations: summary: "Velero 超過 24 小時未成功備份" - description: "最後一次成功備份超過 24 小時" + description: "backup health exporter 顯示 latest Completed Velero backup 超過 25 小時或不存在。" # Sprint C-2 Host rsync 備份告警 (2026-04-11 Claude Sonnet 4.6) # backup-from-110.sh 成功時寫入 /var/run/backup-110.last_success @@ -521,26 +548,8 @@ groups: team: platform auto_repair: "false" annotations: - summary: "Alertmanager 主鏈路 2 小時內未收到告警" - description: "Alertmanager 是固定主鏈路;Sentry/SignOz 沉默不代表鏈路故障,錯誤率另有 AlertChainBroken_* 規則監控。請執行 Smoke Test" - - - alert: SourceProviderIngestionStale - expr: | - time() - max by (source) ( - awoooi_alert_chain_last_success_timestamp{source=~"sentry|signoz"} - ) > 86400 - for: 15m - labels: - severity: warning - layer: k8s - component: source-ingestion - team: platform - auto_repair: "false" - alert_category: "alertchain_provider_freshness" - annotations: - summary: "{{ $labels.source }} source ingestion 超過 24 小時未更新" - description: "{{ $labels.source }} webhook 
endpoint 可能仍健康,但 AwoooP source dossier 已超過 24 小時沒有新事件。這是 provider ingestion / upstream smoke / correlation freshness 缺口,不是 Alertmanager 主鏈路故障。" - runbook: "先查 /api/v1/webhooks/{{ $labels.source }}/health,再查 /api/v1/platform/events/dossier/coverage?provider={{ $labels.source }};若 endpoint OK 但 latest stale,檢查上游 Sentry/SignOz notification channel 或排程 smoke。" + summary: "2 小時內未收到任何告警 ({{ $labels.source }})" + description: "可能是告警鏈路問題,請執行 Smoke Test" - alert: AlertChainUnhealthy expr: awoooi_alert_chain_healthy == 0 @@ -633,8 +642,6 @@ groups: alert_category: infrastructure notification_type: TYPE-1 auto_repair: "true" - mcp_provider: "ssh_host" - host_type: "bare_metal" annotations: summary: "容器 {{ $labels.container_name }} CPU 持續超過 2 core" description: "{{ $labels.container_name }} 5m CPU 使用 >2 core 且持續 10 分鐘,需檢查是否缺 cpus limit、merge backlog、爬蟲尖峰或模型推理。" @@ -651,8 +658,6 @@ groups: alert_category: infrastructure notification_type: TYPE-3 auto_repair: "true" - mcp_provider: "ssh_host" - host_type: "bare_metal" annotations: summary: "容器 {{ $labels.container_name }} CPU 持續超過 4 core" description: "{{ $labels.container_name }} 已持續吃超過 4 core,會拖垮 110/188 主機;需要 AI 判斷是否限流、暫停 worker 或套用既有 playbook。" @@ -670,8 +675,6 @@ groups: alert_category: infrastructure notification_type: TYPE-1 auto_repair: "true" - mcp_provider: "ssh_host" - host_type: "bare_metal" annotations: summary: "容器 {{ $labels.container_name }} 記憶體超過 limit 85%" description: "{{ $labels.container_name }} memory / limit > 85% 持續 10 分鐘。若是 Kafka/ClickHouse/Gitea/Taskworker,需先判斷 workload,不可直接降 limit。" @@ -689,8 +692,6 @@ groups: alert_category: infrastructure notification_type: TYPE-3 auto_repair: "true" - mcp_provider: "ssh_host" - host_type: "bare_metal" annotations: summary: "容器 {{ $labels.container_name }} 15 分鐘重啟超過 5 次" description: "Docker restart textfile exporter 顯示 {{ $labels.container_name }} 重啟暴增,避免再次出現 litellm 24,464 次靜默崩潰。" @@ -708,8 +709,6 @@ groups: alert_category: infrastructure notification_type: TYPE-1 
auto_repair: "true" - mcp_provider: "ssh_host" - host_type: "bare_metal" annotations: summary: "容器 {{ $labels.container_name }} 缺 CPU 或 memory limit" description: "{{ $labels.container_name }} 在 Docker Host 上沒有完整 CPU/memory guardrail,長時間尖峰可能拖垮 110/188。" @@ -727,8 +726,6 @@ groups: alert_category: infrastructure notification_type: TYPE-1 auto_repair: "true" - mcp_provider: "ssh_host" - host_type: "bare_metal" annotations: summary: "Gitea Actions job {{ $labels.container_name }} 執行超過 20 分鐘" description: "{{ $labels.container_name }} 已超過 20 分鐘,110 曾出現 code-review/CD job timeout 失效而持續佔用 runner。" @@ -746,8 +743,6 @@ groups: alert_category: infrastructure notification_type: TYPE-3 auto_repair: "true" - mcp_provider: "ssh_host" - host_type: "bare_metal" annotations: summary: "Systemd runner {{ $labels.unit }} 15 分鐘重啟超過 2 次" description: "{{ $labels.unit }} 在 15 分鐘內重啟暴增;110 曾發生 WatchdogSec=5min 造成 runner 每 5 分鐘自殺重啟。" @@ -764,8 +759,6 @@ groups: alert_category: infrastructure notification_type: TYPE-1 auto_repair: "true" - mcp_provider: "ssh_host" - host_type: "bare_metal" annotations: summary: "Systemd runner {{ $labels.unit }} 啟用了 WatchdogSec" description: "{{ $labels.unit }} WatchdogSec={{ $value }} 秒。GitHub Actions runner service 不應被 systemd watchdog 週期性殺掉。" @@ -782,111 +775,12 @@ groups: alert_category: infrastructure notification_type: TYPE-1 auto_repair: "true" - mcp_provider: "ssh_host" - host_type: "bare_metal" annotations: summary: "Systemd runner {{ $labels.unit }} 缺 CPU 或 memory quota" description: "{{ $labels.unit }} 仍為 unlimited;CI runner 會與 Sentry/ClickHouse/Gitea 搶主機 CPU。" auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'systemctl show {{ $labels.unit }} -p CPUQuotaPerSecUSec -p MemoryMax -p ActiveState -p SubState'" runbook: "建議 baseline:每個 runner CPUQuota=200%、MemoryMax=2G;由 /home/wooo/scripts/apply-runner-systemd-guardrails.sh 套用,若仍過載再限制並行度或分流。" - # ========================================================================= - # Full-stack 
reboot/cold-start gate monitor - # ========================================================================= - - name: cold_start_recovery_alerts - interval: 60s - rules: - - alert: ColdStartMonitorMissing - # 2026-05-06 ogt + Codex: full-stack reboot recovery must have a durable signal, - # not only a one-off terminal transcript. - expr: absent(awoooi_cold_start_monitor_up{host="110"}) - for: 20m - labels: - severity: warning - layer: host-110 - team: ops - alert_category: infrastructure - notification_type: TYPE-1 - auto_repair: "true" - mcp_provider: "ssh_host" - host_type: "bare_metal" - annotations: - summary: "冷啟動 gate monitor 20 分鐘無指標" - description: "110 沒有暴露 awoooi_cold_start_monitor_up,代表 full-stack cold-start gate 沒有被 Prometheus 監控。" - auto_repair_action: "ssh 192.168.0.110 'crontab -l | sed -n \"/AWOOOI cold-start monitor start/,/AWOOOI cold-start monitor end/p\"; ls -l /home/wooo/node_exporter_textfiles/cold_start_recovery.prom /home/wooo/reboot-recovery/cold-start-last.log 2>/dev/null || true'" - runbook: "執行 scripts/reboot-recovery/install-cold-start-monitor-110.sh;只安裝 read-only textfile exporter,不需要 sudo。" - - - alert: ColdStartMonitorStale - expr: time() - awoooi_cold_start_last_run_timestamp{host="110"} > 1800 - for: 10m - labels: - severity: warning - layer: host-110 - team: ops - alert_category: infrastructure - notification_type: TYPE-1 - auto_repair: "true" - mcp_provider: "ssh_host" - host_type: "bare_metal" - annotations: - summary: "冷啟動 gate monitor 超過 30 分鐘未更新" - description: "cold_start_recovery.prom stale,無法確認 110/120/121/188 的重開機 gate 是否仍維持健康。" - auto_repair_action: "ssh 192.168.0.110 'tail -80 /tmp/awoooi-cold-start-monitor.cron.log 2>/dev/null || true; tail -120 /home/wooo/reboot-recovery/cold-start-last.log 2>/dev/null || true'" - runbook: "檢查 110 user cron、SSH key、/home/wooo/node_exporter_textfiles 權限;不要把 stale 當作服務可用。" - - - alert: ColdStartRecoveryBlocked - expr: awoooi_cold_start_blocked_gates{host="110"} > 0 or 
awoooi_cold_start_last_result{host="110",result="blocked"} == 1 - for: 5m - labels: - severity: critical - layer: full-stack - team: ops - alert_category: infrastructure - notification_type: TYPE-3 - auto_repair: "true" - mcp_provider: "ssh_host" - host_type: "bare_metal" - annotations: - summary: "全棧冷啟動 gate 有 BLOCKED" - description: "full-stack cold-start check 偵測到 {{ $value }} 個 blocked gate。AI 自動修復只能先蒐證與通知,不可釋放 runner/CD 或重啟 stateful service。" - auto_repair_action: "ssh 192.168.0.110 'tail -220 /home/wooo/reboot-recovery/cold-start-last.log'" - runbook: "從第一個 BLOCKED gate 開始修;遵守 docs/runbooks/FULL-STACK-COLD-START-SOP.md 的 phase order。" - - - alert: ColdStartRecoveryDegraded - expr: awoooi_cold_start_warn_gates{host="110"} > 0 or awoooi_cold_start_last_result{host="110",result="degraded"} == 1 - for: 30m - labels: - severity: warning - layer: full-stack - team: ops - alert_category: infrastructure - notification_type: TYPE-1 - auto_repair: "true" - mcp_provider: "ssh_host" - host_type: "bare_metal" - annotations: - summary: "全棧冷啟動 gate 持續 degraded" - description: "full-stack cold-start check 連續 30 分鐘有 WARN。此狀態不可宣告 reboot recovery 完成,也不可釋放高負載 runner/CD。" - auto_repair_action: "ssh 192.168.0.110 'tail -180 /home/wooo/reboot-recovery/cold-start-last.log'" - runbook: "清掉 WARN 後再執行 final gate:bash scripts/reboot-recovery/full-stack-cold-start-check.sh --watch --interval 60 --max-attempts 30 --send-alert-test。" - - - alert: ColdStartLastGreenTooOld - expr: (time() - awoooi_cold_start_last_green_timestamp{host="110"} > 21600) and awoooi_cold_start_last_green_timestamp{host="110"} > 0 - for: 30m - labels: - severity: warning - layer: full-stack - team: ops - alert_category: infrastructure - notification_type: TYPE-1 - auto_repair: "false" - mcp_provider: "ssh_host" - host_type: "bare_metal" - annotations: - summary: "全棧 cold-start monitor 超過 6 小時沒有 GREEN" - description: "上次 GREEN 已超過 6 小時,表示冷啟動 baseline 長期沒有完整通過。" - runbook: "檢查 
/home/wooo/reboot-recovery/cold-start-last.log;若僅因 read-only monitor 缺 final webhook POST,應修 monitor mode 而不是關告警。" - # ========================================================================= # MinIO / Kali 告警 # ========================================================================= @@ -1152,10 +1046,10 @@ groups: # 備份還原告警 (awoooi_backup_restore) — 從主機補回 2026-04-12 # ========================================================================= - name: awoooi_backup_restore - interval: 1h + interval: 1m rules: - alert: BackupRestoreTestFailed - expr: awoooi_backup_restore_test_success == 0 + expr: max by(host, namespace, cronjob) (awoooi_velero_restore_test_failed_jobs{host="110",namespace="velero",cronjob="backup-restore-test"}) > 0 for: 5m labels: severity: critical @@ -1164,11 +1058,37 @@ groups: auto_repair: "false" annotations: summary: "備份還原 dry-run 測試失敗" - description: "Velero restore dry-run 失敗,備份可能無法還原。立即人工驗證備份狀態。" - runbook: "執行 velero backup describe awoooi-daily 及 velero restore create --from-backup awoooi-daily --dry-run" + description: "velero namespace 中保留了失敗的 backup-restore-test Job,備份可能無法還原。立即人工驗證備份狀態。" + runbook: "先找最新 Completed Velero backup,再執行 restore dry-run;禁止在 production namespace 做真還原" + + - alert: BackupRestoreTestMissing + expr: absent(awoooi_velero_restore_test_cron_present{host="110",namespace="velero",cronjob="backup-restore-test"}) + for: 30m + labels: + severity: warning + alert_category: infrastructure + notification_type: TYPE-3 + auto_repair: "false" + annotations: + summary: "備份還原 dry-run 監控指標缺失" + description: "Prometheus 沒有收到 awoooi_velero_restore_test_cron_present;110 backup health exporter 或 120 kubectl 查詢可能失效。" + runbook: "檢查 110 backup_health.prom、SSH 110→120、以及 velero namespace 的 backup-restore-test CronJob" + + - alert: BackupRestoreTestCronMissing + expr: max by(host, namespace, cronjob) (awoooi_velero_restore_test_cron_present{host="110",namespace="velero",cronjob="backup-restore-test"}) == 0 + for: 15m + labels: + 
severity: critical + alert_category: infrastructure + notification_type: TYPE-3 + auto_repair: "false" + annotations: + summary: "備份還原 dry-run CronJob 缺失" + description: "velero namespace 找不到 backup-restore-test CronJob;備份可還原性沒有定期驗證。" + runbook: "kubectl apply k8s/awoooi-prod/17-configmap-backup-restore-scripts.yaml 與 16-cronjob-backup-restore-test.yaml" - alert: BackupRestoreTestStale - expr: (time() - awoooi_backup_restore_test_timestamp_seconds) > 691200 + expr: max by(host, namespace, cronjob) (awoooi_velero_restore_test_last_success_fresh{host="110",namespace="velero",cronjob="backup-restore-test"}) == 0 for: 10m labels: severity: warning @@ -1177,9 +1097,375 @@ groups: auto_repair: "false" annotations: summary: "備份還原測試超過 8 天未執行" - description: "上次備份測試距今 {{ $value | humanizeDuration }},週排程 CronJob 可能失效。" + description: "backup-restore-test CronJob 沒有 8 天內成功紀錄;週排程 CronJob 可能失效。" runbook: "檢查 velero namespace 中的 backup-restore-test CronJob 狀態" + # ========================================================================= + # Host / service / config backup health + # ========================================================================= + - name: full_stack_backup_health_alerts + interval: 1m + rules: + - alert: BackupHealthMonitorMissing110 + expr: absent(awoooi_backup_health_monitor_up{host="110"}) + for: 20m + labels: + severity: warning + layer: host-backup + component: backup-health-monitor + host: "110" + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "110 備份健康指標缺失" + description: "110 沒有輸出 backup_health.prom,無法確認資料庫、設定檔與服務備份是否新鮮。" + runbook: "用 Ansible `110-devops.yml --tags textfile_exporters` 或手動部署 scripts/ops/backup-health-textfile-exporter.py" + + - alert: BackupHealthMonitorMissing188 + expr: absent(awoooi_backup_health_monitor_up{host="188"}) + for: 20m + labels: + severity: warning + layer: host-backup + component: backup-health-monitor + host: "188" + team: ops + 
alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "188 備份健康指標缺失" + description: "188 沒有輸出 backup_health.prom,無法確認 110 rsync 與 momo PostgreSQL 備份是否新鮮。" + runbook: "用 Ansible `188-ai-web.yml --tags textfile_exporters` 或手動部署 scripts/ops/backup-health-textfile-exporter.py" + + - alert: BackupHealthMonitorStale + expr: time() - awoooi_backup_health_last_run_timestamp{host=~"110|188"} > 1800 + for: 10m + labels: + severity: warning + layer: host-backup + component: backup-health-monitor + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "{{ $labels.host }} 備份健康 exporter 超過 30 分鐘未更新" + description: "backup health textfile exporter stale,備份狀態不可觀測。" + runbook: "SSH 主機檢查 cron、/tmp/awoooi-backup-health-textfile-exporter.cron.log 與 node-exporter textfile collector" + + - alert: BackupExpectedJobMissing + expr: awoooi_backup_job_configured{host=~"110|188"} == 0 + for: 15m + labels: + severity: critical + layer: host-backup + component: backup-cron + team: ops + alert_category: infrastructure + notification_type: TYPE-3 + auto_repair: "false" + annotations: + summary: "{{ $labels.host }} 備份排程缺失:{{ $labels.exported_job }}" + description: "預期備份 cron/config 不存在;下一次重開機後資料可能沒有可用還原點。" + runbook: "依 docs/runbooks/FULL-STACK-COLD-START-SOP.md 的備份章節補回 cron,先 dry-run 再執行" + + - alert: BackupScheduleDuplicateActiveEntries + expr: awoooi_backup_cron_active_duplicate_count{host="110"} > 0 + for: 15m + labels: + severity: warning + layer: host-backup + component: backup-cron + host: "110" + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "110 備份 crontab 有重複 active entries" + description: "110 crontab 目前有 {{ $value }} 個 exact duplicate active entry;可能造成 offsite sync、verifier 或 status job 重複執行。" + runbook: "SSH 110 執行 `crontab -l | awk 'NF && $0 !~ /^#/ {count[$0]++} END {for (line in count) if 
(count[line] > 1) print count[line], line}'`,只移除重複 active entry,不要刪除未理解的備份排程。" + + - alert: BackupScheduleSingletonMismatch + expr: awoooi_backup_cron_singular_entry_ok{host="110"} == 0 + for: 15m + labels: + severity: warning + layer: host-backup + component: backup-cron + host: "110" + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "110 備份排程單一入口異常:{{ $labels.entry }}" + description: "{{ $labels.entry }} 應該剛好只有一個 active cron entry;目前 singular_entry_ok={{ $value }}(0 代表 active entry 數量不是剛好 1 個),可能造成排程缺失或重複執行。" + runbook: "用 Ansible `110-devops.yml --tags backup_jobs` 收斂排程,並用 `scripts/reboot-recovery/reboot-recovery-readiness-audit.sh --live --no-color` 驗證。" + + - alert: BackupScriptMissing + expr: awoooi_backup_script_present{host=~"110|188"} == 0 + for: 15m + labels: + severity: critical + layer: host-backup + component: backup-script + team: ops + alert_category: infrastructure + notification_type: TYPE-3 + auto_repair: "false" + annotations: + summary: "{{ $labels.host }} 備份腳本缺失:{{ $labels.script }}" + description: "備份排程可能存在,但實際腳本不存在或路徑漂移。" + runbook: "從 repo 部署對應 scripts/backup 或 scripts/ops 腳本,確認權限 0755" + + - alert: BackupJobStale + expr: awoooi_backup_job_fresh{host=~"110|188"} == 0 + for: 15m + labels: + severity: critical + layer: host-backup + component: backup-freshness + team: ops + alert_category: infrastructure + notification_type: TYPE-3 + auto_repair: "false" + annotations: + summary: "{{ $labels.host }} 備份過舊:{{ $labels.exported_job }}" + description: "{{ $labels.exported_job }} 最新成功證據超過 {{ $labels.max_age_hours }} 小時或不存在;來源 {{ $labels.source }},目標 {{ $labels.target }}。" + runbook: "先檢查備份 log 與磁碟空間,再手動執行對應備份;禁止直接刪除舊備份或 production 資料" + + - alert: BackupAggregateRunFailed + expr: awoooi_backup_last_run_failed_count{host="110",exported_job="backup_all"} > 0 + for: 10m + labels: + severity: warning + layer: host-backup + component: backup-all + host: "110" + team: ops + alert_category: infrastructure + notification_type:
TYPE-1 + auto_repair: "false" + annotations: + summary: "110 全服務備份最近一次有 {{ $value }} 個失敗項目" + description: "backup-all.sh 最近一次 aggregate run 仍有失敗;即使個別 DB 備份已手動補跑,也要重跑 aggregate backup 清除紅燈。" + runbook: "SSH 110 檢查 /backup/logs/cron.log 與 /backup/logs/backup.log,修正後執行 /backup/scripts/backup-all.sh" + + - alert: BackupConfigCapturePartial + expr: awoooi_backup_config_capture_ok{host="110",critical="true"} == 0 + for: 10m + labels: + severity: warning + layer: host-backup + component: backup-config-capture + host: "110" + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "110 設定檔備份缺少關鍵目標:{{ $labels.target }}" + description: "configs restic snapshot 雖可能存在,但最新設定檔備份未成功捕捉 {{ $labels.target }};source={{ $labels.source }}。這會影響下一次冷啟動或災難還原的設定還原完整性。" + runbook: "先修復對應主機或 K8s API 可達性,再執行 /backup/scripts/backup-configs.sh,確認 awoooi_backup_config_capture_ok 回到 1,最後補跑 Google Drive/rclone offsite sync。" + + - alert: BackupConfigCaptureStatusStale + expr: absent(awoooi_backup_config_capture_status_timestamp{host="110"}) or (time() - awoooi_backup_config_capture_status_timestamp{host="110"} > 172800) + for: 30m + labels: + severity: warning + layer: host-backup + component: backup-config-capture + host: "110" + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "110 設定檔備份覆蓋率狀態缺失或過舊" + description: "backup-configs.sh 沒有新鮮的 capture status;無法判斷 110/120/121/188/K8s 設定檔是否真的被最新 snapshot 捕捉。" + runbook: "部署新版 /backup/scripts/backup-configs.sh 與 /home/wooo/scripts/backup-health-textfile-exporter.py,執行 /backup/scripts/backup-configs.sh 後刷新 textfile exporter。" + + - alert: BackupIntegrityCheckMissingOrFailed + expr: absent(awoooi_backup_integrity_fresh{host="110",scope="restic_check"}) or awoooi_backup_integrity_fresh{host="110",scope="restic_check"} == 0 + for: 30m + labels: + severity: critical + layer: host-backup + component: backup-integrity + host: 
"110" + team: ops + alert_category: infrastructure + notification_type: TYPE-3 + auto_repair: "false" + annotations: + summary: "110 備份倉庫完整性檢查缺失或失敗" + description: "每週 restic check 沒有成功證據,或有 repo 檢查失敗;目前不能假設備份可讀。" + runbook: "SSH 110 執行 `/backup/scripts/check-backup-integrity.sh --mode check`,先看 /backup/logs/backup-integrity.log;禁止刪 repo 或 prune 直到確認原因" + + - alert: BackupRestoreDrillMissingOrFailed + expr: absent(awoooi_backup_integrity_fresh{host="110",scope="restore_drill"}) or awoooi_backup_integrity_fresh{host="110",scope="restore_drill"} == 0 + for: 30m + labels: + severity: warning + layer: host-backup + component: backup-restore-drill + host: "110" + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "110 備份抽樣還原演練缺失或失敗" + description: "每月 restore drill 沒有成功證據,備份雖可能新鮮,但尚未驗證可讀取還原。" + runbook: "SSH 110 執行 `/backup/scripts/check-backup-integrity.sh --mode restore-drill`;只允許還原到隔離暫存目錄,不得覆蓋 production" + + - alert: BackupOffsiteCopyNotConfigured + expr: sum by(host) (awoooi_backup_offsite_configured{host="110"}) == 0 + for: 1m + labels: + severity: warning + layer: host-backup + component: backup-offsite + host: "110" + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "110 尚未配置離機備份 provider" + description: "backup health exporter 未偵測到 Google Drive/rclone 或其他 offsite provider 配置;本地 restic 全綠仍不等於異地可恢復。" + runbook: "在 110 以 `/backup/scripts/configure-offsite-rclone.sh --interactive` 建立 Google Drive remote,產生 `/backup/offsite/*last_success` 證據;不得把 provider token 寫入 repo、Telegram 或 Prometheus label。" + + - alert: BackupOffsiteCopyStale + expr: | + ( + (sum by(host) (awoooi_backup_offsite_configured{host="110"}) > 0) + and + (sum by(host) (awoooi_backup_offsite_fresh{host="110"}) == 0) + ) + and + ( + (sum by(host) (awoooi_backup_offsite_full_sync_enabled{host="110"}) == 0) + or + ((time() - max by(host) 
(awoooi_backup_offsite_full_sync_enabled_timestamp{host="110"})) > 30 * 3600) + ) + for: 2h + labels: + severity: warning + layer: host-backup + component: backup-offsite + host: "110" + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "110 離機備份超過 48 小時未成功" + description: "已偵測到 offsite provider 配置,但沒有新鮮成功標記;本地備份可能無法抵抗整台 110 遺失。" + runbook: "SSH 110 檢查 Google Drive/rclone 同步 log 與 `/backup/offsite/*last_success`;full sync 需在 enable marker 與低負載門檻成立後由 `/backup/scripts/sync-offsite-backups.sh --mode sync` 鏡像本地 latest-only repo。" + + - alert: BackupRetentionPolicyNotLatestOnly + expr: | + absent(awoooi_backup_retention_latest_only{host="110"}) + or + awoooi_backup_retention_latest_only{host="110"} != 1 + or + absent(awoooi_backup_retention_offsite_delete_old_enabled{host="110",provider="rclone"}) + or + awoooi_backup_retention_offsite_delete_old_enabled{host="110",provider="rclone"} != 1 + for: 15m + labels: + severity: warning + layer: host-backup + component: backup-retention + host: "110" + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "110 備份保留策略不是 latest-only" + description: "operator 要求所有備份只保留最新一份;本地 restic 必須 keep-last=1,Google Drive/rclone 必須在成功 mirror 後刪除遠端舊檔。" + runbook: "檢查 `/backup/scripts/common.sh` 的 BACKUP_RETENTION_MODE=latest、KEEP_LAST=1 與 OFFSITE_SYNC_DELETE_OLD=1,刷新 backup-health textfile;必要時在備份成功後執行 `/backup/scripts/enforce-latest-only-retention.sh`。" + + - alert: BackupSnapshotRetentionExceeded + expr: awoooi_backup_job_snapshot_count{host="110",type="restic"} > 1 + for: 30m + labels: + severity: warning + layer: host-backup + component: backup-retention + host: "110" + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "110 備份 repo {{ $labels.exported_job }} 保留超過 1 份 snapshot" + description: "{{ $labels.exported_job }} 目前有 {{ $value }} 份 
restic snapshot;latest-only 策略要求每個 repo 全域只保留最新 1 份。" + runbook: "SSH 110 執行 `/backup/scripts/enforce-latest-only-retention.sh`;若仍未收斂,確認 `common.sh` 使用 `restic forget --group-by \"\" --keep-last 1 --prune`,避免 restic 依 path/tag 分組保留多份。" + + - alert: BackupOffsiteFullVerifyFailed + expr: | + awoooi_backup_offsite_fresh{host="110",provider="rclone"} == 1 + unless on(host, provider) + (awoooi_backup_offsite_remote_verify_ok{host="110",provider="rclone"} == 1) + for: 30m + labels: + severity: warning + layer: host-backup + component: backup-offsite + host: "110" + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "110 Google Drive full sync 完成但遠端驗證未通過" + description: "full offsite marker 已 fresh,但 verify-offsite-full-sync.sh 沒有證明 13 個 Google Drive repo 都可列出且符合 latest-only。" + runbook: "SSH 110 執行 `/backup/scripts/verify-offsite-full-sync.sh --write-textfile --no-color`,檢查 `/backup/logs/offsite-full-sync-verify.log` 與 `/home/wooo/node_exporter_textfiles/offsite_full_sync_verify.prom`。" + + - alert: BackupOffsiteRemoteSnapshotRetentionExceeded + expr: | + (awoooi_backup_offsite_remote_snapshot_count{host="110",provider="rclone"} > 1) + and on(host, provider) + (awoooi_backup_offsite_fresh{host="110",provider="rclone"} == 1) + for: 30m + labels: + severity: warning + layer: host-backup + component: backup-retention + host: "110" + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "Google Drive repo {{ $labels.repo }} 保留超過 1 份 snapshot" + description: "{{ $labels.repo }} 在 Google Drive/rclone 遠端目前有 {{ $value }} 份 snapshot;latest-only 策略要求遠端也只保留最新一份。" + runbook: "確認 110 `/backup/scripts/sync-offsite-backups.sh --mode sync` 使用 `rclone sync`、`OFFSITE_SYNC_DELETE_OLD=1`、`RCLONE_DRIVE_USE_TRASH=false`,再於低峰重新執行 full sync 與 verifier。" + + - alert: BackupCredentialEscrowEvidenceMissing + expr: awoooi_backup_credential_escrow_fresh{host="110"} 
== 0 + for: 1m + labels: + severity: warning + layer: host-backup + component: credential-escrow + host: "110" + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "備份憑證金庫證據缺失或過期:{{ $labels.item }}" + description: "{{ $labels.item }} 沒有 31 天內人工驗證證據;重建時可能找不到 restic/offsite/break-glass/DNS/OAuth 復原材料。" + runbook: "在密碼管理器或離線加密金庫完成雙人覆核後,只建立不含 secret 的 `/backup/escrow-evidence/{{ $labels.item }}.last_verified` 時間戳證據。" + # ========================================================================= # 基礎設施詳細告警 (awoooi_infrastructure_detailed) — 從主機補回 2026-04-12 # ========================================================================= @@ -1236,19 +1522,13 @@ groups: labels: severity: warning layer: systemd-188 - alert_category: host_resource + alert_category: infrastructure notification_type: TYPE-3 - # 2026-05-02 ogt + Claude Sonnet 4.6: ADR-068 飛輪 — disk full SOP - # auto_repair: false → true,路由到 ssh_host MCP Group B `ssh_docker_prune` - # 工具內含 >=75% 磁碟守衛,低於閾值 no-op,避免誤刪 - auto_repair: "true" - mcp_provider: "ssh_host" - host_type: "bare_metal" + auto_repair: "false" supersedes: PostgreSQLDiskGrowthRate annotations: summary: "主機 {{ $labels.instance }} 根目錄磁碟使用率 {{ $value | humanize }}% (>80%)" description: "磁碟使用率持續 10 分鐘超過 80%, 需清理或擴容. 常見原因: PG WAL, 日誌, container images, 舊 build cache." 
- auto_repair_action: "ssh {{ $labels.instance }} docker prune (image+volume+builder; gated by 75% disk usage)" runbook: "SSH 該主機: df -h / && du -sh /var/lib/postgresql/*/pg_wal /var/log /var/lib/docker" - alert: HostDiskUsageCritical @@ -1468,3 +1748,284 @@ groups: summary: "Prometheus ({{ $labels.instance }}) 停擺" description: "Prometheus 自己停擺 → 所有其他告警失效" runbook: "SSH 110 docker compose -f /home/wooo/monitoring/docker-compose.yml up -d prometheus" + + # ========================================================================= + # Full-stack cold-start recovery gate + # ========================================================================= + - name: cold_start_recovery_alerts + rules: + - alert: PrometheusRuleDriftGuardFailed + expr: | + absent(awoooi_prometheus_rule_drift_guard_last_run_timestamp{host="110"}) + or + (time() - max by(host) (awoooi_prometheus_rule_drift_guard_last_run_timestamp{host="110"}) > 900) + or + (awoooi_prometheus_rule_drift_guard_missing_required_count{host="110"} > 0) + or + (awoooi_prometheus_rule_drift_guard_current_matches_canonical{host="110"} == 0) + for: 10m + labels: + severity: critical + layer: systemd-110 + component: prometheus-rule-drift-guard + host: "110" + team: ops + alert_category: infrastructure + notification_type: TYPE-3 + auto_repair: "false" + annotations: + summary: "Prometheus 規則漂移防護失效" + description: "110 Prometheus rule drift guard 沒有新鮮成功指標、required rules 缺失,或 active alerts.yml 不等於 canonical rules。" + runbook: "執行 `bash scripts/ops/deploy-alerts.sh` 重新部署 canonical rules 與 drift guard,等待 1-2 個 Prometheus evaluation cycle 後重跑 readiness audit。" + + - alert: PrometheusRuleDriftAutoRepaired + expr: awoooi_prometheus_rule_drift_guard_repaired{host="110"} > 0 + for: 1m + labels: + severity: warning + layer: systemd-110 + component: prometheus-rule-drift-guard + host: "110" + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "Prometheus 規則漂移已被自動修復" 
+ description: "110 drift guard 最近一次偵測到 active Prometheus rules 漂移,已回復 canonical rules 並 reload Prometheus。" + runbook: "檢查 `/home/wooo/logs/prometheus-rule-drift-guard.log` 與 `/home/wooo/monitoring/alerts.yml.guard.bak.*`,找出誰覆寫了 active rules。" + + - alert: ColdStartMonitorMissing + expr: absent(awoooi_cold_start_monitor_up{host="110",scope="110_120_121_188"}) + for: 15m + labels: + severity: warning + layer: systemd-110 + component: cold-start-monitor + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "Cold-start monitor textfile metric missing" + description: "110 沒有輸出 awoooi_cold_start_monitor_up;重開機恢復 gate 目前不可觀測。" + runbook: "執行 scripts/reboot-recovery/install-cold-start-monitor-110.sh,確認 /home/wooo/node_exporter_textfiles/cold_start_recovery.prom" + + - alert: ColdStartMonitorStale + expr: time() - awoooi_cold_start_last_run_timestamp{host="110",scope="110_120_121_188"} > 900 + for: 10m + labels: + severity: warning + layer: systemd-110 + component: cold-start-monitor + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "Cold-start monitor stale" + description: "cold-start monitor 超過 15 分鐘沒有更新,距離上次執行 {{ $value | humanizeDuration }}。" + runbook: "SSH 110 檢查 crontab、/tmp/awoooi-cold-start-monitor.cron.log、cold-start-last.log" + + - alert: ColdStartRecoveryBlocked + expr: awoooi_cold_start_blocked_gates{host="110",scope="110_120_121_188"} > 0 + for: 5m + labels: + severity: critical + layer: full-stack + component: cold-start-gate + team: ops + alert_category: infrastructure + notification_type: TYPE-3 + auto_repair: "false" + annotations: + summary: "Full-stack cold-start recovery BLOCKED" + description: "cold-start gate 有 {{ $value }} 個 BLOCKED gate。AI 修復需保持 observe-only,先處理第一個 blocked gate。" + runbook: "查看 /home/wooo/reboot-recovery/cold-start-last.log;依 docs/runbooks/FULL-STACK-COLD-START-SOP.md 的 P0→P2 順序修復" + + - 
alert: K3sNodeFilesystemErrorGateBlocked + expr: awoooi_cold_start_blocker_reason{host="110",scope="110_120_121_188",reason="k3s_node_filesystem_error",target="120"} > 0 + for: 5m + labels: + severity: critical + layer: k3s + component: node-filesystem + host: "120" + target_host: "120" + team: ops + alert_category: infrastructure + notification_type: TYPE-3 + auto_repair: "false" + annotations: + summary: "120 K3s 節點 filesystem error 阻擋重開機放行" + description: "cold-start log 偵測到 120 Node event 仍有 EXT4/I/O/deleted inode 類錯誤;即使 Pod Running、網站 200,也不可宣告下一次重開機安全。" + runbook: "查看 110 `/home/wooo/reboot-recovery/cold-start-last.log`,執行 `scripts/reboot-recovery/120-fsck-maintenance-checklist.sh` 做維護前只讀檢查;維護窗內用 console/rescue 對 120 root LV 執行 fsck,禁止 online fsck。" + + - alert: ColdStartHost120Unreachable + expr: awoooi_cold_start_blocker_reason{host="110",scope="110_120_121_188",reason="host_unreachable",target="120"} > 0 + for: 3m + labels: + severity: critical + layer: host + component: host-reachability + host: "120" + target_host: "120" + team: ops + alert_category: infrastructure + notification_type: TYPE-3 + auto_repair: "false" + annotations: + summary: "120 主機不可達,Full-stack cold-start 已阻擋" + description: "110 cold-start monitor 無法 ping/SSH 192.168.0.120;目前只能由 121/VIP 撐住 K3s,不能宣告所有主機重開機恢復完成。" + runbook: "查看 120 console。若停在 initramfs/manual fsck,先對 root LV 做離線 fsck;若主機關機或網卡異常,先恢復電源/網路,再重跑 full-stack cold-start gate。禁止從自動修復直接重啟其他服務掩蓋主機離線。" + + - alert: ColdStartRecoveryDegraded + expr: awoooi_cold_start_warn_gates{host="110",scope="110_120_121_188"} > 0 + for: 15m + labels: + severity: warning + layer: full-stack + component: cold-start-gate + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "Full-stack cold-start recovery DEGRADED" + description: "cold-start gate 有 {{ $value }} 個 WARN gate;核心可用但不應放行 runner/CD/AI auto-repair full execution。" + runbook: "查看 
/home/wooo/reboot-recovery/cold-start-last.log,修到 PASS/WARN/BLOCKED = green" + + - alert: ColdStartLastGreenTooOld + expr: time() - awoooi_cold_start_last_green_timestamp{host="110",scope="110_120_121_188"} > 3600 + for: 15m + labels: + severity: warning + layer: full-stack + component: cold-start-gate + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "Full-stack cold-start gate has not been GREEN recently" + description: "距離上次 GREEN 已超過 {{ $value | humanizeDuration }};需要確認 110/120/121/188 與排程/網站 gate。" + runbook: "執行 SSH_BATCH_MODE=yes bash scripts/reboot-recovery/full-stack-cold-start-check.sh --send-alert-test" + + # ========================================================================= + # Host storage health / dirty reboot evidence + # ========================================================================= + - name: host_storage_health_alerts + rules: + - alert: Host110StorageHealthMonitorMissing + expr: absent(awoooi_host_storage_monitor_up{host="110"}) + for: 15m + labels: + severity: warning + layer: systemd-110 + component: storage-health-monitor + host: "110" + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "110 storage health textfile metric missing" + description: "110 沒有輸出 storage_health.prom;dirty reboot、root read-only 與 fsck 證據目前不可觀測。" + runbook: "用 Ansible `110-devops.yml --tags textfile_exporters` 或手動部署 scripts/ops/storage-health-textfile-exporter.py,確認 /home/wooo/node_exporter_textfiles/storage_health.prom" + + - alert: Host188StorageHealthMonitorMissing + expr: absent(awoooi_host_storage_monitor_up{host="188"}) + for: 15m + labels: + severity: warning + layer: systemd-188 + component: storage-health-monitor + host: "188" + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "188 storage health textfile metric missing" + description: 
"188 沒有輸出 storage_health.prom;dirty reboot、root read-only 與 fsck 證據目前不可觀測。" + runbook: "用 Ansible `188-ai-web.yml --tags textfile_exporters` 或手動部署 scripts/ops/storage-health-textfile-exporter.py,確認 /home/ollama/node_exporter_textfiles/storage_health.prom" + + - alert: HostStorageHealthMonitorStale + expr: time() - awoooi_host_storage_last_run_timestamp{host=~"110|188"} > 900 + for: 10m + labels: + severity: warning + layer: host-storage + component: storage-health-monitor + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "主機 {{ $labels.host }} storage health textfile stale" + description: "storage health exporter 超過 15 分鐘沒有更新;重開機後檔案系統風險不可觀測。" + runbook: "SSH 主機檢查 cron、/tmp/awoooi-storage-health-textfile-exporter.cron.log 與 node-exporter textfile collector" + + - alert: HostRootFilesystemReadOnly + expr: awoooi_host_root_filesystem_readonly{host=~"110|188",mountpoint="/"} > 0 + for: 1m + labels: + severity: critical + layer: host-storage + component: root-filesystem + team: ops + alert_category: infrastructure + notification_type: TYPE-3 + auto_repair: "false" + annotations: + summary: "主機 {{ $labels.host }} root filesystem 已變成 read-only" + description: "root filesystem 被掛載為唯讀,服務可能仍暫時存活但寫入會失敗;禁止自動修復,先保全證據並規劃維護窗。" + runbook: "依 docs/runbooks/FULL-STACK-COLD-START-SOP.md §16:保全 journal/df/mount 證據,確認備份,再安排 console/offline fsck" + + - alert: HostCurrentBootStorageErrorsDetected + expr: awoooi_host_storage_error_count{host=~"110|188",source="journalctl-kernel",boot="current"} > 0 + for: 5m + labels: + severity: critical + layer: host-storage + component: kernel-storage + team: ops + alert_category: infrastructure + notification_type: TYPE-3 + auto_repair: "false" + annotations: + summary: "主機 {{ $labels.host }} current boot 有 storage/kernel 錯誤" + description: "目前開機週期已出現 filesystem、I/O 或 fsck 類錯誤;不可只重啟容器掩蓋問題。" + runbook: "先執行 read-only 診斷:journalctl -k -p warning..alert、mount、df、smartctl/raid 
狀態;必要時進入維護窗處理" + + - alert: HostPreviousBootStorageErrorsDetected + expr: awoooi_host_storage_error_count{host=~"110|188",source="journalctl-kernel",boot="previous"} > 0 + for: 30m + labels: + severity: warning + layer: host-storage + component: dirty-reboot-evidence + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "主機 {{ $labels.host }} previous boot 保留 storage/fsck 錯誤證據" + description: "上一個開機週期留有 storage/fsck 錯誤,代表這次重開機事故需要完成 fsck、備份與容量後續檢查。" + runbook: "把證據寫入 docs/LOGBOOK.md,確認 full-stack cold-start gate 與 P3 gate;下一次維護窗補 offline fsck/SMART/RAID 檢查" + + - alert: HostFsckLogErrorsDetected + expr: sum by(host) (awoooi_host_storage_error_count{host=~"110|188",boot="last-fsck-log"}) > 0 + for: 30m + labels: + severity: warning + layer: host-storage + component: fsck-log + team: ops + alert_category: infrastructure + notification_type: TYPE-1 + auto_repair: "false" + annotations: + summary: "主機 {{ $labels.host }} fsck log 保留錯誤證據" + description: "主機 fsck log 內仍有 inconsistency 或 I/O 類錯誤文字;這是事故後追蹤項,不應交給自動修復直接處理。" + runbook: "確認 /run/initramfs/fsck.log 與 /var/log/fsck/*,將結果納入重開機事故報告與下次維護窗檢查項" diff --git a/ops/reboot-recovery/full-stack-backup-baseline.yml b/ops/reboot-recovery/full-stack-backup-baseline.yml new file mode 100644 index 00000000..77514995 --- /dev/null +++ b/ops/reboot-recovery/full-stack-backup-baseline.yml @@ -0,0 +1,306 @@ +version: 2026-05-19.v7 +scope: "110/120/121/188 全服務、資料、設定與還原驗證備份基準" + +principles: + - "資料備份與設定備份分層:DB/PV/物件資料負責資料,configs 負責可啟動狀態。" + - "Secrets、TLS private keys、SSH host keys 可進加密 restic/Velero 備份,但不得印到 log、repo、Telegram。" + - "備份系統本身也要備份:restic repository health、password/key escrow、offsite copy、restore drill evidence 缺一不可。" + - "每個備份都必須有三個證據:排程存在、最近成功時間、還原或 dry-run 驗證。" + - "AI 自動修復在備份/還原領域預設 observe-only;禁止未經新成功備份證據與 baseline gate 的刪除、DROP DB、覆蓋 production namespace。" + - "2026-05-19 起備份保留策略為 latest-only:每個本地 restic repo、188 MOMO 檔案備份與 Google Drive/rclone 
離機鏡像都只保留最新一份。" + +backup_domains: + - id: host_configs + owner_host: "110" + script: "/backup/scripts/backup-configs.sh" + repository: "/backup/configs" + schedule: "daily via /backup/scripts/backup-all.sh" + max_age_hours: 48 + includes: + - "110/188/120/121: /etc/nginx, /etc/systemd/system, /etc/cron.d, /etc/crontab" + - "110/188/120/121: /etc/letsencrypt, /etc/ssh, /etc/fstab, /etc/hosts, /etc/netplan" + - "110: /opt/harbor, /opt/sentry, /home/wooo/monitoring, /home/wooo/scripts, /backup/scripts" + - "188: /opt/n8n, /opt/open-webui, /opt/litellm, /opt/signoz, /home/ollama/momo-pro, /home/ollama/bin" + - "120/121: /etc/rancher/k3s, K3s manifests, containerd/keepalived host config" + - "K8s: workloads, services, ingress, configmaps, secrets, RBAC, PV/PVC, CRDs, Velero schedules/backups" + restore_test: "抽樣 restic restore 到隔離目錄,確認 nginx/systemd/K8s YAML 可讀;不得直接覆蓋 production。" + + - id: awoooi_databases + owner_host: "110" + scripts: + - "/backup/scripts/backup-awoooi.sh" + - "/backup/scripts/backup-awoooi-frequent.sh" + repository: "/backup/awoooi" + schedule: "daily 02:00 + high-frequency 08:00/14:00/20:00" + max_age_hours: 7 + includes: + - "awoooi_prod" + - "awoooi_dev" + - "k3s_datastore if present" + restore_test: "pg_restore/psql 到隔離 DB,驗證 schema 與核心表筆數;不可覆蓋 production DB。" + + - id: gitea_and_ci + owner_host: "110" + repository: "/backup/gitea" + schedule: "daily via backup-all" + max_age_hours: 48 + includes: + - "Gitea DB" + - "Git repositories" + - "Gitea app.ini 與 runner registration/config evidence" + - "workflow definitions from repos" + restore_test: "抽樣 git fsck / git clone;Gitea DB dump 可讀。" + + - id: harbor_registry + owner_host: "110" + repository: "/backup/harbor" + schedule: "daily via backup-all" + max_age_hours: 48 + includes: + - "Harbor DB/config" + - "registry storage" + - "TLS/config state from configs backup" + restore_test: "抽樣 registry manifest/blobs 可讀;Harbor compose/config 可重建。" + + - id: observability + owner_host: "110" + 
repositories: + - "/backup/monitoring" + - "/backup/signoz" + schedule: "daily via backup-all" + max_age_hours: 48 + includes: + - "Prometheus TSDB" + - "Grafana dashboards/datasources" + - "Alertmanager config/state" + - "SignOz ClickHouse/SQLite/config" + - "blackbox/node-exporter textfile config" + restore_test: "Prometheus/Grafana/Alertmanager 設定 lint;SignOz dump 可列出表。" + + - id: sentry + owner_host: "110" + coverage_status: "covered_by_backup_sentry_script" + script: "/backup/scripts/backup-sentry.sh" + repository: "/backup/sentry" + schedule: "daily via backup-all; config also covered by /backup/configs" + max_age_hours: 48 + includes: + - "Sentry compose/.env/config" + - "Sentry Postgres logical dump" + - "Sentry ClickHouse volume snapshot and table inventory" + - "Sentry Kafka queue volume snapshot" + - "Sentry Redis / SeaweedFS / Taskbroker / Vroom / Symbolicator state" + restore_test: "先在隔離 compose stack 驗證 Postgres dump 可讀、ClickHouse volume 可掛載、web/symbolicator/snuba 可啟動。" + + - id: credential_escrow + owner_host: "human-controlled" + coverage_status: "gap_p0_out_of_band_escrow_required" + repository: "不可放在同一個 restic repo;需放在密碼管理器或離線加密金庫" + schedule: "每次新增/輪替 Secret 後立即更新 escrow;每月人工抽查" + max_age_hours: 744 + includes: + - "restic password files / repository keys / Google Drive rclone.conf / offsite provider credentials" + - "Cloud DNS / registrar / CDN / tunnel 管理帳號與 recovery codes" + - "Gitea/Harbor/Sentry/admin break-glass credentials" + - "Git deploy keys、runner registration tokens、K8s bootstrap/admin kubeconfig 的復原路徑" + - "Google Drive / OAuth / Telegram / AI provider tokens 的輪替與復原流程,不包含明文輸出" + restore_test: "用人工雙人覆核方式確認 key escrow 可找到、可解密、可用於列出 snapshots;不得把 Secret 值寫進 repo 或監控 label。" + + - id: external_dns_and_public_routes + owner_host: "110" + coverage_status: "covered_by_public_route_evidence_backup; provider_zone_export_still_requires_credentials" + script: "/backup/scripts/backup-public-routes.sh" + repository: "/backup/public-routes" + 
schedule: "daily via backup-all; DNS/CDN provider zone export after every routing change when credentials are available" + max_age_hours: 168 + includes: + - "wooo.work DNS answers;CDN/Cloudflare/registrar 設定匯出仍需 provider token" + - "public nginx route map、TLS renewal config、ACME account evidence" + - "blackbox public endpoint inventory 與 expected status codes" + - "VPN/tunnel/port-forward/HA VIP 對外路由設定" + restore_test: "從匯出檔重建 public route checklist,確認 awoooi/mo/registry/harbor/gitea 等 endpoint 對應正確;不得在測試中改正式 DNS。" + + - id: backup_repositories_and_integrity + owner_host: "110/188/121/offsite" + coverage_status: "covered_locally_by_check_backup_integrity_script; offsite copy still depends on credentials" + scripts: + - "/backup/scripts/check-backup-integrity.sh" + - "/backup/scripts/configure-offsite-rclone.sh" + - "/backup/scripts/configure-offsite-b2.sh" + - "/backup/scripts/sync-offsite-backups.sh" + - "/backup/scripts/backup-offsite-readiness-gate.sh" + - "/backup/scripts/offsite-escrow-evidence-report.sh" + - "/backup/scripts/mark-credential-escrow-verified.sh" + repositories: + - "/backup/* restic repos" + - "/home/ollama/backup/110" + - "Google Drive/rclone/offsite remote when credentials are configured" + schedule: "daily freshness; daily 06:10 offsite status; daily 06:15 offsite escrow evidence report; weekly restic check; monthly sample restore drill" + max_age_hours: 168 + includes: + - "restic snapshots metadata、repo config、locks/prune policy" + - "188 backup-from-110 rsync copy" + - "offsite copy status and retention policy" + - "restore drill logs with snapshot id and restored object counts" + restore_test: "每週 `restic check --read-data-subset=1%`;每月 `restic dump latest ` 到 0700 暫存目錄驗證可讀。" + retention_policy: "latest-only;本地 restic repo 新 snapshot 成功後 --group-by \"\" --keep-last=1 + prune;188 MOMO 檔案備份只留最新一份;離機 Google Drive/rclone 以本地 repo 為準鏡像刪舊。" + offsite_sync_policy: "offsite-escrow-evidence-report.sh 先產出 redacted 證據與
NEXT_STEP;backup-offsite-readiness-gate.sh 再做 status / dry-run-small / pre-full-sync;sync-offsite-backups.sh 預設 status;dry-run 可隨時執行;Google Drive/rclone full sync 需選低峰窗口,成功後才寫 /backup/offsite/rclone-last-success,且 OFFSITE_SYNC_DELETE_OLD=1 時會刪除遠端舊檔。full sync 不得與本地備份程序重疊,且必須距離下一次備份排程至少 270 分鐘。" + + - id: momo_web_and_data + owner_host: "188" + scripts: + - "/backup/scripts/backup-momo.sh on 110" + - "/home/ollama/bin/momo-pg-backup.sh on 188" + repositories: + - "/backup/momo" + - "/home/ollama/momo_backups" + schedule: "110 daily + 188 daily 02:00" + max_age_hours: 30 + includes: + - "mo.wooo.work app DB" + - "momo uploads/files/config" + - "scheduler config and cron" + restore_test: "隔離 DB restore 後跑 app health check;確認 mo.wooo.work 需要的資料表與資料筆數。" + + - id: ai_and_tooling + owner_host: "188" + coverage_status: "covered_by_backup_ai_artifacts_for_manifest_and_metadata; model_blobs_require_manual_classification" + script: "/backup/scripts/backup-ai-artifacts.sh" + repositories: + - "/backup/langfuse" + - "/backup/open-webui" + - "/backup/clawbot" + - "/backup/configs" + - "/backup/ai-artifacts" + schedule: "daily via backup-all" + max_age_hours: 48 + includes: + - "Langfuse traces/evaluations" + - "Open-WebUI conversations/config" + - "LiteLLM config, model routing, provider state" + - "OpenClaw/ClawBot Redis or persistent state" + - "n8n workflows/credentials through encrypted config backup" + - "Ollama model manifest/tag list/Modelfile;自製或不可重新下載的 model/adapters 才備份 blobs" + - "KM/RAG/vector 狀態;若存在於 AWOOOI DB,隨 DB dump 還原;若是外部 vector store 必須有獨立 dump" + restore_test: "抽樣匯出 workflow/config;Redis dump 可讀;Langfuse/Open-WebUI DB dump 可讀;Ollama manifest tar 可列出模型 tags。" + + - id: source_of_truth_and_ops_memory + owner_host: "110/Gitea" + coverage_status: "gap_p1_sanitized_operational_context" + repositories: + - "/backup/gitea" + - "/backup/configs" + schedule: "Gitea daily; configs daily; 每次事故後更新 docs/LOGBOOK.md 與 runbooks" + max_age_hours: 48 + includes: + - "所有 Git 
repositories、Ansible roles/playbooks/inventory、K8s manifests、monitoring rules" + - "AGENTS/HARD_RULES/runbooks/LOGBOOK/ADR 等決策與啟動順序文件" + - "AI agent handoff summaries and operational memory exports after sanitization" + - "CI/CD workflow definitions、runner labels、deployment marker policy" + restore_test: "從 Gitea backup 抽樣 clone repo,跑 ansible/k8s/alerts YAML validation;不得備份含明文 token 的聊天或 shell transcript。" + + - id: k3s_and_velero + owner_host: "120" + schedule: "Velero daily-awoooi-prod + weekly restore dry-run" + max_age_hours: 25 + includes: + - "K8s manifests and CRDs" + - "Secrets/ConfigMaps/RBAC" + - "PVC/PV snapshots via Velero provider" + - "backup-restore-test CronJob and result metrics" + restore_test: "backup-restore-test CronJob 每週 dry-run 到 restore-test-dry namespace mapping。" + + - id: offsite_and_dr + owner_host: "188/121" + schedule: "188 backup-from-110 daily 01:00; 121 DR drill monthly" + max_age_hours: 25 + includes: + - "110 Harbor/Gitea/bitan rsync copy on 188" + - "DR drill evidence on 121" + - "Google Drive/rclone remote when credentials are configured" + restore_test: "121 DR drill dry-run finds latest Completed Velero backup; 188 backup-from-110 textfile fresh。" + +monitoring_contract: + textfile_metrics: + "110": "/home/wooo/node_exporter_textfiles/backup_health.prom" + "188": "/home/ollama/node_exporter_textfiles/backup_health.prom" + "120": "由 110 backup_health.prom 透過 120 kubectl 查詢 Velero/CronJob/Job 狀態" + offsite_and_escrow_metrics: + - "awoooi_backup_offsite_configured:只回報 Google Drive/rclone 或相容 provider 是否看起來已配置,不輸出 credential 值。" + - "awoooi_backup_offsite_fresh:由 /backup/offsite/*last_success 類 marker 判斷離機同步是否新鮮。" + - "awoooi_backup_offsite_partial_fresh:由小範圍 partial sync marker 判斷 Google Drive/rclone 寫入路徑是否已被證明。" + - "awoooi_backup_credential_escrow_fresh:由 /backup/escrow-evidence/*.last_verified 類 marker 判斷人工金庫覆核是否在 31 天內完成。" + - "awoooi_backup_dr_next_step_info:用 next_step label 告訴 AI 巡檢與 operator 下一個安全人工作業,不包含 secret。" + - 
"awoooi_backup_dr_credential_escrow_missing_count:金庫覆核尚缺的項目數。" + - "awoooi_backup_cron_active_duplicate_count:110 active crontab 中 exact duplicate entry 的數量。" + - "awoooi_backup_cron_singular_entry_ok:offsite/status/verifier/exporter 等單一入口排程是否剛好只有一條 active cron。" + - "awoooi_backup_config_capture_ok:最新 configs snapshot 是否實際捕捉 110/120/121/188 host config 與 K8s workloads/secrets,不輸出 secret。" + - "awoooi_backup_config_capture_critical_failed_count:最新設定檔備份缺少的 critical capture target 數量。" + prometheus_alerts: + - BackupHealthMonitorMissing110 + - BackupHealthMonitorMissing188 + - BackupHealthMonitorStale + - BackupExpectedJobMissing + - BackupScheduleDuplicateActiveEntries + - BackupScheduleSingletonMismatch + - BackupScriptMissing + - BackupJobStale + - BackupAggregateRunFailed + - BackupConfigCapturePartial + - BackupConfigCaptureStatusStale + - BackupIntegrityCheckMissingOrFailed + - BackupRestoreDrillMissingOrFailed + - BackupRestoreTestMissing + - BackupRestoreTestCronMissing + - BackupRestoreTestFailed + - BackupRestoreTestStale + - BackupOffsiteCopyNotConfigured + - BackupOffsiteCopyStale + - BackupCredentialEscrowEvidenceMissing + - BackupRetentionPolicyNotLatestOnly + - BackupSnapshotRetentionExceeded + - BackupOffsiteFullVerifyFailed + - BackupOffsiteRemoteSnapshotRetentionExceeded + live_visibility_checks: + - "如果 awoooi_backup_offsite_configured{host=\"110\"} 為 0,Prometheus 必須有 BackupOffsiteCopyNotConfigured firing,Alertmanager 必須有 active alert。" + - "如果 offsite provider 已配置、full marker 尚未 fresh,且 full sync enable marker 缺失或已超過 30 小時,Prometheus 與 Alertmanager 必須看得到 BackupOffsiteCopyStale。" + - "如果 awoooi_backup_credential_escrow_fresh{host=\"110\"} == 0,Prometheus 與 Alertmanager 必須依 item 看得到 BackupCredentialEscrowEvidenceMissing。" + - "如果 awoooi_backup_retention_latest_only{host=\"110\"} 或 awoooi_backup_retention_offsite_delete_old_enabled{host=\"110\",provider=\"rclone\"} 缺失/不為 1,Prometheus 與 Alertmanager 必須看得到 BackupRetentionPolicyNotLatestOnly。" + - "如果任一 
awoooi_backup_job_snapshot_count{host=\"110\",type=\"restic\"} > 1,Prometheus 與 Alertmanager 必須看得到 BackupSnapshotRetentionExceeded。" + - "如果 full offsite marker fresh 但 awoooi_backup_offsite_remote_verify_ok{host=\"110\",provider=\"rclone\"} 不為 1 或缺失,Prometheus 必須看得到 BackupOffsiteFullVerifyFailed。" + - "如果 full offsite marker fresh 且任一 awoooi_backup_offsite_remote_snapshot_count{host=\"110\",provider=\"rclone\"} > 1,Prometheus 必須看得到 BackupOffsiteRemoteSnapshotRetentionExceeded。" + - "如果 awoooi_backup_cron_active_duplicate_count{host=\"110\"} > 0,Prometheus 與 Alertmanager 必須看得到 BackupScheduleDuplicateActiveEntries。" + - "如果任一 awoooi_backup_cron_singular_entry_ok{host=\"110\"} == 0,Prometheus 與 Alertmanager 必須看得到 BackupScheduleSingletonMismatch。" + - "如果任一 awoooi_backup_config_capture_ok{host=\"110\",critical=\"true\"} == 0,Prometheus 與 Alertmanager 必須看得到 BackupConfigCapturePartial,且 target label 必須指出缺哪個設定來源。" + - "如果 awoooi_backup_config_capture_status_timestamp 缺失或超過 48 小時,Prometheus 與 Alertmanager 必須看得到 BackupConfigCaptureStatusStale。" + - "live visibility check 只讀 Prometheus / Alertmanager API,不送測試告警、不改 silence、不改 route、不觸發修復。" + prometheus_recording_rules: + - awoooi_recovery_core_ready + - awoooi_recovery_dr_offsite_ready + +release_gate: + cold_start_script: "bash scripts/reboot-recovery/full-stack-cold-start-check.sh --monitor-read-only --no-color" + p3_script: "bash scripts/reboot-recovery/p3-controlled-release-gate.sh" + recovery_core_scorecard: "bash scripts/reboot-recovery/full-stack-recovery-scorecard.sh --require-core" + dr_offsite_operator_checklist: "bash scripts/reboot-recovery/dr-offsite-operator-checklist.sh" + dr_offsite_scorecard: "bash scripts/reboot-recovery/full-stack-recovery-scorecard.sh --require-dr" + dr_offsite_final_gate: "bash scripts/reboot-recovery/dr-offsite-operator-checklist.sh --require-dr" + dr_offsite_post_marker_wait: "bash scripts/reboot-recovery/wait-dr-offsite-ready.sh --timeout-seconds 900 --interval-seconds 30 --no-color" 
+ required_green: + - "backup_health.prom fresh on 110/188" + - "awoooi_backup_job_fresh == 1 for every expected job" + - "Velero latest Completed backup < 25h" + - "backup-restore-test CronJob present and lastSuccessfulTime not stale" + - "weekly restic check successful" + - "monthly sample restore drill successful" + warning_until_human_escrow_ready: + - "offsite provider configured and latest offsite copy marker fresh" + - "credential escrow marker files refreshed after human verification; marker files must contain only timestamp/evidence id, never secret values" + strict_dr_exit_conditions: + - "Google Drive/rclone provider configured on 110 host-local rclone.conf; /backup/scripts/offsite.env keeps only non-secret remote/path with mode 0600" + - "credential escrow markers fresh for restic_repository_password, offsite_provider_credentials, break_glass_admin_credentials, dns_registrar_recovery, oauth_ai_provider_recovery" + - "full offsite marker /backup/offsite/rclone-last-success fresh after full 13 repo sync" + - "full-stack-recovery-scorecard.sh --require-dr exits 0" + - "recovery-scorecard-contract-check.py --expect-dr-ready exits 0 against 110 Prometheus" + - "dr-offsite-operator-checklist.sh --require-dr exits 0 after scorecard, Prometheus recording rule, and backup alert visibility contract agree" + - "wait-dr-offsite-ready.sh exits 0 after post-marker textfile, Prometheus, Alertmanager, and final checklist convergence" diff --git a/ops/reboot-recovery/full-stack-cold-start-baseline.yml b/ops/reboot-recovery/full-stack-cold-start-baseline.yml index d83d53a1..0db7dc47 100644 --- a/ops/reboot-recovery/full-stack-cold-start-baseline.yml +++ b/ops/reboot-recovery/full-stack-cold-start-baseline.yml @@ -1,337 +1,204 @@ -# AWOOOI full-stack cold-start dependency baseline. -# This is the machine-readable companion to docs/runbooks/FULL-STACK-COLD-START-SOP.md. 
-# -# Intent: -# - document the reboot startup order and service dependency graph -# - define release gates for operators and AI automation -# - keep stateful services out of generic auto-restart loops - -version: "2026-05-06" -incident_reference: "2026-05-05 full-stack reboot recovery" +version: 2026-05-06.v1 scope: - managed_hosts: - "110": - address: "192.168.0.110" - ssh_user: "wooo" - roles: - - registry - - git - - observability - - sentry - - runners - "120": - address: "192.168.0.120" - ssh_user: "wooo" - roles: - - k3s_server - - keepalived_vip - - awoooi_nodeport - "121": - address: "192.168.0.121" - ssh_user: "wooo" - roles: - - k3s_node - - keepalived_peer - - dr_drill - "188": - address: "192.168.0.188" - ssh_user: "ollama" - roles: - - postgres_datastore - - redis - - momo - - signoz - - ai_proxy - intentionally_skipped: - "112": - role: "kali" - reason: "scanner host is not required for production cold-start release" + included_hosts: + "110": "DevOps, registry, observability, Sentry, runners" + "120": "K3s control plane and VIP" + "121": "K3s peer node and DR drill cron" + "188": "Data, AI, web, momo, SignOz, public nginx gateway" + excluded_hosts: + "112": "Kali security host; recorded but not part of cold-start release gate" -global_policy: - startup_rule: "Recover the dependency chain before releasing high-load work." - runner_cd_rule: "Release runners and CD only after data, registry, K3s, workload, routes, schedules, and alert E2E gates are green." - ai_auto_repair_rule: "Observe-only until all green gates pass and host load stays below baseline." - destructive_state_rule: "No DROP, data directory deletion, volume recreation, pg_resetwal, fsck, or backup restore without explicit human approval." - no_generic_restart_rule: "Never run generic docker restart against all containers during cold start." 
+principles: + - recover_dependency_chain_before_workloads + - keep_ai_auto_repair_observe_only_until_green + - never_generic_restart_stateful_services + - preserve_corrupt_parts_in_quarantine_not_delete + - release_runners_and_crawlers_last phases: - - id: "P0-NETWORK" + - id: P0-NETWORK order: 0 - start_after: [] - owns: - - "LAN reachability" - - "SSH reachability" - - "ARP evidence" gates: - - "ping 192.168.0.110/120/121/188 succeeds" - - "TCP 22 open on 192.168.0.110/120/121/188" - - "reboot evidence captured before repair" - blocks: - - "all other phases" + - ping_110_120_121_188 + - ssh_port_110_120_121_188 + - arp_evidence_or_monitor_mode_fallback - - id: "P0-188-DATA" - order: 1 - start_after: - - "P0-NETWORK" - host: "188" - service_order: - - "containerd" - - "docker" - - "postgresql@14-main" - - "k3s_datastore.kine maintenance" - - "redis-server" - - "ollama or current AI proxy dependencies" - - "nginx" - - "Docker networks" - - "MinIO / OpenClaw / SignOz" - - "momo / litellm / batch services" + - id: P0-188-DATA + order: 10 + required_before: + - P1-K3S + - P2-WORKLOAD-ALERTCHAIN gates: - - "PostgreSQL port 5432 open" - - "pg_isready reports accepting connections" - - "Redis replies PONG" - - "momo health endpoint returns 200" - - "SignOz HTTP route is reachable" - blocks: - - "120/121 K3s" - - "AWOOOI API database access" - - "Alertmanager webhook" - - "momo public site" + - containerd_docker_postgresql_redis_ollama_nginx_active + - postgresql_5432_accepting_connections + - redis_pong + - momo_db_not_restarting + - signoz_http_reachable + - momo_health_200 - - id: "P0-110-REGISTRY-OBSERVABILITY" - order: 2 - start_after: - - "P0-NETWORK" - - "P0-188-DATA" - host: "110" - service_order: - - "docker" - - "orphan Exited(128/137) cleanup if needed" - - "Harbor log" - - "Harbor registry stack" - - "Gitea" - - "Prometheus / Alertmanager / Grafana / exporters" - - "Langfuse" - - "SignOz or local observability companions" - - "Sentry DB layer" - - "Sentry web 
/ worker / consumer layer" - - "Gitea host runner and actions runners" + - id: P0-110-REGISTRY-OBSERVABILITY + order: 20 + required_before: + - P1-K3S + - P3-RUNNER-CD gates: - - "Harbor /v2/ returns 200 or 401" - - "Gitea returns 200 or 302" - - "Prometheus /-/ready returns 200" - - "Alertmanager /-/healthy returns 200" - - "Sentry HTTP returns 200, 302, or 400" - - "runner CPUQuota=200%, MemoryMax=2G, WatchdogUSec=0" - blocks: - - "K3s image pulls" - - "runtime CD" - - "alert rules deploy" - - "code-review runners" + - docker_active + - harbor_v2_200_or_401 + - gitea_200_or_302 + - prometheus_ready + - alertmanager_healthy + - sentry_http_reachable + - docker_containers_all_up + - runner_watchdog_disabled + - sentry_clickhouse_not_restarting + - cadvisor_image_v0_47_0 + - cadvisor_cpu_cap_0_3 - - id: "P1-K3S" - order: 3 - start_after: - - "P0-188-DATA" - - "P0-110-REGISTRY-OBSERVABILITY" - hosts: - - "120" - - "121" - service_order: - - "120 k3s.service" - - "121 k3s-agent.service or live role" - - "CNI / kube-proxy" - - "nodes Ready" - - "core pods" - - "awoooi-prod pods" - - "keepalived VIP 192.168.0.125" - - "NodePorts 32334 and 32335" + - id: P1-K3S + order: 30 gates: - - "120 can reach 188:5432" - - "K3s nodes show Ready" - - "VIP 192.168.0.125 is present" - - "awoooi-prod pods are Running or Completed" - blocks: - - "AWOOOI workload health" - - "public AWOOOI route" - - "Alertmanager webhook" + - 120_can_reach_188_postgres + - mon_and_mon1_ready + - no_non_running_non_succeeded_pods + - awoooi_dev_api_nodeport_200 + - vip_192_168_0_125_present - - id: "P2-WORKLOAD-ALERTCHAIN" - order: 4 - start_after: - - "P1-K3S" - owners: - - "AWOOOI API" - - "AWOOOI Web" - - "Alertmanager webhook" - - "Telegram delivery" + - id: P2-WORKLOAD-ALERTCHAIN + order: 40 gates: - - "http://192.168.0.125:32334/api/v1/health returns 2xx/3xx" - - "http://192.168.0.125:32335/ returns 2xx/3xx" - - "Alertmanager webhook POST returns 2xx" - - "K8s Telegram secrets are present and 
non-placeholder" - blocks: - - "AI auto-remediation" - - "full alert confidence" + - awoooi_api_vip_health_2xx_or_3xx + - awoooi_web_vip_2xx_or_3xx + - alertmanager_webhook_e2e_2xx_when_release_gate - - id: "P2-PUBLIC-ROUTES" - order: 5 - start_after: - - "P2-WORKLOAD-ALERTCHAIN" + - id: P2-PUBLIC-ROUTES + order: 50 + public_https_routes: + - https://awoooi.wooo.work/api/v1/health + - https://awoooi.wooo.work/ + - https://mo.wooo.work/ + - https://mo.wooo.work/health + - https://gitea.wooo.work/ + - https://harbor.wooo.work/ + - https://registry.wooo.work/ + - https://sentry.wooo.work/ + - https://signoz.wooo.work/ + - https://stock.wooo.work/ + - https://langfuse.wooo.work/ + - https://bitan.wooo.work/ + - https://aiops.wooo.work/ + + - id: P2-SCHEDULES + order: 60 gates: - - "https://awoooi.wooo.work/api/v1/health returns 2xx/3xx" - - "https://awoooi.wooo.work/ returns 2xx/3xx" - - "https://mo.wooo.work/ returns 2xx/3xx" - - "https://mo.wooo.work/health returns 2xx/3xx" - blocks: - - "external release complete" + - cron_active_188_110_120_121 + - docker_restart_textfile_fresh_188 + - docker_stats_textfile_fresh_188_110 + - systemd_units_textfile_fresh_110 + - backup_health_textfile_fresh_188_110 + - backup_from_110_success_under_25h + - expected_backup_jobs_fresh_188_110 + - host_service_config_backup_success_under_48h + - sentry_dedicated_backup_success_under_48h + - backup_integrity_check_success_under_8d + - backup_restore_drill_success_under_31d + - velero_schedule_present_and_latest_completed_under_25h + - velero_restore_test_cron_present + - momo_scheduler_registered_jobs + - k8s_cronjobs_unsuspended + - k8s_failed_jobs_zero + - dr_drill_cron_present_121 - - id: "P2-SCHEDULES" - order: 6 - start_after: - - "P2-PUBLIC-ROUTES" - gates: - - "110/120/121/188 cron services active" - - "188 backup-from-110 success age below 25h" - - "188 docker restart/stats textfiles fresh" - - "188 momo-scheduler container healthy and registration evidence present within 6h" - 
- "110 docker/systemd textfiles fresh" - - "120 awoooi-prod CronJobs present and unsuspended" - - "120 awoooi-prod has no failed Jobs" - - "121 DR drill cron present" - blocks: - - "done criteria" - - "AI auto-remediation release" + - id: P3-HIGH-LOAD-WORK + order: 70 + release_after: + - P0-NETWORK + - P0-188-DATA + - P0-110-REGISTRY-OBSERVABILITY + - P1-K3S + - P2-WORKLOAD-ALERTCHAIN + - P2-PUBLIC-ROUTES + - P2-SCHEDULES + release_conditions: + - host_load_per_core_below_1_0_for_15m + - no_restart_storm + - clickhouse_merge_or_kafka_lag_not_increasing_two_checks + examples: + - sentry_snuba_consumers + - momo_scheduler_chrome_crawlers + - gitea_actions_jobs - - id: "P3-HIGH-LOAD-RELEASE" - order: 7 - start_after: - - "P2-SCHEDULES" - release_last: - - "momo-scheduler / Chrome crawlers" - - "Sentry Snuba consumers" - - "SignOz ClickHouse merge-heavy work" - - "Gitea actions runners" - - "runtime CD jobs" - gates: - - "all prior gates green" - - "host load per CPU below 1.0 for 15 minutes before releasing batch/runner work" - - "ClickHouse/Kafka/Snuba backlog decreasing for two consecutive checks if backlog exists" + - id: P3-RUNNER-CD + order: 80 + release_conditions: + - all_previous_gates_green + - runner_cpuquota_200_percent + - runner_memorymax_2g + - watchdogusec_0 + - active_awoooi_cd_or_gitea_actions_task_containers_cpu_capped_during_cold_start -baselines: - endpoints: - awoooi_vip_api_health: "http://192.168.0.125:32334/api/v1/health" - awoooi_vip_web: "http://192.168.0.125:32335/" - awoooi_public_api_health: "https://awoooi.wooo.work/api/v1/health" - awoooi_public_web: "https://awoooi.wooo.work/" - momo_public_web: "https://mo.wooo.work/" - momo_public_health: "https://mo.wooo.work/health" - harbor_registry: "http://127.0.0.1:5000/v2/" - gitea: "http://127.0.0.1:3001/" - prometheus_ready: "http://127.0.0.1:9090/-/ready" - alertmanager_healthy: "http://127.0.0.1:9093/-/healthy" - sentry: "http://127.0.0.1:9000/" - expected_codes: - harbor_registry: - - 200 
- - 401 - gitea: - - 200 - - 302 - prometheus_ready: - - 200 - alertmanager_healthy: - - 200 - sentry: - - 200 - - 302 - - 400 - workload_and_public: - - "2xx" - - "3xx" - runner_guardrails: - CPUQuotaPerSecUSec: "2s" - MemoryMax: "2147483648" - WatchdogUSec: "0" - freshness_seconds: - docker_textfiles: 300 - systemd_textfiles: 300 - backup_success: 90000 +automation_policy: + before_green: + ai_auto_repair: observe_only + alertmanager_smoke_test: manual_or_release_gate_only + stateful_service_actions: human_approval_required + generic_restart: forbidden + after_green: + ai_auto_repair: limited_execution_for_stateless_exporters_only + stateful_service_actions: human_in_the_loop + runner_cd: controlled_release -stateful_services: - hard_block_auto_repair: - - "188 PostgreSQL data directory" - - "188 k3s_datastore" - - "188 momo database" - - "110 Harbor DB" - - "110 Sentry DB" - - "Sentry ClickHouse data" - - "SignOz ClickHouse data" - - "Kafka topic/log directories" - human_in_loop_required: - - "pg_resetwal" - - "ClickHouse clean-clone recovery" - - "Kafka checkpoint file quarantine" - - "backup restore" - - "filesystem repair" +resource_guardrails: + "110": + cadvisor: + image: gcr.io/cadvisor/cadvisor:v0.47.0 + cpus: 0.3 + mem_limit: 512m + sentry_snuba_cold_start_consumers: + cpus: 0.5 + persist_in: /opt/sentry/docker-compose.override.yml + sentry_self_hosted_memory_limits: + taskscheduler_mem_limit: 1g + relay_mem_limit: 2g + persist_in: /opt/sentry/docker-compose.override.yml + note: "taskscheduler/relay 不得回退到 512m/1g 造成長期 >85% memory-limit pressure;110 主機仍以 ClickHouse/Kafka/Snuba CPU caps 防止冷啟動過載。" + actions_runner_systemd: + cpu_quota: 200% + memory_max: 2G + watchdog: disabled + "188": + ollama_systemd: + cpu_quota: 300% + memory_high: 20G + memory_max: 24G + max_loaded_models: 1 + num_parallel: 1 + note: "188 本機 Ollama 是 cold-start 依賴與 Open-WebUI local endpoint;不得維持 disabled/inactive,也不得保留 700%/45G 無節制 guardrail。" + litellm: + cpus: 1.0 + memory: 1G + 
mode: stateless + momo_scheduler: + cpus: 2.0 + memory: 2G + signoz_clickhouse: + memory: 24G + note: do_not_lower_during_merge_backlog -ai_automation_gate: - observe_only_until: - - "P0-NETWORK green" - - "P0-188-DATA green" - - "P0-110-REGISTRY-OBSERVABILITY green" - - "P1-K3S green" - - "P2-WORKLOAD-ALERTCHAIN green" - - "P2-PUBLIC-ROUTES green" - - "P2-SCHEDULES green" - - "no active restart storm" - - "host load per CPU below 1.0 for 15 minutes" - allowed_before_green: - - "diagnose" - - "collect evidence" - - "notify" - blocked_before_green: - - "stateful restart" - - "destructive repair" - - "runner/CD release" - - "generic container restart" - -persistent_monitoring: - host: "110" - install_command: "bash scripts/reboot-recovery/install-cold-start-monitor-110.sh" - schedule: "*/10 * * * *" - mode: "read_only" - send_alert_test: false - scripts: - check: "/home/wooo/scripts/full-stack-cold-start-check.sh" - exporter: "/home/wooo/scripts/cold-start-textfile-exporter.sh" - outputs: - textfile: "/home/wooo/node_exporter_textfiles/cold_start_recovery.prom" - last_log: "/home/wooo/reboot-recovery/cold-start-last.log" - metrics: - - "awoooi_cold_start_monitor_up" - - "awoooi_cold_start_pass_gates" - - "awoooi_cold_start_warn_gates" - - "awoooi_cold_start_blocked_gates" - - "awoooi_cold_start_last_run_timestamp" - - "awoooi_cold_start_last_green_timestamp" - - "awoooi_cold_start_last_result" - prometheus_alerts: - - "ColdStartMonitorMissing" - - "ColdStartMonitorStale" - - "ColdStartRecoveryBlocked" - - "ColdStartRecoveryDegraded" - - "ColdStartLastGreenTooOld" - ai_contract: - monitor_missing: "diagnose cron/textfile path only" - stale: "collect cron log and last check log" - degraded: "collect evidence, do not release high-load work" - blocked: "follow first BLOCKED gate in phase order" - forbidden: "generic restart, stateful restart, destructive repair" - -final_confirmation: - command: "bash scripts/reboot-recovery/full-stack-cold-start-check.sh --watch 
--interval 60 --max-attempts 30 --send-alert-test" - green_result: - PASS: "greater than 0" - WARN: 0 - BLOCKED: 0 - summary: "Result: GREEN" +authoritative_checks: + read_only_monitor: + command: bash scripts/reboot-recovery/full-stack-cold-start-check.sh --monitor-read-only --no-color + expected_for_cron: PASS>0 WARN=0 BLOCKED=0 + release_gate: + command: SSH_BATCH_MODE=yes bash scripts/reboot-recovery/full-stack-cold-start-check.sh --send-alert-test + expected: PASS=64 WARN=0 BLOCKED=0 + textfile_metric: + path: /home/wooo/node_exporter_textfiles/cold_start_recovery.prom + green_metric: awoooi_cold_start_last_result{host="110",scope="110_120_121_188",result="green"} 1 + backup_baseline: + path: ops/reboot-recovery/full-stack-backup-baseline.yml + required_metrics: + - awoooi_backup_health_monitor_up + - awoooi_backup_job_fresh + - awoooi_backup_integrity_fresh + - awoooi_velero_restore_test_cron_present + - awoooi_velero_restore_test_last_success_fresh diff --git a/scripts/ops/backup-alert-label-contract-check.py b/scripts/ops/backup-alert-label-contract-check.py new file mode 100755 index 00000000..ccfbc818 --- /dev/null +++ b/scripts/ops/backup-alert-label-contract-check.py @@ -0,0 +1,260 @@ +#!/usr/bin/env python3 +""" +Validate the backup alert label contract. + +Node exporter textfile metrics use labels such as job="backup_all" locally, but +Prometheus rewrites that metric label to exported_job because the scrape target +already has job="node-exporter-110". Backup alerts must therefore use +$labels.exported_job in user-facing text and exported_job="..." in expressions. 
+""" + +from __future__ import annotations + +import argparse +import json +import sys +import urllib.parse +import urllib.request +from pathlib import Path +from typing import Any + +import yaml + + +DEFAULT_RULES = Path("ops/monitoring/alerts-unified.yml") +DEFAULT_BASELINE = Path("ops/reboot-recovery/full-stack-backup-baseline.yml") + + +class ContractError(RuntimeError): + pass + + +def _load_alerts(path: Path) -> dict[str, dict[str, Any]]: + data = yaml.safe_load(path.read_text(encoding="utf-8")) or {} + alerts: dict[str, dict[str, Any]] = {} + for group in data.get("groups") or []: + for rule in group.get("rules") or []: + name = rule.get("alert") + if name: + alerts[name] = rule + return alerts + + +def _annotation_text(rule: dict[str, Any]) -> str: + annotations = rule.get("annotations") or {} + return "\n".join(str(value) for value in annotations.values()) + + +def _require_alert(alerts: dict[str, dict[str, Any]], name: str) -> dict[str, Any]: + if name not in alerts: + raise ContractError(f"missing alert: {name}") + return alerts[name] + + +def _require_contains(value: str, expected: str, label: str) -> None: + if expected not in value: + raise ContractError(f"{label} must contain {expected!r}") + + +def _require_not_contains(value: str, forbidden: str, label: str) -> None: + if forbidden in value: + raise ContractError(f"{label} must not contain {forbidden!r}") + + +def _expected_backup_alerts(path: Path) -> list[str]: + data = yaml.safe_load(path.read_text(encoding="utf-8")) or {} + alerts = data.get("monitoring_contract", {}).get("prometheus_alerts") or [] + if not alerts: + raise ContractError(f"missing monitoring_contract.prometheus_alerts in {path}") + return [str(alert) for alert in alerts] + + +def static_check(path: Path, baseline_path: Path) -> list[str]: + alerts = _load_alerts(path) + lines: list[str] = [] + + missing = sorted(set(_expected_backup_alerts(baseline_path)) - set(alerts)) + if missing: + raise ContractError(f"alerts-unified.yml 
missing baseline backup alerts: {missing}") + lines.append("OK alerts-unified.yml contains every baseline backup alert") + + rule = _require_alert(alerts, "BackupExpectedJobMissing") + _require_contains(str(rule.get("expr", "")), "awoooi_backup_job_configured", "BackupExpectedJobMissing expr") + text = _annotation_text(rule) + _require_contains(text, "$labels.exported_job", "BackupExpectedJobMissing annotations") + _require_not_contains(text, "$labels.job", "BackupExpectedJobMissing annotations") + lines.append("OK BackupExpectedJobMissing uses exported_job label") + + rule = _require_alert(alerts, "BackupJobStale") + _require_contains(str(rule.get("expr", "")), "awoooi_backup_job_fresh", "BackupJobStale expr") + text = _annotation_text(rule) + _require_contains(text, "$labels.exported_job", "BackupJobStale annotations") + _require_not_contains(text, "$labels.job", "BackupJobStale annotations") + for required_label in ["$labels.max_age_hours", "$labels.source", "$labels.target"]: + _require_contains(text, required_label, "BackupJobStale annotations") + lines.append("OK BackupJobStale uses exported_job/source/target labels") + + rule = _require_alert(alerts, "BackupAggregateRunFailed") + _require_contains( + str(rule.get("expr", "")), + 'awoooi_backup_last_run_failed_count{host="110",exported_job="backup_all"}', + "BackupAggregateRunFailed expr", + ) + lines.append("OK BackupAggregateRunFailed filters exported_job=backup_all") + + rule = _require_alert(alerts, "BackupConfigCapturePartial") + _require_contains(str(rule.get("expr", "")), "awoooi_backup_config_capture_ok", "BackupConfigCapturePartial expr") + text = _annotation_text(rule) + for required_label in ["$labels.target", "$labels.source"]: + _require_contains(text, required_label, "BackupConfigCapturePartial annotations") + lines.append("OK BackupConfigCapturePartial uses target/source labels") + + rule = _require_alert(alerts, "BackupConfigCaptureStatusStale") + _require_contains( + str(rule.get("expr", 
"")), + "awoooi_backup_config_capture_status_timestamp", + "BackupConfigCaptureStatusStale expr", + ) + lines.append("OK BackupConfigCaptureStatusStale checks config capture status timestamp") + + rule = _require_alert(alerts, "BackupScriptMissing") + _require_contains(_annotation_text(rule), "$labels.script", "BackupScriptMissing annotations") + lines.append("OK BackupScriptMissing uses script label") + + rule = _require_alert(alerts, "BackupCredentialEscrowEvidenceMissing") + _require_contains(_annotation_text(rule), "$labels.item", "BackupCredentialEscrowEvidenceMissing annotations") + lines.append("OK BackupCredentialEscrowEvidenceMissing uses item label") + + return lines + + +def _prom_query(base_url: str, expr: str) -> list[dict[str, Any]]: + query = urllib.parse.urlencode({"query": expr}) + url = f"{base_url.rstrip('/')}/api/v1/query?{query}" + with urllib.request.urlopen(url, timeout=8) as response: + payload = json.loads(response.read().decode("utf-8")) + if payload.get("status") != "success": + raise ContractError(f"Prometheus query failed for {expr}: {payload}") + return payload.get("data", {}).get("result") or [] + + +def _prom_rules(base_url: str) -> list[dict[str, Any]]: + url = f"{base_url.rstrip('/')}/api/v1/rules" + with urllib.request.urlopen(url, timeout=8) as response: + payload = json.loads(response.read().decode("utf-8")) + if payload.get("status") != "success": + raise ContractError(f"Prometheus rules query failed: {payload}") + rules: list[dict[str, Any]] = [] + for group in payload.get("data", {}).get("groups") or []: + for rule in group.get("rules") or []: + name = rule.get("name") or rule.get("alert") + if not name: + continue + rules.append( + { + "name": str(name), + "health": str(rule.get("health", "")), + "state": str(rule.get("state", "")), + "group": str(group.get("name", "")), + } + ) + return rules + + +def _require_live_label(base_url: str, expr: str, labels: set[str]) -> str: + rows = _prom_query(base_url, expr) + if not rows: 
+ raise ContractError(f"Prometheus query returned no series: {expr}") + metric = rows[0].get("metric") or {} + missing = sorted(label for label in labels if label not in metric) + if missing: + raise ContractError(f"{expr} missing labels {missing}; labels={sorted(metric)}") + return f"OK live {expr} exposes labels {','.join(sorted(labels))}" + + +def _require_live_rules(base_url: str, expected_alerts: list[str]) -> list[str]: + rules = _prom_rules(base_url) + by_name = {rule["name"]: rule for rule in rules} + missing = sorted(set(expected_alerts) - set(by_name)) + if missing: + raise ContractError(f"Prometheus missing loaded backup alert rules: {missing}") + + unhealthy = [ + f"{rule['name']} health={rule['health']} group={rule['group']}" + for rule in by_name.values() + if rule["name"] in expected_alerts and rule["health"] not in {"", "ok"} + ] + if unhealthy: + raise ContractError(f"Prometheus backup alert rule health is not ok: {unhealthy}") + + state_counts: dict[str, int] = {} + for name in expected_alerts: + state = by_name[name]["state"] or "unknown" + state_counts[state] = state_counts.get(state, 0) + 1 + state_summary = ",".join(f"{key}={state_counts[key]}" for key in sorted(state_counts)) + return [ + f"OK live Prometheus loaded {len(expected_alerts)} baseline backup alert rules", + f"OK live Prometheus backup alert rule states {state_summary}", + ] + + +def live_check(base_url: str, baseline_path: Path) -> list[str]: + lines = [ + _require_live_label( + base_url, + 'awoooi_backup_job_configured{host="110"}', + {"exported_job", "host", "job"}, + ), + _require_live_label( + base_url, + 'awoooi_backup_job_fresh{host="110"}', + {"exported_job", "host", "job", "source", "target", "max_age_hours"}, + ), + _require_live_label( + base_url, + 'awoooi_backup_last_run_failed_count{host="110"}', + {"exported_job", "host", "job"}, + ), + _require_live_label( + base_url, + 'awoooi_backup_dr_next_step_info{host="110"}', + {"host", "next_step"}, + ), + 
_require_live_label( + base_url, + 'awoooi_backup_offsite_partial_fresh{host="110",provider="rclone"}', + {"host", "provider", "scope", "max_age_hours"}, + ), + _require_live_label( + base_url, + 'awoooi_backup_config_capture_ok{host="110"}', + {"host", "target", "source", "critical"}, + ), + ] + lines.extend(_require_live_rules(base_url, _expected_backup_alerts(baseline_path))) + return lines + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--rules", type=Path, default=DEFAULT_RULES) + parser.add_argument("--baseline", type=Path, default=DEFAULT_BASELINE) + parser.add_argument("--prometheus-url", default="") + args = parser.parse_args() + + try: + for line in static_check(args.rules, args.baseline): + print(line) + if args.prometheus_url: + for line in live_check(args.prometheus_url, args.baseline): + print(line) + except (ContractError, OSError, yaml.YAMLError, json.JSONDecodeError) as exc: + print(f"BACKUP_ALERT_LABEL_CONTRACT_FAILED {exc}", file=sys.stderr) + return 1 + + print("BACKUP_ALERT_LABEL_CONTRACT_OK") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/ops/backup-alert-live-visibility-check.py b/scripts/ops/backup-alert-live-visibility-check.py new file mode 100755 index 00000000..7ec765ec --- /dev/null +++ b/scripts/ops/backup-alert-live-visibility-check.py @@ -0,0 +1,242 @@ +#!/usr/bin/env python3 +"""Verify live visibility for backup gap alerts. + +This read-only check closes the gap between "metrics exist" and "alerts are +actually visible". If the offsite or credential-escrow gap metrics are present, +the corresponding Prometheus firing alerts must be visible. When Alertmanager is +provided, those same alerts must also be active there. 
+""" + +from __future__ import annotations + +import argparse +import json +import sys +import time +import urllib.parse +import urllib.request +from dataclasses import dataclass +from typing import Any + + +class VisibilityError(RuntimeError): + pass + + +@dataclass(frozen=True) +class RequiredAlert: + name: str + labels: dict[str, str] + + +COMMON_LABELS = { + "host": "110", + "auto_repair": "false", + "alert_category": "infrastructure", + "notification_type": "TYPE-1", + "severity": "warning", +} + + +def _json_get(url: str, timeout: int) -> Any: + with urllib.request.urlopen(url, timeout=timeout) as response: + return json.loads(response.read().decode("utf-8")) + + +def _prom_query(base_url: str, expr: str, timeout: int) -> list[dict[str, Any]]: + query = urllib.parse.urlencode({"query": expr}) + url = f"{base_url.rstrip('/')}/api/v1/query?{query}" + payload = _json_get(url, timeout) + if payload.get("status") != "success": + raise VisibilityError(f"Prometheus query failed for {expr}: {payload}") + return payload.get("data", {}).get("result") or [] + + +def _prom_alerts(base_url: str, timeout: int) -> list[dict[str, Any]]: + url = f"{base_url.rstrip('/')}/api/v1/alerts" + payload = _json_get(url, timeout) + if payload.get("status") != "success": + raise VisibilityError(f"Prometheus alerts query failed: {payload}") + return payload.get("data", {}).get("alerts") or [] + + +def _alertmanager_alerts(base_url: str, timeout: int) -> list[dict[str, Any]]: + url = f"{base_url.rstrip('/')}/api/v2/alerts" + payload = _json_get(url, timeout) + if not isinstance(payload, list): + raise VisibilityError(f"Alertmanager alerts query returned unexpected payload: {payload}") + return payload + + +def _float_value(row: dict[str, Any], expr: str) -> float: + value = row.get("value") + if not isinstance(value, list) or len(value) < 2: + raise VisibilityError(f"Prometheus query returned unexpected value for {expr}: {row}") + try: + return float(value[1]) + except (TypeError, 
ValueError) as exc: + raise VisibilityError(f"Prometheus query returned non-numeric value for {expr}: {row}") from exc + + +def _metric_labels(row: dict[str, Any]) -> dict[str, str]: + metric = row.get("metric") or {} + return {str(key): str(value) for key, value in metric.items()} + + +def _labels_match(actual: dict[str, str], expected: dict[str, str]) -> bool: + return all(actual.get(key) == value for key, value in expected.items()) + + +def _find_prom_alert(alerts: list[dict[str, Any]], required: RequiredAlert) -> dict[str, Any] | None: + expected = {"alertname": required.name, **required.labels} + for alert in alerts: + if str(alert.get("state", "")) != "firing": + continue + labels = {str(key): str(value) for key, value in (alert.get("labels") or {}).items()} + if _labels_match(labels, expected): + return alert + return None + + +def _find_alertmanager_alert(alerts: list[dict[str, Any]], required: RequiredAlert) -> dict[str, Any] | None: + expected = {"alertname": required.name, **required.labels} + for alert in alerts: + status = alert.get("status") or {} + if str(status.get("state", "")) != "active": + continue + labels = {str(key): str(value) for key, value in (alert.get("labels") or {}).items()} + if _labels_match(labels, expected): + return alert + return None + + +def _require_prom_alert(alerts: list[dict[str, Any]], required: RequiredAlert) -> None: + if _find_prom_alert(alerts, required) is None: + raise VisibilityError( + f"missing Prometheus firing alert {required.name} with labels {required.labels}" + ) + + +def _require_alertmanager_alert(alerts: list[dict[str, Any]], required: RequiredAlert) -> None: + if _find_alertmanager_alert(alerts, required) is None: + raise VisibilityError( + f"missing Alertmanager active alert {required.name} with labels {required.labels}" + ) + + +def _sum_query_values(prometheus_url: str, expr: str, timeout: int) -> float: + return sum(_float_value(row, expr) for row in _prom_query(prometheus_url, expr, timeout)) + + 
+def _max_query_value(prometheus_url: str, expr: str, timeout: int) -> float: + rows = _prom_query(prometheus_url, expr, timeout) + if not rows: + return 0 + return max(_float_value(row, expr) for row in rows) + + +def _offsite_required_alerts(prometheus_url: str, host: str, timeout: int) -> tuple[list[RequiredAlert], str]: + expr = f'awoooi_backup_offsite_configured{{host="{host}"}}' + rows = _prom_query(prometheus_url, expr, timeout) + if not rows: + raise VisibilityError(f"Prometheus query returned no offsite configured series: {expr}") + configured_total = sum(_float_value(row, expr) for row in rows) + if configured_total == 0: + return ( + [RequiredAlert("BackupOffsiteCopyNotConfigured", {**COMMON_LABELS, "host": host})], + "OK offsite gap metric requires BackupOffsiteCopyNotConfigured visibility", + ) + + fresh_expr = f'awoooi_backup_offsite_fresh{{host="{host}"}}' + if _sum_query_values(prometheus_url, fresh_expr, timeout) > 0: + return [], "OK offsite full marker is fresh; no offsite gap alert required" + + enabled_expr = f'awoooi_backup_offsite_full_sync_enabled{{host="{host}"}}' + enabled_total = _sum_query_values(prometheus_url, enabled_expr, timeout) + if enabled_total > 0: + timestamp_expr = f'awoooi_backup_offsite_full_sync_enabled_timestamp{{host="{host}"}}' + enabled_timestamp = _max_query_value(prometheus_url, timestamp_expr, timeout) + enabled_age = int(time.time() - enabled_timestamp) if enabled_timestamp else 0 + if enabled_timestamp and enabled_age <= 30 * 3600: + return ( + [], + f"OK offsite full sync enabled within grace window; BackupOffsiteCopyStale not required yet age_seconds={enabled_age}", + ) + + return ( + [RequiredAlert("BackupOffsiteCopyStale", {**COMMON_LABELS, "host": host})], + "OK offsite full marker gap requires BackupOffsiteCopyStale visibility", + ) + + +def _escrow_required_alerts(prometheus_url: str, host: str, timeout: int) -> list[RequiredAlert]: + expr = f'awoooi_backup_credential_escrow_fresh{{host="{host}"}} == 0' + 
rows = _prom_query(prometheus_url, expr, timeout) + required: list[RequiredAlert] = [] + for row in rows: + labels = _metric_labels(row) + item = labels.get("item") + if not item: + raise VisibilityError(f"Credential escrow gap metric missing item label: {row}") + required.append( + RequiredAlert( + "BackupCredentialEscrowEvidenceMissing", + {**COMMON_LABELS, "host": host, "item": item}, + ) + ) + return sorted(required, key=lambda alert: alert.labels["item"]) + + +def live_check(prometheus_url: str, alertmanager_url: str, host: str, timeout: int) -> list[str]: + required_alerts: list[RequiredAlert] = [] + lines: list[str] = [] + + offsite_alerts, offsite_line = _offsite_required_alerts(prometheus_url, host, timeout) + required_alerts.extend(offsite_alerts) + lines.append(offsite_line) + + escrow_alerts = _escrow_required_alerts(prometheus_url, host, timeout) + required_alerts.extend(escrow_alerts) + if escrow_alerts: + escrow_items = ", ".join(alert.labels["item"] for alert in escrow_alerts) + lines.append( + f"OK credential escrow gap metrics require {len(escrow_alerts)} alert(s): {escrow_items}" + ) + else: + lines.append("OK credential escrow markers are fresh; no escrow gap alert required") + + prom_alerts = _prom_alerts(prometheus_url, timeout) + for required in required_alerts: + _require_prom_alert(prom_alerts, required) + lines.append(f"OK Prometheus exposes {len(required_alerts)} required backup gap firing alert(s)") + + if alertmanager_url: + am_alerts = _alertmanager_alerts(alertmanager_url, timeout) + for required in required_alerts: + _require_alertmanager_alert(am_alerts, required) + lines.append(f"OK Alertmanager exposes {len(required_alerts)} required backup gap active alert(s)") + + return lines + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--prometheus-url", required=True) + parser.add_argument("--alertmanager-url", default="") + parser.add_argument("--host", default="110") + 
parser.add_argument("--timeout", type=int, default=8) + args = parser.parse_args() + + try: + for line in live_check(args.prometheus_url, args.alertmanager_url, args.host, args.timeout): + print(line) + except (VisibilityError, OSError, json.JSONDecodeError) as exc: + print(f"BACKUP_ALERT_LIVE_VISIBILITY_FAILED {exc}", file=sys.stderr) + return 1 + + print("BACKUP_ALERT_LIVE_VISIBILITY_OK") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/ops/prometheus-rule-drift-guard.sh b/scripts/ops/prometheus-rule-drift-guard.sh index d83635dd..a8bf9b19 100755 --- a/scripts/ops/prometheus-rule-drift-guard.sh +++ b/scripts/ops/prometheus-rule-drift-guard.sh @@ -1,9 +1,9 @@ #!/usr/bin/env bash # Guard 110 Prometheus alert rules against stale deploys. # -# The canonical file is the source of truth. The guard restores active -# alerts.yml only when the active file differs from canonical or when -# Prometheus is missing rule names declared by canonical. +# This script is intentionally narrow: it only restores the canonical alert +# rules file when required recovery/backup rules disappear from live Prometheus +# or when the active file differs from the canonical copy. 
set -uo pipefail @@ -14,6 +14,14 @@ CANONICAL_RULES="${CANONICAL_RULES:-/home/wooo/monitoring/alerts-unified.canonic TEXTFILE="${TEXTFILE:-/home/wooo/node_exporter_textfiles/prometheus_rule_drift_guard.prom}" LOG_FILE="${LOG_FILE:-/home/wooo/logs/prometheus-rule-drift-guard.log}" +REQUIRED_RULES=( + "BackupCredentialEscrowEvidenceMissing" + "BackupExpectedJobMissing" + "awoooi_recovery_core_ready" + "awoooi_recovery_dr_offsite_ready" + "ColdStartRecoveryBlocked" +) + log() { mkdir -p "$(dirname "$LOG_FILE")" 2>/dev/null || true printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >>"$LOG_FILE" @@ -34,7 +42,7 @@ awoooi_prometheus_rule_drift_guard_last_run_timestamp{host="${HOST_LABEL}",statu # HELP awoooi_prometheus_rule_drift_guard_repaired Whether the guard restored canonical Prometheus rules on the last run. # TYPE awoooi_prometheus_rule_drift_guard_repaired gauge awoooi_prometheus_rule_drift_guard_repaired{host="${HOST_LABEL}"} ${repaired} -# HELP awoooi_prometheus_rule_drift_guard_missing_required_count Number of canonical live rules missing after the last check. +# HELP awoooi_prometheus_rule_drift_guard_missing_required_count Number of required live rules missing after the last check. # TYPE awoooi_prometheus_rule_drift_guard_missing_required_count gauge awoooi_prometheus_rule_drift_guard_missing_required_count{host="${HOST_LABEL}"} ${missing_count} # HELP awoooi_prometheus_rule_drift_guard_current_matches_canonical Whether active alerts.yml matches canonical copy. 
@@ -46,27 +54,13 @@ EOF } rules_missing_count() { - python3 - "$PROMETHEUS_URL" "$CANONICAL_RULES" <<'PY' + python3 - "$PROMETHEUS_URL" "${REQUIRED_RULES[@]}" <<'PY' import json -import re import sys import urllib.request base_url = sys.argv[1].rstrip("/") -canonical_path = sys.argv[2] - -name_pattern = re.compile(r"^\s*-\s*(?:alert|record):\s*['\"]?([^'\"#]+?)['\"]?\s*(?:#.*)?$") -required: set[str] = set() -try: - with open(canonical_path, encoding="utf-8") as handle: - for line in handle: - match = name_pattern.match(line) - if match: - required.add(match.group(1).strip()) -except Exception as exc: - print(f"CANONICAL_PARSE_FAILED:{exc}") - raise SystemExit(0) - +required = set(sys.argv[2:]) try: with urllib.request.urlopen(f"{base_url}/api/v1/rules", timeout=8) as response: payload = json.loads(response.read().decode("utf-8")) @@ -115,8 +109,8 @@ main() { before_matches="$(matches_canonical)" repaired=0 - if [[ "$missing" == QUERY_FAILED:* || "$missing" == CANONICAL_PARSE_FAILED:* ]]; then - log "Prometheus/canonical query failed: ${missing}" + if [[ "$missing" == QUERY_FAILED:* ]]; then + log "Prometheus query failed: ${missing}" write_textfile "query_failed" 0 999 "$before_matches" return 1 fi @@ -135,8 +129,8 @@ main() { after_missing="$(rules_missing_count)" after_matches="$(matches_canonical)" - if [[ "$after_missing" == QUERY_FAILED:* || "$after_missing" == CANONICAL_PARSE_FAILED:* ]]; then - log "post-restore Prometheus/canonical query failed: ${after_missing}" + if [[ "$after_missing" == QUERY_FAILED:* ]]; then + log "post-restore Prometheus query failed: ${after_missing}" write_textfile "post_query_failed" "$repaired" 999 "$after_matches" return 1 fi diff --git a/scripts/ops/recovery-scorecard-contract-check.py b/scripts/ops/recovery-scorecard-contract-check.py new file mode 100755 index 00000000..e4f59f3c --- /dev/null +++ b/scripts/ops/recovery-scorecard-contract-check.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 +"""Validate recovery scorecard 
recording-rule contract.""" + +from __future__ import annotations + +import argparse +import json +import sys +import urllib.parse +import urllib.request +from pathlib import Path +from typing import Any + +import yaml + + +DEFAULT_RULES = Path("ops/monitoring/alerts-unified.yml") +DEFAULT_BASELINE = Path("ops/reboot-recovery/full-stack-backup-baseline.yml") +EXPECTED_CORE = 'awoooi_recovery_core_ready{host="110",scope="110_120_121_188"}' +EXPECTED_DR = 'awoooi_recovery_dr_offsite_ready{host="110"}' + + +class ContractError(RuntimeError): + pass + + +def _rules(path: Path) -> list[dict[str, Any]]: + data = yaml.safe_load(path.read_text(encoding="utf-8")) or {} + rules: list[dict[str, Any]] = [] + for group in data.get("groups") or []: + rules.extend(group.get("rules") or []) + return rules + + +def _expected_recording_rules(path: Path) -> list[str]: + data = yaml.safe_load(path.read_text(encoding="utf-8")) or {} + rules = data.get("monitoring_contract", {}).get("prometheus_recording_rules") or [] + if not rules: + raise ContractError(f"missing monitoring_contract.prometheus_recording_rules in {path}") + return [str(rule) for rule in rules] + + +def static_check(rules_path: Path, baseline_path: Path) -> list[str]: + rules = _rules(rules_path) + by_record = {str(rule.get("record")): rule for rule in rules if rule.get("record")} + expected = _expected_recording_rules(baseline_path) + missing = sorted(set(expected) - set(by_record)) + if missing: + raise ContractError(f"alerts-unified.yml missing recovery recording rules: {missing}") + + core_expr = str(by_record["awoooi_recovery_core_ready"].get("expr", "")) + for required in [ + "awoooi_cold_start_last_result", + "awoooi_cold_start_warn_gates", + "awoooi_cold_start_blocked_gates", + "awoooi_cold_start_last_green_timestamp", + ]: + if required not in core_expr: + raise ContractError(f"awoooi_recovery_core_ready expr missing {required}") + + dr_expr = str(by_record["awoooi_recovery_dr_offsite_ready"].get("expr", "")) + 
for required in [ + "awoooi_backup_offsite_configured", + "awoooi_backup_offsite_fresh", + "awoooi_backup_credential_escrow_fresh", + ]: + if required not in dr_expr: + raise ContractError(f"awoooi_recovery_dr_offsite_ready expr missing {required}") + + return [ + "OK alerts-unified.yml contains every recovery scorecard recording rule", + "OK recovery core rule depends on cold-start green/warn/blocked/last-green metrics", + "OK recovery DR rule depends on provider-neutral offsite freshness and credential escrow freshness", + ] + + +def _prom_query(base_url: str, expr: str) -> list[dict[str, Any]]: + url = f"{base_url.rstrip('/')}/api/v1/query?" + urllib.parse.urlencode({"query": expr}) + with urllib.request.urlopen(url, timeout=8) as response: + payload = json.loads(response.read().decode("utf-8")) + if payload.get("status") != "success": + raise ContractError(f"Prometheus query failed for {expr}: {payload}") + return payload.get("data", {}).get("result") or [] + + +def _single_value(base_url: str, expr: str) -> float: + rows = _prom_query(base_url, expr) + if len(rows) != 1: + raise ContractError(f"Prometheus query expected one series for {expr}, got {len(rows)}") + value = rows[0].get("value") or [] + if len(value) < 2: + raise ContractError(f"Prometheus query returned malformed value for {expr}: {rows[0]}") + try: + number = float(value[1]) + except (TypeError, ValueError) as exc: + raise ContractError(f"Prometheus query returned non-numeric value for {expr}: {rows[0]}") from exc + if number not in {0.0, 1.0}: + raise ContractError(f"Prometheus recovery scorecard metric must be 0 or 1: {expr}={number}") + return number + + +def live_check( + base_url: str, + expect_core_ready: bool = False, + expect_dr_ready: bool = False, +) -> list[str]: + core = _single_value(base_url, EXPECTED_CORE) + dr = _single_value(base_url, EXPECTED_DR) + lines = [ + f"OK live {EXPECTED_CORE} value={int(core)}", + f"OK live {EXPECTED_DR} value={int(dr)}", + ] + if expect_core_ready and 
core != 1.0: + raise ContractError(f"expected core recovery ready, got {core}") + if expect_dr_ready and dr != 1.0: + raise ContractError(f"expected DR offsite ready, got {dr}") + return lines + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--rules", type=Path, default=DEFAULT_RULES) + parser.add_argument("--baseline", type=Path, default=DEFAULT_BASELINE) + parser.add_argument("--prometheus-url", default="") + parser.add_argument("--expect-core-ready", action="store_true") + parser.add_argument("--expect-dr-ready", action="store_true") + args = parser.parse_args() + + try: + for line in static_check(args.rules, args.baseline): + print(line) + if args.prometheus_url: + for line in live_check( + args.prometheus_url, + args.expect_core_ready, + args.expect_dr_ready, + ): + print(line) + except (ContractError, OSError, yaml.YAMLError, json.JSONDecodeError) as exc: + print(f"RECOVERY_SCORECARD_CONTRACT_FAILED {exc}", file=sys.stderr) + return 1 + + print("RECOVERY_SCORECARD_CONTRACT_OK") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/reboot-recovery/cold-start-textfile-exporter.sh b/scripts/reboot-recovery/cold-start-textfile-exporter.sh index 82262eea..2798f877 100755 --- a/scripts/reboot-recovery/cold-start-textfile-exporter.sh +++ b/scripts/reboot-recovery/cold-start-textfile-exporter.sh @@ -1,10 +1,8 @@ #!/usr/bin/env bash # Export AWOOOI full-stack cold-start gate status as node-exporter textfile metrics. # -# 2026-05-06 ogt + Codex: reboot recovery hardening. -# Intent: give Prometheus and the AI incident flow a durable, read-only signal -# for the 110/120/121/188 startup gates. This wrapper never sends the -# Alertmanager smoke event and never writes remote state. +# This wrapper is read-only: it never sends the Alertmanager smoke event and +# never mutates remote host/service state. 
set -uo pipefail @@ -13,6 +11,8 @@ TEXTFILE_DIR="${TEXTFILE_DIR:-${NODE_EXPORTER_TEXTFILE_DIR:-/home/wooo/node_expo OUTPUT_NAME="${OUTPUT_NAME:-cold_start_recovery.prom}" LOG_DIR="${LOG_DIR:-/home/wooo/reboot-recovery}" CHECK_TIMEOUT_SECONDS="${CHECK_TIMEOUT_SECONDS:-240}" +CHECK_WATCH_INTERVAL_SECONDS="${CHECK_WATCH_INTERVAL_SECONDS:-10}" +CHECK_WATCH_MAX_ATTEMPTS="${CHECK_WATCH_MAX_ATTEMPTS:-3}" HOST_LABEL="${AIOPS_HOST_LABEL:-110}" SCOPE_LABEL="${AIOPS_SCOPE_LABEL:-110_120_121_188}" LOCK_FILE="${LOCK_FILE:-/tmp/awoooi-cold-start-textfile-exporter.lock}" @@ -35,6 +35,10 @@ write_metric_file() { local blocked_state="${11}" local check_failed="${12}" local last_green="${13}" + local k3s_node_fs_blocker="${14}" + local public_route_tls_blocker="${15}" + local host_120_unreachable_blocker="${16}" + local backup_health_blocker="${17}" local host scope host=$(escape_label "$HOST_LABEL") scope=$(escape_label "$SCOPE_LABEL") @@ -70,10 +74,16 @@ awoooi_cold_start_last_result{host="$host",scope="$scope",result="green"} $green awoooi_cold_start_last_result{host="$host",scope="$scope",result="degraded"} $degraded awoooi_cold_start_last_result{host="$host",scope="$scope",result="blocked"} $blocked_state awoooi_cold_start_last_result{host="$host",scope="$scope",result="check_failed"} $check_failed +# HELP awoooi_cold_start_blocker_reason Whether a known cold-start blocker reason was detected in the last log. 
+# TYPE awoooi_cold_start_blocker_reason gauge +awoooi_cold_start_blocker_reason{host="$host",scope="$scope",reason="k3s_node_filesystem_error",target="120"} $k3s_node_fs_blocker +awoooi_cold_start_blocker_reason{host="$host",scope="$scope",reason="public_route_tls_failure",target="public_https"} $public_route_tls_blocker +awoooi_cold_start_blocker_reason{host="$host",scope="$scope",reason="host_unreachable",target="120"} $host_120_unreachable_blocker +awoooi_cold_start_blocker_reason{host="$host",scope="$scope",reason="backup_health_blocked",target="110"} $backup_health_blocker METRICS } -if [ -n "${BASH_VERSION:-}" ] && command -v flock >/dev/null 2>&1; then +if command -v flock >/dev/null 2>&1; then exec 9>"$LOCK_FILE" if ! flock -n 9; then exit 0 @@ -92,13 +102,19 @@ if [ ! -x "$CHECK_SCRIPT" ]; then tmp_metric=$(mktemp "$TEXTFILE_DIR/.cold_start_recovery.XXXXXX") last_green=$(cat "$state_file" 2>/dev/null || echo 0) printf 'CHECK_SCRIPT not executable: %s\n' "$CHECK_SCRIPT" >"$log_file" - write_metric_file "$tmp_metric" "$end_ts" "$((end_ts - start_ts))" 127 0 0 0 1 0 0 0 1 "$last_green" + write_metric_file "$tmp_metric" "$end_ts" "$((end_ts - start_ts))" 127 0 0 0 1 0 0 0 1 "$last_green" 0 0 0 0 chmod 0644 "$tmp_metric" mv "$tmp_metric" "$TEXTFILE_DIR/$OUTPUT_NAME" exit 0 fi -timeout "$CHECK_TIMEOUT_SECONDS" bash "$CHECK_SCRIPT" --monitor-read-only --no-color >"$log_tmp" 2>&1 +timeout "$CHECK_TIMEOUT_SECONDS" bash "$CHECK_SCRIPT" \ + --monitor-read-only \ + --no-color \ + --watch \ + --interval "$CHECK_WATCH_INTERVAL_SECONDS" \ + --max-attempts "$CHECK_WATCH_MAX_ATTEMPTS" \ + >"$log_tmp" 2>&1 exit_code=$? 
mv "$log_tmp" "$log_file" @@ -111,6 +127,10 @@ green=0 degraded=0 blocked_state=0 check_failed=0 +k3s_node_fs_blocker=0 +public_route_tls_blocker=0 +host_120_unreachable_blocker=0 +backup_health_blocker=0 if [ -n "$summary_line" ]; then monitor_up=1 @@ -130,6 +150,22 @@ else check_failed=1 fi +if grep -Eq 'NODE_FS_ERROR_EVENTS[[:space:]]+[1-9][0-9]*|K3s node filesystem error events present' "$log_file"; then + k3s_node_fs_blocker=1 +fi + +if grep -Eq 'PUBLIC_ROUTE_TLS .*(000|5[0-9][0-9])|public route .* TLS certificate verification failed' "$log_file"; then + public_route_tls_blocker=1 +fi + +if grep -Eq 'BLOCKED (ping 192\.168\.0\.120|ssh port 192\.168\.0\.120:22|ssh 120 k3s read-only check)' "$log_file"; then + host_120_unreachable_blocker=1 +fi + +if grep -Eq 'BLOCKED 110 backup health has stale expected jobs' "$log_file"; then + backup_health_blocker=1 +fi + end_ts=$(date +%s) if [ "$green" -eq 1 ]; then printf '%s\n' "$end_ts" >"$state_file" @@ -137,6 +173,6 @@ fi last_green=$(cat "$state_file" 2>/dev/null || echo 0) tmp_metric=$(mktemp "$TEXTFILE_DIR/.cold_start_recovery.XXXXXX") -write_metric_file "$tmp_metric" "$end_ts" "$((end_ts - start_ts))" "$exit_code" "$monitor_up" "$pass" "$warn" "$blocked" "$green" "$degraded" "$blocked_state" "$check_failed" "$last_green" +write_metric_file "$tmp_metric" "$end_ts" "$((end_ts - start_ts))" "$exit_code" "$monitor_up" "$pass" "$warn" "$blocked" "$green" "$degraded" "$blocked_state" "$check_failed" "$last_green" "$k3s_node_fs_blocker" "$public_route_tls_blocker" "$host_120_unreachable_blocker" "$backup_health_blocker" chmod 0644 "$tmp_metric" mv "$tmp_metric" "$TEXTFILE_DIR/$OUTPUT_NAME" diff --git a/scripts/reboot-recovery/full-stack-cold-start-check.sh b/scripts/reboot-recovery/full-stack-cold-start-check.sh index a032104a..cca8e0b8 100755 --- a/scripts/reboot-recovery/full-stack-cold-start-check.sh +++ b/scripts/reboot-recovery/full-stack-cold-start-check.sh @@ -7,6 +7,7 @@ set -uo pipefail SSH_OPTS=(-o 
BatchMode=yes -o ConnectTimeout=6) SEND_ALERT_TEST=0 MONITOR_READ_ONLY=0 +NO_COLOR_FLAG=0 WATCH_MODE=0 WATCH_INTERVAL=60 WATCH_MAX_ATTEMPTS=30 @@ -30,15 +31,17 @@ USAGE } while [ "$#" -gt 0 ]; do - case "$1" in + arg="$1" + case "$arg" in --send-alert-test) SEND_ALERT_TEST=1 ;; --monitor-read-only) MONITOR_READ_ONLY=1 + SEND_ALERT_TEST=0 ;; --no-color) - NO_COLOR=1 + NO_COLOR_FLAG=1 ;; --watch) WATCH_MODE=1 @@ -64,7 +67,7 @@ while [ "$#" -gt 0 ]; do exit 0 ;; *) - echo "Unknown argument: $1" >&2 + echo "Unknown argument: $arg" >&2 usage >&2 exit 64 ;; @@ -72,7 +75,7 @@ while [ "$#" -gt 0 ]; do shift done -if [ -n "${NO_COLOR:-}" ]; then +if [ -n "${NO_COLOR:-}" ] || [ "$NO_COLOR_FLAG" -eq 1 ]; then RED="" GREEN="" YELLOW="" @@ -90,12 +93,6 @@ PASS=0 WARN=0 FAIL=0 -reset_counters() { - PASS=0 - WARN=0 - FAIL=0 -} - log_section() { printf "\n%s=== %s ===%s\n" "$BLUE" "$1" "$NC" } @@ -198,6 +195,18 @@ probe_tcp() { nc -G 3 -z "$host" "$port" >/dev/null 2>&1 || nc -w 3 -z "$host" "$port" >/dev/null 2>&1 } +print_neighbor_rows() { + if command -v arp >/dev/null 2>&1; then + arp -an | grep -E '192\.168\.0\.(110|120|121|188)' + return $? + fi + if command -v ip >/dev/null 2>&1; then + ip neigh show | grep -E '192\.168\.0\.(110|120|121|188)' + return $? 
+ fi + return 1 +} + print_header() { echo "AWOOOI full-stack cold-start check" date '+%Y-%m-%d %H:%M:%S %Z' @@ -222,12 +231,12 @@ check_network() { fi done - if arp -an | grep -E '192\.168\.0\.(110|120|121|188)'; then - ok "ARP evidence printed" + if print_neighbor_rows; then + ok "neighbor evidence printed" elif [ "$MONITOR_READ_ONLY" -eq 1 ]; then - ok "ARP evidence unavailable in monitor mode; ping and TCP gates passed" + ok "neighbor evidence unavailable in monitor mode; ping and TCP gates provide primary signal" else - warn "no ARP rows printed for one or more hosts" + warn "no neighbor rows printed for one or more hosts" fi } @@ -370,21 +379,34 @@ WEB_CODE $web_code" check_public_routes() { log_section "P2-PUBLIC-ROUTES" - local awoooi_api_code awoooi_web_code momo_code momo_health_code - awoooi_api_code=$(probe_http_code "https://awoooi.wooo.work/api/v1/health") - awoooi_web_code=$(probe_http_code "https://awoooi.wooo.work/") - momo_code=$(probe_http_code "https://mo.wooo.work/") - momo_health_code=$(probe_http_code "https://mo.wooo.work/health") + local item name url code tls_code + local routes=( + "awoooi_api|https://awoooi.wooo.work/api/v1/health" + "awoooi_web|https://awoooi.wooo.work/" + "momo_web|https://mo.wooo.work/" + "momo_health|https://mo.wooo.work/health" + "gitea|https://gitea.wooo.work/" + "harbor|https://harbor.wooo.work/" + "registry|https://registry.wooo.work/" + "sentry|https://sentry.wooo.work/" + "signoz|https://signoz.wooo.work/" + "stock|https://stock.wooo.work/" + "langfuse|https://langfuse.wooo.work/" + "bitan|https://bitan.wooo.work/" + "aiops|https://aiops.wooo.work/" + ) - echo "AWOOOI_PUBLIC_API_CODE $awoooi_api_code" - echo "AWOOOI_PUBLIC_WEB_CODE $awoooi_web_code" - echo "MOMO_PUBLIC_CODE $momo_code" - echo "MOMO_PUBLIC_HEALTH_CODE $momo_health_code" - - [[ "$awoooi_api_code" =~ ^[23] ]] && ok "AWOOOI public API reachable" || warn "AWOOOI public API not confirmed" - [[ "$awoooi_web_code" =~ ^[23] ]] && ok "AWOOOI public web 
reachable" || warn "AWOOOI public web not confirmed" - [[ "$momo_code" =~ ^[23] ]] && ok "momo public route reachable" || warn "momo public route not confirmed" - [[ "$momo_health_code" =~ ^[23] ]] && ok "momo public health reachable" || warn "momo public health not confirmed" + for item in "${routes[@]}"; do + name="${item%%|*}" + url="${item#*|}" + code=$(probe_http_code "$url") + echo "PUBLIC_ROUTE $name $code $url" + [[ "$code" =~ ^[23] ]] && ok "public route $name reachable" || warn "public route $name not confirmed" + tls_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 8 "$url" 2>/dev/null || true) + tls_code="${tls_code:-000}" + echo "PUBLIC_ROUTE_TLS $name $tls_code $url" + [[ "$tls_code" =~ ^[23] ]] && ok "public route $name TLS certificate verified" || fail "public route $name TLS certificate verification failed" + done } check_schedules() { @@ -394,7 +416,7 @@ check_schedules() { if out=$(host_cmd "ollama@192.168.0.188" ' now=$(date +%s) echo "CRON_188 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)" -for f in /home/ollama/node_exporter_textfiles/backup.prom /home/ollama/node_exporter_textfiles/docker_restart_count.prom /home/ollama/node_exporter_textfiles/docker_stats.prom; do +for f in /home/ollama/node_exporter_textfiles/backup.prom /home/ollama/node_exporter_textfiles/backup_health.prom /home/ollama/node_exporter_textfiles/docker_restart_count.prom /home/ollama/node_exporter_textfiles/docker_stats.prom /home/ollama/node_exporter_textfiles/storage_health.prom; do if [ -f "$f" ]; then mt=$(stat -c %Y "$f") echo "TEXTFILE_188 $(basename "$f") age=$((now - mt))" @@ -405,17 +427,37 @@ done if [ -f /home/ollama/node_exporter_textfiles/backup.prom ]; then awk -v now="$now" "/^backup_110_last_success_timestamp / {printf \"BACKUP_110_AGE %d\\n\", now - int(\$2)}" /home/ollama/node_exporter_textfiles/backup.prom fi -echo "SCHEDULER_STATE $(docker inspect -f "{{.State.Status}} {{if 
.State.Health}}{{.State.Health.Status}}{{end}}" momo-scheduler 2>/dev/null || true)" -echo "SCHEDULER_REGISTERED $(docker logs --since 6h momo-scheduler 2>&1 | grep -c "全部排程任務已註冊" || true)" +if [ -f /home/ollama/node_exporter_textfiles/backup_health.prom ]; then + awk "/^awoooi_backup_job_fresh/ {total++; if (int(\$2) == 0) stale++} /^awoooi_backup_job_configured/ {if (int(\$2) == 0) missing_cron++} /^awoooi_backup_script_present/ {if (int(\$2) == 0) missing_script++} END {printf \"BACKUP_HEALTH_188 total=%d stale=%d missing_cron=%d missing_script=%d\\n\", total+0, stale+0, missing_cron+0, missing_script+0}" /home/ollama/node_exporter_textfiles/backup_health.prom +fi +if [ -f /home/ollama/node_exporter_textfiles/storage_health.prom ]; then + awk "/^awoooi_host_storage_root_readonly/ {readonly=int(\$2)} /^awoooi_host_storage_current_boot_error_count/ {current=int(\$2)} END {printf \"STORAGE_HEALTH_188 root_readonly=%d current=%d\\n\", readonly+0, current+0}" /home/ollama/node_exporter_textfiles/storage_health.prom +fi +echo "SCHEDULER_CONTAINER_RUNNING $(docker inspect -f "{{.State.Running}}" momo-scheduler 2>/dev/null || true)" +echo "SCHEDULER_CONTAINER_HEALTH $(docker inspect -f "{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}" momo-scheduler 2>/dev/null || true)" +echo "SCHEDULER_REGISTERED $(docker logs --tail 200 momo-scheduler 2>&1 | grep -c "全部排程任務已註冊" || true)" +echo "SCHEDULER_RECENT_ACTIVITY $(docker logs --since 2h momo-scheduler 2>&1 | grep -Ec "AutoImport|Meta-Analysis|Scheduler" || true)" +momo_sync=$(docker exec momo-db sh -c "psql -U \"\$POSTGRES_USER\" -d \"\$POSTGRES_DB\" -Atc \"WITH scope AS (SELECT min(snapshot_date::date) dmin, max(snapshot_date::date) dmax, count(*) sc FROM daily_sales_snapshot WHERE snapshot_date::date >= make_date(extract(year from current_date)::int, extract(month from current_date)::int, 1)), monthly AS (SELECT count(*) mc, min(\\\"日期\\\"::date) mmin, max(\\\"日期\\\"::date) mmax FROM realtime_sales_monthly, 
scope WHERE scope.sc > 0 AND \\\"日期\\\"::date BETWEEN scope.dmin AND scope.dmax) SELECT coalesce(scope.sc,0)::text || chr(124) || coalesce(monthly.mc,0)::text || chr(124) || coalesce(scope.dmin::text,chr(45)) || chr(124) || coalesce(scope.dmax::text,chr(45)) || chr(124) || coalesce(monthly.mmin::text,chr(45)) || chr(124) || coalesce(monthly.mmax::text,chr(45)) FROM scope, monthly;\"" 2>/dev/null || true) +echo "MOMO_MONTHLY_SYNC ${momo_sync:-unavailable}" ' 2>&1); then echo "$out" grep -q "CRON_188 active" <<<"$out" && ok "188 cron active" || warn "188 cron not confirmed" awk '/TEXTFILE_188 backup.prom age=/ {split($3,a,"="); exit !(a[2] < 90000)}' <<<"$out" && ok "188 backup textfile fresh enough" || warn "188 backup textfile stale or missing" + awk '/TEXTFILE_188 backup_health.prom age=/ {split($3,a,"="); exit !(a[2] < 900)}' <<<"$out" && ok "188 backup health exporter fresh" || warn "188 backup health exporter stale" awk '/TEXTFILE_188 docker_restart_count.prom age=/ {split($3,a,"="); exit !(a[2] < 300)}' <<<"$out" && ok "188 docker restart exporter fresh" || warn "188 docker restart exporter stale" awk '/TEXTFILE_188 docker_stats.prom age=/ {split($3,a,"="); exit !(a[2] < 300)}' <<<"$out" && ok "188 docker stats exporter fresh" || warn "188 docker stats exporter stale" + awk '/TEXTFILE_188 storage_health.prom age=/ {split($3,a,"="); exit !(a[2] < 300)}' <<<"$out" && ok "188 storage health exporter fresh" || warn "188 storage health exporter stale" + grep -q "STORAGE_HEALTH_188 root_readonly=0 current=0" <<<"$out" && ok "188 current boot storage health clean" || warn "188 storage health not clean" awk '/BACKUP_110_AGE / {exit !($2 < 90000)}' <<<"$out" && ok "188 backup-from-110 success within 25h" || warn "188 backup-from-110 success not confirmed" - grep -q "SCHEDULER_STATE running healthy" <<<"$out" && ok "188 momo scheduler container healthy" || warn "188 momo scheduler health not confirmed" - awk '/SCHEDULER_REGISTERED / {exit !($2 > 0)}' <<<"$out" && ok 
"188 momo scheduler registered jobs within 6h" || warn "188 momo scheduler registration not confirmed within 6h" + grep -q "BACKUP_HEALTH_188 total=" <<<"$out" && awk '/BACKUP_HEALTH_188/ {split($3,a,"="); split($4,b,"="); split($5,c,"="); exit !((a[2]+b[2]+c[2]) == 0)}' <<<"$out" && ok "188 backup health has no stale expected jobs" || warn "188 backup health has stale expected jobs" + if grep -q "SCHEDULER_CONTAINER_HEALTH healthy" <<<"$out" && awk '/SCHEDULER_RECENT_ACTIVITY / {exit !($2 > 0)}' <<<"$out"; then + ok "188 momo scheduler healthy with recent task activity" + elif awk '/SCHEDULER_REGISTERED / {exit !($2 > 0)}' <<<"$out"; then + ok "188 momo scheduler registered jobs" + else + warn "188 momo scheduler registration/activity not confirmed" + fi + awk '/MOMO_MONTHLY_SYNC / {split($2,a,"|"); exit !(a[1] > 0 && a[1] == a[2] && a[3] == a[5] && a[4] == a[6])}' <<<"$out" && ok "188 momo current-month snapshot and realtime tables match" || warn "188 momo current-month snapshot/realtime sync not confirmed" else warn "188 schedule check unavailable" echo "$out" @@ -427,7 +469,7 @@ echo "CRON_110 $(systemctl is-active cron 2>/dev/null || systemctl is-active cro echo "FAILED_UNITS_110 $(systemctl --failed --no-legend --plain 2>/dev/null | wc -l)" echo "MOMO_STARTUP_ENABLED $(systemctl is-enabled momo-startup-complete.service 2>/dev/null || true)" echo "STAGGERED_STARTUP_ENABLED $(systemctl is-enabled wooo-staggered-startup.service 2>/dev/null || true)" -for f in /home/wooo/node_exporter_textfiles/docker_stats.prom /home/wooo/node_exporter_textfiles/systemd_units.prom; do +for f in /home/wooo/node_exporter_textfiles/docker_stats.prom /home/wooo/node_exporter_textfiles/systemd_units.prom /home/wooo/node_exporter_textfiles/storage_health.prom /home/wooo/node_exporter_textfiles/backup_health.prom; do if [ -f "$f" ]; then mt=$(stat -c %Y "$f") echo "TEXTFILE_110 $(basename "$f") age=$((now - mt))" @@ -435,6 +477,12 @@ for f in 
/home/wooo/node_exporter_textfiles/docker_stats.prom /home/wooo/node_ex echo "TEXTFILE_110 $(basename "$f") missing" fi done +if [ -f /home/wooo/node_exporter_textfiles/storage_health.prom ]; then + awk "/^awoooi_host_storage_root_readonly/ {readonly=int(\$2)} /^awoooi_host_storage_current_boot_error_count/ {current=int(\$2)} END {printf \"STORAGE_HEALTH_110 root_readonly=%d current=%d\\n\", readonly+0, current+0}" /home/wooo/node_exporter_textfiles/storage_health.prom +fi +if [ -f /home/wooo/node_exporter_textfiles/backup_health.prom ]; then + awk "/^awoooi_backup_job_fresh/ {total++; if (int(\$2) == 0) stale++} /^awoooi_backup_job_configured/ {if (int(\$2) == 0) missing_cron++} /^awoooi_backup_script_present/ {if (int(\$2) == 0) missing_script++} /^awoooi_backup_last_run_failed_count/ {if (\$0 ~ /(exported_job|job)=\"backup_all\"/) failed=int(\$2)} /^awoooi_backup_config_capture_critical_failed_count/ {config_failed=int(\$2)} /^awoooi_backup_integrity_fresh/ {integrity_total++; if (int(\$2) == 0) integrity_stale++} END {printf \"BACKUP_HEALTH_110 total=%d stale=%d missing_cron=%d missing_script=%d failed_count=%d config_failed=%d integrity_total=%d integrity_stale=%d\\n\", total+0, stale+0, missing_cron+0, missing_script+0, failed+0, config_failed+0, integrity_total+0, integrity_stale+0}" /home/wooo/node_exporter_textfiles/backup_health.prom +fi ' 2>&1); then echo "$out" grep -q "CRON_110 active" <<<"$out" && ok "110 cron active" || warn "110 cron not confirmed" @@ -443,6 +491,11 @@ done grep -q "STAGGERED_STARTUP_ENABLED disabled" <<<"$out" && ok "110 stale staggered startup unit disabled" || warn "110 stale staggered startup unit not disabled" awk '/TEXTFILE_110 docker_stats.prom age=/ {split($3,a,"="); exit !(a[2] < 300)}' <<<"$out" && ok "110 docker stats exporter fresh" || warn "110 docker stats exporter stale" awk '/TEXTFILE_110 systemd_units.prom age=/ {split($3,a,"="); exit !(a[2] < 300)}' <<<"$out" && ok "110 systemd units exporter fresh" || warn "110 
systemd units exporter stale" + awk '/TEXTFILE_110 storage_health.prom age=/ {split($3,a,"="); exit !(a[2] < 300)}' <<<"$out" && ok "110 storage health exporter fresh" || warn "110 storage health exporter stale" + awk '/TEXTFILE_110 backup_health.prom age=/ {split($3,a,"="); exit !(a[2] < 900)}' <<<"$out" && ok "110 backup health exporter fresh" || warn "110 backup health exporter stale" + grep -q "STORAGE_HEALTH_110 root_readonly=0 current=0" <<<"$out" && ok "110 current boot storage health clean" || warn "110 storage health not clean" + grep -q "BACKUP_HEALTH_110 total=" <<<"$out" && awk '/BACKUP_HEALTH_110/ {split($3,a,"="); split($4,b,"="); split($5,c,"="); split($6,d,"="); split($7,e,"="); exit !((a[2]+b[2]+c[2]) == 0 && d[2] == 0 && e[2] == 0)}' <<<"$out" && ok "110 backup health has no stale expected jobs" || warn "110 latest aggregate/config backup had failed components; rerun backup-all after 120 recovers" + awk '/BACKUP_HEALTH_110/ {split($9,a,"="); exit !(a[2] == 0)}' <<<"$out" && ok "110 backup integrity and restore drill fresh" || warn "110 backup integrity or restore drill stale" else warn "110 schedule check unavailable" echo "$out" @@ -494,54 +547,41 @@ summary() { echo "PASS=$PASS WARN=$WARN BLOCKED=$FAIL" if [ "$FAIL" -gt 0 ]; then echo "Result: BLOCKED. Fix the first blocked gate before releasing runner/CD/AI auto-remediation." - return 2 + exit 2 fi if [ "$WARN" -gt 0 ]; then echo "Result: DEGRADED. Core gates passed but warnings remain." - return 1 + exit 1 fi echo "Result: GREEN. Full stack is ready for controlled runner/CD release." 
- return 0 -} - -run_once() { - reset_counters - print_header - check_network - check_188 - check_110 - check_k3s - check_workload_and_alertchain - check_public_routes - check_schedules - summary } if [ "$WATCH_MODE" -eq 1 ]; then attempt=1 - while :; do - if [ "$WATCH_MAX_ATTEMPTS" -eq 0 ]; then - printf "\nWatch attempt %s/unlimited\n" "$attempt" - else - printf "\nWatch attempt %s/%s\n" "$attempt" "$WATCH_MAX_ATTEMPTS" - fi - - run_once + rc=2 + while true; do + echo "WATCH_ATTEMPT=$attempt" + args=() + [ "$MONITOR_READ_ONLY" -eq 1 ] && args+=(--monitor-read-only) + [ "$NO_COLOR_FLAG" -eq 1 ] && args+=(--no-color) + [ "$SEND_ALERT_TEST" -eq 1 ] && args+=(--send-alert-test) + bash "$0" "${args[@]}" rc=$? - if [ "$rc" -eq 0 ]; then - exit 0 - fi - - if [ "$WATCH_MAX_ATTEMPTS" -ne 0 ] && [ "$attempt" -ge "$WATCH_MAX_ATTEMPTS" ]; then - echo "Watch stopped before GREEN. Last result code: $rc" + [ "$rc" -eq 0 ] && exit 0 + if [ "$WATCH_MAX_ATTEMPTS" -gt 0 ] && [ "$attempt" -ge "$WATCH_MAX_ATTEMPTS" ]; then exit "$rc" fi - - echo "Waiting ${WATCH_INTERVAL}s before the next cold-start gate check..." - sleep "$WATCH_INTERVAL" attempt=$((attempt + 1)) + sleep "$WATCH_INTERVAL" done fi -run_once -exit $? +print_header +check_network +check_188 +check_110 +check_k3s +check_workload_and_alertchain +check_public_routes +check_schedules +summary