diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 8936a689..0cdbc4c7 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -3282,7 +3282,7 @@ DATABASE_URL=postgresql+asyncpg://u:p@localhost:5432/test REDIS_URL=redis://loca |----------|--------| | `docs/runbooks/FULL-STACK-COLD-START-SOP.md` | 升級為 v1.1,補齊 Golden Startup Order、Mermaid 依賴圖、phase gate 邏輯、script-to-SOP 覆蓋表、next-reboot operator contract | | `ops/reboot-recovery/full-stack-cold-start-baseline.yml` | 新增機器可讀 baseline,固定 hosts、roles、啟動順序、endpoint code、schedule freshness、stateful-service 禁區、AI auto-remediation gate | -| `scripts/reboot-recovery/full-stack-cold-start-check.sh` | 新增 `--watch` / `--interval` / `--max-attempts`,可在重開機後反覆檢查直到 `GREEN` | +| `scripts/reboot-recovery/full-stack-cold-start-check.sh` | 新增 `--watch` / `--interval` / `--max-attempts`,可在重開機後反覆檢查直到 `GREEN`;momo-scheduler 改用 container health + 6h registration evidence,避免 `tail 200` 假陰性 | ### 標準下次重開機放行指令 @@ -3304,7 +3304,7 @@ ruby -e 'require "yaml"; YAML.load_file("ops/reboot-recovery/full-stack-cold-sta # YAML OK bash scripts/reboot-recovery/full-stack-cold-start-check.sh --watch --interval 1 --max-attempts 1 --send-alert-test -# PASS=50 WARN=0 BLOCKED=0 +# PASS=51 WARN=0 BLOCKED=0 # Result: GREEN. Full stack is ready for controlled runner/CD release. ``` diff --git a/docs/runbooks/FULL-STACK-COLD-START-SOP.md b/docs/runbooks/FULL-STACK-COLD-START-SOP.md index 0dbb612d..3e300ce5 100644 --- a/docs/runbooks/FULL-STACK-COLD-START-SOP.md +++ b/docs/runbooks/FULL-STACK-COLD-START-SOP.md @@ -369,7 +369,7 @@ Do not mark the reboot complete until scheduled work is proven runnable. A conta |--------------|----------------|------------------| | 188 cron | `systemctl is-active cron` and `crontab -l` | cron active; backup, restart exporter, stats exporter entries present | | 188 backup-from-110 | `backup_110_last_success_timestamp` in textfile/Prometheus | last success age `< 25h` | -| 188 momo-scheduler | `docker logs momo-scheduler` | `全部排程任務已註冊`; Google Drive auth works; dashboard URLs use container-reachable hostnames | +| 188 momo-scheduler | `docker inspect momo-scheduler` and `docker logs --since 6h momo-scheduler` | container `running healthy`; `全部排程任務已註冊`; Google Drive auth works; dashboard URLs use container-reachable hostnames | | 188 momo import | manual `run_auto_import_task()` after parser changes | selected sheet is `即時業績明細`; imported date range has matching rows in `daily_sales_snapshot` and `realtime_sales_monthly` | | 110 cron | `systemctl is-active cron` | cron active; Docker/systemd textfile exporters fresh | | 110 startup units | `systemctl --failed` | zero failed units; stale `momo-startup-complete` and `wooo-staggered-startup` disabled | diff --git a/ops/reboot-recovery/full-stack-cold-start-baseline.yml b/ops/reboot-recovery/full-stack-cold-start-baseline.yml index b28c0ed1..08cdf10d 100644 --- a/ops/reboot-recovery/full-stack-cold-start-baseline.yml +++ b/ops/reboot-recovery/full-stack-cold-start-baseline.yml @@ -193,6 +193,7 @@ phases: - "110/120/121/188 cron services active" - "188 backup-from-110 success age below 25h" - "188 docker restart/stats textfiles fresh" + - "188 momo-scheduler container healthy and registration evidence present within 6h" - "110 docker/systemd textfiles fresh" - "120 awoooi-prod CronJobs present and unsuspended" - "120 awoooi-prod has no failed Jobs" diff --git a/scripts/reboot-recovery/full-stack-cold-start-check.sh b/scripts/reboot-recovery/full-stack-cold-start-check.sh index d45df327..cf36910b 100755 --- a/scripts/reboot-recovery/full-stack-cold-start-check.sh +++ b/scripts/reboot-recovery/full-stack-cold-start-check.sh @@ -340,7 +340,8 @@ done if [ -f /home/ollama/node_exporter_textfiles/backup.prom ]; then awk -v now="$now" "/^backup_110_last_success_timestamp / {printf \"BACKUP_110_AGE %d\\n\", now - int(\$2)}" /home/ollama/node_exporter_textfiles/backup.prom fi -echo "SCHEDULER_REGISTERED $(docker logs --tail 200 momo-scheduler 2>&1 | grep -c "全部排程任務已註冊" || true)" +echo "SCHEDULER_STATE $(docker inspect -f "{{.State.Status}} {{if .State.Health}}{{.State.Health.Status}}{{end}}" momo-scheduler 2>/dev/null || true)" +echo "SCHEDULER_REGISTERED $(docker logs --since 6h momo-scheduler 2>&1 | grep -c "全部排程任務已註冊" || true)" ' 2>&1); then echo "$out" grep -q "CRON_188 active" <<<"$out" && ok "188 cron active" || warn "188 cron not confirmed" @@ -348,7 +349,8 @@ echo "SCHEDULER_REGISTERED $(docker logs --tail 200 momo-scheduler 2>&1 | grep - awk '/TEXTFILE_188 docker_restart_count.prom age=/ {split($3,a,"="); exit !(a[2] < 300)}' <<<"$out" && ok "188 docker restart exporter fresh" || warn "188 docker restart exporter stale" awk '/TEXTFILE_188 docker_stats.prom age=/ {split($3,a,"="); exit !(a[2] < 300)}' <<<"$out" && ok "188 docker stats exporter fresh" || warn "188 docker stats exporter stale" awk '/BACKUP_110_AGE / {exit !($2 < 90000)}' <<<"$out" && ok "188 backup-from-110 success within 25h" || warn "188 backup-from-110 success not confirmed" - awk '/SCHEDULER_REGISTERED / {exit !($2 > 0)}' <<<"$out" && ok "188 momo scheduler registered jobs" || warn "188 momo scheduler registration not confirmed" + grep -q "SCHEDULER_STATE running healthy" <<<"$out" && ok "188 momo scheduler container healthy" || warn "188 momo scheduler health not confirmed" + awk '/SCHEDULER_REGISTERED / {exit !($2 > 0)}' <<<"$out" && ok "188 momo scheduler registered jobs within 6h" || warn "188 momo scheduler registration not confirmed within 6h" else warn "188 schedule check unavailable" echo "$out"