From 81ff04d019ff720bf769ffc879d67289ce3dd1a7 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 1 Jul 2026 18:49:01 +0800 Subject: [PATCH] fix(recovery): point signoz route to live upstream --- docs/LOGBOOK.md | 22 +++++++++++++++++++ docs/runbooks/FULL-STACK-COLD-START-SOP.md | 2 +- .../nginx/templates/188-all-sites.conf.j2 | 2 +- .../188-internal-tools-https.conf.j2 | 2 +- .../full-stack-cold-start-check.sh | 4 ++-- 5 files changed, 27 insertions(+), 5 deletions(-) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 5755332e..5bc76e60 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,25 @@ +## 2026-07-01 — 18:47 SignOz public route source drift 修正 + +**照主線修正的問題**: +- cold-start hard blocker 已縮到 SignOz public 502 / TLS 與 188 MOMO daily sales stale;本輪先處理 SignOz。 +- live read-only probe 顯示 `https://signoz.wooo.work/` 回 Nginx `502`;`192.168.0.110:8080` 回 SignOz UI `200`;110 Docker 顯示 `signoz` container healthy 且 port `8080->8080`;188 沒有 SignOz container,188 `:8080` 只是 nginx welcome page。 +- live 188 Nginx 與 repo templates 都把 `signoz.wooo.work` proxy 到 `127.0.0.1:3301`,但該 upstream 不存在,根因是 public gateway upstream drift。 +- 已把 `infra/ansible/roles/nginx/templates/188-all-sites.conf.j2` 與 `188-internal-tools-https.conf.j2` 的 SignOz proxy 改為 `http://192.168.0.110:8080`。 +- `full-stack-cold-start-check.sh` 的 SignOz upstream probe 也改成 `http://192.168.0.110:8080/`,避免 source 已修但 cold-start 仍檢查不存在的 188 localhost 3301。 + +**驗證**: +- `curl http://192.168.0.110:8080/`:`200`。 +- `bash -n scripts/reboot-recovery/full-stack-cold-start-check.sh`:通過。 +- `bash scripts/reboot-recovery/reboot-recovery-readiness-audit.sh --no-color`:`PASS=199 WARN=1 BLOCKED=0`。 + +**未完成 / blocker**: +- live apply 未執行:188 `/etc/nginx/sites-enabled/*.conf` 是 root-owned,`ollama` 執行 `sudo -n nginx -t` 回 `sudo: a password is required`。 +- 未讀、未要求、未保存 sudo password;未 reload Nginx。因此 public `signoz.wooo.work` 仍需 privileged apply/readback 才能宣稱恢復。 + +**邊界**:未重啟主機,未 restart Docker / DB / K3s / firewall,未讀 secret / token / `.env` / raw sessions / SQLite / auth,未使用 GitHub / `gh` / GitHub API。 + +**下一步**:由具備 188 sudo / console 的 controlled Nginx apply path 套用 SignOz upstream diff,先 `nginx -t` 再 reload,最後重跑 public route TLS 與 cold-start。 + ## 2026-07-01 — 18:40 P0 cold-start readiness / 110 monitor parity 收斂 **照主線修正的問題**: diff --git a/docs/runbooks/FULL-STACK-COLD-START-SOP.md b/docs/runbooks/FULL-STACK-COLD-START-SOP.md index c224b7c1..7b351a9c 100644 --- a/docs/runbooks/FULL-STACK-COLD-START-SOP.md +++ b/docs/runbooks/FULL-STACK-COLD-START-SOP.md @@ -18,7 +18,7 @@ v1.79 active owner response template rule:同一輪 owner packet 產生後,p v1.80 / v1.81 credential escrow intake scorecard rule:同一輪 owner response preflight 後,必須用 `scripts/reboot-recovery/post-reboot-credential-escrow-intake-scorecard.py --summary-file "$ARTIFACT_DIR/summary.txt" --owner-packet-file --response-file --offsite-report-file --escrow-status-file ` 收斂 DR escrow gate。scorecard 只讀 sanitized artifacts;不得讀 secret value、不得寫 marker、不得送 owner request、不得開 runtime gate。placeholder readback 期望 `STATUS=blocked_waiting_non_secret_credential_escrow_evidence`、`EFFECTIVE_ESCROW_MISSING_COUNT=5`、`OWNER_RESPONSE_RECEIVED_COUNT=0`、`OWNER_RESPONSE_ACCEPTED_COUNT=0`、`RUNTIME_GATE_COUNT=0`、`CREDENTIAL_MARKER_WRITE_AUTHORIZED_COUNT=0`。若未來收到合格 redacted owner response 並由 preflight 回 `ready_for_independent_reviewer_acceptance`,scorecard 應轉為 `STATUS=ready_for_independent_reviewer_acceptance`;即使 marker 尚未寫入,也只能進 `independent_reviewer_acceptance_then_marker_dry_run`,不得直接寫 marker 或宣稱 `DR_COMPLETE`。 -2026-07-01 18:40 latest live summary:全主機重啟後仍不可宣稱 10 分鐘自動恢復,但主 blocker 已從 110/Harbor/Gitea source parity 轉為實際 runtime/data blocker。repo-side readiness 已修成 `PASS=199 WARN=1 BLOCKED=0`;`full-stack-cold-start-check.sh --monitor-read-only --no-color` 回 `PASS=86 WARN=8 BLOCKED=2`。110 P0 區段已讀回 Harbor / Gitea / Prometheus / Alertmanager / Sentry OK、legacy/direct runner fail-closed、controlled CD lane fail-closed、root restore source left `0`、storage clean、textfile exporters fresh;但 `wooo@192.168.0.110` command channel 曾在同輪 backup 前 timeout,bounded retest 又回 `SSH_COMMAND_OK`,因此要記為 intermittent 110 control-channel evidence,不得宣稱 110 SSH 永久穩定。110 live cold-start monitor 已用既有 installer 同步,rollback evidence 保留在 `/home/wooo/scripts/full-stack-cold-start-check.sh.before-p0-readiness-20260701-183215` 與 `/home/wooo/scripts/full-stack-cold-start-check.sh.before-hostkey-policy-20260701-183637`;live hash 為 `full-stack-cold-start-check.sh=e320c061f5afd31c2a682576218f549b683f25dafd43dd52acc13b6283b33712`、`cold-start-textfile-exporter.sh=c52ea4fe8dd58688a87c01ca6288f8f6050aeb82417852213db3e2be69b29568`。`verify-cold-start-monitor-deploy.sh` 現在只判斷 deploy parity:hash、host-key policy、monitor-up;runtime green 另由 cold-start / scorecard 判斷,避免 SignOz/MOMO 尚未恢復時把 source parity 誤報成 deploy mismatch。18:40 scorecard 讀回 `CORE_COLD_START_DEPLOY_PARITY=1`、`CORE_REGISTRY_READY=1`、`CORE_COLD_START_BLOCKED_GATES=2`、`CORE_COLD_START_FIRING_ALERTS=3`、`DR_OFFSITE_EVIDENCE_READBACK=1`、`ESCROW_MISSING_COUNT=5`、`RECOVERY_STATE=CORE_NOT_READY_DR_OFFSITE_PENDING`。目前 hard blockers 是 `signoz.wooo.work` public 502 / TLS failure,以及 `188 momo daily sales data stale beyond 3 days`;DR 仍缺 5 個 credential escrow non-secret evidence。不可宣稱:full-stack green、10 分鐘全服務恢復、MOMO daily data 最新、SignOz public route 正常、DR complete 或 110 SSH 永久穩定。下一步固定為 SignOz public route / TLS 修復與 MOMO source freshness readback;110 若再次 command timeout,走 local console / control-channel recovery package,不重啟主機、不恢復 generic runner。 +2026-07-01 18:40 latest live summary:全主機重啟後仍不可宣稱 10 分鐘自動恢復,但主 blocker 已從 110/Harbor/Gitea source parity 轉為實際 runtime/data blocker。repo-side readiness 已修成 `PASS=199 WARN=1 BLOCKED=0`;`full-stack-cold-start-check.sh --monitor-read-only --no-color` 回 `PASS=86 WARN=8 BLOCKED=2`。110 P0 區段已讀回 Harbor / Gitea / Prometheus / Alertmanager / Sentry OK、legacy/direct runner fail-closed、controlled CD lane fail-closed、root restore source left `0`、storage clean、textfile exporters fresh;但 `wooo@192.168.0.110` command channel 曾在同輪 backup 前 timeout,bounded retest 又回 `SSH_COMMAND_OK`,因此要記為 intermittent 110 control-channel evidence,不得宣稱 110 SSH 永久穩定。110 live cold-start monitor 已用既有 installer 同步,rollback evidence 保留在 `/home/wooo/scripts/full-stack-cold-start-check.sh.before-p0-readiness-20260701-183215` 與 `/home/wooo/scripts/full-stack-cold-start-check.sh.before-hostkey-policy-20260701-183637`;live hash 為 `full-stack-cold-start-check.sh=e320c061f5afd31c2a682576218f549b683f25dafd43dd52acc13b6283b33712`、`cold-start-textfile-exporter.sh=c52ea4fe8dd58688a87c01ca6288f8f6050aeb82417852213db3e2be69b29568`。`verify-cold-start-monitor-deploy.sh` 現在只判斷 deploy parity:hash、host-key policy、monitor-up;runtime green 另由 cold-start / scorecard 判斷,避免 SignOz/MOMO 尚未恢復時把 source parity 誤報成 deploy mismatch。18:40 scorecard 讀回 `CORE_COLD_START_DEPLOY_PARITY=1`、`CORE_REGISTRY_READY=1`、`CORE_COLD_START_BLOCKED_GATES=2`、`CORE_COLD_START_FIRING_ALERTS=3`、`DR_OFFSITE_EVIDENCE_READBACK=1`、`ESCROW_MISSING_COUNT=5`、`RECOVERY_STATE=CORE_NOT_READY_DR_OFFSITE_PENDING`。目前 hard blockers 是 `signoz.wooo.work` public 502 / TLS failure,以及 `188 momo daily sales data stale beyond 3 days`;DR 仍缺 5 個 credential escrow non-secret evidence。18:47 source fix 已把 SignOz source-of-truth 從 `127.0.0.1:3301` 改到實際 110 upstream `192.168.0.110:8080`,且 cold-start probe 改為檢查同一 upstream;live apply 仍未執行,因為 188 `sudo -n nginx -t` 回 `sudo: a password is required`。不可宣稱:full-stack green、10 分鐘全服務恢復、MOMO daily data 最新、SignOz public route 正常、DR complete 或 110 SSH 永久穩定。下一步固定為 privileged Nginx config apply/readback 或本機 console 套用 SignOz route,接著處理 MOMO source freshness;110 若再次 command timeout,走 local console / control-channel recovery package,不重啟主機、不恢復 generic runner。 2026-06-30 22:55 latest live summary:全主機重啟後仍不可宣稱 10 分鐘自動恢復。`SSH_COMMAND_TIMEOUT_SECONDS=8 SSH_BATCH_MODE=yes bash scripts/reboot-recovery/full-stack-cold-start-check.sh --monitor-read-only --no-color` artifact `/tmp/awoooi-cold-start-live-after-ff.log` 回 `PASS=68 WARN=4 BLOCKED=4`,hard blockers 是 110 registry external `/v2`、110 SSH read-only check、K3s registry pull refused by `110:5000`、SignOz TLS / 502。StockPlatform public freshness / ingestion 22:50 仍回 `status=not_configured`、`blockers=["postgres_not_ready"]`。Public Gitea queue 22:55 回 `status=blocked_harbor_110_repair_no_matching_runner`,latest CD `#4105` 雖顯示 `Running`,但 build log 已有 `latest_visible_cd_inflight_classifier=harbor_registry_public_route_unavailable_pending_retry`、`latest_visible_cd_harbor_latest_registry_v2_status=502`、`latest_visible_cd_harbor_public_route_retrying_unavailable=true`、`harbor_controlled_repair_skipped=not_110_host`;Harbor repair 仍 `Waiting` 且缺 `awoooi-host` runner。判定:CD `Running` 不得視為中性等待;若 registry retry 已連續 502 / 000 且 repair 無 110 control path,立即依 110 control path / Harbor `/v2` 主線 blocker 處理。不可宣稱:全服務恢復、最新版本已上 production、Stock 資料最新、backup core green、DR complete、188 hygiene green。下一步仍固定為恢復 110-local repair control path / Harbor `/v2`,再重跑 post-reboot summary、cold-start、Stock freshness / ingestion 與 SLO scorecard。 diff --git a/infra/ansible/roles/nginx/templates/188-all-sites.conf.j2 b/infra/ansible/roles/nginx/templates/188-all-sites.conf.j2 index bd3b90ad..d74f5a7a 100644 --- a/infra/ansible/roles/nginx/templates/188-all-sites.conf.j2 +++ b/infra/ansible/roles/nginx/templates/188-all-sites.conf.j2 @@ -91,7 +91,7 @@ server { } location / { - proxy_pass http://127.0.0.1:3301; + proxy_pass http://192.168.0.110:8080; proxy_http_version 1.1; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; diff --git a/infra/ansible/roles/nginx/templates/188-internal-tools-https.conf.j2 b/infra/ansible/roles/nginx/templates/188-internal-tools-https.conf.j2 index 5a7118a5..b73dfc2d 100644 --- a/infra/ansible/roles/nginx/templates/188-internal-tools-https.conf.j2 +++ b/infra/ansible/roles/nginx/templates/188-internal-tools-https.conf.j2 @@ -42,7 +42,7 @@ server { ssl_certificate_key /etc/letsencrypt/live/sentry.wooo.work/privkey.pem; location / { - proxy_pass http://127.0.0.1:3301; + proxy_pass http://192.168.0.110:8080; proxy_http_version 1.1; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; diff --git a/scripts/reboot-recovery/full-stack-cold-start-check.sh b/scripts/reboot-recovery/full-stack-cold-start-check.sh index a7344399..bf5b1dbc 100755 --- a/scripts/reboot-recovery/full-stack-cold-start-check.sh +++ b/scripts/reboot-recovery/full-stack-cold-start-check.sh @@ -282,7 +282,7 @@ echo "SYSTEMD $(systemctl is-active containerd docker postgresql@14-main redis-s echo "PG $(pg_isready -h localhost -p 5432 2>&1)" echo "REDIS $(redis-cli -p 6380 ping 2>/dev/null || redis-cli ping 2>/dev/null || true)" echo "PORT5432 $(nc -z -w 2 127.0.0.1 5432 >/dev/null 2>&1 && echo OPEN || echo CLOSED)" -echo "SIGNOZ_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:3301/ || true)" +echo "SIGNOZ_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://192.168.0.110:8080/ || true)" echo "MOMO_HEALTH_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:5003/health || true)" docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -80 ' 2>&1); then @@ -296,7 +296,7 @@ docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -80 grep -q "accepting connections" <<<"$out" && ok "188 PostgreSQL accepting connections" || fail "188 PostgreSQL not accepting connections" grep -q "REDIS PONG" <<<"$out" && ok "188 Redis PONG" || warn "188 Redis not confirmed" grep -q "momo-db.*Restarting" <<<"$out" && warn "188 momo-db restarting" || ok "188 momo-db not in visible restart loop" - grep -Eq "SIGNOZ_CODE (200|302|307)" <<<"$out" && ok "188 SignOz HTTP reachable" || warn "188 SignOz HTTP not confirmed" + grep -Eq "SIGNOZ_CODE (200|302|307)" <<<"$out" && ok "SignOz UI upstream reachable from 188" || warn "SignOz UI upstream not confirmed from 188" grep -q "MOMO_HEALTH_CODE 200" <<<"$out" && ok "188 momo health reachable" || warn "188 momo health not confirmed" }