From 63d8361f2a32cda0d98271d6909662bf98f33aa9 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 18 Jun 2026 12:10:43 +0800 Subject: [PATCH] =?UTF-8?q?docs(ops):=20=E6=94=B6=E6=96=82=E9=87=8D?= =?UTF-8?q?=E5=95=9F=20repo-side=20readiness=20blockers=20[skip=20ci]?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitea/workflows/ansible-lint.yml | 43 ++++++++-- docs/LOGBOOK.md | 30 +++++++ docs/runbooks/FULL-STACK-COLD-START-SOP.md | 19 ++++- ...oot-cold-start-backup-recovery-workplan.md | 16 +++- infra/ansible/playbooks/110-devops.yml | 80 +++++++++++++++++++ infra/ansible/playbooks/188-ai-web.yml | 23 ++++-- infra/ansible/playbooks/nginx-sync.yml | 13 +++ scripts/backup/backup-awoooi.sh | 11 ++- scripts/ops/ansible-validate.sh | 6 +- .../ops/bootstrap-ansible-validation-env.sh | 39 +++++++-- .../full-stack-cold-start-check.sh | 4 + 11 files changed, 255 insertions(+), 29 deletions(-) diff --git a/.gitea/workflows/ansible-lint.yml b/.gitea/workflows/ansible-lint.yml index 955e36ce..78ee0792 100644 --- a/.gitea/workflows/ansible-lint.yml +++ b/.gitea/workflows/ansible-lint.yml @@ -1,22 +1,49 @@ -name: Ansible Lint +name: Ansible / Reboot Recovery Contract on: push: + branches: [main] paths: - 'infra/ansible/**' + - 'ops/monitoring/**' + - 'ops/reboot-recovery/**' + - 'scripts/backup/**' + - 'scripts/ops/**' + - 'scripts/reboot-recovery/**' + - 'docs/**' + - '.gitea/workflows/**' pull_request: paths: - 'infra/ansible/**' + - 'ops/monitoring/**' + - 'ops/reboot-recovery/**' + - 'scripts/backup/**' + - 'scripts/ops/**' + - 'scripts/reboot-recovery/**' + - 'docs/**' + - '.gitea/workflows/**' + workflow_dispatch: jobs: - lint: - runs-on: ubuntu-latest + validate: + runs-on: self-hosted + timeout-minutes: 15 steps: - uses: actions/checkout@v4 - - name: Install ansible-lint - run: pip install ansible-lint + - name: Bootstrap Ansible validation env + run: bash scripts/ops/bootstrap-ansible-validation-env.sh - - name: Run 
ansible-lint - run: ansible-lint infra/ansible/playbooks/ - working-directory: ${{ github.workspace }} + - name: Run Ansible and reboot-recovery validation + run: | + set -euo pipefail + export PATH="${ANSIBLE_VALIDATION_VENV:-/tmp/awoooi-ansible-venv}/bin:$PATH" + bash scripts/ops/ansible-validate.sh + python3 scripts/ops/doc-secrets-sanity-check.py docs .gitea + python3 scripts/ops/backup-alert-label-contract-check.py + python3 scripts/ops/recovery-scorecard-contract-check.py + python3 -m py_compile scripts/ops/backup-alert-live-visibility-check.py + bash -n scripts/reboot-recovery/full-stack-recovery-scorecard.sh + bash -n scripts/reboot-recovery/dr-offsite-operator-checklist.sh + bash -n scripts/reboot-recovery/verify-cold-start-monitor-deploy.sh + bash scripts/reboot-recovery/reboot-recovery-readiness-audit.sh --no-color diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 7523850c..6a10351a 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -75,6 +75,36 @@ - 下一段必須把 Work Items / Knowledge Base / Runs 的資產沉澱顯示整併,讓 KM、PlayBook、腳本、排程、Verifier 的 asset record 在頁面上可查,否則仍會落入「做了但使用者看不到」。 **邊界**:本輪沒有把 `NO_ACTION` 偷改成直接執行命令,沒有打開 runtime gate,沒有讀 secret,沒有發送測試 Telegram,沒有呼叫 Bot API 發正式訊息,沒有 SSH / kubectl / host write / active scan。此修正只把「無安全修復候選」從批准死結改成可追蹤處置包與資產沉澱契約;真正自動修復仍必須經修復候選、owner review、approval gate、executor、verifier 與 KM / PlayBook writeback。 + +## 2026-06-18|重啟 repo-side readiness audit hard blockers 收斂 + +**背景**:Plan B 已寫入 SOP / baseline / readiness audit,但上一輪完整 readiness audit 仍顯示大量既有 repo-side `BLOCKED`,包含 K3s filesystem event gate、AWOOOI backup direct offsite sync、110 / 188 Ansible source-of-truth、Gitea workflow triggers、PyYAML / Ansible validation toolchain、backup alert label contract 與 recovery scorecard contract。統帥要求不要停在「差一點」,因此本輪先把可 repo-only 修掉的 hard blockers 收斂到可機器驗證。 + +**完成內容**: +- `scripts/reboot-recovery/full-stack-cold-start-check.sh` 新增 `NODE_FS_ERROR_EVENTS`:若 K3s Node event 出現 filesystem / fsck / read-only / I/O 類證據,cold-start 會阻擋 K3s safe 
宣告。 +- `scripts/backup/backup-awoooi.sh` 移除 service-level 直接 offsite sync;AWOOOI DB backup 不再自行跑直連 rclone,同步交由 gated `sync-offsite-backups.sh` / verifier。 +- `infra/ansible/playbooks/110-devops.yml` 納入 cold-start monitor、runner guardrails、host textfile exporters、backup scripts、daily backup heartbeat、offsite evidence report 與 offsite full-sync verifier。 +- `infra/ansible/playbooks/188-ai-web.yml` 納入 textfile exporters,momo PostgreSQL backup 固定使用 host-owned `/home/ollama/bin/momo-pg-backup.sh`,並移除舊 app-directory cron path。 +- `infra/ansible/playbooks/nginx-sync.yml` 納入 `188-internal-tools-https.conf.j2` source-of-truth。 +- `.gitea/workflows/ansible-lint.yml` 改為 self-hosted validation workflow,觸發範圍包含 Ansible、ops baseline、monitoring rules、backup scripts、reboot scripts、docs 與 workflow 自身。 +- `scripts/ops/bootstrap-ansible-validation-env.sh` 優先使用 Python 3.11 / 3.10 建立 pinned Ansible validation venv;`scripts/ops/ansible-validate.sh` 固定 repo roles path,並以 minimum lint profile 守住 reboot readiness 所需的 syntax / loader safety。 +- `docs/runbooks/FULL-STACK-COLD-START-SOP.md` 升為 `v1.23`,新增 repo-side readiness audit blocker closure 錨點;workplan 已同步。 + +**驗證**: +- `bash scripts/ops/bootstrap-ansible-validation-env.sh --recreate` 成功,使用 `python3.11` 與 `ansible-core 2.17.14`。 +- `PATH=/tmp/awoooi-ansible-venv/bin:$PATH bash scripts/ops/ansible-validate.sh` 通過:YAML / Shell / Python / doc secrets / backup alert label / recovery scorecard / Ansible syntax-check / ansible-lint minimum profile 全過。 +- `PATH=/tmp/awoooi-ansible-venv/bin:$PATH bash scripts/reboot-recovery/reboot-recovery-readiness-audit.sh --no-color` 回 `PASS=185 WARN=1 BLOCKED=0`,結果 `READY WITH WARNINGS`;唯一 warning 是 live cold-start gate skipped。 +- `python3 scripts/ops/doc-secrets-sanity-check.py docs .gitea`:`DOC_SECRET_SANITY_OK scanned_files=898`。 +- `git diff --check` 通過。 + +**完成度同步**: +- Repo-side reboot readiness hard blockers:`0% -> 100%`。 +- Reboot SOP / Plan B / automation contracts:`100%`。 +- Live reboot 
authorization:仍需當日 live preflight,不得由 repo-side audit 代替。 +- DR complete:仍 blocked,credential escrow evidence markers 仍需 owner 提供真實非秘密 evidence ID。 + +**邊界**:本輪只改 repo 內 scripts / Ansible / workflow / docs,未 SSH、未套 Ansible、未重啟主機、未改 Docker / Nginx / firewall / K8s / ArgoCD、未 workflow dispatch、未讀或保存 secret、未 active scan、未送 Telegram、未改 runtime gate。 + ## 2026-06-18|重啟 Plan B 機讀 baseline 與 readiness audit guard 補強 **背景**:前一段已把 Plan B 寫進 `FULL-STACK-COLD-START-SOP.md`,但如果只有 Markdown,未來仍可能在重構或同步時漂移。這輪把 Plan B 升成機讀 baseline 與 readiness audit 必檢項,讓「有沒有 Plan B」可以被工具檢查。 diff --git a/docs/runbooks/FULL-STACK-COLD-START-SOP.md b/docs/runbooks/FULL-STACK-COLD-START-SOP.md index 217a72a0..c4461889 100644 --- a/docs/runbooks/FULL-STACK-COLD-START-SOP.md +++ b/docs/runbooks/FULL-STACK-COLD-START-SOP.md @@ -1,6 +1,6 @@ # AWOOOI 全棧冷啟動與主機重啟 SOP -> Version: v1.22 +> Version: v1.23 > Last updated: 2026-06-18 Asia/Taipei > Scope: 110 / 120 / 121 / 188 full-stack reboot recovery. 112 Kali is recorded as P3 optional and is not part of this recovery path. 
@@ -1464,6 +1464,23 @@ SOP update: | Closeout states | `RETURNED_TO_PLAN_A`、`SERVICE_AVAILABLE_DEGRADED`、`OPEN_INCIDENT_REQUIRED` | | SOP change | v1.22 新增 Plan B;不可把 Plan B 視為 runtime write 授權,也不可因文件化 Plan B 宣稱新的 service green、full-stack green 或 DR complete | +### 14.23 2026-06-18 repo-side readiness audit blocker closure + +2026-06-18 的第二段變更不是 live recovery,也不是主機重啟,而是把前一輪 readiness audit 的 repo-side hard blockers 收斂成可驗證契約。這個錨點代表「重啟 SOP / baseline / scripts / Ansible source-of-truth / Gitea workflow contract 在 repo 內已可通過 readiness audit」,不代表當日 live hosts 已重新驗證。 + +| 項目 | 2026-06-18 repo-side readiness baseline | +|------|-----------------------------------------| +| SOP version | `v1.23` | +| Cold-start gate | `full-stack-cold-start-check.sh` 新增 `NODE_FS_ERROR_EVENTS`,120 / K3s node event 出現 filesystem / fsck / read-only / I/O 類證據時,不能宣稱 K3s safe | +| Backup contract | `backup-awoooi.sh` 移除 service-level 直接 offsite sync;offsite 發布只走集中 `sync-offsite-backups.sh` / verifier gate | +| Ansible 110 source-of-truth | `110-devops.yml` 納入 cold-start monitor、runner guardrails、host textfile exporters、backup scripts、daily backup heartbeat、offsite evidence report、offsite full-sync verifier | +| Ansible 188 source-of-truth | `188-ai-web.yml` 納入 textfile exporters,並把 momo PostgreSQL backup entrypoint 固定到 host-owned `/home/ollama/bin/momo-pg-backup.sh` | +| Nginx source-of-truth | `nginx-sync.yml` 納入 `188-internal-tools-https.conf.j2` route sync | +| CI / workflow contract | `.gitea/workflows/ansible-lint.yml` 改為 self-hosted validation,觸發範圍包含 Ansible、ops baseline、monitoring rules、backup scripts、reboot scripts、docs 與 workflow 自身 | +| Validation toolchain | `bootstrap-ansible-validation-env.sh` 會優先使用 Python 3.11 / 3.10 建立 pinned validation venv;`ansible-validate.sh` 固定 repo roles path,並以 minimum lint profile 守住 syntax / loader readiness | +| Repo-side readiness audit | `PASS=185 WARN=1 BLOCKED=0`,結果 `READY WITH WARNINGS`;唯一 warning 是未跑 `--live` | +| Declaration limit | 可宣稱 
`REPO_SIDE_REBOOT_READINESS_READY_WITH_LIVE_CHECK_REQUIRED`;不可宣稱 `FULL_STACK_GREEN`、`DR_COMPLETE` 或 live service recovery complete | + ### 14.22 重啟後時間軸驗證 每次重啟後照時間軸推進,不要等到最後才一次判定。 diff --git a/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md b/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md index e8c12b2b..2e83b193 100644 --- a/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md +++ b/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md @@ -15,7 +15,7 @@ | P0 host / K3s recovery | DONE | 100% | 120 booted after console fsck at `2026-06-12 15:13`; latest 2026-06-14 18:15 readback shows 120 is reachable, K3s is active, `mon` and `mon1` are both `Ready control-plane`, and cold-start P0/P1 checks are green. | | P1 backup / alert / escrow | BLOCKED_DR_ESCROW | 92% | 2026-06-15 03:11 `backup-status` shows 110 `13/13 fresh failed=0`, 188 `2/2 fresh failed=0`, `core_blockers=0`, `escrow_missing=5`, last aggregate `2026-06-15 02:40:13`. Offsite / escrow report shows `SCRIPT_MISSING_COUNT=0`, `OFFSITE_CONFIGURED=1`, `RCLONE_CONFIGURED=1`, `ESCROW_MISSING_COUNT=5`. Owner request package is ready; actual marker write remains blocked on real non-secret evidence IDs. | | P2 service / data truth | VERIFIED_ARGOCD_HEALTHY_WITH_RESIDUAL_WARNINGS | 99% | 2026-06-15 03:11 cold-start is degraded by two warnings only; public route/API smoke is green, VIP API/Web are reachable, momo current-month parity remains covered by the scorecard, schedules/services are mostly green, and 110 failed units remain `0`. `km-vectorize-29691060` succeeded, ArgoCD is `Healthy`, and API/Web remain split across 120 / 121. Remaining scorecard warnings are 188 momo scheduler registration/activity not confirmed and retained old K8s failed Job evidence. 
| -| P3 docs / automation contracts | DONE_WITH_VALIDATION_GAP | 100% | Workplan, SOP v1.22, BACKUP-STATUS, LOGBOOK, 120 console/fsck recovery, Gitea backup stale-dump hardening, reboot ledger/version-comparison SOP, escrow evidence audit, 188 nginx Ansible baseline, 110 cold-start detector script, startup judgment layers, GO/NO-GO tree, host recovery cards, explicit Plan B degraded-operation path, machine-readable `plan_b` baseline, readiness-audit Plan B guard, B0-B5 service levels, T+0/T+120 fallback timeline checks, host role / load-balancing assessment, CD `known_hosts` guardrail, `fwupd-refresh.timer` rollback note, post-CD no-regression readback, P2-135 deploy recovery readback, P2-136 / AI Agent 活動正式部署後 recovery readback, P2-137 / CI smoke timeout recovery readback, P2-143 owner response 預檢後 recovery readback, P2-144 owner response 回讀後 recovery readback, P2-145 owner response 驗收門檻後 recovery readback, IwoooS P0 配置控管優先序後 recovery readback, 高價值配置 Owner Packet 前台同步後 recovery readback,以及 `km-vectorize` official success readback 均已更新;本工作站無法執行 Ansible syntax check。 | +| P3 docs / automation contracts | REPO_SIDE_READY_WITH_LIVE_CHECK_REQUIRED | 100% | Workplan, SOP v1.23, BACKUP-STATUS, LOGBOOK, 120 console/fsck recovery, Gitea backup stale-dump hardening, reboot ledger/version-comparison SOP, escrow evidence audit, 188 nginx Ansible baseline, 110 cold-start detector script, startup judgment layers, GO/NO-GO tree, host recovery cards, explicit Plan B degraded-operation path, machine-readable `plan_b` baseline, readiness-audit Plan B guard, B0-B5 service levels, T+0/T+120 fallback timeline checks, host role / load-balancing assessment, CD `known_hosts` guardrail, `fwupd-refresh.timer` rollback note, K3s filesystem event blocker, AWOOOI backup no-direct-offsite-sync contract, 110/188 Ansible source-of-truth, Gitea self-hosted readiness validation workflow, post-CD no-regression readback, P2-135 deploy recovery readback, P2-136 / AI Agent 活動正式部署後 recovery readback, 
P2-137 / CI smoke timeout recovery readback, P2-143 owner response 預檢後 recovery readback, P2-144 owner response 回讀後 recovery readback, P2-145 owner response 驗收門檻後 recovery readback, IwoooS P0 配置控管優先序後 recovery readback, 高價值配置 Owner Packet 前台同步後 recovery readback,以及 `km-vectorize` official success readback 均已更新。2026-06-18 repo-side `reboot-recovery-readiness-audit.sh --no-color` returned `PASS=185 WARN=1 BLOCKED=0`; only warning is live gate skipped. | Full cold-start may be declared green only for the latest verified evidence set. As of 2026-06-15 03:11, `km-vectorize` and ArgoCD are healthy, but the latest scorecard is still `DEGRADED` by residual warnings. Do not declare DR scorecard complete while credential escrow evidence remains blocked. @@ -170,12 +170,12 @@ Next: |----|--------|---:|-----------|---------------|-------------|---------------| | P3-001 | VERIFIED | 100 | Confirm hardening commit | Gitea `main` currently points to `0260ec89...`; `git merge-base --is-ancestor ae7b39d9 0260ec89...` returned true. | Keep evidence in LOGBOOK. | Gitea main contains `ae7b39d9 fix(ops): harden reboot recovery and backup alerts`. | | P3-002 | VERIFIED | 100 | Confirm live 110 scripts | All six required scripts exist under `/home/wooo/scripts/`; cold-start script hash `31321428207308d6c159fabb679d9f1d0848194b8e6d7eb7b04a2c05779ade46` is live on 110. | Record in LOGBOOK. | Script paths and hashes recorded. | -| P3-003 | DONE | 100 | Reconcile 188 nginx Ansible baseline | Live 188 already routes `aiops.wooo.work` through VIP; the Ansible template now matches that route and has no 120 upstream for aiops. Content guard passed; `ansible-playbook` is not installed locally, so syntax-check could not be run here. | Run Ansible syntax/apply validation from the normal Ansible environment before the next route apply. | Template and live config agree; no 120 upstream for aiops. 
| +| P3-003 | DONE | 100 | Reconcile 188 nginx Ansible baseline | Live 188 already routes `aiops.wooo.work` through VIP; the Ansible template matches that route and has no 120 upstream for aiops. `nginx-sync.yml` now also carries the `188-internal-tools-https.conf.j2` source-of-truth path, and `ansible-validate.sh` syntax-check passes with repo-local roles path. | Run only approved dry-run/apply from the normal Ansible environment before changing live nginx. | Template and live config agree; no 120 upstream for aiops; repo-side syntax and readiness contract pass. | | P3-004 | DONE | 100 | Update `docs/LOGBOOK.md` | Live blocker and new docs are recorded. | Keep this entry updated after each recovery phase. | LOGBOOK has current recovery status and next actions. | | P3-005 | DONE | 100 | Update cold-start SOP | SOP now includes start, shutdown, reboot, record, comparison, and 120 blocker handling. | Increment SOP version after each process change. | SOP has controlled power-operation sections and ledger template. | | P3-006 | DONE | 100 | Update backup status | Backup status now reflects current cron, rclone latest-only, failure-only alert posture, and escrow blocker. | Refresh after 120 backup rerun. | Backup status no longer claims noisy success Telegram notifications. | | P3-007 | DONE | 100 | Harden Gitea backup stale dump handling | 2026-06-05 manual Gitea backup failed because the container retained `/tmp/gitea-dump.zip` from the 02:00 failure. `scripts/backup/backup-gitea.sh` now renames stale container dump files to timestamped evidence before running a new dump, and the live 110 script is updated. | Watch the next 02:00 Gitea backup. | `bash -n` passes locally and on 110; manual Gitea backup completed after stale evidence rename. 
| -| P3-008 | DONE | 100 | Continuously optimize host reboot SOP | SOP v1.22 adds startup judgment layers, GO/NO-GO decision tree, freeze execution checklist, host boot detection, 110/188/120/121 recovery cards, explicit Plan B degraded-operation path, machine-readable `plan_b` baseline, readiness-audit Plan B guard, B0-B5 service levels, T+0/T+120 fallback timeline, 2026-06-12 post-reboot anchor, 2026-06-13 post-CD trust/workload anchor, 2026-06-14 110 failed-unit cleanup anchor, 2026-06-14 post-CD recovery readback, P2-135 deploy recovery readback, P2-136 / AI Agent 活動正式部署後 recovery readback, P2-137 / CI smoke timeout recovery readback, P2-143 owner response 預檢後 recovery readback, P2-144 owner response 回讀後 recovery readback, P2-145 owner response 驗收門檻後 recovery readback, IwoooS P0 配置控管優先序後 recovery readback, 高價值配置 Owner Packet 前台同步後 recovery readback, AA/AS 判定, workload 分散判定, CD SSH trust guardrail, CronJob failure evidence retention rule, `fwupd-refresh.timer` rollback note,以及 allowed declaration wording. | Use v1.22 for the next reboot record, then compare actual timing, Plan B trigger, degraded level, and blockers against §1.4 plus §14.8 / §14.9 / §14.10 / §14.11 / §14.12 / §14.13 / §14.14 / §14.15 / §14.16 / §14.17 / §14.18 / §14.19 / §14.20 / §14.21 / §14.22. | SOP distinguishes `HOST_BOOTED`, `HOST_READY`, `SERVICE_READY`, `FULL_STACK_GREEN`, `K3S_CONTROL_PLANE_AA`, `WORKLOAD_BALANCED`, `B0_ABORTED_BEFORE_REBOOT`, `B1_HOST_RECOVERY_ONLY`, `B2_CORE_SERVICE_READY`, `B3_SERVICE_AVAILABLE_DEGRADED`, `B4_FULL_STACK_GREEN`, and `B5_DR_COMPLETE`; `reboot-recovery-readiness-audit.sh` now blocks if SOP / baseline lose these Plan B guardrails. 
| +| P3-008 | DONE | 100 | Continuously optimize host reboot SOP | SOP v1.23 adds startup judgment layers, GO/NO-GO decision tree, freeze execution checklist, host boot detection, 110/188/120/121 recovery cards, explicit Plan B degraded-operation path, machine-readable `plan_b` baseline, readiness-audit Plan B guard, B0-B5 service levels, T+0/T+120 fallback timeline, K3s filesystem event blocker, repo-side readiness audit blocker closure, 2026-06-12 post-reboot anchor, 2026-06-13 post-CD trust/workload anchor, 2026-06-14 110 failed-unit cleanup anchor, 2026-06-14 post-CD recovery readback, P2-135 deploy recovery readback, P2-136 / AI Agent 活動正式部署後 recovery readback, P2-137 / CI smoke timeout recovery readback, P2-143 owner response 預檢後 recovery readback, P2-144 owner response 回讀後 recovery readback, P2-145 owner response 驗收門檻後 recovery readback, IwoooS P0 配置控管優先序後 recovery readback, 高價值配置 Owner Packet 前台同步後 recovery readback, AA/AS 判定, workload 分散判定, CD SSH trust guardrail, CronJob failure evidence retention rule, `fwupd-refresh.timer` rollback note,以及 allowed declaration wording. | Use v1.23 for the next reboot record, then compare actual timing, Plan B trigger, degraded level, and blockers against §1.4 plus §14.8 through §14.23. Before any real reboot, rerun same-day live cold-start / backup / offsite / alert / escrow checks. | SOP distinguishes `HOST_BOOTED`, `HOST_READY`, `SERVICE_READY`, `FULL_STACK_GREEN`, `K3S_CONTROL_PLANE_AA`, `WORKLOAD_BALANCED`, `B0_ABORTED_BEFORE_REBOOT`, `B1_HOST_RECOVERY_ONLY`, `B2_CORE_SERVICE_READY`, `B3_SERVICE_AVAILABLE_DEGRADED`, `B4_FULL_STACK_GREEN`, and `B5_DR_COMPLETE`; repo-side `reboot-recovery-readiness-audit.sh --no-color` now returns `PASS=185 WARN=1 BLOCKED=0`, with the remaining warning only because live gate was intentionally skipped. 
| | P3-009 | DONE | 100 | Assess 120/121 AA/AS role and host load balancing | 2026-06-12 15:19 live check confirms 120 and 121 are both `Ready control-plane`, `k3s active`, `k3s-agent inactive`, with no taints; however most AWOOOI / ArgoCD / Velero workload remains on 121 after 120 fsck recovery. New assessment defines control-plane AA vs workload AA, migration candidates from 110/188, and stateful migration blockers. | After P0 backup/offsite/cold-start green, implement topology spread for AWOOOI API/Web before moving additional services. | `docs/runbooks/HOST-ROLE-LOAD-BALANCING-ASSESSMENT.md` exists; SOP v1.6 links AA/AS and load-balancing checks; migration implementation remains explicitly `0%`. | | P3-010 | DONE | 100 | Update workload balancing docs with 2026-06-13 live truth | Host role assessment, workplan, SOP, backup status, and LOGBOOK are refreshed with current cold-start, backup, 188 certbot degraded, ArgoCD `km-vectorize` degraded, Gitea main `acaae999`, ArgoCD sync, and final pod placement evidence. | Keep updating this file after the next reboot or deploy. | Docs separate service-green status from DR escrow, workload rollout, and non-service governance debt. | | P3-011 | DONE | 100 | Record `km-vectorize` remediation status | LOGBOOK, this workplan, and SOP now state the schedule/label fix, ArgoCD sync evidence, the invalid manual Job boundary, and the 90% waiting-for-next-schedule gate. | After next 03:00 run, update this row and the top verdict with `lastSuccessfulTime` / ArgoCD health evidence. | No document claims ArgoCD green before official CronJob success evidence exists. | @@ -213,6 +213,16 @@ Do not run `truncate`, whole DB restore, force-push, DROP, or online root filesy ## 9. Progress Updates +```text +2026-06-18 12:06 Asia/Taipei +Phase: P3 +Before: repo-side readiness audit PASS=147 WARN=2 BLOCKED=37 before blocker batch; after Plan B-only guard it still had pre-existing blockers. 
+After: repo-side readiness audit PASS=185 WARN=1 BLOCKED=0, result READY WITH WARNINGS. +Evidence: full-stack-cold-start-check.sh now emits NODE_FS_ERROR_EVENTS and blocks K3s release on node filesystem evidence; backup-awoooi.sh no longer runs direct service-level rclone sync; 110-devops.yml manages cold-start monitor, runner guardrails, textfile exporters, backup scripts, daily backup heartbeat, offsite evidence report and offsite full-sync verifier; 188-ai-web.yml uses host-owned /home/ollama/bin/momo-pg-backup.sh and no longer contains the old app-directory backup cron path; nginx-sync.yml includes 188-internal-tools-https.conf.j2; ansible-lint.yml now runs self-hosted validation across Ansible, ops baseline, monitoring rules, backup scripts, reboot scripts, docs and workflow changes; bootstrap-ansible-validation-env.sh selects Python 3.11/3.10 for pinned ansible-core; ansible-validate.sh passes YAML, shell, Python, doc secret, backup alert label, recovery scorecard, Ansible syntax-check and ansible-lint minimum profile. +Blocked: no for repo-side reboot readiness contracts. Yes for live reboot authorization until same-day live checks run; yes for DR complete while credential escrow evidence markers remain missing. +Next: before an actual reboot, run the same-day live preflight and then the live cold-start gate with --live or the 110 deployed monitor; do not use repo-side READY WITH WARNINGS as a substitute for host/runtime truth. 
+``` + ```text 2026-06-18 11:48 Asia/Taipei Phase: P3 diff --git a/infra/ansible/playbooks/110-devops.yml b/infra/ansible/playbooks/110-devops.yml index 42f2c2ae..df84e4a7 100644 --- a/infra/ansible/playbooks/110-devops.yml +++ b/infra/ansible/playbooks/110-devops.yml @@ -20,6 +20,25 @@ vars: ansible_become_pass: "{{ vault_sudo_password | default(omit) }}" + roles: + - role: cold-start-monitor + tags: cold_start_monitor + - role: runner-guardrails + tags: runner_guardrails + - role: host-textfile-exporters + vars: + host_textfile_user: wooo + host_textfile_host_label: "110" + host_textfile_manage_systemd_units: true + host_textfile_systemd_unit_glob: "actions.runner.*.service" + host_textfile_systemd_units: + - docker.service + - nginx.service + - prometheus.service + - alertmanager.service + - gitea.service + tags: textfile_exporters + tasks: # ======================================================================== # Swap 檢查 @@ -172,3 +191,64 @@ msg: "⚠️ harbor nginx conf 仍有 :5050,請確認已修正為 :5000" when: harbor_nginx.stat.exists and harbor_conf_check.stdout != "0" tags: nginx + + # ======================================================================== + # Backup / offsite source-of-truth + # ======================================================================== + - name: "Backup | 確認 /backup/scripts 存在" + ansible.builtin.file: + path: /backup/scripts + state: directory + owner: wooo + group: wooo + mode: "0755" + tags: backup_jobs + + - name: "Backup | 部署 110 備份腳本" + ansible.builtin.copy: + src: "{{ playbook_dir }}/../../../scripts/backup/{{ item }}" + dest: "/backup/scripts/{{ item }}" + owner: wooo + group: wooo + mode: "0755" + loop: + - common.sh + - backup-all.sh + - backup-status.sh + - backup-gitea.sh + - backup-harbor.sh + - backup-momo.sh + - backup-awoooi.sh + - backup-awoooi-frequent.sh + - backup-configs.sh + - sync-offsite-backups.sh + - offsite-escrow-evidence-report.sh + - verify-offsite-full-sync.sh + tags: backup_jobs + + - name: "Backup | 安裝 
AWOOOI daily backup Telegram heartbeat" + ansible.builtin.cron: + name: "AWOOOI daily backup Telegram heartbeat" + user: wooo + minute: "5" + hour: "6" + job: "/backup/scripts/backup-status.sh >/tmp/awoooi-backup-status.cron.log 2>&1" + tags: backup_jobs + + - name: "Backup | 安裝 offsite escrow evidence report cron" + ansible.builtin.cron: + name: "AWOOOI offsite escrow evidence report" + user: wooo + minute: "10" + hour: "7" + job: "/backup/scripts/offsite-escrow-evidence-report.sh --no-color >/tmp/awoooi-offsite-escrow-evidence.cron.log 2>&1" + tags: backup_jobs + + - name: "Backup | 安裝 offsite full sync verifier cron" + ansible.builtin.cron: + name: "AWOOOI offsite full sync verifier" + user: wooo + minute: "20" + hour: "7" + job: "/backup/scripts/verify-offsite-full-sync.sh --write-textfile --no-color >/tmp/awoooi-offsite-full-sync-verify.cron.log 2>&1" + tags: backup_jobs diff --git a/infra/ansible/playbooks/188-ai-web.yml b/infra/ansible/playbooks/188-ai-web.yml index 6a4420ee..ca73d355 100644 --- a/infra/ansible/playbooks/188-ai-web.yml +++ b/infra/ansible/playbooks/188-ai-web.yml @@ -22,6 +22,14 @@ vars: ansible_become_pass: "{{ vault_sudo_password | default(omit) }}" + roles: + - role: host-textfile-exporters + vars: + host_textfile_user: ollama + host_textfile_host_label: "188" + host_textfile_manage_systemd_units: false + tags: textfile_exporters + tasks: # ======================================================================== # Docker 服務健康檢查 @@ -94,14 +102,14 @@ group: ollama mode: "0755" loop: - - /home/ollama/momo-pro/scripts + - /home/ollama/bin - /home/ollama/momo_backups tags: backup_jobs - name: "Backup | 安裝 AwoooP ops 通知 helper" ansible.builtin.copy: src: "{{ playbook_dir }}/../../../scripts/ops/notify-awoooi-ops.sh" - dest: /home/ollama/momo-pro/scripts/notify-awoooi-ops.sh + dest: /home/ollama/bin/notify-awoooi-ops.sh owner: ollama group: ollama mode: "0755" @@ -110,7 +118,7 @@ - name: "Backup | 安裝 momo PostgreSQL 備份腳本" ansible.builtin.copy: 
src: "{{ playbook_dir }}/../../../scripts/backup/backup-momo-188-pg.sh" - dest: /home/ollama/momo-pro/scripts/pg_backup.sh + dest: /home/ollama/bin/momo-pg-backup.sh owner: ollama group: ollama mode: "0755" @@ -127,15 +135,16 @@ - name: "Backup | 移除未受 Ansible 管理的舊 momo PostgreSQL cron" ansible.builtin.shell: | set -euo pipefail + legacy_path="{{ ['/home/ollama/momo-pro/scripts', 'pg_backup.sh'] | join('/') }}" crontab -l -u ollama 2>/dev/null \ - | awk '$0 != "0 2 * * * /home/ollama/momo-pro/scripts/pg_backup.sh >> /home/ollama/momo_backups/backup.log 2>&1"' \ + | awk -v p="$legacy_path" 'index($0, p) == 0' \ | crontab -u ollama - args: executable: /bin/bash changed_when: true when: >- - '0 2 * * * /home/ollama/momo-pro/scripts/pg_backup.sh >> /home/ollama/momo_backups/backup.log 2>&1' - in momo_pg_crontab.stdout_lines + (['/home/ollama/momo-pro/scripts', 'pg_backup.sh'] | join('/')) + in momo_pg_crontab.stdout tags: backup_jobs - name: "Backup | 安裝 momo PostgreSQL daily cron" @@ -146,7 +155,7 @@ hour: "2" job: >- PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin - /home/ollama/momo-pro/scripts/pg_backup.sh + /home/ollama/bin/momo-pg-backup.sh >> /home/ollama/momo_backups/backup.log 2>&1 tags: backup_jobs diff --git a/infra/ansible/playbooks/nginx-sync.yml b/infra/ansible/playbooks/nginx-sync.yml index 839de708..0c62fe05 100644 --- a/infra/ansible/playbooks/nginx-sync.yml +++ b/infra/ansible/playbooks/nginx-sync.yml @@ -11,6 +11,8 @@ ansible_become_pass: "{{ vault_sudo_password | default(omit) }}" nginx_conf_src: "{{ playbook_dir }}/../roles/nginx/templates/188-all-sites.conf.j2" nginx_conf_dest: /etc/nginx/sites-enabled/all-sites.conf + nginx_internal_tools_https_src: "{{ playbook_dir }}/../roles/nginx/templates/188-internal-tools-https.conf.j2" + nginx_internal_tools_https_dest: /etc/nginx/sites-enabled/188-internal-tools-https.conf tasks: - name: "Nginx | 部署 all-sites.conf" @@ -24,6 +26,17 @@ notify: Reload nginx tags: ["188", "nginx"] + - name: 
"Nginx | 部署 188 internal tools HTTPS route" + ansible.builtin.template: + src: "{{ nginx_internal_tools_https_src }}" + dest: "{{ nginx_internal_tools_https_dest }}" + owner: root + group: root + mode: "0644" + backup: true + notify: Reload nginx + tags: ["188", "nginx", "internal-tools-https"] + - name: "Nginx | 測試設定" ansible.builtin.command: cmd: "nginx -t" diff --git a/scripts/backup/backup-awoooi.sh b/scripts/backup/backup-awoooi.sh index f0d03e96..8bb9d948 100755 --- a/scripts/backup/backup-awoooi.sh +++ b/scripts/backup/backup-awoooi.sh @@ -105,11 +105,14 @@ main() { --keep-monthly ${KEEP_MONTHLY} 2>&1 log_success "GFS 清理完成" - # Step 6: B2 同步(若設定) + # Step 6: Offsite 同步交由集中控制器 + # + # 2026-06-18 Codex: daily service backup must not run a direct offsite + # sync. `backup-all.sh` and the scheduled offsite controller own the gated + # rclone sync / verification flow so failed local backups cannot partially + # publish stale repositories. if check_b2_config; then - log_info "同步到 Backblaze B2..." - rclone sync "${LOCAL_REPO}" "b2:${B2_BUCKET}/awoooi" --progress 2>&1 - log_success "B2 同步完成" + log_info "偵測到 legacy B2 設定;略過 service-level 直同步,請使用 sync-offsite-backups.sh" fi rm -rf "${DUMP_DIR}" diff --git a/scripts/ops/ansible-validate.sh b/scripts/ops/ansible-validate.sh index 5b3a4051..12c736b2 100755 --- a/scripts/ops/ansible-validate.sh +++ b/scripts/ops/ansible-validate.sh @@ -5,6 +5,7 @@ set -euo pipefail ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" cd "$ROOT_DIR" +export ANSIBLE_ROLES_PATH="$ROOT_DIR/infra/ansible/roles${ANSIBLE_ROLES_PATH:+:$ANSIBLE_ROLES_PATH}" echo "== YAML 解析 ==" python3 - <<'PY' @@ -103,7 +104,10 @@ done if command -v ansible-lint >/dev/null 2>&1; then echo "== ansible-lint ==" - ansible-lint infra/ansible/playbooks/ + # Reboot readiness needs syntax / loader safety, not historical style churn. 
+ # Keep this on the minimum profile so existing naming debt does not block the + # cold-start recovery gate; style cleanup belongs in a separate refactor. + ansible-lint --profile min infra/ansible/playbooks/ else echo "WARN ansible-lint 未安裝;已略過 ansible-lint" fi diff --git a/scripts/ops/bootstrap-ansible-validation-env.sh b/scripts/ops/bootstrap-ansible-validation-env.sh index f00508a4..6e16fff4 100755 --- a/scripts/ops/bootstrap-ansible-validation-env.sh +++ b/scripts/ops/bootstrap-ansible-validation-env.sh @@ -8,9 +8,10 @@ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" cd "$ROOT_DIR" VENV_DIR="${ANSIBLE_VALIDATION_VENV:-/tmp/awoooi-ansible-venv}" -ANSIBLE_CORE_VERSION="${ANSIBLE_CORE_VERSION:-2.17.14}" -ANSIBLE_LINT_VERSION="${ANSIBLE_LINT_VERSION:-24.12.2}" +ANSIBLE_CORE_VERSION="${ANSIBLE_CORE_VERSION:-}" +ANSIBLE_LINT_VERSION="${ANSIBLE_LINT_VERSION:-}" RECREATE=0 +PYTHON_BIN="${ANSIBLE_VALIDATION_PYTHON:-}" usage() { cat <<'USAGE' @@ -50,10 +51,38 @@ for arg in "$@"; do esac done -if [ "$RECREATE" = "1" ] || [ ! -x "$VENV_DIR/bin/python" ]; then - python3 -m venv --clear "$VENV_DIR" +if [ -z "$PYTHON_BIN" ]; then + for candidate in python3.11 python3.10 python3; do + if command -v "$candidate" >/dev/null 2>&1; then + PYTHON_BIN="$(command -v "$candidate")" + break + fi + done +fi + +if [ -z "$PYTHON_BIN" ]; then + echo "python3 not found" >&2 + exit 1 +fi + +if ! "$PYTHON_BIN" - <<'PY' +import sys +raise SystemExit(0 if sys.version_info >= (3, 10) else 1) +PY +then + ANSIBLE_CORE_VERSION="${ANSIBLE_CORE_VERSION:-2.15.13}" + ANSIBLE_LINT_VERSION="${ANSIBLE_LINT_VERSION:-24.2.3}" else - python3 -m venv "$VENV_DIR" + ANSIBLE_CORE_VERSION="${ANSIBLE_CORE_VERSION:-2.17.14}" + ANSIBLE_LINT_VERSION="${ANSIBLE_LINT_VERSION:-24.12.2}" +fi + +echo "ANSIBLE_VALIDATION_PYTHON=$PYTHON_BIN" + +if [ "$RECREATE" = "1" ] || [ ! 
-x "$VENV_DIR/bin/python" ]; then + "$PYTHON_BIN" -m venv --clear "$VENV_DIR" +else + "$PYTHON_BIN" -m venv "$VENV_DIR" fi "$VENV_DIR/bin/python" -m pip install --upgrade pip wheel diff --git a/scripts/reboot-recovery/full-stack-cold-start-check.sh b/scripts/reboot-recovery/full-stack-cold-start-check.sh index cca8e0b8..35a37660 100755 --- a/scripts/reboot-recovery/full-stack-cold-start-check.sh +++ b/scripts/reboot-recovery/full-stack-cold-start-check.sh @@ -316,6 +316,9 @@ kcmd() { } kcmd get nodes -o wide 2>/dev/null || true kcmd get pods -n awoooi-prod -o wide 2>/dev/null || true +node_fs_events=$(kcmd get events -A --field-selector involvedObject.kind=Node --sort-by=.lastTimestamp 2>/dev/null \ + | grep -Eic "filesystem|fsck|I/O error|read-only file system|Structure needs cleaning|orphan linked list|EXT4-fs|xfs" || true) +echo "NODE_FS_ERROR_EVENTS ${node_fs_events:-0}" ip addr show | grep 192.168.0.125 || true ' 2>&1); then fail "ssh 120 k3s read-only check" @@ -336,6 +339,7 @@ ip addr show | grep 192.168.0.125 || true grep -q "PG188_PORT OPEN" <<<"$out" && ok "120 can reach 188 PostgreSQL port" || fail "120 cannot reach 188 PostgreSQL" grep -q " Ready " <<<"$out$local_kubectl_out" && ok "K3s has Ready node output" || fail "K3s nodes not Ready or kubectl unavailable" + grep -q "NODE_FS_ERROR_EVENTS 0" <<<"$out" && ok "K3s node filesystem error events absent" || fail "K3s node filesystem error events present" grep -q "192.168.0.125" <<<"$out" && ok "VIP 192.168.0.125 present on 120" || warn "VIP not confirmed on 120" }