From 99eafd357210d045b2cec080b23c7769051beee0 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 1 Jul 2026 16:06:24 +0800 Subject: [PATCH] fix(k8s): unblock prod topology rollouts [skip ci] --- docs/LOGBOOK.md | 19 +++++++++++++++++++ k8s/awoooi-prod/05-deployment-web.yaml | 7 ++++--- k8s/awoooi-prod/06-deployment-api.yaml | 7 ++++--- k8s/awoooi-prod/08-deployment-worker.yaml | 6 +++--- 4 files changed, 30 insertions(+), 9 deletions(-) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index a8b7c56a..c98b6732 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,22 @@ +## 2026-07-01 — 16:05 Gitea CD / ArgoCD post-reboot recovery + +**照主線修復的問題**: +- Gitea / Harbor / registry live readback 已恢復,但 CD `#4258` 在 deploy marker `9db4f72cb chore(cd): deploy 06819ea [skip ci]` 後仍轉 `Failure`;未讀 raw job log,改用公開 run list、Gitea main、K8s / ArgoCD readback 分層定位。 +- production workbench 與 K8s deployment 顯示 API / Web / Worker 仍跑 `7890778b...`,而 Gitea main desired image 已是 `06819ea...`,根因不是 Gitea repo 消失,而是 ArgoCD 無法比較 / 同步 main。 +- ArgoCD `awoooi-prod` 回 `ComparisonError`:repo-server gRPC `connection refused`;`argocd-repo-server` initContainer `copyutil` 因 `/bin/ln: Already exists` 反覆 BackOff。已 live patch repo-server init command,先移除既有 symlink 再建立,repo-server 成功 rollout Ready。 +- ArgoCD refresh 後成功同步到 `9db4f72cb`,但 API rollout 卡在 1/2;Pending pod event 顯示 `nodeSelector=mon` 搭配 `topologySpreadConstraints minDomains=2 whenUnsatisfiable=DoNotSchedule` 自相矛盾,造成 scheduler deadlock。 +- 已把 API / Web / Worker 的 topology spread 從硬性 `DoNotSchedule + minDomains=2` 調整為柔性 `ScheduleAnyway`,保留 hostname spread preference,但不再阻塞 post-reboot / GitOps rollout。 + +**驗證**: +- `argocd-repo-server` rollout:`deployment "argocd-repo-server" successfully rolled out`。 +- ArgoCD readback:`sync=Synced rev=9db4f72cbf98`,API deployment image / env 已更新為 `06819ea96c05...`。 +- `kubectl apply --dry-run=server` 驗證 `awoooi-api`、`awoooi-web`、`awoooi-worker` manifests:通過。 + +**邊界**:未讀 secret / token / `.env` / raw sessions / SQLite / auth;未使用 GitHub / `gh` / GitHub API;未重啟主機 / Docker / Nginx / K3s / DB / firewall;live patch 僅限 ArgoCD repo-server init command 與 prod Deployment topology scheduler policy。 + +**下一步**: +- 推 Gitea main 後等 ArgoCD 套用 source 修法,讀回 API / Web / Worker 全部 Ready、production workbench runtime/desired tag 等於 `06819ea...`,再判定 Gitea/CD 恢復完成。 + ## 2026-07-01 — 15:25 Gitea live recovery after 110 Docker/control-plane failure **照主線修復的問題**: diff --git a/k8s/awoooi-prod/05-deployment-web.yaml b/k8s/awoooi-prod/05-deployment-web.yaml index c60c3f8f..ae6ecf19 100644 --- a/k8s/awoooi-prod/05-deployment-web.yaml +++ b/k8s/awoooi-prod/05-deployment-web.yaml @@ -36,12 +36,13 @@ spec: annotations: awoooi.dev/topology-rebalance-generation: "2026-06-13T13:05:00+08:00" spec: - # 2026-06-13 Codex: 120 / 121 皆 Ready 時強制跨節點分散,避免 replicas=2 合法同落單節點。 + # 2026-07-01 Codex: topology spread must not deadlock rollouts when the + # active prod placement has fewer eligible domains after reboot. + # Keep the skew preference, but allow scheduling so GitOps deploys can finish. topologySpreadConstraints: - maxSkew: 1 - minDomains: 2 topologyKey: kubernetes.io/hostname - whenUnsatisfiable: DoNotSchedule + whenUnsatisfiable: ScheduleAnyway labelSelector: matchLabels: app: awoooi-web diff --git a/k8s/awoooi-prod/06-deployment-api.yaml b/k8s/awoooi-prod/06-deployment-api.yaml index bdc35cce..cfd48452 100644 --- a/k8s/awoooi-prod/06-deployment-api.yaml +++ b/k8s/awoooi-prod/06-deployment-api.yaml @@ -40,12 +40,13 @@ spec: annotations: awoooi.dev/topology-rebalance-generation: "2026-06-13T13:05:00+08:00" spec: - # 2026-06-13 Codex: 120 / 121 皆 Ready 時強制跨節點分散,避免 replicas=2 合法同落單節點。 + # 2026-07-01 Codex: topology spread must not deadlock rollouts when the + # active prod placement is pinned to one eligible node after reboot. + # Keep the skew preference, but allow scheduling so GitOps deploys can finish. topologySpreadConstraints: - maxSkew: 1 - minDomains: 2 topologyKey: kubernetes.io/hostname - whenUnsatisfiable: DoNotSchedule + whenUnsatisfiable: ScheduleAnyway labelSelector: matchLabels: app: awoooi-api diff --git a/k8s/awoooi-prod/08-deployment-worker.yaml b/k8s/awoooi-prod/08-deployment-worker.yaml index 44091905..54089b05 100644 --- a/k8s/awoooi-prod/08-deployment-worker.yaml +++ b/k8s/awoooi-prod/08-deployment-worker.yaml @@ -41,12 +41,12 @@ spec: environment: prod component: signal-processor spec: - # 2026-06-13 Codex: Worker 目前 min=1;擴到多副本時必須跨 120 / 121 分散。 + # 2026-07-01 Codex: keep hostname spread as a preference, not a hard + # scheduler blocker, so reboot recovery and GitOps rollouts do not deadlock. topologySpreadConstraints: - maxSkew: 1 - minDomains: 2 topologyKey: kubernetes.io/hostname - whenUnsatisfiable: DoNotSchedule + whenUnsatisfiable: ScheduleAnyway labelSelector: matchLabels: app: awoooi-worker