From ba44c4cff18ce6f47d9100359d3121b60e9fe242 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 1 Jul 2026 16:11:13 +0800 Subject: [PATCH] fix(api): resolve recovery preflight paths in containers --- .../harbor_registry_controlled_recovery_preflight.py | 7 +++++-- ...ockplatform_public_api_controlled_recovery_preflight.py | 4 +++- docs/LOGBOOK.md | 4 +++- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/apps/api/src/services/harbor_registry_controlled_recovery_preflight.py b/apps/api/src/services/harbor_registry_controlled_recovery_preflight.py index cde0d8f6..d57cf8b6 100644 --- a/apps/api/src/services/harbor_registry_controlled_recovery_preflight.py +++ b/apps/api/src/services/harbor_registry_controlled_recovery_preflight.py @@ -15,9 +15,12 @@ from collections.abc import Callable from pathlib import Path from typing import Any +from src.services.snapshot_paths import resolve_repo_root + _SCHEMA_VERSION = "harbor_registry_controlled_recovery_preflight_v1" +_REPO_ROOT = resolve_repo_root(Path(__file__)) _DEFAULT_ROUTE_SOURCE = ( - Path(__file__).resolve().parents[4] + _REPO_ROOT / "infra" / "ansible" / "roles" @@ -26,7 +29,7 @@ _DEFAULT_ROUTE_SOURCE = ( / "188-internal-tools-https.conf.j2" ) _DEFAULT_WATCHDOG_SOURCE = ( - Path(__file__).resolve().parents[4] + _REPO_ROOT / "scripts" / "reboot-recovery" / "harbor-watchdog.sh" diff --git a/apps/api/src/services/stockplatform_public_api_controlled_recovery_preflight.py b/apps/api/src/services/stockplatform_public_api_controlled_recovery_preflight.py index ebcaf144..4a477fac 100644 --- a/apps/api/src/services/stockplatform_public_api_controlled_recovery_preflight.py +++ b/apps/api/src/services/stockplatform_public_api_controlled_recovery_preflight.py @@ -11,13 +11,15 @@ import re from pathlib import Path from typing import Any +from src.services.snapshot_paths import resolve_repo_root from src.services.stockplatform_public_api_runtime_readback import ( load_latest_stockplatform_public_api_runtime_readback, ) _SCHEMA_VERSION = "stockplatform_public_api_controlled_recovery_preflight_v1" +_REPO_ROOT = resolve_repo_root(Path(__file__)) _DEFAULT_ROUTE_SOURCE = ( - Path(__file__).resolve().parents[4] + _REPO_ROOT / "infra" / "ansible" / "roles" diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index c98b6732..2d0625d7 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -6,16 +6,18 @@ - ArgoCD `awoooi-prod` 回 `ComparisonError`:repo-server gRPC `connection refused`;`argocd-repo-server` initContainer `copyutil` 因 `/bin/ln: Already exists` 反覆 BackOff。已 live patch repo-server init command,先移除既有 symlink 再建立,repo-server 成功 rollout Ready。 - ArgoCD refresh 後成功同步到 `9db4f72cb`,但 API rollout 卡在 1/2;Pending pod event 顯示 `nodeSelector=mon` 搭配 `topologySpreadConstraints minDomains=2 whenUnsatisfiable=DoNotSchedule` 自相矛盾,造成 scheduler deadlock。 - 已把 API / Web / Worker 的 topology spread 從硬性 `DoNotSchedule + minDomains=2` 調整為柔性 `ScheduleAnyway`,保留 hostname spread preference,但不再阻塞 post-reboot / GitOps rollout。 +- API 新 pod 排程後又 `CrashLoopBackOff`;previous log 顯示 `harbor_registry_controlled_recovery_preflight.py` 在 production container layout `/app/src/services/...` 使用 `Path(__file__).parents[4]` 造成 `IndexError: 4`。已改用既有 `resolve_repo_root(Path(__file__))`,並同步修掉同型的 StockPlatform controlled recovery preflight 路徑解析,避免下一個 import 再炸。 **驗證**: - `argocd-repo-server` rollout:`deployment "argocd-repo-server" successfully rolled out`。 - ArgoCD readback:`sync=Synced rev=9db4f72cbf98`,API deployment image / env 已更新為 `06819ea96c05...`。 - `kubectl apply --dry-run=server` 驗證 `awoooi-api`、`awoooi-web`、`awoooi-worker` manifests:通過。 +- `py_compile` / targeted pytest 需在 path resolver fix 後重跑並推正常 CD,因為 production 必須建出新 API image 才能離開 CrashLoop。 **邊界**:未讀 secret / token / `.env` / raw sessions / SQLite / auth;未使用 GitHub / `gh` / GitHub API;未重啟主機 / Docker / Nginx / K3s / DB / firewall;live patch 僅限 ArgoCD repo-server init command 與 prod Deployment topology scheduler policy。 **下一步**: -- 推 Gitea main 後等 ArgoCD 套用 source 修法,讀回 API / Web / Worker 全部 Ready、production workbench runtime/desired tag 等於 `06819ea...`,再判定 Gitea/CD 恢復完成。 +- 推 Gitea main 後等 CD 建新 API image,讀回 API / Web / Worker 全部 Ready、production workbench runtime/desired tag 等於最新 commit,再判定 Gitea/CD 恢復完成。 ## 2026-07-01 — 15:25 Gitea live recovery after 110 Docker/control-plane failure