fix(cd): guard production argocd source [skip ci]
This commit is contained in:
@@ -991,7 +991,9 @@ jobs:
|
||||
status=$?
|
||||
set -e
|
||||
if [ "$status" -ne 0 ]; then
|
||||
echo "resource_query_failed=$(echo "$output" | head -c 180)"
|
||||
local output_snippet
|
||||
output_snippet=$(printf '%s' "$output" | head -c 180)
|
||||
echo "resource_query_failed=${output_snippet}"
|
||||
return 0
|
||||
fi
|
||||
echo "$output" \
|
||||
@@ -1001,11 +1003,34 @@ jobs:
|
||||
| sed 's/[[:cntrl:]]//g; s/;*$//'
|
||||
}
|
||||
|
||||
# Guard the production ArgoCD Application source before waiting on a sync.
#
# Reads two fields through app_field and aborts the job when either one
# violates the contract:
#   - .spec.source.targetRevision must be exactly "main"
#   - .spec.source.kustomize.images must carry no image overrides
#
# Globals:   none read directly; calls app_field and record_rollout_risk
# Arguments: none
# Outputs:   the violation reason on stderr
# Returns:   0 when the contract holds; otherwise records a rollout risk
#            and terminates the calling shell with exit 1 (fail-fast)
validate_argocd_source_contract() {
  local target_revision
  local image_override
  local image_override_compact
  local image_override_snippet

  target_revision=$(app_field '{.spec.source.targetRevision}' source_target_revision)
  image_override=$(app_field '{.spec.source.kustomize.images}' source_kustomize_images)

  if [ "$target_revision" != "main" ]; then
    record_rollout_risk "argocd_source_target_revision_not_main targetRevision=$target_revision"
    echo "❌ ArgoCD source targetRevision must be main, got: $target_revision" >&2
    exit 1
  fi

  # A field that is present but holds an empty list is not an override.
  # NOTE(review): assumes app_field passes through the kubectl jsonpath
  # rendering, where an empty list prints as "[]" — confirm against app_field.
  # "null" is accepted as well in case the field is stored as an explicit null.
  image_override_compact=$(printf '%s' "$image_override" | tr -d '[:space:]')
  case "$image_override_compact" in
    '' | '[]' | 'null')
      return 0
      ;;
  esac

  # Keep the recorded evidence short; the full override list can be long.
  image_override_snippet=$(printf '%s' "$image_override" | head -c 180)
  record_rollout_risk "argocd_source_image_override_present images=${image_override_snippet}"
  echo "❌ ArgoCD source kustomize.images override must be empty; image truth belongs in k8s/awoooi-prod/kustomization.yaml" >&2
  exit 1
}
|
||||
|
||||
# 等待 ArgoCD Application 同步到目標 revision(最多 180s)。
|
||||
# 2026-05-24 Codex: top-level Application health can stay Degraded
|
||||
# without per-resource health detail. Treat that as rollout evidence,
|
||||
# then let kubectl rollout status and API health decide pass/fail.
|
||||
echo "⏳ 等待 ArgoCD sync..."
|
||||
validate_argocd_source_contract
|
||||
$KUBECTL annotate application awoooi-prod -n argocd \
|
||||
argocd.argoproj.io/refresh=hard --overwrite >/dev/null 2>&1 || true
|
||||
for i in $(seq 1 36); do
|
||||
|
||||
@@ -1,3 +1,53 @@
|
||||
## 2026-06-04|Production 首頁舊版回退止血與 ArgoCD Source Guard
|
||||
|
||||
**背景**:統帥回報 `https://awoooi.wooo.work/` 變成舊版本。即時檢查確認不是瀏覽器快取,而是 production GitOps source 被切離 `main`,且 ArgoCD Application 殘留舊 image override。
|
||||
|
||||
**根因**:
|
||||
- `argocd/Application awoooi-prod.spec.source.targetRevision` 被改成 `codex/w1-redline`。
|
||||
- `.spec.source.kustomize.images` 殘留覆寫到 `cbd28e29a08435deb8c66af51654d8fa65120a14`。
|
||||
- K8s 實際運行 image 一度變成:
|
||||
- `awoooi-api` / `awoooi-worker`:`192.168.0.110:5000/awoooi/api:cbd28e29a08435deb8c66af51654d8fa65120a14`
|
||||
- `awoooi-web`:`192.168.0.110:5000/awoooi/web:cbd28e29a08435deb8c66af51654d8fa65120a14`
|
||||
- 因此首頁看起來像整站回到舊版本;這是 GitOps source/override 錯位,不是 Next.js cache。
|
||||
|
||||
**即時修復**:
|
||||
- 將 `awoooi-prod.spec.source.targetRevision` 改回 `main`。
|
||||
- 移除 `awoooi-prod.spec.source.kustomize.images` override,讓 image truth 回到 `k8s/awoooi-prod/kustomization.yaml`。
|
||||
- 觸發 ArgoCD hard refresh,等待 rollout。
|
||||
- 清理人工檢查殘留的孤兒 Pod:`probe-ab21`、`probe-f1ef-server`、`check-1cc9`、`check-1cc9-manifest`、`check-f1ef`、`check-f1ef-manifest`、`sh`。
|
||||
- 清理舊 image 造成的 failed Job:`k3s-status-report-29675100`。
|
||||
|
||||
**修復後 live 狀態**:
|
||||
- `argocd/Application awoooi-prod`:`targetRevision=main`、`status.sync.revision=ab6d82743cba59e039943aaeeec4d8eaf8f99144`、`sync=Synced`。
|
||||
- K8s image 已恢復:
|
||||
- `awoooi-api`:`192.168.0.110:5000/awoooi/api:f1ef7ec3e295313af67d7acaf40d439585cb5270`,`2/2 ready`。
|
||||
- `awoooi-web`:`192.168.0.110:5000/awoooi/web:f1ef7ec3e295313af67d7acaf40d439585cb5270`,`2/2 ready`。
|
||||
- `awoooi-worker`:`192.168.0.110:5000/awoooi/api:f1ef7ec3e295313af67d7acaf40d439585cb5270`,`1/1 ready`。
|
||||
- `awoooi-auto-repair-canary`:`1/1 ready`。
|
||||
- Namespace 清理後 quota:`limits.cpu=3550m/8`、`pods=6/20`,rollout 不再被孤兒 Pod 擠壓。
|
||||
- `https://awoooi.wooo.work/api/v1/health`:`status=healthy`、`environment=prod`、`mock_mode=false`。
|
||||
|
||||
**首頁驗證**:
|
||||
- `curl -I https://awoooi.wooo.work/zh-TW`:`HTTP/2 200`,`cache-control: private, no-cache, no-store, max-age=0, must-revalidate`。
|
||||
- Playwright desktop 1440x1000:title `AWOOOI - 零干預維運,以人為本的決策`,可見 `AWOOOI OPERATIONS MAP`、`AI 自動化管理介面`、`AwoooP`、`OpenClaw / Hermes`、`MCP 證據`;`horizontalOverflow=0`、`canScrollVertical=true`、`hasOldPlaceholder=false`。
|
||||
- Playwright mobile 390x1000:同樣可見新版 Operations Map;`horizontalOverflow=0`、`canScrollVertical=true`、`hasOldPlaceholder=false`。
|
||||
- 截圖:`/tmp/awoooi-home-desktop-restore-main-fast.png`、`/tmp/awoooi-home-mobile-restore-main-fast.png`。
|
||||
|
||||
**新增守門**:
|
||||
- `.gitea/workflows/cd.yaml` 新增 `validate_argocd_source_contract`:
|
||||
- `awoooi-prod.spec.source.targetRevision` 必須等於 `main`。
|
||||
- `awoooi-prod.spec.source.kustomize.images` 必須為空。
|
||||
- 違反即 fail-fast,避免正式站再被支線或手動 image override 拉回舊版。
|
||||
|
||||
**殘留觀察**:
|
||||
- ArgoCD top-level health 仍顯示 `Degraded`,但 `.status.resources` 無任何 degraded child resource,controller log 顯示 sync 成功且沒有錯誤;目前以 rollout、Pod ready、public API health 與 Playwright 為實際可用性判斷。需另開一小段清理 Argo health stale/custom-health 行為,避免治理頁被橘燈誤導。
|
||||
|
||||
**進度更新**:
|
||||
- Production 首頁版本恢復:`100%`。
|
||||
- GitOps source guard:`0% -> 80%`;已在 CD 補 fail-fast,尚待下一次 CD 實跑驗證。
|
||||
- CI/CD release gates:維持 `100%`,新增 source contract gate。
|
||||
- 完整 AI 自動化飛輪:維持 `67%`;本輪是 production source 修復與可觀測性補強,未新增 auto-repair execution。
|
||||
|
||||
## 2026-06-03|AwoooP Work Items Owner Review Gate 與 Mobile Shell 可讀性
|
||||
|
||||
**背景**:統帥要求 AwoooP / AI 治理不能只在 Telegram 噴告警,前端必須看得出事件跑到哪個流程、誰要接手、AI 做了什麼、哪些步驟被 gate 擋住。本階段聚焦 `/zh-TW/awooop/work-items` 的 KM owner-review 接續處理與手機可讀性:把告警中的 `KM 需要更新` 往 Work Items 的單筆審核、乾跑預覽、Owner 確認、寫回保護與 stale ratio 回測串起來。
|
||||
|
||||
Reference in New Issue
Block a user