fix(ci): poll production deploy readback
All checks were successful
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 26s
CD Pipeline / build-and-deploy (push) Successful in 5m32s
CD Pipeline / post-deploy-checks (push) Successful in 1m3s

This commit is contained in:
Your Name
2026-06-30 09:15:00 +08:00
parent 90daa544f8
commit b4dc407ce0
3 changed files with 85 additions and 44 deletions

View File

@@ -1706,58 +1706,78 @@ jobs:
import json
import os
import sys
import time
import urllib.request
expected = os.environ["IMAGE_TAG"].strip().lower()
expected_short = expected[:10]
url = "https://awoooi.wooo.work/api/v1/agents/delivery-closure-workbench"
try:
with urllib.request.urlopen(url, timeout=20) as response:
payload = json.load(response)
except Exception as exc:
print(
"production_workbench_deploy_readback_failed="
f"{type(exc).__name__}",
file=sys.stderr,
)
raise SystemExit(1) from exc
summary = payload.get("summary") if isinstance(payload, dict) else {}
if not isinstance(summary, dict):
summary = {}
runtime_short = str(
summary.get("production_deploy_runtime_build_commit_short_sha") or ""
)
desired_short = str(
summary.get("production_deploy_desired_main_api_image_tag_short_sha")
or ""
)
desired_status = str(
summary.get(
"production_deploy_desired_main_api_image_tag_readback_status"
)
or ""
)
matches_main = summary.get("production_deploy_image_tag_matches_main") is True
if (
runtime_short != expected_short
or desired_short != expected_short
or desired_status != "ok"
or not matches_main
):
print(
"production_deploy_readback_mismatch="
f"expected={expected_short};runtime={runtime_short};"
f"desired={desired_short};desired_status={desired_status};"
f"matches_main={matches_main}",
file=sys.stderr,
)
raise SystemExit(1)
attempts = int(os.environ.get("DEPLOY_READBACK_ATTEMPTS", "18"))
sleep_seconds = int(os.environ.get("DEPLOY_READBACK_SLEEP_SECONDS", "10"))
last_error = ""
for attempt in range(1, attempts + 1):
try:
with urllib.request.urlopen(url, timeout=20) as response:
payload = json.load(response)
except Exception as exc:
last_error = f"fetch_failed={type(exc).__name__}"
print(
"production_deploy_readback_attempt="
f"{attempt}/{attempts};{last_error}",
file=sys.stderr,
)
else:
summary = payload.get("summary") if isinstance(payload, dict) else {}
if not isinstance(summary, dict):
summary = {}
runtime_short = str(
summary.get("production_deploy_runtime_build_commit_short_sha")
or ""
)
desired_short = str(
summary.get(
"production_deploy_desired_main_api_image_tag_short_sha"
)
or ""
)
desired_status = str(
summary.get(
"production_deploy_desired_main_api_image_tag_readback_status"
)
or ""
)
matches_main = (
summary.get("production_deploy_image_tag_matches_main") is True
)
if (
runtime_short == expected_short
and desired_short == expected_short
and desired_status == "ok"
and matches_main
):
print(
"✅ Production deploy readback matches Gitea main desired "
f"image tag ({expected_short}) on attempt {attempt}/{attempts}"
)
raise SystemExit(0)
last_error = (
f"expected={expected_short};runtime={runtime_short};"
f"desired={desired_short};desired_status={desired_status};"
f"matches_main={matches_main}"
)
print(
"production_deploy_readback_attempt="
f"{attempt}/{attempts};{last_error}",
file=sys.stderr,
)
if attempt < attempts:
time.sleep(sleep_seconds)
print(
"✅ Production deploy readback matches Gitea main desired image tag "
f"({expected_short})"
"production_deploy_readback_mismatch=" + last_error,
file=sys.stderr,
)
raise SystemExit(1)
PY
fi

View File

@@ -1,3 +1,20 @@
## 2026-06-30 — 09:24 production deploy readback bounded poll
**照主線處理的問題**
- Gitea CD `#4014`(`eb137bb4e fix(recovery): bound cold-start monitor probes`)轉 Failure;public job log 顯示 tests job 成功、build-and-deploy 已 build/push、deploy marker `90daa544f chore(cd): deploy eb137bb [skip ci]` 已 push、ArgoCD `Synced/Healthy`、三個 deployment rollout 成功、API health 200。
- 失敗點不是 runner label、runner token 或測試,而是 deploy 完成後立刻讀 production Workbench 仍看到上一版:`production_deploy_readback_mismatch=expected=eb137bb4e0;runtime=4295b3383a;desired=4295b3383a;desired_status=ok;matches_main=True`
- `.gitea/workflows/cd.yaml` 已把 production deploy readback 從單次 request 改成 bounded polling;預設 `DEPLOY_READBACK_ATTEMPTS=18`、`DEPLOY_READBACK_SLEEP_SECONDS=10`,每次輸出 `production_deploy_readback_attempt=`;若最後仍 mismatch 才 fail-closed。
- `ops/runner/test_cd_controlled_runtime_profile.py` 已鎖住 readback polling contract,避免未來又回到 rollout 後單次瞬讀 false failure。
**驗證**
- CD profile guard + bounded monitor tests:`29 passed`。
- Gitea runner pressure guard:`workflow_files=11`、`auto_branch_events_on_110=0`、`generic_runner_labels=0`。
- Gitea step env secret guard:`no Gitea run/with secrets or legacy Telegram routes`。
- `git diff --check`:通過。
- Production Workbench 在 rollout 後延遲收斂,已讀回 `production_deploy_runtime_build_commit_short_sha=eb137bb4e0`、`production_deploy_desired_main_api_image_tag_short_sha=eb137bb4e0`、`production_deploy_desired_main_api_image_tag_readback_status=ok`、`production_deploy_image_tag_matches_main=true`。
**邊界**:未 workflow_dispatch、未重啟主機、未 restart Docker / Nginx / K3s / DB / firewall、未手動改 K8s / DB、未讀 secret / token / raw sessions / SQLite / `.env`,未使用 GitHub / `gh` / GitHub API。
## 2026-06-30 — 09:07 P0-006 cold-start monitor bounded probe hardening
**照主線處理的問題**

View File

@@ -55,6 +55,10 @@ def test_cd_requires_production_deploy_readback_after_rollout() -> None:
assert "apps/api/tests/test_awoooi_production_deploy_readback_blocker.py)" in text
assert "tests/test_awoooi_production_deploy_readback_blocker.py" in text
assert "production_deploy_readback_mismatch=" in text
assert 'attempts = int(os.environ.get("DEPLOY_READBACK_ATTEMPTS", "18"))' in text
assert 'sleep_seconds = int(os.environ.get("DEPLOY_READBACK_SLEEP_SECONDS", "10"))' in text
assert "production_deploy_readback_attempt=" in text
assert "time.sleep(sleep_seconds)" in text
assert "production_deploy_runtime_build_commit_short_sha" in text
assert "production_deploy_desired_main_api_image_tag_short_sha" in text
assert "production_deploy_desired_main_api_image_tag_readback_status" in text