fix(ci): poll production deploy readback
All checks were successful
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 26s
CD Pipeline / build-and-deploy (push) Successful in 5m32s
CD Pipeline / post-deploy-checks (push) Successful in 1m3s
All checks were successful
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 26s
CD Pipeline / build-and-deploy (push) Successful in 5m32s
CD Pipeline / post-deploy-checks (push) Successful in 1m3s
This commit is contained in:
@@ -1706,58 +1706,78 @@ jobs:
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import urllib.request
|
||||
|
||||
expected = os.environ["IMAGE_TAG"].strip().lower()
|
||||
expected_short = expected[:10]
|
||||
url = "https://awoooi.wooo.work/api/v1/agents/delivery-closure-workbench"
|
||||
try:
|
||||
with urllib.request.urlopen(url, timeout=20) as response:
|
||||
payload = json.load(response)
|
||||
except Exception as exc:
|
||||
print(
|
||||
"production_workbench_deploy_readback_failed="
|
||||
f"{type(exc).__name__}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
raise SystemExit(1) from exc
|
||||
|
||||
summary = payload.get("summary") if isinstance(payload, dict) else {}
|
||||
if not isinstance(summary, dict):
|
||||
summary = {}
|
||||
runtime_short = str(
|
||||
summary.get("production_deploy_runtime_build_commit_short_sha") or ""
|
||||
)
|
||||
desired_short = str(
|
||||
summary.get("production_deploy_desired_main_api_image_tag_short_sha")
|
||||
or ""
|
||||
)
|
||||
desired_status = str(
|
||||
summary.get(
|
||||
"production_deploy_desired_main_api_image_tag_readback_status"
|
||||
)
|
||||
or ""
|
||||
)
|
||||
matches_main = summary.get("production_deploy_image_tag_matches_main") is True
|
||||
if (
|
||||
runtime_short != expected_short
|
||||
or desired_short != expected_short
|
||||
or desired_status != "ok"
|
||||
or not matches_main
|
||||
):
|
||||
print(
|
||||
"production_deploy_readback_mismatch="
|
||||
f"expected={expected_short};runtime={runtime_short};"
|
||||
f"desired={desired_short};desired_status={desired_status};"
|
||||
f"matches_main={matches_main}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
raise SystemExit(1)
|
||||
attempts = int(os.environ.get("DEPLOY_READBACK_ATTEMPTS", "18"))
|
||||
sleep_seconds = int(os.environ.get("DEPLOY_READBACK_SLEEP_SECONDS", "10"))
|
||||
last_error = ""
|
||||
for attempt in range(1, attempts + 1):
|
||||
try:
|
||||
with urllib.request.urlopen(url, timeout=20) as response:
|
||||
payload = json.load(response)
|
||||
except Exception as exc:
|
||||
last_error = f"fetch_failed={type(exc).__name__}"
|
||||
print(
|
||||
"production_deploy_readback_attempt="
|
||||
f"{attempt}/{attempts};{last_error}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
else:
|
||||
summary = payload.get("summary") if isinstance(payload, dict) else {}
|
||||
if not isinstance(summary, dict):
|
||||
summary = {}
|
||||
runtime_short = str(
|
||||
summary.get("production_deploy_runtime_build_commit_short_sha")
|
||||
or ""
|
||||
)
|
||||
desired_short = str(
|
||||
summary.get(
|
||||
"production_deploy_desired_main_api_image_tag_short_sha"
|
||||
)
|
||||
or ""
|
||||
)
|
||||
desired_status = str(
|
||||
summary.get(
|
||||
"production_deploy_desired_main_api_image_tag_readback_status"
|
||||
)
|
||||
or ""
|
||||
)
|
||||
matches_main = (
|
||||
summary.get("production_deploy_image_tag_matches_main") is True
|
||||
)
|
||||
if (
|
||||
runtime_short == expected_short
|
||||
and desired_short == expected_short
|
||||
and desired_status == "ok"
|
||||
and matches_main
|
||||
):
|
||||
print(
|
||||
"✅ Production deploy readback matches Gitea main desired "
|
||||
f"image tag ({expected_short}) on attempt {attempt}/{attempts}"
|
||||
)
|
||||
raise SystemExit(0)
|
||||
last_error = (
|
||||
f"expected={expected_short};runtime={runtime_short};"
|
||||
f"desired={desired_short};desired_status={desired_status};"
|
||||
f"matches_main={matches_main}"
|
||||
)
|
||||
print(
|
||||
"production_deploy_readback_attempt="
|
||||
f"{attempt}/{attempts};{last_error}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
if attempt < attempts:
|
||||
time.sleep(sleep_seconds)
|
||||
|
||||
print(
|
||||
"✅ Production deploy readback matches Gitea main desired image tag "
|
||||
f"({expected_short})"
|
||||
"production_deploy_readback_mismatch=" + last_error,
|
||||
file=sys.stderr,
|
||||
)
|
||||
raise SystemExit(1)
|
||||
PY
|
||||
fi
|
||||
|
||||
|
||||
@@ -1,3 +1,20 @@
|
||||
## 2026-06-30 — 09:24 production deploy readback bounded poll
|
||||
|
||||
**照主線處理的問題**:
|
||||
- Gitea CD `#4014` 對 `eb137bb4e fix(recovery): bound cold-start monitor probes` 轉 Failure;public job log 顯示 tests job 成功、build-and-deploy 已 build/push、deploy marker `90daa544f chore(cd): deploy eb137bb [skip ci]` 已 push、ArgoCD `Synced/Healthy`、三個 deployment rollout 成功、API health 200。
|
||||
- 失敗點不是 runner label、runner token 或測試,而是 deploy 完成後立刻讀取 production Workbench 時仍看到上一版:`production_deploy_readback_mismatch=expected=eb137bb4e0;runtime=4295b3383a;desired=4295b3383a;desired_status=ok;matches_main=True`。
|
||||
- `.gitea/workflows/cd.yaml` 已把 production deploy readback 從單次 request 改成 bounded polling:預設 `DEPLOY_READBACK_ATTEMPTS=18`、`DEPLOY_READBACK_SLEEP_SECONDS=10`(最多 17 次間隔、約 170 秒的等待,另加每次 request 最長 20 秒的 timeout),每次未通過的嘗試都會輸出 `production_deploy_readback_attempt=`;只有最後一次嘗試仍 mismatch 時才 fail-closed。
|
||||
- `ops/runner/test_cd_controlled_runtime_profile.py` 已鎖住 readback polling contract,避免未來又回到 rollout 後單次瞬讀 false failure。
|
||||
|
||||
**驗證**:
|
||||
- CD profile guard + bounded monitor tests:`29 passed`。
|
||||
- Gitea runner pressure guard:`workflow_files=11`、`auto_branch_events_on_110=0`、`generic_runner_labels=0`。
|
||||
- Gitea step env secret guard:`no Gitea run/with secrets or legacy Telegram routes`。
|
||||
- `git diff --check`:通過。
|
||||
- Production Workbench 在 rollout 後經延遲收斂,已讀回 `production_deploy_runtime_build_commit_short_sha=eb137bb4e0`、`production_deploy_desired_main_api_image_tag_short_sha=eb137bb4e0`、`production_deploy_desired_main_api_image_tag_readback_status=ok`、`production_deploy_image_tag_matches_main=true`。
|
||||
|
||||
**邊界**:未 workflow_dispatch,未重啟主機,未 restart Docker / Nginx / K3s / DB / firewall,未手動改 K8s / DB,未讀 secret / token / raw sessions / SQLite / `.env`,未使用 GitHub / `gh` / GitHub API。
|
||||
|
||||
## 2026-06-30 — 09:07 P0-006 cold-start monitor bounded probe hardening
|
||||
|
||||
**照主線處理的問題**:
|
||||
|
||||
@@ -55,6 +55,10 @@ def test_cd_requires_production_deploy_readback_after_rollout() -> None:
|
||||
assert "apps/api/tests/test_awoooi_production_deploy_readback_blocker.py)" in text
|
||||
assert "tests/test_awoooi_production_deploy_readback_blocker.py" in text
|
||||
assert "production_deploy_readback_mismatch=" in text
|
||||
assert 'attempts = int(os.environ.get("DEPLOY_READBACK_ATTEMPTS", "18"))' in text
|
||||
assert 'sleep_seconds = int(os.environ.get("DEPLOY_READBACK_SLEEP_SECONDS", "10"))' in text
|
||||
assert "production_deploy_readback_attempt=" in text
|
||||
assert "time.sleep(sleep_seconds)" in text
|
||||
assert "production_deploy_runtime_build_commit_short_sha" in text
|
||||
assert "production_deploy_desired_main_api_image_tag_short_sha" in text
|
||||
assert "production_deploy_desired_main_api_image_tag_readback_status" in text
|
||||
|
||||
Reference in New Issue
Block a user