From 83e27fa2b2545591da5c019b29d3890cfd412c54 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 31 May 2026 12:43:19 +0800 Subject: [PATCH] fix(cd): harden source link post-deploy gate --- .gitea/workflows/cd.yaml | 40 +++++++++++++++--- .../test_cd_post_deploy_source_link_gate.py | 37 +++++++++++++++++ docs/LOGBOOK.md | 41 +++++++++++++++++++ ...-04-15-MASTER-ai-autonomous-flywheel-v2.md | 7 ++++ 4 files changed, 120 insertions(+), 5 deletions(-) create mode 100644 apps/api/tests/test_cd_post_deploy_source_link_gate.py diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml index 13f5611d..0d6b2f8e 100644 --- a/.gitea/workflows/cd.yaml +++ b/.gitea/workflows/cd.yaml @@ -1250,6 +1250,21 @@ jobs: EVENT_EXPORTER_ERROR="${OBSERVABILITY_PREFLIGHT_ERROR}" fi + SOURCE_LINK_RUN_REF="gitea-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}" + SOURCE_LINK_CANARY_WORK_ITEM_ID="source-evidence:sentry:upstream_canary:awoooi-source-link-canary-${SOURCE_LINK_RUN_REF}" + SOURCE_LINK_CANARY_EVENT_ID="sentry:source_correlation_linked:awoooi-source-link-canary-${SOURCE_LINK_RUN_REF}" + echo "source_link_canary_work_item_id=${SOURCE_LINK_CANARY_WORK_ITEM_ID}" >> "$GITHUB_OUTPUT" + echo "source_link_canary_event_id=${SOURCE_LINK_CANARY_EVENT_ID}" >> "$GITHUB_OUTPUT" + + AWOOOP_OPERATOR_API_KEY="$( + ssh $SSH_OPTS "wooo@${K8S_SSH_HOST}" \ + "sudo kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml --server=${K8S_API_SERVER} get secret awoooi-secrets -n awoooi-prod -o jsonpath='{.data.AWOOOP_OPERATOR_API_KEY}' | base64 -d" + )" + if [ -z "${AWOOOP_OPERATOR_API_KEY}" ]; then + echo "❌ AWOOOP_OPERATOR_API_KEY missing from production secret; source-link canary cannot run" + exit 1 + fi + export AWOOOP_OPERATOR_API_KEY # 2026-05-05 Codex: use the keepalived VIP instead of a fixed node. # Host runner launches the CI image explicitly to avoid act RWLayer=nil. 
if docker run --rm \ @@ -1263,11 +1278,15 @@ jobs: -e AWOOOI_OTEL_COLLECTOR_ERROR="${OTEL_COLLECTOR_ERROR}" \ -e AWOOOI_EVENT_EXPORTER_STATUSES="${EVENT_EXPORTER_STATUSES}" \ -e AWOOOI_EVENT_EXPORTER_ERROR="${EVENT_EXPORTER_ERROR}" \ + -e AWOOOP_OPERATOR_API_KEY \ + -e AWOOOP_OPERATOR_ID="gitea-cd-post-deploy" \ + -e SOURCE_LINK_RUN_REF="${SOURCE_LINK_RUN_REF}" \ "${{ env.CI_IMAGE }}" \ - bash -lc 'source /opt/api-venv/bin/activate && python3 scripts/alert_chain_smoke_test.py --api-url ${{ env.ALERT_CHAIN_API_URL }} --json | tee /tmp/alert_chain_result.json'; then + bash -lc 'set -o pipefail; source /opt/api-venv/bin/activate && python3 scripts/alert_chain_smoke_test.py --api-url ${{ env.ALERT_CHAIN_API_URL }} --source-link-canary-target-incident-id INC-20260505-25E744 --run-ref "${SOURCE_LINK_RUN_REF}" --json | tee /tmp/alert_chain_result.json'; then echo "alert_chain_status=pass" >> $GITHUB_OUTPUT else echo "alert_chain_status=fail" >> $GITHUB_OUTPUT + exit 1 fi # Phase O-5 Wave C.2 2026-04-02 ogt: 監控覆蓋率驗證 (generate_monitoring.py --check) @@ -1287,11 +1306,14 @@ jobs: echo "coverage_status=pass" >> $GITHUB_OUTPUT else echo "coverage_status=fail" >> $GITHUB_OUTPUT + exit 1 fi - name: AwoooP Source Correlation Applied-Link Smoke id: source_correlation_apply_smoke run: | + export SOURCE_LINK_CANARY_WORK_ITEM_ID="${{ steps.alert_chain_smoke.outputs.source_link_canary_work_item_id }}" + export SOURCE_LINK_CANARY_EVENT_ID="${{ steps.alert_chain_smoke.outputs.source_link_canary_event_id }}" if docker run --rm \ --name "awoooi-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}-source-link-smoke" \ --cpus "0.5" \ @@ -1299,13 +1321,21 @@ jobs: -v "$PWD:/workspace" \ -v awoooi-api-venv-cache:/opt/api-venv \ -w /workspace \ + -e SOURCE_LINK_CANARY_WORK_ITEM_ID \ + -e SOURCE_LINK_CANARY_EVENT_ID \ "${{ env.CI_IMAGE }}" \ - bash -lc 'source /opt/api-venv/bin/activate && python3 scripts/awooop_source_correlation_apply_smoke.py \ + bash -lc 'set -o pipefail; source 
/opt/api-venv/bin/activate && python3 scripts/awooop_source_correlation_apply_smoke.py \ --api-url ${{ env.ALERT_CHAIN_API_URL }} \ --target-incident-id INC-20260505-25E744 \ - --work-item-id source-evidence:sentry:received:codex-sentry-20260513-t15b-v3 \ - --expected-source-event-provider-event-id sentry:source_correlation_linked:codex-sentry-20260513-t15b-v3 \ - --allow-existing-apply | tee /tmp/source_correlation_apply_smoke.json'; then + --work-item-id "${SOURCE_LINK_CANARY_WORK_ITEM_ID}" \ + --expected-source-event-provider-event-id "${SOURCE_LINK_CANARY_EVENT_ID}" \ + --allow-existing-apply \ + --refresh-if-stale-days 6 \ + --refresh-work-item-id "${SOURCE_LINK_CANARY_WORK_ITEM_ID}" \ + --verify-refresh-candidate \ + --reviewer-id gitea_cd_source_link_canary \ + --operator-note "CD dedicated source-link canary; append-only status-chain proof" \ + | tee /tmp/source_correlation_apply_smoke.json'; then echo "source_correlation_apply_status=pass" >> $GITHUB_OUTPUT else echo "source_correlation_apply_status=fail" >> $GITHUB_OUTPUT diff --git a/apps/api/tests/test_cd_post_deploy_source_link_gate.py b/apps/api/tests/test_cd_post_deploy_source_link_gate.py new file mode 100644 index 00000000..e7caf96a --- /dev/null +++ b/apps/api/tests/test_cd_post_deploy_source_link_gate.py @@ -0,0 +1,37 @@ +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[3] +CD_WORKFLOW = ROOT / ".gitea" / "workflows" / "cd.yaml" + + +def _workflow_text() -> str: + return CD_WORKFLOW.read_text(encoding="utf-8") + + +def test_post_deploy_pipefail_preserves_smoke_exit_codes() -> None: + text = _workflow_text() + + assert "set -o pipefail; source /opt/api-venv/bin/activate && python3 scripts/alert_chain_smoke_test.py" in text + assert "set -o pipefail; source /opt/api-venv/bin/activate && python3 scripts/awooop_source_correlation_apply_smoke.py" in text + + +def test_post_deploy_critical_gates_exit_nonzero() -> None: + text = _workflow_text() + + alert_chain_failure = 
text.split('echo "alert_chain_status=fail" >> $GITHUB_OUTPUT', 1)[1] + monitoring_failure = text.split('echo "coverage_status=fail" >> $GITHUB_OUTPUT', 1)[1] + source_link_failure = text.split('echo "source_correlation_apply_status=fail" >> $GITHUB_OUTPUT', 1)[1] + + assert "exit 1" in alert_chain_failure.split("fi", 1)[0] + assert "exit 1" in monitoring_failure.split("fi", 1)[0] + assert "exit 1" in source_link_failure.split("fi", 1)[0] + + +def test_cd_source_link_gate_uses_current_deploy_canary() -> None: + text = _workflow_text() + + assert "SOURCE_LINK_RUN_REF=\"gitea-cd-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-1}\"" in text + assert "--source-link-canary-target-incident-id INC-20260505-25E744" in text + assert "--work-item-id \"${SOURCE_LINK_CANARY_WORK_ITEM_ID}\"" in text + assert "--expected-source-event-provider-event-id \"${SOURCE_LINK_CANARY_EVENT_ID}\"" in text + assert "codex-sentry-20260513-t15b-v3" not in text diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 8afacf2c..eb459a31 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,44 @@ +## 2026-05-31|CD source-link gate 過期與 pipefail 修復 + +**背景**: + +- 2026-05-29 post-deploy log 已輸出 `status-chain did not expose applied source link`,但 workflow 仍繼續跑到成功通知。 +- 根因有兩層:`python ... 
| tee ...` 未設 `pipefail`,Python exit 1 被 `tee` 蓋成 0;同時 CD source-link gate 綁死 T120 的固定 Sentry event,但 Status Chain source correlation lookback 只有 7 天,固定事件自然會過期。 + +**本次調整**: + +- `.gitea/workflows/cd.yaml` 的 Alert Chain / Source Link smoke pipeline 加上 `set -o pipefail`。 +- Alert Chain / Monitoring Coverage / Source Link 三個 critical gate 失敗時都會 `exit 1`。 +- CD post-deploy 改為每次建立 dedicated `AwoooPSourceLinkCanary`: + - `run_ref=gitea-cd-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}` + - 對應 work item:`source-evidence:sentry:upstream_canary:awoooi-source-link-canary-${run_ref}` + - 對應 expected event:`sentry:source_correlation_linked:awoooi-source-link-canary-${run_ref}` +- Source Correlation smoke 改驗證同一次部署產生的 canary,不再依賴 `codex-sentry-20260513-t15b-v3` 固定老事件。 +- 新增 `apps/api/tests/test_cd_post_deploy_source_link_gate.py`,鎖住 pipefail、critical gate `exit 1`、current deploy canary 三個防退化條件。 + +**Verification**: + +```text +ruby -e 'require "yaml"; YAML.load_file(".gitea/workflows/cd.yaml"); puts "yaml ok"' + -> yaml ok +node scripts/ci/check-gitea-step-env-secrets.js + -> no Gitea step env/with secrets +python3 -m py_compile scripts/alert_chain_smoke_test.py scripts/awooop_source_correlation_apply_smoke.py apps/api/tests/test_cd_post_deploy_source_link_gate.py + -> pass +/Users/ogt/.pyenv/shims/ruff check apps/api/tests/test_cd_post_deploy_source_link_gate.py + -> pass +/Users/ogt/.pyenv/shims/pytest apps/api/tests/test_cd_post_deploy_source_link_gate.py apps/api/tests/test_alert_chain_smoke_metric.py -q + -> 16 passed +git diff --check + -> pass +production same-shape smoke: + -> Alert Chain 9/9 checks passed + -> Source Correlation status=passed + -> verification_status=applied_link_verified + -> expected_source_event_provider_event_id=sentry:source_correlation_linked:awoooi-source-link-canary-gitea-cd-codex-20260531-manual + -> writes_incident_state=false, writes_auto_repair_result=false, writes_ticket=false +``` + ## 2026-05-31|IwoooS 部署證據去固定化 **背景**: diff --git 
a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md index d987c9c5..51ea436f 100644 --- a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md +++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md @@ -2755,6 +2755,13 @@ Phase 6 完成後 - 判讀:T140 不修改 live runner 設定,只把 runner pool ownership 變成可稽核證據。下一段 T141 應讀 `awoooi` / `ewoooc` / `stockplatform-v2` workflow labels,再設計 repo label isolation 或獨立 runner registration;不可在沒有替代 runner 前直接移除 `ewoooc-host`。 - 目前進度更新:AwoooP 告警可觀測鏈約 99.998%;Incident-level source correlation 可見性約 98.8%;Source correlation apply 狀態鏈可驗證性約 99.72%;Source correlation freshness / rolling gate 約 98.2%;前端 AI 自動化管理介面同步約 99.999%;Dashboard snapshot / SSE console noise 收斂約 99.2%;CI/CD runner hygiene 約 99.4%;Runner ownership 收斂約 96%;Runner pool inventory 約 70%;API image build layer hygiene 約 88%;Deploy rollout-risk 可觀測性約 91%;CI/CD evidence 前端可見性約 92%;Pipeline stage 可觀測性約 88%;Build host pressure治理約 86%;完整 AI 自動化管理產品化約 99.965%。 +**T140 CD source-link gate freshness + hard-fail repair(2026-05-31 台北)**: +- 觸發:2026-05-29 CD post-deploy log 已輸出 `status-chain did not expose applied source link`,但 workflow 因 `python | tee` 未設 `pipefail`,且 alert-chain / monitoring coverage 失敗分支只寫 output 不 `exit 1`,導致 gate 明面成功、實際失效。舊 CD gate 仍綁定 T120 固定事件 `codex-sentry-20260513-t15b-v3`;Status Chain source correlation lookback 為 7 天,固定事件自然過期後會反覆失敗。 +- 修正:`.gitea/workflows/cd.yaml` post-deploy 改為每次部署建立 dedicated `AwoooPSourceLinkCanary`(`run_ref=gitea-cd-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}`),輸出對應 `source_link_canary_work_item_id` / `source_link_canary_event_id`,下一個 source-correlation smoke 只驗證同一次部署產生的 canary。Alert Chain 與 Source Link smoke 的 `tee` pipeline 加 `set -o pipefail`;Alert Chain / Monitoring / Source Link 三個 critical gate 失敗都會 `exit 1`。 +- 安全邊界:CD post-deploy 不把 Gitea secret 放入 step `env/with`;operator key 從 production K8s Secret 讀出,只以 host env 
傳進短生命週期 smoke container。Source-link apply 仍是 append-only,驗證 `writes_incident_state=false`、`writes_auto_repair_result=false`、`writes_ticket=false`。 +- Verification:workflow YAML parse ok;`node scripts/ci/check-gitea-step-env-secrets.js` pass;`py_compile` pass;`ruff check apps/api/tests/test_cd_post_deploy_source_link_gate.py` pass;`pytest apps/api/tests/test_cd_post_deploy_source_link_gate.py apps/api/tests/test_alert_chain_smoke_metric.py -q` -> `16 passed`;`git diff --check` pass。Production same-shape smoke(public API)回 Alert Chain `9/9 checks passed`,Source Correlation 回 `status=passed`、`verification_status=applied_link_verified`、`expected_source_event_provider_event_id=sentry:source_correlation_linked:awoooi-source-link-canary-gitea-cd-codex-20260531-manual`、`writes_* = false`。 +- 判讀:T140 不新增自動修復策略,也不擴大告警音量;它把原本會被 `tee` 吃掉的 post-deploy 紅燈變回真正 hard gate,並移除固定老 source event 的 7 天過期問題。 + **T139 CI/CD stage transition evidence(2026-05-21 台北)**: - 觸發:T138 已把 CI/CD evidence 顯示到 AwoooP Deployments,但實測 CD #2833 發現 `post-deploy-checks` 會被同一台 110 shared runner 的其他 repo job 卡住。只靠 tests running / post-deploy success,operator 仍看不出 pipeline 是卡在 build、rollout、post-deploy queue,還是 post-deploy gate 本身。 - 修正:`.gitea/workflows/cd.yaml` 在 `build-and-deploy` 開始時新增 `CI_build_and_deploy_running`,在 image build/push + ArgoCD rollout + API health 成功後新增 `CI_build_and_deploy_success`,在 `post-deploy-checks` 開始時新增 `CI_post_deploy_checks_running`;這三個通知都只走 AWOOI API/AwoooP,失敗時只在 CI log warning,不 fallback Telegram 洗版。`apps/web/src/components/panels/DeploymentsPanel.tsx` 與 `apps/web/messages/{zh-TW,en}.json` 補 `build-and-deploy` / `post-deploy-checks` stage label。