From 3c9f6489635bf4e8c0682571b48ff43f70bc2888 Mon Sep 17 00:00:00 2001 From: ogt Date: Wed, 1 Jul 2026 21:16:19 +0800 Subject: [PATCH] fix(api): ignore stale cd failure after production readback --- .../awoooi_priority_work_order_readback.py | 162 ++++++++++++++---- ...awoooi_priority_work_order_readback_api.py | 75 ++++++++ docs/LOGBOOK.md | 21 +++ 3 files changed, 225 insertions(+), 33 deletions(-) diff --git a/apps/api/src/services/awoooi_priority_work_order_readback.py b/apps/api/src/services/awoooi_priority_work_order_readback.py index 48263c25..20ac417e 100644 --- a/apps/api/src/services/awoooi_priority_work_order_readback.py +++ b/apps/api/src/services/awoooi_priority_work_order_readback.py @@ -618,6 +618,44 @@ def apply_ai_loop_current_blocker_execution_queue( ) state = _dict(payload.setdefault("mainline_execution_state", {})) + current_head = _dict(payload.get("current_head")) + production_readback_verified = bool( + state.get("current_main_cd_run_status") == "production_readback_verified" + and current_head.get("production_source_truth_available") is True + and _is_sha(str(state.get("latest_successful_deployed_source_sha") or "")) + ) + deploy_marker_resolved_by_production_readback = bool( + production_readback_verified + and (deploy_marker_readback_required or cd_failed_after_registry_ready) + ) + active_deployment_closure_state = ( + "production_readback_verified" + if deploy_marker_resolved_by_production_readback + else deployment_closure_state + ) + active_deploy_marker_readback_required = bool( + deploy_marker_readback_required + and not deploy_marker_resolved_by_production_readback + ) + active_cd_failed_after_registry_ready = bool( + cd_failed_after_registry_ready + and not deploy_marker_resolved_by_production_readback + ) + active_current_cd_run_id = ( + str(state.get("current_main_cd_run_id") or "") + if deploy_marker_resolved_by_production_readback + else current_cd_run_id + ) + active_current_cd_run_status = ( + "production_readback_verified" + if deploy_marker_resolved_by_production_readback + else current_cd_run_status + ) + active_current_cd_commit_sha = ( + str(state.get("latest_successful_deployed_source_sha") or "") + if deploy_marker_resolved_by_production_readback + else current_cd_commit_sha + ) state["active_p0_state"] = "blocked_ai_loop_current_blocker_execution_queue" state["next_executable_mainline_workplan_id"] = ( "P0-006-AI-LOOP-CURRENT-BLOCKER-EXECUTION-QUEUE" @@ -632,15 +670,38 @@ def apply_ai_loop_current_blocker_execution_queue( registry_v2_status_classifier ) state["ai_loop_current_blocker_deployment_closure_state"] = ( - deployment_closure_state + active_deployment_closure_state ) state["ai_loop_current_blocker_deploy_marker_readback_required"] = ( + active_deploy_marker_readback_required + ) + state["ai_loop_current_blocker_current_cd_run_id"] = active_current_cd_run_id + state["ai_loop_current_blocker_current_cd_run_status"] = ( + active_current_cd_run_status + ) + state["ai_loop_current_blocker_current_cd_commit_sha"] = ( + active_current_cd_commit_sha + ) + state["ai_loop_current_blocker_cd_failed_after_registry_ready"] = ( + active_cd_failed_after_registry_ready + ) + state["ai_loop_current_blocker_deploy_marker_resolved_by_production_readback"] = ( + deploy_marker_resolved_by_production_readback + ) + state["ai_loop_current_blocker_historical_deployment_closure_state"] = ( + deployment_closure_state + ) + state["ai_loop_current_blocker_historical_deploy_marker_readback_required"] = ( deploy_marker_readback_required ) - state["ai_loop_current_blocker_current_cd_run_id"] = current_cd_run_id - state["ai_loop_current_blocker_current_cd_run_status"] = current_cd_run_status - state["ai_loop_current_blocker_current_cd_commit_sha"] = current_cd_commit_sha - state["ai_loop_current_blocker_cd_failed_after_registry_ready"] = ( + state["ai_loop_current_blocker_historical_current_cd_run_id"] = current_cd_run_id + state["ai_loop_current_blocker_historical_current_cd_run_status"] = ( + current_cd_run_status + ) + state["ai_loop_current_blocker_historical_current_cd_commit_sha"] = ( + current_cd_commit_sha + ) + state["ai_loop_current_blocker_historical_cd_failed_after_registry_ready"] = ( cd_failed_after_registry_ready ) state["ai_loop_current_blocker_harbor_110_repair_run_id"] = ( @@ -717,12 +778,12 @@ def apply_ai_loop_current_blocker_execution_queue( + ([pressure_blocker] if pressure_blocker else []) + ( ["deploy_marker_readback_required_after_registry_ready"] - if deploy_marker_readback_required + if active_deploy_marker_readback_required else [] ) + ( ["current_cd_failure_after_registry_ready"] - if cd_failed_after_registry_ready + if active_cd_failed_after_registry_ready else [] ) ) @@ -749,19 +810,44 @@ def apply_ai_loop_current_blocker_execution_queue( registry_v2_status_classifier ) evidence["ai_loop_current_blocker_deployment_closure_state"] = ( - deployment_closure_state + active_deployment_closure_state ) evidence["ai_loop_current_blocker_deploy_marker_readback_required"] = ( - deploy_marker_readback_required + active_deploy_marker_readback_required + ) + evidence["ai_loop_current_blocker_current_cd_run_id"] = ( + active_current_cd_run_id ) - evidence["ai_loop_current_blocker_current_cd_run_id"] = current_cd_run_id evidence["ai_loop_current_blocker_current_cd_run_status"] = ( - current_cd_run_status + active_current_cd_run_status ) evidence["ai_loop_current_blocker_current_cd_commit_sha"] = ( - current_cd_commit_sha + active_current_cd_commit_sha ) evidence["ai_loop_current_blocker_cd_failed_after_registry_ready"] = ( + active_cd_failed_after_registry_ready + ) + evidence[ + "ai_loop_current_blocker_deploy_marker_resolved_by_production_readback" + ] = deploy_marker_resolved_by_production_readback + evidence["ai_loop_current_blocker_historical_deployment_closure_state"] = ( + deployment_closure_state + ) + evidence[ + "ai_loop_current_blocker_historical_deploy_marker_readback_required" + ] = deploy_marker_readback_required + evidence["ai_loop_current_blocker_historical_current_cd_run_id"] = ( + current_cd_run_id + ) + evidence["ai_loop_current_blocker_historical_current_cd_run_status"] = ( + current_cd_run_status + ) + evidence["ai_loop_current_blocker_historical_current_cd_commit_sha"] = ( + current_cd_commit_sha + ) + evidence[ + "ai_loop_current_blocker_historical_cd_failed_after_registry_ready" + ] = ( cd_failed_after_registry_ready ) evidence["ai_loop_current_blocker_harbor_110_repair_run_id"] = ( @@ -909,20 +995,21 @@ def apply_ai_loop_current_blocker_execution_queue( "SSH/session control-path readback, ordered local-console phases, " "post-recovery queue readbacks, and metadata-only KM/RAG/MCP/" "PlayBook writeback." - ), - ( - "P0-006-CD-DEPLOY-MARKER-READBACK: close the latest visible CD " - f"{current_cd_run_id or 'unknown'} status " - f"{current_cd_run_status or 'unknown'} and verify deploy marker / " - "production image / priority API before claiming runtime closure." - ), - ( - "P0-006-HARBOR-REGISTRY-CONTROLLED-RECOVERY-PREFLIGHT: only " - "rerun Harbor watchdog repair if registry /v2/ regresses below " - "200/401; otherwise keep focus on 110 SSH control-path and " - "deploy-marker closure." - ), + ) ] + if not deploy_marker_resolved_by_production_readback: + payload["next_execution_order"].append( + "P0-006-CD-DEPLOY-MARKER-READBACK: close the latest visible CD " + f"{active_current_cd_run_id or 'unknown'} status " + f"{active_current_cd_run_status or 'unknown'} and verify deploy marker / " + "production image / priority API before claiming runtime closure." + ) + payload["next_execution_order"].append( + "P0-006-HARBOR-REGISTRY-CONTROLLED-RECOVERY-PREFLIGHT: only " + "rerun Harbor watchdog repair if registry /v2/ regresses below " + "200/401; otherwise keep focus on 110 SSH control-path and " + "deploy-marker closure." + ) else: payload["next_execution_order"] = [ ( @@ -938,14 +1025,15 @@ def apply_ai_loop_current_blocker_execution_queue( "P0-006-HARBOR-REGISTRY-CONTROLLED-RECOVERY-PREFLIGHT: after the " "AI Loop queue item verifies 110 control path, rerun Harbor " "watchdog check-mode / repair-once and registry /v2/ readback." - ), - ( + ) + ] + if not deploy_marker_resolved_by_production_readback: + payload["next_execution_order"].append( "P0-006-CD-DEPLOY-MARKER-READBACK: after registry /v2/ is " "200/401, let the next Gitea CD run build/push/deploy and then " "verify delivery-closure-workbench, priority work order, and " "production marker advance." - ), - ] + ) _refresh_rollups_after_stockplatform_overlay(payload, state) summary = _dict(payload.setdefault("summary", {})) summary["ai_loop_current_blocker_execution_queue_count"] = len(queue) @@ -955,14 +1043,22 @@ def apply_ai_loop_current_blocker_execution_queue( registry_v2_status_classifier ) summary["ai_loop_current_blocker_deployment_closure_state"] = ( - deployment_closure_state + active_deployment_closure_state ) summary["ai_loop_current_blocker_deploy_marker_readback_required"] = ( - deploy_marker_readback_required + active_deploy_marker_readback_required + ) + summary["ai_loop_current_blocker_current_cd_run_status"] = ( + active_current_cd_run_status ) - summary["ai_loop_current_blocker_current_cd_run_status"] = current_cd_run_status summary["ai_loop_current_blocker_cd_failed_after_registry_ready"] = ( - cd_failed_after_registry_ready + active_cd_failed_after_registry_ready + ) + summary["ai_loop_current_blocker_deploy_marker_resolved_by_production_readback"] = ( + deploy_marker_resolved_by_production_readback + ) + summary["ai_loop_current_blocker_historical_current_cd_run_status"] = ( + current_cd_run_status ) summary["ai_loop_current_blocker_harbor_110_repair_run_status"] = ( harbor_110_repair_run_status diff --git a/apps/api/tests/test_awoooi_priority_work_order_readback_api.py b/apps/api/tests/test_awoooi_priority_work_order_readback_api.py index d3638c06..5dd0765e 100644 --- a/apps/api/tests/test_awoooi_priority_work_order_readback_api.py +++ b/apps/api/tests/test_awoooi_priority_work_order_readback_api.py @@ -799,6 +799,81 @@ def test_awoooi_priority_work_order_readback_normalizes_runtime_source_truth( assert "f426522" not in current_truth +def test_awoooi_priority_work_order_readback_does_not_reopen_stale_cd_failure_after_production_readback( + monkeypatch: pytest.MonkeyPatch, +): + runtime_sha = "b4dc407ce05c68a3908b993437a61b869d83810f" + runtime_short_sha = runtime_sha[:10] + monkeypatch.setenv("AWOOOI_BUILD_COMMIT_SHA", runtime_sha) + monkeypatch.setenv("AWOOOI_DESIRED_API_IMAGE_TAG", runtime_sha) + + payload = load_latest_awoooi_priority_work_order_readback() + apply_harbor_registry_controlled_recovery_preflight( + payload, + _harbor_registry_ready(), + ) + executor = json.loads( + json.dumps(load_latest_ai_agent_log_controlled_writeback_executor_readback()) + ) + executor["agent_consumption_context"]["current_blocker_execution_queue"][0].update( + { + "deployment_closure_state": ( + "blocked_latest_visible_cd_failure_after_registry_ready" + ), + "deploy_marker_readback_required": True, + "current_cd_run_id": "4258", + "current_cd_run_status": "Failure", + "current_cd_commit_sha": "06819ea96c058e7987811e853242390eaced7f91", + "cd_failed_after_registry_ready": True, + } + ) + apply_ai_loop_current_blocker_execution_queue( + payload, + executor, + ) + + state = payload["mainline_execution_state"] + evidence = payload["in_progress_or_blocked_in_priority_order"][0]["evidence"] + blockers = state["active_p0_live_active_blockers"] + assert state["current_main_cd_run_status"] == "production_readback_verified" + assert state["ai_loop_current_blocker_deploy_marker_readback_required"] is False + assert state["ai_loop_current_blocker_cd_failed_after_registry_ready"] is False + assert ( + state["ai_loop_current_blocker_deploy_marker_resolved_by_production_readback"] + is True + ) + assert state["ai_loop_current_blocker_current_cd_run_status"] == ( + "production_readback_verified" + ) + assert state["ai_loop_current_blocker_current_cd_run_id"] == ( + f"production_readback:{runtime_short_sha}" + ) + assert state["ai_loop_current_blocker_historical_current_cd_run_id"] == "4258" + assert state["ai_loop_current_blocker_historical_current_cd_run_status"] == ( + "Failure" + ) + assert "deploy_marker_readback_required_after_registry_ready" not in blockers + assert "current_cd_failure_after_registry_ready" not in blockers + assert evidence["ai_loop_current_blocker_deploy_marker_readback_required"] is False + assert evidence["ai_loop_current_blocker_cd_failed_after_registry_ready"] is False + assert evidence[ + "ai_loop_current_blocker_historical_deploy_marker_readback_required" + ] is True + assert evidence["ai_loop_current_blocker_historical_current_cd_run_status"] == ( + "Failure" + ) + assert payload["summary"][ + "ai_loop_current_blocker_deploy_marker_resolved_by_production_readback" + ] is True + assert payload["summary"]["ai_loop_current_blocker_current_cd_run_status"] == ( + "production_readback_verified" + ) + assert all( + "P0-006-CD-DEPLOY-MARKER-READBACK" not in item + for item in payload["next_execution_order"] + ) + + def test_awoooi_priority_work_order_readback_rejects_reordered_active_p0(tmp_path): operations_dir = tmp_path / "docs" / "operations" operations_dir.mkdir(parents=True) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 781aef77..ab03f2a3 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -25,6 +25,27 @@ **下一步**:照 P0 繼續收斂剩餘 WARN:先處理 MOMO source freshness / current-month confirmation 與 110 `backup-all` failed components;另開受控設計把高頻全量 pg_dump 改成專用 backup role + 分層備份,避免下一次重啟或 cron 再製造 DB/IO 壓力。 +## 2026-07-01 — 21:20 P0-006 production readback 不再重開舊 CD failure + +**照主線修正的問題**: +- Gitea CD `#4293` 對 `87512d32f5` 已 Success,production priority readback 也讀到 `latest_successful_deployed_source_sha=87512d32f5a73849b2d8090092677a745c85f9a8` 與 `production_readback_verified`;但 AI Loop current blocker queue 仍把舊 `#4258 Failure` / deploy-marker readback requirement 寫回 active blocker 與 `next_execution_order`。 +- `apply_ai_loop_current_blocker_execution_queue` 現在會在 production source truth 已 verified 時,把舊 CD failure / deploy-marker requirement 保留在 `historical_*` evidence,但 active 欄位改為 `production_readback_verified`,且不再把 `deploy_marker_readback_required_after_registry_ready` / `current_cd_failure_after_registry_ready` 加回 active blockers。 +- P0-006 仍維持 blocked;剩餘主線 blocker 是 AI Loop current blocker execution queue / 110 control-path readback,不把 deploy marker 成功誤宣稱為 reboot SLO closure。 + +**驗證**: +- `python3.11 -m py_compile apps/api/src/services/awoooi_priority_work_order_readback.py apps/api/tests/test_awoooi_priority_work_order_readback_api.py`:通過。 +- `DATABASE_URL=sqlite+aiosqlite:////tmp/awoooi-codex-api-test.db PYTHONPATH=apps/api python3.11 -m pytest apps/api/tests/test_awoooi_priority_work_order_readback_api.py -q`:`12 passed`。 +- `DATABASE_URL=sqlite+aiosqlite:////tmp/awoooi-codex-api-test.db PYTHONPATH=apps/api python3.11 -m pytest ops/runner/test_cd_controlled_runtime_profile.py apps/api/tests/test_awoooi_priority_work_order_readback_api.py apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py apps/api/tests/test_delivery_closure_workbench_api.py -q`:`61 passed`。 +- `git diff --check`:通過。 + +**仍維持**: +- 沒有使用 GitHub / `gh` / GitHub API / GitHub Actions。 +- 沒有讀 secret / token / `.env` / raw sessions / SQLite / auth。 +- 沒有重啟主機,沒有 Docker / Nginx / K3s / DB / firewall restart,沒有 workflow_dispatch,沒有 runtime write。 + +**下一步**: +- 正常 push Gitea `main`,等 CD 完成後讀回 `/api/v1/agents/awoooi-priority-work-order-readback`;預期 active P0 仍是 `P0-006`,但 summary/current blocker 的 CD status 應為 `production_readback_verified`,`next_execution_order` 不再包含 `P0-006-CD-DEPLOY-MARKER-READBACK`。 + ## 2026-07-01 — 20:34 P0-006 reboot SLO machine-readback source closure **照主線修正的問題**: