From c32fab9cb49f10f3439b47b423de0453f7055b0d Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 30 Jun 2026 00:00:28 +0800 Subject: [PATCH] feat(ops): carry stock readbacks into reboot slo --- .../test_delivery_closure_workbench_api.py | 4 +- ..._reboot_auto_recovery_slo_scorecard_api.py | 10 +++- docs/LOGBOOK.md | 14 +++++ ...-auto-recovery-slo-scorecard.snapshot.json | 49 ++++++++++++--- .../reboot-auto-recovery-slo-exporter.sh | 28 ++++++++- .../reboot-auto-recovery-slo-scorecard.py | 43 +++++++++++++- ...test_reboot_auto_recovery_slo_installer.py | 13 ++++ ...test_reboot_auto_recovery_slo_scorecard.py | 59 +++++++++++++++++++ 8 files changed, 203 insertions(+), 17 deletions(-) diff --git a/apps/api/tests/test_delivery_closure_workbench_api.py b/apps/api/tests/test_delivery_closure_workbench_api.py index 7a03a999..6ceee3c1 100644 --- a/apps/api/tests/test_delivery_closure_workbench_api.py +++ b/apps/api/tests/test_delivery_closure_workbench_api.py @@ -192,7 +192,7 @@ def test_delivery_closure_workbench_exposes_p0_006_reboot_slo_lane(): assert lane["metric"]["stockplatform_ingestion_status"] == "ok" assert lane["metric"]["stockplatform_freshness_blocker_count"] == 0 assert lane["metric"]["stockplatform_ingestion_blocker_count"] == 0 - assert lane["metric"]["stockplatform_final_retry_window_passed"] is False + assert lane["metric"]["stockplatform_final_retry_window_passed"] is True assert lane["metric"]["stockplatform_controlled_recovery_gate_required"] is False assert lane["metric"]["host_reboot_performed"] is False assert lane["metric"]["service_restart_performed"] is False @@ -245,7 +245,7 @@ def _assert_delivery_workbench_shape(data: dict): data["summary"][ "reboot_auto_recovery_stockplatform_final_retry_window_passed" ] - is False + is True ) assert ( data["summary"][ diff --git a/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py b/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py index 90eff29b..f1034f70 100644 --- a/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py +++ b/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py @@ -101,7 +101,7 @@ def _assert_reboot_slo_payload(payload: dict): assert payload["rollups"]["stockplatform_ingestion_status"] == "ok" assert payload["rollups"]["stockplatform_freshness_blocker_count"] == 0 assert payload["rollups"]["stockplatform_ingestion_blocker_count"] == 0 - assert payload["rollups"]["stockplatform_final_retry_window_passed"] is False + assert payload["rollups"]["stockplatform_final_retry_window_passed"] is True assert ( payload["rollups"]["stockplatform_controlled_recovery_gate_required"] is False @@ -114,12 +114,16 @@ def _assert_reboot_slo_payload(payload: dict): assert stockplatform["ingestion_blockers"] == [] assert stockplatform["margin_short_recovery"]["status"] == "recovered" assert stockplatform["margin_short_recovery"]["successful_source_run_ids"] == [ - 3390, 3389, + 3390, ] assert stockplatform["ai_recommendations_recovery"]["status"] == "recovered" - assert stockplatform["eod_window"]["final_retry_window_passed"] is False + assert stockplatform["eod_window"]["final_retry_window_passed"] is True assert stockplatform["controlled_recovery_gate"]["required"] is False + assert ( + stockplatform["controlled_recovery_gate"]["status"] + == "not_required_freshness_recovered" + ) assert "manual_db_update" in stockplatform["controlled_recovery_gate"][ "forbidden_actions" ] diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 6e078757..db401498 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,17 @@ +## 2026-06-29 — 23:45 P0-006 final retry window readback source closure + +**照優先順序完成的實作**: +- P0-006 仍是 active P0;StockPlatform final retry window 已過,freshness / ingestion live readback 仍為 `ok`,因此 `stockplatform_final_retry_window_passed=true` 且 controlled data recovery gate 維持 `not_required_freshness_recovered`。 +- `scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh` 現在會把 public StockPlatform freshness / ingestion JSON 存成 artifact 並傳給 scorecard,不再只靠 post-start summary 推導。 +- `scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py` 在 freshness 已恢復時明確輸出 `margin_short_recovery`、`ai_recommendations_recovery` 與 `controlled_recovery_gate.status=not_required_freshness_recovered`。 +- 更新 `docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json`,production API / Delivery Workbench 發佈後會讀到 23:45 final-window truth;唯一剩餘 blocker 仍是 `host_boot_observation_older_than_target_window`。 + +**驗證**: +- Focused pytest:P0-006 scorecard / exporter contract / API readback `14 passed`;Delivery Workbench / P0-006 API / CD profile `24 passed`。 +- `bash -n`、`py_compile`、JSON parse、Gitea runner pressure guard、Gitea secret env guard、`git diff --check`:通過。 + +**邊界**:未重啟主機,未 restart Docker / Nginx / K3s / DB / service,未寫 StockPlatform DB,未 workflow_dispatch,未使用 GitHub / `gh` / GitHub API,未讀 secret / token / raw sessions / SQLite / `.env`。 + ## 2026-06-29 — 23:44 主線 priority readback 收斂:P0 event-gated,P1 production verified **照優先順序讀回的事實**: diff --git a/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json b/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json index 697fb472..73eef3e7 100644 --- a/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json +++ b/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json @@ -8,7 +8,7 @@ "free_gib": 4.454, "min_free_gib": 2.0 }, - "generated_at": "2026-06-29T21:15:30+08:00", + "generated_at": "2026-06-29T23:45:54+08:00", "host_boot_detection": { "host_rows": [ { @@ -126,9 +126,9 @@ "target_selector": "stockplatform-v2:system_freshness:core.margin_short_daily,ai.recommendations" }, "eod_window": { - "classification": "recovered_after_21_05_retry_window", + "classification": "recovered_after_final_retry_window", "final_retry_window_end_local": "23:35", - "final_retry_window_passed": false, + "final_retry_window_passed": true, "first_full_window_end_local": "19:15", "next_action": "rerun_slo_verify_only_after_next_fresh_all_host_reboot_event_or_approved_reboot_drill", "pending": false @@ -141,7 +141,23 @@ "ingestion_status": "ok", "latest_source_runs": [ { - "source_run_id": 3390, + "source_run_id": 3392, + "source_name": "intelligence_security_linker", + "target_date": null, + "status": "succeeded", + "started_at": "2026-06-29T15:00:09.333642Z", + "finished_at": "2026-06-29T15:00:09.333642Z" + }, + { + "source_run_id": 3391, + "source_name": "intelligence_reports_import", + "target_date": null, + "status": "succeeded", + "started_at": "2026-06-29T15:00:08.472808Z", + "finished_at": "2026-06-29T15:00:08.472808Z" + }, + { + "source_run_id": 3389, "source_name": "official_margin_short_daily", "target_date": "2026-06-29", "status": "succeeded", @@ -149,7 +165,7 @@ "finished_at": "2026-06-29T13:05:13.341357Z" }, { - "source_run_id": 3389, + "source_run_id": 3390, "source_name": "official_margin_short_daily", "target_date": "2026-06-29", "status": "succeeded", @@ -171,6 +187,22 @@ "status": "succeeded", "started_at": "2026-06-29T13:00:07.822700Z", "finished_at": "2026-06-29T13:00:07.822700Z" + }, + { + "source_run_id": 3385, + "source_name": "official_margin_short_daily", + "target_date": "2026-06-29", + "status": "official_pending", + "started_at": "2026-06-29T12:05:16.716460Z", + "finished_at": "2026-06-29T12:05:16.716460Z" + }, + { + "source_run_id": 3386, + "source_name": "official_margin_short_daily", + "target_date": "2026-06-29", + "status": "official_pending", + "started_at": "2026-06-29T12:05:16.716460Z", + "finished_at": "2026-06-29T12:05:16.716460Z" } ], "latest_trading_date": "2026-06-29", @@ -181,14 +213,15 @@ "cleared_blocker": "core_margin_short_daily_missing", "cleared_ingestion_blocker": "core.margin_short_daily_incomplete", "successful_source_run_ids": [ - 3390, - 3389 + 3389, + 3390 ] }, "ai_recommendations_recovery": { "status": "recovered", "cleared_blocker": "ai_recommendations_stale" - } + }, + "final_retry_checked_at": "2026-06-29T23:45:54+08:00" }, "target_minutes": 10, "target_seconds": 600, diff --git a/scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh b/scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh index c9064b15..f690fffd 100755 --- a/scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh +++ b/scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh @@ -13,6 +13,9 @@ OUTPUT_NAME="${OUTPUT_NAME:-reboot_auto_recovery_slo.prom}" TARGET_MINUTES="${TARGET_MINUTES:-10}" MIN_FREE_GIB="${MIN_FREE_GIB:-2}" LOCK_FILE="${LOCK_FILE:-${LOG_DIR}/reboot_auto_recovery_slo.lock}" +STOCK_FRESHNESS_URL="${STOCK_FRESHNESS_URL:-https://stock.wooo.work/api/v1/system/freshness}" +STOCK_INGESTION_URL="${STOCK_INGESTION_URL:-https://stock.wooo.work/api/v1/system/ingestion}" +STOCK_READBACK_TIMEOUT_SECONDS="${STOCK_READBACK_TIMEOUT_SECONDS:-10}" mkdir -p "$TEXTFILE_DIR" "$LOG_DIR" @@ -28,18 +31,39 @@ mkdir -p "$artifact_dir" host_probe="$artifact_dir/host-probe.txt" summary_file="$artifact_dir/summary.txt" scorecard_file="$artifact_dir/scorecard.json" +stock_freshness_file="$artifact_dir/stock-freshness.json" +stock_ingestion_file="$artifact_dir/stock-ingestion.json" bash "$ROOT_DIR/scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh" >"$host_probe" 2>&1 || true ARTIFACT_DIR="$artifact_dir/post-reboot-readiness" \ bash "$ROOT_DIR/scripts/reboot-recovery/post-reboot-readiness-summary.sh" --no-color >"$summary_file" 2>&1 || true -python3 "$ROOT_DIR/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py" \ +if command -v curl >/dev/null 2>&1; then + curl -fsS --max-time "$STOCK_READBACK_TIMEOUT_SECONDS" \ + "$STOCK_FRESHNESS_URL" >"$stock_freshness_file" 2>"$artifact_dir/stock-freshness.err" \ + || rm -f "$stock_freshness_file" + curl -fsS --max-time "$STOCK_READBACK_TIMEOUT_SECONDS" \ + "$STOCK_INGESTION_URL" >"$stock_ingestion_file" 2>"$artifact_dir/stock-ingestion.err" \ + || rm -f "$stock_ingestion_file" +fi + +scorecard_args=( + "$ROOT_DIR/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py" --summary-file "$summary_file" \ --host-probe-file "$host_probe" \ --target-minutes "$TARGET_MINUTES" \ --min-free-gib "$MIN_FREE_GIB" \ --disk-path / \ - --output "$scorecard_file" || true + --output "$scorecard_file" +) +if [ -s "$stock_freshness_file" ]; then + scorecard_args+=(--stock-freshness-file "$stock_freshness_file") +fi +if [ -s "$stock_ingestion_file" ]; then + scorecard_args+=(--stock-ingestion-file "$stock_ingestion_file") +fi + +python3 "${scorecard_args[@]}" || true now="$(date +%s)" ready="$(python3 - "$scorecard_file" <<'PY' diff --git a/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py b/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py index 32357344..b3f43e91 100755 --- a/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py +++ b/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py @@ -264,6 +264,15 @@ def build_stockplatform_readback( or summary.get("STOCK_LATEST_TRADING_DATE") or "" ) + latest_source_runs = compact_source_runs(ingestion) + margin_short_success_ids = [ + row.get("source_run_id") + for row in latest_source_runs + if row.get("source_name") == "official_margin_short_daily" + and row.get("target_date") == latest_trading_date + and row.get("status") == "succeeded" + and row.get("source_run_id") is not None + ] eod_pending = truthy(summary.get("STOCK_EOD_WINDOW_PENDING")) eod_final_window = str( summary.get("STOCK_EOD_FINAL_RETRY_WINDOW_END_LOCAL") or "unknown" @@ -278,6 +287,15 @@ def build_stockplatform_readback( or final_passed ) ) + recovery_gate_status = ( + "ready_to_open" + if recovery_required + else ( + "not_required_freshness_recovered" + if freshness_status == "ok" and not freshness_blockers + else "not_required_yet" + ) + ) return { "freshness_endpoint_readback_present": bool(freshness), "ingestion_endpoint_readback_present": bool(ingestion), @@ -291,7 +309,7 @@ def build_stockplatform_readback( for row in compact_stock_sources(freshness) if row["status"] not in {"ok", "warning"} ], - "latest_source_runs": compact_source_runs(ingestion), + "latest_source_runs": latest_source_runs, "eod_window": { "pending": eod_pending, "classification": str(summary.get("STOCK_EOD_CLASSIFICATION") or "unknown"), @@ -304,7 +322,7 @@ def build_stockplatform_readback( }, "controlled_recovery_gate": { "required": recovery_required, - "status": "ready_to_open" if recovery_required else "not_required_yet", + "status": recovery_gate_status, "target_selector": "stockplatform-v2:system_freshness:core.margin_short_daily,ai.recommendations", "allowed_actions": [ "inspect_existing_ingestion_readback", @@ -320,6 +338,27 @@ def build_stockplatform_readback( "reboot_or_service_restart_from_reboot_slo_lane", ], }, + "margin_short_recovery": { + "status": ( + "recovered" + if freshness_status == "ok" + and "core_margin_short_daily_missing" not in freshness_blockers + and bool(margin_short_success_ids) + else "not_verified" + ), + "cleared_blocker": "core_margin_short_daily_missing", + "cleared_ingestion_blocker": "core.margin_short_daily_incomplete", + "successful_source_run_ids": margin_short_success_ids, + }, + "ai_recommendations_recovery": { + "status": ( + "recovered" + if freshness_status == "ok" + and "ai_recommendations_stale" not in freshness_blockers + else "not_verified" + ), + "cleared_blocker": "ai_recommendations_stale", + }, } diff --git a/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_installer.py b/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_installer.py index be5c66dc..338b44b0 100644 --- a/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_installer.py +++ b/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_installer.py @@ -77,3 +77,16 @@ def test_exporter_uses_user_writable_lock_after_creating_log_dir() -> None: assert 'LOCK_FILE="${LOCK_FILE:-${LOG_DIR}/reboot_auto_recovery_slo.lock}"' in text assert text.index('mkdir -p "$TEXTFILE_DIR" "$LOG_DIR"') < text.index('exec 9>"$LOCK_FILE"') + + +def test_exporter_carries_stockplatform_readbacks_into_scorecard() -> None: + text = EXPORTER.read_text(encoding="utf-8") + + assert "STOCK_FRESHNESS_URL" in text + assert "STOCK_INGESTION_URL" in text + assert 'stock_freshness_file="$artifact_dir/stock-freshness.json"' in text + assert 'stock_ingestion_file="$artifact_dir/stock-ingestion.json"' in text + assert "scorecard_args+=(--stock-freshness-file" in text + assert "scorecard_args+=(--stock-ingestion-file" in text + assert "manual_db_update" not in text + assert "systemctl restart" not in text diff --git a/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py b/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py index 2e16442b..1e51394e 100644 --- a/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py +++ b/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py @@ -200,6 +200,65 @@ STOCK_EOD_FINAL_RETRY_WINDOW_END_LOCAL=23:35 ) +def test_stockplatform_recovered_marks_controlled_gate_not_required( + tmp_path: Path, +) -> None: + summary = GREEN_SUMMARY + """\ +STOCK_FRESHNESS_STATUS=ok +STOCK_LATEST_TRADING_DATE=2026-06-29 +STOCK_BLOCKERS=none +STOCK_EOD_WINDOW_PENDING=0 +STOCK_EOD_CLASSIFICATION=recovered_after_21_05_retry_window +STOCK_EOD_NEXT_ACTION=rerun_slo_verify_only_after_next_fresh_all_host_reboot_event_or_approved_reboot_drill +STOCK_EOD_FIRST_FULL_WINDOW_END_LOCAL=19:15 +STOCK_EOD_FINAL_RETRY_WINDOW_END_LOCAL=23:35 +""" + + payload = run_scorecard_with_stock( + tmp_path, + summary, + { + "status": "ok", + "latest_trading_date": "2026-06-29", + "blockers": [], + }, + { + "status": "ok", + "latest_trading_date": "2026-06-29", + "blockers": [], + "latest_source_runs": [ + { + "source_run_id": 3390, + "source_name": "official_margin_short_daily", + "target_date": "2026-06-29", + "status": "succeeded", + }, + { + "source_run_id": 3389, + "source_name": "official_margin_short_daily", + "target_date": "2026-06-29", + "status": "succeeded", + }, + ], + }, + generated_at="2026-06-29T23:40:00+08:00", + ) + + stockplatform = payload["stockplatform_data_freshness"] + assert stockplatform["eod_window"]["final_retry_window_passed"] is True + assert stockplatform["controlled_recovery_gate"]["required"] is False + assert ( + stockplatform["controlled_recovery_gate"]["status"] + == "not_required_freshness_recovered" + ) + assert stockplatform["margin_short_recovery"]["status"] == "recovered" + assert stockplatform["margin_short_recovery"]["successful_source_run_ids"] == [ + 3390, + 3389, + ] + assert stockplatform["ai_recommendations_recovery"]["status"] == "recovered" + + def test_stockplatform_blocked_after_final_retry_opens_controlled_gate( tmp_path: Path, ) -> None: