diff --git a/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py b/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py index 8a06cfc3..aa625d85 100644 --- a/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py +++ b/apps/api/src/services/reboot_auto_recovery_slo_scorecard.py @@ -66,6 +66,68 @@ def _build_payload(scorecard: dict[str, Any], path: Path) -> dict[str, Any]: can_claim_slo = ( scorecard.get("can_claim_all_services_recovered_within_target") is True ) + latest_verify_metric = _dict(scorecard.get("latest_verify_only_metric")) + active_blocker_count = len(active_blockers) + observed_host_count = len(_strings(host_boot_detection.get("observed_hosts"))) + missing_host_count = len(_strings(host_boot_detection.get("missing_hosts"))) + unreachable_host_count = len(_strings(host_boot_detection.get("unreachable_hosts"))) + stale_host_count = len(_strings(host_boot_detection.get("stale_hosts"))) + service_green = post_reboot_readiness.get("service_green") is True + product_data_green = post_reboot_readiness.get("product_data_green") is True + backup_core_green = post_reboot_readiness.get("backup_core_green") is True + host_188_service_green = post_reboot_readiness.get("host_188_service_green") is True + blocked_by_fresh_reboot_window_only = active_blockers == [ + "host_boot_observation_older_than_target_window" + ] + latest_verify_only_metric_present = bool(latest_verify_metric) + rollups = { + "active_blocker_count": active_blocker_count, + "readiness_percent": readiness_percent, + "completed_check_count": completed_check_count, + "required_check_count": len(required_checks), + "can_claim_all_services_recovered_within_target": can_claim_slo, + "observed_host_count": observed_host_count, + "missing_host_count": missing_host_count, + "unreachable_host_count": unreachable_host_count, + "stale_host_count": stale_host_count, + "post_start_blocked": _int(post_reboot_readiness.get("post_start_blocked")), + "service_green": service_green, + "product_data_green": product_data_green, + "backup_core_green": backup_core_green, + "host_188_service_green": host_188_service_green, + "blocked_by_fresh_reboot_window_only": blocked_by_fresh_reboot_window_only, + "latest_verify_only_metric_present": latest_verify_only_metric_present, + "latest_verify_only_metric_ready": _int(latest_verify_metric.get("ready")), + "latest_verify_only_metric_blocker_count": _int( + latest_verify_metric.get("blocker_count") + ), + "latest_verify_only_metric_max_host_uptime_seconds": _int( + latest_verify_metric.get("max_host_uptime_seconds") + ), + "latest_verify_only_metric_last_run_timestamp": _int( + latest_verify_metric.get("last_run_timestamp") + ), + "stockplatform_freshness_status": str( + stockplatform.get("freshness_status") or "unknown" + ), + "stockplatform_ingestion_status": str( + stockplatform.get("ingestion_status") or "unknown" + ), + "stockplatform_freshness_blocker_count": len( + _strings(stockplatform.get("freshness_blockers")) + ), + "stockplatform_ingestion_blocker_count": len( + _strings(stockplatform.get("ingestion_blockers")) + ), + "stockplatform_final_retry_window_passed": _dict( + stockplatform.get("eod_window") + ).get("final_retry_window_passed") + is True, + "stockplatform_controlled_recovery_gate_required": recovery_gate.get( + "required" + ) + is True, + } return { "schema_version": _API_SCHEMA_VERSION, "generated_at": str(scorecard.get("generated_at") or ""), @@ -74,55 +136,47 @@ def _build_payload(scorecard: dict[str, Any], path: Path) -> dict[str, Any]: "status": str(scorecard.get("status") or "unknown"), "safe_next_step": safe_next_step, "can_claim_all_services_recovered_within_target": can_claim_slo, + "active_blocker_count": active_blocker_count, + "readiness_percent": readiness_percent, + "service_green": service_green, + "product_data_green": product_data_green, + "backup_core_green": backup_core_green, + "host_188_service_green": host_188_service_green, + "observed_host_count": observed_host_count, + "missing_host_count": missing_host_count, + "unreachable_host_count": unreachable_host_count, + "stale_host_count": stale_host_count, + "blocked_by_fresh_reboot_window_only": blocked_by_fresh_reboot_window_only, + "latest_verify_only_metric_present": latest_verify_only_metric_present, + "latest_verify_only_metric_ready": rollups["latest_verify_only_metric_ready"], + "latest_verify_only_metric_blocker_count": rollups[ + "latest_verify_only_metric_blocker_count" + ], + "latest_verify_only_metric_max_host_uptime_seconds": rollups[ + "latest_verify_only_metric_max_host_uptime_seconds" + ], + "latest_verify_only_metric_last_run_timestamp": rollups[ + "latest_verify_only_metric_last_run_timestamp" + ], + "stockplatform_freshness_status": rollups["stockplatform_freshness_status"], + "stockplatform_ingestion_status": rollups["stockplatform_ingestion_status"], "readback": { "workplan_id": "P0-006", "workplan_title": "主機重啟自動偵測、自動觸發與 10 分鐘恢復 SLO", "source_scorecard_ref": f"docs/operations/{path.name}", "target_minutes": _int(scorecard.get("target_minutes")), "safe_next_step": safe_next_step, + "active_blocker_count": active_blocker_count, + "readiness_percent": readiness_percent, + "blocked_by_fresh_reboot_window_only": blocked_by_fresh_reboot_window_only, + "latest_verify_only_metric_present": latest_verify_only_metric_present, }, "host_boot_detection": host_boot_detection, "post_reboot_readiness": post_reboot_readiness, "stockplatform_data_freshness": stockplatform, "active_blockers": active_blockers, "required_checks": required_checks, - "rollups": { - "active_blocker_count": len(active_blockers), - "readiness_percent": readiness_percent, - "completed_check_count": completed_check_count, - "required_check_count": len(required_checks), - "can_claim_all_services_recovered_within_target": can_claim_slo, - "observed_host_count": len(_strings(host_boot_detection.get("observed_hosts"))), - "missing_host_count": len(_strings(host_boot_detection.get("missing_hosts"))), - "unreachable_host_count": len( - _strings(host_boot_detection.get("unreachable_hosts")) - ), - "stale_host_count": len(_strings(host_boot_detection.get("stale_hosts"))), - "post_start_blocked": _int(post_reboot_readiness.get("post_start_blocked")), - "service_green": post_reboot_readiness.get("service_green") is True, - "product_data_green": post_reboot_readiness.get("product_data_green") is True, - "backup_core_green": post_reboot_readiness.get("backup_core_green") is True, - "stockplatform_freshness_status": str( - stockplatform.get("freshness_status") or "unknown" - ), - "stockplatform_ingestion_status": str( - stockplatform.get("ingestion_status") or "unknown" - ), - "stockplatform_freshness_blocker_count": len( - _strings(stockplatform.get("freshness_blockers")) - ), - "stockplatform_ingestion_blocker_count": len( - _strings(stockplatform.get("ingestion_blockers")) - ), - "stockplatform_final_retry_window_passed": _dict( - stockplatform.get("eod_window") - ).get("final_retry_window_passed") - is True, - "stockplatform_controlled_recovery_gate_required": recovery_gate.get( - "required" - ) - is True, - }, + "rollups": rollups, "operation_boundaries": { "read_only_api_allowed": True, "host_reboot_performed": False, diff --git a/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py b/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py index 26d1b7c4..f110d025 100644 --- a/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py +++ b/apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py @@ -35,12 +35,34 @@ def _assert_reboot_slo_payload(payload: dict): "event_or_approved_reboot_drill_to_prove_10_minute_slo" ) assert payload["can_claim_all_services_recovered_within_target"] is False + assert payload["active_blocker_count"] == 1 + assert payload["readiness_percent"] == 82 + assert payload["service_green"] is True + assert payload["product_data_green"] is True + assert payload["backup_core_green"] is True + assert payload["host_188_service_green"] is True + assert payload["observed_host_count"] == 4 + assert payload["missing_host_count"] == 0 + assert payload["unreachable_host_count"] == 0 + assert payload["stale_host_count"] == 4 + assert payload["blocked_by_fresh_reboot_window_only"] is True + assert payload["latest_verify_only_metric_present"] is True + assert payload["latest_verify_only_metric_ready"] == 0 + assert payload["latest_verify_only_metric_blocker_count"] == 1 + assert payload["latest_verify_only_metric_max_host_uptime_seconds"] == 541538 + assert payload["latest_verify_only_metric_last_run_timestamp"] > 0 + assert payload["stockplatform_freshness_status"] == "ok" + assert payload["stockplatform_ingestion_status"] == "ok" assert payload["readback"]["workplan_id"] == "P0-006" assert payload["readback"]["target_minutes"] == 10 assert payload["readback"]["safe_next_step"] == ( "timer_and_service_data_readback_green_wait_for_next_all_host_reboot_" "event_or_approved_reboot_drill_to_prove_10_minute_slo" ) + assert payload["readback"]["active_blocker_count"] == 1 + assert payload["readback"]["readiness_percent"] == 82 + assert payload["readback"]["blocked_by_fresh_reboot_window_only"] is True + assert payload["readback"]["latest_verify_only_metric_present"] is True assert payload["rollups"]["active_blocker_count"] == 1 assert payload["rollups"]["readiness_percent"] == 82 assert payload["rollups"]["observed_host_count"] == 4 @@ -50,6 +72,11 @@ def _assert_reboot_slo_payload(payload: dict): assert payload["rollups"]["service_green"] is True assert payload["rollups"]["product_data_green"] is True assert payload["rollups"]["backup_core_green"] is True + assert payload["rollups"]["host_188_service_green"] is True + assert payload["rollups"]["blocked_by_fresh_reboot_window_only"] is True + assert payload["rollups"]["latest_verify_only_metric_present"] is True + assert payload["rollups"]["latest_verify_only_metric_ready"] == 0 + assert payload["rollups"]["latest_verify_only_metric_blocker_count"] == 1 assert payload["rollups"]["stockplatform_freshness_status"] == "ok" assert payload["rollups"]["stockplatform_ingestion_status"] == "ok" assert payload["rollups"]["stockplatform_freshness_blocker_count"] == 0 diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 83e0f512..338c7f35 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,16 @@ +## 2026-06-29 — 21:43 P0-006 reboot SLO API readback promoted + +**照優先順序完成的實作**: +- P0-005 credential escrow evidence refs 已由 production API / Delivery Workbench 讀回 closed;最新 Gitea CD #3968 / #3969 皆 success。 +- P0-006 仍是 active P0,唯一 blocker 維持 `host_boot_observation_older_than_target_window`;未從此 lane 重啟主機或 restart service。 +- 將 P0-006 `/api/v1/agents/reboot-auto-recovery-slo-scorecard` 的主線判斷值提升成 top-level readback:`active_blocker_count`、`readiness_percent`、service/data/backup green、observed/stale hosts、verify-only metric 與 StockPlatform freshness / ingestion 狀態。 + +**驗證**: +- Focused pytest:P0-006 scorecard / Delivery Workbench / CD profile `24 passed`。 +- `ruff check`、`py_compile`、`git diff --check`、Gitea runner pressure guard、Gitea secret env guard:通過。 + +**邊界**:未操作 host / Docker / K8s / DB / firewall / Wazuh runtime;未觸發 workflow_dispatch;未使用 GitHub / `gh` / GitHub API;未讀 secret / token / raw sessions / SQLite / `.env`。 + ## 2026-06-29 — 21:25 CD profile fix for log writeback readbacks **照優先順序處理**: