feat(api): expose reboot SLO readback rollups
All checks were successful
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 22s
CD Pipeline / build-and-deploy (push) Successful in 4m10s
CD Pipeline / post-deploy-checks (push) Successful in 1m1s
All checks were successful
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 22s
CD Pipeline / build-and-deploy (push) Successful in 4m10s
CD Pipeline / post-deploy-checks (push) Successful in 1m1s
This commit is contained in:
@@ -66,6 +66,68 @@ def _build_payload(scorecard: dict[str, Any], path: Path) -> dict[str, Any]:
|
||||
can_claim_slo = (
|
||||
scorecard.get("can_claim_all_services_recovered_within_target") is True
|
||||
)
|
||||
latest_verify_metric = _dict(scorecard.get("latest_verify_only_metric"))
|
||||
active_blocker_count = len(active_blockers)
|
||||
observed_host_count = len(_strings(host_boot_detection.get("observed_hosts")))
|
||||
missing_host_count = len(_strings(host_boot_detection.get("missing_hosts")))
|
||||
unreachable_host_count = len(_strings(host_boot_detection.get("unreachable_hosts")))
|
||||
stale_host_count = len(_strings(host_boot_detection.get("stale_hosts")))
|
||||
service_green = post_reboot_readiness.get("service_green") is True
|
||||
product_data_green = post_reboot_readiness.get("product_data_green") is True
|
||||
backup_core_green = post_reboot_readiness.get("backup_core_green") is True
|
||||
host_188_service_green = post_reboot_readiness.get("host_188_service_green") is True
|
||||
blocked_by_fresh_reboot_window_only = active_blockers == [
|
||||
"host_boot_observation_older_than_target_window"
|
||||
]
|
||||
latest_verify_only_metric_present = bool(latest_verify_metric)
|
||||
rollups = {
|
||||
"active_blocker_count": active_blocker_count,
|
||||
"readiness_percent": readiness_percent,
|
||||
"completed_check_count": completed_check_count,
|
||||
"required_check_count": len(required_checks),
|
||||
"can_claim_all_services_recovered_within_target": can_claim_slo,
|
||||
"observed_host_count": observed_host_count,
|
||||
"missing_host_count": missing_host_count,
|
||||
"unreachable_host_count": unreachable_host_count,
|
||||
"stale_host_count": stale_host_count,
|
||||
"post_start_blocked": _int(post_reboot_readiness.get("post_start_blocked")),
|
||||
"service_green": service_green,
|
||||
"product_data_green": product_data_green,
|
||||
"backup_core_green": backup_core_green,
|
||||
"host_188_service_green": host_188_service_green,
|
||||
"blocked_by_fresh_reboot_window_only": blocked_by_fresh_reboot_window_only,
|
||||
"latest_verify_only_metric_present": latest_verify_only_metric_present,
|
||||
"latest_verify_only_metric_ready": _int(latest_verify_metric.get("ready")),
|
||||
"latest_verify_only_metric_blocker_count": _int(
|
||||
latest_verify_metric.get("blocker_count")
|
||||
),
|
||||
"latest_verify_only_metric_max_host_uptime_seconds": _int(
|
||||
latest_verify_metric.get("max_host_uptime_seconds")
|
||||
),
|
||||
"latest_verify_only_metric_last_run_timestamp": _int(
|
||||
latest_verify_metric.get("last_run_timestamp")
|
||||
),
|
||||
"stockplatform_freshness_status": str(
|
||||
stockplatform.get("freshness_status") or "unknown"
|
||||
),
|
||||
"stockplatform_ingestion_status": str(
|
||||
stockplatform.get("ingestion_status") or "unknown"
|
||||
),
|
||||
"stockplatform_freshness_blocker_count": len(
|
||||
_strings(stockplatform.get("freshness_blockers"))
|
||||
),
|
||||
"stockplatform_ingestion_blocker_count": len(
|
||||
_strings(stockplatform.get("ingestion_blockers"))
|
||||
),
|
||||
"stockplatform_final_retry_window_passed": _dict(
|
||||
stockplatform.get("eod_window")
|
||||
).get("final_retry_window_passed")
|
||||
is True,
|
||||
"stockplatform_controlled_recovery_gate_required": recovery_gate.get(
|
||||
"required"
|
||||
)
|
||||
is True,
|
||||
}
|
||||
return {
|
||||
"schema_version": _API_SCHEMA_VERSION,
|
||||
"generated_at": str(scorecard.get("generated_at") or ""),
|
||||
@@ -74,55 +136,47 @@ def _build_payload(scorecard: dict[str, Any], path: Path) -> dict[str, Any]:
|
||||
"status": str(scorecard.get("status") or "unknown"),
|
||||
"safe_next_step": safe_next_step,
|
||||
"can_claim_all_services_recovered_within_target": can_claim_slo,
|
||||
"active_blocker_count": active_blocker_count,
|
||||
"readiness_percent": readiness_percent,
|
||||
"service_green": service_green,
|
||||
"product_data_green": product_data_green,
|
||||
"backup_core_green": backup_core_green,
|
||||
"host_188_service_green": host_188_service_green,
|
||||
"observed_host_count": observed_host_count,
|
||||
"missing_host_count": missing_host_count,
|
||||
"unreachable_host_count": unreachable_host_count,
|
||||
"stale_host_count": stale_host_count,
|
||||
"blocked_by_fresh_reboot_window_only": blocked_by_fresh_reboot_window_only,
|
||||
"latest_verify_only_metric_present": latest_verify_only_metric_present,
|
||||
"latest_verify_only_metric_ready": rollups["latest_verify_only_metric_ready"],
|
||||
"latest_verify_only_metric_blocker_count": rollups[
|
||||
"latest_verify_only_metric_blocker_count"
|
||||
],
|
||||
"latest_verify_only_metric_max_host_uptime_seconds": rollups[
|
||||
"latest_verify_only_metric_max_host_uptime_seconds"
|
||||
],
|
||||
"latest_verify_only_metric_last_run_timestamp": rollups[
|
||||
"latest_verify_only_metric_last_run_timestamp"
|
||||
],
|
||||
"stockplatform_freshness_status": rollups["stockplatform_freshness_status"],
|
||||
"stockplatform_ingestion_status": rollups["stockplatform_ingestion_status"],
|
||||
"readback": {
|
||||
"workplan_id": "P0-006",
|
||||
"workplan_title": "主機重啟自動偵測、自動觸發與 10 分鐘恢復 SLO",
|
||||
"source_scorecard_ref": f"docs/operations/{path.name}",
|
||||
"target_minutes": _int(scorecard.get("target_minutes")),
|
||||
"safe_next_step": safe_next_step,
|
||||
"active_blocker_count": active_blocker_count,
|
||||
"readiness_percent": readiness_percent,
|
||||
"blocked_by_fresh_reboot_window_only": blocked_by_fresh_reboot_window_only,
|
||||
"latest_verify_only_metric_present": latest_verify_only_metric_present,
|
||||
},
|
||||
"host_boot_detection": host_boot_detection,
|
||||
"post_reboot_readiness": post_reboot_readiness,
|
||||
"stockplatform_data_freshness": stockplatform,
|
||||
"active_blockers": active_blockers,
|
||||
"required_checks": required_checks,
|
||||
"rollups": {
|
||||
"active_blocker_count": len(active_blockers),
|
||||
"readiness_percent": readiness_percent,
|
||||
"completed_check_count": completed_check_count,
|
||||
"required_check_count": len(required_checks),
|
||||
"can_claim_all_services_recovered_within_target": can_claim_slo,
|
||||
"observed_host_count": len(_strings(host_boot_detection.get("observed_hosts"))),
|
||||
"missing_host_count": len(_strings(host_boot_detection.get("missing_hosts"))),
|
||||
"unreachable_host_count": len(
|
||||
_strings(host_boot_detection.get("unreachable_hosts"))
|
||||
),
|
||||
"stale_host_count": len(_strings(host_boot_detection.get("stale_hosts"))),
|
||||
"post_start_blocked": _int(post_reboot_readiness.get("post_start_blocked")),
|
||||
"service_green": post_reboot_readiness.get("service_green") is True,
|
||||
"product_data_green": post_reboot_readiness.get("product_data_green") is True,
|
||||
"backup_core_green": post_reboot_readiness.get("backup_core_green") is True,
|
||||
"stockplatform_freshness_status": str(
|
||||
stockplatform.get("freshness_status") or "unknown"
|
||||
),
|
||||
"stockplatform_ingestion_status": str(
|
||||
stockplatform.get("ingestion_status") or "unknown"
|
||||
),
|
||||
"stockplatform_freshness_blocker_count": len(
|
||||
_strings(stockplatform.get("freshness_blockers"))
|
||||
),
|
||||
"stockplatform_ingestion_blocker_count": len(
|
||||
_strings(stockplatform.get("ingestion_blockers"))
|
||||
),
|
||||
"stockplatform_final_retry_window_passed": _dict(
|
||||
stockplatform.get("eod_window")
|
||||
).get("final_retry_window_passed")
|
||||
is True,
|
||||
"stockplatform_controlled_recovery_gate_required": recovery_gate.get(
|
||||
"required"
|
||||
)
|
||||
is True,
|
||||
},
|
||||
"rollups": rollups,
|
||||
"operation_boundaries": {
|
||||
"read_only_api_allowed": True,
|
||||
"host_reboot_performed": False,
|
||||
|
||||
@@ -35,12 +35,34 @@ def _assert_reboot_slo_payload(payload: dict):
|
||||
"event_or_approved_reboot_drill_to_prove_10_minute_slo"
|
||||
)
|
||||
assert payload["can_claim_all_services_recovered_within_target"] is False
|
||||
assert payload["active_blocker_count"] == 1
|
||||
assert payload["readiness_percent"] == 82
|
||||
assert payload["service_green"] is True
|
||||
assert payload["product_data_green"] is True
|
||||
assert payload["backup_core_green"] is True
|
||||
assert payload["host_188_service_green"] is True
|
||||
assert payload["observed_host_count"] == 4
|
||||
assert payload["missing_host_count"] == 0
|
||||
assert payload["unreachable_host_count"] == 0
|
||||
assert payload["stale_host_count"] == 4
|
||||
assert payload["blocked_by_fresh_reboot_window_only"] is True
|
||||
assert payload["latest_verify_only_metric_present"] is True
|
||||
assert payload["latest_verify_only_metric_ready"] == 0
|
||||
assert payload["latest_verify_only_metric_blocker_count"] == 1
|
||||
assert payload["latest_verify_only_metric_max_host_uptime_seconds"] == 541538
|
||||
assert payload["latest_verify_only_metric_last_run_timestamp"] > 0
|
||||
assert payload["stockplatform_freshness_status"] == "ok"
|
||||
assert payload["stockplatform_ingestion_status"] == "ok"
|
||||
assert payload["readback"]["workplan_id"] == "P0-006"
|
||||
assert payload["readback"]["target_minutes"] == 10
|
||||
assert payload["readback"]["safe_next_step"] == (
|
||||
"timer_and_service_data_readback_green_wait_for_next_all_host_reboot_"
|
||||
"event_or_approved_reboot_drill_to_prove_10_minute_slo"
|
||||
)
|
||||
assert payload["readback"]["active_blocker_count"] == 1
|
||||
assert payload["readback"]["readiness_percent"] == 82
|
||||
assert payload["readback"]["blocked_by_fresh_reboot_window_only"] is True
|
||||
assert payload["readback"]["latest_verify_only_metric_present"] is True
|
||||
assert payload["rollups"]["active_blocker_count"] == 1
|
||||
assert payload["rollups"]["readiness_percent"] == 82
|
||||
assert payload["rollups"]["observed_host_count"] == 4
|
||||
@@ -50,6 +72,11 @@ def _assert_reboot_slo_payload(payload: dict):
|
||||
assert payload["rollups"]["service_green"] is True
|
||||
assert payload["rollups"]["product_data_green"] is True
|
||||
assert payload["rollups"]["backup_core_green"] is True
|
||||
assert payload["rollups"]["host_188_service_green"] is True
|
||||
assert payload["rollups"]["blocked_by_fresh_reboot_window_only"] is True
|
||||
assert payload["rollups"]["latest_verify_only_metric_present"] is True
|
||||
assert payload["rollups"]["latest_verify_only_metric_ready"] == 0
|
||||
assert payload["rollups"]["latest_verify_only_metric_blocker_count"] == 1
|
||||
assert payload["rollups"]["stockplatform_freshness_status"] == "ok"
|
||||
assert payload["rollups"]["stockplatform_ingestion_status"] == "ok"
|
||||
assert payload["rollups"]["stockplatform_freshness_blocker_count"] == 0
|
||||
|
||||
@@ -1,3 +1,16 @@
|
||||
## 2026-06-29 — 21:43 P0-006 reboot SLO API readback promoted
|
||||
|
||||
**照優先順序完成的實作**:
|
||||
- P0-005 credential escrow evidence refs 已由 production API / Delivery Workbench 讀回 closed;最新 Gitea CD #3968 / #3969 皆 success。
|
||||
- P0-006 仍是 active P0,唯一 blocker 維持 `host_boot_observation_older_than_target_window`;未從此 lane 重啟主機或 restart service。
|
||||
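- 將 P0-006 `/api/v1/agents/reboot-auto-recovery-slo-scorecard` 的主線判斷值提升成 top-level readback:`active_blocker_count`、`readiness_percent`、`service_green` / `product_data_green` / `backup_core_green` / `host_188_service_green`、observed / missing / unreachable / stale host counts、`blocked_by_fresh_reboot_window_only`、`latest_verify_only_metric_*`(present / ready / blocker_count / max_host_uptime_seconds / last_run_timestamp)與 StockPlatform freshness / ingestion 狀態。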
|
||||
|
||||
**驗證**:
|
||||
- Focused pytest:P0-006 scorecard / Delivery Workbench / CD profile `24 passed`。
|
||||
- `ruff check`、`py_compile`、`git diff --check`、Gitea runner pressure guard、Gitea secret env guard:通過。
|
||||
|
||||
**邊界**:未操作 host / Docker / K8s / DB / firewall / Wazuh runtime;未觸發 workflow_dispatch;未使用 GitHub / `gh` / GitHub API;未讀 secret / token / raw sessions / SQLite / `.env`。
|
||||
|
||||
## 2026-06-29 — 21:25 CD profile fix for log writeback readbacks
|
||||
|
||||
**照優先順序處理**:
|
||||
|
||||
Reference in New Issue
Block a user