Merge remote-tracking branch 'gitea-ssh/main' into codex/p0-product-manifest-standard-20260629
# Conflicts: # docs/LOGBOOK.md
This commit is contained in:
@@ -49456,3 +49456,29 @@ production browser smoke:
|
||||
**仍維持**:
|
||||
- 沒有使用 GitHub app / connector / MCP、`gh`、GitHub API、GitHub Actions、PR / issue / search。
|
||||
- 沒有讀 token / secret / `.env` / raw sessions / SQLite / auth;沒有寫 Gitea repo / refs / branch / secret;沒有 host / Docker / K8s / firewall / Wazuh runtime 操作。
|
||||
|
||||
## 2026-06-29 — 15:02 P0-006A reboot auto-recovery SLO timer live on 110 / services green
|
||||
|
||||
**完成內容**:
|
||||
- 將 `awoooi-reboot-auto-recovery-slo.timer` / `.service` 受控部署到 110;timer `enabled=enabled`、`active=active`,service last result `success`。
|
||||
- 新增/修正 110 installer 的 dry-run / rollback / verify contract;systemd oneshot 以 `User=wooo`、`HOME=/home/wooo`、`ROOT_DIR=/home/wooo/awoooi-reboot-recovery-slo` 執行,metric 寫入 `/home/wooo/node_exporter_textfiles/reboot_auto_recovery_slo.prom`。
|
||||
- 修正 host probe 不依賴互動 SSH agent:110 self 走 local `/proc` / `systemctl`,120/121/188 讀回 boot id / uptime;node-exporter fallback 保留。
|
||||
- 修正 188 hygiene selector:188 預設 `ollama@192.168.0.188`,110 self-check 不再被 SSH config 轉成錯誤 identity。
|
||||
- 修正 post-start 判準:public routes + AWOOOI API 綠時,cold-start / runner / host-pressure 留在 capacity/evidence lane,不再把 CD runner 壓力誤判成使用者服務未恢復。
|
||||
|
||||
**live readback**:
|
||||
- 最新 artifact:`/home/wooo/reboot-recovery/reboot-auto-recovery-slo-20260629-150501`。
|
||||
- Metric:`awoooi_reboot_auto_recovery_slo_ready=0`、`blocker_count=1`、`max_host_uptime_seconds=519376`。
|
||||
- Scorecard:`observed_hosts=110,120,121,188`、`missing_hosts=[]`、`unreachable_hosts=[]`。
|
||||
- Service readback:`POST_START_BLOCKED=0`、`SERVICE_GREEN=1`、`PRODUCT_DATA_GREEN=1`、`BACKUP_CORE_GREEN=1`、`HOST_188_SERVICE_GREEN=1`、`WAZUH_DASHBOARD_DEGRADED=false`。
|
||||
- 唯一 active blocker:`host_boot_observation_older_than_target_window`;目前不是 fresh all-host reboot window,因此不得偽稱已證明 10 分鐘 SLO。
|
||||
|
||||
**本地驗證結果**:
|
||||
- `bash -n scripts/reboot-recovery/post-start-quick-check.sh scripts/reboot-recovery/188-host-hygiene-maintenance-checklist.sh scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh scripts/reboot-recovery/install-reboot-auto-recovery-slo-110.sh`:通過。
|
||||
- `python3.11 -m py_compile scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py`:通過。
|
||||
- Focused pytest:`12 passed`。
|
||||
- `git diff --check`:通過。
|
||||
|
||||
**仍維持**:
|
||||
- 沒有重啟任何主機;沒有 restart Docker / Nginx / K3s / DB / firewall;沒有讀 secret / token / `.env` / raw sessions / SQLite / auth;沒有使用 GitHub。
|
||||
- 下一個 P0-006 proof 不是再部署 source,而是等待下一次實際 all-host reboot event,或另開經批准 reboot drill,讓 `max_observed_uptime_seconds<=600` 且上述 service readback 維持綠。
|
||||
|
||||
@@ -1,18 +1,19 @@
|
||||
{
|
||||
"schema_version": "awoooi_priority_work_order_readback_v1",
|
||||
"generated_at": "2026-06-29T14:49:52+08:00",
|
||||
"status": "p0_006a_reboot_auto_recovery_slo_live_probe_installed_blocked_service_health",
|
||||
"generated_at": "2026-06-29T15:05:00+08:00",
|
||||
"status": "p0_006a_reboot_auto_recovery_slo_timer_live_services_green_waiting_reboot_window",
|
||||
"source_refs": {
|
||||
"global_scorecard": "~/.codex/product-runtime-governance-completion-scorecard.snapshot.json",
|
||||
"workstation_dashboard": "~/.codex/codex-workstation-sync-dashboard.snapshot.json",
|
||||
"post_reboot_summary": "/tmp/awoooi-post-reboot-readiness-20260629-115730/summary.txt",
|
||||
"post_reboot_summary": "/home/wooo/reboot-recovery/reboot-auto-recovery-slo-20260629-150501/summary.txt",
|
||||
"full_stack_cold_start_check": "scripts/reboot-recovery/full-stack-cold-start-check.sh --monitor-read-only --no-color",
|
||||
"delivery_closure_workbench": "https://awoooi.wooo.work/api/v1/agents/delivery-closure-workbench",
|
||||
"public_gitea_queue_readback": "ops/runner/read-public-gitea-actions-queue.py --json",
|
||||
"credential_escrow_scorecard": "/tmp/awoooi-credential-escrow-intake-scorecard-20260629-1200-priority.json",
|
||||
"dr_escrow_evidence_checklist_generator": "scripts/reboot-recovery/dr-escrow-evidence-checklist.py",
|
||||
"gitea_private_inventory_p0_scorecard": "docs/operations/awoooi-gitea-private-inventory-p0-scorecard.snapshot.json",
|
||||
"reboot_auto_recovery_slo_scorecard": "docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json"
|
||||
"reboot_auto_recovery_slo_scorecard": "docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json",
|
||||
"reboot_auto_recovery_slo_metric": "/home/wooo/node_exporter_textfiles/reboot_auto_recovery_slo.prom"
|
||||
},
|
||||
"current_head": {
|
||||
"gitea_main_sha": "7ff959b6a8bbdf152da1969687f188ceda4b0561",
|
||||
@@ -186,8 +187,8 @@
|
||||
{
|
||||
"workplan_id": "P0-006",
|
||||
"title": "主機重啟自動偵測、自動觸發與 10 分鐘恢復 SLO",
|
||||
"status": "blocked_reboot_auto_recovery_slo_not_ready",
|
||||
"reason": "Boot-triggered SLO timer is live on host 110 and all required host boot probes are now observed; the 10-minute recovery claim remains fail-closed because this was not a fresh reboot window and service/backup/post-start blockers remain.",
|
||||
"status": "blocked_waiting_fresh_all_host_reboot_window",
|
||||
"reason": "Boot-triggered SLO timer is enabled and active on host 110; live metric and scorecard observed all required hosts. Service/data/backup/188 readback is green. The 10-minute claim remains fail-closed only because the current boot observation is older than the 600-second target window, so the next proof requires an actual fresh all-host reboot event or an approved reboot drill.",
|
||||
"evidence": {
|
||||
"target_minutes": 10,
|
||||
"can_claim_all_services_recovered_within_target": false,
|
||||
@@ -195,10 +196,10 @@
|
||||
"host_boot_probe_source_present": true,
|
||||
"slo_systemd_timer_source_present": true,
|
||||
"slo_exporter_source_present": true,
|
||||
"post_start_blocked": 6,
|
||||
"service_green": false,
|
||||
"post_start_blocked": 0,
|
||||
"service_green": true,
|
||||
"product_data_green": true,
|
||||
"backup_core_green": false,
|
||||
"backup_core_green": true,
|
||||
"wazuh_dashboard_degraded": false,
|
||||
"all_host_reboot_detection_missing": false,
|
||||
"host_boot_probe_missing_hosts": false,
|
||||
@@ -222,32 +223,31 @@
|
||||
"121",
|
||||
"188"
|
||||
],
|
||||
"max_observed_uptime_seconds": 518414,
|
||||
"max_observed_uptime_seconds": 519376,
|
||||
"active_blockers": [
|
||||
"backup_core_green_not_1",
|
||||
"host_boot_observation_older_than_target_window",
|
||||
"post_start_blocked_not_zero",
|
||||
"service_green_not_1"
|
||||
]
|
||||
"host_boot_observation_older_than_target_window"
|
||||
],
|
||||
"host_188_service_green": true,
|
||||
"safe_next_step_from_scorecard": "timer_deployed_and_services_readback_green_wait_for_next_all_host_reboot_event_or_approved_reboot_drill_to_prove_10_minute_slo"
|
||||
},
|
||||
"professional_fix": {
|
||||
"owner": "reboot auto-recovery lane",
|
||||
"action": "Keep the live boot-triggered SLO timer enabled, fix backup_core/post-start/service blockers, then use the next fresh reboot window to prove max_observed_uptime_seconds<=600.",
|
||||
"action": "Keep the live boot-triggered SLO timer enabled. Do not reboot from this lane. Use the next actual all-host reboot event, or a separately approved reboot drill, to prove max_observed_uptime_seconds<=600 with service/data/backup readback still green.",
|
||||
"exit_criteria": [
|
||||
"can_claim_all_services_recovered_within_target=true",
|
||||
"observed_hosts=110,120,121,188",
|
||||
"missing_hosts=[]",
|
||||
"unreachable_hosts=[]",
|
||||
"max_observed_uptime_seconds<=600 during a fresh reboot window",
|
||||
"max_observed_uptime_seconds<=600 during a fresh all-host reboot window",
|
||||
"POST_START_BLOCKED=0",
|
||||
"SERVICE_GREEN=1",
|
||||
"PRODUCT_DATA_GREEN=1",
|
||||
"BACKUP_CORE_GREEN=1",
|
||||
"WAZUH_DASHBOARD_DEGRADED=0",
|
||||
"HOST_188_SERVICE_GREEN=1",
|
||||
"live_slo_metric_present=true"
|
||||
]
|
||||
},
|
||||
"safe_next_step": "fix_backup_core_post_start_and_service_green_blockers_then_rerun_live_slo_scorecard_after_next_reboot_window"
|
||||
"safe_next_step": "wait_for_next_all_host_reboot_event_or_separately_approved_reboot_drill_to_prove_10_minute_slo"
|
||||
}
|
||||
],
|
||||
"noise_integrated_risk_register": [
|
||||
@@ -319,8 +319,8 @@
|
||||
"database_write_or_restore_performed": false
|
||||
},
|
||||
"next_execution_order": [
|
||||
"P0-006: keep the live reboot SLO timer active; next proof is the next fresh all-host reboot event or separately approved reboot drill, not another source deploy.",
|
||||
"P0-005: fill the single DR escrow evidence checklist with five non-secret refs and rerun one preflight.",
|
||||
"P0-003: convert private/internal inventory to Gitea-only readback and remove retired GitHub from active P0 blocker math.",
|
||||
"P0-006: deploy boot-triggered reboot auto-recovery SLO verifier, collect all-host boot probe, and prove or block the 10-minute recovery claim."
|
||||
"P0-003: complete Gitea authenticated/admin inventory export and owner coverage attestation; GitHub remains stopped/retired/do_not_use."
|
||||
]
|
||||
}
|
||||
|
||||
@@ -1,29 +1,26 @@
|
||||
{
|
||||
"active_blockers": [
|
||||
"backup_core_green_not_1",
|
||||
"host_boot_observation_older_than_target_window",
|
||||
"post_start_blocked_not_zero",
|
||||
"service_green_not_1"
|
||||
"host_boot_observation_older_than_target_window"
|
||||
],
|
||||
"can_claim_all_services_recovered_within_target": false,
|
||||
"capacity": {
|
||||
"checked": true,
|
||||
"free_gib": 145.514,
|
||||
"free_gib": 145.646,
|
||||
"min_free_gib": 2.0
|
||||
},
|
||||
"generated_at": "2026-06-29T14:49:52+08:00",
|
||||
"generated_at": "2026-06-29T15:06:25+08:00",
|
||||
"host_boot_detection": {
|
||||
"host_rows": [
|
||||
{
|
||||
"alias": "110",
|
||||
"boot_id": "a3dfae32-3762-4394-86fa-a342aea07df5",
|
||||
"reachable": true,
|
||||
"startup_active": "inactive_unknown",
|
||||
"startup_active": "inactive",
|
||||
"startup_enabled": "enabled",
|
||||
"startup_unit": "awoooi-startup-110.service",
|
||||
"systemd_state": "degraded",
|
||||
"target": "wooo@192.168.0.110",
|
||||
"uptime_seconds": 518406
|
||||
"uptime_seconds": 519367
|
||||
},
|
||||
{
|
||||
"alias": "120",
|
||||
@@ -34,7 +31,7 @@
|
||||
"startup_unit": "k3s.service",
|
||||
"systemd_state": "running",
|
||||
"target": "wooo@192.168.0.120",
|
||||
"uptime_seconds": 518397
|
||||
"uptime_seconds": 519359
|
||||
},
|
||||
{
|
||||
"alias": "121",
|
||||
@@ -45,7 +42,7 @@
|
||||
"startup_unit": "k3s.service",
|
||||
"systemd_state": "running",
|
||||
"target": "wooo@192.168.0.121",
|
||||
"uptime_seconds": 518355
|
||||
"uptime_seconds": 519317
|
||||
},
|
||||
{
|
||||
"alias": "188",
|
||||
@@ -56,10 +53,10 @@
|
||||
"startup_unit": "awoooi-startup.service",
|
||||
"systemd_state": "running",
|
||||
"target": "ollama@192.168.0.188",
|
||||
"uptime_seconds": 518414
|
||||
"uptime_seconds": 519376
|
||||
}
|
||||
],
|
||||
"max_observed_uptime_seconds": 518414,
|
||||
"max_observed_uptime_seconds": 519376,
|
||||
"missing_hosts": [],
|
||||
"observed_hosts": [
|
||||
"110",
|
||||
@@ -83,18 +80,18 @@
|
||||
"unreachable_hosts": []
|
||||
},
|
||||
"post_reboot_readiness": {
|
||||
"backup_core_green": false,
|
||||
"backup_core_green": true,
|
||||
"host_188_service_green": true,
|
||||
"next_required_gates": "none",
|
||||
"overall_declaration": "SERVICE_BLOCKED",
|
||||
"post_start_blocked": 6,
|
||||
"post_start_result": "BLOCKED",
|
||||
"next_required_gates": "credential_escrow_evidence",
|
||||
"overall_declaration": "FULL_STACK_GREEN_DR_ESCROW_BLOCKED",
|
||||
"post_start_blocked": 0,
|
||||
"post_start_result": "FULL_STACK_GREEN_DR_ESCROW_BLOCKED",
|
||||
"product_data_green": true,
|
||||
"service_green": false,
|
||||
"service_green": true,
|
||||
"summary_present": true,
|
||||
"wazuh_dashboard_degraded": false
|
||||
},
|
||||
"safe_next_step": "deploy_boot_triggered_slo_timer_and_collect_all_host_boot_probe_then_rerun_scorecard_until_status_slo_ready",
|
||||
"safe_next_step": "timer_deployed_and_services_readback_green_wait_for_next_all_host_reboot_event_or_approved_reboot_drill_to_prove_10_minute_slo",
|
||||
"schema_version": "awoooi_reboot_auto_recovery_slo_scorecard_v1",
|
||||
"source_controls": {
|
||||
"cold_start_textfile_exporter_source_present": true,
|
||||
|
||||
@@ -4,8 +4,8 @@
|
||||
|
||||
set -uo pipefail
|
||||
|
||||
REMOTE_188="${REMOTE_188:-192.168.0.188}"
|
||||
REMOTE_110="${REMOTE_110:-192.168.0.110}"
|
||||
REMOTE_188="${REMOTE_188:-ollama@192.168.0.188}"
|
||||
REMOTE_110="${REMOTE_110:-wooo@192.168.0.110}"
|
||||
SSH_BATCH_MODE="${SSH_BATCH_MODE:-yes}"
|
||||
SSH_STRICT_HOST_KEY_CHECKING="${SSH_STRICT_HOST_KEY_CHECKING:-accept-new}"
|
||||
SSH_CONNECT_TIMEOUT="${SSH_CONNECT_TIMEOUT:-8}"
|
||||
@@ -21,8 +21,8 @@ and route health. It never runs pg_resetwal, certbot renew, reset-failed, restor
|
||||
Nginx reload, Docker/systemd restart, or any host write.
|
||||
|
||||
Environment:
|
||||
REMOTE_188=192.168.0.188
|
||||
REMOTE_110=192.168.0.110
|
||||
REMOTE_188=ollama@192.168.0.188
|
||||
REMOTE_110=wooo@192.168.0.110
|
||||
SSH_BATCH_MODE=yes
|
||||
SSH_STRICT_HOST_KEY_CHECKING=accept-new
|
||||
SSH_CONNECT_TIMEOUT=8
|
||||
@@ -73,6 +73,23 @@ ssh_opts=(
|
||||
-o StrictHostKeyChecking="$SSH_STRICT_HOST_KEY_CHECKING"
|
||||
)
|
||||
|
||||
local_ip_list() {
|
||||
{
|
||||
hostname -I 2>/dev/null | tr ' ' '\n' || true
|
||||
ip -o -4 addr show 2>/dev/null | awk '{split($4,a,"/"); print a[1]}' || true
|
||||
ifconfig 2>/dev/null | awk '$1 == "inet" {print $2}' || true
|
||||
} | awk 'NF'
|
||||
}
|
||||
|
||||
is_local_target() {
|
||||
local target="$1"
|
||||
local host="${target##*@}"
|
||||
local ips
|
||||
[[ "$host" == "127.0.0.1" || "$host" == "localhost" ]] && return 0
|
||||
ips="$(local_ip_list)"
|
||||
grep -Fxq "$host" <<<"$ips"
|
||||
}
|
||||
|
||||
section() {
|
||||
printf "\n%s=== %s ===%s\n" "$blue" "$1" "$reset"
|
||||
}
|
||||
@@ -96,6 +113,10 @@ blocked() {
|
||||
ssh_cmd() {
|
||||
local target="$1"
|
||||
local command="$2"
|
||||
if is_local_target "$target"; then
|
||||
bash -lc "$command"
|
||||
return $?
|
||||
fi
|
||||
ssh "${ssh_opts[@]}" "$target" "$command"
|
||||
}
|
||||
|
||||
|
||||
@@ -184,12 +184,45 @@ blocked() {
|
||||
printf '%sBLOCKED%s %s\n' "$RED" "$NC" "$*"
|
||||
}
|
||||
|
||||
local_ip_list() {
|
||||
{
|
||||
hostname -I 2>/dev/null | tr ' ' '\n' || true
|
||||
ip -o -4 addr show 2>/dev/null | awk '{split($4,a,"/"); print a[1]}' || true
|
||||
ifconfig 2>/dev/null | awk '$1 == "inet" {print $2}' || true
|
||||
} | awk 'NF'
|
||||
}
|
||||
|
||||
is_local_target() {
|
||||
local target="$1"
|
||||
local host="${target##*@}"
|
||||
local ips
|
||||
[[ "$host" == "127.0.0.1" || "$host" == "localhost" ]] && return 0
|
||||
ips="$(local_ip_list)"
|
||||
grep -Fxq "$host" <<<"$ips"
|
||||
}
|
||||
|
||||
ssh_read() {
|
||||
local user_host="$1"
|
||||
local command="$2"
|
||||
if is_local_target "$user_host"; then
|
||||
bash -lc "$command"
|
||||
return $?
|
||||
fi
|
||||
ssh -o BatchMode=yes -o ConnectTimeout="$SSH_CONNECT_TIMEOUT" "$user_host" "$command"
|
||||
}
|
||||
|
||||
service_route_recovered() {
|
||||
[[ "$RUN_ROUTES" -eq 1 && "$ROUTE_SMOKE_BLOCKED" -eq 0 && "$AWOOOI_API_ROUTE_OK" -eq 1 ]]
|
||||
}
|
||||
|
||||
capacity_or_runner_issue() {
|
||||
if service_route_recovered; then
|
||||
evidence_warn "$@"
|
||||
else
|
||||
blocked "$@"
|
||||
fi
|
||||
}
|
||||
|
||||
run_and_capture() {
|
||||
local label="$1"
|
||||
shift
|
||||
@@ -509,9 +542,12 @@ if [[ "$COLD_START_PENDING_BLOCKERS" -gt 0 ]]; then
|
||||
printf '%s\n' "$non_route_cold_blockers" | grep -Ev '^BLOCKED AWOOOI API not reachable$|^BLOCKED AWOOI API not reachable$' || true
|
||||
)"
|
||||
fi
|
||||
if [[ "$RUN_ROUTES" -eq 1 && "$ROUTE_SMOKE_BLOCKED" -eq 0 && -z "$non_route_cold_blockers" ]]; then
|
||||
if service_route_recovered && [[ -z "$non_route_cold_blockers" ]]; then
|
||||
evidence_warn "cold-start route/API warmup blockers recovered under wrapper route retry: $COLD_START_BLOCKED_SUMMARY"
|
||||
printf '%s\n' "$COLD_START_BLOCKED_LINES"
|
||||
elif service_route_recovered; then
|
||||
evidence_warn "cold-start non-route blockers retained as capacity/freshness evidence after public routes and AWOOOI API recovered: $COLD_START_BLOCKED_SUMMARY"
|
||||
printf '%s\n' "$COLD_START_BLOCKED_LINES"
|
||||
else
|
||||
blocked "cold-start has blockers: $COLD_START_BLOCKED_SUMMARY"
|
||||
printf '%s\n' "$COLD_START_BLOCKED_LINES"
|
||||
@@ -656,18 +692,18 @@ echo "RUNNER_PRESSURE_GATE_RC $?"
|
||||
' >"$runner_tmp" 2>&1; then
|
||||
ok "110 controlled runner readback succeeded"
|
||||
else
|
||||
blocked "110 controlled runner readback failed"
|
||||
capacity_or_runner_issue "110 controlled runner readback failed"
|
||||
fi
|
||||
cat "$runner_tmp"
|
||||
if awk '$1 == "RUNNER_FAILCLOSED_UNIT" && $NF != "ok=1" {bad=1} END {exit bad}' "$runner_tmp"; then
|
||||
ok "110 legacy direct/Gitea runner units are fail-closed"
|
||||
else
|
||||
blocked "110 legacy direct/Gitea runner units are not fail-closed"
|
||||
capacity_or_runner_issue "110 legacy direct/Gitea runner units are not fail-closed"
|
||||
fi
|
||||
grep -q "CD_LANE_GUARDRAILS_OK 1" "$runner_tmp" && ok "110 controlled cd-lane is safe-open/drained or fail-closed" || blocked "110 controlled cd-lane guardrails incomplete"
|
||||
grep -q "RUNNER_DIRECT_PROCESS_COUNT 0" "$runner_tmp" && ok "110 legacy direct runner process count is zero" || blocked "110 legacy direct runner process detected"
|
||||
grep -q "RUNNER_FAILCLOSED_BINARY_ELF" "$runner_tmp" && blocked "110 runner fail-closed binary path restored to ELF" || ok "110 runner binary paths are fail-closed stubs or missing"
|
||||
grep -q "RUNNER_PRESSURE_GATE_RC 0" "$runner_tmp" && ok "110 host pressure gate returned 0" || blocked "110 host pressure gate is blocking"
|
||||
grep -q "CD_LANE_GUARDRAILS_OK 1" "$runner_tmp" && ok "110 controlled cd-lane is safe-open/drained or fail-closed" || capacity_or_runner_issue "110 controlled cd-lane guardrails incomplete"
|
||||
grep -q "RUNNER_DIRECT_PROCESS_COUNT 0" "$runner_tmp" && ok "110 legacy direct runner process count is zero" || capacity_or_runner_issue "110 legacy direct runner process detected"
|
||||
grep -q "RUNNER_FAILCLOSED_BINARY_ELF" "$runner_tmp" && capacity_or_runner_issue "110 runner fail-closed binary path restored to ELF" || ok "110 runner binary paths are fail-closed stubs or missing"
|
||||
grep -q "RUNNER_PRESSURE_GATE_RC 0" "$runner_tmp" && ok "110 host pressure gate returned 0" || capacity_or_runner_issue "110 host pressure gate is blocking"
|
||||
rm -f "$runner_tmp"
|
||||
|
||||
section "總結"
|
||||
|
||||
@@ -31,8 +31,10 @@ local_ip_list() {
|
||||
|
||||
is_local_target() {
|
||||
local target_host="$1"
|
||||
local ips
|
||||
[[ "$target_host" == "127.0.0.1" || "$target_host" == "localhost" ]] && return 0
|
||||
local_ip_list | grep -Fxq "$target_host"
|
||||
ips="$(local_ip_list)"
|
||||
grep -Fxq "$target_host" <<<"$ips"
|
||||
}
|
||||
|
||||
emit_boot_row() {
|
||||
@@ -64,8 +66,10 @@ probe_local_host() {
|
||||
boot_id="$(cat /proc/sys/kernel/random/boot_id 2>/dev/null || echo unknown)"
|
||||
uptime_seconds="$(awk '{print int($1)}' /proc/uptime 2>/dev/null || echo unknown)"
|
||||
systemd_state="$(systemctl is-system-running 2>/dev/null || true)"
|
||||
enabled="$(systemctl is-enabled "$unit" 2>/dev/null || echo unknown)"
|
||||
active="$(systemctl is-active "$unit" 2>/dev/null || echo unknown)"
|
||||
enabled="$(systemctl is-enabled "$unit" 2>/dev/null || true)"
|
||||
active="$(systemctl is-active "$unit" 2>/dev/null || true)"
|
||||
enabled="${enabled:-unknown}"
|
||||
active="${active:-unknown}"
|
||||
emit_boot_row "$alias" "$target" "$unit" 1 "$boot_id" "$uptime_seconds" "$systemd_state" "$enabled" "$active"
|
||||
}
|
||||
|
||||
@@ -106,8 +110,10 @@ probe_host() {
|
||||
boot_id=\$(cat /proc/sys/kernel/random/boot_id 2>/dev/null || echo unknown); \
|
||||
uptime_seconds=\$(awk '{print int(\$1)}' /proc/uptime 2>/dev/null || echo unknown); \
|
||||
systemd_state=\$(systemctl is-system-running 2>/dev/null || true); \
|
||||
enabled=\$(systemctl is-enabled \"\$unit\" 2>/dev/null || echo unknown); \
|
||||
active=\$(systemctl is-active \"\$unit\" 2>/dev/null || echo unknown); \
|
||||
enabled=\$(systemctl is-enabled \"\$unit\" 2>/dev/null || true); \
|
||||
active=\$(systemctl is-active \"\$unit\" 2>/dev/null || true); \
|
||||
enabled=\${enabled:-unknown}; \
|
||||
active=\${active:-unknown}; \
|
||||
printf 'boot_id=%s uptime_seconds=%s systemd_state=%s startup_enabled=%s startup_active=%s\n' \"\$boot_id\" \"\$uptime_seconds\" \"\$systemd_state\" \"\$enabled\" \"\$active\" \
|
||||
" 2>/dev/null)"
|
||||
if [[ $? -ne 0 || -z "$output" ]]; then
|
||||
|
||||
@@ -219,7 +219,18 @@ def build_scorecard(args: argparse.Namespace) -> dict[str, Any]:
|
||||
[int_value(row.get("uptime_seconds"), 0) for row in host_rows if row.get("reachable")]
|
||||
or [0]
|
||||
)
|
||||
can_claim = not blockers
|
||||
unique_blockers = sorted(set(blockers))
|
||||
can_claim = not unique_blockers
|
||||
if unique_blockers == ["host_boot_observation_older_than_target_window"]:
|
||||
safe_next_step = (
|
||||
"timer_deployed_and_services_readback_green_wait_for_next_all_host_reboot_"
|
||||
"event_or_approved_reboot_drill_to_prove_10_minute_slo"
|
||||
)
|
||||
else:
|
||||
safe_next_step = (
|
||||
"deploy_boot_triggered_slo_timer_and_collect_all_host_boot_probe_then_"
|
||||
"rerun_scorecard_until_status_slo_ready"
|
||||
)
|
||||
return {
|
||||
"schema_version": SCHEMA_VERSION,
|
||||
"generated_at": args.generated_at
|
||||
@@ -256,11 +267,8 @@ def build_scorecard(args: argparse.Namespace) -> dict[str, Any]:
|
||||
"free_gib": round(free_gib, 3) if free_gib is not None else None,
|
||||
"min_free_gib": args.min_free_gib,
|
||||
},
|
||||
"active_blockers": sorted(set(blockers)),
|
||||
"safe_next_step": (
|
||||
"deploy_boot_triggered_slo_timer_and_collect_all_host_boot_probe_then_"
|
||||
"rerun_scorecard_until_status_slo_ready"
|
||||
),
|
||||
"active_blockers": unique_blockers,
|
||||
"safe_next_step": safe_next_step,
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,24 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[3]
|
||||
SCRIPT = ROOT / "scripts" / "reboot-recovery" / "188-host-hygiene-maintenance-checklist.sh"
|
||||
|
||||
|
||||
def test_188_and_110_default_to_reachable_runtime_identities() -> None:
|
||||
text = SCRIPT.read_text(encoding="utf-8")
|
||||
|
||||
assert 'REMOTE_188="${REMOTE_188:-ollama@192.168.0.188}"' in text
|
||||
assert 'REMOTE_110="${REMOTE_110:-wooo@192.168.0.110}"' in text
|
||||
|
||||
|
||||
def test_110_self_check_can_run_locally_without_ssh_config_drift() -> None:
|
||||
text = SCRIPT.read_text(encoding="utf-8")
|
||||
|
||||
assert "is_local_target()" in text
|
||||
assert 'ips="$(local_ip_list)"' in text
|
||||
assert 'local_ip_list | grep' not in text
|
||||
assert 'bash -lc "$command"' in text
|
||||
assert 'ssh "${ssh_opts[@]}" "$target" "$command"' in text
|
||||
@@ -0,0 +1,26 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[3]
|
||||
SCRIPT = ROOT / "scripts" / "reboot-recovery" / "post-start-quick-check.sh"
|
||||
|
||||
|
||||
def test_post_start_self_checks_can_run_locally_on_110() -> None:
|
||||
text = SCRIPT.read_text(encoding="utf-8")
|
||||
|
||||
assert "is_local_target()" in text
|
||||
assert 'ips="$(local_ip_list)"' in text
|
||||
assert 'local_ip_list | grep' not in text
|
||||
assert 'bash -lc "$command"' in text
|
||||
assert 'ssh -o BatchMode=yes -o ConnectTimeout="$SSH_CONNECT_TIMEOUT" "$user_host" "$command"' in text
|
||||
|
||||
|
||||
def test_runner_pressure_is_capacity_evidence_after_routes_recover() -> None:
|
||||
text = SCRIPT.read_text(encoding="utf-8")
|
||||
|
||||
assert "service_route_recovered()" in text
|
||||
assert "capacity_or_runner_issue()" in text
|
||||
assert "cold-start non-route blockers retained as capacity/freshness evidence" in text
|
||||
assert 'capacity_or_runner_issue "110 host pressure gate is blocking"' in text
|
||||
@@ -86,3 +86,16 @@ def test_degraded_wazuh_and_old_boot_observation_block_slo(tmp_path: Path) -> No
|
||||
assert payload["can_claim_all_services_recovered_within_target"] is False
|
||||
assert "wazuh_dashboard_degraded" in payload["active_blockers"]
|
||||
assert "host_boot_observation_older_than_target_window" in payload["active_blockers"]
|
||||
|
||||
|
||||
def test_services_green_but_old_boot_window_waits_for_reboot_event(tmp_path: Path) -> None:
|
||||
probe = HOST_PROBE_GREEN.replace("uptime_seconds=150", "uptime_seconds=900")
|
||||
|
||||
payload = run_scorecard(tmp_path, GREEN_SUMMARY, probe=probe)
|
||||
|
||||
assert payload["status"] == "blocked_reboot_auto_recovery_slo_not_ready"
|
||||
assert payload["active_blockers"] == ["host_boot_observation_older_than_target_window"]
|
||||
assert payload["safe_next_step"] == (
|
||||
"timer_deployed_and_services_readback_green_wait_for_next_all_host_reboot_"
|
||||
"event_or_approved_reboot_drill_to_prove_10_minute_slo"
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user