ops(reboot): close 188 hygiene and dynamic post-reboot gates
Some checks failed
Code Review / ai-code-review (push) Successful in 15s
Ansible / Reboot Recovery Contract / validate (push) Has been cancelled

This commit is contained in:
ogt
2026-06-26 12:39:55 +08:00
parent d8a68c742c
commit 71261c122e
12 changed files with 227 additions and 67 deletions

View File

@@ -149,17 +149,35 @@ if out=$(ssh_cmd "$REMOTE_188" '
pg_lsclusters 2>/dev/null || true
systemctl status postgresql@14-main.service --no-pager || true
echo "PG_ISREADY_LOCAL $(pg_isready -h localhost -p 5432 2>/dev/null || true)"
echo "RECOVERY_CONTAINER $(docker inspect -f "{{.State.Running}} {{.HostConfig.NetworkMode}} {{.HostConfig.RestartPolicy.Name}}" k3s-postgres-recovery 2>/dev/null || echo missing)"
' 2>&1); then
echo "$out"
recovery_container_ready=0
if grep -q '^RECOVERY_CONTAINER true host ' <<<"$out" && grep -q 'PG_ISREADY_LOCAL .*accepting connections' <<<"$out"; then
recovery_container_ready=1
fi
if grep -Eq '^14[[:space:]]+main[[:space:]]+5432[[:space:]]+down' <<<"$out"; then
blocked "host PostgreSQL cluster 14/main is down"
if [[ "$recovery_container_ready" -eq 1 ]]; then
warn "host PostgreSQL cluster 14/main is down, but controlled k3s-postgres-recovery runtime is accepting connections"
else
blocked "host PostgreSQL cluster 14/main is down and no controlled recovery runtime was accepted"
fi
else
ok "host PostgreSQL cluster 14/main not reported down"
fi
if grep -Eiq 'invalid primary checkpoint record|could not locate a valid checkpoint record|PANIC:' <<<"$out"; then
blocked "PostgreSQL checkpoint/WAL error detected; pg_resetwal is break-glass only"
if [[ "$recovery_container_ready" -eq 1 ]]; then
warn "PostgreSQL checkpoint/WAL error remains historical host-cluster evidence; pg_resetwal is still break-glass only"
else
blocked "PostgreSQL checkpoint/WAL error detected; pg_resetwal is break-glass only"
fi
fi
if grep -q 'PG_ISREADY_LOCAL .*accepting connections' <<<"$out"; then
if [[ "$recovery_container_ready" -eq 1 ]]; then
ok "PostgreSQL runtime is provided by k3s-postgres-recovery on host network"
elif grep -q 'PG_ISREADY_LOCAL .*accepting connections' <<<"$out"; then
warn "pg_isready accepts on localhost; do not use this alone as host 14/main health"
fi
else
@@ -169,12 +187,30 @@ fi
section "188 certbot / ACME"
if out=$(ssh_cmd "$REMOTE_188" '
systemctl status certbot.service --no-pager || true
systemctl status snap.certbot.renew.service --no-pager || true
systemctl show certbot.service snap.certbot.renew.service certbot.timer snap.certbot.renew.timer -p Id -p ActiveState -p SubState -p Result -p UnitFileState --no-pager || true
systemctl list-timers --all --no-pager | grep -i certbot || true
' 2>&1); then
echo "$out"
grep -Eiq 'rateLimited|Service busy' <<<"$out" && blocked "certbot renewal is rate-limited; do not retry blindly"
grep -Eiq 'Some challenges have failed|challenge' <<<"$out" && blocked "certbot challenge failure requires DNS / ACME route owner evidence"
if grep -q 'Id=certbot.service' <<<"$out" && grep -A3 'Id=certbot.service' <<<"$out" | grep -q 'Result=failed'; then
blocked "apt certbot service currently failed"
else
ok "apt certbot service is not currently failed"
fi
if grep -q 'Id=snap.certbot.renew.service' <<<"$out" && grep -A3 'Id=snap.certbot.renew.service' <<<"$out" | grep -q 'Result=failed'; then
blocked "snap certbot renew service currently failed"
else
ok "snap certbot renew service is not currently failed"
fi
if grep -A4 'Id=certbot.timer' <<<"$out" | grep -q 'UnitFileState=disabled'; then
ok "legacy apt certbot timer disabled to avoid duplicate renewals"
else
warn "legacy apt certbot timer is not disabled"
fi
if grep -A4 'Id=snap.certbot.renew.timer' <<<"$out" | grep -q 'ActiveState=active' && grep -A4 'Id=snap.certbot.renew.timer' <<<"$out" | grep -q 'UnitFileState=enabled'; then
ok "snap certbot renew timer enabled"
else
blocked "snap certbot renew timer is not enabled and active"
fi
else
blocked "certbot status unavailable"
echo "$out"
@@ -223,7 +259,27 @@ else
fi
section "Maintenance decision tree"
cat <<'STEPS'
if [ "$SERVICE_GREEN" -eq 1 ] && [ "$HOST_HYGIENE_BLOCKED" -eq 0 ]; then
cat <<'STEPS'
Current expected outcome:
SERVICE_GREEN=1
HOST_HYGIENE_BLOCKED=0
RESULT=HOST_188_HYGIENE_GREEN
Allowed next step:
1. Keep this host in the normal post-reboot summary.
2. Wait for snap certbot timer / ACME-window readback before declaring formal certificate renewal success.
3. Keep DR credential escrow and Wazuh registry evidence as separate blockers.
Forbidden without separate approval:
- pg_resetwal
- DB restore
- Docker/systemd restart
- firewall change
- Wazuh active response or agent re-enroll
STEPS
else
cat <<'STEPS'
Current expected outcome when 188 service is green but host hygiene is not:
SERVICE_GREEN=1
HOST_HYGIENE_BLOCKED=1
@@ -244,6 +300,7 @@ Forbidden without maintenance approval:
- Docker/systemd restart
- host file write
STEPS
fi
echo
echo "SERVICE_GREEN=$SERVICE_GREEN"

View File

@@ -14,8 +14,9 @@ Description=AWOOOI Auto-Startup Recovery Sequence
After=network-online.target containerd.service docker.service
Wants=network-online.target
# 確保 PostgreSQL 盡早嘗試啟動
Wants=postgresql@14-main.service redis-server.service ollama.service nginx.service
# PostgreSQL 可由受控 recovery container 提供;不得在 startup 階段硬拉
# postgresql@14-main.service,避免與 recovery runtime 競爭或觸發假綠修復。
Wants=redis-server.service ollama.service nginx.service
[Service]
Type=oneshot

View File

@@ -3,6 +3,8 @@
# 2026-04-04 ogt: 根據實際事故建立,處理 container / Docker 啟動順序與 K3s Kine 維護。
# 2026-06-26 Codex: PostgreSQL checkpoint/WAL 錯誤改為 fail-closed
# 不在自動啟動腳本內執行 pg_resetwal,避免資料破壞被誤判成恢復。
# 2026-06-26 Codex: 允許受控 recovery container 提供 14/main runtime
# 不再因 systemd postgresql@14-main failed 而誤判活 DB 為不可用。
# 部署位置: /usr/local/bin/awoooi-startup.sh (on 192.168.0.188)
# systemd unit: /etc/systemd/system/awoooi-startup.service
@@ -12,6 +14,22 @@ exec > >(tee -a "$LOG") 2>&1
# Print a message prefixed with the current local time as
# "[YYYY-MM-DD HH:MM:SS] ". All arguments are joined into one message.
log() {
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  printf '[%s] %s\n' "$stamp" "$*"
}
# Report whether a PostgreSQL runtime is available on this host.
#
# Returns 0 (and logs which source was accepted) when either:
#   1. the postgresql@14-main systemd unit is active, or
#   2. the k3s-postgres-recovery container is running with host networking
#      AND pg_isready reports localhost:5432 as ready.
# Returns 1 otherwise, without logging.
postgres_runtime_ready() {
  # First choice: the host systemd unit.
  if systemctl is-active postgresql@14-main >/dev/null 2>&1; then
    log "✅ PostgreSQL systemd unit active"
    return 0
  fi

  # Fallback: the recovery container. It must be running on the host network,
  # i.e. docker inspect prints exactly "true host".
  docker inspect -f '{{.State.Running}} {{.HostConfig.NetworkMode}}' k3s-postgres-recovery 2>/dev/null \
    | grep -q '^true host$' || return 1

  # The container state alone is not enough; the port must pass pg_isready.
  pg_isready -h localhost -p 5432 >/dev/null 2>&1 || return 1

  log "✅ PostgreSQL recovery container active on host network"
  return 0
}
log "=== AWOOOI 啟動序列開始 ==="
# ──────────────────────────────────────────────
@@ -73,20 +91,20 @@ fi
# ──────────────────────────────────────────────
log "[3/7] 檢查 PostgreSQL..."
if ! systemctl is-active postgresql@14-main >/dev/null 2>&1; then
if ! postgres_runtime_ready; then
log "PostgreSQL 未啟動,嘗試啟動..."
systemctl start postgresql@14-main || true
sleep 8
fi
if ! systemctl is-active postgresql@14-main >/dev/null 2>&1; then
if ! postgres_runtime_ready; then
log "PostgreSQL 啟動失敗,檢查是否屬於 checkpoint/WAL 類資料層錯誤..."
if journalctl -u postgresql@14-main -n 20 | grep -q "could not locate a valid checkpoint"; then
log "❌ 偵測到 PostgreSQL checkpoint/WAL 錯誤;禁止自動 pg_resetwal。"
log "需要 DB owner、備份/restore evidence、maintenance window 與 post-check 後才能人工處理。"
exit 1
fi
systemctl is-active postgresql@14-main && log "✅ PostgreSQL 修復成功" || { log "❌ PostgreSQL 修復失敗"; exit 1; }
postgres_runtime_ready && log "✅ PostgreSQL runtime 可用" || { log "❌ PostgreSQL 修復失敗"; exit 1; }
fi
# 等待 PG 接受連線

View File

@@ -23,7 +23,7 @@ OWNER_PACKET_GENERATOR = (
)
EXPECTED_SCHEMA = "awoooi_post_reboot_next_gate_owner_packets_v1"
EXPECTED_GATES = {
KNOWN_GATES = {
"credential_escrow_evidence",
"host_188_hygiene_maintenance_window",
"wazuh_manager_registry_export",
@@ -187,12 +187,21 @@ def validate_packet(packet: dict[str, Any]) -> list[str]:
counts = {}
gate_ids = {str(item.get("packet_id", "")) for item in owner_packets if isinstance(item, dict)}
if gate_ids != EXPECTED_GATES:
unknown_gates = sorted(gate_ids - KNOWN_GATES)
if unknown_gates:
failures.append(f"unknown_gate_ids={unknown_gates}")
source = packet.get("source", {})
if not isinstance(source, dict):
failures.append("source_not_object")
source = {}
expected_gates = set(str(item) for item in as_list(source.get("next_required_gates")))
if expected_gates != gate_ids:
failures.append(f"gate_ids={sorted(gate_ids)}")
expected_counts = {
"next_gate_count": 3,
"p0_gate_count": 3,
"next_gate_count": len(gate_ids),
"p0_gate_count": len(gate_ids),
}
for key, expected in expected_counts.items():
if counts.get(key) != expected:

View File

@@ -9,6 +9,8 @@ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
SSH_CONNECT_TIMEOUT="${SSH_CONNECT_TIMEOUT:-6}"
ROUTE_RETRY_ATTEMPTS="${ROUTE_RETRY_ATTEMPTS:-3}"
ROUTE_RETRY_DELAY_SECONDS="${ROUTE_RETRY_DELAY_SECONDS:-2}"
STOCK_FRESHNESS_RETRY_ATTEMPTS="${STOCK_FRESHNESS_RETRY_ATTEMPTS:-6}"
STOCK_FRESHNESS_RETRY_DELAY_SECONDS="${STOCK_FRESHNESS_RETRY_DELAY_SECONDS:-5}"
RUN_COLD_START=1
RUN_MOMO=1
RUN_STOCK=1
@@ -76,6 +78,8 @@ Options:
Environment:
ROUTE_RETRY_ATTEMPTS Public route attempts before blocking. Default: 3.
ROUTE_RETRY_DELAY_SECONDS Delay between failed public route attempts. Default: 2.
STOCK_FRESHNESS_RETRY_ATTEMPTS Stock freshness attempts before blocking. Default: 6.
STOCK_FRESHNESS_RETRY_DELAY_SECONDS Delay between failed Stock freshness attempts. Default: 5.
Exit codes:
0 = no service blockers. Boundary / evidence warnings may still be present.
@@ -277,9 +281,23 @@ fi
if [[ "$RUN_STOCK" -eq 1 ]]; then
section "StockPlatform freshness"
stock_tmp="$(mktemp -t post-start-stock.XXXXXX)"
stock_code="$(curl -k -sS -o "$stock_tmp" -w '%{http_code}' --max-time 12 "https://stock.wooo.work/api/v1/system/freshness" 2>/dev/null || true)"
stock_code=""
stock_attempt=1
while [[ "$stock_attempt" -le "$STOCK_FRESHNESS_RETRY_ATTEMPTS" ]]; do
stock_code="$(curl -k -sS -o "$stock_tmp" -w '%{http_code}' --max-time 12 "https://stock.wooo.work/api/v1/system/freshness" 2>/dev/null || true)"
if [[ "$stock_code" == 2* ]]; then
if [[ "$stock_attempt" -gt 1 ]]; then
evidence_warn "StockPlatform freshness recovered after attempt=$stock_attempt"
fi
break
fi
if [[ "$stock_attempt" -lt "$STOCK_FRESHNESS_RETRY_ATTEMPTS" ]]; then
sleep "$STOCK_FRESHNESS_RETRY_DELAY_SECONDS"
fi
stock_attempt=$((stock_attempt + 1))
done
if [[ "$stock_code" != 2* ]]; then
blocked "StockPlatform freshness endpoint returned ${stock_code:-curl_failed}"
blocked "StockPlatform freshness endpoint returned ${stock_code:-curl_failed} attempts=$STOCK_FRESHNESS_RETRY_ATTEMPTS"
cat "$stock_tmp" || true
else
python3 - "$stock_tmp" <<'PY'