ops(reboot): close 188 hygiene and dynamic post-reboot gates
This commit is contained in:
@@ -149,17 +149,35 @@ if out=$(ssh_cmd "$REMOTE_188" '
|
||||
pg_lsclusters 2>/dev/null || true
|
||||
systemctl status postgresql@14-main.service --no-pager || true
|
||||
echo "PG_ISREADY_LOCAL $(pg_isready -h localhost -p 5432 2>/dev/null || true)"
|
||||
echo "RECOVERY_CONTAINER $(docker inspect -f "{{.State.Running}} {{.HostConfig.NetworkMode}} {{.HostConfig.RestartPolicy.Name}}" k3s-postgres-recovery 2>/dev/null || echo missing)"
|
||||
' 2>&1); then
|
||||
echo "$out"
|
||||
recovery_container_ready=0
|
||||
if grep -q '^RECOVERY_CONTAINER true host ' <<<"$out" && grep -q 'PG_ISREADY_LOCAL .*accepting connections' <<<"$out"; then
|
||||
recovery_container_ready=1
|
||||
fi
|
||||
|
||||
if grep -Eq '^14[[:space:]]+main[[:space:]]+5432[[:space:]]+down' <<<"$out"; then
|
||||
blocked "host PostgreSQL cluster 14/main is down"
|
||||
if [[ "$recovery_container_ready" -eq 1 ]]; then
|
||||
warn "host PostgreSQL cluster 14/main is down, but controlled k3s-postgres-recovery runtime is accepting connections"
|
||||
else
|
||||
blocked "host PostgreSQL cluster 14/main is down and no controlled recovery runtime was accepted"
|
||||
fi
|
||||
else
|
||||
ok "host PostgreSQL cluster 14/main not reported down"
|
||||
fi
|
||||
|
||||
if grep -Eiq 'invalid primary checkpoint record|could not locate a valid checkpoint record|PANIC:' <<<"$out"; then
|
||||
blocked "PostgreSQL checkpoint/WAL error detected; pg_resetwal is break-glass only"
|
||||
if [[ "$recovery_container_ready" -eq 1 ]]; then
|
||||
warn "PostgreSQL checkpoint/WAL error remains historical host-cluster evidence; pg_resetwal is still break-glass only"
|
||||
else
|
||||
blocked "PostgreSQL checkpoint/WAL error detected; pg_resetwal is break-glass only"
|
||||
fi
|
||||
fi
|
||||
if grep -q 'PG_ISREADY_LOCAL .*accepting connections' <<<"$out"; then
|
||||
|
||||
if [[ "$recovery_container_ready" -eq 1 ]]; then
|
||||
ok "PostgreSQL runtime is provided by k3s-postgres-recovery on host network"
|
||||
elif grep -q 'PG_ISREADY_LOCAL .*accepting connections' <<<"$out"; then
|
||||
warn "pg_isready accepts on localhost; do not use this alone as host 14/main health"
|
||||
fi
|
||||
else
|
||||
@@ -169,12 +187,30 @@ fi
|
||||
|
||||
section "188 certbot / ACME"
|
||||
if out=$(ssh_cmd "$REMOTE_188" '
|
||||
systemctl status certbot.service --no-pager || true
|
||||
systemctl status snap.certbot.renew.service --no-pager || true
|
||||
systemctl show certbot.service snap.certbot.renew.service certbot.timer snap.certbot.renew.timer -p Id -p ActiveState -p SubState -p Result -p UnitFileState --no-pager || true
|
||||
systemctl list-timers --all --no-pager | grep -i certbot || true
|
||||
' 2>&1); then
|
||||
echo "$out"
|
||||
grep -Eiq 'rateLimited|Service busy' <<<"$out" && blocked "certbot renewal is rate-limited; do not retry blindly"
|
||||
grep -Eiq 'Some challenges have failed|challenge' <<<"$out" && blocked "certbot challenge failure requires DNS / ACME route owner evidence"
|
||||
if grep -q 'Id=certbot.service' <<<"$out" && grep -A3 'Id=certbot.service' <<<"$out" | grep -q 'Result=failed'; then
|
||||
blocked "apt certbot service currently failed"
|
||||
else
|
||||
ok "apt certbot service is not currently failed"
|
||||
fi
|
||||
if grep -q 'Id=snap.certbot.renew.service' <<<"$out" && grep -A3 'Id=snap.certbot.renew.service' <<<"$out" | grep -q 'Result=failed'; then
|
||||
blocked "snap certbot renew service currently failed"
|
||||
else
|
||||
ok "snap certbot renew service is not currently failed"
|
||||
fi
|
||||
if grep -A4 'Id=certbot.timer' <<<"$out" | grep -q 'UnitFileState=disabled'; then
|
||||
ok "legacy apt certbot timer disabled to avoid duplicate renewals"
|
||||
else
|
||||
warn "legacy apt certbot timer is not disabled"
|
||||
fi
|
||||
if grep -A4 'Id=snap.certbot.renew.timer' <<<"$out" | grep -q 'ActiveState=active' && grep -A4 'Id=snap.certbot.renew.timer' <<<"$out" | grep -q 'UnitFileState=enabled'; then
|
||||
ok "snap certbot renew timer enabled"
|
||||
else
|
||||
blocked "snap certbot renew timer is not enabled and active"
|
||||
fi
|
||||
else
|
||||
blocked "certbot status unavailable"
|
||||
echo "$out"
|
||||
@@ -223,7 +259,27 @@ else
|
||||
fi
|
||||
|
||||
section "Maintenance decision tree"
|
||||
cat <<'STEPS'
|
||||
if [ "$SERVICE_GREEN" -eq 1 ] && [ "$HOST_HYGIENE_BLOCKED" -eq 0 ]; then
|
||||
cat <<'STEPS'
|
||||
Current expected outcome:
|
||||
SERVICE_GREEN=1
|
||||
HOST_HYGIENE_BLOCKED=0
|
||||
RESULT=HOST_188_HYGIENE_GREEN
|
||||
|
||||
Allowed next step:
|
||||
1. Keep this host in the normal post-reboot summary.
|
||||
2. Wait for snap certbot timer / ACME-window readback before declaring formal certificate renewal success.
|
||||
3. Keep DR credential escrow and Wazuh registry evidence as separate blockers.
|
||||
|
||||
Forbidden without separate approval:
|
||||
- pg_resetwal
|
||||
- DB restore
|
||||
- Docker/systemd restart
|
||||
- firewall change
|
||||
- Wazuh active response or agent re-enroll
|
||||
STEPS
|
||||
else
|
||||
cat <<'STEPS'
|
||||
Current expected outcome when 188 service is green but host hygiene is not:
|
||||
SERVICE_GREEN=1
|
||||
HOST_HYGIENE_BLOCKED=1
|
||||
@@ -244,6 +300,7 @@ Forbidden without maintenance approval:
|
||||
- Docker/systemd restart
|
||||
- host file write
|
||||
STEPS
|
||||
fi
|
||||
|
||||
echo
|
||||
echo "SERVICE_GREEN=$SERVICE_GREEN"
|
||||
|
||||
@@ -14,8 +14,9 @@ Description=AWOOOI Auto-Startup Recovery Sequence
|
||||
After=network-online.target containerd.service docker.service
|
||||
Wants=network-online.target
|
||||
|
||||
# 確保 PostgreSQL 盡早嘗試啟動
|
||||
Wants=postgresql@14-main.service redis-server.service ollama.service nginx.service
|
||||
# PostgreSQL 可由受控 recovery container 提供;不得在 startup 階段硬拉
|
||||
# postgresql@14-main.service,避免與 recovery runtime 競爭或觸發假綠修復。
|
||||
Wants=redis-server.service ollama.service nginx.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
# 2026-04-04 ogt: 根據實際事故建立,處理 container / Docker 啟動順序與 K3s Kine 維護。
|
||||
# 2026-06-26 Codex: PostgreSQL checkpoint/WAL 錯誤改為 fail-closed;
|
||||
# 不在自動啟動腳本內執行 pg_resetwal,避免資料破壞被誤判成恢復。
|
||||
# 2026-06-26 Codex: 允許受控 recovery container 提供 14/main runtime;
|
||||
# 不再因 systemd postgresql@14-main failed 而誤判活 DB 為不可用。
|
||||
# 部署位置: /usr/local/bin/awoooi-startup.sh (on 192.168.0.188)
|
||||
# systemd unit: /etc/systemd/system/awoooi-startup.service
|
||||
|
||||
@@ -12,6 +14,22 @@ exec > >(tee -a "$LOG") 2>&1
|
||||
|
||||
# Write one timestamped line to stdout: "[YYYY-MM-DD HH:MM:SS] <message>".
# The message is all arguments joined by the first character of IFS
# (a space by default); the timestamp comes from date(1).
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
|
||||
|
||||
#######################################
# Report whether a PostgreSQL runtime for cluster 14/main is usable.
# Two sources are accepted, checked in this order:
#   1. the postgresql@14-main systemd unit reports active;
#   2. the k3s-postgres-recovery container is running with host
#      networking AND pg_isready succeeds against localhost:5432.
# Globals:   none
# Arguments: none
# Outputs:   one success line through log() naming the accepted source;
#            nothing is logged when neither source is accepted
# Returns:   0 when either source is accepted, 1 otherwise
#######################################
postgres_runtime_ready() {
  if systemctl is-active postgresql@14-main >/dev/null 2>&1; then
    log "✅ PostgreSQL systemd unit active"
    return 0
  fi

  # The container must be running on the host network; any other state
  # (missing container, stopped, non-host network) fails the exact match.
  if docker inspect -f '{{.State.Running}} {{.HostConfig.NetworkMode}}' k3s-postgres-recovery 2>/dev/null \
    | grep -q '^true host$'; then
    # A running container is not sufficient on its own: the server behind
    # localhost:5432 must also answer pg_isready.
    if pg_isready -h localhost -p 5432 >/dev/null 2>&1; then
      log "✅ PostgreSQL recovery container active on host network"
      return 0
    fi
  fi

  return 1
}
|
||||
|
||||
log "=== AWOOOI 啟動序列開始 ==="
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
@@ -73,20 +91,20 @@ fi
|
||||
# ──────────────────────────────────────────────
|
||||
log "[3/7] 檢查 PostgreSQL..."
|
||||
|
||||
if ! systemctl is-active postgresql@14-main >/dev/null 2>&1; then
|
||||
if ! postgres_runtime_ready; then
|
||||
log "PostgreSQL 未啟動,嘗試啟動..."
|
||||
systemctl start postgresql@14-main || true
|
||||
sleep 8
|
||||
fi
|
||||
|
||||
if ! systemctl is-active postgresql@14-main >/dev/null 2>&1; then
|
||||
if ! postgres_runtime_ready; then
|
||||
log "PostgreSQL 啟動失敗,檢查是否屬於 checkpoint/WAL 類資料層錯誤..."
|
||||
if journalctl -u postgresql@14-main -n 20 | grep -q "could not locate a valid checkpoint"; then
|
||||
log "❌ 偵測到 PostgreSQL checkpoint/WAL 錯誤;禁止自動 pg_resetwal。"
|
||||
log "需要 DB owner、備份/restore evidence、maintenance window 與 post-check 後才能人工處理。"
|
||||
exit 1
|
||||
fi
|
||||
systemctl is-active postgresql@14-main && log "✅ PostgreSQL 修復成功" || { log "❌ PostgreSQL 修復失敗"; exit 1; }
|
||||
postgres_runtime_ready && log "✅ PostgreSQL runtime 可用" || { log "❌ PostgreSQL 修復失敗"; exit 1; }
|
||||
fi
|
||||
|
||||
# 等待 PG 接受連線
|
||||
|
||||
@@ -23,7 +23,7 @@ OWNER_PACKET_GENERATOR = (
|
||||
)
|
||||
|
||||
EXPECTED_SCHEMA = "awoooi_post_reboot_next_gate_owner_packets_v1"
|
||||
EXPECTED_GATES = {
|
||||
KNOWN_GATES = {
|
||||
"credential_escrow_evidence",
|
||||
"host_188_hygiene_maintenance_window",
|
||||
"wazuh_manager_registry_export",
|
||||
@@ -187,12 +187,21 @@ def validate_packet(packet: dict[str, Any]) -> list[str]:
|
||||
counts = {}
|
||||
|
||||
gate_ids = {str(item.get("packet_id", "")) for item in owner_packets if isinstance(item, dict)}
|
||||
if gate_ids != EXPECTED_GATES:
|
||||
unknown_gates = sorted(gate_ids - KNOWN_GATES)
|
||||
if unknown_gates:
|
||||
failures.append(f"unknown_gate_ids={unknown_gates}")
|
||||
|
||||
source = packet.get("source", {})
|
||||
if not isinstance(source, dict):
|
||||
failures.append("source_not_object")
|
||||
source = {}
|
||||
expected_gates = set(str(item) for item in as_list(source.get("next_required_gates")))
|
||||
if expected_gates != gate_ids:
|
||||
failures.append(f"gate_ids={sorted(gate_ids)}")
|
||||
|
||||
expected_counts = {
|
||||
"next_gate_count": 3,
|
||||
"p0_gate_count": 3,
|
||||
"next_gate_count": len(gate_ids),
|
||||
"p0_gate_count": len(gate_ids),
|
||||
}
|
||||
for key, expected in expected_counts.items():
|
||||
if counts.get(key) != expected:
|
||||
|
||||
@@ -9,6 +9,8 @@ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
|
||||
SSH_CONNECT_TIMEOUT="${SSH_CONNECT_TIMEOUT:-6}"
|
||||
ROUTE_RETRY_ATTEMPTS="${ROUTE_RETRY_ATTEMPTS:-3}"
|
||||
ROUTE_RETRY_DELAY_SECONDS="${ROUTE_RETRY_DELAY_SECONDS:-2}"
|
||||
STOCK_FRESHNESS_RETRY_ATTEMPTS="${STOCK_FRESHNESS_RETRY_ATTEMPTS:-6}"
|
||||
STOCK_FRESHNESS_RETRY_DELAY_SECONDS="${STOCK_FRESHNESS_RETRY_DELAY_SECONDS:-5}"
|
||||
RUN_COLD_START=1
|
||||
RUN_MOMO=1
|
||||
RUN_STOCK=1
|
||||
@@ -76,6 +78,8 @@ Options:
|
||||
Environment:
|
||||
ROUTE_RETRY_ATTEMPTS Public route attempts before blocking. Default: 3.
|
||||
ROUTE_RETRY_DELAY_SECONDS Delay between failed public route attempts. Default: 2.
|
||||
STOCK_FRESHNESS_RETRY_ATTEMPTS Stock freshness attempts before blocking. Default: 6.
|
||||
STOCK_FRESHNESS_RETRY_DELAY_SECONDS Delay between failed Stock freshness attempts. Default: 5.
|
||||
|
||||
Exit codes:
|
||||
0 = no service blockers. Boundary / evidence warnings may still be present.
|
||||
@@ -277,9 +281,23 @@ fi
|
||||
if [[ "$RUN_STOCK" -eq 1 ]]; then
|
||||
section "StockPlatform freshness"
|
||||
stock_tmp="$(mktemp -t post-start-stock.XXXXXX)"
|
||||
stock_code="$(curl -k -sS -o "$stock_tmp" -w '%{http_code}' --max-time 12 "https://stock.wooo.work/api/v1/system/freshness" 2>/dev/null || true)"
|
||||
stock_code=""
|
||||
stock_attempt=1
|
||||
while [[ "$stock_attempt" -le "$STOCK_FRESHNESS_RETRY_ATTEMPTS" ]]; do
|
||||
stock_code="$(curl -k -sS -o "$stock_tmp" -w '%{http_code}' --max-time 12 "https://stock.wooo.work/api/v1/system/freshness" 2>/dev/null || true)"
|
||||
if [[ "$stock_code" == 2* ]]; then
|
||||
if [[ "$stock_attempt" -gt 1 ]]; then
|
||||
evidence_warn "StockPlatform freshness recovered after attempt=$stock_attempt"
|
||||
fi
|
||||
break
|
||||
fi
|
||||
if [[ "$stock_attempt" -lt "$STOCK_FRESHNESS_RETRY_ATTEMPTS" ]]; then
|
||||
sleep "$STOCK_FRESHNESS_RETRY_DELAY_SECONDS"
|
||||
fi
|
||||
stock_attempt=$((stock_attempt + 1))
|
||||
done
|
||||
if [[ "$stock_code" != 2* ]]; then
|
||||
blocked "StockPlatform freshness endpoint returned ${stock_code:-curl_failed}"
|
||||
blocked "StockPlatform freshness endpoint returned ${stock_code:-curl_failed} attempts=$STOCK_FRESHNESS_RETRY_ATTEMPTS"
|
||||
cat "$stock_tmp" || true
|
||||
else
|
||||
python3 - "$stock_tmp" <<'PY'
|
||||
|
||||
Reference in New Issue
Block a user