diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md
index 6660bbfd..38fd1c30 100644
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -11,6 +11,7 @@
 - StockPlatform public edge / API 已恢復:`https://stock.wooo.work/healthz=200`、`/api/healthz=200`;但 freshness / ingestion 仍 `status=not_configured`,blocker `postgres_not_ready`。
 
 **source 修正**:
+- `awoooi-startup-110.sh` 新增 bounded Docker start、dockerd journal corrupt metadata ID 抽取、container metadata dir quarantine 與 TERM main process 後重啟 Docker 的受控恢復路徑,避免下次 dockerd 因 JSON metadata 損壞卡死 startup。
 - `full-stack-cold-start-check.sh` 新增 registry 外部 `/v2/` gate 與 K3s `IMAGE_PULL_BLOCKED` / `REGISTRY_PULL_REFUSED_EVENTS` blocker。
 - `full-stack-recovery-scorecard.sh` 新增 `CORE_REGISTRY_HTTPS_CODE`、`CORE_REGISTRY_HTTP_CODE`、`CORE_REGISTRY_READY`、`CORE_REGISTRY_BLOCKER`,registry `:5000` 未就緒時不允許 `CORE_READY`。
 
diff --git a/scripts/reboot-recovery/awoooi-startup-110.sh b/scripts/reboot-recovery/awoooi-startup-110.sh
index ae8ab647..bc6a9a78 100644
--- a/scripts/reboot-recovery/awoooi-startup-110.sh
+++ b/scripts/reboot-recovery/awoooi-startup-110.sh
@@ -6,15 +6,120 @@
 #
 # 已知問題處理:
 #   - Docker BoltDB 損壞 (network/files/local-kv.db, volumes/metadata.db)
+#   - Corrupt Docker container metadata JSON; dockerd stuck in startup / loading containers
 #   - 舊容器使用已不存在的 Docker network (需要 docker rm -f 全部)
 #   - Harbor nginx 依賴 harbor-log (需要等 harbor-log healthy 後才 compose up)
 
 set -uo pipefail
 LOG="/var/log/awoooi-startup-110.log"
+# Bounded Docker start tunables; each may be overridden through its AWOOOI_* env var.
+DOCKER_START_TIMEOUT_SECONDS="${AWOOOI_DOCKER_START_TIMEOUT_SECONDS:-45}"
+DOCKER_ACTIVE_WAIT_ATTEMPTS="${AWOOOI_DOCKER_ACTIVE_WAIT_ATTEMPTS:-18}"
+DOCKER_ACTIVE_WAIT_SLEEP_SECONDS="${AWOOOI_DOCKER_ACTIVE_WAIT_SLEEP_SECONDS:-5}"
 exec > >(tee -a "$LOG") 2>&1
 
 log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }
 
+# Run "$@" under timeout(1) for at most $1 seconds when timeout is installed;
+# otherwise fall back to running the command unbounded.
+run_bounded() {
+    local seconds="$1"
+    shift
+    if command -v timeout >/dev/null 2>&1; then
+        timeout "$seconds" "$@"
+    else
+        "$@"
+    fi
+}
+
+# Start docker.socket and docker.service, waiting at most
+# DOCKER_START_TIMEOUT_SECONDS; failure is tolerated and re-checked by callers.
+start_docker_bounded() {
+    log "bounded start docker.service/docker.socket timeout=${DOCKER_START_TIMEOUT_SECONDS}s"
+    run_bounded "$DOCKER_START_TIMEOUT_SECONDS" systemctl start docker.socket docker.service || true
+}
+
+# Poll `systemctl is-active docker` up to DOCKER_ACTIVE_WAIT_ATTEMPTS times.
+# Returns 0 as soon as the unit is active, 1 once the attempts are exhausted.
+wait_docker_active() {
+    local i state
+    for i in $(seq 1 "$DOCKER_ACTIVE_WAIT_ATTEMPTS"); do
+        state=$(systemctl is-active docker 2>/dev/null || true)
+        log "Docker wait ${i}/${DOCKER_ACTIVE_WAIT_ATTEMPTS}: ${state:-unknown}"
+        [ "$state" = "active" ] && return 0
+        sleep "$DOCKER_ACTIVE_WAIT_SLEEP_SECONDS"
+    done
+    return 1
+}
+
+# Stop Docker within a bounded time; if the unit is still active or in
+# transition, send TERM to the main process and retry the bounded stop.
+stop_docker_bounded_for_metadata_recovery() {
+    local state
+    run_bounded 25 systemctl stop docker.service docker.socket || true
+    state=$(systemctl is-active docker 2>/dev/null || true)
+    if [ "$state" = "active" ] || [ "$state" = "activating" ] || [ "$state" = "deactivating" ]; then
+        log "Docker stop still ${state}; TERM main process for metadata quarantine recovery"
+        systemctl kill --kill-who=main --signal=TERM docker.service || true
+        sleep 4
+        run_bounded 20 systemctl stop docker.service docker.socket || true
+    fi
+}
+
+# Number of container metadata dirs moved by the most recent quarantine run.
+DOCKER_CORRUPT_METADATA_QUARANTINED_COUNT=0
+# Extract 64-hex container ids from docker.service journal lines of the last
+# 30 minutes that look like metadata load failures, then move the matching
+# dirs under /var/lib/docker/containers into a timestamped quarantine dir.
+# Sets DOCKER_CORRUPT_METADATA_QUARANTINED_COUNT to the number of dirs moved.
+# Returns 1 only when the quarantine dir cannot be created, 0 otherwise.
+quarantine_corrupt_docker_container_metadata() {
+    local ids qdir id src count
+    DOCKER_CORRUPT_METADATA_QUARANTINED_COUNT=0
+    # grep -E / -oE instead of awk match(): regex interval expressions such as
+    # {64} are not supported by every awk implementation (e.g. older mawk builds).
+    ids=$(journalctl -u docker.service --since "-30 min" --no-pager 2>/dev/null \
+        | grep -E 'failed to load container|invalid character|json: cannot unmarshal|unexpected EOF' \
+        | grep -oE '[0-9a-f]{64}' \
+        | sort -u)
+
+    if [ -z "$ids" ]; then
+        log "Docker corrupt container metadata: journal 未找到可 quarantine 的 container id"
+        return 0
+    fi
+
+    qdir="/var/lib/docker/containers/.awoooi-corrupt-metadata-quarantine-$(date +%Y%m%d-%H%M%S)"
+    # The script runs without `set -e`, so a failed mkdir has to be handled here.
+    if ! mkdir -p "$qdir"; then
+        log "Docker corrupt metadata quarantine dir create failed: $qdir"
+        return 1
+    fi
+    count=0
+    for id in $ids; do
+        case "$id" in
+            *[!0-9a-f]*|"")
+                log "跳過不合法 container id: $id"
+                continue
+                ;;
+        esac
+        src="/var/lib/docker/containers/$id"
+        if [ -d "$src" ]; then
+            # Count only moves that succeeded: the caller stops and restarts
+            # Docker when the count is > 0, which is pointless if nothing moved.
+            if mv "$src" "$qdir/$id"; then
+                count=$((count + 1))
+                log "quarantine corrupt Docker container metadata: $id"
+            else
+                log "quarantine failed for Docker container metadata: $id"
+            fi
+        else
+            log "corrupt metadata candidate already absent: $id"
+        fi
+    done
+    DOCKER_CORRUPT_METADATA_QUARANTINED_COUNT="$count"
+    log "Docker corrupt metadata quarantine dir=$qdir count=$count"
+}
+
 log "=== 192.168.0.110 啟動序列開始 ==="
 
 # ──────────────────────────────────────────────
@@ -24,8 +129,18 @@ log "[1/5] 檢查 Docker..."
 
 if ! systemctl is-active docker >/dev/null 2>&1; then
     log "Docker 未啟動,嘗試啟動..."
-    systemctl start docker || true
-    sleep 8
+    start_docker_bounded
+    wait_docker_active || true
+fi
+
+if ! systemctl is-active docker >/dev/null 2>&1; then
+    log "Docker bounded start 未 active,檢查 corrupt container metadata..."
+    quarantine_corrupt_docker_container_metadata
+    if [ "$DOCKER_CORRUPT_METADATA_QUARANTINED_COUNT" -gt 0 ]; then
+        stop_docker_bounded_for_metadata_recovery
+        start_docker_bounded
+        wait_docker_active || true
+    fi
 fi
 
 if ! systemctl is-active docker >/dev/null 2>&1; then
@@ -47,8 +162,8 @@ if ! systemctl is-active docker >/dev/null 2>&1; then
     # 清除 buildkit 快取(也可能損壞)
     find /var/lib/docker/buildkit -name "*.db" -delete 2>/dev/null || true
 
-    systemctl start docker
-    sleep 8
+    start_docker_bounded
+    wait_docker_active || true
     systemctl is-active docker && log "✅ Docker 修復成功" || { log "❌ Docker 修復失敗"; exit 1; }
 else
     log "✅ Docker 已 active"
diff --git a/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py b/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py
index 73bd71c4..f668b55d 100644
--- a/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py
+++ b/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py
@@ -5,6 +5,7 @@ from pathlib import Path
 
 ROOT = Path(__file__).resolve().parents[3]
 COLD_START_CHECK = ROOT / "scripts" / "reboot-recovery" / "full-stack-cold-start-check.sh"
+STARTUP_110 = ROOT / "scripts" / "reboot-recovery" / "awoooi-startup-110.sh"
 RECOVERY_SCORECARD = (
     ROOT / "scripts" / "reboot-recovery" / "full-stack-recovery-scorecard.sh"
 )
@@ -57,6 +58,20 @@ def test_full_stack_cold_start_check_tracks_registry_and_image_pull_blockers() -
     assert "K3s registry pull refused by 110:5000" in text
 
 
+def test_startup_110_quarantines_corrupt_docker_container_metadata() -> None:
+    """Pin the bounded-start and metadata-quarantine recovery path in the 110 startup script."""
+    text = STARTUP_110.read_text(encoding="utf-8")
+
+    assert "AWOOOI_DOCKER_START_TIMEOUT_SECONDS" in text
+    assert "quarantine_corrupt_docker_container_metadata()" in text
+    assert "DOCKER_CORRUPT_METADATA_QUARANTINED_COUNT" in text
+    assert "failed to load container|invalid character|json: cannot unmarshal|unexpected EOF" in text
+    assert "grep -oE '[0-9a-f]{64}'" in text
+    assert 'if mv "$src" "$qdir/$id"; then' in text
+    assert "systemctl kill --kill-who=main --signal=TERM docker.service" in text
+    assert 'run_bounded "$DOCKER_START_TIMEOUT_SECONDS" systemctl start docker.socket docker.service' in text
+
+
 def test_cold_start_deploy_parity_verifier_bounds_ssh_readback() -> None:
     text = VERIFY_DEPLOY.read_text(encoding="utf-8")
 