fix(recovery): quarantine corrupt docker metadata on boot
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 10m8s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped

This commit is contained in:
Your Name
2026-06-30 18:02:43 +08:00
parent c9ef44e050
commit 66ec3c92e5
3 changed files with 112 additions and 4 deletions

View File

@@ -11,6 +11,7 @@
- StockPlatform public edge / API 已恢復:`https://stock.wooo.work/healthz=200`、`/api/healthz=200`;但 freshness / ingestion 仍 `status=not_configured`,blocker 為 `postgres_not_ready`。
**source 修正**
- `awoooi-startup-110.sh` 新增 bounded Docker start、dockerd journal corrupt metadata ID 抽取、container metadata dir quarantine 與 TERM main process 後重啟 Docker 的受控恢復路徑,避免下次 dockerd 因 JSON metadata 損壞卡死 startup。
- `full-stack-cold-start-check.sh` 新增 registry 外部 `/v2/` gate 與 K3s `IMAGE_PULL_BLOCKED` / `REGISTRY_PULL_REFUSED_EVENTS` blocker。
- `full-stack-recovery-scorecard.sh` 新增 `CORE_REGISTRY_HTTPS_CODE`、`CORE_REGISTRY_HTTP_CODE`、`CORE_REGISTRY_READY`、`CORE_REGISTRY_BLOCKER`;registry `:5000` 未就緒時不允許 `CORE_READY`。

View File

@@ -6,15 +6,99 @@
#
# 已知問題處理:
# - Docker BoltDB 損壞 (network/files/local-kv.db, volumes/metadata.db)
# - Docker container metadata JSON 損壞,dockerd 卡在 startup / loading containers
# - 舊容器使用已不存在的 Docker network (需要 docker rm -f 全部)
# - Harbor nginx 依賴 harbor-log (需要等 harbor-log healthy 後才 compose up)
# Shell options: unset variables are errors (-u) and a pipeline fails when any
# stage fails (pipefail). errexit (-e) is not enabled here.
set -uo pipefail
# File that receives a copy of everything this script prints.
LOG="/var/log/awoooi-startup-110.log"
# Tunables, each overridable through the matching AWOOOI_* environment variable.
# Upper bound, in seconds, handed to run_bounded for `systemctl start`.
DOCKER_START_TIMEOUT_SECONDS="${AWOOOI_DOCKER_START_TIMEOUT_SECONDS:-45}"
# Number of `systemctl is-active docker` polls made by wait_docker_active.
DOCKER_ACTIVE_WAIT_ATTEMPTS="${AWOOOI_DOCKER_ACTIVE_WAIT_ATTEMPTS:-18}"
# Seconds slept after each unsuccessful poll.
DOCKER_ACTIVE_WAIT_SLEEP_SECONDS="${AWOOOI_DOCKER_ACTIVE_WAIT_SLEEP_SECONDS:-5}"
# From here on stdout and stderr both go through tee, which appends to $LOG
# while still passing the output through.
exec > >(tee -a "$LOG") 2>&1
# Print one message line prefixed with a "[YYYY-MM-DD HH:MM:SS]" timestamp.
# Arguments: the message words; they are joined with single spaces.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$*"
}
# Run a command under a wall-clock limit when timeout(1) is installed.
# Arguments: $1 - limit passed to timeout(1); remaining args - command to run.
# Returns:   the status of timeout(1), or of the command itself when
#            timeout(1) is unavailable and the command runs unbounded.
run_bounded() {
  local limit="$1"
  shift
  if ! command -v timeout >/dev/null 2>&1; then
    # No timeout(1) on this host: fall back to running the command directly.
    "$@"
    return
  fi
  timeout "$limit" "$@"
}
#######################################
# Ask systemd to start docker.socket and docker.service without letting the
# call block for longer than DOCKER_START_TIMEOUT_SECONDS.
# Globals:   DOCKER_START_TIMEOUT_SECONDS (read)
# Outputs:   progress, plus any non-zero start status, via log
# Returns:   always 0; this is best-effort and the surrounding script
#            re-checks `systemctl is-active docker` afterwards.
#######################################
start_docker_bounded() {
  local rc=0
  log "bounded start docker.service/docker.socket timeout=${DOCKER_START_TIMEOUT_SECONDS}s"
  run_bounded "$DOCKER_START_TIMEOUT_SECONDS" systemctl start docker.socket docker.service || rc=$?
  if [ "$rc" -ne 0 ]; then
    # Record the status instead of discarding it: timeout(1) exits 124 when it
    # had to stop the command, which tells a hung start apart from a failed one.
    log "bounded start docker exited rc=${rc} (124 = timed out); continuing"
  fi
  return 0
}
# Poll `systemctl is-active docker` until it reports "active".
# Globals:   DOCKER_ACTIVE_WAIT_ATTEMPTS (read) - number of polls
#            DOCKER_ACTIVE_WAIT_SLEEP_SECONDS (read) - pause after a failed poll
# Outputs:   one log line per poll with the observed state
# Returns:   0 as soon as the unit is active, 1 once every poll has failed
wait_docker_active() {
  local attempt unit_state
  for attempt in $(seq 1 "$DOCKER_ACTIVE_WAIT_ATTEMPTS"); do
    # is-active exits non-zero for any state other than active; keep the text.
    unit_state=$(systemctl is-active docker 2>/dev/null || true)
    log "Docker wait ${attempt}/${DOCKER_ACTIVE_WAIT_ATTEMPTS}: ${unit_state:-unknown}"
    if [ "$unit_state" = "active" ]; then
      return 0
    fi
    sleep "$DOCKER_ACTIVE_WAIT_SLEEP_SECONDS"
  done
  return 1
}
# Stop docker.service and docker.socket ahead of a restart, escalating to a
# TERM of the unit's main process when the bounded stop does not finish.
# Outputs: a log line when escalation is needed
# Returns: 0 (every step is best-effort)
stop_docker_bounded_for_metadata_recovery() {
  local unit_state
  run_bounded 25 systemctl stop docker.service docker.socket || true
  unit_state=$(systemctl is-active docker 2>/dev/null || true)
  case "$unit_state" in
    active|activating|deactivating)
      # The unit is still running or mid-transition: signal the main process,
      # give it a moment, then ask systemd to stop the units once more.
      log "Docker stop still ${unit_state}; TERM main process for metadata quarantine recovery"
      systemctl kill --kill-who=main --signal=TERM docker.service || true
      sleep 4
      run_bounded 20 systemctl stop docker.service docker.socket || true
      ;;
  esac
}
# Number of container metadata directories moved by the most recent call to
# quarantine_corrupt_docker_container_metadata.
DOCKER_CORRUPT_METADATA_QUARANTINED_COUNT=0
#######################################
# Move container metadata directories that the docker.service journal names in
# load/JSON errors into a timestamped quarantine directory.
# Globals:
#   AWOOOI_DOCKER_CONTAINERS_DIR (read, optional) - containers directory;
#     defaults to /var/lib/docker/containers
#   DOCKER_CORRUPT_METADATA_QUARANTINED_COUNT (written) - directories that
#     were actually moved; reset to 0 on entry
# Outputs: progress via log
# Returns: 0 normally; 1 when the quarantine directory cannot be created
#######################################
quarantine_corrupt_docker_container_metadata() {
  local containers_dir ids qdir id src count
  containers_dir="${AWOOOI_DOCKER_CONTAINERS_DIR:-/var/lib/docker/containers}"
  DOCKER_CORRUPT_METADATA_QUARANTINED_COUNT=0
  # awk only filters the error lines; the 64-hex ids are pulled out with
  # grep -oE because `{64}` interval expressions are not understood by every
  # awk implementation (older mawk treats the braces literally).
  ids=$(journalctl -u docker.service --since "-30 min" --no-pager 2>/dev/null \
    | awk '/failed to load container|invalid character|json: cannot unmarshal|unexpected EOF/' \
    | grep -oE '[0-9a-f]{64}' \
    | sort -u)
  if [ -z "$ids" ]; then
    log "Docker corrupt container metadata: journal 未找到可 quarantine 的 container id"
    return 0
  fi
  # NOTE(review): the quarantine directory is created inside the containers
  # directory itself; confirm dockerd tolerates a non-id entry there.
  qdir="$containers_dir/.awoooi-corrupt-metadata-quarantine-$(date +%Y%m%d-%H%M%S)"
  if ! mkdir -p "$qdir"; then
    log "Docker corrupt metadata quarantine dir create failed: $qdir"
    return 1
  fi
  count=0
  while IFS= read -r id; do
    # Defensive re-check: the id becomes part of a path that gets moved.
    case "$id" in
      *[!0-9a-f]*|"")
        log "跳過不合法 container id: $id"
        continue
        ;;
    esac
    src="$containers_dir/$id"
    if [ ! -d "$src" ]; then
      log "corrupt metadata candidate already absent: $id"
      continue
    fi
    # Count only successful moves: the surrounding script restarts Docker
    # when the count is non-zero, so a failed move must not be reported.
    if mv "$src" "$qdir/$id"; then
      count=$((count + 1))
      log "quarantine corrupt Docker container metadata: $id"
    else
      log "quarantine move failed, metadata left in place: $id"
    fi
  done <<<"$ids"
  DOCKER_CORRUPT_METADATA_QUARANTINED_COUNT="$count"
  log "Docker corrupt metadata quarantine dir=$qdir count=$count"
}
# Mark the beginning of the startup sequence in the log.
log "=== 192.168.0.110 啟動序列開始 ==="
# ──────────────────────────────────────────────
@@ -24,8 +108,18 @@ log "[1/5] 檢查 Docker..."
# If Docker is not active yet, try to start it and wait for it to come up.
# NOTE(review): an unbounded `systemctl start docker` with a fixed 8 s sleep
# and the bounded start/wait helpers both appear here, back to back; this
# looks like old and new diff lines shown together. Confirm which pair the
# real script keeps.
if ! systemctl is-active docker >/dev/null 2>&1; then
log "Docker 未啟動,嘗試啟動..."
systemctl start docker || true
sleep 8
start_docker_bounded
# A wait failure is tolerated here; the following checks re-test the unit state.
wait_docker_active || true
fi
# Second attempt: Docker is still not active, so look for container metadata
# that the journal reports as corrupt and move it out of the way.
if ! systemctl is-active docker >/dev/null 2>&1; then
log "Docker bounded start 未 active檢查 corrupt container metadata..."
quarantine_corrupt_docker_container_metadata
# Restart Docker only when something was actually quarantined; otherwise fall
# through to the next check with the unit state unchanged.
if [ "$DOCKER_CORRUPT_METADATA_QUARANTINED_COUNT" -gt 0 ]; then
stop_docker_bounded_for_metadata_recovery
start_docker_bounded
wait_docker_active || true
fi
fi
if ! systemctl is-active docker >/dev/null 2>&1; then
@@ -47,8 +141,8 @@ if ! systemctl is-active docker >/dev/null 2>&1; then
# 清除 buildkit 快取(也可能損壞)
find /var/lib/docker/buildkit -name "*.db" -delete 2>/dev/null || true
systemctl start docker
sleep 8
start_docker_bounded
wait_docker_active || true
systemctl is-active docker && log "✅ Docker 修復成功" || { log "❌ Docker 修復失敗"; exit 1; }
else
log "✅ Docker 已 active"

View File

@@ -5,6 +5,7 @@ from pathlib import Path
# Base directory for the script paths below: three levels above the directory
# containing this test file (presumably the repository root; confirm).
ROOT = Path(__file__).resolve().parents[3]
# Shell scripts under scripts/reboot-recovery/ whose text the tests inspect.
COLD_START_CHECK = ROOT / "scripts" / "reboot-recovery" / "full-stack-cold-start-check.sh"
STARTUP_110 = ROOT / "scripts" / "reboot-recovery" / "awoooi-startup-110.sh"
RECOVERY_SCORECARD = (
ROOT / "scripts" / "reboot-recovery" / "full-stack-recovery-scorecard.sh"
)
@@ -57,6 +58,18 @@ def test_full_stack_cold_start_check_tracks_registry_and_image_pull_blockers() -
assert "K3s registry pull refused by 110:5000" in text
def test_startup_110_quarantines_corrupt_docker_container_metadata() -> None:
    """Check that the 110 startup script still contains each piece of the
    bounded-start and corrupt-metadata quarantine recovery path."""
    script_source = STARTUP_110.read_text(encoding="utf-8")
    # Literal fragments of the shell script, checked in this order.
    required_snippets = (
        "AWOOOI_DOCKER_START_TIMEOUT_SECONDS",
        "quarantine_corrupt_docker_container_metadata()",
        "DOCKER_CORRUPT_METADATA_QUARANTINED_COUNT",
        "failed to load container|invalid character|json: cannot unmarshal|unexpected EOF",
        'mv "$src" "$qdir/$id"',
        "systemctl kill --kill-who=main --signal=TERM docker.service",
        'run_bounded "$DOCKER_START_TIMEOUT_SECONDS" systemctl start docker.socket docker.service',
    )
    for snippet in required_snippets:
        assert snippet in script_source
def test_cold_start_deploy_parity_verifier_bounds_ssh_readback() -> None:
text = VERIFY_DEPLOY.read_text(encoding="utf-8")