fix(recovery): quarantine corrupt docker metadata on boot
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 10m8s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 10m8s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
This commit is contained in:
@@ -11,6 +11,7 @@
|
||||
- StockPlatform public edge / API 已恢復:`https://stock.wooo.work/healthz=200`、`/api/healthz=200`;但 freshness / ingestion 仍 `status=not_configured`,blocker `postgres_not_ready`。
|
||||
|
||||
**source 修正**:
|
||||
- `awoooi-startup-110.sh` 新增 bounded Docker start、dockerd journal corrupt metadata ID 抽取、container metadata dir quarantine 與 TERM main process 後重啟 Docker 的受控恢復路徑,避免下次 dockerd 因 JSON metadata 損壞卡死 startup。
|
||||
- `full-stack-cold-start-check.sh` 新增 registry 外部 `/v2/` gate 與 K3s `IMAGE_PULL_BLOCKED` / `REGISTRY_PULL_REFUSED_EVENTS` blocker。
|
||||
- `full-stack-recovery-scorecard.sh` 新增 `CORE_REGISTRY_HTTPS_CODE`、`CORE_REGISTRY_HTTP_CODE`、`CORE_REGISTRY_READY`、`CORE_REGISTRY_BLOCKER`,registry `:5000` 未就緒時不允許 `CORE_READY`。
|
||||
|
||||
|
||||
@@ -6,15 +6,99 @@
|
||||
#
|
||||
# 已知問題處理:
|
||||
# - Docker BoltDB 損壞 (network/files/local-kv.db, volumes/metadata.db)
|
||||
# - Docker container metadata JSON 損壞,dockerd 卡在 startup / loading containers
|
||||
# - 舊容器使用已不存在的 Docker network (需要 docker rm -f 全部)
|
||||
# - Harbor nginx 依賴 harbor-log (需要等 harbor-log healthy 後才 compose up)
|
||||
|
||||
# Treat unset variables as errors and make a pipeline fail when any stage
# fails. errexit (-e) is not enabled here, so a failing command does not
# abort the script by itself.
set -uo pipefail
|
||||
LOG="/var/log/awoooi-startup-110.log"
|
||||
# Time limit handed to run_bounded for the `systemctl start` of Docker.
# Override with AWOOOI_DOCKER_START_TIMEOUT_SECONDS; defaults to 45.
DOCKER_START_TIMEOUT_SECONDS="${AWOOOI_DOCKER_START_TIMEOUT_SECONDS:-45}"
|
||||
# Number of `systemctl is-active docker` probes made by wait_docker_active.
# Override with AWOOOI_DOCKER_ACTIVE_WAIT_ATTEMPTS; defaults to 18.
DOCKER_ACTIVE_WAIT_ATTEMPTS="${AWOOOI_DOCKER_ACTIVE_WAIT_ATTEMPTS:-18}"
|
||||
# Pause passed to `sleep` between probes in wait_docker_active.
# Override with AWOOOI_DOCKER_ACTIVE_WAIT_SLEEP_SECONDS; defaults to 5.
DOCKER_ACTIVE_WAIT_SLEEP_SECONDS="${AWOOOI_DOCKER_ACTIVE_WAIT_SLEEP_SECONDS:-5}"
|
||||
# From here on, send stdout and stderr of the script through `tee -a`, which
# appends everything to $LOG while still passing it on to the original stdout.
exec > >(tee -a "$LOG") 2>&1
|
||||
|
||||
# Print one timestamped log line: "[YYYY-MM-DD HH:MM:SS] <all arguments joined by spaces>".
log() {
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  printf '[%s] %s\n' "$stamp" "$*"
}
|
||||
|
||||
#######################################
# Run an external command under a hard time limit.
#
# When the limit expires the command receives TERM; if it is still alive
# after a grace period it receives KILL, so a command that ignores TERM can
# no longer block the startup sequence indefinitely.
#
# Globals:
#   AWOOOI_RUN_BOUNDED_KILL_AFTER_SECONDS (read, optional) - grace period
#     between TERM and KILL; defaults to 5.
# Arguments:
#   $1  - time limit, in any duration format `timeout` accepts
#   $2… - external command and its arguments (shell functions cannot be run
#         through `timeout`)
# Returns:
#   The command's exit status; 124 when the limit expired and TERM ended the
#   command; 137 when KILL was needed; 2 when called without a command.
#   If `timeout` is not installed the command runs unbounded (best effort).
#######################################
run_bounded() {
  if [ "$#" -lt 2 ]; then
    echo "run_bounded: usage: run_bounded SECONDS COMMAND [ARG...]" >&2
    return 2
  fi

  local seconds="$1"
  shift
  local kill_after="${AWOOOI_RUN_BOUNDED_KILL_AFTER_SECONDS:-5}"

  if command -v timeout >/dev/null 2>&1; then
    # -k escalates to KILL so the bound holds even for TERM-ignoring commands.
    timeout -k "$kill_after" "$seconds" "$@"
  else
    "$@"
  fi
}
|
||||
|
||||
#######################################
# Ask systemd to start docker.socket and docker.service, bounded by
# DOCKER_START_TIMEOUT_SECONDS.
#
# The start is best effort: a failure or timeout never aborts the caller,
# but the exit status is written to the log so a timed-out start can be told
# apart from a clean one.
#
# Globals:   DOCKER_START_TIMEOUT_SECONDS (read)
# Returns:   always 0
#######################################
start_docker_bounded() {
  local rc=0
  log "bounded start docker.service/docker.socket timeout=${DOCKER_START_TIMEOUT_SECONDS}s"
  run_bounded "$DOCKER_START_TIMEOUT_SECONDS" systemctl start docker.socket docker.service || rc=$?
  if [ "$rc" -ne 0 ]; then
    log "bounded docker start exited rc=${rc} (124 = time limit reached); continuing"
  fi
  return 0
}
|
||||
|
||||
#######################################
# Poll `systemctl is-active docker` until it reports "active".
#
# Globals:
#   DOCKER_ACTIVE_WAIT_ATTEMPTS (read)      - maximum number of probes
#   DOCKER_ACTIVE_WAIT_SLEEP_SECONDS (read) - pause between two probes
# Returns:
#   0 as soon as Docker is active, 1 when every probe has been used up.
#######################################
wait_docker_active() {
  local attempts="$DOCKER_ACTIVE_WAIT_ATTEMPTS"
  local i state

  # A non-numeric override would otherwise result in zero probes.
  if [[ "$attempts" =~ ^[0-9]+$ ]]; then
    attempts=$((10#$attempts)) # force base 10 so "08" is not read as octal
  else
    log "Docker wait: invalid attempt count '${attempts}', using 18"
    attempts=18
  fi

  for ((i = 1; i <= attempts; i++)); do
    state=$(systemctl is-active docker 2>/dev/null || true)
    log "Docker wait ${i}/${attempts}: ${state:-unknown}"
    if [ "$state" = "active" ]; then
      return 0
    fi
    # No pause after the final probe: nothing is left to wait for.
    if ((i < attempts)); then
      sleep "$DOCKER_ACTIVE_WAIT_SLEEP_SECONDS"
    fi
  done
  return 1
}
|
||||
|
||||
#######################################
# Stop Docker ahead of a restart that follows a metadata quarantine.
#
# First issues a bounded `systemctl stop`. If the unit is then still reported
# as active, activating or deactivating, sends TERM to the service's main
# process, waits briefly and issues a second bounded stop. Every step is
# best effort; failures are ignored.
#
# Returns: always 0
#######################################
stop_docker_bounded_for_metadata_recovery() {
  local unit_state

  run_bounded 25 systemctl stop docker.service docker.socket || true
  unit_state="$(systemctl is-active docker 2>/dev/null || true)"

  case "$unit_state" in
    active | activating | deactivating) ;;
    *) return 0 ;; # already down, nothing left to do
  esac

  log "Docker stop still ${unit_state}; TERM main process for metadata quarantine recovery"
  systemctl kill --kill-who=main --signal=TERM docker.service || true
  sleep 4
  run_bounded 20 systemctl stop docker.service docker.socket || true
}
|
||||
|
||||
DOCKER_CORRUPT_METADATA_QUARANTINED_COUNT=0
|
||||
#######################################
# Move container metadata directories that dockerd reported as unreadable
# out of the way.
#
# Scans the last 30 minutes of the docker.service journal for load/JSON
# errors, extracts every 64-character lowercase hex ID on those lines and
# moves the matching directory under the containers root into a timestamped
# quarantine directory. The quarantine directory is created only when there
# is something to move, and only successful moves are counted, so the caller
# restarts Docker only when something really changed.
#
# Globals:
#   AWOOOI_DOCKER_CONTAINERS_DIR (read, optional) - containers root;
#     defaults to /var/lib/docker/containers.
#   DOCKER_CORRUPT_METADATA_QUARANTINED_COUNT (written) - number of
#     directories moved by this call.
# Returns:
#   0 normally; 1 when the quarantine directory could not be created.
#
# NOTE(review): the quarantine directory lives inside the containers root,
# next to the real container directories - confirm dockerd tolerates the
# extra entry when it scans that directory.
#######################################
quarantine_corrupt_docker_container_metadata() {
  local containers_dir="${AWOOOI_DOCKER_CONTAINERS_DIR:-/var/lib/docker/containers}"
  local ids qdir id src count
  DOCKER_CORRUPT_METADATA_QUARANTINED_COUNT=0

  # grep -E is used for the {64} interval because not every awk supports
  # interval expressions. `|| true`: no matching line is not an error.
  ids=$(journalctl -u docker.service --since "-30 min" --no-pager 2>/dev/null \
    | grep -aE 'failed to load container|invalid character|json: cannot unmarshal|unexpected EOF' \
    | grep -aoE '[0-9a-f]{64}' \
    | sort -u) || true

  if [ -z "$ids" ]; then
    log "Docker corrupt container metadata: journal 未找到可 quarantine 的 container id"
    return 0
  fi

  qdir="${containers_dir}/.awoooi-corrupt-metadata-quarantine-$(date +%Y%m%d-%H%M%S)"
  count=0

  while IFS= read -r id; do
    # Defence in depth: only a full 64-char lowercase hex ID may become a path.
    if [[ ! "$id" =~ ^[0-9a-f]{64}$ ]]; then
      log "跳過不合法 container id: $id"
      continue
    fi

    src="${containers_dir}/$id"
    if [ ! -d "$src" ]; then
      log "corrupt metadata candidate already absent: $id"
      continue
    fi

    # Create the quarantine directory lazily so no empty one is left behind.
    if [ ! -d "$qdir" ] && ! mkdir -p "$qdir"; then
      log "cannot create Docker corrupt metadata quarantine dir=$qdir"
      DOCKER_CORRUPT_METADATA_QUARANTINED_COUNT="$count"
      return 1
    fi

    if mv "$src" "$qdir/$id"; then
      count=$((count + 1))
      log "quarantine corrupt Docker container metadata: $id"
    else
      log "failed to quarantine Docker container metadata: $id"
    fi
  done <<< "$ids"

  DOCKER_CORRUPT_METADATA_QUARANTINED_COUNT="$count"
  log "Docker corrupt metadata quarantine dir=$qdir count=$count"
}
|
||||
|
||||
log "=== 192.168.0.110 啟動序列開始 ==="
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
@@ -24,8 +108,18 @@ log "[1/5] 檢查 Docker..."
|
||||
|
||||
if ! systemctl is-active docker >/dev/null 2>&1; then
|
||||
log "Docker 未啟動,嘗試啟動..."
|
||||
systemctl start docker || true
|
||||
sleep 8
|
||||
start_docker_bounded
|
||||
wait_docker_active || true
|
||||
fi
|
||||
|
||||
if ! systemctl is-active docker >/dev/null 2>&1; then
|
||||
log "Docker bounded start 未 active,檢查 corrupt container metadata..."
|
||||
quarantine_corrupt_docker_container_metadata
|
||||
if [ "$DOCKER_CORRUPT_METADATA_QUARANTINED_COUNT" -gt 0 ]; then
|
||||
stop_docker_bounded_for_metadata_recovery
|
||||
start_docker_bounded
|
||||
wait_docker_active || true
|
||||
fi
|
||||
fi
|
||||
|
||||
if ! systemctl is-active docker >/dev/null 2>&1; then
|
||||
@@ -47,8 +141,8 @@ if ! systemctl is-active docker >/dev/null 2>&1; then
|
||||
# 清除 buildkit 快取(也可能損壞)
|
||||
find /var/lib/docker/buildkit -name "*.db" -delete 2>/dev/null || true
|
||||
|
||||
systemctl start docker
|
||||
sleep 8
|
||||
start_docker_bounded
|
||||
wait_docker_active || true
|
||||
systemctl is-active docker && log "✅ Docker 修復成功" || { log "❌ Docker 修復失敗"; exit 1; }
|
||||
else
|
||||
log "✅ Docker 已 active"
|
||||
|
||||
@@ -5,6 +5,7 @@ from pathlib import Path
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[3]
|
||||
COLD_START_CHECK = ROOT / "scripts" / "reboot-recovery" / "full-stack-cold-start-check.sh"
|
||||
STARTUP_110 = ROOT / "scripts" / "reboot-recovery" / "awoooi-startup-110.sh"
|
||||
RECOVERY_SCORECARD = (
|
||||
ROOT / "scripts" / "reboot-recovery" / "full-stack-recovery-scorecard.sh"
|
||||
)
|
||||
@@ -57,6 +58,18 @@ def test_full_stack_cold_start_check_tracks_registry_and_image_pull_blockers() -
|
||||
assert "K3s registry pull refused by 110:5000" in text
|
||||
|
||||
|
||||
def test_startup_110_quarantines_corrupt_docker_container_metadata() -> None:
    """Check that the 110 startup script still contains the corrupt-metadata recovery markers."""
    script_source = STARTUP_110.read_text(encoding="utf-8")

    # Each entry is a literal snippet that must appear verbatim in the script.
    required_snippets = (
        "AWOOOI_DOCKER_START_TIMEOUT_SECONDS",
        "quarantine_corrupt_docker_container_metadata()",
        "DOCKER_CORRUPT_METADATA_QUARANTINED_COUNT",
        "failed to load container|invalid character|json: cannot unmarshal|unexpected EOF",
        'mv "$src" "$qdir/$id"',
        "systemctl kill --kill-who=main --signal=TERM docker.service",
        'run_bounded "$DOCKER_START_TIMEOUT_SECONDS" systemctl start docker.socket docker.service',
    )
    for snippet in required_snippets:
        assert snippet in script_source
|
||||
|
||||
|
||||
def test_cold_start_deploy_parity_verifier_bounds_ssh_readback() -> None:
|
||||
text = VERIFY_DEPLOY.read_text(encoding="utf-8")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user