From a15ab298ff45d37ee29f4cf23c11d9d2b69e16c2 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 2 Jul 2026 00:18:17 +0800 Subject: [PATCH] fix(ops): close post-reboot recovery guardrails --- docs/LOGBOOK.md | 34 ++ ...recovery-readback-2026-07-01.snapshot.json | 151 +++++++++ docs/runbooks/FULL-STACK-COLD-START-SOP.md | 45 +++ .../full-stack-cold-start-baseline.yml | 11 +- .../docker-disk-pressure-retention-cleanup.py | 305 ++++++++++++++++++ ..._docker_disk_pressure_retention_cleanup.py | 110 +++++++ 6 files changed, 654 insertions(+), 2 deletions(-) create mode 100644 docs/operations/post-reboot-runtime-recovery-readback-2026-07-01.snapshot.json create mode 100755 scripts/ops/docker-disk-pressure-retention-cleanup.py create mode 100644 scripts/ops/tests/test_docker_disk_pressure_retention_cleanup.py diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index d2ab15a6..c7e61a60 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -51934,6 +51934,40 @@ production browser smoke: - 沒有使用 GitHub / gh / GitHub API / GitHub Actions。 - 沒有重啟主機,沒有 Docker / Nginx / K3s / DB restart,沒有 workflow_dispatch,沒有 DROP / TRUNCATE / restore / prune。 +## 2026-07-02 — 00:17 P0 2026-07-01 post-reboot runtime recovery GREEN 與 SLO miss 留證 + +**完成內容**: +- 188 `momo-db` 補上 Docker resource guardrail:`--cpus=2 --memory=4g --memory-swap=6g`;live inspect 讀回 `nanocpus=2000000000 memory=4294967296 memswap=6442450944 restart=unless-stopped`,`https://mo.wooo.work/health` 回 `status=healthy / database=postgresql / version=V10.725`。 +- 110 `sentry-self-hosted-relay-1` memory-limit pressure 修復:live inspect 讀回 `health=healthy memory=3221225472 memswap=4294967296`,`https://sentry.wooo.work/_health/` 回 `ok`。 +- 新增 `scripts/ops/docker-disk-pressure-retention-cleanup.py` 與測試,提供 dry-run first、bounded、非 destructive 的 Docker disk-pressure cleanup;明確禁止 `docker system prune`,不碰 containers / volumes / DB / backups。 +- 110 disk pressure 從約 `85%` 降到 Prometheus `79.6829%`;188 disk pressure 從約 `85%` 降到 Prometheus 
`79.2250%`。 +- 重新安裝 110 reboot auto-recovery SLO textfile,scope 從舊 `110_120_121_188` 修正為 `99_110_111_112_120_121_188`;live textfile 讀回 `awoooi_reboot_event_detected=1`,`RebootEventDetectorMissing` 已清除。 +- Prometheus firing alerts 從 7 個降到 2 個,只剩 `HostRebootEventDetected` 與 `RebootAutoRecoverySLOMissed`;disk/resource/missing-detector 類告警已清除。 +- 全主機 cold-start scorecard 已重新跑:`PASS=96 WARN=0 BLOCKED=0`,result `GREEN`。 +- `ops/reboot-recovery/full-stack-cold-start-baseline.yml` 已補 source baseline:110 Sentry relay `3g/4g swap`,188 `momo-db 2CPU/4G/6G swap/unless-stopped`,避免 live 修復下次回退。 +- 新增 machine-readable receipt:`docs/operations/post-reboot-runtime-recovery-readback-2026-07-01.snapshot.json`。 +- `docs/runbooks/FULL-STACK-COLD-START-SOP.md` 新增 §16.3,沉澱本次 resource / disk / detector / full-stack GREEN 與 SLO miss 宣告界線。 + +**本地驗證結果**: +- `python3.11 -m pytest scripts/ops/tests/test_docker_disk_pressure_retention_cleanup.py -q`:`6 passed`。 +- `python3.11 -m pytest scripts/ops/tests/test_docker_disk_pressure_retention_cleanup.py scripts/ops/tests/test_host_pressure_alert_contract.py -q`:`9 passed`。 +- `python3.11 -m py_compile scripts/ops/docker-disk-pressure-retention-cleanup.py scripts/ops/tests/test_docker_disk_pressure_retention_cleanup.py`:通過。 +- live full-stack:`SSH_CONNECT_TIMEOUT=8 bash scripts/reboot-recovery/full-stack-cold-start-check.sh --monitor-read-only --no-color` → `PASS=96 WARN=0 BLOCKED=0`。 + +**live truth**: +- 可以宣稱目前 runtime full-stack recovered / GREEN。 +- 不可宣稱本次重啟達成 10 分鐘 SLO;Prometheus 保留 `RebootAutoRecoverySLOMissed` 是正確 incident evidence,不應消音。 +- `https://aiops.wooo.work/api/v1/agents/reboot-auto-recovery-slo-scorecard` 仍可能是 stale snapshot;目前以 live textfile / Prometheus / cold-start scorecard 為 runtime truth。 + +**仍維持**: +- 沒有讀 secret / token / `.env` / raw sessions / SQLite / auth;沒有讀 `.runner` 內容。 +- 沒有使用 GitHub / gh / GitHub API / GitHub Actions。 +- 沒有重啟主機,沒有 Docker / Nginx / K3s / DB / firewall restart,沒有 workflow_dispatch,沒有 DROP / TRUNCATE / 
restore / prune / remote delete。 + +**下一步**: +- rebase 到 Gitea main 最新 deploy marker,commit / push 本次 source 修法與 receipt。 +- 讀回 Gitea CD / Prometheus;下一條主線是把 `RebootAutoRecoverySLOMissed` 的 6 個 blocker 轉成可自動判斷與可回滾 remediation,不消音告警。 + ## 2026-07-01 — 23:28 P0 110 sustained CPU pressure alert / controlled quota / alert-chain readback **完成內容**: diff --git a/docs/operations/post-reboot-runtime-recovery-readback-2026-07-01.snapshot.json b/docs/operations/post-reboot-runtime-recovery-readback-2026-07-01.snapshot.json new file mode 100644 index 00000000..89f7cadf --- /dev/null +++ b/docs/operations/post-reboot-runtime-recovery-readback-2026-07-01.snapshot.json @@ -0,0 +1,151 @@ +{ + "schema_version": "awoooi_post_reboot_runtime_recovery_readback_v1", + "generated_at": "2026-07-02T00:17:36+08:00", + "scope": "99_110_111_112_120_121_188", + "purpose": "Record live post-reboot recovery evidence after 2026-07-01 all-host reboot and prevent source drift.", + "boundaries": { + "read_secret_values": false, + "read_env_files": false, + "read_raw_sessions": false, + "read_sqlite": false, + "used_github": false, + "rebooted_hosts": false, + "restarted_docker_nginx_k3s_db_firewall": false, + "used_docker_system_prune": false, + "touched_containers": false, + "touched_volumes": false, + "touched_databases": false, + "touched_backups": false + }, + "live_actions": [ + { + "host": "188", + "target": "momo-db", + "action": "docker update --cpus=2 --memory=4g --memory-swap=6g momo-db", + "blast_radius": "container resource guardrail only; no container restart", + "readback": { + "running": true, + "nanocpus": 2000000000, + "memory_bytes": 4294967296, + "memswap_bytes": 6442450944, + "restart_policy": "unless-stopped", + "public_health": "https://mo.wooo.work/health returned status=healthy database=postgresql version=V10.725" + } + }, + { + "host": "110", + "target": "sentry-self-hosted-relay-1", + "action": "docker update --memory=3g --memory-swap=4g sentry-self-hosted-relay-1", + 
"blast_radius": "container resource guardrail only; no container restart", + "readback": { + "running": true, + "health": "healthy", + "memory_bytes": 3221225472, + "memswap_bytes": 4294967296, + "public_health": "https://sentry.wooo.work/_health/ returned ok" + } + }, + { + "host": "110", + "target": "root filesystem Docker pressure", + "action": "bounded dangling image and BuildKit cache cleanup with scripts/ops/docker-disk-pressure-retention-cleanup.py", + "blast_radius": "unreferenced dangling images and builder cache only", + "readback": { + "filesystem_used_percent_after_prometheus": 79.68293081715491, + "direct_df_after": "/dev/mapper/ubuntu--vg-ubuntu--lv ext4 983G Used 743G Avail 200G Use% 79% /", + "docker_images_after": "92.91GB total, 29.81GB reclaimable", + "docker_build_cache_after": "43.57GB total, 12.59GB reclaimable" + } + }, + { + "host": "188", + "target": "root filesystem Docker pressure", + "action": "bounded BuildKit cache cleanup with scripts/ops/docker-disk-pressure-retention-cleanup.py", + "blast_radius": "builder cache only in final zero-age pass", + "readback": { + "filesystem_used_percent_after_prometheus": 79.22498388062007, + "direct_df_after": "/dev/mapper/ubuntu--vg-ubuntu--lv ext4 982G Used 737G Avail 204G Use% 79% /", + "docker_images_after": "42.93GB total, 14.96GB reclaimable", + "docker_build_cache_after": "33.31GB total, 42.82MB reclaimable" + } + }, + { + "host": "110", + "target": "reboot auto recovery SLO textfile", + "action": "bash scripts/reboot-recovery/install-reboot-auto-recovery-slo-110.sh", + "blast_radius": "script/timer/textfile install and verifier run only", + "readback": { + "scope": "99_110_111_112_120_121_188", + "awoooi_reboot_event_detected": 1, + "awoooi_reboot_auto_recovery_slo_ready": 0, + "awoooi_reboot_auto_recovery_slo_blocker_count": 6, + "awoooi_reboot_event_target_seconds_remaining": 0 + } + } + ], + "prometheus_readback": { + "alerts_after_recovery": [ + { + "alertname": "HostRebootEventDetected", 
+ "severity": "warning", + "host": "110" + }, + { + "alertname": "RebootAutoRecoverySLOMissed", + "severity": "critical", + "scope": "99_110_111_112_120_121_188" + } + ], + "cleared_alert_classes": [ + "DockerContainerMissingResourceLimit", + "DockerContainerMemoryLimitPressure", + "HostOutOfDiskSpace", + "HostDiskUsageHigh", + "RebootEventDetectorMissing" + ], + "filesystem_used_percent": { + "110": 79.68293081715491, + "188": 79.22498388062007 + } + }, + "cold_start_scorecard": { + "command": "SSH_CONNECT_TIMEOUT=8 bash scripts/reboot-recovery/full-stack-cold-start-check.sh --monitor-read-only --no-color", + "pass": 96, + "warn": 0, + "blocked": 0, + "result": "GREEN", + "runtime_truth": "Full stack is ready for controlled runner/CD release.", + "declaration_limit": "This proves current runtime recovery, not that the 10-minute reboot SLO was met." + }, + "source_updates": [ + "scripts/ops/docker-disk-pressure-retention-cleanup.py", + "scripts/ops/tests/test_docker_disk_pressure_retention_cleanup.py", + "ops/reboot-recovery/full-stack-cold-start-baseline.yml", + "docs/runbooks/FULL-STACK-COLD-START-SOP.md", + "docs/LOGBOOK.md" + ], + "local_verification": [ + { + "command": "python3.11 -m pytest scripts/ops/tests/test_docker_disk_pressure_retention_cleanup.py -q", + "result": "6 passed" + }, + { + "command": "python3.11 -m pytest scripts/ops/tests/test_docker_disk_pressure_retention_cleanup.py scripts/ops/tests/test_host_pressure_alert_contract.py -q", + "result": "9 passed" + }, + { + "command": "python3.11 -m py_compile scripts/ops/docker-disk-pressure-retention-cleanup.py scripts/ops/tests/test_docker_disk_pressure_retention_cleanup.py", + "result": "passed" + } + ], + "remaining_truth": { + "runtime_full_stack_green": true, + "ten_minute_slo_met_for_this_reboot": false, + "why": "Reboot event was detected after the target window had elapsed, so Prometheus correctly keeps RebootAutoRecoverySLOMissed as incident evidence.", + "next_mainline": [ + "Persist 
resource guardrails in host compose/source where owned by the matching repo.", + "Replace stale reboot SLO API snapshot with live textfile/Prometheus backed readback.", + "Convert the six reboot SLO blockers into automatic preflight/remediation steps without silencing the SLO miss." + ] + } +} diff --git a/docs/runbooks/FULL-STACK-COLD-START-SOP.md b/docs/runbooks/FULL-STACK-COLD-START-SOP.md index daa714c5..a816aeb1 100644 --- a/docs/runbooks/FULL-STACK-COLD-START-SOP.md +++ b/docs/runbooks/FULL-STACK-COLD-START-SOP.md @@ -2427,3 +2427,48 @@ DATABASE_URL=sqlite+aiosqlite:////tmp/awoooi-codex-api-test.db PYTHONPATH=apps/a bash -n scripts/ci/wait-host-web-build-pressure.sh scripts/ops/systemd-units-textfile-exporter.py python3.11 -m pytest scripts/ops/tests/test_systemd_units_textfile_exporter.py ops/runner/test_cd_controlled_runtime_profile.py -q ``` + +### 16.3 2026-07-01 post-reboot resource / disk / detector closure + +2026-07-01 全主機重啟後,runtime 問題分成四條:188 `momo-db` 缺 Docker resource limit、110 Sentry relay memory-limit pressure、110 / 188 root filesystem 超過 disk-pressure 門檻、以及 reboot detector scope 仍停在舊 `110_120_121_188`。這些不是噪音;disk pressure 會阻塞 Docker / registry / build path,resource-limit pressure 會讓告警長期失真,detector scope 錯誤會讓「有沒有偵測到重啟」本身不可信。 + +已完成 live 修復: + +| Layer | 修復 | 讀回證據 | +|-------|------|----------| +| 188 `momo-db` guardrail | `docker update --cpus=2 --memory=4g --memory-swap=6g momo-db` | `nanocpus=2000000000 memory=4294967296 memswap=6442450944 restart=unless-stopped` | +| 110 Sentry relay guardrail | `docker update --memory=3g --memory-swap=4g sentry-self-hosted-relay-1` | `health=healthy memory=3221225472 memswap=4294967296`,`https://sentry.wooo.work/_health/` 回 `ok` | +| 110 disk pressure | 只清 unreferenced dangling images 與 bounded builder cache;不使用 `docker system prune` | `/` 從約 `85%` 降到 Prometheus `79.6829%` | +| 188 disk pressure | builder-cache-only follow-up;不碰 containers / volumes / DB / backups | `/` 從約 `85%` 降到 Prometheus `79.2250%` | +| 
reboot detector | 安裝新版 auto-recovery SLO textfile scope | `awoooi_reboot_event_detected{scope="99_110_111_112_120_121_188"} 1` | +| full-stack scorecard | 重新跑全主機 cold-start | `PASS=96 WARN=0 BLOCKED=0`,result `GREEN` | + +新的 source-of-truth: + +- `scripts/ops/docker-disk-pressure-retention-cleanup.py` +- `scripts/ops/tests/test_docker_disk_pressure_retention_cleanup.py` +- `ops/reboot-recovery/full-stack-cold-start-baseline.yml` +- `docs/operations/post-reboot-runtime-recovery-readback-2026-07-01.snapshot.json` + +Docker disk pressure cleanup 規則: + +- 預設 dry-run;只有明確 `--apply` 才執行。 +- 不使用 `docker system prune`。 +- 不刪除 containers、volumes、databases、backups、logs。 +- dangling image cleanup 只允許移除未被任何 container 參照、沒有 tag、超過 age gate 的 image,且保留最新 N 個。 +- BuildKit cache cleanup 必須明確加 `--include-builder-cache`,並設定 `--builder-keep-storage`。 +- `--min-age-hours=0` 只允許和 `--skip-dangling-images` 一起使用,避免無 age gate 刪 image。 + +標準命令: + +```bash +python3 scripts/ops/docker-disk-pressure-retention-cleanup.py --host-label 110 --include-builder-cache +python3 scripts/ops/docker-disk-pressure-retention-cleanup.py --host-label 110 --apply --min-age-hours 24 --keep-dangling-newest 20 --include-builder-cache --builder-keep-storage 30GB +python3 scripts/ops/docker-disk-pressure-retention-cleanup.py --host-label 188 --skip-dangling-images --apply --min-age-hours 0 --include-builder-cache --builder-keep-storage 1GB +``` + +宣告限制: + +- 可以宣稱:這次回讀的 runtime full-stack 已 GREEN,110 / 188 disk pressure、momo-db missing resource limit、Sentry relay memory pressure、RebootEventDetectorMissing 已解除。 +- 不可宣稱:這次重啟有達成 10 分鐘 SLO。Prometheus 仍正確保留 `RebootAutoRecoverySLOMissed`,代表本次事件已 missed,必須做為後續自動化改善 evidence,而不是消音。 +- 不可用 stale API snapshot 覆蓋 live truth;`https://aiops.wooo.work/api/v1/agents/reboot-auto-recovery-slo-scorecard` 若 `generated_at` 舊於最新 textfile / Prometheus readback,只能列為 stale artifact。 diff --git a/ops/reboot-recovery/full-stack-cold-start-baseline.yml 
b/ops/reboot-recovery/full-stack-cold-start-baseline.yml index c824273a..013a55bc 100644 --- a/ops/reboot-recovery/full-stack-cold-start-baseline.yml +++ b/ops/reboot-recovery/full-stack-cold-start-baseline.yml @@ -289,9 +289,10 @@ resource_guardrails: persist_in: /opt/sentry/docker-compose.override.yml sentry_self_hosted_memory_limits: taskscheduler_mem_limit: 1g - relay_mem_limit: 2g + relay_mem_limit: 3g + relay_memswap_limit: 4g persist_in: /opt/sentry/docker-compose.override.yml - note: "taskscheduler/relay 不得回退到 512m/1g 造成長期 >85% memory-limit pressure;110 主機仍以 ClickHouse/Kafka/Snuba CPU caps 防止冷啟動過載。" + note: "taskscheduler 不得回退到 512m/1g;relay 不得回退到 2g 或以下造成長期 >85% memory-limit pressure;110 主機仍以 ClickHouse/Kafka/Snuba CPU caps 防止冷啟動過載。" actions_runner_systemd: cpu_quota: 200% memory_max: 2G @@ -310,6 +311,12 @@ resource_guardrails: momo_scheduler: cpus: 2.0 memory: 2G + momo_db: + cpus: 2.0 + memory: 4G + memswap: 6G + restart_policy: unless-stopped + note: "2026-07-01 live recovery 將 momo-db 補上 Docker resource guardrail;不得回退到 nanocpus=0 / memory=0 / memswap=0。" signoz_clickhouse: memory: 24G note: do_not_lower_during_merge_backlog diff --git a/scripts/ops/docker-disk-pressure-retention-cleanup.py b/scripts/ops/docker-disk-pressure-retention-cleanup.py new file mode 100755 index 00000000..b8722df9 --- /dev/null +++ b/scripts/ops/docker-disk-pressure-retention-cleanup.py @@ -0,0 +1,305 @@ +#!/usr/bin/env python3 +"""Bounded Docker disk-pressure cleanup for host reboot recovery. + +This controller intentionally avoids `docker system prune`, volumes, containers, +running images, databases, backups, and logs. It only removes dangling images +that are not referenced by any container, and can optionally run a bounded +BuildKit cache cleanup with an explicit keep-storage floor. 
# Defaults for the retention policy; each one can be overridden on the command line.
DEFAULT_MIN_AGE_HOURS = 24
DEFAULT_KEEP_DANGLING_NEWEST = 20
DEFAULT_BUILDER_KEEP_STORAGE = "30GB"


@dataclass(frozen=True)
class ImageInfo:
    """Immutable record of the Docker image fields used to plan removals."""

    image_id: str  # Docker image ID
    created_at: datetime  # image creation time
    size_bytes: int  # image size in bytes
    repo_tags: tuple[str, ...]  # repository tags; empty when the image is untagged


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the cleanup controller's command-line options.

    Args:
        argv: Arguments to parse, without the program name.  ``None`` (the
            default) keeps the original behaviour of reading ``sys.argv[1:]``;
            passing a list lets callers and tests drive the parser in-process.

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser(
        description="Safely reclaim Docker disk space without touching volumes or containers.",
    )
    parser.add_argument("--apply", action="store_true", help="Actually remove selected images/cache.")
    parser.add_argument("--docker-bin", default="docker")
    parser.add_argument("--disk-path", default="/")
    parser.add_argument("--host-label", default="")
    parser.add_argument("--min-age-hours", type=int, default=DEFAULT_MIN_AGE_HOURS)
    parser.add_argument("--keep-dangling-newest", type=int, default=DEFAULT_KEEP_DANGLING_NEWEST)
    parser.add_argument(
        "--skip-dangling-images",
        action="store_true",
        help="Do not remove dangling images; useful for builder-cache-only follow-up cleanup.",
    )
    parser.add_argument(
        "--include-builder-cache",
        action="store_true",
        help="Also run docker builder prune with --filter until and --keep-storage.",
    )
    parser.add_argument("--builder-keep-storage", default=DEFAULT_BUILDER_KEEP_STORAGE)
    parser.add_argument("--output", type=Path, help="Optional JSON receipt path.")
    return parser.parse_args(argv)


def run_command(
    args: list[str],
    *,
    check: bool = True,
    capture_output: bool = True,
) -> subprocess.CompletedProcess[str]:
    """Run *args* as a subprocess in text mode and return the completed process.

    Args:
        args: Command and arguments, passed as a list (no shell involved).
        check: When true, a non-zero exit status raises
            ``subprocess.CalledProcessError``.
        capture_output: When true, stdout and stderr are captured on the result.
    """
    return subprocess.run(
        args,
        check=check,
        text=True,
        capture_output=capture_output,
    )


def docker(args: list[str], docker_bin: str) -> subprocess.CompletedProcess[str]:
    """Run the Docker CLI *docker_bin* with *args* through ``run_command``."""
    return run_command([docker_bin, *args])
def parse_docker_datetime(value: str) -> datetime:
    """Parse a Docker timestamp into a timezone-aware UTC ``datetime``.

    A trailing ``Z`` is rewritten to ``+00:00`` and the fractional-seconds
    part is normalised to exactly six digits: longer fractions are truncated
    and shorter ones are zero-padded.  The padding matters because
    ``datetime.fromisoformat`` only accepts 3- or 6-digit fractions before
    Python 3.11, so a value such as ``...:21.5Z`` would otherwise be rejected
    there.

    Args:
        value: Timestamp text, e.g. ``2026-07-01T23:29:21.919867918+08:00``.

    Returns:
        The parsed instant converted to UTC.  A value without any offset is
        treated as already being UTC.

    Raises:
        ValueError: If *value* is empty or is not an ISO-8601 timestamp.
    """
    text = value.strip()
    if not text:
        raise ValueError("empty Docker timestamp")
    if text.endswith("Z"):
        text = text[:-1] + "+00:00"
    if "." in text:
        head, tail = text.split(".", 1)
        # Leading digits of the tail are the fraction; the rest is the offset.
        digit_count = len(tail) - len(tail.lstrip("0123456789"))
        fraction, suffix = tail[:digit_count], tail[digit_count:]
        if fraction:
            fraction = fraction[:6].ljust(6, "0")
        text = f"{head}.{fraction}{suffix}"
    parsed = datetime.fromisoformat(text)
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed.astimezone(timezone.utc)


def chunked(values: list[str], size: int) -> Iterable[list[str]]:
    """Yield consecutive slices of *values*, each holding at most *size* items."""
    for start in range(0, len(values), size):
        yield values[start : start + size]


def current_disk_bytes(path: str) -> dict[str, int]:
    """Report filesystem usage for *path* as parsed from ``df -PB1``.

    Returns:
        A dict with ``size_bytes``, ``used_bytes``, ``available_bytes`` and
        ``used_percent``.  Every value is ``0`` when ``df`` prints fewer than
        two non-blank lines (no data row).
    """
    result = run_command(["df", "-PB1", path])
    lines = [line for line in result.stdout.splitlines() if line.strip()]
    if len(lines) < 2:
        return {"size_bytes": 0, "used_bytes": 0, "available_bytes": 0, "used_percent": 0}
    # Data row columns: filesystem, size, used, available, use%, mount point.
    parts = lines[1].split()
    size = int(parts[1])
    used = int(parts[2])
    avail = int(parts[3])
    used_percent = int(parts[4].rstrip("%"))
    return {
        "size_bytes": size,
        "used_bytes": used,
        "available_bytes": avail,
        "used_percent": used_percent,
    }


def get_container_image_ids(docker_bin: str) -> set[str]:
    """Collect the normalised image IDs referenced by every listed container.

    Containers are listed with ``docker ps -aq`` (so stopped ones are
    included) and inspected in groups of 100 to keep each command line bounded.
    """
    containers = docker(["ps", "-aq", "--no-trunc"], docker_bin).stdout.split()
    protected: set[str] = set()
    for group in chunked(containers, 100):
        if not group:
            continue
        result = docker(["inspect", "--format", "{{.Image}}", *group], docker_bin)
        for line in result.stdout.splitlines():
            image_id = normalize_image_id(line)
            if image_id:
                protected.add(image_id)
    return protected
def select_dangling_image_removals(
    images: list[ImageInfo],
    protected_ids: set[str],
    *,
    now: datetime,
    min_age_hours: int,
    keep_newest: int,
) -> list[ImageInfo]:
    """Choose which images may be removed under the retention policy.

    An image is eligible only when it has no repo tags, its normalised ID is
    not in *protected_ids*, and it is at least *min_age_hours* old relative to
    *now*.  Of the eligible images, the *keep_newest* most recent ones are
    retained; the remainder is returned.

    Returns:
        The images to remove, ordered oldest first.
    """
    min_age_seconds = min_age_hours * 3600

    def is_eligible(candidate: ImageInfo) -> bool:
        if candidate.repo_tags:
            return False
        if normalize_image_id(candidate.image_id) in protected_ids:
            return False
        return (now - candidate.created_at).total_seconds() >= min_age_seconds

    newest_first = sorted(
        filter(is_eligible, images),
        key=lambda candidate: candidate.created_at,
        reverse=True,
    )
    if keep_newest > 0:
        # Drop the newest N from the removal list so they are kept on disk.
        newest_first = newest_first[keep_newest:]
    return sorted(newest_first, key=lambda candidate: candidate.created_at)


def summarize_images(images: list[ImageInfo]) -> dict[str, Any]:
    """Build the JSON-friendly summary of a removal plan.

    The oldest/newest timestamps are computed with ``min``/``max`` instead of
    being read from the first and last element, so the summary is correct for
    any input order rather than only for a list pre-sorted oldest first.

    Returns:
        A dict with the image count, the summed ``size_bytes``, the oldest and
        newest creation time as ISO strings (``None`` for an empty list), and
        the 12-character ID prefixes of the first 20 images in input order.
    """
    created = [image.created_at for image in images]
    return {
        "count": len(images),
        "estimated_total_size_bytes": sum(image.size_bytes for image in images),
        "oldest_created_at": min(created).isoformat() if created else None,
        "newest_created_at": max(created).isoformat() if created else None,
        "sample_image_ids": [image.image_id[:12] for image in images[:20]],
    }
def builder_prune_command(args: argparse.Namespace) -> list[str]:
    """Assemble the bounded ``docker builder prune`` command line.

    The command always carries ``--force`` and ``--keep-storage`` with
    ``args.builder_keep_storage``.  An ``until=<N>h`` filter is placed before
    ``--keep-storage`` only when ``args.min_age_hours`` is greater than zero.

    Returns:
        The argument vector, starting with ``args.docker_bin``.
    """
    if args.min_age_hours > 0:
        age_filter = ["--filter", f"until={args.min_age_hours}h"]
    else:
        age_filter = []
    return [
        args.docker_bin,
        "builder",
        "prune",
        "--force",
        *age_filter,
        "--keep-storage",
        args.builder_keep_storage,
    ]
def main() -> int:
    """Validate the options, run the cleanup, and emit the JSON receipt.

    Returns:
        ``0`` on success, ``2`` when an option value is rejected, and ``1``
        when a Docker/``df`` command cannot be started or exits non-zero.
    """
    args = parse_args()
    if args.min_age_hours < 0:
        print("min-age-hours must be >= 0", file=sys.stderr)
        return 2
    # A zero age gate is only allowed for builder-cache-only runs, so images
    # are never removed without an age gate.
    if args.min_age_hours == 0 and not args.skip_dangling_images:
        print("min-age-hours=0 requires --skip-dangling-images", file=sys.stderr)
        return 2
    if args.keep_dangling_newest < 0:
        print("keep-dangling-newest must be >= 0", file=sys.stderr)
        return 2
    try:
        receipt = build_receipt(args)
    except FileNotFoundError as error:
        print(f"required command not found: {error.filename}", file=sys.stderr)
        return 1
    except subprocess.CalledProcessError as error:
        # Commands run with captured output, so a bare traceback would hide
        # the tool's own error text; surface it explicitly instead.
        command = error.cmd if isinstance(error.cmd, str) else " ".join(str(part) for part in error.cmd)
        print(f"command failed with exit status {error.returncode}: {command}", file=sys.stderr)
        detail = (error.stderr or "").strip()
        if detail:
            print(detail, file=sys.stderr)
        print("cleanup aborted; no receipt was written", file=sys.stderr)
        return 1
    text = json.dumps(receipt, ensure_ascii=False, indent=2, sort_keys=True) + "\n"
    if args.output:
        args.output.parent.mkdir(parents=True, exist_ok=True)
        args.output.write_text(text, encoding="utf-8")
    print(text, end="")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
# Final step of the loader preamble: execute the hyphen-named script as a module.
spec.loader.exec_module(module)


def image(image_id: str, created_at: datetime, tags: tuple[str, ...] = ()):
    """Build an ``ImageInfo`` fixture with a fixed 1024-byte size."""
    return module.ImageInfo(
        image_id=image_id,
        created_at=created_at,
        size_bytes=1024,
        repo_tags=tags,
    )


def test_select_dangling_images_keeps_newest_and_protects_running_images() -> None:
    """Protected, tagged and too-recent images are skipped; the newest eligible one is kept."""
    now = datetime(2026, 7, 1, 12, tzinfo=timezone.utc)
    images = [
        image("sha256:running", now - timedelta(hours=72)),
        image("oldest", now - timedelta(hours=72)),
        image("middle", now - timedelta(hours=48)),
        image("newest", now - timedelta(hours=30)),
        image("too_recent", now - timedelta(hours=2)),
        image("tagged", now - timedelta(hours=72), ("repo:tag",)),
    ]

    selected = module.select_dangling_image_removals(
        images,
        {"running"},
        now=now,
        min_age_hours=24,
        keep_newest=1,
    )

    assert [item.image_id for item in selected] == ["oldest", "middle"]


def test_builder_prune_command_is_bounded_by_age_and_keep_storage() -> None:
    """A positive age gate adds the ``until`` filter ahead of ``--keep-storage``."""
    args = SimpleNamespace(
        docker_bin="docker",
        min_age_hours=36,
        builder_keep_storage="40GB",
    )

    assert module.builder_prune_command(args) == [
        "docker",
        "builder",
        "prune",
        "--force",
        "--filter",
        "until=36h",
        "--keep-storage",
        "40GB",
    ]


def test_parse_docker_datetime_accepts_nanosecond_fraction() -> None:
    """Nanosecond fractions are truncated to microseconds and converted to UTC."""
    parsed = module.parse_docker_datetime("2026-07-01T23:29:21.919867918+08:00")

    assert parsed.isoformat() == "2026-07-01T15:29:21.919867+00:00"


def test_summary_never_reports_volumes_or_container_cleanup_boundary() -> None:
    """The plan summary counts the selected image and lists its short ID."""
    now = datetime(2026, 7, 1, 12, tzinfo=timezone.utc)
    selected = module.select_dangling_image_removals(
        [image("old", now - timedelta(days=3))],
        set(),
        now=now,
        min_age_hours=24,
        keep_newest=0,
    )
    summary = module.summarize_images(selected)

    assert summary["count"] == 1
    assert summary["sample_image_ids"] == ["old"]


def test_cli_exposes_builder_cache_only_flag() -> None:
    """``--help`` advertises the builder-cache-only switch."""
    # Use the interpreter running pytest rather than whatever `python3` is on
    # PATH, so the CLI is exercised under the same Python as the other tests.
    help_text = module.run_command([sys.executable, str(SCRIPT), "--help"]).stdout

    assert "--skip-dangling-images" in help_text


def test_zero_age_builder_prune_omits_until_filter() -> None:
    """A zero age gate produces a prune command without the ``until`` filter."""
    args = SimpleNamespace(
        docker_bin="docker",
        min_age_hours=0,
        builder_keep_storage="1GB",
    )

    assert module.builder_prune_command(args) == [
        "docker",
        "builder",
        "prune",
        "--force",
        "--keep-storage",
        "1GB",
    ]