#!/usr/bin/env python3
"""
Storage health textfile exporter for reboot-recovery guardrails.

2026-05-06 ogt + Codex: 110/188 dirty-reboot follow-up.
Why: both hosts recently stopped in initramfs with root filesystem
inconsistency. Service-level checks were blind until the console showed fsck.
This exporter keeps the filesystem/kernel storage evidence visible in
Prometheus without performing any repair.

The script only reads evidence (/proc/mounts, /proc/stat, kernel journal via
journalctl, fsck logs) and writes one Prometheus text file, OUTPUT_NAME, into
TEXTFILE_DIR.
"""

from __future__ import annotations

import os
import re
import subprocess
import tempfile
import time
from pathlib import Path

# Output directory; overridable through NODE_EXPORTER_TEXTFILE_DIR.
TEXTFILE_DIR = Path(os.environ.get("NODE_EXPORTER_TEXTFILE_DIR", "/var/lib/node_exporter/textfile_collector"))
OUTPUT_NAME = "storage_health.prom"
# Value of the `host` label; overridable through AIOPS_HOST_LABEL.
HOST_LABEL = os.environ.get("AIOPS_HOST_LABEL", os.uname().nodename)

# Characters that must be backslash-escaped inside a label value.
LABEL_RE = re.compile(r'["\\\n]')
_LABEL_ESCAPES = {"\n": r"\n", "\\": r"\\", '"': r"\""}

# One line matching any of these alternatives counts as one storage error.
STORAGE_ERROR_RE = re.compile(
    r"("
    r"EXT4-fs (error|warning)|"
    r"Buffer I/O error|"
    r"I/O error|"
    r"blk_update_request|"
    r"end_request: I/O error|"
    r"UNEXPECTED INCONSISTENCY|"
    r"RUN fsck MANUALLY|"
    r"orphan linked list|"
    r"Multiply-claimed block|"
    r"deleted inode referenced|"
    r"Structure needs cleaning|"
    r"Bad message|"
    r"filesystem .*error|"
    r"fsck.*(error|failed)|"
    r"read-only file system"
    r")",
    re.IGNORECASE,
)


def _escape_label(value: str) -> str:
    """Return *value* with backslash, double quote and newline escaped."""
    return LABEL_RE.sub(lambda match: _LABEL_ESCAPES[match.group(0)], value)


def _run(command: list[str], timeout: int = 12) -> tuple[int, str, str]:
    """Run *command* without a shell and return ``(returncode, stdout, stderr)``.

    Failures are mapped to return codes instead of exceptions so that one
    broken evidence source cannot abort the whole exporter run:

    * 127 -- the executable was not found,
    * 126 -- any other OS-level failure to start it (e.g. permission denied),
    * 124 -- the command exceeded *timeout* seconds.
    """
    try:
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            # Log output may contain bytes that are invalid in the locale
            # encoding; degrade to replacement characters rather than raise.
            errors="replace",
            timeout=timeout,
            check=False,
        )
    except FileNotFoundError as exc:
        return 127, "", str(exc)
    except OSError as exc:
        return 126, "", str(exc)
    except subprocess.TimeoutExpired as exc:
        # The partial output attached to the exception is not guaranteed to be
        # str, so anything else is discarded.
        stdout = exc.stdout if isinstance(exc.stdout, str) else ""
        stderr = exc.stderr if isinstance(exc.stderr, str) else "timeout"
        return 124, stdout, stderr
    return result.returncode, result.stdout, result.stderr


def _parse_root_readonly(mounts_text: str) -> tuple[int, int]:
    """Parse /proc/mounts-style text and return ``(found, readonly)`` for ``/``.

    ``found`` is 1 when at least one entry has mount point ``/``. ``readonly``
    is 1 when the LAST such entry carries the ``ro`` option: the table is in
    mount order, so when ``/`` is listed more than once the final entry is the
    one that is actually visible.
    """
    found = 0
    readonly = 0
    for line in mounts_text.splitlines():
        fields = line.split()
        if len(fields) >= 4 and fields[1] == "/":
            found = 1
            # Exact option match: "errors=remount-ro" must not count as "ro".
            readonly = int("ro" in fields[3].split(","))
    return found, readonly


def _root_filesystem_readonly() -> tuple[int, int]:
    """Return ``(source_available, readonly)`` for the root filesystem.

    Both values are 0 when /proc/mounts cannot be read or has no ``/`` entry.
    """
    try:
        mounts_text = Path("/proc/mounts").read_text(encoding="utf-8", errors="replace")
    except OSError:
        return 0, 0
    return _parse_root_readonly(mounts_text)


def _boot_time_seconds() -> int:
    """Return the ``btime`` value from /proc/stat, or 0 if it is unavailable."""
    try:
        for line in Path("/proc/stat").read_text(encoding="utf-8").splitlines():
            if line.startswith("btime "):
                return int(line.split()[1])
    except (OSError, ValueError, IndexError):
        return 0
    return 0


def _count_storage_errors(text: str) -> int:
    """Count the lines of *text* that match ``STORAGE_ERROR_RE``."""
    return sum(1 for line in text.splitlines() if STORAGE_ERROR_RE.search(line))


def _journal_storage_count(boot: str) -> tuple[int, int]:
    """Return ``(source_available, error_count)`` for one boot's kernel journal.

    *boot* is passed verbatim to ``journalctl -b``. A non-zero exit status
    from journalctl yields ``(0, 0)``.
    """
    rc, stdout, _stderr = _run(
        [
            "journalctl",
            "--no-pager",
            "-k",
            "-b",
            boot,
            "-p",
            "warning..alert",
            "-n",
            "5000",
            "-o",
            "short-iso",
        ],
        timeout=15,
    )
    if rc != 0:
        return 0, 0
    return 1, _count_storage_errors(stdout)


def _fsck_log_counts() -> list[tuple[str, int, int]]:
    """Return ``(source, available, error_count)`` for each known fsck log.

    A log that is missing or unreadable is reported as ``(source, 0, 0)``.
    """
    sources = [
        "/run/initramfs/fsck.log",
        "/var/log/fsck/checkroot",
        "/var/log/fsck/checkfs",
    ]
    rows: list[tuple[str, int, int]] = []
    for source in sources:
        try:
            text = Path(source).read_text(encoding="utf-8", errors="replace")
        except OSError:
            # Covers a missing file as well as permission or I/O failures.
            rows.append((source, 0, 0))
            continue
        rows.append((source, 1, _count_storage_errors(text)))
    return rows


def collect() -> str:
    """Gather all evidence and return it as Prometheus text exposition.

    The result always ends with a newline. Sources that could not be read are
    reported through ``awoooi_host_storage_source_available`` with value 0
    instead of raising.
    """
    now = int(time.time())
    host = _escape_label(HOST_LABEL)
    mount_available, root_readonly = _root_filesystem_readonly()
    current_available, current_errors = _journal_storage_count("0")
    previous_available, previous_errors = _journal_storage_count("-1")
    boot_time = _boot_time_seconds()

    lines = [
        "# HELP awoooi_host_storage_monitor_up Whether the storage health exporter completed.",
        "# TYPE awoooi_host_storage_monitor_up gauge",
        "# HELP awoooi_host_storage_last_run_timestamp Unix timestamp of the last storage health exporter run.",
        "# TYPE awoooi_host_storage_last_run_timestamp gauge",
        "# HELP awoooi_host_boot_time_timestamp Host boot time from /proc/stat btime.",
        "# TYPE awoooi_host_boot_time_timestamp gauge",
        "# HELP awoooi_host_root_filesystem_readonly Whether the root filesystem is mounted read-only.",
        "# TYPE awoooi_host_root_filesystem_readonly gauge",
        "# HELP awoooi_host_storage_source_available Whether a storage evidence source was readable.",
        "# TYPE awoooi_host_storage_source_available gauge",
        "# HELP awoooi_host_storage_error_count Storage or fsck error lines detected in the evidence source.",
        "# TYPE awoooi_host_storage_error_count gauge",
        f'awoooi_host_storage_monitor_up{{host="{host}"}} 1',
        f'awoooi_host_storage_last_run_timestamp{{host="{host}"}} {now}',
        f'awoooi_host_boot_time_timestamp{{host="{host}"}} {boot_time}',
        f'awoooi_host_root_filesystem_readonly{{host="{host}",mountpoint="/"}} {root_readonly}',
        f'awoooi_host_storage_source_available{{host="{host}",source="/proc/mounts"}} {mount_available}',
        f'awoooi_host_storage_source_available{{host="{host}",source="journalctl-kernel",boot="current"}} {current_available}',
        f'awoooi_host_storage_source_available{{host="{host}",source="journalctl-kernel",boot="previous"}} {previous_available}',
        f'awoooi_host_storage_error_count{{host="{host}",source="journalctl-kernel",boot="current"}} {current_errors}',
        f'awoooi_host_storage_error_count{{host="{host}",source="journalctl-kernel",boot="previous"}} {previous_errors}',
    ]

    for source, available, errors in _fsck_log_counts():
        escaped_source = _escape_label(source)
        lines.append(f'awoooi_host_storage_source_available{{host="{host}",source="{escaped_source}"}} {available}')
        lines.append(f'awoooi_host_storage_error_count{{host="{host}",source="{escaped_source}",boot="last-fsck-log"}} {errors}')

    return "\n".join(lines) + "\n"


def main() -> None:
    """Write the collected metrics to ``TEXTFILE_DIR / OUTPUT_NAME`` atomically.

    The payload is written to a temporary file in the same directory, made
    world-readable, and only then renamed over the target, so a reader never
    sees a partial or unreadable metrics file. The temporary file is removed
    if any step fails.
    """
    TEXTFILE_DIR.mkdir(parents=True, exist_ok=True)
    payload = collect()
    output_path = TEXTFILE_DIR / OUTPUT_NAME

    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(
            "w",
            dir=TEXTFILE_DIR,
            prefix=f".{OUTPUT_NAME}.",
            suffix=".tmp",
            delete=False,
            encoding="utf-8",
        ) as tmp:
            tmp_path = Path(tmp.name)
            tmp.write(payload)
        # The temp file is created 0600; fix the mode BEFORE the rename so the
        # published file is readable from the instant it appears.
        tmp_path.chmod(0o644)
        tmp_path.replace(output_path)
        tmp_path = None  # Renamed into place; nothing left to clean up.
    finally:
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except OSError:
                pass  # Best-effort cleanup; the original error still propagates.


if __name__ == "__main__":
    main()