191 lines
6.8 KiB
Python
Executable File
191 lines
6.8 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Storage health textfile exporter for reboot-recovery guardrails.

2026-05-06 ogt + Codex: 110/188 dirty-reboot follow-up.
Why: both hosts recently stopped in initramfs with a root filesystem
inconsistency. Service-level checks were blind until the console showed fsck.
This exporter keeps the filesystem/kernel storage evidence visible in
Prometheus without performing any repair.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import tempfile
|
|
import time
|
|
from pathlib import Path
|
|
|
|
|
|
# Directory the metrics file is written to; overridable through the
# NODE_EXPORTER_TEXTFILE_DIR environment variable.
TEXTFILE_DIR = Path(os.environ.get("NODE_EXPORTER_TEXTFILE_DIR", "/var/lib/node_exporter/textfile_collector"))
# File name of the metrics file created inside TEXTFILE_DIR.
OUTPUT_NAME = "storage_health.prom"
# Value of the `host` label on every sample; defaults to the kernel node name
# unless AIOPS_HOST_LABEL is set.
HOST_LABEL = os.environ.get("AIOPS_HOST_LABEL", os.uname().nodename)
# Characters that need escaping inside a label value: double quote, backslash
# and newline.
LABEL_RE = re.compile(r'["\\\n]')
# Case-insensitive signatures of filesystem, block-layer and fsck trouble.
# Used with search() on one line at a time, so a line counts as an error line
# as soon as any single alternative matches anywhere in it.
STORAGE_ERROR_RE = re.compile(
    r"("
    r"EXT4-fs (error|warning)|"
    r"Buffer I/O error|"
    r"I/O error|"
    r"blk_update_request|"
    r"end_request: I/O error|"
    r"UNEXPECTED INCONSISTENCY|"
    r"RUN fsck MANUALLY|"
    r"orphan linked list|"
    r"Multiply-claimed block|"
    r"deleted inode referenced|"
    r"Structure needs cleaning|"
    r"Bad message|"
    r"filesystem .*error|"
    r"fsck.*(error|failed)|"
    r"read-only file system"
    r")",
    re.IGNORECASE,
)
|
|
|
|
|
|
def _escape_label(value: str) -> str:
    """Escape *value* so it can be placed inside a double-quoted label value.

    Every backslash, double quote and newline matched by ``LABEL_RE`` is
    replaced by its two-character escape sequence; all other characters are
    passed through unchanged.
    """
    escapes = {"\\": r"\\", '"': r"\"", "\n": r"\n"}

    def _substitute(match):
        # LABEL_RE only ever matches one of the three keys above.
        return escapes[match.group(0)]

    return LABEL_RE.sub(_substitute, value)
|
|
|
|
|
|
def _run(command: list[str], timeout: int = 12) -> tuple[int, str, str]:
    """Run *command* without a shell and return ``(returncode, stdout, stderr)``.

    Launch failures and timeouts are mapped to shell-style exit codes instead
    of being raised, so callers can treat every outcome uniformly:

    * 127 -- the executable was not found.
    * 126 -- the executable could not be started for another reason
      (any other ``OSError``, e.g. permission denied).
    * 124 -- the command did not finish within *timeout* seconds; whatever
      output was captured before the timeout is still returned.

    Args:
        command: argument vector; the first element is the program to run.
        timeout: seconds to wait before giving up on the command.

    Returns:
        The exit code plus the decoded standard output and standard error.
    """

    def _as_text(captured: object, fallback: str) -> str:
        # TimeoutExpired carries partial output as bytes even though text=True
        # was requested, so decode it here instead of discarding it.
        if isinstance(captured, bytes):
            return captured.decode("utf-8", errors="replace")
        if isinstance(captured, str):
            return captured
        return fallback

    try:
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            # Undecodable bytes in the child's output must not abort the exporter.
            errors="replace",
            timeout=timeout,
            check=False,
        )
    except FileNotFoundError as exc:
        return 127, "", str(exc)
    except subprocess.TimeoutExpired as exc:
        return 124, _as_text(exc.stdout, ""), _as_text(exc.stderr, "timeout")
    except OSError as exc:
        # e.g. PermissionError when the binary exists but is not executable.
        return 126, "", str(exc)
    return result.returncode, result.stdout, result.stderr
|
|
|
|
|
|
def _root_filesystem_readonly(mounts_path: str = "/proc/mounts") -> tuple[int, int]:
    """Report whether the root filesystem is mounted read-only.

    Args:
        mounts_path: mount table to parse, one mount per line with
            whitespace-separated fields (device, mount point, type, options,
            ...). Defaults to the kernel's ``/proc/mounts``.

    Returns:
        ``(available, readonly)``. ``available`` is 1 when an entry whose mount
        point is ``/`` was found, otherwise 0. ``readonly`` is 1 when that
        entry carries the ``ro`` option. ``(0, 0)`` is returned when the table
        cannot be read or has no root entry.
    """
    try:
        # errors="replace": odd bytes in a mount path must not raise a
        # decoding error that the OSError handler would not catch.
        table = Path(mounts_path).read_text(encoding="utf-8", errors="replace")
    except OSError:
        return 0, 0

    root_options = None
    for entry in table.splitlines():
        fields = entry.split()
        if len(fields) >= 4 and fields[1] == "/":
            # The table may list "/" more than once (e.g. an initial rootfs
            # entry followed by the real root device). Keep scanning so the
            # most recently listed mount, the one actually in effect, wins.
            root_options = fields[3]

    if root_options is None:
        return 0, 0
    return 1, int("ro" in root_options.split(","))
|
|
|
|
|
|
def _boot_time_seconds() -> int:
    """Return the ``btime`` value from ``/proc/stat`` as an integer.

    The first line starting with ``btime `` supplies the value. 0 is returned
    when the file cannot be read, has no such line, or the value does not
    parse as an integer.
    """
    try:
        stat_text = Path("/proc/stat").read_text(encoding="utf-8")
        btime_rows = (row for row in stat_text.splitlines() if row.startswith("btime "))
        first_row = next(btime_rows, None)
        if first_row is None:
            return 0
        return int(first_row.split()[1])
    except (OSError, ValueError, IndexError):
        return 0
|
|
|
|
|
|
def _count_storage_errors(text: str) -> int:
    """Return the number of lines in *text* that match ``STORAGE_ERROR_RE``.

    A line is counted once no matter how many error signatures it contains.
    """
    flagged = [row for row in text.splitlines() if STORAGE_ERROR_RE.search(row) is not None]
    return len(flagged)
|
|
|
|
|
|
def _journal_storage_count(boot: str) -> tuple[int, int]:
    """Count storage error lines in the kernel journal of one boot.

    Args:
        boot: value handed to ``journalctl -b`` (the callers in this file use
            ``"0"`` and ``"-1"``).

    Returns:
        ``(available, errors)``. ``(0, 0)`` when journalctl exits non-zero
        (including the not-found and timeout codes produced by ``_run``),
        otherwise ``(1, n)`` where ``n`` is the number of matching lines.
    """
    journalctl_argv = [
        "journalctl",
        "--no-pager",
        "-k",
        "-b",
        boot,
        "-p",
        "warning..alert",
        "-n",
        "5000",
        "-o",
        "short-iso",
    ]
    exit_code, kernel_log, _ = _run(journalctl_argv, timeout=15)
    if exit_code == 0:
        return 1, _count_storage_errors(kernel_log)
    return 0, 0
|
|
|
|
|
|
def _fsck_log_counts(sources: list[str] | None = None) -> list[tuple[str, int, int]]:
    """Count storage error lines in each fsck log file.

    Args:
        sources: log file paths to inspect. When omitted, the initramfs fsck
            log and the two ``/var/log/fsck`` logs listed below are used.

    Returns:
        One ``(source, available, errors)`` row per path, in input order.
        ``available`` is 1 when the file could be read and 0 otherwise
        (missing, unreadable, not a regular file); ``errors`` is the number of
        matching lines and is 0 whenever the file is unavailable.
    """
    if sources is None:
        sources = [
            "/run/initramfs/fsck.log",
            "/var/log/fsck/checkroot",
            "/var/log/fsck/checkfs",
        ]

    rows: list[tuple[str, int, int]] = []
    for source in sources:
        try:
            # Read directly instead of probing with exists() first: a missing
            # file raises an OSError subclass as well, and there is no window
            # in which the file can vanish between the check and the read.
            text = Path(source).read_text(encoding="utf-8", errors="replace")
        except OSError:
            rows.append((source, 0, 0))
        else:
            rows.append((source, 1, _count_storage_errors(text)))
    return rows
|
|
|
|
|
|
def collect() -> str:
    """Gather all storage evidence and render it as Prometheus text exposition.

    Evidence comes from the root mount options, the kernel journal of the
    current and the previous boot, the boot time, and the fsck log files.
    Every metric is a gauge labelled with the escaped ``HOST_LABEL``.

    Each metric family is written as one contiguous group (HELP, TYPE, then
    all of its samples), as the text exposition format requires. Samples of
    one metric are therefore never interleaved with those of another.

    Returns:
        The complete metrics document, terminated by a newline.
    """
    now = int(time.time())
    host = _escape_label(HOST_LABEL)
    mount_available, root_readonly = _root_filesystem_readonly()
    current_available, current_errors = _journal_storage_count("0")
    previous_available, previous_errors = _journal_storage_count("-1")
    boot_time = _boot_time_seconds()

    source_available_samples = [
        f'awoooi_host_storage_source_available{{host="{host}",source="/proc/mounts"}} {mount_available}',
        f'awoooi_host_storage_source_available{{host="{host}",source="journalctl-kernel",boot="current"}} {current_available}',
        f'awoooi_host_storage_source_available{{host="{host}",source="journalctl-kernel",boot="previous"}} {previous_available}',
    ]
    error_count_samples = [
        f'awoooi_host_storage_error_count{{host="{host}",source="journalctl-kernel",boot="current"}} {current_errors}',
        f'awoooi_host_storage_error_count{{host="{host}",source="journalctl-kernel",boot="previous"}} {previous_errors}',
    ]
    for source, available, errors in _fsck_log_counts():
        escaped_source = _escape_label(source)
        source_available_samples.append(
            f'awoooi_host_storage_source_available{{host="{host}",source="{escaped_source}"}} {available}'
        )
        error_count_samples.append(
            f'awoooi_host_storage_error_count{{host="{host}",source="{escaped_source}",boot="last-fsck-log"}} {errors}'
        )

    # (metric name, HELP text, samples) in the order the families are emitted.
    families = [
        (
            "awoooi_host_storage_monitor_up",
            "Whether the storage health exporter completed.",
            [f'awoooi_host_storage_monitor_up{{host="{host}"}} 1'],
        ),
        (
            "awoooi_host_storage_last_run_timestamp",
            "Unix timestamp of the last storage health exporter run.",
            [f'awoooi_host_storage_last_run_timestamp{{host="{host}"}} {now}'],
        ),
        (
            "awoooi_host_boot_time_timestamp",
            "Host boot time from /proc/stat btime.",
            [f'awoooi_host_boot_time_timestamp{{host="{host}"}} {boot_time}'],
        ),
        (
            "awoooi_host_root_filesystem_readonly",
            "Whether the root filesystem is mounted read-only.",
            [f'awoooi_host_root_filesystem_readonly{{host="{host}",mountpoint="/"}} {root_readonly}'],
        ),
        (
            "awoooi_host_storage_source_available",
            "Whether a storage evidence source was readable.",
            source_available_samples,
        ),
        (
            "awoooi_host_storage_error_count",
            "Storage or fsck error lines detected in the evidence source.",
            error_count_samples,
        ),
    ]

    lines: list[str] = []
    for name, help_text, samples in families:
        lines.append(f"# HELP {name} {help_text}")
        lines.append(f"# TYPE {name} gauge")
        lines.extend(samples)
    return "\n".join(lines) + "\n"
|
|
|
|
|
|
def main() -> None:
    """Collect the metrics and publish them atomically in ``TEXTFILE_DIR``.

    The payload is written to a temporary file in the same directory, made
    world-readable, and only then renamed over ``OUTPUT_NAME``. A reader
    therefore sees either the previous complete file or the new complete
    file, never a partial or unreadable one.

    Raises:
        OSError: if the directory cannot be created or the file cannot be
            written, chmod-ed or renamed. The temporary file is removed first.
    """
    TEXTFILE_DIR.mkdir(parents=True, exist_ok=True)
    payload = collect()
    output_path = TEXTFILE_DIR / OUTPUT_NAME

    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile("w", dir=TEXTFILE_DIR, delete=False, encoding="utf-8") as tmp:
            tmp_path = Path(tmp.name)
            tmp.write(payload)
        # Temporary files are created with mode 0600. Fix the permissions
        # before the rename so the published file is never unreadable.
        tmp_path.chmod(0o644)
        tmp_path.replace(output_path)
    except BaseException:
        # Best-effort cleanup so failed runs do not pile up stray temp files.
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except OSError:
                pass
        raise
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|