Files
awoooi/scripts/ops/storage-health-textfile-exporter.py
Your Name cfb866d055
Some checks failed
Ansible Lint / lint (push) Successful in 35s
CD Pipeline / tests (push) Failing after 13s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Failing after 11s
feat(governance): add agent market automation surfaces
2026-06-04 21:50:55 +08:00

191 lines
6.8 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Storage health textfile exporter for reboot-recovery guardrails.
2026-05-06 ogt + Codex: 110/188 dirty-reboot follow-up.
Why: both hosts recently stopped in initramfs with root filesystem
inconsistency. Service-level checks were blind until the console showed fsck.
This exporter keeps the filesystem/kernel storage evidence visible in
Prometheus without performing any repair.
"""
from __future__ import annotations
import os
import re
import subprocess
import tempfile
import time
from pathlib import Path
# Directory the metrics file is written into; NODE_EXPORTER_TEXTFILE_DIR
# overrides the default location.
TEXTFILE_DIR = Path(
    os.environ.get(
        "NODE_EXPORTER_TEXTFILE_DIR",
        "/var/lib/node_exporter/textfile_collector",
    )
)
# File name of the published metrics file inside TEXTFILE_DIR.
OUTPUT_NAME = "storage_health.prom"
# Value of the ``host`` label on every sample; AIOPS_HOST_LABEL overrides the
# kernel node name.
HOST_LABEL = os.environ.get("AIOPS_HOST_LABEL", os.uname().nodename)
# Characters that must be escaped inside a quoted label value:
# double quote, backslash and newline.
LABEL_RE = re.compile(r'["\\\n]')
# Alternatives that mark a log line as storage/fsck trouble. They are joined
# with "|" inside a single group and matched case-insensitively.
_STORAGE_ERROR_ALTERNATIVES = (
    r"EXT4-fs (error|warning)",
    r"Buffer I/O error",
    r"I/O error",
    r"blk_update_request",
    r"end_request: I/O error",
    r"UNEXPECTED INCONSISTENCY",
    r"RUN fsck MANUALLY",
    r"orphan linked list",
    r"Multiply-claimed block",
    r"deleted inode referenced",
    r"Structure needs cleaning",
    r"Bad message",
    r"filesystem .*error",
    r"fsck.*(error|failed)",
    r"read-only file system",
)
STORAGE_ERROR_RE = re.compile(
    "(" + "|".join(_STORAGE_ERROR_ALTERNATIVES) + ")",
    re.IGNORECASE,
)
def _escape_label(value: str) -> str:
    """Return *value* escaped for use inside a double-quoted label value.

    Backslash, double quote and newline are rewritten to the two-character
    sequences ``\\\\``, ``\\"`` and ``\\n``; every other character is kept.
    """
    escapes = {"\\": r"\\", '"': r"\"", "\n": r"\n"}

    def _replacement(match):
        # LABEL_RE only matches single characters that are keys of `escapes`.
        return escapes[match.group(0)]

    return LABEL_RE.sub(_replacement, value)
def _run(command: list[str], timeout: int = 12) -> tuple[int, str, str]:
try:
result = subprocess.run(command, capture_output=True, text=True, timeout=timeout, check=False)
except FileNotFoundError as exc:
return 127, "", str(exc)
except subprocess.TimeoutExpired as exc:
stdout = exc.stdout if isinstance(exc.stdout, str) else ""
stderr = exc.stderr if isinstance(exc.stderr, str) else "timeout"
return 124, stdout, stderr
return result.returncode, result.stdout, result.stderr
def _root_filesystem_readonly() -> tuple[int, int]:
try:
for line in Path("/proc/mounts").read_text(encoding="utf-8").splitlines():
fields = line.split()
if len(fields) >= 4 and fields[1] == "/":
options = set(fields[3].split(","))
return 1, int("ro" in options)
except OSError:
return 0, 0
return 0, 0
def _boot_time_seconds() -> int:
try:
for line in Path("/proc/stat").read_text(encoding="utf-8").splitlines():
if line.startswith("btime "):
return int(line.split()[1])
except (OSError, ValueError, IndexError):
return 0
return 0
def _count_storage_errors(text: str) -> int:
    """Return how many lines of *text* match ``STORAGE_ERROR_RE``.

    A line counts once no matter how many times the pattern occurs in it.
    """
    matching_lines = 0
    for entry in text.splitlines():
        if STORAGE_ERROR_RE.search(entry) is not None:
            matching_lines += 1
    return matching_lines
def _journal_storage_count(boot: str) -> tuple[int, int]:
    """Count storage-error lines in the kernel journal of one boot.

    Args:
        boot: Boot selector handed to ``journalctl -b`` (callers in this file
            pass ``"0"`` for the current boot and ``"-1"`` for the previous one).

    Returns:
        ``(available, errors)``. ``available`` is 1 when ``journalctl`` exited
        with status 0, otherwise 0 (and ``errors`` is then 0 as well).
    """
    journal_command = ["journalctl", "--no-pager", "-k", "-b", boot]
    # Restrict to the warning..alert priority range and the last 5000 entries.
    journal_command += ["-p", "warning..alert", "-n", "5000", "-o", "short-iso"]

    exit_code, kernel_log, _ = _run(journal_command, timeout=15)
    if exit_code == 0:
        return 1, _count_storage_errors(kernel_log)
    return 0, 0
def _fsck_log_counts() -> list[tuple[str, int, int]]:
    """Scan the known fsck log files for storage-error lines.

    Returns:
        One ``(source, available, errors)`` row per candidate log path, in a
        fixed order. ``available`` is 1 when the file could be read; a missing
        or unreadable file yields ``(source, 0, 0)``.
    """
    candidate_logs = (
        "/run/initramfs/fsck.log",
        "/var/log/fsck/checkroot",
        "/var/log/fsck/checkfs",
    )
    results: list[tuple[str, int, int]] = []
    for log_path in candidate_logs:
        try:
            # A missing file raises FileNotFoundError, which is an OSError, so
            # absence and unreadability end up in the same branch.
            contents = Path(log_path).read_text(encoding="utf-8", errors="replace")
        except OSError:
            results.append((log_path, 0, 0))
        else:
            results.append((log_path, 1, _count_storage_errors(contents)))
    return results
def collect() -> str:
    """Gather all storage-health evidence and render it as Prometheus text.

    Samples of each metric family are emitted as one contiguous group directly
    after that family's ``# HELP`` / ``# TYPE`` lines, as the Prometheus text
    exposition format requires.

    Returns:
        The complete metrics payload, terminated by a newline.
    """
    now = int(time.time())
    host = _escape_label(HOST_LABEL)
    mount_available, root_readonly = _root_filesystem_readonly()
    current_available, current_errors = _journal_storage_count("0")
    previous_available, previous_errors = _journal_storage_count("-1")
    boot_time = _boot_time_seconds()
    # Read the fsck logs once so both families below report the same snapshot.
    fsck_rows = [
        (_escape_label(source), available, errors)
        for source, available, errors in _fsck_log_counts()
    ]

    lines = [
        "# HELP awoooi_host_storage_monitor_up Whether the storage health exporter completed.",
        "# TYPE awoooi_host_storage_monitor_up gauge",
        f'awoooi_host_storage_monitor_up{{host="{host}"}} 1',
        "# HELP awoooi_host_storage_last_run_timestamp Unix timestamp of the last storage health exporter run.",
        "# TYPE awoooi_host_storage_last_run_timestamp gauge",
        f'awoooi_host_storage_last_run_timestamp{{host="{host}"}} {now}',
        "# HELP awoooi_host_boot_time_timestamp Host boot time from /proc/stat btime.",
        "# TYPE awoooi_host_boot_time_timestamp gauge",
        f'awoooi_host_boot_time_timestamp{{host="{host}"}} {boot_time}',
        "# HELP awoooi_host_root_filesystem_readonly Whether the root filesystem is mounted read-only.",
        "# TYPE awoooi_host_root_filesystem_readonly gauge",
        f'awoooi_host_root_filesystem_readonly{{host="{host}",mountpoint="/"}} {root_readonly}',
    ]

    # Family: source availability (mount table, kernel journal, fsck logs).
    lines += [
        "# HELP awoooi_host_storage_source_available Whether a storage evidence source was readable.",
        "# TYPE awoooi_host_storage_source_available gauge",
        f'awoooi_host_storage_source_available{{host="{host}",source="/proc/mounts"}} {mount_available}',
        f'awoooi_host_storage_source_available{{host="{host}",source="journalctl-kernel",boot="current"}} {current_available}',
        f'awoooi_host_storage_source_available{{host="{host}",source="journalctl-kernel",boot="previous"}} {previous_available}',
    ]
    lines += [
        f'awoooi_host_storage_source_available{{host="{host}",source="{escaped_source}"}} {available}'
        for escaped_source, available, _errors in fsck_rows
    ]

    # Family: error-line counts (kernel journal, fsck logs).
    lines += [
        "# HELP awoooi_host_storage_error_count Storage or fsck error lines detected in the evidence source.",
        "# TYPE awoooi_host_storage_error_count gauge",
        f'awoooi_host_storage_error_count{{host="{host}",source="journalctl-kernel",boot="current"}} {current_errors}',
        f'awoooi_host_storage_error_count{{host="{host}",source="journalctl-kernel",boot="previous"}} {previous_errors}',
    ]
    lines += [
        f'awoooi_host_storage_error_count{{host="{host}",source="{escaped_source}",boot="last-fsck-log"}} {errors}'
        for escaped_source, _available, errors in fsck_rows
    ]

    return "\n".join(lines) + "\n"
def main() -> None:
    """Collect the metrics and publish them atomically into ``TEXTFILE_DIR``.

    The payload is written to a temporary file in the same directory, made
    world-readable, and only then renamed over ``OUTPUT_NAME``, so a reader
    never sees a partial or unreadable file. If anything fails before the
    rename, the temporary file is removed and the exception propagates.
    """
    TEXTFILE_DIR.mkdir(parents=True, exist_ok=True)
    payload = collect()
    output_path = TEXTFILE_DIR / OUTPUT_NAME

    tmp_path = None
    published = False
    try:
        with tempfile.NamedTemporaryFile("w", dir=TEXTFILE_DIR, delete=False, encoding="utf-8") as tmp:
            tmp_path = Path(tmp.name)
            tmp.write(payload)
        # Temporary files are created owner-only; widen the mode BEFORE the
        # rename so the published file is readable from the first moment.
        tmp_path.chmod(0o644)
        tmp_path.replace(output_path)
        published = True
    finally:
        if not published and tmp_path is not None:
            try:
                tmp_path.unlink()
            except OSError:
                # Best-effort cleanup; the original error is the one to report.
                pass
# Run one export pass when executed as a script; importing the module does nothing.
if __name__ == "__main__":
    main()