Files
awoooi/scripts/ops/host-runaway-process-remediation.py
Your Name ff18872a23
Some checks failed
Code Review / ai-code-review (push) Successful in 14s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Failing after 26s
Ansible / Reboot Recovery Contract / validate (push) Has been cancelled
feat(ops): 新增 host runaway process aiops guard
2026-06-18 14:17:03 +08:00

166 lines
5.9 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Gated remediation helper for AWOOOI host runaway process groups.
Default mode is dry-run. Applying SIGTERM requires explicit owner approval,
maintenance window, evidence reference, and --confirm-apply. This script is a
PlayBook primitive, not a background auto-kill daemon.
"""
from __future__ import annotations
import argparse
import importlib.util
import json
import os
import signal
import sys
import time
from pathlib import Path
from types import ModuleType
# The exporter script sits next to this file. Its filename contains hyphens,
# so it is located by path rather than by a regular ``import`` statement.
EXPORTER_PATH = Path(__file__).parent / "host-runaway-process-exporter.py"
def load_exporter() -> ModuleType:
    """Load the sibling exporter script from ``EXPORTER_PATH`` as a module.

    The module is registered in ``sys.modules`` under the name
    ``host_runaway_process_exporter`` and then executed.

    Returns:
        The executed exporter module.

    Raises:
        RuntimeError: If no import spec (or no loader) can be created for
            ``EXPORTER_PATH``.
        Exception: Anything raised while executing the exporter source is
            propagated unchanged; in that case the partially initialised
            module is removed from ``sys.modules`` first.
    """
    spec = importlib.util.spec_from_file_location("host_runaway_process_exporter", EXPORTER_PATH)
    if spec is None or spec.loader is None:
        raise RuntimeError(f"cannot load exporter module: {EXPORTER_PATH}")
    module = importlib.util.module_from_spec(spec)
    # Register before executing so code that looks the module up in
    # sys.modules while it is being executed can find it.
    sys.modules[spec.name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind for later lookups.
        sys.modules.pop(spec.name, None)
        raise
    return module
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Dry-run or gated SIGTERM for AWOOOI runaway process groups.")
parser.add_argument("--host", default=os.environ.get("AIOPS_HOST_LABEL", os.uname().nodename))
parser.add_argument("--rule", help="Limit candidates to one rule id. Required with --apply.")
parser.add_argument("--ps-file", type=Path, help="Use a fixture ps file for tests or offline review.")
parser.add_argument("--min-age-seconds", type=int, default=1800)
parser.add_argument("--min-cpu-percent", type=float, default=50)
parser.add_argument("--apply", action="store_true", help="Send SIGTERM to matching process groups.")
parser.add_argument("--confirm-apply", action="store_true", help="Required together with --apply.")
parser.add_argument("--owner-approval-id", default="")
parser.add_argument("--maintenance-window-id", default="")
parser.add_argument("--evidence-ref", default="")
parser.add_argument("--wait-seconds", type=int, default=0, help="Optional wait after SIGTERM before re-reading ps.")
return parser.parse_args()
def validate_apply_args(args: argparse.Namespace) -> None:
    """Refuse ``--apply`` unless every required gate has been supplied.

    Dry-run invocations (``args.apply`` falsy) are always accepted. For an
    apply run, each gate option must carry a real value; a string consisting
    only of whitespace counts as missing, so a blank approval id cannot
    satisfy a gate.

    Args:
        args: Parsed command-line namespace.

    Raises:
        SystemExit: If ``--apply`` was requested and one or more gates are
            missing. The message lists the missing options in a fixed order.
    """
    if not args.apply:
        return
    # Order matters: it determines the order of flags in the refusal message.
    gates = (
        ("--confirm-apply", args.confirm_apply),
        ("--rule", args.rule),
        ("--owner-approval-id", args.owner_approval_id),
        ("--maintenance-window-id", args.maintenance_window_id),
        ("--evidence-ref", args.evidence_ref),
    )
    missing = [
        flag
        for flag, value in gates
        # Strip strings first so "   " is treated the same as "".
        if not (value.strip() if isinstance(value, str) else value)
    ]
    if missing:
        raise SystemExit(
            "Refusing apply; missing required gates: "
            + ", ".join(missing)
            + ". Use dry-run output for the PlayBook packet first."
        )
def current_process_group() -> int:
    """Return the process-group id of this process, or ``-1`` if unavailable.

    Returns:
        The value of ``os.getpgrp()``. When that call is not available or
        fails, ``-1`` is returned as a sentinel instead of raising.
    """
    try:
        return os.getpgrp()
    except (AttributeError, OSError):
        # AttributeError: the platform's ``os`` module has no ``getpgrp``.
        # OSError: the underlying system call reported a failure.
        return -1
def _classified_groups(exporter: ModuleType, args: argparse.Namespace):
    """Read ps data via the exporter and return the groups matching ``args.rule`` (all groups when unset)."""
    rows = exporter.parse_ps_rows(exporter.read_ps_text(args.ps_file))
    groups = exporter.classify_groups(
        rows,
        min_age_seconds=args.min_age_seconds,
        min_cpu_percent=args.min_cpu_percent,
    )
    if args.rule:
        groups = [group for group in groups if group.rule_id == args.rule]
    return groups


def _describe_candidates(groups, own_pgrp: int, apply: bool) -> list[dict]:
    """Build one JSON-ready record per group, marking groups that must never be signaled."""
    candidates = []
    for group in groups:
        blocked_reason = None
        if group.pgid <= 1:
            # pgid 0/1 (and negatives) must never be passed to killpg.
            blocked_reason = "unsafe_pgid"
        elif group.pgid == own_pgrp:
            # Never signal the process group this script itself runs in.
            blocked_reason = "own_process_group"
        candidates.append(
            {
                "rule": group.rule_id,
                "pgid": group.pgid,
                "process_count": len(group.rows),
                "cpu_percent": round(group.cpu_percent, 3),
                "oldest_age_seconds": group.oldest_age_seconds,
                "orphan_reason": group.orphan_reason,
                "sample_comm": group.sample_comm,
                "blocked_reason": blocked_reason,
                "action": "skip" if blocked_reason else ("sigterm" if apply else "dry_run"),
                # Filled in only when sending the signal fails.
                "signal_error": None,
            }
        )
    return candidates


def _signal_candidates(candidates: list[dict]) -> list[int]:
    """Send SIGTERM to each unblocked candidate; record failures on the candidate instead of raising."""
    signaled = []
    for candidate in candidates:
        if candidate["blocked_reason"]:
            continue
        pgid = int(candidate["pgid"])
        try:
            os.killpg(pgid, signal.SIGTERM)
        except OSError as exc:
            # Covers ProcessLookupError (group already gone) and
            # PermissionError; keep going so the remaining groups are still
            # handled and the payload can report what actually happened.
            candidate["signal_error"] = f"{type(exc).__name__}: {exc}"
        else:
            signaled.append(pgid)
    return signaled


def main() -> None:
    """Run the dry-run or gated SIGTERM workflow and print a JSON report.

    Steps: parse and validate the command line, load the exporter module,
    classify runaway process groups, build the candidate list, optionally
    send SIGTERM (``--apply``), optionally wait and re-read ps
    (``--wait-seconds``), then print the payload as JSON on stdout.

    A failing ``os.killpg`` call does not abort the run. The error is stored
    in that candidate's ``signal_error`` field, the remaining candidates are
    still processed, and the payload is always printed, so the record of
    groups already signaled is not lost.

    Raises:
        SystemExit: From gate validation when ``--apply`` is missing
            required gates, or with status 1 after the payload has been
            printed when at least one SIGTERM could not be delivered.
    """
    args = parse_args()
    validate_apply_args(args)
    exporter = load_exporter()

    # NOTE(review): when --ps-file is given, candidates (and the post-wait
    # re-read below) presumably come from that fixture rather than live ps,
    # while SIGTERM would still target real process groups. Confirm whether
    # --apply should be refused together with --ps-file.
    groups = _classified_groups(exporter, args)
    candidates = _describe_candidates(groups, current_process_group(), args.apply)

    signaled: list[int] = []
    if args.apply:
        signaled = _signal_candidates(candidates)

    remaining_after_wait = None
    if args.apply and args.wait_seconds > 0:
        time.sleep(args.wait_seconds)
        # Re-classify to report which matching groups still exist.
        remaining_after_wait = [group.pgid for group in _classified_groups(exporter, args)]

    payload = {
        "schema_version": "host_runaway_process_remediation_v1",
        "host": args.host,
        "mode": "apply_sigterm" if args.apply else "dry_run",
        "runtime_gate": 1 if args.apply else 0,
        "owner_approval_id": args.owner_approval_id if args.apply else None,
        "maintenance_window_id": args.maintenance_window_id if args.apply else None,
        "evidence_ref": args.evidence_ref if args.apply else None,
        "min_age_seconds": args.min_age_seconds,
        "min_cpu_percent": args.min_cpu_percent,
        "candidate_count": len(candidates),
        "signaled_process_group_count": len(signaled),
        "signaled_process_groups": signaled,
        "remaining_after_wait": remaining_after_wait,
        "candidates": candidates,
        "forbidden_without_gates": [
            "sigkill",
            "docker_restart",
            "systemctl_restart",
            "nginx_reload",
            "firewall_change",
            "secret_collection",
        ],
    }
    print(json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=True))

    # A delivery failure used to end the run with a traceback (non-zero
    # exit); keep the non-zero status, but only after the report is out.
    if any(candidate["signal_error"] for candidate in candidates):
        raise SystemExit(1)
# Run the workflow only when this file is executed as a script; importing it
# as a module has no side effects.
if __name__ == "__main__":
    main()