166 lines
5.9 KiB
Python
Executable File
166 lines
5.9 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Gated remediation helper for AWOOOI host runaway process groups.

Default mode is dry-run. Applying SIGTERM requires --apply together with
--confirm-apply, --rule, an owner approval id, a maintenance window id, and
an evidence reference. This script is a PlayBook primitive, not a background
auto-kill daemon.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import importlib.util
|
|
import json
|
|
import os
|
|
import signal
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
from types import ModuleType
|
|
|
|
|
|
# Sibling exporter script located next to this file. Its filename contains
# hyphens, so it cannot be imported with a plain ``import`` statement and is
# loaded by path instead.
EXPORTER_PATH = Path(__file__).with_name("host-runaway-process-exporter.py")
|
|
|
|
|
|
def load_exporter() -> ModuleType:
    """Import the exporter script at ``EXPORTER_PATH`` and return the module.

    The module is registered in ``sys.modules`` under the spec name
    (``host_runaway_process_exporter``) before its body is executed.

    Returns:
        The executed exporter module object.

    Raises:
        RuntimeError: If no import spec, or no loader for that spec, can be
            built for ``EXPORTER_PATH``.
    """
    module_spec = importlib.util.spec_from_file_location(
        "host_runaway_process_exporter",
        EXPORTER_PATH,
    )
    # A missing spec and a spec without a loader are reported the same way.
    loader = None if module_spec is None else module_spec.loader
    if loader is None:
        raise RuntimeError(f"cannot load exporter module: {EXPORTER_PATH}")

    exporter = importlib.util.module_from_spec(module_spec)
    sys.modules[module_spec.name] = exporter
    loader.exec_module(exporter)
    return exporter
|
|
|
|
|
|
def parse_args() -> argparse.Namespace:
|
|
parser = argparse.ArgumentParser(description="Dry-run or gated SIGTERM for AWOOOI runaway process groups.")
|
|
parser.add_argument("--host", default=os.environ.get("AIOPS_HOST_LABEL", os.uname().nodename))
|
|
parser.add_argument("--rule", help="Limit candidates to one rule id. Required with --apply.")
|
|
parser.add_argument("--ps-file", type=Path, help="Use a fixture ps file for tests or offline review.")
|
|
parser.add_argument("--min-age-seconds", type=int, default=1800)
|
|
parser.add_argument("--min-cpu-percent", type=float, default=50)
|
|
parser.add_argument("--apply", action="store_true", help="Send SIGTERM to matching process groups.")
|
|
parser.add_argument("--confirm-apply", action="store_true", help="Required together with --apply.")
|
|
parser.add_argument("--owner-approval-id", default="")
|
|
parser.add_argument("--maintenance-window-id", default="")
|
|
parser.add_argument("--evidence-ref", default="")
|
|
parser.add_argument("--wait-seconds", type=int, default=0, help="Optional wait after SIGTERM before re-reading ps.")
|
|
return parser.parse_args()
|
|
|
|
|
|
def validate_apply_args(args: argparse.Namespace) -> None:
    """Refuse ``--apply`` unless every approval gate has been supplied.

    Dry-run invocations (``args.apply`` false) are never restricted. In apply
    mode the caller must pass ``--confirm-apply`` plus non-blank values for
    ``--rule``, ``--owner-approval-id``, ``--maintenance-window-id`` and
    ``--evidence-ref``.

    Args:
        args: Parsed command-line namespace.

    Raises:
        SystemExit: In apply mode when one or more gates are missing; the
            message lists the missing options in a fixed order.
    """
    if not args.apply:
        return

    def _is_blank(value: object) -> bool:
        # A whitespace-only id is not evidence of an approval, so it is
        # treated exactly like an omitted option.
        return not str(value or "").strip()

    gates = (
        ("--confirm-apply", not args.confirm_apply),
        ("--rule", _is_blank(args.rule)),
        ("--owner-approval-id", _is_blank(args.owner_approval_id)),
        ("--maintenance-window-id", _is_blank(args.maintenance_window_id)),
        ("--evidence-ref", _is_blank(args.evidence_ref)),
    )
    missing = [flag for flag, absent in gates if absent]
    if missing:
        raise SystemExit(
            "Refusing apply; missing required gates: "
            + ", ".join(missing)
            + ". Use dry-run output for the PlayBook packet first."
        )
|
|
|
|
|
|
def current_process_group() -> int:
    """Return this process's own process-group id, or ``-1`` if the lookup fails.

    The lookup is best-effort: any exception raised by ``os.getpgrp`` is
    swallowed and replaced by the ``-1`` sentinel.
    """
    try:
        own_pgid = os.getpgrp()
    except Exception:
        own_pgid = -1
    return own_pgid
|
|
|
|
|
|
def _classified_groups(exporter: ModuleType, args: argparse.Namespace):
    """Read ps output via the exporter and return the groups matching the thresholds and ``--rule``."""
    rows = exporter.parse_ps_rows(exporter.read_ps_text(args.ps_file))
    groups = exporter.classify_groups(
        rows,
        min_age_seconds=args.min_age_seconds,
        min_cpu_percent=args.min_cpu_percent,
    )
    if args.rule:
        groups = [group for group in groups if group.rule_id == args.rule]
    return groups


def _candidate_record(group, own_pgrp: int, apply: bool) -> dict[str, object]:
    """Describe one classified process group as a JSON-serialisable candidate entry."""
    # Process groups with pgid <= 1 and the helper's own group are never signalled.
    if group.pgid <= 1:
        blocked_reason = "unsafe_pgid"
    elif group.pgid == own_pgrp:
        blocked_reason = "own_process_group"
    else:
        blocked_reason = None

    if blocked_reason:
        action = "skip"
    else:
        action = "sigterm" if apply else "dry_run"

    return {
        "rule": group.rule_id,
        "pgid": group.pgid,
        "process_count": len(group.rows),
        "cpu_percent": round(group.cpu_percent, 3),
        "oldest_age_seconds": group.oldest_age_seconds,
        "orphan_reason": group.orphan_reason,
        "sample_comm": group.sample_comm,
        "blocked_reason": blocked_reason,
        "action": action,
    }


def _send_sigterm(candidates: list[dict[str, object]]) -> list[int]:
    """Send SIGTERM to every unblocked candidate and return the pgids actually signalled.

    A failing ``os.killpg`` (for example the group exited after ps was read,
    or the caller lacks permission) is recorded on the candidate as
    ``action = "sigterm_failed"`` plus a ``signal_error`` message, and the
    remaining candidates are still processed.
    """
    signaled: list[int] = []
    for candidate in candidates:
        if candidate["blocked_reason"]:
            continue
        pgid = int(candidate["pgid"])
        try:
            os.killpg(pgid, signal.SIGTERM)
        except OSError as exc:
            candidate["action"] = "sigterm_failed"
            candidate["signal_error"] = f"{type(exc).__name__}: {exc}"
            continue
        signaled.append(pgid)
    return signaled


def main() -> None:
    """Run the remediation helper: classify, optionally SIGTERM, and print a JSON report.

    Steps:
      1. Parse and gate-check the command line (apply mode needs every gate).
      2. Load the exporter module and classify the current ps rows.
      3. Build one candidate entry per matching process group.
      4. In apply mode, send SIGTERM to each unblocked candidate.
      5. If ``--wait-seconds`` is positive in apply mode, sleep and re-classify
         to list the process groups that still match.
      6. Print the report as sorted, indented JSON on stdout.

    Raises:
        SystemExit: With status 1, after the report has been printed, when at
            least one SIGTERM could not be delivered. Gate failures raise
            ``SystemExit`` earlier through ``validate_apply_args``.
    """
    args = parse_args()
    validate_apply_args(args)
    exporter = load_exporter()

    # NOTE(review): the pgids come from ``args.ps_file`` when it is given, and
    # ``--apply`` is not rejected in that case - confirm that signalling live
    # process groups based on a fixture file is intended.
    own_pgrp = current_process_group()
    candidates = [
        _candidate_record(group, own_pgrp, args.apply)
        for group in _classified_groups(exporter, args)
    ]

    signaled: list[int] = _send_sigterm(candidates) if args.apply else []

    remaining_after_wait = None
    if args.apply and args.wait_seconds > 0:
        time.sleep(args.wait_seconds)
        remaining_after_wait = [group.pgid for group in _classified_groups(exporter, args)]

    payload = {
        "schema_version": "host_runaway_process_remediation_v1",
        "host": args.host,
        "mode": "apply_sigterm" if args.apply else "dry_run",
        "runtime_gate": 1 if args.apply else 0,
        # Approval references are only reported for runs that actually applied.
        "owner_approval_id": args.owner_approval_id if args.apply else None,
        "maintenance_window_id": args.maintenance_window_id if args.apply else None,
        "evidence_ref": args.evidence_ref if args.apply else None,
        "min_age_seconds": args.min_age_seconds,
        "min_cpu_percent": args.min_cpu_percent,
        "candidate_count": len(candidates),
        "signaled_process_group_count": len(signaled),
        "signaled_process_groups": signaled,
        "remaining_after_wait": remaining_after_wait,
        "candidates": candidates,
        "forbidden_without_gates": [
            "sigkill",
            "docker_restart",
            "systemctl_restart",
            "nginx_reload",
            "firewall_change",
            "secret_collection",
        ],
    }
    print(json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=True))

    # A failed signal used to abort the run with a traceback before any report
    # was written; keep the non-zero exit status, but only after the audit
    # payload has been emitted.
    if any(candidate["action"] == "sigterm_failed" for candidate in candidates):
        raise SystemExit(1)
|
|
|
|
|
|
# Run the helper only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
|