Files
awoooi/apps/api/src/services/host_runaway_aiops_loop_readiness.py
Your Name 0e72a6f428
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
Code Review / ai-code-review (push) Has been cancelled
CD Pipeline / tests (push) Has been cancelled
Ansible / Reboot Recovery Contract / validate (push) Has been cancelled
feat(aiops): expose host runaway loop readiness
2026-06-18 15:28:15 +08:00

314 lines
12 KiB
Python

"""
Host runaway AIOps loop readiness snapshot.
This loader exposes a committed, read-only product surface for the 110 CPU
runaway process loop. It validates that monitor, alert, event-packet, PlayBook,
KM / Verifier, and gated remediation contracts are complete while every runtime
write or process termination boundary remains closed.
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any
from src.services.snapshot_paths import default_evaluations_dir
# Directory searched when the loader is called without an explicit
# ``evaluations_dir``. NOTE(review): the exact location is decided by
# ``default_evaluations_dir``, which is defined outside this file.
_DEFAULT_EVALUATIONS_DIR = default_evaluations_dir(Path(__file__))

# Glob pattern identifying snapshot files inside the evaluations directory.
_SNAPSHOT_PATTERN = "host_runaway_aiops_loop_readiness_*.json"

# Value the snapshot's ``schema_version`` field must carry.
_SCHEMA_VERSION = "host_runaway_aiops_loop_readiness_v1"

# Value required in ``program_status.runtime_authority``.
_RUNTIME_AUTHORITY = "host_runaway_aiops_loop_readiness_only_no_host_write"

# Values required in ``program_status.current_task_id`` / ``next_task_id``.
_EXPECTED_CURRENT_TASK = "P3-009"
_EXPECTED_NEXT_TASK = "P3-010"

# Maps each required ``alertname`` to the ``lane_id`` its alert lane must use.
_EXPECTED_ALERT_LANES = {
    "HostOrphanBrowserSmokeHighCpu": "orphan_browser_smoke_runaway_process",
    "HostCiRunnerLoadSaturation": "ci_runner_load_saturation",
}

# ``rollups`` counters that must all be 0 for the snapshot to be accepted.
_ZERO_ROLLUP_FIELDS = {
    "runtime_remediation_authorized_count",
    "telegram_send_count",
    "gateway_queue_write_count",
    "bot_api_call_count",
    "host_write_count",
    "process_termination_count",
    "docker_restart_count",
    "systemd_restart_count",
    "nginx_reload_count",
    "firewall_change_count",
    "kubectl_action_count",
    "production_write_count",
}

# ``activation_boundaries`` flags that must be exactly ``False``.
_FALSE_BOUNDARY_FLAGS = {
    "runtime_remediation_enabled",
    "process_termination_authorized",
    "telegram_send_enabled",
    "gateway_queue_write_enabled",
    "bot_api_call_enabled",
    "host_write_enabled",
    "docker_restart_enabled",
    "systemd_restart_enabled",
    "nginx_reload_enabled",
    "firewall_change_enabled",
    "kubectl_action_enabled",
    "production_write_enabled",
    "secret_read_enabled",
}

# ``activation_boundaries`` flags that must be exactly ``True``.
_TRUE_BOUNDARY_FLAGS = {
    "read_only_readback_allowed",
    "ai_triage_packet_allowed",
    "dry_run_generation_allowed",
}

# Terms that must not occur anywhere in the snapshot payload; the check that
# consumes this set compares case-insensitively by substring.
_FORBIDDEN_PUBLIC_TERMS = {
    "工作視窗",
    "批准!",
    "codex_delegation",
    "source_thread_id",
    "My request for Codex",
    "authorization_header",
    "secret_value",
    "raw secret",
    "private key",
    "token value",
    "chain_of_thought",
}
def load_latest_host_runaway_aiops_loop_readiness(
    evaluations_dir: Path | None = None,
) -> dict[str, Any]:
    """Return the newest readiness snapshot after validating every contract.

    Snapshot files are those matching ``_SNAPSHOT_PATTERN`` inside the
    evaluations directory; the "newest" one is the path that sorts last.

    Args:
        evaluations_dir: Directory to search. When omitted (or falsy) the
            module default ``_DEFAULT_EVALUATIONS_DIR`` is used.

    Returns:
        The parsed snapshot as a dict, unchanged.

    Raises:
        FileNotFoundError: No snapshot file exists in the directory.
        ValueError: The file is not a JSON object, or one of the contract
            checks rejects it.
    """
    directory = evaluations_dir or _DEFAULT_EVALUATIONS_DIR

    # The greatest path is the same element a full sort would place last.
    latest = max(directory.glob(_SNAPSHOT_PATTERN), default=None)
    if latest is None:
        raise FileNotFoundError(
            f"no host runaway AIOps loop readiness snapshots found in {directory}"
        )

    with latest.open(encoding="utf-8") as handle:
        payload = json.load(handle)
    if not isinstance(payload, dict):
        raise ValueError(f"{latest}: expected JSON object")

    # Checks run in a fixed order; the first failing one raises.
    label = str(latest)
    checks = (
        _require_schema,
        _require_rollups,
        _require_loop_stages,
        _require_alert_lanes,
        _require_asset_writeback_contract,
        _require_live_readback,
        _require_remediation_gate,
        _require_activation_boundaries,
        _require_no_forbidden_public_terms,
    )
    for check in checks:
        check(payload, label)
    return payload
def _require_schema(payload: dict[str, Any], label: str) -> None:
    """Validate schema version, ``program_status`` and the source-ref count.

    Args:
        payload: Parsed snapshot object.
        label: Prefix for error messages (the snapshot path).

    Raises:
        ValueError: On a wrong ``schema_version``, any ``program_status``
            field differing from its pinned value, an empty ``status_note``,
            or ``source_refs`` not holding exactly 7 entries.
    """
    if payload.get("schema_version") != _SCHEMA_VERSION:
        raise ValueError(f"{label}: expected schema_version={_SCHEMA_VERSION}")

    program_status = payload.get("program_status") or {}
    status_drift = _mismatches(
        program_status,
        {
            "overall_completion_percent": 100,
            "current_priority": "P3",
            "current_task_id": _EXPECTED_CURRENT_TASK,
            "next_task_id": _EXPECTED_NEXT_TASK,
            "read_only_mode": True,
            "runtime_authority": _RUNTIME_AUTHORITY,
        },
    )
    if status_drift:
        raise ValueError(f"{label}: program_status mismatch: {status_drift}")
    if not program_status.get("status_note"):
        raise ValueError(f"{label}: program_status.status_note is required")

    ref_total = len(payload.get("source_refs") or [])
    if ref_total != 7:
        raise ValueError(f"{label}: source_refs must contain 7 refs")
def _require_rollups(payload: dict[str, Any], label: str) -> None:
    """Validate the ``rollups`` section of the snapshot.

    The structural counts must match their pinned values and every counter
    named in ``_ZERO_ROLLUP_FIELDS`` must equal 0.

    Args:
        payload: Parsed snapshot object.
        label: Prefix for error messages (the snapshot path).

    Raises:
        ValueError: A pinned count differs, or a zero-only counter is
            missing or non-zero.
    """
    rollups = payload.get("rollups") or {}

    count_drift = _mismatches(
        rollups,
        {
            "loop_stage_count": 6,
            "alert_lane_count": 2,
            "asset_writeback_contract_count": 5,
            "source_ref_count": 7,
            "live_readback_metric_count": 8,
            "blocked_runtime_action_count": 12,
        },
    )
    if count_drift:
        raise ValueError(f"{label}: rollup counts mismatch: {count_drift}")

    # A missing counter reads as None and is therefore reported as well.
    nonzero: dict[str, Any] = {}
    for field in _ZERO_ROLLUP_FIELDS:
        observed = rollups.get(field)
        if observed != 0:
            nonzero[field] = observed
    if nonzero:
        raise ValueError(f"{label}: zero rollup fields must remain 0: {nonzero}")
def _require_loop_stages(payload: dict[str, Any], label: str) -> None:
stages = payload.get("loop_stages") or []
if len(stages) != 6:
raise ValueError(f"{label}: loop_stages must contain 6 stages")
required_stage_ids = {
"read_only_host_textfile_exporter",
"prometheus_alert_rules",
"telegram_ai_event_packet",
"playbook_contract",
"km_verifier_writeback_contract",
"gated_remediation_helper",
}
stage_ids = {stage.get("stage_id") for stage in stages}
missing = sorted(required_stage_ids - stage_ids)
if missing:
raise ValueError(f"{label}: missing loop stages: {missing}")
for stage in stages:
stage_id = stage.get("stage_id") or "<missing>"
if stage.get("completion_percent") != 100:
raise ValueError(f"{label}: stage {stage_id} must be 100 percent")
for field in ("display_name", "owner_agent", "status", "next_action"):
if not stage.get(field):
raise ValueError(f"{label}: stage {stage_id} missing {field}")
if not stage.get("evidence_refs"):
raise ValueError(f"{label}: stage {stage_id} missing evidence_refs")
if not stage.get("blocked_runtime_actions"):
raise ValueError(f"{label}: stage {stage_id} missing blocked_runtime_actions")
def _require_alert_lanes(payload: dict[str, Any], label: str) -> None:
    """Validate the two ``alert_lanes`` entries of the snapshot.

    Each alert named in ``_EXPECTED_ALERT_LANES`` must be present with its
    pinned ``lane_id``, a ``runtime_write_gate`` of 0, an owner gate on apply
    and a non-empty ``next_action``.

    Args:
        payload: Parsed snapshot object.
        label: Prefix for error messages (the snapshot path).

    Raises:
        ValueError: The lane count is not 2, an expected alert is absent, or
            a lane violates one of the requirements above.
    """
    lanes = payload.get("alert_lanes") or []
    if len(lanes) != 2:
        raise ValueError(f"{label}: alert_lanes must contain 2 lanes")

    # Index lanes by alert name; a later duplicate replaces an earlier one.
    lanes_by_alert: dict[Any, Any] = {}
    for lane in lanes:
        lanes_by_alert[lane.get("alertname")] = lane

    absent = sorted(
        alertname
        for alertname in _EXPECTED_ALERT_LANES
        if alertname not in lanes_by_alert
    )
    if absent:
        raise ValueError(f"{label}: missing alert lanes: {absent}")

    for alertname, wanted_lane_id in _EXPECTED_ALERT_LANES.items():
        entry = lanes_by_alert[alertname]
        if entry.get("lane_id") != wanted_lane_id:
            raise ValueError(f"{label}: {alertname} lane_id mismatch")
        if entry.get("runtime_write_gate") != 0:
            raise ValueError(f"{label}: {alertname} runtime_write_gate must be 0")
        if entry.get("apply_allowed_without_owner_gate") is not False:
            raise ValueError(f"{label}: {alertname} apply must require owner gate")
        if not entry.get("next_action"):
            raise ValueError(f"{label}: {alertname} next_action is required")
def _require_asset_writeback_contract(payload: dict[str, Any], label: str) -> None:
contracts = payload.get("asset_writeback_contract") or []
if len(contracts) != 5:
raise ValueError(f"{label}: asset_writeback_contract must contain 5 contracts")
for contract in contracts:
asset_id = contract.get("asset_id") or "<missing>"
if contract.get("required_on_real_incident") is not True:
raise ValueError(f"{label}: {asset_id} must be required on real incidents")
if contract.get("live_write_enabled") is not False:
raise ValueError(f"{label}: {asset_id} live_write_enabled must remain false")
if not contract.get("required_fields"):
raise ValueError(f"{label}: {asset_id} required_fields must not be empty")
def _require_live_readback(payload: dict[str, Any], label: str) -> None:
    """Validate the ``live_readback`` section against its pinned values.

    Args:
        payload: Parsed snapshot object.
        label: Prefix for error messages (the snapshot path).

    Raises:
        ValueError: A pinned readback field differs, or ``runtime_revision``
            does not start with ``f358a0f6``.
    """
    readback = payload.get("live_readback") or {}

    drift = _mismatches(
        readback,
        {
            "host_label": "110",
            "monitor_up": 1,
            "orphan_browser_group_count": 0,
            "active_ci_container_count": 2,
            "remediation_authorized_count": 0,
            "alerts_firing_count": 0,
            "deploy_marker": "2d278568",
            "argocd_sync": "Synced",
            "argocd_health": "Healthy",
            "production_route_count": 3,
            "forbidden_public_hit_count": 0,
        },
    )
    if drift:
        raise ValueError(f"{label}: live_readback mismatch: {drift}")

    # Only the revision prefix is pinned; the value is stringified first so a
    # non-string entry fails the prefix test instead of raising.
    revision = str(readback.get("runtime_revision", ""))
    if not revision.startswith("f358a0f6"):
        raise ValueError(f"{label}: live_readback.runtime_revision must start with f358a0f6")
def _require_remediation_gate(payload: dict[str, Any], label: str) -> None:
    """Validate the ``remediation_gate`` section of the snapshot.

    All gate requirements must be switched on, the only signal permitted
    after the gate is ``SIGTERM``, process termination must stay
    unauthorized, and exactly 8 disallowed actions must be listed.

    Args:
        payload: Parsed snapshot object.
        label: Prefix for error messages (the snapshot path).

    Raises:
        ValueError: A gate field differs from its pinned value or the
            ``disallowed_actions`` length is not 8.
    """
    gate = payload.get("remediation_gate") or {}

    gate_drift = _mismatches(
        gate,
        {
            "dry_run_required": True,
            "owner_approval_required": True,
            "maintenance_window_required": True,
            "evidence_ref_required": True,
            "post_check_required": True,
            "allowed_signal_after_gate": "SIGTERM",
            "process_termination_authorized": False,
        },
    )
    if gate_drift:
        raise ValueError(f"{label}: remediation_gate mismatch: {gate_drift}")

    disallowed = gate.get("disallowed_actions") or []
    if len(disallowed) != 8:
        raise ValueError(f"{label}: remediation_gate.disallowed_actions must contain 8 actions")
def _require_activation_boundaries(payload: dict[str, Any], label: str) -> None:
    """Validate ``activation_boundaries`` flags and ``next_steps`` write gates.

    Flags in ``_TRUE_BOUNDARY_FLAGS`` must be exactly ``True``, flags in
    ``_FALSE_BOUNDARY_FLAGS`` exactly ``False``, and every ``next_steps``
    entry must have ``runtime_write_allowed`` set to exactly ``False``.

    Args:
        payload: Parsed snapshot object.
        label: Prefix for error messages (the snapshot path).

    Raises:
        ValueError: Any flag or step deviates from the required value.
    """
    boundaries = payload.get("activation_boundaries") or {}

    def deviating(flags: set[str], required: bool) -> dict[str, Any]:
        # Identity comparison: truthy/falsy stand-ins such as 1 or 0 do not
        # satisfy the requirement, and a missing flag reads as None.
        found: dict[str, Any] = {}
        for flag in flags:
            current = boundaries.get(flag)
            if current is not required:
                found[flag] = current
        return found

    not_true = deviating(_TRUE_BOUNDARY_FLAGS, True)
    if not_true:
        raise ValueError(f"{label}: true activation boundaries mismatch: {not_true}")

    not_false = deviating(_FALSE_BOUNDARY_FLAGS, False)
    if not_false:
        raise ValueError(f"{label}: false activation boundaries mismatch: {not_false}")

    steps = payload.get("next_steps") or []
    if any(step.get("runtime_write_allowed") is not False for step in steps):
        raise ValueError(f"{label}: next_steps runtime_write_allowed must remain false")
def _require_no_forbidden_public_terms(payload: Any, label: str) -> None:
    """Reject a payload that contains any term from ``_FORBIDDEN_PUBLIC_TERMS``.

    The payload is walked recursively through dicts and lists. Both string
    values and dict keys are checked with a case-insensitive substring match.
    Keys are checked because several forbidden terms are identifier-shaped
    and would otherwise pass unnoticed when used as field names.

    Args:
        payload: Parsed snapshot (any JSON-like nesting of dict/list/str).
        label: Prefix for the error message (the snapshot path).

    Raises:
        ValueError: At least one forbidden term was found; the message lists
            up to the first five hits as ``path: term`` (key hits are marked
            with ``(key)`` after the path).
    """
    # Lower-case each term once, and sort so that the reported hits do not
    # depend on set iteration order (which varies with hash randomisation).
    needles = sorted((term.lower(), term) for term in _FORBIDDEN_PUBLIC_TERMS)
    hits: list[str] = []

    def scan(text: str, where: str) -> None:
        lowered = text.lower()
        hits.extend(f"{where}: {term}" for needle, term in needles if needle in lowered)

    def walk(value: Any, path: str) -> None:
        if isinstance(value, dict):
            for key, nested in value.items():
                child_path = f"{path}.{key}" if path else str(key)
                if isinstance(key, str):
                    scan(key, f"{child_path} (key)")
                walk(nested, child_path)
        elif isinstance(value, list):
            for index, nested in enumerate(value):
                walk(nested, f"{path}[{index}]")
        elif isinstance(value, str):
            scan(value, path)

    walk(payload, "")
    if hits:
        raise ValueError(f"{label}: forbidden public terms: {hits[:5]}")
def _mismatches(actual: dict[str, Any], expected: dict[str, Any]) -> dict[str, dict[str, Any]]:
return {
key: {"expected": expected_value, "actual": actual.get(key)}
for key, expected_value in expected.items()
if actual.get(key) != expected_value
}