Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
Code Review / ai-code-review (push) Has been cancelled
CD Pipeline / tests (push) Has been cancelled
Ansible / Reboot Recovery Contract / validate (push) Has been cancelled
314 lines
12 KiB
Python
314 lines
12 KiB
Python
"""
Host runaway AIOps loop readiness snapshot.

This loader exposes a committed, read-only product surface for the 110 CPU
runaway process loop. It validates that monitor, alert, event-packet, PlayBook,
KM / Verifier, and gated remediation contracts are complete while every runtime
write or process termination boundary remains closed.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
from src.services.snapshot_paths import default_evaluations_dir
|
|
|
|
# Default snapshot directory, derived from this module's path by the shared
# ``default_evaluations_dir`` helper.
_DEFAULT_EVALUATIONS_DIR = default_evaluations_dir(Path(__file__))

# Glob pattern used to discover snapshot files inside the evaluations directory.
_SNAPSHOT_PATTERN = "host_runaway_aiops_loop_readiness_*.json"

# Values the snapshot's ``schema_version`` and ``program_status`` must carry.
_SCHEMA_VERSION = "host_runaway_aiops_loop_readiness_v1"
_RUNTIME_AUTHORITY = "host_runaway_aiops_loop_readiness_only_no_host_write"
_EXPECTED_CURRENT_TASK = "P3-009"
_EXPECTED_NEXT_TASK = "P3-010"

# Alert name -> lane_id each entry of ``alert_lanes`` must declare.
_EXPECTED_ALERT_LANES = dict(
    HostOrphanBrowserSmokeHighCpu="orphan_browser_smoke_runaway_process",
    HostCiRunnerLoadSaturation="ci_runner_load_saturation",
)

# Rollup counters that must all equal 0 in the snapshot.
_ZERO_ROLLUP_FIELDS = {
    "runtime_remediation_authorized_count",
    "telegram_send_count",
    "gateway_queue_write_count",
    "bot_api_call_count",
    "host_write_count",
    "process_termination_count",
    "docker_restart_count",
    "systemd_restart_count",
    "nginx_reload_count",
    "firewall_change_count",
    "kubectl_action_count",
    "production_write_count",
}

# ``activation_boundaries`` flags that must be exactly ``False``.
_FALSE_BOUNDARY_FLAGS = {
    "runtime_remediation_enabled",
    "process_termination_authorized",
    "telegram_send_enabled",
    "gateway_queue_write_enabled",
    "bot_api_call_enabled",
    "host_write_enabled",
    "docker_restart_enabled",
    "systemd_restart_enabled",
    "nginx_reload_enabled",
    "firewall_change_enabled",
    "kubectl_action_enabled",
    "production_write_enabled",
    "secret_read_enabled",
}

# ``activation_boundaries`` flags that must be exactly ``True``.
_TRUE_BOUNDARY_FLAGS = {
    "read_only_readback_allowed",
    "ai_triage_packet_allowed",
    "dry_run_generation_allowed",
}

# Substrings (matched case-insensitively) that must not appear in any string
# value of the snapshot.
_FORBIDDEN_PUBLIC_TERMS = {
    "工作視窗",
    "批准!",
    "codex_delegation",
    "source_thread_id",
    "My request for Codex",
    "authorization_header",
    "secret_value",
    "raw secret",
    "private key",
    "token value",
    "chain_of_thought",
}
|
|
|
|
|
|
def load_latest_host_runaway_aiops_loop_readiness(
    evaluations_dir: Path | None = None,
) -> dict[str, Any]:
    """Load and validate the newest host runaway AIOps loop readiness snapshot.

    "Newest" is the matching file whose path sorts last.

    Args:
        evaluations_dir: Directory to search; falls back to the module default
            when omitted.

    Returns:
        The parsed snapshot, returned only after every validator has passed.

    Raises:
        FileNotFoundError: If no file matches the snapshot pattern.
        ValueError: If the file is not a JSON object or fails any validation.
    """
    directory = evaluations_dir or _DEFAULT_EVALUATIONS_DIR
    # The maximum path is the same element a full sort would place last.
    latest = max(directory.glob(_SNAPSHOT_PATTERN), default=None)
    if latest is None:
        raise FileNotFoundError(
            f"no host runaway AIOps loop readiness snapshots found in {directory}"
        )

    payload = json.loads(latest.read_text(encoding="utf-8"))
    if not isinstance(payload, dict):
        raise ValueError(f"{latest}: expected JSON object")

    label = str(latest)
    # Order matters: the first failing validator determines the error raised.
    validators = (
        _require_schema,
        _require_rollups,
        _require_loop_stages,
        _require_alert_lanes,
        _require_asset_writeback_contract,
        _require_live_readback,
        _require_remediation_gate,
        _require_activation_boundaries,
        _require_no_forbidden_public_terms,
    )
    for validate in validators:
        validate(payload, label)
    return payload
|
|
|
|
|
|
def _require_schema(payload: dict[str, Any], label: str) -> None:
    """Check the schema version, the program status block, and the source refs.

    Args:
        payload: Parsed snapshot object.
        label: Prefix used in error messages (the snapshot path).

    Raises:
        ValueError: On a wrong ``schema_version``, any ``program_status``
            mismatch, an empty ``status_note``, or a ``source_refs`` length
            other than 7.
    """
    if payload.get("schema_version") != _SCHEMA_VERSION:
        raise ValueError(f"{label}: expected schema_version={_SCHEMA_VERSION}")

    program_status = payload.get("program_status") or {}
    required_status = dict(
        overall_completion_percent=100,
        current_priority="P3",
        current_task_id=_EXPECTED_CURRENT_TASK,
        next_task_id=_EXPECTED_NEXT_TASK,
        read_only_mode=True,
        runtime_authority=_RUNTIME_AUTHORITY,
    )
    status_diff = _mismatches(program_status, required_status)
    if status_diff:
        raise ValueError(f"{label}: program_status mismatch: {status_diff}")
    if not program_status.get("status_note"):
        raise ValueError(f"{label}: program_status.status_note is required")

    ref_count = len(payload.get("source_refs") or [])
    if ref_count != 7:
        raise ValueError(f"{label}: source_refs must contain 7 refs")
|
|
|
|
|
|
def _require_rollups(payload: dict[str, Any], label: str) -> None:
    """Check the fixed rollup counts and that every guarded counter is zero.

    Args:
        payload: Parsed snapshot object.
        label: Prefix used in error messages (the snapshot path).

    Raises:
        ValueError: If a fixed count differs from its expected value, or any
            field named in ``_ZERO_ROLLUP_FIELDS`` is not equal to 0.
    """
    rollups = payload.get("rollups") or {}

    required_counts = dict(
        loop_stage_count=6,
        alert_lane_count=2,
        asset_writeback_contract_count=5,
        source_ref_count=7,
        live_readback_metric_count=8,
        blocked_runtime_action_count=12,
    )
    count_diff = _mismatches(rollups, required_counts)
    if count_diff:
        raise ValueError(f"{label}: rollup counts mismatch: {count_diff}")

    # Collect every offending counter so the error reports all of them at once.
    nonzero = {}
    for field in _ZERO_ROLLUP_FIELDS:
        observed = rollups.get(field)
        if observed != 0:
            nonzero[field] = observed
    if nonzero:
        raise ValueError(f"{label}: zero rollup fields must remain 0: {nonzero}")
|
|
|
|
|
|
def _require_loop_stages(payload: dict[str, Any], label: str) -> None:
    """Validate that all six loop stages are present, complete, and documented.

    Malformed shapes (``loop_stages`` not a list, entries that are not JSON
    objects, unhashable ``stage_id`` values) are reported as ``ValueError``
    rather than surfacing as ``AttributeError`` / ``TypeError``.

    Args:
        payload: Parsed snapshot object.
        label: Prefix used in error messages (the snapshot path).

    Raises:
        ValueError: If ``loop_stages`` is not a list of exactly 6 objects, a
            required stage id is absent, a stage is not at 100 percent, or a
            stage lacks any required descriptive field.
    """
    stages = payload.get("loop_stages") or []
    # A 6-character string or a 6-key dict would satisfy a bare len() check,
    # so the container type is verified together with the length.
    if not isinstance(stages, (list, tuple)) or len(stages) != 6:
        raise ValueError(f"{label}: loop_stages must contain 6 stages")
    for index, stage in enumerate(stages):
        if not isinstance(stage, dict):
            raise ValueError(f"{label}: loop_stages[{index}] must be an object")

    required_stage_ids = {
        "read_only_host_textfile_exporter",
        "prometheus_alert_rules",
        "telegram_ai_event_packet",
        "playbook_contract",
        "km_verifier_writeback_contract",
        "gated_remediation_helper",
    }
    # Only string ids can match a required id; skipping the rest also avoids
    # hashing unhashable values such as lists.
    declared_ids = (stage.get("stage_id") for stage in stages)
    stage_ids = {stage_id for stage_id in declared_ids if isinstance(stage_id, str)}
    missing = sorted(required_stage_ids - stage_ids)
    if missing:
        raise ValueError(f"{label}: missing loop stages: {missing}")

    for stage in stages:
        stage_id = stage.get("stage_id") or "<missing>"
        if stage.get("completion_percent") != 100:
            raise ValueError(f"{label}: stage {stage_id} must be 100 percent")
        for field in ("display_name", "owner_agent", "status", "next_action"):
            if not stage.get(field):
                raise ValueError(f"{label}: stage {stage_id} missing {field}")
        if not stage.get("evidence_refs"):
            raise ValueError(f"{label}: stage {stage_id} missing evidence_refs")
        if not stage.get("blocked_runtime_actions"):
            raise ValueError(f"{label}: stage {stage_id} missing blocked_runtime_actions")
|
|
|
|
|
|
def _require_alert_lanes(payload: dict[str, Any], label: str) -> None:
    """Validate the two expected alert lanes and their closed write gates.

    Malformed shapes (``alert_lanes`` not a list, entries that are not JSON
    objects, unhashable ``alertname`` values) are reported as ``ValueError``
    rather than surfacing as ``AttributeError`` / ``TypeError``.

    Args:
        payload: Parsed snapshot object.
        label: Prefix used in error messages (the snapshot path).

    Raises:
        ValueError: If ``alert_lanes`` is not a list of exactly 2 objects, an
            expected alert name is absent, or a lane has the wrong ``lane_id``,
            a non-zero ``runtime_write_gate``, an
            ``apply_allowed_without_owner_gate`` other than ``False``, or no
            ``next_action``.
    """
    lanes = payload.get("alert_lanes") or []
    # Guard the container type as well as the length: a 2-character string
    # would otherwise pass the count check and fail later on ``.get``.
    if not isinstance(lanes, (list, tuple)) or len(lanes) != 2:
        raise ValueError(f"{label}: alert_lanes must contain 2 lanes")

    lane_map: dict[str, dict[str, Any]] = {}
    for index, lane in enumerate(lanes):
        if not isinstance(lane, dict):
            raise ValueError(f"{label}: alert_lanes[{index}] must be an object")
        alertname = lane.get("alertname")
        # Only string names can match an expected alert; this also keeps
        # unhashable values out of the mapping.
        if isinstance(alertname, str):
            lane_map[alertname] = lane

    missing = sorted(set(_EXPECTED_ALERT_LANES) - set(lane_map))
    if missing:
        raise ValueError(f"{label}: missing alert lanes: {missing}")

    for alertname, expected_lane in _EXPECTED_ALERT_LANES.items():
        lane = lane_map[alertname]
        if lane.get("lane_id") != expected_lane:
            raise ValueError(f"{label}: {alertname} lane_id mismatch")
        if lane.get("runtime_write_gate") != 0:
            raise ValueError(f"{label}: {alertname} runtime_write_gate must be 0")
        if lane.get("apply_allowed_without_owner_gate") is not False:
            raise ValueError(f"{label}: {alertname} apply must require owner gate")
        if not lane.get("next_action"):
            raise ValueError(f"{label}: {alertname} next_action is required")
|
|
|
|
|
|
def _require_asset_writeback_contract(payload: dict[str, Any], label: str) -> None:
    """Validate the five asset write-back contracts.

    Malformed shapes (``asset_writeback_contract`` not a list, or entries that
    are not JSON objects) are reported as ``ValueError`` rather than surfacing
    as ``AttributeError``.

    Args:
        payload: Parsed snapshot object.
        label: Prefix used in error messages (the snapshot path).

    Raises:
        ValueError: If the value is not a list of exactly 5 objects, or a
            contract has ``required_on_real_incident`` other than ``True``,
            ``live_write_enabled`` other than ``False``, or empty
            ``required_fields``.
    """
    contracts = payload.get("asset_writeback_contract") or []
    # A 5-character string or 5-key dict would satisfy a bare len() check.
    if not isinstance(contracts, (list, tuple)) or len(contracts) != 5:
        raise ValueError(f"{label}: asset_writeback_contract must contain 5 contracts")

    for index, contract in enumerate(contracts):
        if not isinstance(contract, dict):
            raise ValueError(
                f"{label}: asset_writeback_contract[{index}] must be an object"
            )
        asset_id = contract.get("asset_id") or "<missing>"
        # Identity checks on purpose: truthy/falsy stand-ins (1, 0, "") must
        # not satisfy a boolean contract flag.
        if contract.get("required_on_real_incident") is not True:
            raise ValueError(f"{label}: {asset_id} must be required on real incidents")
        if contract.get("live_write_enabled") is not False:
            raise ValueError(f"{label}: {asset_id} live_write_enabled must remain false")
        if not contract.get("required_fields"):
            raise ValueError(f"{label}: {asset_id} required_fields must not be empty")
|
|
|
|
|
|
def _require_live_readback(payload: dict[str, Any], label: str) -> None:
    """Check the recorded live readback against the pinned expected values.

    Args:
        payload: Parsed snapshot object.
        label: Prefix used in error messages (the snapshot path).

    Raises:
        ValueError: If any pinned readback field differs, or
            ``runtime_revision`` does not start with ``f358a0f6``.
    """
    readback = payload.get("live_readback") or {}

    pinned_readback = dict(
        host_label="110",
        monitor_up=1,
        orphan_browser_group_count=0,
        active_ci_container_count=2,
        remediation_authorized_count=0,
        alerts_firing_count=0,
        deploy_marker="2d278568",
        argocd_sync="Synced",
        argocd_health="Healthy",
        production_route_count=3,
        forbidden_public_hit_count=0,
    )
    readback_diff = _mismatches(readback, pinned_readback)
    if readback_diff:
        raise ValueError(f"{label}: live_readback mismatch: {readback_diff}")

    # Coerce to str so a missing or non-string revision fails the prefix test
    # instead of raising.
    revision = str(readback.get("runtime_revision", ""))
    if not revision.startswith("f358a0f6"):
        raise ValueError(f"{label}: live_readback.runtime_revision must start with f358a0f6")
|
|
|
|
|
|
def _require_remediation_gate(payload: dict[str, Any], label: str) -> None:
    """Check that the remediation gate keeps every safeguard switched on.

    Args:
        payload: Parsed snapshot object.
        label: Prefix used in error messages (the snapshot path).

    Raises:
        ValueError: If any gate field differs from its required value, or
            ``disallowed_actions`` does not hold exactly 8 entries.
    """
    gate = payload.get("remediation_gate") or {}

    required_gate = dict(
        dry_run_required=True,
        owner_approval_required=True,
        maintenance_window_required=True,
        evidence_ref_required=True,
        post_check_required=True,
        allowed_signal_after_gate="SIGTERM",
        process_termination_authorized=False,
    )
    gate_diff = _mismatches(gate, required_gate)
    if gate_diff:
        raise ValueError(f"{label}: remediation_gate mismatch: {gate_diff}")

    disallowed = gate.get("disallowed_actions") or []
    if len(disallowed) != 8:
        raise ValueError(f"{label}: remediation_gate.disallowed_actions must contain 8 actions")
|
|
|
|
|
|
def _require_activation_boundaries(payload: dict[str, Any], label: str) -> None:
    """Validate activation boundary flags and the ``next_steps`` write guards.

    Flags are compared by identity, so truthy/falsy stand-ins such as ``1`` or
    ``0`` do not satisfy a boundary. Malformed shapes are reported as
    ``ValueError`` rather than surfacing as ``AttributeError``, and flags are
    visited in sorted order so the mismatch report is stable between runs.

    Args:
        payload: Parsed snapshot object.
        label: Prefix used in error messages (the snapshot path).

    Raises:
        ValueError: If ``activation_boundaries`` is not an object, a flag in
            ``_TRUE_BOUNDARY_FLAGS`` is not ``True``, a flag in
            ``_FALSE_BOUNDARY_FLAGS`` is not ``False``, ``next_steps`` is not a
            list, or a step does not carry ``runtime_write_allowed`` equal to
            ``False``.
    """
    boundaries = payload.get("activation_boundaries") or {}
    if not isinstance(boundaries, dict):
        raise ValueError(f"{label}: activation_boundaries must be an object")

    true_mismatches = {
        flag: boundaries.get(flag)
        for flag in sorted(_TRUE_BOUNDARY_FLAGS)
        if boundaries.get(flag) is not True
    }
    if true_mismatches:
        raise ValueError(f"{label}: true activation boundaries mismatch: {true_mismatches}")

    false_mismatches = {
        flag: boundaries.get(flag)
        for flag in sorted(_FALSE_BOUNDARY_FLAGS)
        if boundaries.get(flag) is not False
    }
    if false_mismatches:
        raise ValueError(f"{label}: false activation boundaries mismatch: {false_mismatches}")

    next_steps = payload.get("next_steps") or []
    if not isinstance(next_steps, (list, tuple)):
        raise ValueError(f"{label}: next_steps must be a list")
    for step in next_steps:
        # A step that is not an object cannot declare the guard, so it is
        # treated the same as a step with the guard missing.
        if not isinstance(step, dict) or step.get("runtime_write_allowed") is not False:
            raise ValueError(f"{label}: next_steps runtime_write_allowed must remain false")
|
|
|
|
|
|
def _require_no_forbidden_public_terms(payload: Any, label: str) -> None:
    """Reject a snapshot whose keys or string values contain a forbidden term.

    The payload is walked recursively through dicts and lists. Matching is a
    case-insensitive substring test. Object keys are scanned as well as string
    values, because several forbidden terms are field-name shaped and would
    otherwise pass unnoticed when used as a key.

    Args:
        payload: Parsed snapshot (any JSON-compatible value).
        label: Prefix used in error messages (the snapshot path).

    Raises:
        ValueError: If any hit is found; the message lists at most the first
            five hits as ``"<path>: <term>"``.
    """
    # Lower-case each term once, and sort so hit order does not depend on set
    # iteration order.
    needles = sorted((term, term.lower()) for term in _FORBIDDEN_PUBLIC_TERMS)
    hits: list[str] = []

    def scan(text: str, path: str) -> None:
        lowered = text.lower()
        hits.extend(f"{path}: {term}" for term, needle in needles if needle in lowered)

    def walk(value: Any, path: str) -> None:
        if isinstance(value, dict):
            for key, nested in value.items():
                child_path = f"{path}.{key}" if path else str(key)
                scan(str(key), child_path)
                walk(nested, child_path)
        elif isinstance(value, list):
            for index, nested in enumerate(value):
                walk(nested, f"{path}[{index}]")
        elif isinstance(value, str):
            scan(value, path)

    walk(payload, "")
    if hits:
        raise ValueError(f"{label}: forbidden public terms: {hits[:5]}")
|
|
|
|
|
|
def _mismatches(actual: dict[str, Any], expected: dict[str, Any]) -> dict[str, dict[str, Any]]:
    """Return the expected keys whose value in ``actual`` differs.

    Comparison is by equality, except that a boolean never matches a
    non-boolean: Python treats ``True == 1`` and ``False == 0``, which would
    let ``"process_termination_authorized": 0`` satisfy an expected ``False``.
    Numeric values of different types (``100`` vs ``100.0``) still compare
    equal.

    Args:
        actual: Observed mapping. A value that is not a dict is treated as
            having no keys, so every expected key is reported instead of the
            call failing with ``AttributeError``.
        expected: Required key/value pairs.

    Returns:
        ``{key: {"expected": ..., "actual": ...}}`` for each differing key, in
        the iteration order of ``expected``; empty when everything matches.
        A missing key is reported with ``"actual": None``.
    """
    observed = actual if isinstance(actual, dict) else {}
    report: dict[str, dict[str, Any]] = {}
    for key, expected_value in expected.items():
        actual_value = observed.get(key)
        same_kind = isinstance(actual_value, bool) == isinstance(expected_value, bool)
        if not same_kind or actual_value != expected_value:
            report[key] = {"expected": expected_value, "actual": actual_value}
    return report
|