awoooi/apps/api/src/services/host_runaway_aiops_loop_readiness.py

"""
Host runaway AIOps loop readiness snapshot.

This loader exposes a committed, read-only product surface for the 110 CPU
runaway process loop. It validates that monitor, alert, event-packet, PlayBook,
KM / Verifier, and gated remediation contracts are complete while every runtime
write or process termination boundary remains closed.
"""

from __future__ import annotations

import json
from pathlib import Path
from typing import Any

from src.services.snapshot_paths import default_evaluations_dir

# Directory searched when the loader is called without ``evaluations_dir``;
# derived from this module's location by ``default_evaluations_dir``
# (imported from ``src.services.snapshot_paths``).
_DEFAULT_EVALUATIONS_DIR = default_evaluations_dir(Path(__file__))
# Glob pattern selecting candidate snapshot files inside that directory.
_SNAPSHOT_PATTERN = "host_runaway_aiops_loop_readiness_*.json"
# Value the snapshot's ``schema_version`` field must equal.
_SCHEMA_VERSION = "host_runaway_aiops_loop_readiness_v1"
# Value ``program_status.runtime_authority`` must equal.
_RUNTIME_AUTHORITY = "host_runaway_aiops_loop_readiness_only_no_host_write"
# Values ``program_status.current_task_id`` / ``next_task_id`` must equal.
_EXPECTED_CURRENT_TASK = "P3-009"
_EXPECTED_NEXT_TASK = "P3-010"
# Required alert lanes: maps each ``alertname`` to the ``lane_id`` it must carry.
_EXPECTED_ALERT_LANES = {
    "HostOrphanBrowserSmokeHighCpu": "orphan_browser_smoke_runaway_process",
    "HostCiRunnerLoadSaturation": "ci_runner_load_saturation",
}
# ``rollups`` counters that must all be 0 in a valid snapshot.
_ZERO_ROLLUP_FIELDS = {
    "runtime_remediation_authorized_count",
    "telegram_send_count",
    "gateway_queue_write_count",
    "bot_api_call_count",
    "host_write_count",
    "process_termination_count",
    "docker_restart_count",
    "systemd_restart_count",
    "nginx_reload_count",
    "firewall_change_count",
    "kubectl_action_count",
    "production_write_count",
}
# ``activation_boundaries`` flags that must be exactly ``False``.
_FALSE_BOUNDARY_FLAGS = {
    "runtime_remediation_enabled",
    "process_termination_authorized",
    "telegram_send_enabled",
    "gateway_queue_write_enabled",
    "bot_api_call_enabled",
    "host_write_enabled",
    "docker_restart_enabled",
    "systemd_restart_enabled",
    "nginx_reload_enabled",
    "firewall_change_enabled",
    "kubectl_action_enabled",
    "production_write_enabled",
    "secret_read_enabled",
}
# ``activation_boundaries`` flags that must be exactly ``True``.
_TRUE_BOUNDARY_FLAGS = {
    "read_only_readback_allowed",
    "ai_triage_packet_allowed",
    "dry_run_generation_allowed",
}
# Substrings (matched case-insensitively) that must not occur in any string
# value of the snapshot.
_FORBIDDEN_PUBLIC_TERMS = {
    "工作視窗",
    "批准！",
    "codex_delegation",
    "source_thread_id",
    "My request for Codex",
    "authorization_header",
    "secret_value",
    "raw secret",
    "private key",
    "token value",
    "chain_of_thought",
}


def load_latest_host_runaway_aiops_loop_readiness(
    evaluations_dir: Path | None = None,
) -> dict[str, Any]:
    """Return the validated payload of the newest readiness snapshot.

    "Newest" means the matching file whose path sorts last; the file is parsed
    as JSON and then passed through every ``_require_*`` validator in turn.

    Args:
        evaluations_dir: Directory to search. Falls back to the module default
            when omitted.

    Returns:
        The parsed snapshot object, unchanged.

    Raises:
        FileNotFoundError: If no file in the directory matches the pattern.
        ValueError: If the file is not a JSON object or fails any validator.
    """
    search_root = evaluations_dir or _DEFAULT_EVALUATIONS_DIR
    # The greatest path is the same file a full sort would put last.
    newest = max(search_root.glob(_SNAPSHOT_PATTERN), default=None)
    if newest is None:
        raise FileNotFoundError(
            f"no host runaway AIOps loop readiness snapshots found in {search_root}"
        )

    payload = json.loads(newest.read_text(encoding="utf-8"))
    if not isinstance(payload, dict):
        raise ValueError(f"{newest}: expected JSON object")

    label = str(newest)
    # Order matters: the first failing validator determines the error raised.
    validators = (
        _require_schema,
        _require_rollups,
        _require_loop_stages,
        _require_alert_lanes,
        _require_asset_writeback_contract,
        _require_live_readback,
        _require_remediation_gate,
        _require_activation_boundaries,
        _require_no_forbidden_public_terms,
    )
    for validate in validators:
        validate(payload, label)
    return payload


def _require_schema(payload: dict[str, Any], label: str) -> None:
    """Check schema version, ``program_status`` and the ``source_refs`` count.

    Args:
        payload: Parsed snapshot object.
        label: Prefix for error messages (the caller passes the snapshot path).

    Raises:
        ValueError: If ``schema_version`` differs from the expected version,
            any pinned ``program_status`` field differs, ``status_note`` is
            missing or empty, or ``source_refs`` does not hold exactly 7 items.
    """
    if payload.get("schema_version") != _SCHEMA_VERSION:
        raise ValueError(f"{label}: expected schema_version={_SCHEMA_VERSION}")

    program_status = payload.get("program_status") or {}
    status_diff = _mismatches(
        program_status,
        {
            "overall_completion_percent": 100,
            "current_priority": "P3",
            "current_task_id": _EXPECTED_CURRENT_TASK,
            "next_task_id": _EXPECTED_NEXT_TASK,
            "read_only_mode": True,
            "runtime_authority": _RUNTIME_AUTHORITY,
        },
    )
    if status_diff:
        raise ValueError(f"{label}: program_status mismatch: {status_diff}")

    note = program_status.get("status_note")
    if not note:
        raise ValueError(f"{label}: program_status.status_note is required")

    ref_count = len(payload.get("source_refs") or [])
    if ref_count != 7:
        raise ValueError(f"{label}: source_refs must contain 7 refs")


def _require_rollups(payload: dict[str, Any], label: str) -> None:
    """Check the ``rollups`` section: pinned counts and zero-only counters.

    Args:
        payload: Parsed snapshot object.
        label: Prefix for error messages (the caller passes the snapshot path).

    Raises:
        ValueError: If a pinned count differs from its expected value, or any
            field listed in ``_ZERO_ROLLUP_FIELDS`` is missing or not 0.
    """
    rollups = payload.get("rollups") or {}

    count_diff = _mismatches(
        rollups,
        {
            "loop_stage_count": 6,
            "alert_lane_count": 2,
            "asset_writeback_contract_count": 5,
            "source_ref_count": 7,
            "live_readback_metric_count": 8,
            "blocked_runtime_action_count": 12,
        },
    )
    if count_diff:
        raise ValueError(f"{label}: rollup counts mismatch: {count_diff}")

    # Collect every offending counter so the error lists them all at once.
    offenders: dict[str, Any] = {}
    for field in _ZERO_ROLLUP_FIELDS:
        observed = rollups.get(field)
        if observed != 0:
            offenders[field] = observed
    if offenders:
        raise ValueError(f"{label}: zero rollup fields must remain 0: {offenders}")


def _require_loop_stages(payload: dict[str, Any], label: str) -> None:
    stages = payload.get("loop_stages") or []
    if len(stages) != 6:
        raise ValueError(f"{label}: loop_stages must contain 6 stages")

    required_stage_ids = {
        "read_only_host_textfile_exporter",
        "prometheus_alert_rules",
        "telegram_ai_event_packet",
        "playbook_contract",
        "km_verifier_writeback_contract",
        "gated_remediation_helper",
    }
    stage_ids = {stage.get("stage_id") for stage in stages}
    missing = sorted(required_stage_ids - stage_ids)
    if missing:
        raise ValueError(f"{label}: missing loop stages: {missing}")

    for stage in stages:
        stage_id = stage.get("stage_id") or "<missing>"
        if stage.get("completion_percent") != 100:
            raise ValueError(f"{label}: stage {stage_id} must be 100 percent")
        for field in ("display_name", "owner_agent", "status", "next_action"):
            if not stage.get(field):
                raise ValueError(f"{label}: stage {stage_id} missing {field}")
        if not stage.get("evidence_refs"):
            raise ValueError(f"{label}: stage {stage_id} missing evidence_refs")
        if not stage.get("blocked_runtime_actions"):
            raise ValueError(f"{label}: stage {stage_id} missing blocked_runtime_actions")


def _require_alert_lanes(payload: dict[str, Any], label: str) -> None:
    lanes = payload.get("alert_lanes") or []
    if len(lanes) != 2:
        raise ValueError(f"{label}: alert_lanes must contain 2 lanes")

    lane_map = {lane.get("alertname"): lane for lane in lanes}
    missing = sorted(set(_EXPECTED_ALERT_LANES) - set(lane_map))
    if missing:
        raise ValueError(f"{label}: missing alert lanes: {missing}")

    for alertname, expected_lane in _EXPECTED_ALERT_LANES.items():
        lane = lane_map[alertname]
        if lane.get("lane_id") != expected_lane:
            raise ValueError(f"{label}: {alertname} lane_id mismatch")
        if lane.get("runtime_write_gate") != 0:
            raise ValueError(f"{label}: {alertname} runtime_write_gate must be 0")
        if lane.get("apply_allowed_without_owner_gate") is not False:
            raise ValueError(f"{label}: {alertname} apply must require owner gate")
        if not lane.get("next_action"):
            raise ValueError(f"{label}: {alertname} next_action is required")


def _require_asset_writeback_contract(payload: dict[str, Any], label: str) -> None:
    contracts = payload.get("asset_writeback_contract") or []
    if len(contracts) != 5:
        raise ValueError(f"{label}: asset_writeback_contract must contain 5 contracts")

    for contract in contracts:
        asset_id = contract.get("asset_id") or "<missing>"
        if contract.get("required_on_real_incident") is not True:
            raise ValueError(f"{label}: {asset_id} must be required on real incidents")
        if contract.get("live_write_enabled") is not False:
            raise ValueError(f"{label}: {asset_id} live_write_enabled must remain false")
        if not contract.get("required_fields"):
            raise ValueError(f"{label}: {asset_id} required_fields must not be empty")


def _require_live_readback(payload: dict[str, Any], label: str) -> None:
    """Check the ``live_readback`` section against its pinned values.

    Args:
        payload: Parsed snapshot object.
        label: Prefix for error messages (the caller passes the snapshot path).

    Raises:
        ValueError: If any pinned field differs from its expected value, or
            ``runtime_revision`` does not start with ``f358a0f6``.
    """
    snapshot = payload.get("live_readback") or {}
    drift = _mismatches(
        snapshot,
        {
            "host_label": "110",
            "monitor_up": 1,
            "orphan_browser_group_count": 0,
            "active_ci_container_count": 2,
            "remediation_authorized_count": 0,
            "alerts_firing_count": 0,
            "deploy_marker": "2d278568",
            "argocd_sync": "Synced",
            "argocd_health": "Healthy",
            "production_route_count": 3,
            "forbidden_public_hit_count": 0,
        },
    )
    if drift:
        raise ValueError(f"{label}: live_readback mismatch: {drift}")

    # Stringify first so a missing or non-string revision simply fails the
    # prefix test below.
    revision = str(snapshot.get("runtime_revision", ""))
    if revision.startswith("f358a0f6"):
        return
    raise ValueError(f"{label}: live_readback.runtime_revision must start with f358a0f6")


def _require_remediation_gate(payload: dict[str, Any], label: str) -> None:
    """Check the ``remediation_gate`` section against its pinned values.

    Args:
        payload: Parsed snapshot object.
        label: Prefix for error messages (the caller passes the snapshot path).

    Raises:
        ValueError: If any pinned gate field differs from its expected value,
            or ``disallowed_actions`` does not hold exactly 8 entries.
    """
    gate = payload.get("remediation_gate") or {}
    gate_diff = _mismatches(
        gate,
        {
            "dry_run_required": True,
            "owner_approval_required": True,
            "maintenance_window_required": True,
            "evidence_ref_required": True,
            "post_check_required": True,
            "allowed_signal_after_gate": "SIGTERM",
            "process_termination_authorized": False,
        },
    )
    if gate_diff:
        raise ValueError(f"{label}: remediation_gate mismatch: {gate_diff}")

    disallowed = gate.get("disallowed_actions") or []
    if len(disallowed) != 8:
        raise ValueError(f"{label}: remediation_gate.disallowed_actions must contain 8 actions")


def _require_activation_boundaries(payload: dict[str, Any], label: str) -> None:
    boundaries = payload.get("activation_boundaries") or {}
    true_mismatches = {
        flag: boundaries.get(flag)
        for flag in _TRUE_BOUNDARY_FLAGS
        if boundaries.get(flag) is not True
    }
    if true_mismatches:
        raise ValueError(f"{label}: true activation boundaries mismatch: {true_mismatches}")

    false_mismatches = {
        flag: boundaries.get(flag)
        for flag in _FALSE_BOUNDARY_FLAGS
        if boundaries.get(flag) is not False
    }
    if false_mismatches:
        raise ValueError(f"{label}: false activation boundaries mismatch: {false_mismatches}")

    for step in payload.get("next_steps") or []:
        if step.get("runtime_write_allowed") is not False:
            raise ValueError(f"{label}: next_steps runtime_write_allowed must remain false")


def _require_no_forbidden_public_terms(payload: Any, label: str) -> None:
    """Reject snapshots that contain any forbidden public term.

    The payload is walked recursively through dicts and lists. Every string
    value AND every dictionary key is matched case-insensitively (substring
    match) against ``_FORBIDDEN_PUBLIC_TERMS``; several of those terms look
    like field names, so scanning values alone would let them through.

    Args:
        payload: Parsed snapshot (any JSON-like structure).
        label: Prefix for error messages (the caller passes the snapshot path).

    Raises:
        ValueError: If at least one hit is found; the message lists up to the
            first five hits as ``"<path>: <term>"`` (key hits are marked with
            ``(key)`` after the path).
    """
    # Lower-case each term once, and sort so the reported hits do not depend
    # on set iteration order.
    needles = sorted((term.lower(), term) for term in _FORBIDDEN_PUBLIC_TERMS)
    hits: list[str] = []

    def scan(text: str, where: str) -> None:
        lowered = text.lower()
        hits.extend(
            f"{where}: {term}" for needle, term in needles if needle in lowered
        )

    def walk(value: Any, path: str) -> None:
        if isinstance(value, dict):
            for key, nested in value.items():
                child = f"{path}.{key}" if path else str(key)
                scan(str(key), f"{child} (key)")
                walk(nested, child)
        elif isinstance(value, list):
            for index, nested in enumerate(value):
                walk(nested, f"{path}[{index}]")
        elif isinstance(value, str):
            scan(value, path)

    walk(payload, "")
    if hits:
        raise ValueError(f"{label}: forbidden public terms: {hits[:5]}")


def _mismatches(actual: dict[str, Any], expected: dict[str, Any]) -> dict[str, dict[str, Any]]:
    return {
        key: {"expected": expected_value, "actual": actual.get(key)}
        for key, expected_value in expected.items()
        if actual.get(key) != expected_value
    }