152 lines
5.1 KiB
Python
152 lines
5.1 KiB
Python
"""Alert approval guardrails for AI-generated remediation actions.
|
|
|
|
This service runs before an Alertmanager-derived action becomes an
|
|
ApprovalRecord. It prevents a known failure mode: an LLM invents a kubectl
|
|
target that does not belong to the current alert domain, then the approval
|
|
pipeline faithfully executes or displays that bad command.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass, field
|
|
|
|
import structlog
|
|
|
|
from src.services.action_parser import ActionKind, parse_kubectl_action
|
|
|
|
# Structured logger shared by every guard in this module.
logger = structlog.get_logger(__name__)

# Namespaces a kubectl action is allowed to name explicitly; an explicit
# namespace outside this set is blocked before the approval is persisted.
_ALLOWED_K8S_NAMESPACES = frozenset(
    {
        "awoooi-prod",
        "observability",
        "signoz",
        "langfuse",
    }
)
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class ApprovalActionGuardResult:
|
|
"""Guarded action payload returned to approval creation."""
|
|
|
|
action: str
|
|
blocked: bool = False
|
|
reason: str | None = None
|
|
metadata: dict[str, object] = field(default_factory=dict)
|
|
|
|
|
|
async def guard_alert_approval_action(
    *,
    action: str,
    alert_namespace: str | None,
    alertname: str,
    alert_category: str,
) -> ApprovalActionGuardResult:
    """Screen an AI- or rule-generated action before it is stored as an approval.

    Only actions whose stripped, lower-cased text starts with ``kubectl`` are
    inspected; anything else is returned unchanged because other
    domain-specific gates are responsible for it.

    Checks applied to kubectl actions, in order:

    1. The structured parser must accept the command.
    2. An explicit namespace must be on the module allow-list.
    3. An explicit namespace must equal the alert's namespace (falling back
       to ``awoooi-prod`` when the alert has none) unless it is
       ``observability``. This check only applies when the alert's namespace
       is itself allow-listed.
    4. Read-only ``get`` / ``version`` commands pass at this point.
    5. A named deployment, statefulset, daemonset, pod or service must be
       found by the resource resolver. If the resolver itself fails, the
       action is let through with a warning in ``metadata`` (fail-open).

    Args:
        action: Proposed action text.
        alert_namespace: Namespace reported by the alert, if any.
        alertname: Alert name, used for logging only.
        alert_category: Alert category, used for logging only.

    Returns:
        The original action wrapped in a result when it passes, or a blocked
        result produced by ``_blocked`` when a check fails.
    """
    command = (action or "").strip()
    if not command.lower().startswith("kubectl"):
        return ApprovalActionGuardResult(action=action)

    parsed = parse_kubectl_action(command)
    if not parsed.ok:
        return _blocked(command, f"invalid_kubectl:{parsed.reason}", alertname)

    target_ns = parsed.namespace
    alert_ns = (alert_namespace or "awoooi-prod").strip() or "awoooi-prod"

    # Namespace rules only apply when the command names a namespace itself.
    if target_ns:
        if target_ns not in _ALLOWED_K8S_NAMESPACES:
            return _blocked(
                command,
                f"namespace_not_allowed:{target_ns}",
                alertname,
                expected_namespace=alert_ns,
            )
        if (
            alert_ns in _ALLOWED_K8S_NAMESPACES
            and target_ns != alert_ns
            and target_ns != "observability"
        ):
            return _blocked(
                command,
                f"namespace_mismatch:{target_ns}!={alert_ns}",
                alertname,
                expected_namespace=alert_ns,
            )

    # With a sane namespace, plain read-only commands are safe enough to
    # display. Anything else still needs an existence check so hallucinated
    # targets such as "flywheelexecutionratemissing" never get executed.
    is_plain_read = parsed.kind == ActionKind.READONLY and parsed.verb in {
        "get",
        "version",
    }
    if is_plain_read:
        return ApprovalActionGuardResult(action=action)

    verifiable_kinds = {"deployment", "statefulset", "daemonset", "pod", "service"}
    if not (parsed.resource_name and parsed.resource_type in verifiable_kinds):
        return ApprovalActionGuardResult(action=action)

    try:
        # Imported lazily inside the try so that an import failure is treated
        # like any other resolver outage.
        from src.services.resource_resolver import get_resource_resolver

        lookup = await get_resource_resolver().resolve(
            raw_resource=parsed.resource_name,
            namespace=target_ns or alert_ns,
            resource_kind=parsed.resource_type,
        )
        if not lookup.success:
            return _blocked(
                command,
                f"k8s_resource_not_found:{parsed.resource_type}/{parsed.resource_name}",
                alertname,
                expected_namespace=alert_ns,
                candidates=lookup.candidates,
            )
    except Exception as exc:
        # Best-effort guard: an unavailable resolver must not stop approvals,
        # so log it and pass the action through with a warning marker.
        logger.warning(
            "approval_action_resource_guard_unavailable",
            alertname=alertname,
            alert_category=alert_category,
            action=command[:160],
            error=str(exc),
        )
        return ApprovalActionGuardResult(
            action=action,
            metadata={"action_guard_warning": "resource_guard_unavailable"},
        )

    return ApprovalActionGuardResult(action=action)
|
|
|
|
|
|
def _blocked(
    raw_action: str,
    reason: str,
    alertname: str,
    *,
    expected_namespace: str | None = None,
    candidates: list[str] | None = None,
) -> ApprovalActionGuardResult:
    """Log a rejected action and build the blocked guard result.

    Args:
        raw_action: The rejected command text (truncated wherever it is used).
        reason: Short machine-readable rejection reason.
        alertname: Alert name, included in the log event.
        expected_namespace: Namespace the action was expected to target.
        candidates: Alternative resource names, if any; a missing or empty
            value is recorded as an empty list.

    Returns:
        A result with ``blocked=True`` whose ``action`` is a ``NO_ACTION``
        placeholder embedding the reason and the truncated original command.
    """
    log_fields = {
        "alertname": alertname,
        "reason": reason,
        "action": raw_action[:160],
        "expected_namespace": expected_namespace,
        "candidates": candidates or [],
    }
    logger.warning("approval_action_blocked_before_persist", **log_fields)

    placeholder = f"NO_ACTION - INVALID_TARGET: {reason}; original={raw_action[:180]}"
    details: dict[str, object] = {
        "action_guard": "blocked_before_persist",
        "blocked_action": raw_action[:300],
        "blocked_reason": reason,
        "expected_namespace": expected_namespace,
        "candidates": candidates or [],
    }
    return ApprovalActionGuardResult(
        action=placeholder,
        blocked=True,
        reason=reason,
        metadata=details,
    )
|