Files
awoooi/apps/api/src/services/alert_approval_guard.py
Your Name 3b64d66836
All checks were successful
Code Review / ai-code-review (push) Successful in 10s
CD Pipeline / tests (push) Successful in 42s
CD Pipeline / build-and-deploy (push) Successful in 3m31s
CD Pipeline / post-deploy-checks (push) Successful in 1m18s
fix(alerts): guard approval actions and wire playbook learning
2026-05-06 03:34:24 +08:00

152 lines
5.1 KiB
Python

"""Alert approval guardrails for AI-generated remediation actions.
This service runs before an Alertmanager-derived action becomes an
ApprovalRecord. It prevents a known failure mode: an LLM invents a kubectl
target that does not belong to the current alert domain, then the approval
pipeline faithfully executes or displays that bad command.
"""
from __future__ import annotations
from dataclasses import dataclass, field
import structlog
from src.services.action_parser import ActionKind, parse_kubectl_action
logger = structlog.get_logger(__name__)
# Namespaces a kubectl action is allowed to name explicitly. The guard blocks
# any action whose parsed namespace is outside this set before the action is
# persisted as an approval.
_ALLOWED_K8S_NAMESPACES = frozenset({"awoooi-prod", "observability", "signoz", "langfuse"})
@dataclass(frozen=True)
class ApprovalActionGuardResult:
"""Guarded action payload returned to approval creation."""
action: str
blocked: bool = False
reason: str | None = None
metadata: dict[str, object] = field(default_factory=dict)
async def guard_alert_approval_action(
    *,
    action: str,
    alert_namespace: str | None,
    alertname: str,
    alert_category: str,
) -> ApprovalActionGuardResult:
    """Vet an AI- or rule-proposed action before an approval is persisted.

    Only actions whose stripped, lower-cased text starts with ``kubectl`` are
    inspected; anything else is returned untouched for its own
    domain-specific gate.  A kubectl action is rejected when the structured
    parser refuses it, when it names a namespace outside the allow-list, or
    when it names an allowed namespace that differs from the alert's own
    (``observability`` is always tolerated).  Unless the command is a
    read-only ``get``/``version``, a named workload resource is additionally
    looked up through the resource resolver.

    Args:
        action: Proposed remediation command or free-text action.
        alert_namespace: Namespace carried by the alert; a missing or blank
            value is treated as ``awoooi-prod``.
        alertname: Alert name, forwarded to the log events.
        alert_category: Alert category, logged when the resolver fails.

    Returns:
        A result carrying the original ``action`` when it passes (with a
        warning in ``metadata`` if the resolver could not be consulted), or
        the blocked placeholder result built by ``_blocked``.
    """
    command = (action or "").strip()
    is_kubectl = command.lower().startswith("kubectl")
    if not is_kubectl:
        return ApprovalActionGuardResult(action=action)

    parsed_cmd = parse_kubectl_action(command)
    if not parsed_cmd.ok:
        return _blocked(command, f"invalid_kubectl:{parsed_cmd.reason}", alertname)

    target_ns = parsed_cmd.namespace
    alert_ns = (alert_namespace or "awoooi-prod").strip() or "awoooi-prod"

    # Namespace rules only apply when the command names a namespace itself.
    if target_ns:
        if target_ns not in _ALLOWED_K8S_NAMESPACES:
            return _blocked(
                command,
                f"namespace_not_allowed:{target_ns}",
                alertname,
                expected_namespace=alert_ns,
            )
        # An allowed namespace is still rejected if it is not the alert's own
        # one; "observability" is exempt from this cross-namespace rule.
        if alert_ns in _ALLOWED_K8S_NAMESPACES and target_ns not in (
            alert_ns,
            "observability",
        ):
            return _blocked(
                command,
                f"namespace_mismatch:{target_ns}!={alert_ns}",
                alertname,
                expected_namespace=alert_ns,
            )

    # Read-only get/version commands are safe enough to display once the
    # namespace is sane; everything else goes on to the existence check so a
    # hallucinated workload name is caught before it reaches approval.
    display_only = parsed_cmd.kind == ActionKind.READONLY and parsed_cmd.verb in {
        "get",
        "version",
    }
    if display_only:
        return ApprovalActionGuardResult(action=action)

    checkable_kinds = {"deployment", "statefulset", "daemonset", "pod", "service"}
    needs_lookup = parsed_cmd.resource_name and parsed_cmd.resource_type in checkable_kinds
    if not needs_lookup:
        return ApprovalActionGuardResult(action=action)

    lookup_ns = target_ns or alert_ns
    try:
        # Imported lazily inside the try: a failing import is handled exactly
        # like a failing lookup (logged, then the action passes with a warning).
        from src.services.resource_resolver import get_resource_resolver

        lookup = await get_resource_resolver().resolve(
            raw_resource=parsed_cmd.resource_name,
            namespace=lookup_ns,
            resource_kind=parsed_cmd.resource_type,
        )
        if not lookup.success:
            return _blocked(
                command,
                f"k8s_resource_not_found:{parsed_cmd.resource_type}/{parsed_cmd.resource_name}",
                alertname,
                expected_namespace=alert_ns,
                candidates=lookup.candidates,
            )
    except Exception as exc:
        # Best effort: an unavailable resolver must not block the approval,
        # but the result is tagged so downstream code can see the gap.
        logger.warning(
            "approval_action_resource_guard_unavailable",
            alertname=alertname,
            alert_category=alert_category,
            action=command[:160],
            error=str(exc),
        )
        return ApprovalActionGuardResult(
            action=action,
            metadata={"action_guard_warning": "resource_guard_unavailable"},
        )

    return ApprovalActionGuardResult(action=action)
def _blocked(
    raw_action: str,
    reason: str,
    alertname: str,
    *,
    expected_namespace: str | None = None,
    candidates: list[str] | None = None,
) -> ApprovalActionGuardResult:
    """Log a rejected action and build the blocked guard result.

    Args:
        raw_action: The rejected command text; it is truncated to 160
            characters in the log, 180 in the placeholder action and 300 in
            the metadata.
        reason: Short reason code describing why the action was rejected.
        alertname: Alert name attached to the log event.
        expected_namespace: Namespace the alert was expected to target, if any.
        candidates: Alternative resource names, if any; ``None`` or an empty
            list is recorded as ``[]``.

    Returns:
        A result with ``blocked=True`` whose ``action`` is a ``NO_ACTION``
        placeholder embedding the reason and the original command.
    """
    suggestions = candidates or []

    logger.warning(
        "approval_action_blocked_before_persist",
        alertname=alertname,
        reason=reason,
        action=raw_action[:160],
        expected_namespace=expected_namespace,
        candidates=suggestions,
    )

    placeholder = f"NO_ACTION - INVALID_TARGET: {reason}; original={raw_action[:180]}"
    guard_metadata: dict[str, object] = {
        "action_guard": "blocked_before_persist",
        "blocked_action": raw_action[:300],
        "blocked_reason": reason,
        "expected_namespace": expected_namespace,
        "candidates": suggestions,
    }
    return ApprovalActionGuardResult(
        action=placeholder,
        blocked=True,
        reason=reason,
        metadata=guard_metadata,
    )