"""
AWOOOI AIOps Phase 1 — 執行後驗證器
=====================================
每次 AI 修復動作執行後,主動用 MCP 抓取環境後狀態,
與 EvidenceSnapshot.pre_execution_state 對比,
判斷修復是否真的有效。
驗證結果四態:
- "success" — 問題已解決(Pod Running / 指標恢復正常)
- "degraded" — 部分改善但未完全恢復
- "failed" — 執行後狀態比執行前更差,或完全未改善
- "timeout" — 驗證超時(MCP 無法回應)
驗證結果用途:
1. 填入 EvidenceSnapshot.verification_result(Phase 3 學習閉環基礎)
2. 傳給 learning_service 更新 Playbook EWMA trust_score
3. 觸發 Reviewer Agent 的 rollback 決策(Phase 2)
設計原則:
- 執行後等待 warm-up period(預設 10s),讓 K8s controller 有時間收斂
- 超時不 raise,標記 "timeout" 並繼續流程
- 不阻塞原始執行路徑(await,但結果不影響執行本身是否成功)
W2 PR-V1: SelfHealingValidator 串接 (2026-04-28 ogt + Claude Sonnet 4.6)
- ENABLE_SELF_HEALING_VALIDATOR=True 時,verify() 完成後呼叫 assess_self_healing()
- self_healing_score < 0.5 → Telegram 警示 rollback 提案(不自動執行)
- 驗證失敗不阻塞主流程(try/except 全包)
ADR-081: PreDecisionInvestigator + EvidenceSnapshot
MASTER §3.1 L6×D1
2026-04-15 ogt + Claude Sonnet 4.6 (亞太): Phase 1 初始建立
"""
from __future__ import annotations
import asyncio
import re
from typing import TYPE_CHECKING, Any
import structlog
from src.db.base import get_db_context
from src.plugins.mcp.gateway import GatewayContext, McpGateway
from src.plugins.mcp.registry import AuditedMCPToolProvider
from src.services.evidence_snapshot import EvidenceSnapshot
from src.services.mcp_audit_context import with_mcp_audit_context
from src.services.mcp_tool_registry import RegisteredTool, SensorDimension, get_mcp_tool_registry
from src.services.sanitization_service import sanitize, sanitize_dict_values
# W2 PR-V1: 頂層 import 讓測試 patch 路徑固定(延遲 import 無法被 patch)
# ENABLE_SELF_HEALING_VALIDATOR=False 時此 import 不影響效能(純 python 模組)
from src.services import self_healing_validator as _shv_module
if TYPE_CHECKING:
from src.models.incident import Incident
logger = structlog.get_logger(__name__)
# Post-execution convergence wait (seconds) — the K8s controller needs time to
# process restarts / rolling updates before the state is sampled.
POST_EXEC_WARMUP_SEC = 10.0
# Overall verification timeout (seconds) for collecting the post-execution state.
VERIFY_TIMEOUT_SEC = 30.0
# Timeout (seconds) for a single MCP tool call.
TOOL_TIMEOUT_SEC = 8.0
class PostExecutionVerifier:
    """
    Post-execution environment state verifier.

    Intended to be called from execute_approved_action() in
    approval_execution.py: after an action has run, verify() samples the
    environment, grades the outcome and back-fills the EvidenceSnapshot.

    Usage:
        verifier = get_post_execution_verifier()
        result = await verifier.verify(
            incident=incident,
            snapshot=pre_decision_snapshot,
            action_taken="restart_service:awoooi-api",
        )
        # result: "success" | "degraded" | "failed" | "timeout"
    """

    def __init__(self) -> None:
        # Registry that suggests which MCP sensor tools apply to an incident.
        self._registry = get_mcp_tool_registry()

    async def verify(
        self,
        incident: "Incident",
        snapshot: EvidenceSnapshot | None,
        action_taken: str,
        warmup_sec: float = POST_EXEC_WARMUP_SEC,
    ) -> str:
        """
        Verify the effect of an executed action.

        Args:
            incident: The originating Incident (its labels locate the resource).
            snapshot: Pre-execution EvidenceSnapshot whose pre_execution_state
                serves as the baseline; may be None, in which case a fallback
                snapshot is persisted instead.
            action_taken: Description of the executed action
                (e.g. "restart_service:awoooi-api").
            warmup_sec: Seconds to wait for the environment to converge
                before sampling; values <= 0 skip the wait.

        Returns:
            str: "success" | "degraded" | "failed" | "timeout"
        """
        incident_id = _get_incident_id(incident)
        logger.info(
            "verifier_start",
            incident_id=incident_id,
            action=action_taken,
            warmup_sec=warmup_sec,
        )

        # 1. Wait for convergence.
        if warmup_sec > 0:
            await asyncio.sleep(warmup_sec)

        # 2. Collect the post-execution state. Failures never propagate:
        #    they are recorded and mapped to "timeout" / "failed".
        try:
            post_state = await asyncio.wait_for(
                self._collect_post_state(incident),
                timeout=VERIFY_TIMEOUT_SEC,
            )
        except asyncio.TimeoutError:
            logger.warning("verifier_timeout", incident_id=incident_id)
            await self._record_outcome(incident, snapshot, {}, "timeout", action_taken)
            return "timeout"
        except Exception:
            logger.exception("verifier_collect_error", incident_id=incident_id)
            await self._record_outcome(incident, snapshot, {}, "failed", action_taken)
            return "failed"

        # 3. Compare pre/post state.
        pre_state = snapshot.pre_execution_state if snapshot else None
        result = _assess_recovery(pre_state, post_state, action_taken)

        # 4. Persist the outcome on the EvidenceSnapshot (or a fallback one).
        await self._record_outcome(incident, snapshot, post_state, result, action_taken)
        logger.info(
            "verifier_done",
            incident_id=incident_id,
            result=result,
            action=action_taken,
        )

        # 5. W2 PR-V1: SelfHealingValidator hook (ENABLE_SELF_HEALING_VALIDATOR gate).
        #    Runs after post_state has been recorded. The outer try/except makes
        #    sure a validator failure can never change verify()'s return value.
        try:
            await _run_self_healing_validator(
                incident_id=incident_id,
                snapshot=snapshot,
                pre_state=pre_state,
                post_state=post_state,
                verification_result=result,
                action_taken=action_taken,
            )
        except Exception:
            logger.warning(
                "self_healing_validator_uncaught",
                incident_id=incident_id,
                exc_info=True,
            )
        return result

    @staticmethod
    async def _record_outcome(
        incident: "Incident",
        snapshot: EvidenceSnapshot | None,
        post_state: dict[str, Any],
        result: str,
        action_taken: str,
    ) -> None:
        """Write the verification outcome to `snapshot`, or to a fallback snapshot when none exists."""
        if snapshot:
            await _update_snapshot(snapshot, post_state, result)
        else:
            await _persist_fallback_snapshot(
                incident=incident,
                post_state=post_state,
                result=result,
                action_taken=action_taken,
            )

    async def capture_pre_execution_state(
        self,
        incident: "Incident",
        snapshot: EvidenceSnapshot,
    ) -> None:
        """
        Snapshot the current state into snapshot.pre_execution_state.

        Meant to be called in approval_execution.py *before* the action runs.
        Never raises: on collection failure an empty baseline is stored.
        """
        incident_id = _get_incident_id(incident)
        try:
            # NOTE(review): the whole collection shares the same budget as a
            # single tool call (TOOL_TIMEOUT_SEC), so one slow tool may cancel
            # the entire baseline — confirm this is intended.
            state = await asyncio.wait_for(
                self._collect_post_state(incident),  # same collection logic as verify()
                timeout=TOOL_TIMEOUT_SEC,
            )
            snapshot.pre_execution_state = state
            try:
                await snapshot.update_pre_execution(state)
            except Exception as exc:
                logger.warning(
                    "verifier_pre_state_persist_failed",
                    incident_id=incident_id,
                    snapshot_id=snapshot.snapshot_id,
                    error=str(exc),
                )
            logger.debug("verifier_pre_state_captured", incident_id=incident_id)
        except Exception:
            # Keep the traceback: without it the reason for a missing baseline is lost.
            logger.warning(
                "verifier_pre_state_failed",
                incident_id=incident_id,
                exc_info=True,
            )
            snapshot.pre_execution_state = {}
            try:
                await snapshot.update_pre_execution({})
            except Exception as exc:
                logger.warning(
                    "verifier_empty_pre_state_persist_failed",
                    incident_id=incident_id,
                    snapshot_id=snapshot.snapshot_id,
                    error=str(exc),
                )

    async def _collect_post_state(self, incident: "Incident") -> dict[str, Any]:
        """
        Collect the environment state (K8s state + key metrics).

        Only tools tagged with D1 (K8s state) or D3 (metrics) are used as the
        verification baseline; other sensor dimensions are skipped.

        Returns:
            Mapping of tool name to sanitized output. Tools that fail, time
            out, or return no output are omitted.
        """
        state: dict[str, Any] = {}
        incident_id = _get_incident_id(incident)
        alertname = _get_alertname(incident)
        labels = _get_labels(incident)

        # Keep only D1 + D3 tools.
        all_tools = self._registry.suggest_tools(alertname=alertname, incident_labels=labels)
        verify_tools = [
            t for t in all_tools
            if any(d in (SensorDimension.D1_K8S_STATE, SensorDimension.D3_METRICS)
                   for d in t.dimensions)
        ]

        container_name = _extract_container_name(labels)
        params = {
            "namespace": labels.get("namespace", "awoooi-prod"),
            "pod_name": labels.get("pod", labels.get("name", "")),
            "deployment": labels.get("deployment", ""),
            # Coerce to str so a None / non-string `instance` label cannot
            # abort the whole collection with an AttributeError.
            "host": str(labels.get("instance") or "").split(":")[0] or labels.get("host", ""),
            "container_name": container_name,
            "filter_name": container_name,
            "query": _build_prometheus_query(alertname, labels),
        }

        async def _call_one(reg) -> tuple[str, Any]:
            """Run one tool; return (tool name, output) or (tool name, None) on any failure."""
            try:
                audited_params = with_mcp_audit_context(
                    params,
                    session_id=f"incident:{incident_id}:post_execution",
                    incident_id=incident_id,
                    flywheel_node="verify",
                    agent_role="post_execution_verifier",
                )
                result = await asyncio.wait_for(
                    self._execute_tool(
                        reg=reg,
                        tool_name=reg.tool.name,
                        audited_params=audited_params,
                        incident_id=incident_id,
                    ),
                    timeout=TOOL_TIMEOUT_SEC,
                )
                if result.success and result.output:
                    return reg.tool.name, result.output
            except Exception:
                # Best-effort: one broken sensor must not fail the others,
                # but the failure is logged so an empty state is diagnosable.
                logger.warning(
                    "verifier_tool_call_failed",
                    incident_id=incident_id,
                    tool=reg.tool.name,
                    exc_info=True,
                )
            return reg.tool.name, None

        results = await asyncio.gather(*[_call_one(t) for t in verify_tools])
        for tool_name, output in results:
            if output is not None:
                if isinstance(output, dict):
                    state[tool_name] = sanitize_dict_values(output, f"post_state.{tool_name}")
                else:
                    state[tool_name] = {"raw": sanitize(str(output), f"post_state.{tool_name}")}
        return state

    async def _execute_tool(
        self,
        reg: RegisteredTool,
        tool_name: str,
        audited_params: dict[str, Any],
        incident_id: str,
    ):
        """Route production post-execution sensors through AwoooP MCP Gateway.

        Raw providers are still used by unit tests and manual injections. In
        production the registry wraps providers in `AuditedMCPToolProvider`, and
        those calls must leave first-class gateway audit rows just like the
        pre-decision sense path.
        """
        if not isinstance(reg.provider, AuditedMCPToolProvider):
            return await reg.provider.execute(tool_name, audited_params)
        async with get_db_context("awoooi") as db:
            ctx = GatewayContext(
                project_id="awoooi",
                agent_id="post_execution_verifier",
                tool_name=tool_name,
                trace_id=incident_id,
                is_shadow=True,
                environment={"env": "prod"},
                required_scope="read",
            )
            return await McpGateway(db).call(ctx, audited_params)
# ─────────────────────────────────────────────────────────────────────────────
# W2 PR-V1: SelfHealingValidator 串接
# 2026-04-28 ogt + Claude Sonnet 4.6: C6 飛輪斷鏈修復
# ─────────────────────────────────────────────────────────────────────────────
async def _run_self_healing_validator(
    incident_id: str,
    snapshot: EvidenceSnapshot | None,
    pre_state: dict[str, Any] | None,
    post_state: dict[str, Any],
    verification_result: str,
    action_taken: str,
) -> None:
    """
    Entry point that chains the SelfHealingValidator after verification.

    Does nothing unless the ENABLE_SELF_HEALING_VALIDATOR setting is truthy.
    Every failure is caught and logged, so the caller's flow is unaffected.

    When enabled:
    - the assessment is logged and, if a snapshot exists, written to it via
      snapshot.update_self_healing(score=..., detail=...)
    - a score below 0.5 triggers a Telegram rollback-proposal alert
    """
    try:
        from src.core.config import get_settings

        if not get_settings().ENABLE_SELF_HEALING_VALIDATOR:
            return

        verdict = _shv_module.assess_self_healing(
            pre_state=pre_state,
            post_state=post_state,
            verification_result=verification_result,
            action_taken=action_taken,
        )
        healing_score: float = verdict["score"]

        log_fields = {
            "incident_id": incident_id,
            "score": healing_score,
            "regressions": verdict.get("regressions", []),
            "root_cause_cleared": verdict.get("root_cause_cleared"),
            "detail": verdict.get("detail"),
        }
        logger.info("self_healing_assessed", **log_fields)

        # Back-fill the EvidenceSnapshot; a persistence error is only logged.
        if snapshot:
            try:
                await snapshot.update_self_healing(score=healing_score, detail=verdict)
            except Exception as persist_err:
                logger.warning(
                    "self_healing_snapshot_update_failed",
                    incident_id=incident_id,
                    error=str(persist_err),
                )

        # Low score -> propose (never execute) a rollback via Telegram.
        if healing_score < 0.5:
            await _send_rollback_proposal_alert(
                incident_id=incident_id,
                score=healing_score,
                assessment=verdict,
                action_taken=action_taken,
            )
    except Exception:
        logger.warning(
            "self_healing_validator_error",
            incident_id=incident_id,
            exc_info=True,
        )
async def _send_rollback_proposal_alert(
    incident_id: str,
    score: float,
    assessment: dict[str, Any],
    action_taken: str,
) -> None:
    """
    Send a Telegram rollback-proposal alert for a low self-healing score.

    The alert only asks a human to evaluate a rollback; nothing is rolled
    back automatically. Any failure is logged and swallowed.

    Args:
        incident_id: Incident identifier shown in the message.
        score: Self-healing score (formatted with two decimals).
        assessment: Validator result; "regressions" and "root_cause_cleared"
            are read from it.
        action_taken: Executed action description (truncated to 120 chars).
    """
    try:
        import html

        from src.core.config import get_settings
        from src.services.telegram_gateway import get_telegram_gateway
        _settings = get_settings()
        gateway = get_telegram_gateway()

        def _esc(value: Any) -> str:
            # The message is sent with parse_mode="HTML": a raw "<", ">" or "&"
            # in a dynamic value makes Telegram reject the whole message, so
            # every interpolated value is escaped.
            return html.escape(str(value), quote=False)

        regressions = assessment.get("regressions", [])
        # str() each item so non-string regression entries cannot break join().
        reg_str = ", ".join(_esc(item) for item in regressions[:5]) if regressions else "無"
        root_cleared = "是" if assessment.get("root_cause_cleared") else "否"
        text = (
            f"⚠️ 自愈品質警示 — 建議人工評估 Rollback\n"
            f"Incident: {_esc(incident_id)}\n"
            # Truncate first, then escape, so an entity is never cut in half.
            f"動作: {_esc(action_taken[:120])}\n"
            f"自愈分數: {score:.2f} (門檻 0.5)\n"
            f"Root Cause 解除: {root_cleared}\n"
            f"Regression 信號: {reg_str}\n"
            f"此為提案,不會自動執行 Rollback"
        )
        target_chat_id = _settings.SRE_GROUP_CHAT_ID
        await gateway._send_request(
            "sendMessage",
            {
                "chat_id": target_chat_id,
                "text": text,
                "parse_mode": "HTML",
            },
        )
        logger.info(
            "rollback_proposal_sent",
            incident_id=incident_id,
            score=score,
        )
    except Exception:
        logger.warning(
            "rollback_proposal_send_failed",
            incident_id=incident_id,
            exc_info=True,
        )
# ─────────────────────────────────────────────────────────────────────────────
# Recovery Assessment
# ─────────────────────────────────────────────────────────────────────────────
def _assess_recovery(
pre_state: dict[str, Any] | None,
post_state: dict[str, Any],
action_taken: str,
) -> str:
"""
評估修復效果。
Phase 1 使用啟發式規則(基於 K8s Pod 狀態字串判斷)。
Phase 4 將改用動態基線(Holt-Winters 偏差量),不再用靜態閾值。
Heuristics(Phase 1 版本):
- post_state 含 Running → success
- post_state 含 CrashLoopBackOff / Error / OOMKilled → failed
- post_state 為空(MCP 無回應)→ degraded
- pre_state 與 post_state 完全相同 → degraded(未改變)
"""
if not post_state:
return "degraded"
# 轉為字串做啟發式掃描
post_str = str(post_state).lower()
pre_str = str(pre_state).lower() if pre_state else ""
# 失敗信號(Gate 1 fix: 移除裸 "error" — 會誤觸 error_rate/error_count 等指標 key)
# "error" 作為 K8s ContainerState reason 由 "failed" Pod phase 間接覆蓋
failure_signals = ["crashloopbackoff", "oomkilled", "oomkill", "failed"]
if any(sig in post_str for sig in failure_signals):
return "failed"
# 成功信號
success_signals = [
"running",
"ready",
"1/1",
"2/2",
"3/3",
"healthy",
"successfully rolled out",
"'success': true",
'"success": true',
]
has_success_signal = any(sig in post_str for sig in success_signals)
if not has_success_signal and _docker_state_indicates_running(post_str):
has_success_signal = True
if has_success_signal:
if _is_observe_only_action(action_taken):
return "degraded"
# 但如果 pre_state 已經是 running,可能是無效操作
if pre_str and any(sig in pre_str for sig in success_signals):
# 如果執行的是 restart,即使 pre/post 都 Running 也算 success
if "restart" in action_taken.lower() or "delete" in action_taken.lower():
return "success"
return "degraded"
return "success"
# 前後無變化
if pre_str and post_str == pre_str:
return "degraded"
return "degraded"
# ─────────────────────────────────────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────────────────────────────────────
def _get_incident_id(incident: "Incident") -> str:
return incident.incident_id if hasattr(incident, "incident_id") else str(incident.id)
def _get_alertname(incident: "Incident") -> str:
if incident.signals:
signal = incident.signals[0]
labels = signal.labels or {}
return labels.get("alertname", "") or getattr(signal, "alert_name", "")
return ""
def _get_labels(incident: "Incident") -> dict[str, Any]:
if incident.signals:
return incident.signals[0].labels or {}
return {}
def _extract_container_name(labels: dict[str, Any]) -> str:
"""Resolve Docker container target labels for post-execution SSH sensors."""
for key in ("filter_name", "container_name", "container", "resource", "name"):
value = str(labels.get(key) or "").strip()
if value and "{" not in value and "}" not in value:
return value
return ""
def _is_observe_only_action(action_taken: str) -> bool:
"""Return true when the executed step collected evidence but did not mutate state."""
lowered = (action_taken or "").lower()
observe_tokens = (
"mcp:ssh_diagnose",
"ssh_diagnose",
"docker stats",
"ps aux",
"free -h",
"df -h",
)
mutation_tokens = (
"restart",
"delete",
"rollout",
"scale",
"patch",
"apply",
"prune",
"truncate",
"clear",
)
return any(token in lowered for token in observe_tokens) and not any(
token in lowered for token in mutation_tokens
)
def _docker_state_indicates_running(post_str: str) -> bool:
"""Recognize Docker-specific healthy/running output without matching uptime."""
if not any(token in post_str for token in ("docker ps", "docker inspect", "docker stats")):
return False
return bool(
re.search(r'\bup\s+\d', post_str)
or re.search(r'["\']status["\']\s*:\s*["\']running["\']', post_str)
or re.search(r'["\']running["\']\s*:\s*true', post_str)
)
def _build_prometheus_query(alertname: str, labels: dict[str, Any]) -> str:
"""Build a non-empty PromQL probe for post-execution metric sensors."""
alert = (alertname or "").lower()
namespace = _safe_label_value(str(labels.get("namespace") or "awoooi-prod"))
pod_name = _safe_label_value(str(labels.get("pod") or labels.get("name") or ""))
host = _safe_label_value(str(labels.get("host") or _short_instance(labels.get("instance")) or ""))
container = _safe_label_value(str(
labels.get("container_name")
or labels.get("container")
or labels.get("name")
or ""
))
if alert.startswith("dockercontainer"):
selector = _selector({
"host": host,
"container_name": container,
})
if "memory" in alert:
return (
f"docker_container_memory_usage_bytes{selector} / "
f"docker_container_memory_limit_bytes{selector}"
)
if "restart" in alert:
return (
f"increase(docker_container_inspect_restart_count{selector}[15m]) "
f"or increase(docker_container_restart_count{selector}[15m])"
)
if "cpu" in alert:
return f"docker_container_cpu_cores{selector}"
return f"docker_container_info{selector}"
if any(key in alert for key in ("host", "node")):
instance_selector = _selector({"instance": str(labels.get("instance") or "")})
if "memory" in alert or "oom" in alert:
return f"node_memory_MemAvailable_bytes{instance_selector}"
if "disk" in alert or "storage" in alert:
return f"node_filesystem_avail_bytes{instance_selector}"
if "load" in alert or "cpu" in alert:
return f"node_load5{instance_selector}"
return f"up{instance_selector}"
pod_filter = f',pod=~"{pod_name}.*"' if pod_name else ""
if any(key in alert for key in ("memory", "mem", "oom")):
return (
f'avg(container_memory_working_set_bytes{{namespace="{namespace}"{pod_filter}}}) '
"/ 1048576"
)
if any(key in alert for key in ("cpu", "load", "throttl")):
return f'avg(rate(container_cpu_usage_seconds_total{{namespace="{namespace}"{pod_filter}}}[5m]))'
if any(key in alert for key in ("crash", "restart", "backoff")):
return f'sum(increase(kube_pod_container_status_restarts_total{{namespace="{namespace}"}}[15m]))'
if any(key in alert for key in ("http", "error", "5xx", "probe", "down", "unhealthy")):
return "1 - avg(probe_success)"
return f'up{{namespace="{namespace}"}}'
def _selector(labels: dict[str, str]) -> str:
parts = [f'{key}="{value}"' for key, value in labels.items() if value]
return "{" + ",".join(parts) + "}" if parts else ""
def _short_instance(instance: Any) -> str:
raw = str(instance or "")
if raw.count(":") == 1:
return raw.rsplit(":", 1)[0]
return raw
def _safe_label_value(value: str) -> str:
cleaned = (value or "").strip()
if re.fullmatch(r"[a-zA-Z0-9_.:-]{1,128}", cleaned):
return cleaned
return ""
async def _update_snapshot(
snapshot: EvidenceSnapshot,
post_state: dict[str, Any],
result: str,
) -> None:
"""補填 EvidenceSnapshot 的 post_execution_state + verification_result。"""
try:
await snapshot.update_post_execution(post_state, result)
except Exception:
logger.exception("verifier_snapshot_update_failed", snapshot_id=snapshot.snapshot_id)
async def _persist_fallback_snapshot(
    *,
    incident: "Incident",
    post_state: dict[str, Any],
    result: str,
    action_taken: str,
) -> None:
    """
    Persist the verifier outcome when no pre-decision snapshot is available.

    Background (from the original note): live T14 evidence showed auto_repair
    rows with verifier decisions in logs but NULL
    incident_evidence.verification_result, because verify(snapshot=None) had
    no durable target. This fallback keeps the verification gate auditable
    while stating explicitly that no pre-execution baseline exists.

    A fresh EvidenceSnapshot is created, filled and saved; any failure is
    logged as a warning and not re-raised.
    """
    incident_id = _get_incident_id(incident)
    try:
        fallback = EvidenceSnapshot(incident_id=incident_id)
        has_state = bool(post_state)
        summary = "; ".join(
            (
                "[PostExecutionVerifier] fallback verification snapshot",
                f"action={action_taken[:160]}",
                f"result={result}",
                "pre_execution_state=missing",
            )
        )
        field_values = {
            "post_execution_state": post_state,
            "verification_result": result,
            "matched_playbook_id": _extract_playbook_id(action_taken),
            # The verifier itself counts as the single attempted sensor.
            "sensors_attempted": 1,
            "sensors_succeeded": 1 if has_state else 0,
            "mcp_health": {"post_execution_verifier": has_state},
            "evidence_summary": summary,
        }
        for field_name, field_value in field_values.items():
            setattr(fallback, field_name, field_value)
        await fallback.save()
    except Exception:
        logger.warning(
            "verifier_fallback_snapshot_save_failed",
            incident_id=incident_id,
            result=result,
            exc_info=True,
        )
    else:
        logger.info(
            "verifier_fallback_snapshot_saved",
            incident_id=incident_id,
            snapshot_id=fallback.snapshot_id,
            result=result,
            action=action_taken,
        )
def _extract_playbook_id(action_taken: str) -> str | None:
for prefix in ("auto_repair_playbook:", "auto_repair:"):
if action_taken.startswith(prefix):
playbook_id = action_taken.removeprefix(prefix).split(":", 1)[0].strip()
return playbook_id or None
return None
# ─────────────────────────────────────────────────────────────────────────────
# Singleton
# ─────────────────────────────────────────────────────────────────────────────
# Module-level cache for the lazily created verifier instance.
_verifier: PostExecutionVerifier | None = None


def get_post_execution_verifier() -> PostExecutionVerifier:
    """Return the shared PostExecutionVerifier, creating it on first use."""
    global _verifier
    instance = _verifier
    if instance is None:
        instance = PostExecutionVerifier()
        _verifier = instance
    return instance