"""
AWOOOI AIOps Phase 1 — Post-Execution Verifier
==============================================
After every AI remediation action runs, actively fetch the post-execution
environment state through MCP, compare it with
EvidenceSnapshot.pre_execution_state, and judge whether the fix really worked.

Verification result (four states):
- "success"  — the problem is resolved (Pod Running / metrics back to normal)
- "degraded" — partially improved but not fully recovered
- "failed"   — the post-execution state is worse than before, or did not improve
- "timeout"  — verification timed out (MCP could not respond)

What the result is used for:
1. Filled into EvidenceSnapshot.verification_result (basis of the Phase 3
   learning loop)
2. Passed to learning_service to update the Playbook EWMA trust_score
3. Triggers the Reviewer Agent's rollback decision (Phase 2)

Design principles:
- Wait for a warm-up period after execution (default 10s) so K8s controllers
  have time to converge
- A timeout never raises; it is recorded as "timeout" and the flow continues
- Does not block the original execution path (it is awaited, but the result
  does not affect whether the execution itself succeeded)

W2 PR-V1: SelfHealingValidator hook-up (2026-04-28 ogt + Claude Sonnet 4.6)
- When ENABLE_SELF_HEALING_VALIDATOR=True, assess_self_healing() is called
  after verify() completes
- self_healing_score < 0.5 → Telegram alert proposing a rollback (never
  executed automatically)
- A validator failure never blocks the main flow (fully wrapped in try/except)

ADR-081: PreDecisionInvestigator + EvidenceSnapshot
MASTER §3.1 L6×D1
2026-04-15 ogt + Claude Sonnet 4.6 (APAC): Phase 1 initial version
"""

from __future__ import annotations

import asyncio
import html
import re
from typing import TYPE_CHECKING, Any

import structlog

from src.db.base import get_db_context
from src.plugins.mcp.gateway import GatewayContext, McpGateway
from src.plugins.mcp.registry import AuditedMCPToolProvider
from src.services.evidence_snapshot import EvidenceSnapshot
from src.services.mcp_audit_context import with_mcp_audit_context
from src.services.mcp_tool_registry import RegisteredTool, SensorDimension, get_mcp_tool_registry
from src.services.sanitization_service import sanitize, sanitize_dict_values

# W2 PR-V1: top-level import keeps the patch path stable for tests (a lazy
# import cannot be patched). With ENABLE_SELF_HEALING_VALIDATOR=False the
# import has no runtime cost beyond loading a pure-Python module.
from src.services import self_healing_validator as _shv_module

if TYPE_CHECKING:
    from src.models.incident import Incident

logger = structlog.get_logger(__name__)

# Post-execution convergence wait (seconds) — K8s controllers need time to
# process restarts / rolling updates.
POST_EXEC_WARMUP_SEC = 10.0
# Overall verification timeout (seconds).
VERIFY_TIMEOUT_SEC = 30.0
# Per-tool MCP timeout (seconds).
TOOL_TIMEOUT_SEC = 8.0
# Overall budget for the pre-execution baseline capture. It must be strictly
# larger than TOOL_TIMEOUT_SEC: the outer timer starts before the per-tool
# timers, so with an equal budget one hung tool would cancel the whole gather
# and throw away the results of every healthy tool.
PRE_STATE_TIMEOUT_SEC = TOOL_TIMEOUT_SEC + 2.0

# Substrings (lower-case) that mark the post-state as failed.
# Gate 1 fix: the bare "error" token was removed because it false-matched
# metric keys such as error_rate / error_count; "error" as a K8s
# ContainerState reason is covered indirectly by the "failed" Pod phase.
_FAILURE_SIGNALS = ("crashloopbackoff", "oomkilled", "oomkill", "failed")

# Substrings (lower-case) that mark a state dump as healthy.
_SUCCESS_SIGNALS = (
    "running",
    "ready",
    "1/1",
    "2/2",
    "3/3",
    "healthy",
    "successfully rolled out",
    "'success': true",
    '"success": true',
)


class PostExecutionVerifier:
    """
    Post-execution environment state verifier.

    Intended to be called from execute_approved_action() in
    approval_execution.py after the action has run: verify() returns the
    verification result and back-fills the EvidenceSnapshot.

    Usage:
        verifier = get_post_execution_verifier()
        result = await verifier.verify(
            incident=incident,
            snapshot=pre_decision_snapshot,
            action_taken="restart_service:awoooi-api",
        )
        # result: "success" | "degraded" | "failed" | "timeout"
    """

    def __init__(self) -> None:
        self._registry = get_mcp_tool_registry()

    async def verify(
        self,
        incident: "Incident",
        snapshot: EvidenceSnapshot | None,
        action_taken: str,
        warmup_sec: float = POST_EXEC_WARMUP_SEC,
    ) -> str:
        """
        Verify the environment after an action was executed.

        Args:
            incident: The originating Incident (its labels locate the resource).
            snapshot: The pre-execution EvidenceSnapshot whose
                pre_execution_state is the baseline; may be None, in which
                case a fallback snapshot is persisted instead.
            action_taken: Description of the executed action
                (e.g. "restart_service:awoooi-api").
            warmup_sec: Seconds to wait for K8s to converge; <= 0 skips the wait.

        Returns:
            str: "success" | "degraded" | "failed" | "timeout"
        """
        incident_id = _get_incident_id(incident)
        logger.info(
            "verifier_start",
            incident_id=incident_id,
            action=action_taken,
            warmup_sec=warmup_sec,
        )

        # 1. Wait for convergence.
        if warmup_sec > 0:
            await asyncio.sleep(warmup_sec)

        # 2. Collect the post-execution state. Collection problems are turned
        #    into a result value and persisted; they are never raised.
        try:
            post_state = await asyncio.wait_for(
                self._collect_post_state(incident),
                timeout=VERIFY_TIMEOUT_SEC,
            )
        except asyncio.TimeoutError:
            logger.warning("verifier_timeout", incident_id=incident_id)
            await _persist_verification(
                incident=incident,
                snapshot=snapshot,
                post_state={},
                result="timeout",
                action_taken=action_taken,
            )
            return "timeout"
        except Exception:
            logger.exception("verifier_collect_error", incident_id=incident_id)
            await _persist_verification(
                incident=incident,
                snapshot=snapshot,
                post_state={},
                result="failed",
                action_taken=action_taken,
            )
            return "failed"

        # 3. Compare the state before and after.
        pre_state = snapshot.pre_execution_state if snapshot else None
        result = _assess_recovery(pre_state, post_state, action_taken)

        # 4. Record the outcome on the EvidenceSnapshot (or a fallback one).
        await _persist_verification(
            incident=incident,
            snapshot=snapshot,
            post_state=post_state,
            result=result,
            action_taken=action_taken,
        )

        logger.info(
            "verifier_done",
            incident_id=incident_id,
            result=result,
            action=action_taken,
        )

        # 5. W2 PR-V1: SelfHealingValidator hook-up (gated by
        #    ENABLE_SELF_HEALING_VALIDATOR). Runs after post_state has been
        #    recorded; the outer try/except guarantees that no validator
        #    failure can change the value verify() returns.
        try:
            await _run_self_healing_validator(
                incident_id=incident_id,
                snapshot=snapshot,
                pre_state=pre_state,
                post_state=post_state,
                verification_result=result,
                action_taken=action_taken,
            )
        except Exception:
            logger.warning(
                "self_healing_validator_uncaught",
                incident_id=incident_id,
                exc_info=True,
            )

        return result

    async def capture_pre_execution_state(
        self,
        incident: "Incident",
        snapshot: EvidenceSnapshot,
    ) -> None:
        """
        Snapshot the current state into snapshot.pre_execution_state.

        Meant to be called in approval_execution.py *before* the action runs.
        Never raises: on any collection failure (including a timeout) an empty
        baseline ``{}`` is stored instead, and persistence failures are only
        logged.
        """
        incident_id = _get_incident_id(incident)
        try:
            # Same collection logic as the post-execution pass.
            state = await asyncio.wait_for(
                self._collect_post_state(incident),
                timeout=PRE_STATE_TIMEOUT_SEC,
            )
            snapshot.pre_execution_state = state
            try:
                await snapshot.update_pre_execution(state)
            except Exception as exc:
                logger.warning(
                    "verifier_pre_state_persist_failed",
                    incident_id=incident_id,
                    snapshot_id=snapshot.snapshot_id,
                    error=str(exc),
                )
            logger.debug("verifier_pre_state_captured", incident_id=incident_id)
        except Exception:
            # Keep the cause in the log; without it an empty baseline is
            # indistinguishable from "no sensors matched".
            logger.warning(
                "verifier_pre_state_failed",
                incident_id=incident_id,
                exc_info=True,
            )
            snapshot.pre_execution_state = {}
            try:
                await snapshot.update_pre_execution({})
            except Exception as exc:
                logger.warning(
                    "verifier_empty_pre_state_persist_failed",
                    incident_id=incident_id,
                    snapshot_id=snapshot.snapshot_id,
                    error=str(exc),
                )

    async def _collect_post_state(self, incident: "Incident") -> dict[str, Any]:
        """
        Collect the environment state (K8s Pod state + key metrics).

        Only tools tagged with D1 (K8s state) or D3 (metrics) are used as the
        verification baseline; the other sensor dimensions (logs, topology, …)
        are not needed for verification.

        Returns:
            Mapping of tool name -> sanitized output. Tools that fail, time
            out, or return nothing are simply absent from the mapping.
        """
        state: dict[str, Any] = {}
        alertname = _get_alertname(incident)
        labels = _get_labels(incident)
        incident_id = _get_incident_id(incident)

        # Pick the D1 + D3 tools.
        wanted_dimensions = (SensorDimension.D1_K8S_STATE, SensorDimension.D3_METRICS)
        all_tools = self._registry.suggest_tools(alertname=alertname, incident_labels=labels)
        verify_tools = [
            tool
            for tool in all_tools
            if any(dim in wanted_dimensions for dim in tool.dimensions)
        ]

        container_name = _extract_container_name(labels)
        # Coerce the instance label before splitting: a None / non-string
        # label must not abort the whole collection.
        instance_host = str(labels.get("instance") or "").split(":")[0]
        params = {
            "namespace": labels.get("namespace", "awoooi-prod"),
            "pod_name": labels.get("pod", labels.get("name", "")),
            "deployment": labels.get("deployment", ""),
            "host": instance_host or labels.get("host", ""),
            "container_name": container_name,
            "filter_name": container_name,
            "query": _build_prometheus_query(alertname, labels),
        }

        async def _call_one(reg: RegisteredTool) -> tuple[str, Any]:
            """Run one sensor; return (tool name, output) or (tool name, None)."""
            try:
                audited_params = with_mcp_audit_context(
                    params,
                    session_id=f"incident:{incident_id}:post_execution",
                    incident_id=incident_id,
                    flywheel_node="verify",
                    agent_role="post_execution_verifier",
                )
                result = await asyncio.wait_for(
                    self._execute_tool(
                        reg=reg,
                        tool_name=reg.tool.name,
                        audited_params=audited_params,
                        incident_id=incident_id,
                    ),
                    timeout=TOOL_TIMEOUT_SEC,
                )
                if result.success and result.output:
                    return reg.tool.name, result.output
            except Exception:
                # Best effort: one broken sensor must not fail verification,
                # but leave a trace so missing evidence can be diagnosed.
                logger.debug(
                    "verifier_tool_call_failed",
                    incident_id=incident_id,
                    tool=reg.tool.name,
                    exc_info=True,
                )
            return reg.tool.name, None

        results = await asyncio.gather(*[_call_one(tool) for tool in verify_tools])
        for tool_name, output in results:
            if output is None:
                continue
            if isinstance(output, dict):
                state[tool_name] = sanitize_dict_values(output, f"post_state.{tool_name}")
            else:
                state[tool_name] = {"raw": sanitize(str(output), f"post_state.{tool_name}")}

        return state

    async def _execute_tool(
        self,
        reg: RegisteredTool,
        tool_name: str,
        audited_params: dict[str, Any],
        incident_id: str,
    ):
        """Route production post-execution sensors through AwoooP MCP Gateway.

        Raw providers are still used by unit tests and manual injections. In
        production the registry wraps providers in `AuditedMCPToolProvider`,
        and those calls must leave first-class gateway audit rows just like the
        pre-decision sense path.
        """
        if not isinstance(reg.provider, AuditedMCPToolProvider):
            return await reg.provider.execute(tool_name, audited_params)

        async with get_db_context("awoooi") as db:
            ctx = GatewayContext(
                project_id="awoooi",
                agent_id="post_execution_verifier",
                tool_name=tool_name,
                trace_id=incident_id,
                is_shadow=True,
                environment={"env": "prod"},
                required_scope="read",
            )
            return await McpGateway(db).call(ctx, audited_params)


# ─────────────────────────────────────────────────────────────────────────────
# W2 PR-V1: SelfHealingValidator hook-up
# 2026-04-28 ogt + Claude Sonnet 4.6: C6 flywheel broken-link fix
# ─────────────────────────────────────────────────────────────────────────────

async def _run_self_healing_validator(
    incident_id: str,
    snapshot: EvidenceSnapshot | None,
    pre_state: dict[str, Any] | None,
    post_state: dict[str, Any],
    verification_result: str,
    action_taken: str,
) -> None:
    """
    Entry point for the SelfHealingValidator hook-up.

    Feature gate: settings.ENABLE_SELF_HEALING_VALIDATOR; when it is falsy the
    function returns immediately. The whole body is guarded by try/except so a
    validator failure never affects the main flow.

    After the assessment:
    - back-fills the snapshot via snapshot.update_self_healing(score, detail)
    - score < 0.5 → sends a Telegram rollback-proposal alert
    """
    try:
        from src.core.config import get_settings

        _settings = get_settings()
        if not _settings.ENABLE_SELF_HEALING_VALIDATOR:
            return

        assessment = _shv_module.assess_self_healing(
            pre_state=pre_state,
            post_state=post_state,
            verification_result=verification_result,
            action_taken=action_taken,
        )

        score: float = assessment["score"]
        logger.info(
            "self_healing_assessed",
            incident_id=incident_id,
            score=score,
            regressions=assessment.get("regressions", []),
            root_cause_cleared=assessment.get("root_cause_cleared"),
            detail=assessment.get("detail"),
        )

        # Back-fill the EvidenceSnapshot; a persistence failure must not
        # suppress the alert below.
        if snapshot:
            try:
                await snapshot.update_self_healing(score=score, detail=assessment)
            except Exception as _snap_err:
                logger.warning(
                    "self_healing_snapshot_update_failed",
                    incident_id=incident_id,
                    error=str(_snap_err),
                )

        # score < 0.5 → Telegram rollback-proposal alert.
        if score < 0.5:
            await _send_rollback_proposal_alert(
                incident_id=incident_id,
                score=score,
                assessment=assessment,
                action_taken=action_taken,
            )

    except Exception:
        logger.warning(
            "self_healing_validator_error",
            incident_id=incident_id,
            exc_info=True,
        )


async def _send_rollback_proposal_alert(
    incident_id: str,
    score: float,
    assessment: dict[str, Any],
    action_taken: str,
) -> None:
    """
    Send a Telegram rollback-proposal alert when the self-healing score < 0.5.

    The rollback is never executed automatically; this only notifies humans.
    Send failures are logged and swallowed.
    """
    try:
        from src.core.config import get_settings
        from src.services.telegram_gateway import get_telegram_gateway

        _settings = get_settings()
        gateway = get_telegram_gateway()

        regressions = assessment.get("regressions", [])
        reg_str = ", ".join(str(item) for item in regressions[:5]) if regressions else "無"
        root_cleared = "是" if assessment.get("root_cause_cleared") else "否"

        # The message is sent with parse_mode=HTML, so every interpolated
        # value must have <, > and & escaped; otherwise Telegram rejects the
        # whole message and the alert is silently lost. Truncate before
        # escaping so an entity is never cut in half.
        safe_incident = html.escape(str(incident_id), quote=False)
        safe_action = html.escape((action_taken or "")[:120], quote=False)
        safe_regressions = html.escape(reg_str, quote=False)

        text = (
            f"⚠️ 自愈品質警示 — 建議人工評估 Rollback\n"
            f"Incident: {safe_incident}\n"
            f"動作: {safe_action}\n"
            f"自愈分數: {score:.2f} (門檻 0.5)\n"
            f"Root Cause 解除: {root_cleared}\n"
            f"Regression 信號: {safe_regressions}\n"
            f"此為提案,不會自動執行 Rollback"
        )

        target_chat_id = _settings.SRE_GROUP_CHAT_ID
        await gateway._send_request(
            "sendMessage",
            {
                "chat_id": target_chat_id,
                "text": text,
                "parse_mode": "HTML",
            },
        )
        logger.info(
            "rollback_proposal_sent",
            incident_id=incident_id,
            score=score,
        )
    except Exception:
        logger.warning(
            "rollback_proposal_send_failed",
            incident_id=incident_id,
            exc_info=True,
        )


# ─────────────────────────────────────────────────────────────────────────────
# Recovery Assessment
# ─────────────────────────────────────────────────────────────────────────────

def _assess_recovery(
    pre_state: dict[str, Any] | None,
    post_state: dict[str, Any],
    action_taken: str,
) -> str:
    """
    Assess the effect of the remediation.

    Phase 1 uses heuristics over the stringified state dump. (The original
    notes plan a Phase 4 move to dynamic baselines instead of static signals.)

    Heuristics, in evaluation order:
    - post_state empty (no MCP response)                    → "degraded"
    - post_state contains CrashLoopBackOff / OOMKilled /
      OOMKill / Failed                                      → "failed"
    - no success signal in post_state                       → "degraded"
    - success signal, but the action was observe-only       → "degraded"
    - success signal and pre_state was already healthy      → "success" for
      restart/delete actions, otherwise "degraded" (likely a no-op)
    - success signal and pre_state was not healthy          → "success"

    NOTE(review): matching is plain substring search, so e.g. "failed" also
    matches keys like "failed_jobs" and "ready" also matches "notready";
    confirm against real sensor output before tightening.

    Returns:
        str: "success" | "degraded" | "failed"
    """
    if not post_state:
        return "degraded"

    # Stringify for the heuristic scan.
    post_str = str(post_state).lower()
    pre_str = str(pre_state).lower() if pre_state else ""

    if any(sig in post_str for sig in _FAILURE_SIGNALS):
        return "failed"

    has_success_signal = any(sig in post_str for sig in _SUCCESS_SIGNALS)
    if not has_success_signal:
        has_success_signal = _docker_state_indicates_running(post_str)

    # No positive evidence of recovery — whether or not anything changed.
    if not has_success_signal:
        return "degraded"

    # Collecting evidence cannot have fixed anything.
    if _is_observe_only_action(action_taken):
        return "degraded"

    # Already healthy before the action: probably a no-op, unless the action
    # was a restart/delete, where Running → Running is the expected outcome.
    if pre_str and any(sig in pre_str for sig in _SUCCESS_SIGNALS):
        lowered_action = (action_taken or "").lower()
        if "restart" in lowered_action or "delete" in lowered_action:
            return "success"
        return "degraded"

    return "success"


# ─────────────────────────────────────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────────────────────────────────────

def _get_incident_id(incident: "Incident") -> str:
    """Return incident.incident_id when present, else str(incident.id)."""
    return incident.incident_id if hasattr(incident, "incident_id") else str(incident.id)


def _get_alertname(incident: "Incident") -> str:
    """Return the alert name of the incident's first signal, or ""."""
    if incident.signals:
        signal = incident.signals[0]
        labels = signal.labels or {}
        return labels.get("alertname", "") or getattr(signal, "alert_name", "")
    return ""


def _get_labels(incident: "Incident") -> dict[str, Any]:
    """Return the labels of the incident's first signal, or an empty dict."""
    if incident.signals:
        return incident.signals[0].labels or {}
    return {}


def _extract_container_name(labels: dict[str, Any]) -> str:
    """Resolve Docker container target labels for post-execution SSH sensors.

    Returns the first non-empty candidate label that contains no ``{`` / ``}``
    (i.e. is not an unrendered template placeholder), or "".
    """
    for key in ("filter_name", "container_name", "container", "resource", "name"):
        value = str(labels.get(key) or "").strip()
        if value and "{" not in value and "}" not in value:
            return value
    return ""


def _is_observe_only_action(action_taken: str) -> bool:
    """Return true when the executed step collected evidence but did not mutate state."""
    lowered = (action_taken or "").lower()
    observe_tokens = (
        "mcp:ssh_diagnose",
        "ssh_diagnose",
        "docker stats",
        "ps aux",
        "free -h",
        "df -h",
    )
    mutation_tokens = (
        "restart",
        "delete",
        "rollout",
        "scale",
        "patch",
        "apply",
        "prune",
        "truncate",
        "clear",
    )
    return any(token in lowered for token in observe_tokens) and not any(
        token in lowered for token in mutation_tokens
    )


def _docker_state_indicates_running(post_str: str) -> bool:
    """Recognize Docker-specific healthy/running output without matching uptime."""
    # Only trust "Up 3 hours"-style text when the dump clearly comes from a
    # docker command.
    if not any(token in post_str for token in ("docker ps", "docker inspect", "docker stats")):
        return False
    return bool(
        re.search(r'\bup\s+\d', post_str)
        or re.search(r'["\']status["\']\s*:\s*["\']running["\']', post_str)
        or re.search(r'["\']running["\']\s*:\s*true', post_str)
    )


def _build_prometheus_query(alertname: str, labels: dict[str, Any]) -> str:
    """Build a non-empty PromQL probe for post-execution metric sensors.

    The query family is chosen from substrings of the alert name (docker
    container → host/node → Kubernetes workload). Every label value that is
    interpolated into the query goes through `_safe_label_value`, so a value
    with quotes/braces is dropped rather than injected into the PromQL text.
    """
    alert = (alertname or "").lower()
    namespace = _safe_label_value(str(labels.get("namespace") or "awoooi-prod"))
    pod_name = _safe_label_value(str(labels.get("pod") or labels.get("name") or ""))
    host = _safe_label_value(str(labels.get("host") or _short_instance(labels.get("instance")) or ""))
    container = _safe_label_value(str(
        labels.get("container_name")
        or labels.get("container")
        or labels.get("name")
        or ""
    ))

    if alert.startswith("dockercontainer"):
        selector = _selector({
            "host": host,
            "container_name": container,
        })
        if "memory" in alert:
            return (
                f"docker_container_memory_usage_bytes{selector} / "
                f"docker_container_memory_limit_bytes{selector}"
            )
        if "restart" in alert:
            return (
                f"increase(docker_container_inspect_restart_count{selector}[15m]) "
                f"or increase(docker_container_restart_count{selector}[15m])"
            )
        if "cpu" in alert:
            return f"docker_container_cpu_cores{selector}"
        return f"docker_container_info{selector}"

    if any(key in alert for key in ("host", "node")):
        # Sanitize like every other label: the raw instance label is
        # alert-supplied text and must not be spliced into PromQL verbatim.
        instance = _safe_label_value(str(labels.get("instance") or ""))
        instance_selector = _selector({"instance": instance})
        if "memory" in alert or "oom" in alert:
            return f"node_memory_MemAvailable_bytes{instance_selector}"
        if "disk" in alert or "storage" in alert:
            return f"node_filesystem_avail_bytes{instance_selector}"
        if "load" in alert or "cpu" in alert:
            return f"node_load5{instance_selector}"
        return f"up{instance_selector}"

    pod_filter = f',pod=~"{pod_name}.*"' if pod_name else ""
    if any(key in alert for key in ("memory", "mem", "oom")):
        return (
            f'avg(container_memory_working_set_bytes{{namespace="{namespace}"{pod_filter}}}) '
            "/ 1048576"
        )
    if any(key in alert for key in ("cpu", "load", "throttl")):
        return f'avg(rate(container_cpu_usage_seconds_total{{namespace="{namespace}"{pod_filter}}}[5m]))'
    if any(key in alert for key in ("crash", "restart", "backoff")):
        return f'sum(increase(kube_pod_container_status_restarts_total{{namespace="{namespace}"}}[15m]))'
    if any(key in alert for key in ("http", "error", "5xx", "probe", "down", "unhealthy")):
        return "1 - avg(probe_success)"
    return f'up{{namespace="{namespace}"}}'


def _selector(labels: dict[str, str]) -> str:
    """Render ``{k="v",...}`` from the non-empty labels, or "" when none remain."""
    parts = [f'{key}="{value}"' for key, value in labels.items() if value]
    return "{" + ",".join(parts) + "}" if parts else ""


def _short_instance(instance: Any) -> str:
    """Strip a trailing ``:port`` when the value contains exactly one colon."""
    raw = str(instance or "")
    if raw.count(":") == 1:
        return raw.rsplit(":", 1)[0]
    return raw


def _safe_label_value(value: str) -> str:
    """Return the stripped value if it is 1-128 chars of ``[a-zA-Z0-9_.:-]``, else ""."""
    cleaned = (value or "").strip()
    if re.fullmatch(r"[a-zA-Z0-9_.:-]{1,128}", cleaned):
        return cleaned
    return ""


async def _update_snapshot(
    snapshot: EvidenceSnapshot,
    post_state: dict[str, Any],
    result: str,
) -> None:
    """Back-fill post_execution_state + verification_result on the snapshot.

    Persistence failures are logged and swallowed.
    """
    try:
        await snapshot.update_post_execution(post_state, result)
    except Exception:
        logger.exception("verifier_snapshot_update_failed", snapshot_id=snapshot.snapshot_id)


async def _persist_fallback_snapshot(
    *,
    incident: "Incident",
    post_state: dict[str, Any],
    result: str,
    action_taken: str,
) -> None:
    """
    Persist verifier outcome even when the pre-decision snapshot is unavailable.

    Live T14 evidence showed auto_repair rows with verifier decisions in logs but
    NULL incident_evidence.verification_result because verify(snapshot=None) had
    no durable target. This fallback makes the verification gate auditable without
    pretending a pre-execution baseline existed.
    """
    incident_id = _get_incident_id(incident)
    try:
        snapshot = EvidenceSnapshot(incident_id=incident_id)
        snapshot.post_execution_state = post_state
        snapshot.verification_result = result
        snapshot.matched_playbook_id = _extract_playbook_id(action_taken)
        # The verifier counts as a single sensor that succeeded iff it
        # produced any post-state.
        snapshot.sensors_attempted = 1
        snapshot.sensors_succeeded = 1 if post_state else 0
        snapshot.mcp_health = {"post_execution_verifier": bool(post_state)}
        snapshot.evidence_summary = (
            "[PostExecutionVerifier] fallback verification snapshot; "
            f"action={action_taken[:160]}; result={result}; "
            "pre_execution_state=missing"
        )
        await snapshot.save()
        logger.info(
            "verifier_fallback_snapshot_saved",
            incident_id=incident_id,
            snapshot_id=snapshot.snapshot_id,
            result=result,
            action=action_taken,
        )
    except Exception:
        logger.warning(
            "verifier_fallback_snapshot_save_failed",
            incident_id=incident_id,
            result=result,
            exc_info=True,
        )


async def _persist_verification(
    *,
    incident: "Incident",
    snapshot: EvidenceSnapshot | None,
    post_state: dict[str, Any],
    result: str,
    action_taken: str,
) -> None:
    """Record a verification outcome on the given snapshot, or on a fallback one.

    With a snapshot the outcome is back-filled onto it; without one a
    standalone fallback snapshot is saved so the result is still durable.
    Both paths log and swallow their own persistence errors.
    """
    if snapshot:
        await _update_snapshot(snapshot, post_state, result)
    else:
        await _persist_fallback_snapshot(
            incident=incident,
            post_state=post_state,
            result=result,
            action_taken=action_taken,
        )


def _extract_playbook_id(action_taken: str) -> str | None:
    """Return the playbook id from an ``auto_repair[_playbook]:<id>[:...]`` action, else None."""
    for prefix in ("auto_repair_playbook:", "auto_repair:"):
        if action_taken.startswith(prefix):
            playbook_id = action_taken.removeprefix(prefix).split(":", 1)[0].strip()
            return playbook_id or None
    return None


# ─────────────────────────────────────────────────────────────────────────────
# Singleton
# ─────────────────────────────────────────────────────────────────────────────

_verifier: PostExecutionVerifier | None = None


def get_post_execution_verifier() -> PostExecutionVerifier:
    """Return the process-wide PostExecutionVerifier, creating it on first use."""
    global _verifier
    if _verifier is None:
        _verifier = PostExecutionVerifier()
    return _verifier