Files
awoooi/apps/api/src/services/post_execution_verifier.py
Your Name ee2cc2bfc3
Some checks failed
CD Pipeline / tests (push) Failing after 1m23s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Successful in 15s
fix(alerts): 收斂 Telegram 告警到 SRE 戰情室
2026-06-12 11:06:16 +08:00

747 lines
28 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
AWOOOI AIOps Phase 1 — 執行後驗證器
=====================================
每次 AI 修復動作執行後,主動用 MCP 抓取環境後狀態,
與 EvidenceSnapshot.pre_execution_state 對比,
判斷修復是否真的有效。
驗證結果四態:
- "success" — 問題已解決(Pod Running / 指標恢復正常)
- "degraded" — 部分改善但未完全恢復
- "failed" — 執行後狀態比執行前更差,或完全未改善
- "timeout" — 驗證超時(MCP 無法回應)
驗證結果用途:
1. 填入 EvidenceSnapshot.verification_result(Phase 3 學習閉環基礎)
2. 傳給 learning_service 更新 Playbook EWMA trust_score
3. 觸發 Reviewer Agent 的 rollback 決策(Phase 2)
設計原則:
- 執行後等待 warm-up period(預設 10s),讓 K8s controller 有時間收斂
- 超時不 raise,標記 "timeout" 並繼續流程
- 不阻塞原始執行路徑await但結果不影響執行本身是否成功
W2 PR-V1: SelfHealingValidator 串接 (2026-04-28 ogt + Claude Sonnet 4.6)
- ENABLE_SELF_HEALING_VALIDATOR=True 時verify() 完成後呼叫 assess_self_healing()
- self_healing_score < 0.5 → Telegram 警示 rollback 提案(不自動執行)
- 驗證失敗不阻塞主流程try/except 全包)
ADR-081: PreDecisionInvestigator + EvidenceSnapshot
MASTER §3.1 L6×D1
2026-04-15 ogt + Claude Sonnet 4.6 (亞太): Phase 1 初始建立
"""
from __future__ import annotations
import asyncio
import re
from typing import TYPE_CHECKING, Any
import structlog
from src.db.base import get_db_context
from src.plugins.mcp.gateway import GatewayContext, McpGateway
from src.plugins.mcp.registry import AuditedMCPToolProvider
from src.services.evidence_snapshot import EvidenceSnapshot
from src.services.mcp_audit_context import with_mcp_audit_context
from src.services.mcp_tool_registry import RegisteredTool, SensorDimension, get_mcp_tool_registry
from src.services.sanitization_service import sanitize, sanitize_dict_values
# W2 PR-V1: 頂層 import 讓測試 patch 路徑固定(延遲 import 無法被 patch
# ENABLE_SELF_HEALING_VALIDATOR=False 時此 import 不影響效能(純 python 模組)
from src.services import self_healing_validator as _shv_module
if TYPE_CHECKING:
from src.models.incident import Incident
logger = structlog.get_logger(__name__)
# Seconds to wait after execution before sampling state — the K8s controller
# needs time to process restarts / rolling updates.
POST_EXEC_WARMUP_SEC = 10.0
# Verification timeout in seconds (bounds the whole post-state collection in verify()).
VERIFY_TIMEOUT_SEC = 30.0
# Timeout in seconds for a single MCP tool call.
TOOL_TIMEOUT_SEC = 8.0
class PostExecutionVerifier:
    """Verify the environment state after a remediation action has been executed.

    Meant to be called from ``execute_approved_action()`` in
    ``approval_execution.py`` right after the action ran: ``verify()`` returns
    the verification verdict and back-fills the ``EvidenceSnapshot``.

    Usage:
        verifier = get_post_execution_verifier()
        result = await verifier.verify(
            incident=incident,
            snapshot=pre_decision_snapshot,
            action_taken="restart_service:awoooi-api",
        )
        # result: "success" | "degraded" | "failed" | "timeout"
    """

    def __init__(self) -> None:
        self._registry = get_mcp_tool_registry()

    async def verify(
        self,
        incident: "Incident",
        snapshot: EvidenceSnapshot | None,
        action_taken: str,
        warmup_sec: float = POST_EXEC_WARMUP_SEC,
    ) -> str:
        """Collect the post-execution state and judge whether the action helped.

        Args:
            incident: Originating incident; its first signal's labels locate
                the resource to probe.
            snapshot: Pre-decision snapshot whose ``pre_execution_state`` is the
                baseline. When falsy, a fallback snapshot is persisted instead.
            action_taken: Description of the executed action
                (e.g. ``"restart_service:awoooi-api"``).
            warmup_sec: Seconds to sleep before sampling; skipped when <= 0.

        Returns:
            ``"success"``, ``"degraded"``, ``"failed"`` or ``"timeout"``.
            Collection errors are reported as ``"failed"`` / ``"timeout"``
            rather than raised.
        """
        incident_id = _get_incident_id(incident)
        logger.info(
            "verifier_start",
            incident_id=incident_id,
            action=action_taken,
            warmup_sec=warmup_sec,
        )

        # 1. Give the platform time to converge before sampling.
        if warmup_sec > 0:
            await asyncio.sleep(warmup_sec)

        # 2. Sample the post-execution state, bounded by VERIFY_TIMEOUT_SEC.
        try:
            post_state = await asyncio.wait_for(
                self._collect_post_state(incident),
                timeout=VERIFY_TIMEOUT_SEC,
            )
        except asyncio.TimeoutError:
            logger.warning("verifier_timeout", incident_id=incident_id)
            await self._record_outcome(incident, snapshot, {}, "timeout", action_taken)
            return "timeout"
        except Exception:
            logger.exception("verifier_collect_error", incident_id=incident_id)
            await self._record_outcome(incident, snapshot, {}, "failed", action_taken)
            return "failed"

        # 3. Compare against the pre-execution baseline (if any).
        pre_state = snapshot.pre_execution_state if snapshot else None
        result = _assess_recovery(pre_state, post_state, action_taken)

        # 4. Persist the verdict.
        await self._record_outcome(incident, snapshot, post_state, result, action_taken)
        logger.info(
            "verifier_done",
            incident_id=incident_id,
            result=result,
            action=action_taken,
        )

        # 5. Self-healing validator hook. The outer guard guarantees that a
        #    validator failure can never change the value returned by verify().
        try:
            await _run_self_healing_validator(
                incident_id=incident_id,
                snapshot=snapshot,
                pre_state=pre_state,
                post_state=post_state,
                verification_result=result,
                action_taken=action_taken,
            )
        except Exception:
            logger.warning(
                "self_healing_validator_uncaught",
                incident_id=incident_id,
                exc_info=True,
            )
        return result

    @staticmethod
    async def _record_outcome(
        incident: "Incident",
        snapshot: EvidenceSnapshot | None,
        post_state: dict[str, Any],
        result: str,
        action_taken: str,
    ) -> None:
        """Write the verdict to ``snapshot``, or to a fallback snapshot when none exists."""
        if snapshot:
            await _update_snapshot(snapshot, post_state, result)
        else:
            await _persist_fallback_snapshot(
                incident=incident,
                post_state=post_state,
                result=result,
                action_taken=action_taken,
            )

    async def capture_pre_execution_state(
        self,
        incident: "Incident",
        snapshot: EvidenceSnapshot,
    ) -> None:
        """Sample the current state and store it as ``snapshot.pre_execution_state``.

        Meant to be called before the action is executed. Never raises: on any
        collection failure (including timeout) an empty dict is stored instead.
        """
        incident_id = _get_incident_id(incident)
        try:
            # Same collection logic as the post-execution sample.
            # NOTE(review): the whole collection shares the single-tool timeout
            # here, so one slow sensor discards the baseline — confirm intended.
            state = await asyncio.wait_for(
                self._collect_post_state(incident),
                timeout=TOOL_TIMEOUT_SEC,
            )
            snapshot.pre_execution_state = state
            try:
                await snapshot.update_pre_execution(state)
            except Exception as exc:
                logger.warning(
                    "verifier_pre_state_persist_failed",
                    incident_id=incident_id,
                    snapshot_id=snapshot.snapshot_id,
                    error=str(exc),
                )
            logger.debug("verifier_pre_state_captured", incident_id=incident_id)
        except Exception:
            # Keep the cause in the log; previously it was dropped entirely.
            logger.warning(
                "verifier_pre_state_failed",
                incident_id=incident_id,
                exc_info=True,
            )
            snapshot.pre_execution_state = {}
            try:
                await snapshot.update_pre_execution({})
            except Exception as exc:
                logger.warning(
                    "verifier_empty_pre_state_persist_failed",
                    incident_id=incident_id,
                    snapshot_id=snapshot.snapshot_id,
                    error=str(exc),
                )

    async def _collect_post_state(self, incident: "Incident") -> dict[str, Any]:
        """Run the D1 (K8s state) and D3 (metrics) sensors and return their sanitized output.

        Only tools tagged with one of those two dimensions are called. Each
        tool is called concurrently with its own ``TOOL_TIMEOUT_SEC`` budget; a
        tool that fails, times out, or returns nothing is simply absent from
        the result.

        Returns:
            Mapping of tool name to sanitized output. Non-dict outputs are
            stringified and stored under the ``"raw"`` key.
        """
        alertname = _get_alertname(incident)
        labels = _get_labels(incident)
        incident_id = _get_incident_id(incident)

        wanted_dimensions = (SensorDimension.D1_K8S_STATE, SensorDimension.D3_METRICS)
        all_tools = self._registry.suggest_tools(alertname=alertname, incident_labels=labels)
        verify_tools = [
            tool for tool in all_tools
            if any(dim in wanted_dimensions for dim in tool.dimensions)
        ]

        container_name = _extract_container_name(labels)
        params = {
            "namespace": labels.get("namespace", "awoooi-prod"),
            "pod_name": labels.get("pod", labels.get("name", "")),
            "deployment": labels.get("deployment", ""),
            "host": labels.get("instance", "").split(":")[0] or labels.get("host", ""),
            "container_name": container_name,
            "filter_name": container_name,
            "query": _build_prometheus_query(alertname, labels),
        }

        async def _call_one(reg: RegisteredTool) -> tuple[str, Any]:
            """Call one sensor; return ``(tool_name, output)`` or ``(tool_name, None)``."""
            tool_name = reg.tool.name
            try:
                audited_params = with_mcp_audit_context(
                    params,
                    session_id=f"incident:{incident_id}:post_execution",
                    incident_id=incident_id,
                    flywheel_node="verify",
                    agent_role="post_execution_verifier",
                )
                result = await asyncio.wait_for(
                    self._execute_tool(
                        reg=reg,
                        tool_name=tool_name,
                        audited_params=audited_params,
                        incident_id=incident_id,
                    ),
                    timeout=TOOL_TIMEOUT_SEC,
                )
                if result.success and result.output:
                    return tool_name, result.output
            except Exception as exc:
                # Best-effort sensor: stay non-fatal, but leave a trace so a
                # missing sensor is diagnosable instead of silently ignored.
                logger.warning(
                    "verifier_tool_call_failed",
                    incident_id=incident_id,
                    tool=tool_name,
                    error_type=type(exc).__name__,
                    error=str(exc),
                )
            return tool_name, None

        results = await asyncio.gather(*[_call_one(tool) for tool in verify_tools])

        state: dict[str, Any] = {}
        for tool_name, output in results:
            if output is None:
                continue
            if isinstance(output, dict):
                state[tool_name] = sanitize_dict_values(output, f"post_state.{tool_name}")
            else:
                state[tool_name] = {"raw": sanitize(str(output), f"post_state.{tool_name}")}
        return state

    async def _execute_tool(
        self,
        reg: RegisteredTool,
        tool_name: str,
        audited_params: dict[str, Any],
        incident_id: str,
    ):
        """Route production post-execution sensors through AwoooP MCP Gateway.

        Raw providers are still used by unit tests and manual injections. In
        production the registry wraps providers in `AuditedMCPToolProvider`, and
        those calls must leave first-class gateway audit rows just like the
        pre-decision sense path.
        """
        if not isinstance(reg.provider, AuditedMCPToolProvider):
            return await reg.provider.execute(tool_name, audited_params)
        async with get_db_context("awoooi") as db:
            ctx = GatewayContext(
                project_id="awoooi",
                agent_id="post_execution_verifier",
                tool_name=tool_name,
                trace_id=incident_id,
                is_shadow=True,
                environment={"env": "prod"},
                required_scope="read",
            )
            return await McpGateway(db).call(ctx, audited_params)
# ─────────────────────────────────────────────────────────────────────────────
# W2 PR-V1: SelfHealingValidator 串接
# 2026-04-28 ogt + Claude Sonnet 4.6: C6 飛輪斷鏈修復
# ─────────────────────────────────────────────────────────────────────────────
async def _run_self_healing_validator(
    incident_id: str,
    snapshot: EvidenceSnapshot | None,
    pre_state: dict[str, Any] | None,
    post_state: dict[str, Any],
    verification_result: str,
    action_taken: str,
) -> None:
    """Entry point that chains the SelfHealingValidator after verification.

    Does nothing unless the ``ENABLE_SELF_HEALING_VALIDATOR`` setting is truthy.
    When enabled it:
      - scores the repair via ``assess_self_healing``,
      - back-fills the score and full assessment onto ``snapshot`` (if given),
      - sends a rollback-proposal alert when the score is below 0.5.

    Every failure is logged and swallowed so the caller's flow is unaffected.
    """
    try:
        from src.core.config import get_settings

        if not get_settings().ENABLE_SELF_HEALING_VALIDATOR:
            return

        assessment = _shv_module.assess_self_healing(
            pre_state=pre_state,
            post_state=post_state,
            verification_result=verification_result,
            action_taken=action_taken,
        )
        score: float = assessment["score"]
        logger.info(
            "self_healing_assessed",
            incident_id=incident_id,
            score=score,
            regressions=assessment.get("regressions", []),
            root_cause_cleared=assessment.get("root_cause_cleared"),
            detail=assessment.get("detail"),
        )

        # Back-fill the snapshot; a persistence failure must not block the alert.
        if snapshot:
            try:
                await snapshot.update_self_healing(score=score, detail=assessment)
            except Exception as persist_error:
                logger.warning(
                    "self_healing_snapshot_update_failed",
                    incident_id=incident_id,
                    error=str(persist_error),
                )

        # Low score: propose (never execute) a rollback.
        if score < 0.5:
            await _send_rollback_proposal_alert(
                incident_id=incident_id,
                score=score,
                assessment=assessment,
                action_taken=action_taken,
            )
    except Exception:
        logger.warning(
            "self_healing_validator_error",
            incident_id=incident_id,
            exc_info=True,
        )
async def _send_rollback_proposal_alert(
    incident_id: str,
    score: float,
    assessment: dict[str, Any],
    action_taken: str,
) -> None:
    """Send a Telegram alert proposing a manual rollback review.

    Called when the self-healing score is below 0.5. This only notifies; it
    never performs a rollback. Any failure is logged and swallowed.

    Args:
        incident_id: Incident identifier shown in the message.
        score: Self-healing score (rendered with two decimals).
        assessment: Validator output; ``regressions`` and
            ``root_cause_cleared`` are read from it.
        action_taken: Executed action; truncated to 120 characters.
    """
    try:
        from html import escape

        from src.core.config import get_settings
        from src.services.telegram_gateway import get_telegram_gateway

        _settings = get_settings()
        gateway = get_telegram_gateway()

        # The message is sent with parse_mode=HTML, so every dynamic value must
        # be escaped; an unescaped "<" or "&" would corrupt or reject the message.
        safe_incident = escape(str(incident_id), quote=False)
        # Truncate first, then escape, so an entity is never cut in half.
        safe_action = escape(str(action_taken)[:120], quote=False)

        regressions = assessment.get("regressions") or []
        # str() guards against non-string regression entries breaking join().
        reg_str = (
            ", ".join(escape(str(item), quote=False) for item in regressions[:5])
            if regressions
            else "無"
        )
        # Both branches used to be empty strings, which left the field blank.
        root_cleared = "是" if assessment.get("root_cause_cleared") else "否"

        text = (
            f"⚠️ <b>自愈品質警示 — 建議人工評估 Rollback</b>\n"
            f"Incident: <code>{safe_incident}</code>\n"
            f"動作: <code>{safe_action}</code>\n"
            f"自愈分數: <b>{score:.2f}</b> (門檻 0.5)\n"
            f"Root Cause 解除: {root_cleared}\n"
            f"Regression 信號: {reg_str}\n"
            f"<i>此為提案,不會自動執行 Rollback</i>"
        )
        target_chat_id = _settings.SRE_GROUP_CHAT_ID
        await gateway._send_request(
            "sendMessage",
            {
                "chat_id": target_chat_id,
                "text": text,
                "parse_mode": "HTML",
            },
        )
        logger.info(
            "rollback_proposal_sent",
            incident_id=incident_id,
            score=score,
        )
    except Exception:
        logger.warning(
            "rollback_proposal_send_failed",
            incident_id=incident_id,
            exc_info=True,
        )
# ─────────────────────────────────────────────────────────────────────────────
# Recovery Assessment
# ─────────────────────────────────────────────────────────────────────────────
def _assess_recovery(
    pre_state: dict[str, Any] | None,
    post_state: dict[str, Any],
    action_taken: str,
) -> str:
    """Classify the repair outcome with string heuristics over the sampled state.

    Both states are stringified and lower-cased, then scanned for markers:
      - empty ``post_state``                         -> "degraded"
      - any failure marker in the post state         -> "failed"
      - no success marker in the post state          -> "degraded"
      - success marker, but observe-only action      -> "degraded"
      - success marker and baseline was not healthy  -> "success"
      - success marker and baseline already healthy  -> "success" only for
        restart/delete actions, otherwise "degraded"

    Returns:
        "success", "degraded" or "failed".
    """
    if not post_state:
        return "degraded"

    post_str = str(post_state).lower()
    pre_str = str(pre_state).lower() if pre_state else ""

    # A bare "error" marker is deliberately absent: it would match metric keys
    # such as error_rate / error_count.
    failure_signals = ("crashloopbackoff", "oomkilled", "oomkill", "failed")
    for marker in failure_signals:
        if marker in post_str:
            return "failed"

    success_signals = (
        "running",
        "ready",
        "1/1",
        "2/2",
        "3/3",
        "healthy",
        "successfully rolled out",
        "'success': true",
        '"success": true',
    )
    looks_healthy = any(marker in post_str for marker in success_signals) or (
        _docker_state_indicates_running(post_str)
    )
    if not looks_healthy:
        # Covers both "unchanged" and "changed but not recognisably healthy".
        return "degraded"

    # Evidence-gathering steps cannot claim credit for a healthy state.
    if _is_observe_only_action(action_taken):
        return "degraded"

    baseline_was_healthy = bool(pre_str) and any(
        marker in pre_str for marker in success_signals
    )
    if not baseline_was_healthy:
        return "success"

    # Healthy before and after: only a restart/delete counts as a real repair.
    lowered_action = action_taken.lower()
    if "restart" in lowered_action or "delete" in lowered_action:
        return "success"
    return "degraded"
# ─────────────────────────────────────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────────────────────────────────────
def _get_incident_id(incident: "Incident") -> str:
    """Return ``incident.incident_id`` when present, else ``str(incident.id)``."""
    if not hasattr(incident, "incident_id"):
        return str(incident.id)
    return incident.incident_id
def _get_alertname(incident: "Incident") -> str:
    """Return the alert name of the incident's first signal, or "" if there is none.

    The ``alertname`` label wins; the signal's ``alert_name`` attribute is the
    fallback when the label is missing or empty.
    """
    signals = incident.signals
    if not signals:
        return ""
    first_signal = signals[0]
    label_map = first_signal.labels or {}
    from_label = label_map.get("alertname", "")
    if from_label:
        return from_label
    return getattr(first_signal, "alert_name", "")
def _get_labels(incident: "Incident") -> dict[str, Any]:
    """Return the labels of the incident's first signal, or an empty dict."""
    signals = incident.signals
    if not signals:
        return {}
    return signals[0].labels or {}
def _extract_container_name(labels: dict[str, Any]) -> str:
    """Pick the Docker container target from the labels for post-execution sensors.

    Keys are tried in priority order; the first non-blank value that contains
    no ``{`` / ``}`` (i.e. is not an unrendered template) is returned, stripped.
    Returns "" when nothing qualifies.
    """
    priority_keys = ("filter_name", "container_name", "container", "resource", "name")
    candidates = (str(labels.get(key) or "").strip() for key in priority_keys)
    return next(
        (text for text in candidates if text and not ("{" in text or "}" in text)),
        "",
    )
def _is_observe_only_action(action_taken: str) -> bool:
    """Tell whether the step only gathered evidence without mutating state.

    True when the lower-cased action mentions at least one read-only marker and
    none of the mutating markers. A falsy ``action_taken`` yields False.
    """
    text = (action_taken or "").lower()
    read_only_markers = (
        "mcp:ssh_diagnose",
        "ssh_diagnose",
        "docker stats",
        "ps aux",
        "free -h",
        "df -h",
    )
    mutating_markers = (
        "restart",
        "delete",
        "rollout",
        "scale",
        "patch",
        "apply",
        "prune",
        "truncate",
        "clear",
    )
    if not any(marker in text for marker in read_only_markers):
        return False
    return all(marker not in text for marker in mutating_markers)
def _docker_state_indicates_running(post_str: str) -> bool:
    """Detect Docker-specific running/healthy output without matching plain uptime.

    Requires a Docker command marker in the text first, then any of: an
    ``up <digit>`` status, a ``status: running`` pair, or a ``running: true`` pair.
    """
    docker_markers = ("docker ps", "docker inspect", "docker stats")
    if all(marker not in post_str for marker in docker_markers):
        return False
    running_patterns = (
        r'\bup\s+\d',
        r'["\']status["\']\s*:\s*["\']running["\']',
        r'["\']running["\']\s*:\s*true',
    )
    return any(re.search(pattern, post_str) is not None for pattern in running_patterns)
def _build_prometheus_query(alertname: str, labels: dict[str, Any]) -> str:
    """Build a non-empty PromQL probe for post-execution metric sensors.

    The query family is chosen from the lower-cased alert name:
      - ``dockercontainer*`` -> docker_container_* metrics selected by host/container
      - contains ``host`` / ``node`` -> node_* metrics (or ``up``) selected by instance
      - otherwise -> container/kube/probe metrics scoped to the namespace

    Every label value interpolated into the query is first passed through
    ``_safe_label_value``; a value that fails validation is dropped from the
    selector rather than embedded verbatim.

    Args:
        alertname: Alert name; may be empty or None.
        labels: Incident labels used to scope the query.

    Returns:
        A PromQL expression string (never empty).
    """
    alert = (alertname or "").lower()
    namespace = _safe_label_value(str(labels.get("namespace") or "awoooi-prod"))
    pod_name = _safe_label_value(str(labels.get("pod") or labels.get("name") or ""))
    host = _safe_label_value(str(labels.get("host") or _short_instance(labels.get("instance")) or ""))
    container = _safe_label_value(str(
        labels.get("container_name")
        or labels.get("container")
        or labels.get("name")
        or ""
    ))

    if alert.startswith("dockercontainer"):
        selector = _selector({
            "host": host,
            "container_name": container,
        })
        if "memory" in alert:
            return (
                f"docker_container_memory_usage_bytes{selector} / "
                f"docker_container_memory_limit_bytes{selector}"
            )
        if "restart" in alert:
            return (
                f"increase(docker_container_inspect_restart_count{selector}[15m]) "
                f"or increase(docker_container_restart_count{selector}[15m])"
            )
        if "cpu" in alert:
            return f"docker_container_cpu_cores{selector}"
        return f"docker_container_info{selector}"

    if any(key in alert for key in ("host", "node")):
        # The instance label was previously embedded raw; validate it like
        # every other label so a stray quote/brace cannot break the query.
        instance = _safe_label_value(str(labels.get("instance") or ""))
        instance_selector = _selector({"instance": instance})
        if "memory" in alert or "oom" in alert:
            return f"node_memory_MemAvailable_bytes{instance_selector}"
        if "disk" in alert or "storage" in alert:
            return f"node_filesystem_avail_bytes{instance_selector}"
        if "load" in alert or "cpu" in alert:
            return f"node_load5{instance_selector}"
        return f"up{instance_selector}"

    # Prefix-match the pod name so replacement pods are still covered.
    pod_filter = f',pod=~"{pod_name}.*"' if pod_name else ""
    if any(key in alert for key in ("memory", "mem", "oom")):
        return (
            f'avg(container_memory_working_set_bytes{{namespace="{namespace}"{pod_filter}}}) '
            "/ 1048576"
        )
    if any(key in alert for key in ("cpu", "load", "throttl")):
        return f'avg(rate(container_cpu_usage_seconds_total{{namespace="{namespace}"{pod_filter}}}[5m]))'
    if any(key in alert for key in ("crash", "restart", "backoff")):
        return f'sum(increase(kube_pod_container_status_restarts_total{{namespace="{namespace}"}}[15m]))'
    if any(key in alert for key in ("http", "error", "5xx", "probe", "down", "unhealthy")):
        return "1 - avg(probe_success)"
    return f'up{{namespace="{namespace}"}}'
def _selector(labels: dict[str, str]) -> str:
    """Render non-empty label pairs as a ``{k="v",...}`` selector, or "" if none remain."""
    matchers = []
    for key, value in labels.items():
        if value:
            matchers.append(f'{key}="{value}"')
    if not matchers:
        return ""
    return "{" + ",".join(matchers) + "}"
def _short_instance(instance: Any) -> str:
    """Strip the ``:port`` suffix when the value has exactly one colon.

    Values with zero or several colons are returned unchanged (as text);
    a falsy input becomes "".
    """
    text = str(instance or "")
    if text.count(":") != 1:
        return text
    host_part, _sep, _port = text.rpartition(":")
    return host_part
def _safe_label_value(value: str) -> str:
    """Return the stripped value if it is 1-128 chars of ``[a-zA-Z0-9_.:-]``, else ""."""
    candidate = (value or "").strip()
    is_safe = re.fullmatch(r"[a-zA-Z0-9_.:-]{1,128}", candidate) is not None
    return candidate if is_safe else ""
async def _update_snapshot(
    snapshot: EvidenceSnapshot,
    post_state: dict[str, Any],
    result: str,
) -> None:
    """Back-fill the snapshot's post-execution state and verification result.

    Persistence errors are logged (with traceback) and swallowed.
    """
    try:
        await snapshot.update_post_execution(post_state, result)
    except Exception:
        logger.exception(
            "verifier_snapshot_update_failed",
            snapshot_id=snapshot.snapshot_id,
        )
async def _persist_fallback_snapshot(
    *,
    incident: "Incident",
    post_state: dict[str, Any],
    result: str,
    action_taken: str,
) -> None:
    """Persist the verifier outcome when no pre-decision snapshot is available.

    Without this, ``verify(snapshot=None)`` had no durable target and the
    verification result only appeared in logs. A fresh ``EvidenceSnapshot`` is
    created for the incident, explicitly marked as lacking a pre-execution
    baseline, and saved. Failures are logged and swallowed.
    """
    incident_id = _get_incident_id(incident)
    try:
        has_state = bool(post_state)
        record = EvidenceSnapshot(incident_id=incident_id)
        record.post_execution_state = post_state
        record.verification_result = result
        record.matched_playbook_id = _extract_playbook_id(action_taken)
        record.sensors_attempted = 1
        record.sensors_succeeded = 1 if has_state else 0
        record.mcp_health = {"post_execution_verifier": has_state}
        record.evidence_summary = (
            "[PostExecutionVerifier] fallback verification snapshot; "
            f"action={action_taken[:160]}; result={result}; "
            "pre_execution_state=missing"
        )
        await record.save()
    except Exception:
        logger.warning(
            "verifier_fallback_snapshot_save_failed",
            incident_id=incident_id,
            result=result,
            exc_info=True,
        )
    else:
        try:
            logger.info(
                "verifier_fallback_snapshot_saved",
                incident_id=incident_id,
                snapshot_id=record.snapshot_id,
                result=result,
                action=action_taken,
            )
        except Exception:
            # Same handling the original gave a failure at this point.
            logger.warning(
                "verifier_fallback_snapshot_save_failed",
                incident_id=incident_id,
                result=result,
                exc_info=True,
            )
def _extract_playbook_id(action_taken: str) -> str | None:
    """Pull the playbook id out of an ``auto_repair[_playbook]:<id>[:...]`` action.

    Returns None when the action has neither prefix or the id part is blank.
    """
    for prefix in ("auto_repair_playbook:", "auto_repair:"):
        if not action_taken.startswith(prefix):
            continue
        remainder = action_taken[len(prefix):]
        candidate, _sep, _rest = remainder.partition(":")
        return candidate.strip() or None
    return None
# ─────────────────────────────────────────────────────────────────────────────
# Singleton
# ─────────────────────────────────────────────────────────────────────────────
_verifier: PostExecutionVerifier | None = None


def get_post_execution_verifier() -> PostExecutionVerifier:
    """Return the module-wide PostExecutionVerifier, creating it on first use."""
    global _verifier
    instance = _verifier
    if instance is None:
        instance = _verifier = PostExecutionVerifier()
    return instance