feat(adr-081): Phase 1 感官縱深 — 8D 情報蒐集 + 執行後驗證

成品:
- IncidentEvidence DB model(8D 感官 + pre/post 執行狀態)
- EvidenceSnapshot dataclass(build_summary → LLM 上下文)
- SanitizationService(Prompt Injection 0-tolerance,12 pattern)
- MCPToolRegistry(動態工具登記,suggest_tools 不寫死告警類型)
- PreDecisionInvestigator(8D 並行感官,P99 < 8s,Redis 30s 快取)
- PostExecutionVerifier(warmup 10s → 後狀態評估 success/degraded/failed)
- decision_manager + approval_execution 接線(feature flag 守衛)

Gate 1 修復:D4/D5/D7/D8 補 sanitize_dict_values;移除裸 "error" failure
signal 防 error_rate key 誤判;evidence_snapshot rowcount 零行警告。

測試:130 passed(+111 新增)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-15 13:08:38 +08:00
parent db9e304a14
commit f1cbf6db7d
14 changed files with 2936 additions and 3 deletions

View File

@@ -0,0 +1,308 @@
"""
AWOOOI AIOps Phase 1 — 執行後驗證器
=====================================
每次 AI 修復動作執行後,主動用 MCP 抓取環境後狀態,
與 EvidenceSnapshot.pre_execution_state 對比,
判斷修復是否真的有效。
驗證結果三態:
- "success" — 問題已解決Pod Running / 指標恢復正常)
- "degraded" — 部分改善但未完全恢復
- "failed" — 執行後狀態比執行前更差,或完全未改善
- "timeout" — 驗證超時MCP 無法回應)
驗證結果用途:
1. 填入 EvidenceSnapshot.verification_resultPhase 3 學習閉環基礎)
2. 傳給 learning_service 更新 Playbook EWMA trust_score
3. 觸發 Reviewer Agent 的 rollback 決策Phase 2
設計原則:
- 執行後等待 warm-up period預設 10s讓 K8s controller 有時間收斂
- 超時不 raise標記 "timeout" 並繼續流程
- 不阻塞原始執行路徑await但結果不影響執行本身是否成功
ADR-081: PreDecisionInvestigator + EvidenceSnapshot
MASTER §3.1 L6×D1
2026-04-15 ogt + Claude Sonnet 4.6 (亞太): Phase 1 初始建立
"""
from __future__ import annotations
import asyncio
import time
from typing import TYPE_CHECKING, Any
import structlog
from src.services.evidence_snapshot import EvidenceSnapshot
from src.services.mcp_tool_registry import SensorDimension, get_mcp_tool_registry
from src.services.sanitization_service import sanitize_dict_values
if TYPE_CHECKING:
from src.models.incident import Incident
logger = structlog.get_logger(__name__)
# 執行後等待收斂時間(秒)— K8s controller 需要時間處理重啟/滾動更新
POST_EXEC_WARMUP_SEC = 10.0
# 驗證超時(秒)
VERIFY_TIMEOUT_SEC = 30.0
# MCP 單工具超時(秒)
TOOL_TIMEOUT_SEC = 8.0
class PostExecutionVerifier:
"""
執行後環境狀態驗證器。
在 approval_execution.py 的 execute_approved_action() 中,
執行動作後呼叫 verify(),取得驗證結果並補填 EvidenceSnapshot。
Usage:
verifier = get_post_execution_verifier()
result = await verifier.verify(
incident=incident,
snapshot=pre_decision_snapshot,
action_taken="restart_service:awoooi-api",
)
# result: "success" | "degraded" | "failed" | "timeout"
"""
def __init__(self) -> None:
self._registry = get_mcp_tool_registry()
async def verify(
self,
incident: "Incident",
snapshot: EvidenceSnapshot | None,
action_taken: str,
warmup_sec: float = POST_EXEC_WARMUP_SEC,
) -> str:
"""
執行後驗證。
Args:
incident: 原始 Incident用於取 labels 定位資源)
snapshot: 執行前的 EvidenceSnapshot取 pre_execution_state 作基準線)
action_taken: 執行的動作描述(例如 "restart_service:awoooi-api"
warmup_sec: 等待 K8s 收斂的秒數
Returns:
str: "success" | "degraded" | "failed" | "timeout"
"""
incident_id = _get_incident_id(incident)
logger.info(
"verifier_start",
incident_id=incident_id,
action=action_taken,
warmup_sec=warmup_sec,
)
# 1. 等待收斂
if warmup_sec > 0:
await asyncio.sleep(warmup_sec)
# 2. 抓後狀態
try:
post_state = await asyncio.wait_for(
self._collect_post_state(incident),
timeout=VERIFY_TIMEOUT_SEC,
)
except asyncio.TimeoutError:
logger.warning("verifier_timeout", incident_id=incident_id)
if snapshot:
await _update_snapshot(snapshot, {}, "timeout")
return "timeout"
except Exception:
logger.exception("verifier_collect_error", incident_id=incident_id)
if snapshot:
await _update_snapshot(snapshot, {}, "failed")
return "failed"
# 3. 對比前後狀態
pre_state = snapshot.pre_execution_state if snapshot else None
result = _assess_recovery(pre_state, post_state, action_taken)
# 4. 更新 EvidenceSnapshot
if snapshot:
await _update_snapshot(snapshot, post_state, result)
logger.info(
"verifier_done",
incident_id=incident_id,
result=result,
action=action_taken,
)
return result
async def capture_pre_execution_state(
self,
incident: "Incident",
snapshot: EvidenceSnapshot,
) -> None:
"""
執行前快照當前狀態,寫入 snapshot.pre_execution_state。
在 approval_execution.py 的動作執行「之前」呼叫。
"""
incident_id = _get_incident_id(incident)
try:
state = await asyncio.wait_for(
self._collect_post_state(incident), # 同樣的抓取邏輯
timeout=TOOL_TIMEOUT_SEC,
)
snapshot.pre_execution_state = state
logger.debug("verifier_pre_state_captured", incident_id=incident_id)
except Exception:
logger.warning("verifier_pre_state_failed", incident_id=incident_id)
snapshot.pre_execution_state = {}
async def _collect_post_state(self, incident: "Incident") -> dict[str, Any]:
"""
蒐集執行後環境狀態K8s Pod 狀態 + 關鍵指標)。
只選 D1K8s 狀態)和 D3指標作為驗證基準線
其他感官維度(日誌、拓撲等)在驗證時不必要。
"""
state: dict[str, Any] = {}
alertname = _get_alertname(incident)
labels = _get_labels(incident)
# 取 D1 + D3 工具
all_tools = self._registry.suggest_tools(alertname=alertname, incident_labels=labels)
verify_tools = [
t for t in all_tools
if any(d in (SensorDimension.D1_K8S_STATE, SensorDimension.D3_METRICS)
for d in t.dimensions)
]
params = {
"namespace": labels.get("namespace", "awoooi-prod"),
"pod_name": labels.get("pod", labels.get("name", "")),
"deployment": labels.get("deployment", ""),
"host": labels.get("instance", "").split(":")[0] or labels.get("host", ""),
}
async def _call_one(reg) -> tuple[str, Any]:
try:
result = await asyncio.wait_for(
reg.provider.execute(reg.tool.name, params),
timeout=TOOL_TIMEOUT_SEC,
)
if result.success and result.output:
return reg.tool.name, result.output
except Exception:
pass
return reg.tool.name, None
results = await asyncio.gather(*[_call_one(t) for t in verify_tools])
for tool_name, output in results:
if output is not None:
if isinstance(output, dict):
state[tool_name] = sanitize_dict_values(output, f"post_state.{tool_name}")
else:
state[tool_name] = {"raw": sanitize(str(output), f"post_state.{tool_name}")}
return state
# ─────────────────────────────────────────────────────────────────────────────
# Recovery Assessment
# ─────────────────────────────────────────────────────────────────────────────
def _assess_recovery(
pre_state: dict[str, Any] | None,
post_state: dict[str, Any],
action_taken: str,
) -> str:
"""
評估修復效果。
Phase 1 使用啟發式規則(基於 K8s Pod 狀態字串判斷)。
Phase 4 將改用動態基線Holt-Winters 偏差量),不再用靜態閾值。
HeuristicsPhase 1 版本):
- post_state 含 Running → success
- post_state 含 CrashLoopBackOff / Error / OOMKilled → failed
- post_state 為空MCP 無回應)→ degraded
- pre_state 與 post_state 完全相同 → degraded未改變
"""
if not post_state:
return "degraded"
# 轉為字串做啟發式掃描
post_str = str(post_state).lower()
pre_str = str(pre_state).lower() if pre_state else ""
# 失敗信號Gate 1 fix: 移除裸 "error" — 會誤觸 error_rate/error_count 等指標 key
# "error" 作為 K8s ContainerState reason 由 "failed" Pod phase 間接覆蓋
failure_signals = ["crashloopbackoff", "oomkilled", "oomkill", "failed"]
if any(sig in post_str for sig in failure_signals):
return "failed"
# 成功信號
success_signals = ["running", "ready", "1/1", "2/2", "3/3", "healthy"]
if any(sig in post_str for sig in success_signals):
# 但如果 pre_state 已經是 running可能是無效操作
if pre_str and any(sig in pre_str for sig in success_signals):
# 如果執行的是 restart即使 pre/post 都 Running 也算 success
if "restart" in action_taken.lower() or "delete" in action_taken.lower():
return "success"
return "degraded"
return "success"
# 前後無變化
if pre_str and post_str == pre_str:
return "degraded"
return "degraded"
# ─────────────────────────────────────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────────────────────────────────────
def _get_incident_id(incident: "Incident") -> str:
return incident.incident_id if hasattr(incident, "incident_id") else str(incident.id)
def _get_alertname(incident: "Incident") -> str:
if incident.signals:
return incident.signals[0].labels.get("alertname", "")
return ""
def _get_labels(incident: "Incident") -> dict[str, Any]:
if incident.signals:
return incident.signals[0].labels
return {}
async def _update_snapshot(
snapshot: EvidenceSnapshot,
post_state: dict[str, Any],
result: str,
) -> None:
"""補填 EvidenceSnapshot 的 post_execution_state + verification_result。"""
try:
await snapshot.update_post_execution(post_state, result)
except Exception:
logger.exception("verifier_snapshot_update_failed", snapshot_id=snapshot.snapshot_id)
# ─────────────────────────────────────────────────────────────────────────────
# Singleton
# ─────────────────────────────────────────────────────────────────────────────
_verifier: PostExecutionVerifier | None = None
def get_post_execution_verifier() -> PostExecutionVerifier:
"""取得 PostExecutionVerifier Singleton。"""
global _verifier
if _verifier is None:
_verifier = PostExecutionVerifier()
return _verifier