"""
AWOOOI AIOps — Self-Healing Quality Validator
=============================================

W2 PR-V1: flywheel broken-link C6 fix — connects PostExecutionVerifier to a
self-healing quality assessment.

Responsibilities:
1. Assess whether the system really "self-healed" (root cause removed vs. a
   metric that only recovered temporarily).
2. Regression detection (one indicator fixed while others got worse).
3. Remediation quality score (0.0 ~ 1.0).

Scoring logic:
- base_score is chosen by verification_result
  (success=1.0 / degraded=0.4 / failed=0.0 / timeout=0.2).
- regression_penalty depends on how many indicators got worse between the
  pre and post state.
- final score = max(0.0, base_score - regression_penalty).

Thresholds (design notes; not enforced by the code in this module):
- score < 0.5  -> rollback proposal (Telegram alert, never auto-executed).
- score >= 0.5 -> self-healing accepted, no further action.

Design principles (design notes; the wrapping happens outside this module):
- Do not modify the internal logic of self_healing_validator (this is an
  external glue layer).
- A validation failure must not block the main flow (callers wrap it in a
  tolerant try/except).
- Feature flag: ENABLE_SELF_HEALING_VALIDATOR=false (off by default).

ADR-081 Phase 1 extension
2026-04-28 ogt + Claude Sonnet 4.6: W2 PR-V1 initial version (C6 fix)
"""

from __future__ import annotations

import re
from typing import TYPE_CHECKING, Any

import structlog

if TYPE_CHECKING:
    pass

logger = structlog.get_logger(__name__)

# Base remediation-quality score, keyed by verification_result.
_BASE_SCORES: dict[str, float] = {
    "success": 1.0,
    "degraded": 0.4,
    "failed": 0.0,
    "timeout": 0.2,
}

# Score deducted for each indicator that got worse.
_REGRESSION_PENALTY_PER_METRIC = 0.15

# Upper bound of the total deduction (avoids over-penalising).
_MAX_REGRESSION_PENALTY = 0.4

# Root-cause-cleared signals: any of these appearing in post_state is taken
# as evidence that the root cause is gone.
_ROOT_CAUSE_CLEARED_SIGNALS = ["running", "ready", "1/1", "2/2", "3/3", "healthy"]

# Regression signals: present in post_state but absent from pre_state.
_REGRESSION_SIGNALS = [
    "crashloopbackoff",
    "oomkilled",
    "oomkill",
    "pending",
    "terminating",
    "error",
    "failed",
    "timeout",
    "evicted",
    "imagepullbackoff",
    "errimagepull",
]

# Numeric indicator regression: a relative increase above 20% counts.
_NUMERIC_THRESHOLD_RATIO = 0.2

# Matches "key: N%", "key=N%" and "key N%".  The optional quotes let it also
# match the text produced by str(dict), e.g. "'cpu_usage': '85%'", where the
# key and the value are each wrapped in quotes.
_PERCENT_PATTERN = re.compile(r"(\w+)['\"]?[:\s=]+['\"]?(\d+(?:\.\d+)?)\s*%")


def assess_self_healing(
    pre_state: dict[str, Any] | None,
    post_state: dict[str, Any] | None,
    verification_result: str,
    action_taken: str,
) -> dict[str, Any]:
    """
    Assess self-healing quality and return a structured result.

    Both states are lower-cased string renderings of the given dicts; all
    detection below is substring / regex matching on those renderings.

    Args:
        pre_state: Environment state before execution (may be None or empty,
            in which case it is treated as an empty string).
        post_state: Environment state after execution (may be None or empty).
        verification_result: Verdict from PostExecutionVerifier
            (success/degraded/failed/timeout). Unknown values score 0.0.
        action_taken: Description of the executed action; echoed back as-is.

    Returns:
        dict with:
            score (float, 0.0-1.0, rounded to 4 decimals)
            root_cause_cleared (bool)
            regressions (list[str] — names of the indicators that got worse)
            detail (str — human-readable explanation)
            verification_result (str — echoed input)
            action_taken (str — echoed input)
    """
    base_score = _BASE_SCORES.get(verification_result, 0.0)

    pre_str = str(pre_state).lower() if pre_state else ""
    post_str = str(post_state).lower() if post_state else ""

    # 1. Was the root cause really removed?  A failed or timed-out
    #    verification overrides any healthy-looking signal.
    root_cause_cleared = verification_result not in ("failed", "timeout") and any(
        sig in post_str for sig in _ROOT_CAUSE_CLEARED_SIGNALS
    )

    # 2. Regression detection — bad signals that are new in the post state.
    regressions = _detect_signal_regressions(pre_str, post_str)

    # 3. Numeric regression detection — percentages that grew noticeably.
    regressions.extend(_detect_metric_regressions(pre_str, post_str))

    # 4. Final score: capped penalty, never below zero.
    regression_penalty = min(
        len(regressions) * _REGRESSION_PENALTY_PER_METRIC,
        _MAX_REGRESSION_PENALTY,
    )
    score = max(0.0, base_score - regression_penalty)

    # 5. Human-readable explanation (at most five regressions are listed).
    detail_parts = [f"base={base_score:.2f}"]
    if regressions:
        detail_parts.append(
            f"regression_penalty={regression_penalty:.2f} ({','.join(regressions[:5])})"
        )
    if not root_cause_cleared and verification_result == "success":
        detail_parts.append("root_cause_unclear")
    detail = "; ".join(detail_parts)

    return {
        "score": round(score, 4),
        "root_cause_cleared": root_cause_cleared,
        "regressions": regressions,
        "detail": detail,
        "verification_result": verification_result,
        "action_taken": action_taken,
    }


def _detect_signal_regressions(pre_str: str, post_str: str) -> list[str]:
    """Return regression signals found in post_str but not in pre_str.

    Matching is by substring, so a short signal contained in a longer one
    ("oomkill" inside "oomkilled") would otherwise be reported twice for a
    single event and double the penalty.  Such contained signals are dropped
    when the longer signal has also matched.
    """
    matched = [
        sig for sig in _REGRESSION_SIGNALS if sig in post_str and sig not in pre_str
    ]
    return [
        sig
        for sig in matched
        if not any(sig != other and sig in other for other in matched)
    ]


def _detect_metric_regressions(pre_str: str, post_str: str) -> list[str]:
    """Return "metric_increase:<key>" for percentages that rose past the threshold.

    Only keys present in both states are compared.  A pre value of 0 is
    skipped because the relative increase is undefined (division by zero).
    """
    pre_nums = _extract_percentages(pre_str)
    post_nums = _extract_percentages(post_str)

    worsened: list[str] = []
    for key, pre_val in pre_nums.items():
        post_val = post_nums.get(key)
        if post_val is None or pre_val <= 0:
            continue
        if (post_val - pre_val) / pre_val > _NUMERIC_THRESHOLD_RATIO:
            worsened.append(f"metric_increase:{key}")
    return worsened


def _extract_percentages(text: str) -> dict[str, float]:
    """
    Extract percentage values from a state string.

    For example "cpu_usage: 85%" -> {"cpu_usage": 85.0}; the quoted form that
    str(dict) produces, "'cpu_usage': '85%'", yields the same result.
    When a key occurs more than once, the last occurrence wins.

    Used for indicator regression detection (simple heuristic, Phase 1).
    """
    return {
        match.group(1): float(match.group(2))
        for match in _PERCENT_PATTERN.finditer(text)
    }