# awoooi/apps/api/src/services/auto_repair_service.py

"""
Auto Repair Service - #8 自動升級決策
=====================================
高品質 Playbook 自動修復執行

Phase 8: 自動化層實作
建立時間: 2026-03-26 17:30 (台北時區)
建立者: Claude Code (#8 自動升級決策)

遵循 leWOOOgo 積木化原則:
- Service 層只依賴 Repository/Service Interface
- 不直接存取 Redis/DB
- 封裝所有自動修復邏輯

觸發條件 (AND):
1. 有匹配的高品質 Playbook (is_high_quality = True)
2. Playbook 中的動作風險等級 <= MEDIUM
3. Incident 嚴重度 <= P2

安全邊界:
- HIGH/CRITICAL 風險動作永遠需要人工審核
- P0/P1 嚴重度 Incident 需要人工確認
"""

from dataclasses import dataclass
from collections.abc import Callable
from typing import Protocol

import structlog

from src.models.incident import Incident, Severity
from src.models.playbook import (
    ActionType,
    Playbook,
    PlaybookStatus,
    RiskLevel,
    SymptomPattern,
)
from src.services.anomaly_counter import AnomalyFrequency, get_anomaly_counter
from src.services.executor import get_executor
from src.services.global_repair_cooldown import (
    check_global_repair_cooldown,
    record_global_repair_action,
)
# Sprint 5.1: Service Registry Guardrail (ADR-062)
from src.services.service_registry import StatefulLevel, get_service_registry
from src.services.playbook_service import IPlaybookService, get_playbook_service

logger = structlog.get_logger(__name__)


# =============================================================================
# Types
# =============================================================================


@dataclass
class AutoRepairDecision:
    """Outcome of evaluating whether an Incident may be repaired automatically."""

    # True only when every gate passed and the Playbook may be executed.
    can_auto_repair: bool
    # Best-matching Playbook; set on approval and on rejections that happen
    # after a match was found, None otherwise.
    playbook: Playbook | None = None
    # Human-readable explanation of the decision.
    reason: str = ""
    # Highest step risk level of the Playbook on approval; stays at the
    # default on rejections.
    risk_level: RiskLevel = RiskLevel.MEDIUM
    # Machine-readable block code (e.g. GLOBAL_GUARDRAIL, HIGH_SEVERITY,
    # NO_MATCH); None when the repair was not blocked.
    blocked_by: str | None = None
    # 2026-04-07 Claude Code: Sprint 4 B2 - tracks first-time (cold start) trust.
    is_cold_start: bool = False
    # 2026-04-08 Claude Code: forwarded to execute_auto_repair for the DB record.
    similarity_score: float | None = None


@dataclass
class AutoRepairResult:
    """Result of one automatic repair run."""

    # True when every repair step ran without raising.
    success: bool
    # Identifier of the Playbook that was executed.
    playbook_id: str
    # Identifier of the Incident the repair targeted.
    incident_id: str
    # One human-readable summary line per step that was executed.
    executed_steps: list[str]
    # String form of the exception that aborted the run; None on success.
    error: str | None = None
    # Wall-clock duration of the run in milliseconds.
    execution_time_ms: int = 0


# =============================================================================
# Auto Repair Service Interface
# =============================================================================


class IAutoRepairService(Protocol):
    """Structural interface of the auto-repair service."""

    async def evaluate_auto_repair(
        self,
        incident: Incident,
    ) -> AutoRepairDecision:
        """
        Evaluate whether the Incident can be repaired automatically.

        Args:
            incident: The Incident to evaluate.

        Returns:
            AutoRepairDecision: The decision.
        """
        ...

    async def execute_auto_repair(
        self,
        incident: Incident,
        playbook: Playbook,
    ) -> AutoRepairResult:
        """
        Execute the automatic repair.

        Args:
            incident: The Incident to repair.
            playbook: The Playbook to execute.

        Returns:
            AutoRepairResult: The execution result.
        """
        ...


# =============================================================================
# Auto Repair Service Implementation
# =============================================================================


class AutoRepairService:
    """
    Auto-repair service implementation.

    Responsibilities:
    - Evaluate whether an Incident can be repaired automatically
    - Execute high-quality Playbooks
    - Update execution statistics
    """

    # === Safety boundary constants ===
    # 2026-04-07 Claude Code: commander directive "switch everything straight to auto repair".
    # The similarity/quality/risk thresholds were removed; only the P0/P1 severity block remains.
    MAX_AUTO_REPAIR_RISK = RiskLevel.MEDIUM  # kept for future reference; no longer used for blocking
    MAX_AUTO_REPAIR_SEVERITY = Severity.P2  # P0/P1 still require human review
    MIN_SIMILARITY_SCORE = 0.0  # threshold disabled
    COLD_START_TRUST_MAX_EXECUTIONS = 3  # kept for reference
    COLD_START_TRUST_DAILY_LIMIT = 5    # kept for reference; read by _check_cold_start_daily_limit

    def __init__(
        self,
        playbook_service: IPlaybookService | None = None,
        cooldown_checker: Callable | None = None,
    ):
        # 2026-04-01 ogt: 注入 cooldown_checker 支援測試隔離 (DI 原則)
        self._playbook_service = playbook_service or get_playbook_service()
        self._cooldown_checker = cooldown_checker or check_global_repair_cooldown
        # 2026-04-04 Claude Code: Phase 25 P1 — 持有 runbook_generator task 引用，防 GC 回收
        import asyncio
        self._pending_tasks: set[asyncio.Task] = set()

    async def drain_pending_tasks(self, timeout: float = 60.0) -> dict:
        """K8s rolling restart 時優雅等待所有背景任務完成。

        # 2026-04-27 Wave8-X3 by Claude — B25/B26 drain fix
        在 lifespan shutdown 中呼叫，確保 _verify_and_learn / runbook_generator
        等 fire-and-forget task 在 SIGTERM 後仍有機會寫入 trust_score / runbook。
        """
        import asyncio as _asyncio

        if not self._pending_tasks:
            return {"drained": 0, "timeout": False}

        pending_count = len(self._pending_tasks)
        logger.info(
            "auto_repair_draining_pending_tasks",
            count=pending_count,
            timeout=timeout,
        )

        try:
            done, still_pending = await _asyncio.wait(
                self._pending_tasks,
                timeout=timeout,
                return_when=_asyncio.ALL_COMPLETED,
            )
            return {
                "drained": len(done),
                "still_pending": len(still_pending),
                "timeout": len(still_pending) > 0,
            }
        except Exception as e:
            logger.exception("drain_pending_tasks_failed", error=str(e))
            return {"drained": 0, "still_pending": pending_count, "error": str(e)}

    async def evaluate_auto_repair(
        self,
        incident: Incident,
    ) -> AutoRepairDecision:
        """
        Decide whether *incident* may be repaired automatically.

        Gates, in the order they are applied:
        0.   Global cooldown check (ADR-039).
        0.5  Service Registry guardrail (ADR-062): a BLOCK-level service, or
             any error while consulting the registry, rejects the repair.
        1.   Incident severity: P0/P1 require human review.
        2.   Anti-pattern gate: a failure recorded for the same symptoms
             hash within the last 7 days rejects the repair; errors in this
             gate are logged and ignored.
        3.   Playbook lookup: at least one recommendation must exist.
        4.   The best recommendation must have status APPROVED.
        5.   Host/backup incidents must not run a Playbook with K8s steps.

        The similarity, high-quality and risk-level gates were removed on
        2026-04-07 and are not enforced here.

        Args:
            incident: The Incident to evaluate.

        Returns:
            AutoRepairDecision: ``can_auto_repair`` is True only when every
            gate passed; otherwise ``blocked_by`` names the gate that
            rejected the repair.
        """
        logger.info(
            "auto_repair_evaluate_start",
            incident_id=incident.incident_id,
            severity=incident.severity.value if incident.severity else None,
        )

        # 0. Global cooldown check (ADR-039, highest priority)
        can_repair, cooldown_reason = await self._cooldown_checker(
            incident_id=incident.incident_id,
            affected_services=incident.affected_services or [],
        )
        if not can_repair:
            logger.warning(
                "auto_repair_blocked_global_cooldown",
                incident_id=incident.incident_id,
                reason=cooldown_reason,
            )
            return AutoRepairDecision(
                can_auto_repair=False,
                reason=cooldown_reason,
                blocked_by="GLOBAL_GUARDRAIL",
            )

        # 0.5 Sprint 5.1 Guardrail: Service Registry service-tier check
        # (2026-04-08 Claude Sonnet 4.6 Asia/Taipei, ADR-062)
        # Runs after the global cooldown and before the severity check; the
        # BLOCK level is rejected outright.
        # Conservative principle: a registry read failure also blocks
        # (safety first, never let the repair through).
        # NOTE(review): this check prefers incident.target_resource, while
        # _execute_step substitutes {target} with affected_services[0];
        # confirm both always name the same service.
        try:
            _registry = get_service_registry()
            _service_name = (incident.target_resource or "") if hasattr(incident, "target_resource") else ""
            if not _service_name and incident.affected_services:
                _service_name = incident.affected_services[0]
            _stateful_level = _registry.get_stateful_level(_service_name)
            if _stateful_level == StatefulLevel.BLOCK:
                logger.warning(
                    "auto_repair_blocked_guardrail",
                    incident_id=incident.incident_id,
                    service_name=_service_name,
                    stateful_level="BLOCK",
                )
                return AutoRepairDecision(
                    can_auto_repair=False,
                    reason=f"GUARDRAIL_BLOCK: 服務 '{_service_name}' 屬於禁止自動修復清單（資料安全，見 service-registry.yaml）",
                    blocked_by="SERVICE_REGISTRY_BLOCK",
                )
        except Exception as _guardrail_err:
            # S1-3 fix: reject conservatively when the registry fails, never
            # fall through (ADR-062 review fix, 2026-04-08)
            logger.error("guardrail_check_failed", error=str(_guardrail_err))
            return AutoRepairDecision(
                can_auto_repair=False,
                reason="Guardrail Service Registry 讀取異常，保守拒絕自動修復",
                blocked_by="GUARDRAIL_ERROR",
            )

        # 1. Check the Incident severity
        if incident.severity and incident.severity.value in ["P0", "P1"]:
            logger.info(
                "auto_repair_blocked_severity",
                incident_id=incident.incident_id,
                severity=incident.severity.value,
            )
            return AutoRepairDecision(
                can_auto_repair=False,
                reason=f"Incident 嚴重度 {incident.severity.value} 需要人工審核",
                blocked_by="HIGH_SEVERITY",
            )

        # 2. Extract the symptom pattern
        symptoms = self._extract_symptoms(incident)

        # 2.1 2026-04-04 Claude Code: Phase 25 P1 - anti-pattern gate
        # Look up failures from the last 7 days by the symptoms hash so the
        # same failing repair is not attempted again.
        try:
            from src.services.knowledge_service import get_knowledge_service
            symptoms_hash = symptoms.compute_hash()
            anti_patterns = await get_knowledge_service().check_anti_pattern(
                symptoms_hash, days=7
            )
            if anti_patterns:
                ap = anti_patterns[0]
                logger.warning(
                    "auto_repair_blocked_anti_pattern",
                    incident_id=incident.incident_id,
                    symptoms_hash=symptoms_hash,
                    anti_pattern_id=ap.id,
                    anti_pattern_title=ap.title,
                )
                return AutoRepairDecision(
                    can_auto_repair=False,
                    reason=f"過去 7 天有失敗案例: {ap.title}",
                    blocked_by="ANTI_PATTERN",
                )
        except Exception as _ap_e:
            # A failing anti-pattern gate must not block the main flow (log only)
            logger.warning("anti_pattern_gate_error", error=str(_ap_e))
            # NOTE(review): symptoms_hash is not read again below, so this
            # reset looks like a dead store.
            symptoms_hash = ""

        # 3. Find matching Playbooks
        recommendations = await self._playbook_service.get_recommendations(
            symptoms=symptoms,
            top_k=3,
        )

        if not recommendations:
            logger.info(
                "auto_repair_no_playbook_match",
                incident_id=incident.incident_id,
            )
            return AutoRepairDecision(
                can_auto_repair=False,
                reason="未找到匹配的 Playbook",
                blocked_by="NO_MATCH",
            )

        # 4. Inspect the best match
        best_match = recommendations[0]

        # 2026-04-07 Claude Code: commander directive "switch everything straight to auto repair"
        # Removed: similarity threshold, is_high_quality threshold, cold-start
        # mechanism, risk-level threshold.
        # Any matched Playbook that is APPROVED is executed directly.
        # max_risk is only reported on the approved decision; it blocks nothing.
        max_risk = self._get_max_risk_level(best_match.playbook)
        _is_cold_start = False

        # The one remaining requirement: the Playbook must be APPROVED
        if best_match.playbook.status != PlaybookStatus.APPROVED:
            return AutoRepairDecision(
                can_auto_repair=False,
                playbook=best_match.playbook,
                reason=f"Playbook 狀態為 {best_match.playbook.status.value}，必須是 APPROVED",
                blocked_by="NOT_APPROVED",
            )

        # Host/backup alerts must never run K8s steps.
        if self._is_host_or_backup_incident(incident) and self._playbook_has_k8s_steps(best_match.playbook):
            logger.warning(
                "auto_repair_blocked_host_backup_k8s_playbook",
                incident_id=incident.incident_id,
                playbook_id=best_match.playbook.playbook_id,
                alert_category=getattr(incident, "alert_category", None),
            )
            return AutoRepairDecision(
                can_auto_repair=False,
                playbook=best_match.playbook,
                reason=(
                    "主機/備份類告警禁止執行 K8s Playbook；"
                    "需改走 SSH 診斷或緊急介入"
                ),
                blocked_by="HOST_BACKUP_K8S_PLAYBOOK",
            )

        # 5. Auto repair is allowed
        logger.info(
            "auto_repair_approved",
            incident_id=incident.incident_id,
            playbook_id=best_match.playbook.playbook_id,
            similarity=best_match.similarity_score,
            success_rate=best_match.playbook.success_rate,
        )

        return AutoRepairDecision(
            can_auto_repair=True,
            playbook=best_match.playbook,
            reason=f"匹配 Playbook: {best_match.playbook.name} (相似度 {best_match.similarity_score:.0%})",
            risk_level=max_risk,
            is_cold_start=_is_cold_start,
            similarity_score=best_match.similarity_score,
        )

    async def execute_auto_repair(
        self,
        incident: Incident,
        playbook: Playbook,
        is_cold_start: bool = False,
        similarity_score: float | None = None,
    ) -> AutoRepairResult:
        """
        Run every repair step of *playbook* against *incident* and record the outcome.

        Flow:
        1. Count the repair towards the global cooldown (ADR-039).
        2. Execute ``playbook.repair_steps`` in order; the first step that
           raises aborts the run and marks it as failed.
        3. Best-effort bookkeeping: Playbook statistics, the execution audit
           record in the DB, the Prometheus metric and, on success, the
           disposition type (Sprint 4 B1/B2).
        4. Fire-and-forget follow-ups: AUTO_RUNBOOK generation plus
           post-execution verification/learning on success, ANTI_PATTERN
           generation on failure.

        Only the steps themselves decide success. A bookkeeping error is
        logged and never turns a successful repair into a failed one, nor
        does it prevent the failure path from writing its audit record and
        returning a result.

        Args:
            incident: The Incident being repaired.
            playbook: The Playbook whose steps are executed.
            is_cold_start: When True the run is recorded as
                ``cold_start_trust`` instead of ``auto_repair``.
            similarity_score: Match score stored in the audit record.

        Returns:
            AutoRepairResult: ``success`` is False and ``error`` is set when
            a step raised; ``executed_steps`` lists the steps that completed.

        Raises:
            Exception: Whatever ``record_global_repair_action`` raises; it
                runs before any step and is not guarded.
        """
        import time

        start_time = time.perf_counter()
        executed_steps: list[str] = []

        logger.info(
            "auto_repair_execute_start",
            incident_id=incident.incident_id,
            playbook_id=playbook.playbook_id,
            steps_count=len(playbook.repair_steps),
        )

        # ADR-039: count this repair for the global cooldown check.
        await record_global_repair_action()

        try:
            # 2026-04-07 Claude Code: commander directive - the step-level
            # risk gate was removed, every step is executed directly.
            for step in playbook.repair_steps:
                step_result = await self._execute_step(incident, step)
                # _playbook_has_k8s_steps treats step.command as optional, so
                # the summary line must not crash on a missing command.
                executed_steps.append(
                    f"Step {step.step_number}: {(step.command or '')[:50]}... -> {step_result}"
                )
        except Exception as e:
            await self._record_playbook_outcome(playbook, success=False)

            execution_time = int((time.perf_counter() - start_time) * 1000)

            logger.error(
                "auto_repair_execute_failed",
                incident_id=incident.incident_id,
                playbook_id=playbook.playbook_id,
                error=str(e),
            )

            fail_result = AutoRepairResult(
                success=False,
                playbook_id=playbook.playbook_id,
                incident_id=incident.incident_id,
                executed_steps=executed_steps,
                error=str(e),
                execution_time_ms=execution_time,
            )

            # 2026-04-08 Claude Code: failures must be written to the DB too.
            await self._write_execution_audit(
                incident,
                playbook,
                success=False,
                executed_steps=executed_steps,
                execution_time_ms=execution_time,
                is_cold_start=is_cold_start,
                similarity_score=similarity_score,
                error_message=str(e),
            )

            self._record_auto_repair_metric(playbook, success=False)

            # 2026-04-04 Claude Code: Phase 25 P1 - generate an ANTI_PATTERN
            # in the background after a failed repair.
            self._spawn_knowledge_task(incident, playbook, fail_result)

            return fail_result

        await self._record_playbook_outcome(playbook, success=True)

        execution_time = int((time.perf_counter() - start_time) * 1000)

        logger.info(
            "auto_repair_execute_success",
            incident_id=incident.incident_id,
            playbook_id=playbook.playbook_id,
            executed_steps=len(executed_steps),
            execution_time_ms=execution_time,
        )

        repair_result = AutoRepairResult(
            success=True,
            playbook_id=playbook.playbook_id,
            incident_id=incident.incident_id,
            executed_steps=executed_steps,
            execution_time_ms=execution_time,
        )

        # 2026-04-08 Claude Code: commander directive - every operation must
        # be recorded in the database.
        await self._write_execution_audit(
            incident,
            playbook,
            success=True,
            executed_steps=executed_steps,
            execution_time_ms=execution_time,
            is_cold_start=is_cold_start,
            similarity_score=similarity_score,
        )

        self._record_auto_repair_metric(playbook, success=True)

        # 2026-04-07 Claude Code: Sprint 4 B1/B2 - record the disposition type.
        await self._record_disposition(incident, is_cold_start)

        # 2026-04-04 Claude Code: Phase 25 P1 - generate an AUTO_RUNBOOK in
        # the background after a successful repair.
        self._spawn_knowledge_task(incident, playbook, repair_result)

        # 2026-04-26 Wave4 P1.3+P1.4 - post-execution verification plus
        # learning feedback run in the background; per the original note the
        # verifier has a warm-up and timeout and must not block this path.
        self._spawn_verification_task(incident, playbook)

        return repair_result

    async def _record_playbook_outcome(self, playbook: Playbook, *, success: bool) -> None:
        """Best-effort update of the Playbook execution statistics."""
        try:
            await self._playbook_service.record_execution(
                playbook_id=playbook.playbook_id,
                success=success,
            )
        except Exception as stats_err:
            logger.warning(
                "auto_repair_record_execution_failed",
                playbook_id=playbook.playbook_id,
                success=success,
                error=str(stats_err),
            )

    async def _write_execution_audit(
        self,
        incident: Incident,
        playbook: Playbook,
        *,
        success: bool,
        executed_steps: list[str],
        execution_time_ms: int,
        is_cold_start: bool,
        similarity_score: float | None,
        error_message: str | None = None,
    ) -> None:
        """Best-effort write of the execution audit record; errors are only logged."""
        try:
            from src.repositories.audit_log_repository import get_auto_repair_execution_repository

            max_risk = self._get_max_risk_level(playbook)
            record = {
                "incident_id": incident.incident_id,
                "playbook_id": playbook.playbook_id,
                "playbook_name": playbook.name,
                "success": success,
                "executed_steps": executed_steps,
                "triggered_by": "cold_start_trust" if is_cold_start else "auto_repair",
                "similarity_score": similarity_score,
                "risk_level": max_risk.value if max_risk else None,
                "execution_time_ms": execution_time_ms,
            }
            # Only failures carry an error message; successes omit the key so
            # the repository applies its own default.
            if error_message is not None:
                record["error_message"] = error_message
            await get_auto_repair_execution_repository().create(**record)
        except Exception as _db_e:
            logger.error("auto_repair_db_write_failed", error=str(_db_e))

    async def _record_disposition(self, incident: Incident, is_cold_start: bool) -> None:
        """Best-effort record of how the anomaly behind *incident* was handled."""
        try:
            from src.services.anomaly_counter import get_anomaly_counter

            counter = get_anomaly_counter()
            anomaly_key = self._derive_anomaly_key(incident)
            if anomaly_key:
                disposition_type = "cold_start_trust" if is_cold_start else "auto_repair"
                await counter.record_disposition(anomaly_key, disposition_type)
        except Exception as _disp_e:
            logger.warning("disposition_record_failed", error=str(_disp_e))

    def _track_background_task(self, task) -> None:
        """Hold a strong reference to a fire-and-forget task until it finishes."""
        pending = getattr(self, "_pending_tasks", None)
        if pending is None:
            # Instance built without __init__; nothing to track the task in.
            return
        pending.add(task)
        task.add_done_callback(pending.discard)

    def _spawn_knowledge_task(
        self,
        incident: Incident,
        playbook: Playbook,
        result: AutoRepairResult,
    ) -> None:
        """Start background generation of an AUTO_RUNBOOK (success) or ANTI_PATTERN (failure)."""
        failure_event = (
            "runbook_generator_task_failed" if result.success else "anti_pattern_task_failed"
        )
        try:
            from src.services.runbook_generator import get_runbook_generator
            import asyncio as _asyncio

            symptoms = self._extract_symptoms(incident)
            symptoms_hash = symptoms.compute_hash()
            gen = get_runbook_generator()
            produce = gen.generate_runbook if result.success else gen.generate_anti_pattern
            task = _asyncio.create_task(produce(incident, playbook, result, symptoms_hash))
            self._track_background_task(task)
        except Exception as spawn_err:
            logger.warning(failure_event, error=str(spawn_err))

    def _spawn_verification_task(self, incident: Incident, playbook: Playbook) -> None:
        """Start the background post-execution verification and learning feedback."""
        try:
            import asyncio as _asyncio
            from src.services.post_execution_verifier import get_post_execution_verifier
            from src.services.learning_service import get_learning_service

            action_taken = f"auto_repair:{playbook.playbook_id}"
            # Resolve both services up front so a lookup failure is reported
            # as a setup failure rather than inside the background task.
            verifier = get_post_execution_verifier()
            learning = get_learning_service()
            task = _asyncio.create_task(
                self._verify_and_learn(incident, playbook, action_taken, verifier, learning)
            )
            self._track_background_task(task)
        except Exception as _vl_e:
            logger.warning("auto_repair_verifier_setup_failed", error=str(_vl_e))

    async def _verify_and_learn(
        self,
        incident: Incident,
        playbook: Playbook,
        action_taken: str,
        verifier,
        learning,
    ) -> None:
        """Verify the repair, feed the result to learning, then escalate or roll back on failure."""
        try:
            verification_result = await verifier.verify(
                incident=incident,
                snapshot=None,
                action_taken=action_taken,
            )
            await learning.record_verification_result(
                incident_id=incident.incident_id,
                action_taken=action_taken,
                verification_result=verification_result,
                matched_playbook_id=playbook.playbook_id,
            )
            logger.info(
                "auto_repair_verify_and_learn_done",
                incident_id=incident.incident_id,
                playbook_id=playbook.playbook_id,
                verification_result=verification_result,
            )

            # 2026-04-27 P3.1-T1 - a failed or degraded verification triggers
            # either an emergency escalation or an automatic rollback.
            if verification_result not in ("failed", "degraded"):
                return
            if self._should_escalate_failed_verification(incident, playbook):
                await self._escalate_failed_verification(
                    incident=incident,
                    playbook=playbook,
                    verification_result=verification_result,
                )
                return
            await self._attempt_auto_rollback(incident, verification_result)
        except Exception as _inner_e:
            logger.warning(
                "auto_repair_verify_and_learn_failed",
                incident_id=incident.incident_id,
                error=str(_inner_e),
            )

    async def _attempt_auto_rollback(self, incident: Incident, verification_result) -> None:
        """Trigger a rollback for the first affected service and record the outcome metric."""
        try:
            from src.services.rollback_manager import get_rollback_manager
            from src.services.declarative_remediation import DeclarativeRemediation
            from src.core.metrics import ROLLBACK_EXECUTED_TOTAL

            # Derive target / namespace / action from the Incident.
            _rb_target = (incident.affected_services or ["unknown"])[0]
            _rb_ns = "awoooi-prod"
            _rb_action = f"kubectl rollout restart deployment/{_rb_target} -n {_rb_ns}"
            _spec = DeclarativeRemediation().evaluate(
                action=_rb_action,
                target=_rb_target,
                namespace=_rb_ns,
            )
            rollback_mgr = get_rollback_manager()
            rollback_result = await rollback_mgr.trigger(
                incident_id=incident.incident_id,
                spec=_spec,
                verification_result=verification_result,
            )
            _rb_status = "success" if rollback_result.success else "failed"
            if rollback_result.convergence_confirmed:
                _rb_reason = "converged"
            elif rollback_result.error and "revision" in rollback_result.error:
                _rb_reason = "no_previous_revision"
            else:
                _rb_reason = "error"
            ROLLBACK_EXECUTED_TOTAL.labels(
                status=_rb_status, reason=_rb_reason
            ).inc()
            logger.info(
                "auto_rollback_triggered",
                incident_id=incident.incident_id,
                rollback_success=rollback_result.success,
                convergence_confirmed=rollback_result.convergence_confirmed,
                rollback_error=rollback_result.error,
            )
        except Exception as _rb_e:
            logger.exception(
                "auto_rollback_failed",
                incident_id=incident.incident_id,
                error=str(_rb_e),
            )

    # === Private Helpers ===

    @staticmethod
    def _derive_anomaly_key(incident: Incident) -> str | None:
        """
        Return the anomaly key for *incident*.

        2026-04-07 Claude Code: I1+S1 fix - the derivation is delegated to
        ``AnomalyCounter.derive_key_from_incident`` so there is a single
        implementation of it.
        """
        from src.services.anomaly_counter import AnomalyCounter

        derive = AnomalyCounter.derive_key_from_incident
        return derive(incident)

    def _extract_symptoms(self, incident: Incident) -> SymptomPattern:
        """Build the SymptomPattern used for matching from the Incident's signals.

        Collects one alert name per signal, the short string annotation
        values as keywords (at most 10 are kept), the affected services and
        the severity (``["P2"]`` when the Incident has none).
        """
        names = []
        short_notes = []

        for sig in incident.signals or []:
            # Prefer labels["alertname"] (the raw Prometheus alertname) and
            # fall back to sig.alert_name, which may be a category value such
            # as "custom".
            # (2026-04-09 Claude Sonnet 4.6 Asia/Taipei, L7 E2E fix)
            label_name = (sig.labels or {}).get("alertname")
            names.append(label_name or sig.alert_name)

            # Keywords come from annotation values: strings under 50 chars.
            notes = sig.annotations
            if notes:
                short_notes.extend(
                    text
                    for text in notes.values()
                    if isinstance(text, str) and len(text) < 50
                )

        if incident.severity:
            severities = [incident.severity.value]
        else:
            severities = ["P2"]

        return SymptomPattern(
            alert_names=names,
            affected_services=incident.affected_services or [],
            severity_range=severities,
            keywords=short_notes[:10],
        )

    def _get_max_risk_level(self, playbook: Playbook) -> RiskLevel:
        """Return the highest risk level among the Playbook's repair steps.

        A Playbook without steps yields ``RiskLevel.LOW``; a step whose risk
        level is not one of the four known levels ranks like LOW.
        """
        ranking = {
            RiskLevel.LOW: 0,
            RiskLevel.MEDIUM: 1,
            RiskLevel.HIGH: 2,
            RiskLevel.CRITICAL: 3,
        }

        def _rank(level) -> int:
            return ranking.get(level, 0)

        # Seed with LOW so it is the answer for an empty Playbook and wins
        # ties against unranked levels (max keeps the first maximal item).
        candidates = [RiskLevel.LOW]
        candidates.extend(step.risk_level for step in playbook.repair_steps)
        return max(candidates, key=_rank)

    def _record_auto_repair_metric(self, playbook: Playbook, success: bool) -> None:
        """Report one auto-repair execution to the Prometheus metrics.

        2026-05-06 ogt + Codex: the DB already had auto_repair_executions,
        but core.metrics.record_auto_repair() had no caller for a long time,
        so Prometheus-based governance/heartbeat views looked as if nothing
        was happening. The label is the action type of the first step rather
        than the playbook id, to avoid high label cardinality.

        Any error is logged and swallowed; metrics never break a repair.
        """
        try:
            from src.core.metrics import record_auto_repair

            steps = playbook.repair_steps
            lead_step = steps[0] if steps else None
            if lead_step:
                action_label = lead_step.action_type.value
            else:
                action_label = "unknown"

            # Map the Playbook's highest risk level to a numeric tier (0 = unknown).
            tier_by_risk = {
                RiskLevel.LOW: 1,
                RiskLevel.MEDIUM: 2,
                RiskLevel.HIGH: 3,
                RiskLevel.CRITICAL: 4,
            }
            tier_value = tier_by_risk.get(self._get_max_risk_level(playbook), 0)

            record_auto_repair(action=action_label, tier=tier_value, success=success)
        except Exception as metric_err:
            logger.warning(
                "auto_repair_metric_record_failed",
                playbook_id=playbook.playbook_id,
                success=success,
                error=str(metric_err),
            )

    def _is_host_or_backup_incident(self, incident: Incident) -> bool:
        """主機/備份類事件只能走 SSH/只讀診斷，不允許 K8s rollout 類修復。"""

        category = (getattr(incident, "alert_category", None) or "").lower()
        if category in {"host_resource", "backup_failure"}:
            return True

        for signal in incident.signals or []:
            labels = signal.labels or {}
            alertname = str(labels.get("alertname") or signal.alert_name or "")
            if alertname.startswith("HostBackup") or alertname.startswith("Host"):
                return True
        return False

    def _playbook_has_k8s_steps(self, playbook: Playbook) -> bool:
        """檢查 Playbook 是否包含 K8s 指令，避免主機告警誤執行 deployment 操作。"""

        for step in playbook.repair_steps:
            command = (step.command or "").strip().lower()
            if step.action_type == ActionType.KUBECTL or command.startswith("kubectl "):
                return True
        return False

    def _should_escalate_failed_verification(self, incident: Incident, playbook: Playbook) -> bool:
        """非 K8s 修復或主機/備份事件驗證失敗時，禁止合成 K8s rollback。"""

        return self._is_host_or_backup_incident(incident) or not self._playbook_has_k8s_steps(playbook)

    async def _escalate_failed_verification(
        self,
        *,
        incident: Incident,
        playbook: Playbook,
        verification_result: str,
    ) -> None:
        """Notify the emergency channel when verification failed and rollback is unsafe.

        Logs a warning, then calls ``escalate_auto_repair_unavailable`` for
        the first affected service (``"unknown"`` when there is none). A
        failure of the escalation itself is logged and swallowed.

        Args:
            incident: The Incident whose repair failed verification.
            playbook: The Playbook that was executed.
            verification_result: The verifier's verdict, embedded in the
                escalation message.
        """
        services = incident.affected_services or ["unknown"]
        target = services[0]
        namespace = "awoooi-prod"
        alert_type = self._incident_alert_type(incident)

        reason_head = f"auto repair playbook {playbook.playbook_id} verification={verification_result}; "
        reason = reason_head + "rollback is unsafe for host/backup or non-K8s remediation"

        logger.warning(
            "auto_repair_verification_failed_emergency",
            incident_id=incident.incident_id,
            playbook_id=playbook.playbook_id,
            verification_result=verification_result,
            target=target,
        )

        try:
            from src.services.emergency_escalation_service import (
                escalate_auto_repair_unavailable,
            )

            action_trail = (
                f"auto_repair:{playbook.playbook_id} -> verifier:{verification_result} "
                "-> emergency_intervention"
            )
            await escalate_auto_repair_unavailable(
                incident_id=incident.incident_id,
                approval_id=None,
                alert_type=alert_type,
                target_resource=target,
                namespace=namespace,
                failure_reason=reason,
                attempted_actions=action_trail,
            )
        except Exception as escalation_err:
            logger.warning(
                "auto_repair_verification_emergency_failed",
                incident_id=incident.incident_id,
                playbook_id=playbook.playbook_id,
                error=str(escalation_err),
            )

    def _incident_alert_type(self, incident: Incident) -> str:
        """Best-effort alertname for emergency cards."""

        for signal in incident.signals or []:
            labels = signal.labels or {}
            alertname = labels.get("alertname") or signal.alert_name
            if alertname:
                return str(alertname)
        return "AutoRepairVerificationFailed"

    def _risk_exceeds_threshold(self, risk: RiskLevel) -> bool:
        """Tell whether *risk* is above the auto-repair threshold (HIGH or CRITICAL)."""
        return risk in {RiskLevel.HIGH, RiskLevel.CRITICAL}

    async def _check_cold_start_daily_limit(self) -> bool:
        """Consume one cold-start trust slot for today and report if it fits.

        A Redis counter is incremented on every call; its key embeds the date
        returned by ``now_taipei()``. The counter is compared against
        ``self.COLD_START_TRUST_DAILY_LIMIT``.

        The check fails closed: a missing Redis client or any exception
        yields ``False``.

        Returns:
            ``True`` when the incremented count is still within the daily
            limit, ``False`` otherwise (limit exceeded, Redis unavailable, or
            the check itself failed).
        """
        try:
            from src.core.redis_client import get_redis

            client = await get_redis()
            if client is None:
                # No Redis -> refuse conservatively.
                return False

            from src.utils.timezone import now_taipei

            day_stamp = now_taipei().strftime("%Y-%m-%d")
            counter_key = f"cold_start_trust:{day_stamp}"
            used_today = await client.incr(counter_key)

            # The first increment creates the key; give it a 25 h (90000 s)
            # TTL so the per-day counter is cleaned up after the day rolls over.
            if used_today == 1:
                await client.expire(counter_key, 90000)

            over_limit = used_today > self.COLD_START_TRUST_DAILY_LIMIT
            if over_limit:
                logger.warning(
                    "cold_start_daily_limit_reached",
                    today_key=counter_key,
                    count=used_today,
                    limit=self.COLD_START_TRUST_DAILY_LIMIT,
                )
            return not over_limit
        except Exception as exc:
            # Safe degradation: a failed check is treated as "not allowed".
            logger.warning("cold_start_daily_limit_check_failed", error=str(exc))
            return False

    async def _execute_step(self, incident: Incident, step) -> str:
        """Execute a single playbook repair step and describe the outcome.

        Dispatch is by ``step.action_type``:

        - ``ActionType.MANUAL``: never executed here; reported as skipped.
        - ``ActionType.KUBECTL``: ``{target}`` in ``step.command`` is replaced
          with the incident's first affected service, then the command is run
          through the executor returned by ``get_executor()``.
        - ``ActionType.SSH_COMMAND``: ``step.command`` is handed to
          ``HostRepairAgent.repair_by_uri``; the call is pre-approved unless
          the step declares ``requires_approval``.

        Args:
            incident: Incident being repaired; supplies the ``{target}`` value.
            step: Playbook step exposing ``action_type`` and ``command`` (and
                optionally ``requires_approval``).

        Returns:
            A status string: ``"SUCCESS"`` / ``"SUCCESS: <output>"``,
            ``"FAILED: <error>"``, ``"SKIPPED (...)"`` or
            ``"UNKNOWN_ACTION_TYPE"`` for any other action type.
        """
        if step.action_type == ActionType.MANUAL:
            return "SKIPPED (manual step)"

        if step.action_type == ActionType.KUBECTL:
            try:
                executor = get_executor()

                # Substitute the {target} placeholder with the real target.
                command = step.command
                if incident.affected_services:
                    command = command.replace("{target}", incident.affected_services[0])
                elif "{target}" in command:
                    # Without an affected service the placeholder cannot be
                    # resolved. Fail here instead of sending a command that
                    # still contains the literal "{target}" to the cluster.
                    logger.warning(
                        "auto_repair_step_target_unresolved",
                        incident_id=incident.incident_id,
                        command=command,
                    )
                    return "FAILED: no affected service to substitute for {target}"

                result = await executor.execute_kubectl_command(command)
                return "SUCCESS" if result.success else f"FAILED: {result.error}"

            except ImportError:
                logger.warning("action_executor_not_available")
                return "SKIPPED (executor not available)"

        # 2026-04-06 Claude Code: Sprint 3 - repair_by_uri (URI scheme routing)
        if step.action_type == ActionType.SSH_COMMAND:
            from src.services.host_repair_agent import HostRepairAgent

            agent = HostRepairAgent()
            # Steps that do not ask for approval are treated as pre-approved.
            approved = not getattr(step, "requires_approval", False)
            result = await agent.repair_by_uri(step.command, approved=approved)
            if result.success:
                return f"SUCCESS: {result.output}"
            return f"FAILED: {result.error}"

        return "UNKNOWN_ACTION_TYPE"

    # === ADR-037: Tier-based Repair (2026-03-29) ===

    # Mapping from repair tier (1-4) to the action names available at that tier
    TIER_ACTIONS = {
        1: ["restart_pod", "restart_container"],  # temporary fix
        2: ["scale_up", "increase_memory", "adjust_limits"],  # mitigation
        3: ["apply_hotfix", "update_config", "patch_deployment"],  # root-cause fix
        4: ["create_issue", "notify_team", "schedule_fix"],  # architectural fix
    }

    async def determine_repair_tier(
        self,
        anomaly_key: str,
        frequency: AnomalyFrequency,
    ) -> int:
        """Pick the repair tier for an anomaly from its frequency (ADR-037).

        Directive recorded on 2026-03-29: restarting only treats the symptom;
        anomalies that recur too often must be fixed for good. The tier is
        therefore chosen from the anomaly's frequency data and its repair
        history. The first matching rule wins:

        1. ``frequency.permanent_fix_applied`` is truthy -> tier 4.
        2. ``frequency.escalation_level == "PERMANENT_FIX"`` -> tier 3
           (per the original note: at least 10 occurrences within 24h).
        3. ``frequency.escalation_level == "ESCALATE"`` -> tier 2
           (per the original note: at least 5 occurrences within 24h).
        4. Two or more recorded restarts (pod + container) -> tier 2.
        5. Otherwise -> tier 1.

        Every non-default decision is logged as ``tier_decision``.

        Args:
            anomaly_key: Key identifying the anomaly in the anomaly counter.
            frequency: Frequency/escalation state for that anomaly.

        Returns:
            1 (temporary fix / restart), 2 (mitigation / scale-up),
            3 (root-cause fix / config change) or 4 (architectural fix /
            needs development).
        """
        # The repair history is always loaded first, before any rule is
        # evaluated.
        history = await get_anomaly_counter().get_all_repair_stats(anomaly_key)
        restart_count = sum(
            history.get(action, {}).get("total", 0)
            for action in ("restart_pod", "restart_container")
        )

        if frequency.permanent_fix_applied:
            # A permanent fix exists yet the anomaly persists.
            tier, reason = 4, "permanent_fix_still_failing"
        elif frequency.escalation_level == "PERMANENT_FIX":
            tier, reason = 3, "escalation_permanent_fix"
        elif frequency.escalation_level == "ESCALATE":
            tier, reason = 2, "escalation_escalate"
        elif restart_count >= 2:
            # Restarting has already been tried twice; move up to mitigation.
            tier, reason = 2, f"restart_count_{restart_count}"
        else:
            # Default: temporary fix, not logged.
            return 1

        logger.info(
            "tier_decision",
            anomaly_key=anomaly_key,
            tier=tier,
            reason=reason,
        )
        return tier

    def get_tier_actions(self, tier: int) -> list[str]:
        """Return the repair action names available for ``tier`` (ADR-037).

        Args:
            tier: Repair tier to look up in ``TIER_ACTIONS``.

        Returns:
            The list registered for ``tier``; for an unknown tier, the tier-1
            list. The stored list object itself is returned, not a copy.
        """
        catalog = self.TIER_ACTIONS
        tier_one_actions = catalog[1]
        return catalog.get(tier, tier_one_actions)

    async def record_repair_result(
        self,
        anomaly_key: str,
        action: str,
        success: bool,
        tier: int = 1,
    ) -> None:
        """Record a repair outcome in the anomaly counter (ADR-037).

        The attempt is always recorded. When a repair at tier 3 or above
        succeeds, the anomaly is also marked as having a permanent fix
        applied. A ``repair_result_recorded`` log entry is emitted at the end.

        Args:
            anomaly_key: Key identifying the anomaly.
            action: Name of the repair action that was attempted.
            success: Whether the action succeeded.
            tier: Repair tier the action belongs to (defaults to 1).
        """
        tracker = get_anomaly_counter()
        await tracker.record_repair_attempt(anomaly_key, action, success)

        permanent_fix_succeeded = tier >= 3 and success
        if permanent_fix_succeeded:
            description = f"Tier {tier} repair: {action}"
            await tracker.mark_permanent_fix_applied(
                anomaly_key=anomaly_key,
                fix_description=description,
            )

        log_fields = {
            "anomaly_key": anomaly_key,
            "action": action,
            "success": success,
            "tier": tier,
        }
        logger.info("repair_result_recorded", **log_fields)


# =============================================================================
# Singleton
# =============================================================================

# Process-wide instance; created lazily by get_auto_repair_service().
_service: AutoRepairService | None = None


def get_auto_repair_service() -> IAutoRepairService:
    """Return the shared AutoRepairService, creating it on first use.

    Returns:
        The instance held in the module-level ``_service`` slot. A default
        ``AutoRepairService()`` is constructed and stored when the slot is
        still empty.
    """
    global _service
    if _service is not None:
        return _service
    _service = AutoRepairService()
    return _service


def set_auto_repair_service(service: AutoRepairService | None) -> None:
    """Inject the AutoRepairService instance (for DI or tests).

    Args:
        service: Instance to store in the module-level ``_service`` slot, or
            ``None`` to clear the slot.
    """
    global _service
    _service = service