# awoooi/apps/api/src/services/ci_auto_repair.py

"""
CI Auto-Repair Service - Phase 13.1 #78
========================================
CI 失敗自動修復服務，根據風險分級決定執行策略

策略:
- LOW: 自動執行修復 (如重啟 Runner、清理快取)
- MEDIUM: 發送 Telegram 確認，快速批准後執行
- HIGH: 建立 Approval，等待人工審核
- CRITICAL: 禁止自動修復，僅通知

整合:
- Intent Classifier: 判斷修復意圖類型
- Complexity Scorer: 評估修復複雜度
- AI Router: 選擇最適 AI 進行分析

版本: v1.0
建立: 2026-03-26 16:50 (台北時區)
建立者: Claude Code
"""

from __future__ import annotations

import shlex
from dataclasses import dataclass
from enum import Enum

import structlog

from src.services.complexity_scorer import get_complexity_scorer
from src.services.intent_classifier import IntentType, RiskLevel, get_intent_classifier

logger = structlog.get_logger(__name__)


# =============================================================================
# Types
# =============================================================================


class RepairAction(Enum):
    """Kinds of repair action that can be recommended for a CI failure.

    Each member's risk level is defined in ACTION_RISK_MAP, and the error
    types that propose it are listed in ERROR_TYPE_REPAIR_MAP.
    """
    RESTART_RUNNER = "restart_runner"
    CLEAR_CACHE = "clear_cache"
    RETRY_WORKFLOW = "retry_workflow"
    ROLLBACK_COMMIT = "rollback_commit"
    FIX_CONFIG = "fix_config"
    FIX_DEPENDENCY = "fix_dependency"
    SCALE_RESOURCE = "scale_resource"
    # Fallback used when the error type is unknown or unmapped.
    MANUAL_REQUIRED = "manual_required"


class ExecutionDecision(Enum):
    """How a recommended repair is allowed to proceed."""
    AUTO_EXECUTE = "auto_execute"          # run automatically, no confirmation
    TELEGRAM_CONFIRM = "telegram_confirm"  # quick confirmation via Telegram
    APPROVAL_REQUIRED = "approval_required"  # create an Approval and wait for review
    BLOCKED = "blocked"                    # never execute; notify only


@dataclass
class RepairRecommendation:
    """A single recommended repair action together with its risk assessment."""
    # The repair to perform.
    action: RepairAction
    # Shell command for the repair; None when no fixed command exists
    # (e.g. FIX_CONFIG and MANUAL_REQUIRED in this file).
    command: str | None
    # Human-readable explanation of why this action is suggested.
    reason: str
    # Risk level of this action (may be raised for complex failures).
    risk_level: RiskLevel
    # Execution policy derived from risk_level via RISK_EXECUTION_MAP.
    execution_decision: ExecutionDecision
    # Confidence in the recommendation; the rule-based path in this file
    # always reports 0.0.
    confidence: float
    # Rough estimate of how long the repair takes, in seconds.
    estimated_duration_seconds: int
    # Command that undoes the repair, when one is defined.
    rollback_command: str | None = None


@dataclass
class CIRepairDecision:
    """Outcome of evaluating a CI failure for automatic repair."""
    # False when the execution decision is BLOCKED, True otherwise.
    should_repair: bool
    # Final execution policy after special-rule overrides.
    execution_decision: ExecutionDecision
    # Candidate repairs, ordered from lowest to highest risk.
    recommendations: list[RepairRecommendation]
    # Overall risk level computed for the failure.
    risk_level: RiskLevel
    # Score reported by the complexity scorer.
    complexity_score: int
    # Intent reported by the intent classifier.
    intent_type: IntentType
    # Human-readable explanation of the execution decision.
    reason: str
    # Extra details: intent_confidence, complexity_factors, workflow_name, repo.
    metadata: dict


# =============================================================================
# Repair Strategy Mapping
# =============================================================================


# Error type -> candidate repair actions.
# Looked up with the lower-cased error type; unmapped types fall back to
# [RepairAction.MANUAL_REQUIRED] at the call site.
ERROR_TYPE_REPAIR_MAP: dict[str, list[RepairAction]] = {
    "build": [RepairAction.CLEAR_CACHE, RepairAction.FIX_DEPENDENCY],
    "test": [RepairAction.RETRY_WORKFLOW, RepairAction.FIX_CONFIG],
    "lint": [RepairAction.RETRY_WORKFLOW],
    "deploy": [RepairAction.ROLLBACK_COMMIT, RepairAction.FIX_CONFIG],
    "timeout": [RepairAction.RESTART_RUNNER, RepairAction.SCALE_RESOURCE],
    "runner": [RepairAction.RESTART_RUNNER],
    "dependency": [RepairAction.CLEAR_CACHE, RepairAction.FIX_DEPENDENCY],
    "unknown": [RepairAction.MANUAL_REQUIRED],
}


# Repair action -> baseline risk level.
# Callers treat an action missing from this table as RiskLevel.HIGH.
ACTION_RISK_MAP: dict[RepairAction, RiskLevel] = {
    RepairAction.RETRY_WORKFLOW: RiskLevel.LOW,
    RepairAction.CLEAR_CACHE: RiskLevel.LOW,
    RepairAction.RESTART_RUNNER: RiskLevel.LOW,
    RepairAction.FIX_CONFIG: RiskLevel.MEDIUM,
    RepairAction.FIX_DEPENDENCY: RiskLevel.MEDIUM,
    RepairAction.SCALE_RESOURCE: RiskLevel.MEDIUM,
    RepairAction.ROLLBACK_COMMIT: RiskLevel.HIGH,
    RepairAction.MANUAL_REQUIRED: RiskLevel.CRITICAL,
}


# Risk level -> execution decision.
# Callers treat a risk level missing from this table as APPROVAL_REQUIRED.
RISK_EXECUTION_MAP: dict[RiskLevel, ExecutionDecision] = {
    RiskLevel.LOW: ExecutionDecision.AUTO_EXECUTE,
    RiskLevel.MEDIUM: ExecutionDecision.TELEGRAM_CONFIRM,
    RiskLevel.HIGH: ExecutionDecision.APPROVAL_REQUIRED,
    RiskLevel.CRITICAL: ExecutionDecision.BLOCKED,
}


# =============================================================================
# CI Auto-Repair Service
# =============================================================================


class CIAutoRepairService:
    """
    CI auto-repair service.

    Integrates the smart-routing components (Phase 13.3) — the intent
    classifier and the complexity scorer — to assess risk and decide how a
    CI failure may be repaired.
    """

    def __init__(self):
        self._intent_classifier = get_intent_classifier()
        self._complexity_scorer = get_complexity_scorer()

    async def evaluate_repair(
        self,
        error_type: str,
        workflow_name: str,
        repo: str,
        failure_context: dict,
        diagnosis_summary: str | None = None,
    ) -> CIRepairDecision:
        """
        Evaluate the repair strategy for a CI failure.

        Args:
            error_type: Failure category (build/test/lint/deploy/timeout/...).
                Matched case-insensitively against ERROR_TYPE_REPAIR_MAP.
            workflow_name: Name of the failed workflow.
            repo: Repository name.
            failure_context: Extra failure details merged into the context
                handed to the complexity scorer. ``None`` is treated as empty.
            diagnosis_summary: Optional AI diagnosis summary.

        Returns:
            CIRepairDecision: The repair decision, including the ordered
            recommendations, overall risk and final execution policy.
        """
        logger.info(
            "ci_repair_evaluation_started",
            error_type=error_type,
            workflow_name=workflow_name,
            repo=repo,
        )

        # 1. Build the text fed to the classifier and the scorer.
        analysis_text = self._build_analysis_text(
            error_type=error_type,
            workflow_name=workflow_name,
            diagnosis_summary=diagnosis_summary,
        )

        # 2. Intent classification.
        # 2026-04-27 P3.1-T3 by Claude — classify is an async method and must
        # be awaited.
        intent_result = await self._intent_classifier.classify(analysis_text)

        # 3. Complexity scoring. Keys in failure_context are unpacked last, so
        # on a name collision they override the explicit keys above them.
        complexity_result = self._complexity_scorer.score(
            text=analysis_text,
            context={
                "error_type": error_type,
                "workflow_name": workflow_name,
                "repo": repo,
                **(failure_context or {}),
            },
        )

        # 4. Candidate repair actions; unmapped error types need a human.
        possible_actions = ERROR_TYPE_REPAIR_MAP.get(
            error_type.lower(),
            [RepairAction.MANUAL_REQUIRED],
        )

        # 5. Turn the actions into recommendations.
        recommendations = self._generate_recommendations(
            possible_actions=possible_actions,
            error_type=error_type,
            workflow_name=workflow_name,
            complexity_score=complexity_result.score,
        )

        # 6. Overall risk level and the execution policy it maps to.
        overall_risk = self._determine_overall_risk(
            recommendations=recommendations,
            intent_risk=intent_result.risk_level,
            complexity_score=complexity_result.score,
        )

        execution_decision = RISK_EXECUTION_MAP.get(
            overall_risk,
            ExecutionDecision.APPROVAL_REQUIRED,
        )

        # 7. Special-rule overrides (production workflows, deploy failures).
        execution_decision = self._apply_special_rules(
            execution_decision=execution_decision,
            error_type=error_type,
            workflow_name=workflow_name,
            repo=repo,
        )

        decision = CIRepairDecision(
            should_repair=execution_decision != ExecutionDecision.BLOCKED,
            execution_decision=execution_decision,
            recommendations=recommendations,
            risk_level=overall_risk,
            complexity_score=complexity_result.score,
            intent_type=intent_result.intent,
            reason=self._generate_decision_reason(
                execution_decision=execution_decision,
                overall_risk=overall_risk,
                error_type=error_type,
            ),
            metadata={
                "intent_confidence": intent_result.confidence,
                "complexity_factors": complexity_result.factors,
                "workflow_name": workflow_name,
                "repo": repo,
            },
        )

        logger.info(
            "ci_repair_evaluation_completed",
            should_repair=decision.should_repair,
            execution_decision=execution_decision.value,
            risk_level=overall_risk.value,
            recommendations_count=len(recommendations),
        )

        return decision

    @staticmethod
    def _risk_rank(risk: RiskLevel, default: int = 99) -> int:
        """Return the ordering rank of a risk level (LOW=0 ... CRITICAL=3).

        Args:
            risk: Risk level to rank.
            default: Rank returned for a level that is not one of the four
                known members.
        """
        order = {
            RiskLevel.LOW: 0,
            RiskLevel.MEDIUM: 1,
            RiskLevel.HIGH: 2,
            RiskLevel.CRITICAL: 3,
        }
        return order.get(risk, default)

    def _build_analysis_text(
        self,
        error_type: str,
        workflow_name: str,
        diagnosis_summary: str | None,
    ) -> str:
        """Build the analysis text used for intent classification.

        The diagnosis is appended only when it is non-empty; the parts are
        joined with ". ".
        """
        parts = [
            f"CI workflow '{workflow_name}' failed",
            f"Error type: {error_type}",
        ]
        if diagnosis_summary:
            parts.append(f"Diagnosis: {diagnosis_summary}")
        return ". ".join(parts)

    def _generate_recommendations(
        self,
        possible_actions: list[RepairAction],
        error_type: str,
        workflow_name: str,
        complexity_score: int,
    ) -> list[RepairRecommendation]:
        """Build the list of repair recommendations, lowest risk first.

        Args:
            possible_actions: Candidate actions for the failure.
            error_type: Failure category, used in the reason text.
            workflow_name: Workflow name, used in the repair command.
            complexity_score: Score from the complexity scorer; at 4 or above
                a MEDIUM-risk action is raised to HIGH.

        Returns:
            Recommendations sorted by ascending risk. The sort is stable, so
            actions of equal risk keep the order of ``possible_actions``.
        """
        recommendations = []

        for action in possible_actions:
            # Actions missing from the table are treated as HIGH risk.
            risk = ACTION_RISK_MAP.get(action, RiskLevel.HIGH)

            # Complex failures make medium-risk repairs high risk.
            if complexity_score >= 4 and risk == RiskLevel.MEDIUM:
                risk = RiskLevel.HIGH

            command, rollback = self._get_repair_command(
                action=action,
                workflow_name=workflow_name,
            )

            recommendations.append(RepairRecommendation(
                action=action,
                command=command,
                reason=self._get_action_reason(action, error_type),
                risk_level=risk,
                execution_decision=RISK_EXECUTION_MAP.get(risk, ExecutionDecision.APPROVAL_REQUIRED),
                confidence=self._calculate_confidence(action, error_type),
                estimated_duration_seconds=self._estimate_duration(action),
                rollback_command=rollback,
            ))

        # Sort by risk level, lowest first.
        recommendations.sort(key=lambda r: self._risk_rank(r.risk_level))

        return recommendations

    def _get_repair_command(
        self,
        action: RepairAction,
        workflow_name: str,
    ) -> tuple[str | None, str | None]:
        """Return ``(repair_command, rollback_command)`` for an action.

        Either element is None when no fixed command exists. The workflow
        name is shell-quoted so that names containing spaces or shell
        metacharacters stay a single argument.
        """
        # shlex.quote leaves simple names (e.g. "ci.yml") unchanged.
        quoted_workflow = shlex.quote(workflow_name)
        commands: dict[RepairAction, tuple[str | None, str | None]] = {
            RepairAction.RETRY_WORKFLOW: (
                f"gh workflow run {quoted_workflow}",
                None,
            ),
            RepairAction.CLEAR_CACHE: (
                "gh cache delete --all",
                None,
            ),
            RepairAction.RESTART_RUNNER: (
                "sudo systemctl restart actions.runner.*",
                None,
            ),
            RepairAction.SCALE_RESOURCE: (
                "kubectl scale deployment/actions-runner --replicas=3",
                "kubectl scale deployment/actions-runner --replicas=2",
            ),
            RepairAction.ROLLBACK_COMMIT: (
                "git revert HEAD --no-edit && git push",
                "git revert HEAD --no-edit && git push",
            ),
            RepairAction.FIX_CONFIG: (
                None,  # the concrete command has to be generated by AI
                None,
            ),
            RepairAction.FIX_DEPENDENCY: (
                "pnpm install --force && uv sync",
                None,
            ),
            RepairAction.MANUAL_REQUIRED: (
                None,
                None,
            ),
        }
        return commands.get(action, (None, None))

    def _get_action_reason(self, action: RepairAction, error_type: str) -> str:
        """Return the human-readable reason for a repair action."""
        reasons = {
            RepairAction.RETRY_WORKFLOW: f"Retry workflow to recover from transient {error_type} failure",
            RepairAction.CLEAR_CACHE: "Clear build/dependency cache to resolve potential cache corruption",
            RepairAction.RESTART_RUNNER: "Restart GitHub Actions runner to recover from runner issues",
            RepairAction.SCALE_RESOURCE: "Scale runner resources to handle timeout issues",
            RepairAction.ROLLBACK_COMMIT: "Rollback recent commit that may have introduced the failure",
            RepairAction.FIX_CONFIG: "Fix configuration that may be causing the failure",
            RepairAction.FIX_DEPENDENCY: "Update or fix dependencies to resolve compatibility issues",
            RepairAction.MANUAL_REQUIRED: "Manual investigation required due to complex failure",
        }
        return reasons.get(action, "Unknown action")

    def _calculate_confidence(self, _action: RepairAction, _error_type: str) -> float:
        """
        Compute the confidence of a repair recommendation.

        2026-03-29 fix: rule matching is not AI analysis, so this always
        returns 0.0, per the rule in feedback_confidence_truthfulness.md.
        """
        # Rule-based matching, not an AI analysis.
        return 0.0

    def _estimate_duration(self, action: RepairAction) -> int:
        """Return the estimated repair time in seconds (300 if unknown)."""
        durations = {
            RepairAction.RETRY_WORKFLOW: 300,  # 5 minutes
            RepairAction.CLEAR_CACHE: 30,
            RepairAction.RESTART_RUNNER: 60,
            RepairAction.SCALE_RESOURCE: 120,
            RepairAction.ROLLBACK_COMMIT: 180,
            RepairAction.FIX_CONFIG: 600,
            RepairAction.FIX_DEPENDENCY: 300,
            RepairAction.MANUAL_REQUIRED: 3600,
        }
        return durations.get(action, 300)

    def _determine_overall_risk(
        self,
        recommendations: list[RepairRecommendation],
        intent_risk: RiskLevel,
        complexity_score: int,
    ) -> RiskLevel:
        """Determine the overall risk level of the failure.

        Starts from the lowest-risk recommendation, raises it one step for
        complex failures, then takes the intent classifier's risk if that is
        higher. Returns CRITICAL when there are no recommendations.
        """
        if not recommendations:
            return RiskLevel.CRITICAL

        # The lowest-risk recommendation is the baseline.
        min_risk = min(
            recommendations,
            key=lambda r: self._risk_rank(r.risk_level),
        ).risk_level

        # High complexity raises the baseline by at most one step.
        if complexity_score >= 4 and min_risk == RiskLevel.LOW:
            min_risk = RiskLevel.MEDIUM
        elif complexity_score >= 5 and min_risk == RiskLevel.MEDIUM:
            min_risk = RiskLevel.HIGH

        # If the intent classification reports a higher risk, it wins.
        # Unknown levels rank 0 here so they never raise the result.
        if self._risk_rank(intent_risk, 0) > self._risk_rank(min_risk, 0):
            return intent_risk

        return min_risk

    def _apply_special_rules(
        self,
        execution_decision: ExecutionDecision,
        error_type: str,
        workflow_name: str,
        repo: str,
    ) -> ExecutionDecision:
        """Apply special-rule overrides to the execution decision.

        The rules only ever make the decision stricter and are applied in
        sequence, so a failure matching both ends up with the strictest
        outcome:

        1. Workflows whose name contains a production keyword never
           auto-execute; AUTO_EXECUTE becomes TELEGRAM_CONFIRM.
        2. Deploy failures always need human approval; AUTO_EXECUTE and
           TELEGRAM_CONFIRM become APPROVAL_REQUIRED.

        ``repo`` is accepted for interface compatibility and not used.
        """
        decision = execution_decision

        # Rule 1: production/deployment workflows require confirmation.
        production_keywords = ("prod", "production", "release", "deploy")
        workflow_lower = workflow_name.lower()
        if decision == ExecutionDecision.AUTO_EXECUTE and any(
            kw in workflow_lower for kw in production_keywords
        ):
            decision = ExecutionDecision.TELEGRAM_CONFIRM

        # Rule 2: deploy failures require approval. Compared
        # case-insensitively to match the lower-cased lookup into
        # ERROR_TYPE_REPAIR_MAP, and evaluated after rule 1 so a confirmation
        # produced there is still escalated.
        if error_type.lower() == "deploy" and decision in (
            ExecutionDecision.AUTO_EXECUTE,
            ExecutionDecision.TELEGRAM_CONFIRM,
        ):
            decision = ExecutionDecision.APPROVAL_REQUIRED

        return decision

    def _generate_decision_reason(
        self,
        execution_decision: ExecutionDecision,
        overall_risk: RiskLevel,
        error_type: str,
    ) -> str:
        """Return the human-readable reason for the execution decision.

        The text is selected by ``execution_decision`` alone; ``overall_risk``
        is accepted but not used in the message.
        """
        reasons = {
            ExecutionDecision.AUTO_EXECUTE: f"Low risk {error_type} failure, safe for auto-repair",
            ExecutionDecision.TELEGRAM_CONFIRM: f"Medium risk {error_type} failure, quick Telegram confirmation recommended",
            ExecutionDecision.APPROVAL_REQUIRED: f"High risk {error_type} failure, human approval required before repair",
            ExecutionDecision.BLOCKED: f"Critical {error_type} failure, auto-repair blocked for safety",
        }
        return reasons.get(execution_decision, "Unknown decision")


# =============================================================================
# Singleton
# =============================================================================


_ci_auto_repair_service: CIAutoRepairService | None = None


def get_ci_auto_repair_service() -> CIAutoRepairService:
    """Return the module-wide CI Auto-Repair Service, creating it on first use."""
    global _ci_auto_repair_service
    instance = _ci_auto_repair_service
    if instance is not None:
        return instance
    # First call: build the service and remember it for later callers.
    instance = CIAutoRepairService()
    _ci_auto_repair_service = instance
    return instance