# awoooi/apps/api/src/services/proposal_service.py

"""
Decision Proposal Service - Phase 6.4 決策輸出層
================================================

功能:
1. 從 Incident 生成 Decision Proposal (修復動作)
2. 整合 TrustEngine 評估風險等級
3. 建立向下相容的 ApprovalRequest
4. 關聯 Proposal 到 Incident 並推進狀態

設計原則:
- 向下相容: 生成的 Proposal 完全符合現有 ApprovalRequest 格式
- 前端零改動: /approvals/pending 直接可渲染
- 可追溯: Incident.proposal_ids 記錄所有決策嘗試

統帥鐵律:
- 禁止跳過 TrustEngine 評估
- 所有決策必須可稽核
"""

from datetime import UTC, datetime

import structlog

from src.core.config import get_settings
from src.db.base import get_db_context
from src.db.models import IncidentRecord
from src.models.approval import (
    ApprovalRequest,
    ApprovalRequestCreate,
    BlastRadius,
    DataImpact,
    DryRunCheck,
)
from src.models.approval import (
    RiskLevel as ApprovalRiskLevel,
)
from src.models.incident import (
    Incident,
    IncidentStatus,
    Severity,
)
from src.core.redis_client import get_redis
from src.services.approval_db import get_approval_service
from src.services.incident_engine import get_incident_engine
from src.services.incident_memory import get_incident_memory
from src.services.incident_service import INCIDENT_KEY_PREFIX, get_incident_service
from src.services.openclaw import get_openclaw
from src.services.trust_engine import normalize_action_pattern, trust_engine
from src.utils.incident_converter import local_to_brain

logger = structlog.get_logger(__name__)


# =============================================================================
# Constants
# =============================================================================

# Maps an incident Severity to the baseline approval RiskLevel.
# Used by generate_proposal only on the template-fallback path; the sign-off
# counts in the trailing notes come from the original comments and are not
# enforced anywhere in this module.
SEVERITY_TO_RISK = {
    Severity.P0: ApprovalRiskLevel.CRITICAL,  # P0 (critical) → CRITICAL (2 sign-offs)
    Severity.P1: ApprovalRiskLevel.CRITICAL,  # P1 (high) → CRITICAL (2 sign-offs)
    Severity.P2: ApprovalRiskLevel.MEDIUM,    # P2 (warning) → MEDIUM (1 sign-off)
    Severity.P3: ApprovalRiskLevel.LOW,       # P3 (info) → LOW (auto-approved)
}

# Action templates keyed by alert category.
# Each entry holds an "action" and a "description" format string. They are
# rendered with str.format(target=..., signal_count=...) in
# ProposalService._determine_action, so only the {target} and {signal_count}
# placeholders are available. The "default" entry is the fallback when no
# category keyword matches.
ACTION_TEMPLATES = {
    "pod_crash": {
        "action": "Restart deployment: {target}",
        "description": "AI 建議重啟部署以恢復服務。根據 {signal_count} 筆告警分析，服務 {target} 可能需要重啟。",
    },
    "high_latency": {
        "action": "Scale up deployment: {target}",
        "description": "AI 建議擴容以降低延遲。當前延遲超標，增加副本數可緩解負載。",
    },
    "high_error_rate": {
        "action": "Rollback deployment: {target}",
        "description": "AI 建議回滾部署。錯誤率過高，可能是最近部署引入的問題。",
    },
    "resource_exhaustion": {
        "action": "Scale up deployment: {target} to 3 replicas",
        "description": "AI 建議擴容。CPU/Memory 使用率超標，需增加副本分散負載。",
    },
    "default": {
        "action": "Investigate service: {target}",
        "description": "AI 無法確定具體修復動作，建議人工調查。收到 {signal_count} 筆相關告警。",
    },
}


# =============================================================================
# Proposal Service
# =============================================================================

class ProposalService:
    """
    決策提案服務 - Phase 6.4

    職責:
    1. 分析 Incident 生成修復建議 (LLM-based)
    2. 評估風險等級
    3. 建立 ApprovalRequest (向下相容前端)
    4. 更新 Incident 狀態與關聯

    Phase 6.4 升級:
    - 整合 OpenClaw LLM 生成智能提案
    - 使用 _call_with_cache 保護算力資源
    - Fallback 到模板方案確保可用性
    """

    def __init__(self) -> None:
        """Bind the collaborators returned by the module-level accessors."""
        # Used by generate_proposal to create the ApprovalRequest.
        self._approval_service = get_approval_service()
        # Used by generate_proposal to request an LLM-generated proposal.
        self._openclaw = get_openclaw()

    # =========================================================================
    # 核心方法: 從 Incident 生成 Proposal
    # =========================================================================

    async def generate_proposal(
        self,
        incident_id: str,
    ) -> tuple[ApprovalRequest | None, str]:
        """
        Generate a Decision Proposal (ApprovalRequest) for an Incident.

        Flow:
        1. Load the Incident via ``_load_incident``.
        2. Ask OpenClaw for a proposal; fall back to the keyword templates
           when the LLM call reports failure or returns nothing.
        3. Let TrustEngine adjust the base risk level.
        4. Create the ApprovalRequest through the approval service.
        5. Append the approval id to ``incident.proposal_ids``.
        6. Advance the Incident from INVESTIGATING to MITIGATING.
        7. Persist the Incident via ``_persist_incident``.

        Optional LLM fields are normalised defensively: ``risk_level`` is
        matched case-insensitively (so "HIGH" or " Critical " is not silently
        downgraded to MEDIUM), and a ``None`` value for ``kubectl_command`` or
        ``primary_responsibility`` no longer aborts the whole proposal.

        Args:
            incident_id: Incident ID.

        Returns:
            ``(ApprovalRequest, message)`` on success, otherwise
            ``(None, error_message)``. Any ``Exception`` raised along the way
            is logged and converted into the error tuple.
        """
        try:
            # 1. Load the Incident.
            incident = await self._load_incident(incident_id)
            if not incident:
                return None, f"Incident not found: {incident_id}"

            # Only incidents that are being investigated or mitigated accept proposals.
            if incident.status not in (IncidentStatus.INVESTIGATING, IncidentStatus.MITIGATING):
                return None, f"Cannot generate proposal for status: {incident.status.value}"

            logger.info(
                "generating_proposal",
                incident_id=incident_id,
                severity=incident.severity.value,
                signal_count=len(incident.signals),
            )

            # 2. Ask OpenClaw for a proposal (Phase 6.4 core; Phase 22 / ADR-044
            #    switched to the *_with_tools variant per the original notes).
            target = incident.affected_services[0] if incident.affected_services else "unknown"
            signals_dict = [s.model_dump() for s in incident.signals]

            llm_proposal, provider, llm_success = await self._openclaw.generate_incident_proposal_with_tools(
                incident_id=incident_id,
                severity=incident.severity.value,
                signals=signals_dict,
                affected_services=incident.affected_services,
            )

            # Use the LLM result, or fall back to the templates.
            if llm_success and llm_proposal:
                # Prefer the executable kubectl command as the action; fall back
                # to the LLM's action title otherwise. Per the original note
                # (2026-04-09), a title-only action cannot be parsed downstream
                # and the approved action would be skipped.
                # A None / non-string value is treated as "no command".
                raw_kubectl = llm_proposal.get("kubectl_command")
                kubectl = raw_kubectl.strip() if isinstance(raw_kubectl, str) else ""
                action = kubectl if kubectl else llm_proposal["action"]
                description = f"{llm_proposal['description']}\n\n**AI 推理**: {llm_proposal['reasoning']}"
                raw_type = llm_proposal.get("primary_responsibility", "default")
                action_type = raw_type.lower() if isinstance(raw_type, str) else "default"

                # Convert the LLM-provided risk level. The value is free text,
                # so unwrap enum-like objects and normalise case/whitespace
                # before the lookup; anything unrecognised maps to MEDIUM.
                raw_risk = llm_proposal.get("risk_level") or "medium"
                raw_risk = getattr(raw_risk, "value", raw_risk)
                llm_risk = raw_risk.strip().lower() if isinstance(raw_risk, str) else "medium"
                risk_map = {
                    "low": ApprovalRiskLevel.LOW,
                    "medium": ApprovalRiskLevel.MEDIUM,
                    "high": ApprovalRiskLevel.HIGH,
                    "critical": ApprovalRiskLevel.CRITICAL,
                }
                base_risk = risk_map.get(llm_risk, ApprovalRiskLevel.MEDIUM)

                logger.info(
                    "llm_proposal_generated",
                    incident_id=incident_id,
                    provider=provider,
                    action=action[:50],
                    risk_level=llm_risk,
                    confidence=llm_proposal.get("confidence", 0),
                )
            else:
                # Fall back to the template approach.
                logger.warning(
                    "llm_proposal_fallback_to_template",
                    incident_id=incident_id,
                    provider=provider,
                )
                action_type, action, description = self._determine_action(incident)
                base_risk = SEVERITY_TO_RISK.get(incident.severity, ApprovalRiskLevel.MEDIUM)

            # 3. Evaluate the risk level (TrustEngine adjustment is mandatory).
            action_pattern = normalize_action_pattern(action_type, {"resource": target})

            risk_adjustment = trust_engine.evaluate_adjusted_risk(
                action_pattern=action_pattern,
                original_risk=base_risk.value,
            )
            adjusted_risk = ApprovalRiskLevel(risk_adjustment.adjusted_risk.value)

            logger.info(
                "risk_evaluated",
                incident_id=incident_id,
                original_risk=base_risk.value,
                adjusted_risk=adjusted_risk.value,
                trust_score=risk_adjustment.trust_score,
            )

            # 4. Build the ApprovalRequest.
            blast_radius = self._build_blast_radius(incident)
            dry_run_checks = self._build_dry_run_checks(incident)

            # Audit metadata stored alongside the approval.
            metadata = {
                "incident_id": incident_id,
                "severity": incident.severity.value,
                "signal_count": len(incident.signals),
                "affected_services": incident.affected_services,
                "trust_adjustment": risk_adjustment.to_dict(),
            }

            # Attach LLM details when the proposal came from the LLM (Phase 6.4).
            if llm_success and llm_proposal:
                metadata["llm_provider"] = llm_proposal.get("provider", "unknown")
                metadata["llm_confidence"] = llm_proposal.get("confidence", 0)
                metadata["llm_from_cache"] = llm_proposal.get("from_cache", False)
                metadata["kubectl_command"] = llm_proposal.get("kubectl_command", "")
                metadata["signoz_correlation"] = llm_proposal.get("signoz_correlation", "")
                metadata["optimization_suggestions"] = llm_proposal.get("optimization_suggestions", [])

            # P0.4 (2026-04-25): the manual path also attempts a Playbook match
            # so matched_playbook_id is recorded with the approval; per the
            # original note this lets the learning service update trust scores.
            matched_pb_id: str | None = await self._try_playbook_match_id(incident)

            approval_create = ApprovalRequestCreate(
                action=action,
                description=description,
                risk_level=adjusted_risk,
                blast_radius=blast_radius,
                dry_run_checks=dry_run_checks,
                requested_by="OpenClaw AI",
                incident_id=incident_id,
                metadata=metadata,
                matched_playbook_id=matched_pb_id,
            )

            approval = await self._approval_service.create_approval(approval_create)

            logger.info(
                "approval_created",
                incident_id=incident_id,
                approval_id=str(approval.id),
                risk_level=approval.risk_level.value,
            )

            # 5. Link the proposal to the Incident.
            incident.proposal_ids.append(approval.id)

            # 6. Advance the status to MITIGATING (no-op when already there).
            if incident.status == IncidentStatus.INVESTIGATING:
                incident.status = IncidentStatus.MITIGATING
                logger.info(
                    "incident_status_updated",
                    incident_id=incident_id,
                    new_status="MITIGATING",
                )

            incident.updated_at = datetime.now(UTC)

            # 7. Persist to Redis + DB (best effort; failures are logged there).
            await self._persist_incident(incident)

            message = f"Proposal generated: {approval.action[:50]}... (Risk: {adjusted_risk.value})"
            return approval, message

        except Exception as e:
            # Boundary handler: callers receive an error tuple instead of an exception.
            logger.exception(
                "generate_proposal_error",
                incident_id=incident_id,
                error=str(e),
            )
            return None, f"Error generating proposal: {str(e)}"

    # =========================================================================
    # 輔助方法: 載入 Incident
    # =========================================================================

    async def _load_incident(self, incident_id: str) -> Incident | None:
        """
        Fetch an Incident from working memory.

        The lookup goes through ``incident_service.get_from_working_memory()``.
        Per the original note (2026-04-02 ogt), the brain engine's
        ``awoooi:incidents:`` prefix did not match the ``incident:`` prefix the
        data is stored under, which produced permanent 404s.

        Returns:
            The Incident, or None when the lookup raises (the error is logged).
        """
        try:
            service = get_incident_service()
            incident = await service.get_from_working_memory(incident_id)
        except Exception as exc:
            logger.warning(
                "load_incident_failed",
                incident_id=incident_id,
                error=str(exc),
            )
            return None
        return incident

    # =========================================================================
    # 輔助方法: 決定修復動作
    # =========================================================================

    def _determine_action(
        self,
        incident: Incident,
    ) -> tuple[str, str, str]:
        """
        Pick a template-based remediation from the incident's alert names.

        The first affected service is the target ("unknown-service" when there
        is none). Alert names are lower-cased and matched by substring against
        keyword groups in priority order; the first group that matches wins
        and "default" is used when none does.

        Returns:
            (action_type, action, description)
        """
        services = incident.affected_services
        target = services[0] if services else "unknown-service"
        signal_count = len(incident.signals)

        lowered_names = [signal.alert_name.lower() for signal in incident.signals]

        # Priority order: crash > error rate > latency > resource.
        keyword_rules = (
            ("pod_crash", ("crash", "restart", "oom")),
            ("high_error_rate", ("error", "fail")),
            ("high_latency", ("latency", "slow", "timeout")),
            ("resource_exhaustion", ("cpu", "memory", "resource")),
        )
        action_type = next(
            (
                kind
                for kind, keywords in keyword_rules
                if any(keyword in name for name in lowered_names for keyword in keywords)
            ),
            "default",
        )

        template = ACTION_TEMPLATES.get(action_type, ACTION_TEMPLATES["default"])
        fields = {"target": target, "signal_count": signal_count}
        return (
            action_type,
            template["action"].format(**fields),
            template["description"].format(**fields),
        )

    # =========================================================================
    # 輔助方法: Playbook RAG 匹配（P0.4 2026-04-25 by Claude Engineer-B）
    # =========================================================================

    async def _try_playbook_match_id(self, incident: Incident) -> str | None:
        """
        Try a Playbook match and return the matched playbook id.

        An id is returned only when the top recommendation has a similarity
        score of at least 0.85. This method only reports the id; it never
        changes the proposed action. Any failure is logged at debug level and
        yields None so the main flow is never blocked.

        Feature flag (W1 PR-P1, 2026-04-28): when
        ``ENABLE_PLAYBOOK_MATCHING`` is falsy the method returns None without
        querying the playbook service.
        """
        if not get_settings().ENABLE_PLAYBOOK_MATCHING:
            logger.debug(
                "playbook_matching_disabled",
                incident_id=getattr(incident, "incident_id", "?"),
            )
            return None

        min_similarity = 0.85
        try:
            # Imported lazily inside the try so an import failure is also
            # treated as "no match".
            from src.models.playbook import SymptomPattern
            from src.services.playbook_service import get_playbook_service

            signals = incident.signals
            names = [signal.alert_name for signal in signals] if signals else []
            pattern = SymptomPattern(
                alert_names=names,
                affected_services=incident.affected_services or [],
                severity_range=[incident.severity.value] if incident.severity else ["P2"],
            )
            playbook_service = get_playbook_service()
            hits = await playbook_service.get_recommendations(
                symptoms=pattern,
                top_k=1,
            )
            # No candidates, or the best one is below the threshold.
            if not hits or hits[0].similarity_score < min_similarity:
                return None

            top_hit = hits[0]
            matched_id = top_hit.playbook.playbook_id
            logger.info(
                "proposal_playbook_matched",
                incident_id=incident.incident_id,
                playbook_id=matched_id,
                similarity=top_hit.similarity_score,
            )
            return matched_id
        except Exception as exc:
            logger.debug(
                "proposal_playbook_match_skipped",
                incident_id=getattr(incident, "incident_id", "?"),
                error=str(exc),
            )
            return None

    # =========================================================================
    # 輔助方法: 建立 BlastRadius
    # =========================================================================

    def _build_blast_radius(self, incident: Incident) -> BlastRadius:
        """
        Build the blast-radius estimate for an incident.

        Downtime and data impact are looked up from the severity; the pod
        count is estimated as two per affected service (minimum 1) and at most
        five related services are listed.
        """
        services = incident.affected_services
        pod_estimate = max(1, len(services) * 2)

        # severity → (estimated downtime, data impact)
        severity_profile = {
            Severity.P0: ("5-15 min", DataImpact.DESTRUCTIVE),
            Severity.P1: ("2-5 min", DataImpact.WRITE),
            Severity.P2: ("< 2 min", DataImpact.READ_ONLY),
            Severity.P3: ("0 min", DataImpact.NONE),
        }
        downtime, impact = severity_profile.get(
            incident.severity,
            ("unknown", DataImpact.NONE),
        )

        return BlastRadius(
            affected_pods=pod_estimate,
            estimated_downtime=downtime,
            related_services=services[:5],
            data_impact=impact,
        )

    def _build_dry_run_checks(self, incident: Incident) -> list[DryRunCheck]:
        """
        Build the list of Dry-Run check entries for an incident.

        Every entry is emitted with ``passed=True``; no check is actually
        executed here. P0/P1 incidents get one extra blast-radius entry.
        """
        service_count = len(incident.affected_services)
        entries = [
            ("RBAC Permission", "leWOOOgo has sufficient permissions"),
            ("Resource Exists", f"Target resources verified: {service_count} services"),
            ("Syntax Validation", "Command syntax validated"),
        ]

        # Extra entry for high-severity incidents.
        if incident.severity in (Severity.P0, Severity.P1):
            entries.append(
                (
                    "Blast Radius Assessment",
                    f"High severity ({incident.severity.value}): Multi-sig required",
                )
            )

        return [
            DryRunCheck(name=name, passed=True, message=message)
            for name, message in entries
        ]

    # =========================================================================
    # 輔助方法: 持久化 Incident
    # =========================================================================

    async def _persist_incident(self, incident: Incident) -> None:
        """
        Write the updated Incident to Redis and to the DB (best effort).

        Redis: delegated to the brain incident memory
        (``get_incident_memory().save_incident``) after converting with
        ``local_to_brain`` (ADR-046, 2026-04-01 ogt).
        DB: only ``status``, ``proposal_ids`` and ``updated_at`` of the
        existing row are updated; no row is created here.

        Both steps are independent: a failure in one is logged as a warning
        and does not prevent the other, and nothing is raised to the caller.
        A missing DB row is also logged so the skipped update is visible.

        NOTE(review): ``_load_incident`` reads through ``incident_service``
        while this method writes through ``get_incident_memory()`` — confirm
        both address the same Redis key.
        """
        # 1. Update Redis (delegated to the brain incident memory).
        try:
            brain_incident = local_to_brain(incident)
            await get_incident_memory().save_incident(brain_incident)
        except Exception as e:
            logger.warning(
                "redis_persist_failed",
                incident_id=incident.incident_id,
                error=str(e),
            )

        # 2. Update the DB row, if one exists.
        try:
            async with get_db_context() as db:
                from sqlalchemy import select

                stmt = select(IncidentRecord).where(
                    IncidentRecord.incident_id == incident.incident_id
                )
                result = await db.execute(stmt)
                record = result.scalar_one_or_none()

                if record:
                    record.status = incident.status.value
                    record.proposal_ids = [str(pid) for pid in incident.proposal_ids]
                    record.updated_at = incident.updated_at
                    # Explicit commit so the change is persisted.
                    await db.commit()
                    logger.info(
                        "db_incident_updated",
                        incident_id=incident.incident_id,
                        new_status=incident.status.value,
                    )
                else:
                    # Previously a silent no-op; surface it for auditing.
                    logger.warning(
                        "db_incident_not_found",
                        incident_id=incident.incident_id,
                    )

        except Exception as e:
            logger.warning(
                "db_persist_failed",
                incident_id=incident.incident_id,
                error=str(e),
            )

    # =========================================================================
    # Phase 6.5: 簽核完成後更新 Incident 狀態
    # =========================================================================

    async def resolve_incident_after_approval(
        self,
        incident_id: str,
        approval_id: str | None = None,
    ) -> bool:
        """
        Mark an Incident as RESOLVED once its approval is complete.

        Three independent, best-effort steps are performed; a failure in one
        is logged and does not stop the others:
        1. Redis: set status=RESOLVED, resolved_at and updated_at on the
           cached Incident and rewrite it with a 7-day expiry.
        2. DB: set status/resolved_at/updated_at on the IncidentRecord row,
           if one exists.
        3. DecisionToken: move the incident's existing token (if any) to
           COMPLETED.

        A single timestamp is used for every store so Redis and the DB agree
        on when the incident was resolved.

        Args:
            incident_id: Incident ID.
            approval_id: The approval that triggered the resolution (logging only).

        Returns:
            True when the Redis copy was updated. When the Incident is absent
            from Redis, the result follows the DB step instead (True if the
            row was updated or no row exists, False if the DB step failed).
            False when the Redis step itself raised.
        """
        from sqlalchemy import select

        logger.info(
            "resolve_incident_starting",
            incident_id=incident_id,
            approval_id=approval_id,
        )

        redis_client = get_redis()
        key = f"{INCIDENT_KEY_PREFIX}{incident_id}"
        incident_ttl_seconds = 604800  # 7 days
        now = datetime.now(UTC)
        redis_ok = False
        redis_hit = False  # True when the Incident was actually present in Redis
        db_ok = False

        # 1. Update Redis.
        try:
            data = await redis_client.get(key)
            if data:
                redis_hit = True
                incident = Incident.model_validate_json(data)
                old_status = incident.status.value
                incident.status = IncidentStatus.RESOLVED
                incident.resolved_at = now
                incident.updated_at = now
                # The decision chain holds the full AI reasoning trail and is
                # intentionally left untouched; RESOLVED status conveys the outcome.
                await redis_client.set(key, incident.model_dump_json(), ex=incident_ttl_seconds)
                redis_ok = True
                logger.info(
                    "resolve_incident_redis_updated",
                    incident_id=incident_id,
                    old_status=old_status,
                    new_status="resolved",
                )
            else:
                # Not in Redis: nothing to rewrite there; the DB step below
                # decides the outcome.
                logger.warning(
                    "resolve_incident_redis_miss",
                    incident_id=incident_id,
                    note="Incident not found in Redis, will update DB only",
                )
                # A miss is not a Redis failure.
                redis_ok = True
        except Exception as e:
            logger.exception(
                "resolve_incident_redis_error",
                incident_id=incident_id,
                error=str(e),
            )

        # 2. Update the DB row, if one exists.
        try:
            async with get_db_context() as db:
                stmt = select(IncidentRecord).where(
                    IncidentRecord.incident_id == incident_id
                )
                result = await db.execute(stmt)
                record = result.scalar_one_or_none()
                if record:
                    record.status = "resolved"
                    record.resolved_at = now
                    record.updated_at = now
                    # Commit explicitly; db_ok is only set after it succeeds.
                    await db.commit()
                    db_ok = True
                    logger.info(
                        "resolve_incident_db_updated",
                        incident_id=incident_id,
                        resolved_at=now.isoformat(),
                    )
                else:
                    # No row to update is treated as success (the Incident may
                    # exist only in Redis if the original DB write failed).
                    db_ok = True
                    logger.warning(
                        "resolve_incident_db_not_found",
                        incident_id=incident_id,
                        note=(
                            "Incident exists in Redis but not in DB, this is acceptable"
                            if redis_hit
                            else "Incident not found in Redis or DB, nothing to update"
                        ),
                    )
        except Exception as e:
            logger.exception(
                "resolve_incident_db_error",
                incident_id=incident_id,
                error=str(e),
            )

        # 3. Move the DecisionToken to COMPLETED.
        # The token is stored separately from the Incident; per the original
        # note, leaving it stale makes the next poll show the prompt again.
        decision_ok = False
        try:
            from src.services.decision_manager import (
                DecisionState,
                get_decision_manager,
            )
            decision_manager = get_decision_manager()
            existing_token = await decision_manager._find_existing_token(incident_id)
            if existing_token:
                await decision_manager.update_token_state(
                    existing_token.token,
                    DecisionState.COMPLETED,
                )
                decision_ok = True
                logger.info(
                    "resolve_decision_token_updated",
                    incident_id=incident_id,
                    token=existing_token.token,
                    new_state="completed",
                )
            else:
                decision_ok = True  # no token to update also counts as success
                logger.warning(
                    "resolve_decision_token_not_found",
                    incident_id=incident_id,
                )
        except Exception as e:
            logger.exception(
                "resolve_decision_token_error",
                incident_id=incident_id,
                error=str(e),
            )

        # Redis is authoritative for the result (the API reads Redis). On a
        # Redis miss nothing was written there, so the DB step decides —
        # otherwise a failed DB update would be reported as success.
        success = redis_ok and (redis_hit or db_ok)
        logger.info(
            "resolve_incident_completed",
            incident_id=incident_id,
            approval_id=approval_id,
            success=success,
            redis_ok=redis_ok,
            db_ok=db_ok,
            decision_ok=decision_ok,
        )

        return success


# =============================================================================
# Singleton
# =============================================================================

_proposal_service: ProposalService | None = None


def get_proposal_service() -> ProposalService:
    """Return the shared ProposalService, creating it on first use."""
    global _proposal_service
    service = _proposal_service
    if service is None:
        # First call: build the instance and cache it at module level.
        service = _proposal_service = ProposalService()
    return service