"""
Decision Proposal Service - Phase 6.4 decision output layer
===========================================================

Responsibilities:
1. Generate a Decision Proposal (remediation action) from an Incident
2. Integrate TrustEngine to evaluate the risk level
3. Create a backward-compatible ApprovalRequest
4. Link the Proposal to the Incident and advance its status

Design principles:
- Backward compatible: the generated Proposal fully matches the existing
  ApprovalRequest format
- Zero frontend changes: /approvals/pending can render it directly
- Traceable: Incident.proposal_ids records every decision attempt

Iron rules:
- Never skip the TrustEngine evaluation
- Every decision must be auditable
"""

from datetime import UTC, datetime

import structlog

from src.core.config import get_settings
from src.db.base import get_db_context
from src.db.models import IncidentRecord
from src.models.approval import (
    ApprovalRequest,
    ApprovalRequestCreate,
    BlastRadius,
    DataImpact,
    DryRunCheck,
)
from src.models.approval import (
    RiskLevel as ApprovalRiskLevel,
)
from src.models.incident import (
    Incident,
    IncidentStatus,
    Severity,
)
from src.core.redis_client import get_redis
from src.services.approval_db import get_approval_service
from src.services.incident_engine import get_incident_engine
from src.services.incident_memory import get_incident_memory
from src.services.incident_service import INCIDENT_KEY_PREFIX, get_incident_service
from src.services.openclaw import get_openclaw
from src.services.trust_engine import normalize_action_pattern, trust_engine
from src.utils.incident_converter import local_to_brain

logger = structlog.get_logger(__name__)


# =============================================================================
# Constants
# =============================================================================

# Severity -> RiskLevel mapping (used by the template fallback path)
SEVERITY_TO_RISK: dict[Severity, ApprovalRiskLevel] = {
    Severity.P0: ApprovalRiskLevel.CRITICAL,  # P0 (critical) -> CRITICAL (2 sign-offs)
    Severity.P1: ApprovalRiskLevel.CRITICAL,  # P1 (high) -> CRITICAL (2 sign-offs)
    Severity.P2: ApprovalRiskLevel.MEDIUM,  # P2 (warning) -> MEDIUM (1 sign-off)
    Severity.P3: ApprovalRiskLevel.LOW,  # P3 (info) -> LOW (auto-approved)
}

# Action templates keyed by alert category. The strings are user-facing and
# are formatted with `target` and `signal_count`.
ACTION_TEMPLATES: dict[str, dict[str, str]] = {
    "pod_crash": {
        "action": "Restart deployment: {target}",
        "description": "AI 建議重啟部署以恢復服務。根據 {signal_count} 筆告警分析,服務 {target} 可能需要重啟。",
    },
    "high_latency": {
        "action": "Scale up deployment: {target}",
        "description": "AI 建議擴容以降低延遲。當前延遲超標,增加副本數可緩解負載。",
    },
    "high_error_rate": {
        "action": "Rollback deployment: {target}",
        "description": "AI 建議回滾部署。錯誤率過高,可能是最近部署引入的問題。",
    },
    "resource_exhaustion": {
        "action": "Scale up deployment: {target} to 3 replicas",
        "description": "AI 建議擴容。CPU/Memory 使用率超標,需增加副本分散負載。",
    },
    "default": {
        "action": "Investigate service: {target}",
        "description": "AI 無法確定具體修復動作,建議人工調查。收到 {signal_count} 筆相關告警。",
    },
}

# Free-text risk levels an LLM may return -> ApprovalRiskLevel.
# 2026-04-09 Claude Sonnet 4.6 (P1-2 QA fix): the "high" key exists so that an
# LLM answering "high" is not silently downgraded to MEDIUM.
_LLM_RISK_MAP: dict[str, ApprovalRiskLevel] = {
    "low": ApprovalRiskLevel.LOW,
    "medium": ApprovalRiskLevel.MEDIUM,
    "high": ApprovalRiskLevel.HIGH,
    "critical": ApprovalRiskLevel.CRITICAL,
}

# Alert-name keywords per action type, in priority order:
# crash > error_rate > latency > resource.
_ACTION_KEYWORDS: tuple[tuple[str, tuple[str, ...]], ...] = (
    ("pod_crash", ("crash", "restart", "oom")),
    ("high_error_rate", ("error", "fail")),
    ("high_latency", ("latency", "slow", "timeout")),
    ("resource_exhaustion", ("cpu", "memory", "resource")),
)

# Minimum similarity for a Playbook recommendation to be recorded as a match.
_PLAYBOOK_SIMILARITY_THRESHOLD = 0.85

# Expiry (seconds) applied when a resolved Incident is written back to Redis:
# 7 * 24 * 60 * 60 = 604800.
_INCIDENT_REDIS_TTL_SECONDS = 604800


def _resolve_llm_risk(raw_risk: object) -> ApprovalRiskLevel:
    """
    Convert the LLM-provided risk level into an ApprovalRiskLevel.

    The lookup ignores case and surrounding whitespace, so "High", "HIGH" and
    " high " all resolve to HIGH. Anything that is not a recognised string
    (including None) falls back to MEDIUM.
    """
    if not isinstance(raw_risk, str):
        return ApprovalRiskLevel.MEDIUM
    return _LLM_RISK_MAP.get(raw_risk.strip().lower(), ApprovalRiskLevel.MEDIUM)


# =============================================================================
# Proposal Service
# =============================================================================


class ProposalService:
    """
    Decision proposal service - Phase 6.4

    Responsibilities:
    1. Analyse an Incident and generate a remediation proposal (LLM-based)
    2. Evaluate the risk level
    3. Create an ApprovalRequest (backward compatible with the frontend)
    4. Update the Incident status and its proposal links

    Phase 6.4 upgrade:
    - Uses the OpenClaw LLM to generate proposals
    - Falls back to the template scheme so a proposal is always available
    """

    def __init__(self) -> None:
        self._approval_service = get_approval_service()
        self._openclaw = get_openclaw()

    # =========================================================================
    # Core method: generate a Proposal from an Incident
    # =========================================================================

    async def generate_proposal(
        self,
        incident_id: str,
    ) -> tuple[ApprovalRequest | None, str]:
        """
        Generate a Decision Proposal from an Incident.

        Flow:
        1. Load the Incident
        2. Ask the LLM for a proposal (template fallback when that fails)
        3. Adjust the risk level through TrustEngine
        4. Create the ApprovalRequest
        5. Link the Proposal to the Incident
        6. Advance the Incident status to MITIGATING
        7. Persist the Incident to Redis + DB

        Args:
            incident_id: Incident ID

        Returns:
            (ApprovalRequest, message) on success, or (None, error_message).
            Any exception raised along the way is logged and reported through
            the error message instead of propagating.
        """
        try:
            # 1. Load the Incident
            incident = await self._load_incident(incident_id)
            if not incident:
                return None, f"Incident not found: {incident_id}"

            # Only incidents that are being investigated or mitigated may get a proposal
            if incident.status not in (IncidentStatus.INVESTIGATING, IncidentStatus.MITIGATING):
                return None, f"Cannot generate proposal for status: {incident.status.value}"

            logger.info(
                "generating_proposal",
                incident_id=incident_id,
                severity=incident.severity.value,
                signal_count=len(incident.signals),
            )

            # 2. Ask the OpenClaw LLM for a proposal (Phase 6.4 core)
            # Phase 22: upgraded to OpenClaw + Nemotron collaboration (ADR-044)
            # 2026-03-31 Claude Code: use the _with_tools variant for dual-track collaboration
            target = incident.affected_services[0] if incident.affected_services else "unknown"
            signals_dict = [s.model_dump() for s in incident.signals]

            (
                llm_proposal,
                provider,
                llm_success,
            ) = await self._openclaw.generate_incident_proposal_with_tools(
                incident_id=incident_id,
                severity=incident.severity.value,
                signals=signals_dict,
                affected_services=incident.affected_services,
            )
            use_llm = bool(llm_success and llm_proposal)

            if use_llm:
                # 2026-04-09 Claude Sonnet 4.6: prefer kubectl_command (executable) for
                # `action`, falling back to the LLM's `action` title. A title such as
                # "未知操作 | " cannot be parsed by parse_operation_from_action, which
                # made approved proposals always skip execution.
                # `or ""` / str(): a key that is present with a None value must not
                # abort the whole proposal with AttributeError.
                kubectl_command = str(llm_proposal.get("kubectl_command") or "").strip()
                action = kubectl_command if kubectl_command else llm_proposal["action"]
                description = (
                    f"{llm_proposal['description']}\n\n**AI 推理**: {llm_proposal['reasoning']}"
                )
                action_type = str(llm_proposal.get("primary_responsibility") or "default").lower()

                # Convert the LLM risk level (case/whitespace-insensitive)
                llm_risk = llm_proposal.get("risk_level", "medium")
                base_risk = _resolve_llm_risk(llm_risk)

                logger.info(
                    "llm_proposal_generated",
                    incident_id=incident_id,
                    provider=provider,
                    action=action[:50],
                    risk_level=llm_risk,
                    confidence=llm_proposal.get("confidence", 0),
                )
            else:
                # Fallback to the template scheme
                logger.warning(
                    "llm_proposal_fallback_to_template",
                    incident_id=incident_id,
                    provider=provider,
                )
                action_type, action, description = self._determine_action(incident)
                base_risk = SEVERITY_TO_RISK.get(incident.severity, ApprovalRiskLevel.MEDIUM)

            # 3. Evaluate the risk level (TrustEngine adjustment)
            action_pattern = normalize_action_pattern(action_type, {"resource": target})
            risk_adjustment = trust_engine.evaluate_adjusted_risk(
                action_pattern=action_pattern,
                original_risk=base_risk.value,
            )
            adjusted_risk = ApprovalRiskLevel(risk_adjustment.adjusted_risk.value)

            logger.info(
                "risk_evaluated",
                incident_id=incident_id,
                original_risk=base_risk.value,
                adjusted_risk=adjusted_risk.value,
                trust_score=risk_adjustment.trust_score,
            )

            # 4. Create the ApprovalRequest
            blast_radius = self._build_blast_radius(incident)
            dry_run_checks = self._build_dry_run_checks(incident)

            # Metadata stored with the approval (including LLM details)
            metadata = {
                "incident_id": incident_id,
                "severity": incident.severity.value,
                "signal_count": len(incident.signals),
                "affected_services": incident.affected_services,
                "trust_adjustment": risk_adjustment.to_dict(),
            }

            # LLM-related details (Phase 6.4)
            if use_llm:
                metadata["llm_provider"] = llm_proposal.get("provider", "unknown")
                metadata["llm_confidence"] = llm_proposal.get("confidence", 0)
                metadata["llm_from_cache"] = llm_proposal.get("from_cache", False)
                metadata["kubectl_command"] = llm_proposal.get("kubectl_command", "")
                metadata["signoz_correlation"] = llm_proposal.get("signoz_correlation", "")
                metadata["optimization_suggestions"] = llm_proposal.get(
                    "optimization_suggestions", []
                )

            # 2026-04-25 P0.4 fix by Claude Engineer-B:
            # the manual path (API call to generate_proposal) also runs Playbook RAG
            # matching so matched_playbook_id is written to the DB; the learning
            # service EWMA needs it to update the trust score. The decision_manager
            # auto_execute path already has this logic; this closes the manual gap.
            matched_pb_id: str | None = await self._try_playbook_match_id(incident)

            approval_create = ApprovalRequestCreate(
                action=action,
                description=description,
                risk_level=adjusted_risk,
                blast_radius=blast_radius,
                dry_run_checks=dry_run_checks,
                requested_by="OpenClaw AI",
                incident_id=incident_id,
                metadata=metadata,
                matched_playbook_id=matched_pb_id,
            )

            approval = await self._approval_service.create_approval(approval_create)

            logger.info(
                "approval_created",
                incident_id=incident_id,
                approval_id=str(approval.id),
                risk_level=approval.risk_level.value,
            )

            # 5. Link the Proposal to the Incident
            incident.proposal_ids.append(approval.id)

            # 6. Advance the status to MITIGATING
            if incident.status == IncidentStatus.INVESTIGATING:
                incident.status = IncidentStatus.MITIGATING
                logger.info(
                    "incident_status_updated",
                    incident_id=incident_id,
                    new_status="MITIGATING",
                )

            incident.updated_at = datetime.now(UTC)

            # 7. Update Redis + DB
            await self._persist_incident(incident)

            message = (
                f"Proposal generated: {approval.action[:50]}... (Risk: {adjusted_risk.value})"
            )
            return approval, message

        except Exception as e:
            logger.exception(
                "generate_proposal_error",
                incident_id=incident_id,
                error=str(e),
            )
            return None, f"Error generating proposal: {str(e)}"

    # =========================================================================
    # Helper: load the Incident
    # =========================================================================

    async def _load_incident(self, incident_id: str) -> Incident | None:
        """
        Load an Incident from working memory.

        2026-04-02 ogt: fix - the brain engine used the awoooi:incidents: prefix
        while the data is actually stored under the incident: prefix, which
        always produced a 404. Reads through
        incident_service.get_from_working_memory() to hit the correct key.

        Returns:
            The Incident, or None when the lookup raises (the error is logged).
        """
        try:
            return await get_incident_service().get_from_working_memory(incident_id)
        except Exception as e:
            logger.warning(
                "load_incident_failed",
                incident_id=incident_id,
                error=str(e),
            )
            return None

    # =========================================================================
    # Helper: decide the remediation action
    # =========================================================================

    def _determine_action(
        self,
        incident: Incident,
    ) -> tuple[str, str, str]:
        """
        Pick a template remediation action from the Incident's alert names.

        The first action type (in priority order crash > error_rate > latency >
        resource) with a keyword contained in any lower-cased alert name wins;
        otherwise the "default" template is used.

        Returns:
            (action_type, action, description)
        """
        target = incident.affected_services[0] if incident.affected_services else "unknown-service"
        signal_count = len(incident.signals)

        alert_names = [s.alert_name.lower() for s in incident.signals]

        action_type = "default"
        for candidate, keywords in _ACTION_KEYWORDS:
            if any(keyword in name for name in alert_names for keyword in keywords):
                action_type = candidate
                break

        template = ACTION_TEMPLATES.get(action_type, ACTION_TEMPLATES["default"])
        action = template["action"].format(target=target, signal_count=signal_count)
        description = template["description"].format(target=target, signal_count=signal_count)

        return action_type, action, description

    # =========================================================================
    # Helper: Playbook RAG matching (P0.4 2026-04-25 by Claude Engineer-B)
    # =========================================================================

    async def _try_playbook_match_id(self, incident: Incident) -> str | None:
        """
        Try a Playbook RAG match and return matched_playbook_id.

        The ID is returned only when the best recommendation's similarity is
        >= 0.85.

        Motivation: the manual path (generate_proposal) fills in
        matched_playbook_id so the learning service EWMA can update the Playbook
        trust score after human review. Only the ID is returned; the action is
        left untouched. Failures return None without blocking the main flow.

        W1 PR-P1 Feature Flag (2026-04-28 ogt + Claude Sonnet 4.6):
        ENABLE_PLAYBOOK_MATCHING=false -> returns None (used for rollback).
        """
        if not get_settings().ENABLE_PLAYBOOK_MATCHING:
            logger.debug(
                "playbook_matching_disabled",
                incident_id=getattr(incident, "incident_id", "?"),
            )
            return None

        try:
            # Imported lazily inside the try so that an import failure is also
            # treated as "no match".
            from src.models.playbook import SymptomPattern
            from src.services.playbook_service import get_playbook_service

            alert_names = [s.alert_name for s in incident.signals] if incident.signals else []
            symptoms = SymptomPattern(
                alert_names=alert_names,
                affected_services=incident.affected_services or [],
                severity_range=[incident.severity.value] if incident.severity else ["P2"],
            )

            recommendations = await get_playbook_service().get_recommendations(
                symptoms=symptoms,
                top_k=1,
            )
            if not recommendations:
                return None

            best_match = recommendations[0]
            if best_match.similarity_score < _PLAYBOOK_SIMILARITY_THRESHOLD:
                return None

            pb_id = best_match.playbook.playbook_id
            logger.info(
                "proposal_playbook_matched",
                incident_id=incident.incident_id,
                playbook_id=pb_id,
                similarity=best_match.similarity_score,
            )
            return pb_id

        except Exception as e:
            logger.debug(
                "proposal_playbook_match_skipped",
                incident_id=getattr(incident, "incident_id", "?"),
                error=str(e),
            )
            return None

    # =========================================================================
    # Helper: build the BlastRadius
    # =========================================================================

    def _build_blast_radius(self, incident: Incident) -> BlastRadius:
        """
        Build the blast-radius estimate for an Incident.

        Downtime and data impact are looked up from the severity; the pod count
        is estimated as two per affected service, with a minimum of 1.
        """
        affected_count = len(incident.affected_services)

        # Estimated downtime by severity
        downtime_map = {
            Severity.P0: "5-15 min",
            Severity.P1: "2-5 min",
            Severity.P2: "< 2 min",
            Severity.P3: "0 min",
        }

        # Data impact by severity
        impact_map = {
            Severity.P0: DataImpact.DESTRUCTIVE,
            Severity.P1: DataImpact.WRITE,
            Severity.P2: DataImpact.READ_ONLY,
            Severity.P3: DataImpact.NONE,
        }

        return BlastRadius(
            affected_pods=max(1, affected_count * 2),  # estimated number of affected pods
            estimated_downtime=downtime_map.get(incident.severity, "unknown"),
            related_services=incident.affected_services[:5],  # at most 5
            data_impact=impact_map.get(incident.severity, DataImpact.NONE),
        )

    def _build_dry_run_checks(self, incident: Incident) -> list[DryRunCheck]:
        """
        Build the dry-run check entries attached to the approval.

        NOTE(review): every check is created with passed=True and a fixed
        message; nothing is actually verified in this method.
        """
        checks = [
            DryRunCheck(
                name="RBAC Permission",
                passed=True,
                message="leWOOOgo has sufficient permissions",
            ),
            DryRunCheck(
                name="Resource Exists",
                passed=True,
                message=f"Target resources verified: {len(incident.affected_services)} services",
            ),
            DryRunCheck(
                name="Syntax Validation",
                passed=True,
                message="Command syntax validated",
            ),
        ]

        # P0/P1 get an additional check entry
        if incident.severity in (Severity.P0, Severity.P1):
            checks.append(
                DryRunCheck(
                    name="Blast Radius Assessment",
                    passed=True,
                    message=f"High severity ({incident.severity.value}): Multi-sig required",
                )
            )

        return checks

    # =========================================================================
    # Helper: persist the Incident
    # =========================================================================

    async def _persist_incident(self, incident: Incident) -> None:
        """
        Write the Incident back to Redis + DB.

        ADR-046 (2026-04-01 ogt): the Redis write is delegated to the brain
        DualIncidentMemory to keep the key prefix consistent
        (awoooi:incidents:{id}). The DB update touches only status,
        proposal_ids and updated_at (minimal change).

        Both steps are best-effort: a failure is logged as a warning and does
        not raise.
        """
        # 1. Update Redis (delegated to the brain DualIncidentMemory)
        try:
            brain_incident = local_to_brain(incident)
            await get_incident_memory().save_incident(brain_incident)
        except Exception as e:
            logger.warning(
                "redis_persist_failed",
                incident_id=incident.incident_id,
                error=str(e),
            )

        # 2. Update the DB
        try:
            async with get_db_context() as db:
                from sqlalchemy import select

                stmt = select(IncidentRecord).where(
                    IncidentRecord.incident_id == incident.incident_id
                )
                result = await db.execute(stmt)
                record = result.scalar_one_or_none()

                if record:
                    record.status = incident.status.value
                    record.proposal_ids = [str(pid) for pid in incident.proposal_ids]
                    record.updated_at = incident.updated_at
                    # Explicit commit so the change is persisted
                    await db.commit()

                    logger.info(
                        "db_incident_updated",
                        incident_id=incident.incident_id,
                        new_status=incident.status.value,
                    )
        except Exception as e:
            logger.warning(
                "db_persist_failed",
                incident_id=incident.incident_id,
                error=str(e),
            )

    # =========================================================================
    # Phase 6.5: update the Incident status once approval completes
    # =========================================================================

    async def resolve_incident_after_approval(
        self,
        incident_id: str,
        approval_id: str | None = None,
    ) -> bool:
        """
        Mark an Incident as RESOLVED after its approval completes.

        Called when an Approval reaches the required number of sign-offs.
        Updates, each step independently and best-effort:
        1. the Incident stored in Redis (status, resolved_at, updated_at)
        2. the IncidentRecord row in the DB, when one exists
        3. the DecisionToken state -> COMPLETED, when a token exists

        A single timestamp is used for all stores so resolved_at / updated_at
        agree between Redis and the DB.

        Args:
            incident_id: Incident ID
            approval_id: ID of the completed Approval (used for logging only)

        Returns:
            True when the Redis step did not raise (a Redis miss counts as
            success); the DB and DecisionToken outcomes are only logged.
        """
        from sqlalchemy import select

        logger.info(
            "resolve_incident_starting",
            incident_id=incident_id,
            approval_id=approval_id,
        )

        redis_client = get_redis()
        key = f"{INCIDENT_KEY_PREFIX}{incident_id}"
        now = datetime.now(UTC)
        redis_ok = False
        db_ok = False

        # 1. Update Redis
        try:
            data = await redis_client.get(key)
            if data:
                incident = Incident.model_validate_json(data)
                old_status = incident.status.value
                incident.status = IncidentStatus.RESOLVED
                incident.resolved_at = now
                incident.updated_at = now

                # Note: decision_chain holds the full AI reasoning and is not
                # modified here; the state is expressed by status = RESOLVED.

                await redis_client.set(
                    key,
                    incident.model_dump_json(),
                    ex=_INCIDENT_REDIS_TTL_SECONDS,
                )
                redis_ok = True

                logger.info(
                    "resolve_incident_redis_updated",
                    incident_id=incident_id,
                    old_status=old_status,
                    new_status="resolved",
                )
            else:
                # Redis miss: there is nothing to update in Redis. Still count it
                # as success so the result reflects the DB-only update below.
                logger.warning(
                    "resolve_incident_redis_miss",
                    incident_id=incident_id,
                    note="Incident not found in Redis, will update DB only",
                )
                redis_ok = True
        except Exception as e:
            logger.exception(
                "resolve_incident_redis_error",
                incident_id=incident_id,
                error=str(e),
            )

        # 2. Update the DB (when a record exists)
        try:
            async with get_db_context() as db:
                stmt = select(IncidentRecord).where(
                    IncidentRecord.incident_id == incident_id
                )
                result = await db.execute(stmt)
                record = result.scalar_one_or_none()

                if record:
                    record.status = "resolved"
                    record.resolved_at = now
                    record.updated_at = now
                    # Explicit commit so the change is persisted
                    await db.commit()
                    db_ok = True

                    logger.info(
                        "resolve_incident_db_updated",
                        incident_id=incident_id,
                        resolved_at=now.isoformat(),
                    )
                else:
                    # No DB record: acceptable (the Incident may exist only in
                    # Redis because an earlier DB write failed). Treated as
                    # success since there is nothing to update.
                    db_ok = True
                    logger.warning(
                        "resolve_incident_db_not_found",
                        incident_id=incident_id,
                        note="Incident exists in Redis but not in DB, this is acceptable",
                    )
        except Exception as e:
            logger.exception(
                "resolve_incident_db_error",
                incident_id=incident_id,
                error=str(e),
            )

        # 3. Update the DecisionToken state to COMPLETED
        # Key fix: the DecisionToken is stored separately under decision:{token}
        # and must be kept in sync, otherwise the next poll still shows Y/n.
        decision_ok = False
        try:
            from src.services.decision_manager import (
                DecisionState,
                get_decision_manager,
            )

            decision_manager = get_decision_manager()
            existing_token = await decision_manager._find_existing_token(incident_id)

            if existing_token:
                await decision_manager.update_token_state(
                    existing_token.token,
                    DecisionState.COMPLETED,
                )
                decision_ok = True
                logger.info(
                    "resolve_decision_token_updated",
                    incident_id=incident_id,
                    token=existing_token.token,
                    new_state="completed",
                )
            else:
                decision_ok = True  # no token also counts as success
                logger.warning(
                    "resolve_decision_token_not_found",
                    incident_id=incident_id,
                )
        except Exception as e:
            logger.exception(
                "resolve_decision_token_error",
                incident_id=incident_id,
                error=str(e),
            )

        # Success is defined by the Redis step alone (the API only reads Redis)
        success = redis_ok

        logger.info(
            "resolve_incident_completed",
            incident_id=incident_id,
            approval_id=approval_id,
            success=success,
            redis_ok=redis_ok,
            db_ok=db_ok,
            decision_ok=decision_ok,
        )

        return success


# =============================================================================
# Singleton
# =============================================================================

_proposal_service: ProposalService | None = None


def get_proposal_service() -> ProposalService:
    """Return the process-wide ProposalService instance, creating it on first use."""
    global _proposal_service
    if _proposal_service is None:
        _proposal_service = ProposalService()
    return _proposal_service