Files
awoooi/apps/api/src/services/proposal_service.py
OG T 890e2a9568 fix(review): 架構審查修復 — P0 import crash + i18n 零 hardcode + 靜默錯誤
P0:
- proposal_service.py: 補 get_redis + INCIDENT_KEY_PREFIX import
  (修前: resolve_incident_after_approval 必 NameError crash)

P1 i18n:
- page.tsx: 拓撲群組移除 emoji,改用 tTopo() i18n key
- page.tsx: 主機標籤 (DevOps金庫等) 改 tTopo() i18n
- ai-model-status.tsx: 加 useTranslations,AI 模型狀態 → t('aiModelStatus')
- disposition-mini.tsx: 查看完整報表 → t('viewAllReport')
- recent-activity.tsx: 查看活動串流 → t('viewAllAlerts')

P2 品質:
- pending-approvals-card.tsx: approve/reject 加 r.ok 檢查+錯誤顯示,查看全部授權加路由+i18n
- page-tabs.tsx: TabSkeleton 載入中... → t('loading')
- page.tsx: ↑5% → tDashboard('trendUp', {pct}) 動態值
- page.tsx: Prometheus '23' hardcode → '-- targets'

i18n 新增 key (zh-TW + en 同步):
- dashboard: viewAllAlerts/viewAllAuth/viewAllReport/aiModelStatus/loading/trendUp
- topology: groupExternal/allReachable/investigating/hostDevops/hostAiData/hostK3sMaster/hostK3sWorker

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-09 18:34:50 +08:00

648 lines
24 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Decision Proposal Service - Phase 6.4 決策輸出層
================================================
功能:
1. 從 Incident 生成 Decision Proposal (修復動作)
2. 整合 TrustEngine 評估風險等級
3. 建立向下相容的 ApprovalRequest
4. 關聯 Proposal 到 Incident 並推進狀態
設計原則:
- 向下相容: 生成的 Proposal 完全符合現有 ApprovalRequest 格式
- 前端零改動: /approvals/pending 直接可渲染
- 可追溯: Incident.proposal_ids 記錄所有決策嘗試
統帥鐵律:
- 禁止跳過 TrustEngine 評估
- 所有決策必須可稽核
"""
from datetime import UTC, datetime
import structlog
from src.db.base import get_db_context
from src.db.models import IncidentRecord
from src.models.approval import (
ApprovalRequest,
ApprovalRequestCreate,
BlastRadius,
DataImpact,
DryRunCheck,
)
from src.models.approval import (
RiskLevel as ApprovalRiskLevel,
)
from src.models.incident import (
Incident,
IncidentStatus,
Severity,
)
from src.core.redis_client import get_redis
from src.services.approval_db import get_approval_service
from src.services.incident_engine import get_incident_engine
from src.services.incident_memory import get_incident_memory
from src.services.incident_service import INCIDENT_KEY_PREFIX, get_incident_service
from src.services.openclaw import get_openclaw
from src.services.trust_engine import normalize_action_pattern, trust_engine
from src.utils.incident_converter import local_to_brain
# Module-level structured logger shared by everything in this file.
logger = structlog.get_logger(__name__)
# =============================================================================
# Constants
# =============================================================================
# Mapping from incident Severity to the base approval RiskLevel.
# Used as the starting risk when the template fallback path is taken.
SEVERITY_TO_RISK = {
    Severity.P0: ApprovalRiskLevel.CRITICAL,  # P0 (critical) → CRITICAL (2 sign-offs)
    Severity.P1: ApprovalRiskLevel.CRITICAL,  # P1 (high) → CRITICAL (2 sign-offs)
    Severity.P2: ApprovalRiskLevel.MEDIUM,  # P2 (warning) → MEDIUM (1 sign-off)
    Severity.P3: ApprovalRiskLevel.LOW,  # P3 (info) → LOW (auto-approved)
}
# Action templates keyed by alert category.
# Each entry holds an "action" and a "description" format string; both are
# rendered with str.format(target=..., signal_count=...), so only those two
# placeholders may appear in them.
ACTION_TEMPLATES = {
    "pod_crash": {
        "action": "Restart deployment: {target}",
        "description": "AI 建議重啟部署以恢復服務。根據 {signal_count} 筆告警分析,服務 {target} 可能需要重啟。",
    },
    "high_latency": {
        "action": "Scale up deployment: {target}",
        "description": "AI 建議擴容以降低延遲。當前延遲超標,增加副本數可緩解負載。",
    },
    "high_error_rate": {
        "action": "Rollback deployment: {target}",
        "description": "AI 建議回滾部署。錯誤率過高,可能是最近部署引入的問題。",
    },
    "resource_exhaustion": {
        "action": "Scale up deployment: {target} to 3 replicas",
        "description": "AI 建議擴容。CPU/Memory 使用率超標,需增加副本分散負載。",
    },
    # Used when no alert-name keyword matches any category above.
    "default": {
        "action": "Investigate service: {target}",
        "description": "AI 無法確定具體修復動作,建議人工調查。收到 {signal_count} 筆相關告警。",
    },
}
# =============================================================================
# Proposal Service
# =============================================================================
class ProposalService:
    """Decision proposal service (Phase 6.4).

    Responsibilities:
        1. Analyse an Incident and generate a remediation proposal (LLM-based).
        2. Evaluate the risk level.
        3. Create an ApprovalRequest (backward compatible with the frontend).
        4. Update the Incident status and its proposal links.

    Phase 6.4 upgrade (per the original design notes):
        - Integrates the OpenClaw LLM to generate proposals.
        - Uses _call_with_cache to protect compute resources.
        - Falls back to the template scheme so the service stays available.
    """

    # TTL in seconds applied when a resolved incident is written back to Redis
    # (604800 s = 7 days).
    _RESOLVED_INCIDENT_TTL_SECONDS = 604800

    # Alert-name keyword rules, checked in priority order:
    # crash > error rate > latency > resource exhaustion.
    _ALERT_KEYWORD_RULES: tuple[tuple[str, tuple[str, ...]], ...] = (
        ("pod_crash", ("crash", "restart", "oom")),
        ("high_error_rate", ("error", "fail")),
        ("high_latency", ("latency", "slow", "timeout")),
        ("resource_exhaustion", ("cpu", "memory", "resource")),
    )

    def __init__(self) -> None:
        """Bind the approval service and the OpenClaw client used by this service."""
        self._approval_service = get_approval_service()
        self._openclaw = get_openclaw()

    # =========================================================================
    # Core method: generate a Proposal from an Incident
    # =========================================================================
    async def generate_proposal(
        self,
        incident_id: str,
    ) -> tuple[ApprovalRequest | None, str]:
        """Generate a decision proposal for an incident.

        Flow:
            1. Load the incident from working memory.
            2. Ask the OpenClaw LLM for a proposal; fall back to a template
               when the LLM call is unsuccessful or returns nothing.
            3. Adjust the base risk level through the TrustEngine.
            4. Create the ApprovalRequest.
            5. Link the proposal to the incident.
            6. Move an INVESTIGATING incident to MITIGATING.
            7. Persist the incident (Redis + DB, best effort).

        Args:
            incident_id: ID of the incident to generate a proposal for.

        Returns:
            ``(approval, message)`` on success, or ``(None, error_message)``
            when the incident is missing, is in a status that does not allow
            proposals, or any step raises.
        """
        try:
            # 1. Load the incident.
            incident = await self._load_incident(incident_id)
            if not incident:
                return None, f"Incident not found: {incident_id}"

            # Proposals are only generated while investigating or mitigating.
            if incident.status not in (IncidentStatus.INVESTIGATING, IncidentStatus.MITIGATING):
                return None, f"Cannot generate proposal for status: {incident.status.value}"

            logger.info(
                "generating_proposal",
                incident_id=incident_id,
                severity=incident.severity.value,
                signal_count=len(incident.signals),
            )

            # 2. Ask the OpenClaw LLM for a proposal (Phase 6.4 core).
            #    Phase 22: upgraded to OpenClaw + Nemotron cooperation (ADR-044);
            #    the *_with_tools variant enables the dual-track mode.
            target = incident.affected_services[0] if incident.affected_services else "unknown"
            signals_dict = [s.model_dump() for s in incident.signals]
            llm_proposal, provider, llm_success = await self._openclaw.generate_incident_proposal_with_tools(
                incident_id=incident_id,
                severity=incident.severity.value,
                signals=signals_dict,
                affected_services=incident.affected_services,
            )

            if llm_success and llm_proposal:
                # Prefer the executable kubectl command as the action. The
                # plain action title cannot be parsed by
                # parse_operation_from_action, which previously produced
                # actions like "未知操作 | " that were always skipped after
                # approval. `or ""` keeps an explicit None value from crashing.
                kubectl_command = (llm_proposal.get("kubectl_command") or "").strip()
                action = kubectl_command or llm_proposal["action"]
                description = f"{llm_proposal['description']}\n\n**AI 推理**: {llm_proposal['reasoning']}"
                # None-safe: a present-but-null value falls back to "default".
                action_type = (llm_proposal.get("primary_responsibility") or "default").lower()

                # Convert the LLM-provided risk level. The LLM returns free
                # text, so normalise case and whitespace before the lookup;
                # otherwise "High" or "CRITICAL" would silently drop to MEDIUM.
                llm_risk = llm_proposal.get("risk_level", "medium")
                risk_map = {
                    "low": ApprovalRiskLevel.LOW,
                    "medium": ApprovalRiskLevel.MEDIUM,
                    "high": ApprovalRiskLevel.HIGH,
                    "critical": ApprovalRiskLevel.CRITICAL,
                }
                risk_key = str(llm_risk).strip().lower()
                base_risk = risk_map.get(risk_key, ApprovalRiskLevel.MEDIUM)

                logger.info(
                    "llm_proposal_generated",
                    incident_id=incident_id,
                    provider=provider,
                    action=action[:50],
                    risk_level=llm_risk,
                    confidence=llm_proposal.get("confidence", 0),
                )
            else:
                # Fall back to the template scheme.
                logger.warning(
                    "llm_proposal_fallback_to_template",
                    incident_id=incident_id,
                    provider=provider,
                )
                action_type, action, description = self._determine_action(incident)
                base_risk = SEVERITY_TO_RISK.get(incident.severity, ApprovalRiskLevel.MEDIUM)

            # 3. Let the TrustEngine adjust the base risk level.
            action_pattern = normalize_action_pattern(action_type, {"resource": target})
            risk_adjustment = trust_engine.evaluate_adjusted_risk(
                action_pattern=action_pattern,
                original_risk=base_risk.value,
            )
            adjusted_risk = ApprovalRiskLevel(risk_adjustment.adjusted_risk.value)
            logger.info(
                "risk_evaluated",
                incident_id=incident_id,
                original_risk=base_risk.value,
                adjusted_risk=adjusted_risk.value,
                trust_score=risk_adjustment.trust_score,
            )

            # 4. Create the ApprovalRequest.
            blast_radius = self._build_blast_radius(incident)
            dry_run_checks = self._build_dry_run_checks(incident)

            metadata = {
                "incident_id": incident_id,
                "severity": incident.severity.value,
                "signal_count": len(incident.signals),
                "affected_services": incident.affected_services,
                "trust_adjustment": risk_adjustment.to_dict(),
            }
            # Attach LLM-related details when the LLM path was used (Phase 6.4).
            if llm_success and llm_proposal:
                metadata["llm_provider"] = llm_proposal.get("provider", "unknown")
                metadata["llm_confidence"] = llm_proposal.get("confidence", 0)
                metadata["llm_from_cache"] = llm_proposal.get("from_cache", False)
                metadata["kubectl_command"] = llm_proposal.get("kubectl_command", "")
                metadata["signoz_correlation"] = llm_proposal.get("signoz_correlation", "")
                metadata["optimization_suggestions"] = llm_proposal.get("optimization_suggestions", [])

            approval_create = ApprovalRequestCreate(
                action=action,
                description=description,
                risk_level=adjusted_risk,
                blast_radius=blast_radius,
                dry_run_checks=dry_run_checks,
                requested_by="OpenClaw AI",
                metadata=metadata,
            )
            approval = await self._approval_service.create_approval(approval_create)
            logger.info(
                "approval_created",
                incident_id=incident_id,
                approval_id=str(approval.id),
                risk_level=approval.risk_level.value,
            )

            # 5. Link the proposal to the incident.
            incident.proposal_ids.append(approval.id)

            # 6. Advance INVESTIGATING -> MITIGATING (MITIGATING stays as is).
            if incident.status == IncidentStatus.INVESTIGATING:
                incident.status = IncidentStatus.MITIGATING
                logger.info(
                    "incident_status_updated",
                    incident_id=incident_id,
                    new_status="MITIGATING",
                )
            incident.updated_at = datetime.now(UTC)

            # 7. Persist to Redis + DB (failures there are logged, not raised).
            await self._persist_incident(incident)

            message = f"Proposal generated: {approval.action[:50]}... (Risk: {adjusted_risk.value})"
            return approval, message
        except Exception as e:
            # Service boundary: report the failure to the caller as a message
            # instead of propagating the exception.
            logger.exception(
                "generate_proposal_error",
                incident_id=incident_id,
                error=str(e),
            )
            return None, f"Error generating proposal: {str(e)}"

    # =========================================================================
    # Helper: load an Incident
    # =========================================================================
    async def _load_incident(self, incident_id: str) -> Incident | None:
        """Load an incident from working memory.

        History (2026-04-02, ogt): the brain engine used the
        ``awoooi:incidents:`` prefix while the data was actually stored under
        the ``incident:`` prefix, which made every lookup a 404. Reading via
        ``incident_service.get_from_working_memory()`` uses the correct key.

        Args:
            incident_id: ID of the incident to load.

        Returns:
            Whatever ``get_from_working_memory`` returns, or ``None`` when the
            lookup raises (the error is logged as a warning).
        """
        try:
            return await get_incident_service().get_from_working_memory(incident_id)
        except Exception as e:
            logger.warning(
                "load_incident_failed",
                incident_id=incident_id,
                error=str(e),
            )
            return None

    # =========================================================================
    # Helper: choose a remediation action (template fallback)
    # =========================================================================
    def _determine_action(
        self,
        incident: Incident,
    ) -> tuple[str, str, str]:
        """Pick a template remediation action from the incident's alert names.

        Alert names are lower-cased and matched by substring against
        ``_ALERT_KEYWORD_RULES`` in priority order; the first matching category
        wins, otherwise ``"default"`` is used.

        Args:
            incident: Incident whose signals and affected services are analysed.

        Returns:
            ``(action_type, action, description)`` where the last two are the
            rendered template strings from ``ACTION_TEMPLATES``.
        """
        target = incident.affected_services[0] if incident.affected_services else "unknown-service"
        signal_count = len(incident.signals)
        alert_names = [s.alert_name.lower() for s in incident.signals]

        action_type = next(
            (
                kind
                for kind, keywords in self._ALERT_KEYWORD_RULES
                if any(keyword in name for name in alert_names for keyword in keywords)
            ),
            "default",
        )

        template = ACTION_TEMPLATES.get(action_type, ACTION_TEMPLATES["default"])
        action = template["action"].format(target=target, signal_count=signal_count)
        description = template["description"].format(target=target, signal_count=signal_count)
        return action_type, action, description

    # =========================================================================
    # Helper: build the BlastRadius
    # =========================================================================
    def _build_blast_radius(self, incident: Incident) -> BlastRadius:
        """Build a blast-radius estimate derived from severity and service count.

        Args:
            incident: Incident to estimate the blast radius for.

        Returns:
            A ``BlastRadius`` whose downtime and data impact are looked up by
            severity, and whose pod count is a rough estimate.
        """
        affected_count = len(incident.affected_services)
        # Estimated downtime by severity.
        downtime_map = {
            Severity.P0: "5-15 min",
            Severity.P1: "2-5 min",
            Severity.P2: "< 2 min",
            Severity.P3: "0 min",
        }
        # Data impact by severity.
        impact_map = {
            Severity.P0: DataImpact.DESTRUCTIVE,
            Severity.P1: DataImpact.WRITE,
            Severity.P2: DataImpact.READ_ONLY,
            Severity.P3: DataImpact.NONE,
        }
        return BlastRadius(
            # Rough estimate: two pods per affected service, at least one.
            affected_pods=max(1, affected_count * 2),
            estimated_downtime=downtime_map.get(incident.severity, "unknown"),
            related_services=incident.affected_services[:5],  # at most 5
            data_impact=impact_map.get(incident.severity, DataImpact.NONE),
        )

    def _build_dry_run_checks(self, incident: Incident) -> list[DryRunCheck]:
        """Build the dry-run check entries attached to the approval request.

        NOTE(review): every entry is created with ``passed=True`` and a fixed
        message; no verification is performed in this method.

        Args:
            incident: Incident the checks are built for.

        Returns:
            Three baseline checks, plus a blast-radius entry for P0/P1.
        """
        checks = [
            DryRunCheck(
                name="RBAC Permission",
                passed=True,
                message="leWOOOgo has sufficient permissions",
            ),
            DryRunCheck(
                name="Resource Exists",
                passed=True,
                message=f"Target resources verified: {len(incident.affected_services)} services",
            ),
            DryRunCheck(
                name="Syntax Validation",
                passed=True,
                message="Command syntax validated",
            ),
        ]
        # P0/P1 incidents get one extra entry.
        if incident.severity in (Severity.P0, Severity.P1):
            checks.append(
                DryRunCheck(
                    name="Blast Radius Assessment",
                    passed=True,
                    message=f"High severity ({incident.severity.value}): Multi-sig required",
                )
            )
        return checks

    # =========================================================================
    # Helper: persist an Incident
    # =========================================================================
    async def _persist_incident(self, incident: Incident) -> None:
        """Write the incident back to Redis and the DB (best effort).

        ADR-046 (2026-04-01, ogt): the Redis write is delegated to the brain
        DualIncidentMemory for key-prefix consistency
        (``awoooi:incidents:{id}``). The DB update touches only status,
        proposal_ids and updated_at (minimal change).

        NOTE(review): ``_load_incident`` reads through ``incident_service``
        while this method writes through ``get_incident_memory()``; the notes
        on the two paths mention different key prefixes. Confirm both paths
        address the same Redis key.

        Both steps log a warning on failure and never raise.

        Args:
            incident: Incident whose current state should be persisted.
        """
        # 1. Redis, delegated to the brain DualIncidentMemory.
        try:
            brain_incident = local_to_brain(incident)
            await get_incident_memory().save_incident(brain_incident)
        except Exception as e:
            logger.warning(
                "redis_persist_failed",
                incident_id=incident.incident_id,
                error=str(e),
            )

        # 2. DB.
        try:
            async with get_db_context() as db:
                from sqlalchemy import select

                stmt = select(IncidentRecord).where(
                    IncidentRecord.incident_id == incident.incident_id
                )
                result = await db.execute(stmt)
                record = result.scalar_one_or_none()
                if record:
                    record.status = incident.status.value
                    record.proposal_ids = [str(pid) for pid in incident.proposal_ids]
                    record.updated_at = incident.updated_at
                    # Explicit commit so the change is persisted.
                    await db.commit()
                    logger.info(
                        "db_incident_updated",
                        incident_id=incident.incident_id,
                        new_status=incident.status.value,
                    )
                else:
                    # Make the no-op visible instead of skipping silently.
                    logger.warning(
                        "db_incident_not_found",
                        incident_id=incident.incident_id,
                    )
        except Exception as e:
            logger.warning(
                "db_persist_failed",
                incident_id=incident.incident_id,
                error=str(e),
            )

    # =========================================================================
    # Phase 6.5: update the Incident once approval is complete
    # =========================================================================
    async def resolve_incident_after_approval(
        self,
        incident_id: str,
        approval_id: str | None = None,
    ) -> bool:
        """Mark an incident RESOLVED after its approval is complete.

        Called when an approval has reached its required sign-offs. Updates,
        each step independently and best effort:
            1. The incident stored in Redis (status, resolved_at, updated_at).
            2. The incident row in the DB, when it exists.
            3. The DecisionToken for the incident, set to COMPLETED.

        Args:
            incident_id: ID of the incident to resolve.
            approval_id: ID of the completed approval (used for logging only).

        Returns:
            True when the Redis copy was updated, or when Redis had no copy
            and the DB row was updated. False when Redis raised, or when
            nothing was updated in either store.
        """
        from sqlalchemy import select

        logger.info(
            "resolve_incident_starting",
            incident_id=incident_id,
            approval_id=approval_id,
        )
        key = f"{INCIDENT_KEY_PREFIX}{incident_id}"
        # One timestamp for both stores so Redis and DB agree on resolved_at.
        now = datetime.now(UTC)
        redis_ok = False  # Redis step finished without error (update or miss)
        redis_updated = False  # Redis copy was actually rewritten
        db_ok = False  # DB step finished without error (update or no row)
        db_updated = False  # DB row was actually updated and committed

        # 1. Redis.
        try:
            # Obtained inside the try so a client failure is logged and
            # reported through the return value instead of escaping.
            redis_client = get_redis()
            data = await redis_client.get(key)
            if data:
                incident = Incident.model_validate_json(data)
                old_status = incident.status.value
                incident.status = IncidentStatus.RESOLVED
                incident.resolved_at = now
                incident.updated_at = now
                # decision_chain holds the full AI reasoning trail and is left
                # untouched; the outcome is expressed by status = RESOLVED.
                await redis_client.set(
                    key,
                    incident.model_dump_json(),
                    ex=self._RESOLVED_INCIDENT_TTL_SECONDS,
                )
                redis_ok = True
                redis_updated = True
                logger.info(
                    "resolve_incident_redis_updated",
                    incident_id=incident_id,
                    old_status=old_status,
                    new_status="resolved",
                )
            else:
                # A Redis miss is not an error; the DB step decides the outcome.
                logger.warning(
                    "resolve_incident_redis_miss",
                    incident_id=incident_id,
                    note="Incident not found in Redis, will update DB only",
                )
                redis_ok = True
        except Exception as e:
            logger.exception(
                "resolve_incident_redis_error",
                incident_id=incident_id,
                error=str(e),
            )

        # 2. DB (when a row exists).
        try:
            async with get_db_context() as db:
                stmt = select(IncidentRecord).where(
                    IncidentRecord.incident_id == incident_id
                )
                result = await db.execute(stmt)
                record = result.scalar_one_or_none()
                if record:
                    record.status = "resolved"
                    record.resolved_at = now
                    record.updated_at = now
                    # Only count the DB as updated after the commit succeeds.
                    await db.commit()
                    db_ok = True
                    db_updated = True
                    logger.info(
                        "resolve_incident_db_updated",
                        incident_id=incident_id,
                        resolved_at=now.isoformat(),
                    )
                else:
                    # No row to update. This is acceptable when the incident
                    # lives only in Redis (e.g. an earlier DB write failed).
                    db_ok = True
                    logger.warning(
                        "resolve_incident_db_not_found",
                        incident_id=incident_id,
                        note=(
                            "Incident exists in Redis but not in DB, this is acceptable"
                            if redis_updated
                            else "Incident not found in DB"
                        ),
                    )
        except Exception as e:
            logger.exception(
                "resolve_incident_db_error",
                incident_id=incident_id,
                error=str(e),
            )

        # 3. DecisionToken -> COMPLETED.
        #    The token is stored separately (decision:{token}) and must be
        #    updated too, otherwise the next poll still shows the Y/n prompt.
        decision_ok = False
        try:
            from src.services.decision_manager import (
                DecisionState,
                get_decision_manager,
            )

            decision_manager = get_decision_manager()
            # NOTE(review): relies on a private DecisionManager method.
            existing_token = await decision_manager._find_existing_token(incident_id)
            if existing_token:
                await decision_manager.update_token_state(
                    existing_token.token,
                    DecisionState.COMPLETED,
                )
                decision_ok = True
                logger.info(
                    "resolve_decision_token_updated",
                    incident_id=incident_id,
                    token=existing_token.token,
                    new_state="completed",
                )
            else:
                decision_ok = True  # no token to update also counts as OK
                logger.warning(
                    "resolve_decision_token_not_found",
                    incident_id=incident_id,
                )
        except Exception as e:
            logger.exception(
                "resolve_decision_token_error",
                incident_id=incident_id,
                error=str(e),
            )

        # The API reads only Redis, so an updated Redis copy is sufficient.
        # On a Redis miss, success requires that the DB row was really updated;
        # otherwise nothing was resolved anywhere and reporting True is wrong.
        success = redis_updated or (redis_ok and db_updated)
        logger.info(
            "resolve_incident_completed",
            incident_id=incident_id,
            approval_id=approval_id,
            success=success,
            redis_ok=redis_ok,
            db_ok=db_ok,
            decision_ok=decision_ok,
            redis_updated=redis_updated,
            db_updated=db_updated,
        )
        return success
# =============================================================================
# Singleton
# =============================================================================
# Lazily created, module-wide ProposalService instance.
_proposal_service: ProposalService | None = None


def get_proposal_service() -> ProposalService:
    """Return the shared ProposalService, creating it on first use."""
    global _proposal_service
    existing = _proposal_service
    if existing is not None:
        return existing
    created = ProposalService()
    _proposal_service = created
    return created