Files
awoooi/apps/api/src/services/decision_fusion_adapter.py
Your Name 35fe37c82a
All checks were successful
Code Review / ai-code-review (push) Successful in 23s
CD Pipeline / tests (push) Successful in 5m51s
CD Pipeline / build-and-deploy (push) Successful in 3m29s
CD Pipeline / post-deploy-checks (push) Successful in 1m14s
fix(api): route direct ollama callers through ordered fallback
2026-05-19 12:56:13 +08:00

562 lines
21 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
GovernanceDispatcher 決策融合適配器
======================================
將 decision_fusion / playbook_service / Ollama 的既有能力
組合成「給治理事件用的三維融合介面」。
設計原則:
- 不修改任何 Tier 3 檔decision_manager / learning_service / trust_engine
- 只 consume 公開 APIread-only
- 三維融合LLM × Playbook trust × MCP 情報
- Exception 隔離:任一維度失敗 → 中立值 0.5,不阻塞主流程
融合公式起始權重TODO 移到 settings 由 AI 自學調整):
confidence = w_llm * llm_score + w_playbook * playbook_trust + w_mcp * mcp_score
w_llm=0.4, w_playbook=0.3, w_mcp=0.3
決策分支(閾值 TODO 移到 settings
confidence >= 0.85 → auto_dispatch
0.65 <= conf < 0.85 → pending_approval
conf < 0.65 → skip
2026-05-03 ogt + Claude Sonnet 4.6(亞太): GovernanceDispatcher Wave 2E 實作
"""
from __future__ import annotations
import asyncio
import re
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Literal
import httpx
import structlog
from src.core.config import get_settings
from src.services.ollama_endpoint_resolver import resolve_ollama_order
if TYPE_CHECKING:
from src.db.models import AiGovernanceEvent
logger = structlog.get_logger(__name__)
# =============================================================================
# Constants
# TODO: move to settings (ADR-P2E-FUTURE) so the AI can eventually self-tune them
# =============================================================================
# Three-dimensional fusion weights (0.4 / 0.3 / 0.3); they sum to 1.0
_W_LLM: float = 0.4 # TODO: to be self-tuned by the AI; initial value 0.4
_W_PLAYBOOK: float = 0.3 # TODO: to be self-tuned by the AI; initial value 0.3
_W_MCP: float = 0.3 # TODO: to be self-tuned by the AI; initial value 0.3
# Decision-branch thresholds
# TODO: move to settings; later adjust dynamically from the false-positive rate
_AUTO_DISPATCH_THRESHOLD: float = 0.85 # confidence >= this value -> auto_dispatch
_PENDING_APPROVAL_THRESHOLD: float = 0.65 # >= this value and < AUTO -> pending_approval
# confidence below _PENDING_APPROVAL_THRESHOLD -> skip
# Ollama inference timeout (seconds)
_LLM_TIMEOUT_SEC: float = 30.0
# Prometheus query timeout (seconds)
_PROM_TIMEOUT_SEC: float = 5.0
# =============================================================================
# FusedDecision 資料結構
# =============================================================================
@dataclass
class FusedDecision:
    """Outcome of fusing the LLM, Playbook and MCP dimensions for one event.

    Scores are expressed on a 0.0-1.0 scale. ``decision_path`` tells the
    GovernanceDispatcher which kind of dispatch to write.

    Attributes:
        confidence: Weighted fusion of the three dimension scores (0.0-1.0).
        recommended_action: Summary of the remediation suggested by the LLM
            (at most 200 characters).
        matched_playbook_id: ID of the best-matching playbook, or ``None``.
        playbook_trust: ``trust_score`` of the matched playbook, or ``None``.
        llm_reasoning: Summary of the raw LLM output, kept as a dict so it can
            be recorded in the ``decision_context`` JSONB.
        mcp_snapshot: Snapshot of the MCP intelligence, kept as a dict so it
            can be recorded in the ``decision_context`` JSONB.
        decision_path: One of ``auto_dispatch`` / ``pending_approval`` /
            ``skip``.
        llm_score: LLM dimension score (0.0-1.0).
        playbook_score: Playbook trust dimension score (0.0-1.0); 0.3 when no
            playbook matched.
        mcp_score: MCP signal-quality dimension score (0.0-1.0).
    """

    # Field order is part of the positional constructor contract.
    confidence: float
    recommended_action: str
    matched_playbook_id: str | None
    playbook_trust: float | None
    llm_reasoning: dict[str, Any]
    mcp_snapshot: dict[str, Any]
    decision_path: Literal["auto_dispatch", "pending_approval", "skip"]
    llm_score: float
    playbook_score: float
    mcp_score: float
# =============================================================================
# DecisionFusionAdapter
# =============================================================================
class DecisionFusionAdapter:
    """Decision-fusion adapter for governance events.

    Combines an Ollama LLM call, the playbook service and Prometheus metrics
    into a single three-dimensional confidence score. The class only consumes
    those collaborators; it does not modify them.

    Per the original design notes, the Tier 3 classes DecisionManager,
    TrustEngine and LearningService are intentionally not injected here
    because they are incident-centric.

    Collaborators called directly:
    - Ollama ``/api/generate`` -> LLM reasoning
    - ``playbook_service.get_recommendations`` -> playbook trust
    - Prometheus ``/api/v1/query`` -> MCP intelligence
    """

    def __init__(self) -> None:
        self._settings = get_settings()

    # =========================================================================
    # Public API
    # =========================================================================
    async def fuse_decision(self, event: AiGovernanceEvent) -> FusedDecision:
        """Fuse the LLM, Playbook and MCP scores into a ``FusedDecision``.

        The three dimensions are evaluated concurrently with
        ``asyncio.gather``. A dimension that fails is logged and replaced by a
        default: 0.5 for LLM and MCP, 0.3 for Playbook. The weighted sum is
        clamped to [0, 1] and mapped to a decision path via the module-level
        thresholds.

        Args:
            event: Governance event ORM object; it is only read here.

        Returns:
            A ``FusedDecision`` carrying the per-dimension scores and
            snapshots for the dispatcher to store in ``decision_context``.
        """
        llm_result, playbook_result, mcp_result = await asyncio.gather(
            self._score_llm(event),
            self._score_playbook(event),
            self._score_mcp(event),
            return_exceptions=True,
        )

        # gather(return_exceptions=True) may hand back BaseException instances
        # (e.g. CancelledError raised inside a child), so test against
        # BaseException; otherwise the tuple unpacking below raises TypeError.
        if isinstance(llm_result, BaseException):
            logger.warning(
                "fusion_llm_score_failed",
                event_id=event.id,
                event_type=event.event_type,
                error=str(llm_result),
            )
            llm_result = (0.5, "LLM 評估失敗,使用中立值)", {})
        if isinstance(playbook_result, BaseException):
            logger.warning(
                "fusion_playbook_score_failed",
                event_id=event.id,
                error=str(playbook_result),
            )
            playbook_result = (0.3, None, None)
        if isinstance(mcp_result, BaseException):
            logger.warning(
                "fusion_mcp_score_failed",
                event_id=event.id,
                error=str(mcp_result),
            )
            mcp_result = (0.5, {})

        llm_score, recommended_action, llm_reasoning = llm_result
        playbook_score, matched_playbook_id, playbook_trust = playbook_result
        mcp_score, mcp_snapshot = mcp_result

        # Weighted three-dimensional fusion.
        # TODO: move the weights to settings so they can be tuned.
        confidence = (
            _W_LLM * llm_score
            + _W_PLAYBOOK * playbook_score
            + _W_MCP * mcp_score
        )
        confidence = max(0.0, min(1.0, confidence))

        # Decision branch.
        # TODO: move the thresholds to settings.
        decision_path: Literal["auto_dispatch", "pending_approval", "skip"]
        if confidence >= _AUTO_DISPATCH_THRESHOLD:
            decision_path = "auto_dispatch"
        elif confidence >= _PENDING_APPROVAL_THRESHOLD:
            decision_path = "pending_approval"
        else:
            decision_path = "skip"

        logger.info(
            "governance_fusion_complete",
            event_id=event.id,
            event_type=event.event_type,
            llm_score=round(llm_score, 4),
            playbook_score=round(playbook_score, 4),
            mcp_score=round(mcp_score, 4),
            confidence=round(confidence, 4),
            decision_path=decision_path,
        )
        return FusedDecision(
            confidence=confidence,
            recommended_action=recommended_action,
            matched_playbook_id=matched_playbook_id,
            playbook_trust=playbook_trust,
            llm_reasoning=llm_reasoning,
            mcp_snapshot=mcp_snapshot,
            decision_path=decision_path,
            llm_score=llm_score,
            playbook_score=playbook_score,
            mcp_score=mcp_score,
        )

    # =========================================================================
    # Dimension 1: LLM reasoning (Ollama qwen3:8b)
    # =========================================================================
    async def _score_llm(
        self, event: AiGovernanceEvent
    ) -> tuple[float, str, dict[str, Any]]:
        """Ask the LLM for a confidence value and a recommended action.

        The prompt carries the event type and a truncated summary of
        ``event.details``. Endpoints from ``resolve_ollama_order("deep_rca")``
        are tried in order until one returns a non-empty response.

        Returns:
            ``(llm_score, recommended_action, llm_reasoning_dict)``. When no
            endpoint yields text, the score is the neutral 0.5 and the dict
            holds the last error.
        """
        event_type = str(event.event_type or "unknown")
        details_summary = self._summarize_details(event.details or {})
        prompt = (
            "你是 AIOps 治理分析員。根據以下治理事件,評估自動修復的可行性與建議動作。\n\n"
            f"【事件類型】{event_type}\n"
            f"【事件摘要】{details_summary}\n\n"
            "請以以下格式回應(不超過 200 字):\n"
            "CONFIDENCE: [0.0-1.0 的數字]\n"
            "ACTION: [具體建議修復動作≤100字]\n\n"
            "注意:\n"
            "- CONFIDENCE 越高表示越適合自動執行\n"
            "- 若事件模糊或影響範圍不明給低分0.3-0.5\n"
            "- 若有明確、低風險的修復路徑可給高分0.7-0.9\n"
            "只輸出 CONFIDENCE 和 ACTION 兩行,不要其他解釋。"
        )
        raw_text = ""
        last_error = ""
        async with httpx.AsyncClient(
            timeout=httpx.Timeout(_LLM_TIMEOUT_SEC, connect=5.0)
        ) as client:
            for endpoint in resolve_ollama_order("deep_rca"):
                if not endpoint.url:
                    continue
                # Avoid "//api/generate" when the configured URL ends with "/".
                base_url = str(endpoint.url).rstrip("/")
                try:
                    resp = await client.post(
                        f"{base_url}/api/generate",
                        json={
                            "model": "qwen3:8b",
                            "prompt": prompt,
                            "stream": False,
                            "options": {"num_predict": 128, "temperature": 0.1},
                        },
                    )
                    if resp.status_code != 200:
                        last_error = f"http_{resp.status_code}"
                        logger.warning(
                            "fusion_llm_http_error",
                            provider=endpoint.provider_name,
                            status=resp.status_code,
                            event_id=event.id,
                        )
                        continue
                    # "response" may be missing or null; treat both as empty.
                    raw_text = (resp.json().get("response") or "").strip()
                    if raw_text:
                        break
                    # An empty answer is a failure of this endpoint: keep
                    # walking the fallback order instead of giving up.
                    last_error = "empty_response"
                    logger.warning(
                        "fusion_llm_empty_response",
                        provider=endpoint.provider_name,
                        event_id=event.id,
                    )
                except Exception as exc:
                    # Best-effort fallback: any failure moves to the next endpoint.
                    last_error = str(exc)
                    logger.warning(
                        "fusion_llm_request_failed",
                        provider=endpoint.provider_name,
                        event_id=event.id,
                        error=str(exc),
                    )
        if not raw_text:
            return 0.5, "LLM 連線失敗,使用中立值)", {"error": last_error or "no_ollama_endpoint"}

        llm_score, recommended_action = self._parse_llm_output(raw_text)
        llm_reasoning = {
            "raw_text_preview": raw_text[:300],
            "parsed_confidence": llm_score,
            "parsed_action": recommended_action,
            "event_type": event_type,
        }
        logger.debug(
            "fusion_llm_scored",
            event_id=event.id,
            llm_score=llm_score,
            action_preview=recommended_action[:60],
        )
        return llm_score, recommended_action, llm_reasoning

    @staticmethod
    def _parse_llm_output(raw_text: str) -> tuple[float, str]:
        """Extract ``(confidence, action)`` from the raw LLM text.

        ``<think>`` blocks are removed first, including a block that was
        truncated before its closing tag, so a ``CONFIDENCE:`` mentioned in
        the reasoning is never parsed. The confidence defaults to 0.5 and the
        action to a placeholder when the corresponding line is absent.
        """
        clean = re.sub(
            r"<think>.*?(?:</think>|\Z)", "", raw_text, flags=re.DOTALL
        ).strip()

        llm_score = 0.5
        # Optional "[" tolerates a model echoing the bracketed template; the
        # lookahead stops "10" from being read as "1".
        conf_match = re.search(
            r"CONFIDENCE:\s*\[?\s*([01]?\.\d+|[01])(?!\d)", clean, re.IGNORECASE
        )
        if conf_match:
            llm_score = max(0.0, min(1.0, float(conf_match.group(1))))

        recommended_action = "LLM 未提供明確建議)"
        action_match = re.search(r"ACTION:\s*(.+)", clean, re.IGNORECASE)
        if action_match:
            recommended_action = action_match.group(1).strip()[:200]
        return llm_score, recommended_action

    # =========================================================================
    # Dimension 2: playbook matching + trust_score
    # =========================================================================
    async def _score_playbook(
        self, event: AiGovernanceEvent
    ) -> tuple[float, str | None, float | None]:
        """Look up the best-matching playbook and use its trust score.

        Governance events carry no ``SymptomPattern`` of their own, so one is
        built with ``event_type`` as the alert name. When the lookup fails or
        finds nothing, the conservative default ``(0.3, None, None)`` is
        returned.

        Returns:
            ``(playbook_score, matched_playbook_id, playbook_trust)`` where
            ``playbook_score`` is the trust clamped to [0, 1] and
            ``playbook_trust`` is the unclamped value.
        """
        from src.models.playbook import SymptomPattern
        from src.services.playbook_service import get_playbook_service

        symptoms = SymptomPattern(
            alert_names=[event.event_type or "unknown"],
            affected_services=[],
            severity_range=["P2"],
            keywords=self._extract_keywords(event.details or {}),
        )
        try:
            svc = get_playbook_service()
            recommendations = await svc.get_recommendations(
                symptoms=symptoms,
                top_k=1,
                use_rag=False,  # original note: Jaccard matching is enough for governance events
            )
        except Exception as exc:
            logger.warning("fusion_playbook_lookup_failed", event_id=event.id, error=str(exc))
            return 0.3, None, None
        if not recommendations:
            logger.debug("fusion_playbook_no_match", event_id=event.id, event_type=event.event_type)
            return 0.3, None, None

        best = recommendations[0]
        trust = float(best.playbook.trust_score)
        playbook_id = best.playbook.playbook_id
        logger.debug(
            "fusion_playbook_matched",
            event_id=event.id,
            playbook_id=playbook_id,
            trust_score=trust,
            similarity=round(best.similarity_score, 4),
        )
        # Clamp so an out-of-range trust cannot dominate the weighted sum.
        return max(0.0, min(1.0, trust)), playbook_id, trust

    # =========================================================================
    # Dimension 3: MCP intelligence (Prometheus)
    # =========================================================================
    async def _score_mcp(
        self, event: AiGovernanceEvent
    ) -> tuple[float, dict[str, Any]]:
        """Query Prometheus and derive an MCP signal-quality score.

        Each query counts 1.0 when it yields a finite value, 0.5 when it
        yields no usable data, and 0 when it errors. The resulting ratio is
        mapped onto [0.2, 0.9]. If the HTTP client itself fails, the neutral
        ``(0.5, {"error": ...})`` is returned.

        Returns:
            ``(mcp_score, mcp_snapshot_dict)``.
        """
        prom_url = getattr(
            self._settings, "PROMETHEUS_URL", "http://prometheus.observability.svc:9090"
        )
        queries: dict[str, str] = self._get_mcp_queries(event.event_type or "unknown")
        total_count = len(queries)
        if total_count == 0:
            return 0.5, {"reason": "no_queries_for_event_type"}

        # Avoid "//api/v1/query" when the configured URL ends with "/".
        query_endpoint = f"{str(prom_url).rstrip('/')}/api/v1/query"
        snapshot: dict[str, Any] = {}
        success_count = 0
        no_data_count = 0  # Prometheus answered but the metric has no usable sample
        try:
            async with httpx.AsyncClient(timeout=_PROM_TIMEOUT_SEC) as client:
                for metric_name, query in queries.items():
                    try:
                        resp = await client.get(query_endpoint, params={"query": query})
                        value = self._extract_prom_value(resp.json())
                    except Exception as exc:
                        # Per-metric isolation: one bad query must not sink the rest.
                        snapshot[metric_name] = f"error:{exc!s:.60}"
                        continue
                    if value is None:
                        # 2026-05-04 ogt: a metric that does not exist yet is
                        # not an MCP failure; it contributes a neutral 0.5.
                        snapshot[metric_name] = "no_data"
                        no_data_count += 1
                    else:
                        snapshot[metric_name] = round(value, 4)
                        success_count += 1
        except Exception as exc:
            logger.warning("fusion_mcp_prometheus_failed", event_id=event.id, error=str(exc))
            return 0.5, {"error": str(exc)}

        # 2026-05-04 ogt: success = 1.0, no_data = 0.5, error = 0; map to [0.2, 0.9].
        ratio = (success_count + 0.5 * no_data_count) / total_count
        mcp_score = 0.2 + 0.7 * ratio
        snapshot["_meta"] = {
            "success_count": success_count,
            "no_data_count": no_data_count,
            "total_queries": total_count,
            "quality_score": round(mcp_score, 4),
        }
        logger.debug(
            "fusion_mcp_scored",
            event_id=event.id,
            mcp_score=round(mcp_score, 4),
            success=success_count,
            total=total_count,
        )
        return mcp_score, snapshot

    @staticmethod
    def _extract_prom_value(payload: dict[str, Any]) -> float | None:
        """Return the first sample value of a Prometheus query response.

        Returns ``None`` when the result list is empty or the sample is not a
        finite number ("NaN", "+Inf", ...), so such values never reach the
        snapshot.

        Raises:
            ValueError: if the response status is not ``"success"``.
            KeyError, IndexError, TypeError: if the sample is malformed.
        """
        import math  # local import keeps this helper self-contained

        status = payload.get("status")
        if status != "success":
            raise ValueError(f"prometheus_status={status}")
        result_list = payload.get("data", {}).get("result", [])
        if not result_list:
            return None
        value = float(result_list[0]["value"][1])
        return value if math.isfinite(value) else None

    # =========================================================================
    # Helpers
    # =========================================================================
    @staticmethod
    def _summarize_details(details: dict[str, Any]) -> str:
        """Build a readable summary of ``details`` (at most 300 characters)."""
        if not details:
            return "(無詳細資訊)"
        parts: list[str] = []
        # Well-known fields are shown first.
        for key in ("status", "impact", "remediation", "reason"):
            val = details.get(key)
            if val is None:
                continue
            if isinstance(val, dict):
                inner = "; ".join(f"{k}={v}" for k, v in list(val.items())[:4])
                parts.append(f"{key}: {inner}")
            elif isinstance(val, (str, int, float)):
                parts.append(f"{key}: {val!s:.80}")
        if not parts:
            # Fallback: the first few top-level key=value pairs.
            parts = [f"{k}={v!s:.40}" for k, v in list(details.items())[:5]]
        return "; ".join(parts)[:300]

    @staticmethod
    def _extract_keywords(details: dict[str, Any]) -> list[str]:
        """Collect up to five keywords from ``details`` for playbook search."""
        keywords: list[str] = []
        for key in ("remediation", "actionable", "impact"):
            val = details.get(key)
            if not isinstance(val, dict):
                continue
            for sub_key in ("next_action", "items"):
                sub = val.get(sub_key)
                if isinstance(sub, str):
                    keywords.append(sub[:50])
                elif isinstance(sub, list):
                    keywords.extend(str(x)[:40] for x in sub[:2])
        return keywords[:5]

    @staticmethod
    def _get_mcp_queries(event_type: str) -> dict[str, str]:
        """Return the Prometheus queries relevant to ``event_type``.

        This only decides which metrics to look at; it does not map event
        types to actions. Two base metrics are always queried.
        """
        queries: dict[str, str] = {
            "autonomy_rate": "sli:autonomy_rate:5m",
            "decision_accuracy": "sli:decision_accuracy:5m",
        }
        km_growth = ("km_growth_rate", "sli:km_growth_rate:24h")
        calibration = ("confidence_calibration", "sli:confidence_calibration:1h")
        # Extra metrics per event type; unknown types get only the base set.
        extras_by_type: dict[str, tuple[tuple[str, str], ...]] = {
            "trust_drift": (km_growth,),
            "execution_blast_radius": (km_growth,),
            "knowledge_degradation": (km_growth, calibration),
            "kb_stale": (km_growth, calibration),
            "llm_hallucination": (calibration,),
            "governance_slo_data_gap": (calibration, km_growth),
        }
        queries.update(extras_by_type.get(event_type, ()))
        return queries
# =============================================================================
# Singleton
# =============================================================================
_adapter_instance: DecisionFusionAdapter | None = None


def get_decision_fusion_adapter() -> DecisionFusionAdapter:
    """Return the shared DecisionFusionAdapter, creating it on first use."""
    global _adapter_instance
    adapter = _adapter_instance
    if adapter is None:
        # Lazy init: build once and cache in the module-level slot.
        adapter = _adapter_instance = DecisionFusionAdapter()
    return adapter
def reset_decision_fusion_adapter() -> None:
    """Drop the cached adapter so the next lookup builds a new one (for tests)."""
    global _adapter_instance
    _adapter_instance = None