562 lines
21 KiB
Python
562 lines
21 KiB
Python
"""
|
||
GovernanceDispatcher 決策融合適配器
|
||
======================================
|
||
將 decision_fusion / playbook_service / Ollama 的既有能力
|
||
組合成「給治理事件用的三維融合介面」。
|
||
|
||
設計原則:
|
||
- 不修改任何 Tier 3 檔(decision_manager / learning_service / trust_engine)
|
||
- 只 consume 公開 API(read-only)
|
||
- 三維融合:LLM × Playbook trust × MCP 情報
|
||
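- Exception isolation: a failing dimension never blocks the main flow (LLM / MCP fall back to the neutral 0.5, Playbook falls back to the conservative 0.3)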
|
||
|
||
融合公式(起始權重,TODO 移到 settings 由 AI 自學調整):
|
||
confidence = w_llm * llm_score + w_playbook * playbook_trust + w_mcp * mcp_score
|
||
w_llm=0.4, w_playbook=0.3, w_mcp=0.3
|
||
|
||
決策分支(閾值 TODO 移到 settings):
|
||
confidence >= 0.85 → auto_dispatch
|
||
0.65 <= conf < 0.85 → pending_approval
|
||
conf < 0.65 → skip
|
||
|
||
2026-05-03 ogt + Claude Sonnet 4.6(亞太): GovernanceDispatcher Wave 2E 實作
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
import re
|
||
from dataclasses import dataclass
|
||
from typing import TYPE_CHECKING, Any, Literal
|
||
|
||
import httpx
|
||
import structlog
|
||
|
||
from src.core.config import get_settings
|
||
from src.services.ollama_endpoint_resolver import resolve_ollama_order
|
||
|
||
if TYPE_CHECKING:
|
||
from src.db.models import AiGovernanceEvent
|
||
|
||
logger = structlog.get_logger(__name__)
|
||
|
||
# =============================================================================
# Constants
# TODO: move to settings (ADR-P2E-FUTURE) so they can later be self-tuned by AI
# =============================================================================

# Three-way fusion weights (0.4 / 0.3 / 0.3); they are expected to sum to 1.0.
_W_LLM: float = 0.4  # TODO: to be self-tuned by AI, initial value 0.4
_W_PLAYBOOK: float = 0.3  # TODO: to be self-tuned by AI, initial value 0.3
_W_MCP: float = 0.3  # TODO: to be self-tuned by AI, initial value 0.3

# Decision-branch thresholds.
# TODO: move to settings; later adjusted dynamically from the false-positive rate
_AUTO_DISPATCH_THRESHOLD: float = 0.85  # confidence >= this -> auto_dispatch
# confidence >= this and < _AUTO_DISPATCH_THRESHOLD -> pending_approval;
# anything below this -> skip
_PENDING_APPROVAL_THRESHOLD: float = 0.65

# Ollama inference timeout (seconds)
_LLM_TIMEOUT_SEC: float = 30.0

# Prometheus query timeout (seconds)
_PROM_TIMEOUT_SEC: float = 5.0
|
||
|
||
|
||
# =============================================================================
|
||
# FusedDecision 資料結構
|
||
# =============================================================================
|
||
|
||
@dataclass
|
||
class FusedDecision:
|
||
"""三維融合決策輸出。
|
||
|
||
所有分數均為 0.0-1.0(0.5 為中立值,任一維度失敗時使用)。
|
||
decision_path 決定 GovernanceDispatcher 寫入哪種 dispatch。
|
||
|
||
Attributes:
|
||
confidence: 三維加權融合分數(0.0-1.0)
|
||
recommended_action: LLM 推薦的修復動作摘要(≤200 字)
|
||
matched_playbook_id: 最高相似度的 Playbook ID(可 None)
|
||
playbook_trust: matched_playbook 的 trust_score(可 None)
|
||
llm_reasoning: LLM 原始輸出摘要(dict,供 decision_context JSONB 記錄)
|
||
mcp_snapshot: MCP 情報快照(dict,供 decision_context JSONB 記錄)
|
||
decision_path: auto_dispatch / pending_approval / skip
|
||
llm_score: LLM 分數(0.0-1.0)
|
||
playbook_score: Playbook 信任分數(0.0-1.0,無 playbook 時 0.3)
|
||
mcp_score: MCP 感官品質分數(0.0-1.0)
|
||
"""
|
||
confidence: float
|
||
recommended_action: str
|
||
matched_playbook_id: str | None
|
||
playbook_trust: float | None
|
||
llm_reasoning: dict[str, Any]
|
||
mcp_snapshot: dict[str, Any]
|
||
decision_path: Literal["auto_dispatch", "pending_approval", "skip"]
|
||
llm_score: float
|
||
playbook_score: float
|
||
mcp_score: float
|
||
|
||
|
||
# =============================================================================
|
||
# DecisionFusionAdapter
|
||
# =============================================================================
|
||
|
||
class DecisionFusionAdapter:
    """Decision-fusion adapter for governance events.

    Combines three independently evaluated dimensions into one
    ``FusedDecision``:

    - LLM reasoning through an Ollama ``/api/generate`` endpoint,
    - the trust score of the best playbook returned by
      ``playbook_service.get_recommendations``,
    - a data-quality score derived from Prometheus instant queries (called
      the "MCP" dimension throughout this module).

    The adapter only consumes those services. It intentionally does not
    inject the Tier 3 classes:

    - DecisionManager: incident-centric state machine, not suited to
      governance events
    - TrustEngine: only manages incident trust scores
    - LearningService: only manages the KM write path
    """

    # Strips qwen3 chain-of-thought. The \Z alternative also removes a block
    # that was never closed because generation was truncated, so a
    # "CONFIDENCE:" mentioned inside unfinished reasoning is not parsed.
    _THINK_BLOCK_RE = re.compile(r"<think>.*?(?:</think>|\Z)", re.DOTALL)
    # Both the ASCII and the full-width colon are accepted.
    _CONFIDENCE_RE = re.compile(r"CONFIDENCE\s*[::]\s*([01]?\.\d+|[01])", re.IGNORECASE)
    _ACTION_RE = re.compile(r"ACTION\s*[::]\s*(.+)", re.IGNORECASE)

    # Snapshot metric name -> Prometheus query expression.
    _SLI_QUERIES = {
        "autonomy_rate": "sli:autonomy_rate:5m",
        "decision_accuracy": "sli:decision_accuracy:5m",
        "km_growth_rate": "sli:km_growth_rate:24h",
        "confidence_calibration": "sli:confidence_calibration:1h",
    }
    # Metrics queried for every event type.
    _BASE_METRICS = ("autonomy_rate", "decision_accuracy")
    # Additional metrics per event type; tuple order is the query order.
    _EXTRA_METRICS = {
        "trust_drift": ("km_growth_rate",),
        "execution_blast_radius": ("km_growth_rate",),
        "knowledge_degradation": ("km_growth_rate", "confidence_calibration"),
        "kb_stale": ("km_growth_rate", "confidence_calibration"),
        "llm_hallucination": ("confidence_calibration",),
        "governance_slo_data_gap": ("confidence_calibration", "km_growth_rate"),
    }

    def __init__(self) -> None:
        """Load the application settings once for later lookups."""
        self._settings = get_settings()

    # =========================================================================
    # Public API
    # =========================================================================

    async def fuse_decision(self, event: AiGovernanceEvent) -> FusedDecision:
        """Fuse the LLM, Playbook and MCP scores into a ``FusedDecision``.

        The three dimensions are evaluated concurrently with
        ``asyncio.gather``. A dimension that raises is logged and replaced by
        its fallback: 0.5 for LLM and MCP, 0.3 for Playbook. The weighted sum
        is clamped to 0.0-1.0 and mapped to ``decision_path`` through the
        module-level thresholds.

        Args:
            event: AiGovernanceEvent ORM object; it is only read here.

        Returns:
            FusedDecision carrying the per-dimension scores and snapshots so
            the dispatcher can persist them in ``decision_context``.
        """
        llm_result, playbook_result, mcp_result = await asyncio.gather(
            self._score_llm(event),
            self._score_playbook(event),
            self._score_mcp(event),
            return_exceptions=True,
        )

        # gather(return_exceptions=True) can also hand back BaseException
        # subclasses (e.g. CancelledError of a cancelled child), which would
        # otherwise fail at tuple unpacking below.
        if isinstance(llm_result, BaseException):
            logger.warning(
                "fusion_llm_score_failed",
                event_id=event.id,
                event_type=event.event_type,
                error=str(llm_result),
            )
            llm_result = (0.5, "(LLM 評估失敗,使用中立值)", {})

        if isinstance(playbook_result, BaseException):
            logger.warning(
                "fusion_playbook_score_failed",
                event_id=event.id,
                error=str(playbook_result),
            )
            playbook_result = (0.3, None, None)

        if isinstance(mcp_result, BaseException):
            logger.warning(
                "fusion_mcp_score_failed",
                event_id=event.id,
                error=str(mcp_result),
            )
            mcp_result = (0.5, {})

        llm_score, recommended_action, llm_reasoning = llm_result
        playbook_score, matched_playbook_id, playbook_trust = playbook_result
        mcp_score, mcp_snapshot = mcp_result

        # Weighted three-way fusion.
        # TODO: move the weights to settings so they can be self-tuned later
        confidence = (
            _W_LLM * llm_score
            + _W_PLAYBOOK * playbook_score
            + _W_MCP * mcp_score
        )
        confidence = max(0.0, min(1.0, confidence))

        # Decision branch.
        # TODO: move the thresholds to settings (false-positive-rate driven)
        decision_path: Literal["auto_dispatch", "pending_approval", "skip"]
        if confidence >= _AUTO_DISPATCH_THRESHOLD:
            decision_path = "auto_dispatch"
        elif confidence >= _PENDING_APPROVAL_THRESHOLD:
            decision_path = "pending_approval"
        else:
            decision_path = "skip"

        logger.info(
            "governance_fusion_complete",
            event_id=event.id,
            event_type=event.event_type,
            llm_score=round(llm_score, 4),
            playbook_score=round(playbook_score, 4),
            mcp_score=round(mcp_score, 4),
            confidence=round(confidence, 4),
            decision_path=decision_path,
        )

        return FusedDecision(
            confidence=confidence,
            recommended_action=recommended_action,
            matched_playbook_id=matched_playbook_id,
            playbook_trust=playbook_trust,
            llm_reasoning=llm_reasoning,
            mcp_snapshot=mcp_snapshot,
            decision_path=decision_path,
            llm_score=llm_score,
            playbook_score=playbook_score,
            mcp_score=mcp_score,
        )

    # =========================================================================
    # Dimension 1: LLM reasoning (Ollama qwen3:8b)
    # =========================================================================

    async def _score_llm(
        self, event: AiGovernanceEvent
    ) -> tuple[float, str, dict[str, Any]]:
        """Ask the LLM how suitable the event is for automatic remediation.

        The prompt contains the event type and a truncated summary of
        ``event.details`` and asks for exactly one ``CONFIDENCE`` line and
        one ``ACTION`` line. Endpoints from ``resolve_ollama_order`` are
        tried in order until one returns a non-empty response.

        NOTE(review): the summary is only truncated by
        ``_summarize_details``; no other sanitisation happens before it is
        embedded in the prompt.

        Returns:
            ``(llm_score, recommended_action, llm_reasoning)``. When no
            endpoint produced text, the score is the neutral 0.5 and
            ``llm_reasoning`` only holds an ``error`` entry.
        """
        event_type = str(event.event_type or "unknown")
        details_summary = self._summarize_details(event.details or {})

        prompt = (
            "你是 AIOps 治理分析員。根據以下治理事件,評估自動修復的可行性與建議動作。\n\n"
            f"【事件類型】{event_type}\n"
            f"【事件摘要】{details_summary}\n\n"
            "請以以下格式回應(不超過 200 字):\n"
            "CONFIDENCE: [0.0-1.0 的數字]\n"
            "ACTION: [具體建議修復動作,≤100字]\n\n"
            "注意:\n"
            "- CONFIDENCE 越高表示越適合自動執行\n"
            "- 若事件模糊或影響範圍不明,給低分(0.3-0.5)\n"
            "- 若有明確、低風險的修復路徑,可給高分(0.7-0.9)\n"
            "只輸出 CONFIDENCE 和 ACTION 兩行,不要其他解釋。"
        )

        raw_text = ""
        last_error = ""
        async with httpx.AsyncClient(
            timeout=httpx.Timeout(_LLM_TIMEOUT_SEC, connect=5.0)
        ) as client:
            for endpoint in resolve_ollama_order("deep_rca"):
                if not endpoint.url:
                    continue
                try:
                    resp = await client.post(
                        f"{endpoint.url}/api/generate",
                        json={
                            "model": "qwen3:8b",
                            "prompt": prompt,
                            "stream": False,
                            "options": {"num_predict": 128, "temperature": 0.1},
                        },
                    )
                    if resp.status_code != 200:
                        last_error = f"http_{resp.status_code}"
                        logger.warning(
                            "fusion_llm_http_error",
                            provider=endpoint.provider_name,
                            status=resp.status_code,
                            event_id=event.id,
                        )
                        continue
                    raw_text = (resp.json().get("response") or "").strip()
                except Exception as exc:
                    # Best effort: any failure on one endpoint moves on to
                    # the next one.
                    last_error = str(exc)
                    logger.warning(
                        "fusion_llm_request_failed",
                        provider=endpoint.provider_name,
                        event_id=event.id,
                        error=str(exc),
                    )
                    continue

                if raw_text:
                    break
                # HTTP 200 with an empty body is treated as a failed attempt
                # so the remaining endpoints still get a chance.
                last_error = "empty_response"
                logger.warning(
                    "fusion_llm_empty_response",
                    provider=endpoint.provider_name,
                    event_id=event.id,
                )

        if not raw_text:
            return 0.5, "(LLM 連線失敗,使用中立值)", {"error": last_error or "no_ollama_endpoint"}

        llm_score, recommended_action = self._parse_llm_output(raw_text)

        llm_reasoning = {
            "raw_text_preview": raw_text[:300],
            "parsed_confidence": llm_score,
            "parsed_action": recommended_action,
            "event_type": event_type,
        }

        logger.debug(
            "fusion_llm_scored",
            event_id=event.id,
            llm_score=llm_score,
            action_preview=recommended_action[:60],
        )
        return llm_score, recommended_action, llm_reasoning

    @classmethod
    def _parse_llm_output(cls, raw_text: str) -> tuple[float, str]:
        """Extract ``(confidence, action)`` from raw LLM text.

        ``<think>`` blocks (closed or unterminated) are removed first. A
        missing ``CONFIDENCE`` line yields the neutral 0.5; a missing or
        blank ``ACTION`` line yields the placeholder text. The confidence is
        clamped to 0.0-1.0 and the action is cut to 200 characters.
        """
        clean = cls._THINK_BLOCK_RE.sub("", raw_text).strip()

        llm_score = 0.5
        conf_match = cls._CONFIDENCE_RE.search(clean)
        if conf_match:
            # The pattern only admits text that float() can parse.
            llm_score = max(0.0, min(1.0, float(conf_match.group(1))))

        recommended_action = "(LLM 未提供明確建議)"
        action_match = cls._ACTION_RE.search(clean)
        if action_match:
            recommended_action = action_match.group(1).strip()[:200] or recommended_action

        return llm_score, recommended_action

    # =========================================================================
    # Dimension 2: Playbook matching + trust_score
    # =========================================================================

    async def _score_playbook(
        self, event: AiGovernanceEvent
    ) -> tuple[float, str | None, float | None]:
        """Look up the best-matching playbook and use its trust score.

        A ``SymptomPattern`` is built from the event, with ``event_type``
        used as the alert name, and the top recommendation is requested with
        ``use_rag=False``. A lookup failure or an empty result yields the
        conservative ``(0.3, None, None)``.

        Returns:
            ``(playbook_score, matched_playbook_id, playbook_trust)`` where
            ``playbook_score`` is the trust score clamped to 0.0-1.0 and
            ``playbook_trust`` is the value as reported by the playbook.
        """
        # Imported lazily, as in the original code.
        # NOTE(review): presumably to avoid an import cycle - confirm.
        from src.models.playbook import SymptomPattern
        from src.services.playbook_service import get_playbook_service

        symptoms = SymptomPattern(
            alert_names=[event.event_type or "unknown"],
            affected_services=[],
            severity_range=["P2"],
            keywords=self._extract_keywords(event.details or {}),
        )

        try:
            svc = get_playbook_service()
            recommendations = await svc.get_recommendations(
                symptoms=symptoms,
                top_k=1,
                use_rag=False,  # Jaccard exact matching is enough for governance events
            )
        except Exception as exc:
            logger.warning("fusion_playbook_lookup_failed", event_id=event.id, error=str(exc))
            return 0.3, None, None

        if not recommendations:
            logger.debug("fusion_playbook_no_match", event_id=event.id, event_type=event.event_type)
            return 0.3, None, None

        best = recommendations[0]
        trust = float(best.playbook.trust_score)
        playbook_id = best.playbook.playbook_id
        # Keep the fusion input inside the documented 0.0-1.0 range even if
        # the stored trust score is not.
        playbook_score = max(0.0, min(1.0, trust))

        logger.debug(
            "fusion_playbook_matched",
            event_id=event.id,
            playbook_id=playbook_id,
            trust_score=trust,
            similarity=round(best.similarity_score, 4),
        )
        return playbook_score, playbook_id, trust

    # =========================================================================
    # Dimension 3: MCP intelligence (Prometheus)
    # =========================================================================

    async def _score_mcp(
        self, event: AiGovernanceEvent
    ) -> tuple[float, dict[str, Any]]:
        """Query Prometheus and derive the MCP data-quality score.

        Each metric selected by ``_get_mcp_queries`` is fetched with an
        instant query. A finite sample counts as a success, an empty or
        non-finite result counts as ``no_data`` (half weight), and anything
        else is recorded as an error string and contributes nothing.

        Returns:
            ``(mcp_score, mcp_snapshot)``. ``(0.5, {...})`` is returned when
            there is nothing to query or the HTTP client cannot be used.
        """
        prom_url = getattr(
            self._settings, "PROMETHEUS_URL", "http://prometheus.observability.svc:9090"
        )

        # Select the metrics to inspect for this event type.
        queries: dict[str, str] = self._get_mcp_queries(event.event_type or "unknown")
        total_count = len(queries)
        if total_count == 0:
            return 0.5, {"reason": "no_queries_for_event_type"}

        snapshot: dict[str, Any] = {}
        success_count = 0
        no_data_count = 0  # Prometheus answered but the series has no usable sample

        try:
            async with httpx.AsyncClient(timeout=_PROM_TIMEOUT_SEC) as client:
                for metric_name, query in queries.items():
                    try:
                        resp = await client.get(
                            f"{prom_url}/api/v1/query",
                            params={"query": query},
                        )
                        payload = resp.json()
                        status = payload.get("status")
                        if status != "success":
                            # Record the failure instead of dropping it.
                            snapshot[metric_name] = f"error:status={status!s:.40}"
                            continue
                        value = self._first_finite_sample(payload)
                    except Exception as exc:
                        snapshot[metric_name] = f"error:{exc!s:.60}"
                        continue

                    if value is None:
                        # 2026-05-04 ogt: a metric that does not exist yet is
                        # not an MCP failure; SLI recording rules may have no
                        # data early on, so it gets the neutral half weight.
                        snapshot[metric_name] = "no_data"
                        no_data_count += 1
                    else:
                        snapshot[metric_name] = round(value, 4)
                        success_count += 1
        except Exception as exc:
            logger.warning("fusion_mcp_prometheus_failed", event_id=event.id, error=str(exc))
            return 0.5, {"error": str(exc)}

        mcp_score = self._mcp_quality_score(success_count, no_data_count, total_count)

        snapshot["_meta"] = {
            "success_count": success_count,
            "no_data_count": no_data_count,
            "total_queries": total_count,
            "quality_score": round(mcp_score, 4),
        }

        logger.debug(
            "fusion_mcp_scored",
            event_id=event.id,
            mcp_score=round(mcp_score, 4),
            success=success_count,
            total=total_count,
        )
        return mcp_score, snapshot

    # =========================================================================
    # Helpers
    # =========================================================================

    @staticmethod
    def _first_finite_sample(payload: dict[str, Any]) -> float | None:
        """Return the first sample of a Prometheus query payload as a float.

        ``None`` is returned when the result list is empty or the sample is
        NaN / infinite. Non-finite values are rejected because they cannot be
        represented in strict JSON, and the snapshot is meant to be stored as
        JSONB. A malformed payload raises; the caller records that as an
        error.
        """
        import math

        result_list = payload.get("data", {}).get("result", [])
        if not result_list:
            return None
        value = float(result_list[0]["value"][1])
        return value if math.isfinite(value) else None

    @staticmethod
    def _mcp_quality_score(success_count: int, no_data_count: int, total_count: int) -> float:
        """Map query outcomes onto the 0.2-0.9 quality range.

        2026-05-04 ogt: success contributes 1.0, no_data contributes 0.5 and
        errors contribute 0. A non-positive ``total_count`` yields the
        neutral 0.5.
        """
        if total_count <= 0:
            return 0.5
        ratio = (success_count + 0.5 * no_data_count) / total_count
        return 0.2 + 0.7 * ratio

    @staticmethod
    def _summarize_details(details: dict[str, Any]) -> str:
        """Build a readable summary of ``details`` (at most 300 characters)."""
        if not details:
            return "(無詳細資訊)"

        parts: list[str] = []

        # Well-known fields are shown first.
        for key in ("status", "impact", "remediation", "reason"):
            val = details.get(key)
            if val is None:
                continue
            if isinstance(val, dict):
                inner = "; ".join(f"{k}={v}" for k, v in list(val.items())[:4])
                parts.append(f"{key}: {inner}")
            elif isinstance(val, (str, int, float)):
                parts.append(f"{key}: {val!s:.80}")

        if not parts:
            # Fallback: the first few top-level key=value pairs.
            parts = [f"{k}={v!s:.40}" for k, v in list(details.items())[:5]]

        return "; ".join(parts)[:300]

    @staticmethod
    def _extract_keywords(details: dict[str, Any]) -> list[str]:
        """Collect up to five keywords from ``details`` for playbook search.

        Only the ``next_action`` and ``items`` entries of the dict-valued
        ``remediation`` / ``actionable`` / ``impact`` fields are considered.
        """
        keywords: list[str] = []

        for key in ("remediation", "actionable", "impact"):
            val = details.get(key)
            if not isinstance(val, dict):
                continue
            for sub_key in ("next_action", "items"):
                sub = val.get(sub_key)
                if isinstance(sub, str):
                    keywords.append(sub[:50])
                elif isinstance(sub, list):
                    keywords.extend(str(x)[:40] for x in sub[:2])

        return keywords[:5]

    @classmethod
    def _get_mcp_queries(cls, event_type: str) -> dict[str, str]:
        """Return the Prometheus queries relevant to ``event_type``.

        This only decides which metrics to look at; it encodes no
        event_type-to-action rules. The base metrics are always included and
        unknown event types get only those.
        """
        metric_names = cls._BASE_METRICS + cls._EXTRA_METRICS.get(event_type, ())
        return {name: cls._SLI_QUERIES[name] for name in metric_names}
|
||
|
||
|
||
# =============================================================================
|
||
# Singleton
|
||
# =============================================================================
|
||
|
||
# Module-level cache for the lazily created adapter.
_adapter_instance: DecisionFusionAdapter | None = None


def get_decision_fusion_adapter() -> DecisionFusionAdapter:
    """Return the shared DecisionFusionAdapter, creating it on first use."""
    global _adapter_instance
    if _adapter_instance is None:
        _adapter_instance = DecisionFusionAdapter()
    return _adapter_instance
|
||
|
||
|
||
def reset_decision_fusion_adapter() -> None:
    """Drop the cached adapter so the next lookup builds a new one (for tests)."""
    global _adapter_instance
    _adapter_instance = None
|