Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 1m59s
Wave 8 P3.1-T2 PathA 啟用 + Solver F4 安全強化 + test 對齊:
PathA — DiagnosisAggregator 信號分類層補 PDI:
- ENABLE_DIAGNOSIS_AGGREGATOR default=False → True
· PathA 純信號分類層(OOMKilled/CrashLoop 等業務邏輯)
· 不重複呼叫 K8s/SignOz API(只取 PDI 已收集的 raw 資料)
· 安全 default on — 純邏輯處理,無外部依賴重疊
- diagnosis_aggregator.py +155 行(PathA 實作)
- pre_decision_investigator.py 已接 (commit 3a2cd151)
F4 — Solver critical risk reject:
- solver_agent.py: _validate_recommended_action 拒絕 risk=critical
· 鐵律:critical 動作必須走人工審批,不可變 Telegram 按鈕
· log warning + return None(被 _extract 過濾掉)
- _extract_recommended_actions 改返回 (list, status_str) tuple
· status="ok"/"empty"/"all_invalid" 供呼叫端決策
- protocol.py +16 / metrics.py +9 / ai_router.py +18 — 配套 metric + protocol field
測試對齊:
- test_solver_recommended_actions.py 拆 test_all_valid → low/medium/high accepted +
test_critical_rejected
- result tuple unpack: result, _ = _extract_recommended_actions(...)
- test_diagnosis_aggregator_stub.py: feature flag default 改 True 對齊 PathA
Tests: 51 passed (solver 28 + aggregator 16 + router fallback 8)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-Authored-By: Multiple Engineers (Wave 8 P3.1-T2 PathA + F4) <noreply@anthropic.com>
305 lines
13 KiB
Python
305 lines
13 KiB
Python
"""
|
||
AWOOOI AIOps Phase 2 — 多 Agent 協作訊息協定
|
||
==============================================
|
||
定義 5 個 Agent 間傳遞的不可變資料型別。
|
||
|
||
設計原則:
|
||
1. 每個 Agent 有明確的 Input / Output 型別(不共用 dict)
|
||
2. 所有型別都是 dataclass(快速、可序列化、無外部依賴)
|
||
3. 降級 / 棄權用明確 AgentVote.ABSTAIN,不用 None 代替
|
||
4. 全程 immutable — Agent 不得修改彼此的輸出(防 prompt 污染)
|
||
|
||
ADR-082: 多 Agent 協作架構(Phase 2)
|
||
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 2 初始建立
|
||
2026-04-27 Claude Sonnet 4.6: B1 — 新增 RecommendedAction schema(北極星 §1.1 修復多樣性 ≥ 40%)
|
||
2026-04-27 Claude Sonnet 4.6: H1+B1 Fix Round — ActionPlan.recommended_actions_status enum(可觀測性)
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
from dataclasses import dataclass, field
|
||
from enum import Enum
|
||
from typing import Any, Literal
|
||
|
||
# 2026-04-27 Claude Sonnet 4.6: H1+B1 Fix Round — type alias for recommended_actions_status.
# Provided for solver_agent.py; a Literal is lighter than an Enum and needs no
# extra import.
# NOTE(review): the change description accompanying this file mentions a status
# "all_invalid"; it is not a member of this alias — confirm which spelling
# ("schema_failed" vs "all_invalid") is canonical.
RecommendedActionsStatus = Literal[
    "ok",                    # LLM produced >= 1 action that passed registry + validator
    "empty",                 # LLM produced 0 recommended_actions
    "schema_failed",         # LLM produced actions but all were rejected by schema / registry validation
    "registry_unavailable",  # registry failed to load ({})
]
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Enums
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
class AgentRole(str, Enum):
    """Identifier for each of the five Phase 2 agent roles.

    Members are ``str`` subclasses, so they compare equal to their plain
    string value.
    """

    DIAGNOSTICIAN = "diagnostician"
    SOLVER = "solver"
    REVIEWER = "reviewer"
    CRITIC = "critic"
    COORDINATOR = "coordinator"
|
||
|
||
|
||
class AgentVote(str, Enum):
    """Vote cast by an agent.

    Members are ``str`` subclasses, so they compare equal to their plain
    string value.
    """

    APPROVE = "approve"
    REJECT = "reject"
    REQUEST_REVISION = "request_revision"
    ABSTAIN = "abstain"    # circuit breaker tripped / timeout / not enough information
    DEGRADED = "degraded"  # degraded path (rule-based mock)
|
||
|
||
|
||
class AgentSessionStatus(str, Enum):
    """Overall status of an agent session.

    Members are ``str`` subclasses, so they compare equal to their plain
    string value.
    """

    RUNNING = "running"
    COMPLETED = "completed"
    DEGRADED = "degraded"  # some agents were circuit-broken but the session still completed
    FAILED = "failed"      # the Coordinator could not produce any conclusion
    TIMEOUT = "timeout"    # whole flow took > 30s
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Diagnostician Output
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
@dataclass
class Hypothesis:
    """A single root-cause hypothesis."""

    description: str
    confidence: float          # 0.0 ~ 1.0
    evidence_chain: list[str]  # evidence keys supporting this hypothesis
    category: str = ""         # alert_category (KubePod / HostDisk, etc.)
|
||
|
||
|
||
@dataclass
class DiagnosisReport:
    """
    Output of the Diagnostician agent.

    Holds several root-cause hypotheses (documented as ordered by confidence).
    Per the original design note, a top-1 confidence < 0.4 makes the
    Coordinator fall back to the Investigator to re-collect evidence.
    """

    hypotheses: list[Hypothesis]
    evidence_snapshot_id: str
    latency_ms: int
    vote: AgentVote = AgentVote.APPROVE  # enough information = APPROVE; not enough = ABSTAIN
    degraded: bool = False               # set when the circuit-breaker fallback was used

    @property
    def top_hypothesis(self) -> Hypothesis | None:
        """Return the highest-confidence hypothesis, or None when there are none.

        Selects by confidence instead of trusting the producer's ordering, the
        same way ActionPlan.top_candidate does. ``max`` returns the first
        maximal element, so for a list already sorted by descending confidence
        this is still ``hypotheses[0]``.
        """
        if not self.hypotheses:
            return None
        return max(self.hypotheses, key=lambda h: h.confidence)

    @property
    def top_confidence(self) -> float:
        """Return the confidence of the top hypothesis, or 0.0 when there is none."""
        top = self.top_hypothesis  # evaluate the property once
        return top.confidence if top is not None else 0.0
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Solver Output
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
@dataclass
class CandidateAction:
    """A single candidate remediation action."""

    action: str          # action description (e.g. "restart_service:awoooi-api")
    blast_radius: int    # 0-100: impact-scope score
    rollback_cost: int   # 0-100: how hard it is to roll back
    confidence: float    # 0.0 ~ 1.0
    rationale: str = ""  # why this option was chosen
|
||
|
||
|
||
# 2026-04-27 Claude Sonnet 4.6: B1 — structured Solver actions (North Star §1.1: remediation diversity >= 40%)
# RecommendedAction is the element type of ActionPlan.recommended_actions, used by B3 to
# generate Telegram buttons dynamically.
# Unlike CandidateAction (a kubectl command string), a RecommendedAction points at an MCP tool
# (which can be audited against the B2 allowlist).
@dataclass
class RecommendedAction:
    """
    Structured recommended remediation action (added in B1 for dynamic
    Telegram button generation).

    Difference from CandidateAction:
    - CandidateAction: kubectl command string (for the Coordinator to judge)
    - RecommendedAction: MCP tool call specification (for B3 to render
      Telegram buttons dynamically)

    mcp_provider must be in the provider list of callback_action_spec.yaml.
    mcp_tool must be in the B2 allowlist (to be created by the B2 task).
    params supports template substitution: {labels.xxx} / {incident_id}.

    None of these constraints are validated by this class; it only stores
    the values it is given.
    """

    name: str               # action identifier (e.g. check_pod_logs)
    label: str              # text shown in the UI
    emoji: str              # icon shown in the UI
    mcp_provider: Literal[  # MCP provider, restricted to the known list
        "k8s", "ssh", "prometheus", "signoz", "database", "internal"
    ]
    mcp_tool: str           # MCP tool name (must be in the B2 allowlist)
    params: dict[str, str]  # parameter templates (supports {labels.xxx} / {incident_id})
    risk: Literal["low", "medium", "high", "critical"]  # risk level
    reasoning: str          # why this action is recommended (so the critic can review it)
|
||
|
||
|
||
@dataclass
class ActionPlan:
    """
    Output of the Solver agent.

    Proposes >= 1 candidate action for each root-cause hypothesis (each with
    blast_radius / rollback_cost).
    Per the original design note, blast_radius > 50 means the Reviewer must
    answer `request_revision`.

    2026-04-27 Claude Sonnet 4.6: B1 added recommended_actions (structured action list)
    - an empty recommended_actions list means degraded (degraded=True) or the
      LLM could not produce a valid action
    - the Coordinator's older logic only reads candidates and is unaffected
    2026-04-27 Claude Sonnet 4.6: H1+B1 Fix Round — added recommended_actions_status
    - observability: B3 Telegram / monitoring dashboards can read this field
      to judge Solver quality
    """

    candidates: list[CandidateAction]
    diagnosis_report: DiagnosisReport
    latency_ms: int
    vote: AgentVote = AgentVote.APPROVE
    degraded: bool = False
    # 2026-04-27 Claude Sonnet 4.6: B1 — structured recommended actions (0-3 items, [] when degraded)
    recommended_actions: list[RecommendedAction] = field(default_factory=list)
    # 2026-04-27 Claude Sonnet 4.6: H1+B1 Fix Round — outcome of extracting recommended_actions
    # ok=normal, empty=LLM produced nothing, schema_failed=everything failed validation,
    # registry_unavailable=registry failed to load
    # The field is appended last with default "ok" so existing call sites keep working.
    # NOTE(review): a plan built without recommended_actions therefore still reports "ok",
    # although "ok" is documented as ">= 1 accepted action" — confirm producers always set
    # this field explicitly.
    recommended_actions_status: RecommendedActionsStatus = "ok"

    @property
    def top_candidate(self) -> CandidateAction | None:
        """Return the candidate with the highest confidence, or None when there are none."""
        if not self.candidates:
            return None
        return max(self.candidates, key=lambda c: c.confidence)
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Reviewer Output
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
@dataclass
class ReviewVerdict:
    """
    Output of the Reviewer agent (safety review).

    Per the original design note, the Reviewer hard-rejects actions that touch
    HARD_RULES (delete node / DROP TABLE / force push, etc.), and when
    vote = REJECT the Coordinator must not execute any candidate.
    """

    vote: AgentVote
    reason: str
    blocked_actions: list[str]  # actions that were rejected
    safe_actions: list[str]     # actions that passed the safety review
    latency_ms: int
    degraded: bool = False

    @property
    def is_safe(self) -> bool:
        """Return True only when the vote is APPROVE and at least one safe action exists."""
        return self.vote == AgentVote.APPROVE and bool(self.safe_actions)
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Critic Output
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
@dataclass
class Challenge:
    """A single objection raised by the Critic."""

    target: str    # "diagnosis" | "action:{action_str}"
    argument: str  # the concrete reason for the objection
    severity: str  # "minor" | "major" | "critical"
|
||
|
||
|
||
@dataclass
class CriticReport:
    """
    Output of the Critic agent (deliberately plays devil's advocate).

    Per the original design note, challenge_count > 0 is one of the Phase 2
    exit conditions, and major/critical challenges make the Coordinator lower
    its confidence in the Solver's plan.
    """

    challenges: list[Challenge]
    overall_assessment: str
    latency_ms: int
    vote: AgentVote = AgentVote.APPROVE  # APPROVE = no major objection; REJECT = has a critical challenge
    degraded: bool = False

    @property
    def challenge_count(self) -> int:
        """Return the number of challenges raised."""
        return len(self.challenges)

    @property
    def has_critical_challenge(self) -> bool:
        """Return True when any challenge has severity "critical".

        Severity is a free-form string, so it is compared ignoring case and
        surrounding whitespace; a value such as "Critical" is still treated
        as critical.
        """
        return any(
            str(c.severity).strip().lower() == "critical" for c in self.challenges
        )
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Coordinator Output
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
@dataclass
class DecisionPackage:
    """
    Output of the Coordinator agent (final decision package).

    Contains:
    - recommended_action: final recommended action (None = abstain / escalate to a human)
    - confidence: combined confidence (Solver x Reviewer x Critic, weighted)
    - requires_human_approval: whether human review is required
    - debate_summary: summary of the debate (for the audit trail + learning loop)
    - session_status: overall debate status
    """

    recommended_action: str | None
    confidence: float
    requires_human_approval: bool
    debate_summary: str
    session_status: AgentSessionStatus
    latency_ms: int

    # Raw output of each agent, kept for learning-loop queries
    diagnosis: DiagnosisReport | None = None
    action_plan: ActionPlan | None = None
    reviewer_verdict: ReviewVerdict | None = None
    critic_report: CriticReport | None = None

    # Discarded options (with reasons)
    rejected_actions: list[dict[str, Any]] = field(default_factory=list)

    # Why the decision is blocked (explains requires_human_approval=True)
    blocked_reason: str | None = None

    # Every agent was degraded (a stricter human-review signal)
    all_agents_degraded: bool = False

    @property
    def is_actionable(self) -> bool:
        """Return True when the package may be executed.

        All of the following must hold:
        - recommended_action is set and non-empty;
        - confidence is at least 0.4 (exactly 0.4 is accepted);
        - the Reviewer, if a verdict is present, did not vote REJECT
          (a missing verdict does not block).

        requires_human_approval is not consulted here.
        """
        if not self.recommended_action:
            return False
        # Written as `not (x >= 0.4)` rather than `x < 0.4`: NaN fails every
        # comparison, so this form rejects a NaN confidence instead of letting
        # it through as actionable.
        if not (self.confidence >= 0.4):
            return False
        verdict = self.reviewer_verdict
        if verdict is not None and verdict.vote == AgentVote.REJECT:
            return False
        return True
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Agent Session Record(DB 寫入用)
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
@dataclass
class AgentTurn:
    """
    Record of a single agent utterance.

    Per the original design note, this is one row written to the
    `agent_sessions` table, and session_id + agent_role uniquely identify one
    debate turn.
    """

    session_id: str
    incident_id: str
    agent_role: AgentRole
    input_hash: str              # sha256(input_json)[:16]
    output_json: dict[str, Any]  # raw agent output
    latency_ms: int
    vote: AgentVote
    degraded: bool = False
|