Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
原設計 asyncio.wait_for(timeout_sec=25s) 是任意截斷,
只要 LLM 超過時限就降級為 confidence=20%,根本沒有分析。
正確做法:
- 移除所有 4 個 agent 的 asyncio.wait_for() 包裝
- 只留 except Exception 捕真實異常(連線失敗、模型崩潰)
- 全流程由 Orchestrator GLOBAL_TIMEOUT_SEC=90s 防掛死
- _PER_AGENT_TIMEOUT_SEC 常數廢棄移除
影響:LLM 推理多久就等多久,不再人工截斷,
deepseek-r1:14b 等模型得以完整輸出分析結果。
2026-04-16 ogt + Claude Sonnet 4.6
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
228 lines
8.7 KiB
Python
228 lines
8.7 KiB
Python
"""
|
||
AWOOOI AIOps Phase 2 — Reviewer Agent(安全官)
|
||
================================================
|
||
職責:安全審查 + 可行性驗證
|
||
|
||
輸入:ActionPlan(來自 Solver)
|
||
輸出:ReviewVerdict(approve / reject / request_revision)
|
||
|
||
設計原則:
|
||
1. 硬核拒絕 HARD_RULES 觸碰動作(delete node / DROP TABLE / force push 等)
|
||
2. blast_radius > 50 → 自動 request_revision(不 reject,讓 Solver 調整方案)
|
||
3. blast_radius > 80 → reject(風險太高)
|
||
4. 熔斷降級:LLM 失敗 → 保守降級(APPROVE 低 blast_radius,REJECT 高 blast_radius)
|
||
5. Reviewer 的 REJECT 是最高優先:Coordinator 不得執行任何被拒絕的方案
|
||
|
||
HARD_RULES 觸碰清單(ADR-082 §安全原則):
|
||
- kubectl delete node / kubectl delete --all
|
||
- DROP TABLE / DELETE FROM(無 WHERE)
|
||
- rm -rf /
|
||
- force push to main
|
||
- kubectl exec 執行任意 shell
|
||
|
||
ADR-082: Phase 2 多 Agent 協作
|
||
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 2 初始建立
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import hashlib
|
||
import re
|
||
import time
|
||
from typing import Any
|
||
|
||
import structlog
|
||
|
||
from src.agents.base import BaseAgent
|
||
from src.agents.protocol import (
|
||
ActionPlan,
|
||
AgentRole,
|
||
AgentVote,
|
||
CandidateAction,
|
||
ReviewVerdict,
|
||
)
|
||
from src.services.sanitization_service import sanitize
|
||
|
||
# Module-level structlog logger, named after this module.
logger = structlog.get_logger(__name__)
|
||
|
||
# Blast-radius thresholds.
BLAST_REQUEST_REVISION_THRESHOLD = 50  # > 50 → request_revision
BLAST_REJECT_THRESHOLD = 80  # > 80 → reject (too dangerous)

# Hard-block patterns (HARD_RULES violations). An action matching ANY of these
# is rejected by static check, before any other review logic runs.
_HARD_BLOCK_PATTERNS = [
    re.compile(r"kubectl\s+delete\s+node", re.IGNORECASE),
    re.compile(r"kubectl\s+delete\s+--all", re.IGNORECASE),
    re.compile(r"\bDROP\s+TABLE\b", re.IGNORECASE),
    # DELETE FROM with no WHERE anywhere after it. The negative lookahead must
    # sit right after FROM (not after a `.*`), otherwise backtracking defeats it.
    re.compile(r"\bDELETE\s+FROM\b(?!.*\bWHERE\b)", re.IGNORECASE | re.DOTALL),
    re.compile(r"rm\s+-rf\s+/", re.IGNORECASE),
    # Force push to main, flag BEFORE the branch:
    # "force push ... main", "git push --force origin main", "git push -f origin main".
    re.compile(r"(?:force.{0,5}push|push.{0,30}(?:--force|-f\b)).{0,30}main", re.IGNORECASE),
    # Force push to main, flag AFTER the branch: "git push origin main --force"
    # / "git push origin main -f". The pattern above requires "main" to follow
    # the flag, so this ordering previously slipped through.
    re.compile(r"push\b.{0,30}\bmain\b.{0,30}(?:--force|-f\b)", re.IGNORECASE),
    # kubectl exec is listed as a HARD_RULE in the module docstring but had no
    # pattern. Any `kubectl exec` runs an arbitrary command inside a container,
    # so it is blocked outright (conservative reading of the rule).
    re.compile(r"kubectl\s+exec\b", re.IGNORECASE),
]
|
||
|
||
|
||
class ReviewerAgent(BaseAgent):
    """Reviewer Agent — safety reviewer for Solver action plans.

    Takes an ``ActionPlan`` and returns a ``ReviewVerdict``. All decisions in
    this class are made by static rules (HARD_RULES regex match and
    blast-radius thresholds); no LLM call is issued here.

    Usage:
        agent = ReviewerAgent()
        verdict = await agent.run(action_plan)
    """

    AGENT_NAME = AgentRole.REVIEWER.value
    AGENT_DESCRIPTION = "Safety and feasibility reviewer. Hard-blocks HARD_RULES violations."

    # Highest blast_radius the degraded (fallback) path is willing to approve.
    _DEGRADED_APPROVE_MAX_BLAST = 30

    async def run(
        self,
        plan: ActionPlan,
        timeout_sec: float = 0.0,  # noqa: ARG002 — deprecated, kept for signature compatibility
    ) -> ReviewVerdict:
        """Review the safety of an action plan.

        Order of checks:
          1. Static HARD_RULES pattern match over every candidate action; any
             hit yields an immediate REJECT.
          2. Blast-radius rules via ``_review``.
          3. If step 2 raises, the exception is logged and a degraded verdict
             is returned instead of propagating.

        Args:
            plan: Action plan produced by the Solver.
            timeout_sec: Deprecated (2026-04-16 ogt) and ignored; this method
                applies no per-agent timeout.

        Returns:
            ReviewVerdict with ``latency_ms`` filled in; ``degraded=True`` when
            it comes from the fallback path.
        """
        start_ms = int(time.monotonic() * 1000)

        # 1. Hard static check — HARD_RULES take precedence over everything else.
        hard_blocked = [
            c.action for c in plan.candidates
            if _is_hard_blocked(c.action)
        ]
        if hard_blocked:
            latency = int(time.monotonic() * 1000) - start_ms
            logger.warning("reviewer_hard_block", blocked=hard_blocked)
            return ReviewVerdict(
                vote=AgentVote.REJECT,
                reason=f"HARD_RULES 觸碰:{hard_blocked}",
                blocked_actions=hard_blocked,
                safe_actions=[],
                latency_ms=latency,
            )

        try:
            verdict = await self._review(plan)
            # _review returns latency_ms=0; the real elapsed time is set here.
            verdict.latency_ms = int(time.monotonic() * 1000) - start_ms
            logger.info(
                "reviewer_done",
                vote=verdict.vote,
                blocked=len(verdict.blocked_actions),
                safe=len(verdict.safe_actions),
                latency_ms=verdict.latency_ms,
            )
            return verdict

        except Exception:
            # Top-level boundary: log with traceback and fall back to the
            # conservative degraded verdict rather than failing the pipeline.
            latency = int(time.monotonic() * 1000) - start_ms
            logger.exception("reviewer_error")
            return self._degraded_verdict(plan, latency, "error")

    async def _review(self, plan: ActionPlan) -> ReviewVerdict:
        """Apply the static blast-radius rules to the plan.

        Candidates are bucketed by ``blast_radius``:
          - above BLAST_REJECT_THRESHOLD            → plan is REJECTed
          - above BLAST_REQUEST_REVISION_THRESHOLD  → REQUEST_REVISION
          - otherwise                               → APPROVE
          - no candidates at all                    → ABSTAIN

        The returned verdict always carries ``latency_ms=0``; ``run``
        overwrites it with the measured value.
        """
        high_blast = [c for c in plan.candidates if c.blast_radius > BLAST_REJECT_THRESHOLD]
        mid_blast = [
            c for c in plan.candidates
            if BLAST_REQUEST_REVISION_THRESHOLD < c.blast_radius <= BLAST_REJECT_THRESHOLD
        ]
        safe_candidates = [
            c for c in plan.candidates
            if c.blast_radius <= BLAST_REQUEST_REVISION_THRESHOLD
        ]

        if high_blast:
            return ReviewVerdict(
                vote=AgentVote.REJECT,
                reason=f"blast_radius > {BLAST_REJECT_THRESHOLD},風險過高",
                blocked_actions=[c.action for c in high_blast],
                safe_actions=[c.action for c in safe_candidates],
                latency_ms=0,
            )

        if mid_blast:
            return ReviewVerdict(
                vote=AgentVote.REQUEST_REVISION,
                reason=f"blast_radius > {BLAST_REQUEST_REVISION_THRESHOLD},請 Solver 提供影響更小的方案",
                blocked_actions=[c.action for c in mid_blast],
                safe_actions=[c.action for c in safe_candidates],
                latency_ms=0,
            )

        # Every candidate is within the revision threshold. Approval is purely
        # static at present; no LLM feasibility review is performed here.
        if safe_candidates:
            return ReviewVerdict(
                vote=AgentVote.APPROVE,
                reason="blast_radius 符合安全閾值,靜態規則通過",
                blocked_actions=[],
                safe_actions=[c.action for c in safe_candidates],
                latency_ms=0,
            )

        return ReviewVerdict(
            vote=AgentVote.ABSTAIN,
            reason="無候選方案可審查",
            blocked_actions=[],
            safe_actions=[],
            latency_ms=0,
        )

    def _build_prompt(self, context: dict[str, Any]) -> str:
        """Return an empty prompt — the Phase 2 Reviewer uses static rules; the LLM is reserved as a backup."""
        return ""

    def _parse_response(self, response: str) -> dict[str, Any]:
        """Parse a response via ``self._extract_json`` (not defined in this class; presumably inherited from BaseAgent)."""
        return self._extract_json(response)

    def analyze(self, context: dict[str, Any]) -> Any:
        """Unsupported entry point; Phase 2 agents are driven through ``run``.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Use run() for Phase 2 agents")

    def _degraded_verdict(
        self,
        plan: ActionPlan,
        latency_ms: int,
        reason: str,
    ) -> ReviewVerdict:
        """Build the conservative fallback verdict used when the review raised.

        Rules:
          - any candidate above BLAST_REJECT_THRESHOLD, or whose blast_radius
            cannot be compared to a number → REJECT (the fallback must never
            be more permissive than the normal path);
          - otherwise, all candidates at or below 30 → APPROVE;
          - otherwise (including an empty plan) → REQUEST_REVISION.

        Args:
            plan: The plan that was being reviewed.
            latency_ms: Elapsed time measured by the caller.
            reason: Short failure tag embedded in the verdict's reason text.

        Returns:
            ReviewVerdict with ``degraded=True``.
        """
        safe: list[str] = []
        risky: list[str] = []
        must_reject = False

        for candidate in plan.candidates:
            try:
                radius = candidate.blast_radius
                within_limit = radius <= self._DEGRADED_APPROVE_MAX_BLAST
                over_reject_limit = radius > BLAST_REJECT_THRESHOLD
            except TypeError:
                # A non-comparable blast_radius (e.g. None) is the most likely
                # reason the normal review raised; repeating the comparison
                # unguarded would raise again from inside the fallback. Treat
                # an unknown radius as maximally risky instead.
                within_limit, over_reject_limit = False, True

            if within_limit:
                safe.append(candidate.action)
            else:
                risky.append(candidate.action)
                must_reject = must_reject or over_reject_limit

        if must_reject:
            vote = AgentVote.REJECT
        elif safe and not risky:
            vote = AgentVote.APPROVE
        else:
            vote = AgentVote.REQUEST_REVISION

        return ReviewVerdict(
            vote=vote,
            reason=f"[降級] Reviewer LLM 失敗({reason}),使用保守靜態降級規則",
            blocked_actions=risky,
            safe_actions=safe,
            latency_ms=latency_ms,
            degraded=True,
        )
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Helpers
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
def _is_hard_blocked(action: str) -> bool:
    """Report whether ``action`` matches any HARD_RULES pattern (static regex check, no LLM involved)."""
    for pattern in _HARD_BLOCK_PATTERNS:
        if pattern.search(action) is not None:
            return True
    return False
|
||
|
||
|
||
def compute_input_hash(plan: ActionPlan) -> str:
    """Return a short fingerprint of a plan's review input.

    The fingerprint is the first 16 hex characters of the SHA-256 digest of
    the diagnosis report's ``evidence_snapshot_id`` concatenated with the
    ``str()`` of the list of candidate action strings.
    """
    snapshot_id = plan.diagnosis_report.evidence_snapshot_id
    actions = [candidate.action for candidate in plan.candidates]
    material = snapshot_id + str(actions)
    digest = hashlib.sha256(material.encode()).hexdigest()
    return digest[:16]
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Singleton
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
# Module-level cache for the shared ReviewerAgent; stays None until first requested.
_agent: ReviewerAgent | None = None


def get_reviewer_agent() -> ReviewerAgent:
    """Return the shared ReviewerAgent, constructing it on the first call."""
    global _agent
    if _agent is not None:
        return _agent
    _agent = ReviewerAgent()
    return _agent
|