""" AWOOOI AIOps Phase 2 — Agent Orchestrator(排程指揮官) ====================================================== 職責:序列化 5 個 Agent 的執行,管理 Redis Streams 審計軌跡,記錄 DB 執行順序: Diagnostician(snapshot) → Solver(diagnosis) → [Reviewer(plan) ‖ Critic(diagnosis, plan)] # 並行 → Coordinator(diagnosis, plan, verdict, critic) 設計原則: 1. 每個 Agent 已有內建 5s 熔斷(在各自的 run() 中)— Orchestrator 不重複 2. 全流程 30s 全局超時(GLOBAL_TIMEOUT_SEC) 3. Redis Streams XADD:best-effort,失敗不阻塞主流程 4. DB 記錄:每個 Agent turn 一行 agent_sessions(Immutable Event Sourcing) 5. AIOPS_P2_ENABLED=False 時禁止使用(呼叫方負責 gate) Redis Stream key: aiops:p2:events DB table: agent_sessions ADR-082: Phase 2 多 Agent 協作 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 2 初始建立 """ from __future__ import annotations import asyncio import dataclasses import os import time import uuid from datetime import UTC, datetime from typing import TYPE_CHECKING import structlog from sqlalchemy import insert from src.agents.coordinator_agent import get_coordinator_agent from src.agents.critic_agent import compute_input_hash as critic_hash from src.agents.critic_agent import get_critic_agent from src.agents.diagnostician_agent import compute_input_hash as diag_hash from src.agents.diagnostician_agent import get_diagnostician_agent from src.agents.protocol import ( ActionPlan, AgentRole, AgentSessionStatus, AgentVote, CriticReport, DecisionPackage, DiagnosisReport, ReviewVerdict, ) from src.agents.reviewer_agent import compute_input_hash as reviewer_hash from src.agents.reviewer_agent import get_reviewer_agent from src.agents.solver_agent import compute_input_hash as solver_hash from src.agents.solver_agent import get_solver_agent from src.db.base import get_db_context from src.db.models import AgentSession if TYPE_CHECKING: from src.services.evidence_snapshot import EvidenceSnapshot logger = structlog.get_logger(__name__) def _agent_debate_global_timeout_seconds() -> float: """Return the full Phase 2 debate timeout. GCP Ollama incident analysis can legitimately take longer than the old 90s guard. Keep a hard ceiling, but make it an explicit deployment knob. """ raw = os.environ.get("AGENT_DEBATE_GLOBAL_TIMEOUT_SEC", "420.0") try: timeout = float(raw) except (TypeError, ValueError): timeout = 420.0 return max(timeout, 90.0) # 全局超時(所有 Agent 加起來) # 2026-05-06 Codex: configurable for GCP-A/GCP-B/111 Ollama-first incident # diagnosis. The old 90s guard was cutting off valid deep diagnosis runs. GLOBAL_TIMEOUT_SEC = _agent_debate_global_timeout_seconds() # 2026-04-16 ogt + Claude Sonnet 4.6: 移除 _PER_AGENT_TIMEOUT_SEC # LLM 必須等到完整回應,不得人工截斷。降級只在真正異常(連線失敗、模型崩潰)觸發。 # 全流程由 GLOBAL_TIMEOUT_SEC 防掛死即可。 # Redis Stream key STREAM_KEY = "aiops:p2:events" STREAM_MAXLEN = 10_000 # 保留最近 10k 條事件 # ───────────────────────────────────────────────────────────────────────────── # Public API # ───────────────────────────────────────────────────────────────────────────── async def run_agent_debate( snapshot: "EvidenceSnapshot", incident_id: str, ) -> DecisionPackage: """ 執行完整的 5 Agent 辯證流程。 Args: snapshot: Phase 1 感官快照(EvidenceSnapshot) incident_id: 關聯 incident ID Returns: DecisionPackage(包含 recommended_action + confidence + requires_human_approval) Raises: 不拋出 — 所有錯誤內部吸收,最壞情況返回 requires_human_approval=True 的 Package """ session_id = str(uuid.uuid4()) start_ms = int(time.monotonic() * 1000) logger.info( "agent_debate_start", session_id=session_id, incident_id=incident_id, snapshot_id=snapshot.snapshot_id, ) try: package = await asyncio.wait_for( _debate(session_id, incident_id, snapshot), timeout=GLOBAL_TIMEOUT_SEC, ) elapsed = int(time.monotonic() * 1000) - start_ms logger.info( "agent_debate_done", session_id=session_id, incident_id=incident_id, requires_human=package.requires_human_approval, confidence=package.confidence, status=package.session_status.value, elapsed_ms=elapsed, ) # ADR-083 Phase 3: 辯證結果學習訊號接線(L7×D2) # best-effort — 學習失敗不阻塞主流程 # 2026-04-15 ogt + Claude Sonnet 4.6(亞太) try: from src.services.learning_service import get_learning_service _critic_challenges = ( package.critic_report.challenge_count if package.critic_report else 0 ) await get_learning_service().record_agent_session( session_id=session_id, incident_id=incident_id, final_confidence=package.confidence, requires_human_approval=package.requires_human_approval, all_agents_degraded=package.all_agents_degraded, critic_challenge_count=_critic_challenges, ) except Exception as _lerr: logger.warning( "agent_session_learning_failed", session_id=session_id, error=str(_lerr), ) return package except asyncio.TimeoutError: elapsed = int(time.monotonic() * 1000) - start_ms logger.error( "agent_debate_global_timeout", session_id=session_id, elapsed_ms=elapsed, timeout_sec=GLOBAL_TIMEOUT_SEC, ) return _timeout_package(elapsed) except Exception: elapsed = int(time.monotonic() * 1000) - start_ms logger.exception("agent_debate_fatal", session_id=session_id) return _error_package(elapsed) # ───────────────────────────────────────────────────────────────────────────── # Internal — debate pipeline # ───────────────────────────────────────────────────────────────────────────── async def _debate( session_id: str, incident_id: str, snapshot: "EvidenceSnapshot", ) -> DecisionPackage: """實際的 Agent 辯證流程(在全局超時保護內執行)。""" # ── Step 1: Diagnostician ────────────────────────────────────────────── diagnostician = get_diagnostician_agent() diagnosis = await diagnostician.run(snapshot) await _record_turn( session_id=session_id, incident_id=incident_id, role=AgentRole.DIAGNOSTICIAN, input_hash=diag_hash(snapshot), output=diagnosis, latency_ms=diagnosis.latency_ms, vote=diagnosis.vote, degraded=diagnosis.degraded, ) # ── Step 2: Solver ───────────────────────────────────────────────────── solver = get_solver_agent() # 2026-04-28: 把告警 labels 傳入,讓 Solver params 模板能填真實值 _alert_labels = (snapshot.alert_info or {}).get("labels", {}) if snapshot.alert_info else {} plan = await solver.run(diagnosis, incident_labels=_alert_labels) await _record_turn( session_id=session_id, incident_id=incident_id, role=AgentRole.SOLVER, input_hash=solver_hash(diagnosis), output=plan, latency_ms=plan.latency_ms, vote=plan.vote, degraded=plan.degraded, ) # ── Step 3: Reviewer ‖ Critic(並行)────────────────────────────────── reviewer = get_reviewer_agent() critic = get_critic_agent() verdict, critic_report = await asyncio.gather( reviewer.run(plan), critic.run(diagnosis, plan), ) await asyncio.gather( _record_turn( session_id=session_id, incident_id=incident_id, role=AgentRole.REVIEWER, input_hash=reviewer_hash(plan), output=verdict, latency_ms=verdict.latency_ms, vote=verdict.vote, degraded=verdict.degraded, ), _record_turn( session_id=session_id, incident_id=incident_id, role=AgentRole.CRITIC, input_hash=critic_hash(diagnosis, plan), output=critic_report, latency_ms=critic_report.latency_ms, vote=critic_report.vote, degraded=critic_report.degraded, ), return_exceptions=True, # MINOR-1: best-effort — 一個審計失敗不取消另一個 ) # ── Step 4: Coordinator ──────────────────────────────────────────────── coordinator = get_coordinator_agent() package = await coordinator.run(diagnosis, plan, verdict, critic_report) await _record_turn( session_id=session_id, incident_id=incident_id, role=AgentRole.COORDINATOR, input_hash=_coordinator_input_hash(diagnosis, plan, verdict, critic_report), output=package, latency_ms=package.latency_ms, vote=AgentVote.APPROVE if not package.requires_human_approval else AgentVote.ABSTAIN, degraded=package.all_agents_degraded, ) return package # ───────────────────────────────────────────────────────────────────────────── # Internal — DB + Redis recording # ───────────────────────────────────────────────────────────────────────────── async def _record_turn( session_id: str, incident_id: str, role: AgentRole, input_hash: str, output: object, latency_ms: int, vote: AgentVote, degraded: bool, ) -> None: """ 寫入 agent_sessions DB 行 + Redis Stream XADD(兩者皆 best-effort)。 Immutable Event Sourcing — 只 INSERT,永不 UPDATE / DELETE。 """ output_json = _serialize_output(output) # ── DB 寫入 ────────────────────────────────────────────────────────── try: async with get_db_context() as db: await db.execute( insert(AgentSession).values( id=str(uuid.uuid4()), session_id=session_id, incident_id=incident_id, agent_role=role.value, input_hash=input_hash, output_json=output_json, latency_ms=latency_ms, vote=vote.value, degraded=degraded, created_at=datetime.now(UTC), ) ) except Exception: logger.exception( "agent_session_db_write_failed", session_id=session_id, role=role.value, ) # ── Redis Stream XADD(best-effort,失敗靜默)──────────────────────── try: from src.core.redis_client import get_redis redis = get_redis() await redis.xadd( STREAM_KEY, { "session_id": session_id, "incident_id": incident_id, "role": role.value, "vote": vote.value, "latency_ms": str(latency_ms), "degraded": "1" if degraded else "0", }, maxlen=STREAM_MAXLEN, approximate=True, ) except Exception: # Redis 失敗不阻塞主流程(觀測性非關鍵路徑) logger.warning( "agent_stream_xadd_failed", session_id=session_id, role=role.value, ) def _serialize_output(output: object) -> dict: """ 將 Agent 輸出 dataclass 轉為可 JSON 序列化的 dict。 Gate 2: dataclasses.asdict() 保留 Enum 實例,json.dumps 會拋 TypeError。 用 json.dumps(default=...) 先把 Enum 轉 .value 再 loads 回 dict。 """ import json from enum import Enum try: if dataclasses.is_dataclass(output) and not isinstance(output, type): raw = dataclasses.asdict(output) # type: ignore[arg-type] return json.loads( json.dumps(raw, default=lambda o: o.value if isinstance(o, Enum) else str(o)) ) return {"raw": str(output)[:2000]} except Exception: return {"error": "serialization_failed"} def _coordinator_input_hash( diagnosis: DiagnosisReport, plan: ActionPlan, verdict: ReviewVerdict, critic: CriticReport, ) -> str: """Coordinator 輸入 fingerprint(4 個 Agent 輸出的組合)。""" import hashlib key = ( diagnosis.evidence_snapshot_id + (diagnosis.top_hypothesis.description if diagnosis.top_hypothesis else "") + (plan.top_candidate.action if plan.top_candidate else "") + verdict.vote.value + str(critic.challenge_count) ) return hashlib.sha256(key.encode()).hexdigest()[:16] # ───────────────────────────────────────────────────────────────────────────── # Degraded fallback packages # ───────────────────────────────────────────────────────────────────────────── def _timeout_package(elapsed_ms: int) -> DecisionPackage: """全局超時 → 強制人工審核。""" return DecisionPackage( recommended_action=None, confidence=0.0, requires_human_approval=True, debate_summary=f"[超時] 辯證流程超過 {GLOBAL_TIMEOUT_SEC}s,強制升級人工審核", session_status=AgentSessionStatus.TIMEOUT, latency_ms=elapsed_ms, blocked_reason=f"全局超時 > {GLOBAL_TIMEOUT_SEC}s", all_agents_degraded=True, ) def _error_package(elapsed_ms: int) -> DecisionPackage: """未預期錯誤 → 強制人工審核。""" return DecisionPackage( recommended_action=None, confidence=0.0, requires_human_approval=True, debate_summary="[錯誤] 辯證流程發生未預期錯誤,強制升級人工審核", session_status=AgentSessionStatus.FAILED, latency_ms=elapsed_ms, blocked_reason="Orchestrator 未預期錯誤", all_agents_degraded=True, )