415 lines
16 KiB
Python
415 lines
16 KiB
Python
"""
|
||
AWOOOI AIOps Phase 2 — Agent Orchestrator(排程指揮官)
|
||
======================================================
|
||
職責:序列化 5 個 Agent 的執行,管理 Redis Streams 審計軌跡,記錄 DB
|
||
|
||
執行順序:
|
||
Diagnostician(snapshot)
|
||
→ Solver(diagnosis)
|
||
→ [Reviewer(plan) ‖ Critic(diagnosis, plan)] # 並行
|
||
→ Coordinator(diagnosis, plan, verdict, critic)
|
||
|
||
設計原則:
|
||
1. 每個 Agent 已有內建 5s 熔斷(在各自的 run() 中)— Orchestrator 不重複
|
||
2. 全流程 30s 全局超時(GLOBAL_TIMEOUT_SEC)
|
||
3. Redis Streams XADD:best-effort,失敗不阻塞主流程
|
||
4. DB 記錄:每個 Agent turn 一行 agent_sessions(Immutable Event Sourcing)
|
||
5. AIOPS_P2_ENABLED=False 時禁止使用(呼叫方負責 gate)
|
||
|
||
Redis Stream key: aiops:p2:events
|
||
DB table: agent_sessions
|
||
|
||
ADR-082: Phase 2 多 Agent 協作
|
||
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 2 初始建立
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
import dataclasses
|
||
import os
|
||
import time
|
||
import uuid
|
||
from datetime import UTC, datetime
|
||
from typing import TYPE_CHECKING
|
||
|
||
import structlog
|
||
from sqlalchemy import insert
|
||
|
||
from src.agents.coordinator_agent import get_coordinator_agent
|
||
from src.agents.critic_agent import compute_input_hash as critic_hash
|
||
from src.agents.critic_agent import get_critic_agent
|
||
from src.agents.diagnostician_agent import compute_input_hash as diag_hash
|
||
from src.agents.diagnostician_agent import get_diagnostician_agent
|
||
from src.agents.protocol import (
|
||
ActionPlan,
|
||
AgentRole,
|
||
AgentSessionStatus,
|
||
AgentVote,
|
||
CriticReport,
|
||
DecisionPackage,
|
||
DiagnosisReport,
|
||
ReviewVerdict,
|
||
)
|
||
from src.agents.reviewer_agent import compute_input_hash as reviewer_hash
|
||
from src.agents.reviewer_agent import get_reviewer_agent
|
||
from src.agents.solver_agent import compute_input_hash as solver_hash
|
||
from src.agents.solver_agent import get_solver_agent
|
||
from src.db.base import get_db_context
|
||
from src.db.models import AgentSession
|
||
|
||
if TYPE_CHECKING:
|
||
from src.services.evidence_snapshot import EvidenceSnapshot
|
||
|
||
logger = structlog.get_logger(__name__)
|
||
|
||
def _agent_debate_global_timeout_seconds() -> float:
|
||
"""Return the full Phase 2 debate timeout.
|
||
|
||
GCP Ollama incident analysis can legitimately take longer than the old
|
||
90s guard. Keep a hard ceiling, but make it an explicit deployment knob.
|
||
"""
|
||
|
||
raw = os.environ.get("AGENT_DEBATE_GLOBAL_TIMEOUT_SEC", "260.0")
|
||
try:
|
||
timeout = float(raw)
|
||
except (TypeError, ValueError):
|
||
timeout = 260.0
|
||
return max(timeout, 90.0)
|
||
|
||
|
||
# 全局超時(所有 Agent 加起來)
|
||
# 2026-05-06 Codex: configurable for GCP-A/GCP-B/111 Ollama-first incident
|
||
# diagnosis. The old 90s guard was cutting off valid deep diagnosis runs.
|
||
GLOBAL_TIMEOUT_SEC = _agent_debate_global_timeout_seconds()
|
||
|
||
# 2026-04-16 ogt + Claude Sonnet 4.6: 移除 _PER_AGENT_TIMEOUT_SEC
|
||
# LLM 必須等到完整回應,不得人工截斷。降級只在真正異常(連線失敗、模型崩潰)觸發。
|
||
# 全流程由 GLOBAL_TIMEOUT_SEC 防掛死即可。
|
||
|
||
# Redis Stream key
|
||
STREAM_KEY = "aiops:p2:events"
|
||
STREAM_MAXLEN = 10_000 # 保留最近 10k 條事件
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Public API
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
async def run_agent_debate(
|
||
snapshot: "EvidenceSnapshot",
|
||
incident_id: str,
|
||
) -> DecisionPackage:
|
||
"""
|
||
執行完整的 5 Agent 辯證流程。
|
||
|
||
Args:
|
||
snapshot: Phase 1 感官快照(EvidenceSnapshot)
|
||
incident_id: 關聯 incident ID
|
||
|
||
Returns:
|
||
DecisionPackage(包含 recommended_action + confidence + requires_human_approval)
|
||
|
||
Raises:
|
||
不拋出 — 所有錯誤內部吸收,最壞情況返回 requires_human_approval=True 的 Package
|
||
"""
|
||
session_id = str(uuid.uuid4())
|
||
start_ms = int(time.monotonic() * 1000)
|
||
|
||
logger.info(
|
||
"agent_debate_start",
|
||
session_id=session_id,
|
||
incident_id=incident_id,
|
||
snapshot_id=snapshot.snapshot_id,
|
||
)
|
||
|
||
try:
|
||
package = await asyncio.wait_for(
|
||
_debate(session_id, incident_id, snapshot),
|
||
timeout=GLOBAL_TIMEOUT_SEC,
|
||
)
|
||
elapsed = int(time.monotonic() * 1000) - start_ms
|
||
logger.info(
|
||
"agent_debate_done",
|
||
session_id=session_id,
|
||
incident_id=incident_id,
|
||
requires_human=package.requires_human_approval,
|
||
confidence=package.confidence,
|
||
status=package.session_status.value,
|
||
elapsed_ms=elapsed,
|
||
)
|
||
|
||
# ADR-083 Phase 3: 辯證結果學習訊號接線(L7×D2)
|
||
# best-effort — 學習失敗不阻塞主流程
|
||
# 2026-04-15 ogt + Claude Sonnet 4.6(亞太)
|
||
try:
|
||
from src.services.learning_service import get_learning_service
|
||
_critic_challenges = (
|
||
package.critic_report.challenge_count
|
||
if package.critic_report else 0
|
||
)
|
||
await get_learning_service().record_agent_session(
|
||
session_id=session_id,
|
||
incident_id=incident_id,
|
||
final_confidence=package.confidence,
|
||
requires_human_approval=package.requires_human_approval,
|
||
all_agents_degraded=package.all_agents_degraded,
|
||
critic_challenge_count=_critic_challenges,
|
||
)
|
||
except Exception as _lerr:
|
||
logger.warning(
|
||
"agent_session_learning_failed",
|
||
session_id=session_id,
|
||
error=str(_lerr),
|
||
)
|
||
|
||
return package
|
||
|
||
except asyncio.TimeoutError:
|
||
elapsed = int(time.monotonic() * 1000) - start_ms
|
||
logger.error(
|
||
"agent_debate_global_timeout",
|
||
session_id=session_id,
|
||
elapsed_ms=elapsed,
|
||
timeout_sec=GLOBAL_TIMEOUT_SEC,
|
||
)
|
||
return _timeout_package(elapsed)
|
||
|
||
except Exception:
|
||
elapsed = int(time.monotonic() * 1000) - start_ms
|
||
logger.exception("agent_debate_fatal", session_id=session_id)
|
||
return _error_package(elapsed)
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Internal — debate pipeline
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
async def _debate(
|
||
session_id: str,
|
||
incident_id: str,
|
||
snapshot: "EvidenceSnapshot",
|
||
) -> DecisionPackage:
|
||
"""實際的 Agent 辯證流程(在全局超時保護內執行)。"""
|
||
|
||
# ── Step 1: Diagnostician ──────────────────────────────────────────────
|
||
diagnostician = get_diagnostician_agent()
|
||
diagnosis = await diagnostician.run(snapshot)
|
||
await _record_turn(
|
||
session_id=session_id,
|
||
incident_id=incident_id,
|
||
role=AgentRole.DIAGNOSTICIAN,
|
||
input_hash=diag_hash(snapshot),
|
||
output=diagnosis,
|
||
latency_ms=diagnosis.latency_ms,
|
||
vote=diagnosis.vote,
|
||
degraded=diagnosis.degraded,
|
||
)
|
||
|
||
# ── Step 2: Solver ─────────────────────────────────────────────────────
|
||
solver = get_solver_agent()
|
||
# 2026-04-28: 把告警 labels 傳入,讓 Solver params 模板能填真實值
|
||
_alert_labels = (snapshot.alert_info or {}).get("labels", {}) if snapshot.alert_info else {}
|
||
plan = await solver.run(diagnosis, incident_labels=_alert_labels)
|
||
await _record_turn(
|
||
session_id=session_id,
|
||
incident_id=incident_id,
|
||
role=AgentRole.SOLVER,
|
||
input_hash=solver_hash(diagnosis),
|
||
output=plan,
|
||
latency_ms=plan.latency_ms,
|
||
vote=plan.vote,
|
||
degraded=plan.degraded,
|
||
)
|
||
|
||
# ── Step 3: Reviewer ‖ Critic(並行)──────────────────────────────────
|
||
reviewer = get_reviewer_agent()
|
||
critic = get_critic_agent()
|
||
|
||
verdict, critic_report = await asyncio.gather(
|
||
reviewer.run(plan),
|
||
critic.run(diagnosis, plan),
|
||
)
|
||
|
||
await asyncio.gather(
|
||
_record_turn(
|
||
session_id=session_id,
|
||
incident_id=incident_id,
|
||
role=AgentRole.REVIEWER,
|
||
input_hash=reviewer_hash(plan),
|
||
output=verdict,
|
||
latency_ms=verdict.latency_ms,
|
||
vote=verdict.vote,
|
||
degraded=verdict.degraded,
|
||
),
|
||
_record_turn(
|
||
session_id=session_id,
|
||
incident_id=incident_id,
|
||
role=AgentRole.CRITIC,
|
||
input_hash=critic_hash(diagnosis, plan),
|
||
output=critic_report,
|
||
latency_ms=critic_report.latency_ms,
|
||
vote=critic_report.vote,
|
||
degraded=critic_report.degraded,
|
||
),
|
||
return_exceptions=True, # MINOR-1: best-effort — 一個審計失敗不取消另一個
|
||
)
|
||
|
||
# ── Step 4: Coordinator ────────────────────────────────────────────────
|
||
coordinator = get_coordinator_agent()
|
||
package = await coordinator.run(diagnosis, plan, verdict, critic_report)
|
||
await _record_turn(
|
||
session_id=session_id,
|
||
incident_id=incident_id,
|
||
role=AgentRole.COORDINATOR,
|
||
input_hash=_coordinator_input_hash(diagnosis, plan, verdict, critic_report),
|
||
output=package,
|
||
latency_ms=package.latency_ms,
|
||
vote=AgentVote.APPROVE if not package.requires_human_approval else AgentVote.ABSTAIN,
|
||
degraded=package.all_agents_degraded,
|
||
)
|
||
|
||
return package
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Internal — DB + Redis recording
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
async def _record_turn(
|
||
session_id: str,
|
||
incident_id: str,
|
||
role: AgentRole,
|
||
input_hash: str,
|
||
output: object,
|
||
latency_ms: int,
|
||
vote: AgentVote,
|
||
degraded: bool,
|
||
) -> None:
|
||
"""
|
||
寫入 agent_sessions DB 行 + Redis Stream XADD(兩者皆 best-effort)。
|
||
|
||
Immutable Event Sourcing — 只 INSERT,永不 UPDATE / DELETE。
|
||
"""
|
||
output_json = _serialize_output(output)
|
||
|
||
# ── DB 寫入 ──────────────────────────────────────────────────────────
|
||
try:
|
||
async with get_db_context() as db:
|
||
await db.execute(
|
||
insert(AgentSession).values(
|
||
id=str(uuid.uuid4()),
|
||
session_id=session_id,
|
||
incident_id=incident_id,
|
||
agent_role=role.value,
|
||
input_hash=input_hash,
|
||
output_json=output_json,
|
||
latency_ms=latency_ms,
|
||
vote=vote.value,
|
||
degraded=degraded,
|
||
created_at=datetime.now(UTC),
|
||
)
|
||
)
|
||
except Exception:
|
||
logger.exception(
|
||
"agent_session_db_write_failed",
|
||
session_id=session_id,
|
||
role=role.value,
|
||
)
|
||
|
||
# ── Redis Stream XADD(best-effort,失敗靜默)────────────────────────
|
||
try:
|
||
from src.core.redis_client import get_redis
|
||
redis = get_redis()
|
||
await redis.xadd(
|
||
STREAM_KEY,
|
||
{
|
||
"session_id": session_id,
|
||
"incident_id": incident_id,
|
||
"role": role.value,
|
||
"vote": vote.value,
|
||
"latency_ms": str(latency_ms),
|
||
"degraded": "1" if degraded else "0",
|
||
},
|
||
maxlen=STREAM_MAXLEN,
|
||
approximate=True,
|
||
)
|
||
except Exception:
|
||
# Redis 失敗不阻塞主流程(觀測性非關鍵路徑)
|
||
logger.warning(
|
||
"agent_stream_xadd_failed",
|
||
session_id=session_id,
|
||
role=role.value,
|
||
)
|
||
|
||
|
||
def _serialize_output(output: object) -> dict:
|
||
"""
|
||
將 Agent 輸出 dataclass 轉為可 JSON 序列化的 dict。
|
||
|
||
Gate 2: dataclasses.asdict() 保留 Enum 實例,json.dumps 會拋 TypeError。
|
||
用 json.dumps(default=...) 先把 Enum 轉 .value 再 loads 回 dict。
|
||
"""
|
||
import json
|
||
from enum import Enum
|
||
|
||
try:
|
||
if dataclasses.is_dataclass(output) and not isinstance(output, type):
|
||
raw = dataclasses.asdict(output) # type: ignore[arg-type]
|
||
return json.loads(
|
||
json.dumps(raw, default=lambda o: o.value if isinstance(o, Enum) else str(o))
|
||
)
|
||
return {"raw": str(output)[:2000]}
|
||
except Exception:
|
||
return {"error": "serialization_failed"}
|
||
|
||
|
||
def _coordinator_input_hash(
|
||
diagnosis: DiagnosisReport,
|
||
plan: ActionPlan,
|
||
verdict: ReviewVerdict,
|
||
critic: CriticReport,
|
||
) -> str:
|
||
"""Coordinator 輸入 fingerprint(4 個 Agent 輸出的組合)。"""
|
||
import hashlib
|
||
key = (
|
||
diagnosis.evidence_snapshot_id
|
||
+ (diagnosis.top_hypothesis.description if diagnosis.top_hypothesis else "")
|
||
+ (plan.top_candidate.action if plan.top_candidate else "")
|
||
+ verdict.vote.value
|
||
+ str(critic.challenge_count)
|
||
)
|
||
return hashlib.sha256(key.encode()).hexdigest()[:16]
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Degraded fallback packages
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
def _timeout_package(elapsed_ms: int) -> DecisionPackage:
|
||
"""全局超時 → 強制人工審核。"""
|
||
return DecisionPackage(
|
||
recommended_action=None,
|
||
confidence=0.0,
|
||
requires_human_approval=True,
|
||
debate_summary=f"[超時] 辯證流程超過 {GLOBAL_TIMEOUT_SEC}s,強制升級人工審核",
|
||
session_status=AgentSessionStatus.TIMEOUT,
|
||
latency_ms=elapsed_ms,
|
||
blocked_reason=f"全局超時 > {GLOBAL_TIMEOUT_SEC}s",
|
||
all_agents_degraded=True,
|
||
)
|
||
|
||
|
||
def _error_package(elapsed_ms: int) -> DecisionPackage:
|
||
"""未預期錯誤 → 強制人工審核。"""
|
||
return DecisionPackage(
|
||
recommended_action=None,
|
||
confidence=0.0,
|
||
requires_human_approval=True,
|
||
debate_summary="[錯誤] 辯證流程發生未預期錯誤,強制升級人工審核",
|
||
session_status=AgentSessionStatus.FAILED,
|
||
latency_ms=elapsed_ms,
|
||
blocked_reason="Orchestrator 未預期錯誤",
|
||
all_agents_degraded=True,
|
||
)
|