Files
awoooi/apps/api/src/services/agent_orchestrator.py
Your Name c2c0b1ec82
All checks were successful
Code Review / ai-code-review (push) Successful in 10s
CD Pipeline / tests (push) Successful in 1m9s
CD Pipeline / build-and-deploy (push) Successful in 4m21s
CD Pipeline / post-deploy-checks (push) Successful in 1m16s
fix(alerts): let GCP Ollama finish before cloud fallback
2026-05-06 05:27:55 +08:00

415 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
AWOOOI AIOps Phase 2 — Agent Orchestrator排程指揮官
======================================================
職責:序列化 5 個 Agent 的執行,管理 Redis Streams 審計軌跡,記錄 DB
執行順序:
Diagnostician(snapshot)
→ Solver(diagnosis)
→ [Reviewer(plan) ‖ Critic(diagnosis, plan)] # 並行
→ Coordinator(diagnosis, plan, verdict, critic)
設計原則:
1. 每個 Agent 已有內建 5s 熔斷(在各自的 run() 中)— Orchestrator 不重複
2. 全流程 30s 全局超時GLOBAL_TIMEOUT_SEC
3. Redis Streams XADDbest-effort失敗不阻塞主流程
4. DB 記錄:每個 Agent turn 一行 agent_sessionsImmutable Event Sourcing
5. AIOPS_P2_ENABLED=False 時禁止使用(呼叫方負責 gate
Redis Stream key: aiops:p2:events
DB table: agent_sessions
ADR-082: Phase 2 多 Agent 協作
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 2 初始建立
"""
from __future__ import annotations
import asyncio
import dataclasses
import os
import time
import uuid
from datetime import UTC, datetime
from typing import TYPE_CHECKING
import structlog
from sqlalchemy import insert
from src.agents.coordinator_agent import get_coordinator_agent
from src.agents.critic_agent import compute_input_hash as critic_hash
from src.agents.critic_agent import get_critic_agent
from src.agents.diagnostician_agent import compute_input_hash as diag_hash
from src.agents.diagnostician_agent import get_diagnostician_agent
from src.agents.protocol import (
ActionPlan,
AgentRole,
AgentSessionStatus,
AgentVote,
CriticReport,
DecisionPackage,
DiagnosisReport,
ReviewVerdict,
)
from src.agents.reviewer_agent import compute_input_hash as reviewer_hash
from src.agents.reviewer_agent import get_reviewer_agent
from src.agents.solver_agent import compute_input_hash as solver_hash
from src.agents.solver_agent import get_solver_agent
from src.db.base import get_db_context
from src.db.models import AgentSession
if TYPE_CHECKING:
from src.services.evidence_snapshot import EvidenceSnapshot
logger = structlog.get_logger(__name__)
def _agent_debate_global_timeout_seconds() -> float:
"""Return the full Phase 2 debate timeout.
GCP Ollama incident analysis can legitimately take longer than the old
90s guard. Keep a hard ceiling, but make it an explicit deployment knob.
"""
raw = os.environ.get("AGENT_DEBATE_GLOBAL_TIMEOUT_SEC", "260.0")
try:
timeout = float(raw)
except (TypeError, ValueError):
timeout = 260.0
return max(timeout, 90.0)
# 全局超時(所有 Agent 加起來)
# 2026-05-06 Codex: configurable for GCP-A/GCP-B/111 Ollama-first incident
# diagnosis. The old 90s guard was cutting off valid deep diagnosis runs.
GLOBAL_TIMEOUT_SEC = _agent_debate_global_timeout_seconds()
# 2026-04-16 ogt + Claude Sonnet 4.6: 移除 _PER_AGENT_TIMEOUT_SEC
# LLM 必須等到完整回應,不得人工截斷。降級只在真正異常(連線失敗、模型崩潰)觸發。
# 全流程由 GLOBAL_TIMEOUT_SEC 防掛死即可。
# Redis Stream key
STREAM_KEY = "aiops:p2:events"
STREAM_MAXLEN = 10_000 # 保留最近 10k 條事件
# ─────────────────────────────────────────────────────────────────────────────
# Public API
# ─────────────────────────────────────────────────────────────────────────────
async def run_agent_debate(
snapshot: "EvidenceSnapshot",
incident_id: str,
) -> DecisionPackage:
"""
執行完整的 5 Agent 辯證流程。
Args:
snapshot: Phase 1 感官快照EvidenceSnapshot
incident_id: 關聯 incident ID
Returns:
DecisionPackage包含 recommended_action + confidence + requires_human_approval
Raises:
不拋出 — 所有錯誤內部吸收,最壞情況返回 requires_human_approval=True 的 Package
"""
session_id = str(uuid.uuid4())
start_ms = int(time.monotonic() * 1000)
logger.info(
"agent_debate_start",
session_id=session_id,
incident_id=incident_id,
snapshot_id=snapshot.snapshot_id,
)
try:
package = await asyncio.wait_for(
_debate(session_id, incident_id, snapshot),
timeout=GLOBAL_TIMEOUT_SEC,
)
elapsed = int(time.monotonic() * 1000) - start_ms
logger.info(
"agent_debate_done",
session_id=session_id,
incident_id=incident_id,
requires_human=package.requires_human_approval,
confidence=package.confidence,
status=package.session_status.value,
elapsed_ms=elapsed,
)
# ADR-083 Phase 3: 辯證結果學習訊號接線L7×D2
# best-effort — 學習失敗不阻塞主流程
# 2026-04-15 ogt + Claude Sonnet 4.6(亞太)
try:
from src.services.learning_service import get_learning_service
_critic_challenges = (
package.critic_report.challenge_count
if package.critic_report else 0
)
await get_learning_service().record_agent_session(
session_id=session_id,
incident_id=incident_id,
final_confidence=package.confidence,
requires_human_approval=package.requires_human_approval,
all_agents_degraded=package.all_agents_degraded,
critic_challenge_count=_critic_challenges,
)
except Exception as _lerr:
logger.warning(
"agent_session_learning_failed",
session_id=session_id,
error=str(_lerr),
)
return package
except asyncio.TimeoutError:
elapsed = int(time.monotonic() * 1000) - start_ms
logger.error(
"agent_debate_global_timeout",
session_id=session_id,
elapsed_ms=elapsed,
timeout_sec=GLOBAL_TIMEOUT_SEC,
)
return _timeout_package(elapsed)
except Exception:
elapsed = int(time.monotonic() * 1000) - start_ms
logger.exception("agent_debate_fatal", session_id=session_id)
return _error_package(elapsed)
# ─────────────────────────────────────────────────────────────────────────────
# Internal — debate pipeline
# ─────────────────────────────────────────────────────────────────────────────
async def _debate(
session_id: str,
incident_id: str,
snapshot: "EvidenceSnapshot",
) -> DecisionPackage:
"""實際的 Agent 辯證流程(在全局超時保護內執行)。"""
# ── Step 1: Diagnostician ──────────────────────────────────────────────
diagnostician = get_diagnostician_agent()
diagnosis = await diagnostician.run(snapshot)
await _record_turn(
session_id=session_id,
incident_id=incident_id,
role=AgentRole.DIAGNOSTICIAN,
input_hash=diag_hash(snapshot),
output=diagnosis,
latency_ms=diagnosis.latency_ms,
vote=diagnosis.vote,
degraded=diagnosis.degraded,
)
# ── Step 2: Solver ─────────────────────────────────────────────────────
solver = get_solver_agent()
# 2026-04-28: 把告警 labels 傳入,讓 Solver params 模板能填真實值
_alert_labels = (snapshot.alert_info or {}).get("labels", {}) if snapshot.alert_info else {}
plan = await solver.run(diagnosis, incident_labels=_alert_labels)
await _record_turn(
session_id=session_id,
incident_id=incident_id,
role=AgentRole.SOLVER,
input_hash=solver_hash(diagnosis),
output=plan,
latency_ms=plan.latency_ms,
vote=plan.vote,
degraded=plan.degraded,
)
# ── Step 3: Reviewer ‖ Critic並行──────────────────────────────────
reviewer = get_reviewer_agent()
critic = get_critic_agent()
verdict, critic_report = await asyncio.gather(
reviewer.run(plan),
critic.run(diagnosis, plan),
)
await asyncio.gather(
_record_turn(
session_id=session_id,
incident_id=incident_id,
role=AgentRole.REVIEWER,
input_hash=reviewer_hash(plan),
output=verdict,
latency_ms=verdict.latency_ms,
vote=verdict.vote,
degraded=verdict.degraded,
),
_record_turn(
session_id=session_id,
incident_id=incident_id,
role=AgentRole.CRITIC,
input_hash=critic_hash(diagnosis, plan),
output=critic_report,
latency_ms=critic_report.latency_ms,
vote=critic_report.vote,
degraded=critic_report.degraded,
),
return_exceptions=True, # MINOR-1: best-effort — 一個審計失敗不取消另一個
)
# ── Step 4: Coordinator ────────────────────────────────────────────────
coordinator = get_coordinator_agent()
package = await coordinator.run(diagnosis, plan, verdict, critic_report)
await _record_turn(
session_id=session_id,
incident_id=incident_id,
role=AgentRole.COORDINATOR,
input_hash=_coordinator_input_hash(diagnosis, plan, verdict, critic_report),
output=package,
latency_ms=package.latency_ms,
vote=AgentVote.APPROVE if not package.requires_human_approval else AgentVote.ABSTAIN,
degraded=package.all_agents_degraded,
)
return package
# ─────────────────────────────────────────────────────────────────────────────
# Internal — DB + Redis recording
# ─────────────────────────────────────────────────────────────────────────────
async def _record_turn(
session_id: str,
incident_id: str,
role: AgentRole,
input_hash: str,
output: object,
latency_ms: int,
vote: AgentVote,
degraded: bool,
) -> None:
"""
寫入 agent_sessions DB 行 + Redis Stream XADD兩者皆 best-effort
Immutable Event Sourcing — 只 INSERT永不 UPDATE / DELETE。
"""
output_json = _serialize_output(output)
# ── DB 寫入 ──────────────────────────────────────────────────────────
try:
async with get_db_context() as db:
await db.execute(
insert(AgentSession).values(
id=str(uuid.uuid4()),
session_id=session_id,
incident_id=incident_id,
agent_role=role.value,
input_hash=input_hash,
output_json=output_json,
latency_ms=latency_ms,
vote=vote.value,
degraded=degraded,
created_at=datetime.now(UTC),
)
)
except Exception:
logger.exception(
"agent_session_db_write_failed",
session_id=session_id,
role=role.value,
)
# ── Redis Stream XADDbest-effort失敗靜默────────────────────────
try:
from src.core.redis_client import get_redis
redis = get_redis()
await redis.xadd(
STREAM_KEY,
{
"session_id": session_id,
"incident_id": incident_id,
"role": role.value,
"vote": vote.value,
"latency_ms": str(latency_ms),
"degraded": "1" if degraded else "0",
},
maxlen=STREAM_MAXLEN,
approximate=True,
)
except Exception:
# Redis 失敗不阻塞主流程(觀測性非關鍵路徑)
logger.warning(
"agent_stream_xadd_failed",
session_id=session_id,
role=role.value,
)
def _serialize_output(output: object) -> dict:
"""
將 Agent 輸出 dataclass 轉為可 JSON 序列化的 dict。
Gate 2: dataclasses.asdict() 保留 Enum 實例json.dumps 會拋 TypeError。
用 json.dumps(default=...) 先把 Enum 轉 .value 再 loads 回 dict。
"""
import json
from enum import Enum
try:
if dataclasses.is_dataclass(output) and not isinstance(output, type):
raw = dataclasses.asdict(output) # type: ignore[arg-type]
return json.loads(
json.dumps(raw, default=lambda o: o.value if isinstance(o, Enum) else str(o))
)
return {"raw": str(output)[:2000]}
except Exception:
return {"error": "serialization_failed"}
def _coordinator_input_hash(
diagnosis: DiagnosisReport,
plan: ActionPlan,
verdict: ReviewVerdict,
critic: CriticReport,
) -> str:
"""Coordinator 輸入 fingerprint4 個 Agent 輸出的組合)。"""
import hashlib
key = (
diagnosis.evidence_snapshot_id
+ (diagnosis.top_hypothesis.description if diagnosis.top_hypothesis else "")
+ (plan.top_candidate.action if plan.top_candidate else "")
+ verdict.vote.value
+ str(critic.challenge_count)
)
return hashlib.sha256(key.encode()).hexdigest()[:16]
# ─────────────────────────────────────────────────────────────────────────────
# Degraded fallback packages
# ─────────────────────────────────────────────────────────────────────────────
def _timeout_package(elapsed_ms: int) -> DecisionPackage:
"""全局超時 → 強制人工審核。"""
return DecisionPackage(
recommended_action=None,
confidence=0.0,
requires_human_approval=True,
debate_summary=f"[超時] 辯證流程超過 {GLOBAL_TIMEOUT_SEC}s強制升級人工審核",
session_status=AgentSessionStatus.TIMEOUT,
latency_ms=elapsed_ms,
blocked_reason=f"全局超時 > {GLOBAL_TIMEOUT_SEC}s",
all_agents_degraded=True,
)
def _error_package(elapsed_ms: int) -> DecisionPackage:
"""未預期錯誤 → 強制人工審核。"""
return DecisionPackage(
recommended_action=None,
confidence=0.0,
requires_human_approval=True,
debate_summary="[錯誤] 辯證流程發生未預期錯誤,強制升級人工審核",
session_status=AgentSessionStatus.FAILED,
latency_ms=elapsed_ms,
blocked_reason="Orchestrator 未預期錯誤",
all_agents_degraded=True,
)