Files
ewoooc/services/event_router.py
OoO 1c2dc6cd61
All checks were successful
CD Pipeline / deploy (push) Successful in 1m14s
補強 AI 自動化閉環與安全降級
2026-04-29 22:56:00 +08:00

361 lines
13 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# services/event_router.py
#
# ADR-012 §③: single entry point dispatch(event) — routes events to the L0/L1/L2 tiers.
# W2-C: L2 prefers the Elephant Alpha (EA) orchestrator; when EA is unavailable it falls back to AIOrchestrator.
#
import asyncio
import json
import logging
import os
import threading
import traceback
import time
from datetime import datetime
from typing import Any, Dict, Optional
from services.ai_orchestrator import AIOrchestrator
from services.telegram_templates import send_telegram_with_result, triaged_alert
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Path of the JSONL file that collects notifications which could not be
# delivered. The MOMO_EVENT_ROUTER_QUEUE environment variable overrides the
# default, which is "<parent of this file's directory>/data/
# event_router_failed_deliveries.jsonl".
_QUEUE_PATH = os.environ.get(
    "MOMO_EVENT_ROUTER_QUEUE",
    os.path.join(
        os.path.dirname(os.path.dirname(__file__)),
        "data",
        "event_router_failed_deliveries.jsonl",
    ),
)

# Held while appending to the queue file so concurrent threads do not
# interleave their writes.
_QUEUE_LOCK = threading.Lock()
async def _handle_l1(event: Dict[str, Any], session_id: str) -> Dict[str, Any]:
    """Run the L1 analysis for ``event``.

    Creates a fresh ``AIOrchestrator`` and awaits its ``handle_l1`` method.

    Args:
        event: The event dictionary being routed.
        session_id: Session identifier passed through to the orchestrator.

    Returns:
        Whatever ``AIOrchestrator.handle_l1`` returns.
    """
    l1_runner = AIOrchestrator()
    analysis = await l1_runner.handle_l1(event, session_id)
    return analysis
async def _handle_l2(event: Dict[str, Any], session_id: str) -> Dict[str, Any]:
    """Run the L2 analysis, preferring the Elephant Alpha (EA) orchestrator.

    The EA path is attempted first. Any failure on that path (import error,
    missing API key, failed connection check, or an exception raised by the
    EA call itself) is logged as a warning and the event is handed to
    ``AIOrchestrator.handle_l2`` instead.

    NOTE(review): the original note states that the ADR-012 audit trail is
    guaranteed by EA's decision log plus ``triaged_alert``; that is not
    visible from this function and should be confirmed.

    Args:
        event: The event dictionary being routed.
        session_id: Session identifier forwarded to whichever orchestrator runs.

    Returns:
        On the EA path, a dict with ``source`` set to ``"elephant_alpha"`` plus
        the decision's priority, confidence, execution plan, required agents
        and reasoning. On the fallback path, whatever
        ``AIOrchestrator.handle_l2`` returns.
    """
    try:
        # Imported lazily so that an import failure also lands in the fallback.
        from services.elephant_service import elephant_service
        from services.elephant_alpha_orchestrator import elephant_orchestrator

        # Guard: with no API key configured, skip EA without trying to connect.
        if not elephant_service.api_key:
            raise RuntimeError("OPENROUTER_API_KEY not configured, using fallback")

        # Guard: connectivity check. The original note says the result is
        # cached (W3-A, 300s) so this does not ping every time — TODO confirm.
        if not elephant_service.check_connection():
            raise RuntimeError("EA connection unavailable, using fallback")

        ea_request = dict(
            event=event,
            tier="L2",
            session_id=session_id,
            urgency="high",
            complexity="medium",
            task_type=event.get("event_type", "general_analysis"),
        )
        decision = await elephant_orchestrator.analyze_and_coordinate(ea_request)

        # Copy the decision attributes into a plain dict, tagged with its origin.
        summary = {"source": "elephant_alpha"}
        for field in ("priority", "confidence", "execution_plan", "agents_required", "reasoning"):
            summary[field] = getattr(decision, field)
        return summary
    except Exception as e:
        logger.warning(f"[EventRouter] EA L2 failed ({e}), fallback → AIOrchestrator")
        fallback_runner = AIOrchestrator()
        return await fallback_runner.handle_l2(event, session_id)
async def _handle_l0(event: Dict[str, Any]) -> Dict[str, Any]:
    """Handle an L0 event without any AI analysis.

    Args:
        event: The event dictionary being routed.

    Returns:
        ``{"status": "ok", "echo": <event_type>}`` where ``echo`` is the
        event's ``event_type`` value, or ``None`` when the key is absent.
    """
    echoed_type = event.get("event_type")
    return dict(status="ok", echo=echoed_type)
async def _run_tier_handler(tier: str, event: Dict[str, Any], session_id: str) -> Dict[str, Any]:
    """Run the handler for ``tier`` and degrade gracefully if it fails.

    ``"L1"`` and ``"L2"`` go to their dedicated handlers; ``"L0"`` and any
    unrecognized tier go to the L0 handler. A handler exception is logged with
    its traceback and converted into a "degraded" result so the caller can
    still send a template alert.

    Args:
        tier: Tier label, normally ``"L0"``, ``"L1"`` or ``"L2"``.
        event: The event dictionary being routed.
        session_id: Session identifier forwarded to the L1/L2 handlers.

    Returns:
        The handler's result, or a dict with ``status`` set to ``"degraded"``
        describing the failure.
    """
    try:
        if tier == "L1":
            outcome = await _handle_l1(event, session_id)
        elif tier == "L2":
            outcome = await _handle_l2(event, session_id)
        else:
            # "L0" and every unknown tier share the L0 handler.
            outcome = await _handle_l0(event)
        return outcome
    except Exception as exc:
        logger.exception("[EventRouter] %s handler failed, degraded to template alert: %s", tier, exc)
        degraded = {"status": "degraded"}
        degraded["summary"] = "AI 分析暫不可用,已降級為模板告警;原始事件仍保留。"
        degraded["cause"] = f"{type(exc).__name__}: {str(exc)[:200]}"
        degraded["suggested_actions"] = [
            "先依原始事實排查",
            "若重複發生,查看 event_router_failed_deliveries.jsonl 與服務 logs",
        ]
        degraded["handler_error"] = str(exc)[:500]
        return degraded
def _event_key(event: Dict[str, Any]) -> str:
    """Build the ``"<source>:<event_type>"`` key for an event.

    Either part defaults to ``"unknown"`` when its key is missing from
    ``event``.
    """
    origin = event.get("source", "unknown")
    kind = event.get("event_type", "unknown")
    return f"{origin}:{kind}"
def _is_event_silenced(event: Dict[str, Any]) -> bool:
    """Report whether notifications for ``event`` are currently silenced.

    Asks ``services.agent_actions.is_silenced`` first about the
    ``"<source>:<event_type>"`` key and, if that is not silenced, about the
    bare event type. Any failure (including the import) is logged as a warning
    and treated as "not silenced", so a broken silence check cannot suppress
    alerts.
    """
    try:
        from services.agent_actions import is_silenced

        verdict = is_silenced(_event_key(event))
        if not verdict:
            # Only consult the event-type-wide silence when the specific key is clear.
            verdict = is_silenced(str(event.get("event_type", "")))
        return verdict
    except Exception as exc:
        logger.warning("[EventRouter] silence check failed: %s", exc)
        return False
def _queue_failed_delivery(
    event: Dict[str, Any],
    tier: str,
    message: Optional[str],
    errors: list,
    reason: str,
) -> bool:
    """Append a failed notification delivery to the local JSONL queue.

    One JSON object per line is appended to ``_QUEUE_PATH`` so the delivery
    can be replayed later. This is best-effort: any failure is logged and
    reported through the return value instead of being raised.

    Args:
        event: The original event dictionary.
        tier: Tier label the event was classified as.
        message: The rendered message that failed to send, or ``None`` when
            the failure happened before a message was built.
        errors: Error descriptions collected for the failure.
        reason: Short machine-readable reason stored with the record.

    Returns:
        ``True`` if the record was written, ``False`` otherwise.
    """
    try:
        record = {
            "ts": datetime.now().isoformat(),
            "reason": reason,
            "tier": tier,
            "event_key": _event_key(event),
            "event": event,
            "message": message,
            "errors": errors,
        }
        # Serialize before touching the file or the lock: a serialization
        # error then leaves the queue untouched and the lock is held only
        # for the append itself. default=str stringifies non-JSON values.
        line = json.dumps(record, ensure_ascii=False, default=str) + "\n"

        # A bare filename has an empty dirname, and os.makedirs("") raises;
        # in that case the file lives in the current directory and there is
        # nothing to create.
        queue_dir = os.path.dirname(_QUEUE_PATH)
        if queue_dir:
            os.makedirs(queue_dir, exist_ok=True)

        with _QUEUE_LOCK, open(_QUEUE_PATH, "a", encoding="utf-8") as fh:
            fh.write(line)
        return True
    except Exception as exc:
        logger.error("[EventRouter] failed to queue delivery fallback: %s", exc)
        return False
def _execute_safe_actions(result: Dict[str, Any], event: Dict[str, Any]) -> list[Dict[str, Any]]:
    """Execute the allow-listed actions of an AI plan that is marked auto-safe.

    A plan is only considered when ``result`` is a dict and either
    ``auto_execute`` is truthy or ``dispatch_to`` is ``"safe_action"`` /
    ``"auto_execute"``. At most the first three plan steps are looked at, and
    a step runs only if its ``action`` is a key of
    ``services.agent_actions.SAFE_ACTIONS``.

    The plan is model-generated data, so malformed values (for example a list
    where a string is expected) are rejected instead of raising; an exception
    here would abort the surrounding dispatch and the alert would not be sent.

    NOTE(review): the original docstring states that the action functions
    write their own audit records; that is not visible from this function.

    Args:
        result: The tier handler's result, expected to carry the plan under
            ``action_plan`` or ``execution_plan``.
        event: The originating event. Currently unused.

    Returns:
        One record per considered step with ``action`` and ``status``
        (``"ok"``, ``"rejected"`` or ``"error"``), a single
        ``load_safe_actions`` error record if the allow-list cannot be
        imported, or an empty list when nothing qualifies for execution.
    """
    if not isinstance(result, dict):
        return []

    # Tuple membership compares with ==, so an unhashable dispatch_to (e.g. a
    # list) is simply "not a match" instead of a TypeError from set hashing.
    auto_safe = result.get("auto_execute") or result.get("dispatch_to") in ("safe_action", "auto_execute")
    if not auto_safe:
        return []

    action_plan = result.get("action_plan") or result.get("execution_plan") or []
    if not isinstance(action_plan, list):
        return []

    try:
        from services.agent_actions import SAFE_ACTIONS
    except Exception as exc:
        return [{"action": "load_safe_actions", "status": "error", "error": str(exc)[:200]}]

    executed = []
    # Hard cap: never run more than three steps from a single plan.
    for step in action_plan[:3]:
        if not isinstance(step, dict):
            continue
        action = step.get("action")
        params = step.get("params") or {}

        try:
            allowed = action in SAFE_ACTIONS
        except TypeError:
            # Unhashable action name (list/dict): it cannot be an allow-list key.
            allowed = False
        if not allowed:
            executed.append({"action": action, "status": "rejected", "reason": "not in SAFE_ACTIONS"})
            continue

        try:
            action_result = SAFE_ACTIONS[action](**params)
            executed.append({"action": action, "status": "ok", "result": action_result})
        except Exception as exc:
            # Covers failures inside the action as well as unusable params.
            logger.exception("[EventRouter] safe action failed: %s", action)
            executed.append({"action": action, "status": "error", "error": str(exc)[:300]})
    return executed
async def dispatch(event: Dict[str, Any], admin_chat_ids: Optional[list] = None) -> Dict[str, Any]:
    """Route one event end to end (ADR-012 §③, the single entry point).

    Steps: classify the event into a tier, skip it if it is silenced, run the
    tier handler, execute any auto-safe actions, render the Telegram message
    and send it. A failed send, or any exception along the way, is written to
    the failed-delivery queue.

    NOTE(review): the original docstring says the output format is compatible
    with ``routes/bot_api_routes``; confirm against that caller.

    Args:
        event: The event dictionary to route.
        admin_chat_ids: Optional chat ids passed to the Telegram sender.

    Returns:
        A dict with ``tier``, ``sent``, ``errors``, ``latency_ms``,
        ``payload``, ``delivered``, ``silenced`` and ``queued``.
    """
    tier = _classify(event)
    session_id = f"evt:{event.get('event_type')}:{event.get('source', 'unknown')}"
    started_at = time.perf_counter()

    def _elapsed_ms() -> int:
        # Whole milliseconds since dispatch started.
        return int((time.perf_counter() - started_at) * 1000)

    try:
        if _is_event_silenced(event):
            # Silenced events count as delivered: nothing is sent or queued.
            return {
                "tier": tier,
                "sent": 0,
                "errors": [],
                "latency_ms": _elapsed_ms(),
                "payload": {"status": "silenced", "event_key": _event_key(event)},
                "delivered": True,
                "silenced": True,
                "queued": False,
            }

        result = await _run_tier_handler(tier, event, session_id)

        # Attach the action records so they show up in the rendered alert.
        executed_actions = _execute_safe_actions(result, event)
        if executed_actions:
            result["executed_actions"] = executed_actions

        message, reply_markup = _build_telegram_message(event, tier, result)
        send_result = send_telegram_with_result(message, chat_ids=admin_chat_ids, reply_markup=reply_markup)

        # Keep the rendered message for replay when Telegram rejected it.
        queued = (
            False
            if send_result["ok"]
            else _queue_failed_delivery(event, tier, message, send_result["errors"], "telegram_delivery_failed")
        )
        return {
            "tier": tier,
            "sent": send_result["sent"],
            "errors": send_result["errors"],
            "latency_ms": _elapsed_ms(),
            "payload": result,
            "delivered": send_result["ok"],
            "silenced": False,
            "queued": queued,
        }
    except Exception as e:
        logger.exception(f"[EventRouter] dispatch failed: {e}")
        # No message is available on this path, so only the raw event is queued.
        queued = _queue_failed_delivery(event, tier, None, [str(e)], "dispatch_exception")
        return {
            "tier": tier,
            "sent": 0,
            "errors": [str(e)],
            "latency_ms": _elapsed_ms(),
            "payload": None,
            "delivered": False,
            "silenced": False,
            "queued": queued,
        }
def _run_coroutine_in_thread(coro, timeout: Optional[float] = 15) -> Dict[str, Any]:
    """Run ``coro`` to completion on a separate thread and return its result.

    The coroutine is executed with ``asyncio.run`` inside a daemon thread, and
    the calling thread waits up to ``timeout`` seconds for it. On timeout the
    worker thread is not stopped; it keeps running in the background and its
    result is discarded.

    Every failure path returns a dict with the same keys that ``dispatch``
    returns (including ``silenced`` and ``queued``), so callers can read the
    result uniformly.

    Args:
        coro: The coroutine object to run.
        timeout: Seconds to wait for the worker; ``None`` waits indefinitely.
            Defaults to 15.

    Returns:
        The coroutine's return value, or a failure dict with
        ``delivered=False`` if it raised, timed out, or the worker ended
        without producing a value.
    """

    def _failure(error_text: str, latency_ms: int) -> Dict[str, Any]:
        # Same shape as a dispatch() result for a delivery that did not happen.
        return {
            "tier": "unknown",
            "sent": 0,
            "errors": [error_text],
            "latency_ms": latency_ms,
            "payload": None,
            "delivered": False,
            "silenced": False,
            "queued": False,
        }

    outcome: Dict[str, Any] = {}

    def runner() -> None:
        try:
            outcome["value"] = asyncio.run(coro)
        except Exception as e:
            outcome["value"] = _failure(str(e), 0)

    worker = threading.Thread(target=runner, daemon=True)
    worker.start()
    worker.join(timeout=timeout)

    if worker.is_alive():
        # Only reachable with a finite timeout; report the time actually waited.
        return _failure("dispatch_sync timed out", int(round(timeout * 1000)))

    if "value" not in outcome:
        # The worker ended through a non-Exception error (e.g. cancellation),
        # so nothing was stored; report that instead of raising KeyError.
        return _failure("dispatch_sync worker exited without a result", 0)
    return outcome["value"]
def dispatch_sync(event: Dict[str, Any], admin_chat_ids: Optional[list] = None) -> Dict[str, Any]:
    """Synchronous entry point to ``dispatch`` for non-async callers.

    If no event loop is running in the current thread, ``dispatch`` is run
    directly with ``asyncio.run``. If a loop is already running, the coroutine
    is handed to a helper thread instead, because ``asyncio.run`` cannot be
    called from inside a running loop.

    Args:
        event: The event dictionary to route.
        admin_chat_ids: Optional chat ids passed through to ``dispatch``.

    Returns:
        The ``dispatch`` result dict, or the helper thread's failure dict.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        loop_is_running = False
    else:
        loop_is_running = True

    pending = dispatch(event, admin_chat_ids=admin_chat_ids)
    if loop_is_running:
        return _run_coroutine_in_thread(pending)
    return asyncio.run(pending)
def notify_failure(
    task_name: str,
    error: Exception,
    *,
    source: Optional[str] = None,
    event_type: str = "scheduler_task_failure",
    priority: str = "P2",
    title: Optional[str] = None,
    trace: Optional[str] = None,
    payload: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Report a failed scheduled/background task through the event router.

    Builds an event from the task name and exception and sends it with
    ``dispatch_sync``.

    Args:
        task_name: Name of the task that failed.
        error: The exception that was raised.
        source: Event source; defaults to ``"Scheduler.<task_name>"``.
        event_type: Event type stored on the event.
        priority: ``"P1"`` and ``"P2"`` map to severity ``"alert"``; any other
            value maps to ``"warning"``.
        title: Alert title; defaults to one derived from ``task_name``.
        trace: Traceback text; when empty it is formatted from ``error``.
        payload: Extra fields merged into the event payload after
            ``task_name`` (so a ``task_name`` key here takes precedence).

    Returns:
        The result dict from ``dispatch_sync``.
    """
    if priority in {"P1", "P2"}:
        severity = "alert"
    else:
        severity = "warning"

    trace_text = trace
    if not trace_text:
        formatted = traceback.format_exception(type(error), error, error.__traceback__)
        trace_text = "".join(formatted)

    event = {
        "source": source or f"Scheduler.{task_name}",
        "event_type": event_type,
        "severity": severity,
        "title": title or f"{task_name} 任務異常",
        "status": "任務失敗",
        "impact": f"{priority} - 背景任務需要檢查",
        # Keep the summary short; the full detail is in the trace.
        "summary": str(error)[:200],
        "trace": trace_text,
        "payload": {"task_name": task_name, **(payload or {})},
    }
    return dispatch_sync(event)
def _classify(event: Dict[str, Any]) -> str:
    """Map an event to its handling tier: ``"L0"``, ``"L1"`` or ``"L2"``.

    Rules, based on the event's ``severity`` (default ``"info"``):
      * ``"warning"``: ``"L1"`` when the event carries a non-empty ``trace``,
        otherwise ``"L0"``.
      * ``"alert"``: ``"L2"`` for the listed high-impact event types,
        otherwise ``"L1"``.
      * anything else (including ``"info"`` and ``"success"``): ``"L0"``.
    """
    severity = event.get("severity", "info")
    carries_trace = bool(event.get("trace"))
    kind = event.get("event_type", "")

    if severity in ("info", "success"):
        return "L0"
    if severity == "warning":
        return "L1" if carries_trace else "L0"
    if severity != "alert":
        return "L0"

    # Alert event types that are escalated to the top tier.
    escalated_types = {
        "price_threat",
        "db_connection_error",
        "crawler_timeout",
        "nim_quota_exhausted",
        "embedding_failure",
        "scheduler_task_failure",
    }
    return "L2" if kind in escalated_types else "L1"
def _build_telegram_message(event: Dict[str, Any], tier: str, result: Optional[Dict[str, Any]]) -> tuple[str, Optional[Dict[str, Any]]]:
    """Render the Telegram message (and optional reply markup) for an event.

    L0 events get a plain template built from the event's own fields and no
    reply markup. All other tiers are rendered by ``triaged_alert`` using the
    summary, cause, suggested actions and executed actions taken from
    ``result``.

    Args:
        event: The event dictionary being reported.
        tier: Tier label the event was classified as.
        result: The tier handler's result; fields are only read when it is a dict.

    Returns:
        ``(message, None)`` for L0; otherwise whatever ``triaged_alert`` returns.
    """
    if tier == "L0":
        title = event.get("title") or event.get("event_type", "system_event")
        summary = event.get("summary") or event.get("status") or "系統事件"
        body = event.get("impact") or event.get("source") or ""
        pieces = [
            f" <b>{title}</b>\n",
            f"━━━━━━━━━━━━━━━━━━━━\n",
            f"{summary}\n",
            f"{body}\n",
        ]
        return "".join(pieces), None

    def _as_list_or_none(value):
        # Lists pass through; other truthy values become a one-item list of
        # their string form; falsy non-lists become None.
        if isinstance(value, list):
            return value
        return [str(value)] if value else None

    summary_text = ""
    cause = None
    actions = None
    executed = None
    if isinstance(result, dict):
        # First non-empty of summary / reasoning / message, else a truncated dump.
        for key in ("summary", "reasoning", "message"):
            summary_text = result.get(key)
            if summary_text:
                break
        else:
            summary_text = str(result)[:400]
        cause = result.get("cause") or result.get("root_cause")
        actions = result.get("suggested_actions") or result.get("execution_plan")
        executed = result.get("executed_actions")

    return triaged_alert(
        event,
        tier_label=tier,
        ai_summary=summary_text,
        ai_cause=cause,
        ai_actions=_as_list_or_none(actions),
        ai_executed=_as_list_or_none(executed),
    )