All checks were successful
CD Pipeline / deploy (push) Successful in 1m16s
問題盤點(2026-04-19 實地 SSH 111:11434): - 我原本設 HERMES_TIMEOUT=30 是人為限制,AI 推理不該被綁 - 111 Ollama 實況:9 個模型共享,deepseek-r1:14b 會佔 VRAM - hermes3 冷啟動 30+s(切換)/ warm 後 <1s(40x 差距) - 30s timeout → 冷啟動必中 → 誤判 AI 掛 → 人為降級 修正: - HERMES_TIMEOUT default 30 → 180(HERMES_TIMEOUT=0 代表無限制) - 新增 keep_alive=24h payload,讓 hermes3 常駐 VRAM 避免被其他客戶端(deepseek-r1 等)切換觸發冷啟動 - Memory reference_env_map.md 更新 111 實況(9 模型清單、切換陷阱、 ADR-012 呼叫設定) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
426 lines
16 KiB
Python
426 lines
16 KiB
Python
"""
|
||
EventRouter — 事件分流入口(ADR-012 Phase 1 骨幹)
|
||
|
||
所有系統事件(exception / 排程完成 / 告警 / 資訊通報)**應**統一透過
|
||
`dispatch(event)` 進入,由 EventRouter 依 severity × event_type 分流到:
|
||
L0 Direct / L1 Hermes Observer / L2 NemoTron Investigator / L3 OpenClaw Operator
|
||
|
||
設計原則(ADR-012 §⑥):無論 AI 狀況,**通知鏈絕不中斷**。
|
||
每一級失敗立即降級到下一級,最終保底 L0 直出模板。
|
||
|
||
Phase 1 實作範圍:
|
||
- 骨幹 + 分類邏輯
|
||
- L0 模板直出(已可用)
|
||
- L1 Hermes / L2 NemoTron / L3 OpenClaw 為 stub(附 TODO 標記)
|
||
- 完整 fallback 鏈(AI 掛必降級)
|
||
- 靜音檢查 + Audit Trail
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import os
|
||
import time
|
||
from datetime import datetime
|
||
from enum import Enum
|
||
from typing import Any
|
||
|
||
import requests
|
||
|
||
from services.logger_manager import SystemLogger
|
||
from services import telegram_templates as tpl
|
||
from services import agent_actions
|
||
|
||
sys_log = SystemLogger("EventRouter").get_logger()
|
||
|
||
|
||
class Tier(str, Enum):
|
||
L0_DIRECT = "L0"
|
||
L1_OBSERVER = "L1"
|
||
L2_INVESTIGATOR = "L2"
|
||
L3_OPERATOR = "L3"
|
||
|
||
|
||
class Severity(str, Enum):
|
||
INFO = "info"
|
||
SUCCESS = "success"
|
||
WARNING = "warning"
|
||
ALERT = "alert" # P0/P1
|
||
|
||
|
||
# =====================================================================
|
||
# 分類規則(ADR-012 §③)
|
||
# =====================================================================
|
||
def _classify(event: dict) -> Tier:
|
||
sev = event.get("severity", "info")
|
||
has_trace = bool(event.get("trace"))
|
||
event_type = event.get("event_type", "")
|
||
|
||
# L3 OpenClaw 由週期任務主動觸發(週報、Meta-Analysis),不走 router
|
||
# 這裡只處理 L0/L1/L2
|
||
|
||
if sev in (Severity.INFO, Severity.SUCCESS):
|
||
return Tier.L0_DIRECT
|
||
|
||
if sev == Severity.WARNING:
|
||
# 有技術 trace → L1 Hermes 翻譯
|
||
return Tier.L1_OBSERVER if has_trace else Tier.L0_DIRECT
|
||
|
||
if sev == Severity.ALERT:
|
||
# 符合 L2 白名單 event_type → NemoTron 介入
|
||
L2_EVENT_TYPES = {
|
||
"price_threat", "db_connection_error", "crawler_timeout",
|
||
"nim_quota_exhausted", "embedding_failure",
|
||
}
|
||
if event_type in L2_EVENT_TYPES:
|
||
return Tier.L2_INVESTIGATOR
|
||
return Tier.L1_OBSERVER
|
||
|
||
return Tier.L0_DIRECT
|
||
|
||
|
||
# =====================================================================
|
||
# 主入口
|
||
# =====================================================================
|
||
def dispatch(event: dict, admin_chat_ids: list[int] | None = None) -> dict:
|
||
"""
|
||
主要入口。回傳 dict: {tier, sent, insight_id, errors, latency_ms}
|
||
|
||
event 格式見 ADR-012 §③。必要欄位:source, event_type, severity, title, summary
|
||
可選欄位:trace, payload, time
|
||
|
||
admin_chat_ids 若未給,從 DB telegram_users where is_admin=true 取
|
||
"""
|
||
t0 = time.time()
|
||
event_key = f"{event.get('source', '?')}:{event.get('event_type', '?')}"
|
||
|
||
# 靜音檢查
|
||
if agent_actions.is_silenced(event_key):
|
||
sys_log.info(f"[EventRouter] 事件被靜音略過: {event_key}")
|
||
return {"tier": "silenced", "sent": 0, "event_key": event_key}
|
||
|
||
tier = _classify(event)
|
||
sys_log.info(f"[EventRouter] dispatch {event_key} → {tier.value}")
|
||
|
||
# 執行對應 Tier
|
||
try:
|
||
if tier == Tier.L0_DIRECT:
|
||
text = _render_l0(event)
|
||
elif tier == Tier.L1_OBSERVER:
|
||
text = _render_l1_with_fallback(event)
|
||
elif tier == Tier.L2_INVESTIGATOR:
|
||
text = _render_l2_with_fallback(event)
|
||
else:
|
||
text = _render_l0(event) # 未知 Tier 保底
|
||
except Exception as e:
|
||
sys_log.error(f"[EventRouter] 渲染失敗,降級 L0: {e}")
|
||
text = _render_l0(event)
|
||
|
||
# 發送 Telegram
|
||
result = _send(text, admin_chat_ids)
|
||
result["tier"] = tier.value
|
||
result["event_key"] = event_key
|
||
result["latency_ms"] = round((time.time() - t0) * 1000, 1)
|
||
|
||
# 審計(每次 dispatch 都入 KM)
|
||
_audit_dispatch(event, tier, result)
|
||
return result
|
||
|
||
|
||
# =====================================================================
|
||
# Tier 渲染器
|
||
# =====================================================================
|
||
def _base_event_for_template(event: dict) -> dict:
|
||
"""把 EventRouter 的 event 結構轉成 templates.triaged_alert 需要的格式"""
|
||
payload = event.get("payload") if isinstance(event.get("payload"), dict) else None
|
||
return {
|
||
"severity": event.get("severity", "warning"),
|
||
"title": event.get("title", "未命名事件"),
|
||
"module": event.get("source", "unknown"),
|
||
"status": event.get("status"),
|
||
"impact": event.get("impact"),
|
||
"summary": event.get("summary", ""),
|
||
"details": payload,
|
||
"trace": event.get("trace"),
|
||
}
|
||
|
||
|
||
def _render_l0(event: dict) -> str:
|
||
"""L0 直出:根據 severity 選用對應模板"""
|
||
sev = event.get("severity", "info")
|
||
title = event.get("title", "未命名事件")
|
||
module = event.get("source", "unknown")
|
||
summary = event.get("summary", "")
|
||
details = event.get("payload") if isinstance(event.get("payload"), dict) else None
|
||
|
||
if sev == Severity.SUCCESS:
|
||
return tpl.success(title=title, module=module, stats=summary)
|
||
if sev == Severity.INFO:
|
||
return tpl.info(title=title, module=module, content=summary)
|
||
if sev == Severity.WARNING:
|
||
return tpl.warning(title=title, module=module, summary=summary, details=details)
|
||
return tpl.alert(
|
||
title=title, module=module,
|
||
status=event.get("status", "未知"),
|
||
impact=event.get("impact", "未評估"),
|
||
summary=summary,
|
||
actions=event.get("suggested_actions"),
|
||
trace=event.get("trace"),
|
||
)
|
||
|
||
|
||
def _parse_hermes_json(raw: str) -> dict | None:
|
||
"""解析 Hermes 回傳的 JSON,容錯 markdown fence"""
|
||
import json as _json
|
||
raw = (raw or "").strip()
|
||
if "```" in raw:
|
||
for p in raw.split("```"):
|
||
if p.strip().startswith("{"):
|
||
raw = p.strip()
|
||
break
|
||
try:
|
||
return _json.loads(raw)
|
||
except Exception:
|
||
return None
|
||
|
||
|
||
def _render_l1_with_fallback(event: dict) -> str:
|
||
"""L1 Hermes 翻譯 → 三層式 triaged_alert;失敗降 L0 + 🟡 標記"""
|
||
try:
|
||
parsed = _hermes_observe_parsed(event)
|
||
if parsed and parsed.get("summary"):
|
||
return tpl.triaged_alert(
|
||
base_event=_base_event_for_template(event),
|
||
tier_label="L1 · Hermes",
|
||
ai_summary=parsed.get("summary", ""),
|
||
ai_cause=parsed.get("probable_cause"),
|
||
ai_actions=parsed.get("actions") or [],
|
||
)
|
||
except Exception as e:
|
||
sys_log.warning(f"[EventRouter] L1 Hermes 失敗,降 L0: {e}")
|
||
|
||
return _render_l0(event) + "\n\n🟡 <i>AI 分析暫不可用,以原始資料呈現</i>"
|
||
|
||
|
||
def _render_l2_with_fallback(event: dict) -> str:
|
||
"""L2 NemoTron 規則式 → triaged_alert + 已執行 action;失敗降 L1"""
|
||
try:
|
||
ai_result = _nemoton_investigate(event)
|
||
if ai_result:
|
||
# 同時跑 L1 Hermes 補齊摘要(已執行動作與摘要合併呈現)
|
||
parsed = None
|
||
try:
|
||
parsed = _hermes_observe_parsed(event)
|
||
except Exception:
|
||
pass
|
||
return tpl.triaged_alert(
|
||
base_event=_base_event_for_template(event),
|
||
tier_label="L2 · NemoTron",
|
||
ai_summary=(parsed or {}).get("summary") or ai_result.get("summary", ""),
|
||
ai_cause=(parsed or {}).get("probable_cause"),
|
||
ai_actions=(parsed or {}).get("actions") or [],
|
||
ai_executed=ai_result.get("actions_taken", []),
|
||
)
|
||
except Exception as e:
|
||
sys_log.warning(f"[EventRouter] L2 NemoTron 失敗,降 L1: {e}")
|
||
|
||
return _render_l1_with_fallback(event)
|
||
|
||
|
||
# =====================================================================
|
||
# L1 Hermes Observer(Phase 2 實作)
|
||
# =====================================================================
|
||
_HERMES_URL = os.getenv("HERMES_URL", "http://192.168.0.111:11434")
|
||
_HERMES_MODEL = os.getenv("HERMES_MODEL", "hermes3:latest")
|
||
# 放寬 timeout:hermes3 warm 後 <1s,冷啟動 30-60s(deepseek-r1 佔 VRAM 時會切換)
|
||
# 設 HERMES_TIMEOUT=0 表示無限制(僅受 OS TCP 層保底 ~120s)
|
||
_HERMES_TIMEOUT_RAW = int(os.getenv("HERMES_TIMEOUT", "180"))
|
||
_HERMES_TIMEOUT: float | None = None if _HERMES_TIMEOUT_RAW <= 0 else _HERMES_TIMEOUT_RAW
|
||
# keep_alive=24h 讓 hermes3 常駐記憶體,避免被其他客戶端切換掉造成冷啟動
|
||
_HERMES_KEEP_ALIVE = os.getenv("HERMES_KEEP_ALIVE", "24h")
|
||
|
||
_HERMES_OBSERVE_PROMPT = """你是一個 SRE 助手,任務是把技術錯誤翻譯成人類可理解的摘要。
|
||
|
||
請根據以下事件產出**繁體中文**分析,嚴格以下列 JSON 格式輸出(不要加 markdown 代碼塊、不要加說明):
|
||
{"summary": "一句話技術根因(中文,<60 字)", "probable_cause": "最可能的原因(中文,<40 字)", "actions": ["建議動作1", "建議動作2"]}
|
||
|
||
限制:
|
||
- summary 翻譯英文錯誤為中文,去除技術 jargon
|
||
- probable_cause 推測根因(基於 stack trace 和事件類型)
|
||
- actions 最多 3 個,具體可執行
|
||
- 若資訊不足,summary 填 "資訊不足"、actions 填 ["請檢查原始 trace"]
|
||
"""
|
||
|
||
|
||
def _hermes_observe_parsed(event: dict) -> dict | None:
|
||
"""
|
||
呼叫 Hermes(Ollama)翻譯 stack trace,回傳結構化 dict:
|
||
{summary, probable_cause, actions[]}
|
||
失敗回 None 讓上層降級到 L0 + 🟡 標記。
|
||
"""
|
||
try:
|
||
user_prompt = (
|
||
f"事件類型:{event.get('event_type', 'unknown')}\n"
|
||
f"來源模組:{event.get('source', 'unknown')}\n"
|
||
f"標題:{event.get('title', '')}\n"
|
||
f"簡述:{event.get('summary', '')}\n"
|
||
f"技術 trace:\n{(event.get('trace') or '')[-800:]}"
|
||
)
|
||
resp = requests.post(
|
||
f"{_HERMES_URL}/api/generate",
|
||
json={
|
||
"model": _HERMES_MODEL,
|
||
"system": _HERMES_OBSERVE_PROMPT,
|
||
"prompt": user_prompt,
|
||
"stream": False,
|
||
"keep_alive": _HERMES_KEEP_ALIVE, # 讓模型常駐 VRAM,避免冷切換
|
||
"options": {"temperature": 0.1, "num_predict": 300},
|
||
},
|
||
timeout=_HERMES_TIMEOUT,
|
||
)
|
||
if not resp.ok:
|
||
sys_log.warning(f"[EventRouter.L1] Hermes HTTP {resp.status_code}")
|
||
return None
|
||
|
||
raw = (resp.json().get("response") or "").strip()
|
||
parsed = _parse_hermes_json(raw)
|
||
if not parsed or not parsed.get("summary"):
|
||
return None
|
||
|
||
return {
|
||
"summary": str(parsed.get("summary", "")).strip(),
|
||
"probable_cause": str(parsed.get("probable_cause") or "").strip() or None,
|
||
"actions": [str(a).strip() for a in (parsed.get("actions") or []) if a][:5],
|
||
}
|
||
except Exception as e:
|
||
sys_log.warning(f"[EventRouter.L1] Hermes 呼叫失敗,降級:{type(e).__name__}: {str(e)[:120]}")
|
||
return None
|
||
|
||
|
||
# =====================================================================
|
||
# L2 NemoTron Investigator(Phase 3 規則式實作,不呼 NIM)
|
||
# =====================================================================
|
||
# event_type → [(action_name, params)] 規則表
|
||
_L2_RULES: dict[str, list[tuple[str, dict]]] = {
|
||
"db_connection_error": [
|
||
("query_km", {"query": "DNS resolve 失敗 momo-postgres"}),
|
||
("retry_task", {"task_name": "<auto>", "backoff_sec": 60}),
|
||
],
|
||
"crawler_timeout": [
|
||
("silence_alert", {"duration_min": 30}),
|
||
("retry_task", {"task_name": "<auto>", "backoff_sec": 300}),
|
||
],
|
||
"nim_quota_exhausted": [
|
||
("silence_alert", {"duration_min": 720}), # 12 小時,等 quota 重置
|
||
],
|
||
"embedding_failure": [
|
||
("silence_alert", {"duration_min": 10}), # 已有 retry queue 處理
|
||
],
|
||
}
|
||
|
||
|
||
def _nemoton_investigate(event: dict) -> dict | None:
|
||
"""
|
||
Phase 3 規則式 L2:根據 event_type 查 _L2_RULES,執行對應 safe actions。
|
||
Phase 5+ 可改接 NIM 讓 LLM 決定 action。
|
||
"""
|
||
event_type = event.get("event_type", "")
|
||
rules = _L2_RULES.get(event_type)
|
||
if not rules:
|
||
return None
|
||
|
||
actions_taken = []
|
||
for action_name, params in rules:
|
||
action_fn = agent_actions.SAFE_ACTIONS.get(action_name)
|
||
if not action_fn:
|
||
continue
|
||
# 動態參數:<auto> 代入事件本身的欄位
|
||
p = dict(params)
|
||
if p.get("task_name") == "<auto>":
|
||
p["task_name"] = event.get("payload", {}).get("task_name", "") or event.get("source", "").split(".")[-1]
|
||
if action_name == "silence_alert" and "event_key" not in p:
|
||
p["event_key"] = f"{event.get('source', '?')}:{event_type}"
|
||
|
||
try:
|
||
result = action_fn(**p)
|
||
status = result.get("status", "unknown")
|
||
actions_taken.append(f"{action_name} → {status}")
|
||
except Exception as e:
|
||
actions_taken.append(f"{action_name} → error: {str(e)[:80]}")
|
||
sys_log.error(f"[EventRouter.L2] action {action_name} 例外: {e}")
|
||
|
||
# 產一句 summary 說明決策邏輯
|
||
summary = f"依規則 _L2_RULES[{event_type}] 執行 {len(actions_taken)} 個安全動作"
|
||
return {"summary": summary, "actions_taken": actions_taken}
|
||
|
||
|
||
# =====================================================================
|
||
# Telegram 發送 + Audit
|
||
# =====================================================================
|
||
def _send(text: str, admin_chat_ids: list[int] | None) -> dict:
|
||
token = os.getenv("TELEGRAM_BOT_TOKEN")
|
||
if not token:
|
||
return {"sent": 0, "errors": ["TELEGRAM_BOT_TOKEN 未設定"]}
|
||
|
||
if admin_chat_ids is None:
|
||
admin_chat_ids = _load_admin_chat_ids()
|
||
if not admin_chat_ids:
|
||
return {"sent": 0, "errors": ["無管理員 chat_id"]}
|
||
|
||
url = f"https://api.telegram.org/bot{token}/sendMessage"
|
||
sent = 0
|
||
errors = []
|
||
for cid in admin_chat_ids:
|
||
try:
|
||
r = requests.post(url, json={
|
||
"chat_id": int(cid), "text": text, "parse_mode": "HTML",
|
||
}, timeout=10)
|
||
if r.ok:
|
||
sent += 1
|
||
else:
|
||
errors.append(f"chat={cid} status={r.status_code}")
|
||
except Exception as e:
|
||
errors.append(f"chat={cid} exc={e}")
|
||
return {"sent": sent, "errors": errors}
|
||
|
||
|
||
def _load_admin_chat_ids() -> list[int]:
|
||
"""從 DB 撈 is_admin=true 的 chat_id;fallback 到 .env TELEGRAM_CHAT_IDS"""
|
||
try:
|
||
from database.manager import get_session
|
||
from sqlalchemy import text as sa_text
|
||
session = get_session()
|
||
try:
|
||
rows = session.execute(sa_text(
|
||
"SELECT telegram_id FROM telegram_users WHERE is_active=true AND is_admin=true"
|
||
)).fetchall()
|
||
if rows:
|
||
return [int(r[0]) for r in rows]
|
||
finally:
|
||
session.close()
|
||
except Exception as e:
|
||
sys_log.warning(f"[EventRouter] 查 telegram_users 失敗,fallback .env: {e}")
|
||
|
||
# Fallback 到 env
|
||
import re
|
||
raw = os.getenv("TELEGRAM_CHAT_IDS", "")
|
||
return [int(x.strip()) for x in re.sub(r'[\[\]"\' ]', "", raw).split(",") if x.strip()]
|
||
|
||
|
||
def _audit_dispatch(event: dict, tier: Tier, result: dict) -> None:
|
||
"""每次 dispatch 都寫入 ai_insights 作為審計軌跡"""
|
||
try:
|
||
from services.openclaw_learning_service import store_insight
|
||
store_insight(
|
||
insight_type="agent_action",
|
||
content=f"dispatch tier={tier.value} event={event.get('event_type')}",
|
||
period=datetime.now().strftime("%Y-%m-%d"),
|
||
metadata={
|
||
"tier": tier.value,
|
||
"event": {k: event.get(k) for k in ("source", "event_type", "severity", "title")},
|
||
"result": {k: result.get(k) for k in ("sent", "latency_ms", "errors")},
|
||
"ts": datetime.now().isoformat(),
|
||
},
|
||
)
|
||
except Exception as e:
|
||
sys_log.error(f"[EventRouter] audit 失敗: {e}")
|