All checks were successful
CD Pipeline / deploy (push) Successful in 1m11s
Phase 2 — Hermes L1 Observer 真實接入:
- services/event_router.py::_hermes_observe() 呼叫 hermes3:latest
@192.168.0.111:11434/api/generate,做 stack trace 翻譯
- 輸出 JSON {summary, probable_cause, actions},容錯 markdown fence
- scheduler.py run_auto_import_task / run_momo_task 兩個 outer
except 改走 event_router.dispatch(),帶完整 trace
Phase 3 — NemoTron L2 Investigator 規則式實作:
- event_router._L2_RULES: event_type → [(action, params)] 規則表
• db_connection_error → query_km + retry_task(60s backoff)
• crawler_timeout → silence_alert(30min) + retry_task(300s)
• nim_quota_exhausted → silence_alert(720min)
• embedding_failure → silence_alert(10min)
- agent_actions.retry_task 真實實作: threading.Timer + exponential
backoff (60→120→240s) + _retry_state 追蹤 + ALLOWED_RETRY_TASKS
白名單 + 非 scheduler 容器回 'deferred'
Phase 4 — L3 HITL Ops 擴充:
- agent_actions: pause_task / resume_task / force_retry_now / is_task_paused
- OPS_ACTIONS 白名單與 SAFE_ACTIONS 嚴格分離(L2 不可呼叫 L3)
- telegram_templates.ops_action_request(): 4 按鈕 inline keyboard
(暫停1h / 暫停6h / 立即重試 / 解除暫停)
- telegram_bot_service._handle_ops_callback(): 接 momo:ops:<action>:<task>
- scheduler.py run_momo_task + run_auto_import_task 開頭加
is_task_paused() 檢查(Phase 4 暫停機制生效)
安全邊界(ADR-012 §①):
- L1 Hermes 只讀 → 失敗降 L0 + 🟡 標記
- L2 NemoTron 只碰 ai_insights + 發 Telegram + SAFE_ACTIONS
- L3 OpenClaw 任意動作必經 HITL inline keyboard 批准
- 不做容器重啟按鈕(需 docker socket,風險過高)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
321 lines
13 KiB
Python
321 lines
13 KiB
Python
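"""
Agent action whitelist (ADR-012, Phase 1 skeleton).

The set of actions that the L2 NemoTron layer may call safely. Hard limits:
- may only write to ai_insights and send Telegram messages
- must not touch prod tables / containers / external systems
- every action must dual-write an audit trail

This module started as **stubs plus the complete interface** for event_router
to wire up, with the real execution logic to be filled in during Phase 3.
retry_task, query_km and silence_alert already carry real logic;
flag_for_human_review, route_to_km and mark_for_relearn are still stubs.
"""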
|
||
|
||
from __future__ import annotations
|
||
|
||
import threading
|
||
import time
|
||
from datetime import datetime, timedelta
|
||
from typing import Any
|
||
|
||
from services.logger_manager import SystemLogger
|
||
|
||
sys_log = SystemLogger("AgentAction").get_logger()
|
||
|
||
# ─── Module-level state (in-memory only; emptied on container restart) ───

# Silence table: event_key -> moment at which the silence expires.
_silence_table: dict[str, datetime] = {}

# Pause table: task_name -> moment at which the pause expires
# (Phase 4, L3 HITL ops).
_paused_tasks: dict[str, datetime] = {}

# Retry bookkeeping: task_name -> {attempts, last_ts, last_error},
# used for the exponential backoff. Guarded by _retry_lock.
_retry_state: dict[str, dict[str, Any]] = {}
_retry_lock = threading.Lock()
|
||
|
||
|
||
def _audit(action: str, params: dict, result: dict, latency_ms: float) -> int | None:
    """Record one action invocation in ai_insights (ADR-007 dual-write).

    Args:
        action: Name of the action being audited.
        params: Parameters the action was called with.
        result: Result dict of the action; its "status" key (or "unknown")
            is embedded in the stored content string.
        latency_ms: Elapsed time of the action in milliseconds.

    Returns:
        Whatever ``store_insight`` returns, or None when storing fails.
        Failures are logged and never propagated to the caller.
    """
    try:
        # Imported lazily, inside the try, so an import failure is handled
        # like any other audit failure.
        from services.openclaw_learning_service import store_insight

        summary = f"action={action} result={result.get('status', 'unknown')}"
        day = datetime.now().strftime("%Y-%m-%d")
        details = {
            "action": action,
            "params": params,
            "result": result,
            "latency_ms": latency_ms,
            "ts": datetime.now().isoformat(),
        }
        return store_insight(
            insight_type="agent_action",
            content=summary,
            period=day,
            metadata=details,
        )
    except Exception as e:
        sys_log.error(f"[AgentAction] audit 失敗 action={action}: {e}")
        return None
|
||
|
||
|
||
# Whitelist of task function names that retry_task / pause_task /
# force_retry_now accept. Names are looked up as attributes of the
# `scheduler` module, so anything not listed here can never be invoked
# through these actions.
# ("run_icaim_analysis_task" used to be listed twice; a set literal silently
# collapses duplicates, so the repeated entry was dropped.)
ALLOWED_RETRY_TASKS = {
    "run_auto_import_task",
    "run_momo_task",
    "run_edm_task",
    "run_competitor_price_feeder_task",
    "run_backup_monitor_task",
    "run_icaim_analysis_task",
    "run_festival_task",
    "run_whitepage_check",
    "run_db_backup_task",
}
|
||
|
||
|
||
def _try_load_task(task_name: str):
    """Look up ``task_name`` on the lazily imported ``scheduler`` module.

    Returns the attribute when it exists, otherwise None. Any exception
    raised while importing or looking up (expected in containers that do
    not ship the scheduler) is logged as a warning and also yields None.
    """
    try:
        from importlib import import_module

        scheduler_mod = import_module("scheduler")
        return getattr(scheduler_mod, task_name, None)
    except Exception as e:
        sys_log.warning(f"[AgentAction] import scheduler 失敗(預期於非 scheduler 容器): {e}")
        return None
|
||
|
||
|
||
def _run_task_in_thread(task_name: str, attempt: int):
    """Run one task (meant to be a thread / timer target) and audit the outcome.

    On success the retry bookkeeping for ``task_name`` is cleared; on failure
    the truncated error text is stored under "last_error". Exceptions never
    escape this function.
    """
    task_fn = _try_load_task(task_name)
    if task_fn is None:
        sys_log.error(f"[AgentAction] _run_task_in_thread: task {task_name} 不存在")
        return

    started = time.time()
    audit_params = {"task_name": task_name, "attempt": attempt}
    sys_log.info(f"[AgentAction] 🔁 重試執行 {task_name} (attempt {attempt})")
    try:
        task_fn()
        # The task went through: forget any pending backoff state.
        with _retry_lock:
            _retry_state.pop(task_name, None)
        _audit("retry_task_completed", audit_params,
               {"status": "success"}, (time.time() - started) * 1000)
        sys_log.info(f"[AgentAction] ✅ 重試成功 {task_name}")
    except Exception as e:
        err = str(e)[:300]  # keep log / audit payloads bounded
        sys_log.error(f"[AgentAction] ❌ 重試失敗 {task_name} attempt={attempt}: {err}")
        with _retry_lock:
            _retry_state.setdefault(task_name, {})["last_error"] = err
        _audit("retry_task_failed", audit_params,
               {"status": "error", "error": err}, (time.time() - started) * 1000)
|
||
|
||
|
||
# =====================================================================
|
||
# 🔁 retry_task — 安全重試(exponential backoff,threading.Timer 延遲執行)
|
||
# =====================================================================
|
||
def retry_task(task_name: str, max_attempts: int = 3, backoff_sec: int = 60) -> dict:
    """Schedule a delayed re-run of a whitelisted task with exponential backoff.

    Args:
        task_name: Name of the task function to re-run; must be listed in
            ALLOWED_RETRY_TASKS.
        max_attempts: Number of retries tracked in _retry_state before the
            request is answered with "exhausted".
        backoff_sec: Base delay in seconds; attempt k is delayed by
            ``backoff_sec * 2 ** (k - 1)`` (60s / 120s / 240s by default).

    Returns:
        A result dict whose "status" is one of:
        - "rejected":  task_name is not whitelisted;
        - "skipped":   the task is currently paused;
        - "deferred":  the task function cannot be loaded in this process;
        - "exhausted": max_attempts was already reached (the counter is
          reset, so a later call starts again at attempt 1);
        - "scheduled": a daemon ``threading.Timer`` was started.

    Every outcome is written to the audit trail through _audit.
    """
    t0 = time.time()

    def _elapsed_ms() -> float:
        return (time.time() - t0) * 1000

    if task_name not in ALLOWED_RETRY_TASKS:
        result = {"status": "rejected", "reason": f"task '{task_name}' not in whitelist"}
        _audit("retry_task", {"task_name": task_name}, result, _elapsed_ms())
        sys_log.warning(f"[AgentAction] retry_task 拒絕:{task_name} 不在白名單")
        return result

    if is_task_paused(task_name):
        result = {"status": "skipped", "reason": "task is paused"}
        _audit("retry_task", {"task_name": task_name}, result, _elapsed_ms())
        return result

    # Make sure the task can be loaded here; otherwise defer to the scheduler.
    if _try_load_task(task_name) is None:
        result = {"status": "deferred", "task_name": task_name,
                  "note": "非 scheduler 容器無法重試,將由下次排程自然執行"}
        _audit("retry_task", {"task_name": task_name}, result, _elapsed_ms())
        return result

    # Exponential backoff bookkeeping. The lock is held only while the shared
    # state is read and mutated; auditing and logging (I/O) happen after it is
    # released so other retry / worker threads are not blocked behind a
    # storage write.
    delay = 0
    with _retry_lock:
        st = _retry_state.get(task_name, {"attempts": 0})
        attempts = st.get("attempts", 0) + 1
        exhausted = attempts > max_attempts
        if exhausted:
            _retry_state.pop(task_name, None)
        else:
            delay = backoff_sec * (2 ** (attempts - 1))  # 60s / 120s / 240s ...
            st.update({"attempts": attempts, "last_ts": time.time()})
            _retry_state[task_name] = st

    if exhausted:
        result = {"status": "exhausted", "task_name": task_name, "attempts": attempts - 1}
        _audit("retry_task", {"task_name": task_name}, result, _elapsed_ms())
        sys_log.warning(f"[AgentAction] {task_name} 已達最大重試次數 {max_attempts}")
        return result

    # Daemon timer: a pending retry does not keep the process alive.
    timer = threading.Timer(delay, _run_task_in_thread, args=[task_name, attempts])
    timer.daemon = True
    timer.start()

    result = {"status": "scheduled", "task_name": task_name,
              "attempt": attempts, "delay_sec": delay, "max_attempts": max_attempts}
    _audit("retry_task", {"task_name": task_name, "delay_sec": delay, "attempt": attempts},
           result, _elapsed_ms())
    sys_log.info(f"[AgentAction] 🔁 {task_name} 已排定 {delay}s 後重試 (attempt {attempts}/{max_attempts})")
    return result
|
||
|
||
|
||
# =====================================================================
|
||
# 🔍 query_km — RAG 查詢歷史同類事件
|
||
# =====================================================================
|
||
def query_km(query: str, insight_type: str | None = None, limit: int = 5) -> dict:
    """Look up similar historical events through ``build_rag_context``.

    Args:
        query: Free-text query handed to build_rag_context.
        insight_type: Optional filter handed to build_rag_context.
        limit: Recorded in the audit entry only; it is not forwarded to
            build_rag_context.

    Returns:
        On success ``{"status": "ok", "query", "context_preview",
        "has_results"}``, where the preview is capped at 500 characters;
        on failure ``{"status": "error", "error"}``. Both are audited.
    """
    started = time.time()
    try:
        from services.openclaw_learning_service import build_rag_context

        rag_text = build_rag_context(query=query, insight_type=insight_type)
        preview = (rag_text or "")[:500]
        found = bool(rag_text and rag_text.strip())
        result = {
            "status": "ok",
            "query": query,
            "context_preview": preview,
            "has_results": found,
        }
    except Exception as e:
        result = {"status": "error", "error": str(e)[:200]}
        sys_log.error(f"[AgentAction] query_km 失敗: {e}")

    audit_params = {"query": query, "insight_type": insight_type, "limit": limit}
    _audit("query_km", audit_params, result, (time.time() - started) * 1000)
    return result
|
||
|
||
|
||
# =====================================================================
|
||
# 🔕 silence_alert — 靜音抑制(避免告警風暴)
|
||
# =====================================================================
|
||
def silence_alert(event_key: str, duration_min: int = 60) -> dict:
    """Silence ``event_key`` for ``duration_min`` minutes from now.

    The expiry is stored in the module-level silence table, which
    ``is_silenced`` consults. Suggested key format is
    "<source>:<event_type>", e.g. "Scheduler.AutoImport:db_connection_error".

    Returns:
        ``{"status": "silenced", "event_key", "until"}`` with ``until`` as an
        ISO-8601 string. The call is audited.
    """
    started = time.time()
    expires_at = datetime.now() + timedelta(minutes=duration_min)
    _silence_table[event_key] = expires_at

    result = {"status": "silenced", "event_key": event_key, "until": expires_at.isoformat()}
    audit_params = {"event_key": event_key, "duration_min": duration_min}
    _audit("silence_alert", audit_params, result, (time.time() - started) * 1000)
    sys_log.info(f"[AgentAction] silence_alert: {event_key} → 靜音至 {expires_at.strftime('%H:%M')}")
    return result
|
||
|
||
|
||
def is_silenced(event_key: str) -> bool:
    """Return True while ``event_key`` has an unexpired silence entry.

    An expired entry is removed from the silence table as a side effect.
    """
    expires_at = _silence_table.get(event_key)
    if expires_at is None:
        return False
    if datetime.now() < expires_at:
        return True
    # Silence window is over: drop the stale entry.
    _silence_table.pop(event_key, None)
    return False
|
||
|
||
|
||
# =====================================================================
|
||
# 🏷️ 三個既有 NemoTron tool 的 wrapper(供 event_router 統一調用)
|
||
# =====================================================================
|
||
def flag_for_human_review(sku: str, concern: str) -> dict:
    """Escalate to L3 HITL (currently a stub that only audits the request).

    Returns a ``{"status": "stub", ...}`` dict echoing ``sku`` and ``concern``.
    """
    started = time.time()
    # TODO Phase 3: wire up nemoton_dispatcher_service._exec_flag_for_human_review
    result = {
        "status": "stub",
        "sku": sku,
        "concern": concern,
        "note": "Phase 1 stub,Phase 3 接 NemoTron",
    }
    audit_params = {"sku": sku, "concern": concern}
    _audit("flag_for_human_review", audit_params, result, (time.time() - started) * 1000)
    return result
|
||
|
||
|
||
def route_to_km(sku: str, domain: str, summary: str) -> dict:
    """Archive to KM (stub; to be connected to NemoTron in Phase 3).

    Only ``sku`` and ``domain`` are written to the audit entry; ``summary``
    is currently unused.
    """
    started = time.time()
    result = {"status": "stub", "note": "Phase 3 接 NemoTron"}
    audit_params = {"sku": sku, "domain": domain}
    _audit("route_to_km", audit_params, result, (time.time() - started) * 1000)
    return result
|
||
|
||
|
||
def mark_for_relearn(sku: str, reason: str) -> dict:
    """Mark a SKU for relearning (stub; to be connected to NemoTron in Phase 3).

    The request is audited and a ``{"status": "stub", ...}`` dict is returned.
    """
    started = time.time()
    result = {"status": "stub", "note": "Phase 3 接 NemoTron"}
    audit_params = {"sku": sku, "reason": reason}
    _audit("mark_for_relearn", audit_params, result, (time.time() - started) * 1000)
    return result
|
||
|
||
|
||
# =====================================================================
|
||
# 🛑 L3 OPS Actions(只由 Telegram HITL Callback 觸發,不進 SAFE_ACTIONS)
|
||
# =====================================================================
|
||
def pause_task(task_name: str, duration_min: int = 60, operator: str = "unknown") -> dict:
    """Pause a whitelisted task for ``duration_min`` minutes from now.

    The pause only records an expiry in the module-level pause table; the
    code that runs the task has to call ``is_task_paused`` before executing.

    Returns:
        ``{"status": "rejected", ...}`` when ``task_name`` is not in
        ALLOWED_RETRY_TASKS, otherwise ``{"status": "paused", "task_name",
        "until", "operator"}``. Both outcomes are audited.
    """
    started = time.time()

    if task_name not in ALLOWED_RETRY_TASKS:
        rejection = {"status": "rejected", "reason": "task not in whitelist"}
        _audit("pause_task", {"task_name": task_name, "operator": operator},
               rejection, (time.time() - started) * 1000)
        return rejection

    resume_at = datetime.now() + timedelta(minutes=duration_min)
    _paused_tasks[task_name] = resume_at

    outcome = {
        "status": "paused",
        "task_name": task_name,
        "until": resume_at.isoformat(),
        "operator": operator,
    }
    audit_params = {"task_name": task_name, "duration_min": duration_min, "operator": operator}
    _audit("pause_task", audit_params, outcome, (time.time() - started) * 1000)
    sys_log.info(f"[AgentAction] ⏸️ {task_name} 已暫停至 {resume_at.strftime('%H:%M')} (by {operator})")
    return outcome
|
||
|
||
|
||
def resume_task(task_name: str, operator: str = "unknown") -> dict:
    """Lift the pause on ``task_name`` immediately.

    Returns ``{"status": "resumed" | "not_paused", "task_name", "operator"}``.
    The call is audited and the log line is emitted in both cases.
    """
    started = time.time()
    previous_expiry = _paused_tasks.pop(task_name, None)
    status = "resumed" if previous_expiry else "not_paused"

    outcome = {"status": status, "task_name": task_name, "operator": operator}
    audit_params = {"task_name": task_name, "operator": operator}
    _audit("resume_task", audit_params, outcome, (time.time() - started) * 1000)
    sys_log.info(f"[AgentAction] ▶️ {task_name} 恢復 (by {operator})")
    return outcome
|
||
|
||
|
||
def is_task_paused(task_name: str) -> bool:
    """Return True while ``task_name`` has an unexpired pause entry.

    Meant to be called before a task starts so a paused run can be skipped.
    An expired entry is removed from the pause table as a side effect.
    """
    resume_at = _paused_tasks.get(task_name)
    if resume_at is None:
        return False
    if datetime.now() < resume_at:
        return True
    # Pause window is over: drop the stale entry.
    _paused_tasks.pop(task_name, None)
    return False
|
||
|
||
|
||
def force_retry_now(task_name: str, operator: str = "unknown") -> dict:
    """HITL: run a whitelisted task right away in a daemon thread.

    No backoff delay is applied and the attempt counter is not incremented
    (the worker is started with attempt 0). This function itself does not
    check ``is_task_paused``.

    Returns:
        ``{"status": "rejected", ...}`` when ``task_name`` is not whitelisted,
        ``{"status": "deferred", ...}`` when the task function cannot be
        loaded in this process, otherwise ``{"status": "started", ...}``.
        Every outcome is audited.
    """
    started = time.time()
    audit_params = {"task_name": task_name, "operator": operator}

    if task_name not in ALLOWED_RETRY_TASKS:
        outcome = {"status": "rejected", "reason": "task not in whitelist"}
        _audit("force_retry_now", audit_params, outcome, (time.time() - started) * 1000)
        return outcome

    if _try_load_task(task_name) is None:
        outcome = {"status": "deferred", "note": "非 scheduler 容器"}
        _audit("force_retry_now", audit_params, outcome, (time.time() - started) * 1000)
        return outcome

    # Run immediately (no delay).
    worker = threading.Thread(target=_run_task_in_thread, args=(task_name, 0), daemon=True)
    worker.start()

    outcome = {"status": "started", "task_name": task_name, "operator": operator}
    _audit("force_retry_now", audit_params, outcome, (time.time() - started) * 1000)
    sys_log.info(f"[AgentAction] ⚡ {task_name} 強制立即重試 (by {operator})")
    return outcome
|
||
|
||
|
||
# L2 whitelist (NemoTron may call these on its own; read-heavy, write-light).
# Each entry is keyed by the function's own name.
SAFE_ACTIONS: dict[str, Any] = {
    fn.__name__: fn
    for fn in (
        retry_task,
        query_km,
        silence_alert,
        flag_for_human_review,
        route_to_km,
        mark_for_relearn,
    )
}
|
||
|
||
# L3 whitelist (callable only from the Telegram HITL callback; these actions
# change state). Each entry is keyed by the function's own name.
OPS_ACTIONS: dict[str, Any] = {
    fn.__name__: fn
    for fn in (
        pause_task,
        resume_task,
        force_retry_now,
    )
}
|