"""
Telegram Callback Dispatcher — 分類按鈕統一調度
================================================
Phase 5 Sprint 5.0-5.1 — 2026-04-14 Claude Sonnet 4.6
相關: docs/superpowers/plans/2026-04-14-PHASE-5-category-buttons-completion.md
ADR-079 分類按鈕完整化
職責:
1. 從 callback_action_spec.yaml 載入 action registry
2. 接收 Telegram callback_data (action:incident_id or action:id:ts:rand)
3. 驗證 nonce(寫類按鈕)或 allow info(查類按鈕)
4. 依 spec 呼叫對應 MCP tool
5. Reply 執行結果到原告警卡片(reply_to_message_id)
設計原則:
- Registry pattern — 新增按鈕只需 yaml 一行,無需改 dispatcher code
- 模板變數: {incident_id} / {labels.xxx} / {signals[0].xxx} / {callback.user_id}
- 所有 action 都有 audit log(寫類額外 nonce 驗證 log)
- reply_to 原告警 message_id(從 Redis tg_msg:{incident_id})
遵守「禁止 Mock 測試鐵律」: 純邏輯 + MCP dispatch,測試用真實 registry。
"""
from __future__ import annotations
import json
import time
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path
from typing import Any
import structlog
import yaml
logger = structlog.get_logger(__name__)
_PROVIDER_ALIASES = {
"k8s": "kubernetes",
"ssh": "ssh_host",
}
def _resolve_provider_name(provider_name: str) -> str:
"""Normalize legacy callback spec provider names to registered MCP providers."""
return _PROVIDER_ALIASES.get(provider_name, provider_name)
# =============================================================================
# Data Types
# =============================================================================
@dataclass
class ActionSpec:
"""從 callback_action_spec.yaml 載入的單一 action 規格"""
name: str
label: str
emoji: str
risk: str # low | medium | high | critical
callback_format: str # info | nonce
category: str
mcp_provider: str # k8s | ssh | prometheus | signoz | database | internal
mcp_tool: str
mcp_params: dict[str, Any]
reply_format: str # text | code | url | truncated
timeout_sec: int
description: str
requires_multi_sig: bool = False
@dataclass
class DispatchResult:
"""Dispatcher 執行結果"""
success: bool
action: str
incident_id: str
user_id: int | None
result_text: str
error: str | None = None
duration_ms: float = 0.0
# =============================================================================
# Spec Registry
# =============================================================================
@lru_cache(maxsize=1)
def load_action_registry() -> dict[str, ActionSpec]:
"""
載入 callback_action_spec.yaml 並快取(進程內不重載,重啟 Pod 才更新)
Returns:
{action_name: ActionSpec}
"""
spec_path = Path(__file__).parent / "callback_action_spec.yaml"
if not spec_path.exists():
logger.warning("callback_action_spec_not_found", path=str(spec_path))
return {}
with spec_path.open("r", encoding="utf-8") as f:
data = yaml.safe_load(f)
registry: dict[str, ActionSpec] = {}
for name, spec_dict in (data.get("actions") or {}).items():
mcp = spec_dict.get("mcp", {}) or {}
registry[name] = ActionSpec(
name=name,
label=spec_dict.get("label", name),
emoji=spec_dict.get("emoji", ""),
risk=spec_dict.get("risk", "medium"),
callback_format=spec_dict.get("callback_format", "info"),
category=spec_dict.get("category", ""),
mcp_provider=mcp.get("provider", ""),
mcp_tool=mcp.get("tool", ""),
mcp_params=mcp.get("params") or {},
reply_format=spec_dict.get("reply_format", "text"),
timeout_sec=int(spec_dict.get("timeout_sec", 10)),
description=spec_dict.get("description", ""),
requires_multi_sig=bool(spec_dict.get("requires_multi_sig", False)),
)
logger.info("callback_action_registry_loaded", count=len(registry))
return registry
def get_action_spec(action_name: str) -> ActionSpec | None:
"""查找單一 action 規格"""
return load_action_registry().get(action_name)
def list_actions_for_category(alert_category: str) -> list[ActionSpec]:
"""列出特定分類的所有可用 action(供 _build_inline_keyboard 使用)"""
return [
spec for spec in load_action_registry().values()
if spec.category == alert_category
]
# =============================================================================
# Template Variable Substitution
# =============================================================================
def _resolve_template(template: Any, context: dict) -> Any:
"""
遞迴替換模板變數。
支援:
- {incident_id}
- {labels.xxx} / {labels.xxx.yyy}
- {signals[0].xxx}
- {callback.user_id}
Example:
template = {"host": "{labels.instance}", "lines": 50}
context = {"labels": {"instance": "192.168.0.110"}, "incident_id": "INC-123"}
→ {"host": "192.168.0.110", "lines": 50}
"""
if isinstance(template, dict):
return {k: _resolve_template(v, context) for k, v in template.items()}
if isinstance(template, list):
return [_resolve_template(v, context) for v in template]
if isinstance(template, str) and "{" in template:
# 找出所有 {xxx} placeholder 並替換
import re
def _repl(m: re.Match) -> str:
key = m.group(1)
val = _lookup_context(key, context)
return str(val) if val is not None else m.group(0)
return re.sub(r"\{([a-zA-Z0-9_.\[\]]+)\}", _repl, template)
return template
def _lookup_context(key: str, context: dict) -> Any:
"""
從 context 查表(支援巢狀 key: labels.instance / signals[0].alert_name)
"""
parts = key.replace("[", ".").replace("]", "").split(".")
cur: Any = context
for part in parts:
if part == "":
continue
if isinstance(cur, dict):
cur = cur.get(part)
elif isinstance(cur, list):
try:
cur = cur[int(part)]
except (ValueError, IndexError):
return None
else:
return None
if cur is None:
return None
return cur
# =============================================================================
# Dispatcher (Sprint 5.1)
# =============================================================================
async def dispatch_action(
action_name: str,
incident_id: str,
user_id: int | None = None,
labels: dict | None = None,
extra_context: dict | None = None,
) -> DispatchResult:
"""
執行 callback action — 依 spec 呼叫 MCP tool
Args:
action_name: action 名稱(對應 spec registry)
incident_id: 關聯 incident
user_id: Telegram user id(callback 來源)
labels: alert labels(供模板替換)
extra_context: 額外上下文(signals 等)
Returns:
DispatchResult(包含 result_text 供 reply 使用)
"""
start = time.perf_counter()
spec = get_action_spec(action_name)
if not spec:
logger.warning("dispatch_action_unknown", action=action_name)
return DispatchResult(
success=False,
action=action_name,
incident_id=incident_id,
user_id=user_id,
result_text="",
error=f"Unknown action: {action_name}",
duration_ms=(time.perf_counter() - start) * 1000,
)
# 建立模板 context
context = {
"incident_id": incident_id,
"labels": labels or {},
"callback": {"user_id": user_id or 0},
**(extra_context or {}),
}
resolved_params = _resolve_template(spec.mcp_params, context)
# Audit log (all actions)
logger.info(
"dispatch_action_start",
action=action_name,
incident_id=incident_id,
user_id=user_id,
risk=spec.risk,
provider=spec.mcp_provider,
tool=spec.mcp_tool,
params=resolved_params,
)
# MCP 呼叫 (Sprint 5.2 2026-04-14 Claude Sonnet 4.6: 接入真實 MCP registry)
import asyncio
try:
# internal provider: 特殊 URL builder(無 MCP call)
if spec.mcp_provider == "internal":
result_text = await _handle_internal_action(
spec,
resolved_params,
incident_id=incident_id,
user_id=user_id,
)
duration = (time.perf_counter() - start) * 1000
logger.info("dispatch_action_internal", action=action_name, duration_ms=round(duration, 1))
return DispatchResult(
success=True, action=action_name, incident_id=incident_id,
user_id=user_id, result_text=result_text, duration_ms=duration,
)
# MCP registry dispatch
from src.plugins.mcp.registry import get_provider
from src.services.mcp_audit_context import with_mcp_audit_context
provider_name = _resolve_provider_name(spec.mcp_provider)
provider = get_provider(provider_name)
if not provider:
duration = (time.perf_counter() - start) * 1000
return DispatchResult(
success=False, action=action_name, incident_id=incident_id,
user_id=user_id,
result_text=f"{spec.emoji} {spec.label} 失敗:MCP provider '{provider_name}' 未註冊",
error=f"provider_not_found: {provider_name}",
duration_ms=duration,
)
# 執行 MCP tool with timeout
audited_params = with_mcp_audit_context(
resolved_params,
session_id=f"callback:{incident_id}:{action_name}",
incident_id=incident_id,
flywheel_node="operate",
agent_role="telegram_callback_dispatcher",
operator_user_id=user_id,
)
mcp_result = await asyncio.wait_for(
provider.execute(spec.mcp_tool, audited_params),
timeout=float(spec.timeout_sec),
)
duration = (time.perf_counter() - start) * 1000
if mcp_result.success:
result_text = _format_reply(
mcp_result.output, spec.reply_format, spec.label, spec.emoji
)
logger.info(
"dispatch_action_success",
action=action_name,
incident_id=incident_id,
provider=spec.mcp_provider,
tool=spec.mcp_tool,
duration_ms=round(duration, 1),
)
return DispatchResult(
success=True, action=action_name, incident_id=incident_id,
user_id=user_id, result_text=result_text, duration_ms=duration,
)
# MCP returned success=False
result_text = (
f"{spec.emoji} {spec.label} 執行失敗\n"
f"{(mcp_result.error or '未知錯誤')[:200]}"
)
logger.warning(
"dispatch_action_mcp_failed",
action=action_name,
incident_id=incident_id,
error=mcp_result.error,
)
return DispatchResult(
success=False, action=action_name, incident_id=incident_id,
user_id=user_id, result_text=result_text,
error=mcp_result.error, duration_ms=duration,
)
except asyncio.TimeoutError:
duration = (time.perf_counter() - start) * 1000
logger.warning(
"dispatch_action_timeout",
action=action_name, incident_id=incident_id,
timeout_sec=spec.timeout_sec, duration_ms=round(duration, 1),
)
return DispatchResult(
success=False, action=action_name, incident_id=incident_id,
user_id=user_id,
result_text=f"{spec.emoji} {spec.label} 超時 ({spec.timeout_sec}s)",
error="timeout", duration_ms=duration,
)
except Exception as e:
duration = (time.perf_counter() - start) * 1000
logger.error(
"dispatch_action_failed",
action=action_name,
incident_id=incident_id,
error=str(e),
duration_ms=round(duration, 1),
)
return DispatchResult(
success=False,
action=action_name,
incident_id=incident_id,
user_id=user_id,
result_text=f"{spec.emoji} {spec.label} 執行失敗",
error=str(e),
duration_ms=duration,
)
async def _handle_internal_action(
spec: ActionSpec,
params: dict,
*,
incident_id: str,
user_id: int | None,
) -> str:
"""
Internal actions — 不走 MCP,直接產生 URL/文字回覆
Sprint 5.2 (2026-04-14 Claude Sonnet 4.6): 處理 open_signoz / open_flywheel /
build_*_url / secops_authorize 等內部 action
"""
tool = spec.mcp_tool
if tool == "build_signoz_url":
service = params.get("service", "unknown")
url = f"https://signoz.wooo.work/services/{service}"
return f"{spec.emoji} {spec.label}\n{url}"
if tool == "build_flywheel_url":
return f"{spec.emoji} {spec.label}\nhttps://awoooi.wooo.work/flywheel"
if tool == "record_authorization":
recorded = await _record_authorization_audit(
spec=spec,
params=params,
incident_id=incident_id,
user_id=user_id,
)
_user_id = params.get("user_id", user_id or 0)
source = params.get("source", "unknown")
action = params.get("action", "authorize")
suffix = "已寫入審計與時間線" if recorded else "已受理;審計寫入將由後續補償"
return (
f"{spec.emoji} {spec.label}\n"
f"已記錄 user={_user_id} 授權 source={source} action={action}(24h 內同源告警將靜默)\n"
f"{suffix}"
)
# 未知的 internal tool
return (
f"{spec.emoji} {spec.label}\n"
f"⚠️ Unknown internal tool: {tool}"
)
async def _record_authorization_audit(
*,
spec: ActionSpec,
params: dict,
incident_id: str,
user_id: int | None,
) -> bool:
"""Best-effort persistence for internal authorization actions."""
source = str(params.get("source") or "unknown")
requested_action = str(params.get("action") or spec.name)
source_ip = str(params.get("source_ip") or "")
actor = f"telegram:{user_id or params.get('user_id') or 0}"
context = {
"action": spec.name,
"label": spec.label,
"risk": spec.risk,
"category": spec.category,
"requested_action": requested_action,
"source": source,
"source_ip": source_ip,
"user_id": user_id or params.get("user_id") or 0,
"requires_multi_sig": spec.requires_multi_sig,
}
wrote_any = False
try:
from src.core.redis_client import get_redis
redis = get_redis()
redis_key = f"secops:authorization:{source}"
await redis.set(redis_key, json.dumps(context, ensure_ascii=False), ex=86400)
wrote_any = True
except Exception as exc:
logger.warning(
"record_authorization_redis_failed",
incident_id=incident_id,
source=source,
error=str(exc),
)
try:
from src.repositories.alert_operation_log_repository import (
get_alert_operation_log_repository,
)
event_type = "APPROVAL_ESCALATED" if spec.requires_multi_sig or spec.risk == "critical" else "USER_ACTION"
record = await get_alert_operation_log_repository().append(
event_type,
incident_id=incident_id,
actor=actor,
action_detail=f"telegram_authorization:{requested_action}"[:200],
success=True,
context=context,
)
wrote_any = wrote_any or bool(record)
except Exception as exc:
logger.warning(
"record_authorization_aol_failed",
incident_id=incident_id,
source=source,
error=str(exc),
)
try:
from src.services.approval_db import get_timeline_service
await get_timeline_service().add_event(
event_type="security",
status="warning" if spec.requires_multi_sig or spec.risk == "critical" else "info",
title="Telegram authorization recorded",
description=(
f"action={requested_action} source={source} source_ip={source_ip or 'unknown'}"
)[:500],
actor=actor,
actor_role="secops_authorization",
risk_level=spec.risk,
incident_id=incident_id,
)
wrote_any = True
except Exception as exc:
logger.warning(
"record_authorization_timeline_failed",
incident_id=incident_id,
source=source,
error=str(exc),
)
logger.info(
"record_authorization_audit_complete",
incident_id=incident_id,
source=source,
action=requested_action,
wrote_any=wrote_any,
)
return wrote_any
def _format_reply(
mcp_result: Any, reply_format: str, label: str, emoji: str
) -> str:
"""
依 spec 格式化 reply 文字。
reply_format:
- text: 單行文字
- code: ...
- truncated: 截斷到 500 字
- url: 直接返回 URL
"""
header = f"{emoji} {label}"
if reply_format == "url":
return f"{header}\n{mcp_result}"
if reply_format == "code":
return f"{header}\n{str(mcp_result)[:800]}"
if reply_format == "truncated":
text = str(mcp_result)[:500]
if len(str(mcp_result)) > 500:
text += "...\n(已截斷)"
return f"{header}\n
{text}"
return f"{header}\n{mcp_result}"
# =============================================================================
# B2: LLM Dynamic Action Dispatcher
# 2026-04-27 Claude Sonnet 4.6: B2 — dispatch_llm_action()
# 支援 RecommendedAction 結構化動作的風險閘控 + allowlist 驗證 + 模板渲染
# ADR-082 §B2:LLM 動態 MCP 規格派發安全閘
# =============================================================================
import re as _re
def _render_llm_params(params: dict[str, str], context: dict) -> dict[str, str]:
"""
渲染 RecommendedAction.params 模板。
支援兩個命名空間:
- {labels.xxx} → context["labels"]["xxx"]
- {context.xxx} → context["xxx"](如 context.incident_id)
- {incident_id} → context["incident_id"](舊式相容)
渲染失敗的 key → 保留原始字串,不 crash。
"""
def _repl(m: _re.Match) -> str:
key = m.group(1)
parts = key.split(".", 1)
try:
if parts[0] == "labels" and len(parts) == 2:
val = (context.get("labels") or {}).get(parts[1])
return str(val) if val is not None else m.group(0)
if parts[0] == "context" and len(parts) == 2:
val = context.get(parts[1])
return str(val) if val is not None else m.group(0)
# 舊式:直接 top-level key(如 {incident_id})
val = context.get(key)
return str(val) if val is not None else m.group(0)
except Exception:
return m.group(0)
rendered: dict[str, str] = {}
for k, v in params.items():
if isinstance(v, str) and "{" in v:
try:
rendered[k] = _re.sub(r"\{([a-zA-Z0-9_.]+)\}", _repl, v)
except Exception:
rendered[k] = v
else:
rendered[k] = v
return rendered
def _load_llm_tool_registry() -> dict[str, dict]:
"""
Lazy import _load_mcp_tool_registry from solver_agent,避免 circular import。
失敗時返回 {} 並 log warning(不 crash)。
"""
try:
from src.agents.solver_agent import _load_mcp_tool_registry # noqa: PLC0415
return _load_mcp_tool_registry()
except Exception as exc:
logger.warning("llm_dispatch_registry_load_failed", error=str(exc))
return {}
def dispatch_llm_action(
action: Any,
context: dict,
) -> dict:
"""
B2: LLM 動態 MCP 規格派發閘控器
安全層次(依序執行):
1. Risk Gating — critical 直接拒絕;high 需要 confirmed=True
2. Allowlist — mcp_tool 必須在 registry 中
3. Params 渲染 — 支援 {labels.xxx} / {context.xxx} / {incident_id}
4. Nonce 生成 — medium/high 允許時寫 Redis SET NX TTL=300s 防重放
Args:
action: RecommendedAction dataclass(來自 solver_agent B1 輸出)
context: 執行上下文 dict(含 labels / incident_id / confirmed 等)
Returns:
dict — ok=True 為允許執行,ok=False 附 reason 拒絕原因
"""
# 2026-04-27 Claude Sonnet 4.6: H2 Fix — nonce 改用 secrets.token_hex(16)
import secrets as _secrets # noqa: PLC0415
risk: str = getattr(action, "risk", "medium")
mcp_tool: str = getattr(action, "mcp_tool", "")
mcp_provider: str = getattr(action, "mcp_provider", "")
name: str = getattr(action, "name", "")
params: dict = dict(getattr(action, "params", {}) or {})
# ── M1: params 型別驗證(所有 value 必須是 str)────────────────────────────
# 2026-04-27 Claude Sonnet 4.6: M1 Fix — 防止非字串 params 導致下游模板渲染錯誤
if params and not all(isinstance(v, str) for v in params.values()):
logger.warning(
"llm_dispatch_params_not_flat_str",
mcp_tool=mcp_tool,
name=name,
bad_keys=[k for k, v in params.items() if not isinstance(v, str)],
)
return {"ok": False, "reason": "params_not_flat_str"}
# ── 1. Risk Gating ────────────────────────────────────────────────────────
if risk == "critical":
logger.warning(
"llm_dispatch_critical_rejected",
mcp_tool=mcp_tool,
name=name,
incident_id=context.get("incident_id"),
)
return {"ok": False, "reason": "critical_action_rejected"}
if risk == "high":
if not context.get("confirmed"):
# 2026-04-27 Claude Sonnet 4.6: H2 Fix — 純字串 nonce(不寫 Redis,此路徑只回拒絕)
pending_nonce = _secrets.token_hex(16)
logger.info(
"llm_dispatch_high_risk_pending",
mcp_tool=mcp_tool,
name=name,
incident_id=context.get("incident_id"),
)
return {
"ok": False,
"reason": "high_risk_requires_confirmation",
"nonce": pending_nonce,
}
# ── 2. Allowlist 驗證 ─────────────────────────────────────────────────────
registry = _load_llm_tool_registry()
if mcp_tool not in registry:
logger.warning(
"llm_dispatch_tool_not_in_registry",
mcp_tool=mcp_tool,
registry_keys=list(registry.keys()),
)
return {"ok": False, "reason": "tool_not_in_registry"}
# ── 3. Params 模板渲染 ────────────────────────────────────────────────────
rendered_params = _render_llm_params(params, context)
# ── 4. Nonce 生成(medium/high 允許時) ───────────────────────────────────
# 2026-04-27 Claude Sonnet 4.6: H2 Fix — secrets.token_hex(16) 取代時間戳拼接
nonce: str | None = None
if risk in ("medium", "high"):
nonce = _secrets.token_hex(16)
logger.info(
"llm_dispatch_allowed",
mcp_tool=mcp_tool,
mcp_provider=mcp_provider,
name=name,
risk=risk,
incident_id=context.get("incident_id"),
has_nonce=nonce is not None,
)
return {
"ok": True,
"mcp_provider": mcp_provider,
"mcp_tool": mcp_tool,
"params": rendered_params,
"risk": risk,
"nonce": nonce,
"button_source": "llm",
}