違規 buttons 一覽: - flywheel_diag / flywheel_dashboard (META告警卡) - pause_1h / ignore (業務告警卡) - postmortem / escalation_ack / dr_manual (升級通知卡) - secops_block_ip / secops_evict (SecOps 卡,spec=nonce 但用 2-part) 所有 buttons 均無 callback handler,點擊無回應 = 鬼魂按鈕 鐵律: 寧可沒按鈕,不可有死按鈕 (feedback_no_ghost_buttons.md) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
5371 lines
214 KiB
Python
5371 lines
214 KiB
Python
"""
|
||
Telegram Gateway - OpenClaw 行動戰情室 + SignOz 整合
|
||
====================================================
|
||
Phase 5.4.3 & 5.4.4: Telegram 推送與簽核接收
|
||
統帥校正: SignOz 為唯一全能視力中心
|
||
|
||
Features:
|
||
- 推送待簽核卡片到 Telegram (含 SignOz 指標)
|
||
- 動態 SignOz Trace URL (告警前後 5 分鐘)
|
||
- 自動調優按鈕 (Shadow Mode: 僅日誌輸出)
|
||
- 接收統帥簽核回調
|
||
- SOUL.md 訊息壓縮原則 100% 遵守
|
||
|
||
SOUL.md 鐵律 (4.1 Telegram 訊息壓縮原則):
|
||
- 狀態標籤: 20 字元
|
||
- 資源名稱: 50 字元
|
||
- 根因摘要: 100 字元
|
||
- 建議行動: 50 字元
|
||
- 總長度: 800 字元 (v7.0 擴展以容納 SignOz 區塊)
|
||
|
||
修復紀錄:
|
||
- 2026-03-26 Claude Code: 修復 HTML 解析錯誤 (Can't parse entities)
|
||
"""
|
||
|
||
import asyncio
|
||
import html
|
||
import os
|
||
from dataclasses import dataclass
|
||
from datetime import UTC, datetime
|
||
|
||
import httpx
|
||
import structlog
|
||
from opentelemetry import trace
|
||
|
||
from src.core.config import settings
|
||
from src.core.redis_client import get_redis
|
||
from src.services.security_interceptor import (
|
||
NonceReplayError,
|
||
UserNotWhitelistedError,
|
||
get_security_interceptor,
|
||
)
|
||
from src.services.chat_manager import get_chat_manager
|
||
|
||
# =============================================================================
|
||
# Snooze/Silence Redis Keys (2026-03-27 P1 優化)
|
||
# =============================================================================
|
||
# Redis key prefixes and TTLs for the snooze / silence features.
SNOOZE_KEY_PREFIX = "telegram_snooze:"  # + {approval_id} -> remind later
SILENCE_KEY_PREFIX = "telegram_silence:"  # + {resource_name} -> silenced
SNOOZE_TTL_SECONDS = 30 * 60  # 30 minutes
SILENCE_TTL_SECONDS = 60 * 60  # 1 hour

# 2026-04-01 Claude Code: distributed leader election for long polling.
# Prevents several Pods from calling getUpdates at the same time and
# fighting over 409 Conflict responses.
POLLING_LEADER_KEY = "telegram:polling:leader"
POLLING_LEADER_TTL = 45  # seconds - leadership moves on 45s after a Pod dies
POLLING_LEADER_RENEW = 20  # seconds - renew every 20s
POLLING_LEADER_WATCH = 30  # seconds - non-leader Pods try to take over every 30s
|
||
|
||
# Module-level structlog logger, named after this module.
logger = structlog.get_logger(__name__)
|
||
|
||
# =============================================================================
|
||
# OTEL Tracer (Phase C P1 可觀測性)
|
||
# 2026-03-30 Claude Code: 新增 Telegram Gateway 追蹤
|
||
# =============================================================================
|
||
# Tracer obtained from the OpenTelemetry API for this module
# (instrumentation name "awoooi.telegram_gateway", version "1.0.0").
_tracer = trace.get_tracer("awoooi.telegram_gateway", "1.0.0")
|
||
|
||
|
||
# =============================================================================
|
||
# Long Polling 配置 (Phase 5 內網修復)
|
||
# =============================================================================
|
||
LONG_POLLING_TIMEOUT = 30  # getUpdates timeout, in seconds
LONG_POLLING_RETRY_DELAY = 5  # delay before retrying after an error, in seconds
|
||
|
||
|
||
# =============================================================================
|
||
# SignOz Metrics Block (v7.0)
|
||
# =============================================================================
|
||
|
||
@dataclass
class SignOzMetricsBlock:
    """SignOz metrics section embedded in a Telegram card.

    Rendered as a four-line tree: a header, then RPS, error rate and P99
    latency, each value wrapped in ``<code>`` with a trend / severity emoji.
    """

    rps: float = 0.0
    rps_trend: str = "stable"  # up, down, stable
    error_rate: float = 0.0
    p99_latency_ms: float = 0.0
    latency_trend: str = "stable"
    trace_url: str = ""

    def format(self) -> str:
        """Return the metrics section as Telegram HTML (no trailing newline)."""
        arrows = {"up": "📈", "down": "📉", "stable": "➡️"}
        rps_arrow = arrows.get(self.rps_trend, "➡️")
        latency_arrow = arrows.get(self.latency_trend, "➡️")

        # Error-rate traffic light: below 1 is green, below 5 is yellow, else red.
        if self.error_rate < 1:
            severity = "🟢"
        elif self.error_rate < 5:
            severity = "🟡"
        else:
            severity = "🔴"

        rows = [
            "📊 <b>SignOz 指標</b>",
            f"├ RPS: <code>{self.rps:.1f}</code> {rps_arrow}",
            f"├ Error: {severity} <code>{self.error_rate:.2f}%</code>",
            f"└ P99: <code>{self.p99_latency_ms:.0f}ms</code> {latency_arrow}",
        ]
        return "\n".join(rows)
|
||
|
||
|
||
# =============================================================================
|
||
# SOUL.md 訊息格式定義 (v7.0 + SignOz)
|
||
# =============================================================================
|
||
|
||
@dataclass
|
||
class TelegramMessage:
|
||
"""
|
||
Telegram 訊息結構 (SOUL.md 4.1 + v7.0 SignOz 整合)
|
||
|
||
格式:
|
||
═══════════════════════════
|
||
🚨 CRITICAL | harbor-core
|
||
═══════════════════════════
|
||
📋 INC-20260321-0001
|
||
🎯 資源: harbor-core-7d4b8c9f5
|
||
━━━━━━━━━━━━━━━━━━━
|
||
🤖 AI 仲裁判定
|
||
👥 責任: BE (後端)
|
||
📊 信心: 🟢 88%
|
||
💡 原因: JVM Heap 配置不當
|
||
━━━━━━━━━━━━━━━━━━━
|
||
📊 SignOz 指標
|
||
├ RPS: 150.2 📈
|
||
├ Error: 🟢 0.5%
|
||
└ P99: 245ms ➡️
|
||
━━━━━━━━━━━━━━━━━━━
|
||
🔧 建議: 刪除 Pod
|
||
⏱️ 停機: ~30s
|
||
🔍 SignOz Trace (±5min)
|
||
|
||
[✅ 簽核] [❌ 拒絕] [⚡ 自動調優]
|
||
"""
|
||
status_emoji: str # 🚨, ⚠️, ℹ️
|
||
risk_level: str # CRITICAL, MEDIUM, LOW
|
||
resource_name: str # Pod/Deployment 名稱 (max 50)
|
||
root_cause: str # 根因摘要 (max 100)
|
||
suggested_action: str # 建議操作 (max 50)
|
||
estimated_downtime: str # 預計停機時間
|
||
approval_id: str # 簽核單 ID
|
||
# v6.0 AI 仲裁欄位
|
||
incident_id: str = "" # 事件編號 INC-YYYYMMDD-XXXX
|
||
primary_responsibility: str = "COLLAB" # FE/BE/INFRA/DB/COLLAB
|
||
confidence: float = 0.0 # 信心度 0.0-1.0
|
||
namespace: str = "default" # K8s namespace
|
||
# v7.0 SignOz 整合
|
||
signoz_metrics: SignOzMetricsBlock | None = None
|
||
signoz_trace_url: str = "" # 動態時間參數 URL
|
||
auto_tuning_command: str = "" # kubectl 調優指令
|
||
# 2026-03-29 ogt: AI Token/Cost 追蹤
|
||
ai_tokens: int = 0 # LLM Token 使用量
|
||
ai_cost: float = 0.0 # LLM 成本 (USD)
|
||
# 2026-03-29 ogt: ADR-037 異常頻率統計
|
||
anomaly_frequency: dict | None = None # AnomalyCounter 統計
|
||
# 2026-03-29 ogt: AI Provider 來源顯示
|
||
ai_provider: str = "" # ollama/gemini/claude/expert_system/mock
|
||
# 2026-04-04 ogt: 底層模型名稱 (e.g. qwen2.5:7b-instruct, nemotron-70b)
|
||
ai_model: str = ""
|
||
# 2026-04-16 ogt + Claude Sonnet 4.6: 告警分類與修復鏈路顯示 (ADR-076)
|
||
alert_category: str = "" # host/k8s/database/service/external_site/secops 等
|
||
playbook_name: str = "" # 匹配到的 Playbook 名稱(空字串=規則匹配)
|
||
|
||
# ==========================================================================
|
||
# Phase 22: Nemotron 協作欄位 (ADR-044)
|
||
# 2026-03-31 Claude Code: OpenClaw + Nemotron 雙軌顯示
|
||
# ==========================================================================
|
||
nemotron_enabled: bool = False # 是否啟用 Nemotron 協作
|
||
nemotron_tool_model: str = "" # Tool Calling 模型 (e.g. "llama3.1:8b")
|
||
nemotron_tool_backend: str = "" # Tool Calling 後端 (e.g. "Ollama 本機" / "NVIDIA 雲端")
|
||
nemotron_tools: list[dict] | None = None # Tool Calling 結果 [{"tool": str, "args": dict, "valid": bool}]
|
||
nemotron_validation: str = "" # "✅ 驗證通過" / "❌ 驗證失敗" / "⏳ 驗證中"
|
||
nemotron_latency_ms: float = 0.0 # Nemotron 呼叫延遲 (ms)
|
||
|
||
def format(self) -> str:
|
||
"""
|
||
格式化為 SOUL.md 規範的訊息 (含 AI 仲裁 + SignOz)
|
||
|
||
Returns:
|
||
str: 格式化的 Telegram 訊息 (max 900 字元)
|
||
"""
|
||
# 責任映射
|
||
resp_map = {
|
||
"FE": "👨💻 FE (前端)",
|
||
"BE": "⚙️ BE (後端)",
|
||
"INFRA": "🏗️ INFRA (基礎設施)",
|
||
"DB": "🗄️ DB (資料庫)",
|
||
"COLLAB": "🤝 COLLAB (協同處理)",
|
||
}
|
||
resp_display = resp_map.get(self.primary_responsibility, "❓ 未知")
|
||
|
||
# 信心度顯示
|
||
confidence_pct = int(self.confidence * 100)
|
||
if confidence_pct >= 80:
|
||
conf_emoji = "🟢"
|
||
elif confidence_pct >= 70:
|
||
conf_emoji = "🟡"
|
||
else:
|
||
conf_emoji = "🔴"
|
||
|
||
# 自動生成事件編號 (2026-03-27 ogt: 修復 INC-INC- 重複前綴)
|
||
if self.incident_id:
|
||
incident_id = self.incident_id
|
||
elif self.approval_id.startswith("INC-"):
|
||
incident_id = self.approval_id
|
||
else:
|
||
incident_id = f"INC-{self.approval_id[:8].upper()}"
|
||
|
||
# SignOz URL (優先使用動態 URL) - 必須 HTML 轉義防止解析錯誤
|
||
service_name = self.resource_name.split("-")[0] if "-" in self.resource_name else self.resource_name
|
||
raw_url = self.signoz_trace_url or f"http://192.168.0.188:3301/traces?service={service_name}"
|
||
signoz_url = html.escape(raw_url, quote=True)
|
||
|
||
# SignOz 指標區塊(含 Trace 連結)
|
||
signoz_block = ""
|
||
if self.signoz_metrics:
|
||
signoz_block = (
|
||
f"━━━━━━━━━━━━━━━━━━━\n"
|
||
f"{self.signoz_metrics.format()}\n"
|
||
f"🔍 <a href='{signoz_url}'>SignOz Trace</a>\n"
|
||
)
|
||
|
||
# HTML 轉義用戶輸入內容,防止 "Can't parse entities" 錯誤
|
||
safe_resource = html.escape(self.resource_name)
|
||
safe_root_cause = html.escape(self.root_cause)
|
||
safe_action = html.escape(self.suggested_action)
|
||
safe_downtime = html.escape(self.estimated_downtime)
|
||
|
||
# 2026-03-29 ogt: AI Token/Cost 顯示
|
||
ai_cost_display = ""
|
||
if self.ai_tokens > 0 or self.ai_cost > 0:
|
||
ai_cost_display = f"💰 Tokens: {self.ai_tokens:,} / ${self.ai_cost:.4f}\n"
|
||
|
||
# 2026-03-29 ogt: ADR-037 異常頻率顯示
|
||
frequency_block = ""
|
||
if self.anomaly_frequency and self.anomaly_frequency.get("count_24h", 0) >= 1:
|
||
freq = self.anomaly_frequency
|
||
escalation_emoji = {
|
||
None: "",
|
||
"REPEAT": "⚠️",
|
||
"ESCALATE": "🔴",
|
||
"PERMANENT_FIX": "🚨",
|
||
}.get(freq.get("escalation_level"), "")
|
||
|
||
# 2026-04-07 Claude Code: Sprint 4 D1 — 處置統計行
|
||
auto_r = freq.get("auto_repair_count", 0)
|
||
human_a = freq.get("human_approved_count", 0)
|
||
manual_r = freq.get("manual_resolved_count", 0)
|
||
cold_s = freq.get("cold_start_trust_count", 0)
|
||
total_res = freq.get("total_resolution_count", 0)
|
||
|
||
# 處置分佈行 (只在有處置紀錄時顯示)
|
||
disposition_line = ""
|
||
if total_res > 0:
|
||
auto_total = auto_r + cold_s
|
||
auto_rate = int(auto_total / total_res * 100) if total_res > 0 else 0
|
||
disposition_line = (
|
||
f"├ 🤖 自動: <code>{auto_total}</code>"
|
||
f" | 👤 審核: <code>{human_a}</code>"
|
||
f" | 🔧 手動: <code>{manual_r}</code>\n"
|
||
f"├ 自動化率: <b>{auto_rate}%</b>\n"
|
||
)
|
||
|
||
frequency_block = (
|
||
f"━━━━━━━━━━━━━━━━━━━\n"
|
||
f"📊 <b>頻率統計</b> {escalation_emoji}\n"
|
||
f"├ 1h: <code>{freq.get('count_1h', 0)}</code> 次"
|
||
f" | 24h: <code>{freq.get('count_24h', 0)}</code> 次\n"
|
||
f"{disposition_line}"
|
||
f"└ 累計修復: <code>{auto_r}</code> 次\n"
|
||
)
|
||
if freq.get("escalation_level"):
|
||
frequency_block += f"🔺 升級: <b>{freq['escalation_level']}</b>\n"
|
||
|
||
# ADR-075 TYPE-3 格式 (2026-04-12 ogt)
|
||
# AI 來源標籤:confidence=0 不顯示 0%,顯示 📋 規則分析
|
||
if self.confidence > 0 and self.ai_provider:
|
||
provider_names = {
|
||
"ollama": "Ollama",
|
||
"gemini": "Gemini",
|
||
"claude": "Claude",
|
||
"nvidia": "Nemotron",
|
||
"openclaw_nemo": "Nemotron",
|
||
"openclaw_nvidia_nim": "Nemotron",
|
||
"openclaw_qwen": "Nemotron",
|
||
}
|
||
provider_display = provider_names.get(self.ai_provider.lower(), self.ai_provider.upper())
|
||
model_suffix = f" ({html.escape(self.ai_model)})" if self.ai_model else ""
|
||
ai_source = f"🤖 <b>{provider_display}{model_suffix}</b> {conf_emoji} {confidence_pct}%"
|
||
elif self.confidence > 0:
|
||
ai_source = f"🤖 <b>AI 仲裁</b> {conf_emoji} {confidence_pct}%"
|
||
else:
|
||
ai_source = "📋 規則分析"
|
||
|
||
# 風險等級中文
|
||
risk_zh = {
|
||
"CRITICAL": "嚴重",
|
||
"HIGH": "高風險",
|
||
"MEDIUM": "中風險",
|
||
"LOW": "低風險",
|
||
}.get(self.risk_level.upper(), self.risk_level)
|
||
|
||
# ADR-076 分類標籤 (2026-04-16 ogt + Claude Sonnet 4.6)
|
||
_category_map = {
|
||
"host": "🖥️ 主機", "host_resource": "🖥️ 主機",
|
||
"k8s": "☸️ K8s", "kubernetes": "☸️ K8s",
|
||
"database": "🗄️ 資料庫",
|
||
"service": "⚙️ 服務",
|
||
"external_site": "🌐 外部網站",
|
||
"secops": "🔐 安全",
|
||
"auto_repair": "🔧 自動修復",
|
||
"alertchain_health": "📡 告警鏈路",
|
||
"flywheel_health": "🔄 飛輪健康",
|
||
"docker": "🐳 Docker",
|
||
"infrastructure": "🏗️ 基礎設施",
|
||
}
|
||
category_line = ""
|
||
if self.alert_category:
|
||
cat_display = html.escape(_category_map.get(self.alert_category, self.alert_category))
|
||
category_line = f"🏷️ 分類:<b>{cat_display}</b>\n"
|
||
|
||
# Playbook 顯示
|
||
playbook_line = ""
|
||
if self.playbook_name:
|
||
playbook_line = f"📖 Playbook:<code>{html.escape(self.playbook_name)}</code>\n"
|
||
|
||
# ADR-075 TYPE-3 格式組裝
|
||
message = (
|
||
f"{self.status_emoji} ACTION REQUIRED | <b>{html.escape(risk_zh)}</b>\n"
|
||
f"──────────────────────\n"
|
||
f"📋 <code>{html.escape(incident_id)}</code>\n"
|
||
f"🎯 資源:<code>{safe_resource}</code>\n"
|
||
f"{category_line}"
|
||
f"\n"
|
||
f"🧠 <b>AI 深度診斷</b>\n"
|
||
f"├─ 分析:{safe_root_cause}\n"
|
||
f"├─ 責任:{resp_display}\n"
|
||
f"└─ {ai_source}\n"
|
||
f"\n"
|
||
f"⚡ <b>建議修復動作</b>\n"
|
||
f"{playbook_line}"
|
||
f"<code>{safe_action}</code>\n"
|
||
)
|
||
if ai_cost_display:
|
||
message += f"{ai_cost_display}"
|
||
if signoz_block:
|
||
message += f"\n{signoz_block}"
|
||
if frequency_block:
|
||
message += f"\n{frequency_block}"
|
||
message += f"\n⏱️ 停機: {safe_downtime}"
|
||
|
||
return message[:4096] # Telegram 硬限制
|
||
|
||
def format_with_nemotron(self) -> str:
|
||
"""
|
||
格式化含 Nemotron 結果的訊息 (Phase 22 ADR-044)
|
||
|
||
格式:
|
||
═══════════════════════════
|
||
🚨 CRITICAL | harbor-core
|
||
═══════════════════════════
|
||
📋 INC-20260331-0001
|
||
🎯 資源: harbor-core-7d4b8c9f5
|
||
━━━━━━━━━━━━━━━━━━━
|
||
🤖 OpenClaw 仲裁
|
||
├ 📊 信心: 🟢 85%
|
||
├ 👥 責任: BE (後端)
|
||
└ 💡 原因: JVM Heap 配置不當
|
||
━━━━━━━━━━━━━━━━━━━
|
||
🔧 Nemotron 執行方案
|
||
✅ restart_deployment: awoooi-api
|
||
✅ scale_deployment: replicas=3
|
||
└ 驗證: ✅ 驗證通過
|
||
━━━━━━━━━━━━━━━━━━━
|
||
🔧 建議: 刪除 Pod
|
||
⏱️ 停機: ~30s
|
||
|
||
Returns:
|
||
str: 格式化的 Telegram 訊息 (max 1000 字元)
|
||
"""
|
||
# 責任映射
|
||
resp_map = {
|
||
"FE": "👨💻 FE (前端)",
|
||
"BE": "⚙️ BE (後端)",
|
||
"INFRA": "🏗️ INFRA (基礎設施)",
|
||
"DB": "🗄️ DB (資料庫)",
|
||
"COLLAB": "🤝 COLLAB (協同處理)",
|
||
}
|
||
resp_display = resp_map.get(self.primary_responsibility, "❓ 未知")
|
||
|
||
# 信心度顯示
|
||
confidence_pct = int(self.confidence * 100)
|
||
if confidence_pct >= 80:
|
||
conf_emoji = "🟢"
|
||
elif confidence_pct >= 70:
|
||
conf_emoji = "🟡"
|
||
else:
|
||
conf_emoji = "🔴"
|
||
|
||
# 自動生成事件編號
|
||
if self.incident_id:
|
||
incident_id = self.incident_id
|
||
elif self.approval_id.startswith("INC-"):
|
||
incident_id = self.approval_id
|
||
else:
|
||
incident_id = f"INC-{self.approval_id[:8].upper()}"
|
||
|
||
# HTML 轉義
|
||
safe_resource = html.escape(self.resource_name)
|
||
safe_root_cause = html.escape(self.root_cause)
|
||
safe_action = html.escape(self.suggested_action)
|
||
safe_downtime = html.escape(self.estimated_downtime)
|
||
|
||
# AI Provider 顯示
|
||
# 2026-04-04 ogt: 加入 ai_model 顯示底層模型名稱
|
||
# 2026-04-12 ogt: 規則匹配不顯示 🔴 0%,改用 ✅
|
||
if self.confidence > 0 and self.ai_provider:
|
||
provider_names = {
|
||
"ollama": "Ollama",
|
||
"gemini": "Gemini",
|
||
"claude": "Claude",
|
||
"nvidia": "Nemotron",
|
||
"openclaw_nemo": "OpenClaw Nemo",
|
||
"openclaw_nvidia_nim": "OpenClaw Nemo",
|
||
"openclaw_qwen": "OpenClaw Nemo",
|
||
}
|
||
provider_display = provider_names.get(self.ai_provider.lower(), self.ai_provider.upper())
|
||
model_suffix = f" ({html.escape(self.ai_model)})" if self.ai_model else ""
|
||
conf_line = f"🤖 <b>{provider_display} 仲裁</b>{model_suffix} {conf_emoji} {confidence_pct}%"
|
||
elif self.confidence > 0:
|
||
conf_line = f"🤖 <b>OpenClaw 仲裁</b> {conf_emoji} {confidence_pct}%"
|
||
else:
|
||
conf_line = "⚙️ <b>規則匹配</b> ✅"
|
||
|
||
# Nemotron 區塊
|
||
# 2026-04-09 Claude Sonnet 4.6: 顯示 AI 鏈路 — OpenClaw 用哪個模型,Tool Calling 用哪個模型
|
||
nemotron_block = ""
|
||
if self.nemotron_enabled and self.nemotron_tools:
|
||
tools_lines = []
|
||
for t in self.nemotron_tools[:3]: # 最多顯示 3 個
|
||
valid_emoji = "✅" if t.get("valid", False) else "❌"
|
||
tool_name = html.escape(str(t.get("tool", "unknown"))[:20])
|
||
args = t.get("args", {})
|
||
if isinstance(args, dict) and args:
|
||
args_str = ", ".join(f"{k}={v}" for k, v in list(args.items())[:2])
|
||
else:
|
||
args_str = str(args)[:30]
|
||
safe_args = html.escape(args_str[:40])
|
||
tools_lines.append(f" {valid_emoji} {tool_name}: {safe_args}")
|
||
|
||
tools_str = "\n".join(tools_lines)
|
||
validation_display = html.escape(self.nemotron_validation or "⏳ 驗證中")
|
||
|
||
# Tool Calling 模型/後端標籤
|
||
if self.nemotron_tool_model and self.nemotron_tool_backend:
|
||
tool_model_label = f"<code>{html.escape(self.nemotron_tool_model)}</code> ({html.escape(self.nemotron_tool_backend)})"
|
||
elif self.nemotron_tool_model:
|
||
tool_model_label = f"<code>{html.escape(self.nemotron_tool_model)}</code>"
|
||
else:
|
||
tool_model_label = "Nemotron"
|
||
|
||
latency_line = f"└ 延遲: {self.nemotron_latency_ms:.0f}ms\n" if self.nemotron_latency_ms > 0 else ""
|
||
|
||
nemotron_block = (
|
||
f"━━━━━━━━━━━━━━━━━━━\n"
|
||
f"🔧 <b>Tool Calling</b>: {tool_model_label}\n"
|
||
f"{tools_str}\n"
|
||
f"└ 驗證: {validation_display}\n"
|
||
f"{latency_line}"
|
||
)
|
||
|
||
# 2026-04-05 Claude Code: 重設計訊息格式,提升易讀性
|
||
# 2026-04-16 ogt + Claude Sonnet 4.6: 補 alert_category + playbook_name 顯示
|
||
_category_map = {
|
||
"host": "🖥️ 主機", "host_resource": "🖥️ 主機",
|
||
"k8s": "☸️ K8s", "kubernetes": "☸️ K8s",
|
||
"database": "🗄️ 資料庫", "service": "⚙️ 服務",
|
||
"external_site": "🌐 外部網站", "secops": "🔐 安全",
|
||
"auto_repair": "🔧 自動修復", "docker": "🐳 Docker",
|
||
}
|
||
category_line = ""
|
||
if self.alert_category:
|
||
cat_display = html.escape(_category_map.get(self.alert_category, self.alert_category))
|
||
category_line = f"🏷️ {cat_display}\n"
|
||
playbook_line = ""
|
||
if self.playbook_name:
|
||
playbook_line = f"📖 <code>{html.escape(self.playbook_name)}</code>\n"
|
||
|
||
# 組裝訊息
|
||
message = (
|
||
f"{self.status_emoji} <b>{html.escape(self.risk_level)}</b> <code>{html.escape(incident_id)}</code>\n"
|
||
f"<b>{safe_resource}</b>\n"
|
||
f"{category_line}"
|
||
f"\n"
|
||
f"{conf_line}\n"
|
||
f"👥 {resp_display}\n"
|
||
f"💡 {safe_root_cause}\n"
|
||
)
|
||
if nemotron_block:
|
||
message += f"\n{nemotron_block}"
|
||
message += (
|
||
f"\n🔧 <b>建議:</b> {playbook_line}{safe_action}\n"
|
||
f"⏱️ 停機: {safe_downtime}"
|
||
)
|
||
|
||
return message[:4096] # Telegram 硬限制
|
||
|
||
|
||
# =============================================================================
|
||
# 新訊息模板 (2026-03-29 ogt: ADR-038 Telegram 訊息規範)
|
||
# =============================================================================
|
||
|
||
@dataclass
|
||
class SentryErrorMessage:
|
||
"""
|
||
Sentry 錯誤訊息 (SENTRY_ERROR)
|
||
|
||
2026-03-29 ogt: 新增,用於 Sentry 錯誤通知
|
||
按鈕: [🔍 查看詳情] [🔕 靜默 1h]
|
||
"""
|
||
error_id: str # Sentry Issue ID
|
||
error_type: str # TypeError, ValueError, etc.
|
||
error_message: str # 錯誤訊息 (max 100)
|
||
service_name: str # awoooi-api, awoooi-web, etc.
|
||
file_location: str # src/api/v1/incidents.py:123
|
||
occurrence_count: int = 1 # 發生次數
|
||
affected_users: int = 0 # 影響用戶數
|
||
first_seen: str = "" # 首次發生時間
|
||
stack_trace: list[str] | None = None # Stack trace (前 3 行)
|
||
sentry_url: str = "" # Sentry 連結
|
||
|
||
def format(self) -> str:
|
||
"""格式化為 Telegram HTML"""
|
||
safe_error = html.escape(self.error_message[:80])
|
||
safe_type = html.escape(self.error_type[:30])
|
||
safe_service = html.escape(self.service_name[:25])
|
||
safe_file = html.escape(self.file_location[:50])
|
||
|
||
# Stack trace 區塊
|
||
trace_block = ""
|
||
if self.stack_trace:
|
||
trace_lines = "\n".join(f" → {html.escape(line[:50])}" for line in self.stack_trace[:3])
|
||
trace_block = f"🔗 Stack Trace (前 3 行):\n{trace_lines}\n"
|
||
|
||
# Sentry URL
|
||
sentry_link = ""
|
||
if self.sentry_url:
|
||
safe_url = html.escape(self.sentry_url, quote=True)
|
||
sentry_link = f"\n🔍 <a href='{safe_url}'>查看 Sentry</a>"
|
||
|
||
message = (
|
||
f"═══════════════════════════\n"
|
||
f"🐛 <b>SENTRY ERROR</b> | {safe_service}\n"
|
||
f"═══════════════════════════\n"
|
||
f"📋 <code>{html.escape(self.error_id)}</code>\n"
|
||
f"🎯 錯誤: <code>{safe_type}</code>\n"
|
||
f"━━━━━━━━━━━━━━━━━━━\n"
|
||
f"💬 {safe_error}\n"
|
||
f"━━━━━━━━━━━━━━━━━━━\n"
|
||
f"📊 <b>統計</b>\n"
|
||
f"├ 發生次數: <code>{self.occurrence_count}</code>\n"
|
||
f"├ 影響用戶: <code>{self.affected_users}</code>\n"
|
||
f"└ 首次發生: {html.escape(self.first_seen) if self.first_seen else 'N/A'}\n"
|
||
f"━━━━━━━━━━━━━━━━━━━\n"
|
||
f"📍 位置: <code>{safe_file}</code>\n"
|
||
f"{trace_block}"
|
||
f"{sentry_link}"
|
||
)
|
||
|
||
return message[:900]
|
||
|
||
|
||
@dataclass
class ResourceWarnMessage:
    """Resource warning notification (RESOURCE_WARN).

    2026-03-29 ogt: added for resource-exhaustion warnings.
    Buttons: auto scale / silence 1h.
    """

    resource_id: str  # RES-YYYYMMDD-XXXX
    pod_name: str  # Pod name
    namespace: str = "default"  # K8s namespace
    cpu_percent: float = 0.0  # CPU utilisation
    cpu_limit: str = ""  # CPU limit (e.g. 500m)
    memory_percent: float = 0.0  # memory utilisation
    memory_limit: str = ""  # memory limit (e.g. 512Mi)
    disk_percent: float = 0.0  # disk utilisation
    trend_info: str = ""  # trend information
    suggestion: str = ""  # suggested operation

    @staticmethod
    def _status_emoji(percent: float) -> str:
        """Return the traffic-light emoji: red at >= 90, yellow at >= 70, else green."""
        if percent >= 90:
            return "🔴"
        if percent >= 70:
            return "🟡"
        return "🟢"

    def format(self) -> str:
        """Render the warning as Telegram HTML, sliced to 900 characters.

        Every caller-supplied string, including the CPU and memory limits, is
        HTML-escaped so a stray ``<`` or ``&`` cannot break Telegram's parser.
        """
        safe_pod = html.escape(self.pod_name[:35])
        safe_ns = html.escape(self.namespace[:20])

        cpu_emoji = self._status_emoji(self.cpu_percent)
        mem_emoji = self._status_emoji(self.memory_percent)
        disk_emoji = self._status_emoji(self.disk_percent)

        # The limit suffixes are optional and were previously inserted raw.
        cpu_limit_note = f" (limit: {html.escape(self.cpu_limit)})" if self.cpu_limit else ""
        mem_limit_note = f" (limit: {html.escape(self.memory_limit)})" if self.memory_limit else ""

        trend_block = ""
        if self.trend_info:
            trend_block = f"📈 趨勢: {html.escape(self.trend_info[:50])}\n"

        suggestion_block = ""
        if self.suggestion:
            suggestion_block = f"💡 建議: {html.escape(self.suggestion)}\n"

        message = (
            f"═══════════════════════════\n"
            f"⚠️ <b>資源告警</b> | {safe_ns}\n"
            f"═══════════════════════════\n"
            f"📋 <code>{html.escape(self.resource_id)}</code>\n"
            f"🎯 Pod: <code>{safe_pod}</code>\n"
            f"━━━━━━━━━━━━━━━━━━━\n"
            f"📊 <b>資源使用率</b>\n"
            f"├ CPU: {cpu_emoji} <code>{self.cpu_percent:.1f}%</code>{cpu_limit_note}\n"
            f"├ Memory: {mem_emoji} <code>{self.memory_percent:.1f}%</code>{mem_limit_note}\n"
            f"└ Disk: {disk_emoji} <code>{self.disk_percent:.1f}%</code>\n"
            f"━━━━━━━━━━━━━━━━━━━\n"
            f"{trend_block}"
            f"{suggestion_block}"
        )

        return message[:900]
|
||
|
||
|
||
@dataclass
|
||
class RepairReportMessage:
|
||
"""
|
||
自動修復報告訊息 (REPAIR_REPORT)
|
||
|
||
2026-03-29 ogt: 新增,用於每日自動修復彙總
|
||
按鈕: 無
|
||
"""
|
||
report_date: str # 報告日期 (YYYY-MM-DD)
|
||
total_repairs: int = 0 # 總修復次數
|
||
success_count: int = 0 # 成功次數
|
||
failure_count: int = 0 # 失敗次數
|
||
saved_minutes: int = 0 # 節省人工時間 (分鐘)
|
||
top_issues: list[tuple[str, int]] | None = None # Top 問題 [(name, count)]
|
||
ai_cost_gemini: float = 0.0 # Gemini 成本
|
||
ai_cost_nvidia: float = 0.0 # NVIDIA 成本 (免費)
|
||
ai_tokens_total: int = 0 # 總 Token 數
|
||
|
||
def format(self) -> str:
|
||
"""格式化為 Telegram HTML"""
|
||
# 成功率
|
||
success_rate = (self.success_count / self.total_repairs * 100) if self.total_repairs > 0 else 0
|
||
|
||
# Top 問題區塊
|
||
issues_block = ""
|
||
if self.top_issues:
|
||
issues_lines = "\n".join(
|
||
f" {i+1}. {html.escape(name[:30])} ({count} 次)"
|
||
for i, (name, count) in enumerate(self.top_issues[:3])
|
||
)
|
||
issues_block = f"━━━━━━━━━━━━━━━━━━━\n🔝 <b>Top 3 問題</b>:\n{issues_lines}\n"
|
||
|
||
# AI 成本
|
||
total_cost = self.ai_cost_gemini + self.ai_cost_nvidia
|
||
|
||
message = (
|
||
f"═══════════════════════════\n"
|
||
f"🔧 <b>自動修復報告</b> | 每日彙總\n"
|
||
f"═══════════════════════════\n"
|
||
f"📅 {html.escape(self.report_date)}\n"
|
||
f"━━━━━━━━━━━━━━━━━━━\n"
|
||
f"📊 <b>統計</b>\n"
|
||
f"├ 總修復次數: <code>{self.total_repairs}</code>\n"
|
||
f"├ 成功: ✅ <code>{self.success_count}</code> ({success_rate:.0f}%)\n"
|
||
f"├ 失敗: ❌ <code>{self.failure_count}</code>\n"
|
||
f"└ 節省人工: ~<code>{self.saved_minutes}</code> 分鐘\n"
|
||
f"{issues_block}"
|
||
f"━━━━━━━━━━━━━━━━━━━\n"
|
||
f"💰 <b>AI 成本</b>\n"
|
||
f"├ Gemini: ${self.ai_cost_gemini:.4f} ({self.ai_tokens_total:,} tokens)\n"
|
||
f"├ NVIDIA: ${self.ai_cost_nvidia:.4f} (免費)\n"
|
||
f"└ 總計: ${total_cost:.4f}"
|
||
)
|
||
|
||
return message[:900]
|
||
|
||
|
||
@dataclass
class DailySummaryMessage:
    """Daily summary notification (DAILY_SUMMARY).

    2026-03-29 ogt: added for the daily system status digest.
    Buttons: none.
    """

    summary_date: str  # summary date (YYYY-MM-DD)
    # Alert statistics
    alert_total: int = 0
    alert_critical: int = 0
    alert_medium: int = 0
    alert_low: int = 0
    # Handling statistics
    auto_repair_count: int = 0
    manual_approval_count: int = 0
    ignored_count: int = 0
    avg_response_minutes: float = 0.0
    # Availability
    api_availability: float = 99.9
    web_availability: float = 99.9
    worker_availability: float = 99.9
    # Cost
    ai_cost: float = 0.0
    cloud_cost: float = 0.0
    budget_remaining: float = 0.0

    def format(self) -> str:
        """Render the digest as Telegram HTML, sliced to 900 characters."""
        heavy_rule = "═══════════════════════════\n"
        light_rule = "━━━━━━━━━━━━━━━━━━━\n"

        handled = self.auto_repair_count + self.manual_approval_count + self.ignored_count

        def share(count):
            # Percentage of all handled alerts; 0 when nothing was handled.
            if handled > 0:
                return count / handled * 100
            return 0

        auto_pct = share(self.auto_repair_count)
        manual_pct = share(self.manual_approval_count)
        ignored_pct = share(self.ignored_count)

        rows = [
            heavy_rule,
            "📊 <b>每日摘要</b> | AWOOOI\n",
            heavy_rule,
            f"📅 {html.escape(self.summary_date)}\n",
            light_rule,
            "🚨 <b>告警統計</b>\n",
            f"├ 總數: <code>{self.alert_total}</code>\n",
            f"├ Critical: <code>{self.alert_critical}</code>\n",
            f"├ Medium: <code>{self.alert_medium}</code>\n",
            f"└ Low: <code>{self.alert_low}</code>\n",
            light_rule,
            "✅ <b>處理統計</b>\n",
            f"├ 自動修復: <code>{self.auto_repair_count}</code> ({auto_pct:.0f}%)\n",
            f"├ 人工簽核: <code>{self.manual_approval_count}</code> ({manual_pct:.0f}%)\n",
            f"├ 忽略/靜默: <code>{self.ignored_count}</code> ({ignored_pct:.0f}%)\n",
            f"└ 平均回應: <code>{self.avg_response_minutes:.1f}</code> 分鐘\n",
            light_rule,
            "📈 <b>可用性</b>\n",
            f"├ API: <code>{self.api_availability:.2f}%</code>\n",
            f"├ Web: <code>{self.web_availability:.2f}%</code>\n",
            f"└ Worker: <code>{self.worker_availability:.2f}%</code>\n",
            light_rule,
            "💰 <b>成本</b>\n",
            f"├ AI: ${self.ai_cost:.2f}\n",
            f"├ 雲端: ${self.cloud_cost:.2f}\n",
            f"└ 預算剩餘: ${self.budget_remaining:.2f}",
        ]
        return "".join(rows)[:900]
|
||
|
||
|
||
@dataclass
class CICDProgressMessage:
    """CI/CD progress notification (CICD_PROGRESS).

    2026-03-30 ogt: added for CI/CD pipeline progress updates.
    Traits: concise, bypasses AI arbitration, no buttons.
    """

    job_name: str  # job name (e.g. Build, Test, Deploy)
    status: str  # running, success, failed
    stage: str = ""  # CI/CD stage (e.g. build, test, deploy)
    commit_sha: str = ""  # Git commit SHA
    triggered_by: str = ""  # who triggered the run
    duration_seconds: int = 0  # run time
    message: str = ""  # extra message
    workflow_url: str = ""  # workflow link

    def format(self) -> str:
        """Render the compact progress line(s) as Telegram HTML, sliced to 900 characters."""
        icons = {
            "running": "🔄",
            "success": "✅",
            "failed": "❌",
            "pending": "⏳",
        }
        icon = icons.get(self.status.lower(), "📦")

        # Header, with the stage appended only when one is set.
        header = f"{icon} <b>[AWOOOI CI/CD]</b>"
        if self.stage:
            header += f" | {html.escape(self.stage[:20])}"

        # Job line, with the elapsed time appended only for positive durations.
        job_line = f"📦 {html.escape(self.job_name[:40])}"
        if self.duration_seconds > 0:
            minutes, seconds = divmod(self.duration_seconds, 60)
            if minutes > 0:
                job_line += f" ({minutes}m {seconds}s)"
            else:
                job_line += f" ({seconds}s)"

        pieces = [header, "\n", job_line]
        if self.commit_sha:
            pieces.append(f"\n📋 <code>{html.escape(self.commit_sha[:8])}</code>")
        if self.workflow_url:
            href = html.escape(self.workflow_url, quote=True)
            pieces.append(f"\n🔗 <a href='{href}'>Workflow</a>")

        return "".join(pieces)[:900]
|
||
|
||
|
||
@dataclass
class DeploySuccessMessage:
    """Deployment success notification (DEPLOY_SUCCESS).

    2026-03-29 ogt: added for CD deployment-success notices.
    Buttons: none.
    """

    commit_sha: str  # Git commit SHA (short)
    triggered_by: str  # who triggered the deployment
    environment: str = "Production"  # environment
    # Version information
    api_version: str = ""
    web_version: str = ""
    worker_version: str = ""
    # Deployment time
    duration_seconds: int = 0
    # Test results
    e2e_passed: int = 0
    e2e_total: int = 0
    health_check_passed: bool = True
    # Link
    workflow_url: str = ""

    def format(self) -> str:
        """Render the notification as Telegram HTML, sliced to 900 characters."""
        light_rule = "━━━━━━━━━━━━━━━━━━━\n"

        def version_or_na(value):
            # Escaped version string, or N/A when none was supplied.
            return html.escape(value) if value else "N/A"

        # Elapsed time: the minutes part is shown only when it is positive.
        minutes, seconds = divmod(self.duration_seconds, 60)
        if minutes > 0:
            elapsed = f"{minutes}m {seconds}s"
        else:
            elapsed = f"{seconds}s"

        if self.e2e_passed == self.e2e_total:
            e2e_icon = "✅"
        else:
            e2e_icon = "⚠️"

        if self.health_check_passed:
            health_text = "✅ 全部通過"
        else:
            health_text = "❌ 部分失敗"

        chunks = [
            f"✅ <b>部署成功</b> | {html.escape(self.environment[:15])}\n\n",
            f"📋 Commit: <code>{html.escape(self.commit_sha[:8])}</code>\n",
            f"👤 觸發者: @{html.escape(self.triggered_by[:20])}\n",
            light_rule,
            "📊 <b>部署詳情</b>\n",
            f"├ API: {version_or_na(self.api_version)} ✅\n",
            f"├ Web: {version_or_na(self.web_version)} ✅\n",
            f"├ Worker: {version_or_na(self.worker_version)} ✅\n",
            f"└ 耗時: {elapsed}\n",
            light_rule,
            f"🧪 E2E 測試: {e2e_icon} {self.e2e_passed}/{self.e2e_total} PASSED\n",
            f"📊 健康檢查: {health_text}",
        ]
        if self.workflow_url:
            href = html.escape(self.workflow_url, quote=True)
            chunks.append(f"\n🔗 <a href='{href}'>查看 Workflow</a>")

        return "".join(chunks)[:900]
|
||
|
||
|
||
@dataclass
|
||
class RateLimitMessage:
|
||
"""
|
||
API 限額警告訊息 (RATE_LIMIT)
|
||
|
||
2026-03-29 ogt: 新增,用於 AI API 限額警告
|
||
按鈕: 無
|
||
"""
|
||
provider: str # gemini, openai, etc.
|
||
# 用量統計
|
||
daily_usage: int = 0
|
||
daily_limit: int = 0
|
||
token_usage: int = 0
|
||
token_limit: int = 0
|
||
cost_usd: float = 0.0
|
||
# 建議
|
||
suggestions: list[str] | None = None
|
||
# 重置時間
|
||
reset_time: str = ""
|
||
|
||
def format(self) -> str:
|
||
"""格式化為 Telegram HTML"""
|
||
safe_provider = html.escape(self.provider.upper()[:15])
|
||
|
||
# 使用率百分比
|
||
usage_pct = (self.daily_usage / self.daily_limit * 100) if self.daily_limit > 0 else 0
|
||
token_pct = (self.token_usage / self.token_limit * 100) if self.token_limit > 0 else 0
|
||
|
||
# 建議區塊
|
||
suggestion_block = ""
|
||
if self.suggestions:
|
||
suggestion_lines = "\n".join(f" - {html.escape(s[:50])}" for s in self.suggestions[:3])
|
||
suggestion_block = f"━━━━━━━━━━━━━━━━━━━\n💡 <b>建議</b>:\n{suggestion_lines}\n"
|
||
|
||
# 重置時間
|
||
reset_block = ""
|
||
if self.reset_time:
|
||
reset_block = f"\n🔄 將於 {html.escape(self.reset_time)} 重置"
|
||
|
||
message = (
|
||
f"⚠️ <b>API 限額警告</b>\n\n"
|
||
f"━━━━━━━━━━━━━━━━━━━\n"
|
||
f"📊 <b>{safe_provider} API</b>\n"
|
||
f"├ 今日用量: <code>{self.daily_usage}/{self.daily_limit}</code> ({usage_pct:.0f}%)\n"
|
||
f"├ Token: <code>{self.token_usage:,}/{self.token_limit:,}</code> ({token_pct:.0f}%)\n"
|
||
f"└ 成本: ${self.cost_usd:.4f}\n"
|
||
f"{suggestion_block}"
|
||
f"{reset_block}"
|
||
)
|
||
|
||
return message[:900]
|
||
|
||
|
||
@dataclass
class K3sStatusMessage:
    """
    K3s cluster status report message (K3S_STATUS).

    2026-03-31 Claude Code: Phase 21.2 periodic report, used for the daily
    K3s health push. The rendered card carries no buttons.
    """

    report_date: str  # report date (YYYY-MM-DD HH:MM)
    # Node status
    node_total: int = 0
    node_ready: int = 0
    # Pod status
    pod_total: int = 0
    pod_running: int = 0
    pod_pending: int = 0
    pod_failed: int = 0
    # HPA status
    hpa_api_replicas: str = "2/6"
    hpa_web_replicas: str = "2/6"
    hpa_worker_replicas: str = "1/3"
    # Backup status
    etcd_backup_last: str = ""
    velero_backup_last: str = ""
    # Stability indicators
    alert_count_48h: int = 0
    pod_restart_48h: int = 0
    # Version info
    k3s_version: str = ""

    def format(self) -> str:
        """Render the report as Telegram HTML, truncated to 900 characters."""
        # Health marks: green only when the whole section is clean
        nodes_ok = self.node_ready == self.node_total
        pods_ok = self.pod_failed == 0 and self.pod_pending == 0
        stable = self.alert_count_48h == 0 and self.pod_restart_48h == 0
        node_health = "✅" if nodes_ok else "⚠️"
        pod_health = "✅" if pods_ok else "⚠️"
        stability = "✅" if stable else "⚠️"

        # Backup timestamps are cut to 20 characters; empty means unknown
        if self.etcd_backup_last:
            etcd_status = html.escape(self.etcd_backup_last[:20])
        else:
            etcd_status = "N/A"
        if self.velero_backup_last:
            velero_status = html.escape(self.velero_backup_last[:20])
        else:
            velero_status = "N/A"

        heavy = "═══════════════════════════\n"
        light = "━━━━━━━━━━━━━━━━━━━\n"
        pieces = [
            heavy,
            "🎛️ <b>K3s 叢集狀態</b> | Daily\n",
            heavy,
            f"📅 {html.escape(self.report_date)}\n",
            light,
            f"{node_health} <b>節點</b>: {self.node_ready}/{self.node_total} Ready\n",
            light,
            f"{pod_health} <b>Pod 狀態</b>\n",
            f"├ Running: <code>{self.pod_running}</code>\n",
            f"├ Pending: <code>{self.pod_pending}</code>\n",
            f"└ Failed: <code>{self.pod_failed}</code>\n",
            light,
            "📊 <b>HPA 副本數</b>\n",
            f"├ API: <code>{html.escape(self.hpa_api_replicas)}</code>\n",
            f"├ Web: <code>{html.escape(self.hpa_web_replicas)}</code>\n",
            f"└ Worker: <code>{html.escape(self.hpa_worker_replicas)}</code>\n",
            light,
            "💾 <b>備份</b>\n",
            f"├ etcd: {etcd_status}\n",
            f"└ Velero: {velero_status}\n",
            light,
            f"{stability} <b>48h 穩定度</b>\n",
            f"├ 告警: <code>{self.alert_count_48h}</code>\n",
            f"└ Pod 重啟: <code>{self.pod_restart_48h}</code>",
        ]
        return "".join(pieces)[:900]
|
||
|
||
|
||
@dataclass
class WeeklyReportMessage:
    """
    Weekly report message (WEEKLY_REPORT).

    2026-03-31 Claude Code: Phase 21.3 periodic report, sent every Friday
    18:00 Taipei time. The rendered card carries no buttons.
    """

    week_range: str  # week identifier (e.g., "2026-W14")
    report_date: str  # report date and time
    # Alert statistics
    alert_total: int = 0
    alert_critical: int = 0
    alert_resolved: int = 0
    resolved_rate: float = 0.0
    # AI performance
    ai_proposal_count: int = 0
    ai_executed_count: int = 0
    ai_success_rate: float = 0.0
    avg_response_minutes: float = 0.0
    # K3s health
    k3s_uptime_pct: float = 99.9
    pod_restart_total: int = 0
    hpa_scale_events: int = 0
    # Git activity
    commits_count: int = 0
    deploy_count: int = 0
    # Cost
    ai_cost_week: float = 0.0
    ai_tokens_week: int = 0
    # 2026-04-07 Claude Code: Sprint 4 F1 - disposition breakdown
    disposition_auto: int = 0
    disposition_human: int = 0
    disposition_manual: int = 0
    disposition_cold_start: int = 0
    disposition_total: int = 0

    def format(self) -> str:
        """Render the weekly report as Telegram HTML, truncated to 1200 characters."""
        # Health marks based on fixed thresholds (80% / 70% / 99%)
        alert_health = "✅" if self.resolved_rate >= 80 else "⚠️"
        ai_health = "✅" if self.ai_success_rate >= 70 else "⚠️"
        k3s_health = "✅" if self.k3s_uptime_pct >= 99 else "⚠️"

        heavy = "═══════════════════════════\n"
        light = "━━━━━━━━━━━━━━━━━━━\n"
        pieces = [
            heavy,
            "📊 <b>AWOOOI 週報</b>\n",
            heavy,
            f"📅 {html.escape(self.week_range)} | {html.escape(self.report_date)}\n",
            light,
            f"{alert_health} <b>告警統計</b>\n",
            f"├ 總數: <code>{self.alert_total}</code>\n",
            f"├ Critical: <code>{self.alert_critical}</code>\n",
            f"├ 已解決: <code>{self.alert_resolved}</code>\n",
            f"└ 解決率: <code>{self.resolved_rate:.1f}%</code>\n",
            light,
            f"{ai_health} <b>AI 效能</b>\n",
            f"├ 提案數: <code>{self.ai_proposal_count}</code>\n",
            f"├ 執行數: <code>{self.ai_executed_count}</code>\n",
            f"├ 成功率: <code>{self.ai_success_rate:.1f}%</code>\n",
            f"└ 平均回應: <code>{self.avg_response_minutes:.1f}</code> 分鐘\n",
            light,
            f"{k3s_health} <b>K3s 健康</b>\n",
            f"├ Uptime: <code>{self.k3s_uptime_pct:.2f}%</code>\n",
            f"├ Pod 重啟: <code>{self.pod_restart_total}</code>\n",
            f"└ HPA 擴縮: <code>{self.hpa_scale_events}</code> 次\n",
            light,
            "📦 <b>開發活動</b>\n",
            f"├ Commits: <code>{self.commits_count}</code>\n",
            f"└ 部署: <code>{self.deploy_count}</code> 次\n",
            light,
            "💰 <b>AI 成本</b>\n",
            f"├ 費用: $<code>{self.ai_cost_week:.2f}</code>\n",
            f"└ Tokens: <code>{self.ai_tokens_week:,}</code>\n",
        ]

        # Sprint 4 F1: disposition breakdown, appended only when there is data
        if self.disposition_total > 0:
            # Cold-start trust counts as automated for the automation rate
            automated = self.disposition_auto + self.disposition_cold_start
            auto_rate = int(automated / self.disposition_total * 100)
            pieces.extend(
                [
                    light,
                    "📋 <b>處置分佈</b>\n",
                    f"├ 🤖 自動修復: <code>{self.disposition_auto}</code>\n",
                    f"├ ❄️ 冷啟動信任: <code>{self.disposition_cold_start}</code>\n",
                    f"├ 👤 人工審核: <code>{self.disposition_human}</code>\n",
                    f"├ 🔧 手動處理: <code>{self.disposition_manual}</code>\n",
                    f"└ 自動化率: <b>{auto_rate}%</b>",
                ]
            )

        return "".join(pieces)[:1200]
|
||
|
||
|
||
@dataclass
class InfraAlertMessage:
    """
    Infrastructure anomaly alert message (INFRA_ALERT).

    2026-04-03 ogt: added as the standard alert format for infrastructure
    anomalies such as Nemotron/NIM.
    Purpose: non-incident notifications about system components
    (AI provider, DB, external API, ...).
    Buttons: none (informational alert).
    """

    component: str  # component name (e.g., "Nemotron NIM")
    status: str  # status description
    impact: str  # impact description
    auto_fixed: bool = False  # whether the issue was already auto-repaired
    fix_action: str = ""  # repair action performed / recommended
    note: str = ""  # extra note; when set, the repair section is not shown

    def _footer(self) -> str:
        """Pick the closing section: note > auto-fixed > needs-attention."""
        rule = "━━━━━━━━━━━━━━━━━━━\n"
        if self.note:
            # A note marks the alert as informational only
            return rule + f"💡 {html.escape(self.note)}\n"
        if self.auto_fixed:
            return rule + f"✅ <b>已自動修復</b>\n└ {html.escape(self.fix_action)}\n"
        action = self.fix_action or '請確認元件狀態'
        return rule + f"⚠️ <b>需要關注</b>\n└ {html.escape(action)}\n"

    def format(self) -> str:
        """Render the alert as Telegram HTML, truncated to 900 characters."""
        footer = self._footer()
        rendered = "".join(
            [
                "🚨 <b>基礎設施異常</b>\n",
                "━━━━━━━━━━━━━━━━━━━\n",
                f"⚙️ <b>{html.escape(self.component)}</b>: {html.escape(self.status)}\n",
                f"📛 影響: {html.escape(self.impact)}\n",
                footer,
            ]
        )
        return rendered[:900]
|
||
|
||
|
||
# =============================================================================
|
||
# Risk Level Emoji Mapping
|
||
# =============================================================================
|
||
|
||
# Lower-case risk level -> status emoji shown on Telegram cards.
RISK_EMOJI_MAP = dict(
    critical="🚨",
    high="🔴",
    medium="⚠️",
    low="ℹ️",
)
|
||
|
||
|
||
# =============================================================================
|
||
# ADR-071-B: 告警通知四類型分類器 (2026-04-11 Claude Sonnet 4.6)
|
||
# =============================================================================
|
||
|
||
from enum import Enum
|
||
|
||
class NotificationType(str, Enum):
    """Telegram notification card types (ADR-071-B); members compare equal to their string tags."""

    # Informational only, no buttons
    TYPE_1 = "TYPE-1"
    # Already auto-repaired
    TYPE_2 = "TYPE-2"
    # Needs human review (default)
    TYPE_3 = "TYPE-3"
    # AI could not reach a decision
    TYPE_4 = "TYPE-4"
    # Config Drift only
    TYPE_4_DRIFT = "TYPE-4D"
    # ADR-075: health of the flywheel / alert chain itself (Meta-System)
    TYPE_8M = "TYPE-8M"
|
||
|
||
|
||
def classify_notification(
    incident,
    confidence: float,
    auto_executed: bool,
    mcp_all_failed: bool = False,
    decision_state: str = "",
) -> NotificationType:
    """
    Alert notification classifier: decide which Telegram card type to send.

    Evaluation order (first match wins):
      1. ``incident.notification_type`` already set to TYPE-4D / TYPE-8M / TYPE-1
      2. TYPE-4D  - config-drift alert names
      3. TYPE-1   - informational alerts
      4. TYPE-2   - auto-repair executed and decision completed
      5. TYPE-4   - low confidence / all MCP providers failed / decision error
      6. TYPE-3   - default (needs human review)

    Args:
        incident: incident object; reads ``notification_type`` (optional) and,
            from the first entry of ``signals``, ``labels``, ``annotations``
            and ``alert_name``.
        confidence: AI decision confidence (0.0~1.0)
        auto_executed: whether auto-repair has already been executed
        mcp_all_failed: whether every MCP provider failed
        decision_state: DecisionState string ("COMPLETED" / "ERROR" / ...)
    """
    # ADR-073 Phase 3-1: honor a notification_type that was set earlier so
    # TYPE-1/TYPE-4D/TYPE-8M alerts skip the LLM analysis path (2026-04-12 ogt)
    preset = getattr(incident, "notification_type", None)
    for tag, kind in (
        ("TYPE-4D", NotificationType.TYPE_4_DRIFT),
        ("TYPE-8M", NotificationType.TYPE_8M),
        ("TYPE-1", NotificationType.TYPE_1),
    ):
        if preset == tag:
            return kind

    signals = incident.signals
    if signals:
        head = signals[0]
        labels = head.labels
    else:
        head = None
        labels = {}
    alertname = labels.get("alertname", "")
    label_severity = labels.get("severity", "")

    # TYPE-4D: config drift (highest priority among label-based rules)
    if alertname in ("ConfigDrift", "ConfigurationDrift", "KubeConfigDrift"):
        return NotificationType.TYPE_4_DRIFT

    # TYPE-1: informational (severity=info plus a success-like alert)
    # 2026-04-12 ogt: Incident has no title field, so the text searched is
    # alertname plus the signal's summary / description / alert_name
    if signals:
        notes = head.annotations
        summary = notes.get("summary", "") or notes.get("description", "") or head.alert_name
    else:
        summary = ""
    title_lower = (alertname + " " + summary).lower()

    is_info = label_severity == "info"
    success_like = is_info and any(
        kw in title_lower for kw in ["success", "完成", "completed"]
    )
    backup_info = alertname.startswith(("Backup.", "VeleroBackup")) and is_info
    healthy_meta = alertname in ("AlertChainHealthy", "AutoRepairHighSuccessRate")
    if success_like or backup_info or healthy_meta:
        return NotificationType.TYPE_1

    # TYPE-2: auto-repair finished
    if auto_executed and decision_state == "COMPLETED":
        return NotificationType.TYPE_2

    # TYPE-4: AI cannot decide (low confidence / MCP all failed / decision error)
    if confidence < 0.5 or mcp_all_failed or decision_state == "ERROR":
        return NotificationType.TYPE_4

    # TYPE-3: default (needs human review)
    return NotificationType.TYPE_3
|
||
|
||
|
||
# =============================================================================
|
||
# Telegram Gateway
|
||
# =============================================================================
|
||
|
||
class TelegramGatewayError(Exception):
    """Raised for any failure while talking to the Telegram Bot API."""
|
||
|
||
|
||
class TelegramGateway:
|
||
"""
|
||
Telegram Gateway - 行動戰情室 + SignOz 整合
|
||
|
||
職責:
|
||
1. 推送待簽核卡片到 Telegram (含 SignOz 指標)
|
||
2. 接收並驗證簽核/調優回調
|
||
3. Shadow Mode 調優執行 (僅日誌)
|
||
4. 遵守 SOUL.md 訊息壓縮原則
|
||
"""
|
||
|
||
TELEGRAM_API_BASE = "https://api.telegram.org"
|
||
|
||
def __init__(self):
    """Create the gateway's in-memory state; the HTTP client is built later by initialize()."""
    # Core collaborators and readiness flag
    self._security = get_security_interceptor()
    self._http_client: httpx.AsyncClient | None = None
    self._initialized = False

    # Long-polling state (Phase 5 intranet fix)
    self._polling_task: asyncio.Task | None = None
    self._polling_active = False
    self._last_update_id = 0

    # 2026-04-01 Claude Code: distributed leader election (prevents two pods
    # fighting over getUpdates with HTTP 409). Falls back to a random id when
    # POD_NAME is not set in the environment.
    self._pod_id = os.environ.get("POD_NAME", os.urandom(8).hex())
    self._leader_task: asyncio.Task | None = None

    # Phase 6.5: heartbeat monitoring (guards against silent blind spots)
    self._heartbeat_task: asyncio.Task | None = None
    self._heartbeat_active = False
    self._last_message_time: datetime | None = None
|
||
|
||
async def initialize(self) -> bool:
    """
    Initialize the gateway.

    Returns:
        bool: False (with a warning log) when the bot token or chat id is
        not configured; True once the HTTP client and the security
        interceptor are ready.
    """
    missing_reason = None
    if not settings.OPENCLAW_TG_BOT_TOKEN:
        missing_reason = "Bot token not configured"
    elif not settings.OPENCLAW_TG_CHAT_ID:
        missing_reason = "Chat ID not configured"
    if missing_reason is not None:
        logger.warning("telegram_gateway_disabled", reason=missing_reason)
        return False

    # 2026-04-03 ogt: use httpx.Timeout with separate phases.
    # connect=10s, read=50s (getUpdates long-polling timeout 40s + buffer).
    # The old timeout=30.0 made the client abort every getUpdates(timeout=40s).
    phase_timeouts = httpx.Timeout(connect=10.0, read=50.0, write=10.0, pool=10.0)
    self._http_client = httpx.AsyncClient(
        timeout=phase_timeouts,
        headers={"Content-Type": "application/json"},
    )

    await self._security.initialize()
    self._initialized = True
    logger.info("telegram_gateway_initialized")
    return True
|
||
|
||
@property
def bot_token(self) -> str:
    """Bot token, read from ``settings.OPENCLAW_TG_BOT_TOKEN`` on every access."""
    token = settings.OPENCLAW_TG_BOT_TOKEN
    return token
|
||
|
||
@property
def chat_id(self) -> str:
    """Target chat id, read from ``settings.OPENCLAW_TG_CHAT_ID`` on every access."""
    target_chat = settings.OPENCLAW_TG_CHAT_ID
    return target_chat
|
||
|
||
@property
def api_url(self) -> str:
    """Base URL for Bot API calls: ``<TELEGRAM_API_BASE>/bot<token>`` (contains the secret token)."""
    base = self.TELEGRAM_API_BASE
    token = self.bot_token
    return f"{base}/bot{token}"
|
||
|
||
async def _send_request(
    self,
    method: str,
    payload: dict,
) -> dict:
    """
    Send one Telegram Bot API request.

    Phase C P1: adds OTEL tracing
    @author Claude Code
    @date 2026-03-30 (Taipei time)

    The request URL embeds the bot token, and httpx puts that URL into the
    message of ``HTTPStatusError``. The raw exception is therefore never
    handed to ``span.record_exception``; a redacted "exception" event is
    recorded instead, and any error text that is logged or re-raised has
    the token masked.

    Args:
        method: API method (sendMessage, editMessageText, etc.)
        payload: JSON request payload

    Returns:
        dict: the decoded API response (with ``ok`` truthy)

    Raises:
        TelegramGatewayError: when the client is not initialized, the HTTP
            status is an error, Telegram answers ``ok=false``, or any other
            failure occurs while sending.
    """
    if not self._initialized:
        await self.initialize()

    if not self._http_client:
        raise TelegramGatewayError("HTTP client not initialized")

    url = f"{self.api_url}/{method}"

    def _redact(text: str) -> str:
        """Mask the bot token if it appears in *text*."""
        token = self.bot_token
        return text.replace(token, "***") if token else text

    def _record_redacted(span, exc: BaseException) -> None:
        """Attach an exception event to the span without leaking the token."""
        span.add_event(
            "exception",
            {
                "exception.type": type(exc).__name__,
                "exception.message": _redact(str(exc)),
            },
        )

    # OTEL Span: telegram.api.{method}
    with _tracer.start_as_current_span(
        f"telegram.api.{method}",
        attributes={
            "telegram.method": method,
            "telegram.chat_id": str(payload.get("chat_id", "")),
            "telegram.has_reply_markup": "reply_markup" in payload,
        },
    ) as span:
        try:
            response = await self._http_client.post(url, json=payload)
            response.raise_for_status()
            result = response.json()

            if not result.get("ok"):
                span.set_attribute("telegram.error", result.get("description", "Unknown"))
                span.set_status(trace.Status(trace.StatusCode.ERROR))
                raise TelegramGatewayError(
                    f"Telegram API error: {result.get('description', 'Unknown error')}"
                )

            # Success: record message_id (result may be a dict or a bool)
            result_val = result.get("result")
            if isinstance(result_val, dict) and "message_id" in result_val:
                span.set_attribute("telegram.message_id", result_val["message_id"])

            span.set_status(trace.Status(trace.StatusCode.OK))
            return result

        except httpx.HTTPStatusError as e:
            status_code = e.response.status_code
            # Telegram explains 4xx/5xx failures in a JSON "description"
            # field (e.g. "can't parse entities"); surface it for debugging.
            description = ""
            try:
                error_body = e.response.json()
                if isinstance(error_body, dict):
                    description = str(error_body.get("description", ""))
            except ValueError:
                # Body was not JSON; keep the status code only
                description = ""

            span.set_attribute("telegram.http_status", status_code)
            if description:
                span.set_attribute("telegram.error", description)
            span.set_status(trace.Status(trace.StatusCode.ERROR))
            _record_redacted(span, e)
            logger.error(
                "telegram_api_error",
                method=method,
                status=status_code,
                description=description,
            )
            raise TelegramGatewayError(f"HTTP error: {status_code}") from e

        except TelegramGatewayError:
            # Already handled above; propagate unchanged
            raise

        except Exception as e:
            safe_error = _redact(str(e))
            span.set_status(trace.Status(trace.StatusCode.ERROR))
            _record_redacted(span, e)
            logger.error("telegram_request_failed", method=method, error=safe_error)
            raise TelegramGatewayError(safe_error) from e
|
||
|
||
def _build_inline_keyboard(
    self,
    approval_id: str,
    include_auto_tuning: bool = True,
    auto_tuning_command: str = "",
    incident_id: str = "",
    # ADR-071-E: TYPE-3 dynamic buttons (2026-04-11 Claude Sonnet 4.6)
    alert_category: str = "",
    notification_type: str = "",
) -> dict:
    """
    Build the inline keyboard for an approval card.

    ADR-050 v2.0 (2026-04-01): six-key layout
    ADR-071-E (2026-04-11): TYPE-3 composes action buttons from alert_category

    For TYPE-3 (or an empty notification_type) with a category that yields
    at least one button, the keyboard is: category actions (3 per row),
    then a common row [detail] [ignore]. Otherwise the legacy layout is
    used: [approve] [reject] [silence] plus an optional read-only row.

    Buttons whose callback needs ``incident_id`` are left out when it is
    empty, in both layouts: a callback such as ``detail:`` cannot be
    resolved and would be a dead button.

    Args:
        approval_id: approval ticket ID (used for nonce generation)
        include_auto_tuning: whether to include the auto-tuning button
        auto_tuning_command: kubectl tuning command
        incident_id: related incident ID (detail/reanalyze/history buttons)
        alert_category: alert category (ADR-071-E: selects TYPE-3 buttons)
        notification_type: notification type tag

    Returns:
        dict: ``{"inline_keyboard": [[button, ...], ...]}``
    """
    # 2026-04-14 Claude Sonnet 4.6 (Phase 5 Sprint 5.4):
    # buttons come from the callback_action_spec registry instead of a
    # hard-coded table; the callback_data format is set by spec.callback_format.
    from src.services.callback_dispatcher import list_actions_for_category as _list_actions

    def _build_category_buttons_for(category: str) -> list[tuple[str, str]]:
        """Build (label, callback_data) pairs from the registry."""
        btns: list[tuple[str, str]] = []
        for spec_it in _list_actions(category):
            emoji_label = f"{spec_it.emoji} {spec_it.label}".strip()
            if spec_it.callback_format == "nonce":
                # Write action: nonce-protected callback
                cb = self._security.generate_callback_nonce(approval_id, spec_it.name)
            elif incident_id:
                # Read action: 2-part "name:incident_id"
                cb = f"{spec_it.name}:{incident_id}"
            else:
                # No incident to look up -> skip rather than emit "name:"
                continue
            btns.append((emoji_label, cb))
        return btns

    # Nonces for write operations (replay protection)
    approve_nonce = self._security.generate_callback_nonce(approval_id, "approve")
    reject_nonce = self._security.generate_callback_nonce(approval_id, "reject")
    silence_nonce = self._security.generate_callback_nonce(approval_id, "silence")

    is_type3 = notification_type in ("TYPE-3", NotificationType.TYPE_3, "")

    _dynamic_buttons = _build_category_buttons_for(alert_category) if alert_category else []

    if is_type3 and _dynamic_buttons:
        # TYPE-3 dynamic layout: category-specific actions first
        category_btns = [
            {"text": text, "callback_data": cb_data}
            for text, cb_data in _dynamic_buttons
        ]
        # At most 3 buttons per row
        rows = [category_btns[i:i + 3] for i in range(0, len(category_btns), 3)]
        # Common row: [detail] (only with an incident id) + [ignore]
        common_row = []
        if incident_id:
            common_row.append(
                {"text": "📋 詳情", "callback_data": f"detail:{incident_id}"}
            )
        common_row.append({"text": "🔕 忽略", "callback_data": silence_nonce})
        rows.append(common_row)
        buttons = rows
    else:
        # Legacy generic keys (backward compatible)
        buttons = [
            [
                {"text": "✅ 批准", "callback_data": approve_nonce},
                {"text": "❌ 拒絕", "callback_data": reject_nonce},
                {"text": "🔕 靜默", "callback_data": silence_nonce},
            ],
        ]
        # Second row: read-only queries (ADR-050, format: action:incident_id)
        if incident_id:
            buttons.append([
                {"text": "📋 詳情", "callback_data": f"detail:{incident_id}"},
                {"text": "🔄 重診", "callback_data": f"reanalyze:{incident_id}"},
                {"text": "📊 歷史", "callback_data": f"history:{incident_id}"},
            ])

    # Auto-tuning button (v7.0)
    if include_auto_tuning and auto_tuning_command:
        tuning_nonce = self._security.generate_callback_nonce(approval_id, "tune")
        buttons.append([
            {"text": "⚡ 執行自動調優", "callback_data": tuning_nonce}
        ])

    return {"inline_keyboard": buttons}
|
||
|
||
async def send_approval_card(
    self,
    approval_id: str,
    risk_level: str,
    resource_name: str,
    root_cause: str,
    suggested_action: str,
    estimated_downtime: str = "~30s",
    # v6.0 AI arbitration fields
    primary_responsibility: str = "COLLAB",
    confidence: float = 0.0,
    namespace: str = "default",
    # v7.0 SignOz integration
    signoz_rps: float = 0.0,
    signoz_rps_trend: str = "stable",
    signoz_error_rate: float = 0.0,
    signoz_p99_latency: float = 0.0,
    signoz_latency_trend: str = "stable",
    signoz_trace_url: str = "",
    auto_tuning_command: str = "",
    # 2026-03-29 ogt: AI token/cost tracking
    ai_tokens: int = 0,
    ai_cost: float = 0.0,
    # 2026-03-29 ogt: ADR-037 anomaly frequency statistics
    anomaly_frequency: dict | None = None,
    # 2026-03-29 ogt: AI provider display
    ai_provider: str = "",
    # 2026-04-04 ogt: underlying model name
    ai_model: str = "",
    # 2026-04-02 ogt: Phase 22 Nemotron collaboration (ADR-044)
    nemotron_enabled: bool = False,
    nemotron_tools: list[dict] | None = None,
    nemotron_validation: str = "",
    nemotron_latency_ms: float = 0.0,
    # 2026-04-09 Claude Sonnet 4.6: tool-calling model/backend display
    nemotron_tool_model: str = "",
    nemotron_tool_backend: str = "",
    # 2026-04-05 Claude Code: incident_id for detail/reanalyze/history buttons
    incident_id: str = "",
    # 2026-04-12 ogt: ADR-075 alert category and notification type
    alert_category: str = "",
    notification_type: str = "",
    # 2026-04-16 ogt + Claude Sonnet 4.6: repair-chain display (ADR-076)
    playbook_name: str = "",
) -> dict:
    """
    Push a pending-approval card to Telegram (v7.0 with SignOz integration).

    After the card is sent, its message_id is stored in Redis under
    ``tg_approval:{approval_id}`` (TTL 24h, best effort). Non-interactive
    cards are additionally forwarded to the SRE group in a background task.

    Args:
        approval_id: approval ticket ID
        risk_level: risk level (critical/medium/low)
        resource_name: resource name
        root_cause: root-cause summary
        suggested_action: suggested action
        estimated_downtime: estimated downtime
        primary_responsibility: responsible team (FE/BE/INFRA/DB/COLLAB)
        confidence: AI confidence (0.0-1.0)
        namespace: K8s namespace
        signoz_*: SignOz gold metrics
        signoz_trace_url: trace URL with dynamic time parameters
        auto_tuning_command: kubectl tuning command
        anomaly_frequency: anomaly frequency statistics (ADR-037)

    Returns:
        dict: Telegram API response
    """
    # Status emoji for the risk level
    emoji = RISK_EMOJI_MAP.get(risk_level.lower(), "⚠️")

    # SignOz metrics block, only when at least one metric is positive
    signoz_metrics = None
    if signoz_rps > 0 or signoz_error_rate > 0 or signoz_p99_latency > 0:
        signoz_metrics = SignOzMetricsBlock(
            rps=signoz_rps,
            rps_trend=signoz_rps_trend,
            error_rate=signoz_error_rate,
            p99_latency_ms=signoz_p99_latency,
            latency_trend=signoz_latency_trend,
            trace_url=signoz_trace_url,
        )

    # Message structure (AI arbitration + SignOz + token/cost + frequency)
    message = TelegramMessage(
        status_emoji=emoji,
        risk_level=risk_level.upper(),
        resource_name=resource_name,
        root_cause=root_cause,
        suggested_action=suggested_action,
        estimated_downtime=estimated_downtime,
        approval_id=approval_id,
        incident_id=incident_id,
        primary_responsibility=primary_responsibility,
        confidence=confidence,
        namespace=namespace,
        signoz_metrics=signoz_metrics,
        signoz_trace_url=signoz_trace_url,
        auto_tuning_command=auto_tuning_command,
        # 2026-03-29 ogt: AI token/cost tracking
        ai_tokens=ai_tokens,
        ai_cost=ai_cost,
        # 2026-03-29 ogt: ADR-037 anomaly frequency statistics
        anomaly_frequency=anomaly_frequency,
        # 2026-03-29 ogt: AI provider display
        ai_provider=ai_provider,
        # 2026-04-04 ogt: underlying model name
        ai_model=ai_model,
        # 2026-04-02 ogt: Phase 22 Nemotron collaboration (ADR-044)
        nemotron_enabled=nemotron_enabled,
        nemotron_tools=nemotron_tools,
        nemotron_validation=nemotron_validation,
        nemotron_latency_ms=nemotron_latency_ms,
        # 2026-04-09 Claude Sonnet 4.6: tool-calling model/backend
        nemotron_tool_model=nemotron_tool_model,
        nemotron_tool_backend=nemotron_tool_backend,
        # 2026-04-16 ogt + Claude Sonnet 4.6: repair-chain display (ADR-076)
        alert_category=alert_category,
        playbook_name=playbook_name,
    )

    # Phase 22: use the dual-track format when Nemotron is enabled
    text = message.format_with_nemotron() if nemotron_enabled else message.format()

    # Buttons (including auto-tuning)
    # 2026-04-05 Claude Code: pass incident_id for detail/reanalyze/history
    # ADR-075: pass alert_category/notification_type for category buttons
    keyboard = self._build_inline_keyboard(
        approval_id=approval_id,
        include_auto_tuning=bool(auto_tuning_command),
        auto_tuning_command=auto_tuning_command,
        incident_id=incident_id,
        alert_category=alert_category,
        notification_type=notification_type,
    )

    payload = {
        "chat_id": self.chat_id,
        "text": text,
        "parse_mode": "HTML",
        "reply_markup": keyboard,
        "disable_web_page_preview": True,  # avoid SignOz URL previews
    }

    logger.info(
        "telegram_approval_card_sending",
        approval_id=approval_id,
        risk_level=risk_level,
        resource=resource_name,
        signoz_integrated=signoz_metrics is not None,
        auto_tuning_available=bool(auto_tuning_command),
    )

    result = await self._send_request("sendMessage", payload)

    _msg_id = result.get("result", {}).get("message_id")
    logger.info(
        "telegram_approval_card_sent",
        approval_id=approval_id,
        message_id=_msg_id,
    )

    # 2026-04-10 Claude Sonnet 4.6 Asia/Taipei: store message_id so the card
    # can be updated after auto-repair. key: tg_approval:{approval_id}, TTL 24h.
    # Best effort: a Redis failure must not fail the send.
    if _msg_id:
        try:
            await get_redis().setex(f"tg_approval:{approval_id}", 86400, str(_msg_id))
        except Exception as _e:
            logger.warning("tg_approval_msg_id_store_failed", approval_id=approval_id, error=str(_e))

    # 2026-04-03 ogt: forward to the SRE group (Triumvirate ADR-053)
    # 2026-04-12 ogt: ADR-075 - interactive cards must not go to the group
    #   TYPE-1/TYPE-2 -> may go to the group (informational)
    #   TYPE-3/TYPE-4/TYPE-4D/TYPE-8M -> DM only
    _interactive_types = {"TYPE-3", "TYPE-4", "TYPE-4D", "TYPE-8M"}
    _is_interactive = notification_type in _interactive_types or (
        not notification_type and alert_category  # category without a type -> interactive
    )
    if settings.SRE_GROUP_CHAT_ID and not _is_interactive:
        group_task = asyncio.create_task(
            self._send_approval_card_to_group(
                approval_id=approval_id,
                risk_level=risk_level,
                resource_name=resource_name,
                root_cause=root_cause,
                suggested_action=suggested_action,
                estimated_downtime=estimated_downtime,
                primary_responsibility=primary_responsibility,
                confidence=confidence,
                namespace=namespace,
                signoz_rps=signoz_rps,
                signoz_rps_trend=signoz_rps_trend,
                signoz_error_rate=signoz_error_rate,
                signoz_p99_latency=signoz_p99_latency,
                signoz_latency_trend=signoz_latency_trend,
                signoz_trace_url=signoz_trace_url,
                auto_tuning_command=auto_tuning_command,
                ai_tokens=ai_tokens,
                ai_cost=ai_cost,
                anomaly_frequency=anomaly_frequency,
                ai_provider=ai_provider,
                ai_model=ai_model,
                nemotron_enabled=nemotron_enabled,
                nemotron_tools=nemotron_tools,
                nemotron_validation=nemotron_validation,
                nemotron_latency_ms=nemotron_latency_ms,
                incident_id=incident_id,
            )
        )
        # The event loop keeps only weak references to tasks, so a
        # fire-and-forget task can be garbage-collected before it finishes.
        # Hold a strong reference until the task is done.
        background_tasks = getattr(self, "_background_tasks", None)
        if background_tasks is None:
            background_tasks = set()
            self._background_tasks = background_tasks
        background_tasks.add(group_task)
        group_task.add_done_callback(background_tasks.discard)

    return result
|
||
|
||
async def _send_approval_card_to_group(
    self,
    approval_id: str,
    risk_level: str,
    resource_name: str,
    root_cause: str,
    suggested_action: str,
    estimated_downtime: str = "~30s",
    primary_responsibility: str = "COLLAB",
    confidence: float = 0.0,
    namespace: str = "default",
    signoz_rps: float = 0.0,
    signoz_rps_trend: str = "stable",
    signoz_error_rate: float = 0.0,
    signoz_p99_latency: float = 0.0,
    signoz_latency_trend: str = "stable",
    signoz_trace_url: str = "",
    auto_tuning_command: str = "",
    ai_tokens: int = 0,
    ai_cost: float = 0.0,
    anomaly_frequency: dict | None = None,
    ai_provider: str = "",
    ai_model: str = "",
    nemotron_enabled: bool = False,
    nemotron_tools: list[dict] | None = None,
    nemotron_validation: str = "",
    nemotron_latency_ms: float = 0.0,
    incident_id: str = "",
) -> None:
    """
    Send the alert card text to the SRE group via ``send_to_group``.

    2026-04-05 ogt: upgraded to the full format (SignOz/AI/Nemotron).
    Runs as a background task: every exception is logged and swallowed so a
    failure here never affects the main alert flow. On success the group
    message_id is stored in Redis under ``tg_msg:{incident_id}`` (TTL 24h).
    """
    try:
        emoji = RISK_EMOJI_MAP.get(risk_level.lower(), "⚠️")

        # SignOz block only when at least one metric is positive
        has_metrics = signoz_rps > 0 or signoz_error_rate > 0 or signoz_p99_latency > 0
        if has_metrics:
            signoz_metrics = SignOzMetricsBlock(
                rps=signoz_rps,
                rps_trend=signoz_rps_trend,
                error_rate=signoz_error_rate,
                p99_latency_ms=signoz_p99_latency,
                latency_trend=signoz_latency_trend,
                trace_url=signoz_trace_url,
            )
        else:
            signoz_metrics = None

        message = TelegramMessage(
            status_emoji=emoji,
            risk_level=risk_level.upper(),
            resource_name=resource_name,
            root_cause=root_cause,
            suggested_action=suggested_action,
            estimated_downtime=estimated_downtime,
            approval_id=approval_id,
            incident_id=incident_id,
            primary_responsibility=primary_responsibility,
            confidence=confidence,
            namespace=namespace,
            signoz_metrics=signoz_metrics,
            signoz_trace_url=signoz_trace_url,
            auto_tuning_command=auto_tuning_command,
            ai_tokens=ai_tokens,
            ai_cost=ai_cost,
            anomaly_frequency=anomaly_frequency,
            ai_provider=ai_provider,
            ai_model=ai_model,
            nemotron_enabled=nemotron_enabled,
            nemotron_tools=nemotron_tools,
            nemotron_validation=nemotron_validation,
            nemotron_latency_ms=nemotron_latency_ms,
        )
        if nemotron_enabled:
            text = message.format_with_nemotron()
        else:
            text = message.format()
        resp = await self.send_to_group(text=text)

        # 2026-04-10 Claude Sonnet 4.6: store message_id in Redis for
        # append_incident_update. tg_msg:{incident_id} -> message_id (TTL 24h)
        if not (incident_id and resp):
            return
        # The id may sit under "result" or at the top level of the response
        tg_message_id = (resp.get("result") or {}).get("message_id") or resp.get("message_id")
        if not tg_message_id:
            return

        from src.core.redis_client import get_redis

        store = get_redis()
        await store.set(f"tg_msg:{incident_id}", str(tg_message_id), ex=86400)
        logger.info("tg_msg_id_stored", incident_id=incident_id, message_id=tg_message_id)
    except Exception as e:
        logger.error("send_approval_card_to_group_failed", error=str(e))
|
||
|
||
# =========================================================================
|
||
# ADR-071-C: TYPE-1 純資訊通知 (2026-04-11 Claude Sonnet 4.6)
|
||
# =========================================================================
|
||
|
||
async def send_info_notification(
    self,
    incident_id: str,
    title: str,
    message: str,
    alertname: str = "",
    severity: str = "info",
) -> dict:
    """
    TYPE-1 informational notification: plain HTML text, no inline keyboard.

    Used for FYI-style alerts (severity=info success alerts, backup
    completion, AlertChainHealthy, ...).

    Args:
        incident_id: incident ID
        title: message title
        message: message body
        alertname: alert name; its line is omitted when empty
        severity: "info" or "success" (anything else renders like "info")

    Returns:
        dict: Telegram API response
    """
    icon_by_severity = {"info": "ℹ️", "success": "✅"}
    severity_emoji = icon_by_severity.get(severity, "ℹ️")

    segments = [
        f"{severity_emoji} <b>{html.escape(title)}</b>\n",
        "━━━━━━━━━━━━━━━━━━━\n",
        f"📋 <code>{html.escape(incident_id)}</code>\n",
    ]
    if alertname:
        segments.append(f"🔔 告警: <code>{html.escape(alertname)}</code>\n")
    segments.append(f"\n{html.escape(message)}\n")
    segments.append("\n<i>此為純資訊通知,無需操作。</i>")

    request_body = {
        "chat_id": settings.OPENCLAW_TG_CHAT_ID,
        "text": "".join(segments),
        "parse_mode": "HTML",
    }
    return await self._send_request("sendMessage", request_body)
|
||
|
||
# =========================================================================
|
||
# ADR-071-F: TYPE-4D Config Drift 專屬卡片 (2026-04-11 Claude Sonnet 4.6)
|
||
# =========================================================================
|
||
|
||
async def send_drift_card(
    self,
    incident_id: str,
    approval_id: str,
    resource_name: str,
    diff_summary: str,
    detected_at: str = "",
) -> dict:
    """
    TYPE-4D Config Drift notification card.

    Buttons: [view diff] [adopt change] / [revert] [ignore], each backed by
    a nonce generated for ``approval_id``.
    A diff of up to 500 characters is embedded in a ``<pre>`` block; a
    longer one is replaced by a link to the web UI to keep the Telegram
    message short.

    Args:
        incident_id: incident ID
        approval_id: approval ticket ID (used for nonce generation)
        resource_name: name of the drifted resource (first 50 chars shown)
        diff_summary: diff summary text
        detected_at: detection time; its line is omitted when empty

    Returns:
        dict: Telegram API response
    """
    # Diff length handling (ADR-071, Section 14.9.6)
    if len(diff_summary) <= 500:
        diff_block = f"\n<pre>{html.escape(diff_summary)}</pre>"
    else:
        web_url = f"https://aiops.wooo.work/incidents/{incident_id}/drift-diff"
        # The URL sits inside a single-quoted HTML attribute, so escape it
        # (quotes included) like every other interpolated value; otherwise
        # an id containing ' < > & breaks Telegram's HTML parsing.
        safe_url = html.escape(web_url, quote=True)
        diff_block = f"\n⚠️ 差異過大({len(diff_summary)} 字)\n🔗 <a href='{safe_url}'>查看完整 Diff</a>"

    text = (
        f"⚙️ <b>Config Drift 偵測</b>\n"
        f"━━━━━━━━━━━━━━━━━━━\n"
        f"📋 <code>{html.escape(incident_id)}</code>\n"
        f"🎯 資源: <code>{html.escape(resource_name[:50])}</code>\n"
    )
    if detected_at:
        text += f"🕐 偵測時間: {html.escape(detected_at)}\n"
    text += diff_block

    # Button set (TYPE-4D: fixed four keys)
    view_nonce = self._security.generate_callback_nonce(approval_id, "drift_view")
    adopt_nonce = self._security.generate_callback_nonce(approval_id, "drift_adopt")
    revert_nonce = self._security.generate_callback_nonce(approval_id, "drift_revert")
    ignore_nonce = self._security.generate_callback_nonce(approval_id, "silence")

    keyboard = {
        "inline_keyboard": [
            [
                {"text": "🔍 查看 Diff", "callback_data": view_nonce},
                {"text": "✅ 採納變更", "callback_data": adopt_nonce},
            ],
            [
                {"text": "⏪ 回滾", "callback_data": revert_nonce},
                {"text": "🔕 忽略", "callback_data": ignore_nonce},
            ],
        ]
    }

    return await self._send_request(
        "sendMessage",
        {
            "chat_id": settings.OPENCLAW_TG_CHAT_ID,
            "text": text,
            "parse_mode": "HTML",
            "reply_markup": keyboard,
        },
    )
|
||
|
||
# =========================================================================
|
||
# ADR-075: TYPE-8M Meta-System 告警(飛輪/告警鏈路健康)
|
||
# 2026-04-12 ogt
|
||
# =========================================================================
|
||
|
||
async def send_meta_alert(
    self,
    incident_id: str,
    approval_id: str,
    alertname: str,
    alert_category: str,  # "flywheel_health" or "alertchain_health"
    diagnosis: str,
    severity_level: str = "critical",
    system_impact: str = "",
    probable_cause: str = "",
) -> dict:
    """
    Send a TYPE-8M meta-system alert card (flywheel / alert-chain health).

    The card carries a single "silence 1h" button. The diagnostic and
    dashboard buttons were removed on 2026-04-16 because no callback
    handler existed for them (rule: no button is better than a dead one).

    Args:
        incident_id: Incident ID shown on the card (HTML-escaped).
        approval_id: Approval ID used to generate the silence callback nonce.
        alertname: Name of the failing component (HTML-escaped).
        alert_category: "flywheel_health" selects the flywheel heading;
            any other value selects the alert-chain heading.
        diagnosis: Diagnosis text, cut to 100 characters.
        severity_level: Exactly "critical" gives the red icon; anything
            else gives the orange one.
        system_impact: Optional impact text, cut to 150 characters.
        probable_cause: Optional root-cause text, cut to 100 characters.

    Returns:
        dict: Whatever ``self._send_request`` returns for "sendMessage".
    """
    if severity_level == "critical":
        icon = "🔴"
    else:
        icon = "🟠"

    if alert_category == "flywheel_health":
        heading = "飛輪核心異常"
    else:
        heading = "告警鏈路異常"

    parts = [
        f"⚙️ META SYSTEM | {icon} {heading}\n",
        "━━━━━━━━━━━━━━━━━━━\n",
        f"📋 <code>{html.escape(incident_id)}</code>\n",
        f"🚨 異常元件:<b>{html.escape(alertname)}</b>\n",
        f"🎯 診斷結果:{html.escape(diagnosis[:100])}\n",
    ]
    if system_impact:
        parts.append(f"\n🧠 <b>系統影響</b>\n{html.escape(system_impact[:150])}\n")
    if probable_cause:
        parts.append(f"└─ 可能根因:{html.escape(probable_cause[:100])}\n")
    card_text = "".join(parts)

    # Only the silence action has a handler, so it is the only button offered.
    silence_button = {
        "text": "🔕 靜默 1h",
        "callback_data": self._security.generate_callback_nonce(approval_id, "silence"),
    }
    markup = {"inline_keyboard": [[silence_button]]}

    request_body = {
        "chat_id": settings.OPENCLAW_TG_CHAT_ID,
        "text": card_text,
        "parse_mode": "HTML",
        "reply_markup": markup,
    }
    return await self._send_request("sendMessage", request_body)
|
||
|
||
async def send_secops_card(
    self,
    incident_id: str,
    approval_id: str,
    alertname: str,
    threat_level: str,
    source: str = "",
    threat_behavior: str = "",
    defense_action: str = "",
    resource: str = "",
) -> dict:
    """
    Send a TYPE-5S SecOps security-threat card (ADR-075).

    Two buttons are attached: "isolate resource" and "confirm authorized
    operation". The block-IP and evict buttons were removed on 2026-04-16
    because they had no working handler.

    Args:
        incident_id: Incident ID shown on the card (HTML-escaped).
        approval_id: Approval ID used to generate both callback nonces.
        alertname: Threat type (HTML-escaped).
        threat_level: Risk level; lower-cased to pick the icon
            ("critical" -> red, "warning" -> orange, otherwise a generic
            warning sign) and also printed verbatim (escaped).
        source: Optional origin of the threat.
        threat_behavior: Optional behaviour description, cut to 200 chars.
        defense_action: Optional suggested action, cut to 200 chars.
        resource: Optional affected resource.

    Returns:
        dict: Whatever ``self._send_request`` returns for "sendMessage".
    """
    icon_by_level = {"critical": "🔴", "warning": "🟠"}
    level_key = threat_level.lower()
    badge = icon_by_level[level_key] if level_key in icon_by_level else "⚠️"

    parts = [
        f"🥷 SECOPS | {badge} 資安威脅\n",
        "━━━━━━━━━━━━━━━━━━━\n",
        f"📋 <code>{html.escape(incident_id)}</code>\n",
        f"🚨 威脅類型:<b>{html.escape(alertname)}</b>\n",
    ]
    if resource:
        parts.append(f"🎯 受害資源:<code>{html.escape(resource)}</code>\n")
    parts.append("\n🧠 <b>AI 威脅分析</b>\n")
    if source:
        parts.append(f"├─ 來源:{html.escape(source)}\n")
    if threat_behavior:
        parts.append(f"├─ 異常行為:{html.escape(threat_behavior[:200])}\n")
    parts.append(f"└─ 風險評估:<b>{html.escape(threat_level)}</b>\n")
    if defense_action:
        parts.append(
            f"\n🛡️ <b>建議防禦動作</b>\n<code>{html.escape(defense_action[:200])}</code>\n"
        )
    card_text = "".join(parts)

    # Nonces are generated in the same order as the buttons appear.
    button_specs = (
        ("🚫 隔離資源", "secops_isolate"),
        ("✅ 確認授權操作", "secops_authorize"),
    )
    button_row = []
    for label, action_name in button_specs:
        token = self._security.generate_callback_nonce(approval_id, action_name)
        button_row.append({"text": label, "callback_data": token})

    request_body = {
        "chat_id": settings.OPENCLAW_TG_CHAT_ID,
        "text": card_text,
        "parse_mode": "HTML",
        "reply_markup": {"inline_keyboard": [button_row]},
    }
    return await self._send_request("sendMessage", request_body)
|
||
|
||
async def send_business_alert(
|
||
self,
|
||
incident_id: str,
|
||
alertname: str,
|
||
business_domain: str,
|
||
metric_name: str,
|
||
current_value: str,
|
||
threshold: str,
|
||
loss_rate: str = "",
|
||
group_chat_id: str | None = None,
|
||
) -> dict:
|
||
"""
|
||
TYPE-6B 業務/FinOps 告警。
|
||
|
||
ADR-075 (2026-04-12 ogt)
|
||
路由: TYPE-1 發群組,此為 TYPE-6B 也發群組(業務趨勢數字)
|
||
按鈕: [暫停] [查 SignOz] [忽略]
|
||
"""
|
||
text = (
|
||
f"📉 SLO ALERT | 業務指標異常\n"
|
||
f"━━━━━━━━━━━━━━━━━━━\n"
|
||
f"📋 <code>{html.escape(incident_id)}</code>\n"
|
||
f"🚨 告警:<b>{html.escape(alertname)}</b>\n"
|
||
f"🎯 影響業務:{html.escape(business_domain)}\n"
|
||
f"📊 異常指標:<code>{html.escape(metric_name)}</code>\n"
|
||
f"\n🧠 <b>業務衝擊分析</b>\n"
|
||
f"├─ 當前狀態:{html.escape(current_value)} (閾值: {html.escape(threshold)})\n"
|
||
)
|
||
if loss_rate:
|
||
text += f"└─ 損失速率:{html.escape(loss_rate)}\n"
|
||
|
||
# 2026-04-16 ogt: 移除 pause_1h / ignore (3-part ghost button,無 handler)
|
||
# 鐵律: 寧可沒按鈕,不可有死按鈕 (feedback_no_ghost_buttons.md)
|
||
keyboard = {"inline_keyboard": []}
|
||
|
||
target_chat = group_chat_id or settings.OPENCLAW_TG_CHAT_ID
|
||
return await self._send_request(
|
||
"sendMessage",
|
||
{
|
||
"chat_id": target_chat,
|
||
"text": text,
|
||
"parse_mode": "HTML",
|
||
"reply_markup": keyboard,
|
||
},
|
||
)
|
||
|
||
async def send_escalation_card(
    self,
    incident_id: str,
    original_alertname: str,
    duration_min: int,
    priority: int = 0,
    attempted_actions: str = "",
    failure_reason: str = "",
    current_impact: str = "",
    group_chat_id: str | None = None,
) -> dict:
    """
    Send a TYPE-7E major-incident escalation notice (ADR-075).

    The card is always sent to ``settings.OPENCLAW_TG_CHAT_ID`` and, when
    ``group_chat_id`` is given, a second copy with a call-for-help line is
    sent to that chat. No buttons are attached: the former postmortem /
    ack / DR-manual buttons were removed on 2026-04-16 because they had no
    callback handler.

    Args:
        incident_id: Incident ID shown on the card (HTML-escaped).
        original_alertname: Name of the alert that escalated.
        duration_min: Minutes the incident has lasted; values of 60 or
            more are shown as hours and minutes.
        priority: Priority number printed as ``P{priority}``.
        attempted_actions: Optional text, cut to 100 characters.
        failure_reason: Optional text, cut to 100 characters.
        current_impact: Optional text, cut to 100 characters.
        group_chat_id: Optional second chat to notify.

    Returns:
        dict: The response of the first send only; the group send's
        response is discarded.
    """
    if duration_min < 60:
        elapsed = f"{duration_min} 分鐘"
    else:
        hours, minutes = divmod(duration_min, 60)
        elapsed = f"{hours} 小時 {minutes} 分"

    parts = [
        f"🚨 ESCALATION | P{priority} 事故升級\n",
        "━━━━━━━━━━━━━━━━━━━\n",
        f"📋 <code>{html.escape(incident_id)}</code> | 已持續 <b>{elapsed}</b>\n",
        "⚠️ <b>超出自動修復能力範圍</b>\n",
        f"🎯 核心問題:<code>{html.escape(original_alertname)}</code>\n",
    ]
    # The summary heading appears only if at least one detail line follows.
    if attempted_actions or failure_reason or current_impact:
        parts.append("\n🧠 <b>AI 戰局總結</b>\n")
    if attempted_actions:
        parts.append(f"├─ 嘗試動作:{html.escape(attempted_actions[:100])}\n")
    if failure_reason:
        parts.append(f"├─ 失敗原因:{html.escape(failure_reason[:100])}\n")
    if current_impact:
        parts.append(f"└─ 目前影響:{html.escape(current_impact[:100])}\n")
    card_text = "".join(parts)

    # First send: the default chat, with an (empty) inline keyboard.
    dm_response = await self._send_request(
        "sendMessage",
        {
            "chat_id": settings.OPENCLAW_TG_CHAT_ID,
            "text": card_text,
            "parse_mode": "HTML",
            "reply_markup": {"inline_keyboard": []},
        },
    )

    # Second send (optional): the group, with a call-for-help suffix.
    if group_chat_id:
        await self._send_request(
            "sendMessage",
            {
                "chat_id": group_chat_id,
                "text": card_text + "\n📣 @所有人 事故升級,請協助!",
                "parse_mode": "HTML",
            },
        )

    return dm_response
|
||
|
||
# =========================================================================
|
||
# 新訊息發送方法 (2026-03-29 ogt: ADR-038)
|
||
# =========================================================================
|
||
|
||
def _build_sentry_keyboard(self, error_id: str) -> dict:
|
||
"""建立 Sentry 錯誤訊息按鈕"""
|
||
view_nonce = self._security.generate_callback_nonce(error_id, "view")
|
||
silence_nonce = self._security.generate_callback_nonce(error_id, "silence")
|
||
|
||
return {
|
||
"inline_keyboard": [
|
||
[
|
||
{"text": "🔍 查看詳情", "callback_data": view_nonce},
|
||
{"text": "🔕 靜默 1h", "callback_data": silence_nonce},
|
||
]
|
||
]
|
||
}
|
||
|
||
def _build_resource_keyboard(self, resource_id: str) -> dict:
|
||
"""建立資源告警按鈕"""
|
||
scale_nonce = self._security.generate_callback_nonce(resource_id, "scale")
|
||
silence_nonce = self._security.generate_callback_nonce(resource_id, "silence")
|
||
|
||
return {
|
||
"inline_keyboard": [
|
||
[
|
||
{"text": "⚡ 自動擴展", "callback_data": scale_nonce},
|
||
{"text": "🔕 靜默 1h", "callback_data": silence_nonce},
|
||
]
|
||
]
|
||
}
|
||
|
||
async def send_sentry_error(
    self,
    error_id: str,
    error_type: str,
    error_message: str,
    service_name: str,
    file_location: str,
    occurrence_count: int = 1,
    affected_users: int = 0,
    first_seen: str = "",
    stack_trace: list[str] | None = None,
    sentry_url: str = "",
) -> dict:
    """
    Send a Sentry error notification (added 2026-03-29).

    Args:
        error_id: Sentry issue ID; also used for the button nonces.
        error_type: Error class name (e.g. TypeError).
        error_message: Error message text.
        service_name: Name of the affected service.
        file_location: Source location of the error.
        occurrence_count: Number of occurrences.
        affected_users: Number of affected users.
        first_seen: Time the error was first seen.
        stack_trace: Optional stack trace lines.
        sentry_url: Link to the issue in Sentry.

    Returns:
        dict: Whatever ``self._send_request`` returns for "sendMessage".
    """
    card_fields = {
        "error_id": error_id,
        "error_type": error_type,
        "error_message": error_message,
        "service_name": service_name,
        "file_location": file_location,
        "occurrence_count": occurrence_count,
        "affected_users": affected_users,
        "first_seen": first_seen,
        "stack_trace": stack_trace,
        "sentry_url": sentry_url,
    }
    card = SentryErrorMessage(**card_fields)

    # Render the text first, then the keyboard, matching the payload order.
    body_text = card.format()
    buttons = self._build_sentry_keyboard(error_id)
    request_body = {
        "chat_id": self.chat_id,
        "text": body_text,
        "parse_mode": "HTML",
        "reply_markup": buttons,
        "disable_web_page_preview": True,
    }

    logger.info("telegram_sentry_error_sending", error_id=error_id, service=service_name)
    response = await self._send_request("sendMessage", request_body)
    logger.info("telegram_sentry_error_sent", error_id=error_id)
    return response
|
||
|
||
async def send_resource_warning(
    self,
    resource_id: str,
    pod_name: str,
    namespace: str = "default",
    cpu_percent: float = 0.0,
    cpu_limit: str = "",
    memory_percent: float = 0.0,
    memory_limit: str = "",
    disk_percent: float = 0.0,
    trend_info: str = "",
    suggestion: str = "",
) -> dict:
    """
    Send a resource usage warning (added 2026-03-29).

    Args:
        resource_id: Resource ID; also used for the button nonces.
        pod_name: Pod name.
        namespace: Kubernetes namespace.
        cpu_percent: CPU usage percentage.
        cpu_limit: CPU limit, as display text.
        memory_percent: Memory usage percentage.
        memory_limit: Memory limit, as display text.
        disk_percent: Disk usage percentage.
        trend_info: Trend description.
        suggestion: Suggested action.

    Returns:
        dict: Whatever ``self._send_request`` returns for "sendMessage".
    """
    card_fields = {
        "resource_id": resource_id,
        "pod_name": pod_name,
        "namespace": namespace,
        "cpu_percent": cpu_percent,
        "cpu_limit": cpu_limit,
        "memory_percent": memory_percent,
        "memory_limit": memory_limit,
        "disk_percent": disk_percent,
        "trend_info": trend_info,
        "suggestion": suggestion,
    }
    card = ResourceWarnMessage(**card_fields)

    # Render the text first, then the keyboard, matching the payload order.
    body_text = card.format()
    buttons = self._build_resource_keyboard(resource_id)
    request_body = {
        "chat_id": self.chat_id,
        "text": body_text,
        "parse_mode": "HTML",
        "reply_markup": buttons,
        "disable_web_page_preview": True,
    }

    logger.info("telegram_resource_warning_sending", resource_id=resource_id, pod=pod_name)
    response = await self._send_request("sendMessage", request_body)
    logger.info("telegram_resource_warning_sent", resource_id=resource_id)
    return response
|
||
|
||
async def send_repair_report(
    self,
    report_date: str,
    total_repairs: int = 0,
    success_count: int = 0,
    failure_count: int = 0,
    saved_minutes: int = 0,
    top_issues: list[tuple[str, int]] | None = None,
    ai_cost_gemini: float = 0.0,
    ai_cost_nvidia: float = 0.0,
    ai_tokens_total: int = 0,
) -> dict:
    """
    Send the auto-repair report (added 2026-03-29). No buttons are attached.

    Args:
        report_date: Report date.
        total_repairs: Total number of repairs.
        success_count: Number of successful repairs.
        failure_count: Number of failed repairs.
        saved_minutes: Manual minutes saved.
        top_issues: Optional list of (issue, count) pairs.
        ai_cost_gemini: Gemini cost.
        ai_cost_nvidia: NVIDIA cost.
        ai_tokens_total: Total token count.

    Returns:
        dict: Whatever ``self._send_request`` returns for "sendMessage".
    """
    report_fields = {
        "report_date": report_date,
        "total_repairs": total_repairs,
        "success_count": success_count,
        "failure_count": failure_count,
        "saved_minutes": saved_minutes,
        "top_issues": top_issues,
        "ai_cost_gemini": ai_cost_gemini,
        "ai_cost_nvidia": ai_cost_nvidia,
        "ai_tokens_total": ai_tokens_total,
    }
    report = RepairReportMessage(**report_fields)

    request_body = {
        "chat_id": self.chat_id,
        "text": report.format(),
        "parse_mode": "HTML",
        "disable_web_page_preview": True,
    }

    logger.info("telegram_repair_report_sending", date=report_date)
    response = await self._send_request("sendMessage", request_body)
    logger.info("telegram_repair_report_sent", date=report_date)
    return response
|
||
|
||
async def send_daily_summary(
    self,
    summary_date: str,
    alert_total: int = 0,
    alert_critical: int = 0,
    alert_medium: int = 0,
    alert_low: int = 0,
    auto_repair_count: int = 0,
    manual_approval_count: int = 0,
    ignored_count: int = 0,
    avg_response_minutes: float = 0.0,
    api_availability: float = 99.9,
    web_availability: float = 99.9,
    worker_availability: float = 99.9,
    ai_cost: float = 0.0,
    cloud_cost: float = 0.0,
    budget_remaining: float = 0.0,
) -> dict:
    """
    Send the daily summary (added 2026-03-29). No buttons are attached.

    Every argument is passed through unchanged to ``DailySummaryMessage``,
    whose ``format()`` output becomes the message text.

    Returns:
        dict: Whatever ``self._send_request`` returns for "sendMessage".
    """
    summary_fields = {
        "summary_date": summary_date,
        "alert_total": alert_total,
        "alert_critical": alert_critical,
        "alert_medium": alert_medium,
        "alert_low": alert_low,
        "auto_repair_count": auto_repair_count,
        "manual_approval_count": manual_approval_count,
        "ignored_count": ignored_count,
        "avg_response_minutes": avg_response_minutes,
        "api_availability": api_availability,
        "web_availability": web_availability,
        "worker_availability": worker_availability,
        "ai_cost": ai_cost,
        "cloud_cost": cloud_cost,
        "budget_remaining": budget_remaining,
    }
    summary = DailySummaryMessage(**summary_fields)

    request_body = {
        "chat_id": self.chat_id,
        "text": summary.format(),
        "parse_mode": "HTML",
        "disable_web_page_preview": True,
    }

    logger.info("telegram_daily_summary_sending", date=summary_date)
    response = await self._send_request("sendMessage", request_body)
    logger.info("telegram_daily_summary_sent", date=summary_date)
    return response
|
||
|
||
async def send_cicd_progress(
    self,
    job_name: str,
    status: str,
    stage: str = "",
    commit_sha: str = "",
    triggered_by: str = "",
    duration_seconds: int = 0,
    message: str = "",
    workflow_url: str = "",
    max_retries: int = 3,
) -> dict:
    """
    Send a CI/CD progress notice (compact form, bypasses AI arbitration).

    Added 2026-03-30 so CI/CD alerts are no longer handled as incidents;
    retry with exponential backoff was added the same day.

    Args:
        job_name: CI/CD job name.
        status: Job status.
        stage: Pipeline stage.
        commit_sha: Commit SHA.
        triggered_by: Who or what triggered the job.
        duration_seconds: Job duration in seconds.
        message: Free-form message.
        workflow_url: Link to the workflow run.
        max_retries: Maximum number of send attempts (default 3). Values
            below 1 are treated as 1, so the message is always attempted
            at least once.

    Returns:
        dict: Whatever ``self._send_request`` returns for "sendMessage".

    Raises:
        TelegramGatewayError: The error from the last attempt, once every
            attempt has failed.
    """
    # Always try at least once. With max_retries <= 0 the loop below would
    # otherwise be skipped and `raise last_error` would raise None
    # (a TypeError) without any send having been attempted.
    attempts = max(1, max_retries)

    # OTEL span: telegram.send_cicd_progress
    with _tracer.start_as_current_span(
        "telegram.send_cicd_progress",
        attributes={
            "telegram.job_name": job_name,
            "telegram.status": status,
            "telegram.stage": stage,
            "telegram.max_retries": max_retries,
        },
    ) as span:
        msg = CICDProgressMessage(
            job_name=job_name,
            status=status,
            stage=stage,
            commit_sha=commit_sha,
            triggered_by=triggered_by,
            duration_seconds=duration_seconds,
            message=message,
            workflow_url=workflow_url,
        )

        payload = {
            "chat_id": self.chat_id,
            "text": msg.format(),
            "parse_mode": "HTML",
            "disable_web_page_preview": True,
        }

        logger.info("telegram_cicd_progress_sending", job=job_name, status=status)

        # Retry loop with exponential backoff.
        last_error = None
        for attempt in range(attempts):
            try:
                result = await self._send_request("sendMessage", payload)
            except TelegramGatewayError as e:
                last_error = e
                # Sleep only between attempts, never after the final one.
                if attempt < attempts - 1:
                    delay = 2 ** attempt  # 1, 2, 4 seconds
                    logger.warning(
                        "telegram_cicd_progress_retry",
                        job=job_name,
                        attempt=attempt + 1,
                        delay=delay,
                        error=str(e),
                    )
                    await asyncio.sleep(delay)
            else:
                span.set_attribute("telegram.attempts", attempt + 1)
                span.set_status(trace.Status(trace.StatusCode.OK))
                logger.info(
                    "telegram_cicd_progress_sent",
                    job=job_name,
                    status=status,
                    attempt=attempt + 1,
                )
                return result

        # Every attempt failed; last_error is set because attempts >= 1.
        span.set_attribute("telegram.attempts", attempts)
        span.set_status(trace.Status(trace.StatusCode.ERROR))
        span.record_exception(last_error)
        logger.error(
            "telegram_cicd_progress_failed",
            job=job_name,
            status=status,
            max_retries=attempts,
            error=str(last_error),
        )
        raise last_error
|
||
|
||
async def send_deploy_success(
    self,
    commit_sha: str,
    triggered_by: str,
    environment: str = "Production",
    api_version: str = "",
    web_version: str = "",
    worker_version: str = "",
    duration_seconds: int = 0,
    e2e_passed: int = 0,
    e2e_total: int = 0,
    health_check_passed: bool = True,
    workflow_url: str = "",
) -> dict:
    """
    Send a deployment-success notice (added 2026-03-29). No buttons are attached.

    Every argument is passed through unchanged to ``DeploySuccessMessage``,
    whose ``format()`` output becomes the message text. Only the first
    eight characters of ``commit_sha`` are written to the log.

    Returns:
        dict: Whatever ``self._send_request`` returns for "sendMessage".
    """
    deploy_fields = {
        "commit_sha": commit_sha,
        "triggered_by": triggered_by,
        "environment": environment,
        "api_version": api_version,
        "web_version": web_version,
        "worker_version": worker_version,
        "duration_seconds": duration_seconds,
        "e2e_passed": e2e_passed,
        "e2e_total": e2e_total,
        "health_check_passed": health_check_passed,
        "workflow_url": workflow_url,
    }
    notice = DeploySuccessMessage(**deploy_fields)

    request_body = {
        "chat_id": self.chat_id,
        "text": notice.format(),
        "parse_mode": "HTML",
        "disable_web_page_preview": True,
    }

    logger.info("telegram_deploy_success_sending", commit=commit_sha[:8])
    response = await self._send_request("sendMessage", request_body)
    logger.info("telegram_deploy_success_sent", commit=commit_sha[:8])
    return response
|
||
|
||
async def send_rate_limit_warning(
    self,
    provider: str,
    daily_usage: int = 0,
    daily_limit: int = 0,
    token_usage: int = 0,
    token_limit: int = 0,
    cost_usd: float = 0.0,
    suggestions: list[str] | None = None,
    reset_time: str = "",
) -> dict:
    """
    Send an API quota warning (added 2026-03-29). No buttons are attached.

    Every argument is passed through unchanged to ``RateLimitMessage``,
    whose ``format()`` output becomes the message text.

    Returns:
        dict: Whatever ``self._send_request`` returns for "sendMessage".
    """
    quota_fields = {
        "provider": provider,
        "daily_usage": daily_usage,
        "daily_limit": daily_limit,
        "token_usage": token_usage,
        "token_limit": token_limit,
        "cost_usd": cost_usd,
        "suggestions": suggestions,
        "reset_time": reset_time,
    }
    warning = RateLimitMessage(**quota_fields)

    request_body = {
        "chat_id": self.chat_id,
        "text": warning.format(),
        "parse_mode": "HTML",
        "disable_web_page_preview": True,
    }

    logger.info("telegram_rate_limit_warning_sending", provider=provider)
    response = await self._send_request("sendMessage", request_body)
    logger.info("telegram_rate_limit_warning_sent", provider=provider)
    return response
|
||
|
||
async def handle_callback(
    self,
    callback_query_id: str,
    callback_data: str,
    user_id: int,
    message_id: int,
    original_text: str = "",
    username: str = "",
) -> dict:
    """
    Handle an approval / tuning button callback from Telegram.

    Flow, in order:
      1.   Parse the callback data.
      1.5  Read-only info actions (whitelist check only, no nonce).
           Everything below first verifies user + nonce.
      1.8  Incident state guard; may short-circuit with its own result.
      1.9  Category write actions registered in the callback action spec
           (with a 2-signature guard when the spec requires multi-sig).
      2.   "tune" (shadow mode).
      2.5  "snooze" / "silence".
      2.8  "log_manual_fix" (stores a pending marker in Redis for 30 min).
      3-4. Any remaining action: acknowledge, then stamp the original card.

    Args:
        callback_query_id: Telegram callback query ID.
        callback_data: Callback data (contains the nonce).
        user_id: Telegram user ID.
        message_id: ID of the original message.
        original_text: Text of the original card (kept for context).
        username: Username of the person who pressed the button.

    Returns:
        dict: On success ``{action, approval_id, user, success, ...}`` with
        action-specific extras. On any caught failure
        ``{"success": False, "error": <message>}``. Exceptions raised
        inside the ``try`` body are not propagated; each is turned into a
        callback answer plus a failure dict.
    """
    try:
        # ===================================================================
        # Step 1: parse the callback data (two formats are supported)
        # ===================================================================
        parsed = self._security.parse_callback_data(callback_data)
        action = parsed["action"]
        approval_id = parsed["approval_id"]

        # ===================================================================
        # Step 1.5: ADR-050 info actions (read-only, whitelist check only)
        # ===================================================================
        # 2026-04-01 Claude Code (ADR-050 P1): detail/reanalyze/history
        if parsed.get("is_info_action"):
            if not self._security.is_whitelisted(user_id):
                raise UserNotWhitelistedError(f"User {user_id} not in whitelist")

            incident_id = parsed.get("incident_id", approval_id)

            if action == "detail":
                # ADR-050 P2: fetch incident detail and send it as a new
                # message, leaving the original approval card and buttons.
                await self._answer_callback(callback_query_id, action, text="📋 詳情傳送中...")
                await self._send_incident_detail(incident_id)
            elif action == "history":
                # ADR-050 P2: frequency statistics.
                await self._answer_callback(callback_query_id, action, text="📊 歷史統計傳送中...")
                await self._send_incident_history(incident_id)
            elif action == "reanalyze":
                # ADR-050 P2: trigger a re-diagnosis.
                await self._answer_callback(callback_query_id, action, text="🔄 重診排程中...")
                await self._send_reanalyze_result(incident_id)
            else:
                # 2026-04-14 (Phase 5 Sprint 5.1): unknown action -> fallback
                # dispatcher (checks whether callback_action_spec.yaml has it).
                await self._dispatch_category_action(
                    callback_query_id=callback_query_id,
                    action=action,
                    incident_id=incident_id,
                    user_id=user_id,
                )

            return {
                "action": action,
                "approval_id": approval_id,
                "user": {"id": user_id, "username": username},
                "success": True,
                "info_action": True,
            }

        nonce = parsed["nonce"]  # 4-part nonce action

        # Verify user + nonce before any write-type action is considered.
        user = await self._security.verify_callback(
            user_id=user_id,
            callback_id=callback_query_id,
            nonce=nonce,
        )

        # ===================================================================
        # Step 1.8: ADR-071-D state machine guard
        # Stops cards of RESOLVED/CLOSED incidents from being executed again
        # and stops repeated triggers while an incident is MITIGATING.
        # ===================================================================
        guard_result = await self._check_incident_state_guard(
            approval_id=approval_id,
            callback_query_id=callback_query_id,
            message_id=message_id,
            original_text=original_text,
        )
        if guard_result is not None:
            return guard_result

        # ===================================================================
        # Step 1.9: Phase 5 Sprint 5.3 - category write-action routing
        # If the action is in the callback_action_spec registry and is not
        # one of the built-in actions, run it through the dispatcher.
        # ===================================================================
        from src.services.callback_dispatcher import get_action_spec as _get_spec
        _category_spec = _get_spec(action)
        if _category_spec and action not in (
            "approve", "reject", "silence", "tune", "log_manual_fix"
        ):
            # Multi-sig guard (Sprint 5.4, secops class)
            if _category_spec.requires_multi_sig:
                # Check whether the approval already has 2 signatures.
                # NOTE(review): this path calls get_approval(), while
                # _check_incident_state_guard calls get_approval_by_id();
                # confirm both exist on the approval service.
                try:
                    from src.services.approval_db import get_approval_service as _svc
                    from uuid import UUID as _UUID
                    _existing = await _svc().get_approval(_UUID(approval_id))
                    _sigs = (
                        len(_existing.signatures) if _existing and _existing.signatures else 0
                    )
                except Exception as _sig_err:
                    # Fail closed (0 signatures -> action refused), but leave
                    # a trace so a broken lookup is not mistaken for a
                    # genuinely unsigned approval.
                    logger.warning(
                        "category_action_multi_sig_lookup_failed",
                        action=action,
                        approval_id=approval_id,
                        error=str(_sig_err),
                    )
                    _sigs = 0
                if _sigs < 2:
                    await self._answer_callback(
                        callback_query_id, action,
                        text=f"⚠️ 需 2 人簽核 ({_sigs}/2)",
                    )
                    logger.info(
                        "category_action_multi_sig_pending",
                        action=action, approval_id=approval_id, current_sigs=_sigs,
                    )
                    return {
                        "action": action, "approval_id": approval_id,
                        "user": user, "success": False,
                        "reason": "multi_sig_pending",
                    }

            # Audit log: start of a write-type action
            logger.info(
                "category_write_action_audit_start",
                action=action,
                approval_id=approval_id,
                user_id=user_id,
                username=username,
                risk=_category_spec.risk,
                provider=_category_spec.mcp_provider,
                tool=_category_spec.mcp_tool,
            )

            # Acknowledge to Telegram
            await self._answer_callback(
                callback_query_id, action,
                text=f"{_category_spec.emoji} {_category_spec.label} 執行中...",
            )

            # Resolve incident_id + labels for the action template
            _incident_id_resolved = approval_id  # fallback
            _labels: dict = {}
            try:
                from src.repositories.incident_repository import get_incident_repository
                _repo = get_incident_repository()
                # approval_id may be INC-xxx or a UUID; try the INC form first
                if approval_id.startswith("INC-"):
                    _inc = await _repo.get_by_id(approval_id)
                else:
                    # UUID -> approval -> incident_id
                    from src.services.approval_db import get_approval_service
                    from uuid import UUID
                    _app = await get_approval_service().get_approval(UUID(approval_id))
                    _inc_id = getattr(_app, "incident_id", None) if _app else None
                    _inc = await _repo.get_by_id(_inc_id) if _inc_id else None
                if _inc:
                    _incident_id_resolved = _inc.incident_id
                if _inc and _inc.signals:
                    _labels = _inc.signals[0].labels or {}
            except Exception as _e:
                logger.debug("category_action_labels_lookup_failed", error=str(_e))

            # Dispatch
            from src.services.callback_dispatcher import dispatch_action as _dispatch
            _result = await _dispatch(
                action_name=action,
                incident_id=_incident_id_resolved,
                user_id=user_id,
                labels=_labels,
            )

            # Reply with the result, threaded under the original alert card
            # when its message id can be found in Redis.
            try:
                from src.core.redis_client import get_redis as _gr
                _rds = _gr()
                _msg_id_raw = await _rds.get(f"tg_msg:{_incident_id_resolved}")
                _orig_msg = int(_msg_id_raw) if _msg_id_raw else None
            except Exception as _msg_err:
                # Best effort: without the id the reply is sent unthreaded.
                logger.debug(
                    "category_action_orig_msg_lookup_failed",
                    incident_id=_incident_id_resolved,
                    error=str(_msg_err),
                )
                _orig_msg = None
            try:
                _payload = {
                    "chat_id": settings.OPENCLAW_TG_CHAT_ID,
                    "text": _result.result_text,
                    "parse_mode": "HTML",
                }
                if _orig_msg:
                    _payload["reply_to_message_id"] = _orig_msg
                # NOTE(review): this posts directly instead of going through
                # self._send_request, and the HTTP response is not inspected.
                await self._http_client.post(
                    f"https://api.telegram.org/bot{settings.OPENCLAW_TG_BOT_TOKEN}/sendMessage",
                    json=_payload,
                )
            except Exception as _re:
                logger.warning("category_action_reply_send_failed", error=str(_re))

            # Audit log: completion
            logger.info(
                "category_write_action_audit_complete",
                action=action,
                approval_id=approval_id,
                user_id=user_id,
                success=_result.success,
                error=_result.error,
                duration_ms=round(_result.duration_ms, 1),
            )

            return {
                "action": action,
                "approval_id": approval_id,
                "user": user,
                "success": _result.success,
                "category_action": True,
            }

        # ===================================================================
        # Step 2: auto-tuning (shadow mode)
        # ===================================================================
        auto_tuning_result = None
        if action == "tune":
            auto_tuning_result = await self._handle_auto_tuning(
                approval_id=approval_id,
                user_id=user_id,
                username=username,
            )
            # Answer the callback query
            await self._answer_callback(
                callback_query_id,
                "tune",
                text="⚡ 調優指令已記錄 (Shadow Mode)",
            )
            # Update the message
            await self._update_message_after_action(
                message_id=message_id,
                action="tune",
                username=username,
                original_text=original_text,
                extra_info=auto_tuning_result.get("command", ""),
            )

            return {
                "action": action,
                "approval_id": approval_id,
                "user": user,
                "success": True,
                "auto_tuning_result": auto_tuning_result,
            }

        # ===================================================================
        # Step 2.5: snooze / silence (2026-03-27 P1 optimisation)
        # ===================================================================
        if action == "snooze":
            snooze_result = await self._handle_snooze(
                approval_id=approval_id,
                username=username,
            )
            await self._answer_callback(
                callback_query_id,
                "snooze",
                text="⏰ 30 分鐘後再提醒",
            )
            await self._update_message_after_action(
                message_id=message_id,
                action="snooze",
                username=username,
                original_text=original_text,
            )
            return {
                "action": action,
                "approval_id": approval_id,
                "user": user,
                "success": True,
                "snooze_result": snooze_result,
            }

        if action == "silence":
            silence_result = await self._handle_silence(
                approval_id=approval_id,
                username=username,
                original_text=original_text,
            )
            await self._answer_callback(
                callback_query_id,
                "silence",
                text="🔕 此類告警靜默 1 小時",
            )
            await self._update_message_after_action(
                message_id=message_id,
                action="silence",
                username=username,
                original_text=original_text,
                extra_info=silence_result.get("resource_name", ""),
            )
            return {
                "action": action,
                "approval_id": approval_id,
                "user": user,
                "success": True,
                "silence_result": silence_result,
            }

        # ===================================================================
        # Step 2.8: ADR-071-H manual fix record (TYPE-4)
        # The user pressed "record manual fix": prompt for the steps here;
        # the steps themselves are collected by the /done flow in
        # handle_message().
        # ===================================================================
        if action == "log_manual_fix":
            await self._answer_callback(
                callback_query_id,
                "log_manual_fix",
                text="📝 請輸入修復步驟,完成後傳送 /done",
            )
            # Store the "waiting for manual fix input" state in Redis
            try:
                redis = get_redis()
                await redis.setex(
                    f"manual_fix_pending:{user_id}",
                    1800,  # 30 minutes
                    approval_id,
                )
            except Exception as _e:
                logger.warning("manual_fix_pending_store_failed", error=str(_e))

            await self._send_request(
                "sendMessage",
                {
                    "chat_id": settings.OPENCLAW_TG_CHAT_ID,
                    "text": (
                        "📝 <b>手動修復記錄</b>\n"
                        "━━━━━━━━━━━━━━━━━━━\n"
                        "請輸入您的修復步驟(可多行)。\n"
                        "輸入完畢後傳送 <code>/done</code>\n\n"
                        "<i>30 分鐘內有效</i>"
                    ),
                    "parse_mode": "HTML",
                },
            )
            return {
                "action": action,
                "approval_id": approval_id,
                "user": user,
                "success": True,
                "waiting_for_manual_fix": True,
            }

        # ===================================================================
        # Step 3: answer the callback query (approve / reject)
        # ===================================================================
        await self._answer_callback(callback_query_id, action)

        # ===================================================================
        # Step 4: update the message (original content + approval stamp)
        # ===================================================================
        await self._update_message_after_action(
            message_id=message_id,
            action=action,
            username=username,
            original_text=original_text,
        )

        logger.info(
            "telegram_callback_processed",
            action=action,
            approval_id=approval_id,
            user_id=user_id,
        )

        return {
            "action": action,
            "approval_id": approval_id,
            "user": user,
            "success": True,
        }

    except UserNotWhitelistedError as e:
        logger.warning("telegram_callback_denied", error=str(e), user_id=user_id)
        await self._answer_callback(
            callback_query_id,
            "denied",
            text="⛔ 您沒有簽核權限",
        )
        return {"success": False, "error": str(e)}

    except NonceReplayError as e:
        logger.warning("telegram_callback_replay", error=str(e))
        await self._answer_callback(
            callback_query_id,
            "replay",
            text="⚠️ 此操作已處理過",
        )
        return {"success": False, "error": str(e)}

    except Exception as e:
        logger.error("telegram_callback_error", error=str(e))
        await self._answer_callback(
            callback_query_id,
            "error",
            text="❌ 處理失敗",
        )
        return {"success": False, "error": str(e)}
|
||
|
||
async def _check_incident_state_guard(
    self,
    approval_id: str,
    callback_query_id: str,
    message_id: int,
    original_text: str,
) -> dict | None:
    """
    ADR-071-D state-machine guard.

    Resolves ``approval_id`` to its approval record and to the incident that
    record references, then decides from the incident's current status
    whether the callback may continue:

    - RESOLVED / CLOSED: answer the callback, append a "resolved" stamp to
      the card and remove its buttons, then return a ``blocked`` dict.
    - MITIGATING: answer the callback with an "in progress" hint and return
      a ``blocked`` dict so the action is not triggered twice.
    - Anything else, or any lookup failure: return ``None`` so the main
      flow proceeds.

    Args:
        approval_id: Approval ID; parsed as a UUID for the lookup.
        callback_query_id: Telegram callback query answered when blocking.
        message_id: ID of the card message edited when blocking.
        original_text: Current card text; HTML-escaped and kept above the stamp.

    Returns:
        ``{"blocked": True, "reason": ..., "approval_id": ...}`` when the
        action must not run, otherwise ``None``.

    2026-04-11 Claude Sonnet 4.6 (ADR-071-D)
    """
    try:
        from uuid import UUID
        from src.services.approval_db import get_approval_service
        from src.repositories.incident_repository import get_incident_repository
        from src.models.incident import IncidentStatus

        approval_svc = get_approval_service()
        try:
            approval = await approval_svc.get_approval_by_id(UUID(approval_id))
        except Exception as lookup_err:
            # Malformed approval_id or failed lookup: let the main flow handle it.
            logger.debug(
                "state_guard_approval_lookup_skipped",
                approval_id=approval_id,
                error=str(lookup_err),
            )
            return None

        if not approval or not approval.incident_id:
            return None  # No linked incident: let the callback through.

        incident_repo = get_incident_repository()
        incident = await incident_repo.get_by_id(approval.incident_id)
        if not incident:
            return None

        if incident.status in (IncidentStatus.RESOLVED, IncidentStatus.CLOSED):
            resolved_at = (
                incident.resolved_at.strftime("%Y-%m-%d %H:%M")
                if incident.resolved_at
                else "未知時間"
            )
            await self._answer_callback(
                callback_query_id,
                "blocked",
                text="✅ 此事件已解決",
            )
            try:
                separator = "──────────────"
                safe_original = html.escape(original_text) if original_text else ""
                stamp = f"✅ 此事件已於 {resolved_at} 解決"
                await self._send_request("editMessageText", {
                    "chat_id": self.chat_id,
                    "message_id": message_id,
                    "text": f"{safe_original}\n{separator}\n{stamp}" if safe_original else stamp,
                    "parse_mode": "HTML",
                    "reply_markup": {"inline_keyboard": []},
                    "disable_web_page_preview": True,
                })
            except Exception as edit_err:
                # Text edit failed; as a fallback, at least remove the buttons.
                logger.warning(
                    "state_guard_stamp_failed",
                    message_id=message_id,
                    error=str(edit_err),
                )
                try:
                    await self._send_request("editMessageReplyMarkup", {
                        "chat_id": self.chat_id,
                        "message_id": message_id,
                        "reply_markup": {"inline_keyboard": []},
                    })
                except Exception as markup_err:
                    # Best effort only: the block decision below still stands.
                    logger.warning(
                        "state_guard_remove_buttons_failed",
                        message_id=message_id,
                        error=str(markup_err),
                    )
            logger.info(
                "state_guard_blocked_resolved",
                approval_id=approval_id,
                incident_id=approval.incident_id,
                incident_status=incident.status.value,
            )
            return {"blocked": True, "reason": "already_resolved", "approval_id": approval_id}

        if incident.status == IncidentStatus.MITIGATING:
            await self._answer_callback(
                callback_query_id,
                "blocked",
                text="⏳ 正在修復中,請稍候...",
            )
            logger.info(
                "state_guard_blocked_mitigating",
                approval_id=approval_id,
                incident_id=approval.incident_id,
            )
            return {"blocked": True, "reason": "already_executing", "approval_id": approval_id}

    except Exception as e:
        # A failing guard must never block the main flow.
        logger.warning("state_guard_error", approval_id=approval_id, error=str(e))

    return None
|
||
|
||
async def handle_manual_fix_done(
    self,
    user_id: int,
    username: str,
    fix_steps: str,
) -> dict:
    """
    ADR-071-H: record the manual fix steps a user submitted with /done.

    Flow:
    1. Read the pending approval_id for this user from Redis and delete the key.
    2. Load the approval record and the incident it references.
    3. Update the incident's manual_fix_steps and manual_fix_by columns.
    4. Append a MANUAL_FIX_RECORDED entry to the alert operation log.
    5. Re-read the incident and run the KM conversion service on it.
    6. Send a confirmation message to Telegram.

    Every early-exit path tells the user what went wrong, because the
    pending key has already been consumed by then.

    Args:
        user_id: Telegram user ID.
        username: Telegram username; falls back to ``str(user_id)`` when empty.
        fix_steps: Fix steps entered by the user.

    Returns:
        ``{"success": True, "incident_id": ...}`` on success, otherwise
        ``{"success": False, "reason": ...}`` or ``{"success": False, "error": ...}``.
    """
    try:
        from src.core.redis_client import get_redis as _get_redis
        redis = _get_redis()

        pending_key = f"manual_fix_pending:{user_id}"
        approval_id_bytes = await redis.get(pending_key)
        if not approval_id_bytes:
            await self._send_request("sendMessage", {
                "chat_id": settings.OPENCLAW_TG_CHAT_ID,
                "text": "⚠️ 找不到待記錄的修復任務,或已逾時。",
                "parse_mode": "HTML",
            })
            return {"success": False, "reason": "no_pending_task"}

        # The stored value may be bytes or str; the code handles both.
        approval_id = approval_id_bytes.decode() if isinstance(approval_id_bytes, bytes) else str(approval_id_bytes)
        await redis.delete(pending_key)

        # Approval record -> incident
        from src.repositories.incident_repository import IncidentDBRepository
        from src.repositories.approval_repository import ApprovalDBRepository

        approval_repo = ApprovalDBRepository()
        approval = await approval_repo.get_by_approval_id(approval_id)
        if not approval:
            await self._send_request("sendMessage", {
                "chat_id": settings.OPENCLAW_TG_CHAT_ID,
                "text": f"⚠️ 找不到簽核單 <code>{html.escape(approval_id)}</code>",
                "parse_mode": "HTML",
            })
            return {"success": False, "reason": "approval_not_found"}

        incident_repo = IncidentDBRepository()
        incident = await incident_repo.get_by_id(approval.incident_id)
        if not incident:
            # Previously a silent return: the user got no reply although the
            # pending task was already removed from Redis.
            logger.warning(
                "manual_fix_incident_not_found",
                approval_id=approval_id,
                incident_id=approval.incident_id,
            )
            await self._send_request("sendMessage", {
                "chat_id": settings.OPENCLAW_TG_CHAT_ID,
                "text": f"⚠️ 找不到簽核單 <code>{html.escape(approval_id)}</code> 關聯的事件,修復步驟未記錄。",
                "parse_mode": "HTML",
            })
            return {"success": False, "reason": "incident_not_found"}

        # Persist manual_fix_steps and manual_fix_by on the incident row.
        from src.db.base import get_db_context
        from src.db.models import Incident as IncidentORM
        from sqlalchemy import update

        recorded_by = username or str(user_id)
        async with get_db_context() as db:
            await db.execute(
                update(IncidentORM)
                .where(IncidentORM.incident_id == approval.incident_id)
                .values(
                    manual_fix_steps=fix_steps,
                    manual_fix_by=recorded_by,
                )
            )
            await db.commit()

        # Operation log entry; the detail is capped at 500 characters.
        from src.repositories.alert_operation_log_repository import get_alert_operation_log_repository
        op_log_repo = get_alert_operation_log_repository()
        await op_log_repo.append(
            event_type="MANUAL_FIX_RECORDED",
            incident_id=approval.incident_id,
            approval_id=approval_id,
            actor=recorded_by,
            action_detail=fix_steps[:500],
            success=True,
        )

        # Run KM conversion with a direct await (not create_task()) to avoid
        # a race after the DB session is closed. The incident is re-read so
        # the converter sees the manual_fix_steps just written.
        incident_updated = await incident_repo.get_by_id(approval.incident_id)
        if incident_updated:
            from src.services.km_conversion_service import get_km_conversion_service
            km_svc = get_km_conversion_service()
            try:
                await km_svc.convert(incident_updated)
            except Exception as _km_err:
                # Conversion failure is logged but does not fail the recording.
                logger.warning(
                    "km_conversion_failed",
                    incident_id=approval.incident_id,
                    error=str(_km_err),
                )

        # Confirmation reply
        await self._send_request("sendMessage", {
            "chat_id": settings.OPENCLAW_TG_CHAT_ID,
            "text": (
                "✅ <b>手動修復步驟已記錄</b>\n"
                "━━━━━━━━━━━━━━━━━━━\n"
                f"📋 事件: <code>{html.escape(approval.incident_id)}</code>\n"
                f"👤 記錄者: @{html.escape(recorded_by)}\n\n"
                "<i>正在建立草稿 Playbook,請至 AWOOOI 審核後生效。</i>"
            ),
            "parse_mode": "HTML",
        })

        logger.info(
            "manual_fix_recorded",
            incident_id=approval.incident_id,
            user=username,
        )
        return {"success": True, "incident_id": approval.incident_id}

    except Exception as e:
        logger.error("handle_manual_fix_done_failed", error=str(e))
        return {"success": False, "error": str(e)}
|
||
|
||
async def _handle_auto_tuning(
    self,
    approval_id: str,
    user_id: int,
    username: str,
) -> dict:
    """
    Handle an auto-tuning request (Shadow Mode).

    Nothing is executed here in either mode. With
    ``settings.SHADOW_MODE_ENABLED`` set, the request is logged and echoed
    to stdout; otherwise it is refused with a "multi-sig required" error.

    Args:
        approval_id: Approval ID.
        user_id: Telegram ID of the requesting user.
        username: Name of the requesting user.

    Returns:
        dict: Tuning result; ``executed`` is always ``False``.
    """
    try:
        # Shadow Mode only records the request. A production path would need
        # to read the full tuning command from the ApprovalRecord.
        log_message = f"[SHADOW MODE] 自動調優請求 - 簽核單: {approval_id}"

        if not settings.SHADOW_MODE_ENABLED:
            logger.warning(
                "auto_tuning_blocked_not_shadow_mode",
                approval_id=approval_id,
                message="Production execution requires multi-sig approval",
            )
            return {
                "executed": False,
                "shadow_mode": False,
                "approval_id": approval_id,
                "error": "Production execution requires multi-sig approval",
            }

        logger.info(
            "shadow_mode_auto_tuning_triggered",
            approval_id=approval_id,
            user_id=user_id,
            username=username,
            shadow_mode=True,
        )
        banner = "=" * 60
        print(f"\n{banner}")
        print("[SHADOW MODE] AI 生成的調優指令請求")
        print(f"簽核單: {approval_id}")
        print(f"執行者: @{username} (ID: {user_id})")
        print(f"時間: {datetime.now(UTC).isoformat()}")
        print("狀態: 僅記錄,未實際執行")
        print(f"{banner}\n")

        return {
            "executed": False,
            "shadow_mode": True,
            "approval_id": approval_id,
            "triggered_by": username,
            "command": "kubectl command logged (see server logs)",
            "log": log_message,
        }

    except Exception as e:
        logger.error("auto_tuning_error", error=str(e), approval_id=approval_id)
        return {
            "executed": False,
            "error": str(e),
        }
|
||
|
||
async def _handle_snooze(
    self,
    approval_id: str,
    username: str,
) -> dict:
    """
    Handle "remind me later" (2026-03-27 P1 optimization).

    Writes a snooze marker to Redis under ``SNOOZE_KEY_PREFIX + approval_id``
    with a TTL of ``SNOOZE_TTL_SECONDS``.

    Args:
        approval_id: Approval ID.
        username: Name of the user who pressed snooze.

    Returns:
        dict: On success ``snoozed=True`` plus ``snooze_until`` (ISO-8601
        UTC time at which the marker expires) and ``ttl_minutes``; on
        failure ``snoozed=False`` plus ``error``.
    """
    try:
        from datetime import timedelta

        redis = get_redis()
        snooze_key = f"{SNOOZE_KEY_PREFIX}{approval_id}"
        now = datetime.now(UTC)

        # Marker value records who snoozed and when; Redis expires it after the TTL.
        await redis.setex(
            snooze_key,
            SNOOZE_TTL_SECONDS,
            f"{username}:{now.isoformat()}",
        )

        logger.info(
            "telegram_snooze_set",
            approval_id=approval_id,
            username=username,
            ttl_minutes=SNOOZE_TTL_SECONDS // 60,
        )

        return {
            "snoozed": True,
            "approval_id": approval_id,
            # Expiry time, not the time the button was pressed.
            "snooze_until": (now + timedelta(seconds=SNOOZE_TTL_SECONDS)).isoformat(),
            "ttl_minutes": SNOOZE_TTL_SECONDS // 60,
        }

    except Exception as e:
        logger.error("snooze_error", error=str(e), approval_id=approval_id)
        return {
            "snoozed": False,
            "error": str(e),
        }
|
||
|
||
async def _handle_silence(
    self,
    approval_id: str,
    username: str,
    original_text: str,
) -> dict:
    """
    Handle "silence for 1 hour" (2026-03-27 P1 optimization).

    Parses the resource name from the card text and writes a silence marker
    to Redis under ``SILENCE_KEY_PREFIX + resource_name`` with a TTL of
    ``SILENCE_TTL_SECONDS``.

    Args:
        approval_id: Approval ID.
        username: Name of the user who pressed silence.
        original_text: Card text; the resource name is taken from the first
            line containing ``🎯 資源:``.

    Returns:
        dict: On success ``silenced=True`` plus ``resource_name``,
        ``silence_until`` (ISO-8601 UTC expiry time) and ``ttl_hours``; on
        failure ``silenced=False`` plus ``error``.
    """
    try:
        from datetime import timedelta

        redis = get_redis()

        # Take everything after the marker so names that themselves contain
        # a colon (e.g. "namespace:pod") are kept whole.
        marker = "🎯 資源:"
        resource_name = "unknown"
        for line in original_text.split("\n"):
            if marker in line:
                resource_name = line.partition(marker)[2].strip() or "unknown"
                break

        silence_key = f"{SILENCE_KEY_PREFIX}{resource_name}"
        now = datetime.now(UTC)

        # Marker value records who silenced, when, and from which approval.
        await redis.setex(
            silence_key,
            SILENCE_TTL_SECONDS,
            f"{username}:{now.isoformat()}:{approval_id}",
        )

        logger.info(
            "telegram_silence_set",
            approval_id=approval_id,
            resource_name=resource_name,
            username=username,
            ttl_hours=SILENCE_TTL_SECONDS // 3600,
        )

        return {
            "silenced": True,
            "approval_id": approval_id,
            "resource_name": resource_name,
            # Expiry time, not the time the button was pressed.
            "silence_until": (now + timedelta(seconds=SILENCE_TTL_SECONDS)).isoformat(),
            "ttl_hours": SILENCE_TTL_SECONDS // 3600,
        }

    except Exception as e:
        logger.error("silence_error", error=str(e), approval_id=approval_id)
        return {
            "silenced": False,
            "error": str(e),
        }
|
||
|
||
async def _answer_callback(
|
||
self,
|
||
callback_query_id: str,
|
||
action: str,
|
||
text: str | None = None,
|
||
) -> None:
|
||
"""回應 Callback Query"""
|
||
if text is None:
|
||
if action == "approve":
|
||
text = "✅ 已簽核"
|
||
elif action == "reject":
|
||
text = "❌ 已拒絕"
|
||
elif action == "tune":
|
||
text = "⚡ 調優中..."
|
||
elif action == "snooze":
|
||
text = "⏰ 30 分鐘後再提醒"
|
||
elif action == "silence":
|
||
text = "🔕 此類告警靜默 1 小時"
|
||
else:
|
||
text = "✓ 已處理"
|
||
|
||
await self._send_request("answerCallbackQuery", {
|
||
"callback_query_id": callback_query_id,
|
||
"text": text,
|
||
"show_alert": False,
|
||
})
|
||
|
||
async def _update_message_after_action(
|
||
self,
|
||
message_id: int,
|
||
action: str,
|
||
username: str,
|
||
original_text: str,
|
||
extra_info: str = "",
|
||
) -> None:
|
||
"""
|
||
更新訊息: 保留原始卡片內容 + 簽核/調優鋼印
|
||
|
||
UX 要求:
|
||
- 嚴禁覆蓋原始內容
|
||
- 必須在底部加上分隔線與簽核狀態
|
||
- 移除所有按鈕
|
||
"""
|
||
# 構建鋼印 (2026-03-27 ogt: 新增 snooze/silence)
|
||
if action == "approve":
|
||
stamp = f"✅ 已由 @{username} 授權執行"
|
||
elif action == "reject":
|
||
stamp = f"❌ 已由 @{username} 拒絕執行"
|
||
elif action == "tune":
|
||
stamp = f"⚡ 已由 @{username} 觸發自動調優 (Shadow Mode)"
|
||
if extra_info:
|
||
stamp += "\n📝 指令已記錄"
|
||
elif action == "snooze":
|
||
stamp = f"⏰ @{username} 已設定 30 分鐘後再提醒"
|
||
elif action == "silence":
|
||
resource_info = f" ({extra_info})" if extra_info else ""
|
||
stamp = f"🔕 @{username} 已靜默此類告警 1 小時{resource_info}"
|
||
else:
|
||
stamp = f"✓ 已由 @{username} 處理"
|
||
|
||
# Step 1: 先移除按鈕 (確保按鈕一定消失,即使文字更新失敗)
|
||
# 2026-04-05 Claude Code: editMessageText 因 HTML 特殊字符可能失敗,
|
||
# 先用 editMessageReplyMarkup 確保按鈕移除,再嘗試更新文字
|
||
try:
|
||
await self._send_request("editMessageReplyMarkup", {
|
||
"chat_id": self.chat_id,
|
||
"message_id": message_id,
|
||
"reply_markup": {"inline_keyboard": []},
|
||
})
|
||
except TelegramGatewayError as e:
|
||
logger.warning("telegram_remove_buttons_failed", message_id=message_id, error=str(e))
|
||
|
||
# Step 2: 嘗試更新文字 (原始文字已轉義,確保 HTML 安全)
|
||
separator = "──────────────"
|
||
safe_original = html.escape(original_text)
|
||
safe_updated_text = f"{safe_original}\n{separator}\n{stamp}"
|
||
try:
|
||
await self._send_request("editMessageText", {
|
||
"chat_id": self.chat_id,
|
||
"message_id": message_id,
|
||
"text": safe_updated_text,
|
||
"parse_mode": "HTML",
|
||
"reply_markup": {"inline_keyboard": []},
|
||
"disable_web_page_preview": True,
|
||
})
|
||
except TelegramGatewayError as e:
|
||
# 文字更新失敗不影響整體流程,按鈕已移除
|
||
logger.warning("telegram_update_text_failed", message_id=message_id, error=str(e))
|
||
|
||
async def mark_auto_repaired(
    self,
    approval_id: str,
    playbook_name: str,
    execution_time_ms: int,
    success: bool = True,
) -> bool:
    """
    Update the Telegram card once an auto-repair has finished.

    1. Remove the card's inline buttons.
    2. Reply to the original message with the repair outcome.

    The card's message ID is read from Redis key ``tg_approval:{approval_id}``.

    Args:
        approval_id: Approval ID used to find the card message.
        playbook_name: Playbook that was run; shown HTML-escaped.
        execution_time_ms: Execution time in milliseconds.
        success: Whether the repair succeeded; selects the headline.

    Returns:
        bool: True when the outcome reply was sent; False when no message ID
        is stored or anything other than the button removal failed.

    2026-04-10 Claude Sonnet 4.6 Asia/Taipei (ADR-068 closed loop)
    """
    try:
        stored = await get_redis().get(f"tg_approval:{approval_id}")
        if not stored:
            logger.warning("mark_auto_repaired_no_msg_id", approval_id=approval_id)
            return False

        message_id = int(stored)
        no_buttons = {"inline_keyboard": []}

        # Remove the buttons; a failure here is logged and does not stop the reply.
        try:
            await self._send_request(
                "editMessageReplyMarkup",
                {
                    "chat_id": self.chat_id,
                    "message_id": message_id,
                    "reply_markup": no_buttons,
                },
            )
        except TelegramGatewayError as e:
            logger.warning("mark_auto_repaired_remove_buttons_failed", message_id=message_id, error=str(e))

        # Reply to the original message with the outcome.
        if success:
            headline = "✅ 已自動修復"
        else:
            headline = "❌ 自動修復失敗"
        outcome_lines = [
            headline,
            f"Playbook: <code>{html.escape(playbook_name)}</code>",
            f"耗時: {execution_time_ms}ms",
        ]
        await self._send_request(
            "sendMessage",
            {
                "chat_id": self.chat_id,
                "text": "\n".join(outcome_lines),
                "parse_mode": "HTML",
                "reply_parameters": {"message_id": message_id},
            },
        )
        return True

    except Exception as e:
        logger.warning("mark_auto_repaired_failed", approval_id=approval_id, error=str(e))
        return False
|
||
|
||
async def append_incident_update(
    self,
    incident_id: str,
    status_line: str,
    keep_info_buttons: bool = True,
) -> bool:
    """
    Attach a status update to the original alert message and swap its buttons.

    Used after an auto-repair completes or fails so the status change stays
    threaded on the original message.

    Flow:
    1. Read the message ID from Redis key ``tg_msg:{incident_id}``.
    2. editMessageReplyMarkup: replace the keyboard with either the
       detail / re-diagnose / history row or an empty keyboard.
    3. sendMessage: post ``status_line`` as a reply to the original message.
       The original text is left untouched.

    Args:
        incident_id: Incident ID, used for the Redis lookup and callback data.
        status_line: Status text to post; sent with ``parse_mode="HTML"``.
        keep_info_buttons: Keep the detail / re-diagnose / history buttons
            (default True); otherwise all buttons are removed.

    Returns:
        bool: False when no usable message ID is stored (the caller must
        handle the fallback). True otherwise, including when either Telegram
        request failed and was only logged.

    2026-04-09 Claude Sonnet 4.6 Asia/Taipei (requirement: status changes continue on the original message)
    """
    from src.core.redis_client import get_redis

    stored = await get_redis().get(f"tg_msg:{incident_id}")
    if not stored:
        logger.warning(
            "append_incident_update_no_message_id",
            incident_id=incident_id,
            reason="message_id not in Redis",
        )
        return False

    try:
        message_id = int(stored)
    except (ValueError, TypeError):
        logger.warning("append_incident_update_invalid_message_id", stored=stored)
        return False

    # The original text is not re-sent: editMessageText needs the full new
    # text, so the update goes out as a reply instead and only the keyboard
    # of the original message is edited.

    # Step 1: swap the buttons.
    info_row = (
        [
            {"text": "📋 詳情", "callback_data": f"detail:{incident_id}"},
            {"text": "🔄 重診", "callback_data": f"reanalyze:{incident_id}"},
            {"text": "📊 歷史", "callback_data": f"history:{incident_id}"},
        ]
        if keep_info_buttons
        else None
    )
    new_keyboard = {"inline_keyboard": [info_row] if info_row is not None else []}

    try:
        await self._send_request(
            "editMessageReplyMarkup",
            {
                "chat_id": self.chat_id,
                "message_id": message_id,
                "reply_markup": new_keyboard,
            },
        )
    except TelegramGatewayError as e:
        logger.warning("append_incident_update_edit_buttons_failed", message_id=message_id, error=str(e))

    # Step 2: post the status as a reply to the original message.
    try:
        await self._send_request(
            "sendMessage",
            {
                "chat_id": self.chat_id,
                "text": status_line,
                "parse_mode": "HTML",
                "reply_to_message_id": message_id,
                "disable_web_page_preview": True,
            },
        )
    except TelegramGatewayError as e:
        logger.warning("append_incident_update_reply_failed", message_id=message_id, error=str(e))

    logger.info(
        "append_incident_update_done",
        incident_id=incident_id,
        message_id=message_id,
    )
    return True
|
||
|
||
async def _dispatch_category_action(
    self,
    callback_query_id: str,
    action: str,
    incident_id: str,
    user_id: int,
) -> None:
    """
    Phase 5 Sprint 5.1 (2026-04-14 Claude Sonnet 4.6):
    Fallback dispatcher: looks up an unknown info action in the action
    registry (callback_action_spec.yaml).

    Flow:
    1. Look the action up in the registry.
    2. Not found: answer the callback with the "unknown action" toast.
    3. Found: acknowledge the callback, read labels from the incident's
       first signal, run dispatch_action, and send the result as a reply to
       the original card when its message ID is known.

    Note: this method only handles info (read-only) actions; nonce (write)
    actions take a different path.

    Args:
        callback_query_id: Telegram callback query to answer.
        action: Action name to look up and dispatch.
        incident_id: Incident the action applies to.
        user_id: Telegram ID of the user who pressed the button.
    """
    from src.services.callback_dispatcher import dispatch_action, get_action_spec

    spec = get_action_spec(action)
    if not spec:
        await self._answer_callback(callback_query_id, action, text="⚠️ 未知操作")
        return

    # Acknowledge the callback immediately to avoid a Telegram-side timeout.
    await self._answer_callback(
        callback_query_id, action, text=f"{spec.emoji} 執行中..."
    )

    # Labels from the incident feed template substitution; a failed lookup
    # is non-fatal and leaves the labels empty.
    labels: dict = {}
    try:
        from src.repositories.incident_repository import get_incident_repository
        repo = get_incident_repository()
        incident = await repo.get_by_id(incident_id)
        if incident and incident.signals:
            labels = incident.signals[0].labels or {}
    except Exception as _e:
        logger.debug("dispatch_labels_lookup_failed", incident_id=incident_id, error=str(_e))

    # Dispatch
    result = await dispatch_action(
        action_name=action,
        incident_id=incident_id,
        user_id=user_id,
        labels=labels,
    )

    # Reply to the original card: its message ID is stored in Redis under tg_msg.
    try:
        from src.core.redis_client import get_redis
        redis = get_redis()
        msg_id_raw = await redis.get(f"tg_msg:{incident_id}")
        orig_msg_id = int(msg_id_raw) if msg_id_raw else None
    except Exception:
        orig_msg_id = None

    try:
        payload: dict = {
            "chat_id": settings.OPENCLAW_TG_CHAT_ID,
            "text": result.result_text,
            "parse_mode": "HTML",
        }
        if orig_msg_id:
            payload["reply_to_message_id"] = orig_msg_id
        response = await self._http_client.post(
            f"https://api.telegram.org/bot{settings.OPENCLAW_TG_BOT_TOKEN}/sendMessage",
            json=payload,
        )
        # Without this check an HTTP error response from Telegram would
        # still be logged as "reply sent".
        response.raise_for_status()
        logger.info(
            "category_action_reply_sent",
            action=action,
            incident_id=incident_id,
            success=result.success,
            duration_ms=round(result.duration_ms, 1),
        )
    except Exception as _e:
        logger.warning("category_action_reply_failed", action=action, error=str(_e))
|
||
|
||
async def _send_incident_detail(self, incident_id: str) -> None:
    """
    ADR-050 P2: send an incident detail message (the original approval card
    is not modified).

    Builds an HTML summary with ID, status, severity, up to three affected
    services, the AI decision chain (model, hypothesis, confidence bar,
    first root cause), creation time in Asia/Taipei and frequency stats,
    then sends it through ``send_notification``. A failure is logged and
    reported to the chat as a short error message.

    Args:
        incident_id: ID of the incident to describe.

    2026-04-01 Claude Code (ADR-050 P2): detail button handler
    """
    # Deferred import to avoid a circular dependency.
    from src.repositories.incident_repository import get_incident_repository

    try:
        repo = get_incident_repository()
        incident = await repo.get_by_id(incident_id)

        if not incident:
            await self.send_notification(f"⚠️ 找不到事件 <code>{html.escape(incident_id)}</code>")
            return

        dc = incident.decision_chain
        # Treat a missing confidence as 0 and clamp to [0, 1] so the bar is
        # always exactly 10 cells wide.
        confidence = (dc.confidence or 0) if dc else 0
        filled = int(min(max(confidence, 0), 1) * 10)
        confidence_bar = "█" * filled + "░" * (10 - filled)

        lines = [
            "📋 <b>事件詳情</b>",
            "",
            f"🔖 <b>ID:</b> <code>{html.escape(incident.incident_id)}</code>",
            f"📊 <b>狀態:</b> {incident.status.value}",
            f"⚡ <b>嚴重度:</b> {incident.severity.value}",
        ]

        if incident.affected_services:
            lines.append(f"🎯 <b>受影響服務:</b> {', '.join(html.escape(s) for s in incident.affected_services[:3])}")

        if dc:
            lines += [
                "",
                f"🤖 <b>AI 分析</b> ({html.escape(dc.model_used)})",
                f"💡 {html.escape(dc.hypothesis)}",
                f"📈 信心: [{confidence_bar}] {confidence:.0%}",
            ]
            if dc.probable_root_causes:
                lines.append(f"🔍 根因: {html.escape(dc.probable_root_causes[0][:100])}")

        # 2026-04-02 Claude Code: timezone fix; times are shown in Asia/Taipei.
        from zoneinfo import ZoneInfo
        created_taipei = incident.created_at.astimezone(ZoneInfo("Asia/Taipei")) if incident.created_at else None
        lines += [
            "",
            f"🕐 <b>建立:</b> {created_taipei.strftime('%m/%d %H:%M') if created_taipei else 'N/A'}",
        ]

        if incident.frequency_stats:
            fs = incident.frequency_stats
            lines.append(f"📉 <b>頻率:</b> 1h={fs.count_1h} 24h={fs.count_24h} 7d={fs.count_7d}")

        await self.send_notification("\n".join(lines))

    except Exception as e:
        logger.warning("send_incident_detail_failed", incident_id=incident_id, error=str(e))
        await self.send_notification(f"⚠️ 無法取得事件詳情: {html.escape(str(e)[:100])}")
|
||
|
||
async def _send_incident_history(self, incident_id: str) -> None:
    """
    ADR-050 P2: send an incident frequency-statistics message.

    Phase 27 two-layer strategy (2026-04-10 ogt):
    - Layer 1: the incident's ``frequency_stats`` snapshot taken at creation.
    - Layer 2: cumulative disposition stats from the anomaly counter, looked
      up by anomaly key (the message text describes these as kept 35 days).

    When the snapshot is missing, an anomaly key is derived from the first
    signal (alert name, first affected service, namespace and error_type
    labels joined with ``:``).

    Args:
        incident_id: ID of the incident to report on.
    """
    from src.repositories.incident_repository import get_incident_repository
    from src.services.anomaly_counter import get_anomaly_counter

    try:
        repo = get_incident_repository()
        incident = await repo.get_by_id(incident_id)

        if not incident:
            await self.send_notification(f"⚠️ 找不到事件 <code>{html.escape(incident_id)}</code>")
            return

        lines = [
            "📊 <b>事件歷史統計</b>",
            "",
            f"🔖 <code>{html.escape(incident_id)}</code>",
        ]

        # === Layer 1: DB snapshot (taken at creation) ===
        fs = incident.frequency_stats
        if fs:
            lines += [
                "",
                "📌 <b>建立時刻快照</b>",
                f"  1小時: {fs.count_1h} 次",
                f"  24小時: {fs.count_24h} 次",
                f"  7天: {fs.count_7d} 次",
                f"  30天: {fs.count_30d} 次",
            ]
            if fs.auto_repair_count > 0:
                lines.append(f"  自動修復: {fs.auto_repair_count} 次")
            if fs.last_repair_action:
                lines.append(f"  最後動作: {html.escape(fs.last_repair_action)}")
            if fs.escalation_level:
                lines.append(f"  升級等級: {html.escape(fs.escalation_level)}")
            if fs.anomaly_key:
                lines.append(f"🔑 告警鍵: <code>{html.escape(fs.anomaly_key)}</code>")
            anomaly_key = fs.anomaly_key
        else:
            lines += ["", "⚠️ 無建立時快照(舊 incident 或 Redis 已超期)"]
            # Try to derive the anomaly key from the first signal.
            anomaly_key = None
            if incident.signals:
                sig = incident.signals[0]
                parts = [
                    sig.alert_name or "",
                    incident.affected_services[0] if incident.affected_services else "",
                    (sig.labels or {}).get("namespace", ""),
                    (sig.labels or {}).get("error_type", ""),
                ]
                candidate = ":".join(p for p in parts if p)
                if candidate:
                    anomaly_key = candidate

        # === Layer 2: cumulative Redis stats ===
        if anomaly_key:
            try:
                counter = get_anomaly_counter()
                disposition = await counter.get_disposition_stats(anomaly_key)
                auto_r = disposition.get("auto_repair_count", 0)
                cold_s = disposition.get("cold_start_trust_count", 0)
                human_a = disposition.get("human_approved_count", 0)
                manual_r = disposition.get("manual_resolved_count", 0)
                total_res = auto_r + cold_s + human_a + manual_r
                if total_res > 0:
                    # Automation rate counts auto-repair and cold-start trust.
                    auto_rate = int((auto_r + cold_s) / total_res * 100)
                    lines += [
                        "",
                        f"📋 <b>累積處置分佈</b> (共 {total_res} 次,35天內)",
                        f"  🤖 自動修復: {auto_r}",
                        f"  ❄️ 冷啟動信任: {cold_s}",
                        f"  👤 人工審核: {human_a}",
                        f"  🔧 手動處理: {manual_r}",
                        f"  📈 自動化率: <b>{auto_rate}%</b>",
                    ]
                else:
                    lines += ["", "📋 <b>累積處置</b>: 尚無記錄 (Redis TTL 35天)"]
            except Exception as redis_err:
                logger.warning("incident_history_redis_error", error=str(redis_err))
                lines += ["", "⚠️ Redis 統計暫時無法取得"]

        await self.send_notification("\n".join(lines))

    except Exception as e:
        logger.warning("send_incident_history_failed", incident_id=incident_id, error=str(e))
        # Cap the error text before escaping, as the sibling handlers do, so
        # a long message cannot be cut in the middle of an HTML entity.
        await self.send_notification(f"⚠️ 無法取得歷史統計: {html.escape(str(e)[:100])}")
|
||
|
||
async def _send_reanalyze_result(self, incident_id: str) -> None:
    """
    ADR-050 P2: trigger a re-diagnosis and report the outcome as a message.

    Calls ``IncidentService.trigger_reanalysis()`` and sends the scheduling
    result as a new message. The original approval card is not modified, so
    the authorization flow is not disturbed.

    Args:
        incident_id: ID of the incident to re-diagnose.

    2026-04-01 Claude Code (ADR-050 P2): reanalyze button handler
    """
    from src.services.incident_service import get_incident_service

    try:
        outcome = await get_incident_service().trigger_reanalysis(incident_id)
        id_line = f"🔖 <code>{html.escape(incident_id)}</code>\n\n"

        if outcome["already_analyzing"]:
            heading = "⏳ <b>重診進行中</b>\n\n"
            tail = f"{html.escape(outcome['message'])}"
        elif outcome["triggered"]:
            heading = "🔄 <b>重診已排程</b>\n\n"
            tail = (
                f"✅ {html.escape(outcome['message'])}\n"
                "AI 分析結果將自動更新事件狀態。"
            )
        else:
            heading = "⚠️ <b>重診失敗</b>\n\n"
            tail = f"{html.escape(outcome['message'])}"

        await self.send_notification(heading + id_line + tail)

    except Exception as e:
        logger.warning("send_reanalyze_result_failed", incident_id=incident_id, error=str(e))
        await self.send_notification(
            f"⚠️ 重診觸發失敗: {html.escape(str(e)[:100])}"
        )
|
||
|
||
# =========================================================================
|
||
# Sprint 5.1 T1-T6: Data Safety Guardrail 通知場景
|
||
# (2026-04-08 Claude Sonnet 4.6 Asia/Taipei,ADR-062)
|
||
# =========================================================================
|
||
|
||
async def send_guardrail_blocked(
    self,
    service_name: str,
    alertname: str,
    reason: str,
) -> None:
    """
    T1: GUARDRAIL_BLOCKED. Notify that auto-repair was blocked for a service.

    All three values are HTML-escaped. Any failure is logged, not raised.

    Args:
        service_name: Name of the protected service.
        alertname: Alert that would have triggered the repair.
        reason: Why the repair was blocked.
    """
    try:
        rule = "━━━━━━━━━━━━━━━━━"
        message_lines = [
            "🚫 <b>[服務保護] 自動修復已阻擋</b>",
            rule,
            f"服務: <code>{html.escape(service_name)}</code>",
            f"告警: <code>{html.escape(alertname)}</code>",
            f"原因: {html.escape(reason)}",
            rule,
            "⚠️ 請人工評估並手動處理",
        ]
        await self.send_notification("\n".join(message_lines))
    except Exception as e:
        logger.error("t1_guardrail_blocked_notify_failed", service=service_name, error=str(e))
|
||
|
||
async def send_preflight_failed(
|
||
self,
|
||
service_name: str,
|
||
backup_age_hours: float,
|
||
max_age_hours: float,
|
||
backup_name: str | None,
|
||
) -> None:
|
||
"""T2: PRE_FLIGHT_FAILED + BACKUP_TRIGGERED — 備份過期,修復暫停"""
|
||
try:
|
||
backup_status = (
|
||
f"緊急備份: 已啟動 <code>{html.escape(backup_name)}</code>"
|
||
if backup_name
|
||
else "緊急備份: <b>啟動失敗</b>,請人工處理"
|
||
)
|
||
text = (
|
||
"⏸ <b>[Pre-flight 阻擋] 備份已過期,修復暫停</b>\n"
|
||
"━━━━━━━━━━━━━━━━━\n"
|
||
f"服務: <code>{html.escape(service_name)}</code>\n"
|
||
f"備份距今: {backup_age_hours:.1f} 小時(上限 {max_age_hours:.0f} 小時)\n"
|
||
f"{backup_status}\n"
|
||
"━━━━━━━━━━━━━━━━━\n"
|
||
"請等待備份完成後,人工重新評估修復方案"
|
||
)
|
||
await self.send_notification(text)
|
||
except Exception as e:
|
||
logger.error("t2_preflight_failed_notify_failed", service=service_name, error=str(e))
|
||
|
||
async def send_backup_result(
|
||
self,
|
||
backup_name: str,
|
||
success: bool,
|
||
error_msg: str | None = None,
|
||
) -> None:
|
||
"""T3: BACKUP_COMPLETED / BACKUP_FAILED — 緊急備份結果"""
|
||
try:
|
||
if success:
|
||
text = (
|
||
"✅ <b>緊急備份完成</b>\n"
|
||
f"備份: <code>{html.escape(backup_name)}</code>\n"
|
||
"可繼續手動執行修復"
|
||
)
|
||
else:
|
||
err = html.escape(error_msg or "未知錯誤")
|
||
text = (
|
||
"❌ <b>緊急備份失敗</b>\n"
|
||
f"備份: <code>{html.escape(backup_name)}</code>\n"
|
||
f"錯誤: {err}\n"
|
||
"請人工介入,備份異常"
|
||
)
|
||
await self.send_notification(text)
|
||
except Exception as e:
|
||
logger.error("t3_backup_result_notify_failed", backup=backup_name, error=str(e))
|
||
|
||
async def send_multisig_waiting(
    self,
    action: str,
    service_name: str,
    votes_received: int,
    votes_required: int,
    approval_id: str,
) -> None:
    """
    T4: APPROVAL_ESCALATED. Notify that the first vote is in and a second
    one is awaited.

    Any failure is logged, not raised.

    Args:
        action: Operation awaiting authorization.
        service_name: Target service.
        votes_received: Votes collected so far.
        votes_required: Votes needed.
        approval_id: Approval ID shown for reference.
    """
    try:
        rule = "━━━━━━━━━━━━━━━━━"
        message_lines = [
            "🔐 <b>[MultiSig] 等待第 2 票授權</b>",
            rule,
            f"操作: {html.escape(action)}",
            f"服務: <code>{html.escape(service_name)}</code>",
            "風險: CRITICAL(HITL 雙簽)",
            f"已獲授權: {votes_received}/{votes_required} 票",
            f"審核 ID: <code>{html.escape(approval_id)}</code>",
            rule,
            "請第二位審核者登入確認",
        ]
        await self.send_notification("\n".join(message_lines))
    except Exception as e:
        logger.error("t4_multisig_waiting_notify_failed", approval=approval_id, error=str(e))
|
||
|
||
async def send_multisig_approved(
    self,
    action: str,
    service_name: str,
) -> None:
    """
    T5: MultiSig complete (2/2). Notify that dual approval passed.

    Any failure is logged, not raised.

    Args:
        action: Operation that was authorized.
        service_name: Target service.
    """
    try:
        message_lines = [
            "✅ <b>[MultiSig 完成] 雙簽授權通過</b>",
            f"操作: {html.escape(action)}",
            f"服務: <code>{html.escape(service_name)}</code>",
            "授權: 2/2 票 開始執行...",
        ]
        await self.send_notification("\n".join(message_lines))
    except Exception as e:
        logger.error("t5_multisig_approved_notify_failed", service=service_name, error=str(e))
|
||
|
||
async def send_change_applied(
    self,
    operator: str,
    action_description: str,
    timestamp: str,
) -> None:
    """
    T6: CHANGE_APPLIED. Notify that a manual change was recorded.

    All three values are HTML-escaped. Any failure is logged, not raised.

    Args:
        operator: Who performed the change.
        action_description: What was done.
        timestamp: When it was done, as preformatted text.
    """
    try:
        message_lines = [
            "📝 <b>[變更記錄] 手動操作已記錄</b>",
            "━━━━━━━━━━━━━━━━━",
            f"操作者: {html.escape(operator)}",
            f"動作: {html.escape(action_description)}",
            f"時間: {html.escape(timestamp)}",
        ]
        await self.send_notification("\n".join(message_lines))
    except Exception as e:
        logger.error("t6_change_applied_notify_failed", operator=operator, error=str(e))
|
||
|
||
async def send_notification(
|
||
self,
|
||
text: str,
|
||
parse_mode: str = "HTML",
|
||
chat_id: str | int | None = None,
|
||
) -> dict:
|
||
"""
|
||
發送純文字通知
|
||
|
||
Args:
|
||
text: 訊息內容
|
||
parse_mode: 解析模式
|
||
|
||
Returns:
|
||
dict: API 回應
|
||
"""
|
||
payload = {
|
||
"chat_id": chat_id or self.chat_id,
|
||
"text": text[:500], # SOUL.md 字數限制
|
||
"parse_mode": parse_mode,
|
||
}
|
||
|
||
return await self._send_request("sendMessage", payload)
|
||
|
||
# =========================================================================
|
||
# 2026-04-03 ogt: SRE 戰情室群組三頭政治 (Triumvirate) — ADR-053
|
||
# @tsenyangbot 發告警卡片到群組,OpenClaw/NemoClaw Bot 各自回覆分析
|
||
# =========================================================================
|
||
|
||
async def send_to_group(
    self,
    text: str,
    parse_mode: str = "HTML",
    reply_markup: dict | None = None,
) -> dict:
    """Post a message to the SRE group chat (``settings.SRE_GROUP_CHAT_ID``).

    Args:
        text: Message content; truncated to 4096 characters.
        parse_mode: Telegram parse mode for the message.
        reply_markup: Optional keyboard markup; attached only when truthy.

    Returns:
        dict: The result of ``self._send_request``, or an empty dict when
        no SRE group chat id is configured (a warning is logged).
    """
    group_chat_id = settings.SRE_GROUP_CHAT_ID
    if not group_chat_id:
        logger.warning("send_to_group_skipped", reason="SRE_GROUP_CHAT_ID not configured")
        return {}

    request_body: dict = {
        "chat_id": group_chat_id,
        "text": text[:4096],
        "parse_mode": parse_mode,
    }
    if reply_markup:
        request_body["reply_markup"] = reply_markup

    return await self._send_request("sendMessage", request_body)
|
||
|
||
async def _send_as_bot(
|
||
self,
|
||
token: str,
|
||
chat_id: str,
|
||
text: str,
|
||
reply_to_message_id: int | None = None,
|
||
parse_mode: str = "HTML",
|
||
) -> dict:
|
||
"""
|
||
用指定 Bot Token 發訊息(不走 self._http_client,獨立建立請求)
|
||
|
||
Args:
|
||
token: Bot Token
|
||
chat_id: 群組 Chat ID
|
||
text: 訊息內容
|
||
reply_to_message_id: 回覆哪則訊息的 message_id
|
||
parse_mode: 解析模式
|
||
|
||
Returns:
|
||
dict: Telegram API 回應
|
||
"""
|
||
if not self._http_client:
|
||
raise TelegramGatewayError("HTTP client not initialized")
|
||
|
||
url = f"{self.TELEGRAM_API_BASE}/bot{token}/sendMessage"
|
||
payload: dict = {
|
||
"chat_id": chat_id,
|
||
"text": text[:4096],
|
||
"parse_mode": parse_mode,
|
||
}
|
||
# 2026-04-03 ogt: supergroup 跨 Bot reply 需用 reply_parameters (Bot API v6.7+)
|
||
# 舊的 reply_to_message_id 在 supergroup 會 400,改用新格式 + allow_sending_without_reply
|
||
if reply_to_message_id:
|
||
payload["reply_parameters"] = {
|
||
"message_id": reply_to_message_id,
|
||
"allow_sending_without_reply": True,
|
||
}
|
||
|
||
response = await self._http_client.post(url, json=payload)
|
||
response.raise_for_status()
|
||
return response.json()
|
||
|
||
async def send_as_openclaw(
    self,
    text: str,
    reply_to_message_id: int | None = None,
) -> dict:
    """Post to the SRE group as the OpenClaw bot (@OpenClawAwoooI_Bot).

    Args:
        text: Message content.
        reply_to_message_id: Message to reply to, if any.

    Returns:
        dict: The Telegram API response from ``_send_as_bot``, or an empty
        dict when the bot token or the group chat id is not configured
        (a warning is logged in that case).
    """
    if not (settings.OPENCLAW_BOT_TOKEN and settings.SRE_GROUP_CHAT_ID):
        logger.warning("send_as_openclaw_skipped", reason="OPENCLAW_BOT_TOKEN or SRE_GROUP_CHAT_ID not configured")
        return {}

    return await self._send_as_bot(
        token=settings.OPENCLAW_BOT_TOKEN,
        chat_id=settings.SRE_GROUP_CHAT_ID,
        text=text,
        reply_to_message_id=reply_to_message_id,
    )
|
||
|
||
async def send_as_nemotron(
    self,
    text: str,
    reply_to_message_id: int | None = None,
) -> dict:
    """Post to the SRE group as the NemoTron bot (@NemoTronAwoooI_Bot).

    Args:
        text: Message content.
        reply_to_message_id: Message to reply to, if any.

    Returns:
        dict: The Telegram API response from ``_send_as_bot``, or an empty
        dict when the bot token or the group chat id is not configured
        (a warning is logged in that case).
    """
    if not (settings.NEMOTRON_BOT_TOKEN and settings.SRE_GROUP_CHAT_ID):
        logger.warning("send_as_nemotron_skipped", reason="NEMOTRON_BOT_TOKEN or SRE_GROUP_CHAT_ID not configured")
        return {}

    return await self._send_as_bot(
        token=settings.NEMOTRON_BOT_TOKEN,
        chat_id=settings.SRE_GROUP_CHAT_ID,
        text=text,
        reply_to_message_id=reply_to_message_id,
    )
|
||
|
||
async def trigger_group_ai_discussion(
    self,
    alert_message_id: int,
    alert_summary: str,
) -> None:
    """Ask OpenClaw to analyse an alert and reply to it in the SRE group.

    Only OpenClaw is consulted here (2026-04-03: alert analysis is handled
    by OpenClaw alone; NemoClaw does not analyse alerts). The analysis is
    posted as a reply to the alert message.

    Every failure is logged and swallowed, so this method never raises to
    whoever scheduled it.

    Args:
        alert_message_id: ``message_id`` of the alert the bot replies to.
        alert_summary: Alert summary text given to the model.
    """
    try:
        from src.services.chat_manager import ChatManager  # noqa: PLC0415
    except ImportError:
        logger.error("trigger_group_ai_discussion_failed", reason="Cannot import ChatManager")
        return

    try:
        manager = ChatManager()

        # Build the user prompt: role, task, output constraints, then the alert.
        prompt_parts = [
            "你是 OpenClaw,AWOOOI SRE 戰情室首席 AI,精通 K8s、Prometheus、告警分析。\n",
            "以下是一則基礎設施告警,請進行 RCA 根因分析並給出 3 點具體建議行動。\n",
            "繁體中文回應,不超過 300 字:\n\n",
            f"{alert_summary}",
        ]
        openclaw_prompt = "".join(prompt_parts)

        openclaw_analysis = await manager._call_openclaw(
            system_prompt="你是 OpenClaw,AWOOOI SRE 戰情室首席 AI。稱呼用戶為「老闆」。",
            user_message=openclaw_prompt,
        )

        has_analysis = bool(openclaw_analysis) and not isinstance(openclaw_analysis, Exception)
        if not has_analysis:
            logger.warning("trigger_group_ai_discussion_openclaw_empty")
        else:
            await self.send_as_openclaw(
                text=f"🦞 <b>OpenClaw 分析</b>\n\n{openclaw_analysis}",
                reply_to_message_id=alert_message_id,
            )
            logger.info("group_ai_discussion_openclaw_sent")

        logger.info("group_ai_discussion_completed", alert_message_id=alert_message_id)

    except Exception as e:
        # A failed group discussion must not affect the main alert flow.
        logger.error("trigger_group_ai_discussion_failed", error=str(e))
|
||
|
||
async def close(self) -> None:
    """Shut the gateway down.

    Clears the polling flag, cancels the polling and leader tasks that are
    still running and waits for each of them, then closes the HTTP client
    (if one exists) and marks the gateway as uninitialised.
    """
    # Stop long polling and the leader-election background tasks.
    self._polling_active = False
    background_tasks = (self._polling_task, self._leader_task)
    for pending in background_tasks:
        if not pending or pending.done():
            continue
        pending.cancel()
        try:
            await pending
        except asyncio.CancelledError:
            pass
    self._polling_task = None
    self._leader_task = None

    client = self._http_client
    if client:
        await client.aclose()
        self._http_client = None
        self._initialized = False
        logger.info("telegram_gateway_closed")
|
||
|
||
# =========================================================================
|
||
# Long Polling 實作 (Phase 5 內網修復)
|
||
# =========================================================================
|
||
|
||
async def start_long_polling(self) -> None:
    """Start long polling, guarded by a Redis leader lock.

    Long polling replaces webhooks because the deployment cannot receive
    inbound webhook calls. With several pods running, only the pod that
    holds ``POLLING_LEADER_KEY`` calls getUpdates; a pod that fails to get
    the lock starts ``_leader_watcher`` instead and returns.

    Returns early (after logging) when initialisation fails or polling is
    already active.
    """
    if not self._initialized and not await self.initialize():
        logger.error("telegram_long_polling_failed", reason="Gateway not initialized")
        return

    if self._polling_active:
        logger.warning("telegram_long_polling_already_running")
        return

    # SET NX with an expiry: succeeds only if no pod currently holds the lock.
    redis = await get_redis()
    is_leader = await redis.set(POLLING_LEADER_KEY, self._pod_id, nx=True, ex=POLLING_LEADER_TTL)

    if not is_leader:
        holder = await redis.get(POLLING_LEADER_KEY)
        logger.info(
            "telegram_polling_not_leader",
            pod_id=self._pod_id,
            current_leader=holder,
            action="watcher_mode",
        )
        # Watcher mode: periodically try to take the lock over.
        self._leader_task = asyncio.create_task(self._leader_watcher())
        return

    # This pod is the leader: clear any webhook, then start polling.
    await self._delete_webhook()

    self._polling_active = True
    self._last_update_id = 0
    self._polling_task = asyncio.create_task(self._polling_loop())
    self._leader_task = asyncio.create_task(self._leader_renewer())

    masked_chat_id = self.chat_id[:10] + "..." if self.chat_id else "N/A"
    logger.info(
        "telegram_long_polling_started",
        pod_id=self._pod_id,
        timeout=LONG_POLLING_TIMEOUT,
        chat_id=masked_chat_id,
    )
|
||
|
||
async def _delete_webhook(self) -> None:
|
||
"""
|
||
刪除現有 Webhook (切換至 Long Polling 模式)
|
||
|
||
統帥鐵律: Webhook 和 Long Polling 不能共存
|
||
必須先刪除 Webhook 才能使用 getUpdates
|
||
"""
|
||
if not self._http_client:
|
||
return
|
||
|
||
try:
|
||
# Step 1: 刪除 Webhook
|
||
url = f"{self.api_url}/deleteWebhook"
|
||
response = await self._http_client.post(url, json={"drop_pending_updates": True})
|
||
result = response.json()
|
||
|
||
if result.get("ok"):
|
||
logger.info(
|
||
"telegram_webhook_deleted",
|
||
description=result.get("description", "Webhook deleted"),
|
||
)
|
||
else:
|
||
logger.warning(
|
||
"telegram_webhook_delete_failed",
|
||
error=result.get("description"),
|
||
)
|
||
|
||
# Step 2: 等待 Telegram 伺服器同步 (避免 409 Conflict)
|
||
await asyncio.sleep(1)
|
||
|
||
# Step 3: 驗證 Webhook 狀態
|
||
info_url = f"{self.api_url}/getWebhookInfo"
|
||
info_response = await self._http_client.get(info_url)
|
||
info_result = info_response.json()
|
||
|
||
webhook_url = info_result.get("result", {}).get("url", "")
|
||
if webhook_url:
|
||
logger.warning(
|
||
"telegram_webhook_still_active",
|
||
url=webhook_url[:50],
|
||
)
|
||
else:
|
||
logger.info("telegram_webhook_confirmed_deleted")
|
||
|
||
except Exception as e:
|
||
logger.error("telegram_webhook_delete_error", error=str(e))
|
||
|
||
async def _polling_loop(self) -> None:
    """Main long-polling loop: fetch updates and dispatch them one by one.

    Runs while ``self._polling_active`` is true.

    Error handling:
        * A failure while processing one update is logged and the rest of
          the batch is still processed. ``_get_updates`` has already moved
          the offset past the whole batch, so aborting would lose the
          remaining updates for good.
        * Because handler errors are contained per update, an HTTP 409
          raised by an outgoing send inside a handler is no longer mistaken
          for a getUpdates conflict.
        * ``httpx.TimeoutException`` is the normal end of an idle long poll;
          the loop simply polls again.
        * HTTP 409 from getUpdates means another poller is active: the
          leader lock is released if this pod holds it, polling stops and a
          watcher task is started.
        * Other errors are logged, followed by a
          ``LONG_POLLING_RETRY_DELAY`` pause.
        * Cancellation is logged and ends the loop.
    """
    logger.info("[Telegram] Long polling started - 神經已接通,等待統帥指令...")

    while self._polling_active:
        try:
            updates = await self._get_updates()

            for update in updates:
                try:
                    await self._process_update(update)
                except httpx.HTTPStatusError as e:
                    # Log the status only: the exception text includes the
                    # request URL, which contains the bot token.
                    logger.error(
                        "telegram_update_processing_failed",
                        update_id=update.get("update_id"),
                        status=e.response.status_code,
                    )
                except Exception as e:
                    logger.error(
                        "telegram_update_processing_failed",
                        update_id=update.get("update_id"),
                        error=str(e),
                    )

        except asyncio.CancelledError:
            logger.info("telegram_long_polling_cancelled")
            break

        except httpx.TimeoutException:
            # A long-poll timeout is expected; start the next round.
            continue

        except httpx.HTTPStatusError as e:
            if e.response.status_code == 409:
                # 409 Conflict: another pod is polling. Release the leader
                # lock (if it is ours) and let the watchers compete for it.
                logger.warning(
                    "telegram_polling_conflict",
                    status=409,
                    pod_id=self._pod_id,
                    action="releasing_leader_lock",
                )
                redis = await get_redis()
                current = await redis.get(POLLING_LEADER_KEY)
                if current == self._pod_id:
                    await redis.delete(POLLING_LEADER_KEY)
                self._polling_active = False
                # The watcher retries after POLLING_LEADER_WATCH seconds.
                self._leader_task = asyncio.create_task(self._leader_watcher())
                break
            else:
                logger.error("telegram_polling_http_error", status=e.response.status_code)
                await asyncio.sleep(LONG_POLLING_RETRY_DELAY)

        except Exception as e:
            logger.error("telegram_polling_error", error=str(e))
            # Back off before retrying.
            await asyncio.sleep(LONG_POLLING_RETRY_DELAY)

    logger.info("telegram_long_polling_stopped")
|
||
|
||
async def _leader_renewer(self) -> None:
    """Keep the leader lock alive while this pod is polling.

    Every ``POLLING_LEADER_RENEW`` seconds the lock value is read. If it
    still equals this pod's id, its TTL is reset to ``POLLING_LEADER_TTL``.
    Redis errors are logged and the renewal is retried on the next tick.

    If the lock is held by someone else (or has disappeared), polling is
    stopped and this pod falls back to watcher mode so it can compete for
    leadership again. Without that fallback a pod that lost the lock
    without a 409 from Telegram would leave the election permanently.
    """
    lock_lost = False

    while self._polling_active:
        await asyncio.sleep(POLLING_LEADER_RENEW)
        if not self._polling_active:
            break
        try:
            redis = await get_redis()
            current = await redis.get(POLLING_LEADER_KEY)
            if current != self._pod_id:
                logger.warning(
                    "telegram_leader_lock_lost",
                    pod_id=self._pod_id,
                    current_leader=current,
                )
                self._polling_active = False
                lock_lost = True
                break
            await redis.expire(POLLING_LEADER_KEY, POLLING_LEADER_TTL)
        except Exception as e:
            logger.error("telegram_leader_renew_error", error=str(e))

    if not lock_lost:
        return

    # Let the in-flight poll finish first, so a later re-election cannot
    # leave two polling loops running in this pod.
    polling_task = self._polling_task
    if polling_task is not None and not polling_task.done():
        await asyncio.wait({polling_task})

    # The polling loop starts its own watcher on a 409. Only start one here
    # if this task is still the registered leader task.
    if self._leader_task is asyncio.current_task():
        self._leader_task = asyncio.create_task(self._leader_watcher())
|
||
|
||
async def _leader_watcher(self) -> None:
    """Take-over monitor for a pod that is not the leader.

    Every ``POLLING_LEADER_WATCH`` seconds it tries to set the leader lock
    with NX. Once the lock is obtained (for example because the previous
    leader's TTL expired) this pod deletes the webhook, starts the polling
    loop and the lock renewer, and the watcher ends. Redis errors are
    logged and the next attempt follows after the usual interval.
    """
    while not self._polling_active:
        await asyncio.sleep(POLLING_LEADER_WATCH)
        try:
            redis = await get_redis()
            got_lock = await redis.set(
                POLLING_LEADER_KEY, self._pod_id, nx=True, ex=POLLING_LEADER_TTL
            )
            if not got_lock:
                continue

            logger.info(
                "telegram_leader_acquired",
                pod_id=self._pod_id,
                action="starting_polling",
            )
            await self._delete_webhook()
            self._polling_active = True
            self._last_update_id = 0
            self._polling_task = asyncio.create_task(self._polling_loop())
            self._leader_task = asyncio.create_task(self._leader_renewer())
            return
        except asyncio.CancelledError:
            return
        except Exception as e:
            logger.error("telegram_leader_watch_error", error=str(e))
|
||
|
||
async def _get_updates(self) -> list[dict]:
|
||
"""
|
||
呼叫 Telegram getUpdates API
|
||
|
||
Returns:
|
||
list[dict]: 更新列表
|
||
"""
|
||
if not self._http_client:
|
||
return []
|
||
|
||
url = f"{self.api_url}/getUpdates"
|
||
payload = {
|
||
"offset": self._last_update_id + 1,
|
||
"timeout": LONG_POLLING_TIMEOUT,
|
||
"allowed_updates": ["callback_query", "message"], # 監聽按鈕與文字訊息
|
||
}
|
||
|
||
response = await self._http_client.post(
|
||
url,
|
||
json=payload,
|
||
timeout=LONG_POLLING_TIMEOUT + 10, # 比 API timeout 多一點
|
||
)
|
||
response.raise_for_status()
|
||
result = response.json()
|
||
|
||
if not result.get("ok"):
|
||
raise TelegramGatewayError(f"getUpdates failed: {result.get('description')}")
|
||
|
||
updates = result.get("result", [])
|
||
|
||
# 更新 offset
|
||
if updates:
|
||
self._last_update_id = updates[-1]["update_id"]
|
||
|
||
return updates
|
||
|
||
async def _process_update(self, update: dict) -> None:
|
||
"""
|
||
處理單個 Telegram Update
|
||
|
||
Args:
|
||
update: Telegram Update 物件
|
||
"""
|
||
update_id = update.get("update_id")
|
||
callback_query = update.get("callback_query")
|
||
message = update.get("message")
|
||
|
||
if not callback_query and not message:
|
||
logger.debug("telegram_update_ignored", update_id=update_id, reason="unsupported update type")
|
||
return
|
||
|
||
if callback_query:
|
||
await self._handle_callback_query(update_id, callback_query)
|
||
elif message:
|
||
await self._handle_chat_message(update_id, message)
|
||
|
||
async def _handle_callback_query(self, update_id: int, callback_query: dict) -> None:
    """Handle an inline-button press delivered through long polling.

    The press is passed to ``handle_callback``. When that reports success,
    the resulting approval action is carried out through
    ``_execute_approval_action``. A callback without an id, data or sender
    id is logged and dropped.

    Args:
        update_id: Id of the update carrying the callback (for logging).
        callback_query: Telegram CallbackQuery object.
    """
    query_id = callback_query.get("id")
    query_data = callback_query.get("data")
    sender = callback_query.get("from", {})
    sender_id = sender.get("id")

    if not (query_id and query_data and sender_id):
        logger.warning("telegram_callback_invalid", update_id=update_id)
        return

    display_name = sender.get("username") or sender.get("first_name") or str(sender_id)
    source_text = callback_query.get("message", {}).get("text", "")
    source_message_id = callback_query.get("message", {}).get("message_id")

    logger.info(
        "telegram_callback_received",
        update_id=update_id,
        user_id=sender_id,
        username=display_name,
    )

    # Reuse the existing callback handling logic.
    outcome = await self.handle_callback(
        callback_query_id=query_id,
        callback_data=query_data,
        user_id=sender_id,
        message_id=source_message_id,
        original_text=source_text,
        username=display_name,
    )

    if not outcome.get("success"):
        return

    # Persist the decision (approve / reject).
    await self._execute_approval_action(
        action=outcome["action"],
        approval_id=outcome["approval_id"],
        user_id=sender_id,
        username=display_name,
        message_id=source_message_id,
    )
|
||
|
||
async def _handle_chat_message(self, update_id: int, message: dict) -> None:
    """Handle an incoming chat message (private chat or the SRE group).

    Routing:
        1. Messages without a sender id, or sent by a bot, are ignored
           (this also prevents bots from triggering each other).
        2. Authorisation: messages in the configured SRE group are trusted
           (membership is controlled by the group admins). Everything else
           must pass the security interceptor (ADR-012).
        3. Photo messages go to the image analysis service.
        4. SRE group text goes to ``_handle_group_message``.
        5. ``/ai`` commands are restricted to the Telegram user whitelist.
        6. Any other private text is answered through the chat manager.

    Photo messages are subject to the same authorisation as text. They used
    to be analysed before any check, which let users outside the whitelist
    trigger image analysis from a private chat.

    Args:
        update_id: Id of the update carrying the message (for logging).
        message: Telegram Message object.
    """
    text = message.get("text")
    user = message.get("from", {})
    user_id = user.get("id")
    chat_id = message.get("chat", {}).get("id")
    chat_type = message.get("chat", {}).get("type", "private")
    message_id = message.get("message_id")
    username = user.get("username") or user.get("first_name") or str(user_id)
    photos = message.get("photo")

    # Ignore anonymous senders and bots.
    if not user_id or user.get("is_bot"):
        return

    if not photos and not text:
        return

    is_group = chat_type in ("group", "supergroup")
    is_sre_group = str(chat_id) == str(settings.SRE_GROUP_CHAT_ID)
    in_trusted_group = is_group and is_sre_group

    async def _passes_security_check() -> bool:
        """Run the personal-chat security interceptor; log and refuse on failure."""
        try:
            interceptor = get_security_interceptor()
            await interceptor.intercept_telegram(user_id)
        except Exception as e:
            logger.warning("telegram_chat_unauthorized", user_id=user_id, error=str(e))
            return False
        return True

    # Phase 34 (ADR-067): photo routing, behind the same gate as text.
    if photos:
        if not in_trusted_group and not await _passes_security_check():
            return
        # Pick the largest rendition; the photo is analysed with its caption
        # (or a default prompt) as the question.
        best = max(photos, key=lambda p: p.get("file_size", 0))
        file_id = best.get("file_id", "")
        caption = message.get("caption", "請用繁體中文描述這張圖片")
        if file_id:
            try:
                from src.services.image_analysis_service import get_image_analysis_service
                svc = get_image_analysis_service()
                await svc.download_and_analyze(
                    chat_id=str(chat_id),
                    file_id=file_id,
                    question=caption,
                )
            except Exception as _img_err:
                logger.warning("image_analysis_polling_failed", error=str(_img_err))
        return

    if not text:
        return

    logger.info(
        "telegram_chat_received",
        update_id=update_id,
        user_id=user_id,
        username=username,
        chat_type=chat_type,
        text=text[:50],
    )

    # 1. SRE group messages are routed first and skip the personal whitelist.
    if in_trusted_group:
        reply_to_message = message.get("reply_to_message")
        await self._handle_group_message(text, user_id, username, chat_id, message_id, reply_to_message)
        return

    # 2. Security check for every other chat (ADR-012).
    if not await _passes_security_check():
        return

    # 3. /ai command (Phase 24 C): whitelist only.
    if text.strip().lower().startswith("/ai"):
        whitelist = settings.get_tg_user_whitelist()
        if not whitelist or user_id not in whitelist:
            logger.warning("telegram_ai_command_unauthorized", user_id=user_id, whitelist_empty=not whitelist)
            await self.send_notification("⛔ 未授權:/ai 指令僅限白名單用戶", parse_mode="HTML", chat_id=chat_id)
            return
        from src.services.ai_control import handle_ai_command
        response = await handle_ai_command(text.strip())
        await self.send_notification(response, parse_mode="HTML", chat_id=chat_id)
        logger.info("telegram_ai_command_handled", user_id=user_id, text=text[:50])
        return

    # 4. Private chat: show the typing indicator.
    await self._send_chat_action(chat_id, "typing")

    # 5. Private chat: answer through the chat manager.
    chat_manager = get_chat_manager()
    response = await chat_manager.generate_response(
        user_id=user_id,
        username=username,
        message_text=text,
    )
    await self.send_notification(response, parse_mode="HTML", chat_id=chat_id)
|
||
|
||
async def _handle_group_message(
|
||
self,
|
||
text: str,
|
||
user_id: int,
|
||
username: str,
|
||
chat_id: int, # noqa: ARG002
|
||
message_id: int | None,
|
||
reply_to_message: dict | None = None,
|
||
) -> None:
|
||
"""
|
||
處理 SRE 群組訊息 (2026-04-03 ogt: Phase 22.6 Triumvirate)
|
||
|
||
路由規則:
|
||
Reply OpenClaw 訊息 → 只有 OpenClaw 回應
|
||
Reply NemoClaw 訊息 → 只有 NemoClaw 回應
|
||
@OpenClawAwoooI_Bot <msg> → 只有 OpenClaw 回應
|
||
@NemoTronAwoooI_Bot <msg> → 只有 NemoClaw 回應
|
||
其他訊息 → 兩個 AI 並行回應
|
||
"""
|
||
# ── 指令路由 (2026-04-03 ogt: 方案B slash commands) ──────────────────
|
||
cmd = text.strip().split()[0].lower().split("@")[0] if text.strip() else ""
|
||
if cmd.startswith("/"):
|
||
await self._handle_group_command(cmd, chat_id, message_id, full_text=text.strip())
|
||
return
|
||
|
||
from src.services.chat_manager import get_chat_manager as _get_cm
|
||
chat_mgr = _get_cm()
|
||
|
||
# 全形/半形統一化後比較
|
||
import unicodedata
|
||
text_normalized = unicodedata.normalize("NFKC", text).lower()
|
||
|
||
# Reply 路由: 若 Reply 的是 Bot 訊息,直接認定目標 AI (2026-04-03 ogt)
|
||
if reply_to_message:
|
||
replied_from = reply_to_message.get("from", {})
|
||
if replied_from.get("is_bot"):
|
||
replied_username = (replied_from.get("username") or "").lower()
|
||
if "openclawawoooi" in replied_username:
|
||
mention_openclaw, mention_nemo = True, False
|
||
elif "nemotronawoooi" in replied_username:
|
||
mention_openclaw, mention_nemo = False, True
|
||
else:
|
||
mention_openclaw = "@openclawawoooi_bot" in text_normalized or "小o" in text_normalized
|
||
mention_nemo = "@nemotronawoooi_bot" in text_normalized or "小賀" in text_normalized or "小贺" in text_normalized
|
||
else:
|
||
mention_openclaw = "@openclawawoooi_bot" in text_normalized or "小o" in text_normalized
|
||
mention_nemo = "@nemotronawoooi_bot" in text_normalized or "小賀" in text_normalized or "小贺" in text_normalized
|
||
else:
|
||
# 別名: 小O / 小o (含全形O) → OpenClaw; 小賀 / 小贺 → NemoClaw
|
||
mention_openclaw = "@openclawawoooi_bot" in text_normalized or "小o" in text_normalized
|
||
mention_nemo = "@nemotronawoooi_bot" in text_normalized or "小賀" in text_normalized or "小贺" in text_normalized
|
||
|
||
# 去掉 @ mention 與別名,取出純訊息
|
||
clean_text = unicodedata.normalize("NFKC", text)
|
||
for token in ["@openclawawoooi_bot", "@OpenClawAwoooI_Bot", "@nemotronawoooi_bot", "@NemoTronAwoooI_Bot",
|
||
"小O", "小o", "小O", "小o", "小賀", "小贺"]:
|
||
clean_text = clean_text.replace(token, "").strip()
|
||
if not clean_text:
|
||
clean_text = text
|
||
|
||
context = await chat_mgr.get_system_context()
|
||
|
||
def _clean_ai_reply(text: str, max_chars: int = 600) -> str:
|
||
"""清理 AI 回覆:移除 Markdown 語法,截斷超長內容"""
|
||
import re
|
||
# 移除 Markdown bold/italic (**text**, *text*, __text__, _text_)
|
||
text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
|
||
text = re.sub(r'\*(.+?)\*', r'\1', text)
|
||
text = re.sub(r'__(.+?)__', r'\1', text)
|
||
text = re.sub(r'_(.+?)_', r'\1', text)
|
||
# 移除 Markdown header (#, ##, ###)
|
||
text = re.sub(r'^#{1,3}\s+', '', text, flags=re.MULTILINE)
|
||
# 移除 <think> 標籤(deepseek-r1)
|
||
text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL).strip()
|
||
# 截斷
|
||
if len(text) > max_chars:
|
||
text = text[:max_chars].rsplit('\n', 1)[0] + '…'
|
||
return text.strip()
|
||
|
||
if mention_openclaw and not mention_nemo:
|
||
# 只 OpenClaw 回應
|
||
result = await chat_mgr._call_openclaw(
|
||
f"{context}\n用戶 {username} 在 SRE 戰情室問你:",
|
||
clean_text,
|
||
)
|
||
body = _clean_ai_reply(result) if result else '🔴 無響應'
|
||
await self.send_as_openclaw(
|
||
text=f"🦞 <b>OpenClaw</b>\n\n{body}",
|
||
reply_to_message_id=message_id,
|
||
)
|
||
|
||
elif mention_nemo and not mention_openclaw:
|
||
# 只 NemoClaw 回應
|
||
result = await chat_mgr._call_nemotron(
|
||
f"{context}\n用戶 {username} 在 SRE 戰情室問你:",
|
||
clean_text,
|
||
)
|
||
body = (_clean_ai_reply(result) if result else '') or '🔴 無響應 (deepseek-r1 超時或思考截斷)'
|
||
await self.send_as_nemotron(
|
||
text=f"🤖 <b>NemoClaw</b>\n\n{body}",
|
||
reply_to_message_id=message_id,
|
||
)
|
||
|
||
else:
|
||
# 兩個 AI 並行回應,完成後互相評論
|
||
oc_task = asyncio.create_task(
|
||
chat_mgr._call_openclaw(f"{context}\n用戶 {username} 在 SRE 戰情室:", clean_text)
|
||
)
|
||
nemo_task = asyncio.create_task(
|
||
chat_mgr._call_nemotron(f"{context}\n用戶 {username} 在 SRE 戰情室:", clean_text)
|
||
)
|
||
oc_result, nemo_result = await asyncio.gather(oc_task, nemo_task, return_exceptions=True)
|
||
|
||
if oc_result and not isinstance(oc_result, Exception):
|
||
await self.send_as_openclaw(
|
||
text=f"🦞 <b>OpenClaw</b>\n\n{_clean_ai_reply(oc_result)}",
|
||
reply_to_message_id=message_id,
|
||
)
|
||
|
||
if nemo_result and not isinstance(nemo_result, Exception):
|
||
nemo_body = _clean_ai_reply(nemo_result) or "🔴 回覆清理後為空 (deepseek-r1 思考超時)"
|
||
await self.send_as_nemotron(
|
||
text=f"🤖 <b>NemoClaw</b>\n\n{nemo_body}",
|
||
reply_to_message_id=message_id,
|
||
)
|
||
|
||
logger.info("group_message_handled", user_id=user_id, text=text[:50])
|
||
|
||
async def _handle_group_command(self, cmd: str, _chat_id: int, message_id: int | None, full_text: str = "") -> None:
    """Handle a slash command posted in the SRE group.

    Supported commands:
        /status     Pod status summary of the ``awoooi-prod`` namespace
        /incidents  Active incidents (at most 10 listed)
        /cost       This month's AI cost counters read from Redis
        /pods       Problem pods (at most 15 listed)
        /rag        Knowledge-base query (ADR-067 Phase 33)
        /help       Command overview

    Unknown commands are logged at debug level and ignored. All answers are
    sent as the OpenClaw bot, replying to ``message_id``.

    Because answers use HTML parse mode, exception text, pod names and the
    user's ``/rag`` question are HTML-escaped; an unescaped ``<`` or ``&``
    makes Telegram reject the whole message.

    Args:
        cmd: Lower-cased command without a bot suffix, e.g. ``/status``.
        _chat_id: Group chat id (unused).
        message_id: Message to reply to.
        full_text: Complete stripped message text; used by ``/rag``.
    """
    from src.repositories.k8s_repository import get_k8s_repository
    from src.repositories.incident_repository import get_incident_repository
    from src.core.redis_client import get_redis
    from src.utils.timezone import now_taipei

    def _unavailable(title: str, exc: Exception) -> str:
        """Build the standard 'could not fetch' answer with escaped error text."""
        return f"<b>{title}</b>\n⚠️ 無法取得: {html.escape(str(exc))}"

    if cmd == "/status":
        try:
            k8s = get_k8s_repository()
            s = await k8s.get_pod_status_summary(namespace="awoooi-prod")
            running, total = s.get("running", 0), s.get("total", 0)
            problems = s.get("problem_pods", [])
            lines = ["<b>🖥 Cluster 狀態</b>", f"• Pods: {running}/{total} Running"]
            if problems:
                lines.append(f"• 異常: {len(problems)} 個")
                lines.extend(f" ⚠️ {html.escape(str(p))}" for p in problems[:5])
            else:
                lines.append("• 全部正常 ✅")
            msg = "\n".join(lines)
        except Exception as e:
            msg = _unavailable("🖥 Cluster 狀態", e)
        await self.send_as_openclaw(text=msg, reply_to_message_id=message_id)

    elif cmd == "/incidents":
        try:
            repo = get_incident_repository()
            incidents = await repo.get_active()
            if incidents:
                lines = ["<b>🚨 活躍告警</b>"]
                lines.extend(
                    f"• <code>{inc.incident_id}</code> SEV{inc.severity.value} — {inc.status.value}"
                    for inc in incidents[:10]
                )
                msg = "\n".join(lines)
            else:
                msg = "<b>🚨 活躍告警</b>\n✅ 目前無告警"
        except Exception as e:
            msg = _unavailable("🚨 活躍告警", e)
        await self.send_as_openclaw(text=msg, reply_to_message_id=message_id)

    elif cmd == "/cost":
        month = now_taipei().strftime("%Y-%m")
        try:
            # get_redis() is a coroutine function (it is awaited everywhere
            # else in this file). Without the await, every lookup below
            # failed on the coroutine object.
            redis = await get_redis()
            gemini_cost = float(await redis.get(f"gemini_cost:{month}") or 0)
            claude_cost = float(await redis.get(f"claude_cost:{month}") or 0)
            total = gemini_cost + claude_cost
            msg = (
                f"<b>💰 {month} AI 費用統計</b>\n"
                f"• 🦞 OpenClaw (Gemini Flash-Lite): <b>${gemini_cost:.4f}</b> / $10.00 上限\n"
                f"• 🤖 NemoClaw (Claude Haiku 4.5): <b>${claude_cost:.4f}</b>\n"
                f"• 合計: <b>${total:.4f}</b>"
            )
        except Exception as e:
            msg = _unavailable("💰 費用統計", e)
        await self.send_as_openclaw(text=msg, reply_to_message_id=message_id)

    elif cmd == "/pods":
        try:
            k8s = get_k8s_repository()
            s = await k8s.get_pod_status_summary(namespace="awoooi-prod")
            problems = s.get("problem_pods", [])
            if problems:
                lines = [f"<b>⚠️ 異常 Pod ({len(problems)} 個)</b>"]
                lines.extend(f"• <code>{html.escape(str(p))}</code>" for p in problems[:15])
                msg = "\n".join(lines)
            else:
                msg = "<b>⚠️ 異常 Pod</b>\n✅ 全部 Pod 正常"
        except Exception as e:
            msg = _unavailable("⚠️ 異常 Pod", e)
        await self.send_as_openclaw(text=msg, reply_to_message_id=message_id)

    elif cmd == "/rag":
        # /rag <question>: semantic query against the knowledge base.
        parts = full_text.split(None, 1)
        if len(parts) < 2 or not parts[1].strip():
            await self.send_as_openclaw(
                text="<b>📚 RAG 知識庫查詢</b>\n用法: <code>/rag 你的問題</code>\n例如: <code>/rag 什麼是 ADR-067?</code>",
                reply_to_message_id=message_id,
            )
            return
        question = parts[1].strip()
        # The question is user input, so escape it before embedding in HTML.
        shown_question = html.escape(question[:80])
        await self.send_as_openclaw(
            text=f"<b>📚 查詢知識庫中...</b>\n<code>{shown_question}</code>",
            reply_to_message_id=message_id,
        )
        try:
            from src.services.knowledge_rag_service import get_knowledge_rag_service
            svc = get_knowledge_rag_service()
            answer = await svc.query(question, top_k=5)
            # NOTE(review): the answer is passed through unescaped because it
            # may carry intentional HTML formatting; confirm with the service.
            msg = f"<b>📚 RAG 知識庫</b>\n<i>Q: {shown_question}</i>\n\n{answer}"
        except Exception as e:
            logger.warning("rag_telegram_query_failed", error=str(e))
            msg = f"<b>📚 RAG 查詢失敗</b>\n{html.escape(str(e))}"
        await self.send_as_openclaw(text=msg, reply_to_message_id=message_id)

    elif cmd == "/help":
        msg = (
            "<b>🤖 SRE 戰情室指令</b>\n\n"
            "/status — 查詢 K8s Cluster 狀態\n"
            "/incidents — 列出活躍告警\n"
            "/cost — 查詢本月 AI 費用\n"
            "/pods — 列出異常 Pod\n"
            # The placeholder is escaped: a literal <問題> is an unsupported
            # tag in HTML parse mode and makes Telegram reject the message.
            "/rag &lt;問題&gt; — 查詢 RAG 知識庫\n"
            "/help — 顯示此說明\n\n"
            "<b>對話方式:</b>\n"
            "• 直接輸入 → 小O + 小賀 同時回應\n"
            "• 小O 或 @OpenClawAwoooI_Bot → 只有 OpenClaw\n"
            "• 小賀 或 @NemoTronAwoooI_Bot → 只有 NemoClaw\n"
            "• Reply 某個 Bot 的訊息 → 只有那個 Bot 回應"
        )
        await self.send_as_openclaw(text=msg, reply_to_message_id=message_id)

    else:
        logger.debug("group_unknown_command", cmd=cmd)
|
||
|
||
async def _send_chat_action(self, chat_id: int, action: str) -> None:
|
||
"""發送聊天狀態 (e.g., typing)"""
|
||
if not self._http_client: return
|
||
try:
|
||
url = f"{self.api_url}/sendChatAction"
|
||
await self._http_client.post(url, json={"chat_id": chat_id, "action": action})
|
||
except: pass
|
||
|
||
async def _notify_approval_result(
    self,
    message_id: int | None,
    incident_id: str,
    action: str,
    username: str,
    execution_triggered: bool,
) -> None:
    """Reflect an approve/reject decision on the original Telegram alert.

    Steps:
        1. Resolve the alert message id: the ``message_id`` argument, then
           Redis key ``tg_msg:{incident_id}``, then the approval record.
        2. Replace the alert's keyboard with the info-only button row.
        3. Reply to the alert with a status line.
        4. If no message id is known, or the reply is not accepted, send
           the status line as a new notification.

    Telegram reports failures through HTTP error statuses, so responses are
    checked explicitly; otherwise a rejected reply would count as delivered
    and the fallback would never run. Only the status code is logged, since
    the request URL contains the bot token.

    Args:
        message_id: Id of the alert message, if already known.
        incident_id: Incident id used for lookups and button payloads.
        action: ``"approve"`` renders the approved line; any other value
            renders the rejected line.
        username: Name of the person who decided; HTML-escaped.
        execution_triggered: Accepted for interface compatibility; unused.
    """
    import html as _html

    chat_id = settings.OPENCLAW_TG_CHAT_ID
    if not chat_id:
        return

    # Resolve the original alert message id (Redis first, then the DB).
    orig_msg_id = message_id
    if not orig_msg_id:
        try:
            redis = await get_redis()
            _val = await redis.get(f"tg_msg:{incident_id}")
            if _val:
                orig_msg_id = int(_val)
            else:
                from src.services.approval_db import get_approval_service as _svc
                _approvals = await _svc().get_all_approvals(incident_id=incident_id)
                if _approvals and _approvals[0].telegram_message_id:
                    orig_msg_id = _approvals[0].telegram_message_id
        except Exception as _e:
            # Not fatal: the fallback notification below still goes out.
            logger.warning("notify_approval_message_lookup_failed", incident_id=incident_id, error=str(_e))

    if action == "approve":
        status_emoji = "✅"
        status_text = f"<b>已批准</b> by {_html.escape(username)}"
        # After approval always show "executing"; the real outcome is posted
        # later as a separate reply.
        suffix = "⚡ 執行中..."
    else:
        status_emoji = "❌"
        status_text = f"<b>已拒絕</b> by {_html.escape(username)}"
        suffix = ""

    status_line = f"{status_emoji} {status_text} {suffix}".strip()

    if orig_msg_id:
        api_base = f"https://api.telegram.org/bot{settings.OPENCLAW_TG_BOT_TOKEN}"

        try:
            # 1. Drop the approve/reject buttons, keep the info row.
            info_buttons = [[
                {"text": "📋 詳情", "callback_data": f"detail:{incident_id}"},
                {"text": "📊 歷史", "callback_data": f"history:{incident_id}"},
            ]]
            edit_response = await self._http_client.post(
                f"{api_base}/editMessageReplyMarkup",
                json={
                    "chat_id": chat_id,
                    "message_id": orig_msg_id,
                    "reply_markup": {"inline_keyboard": info_buttons},
                },
            )
            if edit_response.status_code >= 400:
                logger.warning(
                    "notify_approval_edit_keyboard_failed",
                    incident_id=incident_id,
                    error=f"HTTP {edit_response.status_code}",
                )
        except Exception as _e:
            logger.warning("notify_approval_edit_keyboard_failed", incident_id=incident_id, error=str(_e))

        try:
            # 2. Reply to the original alert with the status line.
            reply_response = await self._http_client.post(
                f"{api_base}/sendMessage",
                json={
                    "chat_id": chat_id,
                    "text": status_line,
                    "parse_mode": "HTML",
                    "reply_to_message_id": orig_msg_id,
                },
            )
        except Exception as _e:
            logger.warning("notify_approval_reply_failed", incident_id=incident_id, error=str(_e))
        else:
            if reply_response.status_code < 400:
                return
            logger.warning(
                "notify_approval_reply_failed",
                incident_id=incident_id,
                error=f"HTTP {reply_response.status_code}",
            )

    # Fallback: send the status as a new notification.
    try:
        await self.send_notification(status_line, parse_mode="HTML")
    except Exception as _e:
        logger.warning("notify_approval_fallback_failed", incident_id=incident_id, error=str(_e))
|
||
|
||
async def _execute_approval_action(
    self,
    action: str,
    approval_id: str,
    user_id: int,
    username: str,
    message_id: int,
) -> None:
    """
    Apply a Telegram approve/reject/tune decision to the approval record.

    ``approval_id`` may be the approval's UUID or a non-UUID identifier
    (the original notes say decision_manager passes ``incident.incident_id``,
    e.g. ``INC-xxx``). In the non-UUID case the first entry returned by
    ``get_all_approvals(incident_id=...)`` is used.
    NOTE(review): no status filter is applied here — confirm that lookup
    returns the pending approval first.

    On "approve" the approval is signed, the Telegram message is updated, and
    when the resulting status is APPROVED the approved action is started as a
    background task. A Redis key ``exec:{approval.id}`` (SET NX EX 60) makes
    sure only one path starts the execution within 60 seconds.

    Every exception raised inside the processing section is logged as
    ``telegram_approval_action_failed`` and not re-raised.

    Args:
        action: "approve", "reject" or "tune".
        approval_id: Approval UUID string, or an incident id to resolve.
        user_id: Telegram user id of the person who pressed the button.
        username: Display name used as signer/rejector name.
        message_id: Telegram message id forwarded to the result notifier.
    """
    from uuid import UUID

    from src.services.approval_db import get_approval_service

    try:
        service = get_approval_service()

        # Resolve the real approval UUID; callers may hand us an incident id.
        approval_uuid: UUID | None = None
        try:
            approval_uuid = UUID(approval_id)
        except ValueError:
            # Not a UUID: look the approval up by incident id instead.
            pending_list = await service.get_all_approvals(incident_id=approval_id)
            if not pending_list:
                logger.warning(
                    "telegram_approval_not_found_by_incident",
                    approval_id=approval_id,
                )
                return
            first_id = pending_list[0].id
            approval_uuid = UUID(first_id) if isinstance(first_id, str) else first_id

        if action == "approve":
            approval, message, execution_triggered = await service.sign_approval(
                approval_id=approval_uuid,
                signer_id=f"tg_{user_id}",
                signer_name=username,
                comment="Telegram 簽核 (Long Polling)",
            )

            if not approval:
                # Previously dropped silently; surface why the signature was refused.
                logger.warning(
                    "telegram_approval_sign_failed",
                    approval_id=approval_id,
                    user_id=user_id,
                    reason=message,
                )
                return

            logger.info(
                "telegram_approval_signed_via_polling",
                approval_id=approval_id,
                user_id=user_id,
                status=approval.status.value,
                execution_triggered=execution_triggered,
            )
            # Reflect the decision on the original Telegram message.
            await self._notify_approval_result(
                message_id=message_id,
                incident_id=approval_id,
                action="approve",
                username=username,
                execution_triggered=execution_triggered,
            )

            # Gate on the final status rather than execution_triggered: per the
            # 2026-04-14 fix note the flag was unreliable under a race
            # (optimistic-lock failure); this mirrors the REST API path.
            from src.models.approval import ApprovalStatus

            if approval.status == ApprovalStatus.APPROVED:
                from src.core.redis_client import get_redis
                from src.services.approval_execution import get_execution_service

                _redis = get_redis()
                _lock_key = f"exec:{approval.id}"
                # SET NX EX 60 — one execution per approval within 60 seconds,
                # even when REST and Telegram sign at the same time.
                _acquired = await _redis.set(_lock_key, "1", nx=True, ex=60)
                if _acquired:
                    # The event loop only keeps weak references to tasks, so
                    # hold a strong one until the task completes.
                    running_tasks = getattr(self, "_approval_exec_tasks", None)
                    if running_tasks is None:
                        running_tasks = set()
                        self._approval_exec_tasks = running_tasks

                    def _on_execution_done(task: asyncio.Task) -> None:
                        # Release the reference and report failures instead of
                        # silently consuming the exception.
                        running_tasks.discard(task)
                        if task.cancelled():
                            return
                        exc = task.exception()
                        if exc is not None:
                            logger.error(
                                "telegram_approval_execution_failed",
                                approval_id=approval_id,
                                error=str(exc),
                            )

                    _exec_task = asyncio.create_task(
                        get_execution_service().execute_approved_action(approval)
                    )
                    running_tasks.add(_exec_task)
                    _exec_task.add_done_callback(_on_execution_done)
                    logger.info(
                        "telegram_approval_execution_triggered",
                        approval_id=approval_id,
                        action=approval.action,
                        gate="status=APPROVED",
                    )
                else:
                    logger.info(
                        "telegram_approval_execution_skipped_lock_held",
                        approval_id=approval_id,
                        reason="另一路徑 (REST/自動) 已取得 exec lock",
                    )

        elif action == "reject":
            approval, message = await service.reject_approval(
                approval_id=approval_uuid,
                rejector_id=f"tg_{user_id}",
                rejector_name=username,
                reason="Telegram 拒絕 (Long Polling)",
            )

            if not approval:
                # Previously dropped silently; surface why the rejection failed.
                logger.warning(
                    "telegram_approval_reject_failed",
                    approval_id=approval_id,
                    user_id=user_id,
                    reason=message,
                )
                return

            logger.info(
                "telegram_approval_rejected_via_polling",
                approval_id=approval_id,
                user_id=user_id,
            )
            # Reflect the decision on the original Telegram message.
            await self._notify_approval_result(
                message_id=message_id,
                incident_id=approval_id,
                action="reject",
                username=username,
                execution_triggered=False,
            )

        elif action == "tune":
            # Only logged here; no tuning is performed by this method.
            logger.info(
                "telegram_auto_tuning_via_polling",
                approval_id=approval_id,
                user_id=user_id,
            )

        else:
            logger.warning(
                "telegram_approval_unknown_action",
                action=action,
                approval_id=approval_id,
            )

    except Exception as e:
        logger.error(
            "telegram_approval_action_failed",
            action=action,
            approval_id=approval_id,
            error=str(e),
        )
|
||
|
||
|
||
# =============================================================================
|
||
# Phase 6.5: 心跳監控方法
|
||
# =============================================================================
|
||
|
||
async def _check_nemotron_health(self) -> tuple[bool, str]:
    """
    Probe whether Nemotron (NVIDIA NIM) is reachable.

    Issues a GET against the ``/v1/models`` endpoint with the configured
    ``NVIDIA_API_KEY`` and a 25-second timeout. The original notes chose this
    endpoint as a lightweight probe that avoids inference billing, and the
    long timeout to tolerate cold starts.

    Returns:
        ``(is_healthy, status_text)`` — healthy only on HTTP 200; otherwise
        the text describes the missing key, HTTP status, timeout or error.
    """
    import httpx

    from src.core.config import get_settings

    nvidia_key = get_settings().NVIDIA_API_KEY
    if not nvidia_key:
        return False, "❌ NVIDIA_API_KEY 未設定"

    auth_headers = {"Authorization": f"Bearer {nvidia_key}"}
    try:
        async with httpx.AsyncClient(timeout=25.0) as http:
            response = await http.get(
                "https://integrate.api.nvidia.com/v1/models",
                headers=auth_headers,
            )
        healthy = response.status_code == 200
        status_text = "✅ 正常" if healthy else f"❌ HTTP {response.status_code}"
        return healthy, status_text
    except httpx.TimeoutException:
        return False, "⚠️ 超時 (>25s)"
    except Exception as exc:
        # Keep the message short: only the first 40 characters are reported.
        return False, f"❌ {str(exc)[:40]}"
|
||
|
||
async def send_heartbeat(self, *, interval_seconds: int = 30 * 60) -> bool:
    """
    Send one aggregated heartbeat report to the SRE war-room group.

    De-duplication across replicas is slot based: the current time is divided
    into ``interval_seconds`` wide slots and the key
    ``heartbeat:slot:{slot_id}`` is written with SET NX EX. Only the caller
    that creates the key sends the report; the key is never released and
    simply expires after ``interval_seconds``.

    The report is collected by ``HeartbeatReportService`` and sent to the
    group when ``settings.SRE_GROUP_CHAT_ID`` is set; otherwise it is sent
    through ``send_notification`` with a warning prefix.

    Args:
        interval_seconds: Width of the de-duplication slot and TTL of the
            slot key. Defaults to 30 minutes (the previous hard-coded value);
            the heartbeat loop passes its own cadence so that intervals other
            than 30 minutes are not swallowed by a mismatched slot.

    Returns:
        True when the report was sent or another replica already owns the
        slot; False when any step raised (the error is logged).
    """
    try:
        if not self._initialized:
            await self.initialize()

        from src.core.redis_client import get_redis
        from src.services.heartbeat_report_service import (
            HeartbeatReportService,
            report_to_telegram_html,
        )

        # First replica to create the slot key sends; the key expires on its
        # own (TTL = interval_seconds) and is deliberately not released.
        slot_id = int(datetime.now(UTC).timestamp() / interval_seconds)
        slot_key = f"heartbeat:slot:{slot_id}"

        redis_client = get_redis()
        acquired = await redis_client.set(slot_key, "1", nx=True, ex=interval_seconds)
        if not acquired:
            logger.debug("heartbeat_skipped_slot_taken", slot_id=slot_id)
            return True

        report = await HeartbeatReportService().collect()
        text = report_to_telegram_html(report)

        # Deliver to the SRE group only.
        if settings.SRE_GROUP_CHAT_ID:
            await self.send_to_group(text=text)
        else:
            # Group id not configured: fall back to send_notification and
            # prepend a warning so the misconfiguration is visible.
            fallback = (
                "⚠️ <b>SRE_GROUP_CHAT_ID 未設定</b>,心跳報告暫發到個人頻道\n\n"
                + text
            )
            await self.send_notification(fallback)

        self._last_message_time = datetime.now(UTC)
        logger.info(
            "telegram_heartbeat_sent",
            warnings=len(report.warnings),
            has_sre_group=bool(settings.SRE_GROUP_CHAT_ID),
        )

        return True

    except Exception as e:
        logger.error("telegram_heartbeat_failed", error=str(e))
        return False

async def start_heartbeat_monitor(
    self,
    heartbeat_interval_minutes: int = 30,
    silence_threshold_hours: int = 2,
) -> None:
    """
    Start the heartbeat monitor as a background task.

    Does nothing (besides logging a warning) when the monitor is already
    marked active.

    Args:
        heartbeat_interval_minutes: Minutes between heartbeats (default 30).
        silence_threshold_hours: Silence-alert threshold in hours (default 2);
            forwarded to the loop, which keeps it only for signature
            compatibility.
    """
    if self._heartbeat_active:
        logger.warning("telegram_heartbeat_already_running")
        return

    self._heartbeat_active = True
    self._heartbeat_task = asyncio.create_task(
        self._heartbeat_loop(heartbeat_interval_minutes, silence_threshold_hours)
    )

    logger.info(
        "telegram_heartbeat_monitor_started",
        interval_minutes=heartbeat_interval_minutes,
        silence_threshold_hours=silence_threshold_hours,
    )

async def _heartbeat_loop(
    self,
    interval_minutes: int,
    _silence_hours: int,  # kept for signature compatibility; unused here
) -> None:
    """
    Background loop that emits a heartbeat at every interval boundary.

    Each cycle waits for the next wall-clock multiple of the interval (for
    example :00 / :30 for a 30-minute interval) and then calls
    ``send_heartbeat`` with the same interval, so the slot used for
    cross-replica de-duplication always matches the loop cadence. Re-aligning
    every cycle keeps the time spent building a report from accumulating as
    drift.

    Args:
        interval_minutes: Minutes between heartbeats; must be positive.
        _silence_hours: Unused; the original notes say silence detection now
            lives in ``HeartbeatReport.warnings``.
    """
    interval_seconds = interval_minutes * 60
    if interval_seconds <= 0:
        # A zero interval would divide by zero and a negative one would spin.
        logger.error(
            "telegram_heartbeat_loop_error",
            error=f"invalid interval_minutes={interval_minutes}",
        )
        self._heartbeat_active = False
        return

    while self._heartbeat_active:
        try:
            now_ts = datetime.now(UTC).timestamp()
            next_slot = (int(now_ts / interval_seconds) + 1) * interval_seconds
            # asyncio.sleep may return slightly before the wall clock reaches
            # the boundary; keep waiting so the send lands in the new slot.
            while (remaining := next_slot - datetime.now(UTC).timestamp()) > 0:
                await asyncio.sleep(remaining)

            if not self._heartbeat_active:
                break
            await self.send_heartbeat(interval_seconds=interval_seconds)
        except asyncio.CancelledError:
            break
        except Exception as e:
            logger.error("telegram_heartbeat_loop_error", error=str(e))
            await asyncio.sleep(60)
|
||
|
||
async def stop_heartbeat_monitor(self) -> None:
    """Stop the heartbeat monitor: clear the active flag, cancel the task, wait for it."""
    self._heartbeat_active = False

    task = self._heartbeat_task
    already_finished = not task or task.done()
    if not already_finished:
        task.cancel()
        try:
            # Wait for the cancellation to be processed before returning.
            await task
        except asyncio.CancelledError:
            pass

    self._heartbeat_task = None
    logger.info("telegram_heartbeat_monitor_stopped")
|
||
|
||
|
||
# =============================================================================
|
||
# Singleton
|
||
# =============================================================================
|
||
|
||
# Module-level gateway instance; created on first call to get_telegram_gateway().
_gateway: TelegramGateway | None = None


def get_telegram_gateway() -> TelegramGateway:
    """Return the module-wide TelegramGateway, creating it on first use."""
    global _gateway
    if _gateway is not None:
        return _gateway
    _gateway = TelegramGateway()
    return _gateway
|