7523 lines
307 KiB
Python
7523 lines
307 KiB
Python
"""
|
||
Telegram Gateway - OpenClaw 行動戰情室 + SignOz 整合
|
||
====================================================
|
||
Phase 5.4.3 & 5.4.4: Telegram 推送與簽核接收
|
||
統帥校正: SignOz 為唯一全能視力中心
|
||
|
||
Features:
|
||
- 推送待簽核卡片到 Telegram (含 SignOz 指標)
|
||
- 動態 SignOz Trace URL (告警前後 5 分鐘)
|
||
- 自動調優按鈕 (Shadow Mode: 僅日誌輸出)
|
||
- 接收統帥簽核回調
|
||
- SOUL.md 訊息壓縮原則 100% 遵守
|
||
|
||
SOUL.md 鐵律 (4.1 Telegram 訊息壓縮原則):
|
||
- 狀態標籤: 20 字元
|
||
- 資源名稱: 50 字元
|
||
- 根因摘要: 100 字元
|
||
- 建議行動: 50 字元
|
||
- 總長度: 800 字元 (v7.0 擴展以容納 SignOz 區塊)
|
||
|
||
修復紀錄:
|
||
- 2026-03-26 Claude Code: 修復 HTML 解析錯誤 (Can't parse entities)
|
||
"""
|
||
|
||
import asyncio
|
||
import hashlib
|
||
import html
|
||
import json
|
||
import os
|
||
import re
|
||
from dataclasses import dataclass
|
||
from datetime import UTC, datetime
|
||
from urllib.parse import quote
|
||
from uuid import NAMESPACE_URL, UUID, uuid5
|
||
|
||
import httpx
|
||
import structlog
|
||
from opentelemetry import trace
|
||
|
||
from src.core.config import settings
|
||
from src.core.redis_client import get_redis
|
||
from src.services.security_interceptor import (
|
||
NonceReplayError,
|
||
UserNotWhitelistedError,
|
||
get_security_interceptor,
|
||
)
|
||
from src.services.chat_manager import get_chat_manager
|
||
|
||
# =============================================================================
# Snooze/Silence Redis keys (2026-03-27 P1 optimisation)
# =============================================================================
SNOOZE_KEY_PREFIX = "telegram_snooze:"  # {approval_id} -> remind later
SILENCE_KEY_PREFIX = "telegram_silence:"  # {resource_name} -> silenced
SNOOZE_TTL_SECONDS = 30 * 60  # 30 minutes
SILENCE_TTL_SECONDS = 60 * 60  # 1 hour
INCIDENT_UPDATE_DEDUP_PREFIX = "awoooi:tg_update_dedup:"  # {incident_id}:{status_hash}
# An identical status is not re-posted within 5 minutes.
INCIDENT_UPDATE_DEDUP_TTL_SECONDS = 5 * 60
INCIDENT_UPDATE_GLOBAL_FAILURE_DEDUP_PREFIX = "awoooi:tg_update_global_failure_dedup:"
# The same failure summary is pushed once per 10 minutes across incidents.
INCIDENT_UPDATE_GLOBAL_FAILURE_DEDUP_TTL_SECONDS = 10 * 60
GROUPED_ALERT_DIGEST_DEDUP_PREFIX = "awoooi:tg_group_digest:"  # {group_key}
# One digest per alert group every 5 minutes.
GROUPED_ALERT_DIGEST_DEDUP_TTL_SECONDS = 5 * 60

# 2026-04-01 Claude Code: distributed leader election for long polling.
# Prevents several pods from calling getUpdates at once (409 Conflict).
POLLING_LEADER_KEY = "telegram:polling:leader"
POLLING_LEADER_TTL = 45  # seconds - leadership moves 45s after a pod dies
POLLING_LEADER_RENEW = 20  # seconds - the lease is renewed every 20s
POLLING_LEADER_WATCH = 30  # seconds - non-leader pods retry takeover every 30s
|
||
|
||
logger = structlog.get_logger(__name__)
# Captures the "api.telegram.org/bot" prefix so the token segment after it can be redacted.
_TELEGRAM_BOT_URL_RE = re.compile(r"(api\.telegram\.org/bot)[^/\s]+")
# Incident IDs of the form INC-<8 digits>-<4+ upper-case alphanumerics>.
_INCIDENT_ID_RE = re.compile(r"\bINC-\d{8}-[A-Z0-9]{4,}\b")
# 7-12 hex characters wrapped in <code>...</code> (case-insensitive).
_CODE_REF_RE = re.compile(r"<code>([0-9a-f]{7,12})</code>", re.IGNORECASE)
# Default maximum size, in characters, of one HTML chunk built by _telegram_html_chunks.
_TELEGRAM_HTML_CHUNK_LIMIT = 3600
_AWOOOP_WEB_BASE_URL = "https://awoooi.wooo.work"
|
||
|
||
|
||
def _top_gateway_bucket(
|
||
buckets: list[dict[str, object]],
|
||
field: str,
|
||
) -> str | None:
|
||
if not buckets:
|
||
return None
|
||
top = max(buckets, key=lambda row: int(row.get("total") or 0))
|
||
value = top.get(field)
|
||
if value is None:
|
||
return None
|
||
return f"{value} ({top.get('total', 0)})"
|
||
|
||
|
||
def _format_gateway_summary_lines(summary: dict[str, object] | None) -> list[str]:
|
||
if not summary or int(summary.get("total") or 0) <= 0:
|
||
return []
|
||
|
||
by_agent = summary.get("by_agent") if isinstance(summary.get("by_agent"), list) else []
|
||
by_tool = summary.get("by_tool") if isinstance(summary.get("by_tool"), list) else []
|
||
by_scope = summary.get("by_scope") if isinstance(summary.get("by_scope"), list) else []
|
||
blockers = summary.get("blockers") if isinstance(summary.get("blockers"), list) else []
|
||
|
||
lines = [
|
||
"",
|
||
"🛡️ <b>MCP Gateway</b>",
|
||
(
|
||
"階段: "
|
||
f"<code>{html.escape(str(summary.get('stage') or 'unknown'))}</code>"
|
||
" / "
|
||
f"<code>{html.escape(str(summary.get('stage_status') or 'unknown'))}</code>"
|
||
),
|
||
(
|
||
"治理: "
|
||
f"first-class <code>{int(summary.get('first_class_total') or 0)}</code> / "
|
||
f"policy <code>{int(summary.get('policy_enforced_total') or 0)}</code> / "
|
||
f"legacy <code>{int(summary.get('legacy_bridge_total') or 0)}</code>"
|
||
),
|
||
]
|
||
|
||
agent = _top_gateway_bucket(by_agent, "agent_id")
|
||
tool = _top_gateway_bucket(by_tool, "tool_name")
|
||
scope = _top_gateway_bucket(by_scope, "required_scope")
|
||
if agent:
|
||
lines.append(f"Agent: <code>{html.escape(agent)}</code>")
|
||
if tool:
|
||
lines.append(f"Tool: <code>{html.escape(tool)}</code>")
|
||
if scope:
|
||
lines.append(f"Scope: <code>{html.escape(scope)}</code>")
|
||
if blockers:
|
||
lines.append(
|
||
"卡點: "
|
||
+ html.escape(", ".join(str(item) for item in blockers[:3]))
|
||
)
|
||
return lines
|
||
|
||
|
||
def _format_automation_quality_lines(quality: dict[str, object] | None) -> list[str]:
|
||
if not quality or quality.get("applicable") is False:
|
||
return []
|
||
|
||
facts = quality.get("facts") if isinstance(quality.get("facts"), dict) else {}
|
||
blockers = quality.get("blockers") if isinstance(quality.get("blockers"), list) else []
|
||
verdict = html.escape(str(quality.get("verdict") or "unknown"))
|
||
score = int(quality.get("score") or 0)
|
||
|
||
lines = [
|
||
"",
|
||
"🧪 <b>自動化品質</b>",
|
||
f"判定: <code>{verdict}</code> / <code>{score}</code>",
|
||
(
|
||
"執行: "
|
||
f"auto-repair <code>{int(facts.get('auto_repair_execution_records') or 0)}</code> / "
|
||
f"ops <code>{int(facts.get('automation_operation_records') or 0)}</code> / "
|
||
f"verify <code>{html.escape(str(facts.get('verification_result') or 'missing'))}</code>"
|
||
),
|
||
(
|
||
"證據: "
|
||
f"sensors <code>{int(facts.get('sensors_succeeded') or 0)}/"
|
||
f"{int(facts.get('sensors_attempted') or 0)}</code> / "
|
||
f"gateway <code>{int(facts.get('mcp_gateway_total') or 0)}</code> / "
|
||
f"KM <code>{int(facts.get('knowledge_entries') or 0)}</code>"
|
||
),
|
||
]
|
||
|
||
if blockers:
|
||
lines.append("缺口: " + html.escape(", ".join(str(item) for item in blockers[:4])))
|
||
return lines
|
||
|
||
|
||
def _format_remediation_history_lines(history: dict[str, object] | None) -> list[str]:
|
||
if not history or int(history.get("total") or 0) <= 0:
|
||
return []
|
||
|
||
items = history.get("items") if isinstance(history.get("items"), list) else []
|
||
latest = items[0] if items and isinstance(items[0], dict) else {}
|
||
agent = latest.get("agent_id") or "unknown_agent"
|
||
tool = latest.get("tool_name") or "current_state"
|
||
scope = latest.get("required_scope") or "unknown"
|
||
writes_incident = latest.get("writes_incident_state")
|
||
writes_auto_repair = latest.get("writes_auto_repair_result")
|
||
|
||
return [
|
||
"",
|
||
"🧪 <b>ADR-100 補救試跑</b>",
|
||
f"歷史: <code>{int(history.get('total') or 0)}</code> 次",
|
||
(
|
||
"上次: "
|
||
f"<code>{html.escape(str(latest.get('mode') or 'unknown'))}</code> / "
|
||
f"<code>{html.escape(str(latest.get('verification_result_preview') or 'unknown'))}</code>"
|
||
),
|
||
(
|
||
"MCP: "
|
||
f"<code>{html.escape(str(agent))}/{html.escape(str(tool))}</code> / "
|
||
f"<code>{html.escape(str(scope))}</code>"
|
||
),
|
||
(
|
||
"寫入: "
|
||
f"incident <code>{html.escape(str(writes_incident))}</code> / "
|
||
f"auto-repair <code>{html.escape(str(writes_auto_repair))}</code>"
|
||
),
|
||
]
|
||
|
||
|
||
def _awooop_runs_url_for_incident(incident_id: str) -> str:
    """Build the AwoooP "runs" page URL filtered to one incident.

    Args:
        incident_id: Incident identifier; a falsy value yields an empty
            ``incident_id`` query parameter.

    Returns:
        The absolute URL under ``_AWOOOP_WEB_BASE_URL``.
    """
    # safe="" percent-encodes every reserved character, including "/".
    encoded_id = quote(str(incident_id or ""), safe="")
    return (
        f"{_AWOOOP_WEB_BASE_URL}/zh-TW/awooop/runs"
        f"?project_id=awoooi&incident_id={encoded_id}"
    )
|
||
|
||
|
||
def _awooop_runs_button_row(incident_id: str) -> list[dict[str, str]]:
    """Return one inline-keyboard row holding the AwoooP link button.

    Args:
        incident_id: Incident the button should link to.

    Returns:
        A single-button row, or ``[]`` when ``incident_id`` is falsy.
    """
    if not incident_id:
        return []
    button = {
        "text": "🧭 AwoooP",
        "url": _awooop_runs_url_for_incident(incident_id),
    }
    return [button]
|
||
|
||
|
||
def _awooop_runs_reply_markup(incident_id: str) -> dict | None:
    """Wrap the AwoooP button row in an ``inline_keyboard`` reply markup.

    Args:
        incident_id: Incident the button should link to.

    Returns:
        ``{"inline_keyboard": [row]}``, or ``None`` when no button row can be
        built for ``incident_id``.
    """
    row = _awooop_runs_button_row(incident_id)
    return {"inline_keyboard": [row]} if row else None
|
||
|
||
|
||
def _latest_remediation_history_item(history: dict[str, object] | None) -> dict[str, object]:
|
||
if not history:
|
||
return {}
|
||
items = history.get("items") if isinstance(history.get("items"), list) else []
|
||
latest = items[0] if items and isinstance(items[0], dict) else {}
|
||
return latest
|
||
|
||
|
||
def _remediation_evidence_state(history: dict[str, object] | None) -> str:
|
||
"""Classify ADR-100 dry-run evidence for first-screen Telegram status."""
|
||
if not history:
|
||
return ""
|
||
|
||
total = int(history.get("total") or 0)
|
||
if total <= 0:
|
||
if history.get("status") == "fetch_failed":
|
||
return "fetch_failed"
|
||
return "missing"
|
||
|
||
latest = _latest_remediation_history_item(history)
|
||
if latest.get("writes_incident_state") or latest.get("writes_auto_repair_result"):
|
||
return "write_observed"
|
||
if latest.get("allowed") is False or latest.get("success") is False:
|
||
return "blocked"
|
||
|
||
safety_level = str(latest.get("safety_level") or "").lower()
|
||
required_scope = str(latest.get("required_scope") or "").lower()
|
||
if safety_level == "read_only" or required_scope == "read":
|
||
return "read_only"
|
||
return "observed"
|
||
|
||
|
||
def _format_remediation_evidence_block(history: dict[str, object] | None) -> str:
|
||
"""Compact ADR-100 dry-run evidence for the root ACTION REQUIRED card."""
|
||
if not history:
|
||
return ""
|
||
|
||
state = _remediation_evidence_state(history)
|
||
total = int(history.get("total") or 0)
|
||
if total <= 0:
|
||
label = (
|
||
"補救試跑查詢失敗"
|
||
if state == "fetch_failed"
|
||
else "尚無補救試跑紀錄"
|
||
)
|
||
return f"🧪 AI 證據:<code>{html.escape(label)}</code>\n"
|
||
|
||
latest = _latest_remediation_history_item(history)
|
||
agent = latest.get("agent_id") or "unknown_agent"
|
||
tool = latest.get("tool_name") or "current_state"
|
||
scope = latest.get("required_scope") or "unknown"
|
||
writes_incident = str(bool(latest.get("writes_incident_state"))).lower()
|
||
writes_auto_repair = str(bool(latest.get("writes_auto_repair_result"))).lower()
|
||
route = f"{agent}/{tool}/{scope}"
|
||
preview = latest.get("verification_result_preview") or "unknown"
|
||
|
||
state_label = {
|
||
"read_only": "只讀試跑",
|
||
"write_observed": "有寫入旗標",
|
||
"blocked": "試跑受阻",
|
||
"observed": "已試跑",
|
||
}.get(state, "已試跑")
|
||
|
||
return (
|
||
f"🧪 AI 證據:<b>{state_label} {total} 次</b> | "
|
||
f"<code>{html.escape(str(route))}</code>\n"
|
||
f"├ preview:<code>{html.escape(str(preview))}</code>\n"
|
||
f"└ 寫入:incident <code>{writes_incident}</code> / "
|
||
f"auto-repair <code>{writes_auto_repair}</code>\n"
|
||
)
|
||
|
||
|
||
async def _fetch_remediation_summary_for_card(
|
||
*,
|
||
approval_id: str,
|
||
incident_id: str,
|
||
) -> dict[str, object] | None:
|
||
if not incident_id:
|
||
return None
|
||
try:
|
||
from src.services.adr100_remediation_service import (
|
||
get_adr100_remediation_service,
|
||
)
|
||
|
||
history = await asyncio.wait_for(
|
||
get_adr100_remediation_service().history(
|
||
limit=5,
|
||
incident_id=incident_id,
|
||
),
|
||
timeout=2.5,
|
||
)
|
||
return history if isinstance(history, dict) else None
|
||
except Exception as remediation_exc:
|
||
logger.debug(
|
||
"telegram_approval_card_remediation_history_fetch_failed",
|
||
approval_id=approval_id,
|
||
incident_id=incident_id,
|
||
error=str(remediation_exc),
|
||
)
|
||
return {
|
||
"schema_version": "adr100_remediation_history_v1",
|
||
"total": 0,
|
||
"items": [],
|
||
"status": "fetch_failed",
|
||
}
|
||
|
||
|
||
def _telegram_html_chunks(lines: list[str], limit: int = _TELEGRAM_HTML_CHUNK_LIMIT) -> list[str]:
    """Split HTML messages by complete lines so Telegram does not receive broken tags.

    Lines are packed greedily into newline-joined chunks. A single line that
    is longer than ``limit`` cannot be split safely, so it is flushed as its
    own chunk after being flattened to escaped plain text.

    Args:
        lines: Message lines; each element is converted with ``str``.
        limit: Maximum chunk size in characters.

    Returns:
        The chunks in order. Overlong lines that flatten to nothing are
        dropped rather than emitted as empty chunks.
    """
    chunks: list[str] = []
    pending: list[str] = []
    pending_len = 0
    for raw_line in lines:
        line = str(raw_line)
        cost = len(line) + 1  # +1 budgets for the joining newline
        if pending and pending_len + cost > limit:
            chunks.append("\n".join(pending))
            pending = []
            pending_len = 0
        if cost > limit:
            flattened = _html_safe_plain_chunk(line, limit=limit)
            # An empty chunk would be sent as an empty message.
            if flattened:
                chunks.append(flattened)
            continue
        pending.append(line)
        pending_len += cost
    if pending:
        chunks.append("\n".join(pending))
    return chunks
|
||
|
||
|
||
def _plain_text_from_html(text: str, limit: int = 3900) -> str:
    """Fallback renderer for Telegram HTML parse failures.

    Args:
        text: HTML-formatted message text.
        limit: Maximum number of characters to return.

    Returns:
        ``text`` with every ``<...>`` tag removed and HTML entities decoded,
        cut to ``limit`` characters.
    """
    without_tags = re.sub(r"</?[^>]+>", "", text)
    # Decode after stripping so an escaped "&lt;b&gt;" survives as literal text.
    return html.unescape(without_tags)[:limit]
|
||
|
||
|
||
def _html_safe_plain_chunk(text: str, limit: int) -> str:
    """Render one overlong HTML line as parse-safe text for HTML mode chunks.

    Args:
        text: HTML-formatted line.
        limit: Maximum number of characters in the result.

    Returns:
        The line with tags stripped and the remaining text HTML-escaped, at
        most ``limit`` characters long.
    """
    plain = _plain_text_from_html(text, limit=limit)
    # Escaping may expand &, <, >, so trim once more after escaping. A partial
    # HTML entity is still plain text to Telegram, while a partial tag is not.
    return html.escape(plain)[:limit]
|
||
|
||
|
||
def _sanitize_telegram_error(text: str) -> str:
    """Mask the bot token in Telegram Bot API URLs found in ``text``.

    Keeps exception strings from leaking the token into logs and traces.

    Args:
        text: Error text that may contain ``api.telegram.org/bot<token>``.

    Returns:
        ``text`` with each token segment replaced by ``<redacted>``.
    """
    return _TELEGRAM_BOT_URL_RE.sub(r"\1<redacted>", text)
|
||
|
||
|
||
def _is_noisy_failure_update(status_line: str) -> bool:
    """Tell whether a status line is a failure summary prone to cross-incident spam.

    Args:
        status_line: Status text of an incident update.

    Returns:
        ``True`` when the line contains one of the two failure phrases.
    """
    noisy_markers = ("AI 自動修復失敗", "AI 診斷工具失敗")
    return any(marker in status_line for marker in noisy_markers)
|
||
|
||
|
||
def _extract_incident_id_from_text(text: str) -> str | None:
    """Extract the first incident ID from outbound Telegram text.

    Args:
        text: Message text; a falsy value is treated as an empty string.

    Returns:
        The first match of ``_INCIDENT_ID_RE``, or ``None`` when there is none.
    """
    found = _INCIDENT_ID_RE.search(text or "")
    if found is None:
        return None
    return found.group(0)
|
||
|
||
|
||
def _has_reply_context(payload: dict) -> bool:
    """Tell whether a send payload carries a reply target.

    Args:
        payload: Telegram send payload.

    Returns:
        ``True`` when either ``reply_to_message_id`` or ``reply_parameters``
        is a key of ``payload``, whatever its value.
    """
    return any(key in payload for key in ("reply_to_message_id", "reply_parameters"))
|
||
|
||
|
||
def _is_root_action_required_card(text: str) -> bool:
    """Tell whether ``text`` is a root ACTION REQUIRED alert card.

    Root cards are not auto-replied, so a new root card is not threaded under
    an older message.

    Args:
        text: Outbound message text.

    Returns:
        ``True`` when the text contains both ``ACTION REQUIRED`` and the
        automation-chain heading.
    """
    required_markers = ("ACTION REQUIRED", "AI 自動化鏈路")
    return all(marker in text for marker in required_markers)
|
||
|
||
|
||
def _legacy_outbound_run_id(chat_id: str, provider_message_id: str) -> UUID:
    """Derive a stable soft run ID for a legacy Telegram send without a run_id.

    The ID lets the Channel Hub link the message. It is a name-based UUID
    (version 5), so the same chat/message pair always maps to the same value.

    Args:
        chat_id: Telegram chat identifier.
        provider_message_id: Message identifier assigned by Telegram.

    Returns:
        The UUIDv5 of ``awoooi:legacy-telegram:<chat_id>:<provider_message_id>``
        in the URL namespace.
    """
    seed = f"awoooi:legacy-telegram:{chat_id}:{provider_message_id}"
    return uuid5(NAMESPACE_URL, seed)
|
||
|
||
|
||
def _infer_outbound_message_type(text: str, payload: dict) -> str:
    """Map a legacy Telegram message onto the closed AwoooP outbound_message types.

    Args:
        text: Outbound message text.
        payload: Telegram send payload.

    Returns:
        ``"approval_request"``, ``"error"`` or ``"final"``. Rules are applied
        in order: runbook-review wording; replies (error or final, never an
        approval); any reply markup; approval wording; failure wording.
    """
    if "RUNBOOK REVIEW" in text or "待審核" in text:
        return "approval_request"

    # The same failure wording decides the outcome for replies and root messages.
    looks_failed = any(marker in text for marker in ("失敗", "錯誤", "FAILED"))

    if _has_reply_context(payload):
        return "error" if looks_failed else "final"
    if payload.get("reply_markup"):
        return "approval_request"
    if any(marker in text for marker in ("ACTION REQUIRED", "待審", "審批")):
        return "approval_request"
    return "error" if looks_failed else "final"
|
||
|
||
|
||
def _outbound_payload_hash(payload: dict) -> str:
    """Stable hash for Telegram payload replay without storing raw payload.

    The payload is serialised as JSON with sorted keys and non-ASCII kept
    as-is; values JSON cannot encode are stringified via ``default=str``.

    Args:
        payload: Telegram send payload.

    Returns:
        Hex SHA-256 digest of the UTF-8 encoded canonical JSON.
    """
    canonical = json.dumps(payload, ensure_ascii=False, sort_keys=True, default=str)
    # surrogatepass keeps a lone surrogate from raising UnicodeEncodeError;
    # well-formed text encodes exactly as plain UTF-8.
    return hashlib.sha256(canonical.encode("utf-8", errors="surrogatepass")).hexdigest()
|
||
|
||
|
||
def _reply_markup_summary(payload: dict) -> dict[str, object]:
    """Summarize Telegram buttons without turning callback payloads into policy.

    Only the part of ``callback_data`` before the first ``:`` is kept, so the
    summary never carries the full callback payload.

    Args:
        payload: Telegram send payload.

    Returns:
        ``{"present": False, "button_count": 0}`` when ``reply_markup`` is not
        a dict. Otherwise ``present``, the total ``button_count``, up to 12
        button summaries and a ``truncated`` flag. Rows and buttons of an
        unexpected type are skipped.
    """
    reply_markup = payload.get("reply_markup")
    if not isinstance(reply_markup, dict):
        return {"present": False, "button_count": 0}

    keyboard = reply_markup.get("inline_keyboard")
    # A malformed keyboard counts as "no rows" instead of raising TypeError.
    rows = keyboard if isinstance(keyboard, (list, tuple)) else []

    buttons: list[dict[str, object]] = [
        {
            "text": str(button.get("text") or ""),
            # "".split(":", 1)[0] is "", which covers the no-callback case.
            "callback_prefix": str(button.get("callback_data") or "").split(":", 1)[0],
            "has_url": bool(button.get("url")),
        }
        for row in rows
        if isinstance(row, list)
        for button in row
        if isinstance(button, dict)
    ]

    return {
        "present": True,
        "button_count": len(buttons),
        "buttons": buttons[:12],
        "truncated": len(buttons) > 12,
    }
|
||
|
||
|
||
def _outbound_source_envelope(method: str, payload: dict) -> dict[str, object]:
    """Build a redaction-friendly source envelope for Channel Hub replay.

    The envelope stores a hash, key names and summaries of the payload rather
    than the message text itself.

    Args:
        method: Telegram API method name, recorded as given.
        payload: Telegram send payload; ``text`` (or ``caption``) is scanned
            for incident IDs and ``<code>`` hex references.

    Returns:
        The envelope dict; ``source_refs`` holds at most 20 sorted, de-duplicated
        incident IDs and 20 code references.
    """
    text = str(payload.get("text") or payload.get("caption") or "")
    incident_ids = sorted(set(_INCIDENT_ID_RE.findall(text)))
    code_refs = sorted({match.group(1) for match in _CODE_REF_RE.finditer(text)})
    return {
        "adapter": "legacy_telegram_gateway",
        "method": method,
        "payload_sha256": _outbound_payload_hash(payload),
        "payload_keys": sorted(str(key) for key in payload),
        "parse_mode": payload.get("parse_mode"),
        "disable_web_page_preview": payload.get("disable_web_page_preview"),
        "has_reply_context": _has_reply_context(payload),
        "reply_markup": _reply_markup_summary(payload),
        "source_refs": {
            "incident_ids": incident_ids[:20],
            "code_refs": code_refs[:20],
        },
    }
|
||
|
||
# 2026-04-27 Claude Sonnet 4.6: B3 - feature flag for LLM-driven dynamic Telegram buttons
# true  -> prefer buttons generated from ActionPlan.recommended_actions
# false -> keep the existing callback_action_spec.yaml path (default, backward compatible)
USE_LLM_DYNAMIC_BUTTONS = os.environ.get("USE_LLM_DYNAMIC_BUTTONS", "false").lower() == "true"
|
||
|
||
# =============================================================================
# OTEL tracer (Phase C P1 observability)
# 2026-03-30 Claude Code: added Telegram Gateway tracing
# =============================================================================
_tracer = trace.get_tracer("awoooi.telegram_gateway", "1.0.0")
|
||
|
||
|
||
# =============================================================================
|
||
# 智能截斷 (2026-04-17 ogt + Claude Sonnet 4.6 — ADR-075 修復)
|
||
# 根因:粗暴 [:N] 在括號/中文字中間切斷 → 幽靈截斷「質疑:無(通」
|
||
# 規則:在完整句子邊界截斷;若無邊界則補 …[截斷] 標記
|
||
# =============================================================================
|
||
|
||
def _smart_truncate(text: str, limit: int, suffix: str = "…[截斷]") -> str:
    """Truncate ``text`` at a sentence boundary instead of mid-phrase.

    Boundary characters are tried in priority order. A boundary is accepted
    only if it lies at or beyond half of ``limit``; the cut then falls just
    after it. With no acceptable boundary the text is cut at ``limit``.

    Args:
        text: Text to shorten.
        limit: Maximum number of characters kept from ``text``.
        suffix: Marker appended whenever truncation happens.

    Returns:
        ``text`` unchanged when it already fits; otherwise the kept prefix
        plus ``suffix``, so the result can exceed ``limit`` by ``len(suffix)``.
    """
    if len(text) <= limit:
        return text
    # NOTE(review): "," appears twice below; one was presumably the full-width
    # comma before extraction normalised it - confirm against the repository.
    for boundary in ("。", "!", "?", ";", ",", "、", ",", " "):
        pos = text.rfind(boundary, 0, limit)
        if pos >= limit // 2:  # a boundary only counts in the second half
            return text[:pos + len(boundary)] + suffix
    # No usable boundary: hard cut plus marker.
    return text[:limit] + suffix
|
||
|
||
|
||
def _format_resolved_guard_stamp(resolved_at: datetime | None) -> str:
|
||
"""格式化 ADR-071-D 已解決狀態守衛文案。"""
|
||
if resolved_at is None:
|
||
return "✅ 此事件已解決"
|
||
return f"✅ 此事件已於 {resolved_at.strftime('%Y-%m-%d %H:%M')} 解決"
|
||
|
||
|
||
# =============================================================================
# Long polling configuration (Phase 5 intranet fix)
# =============================================================================
LONG_POLLING_TIMEOUT = 30  # getUpdates timeout (seconds)
LONG_POLLING_RETRY_DELAY = 5  # delay before retrying after an error (seconds)
|
||
|
||
|
||
# =============================================================================
|
||
# SignOz Metrics Block (v7.0)
|
||
# =============================================================================
|
||
|
||
@dataclass
class SignOzMetricsBlock:
    """SignOz metrics section embedded in a Telegram card.

    Rendered layout::

        📊 SignOz 指標
        ├ RPS: 150.2 📈
        ├ Error: 🟢 0.50%
        └ P99: 245ms ➡️
    """

    rps: float = 0.0
    rps_trend: str = "stable"  # up, down, stable
    error_rate: float = 0.0  # compared against 1 and 5, rendered with a "%" sign
    p99_latency_ms: float = 0.0
    latency_trend: str = "stable"  # up, down, stable
    trace_url: str = ""  # not used by format()

    def format(self) -> str:
        """Render the metrics as Telegram HTML.

        Returns:
            Four newline-separated lines with no trailing newline. Unknown
            trend values fall back to the "stable" arrow.
        """
        trend_emoji = {"up": "📈", "down": "📉", "stable": "➡️"}

        # Traffic light: below 1 is green, below 5 is yellow, otherwise red.
        if self.error_rate < 1:
            error_emoji = "🟢"
        elif self.error_rate < 5:
            error_emoji = "🟡"
        else:
            error_emoji = "🔴"

        rps_arrow = trend_emoji.get(self.rps_trend, "➡️")
        latency_arrow = trend_emoji.get(self.latency_trend, "➡️")

        return (
            f"📊 <b>SignOz 指標</b>\n"
            f"├ RPS: <code>{self.rps:.1f}</code> {rps_arrow}\n"
            f"├ Error: {error_emoji} <code>{self.error_rate:.2f}%</code>\n"
            f"└ P99: <code>{self.p99_latency_ms:.0f}ms</code> {latency_arrow}"
        )
|
||
|
||
|
||
# =============================================================================
|
||
# SOUL.md 訊息格式定義 (v7.0 + SignOz)
|
||
# =============================================================================
|
||
|
||
@dataclass
|
||
class TelegramMessage:
|
||
"""
|
||
Telegram 訊息結構 (SOUL.md 4.1 + v7.0 SignOz 整合)
|
||
|
||
格式:
|
||
═══════════════════════════
|
||
🚨 CRITICAL | harbor-core
|
||
═══════════════════════════
|
||
📋 INC-20260321-0001
|
||
🎯 資源: harbor-core-7d4b8c9f5
|
||
━━━━━━━━━━━━━━━━━━━
|
||
🤖 AI 仲裁判定
|
||
👥 責任: BE (後端)
|
||
📊 信心: 🟢 88%
|
||
💡 原因: JVM Heap 配置不當
|
||
━━━━━━━━━━━━━━━━━━━
|
||
📊 SignOz 指標
|
||
├ RPS: 150.2 📈
|
||
├ Error: 🟢 0.5%
|
||
└ P99: 245ms ➡️
|
||
━━━━━━━━━━━━━━━━━━━
|
||
🔧 建議: 刪除 Pod
|
||
⏱️ 停機: ~30s
|
||
🔍 SignOz Trace (±5min)
|
||
|
||
[✅ 簽核] [❌ 拒絕] [⚡ 自動調優]
|
||
"""
|
||
status_emoji: str # 🚨, ⚠️, ℹ️
|
||
risk_level: str # CRITICAL, MEDIUM, LOW
|
||
resource_name: str # Pod/Deployment 名稱 (max 50)
|
||
root_cause: str # 根因摘要 (max 100)
|
||
suggested_action: str # 建議操作 (max 50)
|
||
estimated_downtime: str # 預計停機時間
|
||
approval_id: str # 簽核單 ID
|
||
# v6.0 AI 仲裁欄位
|
||
incident_id: str = "" # 事件編號 INC-YYYYMMDD-XXXX
|
||
primary_responsibility: str = "COLLAB" # FE/BE/INFRA/DB/COLLAB
|
||
confidence: float = 0.0 # 信心度 0.0-1.0
|
||
namespace: str = "default" # K8s namespace
|
||
# v7.0 SignOz 整合
|
||
signoz_metrics: SignOzMetricsBlock | None = None
|
||
signoz_trace_url: str = "" # 動態時間參數 URL
|
||
auto_tuning_command: str = "" # kubectl 調優指令
|
||
# 2026-03-29 ogt: AI Token/Cost 追蹤
|
||
ai_tokens: int = 0 # LLM Token 使用量
|
||
ai_cost: float = 0.0 # LLM 成本 (USD)
|
||
# 2026-03-29 ogt: ADR-037 異常頻率統計
|
||
anomaly_frequency: dict | None = None # AnomalyCounter 統計
|
||
# 2026-03-29 ogt: AI Provider 來源顯示
|
||
ai_provider: str = "" # ollama/gemini/claude/expert_system/mock
|
||
# 2026-04-04 ogt: 底層模型名稱 (e.g. qwen2.5:7b-instruct, nemotron-70b)
|
||
ai_model: str = ""
|
||
# 2026-04-16 ogt + Claude Sonnet 4.6: 告警分類與修復鏈路顯示 (ADR-076)
|
||
alert_category: str = "" # host/k8s/database/service/external_site/secops 等
|
||
playbook_name: str = "" # 匹配到的 Playbook 名稱(空字串=規則匹配)
|
||
automation_state: str = "" # diagnosis_collected_manual_required / diagnosis_failed_manual_required
|
||
automation_quality: dict | None = None # truth-chain automation_quality 摘要
|
||
remediation_summary: dict | None = None # ADR-100 read-only dry-run history 摘要
|
||
|
||
# ==========================================================================
|
||
# Phase 22: Nemotron 協作欄位 (ADR-044)
|
||
# 2026-03-31 Claude Code: OpenClaw + Nemotron 雙軌顯示
|
||
# ==========================================================================
|
||
nemotron_enabled: bool = False # 是否啟用 Nemotron 協作
|
||
nemotron_tool_model: str = "" # Tool Calling 模型 (e.g. "llama3.1:8b")
|
||
nemotron_tool_backend: str = "" # Tool Calling 後端 (e.g. "Ollama 本機" / "NVIDIA 雲端")
|
||
nemotron_tools: list[dict] | None = None # Tool Calling 結果 [{"tool": str, "args": dict, "valid": bool}]
|
||
nemotron_validation: str = "" # "✅ 驗證通過" / "❌ 驗證失敗" / "⏳ 驗證中"
|
||
nemotron_latency_ms: float = 0.0 # Nemotron 呼叫延遲 (ms)
|
||
|
||
def _provider_display(self) -> tuple[str, str]:
    """Return the display name of the AI provider and an optional model suffix.

    Method of ``TelegramMessage``; reads ``ai_provider``, ``ai_model`` and
    ``confidence``.

    2026-05-04 ogt: shows the concrete Ollama server (GCP-A/B/Local).

    Returns:
        ``(provider_display, model_suffix)``. The provider is looked up
        case-insensitively; unknown providers are shown upper-cased. With no
        provider the result is ``"AI Router"`` when ``confidence`` is positive
        and ``"rule_fallback"`` otherwise. ``model_suffix`` is
        ``" (<escaped model>)"`` or ``""``; only the suffix is HTML-escaped.
    """
    provider_names = {
        "ollama": "Ollama",
        # 2026-05-04 ogt: ADR-110 three-tier failover, concrete server identification
        "ollama_gcp_a": "Ollama GCP-A (34.143.170.20)",
        "ollama_gcp_b": "Ollama GCP-B (34.21.145.224)",
        "ollama_local": "Ollama Local (111)",
        "gemini": "Gemini",
        "claude": "Claude",
        "nvidia": "Nemotron",
        "openclaw_nemo": "OpenClaw Nemo",
        "openclaw_nvidia_nim": "OpenClaw Nemo",
        "openclaw_qwen": "OpenClaw Nemo",
    }
    stripped = (self.ai_provider or "").strip()
    provider = stripped.lower()
    if provider:
        # Fall back to the stripped value so stray whitespace is not displayed.
        provider_display = provider_names.get(provider, stripped.upper())
    elif self.confidence > 0:
        provider_display = "AI Router"
    else:
        provider_display = "rule_fallback"
    model_suffix = f" ({html.escape(self.ai_model)})" if self.ai_model else ""
    return provider_display, model_suffix
|
||
|
||
def _automation_mode(self) -> str:
    """Classify the card into one of four automation modes.

    Method of ``TelegramMessage``; reads ``root_cause``, ``suggested_action``
    and ``confidence``.

    Returns:
        ``"llm_timeout_manual_gate"`` when the root cause or action mentions a
        timeout; ``"ai_proposal_ready"`` when confidence is positive and an
        action other than the "pending analysis" placeholder is set;
        ``"analysis_degraded"`` when the action is a placeholder, empty or
        ``NO_ACTION``; otherwise ``"safe_gate_pending"``.
    """
    combined = f"{self.root_cause} {self.suggested_action}".lower()
    if "超時" in combined or "timeout" in combined:
        return "llm_timeout_manual_gate"

    action = self.suggested_action
    # Checked before the degraded case, so a positive-confidence NO_ACTION
    # still counts as a ready proposal here.
    if self.confidence > 0 and action and action != "待分析":
        return "ai_proposal_ready"
    if action in {"待分析", "", "NO_ACTION"}:
        return "analysis_degraded"
    return "safe_gate_pending"
|
||
|
||
def _automation_status_summary(self) -> str:
    """Return the human-readable handling status shown first on the card.

    Method of ``TelegramMessage``. This line is the on-call entry point: it
    says whether the card is an AI proposal awaiting approval, something the
    AI could not fix, or a pure observation; details follow in later blocks.

    Evidence is consulted from strongest to weakest: verified auto-repair,
    recorded executions, remediation dry-run state, quality verdict,
    diagnosis state, then heuristics on mode, action and confidence.

    Returns:
        One status sentence prefixed with a status emoji.
    """
    mode = self._automation_mode()
    action = (self.suggested_action or "").upper()
    text = f"{self.root_cause} {self.suggested_action}".lower()
    state = (self.automation_state or "").lower()
    quality = self.automation_quality or {}
    facts = quality.get("facts") if isinstance(quality.get("facts"), dict) else {}
    verdict = str(quality.get("verdict") or "")
    auto_repair_records = int(facts.get("auto_repair_execution_records") or 0)
    operation_records = int(facts.get("automation_operation_records") or 0)
    verification = str(facts.get("verification_result") or "missing")
    remediation_state = _remediation_evidence_state(self.remediation_summary)

    if verdict == "auto_repaired_verified":
        return "✅ 已驗證自動修復完成"
    if auto_repair_records > 0 or operation_records > 0:
        if verification == "missing":
            return "🔄 已自動執行,等待驗證證據"
        return f"🔄 已自動執行,驗證結果:{verification}"

    # Remediation states are mutually exclusive, so a lookup replaces the chain.
    remediation_messages = {
        "read_only": "🔎 AI 已完成只讀補救試跑,等待人工審批",
        "write_observed": "⚠️ AI 補救試跑出現寫入旗標,需人工確認",
        "blocked": "🔴 AI 補救試跑受阻,需人工處理",
        "fetch_failed": "🟠 AI 補救試跑證據查詢失敗,需人工判斷",
    }
    if remediation_state in remediation_messages:
        return remediation_messages[remediation_state]

    if verdict == "approval_required":
        return "🟡 需要審批後才會執行"
    if verdict.startswith("manual_required"):
        return "🟠 未自動修復,需人工判斷"

    if state == "diagnosis_collected_manual_required":
        return "🔎 AI 已完成只讀診斷,需人工判斷"
    if state == "diagnosis_failed_manual_required":
        return "🔴 AI 診斷工具失敗,需人工排查"
    if mode == "llm_timeout_manual_gate":
        return "🔴 AI 分析超時,需人工排查"
    if action in {"NO_ACTION", "待分析", ""} or "invalid_target" in text:
        return "🟠 AI 無可安全執行動作,需人工判斷"
    if self.confidence <= 0:
        return "🟡 規則建議待審批"
    # A separate "analysis_degraded" check used to follow here. It could never
    # fire: that mode needs a placeholder/empty/NO_ACTION action, and all of
    # those are already caught by the action check above.
    if mode == "ai_proposal_ready":
        return "🟡 AI 已提出修復建議,等待人工批准"
    return "🟡 安全閘門待審批"
|
||
|
||
def _format_automation_block(self) -> str:
|
||
"""Visible AI automation chain for every ACTION REQUIRED card.
|
||
2026-05-04 ogt: 加入 Token 用量 + 具體 Ollama 伺服器顯示
|
||
"""
|
||
provider_display, model_suffix = self._provider_display()
|
||
mode = self._automation_mode()
|
||
openclaw_state = provider_display if provider_display != "rule_fallback" else "degraded"
|
||
nemotron_state = "tool_ready" if self.nemotron_enabled else "standby"
|
||
hermes_state = self.playbook_name or "rule_catalog"
|
||
elephant_state = "timeline_km_pending"
|
||
flow = "webhook>investigator>router>llm/rule>safe>approval"
|
||
|
||
# 2026-05-04 ogt: Token 用量顯示(有資料才顯示)
|
||
token_line = ""
|
||
if self.ai_tokens > 0:
|
||
cost_str = f" / ${self.ai_cost:.4f}" if self.ai_cost > 0 else ""
|
||
token_line = f"├ Tokens:<code>{self.ai_tokens:,}{cost_str}</code>\n"
|
||
|
||
return (
|
||
f"🤖 <b>AI 自動化鏈路</b>\n"
|
||
f"├ Router:<code>{html.escape(provider_display)}{model_suffix}</code>\n"
|
||
f"├ Mode:<code>{html.escape(mode)}</code>\n"
|
||
f"├ OpenClaw:<code>{html.escape(openclaw_state)}</code> | "
|
||
f"NemoTron:<code>{html.escape(nemotron_state)}</code>\n"
|
||
f"├ Hermes:<code>{html.escape(hermes_state)}</code> | "
|
||
f"ElephantAlpha:<code>{html.escape(elephant_state)}</code>\n"
|
||
f"{token_line}"
|
||
f"└ Flow:<code>{flow}</code>\n"
|
||
)
|
||
|
||
def _format_flow_progress_block(self) -> str:
|
||
"""Operator-facing state of where the alert is in the automation loop."""
|
||
quality = self.automation_quality or {}
|
||
facts = quality.get("facts") if isinstance(quality.get("facts"), dict) else {}
|
||
verdict = str(quality.get("verdict") or self._automation_mode())
|
||
|
||
action_upper = (self.suggested_action or "").upper()
|
||
is_noop = (
|
||
"NO_ACTION" in action_upper
|
||
or action_upper.startswith("OBSERVE")
|
||
or action_upper.startswith("INVESTIGATE")
|
||
or not action_upper.strip()
|
||
or action_upper == "待分析"
|
||
)
|
||
auto_repair_records = int(facts.get("auto_repair_execution_records") or 0)
|
||
operation_records = int(facts.get("automation_operation_records") or 0)
|
||
verification = str(facts.get("verification_result") or "missing")
|
||
gateway_total = int(facts.get("mcp_gateway_total") or 0)
|
||
km_entries = int(facts.get("knowledge_entries") or 0)
|
||
|
||
if self.confidence > 0:
|
||
diagnose_state = "ai_ready"
|
||
elif self.automation_state == "diagnosis_failed_manual_required":
|
||
diagnose_state = "failed"
|
||
else:
|
||
diagnose_state = "rule_or_degraded"
|
||
|
||
match_state = self.playbook_name or "rule_catalog"
|
||
if auto_repair_records > 0:
|
||
execute_state = f"auto_repair_recorded:{auto_repair_records}"
|
||
elif operation_records > 0:
|
||
execute_state = f"operation_recorded:{operation_records}"
|
||
elif is_noop:
|
||
execute_state = "no_action_or_observe"
|
||
elif "approval" in verdict or self._automation_mode() == "ai_proposal_ready":
|
||
execute_state = "awaiting_approval"
|
||
else:
|
||
execute_state = "not_started"
|
||
|
||
if verification != "missing":
|
||
verify_state = verification
|
||
elif auto_repair_records > 0 or operation_records > 0:
|
||
verify_state = "pending_or_missing"
|
||
else:
|
||
verify_state = "not_started"
|
||
|
||
if verdict == "auto_repaired_verified":
|
||
conclusion = "已驗證自動修復"
|
||
elif auto_repair_records > 0 or operation_records > 0:
|
||
conclusion = "已記錄執行,等待或缺少驗證"
|
||
elif is_noop:
|
||
conclusion = "未自動修復,需人工判斷"
|
||
elif "approval" in verdict:
|
||
conclusion = "等待審批後才會執行"
|
||
elif "manual" in verdict:
|
||
conclusion = "轉人工處理"
|
||
else:
|
||
conclusion = "尚未形成可宣稱自動修復的證據鏈"
|
||
|
||
return (
|
||
"🧭 <b>流程進度</b>\n"
|
||
f"├ 收件:<code>received</code> | 診斷:<code>{html.escape(diagnose_state)}</code>\n"
|
||
f"├ 匹配:<code>{html.escape(str(match_state)[:60])}</code> | "
|
||
f"執行:<code>{html.escape(execute_state)}</code>\n"
|
||
f"├ 驗證:<code>{html.escape(verify_state)}</code> | "
|
||
f"KM:<code>{km_entries}</code> | MCP:<code>{gateway_total}</code>\n"
|
||
f"└ 判定:<code>{html.escape(verdict)}</code> — {html.escape(conclusion)}\n"
|
||
)
|
||
|
||
def _format_remediation_evidence_block(self) -> str:
    """Render this card's ADR-100 dry-run evidence block.

    Method of ``TelegramMessage``; delegates to the module-level function of
    the same name with ``self.remediation_summary``.

    Returns:
        The HTML block produced by that function, or ``""`` when there is no
        remediation summary.
    """
    summary = self.remediation_summary
    return _format_remediation_evidence_block(summary)
|
||
|
||
def format(self) -> str:
    """
    Render the approval card as Telegram HTML (AI arbitration + SignOz).

    Sections, in order: header with risk level, incident id, resource,
    optional category, handling status, remediation evidence, flow progress,
    automation block, AI diagnosis, suggested action, then the optional
    AI-cost, SignOz and frequency sections, and finally estimated downtime.

    Free-text fields read from ``self`` (resource, root cause, action,
    downtime, category, playbook, escalation level, ...) are HTML-escaped
    before interpolation so Telegram does not reject the message with
    "Can't parse entities".

    Returns:
        str: the message, sliced to 4096 characters (Telegram hard limit).
        The slice is a plain character cut and is not tag-aware.
    """
    # Owning-team label
    resp_map = {
        "FE": "👨💻 FE (前端)",
        "BE": "⚙️ BE (後端)",
        "INFRA": "🏗️ INFRA (基礎設施)",
        "DB": "🗄️ DB (資料庫)",
        "COLLAB": "🤝 COLLAB (協同處理)",
    }
    resp_display = resp_map.get(self.primary_responsibility, "❓ 未知")

    # Confidence badge
    confidence_pct = int(self.confidence * 100)
    if confidence_pct >= 80:
        conf_emoji = "🟢"
    elif confidence_pct >= 70:
        conf_emoji = "🟡"
    else:
        conf_emoji = "🔴"

    # Incident number: avoid producing a doubled "INC-INC-" prefix
    if self.incident_id:
        incident_id = self.incident_id
    elif self.approval_id.startswith("INC-"):
        incident_id = self.approval_id
    else:
        incident_id = f"INC-{self.approval_id[:8].upper()}"

    # SignOz metrics section (with trace link); the dynamic URL wins when set
    signoz_block = ""
    if self.signoz_metrics:
        service_name = self.resource_name.split("-")[0] if "-" in self.resource_name else self.resource_name
        # The service name goes into a query string, so URL-encode it first;
        # html.escape alone only protects the HTML attribute, not the URL.
        raw_url = (
            self.signoz_trace_url
            or f"http://192.168.0.188:3301/traces?service={quote(service_name, safe='')}"
        )
        signoz_url = html.escape(raw_url, quote=True)
        signoz_block = (
            f"━━━━━━━━━━━━━━━━━━━\n"
            f"{self.signoz_metrics.format()}\n"
            f"🔍 <a href='{signoz_url}'>SignOz Trace</a>\n"
        )

    # Escape user-supplied text to prevent "Can't parse entities"
    safe_resource = html.escape(self.resource_name)
    safe_root_cause = html.escape(self.root_cause)
    safe_action = html.escape(self.suggested_action)
    safe_downtime = html.escape(self.estimated_downtime)
    safe_automation_summary = html.escape(self._automation_status_summary())

    # AI token / cost line
    ai_cost_display = ""
    if self.ai_tokens > 0 or self.ai_cost > 0:
        ai_cost_display = f"💰 Tokens: {self.ai_tokens:,} / ${self.ai_cost:.4f}\n"

    # ADR-037 anomaly-frequency section
    frequency_block = ""
    freq = self.anomaly_frequency
    # "or 0" keeps a None counter from raising on comparison / addition
    if freq and (freq.get("count_24h") or 0) >= 1:
        escalation_level = freq.get("escalation_level")
        escalation_emoji = {
            None: "",
            "REPEAT": "⚠️",
            "ESCALATE": "🔴",
            "PERMANENT_FIX": "🚨",
        }.get(escalation_level, "")

        # Sprint 4 D1: disposition statistics
        auto_r = freq.get("auto_repair_count") or 0
        human_a = freq.get("human_approved_count") or 0
        manual_r = freq.get("manual_resolved_count") or 0
        cold_s = freq.get("cold_start_trust_count") or 0
        total_res = freq.get("total_resolution_count") or 0

        # Disposition breakdown, shown only when resolutions were recorded
        disposition_line = ""
        if total_res > 0:
            auto_total = auto_r + cold_s
            auto_rate = int(auto_total / total_res * 100)
            disposition_line = (
                f"├ 🤖 自動: <code>{auto_total}</code>"
                f" | 👤 審核: <code>{human_a}</code>"
                f" | 🔧 手動: <code>{manual_r}</code>\n"
                f"├ 自動化率: <b>{auto_rate}%</b>\n"
            )

        frequency_block = (
            f"━━━━━━━━━━━━━━━━━━━\n"
            f"📊 <b>頻率統計</b> {escalation_emoji}\n"
            f"├ 1h: <code>{freq.get('count_1h') or 0}</code> 次"
            f" | 24h: <code>{freq.get('count_24h') or 0}</code> 次\n"
            f"{disposition_line}"
            f"└ 累計修復: <code>{auto_r}</code> 次\n"
        )
        if escalation_level:
            # Escaped like every other dynamic value placed inside a tag
            frequency_block += f"🔺 升級: <b>{html.escape(str(escalation_level))}</b>\n"

    # ADR-075 TYPE-3: AI source label. With confidence == 0 the card shows
    # a rule/degraded label instead of "0%".
    if self.confidence > 0 and self.ai_provider:
        # NOTE(review): provider text is inserted unescaped; presumably
        # _provider_display() returns HTML-safe strings — confirm.
        provider_display, model_suffix = self._provider_display()
        ai_source = f"🤖 <b>{provider_display}{model_suffix}</b> {conf_emoji} {confidence_pct}%"
    elif self.confidence > 0:
        ai_source = f"🤖 <b>AI 仲裁</b> {conf_emoji} {confidence_pct}%"
    else:
        ai_source = "⚙️ <b>規則/降級分析</b>"

    # Localized risk level; unknown levels fall back to the raw value
    risk_zh = {
        "CRITICAL": "嚴重",
        "HIGH": "高風險",
        "MEDIUM": "中風險",
        "LOW": "低風險",
    }.get(self.risk_level.upper(), self.risk_level)

    # ADR-076 category label; unknown categories fall back to the raw key
    _category_map = {
        "host": "🖥️ 主機", "host_resource": "🖥️ 主機",
        "k8s": "☸️ K8s", "kubernetes": "☸️ K8s",
        "database": "🗄️ 資料庫",
        "service": "⚙️ 服務",
        "external_site": "🌐 外部網站",
        "secops": "🔐 安全",
        "auto_repair": "🔧 自動修復",
        "alertchain_health": "📡 告警鏈路",
        "flywheel_health": "🔄 飛輪健康",
        "docker": "🐳 Docker",
        "infrastructure": "🏗️ 基礎設施",
    }
    category_line = ""
    if self.alert_category:
        cat_display = html.escape(_category_map.get(self.alert_category, self.alert_category))
        category_line = f"🏷️ 分類:<b>{cat_display}</b>\n"

    # Playbook line
    playbook_line = ""
    if self.playbook_name:
        playbook_line = f"📖 Playbook:<code>{html.escape(self.playbook_name)}</code>\n"
    remediation_evidence_block = self._format_remediation_evidence_block()
    flow_progress_block = self._format_flow_progress_block()
    automation_block = self._format_automation_block()

    # ADR-075 TYPE-3 assembly
    message = (
        f"{self.status_emoji} ACTION REQUIRED | <b>{html.escape(risk_zh)}</b>\n"
        f"──────────────────────\n"
        f"📋 <code>{html.escape(incident_id)}</code>\n"
        f"🎯 資源:<code>{safe_resource}</code>\n"
        f"{category_line}"
        f"🧭 處置狀態:<b>{safe_automation_summary}</b>\n"
        f"{remediation_evidence_block}\n"
        f"{flow_progress_block}\n"
        f"{automation_block}"
        f"\n"
        f"🧠 <b>AI 深度診斷</b>\n"
        f"├─ 分析:{safe_root_cause}\n"
        f"├─ 責任:{resp_display}\n"
        f"└─ {ai_source}\n"
        f"\n"
        f"⚡ <b>建議修復動作</b>\n"
        f"{playbook_line}"
        f"<code>{safe_action}</code>\n"
    )
    if ai_cost_display:
        message += f"{ai_cost_display}"
    if signoz_block:
        message += f"\n{signoz_block}"
    if frequency_block:
        message += f"\n{frequency_block}"
    message += f"\n⏱️ 停機: {safe_downtime}"

    return message[:4096]  # Telegram hard limit
|
||
|
||
def format_with_nemotron(self) -> str:
    """
    Render the card including the Nemotron tool-calling result
    (Phase 22, ADR-044).

    Layout produced by this method:
        <status emoji> <risk level> <incident id>
        <resource>
        <category, when set>

        <remediation evidence><flow progress>
        <automation block>
        <AI source / confidence line>
        👥 <responsibility>
        💡 <root cause>

        <Tool Calling section: model label, up to 3 tool calls,
         validation status, latency — only when Nemotron is enabled and
         returned tools>

        🔧 <playbook, when set><suggested action>
        ⏱️ <estimated downtime>

    Free-text fields are HTML-escaped before interpolation.

    Returns:
        str: the message, sliced to 4096 characters (Telegram hard limit).
        The slice is a plain character cut and is not tag-aware.
    """
    # Owning-team label
    resp_map = {
        "FE": "👨💻 FE (前端)",
        "BE": "⚙️ BE (後端)",
        "INFRA": "🏗️ INFRA (基礎設施)",
        "DB": "🗄️ DB (資料庫)",
        "COLLAB": "🤝 COLLAB (協同處理)",
    }
    resp_display = resp_map.get(self.primary_responsibility, "❓ 未知")

    # Confidence badge
    confidence_pct = int(self.confidence * 100)
    if confidence_pct >= 80:
        conf_emoji = "🟢"
    elif confidence_pct >= 70:
        conf_emoji = "🟡"
    else:
        conf_emoji = "🔴"

    # Incident number (avoid a doubled "INC-" prefix)
    if self.incident_id:
        incident_id = self.incident_id
    elif self.approval_id.startswith("INC-"):
        incident_id = self.approval_id
    else:
        incident_id = f"INC-{self.approval_id[:8].upper()}"

    # HTML escaping of user-supplied text
    safe_resource = html.escape(self.resource_name)
    safe_root_cause = html.escape(self.root_cause)
    safe_action = html.escape(self.suggested_action)
    safe_downtime = html.escape(self.estimated_downtime)

    # AI provider line. A rule match (confidence == 0) shows a check mark
    # instead of a red "0%".
    if self.confidence > 0 and self.ai_provider:
        # NOTE(review): provider text is inserted unescaped; presumably
        # _provider_display() returns HTML-safe strings — confirm.
        provider_display, model_suffix = self._provider_display()
        conf_line = f"🤖 <b>{provider_display} 仲裁</b>{model_suffix} {conf_emoji} {confidence_pct}%"
    elif self.confidence > 0:
        conf_line = f"🤖 <b>OpenClaw 仲裁</b> {conf_emoji} {confidence_pct}%"
    else:
        conf_line = "⚙️ <b>規則匹配</b> ✅"

    # Tool Calling section: which model/backend produced the tool calls
    nemotron_block = ""
    if self.nemotron_enabled and self.nemotron_tools:
        tools_lines = []
        for t in self.nemotron_tools[:3]:  # show at most 3 tool calls
            valid_emoji = "✅" if t.get("valid", False) else "❌"
            tool_name = html.escape(str(t.get("tool", "unknown"))[:20])
            args = t.get("args", {})
            if isinstance(args, dict) and args:
                # Only the first two arguments are listed
                args_str = ", ".join(f"{k}={v}" for k, v in list(args.items())[:2])
            else:
                args_str = str(args)[:30]
            safe_args = html.escape(args_str[:40])
            tools_lines.append(f" {valid_emoji} {tool_name}: {safe_args}")

        tools_str = "\n".join(tools_lines)
        validation_display = html.escape(self.nemotron_validation or "⏳ 驗證中")

        # Tool-calling model / backend label
        if self.nemotron_tool_model and self.nemotron_tool_backend:
            tool_model_label = f"<code>{html.escape(self.nemotron_tool_model)}</code> ({html.escape(self.nemotron_tool_backend)})"
        elif self.nemotron_tool_model:
            tool_model_label = f"<code>{html.escape(self.nemotron_tool_model)}</code>"
        else:
            tool_model_label = "Nemotron"

        latency_line = f"└ 延遲: {self.nemotron_latency_ms:.0f}ms\n" if self.nemotron_latency_ms > 0 else ""

        nemotron_block = (
            f"━━━━━━━━━━━━━━━━━━━\n"
            f"🔧 <b>Tool Calling</b>: {tool_model_label}\n"
            f"{tools_str}\n"
            f"└ 驗證: {validation_display}\n"
            f"{latency_line}"
        )

    # Category label. Kept in sync with the map used by format() so the
    # same alert_category renders identically on both cards; unknown
    # categories fall back to the raw key.
    _category_map = {
        "host": "🖥️ 主機", "host_resource": "🖥️ 主機",
        "k8s": "☸️ K8s", "kubernetes": "☸️ K8s",
        "database": "🗄️ 資料庫", "service": "⚙️ 服務",
        "external_site": "🌐 外部網站", "secops": "🔐 安全",
        "auto_repair": "🔧 自動修復", "docker": "🐳 Docker",
        "alertchain_health": "📡 告警鏈路",
        "flywheel_health": "🔄 飛輪健康",
        "infrastructure": "🏗️ 基礎設施",
    }
    category_line = ""
    if self.alert_category:
        cat_display = html.escape(_category_map.get(self.alert_category, self.alert_category))
        category_line = f"🏷️ {cat_display}\n"
    playbook_line = ""
    if self.playbook_name:
        playbook_line = f"📖 <code>{html.escape(self.playbook_name)}</code>\n"
    remediation_evidence_block = self._format_remediation_evidence_block()
    flow_progress_block = self._format_flow_progress_block()

    # Message assembly
    message = (
        f"{self.status_emoji} <b>{html.escape(self.risk_level)}</b> <code>{html.escape(incident_id)}</code>\n"
        f"<b>{safe_resource}</b>\n"
        f"{category_line}"
        f"\n"
        f"{remediation_evidence_block}"
        f"{flow_progress_block}\n"
        f"{self._format_automation_block()}\n"
        f"{conf_line}\n"
        f"👥 {resp_display}\n"
        f"💡 {safe_root_cause}\n"
    )
    if nemotron_block:
        message += f"\n{nemotron_block}"
    message += (
        f"\n🔧 <b>建議:</b> {playbook_line}{safe_action}\n"
        f"⏱️ 停機: {safe_downtime}"
    )

    return message[:4096]  # Telegram hard limit
|
||
|
||
|
||
# =============================================================================
|
||
# 新訊息模板 (2026-03-29 ogt: ADR-038 Telegram 訊息規範)
|
||
# =============================================================================
|
||
|
||
@dataclass
|
||
class SentryErrorMessage:
|
||
"""
|
||
Sentry 錯誤訊息 (SENTRY_ERROR)
|
||
|
||
2026-03-29 ogt: 新增,用於 Sentry 錯誤通知
|
||
按鈕: [🔍 查看詳情] [🔕 靜默 1h]
|
||
"""
|
||
error_id: str # Sentry Issue ID
|
||
error_type: str # TypeError, ValueError, etc.
|
||
error_message: str # 錯誤訊息 (max 100)
|
||
service_name: str # awoooi-api, awoooi-web, etc.
|
||
file_location: str # src/api/v1/incidents.py:123
|
||
occurrence_count: int = 1 # 發生次數
|
||
affected_users: int = 0 # 影響用戶數
|
||
first_seen: str = "" # 首次發生時間
|
||
stack_trace: list[str] | None = None # Stack trace (前 3 行)
|
||
sentry_url: str = "" # Sentry 連結
|
||
|
||
def format(self) -> str:
|
||
"""格式化為 Telegram HTML"""
|
||
safe_error = html.escape(self.error_message[:80])
|
||
safe_type = html.escape(self.error_type[:30])
|
||
safe_service = html.escape(self.service_name[:25])
|
||
safe_file = html.escape(self.file_location[:50])
|
||
|
||
# Stack trace 區塊
|
||
trace_block = ""
|
||
if self.stack_trace:
|
||
trace_lines = "\n".join(f" → {html.escape(line[:50])}" for line in self.stack_trace[:3])
|
||
trace_block = f"🔗 Stack Trace (前 3 行):\n{trace_lines}\n"
|
||
|
||
# Sentry URL
|
||
sentry_link = ""
|
||
if self.sentry_url:
|
||
safe_url = html.escape(self.sentry_url, quote=True)
|
||
sentry_link = f"\n🔍 <a href='{safe_url}'>查看 Sentry</a>"
|
||
|
||
message = (
|
||
f"═══════════════════════════\n"
|
||
f"🐛 <b>SENTRY ERROR</b> | {safe_service}\n"
|
||
f"═══════════════════════════\n"
|
||
f"📋 <code>{html.escape(self.error_id)}</code>\n"
|
||
f"🎯 錯誤: <code>{safe_type}</code>\n"
|
||
f"━━━━━━━━━━━━━━━━━━━\n"
|
||
f"💬 {safe_error}\n"
|
||
f"━━━━━━━━━━━━━━━━━━━\n"
|
||
f"📊 <b>統計</b>\n"
|
||
f"├ 發生次數: <code>{self.occurrence_count}</code>\n"
|
||
f"├ 影響用戶: <code>{self.affected_users}</code>\n"
|
||
f"└ 首次發生: {html.escape(self.first_seen) if self.first_seen else 'N/A'}\n"
|
||
f"━━━━━━━━━━━━━━━━━━━\n"
|
||
f"📍 位置: <code>{safe_file}</code>\n"
|
||
f"{trace_block}"
|
||
f"{sentry_link}"
|
||
)
|
||
|
||
return message[:900]
|
||
|
||
|
||
@dataclass
class ResourceWarnMessage:
    """
    Resource warning notification (RESOURCE_WARN).

    Added 2026-03-29 (ogt) for resource-exhaustion warnings.
    Buttons: [⚡ auto scale] [🔕 silence 1h]
    """
    resource_id: str               # RES-YYYYMMDD-XXXX
    pod_name: str                  # pod name
    namespace: str = "default"     # K8s namespace
    cpu_percent: float = 0.0       # CPU usage in percent
    cpu_limit: str = ""            # CPU limit (e.g. 500m)
    memory_percent: float = 0.0    # memory usage in percent
    memory_limit: str = ""         # memory limit (e.g. 512Mi)
    disk_percent: float = 0.0      # disk usage in percent
    trend_info: str = ""           # trend information
    suggestion: str = ""           # suggested action

    @staticmethod
    def _usage_emoji(percent: float) -> str:
        """Map a usage percentage to a traffic-light emoji (>=90 red, >=70 yellow)."""
        if percent >= 90:
            return "🔴"
        if percent >= 70:
            return "🟡"
        return "🟢"

    def format(self) -> str:
        """
        Render the warning as Telegram HTML, sliced to 900 characters.

        Every free-text field, including the CPU/memory limit strings, is
        HTML-escaped so a stray ``<`` or ``&`` cannot break Telegram's
        entity parser.
        """
        safe_pod = html.escape(self.pod_name[:35])
        safe_ns = html.escape(self.namespace[:20])

        cpu_emoji = self._usage_emoji(self.cpu_percent)
        mem_emoji = self._usage_emoji(self.memory_percent)
        disk_emoji = self._usage_emoji(self.disk_percent)

        # Limit suffixes are optional; escape them like every other field
        cpu_limit = f" (limit: {html.escape(self.cpu_limit)})" if self.cpu_limit else ""
        memory_limit = f" (limit: {html.escape(self.memory_limit)})" if self.memory_limit else ""

        # Trend and suggestion lines are shown only when provided
        trend_block = ""
        if self.trend_info:
            trend_block = f"📈 趨勢: {html.escape(self.trend_info[:50])}\n"

        suggestion_block = ""
        if self.suggestion:
            suggestion_block = f"💡 建議: {html.escape(self.suggestion)}\n"

        message = (
            f"═══════════════════════════\n"
            f"⚠️ <b>資源告警</b> | {safe_ns}\n"
            f"═══════════════════════════\n"
            f"📋 <code>{html.escape(self.resource_id)}</code>\n"
            f"🎯 Pod: <code>{safe_pod}</code>\n"
            f"━━━━━━━━━━━━━━━━━━━\n"
            f"📊 <b>資源使用率</b>\n"
            f"├ CPU: {cpu_emoji} <code>{self.cpu_percent:.1f}%</code>"
            f"{cpu_limit}\n"
            f"├ Memory: {mem_emoji} <code>{self.memory_percent:.1f}%</code>"
            f"{memory_limit}\n"
            f"└ Disk: {disk_emoji} <code>{self.disk_percent:.1f}%</code>\n"
            f"━━━━━━━━━━━━━━━━━━━\n"
            f"{trend_block}"
            f"{suggestion_block}"
        )

        return message[:900]
|
||
|
||
|
||
@dataclass
|
||
class RepairReportMessage:
|
||
"""
|
||
自動修復報告訊息 (REPAIR_REPORT)
|
||
|
||
2026-03-29 ogt: 新增,用於每日自動修復彙總
|
||
按鈕: 無
|
||
"""
|
||
report_date: str # 報告日期 (YYYY-MM-DD)
|
||
total_repairs: int = 0 # 總修復次數
|
||
success_count: int = 0 # 成功次數
|
||
failure_count: int = 0 # 失敗次數
|
||
saved_minutes: int = 0 # 節省人工時間 (分鐘)
|
||
top_issues: list[tuple[str, int]] | None = None # Top 問題 [(name, count)]
|
||
ai_cost_gemini: float = 0.0 # Gemini 成本
|
||
ai_cost_nvidia: float = 0.0 # NVIDIA 成本 (免費)
|
||
ai_tokens_total: int = 0 # 總 Token 數
|
||
|
||
def format(self) -> str:
|
||
"""格式化為 Telegram HTML"""
|
||
# 成功率
|
||
success_rate = (self.success_count / self.total_repairs * 100) if self.total_repairs > 0 else 0
|
||
|
||
# Top 問題區塊
|
||
issues_block = ""
|
||
if self.top_issues:
|
||
issues_lines = "\n".join(
|
||
f" {i+1}. {html.escape(name[:30])} ({count} 次)"
|
||
for i, (name, count) in enumerate(self.top_issues[:3])
|
||
)
|
||
issues_block = f"━━━━━━━━━━━━━━━━━━━\n🔝 <b>Top 3 問題</b>:\n{issues_lines}\n"
|
||
|
||
# AI 成本
|
||
total_cost = self.ai_cost_gemini + self.ai_cost_nvidia
|
||
|
||
message = (
|
||
f"═══════════════════════════\n"
|
||
f"🔧 <b>自動修復報告</b> | 每日彙總\n"
|
||
f"═══════════════════════════\n"
|
||
f"📅 {html.escape(self.report_date)}\n"
|
||
f"━━━━━━━━━━━━━━━━━━━\n"
|
||
f"📊 <b>統計</b>\n"
|
||
f"├ 總修復次數: <code>{self.total_repairs}</code>\n"
|
||
f"├ 成功: ✅ <code>{self.success_count}</code> ({success_rate:.0f}%)\n"
|
||
f"├ 失敗: ❌ <code>{self.failure_count}</code>\n"
|
||
f"└ 節省人工: ~<code>{self.saved_minutes}</code> 分鐘\n"
|
||
f"{issues_block}"
|
||
f"━━━━━━━━━━━━━━━━━━━\n"
|
||
f"💰 <b>AI 成本</b>\n"
|
||
f"├ Gemini: ${self.ai_cost_gemini:.4f} ({self.ai_tokens_total:,} tokens)\n"
|
||
f"├ NVIDIA: ${self.ai_cost_nvidia:.4f} (免費)\n"
|
||
f"└ 總計: ${total_cost:.4f}"
|
||
)
|
||
|
||
return message[:900]
|
||
|
||
|
||
@dataclass
class DailySummaryMessage:
    """
    Daily summary notification (DAILY_SUMMARY).

    Added 2026-03-29 (ogt) for the daily system-status digest.
    Buttons: none
    """
    summary_date: str                  # summary date (YYYY-MM-DD)
    # Alert statistics
    alert_total: int = 0
    alert_critical: int = 0
    alert_medium: int = 0
    alert_low: int = 0
    # Handling statistics
    auto_repair_count: int = 0
    manual_approval_count: int = 0
    ignored_count: int = 0
    avg_response_minutes: float = 0.0
    # Availability
    api_availability: float = 99.9
    web_availability: float = 99.9
    worker_availability: float = 99.9
    # Cost
    ai_cost: float = 0.0
    cloud_cost: float = 0.0
    budget_remaining: float = 0.0

    def format(self) -> str:
        """Render the summary as Telegram HTML, sliced to 900 characters."""
        heavy_rule = "═══════════════════════════\n"
        light_rule = "━━━━━━━━━━━━━━━━━━━\n"

        handled = self.auto_repair_count + self.manual_approval_count + self.ignored_count

        def share(count):
            # Percentage of all handled alerts; 0 when nothing was handled
            return (count / handled * 100) if handled > 0 else 0

        sections = [
            heavy_rule,
            "📊 <b>每日摘要</b> | AWOOOI\n",
            heavy_rule,
            f"📅 {html.escape(self.summary_date)}\n",
            light_rule,
            "🚨 <b>告警統計</b>\n",
            f"├ 總數: <code>{self.alert_total}</code>\n",
            f"├ Critical: <code>{self.alert_critical}</code>\n",
            f"├ Medium: <code>{self.alert_medium}</code>\n",
            f"└ Low: <code>{self.alert_low}</code>\n",
            light_rule,
            "✅ <b>處理統計</b>\n",
            f"├ 自動修復: <code>{self.auto_repair_count}</code> ({share(self.auto_repair_count):.0f}%)\n",
            f"├ 人工簽核: <code>{self.manual_approval_count}</code> ({share(self.manual_approval_count):.0f}%)\n",
            f"├ 忽略/靜默: <code>{self.ignored_count}</code> ({share(self.ignored_count):.0f}%)\n",
            f"└ 平均回應: <code>{self.avg_response_minutes:.1f}</code> 分鐘\n",
            light_rule,
            "📈 <b>可用性</b>\n",
            f"├ API: <code>{self.api_availability:.2f}%</code>\n",
            f"├ Web: <code>{self.web_availability:.2f}%</code>\n",
            f"└ Worker: <code>{self.worker_availability:.2f}%</code>\n",
            light_rule,
            "💰 <b>成本</b>\n",
            f"├ AI: ${self.ai_cost:.2f}\n",
            f"├ 雲端: ${self.cloud_cost:.2f}\n",
            f"└ 預算剩餘: ${self.budget_remaining:.2f}",
        ]
        return "".join(sections)[:900]
|
||
|
||
|
||
@dataclass
class CICDProgressMessage:
    """
    CI/CD progress notification (CICD_PROGRESS).

    Added 2026-03-30 (ogt) for CI/CD pipeline progress updates.
    Traits: concise, bypasses AI arbitration, no buttons.
    """
    job_name: str                  # job name (e.g. Build, Test, Deploy)
    status: str                    # running, success, failed (pending is also mapped)
    stage: str = ""                # pipeline stage (e.g. build, test, deploy)
    commit_sha: str = ""           # Git commit SHA
    triggered_by: str = ""         # who triggered the run (not rendered by format())
    duration_seconds: int = 0      # run time in seconds
    message: str = ""              # extra free-text message
    workflow_url: str = ""         # link to the workflow run

    def format(self) -> str:
        """Render the compact progress line(s) as Telegram HTML, sliced to 900 characters."""
        # Status icon; unknown statuses fall back to the package icon
        icons = {
            "running": "🔄",
            "success": "✅",
            "failed": "❌",
            "pending": "⏳",
        }
        status_emoji = icons.get(self.status.lower(), "📦")

        job = html.escape(self.job_name[:40])
        stage = html.escape(self.stage[:20]) if self.stage else ""
        stage_label = f" | {stage}" if stage else ""

        # Duration is shown only for positive values; minutes only when non-zero
        duration_str = ""
        if self.duration_seconds > 0:
            minutes, seconds = self.duration_seconds // 60, self.duration_seconds % 60
            if minutes > 0:
                duration_str = f" ({minutes}m {seconds}s)"
            else:
                duration_str = f" ({seconds}s)"

        pieces = [
            f"{status_emoji} <b>[AWOOOI CI/CD]</b>{stage_label}\n",
            f"📦 {job}{duration_str}",
        ]
        # Optional trailing lines, in fixed order: commit, detail, workflow link
        if self.commit_sha:
            pieces.append(f"\n📋 <code>{html.escape(self.commit_sha[:8])}</code>")
        if self.message:
            pieces.append(f"\n📝 {html.escape(self.message[:240])}")
        if self.workflow_url:
            href = html.escape(self.workflow_url, quote=True)
            pieces.append(f"\n🔗 <a href='{href}'>Workflow</a>")

        return "".join(pieces)[:900]
|
||
|
||
|
||
@dataclass
class DeploySuccessMessage:
    """
    Deployment success notification (DEPLOY_SUCCESS).

    Added 2026-03-29 (ogt) for CD deployment-success alerts.
    Buttons: none
    """
    commit_sha: str                     # Git commit SHA (first 8 chars are rendered)
    triggered_by: str                   # who triggered the deployment
    environment: str = "Production"     # target environment
    # Version information
    api_version: str = ""
    web_version: str = ""
    worker_version: str = ""
    # Deployment duration
    duration_seconds: int = 0
    # Test results
    e2e_passed: int = 0
    e2e_total: int = 0
    health_check_passed: bool = True
    # Links
    workflow_url: str = ""

    def format(self) -> str:
        """Render the notification as Telegram HTML, sliced to 900 characters."""
        light_rule = "━━━━━━━━━━━━━━━━━━━\n"

        def version(value: str) -> str:
            # Missing versions are shown as N/A
            return html.escape(value) if value else "N/A"

        # Duration: minutes are shown only when non-zero
        minutes, seconds = self.duration_seconds // 60, self.duration_seconds % 60
        if minutes > 0:
            duration_str = f"{minutes}m {seconds}s"
        else:
            duration_str = f"{seconds}s"

        # Test results
        if self.e2e_passed == self.e2e_total:
            e2e_status = "✅"
        else:
            e2e_status = "⚠️"
        if self.health_check_passed:
            health_status = "✅ 全部通過"
        else:
            health_status = "❌ 部分失敗"

        lines = [
            f"✅ <b>部署成功</b> | {html.escape(self.environment[:15])}\n\n",
            f"📋 Commit: <code>{html.escape(self.commit_sha[:8])}</code>\n",
            f"👤 觸發者: @{html.escape(self.triggered_by[:20])}\n",
            light_rule,
            "📊 <b>部署詳情</b>\n",
            f"├ API: {version(self.api_version)} ✅\n",
            f"├ Web: {version(self.web_version)} ✅\n",
            f"├ Worker: {version(self.worker_version)} ✅\n",
            f"└ 耗時: {duration_str}\n",
            light_rule,
            f"🧪 E2E 測試: {e2e_status} {self.e2e_passed}/{self.e2e_total} PASSED\n",
            f"📊 健康檢查: {health_status}",
        ]
        # Optional workflow link
        if self.workflow_url:
            href = html.escape(self.workflow_url, quote=True)
            lines.append(f"\n🔗 <a href='{href}'>查看 Workflow</a>")

        return "".join(lines)[:900]
|
||
|
||
|
||
@dataclass
|
||
class RateLimitMessage:
|
||
"""
|
||
API 限額警告訊息 (RATE_LIMIT)
|
||
|
||
2026-03-29 ogt: 新增,用於 AI API 限額警告
|
||
按鈕: 無
|
||
"""
|
||
provider: str # gemini, openai, etc.
|
||
# 用量統計
|
||
daily_usage: int = 0
|
||
daily_limit: int = 0
|
||
token_usage: int = 0
|
||
token_limit: int = 0
|
||
cost_usd: float = 0.0
|
||
# 建議
|
||
suggestions: list[str] | None = None
|
||
# 重置時間
|
||
reset_time: str = ""
|
||
|
||
def format(self) -> str:
|
||
"""格式化為 Telegram HTML"""
|
||
safe_provider = html.escape(self.provider.upper()[:15])
|
||
|
||
# 使用率百分比
|
||
usage_pct = (self.daily_usage / self.daily_limit * 100) if self.daily_limit > 0 else 0
|
||
token_pct = (self.token_usage / self.token_limit * 100) if self.token_limit > 0 else 0
|
||
|
||
# 建議區塊
|
||
suggestion_block = ""
|
||
if self.suggestions:
|
||
suggestion_lines = "\n".join(f" - {html.escape(s[:50])}" for s in self.suggestions[:3])
|
||
suggestion_block = f"━━━━━━━━━━━━━━━━━━━\n💡 <b>建議</b>:\n{suggestion_lines}\n"
|
||
|
||
# 重置時間
|
||
reset_block = ""
|
||
if self.reset_time:
|
||
reset_block = f"\n🔄 將於 {html.escape(self.reset_time)} 重置"
|
||
|
||
message = (
|
||
f"⚠️ <b>API 限額警告</b>\n\n"
|
||
f"━━━━━━━━━━━━━━━━━━━\n"
|
||
f"📊 <b>{safe_provider} API</b>\n"
|
||
f"├ 今日用量: <code>{self.daily_usage}/{self.daily_limit}</code> ({usage_pct:.0f}%)\n"
|
||
f"├ Token: <code>{self.token_usage:,}/{self.token_limit:,}</code> ({token_pct:.0f}%)\n"
|
||
f"└ 成本: ${self.cost_usd:.4f}\n"
|
||
f"{suggestion_block}"
|
||
f"{reset_block}"
|
||
)
|
||
|
||
return message[:900]
|
||
|
||
|
||
@dataclass
class K3sStatusMessage:
    """
    K3s cluster status report notification (K3S_STATUS).

    Added 2026-03-31 (Claude Code), Phase 21.2 periodic reports.
    Used for the daily K3s health push.
    Buttons: none
    """
    report_date: str                   # report date (YYYY-MM-DD HH:MM)
    # Node status
    node_total: int = 0
    node_ready: int = 0
    # Pod status
    pod_total: int = 0
    pod_running: int = 0
    pod_pending: int = 0
    pod_failed: int = 0
    # HPA status
    hpa_api_replicas: str = "2/6"
    hpa_web_replicas: str = "2/6"
    hpa_worker_replicas: str = "1/3"
    # Backup status
    etcd_backup_last: str = ""
    velero_backup_last: str = ""
    # Stability indicators
    alert_count_48h: int = 0
    pod_restart_48h: int = 0
    # Version information
    k3s_version: str = ""

    def format(self) -> str:
        """Render the report as Telegram HTML, sliced to 900 characters."""
        heavy_rule = "═══════════════════════════\n"
        light_rule = "━━━━━━━━━━━━━━━━━━━\n"

        def badge(healthy: bool) -> str:
            # Check mark when the condition holds, warning sign otherwise
            return "✅" if healthy else "⚠️"

        def backup(stamp: str) -> str:
            # Last-backup text cut to 20 chars; N/A when unknown
            return html.escape(stamp[:20]) if stamp else "N/A"

        node_health = badge(self.node_ready == self.node_total)
        pod_health = badge(self.pod_failed == 0 and self.pod_pending == 0)
        stability = badge(self.alert_count_48h == 0 and self.pod_restart_48h == 0)

        rows = [
            heavy_rule,
            "🎛️ <b>K3s 叢集狀態</b> | Daily\n",
            heavy_rule,
            f"📅 {html.escape(self.report_date)}\n",
            light_rule,
            f"{node_health} <b>節點</b>: {self.node_ready}/{self.node_total} Ready\n",
            light_rule,
            f"{pod_health} <b>Pod 狀態</b>\n",
            f"├ Running: <code>{self.pod_running}</code>\n",
            f"├ Pending: <code>{self.pod_pending}</code>\n",
            f"└ Failed: <code>{self.pod_failed}</code>\n",
            light_rule,
            "📊 <b>HPA 副本數</b>\n",
            f"├ API: <code>{html.escape(self.hpa_api_replicas)}</code>\n",
            f"├ Web: <code>{html.escape(self.hpa_web_replicas)}</code>\n",
            f"└ Worker: <code>{html.escape(self.hpa_worker_replicas)}</code>\n",
            light_rule,
            "💾 <b>備份</b>\n",
            f"├ etcd: {backup(self.etcd_backup_last)}\n",
            f"└ Velero: {backup(self.velero_backup_last)}\n",
            light_rule,
            f"{stability} <b>48h 穩定度</b>\n",
            f"├ 告警: <code>{self.alert_count_48h}</code>\n",
            f"└ Pod 重啟: <code>{self.pod_restart_48h}</code>",
        ]
        return "".join(rows)[:900]
|
||
|
||
|
||
@dataclass
class WeeklyReportMessage:
    """
    Weekly report notification (WEEKLY_REPORT).

    Added 2026-03-31 (Claude Code), Phase 21.3 periodic reports.
    Sent every Friday at 18:00 Taipei time.
    Buttons: none
    """
    week_range: str                    # week label (e.g. "2026-W14")
    report_date: str                   # report date/time
    # Alert statistics
    alert_total: int = 0
    alert_critical: int = 0
    alert_resolved: int = 0
    resolved_rate: float = 0.0
    # AI performance
    ai_proposal_count: int = 0
    ai_executed_count: int = 0
    ai_success_rate: float = 0.0
    avg_response_minutes: float = 0.0
    # K3s health
    k3s_uptime_pct: float = 99.9
    pod_restart_total: int = 0
    hpa_scale_events: int = 0
    # Git activity
    commits_count: int = 0
    deploy_count: int = 0
    # Cost
    ai_cost_week: float = 0.0
    ai_tokens_week: int = 0
    # Sprint 4 F1 (2026-04-07): disposition breakdown
    disposition_auto: int = 0
    disposition_human: int = 0
    disposition_manual: int = 0
    disposition_cold_start: int = 0
    disposition_total: int = 0

    def format(self) -> str:
        """Render the weekly report as Telegram HTML, sliced to 1200 characters."""
        heavy_rule = "═══════════════════════════\n"
        light_rule = "━━━━━━━━━━━━━━━━━━━\n"

        def badge(healthy: bool) -> str:
            # Check mark when the threshold is met, warning sign otherwise
            return "✅" if healthy else "⚠️"

        alert_health = badge(self.resolved_rate >= 80)
        ai_health = badge(self.ai_success_rate >= 70)
        k3s_health = badge(self.k3s_uptime_pct >= 99)

        rows = [
            heavy_rule,
            "📊 <b>AWOOOI 週報</b>\n",
            heavy_rule,
            f"📅 {html.escape(self.week_range)} | {html.escape(self.report_date)}\n",
            light_rule,
            f"{alert_health} <b>告警統計</b>\n",
            f"├ 總數: <code>{self.alert_total}</code>\n",
            f"├ Critical: <code>{self.alert_critical}</code>\n",
            f"├ 已解決: <code>{self.alert_resolved}</code>\n",
            f"└ 解決率: <code>{self.resolved_rate:.1f}%</code>\n",
            light_rule,
            f"{ai_health} <b>AI 效能</b>\n",
            f"├ 提案數: <code>{self.ai_proposal_count}</code>\n",
            f"├ 執行數: <code>{self.ai_executed_count}</code>\n",
            f"├ 成功率: <code>{self.ai_success_rate:.1f}%</code>\n",
            f"└ 平均回應: <code>{self.avg_response_minutes:.1f}</code> 分鐘\n",
            light_rule,
            f"{k3s_health} <b>K3s 健康</b>\n",
            f"├ Uptime: <code>{self.k3s_uptime_pct:.2f}%</code>\n",
            f"├ Pod 重啟: <code>{self.pod_restart_total}</code>\n",
            f"└ HPA 擴縮: <code>{self.hpa_scale_events}</code> 次\n",
            light_rule,
            "📦 <b>開發活動</b>\n",
            f"├ Commits: <code>{self.commits_count}</code>\n",
            f"└ 部署: <code>{self.deploy_count}</code> 次\n",
            light_rule,
            "💰 <b>AI 成本</b>\n",
            f"├ 費用: $<code>{self.ai_cost_week:.2f}</code>\n",
            f"└ Tokens: <code>{self.ai_tokens_week:,}</code>\n",
        ]

        # Sprint 4 F1: disposition breakdown, appended only when data exists.
        # The automation rate counts auto-repair plus cold-start-trust cases.
        if self.disposition_total > 0:
            automated = self.disposition_auto + self.disposition_cold_start
            auto_rate = int(automated / self.disposition_total * 100)
            rows.extend([
                light_rule,
                "📋 <b>處置分佈</b>\n",
                f"├ 🤖 自動修復: <code>{self.disposition_auto}</code>\n",
                f"├ ❄️ 冷啟動信任: <code>{self.disposition_cold_start}</code>\n",
                f"├ 👤 人工審核: <code>{self.disposition_human}</code>\n",
                f"├ 🔧 手動處理: <code>{self.disposition_manual}</code>\n",
                f"└ 自動化率: <b>{auto_rate}%</b>",
            ])

        return "".join(rows)[:1200]
|
||
|
||
|
||
@dataclass
class InfraAlertMessage:
    """
    Infrastructure anomaly notification (INFRA_ALERT).

    Added 2026-04-03 (ogt) as the standard alert format for infrastructure
    anomalies such as Nemotron/NIM.
    Purpose: non-incident notifications about system components
    (AI provider, DB, external APIs, ...).
    Buttons: none (informational alert)
    """
    component: str              # component name (e.g. "Nemotron NIM")
    status: str                 # status description (e.g. "⚠️ 超時 (>25s)")
    impact: str                 # impact description
    auto_fixed: bool = False    # whether the issue was auto-repaired
    fix_action: str = ""        # repair action; rendered in the footer when note is empty
    note: str = ""              # extra note; when set it replaces the repair footer

    def format(self) -> str:
        """Render the alert as Telegram HTML, sliced to 900 characters."""
        rule = "━━━━━━━━━━━━━━━━━━━\n"

        # Footer precedence: informational note > auto-fixed > needs attention
        if self.note:
            footer = f"{rule}💡 {html.escape(self.note)}\n"
        elif self.auto_fixed:
            footer = f"{rule}✅ <b>已自動修復</b>\n└ {html.escape(self.fix_action)}\n"
        else:
            action = self.fix_action or '請確認元件狀態'
            footer = f"{rule}⚠️ <b>需要關注</b>\n└ {html.escape(action)}\n"

        body = "".join([
            "🚨 <b>基礎設施異常</b>\n",
            rule,
            f"⚙️ <b>{html.escape(self.component)}</b>: {html.escape(self.status)}\n",
            f"📛 影響: {html.escape(self.impact)}\n",
            footer,
        ])
        return body[:900]
|
||
|
||
|
||
# =============================================================================
|
||
# Risk Level Emoji Mapping
|
||
# =============================================================================
|
||
|
||
# Lower-case risk level -> emoji shown for that level.
RISK_EMOJI_MAP = dict(
    critical="🚨",
    high="🔴",
    medium="⚠️",
    low="ℹ️",
)
|
||
|
||
|
||
# =============================================================================
|
||
# ADR-071-B: 告警通知四類型分類器 (2026-04-11 Claude Sonnet 4.6)
|
||
# =============================================================================
|
||
|
||
from enum import Enum
|
||
|
||
class NotificationType(str, Enum):
    """Category of Telegram notification card; each member's value is its wire tag."""

    TYPE_1 = "TYPE-1"  # purely informational, no buttons
    TYPE_2 = "TYPE-2"  # already auto-repaired
    TYPE_3 = "TYPE-3"  # needs human review (default)
    TYPE_4 = "TYPE-4"  # AI could not decide
    TYPE_4_DRIFT = "TYPE-4D"  # dedicated to Config Drift
    TYPE_8M = "TYPE-8M"  # ADR-075: health of the flywheel / alert chain itself (meta-system)
|
||
|
||
|
||
def classify_notification(
    incident,
    confidence: float,
    auto_executed: bool,
    mcp_all_failed: bool = False,
    decision_state: str = "",
) -> NotificationType:
    """Decide which kind of Telegram card an incident should produce.

    Precedence: a ``notification_type`` already present on the incident
    (TYPE-4D / TYPE-8M / TYPE-1) wins; after that
    TYPE-4D > TYPE-1 > TYPE-2 > TYPE-4 > TYPE-3 (default).

    Args:
        incident: object exposing ``signals`` (the first signal's ``labels``,
            ``annotations`` and ``alert_name`` are read) and optionally
            ``notification_type``.
        confidence: AI decision confidence (0.0-1.0).
        auto_executed: whether an automatic repair has been executed.
        mcp_all_failed: whether every MCP provider failed.
        decision_state: decision state string ("COMPLETED" / "ERROR" / ...).

    Returns:
        The NotificationType to use for the card.
    """
    # ADR-073 Phase 3-1: honour a type assigned before this call so those
    # alerts keep their classification regardless of the analysis inputs.
    preset = getattr(incident, "notification_type", None)
    for tag, preset_kind in (
        ("TYPE-4D", NotificationType.TYPE_4_DRIFT),
        ("TYPE-8M", NotificationType.TYPE_8M),
        ("TYPE-1", NotificationType.TYPE_1),
    ):
        # Plain equality (not a dict lookup) so a str-Enum value also matches.
        if preset == tag:
            return preset_kind

    has_signal = bool(incident.signals)
    first_signal = incident.signals[0] if has_signal else None
    labels = first_signal.labels if has_signal else {}
    alertname = labels.get("alertname", "")
    label_severity = labels.get("severity", "")

    # TYPE-4D: dedicated to Config Drift (highest priority).
    if alertname in ("ConfigDrift", "ConfigurationDrift", "KubeConfigDrift"):
        return NotificationType.TYPE_4_DRIFT

    # TYPE-1: purely informational. The searchable text is built from the
    # alertname plus the first signal's summary / description / alert_name.
    if has_signal:
        annotations = first_signal.annotations
        signal_summary = (
            annotations.get("summary", "")
            or annotations.get("description", "")
            or first_signal.alert_name
        )
    else:
        signal_summary = ""
    title_lower = (alertname + " " + signal_summary).lower()

    success_keywords = ["success", "完成", "completed"]
    if (
        (label_severity == "info" and any(kw in title_lower for kw in success_keywords))
        or (alertname.startswith(("Backup.", "VeleroBackup")) and label_severity == "info")
        or alertname in ("AlertChainHealthy", "AutoRepairHighSuccessRate")
    ):
        return NotificationType.TYPE_1

    # TYPE-2: automatic repair finished.
    if auto_executed and decision_state == "COMPLETED":
        return NotificationType.TYPE_2

    # TYPE-4: AI could not decide (low confidence / all MCP failed / error).
    if confidence < 0.5 or mcp_all_failed or decision_state == "ERROR":
        return NotificationType.TYPE_4

    # TYPE-3: default, needs human review.
    return NotificationType.TYPE_3
|
||
|
||
|
||
# =============================================================================
|
||
# Telegram Gateway
|
||
# =============================================================================
|
||
|
||
class TelegramGatewayError(Exception):
    """Raised when a Telegram Gateway operation fails."""
|
||
|
||
|
||
class TelegramGateway:
|
||
"""
|
||
Telegram Gateway - 行動戰情室 + SignOz 整合
|
||
|
||
職責:
|
||
1. 推送待簽核卡片到 Telegram (含 SignOz 指標)
|
||
2. 接收並驗證簽核/調優回調
|
||
3. Shadow Mode 調優執行 (僅日誌)
|
||
4. 遵守 SOUL.md 訊息壓縮原則
|
||
"""
|
||
|
||
TELEGRAM_API_BASE = "https://api.telegram.org"
|
||
|
||
def __init__(self):
    """Set up gateway state; no network I/O happens until initialize()."""
    # Core lifecycle state.
    self._http_client: httpx.AsyncClient | None = None
    self._security = get_security_interceptor()
    self._initialized = False

    # Long-polling state.
    self._polling_active = False
    self._polling_task: asyncio.Task | None = None
    self._last_update_id = 0

    # Distributed leader election (avoids two pods fighting with HTTP 409).
    # POD_NAME identifies this pod; a random hex id is used when it is unset.
    random_pod_id = os.urandom(8).hex()
    self._pod_id = os.environ.get("POD_NAME", random_pod_id)
    self._leader_task: asyncio.Task | None = None

    # Heartbeat monitoring (guards against silent blind spots).
    self._last_message_time: datetime | None = None
    self._heartbeat_task: asyncio.Task | None = None
    self._heartbeat_active = False
|
||
|
||
async def initialize(self) -> bool:
    """Initialize the gateway.

    Returns:
        False when the bot token or every chat ID is missing from settings
        (the gateway stays disabled); True once the HTTP client exists and
        the security interceptor has been initialized.
    """
    if not settings.OPENCLAW_TG_BOT_TOKEN:
        logger.warning("telegram_gateway_disabled", reason="Bot token not configured")
        return False

    if not settings.OPENCLAW_TG_CHAT_ID and not settings.SRE_GROUP_CHAT_ID:
        logger.warning("telegram_gateway_disabled", reason="No Telegram chat ID configured")
        return False

    # Reuse a live client instead of overwriting it: replacing an open
    # AsyncClient without closing it leaks its connection pool when
    # initialize() runs more than once (e.g. a retry after the security
    # interceptor failed to initialize).
    if self._http_client is None or self._http_client.is_closed:
        # Timeouts are set per phase: connect=10s, read=50s. The read
        # timeout must exceed the 40s getUpdates long-polling timeout,
        # otherwise the client aborts every poll before Telegram answers.
        self._http_client = httpx.AsyncClient(
            timeout=httpx.Timeout(connect=10.0, read=50.0, write=10.0, pool=10.0),
            headers={"Content-Type": "application/json"},
        )

    await self._security.initialize()
    self._initialized = True

    logger.info("telegram_gateway_initialized")
    return True
|
||
|
||
@property
def bot_token(self) -> str:
    """Bot token read from settings on every access."""
    token = settings.OPENCLAW_TG_BOT_TOKEN
    return token
|
||
|
||
@property
def chat_id(self) -> str:
    """Chat ID read from settings on every access."""
    configured_chat = settings.OPENCLAW_TG_CHAT_ID
    return configured_chat
|
||
|
||
@property
def alert_chat_id(self) -> str:
    """Recipient for alert messages: the SRE group first, else the personal chat."""
    group_chat = settings.SRE_GROUP_CHAT_ID
    if group_chat:
        return group_chat
    return settings.OPENCLAW_TG_CHAT_ID
|
||
|
||
@property
def api_url(self) -> str:
    """Base URL of the Telegram Bot API for this bot (includes the token)."""
    base = self.TELEGRAM_API_BASE
    token = self.bot_token
    return f"{base}/bot{token}"
|
||
|
||
async def _send_request(
    self,
    method: str,
    payload: dict,
) -> dict:
    """Send one Telegram Bot API request inside an OTEL span.

    The gateway is initialized lazily, the payload may be turned into a
    reply to the incident's root card, and a successfully sent message is
    mirrored through ``_mirror_outbound_message``.

    Args:
        method: API method (sendMessage, editMessageText, etc.).
        payload: request payload; may be mutated before sending.

    Returns:
        dict: the decoded API response.

    Raises:
        TelegramGatewayError: when the client is missing, the HTTP status is
            an error, Telegram answers ``ok=false``, or anything else fails.
    """
    if not self._initialized:
        await self.initialize()

    if not self._http_client:
        raise TelegramGatewayError("HTTP client not initialized")

    await self._attach_incident_thread_reply(method, payload)

    endpoint = f"{self.api_url}/{method}"
    span_attributes = {
        "telegram.method": method,
        "telegram.chat_id": str(payload.get("chat_id", "")),
        "telegram.has_reply_markup": "reply_markup" in payload,
    }

    # OTEL span: telegram.api.{method}
    with _tracer.start_as_current_span(
        f"telegram.api.{method}",
        attributes=span_attributes,
    ) as span:
        try:
            response = await self._http_client.post(endpoint, json=payload)
            response.raise_for_status()
            body = response.json()

            if not body.get("ok"):
                span.set_attribute("telegram.error", body.get("description", "Unknown"))
                span.set_status(trace.Status(trace.StatusCode.ERROR))
                raise TelegramGatewayError(
                    f"Telegram API error: {body.get('description', 'Unknown error')}"
                )

            # "result" may be a dict or a bool depending on the method, so
            # only a dict carrying message_id is recorded and mirrored.
            sent = body.get("result")
            if isinstance(sent, dict) and "message_id" in sent:
                sent_message_id = sent["message_id"]
                span.set_attribute("telegram.message_id", sent_message_id)
                await self._mirror_outbound_message(
                    method=method,
                    payload=payload,
                    provider_message_id=str(sent_message_id),
                )

            span.set_status(trace.Status(trace.StatusCode.OK))
            return body

        except TelegramGatewayError:
            # Already a gateway error: propagate unchanged.
            raise

        except httpx.HTTPStatusError as e:
            status_code = e.response.status_code
            span.set_attribute("telegram.http_status", status_code)
            span.set_status(trace.Status(trace.StatusCode.ERROR))
            span.record_exception(TelegramGatewayError(f"HTTP error: {status_code}"))
            logger.error(
                "telegram_api_error",
                method=method,
                status=status_code,
                response_body=e.response.text[:500],
            )
            raise TelegramGatewayError(f"HTTP error: {status_code}") from None

        except Exception as e:
            # The error text is sanitized before it is logged or re-raised.
            safe_error = _sanitize_telegram_error(str(e))
            span.set_status(trace.Status(trace.StatusCode.ERROR))
            span.record_exception(TelegramGatewayError(safe_error))
            logger.error(
                "telegram_request_failed",
                method=method,
                error=safe_error,
                error_type=type(e).__name__,
            )
            raise TelegramGatewayError(safe_error) from None
|
||
|
||
async def _attach_incident_thread_reply(self, method: str, payload: dict) -> None:
    """Turn a follow-up message into a reply to the incident's root card.

    When ``tg_msg:{incident_id}`` holds the root card's message id, later
    messages for the same incident (runbook, escalation, execution summary)
    are sent as replies to it instead of new top-level messages. The root
    ACTION REQUIRED card and payloads that already carry reply context are
    left untouched. ``payload`` is modified in place.
    """
    # The opt-out flag is always removed so it never reaches Telegram.
    skip_requested = payload.pop("_skip_incident_thread_reply", False)
    if skip_requested:
        return

    if method != "sendMessage" or _has_reply_context(payload):
        return

    text = str(payload.get("text") or "")
    if not text or _is_root_action_required_card(text):
        return

    incident_id = _extract_incident_id_from_text(text)
    if not incident_id:
        return

    redis_key = f"tg_msg:{incident_id}"
    try:
        stored = await get_redis().get(redis_key)
    except Exception as exc:
        # Best effort: a failed lookup just leaves the message top-level.
        logger.debug(
            "telegram_incident_thread_lookup_failed",
            incident_id=incident_id,
            error=str(exc),
        )
        return

    if not stored:
        return

    try:
        root_message_id = int(stored)
    except (TypeError, ValueError):
        logger.debug(
            "telegram_incident_thread_invalid_message_id",
            incident_id=incident_id,
            stored=str(stored),
        )
        return

    payload["reply_parameters"] = dict(
        message_id=root_message_id,
        allow_sending_without_reply=True,
    )
    logger.info(
        "telegram_incident_thread_reply_attached",
        incident_id=incident_id,
        message_id=root_message_id,
    )
|
||
|
||
async def _mirror_outbound_message(
    self,
    *,
    method: str,
    payload: dict,
    provider_message_id: str,
) -> None:
    """Mirror a sent legacy Telegram message into AwoooP.

    Only ``sendMessage`` calls with a chat id and non-empty text/caption are
    recorded. Failures are logged as warnings and never propagate, so the
    actual send behaviour is unaffected.
    """
    if method != "sendMessage":
        return

    chat_id = str(payload.get("chat_id") or "")
    text = str(payload.get("text") or payload.get("caption") or "")
    if not (chat_id and text):
        return

    try:
        from src.core.context import get_current_project_id
        from src.db.base import get_db_context
        from src.services.channel_hub import record_outbound_message

        project_id = get_current_project_id() or "awoooi"
        run_id = _legacy_outbound_run_id(chat_id, provider_message_id)
        async with get_db_context(project_id) as db:
            record_fields = {
                "project_id": project_id,
                "run_id": run_id,
                "channel_type": "telegram",
                "channel_chat_id": chat_id,
                "message_type": _infer_outbound_message_type(text, payload),
                "content": text,
                "source_envelope": _outbound_source_envelope(method, payload),
                "provider_message_id": provider_message_id,
                "send_status": "sent",
                "triggered_by_state": "legacy_gateway",
                "is_shadow": False,
            }
            await record_outbound_message(db, **record_fields)
    except Exception as exc:
        logger.warning(
            "telegram_outbound_mirror_failed",
            method=method,
            chat_id=chat_id,
            provider_message_id=provider_message_id,
            error=str(exc),
        )
|
||
|
||
async def _build_inline_keyboard(
    self,
    approval_id: str,
    include_auto_tuning: bool = True,
    auto_tuning_command: str = "",
    incident_id: str = "",
    # ADR-071-E: TYPE-3 dynamic buttons
    alert_category: str = "",
    notification_type: str = "",
    # B3: optional ActionPlan for LLM dynamic buttons
    action_plan: object = None,
) -> dict:
    """Build the inline keyboard for an approval card.

    Two paths exist:
      * LLM path: taken when USE_LLM_DYNAMIC_BUTTONS is truthy and
        ``action_plan.recommended_actions`` is non-empty. Rows are
        [approve][reject] followed by the LLM action rows.
      * YAML fallback: buttons come from the callback-action registry for
        ``alert_category``. For TYPE-3 (or an empty notification type) with
        at least one registry button, the layout is [approve][reject], the
        category buttons (3 per row) and [detail][ignore]; otherwise the
        legacy [approve][reject][silence] row plus, when ``incident_id`` is
        set, the [detail][reanalyze][history] row.

    Both paths append the AwoooP runs row (when the helper returns one) and
    the auto-tuning button (when requested and a command is given).

    Args:
        approval_id: approval ticket ID (used to generate nonces).
        include_auto_tuning: whether the auto-tuning button may be added.
        auto_tuning_command: kubectl tuning command; empty disables the button.
        incident_id: related incident ID for detail/reanalyze/history buttons.
        alert_category: alert category that selects the registry buttons.
        notification_type: notification type (TYPE-1/2/3/4/4D).
        action_plan: object whose ``recommended_actions`` feeds the LLM path.

    Returns:
        dict: ``{"inline_keyboard": rows}``.
    """
    make_nonce = self._security.generate_callback_nonce

    # Replay-protection nonces for the write operations.
    approve_nonce = make_nonce(approval_id, "approve")
    reject_nonce = make_nonce(approval_id, "reject")
    silence_nonce = make_nonce(approval_id, "silence")

    # HARD RULE: approve/reject always form the first row on every path.
    first_row: list[dict] = [
        {"text": "✅ 批准", "callback_data": approve_nonce},
        {"text": "❌ 拒絕", "callback_data": reject_nonce},
    ]

    # ── B3: LLM dynamic path ────────────────────────────────────────────
    llm_actions = (
        None
        if action_plan is None
        else getattr(action_plan, "recommended_actions", None)
    )
    if USE_LLM_DYNAMIC_BUTTONS and llm_actions:
        llm_rows = await self._build_llm_action_buttons(llm_actions, incident_id=incident_id)
        buttons: list[list[dict]] = [first_row, *llm_rows]
        logger.info(
            "telegram_keyboard_built",
            source="llm",
            action_count=len(llm_actions),
        )

        # Auto-tuning button (v7.0).
        if include_auto_tuning and auto_tuning_command:
            tuning_nonce = make_nonce(approval_id, "tune")
            buttons.append([{"text": "⚡ 執行自動調優", "callback_data": tuning_nonce}])

        awooop_row = _awooop_runs_button_row(incident_id)
        if awooop_row:
            buttons.append(awooop_row)

        return {"inline_keyboard": buttons}

    # ── YAML fallback path ──────────────────────────────────────────────
    # Buttons are generated from the callback_action_spec registry; each
    # spec's callback_format decides the callback_data layout.
    from src.services.callback_dispatcher import list_actions_for_category as _list_actions

    is_type3 = notification_type in ("TYPE-3", NotificationType.TYPE_3, "")

    dynamic_buttons: list[tuple[str, str]] = []
    if alert_category:
        for spec in _list_actions(alert_category):
            caption = f"{spec.emoji} {spec.label}".strip()
            if spec.callback_format == "nonce":
                # Write action: nonce-protected callback.
                callback = make_nonce(approval_id, spec.name)
            else:
                # Read-only action: "action:incident_id".
                callback = f"{spec.name}:{incident_id}"
            dynamic_buttons.append((caption, callback))

    if is_type3 and dynamic_buttons:
        # TYPE-3: approve/reject stay pinned on top so the review trigger is
        # the first thing seen; category buttons follow, 3 per row.
        category_btns = [
            {"text": caption, "callback_data": callback}
            for caption, callback in dynamic_buttons
        ]
        buttons = [first_row]
        buttons.extend(
            category_btns[start:start + 3]
            for start in range(0, len(category_btns), 3)
        )
        # Generic actions: [detail] [ignore].
        buttons.append([
            {"text": "📋 詳情", "callback_data": f"detail:{incident_id}"},
            {"text": "🔕 忽略", "callback_data": silence_nonce},
        ])
    else:
        # Legacy generic keys (backward compatible).
        buttons = [
            [*first_row, {"text": "🔕 靜默", "callback_data": silence_nonce}],
        ]
        # Second row: read-only info buttons (ADR-050, "action:incident_id").
        if incident_id:
            buttons.append([
                {"text": "📋 詳情", "callback_data": f"detail:{incident_id}"},
                {"text": "🔄 重診", "callback_data": f"reanalyze:{incident_id}"},
                {"text": "📊 歷史", "callback_data": f"history:{incident_id}"},
            ])

    # Both layouts end with the optional AwoooP evidence deep-link row.
    awooop_row = _awooop_runs_button_row(incident_id)
    if awooop_row:
        buttons.append(awooop_row)

    logger.info(
        "telegram_keyboard_built",
        source="yaml_fallback",
        action_count=len(dynamic_buttons),
    )

    # Auto-tuning button (v7.0).
    if include_auto_tuning and auto_tuning_command:
        tuning_nonce = make_nonce(approval_id, "tune")
        buttons.append([
            {"text": "⚡ 執行自動調優", "callback_data": tuning_nonce}
        ])

    return {"inline_keyboard": buttons}
|
||
|
||
async def _build_llm_action_buttons(
|
||
self,
|
||
actions: list,
|
||
incident_id: str = "",
|
||
) -> list[list[dict]]:
|
||
"""
|
||
2026-04-27 Claude Sonnet 4.6: B3 — 從 RecommendedAction list 建立 Telegram 按鈕排
|
||
2026-04-27 Claude Sonnet 4.6: H3+M6 Fix — short_id Redis 映射 + critical 過濾
|
||
2026-04-27 Claude Sonnet 4.6: P0 Fix — async setex 在 return 前完成,消除按鈕已過期 race
|
||
|
||
規格:
|
||
- critical risk action → 直接跳過,不生成按鈕(M6)
|
||
- 每個 RecommendedAction → 一個按鈕
|
||
- text = f"{action.emoji} {action.label}"(risk=high 前綴 ⚠️)
|
||
- callback_data = f"la:{short_id}"(16-hex-chars,≤19 bytes,絕不截斷)(H3)
|
||
- 完整 payload(含 incident_id)寫入 Redis tg:la:{short_id},TTL=3600s(H3)
|
||
- Redis setex 在 return 之前 await 完成(P0 race fix)
|
||
- 每排最多 2 個(同 YAML fallback 排版)
|
||
- 不包含第一排 [批准][拒絕](由呼叫方負責置頂)
|
||
|
||
Args:
|
||
actions: list[RecommendedAction]
|
||
incident_id: 真實 incident ID,寫入 Redis payload 供 callback handler 還原
|
||
|
||
Returns:
|
||
list[list[dict]] — 按鈕行列(不含第一排)
|
||
"""
|
||
import json # noqa: PLC0415
|
||
import secrets # noqa: PLC0415
|
||
|
||
btn_list: list[dict] = []
|
||
redis_writes: list[tuple[str, str]] = [] # (key, json_str)
|
||
|
||
for action in actions:
|
||
name: str = getattr(action, "name", "")
|
||
label: str = getattr(action, "label", "")
|
||
emoji: str = getattr(action, "emoji", "")
|
||
provider: str = getattr(action, "mcp_provider", "")
|
||
tool: str = getattr(action, "mcp_tool", "")
|
||
risk: str = getattr(action, "risk", "low")
|
||
|
||
# M6: critical risk 直接跳過,不出按鈕
|
||
# 2026-04-27 Claude Sonnet 4.6: M6 Fix — critical action 不可被 Telegram 觸發
|
||
if risk == "critical":
|
||
logger.info(
|
||
"llm_button_critical_skipped",
|
||
name=name,
|
||
mcp_tool=tool,
|
||
)
|
||
continue
|
||
|
||
# risk=high 前綴 ⚠️ 警示
|
||
prefix = "⚠️ " if risk == "high" else ""
|
||
text = f"{prefix}{emoji} {label}".strip()
|
||
|
||
# H3: 16-hex short_id(64-bit),callback_data ≤19 bytes
|
||
short_id = secrets.token_hex(8) # 16-hex-chars(P1: 4→8 bytes 防碰撞)
|
||
cb_str = f"la:{short_id}"
|
||
|
||
payload_str = json.dumps(
|
||
{
|
||
"name": name,
|
||
"provider": provider,
|
||
"tool": tool,
|
||
"risk": risk,
|
||
"incident_id": incident_id, # P0: 真實 incident_id 進 Redis
|
||
},
|
||
ensure_ascii=False,
|
||
separators=(",", ":"),
|
||
)
|
||
redis_writes.append((f"tg:la:{short_id}", payload_str))
|
||
|
||
btn_list.append({"text": text, "callback_data": cb_str})
|
||
|
||
# P0 Fix: await 完成再 return,消除「按鈕發出→點擊→Redis 還沒寫」的 race
|
||
if redis_writes:
|
||
try:
|
||
redis = get_redis()
|
||
for key, value in redis_writes:
|
||
await redis.setex(key, 3600, value)
|
||
logger.debug("llm_button_redis_written", count=len(redis_writes))
|
||
except Exception as exc:
|
||
# 2026-04-28 ogt + Claude Opus 4.7: P0-4 鬼魂按鈕守門
|
||
# feedback_no_ghost_buttons.md 三缺一鐵律:callback 對應 short_id 找不到 = 鬼魂
|
||
# Redis 寫入失敗 → LLM 動態按鈕的 callback_data 在 Redis 撈不到 payload → 鬼魂風險
|
||
# 對策:清空 LLM 動態按鈕,caller (build_keyboard) 1488 行的 first_row 永遠保留
|
||
# (✅ 批准 / ❌ 拒絕 用 HMAC nonce,無狀態,不依賴 Redis)
|
||
# 統帥仍可走核心通道,少了 LLM 推薦的 specific actions(可接受的降級)
|
||
logger.error(
|
||
"llm_button_redis_write_failed_fallback_to_static",
|
||
error=str(exc),
|
||
dropped_count=len(btn_list),
|
||
hint="user will see only first_row (approve/reject), LLM-recommended actions dropped",
|
||
)
|
||
btn_list.clear()
|
||
|
||
# 每排最多 2 個
|
||
rows: list[list[dict]] = [btn_list[i:i+2] for i in range(0, len(btn_list), 2)]
|
||
return rows
|
||
|
||
async def send_analyzing_placeholder(
|
||
self,
|
||
alert_type: str,
|
||
resource_name: str,
|
||
severity: str = "medium",
|
||
) -> int | None:
|
||
"""
|
||
P2.4 中間態推播 2026-04-24 ogt + Claude Sonnet 4.6
|
||
在 LLM 分析開始前送出佔位卡,讓使用者知道系統正在處理。
|
||
分析完成後用 delete_message() 刪除,再由 send_approval_card 補上完整卡。
|
||
Returns: Telegram message_id 或 None(Bot 未設定 / API 失敗)
|
||
"""
|
||
if not self.bot_token:
|
||
return None
|
||
emoji = {"critical": "🔴", "medium": "🟡", "low": "🟢"}.get(severity.lower(), "⚠️")
|
||
text = (
|
||
f"{emoji} <b>告警收到,AI 正在分析中...</b>\n\n"
|
||
f"資源: <code>{html.escape(resource_name or 'unknown')}</code>\n"
|
||
f"類型: <code>{html.escape(alert_type or 'unknown')}</code>\n\n"
|
||
f"<i>預計 10-30 秒完成,請稍候...</i>"
|
||
)
|
||
try:
|
||
result = await self._send_request("sendMessage", {
|
||
"chat_id": self.alert_chat_id,
|
||
"text": text,
|
||
"parse_mode": "HTML",
|
||
"disable_web_page_preview": True,
|
||
})
|
||
msg_id: int | None = None
|
||
result_val = result.get("result")
|
||
if isinstance(result_val, dict):
|
||
msg_id = result_val.get("message_id")
|
||
logger.info("analyzing_placeholder_sent", message_id=msg_id, resource=resource_name)
|
||
return msg_id
|
||
except Exception as e:
|
||
logger.warning("analyzing_placeholder_failed", error=str(e))
|
||
return None
|
||
|
||
async def delete_message(self, message_id: int) -> bool:
    """Delete a placeholder card from the alert chat (P2.4 cleanup).

    Called after the analysis finished and the full card has been sent.

    Returns:
        True when Telegram accepted the deletion; False when the bot token
        or message_id is missing, or the request failed.
    """
    if not (self.bot_token and message_id):
        return False

    request = {
        "chat_id": self.alert_chat_id,
        "message_id": message_id,
    }
    try:
        await self._send_request("deleteMessage", request)
    except Exception as e:
        logger.warning("placeholder_delete_failed", message_id=message_id, error=str(e))
        return False

    logger.info("placeholder_deleted", message_id=message_id)
    return True
|
||
|
||
async def send_approval_card(
    self,
    approval_id: str,
    risk_level: str,
    resource_name: str,
    root_cause: str,
    suggested_action: str,
    estimated_downtime: str = "~30s",
    # v6.0 AI arbitration fields
    primary_responsibility: str = "COLLAB",
    confidence: float = 0.0,
    namespace: str = "default",
    # v7.0 SignOz integration
    signoz_rps: float = 0.0,
    signoz_rps_trend: str = "stable",
    signoz_error_rate: float = 0.0,
    signoz_p99_latency: float = 0.0,
    signoz_latency_trend: str = "stable",
    signoz_trace_url: str = "",
    auto_tuning_command: str = "",
    # AI token / cost tracking
    ai_tokens: int = 0,
    ai_cost: float = 0.0,
    # ADR-037 anomaly frequency statistics
    anomaly_frequency: dict | None = None,
    # AI provider shown on the card
    ai_provider: str = "",
    # underlying model name
    ai_model: str = "",
    # Phase 22 Nemotron collaboration (ADR-044)
    nemotron_enabled: bool = False,
    nemotron_tools: list[dict] | None = None,
    nemotron_validation: str = "",
    nemotron_latency_ms: float = 0.0,
    # tool-calling model / backend shown on the card
    nemotron_tool_model: str = "",
    nemotron_tool_backend: str = "",
    # incident_id enables the detail/reanalyze/history buttons
    incident_id: str = "",
    # ADR-075 alert category and notification type
    alert_category: str = "",
    notification_type: str = "",
    # ADR-076 remediation chain display
    playbook_name: str = "",
    automation_state: str = "",
) -> dict:
    """Push a pending-approval card to Telegram (v7.0, SignOz integrated).

    Steps: build the message (SignOz block, remediation summary and, when
    available, automation quality from the truth chain), build the inline
    keyboard, send it to ``alert_chat_id``, then persist the outcome. Each
    persistence step (notification_outcomes row, approval_records update,
    Redis message-id keys) is best effort: failures are logged as warnings
    and do not affect the return value.

    Args:
        approval_id: approval ticket ID.
        risk_level: risk level (critical/high/medium/low).
        resource_name: resource name.
        root_cause: root-cause summary.
        suggested_action: suggested action.
        estimated_downtime: estimated downtime.
        primary_responsibility: responsible team (FE/BE/INFRA/DB/COLLAB).
        confidence: AI confidence (0.0-1.0).
        namespace: K8s namespace.
        signoz_*: SignOz metrics; the metrics block is only shown when rps,
            error rate or p99 latency is greater than zero.
        signoz_trace_url: trace URL.
        auto_tuning_command: kubectl tuning command; enables the tune button.
        anomaly_frequency: anomaly frequency statistics (ADR-037).
        nemotron_enabled: use the Nemotron dual-track message format.
        incident_id: related incident; also keys ``tg_msg:{incident_id}``.
        alert_category / notification_type: select the keyboard layout.

    Returns:
        dict: the Telegram API response, or ``{}`` when no alert chat is
        configured.

    Raises:
        TelegramGatewayError: propagated from ``_send_request``.
    """
    # Status emoji for the risk level.
    emoji = RISK_EMOJI_MAP.get(risk_level.lower(), "⚠️")

    # SignOz metrics block, only when at least one metric is present.
    signoz_metrics = None
    if signoz_rps > 0 or signoz_error_rate > 0 or signoz_p99_latency > 0:
        signoz_metrics = SignOzMetricsBlock(
            rps=signoz_rps,
            rps_trend=signoz_rps_trend,
            error_rate=signoz_error_rate,
            p99_latency_ms=signoz_p99_latency,
            latency_trend=signoz_latency_trend,
            trace_url=signoz_trace_url,
        )

    automation_quality: dict | None = None
    remediation_summary = await _fetch_remediation_summary_for_card(
        approval_id=approval_id,
        incident_id=incident_id,
    )
    if incident_id:
        try:
            from src.services.awooop_truth_chain_service import fetch_truth_chain

            # Bounded wait so a slow truth-chain lookup cannot stall the card.
            truth_chain = await asyncio.wait_for(
                fetch_truth_chain(
                    source_id=incident_id,
                    project_id="awoooi",
                ),
                timeout=2.5,
            )
            quality = truth_chain.get("automation_quality")
            if isinstance(quality, dict):
                automation_quality = quality
        except Exception as truth_exc:
            logger.debug(
                "telegram_approval_card_truth_chain_fetch_failed",
                approval_id=approval_id,
                incident_id=incident_id,
                error=str(truth_exc),
            )

    # Message structure (AI arbitration + SignOz + token/cost + frequency).
    message = TelegramMessage(
        status_emoji=emoji,
        risk_level=risk_level.upper(),
        resource_name=resource_name,
        root_cause=root_cause,
        suggested_action=suggested_action,
        estimated_downtime=estimated_downtime,
        approval_id=approval_id,
        incident_id=incident_id,
        primary_responsibility=primary_responsibility,
        confidence=confidence,
        namespace=namespace,
        signoz_metrics=signoz_metrics,
        signoz_trace_url=signoz_trace_url,
        auto_tuning_command=auto_tuning_command,
        ai_tokens=ai_tokens,
        ai_cost=ai_cost,
        anomaly_frequency=anomaly_frequency,
        ai_provider=ai_provider,
        ai_model=ai_model,
        nemotron_enabled=nemotron_enabled,
        nemotron_tools=nemotron_tools,
        nemotron_validation=nemotron_validation,
        nemotron_latency_ms=nemotron_latency_ms,
        nemotron_tool_model=nemotron_tool_model,
        nemotron_tool_backend=nemotron_tool_backend,
        alert_category=alert_category,
        playbook_name=playbook_name,
        automation_state=automation_state,
        automation_quality=automation_quality,
        remediation_summary=remediation_summary,
    )

    # Phase 22: the dual-track format is used when Nemotron is enabled.
    text = message.format_with_nemotron() if nemotron_enabled else message.format()

    # Keyboard: incident_id enables the info buttons; alert_category and
    # notification_type select the category-specific layout (ADR-075).
    keyboard = await self._build_inline_keyboard(
        approval_id=approval_id,
        include_auto_tuning=bool(auto_tuning_command),
        auto_tuning_command=auto_tuning_command,
        incident_id=incident_id,
        alert_category=alert_category,
        notification_type=notification_type,
    )

    # Alert cards go to the SRE group (falls back to the personal chat).
    target_chat_id = self.alert_chat_id
    if not target_chat_id:
        logger.warning("telegram_approval_card_skipped", reason="alert_chat_id_missing")
        return {}
    payload = {
        "chat_id": target_chat_id,
        "text": text,
        "parse_mode": "HTML",
        "reply_markup": keyboard,
        "disable_web_page_preview": True,  # avoid a preview of the SignOz URL
    }

    logger.info(
        "telegram_approval_card_sending",
        approval_id=approval_id,
        risk_level=risk_level,
        resource=resource_name,
        target_chat_id=str(target_chat_id),
        signoz_integrated=signoz_metrics is not None,
        auto_tuning_available=bool(auto_tuning_command),
    )

    result = await self._send_request("sendMessage", payload)

    # "result" is only trusted when it is a dict, matching _send_request.
    _sent = result.get("result")
    _msg_id = _sent.get("message_id") if isinstance(_sent, dict) else None
    logger.info(
        "telegram_approval_card_sent",
        approval_id=approval_id,
        message_id=_msg_id,
        target_chat_id=str(target_chat_id),
    )

    # ADR-090-D: record the delivery in notification_outcomes.
    try:
        from sqlalchemy import text as _sql

        from src.db.base import get_db_context

        _delivered = "delivered" if _msg_id else "failed"
        _notif_type = f"TYPE-3-{alert_category}" if alert_category else "TYPE-3"
        # json.dumps escapes quotes/backslashes, so the value cast to jsonb
        # is always valid JSON whatever risk_level contains.
        _metadata = json.dumps(
            {"risk_level": str(risk_level)},
            ensure_ascii=False,
            separators=(",", ":"),
        )
        async with get_db_context() as _db:
            await _db.execute(
                _sql("""
                    INSERT INTO notification_outcomes (
                        approval_id, channel, notification_type, recipient,
                        message_id, delivery_status, metadata
                    ) VALUES (
                        :aid, 'telegram', :nt, :rp,
                        :mid, :ds, CAST(:md AS jsonb)
                    )
                """),
                {
                    "aid": approval_id,
                    "nt": _notif_type,
                    "rp": str(target_chat_id),
                    "mid": str(_msg_id) if _msg_id else None,
                    "ds": _delivered,
                    "md": _metadata,
                },
            )
    except Exception as _db_e:
        logger.warning("notification_outcomes_db_write_failed", error=str(_db_e))

    # AP-1: also persist the message id in approval_records, because the
    # Redis copy below is lost on restart.
    if _msg_id:
        try:
            from src.services.approval_db import get_approval_service

            _svc = get_approval_service()
            if hasattr(_svc, "update_telegram_message"):
                from sqlalchemy import text as _sql2

                from src.db.base import get_db_context as _gdc

                async with _gdc() as _db2:
                    await _db2.execute(
                        _sql2("""
                            UPDATE approval_records
                            SET telegram_message_id = :mid,
                                telegram_chat_id = :cid
                            WHERE id = :aid
                        """),
                        {
                            "mid": int(_msg_id),
                            "cid": int(target_chat_id),
                            "aid": str(approval_id),
                        },
                    )
        except Exception as _db_e2:
            logger.warning(
                "approval_tg_msg_id_db_persist_failed",
                approval_id=str(approval_id),
                error=str(_db_e2),
            )

    # Store the message id (TTL 24h) so the card can be updated after an
    # automatic repair and follow-ups can reply to it.
    if _msg_id:
        try:
            await get_redis().setex(f"tg_approval:{approval_id}", 86400, str(_msg_id))
            if incident_id:
                await get_redis().setex(f"tg_msg:{incident_id}", 86400, str(_msg_id))
        except Exception as _e:
            logger.warning("tg_approval_msg_id_store_failed", approval_id=approval_id, error=str(_e))

    return result
|
||
|
||
async def _send_approval_card_to_group(
    self,
    approval_id: str,
    risk_level: str,
    resource_name: str,
    root_cause: str,
    suggested_action: str,
    estimated_downtime: str = "~30s",
    primary_responsibility: str = "COLLAB",
    confidence: float = 0.0,
    namespace: str = "default",
    signoz_rps: float = 0.0,
    signoz_rps_trend: str = "stable",
    signoz_error_rate: float = 0.0,
    signoz_p99_latency: float = 0.0,
    signoz_latency_trend: str = "stable",
    signoz_trace_url: str = "",
    auto_tuning_command: str = "",
    ai_tokens: int = 0,
    ai_cost: float = 0.0,
    anomaly_frequency: dict | None = None,
    ai_provider: str = "",
    ai_model: str = "",
    nemotron_enabled: bool = False,
    nemotron_tools: list[dict] | None = None,
    nemotron_validation: str = "",
    nemotron_latency_ms: float = 0.0,
    incident_id: str = "",
    alert_category: str = "",
    notification_type: str = "",
) -> None:
    """
    Post the full v7.0 approval card to the SRE group chat.

    The card carries the optional SignOz metrics block, the AI / Nemotron
    fields, the remediation summary and the complete inline keyboard built
    by ``_build_inline_keyboard``.  It is sent through ``send_to_group``.
    When ``incident_id`` is set and the response carries a message id, that
    id is cached in Redis under ``tg_msg:{incident_id}`` for 24 hours.

    Per the original note this coroutine is meant to be scheduled with
    ``asyncio.create_task``: every exception is caught and logged as
    ``send_approval_card_to_group_failed`` and nothing is raised.
    """
    try:
        risk_icon = RISK_EMOJI_MAP.get(risk_level.lower(), "⚠️")

        # The SignOz block is attached only when at least one metric is positive.
        has_signoz_data = any(
            value > 0
            for value in (signoz_rps, signoz_error_rate, signoz_p99_latency)
        )
        metrics_block = (
            SignOzMetricsBlock(
                rps=signoz_rps,
                rps_trend=signoz_rps_trend,
                error_rate=signoz_error_rate,
                p99_latency_ms=signoz_p99_latency,
                latency_trend=signoz_latency_trend,
                trace_url=signoz_trace_url,
            )
            if has_signoz_data
            else None
        )

        summary = await _fetch_remediation_summary_for_card(
            approval_id=approval_id,
            incident_id=incident_id,
        )
        card = TelegramMessage(
            status_emoji=risk_icon,
            risk_level=risk_level.upper(),
            resource_name=resource_name,
            root_cause=root_cause,
            suggested_action=suggested_action,
            estimated_downtime=estimated_downtime,
            approval_id=approval_id,
            incident_id=incident_id,
            primary_responsibility=primary_responsibility,
            confidence=confidence,
            namespace=namespace,
            signoz_metrics=metrics_block,
            signoz_trace_url=signoz_trace_url,
            auto_tuning_command=auto_tuning_command,
            ai_tokens=ai_tokens,
            ai_cost=ai_cost,
            anomaly_frequency=anomaly_frequency,
            ai_provider=ai_provider,
            ai_model=ai_model,
            nemotron_enabled=nemotron_enabled,
            nemotron_tools=nemotron_tools,
            nemotron_validation=nemotron_validation,
            nemotron_latency_ms=nemotron_latency_ms,
            remediation_summary=summary,
        )
        if nemotron_enabled:
            card_text = card.format_with_nemotron()
        else:
            card_text = card.format()

        # Group cards use the same full keyboard as the direct-message cards.
        group_markup = await self._build_inline_keyboard(
            approval_id=approval_id,
            incident_id=incident_id,
            alert_category=alert_category,
            notification_type=notification_type,
        )
        response = await self.send_to_group(text=card_text, reply_markup=group_markup)

        # Remember the message id so later incident updates can find the card.
        if not (incident_id and response):
            return
        sent_message_id = (
            (response.get("result") or {}).get("message_id")
            or response.get("message_id")
        )
        if not sent_message_id:
            return

        from src.core.redis_client import get_redis

        await get_redis().set(f"tg_msg:{incident_id}", str(sent_message_id), ex=86400)
        logger.info("tg_msg_id_stored", incident_id=incident_id, message_id=sent_message_id)
    except Exception as exc:
        logger.error("send_approval_card_to_group_failed", error=str(exc))
|
||
|
||
# =========================================================================
|
||
# ADR-071-C: TYPE-1 純資訊通知 (2026-04-11 Claude Sonnet 4.6)
|
||
# =========================================================================
|
||
|
||
async def send_info_notification(
    self,
    incident_id: str,
    title: str,
    message: str,
    alertname: str = "",
    severity: str = "info",
) -> dict:
    """
    Send a TYPE-1 informational (FYI) notification.

    The message is short HTML text followed by a row of two read-only
    buttons, ``detail:{incident_id}`` and ``history:{incident_id}``.  If
    ``_awooop_runs_button_row`` returns a row for the incident, it is
    appended as a second keyboard row.

    Args:
        incident_id: Incident identifier shown in the card and used in the
            button callback data.
        title: Headline of the message (HTML-escaped).
        message: Body text (HTML-escaped).
        alertname: Optional alert name; the line is omitted when empty.
        severity: ``"info"`` or ``"success"``; any other value falls back to
            the info icon.

    Returns:
        The response of ``_send_request("sendMessage", ...)``.
    """
    icon = {"info": "ℹ️", "success": "✅"}.get(severity, "ℹ️")

    parts = [
        f"{icon} <b>{html.escape(title)}</b>\n",
        "━━━━━━━━━━━━━━━━━━━\n",
        f"📋 <code>{html.escape(incident_id)}</code>\n",
    ]
    if alertname:
        parts.append(f"🔔 告警: <code>{html.escape(alertname)}</code>\n")
    parts.append(f"\n{html.escape(message)}")
    body = "".join(parts)

    # Read-only lookup buttons in the two-part "action:id" callback format.
    rows = [
        [
            {"text": "📋 詳情", "callback_data": f"detail:{incident_id}"},
            {"text": "📊 歷史", "callback_data": f"history:{incident_id}"},
        ]
    ]
    extra_row = _awooop_runs_button_row(incident_id)
    if extra_row:
        rows.append(extra_row)

    request = {
        "chat_id": self.alert_chat_id,
        "text": body,
        "parse_mode": "HTML",
        "reply_markup": {"inline_keyboard": rows},
    }
    return await self._send_request("sendMessage", request)
|
||
|
||
# =========================================================================
|
||
# ADR-071-F: TYPE-4D Config Drift 專屬卡片 (2026-04-11 Claude Sonnet 4.6)
|
||
# =========================================================================
|
||
|
||
async def send_drift_card(
    self,
    incident_id: str,
    approval_id: str,
    resource_name: str,
    diff_summary: str,
    detected_at: str = "",
) -> dict:
    """
    Send the TYPE-4D Config Drift card with its four action buttons.

    Buttons: view diff / adopt change / revert / ignore (the ignore button
    reuses the ``silence`` action).  A diff summary of at most 1500
    characters is shown inline as escaped plain text; a longer one is
    replaced by a link to the web UI.

    After sending, the Telegram message id is cached in Redis for 24 hours
    under ``tg_drift:{incident_id}`` and ``tg_drift:{approval_id}``.  Both
    keys are written because the drift button handlers look the card up by
    the id embedded in the callback nonce, which is ``approval_id``.

    Args:
        incident_id: Incident identifier shown in the card.
        approval_id: Approval id the callback nonces are generated for.
        resource_name: Name of the drifted resource (first 50 chars shown).
        diff_summary: Summary text of the drift.
        detected_at: Optional detection time; the line is omitted when empty.

    Returns:
        The response of ``_send_request("sendMessage", ...)``.
    """
    # The summary is narrative text plus an emoji list, not code, so it is
    # rendered as plain escaped text rather than inside <pre>.
    if len(diff_summary) <= 1500:
        diff_block = f"\n━━━━━━━━━━━━━━━━━━━\n{html.escape(diff_summary)}"
    else:
        # Percent-encode the id so it cannot break out of the href attribute.
        web_url = (
            "https://aiops.wooo.work/incidents/"
            f"{quote(str(incident_id), safe='')}/drift-diff"
        )
        diff_block = f"\n⚠️ 差異過大({len(diff_summary)} 字)\n🔗 <a href='{web_url}'>查看完整 Diff</a>"

    text = (
        f"⚙️ <b>Config Drift 偵測</b>\n"
        f"━━━━━━━━━━━━━━━━━━━\n"
        f"📋 <code>{html.escape(incident_id)}</code>\n"
        f"🎯 資源: <code>{html.escape(resource_name[:50])}</code>\n"
    )
    if detected_at:
        text += f"🕐 偵測時間: {html.escape(detected_at)}\n"
    text += diff_block

    # Fixed four-button layout for TYPE-4D.
    view_nonce = self._security.generate_callback_nonce(approval_id, "drift_view")
    adopt_nonce = self._security.generate_callback_nonce(approval_id, "drift_adopt")
    revert_nonce = self._security.generate_callback_nonce(approval_id, "drift_revert")
    ignore_nonce = self._security.generate_callback_nonce(approval_id, "silence")

    keyboard = {
        "inline_keyboard": [
            [
                {"text": "🔍 查看 Diff", "callback_data": view_nonce},
                {"text": "✅ 採納變更", "callback_data": adopt_nonce},
            ],
            [
                {"text": "⏪ 回滾", "callback_data": revert_nonce},
                {"text": "🔕 忽略", "callback_data": ignore_nonce},
            ],
        ]
    }

    _result = await self._send_request(
        "sendMessage",
        {
            "chat_id": self.alert_chat_id,
            "text": text,
            "parse_mode": "HTML",
            "reply_markup": keyboard,
        },
    )

    # Best effort: a cache failure must not fail the send itself.
    try:
        _msg_id = (_result.get("result") or {}).get("message_id")
        if _msg_id:
            _redis = get_redis()
            # dict.fromkeys de-duplicates while keeping order, so a card whose
            # two ids are equal is written only once.
            for _key_id in dict.fromkeys((incident_id, approval_id)):
                if _key_id:
                    await _redis.setex(f"tg_drift:{_key_id}", 86400, str(_msg_id))
    except Exception as _e:
        logger.warning("tg_drift_msg_id_store_failed", incident_id=incident_id, error=str(_e))

    return _result
|
||
|
||
# =========================================================================
|
||
# 2026-04-19 ogt + Claude Opus 4.7: drift_* 按鈕 handler (修 TG-2)
|
||
# =========================================================================
|
||
|
||
async def _handle_drift_action(
    self,
    action: str,
    approval_id: str,
    callback_query_id: str,
    user_id: int,
    username: str,
    user: dict,
) -> dict:
    """
    Dispatch the ``drift_view`` / ``drift_adopt`` / ``drift_revert`` buttons.

    The ``approval_id`` taken from the callback is used as the drift
    ``report_id``.  ``drift_view`` sends the paginated diff; the other two
    call the adopt service or the remediator and then stamp the outcome on
    the card via ``_edit_drift_card_outcome``.

    Returns:
        A dict with ``action``, ``approval_id``, ``user`` and ``success``
        (plus ``info_action`` for ``drift_view``).  Any unknown action, or
        any exception escaping the dispatch, yields ``success=False``.
    """
    report_id = approval_id
    logger.info(
        "drift_callback_dispatched",
        action=action, report_id=report_id,
        user_id=user_id, username=username,
    )
    try:
        if action == "drift_view":
            await self._answer_callback(callback_query_id, action, text="🔍 撈全部 Diff...")
            await self._send_drift_diff_detail(report_id)
            return {
                "action": action, "approval_id": approval_id,
                "user": user, "success": True, "info_action": True,
            }

        if action in ("drift_adopt", "drift_revert"):
            adopting = action == "drift_adopt"
            toast = "✅ 採納中..." if adopting else "⏪ 回滾中..."
            await self._answer_callback(callback_query_id, action, text=toast)

            try:
                if adopting:
                    from src.services.drift_adopt_service import get_drift_adopt_service
                    outcome = await get_drift_adopt_service().adopt_drift(report_id)
                else:
                    from src.services.drift_remediator import get_drift_remediator
                    outcome = await get_drift_remediator().revert(report_id)
                # Services may answer with a dict carrying "success" or a bare truthy value.
                succeeded = bool(outcome.get("success") if isinstance(outcome, dict) else outcome)
            except Exception as exc:
                failure_event = "drift_adopt_failed" if adopting else "drift_revert_failed"
                logger.warning(failure_event, report_id=report_id, error=str(exc))
                succeeded = False

            await self._edit_drift_card_outcome(
                report_id=report_id,
                verb="已採納" if adopting else "已回滾",
                by=username,
                ok=succeeded,
            )
            return {"action": action, "approval_id": approval_id, "user": user, "success": succeeded}

    except Exception as outer_exc:
        logger.exception("drift_action_handler_error", action=action, error=str(outer_exc))

    return {"action": action, "approval_id": approval_id, "user": user, "success": False}
|
||
|
||
# 2026-04-20 P0.1 ogt + Claude Opus 4.7: drift_view 分頁 + 分類桶
|
||
# 原邏輯: _send_drift_diff_detail 一次列 3800 字元 → 30 項洗版
|
||
# 新邏輯: 分頁 10 項/頁、header 顯示 3 桶分類計數、⬅️/➡️ 按鈕切頁
|
||
_DRIFT_PAGE_SIZE = 10  # drift items rendered per page; read as self._DRIFT_PAGE_SIZE by _send_drift_diff_detail
|
||
|
||
def _classify_drift_item(self, item) -> str:
    """
    Classify a drift item into one of three buckets using fixed rules.

    Buckets:
        - ``"k8s_default"``: the item is allow-listed, or both the Git value
          and the actual value are empty-like (``None``, ``""``, ``"{}"``,
          ``"[]"``, ``"null"``, ``"None"`` after stripping whitespace).
        - ``"human_high"``: the drift level is "high" (compared
          case-insensitively) and the item is not ``k8s_default``.
        - ``"routine_medium"``: everything else.

    Args:
        item: Object exposing ``drift_level`` (a plain value or an enum-like
            object with ``.value``), ``is_allowlisted``, ``git_value`` and
            ``actual_value``.

    Returns:
        The bucket name.
    """
    # Normalise the level so "HIGH", "High" and "high" all match, whether it
    # arrives as an enum member (read via .value) or as a plain value.
    raw_level = getattr(item.drift_level, "value", item.drift_level)
    level = str(raw_level).strip().lower()

    if item.is_allowlisted:
        return "k8s_default"

    empty_markers = ("", "{}", "[]", "null", "None")

    def _is_empty(value) -> bool:
        return value is None or str(value).strip() in empty_markers

    # Empty on both sides is treated as the controller filling in defaults.
    if _is_empty(item.git_value) and _is_empty(item.actual_value):
        return "k8s_default"
    if level == "high":
        return "human_high"
    return "routine_medium"
|
||
|
||
async def _send_drift_diff_detail(self, report_id: str, page: int = 0) -> None:
    """
    Send one page of a drift report's diff to Telegram.

    Items are classified with ``_classify_drift_item``, ordered
    human_high, then routine_medium, then k8s_default, and shown
    ``_DRIFT_PAGE_SIZE`` per page.  The header shows the page position and
    the per-bucket counts.  Previous / next buttons use the callback format
    ``drift_view_page:{report_id}_{page}``.

    Args:
        report_id: Drift report id looked up in the drift repository.
        page: Zero-based page index; out-of-range values are clamped.

    If the report is missing, a "not found" message is sent instead.  Any
    exception is logged as ``drift_diff_detail_send_failed`` and a short
    failure message is sent to the chat.
    """
    try:
        from src.repositories.drift_repository import get_drift_repository

        _rpt = await get_drift_repository().get_by_id(report_id)
        if not _rpt:
            await self._send_request("sendMessage", {
                "chat_id": self.alert_chat_id,
                "text": f"⚠️ 找不到 Drift report <code>{html.escape(report_id)}</code>",
                "parse_mode": "HTML",
            })
            return

        # 1. Classify and sort (list.sort is stable, so order inside a bucket is kept).
        _bucket_order = {"human_high": 0, "routine_medium": 1, "k8s_default": 2}
        _classified: list[tuple[str, object]] = [
            (self._classify_drift_item(_it), _it) for _it in _rpt.items
        ]
        _classified.sort(key=lambda pair: _bucket_order[pair[0]])

        _bucket_counts = dict.fromkeys(_bucket_order, 0)
        for _bk, _ in _classified:
            _bucket_counts[_bk] += 1

        _total = len(_classified)
        _total_pages = max(1, (_total + self._DRIFT_PAGE_SIZE - 1) // self._DRIFT_PAGE_SIZE)
        _page = max(0, min(page, _total_pages - 1))
        _start = _page * self._DRIFT_PAGE_SIZE
        _end = min(_start + self._DRIFT_PAGE_SIZE, _total)
        _slice = _classified[_start:_end]

        # 2. Header.  The id is truncated BEFORE escaping: slicing the escaped
        #    text could cut an entity such as "&amp;" in half, which Telegram
        #    rejects as an HTML parse error.
        _header = [
            f"📊 <b>Drift Diff (頁 {_page + 1}/{_total_pages})</b> — <code>{html.escape(report_id[:24])}</code>",
            f"Namespace: <code>{html.escape(_rpt.namespace)}</code>",
            (
                f"🔴 人工高風險 {_bucket_counts['human_high']} | "
                f"🟡 一般修改 {_bucket_counts['routine_medium']} | "
                f"🔧 K8s 自動 {_bucket_counts['k8s_default']}"
            ),
            "━" * 20,
        ]
        _lines = list(_header)
        _MAX_LEN = 3800
        _used_len = sum(len(s) + 1 for s in _header)

        # 3. Items of this page; stop early rather than exceed _MAX_LEN.
        _rendered = 0
        _bucket_emoji = {"human_high": "🔴", "routine_medium": "🟡", "k8s_default": "🔧"}
        for _bk, _item in _slice:
            _emoji = _bucket_emoji[_bk]
            _field = (_item.field_path or "")[:80]
            _git = str(_item.git_value)[:40] if _item.git_value is not None else "(未設)"
            _k8s = str(_item.actual_value)[:40] if _item.actual_value is not None else "(未設)"
            _block = (
                f"{_emoji} <b>{html.escape(_field)}</b>\n"
                f"   Git: <code>{html.escape(_git)}</code>\n"
                f"   K8s: <code>{html.escape(_k8s)}</code>"
            )
            if _used_len + len(_block) + 1 > _MAX_LEN:
                break
            _lines.append(_block)
            _used_len += len(_block) + 1
            _rendered += 1

        _skipped_in_page = len(_slice) - _rendered
        if _skipped_in_page > 0:
            _lines.append(f"… 本頁還有 {_skipped_in_page} 項過長未顯示,請縮小 field 範圍")

        _full = "\n".join(_lines)

        # 4. Paging buttons (payload separates report_id and page with "_").
        _nav = []
        if _page > 0:
            _nav.append({
                "text": "⬅️ 上頁",
                "callback_data": f"drift_view_page:{report_id}_{_page - 1}",
            })
        if _page < _total_pages - 1:
            _nav.append({
                "text": "下頁 ➡️",
                "callback_data": f"drift_view_page:{report_id}_{_page + 1}",
            })

        _payload = {
            "chat_id": self.alert_chat_id,
            "text": _full,
            "parse_mode": "HTML",
            "disable_web_page_preview": True,
        }
        if _nav:
            _payload["reply_markup"] = {"inline_keyboard": [_nav]}
        await self._send_request("sendMessage", _payload)
    except Exception as _e:
        logger.warning("drift_diff_detail_send_failed", report_id=report_id, page=page, error=str(_e))
        await self._send_request("sendMessage", {
            "chat_id": self.alert_chat_id,
            "text": f"⚠️ Drift Diff 查詢失敗: <code>{html.escape(str(_e)[:150])}</code>",
            "parse_mode": "HTML",
        })
|
||
|
||
async def _handle_ai_advisory_action(
    self,
    action: str,
    advisory_payload: str,  # format: '{type}:{id}'
    callback_query_id: str,
    user_id: int,
    username: str,
    user: dict,
    message_id: int | None = None,
) -> dict:
    """
    Handle the interactive buttons of the LLM scanner advisories (ADR-092).

    Args:
        action: ``ai_advisory_handled`` / ``ai_advisory_snooze`` /
            ``ai_advisory_view`` / ``ai_advisory_produce_cmd``.
        advisory_payload: ``'{advisory_type}:{advisory_id}'``; a payload
            without ``":"`` is treated as an id of type ``"unknown"``.
        callback_query_id: Telegram callback query id to answer.
        user_id: Telegram user id (not used in this method body).
        username: Name forwarded to the advisory helper and logs.
        user: User dict echoed back in the result.
        message_id: Id of the message that carried the button, if known.

    Flow:
        1. Parse the payload into advisory type and id.
        2. Call ``ai_advisory_helpers.handle_ai_advisory_callback``.
        3. Answer the callback with the helper's feedback text (toast).
        4. If ``message_id`` is known, also post the feedback text as a
           reply to that message (best effort).

    Returns:
        A dict with ``action``, ``advisory_type``, ``advisory_id``, ``user``,
        ``success`` and ``info_action``; on an unexpected error a dict with
        ``action``, ``user`` and ``success=False``.
    """
    try:
        advisory_type, separator, advisory_id = advisory_payload.partition(":")
        if not separator:
            advisory_type, advisory_id = "unknown", advisory_payload

        # Strip only a leading 'ai_advisory_' so the same text elsewhere in
        # the action name is left untouched.
        pure_action = action.removeprefix("ai_advisory_")

        logger.info(
            "ai_advisory_callback",
            action=pure_action, advisory_type=advisory_type,
            advisory_id=advisory_id, user=username,
        )

        from src.services.ai_advisory_helpers import handle_ai_advisory_callback
        result = await handle_ai_advisory_callback(
            action=pure_action,
            advisory_type=advisory_type,
            advisory_id=advisory_id,
            username=username,
        )

        feedback_text = result.get("feedback_text", "已收到")
        await self._answer_callback(callback_query_id, action, text=feedback_text)

        # The toast disappears after a few seconds, so a reply is also posted.
        if message_id and feedback_text:
            try:
                await self._send_request("sendMessage", {
                    "chat_id": self.alert_chat_id,
                    "text": feedback_text,
                    "reply_to_message_id": message_id,
                })
                logger.info("ai_advisory_group_reply_sent", action=pure_action, message_id=message_id)
            except Exception as _ge:
                logger.warning("ai_advisory_group_reply_failed", action=pure_action, error=str(_ge))

        return {
            "action": action, "advisory_type": advisory_type, "advisory_id": advisory_id,
            "user": user, "success": result.get("success", False),
            "info_action": pure_action in ("view", "produce_cmd"),
        }
    except Exception as _e:
        logger.exception("ai_advisory_callback_error", action=action, error=str(_e))
        try:
            await self._answer_callback(callback_query_id, action, text="⚠️ 處理失敗")
        except Exception as _answer_err:
            # Still best effort, but leave a trace instead of swallowing silently.
            logger.warning(
                "ai_advisory_callback_answer_failed",
                action=action, error=str(_answer_err),
            )
        return {"action": action, "user": user, "success": False}
|
||
|
||
async def _edit_drift_card_outcome(
    self, report_id: str, verb: str, by: str, ok: bool,
) -> None:
    """
    Record the outcome of a drift adopt / revert on the Telegram card.

    Steps (each one is best effort and only logs a warning on failure):
        1. Look up the card's message id in Redis (``tg_drift:{report_id}``)
           and clear its inline keyboard with ``editMessageReplyMarkup``.
        2. Send a stamp message with the verb, the acting user and the
           success / failure marker, as a reply to the card when its message
           id is known.
    """
    icon, outcome_label = ("✅", "成功") if ok else ("❌", "失敗")
    stamp_text = (
        f"{icon} <b>{html.escape(verb)}</b> by @{html.escape(by)} ({outcome_label})\n"
        f"Drift <code>{html.escape(report_id)}</code>"
    )

    card_message_id: int | None = None
    try:
        cached = await get_redis().get(f"tg_drift:{report_id}")
        if cached:
            # Set before the edit call so the reply below can still target
            # the card even if removing the buttons fails.
            card_message_id = int(cached)
            await self._send_request("editMessageReplyMarkup", {
                "chat_id": self.alert_chat_id,
                "message_id": card_message_id,
                "reply_markup": {"inline_keyboard": []},
            })
    except Exception as exc:
        logger.warning("drift_card_buttons_remove_failed", report_id=report_id, error=str(exc))

    try:
        stamp_request: dict = {
            "chat_id": self.alert_chat_id,
            "text": stamp_text,
            "parse_mode": "HTML",
        }
        if card_message_id:
            stamp_request["reply_to_message_id"] = card_message_id
        await self._send_request("sendMessage", stamp_request)
    except Exception as exc:
        logger.warning("drift_outcome_stamp_send_failed", report_id=report_id, error=str(exc))
|
||
|
||
# =========================================================================
|
||
# ADR-075: TYPE-8M Meta-System 告警(飛輪/告警鏈路健康)
|
||
# 2026-04-12 ogt
|
||
# =========================================================================
|
||
|
||
async def send_meta_alert(
    self,
    incident_id: str,
    approval_id: str,
    alertname: str,
    alert_category: str,  # "flywheel_health" or "alertchain_health"
    diagnosis: str,
    severity_level: str = "critical",
    system_impact: str = "",
    probable_cause: str = "",
) -> dict:
    """
    Send a TYPE-8M meta-system alert (flywheel / alert-chain health).

    The card shows the incident id, the failing component and the diagnosis;
    the system-impact and probable-cause sections are added only when given.
    Diagnosis, impact and cause are each passed through ``_smart_truncate``
    with a limit of 320.  A single "silence 1h" button is attached.  The
    message goes to ``self.alert_chat_id``.

    Returns:
        The response of ``_send_request("sendMessage", ...)``.
    """
    if severity_level == "critical":
        severity_emoji = "🔴"
    else:
        severity_emoji = "🟠"
    if alert_category == "flywheel_health":
        category_label = "飛輪核心異常"
    else:
        category_label = "告警鏈路異常"

    sections = [
        f"⚙️ META SYSTEM | {severity_emoji} {category_label}\n",
        "━━━━━━━━━━━━━━━━━━━\n",
        f"📋 <code>{html.escape(incident_id)}</code>\n",
        f"🚨 異常元件:<b>{html.escape(alertname)}</b>\n",
        f"🎯 診斷結果:{html.escape(_smart_truncate(diagnosis, 320))}\n",
    ]
    if system_impact:
        sections.append(
            f"\n🧠 <b>系統影響</b>\n{html.escape(_smart_truncate(system_impact, 320))}\n"
        )
    if probable_cause:
        sections.append(
            f"└─ 可能根因:{html.escape(_smart_truncate(probable_cause, 320))}\n"
        )
    card_text = "".join(sections)

    # Only the silence button is offered; buttons without a handler were removed.
    silence_button = {
        "text": "🔕 靜默 1h",
        "callback_data": self._security.generate_callback_nonce(approval_id, "silence"),
    }
    request = {
        "chat_id": self.alert_chat_id,
        "text": card_text,
        "parse_mode": "HTML",
        "reply_markup": {"inline_keyboard": [[silence_button]]},
    }
    return await self._send_request("sendMessage", request)
|
||
|
||
async def send_secops_card(
    self,
    incident_id: str,
    approval_id: str,
    alertname: str,
    threat_level: str,
    source: str = "",
    threat_behavior: str = "",
    defense_action: str = "",
    resource: str = "",
) -> dict:
    """
    Send a TYPE-5S SecOps threat card (ADR-075).

    The card lists the incident id and threat type, then an AI analysis
    section with the optional source and behaviour lines and the risk level,
    and finally the optional suggested defence action.  ``threat_behavior``
    and ``defense_action`` are cut to 200 characters.  Two buttons are
    attached: isolate resource (``secops_isolate``) and confirm authorised
    operation (``secops_authorize``).  The message goes to
    ``self.alert_chat_id``.

    Returns:
        The response of ``_send_request("sendMessage", ...)``.
    """
    icons_by_level = {"critical": "🔴", "warning": "🟠"}
    level_icon = icons_by_level.get(threat_level.lower(), "⚠️")

    chunks = [
        f"🥷 SECOPS | {level_icon} 資安威脅\n",
        "━━━━━━━━━━━━━━━━━━━\n",
        f"📋 <code>{html.escape(incident_id)}</code>\n",
        f"🚨 威脅類型:<b>{html.escape(alertname)}</b>\n",
    ]
    if resource:
        chunks.append(f"🎯 受害資源:<code>{html.escape(resource)}</code>\n")
    chunks.append("\n🧠 <b>AI 威脅分析</b>\n")
    if source:
        chunks.append(f"├─ 來源:{html.escape(source)}\n")
    if threat_behavior:
        chunks.append(f"├─ 異常行為:{html.escape(threat_behavior[:200])}\n")
    chunks.append(f"└─ 風險評估:<b>{html.escape(threat_level)}</b>\n")
    if defense_action:
        chunks.append(
            f"\n🛡️ <b>建議防禦動作</b>\n<code>{html.escape(defense_action[:200])}</code>\n"
        )
    card_text = "".join(chunks)

    # Only actions that have a handler are offered as buttons.
    make_nonce = self._security.generate_callback_nonce
    isolate_nonce = make_nonce(approval_id, "secops_isolate")
    authorize_nonce = make_nonce(approval_id, "secops_authorize")
    button_row = [
        {"text": "🚫 隔離資源", "callback_data": isolate_nonce},
        {"text": "✅ 確認授權操作", "callback_data": authorize_nonce},
    ]

    request = {
        "chat_id": self.alert_chat_id,
        "text": card_text,
        "parse_mode": "HTML",
        "reply_markup": {"inline_keyboard": [button_row]},
    }
    return await self._send_request("sendMessage", request)
|
||
|
||
async def send_business_alert(
    self,
    incident_id: str,
    alertname: str,
    business_domain: str,
    metric_name: str,
    current_value: str,
    threshold: str,
    loss_rate: str = "",
    group_chat_id: str | None = None,
) -> dict:
    """
    Send a TYPE-6B business / FinOps alert (ADR-075).

    The card shows the alert, the affected business domain, the metric and
    its current value against the threshold; the loss-rate line is added
    only when ``loss_rate`` is given.  No buttons are attached: the message
    is sent with an empty inline keyboard.

    Args:
        group_chat_id: Target chat; falls back to ``self.alert_chat_id``
            when falsy.

    Returns:
        The response of ``_send_request("sendMessage", ...)``.
    """
    lines = [
        "📉 SLO ALERT | 業務指標異常\n",
        "━━━━━━━━━━━━━━━━━━━\n",
        f"📋 <code>{html.escape(incident_id)}</code>\n",
        f"🚨 告警:<b>{html.escape(alertname)}</b>\n",
        f"🎯 影響業務:{html.escape(business_domain)}\n",
        f"📊 異常指標:<code>{html.escape(metric_name)}</code>\n",
        "\n🧠 <b>業務衝擊分析</b>\n",
        f"├─ 當前狀態:{html.escape(current_value)} (閾值: {html.escape(threshold)})\n",
    ]
    if loss_rate:
        lines.append(f"└─ 損失速率:{html.escape(loss_rate)}\n")
    card_text = "".join(lines)

    # Buttons without a handler were removed, leaving an empty keyboard.
    empty_keyboard = {"inline_keyboard": []}

    destination = group_chat_id or self.alert_chat_id
    request = {
        "chat_id": destination,
        "text": card_text,
        "parse_mode": "HTML",
        "reply_markup": empty_keyboard,
    }
    return await self._send_request("sendMessage", request)
|
||
|
||
async def send_escalation_card(
    self,
    incident_id: str,
    original_alertname: str,
    duration_min: int,
    priority: int = 0,
    attempted_actions: str = "",
    failure_reason: str = "",
    current_impact: str = "",
    group_chat_id: str | None = None,
) -> dict:
    """
    Send a TYPE-7E major-incident escalation notice (ADR-075).

    The card shows the priority, how long the incident has lasted (minutes
    below one hour, otherwise hours and minutes) and the original alert
    name.  An "AI summary" section lists attempted actions, failure reason
    and current impact, each cut to 100 characters and shown only when
    given.  When ``settings.SRE_GROUP_CHAT_ID`` is truthy, a call-for-help
    line is appended.  No buttons are attached.

    Args:
        group_chat_id: Target chat; falls back to ``self.alert_chat_id``
            when falsy.

    Returns:
        The response of ``_send_request("sendMessage", ...)``.
    """
    if duration_min < 60:
        duration_str = f"{duration_min} 分鐘"
    else:
        hours, minutes = divmod(duration_min, 60)
        duration_str = f"{hours} 小時 {minutes} 分"

    pieces = [
        f"🚨 ESCALATION | P{priority} 事故升級\n",
        "━━━━━━━━━━━━━━━━━━━\n",
        f"📋 <code>{html.escape(incident_id)}</code> | 已持續 <b>{duration_str}</b>\n",
        "⚠️ <b>超出自動修復能力範圍</b>\n",
        f"🎯 核心問題:<code>{html.escape(original_alertname)}</code>\n",
    ]
    if attempted_actions or failure_reason or current_impact:
        pieces.append("\n🧠 <b>AI 戰局總結</b>\n")
    if attempted_actions:
        pieces.append(f"├─ 嘗試動作:{html.escape(attempted_actions[:100])}\n")
    if failure_reason:
        pieces.append(f"├─ 失敗原因:{html.escape(failure_reason[:100])}\n")
    if current_impact:
        pieces.append(f"└─ 目前影響:{html.escape(current_impact[:100])}\n")
    card_text = "".join(pieces)

    # Buttons without a handler were removed, leaving an empty keyboard.
    empty_keyboard = {"inline_keyboard": []}

    destination = group_chat_id or self.alert_chat_id
    call_for_help = "\n📣 @所有人 事故升級,請協助!" if settings.SRE_GROUP_CHAT_ID else ""
    request = {
        "chat_id": destination,
        "text": card_text + call_for_help,
        "parse_mode": "HTML",
        "reply_markup": empty_keyboard,
    }
    return await self._send_request("sendMessage", request)
|
||
|
||
# =========================================================================
|
||
# 新訊息發送方法 (2026-03-29 ogt: ADR-038)
|
||
# =========================================================================
|
||
|
||
def _build_sentry_keyboard(self, error_id: str) -> dict:
    """
    Build the inline keyboard for a Sentry error message.

    Returns one row with two buttons, "view details" and "silence 1h", whose
    callback data are nonces generated for ``error_id`` with the actions
    ``view`` and ``silence`` (in that order).
    """
    make_nonce = self._security.generate_callback_nonce
    button_specs = (("🔍 查看詳情", "view"), ("🔕 靜默 1h", "silence"))
    row = [
        {"text": label, "callback_data": make_nonce(error_id, action_name)}
        for label, action_name in button_specs
    ]
    return {"inline_keyboard": [row]}
|
||
|
||
def _build_resource_keyboard(self, resource_id: str) -> dict:
    """
    Build the inline keyboard for a resource warning message.

    Returns one row with two buttons, "auto scale" and "silence 1h", whose
    callback data are nonces generated for ``resource_id`` with the actions
    ``scale`` and ``silence`` (in that order).
    """
    make_nonce = self._security.generate_callback_nonce
    button_specs = (("⚡ 自動擴展", "scale"), ("🔕 靜默 1h", "silence"))
    row = [
        {"text": label, "callback_data": make_nonce(resource_id, action_name)}
        for label, action_name in button_specs
    ]
    return {"inline_keyboard": [row]}
|
||
|
||
async def send_sentry_error(
    self,
    error_id: str,
    error_type: str,
    error_message: str,
    service_name: str,
    file_location: str,
    occurrence_count: int = 1,
    affected_users: int = 0,
    first_seen: str = "",
    stack_trace: list[str] | None = None,
    sentry_url: str = "",
) -> dict:
    """
    Send a Sentry error notification.

    Formats a ``SentryErrorMessage`` and sends it as HTML to
    ``self.alert_chat_id`` with the keyboard from ``_build_sentry_keyboard``
    and link previews disabled.

    Args:
        error_id: Sentry issue id (also used for the button nonces).
        error_type: Error type, e.g. ``TypeError``.
        error_message: Error message text.
        service_name: Name of the affected service.
        file_location: File location of the error.
        occurrence_count: Number of occurrences.
        affected_users: Number of affected users.
        first_seen: When the error was first seen.
        stack_trace: Optional stack trace lines.
        sentry_url: Link to the Sentry issue.

    Returns:
        dict: The Telegram API response from ``_send_request``.
    """
    card = SentryErrorMessage(
        error_id=error_id,
        error_type=error_type,
        error_message=error_message,
        service_name=service_name,
        file_location=file_location,
        occurrence_count=occurrence_count,
        affected_users=affected_users,
        first_seen=first_seen,
        stack_trace=stack_trace,
        sentry_url=sentry_url,
    )

    # The request is assembled before the "sending" log line is emitted.
    request = dict(
        chat_id=self.alert_chat_id,
        text=card.format(),
        parse_mode="HTML",
        reply_markup=self._build_sentry_keyboard(error_id),
        disable_web_page_preview=True,
    )

    logger.info("telegram_sentry_error_sending", error_id=error_id, service=service_name)
    response = await self._send_request("sendMessage", request)
    logger.info("telegram_sentry_error_sent", error_id=error_id)
    return response
|
||
|
||
async def send_resource_warning(
    self,
    resource_id: str,
    pod_name: str,
    namespace: str = "default",
    cpu_percent: float = 0.0,
    cpu_limit: str = "",
    memory_percent: float = 0.0,
    memory_limit: str = "",
    disk_percent: float = 0.0,
    trend_info: str = "",
    suggestion: str = "",
) -> dict:
    """
    Send a resource usage warning.

    Formats a ``ResourceWarnMessage`` and sends it as HTML to
    ``self.alert_chat_id`` with the keyboard from
    ``_build_resource_keyboard`` and link previews disabled.

    Args:
        resource_id: Resource id (also used for the button nonces).
        pod_name: Pod name.
        namespace: Kubernetes namespace.
        cpu_percent: CPU usage.
        cpu_limit: CPU limit, passed through to the message.
        memory_percent: Memory usage.
        memory_limit: Memory limit, passed through to the message.
        disk_percent: Disk usage.
        trend_info: Trend information text.
        suggestion: Suggested action text.

    Returns:
        dict: The Telegram API response from ``_send_request``.
    """
    card = ResourceWarnMessage(
        resource_id=resource_id,
        pod_name=pod_name,
        namespace=namespace,
        cpu_percent=cpu_percent,
        cpu_limit=cpu_limit,
        memory_percent=memory_percent,
        memory_limit=memory_limit,
        disk_percent=disk_percent,
        trend_info=trend_info,
        suggestion=suggestion,
    )

    # The request is assembled before the "sending" log line is emitted.
    request = dict(
        chat_id=self.alert_chat_id,
        text=card.format(),
        parse_mode="HTML",
        reply_markup=self._build_resource_keyboard(resource_id),
        disable_web_page_preview=True,
    )

    logger.info("telegram_resource_warning_sending", resource_id=resource_id, pod=pod_name)
    response = await self._send_request("sendMessage", request)
    logger.info("telegram_resource_warning_sent", resource_id=resource_id)
    return response
|
||
|
||
async def send_repair_report(
    self,
    report_date: str,
    total_repairs: int = 0,
    success_count: int = 0,
    failure_count: int = 0,
    saved_minutes: int = 0,
    top_issues: list[tuple[str, int]] | None = None,
    ai_cost_gemini: float = 0.0,
    ai_cost_nvidia: float = 0.0,
    ai_tokens_total: int = 0,
) -> dict:
    """
    Send the auto-repair report.

    Formats a ``RepairReportMessage`` and sends it as HTML to
    ``self.alert_chat_id`` without buttons and with link previews disabled.

    Args:
        report_date: Report date.
        total_repairs: Total number of repairs.
        success_count: Number of successful repairs.
        failure_count: Number of failed repairs.
        saved_minutes: Manual time saved.
        top_issues: Optional list of top issues.
        ai_cost_gemini: Gemini cost.
        ai_cost_nvidia: NVIDIA cost.
        ai_tokens_total: Total token count.

    Returns:
        dict: The Telegram API response from ``_send_request``.
    """
    report = RepairReportMessage(
        report_date=report_date,
        total_repairs=total_repairs,
        success_count=success_count,
        failure_count=failure_count,
        saved_minutes=saved_minutes,
        top_issues=top_issues,
        ai_cost_gemini=ai_cost_gemini,
        ai_cost_nvidia=ai_cost_nvidia,
        ai_tokens_total=ai_tokens_total,
    )

    # The request is assembled before the "sending" log line is emitted.
    request = dict(
        chat_id=self.alert_chat_id,
        text=report.format(),
        parse_mode="HTML",
        disable_web_page_preview=True,
    )

    logger.info("telegram_repair_report_sending", date=report_date)
    response = await self._send_request("sendMessage", request)
    logger.info("telegram_repair_report_sent", date=report_date)
    return response
|
||
|
||
async def send_daily_summary(
    self,
    summary_date: str,
    alert_total: int = 0,
    alert_critical: int = 0,
    alert_medium: int = 0,
    alert_low: int = 0,
    auto_repair_count: int = 0,
    manual_approval_count: int = 0,
    ignored_count: int = 0,
    avg_response_minutes: float = 0.0,
    api_availability: float = 99.9,
    web_availability: float = 99.9,
    worker_availability: float = 99.9,
    ai_cost: float = 0.0,
    cloud_cost: float = 0.0,
    budget_remaining: float = 0.0,
) -> dict:
    """
    Send the daily summary to the alert chat.

    Added 2026-03-29 (ogt).

    Every argument is passed through unchanged to ``DailySummaryMessage``,
    which renders the HTML text; this method only wraps the rendered text
    in a ``sendMessage`` request.

    Returns:
        dict: Telegram API response.
    """
    summary_fields = {
        "summary_date": summary_date,
        "alert_total": alert_total,
        "alert_critical": alert_critical,
        "alert_medium": alert_medium,
        "alert_low": alert_low,
        "auto_repair_count": auto_repair_count,
        "manual_approval_count": manual_approval_count,
        "ignored_count": ignored_count,
        "avg_response_minutes": avg_response_minutes,
        "api_availability": api_availability,
        "web_availability": web_availability,
        "worker_availability": worker_availability,
        "ai_cost": ai_cost,
        "cloud_cost": cloud_cost,
        "budget_remaining": budget_remaining,
    }
    summary = DailySummaryMessage(**summary_fields)

    request_body = {
        "chat_id": self.alert_chat_id,
        "text": summary.format(),
        "parse_mode": "HTML",
        "disable_web_page_preview": True,
    }

    logger.info("telegram_daily_summary_sending", date=summary_date)
    api_response = await self._send_request("sendMessage", request_body)
    logger.info("telegram_daily_summary_sent", date=summary_date)

    return api_response
|
||
|
||
async def send_cicd_progress(
    self,
    job_name: str,
    status: str,
    stage: str = "",
    commit_sha: str = "",
    triggered_by: str = "",
    duration_seconds: int = 0,
    message: str = "",
    workflow_url: str = "",
    max_retries: int = 3,
) -> dict:
    """
    Send a CI/CD progress notification (compact form, bypasses AI arbitration).

    2026-03-30 ogt: added so CI/CD alerts are no longer handled as incidents.
    2026-03-30 P1: added retry with exponential backoff.

    Args:
        job_name: CI/CD job name shown in the message.
        status: Job status shown in the message.
        stage: Pipeline stage (optional).
        commit_sha: Commit SHA (optional).
        triggered_by: Who triggered the run (optional).
        duration_seconds: Run duration in seconds.
        message: Free-form extra text.
        workflow_url: Link to the workflow run.
        max_retries: Maximum number of send attempts (default 3). Values
            below 1 are treated as 1 so the message is always attempted once.

    Returns:
        dict: Telegram API response.

    Raises:
        TelegramGatewayError: The last gateway error, once all attempts failed.
    """
    # Always attempt at least once. With max_retries <= 0 the loop below would
    # not run and the failure path would end in `raise None` (a TypeError).
    attempts = max(1, max_retries)

    # OTEL span: telegram.send_cicd_progress
    with _tracer.start_as_current_span(
        "telegram.send_cicd_progress",
        attributes={
            "telegram.job_name": job_name,
            "telegram.status": status,
            "telegram.stage": stage,
            "telegram.max_retries": max_retries,
        },
    ) as span:
        msg = CICDProgressMessage(
            job_name=job_name,
            status=status,
            stage=stage,
            commit_sha=commit_sha,
            triggered_by=triggered_by,
            duration_seconds=duration_seconds,
            message=message,
            workflow_url=workflow_url,
        )

        payload = {
            "chat_id": self.alert_chat_id,
            "text": msg.format(),
            "parse_mode": "HTML",
            "disable_web_page_preview": True,
        }

        logger.info("telegram_cicd_progress_sending", job=job_name, status=status)

        # Retry loop with exponential backoff.
        last_error: TelegramGatewayError | None = None
        for attempt in range(attempts):
            try:
                result = await self._send_request("sendMessage", payload)
            except TelegramGatewayError as e:
                last_error = e
                # Sleep only between attempts, never after the final one.
                if attempt < attempts - 1:
                    delay = 2 ** attempt  # 1, 2, 4 ... seconds
                    logger.warning(
                        "telegram_cicd_progress_retry",
                        job=job_name,
                        attempt=attempt + 1,
                        delay=delay,
                        error=str(e),
                    )
                    await asyncio.sleep(delay)
            else:
                span.set_attribute("telegram.attempts", attempt + 1)
                span.set_status(trace.Status(trace.StatusCode.OK))
                logger.info(
                    "telegram_cicd_progress_sent",
                    job=job_name,
                    status=status,
                    attempt=attempt + 1,
                )
                return result

        # Every attempt failed; last_error is set because attempts >= 1.
        span.set_attribute("telegram.attempts", attempts)
        span.set_status(trace.Status(trace.StatusCode.ERROR))
        span.record_exception(last_error)
        logger.error(
            "telegram_cicd_progress_failed",
            job=job_name,
            status=status,
            max_retries=max_retries,
            error=str(last_error),
        )
        raise last_error
|
||
|
||
async def send_deploy_success(
    self,
    commit_sha: str,
    triggered_by: str,
    environment: str = "Production",
    api_version: str = "",
    web_version: str = "",
    worker_version: str = "",
    duration_seconds: int = 0,
    e2e_passed: int = 0,
    e2e_total: int = 0,
    health_check_passed: bool = True,
    workflow_url: str = "",
) -> dict:
    """
    Send a deployment-success notification to the alert chat.

    Added 2026-03-29 (ogt).

    Every argument is passed through unchanged to ``DeploySuccessMessage``;
    only the first 8 characters of ``commit_sha`` are written to the logs.

    Returns:
        dict: Telegram API response.
    """
    deploy_fields = {
        "commit_sha": commit_sha,
        "triggered_by": triggered_by,
        "environment": environment,
        "api_version": api_version,
        "web_version": web_version,
        "worker_version": worker_version,
        "duration_seconds": duration_seconds,
        "e2e_passed": e2e_passed,
        "e2e_total": e2e_total,
        "health_check_passed": health_check_passed,
        "workflow_url": workflow_url,
    }
    notice = DeploySuccessMessage(**deploy_fields)

    request_body = {
        "chat_id": self.alert_chat_id,
        "text": notice.format(),
        "parse_mode": "HTML",
        "disable_web_page_preview": True,
    }

    # Abbreviated SHA keeps the log lines short.
    short_sha = commit_sha[:8]
    logger.info("telegram_deploy_success_sending", commit=short_sha)
    api_response = await self._send_request("sendMessage", request_body)
    logger.info("telegram_deploy_success_sent", commit=short_sha)

    return api_response
|
||
|
||
async def send_rate_limit_warning(
    self,
    provider: str,
    daily_usage: int = 0,
    daily_limit: int = 0,
    token_usage: int = 0,
    token_limit: int = 0,
    cost_usd: float = 0.0,
    suggestions: list[str] | None = None,
    reset_time: str = "",
) -> dict:
    """
    Send an API rate-limit / quota warning to the alert chat.

    Added 2026-03-29 (ogt).

    Every argument is passed through unchanged to ``RateLimitMessage``,
    which renders the HTML text.

    Returns:
        dict: Telegram API response.
    """
    warning_fields = {
        "provider": provider,
        "daily_usage": daily_usage,
        "daily_limit": daily_limit,
        "token_usage": token_usage,
        "token_limit": token_limit,
        "cost_usd": cost_usd,
        "suggestions": suggestions,
        "reset_time": reset_time,
    }
    warning = RateLimitMessage(**warning_fields)

    request_body = {
        "chat_id": self.alert_chat_id,
        "text": warning.format(),
        "parse_mode": "HTML",
        "disable_web_page_preview": True,
    }

    logger.info("telegram_rate_limit_warning_sending", provider=provider)
    api_response = await self._send_request("sendMessage", request_body)
    logger.info("telegram_rate_limit_warning_sent", provider=provider)

    return api_response
|
||
|
||
async def handle_callback(
    self,
    callback_query_id: str,
    callback_data: str,
    user_id: int,
    message_id: int,
    original_text: str = "",
    username: str = "",
) -> dict:
    """
    Handle an approval / tuning callback from an inline-keyboard button.

    Routing order (the first match wins):
      0.   ``la:{short_id}`` LLM action buttons
      1.   parse callback data
      1.5  read-only info actions (whitelist check only, no nonce)
      --   nonce + user verification
      1.8  incident state-machine guard
      1.85 drift_* buttons
      --   ai_advisory_* buttons
      1.9  category write actions registered in the callback action spec
      2    tune (Shadow Mode)
      2.5  snooze / silence
      2.8  manual-fix recording prompt
      3-4  plain approve / reject: ack the callback and stamp the card

    Args:
        callback_query_id: Telegram callback query ID.
        callback_data: Callback data (contains the nonce).
        user_id: Telegram user ID.
        message_id: ID of the original message.
        original_text: Original card text (used to keep the context).
        username: Username of the approver.

    Returns:
        dict: Processing result {action, approval_id, user, auto_tuning_result?}.
        On a handled error: {"success": False, "error": <message>}.
    """
    try:
        # ===================================================================
        # Step 0: LLM action callback (H1/B4) - "la:{short_id}" is routed first.
        # 2026-04-27 Claude Sonnet 4.6: H1+B4 fix for "ghost buttons".
        # Must be intercepted before parse_callback_data, which would otherwise
        # blow up when splitting this format on ":".
        # ===================================================================
        if callback_data.startswith("la:"):
            return await self._handle_llm_action_callback(
                callback_query_id=callback_query_id,
                callback_data=callback_data,
                user_id=user_id,
                username=username,
            )

        # ===================================================================
        # Step 1: parse the callback data (two formats are supported)
        # ===================================================================
        parsed = self._security.parse_callback_data(callback_data)
        action = parsed["action"]
        approval_id = parsed["approval_id"]

        # ===================================================================
        # Step 1.5: ADR-050 info actions (read-only, whitelist check only)
        # 2026-04-01 Claude Code (ADR-050 P1): detail/reanalyze/history
        # ===================================================================
        if parsed.get("is_info_action"):
            if not self._security.is_whitelisted(user_id):
                raise UserNotWhitelistedError(f"User {user_id} not in whitelist")

            incident_id = parsed.get("incident_id", approval_id)

            if action == "detail":
                # ADR-050 P2: fetch incident details and send them as a NEW
                # message so the original approval card keeps its buttons.
                await self._answer_callback_nonfatal(
                    callback_query_id, action, text="📋 詳情傳送中..."
                )
                await self._send_incident_detail(incident_id)
            elif action == "history":
                # ADR-050 P2: frequency statistics.
                await self._answer_callback_nonfatal(
                    callback_query_id, action, text="📊 歷史統計傳送中..."
                )
                await self._send_incident_history(incident_id)
            elif action == "reanalyze":
                # ADR-050 P2: trigger a re-diagnosis.
                await self._answer_callback_nonfatal(
                    callback_query_id, action, text="🔄 重診排程中..."
                )
                await self._send_reanalyze_result(incident_id)
            elif action == "drift_view_page":
                # 2026-04-20 P0.1 ogt + Claude Opus 4.7: drift_view pagination.
                # incident_id format: {report_id}_{page} (underscore separated).
                report_id, _, page_token = incident_id.rpartition("_")
                try:
                    page_index = int(page_token)
                except ValueError:
                    # No numeric suffix: treat the whole value as the report id.
                    report_id, page_index = incident_id, 0
                await self._answer_callback_nonfatal(
                    callback_query_id,
                    action,
                    text=f"📄 切換至第 {page_index + 1} 頁...",
                )
                await self._send_drift_diff_detail(
                    report_id or incident_id, page=page_index
                )
            else:
                # 2026-04-14 Claude Sonnet 4.6 (Phase 5 Sprint 5.1):
                # unknown action -> fallback dispatcher (checks whether it is
                # registered in callback_action_spec.yaml).
                await self._dispatch_category_action(
                    callback_query_id=callback_query_id,
                    action=action,
                    incident_id=incident_id,
                    user_id=user_id,
                )

            return {
                "action": action,
                "approval_id": approval_id,
                "user": {"id": user_id, "username": username},
                "success": True,
                "info_action": True,
            }

        nonce = parsed["nonce"]  # 4-part nonce action

        # Verify the user and the nonce. Write-type actions are always
        # verified before any dispatcher below runs.
        user = await self._security.verify_callback(
            user_id=user_id,
            callback_id=callback_query_id,
            nonce=nonce,
        )

        # ===================================================================
        # Step 1.8: ADR-071-D state-machine guard
        # 2026-04-11 Claude Sonnet 4.6 (ADR-071, first batch, top priority)
        # Stops cards of RESOLVED/CLOSED incidents from being executed again
        # and stops MITIGATING incidents from being triggered twice.
        # ===================================================================
        guard_outcome = await self._check_incident_state_guard(
            approval_id=approval_id,
            callback_query_id=callback_query_id,
            message_id=message_id,
            original_text=original_text,
        )
        if guard_outcome is not None:
            return guard_outcome

        # ===================================================================
        # Step 1.85: 2026-04-19 ogt + Claude Opus 4.7 - drift_* buttons.
        # Fixes Telegram bug TG-2: drift_view/drift_adopt/drift_revert had no
        # handler, so a click either spun forever or fell through to approve.
        # ===================================================================
        if action in ("drift_view", "drift_adopt", "drift_revert"):
            return await self._handle_drift_action(
                action=action,
                approval_id=approval_id,  # this is the report_id itself
                callback_query_id=callback_query_id,
                user_id=user_id,
                username=username,
                user=user,
            )

        # ===================================================================
        # 2026-04-19 P0 fix (ADR-092): ai_advisory_* button routing.
        # Interactive buttons of the 4 LLM scanners
        # (capacity/compliance/coverage/rule_quality).
        # callback_data: 'ai_advisory_{handled|snooze|view|produce_cmd}:{type}:{id}'
        # After nonce parsing, action = 'ai_advisory_handled' etc. and
        # approval_id carries the embedded type + id.
        # ===================================================================
        if action.startswith("ai_advisory_"):
            return await self._handle_ai_advisory_action(
                action=action,
                advisory_payload=approval_id,  # format: '{type}:{id}'
                callback_query_id=callback_query_id,
                user_id=user_id,
                username=username,
                user=user,
                message_id=message_id,
            )

        # ===================================================================
        # Step 1.9: Phase 5 Sprint 5.3 - category write-action routing.
        # 2026-04-14 Claude Sonnet 4.6
        # An action found in the callback_action_spec registry that is not one
        # of the built-in ones goes through the dispatcher (MCP + audit log).
        # ===================================================================
        from src.services.callback_dispatcher import get_action_spec as _get_spec

        category_spec = _get_spec(action)
        builtin_actions = ("approve", "reject", "silence", "tune", "log_manual_fix")
        if category_spec and action not in builtin_actions:
            # Multi-sig guard (Sprint 5.4, secops category).
            if category_spec.requires_multi_sig:
                # Count the signatures already recorded on the approval.
                try:
                    from src.services.approval_db import get_approval_service as _svc
                    from uuid import UUID as _UUID

                    existing = await _svc().get_approval(_UUID(approval_id))
                    if existing and existing.signatures:
                        signature_count = len(existing.signatures)
                    else:
                        signature_count = 0
                except Exception:
                    # Any lookup failure counts as "no signatures" (fail closed).
                    signature_count = 0
                if signature_count < 2:
                    await self._answer_callback(
                        callback_query_id,
                        action,
                        text=f"⚠️ 需 2 人簽核 ({signature_count}/2)",
                    )
                    logger.info(
                        "category_action_multi_sig_pending",
                        action=action,
                        approval_id=approval_id,
                        current_sigs=signature_count,
                    )
                    return {
                        "action": action,
                        "approval_id": approval_id,
                        "user": user,
                        "success": False,
                        "reason": "multi_sig_pending",
                    }

            # Audit log: start of a write-type action.
            logger.info(
                "category_write_action_audit_start",
                action=action,
                approval_id=approval_id,
                user_id=user_id,
                username=username,
                risk=category_spec.risk,
                provider=category_spec.mcp_provider,
                tool=category_spec.mcp_tool,
            )

            # Ack Telegram.
            await self._answer_callback(
                callback_query_id,
                action,
                text=f"{category_spec.emoji} {category_spec.label} 執行中...",
            )

            # Resolve incident_id + labels for the template (best effort).
            resolved_incident_id = approval_id  # fallback
            labels: dict = {}
            try:
                from src.repositories.incident_repository import get_incident_repository

                incident_repo = get_incident_repository()
                # approval_id may be INC-xxx or a UUID; try the INC form first.
                if approval_id.startswith("INC-"):
                    incident = await incident_repo.get_by_id(approval_id)
                else:
                    # UUID -> approval -> incident_id
                    from src.services.approval_db import get_approval_service
                    from uuid import UUID

                    approval_row = await get_approval_service().get_approval(
                        UUID(approval_id)
                    )
                    linked_id = (
                        getattr(approval_row, "incident_id", None)
                        if approval_row
                        else None
                    )
                    incident = (
                        await incident_repo.get_by_id(linked_id) if linked_id else None
                    )
                if incident:
                    resolved_incident_id = incident.incident_id
                    if incident.signals:
                        labels = incident.signals[0].labels or {}
            except Exception as lookup_err:
                logger.debug(
                    "category_action_labels_lookup_failed", error=str(lookup_err)
                )

            # Dispatch.
            from src.services.callback_dispatcher import dispatch_action as _dispatch

            dispatch_result = await _dispatch(
                action_name=action,
                incident_id=resolved_incident_id,
                user_id=user_id,
                labels=labels,
            )

            # Reply with the result, threaded under the original alert card
            # when its message id is still cached in Redis.
            try:
                from src.core.redis_client import get_redis as _gr

                cached_msg_id = await _gr().get(f"tg_msg:{resolved_incident_id}")
                card_message_id = int(cached_msg_id) if cached_msg_id else None
            except Exception:
                card_message_id = None
            try:
                reply_body = {
                    "chat_id": self.alert_chat_id,
                    "text": dispatch_result.result_text,
                    "parse_mode": "HTML",
                }
                if card_message_id:
                    reply_body["reply_to_message_id"] = card_message_id
                await self._send_request("sendMessage", reply_body)
            except Exception as reply_err:
                logger.warning(
                    "category_action_reply_send_failed", error=str(reply_err)
                )

            # Audit log: completion.
            logger.info(
                "category_write_action_audit_complete",
                action=action,
                approval_id=approval_id,
                user_id=user_id,
                success=dispatch_result.success,
                error=dispatch_result.error,
                duration_ms=round(dispatch_result.duration_ms, 1),
            )

            return {
                "action": action,
                "approval_id": approval_id,
                "user": user,
                "success": dispatch_result.success,
                "category_action": True,
            }

        # ===================================================================
        # Step 2: auto-tuning (Shadow Mode)
        # ===================================================================
        auto_tuning_result = None
        if action == "tune":
            auto_tuning_result = await self._handle_auto_tuning(
                approval_id=approval_id,
                user_id=user_id,
                username=username,
            )
            # Answer the callback query.
            await self._answer_callback(
                callback_query_id,
                "tune",
                text="⚡ 調優指令已記錄 (Shadow Mode)",
            )
            # Update the message.
            await self._update_message_after_action(
                message_id=message_id,
                action="tune",
                username=username,
                original_text=original_text,
                extra_info=auto_tuning_result.get("command", ""),
            )

            return {
                "action": action,
                "approval_id": approval_id,
                "user": user,
                "success": True,
                "auto_tuning_result": auto_tuning_result,
            }

        # ===================================================================
        # Step 2.5: snooze / silence (2026-03-27 P1 optimisation)
        # ===================================================================
        if action == "snooze":
            snooze_result = await self._handle_snooze(
                approval_id=approval_id,
                username=username,
            )
            await self._answer_callback(
                callback_query_id,
                "snooze",
                text="⏰ 30 分鐘後再提醒",
            )
            await self._update_message_after_action(
                message_id=message_id,
                action="snooze",
                username=username,
                original_text=original_text,
            )
            return {
                "action": action,
                "approval_id": approval_id,
                "user": user,
                "success": True,
                "snooze_result": snooze_result,
            }

        if action == "silence":
            silence_result = await self._handle_silence(
                approval_id=approval_id,
                username=username,
                original_text=original_text,
            )
            await self._answer_callback(
                callback_query_id,
                "silence",
                text="🔕 此類告警靜默 1 小時",
            )
            await self._update_message_after_action(
                message_id=message_id,
                action="silence",
                username=username,
                original_text=original_text,
                extra_info=silence_result.get("resource_name", ""),
            )
            return {
                "action": action,
                "approval_id": approval_id,
                "user": user,
                "success": True,
                "silence_result": silence_result,
            }

        # ===================================================================
        # Step 2.8: ADR-071-H manual-fix recording (TYPE-4)
        # 2026-04-11 Claude Sonnet 4.6 (ADR-071, first batch)
        # The user clicked [record manual fix]; the bot asks for the steps.
        # The steps themselves are collected by the /done flow in
        # handle_message().
        # ===================================================================
        if action == "log_manual_fix":
            await self._answer_callback(
                callback_query_id,
                "log_manual_fix",
                text="📝 請輸入修復步驟,完成後傳送 /done",
            )
            # Remember in Redis that this user owes us manual-fix input.
            try:
                pending_store = get_redis()
                await pending_store.setex(
                    f"manual_fix_pending:{user_id}",
                    1800,  # 30 minutes
                    approval_id,
                )
            except Exception as store_err:
                logger.warning("manual_fix_pending_store_failed", error=str(store_err))

            prompt_text = (
                "📝 <b>手動修復記錄</b>\n"
                "━━━━━━━━━━━━━━━━━━━\n"
                "請輸入您的修復步驟(可多行)。\n"
                "輸入完畢後傳送 <code>/done</code>\n\n"
                "<i>30 分鐘內有效</i>"
            )
            await self._send_request(
                "sendMessage",
                {
                    "chat_id": self.alert_chat_id,
                    "text": prompt_text,
                    "parse_mode": "HTML",
                },
            )
            return {
                "action": action,
                "approval_id": approval_id,
                "user": user,
                "success": True,
                "waiting_for_manual_fix": True,
            }

        # ===================================================================
        # Step 3: answer the callback query (approve / reject)
        # ===================================================================
        await self._answer_callback(callback_query_id, action)

        # ===================================================================
        # Step 4: update the message (original content + approval stamp)
        # ===================================================================
        await self._update_message_after_action(
            message_id=message_id,
            action=action,
            username=username,
            original_text=original_text,
        )

        logger.info(
            "telegram_callback_processed",
            action=action,
            approval_id=approval_id,
            user_id=user_id,
        )

        return {
            "action": action,
            "approval_id": approval_id,
            "user": user,
            "success": True,
        }

    except UserNotWhitelistedError as denied:
        logger.warning("telegram_callback_denied", error=str(denied), user_id=user_id)
        await self._answer_callback_nonfatal(
            callback_query_id,
            "denied",
            text="⛔ 您沒有簽核權限",
        )
        return {"success": False, "error": str(denied)}

    except NonceReplayError as replayed:
        logger.warning("telegram_callback_replay", error=str(replayed))
        await self._answer_callback_nonfatal(
            callback_query_id,
            "replay",
            text="⚠️ 此操作已處理過",
        )
        return {"success": False, "error": str(replayed)}

    except Exception as failure:
        logger.error("telegram_callback_error", error=str(failure))
        await self._answer_callback_nonfatal(
            callback_query_id,
            "error",
            text="❌ 處理失敗",
        )
        return {"success": False, "error": str(failure)}
|
||
|
||
async def _check_incident_state_guard(
    self,
    approval_id: str,
    callback_query_id: str,
    message_id: int,
    original_text: str,
) -> dict | None:
    """
    ADR-071-D state-machine guard.

    Looks up the incident linked to ``approval_id`` and checks its current
    status:
      - RESOLVED / CLOSED -> refuse: answer the callback, append a resolved
        stamp to the card and remove its buttons.
      - MITIGATING        -> refuse a duplicate trigger with a "repair in
        progress" hint.
      - anything else, or nothing found -> return None so the main flow goes on.

    The callback is acknowledged through the non-fatal helper: once the
    incident is known to be in a blocking state, a failed acknowledgement must
    not fall into the generic error path below, which returns None and would
    let the blocked action run.

    2026-04-11 Claude Sonnet 4.6 (ADR-071-D)

    Args:
        approval_id: Approval ID from the callback; expected to be a UUID string.
        callback_query_id: Telegram callback query ID to acknowledge.
        message_id: ID of the card message to edit when blocking.
        original_text: Original card text, kept above the stamp.

    Returns:
        dict | None: ``{"blocked": True, "reason": ..., "approval_id": ...}``
        when the action is refused, otherwise None.
    """
    try:
        from uuid import UUID
        from src.services.approval_db import get_approval_service
        from src.repositories.incident_repository import get_incident_repository
        from src.models.incident import IncidentStatus

        approval_svc = get_approval_service()
        try:
            # NOTE(review): handle_callback calls `get_approval(...)` on this
            # same service. If `get_approval_by_id` does not exist, the
            # AttributeError is swallowed here and the guard is silently
            # disabled. Confirm the method name.
            approval = await approval_svc.get_approval_by_id(UUID(approval_id))
        except Exception as lookup_err:
            # Malformed approval_id (e.g. not a UUID) or lookup failure:
            # leave it to the main flow, but make the skip visible in logs.
            logger.debug(
                "state_guard_approval_lookup_skipped",
                approval_id=approval_id,
                error=str(lookup_err),
            )
            return None

        if not approval or not approval.incident_id:
            return None  # no linked incident: let it through

        incident_repo = get_incident_repository()
        incident = await incident_repo.get_by_id(approval.incident_id)
        if not incident:
            return None

        if incident.status in (IncidentStatus.RESOLVED, IncidentStatus.CLOSED):
            await self._answer_callback_nonfatal(
                callback_query_id,
                "blocked",
                text="✅ 此事件已解決",
            )
            try:
                separator = "──────────────"
                safe_original = html.escape(original_text) if original_text else ""
                stamp = _format_resolved_guard_stamp(incident.resolved_at)
                await self._send_request("editMessageText", {
                    "chat_id": self.alert_chat_id,
                    "message_id": message_id,
                    "text": f"{safe_original}\n{separator}\n{stamp}" if safe_original else stamp,
                    "parse_mode": "HTML",
                    "reply_markup": {"inline_keyboard": []},
                    "disable_web_page_preview": True,
                })
            except Exception as edit_err:
                logger.debug(
                    "state_guard_card_edit_failed",
                    approval_id=approval_id,
                    error=str(edit_err),
                )
                # Fallback: at least strip the buttons from the card.
                try:
                    await self._send_request("editMessageReplyMarkup", {
                        "chat_id": self.alert_chat_id,
                        "message_id": message_id,
                        "reply_markup": {"inline_keyboard": []},
                    })
                except Exception as markup_err:
                    # Best effort only; the action is blocked regardless.
                    logger.debug(
                        "state_guard_markup_clear_failed",
                        approval_id=approval_id,
                        error=str(markup_err),
                    )
            logger.info(
                "state_guard_blocked_resolved",
                approval_id=approval_id,
                incident_id=approval.incident_id,
                incident_status=incident.status.value,
            )
            return {"blocked": True, "reason": "already_resolved", "approval_id": approval_id}

        if incident.status == IncidentStatus.MITIGATING:
            await self._answer_callback_nonfatal(
                callback_query_id,
                "blocked",
                text="⏳ 正在修復中,請稍候...",
            )
            logger.info(
                "state_guard_blocked_mitigating",
                approval_id=approval_id,
                incident_id=approval.incident_id,
            )
            return {"blocked": True, "reason": "already_executing", "approval_id": approval_id}

    except Exception as e:
        # A broken guard must not block the main flow.
        logger.warning("state_guard_error", approval_id=approval_id, error=str(e))

    return None
|
||
|
||
async def handle_manual_fix_done(
    self,
    user_id: int,
    username: str,
    fix_steps: str,
) -> dict:
    """
    ADR-071-H: record the manual-fix steps a user submitted with /done.

    Flow:
      1. Read the pending approval_id from Redis (and clear it).
      2. Load the ApprovalRecord to obtain its incident_id.
      3. Update incidents.manual_fix_steps + manual_fix_by.
      4. Append MANUAL_FIX_RECORDED to the alert operation log.
      5. Run KMConversionService.convert().
      6. Send a confirmation message to Telegram.

    Args:
        user_id: Telegram user ID.
        username: Telegram username.
        fix_steps: The repair steps typed by the user.

    Returns:
        dict: ``{"success": True, "incident_id": ...}`` on success, otherwise
        ``{"success": False, "reason": ...}`` for an expected miss or
        ``{"success": False, "error": ...}`` for an unexpected failure.
    """
    try:
        from src.core.redis_client import get_redis as _get_redis

        redis_client = _get_redis()

        pending_key = f"manual_fix_pending:{user_id}"
        raw_pending = await redis_client.get(pending_key)
        if not raw_pending:
            await self._send_request("sendMessage", {
                "chat_id": self.alert_chat_id,
                "text": "⚠️ 找不到待記錄的修復任務,或已逾時。",
                "parse_mode": "HTML",
            })
            return {"success": False, "reason": "no_pending_task"}

        # Redis may hand back bytes or an already-decoded value.
        if isinstance(raw_pending, bytes):
            approval_id = raw_pending.decode()
        else:
            approval_id = str(raw_pending)
        await redis_client.delete(pending_key)

        # ApprovalRecord -> incident
        from src.repositories.incident_repository import IncidentDBRepository
        from src.repositories.approval_repository import ApprovalDBRepository

        approval = await ApprovalDBRepository().get_by_approval_id(approval_id)
        if not approval:
            await self._send_request("sendMessage", {
                "chat_id": self.alert_chat_id,
                "text": f"⚠️ 找不到簽核單 <code>{html.escape(approval_id)}</code>",
                "parse_mode": "HTML",
            })
            return {"success": False, "reason": "approval_not_found"}

        incident_repo = IncidentDBRepository()
        incident_id = approval.incident_id
        if not await incident_repo.get_by_id(incident_id):
            return {"success": False, "reason": "incident_not_found"}

        # The recorder falls back to the numeric id when there is no username.
        recorded_by = username or str(user_id)

        # Persist incidents.manual_fix_steps + manual_fix_by.
        from src.db.base import get_db_context
        from src.db.models import Incident as IncidentORM
        from sqlalchemy import update

        async with get_db_context() as db:
            statement = (
                update(IncidentORM)
                .where(IncidentORM.incident_id == incident_id)
                .values(
                    manual_fix_steps=fix_steps,
                    manual_fix_by=recorded_by,
                )
            )
            await db.execute(statement)
            await db.commit()

        # Operation log entry.
        from src.repositories.alert_operation_log_repository import get_alert_operation_log_repository

        await get_alert_operation_log_repository().append(
            event_type="MANUAL_FIX_RECORDED",
            incident_id=incident_id,
            approval_id=approval_id,
            actor=recorded_by,
            action_detail=fix_steps[:500],
            success=True,
        )

        # KM conversion is awaited directly (no create_task) to avoid a race
        # after the DB session closes. The incident is re-read so the object
        # passed on carries the manual_fix_steps just written.
        refreshed_incident = await incident_repo.get_by_id(incident_id)
        if refreshed_incident:
            from src.services.km_conversion_service import get_km_conversion_service

            km_service = get_km_conversion_service()
            try:
                await km_service.convert(refreshed_incident)
            except Exception as km_err:
                # A failed conversion does not fail the recording itself.
                logger.warning(
                    "km_conversion_failed",
                    incident_id=incident_id,
                    error=str(km_err),
                )

        # Confirmation reply.
        confirmation = (
            f"✅ <b>手動修復步驟已記錄</b>\n"
            f"━━━━━━━━━━━━━━━━━━━\n"
            f"📋 事件: <code>{html.escape(incident_id)}</code>\n"
            f"👤 記錄者: @{html.escape(recorded_by)}\n\n"
            f"<i>正在建立草稿 Playbook,請至 AWOOOI 審核後生效。</i>"
        )
        await self._send_request("sendMessage", {
            "chat_id": self.alert_chat_id,
            "text": confirmation,
            "parse_mode": "HTML",
        })

        logger.info(
            "manual_fix_recorded",
            incident_id=incident_id,
            user=username,
        )
        return {"success": True, "incident_id": incident_id}

    except Exception as failure:
        logger.error("handle_manual_fix_done_failed", error=str(failure))
        return {"success": False, "error": str(failure)}
|
||
|
||
async def _handle_auto_tuning(
    self,
    approval_id: str,
    user_id: int,
    username: str,
) -> dict:
    """
    Handle an auto-tuning request (Shadow Mode).

    Hard rule: in Shadow Mode no K8s command is ever executed. This method
    only logs and prints the request; every return value has
    ``executed=False``.

    Args:
        approval_id: Approval ID.
        user_id: Telegram ID of the requester.
        username: Name of the requester.

    Returns:
        dict: Tuning result. In Shadow Mode it carries ``command`` and ``log``;
        outside Shadow Mode, or on failure, it carries ``error``.
    """
    try:
        # Shadow Mode only records the request. A production implementation
        # would have to take the full tuning command from the ApprovalRecord.
        log_message = f"[SHADOW MODE] 自動調優請求 - 簽核單: {approval_id}"

        if not settings.SHADOW_MODE_ENABLED:
            refusal = "Production execution requires multi-sig approval"
            logger.warning(
                "auto_tuning_blocked_not_shadow_mode",
                approval_id=approval_id,
                message=refusal,
            )
            return {
                "executed": False,
                "shadow_mode": False,
                "approval_id": approval_id,
                "error": refusal,
            }

        logger.info(
            "shadow_mode_auto_tuning_triggered",
            approval_id=approval_id,
            user_id=user_id,
            username=username,
            shadow_mode=True,
        )
        # Console banner, in addition to the structured log entry above.
        rule = "=" * 60
        print(f"\n{rule}")
        print("[SHADOW MODE] AI 生成的調優指令請求")
        print(f"簽核單: {approval_id}")
        print(f"執行者: @{username} (ID: {user_id})")
        print(f"時間: {datetime.now(UTC).isoformat()}")
        print("狀態: 僅記錄,未實際執行")
        print(f"{rule}\n")

        return {
            "executed": False,
            "shadow_mode": True,
            "approval_id": approval_id,
            "triggered_by": username,
            "command": "kubectl command logged (see server logs)",
            "log": log_message,
        }

    except Exception as failure:
        logger.error("auto_tuning_error", error=str(failure), approval_id=approval_id)
        return {
            "executed": False,
            "error": str(failure),
        }
|
||
|
||
async def _handle_snooze(
    self,
    approval_id: str,
    username: str,
) -> dict:
    """
    Handle "remind me later" (2026-03-27 P1 optimisation).

    Stores a snooze marker in Redis under
    ``{SNOOZE_KEY_PREFIX}{approval_id}`` with a TTL of SNOOZE_TTL_SECONDS.
    The marker value is ``"{username}:{ISO timestamp}"``.

    Args:
        approval_id: Approval ID.
        username: Name of the user who snoozed.

    Returns:
        dict: On success ``{"snoozed": True, "approval_id", "snooze_until",
        "ttl_minutes"}``, where ``snooze_until`` is the UTC ISO-8601 time at
        which the marker expires. On failure
        ``{"snoozed": False, "error": <message>}``.
    """
    # Local import: the module header only imports UTC and datetime.
    from datetime import timedelta

    try:
        redis = get_redis()
        snooze_key = f"{SNOOZE_KEY_PREFIX}{approval_id}"

        # One timestamp for both the stored marker and the returned expiry.
        now = datetime.now(UTC)

        # Set the snooze marker; Redis drops it when the TTL elapses.
        await redis.setex(
            snooze_key,
            SNOOZE_TTL_SECONDS,
            f"{username}:{now.isoformat()}",
        )

        logger.info(
            "telegram_snooze_set",
            approval_id=approval_id,
            username=username,
            ttl_minutes=SNOOZE_TTL_SECONDS // 60,
        )

        return {
            "snoozed": True,
            "approval_id": approval_id,
            # Expiry time, not the time of the click.
            "snooze_until": (now + timedelta(seconds=SNOOZE_TTL_SECONDS)).isoformat(),
            "ttl_minutes": SNOOZE_TTL_SECONDS // 60,
        }

    except Exception as e:
        logger.error("snooze_error", error=str(e), approval_id=approval_id)
        return {
            "snoozed": False,
            "error": str(e),
        }
|
||
|
||
async def _handle_silence(
|
||
self,
|
||
approval_id: str,
|
||
username: str,
|
||
original_text: str,
|
||
) -> dict:
|
||
"""
|
||
處理靜默 1 小時 (2026-03-27 P1 優化)
|
||
|
||
功能: 同類告警 (相同資源) 1 小時內不再發送
|
||
|
||
Args:
|
||
approval_id: 簽核單 ID
|
||
username: 執行者名稱
|
||
original_text: 原始訊息 (用於解析資源名稱)
|
||
|
||
Returns:
|
||
dict: 處理結果
|
||
"""
|
||
try:
|
||
redis = get_redis()
|
||
|
||
# 從原始訊息解析資源名稱 (格式: 🎯 資源: xxx)
|
||
resource_name = "unknown"
|
||
for line in original_text.split("\n"):
|
||
if "🎯 資源:" in line or "🎯 資源: " in line:
|
||
resource_name = line.split(":")[-1].strip()
|
||
break
|
||
|
||
silence_key = f"{SILENCE_KEY_PREFIX}{resource_name}"
|
||
|
||
# 設置 1 小時靜默標記
|
||
await redis.setex(
|
||
silence_key,
|
||
SILENCE_TTL_SECONDS,
|
||
f"{username}:{datetime.now(UTC).isoformat()}:{approval_id}",
|
||
)
|
||
|
||
logger.info(
|
||
"telegram_silence_set",
|
||
approval_id=approval_id,
|
||
resource_name=resource_name,
|
||
username=username,
|
||
ttl_hours=SILENCE_TTL_SECONDS // 3600,
|
||
)
|
||
|
||
return {
|
||
"silenced": True,
|
||
"approval_id": approval_id,
|
||
"resource_name": resource_name,
|
||
"silence_until": datetime.now(UTC).isoformat(),
|
||
"ttl_hours": SILENCE_TTL_SECONDS // 3600,
|
||
}
|
||
|
||
except Exception as e:
|
||
logger.error("silence_error", error=str(e), approval_id=approval_id)
|
||
return {
|
||
"silenced": False,
|
||
"error": str(e),
|
||
}
|
||
|
||
async def _handle_llm_action_callback(
|
||
self,
|
||
callback_query_id: str,
|
||
callback_data: str,
|
||
user_id: int,
|
||
username: str = "",
|
||
) -> dict:
|
||
"""
|
||
B4: 處理 LLM 動態按鈕 callback(格式 la:{short_id})
|
||
|
||
2026-04-27 Claude Sonnet 4.6: H1+B4 Fix — 鬼魂按鈕鐵律修復
|
||
鬼魂按鈕三缺一絕不發送(callback格式+handler+MCP能力);
|
||
本方法補上 handler,與 H3 Redis short_id 映射配合。
|
||
|
||
流程:
|
||
1. 白名單驗證
|
||
2. Redis GET tg:la:{short_id} → 還原 payload(找不到 → 按鈕已過期)
|
||
3. 呼叫 dispatch_llm_action 取得執行規格
|
||
4. high risk 未確認 → 回應確認提示(TODO: 實作二次確認流程)
|
||
5. low/medium → answer_callback_query + 執行 MCP → 回報結果
|
||
6. 失敗 → 回報錯誤,不 crash
|
||
"""
|
||
import json as _json # noqa: PLC0415
|
||
|
||
from src.services.callback_dispatcher import dispatch_llm_action # noqa: PLC0415
|
||
|
||
# ── 1. 白名單驗證 ─────────────────────────────────────────────────────
|
||
if not self._security.is_whitelisted(user_id):
|
||
await self._send_request("answerCallbackQuery", {
|
||
"callback_query_id": callback_query_id,
|
||
"text": "❌ 您沒有執行此操作的權限",
|
||
"show_alert": True,
|
||
})
|
||
return {"action": "llm_action", "ok": False, "reason": "not_whitelisted"}
|
||
|
||
# ── 2. Redis GET → 還原 payload ───────────────────────────────────────
|
||
short_id = callback_data[3:] # 去掉 "la:" 前綴
|
||
redis_key = f"tg:la:{short_id}"
|
||
payload: dict | None = None
|
||
try:
|
||
redis = get_redis()
|
||
raw = await redis.get(redis_key)
|
||
if raw:
|
||
payload = _json.loads(raw)
|
||
except Exception as exc:
|
||
# P1: Redis 故障與按鈕過期分開處理
|
||
logger.error("llm_action_redis_get_failed", short_id=short_id, error=str(exc))
|
||
await self._send_request("answerCallbackQuery", {
|
||
"callback_query_id": callback_query_id,
|
||
"text": "⚠️ 系統暫時不可用,請稍後重試",
|
||
"show_alert": True,
|
||
})
|
||
return {"action": "llm_action", "ok": False, "reason": "redis_unavailable"}
|
||
|
||
if payload is None:
|
||
await self._send_request("answerCallbackQuery", {
|
||
"callback_query_id": callback_query_id,
|
||
"text": "⏰ 此按鈕已過期,請重新觸發告警流程",
|
||
"show_alert": True,
|
||
})
|
||
logger.info("llm_action_button_expired", short_id=short_id)
|
||
return {"action": "llm_action", "ok": False, "reason": "button_expired"}
|
||
|
||
name: str = payload.get("name", "")
|
||
provider: str = payload.get("provider", "")
|
||
tool: str = payload.get("tool", "")
|
||
risk: str = payload.get("risk", "low")
|
||
|
||
# ── 3. 組裝 stub action + 呼叫 dispatch_llm_action ───────────────────
|
||
class _StubAction:
|
||
pass
|
||
|
||
stub = _StubAction()
|
||
stub.name = name # type: ignore[attr-defined]
|
||
stub.mcp_provider = provider # type: ignore[attr-defined]
|
||
stub.mcp_tool = tool # type: ignore[attr-defined]
|
||
stub.risk = risk # type: ignore[attr-defined]
|
||
stub.params = {} # type: ignore[attr-defined]
|
||
|
||
# P0 Fix: 從 Redis payload 取真實 incident_id,不用隨機 short_id
|
||
real_incident_id: str = payload.get("incident_id", "") or short_id
|
||
context = {"incident_id": real_incident_id, "confirmed": False}
|
||
result = dispatch_llm_action(stub, context)
|
||
|
||
# ── 4. high risk → 二次確認提示 ───────────────────────────────────────
|
||
if not result.get("ok") and result.get("reason") == "high_risk_requires_confirmation":
|
||
await self._send_request("answerCallbackQuery", {
|
||
"callback_query_id": callback_query_id,
|
||
"text": f"⚠️ 高風險操作:{name},請傳送指令確認後再執行",
|
||
"show_alert": True,
|
||
})
|
||
logger.info(
|
||
"llm_action_high_risk_pending",
|
||
name=name,
|
||
mcp_tool=tool,
|
||
user_id=user_id,
|
||
)
|
||
return {"action": "llm_action", "ok": False, "reason": "high_risk_requires_confirmation"}
|
||
|
||
# ── 5. dispatch 失敗(allowlist / critical 等) ───────────────────────
|
||
if not result.get("ok"):
|
||
reason = result.get("reason", "unknown")
|
||
await self._send_request("answerCallbackQuery", {
|
||
"callback_query_id": callback_query_id,
|
||
"text": f"❌ 無法執行:{reason}",
|
||
"show_alert": True,
|
||
})
|
||
logger.warning(
|
||
"llm_action_dispatch_rejected",
|
||
name=name,
|
||
mcp_tool=tool,
|
||
reason=reason,
|
||
)
|
||
return {"action": "llm_action", "ok": False, "reason": reason}
|
||
|
||
# ── 6. 允許執行 → answer_callback + 回報結果 ─────────────────────────
|
||
await self._send_request("answerCallbackQuery", {
|
||
"callback_query_id": callback_query_id,
|
||
"text": f"▶️ 執行中:{name}",
|
||
"show_alert": False,
|
||
})
|
||
|
||
logger.info(
|
||
"llm_action_executing",
|
||
name=name,
|
||
mcp_tool=tool,
|
||
mcp_provider=provider,
|
||
risk=risk,
|
||
user_id=user_id,
|
||
username=username,
|
||
)
|
||
|
||
# 回報執行結果到 Telegram(MCP 實際呼叫由外部整合,此處發送確認訊息)
|
||
import html as _html # noqa: PLC0415
|
||
result_text = (
|
||
f"✅ <b>LLM 動作已觸發</b>\n"
|
||
f"動作:<code>{_html.escape(name)}</code>\n"
|
||
f"工具:<code>{_html.escape(provider)}/{_html.escape(tool)}</code>\n"
|
||
f"風險:<code>{_html.escape(risk)}</code>\n"
|
||
f"操作者:@{_html.escape(str(username or user_id))}"
|
||
)
|
||
try:
|
||
await self._send_request("sendMessage", {
|
||
"chat_id": self.alert_chat_id,
|
||
"text": result_text,
|
||
"parse_mode": "HTML",
|
||
})
|
||
except Exception as exc:
|
||
logger.warning("llm_action_result_notify_failed", error=str(exc))
|
||
|
||
return {
|
||
"action": "llm_action",
|
||
"ok": True,
|
||
"name": name,
|
||
"mcp_tool": tool,
|
||
"mcp_provider": provider,
|
||
"risk": risk,
|
||
"user": {"id": user_id, "username": username},
|
||
}
|
||
|
||
async def _answer_callback(
|
||
self,
|
||
callback_query_id: str,
|
||
action: str,
|
||
text: str | None = None,
|
||
) -> None:
|
||
"""回應 Callback Query"""
|
||
if text is None:
|
||
if action == "approve":
|
||
text = "✅ 已簽核"
|
||
elif action == "reject":
|
||
text = "❌ 已拒絕"
|
||
elif action == "tune":
|
||
text = "⚡ 調優中..."
|
||
elif action == "snooze":
|
||
text = "⏰ 30 分鐘後再提醒"
|
||
elif action == "silence":
|
||
text = "🔕 此類告警靜默 1 小時"
|
||
else:
|
||
text = "✓ 已處理"
|
||
|
||
await self._send_request("answerCallbackQuery", {
|
||
"callback_query_id": callback_query_id,
|
||
"text": text,
|
||
"show_alert": False,
|
||
})
|
||
|
||
async def _answer_callback_nonfatal(
|
||
self,
|
||
callback_query_id: str,
|
||
action: str,
|
||
text: str | None = None,
|
||
) -> None:
|
||
"""Best-effort callback toast; never block the actual DB-backed reply."""
|
||
try:
|
||
await self._answer_callback(callback_query_id, action, text=text)
|
||
except Exception as exc:
|
||
logger.warning(
|
||
"telegram_answer_callback_nonfatal_failed",
|
||
action=action,
|
||
error=str(exc),
|
||
)
|
||
|
||
async def _update_message_after_action(
|
||
self,
|
||
message_id: int,
|
||
action: str,
|
||
username: str,
|
||
original_text: str,
|
||
extra_info: str = "",
|
||
) -> None:
|
||
"""
|
||
更新訊息: 保留原始卡片內容 + 簽核/調優鋼印
|
||
|
||
UX 要求:
|
||
- 嚴禁覆蓋原始內容
|
||
- 必須在底部加上分隔線與簽核狀態
|
||
- 移除所有按鈕
|
||
"""
|
||
# 構建鋼印 (2026-03-27 ogt: 新增 snooze/silence)
|
||
if action == "approve":
|
||
stamp = f"✅ 已由 @{username} 授權執行"
|
||
elif action == "reject":
|
||
stamp = f"❌ 已由 @{username} 拒絕執行"
|
||
elif action == "tune":
|
||
stamp = f"⚡ 已由 @{username} 觸發自動調優 (Shadow Mode)"
|
||
if extra_info:
|
||
stamp += "\n📝 指令已記錄"
|
||
elif action == "snooze":
|
||
stamp = f"⏰ @{username} 已設定 30 分鐘後再提醒"
|
||
elif action == "silence":
|
||
resource_info = f" ({extra_info})" if extra_info else ""
|
||
stamp = f"🔕 @{username} 已靜默此類告警 1 小時{resource_info}"
|
||
else:
|
||
stamp = f"✓ 已由 @{username} 處理"
|
||
|
||
# Step 1: 先移除按鈕 (確保按鈕一定消失,即使文字更新失敗)
|
||
# 2026-04-05 Claude Code: editMessageText 因 HTML 特殊字符可能失敗,
|
||
# 先用 editMessageReplyMarkup 確保按鈕移除,再嘗試更新文字
|
||
try:
|
||
await self._send_request("editMessageReplyMarkup", {
|
||
"chat_id": self.alert_chat_id,
|
||
"message_id": message_id,
|
||
"reply_markup": {"inline_keyboard": []},
|
||
})
|
||
except TelegramGatewayError as e:
|
||
logger.warning("telegram_remove_buttons_failed", message_id=message_id, error=str(e))
|
||
|
||
# Step 2: 嘗試更新文字 (原始文字已轉義,確保 HTML 安全)
|
||
separator = "──────────────"
|
||
safe_original = html.escape(original_text)
|
||
safe_updated_text = f"{safe_original}\n{separator}\n{stamp}"
|
||
try:
|
||
await self._send_request("editMessageText", {
|
||
"chat_id": self.alert_chat_id,
|
||
"message_id": message_id,
|
||
"text": safe_updated_text,
|
||
"parse_mode": "HTML",
|
||
"reply_markup": {"inline_keyboard": []},
|
||
"disable_web_page_preview": True,
|
||
})
|
||
except TelegramGatewayError as e:
|
||
# 文字更新失敗不影響整體流程,按鈕已移除
|
||
logger.warning("telegram_update_text_failed", message_id=message_id, error=str(e))
|
||
|
||
async def mark_auto_repaired(
|
||
self,
|
||
approval_id: str,
|
||
playbook_name: str,
|
||
execution_time_ms: int,
|
||
success: bool = True,
|
||
) -> bool:
|
||
"""
|
||
自動修復完成後更新 Telegram 卡片:
|
||
1. 移除批准/拒絕/靜默按鈕
|
||
2. 回覆原訊息顯示修復結果
|
||
|
||
2026-04-10 Claude Sonnet 4.6 Asia/Taipei (ADR-068 閉環)
|
||
"""
|
||
try:
|
||
stored = await get_redis().get(f"tg_approval:{approval_id}")
|
||
if not stored:
|
||
logger.warning("mark_auto_repaired_no_msg_id", approval_id=approval_id)
|
||
return False
|
||
|
||
message_id = int(stored)
|
||
|
||
# 移除按鈕
|
||
try:
|
||
await self._send_request("editMessageReplyMarkup", {
|
||
"chat_id": self.alert_chat_id,
|
||
"message_id": message_id,
|
||
"reply_markup": {"inline_keyboard": []},
|
||
})
|
||
except TelegramGatewayError as e:
|
||
logger.warning("mark_auto_repaired_remove_buttons_failed", message_id=message_id, error=str(e))
|
||
|
||
# 回覆原訊息說明結果
|
||
_status = "✅ 已自動修復" if success else "❌ 自動修復失敗"
|
||
await self._send_request("sendMessage", {
|
||
"chat_id": self.alert_chat_id,
|
||
"text": (
|
||
f"{_status}\n"
|
||
f"Playbook: <code>{html.escape(playbook_name)}</code>\n"
|
||
f"耗時: {execution_time_ms}ms"
|
||
),
|
||
"parse_mode": "HTML",
|
||
"reply_parameters": {"message_id": message_id},
|
||
})
|
||
return True
|
||
|
||
except Exception as e:
|
||
logger.warning("mark_auto_repaired_failed", approval_id=approval_id, error=str(e))
|
||
return False
|
||
|
||
async def append_incident_update(
|
||
self,
|
||
incident_id: str,
|
||
status_line: str,
|
||
keep_info_buttons: bool = True,
|
||
) -> bool:
|
||
"""
|
||
在原始告警訊息追加狀態行,並換掉操作按鈕。
|
||
用於自動修復完成/失敗後更新原訊息,讓狀態變更在同一則訊息上延續。
|
||
|
||
流程:
|
||
1. 從 Redis 取 tg_msg:{incident_id} 得到 message_id
|
||
2. editMessageText: 原文 + 分隔線 + status_line
|
||
3. editMessageReplyMarkup: 移除 Row 1 (批准/拒絕/靜默),保留 Row 2 (詳情/重診/歷史)
|
||
|
||
Args:
|
||
incident_id: Incident ID(用於查 Redis 的 message_id)
|
||
status_line: 追加的狀態文字,如「✅ 已自動修復: kubectl rollout restart…」
|
||
keep_info_buttons: 是否保留詳情/重診/歷史按鈕(預設 True)
|
||
|
||
Returns:
|
||
bool: True = 成功 edit 原訊息;False = 找不到 message_id(fallback 需另行處理)
|
||
|
||
2026-04-09 Claude Sonnet 4.6 Asia/Taipei (統帥要求: 狀態變更在原訊息延續)
|
||
"""
|
||
redis = get_redis()
|
||
redis_key = f"tg_msg:{incident_id}"
|
||
stored = await redis.get(redis_key)
|
||
if not stored:
|
||
logger.warning(
|
||
"append_incident_update_no_message_id",
|
||
incident_id=incident_id,
|
||
reason="message_id not in Redis",
|
||
)
|
||
return False
|
||
|
||
try:
|
||
message_id = int(stored)
|
||
except (ValueError, TypeError):
|
||
logger.warning("append_incident_update_invalid_message_id", stored=stored)
|
||
return False
|
||
|
||
# Telegram 只適合放決策摘要;同一 incident 的相同狀態 5 分鐘內不重複回覆,
|
||
# 詳細執行紀錄應進 timeline / AwoooP Run Monitor,避免群組被 auto-failure 洗版。
|
||
status_hash = hashlib.sha1(status_line.encode("utf-8")).hexdigest()[:16]
|
||
dedup_key = f"{INCIDENT_UPDATE_DEDUP_PREFIX}{incident_id}:{status_hash}"
|
||
try:
|
||
was_set = await redis.set(
|
||
dedup_key,
|
||
"1",
|
||
ex=INCIDENT_UPDATE_DEDUP_TTL_SECONDS,
|
||
nx=True,
|
||
)
|
||
if not was_set:
|
||
logger.info(
|
||
"append_incident_update_dedup_suppressed",
|
||
incident_id=incident_id,
|
||
dedup_key=dedup_key,
|
||
)
|
||
return True
|
||
except Exception as exc:
|
||
logger.warning(
|
||
"append_incident_update_dedup_failed",
|
||
incident_id=incident_id,
|
||
error=str(exc),
|
||
)
|
||
|
||
suppress_reply = False
|
||
if _is_noisy_failure_update(status_line):
|
||
# 不同 incident 若卡在同一個自動修復/診斷失敗摘要,Telegram 只推第一則;
|
||
# 每個 incident 仍會繼續移除原卡危險按鈕,完整細節交給 timeline / AwoooP。
|
||
global_hash = hashlib.sha1(status_line.encode("utf-8")).hexdigest()[:16]
|
||
global_dedup_key = f"{INCIDENT_UPDATE_GLOBAL_FAILURE_DEDUP_PREFIX}{global_hash}"
|
||
try:
|
||
was_global_set = await redis.set(
|
||
global_dedup_key,
|
||
incident_id,
|
||
ex=INCIDENT_UPDATE_GLOBAL_FAILURE_DEDUP_TTL_SECONDS,
|
||
nx=True,
|
||
)
|
||
suppress_reply = not bool(was_global_set)
|
||
if suppress_reply:
|
||
logger.info(
|
||
"append_incident_update_global_failure_dedup_suppressed",
|
||
incident_id=incident_id,
|
||
dedup_key=global_dedup_key,
|
||
)
|
||
except Exception as exc:
|
||
logger.warning(
|
||
"append_incident_update_global_failure_dedup_failed",
|
||
incident_id=incident_id,
|
||
error=str(exc),
|
||
)
|
||
|
||
# Step 1: 取得原始訊息文字(Telegram Bot API 不提供讀取原文,只能在 editMessageText 裡重建)
|
||
# 策略: 只追加 status_line,不讀取原文(Telegram edit 要傳完整新文字)
|
||
# 所以先用 editMessageReplyMarkup 換按鈕,再 sendMessage 同 chat 以 reply 方式追加狀態
|
||
# → 實際上用 reply_to_message_id 讓 Telegram 顯示連結更直觀
|
||
|
||
# Step 1: 換掉按鈕 (移除 Row 1 批准/拒絕/靜默,保留 Row 2 資訊按鈕)
|
||
if keep_info_buttons:
|
||
inline_keyboard = [
|
||
[
|
||
{"text": "📋 詳情", "callback_data": f"detail:{incident_id}"},
|
||
{"text": "🔄 重診", "callback_data": f"reanalyze:{incident_id}"},
|
||
{"text": "📊 歷史", "callback_data": f"history:{incident_id}"},
|
||
],
|
||
]
|
||
awooop_row = _awooop_runs_button_row(incident_id)
|
||
if awooop_row:
|
||
inline_keyboard.append(awooop_row)
|
||
new_keyboard = {"inline_keyboard": inline_keyboard}
|
||
else:
|
||
new_keyboard = {"inline_keyboard": []}
|
||
|
||
try:
|
||
await self._send_request("editMessageReplyMarkup", {
|
||
"chat_id": self.alert_chat_id,
|
||
"message_id": message_id,
|
||
"reply_markup": new_keyboard,
|
||
})
|
||
except TelegramGatewayError as e:
|
||
logger.warning("append_incident_update_edit_buttons_failed", message_id=message_id, error=str(e))
|
||
|
||
if suppress_reply:
|
||
return True
|
||
|
||
# Step 2: Reply 原訊息追加狀態(保留原文不動,以 reply 方式延續)
|
||
try:
|
||
await self._send_request("sendMessage", {
|
||
"chat_id": self.alert_chat_id,
|
||
"text": status_line,
|
||
"parse_mode": "HTML",
|
||
"reply_to_message_id": message_id,
|
||
"disable_web_page_preview": True,
|
||
})
|
||
except TelegramGatewayError as e:
|
||
logger.warning("append_incident_update_reply_failed", message_id=message_id, error=str(e))
|
||
|
||
logger.info(
|
||
"append_incident_update_done",
|
||
incident_id=incident_id,
|
||
message_id=message_id,
|
||
)
|
||
return True
|
||
|
||
async def append_grouped_alert_digest(
|
||
self,
|
||
*,
|
||
incident_id: str,
|
||
group_key: str,
|
||
digest_text: str,
|
||
) -> bool:
|
||
"""
|
||
將同組告警收斂摘要回覆到父告警卡,不移除原卡按鈕。
|
||
|
||
與 append_incident_update 不同:digest 是觀測訊息,不代表執行狀態改變,
|
||
因此不能動 approve/reject/silence 按鈕。
|
||
"""
|
||
redis = get_redis()
|
||
stored = await redis.get(f"tg_msg:{incident_id}")
|
||
if not stored:
|
||
logger.info(
|
||
"grouped_alert_digest_no_parent_message",
|
||
incident_id=incident_id,
|
||
group_key=group_key,
|
||
)
|
||
return False
|
||
|
||
try:
|
||
message_id = int(stored)
|
||
except (ValueError, TypeError):
|
||
logger.warning(
|
||
"grouped_alert_digest_invalid_parent_message",
|
||
incident_id=incident_id,
|
||
stored=stored,
|
||
)
|
||
return False
|
||
|
||
dedup_key = f"{GROUPED_ALERT_DIGEST_DEDUP_PREFIX}{group_key}"
|
||
try:
|
||
was_set = await redis.set(
|
||
dedup_key,
|
||
incident_id,
|
||
ex=GROUPED_ALERT_DIGEST_DEDUP_TTL_SECONDS,
|
||
nx=True,
|
||
)
|
||
if not was_set:
|
||
logger.info(
|
||
"grouped_alert_digest_dedup_suppressed",
|
||
incident_id=incident_id,
|
||
group_key=group_key,
|
||
)
|
||
return True
|
||
except Exception as exc:
|
||
logger.warning(
|
||
"grouped_alert_digest_dedup_failed",
|
||
incident_id=incident_id,
|
||
group_key=group_key,
|
||
error=str(exc),
|
||
)
|
||
|
||
try:
|
||
await self._send_request("sendMessage", {
|
||
"chat_id": self.alert_chat_id,
|
||
"text": digest_text[:1400],
|
||
"parse_mode": "HTML",
|
||
"reply_parameters": {
|
||
"message_id": message_id,
|
||
"allow_sending_without_reply": True,
|
||
},
|
||
"disable_web_page_preview": True,
|
||
})
|
||
except TelegramGatewayError as exc:
|
||
logger.warning(
|
||
"grouped_alert_digest_reply_failed",
|
||
incident_id=incident_id,
|
||
group_key=group_key,
|
||
message_id=message_id,
|
||
error=str(exc),
|
||
)
|
||
return False
|
||
|
||
logger.info(
|
||
"grouped_alert_digest_reply_sent",
|
||
incident_id=incident_id,
|
||
group_key=group_key,
|
||
message_id=message_id,
|
||
)
|
||
return True
|
||
|
||
async def _dispatch_category_action(
|
||
self,
|
||
callback_query_id: str,
|
||
action: str,
|
||
incident_id: str,
|
||
user_id: int,
|
||
) -> None:
|
||
"""
|
||
Phase 5 Sprint 5.1 (2026-04-14 Claude Sonnet 4.6):
|
||
Fallback dispatcher — 未知 info action 查 callback_action_spec.yaml
|
||
|
||
流程:
|
||
1. 查 action registry
|
||
2. 若不存在 → 原「⚠️ 未知操作」回覆
|
||
3. 若存在 → 從 incident 取 labels → dispatch_action → reply_to 原卡片
|
||
|
||
注意: 此方法只處理 info action (查類)。nonce action (寫類) 走另一路徑。
|
||
"""
|
||
from src.services.callback_dispatcher import dispatch_action, get_action_spec
|
||
|
||
spec = get_action_spec(action)
|
||
if not spec:
|
||
await self._answer_callback(callback_query_id, action, text="⚠️ 未知操作")
|
||
return
|
||
|
||
# Acknowledge callback immediately(避免 Telegram 端 timeout)
|
||
await self._answer_callback(
|
||
callback_query_id, action, text=f"{spec.emoji} 執行中..."
|
||
)
|
||
|
||
# 從 incident 取 labels (供模板替換)
|
||
labels: dict = {}
|
||
try:
|
||
from src.repositories.incident_repository import get_incident_repository
|
||
repo = get_incident_repository()
|
||
incident = await repo.get_by_id(incident_id)
|
||
if incident and incident.signals:
|
||
labels = incident.signals[0].labels or {}
|
||
except Exception as _e:
|
||
logger.debug("dispatch_labels_lookup_failed", incident_id=incident_id, error=str(_e))
|
||
|
||
# Dispatch
|
||
result = await dispatch_action(
|
||
action_name=action,
|
||
incident_id=incident_id,
|
||
user_id=user_id,
|
||
labels=labels,
|
||
)
|
||
|
||
# Reply to 原卡片 — 從 Redis tg_msg 查 message_id
|
||
try:
|
||
from src.core.redis_client import get_redis
|
||
redis = get_redis()
|
||
msg_id_raw = await redis.get(f"tg_msg:{incident_id}")
|
||
orig_msg_id = int(msg_id_raw) if msg_id_raw else None
|
||
except Exception:
|
||
orig_msg_id = None
|
||
|
||
try:
|
||
payload: dict = {
|
||
"chat_id": self.alert_chat_id,
|
||
"text": result.result_text,
|
||
"parse_mode": "HTML",
|
||
}
|
||
if orig_msg_id:
|
||
payload["reply_to_message_id"] = orig_msg_id
|
||
await self._send_request("sendMessage", payload)
|
||
logger.info(
|
||
"category_action_reply_sent",
|
||
action=action,
|
||
incident_id=incident_id,
|
||
success=result.success,
|
||
duration_ms=round(result.duration_ms, 1),
|
||
)
|
||
except Exception as _e:
|
||
logger.warning("category_action_reply_failed", action=action, error=str(_e))
|
||
|
||
async def _send_incident_detail(self, incident_id: str) -> None:
|
||
"""
|
||
ADR-050 P2: 傳送事件詳情訊息 (不修改原始簽核卡片)
|
||
|
||
2026-04-01 Claude Code (ADR-050 P2): detail button handler
|
||
"""
|
||
# 延遲 import 避免循環依賴 (與 approval_service 同一模式)
|
||
from src.repositories.incident_repository import get_incident_repository
|
||
from src.services.incident_timeline_service import fetch_incident_timeline
|
||
|
||
try:
|
||
repo = get_incident_repository()
|
||
incident = await repo.get_by_id(incident_id)
|
||
|
||
if not incident:
|
||
await self.send_notification(f"⚠️ 找不到事件 <code>{html.escape(incident_id)}</code>")
|
||
return
|
||
|
||
dc = incident.decision_chain
|
||
confidence_bar = "█" * int((dc.confidence if dc else 0) * 10) + "░" * (10 - int((dc.confidence if dc else 0) * 10))
|
||
|
||
lines = [
|
||
"📋 <b>事件詳情</b>",
|
||
"",
|
||
f"🔖 <b>ID:</b> <code>{html.escape(incident.incident_id)}</code>",
|
||
f"📊 <b>狀態:</b> {incident.status.value}",
|
||
f"⚡ <b>嚴重度:</b> {incident.severity.value}",
|
||
]
|
||
|
||
if incident.affected_services:
|
||
lines.append(f"🎯 <b>受影響服務:</b> {', '.join(html.escape(s) for s in incident.affected_services[:3])}")
|
||
|
||
if dc:
|
||
lines += [
|
||
"",
|
||
f"🤖 <b>AI 分析</b> ({html.escape(dc.model_used)})",
|
||
f"💡 {html.escape(dc.hypothesis)}",
|
||
f"📈 信心: [{confidence_bar}] {dc.confidence:.0%}",
|
||
]
|
||
if dc.probable_root_causes:
|
||
lines.append(f"🔍 根因: {html.escape(dc.probable_root_causes[0][:100])}")
|
||
|
||
# 2026-04-02 Claude Code: 修正時區 — 必須轉台北時區 (feedback_timezone_taipei.md)
|
||
from zoneinfo import ZoneInfo
|
||
created_taipei = incident.created_at.astimezone(ZoneInfo("Asia/Taipei")) if incident.created_at else incident.created_at
|
||
lines += [
|
||
"",
|
||
f"🕐 <b>建立:</b> {created_taipei.strftime('%m/%d %H:%M') if created_taipei else 'N/A'}",
|
||
]
|
||
|
||
if incident.frequency_stats:
|
||
fs = incident.frequency_stats
|
||
lines.append(f"📉 <b>頻率:</b> 1h={fs.count_1h} 24h={fs.count_24h} 7d={fs.count_7d}")
|
||
|
||
timeline = await fetch_incident_timeline(incident_id)
|
||
if timeline and timeline.get("ascii_timeline"):
|
||
lines += [
|
||
"",
|
||
"🧭 <b>處理歷程</b>",
|
||
f"<code>{html.escape(timeline['ascii_timeline'])}</code>",
|
||
]
|
||
reconciliation = timeline.get("reconciliation") or {}
|
||
if reconciliation.get("consistency_status") in {"blocked", "degraded"}:
|
||
mismatch_codes = [
|
||
str(row.get("code"))
|
||
for row in reconciliation.get("mismatches", [])
|
||
if row.get("code")
|
||
]
|
||
lines += [
|
||
"",
|
||
"🚦 <b>真相鏈狀態</b>",
|
||
f"狀態: <code>{html.escape(str(reconciliation.get('consistency_status')))}</code>",
|
||
f"下一步: <code>{html.escape(str(reconciliation.get('operator_next_state')))}</code>",
|
||
]
|
||
if mismatch_codes:
|
||
lines.append(
|
||
"矛盾: "
|
||
+ html.escape(", ".join(mismatch_codes[:4]))
|
||
)
|
||
|
||
try:
|
||
from src.services.adr100_remediation_service import (
|
||
get_adr100_remediation_service,
|
||
)
|
||
|
||
remediation_history = await get_adr100_remediation_service().history(
|
||
limit=5,
|
||
incident_id=incident_id,
|
||
)
|
||
lines += _format_remediation_history_lines(remediation_history)
|
||
except Exception as remediation_exc:
|
||
logger.warning(
|
||
"incident_detail_remediation_history_summary_failed",
|
||
incident_id=incident_id,
|
||
error=str(remediation_exc),
|
||
)
|
||
|
||
try:
|
||
from src.services.awooop_truth_chain_service import fetch_truth_chain
|
||
|
||
truth_chain = await fetch_truth_chain(
|
||
source_id=incident_id,
|
||
project_id=getattr(incident, "project_id", None) or "awoooi",
|
||
)
|
||
gateway_summary = (
|
||
(truth_chain.get("mcp") or {})
|
||
.get("awooop_gateway")
|
||
)
|
||
lines += _format_gateway_summary_lines(gateway_summary)
|
||
lines += _format_automation_quality_lines(
|
||
truth_chain.get("automation_quality")
|
||
)
|
||
except Exception as truth_exc:
|
||
logger.warning(
|
||
"incident_detail_truth_chain_summary_failed",
|
||
incident_id=incident_id,
|
||
error=str(truth_exc),
|
||
)
|
||
|
||
await self._send_html_line_message(
|
||
lines,
|
||
failure_context="incident_detail",
|
||
reply_markup=_awooop_runs_reply_markup(incident_id),
|
||
)
|
||
|
||
except Exception as e:
|
||
logger.warning("send_incident_detail_failed", incident_id=incident_id, error=str(e))
|
||
await self.send_notification(f"⚠️ 無法取得事件詳情: {html.escape(str(e)[:100])}")
|
||
|
||
async def _send_incident_history(self, incident_id: str) -> None:
|
||
"""
|
||
ADR-050 P2: 傳送事件頻率統計訊息與 DB truth-chain 摘要
|
||
|
||
Phase 27 雙層策略 (2026-04-10 ogt):
|
||
- Layer 1: DB frequency_snapshot — 建立時刻快照,永久保存
|
||
- Layer 2: Redis AnomalyCounter — 跨 incident 累積統計 (35d TTL)
|
||
- Layer 3: AwoooP truth-chain — 補足 auto-repair / evidence / KM / MCP 階段
|
||
"""
|
||
from src.repositories.incident_repository import get_incident_repository
|
||
from src.services.anomaly_counter import get_anomaly_counter
|
||
|
||
try:
|
||
repo = get_incident_repository()
|
||
incident = await repo.get_by_id(incident_id)
|
||
|
||
if not incident:
|
||
await self.send_notification(f"⚠️ 找不到事件 <code>{html.escape(incident_id)}</code>")
|
||
return
|
||
|
||
lines = [
|
||
"📊 <b>事件歷史統計</b>",
|
||
"",
|
||
f"🔖 <code>{html.escape(incident_id)}</code>",
|
||
]
|
||
|
||
# === Layer 1: DB 快照 (建立時刻,永久) ===
|
||
fs = incident.frequency_stats
|
||
if fs:
|
||
lines += [
|
||
"",
|
||
"📌 <b>建立時刻快照</b>",
|
||
f" 1小時: {fs.count_1h} 次",
|
||
f" 24小時: {fs.count_24h} 次",
|
||
f" 7天: {fs.count_7d} 次",
|
||
f" 30天: {fs.count_30d} 次",
|
||
]
|
||
if fs.auto_repair_count > 0:
|
||
lines.append(f" 自動修復: {fs.auto_repair_count} 次")
|
||
if fs.last_repair_action:
|
||
lines.append(f" 最後動作: {html.escape(fs.last_repair_action)}")
|
||
if fs.escalation_level:
|
||
lines.append(f" 升級等級: {html.escape(fs.escalation_level)}")
|
||
if fs.anomaly_key:
|
||
lines.append(f"🔑 告警鍵: <code>{html.escape(fs.anomaly_key)}</code>")
|
||
anomaly_key = fs.anomaly_key
|
||
else:
|
||
lines += ["", "⚠️ 無建立時快照(舊 incident 或 Redis 已超期)"]
|
||
# 嘗試從 signals 推導 anomaly_key
|
||
anomaly_key = None
|
||
if incident.signals:
|
||
sig = incident.signals[0]
|
||
parts = [
|
||
sig.alert_name or "",
|
||
incident.affected_services[0] if incident.affected_services else "",
|
||
(sig.labels or {}).get("namespace", ""),
|
||
(sig.labels or {}).get("error_type", ""),
|
||
]
|
||
candidate = ":".join(p for p in parts if p)
|
||
if candidate:
|
||
anomaly_key = candidate
|
||
|
||
# === Layer 2: Redis 累積統計 (35d TTL) ===
|
||
if anomaly_key:
|
||
try:
|
||
counter = get_anomaly_counter()
|
||
disposition = await counter.get_disposition_stats(anomaly_key)
|
||
auto_r = disposition.get("auto_repair_count", 0)
|
||
cold_s = disposition.get("cold_start_trust_count", 0)
|
||
human_a = disposition.get("human_approved_count", 0)
|
||
manual_r = disposition.get("manual_resolved_count", 0)
|
||
total_res = auto_r + cold_s + human_a + manual_r
|
||
if total_res > 0:
|
||
auto_rate = int((auto_r + cold_s) / total_res * 100)
|
||
lines += [
|
||
"",
|
||
f"📋 <b>累積處置分佈</b> (共 {total_res} 次,35天內)",
|
||
f" 🤖 自動修復: {auto_r}",
|
||
f" ❄️ 冷啟動信任: {cold_s}",
|
||
f" 👤 人工審核: {human_a}",
|
||
f" 🔧 手動處理: {manual_r}",
|
||
f" 📈 自動化率: <b>{auto_rate}%</b>",
|
||
]
|
||
else:
|
||
lines += ["", "📋 <b>累積處置</b>: 尚無記錄 (Redis TTL 35天)"]
|
||
except Exception as redis_err:
|
||
logger.warning("incident_history_redis_error", error=str(redis_err))
|
||
lines += ["", "⚠️ Redis 統計暫時無法取得"]
|
||
|
||
# === Layer 3: DB truth-chain(避免 Redis TTL / frequency_snapshot 缺口造成誤判) ===
|
||
try:
|
||
from src.services.awooop_truth_chain_service import fetch_truth_chain
|
||
|
||
truth_chain = await fetch_truth_chain(
|
||
source_id=incident_id,
|
||
project_id=getattr(incident, "project_id", None) or "awoooi",
|
||
)
|
||
truth_status = truth_chain.get("truth_status") or {}
|
||
if truth_status:
|
||
lines += [
|
||
"",
|
||
"🧭 <b>DB Truth-chain</b>",
|
||
(
|
||
"階段: "
|
||
f"<code>{html.escape(str(truth_status.get('current_stage') or 'unknown'))}</code>"
|
||
" / "
|
||
f"<code>{html.escape(str(truth_status.get('stage_status') or 'unknown'))}</code>"
|
||
),
|
||
(
|
||
"人工介入: "
|
||
f"<code>{'yes' if truth_status.get('needs_human') else 'no'}</code>"
|
||
),
|
||
]
|
||
blockers = truth_status.get("blockers")
|
||
if isinstance(blockers, list) and blockers:
|
||
lines.append(
|
||
"卡點: "
|
||
+ html.escape(", ".join(str(item) for item in blockers[:4]))
|
||
)
|
||
lines += _format_automation_quality_lines(
|
||
truth_chain.get("automation_quality")
|
||
)
|
||
except Exception as truth_exc:
|
||
logger.warning(
|
||
"incident_history_truth_chain_summary_failed",
|
||
incident_id=incident_id,
|
||
error=str(truth_exc),
|
||
)
|
||
|
||
await self._send_html_line_message(
|
||
lines,
|
||
failure_context="incident_history",
|
||
reply_markup=_awooop_runs_reply_markup(incident_id),
|
||
)
|
||
|
||
except Exception as e:
|
||
logger.warning("send_incident_history_failed", incident_id=incident_id, error=str(e))
|
||
await self.send_notification(f"⚠️ 無法取得歷史統計: {html.escape(str(e))}")
|
||
|
||
async def _send_reanalyze_result(self, incident_id: str) -> None:
|
||
"""
|
||
ADR-050 P2: 觸發重診並傳送結果訊息
|
||
|
||
呼叫 IncidentService.trigger_reanalysis(),以新訊息回報排程結果。
|
||
不修改原始簽核卡片,避免干擾授權流程。
|
||
|
||
2026-04-01 Claude Code (ADR-050 P2): reanalyze button handler
|
||
"""
|
||
from src.services.incident_service import get_incident_service
|
||
|
||
try:
|
||
service = get_incident_service()
|
||
result = await service.trigger_reanalysis(incident_id)
|
||
|
||
if result["already_analyzing"]:
|
||
msg = (
|
||
f"⏳ <b>重診進行中</b>\n\n"
|
||
f"🔖 <code>{html.escape(incident_id)}</code>\n\n"
|
||
f"{html.escape(result['message'])}"
|
||
)
|
||
elif result["triggered"]:
|
||
msg = (
|
||
f"🔄 <b>重診已排程</b>\n\n"
|
||
f"🔖 <code>{html.escape(incident_id)}</code>\n\n"
|
||
f"✅ {html.escape(result['message'])}\n"
|
||
f"AI 分析結果將自動更新事件狀態。"
|
||
)
|
||
else:
|
||
msg = (
|
||
f"⚠️ <b>重診失敗</b>\n\n"
|
||
f"🔖 <code>{html.escape(incident_id)}</code>\n\n"
|
||
f"{html.escape(result['message'])}"
|
||
)
|
||
|
||
await self.send_notification(msg)
|
||
|
||
except Exception as e:
|
||
logger.warning("send_reanalyze_result_failed", incident_id=incident_id, error=str(e))
|
||
await self.send_notification(
|
||
f"⚠️ 重診觸發失敗: {html.escape(str(e)[:100])}"
|
||
)
|
||
|
||
# =========================================================================
|
||
# Sprint 5.1 T1-T6: Data Safety Guardrail 通知場景
|
||
# (2026-04-08 Claude Sonnet 4.6 Asia/Taipei,ADR-062)
|
||
# =========================================================================
|
||
|
||
async def send_guardrail_blocked(
|
||
self,
|
||
service_name: str,
|
||
alertname: str,
|
||
reason: str,
|
||
) -> None:
|
||
"""T1: GUARDRAIL_BLOCKED — 服務屬於 BLOCK 等級,禁止自動修復"""
|
||
try:
|
||
text = (
|
||
"🚫 <b>[服務保護] 自動修復已阻擋</b>\n"
|
||
"━━━━━━━━━━━━━━━━━\n"
|
||
f"服務: <code>{html.escape(service_name)}</code>\n"
|
||
f"告警: <code>{html.escape(alertname)}</code>\n"
|
||
f"原因: {html.escape(reason)}\n"
|
||
"━━━━━━━━━━━━━━━━━\n"
|
||
"⚠️ 請人工評估並手動處理"
|
||
)
|
||
await self.send_notification(text)
|
||
except Exception as e:
|
||
logger.error("t1_guardrail_blocked_notify_failed", service=service_name, error=str(e))
|
||
|
||
async def send_preflight_failed(
|
||
self,
|
||
service_name: str,
|
||
backup_age_hours: float,
|
||
max_age_hours: float,
|
||
backup_name: str | None,
|
||
) -> None:
|
||
"""T2: PRE_FLIGHT_FAILED + BACKUP_TRIGGERED — 備份過期,修復暫停"""
|
||
try:
|
||
backup_status = (
|
||
f"緊急備份: 已啟動 <code>{html.escape(backup_name)}</code>"
|
||
if backup_name
|
||
else "緊急備份: <b>啟動失敗</b>,請人工處理"
|
||
)
|
||
text = (
|
||
"⏸ <b>[Pre-flight 阻擋] 備份已過期,修復暫停</b>\n"
|
||
"━━━━━━━━━━━━━━━━━\n"
|
||
f"服務: <code>{html.escape(service_name)}</code>\n"
|
||
f"備份距今: {backup_age_hours:.1f} 小時(上限 {max_age_hours:.0f} 小時)\n"
|
||
f"{backup_status}\n"
|
||
"━━━━━━━━━━━━━━━━━\n"
|
||
"請等待備份完成後,人工重新評估修復方案"
|
||
)
|
||
await self.send_notification(text)
|
||
except Exception as e:
|
||
logger.error("t2_preflight_failed_notify_failed", service=service_name, error=str(e))
|
||
|
||
async def send_backup_result(
|
||
self,
|
||
backup_name: str,
|
||
success: bool,
|
||
error_msg: str | None = None,
|
||
) -> None:
|
||
"""T3: BACKUP_COMPLETED / BACKUP_FAILED — 緊急備份結果"""
|
||
try:
|
||
if success:
|
||
text = (
|
||
"✅ <b>緊急備份完成</b>\n"
|
||
f"備份: <code>{html.escape(backup_name)}</code>\n"
|
||
"可繼續手動執行修復"
|
||
)
|
||
else:
|
||
err = html.escape(error_msg or "未知錯誤")
|
||
text = (
|
||
"❌ <b>緊急備份失敗</b>\n"
|
||
f"備份: <code>{html.escape(backup_name)}</code>\n"
|
||
f"錯誤: {err}\n"
|
||
"請人工介入,備份異常"
|
||
)
|
||
await self.send_notification(text)
|
||
except Exception as e:
|
||
logger.error("t3_backup_result_notify_failed", backup=backup_name, error=str(e))
|
||
|
||
async def send_multisig_waiting(
    self,
    action: str,
    service_name: str,
    votes_received: int,
    votes_required: int,
    approval_id: str,
) -> None:
    """T4 (APPROVAL_ESCALATED): first vote recorded, waiting for the second.

    Args:
        action: Operation awaiting approval.
        service_name: Target service.
        votes_received: Votes collected so far.
        votes_required: Votes needed to proceed.
        approval_id: Approval identifier shown to reviewers.

    Any exception raised while building or sending is logged and swallowed.
    """
    divider = "━━━━━━━━━━━━━━━━━"
    try:
        rows = [
            "🔐 <b>[MultiSig] 等待第 2 票授權</b>",
            divider,
            f"操作: {html.escape(action)}",
            f"服務: <code>{html.escape(service_name)}</code>",
            "風險: CRITICAL(HITL 雙簽)",
            f"已獲授權: {votes_received}/{votes_required} 票",
            f"審核 ID: <code>{html.escape(approval_id)}</code>",
            divider,
            "請第二位審核者登入確認",
        ]
        await self.send_notification("\n".join(rows))
    except Exception as exc:
        logger.error("t4_multisig_waiting_notify_failed", approval=approval_id, error=str(exc))
|
||
|
||
async def send_multisig_approved(
    self,
    action: str,
    service_name: str,
) -> None:
    """T5: announce that the MultiSig approval is complete (2/2 votes).

    Args:
        action: Operation that was approved.
        service_name: Target service.

    Any exception raised while building or sending is logged and swallowed.
    """
    try:
        rows = [
            "✅ <b>[MultiSig 完成] 雙簽授權通過</b>",
            f"操作: {html.escape(action)}",
            f"服務: <code>{html.escape(service_name)}</code>",
            "授權: 2/2 票 開始執行...",
        ]
        await self.send_notification("\n".join(rows))
    except Exception as exc:
        logger.error("t5_multisig_approved_notify_failed", service=service_name, error=str(exc))
|
||
|
||
async def send_change_applied(
    self,
    operator: str,
    action_description: str,
    timestamp: str,
) -> None:
    """T6 (CHANGE_APPLIED): record a manual change in the chat.

    Args:
        operator: Who performed the change.
        action_description: What was done.
        timestamp: When it happened, as an already formatted string.

    Any exception raised while building or sending is logged and swallowed.
    """
    try:
        rows = [
            "📝 <b>[變更記錄] 手動操作已記錄</b>",
            "━━━━━━━━━━━━━━━━━",
            f"操作者: {html.escape(operator)}",
            f"動作: {html.escape(action_description)}",
            f"時間: {html.escape(timestamp)}",
        ]
        await self.send_notification("\n".join(rows))
    except Exception as exc:
        logger.error("t6_change_applied_notify_failed", operator=operator, error=str(exc))
|
||
|
||
async def send_notification(
|
||
self,
|
||
text: str,
|
||
parse_mode: str = "HTML",
|
||
chat_id: str | int | None = None,
|
||
) -> dict:
|
||
"""
|
||
發送純文字通知
|
||
|
||
Args:
|
||
text: 訊息內容
|
||
parse_mode: 解析模式
|
||
|
||
Returns:
|
||
dict: API 回應
|
||
"""
|
||
payload_text = text[:500]
|
||
payload_parse_mode = parse_mode
|
||
if parse_mode and parse_mode.upper() == "HTML" and len(text) > 500:
|
||
payload_text = _plain_text_from_html(text, limit=500)
|
||
payload_parse_mode = None
|
||
|
||
payload = {
|
||
"chat_id": chat_id or self.alert_chat_id,
|
||
"text": payload_text, # SOUL.md 字數限制
|
||
}
|
||
if payload_parse_mode:
|
||
payload["parse_mode"] = payload_parse_mode
|
||
|
||
try:
|
||
return await self._send_request("sendMessage", payload)
|
||
except TelegramGatewayError as exc:
|
||
if payload_parse_mode and payload_parse_mode.upper() == "HTML" and "HTTP error: 400" in str(exc):
|
||
fallback_payload = {
|
||
"chat_id": chat_id or self.alert_chat_id,
|
||
"text": _plain_text_from_html(text, limit=500),
|
||
}
|
||
return await self._send_request("sendMessage", fallback_payload)
|
||
raise
|
||
|
||
async def _send_html_line_message(
    self,
    lines: list[str],
    *,
    chat_id: str | int | None = None,
    failure_context: str,
    reply_markup: dict | None = None,
) -> None:
    """Send a multi-line HTML message without cutting Telegram tags in half.

    The lines are split into chunks by ``_telegram_html_chunks`` and each
    chunk goes through up to three attempts:

    1. HTML parse mode.
    2. Plain text derived from the chunk.
    3. "Rescue": plain text limited to 3500 characters, flagged with
       ``_skip_incident_thread_reply`` and sent without ``reply_markup``.

    ``reply_markup`` is attached to the first chunk only (attempts 1 and 2).
    Failures are logged, never raised; a chunk that fails all three attempts
    is dropped and the next chunk is still tried.

    Args:
        lines: HTML lines making up the message.
        chat_id: Target chat; falls back to ``self.alert_chat_id`` when falsy.
        failure_context: Label included in every failure log entry.
        reply_markup: Optional keyboard for the first chunk.
    """
    chunks = _telegram_html_chunks(lines)
    total = len(chunks)
    for index, chunk in enumerate(chunks):
        # Attempt 1: the chunk as HTML.
        try:
            html_payload: dict = {
                "chat_id": chat_id or self.alert_chat_id,
                "text": chunk,
                "parse_mode": "HTML",
            }
            if index == 0 and reply_markup:
                html_payload["reply_markup"] = reply_markup
            await self._send_request("sendMessage", html_payload)
            continue
        except Exception as html_err:
            logger.warning(
                "telegram_html_line_message_failed",
                failure_context=failure_context,
                chunk_index=index,
                chunk_count=total,
                error=str(html_err),
            )

        # Attempt 2: same chunk as plain text, keyboard preserved.
        plain_payload: dict = {
            "chat_id": chat_id or self.alert_chat_id,
            "text": _plain_text_from_html(chunk),
        }
        if index == 0 and reply_markup:
            plain_payload["reply_markup"] = reply_markup
        try:
            await self._send_request("sendMessage", plain_payload)
            continue
        except Exception as plain_err:
            logger.warning(
                "telegram_html_line_message_plain_fallback_failed",
                failure_context=failure_context,
                chunk_index=index,
                chunk_count=total,
                error=str(plain_err),
            )

        # Attempt 3: bare rescue message, no keyboard, no thread reply.
        rescue_payload: dict = {
            "chat_id": chat_id or self.alert_chat_id,
            "text": _plain_text_from_html(chunk, limit=3500),
            "_skip_incident_thread_reply": True,
        }
        try:
            await self._send_request("sendMessage", rescue_payload)
        except Exception as rescue_err:
            logger.error(
                "telegram_html_line_message_rescue_failed",
                failure_context=failure_context,
                chunk_index=index,
                chunk_count=total,
                error=str(rescue_err),
            )
|
||
|
||
async def send_alert_notification(
|
||
self,
|
||
text: str,
|
||
parse_mode: str = "HTML",
|
||
reply_markup: dict | None = None,
|
||
) -> dict:
|
||
"""發送告警型純文字通知到 SRE 戰情室群組。"""
|
||
payload: dict = {
|
||
"chat_id": self.alert_chat_id,
|
||
"text": text[:4096],
|
||
"parse_mode": parse_mode,
|
||
}
|
||
if reply_markup:
|
||
payload["reply_markup"] = reply_markup
|
||
return await self._send_request("sendMessage", payload)
|
||
|
||
# =========================================================================
|
||
# 2026-05-04 Claude Sonnet 4.6: send_text 公開 wrapper(修復 drift_adopt_telegram_failed)
|
||
# =========================================================================
|
||
|
||
async def send_text(
|
||
self,
|
||
text: str,
|
||
chat_id: int | str | None = None,
|
||
parse_mode: str = "HTML",
|
||
disable_web_page_preview: bool = True,
|
||
) -> dict:
|
||
"""
|
||
公開 send_text wrapper — 委派至 _send_request('sendMessage', ...)
|
||
|
||
給 drift_adopt_service / drift_remediator / runbook_generator /
|
||
signoz_webhook 等服務使用的通用純文字送出方法。
|
||
預設送往 alert_chat_id(SRE 群組)。
|
||
|
||
Args:
|
||
text: 訊息內容(最多 4096 字元)
|
||
chat_id: 目標 chat ID,None 時使用 alert_chat_id
|
||
parse_mode: 解析模式(預設 HTML)
|
||
disable_web_page_preview: 是否關閉網頁預覽
|
||
|
||
Returns:
|
||
dict: Telegram API 回應
|
||
"""
|
||
payload: dict = {
|
||
"chat_id": chat_id or self.alert_chat_id,
|
||
"text": text[:4096],
|
||
"parse_mode": parse_mode,
|
||
"disable_web_page_preview": disable_web_page_preview,
|
||
}
|
||
return await self._send_request("sendMessage", payload)
|
||
|
||
# =========================================================================
|
||
# 2026-04-24 Claude Sonnet 4.6 (ADR-095 WS4): Hermes NL 回覆
|
||
# =========================================================================
|
||
|
||
async def send_hermes_reply(
|
||
self,
|
||
text: str,
|
||
chat_id: str | int,
|
||
reply_to_message_id: int | None = None,
|
||
) -> dict:
|
||
"""
|
||
傳送 Hermes NL 回覆(長文,最多 4096 字元,純文字模式)。
|
||
|
||
Args:
|
||
text: 回覆內容(由 nl_gateway 已截斷至 4000 字以內)
|
||
chat_id: 目標 chat ID
|
||
reply_to_message_id: 回覆哪則訊息(可選)
|
||
"""
|
||
payload: dict = {
|
||
"chat_id": chat_id,
|
||
"text": text[:4096],
|
||
}
|
||
if reply_to_message_id:
|
||
payload["reply_to_message_id"] = reply_to_message_id
|
||
return await self._send_request("sendMessage", payload)
|
||
|
||
# =========================================================================
|
||
# 2026-04-03 ogt: SRE 戰情室群組三頭政治 (Triumvirate) — ADR-053
|
||
# @tsenyangbot 發告警卡片到群組,OpenClaw/NemoClaw Bot 各自回覆分析
|
||
# =========================================================================
|
||
|
||
async def send_to_group(
    self,
    text: str,
    parse_mode: str = "HTML",
    reply_markup: dict | None = None,
) -> dict:
    """Post a message to the SRE group (``settings.SRE_GROUP_CHAT_ID``).

    Args:
        text: Message body, truncated to 4096 characters.
        parse_mode: Telegram parse mode, passed through unchanged.
        reply_markup: Optional keyboard; omitted from the request when falsy.

    Returns:
        dict: Whatever ``_send_request`` returns, or ``{}`` when no group
        chat id is configured (a warning is logged in that case).
    """
    if not settings.SRE_GROUP_CHAT_ID:
        logger.warning("send_to_group_skipped", reason="SRE_GROUP_CHAT_ID not configured")
        return {}

    keyboard = {"reply_markup": reply_markup} if reply_markup else {}
    request: dict = {
        "chat_id": settings.SRE_GROUP_CHAT_ID,
        "text": text[:4096],
        "parse_mode": parse_mode,
        **keyboard,
    }
    return await self._send_request("sendMessage", request)
|
||
|
||
async def _send_as_bot(
|
||
self,
|
||
token: str,
|
||
chat_id: str,
|
||
text: str,
|
||
reply_to_message_id: int | None = None,
|
||
parse_mode: str = "HTML",
|
||
) -> dict:
|
||
"""
|
||
用指定 Bot Token 發訊息。
|
||
|
||
Args:
|
||
token: Bot Token
|
||
chat_id: 群組 Chat ID
|
||
text: 訊息內容
|
||
reply_to_message_id: 回覆哪則訊息的 message_id
|
||
parse_mode: 解析模式
|
||
|
||
Returns:
|
||
dict: Telegram API 回應
|
||
"""
|
||
if not self._http_client:
|
||
raise TelegramGatewayError("HTTP client not initialized")
|
||
|
||
url = f"{self.TELEGRAM_API_BASE}/bot{token}/sendMessage"
|
||
payload: dict = {
|
||
"chat_id": chat_id,
|
||
"text": text[:4096],
|
||
"parse_mode": parse_mode,
|
||
}
|
||
# 2026-04-03 ogt: supergroup 跨 Bot reply 需用 reply_parameters (Bot API v6.7+)
|
||
# 舊的 reply_to_message_id 在 supergroup 會 400,改用新格式 + allow_sending_without_reply
|
||
if reply_to_message_id:
|
||
payload["reply_parameters"] = {
|
||
"message_id": reply_to_message_id,
|
||
"allow_sending_without_reply": True,
|
||
}
|
||
|
||
response = await self._http_client.post(url, json=payload)
|
||
response.raise_for_status()
|
||
result = response.json()
|
||
result_val = result.get("result") if isinstance(result, dict) else None
|
||
if isinstance(result_val, dict) and "message_id" in result_val:
|
||
await self._mirror_outbound_message(
|
||
method="sendMessage",
|
||
payload=payload,
|
||
provider_message_id=str(result_val["message_id"]),
|
||
)
|
||
return result
|
||
|
||
async def send_as_openclaw(
    self,
    text: str,
    reply_to_message_id: int | None = None,
) -> dict:
    """Speak in the SRE group as the OpenClaw bot.

    Args:
        text: Message body.
        reply_to_message_id: Message to reply to, if any.

    Returns:
        dict: The ``_send_as_bot`` result, or ``{}`` when the OpenClaw token
        or the group chat id is not configured (a warning is logged).
    """
    if not (settings.OPENCLAW_BOT_TOKEN and settings.SRE_GROUP_CHAT_ID):
        logger.warning("send_as_openclaw_skipped", reason="OPENCLAW_BOT_TOKEN or SRE_GROUP_CHAT_ID not configured")
        return {}

    delivery = dict(
        token=settings.OPENCLAW_BOT_TOKEN,
        chat_id=settings.SRE_GROUP_CHAT_ID,
        text=text,
        reply_to_message_id=reply_to_message_id,
    )
    return await self._send_as_bot(**delivery)
|
||
|
||
async def send_as_nemotron(
    self,
    text: str,
    reply_to_message_id: int | None = None,
) -> dict:
    """Speak in the SRE group as the NemoTron bot.

    Args:
        text: Message body.
        reply_to_message_id: Message to reply to, if any.

    Returns:
        dict: The ``_send_as_bot`` result, or ``{}`` when the NemoTron token
        or the group chat id is not configured (a warning is logged).
    """
    if not (settings.NEMOTRON_BOT_TOKEN and settings.SRE_GROUP_CHAT_ID):
        logger.warning("send_as_nemotron_skipped", reason="NEMOTRON_BOT_TOKEN or SRE_GROUP_CHAT_ID not configured")
        return {}

    delivery = dict(
        token=settings.NEMOTRON_BOT_TOKEN,
        chat_id=settings.SRE_GROUP_CHAT_ID,
        text=text,
        reply_to_message_id=reply_to_message_id,
    )
    return await self._send_as_bot(**delivery)
|
||
|
||
async def trigger_group_ai_discussion(
    self,
    alert_message_id: int,
    alert_summary: str,
) -> None:
    """Ask OpenClaw to analyse an alert and post the result in the SRE group.

    Only OpenClaw is invoked here (per the 2026-04-03 instruction that alert
    analysis is OpenClaw's job). Its answer is posted as a reply to the
    alert message.

    The model output is HTML-escaped before being embedded, because the
    message is sent in HTML parse mode and a stray ``<`` or ``&`` in free
    text makes Telegram reject the whole message.

    Failures are logged and never propagate to the caller.

    Args:
        alert_message_id: ``message_id`` of the alert the bot replies to.
        alert_summary: Alert text handed to the model for analysis.
    """
    try:
        from src.services.chat_manager import ChatManager  # noqa: PLC0415
    except ImportError:
        logger.error("trigger_group_ai_discussion_failed", reason="Cannot import ChatManager")
        return

    try:
        chat_mgr = ChatManager()

        # 2026-04-03 ogt: alert analysis is handled by OpenClaw only.
        openclaw_prompt = (
            f"你是 OpenClaw,AWOOOI SRE 戰情室首席 AI,精通 K8s、Prometheus、告警分析。\n"
            f"以下是一則基礎設施告警,請進行 RCA 根因分析並給出 3 點具體建議行動。\n"
            f"繁體中文回應,不超過 300 字:\n\n"
            f"{alert_summary}"
        )

        openclaw_analysis = await chat_mgr._call_openclaw(
            system_prompt="你是 OpenClaw,AWOOOI SRE 戰情室首席 AI。稱呼用戶為「老闆」。",
            user_message=openclaw_prompt,
        )

        if openclaw_analysis and not isinstance(openclaw_analysis, Exception):
            # Only &, < and > need escaping for Telegram HTML; quotes are
            # left alone so the text reads exactly as the model wrote it.
            safe_analysis = html.escape(str(openclaw_analysis), quote=False)
            await self.send_as_openclaw(
                text=f"🦞 <b>OpenClaw 分析</b>\n\n{safe_analysis}",
                reply_to_message_id=alert_message_id,
            )
            logger.info("group_ai_discussion_openclaw_sent")
        else:
            logger.warning("trigger_group_ai_discussion_openclaw_empty")

        logger.info("group_ai_discussion_completed", alert_message_id=alert_message_id)

    except Exception as e:
        # A failed group discussion must not affect the main alert flow.
        logger.error("trigger_group_ai_discussion_failed", error=str(e))
|
||
|
||
async def close(self) -> None:
    """Shut the gateway down.

    Clears the polling flag, cancels the polling and leader tasks that are
    still running (waiting for each cancellation to complete), closes the
    HTTP client if there is one, and marks the gateway as uninitialised.
    """
    # Stop long polling and the leader-election tasks.
    self._polling_active = False
    background_jobs = (self._polling_task, self._leader_task)
    for job in background_jobs:
        if not job or job.done():
            continue
        job.cancel()
        try:
            await job
        except asyncio.CancelledError:
            pass
    self._polling_task = None
    self._leader_task = None

    if self._http_client:
        await self._http_client.aclose()
        self._http_client = None
    self._initialized = False
    logger.info("telegram_gateway_closed")
|
||
|
||
# =========================================================================
|
||
# Long Polling 實作 (Phase 5 內網修復)
|
||
# =========================================================================
|
||
|
||
async def start_long_polling(self) -> None:
    """Start the long-polling background tasks (used instead of a webhook).

    Initialises the gateway if needed, then tries to take the Redis leader
    key with ``SET NX EX``:

    * Key acquired: delete the webhook, reset the update offset and start
      the polling loop plus the lock renewer.
    * Key held by someone else: start the watcher task, which keeps trying
      to take over.

    Does nothing (apart from logging) when initialisation fails or polling
    is already active.
    """
    if not self._initialized and not await self.initialize():
        logger.error("telegram_long_polling_failed", reason="Gateway not initialized")
        return

    if self._polling_active:
        logger.warning("telegram_long_polling_already_running")
        return

    # Try to take the leader lock (NX = set only if the key does not exist).
    redis = await get_redis()
    became_leader = await redis.set(POLLING_LEADER_KEY, self._pod_id, nx=True, ex=POLLING_LEADER_TTL)

    if became_leader:
        # Leader: switch Telegram to getUpdates mode and start polling.
        await self._delete_webhook()

        self._polling_active = True
        self._last_update_id = 0
        self._polling_task = asyncio.create_task(self._polling_loop())
        self._leader_task = asyncio.create_task(self._leader_renewer())

        logger.info(
            "telegram_long_polling_started",
            pod_id=self._pod_id,
            timeout=LONG_POLLING_TIMEOUT,
            chat_id=self.chat_id[:10] + "..." if self.chat_id else "N/A",
        )
        return

    # Not the leader: report who is, then keep watching for a takeover chance.
    lock_holder = await redis.get(POLLING_LEADER_KEY)
    logger.info(
        "telegram_polling_not_leader",
        pod_id=self._pod_id,
        current_leader=lock_holder,
        action="watcher_mode",
    )
    self._leader_task = asyncio.create_task(self._leader_watcher())
|
||
|
||
async def _delete_webhook(self) -> None:
|
||
"""
|
||
刪除現有 Webhook (切換至 Long Polling 模式)
|
||
|
||
統帥鐵律: Webhook 和 Long Polling 不能共存
|
||
必須先刪除 Webhook 才能使用 getUpdates
|
||
"""
|
||
if not self._http_client:
|
||
return
|
||
|
||
try:
|
||
# Step 1: 刪除 Webhook
|
||
url = f"{self.api_url}/deleteWebhook"
|
||
response = await self._http_client.post(url, json={"drop_pending_updates": True})
|
||
result = response.json()
|
||
|
||
if result.get("ok"):
|
||
logger.info(
|
||
"telegram_webhook_deleted",
|
||
description=result.get("description", "Webhook deleted"),
|
||
)
|
||
else:
|
||
logger.warning(
|
||
"telegram_webhook_delete_failed",
|
||
error=result.get("description"),
|
||
)
|
||
|
||
# Step 2: 等待 Telegram 伺服器同步 (避免 409 Conflict)
|
||
await asyncio.sleep(1)
|
||
|
||
# Step 3: 驗證 Webhook 狀態
|
||
info_url = f"{self.api_url}/getWebhookInfo"
|
||
info_response = await self._http_client.get(info_url)
|
||
info_result = info_response.json()
|
||
|
||
webhook_url = info_result.get("result", {}).get("url", "")
|
||
if webhook_url:
|
||
logger.warning(
|
||
"telegram_webhook_still_active",
|
||
url=webhook_url[:50],
|
||
)
|
||
else:
|
||
logger.info("telegram_webhook_confirmed_deleted")
|
||
|
||
except Exception as e:
|
||
logger.error("telegram_webhook_delete_error", error=str(e))
|
||
|
||
async def _polling_loop(self) -> None:
    """Main long-polling loop: fetch updates and dispatch them one by one.

    Runs while ``self._polling_active`` is true.

    * A failure while *processing* one update is logged and the remaining
      updates of the batch are still processed. ``_get_updates`` has already
      advanced the offset past the whole batch, so aborting here would
      silently drop the rest. It also keeps handler errors away from the
      polling-level handlers below.
    * ``httpx.TimeoutException`` from polling is expected; poll again.
    * HTTP 409 from polling means another poller is active: release the
      leader key if this pod still owns it, stop polling and start the
      watcher task.
    * Any other polling error is logged, followed by a retry delay.
    * Cancellation ends the loop.
    """
    logger.info("[Telegram] Long polling started - 神經已接通,等待統帥指令...")

    while self._polling_active:
        try:
            updates = await self._get_updates()

            for update in updates:
                try:
                    await self._process_update(update)
                except Exception as update_err:
                    # Exception does not cover asyncio.CancelledError, so a
                    # cancellation still reaches the outer handler.
                    logger.error(
                        "telegram_update_processing_failed",
                        update_id=update.get("update_id") if isinstance(update, dict) else None,
                        error=str(update_err),
                    )

        except asyncio.CancelledError:
            logger.info("telegram_long_polling_cancelled")
            break

        except httpx.TimeoutException:
            # A long-poll timeout is normal; go straight to the next round.
            continue

        except httpx.HTTPStatusError as e:
            if e.response.status_code == 409:
                # 409 Conflict: another pod is polling, so give up leadership.
                # 2026-04-01 Claude Code: release the lock and let the
                # watchers compete for it instead of aggressively
                # re-acquiring it.
                logger.warning(
                    "telegram_polling_conflict",
                    status=409,
                    pod_id=self._pod_id,
                    action="releasing_leader_lock",
                )
                redis = await get_redis()
                current = await redis.get(POLLING_LEADER_KEY)
                if current == self._pod_id:
                    await redis.delete(POLLING_LEADER_KEY)
                self._polling_active = False
                # The watcher competes again after POLLING_LEADER_WATCH seconds.
                self._leader_task = asyncio.create_task(self._leader_watcher())
                break
            else:
                logger.error("telegram_polling_http_error", status=e.response.status_code)
                await asyncio.sleep(LONG_POLLING_RETRY_DELAY)

        except Exception as e:
            logger.error("telegram_polling_error", error=str(e))
            # Wait before retrying after an unexpected error.
            await asyncio.sleep(LONG_POLLING_RETRY_DELAY)

    logger.info("telegram_long_polling_stopped")
|
||
|
||
async def _leader_renewer(self) -> None:
|
||
"""
|
||
Leader Lock 續約背景任務
|
||
|
||
每 POLLING_LEADER_RENEW 秒更新 Redis TTL,
|
||
確保 Leader 在 Poll 期間持續持有 Lock。
|
||
若 Lock 被搶走,停止 Polling。
|
||
|
||
2026-04-01 Claude Code: 分散式 Leader Election
|
||
"""
|
||
while self._polling_active:
|
||
await asyncio.sleep(POLLING_LEADER_RENEW)
|
||
if not self._polling_active:
|
||
break
|
||
try:
|
||
redis = await get_redis()
|
||
current = await redis.get(POLLING_LEADER_KEY)
|
||
if current != self._pod_id:
|
||
logger.warning(
|
||
"telegram_leader_lock_lost",
|
||
pod_id=self._pod_id,
|
||
current_leader=current,
|
||
)
|
||
self._polling_active = False
|
||
break
|
||
await redis.expire(POLLING_LEADER_KEY, POLLING_LEADER_TTL)
|
||
except Exception as e:
|
||
logger.error("telegram_leader_renew_error", error=str(e))
|
||
|
||
async def _leader_watcher(self) -> None:
|
||
"""
|
||
非 Leader Pod 的接管監控任務
|
||
|
||
每 POLLING_LEADER_WATCH 秒嘗試取得 Leader Lock。
|
||
若原 Leader 宕掉(TTL 過期),此 Pod 接管 Polling。
|
||
|
||
2026-04-01 Claude Code: 分散式 Leader Election
|
||
"""
|
||
while not self._polling_active:
|
||
await asyncio.sleep(POLLING_LEADER_WATCH)
|
||
try:
|
||
redis = await get_redis()
|
||
acquired = await redis.set(
|
||
POLLING_LEADER_KEY, self._pod_id, nx=True, ex=POLLING_LEADER_TTL
|
||
)
|
||
if acquired:
|
||
logger.info(
|
||
"telegram_leader_acquired",
|
||
pod_id=self._pod_id,
|
||
action="starting_polling",
|
||
)
|
||
await self._delete_webhook()
|
||
self._polling_active = True
|
||
self._last_update_id = 0
|
||
self._polling_task = asyncio.create_task(self._polling_loop())
|
||
self._leader_task = asyncio.create_task(self._leader_renewer())
|
||
break
|
||
except asyncio.CancelledError:
|
||
break
|
||
except Exception as e:
|
||
logger.error("telegram_leader_watch_error", error=str(e))
|
||
|
||
async def _get_updates(self) -> list[dict]:
|
||
"""
|
||
呼叫 Telegram getUpdates API
|
||
|
||
Returns:
|
||
list[dict]: 更新列表
|
||
"""
|
||
if not self._http_client:
|
||
return []
|
||
|
||
url = f"{self.api_url}/getUpdates"
|
||
payload = {
|
||
"offset": self._last_update_id + 1,
|
||
"timeout": LONG_POLLING_TIMEOUT,
|
||
"allowed_updates": ["callback_query", "message"], # 監聽按鈕與文字訊息
|
||
}
|
||
|
||
response = await self._http_client.post(
|
||
url,
|
||
json=payload,
|
||
timeout=LONG_POLLING_TIMEOUT + 10, # 比 API timeout 多一點
|
||
)
|
||
response.raise_for_status()
|
||
result = response.json()
|
||
|
||
if not result.get("ok"):
|
||
raise TelegramGatewayError(f"getUpdates failed: {result.get('description')}")
|
||
|
||
updates = result.get("result", [])
|
||
|
||
# 更新 offset
|
||
if updates:
|
||
self._last_update_id = updates[-1]["update_id"]
|
||
|
||
return updates
|
||
|
||
async def _process_update(self, update: dict) -> None:
|
||
"""
|
||
處理單個 Telegram Update
|
||
|
||
Args:
|
||
update: Telegram Update 物件
|
||
"""
|
||
update_id = update.get("update_id")
|
||
callback_query = update.get("callback_query")
|
||
message = update.get("message")
|
||
|
||
if not callback_query and not message:
|
||
logger.debug("telegram_update_ignored", update_id=update_id, reason="unsupported update type")
|
||
return
|
||
|
||
if callback_query:
|
||
await self._handle_callback_query(update_id, callback_query)
|
||
elif message:
|
||
await self._handle_chat_message(update_id, message)
|
||
|
||
async def _handle_callback_query(self, update_id: int, callback_query: dict) -> None:
    """Handle an inline-button click received through long polling.

    Extracts the callback id, data and sender, delegates validation to
    ``handle_callback`` and, when that reports success, applies the
    approve/reject decision through ``_execute_approval_action``.
    Callbacks missing the id, the data or the sender id are logged and
    dropped.

    Args:
        update_id: Id of the enclosing update (used for logging).
        callback_query: Telegram CallbackQuery object.
    """
    query_id = callback_query.get("id")
    payload = callback_query.get("data")
    sender = callback_query.get("from", {})
    sender_id = sender.get("id")

    if not (query_id and payload and sender_id):
        logger.warning("telegram_callback_invalid", update_id=update_id)
        return

    sender_name = sender.get("username") or sender.get("first_name") or str(sender_id)
    card_text = callback_query.get("message", {}).get("text", "")
    card_message_id = callback_query.get("message", {}).get("message_id")

    logger.info(
        "telegram_callback_received",
        update_id=update_id,
        user_id=sender_id,
        username=sender_name,
    )

    # Reuse the existing handle_callback logic.
    outcome = await self.handle_callback(
        callback_query_id=query_id,
        callback_data=payload,
        user_id=sender_id,
        message_id=card_message_id,
        original_text=card_text,
        username=sender_name,
    )

    if not outcome.get("success"):
        return

    # Persist the decision (approve / reject).
    await self._execute_approval_action(
        action=outcome["action"],
        approval_id=outcome["approval_id"],
        user_id=sender_id,
        username=sender_name,
        message_id=card_message_id,
    )
|
||
|
||
async def _handle_chat_message(self, update_id: int, message: dict) -> None:
|
||
"""處理统帥的文字訊息(個人 chat 或 SRE 群組)"""
|
||
text = message.get("text")
|
||
user = message.get("from", {})
|
||
user_id = user.get("id")
|
||
chat_id = message.get("chat", {}).get("id")
|
||
chat_type = message.get("chat", {}).get("type", "private")
|
||
message_id = message.get("message_id")
|
||
username = user.get("username") or user.get("first_name") or str(user_id)
|
||
|
||
# Phase 34 (ADR-067 2026-04-10): 圖片訊息路由
|
||
photos = message.get("photo")
|
||
if photos and user_id:
|
||
if not user.get("is_bot"):
|
||
best = max(photos, key=lambda p: p.get("file_size", 0))
|
||
file_id = best.get("file_id", "")
|
||
caption = message.get("caption", "請用繁體中文描述這張圖片")
|
||
if file_id:
|
||
try:
|
||
from src.services.image_analysis_service import get_image_analysis_service
|
||
svc = get_image_analysis_service()
|
||
await svc.download_and_analyze(
|
||
chat_id=str(chat_id),
|
||
file_id=file_id,
|
||
question=caption,
|
||
)
|
||
except Exception as _img_err:
|
||
logger.warning("image_analysis_polling_failed", error=str(_img_err))
|
||
return
|
||
|
||
if not text or not user_id:
|
||
return
|
||
|
||
# Bot 訊息忽略(避免 Bot 互相觸發無限循環)
|
||
if user.get("is_bot"):
|
||
return
|
||
|
||
logger.info(
|
||
"telegram_chat_received",
|
||
update_id=update_id,
|
||
user_id=user_id,
|
||
username=username,
|
||
chat_type=chat_type,
|
||
text=text[:50],
|
||
)
|
||
|
||
# 1. 群組訊息路由優先 (2026-04-03 ogt: SRE 戰情室群組無需個人白名單)
|
||
# 群組是封閉環境,成員由 Telegram 群組管理員控制,不走個人 whitelist
|
||
is_group = chat_type in ("group", "supergroup")
|
||
is_sre_group = str(chat_id) == str(settings.SRE_GROUP_CHAT_ID)
|
||
|
||
if is_group and is_sre_group:
|
||
reply_to_message = message.get("reply_to_message")
|
||
await self._handle_group_message(text, user_id, username, chat_id, message_id, reply_to_message)
|
||
return
|
||
|
||
# 2. 個人 chat 安全檢查 (ADR-012)
|
||
try:
|
||
interceptor = get_security_interceptor()
|
||
await interceptor.intercept_telegram(user_id)
|
||
except Exception as e:
|
||
logger.warning("telegram_chat_unauthorized", user_id=user_id, error=str(e))
|
||
return
|
||
|
||
# 3. /ai 指令攔截 (Phase 24 C — 2026-04-03 ogt)
|
||
if text.strip().lower().startswith("/ai"):
|
||
whitelist = settings.get_tg_user_whitelist()
|
||
if not whitelist or user_id not in whitelist:
|
||
logger.warning("telegram_ai_command_unauthorized", user_id=user_id, whitelist_empty=not whitelist)
|
||
await self.send_notification("⛔ 未授權:/ai 指令僅限白名單用戶", parse_mode="HTML", chat_id=chat_id)
|
||
return
|
||
from src.services.ai_control import handle_ai_command
|
||
response = await handle_ai_command(text.strip())
|
||
await self.send_notification(response, parse_mode="HTML", chat_id=chat_id)
|
||
logger.info("telegram_ai_command_handled", user_id=user_id, text=text[:50])
|
||
return
|
||
|
||
# 4. 個人 chat — 顯示輸入狀態
|
||
await self._send_chat_action(chat_id, "typing")
|
||
|
||
# 5. ChatManager 處理(個人 chat)
|
||
chat_manager = get_chat_manager()
|
||
response = await chat_manager.generate_response(
|
||
user_id=user_id,
|
||
username=username,
|
||
message_text=text,
|
||
)
|
||
await self.send_notification(response, parse_mode="HTML", chat_id=chat_id)
|
||
|
||
async def _handle_group_message(
|
||
self,
|
||
text: str,
|
||
user_id: int,
|
||
username: str,
|
||
chat_id: int, # noqa: ARG002
|
||
message_id: int | None,
|
||
reply_to_message: dict | None = None,
|
||
) -> None:
|
||
"""
|
||
處理 SRE 群組訊息 (2026-04-03 ogt: Phase 22.6 Triumvirate)
|
||
|
||
路由規則:
|
||
Reply OpenClaw 訊息 → 只有 OpenClaw 回應
|
||
Reply NemoClaw 訊息 → 只有 NemoClaw 回應
|
||
@OpenClawAwoooI_Bot <msg> → 只有 OpenClaw 回應
|
||
@NemoTronAwoooI_Bot <msg> → 只有 NemoClaw 回應
|
||
其他訊息 → 兩個 AI 並行回應
|
||
"""
|
||
# ── 指令路由 (2026-04-03 ogt: 方案B slash commands) ──────────────────
|
||
cmd = text.strip().split()[0].lower().split("@")[0] if text.strip() else ""
|
||
if cmd.startswith("/"):
|
||
await self._handle_group_command(cmd, chat_id, message_id, full_text=text.strip())
|
||
return
|
||
|
||
# ── Hermes NL: @tsenyangbot @mention → Claude Agent SDK 12-Agent (ADR-094/095) ──
|
||
# 2026-04-25 Claude Sonnet 4.6: 接入 polling 路徑,HERMES_NL_ENABLED 控制
|
||
if settings.HERMES_NL_ENABLED:
|
||
_bot_un = getattr(settings, "TELEGRAM_BOT_USERNAME", "tsenyangbot")
|
||
import unicodedata as _uc
|
||
if f"@{_bot_un}".lower() in _uc.normalize("NFKC", text).lower():
|
||
_clean = text.replace(f"@{_bot_un}", "").strip()
|
||
if _clean:
|
||
from src.hermes.nl_gateway import process_nl_message as _nl
|
||
try:
|
||
_reply = await _nl(
|
||
_clean,
|
||
chat_id=str(chat_id),
|
||
user_id=user_id,
|
||
username=username,
|
||
)
|
||
await self.send_hermes_reply(
|
||
text=_reply,
|
||
chat_id=str(chat_id),
|
||
reply_to_message_id=message_id,
|
||
)
|
||
except Exception as _hermes_err:
|
||
logger.error("hermes_nl_polling_failed", error=str(_hermes_err))
|
||
return
|
||
|
||
from src.services.chat_manager import get_chat_manager as _get_cm
|
||
chat_mgr = _get_cm()
|
||
|
||
# 全形/半形統一化後比較
|
||
import unicodedata
|
||
text_normalized = unicodedata.normalize("NFKC", text).lower()
|
||
|
||
# Reply 路由: 若 Reply 的是 Bot 訊息,直接認定目標 AI (2026-04-03 ogt)
|
||
if reply_to_message:
|
||
replied_from = reply_to_message.get("from", {})
|
||
if replied_from.get("is_bot"):
|
||
replied_username = (replied_from.get("username") or "").lower()
|
||
if "openclawawoooi" in replied_username:
|
||
mention_openclaw, mention_nemo = True, False
|
||
elif "nemotronawoooi" in replied_username:
|
||
mention_openclaw, mention_nemo = False, True
|
||
else:
|
||
mention_openclaw = "@openclawawoooi_bot" in text_normalized or "小o" in text_normalized
|
||
mention_nemo = "@nemotronawoooi_bot" in text_normalized or "小賀" in text_normalized or "小贺" in text_normalized
|
||
else:
|
||
mention_openclaw = "@openclawawoooi_bot" in text_normalized or "小o" in text_normalized
|
||
mention_nemo = "@nemotronawoooi_bot" in text_normalized or "小賀" in text_normalized or "小贺" in text_normalized
|
||
else:
|
||
# 別名: 小O / 小o (含全形O) → OpenClaw; 小賀 / 小贺 → NemoClaw
|
||
mention_openclaw = "@openclawawoooi_bot" in text_normalized or "小o" in text_normalized
|
||
mention_nemo = "@nemotronawoooi_bot" in text_normalized or "小賀" in text_normalized or "小贺" in text_normalized
|
||
|
||
# 去掉 @ mention 與別名,取出純訊息
|
||
clean_text = unicodedata.normalize("NFKC", text)
|
||
for token in ["@openclawawoooi_bot", "@OpenClawAwoooI_Bot", "@nemotronawoooi_bot", "@NemoTronAwoooI_Bot",
|
||
"小O", "小o", "小O", "小o", "小賀", "小贺"]:
|
||
clean_text = clean_text.replace(token, "").strip()
|
||
if not clean_text:
|
||
clean_text = text
|
||
|
||
context = await chat_mgr.get_system_context()
|
||
|
||
def _clean_ai_reply(text: str, max_chars: int = 600) -> str:
    """Strip Markdown and reasoning markup from an AI reply and cap its length.

    Args:
        text: Raw model output.
        max_chars: Maximum number of characters kept before the ellipsis is
            appended.

    Returns:
        The cleaned reply. Empty when the reply held nothing but reasoning
        markup, so callers can substitute their own "no response" text.
    """
    import re

    # Markdown bold / italic written with asterisks.
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
    text = re.sub(r'\*(.+?)\*', r'\1', text)
    # Underscore emphasis only counts at word boundaries; without the
    # look-arounds, snake_case identifiers such as pod or metric names
    # would silently lose their underscores.
    text = re.sub(r'(?<!\w)__(.+?)__(?!\w)', r'\1', text)
    text = re.sub(r'(?<!\w)_(.+?)_(?!\w)', r'\1', text)
    # Markdown headers (#, ##, ###).
    text = re.sub(r'^#{1,3}\s+', '', text, flags=re.MULTILINE)
    # Reasoning blocks emitted by deepseek-r1.
    text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
    # A reply cut off mid-reasoning has no closing tag; drop the dangling
    # block instead of leaking raw chain-of-thought (and a stray tag).
    text = re.sub(r'<think>.*', '', text, flags=re.DOTALL).strip()
    # Truncate, preferring to cut at the last line break inside the window.
    if len(text) > max_chars:
        text = text[:max_chars].rsplit('\n', 1)[0] + '…'
    return text.strip()
|
||
|
||
if mention_openclaw and not mention_nemo:
|
||
# 只 OpenClaw 回應
|
||
result = await chat_mgr._call_openclaw(
|
||
f"{context}\n用戶 {username} 在 SRE 戰情室問你:",
|
||
clean_text,
|
||
)
|
||
body = _clean_ai_reply(result) if result else '🔴 無響應'
|
||
await self.send_as_openclaw(
|
||
text=f"🦞 <b>OpenClaw</b>\n\n{body}",
|
||
reply_to_message_id=message_id,
|
||
)
|
||
|
||
elif mention_nemo and not mention_openclaw:
|
||
# 只 NemoClaw 回應
|
||
result = await chat_mgr._call_nemotron(
|
||
f"{context}\n用戶 {username} 在 SRE 戰情室問你:",
|
||
clean_text,
|
||
)
|
||
body = (_clean_ai_reply(result) if result else '') or '🔴 無響應 (deepseek-r1 超時或思考截斷)'
|
||
await self.send_as_nemotron(
|
||
text=f"🤖 <b>NemoClaw</b>\n\n{body}",
|
||
reply_to_message_id=message_id,
|
||
)
|
||
|
||
else:
|
||
# 兩個 AI 並行回應,完成後互相評論
|
||
oc_task = asyncio.create_task(
|
||
chat_mgr._call_openclaw(f"{context}\n用戶 {username} 在 SRE 戰情室:", clean_text)
|
||
)
|
||
nemo_task = asyncio.create_task(
|
||
chat_mgr._call_nemotron(f"{context}\n用戶 {username} 在 SRE 戰情室:", clean_text)
|
||
)
|
||
oc_result, nemo_result = await asyncio.gather(oc_task, nemo_task, return_exceptions=True)
|
||
|
||
if oc_result and not isinstance(oc_result, Exception):
|
||
await self.send_as_openclaw(
|
||
text=f"🦞 <b>OpenClaw</b>\n\n{_clean_ai_reply(oc_result)}",
|
||
reply_to_message_id=message_id,
|
||
)
|
||
|
||
if nemo_result and not isinstance(nemo_result, Exception):
|
||
nemo_body = _clean_ai_reply(nemo_result) or "🔴 回覆清理後為空 (deepseek-r1 思考超時)"
|
||
await self.send_as_nemotron(
|
||
text=f"🤖 <b>NemoClaw</b>\n\n{nemo_body}",
|
||
reply_to_message_id=message_id,
|
||
)
|
||
|
||
logger.info("group_message_handled", user_id=user_id, text=text[:50])
|
||
|
||
async def _handle_group_command(self, cmd: str, _chat_id: int, message_id: int | None, full_text: str = "") -> None:
    """Handle a slash command posted in the SRE group chat.

    Supported commands:
        /status     Pod health summary of the ``awoooi-prod`` namespace.
        /incidents  Active incidents (first 10).
        /cost       AI spend for the current month, read from Redis.
        /pods       Problem pods (first 15).
        /rag        Semantic query against the RAG knowledge base.
        /help       Command reference.

    Every reply is sent through ``send_as_openclaw`` as a reply to
    ``message_id``. Replies carry HTML markup, so all dynamic plain text
    (exception messages, pod names, the user's question) is HTML-escaped
    before interpolation; an unescaped ``<`` or ``&`` would otherwise make
    Telegram reject the whole message.

    Args:
        cmd: The command token, e.g. ``"/status"``.
        _chat_id: Originating chat id (unused).
        message_id: Message to reply to, if any.
        full_text: Full message text; only ``/rag`` reads it for its argument.
    """
    from src.repositories.k8s_repository import get_k8s_repository
    from src.repositories.incident_repository import get_incident_repository
    from src.core.redis_client import get_redis
    from src.utils.timezone import now_taipei

    def _esc(value: object) -> str:
        """Escape arbitrary text for Telegram HTML parse mode."""
        return html.escape(str(value), quote=False)

    if cmd == "/status":
        try:
            k8s = get_k8s_repository()
            s = await k8s.get_pod_status_summary(namespace="awoooi-prod")
            running, total = s.get("running", 0), s.get("total", 0)
            problems = s.get("problem_pods", [])
            lines = ["<b>🖥 Cluster 狀態</b>", f"• Pods: {running}/{total} Running"]
            if problems:
                lines.append(f"• 異常: {len(problems)} 個")
                lines.extend(f" ⚠️ {_esc(p)}" for p in problems[:5])
            else:
                lines.append("• 全部正常 ✅")
            msg = "\n".join(lines)
        except Exception as e:
            msg = f"<b>🖥 Cluster 狀態</b>\n⚠️ 無法取得: {_esc(e)}"
        await self.send_as_openclaw(text=msg, reply_to_message_id=message_id)

    elif cmd == "/incidents":
        try:
            repo = get_incident_repository()
            incidents = await repo.get_active()
            if incidents:
                lines = ["<b>🚨 活躍告警</b>"]
                lines.extend(
                    f"• <code>{inc.incident_id}</code> SEV{inc.severity.value} — {inc.status.value}"
                    for inc in incidents[:10]
                )
                msg = "\n".join(lines)
            else:
                msg = "<b>🚨 活躍告警</b>\n✅ 目前無告警"
        except Exception as e:
            msg = f"<b>🚨 活躍告警</b>\n⚠️ 無法取得: {_esc(e)}"
        await self.send_as_openclaw(text=msg, reply_to_message_id=message_id)

    elif cmd == "/cost":
        redis = get_redis()
        month = now_taipei().strftime("%Y-%m")
        try:
            # A missing key counts as zero spend.
            gemini_cost = float(await redis.get(f"gemini_cost:{month}") or 0)
            claude_cost = float(await redis.get(f"claude_cost:{month}") or 0)
            total = gemini_cost + claude_cost
            msg = (
                f"<b>💰 {month} AI 費用統計</b>\n"
                f"• 🦞 OpenClaw (Gemini Flash-Lite): <b>${gemini_cost:.4f}</b> / $10.00 上限\n"
                f"• 🤖 NemoClaw (Claude Haiku 4.5): <b>${claude_cost:.4f}</b>\n"
                f"• 合計: <b>${total:.4f}</b>"
            )
        except Exception as e:
            msg = f"<b>💰 費用統計</b>\n⚠️ 無法取得: {_esc(e)}"
        await self.send_as_openclaw(text=msg, reply_to_message_id=message_id)

    elif cmd == "/pods":
        try:
            k8s = get_k8s_repository()
            s = await k8s.get_pod_status_summary(namespace="awoooi-prod")
            problems = s.get("problem_pods", [])
            if problems:
                lines = [f"<b>⚠️ 異常 Pod ({len(problems)} 個)</b>"]
                lines.extend(f"• <code>{_esc(p)}</code>" for p in problems[:15])
                msg = "\n".join(lines)
            else:
                msg = "<b>⚠️ 異常 Pod</b>\n✅ 全部 Pod 正常"
        except Exception as e:
            msg = f"<b>⚠️ 異常 Pod</b>\n⚠️ 無法取得: {_esc(e)}"
        await self.send_as_openclaw(text=msg, reply_to_message_id=message_id)

    elif cmd == "/rag":
        # /rag <question>: semantic query against the knowledge base (ADR-067 Phase 33).
        parts = full_text.split(None, 1)
        if len(parts) < 2 or not parts[1].strip():
            await self.send_as_openclaw(
                text="<b>📚 RAG 知識庫查詢</b>\n用法: <code>/rag 你的問題</code>\n例如: <code>/rag 什麼是 ADR-067?</code>",
                reply_to_message_id=message_id,
            )
            return
        question = parts[1].strip()
        # The question is untrusted user input: truncate first, then escape.
        shown_question = _esc(question[:80])
        await self.send_as_openclaw(
            text=f"<b>📚 查詢知識庫中...</b>\n<code>{shown_question}</code>",
            reply_to_message_id=message_id,
        )
        try:
            from src.services.knowledge_rag_service import get_knowledge_rag_service

            svc = get_knowledge_rag_service()
            answer = await svc.query(question, top_k=5)
            # NOTE(review): `answer` is interpolated as-is because it may already
            # carry Telegram HTML; confirm with knowledge_rag_service and escape
            # here if it is plain text.
            msg = f"<b>📚 RAG 知識庫</b>\n<i>Q: {shown_question}</i>\n\n{answer}"
        except Exception as e:
            logger.warning("rag_telegram_query_failed", error=str(e))
            msg = f"<b>📚 RAG 查詢失敗</b>\n{_esc(e)}"
        await self.send_as_openclaw(text=msg, reply_to_message_id=message_id)

    elif cmd == "/help":
        msg = (
            "<b>🤖 SRE 戰情室指令</b>\n\n"
            "/status — 查詢 K8s Cluster 狀態\n"
            "/incidents — 列出活躍告警\n"
            "/cost — 查詢本月 AI 費用\n"
            "/pods — 列出異常 Pod\n"
            # The placeholder's angle brackets must be entities; a literal
            # "<問題>" is parsed as an unsupported HTML tag.
            "/rag &lt;問題&gt; — 查詢 RAG 知識庫\n"
            "/help — 顯示此說明\n\n"
            "<b>對話方式:</b>\n"
            "• 直接輸入 → 小O + 小賀 同時回應\n"
            "• 小O 或 @OpenClawAwoooI_Bot → 只有 OpenClaw\n"
            "• 小賀 或 @NemoTronAwoooI_Bot → 只有 NemoClaw\n"
            "• Reply 某個 Bot 的訊息 → 只有那個 Bot 回應"
        )
        await self.send_as_openclaw(text=msg, reply_to_message_id=message_id)

    else:
        logger.debug("group_unknown_command", cmd=cmd)
|
||
|
||
async def _send_chat_action(self, chat_id: int, action: str) -> None:
    """Send a chat action (e.g. ``typing``) to a chat, best effort.

    Does nothing when no HTTP client is available. Request failures are
    logged at debug level and otherwise ignored, since the indicator is purely
    cosmetic. Task cancellation is deliberately NOT swallowed.

    Args:
        chat_id: Target chat id.
        action: Telegram chat action name.
    """
    if not self._http_client:
        return
    try:
        url = f"{self.api_url}/sendChatAction"
        await self._http_client.post(url, json={"chat_id": chat_id, "action": action})
    except Exception as e:
        # `except Exception` rather than a bare `except`, so that
        # asyncio.CancelledError / KeyboardInterrupt still propagate.
        logger.debug("telegram_send_chat_action_failed", chat_id=chat_id, action=action, error=str(e))
|
||
|
||
async def _notify_approval_result(
    self,
    message_id: int | None,
    incident_id: str,
    action: str,
    username: str,
    execution_triggered: bool,
) -> None:
    """Reflect an approve/reject decision on the original Telegram alert.

    Strategy:
        1. ``editMessageReplyMarkup``: drop the approve/reject buttons and keep
           only the informational buttons.
        2. ``sendMessage`` as a reply to the original alert with a status line.
        3. If the original message id cannot be resolved, or the reply fails,
           fall back to ``send_alert_notification``.

    Every step is best effort: failures are logged and never raised.

    Args:
        message_id: Id of the alert message, when the caller knows it.
        incident_id: Incident identifier used for the Redis / DB lookup and for
            the button callback data.
        action: ``"approve"`` renders the approved status; any other value
            renders the rejected status.
        username: Display name of the decider (HTML-escaped here).
        execution_triggered: Accepted for interface compatibility; not read.
            An approval always shows the "executing" suffix, and the real
            outcome is reported separately.
    """
    chat_id = self.alert_chat_id
    if not chat_id:
        return

    # Resolve the original alert message id: caller value, then Redis, then DB.
    orig_msg_id = message_id
    if not orig_msg_id:
        try:
            # NOTE(review): other call sites in this file use get_redis()
            # without await; kept as-is here, confirm which form is intended.
            redis = await get_redis()
            _val = await redis.get(f"tg_msg:{incident_id}")
            if _val:
                orig_msg_id = int(_val)
            else:
                # DB fallback
                from src.services.approval_db import get_approval_service as _svc

                _approvals = await _svc().get_all_approvals(incident_id=incident_id)
                if _approvals and _approvals[0].telegram_message_id:
                    orig_msg_id = _approvals[0].telegram_message_id
        except Exception as _e:
            # The lookup stays best effort, but must not fail silently:
            # without the id we lose the in-place update and fall back to a
            # fresh notification.
            logger.warning("notify_approval_message_lookup_failed", incident_id=incident_id, error=str(_e))

    safe_username = html.escape(username)
    if action == "approve":
        status_emoji = "✅"
        status_text = f"<b>已批准</b> by {safe_username}"
        # Always show "executing" after approval; the actual result is
        # appended later by the execution-result reply.
        suffix = "⚡ 執行中..."
    else:
        status_emoji = "❌"
        status_text = f"<b>已拒絕</b> by {safe_username}"
        suffix = ""

    status_line = f"{status_emoji} {status_text} {suffix}".strip()

    if orig_msg_id:
        try:
            # 1. Replace the keyboard with the informational buttons only.
            info_buttons = [[
                {"text": "📋 詳情", "callback_data": f"detail:{incident_id}"},
                {"text": "📊 歷史", "callback_data": f"history:{incident_id}"},
            ]]
            awooop_row = _awooop_runs_button_row(incident_id)
            if awooop_row:
                info_buttons.append(awooop_row)
            await self._send_request(
                "editMessageReplyMarkup",
                {
                    "chat_id": chat_id,
                    "message_id": orig_msg_id,
                    "reply_markup": {"inline_keyboard": info_buttons},
                },
            )
        except Exception as _e:
            logger.warning("notify_approval_edit_keyboard_failed", incident_id=incident_id, error=str(_e))

        try:
            # 2. Reply to the original alert with the status line.
            await self._send_request(
                "sendMessage",
                {
                    "chat_id": chat_id,
                    "text": status_line,
                    "parse_mode": "HTML",
                    "reply_to_message_id": orig_msg_id,
                },
            )
            return
        except Exception as _e:
            logger.warning("notify_approval_reply_failed", incident_id=incident_id, error=str(_e))

    # 3. Fallback: post the status as a standalone notification.
    try:
        await self.send_alert_notification(status_line, parse_mode="HTML")
    except Exception as _e:
        logger.warning("notify_approval_fallback_failed", incident_id=incident_id, error=str(_e))
|
||
|
||
async def _execute_approval_action(
    self,
    action: str,
    approval_id: str,
    user_id: int,
    username: str,
    message_id: int,
) -> None:
    """Apply a Telegram approval decision and trigger follow-up work.

    Args:
        action: ``"approve"``, ``"reject"`` or ``"tune"``. Other values are
            ignored.
        approval_id: Approval UUID, or an incident id (e.g. ``INC-…``) that is
            resolved to the first approval returned for that incident.
        user_id: Telegram user id of the decider.
        username: Display name of the decider.
        message_id: Telegram message that carried the buttons; used when
            replying in the chat.

    Never raises: any failure is logged as ``telegram_approval_action_failed``.
    """
    from uuid import UUID

    from src.services.approval_db import get_approval_service

    try:
        service = get_approval_service()

        # approval_id may be an incident id rather than a UUID (the decision
        # manager passes incident.incident_id); resolve it via the service.
        approval_uuid: UUID | None = None
        try:
            approval_uuid = UUID(approval_id)
        except ValueError:
            pending_list = await service.get_all_approvals(incident_id=approval_id)
            if not pending_list:
                logger.warning(
                    "telegram_approval_not_found_by_incident",
                    approval_id=approval_id,
                )
                return
            first_id = pending_list[0].id
            approval_uuid = UUID(first_id) if isinstance(first_id, str) else first_id

        if action == "approve":
            approval, message, execution_triggered = await service.sign_approval(
                approval_id=approval_uuid,
                signer_id=f"tg_{user_id}",
                signer_name=username,
                comment="Telegram 簽核 (Long Polling)",
            )

            if not approval:
                # Previously dropped silently; surface the service's message.
                logger.warning(
                    "telegram_approval_sign_no_result",
                    approval_id=approval_id,
                    user_id=user_id,
                    message=message,
                )
                return

            from src.models.approval import ApprovalStatus

            status_val = approval.status.value if hasattr(approval.status, "value") else str(approval.status)
            logger.info(
                "telegram_approval_signed_via_polling",
                approval_id=approval_id,
                user_id=user_id,
                status=status_val,
                execution_triggered=execution_triggered,
            )

            # Only a record that is now APPROVED proceeds to execution; the
            # "already resolved" branch below tells the user about any other
            # status instead of showing "executing".
            if approval.status != ApprovalStatus.APPROVED:
                try:
                    await self._send_request("sendMessage", {
                        "chat_id": self.alert_chat_id,
                        "text": f"ℹ️ 此告警已處理(狀態:{status_val}),無法重複批准 by @{username}",
                        "reply_to_message_id": message_id,
                    })
                except Exception as _ne:
                    logger.warning("telegram_approval_already_resolved_notify_failed", error=str(_ne))
                return

            await self._notify_approval_result(
                message_id=message_id,
                incident_id=approval_id,
                action="approve",
                username=username,
                execution_triggered=execution_triggered,
            )

            # Gate on status == APPROVED (not execution_triggered, which is
            # unreliable under races) and take a Redis lock so REST and
            # Telegram approvals cannot both start the execution.
            from src.core.redis_client import get_redis
            from src.services.approval_execution import get_execution_service

            _redis = get_redis()
            _lock_key = f"exec:{approval.id}"
            # SET NX EX 60: at most one execution per approval within 60s.
            _acquired = await _redis.set(_lock_key, "1", nx=True, ex=60)
            if _acquired:
                _exec_task = asyncio.create_task(
                    get_execution_service().execute_approved_action(approval)
                )
                # The event loop only keeps weak references to tasks; hold a
                # strong one until completion so the task cannot be
                # garbage-collected mid-execution.
                _pending = getattr(self, "_approval_exec_tasks", None)
                if _pending is None:
                    _pending = set()
                    self._approval_exec_tasks = _pending
                _pending.add(_exec_task)

                def _on_exec_done(task: asyncio.Task) -> None:
                    """Release the task reference and log a failed execution."""
                    _pending.discard(task)
                    if task.cancelled():
                        return
                    exc = task.exception()
                    if exc is not None:
                        logger.error(
                            "telegram_approval_execution_failed",
                            approval_id=approval_id,
                            error=str(exc),
                        )

                _exec_task.add_done_callback(_on_exec_done)
                logger.info(
                    "telegram_approval_execution_triggered",
                    approval_id=approval_id,
                    action=approval.action,
                    gate="status=APPROVED",
                )
            else:
                logger.info(
                    "telegram_approval_execution_skipped_lock_held",
                    approval_id=approval_id,
                    reason="另一路徑 (REST/自動) 已取得 exec lock",
                )

        elif action == "reject":
            approval, message = await service.reject_approval(
                approval_id=approval_uuid,
                rejector_id=f"tg_{user_id}",
                rejector_name=username,
                reason="Telegram 拒絕 (Long Polling)",
            )

            if not approval:
                # Previously dropped silently; surface the service's message.
                logger.warning(
                    "telegram_approval_reject_no_result",
                    approval_id=approval_id,
                    user_id=user_id,
                    message=message,
                )
                return

            logger.info(
                "telegram_approval_rejected_via_polling",
                approval_id=approval_id,
                user_id=user_id,
            )
            await self._notify_approval_result(
                message_id=message_id,
                incident_id=approval_id,
                action="reject",
                username=username,
                execution_triggered=False,
            )
            # Propagate the rejection to the incident; failures are logged only.
            try:
                from src.services.incident_approval_service import (
                    get_incident_approval_service,
                )

                await get_incident_approval_service().on_approval_status_change(
                    approval_id=str(approval_uuid),
                    new_status="rejected",
                )
                logger.info(
                    "telegram_rejection_incident_synced_via_polling",
                    approval_id=str(approval_uuid),
                    incident_id=getattr(approval, "incident_id", None),
                )
            except Exception as _sync_e:
                logger.warning(
                    "telegram_rejection_incident_sync_failed_via_polling",
                    approval_id=str(approval_uuid),
                    incident_id=getattr(approval, "incident_id", None),
                    error=str(_sync_e),
                )

        elif action == "tune":
            # Only logged here; no tuning is performed by this method.
            logger.info(
                "telegram_auto_tuning_via_polling",
                approval_id=approval_id,
                user_id=user_id,
            )

    except Exception as e:
        logger.error(
            "telegram_approval_action_failed",
            action=action,
            approval_id=approval_id,
            error=str(e),
        )
|
||
|
||
|
||
# =============================================================================
|
||
# Phase 6.5: 心跳監控方法
|
||
# =============================================================================
|
||
|
||
async def _check_nemotron_health(self) -> tuple[bool, str]:
    """Probe the NVIDIA NIM endpoint backing Nemotron.

    Issues a GET against the ``/v1/models`` listing, authenticated with
    ``NVIDIA_API_KEY``, using a 25 second client timeout.

    Returns:
        ``(is_healthy, status_text)``. Healthy only on HTTP 200; a missing
        API key, a non-200 status, a timeout or any other error yields
        ``False`` with a short description.
    """
    import httpx

    from src.core.config import get_settings

    nvidia_key = get_settings().NVIDIA_API_KEY
    if not nvidia_key:
        return False, "❌ NVIDIA_API_KEY 未設定"

    models_url = "https://integrate.api.nvidia.com/v1/models"
    auth_headers = {"Authorization": f"Bearer {nvidia_key}"}

    try:
        async with httpx.AsyncClient(timeout=25.0) as http:
            response = await http.get(models_url, headers=auth_headers)
        status = response.status_code
        if status != 200:
            return False, f"❌ HTTP {status}"
        return True, "✅ 正常"
    except httpx.TimeoutException:
        return False, "⚠️ 超時 (>25s)"
    except Exception as err:
        # Keep the status text short enough for the report line.
        return False, f"❌ {str(err)[:40]}"
|
||
|
||
async def send_heartbeat(self) -> bool:
    """Send the aggregated heartbeat report to the SRE group chat.

    Flow:
        1. Slot-based replica dedup: ``SET NX EX`` on
           ``heartbeat:slot:{slot_id}`` (30 minute slots). The key is left to
           expire, never released, so one slot has exactly one sender.
        2. Collect the report via ``HeartbeatReportService``.
        3. Noise reduction:
           - no warnings: at most one "alive" message per 6 hours;
           - warnings identical to the last delivered set: skipped;
           - warnings changed: sent immediately.
        4. Send to the SRE group, or to the personal channel with a warning
           banner when ``SRE_GROUP_CHAT_ID`` is not configured.
        5. Only after the send call returns are the noise-reduction markers
           written, so a failed send cannot suppress later reports.

    Returns:
        ``True`` when the report was sent or intentionally skipped, ``False``
        when any step raised (the error is logged).
    """
    try:
        if not self._initialized:
            await self.initialize()

        from src.core.redis_client import get_redis
        from src.services.heartbeat_report_service import (
            HeartbeatReportService,
            report_to_telegram_html,
        )

        # One sender per 30 minute slot across all replicas.
        interval_seconds = 30 * 60
        slot_id = int(datetime.now(UTC).timestamp() / interval_seconds)
        slot_key = f"heartbeat:slot:{slot_id}"

        redis_client = get_redis()
        acquired = await redis_client.set(slot_key, "1", nx=True, ex=interval_seconds)
        if not acquired:
            logger.debug("heartbeat_skipped_slot_taken", slot_id=slot_id)
            return True

        report = await HeartbeatReportService().collect()

        SILENT_REPORT_INTERVAL_HOURS = 6
        WARNINGS_HASH_TTL = 24 * 3600
        silent_key = "heartbeat:silent_last_sent"
        warnings_hash_key = "heartbeat:warnings_hash"

        # Order-independent fingerprint of the warning set. md5 is used as a
        # plain checksum here, hence usedforsecurity=False.
        warnings_str = "|".join(sorted(report.warnings))
        warnings_hash = hashlib.md5(
            warnings_str.encode(), usedforsecurity=False
        ).hexdigest()[:12]
        is_healthy = not report.warnings

        if is_healthy:
            # Healthy: one "alive" signal per 6 hours.
            if await redis_client.exists(silent_key):
                logger.debug(
                    "telegram_heartbeat_skipped_silent_recent",
                    slot_id=slot_id,
                )
                return True
        else:
            # Unhealthy: skip when nothing changed since the last delivery.
            last_hash_raw = await redis_client.get(warnings_hash_key)
            last_hash = (
                last_hash_raw.decode() if isinstance(last_hash_raw, bytes)
                else last_hash_raw
            )
            if last_hash == warnings_hash:
                logger.debug(
                    "telegram_heartbeat_skipped_warnings_unchanged",
                    slot_id=slot_id,
                    warnings_hash=warnings_hash,
                )
                return True

        text = report_to_telegram_html(report)

        if settings.SRE_GROUP_CHAT_ID:
            await self.send_to_group(text=text)
        else:
            # Group chat id not injected: fall back to the personal channel
            # and say so in the message.
            fallback = (
                "⚠️ <b>SRE_GROUP_CHAT_ID 未設定</b>,心跳報告暫發到個人頻道\n\n"
                + text
            )
            await self.send_notification(fallback)

        # Record the markers only now. Writing them before the send meant a
        # failed delivery still muted the same report for 6h / 24h.
        if is_healthy:
            await redis_client.setex(
                silent_key, SILENT_REPORT_INTERVAL_HOURS * 3600, "1",
            )
            # Healthy again: forget the old warning fingerprint so the next
            # problem is pushed immediately.
            await redis_client.delete(warnings_hash_key)
        else:
            await redis_client.setex(
                warnings_hash_key, WARNINGS_HASH_TTL, warnings_hash,
            )
            # Clear the "alive" marker set while the system was healthy.
            await redis_client.delete(silent_key)

        self._last_message_time = datetime.now(UTC)
        logger.info(
            "telegram_heartbeat_sent",
            warnings=len(report.warnings),
            warnings_hash=warnings_hash,
            has_sre_group=bool(settings.SRE_GROUP_CHAT_ID),
        )

        return True

    except Exception as e:
        logger.error("telegram_heartbeat_failed", error=str(e))
        return False
|
||
|
||
async def start_heartbeat_monitor(
    self,
    heartbeat_interval_minutes: int = 30,
    silence_threshold_hours: int = 2,
) -> None:
    """Launch the heartbeat loop as a background task.

    A second call while the monitor is already active only logs a warning.

    Args:
        heartbeat_interval_minutes: Minutes between heartbeats (default 30).
        silence_threshold_hours: Silence alert threshold in hours (default 2);
            forwarded to the loop unchanged.
    """
    if not self._heartbeat_active:
        self._heartbeat_active = True
        loop_coro = self._heartbeat_loop(heartbeat_interval_minutes, silence_threshold_hours)
        self._heartbeat_task = asyncio.create_task(loop_coro)

        logger.info(
            "telegram_heartbeat_monitor_started",
            interval_minutes=heartbeat_interval_minutes,
            silence_threshold_hours=silence_threshold_hours,
        )
    else:
        logger.warning("telegram_heartbeat_already_running")
|
||
|
||
async def _heartbeat_loop(
|
||
self,
|
||
interval_minutes: int,
|
||
_silence_hours: int, # 保留參數簽名相容性,沉默判斷已整合進 HeartbeatReport.warnings
|
||
) -> None:
|
||
"""
|
||
心跳監控循環
|
||
|
||
ADR-073 重構 (2026-04-12 ogt):
|
||
- 移除額外沉默告警多發邏輯(已整合進 HeartbeatReport.warnings)
|
||
- send_heartbeat() 內部有 RedisLock,2 個 replica 各自跑 loop 也只發一條
|
||
"""
|
||
interval_seconds = interval_minutes * 60
|
||
|
||
# 對齊到下一個整點倍數(例如 interval=30 → 對齊到 :00 或 :30)
|
||
# 避免多 replica 因啟動時間不同而各自發送
|
||
now_ts = datetime.now(UTC).timestamp()
|
||
next_slot = (int(now_ts / interval_seconds) + 1) * interval_seconds
|
||
wait_seconds = next_slot - now_ts
|
||
try:
|
||
await asyncio.sleep(wait_seconds)
|
||
except asyncio.CancelledError:
|
||
return
|
||
|
||
while self._heartbeat_active:
|
||
try:
|
||
await self.send_heartbeat()
|
||
await asyncio.sleep(interval_seconds)
|
||
except asyncio.CancelledError:
|
||
break
|
||
except Exception as e:
|
||
logger.error("telegram_heartbeat_loop_error", error=str(e))
|
||
await asyncio.sleep(60)
|
||
|
||
async def stop_heartbeat_monitor(self) -> None:
    """Stop the heartbeat loop and wait for its task to finish cancelling."""
    self._heartbeat_active = False

    task = self._heartbeat_task
    still_running = bool(task) and not task.done()
    if still_running:
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            # Expected result of the cancel() above.
            pass

    self._heartbeat_task = None
    logger.info("telegram_heartbeat_monitor_stopped")
|
||
|
||
|
||
# =============================================================================
|
||
# Singleton
|
||
# =============================================================================
|
||
|
||
# Process-wide gateway instance, created on first use.
_gateway: TelegramGateway | None = None


def get_telegram_gateway() -> TelegramGateway:
    """Return the shared TelegramGateway, creating it on the first call."""
    global _gateway
    instance = _gateway
    if instance is None:
        instance = TelegramGateway()
        _gateway = instance
    return instance
|