Files
awoooi/apps/api/src/services/failover_alerter.py
Your Name 341c3b6523
All checks were successful
Code Review / ai-code-review (push) Successful in 10s
CD Pipeline / tests (push) Successful in 1m13s
CD Pipeline / build-and-deploy (push) Successful in 3m22s
CD Pipeline / post-deploy-checks (push) Successful in 1m28s
fix(telegram): format governance and runbook alerts
2026-05-07 00:58:20 +08:00

556 lines
22 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Ollama 容災 / 自動恢復 / Gemini 帳單 Telegram 告警
設計原則:
- 每次 failover 觸發都通知(用戶明確指示)
- 但用 10min Redis dedup TTL 防同樣狀態重複告警
- 三種告警類型:failover_triggered / recovery_succeeded / gemini_quota_exceeded
2026-04-25 P1.5 by Claude Engineer-D — Telegram Alerter for Ollama Failover
"""
from __future__ import annotations
import hashlib
import json
import re
from datetime import datetime, timedelta, timezone
from typing import Any
import structlog
# Fixed UTC+8 offset used for every timestamp rendered in alert messages.
TAIPEI_TZ = timezone(timedelta(hours=8))
DEDUP_TTL_SEC = 600 # 10 min — dedup window for failover alerts
RECOVERY_DEDUP_TTL_SEC = 3600 # 1 h — suppress repeated recovery alerts while a host's health flaps
QUOTA_DEDUP_TTL_SEC = 86400 # 24 h — dedup window for the daily quota alert
logger = structlog.get_logger(__name__)
# Matches "api.telegram.org/bot" followed by the path segment after it (the bot
# token); group 1 captures the prefix so the token part can be replaced.
_TELEGRAM_BOT_URL_RE = re.compile(r"(api\.telegram\.org/bot)[^/\s]+")
class FailoverAlerter:
    """Send deduplicated Telegram alerts for failover-related events.

    Every ``alert_*`` method builds a dedup key, asks ``_check_dedup`` whether
    the alert may be sent, formats a MarkdownV2 message and hands it to
    ``_send``. Dedup uses Redis ``SET NX EX`` when a client is configured and
    falls back to a per-process in-memory table otherwise.
    """

    def __init__(self, redis_client=None) -> None:
        """Create an alerter.

        Args:
            redis_client: Optional async Redis client exposing
                ``set(key, value, ex=..., nx=...)``. When ``None`` only the
                in-memory dedup table is used.
        """
        # The Telegram gateway is not injected; _send() obtains it from
        # get_telegram_gateway() at call time.
        self._redis = redis_client
        # In-memory dedup fallback: key -> expiry timestamp (time.time() based).
        # Effective only inside this process; lost on restart.
        self._memory_dedup: dict[str, float] = {}
        self._memory_dedup_max_size = 1000

    async def alert_failover(self, event: dict[str, Any]) -> None:
        """Send the "failover activated" alert, deduplicated per target provider.

        Reads ``to_provider``, ``reason``, ``model``, ``timestamp``,
        ``fallback_chain_str`` and ``failed_host`` from ``event``; each has a
        default, so a sparse event still yields a message. The dedup key is
        ``alert:failover:{to_provider}`` with a ``DEDUP_TTL_SEC`` window.

        Args:
            event: Failover event payload.
        """
        to_provider = event.get("to_provider", "unknown")
        dedup_key = f"alert:failover:{to_provider}"
        if not await self._check_dedup(dedup_key, ttl=DEDUP_TTL_SEC):
            logger.debug("failover_alert_dedup_skipped", to_provider=to_provider)
            return

        reason = event.get("reason", "unknown")
        model = event.get("model", "?")
        timestamp = event.get("timestamp", datetime.now(TAIPEI_TZ).isoformat())
        fallback_chain_str = event.get("fallback_chain_str", "?")
        # The failed host comes from the event rather than being hard-coded.
        failed_host = event.get("failed_host", "Ollama")

        # Every value is coerced with str() before escaping: _escape_md relies
        # on str methods, and a non-str event value (None, int, datetime)
        # would otherwise raise out of the alerter.
        msg = (
            f"*Ollama 容災激活*\n\n"
            f"故障主機:{_escape_md(str(failed_host))}\n"
            f"故障狀態:{_escape_md(str(reason))}\n"
            f"切換目標:{_escape_md(str(to_provider))} \\(model: {_escape_md(str(model))}\\)\n"
            f"切換時間:{_escape_md(str(timestamp))}\n\n"
            f"Fallback 鏈:{_escape_md(str(fallback_chain_str))}\n\n"
            f"自動恢復服務持續監控3 次 HEALTHY 後自動切回"
        )
        await self._send(msg)
        logger.info("failover_alert_sent", to_provider=to_provider)

    async def alert_recovery(self, event: dict[str, Any]) -> None:
        """Send the "auto recovery" alert, deduplicated per recovered host.

        The dedup key is ``alert:recovery:{host}`` with a
        ``RECOVERY_DEDUP_TTL_SEC`` window, where ``host`` is
        ``event["recovered_host"]`` (falling back to ``to_provider``, then
        ``"ollama"``) with ``"http://"`` removed and ``"/"`` / ``":"``
        replaced by ``"_"``.

        Both ``from_provider``/``to_provider`` and the shorter ``from``/``to``
        keys are accepted.

        Args:
            event: Recovery event payload.
        """
        recovered_host = event.get("recovered_host", event.get("to_provider", "ollama"))
        # Keep the dedup key free of URL punctuation.
        safe_host = str(recovered_host).replace("http://", "").replace("/", "_").replace(":", "_")
        dedup_key = f"alert:recovery:{safe_host}"
        if not await self._check_dedup(dedup_key, ttl=RECOVERY_DEDUP_TTL_SEC):
            logger.debug("recovery_alert_dedup_skipped")
            return

        # NOTE(review): stable_count falls back to event["to"], the same key
        # used as the to_provider fallback below — confirm this is intended.
        stable_count = event.get("stable_count", event.get("to", "?"))
        from_provider = event.get("from_provider", event.get("from", "?"))
        to_provider = event.get("to_provider", event.get("to", "ollama"))
        recovery_time = event.get("recovery_time", datetime.now(TAIPEI_TZ).isoformat())

        # stable_count is escaped as well: it may be a string that contains
        # MarkdownV2 special characters.
        msg = (
            f"*Ollama 自動恢復*\n\n"
            f"恢復主機:{_escape_md(str(recovered_host))}\n"
            f"穩定計數:連續 {_escape_md(str(stable_count))} 次 HEALTHY\n"
            f"切回時間:{_escape_md(str(recovery_time))}\n"
            f"切換路徑:{_escape_md(str(from_provider))}{_escape_md(str(to_provider))}\n\n"
            f"自動化飛輪已恢復至高效能推理模式"
        )
        await self._send(msg)
        logger.info("recovery_alert_sent", from_provider=from_provider)

    async def alert_governance(self, event_type: str, payload: dict[str, Any]) -> None:
        """Send an AI-governance alert card, deduplicated for one hour.

        The dedup key is ``alert:governance:{event_type}:{hash}`` where
        ``event_type`` is lower-cased with ``":"`` and spaces replaced by
        ``"_"``, and ``hash`` is the first 8 hex characters of the SHA-256 of
        the JSON-serialised ``payload["impact"]``. A change in the impact
        values therefore produces a new key and bypasses the dedup window.

        Args:
            event_type: Governance event identifier.
            payload: Event payload; its ``impact`` sub-dict feeds the hash and
                the whole payload is rendered by
                ``format_governance_alert_card``.
        """
        safe_event_type = event_type.replace(":", "_").replace(" ", "_").lower()
        impact = payload.get("impact", {}) if isinstance(payload, dict) else {}
        # default=str lets values json cannot encode natively still be hashed.
        # SHA-256 is used only as a fingerprint here, not for security.
        _payload_hash = hashlib.sha256(
            json.dumps(impact, sort_keys=True, default=str).encode()
        ).hexdigest()[:8]
        dedup_key = f"alert:governance:{safe_event_type}:{_payload_hash}"
        if not await self._check_dedup(dedup_key, ttl=3600):
            logger.debug("governance_alert_dedup_skipped", event_type=event_type)
            return

        msg = format_governance_alert_card(event_type, payload)
        await self._send(msg)
        logger.info("governance_alert_sent", event_type=event_type)

    async def alert_gemini_quota_exceeded(self, event: dict[str, Any]) -> None:
        """Send the "Gemini daily quota exhausted" alert, once per calendar day.

        The dedup key carries today's date in ``TAIPEI_TZ``
        (``alert:gemini_quota_exceeded:YYYY-MM-DD``), so each day has its own
        window. The TTL is ``QUOTA_DEDUP_TTL_SEC`` (24 h).

        Args:
            event: Payload providing ``quota`` and ``current_count``; both
                default to ``"?"``.
        """
        date_str = datetime.now(TAIPEI_TZ).date().isoformat()
        dedup_key = f"alert:gemini_quota_exceeded:{date_str}"
        if not await self._check_dedup(dedup_key, ttl=QUOTA_DEDUP_TTL_SEC):
            logger.debug("quota_alert_dedup_skipped", date=date_str)
            return

        quota = event.get("quota", "?")
        current_count = event.get("current_count", "?")
        msg = (
            f"*Gemini 每日配額耗盡*\n\n"
            f"日期:{_escape_md(date_str)}\n"
            f"上限:{_escape_md(str(quota))} calls/day\n"
            f"當前用量:{_escape_md(str(current_count))}\n"
            f"降級目標Nemotron → Claude \\(Gemini 不可用\\)\n\n"
            f"進入容災模式至明日 0:00\n"
            f"建議檢查是否有異常流量,評估是否升級 Gemini 配額"
        )
        await self._send(msg)
        logger.info("quota_alert_sent", quota=quota, current_count=current_count)

    async def alert_provider_version_changed(self, changed_providers: list[str], probed: int) -> None:
        """Send one alert listing providers whose version changed.

        Each provider is deduplicated independently for one hour under
        ``alert:provider_version_changed:{provider}``. Only providers that
        pass dedup are listed, and nothing is sent when none pass.

        Args:
            changed_providers: Names of providers with a detected change.
            probed: Total number of providers probed, shown in the message.
        """
        now_str = datetime.now(TAIPEI_TZ).strftime("%Y-%m-%d %H:%M")
        sent: list[str] = []
        for provider in changed_providers:
            dedup_key = f"alert:provider_version_changed:{provider}"
            if not await self._check_dedup(dedup_key, ttl=3600):
                logger.debug("provider_version_alert_dedup_skipped", provider=provider)
                continue
            sent.append(provider)
        if not sent:
            return

        providers_md = "\n".join(f"{_escape_md(str(p))}" for p in sent)
        msg = (
            f"*AI Provider 版本變更偵測*\n\n"
            f"時間:{_escape_md(now_str)}\n"
            f"探測總數:{_escape_md(str(probed))}\n"
            f"版本已變更:\n{providers_md}\n\n"
            f"系統已自動記錄版本歷史,請確認是否需要重新驗證推理品質"
        )
        await self._send(msg)
        logger.info("provider_version_alert_sent", sent=sent)

    # -------------------------------------------------------------------------
    # Dedup (Redis SET NX EX with in-memory fallback)
    # -------------------------------------------------------------------------
    async def _check_dedup(self, key: str, ttl: int) -> bool:
        """Return True when an alert for ``key`` should be sent now.

        With a Redis client, ``SET {key}:dedup 1 NX EX ttl`` decides: the
        first caller in the window gets True. If Redis is absent or the call
        raises, an in-memory table is used instead, so a Redis outage does not
        turn into an alert flood.

        The in-memory table stores an expiry timestamp per key. Storing the
        expiry rather than the send time matters for cleanup: entries written
        with a long TTL must not be evicted by a later call that carries a
        short TTL.

        Args:
            key: Dedup key without the ``:dedup`` suffix.
            ttl: Dedup window in seconds.

        Returns:
            True if this is the first occurrence inside the window, False if
            the alert was already sent and should be skipped.
        """
        if self._redis is not None:
            try:
                ok = await self._redis.set(f"{key}:dedup", "1", ex=ttl, nx=True)
                return bool(ok)
            except Exception as e:
                # Fall through to the in-memory table instead of failing open.
                logger.warning("dedup_redis_failed_using_memory", error=str(e))

        import time

        now = time.time()
        expires_at = self._memory_dedup.get(key)
        if expires_at is not None and expires_at > now:
            return False  # still inside the dedup window

        if len(self._memory_dedup) >= self._memory_dedup_max_size:
            # Drop entries whose own window has elapsed.
            self._memory_dedup = {
                k: exp for k, exp in self._memory_dedup.items() if exp > now
            }
            # Everything still live: evict the entries closest to expiry so
            # the table stays within its size limit.
            while self._memory_dedup and len(self._memory_dedup) >= self._memory_dedup_max_size:
                soonest = min(self._memory_dedup, key=self._memory_dedup.__getitem__)
                del self._memory_dedup[soonest]

        self._memory_dedup[key] = now + ttl
        return True

    # -------------------------------------------------------------------------
    # Delivery (through the TelegramGateway singleton accessor)
    # -------------------------------------------------------------------------
    async def _send(self, message: str) -> None:
        """Deliver ``message`` through the Telegram gateway as MarkdownV2.

        Nothing is sent when neither ``SRE_GROUP_CHAT_ID`` nor
        ``OPENCLAW_TG_CHAT_ID`` is set on the settings object. The chat id is
        only checked for presence; it is not passed to the gateway call.

        Any exception is logged and swallowed so a failed alert never
        interrupts the caller.

        Args:
            message: Message text, already escaped for MarkdownV2.
        """
        try:
            from src.core.config import get_settings
            from src.services.telegram_gateway import get_telegram_gateway

            settings = get_settings()
            chat_id = getattr(settings, "SRE_GROUP_CHAT_ID", None) or getattr(settings, "OPENCLAW_TG_CHAT_ID", None)
            if not chat_id:
                logger.warning("telegram_chat_id_missing_failover_alert")
                return
            gateway = get_telegram_gateway()
            await gateway.send_alert_notification(text=message, parse_mode="MarkdownV2")
            logger.info("telegram_failover_alert_sent", message_len=len(message))
        except Exception as e:
            # Deliberately not re-raised. The exception text may embed the bot
            # token URL, so it is sanitised and logged without a traceback.
            logger.warning(
                "telegram_failover_send_failed",
                error=_sanitize_telegram_error(str(e)),
                error_type=type(e).__name__,
            )
# -------------------------------------------------------------------------
# MarkdownV2 escape 工具
# -------------------------------------------------------------------------
_MD2_SPECIAL = r"\_*[]()~`>#+-=|{}.!"
# Translation table mapping each special character to its backslash-escaped form.
_MD2_ESCAPE_TABLE = str.maketrans({ch: f"\\{ch}" for ch in _MD2_SPECIAL})


def _escape_md(text: str) -> str:
    """Return ``text`` with every Telegram MarkdownV2 special character escaped.

    Each character listed in ``_MD2_SPECIAL`` (the backslash included) is
    prefixed with a backslash in a single pass over the input.
    """
    return text.translate(_MD2_ESCAPE_TABLE)
def _sanitize_telegram_error(text: str) -> str:
    """Mask the bot token inside any Telegram Bot API URL found in ``text``.

    The ``api.telegram.org/bot`` prefix is kept and the segment after it is
    replaced by ``<redacted>``, so exception messages can be logged safely.
    """
    return _TELEGRAM_BOT_URL_RE.sub(
        lambda match: match.group(1) + "<redacted>",
        text,
    )
def _as_dict(value: Any) -> dict[str, Any]:
    """Return ``value`` unchanged when it is a dict, otherwise a new empty dict."""
    if isinstance(value, dict):
        return value
    return {}
# Display title for each known governance event type; consulted by
# _event_display_name before it falls back to a generated title.
_EVENT_DISPLAY_NAMES = {
    "trust_drift": "信任漂移",
    "knowledge_degradation": "知識庫劣化",
    "governance_slo_data_gap": "SLO 資料缺口",
    "governance_self_failure": "治理自檢失敗",
    "llm_hallucination": "LLM 驗證失敗",
    "execution_blast_radius": "執行風險擴大",
}
# Status text (lower-cased by _status_badge) -> badge shown on the card.
_STATUS_BADGES = {
    "critical": "🔴 critical",
    "error": "🔴 error",
    "violation": "🔴 violation",
    "warning": "🟡 warning",
    "degraded": "🟠 degraded",
    "ok": "🟢 ok",
}
# Per event type: ordered (impact key, display label) pairs. _profiled_rows
# renders these keys first, in this order, using the label instead of the key.
_IMPACT_PROFILES: dict[str, list[tuple[str, str]]] = {
    "trust_drift": [
        ("drifted_count", "漂移 Playbook"),
        ("total_playbooks", "總 Playbook"),
        ("drift_ratio", "漂移比例"),
        ("threshold", "警戒門檻"),
        ("auto_deprecated_count", "自動停用"),
    ],
    "knowledge_degradation": [
        ("stale_count", "陳舊 KM"),
        ("total_count", "總 KM"),
        ("stale_ratio", "陳舊比例"),
        ("threshold", "警戒門檻"),
        ("stale_days", "陳舊天數"),
    ],
    "governance_slo_data_gap": [
        ("reason", "缺口原因"),
        ("skipped_count", "略過指標"),
        ("all_slo_metrics_not_emitted", "SLO 指標缺失"),
    ],
    "governance_self_failure": [
        ("failed_checks", "失敗檢查"),
        ("total_checks", "總檢查"),
        ("failure_rate", "失敗比例"),
    ],
    "execution_blast_radius": [
        ("affected_services", "受影響服務"),
        ("blast_radius", "爆炸半徑"),
        ("threshold", "警戒門檻"),
    ],
    "llm_hallucination": [
        ("failed", "驗證失敗"),
        ("rate", "失敗比例"),
        ("threshold", "警戒門檻"),
    ],
}
def _event_display_name(event_type: str) -> str:
    """Return the card title for a governance event type.

    Known types come from ``_EVENT_DISPLAY_NAMES``; any other type starting
    with ``slo_`` gets the generic SLO title; everything else is title-cased
    with underscores turned into spaces.
    """
    known = _EVENT_DISPLAY_NAMES.get(event_type)
    if known is not None:
        return known
    if event_type.startswith("slo_"):
        return "SLO 違反"
    words = event_type.replace("_", " ").strip()
    return words.title()
def _status_badge(status: Any) -> str:
    """Return the badge for ``status``; a falsy status is treated as "warning".

    The lookup in ``_STATUS_BADGES`` is case-insensitive. A status with no
    badge is returned as its plain string form.
    """
    label = str(status) if status else "warning"
    return _STATUS_BADGES.get(label.lower(), label)
def _format_metric_value(key: str, value: Any) -> str:
    """Render one impact value as display text.

    - bool: see the review note below.
    - list: ``"0"`` when empty, otherwise the first three items joined by
      ``", "`` plus a total-count suffix when there are more than three.
    - int/float under a ``*_ratio``, ``threshold``, ``rate`` or
      ``failure_rate`` key: a percentage with one decimal place.
    - anything else: ``str(value)``.
    """
    # bool is checked first because bool is a subclass of int.
    if isinstance(value, bool):
        # NOTE(review): both branches are empty strings in this view — the
        # original glyphs may have been lost in extraction; confirm.
        return "" if value else ""
    if isinstance(value, list):
        if not value:
            return "0"
        preview = ", ".join(map(str, value[:3]))
        if len(value) > 3:
            return preview + f"…(共 {len(value)}"
        return preview
    if isinstance(value, (float, int)) and (
        key.endswith("_ratio") or key in {"threshold", "rate", "failure_rate"}
    ):
        percent = float(value) * 100
        return f"{percent:.1f}%"
    return str(value)
def _profiled_rows(event_type: str, data: dict[str, Any], *, max_rows: int = 8) -> list[str]:
    """Build the display rows for an impact dict, capped at ``max_rows``.

    Keys listed in ``_IMPACT_PROFILES[event_type]`` come first, in profile
    order and with their display label. The remaining keys follow in sorted
    order, labelled by the key itself.

    When there are more fields than ``max_rows``, the last row is reserved
    for a notice saying further fields were omitted. The notice appears only
    when something was actually left out, and it is never truncated away.

    Args:
        event_type: Governance event type used to pick the profile.
        data: Impact values keyed by metric name.
        max_rows: Maximum number of rows returned, notice included.

    Returns:
        The rendered rows; empty when ``data`` is empty or ``max_rows`` is
        not positive.
    """
    if not data or max_rows <= 0:
        return []

    labelled = [
        (key, label)
        for key, label in _IMPACT_PROFILES.get(event_type, [])
        if key in data
    ]
    profiled_keys = {key for key, _ in labelled}
    ordered = labelled + [
        (key, key) for key in sorted(data.keys()) if key not in profiled_keys
    ]

    overflow = len(ordered) > max_rows
    # Keep one slot free for the notice when not everything fits.
    visible = ordered[: max_rows - 1] if overflow else ordered
    rows = [f"{label}{_format_metric_value(key, data[key])}" for key, label in visible]
    if overflow:
        rows.append("更多欄位已收斂至 AwoooP 稽核資料")
    return rows
def _tree_lines(rows: list[str]) -> str:
    """Join ``rows`` into newline-separated, MarkdownV2-escaped lines.

    Each line is a branch prefix, a space, then the escaped row; the last row
    may use a different prefix from the others.
    """
    if not rows:
        return ""
    final_index = len(rows) - 1

    def _branch(position: int) -> str:
        # NOTE(review): both prefixes are empty strings in this view — the
        # original glyphs may have been lost in extraction; confirm.
        return "" if position == final_index else ""

    return "\n".join(
        f"{_branch(position)} {_escape_md(str(row))}"
        for position, row in enumerate(rows)
    )
def _governance_summary_lines(event_type: str, impact: dict[str, Any]) -> str:
    """Render the impact dict as the card's summary section text."""
    return _tree_lines(_profiled_rows(event_type, impact))
def format_governance_alert_card(event_type: str, payload: dict[str, Any]) -> str:
    """Format a governance event as a MarkdownV2 Telegram card.

    The card always has a header (title, divider, type, status). It then adds,
    in this order and only when non-empty: an impact summary from
    ``payload["impact"]``, a remediation section (``items``, ``next_action``,
    ``hint``), an automation section from ``payload["actionable"]["items"]``,
    and up to four remaining top-level fields. A non-dict ``payload`` is
    treated as empty.

    Args:
        event_type: Governance event identifier.
        payload: Event payload.

    Returns:
        The card text, sections joined by newlines.
    """
    if not isinstance(payload, dict):
        payload = {}
    impact = _as_dict(payload.get("impact"))
    remediation = _as_dict(payload.get("remediation"))
    actionable = _as_dict(payload.get("actionable"))
    status = payload.get("status", "warning")

    card: list[str] = [
        f"⚠️ *AI 治理警報|{_escape_md(_event_display_name(event_type))}*",
        "──────────────────────",
        f"類型:{_escape_md(event_type)}",
        f"狀態:{_escape_md(_status_badge(status))}",
    ]

    impact_block = _governance_summary_lines(event_type, impact)
    if impact_block:
        card += ["", "🧭 *影響摘要*", impact_block]

    # Collect the remediation parts first; the heading is emitted only when
    # at least one of them is present.
    remediation_parts: list[str] = []
    steps = _lines_from_list(remediation.get("items"))
    if steps:
        remediation_parts.append(steps)
    next_action = remediation.get("next_action")
    if next_action:
        remediation_parts.append(f"▶️ 下一步:{_escape_md(str(next_action))}")
    hint = remediation.get("hint")
    if hint:
        remediation_parts.append(f"💡 提示:{_escape_md(str(hint))}")
    if remediation_parts:
        card += ["", "🛠️ *修復方向*", *remediation_parts]

    automation_block = _lines_from_list(actionable.get("items"))
    if automation_block:
        card += ["", "🤖 *可自動化工作*", automation_block]

    extras = _fallback_pairs(
        payload,
        keep={"status", "impact", "remediation", "actionable"},
        max_items=4,
    )
    if extras:
        card += ["", "📎 *補充欄位*", "\n".join(extras)]

    return "\n".join(card)
def _lines_from_dict(data: dict[str, Any], max_items: int = 20, compact: bool = False) -> str:
    """Render a dict as escaped key/value lines in sorted key order.

    At most ``max_items`` entries are rendered. With ``compact=True`` a
    trailing notice is appended, but only when entries were actually left
    out. Previously the notice also appeared when the dict held exactly
    ``max_items`` entries.

    Args:
        data: Mapping to render; a falsy or non-dict value yields ``""``.
        max_items: Maximum number of entries rendered; negative values are
            treated as zero.
        compact: Whether to append the truncation notice.

    Returns:
        The lines joined by newlines, each prefixed with a single space.
    """
    if not data or not isinstance(data, dict):
        return ""
    keys = sorted(data.keys())
    shown = keys[: max(max_items, 0)]
    rows = [f"{_escape_md(str(k))}{_escape_md(str(data.get(k)))}" for k in shown]
    if compact and len(keys) > len(shown):
        rows.append(_escape_md("...(更多欄位略)"))
    return "\n".join(f" {line}" for line in rows)
def _lines_from_list(value: Any) -> str:
    """Render a list as a numbered, MarkdownV2-escaped list starting at 1.

    Returns ``""`` when ``value`` is not a list.
    """
    if not isinstance(value, list):
        return ""
    numbered: list[str] = []
    for number, item in enumerate(value, start=1):
        numbered.append(f" {_escape_md(str(number))}\\. {_escape_md(str(item))}")
    return "\n".join(numbered)
def _fallback_pairs(
    payload: dict[str, Any],
    keep: set[str] | None = None,
    *,
    max_items: int | None = None,
) -> list[str]:
    """Render the top-level payload fields that are not listed in ``keep``.

    Fields are emitted in sorted key order as escaped key/value text. When
    ``max_items`` is given and more fields remain, output stops after
    ``max_items`` rows and a single omission notice is appended.

    Args:
        payload: Event payload; a non-dict value yields an empty list.
        keep: Keys that have their own card section and are skipped here.
        max_items: Optional cap on rendered fields, notice not counted.

    Returns:
        The rendered rows.
    """
    if not isinstance(payload, dict):
        return []
    excluded = set(keep) if keep else set()
    remaining = [key for key in sorted(payload.keys()) if key not in excluded]

    truncated = False
    if max_items is not None:
        limit = max(max_items, 0)
        if len(remaining) > limit:
            remaining = remaining[:limit]
            truncated = True

    rows = [
        f"{_escape_md(str(key))}{_escape_md(str(payload.get(key)))}"
        for key in remaining
    ]
    if truncated:
        rows.append(_escape_md("更多欄位已收斂至 AwoooP 稽核資料"))
    return rows
# =============================================================================
# Singleton
# =============================================================================
_alerter_instance: FailoverAlerter | None = None


def get_failover_alerter() -> FailoverAlerter:
    """Return the module-wide ``FailoverAlerter``, creating it on first use.

    When no instance exists yet, one is created without a Redis client, so
    it relies on in-memory dedup only.
    """
    global _alerter_instance
    alerter = _alerter_instance
    if alerter is None:
        alerter = FailoverAlerter()
        _alerter_instance = alerter
    return alerter
def configure_alerter(redis_client) -> None:
    """Replace the module-wide alerter with one bound to ``redis_client``.

    Intended to be called once the Redis client is ready, so dedup is backed
    by Redis. The Telegram gateway is not passed in; the alerter looks it up
    itself when sending.
    """
    global _alerter_instance
    configured = FailoverAlerter(redis_client=redis_client)
    _alerter_instance = configured
    logger.info("failover_alerter_configured")
def reset_failover_alerter() -> None:
    """Discard the module-wide alerter so the next lookup builds a fresh one.

    Meant for tests.
    """
    global _alerter_instance
    _alerter_instance = None