744 lines
28 KiB
Python
744 lines
28 KiB
Python
"""Ollama 容災 / 自動恢復 / Gemini 帳單 Telegram 告警

設計原則:
- 每次 failover 觸發都通知(用戶明確指示)
- 但用 10min Redis dedup TTL 防同樣狀態重複告警
- 三種告警類型:failover_triggered / recovery_succeeded / gemini_quota_exceeded

2026-04-25 P1.5 by Claude Engineer-D — Telegram Alerter for Ollama Failover
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import hashlib
|
||
import json
|
||
import re
|
||
from datetime import datetime, timedelta, timezone
|
||
from typing import Any
|
||
|
||
import structlog
|
||
|
||
# Fixed UTC+8 offset used for every timestamp and date rendered in alerts.
TAIPEI_TZ = timezone(timedelta(hours=8))

# Dedup windows, expressed in seconds.
DEDUP_TTL_SEC = 10 * 60  # failover alerts: 10 minutes
RECOVERY_DEDUP_TTL_SEC = 60 * 60  # recovery alerts: 1 hour, so a flapping host does not re-alert
QUOTA_DEDUP_TTL_SEC = 24 * 60 * 60  # Gemini quota alerts: 24 hours (one alert per day)
|
||
|
||
# Module-level structured logger; events are emitted as snake_case names plus keyword fields.
logger = structlog.get_logger(__name__)

# Matches the bot-token segment of a Telegram Bot API URL. Group 1 keeps the
# "api.telegram.org/bot" prefix so only the token itself is replaced on redaction.
_TELEGRAM_BOT_URL_RE = re.compile(r"(api\.telegram\.org/bot)[^/\s]+")
|
||
|
||
|
||
class FailoverAlerter:
    """Telegram alerter for Ollama failover, recovery, quota, governance and provider-version events.

    Every ``alert_*`` method follows the same three steps: claim a dedup slot via
    :meth:`_check_dedup`, build a MarkdownV2 message, and hand it to :meth:`_send`.
    The dedup slot is claimed *before* sending, so a failed send is not retried
    until the dedup window has passed.

    History: 2026-04-25 P1.5 (Claude Engineer-D) — initial Telegram alerter for
    Ollama failover.
    """

    def __init__(self, redis_client=None) -> None:
        """Create an alerter.

        Args:
            redis_client: Optional async Redis client used for cross-process
                dedup (``SET NX EX``). When ``None`` only the in-process
                fallback is used.
        """
        # The Telegram gateway is not injected; _send() fetches it from its singleton.
        self._redis = redis_client
        # 2026-04-27 Wave8-X2: in-memory dedup used when Redis is missing or failing,
        # so one event cannot flood Telegram. It only protects the current process and
        # is lost on restart. Maps dedup key -> time.time() of the last accepted alert.
        self._memory_dedup: dict[str, float] = {}
        # TTL each in-memory entry was stored with, so pruning can judge every entry
        # by its own window instead of the window of whichever call triggered pruning.
        self._memory_dedup_ttl: dict[str, int] = {}
        self._memory_dedup_max_size = 1000

    async def alert_failover(self, event: dict[str, Any]) -> None:
        """Send the "failover activated" alert (10-minute dedup per target provider).

        Args:
            event: Failover event. Reads ``to_provider``, ``reason``, ``model``,
                ``timestamp``, ``fallback_chain_str`` and ``failed_host``; every
                field has a placeholder default. Per ADR-110 (2026-05-03) the
                failed host is taken from the event rather than hard-coded.
        """
        to_provider = event.get("to_provider", "unknown")
        dedup_key = f"alert:failover:{to_provider}"
        if not await self._check_dedup(dedup_key, ttl=DEDUP_TTL_SEC):
            logger.debug("failover_alert_dedup_skipped", to_provider=to_provider)
            return

        reason = event.get("reason", "unknown")
        model = event.get("model", "?")
        timestamp = event.get("timestamp", datetime.now(TAIPEI_TZ).isoformat())
        fallback_chain_str = event.get("fallback_chain_str", "?")
        failed_host = event.get("failed_host", "Ollama")

        # Event values are coerced with str() before escaping: callers may pass
        # exceptions, datetimes or None, and an AttributeError raised here would
        # escape to the routing code after the dedup slot was already consumed.
        msg = (
            "*Ollama 容災激活*\n\n"
            f"故障主機:{_escape_md(str(failed_host))}\n"
            f"故障狀態:{_escape_md(str(reason))}\n"
            f"切換目標:{_escape_md(str(to_provider))} \\(model: {_escape_md(str(model))}\\)\n"
            f"切換時間:{_escape_md(str(timestamp))}\n\n"
            f"Fallback 鏈:{_escape_md(str(fallback_chain_str))}\n\n"
            "自動恢復服務持續監控,3 次 HEALTHY 後自動切回"
        )
        await self._send(msg)
        logger.info("failover_alert_sent", to_provider=to_provider)

    async def alert_recovery(self, event: dict[str, Any]) -> None:
        """Send the "auto recovery" alert (1-hour dedup per recovered host).

        The dedup key includes the recovered host (2026-05-04): the earlier
        constant key re-alerted each time a flapping host recovered.

        Args:
            event: Recovery event. Reads ``recovered_host``, ``stable_count``,
                ``recovery_time`` and ``from_provider`` / ``to_provider``; the
                short keys ``from`` / ``to`` are accepted as fallbacks for the
                provider names.
        """
        recovered_host = event.get("recovered_host", event.get("to_provider", "ollama"))
        # Make the host key-safe: drop an "http://" prefix and replace "/" and ":".
        safe_host = str(recovered_host).replace("http://", "").replace("/", "_").replace(":", "_")
        dedup_key = f"alert:recovery:{safe_host}"
        if not await self._check_dedup(dedup_key, ttl=RECOVERY_DEDUP_TTL_SEC):
            logger.debug("recovery_alert_dedup_skipped")
            return

        # A missing count is shown as "?". It used to fall back to event["to"],
        # which is a provider name and rendered as a nonsensical "count".
        stable_count = event.get("stable_count", "?")
        from_provider = event.get("from_provider", event.get("from", "?"))
        to_provider = event.get("to_provider", event.get("to", "ollama"))
        recovery_time = event.get("recovery_time", datetime.now(TAIPEI_TZ).isoformat())

        msg = (
            "*Ollama 自動恢復*\n\n"
            f"恢復主機:{_escape_md(str(recovered_host))}\n"
            f"穩定計數:連續 {_escape_md(str(stable_count))} 次 HEALTHY\n"
            f"切回時間:{_escape_md(str(recovery_time))}\n"
            f"切換路徑:{_escape_md(str(from_provider))} → {_escape_md(str(to_provider))}\n\n"
            "自動化飛輪已恢復至高效能推理模式"
        )
        await self._send(msg)
        logger.info("recovery_alert_sent", from_provider=from_provider)

    async def alert_governance(self, event_type: str, payload: dict[str, Any]) -> None:
        """Send an AI-governance alert card (1-hour dedup per event type and impact).

        The dedup key is the sanitized event type plus the first 8 hex digits of
        a SHA-256 over the JSON-serialized ``payload["impact"]`` sub-dict, so a
        change in any impact metric produces a new key and bypasses the window.
        SHA-256 is used only as a fingerprint, not for security.

        Args:
            event_type: Governance event name, e.g. ``trust_drift`` or
                ``knowledge_degradation``.
            payload: Event payload; only ``impact`` contributes to the dedup key,
                the whole payload is rendered by ``format_governance_alert_card``.
        """
        # Keep ":" and spaces out of the key so the segment layout stays unambiguous.
        safe_event_type = event_type.replace(":", "_").replace(" ", "_").lower()

        impact = payload.get("impact", {}) if isinstance(payload, dict) else {}
        # default=str lets datetime / Decimal / other non-JSON values be fingerprinted.
        serialized_impact = json.dumps(impact, sort_keys=True, default=str)
        payload_hash = hashlib.sha256(serialized_impact.encode()).hexdigest()[:8]
        dedup_key = f"alert:governance:{safe_event_type}:{payload_hash}"
        if not await self._check_dedup(dedup_key, ttl=3600):
            logger.debug("governance_alert_dedup_skipped", event_type=event_type)
            return

        msg = format_governance_alert_card(event_type, payload)
        await self._send(msg)
        logger.info("governance_alert_sent", event_type=event_type)

    async def alert_gemini_quota_exceeded(self, event: dict[str, Any]) -> None:
        """Send the "Gemini daily quota exhausted" alert (at most once per Taipei day).

        The dedup key carries the current Taipei date, so each day has its own
        window and an alert late one day cannot suppress the next day's alert.
        The entry is stored with ``QUOTA_DEDUP_TTL_SEC`` (24 hours).

        Args:
            event: Quota event; reads ``quota`` and ``current_count``.
        """
        date_str = datetime.now(TAIPEI_TZ).date().isoformat()
        dedup_key = f"alert:gemini_quota_exceeded:{date_str}"
        if not await self._check_dedup(dedup_key, ttl=QUOTA_DEDUP_TTL_SEC):
            logger.debug("quota_alert_dedup_skipped", date=date_str)
            return

        quota = event.get("quota", "?")
        current_count = event.get("current_count", "?")

        msg = (
            "*Gemini 每日配額耗盡*\n\n"
            f"日期:{_escape_md(date_str)}\n"
            f"上限:{_escape_md(str(quota))} calls/day\n"
            f"當前用量:{_escape_md(str(current_count))}\n"
            "降級目標:Nemotron → Claude \\(Gemini 不可用\\)\n\n"
            "進入容災模式至明日 0:00\n"
            "建議檢查是否有異常流量,評估是否升級 Gemini 配額"
        )
        await self._send(msg)
        logger.info("quota_alert_sent", quota=quota, current_count=current_count)

    async def alert_provider_version_changed(self, changed_providers: list[str], probed: int) -> None:
        """Send one alert listing providers whose version changed (1-hour dedup per provider).

        Providers still inside their dedup window are dropped from the list; if
        none remain, nothing is sent.

        Args:
            changed_providers: Names of providers whose version changed.
            probed: Total number of providers probed, shown in the message.
        """
        now_str = datetime.now(TAIPEI_TZ).strftime("%Y-%m-%d %H:%M")
        sent: list[str] = []

        for provider in changed_providers:
            dedup_key = f"alert:provider_version_changed:{provider}"
            if not await self._check_dedup(dedup_key, ttl=3600):
                logger.debug("provider_version_alert_dedup_skipped", provider=provider)
                continue
            sent.append(provider)

        if not sent:
            return

        providers_md = "\n".join(f"• {_escape_md(str(p))}" for p in sent)
        msg = (
            "*AI Provider 版本變更偵測*\n\n"
            f"時間:{_escape_md(now_str)}\n"
            f"探測總數:{_escape_md(str(probed))}\n"
            f"版本已變更:\n{providers_md}\n\n"
            "系統已自動記錄版本歷史,請確認是否需要重新驗證推理品質"
        )
        await self._send(msg)
        logger.info("provider_version_alert_sent", sent=sent)

    # -------------------------------------------------------------------------
    # Dedup (Redis SET NX EX, with an in-memory fallback)
    # -------------------------------------------------------------------------

    async def _check_dedup(self, key: str, ttl: int) -> bool:
        """Claim the dedup slot for ``key``.

        Redis ``SET <key>:dedup 1 NX EX ttl`` is tried first. If there is no
        Redis client, or the call raises, the check falls back to the
        in-process table instead of failing open (2026-04-27 Wave8-X2: failing
        open flooded Telegram while Redis was down).

        Args:
            key: Dedup key, without the ``:dedup`` suffix.
            ttl: Dedup window in seconds.

        Returns:
            ``True`` if this is the first occurrence inside the window and the
            alert should be sent, ``False`` if it should be skipped.
        """
        if self._redis is not None:
            try:
                ok = await self._redis.set(f"{key}:dedup", "1", ex=ttl, nx=True)
                return bool(ok)
            except Exception as e:
                # Deliberately broad: any Redis failure degrades to the in-memory path.
                logger.warning("dedup_redis_failed_using_memory", error=str(e))

        import time

        now = time.time()
        last_sent = self._memory_dedup.get(key)
        if last_sent is not None and now - last_sent < ttl:
            return False  # still inside the window: skip

        # Only a brand-new key can grow the table, so only then make room.
        if key not in self._memory_dedup and len(self._memory_dedup) >= self._memory_dedup_max_size:
            self._prune_memory_dedup(now, default_ttl=ttl)

        self._memory_dedup[key] = now
        self._memory_dedup_ttl[key] = ttl
        return True

    def _prune_memory_dedup(self, now: float, default_ttl: int) -> None:
        """Drop expired in-memory entries, then evict until there is room for one more.

        Each entry is judged by the TTL it was stored with; ``default_ttl`` is
        only used for entries that have no recorded TTL. If live entries still
        fill the table, the ones closest to expiry are evicted first so
        long-window entries (e.g. the daily quota alert) survive.
        """

        def expires_at(entry_key: str) -> float:
            entry_ttl = self._memory_dedup_ttl.get(entry_key, default_ttl)
            return self._memory_dedup[entry_key] + entry_ttl

        expiry = {entry_key: expires_at(entry_key) for entry_key in self._memory_dedup}
        live = {k: v for k, v in self._memory_dedup.items() if expiry[k] > now}
        while live and len(live) >= self._memory_dedup_max_size:
            soonest = min(live, key=expiry.__getitem__)
            del live[soonest]

        self._memory_dedup = live
        self._memory_dedup_ttl = {
            k: self._memory_dedup_ttl[k] for k in live if k in self._memory_dedup_ttl
        }

    # -------------------------------------------------------------------------
    # Sending (through the TelegramGateway singleton)
    # -------------------------------------------------------------------------

    async def _send(self, message: str) -> None:
        """Send ``message`` through the TelegramGateway singleton as MarkdownV2.

        Nothing is sent (a warning is logged) when the settings object has no
        truthy ``SRE_GROUP_CHAT_ID``. The chat id is only checked here, not
        passed on — presumably the gateway resolves the destination itself;
        confirm against ``send_alert_notification``.

        Alerting must never break the main routing flow, so every exception is
        caught and logged instead of raised.

        Args:
            message: Message text, already escaped for MarkdownV2.
        """
        try:
            from src.core.config import get_settings
            from src.services.telegram_gateway import get_telegram_gateway

            settings = get_settings()
            chat_id = getattr(settings, "SRE_GROUP_CHAT_ID", None)
            if not chat_id:
                logger.warning("telegram_chat_id_missing_failover_alert")
                return

            gateway = get_telegram_gateway()
            await gateway.send_alert_notification(text=message, parse_mode="MarkdownV2")
            logger.info("telegram_failover_alert_sent", message_len=len(message))
        except Exception as e:
            # Never re-raise: a failed alert must not block the caller.
            # 2026-05-06: Telegram/httpx exception text can embed the bot-token URL,
            # so log a redacted message and do not use logger.exception (it would
            # print the chained traceback).
            logger.warning(
                "telegram_failover_send_failed",
                error=_sanitize_telegram_error(str(e)),
                error_type=type(e).__name__,
            )
|
||
|
||
|
||
# -------------------------------------------------------------------------
|
||
# MarkdownV2 escape 工具
|
||
# -------------------------------------------------------------------------
|
||
|
||
_MD2_SPECIAL = r"\_*[]()~`>#+-=|{}.!"
|
||
|
||
|
||
def _escape_md(text: str) -> str:
|
||
"""Escape MarkdownV2 特殊字元,防止 Telegram parse error。
|
||
|
||
2026-04-25 P1.5 by Claude Engineer-D — MarkdownV2 安全逸出
|
||
"""
|
||
for ch in _MD2_SPECIAL:
|
||
text = text.replace(ch, f"\\{ch}")
|
||
return text
|
||
|
||
|
||
def _sanitize_telegram_error(text: str) -> str:
    """Return ``text`` with the token of any Telegram Bot API URL replaced by ``<redacted>``.

    Used before logging exception messages so the bot token never reaches the log.
    """
    redacted = _TELEGRAM_BOT_URL_RE.sub(r"\1<redacted>", text)
    return redacted
|
||
|
||
|
||
def _as_dict(value: Any) -> dict[str, Any]:
|
||
return value if isinstance(value, dict) else {}
|
||
|
||
|
||
_EVENT_DISPLAY_NAMES = {
|
||
"trust_drift": "信任漂移",
|
||
"knowledge_degradation": "KM 需要更新(影響 AI 判斷)",
|
||
"governance_slo_data_gap": "SLO 資料缺口",
|
||
"governance_self_failure": "治理自檢失敗",
|
||
"llm_hallucination": "LLM 驗證失敗",
|
||
"execution_blast_radius": "執行風險擴大",
|
||
}
|
||
|
||
_STATUS_BADGES = {
|
||
"critical": "🔴 critical",
|
||
"error": "🔴 error",
|
||
"violation": "🔴 violation",
|
||
"warning": "🟡 warning",
|
||
"degraded": "🟠 degraded",
|
||
"ok": "🟢 ok",
|
||
}
|
||
|
||
_IMPACT_PROFILES: dict[str, list[tuple[str, str]]] = {
|
||
"trust_drift": [
|
||
("drifted_count", "漂移 Playbook"),
|
||
("total_playbooks", "總 Playbook"),
|
||
("drift_ratio", "漂移比例"),
|
||
("threshold", "警戒門檻"),
|
||
("auto_deprecated_count", "自動停用"),
|
||
],
|
||
"knowledge_degradation": [
|
||
("stale_count", "陳舊 KM"),
|
||
("total_count", "總 KM"),
|
||
("stale_ratio", "陳舊比例"),
|
||
("threshold", "警戒門檻"),
|
||
("stale_days", "陳舊天數"),
|
||
],
|
||
"governance_slo_data_gap": [
|
||
("reason", "缺口原因"),
|
||
("skipped_count", "略過指標"),
|
||
("all_slo_metrics_not_emitted", "SLO 指標缺失"),
|
||
],
|
||
"governance_self_failure": [
|
||
("failed_checks", "失敗檢查"),
|
||
("total_checks", "總檢查"),
|
||
("failure_rate", "失敗比例"),
|
||
],
|
||
"execution_blast_radius": [
|
||
("affected_services", "受影響服務"),
|
||
("blast_radius", "爆炸半徑"),
|
||
("threshold", "警戒門檻"),
|
||
],
|
||
"llm_hallucination": [
|
||
("failed", "驗證失敗"),
|
||
("rate", "失敗比例"),
|
||
("threshold", "警戒門檻"),
|
||
],
|
||
}
|
||
|
||
_TOP_LEVEL_IMPACT_ALIASES: dict[str, dict[str, tuple[str, ...]]] = {
|
||
"knowledge_degradation": {
|
||
"stale_count": ("stale_count", "stale", "stale_km"),
|
||
"total_count": ("total_count", "total", "total_km"),
|
||
"stale_ratio": ("stale_ratio", "ratio"),
|
||
"threshold": ("threshold",),
|
||
"stale_days": ("stale_days",),
|
||
},
|
||
}
|
||
|
||
_TOP_LEVEL_FALLBACK_KEEP: dict[str, set[str]] = {
|
||
"knowledge_degradation": {
|
||
"automatable_work",
|
||
"next_action",
|
||
"next_step",
|
||
"ownership",
|
||
"ratio",
|
||
"stale",
|
||
"stale_count",
|
||
"stale_days",
|
||
"stale_km",
|
||
"stale_ratio",
|
||
"threshold",
|
||
"total",
|
||
"total_count",
|
||
"total_km",
|
||
},
|
||
}
|
||
|
||
|
||
def _event_display_name(event_type: str) -> str:
    """Return the card title for ``event_type``.

    Known types use ``_EVENT_DISPLAY_NAMES``; any other type starting with
    ``slo_`` gets a shared SLO-violation title; everything else is title-cased
    with underscores turned into spaces.
    """
    known = _EVENT_DISPLAY_NAMES.get(event_type)
    if known is not None:
        return known
    if event_type.startswith("slo_"):
        return "SLO 違反"
    words = event_type.replace("_", " ")
    return words.strip().title()
|
||
|
||
|
||
def _status_badge(status: Any) -> str:
    """Return the badge for ``status``.

    A falsy status is treated as ``"warning"``. Lookup is case-insensitive; an
    unknown status is returned as its own text with its case unchanged.
    """
    label = str(status) if status else "warning"
    badge = _STATUS_BADGES.get(label.lower())
    return label if badge is None else badge
|
||
|
||
|
||
def _format_metric_value(key: str, value: Any) -> str:
|
||
if isinstance(value, bool):
|
||
return "是" if value else "否"
|
||
if isinstance(value, (float, int)) and (
|
||
key.endswith("_ratio") or key in {"threshold", "rate", "failure_rate"}
|
||
):
|
||
return f"{float(value) * 100:.1f}%"
|
||
if isinstance(value, list):
|
||
if not value:
|
||
return "0"
|
||
shown = ", ".join(str(item) for item in value[:3])
|
||
if len(value) > 3:
|
||
shown += f"…(共 {len(value)})"
|
||
return shown
|
||
return str(value)
|
||
|
||
|
||
def _profiled_rows(event_type: str, data: dict[str, Any], *, max_rows: int = 8) -> list[str]:
    """Turn an impact dict into at most ``max_rows`` display rows.

    Keys listed in ``_IMPACT_PROFILES[event_type]`` come first, in profile
    order and under their profile label; the remaining keys follow in sorted
    order under their raw key name.

    When there are more keys than ``max_rows``, the last row is reserved for a
    note saying the remaining fields are only in the audit data. Previously the
    note was appended and then sliced off whenever rows were actually hidden,
    and it could appear when every key had already been shown.

    Args:
        event_type: Governance event type used to select the profile.
        data: Impact metrics to render.
        max_rows: Maximum number of rows returned, note included.

    Returns:
        Unescaped row strings; empty when ``data`` is empty or ``max_rows`` is
        not positive.
    """
    if not data or max_rows <= 0:
        return []

    overflow_note = "更多欄位已收斂至 AwoooP 稽核資料"

    # (key, label) pairs in display order: profiled keys first, then the rest.
    ordered = [(key, label) for key, label in _IMPACT_PROFILES.get(event_type, []) if key in data]
    profiled = {key for key, _label in ordered}
    ordered.extend((key, key) for key in sorted(k for k in data if k not in profiled))

    truncated = len(ordered) > max_rows
    visible = ordered[: max_rows - 1] if truncated else ordered

    rows = [f"{label}:{_format_metric_value(key, data[key])}" for key, label in visible]
    if truncated:
        rows.append(overflow_note)
    return rows
|
||
|
||
|
||
def _tree_lines(rows: list[str]) -> str:
    """Render ``rows`` as a tree: ``├`` before every row except the last, which gets ``└``.

    Each row is MarkdownV2-escaped. An empty list yields an empty string.
    """
    if not rows:
        return ""
    *leading, final = rows
    lines = [f"├ {_escape_md(str(row))}" for row in leading]
    lines.append(f"└ {_escape_md(str(final))}")
    return "\n".join(lines)
|
||
|
||
|
||
def _governance_summary_lines(event_type: str, impact: dict[str, Any]) -> str:
    """Return the impact summary as tree-formatted, escaped text (empty when there are no rows)."""
    return _tree_lines(_profiled_rows(event_type, impact))
|
||
|
||
|
||
def _normalized_impact(event_type: str, payload: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of ``payload["impact"]`` completed from top-level alias keys.

    For each canonical key in ``_TOP_LEVEL_IMPACT_ALIASES[event_type]`` that is
    missing from the impact dict, the first alias present at the top level of
    ``payload`` supplies the value. The payload itself is not modified.
    """
    merged = dict(_as_dict(payload.get("impact")))
    alias_map = _TOP_LEVEL_IMPACT_ALIASES.get(event_type, {})
    for canonical_key, aliases in alias_map.items():
        if canonical_key not in merged:
            source_key = next((alias for alias in aliases if alias in payload), None)
            if source_key is not None:
                merged[canonical_key] = payload[source_key]
    return merged
|
||
|
||
|
||
def _section_payload(
|
||
payload: dict[str, Any],
|
||
canonical_key: str,
|
||
*,
|
||
item_aliases: tuple[str, ...] = (),
|
||
next_action_aliases: tuple[str, ...] = (),
|
||
) -> dict[str, Any]:
|
||
raw = payload.get(canonical_key)
|
||
section = dict(raw) if isinstance(raw, dict) else {}
|
||
if isinstance(raw, list) and "items" not in section:
|
||
section["items"] = raw
|
||
|
||
if "items" not in section:
|
||
for alias in item_aliases:
|
||
value = payload.get(alias)
|
||
if isinstance(value, list):
|
||
section["items"] = value
|
||
break
|
||
|
||
if "next_action" not in section:
|
||
for alias in next_action_aliases:
|
||
value = payload.get(alias)
|
||
if value:
|
||
section["next_action"] = value
|
||
break
|
||
|
||
return section
|
||
|
||
|
||
def _governance_operator_context(event_type: str, impact: dict[str, Any]) -> list[str]:
    """Return operator-facing guidance lines for a governance alert.

    The governance loop stores machine-readable details in AwoooP. Telegram
    needs a shorter "what this means / what to do now" layer so operators do
    not have to infer the process stage from raw metric names.

    Only ``knowledge_degradation`` has guidance; every other event type yields
    an empty list.

    Args:
        event_type: Governance event type.
        impact: Normalized impact metrics; reads ``stale_count``,
            ``total_count``, ``stale_days``, ``threshold`` (default 0.2) and
            ``stale_ratio`` (default 0).

    Returns:
        Card lines (already MarkdownV2-escaped where needed), starting with a
        blank separator line.
    """
    if event_type != "knowledge_degradation":
        return []

    stale_count = impact.get("stale_count", "?")
    total_count = impact.get("total_count", "?")
    stale_days = impact.get("stale_days", "?")
    threshold = _format_metric_value("threshold", impact.get("threshold", 0.2))
    stale_ratio = _format_metric_value("stale_ratio", impact.get("stale_ratio", 0))

    plain_summary = (
        f"{stale_count} / {total_count} 筆 KM 超過 {stale_days} 天未更新,"
        "AI 做告警分類、規則匹配、PlayBook 推薦時可能引用舊資訊。"
    )
    policy_summary = (
        "這是治理品質警報,不是服務故障;目標是把 stale ratio "
        f"{stale_ratio} 降到門檻 {threshold} 以下。"
    )

    process_rows = [
        "階段:detected → queued_kb_healthcheck → waiting_owner_review",
        "AI 已做:統計 stale KM,產生補齊與審核方向",
        "AI 可做:反查 Incident / Sentry / SigNoz / PlayBook,產生 KM 更新草稿與任務",
        "需要人工:owner 審核高影響 KM 內容,避免 AI 自動寫入錯誤知識",
    ]
    action_rows = [
        "確認 run_kb_growth_healthcheck 是否已排程或已執行",
        "到 AwoooP Work Items / AI 治理篩選 knowledge_degradation",
        "優先審核最近被告警、Sentry、SigNoz、PlayBook 引用的 KM",
        # Uses the payload's threshold rather than a hard-coded "20%", so this
        # line cannot contradict the threshold quoted in policy_summary.
        f"不用重啟服務;等 stale_ratio 降到 {threshold} 以下再關閉治理警報",
    ]

    return [
        "",
        "💬 *白話說明*",
        _escape_md(plain_summary),
        _escape_md(policy_summary),
        "",
        "🧩 *AI 流程狀態*",
        _tree_lines(process_rows),
        "",
        "✅ *現在要做*",
        _lines_from_list(action_rows),
    ]
|
||
|
||
|
||
def _governance_ownership_lines(event_type: str, payload: dict[str, Any]) -> list[str]:
    """Return the "who owns this" section of a governance card.

    Ownership comes from ``payload["ownership"]``. When that is missing or
    empty, ``knowledge_degradation`` events get a built-in default assignment;
    other event types produce no section.

    Returns:
        ``["", header, tree]`` when there is at least one row, otherwise ``[]``.
    """

    def describe(prefix: str, who: Any, why: Any) -> str:
        # "<prefix>:<who>", with " — <why>" appended only when a reason is given.
        return f"{prefix}:{who} — {why}" if why else f"{prefix}:{who}"

    ownership = _as_dict(payload.get("ownership"))
    if not ownership:
        if event_type != "knowledge_degradation":
            return []
        ownership = {
            "lead_agent": "Hermes",
            "lead_reason": "E7 自動 KM 主責:反查 Incident / Sentry / SigNoz / PlayBook,產生 KM 更新草稿與任務。",
            "support_agents": [
                "OpenClaw:提供告警分類、規則匹配與 PlayBook 脈絡摘要,不直接批量改寫 KM。",
                "ElephantAlpha:read-only 稽核高影響 KM 草稿與風險,不執行寫入或通知。",
            ],
            "human_owner": "KM owner / SRE owner",
            "human_reason": "審核高影響 KM 後才允許寫入,避免 AI 自動固化錯誤知識。",
        }

    rows: list[str] = []

    lead_agent = ownership.get("lead_agent")
    if lead_agent:
        rows.append(describe("主責", lead_agent, ownership.get("lead_reason")))

    support_agents = ownership.get("support_agents")
    if isinstance(support_agents, list):
        rows += [str(agent) for agent in support_agents if agent]

    human_owner = ownership.get("human_owner")
    if human_owner:
        rows.append(describe("人工覆核", human_owner, ownership.get("human_reason")))

    if rows:
        return ["", "👥 *負責分工*", _tree_lines(rows)]
    return []
|
||
|
||
|
||
def format_governance_alert_card(event_type: str, payload: dict[str, Any]) -> str:
    """Format an AI-governance event as a Telegram MarkdownV2 card.

    The payload is left untouched; only here, at the Telegram boundary, are raw
    key/value pairs turned into a scannable card (2026-05-07). A non-dict
    payload is treated as empty.

    Sections, in order: header (title, type, status), operator guidance,
    ownership, impact summary, remediation, automatable work, and up to four
    leftover top-level fields.

    Args:
        event_type: Governance event type.
        payload: Event payload.

    Returns:
        The card text, sections joined with newlines.
    """
    if not isinstance(payload, dict):
        payload = {}

    impact = _normalized_impact(event_type, payload)
    title = _escape_md(_event_display_name(event_type))
    badge = _escape_md(_status_badge(payload.get("status", "warning")))

    card: list[str] = [
        f"⚠️ *AI 治理警報|{title}*",
        "──────────────────────",
        f"類型:{_escape_md(event_type)}",
        f"狀態:{badge}",
    ]
    card += _governance_operator_context(event_type, impact)
    card += _governance_ownership_lines(event_type, payload)

    # Impact summary
    summary = _governance_summary_lines(event_type, impact)
    if summary:
        card += ["", "🧭 *影響摘要*", summary]

    # Remediation: numbered items, then optional next step and hint.
    remediation = _section_payload(
        payload,
        "remediation",
        next_action_aliases=("next_step", "next_action"),
    )
    fix_items = _lines_from_list(remediation.get("items"))
    next_step = remediation.get("next_action")
    hint = remediation.get("hint")
    if fix_items or next_step or hint:
        card += ["", "🛠️ *修復方向*"]
        if fix_items:
            card.append(fix_items)
        if next_step:
            card.append(f"▶️ 下一步:{_escape_md(str(next_step))}")
        if hint:
            card.append(f"💡 提示:{_escape_md(str(hint))}")

    # Automatable work
    actionable = _section_payload(
        payload,
        "actionable",
        item_aliases=("automatable_work",),
    )
    auto_items = _lines_from_list(actionable.get("items"))
    if auto_items:
        card += ["", "🤖 *可自動化工作*", auto_items]

    # Leftover top-level fields: skip everything already rendered above.
    already_rendered = {"status", "impact", "remediation", "actionable"}
    already_rendered.update(key for key, _label in _IMPACT_PROFILES.get(event_type, []))
    already_rendered.update(_TOP_LEVEL_FALLBACK_KEEP.get(event_type, set()))
    extras = _fallback_pairs(payload, keep=already_rendered, max_items=4)
    if extras:
        card += ["", "📎 *補充欄位*", "\n".join(extras)]

    return "\n".join(card)
|
||
|
||
|
||
def _lines_from_dict(data: dict[str, Any], max_items: int = 20, compact: bool = False) -> str:
    """Render a dict as escaped ``key:value`` lines, sorted by key.

    Args:
        data: Mapping to render; empty or non-dict input yields ``""``.
        max_items: Maximum number of key/value lines.
        compact: When true, add a trailing "more fields omitted" line, but only
            if keys were actually left out. Previously the note was also added
            when the dict had exactly ``max_items`` keys.

    Returns:
        The lines joined with newlines, each prefixed with a space.
    """
    if not data or not isinstance(data, dict):
        return ""

    keys = sorted(data.keys())
    rows = [
        f"{_escape_md(str(k))}:{_escape_md(str(data.get(k)))}"
        for k in keys[: max(0, max_items)]
    ]
    if compact and len(keys) > max_items:
        rows.append(_escape_md("...(更多欄位略)"))
    return "\n".join(f" {line}" for line in rows)
|
||
|
||
|
||
def _lines_from_list(value: Any) -> str:
    """Render a list as an escaped, 1-based numbered list; non-list input yields ``""``."""
    if not isinstance(value, list):
        return ""
    numbered: list[str] = []
    for position, item in enumerate(value, start=1):
        # The dot after the number is escaped because "." is MarkdownV2 markup.
        numbered.append(f" {_escape_md(str(position))}\\. {_escape_md(str(item))}")
    return "\n".join(numbered)
|
||
|
||
|
||
def _fallback_pairs(
    payload: dict[str, Any],
    keep: set[str] | None = None,
    *,
    max_items: int | None = None,
) -> list[str]:
    """Return escaped ``key:value`` rows for payload keys not listed in ``keep``.

    Keys are visited in sorted order. With ``max_items`` set, at most that many
    rows are produced, followed by one note row when further keys remain.
    Non-dict payloads yield an empty list.
    """
    if not isinstance(payload, dict):
        return []

    excluded = set(keep) if keep else set()
    candidates = [key for key in sorted(payload) if key not in excluded]

    if max_items is None or not candidates or len(candidates) <= max_items:
        visible, overflow = candidates, False
    else:
        visible, overflow = candidates[: max(max_items, 0)], True

    rows = [
        f"{_escape_md(str(key))}:{_escape_md(str(payload.get(key)))}"
        for key in visible
    ]
    if overflow:
        rows.append(_escape_md("更多欄位已收斂至 AwoooP 稽核資料"))
    return rows
|
||
|
||
|
||
# =============================================================================
|
||
# Singleton
|
||
# =============================================================================
|
||
|
||
# Process-wide alerter; created lazily or installed by configure_alerter().
_alerter_instance: FailoverAlerter | None = None


def get_failover_alerter() -> FailoverAlerter:
    """Return the process-wide ``FailoverAlerter``, creating it on first use.

    Safe to call before ``configure_alerter``: the instance created here has no
    Redis client.
    """
    global _alerter_instance
    alerter = _alerter_instance
    if alerter is None:
        alerter = _alerter_instance = FailoverAlerter()
    return alerter
|
||
|
||
|
||
def configure_alerter(redis_client) -> None:
    """Install a Redis-backed alerter as the singleton.

    Meant to be called from the application lifespan once ``redis_client`` is
    ready. The Telegram gateway is not injected here; the alerter fetches it
    from ``get_telegram_gateway()`` when sending.
    """
    global _alerter_instance
    configured = FailoverAlerter(redis_client=redis_client)
    _alerter_instance = configured
    logger.info("failover_alerter_configured")
|
||
|
||
|
||
def reset_failover_alerter() -> None:
    """Drop the singleton so the next ``get_failover_alerter()`` call builds a new one (for tests)."""
    global _alerter_instance
    _alerter_instance = None
|