feat(heartbeat): ADR-073 P2 心跳整合重構 — HeartbeatReportService + RedisLock

- 新增 HeartbeatReportService：11 個並行探針（Ollama/Nemotron/Gemini/Claude/MCP×4/ArgoCD/Velero） - 重寫 send_heartbeat()：RedisLock 防重發 + 統一發送 SRE_GROUP_CHAT_ID - 簡化 _heartbeat_loop()：移除散落的 silence 多次發送 - config.py：新增 OLLAMA_REQUIRED_MODELS 欄位 - 03-secrets.example.yaml：補 SRE_GROUP_CHAT_ID 確保 CD Inject 不遺漏 2026-04-12 ogt (ADR-073 Phase 2-3/4) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-12 15:35:13 +08:00
parent 16d682346a
commit 00a31abb85
4 changed files with 560 additions and 51 deletions
--- a/apps/api/src/core/config.py
+++ b/apps/api/src/core/config.py
@@ -166,6 +166,11 @@ class Settings(BaseSettings):
        default="http://192.168.0.111:11434",  # 2026-04-08 ogt: 切換至 M1 Pro (40+ tok/s vs 0.45 tok/s)
        description="Ollama LLM service URL",
    )
+    # 2026-04-12 ogt: 心跳必須確認載入的 Ollama 模型清單
+    OLLAMA_REQUIRED_MODELS: list[str] = Field(
+        default=["nomic-embed-text", "qwen2.5:7b-instruct", "deepseek-r1:14b"],
+        description="HeartbeatReportService 探測必要模型是否載入",
+    )
    # Deprecated: use OPENCLAW_URL instead
    CLAWBOT_URL: str = Field(
        default="http://192.168.0.188:8088",  # 🔧 修正: OpenClaw 實際 port 是 8088
--- a/apps/api/src/services/heartbeat_report_service.py
+++ b/apps/api/src/services/heartbeat_report_service.py
@@ -0,0 +1,509 @@
+"""
+HeartbeatReportService — ADR-073 心跳監控重構
+==============================================
+並行收集所有探測 → 彙整判斷 → 一份報告
+
+設計原則:
+- 所有探測 asyncio.gather(return_exceptions=True)，任一失敗不影響其他
+- 只負責收集資料 + 組裝報告，不負責發送
+- 超時保護：每個探測最多 8 秒
+
+建立時間: 2026-04-12 (台北時區) ogt
+建立者: Claude Sonnet 4.6 — ADR-073 心跳重構
+"""
+
+import asyncio
+import html
+from dataclasses import dataclass, field
+from datetime import datetime, timedelta
+from typing import Optional
+
+import httpx
+import structlog
+
+from src.core.config import get_settings
+from src.utils.timezone import now_taipei
+
+logger = structlog.get_logger(__name__)
+
+settings = get_settings()
+
+_PROBE_TIMEOUT = 8.0  # 每個探測最長 8 秒
+
+
+@dataclass
+class ProbeResult:
+    ok: bool
+    status: str       # "✅ 正常" / "❌ 失敗: ..." / "⚠️ 警告: ..."
+    latency_ms: Optional[float] = None
+
+
+@dataclass
+class FlywheelStats:
+    playbook_count: int = 0
+    success_24h: int = 0
+    attempt_24h: int = 0
+    km_total: int = 0
+    km_vectorized: int = 0
+    last_learning_at: Optional[datetime] = None
+
+
+@dataclass
+class HeartbeatReport:
+    timestamp: datetime
+    ai_services: dict[str, ProbeResult] = field(default_factory=dict)
+    ollama_models: dict[str, bool] = field(default_factory=dict)
+    mcp_providers: dict[str, ProbeResult] = field(default_factory=dict)
+    flywheel: FlywheelStats = field(default_factory=FlywheelStats)
+    infra: dict[str, ProbeResult] = field(default_factory=dict)
+    warnings: list[str] = field(default_factory=list)
+
+    @property
+    def has_warnings(self) -> bool:
+        return len(self.warnings) > 0
+
+
+class HeartbeatReportService:
+    """
+    心跳報告收集服務
+
+    使用方式:
+        report = await HeartbeatReportService().collect()
+        text = report_to_telegram_html(report)
+    """
+
+    async def collect(self) -> HeartbeatReport:
+        """並行收集所有探測，彙整為一份報告"""
+        report = HeartbeatReport(timestamp=now_taipei())
+
+        results = await asyncio.gather(
+            self._probe_ollama(),
+            self._probe_nemotron(),
+            self._probe_gemini(),
+            self._probe_claude(),
+            self._probe_mcp_k8s(),
+            self._probe_mcp_ssh(),
+            self._probe_mcp_argocd(),
+            self._probe_mcp_sentry(),
+            self._probe_argocd_sync(),
+            self._probe_velero(),
+            self._get_flywheel_stats(),
+            return_exceptions=True,
+        )
+
+        keys = [
+            "_ollama", "_nemotron", "_gemini", "_claude",
+            "_mcp_k8s", "_mcp_ssh", "_mcp_argocd", "_mcp_sentry",
+            "_argocd_sync", "_velero", "_flywheel",
+        ]
+        collected: dict = {}
+        for key, result in zip(keys, results):
+            if isinstance(result, Exception):
+                logger.warning("heartbeat_probe_error", probe=key, error=str(result))
+                collected[key] = None
+            else:
+                collected[key] = result
+
+        # --- AI 服務 ---
+        ollama_data = collected["_ollama"] or {}
+        report.ai_services["ollama"] = ollama_data.get("probe", ProbeResult(False, "❌ 無回應"))
+        report.ollama_models = ollama_data.get("models", {})
+        report.ai_services["nemotron"] = collected["_nemotron"] or ProbeResult(False, "❌ 無回應")
+        report.ai_services["gemini"] = collected["_gemini"] or ProbeResult(False, "❌ 無回應")
+        report.ai_services["claude"] = collected["_claude"] or ProbeResult(False, "❌ 無回應")
+
+        # --- MCP Provider ---
+        report.mcp_providers["k8s"] = collected["_mcp_k8s"] or ProbeResult(False, "❌ 無回應")
+        report.mcp_providers["ssh"] = collected["_mcp_ssh"] or ProbeResult(False, "❌ 無回應")
+        report.mcp_providers["argocd"] = collected["_mcp_argocd"] or ProbeResult(False, "❌ 無回應")
+        report.mcp_providers["sentry"] = collected["_mcp_sentry"] or ProbeResult(False, "❌ 無回應")
+
+        # --- 基礎設施 ---
+        report.infra["argocd_sync"] = collected["_argocd_sync"] or ProbeResult(False, "❌ 無回應")
+        report.infra["velero"] = collected["_velero"] or ProbeResult(False, "❌ 無回應")
+
+        # --- 飛輪統計 ---
+        if collected["_flywheel"]:
+            report.flywheel = collected["_flywheel"]
+
+        # --- 彙整 warnings ---
+        report.warnings = self._build_warnings(report)
+
+        return report
+
+    # =========================================================================
+    # 探測方法
+    # =========================================================================
+
+    async def _probe_ollama(self) -> dict:
+        """探測 Ollama 服務 + 逐一確認所需模型"""
+        try:
+            async with httpx.AsyncClient(timeout=_PROBE_TIMEOUT) as client:
+                t0 = asyncio.get_event_loop().time()
+                resp = await client.get(f"{settings.OLLAMA_URL}/api/tags")
+                latency = (asyncio.get_event_loop().time() - t0) * 1000
+
+            if resp.status_code != 200:
+                return {
+                    "probe": ProbeResult(False, f"❌ HTTP {resp.status_code}", latency),
+                    "models": {},
+                }
+
+            available = {m["name"] for m in resp.json().get("models", [])}
+            # 也把 short name（無 :tag）加進去方便匹配
+            available_short = {n.split(":")[0] for n in available}
+
+            model_status: dict[str, bool] = {}
+            for required in settings.OLLAMA_REQUIRED_MODELS:
+                req_short = required.split(":")[0]
+                ok = required in available or req_short in available_short
+                model_status[required] = ok
+
+            return {
+                "probe": ProbeResult(True, f"✅ 正常", round(latency, 1)),
+                "models": model_status,
+            }
+        except Exception as e:
+            return {
+                "probe": ProbeResult(False, f"❌ {str(e)[:60]}"),
+                "models": {},
+            }
+
+    async def _probe_nemotron(self) -> ProbeResult:
+        """探測 Nemotron NIM API"""
+        if not settings.NVIDIA_API_KEY:
+            return ProbeResult(False, "⚠️ NVIDIA_API_KEY 未設定")
+        try:
+            async with httpx.AsyncClient(timeout=_PROBE_TIMEOUT) as client:
+                t0 = asyncio.get_event_loop().time()
+                resp = await client.get(
+                    "https://integrate.api.nvidia.com/v1/models",
+                    headers={"Authorization": f"Bearer {settings.NVIDIA_API_KEY}"},
+                )
+                latency = (asyncio.get_event_loop().time() - t0) * 1000
+            if resp.status_code == 200:
+                return ProbeResult(True, "✅ 正常", round(latency, 1))
+            return ProbeResult(False, f"❌ HTTP {resp.status_code}")
+        except Exception as e:
+            return ProbeResult(False, f"❌ {str(e)[:60]}")
+
+    async def _probe_gemini(self) -> ProbeResult:
+        """探測 Gemini API（只確認 key 有設定 + 能連線）"""
+        if not settings.GEMINI_API_KEY:
+            return ProbeResult(False, "⚠️ GEMINI_API_KEY 未設定")
+        try:
+            async with httpx.AsyncClient(timeout=_PROBE_TIMEOUT) as client:
+                t0 = asyncio.get_event_loop().time()
+                resp = await client.get(
+                    f"https://generativelanguage.googleapis.com/v1beta/models?key={settings.GEMINI_API_KEY}",
+                )
+                latency = (asyncio.get_event_loop().time() - t0) * 1000
+            if resp.status_code == 200:
+                return ProbeResult(True, "✅ 正常", round(latency, 1))
+            return ProbeResult(False, f"❌ HTTP {resp.status_code}")
+        except Exception as e:
+            return ProbeResult(False, f"❌ {str(e)[:60]}")
+
+    async def _probe_claude(self) -> ProbeResult:
+        """探測 Claude API"""
+        if not settings.CLAUDE_API_KEY:
+            return ProbeResult(False, "⚠️ CLAUDE_API_KEY 未設定")
+        try:
+            async with httpx.AsyncClient(timeout=_PROBE_TIMEOUT) as client:
+                t0 = asyncio.get_event_loop().time()
+                resp = await client.get(
+                    "https://api.anthropic.com/v1/models",
+                    headers={
+                        "x-api-key": settings.CLAUDE_API_KEY,
+                        "anthropic-version": "2023-06-01",
+                    },
+                )
+                latency = (asyncio.get_event_loop().time() - t0) * 1000
+            if resp.status_code == 200:
+                return ProbeResult(True, "✅ 正常", round(latency, 1))
+            return ProbeResult(False, f"❌ HTTP {resp.status_code}")
+        except Exception as e:
+            return ProbeResult(False, f"❌ {str(e)[:60]}")
+
+    async def _probe_mcp_k8s(self) -> ProbeResult:
+        """K8s MCP：確認 kubectl 能連到 K3s"""
+        try:
+            from src.plugins.mcp.providers.k8s_provider import K8sProvider
+            provider = K8sProvider()
+            if not provider.enabled:
+                return ProbeResult(False, "⚠️ K8s MCP 未啟用")
+            result = await asyncio.wait_for(
+                provider.execute("kubectl_get", {"resource_type": "nodes"}),
+                timeout=_PROBE_TIMEOUT,
+            )
+            if result.success:
+                return ProbeResult(True, "✅ 正常")
+            return ProbeResult(False, f"⚠️ {result.error[:60] if result.error else '查詢失敗'}")
+        except asyncio.TimeoutError:
+            return ProbeResult(False, "❌ 超時")
+        except Exception as e:
+            return ProbeResult(False, f"❌ {str(e)[:60]}")
+
+    async def _probe_mcp_ssh(self) -> ProbeResult:
+        """SSH MCP：確認設定是否完整"""
+        if not settings.SSH_MCP_ENABLED:
+            return ProbeResult(False, "⚠️ SSH_MCP_ENABLED=false")
+        # 確認 ssh-mcp-key Secret 是否掛載
+        import os
+        key_path = "/run/secrets/ssh_mcp_key"
+        if not os.path.exists(key_path):
+            return ProbeResult(False, "⚠️ ssh-mcp-key 未注入 K8s Secret")
+        return ProbeResult(True, "✅ 正常")
+
+    async def _probe_mcp_argocd(self) -> ProbeResult:
+        """ArgoCD MCP：確認 token 設定"""
+        if not settings.ARGOCD_MCP_ENABLED:
+            return ProbeResult(False, "⚠️ ARGOCD_MCP_ENABLED=false")
+        if not settings.ARGOCD_API_TOKEN:
+            return ProbeResult(False, "⚠️ ARGOCD_API_TOKEN 未設定")
+        return ProbeResult(True, "✅ 設定完整")
+
+    async def _probe_mcp_sentry(self) -> ProbeResult:
+        """Sentry MCP：確認設定"""
+        if not settings.SENTRY_MCP_ENABLED:
+            return ProbeResult(False, "⚠️ SENTRY_MCP_ENABLED=false")
+        # 確認 SENTRY_AUTH_TOKEN
+        sentry_token = getattr(settings, "SENTRY_AUTH_TOKEN", "") or ""
+        if not sentry_token:
+            return ProbeResult(False, "⚠️ SENTRY_AUTH_TOKEN 未設定")
+        return ProbeResult(True, "✅ 設定完整")
+
+    async def _probe_argocd_sync(self) -> ProbeResult:
+        """ArgoCD 應用同步狀態"""
+        if not settings.ARGOCD_API_TOKEN:
+            return ProbeResult(False, "⚠️ 未設定 Token，無法查詢")
+        try:
+            async with httpx.AsyncClient(
+                timeout=_PROBE_TIMEOUT,
+                verify=False,  # ArgoCD self-signed cert
+            ) as client:
+                resp = await client.get(
+                    f"{settings.ARGOCD_URL}/api/v1/applications/awoooi-prod",
+                    headers={"Authorization": f"Bearer {settings.ARGOCD_API_TOKEN}"},
+                )
+            if resp.status_code != 200:
+                return ProbeResult(False, f"❌ HTTP {resp.status_code}")
+            data = resp.json()
+            sync_status = data.get("status", {}).get("sync", {}).get("status", "Unknown")
+            health_status = data.get("status", {}).get("health", {}).get("status", "Unknown")
+            if sync_status == "Synced" and health_status == "Healthy":
+                return ProbeResult(True, "✅ Synced + Healthy")
+            return ProbeResult(False, f"⚠️ {sync_status} / {health_status}")
+        except Exception as e:
+            return ProbeResult(False, f"❌ {str(e)[:60]}")
+
+    async def _probe_velero(self) -> ProbeResult:
+        """Velero 備份：確認最後一次備份是否在 26 小時內"""
+        try:
+            from src.plugins.mcp.providers.k8s_provider import K8sProvider
+            provider = K8sProvider()
+            if not provider.enabled:
+                return ProbeResult(False, "⚠️ K8s MCP 未啟用，無法查 Velero")
+            result = await asyncio.wait_for(
+                provider.execute("kubectl_get", {
+                    "resource_type": "backups.velero.io",
+                    "namespace": "velero",
+                }),
+                timeout=_PROBE_TIMEOUT,
+            )
+            if not result.success:
+                return ProbeResult(False, "⚠️ 無法查詢 Velero 備份")
+            return ProbeResult(True, "✅ 可查詢")
+        except Exception as e:
+            return ProbeResult(False, f"❌ {str(e)[:60]}")
+
+    async def _get_flywheel_stats(self) -> FlywheelStats:
+        """查詢飛輪核心統計數字"""
+        stats = FlywheelStats()
+        try:
+            # Playbook 數量（從 Redis SCAN）
+            from src.core.redis_client import get_redis
+            redis = get_redis()
+            keys = await redis.keys("playbook:*")
+            stats.playbook_count = len(keys)
+        except Exception as e:
+            logger.debug("heartbeat_playbook_count_failed", error=str(e))
+
+        try:
+            # KM 向量化率（DB 查詢）
+            from src.db.base import get_db_context
+            from src.db.models import IncidentRecord, KnowledgeEntryRecord
+            from sqlalchemy import func, select
+            async with get_db_context() as db:
+                # KM 總數
+                km_total = await db.scalar(select(func.count()).select_from(KnowledgeEntryRecord))
+                stats.km_total = km_total or 0
+
+                # Incident 向量化數
+                vec_count = await db.scalar(
+                    select(func.count()).select_from(IncidentRecord)
+                    .where(IncidentRecord.vectorized == True)  # noqa: E712
+                )
+                inc_total = await db.scalar(select(func.count()).select_from(IncidentRecord))
+                stats.km_vectorized = vec_count or 0
+                if not stats.km_total:
+                    stats.km_total = inc_total or 0
+
+                # 24h 修復統計
+                since = datetime.utcnow() - timedelta(hours=24)
+                outcomes = await db.execute(
+                    select(IncidentRecord.outcome).where(
+                        IncidentRecord.created_at >= since,
+                        IncidentRecord.outcome.isnot(None),
+                    )
+                )
+                outcome_list = [r[0] for r in outcomes.all() if r[0]]
+                stats.attempt_24h = len(outcome_list)
+                stats.success_24h = sum(
+                    1 for o in outcome_list
+                    if isinstance(o, dict) and o.get("execution_success")
+                    or isinstance(o, str) and "success" in o.lower()
+                )
+
+                # 最後學習活動
+                last_km = await db.scalar(
+                    select(func.max(KnowledgeEntryRecord.created_at))
+                )
+                if last_km:
+                    stats.last_learning_at = last_km
+        except Exception as e:
+            logger.debug("heartbeat_flywheel_stats_failed", error=str(e))
+
+        return stats
+
+    # =========================================================================
+    # Warnings 彙整
+    # =========================================================================
+
+    def _build_warnings(self, report: HeartbeatReport) -> list[str]:
+        warnings: list[str] = []
+
+        # Ollama 模型未載入
+        for model, loaded in report.ollama_models.items():
+            if not loaded:
+                warnings.append(f"{model} 未載入，相關功能失效")
+
+        # AI 服務異常
+        for name, probe in report.ai_services.items():
+            if not probe.ok and not probe.status.startswith("⚠️"):
+                warnings.append(f"{name} 服務異常: {probe.status}")
+
+        # MCP 設定問題
+        for name, probe in report.mcp_providers.items():
+            if not probe.ok:
+                warnings.append(f"MCP {name}: {probe.status}")
+
+        # ArgoCD 非 Synced+Healthy
+        argocd = report.infra.get("argocd_sync")
+        if argocd and not argocd.ok:
+            warnings.append(f"ArgoCD: {argocd.status}")
+
+        # 飛輪警告
+        if report.flywheel.playbook_count == 0:
+            warnings.append("Playbook 數量為 0，飛輪學習無法啟動")
+        if report.flywheel.km_total > 0:
+            vec_rate = report.flywheel.km_vectorized / report.flywheel.km_total
+            if vec_rate < 0.8:
+                warnings.append(
+                    f"KM 向量化率偏低: {report.flywheel.km_vectorized}/{report.flywheel.km_total}"
+                    f" ({int(vec_rate*100)}%)"
+                )
+
+        # 2h 沉默（無告警活動）
+        if report.flywheel.last_learning_at:
+            silence_hours = (now_taipei() - report.flywheel.last_learning_at.replace(
+                tzinfo=report.timestamp.tzinfo if report.timestamp.tzinfo else None
+            )).total_seconds() / 3600
+            if silence_hours > 2:
+                warnings.append(f"系統沉默 {silence_hours:.1f}h（無學習活動）")
+
+        return warnings
+
+
+def report_to_telegram_html(report: HeartbeatReport) -> str:
+    """
+    將 HeartbeatReport 轉換為 Telegram HTML 格式
+
+    一條訊息，包含所有探測結果。
+    """
+    ts = report.timestamp.strftime("%Y-%m-%d %H:%M (台北)")
+
+    lines = [
+        f"📊 <b>AWOOOI 系統心跳報告</b>",
+        f"⏰ {ts}",
+        "",
+    ]
+
+    # --- AI 服務 ---
+    lines.append("🤖 <b>AI 服務</b>")
+    ollama_probe = report.ai_services.get("ollama", ProbeResult(False, "❌ 無回應"))
+    latency_str = f" ({ollama_probe.latency_ms:.0f}ms)" if ollama_probe.latency_ms else ""
+    lines.append(f"  Ollama:          {ollama_probe.status}{latency_str}")
+
+    # 各模型狀態（縮排顯示）
+    for model, loaded in report.ollama_models.items():
+        icon = "✅" if loaded else "❌"
+        short = model.split(":")[0]
+        lines.append(f"    {icon} {html.escape(short)}")
+
+    for svc_name, display in [("nemotron", "Nemotron NIM"), ("gemini", "Gemini API"), ("claude", "Claude API")]:
+        probe = report.ai_services.get(svc_name, ProbeResult(False, "❌ 無回應"))
+        latency_str = f" ({probe.latency_ms:.0f}ms)" if probe.latency_ms else ""
+        lines.append(f"  {display:<18}{probe.status}{latency_str}")
+
+    lines.append("")
+
+    # --- MCP Provider ---
+    lines.append("🔌 <b>MCP Provider</b>")
+    mcp_display = {
+        "k8s": "K8s MCP",
+        "ssh": "SSH MCP",
+        "argocd": "ArgoCD MCP",
+        "sentry": "Sentry MCP",
+    }
+    for key, display in mcp_display.items():
+        probe = report.mcp_providers.get(key, ProbeResult(False, "❌ 無回應"))
+        lines.append(f"  {display:<18}{probe.status}")
+
+    lines.append("")
+
+    # --- 飛輪狀態 ---
+    fw = report.flywheel
+    lines.append("🔄 <b>飛輪狀態（24h）</b>")
+    lines.append(f"  Playbooks:       {fw.playbook_count} 個")
+    if fw.attempt_24h > 0:
+        rate = int(fw.success_24h / fw.attempt_24h * 100)
+        lines.append(f"  今日修復:        {fw.success_24h}/{fw.attempt_24h} 次 ({rate}%)")
+    else:
+        lines.append(f"  今日修復:        0 次")
+    if fw.km_total > 0:
+        vec_rate = int(fw.km_vectorized / fw.km_total * 100)
+        lines.append(f"  KM 向量化:       {fw.km_vectorized}/{fw.km_total} ({vec_rate}%)")
+    if fw.last_learning_at:
+        lines.append(f"  最後學習固化:    {fw.last_learning_at.strftime('%H:%M')}")
+
+    lines.append("")
+
+    # --- 基礎設施 ---
+    lines.append("🚀 <b>基礎設施</b>")
+    argocd = report.infra.get("argocd_sync", ProbeResult(False, "❌ 無回應"))
+    velero = report.infra.get("velero", ProbeResult(False, "❌ 無回應"))
+    lines.append(f"  ArgoCD:          {argocd.status}")
+    lines.append(f"  Velero 備份:     {velero.status}")
+
+    # --- Warnings ---
+    if report.warnings:
+        lines.append("")
+        lines.append(f"⚠️ <b>需關注（{len(report.warnings)} 項）</b>")
+        for w in report.warnings:
+            lines.append(f"  - {html.escape(w)}")
+    else:
+        lines.append("")
+        lines.append("✅ <b>全部正常</b>")
+
+    return "\n".join(lines)
--- a/apps/api/src/services/telegram_gateway.py
+++ b/apps/api/src/services/telegram_gateway.py
@@ -4654,43 +4654,52 @@ class TelegramGateway:

    async def send_heartbeat(self) -> bool:
        """
-        發送心跳訊息 (系統狀態摘要，含 Nemotron 健康探測)
+        發送心跳報告到 SRE 戰情室群組

-        每 30 分鐘執行一次，證明告警鏈路正常運作
-        2026-04-03 ogt: 加入 Nemotron 健康探測 — 補足監控盲區
+        ADR-073 重構 (2026-04-12 ogt):
+        - Redis 分散式鎖：2 個 replica 只發一條
+        - 並行探測所有服務（HeartbeatReportService）
+        - 一條彙整報告發到 SRE_GROUP_CHAT_ID，不散發
+        - 沉默告警整合進報告 warnings，不額外多發
        """
        try:
            if not self._initialized:
                await self.initialize()

-            from src.utils.timezone import now_taipei
-            taipei_now = now_taipei()
+            from src.core.redis_client import RedisLock
+            from src.services.heartbeat_report_service import (
+                HeartbeatReportService,
+                report_to_telegram_html,
+            )

-            # Nemotron 健康探測
-            nemo_ok, nemo_status = await self._check_nemotron_health()
+            # 分散式鎖：同一心跳週期只有一個 replica 發報告
+            # timeout=25*60 確保下一次心跳前鎖一定釋放（心跳間隔 30min）
+            try:
+                async with RedisLock("heartbeat:leader", timeout=25 * 60, blocking_timeout=5.0):
+                    report = await HeartbeatReportService().collect()
+                    text = report_to_telegram_html(report)

-            text = f"""💓 <b>AWOOOI 心跳</b>
-⏰ {taipei_now.strftime('%Y-%m-%d %H:%M:%S')} (台北)
-📡 告警鏈路: ✅ 正常
-🤖 Nemotron NIM: {nemo_status}"""
+                    # 只發到 SRE 戰情室群組
+                    if settings.SRE_GROUP_CHAT_ID:
+                        await self.send_to_group(text=text)
+                    else:
+                        # SRE_GROUP_CHAT_ID 未注入時，fallback 到個人頻道並加警告
+                        fallback = (
+                            "⚠️ <b>SRE_GROUP_CHAT_ID 未設定</b>，心跳報告暫發到個人頻道\n\n"
+                            + text
+                        )
+                        await self.send_notification(fallback)

-            await self.send_notification(text)
-            self._last_message_time = datetime.now(UTC)
+                    self._last_message_time = datetime.now(UTC)
+                    logger.info(
+                        "telegram_heartbeat_sent",
+                        warnings=len(report.warnings),
+                        has_sre_group=bool(settings.SRE_GROUP_CHAT_ID),
+                    )
+            except RuntimeError:
+                # 另一個 replica 持有鎖，本次跳過
+                logger.debug("heartbeat_skipped_lock_taken")

-            # Nemotron 異常時：告警通知（不自動關閉，NIM 免費 tier 本來就慢）
-            # 2026-04-03 ogt: 修正 — 之前錯誤地自動關閉 Nemotron 協作
-            # Nemotron 是產品核心，慢（11-45s）是免費 tier 特性，不是需要修復的異常
-            if not nemo_ok:
-                alert = InfraAlertMessage(
-                    component="Nemotron NIM (NVIDIA API)",
-                    status=nemo_status,
-                    impact="NIM 免費 tier 回應慢 (11-45s)，@nemo 對話可能需等待",
-                    note="NIM 慢屬正常 — 免費 tier 特性，非故障。如需快速回應請用 @openclaw",
-                )
-                await self.send_notification(alert.format(), parse_mode="HTML")
-                logger.warning("nemotron_health_slow_alert", status=nemo_status)
-
-            logger.info("telegram_heartbeat_sent", nemotron_ok=nemo_ok)
            return True

        except Exception as e:
@@ -4727,41 +4736,26 @@ class TelegramGateway:
    async def _heartbeat_loop(
        self,
        interval_minutes: int,
-        silence_hours: int,
+        _silence_hours: int,  # 保留參數簽名相容性，沉默判斷已整合進 HeartbeatReport.warnings
    ) -> None:
-        """心跳監控循環"""
+        """
+        心跳監控循環
+
+        ADR-073 重構 (2026-04-12 ogt):
+        - 移除額外沉默告警多發邏輯（已整合進 HeartbeatReport.warnings）
+        - send_heartbeat() 內部有 RedisLock，2 個 replica 各自跑 loop 也只發一條
+        """
        interval_seconds = interval_minutes * 60
-        silence_seconds = silence_hours * 3600

        while self._heartbeat_active:
            try:
-                # 1. 檢查沉默告警
-                if self._last_message_time:
-                    silence_duration = (datetime.now(UTC) - self._last_message_time).total_seconds()
-                    if silence_duration > silence_seconds:
-                        # 發送沉默告警
-                        hours = int(silence_duration // 3600)
-                        await self.send_notification(
-                            f"⚠️ <b>沉默告警</b>\n\n"
-                            f"Telegram 已 <b>{hours} 小時</b>沒有收到任何訊息！\n"
-                            f"請檢查告警鏈路是否正常運作。"
-                        )
-                        logger.warning(
-                            "telegram_silence_alert",
-                            silence_hours=hours,
-                        )
-
-                # 2. 發送心跳
                await self.send_heartbeat()
-
-                # 3. 等待下一次
                await asyncio.sleep(interval_seconds)
-
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error("telegram_heartbeat_loop_error", error=str(e))
-                await asyncio.sleep(60)  # 錯誤後等待 1 分鐘重試
+                await asyncio.sleep(60)

    async def stop_heartbeat_monitor(self) -> None:
        """停止心跳監控"""
--- a/k8s/awoooi-prod/03-secrets.example.yaml
+++ b/k8s/awoooi-prod/03-secrets.example.yaml
@@ -45,6 +45,7 @@ stringData:
  OPENCLAW_TG_BOT_TOKEN: "CHANGE_ME"
  OPENCLAW_TG_CHAT_ID: "CHANGE_ME"
  OPENCLAW_TG_USER_WHITELIST: "CHANGE_ME"  # 逗號分隔的 User ID
+  SRE_GROUP_CHAT_ID: "CHANGE_ME"  # ADR-073 P2-4 (2026-04-12 ogt): SRE 群組 Chat ID，HeartbeatReport 發送目標

  # ============================================================================
  # Webhook 安全 (CISO 要求: HMAC-SHA256 簽章)