feat(p43): Ollama 主機 state transition 自動告警 + inline AutoHeal 閉環

問題：
Phase 42 加 scheduler 每 15min probe 寫入 host_health_probes，但只是 silent 累積 —
主機真的掛掉時統帥仍然要主動開觀測台才知道。

修補：
- run_scheduler.py::run_host_health_probe
  寫入 DB 之前先查同 host 的最近一筆 probe 比對
  state transition 偵測：
    healthy → unhealthy：推 P1 告警 + inline AutoHeal 按鈕
    unhealthy → healthy：推 P3 「已恢復」訊息
- run_scheduler.py::_push_host_transition_alert（新 helper）
  使用 services.telegram_templates::send_telegram_with_result
  inline keyboard 含「🩹 立即 AutoHeal {GCP-A|GCP-B|111}」按鈕 + 「📊 查 24h 健康統計」次按鈕
  按鈕 callback_data 對齊既有 Phase 41 cmd:obs_heal handler
- Dedup：1 小時內同 host 同方向 transition 只推一次（防 flapping 洗版）
  用 host_health_probes 自身查歷史對比，無需新 dedup 表

完整閉環：
  scheduler 每 15min probe
    → 偵測 state transition
    → 推 Telegram 告警
    → 統帥點 inline button
    → cmd:obs_heal:{label}
    → AutoHeal 跑 ADR-013 playbook
    → 寫入 incidents + heal_logs
    → 下一次 probe 偵測 unhealthy → healthy
    → 推「已恢復」訊息

至此觀測台從「raw stats dashboard」進化為：
- 持續累積歷史（Phase 42）
- 主動告警 + 一鍵修復（Phase 43）
- 完整閉環自動化（從監控到復原全自動，僅關鍵節點需人工確認）

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-04 19:26:54 +08:00
parent d5a4e27344
commit f10999ed1c
1 changed files with 108 additions and 5 deletions
--- a/run_scheduler.py
+++ b/run_scheduler.py
@@ -275,11 +275,9 @@ def run_cost_throttle_evaluate():
 def run_host_health_probe():
    """Phase 42 — 每 15 分鐘自動 probe 三主機 Ollama，寫入 host_health_probes。

-    用途：即使無人開觀測台頁面也持續累積健康歷史，
-         讓 mo.wooo.work/observability/host_health 的 24h uptime 統計
-         與 Telegram cmd:obs_health 都有資料可用。
-
-    失敗安全：HTTP/DB 失敗不影響其他 scheduler job。
+    Phase 43 增強：偵測 state transition (healthy→unhealthy / unhealthy→healthy) →
+    主動推 Telegram 告警 + inline AutoHeal 按鈕，完整「監控→告警→修復」閉環。
+    Dedup: 1 小時內同 host 同方向 transition 只推一次（防 flapping 洗版）。
    """
    try:
        import time as _time
@@ -318,6 +316,56 @@ def run_host_health_probe():
                'error_msg': err,
            })

+        # Phase 43: state transition 偵測（在寫入新筆 *之前* 查上一筆狀態）
+        transitions = []
+        try:
+            session = DatabaseManager().get_session()
+            try:
+                for rec in records:
+                    last = session.execute(
+                        _sa("""
+                            SELECT healthy, probed_at FROM host_health_probes
+                            WHERE host_label = :label
+                            ORDER BY probed_at DESC LIMIT 1
+                        """),
+                        {'label': rec['host_label']},
+                    ).fetchone()
+                    if last is None:
+                        continue  # 第一筆探針，無對比基準
+                    prev_healthy = bool(last[0])
+                    if prev_healthy != rec['healthy']:
+                        # 同方向 1h 內 dedup
+                        recent_transition = session.execute(
+                            _sa("""
+                                SELECT 1 FROM host_health_probes p1
+                                WHERE p1.host_label = :label
+                                  AND p1.probed_at >= NOW() - INTERVAL '1 hour'
+                                  AND p1.healthy = :curr_healthy
+                                  AND EXISTS (
+                                      SELECT 1 FROM host_health_probes p2
+                                      WHERE p2.host_label = :label
+                                        AND p2.probed_at < p1.probed_at
+                                        AND p2.probed_at >= NOW() - INTERVAL '90 minutes'
+                                        AND p2.healthy != :curr_healthy
+                                  )
+                                LIMIT 1
+                            """),
+                            {'label': rec['host_label'], 'curr_healthy': rec['healthy']},
+                        ).fetchone()
+                        if recent_transition is None:
+                            transitions.append({
+                                'host_label': rec['host_label'],
+                                'host_url': rec['host_url'],
+                                'prev_healthy': prev_healthy,
+                                'curr_healthy': rec['healthy'],
+                                'error_msg': rec.get('error_msg'),
+                                'response_ms': rec.get('response_ms'),
+                            })
+            finally:
+                session.close()
+        except Exception as e:
+            logger.warning(f"[HostHealthProbe] transition detect failed: {e}")
+
        # 批次寫 DB
        session = DatabaseManager().get_session()
        try:
@@ -337,6 +385,13 @@ def run_host_health_probe():
        finally:
            session.close()

+        # Phase 43: 推 Telegram 告警 / 恢復通知
+        for tr in transitions:
+            try:
+                _push_host_transition_alert(tr)
+            except Exception as e:
+                logger.error(f"[HostHealthProbe] push alert failed for {tr['host_label']}: {e}")
+
        unhealthy = [r['host_label'] for r in records if not r['healthy']]
        if unhealthy:
            logger.warning(f"[HostHealthProbe] unhealthy hosts: {unhealthy}")
@@ -346,6 +401,54 @@ def run_host_health_probe():
        logger.error(f"[HostHealthProbe] failed: {e}", exc_info=True)


+def _push_host_transition_alert(tr):
+    """Phase 43: push a Telegram message for one host health state transition.
+
+    healthy -> unhealthy: alert with inline AutoHeal and 24h-stats buttons; unhealthy ->
+    healthy: plain "recovered" notice. Reads tr['host_label'], ['host_url'], ['curr_healthy'],
+    ['response_ms'], ['error_msg']; text values are HTML-escaped (parse_mode='HTML').
+    """
+    from html import escape
+    try:
+        from services.telegram_templates import send_telegram_with_result
+    except Exception:
+        return  # best-effort: no Telegram helper available, nothing to send
+
+    label = tr['host_label']
+    # Short name used in the button text and in the obs_heal callback payload.
+    short_label = 'GCP-A' if 'Primary' in label else 'GCP-B' if 'Secondary' in label else '111'
+    safe_label = escape(str(label), quote=False)
+    safe_url = escape(str(tr['host_url']), quote=False)
+
+    if tr['curr_healthy']:
+        # Recovery notice (no buttons).
+        text = (
+            f"<b>✅ Ollama 主機已恢復</b>\n\n"
+            f"主機：<code>{safe_label}</code>\n"
+            f"網址：<code>{safe_url}</code>\n"
+            f"回應：{tr['response_ms']} ms\n\n"
+            f"<i>scheduler 每 15 分鐘自動探針偵測</i>"
+        )
+        send_telegram_with_result(text, parse_mode='HTML')
+    else:
+        # Failure alert with inline AutoHeal button; truncate first, then escape, so a raw
+        # "<...>" in the error text cannot break Telegram's HTML parsing or split an entity.
+        err_short = escape(str(tr.get('error_msg') or '無錯誤訊息')[:200], quote=False)
+        text = (
+            f"<b>🚨 Ollama 主機異常</b>\n\n"
+            f"主機：<code>{safe_label}</code>\n"
+            f"網址：<code>{safe_url}</code>\n"
+            f"錯誤：<code>{err_short}</code>\n\n"
+            f"💡 點下方按鈕一鍵觸發 ADR-013 AutoHeal Playbook，"
+            f"或至 <a href=\"https://mo.wooo.work/observability/host_health\">觀測台</a> 詳查。"
+        )
+        reply_markup = {"inline_keyboard": [
+            [{"text": f"🩹 立即 AutoHeal {short_label}", "callback_data": f"cmd:obs_heal:{short_label}"}],
+            [{"text": "📊 查 24h 健康統計", "callback_data": "cmd:obs_health"}],
+        ]}
+        send_telegram_with_result(text, reply_markup=reply_markup, parse_mode='HTML')
+
+
 def run_host_health_probe_cleanup():
    """Phase 42 — 每日 03:00 清 host_health_probes 30 天前資料。"""
    try: