From f10999ed1c8c026ddeff1cb0b4699efa7273b664 Mon Sep 17 00:00:00 2001
From: OoO <ooo@MacBook-Pro.local>
Date: Mon, 4 May 2026 19:26:54 +0800
Subject: [PATCH] =?UTF-8?q?feat(p43):=20Ollama=20=E4=B8=BB=E6=A9=9F=20stat?=
 =?UTF-8?q?e=20transition=20=E8=87=AA=E5=8B=95=E5=91=8A=E8=AD=A6=20+=20inl?=
 =?UTF-8?q?ine=20AutoHeal=20=E9=96=89=E7=92=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

問題：
Phase 42 加 scheduler 每 15min probe 寫入 host_health_probes，但只是
silent 累積 — 主機真的掛掉時統帥仍然要主動開觀測台才知道。

修補：
- run_scheduler.py::run_host_health_probe
  寫入 DB 之前先查同 host 的最近一筆 probe 比對
  state transition 偵測：
    healthy → unhealthy：推 P1 告警 + inline AutoHeal 按鈕
    unhealthy → healthy：推 P3 「已恢復」訊息
- run_scheduler.py::_push_host_transition_alert（新 helper）
  使用 services.telegram_templates::send_telegram_with_result
  inline keyboard 含「🩹 立即 AutoHeal {GCP-A|GCP-B|111}」按鈕
  + 「📊 查 24h 健康統計」次按鈕
  按鈕 callback_data 對齊既有 Phase 41 cmd:obs_heal handler
- Dedup：1 小時內同 host 同方向 transition 只推一次（防 flapping 洗版）
  用 host_health_probes 自身查歷史對比，無需新 dedup 表

完整閉環：
  scheduler 每 15min probe → 偵測 state transition → 推 Telegram 告警
  → 統帥點 inline button → cmd:obs_heal:{label} → AutoHeal 跑 ADR-013
  playbook → 寫入 incidents + heal_logs → 下一次 probe 偵測 unhealthy→
  healthy → 推「已恢復」訊息

至此觀測台從「raw stats dashboard」進化為：
  - 持續累積歷史（Phase 42）
  - 主動告警 + 一鍵修復（Phase 43）
  - 完整閉環自動化（從監控到復原全自動，僅關鍵節點需人工確認）

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 run_scheduler.py | 113 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 108 insertions(+), 5 deletions(-)

diff --git a/run_scheduler.py b/run_scheduler.py
index e371c68..3cab1dc 100644
--- a/run_scheduler.py
+++ b/run_scheduler.py
@@ -275,11 +275,9 @@ def run_cost_throttle_evaluate():
 def run_host_health_probe():
     """Phase 42 — 每 15 分鐘自動 probe 三主機 Ollama，寫入 host_health_probes。
 
-    用途：即使無人開觀測台頁面也持續累積健康歷史，
-         讓 mo.wooo.work/observability/host_health 的 24h uptime 統計
-         與 Telegram cmd:obs_health 都有資料可用。
-
-    失敗安全：HTTP/DB 失敗不影響其他 scheduler job。
+    Phase 43 增強：偵測 state transition (healthy→unhealthy / unhealthy→healthy) →
+    主動推 Telegram 告警 + inline AutoHeal 按鈕，完整「監控→告警→修復」閉環。
+    Dedup: 1 小時內同 host 同方向 transition 只推一次（防 flapping 洗版）。
     """
     try:
         import time as _time
@@ -318,6 +316,56 @@ def run_host_health_probe():
                 'error_msg': err,
             })
 
+        # Phase 43: state transition 偵測（在寫入新筆 *之前* 查上一筆狀態）
+        transitions = []
+        try:
+            session = DatabaseManager().get_session()
+            try:
+                for rec in records:
+                    last = session.execute(
+                        _sa("""
+                            SELECT healthy, probed_at FROM host_health_probes
+                            WHERE host_label = :label
+                            ORDER BY probed_at DESC LIMIT 1
+                        """),
+                        {'label': rec['host_label']},
+                    ).fetchone()
+                    if last is None:
+                        continue  # 第一筆探針，無對比基準
+                    prev_healthy = bool(last[0])
+                    if prev_healthy != rec['healthy']:
+                        # 同方向 1h 內 dedup
+                        recent_transition = session.execute(
+                            _sa("""
+                                SELECT 1 FROM host_health_probes p1
+                                WHERE p1.host_label = :label
+                                  AND p1.probed_at >= NOW() - INTERVAL '1 hour'
+                                  AND p1.healthy = :curr_healthy
+                                  AND EXISTS (
+                                      SELECT 1 FROM host_health_probes p2
+                                      WHERE p2.host_label = :label
+                                        AND p2.probed_at < p1.probed_at
+                                        AND p2.probed_at >= NOW() - INTERVAL '90 minutes'
+                                        AND p2.healthy != :curr_healthy
+                                  )
+                                LIMIT 1
+                            """),
+                            {'label': rec['host_label'], 'curr_healthy': rec['healthy']},
+                        ).fetchone()
+                        if recent_transition is None:
+                            transitions.append({
+                                'host_label': rec['host_label'],
+                                'host_url': rec['host_url'],
+                                'prev_healthy': prev_healthy,
+                                'curr_healthy': rec['healthy'],
+                                'error_msg': rec.get('error_msg'),
+                                'response_ms': rec.get('response_ms'),
+                            })
+            finally:
+                session.close()
+        except Exception as e:
+            logger.warning(f"[HostHealthProbe] transition detect failed: {e}")
+
         # 批次寫 DB
         session = DatabaseManager().get_session()
         try:
@@ -337,6 +385,13 @@ def run_host_health_probe():
         finally:
             session.close()
 
+        # Phase 43: 推 Telegram 告警 / 恢復通知
+        for tr in transitions:
+            try:
+                _push_host_transition_alert(tr)
+            except Exception as e:
+                logger.error(f"[HostHealthProbe] push alert failed for {tr['host_label']}: {e}")
+
         unhealthy = [r['host_label'] for r in records if not r['healthy']]
         if unhealthy:
             logger.warning(f"[HostHealthProbe] unhealthy hosts: {unhealthy}")
@@ -346,6 +401,54 @@ def run_host_health_probe():
         logger.error(f"[HostHealthProbe] failed: {e}", exc_info=True)
 
 
+def _push_host_transition_alert(tr):
+    """Send a Telegram message for one Ollama host state transition (Phase 43).
+
+    tr is a dict with host_label, host_url, curr_healthy, response_ms, error_msg.
+    healthy -> unhealthy: alert with inline AutoHeal and 24h-stats buttons.
+    unhealthy -> healthy: "recovered" note without buttons.
+    Dynamic values are HTML-escaped because messages use parse_mode='HTML'.
+    """
+    from html import escape as _esc
+    try:
+        from services.telegram_templates import send_telegram_with_result
+    except Exception as e:
+        logger.warning(f"[HostHealthProbe] telegram helper unavailable: {e}")
+        return
+
+    label = tr['host_label']
+    short_label = 'GCP-A' if 'Primary' in label else 'GCP-B' if 'Secondary' in label else '111'
+    # Unescaped '<', '>' or '&' in these values would make the HTML message invalid.
+    host_html = _esc(str(label), quote=False)
+    url_html = _esc(str(tr['host_url']), quote=False)
+
+    if tr['curr_healthy']:
+        text = (
+            f"<b>✅ Ollama 主機已恢復</b>\n\n"
+            f"主機：<code>{host_html}</code>\n"
+            f"網址：<code>{url_html}</code>\n"
+            f"回應：{tr['response_ms']} ms\n\n"
+            f"<i>scheduler 每 15 分鐘自動探針偵測</i>"
+        )
+        send_telegram_with_result(text, parse_mode='HTML')
+    else:
+        # Truncate before escaping so an entity such as &lt; is never cut in half.
+        err_html = _esc(str(tr.get('error_msg') or '無錯誤訊息')[:200], quote=False)
+        text = (
+            f"<b>🚨 Ollama 主機異常</b>\n\n"
+            f"主機：<code>{host_html}</code>\n"
+            f"網址：<code>{url_html}</code>\n"
+            f"錯誤：<code>{err_html}</code>\n\n"
+            f"💡 點下方按鈕一鍵觸發 ADR-013 AutoHeal Playbook，"
+            f"或至 <a href=\"https://mo.wooo.work/observability/host_health\">觀測台</a> 詳查。"
+        )
+        reply_markup = {"inline_keyboard": [
+            [{"text": f"🩹 立即 AutoHeal {short_label}", "callback_data": f"cmd:obs_heal:{short_label}"}],
+            [{"text": "📊 查 24h 健康統計", "callback_data": "cmd:obs_health"}],
+        ]}
+        send_telegram_with_result(text, reply_markup=reply_markup, parse_mode='HTML')
+
+
 def run_host_health_probe_cleanup():
     """Phase 42 — 每日 03:00 清 host_health_probes 30 天前資料。"""
     try: