diff --git a/run_scheduler.py b/run_scheduler.py
index e371c68..3cab1dc 100644
--- a/run_scheduler.py
+++ b/run_scheduler.py
@@ -275,11 +275,9 @@ def run_cost_throttle_evaluate():
def run_host_health_probe():
"""Phase 42 — 每 15 分鐘自動 probe 三主機 Ollama,寫入 host_health_probes。
- 用途:即使無人開觀測台頁面也持續累積健康歷史,
- 讓 mo.wooo.work/observability/host_health 的 24h uptime 統計
- 與 Telegram cmd:obs_health 都有資料可用。
-
- 失敗安全:HTTP/DB 失敗不影響其他 scheduler job。
+ Phase 43 增強:偵測 state transition (healthy→unhealthy / unhealthy→healthy) →
+ 主動推 Telegram 告警 + inline AutoHeal 按鈕,完整「監控→告警→修復」閉環。
+ Dedup: 1 小時內同 host 同方向 transition 只推一次(防 flapping 洗版)。
"""
try:
import time as _time
@@ -318,6 +316,56 @@ def run_host_health_probe():
'error_msg': err,
})
+ # Phase 43: state transition 偵測(在寫入新筆 *之前* 查上一筆狀態)
+ transitions = []
+ try:
+ session = DatabaseManager().get_session()
+ try:
+ for rec in records:
+ last = session.execute(
+ _sa("""
+ SELECT healthy, probed_at FROM host_health_probes
+ WHERE host_label = :label
+ ORDER BY probed_at DESC LIMIT 1
+ """),
+ {'label': rec['host_label']},
+ ).fetchone()
+ if last is None:
+ continue # 第一筆探針,無對比基準
+ prev_healthy = bool(last[0])
+ if prev_healthy != rec['healthy']:
+ # 同方向 1h 內 dedup
+ recent_transition = session.execute(
+ _sa("""
+ SELECT 1 FROM host_health_probes p1
+ WHERE p1.host_label = :label
+ AND p1.probed_at >= NOW() - INTERVAL '1 hour'
+ AND p1.healthy = :curr_healthy
+ AND EXISTS (
+ SELECT 1 FROM host_health_probes p2
+ WHERE p2.host_label = :label
+ AND p2.probed_at < p1.probed_at
+ AND p2.probed_at >= NOW() - INTERVAL '90 minutes'
+ AND p2.healthy != :curr_healthy
+ )
+ LIMIT 1
+ """),
+ {'label': rec['host_label'], 'curr_healthy': rec['healthy']},
+ ).fetchone()
+ if recent_transition is None:
+ transitions.append({
+ 'host_label': rec['host_label'],
+ 'host_url': rec['host_url'],
+ 'prev_healthy': prev_healthy,
+ 'curr_healthy': rec['healthy'],
+ 'error_msg': rec.get('error_msg'),
+ 'response_ms': rec.get('response_ms'),
+ })
+ finally:
+ session.close()
+ except Exception as e:
+ logger.warning(f"[HostHealthProbe] transition detect failed: {e}")
+
# 批次寫 DB
session = DatabaseManager().get_session()
try:
@@ -337,6 +385,13 @@ def run_host_health_probe():
finally:
session.close()
+ # Phase 43: 推 Telegram 告警 / 恢復通知
+ for tr in transitions:
+ try:
+ _push_host_transition_alert(tr)
+ except Exception as e:
+ logger.error(f"[HostHealthProbe] push alert failed for {tr['host_label']}: {e}")
+
unhealthy = [r['host_label'] for r in records if not r['healthy']]
if unhealthy:
logger.warning(f"[HostHealthProbe] unhealthy hosts: {unhealthy}")
@@ -346,6 +401,54 @@ def run_host_health_probe():
logger.error(f"[HostHealthProbe] failed: {e}", exc_info=True)
+def _push_host_transition_alert(tr):
+    """Phase 43: push a Telegram notification for one host state transition.
+
+    healthy -> unhealthy: P1 alert with an inline keyboard (an AutoHeal
+    button for the host plus a 24h health-stats button).
+    unhealthy -> healthy: P3 short "recovered" message, no buttons.
+
+    Args:
+        tr: transition dict. Reads ``host_label``, ``host_url``,
+            ``curr_healthy``, ``response_ms`` (recovery message) and the
+            optional ``error_msg`` (alert message).
+
+    Returns:
+        None. If the Telegram sender cannot be imported, a warning is
+        logged and nothing is sent.
+
+    Both messages are sent with ``parse_mode='HTML'``, so every dynamic
+    value is HTML-escaped first. Probe error text frequently contains
+    ``<``, ``>`` or ``&`` (e.g. ``<urlopen error ...>``); unescaped, such
+    text is not valid message markup and the alert would be rejected
+    precisely when a host is down.
+    """
+    from html import escape as _escape
+
+    try:
+        from services.telegram_templates import send_telegram_with_result
+    except Exception as e:
+        # Best-effort: a missing/broken sender must not break the probe job,
+        # but leave a trace instead of dropping the alert silently.
+        logger.warning(f"[HostHealthProbe] telegram sender unavailable, alert skipped: {e}")
+        return
+
+    raw_label = tr['host_label']
+    # Short code used in the button caption and callback payload; the raw
+    # (unescaped) label is used for matching.
+    short_label = (
+        'GCP-A' if 'Primary' in raw_label else
+        'GCP-B' if 'Secondary' in raw_label else
+        '111'
+    )
+
+    # Escaped copies for interpolation into the HTML-mode message body.
+    label = _escape(str(raw_label), quote=False)
+    host_url = _escape(str(tr['host_url']), quote=False)
+
+    if tr['curr_healthy']:
+        # Recovery notice (no buttons).
+        text = (
+            f"✅ Ollama 主機已恢復\n\n"
+            f"主機:{label}\n"
+            f"網址:{host_url}\n"
+            f"回應:{tr['response_ms']} ms\n\n"
+            f"scheduler 每 15 分鐘自動探針偵測"
+        )
+        send_telegram_with_result(text, parse_mode='HTML')
+    else:
+        # Outage alert + inline AutoHeal button.
+        # Truncate first, then escape, so an entity such as "&lt;" is never
+        # cut in half by the 200-character limit.
+        err_short = _escape((tr.get('error_msg') or '無錯誤訊息')[:200], quote=False)
+        text = (
+            f"🚨 Ollama 主機異常\n\n"
+            f"主機:{label}\n"
+            f"網址:{host_url}\n"
+            f"錯誤:{err_short}\n\n"
+            f"💡 點下方按鈕一鍵觸發 ADR-013 AutoHeal Playbook,"
+            f"或至 觀測台 詳查。"
+        )
+        reply_markup = {
+            "inline_keyboard": [
+                [{"text": f"🩹 立即 AutoHeal {short_label}", "callback_data": f"cmd:obs_heal:{short_label}"}],
+                [{"text": "📊 查 24h 健康統計", "callback_data": "cmd:obs_health"}],
+            ],
+        }
+        send_telegram_with_result(text, reply_markup=reply_markup, parse_mode='HTML')
+
+
def run_host_health_probe_cleanup():
"""Phase 42 — 每日 03:00 清 host_health_probes 30 天前資料。"""
try: