From d5a4e273449541b47d13ba1d4b0af1d73551afbe Mon Sep 17 00:00:00 2001
From: OoO <ooo@MacBook-Pro.local>
Date: Mon, 4 May 2026 19:24:07 +0800
Subject: [PATCH] =?UTF-8?q?feat(p42):=20scheduler=20=E6=AF=8F=2015=20?=
 =?UTF-8?q?=E5=88=86=E9=90=98=E8=87=AA=E5=8B=95=20probe=20=E4=B8=89?=
 =?UTF-8?q?=E4=B8=BB=E6=A9=9F=EF=BC=88=E4=B8=8D=E9=9D=A0=E4=BA=BA=E9=96=8B?=
 =?UTF-8?q?=E9=A0=81=E7=B4=AF=E7=A9=8D=E6=AD=B7=E5=8F=B2=EF=BC=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

問題：
Phase 38 加了 host_health_probes 表 + 開觀測台頁面時寫一筆，但
無人開頁時沒人寫 → Telegram cmd:obs_health 顯示「24h uptime」永遠空。

修補：
- run_scheduler.py::run_host_health_probe
  - 每 15 min HTTP probe GCP-A/GCP-B/111 三主機 /api/tags
  - 寫入 host_health_probes（label/url/healthy/unhealthy_mark/
    models_count/response_ms/error_msg）
  - 失敗安全：主機探測失敗只 log warning；DB/其他例外 log error，皆不中斷 scheduler
- run_scheduler.py::run_host_health_probe_cleanup
  - 每日 03:00 DELETE 30d 前舊資料（防表膨脹）
- 註冊到 schedule.every(15).minutes 與 schedule.every().day.at("03:00")

效果：
- Web /observability/host_health 24h 趨勢卡永遠有資料（即使無人開頁）
- Telegram cmd:obs_health 三主機在線率永遠有資料
- 三主機歷史完整保留 30 天，超出自動清理

Phase 38+39+40+41+42 觀測台戰役完整收官（7 commits）。

部署驗證：
- mo.wooo.work/observability/host_health → HTTP 200 / 42716 byte
  （Phase 38 為 39124 byte，多 3.5KB 證明 24h 趨勢/MCP/AIOps card 已上線）

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 run_scheduler.py | 98 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)

diff --git a/run_scheduler.py b/run_scheduler.py
index 57d6123..e371c68 100644
--- a/run_scheduler.py
+++ b/run_scheduler.py
@@ -121,6 +121,12 @@ def _register_schedules():
     schedule.every().sunday.at("04:30").do(run_embed_consistency_check)
     logger.info("📅 每週日 04:30：bge-m3 跨主機一致性驗證")
 
+    # Phase 42: 三主機 Ollama 健康探針（即使無人開觀測台頁面也持續累積歷史）
+    schedule.every(15).minutes.do(run_host_health_probe)
+    logger.info("📅 每 15 分鐘：host_health_probe（三主機 → host_health_probes 表）")
+    schedule.every().day.at("03:00").do(run_host_health_probe_cleanup)
+    logger.info("📅 每日 03:00：host_health_probe_cleanup（清 30d 前舊資料）")
+
     # Phase 20: 成本自動節流（COST_THROTTLE_ENABLED 預設 OFF）
     schedule.every(1).hours.do(run_cost_throttle_evaluate)
     logger.info("📅 每 1 小時：cost_throttle_evaluate")
@@ -266,6 +272,98 @@ def run_cost_throttle_evaluate():
         logger.error(f"[CostThrottle] task failed: {e}", exc_info=True)
 
 
+def run_host_health_probe():
+    """Phase 42 — probe the three Ollama hosts and store results in host_health_probes.
+
+    Intended to run periodically so health history accumulates even when nobody
+    opens the observability page; GETs <host>/api/tags with a 3 s timeout and
+    inserts one row per host. A host is healthy only on HTTP 200 with a parsable body.
+
+    Fail-safe: probe errors become row data; any other exception is logged, not propagated.
+    """
+    try:
+        import time as _time
+        import requests as _r
+        from sqlalchemy import text as _sa
+        from database.manager import DatabaseManager
+        from services.ollama_service import (
+            OLLAMA_HOST_PRIMARY, OLLAMA_HOST_SECONDARY, OLLAMA_HOST_FALLBACK,
+            _is_unhealthy,
+        )
+
+        records = []
+        for label, host in [
+            ('Primary (GCP)',   OLLAMA_HOST_PRIMARY),
+            ('Secondary (GCP)', OLLAMA_HOST_SECONDARY),
+            ('Fallback (111)',  OLLAMA_HOST_FALLBACK),
+        ]:
+            t0 = _time.monotonic()
+            healthy, err, models_count = False, None, 0
+            try:
+                resp = _r.get(f"{host.rstrip('/')}/api/tags", timeout=3)
+                if resp.status_code == 200:
+                    # Parse before marking healthy: a 200 with a bad body must not count as up.
+                    models_count = len(resp.json().get('models', []) or [])
+                    healthy = True
+                else:
+                    err = f"HTTP {resp.status_code}"
+            except Exception as e:
+                # Covers network errors and unparsable bodies; message capped at 200 chars.
+                err = f"{type(e).__name__}: {str(e)[:200]}"
+            records.append({
+                'host_label': label, 'host_url': host, 'healthy': healthy,
+                'unhealthy_mark': _is_unhealthy(host),
+                'models_count': models_count,
+                'response_ms': int((_time.monotonic() - t0) * 1000),
+                'error_msg': err,
+            })
+
+        # Insert all probe rows, then commit once after the loop.
+        session = DatabaseManager().get_session()
+        try:
+            for rec in records:
+                session.execute(
+                    _sa("""
+                        INSERT INTO host_health_probes
+                            (host_label, host_url, healthy, unhealthy_mark,
+                             models_count, response_ms, error_msg)
+                        VALUES
+                            (:host_label, :host_url, :healthy, :unhealthy_mark,
+                             :models_count, :response_ms, :error_msg)
+                    """),
+                    rec,
+                )
+            session.commit()
+        finally:
+            session.close()
+
+        unhealthy = [r['host_label'] for r in records if not r['healthy']]
+        if unhealthy:
+            logger.warning(f"[HostHealthProbe] unhealthy hosts: {unhealthy}")
+        else:
+            logger.debug("[HostHealthProbe] all 3 hosts healthy")
+    except Exception as e:
+        logger.error(f"[HostHealthProbe] failed: {e}", exc_info=True)
+
+
+def run_host_health_probe_cleanup():
+    """Phase 42 — daily 03:00 job: purge host_health_probes rows older than 30 days."""
+    try:
+        from sqlalchemy import text as sql_text
+        from database.manager import DatabaseManager
+        # The retention window lives in the SQL itself (probed_at older than 30 days).
+        purge_stmt = sql_text("DELETE FROM host_health_probes WHERE probed_at < NOW() - INTERVAL '30 days'")
+        db = DatabaseManager().get_session()
+        try:
+            result = db.execute(purge_stmt)
+            db.commit()
+            logger.info(f"[HostHealthProbe] cleanup: {result.rowcount} rows deleted (>30d)")
+        finally:
+            db.close()
+    except Exception as e:
+        logger.error(f"[HostHealthProbe] cleanup failed: {e}", exc_info=True)
+
+
 def run_cost_throttle_reset_if_new_month():
     """每日 00:05 — 若當天是月份第 1 天，清 throttle state（跨月重置）。"""
     try: