feat(p42): scheduler 每 15 分鐘自動 probe 三主機（不靠人開頁累積歷史）

問題： Phase 38 加了 host_health_probes 表 + 開觀測台頁面時寫一筆，但無人開頁時沒人寫 → Telegram cmd:obs_health 顯示「24h uptime」永遠空。修補： - run_scheduler.py::run_host_health_probe - 每 15 min HTTP probe GCP-A/GCP-B/111 三主機 /api/tags - 寫入 host_health_probes（label/url/healthy/unhealthy_mark/ models_count/response_ms/error_msg） - 失敗安全：HTTP/DB 失敗只 log warning - run_scheduler.py::run_host_health_probe_cleanup - 每日 03:00 DELETE 30d 前舊資料（防表膨脹） - 註冊到 schedule.every(15).minutes 與 schedule.every().day.at("03:00") 效果： - Web /observability/host_health 24h 趨勢卡永遠有資料（即使無人開頁） - Telegram cmd:obs_health 三主機在線率永遠有資料 - 三主機歷史完整保留 30 天，超出自動清理 Phase 38+39+40+41+42 觀測台戰役完整收官（7 commits）。部署驗證： - mo.wooo.work/observability/host_health → HTTP 200 / 42716 byte （Phase 38 為 39124 byte，多 3.5KB 證明 24h 趨勢/MCP/AIOps card 已上線） Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-04 19:24:07 +08:00
parent 4020b734a5
commit d5a4e27344
1 changed files with 98 additions and 0 deletions
--- a/run_scheduler.py
+++ b/run_scheduler.py
@@ -121,6 +121,12 @@ def _register_schedules():
    schedule.every().sunday.at("04:30").do(run_embed_consistency_check)
    logger.info("📅 每週日 04:30：bge-m3 跨主機一致性驗證")

+    # Phase 42: 三主機 Ollama 健康探針（即使無人開觀測台頁面也持續累積歷史）
+    schedule.every(15).minutes.do(run_host_health_probe)
+    logger.info("📅 每 15 分鐘：host_health_probe（三主機 → host_health_probes 表）")
+    schedule.every().day.at("03:00").do(run_host_health_probe_cleanup)
+    logger.info("📅 每日 03:00：host_health_probe_cleanup（清 30d 前舊資料）")
+
    # Phase 20: 成本自動節流（COST_THROTTLE_ENABLED 預設 OFF）
    schedule.every(1).hours.do(run_cost_throttle_evaluate)
    logger.info("📅 每 1 小時：cost_throttle_evaluate")
@@ -266,6 +272,98 @@ def run_cost_throttle_evaluate():
        logger.error(f"[CostThrottle] task failed: {e}", exc_info=True)


+def run_host_health_probe():
+    """Phase 42 - probe the three Ollama hosts and store one row per host.
+
+    Scheduled every 15 minutes so that health history keeps accumulating in
+    ``host_health_probes`` even when nobody opens the observability page;
+    the 24h uptime card on mo.wooo.work/observability/host_health and the
+    Telegram ``cmd:obs_health`` command both read from that table.
+
+    A host counts as healthy only when ``GET <host>/api/tags`` answers
+    HTTP 200 *and* the body parses as a JSON object. Any other outcome
+    (non-200 status, timeout, connection error, malformed body) is stored
+    as unhealthy with a short ``error_msg``.
+
+    Fail-safe: every HTTP or DB error is caught and logged here, so a broken
+    probe never propagates into the other scheduler jobs. Returns ``None``.
+    """
+    try:
+        # Imported lazily so that a missing dependency only disables this job.
+        import time as _time
+        import requests as _r
+        from sqlalchemy import text as _sa
+        from database.manager import DatabaseManager
+        from services.ollama_service import (
+            OLLAMA_HOST_PRIMARY, OLLAMA_HOST_SECONDARY, OLLAMA_HOST_FALLBACK,
+            _is_unhealthy,
+        )
+
+        records = []
+        for label, host in [
+            ('Primary (GCP)',   OLLAMA_HOST_PRIMARY),
+            ('Secondary (GCP)', OLLAMA_HOST_SECONDARY),
+            ('Fallback (111)',  OLLAMA_HOST_FALLBACK),
+        ]:
+            healthy = False
+            err = None
+            models_count = 0
+            t0 = _time.monotonic()
+            try:
+                resp = _r.get(f"{host.rstrip('/')}/api/tags", timeout=3)
+                if resp.status_code == 200:
+                    # Parse first and flag healthy last: a 200 whose body is
+                    # not a JSON object must not be stored as healthy=True
+                    # together with an error message.
+                    payload = resp.json()
+                    models_count = len(payload.get('models', []) or [])
+                    healthy = True
+                else:
+                    err = f"HTTP {resp.status_code}"
+            except Exception as e:
+                # Covers timeouts, connection errors, an unset (None) host
+                # and malformed response bodies alike.
+                err = f"{type(e).__name__}: {str(e)[:200]}"
+            # Stop the clock right after the HTTP attempt so the latency does
+            # not include the _is_unhealthy() lookup below.
+            response_ms = int((_time.monotonic() - t0) * 1000)
+            records.append({
+                'host_label': label, 'host_url': host, 'healthy': healthy,
+                'unhealthy_mark': _is_unhealthy(host),
+                'models_count': models_count,
+                'response_ms': response_ms,
+                'error_msg': err,
+            })
+
+        # Write all probe rows in a single transaction.
+        session = DatabaseManager().get_session()
+        try:
+            for rec in records:
+                session.execute(
+                    _sa("""
+                        INSERT INTO host_health_probes
+                            (host_label, host_url, healthy, unhealthy_mark,
+                             models_count, response_ms, error_msg)
+                        VALUES
+                            (:host_label, :host_url, :healthy, :unhealthy_mark,
+                             :models_count, :response_ms, :error_msg)
+                    """),
+                    rec,
+                )
+            session.commit()
+        except Exception:
+            # Discard the partial batch explicitly, then let the outer
+            # handler log the failure.
+            session.rollback()
+            raise
+        finally:
+            session.close()
+
+        unhealthy = [r['host_label'] for r in records if not r['healthy']]
+        if unhealthy:
+            logger.warning(f"[HostHealthProbe] unhealthy hosts: {unhealthy}")
+        else:
+            logger.debug("[HostHealthProbe] all 3 hosts healthy")
+    except Exception as e:
+        logger.error(f"[HostHealthProbe] failed: {e}", exc_info=True)
+
+
+def run_host_health_probe_cleanup():
+    """Phase 42 - daily 03:00 purge of ``host_health_probes`` rows older than 30 days.
+
+    Runs a single DELETE, commits it, and logs how many rows were removed.
+    Any failure is caught and logged so the scheduler keeps running.
+    Returns ``None``.
+    """
+    try:
+        from sqlalchemy import text as _sa
+        from database.manager import DatabaseManager
+
+        purge_stmt = _sa(
+            "DELETE FROM host_health_probes WHERE probed_at < NOW() - INTERVAL '30 days'"
+        )
+        db = DatabaseManager().get_session()
+        try:
+            outcome = db.execute(purge_stmt)
+            db.commit()
+            logger.info(f"[HostHealthProbe] cleanup: {outcome.rowcount} rows deleted (>30d)")
+        finally:
+            # Always hand the session back, whether or not the DELETE succeeded.
+            db.close()
+    except Exception as exc:
+        logger.error(f"[HostHealthProbe] cleanup failed: {exc}", exc_info=True)
+
+
 def run_cost_throttle_reset_if_new_month():
    """每日 00:05 — 若當天是月份第 1 天，清 throttle state（跨月重置）。"""
    try: