def run_host_health_probe():
    """Phase 42 — probe the three Ollama hosts and persist one row per host.

    Registered in ``_register_schedules`` to run every 15 minutes, so health
    history keeps accumulating in ``host_health_probes`` even when nobody has
    the observability page open. Per the original author's note, the 24h
    uptime statistics at mo.wooo.work/observability/host_health and the
    Telegram ``cmd:obs_health`` command read from this table.

    Each host is probed with ``GET <host>/api/tags`` (3 s timeout). A host is
    recorded as healthy only when it answers HTTP 200 *and* the body parses
    into a mapping from which the model list can be counted; any other
    outcome is stored with ``healthy=False`` and a short ``error_msg``.

    Fail-safe: every HTTP/DB/import error is caught and logged, so a failure
    here never propagates to the scheduler loop or to other jobs.

    Returns:
        None.
    """
    try:
        # Imports are local on purpose: an import failure is then handled by
        # the surrounding try/except instead of breaking scheduler start-up.
        import time as _time
        import requests as _r
        from sqlalchemy import text as _sa
        from database.manager import DatabaseManager
        from services.ollama_service import (
            OLLAMA_HOST_PRIMARY, OLLAMA_HOST_SECONDARY, OLLAMA_HOST_FALLBACK,
            _is_unhealthy,
        )

        targets = (
            ('Primary (GCP)', OLLAMA_HOST_PRIMARY),
            ('Secondary (GCP)', OLLAMA_HOST_SECONDARY),
            ('Fallback (111)', OLLAMA_HOST_FALLBACK),
        )

        records = []
        for label, host in targets:
            healthy = False
            err = None
            models_count = 0
            t0 = _time.monotonic()
            try:
                resp = _r.get(f"{host.rstrip('/')}/api/tags", timeout=3)
                if resp.status_code == 200:
                    # Parse first, flag healthy afterwards: a 200 whose body
                    # is not the expected JSON object must not be stored as
                    # healthy=True together with an error message.
                    payload = resp.json()
                    models_count = len(payload.get('models', []) or [])
                    healthy = True
                else:
                    err = f"HTTP {resp.status_code}"
            except Exception as e:
                # Best-effort probe: any failure (connection, timeout, bad
                # body, missing host URL) marks this host as unhealthy.
                err = f"{type(e).__name__}: {str(e)[:200]}"
            # Stop the clock right after the HTTP attempt so the latency does
            # not include the _is_unhealthy() lookup below.
            response_ms = int((_time.monotonic() - t0) * 1000)

            records.append({
                'host_label': label,
                'host_url': host,
                'healthy': healthy,
                'unhealthy_mark': _is_unhealthy(host),
                'models_count': models_count,
                'response_ms': response_ms,
                'error_msg': err,
            })

        # Batch write: all rows share one transaction and one commit.
        insert_stmt = _sa("""
            INSERT INTO host_health_probes
                (host_label, host_url, healthy, unhealthy_mark,
                 models_count, response_ms, error_msg)
            VALUES
                (:host_label, :host_url, :healthy, :unhealthy_mark,
                 :models_count, :response_ms, :error_msg)
        """)
        session = DatabaseManager().get_session()
        try:
            for rec in records:
                session.execute(insert_stmt, rec)
            session.commit()
        finally:
            session.close()

        unhealthy = [r['host_label'] for r in records if not r['healthy']]
        if unhealthy:
            logger.warning(f"[HostHealthProbe] unhealthy hosts: {unhealthy}")
        else:
            logger.debug("[HostHealthProbe] all 3 hosts healthy")
    except Exception as e:
        logger.error(f"[HostHealthProbe] failed: {e}", exc_info=True)


def run_host_health_probe_cleanup():
    """Phase 42 — delete ``host_health_probes`` rows older than 30 days.

    Registered in ``_register_schedules`` to run daily at 03:00. The cutoff is
    evaluated by the database (``NOW() - INTERVAL '30 days'``) against the
    ``probed_at`` column. Any failure is logged and swallowed so the scheduler
    keeps running.

    Returns:
        None.
    """
    try:
        from sqlalchemy import text as _sa
        from database.manager import DatabaseManager
        session = DatabaseManager().get_session()
        try:
            result = session.execute(
                _sa("DELETE FROM host_health_probes WHERE probed_at < NOW() - INTERVAL '30 days'"),
            )
            session.commit()
            logger.info(f"[HostHealthProbe] cleanup: {result.rowcount} rows deleted (>30d)")
        finally:
            session.close()
    except Exception as e:
        logger.error(f"[HostHealthProbe] cleanup failed: {e}", exc_info=True)