From d5a4e273449541b47d13ba1d4b0af1d73551afbe Mon Sep 17 00:00:00 2001 From: OoO Date: Mon, 4 May 2026 19:24:07 +0800 Subject: [PATCH] =?UTF-8?q?feat(p42):=20scheduler=20=E6=AF=8F=2015=20?= =?UTF-8?q?=E5=88=86=E9=90=98=E8=87=AA=E5=8B=95=20probe=20=E4=B8=89?= =?UTF-8?q?=E4=B8=BB=E6=A9=9F=EF=BC=88=E4=B8=8D=E9=9D=A0=E4=BA=BA=E9=96=8B?= =?UTF-8?q?=E9=A0=81=E7=B4=AF=E7=A9=8D=E6=AD=B7=E5=8F=B2=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 問題: Phase 38 加了 host_health_probes 表 + 開觀測台頁面時寫一筆,但 無人開頁時沒人寫 → Telegram cmd:obs_health 顯示「24h uptime」永遠空。 修補: - run_scheduler.py::run_host_health_probe - 每 15 min HTTP probe GCP-A/GCP-B/111 三主機 /api/tags - 寫入 host_health_probes(label/url/healthy/unhealthy_mark/ models_count/response_ms/error_msg) - 失敗安全:HTTP/DB 失敗只 log warning - run_scheduler.py::run_host_health_probe_cleanup - 每日 03:00 DELETE 30d 前舊資料(防表膨脹) - 註冊到 schedule.every(15).minutes 與 schedule.every().day.at("03:00") 效果: - Web /observability/host_health 24h 趨勢卡永遠有資料(即使無人開頁) - Telegram cmd:obs_health 三主機在線率永遠有資料 - 三主機歷史完整保留 30 天,超出自動清理 Phase 38+39+40+41+42 觀測台戰役完整收官(7 commits)。 部署驗證: - mo.wooo.work/observability/host_health → HTTP 200 / 42716 byte (Phase 38 為 39124 byte,多 3.5KB 證明 24h 趨勢/MCP/AIOps card 已上線) Co-Authored-By: Claude Opus 4.7 (1M context) --- run_scheduler.py | 98 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/run_scheduler.py b/run_scheduler.py index 57d6123..e371c68 100644 --- a/run_scheduler.py +++ b/run_scheduler.py @@ -121,6 +121,12 @@ def _register_schedules(): schedule.every().sunday.at("04:30").do(run_embed_consistency_check) logger.info("📅 每週日 04:30:bge-m3 跨主機一致性驗證") + # Phase 42: 三主機 Ollama 健康探針(即使無人開觀測台頁面也持續累積歷史) + schedule.every(15).minutes.do(run_host_health_probe) + logger.info("📅 每 15 分鐘:host_health_probe(三主機 → host_health_probes 表)") + schedule.every().day.at("03:00").do(run_host_health_probe_cleanup) + logger.info("📅 每日 03:00:host_health_probe_cleanup(清 30d 前舊資料)") + # Phase 20: 成本自動節流(COST_THROTTLE_ENABLED 預設 OFF) schedule.every(1).hours.do(run_cost_throttle_evaluate) logger.info("📅 每 1 小時:cost_throttle_evaluate") @@ -266,6 +272,98 @@ def run_cost_throttle_evaluate(): logger.error(f"[CostThrottle] task failed: {e}", exc_info=True) +def run_host_health_probe(): + """Phase 42 — 每 15 分鐘自動 probe 三主機 Ollama,寫入 host_health_probes。 + + 用途:即使無人開觀測台頁面也持續累積健康歷史, + 讓 mo.wooo.work/observability/host_health 的 24h uptime 統計 + 與 Telegram cmd:obs_health 都有資料可用。 + + 失敗安全:HTTP/DB 失敗不影響其他 scheduler job。 + """ + try: + import time as _time + import requests as _r + from sqlalchemy import text as _sa + from database.manager import DatabaseManager + from services.ollama_service import ( + OLLAMA_HOST_PRIMARY, OLLAMA_HOST_SECONDARY, OLLAMA_HOST_FALLBACK, + _is_unhealthy, + ) + + records = [] + for label, host in [ + ('Primary (GCP)', OLLAMA_HOST_PRIMARY), + ('Secondary (GCP)', OLLAMA_HOST_SECONDARY), + ('Fallback (111)', OLLAMA_HOST_FALLBACK), + ]: + t0 = _time.monotonic() + healthy = False + err = None + models_count = 0 + try: + resp = _r.get(f"{host.rstrip('/')}/api/tags", timeout=3) + if resp.status_code == 200: + healthy = True + models_count = len(resp.json().get('models', []) or []) + else: + err = f"HTTP {resp.status_code}" + except Exception as e: + err = f"{type(e).__name__}: {str(e)[:200]}" + records.append({ + 'host_label': label, 'host_url': host, 'healthy': healthy, + 'unhealthy_mark': _is_unhealthy(host), + 'models_count': models_count, + 'response_ms': int((_time.monotonic() - t0) * 1000), + 'error_msg': err, + }) + + # 批次寫 DB + session = DatabaseManager().get_session() + try: + for rec in records: + session.execute( + _sa(""" + INSERT INTO host_health_probes + (host_label, host_url, healthy, unhealthy_mark, + models_count, response_ms, error_msg) + VALUES + (:host_label, :host_url, :healthy, :unhealthy_mark, + :models_count, :response_ms, :error_msg) + """), + rec, + ) + session.commit() + finally: + session.close() + + unhealthy = [r['host_label'] for r in records if not r['healthy']] + if unhealthy: + logger.warning(f"[HostHealthProbe] unhealthy hosts: {unhealthy}") + else: + logger.debug("[HostHealthProbe] all 3 hosts healthy") + except Exception as e: + logger.error(f"[HostHealthProbe] failed: {e}", exc_info=True) + + +def run_host_health_probe_cleanup(): + """Phase 42 — 每日 03:00 清 host_health_probes 30 天前資料。""" + try: + from sqlalchemy import text as _sa + from database.manager import DatabaseManager + session = DatabaseManager().get_session() + try: + result = session.execute( + _sa("DELETE FROM host_health_probes WHERE probed_at < NOW() - INTERVAL '30 days'"), + ) + session.commit() + logger.info(f"[HostHealthProbe] cleanup: {result.rowcount} rows deleted (>30d)") + finally: + session.close() + except Exception as e: + logger.error(f"[HostHealthProbe] cleanup failed: {e}", exc_info=True) + + def run_cost_throttle_reset_if_new_month(): """每日 00:05 — 若當天是月份第 1 天,清 throttle state(跨月重置)。""" try: