feat(p42): scheduler 每 15 分鐘自動 probe 三主機(不靠人開頁累積歷史)
Some checks failed
CD Pipeline / deploy (push) Has been cancelled

問題:
Phase 38 加了 host_health_probes 表 + 開觀測台頁面時寫一筆,但
無人開頁時沒人寫 → Telegram cmd:obs_health 顯示「24h uptime」永遠空。

修補:
- run_scheduler.py::run_host_health_probe
  - 每 15 min HTTP probe GCP-A/GCP-B/111 三主機 /api/tags
  - 寫入 host_health_probes(label/url/healthy/unhealthy_mark/
    models_count/response_ms/error_msg)
  - 失敗安全:HTTP 失敗記為 unhealthy 並 log warning;DB 或其他例外 log error,皆不影響其他 scheduler job
- run_scheduler.py::run_host_health_probe_cleanup
  - 每日 03:00 DELETE 30d 前舊資料(防表膨脹)
- 註冊到 schedule.every(15).minutes 與 schedule.every().day.at("03:00")

效果:
- Web /observability/host_health 24h 趨勢卡永遠有資料(即使無人開頁)
- Telegram cmd:obs_health 三主機在線率永遠有資料
- 三主機歷史完整保留 30 天,超出自動清理

Phase 38+39+40+41+42 觀測台戰役完整收官(7 commits)。

部署驗證:
- mo.wooo.work/observability/host_health → HTTP 200 / 42716 byte
  (Phase 38 為 39124 byte,多 3.5KB 證明 24h 趨勢/MCP/AIOps card 已上線)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
OoO
2026-05-04 19:24:07 +08:00
parent 4020b734a5
commit d5a4e27344

View File

@@ -121,6 +121,12 @@ def _register_schedules():
schedule.every().sunday.at("04:30").do(run_embed_consistency_check)
logger.info("📅 每週日 04:30bge-m3 跨主機一致性驗證")
# Phase 42: 三主機 Ollama 健康探針(即使無人開觀測台頁面也持續累積歷史)
schedule.every(15).minutes.do(run_host_health_probe)
logger.info("📅 每 15 分鐘host_health_probe三主機 → host_health_probes 表)")
schedule.every().day.at("03:00").do(run_host_health_probe_cleanup)
logger.info("📅 每日 03:00host_health_probe_cleanup清 30d 前舊資料)")
# Phase 20: 成本自動節流COST_THROTTLE_ENABLED 預設 OFF
schedule.every(1).hours.do(run_cost_throttle_evaluate)
logger.info("📅 每 1 小時cost_throttle_evaluate")
@@ -266,6 +272,98 @@ def run_cost_throttle_evaluate():
logger.error(f"[CostThrottle] task failed: {e}", exc_info=True)
def run_host_health_probe():
    """Phase 42 — probe the three Ollama hosts and record the results.

    Sends ``GET <host>/api/tags`` (3 second timeout) to the primary,
    secondary and fallback Ollama hosts, then inserts one row per host into
    ``host_health_probes``. Running this from the scheduler lets health
    history accumulate even when nobody opens the observability page, so
    the 24h uptime statistics on mo.wooo.work/observability/host_health and
    the Telegram ``cmd:obs_health`` command have data to work with.

    A host is recorded as healthy only when it answers HTTP 200 *and* the
    body parses as a JSON object; every other outcome is stored with
    ``healthy=False`` and a short ``error_msg``, so a row never carries
    both ``healthy=True`` and an error.

    Fail-safe: any exception (imports, HTTP, DB) is logged and swallowed so
    other scheduler jobs are not affected. Returns ``None``.
    """
    try:
        import time as _time
        import requests as _r
        from sqlalchemy import text as _sa
        from database.manager import DatabaseManager
        from services.ollama_service import (
            OLLAMA_HOST_PRIMARY, OLLAMA_HOST_SECONDARY, OLLAMA_HOST_FALLBACK,
            _is_unhealthy,
        )

        records = []
        for label, host in [
            ('Primary (GCP)', OLLAMA_HOST_PRIMARY),
            ('Secondary (GCP)', OLLAMA_HOST_SECONDARY),
            ('Fallback (111)', OLLAMA_HOST_FALLBACK),
        ]:
            healthy = False
            err = None
            models_count = 0
            t0 = _time.monotonic()
            try:
                resp = _r.get(f"{host.rstrip('/')}/api/tags", timeout=3)
                if resp.status_code == 200:
                    # Parse the body first: only flag the host healthy once
                    # the payload is known to be usable.
                    models_count = len(resp.json().get('models', []) or [])
                    healthy = True
                else:
                    err = f"HTTP {resp.status_code}"
            except Exception as e:
                # Connection errors, timeouts, a missing host value and
                # malformed bodies all land here and mark the host down.
                err = f"{type(e).__name__}: {str(e)[:200]}"
            # Take the latency reading right after the HTTP attempt so it
            # does not include the _is_unhealthy() lookup below.
            response_ms = int((_time.monotonic() - t0) * 1000)
            records.append({
                'host_label': label,
                'host_url': host,
                'healthy': healthy,
                'unhealthy_mark': _is_unhealthy(host),
                'models_count': models_count,
                'response_ms': response_ms,
                'error_msg': err,
            })

        # Build the statement once; it is identical for every record.
        insert_stmt = _sa("""
            INSERT INTO host_health_probes
                (host_label, host_url, healthy, unhealthy_mark,
                 models_count, response_ms, error_msg)
            VALUES
                (:host_label, :host_url, :healthy, :unhealthy_mark,
                 :models_count, :response_ms, :error_msg)
        """)

        # Write all rows in a single transaction.
        session = DatabaseManager().get_session()
        try:
            for rec in records:
                session.execute(insert_stmt, rec)
            session.commit()
        finally:
            session.close()

        unhealthy = [r['host_label'] for r in records if not r['healthy']]
        if unhealthy:
            logger.warning(f"[HostHealthProbe] unhealthy hosts: {unhealthy}")
        else:
            logger.debug("[HostHealthProbe] all 3 hosts healthy")
    except Exception as e:
        logger.error(f"[HostHealthProbe] failed: {e}", exc_info=True)
def run_host_health_probe_cleanup():
    """Phase 42 — daily 03:00 purge of ``host_health_probes`` rows older than 30 days.

    Issues a single DELETE, commits it, and logs how many rows were
    removed. Any exception is logged and swallowed; returns ``None``.
    """
    try:
        from sqlalchemy import text as _sa
        from database.manager import DatabaseManager

        purge_stmt = _sa(
            "DELETE FROM host_health_probes WHERE probed_at < NOW() - INTERVAL '30 days'"
        )
        db = DatabaseManager().get_session()
        try:
            outcome = db.execute(purge_stmt)
            db.commit()
            logger.info(f"[HostHealthProbe] cleanup: {outcome.rowcount} rows deleted (>30d)")
        finally:
            # Always release the session, whether or not the DELETE succeeded.
            db.close()
    except Exception as exc:
        logger.error(f"[HostHealthProbe] cleanup failed: {exc}", exc_info=True)
def run_cost_throttle_reset_if_new_month():
"""每日 00:05 — 若當天是月份第 1 天,清 throttle state跨月重置"""
try: