feat(p42): scheduler 每 15 分鐘自動 probe 三主機(不靠人開頁累積歷史)
Some checks failed
CD Pipeline / deploy (push) Has been cancelled
Some checks failed
CD Pipeline / deploy (push) Has been cancelled
問題:
Phase 38 加了 host_health_probes 表 + 開觀測台頁面時寫一筆,但
無人開頁時沒人寫 → Telegram cmd:obs_health 顯示「24h uptime」永遠空。
修補:
- run_scheduler.py::run_host_health_probe
  - 每 15 min HTTP probe GCP-A/GCP-B/111 三主機 /api/tags
  - 寫入 host_health_probes(label/url/healthy/unhealthy_mark/
    models_count/response_ms/error_msg)
  - 失敗安全:主機不健康只 log warning;DB 或其他例外 log error,皆不影響其他 scheduler job
- run_scheduler.py::run_host_health_probe_cleanup
  - 每日 03:00 DELETE 30d 前舊資料(防表膨脹)
- 註冊到 schedule.every(15).minutes 與 schedule.every().day.at("03:00")
效果:
- Web /observability/host_health 24h 趨勢卡永遠有資料(即使無人開頁)
- Telegram cmd:obs_health 三主機在線率永遠有資料
- 三主機歷史完整保留 30 天,超出自動清理
Phase 38+39+40+41+42 觀測台戰役完整收官(7 commits)。
部署驗證:
- mo.wooo.work/observability/host_health → HTTP 200 / 42716 byte
(Phase 38 為 39124 byte,多 3.5KB 證明 24h 趨勢/MCP/AIOps card 已上線)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -121,6 +121,12 @@ def _register_schedules():
|
||||
schedule.every().sunday.at("04:30").do(run_embed_consistency_check)
|
||||
logger.info("📅 每週日 04:30:bge-m3 跨主機一致性驗證")
|
||||
|
||||
# Phase 42: 三主機 Ollama 健康探針(即使無人開觀測台頁面也持續累積歷史)
|
||||
schedule.every(15).minutes.do(run_host_health_probe)
|
||||
logger.info("📅 每 15 分鐘:host_health_probe(三主機 → host_health_probes 表)")
|
||||
schedule.every().day.at("03:00").do(run_host_health_probe_cleanup)
|
||||
logger.info("📅 每日 03:00:host_health_probe_cleanup(清 30d 前舊資料)")
|
||||
|
||||
# Phase 20: 成本自動節流(COST_THROTTLE_ENABLED 預設 OFF)
|
||||
schedule.every(1).hours.do(run_cost_throttle_evaluate)
|
||||
logger.info("📅 每 1 小時:cost_throttle_evaluate")
|
||||
@@ -266,6 +272,98 @@ def run_cost_throttle_evaluate():
|
||||
logger.error(f"[CostThrottle] task failed: {e}", exc_info=True)
|
||||
|
||||
|
||||
def run_host_health_probe():
    """Phase 42 — probe the three Ollama hosts and store one row per host.

    Registered by the scheduler to run every 15 minutes, so that the
    ``host_health_probes`` table keeps accumulating history even when nobody
    opens the observability page. Per the original notes, the 24h uptime
    statistics on mo.wooo.work/observability/host_health and the Telegram
    ``cmd:obs_health`` command rely on that history.

    For every host a ``GET <host>/api/tags`` is issued with a 3 second
    timeout. A host is recorded as healthy only when the response is HTTP 200
    *and* its body could be parsed; in every other case ``healthy`` is False
    and ``error_msg`` carries the reason (HTTP status or exception summary,
    truncated to 200 characters of message text).

    Fail-safe: per-host HTTP errors are captured into the record, and any
    other failure (imports, DB write) is caught and logged at error level, so
    this job never raises into the scheduler.

    Returns:
        None.
    """
    try:
        import time as _time
        import requests as _r
        from sqlalchemy import text as _sa
        from database.manager import DatabaseManager
        from services.ollama_service import (
            OLLAMA_HOST_PRIMARY, OLLAMA_HOST_SECONDARY, OLLAMA_HOST_FALLBACK,
            _is_unhealthy,
        )

        records = []
        for label, host in [
            ('Primary (GCP)', OLLAMA_HOST_PRIMARY),
            ('Secondary (GCP)', OLLAMA_HOST_SECONDARY),
            ('Fallback (111)', OLLAMA_HOST_FALLBACK),
        ]:
            t0 = _time.monotonic()
            healthy = False
            err = None
            models_count = 0
            try:
                resp = _r.get(f"{host.rstrip('/')}/api/tags", timeout=3)
                if resp.status_code == 200:
                    # Parse the body BEFORE flagging the host healthy: if the
                    # payload is not the expected JSON object, the except
                    # branch below records the error and the row must not
                    # claim healthy=True alongside a non-null error_msg.
                    models_count = len(resp.json().get('models', []) or [])
                    healthy = True
                else:
                    err = f"HTTP {resp.status_code}"
            except Exception as e:
                err = f"{type(e).__name__}: {str(e)[:200]}"

            # Take the timing right after the HTTP attempt so response_ms
            # covers only the probe, not the _is_unhealthy() lookup below.
            response_ms = int((_time.monotonic() - t0) * 1000)

            records.append({
                'host_label': label, 'host_url': host, 'healthy': healthy,
                'unhealthy_mark': _is_unhealthy(host),
                'models_count': models_count,
                'response_ms': response_ms,
                'error_msg': err,
            })

        # Batch write: all rows go in one transaction, committed once.
        insert_stmt = _sa("""
            INSERT INTO host_health_probes
                (host_label, host_url, healthy, unhealthy_mark,
                 models_count, response_ms, error_msg)
            VALUES
                (:host_label, :host_url, :healthy, :unhealthy_mark,
                 :models_count, :response_ms, :error_msg)
        """)
        session = DatabaseManager().get_session()
        try:
            for rec in records:
                session.execute(insert_stmt, rec)
            session.commit()
        finally:
            # Always release the session, even if an insert or commit failed.
            session.close()

        unhealthy = [r['host_label'] for r in records if not r['healthy']]
        if unhealthy:
            logger.warning(f"[HostHealthProbe] unhealthy hosts: {unhealthy}")
        else:
            logger.debug("[HostHealthProbe] all 3 hosts healthy")
    except Exception as e:
        # Top-level boundary for a scheduled job: log with traceback, never raise.
        logger.error(f"[HostHealthProbe] failed: {e}", exc_info=True)
|
||||
|
||||
|
||||
def run_host_health_probe_cleanup():
    """Phase 42 — daily 03:00 purge of ``host_health_probes`` rows older than 30 days.

    Issues a single DELETE against ``host_health_probes`` for rows whose
    ``probed_at`` is more than 30 days before the database's ``NOW()``,
    commits, and logs how many rows were removed. Any failure is caught and
    logged at error level so the scheduler keeps running.

    Returns:
        None.
    """
    try:
        from sqlalchemy import text as sql_text
        from database.manager import DatabaseManager

        purge_stmt = sql_text(
            "DELETE FROM host_health_probes WHERE probed_at < NOW() - INTERVAL '30 days'"
        )
        db = DatabaseManager().get_session()
        try:
            outcome = db.execute(purge_stmt)
            db.commit()
            # rowcount is read after the commit, same as before the restyle.
            deleted = outcome.rowcount
            logger.info(f"[HostHealthProbe] cleanup: {deleted} rows deleted (>30d)")
        finally:
            # Release the session whether or not the delete succeeded.
            db.close()
    except Exception as exc:
        # Scheduled-job boundary: report with traceback, do not propagate.
        logger.error(f"[HostHealthProbe] cleanup failed: {exc}", exc_info=True)
|
||||
|
||||
|
||||
def run_cost_throttle_reset_if_new_month():
|
||||
"""每日 00:05 — 若當天是月份第 1 天,清 throttle state(跨月重置)。"""
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user