補齊 scheduler 觀測任務失敗告警
All checks were successful
CD Pipeline / deploy (push) Successful in 57s

This commit is contained in:
OoO
2026-05-13 09:37:22 +08:00
parent 34db2db5fd
commit 5785a584c4
2 changed files with 42 additions and 0 deletions

View File

@@ -477,6 +477,13 @@ def run_host_health_probe():
logger.debug("[HostHealthProbe] all 3 hosts healthy")
except Exception as e:
logger.error(f"[HostHealthProbe] failed: {e}", exc_info=True)
_notify_scheduler_failure(
"run_host_health_probe",
e,
source="Scheduler.HostHealth",
event_type="host_health_probe_failure",
title="三主機健康探針失敗",
)
def _push_host_transition_alert(tr):
@@ -605,6 +612,13 @@ def run_ai_calls_error_spike_check():
)
except Exception as e:
logger.error(f"[AICallsErrorSpike] failed: {e}", exc_info=True)
_notify_scheduler_failure(
"run_ai_calls_error_spike_check",
e,
source="Scheduler.Observability",
event_type="ai_calls_error_spike_check_failure",
title="AI 呼叫錯誤率檢查失敗",
)
def run_observability_daily_summary():
@@ -751,6 +765,13 @@ def run_observability_daily_summary():
logger.info("[ObservabilityDaily] summary pushed to Telegram")
except Exception as e:
logger.error(f"[ObservabilityDaily] failed: {e}", exc_info=True)
_notify_scheduler_failure(
"run_observability_daily_summary",
e,
source="Scheduler.Observability",
event_type="observability_daily_summary_failure",
title="觀測台每日摘要失敗",
)
def run_host_health_probe_cleanup():
@@ -769,6 +790,13 @@ def run_host_health_probe_cleanup():
session.close()
except Exception as e:
logger.error(f"[HostHealthProbe] cleanup failed: {e}", exc_info=True)
_notify_scheduler_failure(
"run_host_health_probe_cleanup",
e,
source="Scheduler.HostHealth",
event_type="host_health_probe_cleanup_failure",
title="主機健康探針清理失敗",
)
def run_cost_throttle_reset_if_new_month():

View File

@@ -1,4 +1,5 @@
import importlib
import inspect
def _load_run_scheduler(monkeypatch):
@@ -94,3 +95,16 @@ def test_notify_scheduler_failure_without_active_exception_uses_error_trace(monk
)
assert calls[0]["trace"] == "RuntimeError: mismatch"
def test_scheduler_observability_wrappers_notify_on_exception(monkeypatch):
    """Check that each observability wrapper references the failure notifier.

    This is a static check: the source text of every listed wrapper is
    retrieved with ``inspect.getsource`` and searched for a
    ``_notify_scheduler_failure(`` call. The wrappers are not executed.
    """
    scheduler_module = _load_run_scheduler(monkeypatch)
    wrapper_names = (
        "run_host_health_probe",
        "run_ai_calls_error_spike_check",
        "run_observability_daily_summary",
        "run_host_health_probe_cleanup",
    )
    for wrapper_name in wrapper_names:
        wrapper = getattr(scheduler_module, wrapper_name)
        wrapper_source = inspect.getsource(wrapper)
        # Stop at the first wrapper whose source lacks the notifier call.
        assert "_notify_scheduler_failure(" in wrapper_source