From 5785a584c4a9c4e3cf0f821045e6271f5d7bdd6b Mon Sep 17 00:00:00 2001 From: OoO Date: Wed, 13 May 2026 09:37:22 +0800 Subject: [PATCH] =?UTF-8?q?=E8=A3=9C=E9=BD=8A=20scheduler=20=E8=A7=80?= =?UTF-8?q?=E6=B8=AC=E4=BB=BB=E5=8B=99=E5=A4=B1=E6=95=97=E5=91=8A=E8=AD=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- run_scheduler.py | 28 +++++++++++++++++++ tests/test_run_scheduler_embed_consistency.py | 14 ++++++++++ 2 files changed, 42 insertions(+) diff --git a/run_scheduler.py b/run_scheduler.py index d7f8079..6a51459 100644 --- a/run_scheduler.py +++ b/run_scheduler.py @@ -477,6 +477,13 @@ def run_host_health_probe(): logger.debug("[HostHealthProbe] all 3 hosts healthy") except Exception as e: logger.error(f"[HostHealthProbe] failed: {e}", exc_info=True) + _notify_scheduler_failure( + "run_host_health_probe", + e, + source="Scheduler.HostHealth", + event_type="host_health_probe_failure", + title="三主機健康探針失敗", + ) def _push_host_transition_alert(tr): @@ -605,6 +612,13 @@ def run_ai_calls_error_spike_check(): ) except Exception as e: logger.error(f"[AICallsErrorSpike] failed: {e}", exc_info=True) + _notify_scheduler_failure( + "run_ai_calls_error_spike_check", + e, + source="Scheduler.Observability", + event_type="ai_calls_error_spike_check_failure", + title="AI 呼叫錯誤率檢查失敗", + ) def run_observability_daily_summary(): @@ -751,6 +765,13 @@ def run_observability_daily_summary(): logger.info("[ObservabilityDaily] summary pushed to Telegram") except Exception as e: logger.error(f"[ObservabilityDaily] failed: {e}", exc_info=True) + _notify_scheduler_failure( + "run_observability_daily_summary", + e, + source="Scheduler.Observability", + event_type="observability_daily_summary_failure", + title="觀測台每日摘要失敗", + ) def run_host_health_probe_cleanup(): @@ -769,6 +790,13 @@ def run_host_health_probe_cleanup(): session.close() except Exception as e: logger.error(f"[HostHealthProbe] cleanup failed: {e}", exc_info=True) + _notify_scheduler_failure( + "run_host_health_probe_cleanup", + e, + source="Scheduler.HostHealth", + event_type="host_health_probe_cleanup_failure", + title="主機健康探針清理失敗", + ) def run_cost_throttle_reset_if_new_month(): diff --git a/tests/test_run_scheduler_embed_consistency.py b/tests/test_run_scheduler_embed_consistency.py index b2ad338..6819eae 100644 --- a/tests/test_run_scheduler_embed_consistency.py +++ b/tests/test_run_scheduler_embed_consistency.py @@ -1,4 +1,5 @@ import importlib +import inspect def _load_run_scheduler(monkeypatch): @@ -94,3 +95,16 @@ def test_notify_scheduler_failure_without_active_exception_uses_error_trace(monk ) assert calls[0]["trace"] == "RuntimeError: mismatch" + + +def test_scheduler_observability_wrappers_notify_on_exception(monkeypatch): + run_scheduler = _load_run_scheduler(monkeypatch) + + for fn_name in [ + "run_host_health_probe", + "run_ai_calls_error_spike_check", + "run_observability_daily_summary", + "run_host_health_probe_cleanup", + ]: + source = inspect.getsource(getattr(run_scheduler, fn_name)) + assert "_notify_scheduler_failure(" in source