From f10999ed1c8c026ddeff1cb0b4699efa7273b664 Mon Sep 17 00:00:00 2001 From: OoO Date: Mon, 4 May 2026 19:26:54 +0800 Subject: [PATCH] =?UTF-8?q?feat(p43):=20Ollama=20=E4=B8=BB=E6=A9=9F=20stat?= =?UTF-8?q?e=20transition=20=E8=87=AA=E5=8B=95=E5=91=8A=E8=AD=A6=20+=20inl?= =?UTF-8?q?ine=20AutoHeal=20=E9=96=89=E7=92=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 問題: Phase 42 加 scheduler 每 15min probe 寫入 host_health_probes,但只是 silent 累積 — 主機真的掛掉時統帥仍然要主動開觀測台才知道。 修補: - run_scheduler.py::run_host_health_probe 寫入 DB 之前先查同 host 的最近一筆 probe 比對 state transition 偵測: healthy → unhealthy:推 P1 告警 + inline AutoHeal 按鈕 unhealthy → healthy:推 P3 「已恢復」訊息 - run_scheduler.py::_push_host_transition_alert(新 helper) 使用 services.telegram_templates::send_telegram_with_result inline keyboard 含「🩹 立即 AutoHeal {GCP-A|GCP-B|111}」按鈕 + 「📊 查 24h 健康統計」次按鈕 按鈕 callback_data 對齊既有 Phase 41 cmd:obs_heal handler - Dedup:1 小時內同 host 同方向 transition 只推一次(防 flapping 洗版) 用 host_health_probes 自身查歷史對比,無需新 dedup 表 完整閉環: scheduler 每 15min probe → 偵測 state transition → 推 Telegram 告警 → 統帥點 inline button → cmd:obs_heal:{label} → AutoHeal 跑 ADR-013 playbook → 寫入 incidents + heal_logs → 下一次 probe 偵測 unhealthy→ healthy → 推「已恢復」訊息 至此觀測台從「raw stats dashboard」進化為: - 持續累積歷史(Phase 42) - 主動告警 + 一鍵修復(Phase 43) - 完整閉環自動化(從監控到復原全自動,僅關鍵節點需人工確認) Co-Authored-By: Claude Opus 4.7 (1M context) --- run_scheduler.py | 113 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 108 insertions(+), 5 deletions(-) diff --git a/run_scheduler.py b/run_scheduler.py index e371c68..3cab1dc 100644 --- a/run_scheduler.py +++ b/run_scheduler.py @@ -275,11 +275,9 @@ def run_cost_throttle_evaluate(): def run_host_health_probe(): """Phase 42 — 每 15 分鐘自動 probe 三主機 Ollama,寫入 host_health_probes。 - 用途:即使無人開觀測台頁面也持續累積健康歷史, - 讓 mo.wooo.work/observability/host_health 的 24h uptime 統計 - 與 Telegram cmd:obs_health 都有資料可用。 - - 失敗安全:HTTP/DB 失敗不影響其他 scheduler job。 + Phase 43 增強:偵測 state transition 
(healthy→unhealthy / unhealthy→healthy) → + 主動推 Telegram 告警 + inline AutoHeal 按鈕,完整「監控→告警→修復」閉環。 + Dedup: 1 小時內同 host 同方向 transition 只推一次(防 flapping 洗版)。 """ try: import time as _time @@ -318,6 +316,56 @@ def run_host_health_probe(): 'error_msg': err, }) + # Phase 43: state transition 偵測(在寫入新筆 *之前* 查上一筆狀態) + transitions = [] + try: + session = DatabaseManager().get_session() + try: + for rec in records: + last = session.execute( + _sa(""" + SELECT healthy, probed_at FROM host_health_probes + WHERE host_label = :label + ORDER BY probed_at DESC LIMIT 1 + """), + {'label': rec['host_label']}, + ).fetchone() + if last is None: + continue # 第一筆探針,無對比基準 + prev_healthy = bool(last[0]) + if prev_healthy != rec['healthy']: + # 同方向 1h 內 dedup + recent_transition = session.execute( + _sa(""" + SELECT 1 FROM host_health_probes p1 + WHERE p1.host_label = :label + AND p1.probed_at >= NOW() - INTERVAL '1 hour' + AND p1.healthy = :curr_healthy + AND EXISTS ( + SELECT 1 FROM host_health_probes p2 + WHERE p2.host_label = :label + AND p2.probed_at < p1.probed_at + AND p2.probed_at >= NOW() - INTERVAL '90 minutes' + AND p2.healthy != :curr_healthy + ) + LIMIT 1 + """), + {'label': rec['host_label'], 'curr_healthy': rec['healthy']}, + ).fetchone() + if recent_transition is None: + transitions.append({ + 'host_label': rec['host_label'], + 'host_url': rec['host_url'], + 'prev_healthy': prev_healthy, + 'curr_healthy': rec['healthy'], + 'error_msg': rec.get('error_msg'), + 'response_ms': rec.get('response_ms'), + }) + finally: + session.close() + except Exception as e: + logger.warning(f"[HostHealthProbe] transition detect failed: {e}") + # 批次寫 DB session = DatabaseManager().get_session() try: @@ -337,6 +385,13 @@ def run_host_health_probe(): finally: session.close() + # Phase 43: 推 Telegram 告警 / 恢復通知 + for tr in transitions: + try: + _push_host_transition_alert(tr) + except Exception as e: + logger.error(f"[HostHealthProbe] push alert failed for {tr['host_label']}: {e}") + unhealthy = 
[r['host_label'] for r in records if not r['healthy']]
         if unhealthy:
             logger.warning(f"[HostHealthProbe] unhealthy hosts: {unhealthy}")
@@ -346,6 +401,77 @@ def run_host_health_probe():
         logger.error(f"[HostHealthProbe] failed: {e}", exc_info=True)
 
 
+def _push_host_transition_alert(tr):
+    """Push a Telegram notification for one Ollama host state transition.
+
+    healthy -> unhealthy: sends an alert with an inline AutoHeal button and
+    a 24h health-stats button.
+    unhealthy -> healthy: sends a plain "recovered" message, no buttons.
+
+    Args:
+        tr: Transition dict. Reads ``host_label``, ``host_url``,
+            ``curr_healthy``, ``response_ms`` (recovery only) and the
+            optional ``error_msg`` (failure only).
+
+    Returns:
+        None. If the Telegram helper cannot be imported, a warning is
+        logged and nothing is sent.
+    """
+    import html
+
+    try:
+        from services.telegram_templates import send_telegram_with_result
+    except Exception as e:
+        # Notification is best-effort, but leave a trace instead of
+        # dropping the alert silently.
+        logger.warning(f"[HostHealthProbe] telegram helper unavailable: {e}")
+        return
+
+    label = tr['host_label']
+    short_label = (
+        'GCP-A' if 'Primary' in label else
+        'GCP-B' if 'Secondary' in label else
+        '111'
+    )
+
+    # Messages are sent with parse_mode='HTML'; Telegram requires "&", "<"
+    # and ">" in plain text to be escaped, and probe error text / labels
+    # are not guaranteed to be free of them.
+    safe_label = html.escape(str(label))
+    safe_url = html.escape(str(tr['host_url']))
+
+    if tr['curr_healthy']:
+        # Recovery notice (no buttons).
+        text = (
+            f"✅ Ollama 主機已恢復\n\n"
+            f"主機:{safe_label}\n"
+            f"網址:{safe_url}\n"
+            f"回應:{tr['response_ms']} ms\n\n"
+            f"scheduler 每 15 分鐘自動探針偵測"
+        )
+        send_telegram_with_result(text, parse_mode='HTML')
+    else:
+        # Failure alert + inline AutoHeal button.
+        # Truncate first, then escape, so an entity is never cut in half.
+        raw_err = str(tr.get('error_msg') or '無錯誤訊息')
+        err_short = html.escape(raw_err[:200])
+        text = (
+            f"🚨 Ollama 主機異常\n\n"
+            f"主機:{safe_label}\n"
+            f"網址:{safe_url}\n"
+            f"錯誤:{err_short}\n\n"
+            f"💡 點下方按鈕一鍵觸發 ADR-013 AutoHeal Playbook,"
+            f"或至 觀測台 詳查。"
+        )
+        reply_markup = {
+            "inline_keyboard": [
+                [{"text": f"🩹 立即 AutoHeal {short_label}", "callback_data": f"cmd:obs_heal:{short_label}"}],
+                [{"text": "📊 查 24h 健康統計", "callback_data": "cmd:obs_health"}],
+            ],
+        }
+        send_telegram_with_result(text, reply_markup=reply_markup, parse_mode='HTML')
+
+
 def run_host_health_probe_cleanup():
     """Phase 42 — 每日 03:00 清 host_health_probes 30 天前資料。"""
     try: