feat(p43): Ollama 主機 state transition 自動告警 + inline AutoHeal 閉環
Some checks are pending
CD Pipeline / deploy (push) Has started running
Some checks are pending
CD Pipeline / deploy (push) Has started running
問題:
Phase 42 加 scheduler 每 15min probe 寫入 host_health_probes,但只是
silent 累積 — 主機真的掛掉時統帥仍然要主動開觀測台才知道。
修補:
- run_scheduler.py::run_host_health_probe
寫入 DB 之前先查同 host 的最近一筆 probe 比對
state transition 偵測:
healthy → unhealthy:推 P1 告警 + inline AutoHeal 按鈕
unhealthy → healthy:推 P3 「已恢復」訊息
- run_scheduler.py::_push_host_transition_alert(新 helper)
使用 services.telegram_templates::send_telegram_with_result
inline keyboard 含「🩹 立即 AutoHeal {GCP-A|GCP-B|111}」按鈕
+ 「📊 查 24h 健康統計」次按鈕
按鈕 callback_data 對齊既有 Phase 41 cmd:obs_heal handler
- Dedup:1 小時內同 host 同方向 transition 只推一次(防 flapping 洗版)
用 host_health_probes 自身查歷史對比,無需新 dedup 表
完整閉環:
scheduler 每 15min probe → 偵測 state transition → 推 Telegram 告警
→ 統帥點 inline button → cmd:obs_heal:{label} → AutoHeal 跑 ADR-013
playbook → 寫入 incidents + heal_logs → 下一次 probe 偵測 unhealthy→
healthy → 推「已恢復」訊息
至此觀測台從「raw stats dashboard」進化為:
- 持續累積歷史(Phase 42)
- 主動告警 + 一鍵修復(Phase 43)
- 完整閉環(監控、告警、恢復通知皆自動;僅 AutoHeal 的觸發需統帥點擊 inline 按鈕確認)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
113
run_scheduler.py
113
run_scheduler.py
@@ -275,11 +275,9 @@ def run_cost_throttle_evaluate():
|
||||
def run_host_health_probe():
|
||||
"""Phase 42 — 每 15 分鐘自動 probe 三主機 Ollama,寫入 host_health_probes。
|
||||
|
||||
用途:即使無人開觀測台頁面也持續累積健康歷史,
|
||||
讓 mo.wooo.work/observability/host_health 的 24h uptime 統計
|
||||
與 Telegram cmd:obs_health 都有資料可用。
|
||||
|
||||
失敗安全:HTTP/DB 失敗不影響其他 scheduler job。
|
||||
Phase 43 增強:偵測 state transition (healthy→unhealthy / unhealthy→healthy) →
|
||||
主動推 Telegram 告警 + inline AutoHeal 按鈕,完整「監控→告警→修復」閉環。
|
||||
Dedup: 1 小時內同 host 同方向 transition 只推一次(防 flapping 洗版)。
|
||||
"""
|
||||
try:
|
||||
import time as _time
|
||||
@@ -318,6 +316,56 @@ def run_host_health_probe():
|
||||
'error_msg': err,
|
||||
})
|
||||
|
||||
# Phase 43: state transition 偵測(在寫入新筆 *之前* 查上一筆狀態)
|
||||
transitions = []
|
||||
try:
|
||||
session = DatabaseManager().get_session()
|
||||
try:
|
||||
for rec in records:
|
||||
last = session.execute(
|
||||
_sa("""
|
||||
SELECT healthy, probed_at FROM host_health_probes
|
||||
WHERE host_label = :label
|
||||
ORDER BY probed_at DESC LIMIT 1
|
||||
"""),
|
||||
{'label': rec['host_label']},
|
||||
).fetchone()
|
||||
if last is None:
|
||||
continue # 第一筆探針,無對比基準
|
||||
prev_healthy = bool(last[0])
|
||||
if prev_healthy != rec['healthy']:
|
||||
# 同方向 1h 內 dedup
|
||||
recent_transition = session.execute(
|
||||
_sa("""
|
||||
SELECT 1 FROM host_health_probes p1
|
||||
WHERE p1.host_label = :label
|
||||
AND p1.probed_at >= NOW() - INTERVAL '1 hour'
|
||||
AND p1.healthy = :curr_healthy
|
||||
AND EXISTS (
|
||||
SELECT 1 FROM host_health_probes p2
|
||||
WHERE p2.host_label = :label
|
||||
AND p2.probed_at < p1.probed_at
|
||||
AND p2.probed_at >= NOW() - INTERVAL '90 minutes'
|
||||
AND p2.healthy != :curr_healthy
|
||||
)
|
||||
LIMIT 1
|
||||
"""),
|
||||
{'label': rec['host_label'], 'curr_healthy': rec['healthy']},
|
||||
).fetchone()
|
||||
if recent_transition is None:
|
||||
transitions.append({
|
||||
'host_label': rec['host_label'],
|
||||
'host_url': rec['host_url'],
|
||||
'prev_healthy': prev_healthy,
|
||||
'curr_healthy': rec['healthy'],
|
||||
'error_msg': rec.get('error_msg'),
|
||||
'response_ms': rec.get('response_ms'),
|
||||
})
|
||||
finally:
|
||||
session.close()
|
||||
except Exception as e:
|
||||
logger.warning(f"[HostHealthProbe] transition detect failed: {e}")
|
||||
|
||||
# 批次寫 DB
|
||||
session = DatabaseManager().get_session()
|
||||
try:
|
||||
@@ -337,6 +385,13 @@ def run_host_health_probe():
|
||||
finally:
|
||||
session.close()
|
||||
|
||||
# Phase 43: 推 Telegram 告警 / 恢復通知
|
||||
for tr in transitions:
|
||||
try:
|
||||
_push_host_transition_alert(tr)
|
||||
except Exception as e:
|
||||
logger.error(f"[HostHealthProbe] push alert failed for {tr['host_label']}: {e}")
|
||||
|
||||
unhealthy = [r['host_label'] for r in records if not r['healthy']]
|
||||
if unhealthy:
|
||||
logger.warning(f"[HostHealthProbe] unhealthy hosts: {unhealthy}")
|
||||
@@ -346,6 +401,54 @@ def run_host_health_probe():
|
||||
logger.error(f"[HostHealthProbe] failed: {e}", exc_info=True)
|
||||
|
||||
|
||||
def _push_host_transition_alert(tr):
|
||||
"""Phase 43: 主機 state transition → 推 Telegram。
|
||||
|
||||
healthy → unhealthy:P1 告警 + inline 「🩹 修 {label}」按鈕
|
||||
unhealthy → healthy:P3 簡訊「已恢復」(不附按鈕)
|
||||
"""
|
||||
try:
|
||||
from services.telegram_templates import send_telegram_with_result
|
||||
except Exception:
|
||||
return
|
||||
|
||||
label = tr['host_label']
|
||||
short_label = (
|
||||
'GCP-A' if 'Primary' in label else
|
||||
'GCP-B' if 'Secondary' in label else
|
||||
'111'
|
||||
)
|
||||
|
||||
if tr['curr_healthy']:
|
||||
# 恢復通知(無按鈕)
|
||||
text = (
|
||||
f"<b>✅ Ollama 主機已恢復</b>\n\n"
|
||||
f"主機:<code>{label}</code>\n"
|
||||
f"網址:<code>{tr['host_url']}</code>\n"
|
||||
f"回應:{tr['response_ms']} ms\n\n"
|
||||
f"<i>scheduler 每 15 分鐘自動探針偵測</i>"
|
||||
)
|
||||
send_telegram_with_result(text, parse_mode='HTML')
|
||||
else:
|
||||
# 故障告警 + inline AutoHeal 按鈕
|
||||
err_short = (tr.get('error_msg') or '無錯誤訊息')[:200]
|
||||
text = (
|
||||
f"<b>🚨 Ollama 主機異常</b>\n\n"
|
||||
f"主機:<code>{label}</code>\n"
|
||||
f"網址:<code>{tr['host_url']}</code>\n"
|
||||
f"錯誤:<code>{err_short}</code>\n\n"
|
||||
f"💡 點下方按鈕一鍵觸發 ADR-013 AutoHeal Playbook,"
|
||||
f"或至 <a href=\"https://mo.wooo.work/observability/host_health\">觀測台</a> 詳查。"
|
||||
)
|
||||
reply_markup = {
|
||||
"inline_keyboard": [
|
||||
[{"text": f"🩹 立即 AutoHeal {short_label}", "callback_data": f"cmd:obs_heal:{short_label}"}],
|
||||
[{"text": "📊 查 24h 健康統計", "callback_data": "cmd:obs_health"}],
|
||||
],
|
||||
}
|
||||
send_telegram_with_result(text, reply_markup=reply_markup, parse_mode='HTML')
|
||||
|
||||
|
||||
def run_host_health_probe_cleanup():
|
||||
"""Phase 42 — 每日 03:00 清 host_health_probes 30 天前資料。"""
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user