#!/usr/bin/env python3
"""
run_scheduler.py — momo-scheduler 容器入口點
排程任務清單(對齊 app.py init_scheduler + scheduler.py 全任務):
每 30 分鐘:auto_import、whitepage_check
每 1 小時:momo、edm、festival
每 4 小時:competitor_price_feeder、icaim_analysis
每 6 小時:quality_rescore
每 12 小時:dedup_batch
每 1 天 :db_backup(03:00)、cleanup_agent_context(03:30)、backup_monitor(04:00)、daily_report(09:00)、ai_smoke_summary(09:10)、pchome_match_backfill(10:30)、openclaw_meta_analysis(12:00, Phase 4 降頻)、daily_token_report(23:55)
每 1 週 :weekly_strategy(週一 06:00)
每 1 月 :monthly_report(每月1日 07:00)
"""
import asyncio
import logging
import threading
import time
import schedule
# 匯入全部排程任務函式
from scheduler import (
run_momo_task,
run_edm_task,
run_festival_task,
run_promo_event_task,
run_auto_import_task,
run_whitepage_check,
run_competitor_price_feeder_task,
run_pchome_match_backfill_task,
run_icaim_analysis_task,
run_weekly_strategy_task,
run_db_backup_task,
run_backup_monitor_task,
run_openclaw_meta_analysis_task,
run_dedup_batch_task,
run_quality_rescore_task,
run_daily_report_task,
run_ai_smoke_daily_summary_task,
run_monthly_report_task,
)
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)
_AI_CALLS_ERROR_SPIKE_LAST_PUSH_TS = 0.0
def _notify_scheduler_failure(
task_name: str,
error: Exception,
*,
source: str,
event_type: str,
priority: str = "P2",
title: str = None,
dedup_ttl_sec: int = 3600,
) -> None:
"""Best-effort EventRouter failure notification for scheduler wrapper tasks."""
try:
import traceback
from services.event_router import notify_failure
notify_failure(
task_name=task_name,
error=error,
source=source,
event_type=event_type,
priority=priority,
title=title or f"{task_name} 失敗",
trace=traceback.format_exc(),
dedup_ttl_sec=dedup_ttl_sec,
)
except Exception as router_error:
logger.error("[%s] event_router notify failed: %s", task_name, router_error)
def _register_schedules():
schedule.every(30).minutes.do(run_auto_import_task)
logger.info("📅 每 30 分鐘:auto_import")
schedule.every(30).minutes.do(run_whitepage_check)
logger.info("📅 每 30 分鐘:whitepage_check")
schedule.every(1).hours.do(run_momo_task)
logger.info("📅 每 1 小時:momo_task")
schedule.every(1).hours.do(run_edm_task)
logger.info("📅 每 1 小時:edm_task")
schedule.every(1).hours.do(run_festival_task)
logger.info("📅 每 1 小時:festival_task")
# 動態註冊促銷活動爬蟲(根據配置)
from services.crawler_config_loader import get_enabled_crawlers
enabled_crawlers = get_enabled_crawlers()
promo_event_configs = {
'mothers_day_2026': {'lpn': '', 'page_type': 'mothers_day', 'name': '母親節超值限時購'},
'valentine_520_2026': {'lpn': '', 'page_type': 'valentine_520', 'name': '520情人節限定購物'},
'labor_day_2026': {'lpn': '', 'page_type': 'labor_day', 'name': '勞動節購物優惠'}
}
for crawler_key, config in enabled_crawlers.items():
if crawler_key in promo_event_configs:
event_config = promo_event_configs[crawler_key]
lpn_code = config.get('lpn_code', '')
if lpn_code:
schedule_hours = config.get('schedule_hours', 4)
schedule.every(schedule_hours).hours.do(
lambda lpn=lpn_code, pt=event_config['page_type'], an=event_config['name']:
run_promo_event_task(lpn, pt, an)
)
logger.info(f"📅 每 {schedule_hours} 小時:{event_config['name']} ({event_config['page_type']})")
else:
logger.warning(f"⚠️ {event_config['name']} 未配置 LPN 代碼,跳過排程")
schedule.every(4).hours.do(run_competitor_price_feeder_task)
logger.info("📅 每 4 小時:competitor_price_feeder")
schedule.every(4).hours.do(run_icaim_analysis_task)
logger.info("📅 每 4 小時:icaim_analysis")
# Operation Ollama-First v5.0 Phase 4:Meta 自審降頻 6h → 每日 12:00(月省 ~1.875M Gemini tokens)
# icaim_analysis 內原本 line 2233/2253 的額外觸發已同步移除(避免重複呼叫)
schedule.every().day.at("12:00").do(run_openclaw_meta_analysis_task)
logger.info("📅 每日 12:00:openclaw_meta_analysis(Phase 4 降頻:原 6h)")
schedule.every(6).hours.do(run_quality_rescore_task)
logger.info("📅 每 6 小時:quality_rescore")
schedule.every(12).hours.do(run_dedup_batch_task)
logger.info("📅 每 12 小時:dedup_batch")
# Operation Ollama-First v5.0 Phase 11+ — RAG 學習迴圈 worker(Phase 12 收尾)
# 預設 RAG_ENABLED=false 時,learning_episodes 不會有資料,worker 跑空 loop(無害)
schedule.every(5).minutes.do(run_promotion_gate_worker)
logger.info("📅 每 5 分鐘:promotion_gate_worker(pending → promote/reject/await)")
schedule.every(30).minutes.do(run_awaiting_review_push)
logger.info("📅 每 30 分鐘:awaiting_review_push(推 Telegram 等 👍/👎)")
schedule.every(4).hours.do(run_expire_stale_reviews)
logger.info("📅 每 4 小時:expire_stale_reviews(24h 無回應降權 0.5)")
# Phase 11.0 護欄 #3:BGE-M3 跨主機一致性驗證(ADR-033)
# 每週一次足夠(驗證模型版本未漂移;不需每次啟動)
schedule.every().sunday.at("04:30").do(run_embed_consistency_check)
logger.info("📅 每週日 04:30:bge-m3 跨主機一致性驗證")
# Phase 42: 三主機 Ollama 健康探針(即使無人開觀測台頁面也持續累積歷史)
schedule.every(15).minutes.do(run_host_health_probe)
logger.info("📅 每 15 分鐘:host_health_probe(三主機 → host_health_probes 表)")
schedule.every().day.at("03:00").do(run_host_health_probe_cleanup)
logger.info("📅 每日 03:00:host_health_probe_cleanup(清 30d 前舊資料)")
# Phase 44: AI 呼叫錯誤率突增告警
schedule.every(30).minutes.do(run_ai_calls_error_spike_check)
logger.info("📅 每 30 分鐘:ai_calls_error_spike_check(錯誤率 ≥ 30% 推 Telegram)")
# Phase 44: 觀測台每日 09:30 健康摘要推送
schedule.every().day.at("09:30").do(run_observability_daily_summary)
logger.info("📅 每日 09:30:observability_daily_summary(早晨報三主機/AI/Cost/PPT)")
# Phase 20: 成本自動節流(COST_THROTTLE_ENABLED 預設 OFF)
schedule.every(1).hours.do(run_cost_throttle_evaluate)
logger.info("📅 每 1 小時:cost_throttle_evaluate")
schedule.every().day.at("00:05").do(run_cost_throttle_reset_if_new_month)
logger.info("📅 每日 00:05:cost_throttle_reset_if_new_month")
# Phase 24: ROI 月報(每日 09:05 跑,內部判斷是否月初第 1 日;避開 09:00 daily_report)
schedule.every().day.at("09:05").do(run_roi_monthly_report_if_new_month)
logger.info("📅 每日 09:05:roi_monthly_report(月初第 1 日才送)")
# Phase 26: PPT 視覺審核(每日 22:00 掃當天新生 .pptx,有 issues 才推 Telegram)
schedule.every().day.at("22:00").do(run_ppt_vision_audit)
logger.info("📅 每日 22:00:ppt_vision_audit(PPT_VISION_ENABLED=true 才生效)")
schedule.every().day.at("03:00").do(run_db_backup_task)
logger.info("📅 每日 03:00:db_backup")
schedule.every().day.at("03:30").do(run_cleanup_agent_context)
logger.info("📅 每日 03:30:cleanup_agent_context")
schedule.every().day.at("04:00").do(run_backup_monitor_task)
logger.info("📅 每日 04:00:backup_monitor")
schedule.every().monday.at("06:00").do(run_weekly_strategy_task)
logger.info("📅 每週一 06:00:weekly_strategy")
schedule.every().day.at("09:00").do(run_daily_report_task)
logger.info("📅 每日 09:00:daily_report")
schedule.every().day.at("09:10").do(run_ai_smoke_daily_summary_task)
logger.info("📅 每日 09:10:ai_smoke_daily_summary")
schedule.every().day.at("10:30").do(run_pchome_match_backfill_task)
logger.info("📅 每日 10:30:pchome_match_backfill")
# Operation Ollama-First v5.0 — Phase 1 收尾:每日 23:55 LLM Token 日報
schedule.every().day.at("23:55").do(run_daily_token_report_task)
logger.info("📅 每日 23:55:daily_token_report")
# 每月1日 07:00 月報(schedule 不支援 every().month,用每日 07:00 + 日期判斷)
def _monthly_report_gate():
from datetime import datetime as _dt
if _dt.now().day == 1:
run_monthly_report_task()
schedule.every().day.at("07:00").do(_monthly_report_gate)
logger.info("📅 每月1日 07:00:monthly_report")
def run_daily_token_report_task():
"""每日 23:55 — Operation Ollama-First v5.0 Phase 1 收尾:LLM Token 日報。
任務:
1. 查 ai_calls 過去 24h 統計(總覽 / 供應商 / TOP caller / 成本 / 趨勢 / 告警)
2. 推 Telegram + 寫 ai_insights(type='daily_token_report')
紀律:
- 失敗安全:DB 查不到資料 → 推「⚠️ 報表生成失敗」訊息但不爆 scheduler
- 不影響其他排程:例外完全吞掉,僅 log error
"""
try:
from services.token_report_service import send_daily_report
result = send_daily_report()
logger.info(
"[TokenReport] sent=%s failed=%s chars=%s ok=%s",
result.get('sent'), result.get('failed'),
result.get('chars'), result.get('ok'),
)
except Exception as e:
logger.error(f"[TokenReport] task failed: {e}", exc_info=True)
_notify_scheduler_failure(
"run_daily_token_report_task",
e,
source="Scheduler.TokenReport",
event_type="daily_token_report_failure",
title="LLM Token 日報失敗",
)
# ─────────────────────────────────────────────────────────────────────────────
# Operation Ollama-First v5.0 Phase 11+ — RAG 學習迴圈 worker(Phase 12 收尾)
# ─────────────────────────────────────────────────────────────────────────────
def run_promotion_gate_worker():
"""每 5 分鐘 — 批次處理 learning_episodes pending → can_promote → promote/reject/await。
依 ADR-032 PromotionGate 4 階段,不主動跑 LLM(Distiller 純規則引擎)。
RAG_ENABLED=false 時 learning_episodes 為空,worker 跑空 loop(無害)。
"""
try:
from services.learning_pipeline import process_pending_episodes
stats = process_pending_episodes()
if stats.get('pending_seen', 0) > 0:
logger.info(
"[PromotionWorker] pending=%d promoted=%d rejected=%d awaiting=%d errors=%d",
stats['pending_seen'], stats['promoted'], stats['rejected'],
stats['awaiting'], stats['errors'],
)
except Exception as e:
logger.error(f"[PromotionWorker] task failed: {e}", exc_info=True)
_notify_scheduler_failure(
"run_promotion_gate_worker",
e,
source="Scheduler.PromotionGate",
event_type="promotion_gate_worker_failure",
title="PromotionGate worker 失敗",
)
def run_awaiting_review_push():
"""每 30 分鐘 — 推 awaiting_review episode 到 Telegram 等 👍/👎。
限制:TELEGRAM_ADMIN_CHAT_ID 未設則跳過(fail-safe)。
"""
try:
from services.learning_pipeline import push_awaiting_reviews_to_telegram
pushed = push_awaiting_reviews_to_telegram()
if pushed > 0:
logger.info("[AwaitingReviewPush] pushed=%d episodes", pushed)
except Exception as e:
logger.error(f"[AwaitingReviewPush] task failed: {e}", exc_info=True)
_notify_scheduler_failure(
"run_awaiting_review_push",
e,
source="Scheduler.PromotionGate",
event_type="awaiting_review_push_failure",
title="PromotionGate 審查推播失敗",
)
def run_expire_stale_reviews():
"""每 4 小時 — 24h 無回應 awaiting_review → expired(weight=0.5)。
依 ADR-033 護欄 #1 Stage 4 規則。
"""
try:
from services.learning_pipeline import expire_stale_reviews
n = expire_stale_reviews()
if n > 0:
logger.info("[ExpireStale] expired %d awaiting_review episodes (24h timeout)", n)
except Exception as e:
logger.error(f"[ExpireStale] task failed: {e}", exc_info=True)
_notify_scheduler_failure(
"run_expire_stale_reviews",
e,
source="Scheduler.PromotionGate",
event_type="expire_stale_reviews_failure",
title="PromotionGate 過期審查清理失敗",
)
def run_cost_throttle_evaluate():
"""每小時 — Phase 20 成本自動節流評估(COST_THROTTLE_ENABLED 預設 OFF)。
跑 evaluate_throttle_status;若有 provider 月底推估超預算 110% → 標 throttled。
狀態變化(throttle/unthrottle)會自動推 Telegram。
"""
try:
from services.cost_throttle_service import (
evaluate_throttle_status, is_cost_throttle_enabled,
)
if not is_cost_throttle_enabled():
return # flag OFF 直接 skip(不打 DB)
state = evaluate_throttle_status()
throttled = [p for p, info in state.items() if info.get('throttled')]
if throttled:
logger.warning("[CostThrottle] currently throttled: %s", throttled)
else:
logger.debug("[CostThrottle] no provider throttled")
except Exception as e:
logger.error(f"[CostThrottle] task failed: {e}", exc_info=True)
_notify_scheduler_failure(
"run_cost_throttle_evaluate",
e,
source="Scheduler.CostThrottle",
event_type="cost_throttle_evaluate_failure",
title="成本節流評估失敗",
)
def run_host_health_probe():
"""Phase 42 — 每 15 分鐘自動 probe 三主機 Ollama,寫入 host_health_probes。
Phase 43 增強:偵測 state transition (healthy→unhealthy / unhealthy→healthy) →
主動推 Telegram 告警 + inline AutoHeal 按鈕,完整「監控→告警→修復」閉環。
Dedup: 1 小時內同 host 同方向 transition 只推一次(防 flapping 洗版)。
"""
try:
import time as _time
import requests as _r
from sqlalchemy import text as _sa
from database.manager import DatabaseManager
from services.ollama_service import (
OLLAMA_HOST_PRIMARY, OLLAMA_HOST_SECONDARY, OLLAMA_HOST_FALLBACK,
_is_unhealthy,
)
records = []
for label, host in [
('Primary (GCP)', OLLAMA_HOST_PRIMARY),
('Secondary (GCP)', OLLAMA_HOST_SECONDARY),
('Fallback (111)', OLLAMA_HOST_FALLBACK),
]:
t0 = _time.monotonic()
healthy = False
err = None
models_count = 0
try:
resp = _r.get(f"{host.rstrip('/')}/api/tags", timeout=3)
if resp.status_code == 200:
healthy = True
models_count = len(resp.json().get('models', []) or [])
else:
err = f"HTTP {resp.status_code}"
except Exception as e:
err = f"{type(e).__name__}: {str(e)[:200]}"
records.append({
'host_label': label, 'host_url': host, 'healthy': healthy,
'unhealthy_mark': _is_unhealthy(host),
'models_count': models_count,
'response_ms': int((_time.monotonic() - t0) * 1000),
'error_msg': err,
})
# Phase 43: state transition 偵測(在寫入新筆 *之前* 查上一筆狀態)
transitions = []
try:
session = DatabaseManager().get_session()
try:
for rec in records:
last = session.execute(
_sa("""
SELECT healthy, probed_at FROM host_health_probes
WHERE host_label = :label
ORDER BY probed_at DESC LIMIT 1
"""),
{'label': rec['host_label']},
).fetchone()
if last is None:
continue # 第一筆探針,無對比基準
prev_healthy = bool(last[0])
if prev_healthy != rec['healthy']:
# 同方向 1h 內 dedup
recent_transition = session.execute(
_sa("""
SELECT 1 FROM host_health_probes p1
WHERE p1.host_label = :label
AND p1.probed_at >= NOW() - INTERVAL '1 hour'
AND p1.healthy = :curr_healthy
AND EXISTS (
SELECT 1 FROM host_health_probes p2
WHERE p2.host_label = :label
AND p2.probed_at < p1.probed_at
AND p2.probed_at >= NOW() - INTERVAL '90 minutes'
AND p2.healthy != :curr_healthy
)
LIMIT 1
"""),
{'label': rec['host_label'], 'curr_healthy': rec['healthy']},
).fetchone()
if recent_transition is None:
transitions.append({
'host_label': rec['host_label'],
'host_url': rec['host_url'],
'prev_healthy': prev_healthy,
'curr_healthy': rec['healthy'],
'error_msg': rec.get('error_msg'),
'response_ms': rec.get('response_ms'),
})
finally:
session.close()
except Exception as e:
logger.warning(f"[HostHealthProbe] transition detect failed: {e}")
# 批次寫 DB
session = DatabaseManager().get_session()
try:
for rec in records:
session.execute(
_sa("""
INSERT INTO host_health_probes
(host_label, host_url, healthy, unhealthy_mark,
models_count, response_ms, error_msg)
VALUES
(:host_label, :host_url, :healthy, :unhealthy_mark,
:models_count, :response_ms, :error_msg)
"""),
rec,
)
session.commit()
finally:
session.close()
# Phase 43: 推 Telegram 告警 / 恢復通知
for tr in transitions:
try:
_push_host_transition_alert(tr)
except Exception as e:
logger.error(f"[HostHealthProbe] push alert failed for {tr['host_label']}: {e}")
unhealthy = [r['host_label'] for r in records if not r['healthy']]
if unhealthy:
logger.warning(f"[HostHealthProbe] unhealthy hosts: {unhealthy}")
else:
logger.debug("[HostHealthProbe] all 3 hosts healthy")
except Exception as e:
logger.error(f"[HostHealthProbe] failed: {e}", exc_info=True)
def _push_host_transition_alert(tr):
"""Phase 43: 主機 state transition → 推 Telegram。
healthy → unhealthy:P1 告警 + inline 「🩹 修 {label}」按鈕
unhealthy → healthy:P3 簡訊「已恢復」(不附按鈕)
"""
try:
from services.telegram_templates import send_telegram_with_result
except Exception:
return
label = tr['host_label']
short_label = (
'GCP-A' if 'Primary' in label else
'GCP-B' if 'Secondary' in label else
'111'
)
if tr['curr_healthy']:
# 恢復通知(無按鈕)
text = (
f"✅ Ollama 主機已恢復\n\n"
f"主機:{label}\n"
f"網址:{tr['host_url']}\n"
f"回應:{tr['response_ms']} ms\n\n"
f"scheduler 每 15 分鐘自動探針偵測"
)
send_telegram_with_result(text, parse_mode='HTML')
else:
# 故障告警 + inline AutoHeal 按鈕
err_short = (tr.get('error_msg') or '無錯誤訊息')[:200]
text = (
f"🚨 Ollama 主機異常\n\n"
f"主機:{label}\n"
f"網址:{tr['host_url']}\n"
f"錯誤:{err_short}\n\n"
f"💡 點下方按鈕一鍵觸發 ADR-013 AutoHeal Playbook,"
f"或至 觀測台 詳查。"
)
reply_markup = {
"inline_keyboard": [
[{"text": f"🩹 立即 AutoHeal {short_label}", "callback_data": f"cmd:obs_heal:{short_label}"}],
[{"text": "📊 查 24h 健康統計", "callback_data": "cmd:obs_health"}],
],
}
send_telegram_with_result(text, reply_markup=reply_markup, parse_mode='HTML')
def run_ai_calls_error_spike_check():
"""Phase 44 — 每 30 分鐘檢查 AI 呼叫錯誤率,異常時主動告警。
觸發條件:過去 1 小時 ai_calls 錯誤率 ≥ 30% 且總呼叫 ≥ 20
Dedup:1 小時內同性質告警只推一次(看 push log)— 簡化用 60min 視窗 + 已告警 flag。
"""
try:
from sqlalchemy import text as _sa
from database.manager import DatabaseManager
session = DatabaseManager().get_session()
try:
row = session.execute(
_sa("""
SELECT COUNT(*) AS total,
COUNT(*) FILTER (WHERE status NOT IN ('ok','cache_only')) AS errors,
COUNT(DISTINCT caller) AS distinct_callers
FROM ai_calls
WHERE called_at >= NOW() - INTERVAL '1 hour'
"""),
).fetchone()
total = int(row[0] or 0)
errors = int(row[1] or 0)
if total < 20:
return # 樣本太少不告警
error_rate = errors / total
if error_rate < 0.30:
return # 低於閾值,正常
# 查 top 3 problematic caller
top_callers = session.execute(
_sa("""
SELECT caller, COUNT(*) AS calls,
COUNT(*) FILTER (WHERE status NOT IN ('ok','cache_only')) AS errs
FROM ai_calls
WHERE called_at >= NOW() - INTERVAL '1 hour'
GROUP BY caller
HAVING COUNT(*) FILTER (WHERE status NOT IN ('ok','cache_only')) >= 3
ORDER BY errs DESC
LIMIT 3
"""),
).fetchall()
finally:
session.close()
# 推 Telegram 告警 + inline Code Review 按鈕。
# 1 小時內同類 spike 只推一次,避免每 30 分鐘洗版。
global _AI_CALLS_ERROR_SPIKE_LAST_PUSH_TS
now_ts = time.time()
if now_ts - _AI_CALLS_ERROR_SPIKE_LAST_PUSH_TS < 3600:
logger.info("[AICallsErrorSpike] skip duplicate alert within 1h window")
return
from services.telegram_templates import send_telegram_with_result
lines = [
"🚨 AI 呼叫錯誤率異常", "",
f"過去 1 小時:{errors}/{total} 失敗({error_rate*100:.1f}%)",
"",
]
if top_callers:
lines.append("問題 caller Top 3:")
for c in top_callers:
lines.append(f"• {c[0]}:{c[2]}/{c[1]} 失敗")
lines.append("")
lines.append("💡 點按鈕觸發 Code Review Pipeline 自動審查最新 commit。")
reply_markup = {
"inline_keyboard": [
[{"text": "🔬 觸發 Code Review", "callback_data": "cmd:obs_trigger_review"}],
[{"text": "📊 查 24h AI 呼叫", "callback_data": "cmd:obs_ai_calls"}],
],
}
send_telegram_with_result('\n'.join(lines), reply_markup=reply_markup, parse_mode='HTML')
_AI_CALLS_ERROR_SPIKE_LAST_PUSH_TS = now_ts
logger.warning(
f"[AICallsErrorSpike] alert pushed: total={total} errors={errors} "
f"rate={error_rate*100:.1f}%"
)
except Exception as e:
logger.error(f"[AICallsErrorSpike] failed: {e}", exc_info=True)
def run_observability_daily_summary():
"""Phase 44 — 每日 09:30 推送觀測台健康摘要(早晨報)。
一頁式涵蓋:三主機 24h uptime / AI 呼叫量 / 當月 cost / 待審 episode 數 / PPT audit 通過率。
讓統帥早上滑手機就能看到全景,不用主動開觀測台。
"""
try:
from sqlalchemy import text as _sa
from database.manager import DatabaseManager
from datetime import datetime as _dt
session = DatabaseManager().get_session()
try:
# 三主機 24h uptime
host_rows = session.execute(
_sa("""
SELECT host_label,
COUNT(*) AS total,
COUNT(*) FILTER (WHERE healthy) AS up
FROM host_health_probes
WHERE probed_at >= NOW() - INTERVAL '24 hours'
GROUP BY host_label ORDER BY host_label
"""),
).fetchall()
# AI 呼叫 24h
ai_row = session.execute(
_sa("""
SELECT COUNT(*) AS total,
COALESCE(SUM(input_tokens + output_tokens), 0) AS tokens,
COALESCE(SUM(cost_usd), 0) AS cost,
COUNT(*) FILTER (WHERE status NOT IN ('ok','cache_only')) AS errors,
COUNT(*) FILTER (WHERE rag_hit) AS rag_hits
FROM ai_calls
WHERE called_at >= NOW() - INTERVAL '24 hours'
"""),
).fetchone()
# 當月 cost
today = _dt.now()
month_start = _dt(today.year, today.month, 1)
month_cost = session.execute(
_sa("SELECT COALESCE(SUM(cost_usd), 0) FROM ai_calls WHERE called_at >= :ms"),
{'ms': month_start},
).fetchone()[0] or 0
# 待審 episodes
ep_pending = session.execute(
_sa("SELECT COUNT(*) FROM learning_episodes WHERE promotion_status = 'awaiting_review' AND reviewed_at IS NULL"),
).fetchone()[0] or 0
# PPT audit 7d
ppt_row = session.execute(
_sa("""
SELECT COUNT(*) AS total,
COUNT(*) FILTER (WHERE audit_status = 'passed') AS passed,
COUNT(*) FILTER (WHERE audit_status = 'failed') AS failed
FROM ppt_audit_results
WHERE audited_at >= NOW() - INTERVAL '7 days'
"""),
).fetchone()
# Phase 49: 商業面未跟進機會(high-confidence 卻無 action_plan)
unfollowed_count = 0
try:
unfollowed_count = session.execute(
_sa("""
SELECT COUNT(*) FROM ai_price_recommendations r
WHERE r.created_at >= NOW() - INTERVAL '7 days'
AND r.confidence >= 0.7
AND NOT EXISTS (
SELECT 1 FROM action_plans p
WHERE p.sku = r.sku
AND p.created_at >= r.created_at
AND p.created_at < r.created_at + INTERVAL '7 days'
)
"""),
).fetchone()[0] or 0
except Exception:
pass
finally:
session.close()
ai_total = int(ai_row[0] or 0)
ai_errors = int(ai_row[3] or 0)
ai_rag = int(ai_row[4] or 0)
ai_cost = float(ai_row[2] or 0)
ai_tokens = int(ai_row[1] or 0)
ppt_total = int(ppt_row[0] or 0)
ppt_passed = int(ppt_row[1] or 0)
ppt_failed = int(ppt_row[2] or 0)
lines = [
f"🛰 觀測台日報 — {today.strftime('%Y-%m-%d')}",
"",
"三主機 24h 在線率:",
]
if host_rows:
for label, total, up in host_rows:
pct = (float(up) / float(total) * 100) if total else 0
emoji = "✅" if pct >= 99 else "⚠️" if pct >= 90 else "🚨"
lines.append(f"{emoji} {label}:{pct:.1f}%({up}/{total})")
else:
lines.append("(無資料 — scheduler probe 尚未啟動?)")
lines.append("")
lines.append("AI 呼叫 24h:")
err_rate = (ai_errors / ai_total * 100) if ai_total else 0
rag_rate = (ai_rag / ai_total * 100) if ai_total else 0
err_emoji = "✅" if err_rate < 5 else "⚠️" if err_rate < 15 else "🚨"
lines.append(f"• 呼叫:{ai_total:,} 次 · Token:{ai_tokens:,}")
lines.append(f"• 24h 成本:${ai_cost:.2f} · 當月累計:${month_cost:.2f}")
lines.append(f"{err_emoji} 錯誤率:{err_rate:.1f}%({ai_errors})")
lines.append(f"💡 RAG 命中率:{rag_rate:.1f}%({ai_rag})")
if ep_pending:
lines.append("")
lines.append(f"📋 待審 episodes:{ep_pending} 筆")
if ppt_total:
lines.append("")
ppt_pass_rate = (ppt_passed / ppt_total * 100) if ppt_total else 0
ppt_emoji = "✅" if ppt_pass_rate >= 80 else "⚠️"
lines.append(f"{ppt_emoji} PPT 視覺審核 7d:{ppt_passed}/{ppt_total} 通過({ppt_pass_rate:.0f}%)")
if ppt_failed:
lines.append(f" 失敗:{ppt_failed} 筆")
if unfollowed_count > 0:
lines.append("")
lines.append(f"⚠️ 商業面未跟進:{unfollowed_count} 筆"
f"(high-confidence AI 建議未轉化為 action_plan)")
lines.append("")
lines.append('→ 開觀測台總覽')
from services.telegram_templates import send_telegram_with_result
reply_markup = {
"inline_keyboard": [
[{"text": "🛰 觀測台總覽", "callback_data": "cmd:obs_overview"},
{"text": "🌐 Agent 編排", "callback_data": "cmd:obs_orchestration"}],
[{"text": "💼 商業面 AI", "callback_data": "cmd:obs_business"},
{"text": "🏥 主機健康", "callback_data": "cmd:obs_health"}],
[{"text": "📊 AI 呼叫", "callback_data": "cmd:obs_ai_calls"},
{"text": "💰 預算", "callback_data": "cmd:obs_budget"}],
],
}
send_telegram_with_result('\n'.join(lines), reply_markup=reply_markup, parse_mode='HTML')
logger.info("[ObservabilityDaily] summary pushed to Telegram")
except Exception as e:
logger.error(f"[ObservabilityDaily] failed: {e}", exc_info=True)
def run_host_health_probe_cleanup():
"""Phase 42 — 每日 03:00 清 host_health_probes 30 天前資料。"""
try:
from sqlalchemy import text as _sa
from database.manager import DatabaseManager
session = DatabaseManager().get_session()
try:
result = session.execute(
_sa("DELETE FROM host_health_probes WHERE probed_at < NOW() - INTERVAL '30 days'"),
)
session.commit()
logger.info(f"[HostHealthProbe] cleanup: {result.rowcount} rows deleted (>30d)")
finally:
session.close()
except Exception as e:
logger.error(f"[HostHealthProbe] cleanup failed: {e}", exc_info=True)
def run_cost_throttle_reset_if_new_month():
"""每日 00:05 — 若當天是月份第 1 天,清 throttle state(跨月重置)。"""
try:
from datetime import datetime
from services.cost_throttle_service import reset_throttle_state
if datetime.now().day == 1:
reset_throttle_state()
logger.info("[CostThrottle] new month detected, state reset")
except Exception as e:
logger.error(f"[CostThrottle] reset failed: {e}", exc_info=True)
_notify_scheduler_failure(
"run_cost_throttle_reset_if_new_month",
e,
source="Scheduler.CostThrottle",
event_type="cost_throttle_reset_failure",
title="成本節流跨月重置失敗",
)
def run_ppt_vision_audit():
"""每日 22:00 — Phase 26 PPT 視覺審核
掃 reports/ 目錄當天新生 .pptx,跑 minicpm-v 視覺檢查,有 issues 推 Telegram。
PPT_VISION_ENABLED=false 時 audit_recent_ppts 內部直接 skip(不打 LLM)。
需 LibreOffice headless 在 PATH(轉 .pptx → png);不在則 fail-safe skip。
"""
try:
from services.ppt_vision_service import audit_recent_ppts, push_ppt_audit_to_telegram
summary = audit_recent_ppts(reports_dir='reports', hours=24, max_files=10)
if summary['total_issues'] > 0:
pushed = push_ppt_audit_to_telegram(summary)
logger.info(
"[PPTVisionAudit] %d files, %d issues, telegram=%s",
len(summary['audited_files']), summary['total_issues'], pushed,
)
else:
logger.debug("[PPTVisionAudit] no issues found")
except Exception as e:
logger.error(f"[PPTVisionAudit] task failed: {e}", exc_info=True)
_notify_scheduler_failure(
"run_ppt_vision_audit",
e,
source="Scheduler.PPTVision",
event_type="ppt_vision_audit_failure",
title="PPT 視覺審核失敗",
)
def run_roi_monthly_report_if_new_month():
"""每日 09:00 — Phase 24 ROI 月報(內部判斷月初第 1 日才送)
對比上月 ai_calls 統計 vs 戰前 baseline,推 Telegram「節省 X tokens / $Y」。
寫入 ai_insights (type='roi_monthly_report') 作長期記錄。
"""
try:
from datetime import datetime
if datetime.now().day != 1:
return # 非月初第 1 日 skip
from services.roi_report_service import generate_and_send_roi_report
result = generate_and_send_roi_report()
logger.info("[ROIReport] sent=%s period=%s",
result.get('sent'), result.get('period', '?'))
except Exception as e:
logger.error(f"[ROIReport] task failed: {e}", exc_info=True)
_notify_scheduler_failure(
"run_roi_monthly_report_if_new_month",
e,
source="Scheduler.ROIReport",
event_type="roi_monthly_report_failure",
title="ROI 月報失敗",
)
def run_embed_consistency_check():
"""每週日 04:30 — BGE-M3 跨主機一致性驗證(ADR-033 護欄 #3)。
跑 verify_embedding_consistency,不一致時 logger.error;ok 時 logger.info。
每週一次足夠(驗證模型版本未漂移;過頻會打三主機 Ollama 浪費)。
"""
try:
from services.rag_service import verify_embedding_consistency
result = verify_embedding_consistency()
logger.info(
"[EmbedConsistency] ok=%s reachable=%s max_diff=%.2e signature=%s",
result['ok'], result['reachable'],
result['max_diff'], result['signature'],
)
if not result['ok']:
logger.error(
"[EmbedConsistency] ⚠️ INCONSISTENT — RAG 召回率將下降;"
"檢查三主機 bge-m3 模型版本是否同步(ollama list)"
)
except Exception as e:
logger.error(f"[EmbedConsistency] task failed: {e}", exc_info=True)
_notify_scheduler_failure(
"run_embed_consistency_check",
e,
source="Scheduler.RAG",
event_type="embed_consistency_check_failure",
title="BGE-M3 一致性檢查失敗",
)
def run_cleanup_agent_context():
"""每日 03:30 — 清理 agent_context 表中已過期的 TTL 記錄(migration 018 定義)"""
from database.manager import get_session
from sqlalchemy import text
session = get_session()
try:
session.execute(text("SELECT cleanup_expired_agent_context()"))
session.commit()
logger.info("[Cleanup] agent_context TTL 清理完成")
except Exception as e:
logger.error(f"[Cleanup] agent_context 清理失敗: {e}")
try:
from services.event_router import notify_failure
notify_failure(
task_name="run_cleanup_agent_context",
error=e,
source="Scheduler.Cleanup",
event_type="agent_context_cleanup_failure",
priority="P2",
title="agent_context TTL 清理失敗",
dedup_ttl_sec=3600,
)
except Exception as _router_e:
logger.error(f"[Cleanup] event_router 失敗: {_router_e}")
finally:
session.close()
def _run_elephant_alpha_engine():
"""Daemon thread: ElephantAlpha 自主監控引擎(獨立 asyncio loop)"""
loop = None
try:
from services.elephant_alpha_autonomous_engine import autonomous_engine
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
logger.info("🐘 [ElephantAlpha] Autonomous engine thread started")
loop.run_until_complete(autonomous_engine.start_autonomous_monitoring())
except Exception as e:
logger.error(f"🐘 [ElephantAlpha] Engine crashed: {e}")
finally:
if loop is not None:
loop.close()
if __name__ == "__main__":
logger.info("🚀 momo-scheduler 啟動中...")
_register_schedules()
logger.info("✅ 全部排程任務已註冊")
_ea_thread = threading.Thread(
target=_run_elephant_alpha_engine,
daemon=True,
name="elephant-alpha-engine",
)
_ea_thread.start()
logger.info("🐘 [ElephantAlpha] Autonomous engine thread launched")
logger.info("⏰ 排程主迴圈啟動,等待任務觸發...")
_ea_watchdog_counter = 0 # 每 60 秒(60 次 sleep(1))做一次存活檢查
while True:
try:
schedule.run_pending()
time.sleep(1)
# 每 60 秒檢查 ElephantAlpha 執行緒是否還活著
_ea_watchdog_counter += 1
if _ea_watchdog_counter >= 60:
_ea_watchdog_counter = 0
if not _ea_thread.is_alive():
logger.error("[ElephantAlpha] 監控執行緒已死亡,嘗試重啟")
try:
from services.event_router import dispatch_sync as _dispatch_sync
_dispatch_sync({
"source": "Scheduler.ElephantAlpha",
"event_type": "thread_crashed",
"severity": "alert",
"title": "ElephantAlpha 執行緒死亡",
"status": "自動重啟中",
"impact": "P2 - 自主監控引擎暫停",
"summary": "ElephantAlphaEngine daemon thread 意外終止,排程主迴圈已偵測並觸發重啟",
})
except Exception as _alert_err:
logger.error(f"[ElephantAlpha] 無法發送告警: {_alert_err}")
_ea_thread = threading.Thread(
target=_run_elephant_alpha_engine,
daemon=True,
name="ElephantAlphaEngine",
)
_ea_thread.start()
logger.info("[ElephantAlpha] 執行緒已重啟")
except KeyboardInterrupt:
logger.info("⛔ Scheduler stopped.")
break
except Exception as e:
logger.error(f"Scheduler error: {e}")
time.sleep(5)