fix(stability): 補強 scheduler 例外記錄
All checks were successful
CD Pipeline / deploy (push) Successful in 1m36s

This commit is contained in:
OoO
2026-04-30 10:28:37 +08:00
parent 18b0fa8af2
commit e73cd6e6a3
9 changed files with 86 additions and 34 deletions

View File

@@ -2,7 +2,7 @@
> 本文件定義專案開發的核心準則與不可違反的規範
> **建立日期**: 2026-01-12
> **當前版本**: V10.17 (Ollama embedding /api/embed 強化版)
> **當前版本**: V10.18 (Scheduler 例外記錄強化版)
> **最後更新**: 2026-04-30
---

View File

@@ -26,6 +26,7 @@
- ElephantAlpha NIM fallback 強化:預設改用 production 可呼叫的 `nvidia/llama-3.3-nemotron-super-49b-v1.5`;Ultra 253B 權限 404 時自動 fallback。
- DatabaseManager 連線池收斂:PostgreSQL 每 worker pool 調整為 `pool_size=2/max_overflow=3`,避免多 route 重複 new manager 時吃滿連線。
- Ollama embedding 強化:改為優先 `/api/embed`,舊節點才 fallback `/api/embeddings`,並新增 `EMBEDDING_TIMEOUT`。
- Scheduler 例外記錄強化:清除 `scheduler.py` 靜默 `except/pass`;資源清理、EDM 可選欄位、備份 insight/通知失敗全改為可診斷 log。
【下次待辦】
- 觀察 Prometheus scrape 後 `momo_ai_*` 是否在事件發生後產生時間序列。

4
app.py
View File

@@ -95,8 +95,8 @@ except Exception as e:
sys_log.error(f"無法檢測磁碟空間: {e}")
# 🚩 系統版本定義 (備份與顯示用)
# 🚩 2026-04-30 V10.17: Ollama embedding /api/embed hardening
SYSTEM_VERSION = "V10.17"
# 🚩 2026-04-30 V10.18: Scheduler exception logging hardening
SYSTEM_VERSION = "V10.18"
# ==========================================
# 🔒 SQL Injection 防護函數

View File

@@ -254,7 +254,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '')
# ==========================================
# 系統版本與路徑
# ==========================================
SYSTEM_VERSION = "V10.17"
SYSTEM_VERSION = "V10.18"
LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log')
public_url = PUBLIC_URL # 用於模板顯示

View File

@@ -2,7 +2,7 @@
> **最後更新**: 2026-04-30 (台北時間)
> **狀態**: 🟢 四 AI Agent 自動化閉環已落地 — EventRouter / AutoHeal / OpenClaw Memory / ElephantAlpha bridge / Prometheus metrics / Smoke Dashboard / Smoke Trend Management / Telegram Summary / Grafana provisioning / Prometheus scrape / CD Gunicorn 掛載具測試覆蓋
> **適用版本**: V10.17 Ollama embedding /api/embed 強化版
> **適用版本**: V10.18 Scheduler 例外記錄強化版
---

View File

@@ -26,6 +26,7 @@
- 2026-04-30 production `NVIDIA_API_KEY` 可列出 Ultra 253B 但呼叫 `nvidia/llama-3.1-nemotron-ultra-253b-v1` 會 404;ElephantAlpha 預設改用 `nvidia/llama-3.3-nemotron-super-49b-v1.5` 並加入 fallback models。
- 2026-04-30 `DatabaseManager()` 多 route 重複建立曾有吃滿 PostgreSQL clients 風險;已重用 engine/session 並將每 worker pool 收斂為 `pool_size=2/max_overflow=3`
- 2026-04-30 OpenClaw embedding worker 曾在舊 `/api/embeddings` 路徑遇到 Hermes timeout;Ollama client 已改為優先 `/api/embed`,舊節點才 fallback `/api/embeddings`
- 2026-04-30 `scheduler.py` 殘留靜默 `except/pass`;已改為 warning/debug log,備份 insight 與 Telegram 通知失敗不再靜默。
## 已落地範圍
@@ -64,6 +65,7 @@
- 2026-04-30 ElephantAlpha NIM fallback hardening:新增 `tests/test_elephant_service.py`
- 2026-04-30 DatabaseManager pool convergence:`tests/test_database_manager_cache.py` 覆蓋 pool size/overflow 與 engine reuse。
- 2026-04-30 Ollama embedding API migration:新增 `tests/test_ollama_embedding.py`
- 2026-04-30 Phase 3f cleanup contracts:`tests/test_phase3f_cleanup_contracts.py` 覆蓋 orphan services、env 範例、scheduler 靜默例外。
- 2026-04-29 L2 安全記憶批次:`24 passed`
- collect-only:`48 tests collected`
- `git diff --check` 已通過。

View File

@@ -39,6 +39,7 @@
- **ElephantAlpha NIM fallback 強化**: production 帳號呼叫 Ultra 253B 會 404;預設改用可呼叫的 Nemotron Super 49B v1.5,並加入 70B / 8B fallback。
- **DatabaseManager 連線池收斂**: PostgreSQL 每 worker pool 收斂為 `pool_size=2/max_overflow=3`,並以 cache 重用 engine/session。
- **Ollama embedding API 遷移**: embedding client 優先使用官方 `/api/embed`,舊節點才 fallback `/api/embeddings`,降低 deprecated endpoint 與 timeout 風險。
- **Scheduler 例外記錄強化**: 清除 `scheduler.py` 靜默 `except/pass`;Chrome 清理、EDM optional 欄位、備份 insight/Telegram 失敗均保留 log。
### 2026-04-28~29:Phase 3e 重構大戰 + daily_sales cache 隱形 bug 根除
- **app.py 縮減 -10.8%**: 7,386 → 6,590 行,11 commits 全綠零 502。

View File

@@ -151,8 +151,11 @@ def managed_scraper_resources(window_size='1920,5000', debug=False, timeout=45,
if driver:
try:
driver.quit()
except:
pass
except Exception as cleanup_error:
logging.warning(
f"[Scraper] [Resource] ⚠️ Chrome 初始化失敗後關閉 driver 也失敗 | Error: {cleanup_error}",
exc_info=True,
)
driver = None
if retry_count <= max_retries:
@@ -178,8 +181,11 @@ def managed_scraper_resources(window_size='1920,5000', debug=False, timeout=45,
# 先關閉所有視窗
try:
driver.close()
except:
pass
except Exception as close_error:
logging.debug(
f"[Scraper] [Resource] Chrome 視窗關閉失敗但繼續 quit | Error: {close_error}",
exc_info=True,
)
# 再退出 driver
driver.quit()
except Exception as quit_error:
@@ -188,14 +194,20 @@ def managed_scraper_resources(window_size='1920,5000', debug=False, timeout=45,
try:
import subprocess
subprocess.run(['pkill', '-f', 'chrome.*--headless'], timeout=5, capture_output=True)
except:
pass
except Exception as pkill_error:
logging.warning(
f"[Scraper] [Resource] ⚠️ Chrome 強制清理失敗 | Error: {pkill_error}",
exc_info=True,
)
if session:
try:
session.close()
except:
pass
except Exception as session_error:
logging.warning(
f"[Scraper] [Resource] ⚠️ DB session 關閉失敗 | Error: {session_error}",
exc_info=True,
)
def run_momo_task():
@@ -206,8 +218,11 @@ def run_momo_task():
if is_task_paused("run_momo_task"):
logging.info("[Crawler] [MOMO] ⏸️ 任務被 HITL 暫停中,本次跳過")
return
except Exception:
pass # agent_actions 未就緒時不阻塞排程
except Exception as pause_check_error:
logging.debug(
f"[Crawler] [MOMO] HITL 暫停檢查失敗但繼續排程 | Error: {pause_check_error}",
exc_info=True,
)
try:
# V-New: 每次執行任務時,動態從 JSON 檔案重新讀取分類
@@ -517,8 +532,11 @@ def run_edm_task(lpn_code="O1K5FBOqsvN"):
if "/" in txt and ":" in txt: # 增強判斷:需包含日期斜線與時間冒號
activity_time_text = txt
break
except:
pass
except Exception as activity_time_error:
logging.debug(
f"[Crawler] [EDM] 活動時間文字解析失敗但繼續 | Error: {activity_time_error}",
exc_info=True,
)
if not activity_time_text:
activity_time_text = activity_name
logging.info(f"[Crawler] [EDM] ⏰ 抓取到的全站活動時間: {activity_time_text}")
@@ -553,8 +571,11 @@ def run_edm_task(lpn_code="O1K5FBOqsvN"):
# 嘗試抓取 .dateTime若無則嘗試找包含 "開搶" 的元素
dt_el = parent.find_element(By.CSS_SELECTOR, ".dateTime")
session_time_text = dt_el.text.strip()
except:
pass
except Exception as session_time_error:
logging.debug(
f"[Crawler] [EDM] 區塊時間說明解析失敗但繼續 | Block: {i+1} | Error: {session_time_error}",
exc_info=True,
)
time_el = parent.find_element(By.CSS_SELECTOR, ".dateTime .time span")
if time_el:
@@ -642,9 +663,10 @@ def run_edm_task(lpn_code="O1K5FBOqsvN"):
price_text = price_el.text.replace(",", "").strip()
if price_text.isdigit():
price = int(price_text)
except Exception:
logging.info(f"[Crawler] [EDM] 找不到價格元素 | i_code: {i_code} | Info: 可能已售完")
pass # price 保持為 None
except Exception as price_error:
logging.info(
f"[Crawler] [EDM] 找不到價格元素 | i_code: {i_code} | Info: 可能已售完 | Error: {price_error}"
)
# V9.91: 解析折扣數
discount_text = ""
@@ -666,8 +688,11 @@ def run_edm_task(lpn_code="O1K5FBOqsvN"):
qty_text = qty_span.text.strip().replace(",", "")
if qty_text.isdigit():
remain_qty = int(qty_text)
except:
pass
except Exception as remain_qty_error:
logging.debug(
f"[Crawler] [EDM] 倒數組數解析失敗但繼續 | i_code: {i_code} | Error: {remain_qty_error}",
exc_info=True,
)
current_scan_icodes.add((i_code, time_slot))
@@ -1810,8 +1835,11 @@ def run_auto_import_task():
if is_task_paused("run_auto_import_task"):
logging.info("[Scheduler] [AutoImport] ⏸️ 任務被 HITL 暫停中,本次跳過")
return
except Exception:
pass
except Exception as pause_check_error:
logging.debug(
f"[Scheduler] [AutoImport] HITL 暫停檢查失敗但繼續排程 | Error: {pause_check_error}",
exc_info=True,
)
try:
from services.import_service import import_service
@@ -2226,8 +2254,11 @@ def run_db_backup_task():
metadata={"status": "success", "size_kb": size_kb, "deleted_old": deleted_count},
ai_model="scheduler",
)
except Exception:
pass
except Exception as insight_error:
logging.warning(
f"[Scheduler] [Backup] ⚠️ 備份成功 insight 寫入失敗但繼續通知 | Error: {insight_error}",
exc_info=True,
)
else:
msg = (
f"🚨 資料庫備份失敗 ({now_str})\n"
@@ -2262,8 +2293,11 @@ def run_db_backup_task():
metadata={"status": "failed", "error": result.get("error")},
ai_model="scheduler",
)
except Exception:
pass
except Exception as insight_error:
logging.warning(
f"[Scheduler] [Backup] ⚠️ 備份失敗 insight 寫入失敗但繼續通知 | Error: {insight_error}",
exc_info=True,
)
notifier._send_telegram_messages([msg])
@@ -2288,8 +2322,11 @@ def run_db_backup_task():
NotificationManager()._send_telegram_messages([
f"🚨 DB 備份排程異常\n錯誤:{e}"
])
except Exception:
pass
except Exception as notify_error:
logging.warning(
f"[Scheduler] [Backup] ⚠️ 備份異常 Telegram 通知失敗 | Error: {notify_error}",
exc_info=True,
)
def run_backup_monitor_task():
@@ -2362,8 +2399,11 @@ def run_backup_monitor_task():
metadata={"alert": True, "reason": alert_reason, "latest_file": info.get("filename")},
ai_model="scheduler",
)
except Exception:
pass
except Exception as insight_error:
logging.warning(
f"[Scheduler] [BackupMonitor] ⚠️ 備份監控 insight 寫入失敗但繼續 | Error: {insight_error}",
exc_info=True,
)
else:
created_at = info.get("created_at")
logging.info(f"[Scheduler] [BackupMonitor] ✅ 備份狀態正常 | 最新: {info.get('filename')} @ {created_at}")

View File

@@ -1,3 +1,4 @@
import re
from pathlib import Path
@@ -69,3 +70,10 @@ def test_env_example_documents_runtime_and_ai_automation_variables():
}
assert expected_keys <= _env_example_keys()
def test_scheduler_does_not_silently_swallow_exceptions():
    """Fail when ``scheduler.py`` has a bare or pass-only ``except`` handler.

    The source is parsed with ``ast`` instead of being pattern-matched, so:

    * single-line handlers (``except Exception: pass``) and handlers whose
      ``pass`` follows a comment line are detected;
    * the text ``except:`` inside a comment or string literal does not
      trigger a false failure;
    * the assertion message lists the offending line numbers.
    """
    # Imported locally so the check does not depend on the module's import header.
    import ast

    scheduler_source = (ROOT / "scheduler.py").read_text(encoding="utf-8")
    handlers = [
        node
        for node in ast.walk(ast.parse(scheduler_source))
        if isinstance(node, ast.ExceptHandler)
    ]

    # A handler with no exception type is a bare `except:`.
    bare_lines = sorted(handler.lineno for handler in handlers if handler.type is None)
    # A handler whose body is nothing but `pass` discards the error silently.
    silent_lines = sorted(
        handler.lineno
        for handler in handlers
        if all(isinstance(stmt, ast.Pass) for stmt in handler.body)
    )

    assert not bare_lines, f"bare `except:` in scheduler.py at lines {bare_lines}"
    assert not silent_lines, (
        f"pass-only except handler in scheduler.py at lines {silent_lines}"
    )