fix(stability): 補強 scheduler 例外記錄
All checks were successful
CD Pipeline / deploy (push) Successful in 1m36s
This commit is contained in:
@@ -2,7 +2,7 @@
|
||||
|
||||
> 本文件定義專案開發的核心準則與不可違反的規範
|
||||
> **建立日期**: 2026-01-12
|
||||
> **當前版本**: V10.17 (Ollama embedding /api/embed 強化版)
|
||||
> **當前版本**: V10.18 (Scheduler 例外記錄強化版)
|
||||
> **最後更新**: 2026-04-30
|
||||
|
||||
---
|
||||
|
||||
@@ -26,6 +26,7 @@
|
||||
- ElephantAlpha NIM fallback 強化:預設改用 production 可呼叫的 `nvidia/llama-3.3-nemotron-super-49b-v1.5`,Ultra 253B 權限 404 時自動 fallback。
|
||||
- DatabaseManager 連線池收斂:PostgreSQL 每 worker pool 調整為 `pool_size=2/max_overflow=3`,避免多 route 重複 new manager 時吃滿連線。
|
||||
- Ollama embedding 強化:改為優先 `/api/embed`,舊節點才 fallback `/api/embeddings`,並新增 `EMBEDDING_TIMEOUT`。
|
||||
- Scheduler 例外記錄強化:清除 `scheduler.py` 靜默 `except/pass`,資源清理、EDM 可選欄位、備份 insight/通知失敗全改為可診斷 log。
|
||||
|
||||
【下次待辦】
|
||||
- 觀察 Prometheus scrape 後 `momo_ai_*` 是否在事件發生後產生時間序列。
|
||||
|
||||
4
app.py
@@ -95,8 +95,8 @@ except Exception as e:
|
||||
sys_log.error(f"無法檢測磁碟空間: {e}")
|
||||
|
||||
# 🚩 系統版本定義 (備份與顯示用)
|
||||
# 🚩 2026-04-30 V10.17: Ollama embedding /api/embed hardening
|
||||
SYSTEM_VERSION = "V10.17"
|
||||
# 🚩 2026-04-30 V10.18: Scheduler exception logging hardening
|
||||
SYSTEM_VERSION = "V10.18"
|
||||
|
||||
# ==========================================
|
||||
# 🔒 SQL Injection 防護函數
|
||||
|
||||
@@ -254,7 +254,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '')
|
||||
# ==========================================
|
||||
# 系統版本與路徑
|
||||
# ==========================================
|
||||
SYSTEM_VERSION = "V10.17"
|
||||
SYSTEM_VERSION = "V10.18"
|
||||
LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log')
|
||||
public_url = PUBLIC_URL # 用於模板顯示
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
> **最後更新**: 2026-04-30 (台北時間)
|
||||
> **狀態**: 🟢 四 AI Agent 自動化閉環已落地 — EventRouter / AutoHeal / OpenClaw Memory / ElephantAlpha bridge / Prometheus metrics / Smoke Dashboard / Smoke Trend Management / Telegram Summary / Grafana provisioning / Prometheus scrape / CD Gunicorn 掛載具測試覆蓋
|
||||
> **適用版本**: V10.17 Ollama embedding /api/embed 強化版
|
||||
> **適用版本**: V10.18 Scheduler 例外記錄強化版
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -26,6 +26,7 @@
|
||||
- 2026-04-30 production `NVIDIA_API_KEY` 可列出 Ultra 253B 但呼叫 `nvidia/llama-3.1-nemotron-ultra-253b-v1` 會 404;ElephantAlpha 預設改用 `nvidia/llama-3.3-nemotron-super-49b-v1.5` 並加入 fallback models。
|
||||
- 2026-04-30 `DatabaseManager()` 多 route 重複建立曾有吃滿 PostgreSQL clients 風險;已重用 engine/session 並將每 worker pool 收斂為 `pool_size=2/max_overflow=3`。
|
||||
- 2026-04-30 OpenClaw embedding worker 曾在舊 `/api/embeddings` 路徑遇到 Hermes timeout;Ollama client 已改為優先 `/api/embed`,舊節點才 fallback `/api/embeddings`。
|
||||
- 2026-04-30 `scheduler.py` 殘留靜默 `except/pass`;已改為 warning/debug log,備份 insight 與 Telegram 通知失敗不再靜默。
|
||||
|
||||
## 已落地範圍
|
||||
|
||||
@@ -64,6 +65,7 @@
|
||||
- 2026-04-30 ElephantAlpha NIM fallback hardening:新增 `tests/test_elephant_service.py`。
|
||||
- 2026-04-30 DatabaseManager pool convergence:`tests/test_database_manager_cache.py` 覆蓋 pool size/overflow 與 engine reuse。
|
||||
- 2026-04-30 Ollama embedding API migration:新增 `tests/test_ollama_embedding.py`。
|
||||
- 2026-04-30 Phase 3f cleanup contracts:`tests/test_phase3f_cleanup_contracts.py` 覆蓋 orphan services、env 範例、scheduler 靜默例外。
|
||||
- 2026-04-29 L2 安全記憶批次:`24 passed`。
|
||||
- collect-only:`48 tests collected`。
|
||||
- `git diff --check` 已通過。
|
||||
|
||||
@@ -39,6 +39,7 @@
|
||||
- **ElephantAlpha NIM fallback 強化**: production 帳號呼叫 Ultra 253B 會 404,預設改用可呼叫的 Nemotron Super 49B v1.5,並加入 70B / 8B fallback。
|
||||
- **DatabaseManager 連線池收斂**: PostgreSQL 每 worker pool 收斂為 `pool_size=2/max_overflow=3`,並以 cache 重用 engine/session。
|
||||
- **Ollama embedding API 遷移**: embedding client 優先使用官方 `/api/embed`,舊節點才 fallback `/api/embeddings`,降低 deprecated endpoint 與 timeout 風險。
|
||||
- **Scheduler 例外記錄強化**: 清除 `scheduler.py` 靜默 `except/pass`,Chrome 清理、EDM optional 欄位、備份 insight/Telegram 失敗均保留 log。
|
||||
|
||||
### 2026-04-28~29:Phase 3e 重構大戰 + daily_sales cache 隱形 bug 根除
|
||||
- **app.py 縮減 -10.8%**: 7,386 → 6,590 行,11 commits 全綠零 502。
|
||||
|
||||
98
scheduler.py
@@ -151,8 +151,11 @@ def managed_scraper_resources(window_size='1920,5000', debug=False, timeout=45,
|
||||
if driver:
|
||||
try:
|
||||
driver.quit()
|
||||
except:
|
||||
pass
|
||||
except Exception as cleanup_error:
|
||||
logging.warning(
|
||||
f"[Scraper] [Resource] ⚠️ Chrome 初始化失敗後關閉 driver 也失敗 | Error: {cleanup_error}",
|
||||
exc_info=True,
|
||||
)
|
||||
driver = None
|
||||
|
||||
if retry_count <= max_retries:
|
||||
@@ -178,8 +181,11 @@ def managed_scraper_resources(window_size='1920,5000', debug=False, timeout=45,
|
||||
# 先關閉所有視窗
|
||||
try:
|
||||
driver.close()
|
||||
except:
|
||||
pass
|
||||
except Exception as close_error:
|
||||
logging.debug(
|
||||
f"[Scraper] [Resource] Chrome 視窗關閉失敗但繼續 quit | Error: {close_error}",
|
||||
exc_info=True,
|
||||
)
|
||||
# 再退出 driver
|
||||
driver.quit()
|
||||
except Exception as quit_error:
|
||||
@@ -188,14 +194,20 @@ def managed_scraper_resources(window_size='1920,5000', debug=False, timeout=45,
|
||||
try:
|
||||
import subprocess
|
||||
subprocess.run(['pkill', '-f', 'chrome.*--headless'], timeout=5, capture_output=True)
|
||||
except:
|
||||
pass
|
||||
except Exception as pkill_error:
|
||||
logging.warning(
|
||||
f"[Scraper] [Resource] ⚠️ Chrome 強制清理失敗 | Error: {pkill_error}",
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
if session:
|
||||
try:
|
||||
session.close()
|
||||
except:
|
||||
pass
|
||||
except Exception as session_error:
|
||||
logging.warning(
|
||||
f"[Scraper] [Resource] ⚠️ DB session 關閉失敗 | Error: {session_error}",
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
|
||||
def run_momo_task():
|
||||
@@ -206,8 +218,11 @@ def run_momo_task():
|
||||
if is_task_paused("run_momo_task"):
|
||||
logging.info("[Crawler] [MOMO] ⏸️ 任務被 HITL 暫停中,本次跳過")
|
||||
return
|
||||
except Exception:
|
||||
pass # agent_actions 未就緒時不阻塞排程
|
||||
except Exception as pause_check_error:
|
||||
logging.debug(
|
||||
f"[Crawler] [MOMO] HITL 暫停檢查失敗但繼續排程 | Error: {pause_check_error}",
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
try:
|
||||
# V-New: 每次執行任務時,動態從 JSON 檔案重新讀取分類
|
||||
@@ -517,8 +532,11 @@ def run_edm_task(lpn_code="O1K5FBOqsvN"):
|
||||
if "/" in txt and ":" in txt: # 增強判斷:需包含日期斜線與時間冒號
|
||||
activity_time_text = txt
|
||||
break
|
||||
except:
|
||||
pass
|
||||
except Exception as activity_time_error:
|
||||
logging.debug(
|
||||
f"[Crawler] [EDM] 活動時間文字解析失敗但繼續 | Error: {activity_time_error}",
|
||||
exc_info=True,
|
||||
)
|
||||
if not activity_time_text:
|
||||
activity_time_text = activity_name
|
||||
logging.info(f"[Crawler] [EDM] ⏰ 抓取到的全站活動時間: {activity_time_text}")
|
||||
@@ -553,8 +571,11 @@ def run_edm_task(lpn_code="O1K5FBOqsvN"):
|
||||
# 嘗試抓取 .dateTime,若無則嘗試找包含 "開搶" 的元素
|
||||
dt_el = parent.find_element(By.CSS_SELECTOR, ".dateTime")
|
||||
session_time_text = dt_el.text.strip()
|
||||
except:
|
||||
pass
|
||||
except Exception as session_time_error:
|
||||
logging.debug(
|
||||
f"[Crawler] [EDM] 區塊時間說明解析失敗但繼續 | Block: {i+1} | Error: {session_time_error}",
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
time_el = parent.find_element(By.CSS_SELECTOR, ".dateTime .time span")
|
||||
if time_el:
|
||||
@@ -642,9 +663,10 @@ def run_edm_task(lpn_code="O1K5FBOqsvN"):
|
||||
price_text = price_el.text.replace(",", "").strip()
|
||||
if price_text.isdigit():
|
||||
price = int(price_text)
|
||||
except Exception:
|
||||
logging.info(f"[Crawler] [EDM] ℹ️ 找不到價格元素 | i_code: {i_code} | Info: 可能已售完")
|
||||
pass # price 保持為 None
|
||||
except Exception as price_error:
|
||||
logging.info(
|
||||
f"[Crawler] [EDM] ℹ️ 找不到價格元素 | i_code: {i_code} | Info: 可能已售完 | Error: {price_error}"
|
||||
)
|
||||
|
||||
# V9.91: 解析折扣數
|
||||
discount_text = ""
|
||||
@@ -666,8 +688,11 @@ def run_edm_task(lpn_code="O1K5FBOqsvN"):
|
||||
qty_text = qty_span.text.strip().replace(",", "")
|
||||
if qty_text.isdigit():
|
||||
remain_qty = int(qty_text)
|
||||
except:
|
||||
pass
|
||||
except Exception as remain_qty_error:
|
||||
logging.debug(
|
||||
f"[Crawler] [EDM] 倒數組數解析失敗但繼續 | i_code: {i_code} | Error: {remain_qty_error}",
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
current_scan_icodes.add((i_code, time_slot))
|
||||
|
||||
@@ -1810,8 +1835,11 @@ def run_auto_import_task():
|
||||
if is_task_paused("run_auto_import_task"):
|
||||
logging.info("[Scheduler] [AutoImport] ⏸️ 任務被 HITL 暫停中,本次跳過")
|
||||
return
|
||||
except Exception:
|
||||
pass
|
||||
except Exception as pause_check_error:
|
||||
logging.debug(
|
||||
f"[Scheduler] [AutoImport] HITL 暫停檢查失敗但繼續排程 | Error: {pause_check_error}",
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
try:
|
||||
from services.import_service import import_service
|
||||
@@ -2226,8 +2254,11 @@ def run_db_backup_task():
|
||||
metadata={"status": "success", "size_kb": size_kb, "deleted_old": deleted_count},
|
||||
ai_model="scheduler",
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
except Exception as insight_error:
|
||||
logging.warning(
|
||||
f"[Scheduler] [Backup] ⚠️ 備份成功 insight 寫入失敗但繼續通知 | Error: {insight_error}",
|
||||
exc_info=True,
|
||||
)
|
||||
else:
|
||||
msg = (
|
||||
f"🚨 資料庫備份失敗 ({now_str})\n"
|
||||
@@ -2262,8 +2293,11 @@ def run_db_backup_task():
|
||||
metadata={"status": "failed", "error": result.get("error")},
|
||||
ai_model="scheduler",
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
except Exception as insight_error:
|
||||
logging.warning(
|
||||
f"[Scheduler] [Backup] ⚠️ 備份失敗 insight 寫入失敗但繼續通知 | Error: {insight_error}",
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
notifier._send_telegram_messages([msg])
|
||||
|
||||
@@ -2288,8 +2322,11 @@ def run_db_backup_task():
|
||||
NotificationManager()._send_telegram_messages([
|
||||
f"🚨 DB 備份排程異常\n錯誤:{e}"
|
||||
])
|
||||
except Exception:
|
||||
pass
|
||||
except Exception as notify_error:
|
||||
logging.warning(
|
||||
f"[Scheduler] [Backup] ⚠️ 備份異常 Telegram 通知失敗 | Error: {notify_error}",
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
|
||||
def run_backup_monitor_task():
|
||||
@@ -2362,8 +2399,11 @@ def run_backup_monitor_task():
|
||||
metadata={"alert": True, "reason": alert_reason, "latest_file": info.get("filename")},
|
||||
ai_model="scheduler",
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
except Exception as insight_error:
|
||||
logging.warning(
|
||||
f"[Scheduler] [BackupMonitor] ⚠️ 備份監控 insight 寫入失敗但繼續 | Error: {insight_error}",
|
||||
exc_info=True,
|
||||
)
|
||||
else:
|
||||
created_at = info.get("created_at")
|
||||
logging.info(f"[Scheduler] [BackupMonitor] ✅ 備份狀態正常 | 最新: {info.get('filename')} @ {created_at}")
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
@@ -69,3 +70,10 @@ def test_env_example_documents_runtime_and_ai_automation_variables():
|
||||
}
|
||||
|
||||
assert expected_keys <= _env_example_keys()
|
||||
|
||||
|
||||
def _find_silent_exception_handlers(source):
    """Return one message per silent ``except`` handler found in *source*.

    A handler is reported when it is a bare ``except:`` (no exception type),
    or when its body consists solely of ``pass`` statements.  The scan walks
    the parsed syntax tree instead of matching text, so comments, blank
    lines, one-line handlers and line-ending style cannot hide a handler,
    and the word ``except:`` inside a string or comment is not reported.

    Args:
        source: Python source code to inspect.

    Returns:
        A list of human-readable messages, ordered by line number. The list
        is empty when no silent handler exists.

    Raises:
        SyntaxError: If *source* is not valid Python.
    """
    # Imported locally so this block does not depend on the module's
    # top-level import list.
    import ast

    findings = []
    for node in ast.walk(ast.parse(source)):
        if not isinstance(node, ast.ExceptHandler):
            continue
        if node.type is None:
            findings.append((node.lineno, "bare 'except:'"))
        elif all(isinstance(stmt, ast.Pass) for stmt in node.body):
            findings.append((node.lineno, "handler body is only 'pass'"))

    # ast.walk is breadth-first; sort so the report reads top to bottom.
    findings.sort()
    return [f"line {lineno}: {reason}" for lineno, reason in findings]


def test_scheduler_does_not_silently_swallow_exceptions():
    """Guard against silent exception swallowing in ``scheduler.py``.

    Fails when ``scheduler.py`` contains a bare ``except:`` or an ``except``
    handler whose body is only ``pass``; the failure message lists each
    offending line.
    """
    scheduler_source = (ROOT / "scheduler.py").read_text(encoding="utf-8")

    offenders = _find_silent_exception_handlers(scheduler_source)

    assert not offenders, (
        "scheduler.py contains silent exception handlers:\n" + "\n".join(offenders)
    )
|
||||
|
||||
Reference in New Issue
Block a user