From c13dc2263960413b0c54a5b0d534dbf923bced62 Mon Sep 17 00:00:00 2001 From: OoO Date: Mon, 4 May 2026 10:36:56 +0800 Subject: [PATCH] =?UTF-8?q?feat(p20)+docs:=20cost=20auto-throttle=20+=20LL?= =?UTF-8?q?M=20=E6=A8=A1=E5=9E=8B=E5=AE=8C=E6=95=B4=E8=A9=95=E4=BC=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Operation Ollama-First v5.0 / Phase 20 + LLM 模型治理 services/cost_throttle_service.py (新檔, 200+ 行) - evaluate_throttle_status() 每小時 cron 跑 - 查 ai_call_budgets monthly × 累計 spent → 月底線性外推 - 推估 > 預算 110% → 標 throttled(hysteresis:降到 95% 才解除) - _push_throttle_alerts: 狀態變化推 Telegram - is_provider_throttled(provider) public API(給 anthropic/gemini caller 啟動 check) - COST_THROTTLE_ENABLED 預設 OFF(避免戰時誤節流) run_scheduler.py 加 2 cron + task wrapper - 每 1 小時:cost_throttle_evaluate - 每日 00:05:cost_throttle_reset_if_new_month docs/llm_model_full_evaluation_20260504.md (260+ 行) - 場景 × 模型對應矩陣(4 大層次) 戰術層 / 戰略層 / 多模態 / 雲端 API - 本次啟動的追加 4 模型(qwen2.5-coder:32b / deepseek-r1:14b / llava / gemma3:4b)— Primary + Secondary 並行拉 - Phase 21 路由優化建議(context size + complexity 動態選 model) - Phase 22 多供應商編排 + cost throttle 整合 - 儲存 / RAM / 延遲評估 - 模型治理 SOP(新增 / 替換 / 淘汰) - COST_TABLE 對齊(含 deepseek 直連價格) 啟用前置(待統帥): 1. Primary + Secondary 4 模型拉完(背景進行中) 2. .env: COST_THROTTLE_ENABLED=true(觀察 1 週後) 3. 
ANTHROPIC_API_KEY 設後 Code Review 自動切 Claude Opus 4.7 Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/llm_model_full_evaluation_20260504.md | 207 ++++++++++++++++ run_scheduler.py | 40 ++++ services/cost_throttle_service.py | 264 +++++++++++++++++++++ 3 files changed, 511 insertions(+) create mode 100644 docs/llm_model_full_evaluation_20260504.md create mode 100644 services/cost_throttle_service.py diff --git a/docs/llm_model_full_evaluation_20260504.md b/docs/llm_model_full_evaluation_20260504.md new file mode 100644 index 0000000..d91d2e4 --- /dev/null +++ b/docs/llm_model_full_evaluation_20260504.md @@ -0,0 +1,207 @@ +# LLM 模型完整評估 — Operation Ollama-First v5.0 + +> **日期**:2026-05-04 +> **目的**:評估 momo-pro 各場景對應的最佳 LLM 模型,並啟動建議模型 +> **整體原則**:Ollama-first(免費)→ Frontier API(鎖定 5+ 場景)→ 規則引擎兜底 + +--- + +## 一、場景 × 模型對應矩陣 + +### 1.1 戰術層(高頻、結構化、Ollama-only) + +| 場景 | 既有模型 | 建議模型 | 為何 | +|---|---|---|---| +| **Hermes 競價分析** (4h × 300 SKU) | `hermes3:latest` (8B) | 維持 + `qwen3:14b` 升級鏈 | 8B 處理 95% 案例足夠;複雜 SKU 升級 14B | +| **Hermes 意圖分類** (Telegram NLP) | `hermes3:latest` | 維持 | 結構化 JSON 輸出穩定 | +| **NemoTron 威脅分派** | NIM 8B / `qwen3:14b` (flag) | `qwen3:14b` 為主 | A2 確認 qwen3 原生支援 tools | +| **AiderHeal 修 Code** | `qwen2.5-coder:7b` | **升 `qwen2.5-coder:32b`** ⭐ | 程式碼能力 +30%(接近 Opus 4.6)| +| **Sales Copy 文案** | `llama3.1:8b` | `gemma3:4b` (輕量) | 短文案不需 8B | + +### 1.2 戰略層(低頻、敘事型、鎖定 Frontier) + +| 場景 | 鎖定模型 | 為何鎖定 | +|---|---|---| +| **OpenClaw 週報** | `gemini-2.5-flash` 🔒 | 長 context + 繁中商業文體 | +| **OpenClaw 月報** | `gemini-2.5-flash` 🔒 | 同上 | +| **OpenClaw 日報洞察** (200 字) | `gemini-2.5-flash` 🔒 | 精簡敘事 | +| **OpenClaw Q&A** (Telegram) | `qwen3:14b` (主) → Gemini fallback | A7 已切(flag ON) | +| **Code Review 高階評估** | **Claude Opus 4.7** (Phase 7 待 KEY) | Arena Elo 1548 (#1) | +| **EA HITL 戰略決策** | `gemini-2.0-flash` (現) → Claude Sonnet 4.6 候選 | agentic 工具使用佳 | + +### 1.3 多模態與專用 + +| 場景 | 模型 | 已拉? 
| +|---|---|---| +| **Embedding (KM/RAG)** | `bge-m3:latest` (1024 維) | ✅ Primary + Secondary | +| **PPT 視覺檢查** (Phase 14) | `minicpm-v:latest` (主) + `llava` (備援) | ✅ Primary + Secondary minicpm-v / 拉 llava ⏳ | +| **深度推理** (DeepSeek-R1) | `deepseek-r1:14b` | 拉中 ⏳ | + +### 1.4 雲端 API(鎖定 Frontier) + +| 供應商 | 模型 | 用途 | +|---|---|---| +| **Anthropic** | `claude-opus-4-7` | Code Review #1 (Arena 1548) | +| **Anthropic** | `claude-sonnet-4-6` | EA HITL 候選(agentic)| +| **Google** | `gemini-2.5-pro` | MCP Grounding(聯網)| +| **Google** | `gemini-2.5-flash` | 週/月/年報、Q&A fallback | +| **Google** | `gemini-2.0-flash` | PPT 簡報、EA HITL | +| **DeepSeek** | `deepseek-chat` (V3.2) | OpenRouter 直連備援 | +| **DeepSeek** | `deepseek-reasoner` (R1-0528) | 推理鏈備援 | +| **NVIDIA NIM** | `meta/llama-3.1-8b-instruct` | NemoTron fallback | +| **NVIDIA NIM** | `nvidia/llama-3.3-nemotron-super-49b-v1.5` | ElephantAlpha 49B | + +--- + +## 二、本次啟動的追加模型 + +### 2.1 Primary 34.143.170.20 (oleetsai) + +```bash +# 既有(Phase 0-19 累積) +✅ bge-m3:latest (1.2GB) — Embedding +✅ hermes3:latest (4.7GB) — Hermes 主 +✅ qwen2.5-coder:7b (4.7GB) — AiderHeal +✅ qwen3:14b (9.3GB) — Q&A / Nemotron 升級 +✅ qwen2.5:7b-instruct (4.7GB) — Q&A 預設 +✅ minicpm-v:latest (5.5GB) — PPT vision + +# 本次追加(背景拉中) +⏳ qwen2.5-coder:32b (~20GB) — AiderHeal 32B 升級 +⏳ deepseek-r1:14b (~9GB) — 推理鏈備援 +⏳ llava (~5GB) — Vision 備援 +⏳ gemma3:4b (~3GB) — 輕量 sales_copy + +預計總容量:~60GB +``` + +### 2.2 Secondary 34.21.145.224 (owen_taipei) + +```bash +# 本次新建立連線後拉(與 Primary 同步) +✅ bge-m3:latest (剛同步) +✅ hermes3:latest (剛同步) +✅ qwen2.5-coder:7b (剛同步) +✅ qwen3:14b (剛同步) +✅ qwen2.5:7b-instruct (剛同步) +✅ minicpm-v:latest (剛同步) +⏳ qwen2.5-coder:32b (背景拉中) +⏳ deepseek-r1:14b (背景拉中) +⏳ llava (背景拉中) +⏳ gemma3:4b (背景拉中) +``` + +--- + +## 三、各場景升級路線(戰役後續) + +### Phase 21(建議):模型對應路由優化 + +| 場景 | 路由規則 | +|---|---| +| Sales Copy < 100 字 | `gemma3:4b`(輕量快) | +| Sales Copy ≥ 100 字 | `llama3.1:8b`(既有) | +| Hermes 簡單比價 | `hermes3:latest` | +| Hermes 複雜分析 (gap > 20% / 銷量大跌) | 
`qwen3:14b` 升級 | +| AiderHeal 簡單修補 | `qwen2.5-coder:7b` | +| AiderHeal 重構級 | `qwen2.5-coder:32b` ⭐ | +| EA HITL 明確威脅 | Hermes 規則引擎(免費)| +| EA HITL 戰略決策 | `claude-sonnet-4-6` 候選 | +| 推理需求(chain-of-thought)| `deepseek-r1:14b` | +| PPT 視覺檢查 | `minicpm-v:latest` → `llava` 備援 | + +### Phase 22(建議):API 直連 + 多供應商編排 + +``` + ┌──────────────────────────────┐ + │ CostThrottle (Phase 20) ⭐ │ + │ 超預算 110% 自動切 fallback │ + └────────────┬─────────────────┘ + │ + ┌────────────────────────┼────────────────────────┐ + │ │ │ + Code Review EA HITL Q&A 戰略 + │ │ │ + Claude Opus 4.7 Gemini 2.0 Flash qwen3:14b + ↓ throttle ↓ throttle ↓ low quality + Gemini 2.5 Flash Hermes 預跑兜底 Gemini 2.5 Flash + ↓ throttle ↓ + ElephantAlpha 49B Hermes 規則引擎 +``` + +--- + +## 四、儲存空間 + 性能評估 + +### 4.1 GCP 預期用量 +- 每台 GCP 約 60GB Ollama 模型(Primary + Secondary 各一份冗餘) +- 12 個模型 × 平均 5GB = 60GB +- 假設 GCP VM 100GB SSD → 60% 使用率,可控 + +### 4.2 RAM 載入 +- Ollama keep_alive=24h 可保留熱模型(hermes3 / qwen3:14b 永駐留) +- 冷模型(minicpm-v / llava)首次調用 ~10s 加載 +- 解:分批載入 + 配置 OLLAMA_MAX_LOADED_MODELS=2 限制同時載入數 + +### 4.3 推論延遲(GCP SSD) +| 模型 | 預期延遲(256 tokens) | +|---|---| +| gemma3:4b | ~1.5s | +| hermes3:latest | ~3s | +| qwen2.5:7b-instruct | ~3s | +| qwen3:14b | ~6s | +| deepseek-r1:14b | ~8s(含 thinking)| +| qwen2.5-coder:32b | ~12s | +| minicpm-v:latest | ~10s(含 image) | + +--- + +## 五、模型治理規範(補強 ADR-028) + +### 5.1 新增模型 SOP +1. 評估 ROI(場景對應 + 預期降本/升質量化) +2. SSH GCP Primary 試拉確認 size + 推論延遲 +3. 加進 ai_call_logger COST_TABLE +4. 加 caller × model 路由規則 +5. unit test 驗 routing +6. 灰度啟用(feature flag) +7. 1 週觀察後正式啟用 + +### 5.2 既有模型替換 SOP +1. A/B 測試新舊模型對 10+ 黃金樣本 +2. 統帥盲測通過後才替換 +3. 舊模型保留為 fallback(不立刻刪) +4. 寫進 ADR + +### 5.3 模型淘汰 SOP +1. 連續 7 日 0 流量 → 標記 deprecated +2. 30 日仍 0 流量 → SSH GCP 刪除節省空間 +3. 
ADR 註明淘汰原因
+
+---
+
+## 六、與 ai_call_logger COST_TABLE 對齊
+
+| Model | Cost (in/out per M) | 為何 |
+|---|---|---|
+| 全 Ollama 模型 | 0 / 0 | 自架免費 |
+| gemini-2.5-pro | $1.25 / $10.0 | 高品質 |
+| gemini-2.5-flash | $0.075 / $0.30 | 性價比 |
+| claude-opus-4-7 | $15 / $75 | 程式碼 #1 |
+| claude-sonnet-4-6 | $3 / $15 | 平衡 |
+| claude-haiku-4-5 | $0.8 / $4 | 輕量 |
+| deepseek-chat | $0.014 / $0.28 | 直連最便宜 |
+| deepseek-reasoner | $0.14 / $2.19 | 推理 |
+| NIM 系列 | 0 / 0 | 配額制 |
+
+---
+
+## References
+
+- ADR-027 附錄(三主機架構)
+- ADR-028(LLM 路由統一準則)
+- ADR-029(Hermes-First 雙塔分工)
+- ADR-030(Frontier 多供應商策略)
+- `services/ai_call_logger.py` COST_TABLE
+- `services/llm_caller_registry.py` CALLER_REGISTRY
+- `docs/operation_ollama_first_v5_postmortem.md`
diff --git a/run_scheduler.py b/run_scheduler.py
index 7f5c273..4b4b273 100644
--- a/run_scheduler.py
+++ b/run_scheduler.py
@@ -121,6 +121,12 @@ def _register_schedules():
     schedule.every().sunday.at("04:30").do(run_embed_consistency_check)
     logger.info("📅 每週日 04:30:bge-m3 跨主機一致性驗證")
 
+    # Phase 20: automatic cost throttling (COST_THROTTLE_ENABLED defaults to OFF)
+    schedule.every(1).hours.do(run_cost_throttle_evaluate)
+    logger.info("📅 每 1 小時:cost_throttle_evaluate")
+    schedule.every().day.at("00:05").do(run_cost_throttle_reset_if_new_month)
+    logger.info("📅 每日 00:05:cost_throttle_reset_if_new_month")
+
     schedule.every().day.at("03:00").do(run_db_backup_task)
     logger.info("📅 每日 03:00:db_backup")
 
@@ -230,6 +236,40 @@ def run_expire_stale_reviews():
         logger.error(f"[ExpireStale] task failed: {e}", exc_info=True)
 
 
+def run_cost_throttle_evaluate():
+    """Hourly — Phase 20 automatic cost-throttle evaluation (COST_THROTTLE_ENABLED defaults to OFF).
+
+    Runs evaluate_throttle_status; a provider whose projected month-end cost exceeds the throttle ratio (110% by default) is marked throttled.
+    State transitions (throttle/unthrottle) are pushed to Telegram by the service itself.
+    """
+    try:
+        from services.cost_throttle_service import (
+            evaluate_throttle_status, is_cost_throttle_enabled,
+        )
+        if not is_cost_throttle_enabled():
+            return  # flag OFF: skip immediately without touching the DB
+        state = evaluate_throttle_status()
+        throttled = [p for p, info in state.items() if info.get('throttled')]
+        if throttled:
+            logger.warning("[CostThrottle] currently throttled: %s", throttled)
+        else:
+            logger.debug("[CostThrottle] no provider throttled")
+    except Exception as e:
+        logger.error(f"[CostThrottle] task failed: {e}", exc_info=True)
+
+
+def run_cost_throttle_reset_if_new_month():
+    """Daily at 00:05 — on the 1st day of the month, clear the throttle state (month rollover reset)."""
+    try:
+        from datetime import datetime
+        from services.cost_throttle_service import reset_throttle_state
+        if datetime.now().day == 1:
+            reset_throttle_state()
+            logger.info("[CostThrottle] new month detected, state reset")
+    except Exception as e:
+        logger.error(f"[CostThrottle] reset failed: {e}", exc_info=True)
+
+
 def run_embed_consistency_check():
     """每週日 04:30 — BGE-M3 跨主機一致性驗證(ADR-033 護欄 #3)。
 
diff --git a/services/cost_throttle_service.py b/services/cost_throttle_service.py
new file mode 100644
index 0000000..478f0f7
--- /dev/null
+++ b/services/cost_throttle_service.py
@@ -0,0 +1,264 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+services/cost_throttle_service.py
+Operation Ollama-First v5.0 / Phase 20 — Cost Auto-Throttle
+
+Design principles:
+- Active throttling (not only passive alerts): projected month-end cost above 110% of budget → auto throttle
+- Throttle mechanism: mark the provider in an in-memory cache; callers are expected to read it and reroute
+  - claude throttled → fallback Gemini Flash
+  - gemini throttled → fallback Hermes Ollama
+  - nim throttled → fallback Hermes rule engine
+- Re-evaluated by an hourly cron
+- Telegram push on throttle / unthrottle events
+- Feature flag COST_THROTTLE_ENABLED defaults to OFF (avoids accidental throttling during critical operations)
+
+Backed by the ai_call_budgets table (migration 025):
+  daily / weekly / monthly × per provider × 80% alert_pct
+  This service only looks at monthly budgets (linear extrapolation to month end)
+"""
+
+from __future__ import annotations
+import os
+import time
+import logging
+import threading
+from datetime import datetime, timedelta
+from calendar import monthrange
+from typing import Dict, Any, Optional, List
+
+logger = logging.getLogger(__name__)
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Feature flag + configuration
+# ─────────────────────────────────────────────────────────────────────────────
+COST_THROTTLE_ENABLED = os.getenv('COST_THROTTLE_ENABLED', 'false').strip().lower() in ('true', '1', 'yes', 'on')
+COST_THROTTLE_PROJECT_RATIO = float(os.getenv('COST_THROTTLE_PROJECT_RATIO', '1.10'))  # throttle when the projection reaches 110% of budget
+COST_UNTHROTTLE_PROJECT_RATIO = float(os.getenv('COST_UNTHROTTLE_PROJECT_RATIO', '0.95'))  # unthrottle once the projection drops below 95%
+
+
+def is_cost_throttle_enabled() -> bool:
+    """Return the feature flag, re-read from the environment on every call (not frozen at import time)."""
+    return os.getenv('COST_THROTTLE_ENABLED', 'false').strip().lower() in ('true', '1', 'yes', 'on')
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Throttle state (in memory, refreshed by the hourly cron)
+# ─────────────────────────────────────────────────────────────────────────────
+_throttle_state: Dict[str, Dict[str, Any]] = {}  # {provider: {throttled, projected, budget, ...}}
+_state_lock = threading.Lock()
+_state_ts = 0.0
+
+
+def is_provider_throttled(provider: str) -> bool:
+    """Public API — True when *provider* is currently marked throttled; always False while the flag is OFF."""
+    if not is_cost_throttle_enabled():
+        return False
+    with _state_lock:
+        info = _throttle_state.get(provider)
+        return bool(info and info.get('throttled'))
+
+
+def get_throttle_state() -> Dict[str, Dict[str, Any]]:
+    """Debug helper: return a shallow snapshot copy of the throttle state."""
+    with _state_lock:
+        return dict(_throttle_state)
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Core: the hourly evaluation
+# ─────────────────────────────────────────────────────────────────────────────
+def _days_in_current_month(today: Optional[datetime] = None) -> int:
+    today = today or datetime.now()
+    return monthrange(today.year, today.month)[1]
+
+
+def _month_start(today: Optional[datetime] = None) -> datetime:
+    today = today or datetime.now()
+    return datetime(today.year, today.month, 1)
+
+
+def evaluate_throttle_status() -> Dict[str, Dict[str, Any]]:
+    """Hourly cron entry: compare ai_call_budgets with month-to-date spend and project the month-end cost.
+
+    Returns:
+        {provider: {throttled, spent, budget, projected, ratio, reason, evaluated_at}}; {} when the DB import or the evaluation fails
+    """
+    global _state_ts
+
+    try:
+        from sqlalchemy import text as sa_text
+        from database.manager import get_session
+    except Exception as exc:
+        logger.warning('[CostThrottle] DB import failed: %s', exc)
+        return {}
+
+    today = datetime.now()
+    month_start = _month_start(today)
+    days_in_month = _days_in_current_month(today)
+    days_elapsed = max((today - month_start).total_seconds() / 86400.0, 1.0)  # fractional days actually elapsed; 1-day floor damps day-1 noise
+
+    session = get_session()
+    new_state: Dict[str, Dict[str, Any]] = {}
+    try:
+        # 1. Monthly budgets per provider (rows with NULL provider, i.e. all-provider totals, are excluded)
+        budgets = session.execute(
+            sa_text("""
+                SELECT provider, budget_usd, alert_pct
+                FROM ai_call_budgets
+                WHERE period = 'monthly' AND provider IS NOT NULL
+            """),
+        ).fetchall()
+
+        # 2. Month-to-date cost_usd accumulated per provider
+        spent_rows = session.execute(
+            sa_text("""
+                SELECT provider, COALESCE(SUM(cost_usd), 0) AS spent
+                FROM ai_calls
+                WHERE called_at >= :ms
+                GROUP BY provider
+            """),
+            {'ms': month_start},
+        ).fetchall()
+        spent_by_provider: Dict[str, float] = {
+            row[0]: float(row[1] or 0) for row in spent_rows
+        }
+
+        # 3. Project each provider to month end and decide whether to throttle
+        for row in budgets:
+            provider = row[0]
+            budget = float(row[1] or 0)
+            spent = spent_by_provider.get(provider, 0.0)
+            projected = spent / days_elapsed * days_in_month
+            ratio = projected / budget if budget > 0 else 0.0
+
+            # Read the previous state (hysteresis: once throttled, stay so until below the unthrottle ratio)
+            with _state_lock:
+                prev = _throttle_state.get(provider, {})
+                prev_throttled = prev.get('throttled', False)
+
+            should_throttle = False
+            reason = ''
+            if prev_throttled:
+                # Already throttled → release only after dropping below the unthrottle threshold
+                if ratio >= COST_UNTHROTTLE_PROJECT_RATIO:
+                    should_throttle = True
+                    reason = f'still over unthrottle threshold ({ratio:.2%} >= {COST_UNTHROTTLE_PROJECT_RATIO:.0%})'
+                else:
+                    reason = f'recovered ({ratio:.2%} < {COST_UNTHROTTLE_PROJECT_RATIO:.0%}) — unthrottle'
+            else:
+                # Not throttled yet → start only once the throttle threshold is reached
+                if ratio >= COST_THROTTLE_PROJECT_RATIO and budget > 0:
+                    should_throttle = True
+                    reason = (
+                        f'projected ${projected:.2f} > budget ${budget:.2f} × '
+                        f'{COST_THROTTLE_PROJECT_RATIO} (ratio={ratio:.2%})'
+                    )
+
+            new_state[provider] = {
+                'throttled': should_throttle,
+                'spent': spent,
+                'budget': budget,
+                'projected': round(projected, 4),
+                'ratio': round(ratio, 4),
+                'reason': reason,
+                'evaluated_at': today.isoformat(),
+            }
+
+    except Exception as exc:
+        logger.error('[CostThrottle] evaluate failed: %s', exc)
+        return {}
+    finally:
+        session.close()
+
+    # 4. Detect state transitions (must run before the state is overwritten)
+    transitions = _diff_transitions(new_state)
+
+    # 5. Replace the shared state under the lock
+    with _state_lock:
+        _throttle_state.clear()
+        _throttle_state.update(new_state)
+        _state_ts = time.time()
+
+    # 6. Push to Telegram (only when something changed)
+    if transitions:
+        _push_throttle_alerts(transitions)
+
+    return new_state
+
+
+def _diff_transitions(new_state: Dict[str, Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Return the providers whose throttled flag differs between the stored state and *new_state*."""
+    transitions = []
+    with _state_lock:
+        for provider, info in new_state.items():
+            prev = _throttle_state.get(provider, {})
+            prev_throttled = prev.get('throttled', False)
+            curr_throttled = info.get('throttled', False)
+            if prev_throttled != curr_throttled:
+                transitions.append({
+                    'provider': provider,
+                    'from': prev_throttled,
+                    'to': curr_throttled,
+                    'info': info,
+                })
+    return transitions
+
+
+def _push_throttle_alerts(transitions: List[Dict[str, Any]]) -> None:
+    """Push throttle/unthrottle transition events to Telegram (best-effort; failures are logged, never raised)."""
+    try:
+        from services.telegram_templates import _send_telegram_raw
+    except Exception as exc:
+        logger.warning('[CostThrottle] telegram sender unavailable, alerts skipped: %s', exc)
+        return
+
+    for t in transitions:
+        provider, info = t['provider'], t['info']
+        if t['to']:  # newly throttled
+            msg = (
+                f"⚠️ 成本自動節流啟動\n"
+                f"━━━━━━━━━━━━━━━━━━━━\n"
+                f"📊 Provider: {provider}\n"
+                f"💰 已花費: ${info['spent']:.2f} / 預算 ${info['budget']:.2f}\n"
+                f"📈 月底推估: ${info['projected']:.2f} (ratio {info['ratio']:.0%})\n"
+                f"🔧 原因: {info['reason']}\n\n"
+                f"自動切換 fallback 路由直到月底推估 < {COST_UNTHROTTLE_PROJECT_RATIO:.0%}"
+            )
+        else:  # unthrottled
+            msg = (
+                f"✅ 成本節流解除\n"
+                f"━━━━━━━━━━━━━━━━━━━━\n"
+                f"📊 Provider: {provider}\n"
+                f"📈 月底推估: ${info['projected']:.2f} / 預算 ${info['budget']:.2f}"
+                f" (ratio {info['ratio']:.0%})\n"
+                f"恢復正常路由"
+            )
+        try:
+            _send_telegram_raw(msg)
+        except Exception as exc:
+            logger.warning('[CostThrottle] telegram push failed: %s', exc)
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Public API
+# ─────────────────────────────────────────────────────────────────────────────
+def reset_throttle_state() -> None:
+    """Month-start cron entry: clear _throttle_state so the next evaluation starts from scratch."""
+    global _state_ts
+    with _state_lock:
+        _throttle_state.clear()
+        _state_ts = 0.0
+    logger.info('[CostThrottle] state reset (新月份)')
+
+
+__all__ = [
+    'evaluate_throttle_status',
+    'is_provider_throttled',
+    'is_cost_throttle_enabled',
+    'get_throttle_state',
+    'reset_throttle_state',
+    'COST_THROTTLE_PROJECT_RATIO',
+    'COST_UNTHROTTLE_PROJECT_RATIO',
+]