#!/usr/bin/env python3 """Operation Ollama-First v5.0 deploy doctor. 在 188 / 本機跑:python3 scripts/deploy_doctor_v5.py 檢查 v5.0 部署狀態,列出統帥手動還沒做的事。 退出碼:0=全綠,1=有 WARN,2=有 FAIL。 """ from __future__ import annotations import os import shutil import subprocess import sys from typing import Tuple # ───────────────────────────────────────────────────────────────────────── # v5.0 env vars: name → (criticality, expected_value_or_None, hint) # ───────────────────────────────────────────────────────────────────────── ENV_SPEC = { # API keys (FAIL if missing — 沒這些 v5.0 直接斷) 'ANTHROPIC_API_KEY': ('FAIL', None, 'Claude SDK 用;需 https://console.anthropic.com 申請'), 'GOOGLE_API_KEY': ('FAIL', None, 'Gemini fallback 用;舊有'), 'DEEPSEEK_API_KEY': ('WARN', None, '若 DEEPSEEK_DIRECT_ENABLED=true 需要'), 'TAVILY_API_KEY': ('WARN', None, 'MCP omnisearch 用;可選'), 'EXA_API_KEY': ('WARN', None, 'MCP omnisearch 用;可選'), 'TELEGRAM_ADMIN_CHAT_ID': ('WARN', None, 'Phase 28 PromotionGate Telegram 推播 audience'), 'TELEGRAM_BOT_TOKEN': ('FAIL', None, 'Telegram 推播必備'), # Feature flags (應 ON 才生效) 'MODEL_ROUTER_ENABLED': ('WARN', 'true', 'caller × context 動態路由(預設 true)'), 'COST_THROTTLE_ENABLED': ('WARN', 'true', '成本超 110% 自動 throttle(預設 true)'), 'MCP_ROUTER_ENABLED': ('WARN', 'true', 'MCP 4-server 統一路由(預設 false)'), 'PPT_VISION_ENABLED': ('WARN', 'true', 'PPT minicpm-v 視覺審核(預設 false)'), 'DEEPSEEK_DIRECT_ENABLED': ('INFO', None, '直連 DeepSeek API;省 NIM 中間層'), # RAG / embedding 'RAG_EMBED_MODEL': ('INFO', 'bge-m3', 'embedding 模型;ADR-026 鎖定 bge-m3'), 'RAG_EMBED_DIM': ('INFO', '1024', 'bge-m3 維度;migration 027 已寫死 1024'), } # 必跑 migrations(含 P38 新增 029/030 持久化表) REQUIRED_TABLES = { 'ai_calls': '024', 'mcp_calls': '025', 'ai_call_budgets': '025', 'rag_query_log': '027', 'learning_episodes': '028', 'ollama_host_history': '029', 'ppt_audit_history_db': '030', } # Ollama 主機(直連 + P53 K8s Nginx Proxy 雙軌) OLLAMA_HOSTS = [ ('Primary GCP (direct)', '34.87.90.216:11434'), ('Secondary GCP (direct)', '34.21.145.224:11434'), ('GCP-A via Nginx 110', '192.168.0.110:11435'), ('GCP-B via Nginx 110', '192.168.0.110:11436'), ('111 Mac', '192.168.0.111:11434'), ] # Phase 38-52 觀測台 endpoint(prod smoke) PROD_BASE_URL = os.getenv('PROD_BASE_URL', 'https://mo.wooo.work').rstrip('/') OBSERVABILITY_ENDPOINTS = [ '/observability/overview', '/observability/rag_queries', '/observability/business_intel', '/observability/agent_orchestration', '/observability/ai_calls', '/observability/promotion_review', '/observability/quality_trend', '/observability/host_health', '/observability/budget', '/observability/ppt_audit_history', '/observability/api/health_indicator', ] # nginx SPA fallback fingerprint(外部 LAN nginx 的) SPA_SHELL_LEN = 7480 SPA_SHELL_ETAG = 'e167a58a1baf907f55a2925a2e8665d1' def color(s: str, c: str) -> str: if not sys.stdout.isatty(): return s codes = {'red': 31, 'green': 32, 'yellow': 33, 'cyan': 36, 'bold': 1} return f'\x1b[{codes.get(c, 0)}m{s}\x1b[0m' def status(level: str) -> str: return { 'OK': color(' OK ', 'green'), 'WARN': color(' WARN ', 'yellow'), 'FAIL': color(' FAIL ', 'red'), 'INFO': color(' INFO ', 'cyan'), }.get(level, level) # ───────────────────────────────────────────────────────────────────────── # Checks # ───────────────────────────────────────────────────────────────────────── def check_env() -> Tuple[int, int]: """Return (warn_count, fail_count).""" print(color('\n[1/7] 環境變數', 'bold')) warn = fail = 0 for name, (crit, expected, hint) in ENV_SPEC.items(): val = os.getenv(name, '').strip() if not val: print(f' {status(crit)} {name:30} <未設> — {hint}') if crit == 'FAIL': fail += 1 elif crit == 'WARN': warn += 1 continue if expected and val.lower() != expected.lower(): print(f' {status("WARN")} {name:30} ={val!r} 期望={expected!r} — {hint}') warn += 1 else: shown = val[:8] + '...' if 'KEY' in name or 'TOKEN' in name else val print(f' {status("OK")} {name:30} ={shown}') return warn, fail def check_db() -> Tuple[int, int]: print(color('\n[2/7] 資料庫 migrations', 'bold')) warn = fail = 0 db_url = os.getenv('DATABASE_URL', '').strip() if not db_url: print(f' {status("WARN")} DATABASE_URL 未設 — 跳過 DB 檢查') return 1, 0 try: import psycopg2 # type: ignore except ImportError: print(f' {status("WARN")} psycopg2 not installed — 跳過 DB 檢查') return 1, 0 try: conn = psycopg2.connect(db_url, connect_timeout=5) except Exception as e: print(f' {status("FAIL")} DB 無法連線: {type(e).__name__}: {str(e)[:100]}') return 0, 1 try: cur = conn.cursor() for table, mig in REQUIRED_TABLES.items(): cur.execute( 'SELECT 1 FROM information_schema.tables WHERE table_name = %s', (table,), ) if cur.fetchone(): print(f' {status("OK")} {table:25} (migration {mig})') else: print(f' {status("FAIL")} {table:25} 不存在 — 跑 migrations/{mig}_*.sql') fail += 1 # ai_call_budgets seed 檢查 cur.execute('SELECT COUNT(*) FROM ai_call_budgets') n = cur.fetchone()[0] if n < 8: print(f' {status("WARN")} ai_call_budgets 只有 {n} 筆 (期望 ≥8) — migration 025 seed 可能漏掉') warn += 1 else: print(f' {status("OK")} ai_call_budgets {n} 筆 seed') except Exception as e: print(f' {status("FAIL")} 查詢失敗: {e}') fail += 1 finally: conn.close() return warn, fail def check_ollama() -> Tuple[int, int]: print(color('\n[3/7] Ollama 主機(直連 + K8s Nginx Proxy)', 'bold')) warn = fail = 0 try: import urllib.request except ImportError: return 0, 0 healthy = 0 for label, host in OLLAMA_HOSTS: try: with urllib.request.urlopen(f'http://{host}/api/tags', timeout=3) as r: if r.status == 200: print(f' {status("OK")} {label:24} {host}') healthy += 1 continue except Exception as e: print(f' {status("WARN")} {label:24} {host} — {type(e).__name__}') warn += 1 if healthy == 0: print(f' {status("FAIL")} 全部主機 DOWN — v5.0 無法運作') fail += 1 elif healthy < len(OLLAMA_HOSTS): print(f' {status("INFO")} {healthy}/{len(OLLAMA_HOSTS)} healthy — retry 鏈有 fallback,可運作') return warn, fail def check_observability_endpoints() -> Tuple[int, int]: """Phase 56 新增:probe Phase 38-52 觀測台 11 endpoint 正常 prod 應回 302 (login redirect) 或 200 — 7480 byte SPA shell = 失敗""" print(color(f'\n[6/7] Observability 11 endpoint ({PROD_BASE_URL})', 'bold')) warn = fail = 0 try: import urllib.request from urllib.error import HTTPError except ImportError: return 0, 0 for ep in OBSERVABILITY_ENDPOINTS: url = f'{PROD_BASE_URL}{ep}' try: req = urllib.request.Request(url, method='GET') with urllib.request.urlopen(req, timeout=5) as r: size = int(r.headers.get('Content-Length', '0') or 0) etag = (r.headers.get('etag', '') or '').strip('"').lower() if size == SPA_SHELL_LEN or etag == SPA_SHELL_ETAG: print(f' {status("FAIL")} {ep:42} HTTP {r.status} but SPA shell (size={size})') fail += 1 else: print(f' {status("OK")} {ep:42} HTTP {r.status} size={size}') except HTTPError as e: # 302/308 redirect to /login = login_required 正常工作 if e.code in (302, 308, 401, 403): print(f' {status("OK")} {ep:42} HTTP {e.code} (auth redirect = expected)') else: print(f' {status("WARN")} {ep:42} HTTP {e.code}') warn += 1 except Exception as e: print(f' {status("WARN")} {ep:42} {type(e).__name__}: {str(e)[:50]}') warn += 1 return warn, fail def check_cd_pipeline() -> Tuple[int, int]: """Phase 56 新增:查 Gitea API 最近 3 個 CD run 狀態""" print(color('\n[7/7] CD Pipeline (Gitea Actions latest 3 runs)', 'bold')) warn = fail = 0 try: import urllib.request import json except ImportError: return 0, 0 try: url = 'http://192.168.0.110:3001/api/v1/repos/wooo/ewoooc/actions/tasks?limit=3' with urllib.request.urlopen(url, timeout=5) as r: data = json.loads(r.read()) runs = data.get('workflow_runs', [])[:3] if not runs: print(f' {status("WARN")} Gitea API 回應沒 runs') return 1, 0 for run in runs: num = run.get('run_number', '?') sha = (run.get('head_sha') or '')[:8] st = run.get('status', '?') title = (run.get('display_title') or '')[:50] level = 'OK' if st == 'success' else ('WARN' if st in ('running', 'cancelled') else 'FAIL') print(f' {status(level)} run #{num} {sha} {st:>10} | {title}') if level == 'FAIL': fail += 1 elif level == 'WARN': warn += 1 except Exception as e: print(f' {status("WARN")} Gitea 不可達: {type(e).__name__} — 110 可能 down') warn += 1 return warn, fail def check_libreoffice() -> Tuple[int, int]: print(color('\n[4/7] LibreOffice (PPT vision)', 'bold')) bin_path = shutil.which('libreoffice') or shutil.which('soffice') if bin_path: try: ver = subprocess.check_output([bin_path, '--version'], timeout=5, text=True).strip() print(f' {status("OK")} {bin_path} — {ver}') return 0, 0 except Exception: print(f' {status("WARN")} {bin_path} 找到但跑不起來') return 1, 0 if os.getenv('PPT_VISION_ENABLED', '').lower() == 'true': print(f' {status("FAIL")} 未安裝;PPT_VISION_ENABLED=true 但 .pptx→.png 會失敗') print(f' apt install libreoffice # 188 上跑') return 0, 1 print(f' {status("INFO")} 未安裝;PPT_VISION_ENABLED 未開,可忽略') return 0, 0 def check_mcp() -> Tuple[int, int]: print(color('\n[5/7] MCP servers', 'bold')) if os.getenv('MCP_ROUTER_ENABLED', '').lower() != 'true': print(f' {status("INFO")} MCP_ROUTER_ENABLED=false — 跳過') return 0, 0 warn = 0 try: import urllib.request except ImportError: return 0, 0 for var in ('MCP_FIRECRAWL_URL', 'MCP_OMNISEARCH_URL', 'MCP_FILESYSTEM_URL', 'MCP_POSTGRES_URL'): url = os.getenv(var, '').strip() if not url: print(f' {status("WARN")} {var} 未設') warn += 1 continue try: with urllib.request.urlopen(url.rstrip('/') + '/health', timeout=2) as r: print(f' {status("OK")} {var:22} {url}') except Exception as e: print(f' {status("WARN")} {var:22} {url} — {type(e).__name__}') warn += 1 return warn, 0 def main() -> int: print(color('═══ Operation Ollama-First v5.0 Deploy Doctor ═══', 'bold')) print(f' cwd: {os.getcwd()}') print(f' host: {os.uname().nodename}') total_warn = total_fail = 0 for fn in (check_env, check_db, check_ollama, check_libreoffice, check_mcp, check_observability_endpoints, check_cd_pipeline): try: w, f = fn() total_warn += w total_fail += f except Exception as e: print(f' {status("FAIL")} {fn.__name__} 自身爆炸: {type(e).__name__}: {e}') total_fail += 1 print(color('\n═══ 總結 ═══', 'bold')) print(f' WARN: {total_warn}') print(f' FAIL: {total_fail}') if total_fail: print(color(' ❌ 有 FAIL — v5.0 部署未完成', 'red')) return 2 if total_warn: print(color(' ⚠️ 有 WARN — 可運作但部分 feature 未啟用', 'yellow')) return 1 print(color(' ✅ 全綠 — v5.0 部署完整', 'green')) return 0 if __name__ == '__main__': sys.exit(main())