Files
ewoooc/scripts/deploy_doctor_v5.py
OoO ba5fe06b13
Some checks failed
CD Pipeline / deploy (push) Has been cancelled
fix: update ollama primary host
2026-06-18 14:24:55 +08:00

338 lines
13 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""Operation Ollama-First v5.0 deploy doctor.
在 188 / 本機跑python3 scripts/deploy_doctor_v5.py
檢查 v5.0 部署狀態,列出統帥手動還沒做的事。
退出碼0=全綠1=有 WARN2=有 FAIL。
"""
from __future__ import annotations
import os
import shutil
import subprocess
import sys
from typing import Tuple
# ─────────────────────────────────────────────────────────────────────────
# v5.0 env vars: name → (criticality, expected_value_or_None, hint)
#
# Consumed by check_env(): an unset variable is reported at the listed
# criticality (only FAIL and WARN are counted); a set variable whose value
# differs case-insensitively from a non-None expected value is reported and
# counted as WARN. The hint text is printed next to the finding.
# ─────────────────────────────────────────────────────────────────────────
ENV_SPEC = {
    # API keys (FAIL if missing — v5.0 breaks outright without these)
    'ANTHROPIC_API_KEY': ('FAIL', None, 'Claude SDK 用;需 https://console.anthropic.com 申請'),
    'GOOGLE_API_KEY': ('FAIL', None, 'Gemini fallback 用;舊有'),
    'DEEPSEEK_API_KEY': ('WARN', None, '若 DEEPSEEK_DIRECT_ENABLED=true 需要'),
    'TAVILY_API_KEY': ('WARN', None, 'MCP omnisearch 用;可選'),
    'EXA_API_KEY': ('WARN', None, 'MCP omnisearch 用;可選'),
    'TELEGRAM_ADMIN_CHAT_ID': ('WARN', None, 'Phase 28 PromotionGate Telegram 推播 audience'),
    'TELEGRAM_BOT_TOKEN': ('FAIL', None, 'Telegram 推播必備'),
    # Feature flags (must be ON to take effect)
    'MODEL_ROUTER_ENABLED': ('WARN', 'true', 'caller × context 動態路由(預設 true'),
    'COST_THROTTLE_ENABLED': ('WARN', 'true', '成本超 110% 自動 throttle預設 true'),
    'MCP_ROUTER_ENABLED': ('WARN', 'true', 'MCP 4-server 統一路由(預設 false'),
    'PPT_VISION_ENABLED': ('WARN', 'true', 'PPT minicpm-v 視覺審核(預設 false'),
    'DEEPSEEK_DIRECT_ENABLED': ('INFO', None, '直連 DeepSeek API省 NIM 中間層'),
    # RAG / embedding
    'RAG_EMBED_MODEL': ('INFO', 'bge-m3', 'embedding 模型ADR-026 鎖定 bge-m3'),
    'RAG_EMBED_DIM': ('INFO', '1024', 'bge-m3 維度migration 027 已寫死 1024'),
}
# Required migrations (including the 029/030 persistence tables added in P38).
# Maps table name → number of the migration that creates it; check_db() looks
# each table up in information_schema.tables and, when it is absent, points
# the operator at migrations/<number>_*.sql.
REQUIRED_TABLES = {
    'ai_calls': '024',
    'mcp_calls': '025',
    'ai_call_budgets': '025',
    'rag_query_log': '027',
    'learning_episodes': '028',
    'ollama_host_history': '029',
    'ppt_audit_history_db': '030',
}
# Ollama hosts (dual track: direct connections + the P53 K8s Nginx proxy).
# Each entry is (display label, 'host:port'); check_ollama() probes
# http://<host:port>/api/tags on every entry.
OLLAMA_HOSTS = [
    ('Primary GCP (direct)', '34.87.90.216:11434'),
    ('Secondary GCP (direct)', '34.21.145.224:11434'),
    ('GCP-A via Nginx 110', '192.168.0.110:11435'),
    ('GCP-B via Nginx 110', '192.168.0.110:11436'),
    ('111 Mac', '192.168.0.111:11434'),
]
# Phase 38-52 observability console endpoints (prod smoke test).
# The base URL can be overridden through the PROD_BASE_URL environment
# variable; trailing slashes are stripped so that it can be concatenated
# directly with the paths below, which all start with '/'.
PROD_BASE_URL = os.getenv('PROD_BASE_URL', 'https://mo.wooo.work').rstrip('/')
OBSERVABILITY_ENDPOINTS = [
    '/observability/overview',
    '/observability/rag_queries',
    '/observability/business_intel',
    '/observability/agent_orchestration',
    '/observability/ai_calls',
    '/observability/promotion_review',
    '/observability/quality_trend',
    '/observability/host_health',
    '/observability/budget',
    '/observability/ppt_audit_history',
    '/observability/api/health_indicator',
]
# nginx SPA fallback fingerprint (the one served by the external LAN nginx).
# check_observability_endpoints() treats a response as the SPA shell — and
# therefore as a FAIL — when its Content-Length equals SPA_SHELL_LEN or its
# ETag (quotes stripped, lower-cased) equals SPA_SHELL_ETAG.
SPA_SHELL_LEN = 7480
SPA_SHELL_ETAG = 'e167a58a1baf907f55a2925a2e8665d1'
def color(s: str, c: str) -> str:
    """Wrap *s* in the ANSI SGR escape sequence for the style named *c*.

    The text is returned unchanged when stdout is not a terminal, so piped or
    redirected output carries no escape codes. A style name that is not one
    of red / green / yellow / cyan / bold maps to SGR code 0.
    """
    if sys.stdout.isatty():
        sgr = {'red': 31, 'green': 32, 'yellow': 33, 'cyan': 36, 'bold': 1}.get(c, 0)
        return f'\x1b[{sgr}m{s}\x1b[0m'
    return s
def status(level: str) -> str:
    """Return the coloured badge for a severity level.

    OK, WARN, FAIL and INFO each map to a space-padded label rendered through
    color(); any other value is returned unchanged.
    """
    if level == 'OK':
        return color(' OK ', 'green')
    if level == 'WARN':
        return color(' WARN ', 'yellow')
    if level == 'FAIL':
        return color(' FAIL ', 'red')
    if level == 'INFO':
        return color(' INFO ', 'cyan')
    return level
# ─────────────────────────────────────────────────────────────────────────
# Checks
# ─────────────────────────────────────────────────────────────────────────
def check_env() -> Tuple[int, int]:
    """Check every environment variable listed in ENV_SPEC.

    An unset (or whitespace-only) variable is printed at the criticality given
    by its spec and counted when that criticality is FAIL or WARN. A set
    variable whose value differs case-insensitively from a non-empty expected
    value is printed and counted as WARN. Otherwise the value is printed as
    OK; for names containing KEY or TOKEN only the first 8 characters are
    shown.

    Returns:
        (warn_count, fail_count)
    """
    print(color('\n[1/7] 環境變數', 'bold'))
    warnings = failures = 0
    for var_name, spec in ENV_SPEC.items():
        severity, wanted, note = spec
        current = os.getenv(var_name, '').strip()

        if current:
            mismatch = bool(wanted) and current.lower() != wanted.lower()
            if mismatch:
                print(f' {status("WARN")} {var_name:30} ={current!r} 期望={wanted!r}{note}')
                warnings += 1
            else:
                # Never echo a full credential to the terminal.
                looks_secret = 'KEY' in var_name or 'TOKEN' in var_name
                display = current[:8] + '...' if looks_secret else current
                print(f' {status("OK")} {var_name:30} ={display}')
            continue

        print(f' {status(severity)} {var_name:30} <未設> — {note}')
        if severity == 'FAIL':
            failures += 1
        elif severity == 'WARN':
            warnings += 1
    return warnings, failures
def check_db() -> Tuple[int, int]:
    """Verify that the required tables exist and that ai_call_budgets is seeded.

    The check is skipped with a single WARN when DATABASE_URL is unset or
    psycopg2 cannot be imported, and ends with a single FAIL when the
    connection cannot be opened. Otherwise every table in REQUIRED_TABLES is
    looked up in information_schema.tables (missing → FAIL), and
    ai_call_budgets is expected to hold at least 8 rows (fewer → WARN).

    The seed-row check only runs when ai_call_budgets was not already reported
    missing: counting rows in a non-existent table raises, which would report
    the same missing table a second time as a generic query failure.

    Returns:
        (warn_count, fail_count)
    """
    print(color('\n[2/7] 資料庫 migrations', 'bold'))
    warn = fail = 0
    db_url = os.getenv('DATABASE_URL', '').strip()
    if not db_url:
        print(f' {status("WARN")} DATABASE_URL 未設 — 跳過 DB 檢查')
        return 1, 0
    try:
        import psycopg2  # type: ignore
    except ImportError:
        print(f' {status("WARN")} psycopg2 not installed — 跳過 DB 檢查')
        return 1, 0
    try:
        conn = psycopg2.connect(db_url, connect_timeout=5)
    except Exception as e:
        print(f' {status("FAIL")} DB 無法連線: {type(e).__name__}: {str(e)[:100]}')
        return 0, 1
    try:
        cur = conn.cursor()
        missing = set()
        for table, mig in REQUIRED_TABLES.items():
            cur.execute(
                'SELECT 1 FROM information_schema.tables WHERE table_name = %s',
                (table,),
            )
            if cur.fetchone():
                print(f' {status("OK")} {table:25} (migration {mig})')
            else:
                print(f' {status("FAIL")} {table:25} 不存在 — 跑 migrations/{mig}_*.sql')
                fail += 1
                missing.add(table)
        # ai_call_budgets seed check — pointless (and an error) if the table
        # itself is absent, which has already been counted as a FAIL above.
        if 'ai_call_budgets' not in missing:
            cur.execute('SELECT COUNT(*) FROM ai_call_budgets')
            n = cur.fetchone()[0]
            if n < 8:
                print(f' {status("WARN")} ai_call_budgets 只有 {n} 筆 (期望 ≥8) — migration 025 seed 可能漏掉')
                warn += 1
            else:
                print(f' {status("OK")} ai_call_budgets {n} 筆 seed')
    except Exception as e:
        print(f' {status("FAIL")} 查詢失敗: {e}')
        fail += 1
    finally:
        conn.close()
    return warn, fail
def check_ollama() -> Tuple[int, int]:
    """Probe /api/tags on every host in OLLAMA_HOSTS.

    A host is healthy when the request returns HTTP 200. A request that raises
    (connection error, timeout, HTTP error) is a WARN, and so is a response
    with any other status code — previously such a response produced no output
    and was not counted at all. If no host is healthy one FAIL is added; if
    only some are, an INFO line notes the partial availability.

    Returns:
        (warn_count, fail_count)
    """
    print(color('\n[3/7] Ollama 主機(直連 + K8s Nginx Proxy', 'bold'))
    warn = fail = 0
    try:
        import urllib.request
    except ImportError:
        return 0, 0
    healthy = 0
    for label, host in OLLAMA_HOSTS:
        try:
            with urllib.request.urlopen(f'http://{host}/api/tags', timeout=3) as r:
                http_status = r.status
        except Exception as e:
            # Separator added so the host and the exception name do not run together.
            print(f' {status("WARN")} {label:24} {host} — {type(e).__name__}')
            warn += 1
            continue
        if http_status == 200:
            print(f' {status("OK")} {label:24} {host}')
            healthy += 1
        else:
            # Reachable but not answering 200: report it instead of silently
            # treating the host as unhealthy.
            print(f' {status("WARN")} {label:24} {host} — HTTP {http_status}')
            warn += 1
    if healthy == 0:
        print(f' {status("FAIL")} 全部主機 DOWN — v5.0 無法運作')
        fail += 1
    elif healthy < len(OLLAMA_HOSTS):
        print(f' {status("INFO")} {healthy}/{len(OLLAMA_HOSTS)} healthy — retry 鏈有 fallback可運作')
    return warn, fail
def check_observability_endpoints() -> Tuple[int, int]:
    """Smoke-test every path in OBSERVABILITY_ENDPOINTS under PROD_BASE_URL.

    Added in Phase 56. For a response that urlopen returns normally, the
    Content-Length and ETag headers are compared with the SPA-shell
    fingerprint: a match on either is a FAIL (the request fell through to the
    nginx SPA fallback), anything else is OK. An HTTPError with code 302, 308,
    401 or 403 is taken to mean the login guard is working and is OK; any
    other HTTPError, and any other exception, is a WARN.

    Returns:
        (warn_count, fail_count)
    """
    print(color(f'\n[6/7] Observability 11 endpoint ({PROD_BASE_URL})', 'bold'))
    warnings = failures = 0
    try:
        import urllib.request
        from urllib.error import HTTPError
    except ImportError:
        return 0, 0

    # Codes that indicate login_required intercepted the request.
    auth_codes = (302, 308, 401, 403)

    for path in OBSERVABILITY_ENDPOINTS:
        target = f'{PROD_BASE_URL}{path}'
        try:
            request = urllib.request.Request(target, method='GET')
            with urllib.request.urlopen(request, timeout=5) as resp:
                body_len = int(resp.headers.get('Content-Length', '0') or 0)
                tag = (resp.headers.get('etag', '') or '').strip('"').lower()
                is_spa_shell = body_len == SPA_SHELL_LEN or tag == SPA_SHELL_ETAG
                if not is_spa_shell:
                    print(f' {status("OK")} {path:42} HTTP {resp.status} size={body_len}')
                else:
                    print(f' {status("FAIL")} {path:42} HTTP {resp.status} but SPA shell (size={body_len})')
                    failures += 1
        except HTTPError as err:
            if err.code not in auth_codes:
                print(f' {status("WARN")} {path:42} HTTP {err.code}')
                warnings += 1
            else:
                print(f' {status("OK")} {path:42} HTTP {err.code} (auth redirect = expected)')
        except Exception as err:
            print(f' {status("WARN")} {path:42} {type(err).__name__}: {str(err)[:50]}')
            warnings += 1
    return warnings, failures
def check_cd_pipeline() -> Tuple[int, int]:
    """Report the status of the three most recent CD runs from the Gitea API.

    Added in Phase 56. Each run is classified from its ``status`` field:
    'success' is OK, 'running' and 'cancelled' are WARN, anything else is
    FAIL. An empty or missing run list is a single WARN, and so is any error
    while fetching or parsing the response.

    Null fields are normalised before formatting. A null ``status`` would
    otherwise make the ``{st:>10}`` format spec raise TypeError, and a null
    ``workflow_runs`` would fail on slicing; both were then reported by the
    outer handler as "Gitea unreachable" and aborted the remaining runs.

    Returns:
        (warn_count, fail_count)
    """
    print(color('\n[7/7] CD Pipeline (Gitea Actions latest 3 runs)', 'bold'))
    warn = fail = 0
    try:
        import urllib.request
        import json
    except ImportError:
        return 0, 0
    try:
        url = 'http://192.168.0.110:3001/api/v1/repos/wooo/ewoooc/actions/tasks?limit=3'
        with urllib.request.urlopen(url, timeout=5) as r:
            data = json.loads(r.read())
        # `or []` also covers an explicit null, not just a missing key.
        runs = (data.get('workflow_runs') or [])[:3]
        if not runs:
            print(f' {status("WARN")} Gitea API 回應沒 runs')
            return 1, 0
        for run in runs:
            num = run.get('run_number', '?')
            sha = (run.get('head_sha') or '')[:8]
            # Same null-guard as head_sha / display_title; str() keeps the
            # '>10' alignment spec valid for any value type.
            st = str(run.get('status') or '?')
            title = (run.get('display_title') or '')[:50]
            level = 'OK' if st == 'success' else ('WARN' if st in ('running', 'cancelled') else 'FAIL')
            print(f' {status(level)} run #{num} {sha} {st:>10} | {title}')
            if level == 'FAIL':
                fail += 1
            elif level == 'WARN':
                warn += 1
    except Exception as e:
        print(f' {status("WARN")} Gitea 不可達: {type(e).__name__} — 110 可能 down')
        warn += 1
    return warn, fail
def check_libreoffice() -> Tuple[int, int]:
    """Check that a LibreOffice binary is on PATH and can report its version.

    Looks for ``libreoffice`` first, then ``soffice``. When a binary is found,
    a successful ``--version`` call is OK and any failure of that call is a
    WARN. When none is found the result depends on PPT_VISION_ENABLED: FAIL
    (with an install hint) if it is 'true', otherwise an INFO line only.

    Returns:
        (warn_count, fail_count)
    """
    print(color('\n[4/7] LibreOffice (PPT vision)', 'bold'))
    executable = shutil.which('libreoffice') or shutil.which('soffice')

    if not executable:
        vision_on = os.getenv('PPT_VISION_ENABLED', '').lower() == 'true'
        if not vision_on:
            print(f' {status("INFO")} 未安裝PPT_VISION_ENABLED 未開,可忽略')
            return 0, 0
        print(f' {status("FAIL")} 未安裝PPT_VISION_ENABLED=true 但 .pptx→.png 會失敗')
        print(f' apt install libreoffice # 188 上跑')
        return 0, 1

    try:
        raw = subprocess.check_output([executable, '--version'], timeout=5, text=True)
        version = raw.strip()
        print(f' {status("OK")} {executable}{version}')
        return 0, 0
    except Exception:
        print(f' {status("WARN")} {executable} 找到但跑不起來')
        return 1, 0
def check_mcp() -> Tuple[int, int]:
    """Probe the /health endpoint of each configured MCP server.

    Skipped with an INFO line unless MCP_ROUTER_ENABLED is 'true'. For each of
    the four MCP_*_URL variables, an unset value is a WARN, and so is any
    exception raised while requesting ``<url>/health``; a request that
    completes without raising is OK. This check never produces a FAIL.

    Returns:
        (warn_count, 0)
    """
    print(color('\n[5/7] MCP servers', 'bold'))
    router_on = os.getenv('MCP_ROUTER_ENABLED', '').lower() == 'true'
    if not router_on:
        print(f' {status("INFO")} MCP_ROUTER_ENABLED=false — 跳過')
        return 0, 0

    problems = 0
    try:
        import urllib.request
    except ImportError:
        return 0, 0

    url_vars = ('MCP_FIRECRAWL_URL', 'MCP_OMNISEARCH_URL', 'MCP_FILESYSTEM_URL', 'MCP_POSTGRES_URL')
    for env_name in url_vars:
        base = os.getenv(env_name, '').strip()
        if base:
            probe = base.rstrip('/') + '/health'
            try:
                with urllib.request.urlopen(probe, timeout=2):
                    print(f' {status("OK")} {env_name:22} {base}')
            except Exception as err:
                print(f' {status("WARN")} {env_name:22} {base}{type(err).__name__}')
                problems += 1
        else:
            print(f' {status("WARN")} {env_name} 未設')
            problems += 1
    return problems, 0
def main() -> int:
    """Run every check in order, print a summary and return the exit code.

    Each check returns (warn_count, fail_count); the counts are summed. A
    check that raises is itself reported and counted as one FAIL so that the
    remaining checks still run.

    The host name is read with platform.node() rather than os.uname(), which
    does not exist on Windows and would abort the script before any check ran.

    Returns:
        2 if any FAIL was counted, 1 if only WARNs were counted, 0 otherwise.
    """
    import platform

    print(color('═══ Operation Ollama-First v5.0 Deploy Doctor ═══', 'bold'))
    print(f' cwd: {os.getcwd()}')
    print(f' host: {platform.node()}')
    total_warn = total_fail = 0
    for fn in (check_env, check_db, check_ollama, check_libreoffice, check_mcp,
               check_observability_endpoints, check_cd_pipeline):
        try:
            w, f = fn()
            total_warn += w
            total_fail += f
        except Exception as e:
            print(f' {status("FAIL")} {fn.__name__} 自身爆炸: {type(e).__name__}: {e}')
            total_fail += 1
    print(color('\n═══ 總結 ═══', 'bold'))
    print(f' WARN: {total_warn}')
    print(f' FAIL: {total_fail}')
    if total_fail:
        print(color(' ❌ 有 FAIL — v5.0 部署未完成', 'red'))
        return 2
    if total_warn:
        print(color(' ⚠️ 有 WARN — 可運作但部分 feature 未啟用', 'yellow'))
        return 1
    print(color(' ✅ 全綠 — v5.0 部署完整', 'green'))
    return 0
# Script entry point: the process exit status is whatever main() returns.
if __name__ == '__main__':
    raise SystemExit(main())