5 階段 → 7 階段: [3/7] Ollama 主機(從 3 → 5 機) + 192.168.0.110:11435 (P53 K8s Nginx Proxy GCP-A) + 192.168.0.110:11436 (P53 K8s Nginx Proxy GCP-B) [6/7] Observability 11 endpoint (新) 全 prod smoke:mo.wooo.work/observability/* + api/health_indicator SPA shell fingerprint 偵測(size=7480 / etag e167a58a... = FAIL) 302/308/401/403 (auth redirect) 視為 OK = login_required 正常工作 PROD_BASE_URL env 可覆寫測 staging [7/7] CD Pipeline (新) Gitea API 撈最近 3 個 run,狀態映射 OK/WARN/FAIL 110 不可達 → 自動 WARN(不阻 deploy doctor exit code) DB migrations 表清單 + 029 ollama_host_history / 030 ppt_audit_history_db。 本機跑實證:11 endpoint 全綠,Gitea 110 down 正確 WARN。
338 lines
13 KiB
Python
Executable File
338 lines
13 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
"""Operation Ollama-First v5.0 deploy doctor.
|
||
|
||
在 188 / 本機跑:python3 scripts/deploy_doctor_v5.py
|
||
檢查 v5.0 部署狀態,列出統帥手動還沒做的事。
|
||
退出碼:0=全綠,1=有 WARN,2=有 FAIL。
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import os
|
||
import shutil
|
||
import subprocess
|
||
import sys
|
||
from typing import Tuple
|
||
|
||
# ---------------------------------------------------------------------------
# v5.0 environment variables.
# Maps each variable name to (criticality, expected value or None, hint).
# check_env() reads criticality 'FAIL' / 'WARN' to decide which counter a
# missing variable bumps; any other criticality is only displayed.
# ---------------------------------------------------------------------------
ENV_SPEC = {
    # API keys and credentials (criticality 'FAIL' = counted as a failure
    # when the variable is unset).
    'ANTHROPIC_API_KEY': (
        'FAIL', None, 'Claude SDK 用;需 https://console.anthropic.com 申請',
    ),
    'GOOGLE_API_KEY': (
        'FAIL', None, 'Gemini fallback 用;舊有',
    ),
    'DEEPSEEK_API_KEY': (
        'WARN', None, '若 DEEPSEEK_DIRECT_ENABLED=true 需要',
    ),
    'TAVILY_API_KEY': (
        'WARN', None, 'MCP omnisearch 用;可選',
    ),
    'EXA_API_KEY': (
        'WARN', None, 'MCP omnisearch 用;可選',
    ),
    'TELEGRAM_ADMIN_CHAT_ID': (
        'WARN', None, 'Phase 28 PromotionGate Telegram 推播 audience',
    ),
    'TELEGRAM_BOT_TOKEN': (
        'FAIL', None, 'Telegram 推播必備',
    ),

    # Feature flags (expected to be switched on to take effect).
    'MODEL_ROUTER_ENABLED': (
        'WARN', 'true', 'caller × context 動態路由(預設 true)',
    ),
    'COST_THROTTLE_ENABLED': (
        'WARN', 'true', '成本超 110% 自動 throttle(預設 true)',
    ),
    'MCP_ROUTER_ENABLED': (
        'WARN', 'true', 'MCP 4-server 統一路由(預設 false)',
    ),
    'PPT_VISION_ENABLED': (
        'WARN', 'true', 'PPT minicpm-v 視覺審核(預設 false)',
    ),
    'DEEPSEEK_DIRECT_ENABLED': (
        'INFO', None, '直連 DeepSeek API;省 NIM 中間層',
    ),

    # RAG / embedding settings.
    'RAG_EMBED_MODEL': (
        'INFO', 'bge-m3', 'embedding 模型;ADR-026 鎖定 bge-m3',
    ),
    'RAG_EMBED_DIM': (
        'INFO', '1024', 'bge-m3 維度;migration 027 已寫死 1024',
    ),
}
|
||
|
||
# Tables that must exist after running migrations, mapped to the migration
# number that creates each one (includes the P38 persistence tables 029/030).
REQUIRED_TABLES = dict(
    ai_calls='024',
    mcp_calls='025',
    ai_call_budgets='025',
    rag_query_log='027',
    learning_episodes='028',
    ollama_host_history='029',
    ppt_audit_history_db='030',
)
|
||
|
||
# Ollama hosts as (display label, host:port) pairs: the direct GCP addresses
# plus the same backends reached through the P53 K8s Nginx proxy on .110.
OLLAMA_HOSTS = [
    (
        'Primary GCP (direct)',
        '34.143.170.20:11434',
    ),
    (
        'Secondary GCP (direct)',
        '34.21.145.224:11434',
    ),
    (
        'GCP-A via Nginx 110',
        '192.168.0.110:11435',
    ),
    (
        'GCP-B via Nginx 110',
        '192.168.0.110:11436',
    ),
    (
        '111 Mac',
        '192.168.0.111:11434',
    ),
]
|
||
|
||
# Phase 38-52 observability console endpoints (production smoke test).
# The base URL can be overridden through the PROD_BASE_URL environment
# variable; any trailing slash is removed so paths can be appended directly.
PROD_BASE_URL = os.getenv('PROD_BASE_URL', 'https://mo.wooo.work').rstrip('/')

OBSERVABILITY_ENDPOINTS = [
    f'/observability/{page}'
    for page in (
        'overview',
        'rag_queries',
        'business_intel',
        'agent_orchestration',
        'ai_calls',
        'promotion_review',
        'quality_trend',
        'host_health',
        'budget',
        'ppt_audit_history',
        'api/health_indicator',
    )
]

# Fingerprint of the nginx SPA fallback page (served by the external LAN
# nginx): body length in bytes and ETag value.
SPA_SHELL_LEN = 7480
SPA_SHELL_ETAG = 'e167a58a1baf907f55a2925a2e8665d1'
|
||
|
||
|
||
def color(s: str, c: str) -> str:
    """Wrap ``s`` in an ANSI SGR escape sequence when stdout is a terminal.

    Args:
        s: Text to colorize.
        c: Style name: 'red', 'green', 'yellow', 'cyan' or 'bold'. Any other
            name maps to SGR code 0.

    Returns:
        ``s`` unchanged when stdout is not a TTY, otherwise ``s`` surrounded
        by the escape sequence for ``c`` and a trailing reset.
    """
    if sys.stdout.isatty():
        sgr = {'red': 31, 'green': 32, 'yellow': 33, 'cyan': 36, 'bold': 1}.get(c, 0)
        return f'\x1b[{sgr}m{s}\x1b[0m'
    return s
|
||
|
||
|
||
def status(level: str) -> str:
    """Return the colored badge for a check result level.

    Args:
        level: One of 'OK', 'WARN', 'FAIL' or 'INFO'.

    Returns:
        The space-padded label passed through ``color``; an unrecognized
        ``level`` is returned unchanged.
    """
    badges = {
        'OK': (' OK ', 'green'),
        'WARN': (' WARN ', 'yellow'),
        'FAIL': (' FAIL ', 'red'),
        'INFO': (' INFO ', 'cyan'),
    }
    if level not in badges:
        return level
    text, tint = badges[level]
    return color(text, tint)
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────
|
||
# Checks
|
||
# ─────────────────────────────────────────────────────────────────────────
|
||
|
||
def _mask_secret(value: str) -> str:
    """Return a display-safe prefix of a secret value.

    Reveals at most the first 8 characters and never more than a quarter of
    the value, so that a short token is not printed (almost) in full.
    """
    visible = min(8, len(value) // 4)
    return value[:visible] + '...'


def check_env() -> Tuple[int, int]:
    """Check every variable in ENV_SPEC and print one result line for each.

    A variable that is unset (or blank) is reported at its own criticality
    and counted when that criticality is 'FAIL' or 'WARN'. A variable that is
    set but differs (case-insensitively) from its expected value is always
    reported and counted as a WARN.

    Returns:
        (warn_count, fail_count).
    """
    print(color('\n[1/7] 環境變數', 'bold'))
    warn = fail = 0
    for name, (crit, expected, hint) in ENV_SPEC.items():
        val = os.getenv(name, '').strip()
        if not val:
            print(f' {status(crit)} {name:30} <未設> — {hint}')
            if crit == 'FAIL':
                fail += 1
            elif crit == 'WARN':
                warn += 1
            continue
        if expected and val.lower() != expected.lower():
            print(f' {status("WARN")} {name:30} ={val!r} 期望={expected!r} — {hint}')
            warn += 1
            continue
        # Names containing KEY/TOKEN hold credentials: show only a bounded
        # prefix so this output can be pasted into tickets or chat safely.
        is_secret = 'KEY' in name or 'TOKEN' in name
        shown = _mask_secret(val) if is_secret else val
        print(f' {status("OK")} {name:30} ={shown}')
    return warn, fail
|
||
|
||
|
||
def check_db() -> Tuple[int, int]:
    """Verify that the required migration tables exist and budgets are seeded.

    Connects to the database named by the DATABASE_URL environment variable
    using psycopg2. The check is skipped with one WARN when DATABASE_URL is
    unset or psycopg2 is not installed, and reports one FAIL when the
    connection cannot be opened. Otherwise every table in REQUIRED_TABLES is
    looked up in information_schema.tables (one FAIL per missing table), and
    ai_call_budgets is expected to contain at least 8 rows (WARN otherwise).

    Returns:
        (warn_count, fail_count).
    """
    print(color('\n[2/7] 資料庫 migrations', 'bold'))
    warn = fail = 0
    db_url = os.getenv('DATABASE_URL', '').strip()
    if not db_url:
        print(f' {status("WARN")} DATABASE_URL 未設 — 跳過 DB 檢查')
        return 1, 0
    try:
        import psycopg2  # type: ignore
    except ImportError:
        print(f' {status("WARN")} psycopg2 not installed — 跳過 DB 檢查')
        return 1, 0
    try:
        conn = psycopg2.connect(db_url, connect_timeout=5)
    except Exception as e:
        print(f' {status("FAIL")} DB 無法連線: {type(e).__name__}: {str(e)[:100]}')
        return 0, 1
    try:
        cur = conn.cursor()
        missing = set()
        for table, mig in REQUIRED_TABLES.items():
            cur.execute(
                'SELECT 1 FROM information_schema.tables WHERE table_name = %s',
                (table,),
            )
            if cur.fetchone():
                print(f' {status("OK")} {table:25} (migration {mig})')
            else:
                print(f' {status("FAIL")} {table:25} 不存在 — 跑 migrations/{mig}_*.sql')
                fail += 1
                missing.add(table)
        # Seed check for ai_call_budgets. Only meaningful when the table
        # exists: counting rows of a missing table would raise and report the
        # same problem a second time as a generic query failure.
        if 'ai_call_budgets' in missing:
            print(f' {status("INFO")} ai_call_budgets 不存在 — 跳過 seed 檢查')
        else:
            cur.execute('SELECT COUNT(*) FROM ai_call_budgets')
            n = cur.fetchone()[0]
            if n < 8:
                print(f' {status("WARN")} ai_call_budgets 只有 {n} 筆 (期望 ≥8) — migration 025 seed 可能漏掉')
                warn += 1
            else:
                print(f' {status("OK")} ai_call_budgets {n} 筆 seed')
    except Exception as e:
        print(f' {status("FAIL")} 查詢失敗: {e}')
        fail += 1
    finally:
        conn.close()
    return warn, fail
|
||
|
||
|
||
def check_ollama() -> Tuple[int, int]:
    """Probe ``/api/tags`` on every host in OLLAMA_HOSTS.

    Each host that answers HTTP 200 within 3 seconds counts as healthy. Any
    other outcome (exception or a non-200 status) is printed and counted as a
    WARN. If no host at all is healthy, one FAIL is added; if only some are
    healthy an INFO line is printed.

    Returns:
        (warn_count, fail_count).
    """
    import urllib.request

    print(color('\n[3/7] Ollama 主機(直連 + K8s Nginx Proxy)', 'bold'))
    warn = fail = 0
    healthy = 0
    for label, host in OLLAMA_HOSTS:
        try:
            with urllib.request.urlopen(f'http://{host}/api/tags', timeout=3) as r:
                code = r.status
        except Exception as e:
            # Best-effort probe: any failure just marks this host as down.
            print(f' {status("WARN")} {label:24} {host} — {type(e).__name__}')
            warn += 1
            continue
        if code == 200:
            print(f' {status("OK")} {label:24} {host}')
            healthy += 1
        else:
            # A response that is not 200 but did not raise used to vanish
            # without any output line; report it so every host is accounted for.
            print(f' {status("WARN")} {label:24} {host} — HTTP {code}')
            warn += 1
    if healthy == 0:
        print(f' {status("FAIL")} 全部主機 DOWN — v5.0 無法運作')
        fail += 1
    elif healthy < len(OLLAMA_HOSTS):
        print(f' {status("INFO")} {healthy}/{len(OLLAMA_HOSTS)} healthy — retry 鏈有 fallback,可運作')
    return warn, fail
|
||
|
||
|
||
def check_observability_endpoints() -> Tuple[int, int]:
    """Smoke-test the Phase 38-52 observability endpoints (added in Phase 56).

    Each path in OBSERVABILITY_ENDPOINTS is requested under PROD_BASE_URL.
    A response whose size equals SPA_SHELL_LEN or whose ETag equals
    SPA_SHELL_ETAG is treated as the nginx SPA fallback page and counted as a
    FAIL. HTTP errors 302/308/401/403 are treated as the login guard working
    and reported OK; any other HTTP error or exception is a WARN.

    Note: urlopen's default opener follows redirects it supports, so a login
    redirect usually surfaces here as the final (followed) response rather
    than as an HTTPError.

    Returns:
        (warn_count, fail_count).
    """
    import urllib.request
    from urllib.error import HTTPError

    print(color(f'\n[6/7] Observability 11 endpoint ({PROD_BASE_URL})', 'bold'))
    warn = fail = 0
    for ep in OBSERVABILITY_ENDPOINTS:
        url = f'{PROD_BASE_URL}{ep}'
        try:
            req = urllib.request.Request(url, method='GET')
            with urllib.request.urlopen(req, timeout=5) as r:
                declared = (r.headers.get('Content-Length') or '').strip()
                if declared.isdigit():
                    size = int(declared)
                else:
                    # No usable Content-Length (e.g. chunked transfer):
                    # measure the body so the size fingerprint still applies.
                    size = len(r.read())
                etag = (r.headers.get('etag', '') or '').strip().lower()
                if etag.startswith('w/'):
                    # Weak validator prefix; compare the opaque tag only.
                    etag = etag[2:]
                etag = etag.strip('"')
                if size == SPA_SHELL_LEN or etag == SPA_SHELL_ETAG:
                    print(f' {status("FAIL")} {ep:42} HTTP {r.status} but SPA shell (size={size})')
                    fail += 1
                else:
                    print(f' {status("OK")} {ep:42} HTTP {r.status} size={size}')
        except HTTPError as e:
            # Redirect to /login or an auth rejection means login_required
            # is doing its job.
            if e.code in (302, 308, 401, 403):
                print(f' {status("OK")} {ep:42} HTTP {e.code} (auth redirect = expected)')
            else:
                print(f' {status("WARN")} {ep:42} HTTP {e.code}')
                warn += 1
        except Exception as e:
            print(f' {status("WARN")} {ep:42} {type(e).__name__}: {str(e)[:50]}')
            warn += 1
    return warn, fail
|
||
|
||
|
||
def check_cd_pipeline() -> Tuple[int, int]:
    """Report the status of the latest 3 Gitea Actions runs (added in Phase 56).

    Status mapping: 'success' is OK; runs that are still in flight or ended
    without a verdict ('running', 'waiting', 'blocked', 'cancelled',
    'skipped') are WARN; everything else is FAIL. An unreachable Gitea or an
    unusable response is a single WARN so that an outage of the CI host does
    not by itself turn the doctor's exit code into a failure.

    Returns:
        (warn_count, fail_count).
    """
    import json
    import urllib.request

    print(color('\n[7/7] CD Pipeline (Gitea Actions latest 3 runs)', 'bold'))
    warn = fail = 0
    url = 'http://192.168.0.110:3001/api/v1/repos/wooo/ewoooc/actions/tasks?limit=3'
    try:
        with urllib.request.urlopen(url, timeout=5) as r:
            raw = r.read()
    except Exception as e:
        print(f' {status("WARN")} Gitea 不可達: {type(e).__name__} — 110 可能 down')
        return 1, 0
    # Parse separately from the network call so that a malformed payload is
    # not misreported as the host being unreachable.
    try:
        data = json.loads(raw)
    except ValueError as e:
        print(f' {status("WARN")} Gitea API 回應不是合法 JSON: {type(e).__name__}')
        return 1, 0
    runs = data.get('workflow_runs') if isinstance(data, dict) else None
    if not isinstance(runs, list) or not runs:
        print(f' {status("WARN")} Gitea API 回應沒 runs')
        return 1, 0
    pending_or_neutral = ('running', 'waiting', 'blocked', 'cancelled', 'skipped')
    for run in runs[:3]:
        if not isinstance(run, dict):
            run = {}
        num = run.get('run_number', '?')
        sha = str(run.get('head_sha') or '')[:8]
        # Coerce to str: a null status would break the ':>10' format spec.
        st = str(run.get('status') or '?')
        title = str(run.get('display_title') or '')[:50]
        if st == 'success':
            level = 'OK'
        elif st in pending_or_neutral:
            level = 'WARN'
        else:
            level = 'FAIL'
        print(f' {status(level)} run #{num} {sha} {st:>10} | {title}')
        if level == 'FAIL':
            fail += 1
        elif level == 'WARN':
            warn += 1
    return warn, fail
|
||
|
||
|
||
def check_libreoffice() -> Tuple[int, int]:
    """Check that a LibreOffice binary is installed and can be executed.

    Looks for ``libreoffice`` then ``soffice`` on PATH. When neither is
    found, the result is a FAIL only if PPT_VISION_ENABLED is 'true';
    otherwise it is informational.

    Returns:
        (warn_count, fail_count).
    """
    print(color('\n[4/7] LibreOffice (PPT vision)', 'bold'))
    bin_path = shutil.which('libreoffice') or shutil.which('soffice')

    if not bin_path:
        vision_on = os.getenv('PPT_VISION_ENABLED', '').lower() == 'true'
        if not vision_on:
            print(f' {status("INFO")} 未安裝;PPT_VISION_ENABLED 未開,可忽略')
            return 0, 0
        print(f' {status("FAIL")} 未安裝;PPT_VISION_ENABLED=true 但 .pptx→.png 會失敗')
        print(' apt install libreoffice # 188 上跑')
        return 0, 1

    # Binary located: confirm it actually starts by asking for its version.
    try:
        raw_version = subprocess.check_output(
            [bin_path, '--version'],
            timeout=5,
            text=True,
        )
        ver = raw_version.strip()
        print(f' {status("OK")} {bin_path} — {ver}')
        return 0, 0
    except Exception:
        print(f' {status("WARN")} {bin_path} 找到但跑不起來')
        return 1, 0
|
||
|
||
|
||
def check_mcp() -> Tuple[int, int]:
    """Probe the ``/health`` URL of each configured MCP server.

    Skipped entirely unless MCP_ROUTER_ENABLED is 'true'. For each of the
    four MCP_*_URL variables, an unset variable or a failed request counts
    as one WARN. This check never reports a FAIL.

    Returns:
        (warn_count, 0).
    """
    print(color('\n[5/7] MCP servers', 'bold'))
    router_flag = os.getenv('MCP_ROUTER_ENABLED', '').lower()
    if router_flag != 'true':
        print(f' {status("INFO")} MCP_ROUTER_ENABLED=false — 跳過')
        return 0, 0

    try:
        import urllib.request
    except ImportError:
        return 0, 0

    url_vars = (
        'MCP_FIRECRAWL_URL',
        'MCP_OMNISEARCH_URL',
        'MCP_FILESYSTEM_URL',
        'MCP_POSTGRES_URL',
    )
    problems = 0
    for var in url_vars:
        url = os.getenv(var, '').strip()
        if url:
            health_url = url.rstrip('/') + '/health'
            try:
                with urllib.request.urlopen(health_url, timeout=2):
                    print(f' {status("OK")} {var:22} {url}')
            except Exception as e:
                print(f' {status("WARN")} {var:22} {url} — {type(e).__name__}')
                problems += 1
        else:
            print(f' {status("WARN")} {var} 未設')
            problems += 1
    return problems, 0
|
||
|
||
|
||
def main() -> int:
    """Run every check in order and print a summary.

    A check that raises is reported and counted as one FAIL so the remaining
    checks still run.

    Returns:
        Process exit code: 0 when everything passed, 1 when there is at least
        one WARN (and no FAIL), 2 when there is at least one FAIL.
    """
    # platform.node() works on every OS; os.uname() does not exist on Windows
    # and would abort the script before any check had run.
    import platform

    print(color('═══ Operation Ollama-First v5.0 Deploy Doctor ═══', 'bold'))
    print(f' cwd: {os.getcwd()}')
    print(f' host: {platform.node()}')

    total_warn = total_fail = 0
    checks = (
        check_env,
        check_db,
        check_ollama,
        check_libreoffice,
        check_mcp,
        check_observability_endpoints,
        check_cd_pipeline,
    )
    for fn in checks:
        try:
            w, f = fn()
            total_warn += w
            total_fail += f
        except Exception as e:
            print(f' {status("FAIL")} {fn.__name__} 自身爆炸: {type(e).__name__}: {e}')
            total_fail += 1

    print(color('\n═══ 總結 ═══', 'bold'))
    print(f' WARN: {total_warn}')
    print(f' FAIL: {total_fail}')
    if total_fail:
        print(color(' ❌ 有 FAIL — v5.0 部署未完成', 'red'))
        return 2
    if total_warn:
        print(color(' ⚠️ 有 WARN — 可運作但部分 feature 未啟用', 'yellow'))
        return 1
    print(color(' ✅ 全綠 — v5.0 部署完整', 'green'))
    return 0
|
||
|
||
|
||
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == '__main__':
    raise SystemExit(main())
|