feat(p56): deploy_doctor 擴充 — Observability + CD Pipeline 兩階段檢查
5 階段 → 7 階段: [3/7] Ollama 主機(從 3 → 5 機) + 192.168.0.110:11435 (P53 K8s Nginx Proxy GCP-A) + 192.168.0.110:11436 (P53 K8s Nginx Proxy GCP-B) [6/7] Observability 11 endpoint (新) 全 prod smoke:mo.wooo.work/observability/* + api/health_indicator SPA shell fingerprint 偵測(size=7480 / etag e167a58a... = FAIL) 302/308/401/403 (auth redirect) 視為 OK = login_required 正常工作 PROD_BASE_URL env 可覆寫測 staging [7/7] CD Pipeline (新) Gitea API 撈最近 3 個 run,狀態映射 OK/WARN/FAIL 110 不可達 → 自動 WARN(不阻 deploy doctor exit code) DB migrations 表清單 + 029 ollama_host_history / 030 ppt_audit_history_db。 本機跑實證:11 endpoint 全綠,Gitea 110 down 正確 WARN。
This commit is contained in:
@@ -38,22 +38,45 @@ ENV_SPEC = {
|
||||
'RAG_EMBED_DIM': ('INFO', '1024', 'bge-m3 維度;migration 027 已寫死 1024'),
|
||||
}
|
||||
|
||||
# 必跑 migrations
|
||||
# 必跑 migrations(含 P38 新增 029/030 持久化表)
|
||||
REQUIRED_TABLES = {
|
||||
'ai_calls': '024',
|
||||
'mcp_calls': '025',
|
||||
'ai_call_budgets': '025',
|
||||
'rag_query_log': '027',
|
||||
'learning_episodes': '028',
|
||||
'ollama_host_history': '029',
|
||||
'ppt_audit_history_db': '030',
|
||||
}
|
||||
|
||||
# 三主機
|
||||
# Ollama 主機(直連 + P53 K8s Nginx Proxy 雙軌)
|
||||
OLLAMA_HOSTS = [
|
||||
('Primary GCP', '34.143.170.20:11434'),
|
||||
('Secondary GCP', '34.21.145.224:11434'),
|
||||
('Primary GCP (direct)', '34.143.170.20:11434'),
|
||||
('Secondary GCP (direct)', '34.21.145.224:11434'),
|
||||
('GCP-A via Nginx 110', '192.168.0.110:11435'),
|
||||
('GCP-B via Nginx 110', '192.168.0.110:11436'),
|
||||
('111 Mac', '192.168.0.111:11434'),
|
||||
]
|
||||
|
||||
# Phase 38-52 observability console endpoints, probed as a production smoke test.
# The base URL can be overridden through the PROD_BASE_URL environment variable;
# any trailing slash is removed so that it can be joined with the paths below.
PROD_BASE_URL = os.environ.get('PROD_BASE_URL', 'https://mo.wooo.work').rstrip('/')

# Every path lives under /observability/, so only the page part is spelled out.
OBSERVABILITY_ENDPOINTS = [
    f'/observability/{page}'
    for page in (
        'overview',
        'rag_queries',
        'business_intel',
        'agent_orchestration',
        'ai_calls',
        'promotion_review',
        'quality_trend',
        'host_health',
        'budget',
        'ppt_audit_history',
        'api/health_indicator',
    )
]

# Fingerprint of the nginx SPA fallback page (served by the external LAN nginx):
# a response with this Content-Length or this ETag is the SPA shell, not the
# endpoint that was requested.
SPA_SHELL_ETAG = 'e167a58a1baf907f55a2925a2e8665d1'
SPA_SHELL_LEN = 7480
|
||||
|
||||
|
||||
def color(s: str, c: str) -> str:
|
||||
if not sys.stdout.isatty():
|
||||
@@ -77,7 +100,7 @@ def status(level: str) -> str:
|
||||
|
||||
def check_env() -> Tuple[int, int]:
|
||||
"""Return (warn_count, fail_count)."""
|
||||
print(color('\n[1/5] 環境變數', 'bold'))
|
||||
print(color('\n[1/7] 環境變數', 'bold'))
|
||||
warn = fail = 0
|
||||
for name, (crit, expected, hint) in ENV_SPEC.items():
|
||||
val = os.getenv(name, '').strip()
|
||||
@@ -98,7 +121,7 @@ def check_env() -> Tuple[int, int]:
|
||||
|
||||
|
||||
def check_db() -> Tuple[int, int]:
|
||||
print(color('\n[2/5] 資料庫 migrations', 'bold'))
|
||||
print(color('\n[2/7] 資料庫 migrations', 'bold'))
|
||||
warn = fail = 0
|
||||
db_url = os.getenv('DATABASE_URL', '').strip()
|
||||
if not db_url:
|
||||
@@ -143,7 +166,7 @@ def check_db() -> Tuple[int, int]:
|
||||
|
||||
|
||||
def check_ollama() -> Tuple[int, int]:
|
||||
print(color('\n[3/5] Ollama 三主機', 'bold'))
|
||||
print(color('\n[3/7] Ollama 主機(直連 + K8s Nginx Proxy)', 'bold'))
|
||||
warn = fail = 0
|
||||
try:
|
||||
import urllib.request
|
||||
@@ -154,22 +177,91 @@ def check_ollama() -> Tuple[int, int]:
|
||||
try:
|
||||
with urllib.request.urlopen(f'http://{host}/api/tags', timeout=3) as r:
|
||||
if r.status == 200:
|
||||
print(f' {status("OK")} {label:18} {host}')
|
||||
print(f' {status("OK")} {label:24} {host}')
|
||||
healthy += 1
|
||||
continue
|
||||
except Exception as e:
|
||||
print(f' {status("WARN")} {label:18} {host} — {type(e).__name__}')
|
||||
print(f' {status("WARN")} {label:24} {host} — {type(e).__name__}')
|
||||
warn += 1
|
||||
if healthy == 0:
|
||||
print(f' {status("FAIL")} 三主機全 DOWN — v5.0 無法運作')
|
||||
print(f' {status("FAIL")} 全部主機 DOWN — v5.0 無法運作')
|
||||
fail += 1
|
||||
elif healthy < 3:
|
||||
print(f' {status("INFO")} {healthy}/3 healthy — retry 鏈有 fallback,可運作')
|
||||
elif healthy < len(OLLAMA_HOSTS):
|
||||
print(f' {status("INFO")} {healthy}/{len(OLLAMA_HOSTS)} healthy — retry 鏈有 fallback,可運作')
|
||||
return warn, fail
|
||||
|
||||
|
||||
def check_observability_endpoints() -> Tuple[int, int]:
    """Probe every path in OBSERVABILITY_ENDPOINTS under PROD_BASE_URL.

    Classification of each reply:

    * 2xx whose Content-Length equals SPA_SHELL_LEN or whose ETag equals
      SPA_SHELL_ETAG -> FAIL (the SPA fallback page was served instead of
      the endpoint).
    * any other 2xx -> OK.
    * 302 / 308 / 401 / 403 -> OK (the endpoint answered with an auth
      redirect or an auth rejection).
    * any other HTTP error status, or a transport error -> WARN.

    Redirects are deliberately NOT followed: the default urllib opener
    follows 3xx replies transparently, which would hide the 302 from the
    check above and fingerprint the redirect target instead of the probed
    endpoint.

    Returns:
        (warn_count, fail_count). (0, 0) when urllib cannot be imported.
    """
    print(color(f'\n[6/7] Observability 11 endpoint ({PROD_BASE_URL})', 'bold'))
    warn = fail = 0
    try:
        import urllib.request
        from urllib.error import HTTPError
    except ImportError:
        return 0, 0

    class _NoRedirect(urllib.request.HTTPRedirectHandler):
        """Redirect handler that declines every redirect.

        Returning None makes urllib fall through to its default error
        handler, which raises HTTPError carrying the 3xx status code.
        """

        def redirect_request(self, req, fp, code, msg, headers, newurl):
            return None

    # build_opener replaces the stock HTTPRedirectHandler with the subclass.
    opener = urllib.request.build_opener(_NoRedirect)

    for ep in OBSERVABILITY_ENDPOINTS:
        url = f'{PROD_BASE_URL}{ep}'
        try:
            req = urllib.request.Request(url, method='GET')
            with opener.open(req, timeout=5) as r:
                # A missing or empty Content-Length is treated as size 0.
                size = int(r.headers.get('Content-Length', '0') or 0)
                # Normalise the ETag (drop quotes, lowercase) before comparing.
                etag = (r.headers.get('etag', '') or '').strip('"').lower()
                if size == SPA_SHELL_LEN or etag == SPA_SHELL_ETAG:
                    print(f' {status("FAIL")} {ep:42} HTTP {r.status} but SPA shell (size={size})')
                    fail += 1
                else:
                    print(f' {status("OK")} {ep:42} HTTP {r.status} size={size}')
        except HTTPError as e:
            # A 302/308 redirect to the login page (or a 401/403) means the
            # login_required guard is doing its job.
            if e.code in (302, 308, 401, 403):
                print(f' {status("OK")} {ep:42} HTTP {e.code} (auth redirect = expected)')
            else:
                print(f' {status("WARN")} {ep:42} HTTP {e.code}')
                warn += 1
        except Exception as e:
            # Best-effort probe: timeouts, DNS or TLS problems only warn.
            print(f' {status("WARN")} {ep:42} {type(e).__name__}: {str(e)[:50]}')
            warn += 1
    return warn, fail
|
||||
|
||||
|
||||
def check_cd_pipeline() -> Tuple[int, int]:
    """Report the status of the three most recent Gitea Actions runs.

    Fetches the repository's Actions task list from the Gitea API and prints
    one line per run. Status mapping:

    * 'success'               -> OK
    * 'running' / 'cancelled' -> WARN
    * anything else           -> FAIL

    Any error while fetching or decoding the response (for example the Gitea
    host being down) is reported as a single WARN, not a failure.

    Returns:
        (warn_count, fail_count). (0, 0) when urllib/json cannot be imported.
    """
    print(color('\n[7/7] CD Pipeline (Gitea Actions latest 3 runs)', 'bold'))
    warn = fail = 0
    try:
        import urllib.request
        import json
    except ImportError:
        return 0, 0
    try:
        url = 'http://192.168.0.110:3001/api/v1/repos/wooo/ewoooc/actions/tasks?limit=3'
        with urllib.request.urlopen(url, timeout=5) as r:
            data = json.loads(r.read())
        # `or []` also covers an explicit JSON null, which would otherwise
        # raise on slicing and be misreported as "Gitea unreachable".
        runs = (data.get('workflow_runs') or [])[:3]
        if not runs:
            print(f' {status("WARN")} Gitea API 回應沒 runs')
            return 1, 0
        for run in runs:
            num = run.get('run_number', '?')
            sha = (run.get('head_sha') or '')[:8]
            # A null status cannot be formatted with `:>10`; fall back to the
            # same '?' placeholder that a missing key already gets.
            st = run.get('status') or '?'
            title = (run.get('display_title') or '')[:50]
            if st == 'success':
                level = 'OK'
            elif st in ('running', 'cancelled'):
                level = 'WARN'
            else:
                level = 'FAIL'
            print(f' {status(level)} run #{num} {sha} {st:>10} | {title}')
            if level == 'FAIL':
                fail += 1
            elif level == 'WARN':
                warn += 1
    except Exception as e:
        # Deliberately best-effort: an unreachable Gitea host must only warn.
        print(f' {status("WARN")} Gitea 不可達: {type(e).__name__} — 110 可能 down')
        warn += 1
    return warn, fail
|
||||
|
||||
|
||||
def check_libreoffice() -> Tuple[int, int]:
|
||||
print(color('\n[4/5] LibreOffice (PPT vision)', 'bold'))
|
||||
print(color('\n[4/7] LibreOffice (PPT vision)', 'bold'))
|
||||
bin_path = shutil.which('libreoffice') or shutil.which('soffice')
|
||||
if bin_path:
|
||||
try:
|
||||
@@ -188,7 +280,7 @@ def check_libreoffice() -> Tuple[int, int]:
|
||||
|
||||
|
||||
def check_mcp() -> Tuple[int, int]:
|
||||
print(color('\n[5/5] MCP servers', 'bold'))
|
||||
print(color('\n[5/7] MCP servers', 'bold'))
|
||||
if os.getenv('MCP_ROUTER_ENABLED', '').lower() != 'true':
|
||||
print(f' {status("INFO")} MCP_ROUTER_ENABLED=false — 跳過')
|
||||
return 0, 0
|
||||
@@ -218,7 +310,8 @@ def main() -> int:
|
||||
print(f' host: {os.uname().nodename}')
|
||||
|
||||
total_warn = total_fail = 0
|
||||
for fn in (check_env, check_db, check_ollama, check_libreoffice, check_mcp):
|
||||
for fn in (check_env, check_db, check_ollama, check_libreoffice, check_mcp,
|
||||
check_observability_endpoints, check_cd_pipeline):
|
||||
try:
|
||||
w, f = fn()
|
||||
total_warn += w
|
||||
|
||||
Reference in New Issue
Block a user