feat(p56): deploy_doctor 擴充 — Observability + CD Pipeline 兩階段檢查

5 階段 → 7 階段:

[3/7] Ollama 主機(從 3 → 5 機)
  + 192.168.0.110:11435 (P53 K8s Nginx Proxy GCP-A)
  + 192.168.0.110:11436 (P53 K8s Nginx Proxy GCP-B)

[6/7] Observability 11 endpoint (新)
  全 prod smoke:mo.wooo.work/observability/* + api/health_indicator
  SPA shell fingerprint 偵測(size=7480 / etag e167a58a... = FAIL)
  302/308/401/403 (auth redirect) 視為 OK = login_required 正常工作
  PROD_BASE_URL env 可覆寫測 staging

[7/7] CD Pipeline (新)
  Gitea API 撈最近 3 個 run,狀態映射 OK/WARN/FAIL
  110 不可達 → 自動 WARN(不阻 deploy doctor exit code)

DB migrations 表清單 + 029 ollama_host_history / 030 ppt_audit_history_db。

本機跑實證:11 endpoint 全綠,Gitea 110 down 正確 WARN。
This commit is contained in:
OoO
2026-05-05 12:27:51 +08:00
parent 326285d8b9
commit 2bb2e16442

View File

@@ -38,22 +38,45 @@ ENV_SPEC = {
'RAG_EMBED_DIM': ('INFO', '1024', 'bge-m3 維度migration 027 已寫死 1024'),
}
# Required migrations: each key is a table name and each value a migration
# number (presumably the migration that creates the table — verify against
# check_db). Includes the 029/030 persistence tables, which the original
# comment attributes to P38.
REQUIRED_TABLES = {
'ai_calls': '024',
'mcp_calls': '025',
'ai_call_budgets': '025',
'rag_query_log': '027',
'learning_episodes': '028',
'ollama_host_history': '029',
'ppt_audit_history_db': '030',
}
# Ollama hosts as (label, host:port) pairs: direct connections plus the
# P53 K8s Nginx proxy (two tracks).
# NOTE(review): the first two entries carry the same addresses as the
# "(direct)" entries below — they look like pre-change lines kept by the
# diff view; confirm which set actually belongs in the file.
OLLAMA_HOSTS = [
('Primary GCP', '34.143.170.20:11434'),
('Secondary GCP', '34.21.145.224:11434'),
('Primary GCP (direct)', '34.143.170.20:11434'),
('Secondary GCP (direct)', '34.21.145.224:11434'),
('GCP-A via Nginx 110', '192.168.0.110:11435'),
('GCP-B via Nginx 110', '192.168.0.110:11436'),
('111 Mac', '192.168.0.111:11434'),
]
# Observability console endpoints (Phase 38-52) used for the prod smoke check.
# The base URL defaults to production and can be overridden through the
# PROD_BASE_URL environment variable; a trailing "/" is stripped so the
# endpoint paths below can be appended directly.
PROD_BASE_URL = os.getenv('PROD_BASE_URL', 'https://mo.wooo.work').rstrip('/')
OBSERVABILITY_ENDPOINTS = [
'/observability/overview',
'/observability/rag_queries',
'/observability/business_intel',
'/observability/agent_orchestration',
'/observability/ai_calls',
'/observability/promotion_review',
'/observability/quality_trend',
'/observability/host_health',
'/observability/budget',
'/observability/ppt_audit_history',
'/observability/api/health_indicator',
]
# Fingerprint of the nginx SPA fallback page (the one served by the external
# LAN nginx): a response with this Content-Length or this ETag is treated as
# the SPA shell rather than a real observability page.
SPA_SHELL_LEN = 7480
SPA_SHELL_ETAG = 'e167a58a1baf907f55a2925a2e8665d1'
def color(s: str, c: str) -> str:
if not sys.stdout.isatty():
@@ -77,7 +100,7 @@ def status(level: str) -> str:
def check_env() -> Tuple[int, int]:
"""Return (warn_count, fail_count)."""
print(color('\n[1/5] 環境變數', 'bold'))
print(color('\n[1/7] 環境變數', 'bold'))
warn = fail = 0
for name, (crit, expected, hint) in ENV_SPEC.items():
val = os.getenv(name, '').strip()
@@ -98,7 +121,7 @@ def check_env() -> Tuple[int, int]:
def check_db() -> Tuple[int, int]:
print(color('\n[2/5] 資料庫 migrations', 'bold'))
print(color('\n[2/7] 資料庫 migrations', 'bold'))
warn = fail = 0
db_url = os.getenv('DATABASE_URL', '').strip()
if not db_url:
@@ -143,7 +166,7 @@ def check_db() -> Tuple[int, int]:
def check_ollama() -> Tuple[int, int]:
print(color('\n[3/5] Ollama 主機', 'bold'))
print(color('\n[3/7] Ollama 主機(直連 + K8s Nginx Proxy', 'bold'))
warn = fail = 0
try:
import urllib.request
@@ -154,22 +177,91 @@ def check_ollama() -> Tuple[int, int]:
try:
with urllib.request.urlopen(f'http://{host}/api/tags', timeout=3) as r:
if r.status == 200:
print(f' {status("OK")} {label:18} {host}')
print(f' {status("OK")} {label:24} {host}')
healthy += 1
continue
except Exception as e:
print(f' {status("WARN")} {label:18} {host}{type(e).__name__}')
print(f' {status("WARN")} {label:24} {host}{type(e).__name__}')
warn += 1
if healthy == 0:
print(f' {status("FAIL")} 主機 DOWN — v5.0 無法運作')
print(f' {status("FAIL")} 全部主機 DOWN — v5.0 無法運作')
fail += 1
elif healthy < 3:
print(f' {status("INFO")} {healthy}/3 healthy — retry 鏈有 fallback可運作')
elif healthy < len(OLLAMA_HOSTS):
print(f' {status("INFO")} {healthy}/{len(OLLAMA_HOSTS)} healthy — retry 鏈有 fallback可運作')
return warn, fail
def check_observability_endpoints() -> Tuple[int, int]:
    """Probe every path in OBSERVABILITY_ENDPOINTS on PROD_BASE_URL (Phase 56).

    Each endpoint is fetched with a GET request and a 5 second timeout.
    302/308 redirects are deliberately NOT followed: a protected page answers
    with a login redirect, and that redirect itself is the healthy signal.
    The default ``urlopen`` would silently follow it and report the redirect
    target instead, which made the "auth redirect" branch unreachable.

    Classification per endpoint:
      * successful response whose Content-Length equals SPA_SHELL_LEN or
        whose ETag equals SPA_SHELL_ETAG -> FAIL (SPA shell served instead
        of the real page);
      * any other successful response -> OK;
      * HTTP 302/308/401/403 -> OK (authentication is doing its job);
      * any other HTTP error, or any other exception -> WARN.

    Returns:
        (warn_count, fail_count)
    """
    print(color(f'\n[6/7] Observability 11 endpoint ({PROD_BASE_URL})', 'bold'))
    warn = fail = 0
    try:
        import urllib.request
        from urllib.error import HTTPError
    except ImportError:
        return 0, 0

    # Status codes that mean "login required" and therefore count as healthy.
    auth_codes = (302, 308, 401, 403)

    class _KeepAuthRedirect(urllib.request.HTTPRedirectHandler):
        """Redirect handler that refuses to follow auth redirects.

        Returning None makes urllib raise HTTPError with the redirect's own
        status code; every other redirect is followed as usual.
        """

        def redirect_request(self, req, fp, code, msg, headers, newurl):
            if code in auth_codes:
                return None
            return super().redirect_request(req, fp, code, msg, headers, newurl)

    opener = urllib.request.build_opener(_KeepAuthRedirect)

    for ep in OBSERVABILITY_ENDPOINTS:
        url = f'{PROD_BASE_URL}{ep}'
        try:
            req = urllib.request.Request(url, method='GET')
            with opener.open(req, timeout=5) as r:
                size = int(r.headers.get('Content-Length', '0') or 0)
                etag = (r.headers.get('etag', '') or '').strip()
                # Tolerate a weak validator prefix (W/"...") before comparing.
                if etag[:2] in ('W/', 'w/'):
                    etag = etag[2:]
                etag = etag.strip('"').lower()
                if size == SPA_SHELL_LEN or etag == SPA_SHELL_ETAG:
                    print(f' {status("FAIL")} {ep:42} HTTP {r.status} but SPA shell (size={size})')
                    fail += 1
                else:
                    print(f' {status("OK")} {ep:42} HTTP {r.status} size={size}')
        except HTTPError as e:
            # A 302/308 redirect to /login means login_required is working.
            if e.code in auth_codes:
                print(f' {status("OK")} {ep:42} HTTP {e.code} (auth redirect = expected)')
            else:
                print(f' {status("WARN")} {ep:42} HTTP {e.code}')
                warn += 1
        except Exception as e:
            # Best-effort probe: network or parsing problems are only a WARN.
            print(f' {status("WARN")} {ep:42} {type(e).__name__}: {str(e)[:50]}')
            warn += 1
    return warn, fail
def check_cd_pipeline() -> Tuple[int, int]:
    """Report the status of the latest 3 CD runs from the Gitea API (Phase 56).

    The ``workflow_runs`` list of the Gitea Actions tasks endpoint is read
    and each run is mapped to a level:
      * ``success`` -> OK;
      * not finished or never executed (``running``, ``waiting``,
        ``blocked``, ``cancelled``, ``skipped``) -> WARN;
      * anything else -> FAIL.

    A queued run is not a broken pipeline, so it must not count as FAIL.
    An empty run list is a WARN, and so is any error while talking to Gitea
    (the host may simply be down).

    Returns:
        (warn_count, fail_count)
    """
    print(color('\n[7/7] CD Pipeline (Gitea Actions latest 3 runs)', 'bold'))
    warn = fail = 0
    try:
        import urllib.request
        import json
    except ImportError:
        return 0, 0

    # NOTE(review): state names follow Gitea's Actions task statuses —
    # confirm against the deployed Gitea version.
    warn_states = ('running', 'cancelled', 'waiting', 'blocked', 'skipped')

    try:
        url = 'http://192.168.0.110:3001/api/v1/repos/wooo/ewoooc/actions/tasks?limit=3'
        with urllib.request.urlopen(url, timeout=5) as r:
            data = json.loads(r.read())
        # `or []` also covers an explicit null, which would otherwise raise
        # on slicing and be misreported as "Gitea unreachable".
        runs = (data.get('workflow_runs') or [])[:3]
        if not runs:
            print(f' {status("WARN")} Gitea API 回應沒 runs')
            return 1, 0
        for run in runs:
            num = run.get('run_number', '?')
            sha = (run.get('head_sha') or '')[:8]
            st = run.get('status', '?')
            title = (run.get('display_title') or '')[:50]
            if st == 'success':
                level = 'OK'
            elif st in warn_states:
                level = 'WARN'
            else:
                level = 'FAIL'
            print(f' {status(level)} run #{num} {sha} {st:>10} | {title}')
            if level == 'FAIL':
                fail += 1
            elif level == 'WARN':
                warn += 1
    except Exception as e:
        # Deliberately best-effort: an unreachable Gitea only warns.
        print(f' {status("WARN")} Gitea 不可達: {type(e).__name__} — 110 可能 down')
        warn += 1
    return warn, fail
def check_libreoffice() -> Tuple[int, int]:
print(color('\n[4/5] LibreOffice (PPT vision)', 'bold'))
print(color('\n[4/7] LibreOffice (PPT vision)', 'bold'))
bin_path = shutil.which('libreoffice') or shutil.which('soffice')
if bin_path:
try:
@@ -188,7 +280,7 @@ def check_libreoffice() -> Tuple[int, int]:
def check_mcp() -> Tuple[int, int]:
print(color('\n[5/5] MCP servers', 'bold'))
print(color('\n[5/7] MCP servers', 'bold'))
if os.getenv('MCP_ROUTER_ENABLED', '').lower() != 'true':
print(f' {status("INFO")} MCP_ROUTER_ENABLED=false — 跳過')
return 0, 0
@@ -218,7 +310,8 @@ def main() -> int:
print(f' host: {os.uname().nodename}')
total_warn = total_fail = 0
for fn in (check_env, check_db, check_ollama, check_libreoffice, check_mcp):
for fn in (check_env, check_db, check_ollama, check_libreoffice, check_mcp,
check_observability_endpoints, check_cd_pipeline):
try:
w, f = fn()
total_warn += w