From 2bb2e16442fda3ab08739a9f199408790996c5be Mon Sep 17 00:00:00 2001
From: OoO
Date: Tue, 5 May 2026 12:27:51 +0800
Subject: [PATCH] =?UTF-8?q?feat(p56):=20deploy=5Fdoctor=20=E6=93=B4?=
 =?UTF-8?q?=E5=85=85=20=E2=80=94=20Observability=20+=20CD=20Pipeline=20?=
 =?UTF-8?q?=E5=85=A9=E9=9A=8E=E6=AE=B5=E6=AA=A2=E6=9F=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

5 階段 → 7 階段:

[3/7] Ollama 主機(從 3 → 5 機)
  + 192.168.0.110:11435 (P53 K8s Nginx Proxy GCP-A)
  + 192.168.0.110:11436 (P53 K8s Nginx Proxy GCP-B)

[6/7] Observability 11 endpoint (新)
  全 prod smoke:mo.wooo.work/observability/* + api/health_indicator
  SPA shell fingerprint 偵測(size=7480 / etag e167a58a... = FAIL)
  302/308/401/403 (auth redirect) 視為 OK = login_required 正常工作
  PROD_BASE_URL env 可覆寫測 staging

[7/7] CD Pipeline (新)
  Gitea API 撈最近 3 個 run,狀態映射 OK/WARN/FAIL
  110 不可達 → 自動 WARN(不阻 deploy doctor exit code)

DB migrations 表清單 + 029 ollama_host_history / 030 ppt_audit_history_db。

本機跑實證:11 endpoint 全綠,Gitea 110 down 正確 WARN。
---
 scripts/deploy_doctor_v5.py | 123 +++++++++++++++++++++++++++++++-----
 1 file changed, 108 insertions(+), 15 deletions(-)

diff --git a/scripts/deploy_doctor_v5.py b/scripts/deploy_doctor_v5.py
index 7bd66e1..18d819a 100755
--- a/scripts/deploy_doctor_v5.py
+++ b/scripts/deploy_doctor_v5.py
@@ -38,22 +38,45 @@ ENV_SPEC = {
     'RAG_EMBED_DIM': ('INFO', '1024', 'bge-m3 維度;migration 027 已寫死 1024'),
 }
 
-# 必跑 migrations
+# 必跑 migrations(含 P38 新增 029/030 持久化表)
 REQUIRED_TABLES = {
     'ai_calls': '024',
     'mcp_calls': '025',
     'ai_call_budgets': '025',
     'rag_query_log': '027',
     'learning_episodes': '028',
+    'ollama_host_history': '029',
+    'ppt_audit_history_db': '030',
 }
 
-# 三主機
+# Ollama 主機(直連 + P53 K8s Nginx Proxy 雙軌)
 OLLAMA_HOSTS = [
-    ('Primary GCP', '34.143.170.20:11434'),
-    ('Secondary GCP', '34.21.145.224:11434'),
+    ('Primary GCP (direct)', '34.143.170.20:11434'),
+    ('Secondary GCP (direct)', '34.21.145.224:11434'),
+    ('GCP-A via Nginx 110', '192.168.0.110:11435'),
+    ('GCP-B via Nginx 110', '192.168.0.110:11436'),
     ('111 Mac', '192.168.0.111:11434'),
 ]
 
+# Phase 38-52 觀測台 endpoint(prod smoke)
+PROD_BASE_URL = os.getenv('PROD_BASE_URL', 'https://mo.wooo.work').rstrip('/')
+OBSERVABILITY_ENDPOINTS = [
+    '/observability/overview',
+    '/observability/rag_queries',
+    '/observability/business_intel',
+    '/observability/agent_orchestration',
+    '/observability/ai_calls',
+    '/observability/promotion_review',
+    '/observability/quality_trend',
+    '/observability/host_health',
+    '/observability/budget',
+    '/observability/ppt_audit_history',
+    '/observability/api/health_indicator',
+]
+# nginx SPA fallback fingerprint(外部 LAN nginx 的)
+SPA_SHELL_LEN = 7480
+SPA_SHELL_ETAG = 'e167a58a1baf907f55a2925a2e8665d1'
+
 
 def color(s: str, c: str) -> str:
     if not sys.stdout.isatty():
@@ -77,7 +100,7 @@ def status(level: str) -> str:
 
 def check_env() -> Tuple[int, int]:
     """Return (warn_count, fail_count)."""
-    print(color('\n[1/5] 環境變數', 'bold'))
+    print(color('\n[1/7] 環境變數', 'bold'))
     warn = fail = 0
     for name, (crit, expected, hint) in ENV_SPEC.items():
         val = os.getenv(name, '').strip()
@@ -98,7 +121,7 @@ def check_env() -> Tuple[int, int]:
 
 
 def check_db() -> Tuple[int, int]:
-    print(color('\n[2/5] 資料庫 migrations', 'bold'))
+    print(color('\n[2/7] 資料庫 migrations', 'bold'))
     warn = fail = 0
     db_url = os.getenv('DATABASE_URL', '').strip()
     if not db_url:
@@ -143,7 +166,7 @@ def check_db() -> Tuple[int, int]:
 
 
 def check_ollama() -> Tuple[int, int]:
-    print(color('\n[3/5] Ollama 三主機', 'bold'))
+    print(color('\n[3/7] Ollama 主機(直連 + K8s Nginx Proxy)', 'bold'))
     warn = fail = 0
     try:
         import urllib.request
@@ -154,22 +177,91 @@ def check_ollama() -> Tuple[int, int]:
         try:
             with urllib.request.urlopen(f'http://{host}/api/tags', timeout=3) as r:
                 if r.status == 200:
-                    print(f' {status("OK")} {label:18} {host}')
+                    print(f' {status("OK")} {label:24} {host}')
                     healthy += 1
                     continue
         except Exception as e:
-            print(f' {status("WARN")} {label:18} {host} — {type(e).__name__}')
+            print(f' {status("WARN")} {label:24} {host} — {type(e).__name__}')
         warn += 1
     if healthy == 0:
-        print(f' {status("FAIL")} 三主機全 DOWN — v5.0 無法運作')
+        print(f' {status("FAIL")} 全部主機 DOWN — v5.0 無法運作')
         fail += 1
-    elif healthy < 3:
-        print(f' {status("INFO")} {healthy}/3 healthy — retry 鏈有 fallback,可運作')
+    elif healthy < len(OLLAMA_HOSTS):
+        print(f' {status("INFO")} {healthy}/{len(OLLAMA_HOSTS)} healthy — retry 鏈有 fallback,可運作')
+    return warn, fail
+
+
+def check_observability_endpoints() -> Tuple[int, int]:
+    """Phase 56 新增:probe Phase 38-52 觀測台 11 endpoint
+    正常 prod 應回 302 (login redirect) 或 200 — 7480 byte SPA shell = 失敗"""
+    print(color(f'\n[6/7] Observability 11 endpoint ({PROD_BASE_URL})', 'bold'))
+    warn = fail = 0
+    try:
+        import urllib.request
+        from urllib.error import HTTPError
+    except ImportError:
+        return 0, 0
+    for ep in OBSERVABILITY_ENDPOINTS:
+        url = f'{PROD_BASE_URL}{ep}'
+        try:
+            req = urllib.request.Request(url, method='GET')
+            with urllib.request.urlopen(req, timeout=5) as r:
+                size = int(r.headers.get('Content-Length', '0') or 0)
+                etag = (r.headers.get('etag', '') or '').strip('"').lower()
+                if size == SPA_SHELL_LEN or etag == SPA_SHELL_ETAG:
+                    print(f' {status("FAIL")} {ep:42} HTTP {r.status} but SPA shell (size={size})')
+                    fail += 1
+                else:
+                    print(f' {status("OK")} {ep:42} HTTP {r.status} size={size}')
+        except HTTPError as e:
+            # 302/308 redirect to /login = login_required 正常工作
+            if e.code in (302, 308, 401, 403):
+                print(f' {status("OK")} {ep:42} HTTP {e.code} (auth redirect = expected)')
+            else:
+                print(f' {status("WARN")} {ep:42} HTTP {e.code}')
+                warn += 1
+        except Exception as e:
+            print(f' {status("WARN")} {ep:42} {type(e).__name__}: {str(e)[:50]}')
+            warn += 1
+    return warn, fail
+
+
+def check_cd_pipeline() -> Tuple[int, int]:
+    """Phase 56 新增:查 Gitea API 最近 3 個 CD run 狀態"""
+    print(color('\n[7/7] CD Pipeline (Gitea Actions latest 3 runs)', 'bold'))
+    warn = fail = 0
+    try:
+        import urllib.request
+        import json
+    except ImportError:
+        return 0, 0
+    try:
+        url = 'http://192.168.0.110:3001/api/v1/repos/wooo/ewoooc/actions/tasks?limit=3'
+        with urllib.request.urlopen(url, timeout=5) as r:
+            data = json.loads(r.read())
+        runs = data.get('workflow_runs', [])[:3]
+        if not runs:
+            print(f' {status("WARN")} Gitea API 回應沒 runs')
+            return 1, 0
+        for run in runs:
+            num = run.get('run_number', '?')
+            sha = (run.get('head_sha') or '')[:8]
+            st = run.get('status', '?')
+            title = (run.get('display_title') or '')[:50]
+            level = 'OK' if st == 'success' else ('WARN' if st in ('running', 'cancelled') else 'FAIL')
+            print(f' {status(level)} run #{num} {sha} {st:>10} | {title}')
+            if level == 'FAIL':
+                fail += 1
+            elif level == 'WARN':
+                warn += 1
+    except Exception as e:
+        print(f' {status("WARN")} Gitea 不可達: {type(e).__name__} — 110 可能 down')
+        warn += 1
     return warn, fail
 
 
 def check_libreoffice() -> Tuple[int, int]:
-    print(color('\n[4/5] LibreOffice (PPT vision)', 'bold'))
+    print(color('\n[4/7] LibreOffice (PPT vision)', 'bold'))
     bin_path = shutil.which('libreoffice') or shutil.which('soffice')
     if bin_path:
         try:
@@ -188,7 +280,7 @@ def check_libreoffice() -> Tuple[int, int]:
 
 
 def check_mcp() -> Tuple[int, int]:
-    print(color('\n[5/5] MCP servers', 'bold'))
+    print(color('\n[5/7] MCP servers', 'bold'))
     if os.getenv('MCP_ROUTER_ENABLED', '').lower() != 'true':
         print(f' {status("INFO")} MCP_ROUTER_ENABLED=false — 跳過')
         return 0, 0
@@ -218,7 +310,8 @@ def main() -> int:
     print(f' host: {os.uname().nodename}')
 
     total_warn = total_fail = 0
-    for fn in (check_env, check_db, check_ollama, check_libreoffice, check_mcp):
+    for fn in (check_env, check_db, check_ollama, check_libreoffice, check_mcp,
+               check_observability_endpoints, check_cd_pipeline):
         try:
             w, f = fn()
             total_warn += w