From 903cf1a27aeed49896e68ca647fd752ee9de2f39 Mon Sep 17 00:00:00 2001 From: ogt Date: Thu, 25 Jun 2026 14:45:02 +0800 Subject: [PATCH] fix: align deploy health checks with live endpoint --- config.py | 2 +- docker/nginx/html/api/health-check.sh | 4 +- docker/nginx/html/api/health.sh | 4 +- docker/prometheus/prometheus.yml | 20 +----- docs/AI_INTELLIGENCE_MODULE_SOT.md | 1 + routes/cicd_routes.py | 67 +++++++++++++++---- routes/system_public_routes.py | 9 ++- scripts/auto-repair/env-sync-monitor.sh | 4 +- scripts/domain-health-monitor.sh | 3 +- templates/cicd_dashboard.html | 2 +- tests/test_prometheus_ai_automation_scrape.py | 4 +- tests/test_webcrumbs_asset_proxy.py | 1 + 12 files changed, 76 insertions(+), 45 deletions(-) diff --git a/config.py b/config.py index 1cb0b63..10e1b19 100644 --- a/config.py +++ b/config.py @@ -402,7 +402,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '') # ========================================== # 系統版本與路徑 # ========================================== -SYSTEM_VERSION = "V10.676" +SYSTEM_VERSION = "V10.677" LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log') public_url = PUBLIC_URL # 用於模板顯示 diff --git a/docker/nginx/html/api/health-check.sh b/docker/nginx/html/api/health-check.sh index b7638ab..acc8dde 100644 --- a/docker/nginx/html/api/health-check.sh +++ b/docker/nginx/html/api/health-check.sh @@ -14,8 +14,8 @@ SERVICE=$(echo "$QUERY_STRING" | sed -n 's/.*service=\([^&]*\).*/\1/p') # 定義服務健康檢查 URL declare -A HEALTH_URLS=( - ["momo-uat"]="https://mo.wooo.work/health" - ["momo-gcp"]="https://momo.wooo.work/health" + ["momo-live"]="https://mo.wooo.work/health" + ["momo-prod"]="https://mo.wooo.work/health" ["gitlab"]="http://127.0.0.1:8929/" ["registry"]="http://127.0.0.1:5002/v2/" ["n8n"]="http://127.0.0.1:5678/" diff --git a/docker/nginx/html/api/health.sh b/docker/nginx/html/api/health.sh index 9fa2206..49f4eec 100644 --- a/docker/nginx/html/api/health.sh +++ b/docker/nginx/html/api/health.sh @@ -27,9 +27,9 @@ check_service() { echo '{"services": {' # 核心服務 -check_service "momo-uat" "https://mo.wooo.work/health" +check_service "momo-live" "https://mo.wooo.work/health" echo "," -check_service "momo-gcp" "https://momo.wooo.work/health" +check_service "momo-prod" "https://mo.wooo.work/health" echo "," # 開發工具 diff --git a/docker/prometheus/prometheus.yml b/docker/prometheus/prometheus.yml index 1016c9c..25a4ba2 100644 --- a/docker/prometheus/prometheus.yml +++ b/docker/prometheus/prometheus.yml @@ -119,7 +119,7 @@ scrape_configs: module: [http_2xx] static_configs: - targets: - - https://momo.wooo.work/health + - https://mo.wooo.work/health labels: env: 'prod' probe_type: 'http' @@ -268,24 +268,6 @@ scrape_configs: - target_label: __address__ replacement: blackbox-exporter:9115 - - job_name: 'blackbox-dns-momo' - metrics_path: /probe - params: - module: [dns_check_momo] - static_configs: - - targets: - - 8.8.8.8 # Google DNS - momo.wooo.work - labels: - domain: 'momo.wooo.work' - probe_type: 'dns' - relabel_configs: - - source_labels: [__address__] - target_label: __param_target - - source_labels: [__param_target] - target_label: instance - - target_label: __address__ - replacement: blackbox-exporter:9115 - # =========================================================================== # 監控系統自身 # =========================================================================== diff --git a/docs/AI_INTELLIGENCE_MODULE_SOT.md b/docs/AI_INTELLIGENCE_MODULE_SOT.md index 16589b1..e133d2e 100644 --- a/docs/AI_INTELLIGENCE_MODULE_SOT.md +++ b/docs/AI_INTELLIGENCE_MODULE_SOT.md @@ -753,3 +753,4 @@ POSTGRES_HOST=momo-db | 2026-06-25 | 共用成長流程列手機版不可溢出畫面 | V10.674 起全站 `momo-growth-rail` 在手機寬度改為換行呈現,避免「評估 / 分析 / 建議 / 解法 / 治理」流程 chip 超出視覺邊界。 | | 2026-06-25 | 匯入、缺貨、設定與通知模板頁不可外露 SQL / 資料表 / 模板代碼 | V10.675 起匯入 job API 對前台回傳白話處置訊息,保留 raw error 於 DB/log;自動匯入失敗通知不再顯示 `psycopg2`、`daily_sales_snapshot`、`snapshot_date` 等內部字串;缺貨首頁、系統設定與通知模板列表改用營運語言,並補上逾時匯入任務重置與取消 API。 | | 2026-06-25 | 觀測台入口與通知預覽不可用工程主語干擾營運判讀 | V10.676 起觀測台導覽統一使用「AI 分工矩陣」,通知模板列表會把 K8s/Pod/資料庫/CI Pipeline 等內部詞轉成服務健康、資料連線與部署流程;主機健康事件與自癒劇本改顯示任務/問題/處置提醒,不直接露 `unknown_task`、`scheduler_task_failure`、`CODE_FIX` 等 raw code。 | +| 2026-06-25 | 部署監控不得用退役正式域名判定失敗 | V10.677 起 CI/CD 狀態 API 與 active blackbox 監控預設以 `PUBLIC_URL` / `PROD_BASE_URL` 對齊現行正式入口 `https://mo.wooo.work/health`,不再把 `momo.wooo.work` timeout 判成正式部署失敗;Webcrumbs loader fallback 也改為資訊級降級訊號,避免健康頁與 log 產生假紅燈。 | diff --git a/routes/cicd_routes.py b/routes/cicd_routes.py index 16a8f3f..bc8052a 100644 --- a/routes/cicd_routes.py +++ b/routes/cicd_routes.py @@ -11,6 +11,7 @@ import json import logging import os import re +from urllib.parse import urlparse cicd_bp = Blueprint('cicd', __name__) cicd_log = logging.getLogger('cicd_routes') @@ -97,14 +98,41 @@ if not GITLAB_ENABLED: ) # 環境配置 +def _normalize_base_url(value, fallback='https://mo.wooo.work'): + candidate = (value or '').strip().rstrip('/') or fallback + parsed = urlparse(candidate) + if parsed.scheme not in {'http', 'https'} or not parsed.netloc: + return fallback.rstrip('/') + return candidate + + +def _health_endpoint_for(base_url): + return f"{base_url.rstrip('/')}/health" + + +PUBLIC_BASE_URL = _normalize_base_url(os.getenv('PUBLIC_URL'), 'https://mo.wooo.work') +CICD_UAT_BASE_URL = _normalize_base_url( + os.getenv('CICD_UAT_BASE_URL') + or os.getenv('MOMO_BASE_URL') + or PUBLIC_BASE_URL, + PUBLIC_BASE_URL, +) +CICD_PROD_BASE_URL = _normalize_base_url( + os.getenv('CICD_PROD_BASE_URL') + or os.getenv('PROD_BASE_URL') + or PUBLIC_BASE_URL, + PUBLIC_BASE_URL, +) + + ENVIRONMENTS = { 'uat': { - 'name': 'UAT', - 'label': '測試環境', + 'name': 'LIVE', + 'label': '線上入口', 'color': '#3498db', 'icon': '🟦', - 'url': 'https://mo.wooo.work', - 'health_endpoint': 'https://mo.wooo.work/health', + 'url': CICD_UAT_BASE_URL, + 'health_endpoint': _health_endpoint_for(CICD_UAT_BASE_URL), 'runtime_host': '192.168.0.188' }, 'prod': { @@ -112,12 +140,21 @@ ENVIRONMENTS = { 'label': '正式環境', 'color': '#e74c3c', 'icon': '🟥', - 'url': 'https://momo.wooo.work', - 'health_endpoint': 'https://momo.wooo.work/health', + 'url': CICD_PROD_BASE_URL, + 'health_endpoint': _health_endpoint_for(CICD_PROD_BASE_URL), 'runtime_host': '192.168.0.188' } } + +def _public_health_error(exc): + text = str(exc or '').lower() + if 'timeout' in text or 'timed out' in text: + return '健康檢查逾時,請確認正式入口、Nginx 與 188 應用容器狀態。' + if 'connection' in text or 'refused' in text or 'max retries' in text: + return '健康檢查無法連線,請確認正式入口、Nginx 與 188 應用容器狀態。' + return '健康檢查暫時無法完成,請稍後重試或查看部署診斷。' + # ============================================================================= # 部署監控頁面 # ============================================================================= @@ -171,7 +208,7 @@ def get_cicd_status(): 'type': 'environment', 'environment': env_id, 'message': f"{env_status.get('name')} 環境異常", - 'error': env_status.get('error'), + 'error': env_status.get('display_error') or env_status.get('error'), 'severity': 'critical', 'auto_fixable': True, 'fix_action': 'diagnose' @@ -259,7 +296,7 @@ def get_pipeline_detail(pipeline_id): except Exception as e: return jsonify({ 'success': False, - 'error': str(e) + 'error': '部署監控暫時無法完成,請稍後重試或查看服務健康狀態。' }), 500 @@ -448,7 +485,7 @@ def run_diagnosis(env): diagnosis['checks'].append({ 'name': '健康端點', 'status': 'failed', - 'error': str(e) + 'error': _public_health_error(e) }) # EwoooC 已撤除舊叢集 runtime,這裡只保留現行 Docker Compose 狀態說明。 @@ -473,7 +510,7 @@ def run_diagnosis(env): diagnosis['checks'].append({ 'name': 'Registry', 'status': 'failed', - 'error': str(e) + 'error': '映像倉庫健康檢查暫時無法完成,請查看 Registry 服務狀態。' }) # 生成總結 @@ -501,7 +538,7 @@ def run_diagnosis(env): }) except Exception as e: - diagnosis['error'] = str(e) + diagnosis['error'] = '部署診斷暫時無法完成,請稍後重試。' return diagnosis @@ -877,7 +914,13 @@ def get_environment_status(env_id): status['response_time'] = round(response_time, 2) status['last_check'] = datetime.now().isoformat() except Exception as e: - status['error'] = str(e) + cicd_log.warning( + "[CI/CD] Health check failed env=%s url=%s error=%s", + env_id, + env_config.get('health_endpoint'), + e, + ) + status['error'] = _public_health_error(e) status['runtime_note'] = 'Docker Compose on 192.168.0.188; legacy cluster probes disabled.' diff --git a/routes/system_public_routes.py b/routes/system_public_routes.py index 0db3d14..1ea6518 100644 --- a/routes/system_public_routes.py +++ b/routes/system_public_routes.py @@ -217,15 +217,20 @@ def webcrumbs_asset_proxy(asset_path): try: upstream_response = requests.get(upstream_url, timeout=(2, 8)) except requests.RequestException as exc: - sys_log.warning(f"[Webcrumbs] Asset proxy failed: {normalized_path} -> {exc}") if normalized_path == WEBCRUMBS_COMPATIBLE_LOADER_PATH: + sys_log.info("[Webcrumbs] Loader upstream unavailable; serving local fallback") return _webcrumbs_fallback_loader_response('upstream-unavailable') + sys_log.warning(f"[Webcrumbs] Asset proxy failed: {normalized_path} -> {exc}") return Response('Webcrumbs asset upstream unavailable', status=502, mimetype='text/plain') if upstream_response.status_code >= 400: - sys_log.warning(f"[Webcrumbs] Asset proxy returned {upstream_response.status_code}: {normalized_path}") if normalized_path == WEBCRUMBS_COMPATIBLE_LOADER_PATH: + sys_log.info( + "[Webcrumbs] Loader upstream returned %s; serving local fallback", + upstream_response.status_code, + ) return _webcrumbs_fallback_loader_response(f'upstream-{upstream_response.status_code}') + sys_log.warning(f"[Webcrumbs] Asset proxy returned {upstream_response.status_code}: {normalized_path}") return Response('Webcrumbs asset upstream returned error', status=upstream_response.status_code, mimetype='text/plain') content_type = upstream_response.headers.get('Content-Type') diff --git a/scripts/auto-repair/env-sync-monitor.sh b/scripts/auto-repair/env-sync-monitor.sh index 53a8e1c..18443d2 100755 --- a/scripts/auto-repair/env-sync-monitor.sh +++ b/scripts/auto-repair/env-sync-monitor.sh @@ -97,7 +97,7 @@ compare_health() { log "💓 比對服務健康狀態..." UAT_HEALTH=$(curl -s -o /dev/null -w '%{http_code}' "https://mo.wooo.work/health" 2>/dev/null) - GCP_HEALTH=$(curl -s -o /dev/null -w '%{http_code}' "https://momo.wooo.work/health" 2>/dev/null) + GCP_HEALTH=$(curl -s -o /dev/null -w '%{http_code}' "https://mo.wooo.work/health" 2>/dev/null) log " UAT: ${UAT_HEALTH}" log " GCP: ${GCP_HEALTH}" @@ -228,7 +228,7 @@ main() { 檢查時間: $(date '+%Y-%m-%d %H:%M:%S') UAT: https://mo.wooo.work -GCP: https://momo.wooo.work +正式入口: https://mo.wooo.work 所有配置、版本、服務狀態一致。" fi diff --git a/scripts/domain-health-monitor.sh b/scripts/domain-health-monitor.sh index 14dcb2e..6d67e4a 100644 --- a/scripts/domain-health-monitor.sh +++ b/scripts/domain-health-monitor.sh @@ -34,8 +34,7 @@ NC='\033[0m' declare -A DOMAINS=( # 核心業務 - 最高優先級 - ["https://mo.wooo.work/health"]="200|MOMO App UAT|kubectl rollout restart deployment/momo-app -n momo|60" - ["https://momo.wooo.work/health"]="200|MOMO App GCP|gcloud compute ssh momo-pro-gcp --zone=asia-east1-b --command='sudo kubectl rollout restart deployment/momo-app -n momo'|60" + ["https://mo.wooo.work/health"]="200|MOMO Pro 正式入口|ssh ollama@192.168.0.188 'cd /home/ollama/momo-pro && docker compose up -d --no-deps --force-recreate momo-app'|60" # CI/CD 工具 - 高優先級(GitLab 需要更長啟動時間) ["http://192.168.0.110:8929/"]="200|GitLab|docker restart wooo-gitlab|120" diff --git a/templates/cicd_dashboard.html b/templates/cicd_dashboard.html index 9ef8776..75ab029 100644 --- a/templates/cicd_dashboard.html +++ b/templates/cicd_dashboard.html @@ -718,7 +718,7 @@
${issue.type === 'job' ? `${displayStageName(issue.stage)}` : ''} ${issue.type === 'runtime' ? `${issue.environment?.toUpperCase()}` : ''} - ${issue.error ? `
${escapeHtml(issue.error.substring(0, 100))}` : ''} + ${issue.error ? `
${escapeHtml(issue.error.substring(0, 120))}` : ''}
${issue.fix_suggestion ? `
💡 ${escapeHtml(issue.fix_suggestion)}
` : ''} ${issue.error_log ? `
${escapeHtml(issue.error_log.substring(0, 300))}
` : ''} diff --git a/tests/test_prometheus_ai_automation_scrape.py b/tests/test_prometheus_ai_automation_scrape.py index c646735..8a7f9bc 100644 --- a/tests/test_prometheus_ai_automation_scrape.py +++ b/tests/test_prometheus_ai_automation_scrape.py @@ -85,5 +85,5 @@ def test_compose_prometheus_blackbox_targets_health_only(): assert "- http://momo-pro-system:80/health" in uat_block assert "- https://mo.wooo.work\n" not in uat_block assert "- http://192.168.0.110:5001\n" not in uat_block - assert "- https://momo.wooo.work/health" in prod_block - assert "- https://momo.wooo.work\n" not in prod_block + assert "- https://mo.wooo.work/health" in prod_block + assert "- https://momo.wooo.work" not in prod_block diff --git a/tests/test_webcrumbs_asset_proxy.py b/tests/test_webcrumbs_asset_proxy.py index d3b8d85..1e2310f 100644 --- a/tests/test_webcrumbs_asset_proxy.py +++ b/tests/test_webcrumbs_asset_proxy.py @@ -9,3 +9,4 @@ def test_webcrumbs_loader_has_safe_fallback_response(): assert "status=200, mimetype='application/javascript'" in source assert "X-Webcrumbs-Fallback" in source assert "upstream-unavailable" in source + assert "Loader upstream unavailable; serving local fallback" in source