fix: align deploy health checks with live endpoint
All checks were successful
CD Pipeline / deploy (push) Successful in 1m5s

This commit is contained in:
ogt
2026-06-25 14:45:02 +08:00
parent 83561c8530
commit 903cf1a27a
12 changed files with 76 additions and 45 deletions

View File

@@ -402,7 +402,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '')
# ========================================== # ==========================================
# 系統版本與路徑 # 系統版本與路徑
# ========================================== # ==========================================
SYSTEM_VERSION = "V10.676" SYSTEM_VERSION = "V10.677"
LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log') LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log')
public_url = PUBLIC_URL # 用於模板顯示 public_url = PUBLIC_URL # 用於模板顯示

View File

@@ -14,8 +14,8 @@ SERVICE=$(echo "$QUERY_STRING" | sed -n 's/.*service=\([^&]*\).*/\1/p')
# 定義服務健康檢查 URL # 定義服務健康檢查 URL
declare -A HEALTH_URLS=( declare -A HEALTH_URLS=(
["momo-uat"]="https://mo.wooo.work/health" ["momo-live"]="https://mo.wooo.work/health"
["momo-gcp"]="https://momo.wooo.work/health" ["momo-prod"]="https://mo.wooo.work/health"
["gitlab"]="http://127.0.0.1:8929/" ["gitlab"]="http://127.0.0.1:8929/"
["registry"]="http://127.0.0.1:5002/v2/" ["registry"]="http://127.0.0.1:5002/v2/"
["n8n"]="http://127.0.0.1:5678/" ["n8n"]="http://127.0.0.1:5678/"

View File

@@ -27,9 +27,9 @@ check_service() {
echo '{"services": {' echo '{"services": {'
# 核心服務 # 核心服務
check_service "momo-uat" "https://mo.wooo.work/health" check_service "momo-live" "https://mo.wooo.work/health"
echo "," echo ","
check_service "momo-gcp" "https://momo.wooo.work/health" check_service "momo-prod" "https://mo.wooo.work/health"
echo "," echo ","
# 開發工具 # 開發工具

View File

@@ -119,7 +119,7 @@ scrape_configs:
module: [http_2xx] module: [http_2xx]
static_configs: static_configs:
- targets: - targets:
- https://momo.wooo.work/health - https://mo.wooo.work/health
labels: labels:
env: 'prod' env: 'prod'
probe_type: 'http' probe_type: 'http'
@@ -268,24 +268,6 @@ scrape_configs:
- target_label: __address__ - target_label: __address__
replacement: blackbox-exporter:9115 replacement: blackbox-exporter:9115
- job_name: 'blackbox-dns-momo'
metrics_path: /probe
params:
module: [dns_check_momo]
static_configs:
- targets:
- 8.8.8.8 # Google DNS - momo.wooo.work
labels:
domain: 'momo.wooo.work'
probe_type: 'dns'
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-exporter:9115
# =========================================================================== # ===========================================================================
# 監控系統自身 # 監控系統自身
# =========================================================================== # ===========================================================================

View File

@@ -753,3 +753,4 @@ POSTGRES_HOST=momo-db
| 2026-06-25 | 共用成長流程列手機版不可溢出畫面 | V10.674 起全站 `momo-growth-rail` 在手機寬度改為換行呈現,避免「評估 / 分析 / 建議 / 解法 / 治理」流程 chip 超出視覺邊界。 | | 2026-06-25 | 共用成長流程列手機版不可溢出畫面 | V10.674 起全站 `momo-growth-rail` 在手機寬度改為換行呈現,避免「評估 / 分析 / 建議 / 解法 / 治理」流程 chip 超出視覺邊界。 |
| 2026-06-25 | 匯入、缺貨、設定與通知模板頁不可外露 SQL / 資料表 / 模板代碼 | V10.675 起匯入 job API 對前台回傳白話處置訊息,保留 raw error 於 DB/log自動匯入失敗通知不再顯示 `psycopg2``daily_sales_snapshot``snapshot_date` 等內部字串;缺貨首頁、系統設定與通知模板列表改用營運語言,並補上逾時匯入任務重置與取消 API。 | | 2026-06-25 | 匯入、缺貨、設定與通知模板頁不可外露 SQL / 資料表 / 模板代碼 | V10.675 起匯入 job API 對前台回傳白話處置訊息,保留 raw error 於 DB/log自動匯入失敗通知不再顯示 `psycopg2``daily_sales_snapshot``snapshot_date` 等內部字串;缺貨首頁、系統設定與通知模板列表改用營運語言,並補上逾時匯入任務重置與取消 API。 |
| 2026-06-25 | 觀測台入口與通知預覽不可用工程主語干擾營運判讀 | V10.676 起觀測台導覽統一使用「AI 分工矩陣」,通知模板列表會把 K8s/Pod/資料庫/CI Pipeline 等內部詞轉成服務健康、資料連線與部署流程;主機健康事件與自癒劇本改顯示任務/問題/處置提醒,不直接露 `unknown_task``scheduler_task_failure``CODE_FIX` 等 raw code。 | | 2026-06-25 | 觀測台入口與通知預覽不可用工程主語干擾營運判讀 | V10.676 起觀測台導覽統一使用「AI 分工矩陣」,通知模板列表會把 K8s/Pod/資料庫/CI Pipeline 等內部詞轉成服務健康、資料連線與部署流程;主機健康事件與自癒劇本改顯示任務/問題/處置提醒,不直接露 `unknown_task``scheduler_task_failure``CODE_FIX` 等 raw code。 |
| 2026-06-25 | 部署監控不得用退役正式域名判定失敗 | V10.677 起 CI/CD 狀態 API 與 active blackbox 監控預設以 `PUBLIC_URL` / `PROD_BASE_URL` 對齊現行正式入口 `https://mo.wooo.work/health`,不再把 `momo.wooo.work` timeout 判成正式部署失敗;Webcrumbs loader fallback 也改為資訊級降級訊號,避免健康頁與 log 產生假紅燈。 |

View File

@@ -11,6 +11,7 @@ import json
import logging import logging
import os import os
import re import re
from urllib.parse import urlparse
cicd_bp = Blueprint('cicd', __name__) cicd_bp = Blueprint('cicd', __name__)
cicd_log = logging.getLogger('cicd_routes') cicd_log = logging.getLogger('cicd_routes')
@@ -97,14 +98,41 @@ if not GITLAB_ENABLED:
) )
# 環境配置 # 環境配置
def _normalize_base_url(value, fallback='https://mo.wooo.work'):
candidate = (value or '').strip().rstrip('/') or fallback
parsed = urlparse(candidate)
if parsed.scheme not in {'http', 'https'} or not parsed.netloc:
return fallback.rstrip('/')
return candidate
def _health_endpoint_for(base_url):
return f"{base_url.rstrip('/')}/health"
# Public entry base URL, read from the PUBLIC_URL environment variable;
# a missing or invalid value falls back to https://mo.wooo.work.
PUBLIC_BASE_URL = _normalize_base_url(os.getenv('PUBLIC_URL'), 'https://mo.wooo.work')
# Base URL for the 'uat' environment entry. Precedence: the first non-empty of
# CICD_UAT_BASE_URL, MOMO_BASE_URL, then PUBLIC_BASE_URL. A value that does not
# normalize to an http(s) URL is replaced by PUBLIC_BASE_URL.
CICD_UAT_BASE_URL = _normalize_base_url(
os.getenv('CICD_UAT_BASE_URL')
or os.getenv('MOMO_BASE_URL')
or PUBLIC_BASE_URL,
PUBLIC_BASE_URL,
)
# Base URL for the 'prod' environment entry. Precedence: the first non-empty of
# CICD_PROD_BASE_URL, PROD_BASE_URL, then PUBLIC_BASE_URL. A value that does not
# normalize to an http(s) URL is replaced by PUBLIC_BASE_URL.
CICD_PROD_BASE_URL = _normalize_base_url(
os.getenv('CICD_PROD_BASE_URL')
or os.getenv('PROD_BASE_URL')
or PUBLIC_BASE_URL,
PUBLIC_BASE_URL,
)
ENVIRONMENTS = { ENVIRONMENTS = {
'uat': { 'uat': {
'name': 'UAT', 'name': 'LIVE',
'label': '測試環境', 'label': '線上入口',
'color': '#3498db', 'color': '#3498db',
'icon': '🟦', 'icon': '🟦',
'url': 'https://mo.wooo.work', 'url': CICD_UAT_BASE_URL,
'health_endpoint': 'https://mo.wooo.work/health', 'health_endpoint': _health_endpoint_for(CICD_UAT_BASE_URL),
'runtime_host': '192.168.0.188' 'runtime_host': '192.168.0.188'
}, },
'prod': { 'prod': {
@@ -112,12 +140,21 @@ ENVIRONMENTS = {
'label': '正式環境', 'label': '正式環境',
'color': '#e74c3c', 'color': '#e74c3c',
'icon': '🟥', 'icon': '🟥',
'url': 'https://momo.wooo.work', 'url': CICD_PROD_BASE_URL,
'health_endpoint': 'https://momo.wooo.work/health', 'health_endpoint': _health_endpoint_for(CICD_PROD_BASE_URL),
'runtime_host': '192.168.0.188' 'runtime_host': '192.168.0.188'
} }
} }
def _public_health_error(exc):
text = str(exc or '').lower()
if 'timeout' in text or 'timed out' in text:
return '健康檢查逾時請確認正式入口、Nginx 與 188 應用容器狀態。'
if 'connection' in text or 'refused' in text or 'max retries' in text:
return '健康檢查無法連線請確認正式入口、Nginx 與 188 應用容器狀態。'
return '健康檢查暫時無法完成,請稍後重試或查看部署診斷。'
# ============================================================================= # =============================================================================
# 部署監控頁面 # 部署監控頁面
# ============================================================================= # =============================================================================
@@ -171,7 +208,7 @@ def get_cicd_status():
'type': 'environment', 'type': 'environment',
'environment': env_id, 'environment': env_id,
'message': f"{env_status.get('name')} 環境異常", 'message': f"{env_status.get('name')} 環境異常",
'error': env_status.get('error'), 'error': env_status.get('display_error') or env_status.get('error'),
'severity': 'critical', 'severity': 'critical',
'auto_fixable': True, 'auto_fixable': True,
'fix_action': 'diagnose' 'fix_action': 'diagnose'
@@ -259,7 +296,7 @@ def get_pipeline_detail(pipeline_id):
except Exception as e: except Exception as e:
return jsonify({ return jsonify({
'success': False, 'success': False,
'error': str(e) 'error': '部署監控暫時無法完成,請稍後重試或查看服務健康狀態。'
}), 500 }), 500
@@ -448,7 +485,7 @@ def run_diagnosis(env):
diagnosis['checks'].append({ diagnosis['checks'].append({
'name': '健康端點', 'name': '健康端點',
'status': 'failed', 'status': 'failed',
'error': str(e) 'error': _public_health_error(e)
}) })
# EwoooC 已撤除舊叢集 runtime這裡只保留現行 Docker Compose 狀態說明。 # EwoooC 已撤除舊叢集 runtime這裡只保留現行 Docker Compose 狀態說明。
@@ -473,7 +510,7 @@ def run_diagnosis(env):
diagnosis['checks'].append({ diagnosis['checks'].append({
'name': 'Registry', 'name': 'Registry',
'status': 'failed', 'status': 'failed',
'error': str(e) 'error': '映像倉庫健康檢查暫時無法完成,請查看 Registry 服務狀態。'
}) })
# 生成總結 # 生成總結
@@ -501,7 +538,7 @@ def run_diagnosis(env):
}) })
except Exception as e: except Exception as e:
diagnosis['error'] = str(e) diagnosis['error'] = '部署診斷暫時無法完成,請稍後重試。'
return diagnosis return diagnosis
@@ -877,7 +914,13 @@ def get_environment_status(env_id):
status['response_time'] = round(response_time, 2) status['response_time'] = round(response_time, 2)
status['last_check'] = datetime.now().isoformat() status['last_check'] = datetime.now().isoformat()
except Exception as e: except Exception as e:
status['error'] = str(e) cicd_log.warning(
"[CI/CD] Health check failed env=%s url=%s error=%s",
env_id,
env_config.get('health_endpoint'),
e,
)
status['error'] = _public_health_error(e)
status['runtime_note'] = 'Docker Compose on 192.168.0.188; legacy cluster probes disabled.' status['runtime_note'] = 'Docker Compose on 192.168.0.188; legacy cluster probes disabled.'

View File

@@ -217,15 +217,20 @@ def webcrumbs_asset_proxy(asset_path):
try: try:
upstream_response = requests.get(upstream_url, timeout=(2, 8)) upstream_response = requests.get(upstream_url, timeout=(2, 8))
except requests.RequestException as exc: except requests.RequestException as exc:
sys_log.warning(f"[Webcrumbs] Asset proxy failed: {normalized_path} -> {exc}")
if normalized_path == WEBCRUMBS_COMPATIBLE_LOADER_PATH: if normalized_path == WEBCRUMBS_COMPATIBLE_LOADER_PATH:
sys_log.info("[Webcrumbs] Loader upstream unavailable; serving local fallback")
return _webcrumbs_fallback_loader_response('upstream-unavailable') return _webcrumbs_fallback_loader_response('upstream-unavailable')
sys_log.warning(f"[Webcrumbs] Asset proxy failed: {normalized_path} -> {exc}")
return Response('Webcrumbs asset upstream unavailable', status=502, mimetype='text/plain') return Response('Webcrumbs asset upstream unavailable', status=502, mimetype='text/plain')
if upstream_response.status_code >= 400: if upstream_response.status_code >= 400:
sys_log.warning(f"[Webcrumbs] Asset proxy returned {upstream_response.status_code}: {normalized_path}")
if normalized_path == WEBCRUMBS_COMPATIBLE_LOADER_PATH: if normalized_path == WEBCRUMBS_COMPATIBLE_LOADER_PATH:
sys_log.info(
"[Webcrumbs] Loader upstream returned %s; serving local fallback",
upstream_response.status_code,
)
return _webcrumbs_fallback_loader_response(f'upstream-{upstream_response.status_code}') return _webcrumbs_fallback_loader_response(f'upstream-{upstream_response.status_code}')
sys_log.warning(f"[Webcrumbs] Asset proxy returned {upstream_response.status_code}: {normalized_path}")
return Response('Webcrumbs asset upstream returned error', status=upstream_response.status_code, mimetype='text/plain') return Response('Webcrumbs asset upstream returned error', status=upstream_response.status_code, mimetype='text/plain')
content_type = upstream_response.headers.get('Content-Type') content_type = upstream_response.headers.get('Content-Type')

View File

@@ -97,7 +97,7 @@ compare_health() {
log "💓 比對服務健康狀態..." log "💓 比對服務健康狀態..."
UAT_HEALTH=$(curl -s -o /dev/null -w '%{http_code}' "https://mo.wooo.work/health" 2>/dev/null) UAT_HEALTH=$(curl -s -o /dev/null -w '%{http_code}' "https://mo.wooo.work/health" 2>/dev/null)
GCP_HEALTH=$(curl -s -o /dev/null -w '%{http_code}' "https://momo.wooo.work/health" 2>/dev/null) GCP_HEALTH=$(curl -s -o /dev/null -w '%{http_code}' "https://mo.wooo.work/health" 2>/dev/null)
log " UAT: ${UAT_HEALTH}" log " UAT: ${UAT_HEALTH}"
log " GCP: ${GCP_HEALTH}" log " GCP: ${GCP_HEALTH}"
@@ -228,7 +228,7 @@ main() {
<b>檢查時間:</b> $(date '+%Y-%m-%d %H:%M:%S') <b>檢查時間:</b> $(date '+%Y-%m-%d %H:%M:%S')
<b>UAT:</b> https://mo.wooo.work <b>UAT:</b> https://mo.wooo.work
<b>GCP:</b> https://momo.wooo.work <b>正式入口:</b> https://mo.wooo.work
所有配置、版本、服務狀態一致。" 所有配置、版本、服務狀態一致。"
fi fi

View File

@@ -34,8 +34,7 @@ NC='\033[0m'
declare -A DOMAINS=( declare -A DOMAINS=(
# 核心業務 - 最高優先級 # 核心業務 - 最高優先級
["https://mo.wooo.work/health"]="200|MOMO App UAT|kubectl rollout restart deployment/momo-app -n momo|60" ["https://mo.wooo.work/health"]="200|MOMO Pro 正式入口|ssh ollama@192.168.0.188 'cd /home/ollama/momo-pro && docker compose up -d --no-deps --force-recreate momo-app'|60"
["https://momo.wooo.work/health"]="200|MOMO App GCP|gcloud compute ssh momo-pro-gcp --zone=asia-east1-b --command='sudo kubectl rollout restart deployment/momo-app -n momo'|60"
# CI/CD 工具 - 高優先級GitLab 需要更長啟動時間) # CI/CD 工具 - 高優先級GitLab 需要更長啟動時間)
["http://192.168.0.110:8929/"]="200|GitLab|docker restart wooo-gitlab|120" ["http://192.168.0.110:8929/"]="200|GitLab|docker restart wooo-gitlab|120"

View File

@@ -718,7 +718,7 @@
<div class="issue-detail"> <div class="issue-detail">
${issue.type === 'job' ? `<span class="badge bg-secondary me-1">${displayStageName(issue.stage)}</span>` : ''} ${issue.type === 'job' ? `<span class="badge bg-secondary me-1">${displayStageName(issue.stage)}</span>` : ''}
${issue.type === 'runtime' ? `<span class="badge bg-info me-1">${issue.environment?.toUpperCase()}</span>` : ''} ${issue.type === 'runtime' ? `<span class="badge bg-info me-1">${issue.environment?.toUpperCase()}</span>` : ''}
${issue.error ? `<br><code>${escapeHtml(issue.error.substring(0, 100))}</code>` : ''} ${issue.error ? `<br><span class="text-muted">${escapeHtml(issue.error.substring(0, 120))}</span>` : ''}
</div> </div>
${issue.fix_suggestion ? `<div class="issue-suggestion">💡 ${escapeHtml(issue.fix_suggestion)}</div>` : ''} ${issue.fix_suggestion ? `<div class="issue-suggestion">💡 ${escapeHtml(issue.fix_suggestion)}</div>` : ''}
${issue.error_log ? `<div class="error-log-preview">${escapeHtml(issue.error_log.substring(0, 300))}</div>` : ''} ${issue.error_log ? `<div class="error-log-preview">${escapeHtml(issue.error_log.substring(0, 300))}</div>` : ''}

View File

@@ -85,5 +85,5 @@ def test_compose_prometheus_blackbox_targets_health_only():
assert "- http://momo-pro-system:80/health" in uat_block assert "- http://momo-pro-system:80/health" in uat_block
assert "- https://mo.wooo.work\n" not in uat_block assert "- https://mo.wooo.work\n" not in uat_block
assert "- http://192.168.0.110:5001\n" not in uat_block assert "- http://192.168.0.110:5001\n" not in uat_block
assert "- https://momo.wooo.work/health" in prod_block assert "- https://mo.wooo.work/health" in prod_block
assert "- https://momo.wooo.work\n" not in prod_block assert "- https://momo.wooo.work" not in prod_block

View File

@@ -9,3 +9,4 @@ def test_webcrumbs_loader_has_safe_fallback_response():
assert "status=200, mimetype='application/javascript'" in source assert "status=200, mimetype='application/javascript'" in source
assert "X-Webcrumbs-Fallback" in source assert "X-Webcrumbs-Fallback" in source
assert "upstream-unavailable" in source assert "upstream-unavailable" in source
assert "Loader upstream unavailable; serving local fallback" in source