Files
ewoooc/scripts/auto-repair/env-sync-monitor.sh
ogt 903cf1a27a
All checks were successful
CD Pipeline / deploy (push) Successful in 1m5s
fix: align deploy health checks with live endpoint
2026-06-25 14:45:02 +08:00

262 lines
8.4 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# =============================================================================
# MOMO Pro System - environment sync monitor script
# Purpose: compare the UAT and GCP environments, raise an alert on
#          differences and attempt an automatic sync/repair.
# Version: 1.0.0
# Date:    2026-02-14
# =============================================================================
# Abort on the first unhandled failing command.
set -e
# Configuration
# SSH target used for every UAT-side command.
UAT_HOST="wooo@192.168.0.110"
# Project / zone / instance passed to `gcloud compute ssh|scp` for the GCP side.
GCP_PROJECT="astral-gateway-484913-d7"
GCP_ZONE="asia-east1-b"
GCP_VM="momo-pro-gcp"
# Telegram Bot API credentials and destination chat used by send_telegram.
# NOTE(review): the token is hard-coded in the script; consider loading it
# from the environment or a secrets file instead.
TELEGRAM_BOT_TOKEN="<TELEGRAM_BOT_TOKEN>"
TELEGRAM_CHAT_ID="5619078117"
# File that log() appends every message to.
LOG_FILE="/var/log/env_sync_monitor.log"
# Scratch report: truncated by main, appended to by the compare_* checks and
# read back when the alert message is built.
DIFF_REPORT="/tmp/env_diff_report.txt"
# Colour output (ANSI escape sequences, expanded by `echo -e` in log()).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'
# Print a timestamped message to stdout and append the same line to $LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text; backslash escapes such as colour codes are
#                 interpreted
# Returns:   exit status of tee
log() {
  local msg="$1"
  local now
  now=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "[${now}] ${msg}" | tee -a "$LOG_FILE"
}
# Send an HTML-formatted message to the configured Telegram chat.
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID (read)
# Arguments: $1 - message text (may contain Telegram HTML markup)
# Outputs:   nothing; curl's stdout/stderr are discarded
# Returns:   curl's exit status
send_telegram() {
  local message="$1"
  # --data-urlencode rather than -d: the message carries arbitrary report text,
  # and characters such as '&', '+', '%' or '=' would otherwise be interpreted
  # as form syntax and truncate or corrupt the "text" field.
  curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
    --data-urlencode "parse_mode=HTML" \
    --data-urlencode "text=${message}" > /dev/null 2>&1
}
# Print the application version deployed on UAT.
# Globals:   UAT_HOST (read)
# Outputs:   contents of /app/VERSION inside deploy/momo-app, or 'unknown'
#            when the remote kubectl command fails
# Returns:   exit status of ssh
get_uat_version() {
  local remote_cmd="sudo kubectl exec deploy/momo-app -n momo -- cat /app/VERSION 2>/dev/null || echo 'unknown'"
  # Quote the host so it always reaches ssh as exactly one argument.
  ssh "${UAT_HOST}" "${remote_cmd}" 2>/dev/null
}
# Print the application version deployed on the GCP VM.
# Globals:   GCP_VM, GCP_ZONE, GCP_PROJECT (read)
# Outputs:   contents of /app/VERSION inside deploy/momo-app, or 'unknown'
#            when the remote kubectl command fails
# Returns:   exit status of gcloud
get_gcp_version() {
  local remote_cmd="sudo kubectl exec deploy/momo-app -n momo -- cat /app/VERSION 2>/dev/null || echo 'unknown'"
  # Every expansion is quoted so each value is passed as a single argument.
  gcloud compute ssh "${GCP_VM}" "--zone=${GCP_ZONE}" "--project=${GCP_PROJECT}" \
    --command="${remote_cmd}" 2>/dev/null
}
# Compare the data of ConfigMap momo-config between UAT and GCP.
# Globals:   UAT_HOST, GCP_VM, GCP_ZONE, GCP_PROJECT (read),
#            DIFF_REPORT (appended on difference or fetch failure)
# Returns:   0 - both ConfigMaps were fetched and are identical
#            1 - the ConfigMaps differ, or the GCP side could not be fetched
#            2 - the UAT side could not be fetched, so there is nothing to
#                compare against
compare_configmaps() {
  log "📋 比對 ConfigMap..."

  local remote_cmd="sudo kubectl get configmap momo-config -n momo -o jsonpath='{.data}' 2>/dev/null"
  local uat_raw="" gcp_raw="" uat_json gcp_json
  local uat_ok=1 gcp_ok=1

  # Fetch each side on its own so a failed ssh/kubectl is detected explicitly.
  # Piping straight into jq hides the failure: jq exits 0 on empty input, so
  # two failed fetches would compare as "identical".
  uat_raw=$(ssh "${UAT_HOST}" "${remote_cmd}") || uat_ok=0
  gcp_raw=$(gcloud compute ssh "${GCP_VM}" "--zone=${GCP_ZONE}" "--project=${GCP_PROJECT}" \
    --command="${remote_cmd}") || gcp_ok=0

  if (( ! uat_ok )); then
    log "${RED}❌ 無法取得 UAT ConfigMap${NC}"
    echo "ConfigMap fetch failed - UAT" >> "$DIFF_REPORT"
    return 2
  fi
  if (( ! gcp_ok )); then
    log "${RED}❌ 無法取得 GCP ConfigMap${NC}"
    echo "ConfigMap fetch failed - GCP" >> "$DIFF_REPORT"
    return 1
  fi

  # Normalise with sorted keys; unparsable or empty output falls back to {}.
  uat_json=$(jq -S '.' <<<"$uat_raw" 2>/dev/null) || uat_json=""
  gcp_json=$(jq -S '.' <<<"$gcp_raw" 2>/dev/null) || gcp_json=""
  [[ -n "$uat_json" ]] || uat_json="{}"
  [[ -n "$gcp_json" ]] || gcp_json="{}"

  if [[ "$uat_json" == "$gcp_json" ]]; then
    log "${GREEN}✅ ConfigMap 一致${NC}"
    return 0
  fi

  log "${RED}❌ ConfigMap 不一致${NC}"
  # Process substitution instead of fixed /tmp file names: nothing predictable
  # is written to a world-writable directory.
  diff <(printf '%s\n' "$uat_json") <(printf '%s\n' "$gcp_json") >> "$DIFF_REPORT" 2>/dev/null || true
  return 1
}
# Check that the container image of deployment momo-app can be read on both
# environments.
# Image names may legitimately differ between environments (for example a
# :latest vs :local tag), so this only verifies that both sides report an
# image; the values are logged but not compared.
# Globals:   UAT_HOST, GCP_VM, GCP_ZONE, GCP_PROJECT (read)
# Returns:   0 when both images were obtained, 1 otherwise
compare_images() {
  log "🐳 比對容器映像版本..."

  local remote_cmd="sudo kubectl get deploy momo-app -n momo -o jsonpath='{.spec.template.spec.containers[0].image}'"
  local uat_image gcp_image

  uat_image=$(ssh "${UAT_HOST}" "${remote_cmd}" 2>/dev/null) || uat_image=""
  gcp_image=$(gcloud compute ssh "${GCP_VM}" "--zone=${GCP_ZONE}" "--project=${GCP_PROJECT}" \
    --command="${remote_cmd}" 2>/dev/null) || gcp_image=""

  # A command that succeeds but prints nothing is as useless as one that
  # fails, so both cases collapse to "unknown".
  uat_image=${uat_image:-unknown}
  gcp_image=${gcp_image:-unknown}

  log " UAT: ${uat_image}"
  log " GCP: ${gcp_image}"

  if [[ "$uat_image" == "unknown" || "$gcp_image" == "unknown" ]]; then
    log "${RED}❌ 無法取得映像資訊${NC}"
    return 1
  fi
  return 0
}
# Probe the HTTP health endpoint of both environments.
# Globals:   UAT_HEALTH_URL, GCP_HEALTH_URL (read, optional overrides),
#            HEALTH_TIMEOUT (read, optional; seconds per probe, default 30),
#            DIFF_REPORT (appended on failure)
# Returns:   0 when both probes answer HTTP 200, 1 otherwise
# NOTE(review): both URLs default to the same address, so without overrides
# the two probes hit one endpoint and cannot tell the environments apart.
# Confirm whether UAT has its own health URL and set UAT_HEALTH_URL if so.
compare_health() {
  log "💓 比對服務健康狀態..."

  local uat_url="${UAT_HEALTH_URL:-https://mo.wooo.work/health}"
  local gcp_url="${GCP_HEALTH_URL:-https://mo.wooo.work/health}"
  local timeout="${HEALTH_TIMEOUT:-30}"
  local uat_health gcp_health

  # --max-time bounds each probe so an unresponsive endpoint cannot hang the
  # monitor. A failed probe must not abort the check, hence "|| true"; curl
  # still prints 000 as the status code in that case.
  uat_health=$(curl -s -o /dev/null --max-time "$timeout" -w '%{http_code}' "$uat_url" 2>/dev/null) || true
  gcp_health=$(curl -s -o /dev/null --max-time "$timeout" -w '%{http_code}' "$gcp_url" 2>/dev/null) || true

  log " UAT: ${uat_health}"
  log " GCP: ${gcp_health}"

  if [[ "$uat_health" != "200" || "$gcp_health" != "200" ]]; then
    log "${RED}❌ 健康檢查異常${NC}"
    echo "Health check failed - UAT: ${uat_health}, GCP: ${gcp_health}" >> "$DIFF_REPORT"
    return 1
  fi

  log "${GREEN}✅ 服務健康${NC}"
  return 0
}
# Check that enough pods are Running in namespace momo on both environments.
# Globals:   UAT_HOST, GCP_VM, GCP_ZONE, GCP_PROJECT (read),
#            DIFF_REPORT (appended on failure)
# Returns:   0 when each side has at least 3 Running pods, 1 otherwise
compare_pods() {
  log "🔍 比對 Pod 狀態..."

  # The remote side prints "<pod-name> <status>" per line.
  local remote_cmd="sudo kubectl get pods -n momo --no-headers 2>/dev/null | awk '{print \$1, \$3}'"
  local -r min_running=3
  local uat_pods gcp_pods uat_running gcp_running

  uat_pods=$(ssh "${UAT_HOST}" "${remote_cmd}" 2>/dev/null) || true
  gcp_pods=$(gcloud compute ssh "${GCP_VM}" "--zone=${GCP_ZONE}" "--project=${GCP_PROJECT}" \
    --command="${remote_cmd}" 2>/dev/null) || true

  # Count on the status column and always print exactly one integer.
  # `grep -c ... || echo 0` must not be used here: grep -c prints "0" AND
  # exits 1 when nothing matches, yielding "0<newline>0", which makes the
  # numeric test below error out and the check pass with zero running pods.
  uat_running=$(awk '$2 == "Running" { n++ } END { print n + 0 }' <<<"$uat_pods")
  gcp_running=$(awk '$2 == "Running" { n++ } END { print n + 0 }' <<<"$gcp_pods")

  log " UAT Running Pods: ${uat_running}"
  log " GCP Running Pods: ${gcp_running}"

  if (( uat_running < min_running || gcp_running < min_running )); then
    log "${YELLOW}⚠️ Pod 數量異常${NC}"
    echo "Pod count mismatch - UAT: ${uat_running}, GCP: ${gcp_running}" >> "$DIFF_REPORT"
    return 1
  fi
  return 0
}
# Compare MD5 checksums of critical application files between UAT and GCP.
# Globals:   UAT_HOST, GCP_VM, GCP_ZONE, GCP_PROJECT (read),
#            DIFF_REPORT (appended per problematic file)
# Returns:   0 when every file has the same checksum on both sides,
#            1 when any checksum differs or could not be obtained
compare_files() {
  log "📁 比對關鍵檔案..."

  local -a critical_files=(
    "app.py"
    "config.py"
    "requirements.txt"
  )
  local mismatch=0
  local file remote_cmd uat_hash gcp_hash

  for file in "${critical_files[@]}"; do
    remote_cmd="sudo kubectl exec deploy/momo-app -n momo -- md5sum /app/${file} 2>/dev/null | awk '{print \$1}'"
    uat_hash=$(ssh "${UAT_HOST}" "${remote_cmd}" 2>/dev/null) || uat_hash=""
    gcp_hash=$(gcloud compute ssh "${GCP_VM}" "--zone=${GCP_ZONE}" "--project=${GCP_PROJECT}" \
      --command="${remote_cmd}" 2>/dev/null) || gcp_hash=""

    # The remote pipeline ends in awk, so a failing md5sum still exits 0 with
    # empty output; treat "no checksum" the same as a failed command.
    uat_hash=${uat_hash:-none}
    gcp_hash=${gcp_hash:-none}

    if [[ "$uat_hash" == "none" || "$gcp_hash" == "none" ]]; then
      # Two missing checksums must never count as "identical".
      log "${RED} ❌ ${file}: 無法取得 checksum (UAT=${uat_hash:0:8} GCP=${gcp_hash:0:8})${NC}"
      echo "File checksum unavailable: ${file}" >> "$DIFF_REPORT"
      mismatch=1
    elif [[ "$uat_hash" != "$gcp_hash" ]]; then
      log "${YELLOW} ⚠️ ${file}: UAT=${uat_hash:0:8}... GCP=${gcp_hash:0:8}...${NC}"
      echo "File mismatch: ${file}" >> "$DIFF_REPORT"
      mismatch=1
    else
      log "${file}: 一致"
    fi
  done

  return "$mismatch"
}
# Copy ConfigMap momo-config from UAT to GCP and restart deployment momo-app.
# Globals:   UAT_HOST, GCP_VM, GCP_ZONE, GCP_PROJECT (read)
# Returns:   0 when export, transfer, apply and restart all succeeded,
#            1 as soon as any step fails (later steps are skipped)
# NOTE(review): the manifest is exported with `-o yaml` as-is, so it still
# carries cluster-specific metadata (resourceVersion, uid, ...). Verify that
# `kubectl apply` on the GCP cluster accepts it.
auto_sync_configmap() {
  log "🔄 嘗試自動同步 ConfigMap..."

  local remote_path="/tmp/sync_configmap.yaml"
  local manifest
  # Unpredictable local file name instead of a fixed path under /tmp.
  manifest=$(mktemp "${TMPDIR:-/tmp}/sync_configmap.XXXXXX") || {
    log "${RED}❌ 無法建立暫存檔,已取消同步${NC}"
    return 1
  }

  # Export the UAT ConfigMap. Every step is checked explicitly so the function
  # stops on failure even when the caller runs it where `set -e` is ignored.
  if ! ssh "${UAT_HOST}" "sudo kubectl get configmap momo-config -n momo -o yaml" > "$manifest" 2>/dev/null; then
    log "${RED}❌ 無法從 UAT 匯出 ConfigMap,已取消同步${NC}"
    rm -f -- "$manifest"
    return 1
  fi
  # Never push an empty manifest.
  if [[ ! -s "$manifest" ]]; then
    log "${RED}❌ UAT 匯出的 ConfigMap 為空,已取消同步${NC}"
    rm -f -- "$manifest"
    return 1
  fi

  # Transfer through gcloud.
  if ! gcloud compute scp "$manifest" "${GCP_VM}:${remote_path}" \
      "--zone=${GCP_ZONE}" "--project=${GCP_PROJECT}"; then
    log "${RED}❌ 無法傳送 ConfigMap 到 GCP${NC}"
    rm -f -- "$manifest"
    return 1
  fi
  rm -f -- "$manifest"

  # Apply and restart the deployment so pods pick up the new configuration.
  if ! gcloud compute ssh "${GCP_VM}" "--zone=${GCP_ZONE}" "--project=${GCP_PROJECT}" \
      --command="sudo kubectl apply -f ${remote_path} && sudo kubectl rollout restart deployment/momo-app -n momo"; then
    log "${RED}❌ ConfigMap 套用失敗${NC}"
    return 1
  fi

  log "${GREEN}✅ ConfigMap 同步完成${NC}"
}
# Main routine: run every environment check, try to repair a ConfigMap
# difference, and report the outcome through the log and Telegram.
# Globals:   DIFF_REPORT (truncated, then read), RED/GREEN/YELLOW/NC (read)
# Returns:   0 once the run has completed; check results are reported via
#            log and Telegram rather than through the exit status
main() {
  log "=========================================="
  log "🔍 開始環境同步檢查"
  log "=========================================="

  # Start from an empty report.
  : > "$DIFF_REPORT"

  local has_diff=0
  local configmap_rc=0 sync_rc=0 errexit_was_on=0

  # Run the individual checks.
  compare_health || has_diff=1
  compare_pods || has_diff=1

  compare_configmaps || configmap_rc=$?
  if (( configmap_rc != 0 )); then
    has_diff=1
  fi
  # Auto-sync only on a reported difference (status 1); any other non-zero
  # status is recorded as a difference without attempting a sync.
  if (( configmap_rc == 1 )); then
    # Run the sync in a subshell with errexit active, while errexit is
    # suspended in this shell: a failing step still stops the sync at once,
    # but no longer kills the whole run before the alert below is sent.
    if [[ $- == *e* ]]; then
      errexit_was_on=1
    fi
    set +e
    ( set -e; auto_sync_configmap )
    sync_rc=$?
    if (( errexit_was_on )); then
      set -e
    fi
    if (( sync_rc != 0 )); then
      log "${YELLOW}⚠️ ConfigMap 自動同步失敗 (exit ${sync_rc})${NC}"
      echo "ConfigMap auto-sync failed (exit ${sync_rc})" >> "$DIFF_REPORT"
    fi
  fi

  compare_images || has_diff=1
  compare_files || has_diff=1

  # Send the report.
  if (( has_diff )); then
    local report alert
    report=$(cat "$DIFF_REPORT" 2>/dev/null) || report=""
    [[ -n "$report" ]] || report="詳細資訊請查看日誌"
    # Truncate first, then escape, so an HTML entity is never cut in half.
    # The report holds raw diff output whose lines start with '<' and '>';
    # unescaped, those are read as markup in an HTML-mode message.
    report=${report:0:500}
    report=$(printf '%s' "$report" \
      | sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g')

    alert="🔴 <b>【環境同步告警】</b>
發現 UAT 與 GCP 環境不一致!
<b>時間:</b> $(date '+%Y-%m-%d %H:%M:%S')
<b>差異項目:</b>
<code>${report}</code>
<b>建議動作:</b>
1. 檢查 CI/CD Pipeline 是否正常
2. 確認最近的部署操作
3. 執行 <code>./scripts/auto-repair/env-sync-monitor.sh --sync</code> 手動同步"

    if send_telegram "$alert"; then
      log "${RED}❌ 環境檢查發現差異,已發送告警${NC}"
    else
      log "${RED}❌ 環境檢查發現差異,但告警發送失敗${NC}"
    fi
  else
    log "${GREEN}✅ 環境檢查通過UAT 與 GCP 完全一致${NC}"
    # Send the daily status report during the 09:00 hour.
    if [[ "$(date '+%H')" == "09" ]]; then
      send_telegram "🟢 <b>【環境同步報告】</b>
✅ UAT 與 GCP 環境完全同步
<b>檢查時間:</b> $(date '+%Y-%m-%d %H:%M:%S')
<b>UAT:</b> https://mo.wooo.work
<b>正式入口:</b> https://mo.wooo.work
所有配置、版本、服務狀態一致。" || log "${YELLOW}⚠️ 狀態報告發送失敗${NC}"
    fi
  fi

  log "=========================================="
  log "🏁 環境同步檢查完成"
  log "=========================================="
}
# Command-line dispatch. With no argument the default action is "check".
case "${1:-check}" in
  check) main ;;
  --sync)
    log "🔄 執行強制同步..."
    auto_sync_configmap
    ;;
  --help)
    printf '%s\n' \
      "用法: $0 [check|--sync|--help]" \
      " check - 檢查環境差異(預設)" \
      " --sync - 強制同步 UAT 配置到 GCP" \
      " --help - 顯示此幫助"
    ;;
  *)
    echo "未知選項: $1"
    exit 1
    ;;
esac