Files
ewoooc/scripts/domain-health-monitor.sh
ogt 903cf1a27a
All checks were successful
CD Pipeline / deploy (push) Successful in 1m5s
fix: align deploy health checks with live endpoint
2026-06-25 14:45:02 +08:00

249 lines
8.5 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# =============================================================================
# WOOO TECH - 域名健康監控腳本
# 功能:監控所有域名狀態 => 告警 => 自動修復
# 使用:./scripts/domain-health-monitor.sh
# Cron: */5 * * * * /home/wooo/momo_pro_system/scripts/domain-health-monitor.sh
# =============================================================================
# Exit as soon as a command fails outside of a tested context (if/while/&&/||).
set -e
# Configuration
# Telegram Bot API credentials and destination chat used by send_telegram.
TELEGRAM_BOT_TOKEN="<TELEGRAM_BOT_TOKEN>"
TELEGRAM_CHAT_ID="5619078117"
# File that log() appends every log line to.
LOG_FILE="/home/wooo/logs/domain_health_monitor.log"
# PID file used by main to avoid overlapping runs.
LOCK_FILE="/tmp/domain_health_monitor.lock"
# Status file: records the services that were still failing at the end of the
# previous run, so that a "recovered" notification can be sent later.
STATUS_FILE="/tmp/domain_health_status.txt"
# Colour definitions (ANSI escape sequences).
# NOTE(review): none of these are referenced in the visible part of the script.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'
# Monitored endpoints.
#   key   : URL to probe
#   value : expected HTTP status|service name|repair command|wait time (seconds)
# NOTE(review): values are split on '|' in main; the n8n repair command itself
# contains '||', so check that the parser in main keeps that command intact.
# 2026-02-13 fix: n8n and Superset are Docker containers, not K8s.
# Intended priority of the services:
# 1. MOMO App (core business)
# 2. GitLab (CI/CD)
# 3. Registry (image registry)
# 4. n8n (automation)
# 5. Superset (BI)
# 6. Monitoring page
# NOTE(review): main iterates "${!DOMAINS[@]}"; bash does not guarantee that
# associative-array keys come back in declaration order, so the priority above
# is descriptive only.
declare -A DOMAINS=(
# Core business - highest priority
["https://mo.wooo.work/health"]="200|MOMO Pro 正式入口|ssh ollama@192.168.0.188 'cd /home/ollama/momo-pro && docker compose up -d --no-deps --force-recreate momo-app'|60"
# CI/CD tooling - high priority (GitLab needs a longer start-up time)
["http://192.168.0.110:8929/"]="200|GitLab|docker restart wooo-gitlab|120"
["https://registry.wooo.work/v2/"]="401|Docker Registry|cd /home/wooo/devops/registry && docker compose restart|30"
# Automation and BI - medium priority (Docker containers)
# NOTE(review): the n8n repair command embeds a basic-auth password in plain
# text, and auto_repair writes the full command to the log.
["http://127.0.0.1:5678/"]="200|n8n|docker start momo-n8n 2>/dev/null || docker run -d --name momo-n8n --restart unless-stopped -p 5678:5678 -e N8N_BASIC_AUTH_USER=admin -e N8N_BASIC_AUTH_PASSWORD=Wooo_N8n_2026 -e N8N_RUNNERS_DISABLED=true -v n8n_data:/home/node/.n8n n8nio/n8n:latest|30"
["http://127.0.0.1:8088/health"]="200|Superset|cd /home/wooo/momo_pro_system/docker/superset && docker compose up -d|60"
# Monitoring page - lower priority
["https://monitor.wooo.work/"]="200|Monitor 首頁|systemctl reload nginx|10"
)
#######################################
# Send a message to the configured Telegram chat through the Bot API.
# Globals:
#   TELEGRAM_BOT_TOKEN (read), TELEGRAM_CHAT_ID (read)
# Arguments:
#   $1 - message text using Telegram HTML markup. Literal "\n" sequences
#        (backslash followed by n) are converted to real line breaks.
# Outputs:
#   Nothing; curl's output is discarded.
# Returns:
#   curl's exit status (non-zero when the request could not be completed).
#######################################
send_telegram() {
  local message="$1"
  local newline=$'\n'
  # Callers assemble messages with "\n" inside double quotes, which bash keeps
  # as two literal characters; turn them into real newlines before sending.
  message=${message//\\n/$newline}
  # --data-urlencode keeps '&', '+', '%' and newlines in the text from
  # corrupting the form body; --max-time stops a stalled API call from
  # blocking the whole monitoring run.
  curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    --max-time 15 \
    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
    --data-urlencode "parse_mode=HTML" \
    --data-urlencode "text=${message}" > /dev/null 2>&1
}
#######################################
# Write one timestamped log line to stdout and append it to the log file.
# Globals:
#   LOG_FILE (read) - file the line is appended to
# Arguments:
#   $1 - severity label (e.g. INFO, WARN, ERROR)
#   $2 - message text
# Outputs:
#   "[YYYY-mm-dd HH:MM:SS] [level] message" on stdout and in LOG_FILE
#######################################
log() {
  local severity="$1" text="$2"
  printf '[%s] [%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$severity" "$text" \
    | tee -a "$LOG_FILE"
}
#######################################
# Probe a URL and compare the final HTTP status code with the expected one.
# Redirects are followed (-L), so the code compared is that of the last
# response in the chain.
# Arguments:
#   $1 - URL to request
#   $2 - expected HTTP status code (e.g. 200, 401)
# Returns:
#   0 when the status code equals the expected one, 1 otherwise
#   (including any curl failure or timeout).
#######################################
check_health() {
  local url="$1"
  local expected_code="$2"
  local timeout=10
  local response
  # --connect-timeout only bounds the TCP/TLS handshake; --max-time bounds the
  # whole transfer so a server that accepts the connection but never answers
  # cannot hang the monitor.
  # Any curl failure is normalised to "000", which never matches a real status.
  response=$(curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout "$timeout" --max-time "$((timeout * 3))" \
    -L "$url" 2>/dev/null) || response="000"
  [[ "$response" == "$expected_code" ]]
}
#######################################
# Run the repair command configured for a service.
# Globals:
#   KUBECONFIG (written, exported) - set before the command runs
# Arguments:
#   $1 - service name (used in log messages only)
#   $2 - repair command, a shell snippet taken from the DOMAINS table
# Outputs:
#   The command's stdout and stderr, merged onto stdout; log lines via log().
# Returns:
#   0 when the command exits successfully, 1 otherwise.
#######################################
auto_repair() {
  local service="$1"
  local repair_cmd="$2"
  # NOTE(review): the full command is logged, including any credentials
  # embedded in it.
  log "REPAIR" "嘗試修復 $service: $repair_cmd"
  # Set KUBECONFIG for repair commands that need it.
  export KUBECONFIG=/home/wooo/.kube/config
  # Run the snippet in a subshell: a `cd` or `exit` inside a repair command
  # must not change the monitor's working directory or terminate the monitor.
  # eval is acceptable here only because the commands come from the script's
  # own DOMAINS table, never from external input.
  if ( eval "$repair_cmd" ) 2>&1; then
    log "REPAIR" "$service 修復命令已執行"
    return 0
  else
    log "ERROR" "$service 修復命令執行失敗"
    return 1
  fi
}
#######################################
# Main program: probe every entry of DOMAINS, try the configured repair for
# each failing service, send Telegram alert / recovery notifications and
# persist the list of services that are still failing.
# Globals:
#   DOMAINS (read), LOCK_FILE (read; file created and removed on exit),
#   STATUS_FILE (read; file rewritten)
# Arguments:
#   None used.
# Returns:
#   Exits 0 immediately when another live instance holds the lock.
#######################################
main() {
  local pid url entry rest expected_code service_name repair_cmd wait_time
  local retry max_retry repaired svc line prev_svc fail_svc still_down
  local alert_msg recovery_msg
  # Matches "<command>|<digits or nothing>": the wait time is the LAST field.
  local tail_field_re='^(.*)\|([0-9]*)$'

  # Prevent overlapping runs: the lock file holds the PID of the running
  # instance; a missing, unreadable or stale PID is treated as "not running".
  if [[ -f "$LOCK_FILE" ]]; then
    pid=$(cat "$LOCK_FILE" 2>/dev/null) || pid=""
    if [[ "$pid" =~ ^[1-9][0-9]*$ ]] && kill -0 "$pid" 2>/dev/null; then
      log "WARN" "另一個實例正在運行 (PID: $pid)"
      exit 0
    fi
  fi
  echo $$ > "$LOCK_FILE"
  # Single quotes: the path is expanded (and quoted) when the trap fires.
  trap 'rm -f -- "$LOCK_FILE"' EXIT

  log "INFO" "===== 開始域名健康檢查 ====="
  local failed_services=()
  local repaired_services=()
  local still_failed=()

  # Check every monitored endpoint.
  for url in "${!DOMAINS[@]}"; do
    # Parse "code|name|repair command|wait". Only the first two separators
    # and the last one delimit fields, so a repair command may itself contain
    # '|' or '||' without being truncated.
    entry=${DOMAINS[$url]}
    expected_code=${entry%%|*}
    service_name=""
    repair_cmd=""
    wait_time=30 # default wait: 30 seconds
    if [[ "$entry" == *'|'* ]]; then
      rest=${entry#*|}
      service_name=${rest%%|*}
      if [[ "$rest" == *'|'* ]]; then
        repair_cmd=${rest#*|}
        if [[ "$repair_cmd" =~ $tail_field_re ]]; then
          repair_cmd=${BASH_REMATCH[1]}
          wait_time=${BASH_REMATCH[2]:-30}
        fi
      fi
    fi

    if check_health "$url" "$expected_code"; then
      log "OK" "$service_name ($url) - 狀態正常"
      continue
    fi
    log "ERROR" "$service_name ($url) - 狀態異常"
    failed_services+=("$service_name")

    # Attempt the automatic repair, then verify it.
    repaired=false
    if [[ -n "$repair_cmd" ]] && auto_repair "$service_name" "$repair_cmd"; then
      # Give the service its specific start-up time before verifying.
      log "INFO" "$service_name 等待 ${wait_time} 秒後驗證..."
      sleep "$wait_time"
      # Re-check up to 3 times, 10 seconds apart.
      retry=0
      max_retry=3
      while [[ $retry -lt $max_retry ]]; do
        if check_health "$url" "$expected_code"; then
          log "OK" "$service_name 自動修復成功 (第 $((retry+1)) 次檢查)"
          repaired=true
          break
        fi
        retry=$((retry + 1))
        if [[ $retry -lt $max_retry ]]; then
          log "WARN" "$service_name 第 $retry 次檢查失敗, 10 秒後重試..."
          sleep 10
        fi
      done
      if [[ "$repaired" == "false" ]]; then
        log "ERROR" "$service_name 自動修復失敗（已重試 $max_retry 次）"
      fi
    fi

    # No repair command, a failed repair command and a failed verification
    # all leave the service in the "needs manual handling" list.
    if [[ "$repaired" == "true" ]]; then
      repaired_services+=("$service_name")
    else
      still_failed+=("$service_name")
    fi
  done

  # Send the alert notification.
  if [[ ${#failed_services[@]} -gt 0 ]]; then
    alert_msg="🔴 <b>【UAT】域名健康監控告警</b>\n\n"
    alert_msg+="<b>異常服務:</b>\n"
    for svc in "${failed_services[@]}"; do
      alert_msg+="$svc\n"
    done
    if [[ ${#repaired_services[@]} -gt 0 ]]; then
      alert_msg+="\n✅ <b>已自動修復:</b>\n"
      for svc in "${repaired_services[@]}"; do
        alert_msg+="$svc\n"
      done
    fi
    if [[ ${#still_failed[@]} -gt 0 ]]; then
      alert_msg+="\n❌ <b>需要手動處理:</b>\n"
      for svc in "${still_failed[@]}"; do
        alert_msg+="$svc\n"
      done
    fi
    alert_msg+="\n⏰ <i>時間: $(date '+%Y-%m-%d %H:%M:%S')</i>"
    # A notification failure must not abort the run (set -e): the status file
    # below still has to be updated.
    if send_telegram "$alert_msg"; then
      log "INFO" "Telegram 告警已發送"
    else
      log "WARN" "Telegram 告警發送失敗"
    fi
  fi

  # Load the services that were still failing after the previous run.
  local previous_failed=()
  if [[ -f "$STATUS_FILE" ]]; then
    while IFS= read -r line || [[ -n "$line" ]]; do
      if [[ -n "$line" ]]; then
        previous_failed+=("$line")
      fi
    done < "$STATUS_FILE"
  fi

  # A service has recovered when it was failing before and is not failing now.
  local recovered_services=()
  for prev_svc in "${previous_failed[@]}"; do
    still_down=false
    for fail_svc in "${still_failed[@]}"; do
      if [[ "$prev_svc" == "$fail_svc" ]]; then
        still_down=true
        break
      fi
    done
    if [[ "$still_down" == "false" ]]; then
      recovered_services+=("$prev_svc")
    fi
  done

  # Send the recovery notification only once everything is healthy again,
  # because the message states that all monitored services are back.
  if [[ ${#recovered_services[@]} -gt 0 && ${#still_failed[@]} -eq 0 ]]; then
    recovery_msg="🟢 <b>【UAT】服務已恢復正常</b>\n\n"
    recovery_msg+="<b>已恢復的服務:</b>\n"
    for svc in "${recovered_services[@]}"; do
      recovery_msg+="• ✅ $svc\n"
    done
    recovery_msg+="\n所有監控服務已恢復正常運作。\n"
    recovery_msg+="\n⏰ <i>時間: $(date '+%Y-%m-%d %H:%M:%S')</i>"
    if send_telegram "$recovery_msg"; then
      log "INFO" "Telegram 恢復通知已發送"
    else
      log "WARN" "Telegram 恢復通知發送失敗"
    fi
  fi

  # Rewrite the status file with the services that are still failing
  # (an empty list yields an empty file).
  {
    for svc in "${still_failed[@]}"; do
      printf '%s\n' "$svc"
    done
  } > "$STATUS_FILE"

  log "INFO" "===== 健康檢查完成 ====="
  log "INFO" "總計: ${#DOMAINS[@]} 個服務, ${#failed_services[@]} 個異常, ${#repaired_services[@]} 個已修復, ${#recovered_services[@]} 個已恢復"
}
# Entry point: run the monitor, forwarding any command-line arguments to main.
main "$@"