250 lines
8.6 KiB
Bash
250 lines
8.6 KiB
Bash
#!/bin/bash
# =============================================================================
# WOOO TECH - domain health monitoring script
# Purpose: monitor every endpoint's status => alert => auto-repair
# Usage:   ./scripts/domain-health-monitor.sh
# Cron:    */5 * * * * /home/wooo/momo_pro_system/scripts/domain-health-monitor.sh
# =============================================================================

set -e

# Configuration.
# Each setting keeps its previous value as the default, but a non-empty
# environment variable of the same name now takes precedence. This lets the
# bot token and the file locations be supplied without editing the script.
TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN:-<TELEGRAM_BOT_TOKEN>}"
TELEGRAM_CHAT_ID="${TELEGRAM_CHAT_ID:-5619078117}"
LOG_FILE="${LOG_FILE:-/home/wooo/logs/domain_health_monitor.log}"
LOCK_FILE="${LOCK_FILE:-/tmp/domain_health_monitor.lock}"
# Status file: records the services that were unhealthy on the previous run so
# that a "recovered" notification can be sent later.
# NOTE(review): LOCK_FILE and STATUS_FILE default to fixed names under /tmp;
# consider a directory only this user can write to.
STATUS_FILE="${STATUS_FILE:-/tmp/domain_health_status.txt}"

# Colour definitions (escape sequences kept as literal backslash text)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'
|
||
|
||
# Endpoint watch list.
#   key   : URL to probe
#   value : expected HTTP status|service name|repair command|wait seconds
# The value is split on '|', so a repair command must not contain a literal
# '|' character (no pipes and no '||'); use 'if ! a; then b; fi' instead of
# 'a || b'.
# 2026-02-13 fix: n8n and Superset are Docker containers, not K8s workloads.
# Intended service priority:
#   1. MOMO App (core business)
#   2. GitLab (CI/CD)
#   3. Registry (image registry)
#   4. n8n (automation)
#   5. Superset (BI)
#   6. Monitoring page
# NOTE(review): Bash does not define the iteration order of an associative
# array, so looping over "${!DOMAINS[@]}" is not guaranteed to follow the
# priority order above.
declare -A DOMAINS=(
  # Core business - highest priority
  ["https://mo.wooo.work/health"]="200|MOMO App UAT|kubectl rollout restart deployment/momo-app -n momo|60"
  ["https://momo.wooo.work/health"]="200|MOMO App GCP|gcloud compute ssh momo-pro-gcp --zone=asia-east1-b --command='sudo kubectl rollout restart deployment/momo-app -n momo'|60"

  # CI/CD tooling - high priority (GitLab needs a longer start-up time)
  ["http://192.168.0.110:8929/"]="200|GitLab|docker restart wooo-gitlab|120"
  ["https://registry.wooo.work/v2/"]="401|Docker Registry|cd /home/wooo/devops/registry && docker compose restart|30"

  # Automation and BI - medium priority (Docker containers)
  # The n8n command starts the existing container and only falls back to
  # 'docker run' when that fails. It is written without '||' because that
  # would collide with the '|' field separator.
  # NOTE(review): the fallback command embeds the n8n basic-auth credentials in
  # plain text; consider moving them out of the script.
  ["http://127.0.0.1:5678/"]="200|n8n|if ! docker start momo-n8n 2>/dev/null; then docker run -d --name momo-n8n --restart unless-stopped -p 5678:5678 -e N8N_BASIC_AUTH_USER=admin -e N8N_BASIC_AUTH_PASSWORD=Wooo_N8n_2026 -e N8N_RUNNERS_DISABLED=true -v n8n_data:/home/node/.n8n n8nio/n8n:latest; fi|30"
  ["http://127.0.0.1:8088/health"]="200|Superset|cd /home/wooo/momo_pro_system/docker/superset && docker compose up -d|60"

  # Monitoring page - lower priority
  ["https://monitor.wooo.work/"]="200|Monitor 首頁|systemctl reload nginx|10"
)
|
||
|
||
# Send a Telegram notification through the Bot API sendMessage endpoint.
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID (read)
# Arguments: $1 - message text, sent with parse_mode=HTML. The two-character
#                 sequence "\n" is accepted as a line break.
# Outputs:   nothing (curl's output is discarded)
# Returns:   curl's exit status
send_telegram() {
  local message="$1"
  local newline=$'\n'

  # Callers assemble messages inside double quotes, where "\n" stays a literal
  # backslash followed by 'n'. Turn each one into a real line break before
  # sending, otherwise those two characters are transmitted verbatim.
  message=${message//\\n/$newline}

  # --data-urlencode percent-encodes the value, so characters such as '&',
  # '+' or '%' in the message cannot corrupt the form body.
  # The timeouts keep a stalled API call from blocking the monitor.
  curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    --connect-timeout 10 \
    --max-time 30 \
    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
    --data-urlencode "parse_mode=HTML" \
    --data-urlencode "text=${message}" > /dev/null 2>&1
}
|
||
|
||
# Write one timestamped log line to stdout and append it to the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - level tag (e.g. INFO, WARN, ERROR)
#            $2 - message text
# Outputs:   "[YYYY-mm-dd HH:MM:SS] [level] message" on stdout and in LOG_FILE
# Returns:   tee's exit status
log() {
  local level="$1"
  local message="$2"
  local line

  line="[$(date '+%Y-%m-%d %H:%M:%S')] [$level] $message"

  # Create the log directory on demand: when it is missing, tee fails and,
  # with 'set -e' active, the first log call would abort the whole run.
  # Best effort only - a failure here still surfaces through tee below.
  mkdir -p -- "$(dirname -- "$LOG_FILE")" 2>/dev/null || true

  printf '%s\n' "$line" | tee -a "$LOG_FILE"
}
|
||
|
||
# Probe a URL (following redirects) and compare the final HTTP status code
# with the expected one.
# Arguments: $1 - URL to request
#            $2 - expected HTTP status code
# Returns:   0 when the status matches, 1 otherwise (including any curl
#            failure, which is treated as status "000")
check_health() {
  local url="$1"
  local expected_code="$2"
  local timeout=10
  local response

  # --connect-timeout only bounds the connection phase; --max-time bounds the
  # whole transfer so an endpoint that accepts the connection but never
  # answers cannot hang the monitor.
  # Declaration and assignment are split so curl's exit status is not masked
  # by 'local'. On failure the result is normalised to exactly "000".
  response=$(curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout "$timeout" \
    --max-time "$((timeout * 3))" \
    -L "$url" 2>/dev/null) || response="000"

  [[ "$response" == "$expected_code" ]]
}
|
||
|
||
# Run the repair command configured for a service.
# Globals:   KUBECONFIG (exported)
# Arguments: $1 - service name (used in log messages only)
#            $2 - repair command line, evaluated by the shell
# Outputs:   the command's stdout and stderr, both on stdout
# Returns:   0 when the command succeeds, 1 otherwise
auto_repair() {
  local service="$1"
  local repair_cmd="$2"

  log "REPAIR" "嘗試修復 $service: $repair_cmd"

  # Point kubectl-based repair commands at the kubeconfig
  export KUBECONFIG=/home/wooo/.kube/config

  # Run the command in a subshell: several repair commands 'cd' first, and a
  # command could also call 'exit'. Neither may change the monitor's own
  # working directory or terminate it.
  # NOTE: eval is acceptable only because the commands come from this script's
  # own configuration; never pass externally supplied text here.
  if ( eval "$repair_cmd" ) 2>&1; then
    log "REPAIR" "$service 修復命令已執行"
    return 0
  else
    log "ERROR" "$service 修復命令執行失敗"
    return 1
  fi
}
|
||
|
||
# Split one DOMAINS value of the form
#   expected code|service name|repair command|wait seconds
# into the caller's variables expected_code, service_name, repair_cmd and
# wait_time (assigned through Bash's dynamic scoping; the caller declares them
# local). The wait time is taken from the LAST field, and only when that field
# is empty or numeric, so a repair command that itself contains '|' is kept
# intact. Missing fields are left empty.
# Arguments: $1 - the raw DOMAINS value
_parse_domain_entry() {
  local entry="$1"
  local rest last
  local number_re='^[0-9]+([.][0-9]+)?$'

  expected_code="${entry%%|*}"
  service_name=""
  repair_cmd=""
  wait_time=""

  if [[ "$entry" != *"|"* ]]; then
    return 0
  fi
  rest="${entry#*|}"
  service_name="${rest%%|*}"

  if [[ "$rest" != *"|"* ]]; then
    return 0
  fi
  rest="${rest#*|}"
  last="${rest##*|}"

  if [[ "$rest" == *"|"* ]] && { [[ -z "$last" ]] || [[ "$last" =~ $number_re ]]; }; then
    repair_cmd="${rest%|*}"
    wait_time="$last"
  else
    repair_cmd="$rest"
  fi
}

# Main routine: take the run lock, probe every entry in DOMAINS, attempt the
# configured repair for unhealthy ones, send Telegram alert / recovery
# messages and rewrite the status file.
# Globals:   DOMAINS, LOCK_FILE, STATUS_FILE (read); LOCK_FILE and STATUS_FILE
#            are written
# Arguments: none used
# Returns:   exits 0 early when another live instance holds the lock
main() {
  local pid url svc prev_svc fail_svc line
  local expected_code service_name repair_cmd wait_time
  local retry max_retry repaired still_down
  local alert_msg recovery_msg

  # Prevent overlapping runs: a lock file holding the PID of a live process
  # means another instance is still working.
  if [[ -f "$LOCK_FILE" ]]; then
    pid=$(cat "$LOCK_FILE" 2>/dev/null) || pid=""
    # Only a purely numeric PID is probed; anything else is a stale lock.
    if [[ "$pid" =~ ^[0-9]+$ ]] && kill -0 "$pid" 2>/dev/null; then
      log "WARN" "另一個實例正在運行 (PID: $pid)"
      exit 0
    fi
  fi
  printf '%s\n' "$$" > "$LOCK_FILE"
  # Single quotes: expand LOCK_FILE (quoted) when the trap fires, not now.
  trap 'rm -f -- "$LOCK_FILE"' EXIT

  log "INFO" "===== 開始域名健康檢查 ====="

  local failed_services=()
  local repaired_services=()
  local still_failed=()

  # Check every endpoint
  for url in "${!DOMAINS[@]}"; do
    _parse_domain_entry "${DOMAINS[$url]}"

    # Default wait time: 30 seconds
    wait_time=${wait_time:-30}

    if check_health "$url" "$expected_code"; then
      log "OK" "$service_name ($url) - 狀態正常"
      continue
    fi

    log "ERROR" "$service_name ($url) - 狀態異常"
    failed_services+=("$service_name")

    # No repair command, or the repair command itself failed: manual action
    if [[ -z "$repair_cmd" ]] || ! auto_repair "$service_name" "$repair_cmd"; then
      still_failed+=("$service_name")
      continue
    fi

    # Give the service its specific start-up time before verifying
    log "INFO" "$service_name 等待 ${wait_time} 秒後驗證..."
    sleep "$wait_time"

    # Re-check: at most 3 attempts, 10 seconds apart
    retry=0
    max_retry=3
    repaired=false

    while [[ $retry -lt $max_retry ]]; do
      if check_health "$url" "$expected_code"; then
        log "OK" "$service_name 自動修復成功 (第 $((retry+1)) 次檢查)"
        repaired_services+=("$service_name")
        repaired=true
        break
      fi
      retry=$((retry + 1))
      if [[ $retry -lt $max_retry ]]; then
        log "WARN" "$service_name 第 $retry 次檢查失敗,10 秒後重試..."
        sleep 10
      fi
    done

    if [[ "$repaired" == "false" ]]; then
      log "ERROR" "$service_name 自動修復失敗(已重試 $max_retry 次)"
      still_failed+=("$service_name")
    fi
  done

  # Send the alert notification
  if [[ ${#failed_services[@]} -gt 0 ]]; then
    alert_msg="🔴 <b>【UAT】域名健康監控告警</b>\n\n"
    alert_msg+="<b>異常服務:</b>\n"
    for svc in "${failed_services[@]}"; do
      alert_msg+="• $svc\n"
    done

    if [[ ${#repaired_services[@]} -gt 0 ]]; then
      alert_msg+="\n✅ <b>已自動修復:</b>\n"
      for svc in "${repaired_services[@]}"; do
        alert_msg+="• $svc\n"
      done
    fi

    if [[ ${#still_failed[@]} -gt 0 ]]; then
      alert_msg+="\n❌ <b>需要手動處理:</b>\n"
      for svc in "${still_failed[@]}"; do
        alert_msg+="• $svc\n"
      done
    fi

    alert_msg+="\n⏰ <i>時間: $(date '+%Y-%m-%d %H:%M:%S')</i>"

    # A failed send must not abort the run under 'set -e': the status file
    # below still has to be updated.
    if send_telegram "$alert_msg"; then
      log "INFO" "Telegram 告警已發送"
    else
      log "ERROR" "Telegram 告警發送失敗"
    fi
  fi

  # Load the services that were unhealthy on the previous run
  local previous_failed=()
  if [[ -f "$STATUS_FILE" ]]; then
    # '|| [[ -n $line ]]' keeps a final line that lacks a trailing newline
    while IFS= read -r line || [[ -n "$line" ]]; do
      if [[ -n "$line" ]]; then
        previous_failed+=("$line")
      fi
    done < "$STATUS_FILE"
  fi

  # A previously failing service that is not in still_failed has recovered
  local recovered_services=()
  for prev_svc in "${previous_failed[@]}"; do
    still_down=false
    for fail_svc in "${still_failed[@]}"; do
      if [[ "$prev_svc" == "$fail_svc" ]]; then
        still_down=true
        break
      fi
    done
    if [[ "$still_down" == "false" ]]; then
      recovered_services+=("$prev_svc")
    fi
  done

  # The recovery notice is an all-clear: it is sent only when something
  # recovered AND nothing is still failing.
  if [[ ${#recovered_services[@]} -gt 0 && ${#still_failed[@]} -eq 0 ]]; then
    recovery_msg="🟢 <b>【UAT】服務已恢復正常</b>\n\n"
    recovery_msg+="<b>已恢復的服務:</b>\n"
    for svc in "${recovered_services[@]}"; do
      recovery_msg+="• ✅ $svc\n"
    done
    recovery_msg+="\n所有監控服務已恢復正常運作。\n"
    recovery_msg+="\n⏰ <i>時間: $(date '+%Y-%m-%d %H:%M:%S')</i>"

    if send_telegram "$recovery_msg"; then
      log "INFO" "Telegram 恢復通知已發送"
    else
      log "ERROR" "Telegram 恢復通知發送失敗"
    fi
  fi

  # Rewrite the status file with the services that are still unhealthy
  : > "$STATUS_FILE"
  for svc in "${still_failed[@]}"; do
    printf '%s\n' "$svc" >> "$STATUS_FILE"
  done

  log "INFO" "===== 健康檢查完成 ====="
  log "INFO" "總計: ${#DOMAINS[@]} 個服務, ${#failed_services[@]} 個異常, ${#repaired_services[@]} 個已修復, ${#recovered_services[@]} 個已恢復"
}
|
||
|
||
# Script entry point: run main, forwarding the command-line arguments.
main "$@"
|