#!/bin/bash
# =============================================================================
# Docker 網路健康監控與自動修復腳本
#
# 功能:
# 1. 監控 mo.wooo.work 服務健康狀態
# 2. 檢測 Docker 網路轉發問題
# 3. 自動修復並發送 Telegram 告警
#
# 用法:
# 每 5 分鐘執行一次(透過 cron)
# */5 * * * * /home/wooo/scripts/docker_health_monitor.sh >> /var/log/docker_health_monitor.log 2>&1
#
# 作者:Claude Code
# 日期:2026-01-28
# =============================================================================
set -e

# =============================================================================
# Configuration
# =============================================================================

# Service endpoints: the public health URL, the host-side published port, and
# the name of the container that serves them.
HEALTH_URL="https://mo.wooo.work/health"
LOCAL_URL="http://127.0.0.1:5001/health"
CONTAINER_NAME="momo-pro-system"

# Telegram settings. Both values may be supplied through the environment so the
# secret does not have to live in the script; when the environment does not
# provide them, the previously hard-coded values are used unchanged.
TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN:-<TELEGRAM_BOT_TOKEN>}"
TELEGRAM_CHAT_ID="${TELEGRAM_CHAT_ID:-5619078117}"

# Timeouts (seconds) passed to curl, and the retry budget.
# NOTE(review): MAX_RETRIES is not referenced by any function visible in this
# file - confirm whether it is still needed.
CURL_TIMEOUT=15
MAX_RETRIES=2

# Log file path.
# NOTE(review): log() writes to stdout and the cron line in the file header
# redirects it; LOG_FILE itself is not referenced by the visible code.
LOG_FILE="/var/log/docker_health_monitor.log"
# =============================================================================
# 函數定義
# =============================================================================
#######################################
# Print one timestamped log line.
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout
#######################################
log() {
  # printf emits the message verbatim; it is never interpreted as an option or
  # as an escape sequence.
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "${1-}"
}
#######################################
# Send a message to the configured Telegram chat (best effort).
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID (read); CURL_TIMEOUT (read,
#            falls back to 15 when unset)
# Arguments: $1 - message text; sent with parse_mode=HTML
# Outputs:   nothing (curl output is discarded)
# Returns:   always 0 - a failed notification must not abort the monitor
#######################################
send_telegram() {
  local message="$1"

  # --data-urlencode keeps characters such as '&', '+', '%' and newlines intact
  # in the form body; --max-time stops a hung API call from blocking the run.
  curl -s -X POST \
    --max-time "${CURL_TIMEOUT:-15}" \
    "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
    --data-urlencode "text=${message}" \
    --data-urlencode "parse_mode=HTML" \
    > /dev/null 2>&1 || true
}
#######################################
# Probe the public health endpoint.
# Globals:   HEALTH_URL, CURL_TIMEOUT (read)
# Arguments: none
# Outputs:   the three-digit HTTP status on stdout, or "000" when no valid
#            status could be obtained
# Returns:   0
#######################################
check_external_health() {
  local http_code

  # On a connection failure curl already prints "000" through -w and then
  # exits non-zero. Appending another "000" in that case would yield "000000",
  # so the exit status is ignored and the captured text is validated instead.
  http_code=$(curl -s -o /dev/null -w '%{http_code}' \
    --connect-timeout "${CURL_TIMEOUT}" --max-time "${CURL_TIMEOUT}" \
    "${HEALTH_URL}" 2>/dev/null) || true

  if [[ ! "${http_code}" =~ ^[0-9]{3}$ ]]; then
    http_code="000"
  fi

  printf '%s\n' "${http_code}"
}
#######################################
# Probe the health endpoint through the locally published port.
# Globals:   LOCAL_URL, CURL_TIMEOUT (read)
# Arguments: none
# Outputs:   the three-digit HTTP status on stdout, or "000" when no valid
#            status could be obtained
# Returns:   0
#######################################
check_local_port() {
  local http_code

  # curl prints "000" itself when the connection fails, so no fallback text is
  # appended on a non-zero exit (that would produce "000000"); the captured
  # value is validated afterwards instead.
  http_code=$(curl -s -o /dev/null -w '%{http_code}' \
    --connect-timeout "${CURL_TIMEOUT}" --max-time "${CURL_TIMEOUT}" \
    "${LOCAL_URL}" 2>/dev/null) || true

  if [[ ! "${http_code}" =~ ^[0-9]{3}$ ]]; then
    http_code="000"
  fi

  printf '%s\n' "${http_code}"
}
#######################################
# Probe the health endpoint from inside the container via `docker exec`.
# Globals:   CONTAINER_NAME (read); CURL_TIMEOUT (read, falls back to 15)
# Arguments: none
# Outputs:   the three-digit HTTP status on stdout, or "000" when no valid
#            status could be obtained (curl failed, or docker exec itself
#            failed and printed nothing)
# Returns:   0
#######################################
check_container_internal() {
  local http_code

  # --max-time bounds the whole request; --connect-timeout alone does not
  # cover a server that accepts the connection but never answers.
  # The exit status is ignored because a failing in-container curl has already
  # printed "000"; appending a second "000" would give "000000".
  http_code=$(docker exec "${CONTAINER_NAME}" \
    curl -s -o /dev/null -w '%{http_code}' \
    --connect-timeout 5 --max-time "${CURL_TIMEOUT:-15}" \
    http://127.0.0.1:80/health 2>/dev/null) || true

  if [[ ! "${http_code}" =~ ^[0-9]{3}$ ]]; then
    http_code="000"
  fi

  printf '%s\n' "${http_code}"
}
#######################################
# Tell whether the monitored container is listed by `docker ps`.
# Globals:   CONTAINER_NAME (read)
# Arguments: none
# Returns:   0 if a running container has exactly that name, non-zero
#            otherwise (including when `docker ps` fails)
#######################################
is_container_running() {
  local names

  names=$(docker ps --format '{{.Names}}') || return 1

  # -F -x: compare the name literally against whole lines, so characters such
  # as '.' in a container name are not treated as regex wildcards.
  grep -Fxq -- "${CONTAINER_NAME}" <<< "${names}"
}
#######################################
# Restart the Docker daemon through systemctl, wait, then re-check the
# external health endpoint.
# Arguments: none
# Outputs:   progress lines via log
# Returns:   0 if check_external_health reports 200 after the restart;
#            1 if the restart command fails or the service is still unhealthy
#######################################
restart_docker_service() {
  local http_code

  log "正在重啟 Docker 服務..."

  # Fail fast: when the restart command itself fails there is nothing to wait
  # for, so report it instead of sleeping for a minute first.
  if ! sudo systemctl restart docker; then
    log "Docker 服務重啟指令執行失敗"
    return 1
  fi

  # Wait for the containers to come back up before verifying.
  log "等待容器啟動(60 秒)..."
  sleep 60

  # Verify recovery through the public endpoint.
  http_code=$(check_external_health)
  if [[ "${http_code}" == "200" ]]; then
    log "Docker 服務重啟成功,服務已恢復"
    return 0
  fi

  log "Docker 服務重啟後服務仍未恢復 (HTTP ${http_code})"
  return 1
}
#######################################
# Restart the monitored container, wait, then re-check the external health
# endpoint.
# Globals:   CONTAINER_NAME (read)
# Arguments: none
# Outputs:   progress lines via log; whatever `docker restart` prints
# Returns:   0 if check_external_health reports 200 after the restart;
#            1 if `docker restart` fails or the service is still unhealthy
#######################################
restart_container() {
  local http_code

  log "正在重啟容器 ${CONTAINER_NAME}..."

  # Fail fast: if docker refuses the restart, skip the wait and let the caller
  # decide what to try next.
  if ! docker restart "${CONTAINER_NAME}"; then
    log "容器 ${CONTAINER_NAME} 重啟指令執行失敗"
    return 1
  fi

  # Wait for the container to start before verifying.
  log "等待容器啟動(30 秒)..."
  sleep 30

  # Verify recovery through the public endpoint.
  http_code=$(check_external_health)
  if [[ "${http_code}" == "200" ]]; then
    log "容器重啟成功,服務已恢復"
    return 0
  fi

  log "容器重啟後服務仍未恢復 (HTTP ${http_code})"
  return 1
}
# =============================================================================
# 主程式
# =============================================================================
#######################################
# Run one health-check cycle: probe the service, diagnose a failure, attempt
# an automatic repair, and report through Telegram.
#
# Diagnosis, once the external check is not 200:
#   - container not running       -> docker start, re-check
#   - inside OK, local port not OK -> treated as Docker forwarding failure,
#                                     restart the Docker service
#   - inside not OK                -> restart the container, then escalate to
#                                     a Docker service restart
#   - otherwise                    -> alert only, no automatic action
#
# Globals:   CONTAINER_NAME (read)
# Arguments: none used
# Outputs:   progress lines via log; alerts via send_telegram
# Returns:   exits the shell with 0 when the service is (or becomes) healthy
#            on the early paths; otherwise returns after the diagnosis branch
#######################################
main() {
  local external_code
  local local_code
  local internal_code

  log "========== 開始健康檢查 =========="

  # Step 1: probe the public health endpoint.
  external_code=$(check_external_health)
  log "外部健康檢查: HTTP ${external_code}"

  if [[ "${external_code}" == "200" ]]; then
    log "服務正常運行"
    log "========== 檢查完成 =========="
    exit 0
  fi

  # Step 2: the service is unhealthy - start diagnosing.
  log "偵測到服務異常,開始診斷..."

  # Is the container running at all?
  if ! is_container_running; then
    log "容器 ${CONTAINER_NAME} 未運行"

    # Try to start it; a failure here is tolerated because the diagnosis
    # below runs anyway.
    log "嘗試啟動容器..."
    docker start "${CONTAINER_NAME}" 2>/dev/null || true
    sleep 30

    external_code=$(check_external_health)
    if [[ "${external_code}" == "200" ]]; then
      send_telegram "🟢 <b>MOMO Pro 服務已恢復</b>

問題: 容器未運行
動作: 已自動啟動容器
時間: $(date '+%Y-%m-%d %H:%M:%S')"
      log "容器啟動成功,服務已恢復"
      exit 0
    fi
  fi

  # Probe the locally published port.
  local_code=$(check_local_port)
  log "本地端口檢查: HTTP ${local_code}"

  # Probe the service from inside the container.
  internal_code=$(check_container_internal)
  log "容器內部檢查: HTTP ${internal_code}"

  # Diagnosis.
  if [[ "${internal_code}" == "200" && "${local_code}" != "200" ]]; then
    # Healthy inside the container but unreachable on the host port:
    # treated as a Docker network forwarding failure.
    log "診斷結果: Docker 網路轉發失效"

    send_telegram "🔴 <b>MOMO Pro 服務異常</b>

症狀: 502 Bad Gateway
診斷: Docker 網路轉發失效
動作: 正在重啟 Docker 服務...
時間: $(date '+%Y-%m-%d %H:%M:%S')"

    if restart_docker_service; then
      send_telegram "🟢 <b>MOMO Pro 服務已恢復</b>

問題: Docker 網路轉發失效
動作: Docker 服務重啟成功
時間: $(date '+%Y-%m-%d %H:%M:%S')"
    else
      send_telegram "🔴 <b>MOMO Pro 自動修復失敗</b>

問題: Docker 網路轉發失效
狀態: Docker 重啟後服務仍未恢復
需要: 人工介入處理
時間: $(date '+%Y-%m-%d %H:%M:%S')

SSH: ssh wooo@192.168.0.110"
    fi

  elif [[ "${internal_code}" != "200" ]]; then
    # The service does not answer even inside the container:
    # treated as an application failure.
    log "診斷結果: 應用服務異常"

    send_telegram "🔴 <b>MOMO Pro 服務異常</b>

症狀: 應用服務無回應
診斷: 容器內部服務異常
動作: 正在重啟容器...
時間: $(date '+%Y-%m-%d %H:%M:%S')"

    if restart_container; then
      send_telegram "🟢 <b>MOMO Pro 服務已恢復</b>

問題: 應用服務異常
動作: 容器重啟成功
時間: $(date '+%Y-%m-%d %H:%M:%S')"
    else
      # Restarting the container did not help; escalate to the Docker service.
      log "容器重啟失敗,嘗試重啟 Docker 服務..."

      if restart_docker_service; then
        send_telegram "🟢 <b>MOMO Pro 服務已恢復</b>

問題: 應用服務異常
動作: Docker 服務重啟成功
時間: $(date '+%Y-%m-%d %H:%M:%S')"
      else
        # NOTE(review): the log hint below uses kubectl although everything
        # else in this script manages a plain Docker container - confirm
        # whether it should be a `docker logs` command instead.
        send_telegram "🔴 <b>MOMO Pro 自動修復失敗</b>

問題: 應用服務異常
狀態: 多次修復嘗試後服務仍未恢復
需要: 人工介入處理
時間: $(date '+%Y-%m-%d %H:%M:%S')

SSH: ssh wooo@192.168.0.110
日誌: kubectl logs -f deployment/momo-app -n momo"
      fi
    fi

  else
    # Inside and local port both answer 200 while the external check fails:
    # the cause cannot be determined here, so only alert.
    log "診斷結果: 無法判斷問題原因"

    send_telegram "🟠 <b>MOMO Pro 服務異常</b>

症狀: 服務無法存取 (HTTP ${external_code})
診斷: 無法判斷問題原因
本地端口: HTTP ${local_code}
容器內部: HTTP ${internal_code}
需要: 人工介入處理
時間: $(date '+%Y-%m-%d %H:%M:%S')

SSH: ssh wooo@192.168.0.110"
  fi

  log "========== 檢查完成 =========="
}
# Run the main program, forwarding any command-line arguments.
main "$@"