Files
ewoooc/scripts/docker_health_monitor.sh
OoO d6d8777e41
All checks were successful
CD Pipeline / deploy (push) Successful in 1m12s
V10.601 收斂 Gemini 與密鑰治理
2026-06-06 14:52:46 +08:00

266 lines
7.4 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# =============================================================================
# Docker 網路健康監控與自動修復腳本
#
# 功能:
# 1. 監控 mo.wooo.work 服務健康狀態
# 2. 檢測 Docker 網路轉發問題
# 3. 自動修復並發送 Telegram 告警
#
# 用法:
# 每 5 分鐘執行一次(透過 cron):
# */5 * * * * /home/wooo/scripts/docker_health_monitor.sh >> /var/log/docker_health_monitor.log 2>&1
#
# 作者: Claude Code
# 日期: 2026-01-28
# =============================================================================
set -e
# =============================================================================
# 配置區
# =============================================================================
# 服務端點
HEALTH_URL="https://mo.wooo.work/health"
LOCAL_URL="http://127.0.0.1:5001/health"
CONTAINER_NAME="momo-pro-system"
# Telegram 配置
TELEGRAM_BOT_TOKEN="<TELEGRAM_BOT_TOKEN>"
TELEGRAM_CHAT_ID="5619078117"
# 超時設定(秒)
CURL_TIMEOUT=15
MAX_RETRIES=2
# 日誌檔案
LOG_FILE="/var/log/docker_health_monitor.log"
# =============================================================================
# 函數定義
# =============================================================================
# Print one timestamped log line to stdout.
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" followed by a newline
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1"
}
# Send a best-effort notification through the Telegram Bot API.
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID (read)
#            CURL_TIMEOUT (read; falls back to 15 seconds when unset)
# Arguments: $1 - message text, sent with parse_mode=HTML
# Returns:   always 0. Delivery failures are deliberately ignored so that an
#            unreachable Telegram API never aborts the health check itself.
send_telegram() {
  local message="$1"
  # --data-urlencode keeps characters such as '&', '%' and '+' inside the
  # message intact; a plain -d would send them unescaped and corrupt the form.
  # The timeouts stop a stalled API call from hanging the whole cron run.
  curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    --connect-timeout "${CURL_TIMEOUT:-15}" \
    --max-time "${CURL_TIMEOUT:-15}" \
    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
    --data-urlencode "text=${message}" \
    --data-urlencode "parse_mode=HTML" \
    > /dev/null 2>&1 || true
}
# Probe the public health endpoint.
# Globals:   HEALTH_URL, CURL_TIMEOUT (read)
# Outputs:   the HTTP status code on stdout, or "000" when curl fails
# Returns:   always 0, so callers may capture the output under `set -e`
check_external_health() {
  local http_code
  # On a failed transfer curl already prints "000" for %{http_code} and exits
  # non-zero, so the fallback must replace the captured value; appending a
  # second "000" would report the code as "000000".
  http_code=$(curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout "${CURL_TIMEOUT}" --max-time "${CURL_TIMEOUT}" \
    "${HEALTH_URL}" 2>/dev/null) || http_code="000"
  echo "${http_code}"
}
# Probe the service through the port published on the host.
# Globals:   LOCAL_URL, CURL_TIMEOUT (read)
# Outputs:   the HTTP status code on stdout, or "000" when curl fails
# Returns:   always 0, so callers may capture the output under `set -e`
check_local_port() {
  local http_code
  # curl itself prints "000" when the transfer fails, so the fallback is
  # assigned rather than echoed; echoing it would produce "000000".
  http_code=$(curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout "${CURL_TIMEOUT}" --max-time "${CURL_TIMEOUT}" \
    "${LOCAL_URL}" 2>/dev/null) || http_code="000"
  echo "${http_code}"
}
# Probe the application from inside the container, bypassing the host's
# port publishing.
# Globals:   CONTAINER_NAME (read)
#            CURL_TIMEOUT (read; falls back to 15 seconds when unset)
# Outputs:   the HTTP status code on stdout, or "000" when `docker exec` or
#            the curl inside the container fails
# Returns:   always 0, so callers may capture the output under `set -e`
check_container_internal() {
  local result
  # --max-time bounds the whole request so an application that accepts the
  # connection but never answers cannot hang the monitor. The fallback is
  # assigned (not echoed) because a failing curl has already printed "000".
  result=$(docker exec "${CONTAINER_NAME}" curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout 5 --max-time "${CURL_TIMEOUT:-15}" \
    http://127.0.0.1:80/health 2>/dev/null) || result="000"
  echo "${result}"
}
# Report whether the monitored container appears in `docker ps`.
# Globals:   CONTAINER_NAME (read)
# Returns:   0 when a listed container name equals CONTAINER_NAME exactly,
#            non-zero otherwise
is_container_running() {
  # -x matches the whole line and -F treats the name as a literal string, so
  # a '.' in the container name cannot match an arbitrary character.
  docker ps --format '{{.Names}}' | grep -qxF -- "${CONTAINER_NAME}"
}
# Restart the Docker daemon and verify that the public endpoint recovers.
# Relies on the sibling helpers `log` and `check_external_health`.
# Returns:   0 when the external health check answers 200 after the restart;
#            1 when the restart command fails or the service stays unhealthy
restart_docker_service() {
  log "正在重啟 Docker 服務..."
  # A failed restart cannot have repaired anything, so report it immediately
  # instead of waiting a full minute before giving up. (`set -e` does not
  # catch this because the function is called from an `if` condition.)
  if ! sudo systemctl restart docker; then
    log "Docker 服務重啟指令執行失敗"
    return 1
  fi

  # Give the daemon and its containers time to come back up.
  log "等待容器啟動(60 秒)..."
  sleep 60

  # Confirm recovery through the public endpoint.
  local http_code
  http_code=$(check_external_health)
  if [ "${http_code}" = "200" ]; then
    log "Docker 服務重啟成功,服務已恢復"
    return 0
  fi
  log "Docker 服務重啟後服務仍未恢復 (HTTP ${http_code})"
  return 1
}
# Restart the monitored container and verify that the public endpoint
# recovers. Relies on the sibling helpers `log` and `check_external_health`.
# Globals:   CONTAINER_NAME (read)
# Returns:   0 when the external health check answers 200 after the restart;
#            1 when `docker restart` fails or the service stays unhealthy
restart_container() {
  log "正在重啟容器 ${CONTAINER_NAME}..."
  # Bail out at once when the restart itself fails; waiting and re-probing
  # would only delay the caller's fallback. (`set -e` does not catch this
  # because the function is called from an `if` condition.)
  if ! docker restart "${CONTAINER_NAME}"; then
    log "容器 ${CONTAINER_NAME} 重啟指令執行失敗"
    return 1
  fi

  # Give the application time to start.
  log "等待容器啟動(30 秒)..."
  sleep 30

  # Confirm recovery through the public endpoint.
  local http_code
  http_code=$(check_external_health)
  if [ "${http_code}" = "200" ]; then
    log "容器重啟成功,服務已恢復"
    return 0
  fi
  log "容器重啟後服務仍未恢復 (HTTP ${http_code})"
  return 1
}
# =============================================================================
# 主程式
# =============================================================================
# Entry point: probe the public endpoint and, when it is unhealthy, diagnose
# the failure, attempt an automatic repair and report the outcome on Telegram.
#
# Flow:
#   1. External probe answers 200 -> log and exit 0.
#   2. Container not running      -> `docker start`, wait, re-probe.
#   3. Compare the host-port probe with the in-container probe:
#        inside OK, host port failing -> restart the Docker daemon
#        inside failing               -> restart the container, then fall
#                                        back to restarting the daemon
#        both OK (external failing)   -> cannot diagnose; alert only
# Globals:   CONTAINER_NAME (read)
# Returns:   exits 0 on the two early-success paths; otherwise returns after
#            logging the end-of-check marker.
main() {
  log "========== 開始健康檢查 =========="

  # Step 1: probe the public health endpoint; nothing more to do when it is up.
  local external_code
  external_code=$(check_external_health)
  log "外部健康檢查: HTTP ${external_code}"
  if [ "${external_code}" = "200" ]; then
    log "服務正常運行"
    log "========== 檢查完成 =========="
    exit 0
  fi

  # Step 2: the service is unhealthy, so start diagnosing.
  log "偵測到服務異常,開始診斷..."

  # A stopped container is the simplest cause: try to start it first.
  if ! is_container_running; then
    log "容器 ${CONTAINER_NAME} 未運行"
    log "嘗試啟動容器..."
    # Best effort: a failed start is handled by the diagnosis further down.
    docker start "${CONTAINER_NAME}" 2>/dev/null || true
    sleep 30
    external_code=$(check_external_health)
    if [ "${external_code}" = "200" ]; then
      send_telegram "🟢 <b>MOMO Pro 服務已恢復</b>
問題: 容器未運行
動作: 已自動啟動容器
時間: $(date '+%Y-%m-%d %H:%M:%S')"
      log "容器啟動成功,服務已恢復"
      exit 0
    fi
  fi

  # Probe through the port published on the host.
  local local_code
  local_code=$(check_local_port)
  log "本地端口檢查: HTTP ${local_code}"

  # Probe from inside the container.
  local internal_code
  internal_code=$(check_container_internal)
  log "容器內部檢查: HTTP ${internal_code}"

  # Diagnosis
  if [ "${internal_code}" = "200" ] && [ "${local_code}" != "200" ]; then
    # Healthy inside the container but unreachable on the host port:
    # treated as a Docker network forwarding failure.
    log "診斷結果: Docker 網路轉發失效"
    send_telegram "🔴 <b>MOMO Pro 服務異常</b>
症狀: 502 Bad Gateway
診斷: Docker 網路轉發失效
動作: 正在重啟 Docker 服務...
時間: $(date '+%Y-%m-%d %H:%M:%S')"
    if restart_docker_service; then
      send_telegram "🟢 <b>MOMO Pro 服務已恢復</b>
問題: Docker 網路轉發失效
動作: Docker 服務重啟成功
時間: $(date '+%Y-%m-%d %H:%M:%S')"
    else
      send_telegram "🔴 <b>MOMO Pro 自動修復失敗</b>
問題: Docker 網路轉發失效
狀態: Docker 重啟後服務仍未恢復
需要: 人工介入處理
時間: $(date '+%Y-%m-%d %H:%M:%S')
SSH: ssh wooo@192.168.0.110"
    fi
  elif [ "${internal_code}" != "200" ]; then
    # The application does not answer even inside the container:
    # treated as an application failure.
    log "診斷結果: 應用服務異常"
    send_telegram "🔴 <b>MOMO Pro 服務異常</b>
症狀: 應用服務無回應
診斷: 容器內部服務異常
動作: 正在重啟容器...
時間: $(date '+%Y-%m-%d %H:%M:%S')"
    if restart_container; then
      send_telegram "🟢 <b>MOMO Pro 服務已恢復</b>
問題: 應用服務異常
動作: 容器重啟成功
時間: $(date '+%Y-%m-%d %H:%M:%S')"
    else
      # Restarting the container did not help; escalate to the daemon.
      log "容器重啟失敗,嘗試重啟 Docker 服務..."
      if restart_docker_service; then
        send_telegram "🟢 <b>MOMO Pro 服務已恢復</b>
問題: 應用服務異常
動作: Docker 服務重啟成功
時間: $(date '+%Y-%m-%d %H:%M:%S')"
      else
        # The log hint names the Docker container this script manages; the
        # earlier kubectl command did not match anything the script touches.
        send_telegram "🔴 <b>MOMO Pro 自動修復失敗</b>
問題: 應用服務異常
狀態: 多次修復嘗試後服務仍未恢復
需要: 人工介入處理
時間: $(date '+%Y-%m-%d %H:%M:%S')
SSH: ssh wooo@192.168.0.110
日誌: docker logs -f ${CONTAINER_NAME}"
      fi
    fi
  else
    # Both inner probes succeed while the public endpoint fails: the cause
    # lies outside what this script can inspect, so only raise an alert.
    log "診斷結果: 無法判斷問題原因"
    send_telegram "🟠 <b>MOMO Pro 服務異常</b>
症狀: 服務無法存取 (HTTP ${external_code})
診斷: 無法判斷問題原因
本地端口: HTTP ${local_code}
容器內部: HTTP ${internal_code}
需要: 人工介入處理
時間: $(date '+%Y-%m-%d %H:%M:%S')
SSH: ssh wooo@192.168.0.110"
  fi

  log "========== 檢查完成 =========="
}
# Run the main program, forwarding the script's command-line arguments.
main "$@"