#!/bin/bash
# =============================================================================
# MOMO Pro System unified health-monitoring script
#
# Purpose: consolidates all monitoring items
#   1. Docker network health (502 Bad Gateway prevention)
#   2. Harbor Registry service (K8s ImagePullBackOff prevention)
#   3. Google Drive authentication (automatic import feature)
#   4. Application service health checks
#
# Usage:
#   Run once every 5 minutes (via cron)
#   */5 * * * * /home/wooo/scripts/system_health_monitor.sh >> /var/log/system_health_monitor.log 2>&1
#
# Author: Claude Code
# Date: 2026-01-28
# =============================================================================

# Abort on unhandled command failures. Bash ignores -e inside a function that
# is invoked on the left of `||` or inside an `if` condition, so functions
# called that way must still check critical commands themselves.
set -e

# =============================================================================
# Configuration
#
# Every setting can be overridden from the environment (for example in the
# crontab entry); when a variable is unset or empty the original hard-coded
# value is used.
# =============================================================================

# Service endpoints
HEALTH_URL="${HEALTH_URL:-https://mo.wooo.work/health}"
GOOGLE_DRIVE_URL="${GOOGLE_DRIVE_URL:-https://mo.wooo.work/api/test_drive_connection}"
HARBOR_URL="${HARBOR_URL:-http://127.0.0.1:5050/api/v2.0/health}"

# Docker containers
APP_CONTAINER="${APP_CONTAINER:-momo-pro-system}"
SCHEDULER_CONTAINER="${SCHEDULER_CONTAINER:-momo-scheduler}"

# Harbor directory (where `docker compose up -d` is run)
HARBOR_DIR="${HARBOR_DIR:-/home/wooo/devops/harbor/harbor}"

# Telegram settings. Supplying the bot token through the environment keeps the
# secret out of this file.
TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN:-<TELEGRAM_BOT_TOKEN>}"
TELEGRAM_CHAT_ID="${TELEGRAM_CHAT_ID:-5619078117}"

# Timeout in seconds (passed to curl --connect-timeout)
CURL_TIMEOUT="${CURL_TIMEOUT:-15}"

# Directory holding one "last alert" timestamp file per alert key.
# NOTE(review): the default lives under /tmp, so cooldown state is presumably
# lost whenever /tmp is cleaned; point STATE_DIR elsewhere if that matters.
STATE_DIR="${STATE_DIR:-/tmp/momo_monitor_state}"
mkdir -p "${STATE_DIR}"

# =============================================================================
# Function definitions
# =============================================================================

#######################################
# Print one timestamped log line.
# Arguments: $@ - message text; several arguments are joined with spaces
#            (previously everything after the first argument was dropped)
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout
#######################################
log() {
  # printf emits the message verbatim, with no option or escape processing.
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

#######################################
# Send a notification through the Telegram Bot API (best effort).
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID (read)
#            CURL_TIMEOUT (read, optional; connect timeout, default 15)
# Arguments: $1 - message text, interpreted by Telegram as HTML
# Outputs:   a warning line on stderr when the request fails
# Returns:   always 0, so a notification problem never aborts monitoring
#######################################
send_telegram() {
  local message="$1"

  # --data-urlencode is required: plain -d sends the text unencoded, so any
  # '&', '+' or '%' in the message corrupts the form body (for example the
  # "&&" inside the Google Drive instructions ends the text field early).
  # -f makes curl fail on an HTTP error status instead of reporting success,
  # and --max-time keeps a stalled connection from hanging the run.
  if ! curl -s -f \
    --connect-timeout "${CURL_TIMEOUT:-15}" --max-time 30 \
    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
    --data-urlencode "text=${message}" \
    --data-urlencode "parse_mode=HTML" \
    "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    > /dev/null 2>&1; then
    printf '[%s] [Telegram] ⚠️ 通知發送失敗\n' \
      "$(date '+%Y-%m-%d %H:%M:%S')" >&2
  fi

  return 0
}

#######################################
# Decide whether an alert may be sent, i.e. whether its cooldown has elapsed.
# Globals:   STATE_DIR (read)
# Arguments: $1 - alert key (name of the state file inside STATE_DIR)
#            $2 - cooldown in seconds (default 3600 = 1 hour)
# Returns:   0 if an alert should be sent, 1 if still inside the cooldown
#######################################
should_alert() {
  local key="$1"
  local cooldown="${2:-3600}"
  local state_file="${STATE_DIR}/${key}"
  local last_alert now

  # No state file means no alert has been recorded for this key.
  [[ -f "${state_file}" ]] || return 0

  last_alert=$(cat "${state_file}" 2>/dev/null) || last_alert=""

  # Only a plain integer may reach the arithmetic below. An empty file would
  # be an arithmetic syntax error, and arbitrary text would be evaluated as
  # an arithmetic expression. Treat unusable content as "never alerted".
  [[ "${last_alert}" =~ ^[0-9]+$ ]] || return 0
  [[ "${cooldown}" =~ ^[0-9]+$ ]] || cooldown=3600

  now=$(date +%s)
  # 10# forces base 10 so a value with leading zeros is not read as octal.
  if (( now - 10#${last_alert} < 10#${cooldown} )); then
    return 1
  fi
  return 0
}

#######################################
# Remember that an alert was just sent by storing the current epoch time.
# Globals:   STATE_DIR (read)
# Arguments: $1 - alert key (name of the state file inside STATE_DIR)
#######################################
record_alert() {
  local alert_key="$1"
  local stamp_file="${STATE_DIR}/${alert_key}"

  date +%s > "${stamp_file}"
}

#######################################
# Forget a previously recorded alert by deleting its state file.
# Globals:   STATE_DIR (read)
# Arguments: $1 - alert key (name of the state file inside STATE_DIR)
#######################################
clear_alert() {
  local alert_key="$1"
  local stamp_file="${STATE_DIR}/${alert_key}"

  rm -f "${stamp_file}"
}

# =============================================================================
# Monitor 1: Docker network health
# =============================================================================

#######################################
# Probe the public health endpoint; on failure diagnose and try to repair.
#   - container answers internally but not externally: restart the Docker
#     service, wait 60 s, probe again and report the outcome
#   - container does not answer internally either: restart both containers
# Globals:   HEALTH_URL, CURL_TIMEOUT, APP_CONTAINER, SCHEDULER_CONTAINER (read)
#            CURL_MAX_TIME (read, optional; total curl timeout, default 30)
# Returns:   0 when the endpoint answered HTTP 200 on the first probe,
#            1 otherwise (even if the repair succeeded)
#######################################
check_docker_network() {
  local max_time="${CURL_MAX_TIME:-30}"
  local http_code internal_check

  log "[Docker 網路] 開始檢查..."

  # Probe the external health endpoint. curl itself prints "000" through -w
  # when the transfer fails, so nothing may be appended on failure (appending
  # another "000" produced "000000"). --max-time bounds the whole request;
  # --connect-timeout only covers the connection phase.
  http_code=$(curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout "${CURL_TIMEOUT}" --max-time "${max_time}" \
    "${HEALTH_URL}" 2>/dev/null) || true
  [[ "${http_code}" =~ ^[0-9]{3}$ ]] || http_code="000"

  if [[ "${http_code}" == "200" ]]; then
    log "[Docker 網路] ✅ 正常 (HTTP ${http_code})"
    clear_alert "docker_network"
    return 0
  fi

  log "[Docker 網路] ❌ 異常 (HTTP ${http_code})"

  # Diagnosis: ask the service from inside the application container.
  internal_check=$(docker exec "${APP_CONTAINER}" \
    curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout 5 --max-time 10 \
    http://127.0.0.1:80/health 2>/dev/null) || true
  [[ "${internal_check}" =~ ^[0-9]{3}$ ]] || internal_check="000"

  if [[ "${internal_check}" == "200" ]]; then
    # Healthy inside the container but unreachable from outside, which points
    # at Docker's network forwarding.
    log "[Docker 網路] 診斷: 容器內部正常,Docker 網路轉發失效"

    if should_alert "docker_network" 1800; then
      send_telegram "🔴 <b>Docker 網路轉發失效</b>

症狀: mo.wooo.work 無法存取 (HTTP ${http_code})
診斷: 容器內部正常,端口映射失效
動作: 正在自動重啟 Docker 服務...
時間: $(date '+%Y-%m-%d %H:%M:%S')"
      record_alert "docker_network"
    fi

    # Automatic repair: restart the Docker service.
    # NOTE(review): only the alert above is rate-limited; this restart and
    # the result message below run on every failing invocation.
    log "[Docker 網路] 執行自動修復: 重啟 Docker 服務"
    if ! sudo systemctl restart docker; then
      log "[Docker 網路] ⚠️ Docker 服務重啟指令失敗"
    fi
    sleep 60

    # Verify the repair.
    http_code=$(curl -s -o /dev/null -w "%{http_code}" \
      --connect-timeout "${CURL_TIMEOUT}" --max-time "${max_time}" \
      "${HEALTH_URL}" 2>/dev/null) || true
    [[ "${http_code}" =~ ^[0-9]{3}$ ]] || http_code="000"

    if [[ "${http_code}" == "200" ]]; then
      send_telegram "🟢 <b>Docker 網路已恢復</b>

動作: Docker 服務重啟成功
時間: $(date '+%Y-%m-%d %H:%M:%S')"
      clear_alert "docker_network"
    else
      send_telegram "🔴 <b>Docker 網路修復失敗</b>

狀態: 重啟後服務仍未恢復
需要: 人工介入處理
SSH: ssh wooo@192.168.0.110"
    fi
  else
    # The service is unhealthy inside the container as well.
    log "[Docker 網路] 診斷: 容器內部也異常,可能是應用問題"

    if should_alert "docker_network" 1800; then
      send_telegram "🔴 <b>MOMO 應用服務異常</b>

症狀: mo.wooo.work 無法存取 (HTTP ${http_code})
診斷: 容器內部服務也異常
動作: 正在重啟容器...
時間: $(date '+%Y-%m-%d %H:%M:%S')"
      record_alert "docker_network"
    fi

    # Try restarting the application and scheduler containers.
    if ! docker restart "${APP_CONTAINER}" "${SCHEDULER_CONTAINER}"; then
      log "[Docker 網路] ⚠️ 容器重啟指令失敗"
    fi
    sleep 30
  fi

  return 1
}

# =============================================================================
# Monitor 2: Harbor Registry service
# =============================================================================

#######################################
# Probe the Harbor health endpoint; start Harbor when its container is down.
#   - endpoint unhealthy and no "harbor-core" container: run
#     `docker compose up -d` in HARBOR_DIR, then probe again and report
#   - endpoint unhealthy but "harbor-core" is running: send an alert only
# Globals:   HARBOR_URL, HARBOR_DIR, CURL_TIMEOUT (read)
#            CURL_MAX_TIME (read, optional; total curl timeout, default 30)
# Returns:   0 when the endpoint answered HTTP 200 on the first probe,
#            1 otherwise (even if the repair succeeded)
#######################################
check_harbor() {
  local max_time="${CURL_MAX_TIME:-30}"
  local http_code

  log "[Harbor] 開始檢查..."

  # Probe Harbor's health endpoint. curl prints "000" itself when the
  # transfer fails, so nothing is appended on failure; --max-time bounds the
  # whole request, not just the connection phase.
  http_code=$(curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout "${CURL_TIMEOUT}" --max-time "${max_time}" \
    "${HARBOR_URL}" 2>/dev/null) || true
  [[ "${http_code}" =~ ^[0-9]{3}$ ]] || http_code="000"

  if [[ "${http_code}" == "200" ]]; then
    log "[Harbor] ✅ 正常 (HTTP ${http_code})"
    clear_alert "harbor"
    return 0
  fi

  log "[Harbor] ❌ 異常 (HTTP ${http_code})"

  # Is the Harbor core container running at all?
  if docker ps --format '{{.Names}}' | grep -qx 'harbor-core'; then
    # Running but unhealthy: nothing is restarted automatically, but the
    # failure must not go unreported.
    log "[Harbor] 診斷: Harbor 容器運行中,但健康檢查失敗"

    if should_alert "harbor" 1800; then
      send_telegram "🔴 <b>Harbor Registry 健康檢查失敗</b>

症狀: Harbor 容器運行中,但健康端點異常 (HTTP ${http_code})
影響: K8s Pods 可能無法拉取映像
需要: 人工介入處理
SSH: ssh wooo@192.168.0.110
時間: $(date '+%Y-%m-%d %H:%M:%S')"
      record_alert "harbor"
    fi
    return 1
  fi

  log "[Harbor] 診斷: Harbor 容器未運行"

  if should_alert "harbor" 1800; then
    send_telegram "🔴 <b>Harbor Registry 服務停止</b>

症狀: Harbor 容器未運行
影響: K8s Pods 將無法拉取映像
動作: 正在自動啟動 Harbor...
時間: $(date '+%Y-%m-%d %H:%M:%S')"
    record_alert "harbor"
  fi

  # Automatic repair: start Harbor. The subshell keeps the directory change
  # away from later checks, and compose only runs when the cd succeeded
  # (set -e gives no protection here, see the call site in main).
  log "[Harbor] 執行自動修復: 啟動 Harbor"
  if (cd "${HARBOR_DIR}" && docker compose up -d); then
    sleep 60
  else
    log "[Harbor] ⚠️ 無法在 ${HARBOR_DIR} 啟動 Harbor"
  fi

  # Verify the repair.
  http_code=$(curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout "${CURL_TIMEOUT}" --max-time "${max_time}" \
    "${HARBOR_URL}" 2>/dev/null) || true
  [[ "${http_code}" =~ ^[0-9]{3}$ ]] || http_code="000"

  if [[ "${http_code}" == "200" ]]; then
    send_telegram "🟢 <b>Harbor Registry 已恢復</b>

動作: Docker Compose 啟動成功
時間: $(date '+%Y-%m-%d %H:%M:%S')

⚠️ K8s Pods 可能需要重啟:
<code>sudo kubectl rollout restart deployment/momo-app deployment/momo-scheduler -n momo</code>"
    clear_alert "harbor"
  else
    send_telegram "🔴 <b>Harbor 啟動失敗</b>

需要: 人工介入處理
SSH: ssh wooo@192.168.0.110
目錄: ${HARBOR_DIR}"
  fi

  return 1
}

# =============================================================================
# Monitor 3: Google Drive authentication
# =============================================================================

#######################################
# Ask the application to test its Google Drive connection and alert when the
# answer is not a success. Nothing is repaired automatically.
# Globals:   GOOGLE_DRIVE_URL, CURL_TIMEOUT (read)
#            CURL_MAX_TIME (read, optional; total curl timeout, default 30)
# Returns:   0 when the response reports success, 1 otherwise
#######################################
check_google_drive() {
  local response

  log "[Google Drive] 開始檢查..."

  # Test the Google Drive API connection through the application endpoint.
  # A failed request leaves the response empty, which counts as a failure.
  response=$(curl -s \
    --connect-timeout "${CURL_TIMEOUT}" --max-time "${CURL_MAX_TIME:-30}" \
    -X POST "${GOOGLE_DRIVE_URL}" \
    -H "Content-Type: application/json" \
    -d '{}' 2>/dev/null) || response=""

  # Accept optional whitespace around the colon: JSON serializers commonly
  # emit `"success": true`, which the exact string "success":true misses.
  if grep -Eq '"success"[[:space:]]*:[[:space:]]*true' <<< "${response}"; then
    log "[Google Drive] ✅ 認證正常"
    clear_alert "google_drive"
    return 0
  fi

  log "[Google Drive] ❌ 認證失敗"

  # Google Drive authentication cannot be repaired automatically, so only an
  # alert is sent. The message is parsed by Telegram as HTML, hence the
  # literal "&&" of the shell command is written as "&amp;&amp;".
  if should_alert "google_drive" 3600; then
    send_telegram "🔴 <b>Google Drive 認證失敗</b>

症狀: OAuth Token 已過期或被撤銷
影響: 當日業績自動匯入功能停止

<b>需要人工處理:</b>
1. 在本機重新認證:
<code>cd /Users/ogt/momo-pro-system
rm -f config/google_token.pickle
python3 -c \"from services.google_drive_service import drive_service; drive_service.authenticate()\"</code>

2. 更新 Docker 容器:
<code>scp config/google_*.* wooo@192.168.0.110:/tmp/
ssh wooo@192.168.0.110 'docker cp /tmp/google_credentials.json momo-pro-system:/app/config/ &amp;&amp; docker cp /tmp/google_token.pickle momo-pro-system:/app/config/ &amp;&amp; docker restart momo-pro-system momo-scheduler'</code>

時間: $(date '+%Y-%m-%d %H:%M:%S')"
    record_alert "google_drive"
  fi

  return 1
}

# =============================================================================
# Main program
# =============================================================================

#######################################
# Run every health check in order and log an overall summary line.
# A failing check never stops the remaining checks from running.
# Arguments: none are used
#######################################
main() {
  local unhealthy=0
  local check

  log "========== 開始系統健康檢查 =========="

  # Run all checks; a non-zero status only flips the summary flag.
  for check in check_docker_network check_harbor check_google_drive; do
    "${check}" || unhealthy=1
  done

  if (( unhealthy )); then
    log "========== 部分服務異常,已嘗試修復或發送告警 =========="
  else
    log "========== 所有服務正常 =========="
  fi
}

# Run the main program.
main "$@"