#!/bin/bash
# =============================================================================
# MOMO Pro System 統一健康監控腳本
#
# 功能:整合所有監控項目
# 1. Docker 網路健康(502 Bad Gateway 預防)
# 2. Harbor Registry 服務(K8s ImagePullBackOff 預防)
# 3. Google Drive 認證(自動匯入功能)
# 4. 應用服務健康檢查
#
# 用法:
# 每 5 分鐘執行一次(透過 cron)
# */5 * * * * /home/wooo/scripts/system_health_monitor.sh >> /var/log/system_health_monitor.log 2>&1
#
# 作者:Claude Code
# 日期:2026-01-28
# =============================================================================
# Abort on unhandled command failures in top-level code.
set -e
# =============================================================================
# Configuration
# Each setting can be overridden with an environment variable of the same
# name; when that variable is unset or empty, the built-in default is used.
# =============================================================================
# Service endpoints
HEALTH_URL="${HEALTH_URL:-https://mo.wooo.work/health}"
GOOGLE_DRIVE_URL="${GOOGLE_DRIVE_URL:-https://mo.wooo.work/api/test_drive_connection}"
HARBOR_URL="${HARBOR_URL:-http://127.0.0.1:5050/api/v2.0/health}"
# Docker containers
APP_CONTAINER="${APP_CONTAINER:-momo-pro-system}"
SCHEDULER_CONTAINER="${SCHEDULER_CONTAINER:-momo-scheduler}"
# Harbor directory (docker compose is started from here)
HARBOR_DIR="${HARBOR_DIR:-/home/wooo/devops/harbor/harbor}"
# Telegram settings
# NOTE(review): the bot token below is a credential stored in plain text in
# this script. It is kept only as a fallback so existing deployments keep
# working; rotate it and supply TELEGRAM_BOT_TOKEN from the environment.
TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN:-8075645931:AAH-EGKMo8ZC4QJs-Nc1_0s92xHrGdQvdpg}"
TELEGRAM_CHAT_ID="${TELEGRAM_CHAT_ID:-5619078117}"
# curl timeout (seconds)
CURL_TIMEOUT="${CURL_TIMEOUT:-15}"
# Directory holding one timestamp file per alert key (alert cooldown state).
# NOTE(review): the default lives under world-writable /tmp with a predictable
# name; point STATE_DIR at a directory owned by the service user if possible.
STATE_DIR="${STATE_DIR:-/tmp/momo_monitor_state}"
mkdir -p "${STATE_DIR}"
# =============================================================================
# 函數定義
# =============================================================================
# Write one log line to stdout, prefixed with the local time as
# "[YYYY-MM-DD HH:MM:SS] ".
# Arguments: $1 - message text
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1"
}
# Send a message to the configured Telegram chat (best effort).
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID (read)
#            CURL_TIMEOUT (read; 15 is used when it is unset or empty)
# Arguments: $1 - message text, sent with parse_mode=HTML; may be multi-line
# Returns:   always 0, so a failed notification never aborts the monitor run
send_telegram() {
  local message="$1"
  local timeout="${CURL_TIMEOUT:-15}"
  # --data-urlencode percent-encodes each value. Plain -d sends it verbatim,
  # so a message containing '&', '+' or '%' was split or altered in transit.
  # --max-time bounds the whole request so a stalled API call cannot hang the run.
  curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    --max-time "$((timeout * 2))" \
    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
    --data-urlencode "text=${message}" \
    --data-urlencode "parse_mode=HTML" \
    > /dev/null 2>&1 || true
}
# Decide whether an alert for a key may be sent now, i.e. whether the key is
# outside its cooldown window.
# Globals:   STATE_DIR (read)
# Arguments: $1 - alert key (name of the state file under STATE_DIR)
#            $2 - cooldown in seconds (default 3600 = 1 hour)
# Returns:   0 when an alert may be sent, 1 while still inside the cooldown
should_alert() {
  local key="$1"
  local cooldown="${2:-3600}"
  local state_file="${STATE_DIR}/${key}"
  local last_alert now

  # No state file means no alert has been recorded for this key.
  [ -f "${state_file}" ] || return 0

  last_alert=$(cat "${state_file}" 2>/dev/null) || last_alert=0

  # Only plain decimal digits may reach the arithmetic below: bash evaluates
  # variable contents inside $(( )) as an expression, so other content could
  # abort the run with a syntax error or even execute embedded commands.
  # An unusable record is treated as "no previous alert".
  case "${last_alert}" in
    '' | *[!0-9]*) return 0 ;;
  esac

  now=$(date +%s)
  # 10# forces base 10 so a value with leading zeros is not read as octal.
  if [ "$((now - 10#${last_alert}))" -lt "${cooldown}" ]; then
    return 1
  fi
  return 0
}
# Store the current epoch time as the last-alert timestamp for a key.
# Globals:   STATE_DIR (read)
# Arguments: $1 - alert key (name of the state file under STATE_DIR)
record_alert() {
  local stamp_path="${STATE_DIR}/$1"
  date +%s > "${stamp_path}"
}
# Remove the stored last-alert timestamp for a key; a missing file is not an error.
# Globals:   STATE_DIR (read)
# Arguments: $1 - alert key (name of the state file under STATE_DIR)
clear_alert() {
  local stamp_path="${STATE_DIR}/$1"
  rm -f "${stamp_path}"
}
# =============================================================================
# 監控項目 1: Docker 網路健康
# =============================================================================
# Check the public health endpoint and attempt self-healing when it fails.
#
# Flow:
#   1. GET HEALTH_URL. HTTP 200 clears the "docker_network" alert state.
#   2. Otherwise request /health from inside APP_CONTAINER via `docker exec`:
#      - inside answers 200 -> restart the Docker service, wait 60s, re-check
#        and report the outcome on Telegram;
#      - inside also fails  -> restart both containers and wait 30s.
#   The initial failure alert honours a 1800s cooldown; the repair actions and
#   the post-repair notifications run on every failing invocation.
#
# Globals:   HEALTH_URL, CURL_TIMEOUT, APP_CONTAINER, SCHEDULER_CONTAINER (read)
# Returns:   0 when the first probe returns HTTP 200; 1 otherwise, even when a
#            repair subsequently succeeded
check_docker_network() {
  log "[Docker 網路] 開始檢查..."

  # Probe the external health endpoint.
  # On a transport failure curl itself prints "000" for %{http_code}; the
  # fallback assignment replaces the value instead of appending a second
  # "000" to it. --max-time bounds the whole request so a stalled response
  # cannot hang the run.
  local http_code
  http_code=$(curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout "${CURL_TIMEOUT}" --max-time "$((CURL_TIMEOUT * 2))" \
    "${HEALTH_URL}" 2>/dev/null) || http_code="000"

  if [ "${http_code}" = "200" ]; then
    log "[Docker 網路] ✅ 正常 (HTTP ${http_code})"
    clear_alert "docker_network"
    return 0
  fi
  log "[Docker 網路] ❌ 異常 (HTTP ${http_code})"

  # Diagnose: ask the application from inside its own container.
  local internal_check
  internal_check=$(docker exec "${APP_CONTAINER}" curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout 5 --max-time 10 http://127.0.0.1:80/health 2>/dev/null) \
    || internal_check="000"

  if [ "${internal_check}" = "200" ]; then
    # Healthy inside the container but unreachable from outside.
    log "[Docker 網路] 診斷: 容器內部正常,Docker 網路轉發失效"
    if should_alert "docker_network" 1800; then
      send_telegram "🔴 Docker 網路轉發失效
症狀: mo.wooo.work 無法存取 (HTTP ${http_code})
診斷: 容器內部正常,端口映射失效
動作: 正在自動重啟 Docker 服務...
時間: $(date '+%Y-%m-%d %H:%M:%S')"
      record_alert "docker_network"
    fi

    # Automatic repair: restart the Docker service.
    log "[Docker 網路] 執行自動修復: 重啟 Docker 服務"
    sudo systemctl restart docker
    sleep 60

    # Verify the repair.
    http_code=$(curl -s -o /dev/null -w "%{http_code}" \
      --connect-timeout "${CURL_TIMEOUT}" --max-time "$((CURL_TIMEOUT * 2))" \
      "${HEALTH_URL}" 2>/dev/null) || http_code="000"
    if [ "${http_code}" = "200" ]; then
      send_telegram "🟢 Docker 網路已恢復
動作: Docker 服務重啟成功
時間: $(date '+%Y-%m-%d %H:%M:%S')"
      clear_alert "docker_network"
    else
      send_telegram "🔴 Docker 網路修復失敗
狀態: 重啟後服務仍未恢復
需要: 人工介入處理
SSH: ssh wooo@192.168.0.110"
    fi
  else
    # The application does not answer inside the container either.
    log "[Docker 網路] 診斷: 容器內部也異常,可能是應用問題"
    if should_alert "docker_network" 1800; then
      send_telegram "🔴 MOMO 應用服務異常
症狀: mo.wooo.work 無法存取 (HTTP ${http_code})
診斷: 容器內部服務也異常
動作: 正在重啟容器...
時間: $(date '+%Y-%m-%d %H:%M:%S')"
      record_alert "docker_network"
    fi

    # Try restarting the application containers.
    docker restart "${APP_CONTAINER}" "${SCHEDULER_CONTAINER}"
    sleep 30
  fi
  return 1
}
# =============================================================================
# 監控項目 2: Harbor Registry 服務
# =============================================================================
# Check the Harbor health endpoint and try to start Harbor when it is down.
#
# Flow:
#   1. GET HARBOR_URL. HTTP 200 clears the "harbor" alert state.
#   2. Otherwise look for a running container named exactly "harbor-core":
#      - running     -> nothing to start; alert (1800s cooldown) so the
#                       failure is not silent;
#      - not running -> alert (1800s cooldown), run `docker compose up -d`
#                       in HARBOR_DIR, wait 60s, re-check and report.
#
# Globals:   HARBOR_URL, CURL_TIMEOUT, HARBOR_DIR (read)
# Returns:   0 when the first probe returns HTTP 200; 1 otherwise, even when a
#            repair subsequently succeeded
check_harbor() {
  log "[Harbor] 開始檢查..."

  # Probe the Harbor health endpoint.
  # On a transport failure curl itself prints "000" for %{http_code}; the
  # fallback assignment replaces the value instead of appending a second
  # "000" to it. --max-time bounds the whole request.
  local http_code
  http_code=$(curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout "${CURL_TIMEOUT}" --max-time "$((CURL_TIMEOUT * 2))" \
    "${HARBOR_URL}" 2>/dev/null) || http_code="000"

  if [ "${http_code}" = "200" ]; then
    log "[Harbor] ✅ 正常 (HTTP ${http_code})"
    clear_alert "harbor"
    return 0
  fi
  log "[Harbor] ❌ 異常 (HTTP ${http_code})"

  # Container is up but the endpoint is unhealthy: report it, since there is
  # no automatic repair for this case.
  if docker ps --format '{{.Names}}' | grep -q "^harbor-core$"; then
    log "[Harbor] 診斷: Harbor 容器運行中,但健康檢查失敗"
    if should_alert "harbor" 1800; then
      send_telegram "🔴 Harbor Registry 健康檢查失敗
症狀: Harbor 容器運行中,但健康端點異常 (HTTP ${http_code})
影響: K8s Pods 可能無法拉取映像
需要: 人工介入處理
時間: $(date '+%Y-%m-%d %H:%M:%S')"
      record_alert "harbor"
    fi
    return 1
  fi

  log "[Harbor] 診斷: Harbor 容器未運行"
  if should_alert "harbor" 1800; then
    send_telegram "🔴 Harbor Registry 服務停止
症狀: Harbor 容器未運行
影響: K8s Pods 將無法拉取映像
動作: 正在自動啟動 Harbor...
時間: $(date '+%Y-%m-%d %H:%M:%S')"
    record_alert "harbor"
  fi

  # Automatic repair: start Harbor.
  # The subshell keeps the caller's working directory unchanged, and `&&`
  # prevents compose from running elsewhere when HARBOR_DIR is not accessible.
  log "[Harbor] 執行自動修復: 啟動 Harbor"
  if ! (cd "${HARBOR_DIR}" && docker compose up -d); then
    log "[Harbor] 自動修復: docker compose 啟動失敗 (目錄: ${HARBOR_DIR})"
  fi
  sleep 60

  # Verify the repair.
  http_code=$(curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout "${CURL_TIMEOUT}" --max-time "$((CURL_TIMEOUT * 2))" \
    "${HARBOR_URL}" 2>/dev/null) || http_code="000"
  if [ "${http_code}" = "200" ]; then
    send_telegram "🟢 Harbor Registry 已恢復
動作: Docker Compose 啟動成功
時間: $(date '+%Y-%m-%d %H:%M:%S')
⚠️ K8s Pods 可能需要重啟:
sudo kubectl rollout restart deployment/momo-app deployment/momo-scheduler -n momo"
    clear_alert "harbor"
  else
    send_telegram "🔴 Harbor 啟動失敗
需要: 人工介入處理
SSH: ssh wooo@192.168.0.110
目錄: ${HARBOR_DIR}"
  fi
  return 1
}
# =============================================================================
# 監控項目 3: Google Drive 認證
# =============================================================================
# Check Google Drive authentication through the application's test endpoint.
# There is no automatic repair for this check; on failure it only sends a
# Telegram alert (3600s cooldown) with manual re-authentication steps.
#
# Globals:   GOOGLE_DRIVE_URL, CURL_TIMEOUT (read)
# Returns:   0 when the endpoint reports "success": true, 1 otherwise
check_google_drive() {
  log "[Google Drive] 開始檢查..."

  # Test the Google Drive API connection.
  # When curl fails, the fallback assignment discards any partial output
  # instead of appending to it, so a failed request can never match below.
  # --max-time bounds the whole request so a stalled response cannot hang the run.
  local response
  response=$(curl -s --connect-timeout "${CURL_TIMEOUT}" --max-time "$((CURL_TIMEOUT * 2))" \
    -X POST "${GOOGLE_DRIVE_URL}" \
    -H "Content-Type: application/json" \
    -d '{}' 2>/dev/null) || response='{"success":false}'

  # Accept optional whitespace around the colon so both compact and
  # pretty-printed JSON are recognised.
  if printf '%s\n' "${response}" | grep -Eq '"success"[[:space:]]*:[[:space:]]*true'; then
    log "[Google Drive] ✅ 認證正常"
    clear_alert "google_drive"
    return 0
  fi
  log "[Google Drive] ❌ 認證失敗"

  # Authentication cannot be repaired automatically; alert only.
  if should_alert "google_drive" 3600; then
    send_telegram "🔴 Google Drive 認證失敗
症狀: OAuth Token 已過期或被撤銷
影響: 當日業績自動匯入功能停止
需要人工處理:
1. 在本機重新認證:
cd /Users/ogt/momo-pro-system
rm -f config/google_token.pickle
python3 -c \"from services.google_drive_service import drive_service; drive_service.authenticate()\"
2. 更新 Docker 容器:
scp config/google_*.* wooo@192.168.0.110:/tmp/
ssh wooo@192.168.0.110 'docker cp /tmp/google_credentials.json momo-pro-system:/app/config/ && docker cp /tmp/google_token.pickle momo-pro-system:/app/config/ && docker restart momo-pro-system momo-scheduler'
時間: $(date '+%Y-%m-%d %H:%M:%S')"
    record_alert "google_drive"
  fi
  return 1
}
# =============================================================================
# 主程式
# =============================================================================
# Run every health check in order and log an overall summary line.
# A failing check never stops the remaining checks from running.
main() {
  log "========== 開始系統健康檢查 =========="

  local failures=0
  local check
  for check in check_docker_network check_harbor check_google_drive; do
    if ! "${check}"; then
      failures=$((failures + 1))
    fi
  done

  if [ "${failures}" -eq 0 ]; then
    log "========== 所有服務正常 =========="
  else
    log "========== 部分服務異常,已嘗試修復或發送告警 =========="
  fi
}
# Entry point: run main, forwarding the script's command-line arguments.
main "$@"