Files
ewoooc/scripts/system_health_monitor.sh
ogt 1b4f3a7bbe
Some checks failed
CD Pipeline / deploy (push) Failing after 59s
feat: EwoooC 初始化 — 完整專案推版至 Gitea
- 建立 Gitea Actions CD pipeline (.gitea/workflows/cd.yaml)
- 部署模式: rsync Python 檔案至 188 → docker restart (volume mount)
- Dockerfile/requirements 變動時自動重建 Docker image
- 部署通知: Telegram (開始/成功/失敗)
- 健康檢查: https://mo.wooo.work/health (最多 5 次重試)
- 同步最新 CLAUDE.md / ADR-008 / memory (2026-04-19)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-19 01:21:13 +08:00

302 lines
9.3 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# =============================================================================
# MOMO Pro System 統一健康監控腳本
#
# 功能:整合所有監控項目
# 1. Docker 網路健康 (502 Bad Gateway 預防)
# 2. Harbor Registry 服務 (K8s ImagePullBackOff 預防)
# 3. Google Drive 認證 (自動匯入功能)
# 4. 應用服務健康檢查
#
# 用法:
# 每 5 分鐘執行一次 (透過 cron)
# */5 * * * * /home/wooo/scripts/system_health_monitor.sh >> /var/log/system_health_monitor.log 2>&1
#
# 作者: Claude Code
# 日期: 2026-01-28
# =============================================================================
# Exit immediately when a command fails outside of a tested context.
set -e
# =============================================================================
# Configuration
#
# Every setting may be overridden through an environment variable of the same
# name; the defaults below are the values that used to be hard-coded.
# =============================================================================
# Service endpoints
HEALTH_URL="${HEALTH_URL:-https://mo.wooo.work/health}"
GOOGLE_DRIVE_URL="${GOOGLE_DRIVE_URL:-https://mo.wooo.work/api/test_drive_connection}"
HARBOR_URL="${HARBOR_URL:-http://127.0.0.1:5050/api/v2.0/health}"
# Docker containers
APP_CONTAINER="${APP_CONTAINER:-momo-pro-system}"
SCHEDULER_CONTAINER="${SCHEDULER_CONTAINER:-momo-scheduler}"
# Harbor installation directory (where its compose file lives)
HARBOR_DIR="${HARBOR_DIR:-/home/wooo/devops/harbor/harbor}"
# Telegram settings
# NOTE(review): the default bot token is a live credential committed to the
# repository. Rotate it and supply TELEGRAM_BOT_TOKEN from the environment
# (e.g. the crontab or a root-only env file), then drop the default here.
TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN:-8075645931:AAH-EGKMo8ZC4QJs-Nc1_0s92xHrGdQvdpg}"
TELEGRAM_CHAT_ID="${TELEGRAM_CHAT_ID:-5619078117}"
# Timeout for HTTP probes (seconds)
CURL_TIMEOUT="${CURL_TIMEOUT:-15}"
# Directory holding one "last alert" timestamp file per alert key
# NOTE(review): the default is a predictable, world-writable location;
# consider a directory owned by the monitoring user.
STATE_DIR="${STATE_DIR:-/tmp/momo_monitor_state}"
mkdir -p "${STATE_DIR}"
# =============================================================================
# Function definitions
# =============================================================================
# Write a message to stdout, prefixed with a "[YYYY-MM-DD HH:MM:SS]" timestamp.
# Arguments: $1 - message text
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1"
}
# Send a message to the configured Telegram chat (best effort).
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID (read)
#            CURL_TIMEOUT (read; falls back to 15 seconds when unset)
# Arguments: $1 - message text; sent with parse_mode=HTML
# Returns:   always 0 - delivery failures are deliberately ignored so that a
#            notification problem never aborts the monitoring run
send_telegram() {
  local message="$1"
  # --data-urlencode rather than -d: -d transmits the value verbatim, so a
  # literal '&', '+' or '%' in the message would corrupt the form body (an
  # '&' starts a new field and silently truncates the text).
  # --max-time bounds the whole request so a stalled connection cannot hang
  # the run.
  curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    --max-time "${CURL_TIMEOUT:-15}" \
    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
    --data-urlencode "text=${message}" \
    --data-urlencode "parse_mode=HTML" \
    > /dev/null 2>&1 || true
}
# Decide whether an alert for the given key may be sent now, i.e. whether the
# cooldown since the last recorded alert has elapsed.
# Globals:   STATE_DIR (read)
# Arguments: $1 - alert key (name of the state file inside STATE_DIR)
#            $2 - cooldown in seconds (default: 3600 = 1 hour)
# Returns:   0 when an alert should be sent, 1 while still cooling down
should_alert() {
  local key="$1"
  local cooldown="${2:-3600}"
  local state_file="${STATE_DIR}/${key}"
  local last_alert now

  # No previous alert recorded: nothing to throttle.
  [[ -f "${state_file}" ]] || return 0

  last_alert="$(cat -- "${state_file}" 2>/dev/null)" || last_alert=""
  # An empty or corrupted state file would otherwise turn the subtraction
  # below into an arithmetic syntax error; treat it as "never alerted".
  [[ "${last_alert}" =~ ^[0-9]+$ ]] || return 0

  now="$(date +%s)"
  # 10# forces base 10 so a value with a leading zero is not read as octal.
  if (( now - 10#${last_alert} < cooldown )); then
    return 1
  fi
  return 0
}
# Remember that an alert for the given key was sent just now by storing the
# current epoch time in the key's state file.
# Globals:   STATE_DIR (read)
# Arguments: $1 - alert key
record_alert() {
  local key="$1"
  local stamp_file="${STATE_DIR}/${key}"
  date '+%s' > "${stamp_file}"
}
# Forget the last alert for the given key by deleting its state file; a
# missing file is not an error.
# Globals:   STATE_DIR (read)
# Arguments: $1 - alert key
clear_alert() {
  local key="$1"
  local stamp_file="${STATE_DIR}/${key}"
  rm -f "${stamp_file}"
}
# =============================================================================
# Check 1: Docker network health
# =============================================================================
# Print the HTTP status code returned by HEALTH_URL, or "000" when no response
# was received.
# Globals: CURL_TIMEOUT, HEALTH_URL (read)
_docker_network_http_code() {
  local code
  # curl already prints "000" via -w when the request fails, so its exit
  # status is ignored instead of appending a second "000" to the output.
  # --max-time bounds the whole transfer; --connect-timeout alone would let a
  # stalled response hang the run.
  code="$(curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout "${CURL_TIMEOUT}" --max-time "$((CURL_TIMEOUT * 4))" \
    "${HEALTH_URL}" 2>/dev/null)" || true
  printf '%s' "${code:-000}"
}

# Probe the external health endpoint and try to repair the service when it
# does not answer with HTTP 200.
#   - In-container probe answers 200 -> restart the Docker service, wait 60 s,
#     probe again and report the outcome via Telegram.
#   - In-container probe fails too   -> restart the app and scheduler
#     containers and wait 30 s.
# Globals: APP_CONTAINER, SCHEDULER_CONTAINER, CURL_TIMEOUT, HEALTH_URL (read)
# Returns: 0 when the first probe returns 200, 1 otherwise (even when the
#          repair succeeded)
check_docker_network() {
  log "[Docker 網路] 開始檢查..."

  # Probe the external health endpoint.
  local http_code
  http_code="$(_docker_network_http_code)"
  if [[ "${http_code}" == "200" ]]; then
    log "[Docker 網路] ✅ 正常 (HTTP ${http_code})"
    clear_alert "docker_network"
    return 0
  fi
  log "[Docker 網路] ❌ 異常 (HTTP ${http_code})"

  # Diagnose: probe the application from inside its container. An empty
  # result (e.g. docker exec itself failed) counts as "000".
  local internal_check
  internal_check="$(docker exec "${APP_CONTAINER}" \
    curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout 5 --max-time 10 \
    http://127.0.0.1:80/health 2>/dev/null)" || true
  internal_check="${internal_check:-000}"

  if [[ "${internal_check}" == "200" ]]; then
    # Healthy inside the container but unreachable from outside:
    # treated as a Docker networking problem.
    log "[Docker 網路] 診斷: 容器內部正常Docker 網路轉發失效"
    if should_alert "docker_network" 1800; then
      send_telegram "🔴 <b>Docker 網路轉發失效</b>
症狀: mo.wooo.work 無法存取 (HTTP ${http_code})
診斷: 容器內部正常,端口映射失效
動作: 正在自動重啟 Docker 服務...
時間: $(date '+%Y-%m-%d %H:%M:%S')"
      record_alert "docker_network"
    fi

    # Automatic repair: restart the Docker service.
    log "[Docker 網路] 執行自動修復: 重啟 Docker 服務"
    sudo systemctl restart docker
    sleep 60

    # Verify the repair.
    http_code="$(_docker_network_http_code)"
    if [[ "${http_code}" == "200" ]]; then
      send_telegram "🟢 <b>Docker 網路已恢復</b>
動作: Docker 服務重啟成功
時間: $(date '+%Y-%m-%d %H:%M:%S')"
      clear_alert "docker_network"
    else
      send_telegram "🔴 <b>Docker 網路修復失敗</b>
狀態: 重啟後服務仍未恢復
需要: 人工介入處理
SSH: ssh wooo@192.168.0.110"
    fi
  else
    # The in-container probe failed as well.
    log "[Docker 網路] 診斷: 容器內部也異常,可能是應用問題"
    if should_alert "docker_network" 1800; then
      send_telegram "🔴 <b>MOMO 應用服務異常</b>
症狀: mo.wooo.work 無法存取 (HTTP ${http_code})
診斷: 容器內部服務也異常
動作: 正在重啟容器...
時間: $(date '+%Y-%m-%d %H:%M:%S')"
      record_alert "docker_network"
    fi

    # Try restarting the containers.
    docker restart "${APP_CONTAINER}" "${SCHEDULER_CONTAINER}"
    sleep 30
  fi
  return 1
}
# =============================================================================
# Check 2: Harbor Registry service
# =============================================================================
# Print the HTTP status code returned by HARBOR_URL, or "000" when no response
# was received.
# Globals: CURL_TIMEOUT, HARBOR_URL (read)
_harbor_http_code() {
  local code
  # curl already prints "000" via -w when the request fails, so its exit
  # status is ignored instead of appending a second "000" to the output.
  code="$(curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout "${CURL_TIMEOUT}" --max-time "$((CURL_TIMEOUT * 4))" \
    "${HARBOR_URL}" 2>/dev/null)" || true
  printf '%s' "${code:-000}"
}

# Probe the Harbor health endpoint. When it is unhealthy:
#   - harbor-core container not running -> alert, start Harbor with
#     "docker compose up -d" from HARBOR_DIR, wait 60 s, probe again and
#     report the outcome via Telegram;
#   - harbor-core container running     -> alert only (no automatic repair).
# Globals: CURL_TIMEOUT, HARBOR_URL, HARBOR_DIR (read)
# Returns: 0 when the first probe returns 200, 1 otherwise (even when the
#          repair succeeded)
check_harbor() {
  log "[Harbor] 開始檢查..."

  # Probe the Harbor health endpoint.
  local http_code
  http_code="$(_harbor_http_code)"
  if [[ "${http_code}" == "200" ]]; then
    log "[Harbor] ✅ 正常 (HTTP ${http_code})"
    clear_alert "harbor"
    return 0
  fi
  log "[Harbor] ❌ 異常 (HTTP ${http_code})"

  # Is the harbor-core container running? A failing "docker ps" yields an
  # empty list and therefore counts as "not running".
  local running
  running="$(docker ps --format '{{.Names}}' 2>/dev/null)" || running=""
  if ! grep -qx -- 'harbor-core' <<< "${running}"; then
    log "[Harbor] 診斷: Harbor 容器未運行"
    if should_alert "harbor" 1800; then
      send_telegram "🔴 <b>Harbor Registry 服務停止</b>
症狀: Harbor 容器未運行
影響: K8s Pods 將無法拉取映像
動作: 正在自動啟動 Harbor...
時間: $(date '+%Y-%m-%d %H:%M:%S')"
      record_alert "harbor"
    fi

    # Automatic repair: start Harbor. The subshell keeps the directory change
    # local to this step, and "&&" prevents compose from running in the wrong
    # directory when HARBOR_DIR is missing.
    log "[Harbor] 執行自動修復: 啟動 Harbor"
    ( cd "${HARBOR_DIR}" && docker compose up -d ) \
      || log "[Harbor] 自動修復指令執行失敗 (目錄: ${HARBOR_DIR})"
    sleep 60

    # Verify the repair.
    http_code="$(_harbor_http_code)"
    if [[ "${http_code}" == "200" ]]; then
      send_telegram "🟢 <b>Harbor Registry 已恢復</b>
動作: Docker Compose 啟動成功
時間: $(date '+%Y-%m-%d %H:%M:%S')
⚠️ K8s Pods 可能需要重啟:
<code>sudo kubectl rollout restart deployment/momo-app deployment/momo-scheduler -n momo</code>"
      clear_alert "harbor"
    else
      send_telegram "🔴 <b>Harbor 啟動失敗</b>
需要: 人工介入處理
SSH: ssh wooo@192.168.0.110
目錄: ${HARBOR_DIR}"
    fi
  else
    # The container is up but the health endpoint is not: there is no safe
    # automatic repair here, so at least make the failure visible instead of
    # returning silently.
    log "[Harbor] 診斷: harbor-core 容器運行中, 但健康檢查失敗"
    if should_alert "harbor" 1800; then
      send_telegram "🔴 <b>Harbor 健康檢查失敗</b>
症狀: harbor-core 容器運行中, 但健康端點回應 HTTP ${http_code}
影響: K8s Pods 可能無法拉取映像
需要: 人工介入處理
SSH: ssh wooo@192.168.0.110
時間: $(date '+%Y-%m-%d %H:%M:%S')"
      record_alert "harbor"
    fi
  fi
  return 1
}
# =============================================================================
# Check 3: Google Drive authentication
# =============================================================================
# Call the application's Drive connection test endpoint and alert when it does
# not report success. There is no automatic repair for this check; the alert
# carries the manual re-authentication steps.
# Globals: CURL_TIMEOUT, GOOGLE_DRIVE_URL (read)
# Returns: 0 when the response contains "success": true, 1 otherwise
check_google_drive() {
  log "[Google Drive] 開始檢查..."

  # Exercise the Google Drive connection through the application API.
  # --max-time bounds the whole request; any curl failure is mapped to an
  # explicit failure document.
  local response
  response="$(curl -s \
    --connect-timeout "${CURL_TIMEOUT}" --max-time "$((CURL_TIMEOUT * 4))" \
    -X POST "${GOOGLE_DRIVE_URL}" \
    -H "Content-Type: application/json" \
    -d '{}' 2>/dev/null)" || response='{"success":false}'

  # Accept optional whitespace around the colon so that a pretty-printed
  # response ("success": true) is not mistaken for an authentication failure.
  if grep -Eq '"success"[[:space:]]*:[[:space:]]*true' <<< "${response}"; then
    log "[Google Drive] ✅ 認證正常"
    clear_alert "google_drive"
    return 0
  fi
  log "[Google Drive] ❌ 認證失敗"

  # Drive authentication cannot be repaired automatically: alert only.
  if should_alert "google_drive" 3600; then
    send_telegram "🔴 <b>Google Drive 認證失敗</b>
症狀: OAuth Token 已過期或被撤銷
影響: 當日業績自動匯入功能停止
<b>需要人工處理:</b>
1. 在本機重新認證:
<code>cd /Users/ogt/momo-pro-system
rm -f config/google_token.pickle
python3 -c \"from services.google_drive_service import drive_service; drive_service.authenticate()\"</code>
2. 更新 Docker 容器:
<code>scp config/google_*.* wooo@192.168.0.110:/tmp/
ssh wooo@192.168.0.110 'docker cp /tmp/google_credentials.json momo-pro-system:/app/config/ && docker cp /tmp/google_token.pickle momo-pro-system:/app/config/ && docker restart momo-pro-system momo-scheduler'</code>
時間: $(date '+%Y-%m-%d %H:%M:%S')"
    record_alert "google_drive"
  fi
  return 1
}
# =============================================================================
# Entry point
# =============================================================================
# Run every health check in order and log an overall summary line. A failing
# check never prevents the remaining checks from running.
main() {
  log "========== 開始系統健康檢查 =========="

  local failed_checks=0
  local probe
  for probe in check_docker_network check_harbor check_google_drive; do
    if ! "${probe}"; then
      failed_checks=$((failed_checks + 1))
    fi
  done

  if (( failed_checks > 0 )); then
    log "========== 部分服務異常,已嘗試修復或發送告警 =========="
  else
    log "========== 所有服務正常 =========="
  fi
}
# Run the entry point, forwarding all command-line arguments.
main "$@"