#!/bin/bash
# =============================================================================
# Ollama / Open WebUI health monitoring script
#
# Features:
#   1. Monitor the Open WebUI service (port 3000 / 80)
#   2. Monitor the Ollama API service (port 11434)
#   3. Detect 502 Bad Gateway responses and repair them automatically
#   4. Restart the Docker daemon automatically when Docker networking fails
#
# Usage:
#   Run every 5 minutes (via cron):
#   */5 * * * * /home/ollama/scripts/ollama_health_monitor.sh >> /var/log/ollama_health_monitor.log 2>&1
#
# Author: Claude Code
# Date:   2026-01-28
# =============================================================================

# Abort on unhandled failures of top-level commands. Note that main() invokes
# every check as `check_x || all_healthy=false`, and bash ignores errexit
# inside a function called on the left-hand side of `||`, so the check
# functions have to handle their own errors.
set -e

# =============================================================================
# Configuration
# =============================================================================

# Service endpoints
readonly OPEN_WEBUI_INTERNAL="http://127.0.0.1:3000"
readonly OPEN_WEBUI_EXTERNAL="http://192.168.0.188"
readonly OLLAMA_API="http://127.0.0.1:11434/api/tags"

# Container name
readonly OPEN_WEBUI_CONTAINER="open-webui"

# Telegram settings. Values exported in the environment take precedence, so
# the real bot token does not have to be stored in this file.
TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN:-<TELEGRAM_BOT_TOKEN>}"
TELEGRAM_CHAT_ID="${TELEGRAM_CHAT_ID:-5619078117}"
readonly TELEGRAM_BOT_TOKEN TELEGRAM_CHAT_ID

# Timeout (seconds)
readonly CURL_TIMEOUT=10

# Directory holding one state file per alert key (alert cooldown bookkeeping).
# NOTE(review): the default lives in world-writable /tmp under a predictable
# name; point STATE_DIR at a private directory if other users share this host.
STATE_DIR="${STATE_DIR:-/tmp/ollama_monitor_state}"
readonly STATE_DIR
mkdir -p -- "${STATE_DIR}"

# =============================================================================
# Function definitions
# =============================================================================

#######################################
# Print a timestamped log line.
# Arguments: $1 - message text
# Outputs:   "[YYYY-mm-dd HH:MM:SS] <message>" on stdout
#######################################
log() {
  # printf with a fixed format prints the message verbatim, whatever it holds.
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "${1-}"
}

#######################################
# Send a message through the Telegram Bot API (best effort).
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID, CURL_TIMEOUT (read)
# Arguments: $1 - message text, sent with parse_mode=HTML
# Returns:   always 0; delivery failures are deliberately ignored so that a
#            notification problem never aborts a health check.
#######################################
send_telegram() {
  local message="$1"
  local timeout="${CURL_TIMEOUT:-10}"

  # --data-urlencode keeps characters such as '&', '+', '%' and newlines
  # intact; plain -d would send them raw and corrupt or truncate the text.
  # The timeouts keep an unreachable Telegram API from stalling the monitor.
  curl -s -X POST \
    --connect-timeout "${timeout}" \
    --max-time "$((timeout * 3))" \
    "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
    --data-urlencode "text=${message}" \
    --data-urlencode "parse_mode=HTML" \
    > /dev/null 2>&1 || true
}

#######################################
# Decide whether an alert for a key may be sent now (cooldown check).
# Globals:   STATE_DIR (read)
# Arguments: $1 - alert key (name of the state file inside STATE_DIR)
#            $2 - cooldown in seconds (default 1800 = 30 minutes)
# Returns:   0 if an alert may be sent, 1 if the previous alert for this key
#            is still within the cooldown window
#######################################
should_alert() {
  local key="$1"
  local cooldown="${2:-1800}"
  local state_file="${STATE_DIR}/${key}"
  local last_alert now

  # No state file: nothing was alerted for this key yet.
  [[ -f "${state_file}" ]] || return 0

  last_alert="$(cat -- "${state_file}" 2> /dev/null)" || last_alert=""

  # An empty, truncated or corrupted state file must not break the arithmetic
  # below; treat it as "never alerted" so the alert goes out.
  [[ "${last_alert}" =~ ^[0-9]+$ ]] || return 0

  now="$(date +%s)"
  # 10# forces base 10 so a value with leading zeros is not read as octal.
  if ((now - 10#${last_alert} < cooldown)); then
    return 1
  fi
  return 0
}

#######################################
# Record "now" (epoch seconds) as the time of the last alert for a key.
# Globals:   STATE_DIR (read)
# Arguments: $1 - alert key (name of the state file inside STATE_DIR)
# Returns:   non-zero if the state file cannot be written
#######################################
record_alert() {
  local key="$1"

  # Recreate the directory in case it was removed (e.g. by a /tmp cleaner)
  # after the script started.
  mkdir -p -- "${STATE_DIR}"
  date +%s > "${STATE_DIR}/${key}"
}

#######################################
# Forget the last alert time for a key so the next failure alerts at once.
# Globals:   STATE_DIR (read)
# Arguments: $1 - alert key (name of the state file inside STATE_DIR)
# Returns:   0 also when no state file exists (rm -f)
#######################################
clear_alert() {
  local key="$1"
  # `--` keeps a key that starts with '-' from being parsed as an option.
  rm -f -- "${STATE_DIR}/${key}"
}

# =============================================================================
# Check 1: Ollama API
# =============================================================================

#######################################
# Probe the Ollama API; when it is unhealthy and no "ollama" process exists,
# restart Ollama and probe again.
# Globals:   OLLAMA_API, CURL_TIMEOUT (read)
# Calls:     log, send_telegram, should_alert, record_alert, clear_alert
# Returns:   0 if the API answers HTTP 200 (at once or after the restart),
#            1 otherwise
#######################################
check_ollama_api() {
  log "[Ollama API] 開始檢查..."

  local http_code
  # --max-time bounds the whole request: --connect-timeout alone lets a server
  # that accepts the connection but never answers block the script forever.
  local -a probe=(
    curl -s -o /dev/null -w "%{http_code}"
    --connect-timeout "${CURL_TIMEOUT}"
    --max-time "$((CURL_TIMEOUT * 3))"
    "${OLLAMA_API}"
  )

  # curl already prints "000" when the request fails, so the fallback must
  # replace the value; appending another "000" would produce "000000".
  http_code="$("${probe[@]}" 2> /dev/null)" || http_code="000"

  if [[ "${http_code}" == "200" ]]; then
    log "[Ollama API] ✅ 正常 (HTTP ${http_code})"
    clear_alert "ollama_api"
    return 0
  fi

  log "[Ollama API] ❌ 異常 (HTTP ${http_code})"

  if pgrep -x "ollama" > /dev/null 2>&1; then
    # The process is alive but the API is unhealthy. No automatic repair is
    # attempted for this case, so at least notify instead of failing silently.
    log "[Ollama API] 診斷: Ollama 進程運行中但 API 無正常回應"
    if should_alert "ollama_api" 1800; then
      send_telegram "🔴 <b>Ollama API 異常</b>

症狀: Ollama 進程運行中,但 API 返回 ${http_code}
需要: 人工介入處理
時間: $(date '+%Y-%m-%d %H:%M:%S')"
      record_alert "ollama_api"
    fi
    return 1
  fi

  log "[Ollama API] 診斷: Ollama 進程未運行"

  if should_alert "ollama_api" 1800; then
    send_telegram "🔴 <b>Ollama API 服務停止</b>

症狀: Ollama 進程未運行
影響: AI 功能無法使用

<b>嘗試自動修復...</b>
時間: $(date '+%Y-%m-%d %H:%M:%S')"
    record_alert "ollama_api"
  fi

  # Try to restart Ollama. Only the fallback server is put in the background:
  # written as `A || B &`, bash backgrounds the whole list, not just B.
  # NOTE(review): unlike the docker/nginx restarts this call has no sudo --
  # confirm which privileges the monitoring user has.
  log "[Ollama API] 執行自動修復: 重啟 Ollama"
  if ! systemctl restart ollama 2> /dev/null; then
    ollama serve &
  fi
  sleep 30

  # Verify the repair.
  http_code="$("${probe[@]}" 2> /dev/null)" || http_code="000"
  if [[ "${http_code}" == "200" ]]; then
    send_telegram "🟢 <b>Ollama API 已恢復</b>

動作: Ollama 服務重啟成功
時間: $(date '+%Y-%m-%d %H:%M:%S')"
    clear_alert "ollama_api"
    return 0
  fi

  return 1
}

# =============================================================================
# Check 2: Open WebUI
# =============================================================================

#######################################
# Print the HTTP status code of a URL, or "000" when the request fails.
# Arguments: $1 - URL
#            $2 - connect timeout in seconds (total time is capped at 3x)
# Outputs:   the status code on stdout, without a trailing newline
#######################################
_open_webui_http_code() {
  local url="$1"
  local connect_timeout="$2"
  local code

  # curl itself prints "000" on failure, so the fallback replaces the value
  # instead of appending to it (which would yield "000000").
  code="$(curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout "${connect_timeout}" \
    --max-time "$((connect_timeout * 3))" \
    "${url}" 2> /dev/null)" || code="000"
  printf '%s' "${code}"
}

#######################################
# Probe Open WebUI through its external URL and, on failure, diagnose and
# repair: restart the Docker daemon when the container answers internally
# but the published port does not, otherwise restart the container.
# Globals:   OPEN_WEBUI_EXTERNAL, OPEN_WEBUI_INTERNAL, OPEN_WEBUI_CONTAINER,
#            CURL_TIMEOUT (read)
# Calls:     log, send_telegram, should_alert, record_alert, clear_alert
# Returns:   0 if the external URL answers HTTP 200 (at once or after a
#            repair), 1 otherwise
#######################################
check_open_webui() {
  log "[Open WebUI] 開始檢查..."

  local external_code internal_code port_check container_status
  local alert_title symptom

  # External access (through nginx).
  external_code="$(_open_webui_http_code "${OPEN_WEBUI_EXTERNAL}" "${CURL_TIMEOUT}")"
  if [[ "${external_code}" == "200" ]]; then
    log "[Open WebUI] ✅ 外部訪問正常 (HTTP ${external_code})"
    clear_alert "open_webui"
    return 0
  fi

  log "[Open WebUI] ❌ 外部訪問異常 (HTTP ${external_code})"

  # Diagnose: ask the service from inside the container.
  internal_code="$(docker exec "${OPEN_WEBUI_CONTAINER}" \
    curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout 5 --max-time 15 \
    http://localhost:8080/ 2> /dev/null)" || internal_code="000"

  if [[ "${internal_code}" == "200" ]]; then
    # Healthy inside the container but unreachable from outside.
    log "[Open WebUI] 診斷: 容器內部正常 (${internal_code}),Docker 網路轉發失效"

    # Is the published port 3000 reachable on the host?
    port_check="$(_open_webui_http_code "${OPEN_WEBUI_INTERNAL}" 5)"

    if [[ "${port_check}" == "000" || "${port_check}" == "502" ]]; then
      log "[Open WebUI] 診斷: Port 3000 不可達,Docker 端口映射失效"

      if should_alert "open_webui" 1800; then
        send_telegram "🔴 <b>Open WebUI Docker 網路失效</b>

症狀: ${OPEN_WEBUI_EXTERNAL} 返回 ${external_code}
診斷: 容器內部正常,端口映射失效
動作: 正在重啟 Docker daemon...
時間: $(date '+%Y-%m-%d %H:%M:%S')"
        record_alert "open_webui"
      fi

      # Automatic repair: restart the Docker daemon.
      log "[Open WebUI] 執行自動修復: 重啟 Docker daemon"
      sudo systemctl restart docker \
        || log "[Open WebUI] Docker daemon 重啟失敗"
      sleep 60

      # Verify the repair.
      external_code="$(_open_webui_http_code "${OPEN_WEBUI_EXTERNAL}" "${CURL_TIMEOUT}")"
      if [[ "${external_code}" == "200" ]]; then
        send_telegram "🟢 <b>Open WebUI 已恢復</b>

動作: Docker daemon 重啟成功
時間: $(date '+%Y-%m-%d %H:%M:%S')"
        clear_alert "open_webui"
        return 0
      fi

      send_telegram "🔴 <b>Open WebUI 修復失敗</b>

狀態: 重啟後服務仍未恢復 (HTTP ${external_code})
需要: 人工介入處理
SSH: ssh ollama@192.168.0.188"
    else
      # Container and published port both answer, so the failure sits in
      # front of them. Nothing here can repair that; notify instead of
      # returning silently.
      log "[Open WebUI] 診斷: Port 3000 正常 (${port_check}),問題可能在 nginx 反向代理"
      if should_alert "open_webui" 1800; then
        send_telegram "🔴 <b>Open WebUI 外部訪問異常</b>

症狀: ${OPEN_WEBUI_EXTERNAL} 返回 ${external_code}
診斷: 容器與 Port 3000 正常,請檢查 nginx 反向代理
需要: 人工介入處理
時間: $(date '+%Y-%m-%d %H:%M:%S')"
        record_alert "open_webui"
      fi
    fi
  else
    # The service does not answer inside the container either.
    log "[Open WebUI] 診斷: 容器內部異常 (${internal_code})"

    container_status="$(docker inspect -f '{{.State.Status}}' \
      "${OPEN_WEBUI_CONTAINER}" 2> /dev/null)" || container_status="not_found"

    # Both cases end in a container restart; only the wording differs.
    if [[ "${container_status}" != "running" ]]; then
      log "[Open WebUI] 診斷: 容器狀態 = ${container_status}"
      alert_title="Open WebUI 容器異常"
      symptom="容器狀態 = ${container_status}"
    else
      log "[Open WebUI] 診斷: 容器運行中但服務卡住"
      alert_title="Open WebUI 服務卡住"
      symptom="容器運行中但不響應請求"
    fi

    if should_alert "open_webui" 1800; then
      send_telegram "🔴 <b>${alert_title}</b>

症狀: ${symptom}
動作: 正在重啟容器...
時間: $(date '+%Y-%m-%d %H:%M:%S')"
      record_alert "open_webui"
    fi

    docker restart "${OPEN_WEBUI_CONTAINER}" \
      || log "[Open WebUI] 容器重啟失敗"
    sleep 60

    # Verify the repair.
    external_code="$(_open_webui_http_code "${OPEN_WEBUI_EXTERNAL}" "${CURL_TIMEOUT}")"
    if [[ "${external_code}" == "200" ]]; then
      send_telegram "🟢 <b>Open WebUI 已恢復</b>

動作: 容器重啟成功
時間: $(date '+%Y-%m-%d %H:%M:%S')"
      clear_alert "open_webui"
      return 0
    fi
  fi

  return 1
}

# =============================================================================
# Check 3: nginx proxy
# =============================================================================

#######################################
# Make sure an nginx process exists; restart nginx when it does not.
# Calls:   log, send_telegram, should_alert, record_alert, clear_alert
# Returns: 0 if nginx is running (at once or after the restart), 1 otherwise
#######################################
check_nginx() {
  log "[Nginx] 開始檢查..."

  if pgrep -x "nginx" > /dev/null 2>&1; then
    log "[Nginx] ✅ 正常運行"
    clear_alert "nginx"
    return 0
  fi

  log "[Nginx] ❌ nginx 未運行"

  if should_alert "nginx" 1800; then
    send_telegram "🔴 <b>Ollama 伺服器 Nginx 停止</b>

症狀: nginx 進程未運行
動作: 正在重啟 nginx...
時間: $(date '+%Y-%m-%d %H:%M:%S')"
    record_alert "nginx"
  fi

  # A failed restart is logged rather than left to errexit, which bash
  # ignores anyway while this function runs on the left of `||` in main().
  sudo systemctl restart nginx || log "[Nginx] nginx 重啟失敗"
  sleep 5

  # Verify the repair.
  if pgrep -x "nginx" > /dev/null 2>&1; then
    send_telegram "🟢 <b>Nginx 已恢復</b>

動作: nginx 重啟成功
時間: $(date '+%Y-%m-%d %H:%M:%S')"
    clear_alert "nginx"
    return 0
  fi

  return 1
}

# =============================================================================
# Main program
# =============================================================================

#######################################
# Run every health check and log an overall summary.
# Calls:   log, check_nginx, check_ollama_api, check_open_webui
# Returns: 0 (a failed check is reflected in the summary log line only)
#######################################
main() {
  log "========== 開始 Ollama 健康檢查 =========="

  local all_healthy=true
  local check

  # Every check runs even when an earlier one failed; `||` records the
  # failure without stopping the loop.
  for check in check_nginx check_ollama_api check_open_webui; do
    "${check}" || all_healthy=false
  done

  if [[ "${all_healthy}" == true ]]; then
    log "========== 所有服務正常 =========="
  else
    log "========== 部分服務異常,已嘗試修復或發送告警 =========="
  fi
}

# Run the main program, forwarding the script's command-line arguments.
main "$@"