Files
ewoooc/scripts/ollama_health_monitor.sh
OoO d6d8777e41
All checks were successful
CD Pipeline / deploy (push) Successful in 1m12s
V10.601 收斂 Gemini 與密鑰治理
2026-06-06 14:52:46 +08:00

325 lines
9.9 KiB
Bash
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# =============================================================================
# Ollama / Open WebUI health monitoring script
#
# Features:
# 1. Monitor the Open WebUI service (port 3000 / 80)
# 2. Monitor the Ollama API service (port 11434)
# 3. Detect 502 Bad Gateway responses and repair automatically
# 4. Restart the Docker daemon automatically when Docker networking fails
#
# Usage:
# Run every 5 minutes (via cron):
# */5 * * * * /home/ollama/scripts/ollama_health_monitor.sh >> /var/log/ollama_health_monitor.log 2>&1
#
# Author: Claude Code
# Date: 2026-01-28
# =============================================================================
# Abort the script when a top-level command fails without being handled.
set -e
# =============================================================================
# Configuration
#
# Every setting below can be overridden from the environment (for example in
# the crontab entry); the value after ":-" is the default used otherwise.
# =============================================================================
# Service endpoints
OPEN_WEBUI_INTERNAL="${OPEN_WEBUI_INTERNAL:-http://127.0.0.1:3000}"
OPEN_WEBUI_EXTERNAL="${OPEN_WEBUI_EXTERNAL:-http://192.168.0.188}"
OLLAMA_API="${OLLAMA_API:-http://127.0.0.1:11434/api/tags}"
# Container name
OPEN_WEBUI_CONTAINER="${OPEN_WEBUI_CONTAINER:-open-webui}"
# Telegram settings.
# Prefer supplying TELEGRAM_BOT_TOKEN through the environment so the secret
# does not have to live in this file.
TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN:-<TELEGRAM_BOT_TOKEN>}"
TELEGRAM_CHAT_ID="${TELEGRAM_CHAT_ID:-5619078117}"
# Timeout (seconds)
CURL_TIMEOUT="${CURL_TIMEOUT:-10}"
# Directory holding one timestamp file per alert key.
# NOTE(review): the default lives under /tmp with a predictable name; point
# STATE_DIR at a directory only this user can write to if the host is shared.
STATE_DIR="${STATE_DIR:-/tmp/ollama_monitor_state}"
mkdir -p "${STATE_DIR}"
# =============================================================================
# 函數定義
# =============================================================================
# Write one log line to stdout: "[YYYY-MM-DD HH:MM:SS] <message>".
# Arguments: $1 - message text (printed verbatim)
log() {
  local timestamp
  timestamp="$(date '+%Y-%m-%d %H:%M:%S')"
  printf '[%s] %s\n' "${timestamp}" "$1"
}
# Send a message to the configured Telegram chat (best effort).
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID, CURL_TIMEOUT (read)
# Arguments: $1 - message text, sent with parse_mode=HTML
# Returns:   always 0; a failed delivery must never break the health checks
send_telegram() {
  local message="$1"
  # --data-urlencode (rather than -d) keeps characters such as "&", "+", "%"
  # and "=" inside the message from corrupting the form body.
  # The timeouts stop a stalled Telegram request from blocking the monitor run.
  curl -s -X POST \
    --connect-timeout "${CURL_TIMEOUT:-10}" \
    --max-time 30 \
    "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
    --data-urlencode "text=${message}" \
    --data-urlencode "parse_mode=HTML" \
    > /dev/null 2>&1 || true
}
# Decide whether an alert for KEY may be sent, i.e. it is not in its cooldown.
# Globals:   STATE_DIR (read)
# Arguments: $1 - alert key (file name under STATE_DIR)
#            $2 - cooldown in seconds (default 1800 = 30 minutes)
# Returns:   0 if an alert may be sent, 1 if the last one is too recent
should_alert() {
  local key="$1"
  local cooldown="${2:-1800}"
  local state_file="${STATE_DIR}/${key}"
  local last_alert now

  # No recorded alert: nothing to cool down from.
  [[ -f "${state_file}" ]] || return 0

  last_alert=$(cat -- "${state_file}" 2>/dev/null) || last_alert=""

  # The file content ends up in an arithmetic expression, so accept plain
  # decimal digits only. An empty or corrupted file counts as "no previous
  # alert" instead of raising an arithmetic error that aborts the run.
  [[ "${last_alert}" =~ ^[0-9]+$ ]] || return 0
  [[ "${cooldown}" =~ ^[0-9]+$ ]] || cooldown=1800

  now=$(date +%s)
  # 10# forces base 10 so a value with leading zeros is not read as octal.
  if (( now - 10#${last_alert} < 10#${cooldown} )); then
    return 1
  fi
  return 0
}
# Remember that an alert for KEY was just sent.
# Globals:   STATE_DIR (read)
# Arguments: $1 - alert key; the current epoch time is written to
#                 ${STATE_DIR}/<key>, replacing any previous value
record_alert() {
  local alert_key="$1"
  local stamp_path="${STATE_DIR}/${alert_key}"
  date +%s > "${stamp_path}"
}
# Forget the recorded alert for KEY so the next failure alerts immediately.
# Globals:   STATE_DIR (read)
# Arguments: $1 - alert key; ${STATE_DIR}/<key> is removed if it exists
clear_alert() {
  local alert_key="$1"
  local stamp_path="${STATE_DIR}/${alert_key}"
  rm -f -- "${stamp_path}"
}
# =============================================================================
# 監控項目 1: Ollama API
# =============================================================================
# Print the HTTP status code returned by URL ($1), or "000" when curl could
# not complete the request.
# Globals: CURL_TIMEOUT (read) - connect timeout in seconds
_ollama_http_code() {
  local url="$1"
  local connect_timeout="${CURL_TIMEOUT:-10}"
  local code
  # curl -w already prints "000" for a failed transfer. Assigning the fallback
  # (instead of echoing a second "000") keeps the result at three digits.
  # --max-time bounds the whole request so a server that accepts the
  # connection but never answers cannot hang the monitor.
  code=$(curl -s -o /dev/null -w '%{http_code}' \
    --connect-timeout "${connect_timeout}" \
    --max-time "$((connect_timeout * 3))" \
    "${url}" 2>/dev/null) || code="000"
  printf '%s' "${code:-000}"
}

# Check the Ollama API and try to restart Ollama when its process is gone.
# Globals: OLLAMA_API, CURL_TIMEOUT (read)
# Returns: 0 if the API answers HTTP 200 (initially or after the restart),
#          1 otherwise
check_ollama_api() {
  log "[Ollama API] 開始檢查..."
  local http_code
  http_code=$(_ollama_http_code "${OLLAMA_API}")
  if [[ "${http_code}" == "200" ]]; then
    log "[Ollama API] ✅ 正常 (HTTP ${http_code})"
    clear_alert "ollama_api"
    return 0
  fi
  log "[Ollama API] ❌ 異常 (HTTP ${http_code})"

  # Automatic repair is attempted only when the Ollama process is not running.
  if pgrep -x "ollama" > /dev/null 2>&1; then
    log "[Ollama API] 診斷: Ollama 進程運行中但 API 無正常回應,未執行自動修復"
    return 1
  fi

  log "[Ollama API] 診斷: Ollama 進程未運行"
  if should_alert "ollama_api" 1800; then
    send_telegram "🔴 <b>Ollama API 服務停止</b>
症狀: Ollama 進程未運行
影響: AI 功能無法使用
<b>嘗試自動修復...</b>
時間: $(date '+%Y-%m-%d %H:%M:%S')"
    record_alert "ollama_api"
  fi

  # Restart through systemd; if that fails, fall back to starting the server
  # directly. The whole attempt runs in the background and the sleep below
  # gives it time to come up.
  log "[Ollama API] 執行自動修復: 重啟 Ollama"
  { systemctl restart ollama 2>/dev/null || ollama serve; } &
  sleep 30

  # Verify the repair.
  http_code=$(_ollama_http_code "${OLLAMA_API}")
  if [[ "${http_code}" == "200" ]]; then
    send_telegram "🟢 <b>Ollama API 已恢復</b>
動作: Ollama 服務重啟成功
時間: $(date '+%Y-%m-%d %H:%M:%S')"
    clear_alert "ollama_api"
    return 0
  fi
  return 1
}
# =============================================================================
# 監控項目 2: Open WebUI
# =============================================================================
# Print the HTTP status code returned by URL ($1), or "000" when curl could
# not complete the request.
# Arguments: $1 - URL
#            $2 - connect timeout in seconds (default: CURL_TIMEOUT, else 10)
_webui_http_code() {
  local url="$1"
  local connect_timeout="${2:-${CURL_TIMEOUT:-10}}"
  local code
  # curl -w already prints "000" for a failed transfer. Assigning the fallback
  # (instead of echoing a second "000") keeps the result at three digits, so
  # comparisons against "000" work.
  # --max-time bounds the whole request so a hung backend cannot stall the run.
  code=$(curl -s -o /dev/null -w '%{http_code}' \
    --connect-timeout "${connect_timeout}" \
    --max-time "$((connect_timeout * 3))" \
    "${url}" 2>/dev/null) || code="000"
  printf '%s' "${code:-000}"
}

# Check Open WebUI through its external URL and attempt a repair on failure.
#   - container answers internally but the published port is dead
#       -> restart the Docker daemon
#   - container does not answer internally
#       -> restart the container
# Globals: OPEN_WEBUI_EXTERNAL, OPEN_WEBUI_INTERNAL, OPEN_WEBUI_CONTAINER,
#          CURL_TIMEOUT (read)
# Returns: 0 if the external URL answers HTTP 200 (initially or after repair),
#          1 otherwise
check_open_webui() {
  log "[Open WebUI] 開始檢查..."

  # External access (through nginx).
  local external_code
  external_code=$(_webui_http_code "${OPEN_WEBUI_EXTERNAL}")
  if [[ "${external_code}" == "200" ]]; then
    log "[Open WebUI] ✅ 外部訪問正常 (HTTP ${external_code})"
    clear_alert "open_webui"
    return 0
  fi
  log "[Open WebUI] ❌ 外部訪問異常 (HTTP ${external_code})"

  # Diagnose from inside the container.
  local internal_code
  internal_code=$(docker exec "${OPEN_WEBUI_CONTAINER}" \
    curl -s -o /dev/null -w '%{http_code}' --connect-timeout 5 --max-time 15 \
    http://localhost:8080/ 2>/dev/null) || internal_code="000"
  internal_code="${internal_code:-000}"

  if [[ "${internal_code}" == "200" ]]; then
    # Healthy inside the container but unreachable from outside:
    # suspect Docker networking.
    log "[Open WebUI] 診斷: 容器內部正常 (${internal_code})Docker 網路轉發失效"

    # Is the published port reachable from the host?
    local port_check
    port_check=$(_webui_http_code "${OPEN_WEBUI_INTERNAL}" 5)
    if [[ "${port_check}" == "000" || "${port_check}" == "502" ]]; then
      log "[Open WebUI] 診斷: Port 3000 不可達Docker 端口映射失效"
      if should_alert "open_webui" 1800; then
        send_telegram "🔴 <b>Open WebUI Docker 網路失效</b>
症狀: ${OPEN_WEBUI_EXTERNAL} 返回 ${external_code}
診斷: 容器內部正常,端口映射失效
動作: 正在重啟 Docker daemon...
時間: $(date '+%Y-%m-%d %H:%M:%S')"
        record_alert "open_webui"
      fi

      # Automatic repair: restart the Docker daemon.
      log "[Open WebUI] 執行自動修復: 重啟 Docker daemon"
      sudo systemctl restart docker
      sleep 60

      # Verify the repair.
      external_code=$(_webui_http_code "${OPEN_WEBUI_EXTERNAL}")
      if [[ "${external_code}" == "200" ]]; then
        send_telegram "🟢 <b>Open WebUI 已恢復</b>
動作: Docker daemon 重啟成功
時間: $(date '+%Y-%m-%d %H:%M:%S')"
        clear_alert "open_webui"
        return 0
      fi
      send_telegram "🔴 <b>Open WebUI 修復失敗</b>
狀態: 重啟後服務仍未恢復 (HTTP ${external_code})
需要: 人工介入處理
SSH: ssh ollama@192.168.0.188"
    else
      # The published port answers, so no automatic repair applies here;
      # leave a trace in the log instead of failing silently.
      log "[Open WebUI] 診斷: Port 3000 可達 (HTTP ${port_check}),問題可能在 nginx 代理層,未執行自動修復"
    fi
    return 1
  fi

  # The container does not answer internally either: container problem.
  log "[Open WebUI] 診斷: 容器內部異常 (${internal_code})"
  local container_status
  container_status=$(docker inspect -f '{{.State.Status}}' \
    "${OPEN_WEBUI_CONTAINER}" 2>/dev/null) || container_status="not_found"

  # Both cases are repaired the same way; only the diagnosis text differs.
  local alert_text
  if [[ "${container_status}" != "running" ]]; then
    log "[Open WebUI] 診斷: 容器狀態 = ${container_status}"
    alert_text="🔴 <b>Open WebUI 容器異常</b>
症狀: 容器狀態 = ${container_status}
動作: 正在重啟容器...
時間: $(date '+%Y-%m-%d %H:%M:%S')"
  else
    # Container is running but the service does not respond.
    log "[Open WebUI] 診斷: 容器運行中但服務卡住"
    alert_text="🔴 <b>Open WebUI 服務卡住</b>
症狀: 容器運行中但不響應請求
動作: 正在重啟容器...
時間: $(date '+%Y-%m-%d %H:%M:%S')"
  fi
  if should_alert "open_webui" 1800; then
    send_telegram "${alert_text}"
    record_alert "open_webui"
  fi

  # Restart the container.
  docker restart "${OPEN_WEBUI_CONTAINER}"
  sleep 60

  # Verify the repair.
  external_code=$(_webui_http_code "${OPEN_WEBUI_EXTERNAL}")
  if [[ "${external_code}" == "200" ]]; then
    send_telegram "🟢 <b>Open WebUI 已恢復</b>
動作: 容器重啟成功
時間: $(date '+%Y-%m-%d %H:%M:%S')"
    clear_alert "open_webui"
    return 0
  fi
  return 1
}
# =============================================================================
# 監控項目 3: 檢查 nginx 代理
# =============================================================================
# Make sure an nginx process exists; restart nginx via systemd when it does not.
# Returns: 0 if nginx is running (initially or after the restart), 1 otherwise
check_nginx() {
  log "[Nginx] 開始檢查..."

  # Healthy path first: a process named exactly "nginx" exists.
  if pgrep -x "nginx" > /dev/null 2>&1; then
    log "[Nginx] ✅ 正常運行"
    clear_alert "nginx"
    return 0
  fi

  log "[Nginx] ❌ nginx 未運行"
  if should_alert "nginx" 1800; then
    send_telegram "🔴 <b>Ollama 伺服器 Nginx 停止</b>
症狀: nginx 進程未運行
動作: 正在重啟 nginx...
時間: $(date '+%Y-%m-%d %H:%M:%S')"
    record_alert "nginx"
  fi

  # Attempt the repair, then give nginx a few seconds to start.
  sudo systemctl restart nginx
  sleep 5

  # Still no nginx process: report failure to the caller.
  if ! pgrep -x "nginx" > /dev/null 2>&1; then
    return 1
  fi

  send_telegram "🟢 <b>Nginx 已恢復</b>
動作: nginx 重啟成功
時間: $(date '+%Y-%m-%d %H:%M:%S')"
  clear_alert "nginx"
  return 0
}
# =============================================================================
# 主程式
# =============================================================================
# Run every health check in order (nginx, Ollama API, Open WebUI) and log a
# one-line summary. A failing check does not stop the remaining checks.
main() {
  log "========== 開始 Ollama 健康檢查 =========="

  local failed_checks=0
  local check_fn
  for check_fn in check_nginx check_ollama_api check_open_webui; do
    "${check_fn}" || failed_checks=$((failed_checks + 1))
  done

  if (( failed_checks == 0 )); then
    log "========== 所有服務正常 =========="
  else
    log "========== 部分服務異常,已嘗試修復或發送告警 =========="
  fi
}
# Entry point: run main, forwarding the script's command-line arguments.
main "$@"