#!/bin/bash
# K8s health monitoring script (with automatic repair).
# Deployed at: 192.168.0.110:/home/wooo/scripts/k8s_health_monitor.sh
# Cron: */5 * * * * /home/wooo/scripts/k8s_health_monitor.sh >> /var/log/k8s_health_monitor.log 2>&1

# Abort on the first unhandled command failure. Bash suspends errexit inside
# `if`/`while` conditions and `&&`/`||` lists, so helpers invoked from an `if`
# run to completion even when one of their commands fails.
set -e

# ===== Configuration =====
# Every setting keeps its previous value as the default and can be overridden
# from the environment, so credentials and endpoints do not have to be edited
# into this file.
TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN:-<TELEGRAM_BOT_TOKEN>}"
TELEGRAM_CHAT_ID="${TELEGRAM_CHAT_ID:-5619078117}"
# Namespace that holds the momo-app, momo-postgres and momo-scheduler workloads.
MOMO_NAMESPACE="${MOMO_NAMESPACE:-momo}"
# URL polled by the HTTP health check; anything other than HTTP 200 is a failure.
APP_HEALTH_URL="${APP_HEALTH_URL:-https://mo.wooo.work/health}"
# NOTE(review): LOG_FILE and STATE_FILE are not referenced by any code visible
# in this script (logging goes to stdout and is redirected by cron) - confirm
# before relying on them.
LOG_FILE="${LOG_FILE:-/var/log/k8s_health_monitor.log}"
STATE_FILE="${STATE_FILE:-/tmp/k8s_monitor_state.json}"

# ===== Logging =====
#######################################
# Print one timestamped log line to stdout.
# Arguments: $1 - message text (printed verbatim; backslashes are not expanded)
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout
#######################################
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1"
}

# ===== Telegram notification =====
#######################################
# Send a message to the configured Telegram chat (HTML parse mode).
# Notification is best-effort: a delivery failure is logged and the function
# still returns 0, so that under `set -e` an unreachable Telegram API cannot
# abort the monitor before it attempts any repair.
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID (read)
# Arguments: $1 - message text (may span several lines)
# Returns:   always 0
#######################################
send_telegram() {
  local message="$1"

  # --data-urlencode keeps '&', '+', '%' and newlines in the message intact;
  # plain -d would send them unescaped and corrupt the form body.
  # --max-time bounds the call so a hung connection cannot stall the cron run.
  if ! curl -s --max-time 10 -X POST \
    "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
    --data-urlencode "text=${message}" \
    --data-urlencode "parse_mode=HTML" > /dev/null 2>&1; then
    log "WARNING: Telegram 通知發送失敗"
  fi
  return 0
}

# ===== kubectl command (run through sudo) =====
# Stored as a plain string and expanded UNQUOTED at every call site so that
# "sudo kubectl" splits into two words. May be overridden from the environment.
KUBECTL="${KUBECTL:-sudo kubectl}"

# ===== Verify kubectl access =====
#######################################
# Probe whether kubectl can list pods in the application namespace.
# Globals:   KUBECTL, MOMO_NAMESPACE (read)
# Outputs:   logs an error line when the probe fails
# Returns:   0 when the probe succeeds, 1 otherwise
#######################################
check_kubectl() {
  # KUBECTL is deliberately unquoted so "sudo kubectl" word-splits.
  ${KUBECTL} get pods -n ${MOMO_NAMESPACE} > /dev/null 2>&1 && return 0

  log "ERROR: kubectl 權限不足"
  return 1
}

# ===== Check CoreDNS status =====
#######################################
# Inspect the first pod matching k8s-app=kube-dns in kube-system and report
# whether it is Running with its first container ready.
# Only item [0] of the pod list is examined.
# Globals:   KUBECTL (read)
# Outputs:   logs a warning with the observed phase/readiness on failure
# Returns:   0 when phase is "Running" and ready is "true", 1 otherwise
#######################################
check_coredns() {
  local phase ready

  # kubectl errors are silenced and tolerated; an empty value simply fails
  # the comparison below.
  phase=$(${KUBECTL} get pods -n kube-system -l k8s-app=kube-dns -o jsonpath='{.items[0].status.phase}' 2>/dev/null) || true
  ready=$(${KUBECTL} get pods -n kube-system -l k8s-app=kube-dns -o jsonpath='{.items[0].status.containerStatuses[0].ready}' 2>/dev/null) || true

  if [[ "$phase" == "Running" && "$ready" == "true" ]]; then
    return 0
  fi

  log "WARNING: CoreDNS 異常 - Status: $phase, Ready: $ready"
  return 1
}

# ===== Repair CoreDNS =====
#######################################
# Restart the coredns Deployment in kube-system, wait 30 seconds, then
# re-run check_coredns to verify the result.
# Globals:   KUBECTL (read)
# Outputs:   progress/result log lines; a Telegram message on success
# Returns:   0 when CoreDNS is healthy after the restart, 1 otherwise
#######################################
repair_coredns() {
  log "正在重啟 CoreDNS..."
  ${KUBECTL} rollout restart deployment coredns -n kube-system
  # Fixed grace period for the new pods to come up before re-checking.
  sleep 30

  if ! check_coredns; then
    log "CoreDNS 修復失敗"
    return 1
  fi

  log "CoreDNS 修復成功"
  local notice="🟢 <b>K8s CoreDNS 已修復</b>

動作: 重啟 CoreDNS Deployment
時間: $(date '+%Y-%m-%d %H:%M:%S')"
  send_telegram "$notice"
  return 0
}

# ===== Check momo-app pod status =====
#######################################
# Inspect every pod labelled app=momo-app in the application namespace.
# The check fails when no pod is Running+Ready, which includes the case where
# no pod exists at all or the pod list could not be read.
# Globals:   KUBECTL, MOMO_NAMESPACE (read)
# Outputs:   stdout - on failure only, one entry per unhealthy pod; each entry
#                     ends with a literal backslash-n sequence (not a newline),
#                     the form the caller appends to its summary string
#            stderr - per-pod log lines. They are kept off stdout because the
#                     caller captures stdout with $(...); cron's 2>&1 still
#                     delivers them to the log file.
# Returns:   0 when at least one pod is Running and Ready, 1 otherwise
#######################################
check_momo_pods() {
  local pod_info
  # One "name|phase|ready|restartCount" record per pod; a kubectl failure is
  # treated the same as an empty list.
  pod_info=$(${KUBECTL} get pods -n "${MOMO_NAMESPACE}" -l app=momo-app -o jsonpath='{range .items[*]}{.metadata.name}|{.status.phase}|{.status.containerStatuses[0].ready}|{.status.containerStatuses[0].restartCount}{"\n"}{end}' 2>/dev/null) || pod_info=""

  local issues=""
  local has_healthy=false
  local pod_count=0
  local name phase ready restarts

  while IFS='|' read -r name phase ready restarts; do
    [[ -z "$name" ]] && continue
    pod_count=$((pod_count + 1))

    if [[ "$phase" == "Running" && "$ready" == "true" ]]; then
      has_healthy=true
      log "Pod $name: Running, Ready" >&2
    else
      issues="${issues}Pod ${name}: Phase=${phase}, Ready=${ready}, Restarts=${restarts}\n"
      log "WARNING: Pod $name 異常 - Phase: $phase, Ready: $ready, Restarts: $restarts" >&2
    fi
  done <<< "$pod_info"

  # An empty pod list must not be reported as healthy.
  if ((pod_count == 0)); then
    log "WARNING: 找不到 momo-app Pod" >&2
    printf '%s\n' '找不到 momo-app Pod\n'
    return 1
  fi

  if [[ -n "$issues" && "$has_healthy" == "false" ]]; then
    printf '%s\n' "$issues"
    return 1
  fi
  return 0
}

# ===== Check the momo-app health endpoint =====
#######################################
# Request APP_HEALTH_URL (10-second limit) and compare the HTTP status code.
# Globals:   APP_HEALTH_URL (read)
# Outputs:   logs a warning containing the observed status code on failure
# Returns:   0 when the endpoint answers HTTP 200, 1 otherwise
#######################################
check_app_health() {
  local status
  # Only the status code is kept; the response body is discarded.
  status=$(curl -s -o /dev/null -w '%{http_code}' --max-time 10 "${APP_HEALTH_URL}" 2>/dev/null) || true

  [[ "$status" == "200" ]] && return 0

  log "WARNING: App 健康檢查失敗 - HTTP $status"
  return 1
}

# ===== Repair momo-app =====
#######################################
# Run one escalation step of the momo-app repair procedure, then verify the
# result with check_app_health.
#   step 1 - rolling restart of the momo-app Deployment, wait 60s
#   step 2 - check (and if needed repair) CoreDNS, force-delete the momo-app
#            pods, wait 60s
#   step 3 - scale the Deployment to 0 and back, wait 90s. The Deployment is
#            restored to the replica count it had before the step; when that
#            count cannot be read or is 0, it falls back to 1.
# Globals:   KUBECTL, MOMO_NAMESPACE (read)
# Arguments: $1 - repair step number (1, 2 or 3)
# Returns:   0 when the health check passes after the step;
#            1 when it still fails or the step number is unknown
#######################################
repair_momo_app() {
  local repair_step="$1"
  local replicas

  case "$repair_step" in
    1)
      log "修復步驟 1: 重啟 momo-app Pod..."
      ${KUBECTL} rollout restart deployment momo-app -n "${MOMO_NAMESPACE}"
      sleep 60
      ;;
    2)
      log "修復步驟 2: 檢查並修復 CoreDNS..."
      if ! check_coredns; then
        repair_coredns
      fi
      # Then recreate the pods immediately, skipping graceful termination.
      ${KUBECTL} delete pods -l app=momo-app -n "${MOMO_NAMESPACE}" --force --grace-period=0 2>/dev/null
      sleep 60
      ;;
    3)
      log "修復步驟 3: 強制重建 Deployment..."
      # Remember the configured replica count so that scaling back up does
      # not silently shrink a multi-replica Deployment to a single pod.
      replicas=$(${KUBECTL} get deployment momo-app -n "${MOMO_NAMESPACE}" -o jsonpath='{.spec.replicas}' 2>/dev/null) || replicas=""
      [[ "$replicas" =~ ^[1-9][0-9]*$ ]] || replicas=1

      ${KUBECTL} scale deployment momo-app -n "${MOMO_NAMESPACE}" --replicas=0
      sleep 10
      ${KUBECTL} scale deployment momo-app -n "${MOMO_NAMESPACE}" --replicas="${replicas}"
      sleep 90
      ;;
    *)
      return 1
      ;;
  esac

  # Verify the repair.
  if check_app_health; then
    log "momo-app 修復成功"
    return 0
  fi
  return 1
}

# ===== Main program =====
#######################################
# Run one monitoring pass: verify kubectl access, then check CoreDNS, the
# momo-app pods, the app health endpoint, PostgreSQL and the scheduler,
# attempting automatic repair where one is defined and notifying via Telegram.
# Globals:   KUBECTL, MOMO_NAMESPACE, APP_HEALTH_URL (read)
# Outputs:   log lines on stdout; Telegram notifications
# Returns:   0 after a completed pass; exits the script with status 1 when
#            kubectl cannot be used
#######################################
main() {
  log "========== K8s 健康檢查開始 =========="

  # Without kubectl access none of the checks below can run.
  if ! check_kubectl; then
    send_telegram "🔴 <b>K8s 監控錯誤</b>

kubectl 權限不足,無法執行監控
時間: $(date '+%Y-%m-%d %H:%M:%S')"
    exit 1
  fi

  local issues_found=false
  # Summary text; entries are separated by literal "\n" sequences that are
  # expanded once, when the summary is logged at the end.
  local issue_details=""

  # 1. CoreDNS
  if ! check_coredns; then
    issues_found=true
    issue_details="${issue_details}• CoreDNS 異常\n"

    if repair_coredns; then
      issue_details="${issue_details} ✅ CoreDNS 已修復\n"
    else
      issue_details="${issue_details} ❌ CoreDNS 修復失敗\n"
    fi
  fi

  # 2. momo-app pods
  # Declaration and assignment are separate on purpose: `local v=$(cmd)`
  # reports the status of `local` (always 0) and would hide a failed check.
  local pod_issues
  if ! pod_issues=$(check_momo_pods); then
    issues_found=true
    issue_details="${issue_details}• momo-app Pod 異常:\n${pod_issues}"
  fi

  # 3. App health endpoint
  if ! check_app_health; then
    issues_found=true
    issue_details="${issue_details}• App 健康檢查失敗\n"

    send_telegram "🔴 <b>MOMO App 服務異常</b>

健康檢查失敗 (HTTP != 200)
URL: ${APP_HEALTH_URL}

<b>自動修復中...</b>
時間: $(date '+%Y-%m-%d %H:%M:%S')"

    # Escalate through the three repair steps, stopping at the first success.
    local step
    local repaired=false
    for step in 1 2 3; do
      if repair_momo_app "$step"; then
        send_telegram "🟢 <b>MOMO App 已恢復</b>

修復步驟: ${step}
時間: $(date '+%Y-%m-%d %H:%M:%S')"
        issue_details="${issue_details} ✅ 修復成功 (步驟 ${step})\n"
        repaired=true
        break
      fi
    done

    if [[ "$repaired" == "false" ]]; then
      send_telegram "🔴 <b>MOMO App 修復失敗</b>

嘗試了 3 種修復方法均失敗
<b>需要人工介入</b>

SSH: ssh wooo@192.168.0.110
${KUBECTL} get pods -n ${MOMO_NAMESPACE}
${KUBECTL} logs deployment/momo-app -n ${MOMO_NAMESPACE}

時間: $(date '+%Y-%m-%d %H:%M:%S')"
      issue_details="${issue_details} ❌ 所有修復步驟均失敗\n"
    fi
  fi

  # 4. PostgreSQL (alert only, no automatic repair)
  local pg_status
  pg_status=$(${KUBECTL} get pods -n "${MOMO_NAMESPACE}" -l app=momo-postgres -o jsonpath='{.items[0].status.phase}' 2>/dev/null) || true
  if [[ "$pg_status" != "Running" ]]; then
    issues_found=true
    issue_details="${issue_details}• PostgreSQL 異常: ${pg_status}\n"

    send_telegram "🔴 <b>PostgreSQL 異常</b>

狀態: ${pg_status}
需要人工介入
時間: $(date '+%Y-%m-%d %H:%M:%S')"
  fi

  # 5. Scheduler (restarted automatically when not Running)
  local scheduler_status
  scheduler_status=$(${KUBECTL} get pods -n "${MOMO_NAMESPACE}" -l app=momo-scheduler -o jsonpath='{.items[0].status.phase}' 2>/dev/null) || true
  if [[ "$scheduler_status" != "Running" ]]; then
    issues_found=true
    issue_details="${issue_details}• Scheduler 異常: ${scheduler_status}\n"

    # Guarded so that a failed restart is logged instead of tripping `set -e`
    # and skipping the summary below.
    ${KUBECTL} rollout restart deployment momo-scheduler -n "${MOMO_NAMESPACE}" \
      || log "WARNING: Scheduler 重啟失敗"
  fi

  if [[ "$issues_found" == "true" ]]; then
    # %b expands the literal "\n" separators into real newlines.
    log "$(printf '%b' "發現問題:\n${issue_details}")"
  else
    log "所有服務正常"
  fi

  log "========== K8s 健康檢查完成 =========="
}

# Run the main program, forwarding any command-line arguments.
main "$@"