Files
ewoooc/scripts/k8s_health_monitor.sh
OoO d6d8777e41
All checks were successful
CD Pipeline / deploy (push) Successful in 1m12s
V10.601 收斂 Gemini 與密鑰治理
2026-06-06 14:52:46 +08:00

263 lines
7.7 KiB
Bash

#!/bin/bash
# K8s health monitor (with automatic repair).
# Deployed at: 192.168.0.110:/home/wooo/scripts/k8s_health_monitor.sh
# Cron: */5 * * * * /home/wooo/scripts/k8s_health_monitor.sh >> /var/log/k8s_health_monitor.log 2>&1

# Abort on any unhandled command failure. Commands used as a condition
# (if / && / ||) are exempt from this rule.
set -e

# ===== Configuration =====
# The four settings below can be overridden from the environment, so secrets
# and per-cluster values do not have to be edited into the script. When a
# variable is unset or empty, the built-in default is used.
TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN:-<TELEGRAM_BOT_TOKEN>}"
TELEGRAM_CHAT_ID="${TELEGRAM_CHAT_ID:-5619078117}"
MOMO_NAMESPACE="${MOMO_NAMESPACE:-momo}"
APP_HEALTH_URL="${APP_HEALTH_URL:-https://mo.wooo.work/health}"
# NOTE(review): LOG_FILE and STATE_FILE do not appear to be referenced by the
# visible part of this script; kept unchanged in case something else reads them.
LOG_FILE="/var/log/k8s_health_monitor.log"
STATE_FILE="/tmp/k8s_monitor_state.json"
# ===== Colors and logging =====
# Print one message to stdout, prefixed with a "[YYYY-MM-DD HH:MM:SS]" timestamp.
# Arguments: $1 - message text (emitted verbatim; backslash sequences such as
#                 \n are NOT expanded)
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1"
}
# ===== Telegram notification =====
# Send a message to the configured chat through the Telegram Bot API
# sendMessage endpoint, using HTML parse mode.
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID (read)
# Arguments: $1 - message text
# Returns:   always 0; delivery is best-effort
send_telegram() {
  local message="$1"
  # --max-time bounds the request so an unreachable API cannot stall the run.
  # --data-urlencode keeps '&', '+', '%' and newlines in the text intact.
  # A failed send is logged rather than returned: under `set -e` a non-zero
  # status here would abort the caller before it reaches its repair steps.
  if ! curl -s --max-time 15 -X POST \
    "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
    --data-urlencode "text=${message}" \
    -d parse_mode="HTML" > /dev/null 2>&1; then
    log "WARNING: Telegram 通知發送失敗"
  fi
  return 0
}
# ===== kubectl command (run through sudo) =====
# Callers expand this unquoted so it word-splits into "sudo" and "kubectl".
KUBECTL='sudo kubectl'
# ===== kubectl access check =====
# Probe cluster access by listing pods in the application namespace.
# Globals:   KUBECTL, MOMO_NAMESPACE (read)
# Returns:   0 when the listing succeeds; 1 otherwise (an ERROR line is logged)
check_kubectl() {
  # Only the exit status matters; the listing itself is discarded.
  # shellcheck disable=SC2086 — KUBECTL must word-split into command + args
  if ${KUBECTL} get pods -n ${MOMO_NAMESPACE} &> /dev/null; then
    return 0
  fi
  log "ERROR: kubectl 權限不足"
  return 1
}
# ===== CoreDNS status check =====
# Check that the first pod labelled k8s-app=kube-dns in kube-system is in
# phase Running and that its first container reports ready.
# NOTE(review): only the first listed pod is inspected; other CoreDNS replicas
# are not looked at.
# Globals:   KUBECTL (read)
# Returns:   0 when healthy; 1 otherwise (a WARNING line is logged)
check_coredns() {
  local snapshot coredns_status coredns_ready
  # One query returns both fields, so they describe the same pod at the same
  # moment and the API server is asked once instead of twice. A failed query
  # leaves the snapshot empty, which is reported as unhealthy below.
  # shellcheck disable=SC2086 — KUBECTL must word-split into command + args
  snapshot=$(${KUBECTL} get pods -n kube-system -l k8s-app=kube-dns \
    -o jsonpath='{.items[0].status.phase}|{.items[0].status.containerStatuses[0].ready}' \
    2>/dev/null) || true
  IFS='|' read -r coredns_status coredns_ready <<< "$snapshot"

  if [[ "$coredns_status" != "Running" ]] || [[ "$coredns_ready" != "true" ]]; then
    log "WARNING: CoreDNS 異常 - Status: $coredns_status, Ready: $coredns_ready"
    return 1
  fi
  return 0
}
# ===== CoreDNS repair =====
# Restart the coredns Deployment in kube-system, wait 30 seconds, then run
# check_coredns again.
# Globals:   KUBECTL (read)
# Returns:   0 if the re-check passes (a Telegram notice is sent first);
#            1 if CoreDNS is still unhealthy
repair_coredns() {
  local notice

  log "正在重啟 CoreDNS..."
  # shellcheck disable=SC2086 — KUBECTL must word-split into command + args
  ${KUBECTL} rollout restart deployment coredns -n kube-system
  sleep 30

  if ! check_coredns; then
    log "CoreDNS 修復失敗"
    return 1
  fi

  log "CoreDNS 修復成功"
  # Three-line message joined with real newlines.
  printf -v notice '%s\n%s\n%s' \
    '🟢 <b>K8s CoreDNS 已修復</b>' \
    '動作: 重啟 CoreDNS Deployment' \
    "時間: $(date '+%Y-%m-%d %H:%M:%S')"
  send_telegram "$notice"
  return 0
}
# ===== MOMO app pod status check =====
# Inspect every pod labelled app=momo-app in the application namespace.
# Globals:   KUBECTL, MOMO_NAMESPACE (read)
# Outputs:   stdout - on failure only, one "Pod ...\n" entry per unhealthy pod.
#            The backslash-n is a literal two-character separator, which the
#            caller concatenates into its own report. Log lines go to stderr.
# Returns:   1 when no pod is Running and Ready (including when no pod exists
#            at all); 0 as long as at least one pod is healthy
check_momo_pods() {
  local pod_info issues="" has_healthy=false
  local name phase ready restarts

  # A failed query yields an empty list, handled as "no pods" below.
  # shellcheck disable=SC2086 — KUBECTL must word-split into command + args
  pod_info=$(${KUBECTL} get pods -n "${MOMO_NAMESPACE}" -l app=momo-app \
    -o jsonpath='{range .items[*]}{.metadata.name}|{.status.phase}|{.status.containerStatuses[0].ready}|{.status.containerStatuses[0].restartCount}{"\n"}{end}' \
    2>/dev/null) || true

  while IFS='|' read -r name phase ready restarts; do
    [[ -z "$name" ]] && continue
    if [[ "$phase" == "Running" ]] && [[ "$ready" == "true" ]]; then
      has_healthy=true
      # stdout carries this function's result, so log output goes to stderr;
      # otherwise the caller's $(check_momo_pods) would capture the log lines.
      log "Pod $name: Running, Ready" >&2
    else
      issues="${issues}Pod ${name}: Phase=${phase}, Ready=${ready}, Restarts=${restarts}\n"
      log "WARNING: Pod $name 異常 - Phase: $phase, Ready: $ready, Restarts: $restarts" >&2
    fi
  done <<< "$pod_info"

  if [[ "$has_healthy" == "true" ]]; then
    return 0
  fi

  # An empty pod list is an outage too, not a healthy state.
  if [[ -z "$issues" ]]; then
    issues="找不到任何 momo-app Pod\n"
    log "WARNING: 找不到任何 momo-app Pod" >&2
  fi

  printf '%s\n' "$issues"
  return 1
}
# ===== MOMO app health endpoint check =====
# Request APP_HEALTH_URL (10-second limit) and compare the HTTP status code.
# Globals:   APP_HEALTH_URL (read)
# Returns:   0 when the status code is 200; 1 otherwise (a WARNING is logged)
check_app_health() {
  local status_code
  # Only the numeric status is captured; curl's own exit status is ignored.
  status_code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 10 \
    "${APP_HEALTH_URL}" 2>/dev/null) || true

  [[ "$status_code" == "200" ]] && return 0

  log "WARNING: App 健康檢查失敗 - HTTP $status_code"
  return 1
}
# ===== MOMO app repair =====
# Run one escalation step against the momo-app Deployment, then verify the
# application health endpoint.
# Globals:   KUBECTL, MOMO_NAMESPACE (read)
# Arguments: $1 - step number:
#                 1 = rolling restart of the Deployment
#                 2 = repair CoreDNS if unhealthy, then force-delete the pods
#                 3 = scale the Deployment to zero and back up
# Returns:   0 if check_app_health passes after the step; 1 otherwise, or when
#            the step number is unknown
repair_momo_app() {
  local repair_step="$1"
  local replicas

  # shellcheck disable=SC2086 — KUBECTL must word-split into command + args
  case "$repair_step" in
    1)
      # Step 1: restart the pods
      log "修復步驟 1: 重啟 momo-app Pod..."
      ${KUBECTL} rollout restart deployment momo-app -n "${MOMO_NAMESPACE}"
      sleep 60
      ;;
    2)
      # Step 2: check CoreDNS first
      log "修復步驟 2: 檢查並修復 CoreDNS..."
      if ! check_coredns; then
        repair_coredns
      fi
      # Then recreate the pods
      ${KUBECTL} delete pods -l app=momo-app -n "${MOMO_NAMESPACE}" --force --grace-period=0 2>/dev/null
      sleep 60
      ;;
    3)
      # Step 3: rebuild the whole Deployment
      log "修復步驟 3: 強制重建 Deployment..."
      # Remember the configured replica count so scaling back up restores it
      # rather than shrinking the Deployment to one pod. Fall back to 1 when
      # the count is unreadable or zero (e.g. left at 0 by an interrupted run).
      replicas=$(${KUBECTL} get deployment momo-app -n "${MOMO_NAMESPACE}" \
        -o jsonpath='{.spec.replicas}' 2>/dev/null) || true
      [[ "$replicas" =~ ^[1-9][0-9]*$ ]] || replicas=1
      ${KUBECTL} scale deployment momo-app -n "${MOMO_NAMESPACE}" --replicas=0
      sleep 10
      ${KUBECTL} scale deployment momo-app -n "${MOMO_NAMESPACE}" --replicas="${replicas}"
      sleep 90
      ;;
    *)
      return 1
      ;;
  esac

  # Verify the repair
  if check_app_health; then
    log "momo-app 修復成功"
    return 0
  fi
  return 1
}
# ===== Main program =====
# Run one monitoring pass: verify kubectl access, then check CoreDNS, the
# momo-app pods, the app health endpoint, PostgreSQL and the scheduler. It
# attempts repairs through repair_coredns / repair_momo_app, sends Telegram
# alerts, and logs a summary.
# Globals:   KUBECTL, MOMO_NAMESPACE, APP_HEALTH_URL (read)
# Returns:   0 after a completed pass; exits 1 when kubectl is unusable
main() {
  local issues_found=false
  local issue_details=""
  local pod_issues pod_rc=0
  local pg_status scheduler_status step

  log "========== K8s 健康檢查開始 =========="

  # Every Telegram send below is best-effort (`|| true`): under `set -e` a
  # failed notification must not abort the pass before the repairs run.

  # Check kubectl
  if ! check_kubectl; then
    send_telegram "🔴 <b>K8s 監控錯誤</b>
kubectl 權限不足,無法執行監控
時間: $(date '+%Y-%m-%d %H:%M:%S')" || true
    exit 1
  fi

  # 1. Check CoreDNS
  if ! check_coredns; then
    issues_found=true
    issue_details="${issue_details}• CoreDNS 異常\n"
    # Try to repair CoreDNS
    if repair_coredns; then
      issue_details="${issue_details} ✅ CoreDNS 已修復\n"
    else
      issue_details="${issue_details} ❌ CoreDNS 修復失敗\n"
    fi
  fi

  # 2. Check the momo-app pods
  # Assignment is kept separate from `local`: `local x=$(cmd)` reports the
  # status of `local` (always 0), which would hide every pod failure.
  pod_issues=$(check_momo_pods) || pod_rc=$?
  if [[ "$pod_rc" -ne 0 ]]; then
    issues_found=true
    issue_details="${issue_details}• momo-app Pod 異常:\n${pod_issues}"
  fi

  # 3. Check the app health endpoint
  if ! check_app_health; then
    issues_found=true
    issue_details="${issue_details}• App 健康檢查失敗\n"
    # Send the alert
    send_telegram "🔴 <b>MOMO App 服務異常</b>
健康檢查失敗 (HTTP != 200)
URL: ${APP_HEALTH_URL}
<b>自動修復中...</b>
時間: $(date '+%Y-%m-%d %H:%M:%S')" || true
    # Try to repair (at most 3 escalating steps)
    for step in 1 2 3; do
      if repair_momo_app "$step"; then
        send_telegram "🟢 <b>MOMO App 已恢復</b>
修復步驟: ${step}
時間: $(date '+%Y-%m-%d %H:%M:%S')" || true
        issue_details="${issue_details} ✅ 修復成功 (步驟 ${step})\n"
        break
      fi
      if ((step == 3)); then
        send_telegram "🔴 <b>MOMO App 修復失敗</b>
嘗試了 3 種修復方法均失敗
<b>需要人工介入</b>
SSH: ssh wooo@192.168.0.110
${KUBECTL} get pods -n momo
kubectl logs deployment/momo-app -n momo
時間: $(date '+%Y-%m-%d %H:%M:%S')" || true
        issue_details="${issue_details} ❌ 所有修復步驟均失敗\n"
      fi
    done
  fi

  # 4. Check PostgreSQL (alert only; no automatic repair)
  # shellcheck disable=SC2086 — KUBECTL must word-split into command + args
  pg_status=$(${KUBECTL} get pods -n "${MOMO_NAMESPACE}" -l app=momo-postgres \
    -o jsonpath='{.items[0].status.phase}' 2>/dev/null) || true
  if [[ "$pg_status" != "Running" ]]; then
    issues_found=true
    issue_details="${issue_details}• PostgreSQL 異常: ${pg_status}\n"
    send_telegram "🔴 <b>PostgreSQL 異常</b>
狀態: ${pg_status}
需要人工介入
時間: $(date '+%Y-%m-%d %H:%M:%S')" || true
  fi

  # 5. Check the scheduler
  # shellcheck disable=SC2086 — KUBECTL must word-split into command + args
  scheduler_status=$(${KUBECTL} get pods -n "${MOMO_NAMESPACE}" -l app=momo-scheduler \
    -o jsonpath='{.items[0].status.phase}' 2>/dev/null) || true
  if [[ "$scheduler_status" != "Running" ]]; then
    issues_found=true
    issue_details="${issue_details}• Scheduler 異常: ${scheduler_status}\n"
    # Restart the scheduler; a failed restart is logged instead of aborting
    # the pass (under `set -e`) before the summary is written.
    # shellcheck disable=SC2086
    ${KUBECTL} rollout restart deployment momo-scheduler -n "${MOMO_NAMESPACE}" \
      || log "WARNING: Scheduler 重啟失敗"
  fi

  if [[ "$issues_found" == "true" ]]; then
    # The details are assembled with literal "\n" separators; %b turns them
    # into real line breaks so the log shows one issue per line.
    log "$(printf '%b' "發現問題:\n${issue_details}")"
  else
    log "所有服務正常"
  fi
  log "========== K8s 健康檢查完成 =========="
}
# Run the main program, forwarding the script's command-line arguments.
main "$@"