Files
awoooi/scripts/health_check_session.sh

110 lines
3.5 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
# 2026-04-28 ogt + Claude Opus 4.7: P2-2 Session 啟動健康驗證
# 來源: tool-expert 統一治理方案
# 目的: 每次 Claude session 啟動時快速確認 5 主機 + 關鍵服務可達
# 純 read-only (curl + ssh -o BatchMode), 不修改任何狀態
#
# 用法:
# bash scripts/health_check_session.sh
# 或加 alias: alias awoooi-health='bash ~/awoooi/scripts/health_check_session.sh'
set -uo pipefail # deliberately no -e: a single failing check must not abort the remaining checks
# ANSI colour codes, stored as literal backslash sequences; they are turned
# into real escape bytes only when printed (via printf's %b below).
NC='\033[0m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'

# _status_line COLOUR TAG MESSAGE
# Writes "<COLOUR>[TAG]<reset> MESSAGE" followed by a newline to stdout.
_status_line() {
  printf '%b[%s]%b %s\n' "$1" "$2" "$NC" "$3"
}

# One-argument reporters used by every check: green OK, red FAIL, yellow WARN.
ok() { _status_line "$GREEN" OK "$1"; }
fail() { _status_line "$RED" FAIL "$1"; }
warn() { _status_line "$YELLOW" WARN "$1"; }
# check_url NAME URL
# Probes URL with a single curl request (3 s total budget, TLS verification
# disabled with -k, response body discarded) and reports the result:
#   2xx / 3xx        -> ok,   returns 0
#   4xx / 5xx        -> warn, returns 0 (something answered, just not happily)
#   anything else    -> fail, returns 1 (no HTTP response at all)
check_url() {
  local name=$1 url=$2
  local code
  # curl prints the -w value even when the transfer fails (it writes 000 when
  # no response was received), so only its stdout is captured here. Appending
  # a fallback code on failure would glue two codes together, e.g. "200000"
  # when the headers arrived but the body timed out.
  code=$(curl -sk --max-time 3 -o /dev/null -w '%{http_code}' "$url" 2>/dev/null) || true
  # Covers the cases where curl produced nothing usable (e.g. curl not installed).
  [[ "$code" =~ ^[0-9]{3}$ ]] || code=000

  case "$code" in
    [23]??)
      ok "$name $url ($code)"
      return 0
      ;;
    [45]??)
      warn "$name $url ($code, 服務有回應但非 2xx/3xx)"
      return 0
      ;;
    *)
      fail "$name $url (unreachable)"
      return 1
      ;;
  esac
}
# check_ssh NAME HOST
# Runs `echo ok` on HOST over ssh (3 s connect timeout, BatchMode so it never
# prompts) and reports ok / fail. Returns 0 when ssh succeeds and the reply
# contains "ok", 1 otherwise.
# Note: StrictHostKeyChecking=accept-new lets ssh record a first-seen host key
# in known_hosts.
check_ssh() {
  local name=$1 host=$2
  local reply
  # stdin comes from /dev/null so ssh cannot swallow input meant for the
  # calling script. The reply is captured and matched in-shell rather than
  # piped to `grep -q`: under pipefail an early grep exit can SIGPIPE ssh and
  # turn a working connection into a reported failure.
  if reply=$(ssh -o ConnectTimeout=3 -o BatchMode=yes -o StrictHostKeyChecking=accept-new \
      "$host" "echo ok" </dev/null 2>/dev/null) && [[ "$reply" == *ok* ]]; then
    ok "SSH $name ($host)"
    return 0
  fi
  fail "SSH $name ($host) — 無法連線 (timeout / 認證失敗 / 主機不可達)"
  return 1
}
# Repository root: the parent of the directory that contains this script.
ROOT=$(cd "$(dirname "$0")/.." && pwd)
# Local time plus zone abbreviation, shown once in the banner.
TS=$(date +'%Y-%m-%d %H:%M %Z')

# Opening banner followed by one blank line.
cat <<EOF
==========================================
AWOOOI Session Health Check $TS
==========================================

EOF
# run_checks HEADING PROBE NAME TARGET [NAME TARGET ...]
# Prints the "--- HEADING ---" line, calls PROBE once per NAME/TARGET pair in
# the order given, then prints a blank separator line. Probe exit codes are
# not acted upon; each probe prints its own result.
run_checks() {
  local heading=$1 probe=$2
  shift 2
  echo "--- $heading ---"
  while (( $# >= 2 )); do
    "$probe" "$1" "$2"
    shift 2
  done
  echo ""
}

# Kubernetes control plane.
run_checks "K8s 控制平面" check_url \
  "K3s VIP API" "https://192.168.0.125:6443/healthz" \
  "ArgoCD (121)" "https://192.168.0.121:30443"

# AI inference layer.
run_checks "AI 推理層" check_url \
  "Ollama 111 GPU" "http://192.168.0.111:11434/api/tags"

# Observability layer.
run_checks "觀測層" check_url \
  "Prometheus 110" "http://192.168.0.110:9090/-/healthy" \
  "Alertmanager 110" "http://192.168.0.110:9093/-/healthy" \
  "Gitea 110" "http://192.168.0.110:3001" \
  "Langfuse 110" "http://192.168.0.110:3100"

# AWOOOI core service (prod NodePort).
run_checks "AWOOOI 核心服務 (prod NodePort)" check_url \
  "AWOOOI API (125)" "http://192.168.0.125:32334/api/v1/health"

# SSH connectivity; the last entry is an ssh config alias rather than user@ip.
run_checks "SSH 連通" check_ssh \
  "awoooi-devops (110)" "wooo@192.168.0.110" \
  "k3s-1 (120)" "wooo@192.168.0.120" \
  "k3s-2 (121)" "wooo@192.168.0.121" \
  "ollama-111-gpu (ProxyJump 110)" "ollama-111-gpu"
# run_config_drift_check CHECKER
# Runs the Python drift checker at path CHECKER and warns when it exits
# non-zero. Warns (without running anything) when the checker file or the
# python3 interpreter is missing. Always returns 0: drift is advisory here.
run_config_drift_check() {
  local checker=$1
  # The checker is started through python3 explicitly, so it only has to
  # exist as a file; requiring the executable bit would report a present
  # checker as missing.
  if [[ ! -f "$checker" ]]; then
    warn "drift checker 不存在 ($checker)"
    return 0
  fi
  # Without this guard a missing interpreter would surface as "drift detected".
  if ! command -v python3 >/dev/null 2>&1; then
    warn "python3 not found; config drift check skipped"
    return 0
  fi
  python3 "$checker" || warn "config drift detected (見上方 [DRIFT] 行)"
  return 0
}

echo "--- Config Drift Check ---"
run_config_drift_check "$ROOT/scripts/check_config_drift.py"
echo ""
# report_git_status REPO
# Prints the current branch of the repository at REPO, how many commits it is
# ahead of its upstream ("?" when that cannot be determined), and whether
# there are uncommitted changes. Prints nothing when REPO has no .git entry.
report_git_status() {
  local repo=$1
  local branch ahead pending

  # .git is a directory in a plain clone but a regular file in linked
  # worktrees and submodules, so test for existence rather than directory.
  [[ -e "$repo/.git" ]] || return 0

  # On a detached HEAD `git branch --show-current` prints nothing and still
  # exits 0, so the fallback has to key off empty output, not exit status.
  branch=$(git -C "$repo" branch --show-current 2>/dev/null) || true
  [[ -n "$branch" ]] || branch="<detached>"

  ahead=$(git -C "$repo" rev-list --count "@{u}..HEAD" 2>/dev/null) || ahead="?"
  echo " 分支: $branch (本地超前上游 $ahead 個 commit)"

  # `git status --porcelain` lists staged, unstaged and untracked changes;
  # `git diff --quiet` alone would call a tree with staged work "clean".
  # A failing git command is reported as "has changes" rather than "clean".
  if pending=$(git -C "$repo" status --porcelain 2>/dev/null) && [[ -z "$pending" ]]; then
    ok " 工作目錄 clean"
  else
    warn " 有未 commit 的變更 (git status 自查)"
  fi
}

echo "--- Git 狀態 ---"
report_git_status "$ROOT"
# Closing banner: a blank line, then the three-line footer.
printf '%s\n' \
  "" \
  "==========================================" \
  "Session Health Check 結束" \
  "=========================================="