#!/usr/bin/env bash
# 2026-04-28 ogt + Claude Opus 4.7: P2-2 session-startup health verification
# Source: tool-expert unified governance plan
# Purpose: at the start of every Claude session, quickly confirm that the
#          5 hosts and the key services are reachable
# Read-only by design (curl + ssh -o BatchMode); does not modify any state
#
# Usage:
#   bash scripts/health_check_session.sh
#   or add an alias: alias awoooi-health='bash ~/awoooi/scripts/health_check_session.sh'

# Strict-ish mode. `-e` is deliberately left out: a single failing check must
# not abort the remaining checks.
set -uo pipefail

# ANSI colour codes, stored as literal backslash escapes (single-quoted).
# They are turned into real escape characters when the logging helpers print
# them with printf.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[0;33m'
readonly NC='\033[0m' # reset ("no colour")

# Status-line helpers. Each prints "<colour>[TAG]<reset> <message>" followed by
# a newline on stdout.
#   $1 - message text (printed verbatim through %s, so '%' and backslashes in
#        the message are never interpreted)
# Reads the globals RED / GREEN / YELLOW / NC. The colour codes are passed as
# %b arguments rather than spliced into the format string, so printf expands
# their backslash escapes without treating them as format directives.
ok() { printf '%b[OK]%b %s\n' "$GREEN" "$NC" "$1"; }
fail() { printf '%b[FAIL]%b %s\n' "$RED" "$NC" "$1"; }
warn() { printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$1"; }

#######################################
# Probe an HTTP(S) endpoint and print a one-line status.
# Arguments:
#   $1 - display name of the service
#   $2 - URL to request
# Outputs:
#   One status line on stdout, via ok / warn / fail.
# Returns:
#   0 if the server answered with a 2xx/3xx (ok) or 4xx/5xx (warn) status,
#   1 otherwise (reported as unreachable).
#######################################
check_url() {
  local name=$1 url=$2
  local code

  # -s silent, -k skip TLS certificate verification, 3 s overall timeout,
  # body discarded; -w prints only the HTTP status code.
  # curl itself writes "000" when it received no response, so its exit status
  # is ignored here: appending a second fallback code on failure would glue
  # two codes together (e.g. "200000" when a timeout hits after the headers).
  code=$(curl -sk --max-time 3 -o /dev/null -w '%{http_code}' "$url" 2>/dev/null) || true

  # Anything that is not exactly three digits (e.g. empty output because curl
  # is not installed) counts as "no response".
  [[ "$code" =~ ^[0-9]{3}$ ]] || code="000"

  case "$code" in
    [23]??)
      ok "$name → $url ($code)"
      return 0
      ;;
    [45]??)
      # The service is up and answering, just not with a success status.
      warn "$name → $url ($code, 服務有回應但非 2xx/3xx)"
      return 0
      ;;
    *)
      fail "$name → $url (unreachable)"
      return 1
      ;;
  esac
}

#######################################
# Verify that a non-interactive SSH login to a host works.
# Arguments:
#   $1 - display name of the host
#   $2 - ssh destination (user@host or an ssh config alias)
# Outputs:
#   One status line on stdout, via ok / fail.
# Returns:
#   0 if ssh exited successfully and the remote "echo ok" produced a line
#   that is exactly "ok"; 1 otherwise.
#######################################
check_ssh() {
  local name=$1 host=$2
  local reply

  # -n            : stdin from /dev/null, so ssh can never swallow the caller's input
  # BatchMode=yes : never prompt for a password/passphrase; fail instead
  # NOTE(review): StrictHostKeyChecking=no disables host-key verification and
  # lets ssh add unknown host keys to known_hosts — confirm this is acceptable
  # for a script described as read-only.
  #
  # The output is captured instead of piped into `grep -q`: that avoids a
  # SIGPIPE/pipefail race, and matching a whole line ("ok") instead of a
  # substring keeps output such as "broken" or "token" from passing.
  if reply=$(ssh -n -o ConnectTimeout=3 -o BatchMode=yes -o StrictHostKeyChecking=no \
    "$host" "echo ok" 2>/dev/null) && grep -Fxq -- ok <<<"$reply"; then
    ok "SSH $name ($host)"
    return 0
  else
    fail "SSH $name ($host) — 無法連線(timeout / 認證失敗 / 主機不可達)"
    return 1
  fi
}

# Repository root = parent of the directory that contains this script.
# CDPATH is cleared and cd's stdout discarded because cd echoes the target
# directory when CDPATH is in effect, which would otherwise end up in ROOT.
# Falls back to $0 when BASH_SOURCE is unavailable (script fed via stdin).
ROOT="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]:-$0}")/.." >/dev/null && pwd)"
# Timestamp shown in the banner (local time, with time-zone abbreviation).
TS="$(date '+%Y-%m-%d %H:%M %Z')"

echo "=========================================="
echo "AWOOOI Session Health Check  $TS"
echo "=========================================="

# Each section prints a heading and then probes its endpoints. Results are
# only reported; a failing probe does not stop the script.

echo ""
echo "--- K8s 控制平面 ---"
check_url "K3s VIP API" "https://192.168.0.125:6443/healthz"
check_url "ArgoCD (121)" "https://192.168.0.121:30443"

echo ""
echo "--- AI 推理層 ---"
check_url "Ollama 111 GPU" "http://192.168.0.111:11434/api/tags"

echo ""
echo "--- 觀測層 ---"
check_url "Prometheus 110" "http://192.168.0.110:9090/-/healthy"
check_url "Alertmanager 110" "http://192.168.0.110:9093/-/healthy"
check_url "Gitea 110" "http://192.168.0.110:3001"
check_url "Langfuse 110" "http://192.168.0.110:3100"

echo ""
echo "--- AWOOOI 核心服務 (prod NodePort) ---"
check_url "AWOOOI API (125)" "http://192.168.0.125:32334/api/v1/health"

echo ""
echo "--- SSH 連通 ---"
check_ssh "awoooi-devops (110)" "wooo@192.168.0.110"
check_ssh "k3s-1 (120)" "wooo@192.168.0.120"
check_ssh "k3s-2 (121)" "wooo@192.168.0.121"
# Destination is an ssh config alias rather than user@ip.
check_ssh "ollama-111-gpu (ProxyJump 110)" "ollama-111-gpu"

echo ""
echo "--- Config Drift Check ---"
# The checker is launched through python3, so it only needs to exist as a
# regular file; its executable bit is irrelevant. (Testing -x would report a
# present but non-executable checker as missing.)
if [[ -f "$ROOT/scripts/check_config_drift.py" ]]; then
  # Any non-zero exit status of the checker is reported as drift.
  python3 "$ROOT/scripts/check_config_drift.py" \
    || warn "config drift detected (見上方 [DRIFT] 行)"
else
  warn "drift checker 不存在 ($ROOT/scripts/check_config_drift.py)"
fi

echo ""
echo "--- Git 狀態 ---"
# -e rather than -d: in a linked worktree or a submodule, .git is a file.
if [[ -e "$ROOT/.git" ]]; then
  # `git -C` is used throughout so the script's working directory is left alone.

  # `git branch --show-current` succeeds with EMPTY output on a detached HEAD
  # (and fails on git versions that lack the option), so the placeholder has
  # to cover both the failure and the empty-output case.
  branch=$(git -C "$ROOT" branch --show-current 2>/dev/null) || branch=""
  [[ -n "$branch" ]] || branch="<detached>"

  # Number of local commits not yet on the upstream branch; "?" when no
  # upstream is configured or the count cannot be determined.
  upstream_diff=$(git -C "$ROOT" rev-list --count "@{u}..HEAD" 2>/dev/null) || upstream_diff="?"
  [[ -n "$upstream_diff" ]] || upstream_diff="?"

  echo " 分支: $branch (本地超前上游 $upstream_diff 個 commit)"

  # `git status --porcelain` lists staged, unstaged and untracked changes;
  # `git diff --quiet` alone would only notice unstaged edits to tracked files.
  # If git status itself fails, the tree is not reported as clean.
  if pending=$(git -C "$ROOT" status --porcelain 2>/dev/null) && [[ -z "$pending" ]]; then
    ok " 工作目錄 clean"
  else
    warn " 有未 commit 的變更(git status 自查)"
  fi
fi

# Closing banner. The last command is an echo, so the script's exit status
# does not reflect individual check failures.
echo ""
echo "=========================================="
echo "Session Health Check 結束"
echo "=========================================="