137 lines
4.8 KiB
Bash
Executable File
137 lines
4.8 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# Abort on any reference to an unset variable (long spelling of `set -u`).
set -o nounset

# Rootless Ollama GCP failover diagnostic.
#
# It verifies the direct GCP-A/GCP-B/111 endpoints plus the 110 proxy ports.
# It does not modify nginx, Docker, GCP, or any production service.
#
# Every setting below falls back to its built-in default when the environment
# variable named inside the ${...:-default} expansion is unset or empty.

# Proxy ports exposed on 110.
PROXY_PRIMARY_URL="${OLLAMA_PROXY_PRIMARY:-http://192.168.0.110:11435}"
PROXY_SECONDARY_URL="${OLLAMA_PROXY_SECONDARY:-http://192.168.0.110:11436}"

# Direct endpoints: GCP-A (primary), GCP-B (secondary) and the 111 fallback.
PRIMARY_URL="${OLLAMA_HOST_PRIMARY:-http://34.87.90.216:11434}"
SECONDARY_URL="${OLLAMA_HOST_SECONDARY:-http://34.21.145.224:11434}"
FALLBACK_URL="${OLLAMA_HOST_FALLBACK:-http://192.168.0.111:11434}"

# Timeouts in seconds, passed to curl as --connect-timeout / --max-time.
CONNECT_TIMEOUT="${OLLAMA_DIAG_CONNECT_TIMEOUT:-3}"
EMBED_TIMEOUT="${OLLAMA_DIAG_EMBED_TIMEOUT:-30}"

# Embedding probe: model name, master switch ("1" runs it), and the opt-in
# switch ("1") for also probing the 111 fallback.
EMBED_MODEL="${OLLAMA_DIAG_EMBED_MODEL:-bge-m3:latest}"
RUN_EMBED_PROBE="${OLLAMA_DIAG_RUN_EMBED_PROBE:-1}"
INCLUDE_111_EMBED="${OLLAMA_DIAG_INCLUDE_111_EMBED:-0}"

# Running tallies of OK / WARN / FAIL status lines, updated by status_line.
ok_count=0 warn_count=0 fail_count=0
|
|
|
|
# Print all arguments as a single line on stdout. The arguments are joined
# with the first character of IFS (a space by default) and are never treated
# as a printf format string.
say() {
  local message="$*"
  printf '%s\n' "${message}"
}
|
|
|
|
# Print one aligned report row and bump the matching global tally.
# Globals:   ok_count, warn_count, fail_count (written)
# Arguments: $1 - state; OK, WARN and FAIL are counted, anything else is
#                 printed but not counted
#            $2 - label, left-aligned in a 24-column field
#            $3 - free-form detail text
# Outputs:   the formatted row on stdout
status_line() {
  local level="$1"
  local subject="$2"
  local message="$3"

  printf '%-6s %-24s %s\n' "$level" "$subject" "$message"

  if [[ "$level" == "OK" ]]; then
    ok_count=$((ok_count + 1))
  elif [[ "$level" == "WARN" ]]; then
    warn_count=$((warn_count + 1))
  elif [[ "$level" == "FAIL" ]]; then
    fail_count=$((fail_count + 1))
  fi
}
|
|
|
|
# GET ${url}${path} with curl and report the outcome through status_line.
# Globals:   CONNECT_TIMEOUT (read)
# Arguments: $1 - label shown in the report
#            $2 - base URL (one trailing slash is stripped)
#            $3 - request path, default /api/version
#            $4 - curl --max-time in seconds, default 10
# Outputs:   one OK or FAIL row via status_line
# Returns:   0 when curl succeeds and the HTTP status is 200, otherwise 1
curl_probe() {
  local label="$1"
  local url="$2"
  local path="${3:-/api/version}"
  local max_time="${4:-10}"
  local body_file err_file code detail
  local rc=0
  local nl=$'\n' cr=$'\r'

  # Without a temp file curl would write to "" and ".err" in the current
  # directory, so treat a mktemp failure as a failed probe.
  body_file="$(mktemp)" || {
    status_line FAIL "$label" "${path} mktemp failed"
    return 1
  }
  err_file="${body_file}.err"

  # stdout carries only the -w write-out ("<http_code> <time_total>"); the
  # response body goes to body_file and curl's own messages to err_file.
  code="$(curl -sS --connect-timeout "$CONNECT_TIMEOUT" --max-time "$max_time" \
    -o "$body_file" -w '%{http_code} %{time_total}' "${url%/}${path}" \
    2>"$err_file")" || rc=$?

  if [[ "$rc" -eq 0 && "${code%% *}" == "200" ]]; then
    rm -f -- "$body_file" "$err_file"
    status_line OK "$label" "${path} ${code}"
    return 0
  fi

  # Prefer curl's error text; fall back to the start of the response body.
  detail="$(head -c 180 "$err_file" 2>/dev/null || true)"
  if [[ -z "$detail" ]]; then
    detail="$(head -c 180 "$body_file" 2>/dev/null || true)"
  fi
  rm -f -- "$body_file" "$err_file"

  # Flatten multi-line output (e.g. an HTML error page) so every probe still
  # produces exactly one report row.
  detail="${detail//"$cr"/}"
  detail="${detail//"$nl"/ }"

  status_line FAIL "$label" "${path} ${code:-curl_rc=$rc}${detail:+ ${detail}}"
  return 1
}
|
|
|
|
# POST a one-word embedding request to ${url}/api/embed and report the
# outcome through status_line.
# Globals:   CONNECT_TIMEOUT, EMBED_TIMEOUT, EMBED_MODEL (read)
# Arguments: $1 - label shown in the report
#            $2 - base URL (one trailing slash is stripped)
# Outputs:   one OK or FAIL row via status_line
# Returns:   0 when curl succeeds and the HTTP status is 200, otherwise 1
embed_probe() {
  local label="$1"
  local url="$2"
  local body_file err_file code detail model_json payload
  local rc=0
  local nl=$'\n' cr=$'\r'

  # Escape backslashes and double quotes so an unusual model name cannot
  # produce invalid JSON. The default model name passes through unchanged.
  model_json="${EMBED_MODEL//\\/\\\\}"
  model_json="${model_json//\"/\\\"}"
  payload="{\"model\":\"${model_json}\",\"input\":\"health\",\"keep_alive\":\"1m\"}"

  # Without a temp file curl would write to "" and ".err" in the current
  # directory, so treat a mktemp failure as a failed probe.
  body_file="$(mktemp)" || {
    status_line FAIL "$label" "/api/embed mktemp failed"
    return 1
  }
  err_file="${body_file}.err"

  # stdout carries only the -w write-out
  # ("<http_code> <time_total> <size_download>"); the response body goes to
  # body_file and curl's own messages to err_file.
  code="$(curl -sS --connect-timeout "$CONNECT_TIMEOUT" --max-time "$EMBED_TIMEOUT" \
    -H 'Content-Type: application/json' \
    -d "$payload" \
    -o "$body_file" -w '%{http_code} %{time_total} %{size_download}' \
    "${url%/}/api/embed" 2>"$err_file")" || rc=$?

  if [[ "$rc" -eq 0 && "${code%% *}" == "200" ]]; then
    rm -f -- "$body_file" "$err_file"
    status_line OK "$label" "/api/embed ${code}"
    return 0
  fi

  # Prefer curl's error text; fall back to the start of the response body.
  detail="$(head -c 180 "$err_file" 2>/dev/null || true)"
  if [[ -z "$detail" ]]; then
    detail="$(head -c 180 "$body_file" 2>/dev/null || true)"
  fi
  rm -f -- "$body_file" "$err_file"

  # Flatten multi-line output so every probe still produces exactly one
  # report row.
  detail="${detail//"$cr"/}"
  detail="${detail//"$nl"/ }"

  status_line FAIL "$label" "/api/embed ${code:-curl_rc=$rc}${detail:+ ${detail}}"
  return 1
}
|
|
|
|
# ---------------------------------------------------------------------------
# Main flow: print the targets, probe /api/version on every endpoint,
# optionally run the embedding probe, then print a summary and exit.
#
# Exit status:
#   2  GCP-B direct /api/version is unhealthy
#   1  GCP-B is healthy, but GCP-A direct or the 110 primary proxy is not
#   0  otherwise
# The 111 fallback, the 110 secondary proxy and the embed probes are reported
# but do not influence the exit status.
# ---------------------------------------------------------------------------
say "Ollama GCP failover diagnostic"
say "time=$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
say "primary=${PRIMARY_URL}"
say "secondary=${SECONDARY_URL}"
say "fallback=${FALLBACK_URL}"
say "proxy_primary=${PROXY_PRIMARY_URL}"
say "proxy_secondary=${PROXY_SECONDARY_URL}"
say ""

# Health flags; each is flipped to 1 only when its /api/version probe passes.
primary_version_ok=0
secondary_version_ok=0
proxy_primary_ok=0
proxy_secondary_ok=0

curl_probe "GCP-A direct" "$PRIMARY_URL" "/api/version" 10 && primary_version_ok=1
curl_probe "GCP-B direct" "$SECONDARY_URL" "/api/version" 10 && secondary_version_ok=1
# The fallback result is only reported; nothing below depends on it.
curl_probe "111 fallback" "$FALLBACK_URL" "/api/version" 10 || true
curl_probe "110 proxy primary" "$PROXY_PRIMARY_URL" "/api/version" 10 && proxy_primary_ok=1
curl_probe "110 proxy secondary" "$PROXY_SECONDARY_URL" "/api/version" 10 && proxy_secondary_ok=1

if [[ "$RUN_EMBED_PROBE" == "1" ]]; then
  say ""
  say "Embedding runtime probe model=${EMBED_MODEL} timeout=${EMBED_TIMEOUT}s"

  # Use a real if/else here: with `check && probe || warn`, a failing probe
  # on a healthy host would also print the "skipped" warning and inflate
  # the WARN tally. embed_probe already reports its own FAIL row.
  if [[ "$primary_version_ok" == "1" ]]; then
    embed_probe "GCP-A embed" "$PRIMARY_URL" || true
  else
    status_line WARN "GCP-A embed" "skipped because /api/version is not healthy"
  fi

  if [[ "$secondary_version_ok" == "1" ]]; then
    embed_probe "GCP-B embed" "$SECONDARY_URL" || true
  else
    status_line WARN "GCP-B embed" "skipped because /api/version is not healthy"
  fi

  if [[ "$INCLUDE_111_EMBED" == "1" ]]; then
    embed_probe "111 embed" "$FALLBACK_URL" || true
  else
    status_line WARN "111 embed" "skipped by default; 111 must not carry background bge-m3 probes"
  fi
fi

say ""
say "Summary: OK=${ok_count} WARN=${warn_count} FAIL=${fail_count}"

if [[ "$primary_version_ok" != "1" && "$secondary_version_ok" == "1" ]]; then
  say "Diagnosis: GCP-A primary is down/refusing, but GCP-B is available. Keep app routing on GCP-A -> GCP-B -> 111; repair GCP-A with GCP/SSH access."
fi

if [[ "$proxy_primary_ok" != "1" && "$proxy_secondary_ok" == "1" ]]; then
  say "Diagnosis: 110:11435 primary proxy is unhealthy while 110:11436 works. Root on 110 is required for nginx failover config or reload."
fi

# Losing GCP-B is the most severe outcome and takes precedence.
if [[ "$secondary_version_ok" != "1" ]]; then
  exit 2
fi

if [[ "$primary_version_ok" != "1" || "$proxy_primary_ok" != "1" ]]; then
  exit 1
fi

exit 0
|