加入 Ollama GCP failover 診斷與 unhealthy skip
All checks were successful
CD Pipeline / deploy (push) Successful in 1m5s

This commit is contained in:
OoO
2026-05-25 14:16:51 +08:00
parent 44ef5a70a1
commit a00f34ce87
11 changed files with 341 additions and 8 deletions

View File

@@ -0,0 +1,136 @@
#!/usr/bin/env bash
# Rootless Ollama GCP failover diagnostic.
# It verifies the direct GCP-A/GCP-B/111 endpoints plus the 110 proxy ports.
# It does not modify nginx, Docker, GCP, or any production service.
#
# Every setting below can be overridden through the environment variable
# named inside its expansion; the value after ":-" is the default.

# Treat references to unset variables as errors.
set -u

# Result tallies, incremented by status_line.
ok_count=0 warn_count=0 fail_count=0

# Probe tuning.
# CONNECT_TIMEOUT is handed to curl --connect-timeout for every probe;
# EMBED_TIMEOUT is the curl --max-time for the /api/embed probe.
CONNECT_TIMEOUT="${OLLAMA_DIAG_CONNECT_TIMEOUT:-3}"
EMBED_TIMEOUT="${OLLAMA_DIAG_EMBED_TIMEOUT:-30}"
EMBED_MODEL="${OLLAMA_DIAG_EMBED_MODEL:-bge-m3:latest}"

# Feature switches; the value "1" enables the corresponding embed probe.
RUN_EMBED_PROBE="${OLLAMA_DIAG_RUN_EMBED_PROBE:-1}"
INCLUDE_111_EMBED="${OLLAMA_DIAG_INCLUDE_111_EMBED:-0}"

# Direct endpoints: GCP-A (primary), GCP-B (secondary), 111 (fallback).
PRIMARY_URL="${OLLAMA_HOST_PRIMARY:-http://34.143.170.20:11434}"
SECONDARY_URL="${OLLAMA_HOST_SECONDARY:-http://34.21.145.224:11434}"
FALLBACK_URL="${OLLAMA_HOST_FALLBACK:-http://192.168.0.111:11434}"

# Proxy ports exposed on 110.
PROXY_PRIMARY_URL="${OLLAMA_PROXY_PRIMARY:-http://192.168.0.110:11435}"
PROXY_SECONDARY_URL="${OLLAMA_PROXY_SECONDARY:-http://192.168.0.110:11436}"
# Write all arguments, joined into a single string, as one line on stdout.
say() { printf '%s\n' "$*"; }
#######################################
# Print one aligned result row and bump the tally matching its state.
# Globals:
#   ok_count, warn_count, fail_count (written)
# Arguments:
#   $1 - state; OK, WARN and FAIL are counted, anything else is only printed
#   $2 - label column
#   $3 - free-form detail text
# Outputs:
#   "<state> <label> <detail>" on stdout, state padded to 6 columns and
#   label padded to 24 columns.
#######################################
status_line() {
  local state="$1" label="$2" detail="$3"

  printf '%-6s %-24s %s\n' "$state" "$label" "$detail"

  if [[ "$state" == "OK" ]]; then
    ok_count=$((ok_count + 1))
  elif [[ "$state" == "WARN" ]]; then
    warn_count=$((warn_count + 1))
  elif [[ "$state" == "FAIL" ]]; then
    fail_count=$((fail_count + 1))
  fi
}
#######################################
# GET one HTTP endpoint with curl and record the outcome via status_line.
# Globals:
#   CONNECT_TIMEOUT (read) - passed to curl --connect-timeout
# Arguments:
#   $1 - label shown in the result row
#   $2 - base URL; one trailing "/" is stripped before the path is appended
#   $3 - request path (default: /api/version)
#   $4 - curl --max-time in seconds (default: 10)
# Outputs:
#   One OK or FAIL row through status_line. A FAIL row carries the curl
#   write-out text, the curl exit code when it is non-zero, and up to 180
#   bytes of curl's stderr (or of the response body when stderr is empty).
# Returns:
#   0 when curl exits 0 and the HTTP status is 200, 1 otherwise.
#######################################
curl_probe() {
  local label="$1"
  local url="$2"
  local path="${3:-/api/version}"
  local max_time="${4:-10}"
  local output code rc err detail

  # Without a temp file curl would write to "" and ".err" in the cwd.
  if ! output="$(mktemp)"; then
    status_line FAIL "$label" "${path} mktemp failed"
    return 1
  fi

  # Body goes to $output, curl's own diagnostics to $output.err, and only
  # the write-out string ("<http_code> <time_total>") is captured in $code.
  code="$(curl -sS --connect-timeout "$CONNECT_TIMEOUT" --max-time "$max_time" \
    -o "$output" -w '%{http_code} %{time_total}' "${url%/}${path}" 2>"$output.err")"
  rc=$?

  if [[ "$rc" -eq 0 && "${code%% *}" == "200" ]]; then
    status_line OK "$label" "${path} ${code}"
    rm -f "$output" "$output.err"
    return 0
  fi

  # Prefer curl's error text; fall back to the start of the response body.
  err="$(head -c 180 "$output.err" 2>/dev/null || true)"
  if [[ -z "$err" ]]; then
    err="$(head -c 180 "$output" 2>/dev/null || true)"
  fi

  # curl still emits the write-out on transport failures ("000 ..."), so the
  # exit code has to be appended explicitly or it would never be shown.
  detail="$path"
  if [[ -n "$code" ]]; then
    detail+=" ${code}"
  fi
  if [[ "$rc" -ne 0 ]]; then
    detail+=" curl_rc=${rc}"
  fi
  detail+=" ${err}"

  status_line FAIL "$label" "$detail"
  rm -f "$output" "$output.err"
  return 1
}
#######################################
# POST a tiny embedding request to <url>/api/embed and record the outcome
# via status_line.
# Globals:
#   CONNECT_TIMEOUT (read) - passed to curl --connect-timeout
#   EMBED_TIMEOUT   (read) - passed to curl --max-time
#   EMBED_MODEL     (read) - inserted verbatim as the JSON "model" value
# Arguments:
#   $1 - label shown in the result row
#   $2 - base URL; one trailing "/" is stripped before /api/embed is appended
# Outputs:
#   One OK or FAIL row through status_line. A FAIL row carries the curl
#   write-out text, the curl exit code when it is non-zero, and up to 180
#   bytes of curl's stderr (or of the response body when stderr is empty).
# Returns:
#   0 when curl exits 0 and the HTTP status is 200, 1 otherwise.
#######################################
embed_probe() {
  local label="$1"
  local url="$2"
  local output code rc err detail

  # Without a temp file curl would write to "" and ".err" in the cwd.
  if ! output="$(mktemp)"; then
    status_line FAIL "$label" "/api/embed mktemp failed"
    return 1
  fi

  # Body goes to $output, curl's own diagnostics to $output.err, and only the
  # write-out string ("<http_code> <time_total> <size_download>") is captured.
  code="$(curl -sS --connect-timeout "$CONNECT_TIMEOUT" --max-time "$EMBED_TIMEOUT" \
    -H 'Content-Type: application/json' \
    -d "{\"model\":\"${EMBED_MODEL}\",\"input\":\"health\",\"keep_alive\":\"1m\"}" \
    -o "$output" -w '%{http_code} %{time_total} %{size_download}' \
    "${url%/}/api/embed" 2>"$output.err")"
  rc=$?

  if [[ "$rc" -eq 0 && "${code%% *}" == "200" ]]; then
    status_line OK "$label" "/api/embed ${code}"
    rm -f "$output" "$output.err"
    return 0
  fi

  # Prefer curl's error text; fall back to the start of the response body.
  err="$(head -c 180 "$output.err" 2>/dev/null || true)"
  if [[ -z "$err" ]]; then
    err="$(head -c 180 "$output" 2>/dev/null || true)"
  fi

  # curl still emits the write-out on transport failures ("000 ..."), so the
  # exit code has to be appended explicitly or it would never be shown.
  detail="/api/embed"
  if [[ -n "$code" ]]; then
    detail+=" ${code}"
  fi
  if [[ "$rc" -ne 0 ]]; then
    detail+=" curl_rc=${rc}"
  fi
  detail+=" ${err}"

  status_line FAIL "$label" "$detail"
  rm -f "$output" "$output.err"
  return 1
}
# ---------------------------------------------------------------------------
# Main flow: print the configuration, run the /api/version probes, optionally
# run the embedding probes, then summarize and pick the exit status.
#
# Exit status:
#   2 - the GCP-B direct /api/version probe failed
#   1 - the GCP-A direct or the 110 primary proxy /api/version probe failed
#   0 - otherwise
# Embed probe results are reported and counted but do not affect the exit
# status.
# ---------------------------------------------------------------------------
say "Ollama GCP failover diagnostic"
say "time=$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
say "primary=${PRIMARY_URL}"
say "secondary=${SECONDARY_URL}"
say "fallback=${FALLBACK_URL}"
say "proxy_primary=${PROXY_PRIMARY_URL}"
say "proxy_secondary=${PROXY_SECONDARY_URL}"
say ""

# Health flags; each one flips to 1 only when its /api/version probe succeeds.
primary_version_ok=0
secondary_version_ok=0
proxy_primary_ok=0
proxy_secondary_ok=0

curl_probe "GCP-A direct" "$PRIMARY_URL" "/api/version" 10 && primary_version_ok=1
curl_probe "GCP-B direct" "$SECONDARY_URL" "/api/version" 10 && secondary_version_ok=1
# The 111 result is only reported; no flag depends on it.
curl_probe "111 fallback" "$FALLBACK_URL" "/api/version" 10 || true
curl_probe "110 proxy primary" "$PROXY_PRIMARY_URL" "/api/version" 10 && proxy_primary_ok=1
curl_probe "110 proxy secondary" "$PROXY_SECONDARY_URL" "/api/version" 10 && proxy_secondary_ok=1

if [[ "$RUN_EMBED_PROBE" == "1" ]]; then
  say ""
  say "Embedding runtime probe model=${EMBED_MODEL} timeout=${EMBED_TIMEOUT}s"

  # Explicit if/else instead of `cond && probe || warn`: with the chained
  # form a probe that ran and failed was additionally reported as "skipped"
  # and counted as a WARN on top of its FAIL.
  if [[ "$primary_version_ok" == "1" ]]; then
    embed_probe "GCP-A embed" "$PRIMARY_URL" || true
  else
    status_line WARN "GCP-A embed" "skipped because /api/version is not healthy"
  fi

  if [[ "$secondary_version_ok" == "1" ]]; then
    embed_probe "GCP-B embed" "$SECONDARY_URL" || true
  else
    status_line WARN "GCP-B embed" "skipped because /api/version is not healthy"
  fi

  if [[ "$INCLUDE_111_EMBED" == "1" ]]; then
    embed_probe "111 embed" "$FALLBACK_URL" || true
  else
    status_line WARN "111 embed" "skipped by default; 111 must not carry background bge-m3 probes"
  fi
fi

say ""
say "Summary: OK=${ok_count} WARN=${warn_count} FAIL=${fail_count}"

if [[ "$primary_version_ok" != "1" && "$secondary_version_ok" == "1" ]]; then
  say "Diagnosis: GCP-A primary is down/refusing, but GCP-B is available. Keep app routing on GCP-A -> GCP-B -> 111; repair GCP-A with GCP/SSH access."
fi
if [[ "$proxy_primary_ok" != "1" && "$proxy_secondary_ok" == "1" ]]; then
  say "Diagnosis: 110:11435 primary proxy is unhealthy while 110:11436 works. Root on 110 is required for nginx failover config or reload."
fi

# A failed GCP-B check takes precedence and yields the higher exit code.
if [[ "$secondary_version_ok" != "1" ]]; then
  exit 2
fi
if [[ "$primary_version_ok" != "1" || "$proxy_primary_ok" != "1" ]]; then
  exit 1
fi
exit 0