fix: skip unhealthy direct ollama probes
All checks were successful
CD Pipeline / deploy (push) Successful in 1m10s
All checks were successful
CD Pipeline / deploy (push) Successful in 1m10s
This commit is contained in:
@@ -211,6 +211,84 @@ def _host_label_for_embedding_health(host: str) -> str:
|
||||
return ''
|
||||
|
||||
|
||||
def _host_label_for_direct_health(host: str) -> str:
|
||||
"""Map only direct GCP Ollama URLs to host_health_probes labels."""
|
||||
if not host:
|
||||
return ''
|
||||
if '34.143.170.20:11434' in host:
|
||||
return 'Primary (GCP)'
|
||||
if '34.21.145.224:11434' in host:
|
||||
return 'Secondary (GCP)'
|
||||
return ''
|
||||
|
||||
|
||||
def _recent_direct_host_unhealthy(host: str) -> bool:
    """Tell whether a direct GCP endpoint should be skipped as recently unhealthy.

    Only hosts that ``_host_label_for_direct_health`` maps to a label are
    considered; any other host (proxy rescue URLs included) yields ``False``
    and therefore still gets probed by the caller.

    The decision is based on the newest ``host_health_probes`` row for the
    host's label. Every failure path is fail-open (returns ``False``).

    Environment:
        OLLAMA_RESOLVE_HOST_HEALTH_SKIP_ENABLED: read through ``_env_flag``
            with ``True`` passed as its second argument; a falsy result
            disables the check.
        OLLAMA_RESOLVE_HOST_HEALTH_SKIP_WINDOW_MINUTES: maximum probe age in
            minutes. Falls back to 20 when not an integer; clamped to >= 1.

    Args:
        host: Candidate Ollama host URL.

    Returns:
        ``True`` only when the latest probe row for the host is unhealthy
        and not older than the window; ``False`` in every other case.
    """
    if not _env_flag('OLLAMA_RESOLVE_HOST_HEALTH_SKIP_ENABLED', True):
        return False

    host_label = _host_label_for_direct_health(host)
    if not host_label:
        return False

    # Resolve the freshness window; bad values fall back to 20, minimum is 1.
    raw_window = os.getenv('OLLAMA_RESOLVE_HOST_HEALTH_SKIP_WINDOW_MINUTES', '20')
    try:
        window_minutes = int(raw_window)
    except (TypeError, ValueError):
        window_minutes = 20
    if window_minutes < 1:
        window_minutes = 1

    latest_probe_sql = """
        SELECT healthy, error_msg, probed_at
        FROM host_health_probes
        WHERE host_label = :host_label
        ORDER BY probed_at DESC
        LIMIT 1
    """

    # Imports stay inside the try so a missing dependency or DB failure
    # degrades to "do not skip" instead of breaking host resolution.
    try:
        from sqlalchemy import text as sa_text
        from database.manager import get_session

        session = get_session()
        try:
            result = session.execute(sa_text(latest_probe_sql), {'host_label': host_label})
            row = result.fetchone()
        finally:
            session.close()
    except Exception:
        logger.debug("[OllamaHost] direct host health skip fail-open for host=%s", host, exc_info=True)
        return False

    if not row:
        return False

    healthy = row[0]
    error_msg = row[1]
    probed_at = row[2]

    # NOTE(review): a row with a missing ``probed_at`` bypasses the age check,
    # so an unhealthy row without a timestamp is treated as recent - confirm
    # this is intended (and that the column cannot be NULL).
    if probed_at:
        try:
            # NOTE(review): naive timestamps are compared against local
            # ``datetime.now()``; presumably the probe writer stores local
            # time - verify against the writer / DB timezone.
            if getattr(probed_at, 'tzinfo', None):
                now = datetime.now(probed_at.tzinfo)
            else:
                now = datetime.now()
            too_old = now - probed_at > timedelta(minutes=window_minutes)
        except Exception:
            logger.debug("[OllamaHost] could not evaluate host health probe age for host=%s", host, exc_info=True)
            return False
        if too_old:
            return False

    if healthy:
        return False

    logger.warning(
        "[OllamaHost] skip recent unhealthy direct host=%s label=%s window=%sm error=%s",
        host,
        host_label,
        window_minutes,
        (error_msg or '')[:180],
    )
    return True
|
||||
|
||||
|
||||
def _recent_embedding_host_unhealthy(host: str) -> bool:
|
||||
"""Skip known-bad GCP embedding runtimes using recent host_health_probes rows.
|
||||
|
||||
@@ -507,9 +585,11 @@ def resolve_ollama_host(primary: str = OLLAMA_HOST_PRIMARY,
|
||||
|
||||
primary_proxy = _proxy_rescue_for_primary(primary)
|
||||
secondary_proxy = _proxy_rescue_for_secondary(secondary)
|
||||
primary_recent_unhealthy = _recent_direct_host_unhealthy(primary)
|
||||
secondary_recent_unhealthy = _recent_direct_host_unhealthy(secondary)
|
||||
|
||||
# B4: primary 若被標 unhealthy,先嘗試同順位 110 proxy,再嘗試 secondary
|
||||
if not _is_unhealthy(primary) and _is_reachable(primary):
|
||||
if not _is_unhealthy(primary) and not primary_recent_unhealthy and _is_reachable(primary):
|
||||
selected = primary
|
||||
logger.info(f"[OllamaHost] Primary 主機可用: {primary}")
|
||||
elif primary_proxy and not _is_unhealthy(primary_proxy) and _is_reachable(primary_proxy):
|
||||
@@ -518,7 +598,7 @@ def resolve_ollama_host(primary: str = OLLAMA_HOST_PRIMARY,
|
||||
"[OllamaHost] Primary direct 不可用,使用 110 primary proxy: %s",
|
||||
primary_proxy,
|
||||
)
|
||||
elif not _is_unhealthy(secondary) and _is_reachable(secondary):
|
||||
elif not _is_unhealthy(secondary) and not secondary_recent_unhealthy and _is_reachable(secondary):
|
||||
selected = secondary
|
||||
logger.info(f"[OllamaHost] Primary 不可用,使用 Secondary: {secondary}")
|
||||
elif secondary_proxy and not _is_unhealthy(secondary_proxy) and _is_reachable(secondary_proxy):
|
||||
|
||||
Reference in New Issue
Block a user