fix(alerts): bypass proxy timeout for GCP Ollama
This commit is contained in:
@@ -215,8 +215,8 @@ class Settings(BaseSettings):
|
||||
description="Phase 25 P0: DIAGNOSE NIM timeout (秒),實測 2.2-27.3s avg 10.6s,60s 含 buffer",
|
||||
)
|
||||
OLLAMA_DIAGNOSE_TIMEOUT_SECONDS: int = Field(
|
||||
default=200,
|
||||
description="Phase 25 P0: Ollama timeout (秒),實測 CPU-only 238s,保留欄位但 DIAGNOSE 不再走 Ollama",
|
||||
default=300,
|
||||
description="Ollama diagnose timeout (秒)。GCP qwen3:14b CPU-only can exceed the old 120s proxy limit.",
|
||||
)
|
||||
|
||||
# ==========================================================================
|
||||
@@ -534,7 +534,7 @@ class Settings(BaseSettings):
|
||||
),
|
||||
)
|
||||
INCIDENT_LLM_TIMEOUT_SECONDS: int = Field(
|
||||
default=240,
|
||||
default=360,
|
||||
description=(
|
||||
"Outer timeout for incident OpenClaw proposal generation. This must "
|
||||
"be long enough for the GCP-A/GCP-B/111 Ollama lane to complete "
|
||||
|
||||
@@ -70,11 +70,11 @@ def _agent_debate_global_timeout_seconds() -> float:
|
||||
90s guard. Keep a hard ceiling, but make it an explicit deployment knob.
|
||||
"""
|
||||
|
||||
raw = os.environ.get("AGENT_DEBATE_GLOBAL_TIMEOUT_SEC", "260.0")
|
||||
raw = os.environ.get("AGENT_DEBATE_GLOBAL_TIMEOUT_SEC", "420.0")
|
||||
try:
|
||||
timeout = float(raw)
|
||||
except (TypeError, ValueError):
|
||||
timeout = 260.0
|
||||
timeout = 420.0
|
||||
return max(timeout, 90.0)
|
||||
|
||||
|
||||
|
||||
@@ -143,7 +143,7 @@ class OllamaProvider:
|
||||
options = registry.get_provider_options("ollama")
|
||||
|
||||
# P0 2026-04-04 Claude Code: per-task timeout(Option C 分情境)
|
||||
# FORCE_LOCAL/diagnose → OLLAMA_DIAGNOSE_TIMEOUT_SECONDS (200s,實測 ~173s)
|
||||
# FORCE_LOCAL/diagnose → OLLAMA_DIAGNOSE_TIMEOUT_SECONDS
|
||||
# 其他 → OPENCLAW_TIMEOUT(既有設定)
|
||||
task_type = (context or {}).get("task_type", "")
|
||||
if task_type in ("diagnose", "force_local"):
|
||||
|
||||
@@ -1137,7 +1137,7 @@ class OpenClawService:
|
||||
|
||||
# 2026-04-29 ogt + Claude Code: 注入 task_type 讓 Ollama 用正確 timeout
|
||||
# 根因: ai_providers/ollama.py 的 OllamaProvider 讀 context["task_type"] 決定 timeout
|
||||
# - "diagnose"/"force_local" → OLLAMA_DIAGNOSE_TIMEOUT_SECONDS=200s
|
||||
# - "diagnose"/"force_local" → OLLAMA_DIAGNOSE_TIMEOUT_SECONDS
|
||||
# - 其他/未注入 → OPENCLAW_TIMEOUT=30s(不夠 qwen2.5:7b 推理)
|
||||
# webhooks alert_context 從未注入 task_type → Ollama fallback 永遠 30s timeout
|
||||
# 對齊 decision.intent 後 Ollama fallback 真正能跑完
|
||||
|
||||
@@ -71,8 +71,8 @@ class TestTimeoutDefaults:
|
||||
f"Critic default timeout 期望 15.0,實際 {mod.AGENT_CRITIC_TIMEOUT_SEC}"
|
||||
)
|
||||
|
||||
def test_agent_debate_global_timeout_default_is_260(self, monkeypatch):
|
||||
"""Agent debate global timeout defaults to the GCP Ollama-first budget."""
|
||||
def test_agent_debate_global_timeout_default_is_420(self, monkeypatch):
|
||||
"""Agent debate global timeout defaults to the direct GCP qwen3 budget."""
|
||||
monkeypatch.delenv("AGENT_DEBATE_GLOBAL_TIMEOUT_SEC", raising=False)
|
||||
|
||||
if "src.services.agent_orchestrator" in sys.modules:
|
||||
@@ -80,7 +80,7 @@ class TestTimeoutDefaults:
|
||||
import src.services.agent_orchestrator as mod
|
||||
importlib.reload(mod)
|
||||
|
||||
assert mod.GLOBAL_TIMEOUT_SEC == 260.0
|
||||
assert mod.GLOBAL_TIMEOUT_SEC == 420.0
|
||||
|
||||
def test_deprecated_alias_matches_new_constant_diagnostician(self, monkeypatch):
|
||||
"""PHASE2_STEP_TIMEOUT_SEC alias 應等於 AGENT_DIAGNOSTICIAN_TIMEOUT_SEC(相容性保證)"""
|
||||
@@ -183,7 +183,7 @@ class TestEnvOverride:
|
||||
assert mod.PHASE2_STEP_TIMEOUT_SEC == mod.AGENT_CRITIC_TIMEOUT_SEC
|
||||
|
||||
def test_agent_debate_global_timeout_env_override(self, monkeypatch):
|
||||
"""AGENT_DEBATE_GLOBAL_TIMEOUT_SEC=300 覆蓋 default 260.0"""
|
||||
"""AGENT_DEBATE_GLOBAL_TIMEOUT_SEC=300 覆蓋 default 420.0"""
|
||||
monkeypatch.setenv("AGENT_DEBATE_GLOBAL_TIMEOUT_SEC", "300")
|
||||
|
||||
if "src.services.agent_orchestrator" in sys.modules:
|
||||
|
||||
@@ -1,3 +1,16 @@
|
||||
## 2026-05-06 | GCP Ollama direct endpoint hotfix for alert diagnosis
|
||||
|
||||
**背景**:生產 log 顯示 alert path 的 provider order 已是 `ollama_gcp_a → ollama_gcp_b → ollama_local → gemini`,但 GCP-A/GCP-B 經 110 nginx bridge 各跑滿 120s 後回 `504 Gateway Time-out`,因此最後仍 fallback 到 Gemini 並產生成本。110 同時存在 `conf.d/ollama-gcp-proxy.conf`(120s)與 `sites-enabled/110-ollama-proxy.conf`(300s),較早載入的 `conf.d` 實際截斷了 qwen3:14b。
|
||||
|
||||
**本次修補**:
|
||||
- production active endpoint 暫改 direct GCP:`OLLAMA_URL=http://34.143.170.20:11434`、`OLLAMA_SECONDARY_URL=http://34.21.145.224:11434`,111 維持最後 Ollama fallback。
|
||||
- `OLLAMA_DIAGNOSE_TIMEOUT_SECONDS=300`、`INCIDENT_LLM_TIMEOUT_SECONDS=360`、`AGENT_DEBATE_GLOBAL_TIMEOUT_SEC=420`,讓 qwen3:14b 有足夠時間完成。
|
||||
- ADR-125 / GCP proxy runbook 補註 direct endpoint 只是 110 bridge timeout 衝突的 stopgap;長期仍走 WireGuard mesh + AwoooP Inference Gateway。
|
||||
|
||||
**驗證**:
|
||||
- API pod 直連 `34.143.170.20:11434/api/tags` 與 `34.21.145.224:11434/api/tags` 均 200。
|
||||
- `bge-m3:latest` embedding 已在 GCP-A/GCP-B 回傳 1024 維,RAG 不再打舊 `nomic-embed-text`。
|
||||
|
||||
## 2026-05-06 | CD host-key prompt unblock for AwoooP Ollama rollout
|
||||
|
||||
**背景**:`09256be6` 已推到 Gitea main,但 CD `build-and-deploy` 卡在 SSH 到 `192.168.0.121` 的 host-key authenticity prompt,runner 無互動輸入,導致新 image tag 尚未注入 `kustomization.yaml`。
|
||||
|
||||
@@ -58,6 +58,14 @@ Ollama endpoints after cutover:
|
||||
The current `192.168.0.110:11435/11436` nginx proxy remains an emergency bridge
|
||||
only until the mesh cutover passes shadow and canary gates.
|
||||
|
||||
2026-05-06 operational amendment: the active production env temporarily uses the
|
||||
GCP public Ollama IPs directly (`34.143.170.20:11434`,
|
||||
`34.21.145.224:11434`) because the 110 bridge has an older duplicate
|
||||
`/etc/nginx/conf.d/ollama-gcp-proxy.conf` with a 120s read timeout that returns
|
||||
504 before `qwen3:14b` can finish. This is a stopgap, not the target transport.
|
||||
The next durable transport remains WireGuard mesh plus the AwoooP Inference
|
||||
Gateway.
|
||||
|
||||
### D2 - Public Ollama exposure is forbidden after cutover
|
||||
|
||||
After mesh cutover:
|
||||
@@ -110,10 +118,13 @@ gateway explicitly opens a maintenance window.
|
||||
|
||||
## Migration Plan
|
||||
|
||||
### Phase 0 - Current bridge
|
||||
### Phase 0 - Current bridge / direct stopgap
|
||||
|
||||
- Keep `192.168.0.110:11435` and `192.168.0.110:11436` active.
|
||||
- Alert path uses `ALERT_OLLAMA_MODEL=gemma3:4b`.
|
||||
- Active production may use direct GCP IPs until the 110 bridge timeout conflict
|
||||
is removed.
|
||||
- Alert path uses `ALERT_OLLAMA_MODEL=qwen3:14b` when the user prioritizes
|
||||
problem resolution quality over fast card delivery.
|
||||
- Gemini remains paid emergency fallback only.
|
||||
|
||||
### Phase 1 - Mesh build in parallel
|
||||
@@ -184,4 +195,3 @@ Paid provider fallback must remain budget-gated.
|
||||
- AwoooP can manage Ollama as a platform resource shared by all tenants.
|
||||
- CPU-only GCP performance remains a capacity constraint; routing must keep
|
||||
heavy jobs off the alert lane or use GPU-capable GCP nodes.
|
||||
|
||||
|
||||
@@ -10,8 +10,12 @@
|
||||
|
||||
## 背景
|
||||
|
||||
GCP Ollama (34.143.170.20 / 34.21.145.224) 已部署完成,但 K3s 叢集內無法直接連線 GCP 外網 IP。
|
||||
透過 192.168.0.110 (DevOps 金庫) 架設 nginx 反向代理,讓 K3s Pod 走內網連線 GCP Ollama。
|
||||
GCP Ollama (34.143.170.20 / 34.21.145.224) 已部署完成。
|
||||
最初透過 192.168.0.110 (DevOps 金庫) 架設 nginx 反向代理,讓 K3s Pod 走內網連線 GCP Ollama。
|
||||
|
||||
2026-05-06 現況:K3s Pod 已可直連 GCP-A/GCP-B `11434/tcp`,且 production
|
||||
暫時改用 direct endpoint,避開 110 上舊 `conf.d/ollama-gcp-proxy.conf`
|
||||
的 120s `proxy_read_timeout`。正式長期方案仍是 ADR-125 WireGuard mesh。
|
||||
|
||||
---
|
||||
|
||||
@@ -84,7 +88,7 @@ curl http://192.168.0.110:11435/api/tags
|
||||
kubectl edit configmap -n awoooi-prod awoooi-config
|
||||
```
|
||||
|
||||
修改以下欄位:
|
||||
若使用 nginx bridge,修改以下欄位:
|
||||
|
||||
```yaml
|
||||
# 修改前
|
||||
@@ -104,6 +108,16 @@ OLLAMA_FALLBACK_URL: "http://192.168.0.111:11434" # Local GPU 最後防線
|
||||
kubectl rollout restart deployment/awoooi-api -n awoooi-prod
|
||||
```
|
||||
|
||||
若需要繞過 110 bridge timeout,使用 direct GCP endpoint:
|
||||
|
||||
```yaml
|
||||
OLLAMA_URL: "http://34.143.170.20:11434"
|
||||
OLLAMA_SECONDARY_URL: "http://34.21.145.224:11434"
|
||||
OLLAMA_FALLBACK_URL: "http://192.168.0.111:11434"
|
||||
OLLAMA_DIAGNOSE_TIMEOUT_SECONDS: "300"
|
||||
INCIDENT_LLM_TIMEOUT_SECONDS: "360"
AGENT_DEBATE_GLOBAL_TIMEOUT_SEC: "420"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 確認模型已載入
|
||||
|
||||
@@ -165,9 +165,10 @@ spec:
|
||||
# Gitea — CI/CD 主倉 probe + monitoring
|
||||
- protocol: TCP
|
||||
port: 3001
|
||||
# 2026-05-04 ogt: GCP Ollama nginx proxy
|
||||
# K8s → GCP-A/B:11434 外網路由不通(NetworkPolicy 外網 egress 只開 443)
|
||||
# 在 110 架設 nginx 反向代理,K8s 走內網 110:11435(GCP-A) / 110:11436(GCP-B)
|
||||
# 2026-05-04 ogt: GCP Ollama nginx proxy bridge.
|
||||
# 2026-05-06 Codex: production active inference temporarily bypasses
|
||||
# this bridge because an older 110 nginx conf.d server block still has
|
||||
# a 120s read timeout. Keep the bridge ports for rollback/emergency use.
|
||||
- protocol: TCP
|
||||
port: 11435
|
||||
- protocol: TCP
|
||||
|
||||
@@ -17,14 +17,13 @@ data:
|
||||
# 服務端點 (非機密)
|
||||
# 2026-04-16 ogt + Claude Sonnet 4.6: 改指向 111(GPU 機,RTX)
|
||||
# 188 = CPU-only Ollama,推理極慢(>60s);111 有 GPU,avg 10s
|
||||
# 2026-05-04 ogt: ADR-110 三層容災正式路由(nginx proxy 架設完成後恢復 GCP 優先)
|
||||
# GCP-A(via 110:11435) → GCP-B(via 110:11436) → Local(via 110:11437) 統一走 nginx proxy
|
||||
# 110 nginx proxy 轉發:11435→GCP-A, 11436→GCP-B, 11437→192.168.0.111:11434
|
||||
# K8s pods 不可直連 GCP:11434(NetworkPolicy 外網 egress 只開 443)
|
||||
# 2026-05-05 Codex: 110:11437 proxy 從 pod 內 connection refused,暫回直連 111;
|
||||
# 保持告警成本路由為 GCP-A → GCP-B → 111 → Gemini backup。
|
||||
OLLAMA_URL: "http://192.168.0.110:11435"
|
||||
OLLAMA_SECONDARY_URL: "http://192.168.0.110:11436"
|
||||
# 2026-05-06 Codex: bypass the transitional 110 nginx bridge for active
|
||||
# inference because /etc/nginx/conf.d/ollama-gcp-proxy.conf still has a 120s
|
||||
# read timeout and returns 504 before qwen3:14b can finish. NetworkPolicy
|
||||
# already allows direct GCP-A/GCP-B:11434. Target architecture remains
|
||||
# ADR-125 WireGuard mesh + AwoooP Inference Gateway.
|
||||
OLLAMA_URL: "http://34.143.170.20:11434"
|
||||
OLLAMA_SECONDARY_URL: "http://34.21.145.224:11434"
|
||||
OLLAMA_FALLBACK_URL: "http://192.168.0.111:11434"
|
||||
OPENCLAW_URL: "http://192.168.0.188:8088"
|
||||
KALI_SCANNER_URL: "http://192.168.0.112:8080"
|
||||
@@ -67,8 +66,9 @@ data:
|
||||
OLLAMA_EMBEDDING_MODEL: "bge-m3:latest"
|
||||
OPENCLAW_DEFAULT_MODEL: "qwen2.5:7b-instruct"
|
||||
OPENCLAW_TIMEOUT: "120"
|
||||
INCIDENT_LLM_TIMEOUT_SECONDS: "240"
|
||||
AGENT_DEBATE_GLOBAL_TIMEOUT_SEC: "260"
|
||||
OLLAMA_DIAGNOSE_TIMEOUT_SECONDS: "300"
|
||||
INCIDENT_LLM_TIMEOUT_SECONDS: "360"
|
||||
AGENT_DEBATE_GLOBAL_TIMEOUT_SEC: "420"
|
||||
AGENT_DIAGNOSTICIAN_TIMEOUT_SEC: "100"
|
||||
AGENT_SOLVER_TIMEOUT_SEC: "80"
|
||||
# ADR-105 P1: OpenClaw Agent Loop shadow canary.
|
||||
|
||||
@@ -68,9 +68,9 @@ spec:
|
||||
- name: TELEGRAM_ENABLE_POLLING
|
||||
value: "true"
|
||||
- name: OLLAMA_URL
|
||||
value: "http://192.168.0.110:11435" # 2026-05-04 ogt: GCP-A primary via 110 nginx proxy(11435 → 34.143.170.20:11434)
|
||||
value: "http://34.143.170.20:11434" # 2026-05-06 Codex: GCP-A direct; avoids 110 nginx 120s bridge timeout
|
||||
- name: OLLAMA_SECONDARY_URL
|
||||
value: "http://192.168.0.110:11436" # 2026-05-04 ogt: GCP-B secondary via 110 nginx proxy(11436 → 34.21.145.224:11434)
|
||||
value: "http://34.21.145.224:11434" # 2026-05-06 Codex: GCP-B direct; mesh gateway remains target architecture
|
||||
- name: OLLAMA_FALLBACK_URL
|
||||
value: "http://192.168.0.111:11434" # 2026-05-04 ogt: 111 兜底(K8s 內網直連,GPU RTX)
|
||||
- name: ALERT_AI_ALLOW_CLOUD_FALLBACK
|
||||
@@ -87,10 +87,12 @@ spec:
|
||||
value: "qwen2.5:7b-instruct"
|
||||
- name: OPENCLAW_TIMEOUT
|
||||
value: "120"
|
||||
- name: OLLAMA_DIAGNOSE_TIMEOUT_SECONDS
|
||||
value: "300"
|
||||
- name: INCIDENT_LLM_TIMEOUT_SECONDS
|
||||
value: "240"
|
||||
value: "360"
|
||||
- name: AGENT_DEBATE_GLOBAL_TIMEOUT_SEC
|
||||
value: "260"
|
||||
value: "420"
|
||||
- name: AGENT_DIAGNOSTICIAN_TIMEOUT_SEC
|
||||
value: "100"
|
||||
- name: AGENT_SOLVER_TIMEOUT_SEC
|
||||
|
||||
Reference in New Issue
Block a user