fix(alerts): bypass proxy timeout for GCP Ollama

2026-05-06 08:55:02 +08:00
parent df5e6c6626
commit 9ef9633aff
11 changed files with 74 additions and 34 deletions
--- a/apps/api/src/core/config.py
+++ b/apps/api/src/core/config.py
@@ -215,8 +215,8 @@ class Settings(BaseSettings):
        description="Phase 25 P0: DIAGNOSE NIM timeout (秒)，實測 2.2-27.3s avg 10.6s，60s 含 buffer",
    )
    OLLAMA_DIAGNOSE_TIMEOUT_SECONDS: int = Field(
-        default=200,
-        description="Phase 25 P0: Ollama timeout (秒)，實測 CPU-only 238s，保留欄位但 DIAGNOSE 不再走 Ollama",
+        default=300,
+        description="Ollama diagnose timeout (秒)。GCP qwen3:14b CPU-only can exceed the old 120s proxy limit.",
    )

    # ==========================================================================
@@ -534,7 +534,7 @@ class Settings(BaseSettings):
        ),
    )
    INCIDENT_LLM_TIMEOUT_SECONDS: int = Field(
-        default=240,
+        default=360,
        description=(
            "Outer timeout for incident OpenClaw proposal generation. This must "
            "be long enough for the GCP-A/GCP-B/111 Ollama lane to complete "
--- a/apps/api/src/services/agent_orchestrator.py
+++ b/apps/api/src/services/agent_orchestrator.py
@@ -70,11 +70,11 @@ def _agent_debate_global_timeout_seconds() -> float:
    90s guard. Keep a hard ceiling, but make it an explicit deployment knob.
    """

-    raw = os.environ.get("AGENT_DEBATE_GLOBAL_TIMEOUT_SEC", "260.0")
+    raw = os.environ.get("AGENT_DEBATE_GLOBAL_TIMEOUT_SEC", "420.0")
    try:
        timeout = float(raw)
    except (TypeError, ValueError):
-        timeout = 260.0
+        timeout = 420.0
    return max(timeout, 90.0)


--- a/apps/api/src/services/ai_providers/ollama.py
+++ b/apps/api/src/services/ai_providers/ollama.py
@@ -143,7 +143,7 @@ class OllamaProvider:
            options = registry.get_provider_options("ollama")

            # P0 2026-04-04 Claude Code: per-task timeout（Option C 分情境）
-            # FORCE_LOCAL/diagnose → OLLAMA_DIAGNOSE_TIMEOUT_SECONDS (200s，實測 ~173s)
+            # FORCE_LOCAL/diagnose → OLLAMA_DIAGNOSE_TIMEOUT_SECONDS
            # 其他 → OPENCLAW_TIMEOUT（既有設定）
            task_type = (context or {}).get("task_type", "")
            if task_type in ("diagnose", "force_local"):
--- a/apps/api/src/services/openclaw.py
+++ b/apps/api/src/services/openclaw.py
@@ -1137,7 +1137,7 @@ class OpenClawService:

                # 2026-04-29 ogt + Claude Code: 注入 task_type 讓 Ollama 用正確 timeout
                # 根因: ai_providers/ollama.py:77 讀 context["task_type"] 決定 timeout
-                #   - "diagnose"/"force_local" → OLLAMA_DIAGNOSE_TIMEOUT_SECONDS=200s
+                #   - "diagnose"/"force_local" → OLLAMA_DIAGNOSE_TIMEOUT_SECONDS
                #   - 其他/未注入 → OPENCLAW_TIMEOUT=30s（不夠 qwen2.5:7b 推理）
                # webhooks alert_context 從未注入 task_type → Ollama fallback 永遠 30s timeout
                # 對齊 decision.intent 後 Ollama fallback 真正能跑完
--- a/apps/api/tests/test_agent_step_timeouts.py
+++ b/apps/api/tests/test_agent_step_timeouts.py
@@ -71,8 +71,8 @@ class TestTimeoutDefaults:
            f"Critic default timeout 期望 15.0，實際 {mod.AGENT_CRITIC_TIMEOUT_SEC}"
        )

-    def test_agent_debate_global_timeout_default_is_260(self, monkeypatch):
-        """Agent debate global timeout defaults to the GCP Ollama-first budget."""
+    def test_agent_debate_global_timeout_default_is_420(self, monkeypatch):
+        """Agent debate global timeout defaults to the direct GCP qwen3 budget."""
        monkeypatch.delenv("AGENT_DEBATE_GLOBAL_TIMEOUT_SEC", raising=False)

        if "src.services.agent_orchestrator" in sys.modules:
@@ -80,7 +80,7 @@ class TestTimeoutDefaults:
        import src.services.agent_orchestrator as mod
        importlib.reload(mod)

-        assert mod.GLOBAL_TIMEOUT_SEC == 260.0
+        assert mod.GLOBAL_TIMEOUT_SEC == 420.0

    def test_deprecated_alias_matches_new_constant_diagnostician(self, monkeypatch):
        """PHASE2_STEP_TIMEOUT_SEC alias 應等於 AGENT_DIAGNOSTICIAN_TIMEOUT_SEC（相容性保證）"""
@@ -183,7 +183,7 @@ class TestEnvOverride:
        assert mod.PHASE2_STEP_TIMEOUT_SEC == mod.AGENT_CRITIC_TIMEOUT_SEC

    def test_agent_debate_global_timeout_env_override(self, monkeypatch):
-        """AGENT_DEBATE_GLOBAL_TIMEOUT_SEC=300 覆蓋 default 260.0"""
+        """AGENT_DEBATE_GLOBAL_TIMEOUT_SEC=300 覆蓋 default 420.0"""
        monkeypatch.setenv("AGENT_DEBATE_GLOBAL_TIMEOUT_SEC", "300")

        if "src.services.agent_orchestrator" in sys.modules:
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -1,3 +1,16 @@
+## 2026-05-06 | GCP Ollama direct endpoint hotfix for alert diagnosis
+
+**背景**：生產 log 顯示 alert path 的 provider order 已是 `ollama_gcp_a → ollama_gcp_b → ollama_local → gemini`，但 GCP-A/GCP-B 經 110 nginx bridge 各跑滿 120s 後回 `504 Gateway Time-out`，因此最後仍 fallback 到 Gemini 並產生成本。110 同時存在 `conf.d/ollama-gcp-proxy.conf`（120s）與 `sites-enabled/110-ollama-proxy.conf`（300s），較早載入的 `conf.d` 實際截斷了 qwen3:14b。
+
+**本次修補**：
+- production active endpoint 暫改 direct GCP：`OLLAMA_URL=http://34.143.170.20:11434`、`OLLAMA_SECONDARY_URL=http://34.21.145.224:11434`，111 維持最後 Ollama fallback。
+- `OLLAMA_DIAGNOSE_TIMEOUT_SECONDS=300`、`INCIDENT_LLM_TIMEOUT_SECONDS=360`、`AGENT_DEBATE_GLOBAL_TIMEOUT_SEC=420`，讓 qwen3:14b 有足夠時間完成。
+- ADR-125 / GCP proxy runbook 補註 direct endpoint 只是 110 bridge timeout 衝突的 stopgap；長期仍走 WireGuard mesh + AwoooP Inference Gateway。
+
+**驗證**：
+- API pod 直連 `34.143.170.20:11434/api/tags` 與 `34.21.145.224:11434/api/tags` 均 200。
+- `bge-m3:latest` embedding 已在 GCP-A/GCP-B 回傳 1024 維，RAG 不再打舊 `nomic-embed-text`。
+
 ## 2026-05-06 | CD host-key prompt unblock for AwoooP Ollama rollout

 **背景**：`09256be6` 已推到 Gitea main，但 CD `build-and-deploy` 卡在 SSH 到 `192.168.0.121` 的 host-key authenticity prompt，runner 無互動輸入，導致新 image tag 尚未注入 `kustomization.yaml`。
--- a/docs/adr/ADR-125-gcp-ollama-private-mesh-inference-gateway.md
+++ b/docs/adr/ADR-125-gcp-ollama-private-mesh-inference-gateway.md
@@ -58,6 +58,14 @@ Ollama endpoints after cutover:
 The current `192.168.0.110:11435/11436` nginx proxy remains an emergency bridge
 only until the mesh cutover passes shadow and canary gates.

+2026-05-06 operational amendment: the active production env temporarily uses the
+GCP public Ollama IPs directly (`34.143.170.20:11434`,
+`34.21.145.224:11434`) because the 110 bridge has an older duplicate
+`/etc/nginx/conf.d/ollama-gcp-proxy.conf` with a 120s read timeout that returns
+504 before `qwen3:14b` can finish. This is a stopgap, not the target transport:
+it sends inference over plain HTTP to public IPs, which D2 forbids after cutover.
+The durable target remains WireGuard mesh plus the AwoooP Inference Gateway.
+
 ### D2 - Public Ollama exposure is forbidden after cutover

 After mesh cutover:
@@ -110,10 +118,13 @@ gateway explicitly opens a maintenance window.

 ## Migration Plan

-### Phase 0 - Current bridge
+### Phase 0 - Current bridge / direct stopgap

 - Keep `192.168.0.110:11435` and `192.168.0.110:11436` active.
- Alert path uses `ALERT_OLLAMA_MODEL=gemma3:4b`.
+- Active production may use direct GCP IPs until the 110 bridge timeout conflict
+  is removed.
+- Alert path uses `ALERT_OLLAMA_MODEL=qwen3:14b` when the user prioritizes
+  problem resolution quality over fast card delivery.
 - Gemini remains paid emergency fallback only.

 ### Phase 1 - Mesh build in parallel
@@ -184,4 +195,3 @@ Paid provider fallback must remain budget-gated.
 - AwoooP can manage Ollama as a platform resource shared by all tenants.
 - CPU-only GCP performance remains a capacity constraint; routing must keep
  heavy jobs off the alert lane or use GPU-capable GCP nodes.
-
--- a/docs/runbooks/DEPLOY-GCP-OLLAMA-PROXY.md
+++ b/docs/runbooks/DEPLOY-GCP-OLLAMA-PROXY.md
@@ -10,8 +10,12 @@

 ## 背景

-GCP Ollama (34.143.170.20 / 34.21.145.224) 已部署完成，但 K3s 叢集內無法直接連線 GCP 外網 IP。  
-透過 192.168.0.110 (DevOps 金庫) 架設 nginx 反向代理，讓 K3s Pod 走內網連線 GCP Ollama。
+GCP Ollama (34.143.170.20 / 34.21.145.224) 已部署完成。
+最初透過 192.168.0.110 (DevOps 金庫) 架設 nginx 反向代理，讓 K3s Pod 走內網連線 GCP Ollama。
+
+2026-05-06 現況：K3s Pod 已可直連 GCP-A/GCP-B `11434/tcp`，且 production
+暫時改用 direct endpoint，避開 110 上舊 `conf.d/ollama-gcp-proxy.conf`
+的 120s `proxy_read_timeout`。正式長期方案仍是 ADR-125 WireGuard mesh。

 ---

@@ -84,7 +88,7 @@ curl http://192.168.0.110:11435/api/tags
 kubectl edit configmap -n awoooi-prod awoooi-config
 ```

-修改以下欄位：
+若使用 nginx bridge，修改以下欄位：

 ```yaml
 # 修改前
@@ -104,6 +108,16 @@ OLLAMA_FALLBACK_URL: "http://192.168.0.111:11434"   # Local GPU 最後防線
 kubectl rollout restart deployment/awoooi-api -n awoooi-prod
 ```

+若需要繞過 110 bridge timeout，使用 direct GCP endpoint（同時將 `AGENT_DEBATE_GLOBAL_TIMEOUT_SEC` 調為 `420`，使外層 timeout 仍大於下列兩個值）：
+
+```yaml
+OLLAMA_URL: "http://34.143.170.20:11434"
+OLLAMA_SECONDARY_URL: "http://34.21.145.224:11434"
+OLLAMA_FALLBACK_URL: "http://192.168.0.111:11434"
+OLLAMA_DIAGNOSE_TIMEOUT_SECONDS: "300"
+INCIDENT_LLM_TIMEOUT_SECONDS: "360"
+```
+
 ---

 ## 確認模型已載入
--- a/k8s/awoooi-prod/02-network-policy.yaml
+++ b/k8s/awoooi-prod/02-network-policy.yaml
@@ -165,9 +165,10 @@ spec:
        # Gitea — CI/CD 主倉 probe + monitoring
        - protocol: TCP
          port: 3001
-        # 2026-05-04 ogt: GCP Ollama nginx proxy
-        # K8s → GCP-A/B:11434 外網路由不通（NetworkPolicy 外網 egress 只開 443）
-        # 在 110 架設 nginx 反向代理，K8s 走內網 110:11435(GCP-A) / 110:11436(GCP-B)
+        # 2026-05-04 ogt: GCP Ollama nginx proxy bridge.
+        # 2026-05-06 Codex: production active inference temporarily bypasses
+        # this bridge because an older 110 nginx conf.d server block still has
+        # a 120s read timeout. Keep the bridge ports for rollback/emergency use.
        - protocol: TCP
          port: 11435
        - protocol: TCP
--- a/k8s/awoooi-prod/04-configmap.yaml
+++ b/k8s/awoooi-prod/04-configmap.yaml
@@ -17,14 +17,13 @@ data:
  # 服務端點 (非機密)
  # 2026-04-16 ogt + Claude Sonnet 4.6: 改指向 111（GPU 機，RTX）
  #   188 = CPU-only Ollama，推理極慢（>60s）；111 有 GPU，avg 10s
-  # 2026-05-04 ogt: ADR-110 三層容災正式路由（nginx proxy 架設完成後恢復 GCP 優先）
-  #   GCP-A(via 110:11435) → GCP-B(via 110:11436) → Local(via 110:11437) 統一走 nginx proxy
-  #   110 nginx proxy 轉發：11435→GCP-A, 11436→GCP-B, 11437→192.168.0.111:11434
-  #   K8s pods 不可直連 GCP:11434（NetworkPolicy 外網 egress 只開 443）
-  # 2026-05-05 Codex: 110:11437 proxy 從 pod 內 connection refused，暫回直連 111；
-  # 保持告警成本路由為 GCP-A → GCP-B → 111 → Gemini backup。
-  OLLAMA_URL: "http://192.168.0.110:11435"
-  OLLAMA_SECONDARY_URL: "http://192.168.0.110:11436"
+  # 2026-05-06 Codex: bypass the transitional 110 nginx bridge for active
+  # inference because /etc/nginx/conf.d/ollama-gcp-proxy.conf still has a 120s
+  # read timeout and returns 504 before qwen3:14b can finish. NetworkPolicy
+  # already allows direct GCP-A/GCP-B:11434. Target architecture remains
+  # ADR-125 WireGuard mesh + AwoooP Inference Gateway.
+  OLLAMA_URL: "http://34.143.170.20:11434"
+  OLLAMA_SECONDARY_URL: "http://34.21.145.224:11434"
  OLLAMA_FALLBACK_URL: "http://192.168.0.111:11434"
  OPENCLAW_URL: "http://192.168.0.188:8088"
  KALI_SCANNER_URL: "http://192.168.0.112:8080"
@@ -67,8 +66,9 @@ data:
  OLLAMA_EMBEDDING_MODEL: "bge-m3:latest"
  OPENCLAW_DEFAULT_MODEL: "qwen2.5:7b-instruct"
  OPENCLAW_TIMEOUT: "120"
-  INCIDENT_LLM_TIMEOUT_SECONDS: "240"
-  AGENT_DEBATE_GLOBAL_TIMEOUT_SEC: "260"
+  OLLAMA_DIAGNOSE_TIMEOUT_SECONDS: "300"
+  INCIDENT_LLM_TIMEOUT_SECONDS: "360"
+  AGENT_DEBATE_GLOBAL_TIMEOUT_SEC: "420"
  AGENT_DIAGNOSTICIAN_TIMEOUT_SEC: "100"
  AGENT_SOLVER_TIMEOUT_SEC: "80"
  # ADR-105 P1: OpenClaw Agent Loop shadow canary.
--- a/k8s/awoooi-prod/06-deployment-api.yaml
+++ b/k8s/awoooi-prod/06-deployment-api.yaml
@@ -68,9 +68,9 @@ spec:
            - name: TELEGRAM_ENABLE_POLLING
              value: "true"
            - name: OLLAMA_URL
-              value: "http://192.168.0.110:11435"  # 2026-05-04 ogt: GCP-A primary via 110 nginx proxy（11435 → 34.143.170.20:11434）
+              value: "http://34.143.170.20:11434"  # 2026-05-06 Codex: GCP-A direct; avoids 110 nginx 120s bridge timeout
            - name: OLLAMA_SECONDARY_URL
-              value: "http://192.168.0.110:11436"  # 2026-05-04 ogt: GCP-B secondary via 110 nginx proxy（11436 → 34.21.145.224:11434）
+              value: "http://34.21.145.224:11434"  # 2026-05-06 Codex: GCP-B direct; mesh gateway remains target architecture
            - name: OLLAMA_FALLBACK_URL
              value: "http://192.168.0.111:11434"  # 2026-05-04 ogt: 111 兜底（K8s 內網直連，GPU RTX）
            - name: ALERT_AI_ALLOW_CLOUD_FALLBACK
@@ -87,10 +87,12 @@ spec:
              value: "qwen2.5:7b-instruct"
            - name: OPENCLAW_TIMEOUT
              value: "120"
+            - name: OLLAMA_DIAGNOSE_TIMEOUT_SECONDS
+              value: "300"
            - name: INCIDENT_LLM_TIMEOUT_SECONDS
-              value: "240"
+              value: "360"
            - name: AGENT_DEBATE_GLOBAL_TIMEOUT_SEC
-              value: "260"
+              value: "420"
            - name: AGENT_DIAGNOSTICIAN_TIMEOUT_SEC
              value: "100"
            - name: AGENT_SOLVER_TIMEOUT_SEC