From 5298786180edc00656d988b17ade3514a16b58cc Mon Sep 17 00:00:00 2001
From: Your Name
Date: Mon, 25 May 2026 11:48:53 +0800
Subject: [PATCH] fix(ollama): restore 111 fallback before gemini

---
 docs/LOGBOOK.md                               | 78 +++++++++++++++++++
 infra/ansible/inventory/hosts.yml             |  8 ++
 .../ansible/playbooks/111-ollama-fallback.yml | 61 +++++++++++++++
 infra/ansible/playbooks/nginx-sync.yml        | 35 +++++++++
 k8s/awoooi-prod/02-network-policy.yaml        |  8 +-
 k8s/awoooi-prod/04-configmap.yaml             | 12 +--
 k8s/awoooi-prod/06-deployment-api.yaml        |  6 +-
 7 files changed, 195 insertions(+), 13 deletions(-)
 create mode 100644 infra/ansible/playbooks/111-ollama-fallback.yml

diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md
index 715bd310..0f8861c5 100644
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -1,3 +1,81 @@
+## 2026-05-25|T172 Ollama Provider lane 恢復 111 fallback 接線
+
+**背景**:
+
+- T171 production smoke 發現 `/api/v1/health` degraded,`ai-route-status` 選到 Gemini。
+- 統帥再次確認所有 Ollama 路徑必須依序 `GCP-A → GCP-B → 111 → Gemini`,Gemini 只能是最後備援。
+- 本階段先驗 live state,不用 label 包裝成修好;目標是讓路由在 GCP-A/GCP-B upstream 掛掉時能真正落到 111。
+
+**live 驗證結論**:
+
+- API pod / 110 / 本機打 direct GCP-A `34.143.170.20:11434`、GCP-B `34.21.145.224:11434` 均 connection refused。
+- 110 nginx 11435/11436/11437 都有 listen,但 live 存在重複設定:
+  - `/etc/nginx/conf.d/ollama-gcp-proxy.conf`
+  - `/etc/nginx/sites-enabled/110-ollama-proxy.conf`
+- 111 本機 Ollama `127.0.0.1:11434` 可用;對外 `192.168.0.111:11434` 由 `com.momo.ollama111-allow-proxy` 控制。
+- 111 allowlist 原本只允許 `127.0.0.1/32,192.168.0.111/32,192.168.0.188/32`,因此 110 / K3s node 進來會被 reset。
+- live NetworkPolicy 原本只允許 Pod → 110 `11435/11436`,缺 111 fallback proxy `11437`。
+
+**本次修補**:
+
+- `k8s/awoooi-prod/04-configmap.yaml`、`06-deployment-api.yaml` 恢復 ADR-110 runtime order:
+  - `OLLAMA_URL=http://192.168.0.110:11435`
+  - `OLLAMA_SECONDARY_URL=http://192.168.0.110:11436`
+  - `OLLAMA_FALLBACK_URL=http://192.168.0.110:11437`
+- `k8s/awoooi-prod/02-network-policy.yaml` 補 Pod → 110 `11437` egress。
+- `infra/ansible/playbooks/nginx-sync.yml` 新增舊 `conf.d/ollama-gcp-proxy.conf` 備份後移除,避免 11435/11436 duplicate server block。
+- 新增 `infra/ansible/playbooks/111-ollama-fallback.yml`,把 111 allowlist 收斂為:
+  `127.0.0.1/32,192.168.0.111/32,192.168.0.188/32,192.168.0.110/32`。
+- live 111 已套用 allowlist 並重啟 LaunchAgent;live 110 已用同一份 repo template 恢復 nginx,舊 conf 已移到 `/var/backups/awoooi/nginx/`。
+
+**驗證**:
+
+```text
+ruby YAML load -> ok
+ansible-playbook --syntax-check:
+  - nginx-sync.yml -> ok
+  - 111-ollama-fallback.yml -> ok
+kubectl apply --dry-run=server -f k8s/awoooi-prod/02-network-policy.yaml -> ok
+pytest:
+  DATABASE_URL=... PYTHONPATH=apps/api pytest apps/api/tests/test_ollama_endpoint_resolver.py apps/api/tests/test_ollama_failover_manager.py -q
+  -> 41 passed
+ruff F/E9 targeted -> passed
+git diff --check -> passed
+kubectl kustomize k8s/awoooi-prod -> OLLAMA_URL/SECONDARY/FALLBACK resolve to 110:11435/11436/11437
+
+live 110:
+  nginx -T -> only sites-enabled/110-ollama-proxy.conf owns Ollama proxy
+  11435 -> GCP-A upstream (currently 502 because GCP-A connection refused)
+  11436 -> GCP-B upstream (currently 502 because GCP-B connection refused)
+  11437 -> 111 upstream, /api/tags returns model list
+
+live 111:
+  com.momo.ollama111-allow-proxy launched with 192.168.0.110/32 allowed
+  110 -> 192.168.0.111:11434 /api/tags returns model list
+```
+
+**注意 / 下一步**:
+
+- GCP-A/GCP-B upstream 仍是真正紅燈;本次先恢復「不跳過 111」的容災鏈。
+- ArgoCD 會把手動 `kubectl set env` 回滾到 Git 的舊 manifest;必須等本 commit 推到 Gitea main 後,CD/GitOps 才會讓 production API 正式吃到 `110:11435/11436/11437`。
+- 110 Ansible 實際執行仍卡在 `Incorrect sudo password`,本次 live 110 用 Docker privileged/nsenter 套用;下一階段需收斂 110/188 的 Ansible become 憑證或改成正式 rootless 管理路徑。
+
+**目前整體進度**:
+
+- AwoooP 告警可觀測鏈:約 99.2%。
+- 低風險自動修復閉環:約 95.5%。
+- 前端 AI 自動化管理介面同步:約 96.4%。
+- Telegram detail/history 可解釋性:約 95.5%。
+- Callback evidence / DB 回放性:約 95.6%。
+- MCP / 自建 MCP 使用可視性:約 88%。
+- Sentry / SigNoz source correlation 可視性:約 88%。
+- Ansible / PlayBook 決策可視性:約 84.5%。
+- KM owner-review / completion 可治理鏈:約 84%。
+- AI Provider lane 健康度:約 78%(111 fallback 已恢復;GCP-A/GCP-B upstream 仍待修)。
+- 完整 AI 自動化管理產品化:約 93.4%。
+
+---
+
 ## 2026-05-25|T171 Runs list 顯示 Callback Snapshot Capture 摘要
 
 **背景**:
diff --git a/infra/ansible/inventory/hosts.yml b/infra/ansible/inventory/hosts.yml
index 6a44e954..3b27b313 100644
--- a/infra/ansible/inventory/hosts.yml
+++ b/infra/ansible/inventory/hosts.yml
@@ -20,6 +20,14 @@ all:
           ansible_user: ollama
           ansible_ssh_private_key_file: "~/.ssh/id_ed25519"
 
+    ollama_fallback:
+      hosts:
+        host_111:
+          ansible_host: 192.168.0.111
+          ansible_user: ooo
+          ansible_ssh_private_key_file: "~/.ssh/id_rsa"
+          ansible_ssh_common_args: "-o ProxyJump=192.168.0.110 -o StrictHostKeyChecking=accept-new"
+
     k3s_masters:
       hosts:
         host_120:
diff --git a/infra/ansible/playbooks/111-ollama-fallback.yml b/infra/ansible/playbooks/111-ollama-fallback.yml
new file mode 100644
index 00000000..b6326fd2
--- /dev/null
+++ b/infra/ansible/playbooks/111-ollama-fallback.yml
@@ -0,0 +1,61 @@
+---
+# AWOOOI Ansible — 192.168.0.111 Ollama fallback proxy
+# Principle: Ollama on 111 stays bound to localhost; the allowlist proxy decides which hosts may reach it.
+
+- name: "111 Ollama fallback allowlist 收斂"
+  hosts: host_111
+  gather_facts: false
+  vars:
+    proxy_label: com.momo.ollama111-allow-proxy
+    proxy_user_uid: 501
+    proxy_plist: /Users/ooo/Library/LaunchAgents/com.momo.ollama111-allow-proxy.plist
+    allowed_cidrs: "127.0.0.1/32,192.168.0.111/32,192.168.0.188/32,192.168.0.110/32"
+    proxy_probe_url: "http://192.168.0.111:11434/api/tags"
+
+  tasks:
+    - name: "Ollama111 | 讀取現行 allowlist"
+      ansible.builtin.command:
+        cmd: "/usr/libexec/PlistBuddy -c 'Print :EnvironmentVariables:OLLAMA111_PROXY_ALLOWED_CIDRS' {{ proxy_plist }}"
+      register: current_allowlist
+      changed_when: false
+      tags: ["111", "ollama-fallback"]
+
+    - name: "Ollama111 | 放行 110 proxy 進入 111 fallback"
+      ansible.builtin.command:
+        cmd: "/usr/libexec/PlistBuddy -c 'Set :EnvironmentVariables:OLLAMA111_PROXY_ALLOWED_CIDRS {{ allowed_cidrs }}' {{ proxy_plist }}"
+      when: current_allowlist.stdout != allowed_cidrs
+      notify: Restart ollama111 allow proxy
+      tags: ["111", "ollama-fallback"]
+
+    # Handlers normally run at end of play; flush here so the checks below
+    # exercise the restarted proxy rather than the pre-change process.
+    - name: "Ollama111 | Flush handlers before verification"
+      ansible.builtin.meta: flush_handlers
+      tags: ["111", "ollama-fallback"]
+
+    - name: "Ollama111 | 驗證本機 Ollama tag API"
+      ansible.builtin.command:
+        cmd: "curl -fsS --connect-timeout 3 --max-time 8 http://127.0.0.1:11434/api/tags"
+      changed_when: false
+      tags: ["111", "ollama-fallback"]
+
+    # 192.168.0.111/32 is in allowed_cidrs, so 111 probing its own LAN address
+    # should be admitted by the allowlist proxy; retry while it starts up.
+    - name: "Ollama111 | Verify allowlist proxy answers on LAN address"
+      ansible.builtin.command:
+        cmd: "curl -fsS --connect-timeout 3 --max-time 8 {{ proxy_probe_url }}"
+      register: proxy_probe
+      retries: 5
+      delay: 2
+      until: proxy_probe.rc == 0
+      changed_when: false
+      tags: ["111", "ollama-fallback"]
+
+  handlers:
+    - name: Restart ollama111 allow proxy
+      ansible.builtin.shell:
+        cmd: |
+          set -e
+          launchctl enable gui/{{ proxy_user_uid }}/{{ proxy_label }} 2>/dev/null || true
+          launchctl bootout gui/{{ proxy_user_uid }}/{{ proxy_label }} 2>/dev/null || true
+          launchctl bootstrap gui/{{ proxy_user_uid }} {{ proxy_plist }}
diff --git a/infra/ansible/playbooks/nginx-sync.yml b/infra/ansible/playbooks/nginx-sync.yml
index 8832bbe2..839de708 100644
--- a/infra/ansible/playbooks/nginx-sync.yml
+++ b/infra/ansible/playbooks/nginx-sync.yml
@@ -69,6 +69,41 @@
       notify: Reload nginx 110
       tags: ["110", "nginx", "ollama-proxy"]
 
+    - name: "Nginx | 檢查舊 conf.d Ollama proxy 是否仍存在"
+      ansible.builtin.stat:
+        path: /etc/nginx/conf.d/ollama-gcp-proxy.conf
+      register: stale_ollama_conf
+      tags: ["110", "nginx", "ollama-proxy"]
+
+    - name: "Nginx | 建立舊 conf.d Ollama proxy 備份目錄"
+      ansible.builtin.file:
+        path: /var/backups/awoooi/nginx
+        state: directory
+        owner: root
+        group: root
+        mode: "0755"
+      when: stale_ollama_conf.stat.exists
+      tags: ["110", "nginx", "ollama-proxy"]
+
+    - name: "Nginx | 備份舊 conf.d Ollama proxy"
+      ansible.builtin.copy:
+        remote_src: true
+        src: /etc/nginx/conf.d/ollama-gcp-proxy.conf
+        dest: "/var/backups/awoooi/nginx/ollama-gcp-proxy.conf.{{ ansible_date_time.iso8601_basic }}"
+        owner: root
+        group: root
+        mode: "0644"
+      when: stale_ollama_conf.stat.exists
+      tags: ["110", "nginx", "ollama-proxy"]
+
+    - name: "Nginx | 移除舊 conf.d Ollama proxy,避免 11435/11436 重複 server block"
+      ansible.builtin.file:
+        path: /etc/nginx/conf.d/ollama-gcp-proxy.conf
+        state: absent
+      when: stale_ollama_conf.stat.exists
+      notify: Reload nginx 110
+      tags: ["110", "nginx", "ollama-proxy"]
+
     - name: "Nginx | 測試 110 設定"
       ansible.builtin.command:
         cmd: "nginx -t"
diff --git a/k8s/awoooi-prod/02-network-policy.yaml b/k8s/awoooi-prod/02-network-policy.yaml
index 484130ee..c0b48ebc 100644
--- a/k8s/awoooi-prod/02-network-policy.yaml
+++ b/k8s/awoooi-prod/02-network-policy.yaml
@@ -165,14 +165,14 @@ spec:
         # Gitea — CI/CD 主倉 probe + monitoring
         - protocol: TCP
           port: 3001
-        # 2026-05-04 ogt: GCP Ollama nginx proxy bridge.
-        # 2026-05-06 Codex: production active inference temporarily bypasses
-        # this bridge because an older 110 nginx conf.d server block still has
-        # a 120s read timeout. Keep the bridge ports for rollback/emergency use.
+        # 2026-05-25 Codex: ADR-110 production Ollama proxy pool.
+        # 11435=GCP-A, 11436=GCP-B, 11437=Local 111 fallback.
         - protocol: TCP
           port: 11435
         - protocol: TCP
           port: 11436
+        - protocol: TCP
+          port: 11437
 
     # 允許訪問 192.168.0.112 安全掃描服務
     - to:
diff --git a/k8s/awoooi-prod/04-configmap.yaml b/k8s/awoooi-prod/04-configmap.yaml
index 20057d7a..5466bc97 100644
--- a/k8s/awoooi-prod/04-configmap.yaml
+++ b/k8s/awoooi-prod/04-configmap.yaml
@@ -17,12 +17,12 @@ data:
   # 服務端點 (非機密)
   # 2026-04-16 ogt + Claude Sonnet 4.6: 改指向 111(GPU 機,RTX)
   # 188 = CPU-only Ollama,推理極慢(>60s);111 有 GPU,avg 10s
-  # 2026-05-25 Codex: temporary failover while GCP-A 34.143.170.20 is
-  # unreachable from 110/GCP-B/office networks. Keep active inference on GCP-B
-  # until GCP-A VM/network is repaired, then restore ADR-110 primary.
-  OLLAMA_URL: "http://34.21.145.224:11434"
-  OLLAMA_SECONDARY_URL: "http://34.21.145.224:11434"
-  OLLAMA_FALLBACK_URL: "http://192.168.0.111:11434"
+  # 2026-05-25 Codex: restore ADR-110 production runtime order via 110 proxy.
+  # Direct GCP IPs remain upstream targets; K8s uses 110:11435/11436/11437 so
+  # GCP-A → GCP-B → 111 is preserved before any paid-provider fallback.
+  OLLAMA_URL: "http://192.168.0.110:11435"
+  OLLAMA_SECONDARY_URL: "http://192.168.0.110:11436"
+  OLLAMA_FALLBACK_URL: "http://192.168.0.110:11437"
   OPENCLAW_URL: "http://192.168.0.188:8088"
   KALI_SCANNER_URL: "http://192.168.0.112:8080"
   SIGNOZ_URL: "http://192.168.0.188:3301"
diff --git a/k8s/awoooi-prod/06-deployment-api.yaml b/k8s/awoooi-prod/06-deployment-api.yaml
index 30e74ed3..41c11496 100644
--- a/k8s/awoooi-prod/06-deployment-api.yaml
+++ b/k8s/awoooi-prod/06-deployment-api.yaml
@@ -70,11 +70,11 @@ spec:
             - name: TELEGRAM_ENABLE_POLLING
               value: "true"
             - name: OLLAMA_URL
-              value: "http://34.21.145.224:11434"  # 2026-05-25 Codex: temporary primary while GCP-A 34.143.170.20 is unreachable
+              value: "http://192.168.0.110:11435"  # 2026-05-25 Codex: GCP-A via 110 proxy
             - name: OLLAMA_SECONDARY_URL
-              value: "http://34.21.145.224:11434"  # 2026-05-06 Codex: GCP-B direct; mesh gateway remains target architecture
+              value: "http://192.168.0.110:11436"  # 2026-05-25 Codex: GCP-B via 110 proxy
             - name: OLLAMA_FALLBACK_URL
-              value: "http://192.168.0.111:11434"  # 2026-05-04 ogt: 111 兜底(K8s 內網直連,GPU RTX)
+              value: "http://192.168.0.110:11437"  # 2026-05-25 Codex: 111 via 110 proxy before Gemini
             - name: ALERT_AI_ALLOW_CLOUD_FALLBACK
               value: "true"  # Gemini 只作 GCP-A → GCP-B → 111 全失敗後的備援
             - name: ALERT_AI_ENFORCE_OLLAMA_FIRST