From ec013f662def2bb4cd535ad12807a6a53fa99357 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 4 May 2026 23:12:35 +0800 Subject: [PATCH] =?UTF-8?q?fix(watchdog):=20=E4=BF=AE=E5=A4=8D=20Trust=20D?= =?UTF-8?q?rift=20=E9=87=8D=E5=A4=8D=E5=91=8A=E8=AD=A6=20+=20=E5=BB=BA?= =?UTF-8?q?=E7=AB=8B=20GCP=20Ollama=20nginx=20proxy?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ai_slo_watchdog_job: 改用 trust_drift_detector 纯统计 lib 避免与 governance_agent 每小时自检查重复触发 Telegram - infra/ansible: 建立 110 nginx proxy 转发到 GCP-A/B 端口 11435 -> 34.143.170.20:11434 (GCP-A) 端口 11436 -> 34.21.145.224:11434 (GCP-B) - docs/runbooks: DEPLOY-GCP-OLLAMA-PROXY.md 完整部署指南 - ops/nginx: 手动部署脚本供 110 直接执行 ADR-110 三层容灾启用前提:先部署 proxy,再改 ConfigMap --- apps/api/src/jobs/ai_slo_watchdog_job.py | 22 +- docs/runbooks/DEPLOY-GCP-OLLAMA-PROXY.md | 201 ++++++++++++++++++ infra/ansible/playbooks/nginx-sync.yml | 45 +++- .../nginx/templates/110-ollama-proxy.conf.j2 | 71 +++++++ ops/nginx/deploy-ollama-proxy-110.sh | 121 +++++++++++ 5 files changed, 446 insertions(+), 14 deletions(-) create mode 100644 docs/runbooks/DEPLOY-GCP-OLLAMA-PROXY.md create mode 100644 infra/ansible/roles/nginx/templates/110-ollama-proxy.conf.j2 create mode 100755 ops/nginx/deploy-ollama-proxy-110.sh diff --git a/apps/api/src/jobs/ai_slo_watchdog_job.py b/apps/api/src/jobs/ai_slo_watchdog_job.py index 7c7fbd08..6f2551d0 100644 --- a/apps/api/src/jobs/ai_slo_watchdog_job.py +++ b/apps/api/src/jobs/ai_slo_watchdog_job.py @@ -159,21 +159,17 @@ async def _check_once() -> None: # W-6: Trust Drift 偵測(Playbook 信任度漂移) # 2026-05-02 ogt + Claude Sonnet 4.6(亞太): 整併雙寫路徑 - # 原行為:呼叫 trust_drift_detector.run() 直接寫 event_type=trust_drift 到 PG - # governance_agent.check_trust_drift() 每 1h 也寫同一 event_type → 雙寫 - # 整併:改呼叫 governance_agent.check_trust_drift() 為唯一 source-of-truth - # W-6 watchdog 仍每 15 分鐘執行(感知器),violations 計數用於 meta-alert 觸發 - # PG 寫入由 governance_agent._alert() 統一處理,避免雙寫 + # 2026-05-04 ogt + Claude: 
修復重複告警 — 改為直接用 trust_drift_detector 純統計 + # 背景:原本呼叫 governance_agent.check_trust_drift() 會觸發 Telegram 告警 + # 但 governance_agent.run_self_check() 每 1h 也會呼叫同一方法 → 雙重 Telegram + # 修正:watchdog 只取統計數字,不觸發 Telegram;告警由 governance_agent 獨家負責 try: - from src.services.governance_agent import get_governance_agent - trust_result = await get_governance_agent().check_trust_drift() - if trust_result.get("drifted", 0) > 0: - drifted = trust_result["drifted"] - auto_deprecated = trust_result.get("auto_deprecated", 0) - kept = trust_result.get("kept", 0) + from src.services.trust_drift_detector import get_trust_drift_detector + dist = await get_trust_drift_detector().detect() + if dist.drift_detected: violations.append( - f"Trust Drift 偵測到 {drifted} 個 Playbook 信任度低落" - f"(auto-deprecated: {auto_deprecated},待人工審核: {kept})" + f"Trust Drift 偵測到 {dist.low_count} 個 Playbook 信任度低落" + f"(low_ratio: {dist.low_ratio:.1%},mean_trust: {dist.mean_trust:.2f})" ) except Exception as e: logger.warning("watchdog_w6_trust_drift_check_failed", error=str(e)) diff --git a/docs/runbooks/DEPLOY-GCP-OLLAMA-PROXY.md b/docs/runbooks/DEPLOY-GCP-OLLAMA-PROXY.md new file mode 100644 index 00000000..c07a6687 --- /dev/null +++ b/docs/runbooks/DEPLOY-GCP-OLLAMA-PROXY.md @@ -0,0 +1,201 @@ +# GCP Ollama Nginx Proxy 部署指南 + +> ADR-110 三層容災 — 啟用 GCP Ollama 的關鍵步驟 + +--- + +## 背景 + +GCP Ollama (34.143.170.20 / 34.21.145.224) 已部署完成,但 K3s 叢集內無法直接連線 GCP 外網 IP。 +透過 192.168.0.110 (DevOps 金庫) 架設 nginx 反向代理,讓 K3s Pod 走內網連線 GCP Ollama。 + +--- + +## 部署檔案 + +| 檔案 | 用途 | +|-----|------| +| `infra/ansible/roles/nginx/templates/110-ollama-proxy.conf.j2` | nginx 配置模板 | +| `infra/ansible/playbooks/nginx-sync.yml` | Ansible Playbook | + +--- + +## 執行部署 + +```bash +# 1. 進入 Ansible 目錄 +cd /Users/ogt/awoooi/infra/ansible + +# 2. 部署到 110 (Dry-run 先驗證) +ansible-playbook -i inventory/hosts.yml playbooks/nginx-sync.yml --tags 110 --check + +# 3. 
正式部署 +ansible-playbook -i inventory/hosts.yml playbooks/nginx-sync.yml --tags 110 +``` + +--- + +## 驗證部署 + +### 從 110 本機驗證 + +```bash +# 測試 GCP-A proxy +curl http://127.0.0.1:11435/api/tags + +# 測試 GCP-B proxy +curl http://127.0.0.1:11436/api/tags +``` + +### 從 K3s Node 驗證 + +```bash +# 進入 K3s node (120 或 121) +ssh wooo@192.168.0.120 + +# 測試連線 110 proxy +curl http://192.168.0.110:11435/api/tags +curl http://192.168.0.110:11436/api/tags +``` + +### 從 K8s Pod 驗證 + +```bash +# 進入 API Pod +kubectl exec -it -n awoooi-prod deployment/awoooi-api -- bash + +# 測試連線 +apt-get update && apt-get install -y curl +curl http://192.168.0.110:11435/api/tags +``` + +--- + +## 啟用 GCP Ollama + +代理部署完成後,修改 ConfigMap 啟用 GCP 端點: + +```bash +# 編輯 ConfigMap +kubectl edit configmap -n awoooi-prod awoooi-config +``` + +修改以下欄位: + +```yaml +# 修改前 +OLLAMA_URL: "http://192.168.0.111:11434" +OLLAMA_SECONDARY_URL: "http://192.168.0.110:11435" +OLLAMA_FALLBACK_URL: "http://192.168.0.110:11436" + +# 修改後 (啟用 GCP-A 作為 Primary) +OLLAMA_URL: "http://192.168.0.110:11435" # GCP-A via proxy +OLLAMA_SECONDARY_URL: "http://192.168.0.110:11436" # GCP-B via proxy +OLLAMA_FALLBACK_URL: "http://192.168.0.111:11434" # Local GPU 最後防線 +``` + +重啟 Deployment: + +```bash +kubectl rollout restart deployment/awoooi-api -n awoooi-prod +``` + +--- + +## 確認模型已載入 + +GCP Ollama 必須已載入以下模型: + +```bash +# GCP-A 檢查 +curl http://34.143.170.20:11434/api/tags | jq '.models[].name' + +# 必須包含: +# - bge-m3:latest (embedding) +# - qwen2.5:7b-instruct (health check) +# - qwen3:14b (RCA analysis) +# - hermes3:latest (tool calling) +# - deepseek-r1:14b (reasoning) +``` + +若模型未載入,SSH 到 GCP 主機執行: + +```bash +ollama pull bge-m3:latest +ollama pull qwen2.5:7b-instruct +ollama pull qwen3:14b +ollama pull hermes3:latest +ollama pull deepseek-r1:14b +``` + +--- + +## 部署檢查清單 + +- [ ] Ansible playbook 執行成功 (110) +- [ ] 110:11435 監聽確認 (`ss -tlnp | grep 11435`) +- [ ] 110:11436 監聽確認 (`ss -tlnp | grep 11436`) +- [ ] K3s node 可連線 110:11435/11436 +- [ ] 
K8s Pod 可連線 110:11435/11436 +- [ ] GCP-A/B 模型已載入 +- [ ] ConfigMap 已修改 +- [ ] Deployment 已重啟 +- [ ] API Pod 啟動無錯誤 +- [ ] 推理測試成功 (檢查 latency < 10s) + +--- + +## 常見問題 + +### 1. K3s Pod 連線被拒絕 + +檢查 NetworkPolicy: +```bash +kubectl describe networkpolicy -n awoooi-prod allow-required-egress +``` + +確認包含: +```yaml +- to: + - ipBlock: + cidr: 192.168.0.110/32 + ports: + - protocol: TCP + port: 11435 + - protocol: TCP + port: 11436 +``` + +### 2. nginx 無法連線 GCP + +檢查 110 外網連線: +```bash +curl -v http://34.143.170.20:11434/api/tags +``` + +若失敗,檢查 GCP 防火牆規則是否開放 0.0.0.0/0:11434。 + +### 3. 模型載入但推理失敗 + +檢查 GCP VM 記憶體/CPU 使用率: +```bash +# GCP Console → Compute Engine → VM 執行個體 → 監控 +``` + +若記憶體不足,升級機型或減少同時載入模型數量。 + +--- + +## 相關文件 + +- ADR-110: GCP 三層容災架構 +- `k8s/awoooi-prod/04-configmap.yaml` +- `k8s/awoooi-prod/02-network-policy.yaml` +- `docs/runbooks/RUNBOOK-OLLAMA-FAILOVER.md` + +--- + +## 負責人 + +- 建立: Claude Sonnet 4.6 — 2026-05-04 +- 審查: 首席架構師 ogt diff --git a/infra/ansible/playbooks/nginx-sync.yml b/infra/ansible/playbooks/nginx-sync.yml index 091db783..fd9a35b0 100644 --- a/infra/ansible/playbooks/nginx-sync.yml +++ b/infra/ansible/playbooks/nginx-sync.yml @@ -36,11 +36,13 @@ name: nginx state: reloaded -- name: "110 Nginx conf 同步(若有)" +- name: "110 Ollama GCP Proxy 部署" hosts: host_110 become: true vars: ansible_become_pass: "{{ vault_sudo_password | default(omit) }}" + ollama_proxy_src: "{{ playbook_dir }}/../roles/nginx/templates/110-ollama-proxy.conf.j2" + ollama_proxy_dest: /etc/nginx/sites-enabled/110-ollama-proxy.conf tasks: - name: "nginx | 確認 110 nginx 無 all-sites-from-188.conf 在 sites-enabled" @@ -54,3 +56,49 @@ msg: "⚠️ 110 sites-enabled 仍有 all-sites-from-188.conf,應已封存" when: stale_conf.stat.exists tags: ["110", "nginx"] + + # ADR-110: Ollama GCP three-tier failover — 110 acts as the nginx proxy that forwards K3s traffic to GCP + - name: "nginx | 部署 Ollama GCP Proxy 配置" + ansible.builtin.template: + src: "{{ ollama_proxy_src }}" + dest: "{{ ollama_proxy_dest }}" + owner: root + group: root + mode: 
"0644" + backup: true + notify: reload nginx 110 + tags: ["110", "nginx", "ollama-proxy"] + + - name: "nginx | 測試 110 設定" + ansible.builtin.command: + cmd: "nginx -t" + changed_when: false + tags: ["110", "nginx", "ollama-proxy"] + + - name: "nginx | 確認 nginx 已啟動" + ansible.builtin.systemd: + name: nginx + state: started + enabled: true + tags: ["110", "nginx", "ollama-proxy"] + + # Handlers normally run at the end of the play; flush now so nginx has loaded the new server blocks before the port check below. + - name: "nginx | 立即執行 reload handler" + ansible.builtin.meta: flush_handlers + tags: ["110", "nginx", "ollama-proxy"] + + - name: "nginx | 驗證 Ollama proxy 端口監聽" + ansible.builtin.wait_for: + port: "{{ item }}" + host: 127.0.0.1 + timeout: 10 + loop: + - 11435 # GCP-A + - 11436 # GCP-B + tags: ["110", "nginx", "ollama-proxy"] + + handlers: + - name: reload nginx 110 + ansible.builtin.systemd: + name: nginx + state: reloaded diff --git a/infra/ansible/roles/nginx/templates/110-ollama-proxy.conf.j2 b/infra/ansible/roles/nginx/templates/110-ollama-proxy.conf.j2 new file mode 100644 index 00000000..94f9a95d --- /dev/null +++ b/infra/ansible/roles/nginx/templates/110-ollama-proxy.conf.j2 @@ -0,0 +1,71 @@ +# 110 Ollama GCP Proxy — ADR-110 three-tier failover +# Lets the K3s cluster reach the public GCP Ollama hosts through internal host 110 +# Created: 2026-05-04 +# Deploy: ansible-playbook -i inventory/hosts.yml playbooks/nginx-sync.yml --tags 110 + +# ============================================================ +# Ollama GCP-A Primary (port 11435 → 34.143.170.20:11434) +# ============================================================ +server { + listen 11435; + listen [::]:11435; + server_name _; + + access_log /var/log/nginx/ollama-gcp-a-access.log; + error_log /var/log/nginx/ollama-gcp-a-error.log warn; + + location / { + proxy_pass http://34.143.170.20:11434; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + + # Ollama inference can be slow, so allow long timeouts + proxy_connect_timeout 10s; + proxy_send_timeout 300s; + proxy_read_timeout 300s; + + # Support streaming responses + proxy_buffering off; + proxy_cache off; + } + + # Health-check endpoint + location /nginx-health { + access_log off; + return 200 "Ollama GCP-A Proxy OK\n"; + 
default_type text/plain; # add_header would emit a second Content-Type header + } +} + +# ============================================================ +# Ollama GCP-B Secondary (port 11436 → 34.21.145.224:11434) +# ============================================================ +server { + listen 11436; + listen [::]:11436; + server_name _; + + access_log /var/log/nginx/ollama-gcp-b-access.log; + error_log /var/log/nginx/ollama-gcp-b-error.log warn; + + location / { + proxy_pass http://34.21.145.224:11434; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + + proxy_connect_timeout 10s; + proxy_send_timeout 300s; + proxy_read_timeout 300s; + + proxy_buffering off; + proxy_cache off; + } + + location /nginx-health { + access_log off; + return 200 "Ollama GCP-B Proxy OK\n"; + default_type text/plain; # add_header would emit a second Content-Type header + } +} diff --git a/ops/nginx/deploy-ollama-proxy-110.sh b/ops/nginx/deploy-ollama-proxy-110.sh new file mode 100755 index 00000000..8a183e7d --- /dev/null +++ b/ops/nginx/deploy-ollama-proxy-110.sh @@ -0,0 +1,121 @@ +#!/bin/bash +# GCP Ollama Nginx Proxy deployment script (run manually on 110) +# ADR-110 three-tier failover — lets K3s reach GCP Ollama over the internal network +# Usage: ssh wooo@192.168.0.110 'sudo bash -s' < deploy-ollama-proxy-110.sh + +set -euo pipefail + +echo "🚀 部署 GCP Ollama Nginx Proxy (110)..." + +# Path of the nginx config file this script manages +NGINX_CONF="/etc/nginx/sites-enabled/110-ollama-proxy.conf" + +# Back up the existing config +if [ -f "$NGINX_CONF" ]; then + echo "📦 備份現有配置..." + cp "$NGINX_CONF" "${NGINX_CONF}.backup.$(date +%Y%m%d%H%M%S)" +fi + +# Write the nginx config +echo "📝 寫入 nginx 配置..." 
+cat > "$NGINX_CONF" << 'EOF' +# 110 Ollama GCP Proxy — ADR-110 三層容災 +# 讓 K3s 叢集內可透過內網 110 存取 GCP 外網 Ollama + +# ============================================================ +# Ollama GCP-A Primary (port 11435 → 34.143.170.20:11434) +# ============================================================ +server { + listen 11435; + listen [::]:11435; + server_name _; + + access_log /var/log/nginx/ollama-gcp-a-access.log; + error_log /var/log/nginx/ollama-gcp-a-error.log warn; + + location / { + proxy_pass http://34.143.170.20:11434; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + + # Ollama 推理可能較慢,給較長超時 + proxy_connect_timeout 10s; + proxy_send_timeout 300s; + proxy_read_timeout 300s; + + # 支援 streaming response + proxy_buffering off; + proxy_cache off; + } + + # 健康檢查端點 + location /nginx-health { + access_log off; + return 200 "Ollama GCP-A Proxy OK\n"; + add_header Content-Type text/plain; + } +} + +# ============================================================ +# Ollama GCP-B Secondary (port 11436 → 34.21.145.224:11434) +# ============================================================ +server { + listen 11436; + listen [::]:11436; + server_name _; + + access_log /var/log/nginx/ollama-gcp-b-access.log; + error_log /var/log/nginx/ollama-gcp-b-error.log warn; + + location / { + proxy_pass http://34.21.145.224:11434; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + + proxy_connect_timeout 10s; + proxy_send_timeout 300s; + proxy_read_timeout 300s; + + proxy_buffering off; + proxy_cache off; + } + + location /nginx-health { + access_log off; + return 200 "Ollama GCP-B Proxy OK\n"; + add_header Content-Type text/plain; + } +} +EOF + +# 測試 nginx 配置 +echo "🧪 測試 nginx 配置..." +nginx -t + +# 重載 nginx +echo "🔄 重載 nginx..." +systemctl reload nginx + +# 驗證端口監聽 +echo "🔍 驗證端口監聽..." 
+sleep 2 +ss -tlnp | grep -E '11435|11436' || true + +# 本地測試 +echo "🌐 本地測試 proxy..." +echo "測試 GCP-A proxy (11435)..." +curl -s -o /dev/null -w "%{http_code}" http://127.0.0.1:11435/api/tags || echo "連線失敗" +echo "" + +echo "測試 GCP-B proxy (11436)..." +curl -s -o /dev/null -w "%{http_code}" http://127.0.0.1:11436/api/tags || echo "連線失敗" +echo "" + +echo "✅ 部署完成!" +echo "" +echo "下一步:" +echo "1. 從 K3s node 測試: curl http://192.168.0.110:11435/api/tags" +echo "2. 修改 K8s ConfigMap 指向 110:11435/11436" +echo "3. 重啟 awoooi-api deployment"