From d441f706930024fe29c8b3dfadced8f546468ff3 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 6 May 2026 14:55:21 +0800 Subject: [PATCH] fix(ai): add 188 ollama retirement gate --- .../api/scripts/test_nemotron_tool_calling.py | 2 +- docs/LOGBOOK.md | 17 +++ docs/runbooks/OLLAMA-188-RETIREMENT-GATE.md | 50 +++++++ k8s/awoooi-dev/02-configmap.yaml | 7 +- k8s/monitoring/k3s-alerts-supplemental.yaml | 12 -- k8s/monitoring/prometheus.yml | 3 +- scripts/ops/ollama188-retirement-gate.sh | 128 ++++++++++++++++++ 7 files changed, 203 insertions(+), 16 deletions(-) create mode 100644 docs/runbooks/OLLAMA-188-RETIREMENT-GATE.md create mode 100755 scripts/ops/ollama188-retirement-gate.sh diff --git a/apps/api/scripts/test_nemotron_tool_calling.py b/apps/api/scripts/test_nemotron_tool_calling.py index b33b3b9f..cb11c313 100644 --- a/apps/api/scripts/test_nemotron_tool_calling.py +++ b/apps/api/scripts/test_nemotron_tool_calling.py @@ -28,7 +28,7 @@ except ImportError: # ============================================================================ NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY") -OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://192.168.0.188:11434") +OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://192.168.0.110:11435") if not NVIDIA_API_KEY: print("❌ 請設定 NVIDIA_API_KEY 環境變數") diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 55d477ef..053dd386 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,20 @@ +## 2026-05-06 | 188 legacy Ollama 退場 Gate 與 dev 路由修正 + +**背景**:Telegram 告警已不再應出現 `Router:OLLAMA_188`;統帥要求 188 Ollama 移除,正式順序維持 GCP-A → GCP-B → 111 → Gemini 備援。 + +**本次修補**: +- live `awoooi-dev` 原本仍設定 `OLLAMA_URL=http://192.168.0.188:11434`,已 patch 到 `110:11435/11436/11437` 並重啟 dev API。 +- `k8s/awoooi-dev/02-configmap.yaml` 對齊正式 Ollama pool,避免 dev 環境繼續污染 188 使用判斷。 +- `k8s/monitoring/prometheus.yml` 移除 `192.168.0.188:11434` blackbox target,`k3s-alerts-supplemental.yaml` 移除舊 `OllamaDown` 188 告警;live Prometheus 已做精準 patch、`promtool check config` 通過並 SIGHUP reload。 +- `apps/api/scripts/test_nemotron_tool_calling.py` 預設 Ollama endpoint 改為 `110:11435`。 +- 新增 `scripts/ops/ollama188-retirement-gate.sh` 與 `docs/runbooks/OLLAMA-188-RETIREMENT-GATE.md`,把停止/disable/uninstall 的條件明確化。 + +**驗證**: +- `awoooi-dev` live env:`OLLAMA_URL=110:11435`、`OLLAMA_SECONDARY_URL=110:11436`、`OLLAMA_FALLBACK_URL=110:11437`。 +- dev Pod 內三個 endpoint `/api/tags` 均 OK。 +- 短窗口 Gate:`POST_SINCE=25 minutes ago HEALTH_SINCE=2 minutes ago scripts/ops/ollama188-retirement-gate.sh` → failures=0。 +- 24 小時 Gate:仍看到 `192.168.0.88` 在 24 小時內送過 `/api/generate` / `/v1/chat/completions`,因此未釐清前不可解除安裝,只能先做零流量觀察。 + ## 2026-05-06 | Gitea CD SSH key path no longer expands to /root **背景**:`2c2bf9d6` 的 CD `build-and-deploy` 在 `Inject K8s Secrets` 失敗;runner 先把 deploy key 寫到 `${HOME}/.ssh/deploy_key`,但 `ssh -i ~/.ssh/deploy_key` 由 OpenSSH 展開成 `/root/.ssh/deploy_key`,導致 `Permission denied`。 diff --git a/docs/runbooks/OLLAMA-188-RETIREMENT-GATE.md b/docs/runbooks/OLLAMA-188-RETIREMENT-GATE.md new file mode 100644 index 00000000..0f9dff1d --- /dev/null +++ b/docs/runbooks/OLLAMA-188-RETIREMENT-GATE.md @@ -0,0 +1,50 @@ +# 188 Legacy Ollama 退場 Gate + +## 目的 + +`192.168.0.188:11434` 已不再是 AWOOOI / AwoooP 的 Ollama provider。正式順序是: + +1. GCP-A:`192.168.0.110:11435` +2. GCP-B:`192.168.0.110:11436` +3. Local 111:`192.168.0.110:11437` +4. Gemini / Claude:雲端備援,不是主路徑 + +188 主機仍承載 PostgreSQL、Redis、SigNoz、OpenClaw 等服務,所以不能把「解除 188 Ollama」誤解成「動 188 主機」。退場只針對 `ollama.service` 與 `:11434`。 + +## 退場前不變式 + +- repo runtime 不得再出現 `ollama_188`、`OLLAMA_188`、`Ollama188Provider` 或 `192.168.0.188:11434`。 +- `awoooi-prod` 與 `awoooi-dev` live env 不得指向 `192.168.0.188:11434`。 +- Prometheus live config 不得再探測 `192.168.0.188:11434`。 +- 188 的 Ollama journal 在觀察窗口內不得有 `/api/generate`、`/api/chat`、`/v1/chat/completions` 推理 POST。 +- `192.168.0.121` 不得再固定對 188 `/api/tags` 做 dev health check。 + +## 檢查方式 + +```bash +scripts/ops/ollama188-retirement-gate.sh +``` + +可調整觀察窗口: + +```bash +POST_SINCE="24 hours ago" HEALTH_SINCE="10 minutes ago" scripts/ops/ollama188-retirement-gate.sh +``` + +## 階段決策 + +| 階段 | 條件 | 動作 | +| --- | --- | --- | +| Observe | Gate 仍有 failures | 不停止服務,只繼續追來源 | +| Stop | Gate 連續 24 小時綠燈 | 可執行 `systemctl stop ollama`,保留 binary 與模型 | +| Disable | Stop 後 48 小時無回歸 | 可 `systemctl disable ollama` | +| Uninstall | Disable 後 7 天無回歸,且確認無其他專案依賴 | 才評估移除套件與模型資料 | + +## 今日發現 + +- `awoooi-prod` provider registry 已沒有 `ollama_188`。 +- `awoooi-dev` 原本仍使用 `OLLAMA_URL=http://192.168.0.188:11434`,已 live patch 到 GCP-A/GCP-B/111 proxy。 +- Prometheus 舊 blackbox target 已移除並 reload,避免退役後誤報。 +- 188 journal 中近期少量推理 POST 來源為 `192.168.0.88`,不是 K8s Pod;此來源未釐清前,不建議解除安裝。 +- 2026-05-06 14:53 短窗口 Gate 綠燈:repo、K8s env、Prometheus target、dev health check 均已避開 188。 +- 24 小時 Gate 尚未綠燈:仍看得到 `192.168.0.88` 在 24 小時內送過推理 POST。 diff --git a/k8s/awoooi-dev/02-configmap.yaml b/k8s/awoooi-dev/02-configmap.yaml index 0d156316..96057e66 100644 --- a/k8s/awoooi-dev/02-configmap.yaml +++ b/k8s/awoooi-dev/02-configmap.yaml @@ -13,7 +13,12 @@ data: ENVIRONMENT: "dev" SYSTEM_NAME: "awoooi" - OLLAMA_URL: "http://192.168.0.188:11434" + # 2026-05-06 Codex: dev 也必須走正式 Ollama pool,禁止回到 188 legacy Ollama。 + OLLAMA_URL: "http://192.168.0.110:11435" + OLLAMA_SECONDARY_URL: "http://192.168.0.110:11436" + OLLAMA_FALLBACK_URL: "http://192.168.0.110:11437" + ALERT_AI_ENFORCE_OLLAMA_FIRST: "true" + ALERT_AI_ALLOW_CLOUD_FALLBACK: "true" OPENCLAW_URL: "http://192.168.0.188:8089" KALI_SCANNER_URL: "http://192.168.0.112:8080" SIGNOZ_URL: "http://192.168.0.188:3301" diff --git a/k8s/monitoring/k3s-alerts-supplemental.yaml b/k8s/monitoring/k3s-alerts-supplemental.yaml index 6bf6a91f..b621c619 100644 --- a/k8s/monitoring/k3s-alerts-supplemental.yaml +++ b/k8s/monitoring/k3s-alerts-supplemental.yaml @@ -33,18 +33,6 @@ groups: summary: "🔴 PostgreSQL 服務離線" description: "PostgreSQL (192.168.0.188:5432) 已離線超過 1 分鐘" - # Ollama AI 服務離線 - - alert: OllamaDown - expr: probe_success{job="blackbox-tcp", instance="192.168.0.188:11434"} == 0 - for: 2m - labels: - severity: warning - team: ops - component: ollama - annotations: - summary: "⚠️ Ollama AI 服務離線" - description: "Ollama (192.168.0.188:11434) 已離線超過 2 分鐘,AI 功能降級" - # OpenClaw 服務離線 - alert: OpenClawDown expr: probe_success{job="blackbox-tcp", instance="192.168.0.188:8089"} == 0 diff --git a/k8s/monitoring/prometheus.yml b/k8s/monitoring/prometheus.yml index 24008e6f..2452a9fd 100644 --- a/k8s/monitoring/prometheus.yml +++ b/k8s/monitoring/prometheus.yml @@ -87,11 +87,10 @@ scrape_configs: - 192.168.0.110:9000 - 192.168.0.110:5000 - 192.168.0.110:3100 - # 188 服務 + # 188 服務(Ollama 已於 2026-05-06 自 AWOOOI provider pool 退役) - 192.168.0.188:3301 - 192.168.0.188:5432 - 192.168.0.188:6380 - - 192.168.0.188:11434 - 192.168.0.188:8089 # K3s Worker - 192.168.0.120:31234 diff --git a/scripts/ops/ollama188-retirement-gate.sh b/scripts/ops/ollama188-retirement-gate.sh new file mode 100755 index 00000000..33447561 --- /dev/null +++ b/scripts/ops/ollama188-retirement-gate.sh @@ -0,0 +1,128 @@ +#!/usr/bin/env bash +# 188 legacy Ollama 退場檢查。 +# 這個腳本只讀取 repo、K8s 與主機日誌,不會停止或移除任何服務。 + +set -euo pipefail + +ROOT_DIR="$(git rev-parse --show-toplevel 2>/dev/null || pwd)" +LEGACY_SSH="${LEGACY_SSH:-ollama@192.168.0.188}" +PROMETHEUS_SSH="${PROMETHEUS_SSH:-wooo@192.168.0.110}" +POST_SINCE="${POST_SINCE:-24 hours ago}" +HEALTH_SINCE="${HEALTH_SINCE:-5 minutes ago}" + +failures=0 +warnings=0 + +info() { printf '[INFO] %s\n' "$*"; } +warn() { printf '[WARN] %s\n' "$*"; warnings=$((warnings + 1)); } +fail() { printf '[FAIL] %s\n' "$*"; failures=$((failures + 1)); } +pass() { printf '[PASS] %s\n' "$*"; } + +run_or_warn() { + local label="$1" + shift + if ! "$@"; then + warn "$label 無法完成,請確認本機是否有對應權限或網路" + return 1 + fi +} + +check_repo_runtime_refs() { + info "檢查 repo runtime 是否仍引用 188 Ollama" + local pattern='ollama_188|OLLAMA_188|Ollama188Provider|192\.168\.0\.188:11434' + local output + output="$( + cd "$ROOT_DIR" && rg -n "$pattern" \ + apps/api/src apps/api/scripts scripts k8s ops \ + -g '!scripts/ops/ollama188-retirement-gate.sh' 2>/dev/null || true + )" + + if [[ -n "$output" ]]; then + printf '%s\n' "$output" + fail "runtime 仍有 188 Ollama 引用" + else + pass "runtime 已無 188 Ollama 引用" + fi +} + +check_k8s_env() { + info "檢查 live K8s API Deployment env" + local ns env_output + for ns in awoooi-prod awoooi-dev; do + if ! env_output="$(kubectl -n "$ns" exec deploy/awoooi-api -- sh -lc 'env | sort | grep OLLAMA' 2>/dev/null)"; then + warn "$ns 無法讀取 live env" + continue + fi + + printf -- '--- %s ---\n%s\n' "$ns" "$env_output" + if printf '%s\n' "$env_output" | grep -q '192.168.0.188:11434'; then + fail "$ns live env 仍指向 188 Ollama" + else + pass "$ns live env 已避開 188 Ollama" + fi + done +} + +check_prometheus_config() { + info "檢查 live Prometheus 是否仍探測 188 Ollama" + local output + if ! output="$(ssh -o BatchMode=yes -o ConnectTimeout=5 "$PROMETHEUS_SSH" \ + "docker exec prometheus sh -lc 'grep -R \"192.168.0.188:11434\" /etc/prometheus 2>/dev/null || true'" 2>/dev/null)"; then + warn "無法讀取 Prometheus live config" + return + fi + + if [[ -n "$output" ]]; then + printf '%s\n' "$output" + fail "Prometheus live config 仍包含 188 Ollama target" + else + pass "Prometheus live config 已無 188 Ollama target" + fi +} + +check_legacy_inference_posts() { + info "檢查 188 Ollama 最近是否仍有推理 POST(POST_SINCE=${POST_SINCE})" + local output + if ! output="$(ssh -o BatchMode=yes -o ConnectTimeout=5 "$LEGACY_SSH" \ + "journalctl -u ollama --since \"$POST_SINCE\" --no-pager | grep -E 'POST[[:space:]]+\"/(api/(chat|generate)|v1/chat/completions)' || true" 2>/dev/null)"; then + warn "無法讀取 188 ollama journal" + return + fi + + if [[ -n "$output" ]]; then + printf '%s\n' "$output" | tail -40 + fail "188 Ollama 仍有推理 POST,不可解除安裝" + else + pass "觀察窗口內沒有推理 POST" + fi +} + +check_dev_health_noise() { + info "檢查 dev health check 是否還在打 188(HEALTH_SINCE=${HEALTH_SINCE})" + local output + if ! output="$(ssh -o BatchMode=yes -o ConnectTimeout=5 "$LEGACY_SSH" \ + "journalctl -u ollama --since \"$HEALTH_SINCE\" --no-pager | grep '192.168.0.121' || true" 2>/dev/null)"; then + warn "無法讀取 188 ollama journal" + return + fi + + if [[ -n "$output" ]]; then + printf '%s\n' "$output" | tail -30 + fail "awoooi-dev 或 mon1 仍在對 188 做 health check" + else + pass "近期未看到 mon1/dev 對 188 的 health check" + fi +} + +check_repo_runtime_refs +check_k8s_env +check_prometheus_config +check_legacy_inference_posts +check_dev_health_noise + +printf '\n結果:failures=%s warnings=%s\n' "$failures" "$warnings" +if [[ "$failures" -gt 0 ]]; then + exit 1 +fi + +exit 0