From dc18b0ebd68e84871fc2095f7cd952adaa0b3320 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 29 Apr 2026 09:50:35 +0800 Subject: [PATCH] =?UTF-8?q?fix(prometheus=5Furl):=20drift=20=E6=AE=98?= =?UTF-8?q?=E5=AD=98=E8=BF=BD=E4=BF=AE=20=E2=80=94=20kured=20=E5=AE=88?= =?UTF-8?q?=E9=96=80=E5=93=A1=20+=20monitoring=20API?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit debugger 全 codebase 追根溯源後揪出 5 處 PROMETHEUS_URL drift 殘存 (根因:docs/reference/SERVICE-ENDPOINTS.md 早期把 Prometheus 標在 188 是整個 codebase drift 的源頭)。 本次修最急的 2 處: ## 🔴🔴 kured.yaml:132(守門員失效風險) - 188 → 110 - kured 跑 reboot 前會查 Prometheus alerts,連錯主機 = 跳過保護直接 reboot 主機 - 對齊 ConfigMap + config.py PROMETHEUS_URL ## 🟡 monitoring.py:67(單一事實源) - 寫死 110:9090 改用 settings.PROMETHEUS_URL - 主機巧合正確但繞過 ConfigMap 注入機制 - 未來 Prometheus 再遷移避免再次 drift ## 暫不修 - k3s_monitor_service.py:38 用 121:30090 是 K3s NodePort 內網端點 與外部 PROMETHEUS_URL 概念不同,需新增 PROMETHEUS_INTERNAL_URL setting - 其他 docstring + 文件 drift(SERVICE-ENDPOINTS.md 等)留待後續 ## 驗證 1552 unit tests 全綠(無回歸) Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/api/src/api/v1/monitoring.py | 5 ++++- k8s/kured/kured.yaml | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/apps/api/src/api/v1/monitoring.py b/apps/api/src/api/v1/monitoring.py index f7a47cca..1cad552f 100644 --- a/apps/api/src/api/v1/monitoring.py +++ b/apps/api/src/api/v1/monitoring.py @@ -18,6 +18,7 @@ from datetime import UTC, datetime import httpx from fastapi import APIRouter +from src.core.config import settings from src.core.logging import get_logger logger = get_logger(__name__) @@ -64,7 +65,9 @@ async def _probe_grafana(client: httpx.AsyncClient) -> dict: async def _probe_prometheus(client: httpx.AsyncClient) -> dict: - base = "http://192.168.0.110:9090" + # 2026-04-29 ogt + Claude Opus 4.7: 改用 settings 對齊單一事實源 + # 原本寫死 110:9090 雖巧合正確,但繞過 ConfigMap 注入機制 + base = settings.PROMETHEUS_URL try: health_r = await client.get(f"{base}/-/healthy", timeout=TIMEOUT) if health_r.status_code == 200: diff --git a/k8s/kured/kured.yaml b/k8s/kured/kured.yaml index 18033464..14cf3f78 100644 --- a/k8s/kured/kured.yaml +++ b/k8s/kured/kured.yaml @@ -129,7 +129,10 @@ spec: # PodDisruptionBudget 檢查 - --blocking-pod-selector=app.kubernetes.io/name=awoooi # Prometheus metrics - - --prometheus-url=http://192.168.0.188:9090 + # 2026-04-29 ogt + Claude Opus 4.7: drift fix — 188 是 Ollama Hub + # Prometheus 實際在 110。kured 連錯主機 = 跳過 alert 守門員直接 reboot + # 對齊 ConfigMap 04-configmap.yaml + config.py PROMETHEUS_URL + - --prometheus-url=http://192.168.0.110:9090 resources: limits: cpu: 100m