diff --git a/.env.example b/.env.example index 55fc7ff..e84126c 100644 --- a/.env.example +++ b/.env.example @@ -75,6 +75,10 @@ USE_HTTPS=false # ========================================== # [預設 4] Web worker 數;正式環境需配合 PostgreSQL pool 上限 WEB_CONCURRENCY=4 +# [預設 gthread] Thread worker 讓 /health 不會被 Dashboard 長查詢完全排隊 +GUNICORN_WORKER_CLASS=gthread +# [預設 4] 每個 worker 的 threads;正式環境需配合 DB pool 與 CPU 上限 +GUNICORN_THREADS=4 # [預設 300] 長查詢 / 報表匯出 timeout 秒數 GUNICORN_TIMEOUT=300 diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml index e574ecc..34b521c 100644 --- a/.gitea/workflows/cd.yaml +++ b/.gitea/workflows/cd.yaml @@ -34,6 +34,8 @@ on: - 'database/**' - 'templates/**' - 'static/**' + - 'monitoring/prometheus.yml' + - 'monitoring/blackbox.yml' # 需重建 image 的檔案 - 'Dockerfile' - 'requirements.txt' @@ -84,7 +86,7 @@ jobs: exit 0 fi CHANGED=$(git diff --name-only HEAD~1 HEAD 2>/dev/null || echo "") - if echo "$CHANGED" | grep -qE '^(Dockerfile|requirements\.txt|docker-compose\.yml|gunicorn\.conf\.py)$'; then + if echo "$CHANGED" | grep -qE '^(Dockerfile|requirements\.txt|docker-compose\.yml)$'; then echo "type=rebuild" >> $GITHUB_OUTPUT echo "label=🔨 重建 Docker Image" >> $GITHUB_OUTPUT else @@ -239,6 +241,19 @@ jobs: docker compose up -d --no-deps --force-recreate momo-app scheduler telegram-bot && \ echo '✅ Image 重建完成(三容器)'" + - name: 重新載入監控設定 + run: | + CHANGED=$(git diff --name-only HEAD~1 HEAD 2>/dev/null || echo "") + if echo "$CHANGED" | grep -qE '^(monitoring/prometheus\.yml|monitoring/blackbox\.yml)$'; then + ssh -i ~/.ssh/id_deploy ollama@192.168.0.188 \ + "cd /home/ollama/momo-pro/monitoring && \ + docker compose up -d prometheus blackbox-exporter && \ + docker compose restart prometheus blackbox-exporter && \ + echo '✅ Monitoring 設定已重新載入'" + else + echo "ℹ️ Monitoring 設定未變更,略過重新載入" + fi + # ── 健康檢查(H3: HTTP + 三容器狀態雙重驗證) ───────────────────────── - name: 健康檢查 run: | diff --git a/AGENTS.md b/AGENTS.md index 8ef7787..61473a2 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,6 +1,6 @@ # EwoooC (MOMO Pro System) — Codex 專案工作規則 -> 版本: V13.4 +> 版本: V13.5 > 目標: 把專案知識整理成 Codex 可低成本讀取、可持續維護、可安全落地的單一工作入口。 ## 1. 入口原則 @@ -126,6 +126,7 @@ - 部署、容器、SSH 類操作先看 `docs/adr/ADR-011-cross-project-resource-isolation.md`。 - `gunicorn.conf.py` 必須透過 `docker-compose.yml` bind mount 進 `momo-app`;除救急外,不以 `docker cp` 當常態部署方式。 - CD rebuild 應先完成 image build,再短暫 recreate 三應用容器;禁止把 no-cache build 時間變成長時間 502。 +- HTTP health / Blackbox / CD 探測必須打 `/health`,不可打 Dashboard 首頁 `/`,避免監控流量觸發重型查詢造成 worker starvation。 ## 8. 常用入口 diff --git a/CONSTITUTION.md b/CONSTITUTION.md index 3b26577..84d7af5 100644 --- a/CONSTITUTION.md +++ b/CONSTITUTION.md @@ -2,7 +2,7 @@ > 本文件定義專案開發的核心準則與不可違反的規範 > **建立日期**: 2026-01-12 -> **當前版本**: V10.55 (Cache dashboard competitor decision overview) +> **當前版本**: V10.56 (Health-safe monitoring runtime) > **最後更新**: 2026-05-01 --- @@ -164,6 +164,11 @@ - ✅ **正確**: 日誌格式必須包含 `[模組] [功能] 狀態 | 詳細資訊` - ❌ **禁止**: 使用 `print()` 輸出日誌 +### 第 18.1 條:健康檢查與監控目標(強制要求) +- ✅ **正確**: Docker healthcheck、CD health check、Blackbox HTTP 監控必須打 `/health`,不可打 Dashboard 首頁 `/` 作為可用性探測。 +- ✅ **正確**: Gunicorn runtime 必須保留可併發回應輕量 health check 的 worker 設定,例如 `gthread` + `GUNICORN_THREADS`。 +- ❌ **禁止**: 用會觸發大量 DB 查詢或模板渲染的頁面作為探測目標,避免監控流量本身造成 worker starvation。 + --- ## 第六章:版本管理規範 diff --git a/app.py b/app.py index 0535b0a..7b325b9 100644 --- a/app.py +++ b/app.py @@ -95,8 +95,8 @@ except Exception as e: sys_log.error(f"無法檢測磁碟空間: {e}") # 🚩 系統版本定義 (備份與顯示用) -# 🚩 2026-05-01 V10.55: Cache dashboard competitor decision overview -SYSTEM_VERSION = "V10.55" +# 🚩 2026-05-01 V10.56: Health-safe monitoring runtime +SYSTEM_VERSION = "V10.56" # ========================================== # 🔒 SQL Injection 防護函數 diff --git a/config.py b/config.py index 48e8fe5..dd9e7fc 100644 --- a/config.py +++ b/config.py @@ -254,7 +254,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '') # ========================================== # 系統版本與路徑 # ========================================== -SYSTEM_VERSION = "V10.55" +SYSTEM_VERSION = "V10.56" LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log') public_url = PUBLIC_URL # 用於模板顯示 diff --git a/docker/prometheus/prometheus.yml b/docker/prometheus/prometheus.yml index 3321113..1016c9c 100644 --- a/docker/prometheus/prometheus.yml +++ b/docker/prometheus/prometheus.yml @@ -97,10 +97,8 @@ scrape_configs: module: [http_2xx] static_configs: - targets: - - https://mo.wooo.work - https://mo.wooo.work/health - - http://192.168.0.110:5001 - - http://192.168.0.110:5001/health + - http://momo-pro-system:80/health labels: env: 'uat' probe_type: 'http' @@ -121,7 +119,6 @@ scrape_configs: module: [http_2xx] static_configs: - targets: - - https://momo.wooo.work - https://momo.wooo.work/health labels: env: 'prod' diff --git a/docs/AI_INTELLIGENCE_MODULE_SOT.md b/docs/AI_INTELLIGENCE_MODULE_SOT.md index 6db4a27..ac8f3c8 100644 --- a/docs/AI_INTELLIGENCE_MODULE_SOT.md +++ b/docs/AI_INTELLIGENCE_MODULE_SOT.md @@ -84,8 +84,10 @@ SQL漏斗(~300筆) - Smoke 每日摘要支援手動 Telegram 推播,並由 `momo-scheduler` 每日 09:10 呼叫 `run_ai_smoke_daily_summary_task()`。 - Grafana provisioning 新增 `docker/grafana/provisioning/dashboards/json/ai-automation-overview.json`,觀測 EventRouter dispatch/latency、safe action、Telegram replay 與 AutoHeal action/duration。 - Active monitoring stack 使用 `monitoring/prometheus.yml` 的 `momo-app` job scrape `momo-pro-system:80/metrics`;Prometheus container 需加入 `momo-network`。 +- Active Blackbox HTTP targets 必須探測 `/health`(188 stack 目前 `https://mo.wooo.work/health` 與 `http://momo-pro-system:80/health`;110 gateway stack 目前 `https://mo.wooo.work/health`),不可探測 Dashboard 首頁 `/`,避免監控流量觸發重型 DB 查詢。 - `/metrics` 對 `realtime_sales_monthly` 只用 raw `SELECT COUNT(*)` 取得總筆數,避免 ORM schema drift 讓 Prometheus scrape 產生 warning。 - `momo-app` 必須 bind mount `./gunicorn.conf.py:/app/gunicorn.conf.py:ro`,讓 CD sync/rebuild 後的 Gunicorn runtime 設定與 repo 保持一致。 +- Gunicorn runtime 預設 `worker_class = gthread`、`GUNICORN_THREADS=4`、`preload_app = False`;此組合讓 HUP 熱重載可用,也避免 Dashboard 長查詢完全阻塞 `/health`。 - CD rebuild 模式必須先 build image 成功,再短暫 stop/rm/recreate 三應用容器,避免 no-cache build 造成長時間 502。 - ElephantAlpha 使用 NVIDIA NIM hosted API;production 預設模型為 `nvidia/llama-3.3-nemotron-super-49b-v1.5`,`ELEPHANT_ALPHA_FALLBACK_MODELS` 需保留至少一個可呼叫備援;403/404、408/409/425/429、5xx、timeout 與 connection error 必須嘗試下一個模型。 - OpenClaw/Hermes embedding 優先呼叫 Ollama `/api/embed`,只在舊節點不支援時 fallback `/api/embeddings`;timeout 由 `EMBEDDING_TIMEOUT` / `OLLAMA_EMBED_TIMEOUT` 控制。 diff --git a/docs/guides/ai_automation_session_sop.md b/docs/guides/ai_automation_session_sop.md index 0ffbe17..3027afb 100644 --- a/docs/guides/ai_automation_session_sop.md +++ b/docs/guides/ai_automation_session_sop.md @@ -27,7 +27,7 @@ - EventRouter / AutoHeal 變更必須更新 `services/ai_automation_metrics.py` 指標或確認既有指標已覆蓋。 - AI 自動化閉環變更必須確認 `/api/ai-automation/smoke` 與 `/ai_automation_smoke` 仍能反映新狀態。 - AI 自動化 Prometheus 指標變更必須同步檢查 `docker/grafana/provisioning/dashboards/json/ai-automation-overview.json` 是否需要新增 panel 或查詢。 -- 線上 active monitoring stack 以 `monitoring/prometheus.yml` 為準;若 dashboard 無資料,先確認 Prometheus `momo-app` target 與 `momo-network` 連線。 +- 188 線上 active monitoring stack 以 `monitoring/prometheus.yml` 為準;110 gateway 另有 `/home/wooo/monitoring/prometheus.yml`。若 dashboard 無資料,先確認 Prometheus `momo-app` target 與 `momo-network` 連線;所有 Blackbox HTTP target 必須打 `/health`,不可打 Dashboard 首頁 `/`。 - Smoke dashboard 會保存 JSONL 趨勢;若新增檢查項目,要確保 history compact record 仍保持小而可讀。 - Smoke history 管理只能操作 `MOMO_AI_AUTOMATION_SMOKE_HISTORY` 指向的 JSONL,不得清理 DB 或 EventRouter queue。 - Smoke 每日摘要推播只讀 history,不得重新執行 smoke,也不得把完整 details 寫進 Telegram。 diff --git a/docs/guides/devops_handbook.md b/docs/guides/devops_handbook.md index 04e9bc7..d2bfdd9 100644 --- a/docs/guides/devops_handbook.md +++ b/docs/guides/devops_handbook.md @@ -105,3 +105,8 @@ - **原因**: Hermes/Ollama 負載高或舊 `/api/embeddings` endpoint 慢,會讓 embedding worker 累加 retry。 - **檢查**: 看 `embedding_retry_queue` 的 `pending/processing/failed` 分布,並測 `http://192.168.0.111:11434/api/embed`。 - **修復**: client 預設使用官方 `/api/embed`;若舊節點 404/405 才 fallback `/api/embeddings`。必要時調整 `EMBEDDING_TIMEOUT`。 + +### 11. `/health` 偶發 000 或容器 unhealthy,但 app log 仍有 200 +- **原因**: Blackbox 或外部探測打 Dashboard 首頁 `/`,會觸發商品看板與 PChome 比價重型查詢;少量 sync worker 被長請求佔滿時,輕量 `/health` 也會排隊逾時。 +- **檢查**: `docker logs momo-pro-system --since 20m | grep 'Blackbox-Exporter'` 應只看到 `GET /health`;`docker stats momo-db` 若接近多核心滿載,需同步看 `pg_stat_activity` 的 `latest_momo` 類查詢。 +- **修復**: 188 的 `monitoring/prometheus.yml` 與 110 的 `/home/wooo/monitoring/prometheus.yml` blackbox HTTP targets 必須使用 `/health`;Gunicorn 保持 `worker_class=gthread`、`GUNICORN_THREADS=4`、`preload_app=False`。 diff --git a/docs/memory/ai_automation_closure_20260429.md b/docs/memory/ai_automation_closure_20260429.md index 99c05f6..1db8153 100644 --- a/docs/memory/ai_automation_closure_20260429.md +++ b/docs/memory/ai_automation_closure_20260429.md @@ -17,6 +17,8 @@ - Grafana provisioning 已新增 `MOMO AI Automation Overview`,由 Prometheus `/metrics` 觀測 EventRouter、safe action、replay 與 AutoHeal 趨勢。 - 2026-04-30 active Grafana 已載入 4 個 dashboard;AI dashboard 檔案同步到 188 實際掛載目錄 `monitoring/grafana/provisioning/dashboards/json/`。 - 2026-04-30 active Prometheus 補 `momo-app` scrape job,目標 `momo-pro-system:80/metrics`;Prometheus 需加入 `momo-network` 才能解析 app container DNS。 +- 2026-05-01 active Blackbox HTTP target 改打 `/health`;188 stack 使用 `https://mo.wooo.work/health` + `http://momo-pro-system:80/health`,110 gateway stack 使用 `https://mo.wooo.work/health`。禁止再用 Dashboard 首頁 `/` 當探測目標,避免監控觸發重型查詢並和 `/health` 搶 worker。 +- 2026-05-01 Gunicorn runtime 預設 `gthread` + `GUNICORN_THREADS=4`,保留 `preload_app = False` 供 HUP 熱重載,同時讓輕量 health check 不被少數 Dashboard 長請求完全排隊。 - 2026-04-30 發現並修復 `gunicorn.conf.py` `post_fork` 掃到 Flask/Werkzeug LocalProxy 導致 worker boot fail 的問題。 - 2026-04-30 CD 健康檢查曾因 rebuild 後短暫 502 太早失敗;已改為 internal `docker exec momo-pro-system /health` + external `https://mo.wooo.work/health` 雙檢查,重試約 3 分鐘。 - 2026-04-30 CD Sync 模式曾只 rsync + `docker compose up -d`,導致 host 檔案已是新版但 gunicorn process 仍跑舊版;已補 `docker compose restart momo-app scheduler telegram-bot`。 @@ -49,7 +51,7 @@ - Grafana dashboard 檔案:`docker/grafana/provisioning/dashboards/json/ai-automation-overview.json`;provider 會載入 JSON 目錄,不需要修改 dashboard provider。 - Active monitoring 使用 `monitoring/prometheus.yml`,不是 `docker/prometheus/prometheus.yml`;若線上 panel 無資料,先查 Prometheus 是否有 `momo-app` target。 - Monitoring compose 的 cAdvisor 只需在 `monitoring` network 內提供 `cadvisor:8080` 給 Prometheus,不應綁定 host `8080`,避免與其他服務衝突;blackbox target 需要 `blackbox-exporter` 容器存在於同一 network。 -- 2026-04-30 線上驗證:目前 active MOMO UAT blackbox target 只保留 `https://mo.wooo.work`;`momo.wooo.work` 與 `wooo.work` 需等 DNS/Nginx 恢復後再加入 active monitoring。 +- 2026-05-01 線上驗證:188 active MOMO UAT blackbox HTTP target 保留 public `https://mo.wooo.work/health` 與 internal `http://momo-pro-system:80/health`;110 gateway Prometheus 也已把 `https://mo.wooo.work` 改為 `https://mo.wooo.work/health`;`momo.wooo.work` 與 `wooo.work` 需等 DNS/Nginx 恢復後再加入 active monitoring。 - 110 Gitea runner 必須只宣告 `ewoooc-host` label;若混入 `ubuntu-latest` / `awoooi-host`,EWOOOC 與 AWOOOI workflows 會互相搶同一個 runner,導致推版卡住或跨專案污染。 - CD sync 模式應對 `momo-pro-system` 發 Gunicorn `HUP` 熱重載,不重啟 app 容器;scheduler / telegram-bot 才用 compose restart。Gunicorn 必須維持 `preload_app = False`,否則 HUP 只重啟 worker、但 app object 仍來自舊 master 預載程式碼。 - App container 的 runtime `gunicorn.conf.py` 由 `docker-compose.yml` bind mount;若未來改 gunicorn 設定,不應再手動 `docker cp` 作為常態流程。 @@ -66,6 +68,7 @@ - 2026-04-30 Gunicorn LocalProxy 修復:新增 `tests/test_gunicorn_config.py`。 - 2026-04-30 Prometheus scrape 修復:新增 `tests/test_prometheus_ai_automation_scrape.py`。 - 2026-04-30 Monitoring exporter 修復:`tests/test_prometheus_ai_automation_scrape.py` 覆蓋 cAdvisor internal-only 與 blackbox exporter 宣告。 +- 2026-05-01 Health-safe monitoring runtime:`tests/test_gunicorn_config.py` 覆蓋 gthread/threads 預設與 override,`tests/test_prometheus_ai_automation_scrape.py` 覆蓋 blackbox 必須打 `/health`,`tests/test_cd_health_check.py` 覆蓋 monitoring reload 與 gunicorn bind-mounted sync。 - 2026-04-30 CD health check hardening:新增 `tests/test_cd_health_check.py`。 - 2026-04-30 CD Gunicorn mount hardening:新增 `tests/test_docker_compose_runtime_mounts.py`。 - 2026-04-30 Metrics schema drift 降噪:`tests/test_ai_automation_metrics.py` 覆蓋 raw sales count query。 diff --git a/docs/memory/history_logs.md b/docs/memory/history_logs.md index d02391c..0a33a53 100644 --- a/docs/memory/history_logs.md +++ b/docs/memory/history_logs.md @@ -66,6 +66,7 @@ - **Vendor query service extraction**: Vendor V2 首頁統計與缺貨清單 query 移到 `services/vendor_stockout_query_service.py`,`routes/vendor_routes.py` 由約 1,821 行降至 1,675 行,回到 request parsing + template rendering,避免巨型 Blueprint 繼續承接資料組裝邏輯。 - **Vendor stockout API query extraction**: `/vendor-stockout/api/stockout/list` 與 `/vendor-stockout/api/stockout/batches` 的 query/serialization 移入同一個 `services/vendor_stockout_query_service.py`,保留既有 JSON shape,`routes/vendor_routes.py` 再降至約 1,565 行。 - **Vendor management query extraction**: `/vendor-stockout/api/vendor/list` 與 `/vendor-stockout/api/vendor/` 的 query/serialization 移入 `services/vendor_stockout_query_service.py`,保留既有 JSON shape,`routes/vendor_routes.py` 再降至約 1,485 行。 +- **Health-safe monitoring runtime**: 健康檢查失敗盤點確認 Blackbox 打 Dashboard 首頁 `/` 會觸發重型查詢並和 `/health` 搶 worker;188 與 110 active blackbox 均改打 `/health`,Gunicorn 預設改為 `gthread` + `GUNICORN_THREADS=4`,CD 也會在 monitoring target 變更時重新載入 Prometheus/blackbox。 ### 2026-04-28~29:Phase 3e 重構大戰 + daily_sales cache 隱形 bug 根除 - **app.py 縮減 -10.8%**: 7,386 → 6,590 行,11 commits 全綠零 502。 diff --git a/gunicorn.conf.py b/gunicorn.conf.py index 086194b..04db811 100644 --- a/gunicorn.conf.py +++ b/gunicorn.conf.py @@ -13,6 +13,8 @@ from sqlalchemy.engine import Engine bind = "0.0.0.0:80" workers = int(os.getenv("WEB_CONCURRENCY", "4")) +worker_class = os.getenv("GUNICORN_WORKER_CLASS", "gthread") +threads = int(os.getenv("GUNICORN_THREADS", "4")) timeout = int(os.getenv("GUNICORN_TIMEOUT", "300")) accesslog = "-" errorlog = "-" diff --git a/monitoring/prometheus.yml b/monitoring/prometheus.yml index bf2e508..ed5726c 100644 --- a/monitoring/prometheus.yml +++ b/monitoring/prometheus.yml @@ -56,8 +56,8 @@ scrape_configs: module: [http_2xx] static_configs: - targets: - - https://mo.wooo.work - - http://192.168.0.110:5000 + - https://mo.wooo.work/health + - http://momo-pro-system:80/health relabel_configs: - source_labels: [__address__] target_label: __param_target diff --git a/tests/test_cd_health_check.py b/tests/test_cd_health_check.py index a8734c2..970776e 100644 --- a/tests/test_cd_health_check.py +++ b/tests/test_cd_health_check.py @@ -33,13 +33,30 @@ def test_cd_sync_mode_hot_reloads_app_without_container_restart(): assert "preload_app = False" in gunicorn_config +def test_cd_sync_mode_treats_gunicorn_config_as_bind_mounted_runtime_file(): + workflow = CD_WORKFLOW.read_text(encoding="utf-8") + + assert "- 'gunicorn.conf.py'" in workflow + assert "gunicorn\\.conf\\.py" not in workflow + + +def test_cd_reloads_monitoring_config_when_prometheus_targets_change(): + workflow = CD_WORKFLOW.read_text(encoding="utf-8") + + assert "- 'monitoring/prometheus.yml'" in workflow + assert "monitoring/prometheus\\.yml|monitoring/blackbox\\.yml" in workflow + assert "cd /home/ollama/momo-pro/monitoring" in workflow + assert "docker compose up -d prometheus blackbox-exporter" in workflow + assert "docker compose restart prometheus blackbox-exporter" in workflow + + def test_cd_sync_mode_repairs_app_mount_drift_once(): workflow = CD_WORKFLOW.read_text(encoding="utf-8") assert "--inplace" in workflow assert "momo-app mount drift detected" in workflow - assert "grep -qx '/app/app.py'" in workflow - assert "grep -qx '/app/config.py'" in workflow + assert 'grep -qx "/app/app.py"' in workflow + assert 'grep -qx "/app/config.py"' in workflow assert "docker compose up -d --no-deps --force-recreate momo-app" in workflow diff --git a/tests/test_gunicorn_config.py b/tests/test_gunicorn_config.py index 407d1b6..cffd48e 100644 --- a/tests/test_gunicorn_config.py +++ b/tests/test_gunicorn_config.py @@ -78,3 +78,23 @@ def test_gunicorn_disables_preload_for_hup_hot_reload(): config = _load_gunicorn_config() assert config.preload_app is False + + +def test_gunicorn_uses_thread_worker_for_health_resilience(monkeypatch): + monkeypatch.delenv("GUNICORN_WORKER_CLASS", raising=False) + monkeypatch.delenv("GUNICORN_THREADS", raising=False) + + config = _load_gunicorn_config() + + assert config.worker_class == "gthread" + assert config.threads == 4 + + +def test_gunicorn_thread_worker_can_be_overridden(monkeypatch): + monkeypatch.setenv("GUNICORN_WORKER_CLASS", "sync") + monkeypatch.setenv("GUNICORN_THREADS", "2") + + config = _load_gunicorn_config() + + assert config.worker_class == "sync" + assert config.threads == 2 diff --git a/tests/test_phase3f_cleanup_contracts.py b/tests/test_phase3f_cleanup_contracts.py index d6304e5..89fa99a 100644 --- a/tests/test_phase3f_cleanup_contracts.py +++ b/tests/test_phase3f_cleanup_contracts.py @@ -51,6 +51,8 @@ def test_env_example_documents_runtime_and_ai_automation_variables(): "EMBEDDING_HOST", "EMBEDDING_TIMEOUT", "GUNICORN_TIMEOUT", + "GUNICORN_THREADS", + "GUNICORN_WORKER_CLASS", "LINE_ENABLED", "MOMO_AI_AUTOMATION_SMOKE_HISTORY", "MOMO_AI_AUTOMATION_SMOKE_HISTORY_LIMIT", diff --git a/tests/test_prometheus_ai_automation_scrape.py b/tests/test_prometheus_ai_automation_scrape.py index 8e6ab4e..c646735 100644 --- a/tests/test_prometheus_ai_automation_scrape.py +++ b/tests/test_prometheus_ai_automation_scrape.py @@ -39,10 +39,28 @@ def test_monitoring_stack_declares_blackbox_exporter(): def test_active_blackbox_targets_only_include_live_uat_momo_entrypoint(): config = (ROOT / "monitoring/prometheus.yml").read_text(encoding="utf-8") + blackbox_block = config.split("job_name: 'blackbox-http'", 1)[1].split( + " # Blackbox - TCP", + 1, + )[0] - assert "https://mo.wooo.work" in config - assert "https://momo.wooo.work" not in config - assert "https://wooo.work" not in config + assert "- https://mo.wooo.work/health" in blackbox_block + assert "- http://momo-pro-system:80/health" in blackbox_block + assert "- https://mo.wooo.work\n" not in blackbox_block + assert "192.168.0.110:5000" not in blackbox_block + assert "https://momo.wooo.work" not in blackbox_block + assert "https://wooo.work" not in blackbox_block + + +def test_blackbox_http_targets_use_lightweight_health_endpoint(): + config = (ROOT / "monitoring/prometheus.yml").read_text(encoding="utf-8") + blackbox_block = config.split("job_name: 'blackbox-http'", 1)[1].split( + " # Blackbox - TCP", + 1, + )[0] + + assert "/health" in blackbox_block + assert "mo.wooo.work\n" not in blackbox_block def test_compose_prometheus_uses_container_dns_for_momo_app(): @@ -50,3 +68,22 @@ def test_compose_prometheus_uses_container_dns_for_momo_app(): assert "job_name: 'momo-app'" in config assert "targets: ['momo-pro-system:80']" in config + + +def test_compose_prometheus_blackbox_targets_health_only(): + config = (ROOT / "docker/prometheus/prometheus.yml").read_text(encoding="utf-8") + uat_block = config.split("job_name: 'blackbox-http-uat'", 1)[1].split( + "job_name: 'blackbox-http-prod'", + 1, + )[0] + prod_block = config.split("job_name: 'blackbox-http-prod'", 1)[1].split( + "job_name: 'blackbox-tcp'", + 1, + )[0] + + assert "- https://mo.wooo.work/health" in uat_block + assert "- http://momo-pro-system:80/health" in uat_block + assert "- https://mo.wooo.work\n" not in uat_block + assert "- http://192.168.0.110:5001\n" not in uat_block + assert "- https://momo.wooo.work/health" in prod_block + assert "- https://momo.wooo.work\n" not in prod_block