From 5a569d1e05a0d479427be03361bc60c8d22be2d2 Mon Sep 17 00:00:00 2001 From: OoO Date: Thu, 30 Apr 2026 08:58:22 +0800 Subject: [PATCH] =?UTF-8?q?=E5=BC=B7=E5=8C=96=20CD=20=E5=81=A5=E5=BA=B7?= =?UTF-8?q?=E6=AA=A2=E6=9F=A5=E9=87=8D=E8=A9=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitea/workflows/cd.yaml | 22 +++++++++++-------- CONSTITUTION.md | 2 +- TODO_NEXT_STEPS.txt | 1 + app.py | 4 ++-- config.py | 2 +- docs/guides/devops_handbook.md | 1 + docs/memory/ai_automation_closure_20260429.md | 2 ++ docs/memory/history_logs.md | 1 + tests/test_cd_health_check.py | 22 +++++++++++++++++++ 9 files changed, 44 insertions(+), 13 deletions(-) create mode 100644 tests/test_cd_health_check.py diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml index 7c9b7b3..42535c7 100644 --- a/.gitea/workflows/cd.yaml +++ b/.gitea/workflows/cd.yaml @@ -224,17 +224,21 @@ jobs: # ── 健康檢查(H3: HTTP + 三容器狀態雙重驗證) ───────────────────────── - name: 健康檢查 run: | - echo "⏳ 等待服務啟動(15s)..." - sleep 15 - for i in $(seq 1 5); do - HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" https://mo.wooo.work/health --max-time 10 || echo "000") - if [ "$HTTP_CODE" = "200" ]; then - echo "✅ HTTP 健康檢查通過(HTTP $HTTP_CODE)" + echo "⏳ 等待服務啟動(30s)..." + sleep 30 + for i in $(seq 1 12); do + INTERNAL_CODE=$(ssh -i ~/.ssh/id_deploy ollama@192.168.0.188 \ + "docker exec momo-pro-system curl -s -o /dev/null -w '%{http_code}' --max-time 8 http://127.0.0.1:80/health" 2>/dev/null || true) + EXTERNAL_CODE=$(curl -s -o /dev/null -w "%{http_code}" https://mo.wooo.work/health --max-time 10 2>/dev/null || true) + INTERNAL_CODE=${INTERNAL_CODE:-000} + EXTERNAL_CODE=${EXTERNAL_CODE:-000} + if [ "$INTERNAL_CODE" = "200" ] && [ "$EXTERNAL_CODE" = "200" ]; then + echo "✅ HTTP 健康檢查通過(internal=$INTERNAL_CODE, external=$EXTERNAL_CODE)" break fi - echo "⏳ 嘗試 $i/5,HTTP $HTTP_CODE,等待 10s..." - [ "$i" -eq 5 ] && echo "❌ HTTP 健康檢查失敗" && exit 1 - sleep 10 + echo "⏳ 嘗試 $i/12,internal=$INTERNAL_CODE external=$EXTERNAL_CODE,等待 15s..." + [ "$i" -eq 12 ] && echo "❌ HTTP 健康檢查失敗" && exit 1 + sleep 15 done # 驗證三應用容器均在 Running 狀態 ssh -i ~/.ssh/id_deploy ollama@192.168.0.188 \ diff --git a/CONSTITUTION.md b/CONSTITUTION.md index d6e2b5e..c17a740 100644 --- a/CONSTITUTION.md +++ b/CONSTITUTION.md @@ -2,7 +2,7 @@ > 本文件定義專案開發的核心準則與不可違反的規範 > **建立日期**: 2026-01-12 -> **當前版本**: V10.11 (四 AI Agent 自動化 Metrics Scrape 修復版) +> **當前版本**: V10.12 (CD 健康檢查強化版) > **最後更新**: 2026-04-29 --- diff --git a/TODO_NEXT_STEPS.txt b/TODO_NEXT_STEPS.txt index 4c91a89..4b462f2 100644 --- a/TODO_NEXT_STEPS.txt +++ b/TODO_NEXT_STEPS.txt @@ -18,6 +18,7 @@ - Grafana 線上部署:188 active Grafana 已載入 4 個 dashboard,`MOMO AI Automation Overview` provisioning 成功。 - Prometheus scrape 修復:active monitoring stack 新增 `momo-app` scrape job,目標 `momo-pro-system:80/metrics`。 - Gunicorn preload 修復:`post_fork` 略過 Flask/Werkzeug request-bound LocalProxy,避免 worker boot fail。 + - CD 健康檢查強化:改為 internal container health + external `mo.wooo.work` 雙檢查,重試窗延長到約 3 分鐘。 【下次待辦】 - 觀察 Prometheus scrape 後 `momo_ai_*` 是否在事件發生後產生時間序列。 diff --git a/app.py b/app.py index eb4b916..2326cc2 100644 --- a/app.py +++ b/app.py @@ -95,8 +95,8 @@ except Exception as e: sys_log.error(f"無法檢測磁碟空間: {e}") # 🚩 系統版本定義 (備份與顯示用) -# 🚩 2026-04-30 V10.11: Gunicorn preload guard + AI metrics scrape -SYSTEM_VERSION = "V10.11" +# 🚩 2026-04-30 V10.12: CD health check internal/external hardening +SYSTEM_VERSION = "V10.12" # ========================================== # 🔒 SQL Injection 防護函數 diff --git a/config.py b/config.py index 9adcbb1..b604c05 100644 --- a/config.py +++ b/config.py @@ -253,7 +253,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '') # ========================================== # 系統版本與路徑 # ========================================== -SYSTEM_VERSION = "V10.11" +SYSTEM_VERSION = "V10.12" LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log') public_url = PUBLIC_URL # 用於模板顯示 diff --git a/docs/guides/devops_handbook.md b/docs/guides/devops_handbook.md index c333eb4..e086f59 100644 --- a/docs/guides/devops_handbook.md +++ b/docs/guides/devops_handbook.md @@ -63,6 +63,7 @@ - **原因**: 110 與 188 之間的 SSH 隧道中斷。 - **檢查**: 在 110 執行 `curl -I http://127.0.0.1:5003/health`。 - **修復**: 在 110 執行 `ssh -fN -L 5003:127.0.0.1:5003 ollama@192.168.0.188` 重啟隧道。 +- **CD 判斷**: 先確認 188 內部 `docker exec momo-pro-system curl http://127.0.0.1:80/health`,再看外部 `https://mo.wooo.work/health`;若 internal 已 200 但 external 502,多半是 Nginx/tunnel 短暫延遲。 ### 2. CI/CD 報錯 `parent snapshot ... not found` - **原因**: Docker Buildx 快取損壞。 diff --git a/docs/memory/ai_automation_closure_20260429.md b/docs/memory/ai_automation_closure_20260429.md index 1877a95..ca063c1 100644 --- a/docs/memory/ai_automation_closure_20260429.md +++ b/docs/memory/ai_automation_closure_20260429.md @@ -18,6 +18,7 @@ - 2026-04-30 active Grafana 已載入 4 個 dashboard;AI dashboard 檔案同步到 188 實際掛載目錄 `monitoring/grafana/provisioning/dashboards/json/`。 - 2026-04-30 active Prometheus 補 `momo-app` scrape job,目標 `momo-pro-system:80/metrics`;Prometheus 需加入 `momo-network` 才能解析 app container DNS。 - 2026-04-30 發現並修復 `gunicorn.conf.py` `post_fork` 掃到 Flask/Werkzeug LocalProxy 導致 worker boot fail 的問題。 +- 2026-04-30 CD 健康檢查曾因 rebuild 後短暫 502 太早失敗;已改為 internal `docker exec momo-pro-system /health` + external `https://mo.wooo.work/health` 雙檢查,重試約 3 分鐘。 ## 已落地範圍 @@ -48,6 +49,7 @@ - 2026-04-29 AI Grafana observability + AI core 回歸:`36 passed`,collect-only:`36 tests collected`。 - 2026-04-30 Gunicorn LocalProxy 修復:新增 `tests/test_gunicorn_config.py`。 - 2026-04-30 Prometheus scrape 修復:新增 `tests/test_prometheus_ai_automation_scrape.py`。 +- 2026-04-30 CD health check hardening:新增 `tests/test_cd_health_check.py`。 - 2026-04-29 L2 安全記憶批次:`24 passed`。 - collect-only:`48 tests collected`。 - `git diff --check` 已通過。 diff --git a/docs/memory/history_logs.md b/docs/memory/history_logs.md index 90fcf50..10e5e1f 100644 --- a/docs/memory/history_logs.md +++ b/docs/memory/history_logs.md @@ -31,6 +31,7 @@ - **Smoke 每日摘要推播**: 新增 Telegram 手動推播 API 與 momo-scheduler 每日 09:10 摘要任務,只讀 smoke history。 - **Grafana AI 觀測**: 新增 `MOMO AI Automation Overview` provisioning dashboard,覆蓋 EventRouter、safe action、replay、AutoHeal Prometheus 指標。 - **Grafana 線上載入與 scrape 修復**: 188 active Grafana 載入 4 dashboards;active Prometheus 補 `momo-app` scrape job,並修復 gunicorn preload LocalProxy boot crash。 +- **CD 健康檢查強化**: Gitea Actions health check 改為 internal container health + external URL 雙檢查,降低 rebuild 後短暫 502 誤判。 ### 2026-04-28~29:Phase 3e 重構大戰 + daily_sales cache 隱形 bug 根除 - **app.py 縮減 -10.8%**: 7,386 → 6,590 行,11 commits 全綠零 502。 diff --git a/tests/test_cd_health_check.py b/tests/test_cd_health_check.py new file mode 100644 index 0000000..8f823a1 --- /dev/null +++ b/tests/test_cd_health_check.py @@ -0,0 +1,22 @@ +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +CD_WORKFLOW = ROOT / ".gitea/workflows/cd.yaml" + + +def test_cd_health_check_allows_slow_rebuild_warmup(): + workflow = CD_WORKFLOW.read_text(encoding="utf-8") + + assert "等待服務啟動(30s)" in workflow + assert "seq 1 12" in workflow + assert "等待 15s" in workflow + + +def test_cd_health_check_validates_internal_and_external_health(): + workflow = CD_WORKFLOW.read_text(encoding="utf-8") + + assert "docker exec momo-pro-system curl" in workflow + assert "http://127.0.0.1:80/health" in workflow + assert "https://mo.wooo.work/health" in workflow + assert 'internal=$INTERNAL_CODE, external=$EXTERNAL_CODE' in workflow