From bd6310365edd6e950e77dd152ce6bce5bc4c3944 Mon Sep 17 00:00:00 2001 From: OoO Date: Wed, 13 May 2026 19:19:15 +0800 Subject: [PATCH] =?UTF-8?q?=E5=81=9C=E7=94=A8=20CICD=20=E8=88=8A=E5=8F=A2?= =?UTF-8?q?=E9=9B=86=E5=89=AF=E4=BD=9C=E7=94=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../claude_inventory_validation_20260513.md | 1 + routes/cicd_routes.py | 192 ++++-------------- services/code_review_pipeline_service.py | 2 +- services/ollama_service.py | 4 +- templates/cicd_dashboard.html | 14 +- tests/test_cicd_legacy_cluster_disabled.py | 84 ++++++++ tests/test_ollama_host_label.py | 6 +- 7 files changed, 134 insertions(+), 169 deletions(-) create mode 100644 tests/test_cicd_legacy_cluster_disabled.py diff --git a/docs/memory/claude_inventory_validation_20260513.md b/docs/memory/claude_inventory_validation_20260513.md index c535ded..4563fd0 100644 --- a/docs/memory/claude_inventory_validation_20260513.md +++ b/docs/memory/claude_inventory_validation_20260513.md @@ -64,6 +64,7 @@ - app.py/BP 路由雙寫已完成收斂:active `@app.route` 為 0,`USE_MODULAR_ROUTES` 與舊 routes registry shim 已不存在;`tests/test_phase3f_cleanup_contracts.py::test_app_py_stays_blueprint_only_for_routes` 會防止 route decorator 回到 `app.py`。 - `app.py` 開頭的歷史「重開機後請依序執行」TODO banner 已移除;入口治理以 `AGENTS.md` / `CONSTITUTION.md` / ADR / memory 索引為準,`tests/test_phase3f_cleanup_contracts.py::test_app_py_does_not_start_with_stale_restart_todo_banner` 會防止 stale 操作清單回流。 - `docs/guides/devops_handbook.md` 已移除活躍手冊中的舊 K8s command 區塊,並把 app 操作改為 Gunicorn HUP 熱重載、Docker Compose 精準 force-recreate、`momo-app` service build;`tests/test_phase3f_cleanup_contracts.py::test_devops_handbook_uses_current_docker_runtime_commands` 會防止 `kubectl`、`docker restart momo-pro-system` 與錯誤 compose service name 回流。 +- `/cicd` legacy dashboard 已停用舊叢集副作用:rollback 會回 410,`restart_pods` action 會被拒絕,diagnosis 不再 SSH 執行叢集探測;`tests/test_cicd_legacy_cluster_disabled.py` 會防止 `kubectl` / rollout command 回到 `routes/cicd_routes.py`。 - AI 觀測台 badge/chip 對比規範已補強:`.momo-observability-mode` 內的 badge、pill 與 nested surface 會改走亮底、8px radius、無負字距、可換行且不再殘留低對比 legacy dark-hero 樣式;CSS mirror、`quick_review.sh --observability-ui`、`quick_review.sh --observability-qa --skip-production` 均通過。 - AI 觀測台 rendered visual contract 已入庫:`scripts/check_observability_visual_contract.sh` 會用 Playwright 檢查 10 頁 × desktop/tablet/mobile 的 title typography、surface radius/background、chip contrast、hero height 與水平 overflow;V10.116 main 版 local server 驗證 30 項 PASS。測未部署變更時必須帶 localhost `--base-url`,打 production fail 可能只是正式站尚未部署該版。 - V10.117 已把 AI 觀測台背景語彙收斂為 tokenized dot-matrix:合約會要求 hero/signal/panel surface 的 computed `background-image` 是 `radial-gradient`,並拒絕 legacy `linear-gradient` / `background-image: none` 回流;終端 dot-matrix layer 必須留在 CSS 檔尾以贏過舊 neutralizer。 diff --git a/routes/cicd_routes.py b/routes/cicd_routes.py index d9d2a5d..de238b4 100644 --- a/routes/cicd_routes.py +++ b/routes/cicd_routes.py @@ -26,20 +26,19 @@ ERROR_PATTERNS = { 'auto_fixable': True, 'fix_action': 'restart_registry' }, - 'k8s_timeout': { + 'deploy_timeout': { 'pattern': r'(timed out|timeout|deadline exceeded)', - 'message': 'K8s 操作超時', + 'message': '部署操作超時', 'severity': 'warning', - 'fix_suggestion': '網路可能不穩定,請稍後重試或檢查 K8s 叢集狀態', + 'fix_suggestion': '網路可能不穩定,請稍後重試或檢查 Gitea/CD runner 與 188 Docker Compose 狀態', 'auto_fixable': False }, - 'pod_crash': { + 'runtime_crash': { 'pattern': r'(CrashLoopBackOff|OOMKilled|Error|ImagePullBackOff)', - 'message': 'Pod 異常', + 'message': '容器或舊叢集狀態異常', 'severity': 'critical', - 'fix_suggestion': '檢查 Pod 日誌,可嘗試重啟 Pod', - 'auto_fixable': True, - 'fix_action': 'restart_pods' + 'fix_suggestion': 'EwoooC 現行 runtime 是 188 Docker Compose;請依 DevOps 手冊檢查容器與 /health', + 'auto_fixable': False }, 'test_failed': { 'pattern': r'(pytest.*failed|test.*error|AssertionError)', @@ -102,7 +101,7 @@ ENVIRONMENTS = { 'icon': '🟦', 'url': 'https://mo.wooo.work', 'health_endpoint': 'https://mo.wooo.work/health', - 'k8s_host': '192.168.0.110' + 'runtime_host': '192.168.0.188' }, 'prod': { 'name': 'PROD', @@ -111,7 +110,7 @@ ENVIRONMENTS = { 'icon': '🟥', 'url': 'https://momo.wooo.work', 'health_endpoint': 'https://momo.wooo.work/health', - 'k8s_host': '35.194.233.141' + 'runtime_host': '192.168.0.188' } } @@ -171,21 +170,21 @@ def get_cicd_status(): 'error': env_status.get('error'), 'severity': 'critical', 'auto_fixable': True, - 'fix_action': 'restart_pods' + 'fix_action': 'diagnose' }) - # 檢查 Pod 問題 + # 舊 runtime 資訊若由歷史資料帶入,只允許診斷,不觸發重啟副作用。 for pod in env_status.get('pods', []): if not pod.get('healthy'): issues.append({ - 'type': 'pod', + 'type': 'runtime', 'environment': env_id, - 'message': f"Pod {pod.get('name')} 不健康", + 'message': f"服務 {pod.get('name')} 不健康", 'status': pod.get('status'), 'restarts': pod.get('restarts'), 'severity': 'warning' if pod.get('restarts', 0) < 5 else 'critical', 'auto_fixable': True, - 'fix_action': 'restart_pods' + 'fix_action': 'diagnose' }) # 檢查 Pipeline 問題 @@ -319,31 +318,22 @@ def trigger_rollback(): if env not in ENVIRONMENTS: return jsonify({'success': False, 'error': 'Unknown environment'}), 400 - # 執行 K8s 回滾 - try: - if env == 'uat': - result = execute_uat_rollback() - else: - result = execute_gcp_rollback() - - return jsonify({ - 'success': True, - 'message': f'{ENVIRONMENTS[env]["name"]} 回滾成功', - 'result': result - }) - except Exception as e: - return jsonify({ - 'success': False, - 'error': str(e) - }), 500 + return jsonify({ + 'success': False, + 'error': ( + '舊叢集回滾已停用。EwoooC 現行 runtime 是 188 Docker Compose;' + '請依 Gitea CD / docs/guides/deployment_sop.md 執行人工回滾。' + ) + }), 410 # ============================================================================= # 自動修復 API # ============================================================================= -# 只允許這幾種 fix_action,任何不在清單的請求直接 400 -ALLOWED_FIX_ACTIONS = frozenset({'restart_registry', 'restart_pods', 'diagnose', 'full_repair'}) +# 只允許這幾種 fix_action,任何不在清單的請求直接 400。 +# 舊叢集自動重啟已依 ADR-008/011 停用,避免誤打已撤除 runtime。 +ALLOWED_FIX_ACTIONS = frozenset({'restart_registry', 'diagnose', 'full_repair'}) @cicd_bp.route('/api/cicd/auto-fix', methods=['POST']) @@ -365,16 +355,13 @@ def trigger_auto_fix(): if fix_action == 'restart_registry': result = fix_registry() results.append(result) - elif fix_action == 'restart_pods': - result = fix_pods(env) - results.append(result) elif fix_action == 'diagnose': result = run_diagnosis(env) results.append(result) elif fix_action == 'full_repair': - # 完整修復流程 + # 完整修復保留 registry best-effort;runtime 只診斷,不做舊叢集重啟。 results.append(fix_registry()) - results.append(fix_pods(env)) + results.append(run_diagnosis(env)) # 發送 Telegram 通知 send_fix_notification(env, fix_action, results) @@ -432,38 +419,10 @@ def fix_registry(): return {'action': 'restart_registry', 'success': False, 'error': str(e)} -def fix_pods(env): - """重啟 Pod 修復問題""" - try: - if env == 'uat': - cmd = "kubectl rollout restart deployment/momo-app deployment/momo-scheduler -n momo" - result = subprocess.run( - ['ssh', '-o', 'StrictHostKeyChecking=no', '-o', 'ConnectTimeout=5', - 'wooo@192.168.0.110', cmd], - capture_output=True, text=True, timeout=60 - ) - else: - cmd = "sudo kubectl rollout restart deployment/momo-app deployment/momo-scheduler -n momo" - result = subprocess.run( - ['ssh', '-o', 'StrictHostKeyChecking=no', '-o', 'ConnectTimeout=10', - 'ogt@35.194.233.141', cmd], - capture_output=True, text=True, timeout=60 - ) - - return { - 'action': 'restart_pods', - 'environment': env, - 'success': result.returncode == 0, - 'output': result.stdout, - 'error': result.stderr if result.returncode != 0 else None - } - except Exception as e: - return {'action': 'restart_pods', 'success': False, 'error': str(e)} - - def run_diagnosis(env): """執行環境診斷""" diagnosis = { + 'action': 'diagnose', 'environment': env, 'timestamp': datetime.now().isoformat(), 'checks': [] @@ -488,15 +447,12 @@ def run_diagnosis(env): 'error': str(e) }) - # 檢查 Pod 狀態 - pods = get_k8s_pods_status(env) - unhealthy_pods = [p for p in pods if not p.get('healthy')] + # EwoooC 已撤除舊叢集 runtime,這裡只保留現行 Docker Compose 狀態說明。 diagnosis['checks'].append({ - 'name': 'Pod 狀態', - 'status': 'ok' if not unhealthy_pods else 'warning', - 'total_pods': len(pods), - 'unhealthy_pods': len(unhealthy_pods), - 'details': unhealthy_pods + 'name': 'Runtime 狀態', + 'status': 'ok', + 'runtime': 'Docker Compose on 192.168.0.188', + 'details': '舊叢集探測已停用;容器狀態請依 DevOps 手冊在 188 查 docker compose / /health。' }) # 檢查 Registry (僅 UAT) @@ -531,8 +487,8 @@ def run_diagnosis(env): for check in failed_checks: if check['name'] == '健康端點': diagnosis['summary']['recommendations'].append({ - 'action': 'restart_pods', - 'description': '建議重啟應用 Pod' + 'action': 'diagnose', + 'description': '先診斷健康端點與 188 Docker Compose 狀態,避免自動重啟資料庫或舊叢集' }) elif check['name'] == 'Registry': diagnosis['summary']['recommendations'].append({ @@ -540,13 +496,6 @@ def run_diagnosis(env): 'description': '建議重啟 Registry 服務' }) - for check in warning_checks: - if check['name'] == 'Pod 狀態': - diagnosis['summary']['recommendations'].append({ - 'action': 'restart_pods', - 'description': f'有 {check["unhealthy_pods"]} 個 Pod 不健康' - }) - except Exception as e: diagnosis['error'] = str(e) @@ -674,7 +623,7 @@ def gitlab_api(endpoint, method='GET', data=None): def gitlab_api_via_ssh(endpoint, method='GET', data=None): """ - 透過 SSH 在主機上呼叫 GitLab API(當 Pod 無法直接連接時)。 + 透過 SSH 在主機上呼叫 GitLab API(當 app 無法直接連接時)。 Security: curl 參數以 list 形式傳給 subprocess,避免 shell injection。 endpoint 和 json_data 均作為獨立 argv 傳入,不經過 shell 解析。 @@ -920,80 +869,11 @@ def get_environment_status(env_id): except Exception as e: status['error'] = str(e) - # 取得 K8s Pod 狀態 (僅 UAT,GCP 需要特殊處理) - if env_id == 'uat': - status['pods'] = get_k8s_pods_status('uat') - elif env_id == 'prod': - status['pods'] = get_k8s_pods_status('prod') + status['runtime_note'] = 'Docker Compose on 192.168.0.188; legacy cluster probes disabled.' return status -def get_k8s_pods_status(env): - """取得 K8s Pod 狀態""" - pods = [] - - try: - if env == 'uat': - cmd = "kubectl get pods -n momo -o json" - result = subprocess.run( - ['ssh', '-o', 'StrictHostKeyChecking=no', '-o', 'ConnectTimeout=5', - 'wooo@192.168.0.110', cmd], - capture_output=True, text=True, timeout=15 - ) - else: # prod/gcp - cmd = "sudo kubectl get pods -n momo -o json" - result = subprocess.run( - ['ssh', '-o', 'StrictHostKeyChecking=no', '-o', 'ConnectTimeout=10', - 'ogt@35.194.233.141', cmd], - capture_output=True, text=True, timeout=20 - ) - - if result.returncode == 0: - data = json.loads(result.stdout) - for item in data.get('items', []): - metadata = item.get('metadata', {}) - status_obj = item.get('status', {}) - - # 計算 Ready 狀態 - containers = status_obj.get('containerStatuses', []) - ready_count = sum(1 for c in containers if c.get('ready', False)) - total_count = len(containers) - - pods.append({ - 'name': metadata.get('name'), - 'status': status_obj.get('phase'), - 'ready': f"{ready_count}/{total_count}", - 'restarts': sum(c.get('restartCount', 0) for c in containers), - 'age': calculate_age(metadata.get('creationTimestamp')), - 'healthy': status_obj.get('phase') == 'Running' and ready_count == total_count - }) - except Exception as e: - print(f"Error getting K8s pods for {env}: {e}") - - return pods - - -def execute_uat_rollback(): - """執行 UAT 回滾""" - cmd = "kubectl rollout undo deployment/momo-app deployment/momo-scheduler -n momo" - result = subprocess.run( - ['ssh', '-o', 'StrictHostKeyChecking=no', 'wooo@192.168.0.110', cmd], - capture_output=True, text=True, timeout=30 - ) - return {'stdout': result.stdout, 'stderr': result.stderr, 'returncode': result.returncode} - - -def execute_gcp_rollback(): - """執行 GCP 回滾""" - cmd = "sudo kubectl rollout undo deployment/momo-app deployment/momo-scheduler -n momo" - result = subprocess.run( - ['ssh', '-o', 'StrictHostKeyChecking=no', 'ogt@35.194.233.141', cmd], - capture_output=True, text=True, timeout=30 - ) - return {'stdout': result.stdout, 'stderr': result.stderr, 'returncode': result.returncode} - - # ============================================================================= # 輔助函數 - 通用 # ============================================================================= diff --git a/services/code_review_pipeline_service.py b/services/code_review_pipeline_service.py index 9637e6b..df19dca 100644 --- a/services/code_review_pipeline_service.py +++ b/services/code_review_pipeline_service.py @@ -229,7 +229,7 @@ class CodeReviewPipeline: # ADR-027 Phase 2 N3:lazy resolve Hermes 主機(GCP 優先 / 111 備援), # 避開 import-time freeze。Phase 53:改用 services.ollama_service.get_provider_tag - # 統一函式,支援 K8s Nginx Proxy(110:11435/11436)。 + # 統一函式,支援 110 Nginx Proxy(11435/11436)。 hermes_url = get_hermes_url() from services.ollama_service import get_provider_tag provider_tag = get_provider_tag(hermes_url) diff --git a/services/ollama_service.py b/services/ollama_service.py index 439d4e2..6d7aa64 100644 --- a/services/ollama_service.py +++ b/services/ollama_service.py @@ -167,7 +167,7 @@ def resolve_ollama_host(primary: str = OLLAMA_HOST_PRIMARY, def get_host_label(host: str) -> str: """將 IP/URL 轉換為易讀的主機標籤 - Phase 53:支援 K8s 環境的 Nginx Proxy(110:11435/11436 → GCP)。 + Phase 53:支援 110 Nginx Proxy(11435/11436 → GCP)。 判斷順序:直連 GCP IP > Nginx 轉發 port > 內網 IP > 本地。 """ if not host: @@ -177,7 +177,7 @@ def get_host_label(host: str) -> str: return "GCP-SSD" if "34.21.145.224" in host: return "GCP-SSD-2" - # Nginx Proxy 轉發(K8s 環境,110 跳板代理 GCP) + # Nginx Proxy 轉發(110 跳板代理 GCP) if "192.168.0.110:11435" in host: return "GCP-SSD(via Nginx 110)" if "192.168.0.110:11436" in host: diff --git a/templates/cicd_dashboard.html b/templates/cicd_dashboard.html index c4ec589..c276578 100644 --- a/templates/cicd_dashboard.html +++ b/templates/cicd_dashboard.html @@ -700,7 +700,7 @@
${escapeHtml(issue.message)}
${issue.type === 'job' ? `${issue.stage}` : ''} - ${issue.type === 'pod' ? `${issue.environment?.toUpperCase()}` : ''} + ${issue.type === 'runtime' ? `${issue.environment?.toUpperCase()}` : ''} ${issue.error ? `
${escapeHtml(issue.error.substring(0, 100))}` : ''}
${issue.fix_suggestion ? `
💡 ${escapeHtml(issue.fix_suggestion)}
` : ''} @@ -925,14 +925,14 @@
❌ 連線錯誤
${escapeHtml(env.error)}
-
` : ''}
- Pods 狀態: + Runtime 狀態: ${renderPods(env.pods, envId)}
@@ -943,10 +943,10 @@ container.innerHTML = html; } - // 渲染 Pods 狀態 + // 渲染 runtime 狀態 function renderPods(pods, envId) { if (!pods || pods.length === 0) { - return '

無法取得 Pod 資訊

'; + return '

Docker Compose runtime;舊叢集資訊不適用

'; } return pods.map(pod => ` @@ -1089,7 +1089,7 @@ return; } - if (!confirm(`確定要對 ${env.toUpperCase()} 執行完整修復嗎?\n這將重啟 Registry 和所有 Pod。`)) return; + if (!confirm(`確定要對 ${env.toUpperCase()} 執行完整修復嗎?\n這會重啟 Registry 並執行 runtime 診斷,不會重啟舊叢集。`)) return; showNotification('執行中', '正在執行完整修復...'); diff --git a/tests/test_cicd_legacy_cluster_disabled.py b/tests/test_cicd_legacy_cluster_disabled.py new file mode 100644 index 0000000..d8891d5 --- /dev/null +++ b/tests/test_cicd_legacy_cluster_disabled.py @@ -0,0 +1,84 @@ +from pathlib import Path + +from flask import Flask + + +ROOT = Path(__file__).resolve().parents[1] + + +def _client(): + from routes.cicd_routes import cicd_bp + + app = Flask(__name__, template_folder=str(ROOT / "templates")) + app.register_blueprint(cicd_bp) + return app.test_client() + + +def test_cicd_routes_do_not_execute_legacy_cluster_commands(): + source = (ROOT / "routes" / "cicd_routes.py").read_text(encoding="utf-8") + + assert "kubectl" not in source + assert "rollout restart" not in source + assert "rollout undo" not in source + assert "get_k8s_pods_status" not in source + assert "execute_uat_rollback" not in source + assert "execute_gcp_rollback" not in source + + +def test_cicd_restart_pods_action_is_rejected(monkeypatch): + from routes import cicd_routes + + def fail_if_called(*args, **kwargs): + raise AssertionError("legacy cluster subprocess should not be called") + + monkeypatch.setattr(cicd_routes.subprocess, "run", fail_if_called) + + response = _client().post( + "/api/cicd/auto-fix", + json={"action": "restart_pods", "environment": "uat"}, + ) + + assert response.status_code == 400 + assert response.get_json()["success"] is False + + +def test_cicd_rollback_is_disabled(monkeypatch): + from routes import cicd_routes + + def fail_if_called(*args, **kwargs): + raise AssertionError("legacy cluster subprocess should not be called") + + monkeypatch.setattr(cicd_routes.subprocess, "run", fail_if_called) + + response = _client().post("/api/cicd/rollback", json={"environment": "uat"}) + + assert response.status_code == 410 + payload = response.get_json() + assert payload["success"] is False + assert "Docker Compose" in payload["error"] + + +def test_cicd_diagnosis_uses_runtime_note_without_cluster_probe(monkeypatch): + from routes import cicd_routes + + class FakeResponse: + status_code = 200 + elapsed = type("Elapsed", (), {"total_seconds": lambda self: 0.05})() + + def json(self): + return {"status": "healthy", "version": "test"} + + def fake_get(*args, **kwargs): + return FakeResponse() + + def fail_if_called(*args, **kwargs): + raise AssertionError("legacy cluster subprocess should not be called") + + monkeypatch.setattr(cicd_routes.requests, "get", fake_get) + monkeypatch.setattr(cicd_routes.subprocess, "run", fail_if_called) + + diagnosis = cicd_routes.run_diagnosis("uat") + + runtime_checks = [check for check in diagnosis["checks"] if check["name"] == "Runtime 狀態"] + assert runtime_checks + assert runtime_checks[0]["runtime"] == "Docker Compose on 192.168.0.188" diff --git a/tests/test_ollama_host_label.py b/tests/test_ollama_host_label.py index 0e51e8d..e8c5917 100644 --- a/tests/test_ollama_host_label.py +++ b/tests/test_ollama_host_label.py @@ -1,6 +1,6 @@ """Phase 54 — get_host_label / get_provider_tag 測試覆蓋(P53 新增 fn). -P53 commit 7a10d27 加了 K8s Nginx Proxy 路由判斷(192.168.0.110:11435/11436) +P53 commit 7a10d27 加了 110 Nginx Proxy 路由判斷(192.168.0.110:11435/11436) 但無單測,未來改 IP / 加新 provider 容易破而不自知。 """ from __future__ import annotations @@ -28,11 +28,11 @@ class TestGetHostLabel: assert get_host_label('http://34.21.145.224:11434') == 'GCP-SSD-2' def test_nginx_proxy_gcp_primary(self): - """K8s 環境經 110 跳板 Nginx 轉發 GCP-A""" + """經 110 跳板 Nginx 轉發 GCP-A""" assert get_host_label('http://192.168.0.110:11435') == 'GCP-SSD(via Nginx 110)' def test_nginx_proxy_gcp_secondary(self): - """K8s 環境經 110 跳板 Nginx 轉發 GCP-B""" + """經 110 跳板 Nginx 轉發 GCP-B""" assert get_host_label('http://192.168.0.110:11436') == 'GCP-SSD-2(via Nginx 110)' def test_111_fallback(self):