停用 CICD 舊叢集副作用
Some checks failed
CD Pipeline / deploy (push) Has been cancelled

This commit is contained in:
OoO
2026-05-13 19:19:15 +08:00
parent c5d077ff77
commit bd6310365e
7 changed files with 134 additions and 169 deletions

View File

@@ -64,6 +64,7 @@
- app.py/BP 路由雙寫已完成收斂:active `@app.route` 為 0,`USE_MODULAR_ROUTES` 與舊 routes registry shim 已不存在;`tests/test_phase3f_cleanup_contracts.py::test_app_py_stays_blueprint_only_for_routes` 會防止 route decorator 回到 `app.py`
- `app.py` 開頭的歷史「重開機後請依序執行」TODO banner 已移除;入口治理以 `AGENTS.md` / `CONSTITUTION.md` / ADR / memory 索引為準,`tests/test_phase3f_cleanup_contracts.py::test_app_py_does_not_start_with_stale_restart_todo_banner` 會防止 stale 操作清單回流。
- `docs/guides/devops_handbook.md` 已移除活躍手冊中的舊 K8s command 區塊,並把 app 操作改為 Gunicorn HUP 熱重載、Docker Compose 精準 force-recreate、`momo-app` service build;`tests/test_phase3f_cleanup_contracts.py::test_devops_handbook_uses_current_docker_runtime_commands` 會防止 `kubectl`、`docker restart momo-pro-system` 與錯誤 compose service name 回流。
- `/cicd` legacy dashboard 已停用舊叢集副作用:rollback 會回 410,`restart_pods` action 會被拒絕,diagnosis 不再 SSH 執行叢集探測;`tests/test_cicd_legacy_cluster_disabled.py` 會防止 `kubectl` / rollout command 回到 `routes/cicd_routes.py`
- AI 觀測台 badge/chip 對比規範已補強:`.momo-observability-mode` 內的 badge、pill 與 nested surface 會改走亮底、8px radius、無負字距、可換行且不再殘留低對比 legacy dark-hero 樣式;CSS mirror、`quick_review.sh --observability-ui`、`quick_review.sh --observability-qa --skip-production` 均通過。
- AI 觀測台 rendered visual contract 已入庫:`scripts/check_observability_visual_contract.sh` 會用 Playwright 檢查 10 頁 × desktop/tablet/mobile 的 title typography、surface radius/background、chip contrast、hero height 與水平 overflow;V10.116 main 版 local server 驗證 30 項 PASS。測未部署變更時必須帶 localhost `--base-url`,打 production fail 可能只是正式站尚未部署該版。
- V10.117 已把 AI 觀測台背景語彙收斂為 tokenized dot-matrix;合約會要求 hero/signal/panel surface 的 computed `background-image` 含 `radial-gradient`,並拒絕 legacy `linear-gradient` / `background-image: none` 回流;終端 dot-matrix layer 必須留在 CSS 檔尾以贏過舊 neutralizer。

View File

@@ -26,20 +26,19 @@ ERROR_PATTERNS = {
'auto_fixable': True,
'fix_action': 'restart_registry'
},
'k8s_timeout': {
'deploy_timeout': {
'pattern': r'(timed out|timeout|deadline exceeded)',
'message': 'K8s 操作超時',
'message': '部署操作超時',
'severity': 'warning',
'fix_suggestion': '網路可能不穩定,請稍後重試或檢查 K8s 叢集狀態',
'fix_suggestion': '網路可能不穩定,請稍後重試或檢查 Gitea/CD runner 與 188 Docker Compose 狀態',
'auto_fixable': False
},
'pod_crash': {
'runtime_crash': {
'pattern': r'(CrashLoopBackOff|OOMKilled|Error|ImagePullBackOff)',
'message': 'Pod 異常',
'message': '容器或舊叢集狀態異常',
'severity': 'critical',
'fix_suggestion': '檢查 Pod 日誌,可嘗試重啟 Pod',
'auto_fixable': True,
'fix_action': 'restart_pods'
'fix_suggestion': 'EwoooC 現行 runtime 是 188 Docker Compose請依 DevOps 手冊檢查容器與 /health',
'auto_fixable': False
},
'test_failed': {
'pattern': r'(pytest.*failed|test.*error|AssertionError)',
@@ -102,7 +101,7 @@ ENVIRONMENTS = {
'icon': '🟦',
'url': 'https://mo.wooo.work',
'health_endpoint': 'https://mo.wooo.work/health',
'k8s_host': '192.168.0.110'
'runtime_host': '192.168.0.188'
},
'prod': {
'name': 'PROD',
@@ -111,7 +110,7 @@ ENVIRONMENTS = {
'icon': '🟥',
'url': 'https://momo.wooo.work',
'health_endpoint': 'https://momo.wooo.work/health',
'k8s_host': '35.194.233.141'
'runtime_host': '192.168.0.188'
}
}
@@ -171,21 +170,21 @@ def get_cicd_status():
'error': env_status.get('error'),
'severity': 'critical',
'auto_fixable': True,
'fix_action': 'restart_pods'
'fix_action': 'diagnose'
})
# 檢查 Pod 問題
# 舊 runtime 資訊若由歷史資料帶入,只允許診斷,不觸發重啟副作用。
for pod in env_status.get('pods', []):
if not pod.get('healthy'):
issues.append({
'type': 'pod',
'type': 'runtime',
'environment': env_id,
'message': f"Pod {pod.get('name')} 不健康",
'message': f"服務 {pod.get('name')} 不健康",
'status': pod.get('status'),
'restarts': pod.get('restarts'),
'severity': 'warning' if pod.get('restarts', 0) < 5 else 'critical',
'auto_fixable': True,
'fix_action': 'restart_pods'
'fix_action': 'diagnose'
})
# 檢查 Pipeline 問題
@@ -319,31 +318,22 @@ def trigger_rollback():
if env not in ENVIRONMENTS:
return jsonify({'success': False, 'error': 'Unknown environment'}), 400
# 執行 K8s 回滾
try:
if env == 'uat':
result = execute_uat_rollback()
else:
result = execute_gcp_rollback()
return jsonify({
'success': True,
'message': f'{ENVIRONMENTS[env]["name"]} 回滾成功',
'result': result
})
except Exception as e:
return jsonify({
'success': False,
'error': str(e)
}), 500
return jsonify({
'success': False,
'error': (
'舊叢集回滾已停用。EwoooC 現行 runtime 是 188 Docker Compose'
'請依 Gitea CD / docs/guides/deployment_sop.md 執行人工回滾。'
)
}), 410
# =============================================================================
# 自動修復 API
# =============================================================================
# 只允許這幾種 fix_action任何不在清單的請求直接 400
ALLOWED_FIX_ACTIONS = frozenset({'restart_registry', 'restart_pods', 'diagnose', 'full_repair'})
# 只允許這幾種 fix_action任何不在清單的請求直接 400
# 舊叢集自動重啟已依 ADR-008/011 停用,避免誤打已撤除 runtime。
ALLOWED_FIX_ACTIONS = frozenset({'restart_registry', 'diagnose', 'full_repair'})
@cicd_bp.route('/api/cicd/auto-fix', methods=['POST'])
@@ -365,16 +355,13 @@ def trigger_auto_fix():
if fix_action == 'restart_registry':
result = fix_registry()
results.append(result)
elif fix_action == 'restart_pods':
result = fix_pods(env)
results.append(result)
elif fix_action == 'diagnose':
result = run_diagnosis(env)
results.append(result)
elif fix_action == 'full_repair':
# 完整修復流程
# 完整修復保留 registry best-effortruntime 只診斷,不做舊叢集重啟。
results.append(fix_registry())
results.append(fix_pods(env))
results.append(run_diagnosis(env))
# 發送 Telegram 通知
send_fix_notification(env, fix_action, results)
@@ -432,38 +419,10 @@ def fix_registry():
return {'action': 'restart_registry', 'success': False, 'error': str(e)}
def fix_pods(env):
"""重啟 Pod 修復問題"""
try:
if env == 'uat':
cmd = "kubectl rollout restart deployment/momo-app deployment/momo-scheduler -n momo"
result = subprocess.run(
['ssh', '-o', 'StrictHostKeyChecking=no', '-o', 'ConnectTimeout=5',
'wooo@192.168.0.110', cmd],
capture_output=True, text=True, timeout=60
)
else:
cmd = "sudo kubectl rollout restart deployment/momo-app deployment/momo-scheduler -n momo"
result = subprocess.run(
['ssh', '-o', 'StrictHostKeyChecking=no', '-o', 'ConnectTimeout=10',
'ogt@35.194.233.141', cmd],
capture_output=True, text=True, timeout=60
)
return {
'action': 'restart_pods',
'environment': env,
'success': result.returncode == 0,
'output': result.stdout,
'error': result.stderr if result.returncode != 0 else None
}
except Exception as e:
return {'action': 'restart_pods', 'success': False, 'error': str(e)}
def run_diagnosis(env):
"""執行環境診斷"""
diagnosis = {
'action': 'diagnose',
'environment': env,
'timestamp': datetime.now().isoformat(),
'checks': []
@@ -488,15 +447,12 @@ def run_diagnosis(env):
'error': str(e)
})
# 檢查 Pod 狀態
pods = get_k8s_pods_status(env)
unhealthy_pods = [p for p in pods if not p.get('healthy')]
# EwoooC 已撤除舊叢集 runtime這裡只保留現行 Docker Compose 狀態說明。
diagnosis['checks'].append({
'name': 'Pod 狀態',
'status': 'ok' if not unhealthy_pods else 'warning',
'total_pods': len(pods),
'unhealthy_pods': len(unhealthy_pods),
'details': unhealthy_pods
'name': 'Runtime 狀態',
'status': 'ok',
'runtime': 'Docker Compose on 192.168.0.188',
'details': '舊叢集探測已停用;容器狀態請依 DevOps 手冊在 188 查 docker compose / /health。'
})
# 檢查 Registry (僅 UAT)
@@ -531,8 +487,8 @@ def run_diagnosis(env):
for check in failed_checks:
if check['name'] == '健康端點':
diagnosis['summary']['recommendations'].append({
'action': 'restart_pods',
'description': '建議重啟應用 Pod'
'action': 'diagnose',
'description': '先診斷健康端點與 188 Docker Compose 狀態,避免自動重啟資料庫或舊叢集'
})
elif check['name'] == 'Registry':
diagnosis['summary']['recommendations'].append({
@@ -540,13 +496,6 @@ def run_diagnosis(env):
'description': '建議重啟 Registry 服務'
})
for check in warning_checks:
if check['name'] == 'Pod 狀態':
diagnosis['summary']['recommendations'].append({
'action': 'restart_pods',
'description': f'{check["unhealthy_pods"]} 個 Pod 不健康'
})
except Exception as e:
diagnosis['error'] = str(e)
@@ -674,7 +623,7 @@ def gitlab_api(endpoint, method='GET', data=None):
def gitlab_api_via_ssh(endpoint, method='GET', data=None):
"""
透過 SSH 在主機上呼叫 GitLab APIPod 無法直接連接時)。
透過 SSH 在主機上呼叫 GitLab APIapp 無法直接連接時)。
Security: curl 參數以 list 形式傳給 subprocess避免 shell injection。
endpoint 和 json_data 均作為獨立 argv 傳入,不經過 shell 解析。
@@ -920,80 +869,11 @@ def get_environment_status(env_id):
except Exception as e:
status['error'] = str(e)
# 取得 K8s Pod 狀態 (僅 UATGCP 需要特殊處理)
if env_id == 'uat':
status['pods'] = get_k8s_pods_status('uat')
elif env_id == 'prod':
status['pods'] = get_k8s_pods_status('prod')
status['runtime_note'] = 'Docker Compose on 192.168.0.188; legacy cluster probes disabled.'
return status
def get_k8s_pods_status(env):
"""取得 K8s Pod 狀態"""
pods = []
try:
if env == 'uat':
cmd = "kubectl get pods -n momo -o json"
result = subprocess.run(
['ssh', '-o', 'StrictHostKeyChecking=no', '-o', 'ConnectTimeout=5',
'wooo@192.168.0.110', cmd],
capture_output=True, text=True, timeout=15
)
else: # prod/gcp
cmd = "sudo kubectl get pods -n momo -o json"
result = subprocess.run(
['ssh', '-o', 'StrictHostKeyChecking=no', '-o', 'ConnectTimeout=10',
'ogt@35.194.233.141', cmd],
capture_output=True, text=True, timeout=20
)
if result.returncode == 0:
data = json.loads(result.stdout)
for item in data.get('items', []):
metadata = item.get('metadata', {})
status_obj = item.get('status', {})
# 計算 Ready 狀態
containers = status_obj.get('containerStatuses', [])
ready_count = sum(1 for c in containers if c.get('ready', False))
total_count = len(containers)
pods.append({
'name': metadata.get('name'),
'status': status_obj.get('phase'),
'ready': f"{ready_count}/{total_count}",
'restarts': sum(c.get('restartCount', 0) for c in containers),
'age': calculate_age(metadata.get('creationTimestamp')),
'healthy': status_obj.get('phase') == 'Running' and ready_count == total_count
})
except Exception as e:
print(f"Error getting K8s pods for {env}: {e}")
return pods
def execute_uat_rollback():
"""執行 UAT 回滾"""
cmd = "kubectl rollout undo deployment/momo-app deployment/momo-scheduler -n momo"
result = subprocess.run(
['ssh', '-o', 'StrictHostKeyChecking=no', 'wooo@192.168.0.110', cmd],
capture_output=True, text=True, timeout=30
)
return {'stdout': result.stdout, 'stderr': result.stderr, 'returncode': result.returncode}
def execute_gcp_rollback():
"""執行 GCP 回滾"""
cmd = "sudo kubectl rollout undo deployment/momo-app deployment/momo-scheduler -n momo"
result = subprocess.run(
['ssh', '-o', 'StrictHostKeyChecking=no', 'ogt@35.194.233.141', cmd],
capture_output=True, text=True, timeout=30
)
return {'stdout': result.stdout, 'stderr': result.stderr, 'returncode': result.returncode}
# =============================================================================
# 輔助函數 - 通用
# =============================================================================

View File

@@ -229,7 +229,7 @@ class CodeReviewPipeline:
# ADR-027 Phase 2 N3lazy resolve Hermes 主機GCP 優先 / 111 備援),
# 避開 import-time freeze。Phase 53改用 services.ollama_service.get_provider_tag
# 統一函式,支援 K8s Nginx Proxy110:11435/11436
# 統一函式,支援 110 Nginx Proxy11435/11436
hermes_url = get_hermes_url()
from services.ollama_service import get_provider_tag
provider_tag = get_provider_tag(hermes_url)

View File

@@ -167,7 +167,7 @@ def resolve_ollama_host(primary: str = OLLAMA_HOST_PRIMARY,
def get_host_label(host: str) -> str:
"""將 IP/URL 轉換為易讀的主機標籤
Phase 53支援 K8s 環境的 Nginx Proxy110:11435/11436 → GCP
Phase 53支援 110 Nginx Proxy11435/11436 → GCP
判斷順序:直連 GCP IP > Nginx 轉發 port > 內網 IP > 本地。
"""
if not host:
@@ -177,7 +177,7 @@ def get_host_label(host: str) -> str:
return "GCP-SSD"
if "34.21.145.224" in host:
return "GCP-SSD-2"
# Nginx Proxy 轉發(K8s 環境,110 跳板代理 GCP
# Nginx Proxy 轉發110 跳板代理 GCP
if "192.168.0.110:11435" in host:
return "GCP-SSDvia Nginx 110"
if "192.168.0.110:11436" in host:

View File

@@ -700,7 +700,7 @@
<div class="issue-title">${escapeHtml(issue.message)}</div>
<div class="issue-detail">
${issue.type === 'job' ? `<span class="badge bg-secondary me-1">${issue.stage}</span>` : ''}
${issue.type === 'pod' ? `<span class="badge bg-info me-1">${issue.environment?.toUpperCase()}</span>` : ''}
${issue.type === 'runtime' ? `<span class="badge bg-info me-1">${issue.environment?.toUpperCase()}</span>` : ''}
${issue.error ? `<br><code>${escapeHtml(issue.error.substring(0, 100))}</code>` : ''}
</div>
${issue.fix_suggestion ? `<div class="issue-suggestion">💡 ${escapeHtml(issue.fix_suggestion)}</div>` : ''}
@@ -925,14 +925,14 @@
<div class="env-error">
<div class="env-error-title">❌ 連線錯誤</div>
<div class="env-error-detail">${escapeHtml(env.error)}</div>
<button class="btn-fix mt-2" onclick="triggerAutoFix('restart_pods', '${envId}')">
<i class="bi bi-arrow-clockwise me-1"></i>重啟服務
<button class="btn-fix mt-2" onclick="triggerAutoFix('diagnose', '${envId}')">
<i class="bi bi-search me-1"></i>診斷服務
</button>
</div>
` : ''}
<div class="env-details mt-2">
<strong class="d-block mb-2">Pods 狀態:</strong>
<strong class="d-block mb-2">Runtime 狀態:</strong>
${renderPods(env.pods, envId)}
</div>
</div>
@@ -943,10 +943,10 @@
container.innerHTML = html;
}
// 渲染 Pods 狀態
// 渲染 runtime 狀態
function renderPods(pods, envId) {
if (!pods || pods.length === 0) {
return '<p class="text-muted small mb-0">無法取得 Pod 資訊</p>';
return '<p class="text-muted small mb-0">Docker Compose runtime舊叢集資訊不適用</p>';
}
return pods.map(pod => `
@@ -1089,7 +1089,7 @@
return;
}
if (!confirm(`確定要對 ${env.toUpperCase()} 執行完整修復嗎?\n重啟 Registry 和所有 Pod`)) return;
if (!confirm(`確定要對 ${env.toUpperCase()} 執行完整修復嗎?\n重啟 Registry 並執行 runtime 診斷,不會重啟舊叢集`)) return;
showNotification('執行中', '正在執行完整修復...');

View File

@@ -0,0 +1,84 @@
from pathlib import Path
from flask import Flask
ROOT = Path(__file__).resolve().parents[1]
def _client():
    """Return a test client for a minimal Flask app that hosts only the CICD blueprint.

    The app's template folder points at ``ROOT / "templates"``.
    """
    # Imported inside the function, so the blueprint module is loaded only
    # when a client is actually requested.
    from routes.cicd_routes import cicd_bp

    template_dir = str(ROOT / "templates")
    flask_app = Flask(__name__, template_folder=template_dir)
    flask_app.register_blueprint(cicd_bp)
    return flask_app.test_client()
def test_cicd_routes_do_not_execute_legacy_cluster_commands():
    """Fail if legacy cluster commands or helper names appear in routes/cicd_routes.py.

    This is a plain-text scan of the module source, so a match inside a
    comment or string literal also fails the test.
    """
    routes_path = ROOT / "routes" / "cicd_routes.py"
    source = routes_path.read_text(encoding="utf-8")

    # Checked in this order; the first token found in the source fails the test.
    forbidden_tokens = (
        "kubectl",
        "rollout restart",
        "rollout undo",
        "get_k8s_pods_status",
        "execute_uat_rollback",
        "execute_gcp_rollback",
    )
    for token in forbidden_tokens:
        assert token not in source
def test_cicd_restart_pods_action_is_rejected(monkeypatch):
    """Posting the ``restart_pods`` action to the auto-fix API must return HTTP 400.

    ``subprocess.run`` on the routes module is replaced with a function that
    raises, so any attempt to shell out during the request surfaces as an error.
    """
    from routes import cicd_routes

    def _forbid_subprocess(*args, **kwargs):
        raise AssertionError("legacy cluster subprocess should not be called")

    monkeypatch.setattr(cicd_routes.subprocess, "run", _forbid_subprocess)

    request_body = {"action": "restart_pods", "environment": "uat"}
    response = _client().post("/api/cicd/auto-fix", json=request_body)

    assert response.status_code == 400
    assert response.get_json()["success"] is False
def test_cicd_rollback_is_disabled(monkeypatch):
    """The rollback API must answer HTTP 410 with an error that mentions Docker Compose.

    ``subprocess.run`` on the routes module is replaced with a function that
    raises, so any attempt to shell out during the request surfaces as an error.
    """
    from routes import cicd_routes

    def _forbid_subprocess(*args, **kwargs):
        raise AssertionError("legacy cluster subprocess should not be called")

    monkeypatch.setattr(cicd_routes.subprocess, "run", _forbid_subprocess)

    client = _client()
    response = client.post("/api/cicd/rollback", json={"environment": "uat"})

    assert response.status_code == 410
    body = response.get_json()
    assert body["success"] is False
    assert "Docker Compose" in body["error"]
def test_cicd_diagnosis_uses_runtime_note_without_cluster_probe(monkeypatch):
    """``run_diagnosis("uat")`` must include a runtime check naming the Docker Compose host.

    ``requests.get`` is stubbed with a healthy fake response, and
    ``subprocess.run`` is replaced with a function that raises.
    """
    from routes import cicd_routes

    class Elapsed:
        # Stand-in for the elapsed attribute of a response: fixed 50 ms duration.
        def total_seconds(self):
            return 0.05

    class FakeResponse:
        status_code = 200
        elapsed = Elapsed()

        def json(self):
            return {"status": "healthy", "version": "test"}

    def _forbid_subprocess(*args, **kwargs):
        raise AssertionError("legacy cluster subprocess should not be called")

    monkeypatch.setattr(cicd_routes.requests, "get", lambda *args, **kwargs: FakeResponse())
    monkeypatch.setattr(cicd_routes.subprocess, "run", _forbid_subprocess)

    diagnosis = cicd_routes.run_diagnosis("uat")

    matching = [entry for entry in diagnosis["checks"] if entry["name"] == "Runtime 狀態"]
    assert matching
    first_runtime_check = matching[0]
    assert first_runtime_check["runtime"] == "Docker Compose on 192.168.0.188"

View File

@@ -1,6 +1,6 @@
"""Phase 54 — get_host_label / get_provider_tag 測試覆蓋P53 新增 fn.
P53 commit 7a10d27 加了 K8s Nginx Proxy 路由判斷192.168.0.110:11435/11436
P53 commit 7a10d27 加了 110 Nginx Proxy 路由判斷192.168.0.110:11435/11436
但無單測,未來改 IP / 加新 provider 容易破而不自知。
"""
from __future__ import annotations
@@ -28,11 +28,11 @@ class TestGetHostLabel:
assert get_host_label('http://34.21.145.224:11434') == 'GCP-SSD-2'
def test_nginx_proxy_gcp_primary(self):
"""K8s 環境經 110 跳板 Nginx 轉發 GCP-A"""
"""經 110 跳板 Nginx 轉發 GCP-A"""
assert get_host_label('http://192.168.0.110:11435') == 'GCP-SSDvia Nginx 110'
def test_nginx_proxy_gcp_secondary(self):
"""K8s 環境經 110 跳板 Nginx 轉發 GCP-B"""
"""經 110 跳板 Nginx 轉發 GCP-B"""
assert get_host_label('http://192.168.0.110:11436') == 'GCP-SSD-2via Nginx 110'
def test_111_fallback(self):