#!/bin/bash
# =============================================================================
# WOOO TECH - CI/CD auto-repair script
# Version: 1.0.0
# Created: 2026-01-25
# =============================================================================
#
# Purpose:
#   When a CI/CD pipeline stage fails, diagnose the failure from its error log
#   and attempt an automatic repair on the UAT host.
#
# Usage:
#   ./cicd_auto_repair.sh repair <stage> <error_log_file>
#   ./cicd_auto_repair.sh health
#
#   stage:          test | build | deploy
#   error_log_file: path of a file containing the error output
#
# Environment:
#   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID
#       Optional. Notifications are skipped when either one is missing.
#   GITLAB_TOKEN
#       Required only by the Git history repair (fix_git_history).
#
# =============================================================================

set -e

# -----------------------------------------------------------------------------
# Configuration
# -----------------------------------------------------------------------------
TELEGRAM_API="https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN:-}/sendMessage"
UAT_HOST="192.168.0.110"
UAT_USER="wooo"
MAX_RETRIES=3 # NOTE(review): not referenced anywhere in this file.

# Options shared by every ssh call to the UAT host. ConnectTimeout keeps an
# unreachable host from stalling each check until the TCP timeout expires.
SSH_OPTS=(-o StrictHostKeyChecking=no -o ConnectTimeout=10)

# Colored output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# -----------------------------------------------------------------------------
# Logging helpers
# -----------------------------------------------------------------------------

# Print an informational message to stdout.
# Arguments: $1 - message text
log_info() {
  printf '%b[INFO]%b %s\n' "$GREEN" "$NC" "${1:-}"
}

# Print a warning to stderr.
# Arguments: $1 - message text
log_warn() {
  printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "${1:-}" >&2
}

# Print an error to stderr.
# Arguments: $1 - message text
log_error() {
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "${1:-}" >&2
}

# -----------------------------------------------------------------------------
# Generic helpers
# -----------------------------------------------------------------------------

# Run ssh against the UAT host with the shared options.
# Arguments: "$@" - optional remote command; with none, the remote shell reads
#                   its script from this function's stdin.
# Returns:   ssh's exit status.
uat_ssh() {
  ssh "${SSH_OPTS[@]}" "${UAT_USER}@${UAT_HOST}" "$@"
}

# Escape the characters that are special in Telegram's HTML parse mode.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout
html_escape() {
  printf '%s' "${1:-}" |
    sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g'
}

# Send a Telegram notification (best effort; never fails the script).
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID, TELEGRAM_API (read)
# Arguments: $1 - message text; the literal token "%0A" marks a line break
# Returns:   always 0
send_telegram() {
  local message="${1:-}"
  local newline_token='%0A'

  if [[ -z "${TELEGRAM_BOT_TOKEN:-}" || -z "${TELEGRAM_CHAT_ID:-}" ]]; then
    return 0
  fi

  # Callers mark line breaks with a pre-encoded "%0A". Turn those into real
  # newlines and let curl URL-encode the whole field, so characters such as
  # '&', '%' and '+' in the text can no longer corrupt the form body.
  message=${message//"$newline_token"/$'\n'}

  curl -s -X POST "$TELEGRAM_API" \
    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
    --data-urlencode "parse_mode=HTML" \
    --data-urlencode "text=${message}" >/dev/null 2>&1 || true
}

# =============================================================================
# Error diagnosis functions
# =============================================================================

# Test whether a log file contains a pattern.
# Arguments: $1 - log file path, $2 - grep basic-regex pattern
# Returns:   0 on match; non-zero on no match or an unreadable file
_log_matches() {
  grep -q -e "$2" -- "$1" 2>/dev/null
}

# Emit a diagnosis in the three-line protocol that repair() parses.
# Arguments: $1 - diagnosis, $2 - suggested fix, $3 - "true" or "false"
_emit_diagnosis() {
  printf 'DIAGNOSIS:%s\n' "$1"
  printf 'FIX_ACTION:%s\n' "$2"
  printf 'CAN_AUTO_FIX:%s\n' "$3"
}

# Diagnose a failure of the test stage.
# Arguments: $1 - error log path
# Outputs:   DIAGNOSIS / FIX_ACTION / CAN_AUTO_FIX lines on stdout
diagnose_test_error() {
  local error_log="$1"
  local diagnosis=""
  local fix_action=""
  local can_auto_fix=false
  local missing_module syntax_file failed_tests

  if _log_matches "$error_log" "ModuleNotFoundError"; then
    # Take the module name from between the single quotes of the first hit.
    missing_module=$(grep "ModuleNotFoundError: No module named" "$error_log" |
      head -1 | sed "s/.*'\([^']*\)'.*/\1/") || true
    diagnosis="缺少 Python 模組: $missing_module"
    fix_action="安裝缺少的模組到 requirements.txt"
  elif _log_matches "$error_log" "ImportError"; then
    diagnosis="Python 導入錯誤"
    fix_action="檢查模組路徑和依賴關係"
  elif _log_matches "$error_log" "SyntaxError"; then
    # The traceback's "File ..." line sits just above the SyntaxError line.
    syntax_file=$(grep -B2 "SyntaxError" "$error_log" | grep "File" | head -1) || true
    diagnosis="Python 語法錯誤: $syntax_file"
    fix_action="修正語法錯誤"
  elif _log_matches "$error_log" "No tests found"; then
    diagnosis="沒有找到測試檔案"
    fix_action="這是正常的,繼續執行"
    can_auto_fix=true
  elif _log_matches "$error_log" "FAILED"; then
    failed_tests=$(grep -c "FAILED" "$error_log" 2>/dev/null) || failed_tests=0
    diagnosis="測試失敗: ${failed_tests} 個測試未通過"
    fix_action="檢查失敗的測試案例並修正"
  else
    diagnosis="未知的測試錯誤"
    fix_action="手動檢查日誌"
  fi

  _emit_diagnosis "$diagnosis" "$fix_action" "$can_auto_fix"
}

# Diagnose a failure of the build stage.
# Arguments: $1 - error log path
# Outputs:   DIAGNOSIS / FIX_ACTION / CAN_AUTO_FIX lines on stdout
diagnose_build_error() {
  local error_log="$1"
  local diagnosis=""
  local fix_action=""
  local can_auto_fix=false
  local missing_file

  if _log_matches "$error_log" "unauthorized" ||
    _log_matches "$error_log" "authentication required"; then
    diagnosis="Harbor 認證失敗"
    fix_action="檢查 HARBOR_USER 和 HARBOR_PASSWORD 變數"
  elif _log_matches "$error_log" "COPY failed"; then
    # Only the first hit is used: the result protocol is line based, so a
    # multi-line value would be cut off when repair() parses it.
    missing_file=$(grep "COPY failed" "$error_log" | head -1 |
      sed 's/.*COPY failed: file not found in build context or excluded by .dockerignore: stat \(.*\):.*/\1/') || true
    diagnosis="Docker COPY 失敗,檔案不存在: $missing_file"
    fix_action="確認檔案存在且未被 .dockerignore 排除"
  elif _log_matches "$error_log" "no space left on device"; then
    diagnosis="磁碟空間不足"
    fix_action="清理 Docker 映像和快取"
    can_auto_fix=true
  elif _log_matches "$error_log" "connection refused"; then
    diagnosis="Harbor 連線被拒絕"
    fix_action="檢查 Harbor 服務是否正常運行"
    can_auto_fix=true
  elif _log_matches "$error_log" "network is unreachable"; then
    diagnosis="網路不可達"
    fix_action="檢查網路連線"
  elif _log_matches "$error_log" "denied: requested access"; then
    diagnosis="Harbor 推送權限不足"
    fix_action="檢查 Harbor 專案權限設定"
  else
    diagnosis="未知的構建錯誤"
    fix_action="手動檢查日誌"
  fi

  _emit_diagnosis "$diagnosis" "$fix_action" "$can_auto_fix"
}

# Diagnose a failure of the deploy stage.
# Arguments: $1 - error log path
# Outputs:   DIAGNOSIS / FIX_ACTION / CAN_AUTO_FIX lines on stdout
diagnose_deploy_error() {
  local error_log="$1"
  local diagnosis=""
  local fix_action=""
  local can_auto_fix=false

  if _log_matches "$error_log" "Permission denied (publickey"; then
    diagnosis="SSH 認證失敗 (公鑰)"
    fix_action="檢查 UAT_SSH_PRIVATE_KEY 變數"
  elif _log_matches "$error_log" "Connection refused"; then
    diagnosis="SSH 連線被拒絕"
    fix_action="檢查 UAT 伺服器 SSH 服務"
    can_auto_fix=true
  elif _log_matches "$error_log" "Connection timed out"; then
    diagnosis="SSH 連線超時"
    fix_action="檢查網路連通性"
    can_auto_fix=true
  elif _log_matches "$error_log" "fatal: refusing to merge unrelated histories"; then
    diagnosis="Git 歷史不相關"
    fix_action="使用 --allow-unrelated-histories"
    can_auto_fix=true
  elif _log_matches "$error_log" "error: failed to push some refs"; then
    diagnosis="Git 推送失敗"
    fix_action="可能有衝突,需要手動解決"
  elif _log_matches "$error_log" "ImagePullBackOff\|ErrImagePull"; then
    diagnosis="K8s 無法拉取映像"
    fix_action="檢查 Harbor 映像是否存在,檢查 imagePullSecrets"
    can_auto_fix=true
  elif _log_matches "$error_log" "CrashLoopBackOff"; then
    diagnosis="K8s Pod 反覆崩潰"
    fix_action="檢查應用程式日誌"
  elif _log_matches "$error_log" "OOMKilled"; then
    diagnosis="K8s Pod 記憶體不足"
    fix_action="增加 Pod 記憶體限制"
  elif _log_matches "$error_log" "docker: Error response from daemon"; then
    diagnosis="Docker Daemon 錯誤"
    fix_action="重啟 Docker 服務"
    can_auto_fix=true
  else
    diagnosis="未知的部署錯誤"
    fix_action="手動檢查日誌"
  fi

  _emit_diagnosis "$diagnosis" "$fix_action" "$can_auto_fix"
}

# =============================================================================
# Auto-repair functions
#
# Each returns 0 only when the remote commands actually succeeded, so repair()
# never reports a success that did not happen.
# =============================================================================

# Free disk space on the UAT host by pruning Docker resources.
# Returns: 0 on success, 1 when the remote session failed
fix_disk_space() {
  log_info "嘗試清理 Docker 資源..."

  if ! uat_ssh <<'ENDSSH'
# Remove unused Docker resources.
# NOTE(review): --volumes also deletes unused volumes; confirm this is intended.
docker system prune -af --volumes 2>/dev/null || true

# Remove older project images, keeping the first 3 rows that docker lists
# (presumably the newest ones; verify the listing order).
docker images | grep "momo-pro-system" | tail -n +4 | awk '{print $3}' | xargs -r docker rmi 2>/dev/null || true

# Show the remaining space.
df -h /
ENDSSH
  then
    log_error "磁碟清理失敗"
    return 1
  fi

  log_info "磁碟清理完成"
  return 0
}

# Restart Harbor on the UAT host and verify its health endpoint.
# Returns: 0 when Harbor answers HTTP 200 afterwards, 1 otherwise
fix_harbor_connection() {
  log_info "嘗試重啟 Harbor 服務..."

  if ! uat_ssh <<'ENDSSH'
# Stop if the compose directory is missing, rather than restarting whatever
# compose project happens to live in the login directory.
cd /home/wooo/harbor || exit 1
docker compose restart 2>/dev/null || docker-compose restart 2>/dev/null || true
sleep 10

# Check Harbor status; HTTP 200 is the same criterion health_check uses.
http_code=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:5050/api/v2.0/health) || http_code="000"
echo "$http_code"
if [ "$http_code" != "200" ]; then
  echo "Harbor 健康檢查失敗"
  exit 1
fi
ENDSSH
  then
    log_error "Harbor 重啟失敗"
    return 1
  fi

  log_info "Harbor 重啟完成"
  return 0
}

# Wait, then probe whether SSH to the UAT host works again.
# Returns: 0 when the probe succeeds, 1 otherwise
fix_ssh_connection() {
  log_info "嘗試修復 SSH 連線..."

  # Give a transient outage 30 seconds to clear before retrying.
  sleep 30

  if uat_ssh "echo 'SSH 連線成功'" 2>/dev/null; then
    log_info "SSH 連線已恢復"
    return 0
  fi

  log_error "SSH 連線仍然失敗"
  return 1
}

# Recreate the Harbor registry secret and restart the deployments.
# Returns: 0 on success, 1 when the password file is missing, the secret
#          cannot be created, or the remote session failed
fix_k8s_image_pull() {
  log_info "嘗試修復 K8s 映像拉取..."

  if ! uat_ssh <<'ENDSSH'
# The registry password must come from the password file; no default is
# embedded in this script. It is checked before the old secret is deleted so
# that a missing file does not leave the namespace without a secret.
harbor_password=$(cat /home/wooo/.harbor_password 2>/dev/null)
if [ -z "$harbor_password" ]; then
  echo "找不到 Harbor 密碼檔: /home/wooo/.harbor_password" >&2
  exit 1
fi

# Recreate the registry secret.
kubectl delete secret harbor-registry -n momo 2>/dev/null || true
kubectl create secret docker-registry harbor-registry \
  --docker-server=192.168.0.110:5050 \
  --docker-username=admin \
  --docker-password="$harbor_password" \
  -n momo || exit 1

# Restart the deployments.
kubectl rollout restart deployment/momo-app -n momo
kubectl rollout restart deployment/momo-scheduler -n momo

# Wait for the pods to become ready (best effort; a timeout is not fatal).
kubectl wait --for=condition=ready pod -l app=momo-app -n momo --timeout=120s || true
ENDSSH
  then
    log_error "K8s 映像拉取修復失敗"
    return 1
  fi

  log_info "K8s 映像拉取修復完成"
  return 0
}

# Restart the Docker daemon on the UAT host and verify that it responds.
# Returns: 0 when `docker info` succeeds afterwards, 1 otherwise
fix_docker_daemon() {
  log_info "嘗試重啟 Docker 服務..."

  if ! uat_ssh <<'ENDSSH'
sudo systemctl restart docker || exit 1
sleep 10
if docker info > /dev/null 2>&1; then
  echo "Docker 已恢復"
else
  echo "Docker 仍有問題"
  exit 1
fi
ENDSSH
  then
    log_error "Docker 重啟失敗"
    return 1
  fi

  log_info "Docker 重啟完成"
  return 0
}

# Force the UAT checkout to match the remote main branch.
# Globals: GITLAB_TOKEN (read) - access token for the GitLab repository
# Returns: 0 on success, 1 when the token is missing or a git step failed
fix_git_history() {
  log_info "嘗試修復 Git 歷史問題..."

  if [[ -z "${GITLAB_TOKEN:-}" ]]; then
    log_error "未設定 GITLAB_TOKEN,無法同步 Git"
    return 1
  fi

  # The token travels on stdin as the first line of the remote script, so it
  # is neither stored in this file nor visible in the local ssh command line.
  # %q emits bash quoting; plain alphanumeric tokens pass through unchanged.
  if ! {
    printf 'GITLAB_TOKEN=%q\n' "$GITLAB_TOKEN"
    cat <<'ENDSSH'
cd /home/wooo/momo_pro_system || exit 1
# fetch + hard reset replaces local history outright, so no merge happens and
# --allow-unrelated-histories (a merge/pull option that git fetch rejects) is
# not needed.
git fetch "http://root:${GITLAB_TOKEN}@192.168.0.110:8929/root/momo-pro-system.git" main || exit 1
git reset --hard FETCH_HEAD || exit 1
echo "Git 同步完成"
ENDSSH
  } | uat_ssh; then
    log_error "Git 歷史修復失敗"
    return 1
  fi

  log_info "Git 歷史修復完成"
  return 0
}

# =============================================================================
# Main repair flow
# =============================================================================

# Diagnose a failed stage, notify, and attempt the matching automatic repair.
# Arguments: $1 - stage (test | build | deploy)
#            $2 - error log path
# Returns:   0 when the repair succeeded; 1 when the stage is unknown, the
#            problem needs manual handling, or the repair failed
repair() {
  local stage="$1"
  local error_log="$2"
  local success=false
  local result diagnosis fix_action can_auto_fix
  local safe_diagnosis safe_fix_action

  log_info "開始診斷 $stage 階段錯誤..."

  # An unreadable log still yields an "unknown error" diagnosis below; the
  # warning just makes the reason visible.
  if [[ ! -r "$error_log" ]]; then
    log_warn "錯誤日誌無法讀取: $error_log"
  fi

  # Pick the diagnosis function for the stage.
  case "$stage" in
    test)
      result=$(diagnose_test_error "$error_log")
      ;;
    build)
      result=$(diagnose_build_error "$error_log")
      ;;
    deploy)
      result=$(diagnose_deploy_error "$error_log")
      ;;
    *)
      log_error "未知的階段: $stage"
      return 1
      ;;
  esac

  # Parse the result. Prefixes are anchored so that a value which merely
  # contains another key's name cannot be picked up by mistake.
  diagnosis=$(printf '%s\n' "$result" | sed -n 's/^DIAGNOSIS://p' | head -n 1)
  fix_action=$(printf '%s\n' "$result" | sed -n 's/^FIX_ACTION://p' | head -n 1)
  can_auto_fix=$(printf '%s\n' "$result" | sed -n 's/^CAN_AUTO_FIX://p' | head -n 1)

  log_info "診斷結果: $diagnosis"
  log_info "建議修復: $fix_action"
  log_info "可自動修復: $can_auto_fix"

  # Log excerpts may contain '<' or '&', which Telegram's HTML parse mode
  # would otherwise reject.
  safe_diagnosis=$(html_escape "$diagnosis")
  safe_fix_action=$(html_escape "$fix_action")

  # Send the diagnosis notification.
  send_telegram "🔧 [CI/CD 自動診斷]%0A%0A📍 階段: ${stage}%0A🔍 問題: ${safe_diagnosis}%0A💡 建議: ${safe_fix_action}%0A🤖 自動修復: ${can_auto_fix}"

  if [[ "$can_auto_fix" != "true" ]]; then
    send_telegram "⚠️ [CI/CD 需要手動介入]%0A%0A📍 階段: ${stage}%0A🔧 問題: ${safe_diagnosis}%0A💡 建議: ${safe_fix_action}%0A%0A此問題無法自動修復,請手動處理。"
    log_warn "此問題需要手動處理"
    return 1
  fi

  log_info "嘗試自動修復..."

  case "$diagnosis" in
    *"磁碟空間不足"*)
      fix_disk_space && success=true
      ;;
    *"Harbor 連線被拒絕"*)
      fix_harbor_connection && success=true
      ;;
    *"SSH 連線"*)
      fix_ssh_connection && success=true
      ;;
    *"K8s 無法拉取映像"*)
      fix_k8s_image_pull && success=true
      ;;
    *"Docker Daemon"*)
      fix_docker_daemon && success=true
      ;;
    *"Git 歷史"*)
      fix_git_history && success=true
      ;;
    *"沒有找到測試"*)
      # Not a real error, so there is nothing to repair.
      success=true
      ;;
    *)
      log_warn "沒有對應的自動修復程序"
      success=false
      ;;
  esac

  if [[ "$success" == true ]]; then
    send_telegram "✅ [CI/CD 自動修復]%0A%0A📍 階段: ${stage}%0A🔧 問題: ${safe_diagnosis}%0A✅ 修復成功%0A%0A▶️ 建議重新執行 Pipeline"
    log_info "自動修復成功"
    return 0
  fi

  send_telegram "❌ [CI/CD 自動修復失敗]%0A%0A📍 階段: ${stage}%0A🔧 問題: ${safe_diagnosis}%0A❌ 自動修復失敗%0A%0A🔗 請手動檢查"
  log_error "自動修復失敗"
  return 1
}

# =============================================================================
# Health check
# =============================================================================

# Check SSH, Harbor, the K8s pods and disk usage on the UAT host, then send a
# summary notification.
# Returns: 0 when every check passed, 1 otherwise
health_check() {
  log_info "執行系統健康檢查..."

  local issues=""
  local all_ok=true
  local harbor_code k8s_status disk_usage

  # SSH connectivity
  if uat_ssh "echo 1" >/dev/null 2>&1; then
    issues+="%0A✅ SSH 連線正常"
  else
    issues+="%0A❌ SSH 連線失敗"
    all_ok=false
  fi

  # Harbor
  harbor_code=$(uat_ssh "curl -s -o /dev/null -w '%{http_code}' http://localhost:5050/api/v2.0/health" 2>/dev/null) || true
  if grep -q "200" <<<"$harbor_code"; then
    issues+="%0A✅ Harbor 服務正常"
  else
    issues+="%0A❌ Harbor 服務異常"
    all_ok=false
  fi

  # K8s pods. grep -c prints "0" and exits non-zero when nothing matches, so
  # the status is ignored and the output is validated instead.
  k8s_status=$(uat_ssh "kubectl get pods -n momo --no-headers 2>/dev/null | grep -c Running" 2>/dev/null) || true
  k8s_status=${k8s_status%%$'\n'*}
  if [[ ! "$k8s_status" =~ ^[0-9]+$ ]]; then
    k8s_status=0
  fi
  if [ "$k8s_status" -ge 2 ]; then
    issues+="%0A✅ K8s Pods 運行中 (${k8s_status} 個)"
  else
    issues+="%0A❌ K8s Pods 異常 (${k8s_status} 個運行中)"
    all_ok=false
  fi

  # Disk usage. A failed query is reported as a problem, not as "0% used".
  disk_usage=$(uat_ssh "df -h / | tail -1 | awk '{print \$5}' | tr -d '%'" 2>/dev/null) || true
  disk_usage=${disk_usage%%$'\n'*}
  if [[ ! "$disk_usage" =~ ^[0-9]+$ ]]; then
    issues+="%0A❌ 無法取得磁碟使用率"
    all_ok=false
  elif [ "$disk_usage" -lt 90 ]; then
    issues+="%0A✅ 磁碟空間充足 (已用 ${disk_usage}%)"
  else
    issues+="%0A⚠️ 磁碟空間不足 (已用 ${disk_usage}%)"
    all_ok=false
  fi

  # Send the health report.
  if [[ "$all_ok" == true ]]; then
    send_telegram "💚 [系統健康檢查]%0A%0A所有服務正常運行${issues}"
    return 0
  fi

  send_telegram "💔 [系統健康檢查]%0A%0A發現問題${issues}"
  return 1
}

# =============================================================================
# Entry point
# =============================================================================

# Print the general usage text.
usage() {
  echo "WOOO TECH - CI/CD 自動修復腳本"
  echo ""
  echo "用法:"
  echo "  $0 repair <stage> <error_log_file> - 診斷並修復錯誤"
  echo "  $0 health - 執行健康檢查"
  echo ""
  echo "階段:"
  echo "  test - 測試階段"
  echo "  build - 構建階段"
  echo "  deploy - 部署階段"
}

# Dispatch on the first command-line argument.
# Arguments: "$@" - the script's arguments
# Returns:   the status of the selected command; 0 after printing usage
main() {
  case "${1:-}" in
    repair)
      if [[ -z "${2:-}" || -z "${3:-}" ]]; then
        echo "用法: $0 repair <stage> <error_log_file>"
        echo "  stage: test | build | deploy"
        return 1
      fi
      repair "$2" "$3"
      ;;
    health)
      health_check
      ;;
    *)
      usage
      return 0
      ;;
  esac
}

# Run only when executed directly, so the functions can also be sourced.
if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then
  main "$@"
fi