Some checks failed
CD Pipeline / deploy (push) Failing after 59s
- 建立 Gitea Actions CD pipeline (.gitea/workflows/cd.yaml) - 部署模式: rsync Python 檔案至 188 → docker restart (volume mount) - Dockerfile/requirements 變動時自動重建 Docker image - 部署通知: Telegram (開始/成功/失敗) - 健康檢查: https://mo.wooo.work/health (最多 5 次重試) - 同步最新 CLAUDE.md / ADR-008 / memory (2026-04-19) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
486 lines
16 KiB
Bash
486 lines
16 KiB
Bash
#!/bin/bash
|
||
# =============================================================================
|
||
# WOOO TECH - CI/CD 自動修復腳本
|
||
# 版本: 1.0.0
|
||
# 建立日期: 2026-01-25
|
||
# =============================================================================
|
||
#
|
||
# 用途:
|
||
# 當 CI/CD Pipeline 失敗時,自動診斷問題並嘗試修復
|
||
#
|
||
# 使用方式:
|
||
# ./cicd_auto_repair.sh repair <stage> <error_log_file>
# ./cicd_auto_repair.sh health
|
||
#
|
||
# stage: test | build | deploy
|
||
# error_log_file: 包含錯誤訊息的檔案路徑
|
||
#
|
||
# =============================================================================
|
||
|
||
# Abort on the first unhandled command failure.
set -o errexit

# --- Settings ---------------------------------------------------------------
# Telegram Bot API sendMessage endpoint; the bot token comes from the environment.
TELEGRAM_API="https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage"
UAT_HOST='192.168.0.110'
UAT_USER='wooo'
MAX_RETRIES=3

# --- ANSI colour escapes (stored unexpanded; the log helpers expand them) ----
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
|
||
|
||
# Logging helpers.
#
# log_info MESSAGE
#   Writes "[INFO] MESSAGE" to stdout, with the tag wrapped in GREEN/NC.
#   Only the colour variables are escape-expanded (%b); MESSAGE is printed
#   verbatim (%s) so backslashes in paths or log excerpts are not mangled.
log_info() {
  printf '%b[INFO]%b %s\n' "${GREEN}" "${NC}" "$1"
}
|
||
|
||
# log_warn MESSAGE
#   Writes "[WARN] MESSAGE" to stdout, with the tag wrapped in YELLOW/NC.
#   Only the colour variables are escape-expanded (%b); MESSAGE is printed
#   verbatim (%s) so backslashes in it are preserved.
log_warn() {
  printf '%b[WARN]%b %s\n' "${YELLOW}" "${NC}" "$1"
}
|
||
|
||
# log_error MESSAGE
#   Writes "[ERROR] MESSAGE" to stdout (same stream as the other log helpers),
#   with the tag wrapped in RED/NC. Only the colour variables are
#   escape-expanded (%b); MESSAGE is printed verbatim (%s).
log_error() {
  printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "$1"
}
|
||
|
||
# Send a Telegram notification (best effort).
#
# Arguments: $1 - message text. Callers pre-encode line breaks as %0A and may
#                 embed Telegram HTML tags such as <b>.
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID, TELEGRAM_API (read)
# Returns:   always 0. Nothing is sent when the token or chat id is empty, and
#            delivery failures are deliberately ignored so that a notification
#            problem never aborts the calling flow.
send_telegram() {
  local message="$1"

  if [ -z "${TELEGRAM_BOT_TOKEN:-}" ] || [ -z "${TELEGRAM_CHAT_ID:-}" ]; then
    return 0
  fi

  # The body is passed with -d, which does no URL-encoding; callers depend on
  # that for their literal %0A sequences. A raw '&' would terminate the text
  # field early and a raw '+' would be decoded as a space, so escape both here.
  message=${message//[&]/%26}
  message=${message//[+]/%2B}

  # --max-time bounds the request so an unreachable API cannot hang the script.
  curl -s --max-time 15 -X POST "$TELEGRAM_API" \
    -d "chat_id=${TELEGRAM_CHAT_ID}" \
    -d "parse_mode=HTML" \
    -d "text=$message" > /dev/null 2>&1 || true
}
|
||
|
||
# =============================================================================
|
||
# 錯誤診斷函數
|
||
# =============================================================================
|
||
|
||
# Classify a test-stage failure from its captured log.
#
# Arguments: $1 - path of the error log file
# Outputs:   exactly three labelled lines on stdout:
#              DIAGNOSIS:<text>, FIX_ACTION:<text>, CAN_AUTO_FIX:<true|false>
# Signatures are tried in order and the first match wins. grep errors are
# silenced, so an unreadable log ends up as the "unknown" result.
diagnose_test_error() {
  local log_path="$1"
  # Start from the "nothing matched" outcome and override on a match.
  local verdict="未知的測試錯誤"
  local advice="手動檢查日誌"
  local auto=false

  if grep -q "ModuleNotFoundError" "$log_path" 2>/dev/null; then
    # Pull the quoted module name out of the first matching line.
    local module_name=$(grep "ModuleNotFoundError: No module named" "$log_path" | head -1 | sed "s/.*'\([^']*\)'.*/\1/")
    verdict="缺少 Python 模組: $module_name"
    advice="安裝缺少的模組到 requirements.txt"
  elif grep -q "ImportError" "$log_path" 2>/dev/null; then
    verdict="Python 導入錯誤"
    advice="檢查模組路徑和依賴關係"
  elif grep -q "SyntaxError" "$log_path" 2>/dev/null; then
    # The "File ..." line is looked up in the two lines before the error.
    local location=$(grep -B2 "SyntaxError" "$log_path" | grep "File" | head -1)
    verdict="Python 語法錯誤: $location"
    advice="修正語法錯誤"
  elif grep -q "No tests found" "$log_path" 2>/dev/null; then
    verdict="沒有找到測試檔案"
    advice="這是正常的,繼續執行"
    auto=true
  elif grep -q "FAILED" "$log_path" 2>/dev/null; then
    local fail_count=$(grep -c "FAILED" "$log_path" 2>/dev/null || echo "0")
    verdict="測試失敗: ${fail_count} 個測試未通過"
    advice="檢查失敗的測試案例並修正"
  fi

  printf 'DIAGNOSIS:%s\nFIX_ACTION:%s\nCAN_AUTO_FIX:%s\n' "$verdict" "$advice" "$auto"
}
|
||
|
||
# Classify a build-stage failure from its captured log.
#
# Arguments: $1 - path of the error log file
# Outputs:   exactly three labelled lines on stdout:
#              DIAGNOSIS:<text>, FIX_ACTION:<text>, CAN_AUTO_FIX:<true|false>
# Signatures are tried in order and the first match wins. grep errors are
# silenced, so an unreadable log ends up as the "unknown" result.
diagnose_build_error() {
  local error_log="$1"
  local diagnosis=""
  local fix_action=""
  local can_auto_fix=false
  local missing_file=""

  if grep -q "unauthorized" "$error_log" 2>/dev/null || grep -q "authentication required" "$error_log" 2>/dev/null; then
    diagnosis="Harbor 認證失敗"
    fix_action="檢查 HARBOR_USER 和 HARBOR_PASSWORD 變數"
    can_auto_fix=false
  elif grep -q "COPY failed" "$error_log" 2>/dev/null; then
    # Only the first match is reported: several "COPY failed" lines would
    # otherwise make the diagnosis span multiple lines and break the
    # one-line-per-field output that callers parse.
    missing_file=$(grep "COPY failed" "$error_log" | head -1 | sed 's/.*COPY failed: file not found in build context or excluded by .dockerignore: stat \(.*\):.*/\1/')
    diagnosis="Docker COPY 失敗,檔案不存在: $missing_file"
    fix_action="確認檔案存在且未被 .dockerignore 排除"
    can_auto_fix=false
  elif grep -q "no space left on device" "$error_log" 2>/dev/null; then
    diagnosis="磁碟空間不足"
    fix_action="清理 Docker 映像和快取"
    can_auto_fix=true
  elif grep -q "connection refused" "$error_log" 2>/dev/null; then
    diagnosis="Harbor 連線被拒絕"
    fix_action="檢查 Harbor 服務是否正常運行"
    can_auto_fix=true
  elif grep -q "network is unreachable" "$error_log" 2>/dev/null; then
    diagnosis="網路不可達"
    fix_action="檢查網路連線"
    can_auto_fix=false
  elif grep -q "denied: requested access" "$error_log" 2>/dev/null; then
    diagnosis="Harbor 推送權限不足"
    fix_action="檢查 Harbor 專案權限設定"
    can_auto_fix=false
  else
    diagnosis="未知的構建錯誤"
    fix_action="手動檢查日誌"
    can_auto_fix=false
  fi

  echo "DIAGNOSIS:$diagnosis"
  echo "FIX_ACTION:$fix_action"
  echo "CAN_AUTO_FIX:$can_auto_fix"
}
|
||
|
||
# Classify a deploy-stage failure from its captured log.
#
# Arguments: $1 - path of the error log file
# Outputs:   exactly three labelled lines on stdout:
#              DIAGNOSIS:<text>, FIX_ACTION:<text>, CAN_AUTO_FIX:<true|false>
# Signatures are tried in order (SSH, Git, Kubernetes, Docker daemon) and the
# first match wins. grep errors are silenced, so an unreadable log ends up as
# the "unknown" result.
diagnose_deploy_error() {
  local log_path="$1"
  # Start from the "nothing matched" outcome and override on a match.
  local verdict="未知的部署錯誤"
  local advice="手動檢查日誌"
  local auto=false

  if grep -q "Permission denied (publickey" "$log_path" 2>/dev/null; then
    verdict="SSH 認證失敗 (公鑰)"
    advice="檢查 UAT_SSH_PRIVATE_KEY 變數"
  elif grep -q "Connection refused" "$log_path" 2>/dev/null; then
    verdict="SSH 連線被拒絕"
    advice="檢查 UAT 伺服器 SSH 服務"
    auto=true
  elif grep -q "Connection timed out" "$log_path" 2>/dev/null; then
    verdict="SSH 連線超時"
    advice="檢查網路連通性"
    auto=true
  elif grep -q "fatal: refusing to merge unrelated histories" "$log_path" 2>/dev/null; then
    verdict="Git 歷史不相關"
    advice="使用 --allow-unrelated-histories"
    auto=true
  elif grep -q "error: failed to push some refs" "$log_path" 2>/dev/null; then
    verdict="Git 推送失敗"
    advice="可能有衝突,需要手動解決"
  elif grep -q "ImagePullBackOff\|ErrImagePull" "$log_path" 2>/dev/null; then
    verdict="K8s 無法拉取映像"
    advice="檢查 Harbor 映像是否存在,檢查 imagePullSecrets"
    auto=true
  elif grep -q "CrashLoopBackOff" "$log_path" 2>/dev/null; then
    verdict="K8s Pod 反覆崩潰"
    advice="檢查應用程式日誌"
  elif grep -q "OOMKilled" "$log_path" 2>/dev/null; then
    verdict="K8s Pod 記憶體不足"
    advice="增加 Pod 記憶體限制"
  elif grep -q "docker: Error response from daemon" "$log_path" 2>/dev/null; then
    verdict="Docker Daemon 錯誤"
    advice="重啟 Docker 服務"
    auto=true
  fi

  printf 'DIAGNOSIS:%s\nFIX_ACTION:%s\nCAN_AUTO_FIX:%s\n' "$verdict" "$advice" "$auto"
}
|
||
|
||
# =============================================================================
|
||
# 自動修復函數
|
||
# =============================================================================
|
||
|
||
# Free disk space on the UAT host by pruning Docker resources over SSH.
#
# Globals: UAT_USER, UAT_HOST (read)
# Returns: 0 when the remote session completed, 1 when the SSH session (or the
#          final `df` in it) failed. The function used to return 0
#          unconditionally, so an unreachable host was reported as a
#          successful repair.
fix_disk_space() {
  log_info "嘗試清理 Docker 資源..."

  # NOTE(review): `docker system prune --volumes` also removes unused volumes;
  # confirm no stopped service on the host keeps data in one.
  if ! ssh -o StrictHostKeyChecking=no "${UAT_USER}@${UAT_HOST}" << 'ENDSSH'
# Remove unused Docker resources (best effort)
docker system prune -af --volumes 2>/dev/null || true

# Remove older images: the first three listed lines are skipped
# (intended to keep the three newest versions)
docker images | grep "momo-pro-system" | tail -n +4 | awk '{print $3}' | xargs -r docker rmi 2>/dev/null || true

# Show the remaining space
df -h /
ENDSSH
  then
    log_error "磁碟清理失敗"
    return 1
  fi

  log_info "磁碟清理完成"
  return 0
}
|
||
|
||
# Restart the Harbor stack on the UAT host and wait for it to become healthy.
#
# Globals: UAT_USER, UAT_HOST (read)
# Returns: 0 when the Harbor health endpoint answers HTTP 200 after the
#          restart, 1 when the SSH session fails, the Harbor directory is
#          missing, or the endpoint never reports 200. The function used to
#          return 0 unconditionally, so a dead Harbor was reported as repaired.
fix_harbor_connection() {
  log_info "嘗試重啟 Harbor 服務..."

  if ! ssh -o StrictHostKeyChecking=no "${UAT_USER}@${UAT_HOST}" << 'ENDSSH'
# Without this guard a failed cd would run the restart in the wrong directory
cd /home/wooo/harbor || exit 1
docker compose restart 2>/dev/null || docker-compose restart 2>/dev/null || true

# Poll the health endpoint: first probe after 10 s, at most 6 probes
attempt=0
while [ "$attempt" -lt 6 ]; do
  sleep 10
  http_code=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:5050/api/v2.0/health)
  if [ "$http_code" = "200" ]; then
    echo "$http_code"
    exit 0
  fi
  attempt=$((attempt + 1))
done
echo "Harbor 健康檢查失敗"
exit 1
ENDSSH
  then
    log_error "Harbor 重啟失敗"
    return 1
  fi

  log_info "Harbor 重啟完成"
  return 0
}
|
||
|
||
# Wait, then probe whether the UAT host accepts SSH connections again.
#
# Globals: UAT_USER, UAT_HOST (read)
# Returns: 0 when the probe command runs over SSH, 1 otherwise.
fix_ssh_connection() {
  log_info "嘗試修復 SSH 連線..."

  # Give the host 30 seconds before probing.
  sleep 30

  # Single probe with a 10 s connect timeout; ssh's stderr is discarded.
  ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 "${UAT_USER}@${UAT_HOST}" "echo 'SSH 連線成功'" 2>/dev/null || {
    log_error "SSH 連線仍然失敗"
    return 1
  }

  log_info "SSH 連線已恢復"
  return 0
}
|
||
|
||
# Recreate the Harbor image-pull secret in the momo namespace and restart the
# deployments so the pods pull their images again.
#
# Globals: UAT_USER, UAT_HOST (read)
# Returns: 0 when the secret was created and both rollout restarts were
#          accepted, 1 when the SSH session or any of those steps failed. The
#          function used to return 0 unconditionally. Waiting for pod
#          readiness stays best effort.
fix_k8s_image_pull() {
  log_info "嘗試修復 K8s 映像拉取..."

  if ! ssh -o StrictHostKeyChecking=no "${UAT_USER}@${UAT_HOST}" << 'ENDSSH'
# Recreate the registry secret (the old one may not exist, hence || true)
kubectl delete secret harbor-registry -n momo 2>/dev/null || true

# NOTE(review): the fallback below is a credential hard-coded in the script;
# rotate it and rely on /home/wooo/.harbor_password only.
harbor_password=$(cat /home/wooo/.harbor_password 2>/dev/null || echo "Wooo_Harbor_2026")

# The password is quoted so whitespace or glob characters in it are not split
kubectl create secret docker-registry harbor-registry \
  --docker-server=192.168.0.110:5050 \
  --docker-username=admin \
  --docker-password="$harbor_password" \
  -n momo || exit 1

# Restart the deployments
kubectl rollout restart deployment/momo-app -n momo || exit 1
kubectl rollout restart deployment/momo-scheduler -n momo || exit 1

# Wait for the pods to become ready (best effort)
kubectl wait --for=condition=ready pod -l app=momo-app -n momo --timeout=120s || true
ENDSSH
  then
    log_error "K8s 映像拉取修復失敗"
    return 1
  fi

  log_info "K8s 映像拉取修復完成"
  return 0
}
|
||
|
||
# Restart the Docker service on the UAT host and verify it responds.
#
# Globals: UAT_USER, UAT_HOST (read)
# Returns: 0 when `docker info` succeeds after the restart, 1 when the SSH
#          session, the restart, or the verification failed. The function used
#          to return 0 unconditionally, even when Docker was still down.
fix_docker_daemon() {
  log_info "嘗試重啟 Docker 服務..."

  if ! ssh -o StrictHostKeyChecking=no "${UAT_USER}@${UAT_HOST}" << 'ENDSSH'
sudo systemctl restart docker || exit 1
sleep 10
# Explicit if/else: the check result now decides the exit status
if docker info > /dev/null 2>&1; then
  echo "Docker 已恢復"
else
  echo "Docker 仍有問題"
  exit 1
fi
ENDSSH
  then
    log_error "Docker 重啟失敗"
    return 1
  fi

  log_info "Docker 重啟完成"
  return 0
}
|
||
|
||
# Force the checkout on the UAT host to match the remote main branch.
#
# Globals: UAT_USER, UAT_HOST (read)
# Returns: 0 when fetch and hard reset both succeeded, 1 otherwise. The
#          function used to return 0 unconditionally.
#
# `--allow-unrelated-histories` was removed from the fetch: it is a merge/pull
# option, so `git fetch` rejected it and the following reset ran against a
# stale FETCH_HEAD. Fetch plus `reset --hard` needs no merge, so unrelated
# histories are not an obstacle.
fix_git_history() {
  log_info "嘗試修復 Git 歷史問題..."

  if ! ssh -o StrictHostKeyChecking=no "${UAT_USER}@${UAT_HOST}" << 'ENDSSH'
cd /home/wooo/momo_pro_system || exit 1
# NOTE(review): the URL embeds an access token; rotate it and move it to a
# credential helper or a protected file on the host.
git fetch http://root:glpat-xvT9Dsv7qp7TyJvuBV--@192.168.0.110:8929/root/momo-pro-system.git main || exit 1
# Only reset once the fetch has produced a fresh FETCH_HEAD
git reset --hard FETCH_HEAD || exit 1
echo "Git 同步完成"
ENDSSH
  then
    log_error "Git 歷史修復失敗"
    return 1
  fi

  log_info "Git 歷史修復完成"
  return 0
}
|
||
|
||
# =============================================================================
|
||
# 主函數
|
||
# =============================================================================
|
||
|
||
# Escape log-derived text for a Telegram HTML message sent as a raw form body:
# HTML-special characters become entities, and every character that is special
# in a urlencoded body (% & +) is percent-encoded.
_repair_tg_escape() {
  local text="$1"
  text=${text//[%]/%25}
  text=${text//[&]/%26amp;}
  text=${text//[<]/%26lt;}
  text=${text//[>]/%26gt;}
  text=${text//[+]/%2B}
  printf '%s' "$text"
}

# Diagnose a failed pipeline stage, notify via Telegram, and run the matching
# automatic fix when one exists.
#
# Arguments: $1 - stage: test | build | deploy
#            $2 - path of the captured error log
# Exits:     0 when an automatic fix succeeded; 1 when the stage is unknown,
#            the problem needs manual handling, or the automatic fix failed.
#            This function terminates the script; it does not return.
repair() {
  local stage="$1"
  local error_log="$2"
  local success=false
  # Kept local so parsing results do not leak into the global scope.
  local result diagnosis fix_action can_auto_fix tg_diagnosis

  log_info "開始診斷 $stage 階段錯誤..."

  # The diagnose functions silence grep errors, so say so explicitly when the
  # log cannot be read instead of only reporting an "unknown" error.
  if [ ! -r "$error_log" ]; then
    log_warn "無法讀取錯誤日誌: $error_log"
  fi

  # Pick the diagnose function for the stage.
  case "$stage" in
    test)
      result=$(diagnose_test_error "$error_log")
      ;;
    build)
      result=$(diagnose_build_error "$error_log")
      ;;
    deploy)
      result=$(diagnose_deploy_error "$error_log")
      ;;
    *)
      log_error "未知的階段: $stage"
      exit 1
      ;;
  esac

  # Parse the three labelled lines. Patterns are anchored so that label text
  # appearing inside a diagnosis cannot be mistaken for a field.
  diagnosis=$(printf '%s\n' "$result" | sed -n 's/^DIAGNOSIS://p' | head -1)
  fix_action=$(printf '%s\n' "$result" | sed -n 's/^FIX_ACTION://p' | head -1)
  can_auto_fix=$(printf '%s\n' "$result" | sed -n 's/^CAN_AUTO_FIX://p' | head -1)

  log_info "診斷結果: $diagnosis"
  log_info "建議修復: $fix_action"
  log_info "可自動修復: $can_auto_fix"

  # The diagnosis may quote the log (e.g. File "<stdin>"); unescaped it would
  # make Telegram reject the HTML message, which is then silently dropped.
  tg_diagnosis=$(_repair_tg_escape "$diagnosis")

  # Diagnosis notification
  send_telegram "🔧 <b>[CI/CD 自動診斷]</b>%0A%0A📍 階段: ${stage}%0A🔍 問題: ${tg_diagnosis}%0A💡 建議: ${fix_action}%0A🤖 自動修復: ${can_auto_fix}"

  if [ "$can_auto_fix" != "true" ]; then
    send_telegram "⚠️ <b>[CI/CD 需要手動介入]</b>%0A%0A📍 階段: ${stage}%0A🔧 問題: ${tg_diagnosis}%0A💡 建議: ${fix_action}%0A%0A此問題無法自動修復,請手動處理。"
    log_warn "此問題需要手動處理"
    exit 1
  fi

  log_info "嘗試自動修復..."

  # Dispatch on the diagnosis text produced by the diagnose functions.
  case "$diagnosis" in
    *"磁碟空間不足"*)
      fix_disk_space && success=true
      ;;
    *"Harbor 連線被拒絕"*)
      fix_harbor_connection && success=true
      ;;
    *"SSH 連線"*)
      fix_ssh_connection && success=true
      ;;
    *"K8s 無法拉取映像"*)
      fix_k8s_image_pull && success=true
      ;;
    *"Docker Daemon"*)
      fix_docker_daemon && success=true
      ;;
    *"Git 歷史"*)
      fix_git_history && success=true
      ;;
    *"沒有找到測試"*)
      success=true # not a real error
      ;;
    *)
      log_warn "沒有對應的自動修復程序"
      success=false
      ;;
  esac

  if [ "$success" = true ]; then
    send_telegram "✅ <b>[CI/CD 自動修復]</b>%0A%0A📍 階段: ${stage}%0A🔧 問題: ${tg_diagnosis}%0A✅ 修復成功%0A%0A▶️ 建議重新執行 Pipeline"
    log_info "自動修復成功"
    exit 0
  else
    send_telegram "❌ <b>[CI/CD 自動修復失敗]</b>%0A%0A📍 階段: ${stage}%0A🔧 問題: ${tg_diagnosis}%0A❌ 自動修復失敗%0A%0A🔗 請手動檢查"
    log_error "自動修復失敗"
    exit 1
  fi
}
|
||
|
||
# =============================================================================
|
||
# 健康檢查函數
|
||
# =============================================================================
|
||
|
||
# Run a health check of the UAT host (SSH, Harbor, Kubernetes pods, disk) and
# send the combined report via Telegram.
#
# Globals: UAT_USER, UAT_HOST (read)
# Exits:   0 when every check passed, 1 otherwise. This function terminates
#          the script; it does not return.
health_check() {
  log_info "執行系統健康檢查..."

  local issues=""
  local all_ok=true
  local k8s_status=""
  local disk_usage=""

  # SSH reachability
  if ! ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 "${UAT_USER}@${UAT_HOST}" "echo 1" > /dev/null 2>&1; then
    issues="${issues}%0A❌ SSH 連線失敗"
    all_ok=false
  else
    issues="${issues}%0A✅ SSH 連線正常"
  fi

  # Harbor health endpoint
  if ssh -o StrictHostKeyChecking=no "${UAT_USER}@${UAT_HOST}" "curl -s -o /dev/null -w '%{http_code}' http://localhost:5050/api/v2.0/health" 2>/dev/null | grep -q "200"; then
    issues="${issues}%0A✅ Harbor 服務正常"
  else
    issues="${issues}%0A❌ Harbor 服務異常"
    all_ok=false
  fi

  # Running pods. `grep -c` prints 0 AND exits non-zero when nothing matches,
  # so an `|| echo 0` fallback would yield "0<newline>0"; validate the value
  # instead of appending to it.
  k8s_status=$(ssh -o StrictHostKeyChecking=no "${UAT_USER}@${UAT_HOST}" "kubectl get pods -n momo --no-headers 2>/dev/null | grep -c Running" 2>/dev/null) || true
  if [[ ! "$k8s_status" =~ ^[0-9]+$ ]]; then
    k8s_status=0
  fi
  if [ "$k8s_status" -ge 2 ]; then
    issues="${issues}%0A✅ K8s Pods 運行中 (${k8s_status} 個)"
  else
    issues="${issues}%0A❌ K8s Pods 異常 (${k8s_status} 個運行中)"
    all_ok=false
  fi

  # Disk usage of /. A failed query is reported as a problem rather than as
  # "0% used". The percent sign is sent as %25 because the message goes out as
  # a raw form body.
  disk_usage=$(ssh -o StrictHostKeyChecking=no "${UAT_USER}@${UAT_HOST}" "df -h / | tail -1 | awk '{print \$5}' | tr -d '%'" 2>/dev/null) || true
  if [[ ! "$disk_usage" =~ ^[0-9]+$ ]]; then
    issues="${issues}%0A❌ 無法取得磁碟使用率"
    all_ok=false
  elif [ "$disk_usage" -lt 90 ]; then
    issues="${issues}%0A✅ 磁碟空間充足 (已用 ${disk_usage}%25)"
  else
    issues="${issues}%0A⚠️ 磁碟空間不足 (已用 ${disk_usage}%25)"
    all_ok=false
  fi

  # Report and exit with the overall result
  if [ "$all_ok" = true ]; then
    send_telegram "💚 <b>[系統健康檢查]</b>%0A%0A所有服務正常運行${issues}"
    exit 0
  else
    send_telegram "💔 <b>[系統健康檢查]</b>%0A%0A發現問題${issues}"
    exit 1
  fi
}
|
||
|
||
# =============================================================================
|
||
# 入口點
|
||
# =============================================================================
|
||
|
||
# Command-line dispatch:
#   repair <stage> <error_log_file>  - diagnose and try to fix a failed stage
#   health                           - run the health check
#   anything else (or nothing)       - print the help text and exit 0
case "${1:-}" in
  repair)
    # Both the stage and the log path are mandatory.
    if [ -n "$2" ] && [ -n "$3" ]; then
      repair "$2" "$3"
    else
      printf '%s\n' \
        "用法: $0 repair <stage> <error_log_file>" \
        " stage: test | build | deploy"
      exit 1
    fi
    ;;
  health)
    health_check
    ;;
  *)
    printf '%s\n' \
      "WOOO TECH - CI/CD 自動修復腳本" \
      "" \
      "用法:" \
      " $0 repair <stage> <error_log> - 診斷並修復錯誤" \
      " $0 health - 執行健康檢查" \
      "" \
      "階段:" \
      " test - 測試階段" \
      " build - 構建階段" \
      " deploy - 部署階段"
    exit 0
    ;;
esac
|