Files
ewoooc/scripts/cicd_auto_repair.sh
ogt 1b4f3a7bbe
Some checks failed
CD Pipeline / deploy (push) Failing after 59s
feat: EwoooC 初始化 — 完整專案推版至 Gitea
- 建立 Gitea Actions CD pipeline (.gitea/workflows/cd.yaml)
- 部署模式: rsync Python 檔案至 188 → docker restart (volume mount)
- Dockerfile/requirements 變動時自動重建 Docker image
- 部署通知: Telegram (開始/成功/失敗)
- 健康檢查: https://mo.wooo.work/health (最多 5 次重試)
- 同步最新 CLAUDE.md / ADR-008 / memory (2026-04-19)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-19 01:21:13 +08:00

486 lines
16 KiB
Bash
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# =============================================================================
# WOOO TECH - CI/CD 自動修復腳本
# 版本: 1.0.0
# 建立日期: 2026-01-25
# =============================================================================
#
# 用途:
# 當 CI/CD Pipeline 失敗時,自動診斷問題並嘗試修復
#
# Usage:
#   ./cicd_auto_repair.sh repair <stage> <error_log_file>
#   ./cicd_auto_repair.sh health
#
#   stage:          test | build | deploy
#   error_log_file: path to a file containing the captured error output
#
# =============================================================================
# Abort the script as soon as an unguarded command fails.
set -o errexit

# --- Configuration ----------------------------------------------------------
# Target host and account used by every ssh call in this script.
UAT_USER="wooo"
UAT_HOST="192.168.0.110"
# Retry budget (not referenced by the code visible in this file).
MAX_RETRIES=3
# Telegram Bot API endpoint; the token is taken from the environment.
TELEGRAM_API="https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage"

# --- Colour codes -----------------------------------------------------------
# Stored with a literal backslash; the log helpers expand them when printing.
NC="\033[0m" # reset / no colour
RED="\033[0;31m"
GREEN="\033[0;32m"
YELLOW="\033[1;33m"
# Logging helpers.
#
# Print an informational message on stdout, prefixed with a green [INFO] tag.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text (backslash escapes in it are expanded)
log_info() {
  printf '%b\n' "${GREEN}[INFO]${NC} $1"
}
# Print a warning message on stdout, prefixed with a yellow [WARN] tag.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text (backslash escapes in it are expanded)
log_warn() {
  printf '%b\n' "${YELLOW}[WARN]${NC} $1"
}
# Print an error message on stdout, prefixed with a red [ERROR] tag.
# Globals:   RED, NC (read)
# Arguments: $1 - message text (backslash escapes in it are expanded)
log_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1"
}
# Send a Telegram notification (best effort).
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID, TELEGRAM_API (read)
# Arguments: $1 - message text, sent as-is with parse_mode=HTML; the callers
#                 in this file pre-encode line breaks as %0A
# Returns:   always 0; nothing is sent when the token or chat id is empty,
#            and a failing curl is ignored on purpose
send_telegram() {
  local text="$1"

  # Notifications are optional: without credentials this is a no-op.
  if [ -z "$TELEGRAM_BOT_TOKEN" ] || [ -z "$TELEGRAM_CHAT_ID" ]; then
    return 0
  fi

  curl -s -X POST "$TELEGRAM_API" \
    -d "chat_id=${TELEGRAM_CHAT_ID}" \
    -d "parse_mode=HTML" \
    -d "text=$text" > /dev/null 2>&1 || true
}
# =============================================================================
# 錯誤診斷函數
# =============================================================================
# Classify a test-stage failure from its captured log.
# Arguments: $1 - path to the error log file
# Outputs:   three lines on stdout, in this order:
#              DIAGNOSIS:<text>
#              FIX_ACTION:<text>
#              CAN_AUTO_FIX:true|false
# The checks run in a fixed order and the first matching pattern wins. An
# unreadable or missing log falls through to the "unknown" verdict.
diagnose_test_error() {
  local log_path="$1"
  # Verdict used when none of the known patterns is found.
  local summary="未知的測試錯誤"
  local advice="手動檢查日誌"
  local fixable=false
  local detail=""

  if grep -q "ModuleNotFoundError" "$log_path" 2>/dev/null; then
    # Pull the quoted module name out of the first matching line.
    detail=$(grep "ModuleNotFoundError: No module named" "$log_path" | head -1 | sed "s/.*'\([^']*\)'.*/\1/")
    summary="缺少 Python 模組: $detail"
    advice="安裝缺少的模組到 requirements.txt"
  elif grep -q "ImportError" "$log_path" 2>/dev/null; then
    summary="Python 導入錯誤"
    advice="檢查模組路徑和依賴關係"
  elif grep -q "SyntaxError" "$log_path" 2>/dev/null; then
    # The "File ..." line is looked for in the two lines before the error.
    detail=$(grep -B2 "SyntaxError" "$log_path" | grep "File" | head -1)
    summary="Python 語法錯誤: $detail"
    advice="修正語法錯誤"
  elif grep -q "No tests found" "$log_path" 2>/dev/null; then
    # Treated as harmless, hence flagged as auto-fixable.
    summary="沒有找到測試檔案"
    advice="這是正常的,繼續執行"
    fixable=true
  elif grep -q "FAILED" "$log_path" 2>/dev/null; then
    detail=$(grep -c "FAILED" "$log_path" 2>/dev/null || echo "0")
    summary="測試失敗: ${detail} 個測試未通過"
    advice="檢查失敗的測試案例並修正"
  fi

  printf '%s\n' \
    "DIAGNOSIS:$summary" \
    "FIX_ACTION:$advice" \
    "CAN_AUTO_FIX:$fixable"
}
# Classify a build-stage failure from its captured log.
# Arguments: $1 - path to the error log file
# Outputs:   exactly three lines on stdout, in this order:
#              DIAGNOSIS:<text>
#              FIX_ACTION:<text>
#              CAN_AUTO_FIX:true|false
# The checks run in a fixed order and the first matching pattern wins. An
# unreadable or missing log falls through to the "unknown" verdict.
diagnose_build_error() {
  local error_log="$1"
  local diagnosis=""
  local fix_action=""
  local can_auto_fix=false
  local missing_file=""

  if grep -q "unauthorized" "$error_log" 2>/dev/null \
    || grep -q "authentication required" "$error_log" 2>/dev/null; then
    diagnosis="Harbor 認證失敗"
    fix_action="檢查 HARBOR_USER 和 HARBOR_PASSWORD 變數"
    can_auto_fix=false
  elif grep -q "COPY failed" "$error_log" 2>/dev/null; then
    # Only the first failure is reported: without `head -1`, several
    # "COPY failed" lines would make the DIAGNOSIS value span multiple lines
    # and break the one-line-per-key output format.
    missing_file=$(grep "COPY failed" "$error_log" \
      | head -1 \
      | sed 's/.*COPY failed: file not found in build context or excluded by .dockerignore: stat \(.*\):.*/\1/')
    diagnosis="Docker COPY 失敗,檔案不存在: $missing_file"
    fix_action="確認檔案存在且未被 .dockerignore 排除"
    can_auto_fix=false
  elif grep -q "no space left on device" "$error_log" 2>/dev/null; then
    diagnosis="磁碟空間不足"
    fix_action="清理 Docker 映像和快取"
    can_auto_fix=true
  elif grep -q "connection refused" "$error_log" 2>/dev/null; then
    diagnosis="Harbor 連線被拒絕"
    fix_action="檢查 Harbor 服務是否正常運行"
    can_auto_fix=true
  elif grep -q "network is unreachable" "$error_log" 2>/dev/null; then
    diagnosis="網路不可達"
    fix_action="檢查網路連線"
    can_auto_fix=false
  elif grep -q "denied: requested access" "$error_log" 2>/dev/null; then
    diagnosis="Harbor 推送權限不足"
    fix_action="檢查 Harbor 專案權限設定"
    can_auto_fix=false
  else
    diagnosis="未知的構建錯誤"
    fix_action="手動檢查日誌"
    can_auto_fix=false
  fi

  echo "DIAGNOSIS:$diagnosis"
  echo "FIX_ACTION:$fix_action"
  echo "CAN_AUTO_FIX:$can_auto_fix"
}
# Classify a deploy-stage failure from its captured log.
# Arguments: $1 - path to the error log file
# Outputs:   three lines on stdout, in this order:
#              DIAGNOSIS:<text>
#              FIX_ACTION:<text>
#              CAN_AUTO_FIX:true|false
# The rules are tried top to bottom and the first matching pattern wins. An
# unreadable or missing log falls through to the "unknown" verdict.
diagnose_deploy_error() {
  local log_path="$1"
  # Verdict used when none of the rules match.
  local summary="未知的部署錯誤"
  local advice="手動檢查日誌"
  local fixable=false
  local rule pattern hit_summary hit_advice hit_fixable

  # One rule per entry: grep pattern ~ diagnosis ~ suggested fix ~ auto-fixable.
  # '~' is the field separator because it occurs in none of the fields.
  local -a rules=(
    'Permission denied (publickey~SSH 認證失敗 (公鑰)~檢查 UAT_SSH_PRIVATE_KEY 變數~false'
    'Connection refused~SSH 連線被拒絕~檢查 UAT 伺服器 SSH 服務~true'
    'Connection timed out~SSH 連線超時~檢查網路連通性~true'
    'fatal: refusing to merge unrelated histories~Git 歷史不相關~使用 --allow-unrelated-histories~true'
    'error: failed to push some refs~Git 推送失敗~可能有衝突,需要手動解決~false'
    'ImagePullBackOff\|ErrImagePull~K8s 無法拉取映像~檢查 Harbor 映像是否存在,檢查 imagePullSecrets~true'
    'CrashLoopBackOff~K8s Pod 反覆崩潰~檢查應用程式日誌~false'
    'OOMKilled~K8s Pod 記憶體不足~增加 Pod 記憶體限制~false'
    'docker: Error response from daemon~Docker Daemon 錯誤~重啟 Docker 服務~true'
  )

  for rule in "${rules[@]}"; do
    IFS='~' read -r pattern hit_summary hit_advice hit_fixable <<< "$rule"
    if grep -q "$pattern" "$log_path" 2>/dev/null; then
      summary="$hit_summary"
      advice="$hit_advice"
      fixable="$hit_fixable"
      break
    fi
  done

  printf '%s\n' \
    "DIAGNOSIS:$summary" \
    "FIX_ACTION:$advice" \
    "CAN_AUTO_FIX:$fixable"
}
# =============================================================================
# 自動修復函數
# =============================================================================
# Free disk space on the UAT host by pruning Docker resources over SSH.
# Globals:  UAT_USER, UAT_HOST (read)
# Returns:  0 when the remote cleanup session completed,
#           1 when the ssh session (or the final `df`) failed
fix_disk_space() {
  log_info "嘗試清理 Docker 資源..."

  # repair() calls this as `fix_disk_space && ...`, a context in which
  # `set -e` is suspended, so the ssh status has to be checked explicitly;
  # otherwise an unreachable host would be reported as a successful repair.
  if ! ssh -o StrictHostKeyChecking=no "${UAT_USER}@${UAT_HOST}" << 'ENDSSH'
# Remove unused Docker resources, unused volumes included (best effort)
docker system prune -af --volumes 2>/dev/null || true
# Remove project images except the first three rows listed by `docker images`
docker images | grep "momo-pro-system" | tail -n +4 | awk '{print $3}' | xargs -r docker rmi 2>/dev/null || true
# Show the remaining space
df -h /
ENDSSH
  then
    log_error "磁碟清理失敗"
    return 1
  fi

  log_info "磁碟清理完成"
  return 0
}
# Restart the Harbor stack on the UAT host over SSH.
# Globals:  UAT_USER, UAT_HOST (read)
# Returns:  0 when the remote session completed,
#           1 when ssh failed or the Harbor directory could not be entered
# The health probe after the restart is informational only: its result is
# printed but does not affect the return status.
fix_harbor_connection() {
  log_info "嘗試重啟 Harbor 服務..."

  # repair() calls this as `fix_harbor_connection && ...`, where `set -e` is
  # suspended, so the ssh status has to be checked explicitly.
  if ! ssh -o StrictHostKeyChecking=no "${UAT_USER}@${UAT_HOST}" << 'ENDSSH'
# Stop early instead of running compose in the wrong directory
cd /home/wooo/harbor || exit 1
docker compose restart 2>/dev/null || docker-compose restart 2>/dev/null || true
sleep 10
# Print the health endpoint status code
curl -s -o /dev/null -w "%{http_code}" http://localhost:5050/api/v2.0/health || echo "Harbor 健康檢查失敗"
ENDSSH
  then
    log_error "Harbor 重啟失敗"
    return 1
  fi

  log_info "Harbor 重啟完成"
  return 0
}
# Wait, then probe whether SSH to the UAT host works again.
# Globals:  UAT_USER, UAT_HOST (read)
# Returns:  0 when the probe command ran on the host, 1 otherwise
fix_ssh_connection() {
  log_info "嘗試修復 SSH 連線..."

  # Nothing is actively repaired: pause for 30 seconds before probing.
  sleep 30

  # Probe with a short connect timeout; ssh's own stderr is discarded.
  ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 ${UAT_USER}@${UAT_HOST} "echo 'SSH 連線成功'" 2>/dev/null || {
    log_error "SSH 連線仍然失敗"
    return 1
  }

  log_info "SSH 連線已恢復"
  return 0
}
# Recreate the Harbor image-pull secret and restart the deployments on the
# UAT host over SSH.
# Globals:  UAT_USER, UAT_HOST (read)
# Returns:  0 when the secret was recreated and both rollouts were triggered,
#           1 when ssh failed or one of those kubectl steps failed
# NOTE(security): the remote script falls back to a hard-coded registry
# password when /home/wooo/.harbor_password is missing. It is kept so existing
# hosts keep working, but the credential should be rotated and the fallback
# removed.
fix_k8s_image_pull() {
  log_info "嘗試修復 K8s 映像拉取..."

  # repair() calls this as `fix_k8s_image_pull && ...`, where `set -e` is
  # suspended, so the ssh status has to be checked explicitly.
  if ! ssh -o StrictHostKeyChecking=no "${UAT_USER}@${UAT_HOST}" << 'ENDSSH'
# Recreate the registry secret (the delete may fail if it does not exist yet)
kubectl delete secret harbor-registry -n momo 2>/dev/null || true
kubectl create secret docker-registry harbor-registry \
  --docker-server=192.168.0.110:5050 \
  --docker-username=admin \
  --docker-password="$(cat /home/wooo/.harbor_password 2>/dev/null || echo "Wooo_Harbor_2026")" \
  -n momo || exit 1
# Restart the deployments so they pull with the new secret
kubectl rollout restart deployment/momo-app -n momo || exit 1
kubectl rollout restart deployment/momo-scheduler -n momo || exit 1
# Wait for the app pods; a timeout here is tolerated
kubectl wait --for=condition=ready pod -l app=momo-app -n momo --timeout=120s || true
ENDSSH
  then
    log_error "K8s 映像拉取修復失敗"
    return 1
  fi

  log_info "K8s 映像拉取修復完成"
  return 0
}
# Restart the Docker service on the UAT host over SSH and verify it responds.
# Globals:  UAT_USER, UAT_HOST (read)
# Returns:  0 when Docker answers `docker info` after the restart,
#           1 when ssh failed, the restart failed, or Docker is still down
fix_docker_daemon() {
  log_info "嘗試重啟 Docker 服務..."

  # repair() calls this as `fix_docker_daemon && ...`, where `set -e` is
  # suspended, so the ssh status has to be checked explicitly.
  if ! ssh -o StrictHostKeyChecking=no "${UAT_USER}@${UAT_HOST}" << 'ENDSSH'
sudo systemctl restart docker || exit 1
sleep 10
# Use if/else rather than `a && b || c` so a still-broken daemon is reported
# through the exit status and not only as text
if docker info > /dev/null 2>&1; then
  echo "Docker 已恢復"
else
  echo "Docker 仍有問題"
  exit 1
fi
ENDSSH
  then
    log_error "Docker 重啟失敗"
    return 1
  fi

  log_info "Docker 重啟完成"
  return 0
}
# Force the checkout on the UAT host to match the remote `main` branch.
# Globals:  UAT_USER, UAT_HOST (read)
# Returns:  0 when fetch and hard reset both succeeded on the host,
#           1 when ssh failed or any remote git step failed
# The working tree is reset to the fetched commit, so no merge takes place and
# unrelated histories are not an obstacle. `--allow-unrelated-histories` is a
# merge/pull option that `git fetch` does not accept, so it is not passed.
# NOTE(security): the fetch URL embeds an access token. It is kept so the
# command keeps working, but the token should be rotated and supplied through
# a credential helper or the environment instead of this file.
fix_git_history() {
  log_info "嘗試修復 Git 歷史問題..."

  # repair() calls this as `fix_git_history && ...`, where `set -e` is
  # suspended, so the ssh status has to be checked explicitly.
  if ! ssh -o StrictHostKeyChecking=no "${UAT_USER}@${UAT_HOST}" << 'ENDSSH'
cd /home/wooo/momo_pro_system || exit 1
git fetch http://root:glpat-xvT9Dsv7qp7TyJvuBV--@192.168.0.110:8929/root/momo-pro-system.git main || exit 1
# Only reset when the fetch above succeeded, so FETCH_HEAD is never stale
git reset --hard FETCH_HEAD || exit 1
echo "Git 同步完成"
ENDSSH
  then
    log_error "Git 歷史修復失敗"
    return 1
  fi

  log_info "Git 歷史修復完成"
  return 0
}
# =============================================================================
# 主函數
# =============================================================================
# Diagnose a failed pipeline stage, attempt the matching automatic fix, and
# report the outcome by log and Telegram.
# Arguments: $1 - stage name: test | build | deploy
#            $2 - path to the error log file
# Outputs:   progress messages through the log_* helpers
# Exits:     0 when an automatic fix succeeded;
#            1 for an unknown stage, a failed fix, or a problem that needs
#              manual handling. This function never returns to its caller.
repair() {
  local stage="$1"
  local error_log="$2"
  local success=false
  # Kept local so parsing results do not leak into the global scope.
  local result diagnosis fix_action can_auto_fix

  log_info "開始診斷 $stage 階段錯誤..."

  # Pick the diagnosis function for the stage.
  case "$stage" in
    test)
      result=$(diagnose_test_error "$error_log")
      ;;
    build)
      result=$(diagnose_build_error "$error_log")
      ;;
    deploy)
      result=$(diagnose_deploy_error "$error_log")
      ;;
    *)
      log_error "未知的階段: $stage"
      exit 1
      ;;
  esac

  # Parse the KEY:value lines. The key is anchored at the start of the line
  # and only the first match is used, so text inside another value that
  # happens to contain a key name cannot corrupt the result.
  diagnosis=$(printf '%s\n' "$result" | sed -n 's/^DIAGNOSIS://p' | head -n 1)
  fix_action=$(printf '%s\n' "$result" | sed -n 's/^FIX_ACTION://p' | head -n 1)
  can_auto_fix=$(printf '%s\n' "$result" | sed -n 's/^CAN_AUTO_FIX://p' | head -n 1)

  log_info "診斷結果: $diagnosis"
  log_info "建議修復: $fix_action"
  log_info "可自動修復: $can_auto_fix"

  # Announce the diagnosis.
  send_telegram "🔧 <b>[CI/CD 自動診斷]</b>%0A%0A📍 階段: ${stage}%0A🔍 問題: ${diagnosis}%0A💡 建議: ${fix_action}%0A🤖 自動修復: ${can_auto_fix}"

  if [ "$can_auto_fix" = "true" ]; then
    log_info "嘗試自動修復..."
    # The fix is selected by matching the diagnosis text.
    case "$diagnosis" in
      *"磁碟空間不足"*)
        fix_disk_space && success=true
        ;;
      *"Harbor 連線被拒絕"*)
        fix_harbor_connection && success=true
        ;;
      *"SSH 連線"*)
        fix_ssh_connection && success=true
        ;;
      *"K8s 無法拉取映像"*)
        fix_k8s_image_pull && success=true
        ;;
      *"Docker Daemon"*)
        fix_docker_daemon && success=true
        ;;
      *"Git 歷史"*)
        fix_git_history && success=true
        ;;
      *"沒有找到測試"*)
        success=true # not a real error
        ;;
      *)
        log_warn "沒有對應的自動修復程序"
        success=false
        ;;
    esac

    if [ "$success" = true ]; then
      send_telegram "✅ <b>[CI/CD 自動修復]</b>%0A%0A📍 階段: ${stage}%0A🔧 問題: ${diagnosis}%0A✅ 修復成功%0A%0A▶ 建議重新執行 Pipeline"
      log_info "自動修復成功"
      exit 0
    else
      send_telegram "❌ <b>[CI/CD 自動修復失敗]</b>%0A%0A📍 階段: ${stage}%0A🔧 問題: ${diagnosis}%0A❌ 自動修復失敗%0A%0A🔗 請手動檢查"
      log_error "自動修復失敗"
      exit 1
    fi
  else
    send_telegram "⚠️ <b>[CI/CD 需要手動介入]</b>%0A%0A📍 階段: ${stage}%0A🔧 問題: ${diagnosis}%0A💡 建議: ${fix_action}%0A%0A此問題無法自動修復請手動處理。"
    log_warn "此問題需要手動處理"
    exit 1
  fi
}
# =============================================================================
# 健康檢查函數
# =============================================================================
# Probe the UAT host (SSH, Harbor, K8s pods, disk usage) and send a summary
# to Telegram.
# Globals:  UAT_USER, UAT_HOST (read)
# Exits:    0 when every check passed, 1 otherwise. This function never
#           returns to its caller.
health_check() {
  log_info "執行系統健康檢查..."
  local issues=""
  local all_ok=true
  local k8s_status=""
  local disk_usage=""
  local target="${UAT_USER}@${UAT_HOST}"

  # SSH reachability
  if ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 "$target" "echo 1" > /dev/null 2>&1; then
    issues="${issues}%0A✅ SSH 連線正常"
  else
    issues="${issues}%0A❌ SSH 連線失敗"
    all_ok=false
  fi

  # Harbor health endpoint, queried from the host itself
  if ssh -o StrictHostKeyChecking=no "$target" "curl -s -o /dev/null -w '%{http_code}' http://localhost:5050/api/v2.0/health" 2>/dev/null | grep -q "200"; then
    issues="${issues}%0A✅ Harbor 服務正常"
  else
    issues="${issues}%0A❌ Harbor 服務異常"
    all_ok=false
  fi

  # Number of Running pods. `grep -c` prints 0 but exits non-zero when
  # nothing matches, so a fallback `echo 0` would append a second line;
  # instead keep the first line and accept it only if it is a plain number.
  k8s_status=$(ssh -o StrictHostKeyChecking=no "$target" "kubectl get pods -n momo --no-headers 2>/dev/null | grep -c Running" 2>/dev/null) || true
  k8s_status=${k8s_status%%$'\n'*}
  if [[ ! "$k8s_status" =~ ^[0-9]+$ ]]; then
    k8s_status=0
  fi
  if [ "$k8s_status" -ge 2 ]; then
    issues="${issues}%0A✅ K8s Pods 運行中 (${k8s_status} 個)"
  else
    issues="${issues}%0A❌ K8s Pods 異常 (${k8s_status} 個運行中)"
    all_ok=false
  fi

  # Root filesystem usage in percent. A failed probe is reported as a
  # problem rather than being treated as 0% used. A literal percent sign is
  # sent as %25 because the message travels in a form-encoded request body.
  disk_usage=$(ssh -o StrictHostKeyChecking=no "$target" "df -h / | tail -1 | awk '{print \$5}' | tr -d '%'" 2>/dev/null) || true
  disk_usage=${disk_usage%%$'\n'*}
  if [[ ! "$disk_usage" =~ ^[0-9]+$ ]]; then
    issues="${issues}%0A❌ 無法取得磁碟使用率"
    all_ok=false
  elif [ "$disk_usage" -lt 90 ]; then
    issues="${issues}%0A✅ 磁碟空間充足 (已用 ${disk_usage}%25)"
  else
    issues="${issues}%0A⚠ 磁碟空間不足 (已用 ${disk_usage}%25)"
    all_ok=false
  fi

  # Send the report and exit with the overall status.
  if [ "$all_ok" = true ]; then
    send_telegram "💚 <b>[系統健康檢查]</b>%0A%0A所有服務正常運行${issues}"
    exit 0
  else
    send_telegram "💔 <b>[系統健康檢查]</b>%0A%0A發現問題${issues}"
    exit 1
  fi
}
# =============================================================================
# 入口點
# =============================================================================
# Command dispatch: the first argument selects the sub-command.
#   repair <stage> <error_log_file>  diagnose and try to fix a failed stage
#   health                           run the health check
# Anything else (including no argument) prints the help text and exits 0.
if [ "${1:-}" = "repair" ]; then
  # Both the stage and the log path are required.
  if [ -z "$2" ] || [ -z "$3" ]; then
    printf '%s\n' \
      "用法: $0 repair <stage> <error_log_file>" \
      " stage: test | build | deploy"
    exit 1
  fi
  repair "$2" "$3"
elif [ "${1:-}" = "health" ]; then
  health_check
else
  printf '%s\n' \
    "WOOO TECH - CI/CD 自動修復腳本" \
    "" \
    "用法:" \
    " $0 repair <stage> <error_log> - 診斷並修復錯誤" \
    " $0 health - 執行健康檢查" \
    "" \
    "階段:" \
    " test - 測試階段" \
    " build - 構建階段" \
    " deploy - 部署階段"
  exit 0
fi