From 03c345d46d1f8211b5faf000940f04c4bbad97ac Mon Sep 17 00:00:00 2001 From: ogt Date: Wed, 22 Apr 2026 11:14:48 +0800 Subject: [PATCH] fix: drift-scanner pods cleanup script and guide - add cleanup script for failed drift-scanner pods - add comprehensive fix guide with prevention strategies - resolve pod resource issues in K8s cluster --- DRIFT_SCANNER_FIX_GUIDE.md | 159 +++++++++++++++++++ scripts/cleanup_drift_scanner_pods.sh | 215 ++++++++++++++++++++++++++ 2 files changed, 374 insertions(+) create mode 100644 DRIFT_SCANNER_FIX_GUIDE.md create mode 100755 scripts/cleanup_drift_scanner_pods.sh diff --git a/DRIFT_SCANNER_FIX_GUIDE.md b/DRIFT_SCANNER_FIX_GUIDE.md new file mode 100644 index 0000000..bb679ef --- /dev/null +++ b/DRIFT_SCANNER_FIX_GUIDE.md @@ -0,0 +1,159 @@ +# Drift Scanner Pods Fix Guide + +## Problem Analysis +You have multiple drift-scanner pods that are failing or completed: +- 6 Failed pods: drift-scanner-29602260-sns8t, drift-scanner-29602320-vf6dj, etc. +- 3 Succeeded pods: drift-scanner-29613600-x67jn, etc. + +## Root Cause +These drift-scanner pods appear to be external Kubernetes Jobs (not part of your codebase) that are created by an external system or cron job. The numeric suffixes suggest time-based job scheduling. 
+ +## Immediate Fix Actions + +### Option 1: Manual Cleanup (Quick Fix) +```bash +# SSH to your K8s server +ssh wooo@192.168.0.110 + +# Delete failed pods +sudo kubectl delete pod drift-scanner-29602260-sns8t -n momo --force --grace-period=0 +sudo kubectl delete pod drift-scanner-29602320-vf6dj -n momo --force --grace-period=0 +sudo kubectl delete pod drift-scanner-29602380-862vh -n momo --force --grace-period=0 +sudo kubectl delete pod drift-scanner-29602440-mwd7m -n momo --force --grace-period=0 +sudo kubectl delete pod drift-scanner-29602500-gpr27 -n momo --force --grace-period=0 + +# Delete old succeeded pods (optional) +sudo kubectl delete pod drift-scanner-29613600-x67jn -n momo --force --grace-period=0 +sudo kubectl delete pod drift-scanner-29613660-7tk4d -n momo --force --grace-period=0 +sudo kubectl delete pod drift-scanner-29613720-c7zcp -n momo --force --grace-period=0 +``` + +### Option 2: Use Cleanup Script +```bash +# On K8s server +cd /home/wooo/scripts +./cleanup_drift_scanner_pods.sh cleanup-failed +``` + +### Option 3: Batch Cleanup +```bash +# Delete all drift-scanner pods at once +sudo kubectl delete pods -l app=drift-scanner -n momo --force --grace-period=0 +``` + +## Prevention Strategies + +### 1. Identify the Source +Find what's creating these drift-scanner jobs: +```bash +# Check for CronJobs +sudo kubectl get cronjobs -n momo +sudo kubectl get cronjobs --all-namespaces | grep drift + +# Check for scheduled jobs +sudo kubectl get jobs -n momo | grep drift +sudo kubectl get jobs --all-namespaces | grep drift + +# Check events +sudo kubectl get events -n momo --sort-by='.lastTimestamp' | grep drift +``` + +### 2. 
Monitor and Auto-Cleanup
+Add to your existing health monitoring script (`/home/wooo/scripts/k8s_health_monitor.sh`):
+
+```bash
+# Add this function to the script: warns and cleans up when finished drift-scanner pods pile up
+check_drift_scanner_pods() {
+    local failed_pods succeeded_pods failed_count succeeded_count
+    # Select on the pod phase: the STATUS column prints Error/Completed, not Failed/Succeeded
+    failed_pods=$(sudo kubectl get pods -n "${MOMO_NAMESPACE}" --field-selector=status.phase=Failed -o name | grep drift-scanner || true)
+    succeeded_pods=$(sudo kubectl get pods -n "${MOMO_NAMESPACE}" --field-selector=status.phase=Succeeded -o name | grep drift-scanner || true)
+    failed_count=$(grep -c . <<< "$failed_pods" || true)
+    succeeded_count=$(grep -c . <<< "$succeeded_pods" || true)
+
+    if [[ $failed_count -gt 5 ]] || [[ $succeeded_count -gt 10 ]]; then
+        log "WARNING: Too many drift-scanner pods (Failed: $failed_count, Succeeded: $succeeded_count)"
+
+        # Auto-cleanup failed pods (names arrive as pod/<name>, which kubectl delete accepts)
+        xargs -r sudo kubectl delete -n "${MOMO_NAMESPACE}" --force --grace-period=0 <<< "$failed_pods"
+
+        # Auto-cleanup ALL succeeded pods (no age filter is applied here)
+        xargs -r sudo kubectl delete -n "${MOMO_NAMESPACE}" --force --grace-period=0 <<< "$succeeded_pods"
+    fi
+}
+```
+
+### 3. 
Resource Limits
+If these are legitimate jobs, consider setting resource limits and TTL:
+```yaml
+# Example Job template with TTL
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: drift-scanner
+spec:
+  ttlSecondsAfterFinished: 3600  # Clean up after 1 hour
+  backoffLimit: 3  # Limit retries
+  template:
+    spec:
+      containers:
+      - name: drift-scanner
+        image: your-image
+        resources:
+          requests:
+            memory: "128Mi"
+            cpu: "100m"
+          limits:
+            memory: "512Mi"
+            cpu: "500m"
+      restartPolicy: OnFailure
+```
+
+## Monitoring Setup
+
+### Add to Prometheus Monitoring
+Add this alert to a rules file that `/home/wooo/monitoring/prometheus.yml` loads through `rule_files` (alerting rules cannot go in `prometheus.yml` itself):
+
+```yaml
+- alert: TooManyFailedDriftScannerPods
+  expr: sum(kube_pod_status_phase{phase="Failed", pod=~"drift-scanner-.*"}) > 5
+  for: 5m
+  labels:
+    severity: warning
+  annotations:
+    summary: "Too many failed drift-scanner pods"
+    description: "More than 5 drift-scanner pods have failed"
+```
+
+### Telegram Alert Integration
+The cleanup script already includes Telegram notifications when pods are cleaned up.
+
+## Long-term Solutions
+
+1. **Identify the Owner**: Find which system or team is creating these drift-scanner jobs
+2. **Fix the Root Cause**: Address why these jobs are failing
+3. **Implement TTL**: Add `ttlSecondsAfterFinished` to job specifications
+4. **Resource Quotas**: Set limits to prevent resource exhaustion
+5. **Regular Cleanup**: Schedule the cleanup script to run periodically
+
+## Emergency Commands
+```bash
+# Quick check of drift-scanner status
+sudo kubectl get pods -n momo | grep drift-scanner
+
+# Force delete all drift-scanner pods (kubectl rejects --all combined with --selector)
+sudo kubectl delete pods -n momo --force --grace-period=0 --selector=app=drift-scanner
+
+# Check what's creating them
+sudo kubectl get events -n momo --sort-by='.lastTimestamp' | tail -20
+```
+
+## Files Created
+- `scripts/cleanup_drift_scanner_pods.sh` - Comprehensive cleanup script
+- `DRIFT_SCANNER_FIX_GUIDE.md` - This guide
+
+## Next Steps
+1. 
Run the immediate cleanup commands
+2. Identify the source of drift-scanner jobs
+3. Implement prevention measures
+4. Set up monitoring and auto-cleanup
diff --git a/scripts/cleanup_drift_scanner_pods.sh b/scripts/cleanup_drift_scanner_pods.sh
new file mode 100755
index 0000000..8fc74a4
--- /dev/null
+++ b/scripts/cleanup_drift_scanner_pods.sh
@@ -0,0 +1,275 @@
+#!/bin/bash
+# Drift Scanner Pods Cleanup Script
+# Cleans up finished drift-scanner pods and provides analysis.
+#
+# Usage: cleanup_drift_scanner_pods.sh [action] [max-age-hours]
+#   (pass an unknown action to print the full usage text)
+#
+# Environment (all optional):
+#   NAMESPACE           namespace to inspect (default: momo)
+#   LOG_FILE            file the log is appended to
+#   TELEGRAM_BOT_TOKEN  bot token; notifications are off when unset
+#   TELEGRAM_CHAT_ID    chat id; notifications are off when unset
+#
+# Requires kubectl, curl and GNU date (`date -d`).
+
+set -euo pipefail
+
+# Configuration (overridable from the environment)
+NAMESPACE="${NAMESPACE:-momo}"
+LOG_FILE="${LOG_FILE:-/var/log/drift_scanner_cleanup.log}"
+# SECURITY: credentials come from the environment and are never stored here.
+# A bot token was previously committed in this file; treat it as leaked and
+# revoke it before reusing the bot.
+TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN:-}"
+TELEGRAM_CHAT_ID="${TELEGRAM_CHAT_ID:-}"
+
+# Number of pods removed by the most recent cleanup_* call. A global is used
+# because exit statuses wrap at 256 and any non-zero one aborts under set -e.
+CLEANED_COUNT=0
+
+# Colors for output (ANSI-C quoting stores the real escape bytes)
+RED=$'\033[0;31m'
+GREEN=$'\033[0;32m'
+YELLOW=$'\033[1;33m'
+NC=$'\033[0m' # No Color
+
+# Logging function: timestamps $1, prints it to stderr and appends a
+# color-free copy to LOG_FILE. stderr keeps stdout free for function results;
+# an unwritable log file is ignored so logging can never abort the cleanup.
+log() {
+    local line plain code
+    line="[$(date '+%Y-%m-%d %H:%M:%S')] $1"
+    printf '%s\n' "$line" >&2
+    plain="$line"
+    for code in "$RED" "$GREEN" "$YELLOW" "$NC"; do
+        plain="${plain//"$code"/}"
+    done
+    { printf '%s\n' "$plain" >> "$LOG_FILE"; } 2> /dev/null || true
+}
+
+# Print the usage text to stderr.
+usage() {
+    {
+        echo "Usage: $0 [analyze|cleanup-failed|cleanup-succeeded|cleanup-all] [max-age-hours]"
+        echo "  analyze           - Show current pod status (default)"
+        echo "  cleanup-failed    - Delete all failed pods"
+        echo "  cleanup-succeeded - Delete succeeded pods older than max-age-hours (default: 24h)"
+        echo "  cleanup-all       - Delete failed pods and old succeeded pods"
+    } >&2
+}
+
+# Telegram notification (best effort). $1 is the message text. Does nothing
+# unless both credentials are set; a delivery failure is logged, not fatal.
+send_telegram() {
+    local message="$1"
+    if [[ -z "$TELEGRAM_BOT_TOKEN" || -z "$TELEGRAM_CHAT_ID" ]]; then
+        return 0
+    fi
+    # --data-urlencode keeps spaces and '&' in the text from breaking the form body
+    if ! curl -s --max-time 10 -X POST \
+        "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
+        --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
+        --data-urlencode "text=${message}" \
+        -d parse_mode="HTML" > /dev/null 2>&1; then
+        log "${YELLOW}WARNING: Telegram notification could not be sent${NC}"
+    fi
+}
+
+# Check kubectl access; exits 1 when pods in NAMESPACE cannot be listed.
+check_kubectl() {
+    if ! kubectl get pods -n "$NAMESPACE" > /dev/null 2>&1; then
+        log "${RED}ERROR: kubectl access denied or namespace not found${NC}"
+        exit 1
+    fi
+}
+
+# Get drift-scanner pods as "NAME PHASE CREATED" lines on stdout.
+# $1 (optional) keeps only pods in that phase, e.g. Failed or Succeeded.
+# The phase is read from .status.phase because the STATUS column of
+# `kubectl get pods` shows container reasons such as Error or Completed.
+get_drift_scanner_pods() {
+    local phase="${1:-}"
+    kubectl get pods -n "$NAMESPACE" --no-headers \
+        -o custom-columns=NAME:.metadata.name,PHASE:.status.phase,CREATED:.metadata.creationTimestamp \
+        | awk -v phase="$phase" '$1 ~ /drift-scanner/ && (phase == "" || $2 == phase)'
+}
+
+# Delete one pod ($1) and log the outcome. Returns 0 on success, 1 on failure.
+delete_pod() {
+    local pod_name="$1"
+    # stdin is detached so kubectl cannot consume the caller's here-string
+    if kubectl delete pod "$pod_name" -n "$NAMESPACE" --force --grace-period=0 < /dev/null; then
+        log "${GREEN}Successfully deleted: $pod_name${NC}"
+    else
+        log "${RED}Failed to delete: $pod_name${NC}"
+        return 1
+    fi
+}
+
+# Analyze pod status: logs one line per pod plus a summary, then prints
+# "total|failed|succeeded|running" on stdout for callers that want the counts.
+analyze_pods() {
+    log "${YELLOW}=== Drift Scanner Pod Analysis ===${NC}"
+
+    local pods
+    pods=$(get_drift_scanner_pods)
+    if [[ -z "$pods" ]]; then
+        log "${GREEN}No drift-scanner pods found${NC}"
+        echo "0|0|0|0"
+        return 0
+    fi
+
+    local total=0 failed=0 succeeded=0 running=0
+    local pod_name phase created
+    while read -r pod_name phase created; do
+        [[ -z "$pod_name" ]] && continue
+        # x=$((x + 1)) rather than ((x++)): the latter returns 1 when x is 0
+        total=$((total + 1))
+        case "$phase" in
+            "Failed")
+                failed=$((failed + 1))
+                log "${RED}FAILED: $pod_name (Created: $created)${NC}"
+                ;;
+            "Succeeded")
+                succeeded=$((succeeded + 1))
+                log "${GREEN}SUCCEEDED: $pod_name (Created: $created)${NC}"
+                ;;
+            "Running")
+                running=$((running + 1))
+                log "${YELLOW}RUNNING: $pod_name (Created: $created)${NC}"
+                ;;
+            *)
+                log "${YELLOW}UNKNOWN: $pod_name (Status: $phase, Created: $created)${NC}"
+                ;;
+        esac
+    done <<< "$pods"
+
+    log "${YELLOW}Summary: Total=$total, Failed=$failed, Succeeded=$succeeded, Running=$running${NC}"
+    echo "$total|$failed|$succeeded|$running"
+}
+
+# Clean up failed pods. Sets CLEANED_COUNT to the number of pods deleted.
+cleanup_failed_pods() {
+    CLEANED_COUNT=0
+    log "${YELLOW}=== Cleaning Up Failed Pods ===${NC}"
+
+    local failed_pods
+    failed_pods=$(get_drift_scanner_pods Failed)
+    if [[ -z "$failed_pods" ]]; then
+        log "${GREEN}No failed drift-scanner pods to clean${NC}"
+        return 0
+    fi
+
+    local pod_name _rest
+    while read -r pod_name _rest; do
+        [[ -z "$pod_name" ]] && continue
+        log "Deleting failed pod: $pod_name"
+        if delete_pod "$pod_name"; then
+            CLEANED_COUNT=$((CLEANED_COUNT + 1))
+        fi
+    done <<< "$failed_pods"
+
+    log "${GREEN}Cleaned up $CLEANED_COUNT failed pods${NC}"
+}
+
+# Clean up succeeded pods created more than $1 hours ago (default 24).
+# Sets CLEANED_COUNT. Age is computed from creationTimestamp, so multi-unit
+# ages such as 2d3h are handled; $1 must be a whole number.
+cleanup_succeeded_pods() {
+    local max_age_hours="${1:-24}"
+    CLEANED_COUNT=0
+    log "${YELLOW}=== Cleaning Up Succeeded Pods (older than ${max_age_hours}h) ===${NC}"
+
+    local succeeded_pods
+    succeeded_pods=$(get_drift_scanner_pods Succeeded)
+    if [[ -z "$succeeded_pods" ]]; then
+        log "${GREEN}No succeeded drift-scanner pods to clean${NC}"
+        return 0
+    fi
+
+    local now max_age_seconds pod_name _phase created created_epoch
+    now=$(date +%s)
+    # 10# forces base 10 so a value like 08 is not parsed as octal
+    max_age_seconds=$((10#$max_age_hours * 3600))
+    while read -r pod_name _phase created; do
+        [[ -z "$pod_name" ]] && continue
+        if ! created_epoch=$(date -d "$created" +%s 2> /dev/null); then
+            log "${RED}Skipping $pod_name: unreadable creation time '$created'${NC}"
+            continue
+        fi
+        if ((now - created_epoch > max_age_seconds)); then
+            log "Deleting old succeeded pod: $pod_name (Created: $created)"
+            if delete_pod "$pod_name"; then
+                CLEANED_COUNT=$((CLEANED_COUNT + 1))
+            fi
+        fi
+    done <<< "$succeeded_pods"
+
+    log "${GREEN}Cleaned up $CLEANED_COUNT old succeeded pods${NC}"
+}
+
+# Main function. $1 is the action (default: analyze), $2 the maximum age in
+# hours for succeeded pods (default: 24). Exits 1 on invalid arguments.
+main() {
+    local action="${1:-analyze}"
+    local max_age="${2:-24}"
+    local cleaned_failed=0 cleaned_succeeded=0
+
+    # Reject bad arguments before touching the cluster
+    case "$action" in
+        analyze | cleanup-failed | cleanup-succeeded | cleanup-all) ;;
+        *)
+            usage
+            exit 1
+            ;;
+    esac
+    if ! [[ "$max_age" =~ ^[0-9]+$ ]]; then
+        log "${RED}ERROR: max-age-hours must be a whole number, got '$max_age'${NC}"
+        exit 1
+    fi
+
+    log "${YELLOW}=== Drift Scanner Pod Cleanup Script ===${NC}"
+    log "Action: $action"
+    log "Namespace: $NAMESPACE"
+
+    # Check kubectl access
+    check_kubectl
+
+    # Analyze current state. The analysis is logged; the machine-readable
+    # counts line printed on stdout is not needed here.
+    analyze_pods > /dev/null
+
+    case "$action" in
+        "analyze")
+            log "${GREEN}Analysis complete. No cleanup performed.${NC}"
+            ;;
+        "cleanup-failed")
+            cleanup_failed_pods
+            cleaned_failed=$CLEANED_COUNT
+            ;;
+        "cleanup-succeeded")
+            cleanup_succeeded_pods "$max_age"
+            cleaned_succeeded=$CLEANED_COUNT
+            ;;
+        "cleanup-all")
+            cleanup_failed_pods
+            cleaned_failed=$CLEANED_COUNT
+            cleanup_succeeded_pods "$max_age"
+            cleaned_succeeded=$CLEANED_COUNT
+            ;;
+    esac
+
+    # Notify after any cleanup action, cleanup-all included
+    if ((cleaned_failed > 0)); then
+        send_telegram "Cleaned up $cleaned_failed failed drift-scanner pods"
+    fi
+    if ((cleaned_succeeded > 0)); then
+        send_telegram "Cleaned up $cleaned_succeeded old succeeded drift-scanner pods"
+    fi
+
+    log "${GREEN}=== Script completed ===${NC}"
+}
+
+# Run main function with all arguments
+main "$@"