diff --git a/apps/api/src/services/anomaly_counter.py b/apps/api/src/services/anomaly_counter.py index 17a687a1..e8924631 100644 --- a/apps/api/src/services/anomaly_counter.py +++ b/apps/api/src/services/anomaly_counter.py @@ -309,33 +309,36 @@ class AnomalyCounter: action: 修復動作 (e.g., restart_pod, scale_up) success: 是否成功 """ - repair_key = f"{self.PREFIX_REPAIR_COUNT}{anomaly_key}" + try: + repair_key = f"{self.PREFIX_REPAIR_COUNT}{anomaly_key}" - # 遞增修復嘗試次數 - await self.redis.incr(repair_key) - await self.redis.expire(repair_key, self.TTL_SECONDS) + # 遞增修復嘗試次數 + await self.redis.incr(repair_key) + await self.redis.expire(repair_key, self.TTL_SECONDS) - # 記錄修復歷史 (用於學習) - history_key = f"{self.PREFIX_REPAIR_HISTORY}{anomaly_key}" - await self.redis.lpush( - history_key, - json.dumps( - { - "action": action, - "success": success, - "timestamp": datetime.now().isoformat(), - } - ), - ) - await self.redis.ltrim(history_key, 0, 99) # 只保留最近 100 次 - await self.redis.expire(history_key, self.TTL_SECONDS) + # 記錄修復歷史 (用於學習) + history_key = f"{self.PREFIX_REPAIR_HISTORY}{anomaly_key}" + await self.redis.lpush( + history_key, + json.dumps( + { + "action": action, + "success": success, + "timestamp": datetime.now().isoformat(), + } + ), + ) + await self.redis.ltrim(history_key, 0, 99) # 只保留最近 100 次 + await self.redis.expire(history_key, self.TTL_SECONDS) - logger.info( - "repair_attempt_recorded", - anomaly_key=anomaly_key, - action=action, - success=success, - ) + logger.info( + "repair_attempt_recorded", + anomaly_key=anomaly_key, + action=action, + success=success, + ) + except Exception as e: + logger.warning("record_repair_attempt_redis_error", error=str(e), anomaly_key=anomaly_key) async def mark_permanent_fix_applied( self, @@ -349,28 +352,31 @@ class AnomalyCounter: anomaly_key: 異常 key fix_description: 修復說明 """ - await self.redis.set( - f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}", - "1", - ex=90 * 24 * 3600, # 90 天 - ) + try: + await self.redis.set( + f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}", + "1", + ex=90 * 24 * 3600, # 90 天 + ) - # 記錄修復詳情 - metadata_key = f"{self.PREFIX_METADATA}{anomaly_key}" - await self.redis.hset( - metadata_key, - mapping={ - "permanent_fix_applied": "true", - "permanent_fix_description": fix_description, - "permanent_fix_time": datetime.now().isoformat(), - }, - ) + # 記錄修復詳情 + metadata_key = f"{self.PREFIX_METADATA}{anomaly_key}" + await self.redis.hset( + metadata_key, + mapping={ + "permanent_fix_applied": "true", + "permanent_fix_description": fix_description, + "permanent_fix_time": datetime.now().isoformat(), + }, + ) - logger.info( - "permanent_fix_marked", - anomaly_key=anomaly_key, - fix_description=fix_description, - ) + logger.info( + "permanent_fix_marked", + anomaly_key=anomaly_key, + fix_description=fix_description, + ) + except Exception as e: + logger.warning("mark_permanent_fix_applied_redis_error", error=str(e), anomaly_key=anomaly_key) async def get_repair_success_rate( self, @@ -388,25 +394,34 @@ class AnomalyCounter: 'success_rate': 0.3, } """ - history_key = f"{self.PREFIX_REPAIR_HISTORY}{anomaly_key}" - history = await self.redis.lrange(history_key, 0, -1) + try: + history_key = f"{self.PREFIX_REPAIR_HISTORY}{anomaly_key}" + history = await self.redis.lrange(history_key, 0, -1) - total = 0 - success_count = 0 + total = 0 + success_count = 0 - for item in history: - data = json.loads(item) - if data["action"] == action: - total += 1 - if data["success"]: - success_count += 1 + for item in history: + data = json.loads(item) + if data["action"] == action: + total += 1 + if data["success"]: + success_count += 1 - return { - "action": action, - "total": total, - "success": success_count, - "success_rate": success_count / total if total > 0 else 0.0, - } + return { + "action": action, + "total": total, + "success": success_count, + "success_rate": success_count / total if total > 0 else 0.0, + } + except Exception as e: + logger.warning("get_repair_success_rate_redis_error", error=str(e), anomaly_key=anomaly_key) + return { + "action": action, + "total": 0, + "success": 0, + "success_rate": 0.0, + } async def get_all_repair_stats(self, anomaly_key: str) -> dict[str, dict]: """ @@ -418,30 +433,34 @@ class AnomalyCounter: 'scale_up': {'total': 2, 'success': 1, 'success_rate': 0.5}, } """ - history_key = f"{self.PREFIX_REPAIR_HISTORY}{anomaly_key}" - history = await self.redis.lrange(history_key, 0, -1) + try: + history_key = f"{self.PREFIX_REPAIR_HISTORY}{anomaly_key}" + history = await self.redis.lrange(history_key, 0, -1) - stats: dict[str, dict] = {} + stats: dict[str, dict] = {} - for item in history: - data = json.loads(item) - action = data["action"] + for item in history: + data = json.loads(item) + action = data["action"] - if action not in stats: - stats[action] = {"total": 0, "success": 0} + if action not in stats: + stats[action] = {"total": 0, "success": 0} - stats[action]["total"] += 1 - if data["success"]: - stats[action]["success"] += 1 + stats[action]["total"] += 1 + if data["success"]: + stats[action]["success"] += 1 - # 計算成功率 - for action_stats in stats.values(): - total = action_stats["total"] - action_stats["success_rate"] = ( - action_stats["success"] / total if total > 0 else 0.0 - ) + # 計算成功率 + for action_stats in stats.values(): + total = action_stats["total"] + action_stats["success_rate"] = ( + action_stats["success"] / total if total > 0 else 0.0 + ) - return stats + return stats + except Exception as e: + logger.warning("get_all_repair_stats_redis_error", error=str(e), anomaly_key=anomaly_key) + return {} async def get_frequency(self, anomaly_key: str) -> AnomalyFrequency | None: """ @@ -451,84 +470,88 @@ class AnomalyCounter: anomaly_key: 異常 key Returns: - AnomalyFrequency 或 None (若無記錄) + AnomalyFrequency 或 None (若無記錄 或 Redis 重連失敗) """ - timeline_key = f"{self.PREFIX_TIMELINE}{anomaly_key}" + try: + timeline_key = f"{self.PREFIX_TIMELINE}{anomaly_key}" - # 檢查是否有記錄 - if not await self.redis.exists(timeline_key): + # 檢查是否有記錄 + if not await self.redis.exists(timeline_key): + return None + + now = datetime.now() + cutoff_30d = (now - timedelta(days=30)).timestamp() + + # 計算各時間窗口的計數 + count_1h = await self.redis.zcount( + timeline_key, + (now - timedelta(hours=1)).timestamp(), + "+inf", + ) + count_24h = await self.redis.zcount( + timeline_key, + (now - timedelta(hours=24)).timestamp(), + "+inf", + ) + count_7d = await self.redis.zcount( + timeline_key, + (now - timedelta(days=7)).timestamp(), + "+inf", + ) + count_30d = await self.redis.zcount( + timeline_key, + cutoff_30d, + "+inf", + ) + + # 取得時間範圍 + first_seen_data = await self.redis.zrange( + timeline_key, 0, 0, withscores=True + ) + last_seen_data = await self.redis.zrange( + timeline_key, -1, -1, withscores=True + ) + + first_seen = ( + datetime.fromtimestamp(first_seen_data[0][1]) + if first_seen_data + else now + ) + last_seen = ( + datetime.fromtimestamp(last_seen_data[0][1]) + if last_seen_data + else now + ) + + # 讀取修復統計 + repair_count_str = await self.redis.get( + f"{self.PREFIX_REPAIR_COUNT}{anomaly_key}" + ) + auto_repair_count = int(repair_count_str) if repair_count_str else 0 + + permanent_fix_str = await self.redis.get( + f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}" + ) + permanent_fix = permanent_fix_str == "1" + + escalation_level = self._get_escalation_level(count_24h) + + return AnomalyFrequency( + anomaly_key=anomaly_key, + count_1h=count_1h, + count_24h=count_24h, + count_7d=count_7d, + count_30d=count_30d, + first_seen=first_seen, + last_seen=last_seen, + auto_repair_count=auto_repair_count, + permanent_fix_applied=permanent_fix, + escalation_level=escalation_level, + ) + except Exception as e: + logger.warning("get_frequency_redis_error", error=str(e), anomaly_key=anomaly_key) return None - now = datetime.now() - cutoff_30d = (now - timedelta(days=30)).timestamp() - - # 計算各時間窗口的計數 - count_1h = await self.redis.zcount( - timeline_key, - (now - timedelta(hours=1)).timestamp(), - "+inf", - ) - count_24h = await self.redis.zcount( - timeline_key, - (now - timedelta(hours=24)).timestamp(), - "+inf", - ) - count_7d = await self.redis.zcount( - timeline_key, - (now - timedelta(days=7)).timestamp(), - "+inf", - ) - count_30d = await self.redis.zcount( - timeline_key, - cutoff_30d, - "+inf", - ) - - # 取得時間範圍 - first_seen_data = await self.redis.zrange( - timeline_key, 0, 0, withscores=True - ) - last_seen_data = await self.redis.zrange( - timeline_key, -1, -1, withscores=True - ) - - first_seen = ( - datetime.fromtimestamp(first_seen_data[0][1]) - if first_seen_data - else now - ) - last_seen = ( - datetime.fromtimestamp(last_seen_data[0][1]) - if last_seen_data - else now - ) - - # 讀取修復統計 - repair_count_str = await self.redis.get( - f"{self.PREFIX_REPAIR_COUNT}{anomaly_key}" - ) - auto_repair_count = int(repair_count_str) if repair_count_str else 0 - - permanent_fix_str = await self.redis.get( - f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}" - ) - permanent_fix = permanent_fix_str == "1" - - escalation_level = self._get_escalation_level(count_24h) - - return AnomalyFrequency( - anomaly_key=anomaly_key, - count_1h=count_1h, - count_24h=count_24h, - count_7d=count_7d, - count_30d=count_30d, - first_seen=first_seen, - last_seen=last_seen, - auto_repair_count=auto_repair_count, - permanent_fix_applied=permanent_fix, - escalation_level=escalation_level, - ) - async def should_skip_action( self, anomaly_key: str, diff --git a/apps/api/src/services/incident_service.py b/apps/api/src/services/incident_service.py index 27f4609c..3878690b 100644 --- a/apps/api/src/services/incident_service.py +++ b/apps/api/src/services/incident_service.py @@ -430,6 +430,24 @@ class IncidentService: Incident | None: 成功返回 Incident,失敗返回 None """ try: + # 0. 去抖動 (Debounce) - 防止告警風暴 + fingerprint = signal_data.get("fingerprint") + if fingerprint: + try: + redis_client = get_redis() + debounce_key = f"debounce:{fingerprint}" + # SETNX 若成功表示是新的,給予 3 分鐘 TTL (180s) + is_new = await redis_client.set(debounce_key, "1", ex=180, nx=True) + if not is_new: + logger.info( + "incident_debounced", + fingerprint=fingerprint, + reason="Duplicate signal within 3 minutes", + ) + return None + except Exception as e: + logger.warning("incident_debounce_redis_error", error=str(e)) + # 1. 解析 Signal signal = Signal( alert_name=signal_data.get("alert_name", "unknown"), diff --git a/k8s/awoooi-prod/08-deployment-worker.yaml b/k8s/awoooi-prod/08-deployment-worker.yaml index 7d5e8fd9..677399ba 100644 --- a/k8s/awoooi-prod/08-deployment-worker.yaml +++ b/k8s/awoooi-prod/08-deployment-worker.yaml @@ -49,6 +49,10 @@ spec: # Harbor 金庫: 110 主機 (192.168.0.110:5000) image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER imagePullPolicy: Always + lifecycle: + preStop: + exec: + command: ["/bin/sh", "-c", "sleep 5"] # Worker 模式啟動 (非 HTTP 服務) command: ["python", "-m", "src.workers.signal_worker"] envFrom: