feat(safety-net): complete wave 1 atomicity (adr-038, adr-039, debounce, graceful degrade, xclaim)
This commit is contained in:
@@ -309,33 +309,36 @@ class AnomalyCounter:
|
||||
action: 修復動作 (e.g., restart_pod, scale_up)
|
||||
success: 是否成功
|
||||
"""
|
||||
repair_key = f"{self.PREFIX_REPAIR_COUNT}{anomaly_key}"
|
||||
try:
|
||||
repair_key = f"{self.PREFIX_REPAIR_COUNT}{anomaly_key}"
|
||||
|
||||
# 遞增修復嘗試次數
|
||||
await self.redis.incr(repair_key)
|
||||
await self.redis.expire(repair_key, self.TTL_SECONDS)
|
||||
# 遞增修復嘗試次數
|
||||
await self.redis.incr(repair_key)
|
||||
await self.redis.expire(repair_key, self.TTL_SECONDS)
|
||||
|
||||
# 記錄修復歷史 (用於學習)
|
||||
history_key = f"{self.PREFIX_REPAIR_HISTORY}{anomaly_key}"
|
||||
await self.redis.lpush(
|
||||
history_key,
|
||||
json.dumps(
|
||||
{
|
||||
"action": action,
|
||||
"success": success,
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
}
|
||||
),
|
||||
)
|
||||
await self.redis.ltrim(history_key, 0, 99) # 只保留最近 100 次
|
||||
await self.redis.expire(history_key, self.TTL_SECONDS)
|
||||
# 記錄修復歷史 (用於學習)
|
||||
history_key = f"{self.PREFIX_REPAIR_HISTORY}{anomaly_key}"
|
||||
await self.redis.lpush(
|
||||
history_key,
|
||||
json.dumps(
|
||||
{
|
||||
"action": action,
|
||||
"success": success,
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
}
|
||||
),
|
||||
)
|
||||
await self.redis.ltrim(history_key, 0, 99) # 只保留最近 100 次
|
||||
await self.redis.expire(history_key, self.TTL_SECONDS)
|
||||
|
||||
logger.info(
|
||||
"repair_attempt_recorded",
|
||||
anomaly_key=anomaly_key,
|
||||
action=action,
|
||||
success=success,
|
||||
)
|
||||
logger.info(
|
||||
"repair_attempt_recorded",
|
||||
anomaly_key=anomaly_key,
|
||||
action=action,
|
||||
success=success,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning("record_repair_attempt_redis_error", error=str(e), anomaly_key=anomaly_key)
|
||||
|
||||
async def mark_permanent_fix_applied(
|
||||
self,
|
||||
@@ -349,28 +352,31 @@ class AnomalyCounter:
|
||||
anomaly_key: 異常 key
|
||||
fix_description: 修復說明
|
||||
"""
|
||||
await self.redis.set(
|
||||
f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}",
|
||||
"1",
|
||||
ex=90 * 24 * 3600, # 90 天
|
||||
)
|
||||
try:
|
||||
await self.redis.set(
|
||||
f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}",
|
||||
"1",
|
||||
ex=90 * 24 * 3600, # 90 天
|
||||
)
|
||||
|
||||
# 記錄修復詳情
|
||||
metadata_key = f"{self.PREFIX_METADATA}{anomaly_key}"
|
||||
await self.redis.hset(
|
||||
metadata_key,
|
||||
mapping={
|
||||
"permanent_fix_applied": "true",
|
||||
"permanent_fix_description": fix_description,
|
||||
"permanent_fix_time": datetime.now().isoformat(),
|
||||
},
|
||||
)
|
||||
# 記錄修復詳情
|
||||
metadata_key = f"{self.PREFIX_METADATA}{anomaly_key}"
|
||||
await self.redis.hset(
|
||||
metadata_key,
|
||||
mapping={
|
||||
"permanent_fix_applied": "true",
|
||||
"permanent_fix_description": fix_description,
|
||||
"permanent_fix_time": datetime.now().isoformat(),
|
||||
},
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"permanent_fix_marked",
|
||||
anomaly_key=anomaly_key,
|
||||
fix_description=fix_description,
|
||||
)
|
||||
logger.info(
|
||||
"permanent_fix_marked",
|
||||
anomaly_key=anomaly_key,
|
||||
fix_description=fix_description,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning("mark_permanent_fix_applied_redis_error", error=str(e), anomaly_key=anomaly_key)
|
||||
|
||||
async def get_repair_success_rate(
|
||||
self,
|
||||
@@ -388,25 +394,34 @@ class AnomalyCounter:
|
||||
'success_rate': 0.3,
|
||||
}
|
||||
"""
|
||||
history_key = f"{self.PREFIX_REPAIR_HISTORY}{anomaly_key}"
|
||||
history = await self.redis.lrange(history_key, 0, -1)
|
||||
try:
|
||||
history_key = f"{self.PREFIX_REPAIR_HISTORY}{anomaly_key}"
|
||||
history = await self.redis.lrange(history_key, 0, -1)
|
||||
|
||||
total = 0
|
||||
success_count = 0
|
||||
total = 0
|
||||
success_count = 0
|
||||
|
||||
for item in history:
|
||||
data = json.loads(item)
|
||||
if data["action"] == action:
|
||||
total += 1
|
||||
if data["success"]:
|
||||
success_count += 1
|
||||
for item in history:
|
||||
data = json.loads(item)
|
||||
if data["action"] == action:
|
||||
total += 1
|
||||
if data["success"]:
|
||||
success_count += 1
|
||||
|
||||
return {
|
||||
"action": action,
|
||||
"total": total,
|
||||
"success": success_count,
|
||||
"success_rate": success_count / total if total > 0 else 0.0,
|
||||
}
|
||||
return {
|
||||
"action": action,
|
||||
"total": total,
|
||||
"success": success_count,
|
||||
"success_rate": success_count / total if total > 0 else 0.0,
|
||||
}
|
||||
except Exception as e:
|
||||
logger.warning("get_repair_success_rate_redis_error", error=str(e), anomaly_key=anomaly_key)
|
||||
return {
|
||||
"action": action,
|
||||
"total": 0,
|
||||
"success": 0,
|
||||
"success_rate": 0.0,
|
||||
}
|
||||
|
||||
async def get_all_repair_stats(self, anomaly_key: str) -> dict[str, dict]:
|
||||
"""
|
||||
@@ -418,30 +433,34 @@ class AnomalyCounter:
|
||||
'scale_up': {'total': 2, 'success': 1, 'success_rate': 0.5},
|
||||
}
|
||||
"""
|
||||
history_key = f"{self.PREFIX_REPAIR_HISTORY}{anomaly_key}"
|
||||
history = await self.redis.lrange(history_key, 0, -1)
|
||||
try:
|
||||
history_key = f"{self.PREFIX_REPAIR_HISTORY}{anomaly_key}"
|
||||
history = await self.redis.lrange(history_key, 0, -1)
|
||||
|
||||
stats: dict[str, dict] = {}
|
||||
stats: dict[str, dict] = {}
|
||||
|
||||
for item in history:
|
||||
data = json.loads(item)
|
||||
action = data["action"]
|
||||
for item in history:
|
||||
data = json.loads(item)
|
||||
action = data["action"]
|
||||
|
||||
if action not in stats:
|
||||
stats[action] = {"total": 0, "success": 0}
|
||||
if action not in stats:
|
||||
stats[action] = {"total": 0, "success": 0}
|
||||
|
||||
stats[action]["total"] += 1
|
||||
if data["success"]:
|
||||
stats[action]["success"] += 1
|
||||
stats[action]["total"] += 1
|
||||
if data["success"]:
|
||||
stats[action]["success"] += 1
|
||||
|
||||
# 計算成功率
|
||||
for action_stats in stats.values():
|
||||
total = action_stats["total"]
|
||||
action_stats["success_rate"] = (
|
||||
action_stats["success"] / total if total > 0 else 0.0
|
||||
)
|
||||
# 計算成功率
|
||||
for action_stats in stats.values():
|
||||
total = action_stats["total"]
|
||||
action_stats["success_rate"] = (
|
||||
action_stats["success"] / total if total > 0 else 0.0
|
||||
)
|
||||
|
||||
return stats
|
||||
return stats
|
||||
except Exception as e:
|
||||
logger.warning("get_all_repair_stats_redis_error", error=str(e), anomaly_key=anomaly_key)
|
||||
return {}
|
||||
|
||||
async def get_frequency(self, anomaly_key: str) -> AnomalyFrequency | None:
|
||||
"""
|
||||
@@ -451,84 +470,88 @@ class AnomalyCounter:
|
||||
anomaly_key: 異常 key
|
||||
|
||||
Returns:
|
||||
AnomalyFrequency 或 None (若無記錄)
|
||||
AnomalyFrequency 或 None (若無記錄 或 Redis 重連失敗)
|
||||
"""
|
||||
timeline_key = f"{self.PREFIX_TIMELINE}{anomaly_key}"
|
||||
try:
|
||||
timeline_key = f"{self.PREFIX_TIMELINE}{anomaly_key}"
|
||||
|
||||
# 檢查是否有記錄
|
||||
if not await self.redis.exists(timeline_key):
|
||||
# 檢查是否有記錄
|
||||
if not await self.redis.exists(timeline_key):
|
||||
return None
|
||||
|
||||
now = datetime.now()
|
||||
cutoff_30d = (now - timedelta(days=30)).timestamp()
|
||||
|
||||
# 計算各時間窗口的計數
|
||||
count_1h = await self.redis.zcount(
|
||||
timeline_key,
|
||||
(now - timedelta(hours=1)).timestamp(),
|
||||
"+inf",
|
||||
)
|
||||
count_24h = await self.redis.zcount(
|
||||
timeline_key,
|
||||
(now - timedelta(hours=24)).timestamp(),
|
||||
"+inf",
|
||||
)
|
||||
count_7d = await self.redis.zcount(
|
||||
timeline_key,
|
||||
(now - timedelta(days=7)).timestamp(),
|
||||
"+inf",
|
||||
)
|
||||
count_30d = await self.redis.zcount(
|
||||
timeline_key,
|
||||
cutoff_30d,
|
||||
"+inf",
|
||||
)
|
||||
|
||||
# 取得時間範圍
|
||||
first_seen_data = await self.redis.zrange(
|
||||
timeline_key, 0, 0, withscores=True
|
||||
)
|
||||
last_seen_data = await self.redis.zrange(
|
||||
timeline_key, -1, -1, withscores=True
|
||||
)
|
||||
|
||||
first_seen = (
|
||||
datetime.fromtimestamp(first_seen_data[0][1])
|
||||
if first_seen_data
|
||||
else now
|
||||
)
|
||||
last_seen = (
|
||||
datetime.fromtimestamp(last_seen_data[0][1])
|
||||
if last_seen_data
|
||||
else now
|
||||
)
|
||||
|
||||
# 讀取修復統計
|
||||
repair_count_str = await self.redis.get(
|
||||
f"{self.PREFIX_REPAIR_COUNT}{anomaly_key}"
|
||||
)
|
||||
auto_repair_count = int(repair_count_str) if repair_count_str else 0
|
||||
|
||||
permanent_fix_str = await self.redis.get(
|
||||
f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}"
|
||||
)
|
||||
permanent_fix = permanent_fix_str == "1"
|
||||
|
||||
escalation_level = self._get_escalation_level(count_24h)
|
||||
|
||||
return AnomalyFrequency(
|
||||
anomaly_key=anomaly_key,
|
||||
count_1h=count_1h,
|
||||
count_24h=count_24h,
|
||||
count_7d=count_7d,
|
||||
count_30d=count_30d,
|
||||
first_seen=first_seen,
|
||||
last_seen=last_seen,
|
||||
auto_repair_count=auto_repair_count,
|
||||
permanent_fix_applied=permanent_fix,
|
||||
escalation_level=escalation_level,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning("get_frequency_redis_error", error=str(e), anomaly_key=anomaly_key)
|
||||
return None
|
||||
|
||||
now = datetime.now()
|
||||
cutoff_30d = (now - timedelta(days=30)).timestamp()
|
||||
|
||||
# 計算各時間窗口的計數
|
||||
count_1h = await self.redis.zcount(
|
||||
timeline_key,
|
||||
(now - timedelta(hours=1)).timestamp(),
|
||||
"+inf",
|
||||
)
|
||||
count_24h = await self.redis.zcount(
|
||||
timeline_key,
|
||||
(now - timedelta(hours=24)).timestamp(),
|
||||
"+inf",
|
||||
)
|
||||
count_7d = await self.redis.zcount(
|
||||
timeline_key,
|
||||
(now - timedelta(days=7)).timestamp(),
|
||||
"+inf",
|
||||
)
|
||||
count_30d = await self.redis.zcount(
|
||||
timeline_key,
|
||||
cutoff_30d,
|
||||
"+inf",
|
||||
)
|
||||
|
||||
# 取得時間範圍
|
||||
first_seen_data = await self.redis.zrange(
|
||||
timeline_key, 0, 0, withscores=True
|
||||
)
|
||||
last_seen_data = await self.redis.zrange(
|
||||
timeline_key, -1, -1, withscores=True
|
||||
)
|
||||
|
||||
first_seen = (
|
||||
datetime.fromtimestamp(first_seen_data[0][1])
|
||||
if first_seen_data
|
||||
else now
|
||||
)
|
||||
last_seen = (
|
||||
datetime.fromtimestamp(last_seen_data[0][1])
|
||||
if last_seen_data
|
||||
else now
|
||||
)
|
||||
|
||||
# 讀取修復統計
|
||||
repair_count_str = await self.redis.get(
|
||||
f"{self.PREFIX_REPAIR_COUNT}{anomaly_key}"
|
||||
)
|
||||
auto_repair_count = int(repair_count_str) if repair_count_str else 0
|
||||
|
||||
permanent_fix_str = await self.redis.get(
|
||||
f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}"
|
||||
)
|
||||
permanent_fix = permanent_fix_str == "1"
|
||||
|
||||
escalation_level = self._get_escalation_level(count_24h)
|
||||
|
||||
return AnomalyFrequency(
|
||||
anomaly_key=anomaly_key,
|
||||
count_1h=count_1h,
|
||||
count_24h=count_24h,
|
||||
count_7d=count_7d,
|
||||
count_30d=count_30d,
|
||||
first_seen=first_seen,
|
||||
last_seen=last_seen,
|
||||
auto_repair_count=auto_repair_count,
|
||||
permanent_fix_applied=permanent_fix,
|
||||
escalation_level=escalation_level,
|
||||
)
|
||||
|
||||
async def should_skip_action(
|
||||
self,
|
||||
anomaly_key: str,
|
||||
|
||||
@@ -430,6 +430,24 @@ class IncidentService:
|
||||
Incident | None: 成功返回 Incident,失敗返回 None
|
||||
"""
|
||||
try:
|
||||
# 0. 去抖動 (Debounce) - 防止告警風暴
|
||||
fingerprint = signal_data.get("fingerprint")
|
||||
if fingerprint:
|
||||
try:
|
||||
redis_client = get_redis()
|
||||
debounce_key = f"debounce:{fingerprint}"
|
||||
# SETNX 若成功表示是新的,給予 3 分鐘 TTL (180s)
|
||||
is_new = await redis_client.set(debounce_key, "1", ex=180, nx=True)
|
||||
if not is_new:
|
||||
logger.info(
|
||||
"incident_debounced",
|
||||
fingerprint=fingerprint,
|
||||
reason="Duplicate signal within 3 minutes",
|
||||
)
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.warning("incident_debounce_redis_error", error=str(e))
|
||||
|
||||
# 1. 解析 Signal
|
||||
signal = Signal(
|
||||
alert_name=signal_data.get("alert_name", "unknown"),
|
||||
|
||||
@@ -49,6 +49,10 @@ spec:
|
||||
# Harbor 金庫: 110 主機 (192.168.0.110:5000)
|
||||
image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
|
||||
imagePullPolicy: Always
|
||||
lifecycle:
|
||||
preStop:
|
||||
exec:
|
||||
command: ["/bin/sh", "-c", "sleep 5"]
|
||||
# Worker 模式啟動 (非 HTTP 服務)
|
||||
command: ["python", "-m", "src.workers.signal_worker"]
|
||||
envFrom:
|
||||
|
||||
Reference in New Issue
Block a user