feat(safety-net): complete wave 1 atomicity (adr-038, adr-039, debounce, graceful degrade, xclaim)
Some checks failed
E2E Health Check / e2e-health (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled

This commit is contained in:
OG T
2026-03-29 23:55:38 +08:00
parent e802600482
commit 89f0bae3f2
3 changed files with 197 additions and 152 deletions

View File

@@ -309,33 +309,36 @@ class AnomalyCounter:
action: 修復動作 (e.g., restart_pod, scale_up)
success: 是否成功
"""
repair_key = f"{self.PREFIX_REPAIR_COUNT}{anomaly_key}"
try:
repair_key = f"{self.PREFIX_REPAIR_COUNT}{anomaly_key}"
# 遞增修復嘗試次數
await self.redis.incr(repair_key)
await self.redis.expire(repair_key, self.TTL_SECONDS)
# 遞增修復嘗試次數
await self.redis.incr(repair_key)
await self.redis.expire(repair_key, self.TTL_SECONDS)
# 記錄修復歷史 (用於學習)
history_key = f"{self.PREFIX_REPAIR_HISTORY}{anomaly_key}"
await self.redis.lpush(
history_key,
json.dumps(
{
"action": action,
"success": success,
"timestamp": datetime.now().isoformat(),
}
),
)
await self.redis.ltrim(history_key, 0, 99) # 只保留最近 100 次
await self.redis.expire(history_key, self.TTL_SECONDS)
# 記錄修復歷史 (用於學習)
history_key = f"{self.PREFIX_REPAIR_HISTORY}{anomaly_key}"
await self.redis.lpush(
history_key,
json.dumps(
{
"action": action,
"success": success,
"timestamp": datetime.now().isoformat(),
}
),
)
await self.redis.ltrim(history_key, 0, 99) # 只保留最近 100 次
await self.redis.expire(history_key, self.TTL_SECONDS)
logger.info(
"repair_attempt_recorded",
anomaly_key=anomaly_key,
action=action,
success=success,
)
logger.info(
"repair_attempt_recorded",
anomaly_key=anomaly_key,
action=action,
success=success,
)
except Exception as e:
logger.warning("record_repair_attempt_redis_error", error=str(e), anomaly_key=anomaly_key)
async def mark_permanent_fix_applied(
self,
@@ -349,28 +352,31 @@ class AnomalyCounter:
anomaly_key: 異常 key
fix_description: 修復說明
"""
await self.redis.set(
f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}",
"1",
ex=90 * 24 * 3600, # 90 天
)
try:
await self.redis.set(
f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}",
"1",
ex=90 * 24 * 3600, # 90 天
)
# 記錄修復詳情
metadata_key = f"{self.PREFIX_METADATA}{anomaly_key}"
await self.redis.hset(
metadata_key,
mapping={
"permanent_fix_applied": "true",
"permanent_fix_description": fix_description,
"permanent_fix_time": datetime.now().isoformat(),
},
)
# 記錄修復詳情
metadata_key = f"{self.PREFIX_METADATA}{anomaly_key}"
await self.redis.hset(
metadata_key,
mapping={
"permanent_fix_applied": "true",
"permanent_fix_description": fix_description,
"permanent_fix_time": datetime.now().isoformat(),
},
)
logger.info(
"permanent_fix_marked",
anomaly_key=anomaly_key,
fix_description=fix_description,
)
logger.info(
"permanent_fix_marked",
anomaly_key=anomaly_key,
fix_description=fix_description,
)
except Exception as e:
logger.warning("mark_permanent_fix_applied_redis_error", error=str(e), anomaly_key=anomaly_key)
async def get_repair_success_rate(
self,
@@ -388,25 +394,34 @@ class AnomalyCounter:
'success_rate': 0.3,
}
"""
history_key = f"{self.PREFIX_REPAIR_HISTORY}{anomaly_key}"
history = await self.redis.lrange(history_key, 0, -1)
try:
history_key = f"{self.PREFIX_REPAIR_HISTORY}{anomaly_key}"
history = await self.redis.lrange(history_key, 0, -1)
total = 0
success_count = 0
total = 0
success_count = 0
for item in history:
data = json.loads(item)
if data["action"] == action:
total += 1
if data["success"]:
success_count += 1
for item in history:
data = json.loads(item)
if data["action"] == action:
total += 1
if data["success"]:
success_count += 1
return {
"action": action,
"total": total,
"success": success_count,
"success_rate": success_count / total if total > 0 else 0.0,
}
return {
"action": action,
"total": total,
"success": success_count,
"success_rate": success_count / total if total > 0 else 0.0,
}
except Exception as e:
logger.warning("get_repair_success_rate_redis_error", error=str(e), anomaly_key=anomaly_key)
return {
"action": action,
"total": 0,
"success": 0,
"success_rate": 0.0,
}
async def get_all_repair_stats(self, anomaly_key: str) -> dict[str, dict]:
"""
@@ -418,30 +433,34 @@ class AnomalyCounter:
'scale_up': {'total': 2, 'success': 1, 'success_rate': 0.5},
}
"""
history_key = f"{self.PREFIX_REPAIR_HISTORY}{anomaly_key}"
history = await self.redis.lrange(history_key, 0, -1)
try:
history_key = f"{self.PREFIX_REPAIR_HISTORY}{anomaly_key}"
history = await self.redis.lrange(history_key, 0, -1)
stats: dict[str, dict] = {}
stats: dict[str, dict] = {}
for item in history:
data = json.loads(item)
action = data["action"]
for item in history:
data = json.loads(item)
action = data["action"]
if action not in stats:
stats[action] = {"total": 0, "success": 0}
if action not in stats:
stats[action] = {"total": 0, "success": 0}
stats[action]["total"] += 1
if data["success"]:
stats[action]["success"] += 1
stats[action]["total"] += 1
if data["success"]:
stats[action]["success"] += 1
# 計算成功率
for action_stats in stats.values():
total = action_stats["total"]
action_stats["success_rate"] = (
action_stats["success"] / total if total > 0 else 0.0
)
# 計算成功率
for action_stats in stats.values():
total = action_stats["total"]
action_stats["success_rate"] = (
action_stats["success"] / total if total > 0 else 0.0
)
return stats
return stats
except Exception as e:
logger.warning("get_all_repair_stats_redis_error", error=str(e), anomaly_key=anomaly_key)
return {}
async def get_frequency(self, anomaly_key: str) -> AnomalyFrequency | None:
"""
@@ -451,84 +470,88 @@ class AnomalyCounter:
anomaly_key: 異常 key
Returns:
AnomalyFrequency 或 None (若無記錄)
AnomalyFrequency 或 None (若無記錄，或 Redis 操作發生任何例外時降級返回 None)
"""
timeline_key = f"{self.PREFIX_TIMELINE}{anomaly_key}"
try:
timeline_key = f"{self.PREFIX_TIMELINE}{anomaly_key}"
# 檢查是否有記錄
if not await self.redis.exists(timeline_key):
# 檢查是否有記錄
if not await self.redis.exists(timeline_key):
return None
now = datetime.now()
cutoff_30d = (now - timedelta(days=30)).timestamp()
# 計算各時間窗口的計數
count_1h = await self.redis.zcount(
timeline_key,
(now - timedelta(hours=1)).timestamp(),
"+inf",
)
count_24h = await self.redis.zcount(
timeline_key,
(now - timedelta(hours=24)).timestamp(),
"+inf",
)
count_7d = await self.redis.zcount(
timeline_key,
(now - timedelta(days=7)).timestamp(),
"+inf",
)
count_30d = await self.redis.zcount(
timeline_key,
cutoff_30d,
"+inf",
)
# 取得時間範圍
first_seen_data = await self.redis.zrange(
timeline_key, 0, 0, withscores=True
)
last_seen_data = await self.redis.zrange(
timeline_key, -1, -1, withscores=True
)
first_seen = (
datetime.fromtimestamp(first_seen_data[0][1])
if first_seen_data
else now
)
last_seen = (
datetime.fromtimestamp(last_seen_data[0][1])
if last_seen_data
else now
)
# 讀取修復統計
repair_count_str = await self.redis.get(
f"{self.PREFIX_REPAIR_COUNT}{anomaly_key}"
)
auto_repair_count = int(repair_count_str) if repair_count_str else 0
permanent_fix_str = await self.redis.get(
f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}"
)
permanent_fix = permanent_fix_str == "1"
escalation_level = self._get_escalation_level(count_24h)
return AnomalyFrequency(
anomaly_key=anomaly_key,
count_1h=count_1h,
count_24h=count_24h,
count_7d=count_7d,
count_30d=count_30d,
first_seen=first_seen,
last_seen=last_seen,
auto_repair_count=auto_repair_count,
permanent_fix_applied=permanent_fix,
escalation_level=escalation_level,
)
except Exception as e:
logger.warning("get_frequency_redis_error", error=str(e), anomaly_key=anomaly_key)
return None
now = datetime.now()
cutoff_30d = (now - timedelta(days=30)).timestamp()
# 計算各時間窗口的計數
count_1h = await self.redis.zcount(
timeline_key,
(now - timedelta(hours=1)).timestamp(),
"+inf",
)
count_24h = await self.redis.zcount(
timeline_key,
(now - timedelta(hours=24)).timestamp(),
"+inf",
)
count_7d = await self.redis.zcount(
timeline_key,
(now - timedelta(days=7)).timestamp(),
"+inf",
)
count_30d = await self.redis.zcount(
timeline_key,
cutoff_30d,
"+inf",
)
# 取得時間範圍
first_seen_data = await self.redis.zrange(
timeline_key, 0, 0, withscores=True
)
last_seen_data = await self.redis.zrange(
timeline_key, -1, -1, withscores=True
)
first_seen = (
datetime.fromtimestamp(first_seen_data[0][1])
if first_seen_data
else now
)
last_seen = (
datetime.fromtimestamp(last_seen_data[0][1])
if last_seen_data
else now
)
# 讀取修復統計
repair_count_str = await self.redis.get(
f"{self.PREFIX_REPAIR_COUNT}{anomaly_key}"
)
auto_repair_count = int(repair_count_str) if repair_count_str else 0
permanent_fix_str = await self.redis.get(
f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}"
)
permanent_fix = permanent_fix_str == "1"
escalation_level = self._get_escalation_level(count_24h)
return AnomalyFrequency(
anomaly_key=anomaly_key,
count_1h=count_1h,
count_24h=count_24h,
count_7d=count_7d,
count_30d=count_30d,
first_seen=first_seen,
last_seen=last_seen,
auto_repair_count=auto_repair_count,
permanent_fix_applied=permanent_fix,
escalation_level=escalation_level,
)
async def should_skip_action(
self,
anomaly_key: str,

View File

@@ -430,6 +430,24 @@ class IncidentService:
Incident | None: 成功返回 Incident，失敗返回 None
"""
try:
# 0. 去抖動 (Debounce) - 防止告警風暴
fingerprint = signal_data.get("fingerprint")
if fingerprint:
try:
redis_client = get_redis()
debounce_key = f"debounce:{fingerprint}"
# SETNX 若成功表示是新的,給予 3 分鐘 TTL (180s)
is_new = await redis_client.set(debounce_key, "1", ex=180, nx=True)
if not is_new:
logger.info(
"incident_debounced",
fingerprint=fingerprint,
reason="Duplicate signal within 3 minutes",
)
return None
except Exception as e:
logger.warning("incident_debounce_redis_error", error=str(e))
# 1. 解析 Signal
signal = Signal(
alert_name=signal_data.get("alert_name", "unknown"),

View File

@@ -49,6 +49,10 @@ spec:
# Harbor 金庫: 110 主機 (192.168.0.110:5000)
image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
imagePullPolicy: Always
lifecycle:
preStop:
exec:
command: ["/bin/sh", "-c", "sleep 5"]
# Worker 模式啟動 (非 HTTP 服務)
command: ["python", "-m", "src.workers.signal_worker"]
envFrom: