Phase A: AnomalyCounter 服務 (4h)
- Redis Sorted Set 滑動窗口計數
- 頻率閾值告警 (REPEAT/ESCALATE/PERMANENT_FIX)
- Tier 決策邏輯整合

Phase B: Database Exporters (3h)
- pg_exporter: 連接池/慢查詢/鎖等待/膨脹監控
- redis_exporter: 記憶體/命中率/驅逐監控
- 15+ 告警規則

Phase C: Incident 頻率欄位 (2h)
- IncidentFrequencyStats 模型
- 告警聚合邏輯 (10 分鐘窗口)
- 前端頻率顯示

Phase D: Sentry Comment 回寫 (1h)
- 完成 TODO 實作
- Sentry API Token 配置

Phase E: SigNoz 告警規則 (2h)
- Error Rate / Latency 告警
- Trace 異常檢測
- SigNoz Webhook Handler

Phase F: Alert Chain E2E (2h)
- Smoke Test 腳本
- CD Pipeline 整合
- 鏈路監控告警

Phase G: Learning Service (3h)
- 修復效果學習
- 成功率計算
- Playbook 自動更新

總工時: 17h (2-3 天)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
700 lines
21 KiB
Markdown
700 lines
21 KiB
Markdown
# AnomalyCounter 服務實施步驟
|
|
|
|
> **優先級**: P0
> **預估工時**: 4h
> **目標**: 建立異常頻率追蹤能力
|
|
|
|
---
|
|
|
|
## Step 1: 建立 anomaly_counter.py (1h)
|
|
|
|
### 1.1 建立檔案
|
|
|
|
```bash
|
|
# Create the (empty) service module that Step 1.2 fills in
touch apps/api/src/services/anomaly_counter.py
|
|
```
|
|
|
|
### 1.2 實作 AnomalyCounter 類別
|
|
|
|
```python
|
|
# apps/api/src/services/anomaly_counter.py
|
|
"""
|
|
異常頻率統計服務
|
|
================================
|
|
2026-03-29 ogt: 監控戰略規劃 Section 9 實作
|
|
|
|
使用 Redis Sorted Set 實作滑動窗口計數:
|
|
- ZADD anomaly:timeline:{key} {timestamp} {timestamp}
|
|
- ZCOUNT anomaly:timeline:{key} {start} +inf
|
|
- ZREMRANGEBYSCORE anomaly:timeline:{key} -inf {cutoff}
|
|
"""
|
|
|
|
import hashlib
|
|
import json
|
|
from datetime import datetime, timedelta
|
|
from typing import NamedTuple
|
|
|
|
import redis.asyncio as redis
|
|
import structlog
|
|
|
|
logger = structlog.get_logger(__name__)
|
|
|
|
|
|
class AnomalyFrequency(NamedTuple):
    """Frequency snapshot for one anomaly signature."""

    # Identifier of the anomaly (AnomalyCounter passes its signature hash here).
    anomaly_key: str
    # Occurrence counts over the trailing 1 hour / 24 hours / 7 days / 30 days.
    count_1h: int
    count_24h: int
    count_7d: int
    count_30d: int
    # Timestamps of the earliest and latest occurrence known to the producer.
    first_seen: datetime
    last_seen: datetime
    # Number of repair attempts recorded for this anomaly.
    auto_repair_count: int
    # True once a permanent fix has been marked as applied.
    permanent_fix_applied: bool
    escalation_level: str | None  # None, REPEAT, ESCALATE, PERMANENT_FIX
|
|
|
|
|
|
class AnomalyCounter:
|
|
"""
|
|
異常計數器 - 追蹤每種異常的發生頻率
|
|
|
|
閾值配置 (可透過環境變數覆寫):
|
|
- ANOMALY_REPEAT_THRESHOLD: 3 (預設)
|
|
- ANOMALY_ESCALATE_THRESHOLD: 5 (預設)
|
|
- ANOMALY_PERMANENT_FIX_THRESHOLD: 10 (預設)
|
|
"""
|
|
|
|
THRESHOLDS = {
|
|
'REPEAT': 3, # 3 次 → 重複告警
|
|
'ESCALATE': 5, # 5 次 → 人工介入
|
|
'PERMANENT_FIX': 10, # 10 次 → 必須永久修復
|
|
}
|
|
|
|
# Redis Key 前綴
|
|
PREFIX_TIMELINE = "anomaly:timeline:"
|
|
PREFIX_REPAIR_COUNT = "anomaly:repair_count:"
|
|
PREFIX_PERMANENT_FIX = "anomaly:permanent_fix:"
|
|
PREFIX_METADATA = "anomaly:metadata:"
|
|
|
|
def __init__(self, redis_client: redis.Redis):
|
|
self.redis = redis_client
|
|
|
|
@staticmethod
|
|
def _hash_signature(signature: dict) -> str:
|
|
"""
|
|
生成異常簽名的 hash key
|
|
|
|
簽名欄位:
|
|
- alert_name: 告警名稱 (e.g., PodCrashLoopBackOff)
|
|
- service: 服務名稱 (e.g., awoooi-api)
|
|
- namespace: K8s 命名空間 (e.g., awoooi-prod)
|
|
- error_type: 錯誤類型 (e.g., OOMKilled)
|
|
"""
|
|
# 只取關鍵欄位,忽略時間戳等易變欄位
|
|
key_fields = {
|
|
'alert_name': signature.get('alert_name', signature.get('alertname', '')),
|
|
'service': signature.get('service', signature.get('job', '')),
|
|
'namespace': signature.get('namespace', ''),
|
|
'error_type': signature.get('error_type', signature.get('reason', '')),
|
|
}
|
|
# 排序確保一致性
|
|
canonical = json.dumps(key_fields, sort_keys=True)
|
|
return hashlib.sha256(canonical.encode()).hexdigest()[:16]
|
|
|
|
async def record_anomaly(self, anomaly_signature: dict) -> AnomalyFrequency:
|
|
"""
|
|
記錄一次異常發生
|
|
|
|
Args:
|
|
anomaly_signature: 異常簽名字典
|
|
|
|
Returns:
|
|
AnomalyFrequency: 當前頻率統計
|
|
"""
|
|
anomaly_key = self._hash_signature(anomaly_signature)
|
|
now = datetime.now()
|
|
timestamp = now.timestamp()
|
|
timeline_key = f"{self.PREFIX_TIMELINE}{anomaly_key}"
|
|
|
|
# 1. 添加到 Sorted Set (score = timestamp, member = timestamp string)
|
|
await self.redis.zadd(timeline_key, {str(timestamp): timestamp})
|
|
|
|
# 2. 清理過期數據 (30 天前)
|
|
cutoff_30d = (now - timedelta(days=30)).timestamp()
|
|
await self.redis.zremrangebyscore(timeline_key, '-inf', cutoff_30d)
|
|
|
|
# 3. 設置 TTL (35 天,比清理週期長一點)
|
|
await self.redis.expire(timeline_key, 35 * 24 * 3600)
|
|
|
|
# 4. 計算各時間窗口的計數
|
|
count_1h = await self.redis.zcount(
|
|
timeline_key,
|
|
(now - timedelta(hours=1)).timestamp(),
|
|
'+inf'
|
|
)
|
|
count_24h = await self.redis.zcount(
|
|
timeline_key,
|
|
(now - timedelta(hours=24)).timestamp(),
|
|
'+inf'
|
|
)
|
|
count_7d = await self.redis.zcount(
|
|
timeline_key,
|
|
(now - timedelta(days=7)).timestamp(),
|
|
'+inf'
|
|
)
|
|
count_30d = await self.redis.zcount(
|
|
timeline_key,
|
|
cutoff_30d,
|
|
'+inf'
|
|
)
|
|
|
|
# 5. 取得首次/最近時間
|
|
first_seen_data = await self.redis.zrange(timeline_key, 0, 0, withscores=True)
|
|
last_seen_data = await self.redis.zrange(timeline_key, -1, -1, withscores=True)
|
|
|
|
first_seen = datetime.fromtimestamp(first_seen_data[0][1]) if first_seen_data else now
|
|
last_seen = datetime.fromtimestamp(last_seen_data[0][1]) if last_seen_data else now
|
|
|
|
# 6. 讀取修復統計
|
|
auto_repair_count = int(await self.redis.get(f"{self.PREFIX_REPAIR_COUNT}{anomaly_key}") or 0)
|
|
permanent_fix = await self.redis.get(f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}") == b'1'
|
|
|
|
# 7. 儲存 metadata (首次記錄時)
|
|
metadata_key = f"{self.PREFIX_METADATA}{anomaly_key}"
|
|
if not await self.redis.exists(metadata_key):
|
|
await self.redis.hset(metadata_key, mapping={
|
|
'signature': json.dumps(anomaly_signature),
|
|
'first_seen': now.isoformat(),
|
|
})
|
|
await self.redis.expire(metadata_key, 35 * 24 * 3600)
|
|
|
|
# 8. 判斷升級等級
|
|
escalation_level = self._get_escalation_level(count_24h)
|
|
|
|
freq = AnomalyFrequency(
|
|
anomaly_key=anomaly_key,
|
|
count_1h=count_1h,
|
|
count_24h=count_24h,
|
|
count_7d=count_7d,
|
|
count_30d=count_30d,
|
|
first_seen=first_seen,
|
|
last_seen=last_seen,
|
|
auto_repair_count=auto_repair_count,
|
|
permanent_fix_applied=permanent_fix,
|
|
escalation_level=escalation_level,
|
|
)
|
|
|
|
# 9. 記錄日誌
|
|
logger.info(
|
|
"anomaly_recorded",
|
|
anomaly_key=anomaly_key,
|
|
count_1h=count_1h,
|
|
count_24h=count_24h,
|
|
count_30d=count_30d,
|
|
escalation_level=escalation_level,
|
|
)
|
|
|
|
return freq
|
|
|
|
def _get_escalation_level(self, count_24h: int) -> str | None:
|
|
"""判斷升級等級"""
|
|
if count_24h >= self.THRESHOLDS['PERMANENT_FIX']:
|
|
return 'PERMANENT_FIX'
|
|
elif count_24h >= self.THRESHOLDS['ESCALATE']:
|
|
return 'ESCALATE'
|
|
elif count_24h >= self.THRESHOLDS['REPEAT']:
|
|
return 'REPEAT'
|
|
return None
|
|
|
|
async def record_repair_attempt(self, anomaly_key: str, action: str, success: bool):
|
|
"""
|
|
記錄修復嘗試
|
|
|
|
Args:
|
|
anomaly_key: 異常 key
|
|
action: 修復動作 (e.g., restart_pod, scale_up)
|
|
success: 是否成功
|
|
"""
|
|
repair_key = f"{self.PREFIX_REPAIR_COUNT}{anomaly_key}"
|
|
|
|
# 遞增修復嘗試次數
|
|
await self.redis.incr(repair_key)
|
|
await self.redis.expire(repair_key, 35 * 24 * 3600)
|
|
|
|
# 記錄修復歷史 (用於學習)
|
|
history_key = f"anomaly:repair_history:{anomaly_key}"
|
|
await self.redis.lpush(history_key, json.dumps({
|
|
'action': action,
|
|
'success': success,
|
|
'timestamp': datetime.now().isoformat(),
|
|
}))
|
|
await self.redis.ltrim(history_key, 0, 99) # 只保留最近 100 次
|
|
await self.redis.expire(history_key, 35 * 24 * 3600)
|
|
|
|
logger.info(
|
|
"repair_attempt_recorded",
|
|
anomaly_key=anomaly_key,
|
|
action=action,
|
|
success=success,
|
|
)
|
|
|
|
async def mark_permanent_fix_applied(self, anomaly_key: str, fix_description: str):
|
|
"""
|
|
標記已套用永久修復
|
|
|
|
Args:
|
|
anomaly_key: 異常 key
|
|
fix_description: 修復說明
|
|
"""
|
|
await self.redis.set(f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}", '1')
|
|
await self.redis.expire(f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}", 90 * 24 * 3600) # 90 天
|
|
|
|
# 記錄修復詳情
|
|
metadata_key = f"{self.PREFIX_METADATA}{anomaly_key}"
|
|
await self.redis.hset(metadata_key, mapping={
|
|
'permanent_fix_applied': 'true',
|
|
'permanent_fix_description': fix_description,
|
|
'permanent_fix_time': datetime.now().isoformat(),
|
|
})
|
|
|
|
logger.info(
|
|
"permanent_fix_marked",
|
|
anomaly_key=anomaly_key,
|
|
fix_description=fix_description,
|
|
)
|
|
|
|
async def get_repair_success_rate(self, anomaly_key: str, action: str) -> dict:
|
|
"""
|
|
取得特定動作的修復成功率
|
|
|
|
Returns:
|
|
{
|
|
'action': 'restart_pod',
|
|
'total': 10,
|
|
'success': 3,
|
|
'success_rate': 0.3,
|
|
}
|
|
"""
|
|
history_key = f"anomaly:repair_history:{anomaly_key}"
|
|
history = await self.redis.lrange(history_key, 0, -1)
|
|
|
|
total = 0
|
|
success = 0
|
|
|
|
for item in history:
|
|
data = json.loads(item)
|
|
if data['action'] == action:
|
|
total += 1
|
|
if data['success']:
|
|
success += 1
|
|
|
|
return {
|
|
'action': action,
|
|
'total': total,
|
|
'success': success,
|
|
'success_rate': success / total if total > 0 else 0.0,
|
|
}
|
|
|
|
async def get_all_repair_stats(self, anomaly_key: str) -> dict[str, dict]:
|
|
"""
|
|
取得所有修復動作的統計
|
|
|
|
Returns:
|
|
{
|
|
'restart_pod': {'total': 10, 'success': 3, 'success_rate': 0.3},
|
|
'scale_up': {'total': 2, 'success': 1, 'success_rate': 0.5},
|
|
}
|
|
"""
|
|
history_key = f"anomaly:repair_history:{anomaly_key}"
|
|
history = await self.redis.lrange(history_key, 0, -1)
|
|
|
|
stats: dict[str, dict] = {}
|
|
|
|
for item in history:
|
|
data = json.loads(item)
|
|
action = data['action']
|
|
|
|
if action not in stats:
|
|
stats[action] = {'total': 0, 'success': 0}
|
|
|
|
stats[action]['total'] += 1
|
|
if data['success']:
|
|
stats[action]['success'] += 1
|
|
|
|
# 計算成功率
|
|
for action, s in stats.items():
|
|
s['success_rate'] = s['success'] / s['total'] if s['total'] > 0 else 0.0
|
|
|
|
return stats
|
|
|
|
|
|
# =============================================================================
|
|
# Singleton 模式
|
|
# =============================================================================
|
|
_anomaly_counter: AnomalyCounter | None = None


def get_anomaly_counter() -> AnomalyCounter:
    """Return the module-wide AnomalyCounter, creating it on first use."""
    global _anomaly_counter
    if _anomaly_counter is not None:
        return _anomaly_counter

    # Imported here rather than at module top, so the Redis client module is
    # only loaded when the counter is first requested.
    from src.core.redis import get_redis_client

    _anomaly_counter = AnomalyCounter(get_redis_client())
    return _anomaly_counter
|
|
```
|
|
|
|
---
|
|
|
|
## Step 2: 整合到 alertmanager_webhook.py (1h)
|
|
|
|
### 2.1 在收到告警時記錄頻率
|
|
|
|
```python
|
|
# apps/api/src/api/v1/alertmanager_webhook.py
|
|
# 在 handle_alertmanager 函數中新增
|
|
|
|
from src.services.anomaly_counter import get_anomaly_counter
|
|
|
|
async def handle_alertmanager(request: Request, background_tasks: BackgroundTasks):
    # ... existing code ...

    # 🆕 Record the occurrence frequency of every incoming alert
    anomaly_counter = get_anomaly_counter()
    for alert in alerts:
        labels = alert.get('labels', {})
        anomaly_signature = {
            'alert_name': labels.get('alertname'),
            'service': labels.get('job'),
            'namespace': labels.get('namespace'),
            'error_type': labels.get('reason'),
        }
        freq = await anomaly_counter.record_anomaly(anomaly_signature)

        # Hand the frequency data to the downstream processing steps.
        # NOTE(review): _asdict() keeps first_seen/last_seen as datetime
        # objects; confirm nothing downstream JSON-serialises the alert as-is.
        alert['_anomaly_frequency'] = freq._asdict()

    # ... continue with the existing flow ...
|
|
```
|
|
|
|
### 2.2 在 Telegram 告警中顯示頻率
|
|
|
|
```python
|
|
# apps/api/src/services/telegram_gateway.py
|
|
# 修改 send_approval_card 方法,新增頻率資訊
|
|
|
|
async def send_approval_card(
    self,
    approval_id: str,
    risk_level: str,
    resource_name: str,
    root_cause: str,
    suggested_action: str,
    primary_responsibility: str,
    confidence: float,
    namespace: str,
    anomaly_frequency: dict | None = None,  # 🆕 new parameter
):
    """Excerpt: build the optional frequency section of the approval card.

    ``anomaly_frequency`` is read with the keys ``count_1h``, ``count_24h``,
    ``count_7d``, ``count_30d``, ``auto_repair_count`` and
    ``escalation_level``. The section stays empty unless the 24-hour count is
    greater than 1. The rest of the method is elided from this excerpt.
    """
    # ... existing code ...

    # 🆕 Frequency section
    frequency_section = ""
    if anomaly_frequency and anomaly_frequency.get('count_24h', 0) > 1:
        freq = anomaly_frequency
        # Emoji shown next to the heading; unknown levels fall back to "".
        escalation_emoji = {
            None: "",
            'REPEAT': "⚠️",
            'ESCALATE': "🔴",
            'PERMANENT_FIX': "🚨",
        }.get(freq.get('escalation_level'), "")

        frequency_section = f"""
📊 頻率統計 {escalation_emoji}:
• 1小時: {freq.get('count_1h', 0)} 次
• 24小時: {freq.get('count_24h', 0)} 次
• 7天: {freq.get('count_7d', 0)} 次
• 30天: {freq.get('count_30d', 0)} 次
• 修復嘗試: {freq.get('auto_repair_count', 0)} 次
"""
        # Append the escalation hint only when a level was assigned.
        if freq.get('escalation_level'):
            frequency_section += f" 🔺 升級建議: {freq['escalation_level']}\n"

    # Insert into the alert card
    # ...
|
|
```
|
|
|
|
---
|
|
|
|
## Step 3: 整合到 sentry_webhook.py (30min)
|
|
|
|
### 3.1 Sentry 告警也要記錄頻率
|
|
|
|
```python
|
|
# apps/api/src/api/v1/sentry_webhook.py
|
|
# 在 analyze_and_comment 函數中新增
|
|
|
|
from src.services.anomaly_counter import get_anomaly_counter
|
|
|
|
async def analyze_and_comment(
    error_context: dict,
    issue_id: str,
    project_slug: str
):
    """Excerpt: record the Sentry error's frequency and pass it to the Telegram alert."""
    # 🆕 Record the anomaly frequency
    # NOTE(review): AnomalyCounter._hash_signature only hashes alert_name,
    # service, namespace and error_type, so 'culprit' does not affect which
    # counter is incremented — confirm that grouping by project + title is
    # what is wanted.
    signature = {
        'alert_name': 'sentry_error',
        'service': error_context.get('project', 'unknown'),
        'error_type': error_context.get('title', 'unknown'),
        'culprit': error_context.get('culprit', 'unknown'),
    }
    frequency = await get_anomaly_counter().record_anomaly(signature)

    # Forward to the Telegram alert.
    # (`analysis` and `approval_id` are not defined in this excerpt; presumably
    # they come from the elided part of the function.)
    await send_sentry_telegram_alert(
        error_context=error_context,
        analysis=analysis,
        approval_id=approval_id,
        anomaly_frequency=frequency._asdict(),  # 🆕
    )
|
|
```
|
|
|
|
---
|
|
|
|
## Step 4: 整合到 auto_repair_service.py (1h)
|
|
|
|
### 4.1 修復前檢查頻率,決定 Tier
|
|
|
|
```python
|
|
# apps/api/src/services/auto_repair_service.py
|
|
# 新增 Tier 決策邏輯
|
|
|
|
from src.services.anomaly_counter import get_anomaly_counter, AnomalyFrequency
|
|
|
|
class AutoRepairService:
    """Excerpt: tier selection for automated repairs."""

    async def determine_repair_tier(
        self,
        anomaly_key: str,
        frequency: AnomalyFrequency,
    ) -> int:
        """Choose the repair tier from the anomaly frequency and repair history.

        Returns:
            1: temporary fix (restart)
            2: mitigation (scale out)
            3: root-cause fix (configuration change)
            4: architectural fix (needs development)
        """
        # Repair history for this anomaly, grouped by action.
        repair_stats = await get_anomaly_counter().get_all_repair_stats(anomaly_key)

        # Restarts attempted so far, across both restart actions.
        restarts = sum(
            repair_stats.get(restart_action, {}).get('total', 0)
            for restart_action in ('restart_pod', 'restart_container')
        )

        # A permanent fix exists but the anomaly still fires -> architectural fix.
        if frequency.permanent_fix_applied:
            return 4

        # >=10 times in 24h -> root-cause fix; >=5 times in 24h -> mitigation.
        tier_for_level = {'PERMANENT_FIX': 3, 'ESCALATE': 2}
        level = frequency.escalation_level
        if level in tier_for_level:
            return tier_for_level[level]

        # Two restarts already -> move up to mitigation; otherwise stay temporary.
        return 2 if restarts >= 2 else 1

    async def get_tier_actions(self, tier: int) -> list[str]:
        """Return the repair actions available for a tier (unknown tiers get tier 1's)."""
        actions_by_tier = {
            1: ['restart_pod', 'restart_container'],
            2: ['scale_up', 'increase_memory', 'adjust_limits'],
            3: ['apply_hotfix', 'update_config', 'patch_deployment'],
            4: ['create_issue', 'notify_team', 'schedule_fix'],
        }
        if tier not in actions_by_tier:
            tier = 1
        return actions_by_tier[tier]
|
|
```
|
|
|
|
### 4.2 修復後記錄結果
|
|
|
|
```python
|
|
# apps/api/src/services/auto_repair_service.py
|
|
# 在執行修復後
|
|
|
|
async def execute_repair(self, ...):
    # Pseudo-code excerpt: the parameter list and the repair itself are elided.
    # ... perform the repair ...

    # 🆕 Record the repair attempt
    counter = get_anomaly_counter()
    await counter.record_repair_attempt(
        anomaly_key=anomaly_key,
        action=repair_action,
        success=result.success,
    )

    # A successful Tier 3 repair is recorded as a permanent fix
    if tier == 3 and result.success:
        await counter.mark_permanent_fix_applied(
            anomaly_key=anomaly_key,
            fix_description=f"Applied {repair_action}: {result.message}",
        )
|
|
```
|
|
|
|
---
|
|
|
|
## Step 5: 單元測試 (30min)
|
|
|
|
### 5.1 建立測試檔案
|
|
|
|
```python
|
|
# apps/api/tests/test_anomaly_counter.py
|
|
"""
|
|
AnomalyCounter 單元測試
|
|
"""
|
|
|
|
import pytest
|
|
from datetime import datetime, timedelta
|
|
from unittest.mock import AsyncMock, MagicMock
|
|
from src.services.anomaly_counter import AnomalyCounter, AnomalyFrequency
|
|
|
|
|
|
@pytest.fixture
def mock_redis():
    """AsyncMock standing in for the async Redis client."""
    client = AsyncMock()

    # Commands whose return value the tests never inspect.
    for command in ('zadd', 'zremrangebyscore', 'expire', 'hset'):
        setattr(client, command, AsyncMock())

    # Commands with canned replies.
    client.zcount = AsyncMock(return_value=5)
    client.zrange = AsyncMock(return_value=[(b'123', 1234567890.0)])
    client.get = AsyncMock(return_value=None)
    client.exists = AsyncMock(return_value=False)
    return client
|
|
|
|
|
|
@pytest.fixture
def counter(mock_redis):
    """AnomalyCounter wired to the mocked Redis client."""
    return AnomalyCounter(mock_redis)
|
|
|
|
|
|
class TestHashSignature:
    """_hash_signature is deterministic, field-sensitive and ignores extra keys."""

    def test_same_input_same_hash(self):
        first = AnomalyCounter._hash_signature({'alert_name': 'PodCrash', 'service': 'api'})
        second = AnomalyCounter._hash_signature({'alert_name': 'PodCrash', 'service': 'api'})
        assert first == second

    def test_different_input_different_hash(self):
        api_hash = AnomalyCounter._hash_signature({'alert_name': 'PodCrash', 'service': 'api'})
        web_hash = AnomalyCounter._hash_signature({'alert_name': 'PodCrash', 'service': 'web'})
        assert api_hash != web_hash

    def test_ignores_extra_fields(self):
        base = {'alert_name': 'PodCrash', 'service': 'api'}
        with_extra = {'alert_name': 'PodCrash', 'service': 'api', 'timestamp': '2026-01-01'}
        assert AnomalyCounter._hash_signature(base) == AnomalyCounter._hash_signature(with_extra)
|
|
|
|
|
|
class TestEscalationLevel:
    """_get_escalation_level at and between the threshold boundaries."""

    def test_no_escalation(self, counter):
        level = counter._get_escalation_level(2)
        assert level is None

    def test_repeat_level(self, counter):
        for count in (3, 4):
            assert counter._get_escalation_level(count) == 'REPEAT'

    def test_escalate_level(self, counter):
        for count in (5, 9):
            assert counter._get_escalation_level(count) == 'ESCALATE'

    def test_permanent_fix_level(self, counter):
        for count in (10, 100):
            assert counter._get_escalation_level(count) == 'PERMANENT_FIX'
|
|
|
|
|
|
class TestRecordAnomaly:
    """record_anomaly writes to Redis and returns an AnomalyFrequency."""

    @pytest.mark.asyncio
    async def test_records_to_redis(self, counter, mock_redis):
        result = await counter.record_anomaly({'alert_name': 'PodCrash', 'service': 'api'})

        # Redis operations performed
        mock_redis.zadd.assert_called_once()
        mock_redis.zremrangebyscore.assert_called_once()
        mock_redis.expire.assert_called()

        # Returned value
        assert isinstance(result, AnomalyFrequency)
        assert result.count_1h == 5  # value returned by the zcount mock
|
|
```
|
|
|
|
---
|
|
|
|
## Step 6: 部署驗證 (30min)
|
|
|
|
### 6.1 本地測試
|
|
|
|
```bash
|
|
# Run the AnomalyCounter unit tests from the API package root
cd apps/api
pytest tests/test_anomaly_counter.py -v
|
|
```
|
|
|
|
### 6.2 整合測試
|
|
|
|
```bash
|
|
# Start a local Redis (host port 6380 -> container port 6379)
docker run -d --name test-redis -p 6380:6379 redis:7

# Manual test: the inline script records the same anomaly five times and
# prints the 24-hour count and escalation level after each call
python -c "
import asyncio
from src.services.anomaly_counter import AnomalyCounter
import redis.asyncio as redis

async def test():
    r = redis.Redis(host='localhost', port=6380)
    counter = AnomalyCounter(r)

    # 記錄 5 次異常
    for i in range(5):
        freq = await counter.record_anomaly({'alert_name': 'TestAlert', 'service': 'test'})
        print(f'Count: {freq.count_24h}, Level: {freq.escalation_level}')

asyncio.run(test())
"
|
|
```
|
|
|
|
### 6.3 預期輸出
|
|
|
|
```
|
|
Count: 1, Level: None
Count: 2, Level: None
Count: 3, Level: REPEAT
Count: 4, Level: REPEAT
Count: 5, Level: ESCALATE
|
|
```
|
|
|
|
---
|
|
|
|
## 交付物清單
|
|
|
|
| 檔案 | 狀態 | 說明 |
|------|------|------|
| `apps/api/src/services/anomaly_counter.py` | 🆕 新建 | 核心服務 |
| `apps/api/src/api/v1/alertmanager_webhook.py` | 📝 修改 | 整合頻率追蹤 |
| `apps/api/src/api/v1/sentry_webhook.py` | 📝 修改 | 整合頻率追蹤 |
| `apps/api/src/services/telegram_gateway.py` | 📝 修改 | 顯示頻率資訊 |
| `apps/api/src/services/auto_repair_service.py` | 📝 修改 | Tier 決策 |
| `apps/api/tests/test_anomaly_counter.py` | 🆕 新建 | 單元測試 |
|
|
|
|
---
|
|
|
|
**預估總工時**: 4h

**前置依賴**: Redis (已有)

**後續工作**: Phase B 資料庫 Exporter
|