Phase A: AnomalyCounter 服務 (4h) - Redis Sorted Set 滑動窗口計數 - 頻率閾值告警 (REPEAT/ESCALATE/PERMANENT_FIX) - Tier 決策邏輯整合 Phase B: Database Exporters (3h) - pg_exporter: 連接池/慢查詢/鎖等待/膨脹監控 - redis_exporter: 記憶體/命中率/驅逐監控 - 15+ 告警規則 Phase C: Incident 頻率欄位 (2h) - IncidentFrequencyStats 模型 - 告警聚合邏輯 (10 分鐘窗口) - 前端頻率顯示 Phase D: Sentry Comment 回寫 (1h) - 完成 TODO 實作 - Sentry API Token 配置 Phase E: SignOz 告警規則 (2h) - Error Rate / Latency 告警 - Trace 異常檢測 - SignOz Webhook Handler Phase F: Alert Chain E2E (2h) - Smoke Test 腳本 - CD Pipeline 整合 - 鏈路監控告警 Phase G: Learning Service (3h) - 修復效果學習 - 成功率計算 - Playbook 自動更新 總工時: 17h (2-3 天) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
512 lines
15 KiB
Markdown
512 lines
15 KiB
Markdown
# Incident 模型頻率欄位實施步驟
|
||
|
||
> **優先級**: P0
> **預估工時**: 2h
> **目標**: Incident 支援頻率統計與聚合
|
||
|
||
---
|
||
|
||
## 現狀分析
|
||
|
||
| 模型 | hit_count | frequency | escalation |
|------|-----------|-----------|------------|
| Approval | ✅ 有 | ❌ 無 | ❌ 無 |
| Incident | ❌ 無 | ❌ 無 | ❌ 無 |
|
||
|
||
---
|
||
|
||
## Step 1: 更新 Incident 模型 (30min)
|
||
|
||
### 1.1 新增欄位
|
||
|
||
```python
|
||
# apps/api/src/models/incident.py
|
||
# 在 Incident 類別中新增以下欄位
|
||
|
||
from datetime import datetime
|
||
from typing import Optional
|
||
from pydantic import BaseModel, Field
|
||
|
||
|
||
class IncidentFrequencyStats(BaseModel):
    """Occurrence-frequency statistics for one anomaly signature.

    Carries the anomaly key, one occurrence counter per look-back window
    (1 hour, 24 hours, 7 days, 30 days), the first/last-seen timestamps and
    an optional escalation level.
    """

    # Required field: hash of the anomaly signature.
    anomaly_key: str = Field(..., description="異常簽名 hash")

    # Per-window occurrence counters; each defaults to zero.
    count_1h: int = Field(0, description="1 小時內發生次數")
    count_24h: int = Field(0, description="24 小時內發生次數")
    count_7d: int = Field(0, description="7 天內發生次數")
    count_30d: int = Field(0, description="30 天內發生次數")

    # Both timestamps default to datetime.now() evaluated when the model is
    # instantiated (a naive datetime, since no tz argument is passed).
    first_seen: datetime = Field(default_factory=datetime.now)
    last_seen: datetime = Field(default_factory=datetime.now)

    # Optional; defaults to None. The description lists the expected values.
    escalation_level: Optional[str] = Field(
        None,
        description="升級等級: REPEAT, ESCALATE, PERMANENT_FIX",
    )
|
||
|
||
|
||
class IncidentRepairStats(BaseModel):
    """Statistics about repair attempts made for an incident.

    Tracks attempt/success counters, the most recent repair action and time,
    a history list of attempts, and the recommended repair tier.
    """

    # Attempt counters; both default to zero.
    total_attempts: int = Field(0, description="總修復嘗試次數")
    successful_attempts: int = Field(0, description="成功次數")

    # Most recent repair action and when it happened; None until recorded.
    last_repair_action: Optional[str] = Field(None, description="最近修復動作")
    last_repair_time: Optional[datetime] = Field(None)

    # Each entry is a dict shaped like {action, success, timestamp}
    # (per the description); a fresh list is created per instance.
    repair_history: list[dict] = Field(
        default_factory=list,
        description="修復歷史: [{action, success, timestamp}]",
    )

    # Tier scale per the description:
    # 1 = restart, 2 = mitigate, 3 = root cause, 4 = architecture.
    recommended_tier: int = Field(
        1,
        description="建議修復 Tier: 1=重啟, 2=緩解, 3=根因, 4=架構",
    )
|
||
|
||
|
||
# 在 Incident 模型中新增
|
||
class Incident(BaseModel):
    # ... existing fields ...

    # New: frequency statistics for the anomaly behind this incident.
    frequency_stats: Optional[IncidentFrequencyStats] = Field(
        None,
        description="異常頻率統計",
    )

    # New: repair-attempt statistics.
    repair_stats: Optional[IncidentRepairStats] = Field(
        None,
        description="修復嘗試統計",
    )

    # New: aggregation control.
    # True once the same problem has fired more than once into this incident.
    is_aggregated: bool = Field(
        False,
        description="是否為聚合告警 (同一問題多次觸發)",
    )
    # Number of triggers inside the aggregation window; starts at 1.
    aggregated_count: int = Field(
        1,
        description="聚合次數 (窗口期內的觸發次數)",
    )
    # Start time of the aggregation window; None when not set.
    aggregation_window_start: Optional[datetime] = Field(
        None,
        description="聚合窗口開始時間",
    )
|
||
```
|
||
|
||
### 1.2 資料庫遷移 (如使用 SQLAlchemy)
|
||
|
||
```python
|
||
# apps/api/src/db/migrations/add_incident_frequency.py
|
||
"""
|
||
新增 Incident 頻率欄位
|
||
2026-03-29 ogt: 監控戰略規劃
|
||
"""
|
||
|
||
from alembic import op
|
||
import sqlalchemy as sa
|
||
from sqlalchemy.dialects.postgresql import JSONB
|
||
|
||
|
||
def upgrade():
    """Add frequency, repair and aggregation columns to ``incidents``.

    Adds two nullable JSONB columns (``frequency_stats``, ``repair_stats``),
    the aggregation columns (``is_aggregated``, ``aggregated_count``,
    ``aggregation_window_start``) and an expression index on the
    ``anomaly_key`` stored inside ``frequency_stats``.
    """
    # JSONB payload columns; nullable, so existing rows need no backfill.
    op.add_column(
        'incidents',
        sa.Column('frequency_stats', JSONB, nullable=True),
    )
    op.add_column(
        'incidents',
        sa.Column('repair_stats', JSONB, nullable=True),
    )

    # Aggregation columns.
    # A Python-side ``default=`` emits no DDL DEFAULT, so rows that already
    # exist would be left NULL. ``server_default`` makes the database fill
    # existing and future rows, which lets the columns be NOT NULL and match
    # the non-optional ``bool`` / ``int`` fields on the Incident model.
    op.add_column(
        'incidents',
        sa.Column(
            'is_aggregated',
            sa.Boolean,
            nullable=False,
            server_default=sa.false(),
        ),
    )
    op.add_column(
        'incidents',
        sa.Column(
            'aggregated_count',
            sa.Integer,
            nullable=False,
            server_default='1',
        ),
    )
    op.add_column(
        'incidents',
        sa.Column('aggregation_window_start', sa.DateTime, nullable=True),
    )

    # Expression index used when looking up repeated events by anomaly key.
    op.create_index(
        'ix_incidents_frequency_anomaly_key',
        'incidents',
        [sa.text("(frequency_stats->>'anomaly_key')")],
    )
|
||
|
||
|
||
def downgrade():
    """Revert the migration: drop the index, then the added columns.

    Columns are dropped in the reverse of the order ``upgrade`` added them.
    """
    op.drop_index('ix_incidents_frequency_anomaly_key')

    for column_name in (
        'aggregation_window_start',
        'aggregated_count',
        'is_aggregated',
        'repair_stats',
        'frequency_stats',
    ):
        op.drop_column('incidents', column_name)
|
||
```
|
||
|
||
---
|
||
|
||
## Step 2: 更新 IncidentService (45min)
|
||
|
||
### 2.1 新增聚合邏輯
|
||
|
||
```python
|
||
# apps/api/src/services/incident_service.py
|
||
# 新增或修改以下方法
|
||
|
||
from src.services.anomaly_counter import get_anomaly_counter, AnomalyFrequency
|
||
from src.models.incident import IncidentFrequencyStats, IncidentRepairStats
|
||
|
||
|
||
class IncidentService:
|
||
# 聚合窗口 (10 分鐘內同一問題不建新 Incident)
|
||
AGGREGATION_WINDOW_MINUTES = 10
|
||
|
||
async def create_or_aggregate_incident(
|
||
self,
|
||
alert_data: dict,
|
||
analysis_result: dict | None = None,
|
||
) -> tuple[Incident, bool]:
|
||
"""
|
||
建立或聚合 Incident
|
||
|
||
Returns:
|
||
tuple[Incident, bool]: (Incident, is_new)
|
||
- is_new=True: 新建的 Incident
|
||
- is_new=False: 聚合到現有 Incident
|
||
"""
|
||
# 1. 記錄到 AnomalyCounter
|
||
anomaly_counter = get_anomaly_counter()
|
||
anomaly_signature = self._extract_signature(alert_data)
|
||
frequency = await anomaly_counter.record_anomaly(anomaly_signature)
|
||
|
||
# 2. 檢查是否有可聚合的現有 Incident
|
||
existing = await self._find_aggregatable_incident(
|
||
anomaly_key=frequency.anomaly_key,
|
||
window_minutes=self.AGGREGATION_WINDOW_MINUTES,
|
||
)
|
||
|
||
if existing:
|
||
# 聚合到現有 Incident
|
||
return await self._aggregate_to_existing(existing, frequency), False
|
||
else:
|
||
# 建立新 Incident
|
||
return await self._create_new_incident(
|
||
alert_data=alert_data,
|
||
frequency=frequency,
|
||
analysis_result=analysis_result,
|
||
), True
|
||
|
||
async def _find_aggregatable_incident(
|
||
self,
|
||
anomaly_key: str,
|
||
window_minutes: int,
|
||
) -> Incident | None:
|
||
"""
|
||
查找可聚合的現有 Incident
|
||
|
||
條件:
|
||
1. 相同 anomaly_key
|
||
2. 在聚合窗口內
|
||
3. 狀態為 OPEN 或 ANALYZING
|
||
"""
|
||
cutoff = datetime.now() - timedelta(minutes=window_minutes)
|
||
|
||
# Redis 快速查詢
|
||
cache_key = f"incident:aggregation:{anomaly_key}"
|
||
cached_id = await self.redis.get(cache_key)
|
||
|
||
if cached_id:
|
||
incident = await self.get_by_id(cached_id.decode())
|
||
if incident and incident.status in ['OPEN', 'ANALYZING']:
|
||
return incident
|
||
|
||
# 資料庫查詢 (fallback)
|
||
# ... 實作資料庫查詢 ...
|
||
|
||
return None
|
||
|
||
async def _aggregate_to_existing(
|
||
self,
|
||
incident: Incident,
|
||
frequency: AnomalyFrequency,
|
||
) -> Incident:
|
||
"""
|
||
聚合到現有 Incident
|
||
"""
|
||
# 更新聚合計數
|
||
incident.aggregated_count += 1
|
||
incident.is_aggregated = True
|
||
|
||
# 更新頻率統計
|
||
incident.frequency_stats = IncidentFrequencyStats(
|
||
anomaly_key=frequency.anomaly_key,
|
||
count_1h=frequency.count_1h,
|
||
count_24h=frequency.count_24h,
|
||
count_7d=frequency.count_7d,
|
||
count_30d=frequency.count_30d,
|
||
first_seen=frequency.first_seen,
|
||
last_seen=frequency.last_seen,
|
||
escalation_level=frequency.escalation_level,
|
||
)
|
||
|
||
# 更新修復建議 Tier
|
||
if incident.repair_stats:
|
||
incident.repair_stats.recommended_tier = await self._calculate_tier(frequency)
|
||
|
||
# 儲存
|
||
await self.update(incident)
|
||
|
||
logger.info(
|
||
"incident_aggregated",
|
||
incident_id=str(incident.id),
|
||
aggregated_count=incident.aggregated_count,
|
||
escalation_level=frequency.escalation_level,
|
||
)
|
||
|
||
return incident
|
||
|
||
async def _create_new_incident(
|
||
self,
|
||
alert_data: dict,
|
||
frequency: AnomalyFrequency,
|
||
analysis_result: dict | None,
|
||
) -> Incident:
|
||
"""
|
||
建立新 Incident (含頻率統計)
|
||
"""
|
||
# 計算建議 Tier
|
||
recommended_tier = await self._calculate_tier(frequency)
|
||
|
||
incident = Incident(
|
||
# ... 現有欄位 ...
|
||
frequency_stats=IncidentFrequencyStats(
|
||
anomaly_key=frequency.anomaly_key,
|
||
count_1h=frequency.count_1h,
|
||
count_24h=frequency.count_24h,
|
||
count_7d=frequency.count_7d,
|
||
count_30d=frequency.count_30d,
|
||
first_seen=frequency.first_seen,
|
||
last_seen=frequency.last_seen,
|
||
escalation_level=frequency.escalation_level,
|
||
),
|
||
repair_stats=IncidentRepairStats(
|
||
recommended_tier=recommended_tier,
|
||
),
|
||
is_aggregated=False,
|
||
aggregated_count=1,
|
||
aggregation_window_start=datetime.now(),
|
||
)
|
||
|
||
# 儲存
|
||
await self.create(incident)
|
||
|
||
# 設置聚合快取 (10 分鐘)
|
||
cache_key = f"incident:aggregation:{frequency.anomaly_key}"
|
||
await self.redis.setex(cache_key, 600, str(incident.id))
|
||
|
||
return incident
|
||
|
||
async def _calculate_tier(self, frequency: AnomalyFrequency) -> int:
|
||
"""
|
||
根據頻率計算建議修復 Tier
|
||
"""
|
||
# 取得修復歷史
|
||
counter = get_anomaly_counter()
|
||
stats = await counter.get_all_repair_stats(frequency.anomaly_key)
|
||
|
||
restart_count = stats.get('restart_pod', {}).get('total', 0)
|
||
restart_count += stats.get('restart_container', {}).get('total', 0)
|
||
|
||
if frequency.permanent_fix_applied:
|
||
return 4 # 已有永久修復但仍出問題
|
||
if frequency.escalation_level == 'PERMANENT_FIX':
|
||
return 3 # 24h ≥10 次
|
||
if frequency.escalation_level == 'ESCALATE':
|
||
return 2 # 24h ≥5 次
|
||
if restart_count >= 2:
|
||
return 2 # 已重啟 2 次
|
||
return 1
|
||
|
||
def _extract_signature(self, alert_data: dict) -> dict:
|
||
"""
|
||
從告警資料提取異常簽名
|
||
"""
|
||
labels = alert_data.get('labels', {})
|
||
return {
|
||
'alert_name': labels.get('alertname', ''),
|
||
'service': labels.get('job', labels.get('service', '')),
|
||
'namespace': labels.get('namespace', ''),
|
||
'error_type': labels.get('reason', labels.get('error_type', '')),
|
||
}
|
||
|
||
async def record_repair_result(
|
||
self,
|
||
incident_id: str,
|
||
action: str,
|
||
success: bool,
|
||
):
|
||
"""
|
||
記錄修復結果到 Incident
|
||
"""
|
||
incident = await self.get_by_id(incident_id)
|
||
if not incident:
|
||
return
|
||
|
||
# 更新 repair_stats
|
||
if not incident.repair_stats:
|
||
incident.repair_stats = IncidentRepairStats()
|
||
|
||
incident.repair_stats.total_attempts += 1
|
||
if success:
|
||
incident.repair_stats.successful_attempts += 1
|
||
|
||
incident.repair_stats.last_repair_action = action
|
||
incident.repair_stats.last_repair_time = datetime.now()
|
||
incident.repair_stats.repair_history.append({
|
||
'action': action,
|
||
'success': success,
|
||
'timestamp': datetime.now().isoformat(),
|
||
})
|
||
|
||
# 只保留最近 20 次
|
||
incident.repair_stats.repair_history = incident.repair_stats.repair_history[-20:]
|
||
|
||
# 同步到 AnomalyCounter
|
||
if incident.frequency_stats:
|
||
counter = get_anomaly_counter()
|
||
await counter.record_repair_attempt(
|
||
anomaly_key=incident.frequency_stats.anomaly_key,
|
||
action=action,
|
||
success=success,
|
||
)
|
||
|
||
await self.update(incident)
|
||
```
|
||
|
||
---
|
||
|
||
## Step 3: 更新 alertmanager_webhook.py (30min)
|
||
|
||
### 3.1 使用新的聚合方法
|
||
|
||
```python
|
||
# apps/api/src/api/v1/alertmanager_webhook.py
|
||
# 修改告警處理邏輯
|
||
|
||
@router.post("/alertmanager")
async def handle_alertmanager(
    request: Request,
    background_tasks: BackgroundTasks,
):
    """Handle an Alertmanager webhook call.

    Each alert whose status is ``firing`` is passed to
    ``create_or_aggregate_incident``. A new incident schedules
    ``analyze_and_notify``. An aggregated incident schedules
    ``send_escalation_notification`` only when its escalation level is
    ESCALATE or PERMANENT_FIX.

    Returns:
        ``{"status": "ok", "processed": <number of alerts in the payload>}``.
    """
    payload = await request.json()
    alerts = payload.get("alerts", [])

    # Resolved once; the service does not change between alerts.
    incident_service = get_incident_service()

    for alert in alerts:
        if alert.get("status") != "firing":
            continue

        # Use the aggregation-aware entry point.
        incident, is_new = await incident_service.create_or_aggregate_incident(
            alert_data=alert,
        )

        if is_new:
            # New incident: trigger analysis + notification.
            background_tasks.add_task(
                analyze_and_notify,
                incident=incident,
                alert_data=alert,
            )
            continue

        # Aggregated incident: statistics only, no repeated notification,
        # unless an escalation threshold has been reached.
        # frequency_stats is Optional on the model, so guard against None.
        # NOTE(review): every aggregated alert at these levels schedules a
        # notification; confirm send_escalation_notification de-duplicates.
        stats = incident.frequency_stats
        if stats is not None and stats.escalation_level in ('ESCALATE', 'PERMANENT_FIX'):
            background_tasks.add_task(
                send_escalation_notification,
                incident=incident,
            )

    return {"status": "ok", "processed": len(alerts)}
|
||
```
|
||
|
||
---
|
||
|
||
## Step 4: 前端顯示頻率資訊 (15min)
|
||
|
||
### 4.1 Incident 卡片新增頻率區塊
|
||
|
||
```typescript
|
||
// apps/web/src/components/incidents/IncidentCard.tsx
|
||
// 新增頻率統計顯示
|
||
|
||
/** Props for FrequencyStats: per-window counts plus an optional escalation level. */
interface FrequencyStatsProps {
  stats: {
    count_1h: number;
    count_24h: number;
    count_7d: number;
    count_30d: number;
    escalation_level: string | null;
  };
}

// Text colour class per escalation level. Typed as a string-keyed record
// because `escalation_level` is a plain string, so indexing the bare object
// literal would not type-check under strict settings.
const ESCALATION_COLORS: Record<string, string> = {
  REPEAT: 'text-yellow-500',
  ESCALATE: 'text-orange-500',
  PERMANENT_FIX: 'text-red-500',
};

/**
 * Frequency block for the incident card: a heading with an optional
 * escalation badge, followed by the 1h / 24h / 7d / 30d counts.
 */
function FrequencyStats({ stats }: FrequencyStatsProps) {
  // Unknown levels fall back to no colour class rather than "undefined".
  const escalationColor = stats.escalation_level
    ? ESCALATION_COLORS[stats.escalation_level] || ''
    : '';

  // One entry per look-back window, in display order.
  const windows = [
    { label: '1h', count: stats.count_1h },
    { label: '24h', count: stats.count_24h },
    { label: '7d', count: stats.count_7d },
    { label: '30d', count: stats.count_30d },
  ];

  return (
    <div className="mt-4 p-3 bg-gray-50 rounded-lg">
      <h4 className="text-sm font-medium text-gray-700 mb-2">
        📊 頻率統計
        {stats.escalation_level && (
          <span className={`ml-2 ${escalationColor}`}>
            ⚠️ {stats.escalation_level}
          </span>
        )}
      </h4>
      <div className="grid grid-cols-4 gap-2 text-sm">
        {windows.map(({ label, count }) => (
          <div key={label}>
            <span className="text-gray-500">{label}:</span>
            <span className="ml-1 font-medium">{count}</span>
          </div>
        ))}
      </div>
    </div>
  );
}
|
||
```
|
||
|
||
---
|
||
|
||
## 交付物清單
|
||
|
||
| 檔案 | 狀態 | 說明 |
|------|------|------|
| `apps/api/src/models/incident.py` | 📝 修改 | 新增頻率欄位 |
| `apps/api/src/db/migrations/add_incident_frequency.py` | 🆕 | DB 遷移 |
| `apps/api/src/services/incident_service.py` | 📝 修改 | 聚合邏輯 |
| `apps/api/src/api/v1/alertmanager_webhook.py` | 📝 修改 | 使用聚合 |
| `apps/web/src/components/incidents/IncidentCard.tsx` | 📝 修改 | 頻率顯示 |
|
||
|
||
---
|
||
|
||
**預估總工時**: 2h
|
||
**前置依賴**: Phase A (AnomalyCounter)
|