# Incident 模型頻率欄位實施步驟

> **優先級**: P0
> **預估工時**: 2h
> **目標**: Incident 支援頻率統計與聚合

---

## 現狀分析

| 模型 | hit_count | frequency | escalation |
|------|-----------|-----------|------------|
| Approval | ✅ 有 | ❌ 無 | ❌ 無 |
| Incident | ❌ 無 | ❌ 無 | ❌ 無 |

---

## Step 1: 更新 Incident 模型 (30min)

### 1.1 新增欄位

```python
# apps/api/src/models/incident.py
# Add the following fields to the Incident class

from datetime import datetime
from typing import Optional

from pydantic import BaseModel, Field


class IncidentFrequencyStats(BaseModel):
    """Occurrence-frequency statistics for one anomaly signature."""

    anomaly_key: str = Field(..., description="異常簽名 hash")
    count_1h: int = Field(default=0, description="1 小時內發生次數")
    count_24h: int = Field(default=0, description="24 小時內發生次數")
    count_7d: int = Field(default=0, description="7 天內發生次數")
    count_30d: int = Field(default=0, description="30 天內發生次數")
    # NOTE(review): datetime.now yields naive local time — confirm this matches
    # how the rest of the project stores timestamps.
    first_seen: datetime = Field(default_factory=datetime.now)
    last_seen: datetime = Field(default_factory=datetime.now)
    escalation_level: Optional[str] = Field(
        default=None,
        description="升級等級: REPEAT, ESCALATE, PERMANENT_FIX"
    )


class IncidentRepairStats(BaseModel):
    """Repair-attempt statistics for an Incident."""

    total_attempts: int = Field(default=0, description="總修復嘗試次數")
    successful_attempts: int = Field(default=0, description="成功次數")
    last_repair_action: Optional[str] = Field(default=None, description="最近修復動作")
    last_repair_time: Optional[datetime] = Field(default=None)
    repair_history: list[dict] = Field(
        default_factory=list,
        description="修復歷史: [{action, success, timestamp}]"
    )
    recommended_tier: int = Field(
        default=1,
        description="建議修復 Tier: 1=重啟, 2=緩解, 3=根因, 4=架構"
    )


# Add to the Incident model
class Incident(BaseModel):
    # ... existing fields ...
# 🆕 頻率統計 frequency_stats: Optional[IncidentFrequencyStats] = Field( default=None, description="異常頻率統計" ) # 🆕 修復統計 repair_stats: Optional[IncidentRepairStats] = Field( default=None, description="修復嘗試統計" ) # 🆕 聚合控制 is_aggregated: bool = Field( default=False, description="是否為聚合告警 (同一問題多次觸發)" ) aggregated_count: int = Field( default=1, description="聚合次數 (窗口期內的觸發次數)" ) aggregation_window_start: Optional[datetime] = Field( default=None, description="聚合窗口開始時間" ) ``` ### 1.2 資料庫遷移 (如使用 SQLAlchemy) ```python # apps/api/src/db/migrations/add_incident_frequency.py """ 新增 Incident 頻率欄位 2026-03-29 ogt: 監控戰略規劃 """ from alembic import op import sqlalchemy as sa from sqlalchemy.dialects.postgresql import JSONB def upgrade(): # 新增 frequency_stats JSONB 欄位 op.add_column( 'incidents', sa.Column('frequency_stats', JSONB, nullable=True) ) # 新增 repair_stats JSONB 欄位 op.add_column( 'incidents', sa.Column('repair_stats', JSONB, nullable=True) ) # 新增聚合欄位 op.add_column( 'incidents', sa.Column('is_aggregated', sa.Boolean, default=False) ) op.add_column( 'incidents', sa.Column('aggregated_count', sa.Integer, default=1) ) op.add_column( 'incidents', sa.Column('aggregation_window_start', sa.DateTime, nullable=True) ) # 建立索引 (用於查詢重複事件) op.create_index( 'ix_incidents_frequency_anomaly_key', 'incidents', [sa.text("(frequency_stats->>'anomaly_key')")] ) def downgrade(): op.drop_index('ix_incidents_frequency_anomaly_key') op.drop_column('incidents', 'aggregation_window_start') op.drop_column('incidents', 'aggregated_count') op.drop_column('incidents', 'is_aggregated') op.drop_column('incidents', 'repair_stats') op.drop_column('incidents', 'frequency_stats') ``` --- ## Step 2: 更新 IncidentService (45min) ### 2.1 新增聚合邏輯 ```python # apps/api/src/services/incident_service.py # 新增或修改以下方法 from src.services.anomaly_counter import get_anomaly_counter, AnomalyFrequency from src.models.incident import IncidentFrequencyStats, IncidentRepairStats class IncidentService: # 聚合窗口 (10 分鐘內同一問題不建新 Incident) 
AGGREGATION_WINDOW_MINUTES = 10 async def create_or_aggregate_incident( self, alert_data: dict, analysis_result: dict | None = None, ) -> tuple[Incident, bool]: """ 建立或聚合 Incident Returns: tuple[Incident, bool]: (Incident, is_new) - is_new=True: 新建的 Incident - is_new=False: 聚合到現有 Incident """ # 1. 記錄到 AnomalyCounter anomaly_counter = get_anomaly_counter() anomaly_signature = self._extract_signature(alert_data) frequency = await anomaly_counter.record_anomaly(anomaly_signature) # 2. 檢查是否有可聚合的現有 Incident existing = await self._find_aggregatable_incident( anomaly_key=frequency.anomaly_key, window_minutes=self.AGGREGATION_WINDOW_MINUTES, ) if existing: # 聚合到現有 Incident return await self._aggregate_to_existing(existing, frequency), False else: # 建立新 Incident return await self._create_new_incident( alert_data=alert_data, frequency=frequency, analysis_result=analysis_result, ), True async def _find_aggregatable_incident( self, anomaly_key: str, window_minutes: int, ) -> Incident | None: """ 查找可聚合的現有 Incident 條件: 1. 相同 anomaly_key 2. 在聚合窗口內 3. 狀態為 OPEN 或 ANALYZING """ cutoff = datetime.now() - timedelta(minutes=window_minutes) # Redis 快速查詢 cache_key = f"incident:aggregation:{anomaly_key}" cached_id = await self.redis.get(cache_key) if cached_id: incident = await self.get_by_id(cached_id.decode()) if incident and incident.status in ['OPEN', 'ANALYZING']: return incident # 資料庫查詢 (fallback) # ... 實作資料庫查詢 ... 
return None async def _aggregate_to_existing( self, incident: Incident, frequency: AnomalyFrequency, ) -> Incident: """ 聚合到現有 Incident """ # 更新聚合計數 incident.aggregated_count += 1 incident.is_aggregated = True # 更新頻率統計 incident.frequency_stats = IncidentFrequencyStats( anomaly_key=frequency.anomaly_key, count_1h=frequency.count_1h, count_24h=frequency.count_24h, count_7d=frequency.count_7d, count_30d=frequency.count_30d, first_seen=frequency.first_seen, last_seen=frequency.last_seen, escalation_level=frequency.escalation_level, ) # 更新修復建議 Tier if incident.repair_stats: incident.repair_stats.recommended_tier = await self._calculate_tier(frequency) # 儲存 await self.update(incident) logger.info( "incident_aggregated", incident_id=str(incident.id), aggregated_count=incident.aggregated_count, escalation_level=frequency.escalation_level, ) return incident async def _create_new_incident( self, alert_data: dict, frequency: AnomalyFrequency, analysis_result: dict | None, ) -> Incident: """ 建立新 Incident (含頻率統計) """ # 計算建議 Tier recommended_tier = await self._calculate_tier(frequency) incident = Incident( # ... 現有欄位 ... 
frequency_stats=IncidentFrequencyStats( anomaly_key=frequency.anomaly_key, count_1h=frequency.count_1h, count_24h=frequency.count_24h, count_7d=frequency.count_7d, count_30d=frequency.count_30d, first_seen=frequency.first_seen, last_seen=frequency.last_seen, escalation_level=frequency.escalation_level, ), repair_stats=IncidentRepairStats( recommended_tier=recommended_tier, ), is_aggregated=False, aggregated_count=1, aggregation_window_start=datetime.now(), ) # 儲存 await self.create(incident) # 設置聚合快取 (10 分鐘) cache_key = f"incident:aggregation:{frequency.anomaly_key}" await self.redis.setex(cache_key, 600, str(incident.id)) return incident async def _calculate_tier(self, frequency: AnomalyFrequency) -> int: """ 根據頻率計算建議修復 Tier """ # 取得修復歷史 counter = get_anomaly_counter() stats = await counter.get_all_repair_stats(frequency.anomaly_key) restart_count = stats.get('restart_pod', {}).get('total', 0) restart_count += stats.get('restart_container', {}).get('total', 0) if frequency.permanent_fix_applied: return 4 # 已有永久修復但仍出問題 if frequency.escalation_level == 'PERMANENT_FIX': return 3 # 24h ≥10 次 if frequency.escalation_level == 'ESCALATE': return 2 # 24h ≥5 次 if restart_count >= 2: return 2 # 已重啟 2 次 return 1 def _extract_signature(self, alert_data: dict) -> dict: """ 從告警資料提取異常簽名 """ labels = alert_data.get('labels', {}) return { 'alert_name': labels.get('alertname', ''), 'service': labels.get('job', labels.get('service', '')), 'namespace': labels.get('namespace', ''), 'error_type': labels.get('reason', labels.get('error_type', '')), } async def record_repair_result( self, incident_id: str, action: str, success: bool, ): """ 記錄修復結果到 Incident """ incident = await self.get_by_id(incident_id) if not incident: return # 更新 repair_stats if not incident.repair_stats: incident.repair_stats = IncidentRepairStats() incident.repair_stats.total_attempts += 1 if success: incident.repair_stats.successful_attempts += 1 incident.repair_stats.last_repair_action = action 
incident.repair_stats.last_repair_time = datetime.now() incident.repair_stats.repair_history.append({ 'action': action, 'success': success, 'timestamp': datetime.now().isoformat(), }) # 只保留最近 20 次 incident.repair_stats.repair_history = incident.repair_stats.repair_history[-20:] # 同步到 AnomalyCounter if incident.frequency_stats: counter = get_anomaly_counter() await counter.record_repair_attempt( anomaly_key=incident.frequency_stats.anomaly_key, action=action, success=success, ) await self.update(incident) ``` --- ## Step 3: 更新 alertmanager_webhook.py (30min) ### 3.1 使用新的聚合方法 ```python # apps/api/src/api/v1/alertmanager_webhook.py # 修改告警處理邏輯 @router.post("/alertmanager") async def handle_alertmanager( request: Request, background_tasks: BackgroundTasks, ): payload = await request.json() alerts = payload.get("alerts", []) for alert in alerts: if alert.get("status") == "firing": # 🆕 使用聚合方法 incident_service = get_incident_service() incident, is_new = await incident_service.create_or_aggregate_incident( alert_data=alert, ) if is_new: # 新 Incident: 觸發 AI 分析 + Telegram background_tasks.add_task( analyze_and_notify, incident=incident, alert_data=alert, ) else: # 聚合 Incident: 只更新統計,不重複通知 # (除非達到升級閾值) if incident.frequency_stats.escalation_level in ['ESCALATE', 'PERMANENT_FIX']: background_tasks.add_task( send_escalation_notification, incident=incident, ) return {"status": "ok", "processed": len(alerts)} ``` --- ## Step 4: 前端顯示頻率資訊 (15min) ### 4.1 Incident 卡片新增頻率區塊 ```typescript // apps/web/src/components/incidents/IncidentCard.tsx // 新增頻率統計顯示 interface FrequencyStatsProps { stats: { count_1h: number; count_24h: number; count_7d: number; count_30d: number; escalation_level: string | null; }; } function FrequencyStats({ stats }: FrequencyStatsProps) { const escalationColors = { REPEAT: 'text-yellow-500', ESCALATE: 'text-orange-500', PERMANENT_FIX: 'text-red-500', }; return (

📊 頻率統計 {stats.escalation_level && ( ⚠️ {stats.escalation_level} )}

1h: {stats.count_1h}
24h: {stats.count_24h}
7d: {stats.count_7d}
30d: {stats.count_30d}
  );
}
```

---

## 交付物清單

| 檔案 | 狀態 | 說明 |
|------|------|------|
| `apps/api/src/models/incident.py` | 📝 修改 | 新增頻率欄位 |
| `apps/api/src/db/migrations/add_incident_frequency.py` | 🆕 | DB 遷移 |
| `apps/api/src/services/incident_service.py` | 📝 修改 | 聚合邏輯 |
| `apps/api/src/api/v1/alertmanager_webhook.py` | 📝 修改 | 使用聚合 |
| `apps/web/src/components/incidents/IncidentCard.tsx` | 📝 修改 | 頻率顯示 |

---

**預估總工時**: 2h
**前置依賴**: Phase A (AnomalyCounter)