Phase A: AnomalyCounter 服務 (4h) - Redis Sorted Set 滑動窗口計數 - 頻率閾值告警 (REPEAT/ESCALATE/PERMANENT_FIX) - Tier 決策邏輯整合 Phase B: Database Exporters (3h) - pg_exporter: 連接池/慢查詢/鎖等待/膨脹監控 - redis_exporter: 記憶體/命中率/驅逐監控 - 15+ 告警規則 Phase C: Incident 頻率欄位 (2h) - IncidentFrequencyStats 模型 - 告警聚合邏輯 (10 分鐘窗口) - 前端頻率顯示 Phase D: Sentry Comment 回寫 (1h) - 完成 TODO 實作 - Sentry API Token 配置 Phase E: SigNoz 告警規則 (2h) - Error Rate / Latency 告警 - Trace 異常檢測 - SigNoz Webhook Handler Phase F: Alert Chain E2E (2h) - Smoke Test 腳本 - CD Pipeline 整合 - 鏈路監控告警 Phase G: Learning Service (3h) - 修復效果學習 - 成功率計算 - Playbook 自動更新 總工時: 17h (2-3 天) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
15 KiB
15 KiB
Incident 模型頻率欄位實施步驟
優先級: P0 預估工時: 2h 目標: Incident 支援頻率統計與聚合
現狀分析
| 模型 | hit_count | frequency | escalation |
|---|---|---|---|
| Approval | ✅ 有 | ❌ 無 | ❌ 無 |
| Incident | ❌ 無 | ❌ 無 | ❌ 無 |
Step 1: 更新 Incident 模型 (30min)
1.1 新增欄位
# apps/api/src/models/incident.py
# 在 Incident 類別中新增以下欄位
from datetime import datetime
from typing import Optional
from pydantic import BaseModel, Field
class IncidentFrequencyStats(BaseModel):
    """Occurrence counters and escalation state for one anomaly signature.

    The four counters hold how often the anomaly occurred within 1 hour,
    24 hours, 7 days and 30 days respectively.
    """

    # Hash identifying the anomaly signature; the only required field.
    anomaly_key: str = Field(..., description="異常簽名 hash")

    # Occurrence counts per look-back period; each starts at zero.
    count_1h: int = Field(0, description="1 小時內發生次數")
    count_24h: int = Field(0, description="24 小時內發生次數")
    count_7d: int = Field(0, description="7 天內發生次數")
    count_30d: int = Field(0, description="30 天內發生次數")

    # Both timestamps default to the naive local time at which the model
    # instance is created (each factory call is evaluated separately).
    first_seen: datetime = Field(default_factory=datetime.now)
    last_seen: datetime = Field(default_factory=datetime.now)

    # Escalation label, or None when no level has been assigned.
    escalation_level: Optional[str] = Field(
        None,
        description="升級等級: REPEAT, ESCALATE, PERMANENT_FIX",
    )
class IncidentRepairStats(BaseModel):
    """Bookkeeping for the repair attempts made on an incident."""

    # Attempt counters; both start at zero.
    total_attempts: int = Field(0, description="總修復嘗試次數")
    successful_attempts: int = Field(0, description="成功次數")

    # Most recent repair action and when it ran; None until one is recorded.
    last_repair_action: Optional[str] = Field(None, description="最近修復動作")
    last_repair_time: Optional[datetime] = Field(None)

    # One dict per attempt; a fresh list is created for every instance.
    repair_history: list[dict] = Field(
        default_factory=list,
        description="修復歷史: [{action, success, timestamp}]",
    )

    # Suggested repair tier, 1 through 4 (see the description string).
    recommended_tier: int = Field(
        1,
        description="建議修復 Tier: 1=重啟, 2=緩解, 3=根因, 4=架構",
    )
# 在 Incident 模型中新增
class Incident(BaseModel):
    # ... existing fields ...

    # NEW: frequency statistics for the anomaly behind this incident.
    frequency_stats: Optional[IncidentFrequencyStats] = Field(
        None,
        description="異常頻率統計",
    )

    # NEW: repair-attempt statistics.
    repair_stats: Optional[IncidentRepairStats] = Field(
        None,
        description="修復嘗試統計",
    )

    # NEW: aggregation control.
    # Flag marking an incident that absorbed repeated alerts.
    is_aggregated: bool = Field(
        False,
        description="是否為聚合告警 (同一問題多次觸發)",
    )
    # Number of alerts folded into this incident; starts at 1.
    aggregated_count: int = Field(
        1,
        description="聚合次數 (窗口期內的觸發次數)",
    )
    # Start of the aggregation window; None when not set.
    aggregation_window_start: Optional[datetime] = Field(
        None,
        description="聚合窗口開始時間",
    )
1.2 資料庫遷移 (如使用 SQLAlchemy + Alembic)
# apps/api/src/db/migrations/add_incident_frequency.py
"""
新增 Incident 頻率欄位
2026-03-29 ogt: 監控戰略規劃
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import JSONB
def upgrade():
    """Add the frequency, repair and aggregation columns to ``incidents``.

    Also creates an expression index on the ``anomaly_key`` entry of the
    ``frequency_stats`` JSONB document, used to look up repeated events.
    """
    # JSONB payload columns; NULL for rows that carry no statistics.
    op.add_column(
        'incidents',
        sa.Column('frequency_stats', JSONB, nullable=True),
    )
    op.add_column(
        'incidents',
        sa.Column('repair_stats', JSONB, nullable=True),
    )

    # Aggregation columns. ``Column(default=...)`` is a Python-side insert
    # default and is not part of the emitted ALTER TABLE, so rows that already
    # exist would be left NULL. A server default backfills them and allows the
    # columns to be NOT NULL, matching the non-optional model fields.
    op.add_column(
        'incidents',
        sa.Column(
            'is_aggregated',
            sa.Boolean,
            nullable=False,
            server_default=sa.false(),
        ),
    )
    op.add_column(
        'incidents',
        sa.Column(
            'aggregated_count',
            sa.Integer,
            nullable=False,
            server_default=sa.text('1'),
        ),
    )
    op.add_column(
        'incidents',
        sa.Column('aggregation_window_start', sa.DateTime, nullable=True),
    )

    # Expression index for finding incidents that share an anomaly key.
    op.create_index(
        'ix_incidents_frequency_anomaly_key',
        'incidents',
        [sa.text("(frequency_stats->>'anomaly_key')")],
    )
def downgrade():
    """Drop the index and columns added by ``upgrade``, in reverse order."""
    # Name the owning table explicitly: some backends require it, and it
    # mirrors the create_index call in upgrade().
    op.drop_index(
        'ix_incidents_frequency_anomaly_key',
        table_name='incidents',
    )

    for column_name in (
        'aggregation_window_start',
        'aggregated_count',
        'is_aggregated',
        'repair_stats',
        'frequency_stats',
    ):
        op.drop_column('incidents', column_name)
Step 2: 更新 IncidentService (45min)
2.1 新增聚合邏輯
# apps/api/src/services/incident_service.py
# 新增或修改以下方法
from src.services.anomaly_counter import get_anomaly_counter, AnomalyFrequency
from src.models.incident import IncidentFrequencyStats, IncidentRepairStats
class IncidentService:
    """Incident creation with alert aggregation and repair bookkeeping.

    Only the methods added or changed for frequency support are shown here;
    ``self.redis``, ``get_by_id``, ``create`` and ``update`` are provided by
    the existing service.
    """

    # Aggregation window: alerts for the same problem arriving within this
    # many minutes are folded into the existing incident.
    AGGREGATION_WINDOW_MINUTES = 10

    async def create_or_aggregate_incident(
        self,
        alert_data: dict,
        analysis_result: dict | None = None,
    ) -> tuple[Incident, bool]:
        """Create a new incident or fold the alert into an existing one.

        Args:
            alert_data: Alert payload; its ``labels`` entry feeds the
                anomaly signature.
            analysis_result: Optional analysis result, forwarded only when a
                new incident is created.

        Returns:
            ``(incident, is_new)`` where ``is_new`` is True for a freshly
            created incident and False when the alert was aggregated.
        """
        # 1. Record the occurrence with the AnomalyCounter.
        anomaly_counter = get_anomaly_counter()
        anomaly_signature = self._extract_signature(alert_data)
        frequency = await anomaly_counter.record_anomaly(anomaly_signature)

        # 2. Look for an existing incident this alert can be folded into.
        existing = await self._find_aggregatable_incident(
            anomaly_key=frequency.anomaly_key,
            window_minutes=self.AGGREGATION_WINDOW_MINUTES,
        )
        if existing:
            return await self._aggregate_to_existing(existing, frequency), False

        created = await self._create_new_incident(
            alert_data=alert_data,
            frequency=frequency,
            analysis_result=analysis_result,
        )
        return created, True

    @staticmethod
    def _aggregation_cache_key(anomaly_key: str) -> str:
        """Return the Redis key that maps an anomaly key to its open incident."""
        return f"incident:aggregation:{anomaly_key}"

    @staticmethod
    def _decode_cached_id(raw: bytes | str) -> str:
        """Return a cached incident id as text.

        Redis clients hand back ``bytes`` by default but ``str`` when they
        are created with ``decode_responses=True``; calling ``.decode()``
        unconditionally would raise ``AttributeError`` in the latter case.
        """
        if isinstance(raw, (bytes, bytearray)):
            return raw.decode()
        return str(raw)

    @staticmethod
    def _build_frequency_stats(frequency: AnomalyFrequency) -> IncidentFrequencyStats:
        """Copy the counter snapshot into the model stored on the incident."""
        return IncidentFrequencyStats(
            anomaly_key=frequency.anomaly_key,
            count_1h=frequency.count_1h,
            count_24h=frequency.count_24h,
            count_7d=frequency.count_7d,
            count_30d=frequency.count_30d,
            first_seen=frequency.first_seen,
            last_seen=frequency.last_seen,
            escalation_level=frequency.escalation_level,
        )

    async def _find_aggregatable_incident(
        self,
        anomaly_key: str,
        window_minutes: int,
    ) -> Incident | None:
        """Find an existing incident the alert can be aggregated into.

        Conditions:
            1. Same ``anomaly_key``.
            2. Inside the aggregation window.
            3. Status is OPEN or ANALYZING.

        Returns:
            The matching incident, or None when there is none.
        """
        # Lower time bound intended for the database fallback below; the
        # Redis path relies on the expiry of the cache entry instead.
        cutoff = datetime.now() - timedelta(minutes=window_minutes)

        # Fast path: Redis lookup.
        cached_id = await self.redis.get(self._aggregation_cache_key(anomaly_key))
        if cached_id:
            incident = await self.get_by_id(self._decode_cached_id(cached_id))
            if incident and incident.status in ['OPEN', 'ANALYZING']:
                return incident

        # Database query (fallback)
        # ... implement the database query (filter on `cutoff`) ...
        return None

    async def _aggregate_to_existing(
        self,
        incident: Incident,
        frequency: AnomalyFrequency,
    ) -> Incident:
        """Fold one more occurrence into ``incident`` and persist it.

        Returns:
            The same incident object after the update.
        """
        # Bump the aggregation counter.
        incident.aggregated_count += 1
        incident.is_aggregated = True

        # Refresh the frequency statistics from the latest counter snapshot.
        incident.frequency_stats = self._build_frequency_stats(frequency)

        # Refresh the recommended tier; incidents without repair stats are
        # left untouched.
        if incident.repair_stats:
            incident.repair_stats.recommended_tier = await self._calculate_tier(frequency)

        # Persist.
        await self.update(incident)

        logger.info(
            "incident_aggregated",
            incident_id=str(incident.id),
            aggregated_count=incident.aggregated_count,
            escalation_level=frequency.escalation_level,
        )
        return incident

    async def _create_new_incident(
        self,
        alert_data: dict,
        frequency: AnomalyFrequency,
        analysis_result: dict | None,
    ) -> Incident:
        """Create and persist a new incident that carries frequency stats.

        The incident id is also cached in Redis so that follow-up alerts
        with the same anomaly key can be aggregated into it.
        """
        # Work out the recommended tier first.
        recommended_tier = await self._calculate_tier(frequency)

        incident = Incident(
            # ... existing fields ...
            frequency_stats=self._build_frequency_stats(frequency),
            repair_stats=IncidentRepairStats(
                recommended_tier=recommended_tier,
            ),
            is_aggregated=False,
            aggregated_count=1,
            aggregation_window_start=datetime.now(),
        )

        # Persist.
        await self.create(incident)

        # Cache the id for the aggregation window. The TTL is derived from
        # the class constant so the two cannot drift apart (10 min = 600 s).
        await self.redis.setex(
            self._aggregation_cache_key(frequency.anomaly_key),
            self.AGGREGATION_WINDOW_MINUTES * 60,
            str(incident.id),
        )
        return incident

    async def _calculate_tier(self, frequency: AnomalyFrequency) -> int:
        """Derive the recommended repair tier (1-4) from frequency data.

        Checks run from the most to the least severe condition; the first
        match wins and tier 1 is the fallback.
        """
        # Fetch the repair history for this anomaly.
        counter = get_anomaly_counter()
        stats = await counter.get_all_repair_stats(frequency.anomaly_key)
        restart_count = stats.get('restart_pod', {}).get('total', 0)
        restart_count += stats.get('restart_container', {}).get('total', 0)

        if frequency.permanent_fix_applied:
            return 4  # a permanent fix exists but the problem still occurs
        if frequency.escalation_level == 'PERMANENT_FIX':
            return 3  # per the original note: >=10 occurrences in 24h
        if frequency.escalation_level == 'ESCALATE':
            return 2  # per the original note: >=5 occurrences in 24h
        if restart_count >= 2:
            return 2  # already restarted twice
        return 1

    def _extract_signature(self, alert_data: dict) -> dict:
        """Extract the anomaly signature from an alert payload.

        Missing labels become empty strings. ``job`` takes precedence over
        ``service`` and ``reason`` over ``error_type``.
        """
        # `or {}` also covers an explicit `"labels": null` in the payload.
        labels = alert_data.get('labels') or {}
        return {
            'alert_name': labels.get('alertname', ''),
            'service': labels.get('job', labels.get('service', '')),
            'namespace': labels.get('namespace', ''),
            'error_type': labels.get('reason', labels.get('error_type', '')),
        }

    async def record_repair_result(
        self,
        incident_id: str,
        action: str,
        success: bool,
    ):
        """Record the outcome of a repair attempt on an incident.

        Does nothing when the incident cannot be found. The attempt is also
        mirrored to the AnomalyCounter when the incident has frequency stats.
        """
        incident = await self.get_by_id(incident_id)
        if not incident:
            return

        # Create the stats container on first use.
        if not incident.repair_stats:
            incident.repair_stats = IncidentRepairStats()
        repair_stats = incident.repair_stats

        # One timestamp for the whole attempt so `last_repair_time` and the
        # history entry agree.
        now = datetime.now()

        repair_stats.total_attempts += 1
        if success:
            repair_stats.successful_attempts += 1
        repair_stats.last_repair_action = action
        repair_stats.last_repair_time = now
        repair_stats.repair_history.append({
            'action': action,
            'success': success,
            'timestamp': now.isoformat(),
        })
        # Keep only the 20 most recent entries.
        repair_stats.repair_history = repair_stats.repair_history[-20:]

        # Mirror the attempt to the AnomalyCounter.
        if incident.frequency_stats:
            counter = get_anomaly_counter()
            await counter.record_repair_attempt(
                anomaly_key=incident.frequency_stats.anomaly_key,
                action=action,
                success=success,
            )

        await self.update(incident)
Step 3: 更新 alertmanager_webhook.py (30min)
3.1 使用新的聚合方法
# apps/api/src/api/v1/alertmanager_webhook.py
# 修改告警處理邏輯
@router.post("/alertmanager")
async def handle_alertmanager(
    request: Request,
    background_tasks: BackgroundTasks,
):
    """Handle an Alertmanager webhook call.

    Every alert whose status is ``firing`` is created or aggregated through
    the incident service. New incidents schedule analysis and notification;
    aggregated ones only schedule an escalation notice once their level is
    ESCALATE or PERMANENT_FIX.

    Returns:
        ``{"status": "ok", "processed": <number of alerts in the payload>}``.
    """
    payload = await request.json()
    # `or []` also covers an explicit `"alerts": null`.
    alerts = payload.get("alerts") or []

    for alert in alerts:
        if alert.get("status") != "firing":
            continue

        # NEW: go through the aggregation entry point.
        incident_service = get_incident_service()
        incident, is_new = await incident_service.create_or_aggregate_incident(
            alert_data=alert,
        )

        if is_new:
            # New incident: trigger AI analysis + Telegram.
            background_tasks.add_task(
                analyze_and_notify,
                incident=incident,
                alert_data=alert,
            )
            continue

        # Aggregated incident: statistics were updated already; do not
        # notify again unless an escalation level has been reached.
        # frequency_stats is Optional on the model, so guard against None.
        stats = incident.frequency_stats
        if stats is not None and stats.escalation_level in ['ESCALATE', 'PERMANENT_FIX']:
            background_tasks.add_task(
                send_escalation_notification,
                incident=incident,
            )

    return {"status": "ok", "processed": len(alerts)}
Step 4: 前端顯示頻率資訊 (15min)
4.1 Incident 卡片新增頻率區塊
// apps/web/src/components/incidents/IncidentCard.tsx
// 新增頻率統計顯示
/** Props for the frequency-statistics block shown on an incident card. */
interface FrequencyStatsProps {
  /** Occurrence counters per look-back period plus the escalation level. */
  stats: {
    count_1h: number;
    count_24h: number;
    count_7d: number;
    count_30d: number;
    /** Escalation label, or null when none is set. */
    escalation_level: string | null;
  };
}
/**
 * Renders the frequency statistics of an incident: an optional escalation
 * badge followed by the 1h / 24h / 7d / 30d occurrence counters.
 */
function FrequencyStats({ stats }: FrequencyStatsProps) {
  // Text colour per escalation level. Typed as a string-keyed record so the
  // lookup below type-checks with a plain `string` level.
  const escalationColors: Record<string, string> = {
    REPEAT: 'text-yellow-500',
    ESCALATE: 'text-orange-500',
    PERMANENT_FIX: 'text-red-500',
  };

  // Counters in display order, rendered as one grid cell each.
  const counters: Array<[string, number]> = [
    ['1h', stats.count_1h],
    ['24h', stats.count_24h],
    ['7d', stats.count_7d],
    ['30d', stats.count_30d],
  ];

  return (
    <div className="mt-4 p-3 bg-gray-50 rounded-lg">
      <h4 className="text-sm font-medium text-gray-700 mb-2">
        📊 頻率統計
        {stats.escalation_level && (
          // Fall back to a neutral colour for an unrecognised level instead
          // of emitting the literal class name "undefined".
          <span
            className={`ml-2 ${escalationColors[stats.escalation_level] ?? 'text-gray-500'}`}
          >
            ⚠️ {stats.escalation_level}
          </span>
        )}
      </h4>
      <div className="grid grid-cols-4 gap-2 text-sm">
        {counters.map(([label, count]) => (
          <div key={label}>
            <span className="text-gray-500">{label}:</span>
            <span className="ml-1 font-medium">{count}</span>
          </div>
        ))}
      </div>
    </div>
  );
}
交付物清單
| 檔案 | 狀態 | 說明 |
|---|---|---|
| `apps/api/src/models/incident.py` | 📝 修改 | 新增頻率欄位 |
| `apps/api/src/db/migrations/add_incident_frequency.py` | 🆕 | DB 遷移 |
| `apps/api/src/services/incident_service.py` | 📝 修改 | 聚合邏輯 |
| `apps/api/src/api/v1/alertmanager_webhook.py` | 📝 修改 | 使用聚合 |
| `apps/web/src/components/incidents/IncidentCard.tsx` | 📝 修改 | 頻率顯示 |
預估總工時: 2h 前置依賴: Phase A (AnomalyCounter)