diff --git a/docs/proposals/IMPLEMENTATION_STEPS_ANOMALY_COUNTER.md b/docs/proposals/IMPLEMENTATION_STEPS_ANOMALY_COUNTER.md new file mode 100644 index 00000000..6e965b56 --- /dev/null +++ b/docs/proposals/IMPLEMENTATION_STEPS_ANOMALY_COUNTER.md @@ -0,0 +1,699 @@ +# AnomalyCounter 服務實施步驟 + +> **優先級**: P0 +> **預估工時**: 4h +> **目標**: 建立異常頻率追蹤能力 + +--- + +## Step 1: 建立 anomaly_counter.py (1h) + +### 1.1 建立檔案 + +```bash +touch apps/api/src/services/anomaly_counter.py +``` + +### 1.2 實作 AnomalyCounter 類別 + +```python +# apps/api/src/services/anomaly_counter.py +""" +異常頻率統計服務 +================================ +2026-03-29 ogt: 監控戰略規劃 Section 9 實作 + +使用 Redis Sorted Set 實作滑動窗口計數: +- ZADD anomaly:timeline:{key} {timestamp} {timestamp} +- ZCOUNT anomaly:timeline:{key} {start} +inf +- ZREMRANGEBYSCORE anomaly:timeline:{key} -inf {cutoff} +""" + +import hashlib +import json +from datetime import datetime, timedelta +from typing import NamedTuple + +import redis.asyncio as redis +import structlog + +logger = structlog.get_logger(__name__) + + +class AnomalyFrequency(NamedTuple): + """異常頻率資料""" + anomaly_key: str + count_1h: int + count_24h: int + count_7d: int + count_30d: int + first_seen: datetime + last_seen: datetime + auto_repair_count: int + permanent_fix_applied: bool + escalation_level: str | None # None, REPEAT, ESCALATE, PERMANENT_FIX + + +class AnomalyCounter: + """ + 異常計數器 - 追蹤每種異常的發生頻率 + + 閾值配置 (可透過環境變數覆寫): + - ANOMALY_REPEAT_THRESHOLD: 3 (預設) + - ANOMALY_ESCALATE_THRESHOLD: 5 (預設) + - ANOMALY_PERMANENT_FIX_THRESHOLD: 10 (預設) + """ + + THRESHOLDS = { + 'REPEAT': 3, # 3 次 → 重複告警 + 'ESCALATE': 5, # 5 次 → 人工介入 + 'PERMANENT_FIX': 10, # 10 次 → 必須永久修復 + } + + # Redis Key 前綴 + PREFIX_TIMELINE = "anomaly:timeline:" + PREFIX_REPAIR_COUNT = "anomaly:repair_count:" + PREFIX_PERMANENT_FIX = "anomaly:permanent_fix:" + PREFIX_METADATA = "anomaly:metadata:" + + def __init__(self, redis_client: redis.Redis): + self.redis = redis_client + + @staticmethod + def 
_hash_signature(signature: dict) -> str: + """ + 生成異常簽名的 hash key + + 簽名欄位: + - alert_name: 告警名稱 (e.g., PodCrashLoopBackOff) + - service: 服務名稱 (e.g., awoooi-api) + - namespace: K8s 命名空間 (e.g., awoooi-prod) + - error_type: 錯誤類型 (e.g., OOMKilled) + """ + # 只取關鍵欄位,忽略時間戳等易變欄位 + key_fields = { + 'alert_name': signature.get('alert_name', signature.get('alertname', '')), + 'service': signature.get('service', signature.get('job', '')), + 'namespace': signature.get('namespace', ''), + 'error_type': signature.get('error_type', signature.get('reason', '')), + } + # 排序確保一致性 + canonical = json.dumps(key_fields, sort_keys=True) + return hashlib.sha256(canonical.encode()).hexdigest()[:16] + + async def record_anomaly(self, anomaly_signature: dict) -> AnomalyFrequency: + """ + 記錄一次異常發生 + + Args: + anomaly_signature: 異常簽名字典 + + Returns: + AnomalyFrequency: 當前頻率統計 + """ + anomaly_key = self._hash_signature(anomaly_signature) + now = datetime.now() + timestamp = now.timestamp() + timeline_key = f"{self.PREFIX_TIMELINE}{anomaly_key}" + + # 1. 添加到 Sorted Set (score = timestamp, member = timestamp string) + await self.redis.zadd(timeline_key, {str(timestamp): timestamp}) + + # 2. 清理過期數據 (30 天前) + cutoff_30d = (now - timedelta(days=30)).timestamp() + await self.redis.zremrangebyscore(timeline_key, '-inf', cutoff_30d) + + # 3. 設置 TTL (35 天,比清理週期長一點) + await self.redis.expire(timeline_key, 35 * 24 * 3600) + + # 4. 計算各時間窗口的計數 + count_1h = await self.redis.zcount( + timeline_key, + (now - timedelta(hours=1)).timestamp(), + '+inf' + ) + count_24h = await self.redis.zcount( + timeline_key, + (now - timedelta(hours=24)).timestamp(), + '+inf' + ) + count_7d = await self.redis.zcount( + timeline_key, + (now - timedelta(days=7)).timestamp(), + '+inf' + ) + count_30d = await self.redis.zcount( + timeline_key, + cutoff_30d, + '+inf' + ) + + # 5. 
取得首次/最近時間 + first_seen_data = await self.redis.zrange(timeline_key, 0, 0, withscores=True) + last_seen_data = await self.redis.zrange(timeline_key, -1, -1, withscores=True) + + first_seen = datetime.fromtimestamp(first_seen_data[0][1]) if first_seen_data else now + last_seen = datetime.fromtimestamp(last_seen_data[0][1]) if last_seen_data else now + + # 6. 讀取修復統計 + auto_repair_count = int(await self.redis.get(f"{self.PREFIX_REPAIR_COUNT}{anomaly_key}") or 0) + permanent_fix = await self.redis.get(f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}") == b'1' + + # 7. 儲存 metadata (首次記錄時) + metadata_key = f"{self.PREFIX_METADATA}{anomaly_key}" + if not await self.redis.exists(metadata_key): + await self.redis.hset(metadata_key, mapping={ + 'signature': json.dumps(anomaly_signature), + 'first_seen': now.isoformat(), + }) + await self.redis.expire(metadata_key, 35 * 24 * 3600) + + # 8. 判斷升級等級 + escalation_level = self._get_escalation_level(count_24h) + + freq = AnomalyFrequency( + anomaly_key=anomaly_key, + count_1h=count_1h, + count_24h=count_24h, + count_7d=count_7d, + count_30d=count_30d, + first_seen=first_seen, + last_seen=last_seen, + auto_repair_count=auto_repair_count, + permanent_fix_applied=permanent_fix, + escalation_level=escalation_level, + ) + + # 9. 
記錄日誌 + logger.info( + "anomaly_recorded", + anomaly_key=anomaly_key, + count_1h=count_1h, + count_24h=count_24h, + count_30d=count_30d, + escalation_level=escalation_level, + ) + + return freq + + def _get_escalation_level(self, count_24h: int) -> str | None: + """判斷升級等級""" + if count_24h >= self.THRESHOLDS['PERMANENT_FIX']: + return 'PERMANENT_FIX' + elif count_24h >= self.THRESHOLDS['ESCALATE']: + return 'ESCALATE' + elif count_24h >= self.THRESHOLDS['REPEAT']: + return 'REPEAT' + return None + + async def record_repair_attempt(self, anomaly_key: str, action: str, success: bool): + """ + 記錄修復嘗試 + + Args: + anomaly_key: 異常 key + action: 修復動作 (e.g., restart_pod, scale_up) + success: 是否成功 + """ + repair_key = f"{self.PREFIX_REPAIR_COUNT}{anomaly_key}" + + # 遞增修復嘗試次數 + await self.redis.incr(repair_key) + await self.redis.expire(repair_key, 35 * 24 * 3600) + + # 記錄修復歷史 (用於學習) + history_key = f"anomaly:repair_history:{anomaly_key}" + await self.redis.lpush(history_key, json.dumps({ + 'action': action, + 'success': success, + 'timestamp': datetime.now().isoformat(), + })) + await self.redis.ltrim(history_key, 0, 99) # 只保留最近 100 次 + await self.redis.expire(history_key, 35 * 24 * 3600) + + logger.info( + "repair_attempt_recorded", + anomaly_key=anomaly_key, + action=action, + success=success, + ) + + async def mark_permanent_fix_applied(self, anomaly_key: str, fix_description: str): + """ + 標記已套用永久修復 + + Args: + anomaly_key: 異常 key + fix_description: 修復說明 + """ + await self.redis.set(f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}", '1') + await self.redis.expire(f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}", 90 * 24 * 3600) # 90 天 + + # 記錄修復詳情 + metadata_key = f"{self.PREFIX_METADATA}{anomaly_key}" + await self.redis.hset(metadata_key, mapping={ + 'permanent_fix_applied': 'true', + 'permanent_fix_description': fix_description, + 'permanent_fix_time': datetime.now().isoformat(), + }) + + logger.info( + "permanent_fix_marked", + anomaly_key=anomaly_key, + 
fix_description=fix_description, + ) + + async def get_repair_success_rate(self, anomaly_key: str, action: str) -> dict: + """ + 取得特定動作的修復成功率 + + Returns: + { + 'action': 'restart_pod', + 'total': 10, + 'success': 3, + 'success_rate': 0.3, + } + """ + history_key = f"anomaly:repair_history:{anomaly_key}" + history = await self.redis.lrange(history_key, 0, -1) + + total = 0 + success = 0 + + for item in history: + data = json.loads(item) + if data['action'] == action: + total += 1 + if data['success']: + success += 1 + + return { + 'action': action, + 'total': total, + 'success': success, + 'success_rate': success / total if total > 0 else 0.0, + } + + async def get_all_repair_stats(self, anomaly_key: str) -> dict[str, dict]: + """ + 取得所有修復動作的統計 + + Returns: + { + 'restart_pod': {'total': 10, 'success': 3, 'success_rate': 0.3}, + 'scale_up': {'total': 2, 'success': 1, 'success_rate': 0.5}, + } + """ + history_key = f"anomaly:repair_history:{anomaly_key}" + history = await self.redis.lrange(history_key, 0, -1) + + stats: dict[str, dict] = {} + + for item in history: + data = json.loads(item) + action = data['action'] + + if action not in stats: + stats[action] = {'total': 0, 'success': 0} + + stats[action]['total'] += 1 + if data['success']: + stats[action]['success'] += 1 + + # 計算成功率 + for action, s in stats.items(): + s['success_rate'] = s['success'] / s['total'] if s['total'] > 0 else 0.0 + + return stats + + +# ============================================================================= +# Singleton 模式 +# ============================================================================= +_anomaly_counter: AnomalyCounter | None = None + + +def get_anomaly_counter() -> AnomalyCounter: + """取得 AnomalyCounter 實例""" + global _anomaly_counter + if _anomaly_counter is None: + from src.core.redis import get_redis_client + _anomaly_counter = AnomalyCounter(get_redis_client()) + return _anomaly_counter +``` + +--- + +## Step 2: 整合到 alertmanager_webhook.py (1h) + +### 2.1 
在收到告警時記錄頻率 + +```python +# apps/api/src/api/v1/alertmanager_webhook.py +# 在 handle_alertmanager 函數中新增 + +from src.services.anomaly_counter import get_anomaly_counter + +async def handle_alertmanager(request: Request, background_tasks: BackgroundTasks): + # ... 現有代碼 ... + + # 🆕 記錄異常頻率 + anomaly_counter = get_anomaly_counter() + for alert in alerts: + anomaly_signature = { + 'alert_name': alert.get('labels', {}).get('alertname'), + 'service': alert.get('labels', {}).get('job'), + 'namespace': alert.get('labels', {}).get('namespace'), + 'error_type': alert.get('labels', {}).get('reason'), + } + freq = await anomaly_counter.record_anomaly(anomaly_signature) + + # 將頻率資訊傳遞給後續處理 + alert['_anomaly_frequency'] = freq._asdict() + + # ... 繼續現有流程 ... +``` + +### 2.2 在 Telegram 告警中顯示頻率 + +```python +# apps/api/src/services/telegram_gateway.py +# 修改 send_approval_card 方法,新增頻率資訊 + +async def send_approval_card( + self, + approval_id: str, + risk_level: str, + resource_name: str, + root_cause: str, + suggested_action: str, + primary_responsibility: str, + confidence: float, + namespace: str, + anomaly_frequency: dict | None = None, # 🆕 新增參數 +): + # ... 現有代碼 ... + + # 🆕 頻率資訊區塊 + frequency_section = "" + if anomaly_frequency and anomaly_frequency.get('count_24h', 0) > 1: + freq = anomaly_frequency + escalation_emoji = { + None: "", + 'REPEAT': "⚠️", + 'ESCALATE': "🔴", + 'PERMANENT_FIX': "🚨", + }.get(freq.get('escalation_level'), "") + + frequency_section = f""" +📊 頻率統計 {escalation_emoji}: + • 1小時: {freq.get('count_1h', 0)} 次 + • 24小時: {freq.get('count_24h', 0)} 次 + • 7天: {freq.get('count_7d', 0)} 次 + • 30天: {freq.get('count_30d', 0)} 次 + • 修復嘗試: {freq.get('auto_repair_count', 0)} 次 +""" + if freq.get('escalation_level'): + frequency_section += f" 🔺 升級建議: {freq['escalation_level']}\n" + + # 插入到告警卡片中 + # ... 
+``` + +--- + +## Step 3: 整合到 sentry_webhook.py (30min) + +### 3.1 Sentry 告警也要記錄頻率 + +```python +# apps/api/src/api/v1/sentry_webhook.py +# 在 analyze_and_comment 函數中新增 + +from src.services.anomaly_counter import get_anomaly_counter + +async def analyze_and_comment( + error_context: dict, + issue_id: str, + project_slug: str +): + # 🆕 記錄異常頻率 + anomaly_counter = get_anomaly_counter() + anomaly_signature = { + 'alert_name': 'sentry_error', + 'service': error_context.get('project', 'unknown'), + 'error_type': error_context.get('title', 'unknown'), + 'culprit': error_context.get('culprit', 'unknown'), + } + freq = await anomaly_counter.record_anomaly(anomaly_signature) + + # 傳遞給 Telegram 告警 + await send_sentry_telegram_alert( + error_context=error_context, + analysis=analysis, + approval_id=approval_id, + anomaly_frequency=freq._asdict(), # 🆕 + ) +``` + +--- + +## Step 4: 整合到 auto_repair_service.py (1h) + +### 4.1 修復前檢查頻率,決定 Tier + +```python +# apps/api/src/services/auto_repair_service.py +# 新增 Tier 決策邏輯 + +from src.services.anomaly_counter import get_anomaly_counter, AnomalyFrequency + +class AutoRepairService: + async def determine_repair_tier( + self, + anomaly_key: str, + frequency: AnomalyFrequency, + ) -> int: + """ + 根據頻率決定修復 Tier + + Returns: + 1: 臨時修復 (重啟) + 2: 緩解修復 (擴容) + 3: 根因修復 (配置變更) + 4: 架構修復 (需開發) + """ + # 取得修復歷史 + counter = get_anomaly_counter() + stats = await counter.get_all_repair_stats(anomaly_key) + + # 計算重啟次數 + restart_count = stats.get('restart_pod', {}).get('total', 0) + restart_count += stats.get('restart_container', {}).get('total', 0) + + # Tier 決策邏輯 + if frequency.permanent_fix_applied: + return 4 # 已有永久修復但仍出問題 → 需架構級修復 + + if frequency.escalation_level == 'PERMANENT_FIX': + return 3 # 24h 內 ≥10 次 → 根因修復 + + if frequency.escalation_level == 'ESCALATE': + return 2 # 24h 內 ≥5 次 → 緩解修復 + + if restart_count >= 2: + return 2 # 已重啟 2 次 → 升級到緩解 + + return 1 # 預設臨時修復 + + async def get_tier_actions(self, tier: int) -> list[str]: + """ + 根據 Tier 
返回可用修復動作 + """ + TIER_ACTIONS = { + 1: ['restart_pod', 'restart_container'], + 2: ['scale_up', 'increase_memory', 'adjust_limits'], + 3: ['apply_hotfix', 'update_config', 'patch_deployment'], + 4: ['create_issue', 'notify_team', 'schedule_fix'], + } + return TIER_ACTIONS.get(tier, TIER_ACTIONS[1]) +``` + +### 4.2 修復後記錄結果 + +```python +# apps/api/src/services/auto_repair_service.py +# 在執行修復後 + +async def execute_repair(self, ...): + # ... 執行修復 ... + + # 🆕 記錄修復嘗試 + counter = get_anomaly_counter() + await counter.record_repair_attempt( + anomaly_key=anomaly_key, + action=repair_action, + success=result.success, + ) + + # 如果是 Tier 3 永久修復成功 + if tier == 3 and result.success: + await counter.mark_permanent_fix_applied( + anomaly_key=anomaly_key, + fix_description=f"Applied {repair_action}: {result.message}", + ) +``` + +--- + +## Step 5: 單元測試 (30min) + +### 5.1 建立測試檔案 + +```python +# apps/api/tests/test_anomaly_counter.py +""" +AnomalyCounter 單元測試 +""" + +import pytest +from datetime import datetime, timedelta +from unittest.mock import AsyncMock, MagicMock +from src.services.anomaly_counter import AnomalyCounter, AnomalyFrequency + + +@pytest.fixture +def mock_redis(): + """模擬 Redis 客戶端""" + redis = AsyncMock() + redis.zadd = AsyncMock() + redis.zremrangebyscore = AsyncMock() + redis.expire = AsyncMock() + redis.zcount = AsyncMock(return_value=5) + redis.zrange = AsyncMock(return_value=[(b'123', 1234567890.0)]) + redis.get = AsyncMock(return_value=None) + redis.exists = AsyncMock(return_value=False) + redis.hset = AsyncMock() + return redis + + +@pytest.fixture +def counter(mock_redis): + return AnomalyCounter(mock_redis) + + +class TestHashSignature: + def test_same_input_same_hash(self): + sig1 = {'alert_name': 'PodCrash', 'service': 'api'} + sig2 = {'alert_name': 'PodCrash', 'service': 'api'} + assert AnomalyCounter._hash_signature(sig1) == AnomalyCounter._hash_signature(sig2) + + def test_different_input_different_hash(self): + sig1 = {'alert_name': 'PodCrash', 
'service': 'api'} + sig2 = {'alert_name': 'PodCrash', 'service': 'web'} + assert AnomalyCounter._hash_signature(sig1) != AnomalyCounter._hash_signature(sig2) + + def test_ignores_extra_fields(self): + sig1 = {'alert_name': 'PodCrash', 'service': 'api'} + sig2 = {'alert_name': 'PodCrash', 'service': 'api', 'timestamp': '2026-01-01'} + assert AnomalyCounter._hash_signature(sig1) == AnomalyCounter._hash_signature(sig2) + + +class TestEscalationLevel: + def test_no_escalation(self, counter): + assert counter._get_escalation_level(2) is None + + def test_repeat_level(self, counter): + assert counter._get_escalation_level(3) == 'REPEAT' + assert counter._get_escalation_level(4) == 'REPEAT' + + def test_escalate_level(self, counter): + assert counter._get_escalation_level(5) == 'ESCALATE' + assert counter._get_escalation_level(9) == 'ESCALATE' + + def test_permanent_fix_level(self, counter): + assert counter._get_escalation_level(10) == 'PERMANENT_FIX' + assert counter._get_escalation_level(100) == 'PERMANENT_FIX' + + +class TestRecordAnomaly: + @pytest.mark.asyncio + async def test_records_to_redis(self, counter, mock_redis): + sig = {'alert_name': 'PodCrash', 'service': 'api'} + freq = await counter.record_anomaly(sig) + + # 驗證 Redis 操作 + mock_redis.zadd.assert_called_once() + mock_redis.zremrangebyscore.assert_called_once() + mock_redis.expire.assert_called() + + # 驗證返回值 + assert isinstance(freq, AnomalyFrequency) + assert freq.count_1h == 5 # mock 返回值 +``` + +--- + +## Step 6: 部署驗證 (30min) + +### 6.1 本地測試 + +```bash +cd apps/api +pytest tests/test_anomaly_counter.py -v +``` + +### 6.2 整合測試 + +```bash +# 啟動本地 Redis +docker run -d --name test-redis -p 6380:6379 redis:7 + +# 手動測試 +python -c " +import asyncio +from src.services.anomaly_counter import AnomalyCounter +import redis.asyncio as redis + +async def test(): + r = redis.Redis(host='localhost', port=6380) + counter = AnomalyCounter(r) + + # 記錄 5 次異常 + for i in range(5): + freq = await 
counter.record_anomaly({'alert_name': 'TestAlert', 'service': 'test'}) + print(f'Count: {freq.count_24h}, Level: {freq.escalation_level}') + +asyncio.run(test()) +" +``` + +### 6.3 預期輸出 + +``` +Count: 1, Level: None +Count: 2, Level: None +Count: 3, Level: REPEAT +Count: 4, Level: REPEAT +Count: 5, Level: ESCALATE +``` + +--- + +## 交付物清單 + +| 檔案 | 狀態 | 說明 | +|------|------|------| +| `apps/api/src/services/anomaly_counter.py` | 🆕 新建 | 核心服務 | +| `apps/api/src/api/v1/alertmanager_webhook.py` | 📝 修改 | 整合頻率追蹤 | +| `apps/api/src/api/v1/sentry_webhook.py` | 📝 修改 | 整合頻率追蹤 | +| `apps/api/src/services/telegram_gateway.py` | 📝 修改 | 顯示頻率資訊 | +| `apps/api/src/services/auto_repair_service.py` | 📝 修改 | Tier 決策 | +| `apps/api/tests/test_anomaly_counter.py` | 🆕 新建 | 單元測試 | + +--- + +**預估總工時**: 4h +**前置依賴**: Redis (已有) +**後續工作**: Phase B 資料庫 Exporter diff --git a/docs/proposals/IMPLEMENTATION_STEPS_DATABASE_EXPORTERS.md b/docs/proposals/IMPLEMENTATION_STEPS_DATABASE_EXPORTERS.md new file mode 100644 index 00000000..daebc509 --- /dev/null +++ b/docs/proposals/IMPLEMENTATION_STEPS_DATABASE_EXPORTERS.md @@ -0,0 +1,522 @@ +# 資料庫 Exporter 部署實施步驟 + +> **優先級**: P0 +> **預估工時**: 3h +> **目標**: PostgreSQL 與 Redis 完整監控覆蓋 + +--- + +## 現狀分析 + +| 服務 | 當前監控 | 缺失指標 | +|------|---------|---------| +| PostgreSQL | ❌ 零 | 連接數、慢查詢、鎖等待、複製延遲 | +| Redis | ❌ 零 | 記憶體使用、命中率、命令延遲、驅逐率 | + +--- + +## Phase B-1: PostgreSQL Exporter (1.5h) + +### Step 1: 建立 Docker Compose 配置 (15min) + +```yaml +# ops/monitoring/docker-compose.exporters.yaml +# 2026-03-29 ogt: 資料庫監控 Exporter +# 部署位置: 192.168.0.188 (pg 主機) + +version: '3.8' + +services: + # ========================================================================== + # PostgreSQL Exporter + # ========================================================================== + postgres-exporter: + image: prometheuscommunity/postgres-exporter:v0.15.0 + container_name: postgres-exporter + restart: unless-stopped + ports: + - "9187:9187" + environment: + DATA_SOURCE_NAME: 
"postgresql://postgres:${POSTGRES_PASSWORD}@postgres:5432/awoooi?sslmode=disable" + PG_EXPORTER_EXTEND_QUERY_PATH: "/etc/postgres_exporter/queries.yaml" + volumes: + - ./postgres-exporter-queries.yaml:/etc/postgres_exporter/queries.yaml:ro + networks: + - monitoring + depends_on: + - postgres + labels: + - "prometheus.scrape=true" + - "prometheus.port=9187" + + # ========================================================================== + # Redis Exporter + # ========================================================================== + redis-exporter: + image: oliver006/redis_exporter:v1.58.0 + container_name: redis-exporter + restart: unless-stopped + ports: + - "9121:9121" + environment: + REDIS_ADDR: "redis://redis:6379" + REDIS_PASSWORD: "${REDIS_PASSWORD}" + networks: + - monitoring + depends_on: + - redis + labels: + - "prometheus.scrape=true" + - "prometheus.port=9121" + +networks: + monitoring: + external: true +``` + +### Step 2: 自訂 PostgreSQL 查詢 (15min) + +```yaml +# ops/monitoring/postgres-exporter-queries.yaml +# 自訂查詢 - 擴展預設指標 + +# ========================================================================== +# 連接池監控 +# ========================================================================== +pg_stat_activity_count: + query: | + SELECT + datname, + state, + count(*) as count + FROM pg_stat_activity + WHERE datname IS NOT NULL + GROUP BY datname, state + metrics: + - datname: + usage: "LABEL" + description: "Database name" + - state: + usage: "LABEL" + description: "Connection state" + - count: + usage: "GAUGE" + description: "Number of connections" + +# ========================================================================== +# 慢查詢監控 (> 1 秒) +# ========================================================================== +pg_slow_queries: + query: | + SELECT + datname, + usename, + count(*) as slow_query_count + FROM pg_stat_activity + WHERE state = 'active' + AND query_start < now() - interval '1 second' + AND query NOT LIKE 'SELECT pg_%' + GROUP BY 
datname, usename + metrics: + - datname: + usage: "LABEL" + description: "Database name" + - usename: + usage: "LABEL" + description: "User name" + - slow_query_count: + usage: "GAUGE" + description: "Number of slow queries (> 1s)" + +# ========================================================================== +# 鎖等待監控 +# ========================================================================== +pg_locks_waiting: + query: | + SELECT + datname, + mode, + count(*) as waiting_count + FROM pg_locks + WHERE NOT granted + GROUP BY datname, mode + metrics: + - datname: + usage: "LABEL" + description: "Database name" + - mode: + usage: "LABEL" + description: "Lock mode" + - waiting_count: + usage: "GAUGE" + description: "Number of locks waiting" + +# ========================================================================== +# 表膨脹估算 (Dead Tuples) +# ========================================================================== +pg_stat_user_tables_bloat: + query: | + SELECT + schemaname, + relname, + n_dead_tup, + n_live_tup, + CASE WHEN n_live_tup > 0 + THEN round(100.0 * n_dead_tup / n_live_tup, 2) + ELSE 0 + END as dead_tuple_ratio + FROM pg_stat_user_tables + WHERE n_live_tup > 1000 + ORDER BY n_dead_tup DESC + LIMIT 20 + metrics: + - schemaname: + usage: "LABEL" + description: "Schema name" + - relname: + usage: "LABEL" + description: "Table name" + - n_dead_tup: + usage: "GAUGE" + description: "Dead tuples" + - n_live_tup: + usage: "GAUGE" + description: "Live tuples" + - dead_tuple_ratio: + usage: "GAUGE" + description: "Dead tuple percentage" + +# ========================================================================== +# 資料庫大小 +# ========================================================================== +pg_database_size_bytes: + query: | + SELECT + datname, + pg_database_size(datname) as size_bytes + FROM pg_database + WHERE datname NOT IN ('template0', 'template1') + metrics: + - datname: + usage: "LABEL" + description: "Database name" + - size_bytes: + usage: 
# k8s/monitoring/database-alerts.yaml
# Alerting rules for the PostgreSQL and Redis exporters.
#
# Metric naming: postgres_exporter publishes a custom query column as
# <query_name>_<column_name>, e.g. pg_slow_queries_slow_query_count.
# NOTE(review): pg_stat_activity_count below is assumed to be the exporter's
# built-in metric; the custom query of the same name would be exported as
# pg_stat_activity_count_count - confirm which series exists after deployment.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: database-alerts
  namespace: monitoring
  labels:
    app: prometheus
spec:
  groups:
    # =========================================================================
    # PostgreSQL alerts
    # =========================================================================
    - name: postgresql
      rules:
        # Connection pool close to the limit.
        # max_connections comes from the exporter's pg_settings_* metrics;
        # SQL cannot be embedded in a PromQL expression.
        - alert: PostgreSQLConnectionPoolNearLimit
          expr: |
            sum by (datname) (pg_stat_activity_count{state="active"})
              /
            scalar(max(pg_settings_max_connections))
              > 0.8
          for: 5m
          labels:
            severity: warning
            service: postgres
            owner: infra-team
          annotations:
            summary: "PostgreSQL 連接池使用率 > 80%"
            description: "Database {{ $labels.datname }} 連接使用率: {{ $value | humanizePercentage }}"
            auto_repair: "analyze_connection_leak"

        # Connection pool exhausted (same denominator as above instead of a
        # hard-coded max_connections = 100).
        - alert: PostgreSQLConnectionPoolExhausted
          expr: |
            sum by (datname) (pg_stat_activity_count{state="active"})
              /
            scalar(max(pg_settings_max_connections))
              > 0.95
          for: 2m
          labels:
            severity: critical
            service: postgres
            owner: infra-team
          annotations:
            summary: "PostgreSQL 連接池即將耗盡"
            description: "Database {{ $labels.datname }} 連接使用率 > 95%"
            auto_repair: "restart_api_pods"

        # Too many slow queries (custom query pg_slow_queries, column
        # slow_query_count).
        - alert: PostgreSQLSlowQueries
          expr: pg_slow_queries_slow_query_count > 5
          for: 5m
          labels:
            severity: warning
            service: postgres
            owner: backend-team
          annotations:
            summary: "PostgreSQL 慢查詢數量過多"
            description: "User {{ $labels.usename }} 有 {{ $value }} 個慢查詢"
            auto_repair: "analyze_slow_queries"

        # Too many lock waits (custom query pg_locks_waiting, column
        # waiting_count).
        - alert: PostgreSQLLockWaiting
          expr: sum(pg_locks_waiting_waiting_count) > 10
          for: 2m
          labels:
            severity: warning
            service: postgres
            owner: backend-team
          annotations:
            summary: "PostgreSQL 鎖等待過多"
            description: "{{ $value }} 個查詢正在等待鎖"

        # Table bloat (dead tuple percentage).
        - alert: PostgreSQLTableBloat
          expr: pg_stat_user_tables_bloat_dead_tuple_ratio > 20
          for: 30m
          labels:
            severity: warning
            service: postgres
            owner: infra-team
          annotations:
            summary: "PostgreSQL 表膨脹"
            description: "Table {{ $labels.relname }} dead tuple 比例: {{ $value }}%"
            auto_repair: "schedule_vacuum"

        # PostgreSQL unreachable.
        - alert: PostgreSQLDown
          expr: pg_up == 0
          for: 1m
          labels:
            severity: critical
            service: postgres
            owner: infra-team
          annotations:
            summary: "PostgreSQL 無法連線"
            auto_repair: "restart_postgres_container"

    # =========================================================================
    # Redis alerts
    # =========================================================================
    - name: redis
      rules:
        # Memory usage high. redis_memory_max_bytes is 0 when maxmemory is not
        # configured; filtering on "> 0" avoids a division by zero that would
        # make the ratio +Inf and fire the alert permanently.
        - alert: RedisMemoryHigh
          expr: |
            redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.85
          for: 5m
          labels:
            severity: warning
            service: redis
            owner: infra-team
          annotations:
            summary: "Redis 記憶體使用 > 85%"
            description: "Redis 記憶體: {{ $value | humanizePercentage }}"
            auto_repair: "analyze_redis_keys"

        # Memory almost exhausted (same guard as above).
        - alert: RedisMemoryCritical
          expr: |
            redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.95
          for: 2m
          labels:
            severity: critical
            service: redis
            owner: infra-team
          annotations:
            summary: "Redis 記憶體即將耗盡"
            description: "Redis 記憶體使用 > 95%"
            auto_repair: "flush_expired_keys"

        # Cache hit rate low.
        - alert: RedisCacheHitRateLow
          expr: |
            rate(redis_keyspace_hits_total[5m])
              /
            (rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m]))
              < 0.8
          for: 15m
          labels:
            severity: warning
            service: redis
            owner: backend-team
          annotations:
            summary: "Redis 快取命中率 < 80%"
            description: "命中率: {{ $value | humanizePercentage }}"

        # Too many client connections.
        - alert: RedisConnectionsHigh
          expr: redis_connected_clients > 500
          for: 5m
          labels:
            severity: warning
            service: redis
            owner: infra-team
          annotations:
            summary: "Redis 連接數過高"
            description: "連接數: {{ $value }}"

        # Frequent key eviction.
        - alert: RedisEvictedKeys
          expr: rate(redis_evicted_keys_total[5m]) > 100
          for: 5m
          labels:
            severity: warning
            service: redis
            owner: backend-team
          annotations:
            summary: "Redis Key 驅逐頻繁"
            description: "每秒驅逐 {{ $value }} 個 key"
            auto_repair: "increase_redis_memory"

        # Redis unreachable.
        - alert: RedisDown
          expr: redis_up == 0
          for: 1m
          labels:
            severity: critical
            service: redis
            owner: infra-team
          annotations:
            summary: "Redis 無法連線"
            auto_repair: "restart_redis_container"
重載 Prometheus +kubectl rollout restart deployment/prometheus -n monitoring + +echo "=== 部署完成 ===" +echo "PostgreSQL Exporter: http://$HOST:9187/metrics" +echo "Redis Exporter: http://$HOST:9121/metrics" +``` + +--- + +## Phase B-2: 驗證清單 (30min) + +### 驗證 Prometheus Targets + +```bash +# 檢查 targets 是否 UP +curl -s http://192.168.0.120:30090/api/v1/targets | jq '.data.activeTargets[] | select(.labels.job | contains("exporter")) | {job: .labels.job, health: .health}' +``` + +預期輸出: +```json +{"job": "postgres-exporter", "health": "up"} +{"job": "redis-exporter", "health": "up"} +``` + +### 驗證關鍵指標 + +```bash +# PostgreSQL 連接數 +curl -s http://192.168.0.188:9187/metrics | grep pg_stat_activity_count + +# Redis 記憶體 +curl -s http://192.168.0.188:9121/metrics | grep redis_memory_used_bytes +``` + +### 觸發測試告警 + +```bash +# 模擬連接池壓力測試 +pgbench -c 80 -j 4 -T 60 -h 192.168.0.188 -U postgres awoooi +``` + +--- + +## 交付物清單 + +| 檔案 | 狀態 | 說明 | +|------|------|------| +| `ops/monitoring/docker-compose.exporters.yaml` | 🆕 | Exporter 容器配置 | +| `ops/monitoring/postgres-exporter-queries.yaml` | 🆕 | 自訂 PG 查詢 | +| `k8s/monitoring/prometheus-scrape-exporters.yaml` | 🆕 | Scrape 配置 | +| `k8s/monitoring/database-alerts.yaml` | 🆕 | 告警規則 | +| `ops/monitoring/deploy-exporters.sh` | 🆕 | 部署腳本 | + +--- + +**預估總工時**: 3h +**部署位置**: 192.168.0.188 +**依賴**: Docker Compose, 現有 PostgreSQL/Redis diff --git a/docs/proposals/IMPLEMENTATION_STEPS_INCIDENT_FREQUENCY.md b/docs/proposals/IMPLEMENTATION_STEPS_INCIDENT_FREQUENCY.md new file mode 100644 index 00000000..85ce2ceb --- /dev/null +++ b/docs/proposals/IMPLEMENTATION_STEPS_INCIDENT_FREQUENCY.md @@ -0,0 +1,511 @@ +# Incident 模型頻率欄位實施步驟 + +> **優先級**: P0 +> **預估工時**: 2h +> **目標**: Incident 支援頻率統計與聚合 + +--- + +## 現狀分析 + +| 模型 | hit_count | frequency | escalation | +|------|-----------|-----------|------------| +| Approval | ✅ 有 | ❌ 無 | ❌ 無 | +| Incident | ❌ 無 | ❌ 無 | ❌ 無 | + +--- + +## Step 1: 更新 Incident 模型 (30min) + +### 1.1 新增欄位 + +```python +# 
apps/api/src/models/incident.py +# 在 Incident 類別中新增以下欄位 + +from datetime import datetime +from typing import Optional +from pydantic import BaseModel, Field + + +class IncidentFrequencyStats(BaseModel): + """事件頻率統計""" + anomaly_key: str = Field(..., description="異常簽名 hash") + count_1h: int = Field(default=0, description="1 小時內發生次數") + count_24h: int = Field(default=0, description="24 小時內發生次數") + count_7d: int = Field(default=0, description="7 天內發生次數") + count_30d: int = Field(default=0, description="30 天內發生次數") + first_seen: datetime = Field(default_factory=datetime.now) + last_seen: datetime = Field(default_factory=datetime.now) + escalation_level: Optional[str] = Field( + default=None, + description="升級等級: REPEAT, ESCALATE, PERMANENT_FIX" + ) + + +class IncidentRepairStats(BaseModel): + """修復嘗試統計""" + total_attempts: int = Field(default=0, description="總修復嘗試次數") + successful_attempts: int = Field(default=0, description="成功次數") + last_repair_action: Optional[str] = Field(default=None, description="最近修復動作") + last_repair_time: Optional[datetime] = Field(default=None) + repair_history: list[dict] = Field( + default_factory=list, + description="修復歷史: [{action, success, timestamp}]" + ) + recommended_tier: int = Field( + default=1, + description="建議修復 Tier: 1=重啟, 2=緩解, 3=根因, 4=架構" + ) + + +# 在 Incident 模型中新增 +class Incident(BaseModel): + # ... 現有欄位 ... 
+ + # 🆕 頻率統計 + frequency_stats: Optional[IncidentFrequencyStats] = Field( + default=None, + description="異常頻率統計" + ) + + # 🆕 修復統計 + repair_stats: Optional[IncidentRepairStats] = Field( + default=None, + description="修復嘗試統計" + ) + + # 🆕 聚合控制 + is_aggregated: bool = Field( + default=False, + description="是否為聚合告警 (同一問題多次觸發)" + ) + aggregated_count: int = Field( + default=1, + description="聚合次數 (窗口期內的觸發次數)" + ) + aggregation_window_start: Optional[datetime] = Field( + default=None, + description="聚合窗口開始時間" + ) +``` + +### 1.2 資料庫遷移 (如使用 SQLAlchemy + Alembic) + +```python +# apps/api/src/db/migrations/add_incident_frequency.py +""" +新增 Incident 頻率欄位 +2026-03-29 ogt: 監控戰略規劃 +""" + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects.postgresql import JSONB + + +def upgrade(): + # 新增 frequency_stats JSONB 欄位 + op.add_column( + 'incidents', + sa.Column('frequency_stats', JSONB, nullable=True) + ) + + # 新增 repair_stats JSONB 欄位 + op.add_column( + 'incidents', + sa.Column('repair_stats', JSONB, nullable=True) + ) + + # 新增聚合欄位 + op.add_column( + 'incidents', + sa.Column('is_aggregated', sa.Boolean, default=False) + ) + op.add_column( + 'incidents', + sa.Column('aggregated_count', sa.Integer, default=1) + ) + op.add_column( + 'incidents', + sa.Column('aggregation_window_start', sa.DateTime, nullable=True) + ) + + # 建立索引 (用於查詢重複事件) + op.create_index( + 'ix_incidents_frequency_anomaly_key', + 'incidents', + [sa.text("(frequency_stats->>'anomaly_key')")] + ) + + +def downgrade(): + op.drop_index('ix_incidents_frequency_anomaly_key') + op.drop_column('incidents', 'aggregation_window_start') + op.drop_column('incidents', 'aggregated_count') + op.drop_column('incidents', 'is_aggregated') + op.drop_column('incidents', 'repair_stats') + op.drop_column('incidents', 'frequency_stats') +``` + +--- + +## Step 2: 更新 IncidentService (45min) + +### 2.1 新增聚合邏輯 + +```python +# apps/api/src/services/incident_service.py +# 新增或修改以下方法 + +from src.services.anomaly_counter import 
get_anomaly_counter, AnomalyFrequency +from src.models.incident import IncidentFrequencyStats, IncidentRepairStats + + +class IncidentService: + # 聚合窗口 (10 分鐘內同一問題不建新 Incident) + AGGREGATION_WINDOW_MINUTES = 10 + + async def create_or_aggregate_incident( + self, + alert_data: dict, + analysis_result: dict | None = None, + ) -> tuple[Incident, bool]: + """ + 建立或聚合 Incident + + Returns: + tuple[Incident, bool]: (Incident, is_new) + - is_new=True: 新建的 Incident + - is_new=False: 聚合到現有 Incident + """ + # 1. 記錄到 AnomalyCounter + anomaly_counter = get_anomaly_counter() + anomaly_signature = self._extract_signature(alert_data) + frequency = await anomaly_counter.record_anomaly(anomaly_signature) + + # 2. 檢查是否有可聚合的現有 Incident + existing = await self._find_aggregatable_incident( + anomaly_key=frequency.anomaly_key, + window_minutes=self.AGGREGATION_WINDOW_MINUTES, + ) + + if existing: + # 聚合到現有 Incident + return await self._aggregate_to_existing(existing, frequency), False + else: + # 建立新 Incident + return await self._create_new_incident( + alert_data=alert_data, + frequency=frequency, + analysis_result=analysis_result, + ), True + + async def _find_aggregatable_incident( + self, + anomaly_key: str, + window_minutes: int, + ) -> Incident | None: + """ + 查找可聚合的現有 Incident + + 條件: + 1. 相同 anomaly_key + 2. 在聚合窗口內 + 3. 狀態為 OPEN 或 ANALYZING + """ + cutoff = datetime.now() - timedelta(minutes=window_minutes) + + # Redis 快速查詢 + cache_key = f"incident:aggregation:{anomaly_key}" + cached_id = await self.redis.get(cache_key) + + if cached_id: + incident = await self.get_by_id(cached_id.decode()) + if incident and incident.status in ['OPEN', 'ANALYZING']: + return incident + + # 資料庫查詢 (fallback) + # ... 實作資料庫查詢 ... 
+ + return None + + async def _aggregate_to_existing( + self, + incident: Incident, + frequency: AnomalyFrequency, + ) -> Incident: + """ + 聚合到現有 Incident + """ + # 更新聚合計數 + incident.aggregated_count += 1 + incident.is_aggregated = True + + # 更新頻率統計 + incident.frequency_stats = IncidentFrequencyStats( + anomaly_key=frequency.anomaly_key, + count_1h=frequency.count_1h, + count_24h=frequency.count_24h, + count_7d=frequency.count_7d, + count_30d=frequency.count_30d, + first_seen=frequency.first_seen, + last_seen=frequency.last_seen, + escalation_level=frequency.escalation_level, + ) + + # 更新修復建議 Tier + if incident.repair_stats: + incident.repair_stats.recommended_tier = await self._calculate_tier(frequency) + + # 儲存 + await self.update(incident) + + logger.info( + "incident_aggregated", + incident_id=str(incident.id), + aggregated_count=incident.aggregated_count, + escalation_level=frequency.escalation_level, + ) + + return incident + + async def _create_new_incident( + self, + alert_data: dict, + frequency: AnomalyFrequency, + analysis_result: dict | None, + ) -> Incident: + """ + 建立新 Incident (含頻率統計) + """ + # 計算建議 Tier + recommended_tier = await self._calculate_tier(frequency) + + incident = Incident( + # ... 現有欄位 ... 
+ frequency_stats=IncidentFrequencyStats( + anomaly_key=frequency.anomaly_key, + count_1h=frequency.count_1h, + count_24h=frequency.count_24h, + count_7d=frequency.count_7d, + count_30d=frequency.count_30d, + first_seen=frequency.first_seen, + last_seen=frequency.last_seen, + escalation_level=frequency.escalation_level, + ), + repair_stats=IncidentRepairStats( + recommended_tier=recommended_tier, + ), + is_aggregated=False, + aggregated_count=1, + aggregation_window_start=datetime.now(), + ) + + # 儲存 + await self.create(incident) + + # 設置聚合快取 (10 分鐘) + cache_key = f"incident:aggregation:{frequency.anomaly_key}" + await self.redis.setex(cache_key, 600, str(incident.id)) + + return incident + + async def _calculate_tier(self, frequency: AnomalyFrequency) -> int: + """ + 根據頻率計算建議修復 Tier + """ + # 取得修復歷史 + counter = get_anomaly_counter() + stats = await counter.get_all_repair_stats(frequency.anomaly_key) + + restart_count = stats.get('restart_pod', {}).get('total', 0) + restart_count += stats.get('restart_container', {}).get('total', 0) + + if frequency.permanent_fix_applied: + return 4 # 已有永久修復但仍出問題 + if frequency.escalation_level == 'PERMANENT_FIX': + return 3 # 24h ≥10 次 + if frequency.escalation_level == 'ESCALATE': + return 2 # 24h ≥5 次 + if restart_count >= 2: + return 2 # 已重啟 2 次 + return 1 + + def _extract_signature(self, alert_data: dict) -> dict: + """ + 從告警資料提取異常簽名 + """ + labels = alert_data.get('labels', {}) + return { + 'alert_name': labels.get('alertname', ''), + 'service': labels.get('job', labels.get('service', '')), + 'namespace': labels.get('namespace', ''), + 'error_type': labels.get('reason', labels.get('error_type', '')), + } + + async def record_repair_result( + self, + incident_id: str, + action: str, + success: bool, + ): + """ + 記錄修復結果到 Incident + """ + incident = await self.get_by_id(incident_id) + if not incident: + return + + # 更新 repair_stats + if not incident.repair_stats: + incident.repair_stats = IncidentRepairStats() + + 
incident.repair_stats.total_attempts += 1 + if success: + incident.repair_stats.successful_attempts += 1 + + incident.repair_stats.last_repair_action = action + incident.repair_stats.last_repair_time = datetime.now() + incident.repair_stats.repair_history.append({ + 'action': action, + 'success': success, + 'timestamp': datetime.now().isoformat(), + }) + + # 只保留最近 20 次 + incident.repair_stats.repair_history = incident.repair_stats.repair_history[-20:] + + # 同步到 AnomalyCounter + if incident.frequency_stats: + counter = get_anomaly_counter() + await counter.record_repair_attempt( + anomaly_key=incident.frequency_stats.anomaly_key, + action=action, + success=success, + ) + + await self.update(incident) +``` + +--- + +## Step 3: 更新 alertmanager_webhook.py (30min) + +### 3.1 使用新的聚合方法 + +```python +# apps/api/src/api/v1/alertmanager_webhook.py +# 修改告警處理邏輯 + +@router.post("/alertmanager") +async def handle_alertmanager( + request: Request, + background_tasks: BackgroundTasks, +): + payload = await request.json() + alerts = payload.get("alerts", []) + + for alert in alerts: + if alert.get("status") == "firing": + # 🆕 使用聚合方法 + incident_service = get_incident_service() + incident, is_new = await incident_service.create_or_aggregate_incident( + alert_data=alert, + ) + + if is_new: + # 新 Incident: 觸發 AI 分析 + Telegram + background_tasks.add_task( + analyze_and_notify, + incident=incident, + alert_data=alert, + ) + else: + # 聚合 Incident: 只更新統計,不重複通知 + # (除非達到升級閾值) + if incident.frequency_stats.escalation_level in ['ESCALATE', 'PERMANENT_FIX']: + background_tasks.add_task( + send_escalation_notification, + incident=incident, + ) + + return {"status": "ok", "processed": len(alerts)} +``` + +--- + +## Step 4: 前端顯示頻率資訊 (15min) + +### 4.1 Incident 卡片新增頻率區塊 + +```typescript +// apps/web/src/components/incidents/IncidentCard.tsx +// 新增頻率統計顯示 + +interface FrequencyStatsProps { + stats: { + count_1h: number; + count_24h: number; + count_7d: number; + count_30d: number; + 
escalation_level: string | null; + }; +} + +function FrequencyStats({ stats }: FrequencyStatsProps) { + const escalationColors = { + REPEAT: 'text-yellow-500', + ESCALATE: 'text-orange-500', + PERMANENT_FIX: 'text-red-500', + }; + + return ( +