From 7b2f585244202d18d8381e9035ce6b3ebcd7b98b Mon Sep 17 00:00:00 2001 From: OG T Date: Sun, 29 Mar 2026 10:23:04 +0800 Subject: [PATCH] =?UTF-8?q?docs:=20=E5=AE=8C=E6=95=B4=E7=9B=A3=E6=8E=A7?= =?UTF-8?q?=E5=AF=A6=E6=96=BD=E6=AD=A5=E9=A9=9F=20(7=20Phase=20=E8=A9=B3?= =?UTF-8?q?=E7=B4=B0=E6=96=87=E6=AA=94)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase A: AnomalyCounter 服務 (4h) - Redis Sorted Set 滑動窗口計數 - 頻率閾值告警 (REPEAT/ESCALATE/PERMANENT_FIX) - Tier 決策邏輯整合 Phase B: Database Exporters (3h) - pg_exporter: 連接池/慢查詢/鎖等待/膨脹監控 - redis_exporter: 記憶體/命中率/驅逐監控 - 15+ 告警規則 Phase C: Incident 頻率欄位 (2h) - IncidentFrequencyStats 模型 - 告警聚合邏輯 (10 分鐘窗口) - 前端頻率顯示 Phase D: Sentry Comment 回寫 (1h) - 完成 TODO 實作 - Sentry API Token 配置 Phase E: SignOz 告警規則 (2h) - Error Rate / Latency 告警 - Trace 異常檢測 - SignOz Webhook Handler Phase F: Alert Chain E2E (2h) - Smoke Test 腳本 - CD Pipeline 整合 - 鏈路監控告警 Phase G: Learning Service (3h) - 修復效果學習 - 成功率計算 - Playbook 自動更新 總工時: 17h (2-3 天) Co-Authored-By: Claude Opus 4.5 --- .../IMPLEMENTATION_STEPS_ANOMALY_COUNTER.md | 699 ++++++++++ ...IMPLEMENTATION_STEPS_DATABASE_EXPORTERS.md | 522 ++++++++ ...IMPLEMENTATION_STEPS_INCIDENT_FREQUENCY.md | 511 ++++++++ .../IMPLEMENTATION_STEPS_REMAINING_PHASES.md | 1168 +++++++++++++++++ 4 files changed, 2900 insertions(+) create mode 100644 docs/proposals/IMPLEMENTATION_STEPS_ANOMALY_COUNTER.md create mode 100644 docs/proposals/IMPLEMENTATION_STEPS_DATABASE_EXPORTERS.md create mode 100644 docs/proposals/IMPLEMENTATION_STEPS_INCIDENT_FREQUENCY.md create mode 100644 docs/proposals/IMPLEMENTATION_STEPS_REMAINING_PHASES.md diff --git a/docs/proposals/IMPLEMENTATION_STEPS_ANOMALY_COUNTER.md b/docs/proposals/IMPLEMENTATION_STEPS_ANOMALY_COUNTER.md new file mode 100644 index 00000000..6e965b56 --- /dev/null +++ b/docs/proposals/IMPLEMENTATION_STEPS_ANOMALY_COUNTER.md @@ -0,0 +1,699 @@ +# AnomalyCounter 服務實施步驟 + +> **優先級**: P0 +> **預估工時**: 4h +> **目標**: 建立異常頻率追蹤能力 + +--- + +## 
Step 1: 建立 anomaly_counter.py (1h) + +### 1.1 建立檔案 + +```bash +touch apps/api/src/services/anomaly_counter.py +``` + +### 1.2 實作 AnomalyCounter 類別 + +```python +# apps/api/src/services/anomaly_counter.py +""" +異常頻率統計服務 +================================ +2026-03-29 ogt: 監控戰略規劃 Section 9 實作 + +使用 Redis Sorted Set 實作滑動窗口計數: +- ZADD anomaly:timeline:{key} {timestamp} {timestamp} +- ZCOUNT anomaly:timeline:{key} {start} +inf +- ZREMRANGEBYSCORE anomaly:timeline:{key} -inf {cutoff} +""" + +import hashlib +import json +from datetime import datetime, timedelta +from typing import NamedTuple + +import redis.asyncio as redis +import structlog + +logger = structlog.get_logger(__name__) + + +class AnomalyFrequency(NamedTuple): + """異常頻率資料""" + anomaly_key: str + count_1h: int + count_24h: int + count_7d: int + count_30d: int + first_seen: datetime + last_seen: datetime + auto_repair_count: int + permanent_fix_applied: bool + escalation_level: str | None # None, REPEAT, ESCALATE, PERMANENT_FIX + + +class AnomalyCounter: + """ + 異常計數器 - 追蹤每種異常的發生頻率 + + 閾值配置 (目前寫死於 THRESHOLDS;下列環境變數覆寫為規劃項目,尚未實作): + - ANOMALY_REPEAT_THRESHOLD: 3 (預設) + - ANOMALY_ESCALATE_THRESHOLD: 5 (預設) + - ANOMALY_PERMANENT_FIX_THRESHOLD: 10 (預設) + """ + + THRESHOLDS = { + 'REPEAT': 3, # 3 次 → 重複告警 + 'ESCALATE': 5, # 5 次 → 人工介入 + 'PERMANENT_FIX': 10, # 10 次 → 必須永久修復 + } + + # Redis Key 前綴 + PREFIX_TIMELINE = "anomaly:timeline:" + PREFIX_REPAIR_COUNT = "anomaly:repair_count:" + PREFIX_PERMANENT_FIX = "anomaly:permanent_fix:" + PREFIX_METADATA = "anomaly:metadata:" + + def __init__(self, redis_client: redis.Redis): + self.redis = redis_client + + @staticmethod + def _hash_signature(signature: dict) -> str: + """ + 生成異常簽名的 hash key + + 簽名欄位: + - alert_name: 告警名稱 (e.g., PodCrashLoopBackOff) + - service: 服務名稱 (e.g., awoooi-api) + - namespace: K8s 命名空間 (e.g., awoooi-prod) + - error_type: 錯誤類型 (e.g., OOMKilled) + """ + # 只取關鍵欄位,忽略時間戳等易變欄位 + key_fields = { + 'alert_name': signature.get('alert_name', signature.get('alertname', '')), + 'service': 
signature.get('service', signature.get('job', '')), + 'namespace': signature.get('namespace', ''), + 'error_type': signature.get('error_type', signature.get('reason', '')), + } + # 排序確保一致性 + canonical = json.dumps(key_fields, sort_keys=True) + return hashlib.sha256(canonical.encode()).hexdigest()[:16] + + async def record_anomaly(self, anomaly_signature: dict) -> AnomalyFrequency: + """ + 記錄一次異常發生 + + Args: + anomaly_signature: 異常簽名字典 + + Returns: + AnomalyFrequency: 當前頻率統計 + """ + anomaly_key = self._hash_signature(anomaly_signature) + now = datetime.now() + timestamp = now.timestamp() + timeline_key = f"{self.PREFIX_TIMELINE}{anomaly_key}" + + # 1. 添加到 Sorted Set (score = timestamp, member = timestamp string) + await self.redis.zadd(timeline_key, {str(timestamp): timestamp}) + + # 2. 清理過期數據 (30 天前) + cutoff_30d = (now - timedelta(days=30)).timestamp() + await self.redis.zremrangebyscore(timeline_key, '-inf', cutoff_30d) + + # 3. 設置 TTL (35 天,比清理週期長一點) + await self.redis.expire(timeline_key, 35 * 24 * 3600) + + # 4. 計算各時間窗口的計數 + count_1h = await self.redis.zcount( + timeline_key, + (now - timedelta(hours=1)).timestamp(), + '+inf' + ) + count_24h = await self.redis.zcount( + timeline_key, + (now - timedelta(hours=24)).timestamp(), + '+inf' + ) + count_7d = await self.redis.zcount( + timeline_key, + (now - timedelta(days=7)).timestamp(), + '+inf' + ) + count_30d = await self.redis.zcount( + timeline_key, + cutoff_30d, + '+inf' + ) + + # 5. 取得首次/最近時間 + first_seen_data = await self.redis.zrange(timeline_key, 0, 0, withscores=True) + last_seen_data = await self.redis.zrange(timeline_key, -1, -1, withscores=True) + + first_seen = datetime.fromtimestamp(first_seen_data[0][1]) if first_seen_data else now + last_seen = datetime.fromtimestamp(last_seen_data[0][1]) if last_seen_data else now + + # 6. 
讀取修復統計 + auto_repair_count = int(await self.redis.get(f"{self.PREFIX_REPAIR_COUNT}{anomaly_key}") or 0) + permanent_fix = await self.redis.get(f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}") == b'1' + + # 7. 儲存 metadata (首次記錄時) + metadata_key = f"{self.PREFIX_METADATA}{anomaly_key}" + if not await self.redis.exists(metadata_key): + await self.redis.hset(metadata_key, mapping={ + 'signature': json.dumps(anomaly_signature), + 'first_seen': now.isoformat(), + }) + await self.redis.expire(metadata_key, 35 * 24 * 3600) + + # 8. 判斷升級等級 + escalation_level = self._get_escalation_level(count_24h) + + freq = AnomalyFrequency( + anomaly_key=anomaly_key, + count_1h=count_1h, + count_24h=count_24h, + count_7d=count_7d, + count_30d=count_30d, + first_seen=first_seen, + last_seen=last_seen, + auto_repair_count=auto_repair_count, + permanent_fix_applied=permanent_fix, + escalation_level=escalation_level, + ) + + # 9. 記錄日誌 + logger.info( + "anomaly_recorded", + anomaly_key=anomaly_key, + count_1h=count_1h, + count_24h=count_24h, + count_30d=count_30d, + escalation_level=escalation_level, + ) + + return freq + + def _get_escalation_level(self, count_24h: int) -> str | None: + """判斷升級等級""" + if count_24h >= self.THRESHOLDS['PERMANENT_FIX']: + return 'PERMANENT_FIX' + elif count_24h >= self.THRESHOLDS['ESCALATE']: + return 'ESCALATE' + elif count_24h >= self.THRESHOLDS['REPEAT']: + return 'REPEAT' + return None + + async def record_repair_attempt(self, anomaly_key: str, action: str, success: bool): + """ + 記錄修復嘗試 + + Args: + anomaly_key: 異常 key + action: 修復動作 (e.g., restart_pod, scale_up) + success: 是否成功 + """ + repair_key = f"{self.PREFIX_REPAIR_COUNT}{anomaly_key}" + + # 遞增修復嘗試次數 + await self.redis.incr(repair_key) + await self.redis.expire(repair_key, 35 * 24 * 3600) + + # 記錄修復歷史 (用於學習) + history_key = f"anomaly:repair_history:{anomaly_key}" + await self.redis.lpush(history_key, json.dumps({ + 'action': action, + 'success': success, + 'timestamp': datetime.now().isoformat(), + })) + 
await self.redis.ltrim(history_key, 0, 99) # 只保留最近 100 次 + await self.redis.expire(history_key, 35 * 24 * 3600) + + logger.info( + "repair_attempt_recorded", + anomaly_key=anomaly_key, + action=action, + success=success, + ) + + async def mark_permanent_fix_applied(self, anomaly_key: str, fix_description: str): + """ + 標記已套用永久修復 + + Args: + anomaly_key: 異常 key + fix_description: 修復說明 + """ + await self.redis.set(f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}", '1') + await self.redis.expire(f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}", 90 * 24 * 3600) # 90 天 + + # 記錄修復詳情 + metadata_key = f"{self.PREFIX_METADATA}{anomaly_key}" + await self.redis.hset(metadata_key, mapping={ + 'permanent_fix_applied': 'true', + 'permanent_fix_description': fix_description, + 'permanent_fix_time': datetime.now().isoformat(), + }) + + logger.info( + "permanent_fix_marked", + anomaly_key=anomaly_key, + fix_description=fix_description, + ) + + async def get_repair_success_rate(self, anomaly_key: str, action: str) -> dict: + """ + 取得特定動作的修復成功率 + + Returns: + { + 'action': 'restart_pod', + 'total': 10, + 'success': 3, + 'success_rate': 0.3, + } + """ + history_key = f"anomaly:repair_history:{anomaly_key}" + history = await self.redis.lrange(history_key, 0, -1) + + total = 0 + success = 0 + + for item in history: + data = json.loads(item) + if data['action'] == action: + total += 1 + if data['success']: + success += 1 + + return { + 'action': action, + 'total': total, + 'success': success, + 'success_rate': success / total if total > 0 else 0.0, + } + + async def get_all_repair_stats(self, anomaly_key: str) -> dict[str, dict]: + """ + 取得所有修復動作的統計 + + Returns: + { + 'restart_pod': {'total': 10, 'success': 3, 'success_rate': 0.3}, + 'scale_up': {'total': 2, 'success': 1, 'success_rate': 0.5}, + } + """ + history_key = f"anomaly:repair_history:{anomaly_key}" + history = await self.redis.lrange(history_key, 0, -1) + + stats: dict[str, dict] = {} + + for item in history: + data = json.loads(item) + 
action = data['action'] + + if action not in stats: + stats[action] = {'total': 0, 'success': 0} + + stats[action]['total'] += 1 + if data['success']: + stats[action]['success'] += 1 + + # 計算成功率 + for action, s in stats.items(): + s['success_rate'] = s['success'] / s['total'] if s['total'] > 0 else 0.0 + + return stats + + +# ============================================================================= +# Singleton 模式 +# ============================================================================= +_anomaly_counter: AnomalyCounter | None = None + + +def get_anomaly_counter() -> AnomalyCounter: + """取得 AnomalyCounter 實例""" + global _anomaly_counter + if _anomaly_counter is None: + from src.core.redis import get_redis_client + _anomaly_counter = AnomalyCounter(get_redis_client()) + return _anomaly_counter +``` + +--- + +## Step 2: 整合到 alertmanager_webhook.py (1h) + +### 2.1 在收到告警時記錄頻率 + +```python +# apps/api/src/api/v1/alertmanager_webhook.py +# 在 handle_alertmanager 函數中新增 + +from src.services.anomaly_counter import get_anomaly_counter + +async def handle_alertmanager(request: Request, background_tasks: BackgroundTasks): + # ... 現有代碼 ... + + # 🆕 記錄異常頻率 + anomaly_counter = get_anomaly_counter() + for alert in alerts: + anomaly_signature = { + 'alert_name': alert.get('labels', {}).get('alertname'), + 'service': alert.get('labels', {}).get('job'), + 'namespace': alert.get('labels', {}).get('namespace'), + 'error_type': alert.get('labels', {}).get('reason'), + } + freq = await anomaly_counter.record_anomaly(anomaly_signature) + + # 將頻率資訊傳遞給後續處理 + alert['_anomaly_frequency'] = freq._asdict() + + # ... 繼續現有流程 ... 
+``` + +### 2.2 在 Telegram 告警中顯示頻率 + +```python +# apps/api/src/services/telegram_gateway.py +# 修改 send_approval_card 方法,新增頻率資訊 + +async def send_approval_card( + self, + approval_id: str, + risk_level: str, + resource_name: str, + root_cause: str, + suggested_action: str, + primary_responsibility: str, + confidence: float, + namespace: str, + anomaly_frequency: dict | None = None, # 🆕 新增參數 +): + # ... 現有代碼 ... + + # 🆕 頻率資訊區塊 + frequency_section = "" + if anomaly_frequency and anomaly_frequency.get('count_24h', 0) > 1: + freq = anomaly_frequency + escalation_emoji = { + None: "", + 'REPEAT': "⚠️", + 'ESCALATE': "🔴", + 'PERMANENT_FIX': "🚨", + }.get(freq.get('escalation_level'), "") + + frequency_section = f""" +📊 頻率統計 {escalation_emoji}: + • 1小時: {freq.get('count_1h', 0)} 次 + • 24小時: {freq.get('count_24h', 0)} 次 + • 7天: {freq.get('count_7d', 0)} 次 + • 30天: {freq.get('count_30d', 0)} 次 + • 修復嘗試: {freq.get('auto_repair_count', 0)} 次 +""" + if freq.get('escalation_level'): + frequency_section += f" 🔺 升級建議: {freq['escalation_level']}\n" + + # 插入到告警卡片中 + # ... 
+``` + +--- + +## Step 3: 整合到 sentry_webhook.py (30min) + +### 3.1 Sentry 告警也要記錄頻率 + +```python +# apps/api/src/api/v1/sentry_webhook.py +# 在 analyze_and_comment 函數中新增 + +from src.services.anomaly_counter import get_anomaly_counter + +async def analyze_and_comment( + error_context: dict, + issue_id: str, + project_slug: str +): + # 🆕 記錄異常頻率 + anomaly_counter = get_anomaly_counter() + anomaly_signature = { + 'alert_name': 'sentry_error', + 'service': error_context.get('project', 'unknown'), + 'error_type': error_context.get('title', 'unknown'), + 'culprit': error_context.get('culprit', 'unknown'), + } + freq = await anomaly_counter.record_anomaly(anomaly_signature) + + # 傳遞給 Telegram 告警 + await send_sentry_telegram_alert( + error_context=error_context, + analysis=analysis, + approval_id=approval_id, + anomaly_frequency=freq._asdict(), # 🆕 + ) +``` + +--- + +## Step 4: 整合到 auto_repair_service.py (1h) + +### 4.1 修復前檢查頻率,決定 Tier + +```python +# apps/api/src/services/auto_repair_service.py +# 新增 Tier 決策邏輯 + +from src.services.anomaly_counter import get_anomaly_counter, AnomalyFrequency + +class AutoRepairService: + async def determine_repair_tier( + self, + anomaly_key: str, + frequency: AnomalyFrequency, + ) -> int: + """ + 根據頻率決定修復 Tier + + Returns: + 1: 臨時修復 (重啟) + 2: 緩解修復 (擴容) + 3: 根因修復 (配置變更) + 4: 架構修復 (需開發) + """ + # 取得修復歷史 + counter = get_anomaly_counter() + stats = await counter.get_all_repair_stats(anomaly_key) + + # 計算重啟次數 + restart_count = stats.get('restart_pod', {}).get('total', 0) + restart_count += stats.get('restart_container', {}).get('total', 0) + + # Tier 決策邏輯 + if frequency.permanent_fix_applied: + return 4 # 已有永久修復但仍出問題 → 需架構級修復 + + if frequency.escalation_level == 'PERMANENT_FIX': + return 3 # 24h 內 ≥10 次 → 根因修復 + + if frequency.escalation_level == 'ESCALATE': + return 2 # 24h 內 ≥5 次 → 緩解修復 + + if restart_count >= 2: + return 2 # 已重啟 2 次 → 升級到緩解 + + return 1 # 預設臨時修復 + + async def get_tier_actions(self, tier: int) -> list[str]: + """ + 根據 Tier 
返回可用修復動作 + """ + TIER_ACTIONS = { + 1: ['restart_pod', 'restart_container'], + 2: ['scale_up', 'increase_memory', 'adjust_limits'], + 3: ['apply_hotfix', 'update_config', 'patch_deployment'], + 4: ['create_issue', 'notify_team', 'schedule_fix'], + } + return TIER_ACTIONS.get(tier, TIER_ACTIONS[1]) +``` + +### 4.2 修復後記錄結果 + +```python +# apps/api/src/services/auto_repair_service.py +# 在執行修復後 + +async def execute_repair(self, ...): + # ... 執行修復 ... + + # 🆕 記錄修復嘗試 + counter = get_anomaly_counter() + await counter.record_repair_attempt( + anomaly_key=anomaly_key, + action=repair_action, + success=result.success, + ) + + # 如果是 Tier 3 永久修復成功 + if tier == 3 and result.success: + await counter.mark_permanent_fix_applied( + anomaly_key=anomaly_key, + fix_description=f"Applied {repair_action}: {result.message}", + ) +``` + +--- + +## Step 5: 單元測試 (30min) + +### 5.1 建立測試檔案 + +```python +# apps/api/tests/test_anomaly_counter.py +""" +AnomalyCounter 單元測試 +""" + +import pytest +from datetime import datetime, timedelta +from unittest.mock import AsyncMock, MagicMock +from src.services.anomaly_counter import AnomalyCounter, AnomalyFrequency + + +@pytest.fixture +def mock_redis(): + """模擬 Redis 客戶端""" + redis = AsyncMock() + redis.zadd = AsyncMock() + redis.zremrangebyscore = AsyncMock() + redis.expire = AsyncMock() + redis.zcount = AsyncMock(return_value=5) + redis.zrange = AsyncMock(return_value=[(b'123', 1234567890.0)]) + redis.get = AsyncMock(return_value=None) + redis.exists = AsyncMock(return_value=False) + redis.hset = AsyncMock() + return redis + + +@pytest.fixture +def counter(mock_redis): + return AnomalyCounter(mock_redis) + + +class TestHashSignature: + def test_same_input_same_hash(self): + sig1 = {'alert_name': 'PodCrash', 'service': 'api'} + sig2 = {'alert_name': 'PodCrash', 'service': 'api'} + assert AnomalyCounter._hash_signature(sig1) == AnomalyCounter._hash_signature(sig2) + + def test_different_input_different_hash(self): + sig1 = {'alert_name': 'PodCrash', 
'service': 'api'} + sig2 = {'alert_name': 'PodCrash', 'service': 'web'} + assert AnomalyCounter._hash_signature(sig1) != AnomalyCounter._hash_signature(sig2) + + def test_ignores_extra_fields(self): + sig1 = {'alert_name': 'PodCrash', 'service': 'api'} + sig2 = {'alert_name': 'PodCrash', 'service': 'api', 'timestamp': '2026-01-01'} + assert AnomalyCounter._hash_signature(sig1) == AnomalyCounter._hash_signature(sig2) + + +class TestEscalationLevel: + def test_no_escalation(self, counter): + assert counter._get_escalation_level(2) is None + + def test_repeat_level(self, counter): + assert counter._get_escalation_level(3) == 'REPEAT' + assert counter._get_escalation_level(4) == 'REPEAT' + + def test_escalate_level(self, counter): + assert counter._get_escalation_level(5) == 'ESCALATE' + assert counter._get_escalation_level(9) == 'ESCALATE' + + def test_permanent_fix_level(self, counter): + assert counter._get_escalation_level(10) == 'PERMANENT_FIX' + assert counter._get_escalation_level(100) == 'PERMANENT_FIX' + + +class TestRecordAnomaly: + @pytest.mark.asyncio + async def test_records_to_redis(self, counter, mock_redis): + sig = {'alert_name': 'PodCrash', 'service': 'api'} + freq = await counter.record_anomaly(sig) + + # 驗證 Redis 操作 + mock_redis.zadd.assert_called_once() + mock_redis.zremrangebyscore.assert_called_once() + mock_redis.expire.assert_called() + + # 驗證返回值 + assert isinstance(freq, AnomalyFrequency) + assert freq.count_1h == 5 # mock 返回值 +``` + +--- + +## Step 6: 部署驗證 (30min) + +### 6.1 本地測試 + +```bash +cd apps/api +pytest tests/test_anomaly_counter.py -v +``` + +### 6.2 整合測試 + +```bash +# 啟動本地 Redis +docker run -d --name test-redis -p 6380:6379 redis:7 + +# 手動測試 +python -c " +import asyncio +from src.services.anomaly_counter import AnomalyCounter +import redis.asyncio as redis + +async def test(): + r = redis.Redis(host='localhost', port=6380) + counter = AnomalyCounter(r) + + # 記錄 5 次異常 + for i in range(5): + freq = await 
counter.record_anomaly({'alert_name': 'TestAlert', 'service': 'test'}) + print(f'Count: {freq.count_24h}, Level: {freq.escalation_level}') + +asyncio.run(test()) +" +``` + +### 6.3 預期輸出 + +``` +Count: 1, Level: None +Count: 2, Level: None +Count: 3, Level: REPEAT +Count: 4, Level: REPEAT +Count: 5, Level: ESCALATE +``` + +--- + +## 交付物清單 + +| 檔案 | 狀態 | 說明 | +|------|------|------| +| `apps/api/src/services/anomaly_counter.py` | 🆕 新建 | 核心服務 | +| `apps/api/src/api/v1/alertmanager_webhook.py` | 📝 修改 | 整合頻率追蹤 | +| `apps/api/src/api/v1/sentry_webhook.py` | 📝 修改 | 整合頻率追蹤 | +| `apps/api/src/services/telegram_gateway.py` | 📝 修改 | 顯示頻率資訊 | +| `apps/api/src/services/auto_repair_service.py` | 📝 修改 | Tier 決策 | +| `apps/api/tests/test_anomaly_counter.py` | 🆕 新建 | 單元測試 | + +--- + +**預估總工時**: 4h +**前置依賴**: Redis (已有) +**後續工作**: Phase B 資料庫 Exporter diff --git a/docs/proposals/IMPLEMENTATION_STEPS_DATABASE_EXPORTERS.md b/docs/proposals/IMPLEMENTATION_STEPS_DATABASE_EXPORTERS.md new file mode 100644 index 00000000..daebc509 --- /dev/null +++ b/docs/proposals/IMPLEMENTATION_STEPS_DATABASE_EXPORTERS.md @@ -0,0 +1,522 @@ +# 資料庫 Exporter 部署實施步驟 + +> **優先級**: P0 +> **預估工時**: 3h +> **目標**: PostgreSQL 與 Redis 完整監控覆蓋 + +--- + +## 現狀分析 + +| 服務 | 當前監控 | 缺失指標 | +|------|---------|---------| +| PostgreSQL | ❌ 零 | 連接數、慢查詢、鎖等待、複製延遲 | +| Redis | ❌ 零 | 記憶體使用、命中率、命令延遲、驅逐率 | + +--- + +## Phase B-1: PostgreSQL Exporter (1.5h) + +### Step 1: 建立 Docker Compose 配置 (15min) + +```yaml +# ops/monitoring/docker-compose.exporters.yaml +# 2026-03-29 ogt: 資料庫監控 Exporter +# 部署位置: 192.168.0.188 (pg 主機) + +version: '3.8' + +services: + # ========================================================================== + # PostgreSQL Exporter + # ========================================================================== + postgres-exporter: + image: prometheuscommunity/postgres-exporter:v0.15.0 + container_name: postgres-exporter + restart: unless-stopped + ports: + - "9187:9187" + environment: + DATA_SOURCE_NAME: 
"postgresql://postgres:${POSTGRES_PASSWORD}@postgres:5432/awoooi?sslmode=disable" + PG_EXPORTER_EXTEND_QUERY_PATH: "/etc/postgres_exporter/queries.yaml" + volumes: + - ./postgres-exporter-queries.yaml:/etc/postgres_exporter/queries.yaml:ro + networks: + - monitoring + depends_on: + - postgres + labels: + - "prometheus.scrape=true" + - "prometheus.port=9187" + + # ========================================================================== + # Redis Exporter + # ========================================================================== + redis-exporter: + image: oliver006/redis_exporter:v1.58.0 + container_name: redis-exporter + restart: unless-stopped + ports: + - "9121:9121" + environment: + REDIS_ADDR: "redis://redis:6379" + REDIS_PASSWORD: "${REDIS_PASSWORD}" + networks: + - monitoring + depends_on: + - redis + labels: + - "prometheus.scrape=true" + - "prometheus.port=9121" + +networks: + monitoring: + external: true +``` + +### Step 2: 自訂 PostgreSQL 查詢 (15min) + +```yaml +# ops/monitoring/postgres-exporter-queries.yaml +# 自訂查詢 - 擴展預設指標 + +# ========================================================================== +# 連接池監控 +# ========================================================================== +pg_stat_activity_count: + query: | + SELECT + datname, + state, + count(*) as count + FROM pg_stat_activity + WHERE datname IS NOT NULL + GROUP BY datname, state + metrics: + - datname: + usage: "LABEL" + description: "Database name" + - state: + usage: "LABEL" + description: "Connection state" + - count: + usage: "GAUGE" + description: "Number of connections" + +# ========================================================================== +# 慢查詢監控 (> 1 秒) +# ========================================================================== +pg_slow_queries: + query: | + SELECT + datname, + usename, + count(*) as slow_query_count + FROM pg_stat_activity + WHERE state = 'active' + AND query_start < now() - interval '1 second' + AND query NOT LIKE 'SELECT pg_%' + GROUP BY 
datname, usename + metrics: + - datname: + usage: "LABEL" + description: "Database name" + - usename: + usage: "LABEL" + description: "User name" + - slow_query_count: + usage: "GAUGE" + description: "Number of slow queries (> 1s)" + +# ========================================================================== +# 鎖等待監控 +# ========================================================================== +pg_locks_waiting: + query: | + SELECT + datname, + mode, + count(*) as waiting_count + FROM pg_locks + WHERE NOT granted + GROUP BY datname, mode + metrics: + - datname: + usage: "LABEL" + description: "Database name" + - mode: + usage: "LABEL" + description: "Lock mode" + - waiting_count: + usage: "GAUGE" + description: "Number of locks waiting" + +# ========================================================================== +# 表膨脹估算 (Dead Tuples) +# ========================================================================== +pg_stat_user_tables_bloat: + query: | + SELECT + schemaname, + relname, + n_dead_tup, + n_live_tup, + CASE WHEN n_live_tup > 0 + THEN round(100.0 * n_dead_tup / n_live_tup, 2) + ELSE 0 + END as dead_tuple_ratio + FROM pg_stat_user_tables + WHERE n_live_tup > 1000 + ORDER BY n_dead_tup DESC + LIMIT 20 + metrics: + - schemaname: + usage: "LABEL" + description: "Schema name" + - relname: + usage: "LABEL" + description: "Table name" + - n_dead_tup: + usage: "GAUGE" + description: "Dead tuples" + - n_live_tup: + usage: "GAUGE" + description: "Live tuples" + - dead_tuple_ratio: + usage: "GAUGE" + description: "Dead tuple percentage" + +# ========================================================================== +# 資料庫大小 +# ========================================================================== +pg_database_size_bytes: + query: | + SELECT + datname, + pg_database_size(datname) as size_bytes + FROM pg_database + WHERE datname NOT IN ('template0', 'template1') + metrics: + - datname: + usage: "LABEL" + description: "Database name" + - size_bytes: + usage: 
"GAUGE" + description: "Database size in bytes" +``` + +### Step 3: 建立 Prometheus Scrape 配置 (15min) + +```yaml +# k8s/monitoring/prometheus-scrape-exporters.yaml +# 新增到 Prometheus ConfigMap + +# PostgreSQL Exporter +- job_name: 'postgres-exporter' + static_configs: + - targets: ['192.168.0.188:9187'] + relabel_configs: + - source_labels: [__address__] + target_label: instance + replacement: 'postgres-primary' + +# Redis Exporter +- job_name: 'redis-exporter' + static_configs: + - targets: ['192.168.0.188:9121'] + relabel_configs: + - source_labels: [__address__] + target_label: instance + replacement: 'redis-primary' +``` + +### Step 4: 建立告警規則 (30min) + +```yaml +# k8s/monitoring/database-alerts.yaml +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: database-alerts + namespace: monitoring + labels: + app: prometheus +spec: + groups: + # ========================================================================= + # PostgreSQL 告警 + # ========================================================================= + - name: postgresql + rules: + # 連接池即將耗盡 + - alert: PostgreSQLConnectionPoolNearLimit + expr: | + sum(pg_stat_activity_count{state="active"}) by (datname) + / + scalar(pg_settings_max_connections) + > 0.8 + for: 5m + labels: + severity: warning + service: postgres + owner: infra-team + annotations: + summary: "PostgreSQL 連接池使用率 > 80%" + description: "Database {{ $labels.datname }} 連接使用率: {{ $value | humanizePercentage }}" + auto_repair: "analyze_connection_leak" + + # 連接池耗盡 + - alert: PostgreSQLConnectionPoolExhausted + expr: | + sum(pg_stat_activity_count{state="active"}) by (datname) + / + 100 # 假設 max_connections = 100 + > 0.95 + for: 2m + labels: + severity: critical + service: postgres + owner: infra-team + annotations: + summary: "PostgreSQL 連接池即將耗盡" + description: "Database {{ $labels.datname }} 連接使用率 > 95%" + auto_repair: "restart_api_pods" + + # 慢查詢告警 + - alert: PostgreSQLSlowQueries + expr: 
pg_slow_queries > 5 + for: 5m + labels: + severity: warning + service: postgres + owner: backend-team + annotations: + summary: "PostgreSQL 慢查詢數量過多" + description: "User {{ $labels.usename }} 有 {{ $value }} 個慢查詢" + auto_repair: "analyze_slow_queries" + + # 鎖等待告警 + - alert: PostgreSQLLockWaiting + expr: sum(pg_locks_waiting) > 10 + for: 2m + labels: + severity: warning + service: postgres + owner: backend-team + annotations: + summary: "PostgreSQL 鎖等待過多" + description: "{{ $value }} 個查詢正在等待鎖" + + # 表膨脹告警 + - alert: PostgreSQLTableBloat + expr: pg_stat_user_tables_bloat_dead_tuple_ratio > 20 + for: 30m + labels: + severity: warning + service: postgres + owner: infra-team + annotations: + summary: "PostgreSQL 表膨脹" + description: "Table {{ $labels.relname }} dead tuple 比例: {{ $value }}%" + auto_repair: "schedule_vacuum" + + # PostgreSQL Down + - alert: PostgreSQLDown + expr: pg_up == 0 + for: 1m + labels: + severity: critical + service: postgres + owner: infra-team + annotations: + summary: "PostgreSQL 無法連線" + auto_repair: "restart_postgres_container" + + # ========================================================================= + # Redis 告警 + # ========================================================================= + - name: redis + rules: + # 記憶體使用過高 + - alert: RedisMemoryHigh + expr: | + redis_memory_used_bytes / redis_memory_max_bytes > 0.85 + for: 5m + labels: + severity: warning + service: redis + owner: infra-team + annotations: + summary: "Redis 記憶體使用 > 85%" + description: "Redis 記憶體: {{ $value | humanizePercentage }}" + auto_repair: "analyze_redis_keys" + + # 記憶體即將耗盡 + - alert: RedisMemoryCritical + expr: | + redis_memory_used_bytes / redis_memory_max_bytes > 0.95 + for: 2m + labels: + severity: critical + service: redis + owner: infra-team + annotations: + summary: "Redis 記憶體即將耗盡" + description: "Redis 記憶體使用 > 95%" + auto_repair: "flush_expired_keys" + + # 快取命中率過低 + - alert: RedisCacheHitRateLow + expr: | + rate(redis_keyspace_hits_total[5m]) + / + 
(rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m])) + < 0.8 + for: 15m + labels: + severity: warning + service: redis + owner: backend-team + annotations: + summary: "Redis 快取命中率 < 80%" + description: "命中率: {{ $value | humanizePercentage }}" + + # 連接數過高 + - alert: RedisConnectionsHigh + expr: redis_connected_clients > 500 + for: 5m + labels: + severity: warning + service: redis + owner: infra-team + annotations: + summary: "Redis 連接數過高" + description: "連接數: {{ $value }}" + + # Key 驅逐告警 + - alert: RedisEvictedKeys + expr: rate(redis_evicted_keys_total[5m]) > 100 + for: 5m + labels: + severity: warning + service: redis + owner: backend-team + annotations: + summary: "Redis Key 驅逐頻繁" + description: "每秒驅逐 {{ $value }} 個 key" + auto_repair: "increase_redis_memory" + + # Redis Down + - alert: RedisDown + expr: redis_up == 0 + for: 1m + labels: + severity: critical + service: redis + owner: infra-team + annotations: + summary: "Redis 無法連線" + auto_repair: "restart_redis_container" +``` + +### Step 5: 部署腳本 (15min) + +```bash +#!/bin/bash +# ops/monitoring/deploy-exporters.sh +# 部署資料庫 Exporter 到 192.168.0.188 + +set -euo pipefail + +HOST="192.168.0.188" +DEPLOY_DIR="/opt/monitoring/exporters" + +echo "=== 部署資料庫 Exporter ===" + +# 1. 建立目錄 +ssh $HOST "mkdir -p $DEPLOY_DIR" + +# 2. 複製配置 +scp ops/monitoring/docker-compose.exporters.yaml $HOST:$DEPLOY_DIR/docker-compose.yaml +scp ops/monitoring/postgres-exporter-queries.yaml $HOST:$DEPLOY_DIR/ + +# 3. 啟動容器 (docker compose 會讀取 $DEPLOY_DIR/.env;需事先在該檔設定 POSTGRES_PASSWORD / REDIS_PASSWORD,本腳本不會複製 .env) +ssh $HOST "cd $DEPLOY_DIR && docker compose up -d" + +# 4. 驗證 +echo "等待服務啟動..." +sleep 10 + +echo "驗證 PostgreSQL Exporter..." +curl -s http://$HOST:9187/metrics | head -5 + +echo "驗證 Redis Exporter..." +curl -s http://$HOST:9121/metrics | head -5 + +# 5. 更新 Prometheus 配置 +echo "更新 Prometheus scrape 配置..." +kubectl apply -f k8s/monitoring/prometheus-scrape-exporters.yaml + +# 6. 部署告警規則 +echo "部署告警規則..." +kubectl apply -f k8s/monitoring/database-alerts.yaml + +# 7. 
重載 Prometheus +kubectl rollout restart deployment/prometheus -n monitoring + +echo "=== 部署完成 ===" +echo "PostgreSQL Exporter: http://$HOST:9187/metrics" +echo "Redis Exporter: http://$HOST:9121/metrics" +``` + +--- + +## Phase B-2: 驗證清單 (30min) + +### 驗證 Prometheus Targets + +```bash +# 檢查 targets 是否 UP +curl -s http://192.168.0.120:30090/api/v1/targets | jq '.data.activeTargets[] | select(.labels.job | contains("exporter")) | {job: .labels.job, health: .health}' +``` + +預期輸出: +```json +{"job": "postgres-exporter", "health": "up"} +{"job": "redis-exporter", "health": "up"} +``` + +### 驗證關鍵指標 + +```bash +# PostgreSQL 連接數 +curl -s http://192.168.0.188:9187/metrics | grep pg_stat_activity_count + +# Redis 記憶體 +curl -s http://192.168.0.188:9121/metrics | grep redis_memory_used_bytes +``` + +### 觸發測試告警 + +```bash +# 模擬連接池壓力測試 +pgbench -c 80 -j 4 -T 60 -h 192.168.0.188 -U postgres awoooi +``` + +--- + +## 交付物清單 + +| 檔案 | 狀態 | 說明 | +|------|------|------| +| `ops/monitoring/docker-compose.exporters.yaml` | 🆕 | Exporter 容器配置 | +| `ops/monitoring/postgres-exporter-queries.yaml` | 🆕 | 自訂 PG 查詢 | +| `k8s/monitoring/prometheus-scrape-exporters.yaml` | 🆕 | Scrape 配置 | +| `k8s/monitoring/database-alerts.yaml` | 🆕 | 告警規則 | +| `ops/monitoring/deploy-exporters.sh` | 🆕 | 部署腳本 | + +--- + +**預估總工時**: 3h +**部署位置**: 192.168.0.188 +**依賴**: Docker Compose, 現有 PostgreSQL/Redis diff --git a/docs/proposals/IMPLEMENTATION_STEPS_INCIDENT_FREQUENCY.md b/docs/proposals/IMPLEMENTATION_STEPS_INCIDENT_FREQUENCY.md new file mode 100644 index 00000000..85ce2ceb --- /dev/null +++ b/docs/proposals/IMPLEMENTATION_STEPS_INCIDENT_FREQUENCY.md @@ -0,0 +1,511 @@ +# Incident 模型頻率欄位實施步驟 + +> **優先級**: P0 +> **預估工時**: 2h +> **目標**: Incident 支援頻率統計與聚合 + +--- + +## 現狀分析 + +| 模型 | hit_count | frequency | escalation | +|------|-----------|-----------|------------| +| Approval | ✅ 有 | ❌ 無 | ❌ 無 | +| Incident | ❌ 無 | ❌ 無 | ❌ 無 | + +--- + +## Step 1: 更新 Incident 模型 (30min) + +### 1.1 新增欄位 + +```python +# 
apps/api/src/models/incident.py +# 在 Incident 類別中新增以下欄位 + +from datetime import datetime +from typing import Optional +from pydantic import BaseModel, Field + + +class IncidentFrequencyStats(BaseModel): + """事件頻率統計""" + anomaly_key: str = Field(..., description="異常簽名 hash") + count_1h: int = Field(default=0, description="1 小時內發生次數") + count_24h: int = Field(default=0, description="24 小時內發生次數") + count_7d: int = Field(default=0, description="7 天內發生次數") + count_30d: int = Field(default=0, description="30 天內發生次數") + first_seen: datetime = Field(default_factory=datetime.now) + last_seen: datetime = Field(default_factory=datetime.now) + escalation_level: Optional[str] = Field( + default=None, + description="升級等級: REPEAT, ESCALATE, PERMANENT_FIX" + ) + + +class IncidentRepairStats(BaseModel): + """修復嘗試統計""" + total_attempts: int = Field(default=0, description="總修復嘗試次數") + successful_attempts: int = Field(default=0, description="成功次數") + last_repair_action: Optional[str] = Field(default=None, description="最近修復動作") + last_repair_time: Optional[datetime] = Field(default=None) + repair_history: list[dict] = Field( + default_factory=list, + description="修復歷史: [{action, success, timestamp}]" + ) + recommended_tier: int = Field( + default=1, + description="建議修復 Tier: 1=重啟, 2=緩解, 3=根因, 4=架構" + ) + + +# 在 Incident 模型中新增 +class Incident(BaseModel): + # ... 現有欄位 ... 
+ + # 🆕 頻率統計 + frequency_stats: Optional[IncidentFrequencyStats] = Field( + default=None, + description="異常頻率統計" + ) + + # 🆕 修復統計 + repair_stats: Optional[IncidentRepairStats] = Field( + default=None, + description="修復嘗試統計" + ) + + # 🆕 聚合控制 + is_aggregated: bool = Field( + default=False, + description="是否為聚合告警 (同一問題多次觸發)" + ) + aggregated_count: int = Field( + default=1, + description="聚合次數 (窗口期內的觸發次數)" + ) + aggregation_window_start: Optional[datetime] = Field( + default=None, + description="聚合窗口開始時間" + ) +``` + +### 1.2 資料庫遷移 (如使用 SQLAlchemy) + +```python +# apps/api/src/db/migrations/add_incident_frequency.py +""" +新增 Incident 頻率欄位 +2026-03-29 ogt: 監控戰略規劃 +""" + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects.postgresql import JSONB + + +def upgrade(): + # 新增 frequency_stats JSONB 欄位 + op.add_column( + 'incidents', + sa.Column('frequency_stats', JSONB, nullable=True) + ) + + # 新增 repair_stats JSONB 欄位 + op.add_column( + 'incidents', + sa.Column('repair_stats', JSONB, nullable=True) + ) + + # 新增聚合欄位 + op.add_column( + 'incidents', + sa.Column('is_aggregated', sa.Boolean, default=False) + ) + op.add_column( + 'incidents', + sa.Column('aggregated_count', sa.Integer, default=1) + ) + op.add_column( + 'incidents', + sa.Column('aggregation_window_start', sa.DateTime, nullable=True) + ) + + # 建立索引 (用於查詢重複事件) + op.create_index( + 'ix_incidents_frequency_anomaly_key', + 'incidents', + [sa.text("(frequency_stats->>'anomaly_key')")] + ) + + +def downgrade(): + op.drop_index('ix_incidents_frequency_anomaly_key') + op.drop_column('incidents', 'aggregation_window_start') + op.drop_column('incidents', 'aggregated_count') + op.drop_column('incidents', 'is_aggregated') + op.drop_column('incidents', 'repair_stats') + op.drop_column('incidents', 'frequency_stats') +``` + +--- + +## Step 2: 更新 IncidentService (45min) + +### 2.1 新增聚合邏輯 + +```python +# apps/api/src/services/incident_service.py +# 新增或修改以下方法 + +from src.services.anomaly_counter import 
get_anomaly_counter, AnomalyFrequency +from src.models.incident import IncidentFrequencyStats, IncidentRepairStats + + +class IncidentService: + # 聚合窗口 (10 分鐘內同一問題不建新 Incident) + AGGREGATION_WINDOW_MINUTES = 10 + + async def create_or_aggregate_incident( + self, + alert_data: dict, + analysis_result: dict | None = None, + ) -> tuple[Incident, bool]: + """ + 建立或聚合 Incident + + Returns: + tuple[Incident, bool]: (Incident, is_new) + - is_new=True: 新建的 Incident + - is_new=False: 聚合到現有 Incident + """ + # 1. 記錄到 AnomalyCounter + anomaly_counter = get_anomaly_counter() + anomaly_signature = self._extract_signature(alert_data) + frequency = await anomaly_counter.record_anomaly(anomaly_signature) + + # 2. 檢查是否有可聚合的現有 Incident + existing = await self._find_aggregatable_incident( + anomaly_key=frequency.anomaly_key, + window_minutes=self.AGGREGATION_WINDOW_MINUTES, + ) + + if existing: + # 聚合到現有 Incident + return await self._aggregate_to_existing(existing, frequency), False + else: + # 建立新 Incident + return await self._create_new_incident( + alert_data=alert_data, + frequency=frequency, + analysis_result=analysis_result, + ), True + + async def _find_aggregatable_incident( + self, + anomaly_key: str, + window_minutes: int, + ) -> Incident | None: + """ + 查找可聚合的現有 Incident + + 條件: + 1. 相同 anomaly_key + 2. 在聚合窗口內 + 3. 狀態為 OPEN 或 ANALYZING + """ + cutoff = datetime.now() - timedelta(minutes=window_minutes) + + # Redis 快速查詢 + cache_key = f"incident:aggregation:{anomaly_key}" + cached_id = await self.redis.get(cache_key) + + if cached_id: + incident = await self.get_by_id(cached_id.decode()) + if incident and incident.status in ['OPEN', 'ANALYZING']: + return incident + + # 資料庫查詢 (fallback) + # ... 實作資料庫查詢 ... 
+ + return None + + async def _aggregate_to_existing( + self, + incident: Incident, + frequency: AnomalyFrequency, + ) -> Incident: + """ + 聚合到現有 Incident + """ + # 更新聚合計數 + incident.aggregated_count += 1 + incident.is_aggregated = True + + # 更新頻率統計 + incident.frequency_stats = IncidentFrequencyStats( + anomaly_key=frequency.anomaly_key, + count_1h=frequency.count_1h, + count_24h=frequency.count_24h, + count_7d=frequency.count_7d, + count_30d=frequency.count_30d, + first_seen=frequency.first_seen, + last_seen=frequency.last_seen, + escalation_level=frequency.escalation_level, + ) + + # 更新修復建議 Tier + if incident.repair_stats: + incident.repair_stats.recommended_tier = await self._calculate_tier(frequency) + + # 儲存 + await self.update(incident) + + logger.info( + "incident_aggregated", + incident_id=str(incident.id), + aggregated_count=incident.aggregated_count, + escalation_level=frequency.escalation_level, + ) + + return incident + + async def _create_new_incident( + self, + alert_data: dict, + frequency: AnomalyFrequency, + analysis_result: dict | None, + ) -> Incident: + """ + 建立新 Incident (含頻率統計) + """ + # 計算建議 Tier + recommended_tier = await self._calculate_tier(frequency) + + incident = Incident( + # ... 現有欄位 ... 
+ frequency_stats=IncidentFrequencyStats( + anomaly_key=frequency.anomaly_key, + count_1h=frequency.count_1h, + count_24h=frequency.count_24h, + count_7d=frequency.count_7d, + count_30d=frequency.count_30d, + first_seen=frequency.first_seen, + last_seen=frequency.last_seen, + escalation_level=frequency.escalation_level, + ), + repair_stats=IncidentRepairStats( + recommended_tier=recommended_tier, + ), + is_aggregated=False, + aggregated_count=1, + aggregation_window_start=datetime.now(), + ) + + # 儲存 + await self.create(incident) + + # 設置聚合快取 (10 分鐘) + cache_key = f"incident:aggregation:{frequency.anomaly_key}" + await self.redis.setex(cache_key, 600, str(incident.id)) + + return incident + + async def _calculate_tier(self, frequency: AnomalyFrequency) -> int: + """ + 根據頻率計算建議修復 Tier + """ + # 取得修復歷史 + counter = get_anomaly_counter() + stats = await counter.get_all_repair_stats(frequency.anomaly_key) + + restart_count = stats.get('restart_pod', {}).get('total', 0) + restart_count += stats.get('restart_container', {}).get('total', 0) + + if frequency.permanent_fix_applied: + return 4 # 已有永久修復但仍出問題 + if frequency.escalation_level == 'PERMANENT_FIX': + return 3 # 24h ≥10 次 + if frequency.escalation_level == 'ESCALATE': + return 2 # 24h ≥5 次 + if restart_count >= 2: + return 2 # 已重啟 2 次 + return 1 + + def _extract_signature(self, alert_data: dict) -> dict: + """ + 從告警資料提取異常簽名 + """ + labels = alert_data.get('labels', {}) + return { + 'alert_name': labels.get('alertname', ''), + 'service': labels.get('job', labels.get('service', '')), + 'namespace': labels.get('namespace', ''), + 'error_type': labels.get('reason', labels.get('error_type', '')), + } + + async def record_repair_result( + self, + incident_id: str, + action: str, + success: bool, + ): + """ + 記錄修復結果到 Incident + """ + incident = await self.get_by_id(incident_id) + if not incident: + return + + # 更新 repair_stats + if not incident.repair_stats: + incident.repair_stats = IncidentRepairStats() + + 
incident.repair_stats.total_attempts += 1 + if success: + incident.repair_stats.successful_attempts += 1 + + incident.repair_stats.last_repair_action = action + incident.repair_stats.last_repair_time = datetime.now() + incident.repair_stats.repair_history.append({ + 'action': action, + 'success': success, + 'timestamp': datetime.now().isoformat(), + }) + + # 只保留最近 20 次 + incident.repair_stats.repair_history = incident.repair_stats.repair_history[-20:] + + # 同步到 AnomalyCounter + if incident.frequency_stats: + counter = get_anomaly_counter() + await counter.record_repair_attempt( + anomaly_key=incident.frequency_stats.anomaly_key, + action=action, + success=success, + ) + + await self.update(incident) +``` + +--- + +## Step 3: 更新 alertmanager_webhook.py (30min) + +### 3.1 使用新的聚合方法 + +```python +# apps/api/src/api/v1/alertmanager_webhook.py +# 修改告警處理邏輯 + +@router.post("/alertmanager") +async def handle_alertmanager( + request: Request, + background_tasks: BackgroundTasks, +): + payload = await request.json() + alerts = payload.get("alerts", []) + + for alert in alerts: + if alert.get("status") == "firing": + # 🆕 使用聚合方法 + incident_service = get_incident_service() + incident, is_new = await incident_service.create_or_aggregate_incident( + alert_data=alert, + ) + + if is_new: + # 新 Incident: 觸發 AI 分析 + Telegram + background_tasks.add_task( + analyze_and_notify, + incident=incident, + alert_data=alert, + ) + else: + # 聚合 Incident: 只更新統計,不重複通知 + # (除非達到升級閾值) + if incident.frequency_stats.escalation_level in ['ESCALATE', 'PERMANENT_FIX']: + background_tasks.add_task( + send_escalation_notification, + incident=incident, + ) + + return {"status": "ok", "processed": len(alerts)} +``` + +--- + +## Step 4: 前端顯示頻率資訊 (15min) + +### 4.1 Incident 卡片新增頻率區塊 + +```typescript +// apps/web/src/components/incidents/IncidentCard.tsx +// 新增頻率統計顯示 + +interface FrequencyStatsProps { + stats: { + count_1h: number; + count_24h: number; + count_7d: number; + count_30d: number; + 
escalation_level: string | null; + }; +} + +function FrequencyStats({ stats }: FrequencyStatsProps) { + const escalationColors = { + REPEAT: 'text-yellow-500', + ESCALATE: 'text-orange-500', + PERMANENT_FIX: 'text-red-500', + }; + + return ( +
+

+ 📊 頻率統計 + {stats.escalation_level && ( + + ⚠️ {stats.escalation_level} + + )} +

+
+
+ 1h: + {stats.count_1h} +
+
+ 24h: + {stats.count_24h} +
+
+ 7d: + {stats.count_7d} +
+
+ 30d: + {stats.count_30d} +
+
+
+ ); +} +``` + +--- + +## 交付物清單 + +| 檔案 | 狀態 | 說明 | +|------|------|------| +| `apps/api/src/models/incident.py` | 📝 修改 | 新增頻率欄位 | +| `apps/api/src/db/migrations/add_incident_frequency.py` | 🆕 | DB 遷移 | +| `apps/api/src/services/incident_service.py` | 📝 修改 | 聚合邏輯 | +| `apps/api/src/api/v1/alertmanager_webhook.py` | 📝 修改 | 使用聚合 | +| `apps/web/src/components/incidents/IncidentCard.tsx` | 📝 修改 | 頻率顯示 | + +--- + +**預估總工時**: 2h +**前置依賴**: Phase A (AnomalyCounter) diff --git a/docs/proposals/IMPLEMENTATION_STEPS_REMAINING_PHASES.md b/docs/proposals/IMPLEMENTATION_STEPS_REMAINING_PHASES.md new file mode 100644 index 00000000..1ce7aa57 --- /dev/null +++ b/docs/proposals/IMPLEMENTATION_STEPS_REMAINING_PHASES.md @@ -0,0 +1,1168 @@ +# 剩餘 Phase 實施步驟 (D-G) + +> **總工時**: 8h +> **優先級**: P0-P1 + +--- + +## Phase D: Sentry Comment 回寫 (1h) + +### 現狀 + +```python +# sentry_webhook.py:290-302 - 目前是 TODO +# TODO: 需要 Sentry API Token +logger.info(f"Would post comment to issue {issue_id}...") +``` + +### Step D-1: 取得 Sentry API Token (10min) + +```bash +# 在 Sentry Self-Hosted 管理後台 +# Settings → API Tokens → Create New Token +# 權限: project:read, issue:write + +# 儲存到 K8s Secret +kubectl create secret generic sentry-api-token \ + --from-literal=SENTRY_API_TOKEN=your_token_here \ + -n awoooi-prod +``` + +### Step D-2: 實作 Comment 回寫 (30min) + +```python +# apps/api/src/api/v1/sentry_webhook.py +# 完成 post_sentry_comment 實作 + +import os + +SENTRY_API_TOKEN = os.getenv("SENTRY_API_TOKEN") +SENTRY_API_URL = "http://192.168.0.110:9000" + + +async def post_sentry_comment( + project_slug: str, + issue_id: str, + analysis: ErrorAnalysisResult, +): + """ + 回寫分析結果到 Sentry Issue Comment + + API: POST /api/0/issues/{issue_id}/comments/ + Docs: https://docs.sentry.io/api/events/create-a-comment/ + """ + if not SENTRY_API_TOKEN: + logger.warning("SENTRY_API_TOKEN not configured, skipping comment") + return + + comment_text = f"""## 🧠 AI 錯誤分析 (by {analysis.analyzed_by}) + +**根本原因 (Root Cause)** 
+{analysis.root_cause} + +**影響範圍 (Impact)** +{analysis.impact} + +**建議修復 (Fix Suggestion)** +``` +{analysis.fix_suggestion} +``` + +**預防措施 (Prevention)** +{analysis.prevention} + +--- +*分析信心度: {analysis.confidence:.0%} | 分析時間: {now_taipei_iso()}* +*Powered by AWOOOI + OpenClaw* +""" + + try: + async with httpx.AsyncClient(timeout=30.0) as client: + response = await client.post( + f"{SENTRY_API_URL}/api/0/issues/{issue_id}/comments/", + headers={ + "Authorization": f"Bearer {SENTRY_API_TOKEN}", + "Content-Type": "application/json", + }, + json={"text": comment_text} + ) + + if response.status_code == 201: + logger.info( + "sentry_comment_posted", + issue_id=issue_id, + comment_length=len(comment_text), + ) + else: + logger.warning( + "sentry_comment_failed", + issue_id=issue_id, + status=response.status_code, + response=response.text[:200], + ) + + except Exception as e: + logger.exception("sentry_comment_error", issue_id=issue_id, error=str(e)) +``` + +### Step D-3: 更新 K8s Deployment (10min) + +```yaml +# k8s/awoooi-prod/03-secrets.yaml +# 新增 Sentry API Token + +--- +apiVersion: v1 +kind: Secret +metadata: + name: sentry-api-token + namespace: awoooi-prod +type: Opaque +stringData: + SENTRY_API_TOKEN: "${SENTRY_API_TOKEN}" +``` + +```yaml +# k8s/awoooi-prod/04-deployment-api.yaml +# 掛載環境變數 + +env: + - name: SENTRY_API_TOKEN + valueFrom: + secretKeyRef: + name: sentry-api-token + key: SENTRY_API_TOKEN +``` + +### Step D-4: 驗證 (10min) + +```bash +# 手動觸發測試 +curl -X POST http://localhost:8000/api/v1/webhooks/sentry/error \ + -H "Content-Type: application/json" \ + -d '{ + "action": "triggered", + "data": { + "issue": { + "id": "12345", + "title": "Test Error", + "level": "error", + "project": {"slug": "awoooi-api"} + } + } + }' + +# 檢查 Sentry Issue 是否有 Comment +``` + +--- + +## Phase E: SignOz 告警規則 (2h) + +### 現狀分析 + +- SignOz 只做資料收集,無告警輸出 +- Error Rate / Latency 異常無法即時通知 + +### Step E-1: SignOz 告警配置 (1h) + +```yaml +# signoz/alerting/rules.yaml +# SignOz 自訂告警規則 + 
+groups: + # ========================================================================= + # API Error Rate 告警 + # ========================================================================= + - name: api_errors + rules: + - alert: APIHighErrorRate + expr: | + sum(rate(signoz_spans_total{ + service_name="awoooi-api", + status_code=~"5.." + }[5m])) by (service_name) + / + sum(rate(signoz_spans_total{ + service_name="awoooi-api" + }[5m])) by (service_name) + > 0.05 + for: 5m + labels: + severity: critical + source: signoz + annotations: + summary: "API 錯誤率 > 5%" + description: "服務 {{ $labels.service_name }} 錯誤率: {{ $value | humanizePercentage }}" + webhook: "http://awoooi-api.awoooi-prod:8000/api/v1/webhooks/signoz/alert" + + # ========================================================================= + # Latency 告警 + # ========================================================================= + - name: latency + rules: + - alert: APIHighLatencyP99 + expr: | + histogram_quantile(0.99, + sum(rate(signoz_spans_duration_bucket{ + service_name="awoooi-api" + }[5m])) by (le, service_name) + ) > 2 + for: 5m + labels: + severity: warning + source: signoz + annotations: + summary: "API P99 延遲 > 2s" + description: "服務 {{ $labels.service_name }} P99: {{ $value }}s" + + - alert: APIHighLatencyP95 + expr: | + histogram_quantile(0.95, + sum(rate(signoz_spans_duration_bucket{ + service_name="awoooi-api" + }[5m])) by (le, service_name) + ) > 1 + for: 10m + labels: + severity: warning + source: signoz + annotations: + summary: "API P95 延遲 > 1s" + + # ========================================================================= + # Trace 異常告警 + # ========================================================================= + - name: traces + rules: + - alert: NoTracesReceived + expr: | + sum(rate(signoz_spans_total[15m])) == 0 + for: 15m + labels: + severity: warning + source: signoz + annotations: + summary: "15 分鐘內無 Trace 數據" + description: "可能是 OTEL Collector 或應用程式問題" + + - alert: HighSpanDropRate + 
expr: | + sum(rate(otelcol_exporter_send_failed_spans[5m])) + / + sum(rate(otelcol_exporter_sent_spans[5m])) + > 0.01 + for: 5m + labels: + severity: warning + source: signoz + annotations: + summary: "Span 丟棄率 > 1%" +``` + +### Step E-2: 建立 SignOz Webhook Handler (30min) + +```python +# apps/api/src/api/v1/signoz_webhook.py +""" +SignOz 告警 Webhook Handler +""" + +from fastapi import APIRouter, Request, BackgroundTasks +import structlog + +from src.services.incident_service import get_incident_service +from src.services.telegram_gateway import get_telegram_gateway + +logger = structlog.get_logger(__name__) +router = APIRouter(prefix="/webhooks/signoz", tags=["SignOz Webhook"]) + + +@router.post("/alert") +async def handle_signoz_alert( + request: Request, + background_tasks: BackgroundTasks, +): + """ + 處理 SignOz 告警 + + SignOz 告警格式: + { + "alertname": "APIHighErrorRate", + "status": "firing", + "labels": {...}, + "annotations": {...}, + "startsAt": "2026-03-29T10:00:00Z" + } + """ + payload = await request.json() + logger.info("signoz_alert_received", payload=payload) + + alert_name = payload.get("alertname") + status = payload.get("status") + + if status != "firing": + return {"status": "ignored", "reason": "not firing"} + + # 轉換為標準告警格式 + normalized = { + "labels": { + "alertname": alert_name, + "source": "signoz", + **payload.get("labels", {}), + }, + "annotations": payload.get("annotations", {}), + "startsAt": payload.get("startsAt"), + } + + # 建立 Incident + incident_service = get_incident_service() + incident, is_new = await incident_service.create_or_aggregate_incident( + alert_data=normalized, + ) + + if is_new: + # 發送 Telegram + background_tasks.add_task( + notify_signoz_alert, + incident=incident, + alert_data=normalized, + ) + + return { + "status": "accepted", + "incident_id": str(incident.id), + "is_new": is_new, + } + + +async def notify_signoz_alert(incident, alert_data: dict): + """發送 SignOz 告警到 Telegram""" + telegram = get_telegram_gateway() + await 
telegram.initialize() + + annotations = alert_data.get("annotations", {}) + + await telegram.send_alert_card( + title=f"📊 SignOz: {alert_data['labels']['alertname']}", + severity=alert_data['labels'].get('severity', 'warning'), + description=annotations.get('description', annotations.get('summary', '')), + source="signoz", + incident_id=str(incident.id), + ) +``` + +### Step E-3: 註冊路由 (10min) + +```python +# apps/api/src/main.py +from src.api.v1 import signoz_webhook + +app.include_router(signoz_webhook.router, prefix="/api/v1") +``` + +### Step E-4: 部署告警規則 (20min) + +```bash +# 複製規則到 SignOz +scp signoz/alerting/rules.yaml 192.168.0.188:/opt/signoz/config/alerting/ + +# 重啟 SignOz Query Service +ssh 192.168.0.188 "docker restart signoz-query-service" + +# 驗證規則載入 +curl http://192.168.0.188:3301/api/v3/alerts/rules +``` + +--- + +## Phase F: 告警鏈路 E2E 驗證 (2h) + +### 現狀問題 + +- 2026-03-26: 路徑錯誤導致 2 天無告警 +- 部署後無自動驗證機制 + +### Step F-1: 建立 Smoke Test 腳本 (30min) + +```python +# ops/scripts/alert_chain_smoke_test.py +#!/usr/bin/env python3 +""" +告警鏈路端到端驗證 + +執行: + python ops/scripts/alert_chain_smoke_test.py + +驗證項目: +1. Alertmanager Webhook 可達 +2. Sentry Webhook 可達 +3. SignOz Webhook 可達 +4. Telegram 發送成功 +5. 
Approval 建立成功 +""" + +import asyncio +import httpx +import sys +from datetime import datetime + + +API_BASE = "http://awoooi-api.awoooi-prod.svc.cluster.local:8000" +# 本地測試用 +# API_BASE = "http://localhost:8000" + +TIMEOUT = 30 + + +async def test_alertmanager_webhook() -> bool: + """測試 Alertmanager Webhook""" + print("🔍 Testing Alertmanager Webhook...") + + test_payload = { + "version": "4", + "status": "firing", + "alerts": [{ + "status": "firing", + "labels": { + "alertname": "E2E_SMOKE_TEST", + "severity": "info", + "service": "smoke-test", + "namespace": "test", + }, + "annotations": { + "summary": "E2E Smoke Test - 請忽略", + "description": f"自動測試 @ {datetime.now().isoformat()}", + }, + "startsAt": datetime.now().isoformat() + "Z", + }] + } + + async with httpx.AsyncClient(timeout=TIMEOUT) as client: + try: + response = await client.post( + f"{API_BASE}/api/v1/webhooks/alertmanager", + json=test_payload, + ) + if response.status_code == 200: + print(" ✅ Alertmanager Webhook: OK") + return True + else: + print(f" ❌ Alertmanager Webhook: {response.status_code}") + print(f" Response: {response.text[:200]}") + return False + except Exception as e: + print(f" ❌ Alertmanager Webhook: {e}") + return False + + +async def test_sentry_webhook() -> bool: + """測試 Sentry Webhook""" + print("🔍 Testing Sentry Webhook...") + + test_payload = { + "action": "triggered", + "data": { + "issue": { + "id": "smoke-test-" + datetime.now().strftime("%Y%m%d%H%M%S"), + "title": "E2E Smoke Test Error", + "level": "error", + "culprit": "smoke_test.py:test", + "project": {"slug": "awoooi-api"}, + "firstSeen": datetime.now().isoformat(), + "count": 1, + }, + "event": { + "message": "E2E Smoke Test - 請忽略", + "platform": "python", + }, + }, + } + + async with httpx.AsyncClient(timeout=TIMEOUT) as client: + try: + response = await client.post( + f"{API_BASE}/api/v1/webhooks/sentry/error", + json=test_payload, + ) + if response.status_code == 200: + result = response.json() + if 
result.get("status") in ["accepted", "deduplicated"]: + print(" ✅ Sentry Webhook: OK") + return True + print(f" ❌ Sentry Webhook: {response.status_code}") + return False + except Exception as e: + print(f" ❌ Sentry Webhook: {e}") + return False + + +async def test_health_endpoint() -> bool: + """測試 Health Endpoint""" + print("🔍 Testing Health Endpoint...") + + async with httpx.AsyncClient(timeout=TIMEOUT) as client: + try: + response = await client.get(f"{API_BASE}/api/v1/health") + if response.status_code == 200: + print(" ✅ Health: OK") + return True + else: + print(f" ❌ Health: {response.status_code}") + return False + except Exception as e: + print(f" ❌ Health: {e}") + return False + + +async def test_telegram_connectivity() -> bool: + """測試 Telegram 連通性""" + print("🔍 Testing Telegram Connectivity...") + + async with httpx.AsyncClient(timeout=TIMEOUT) as client: + try: + # 透過內部 API 檢查 Telegram 狀態 + response = await client.get(f"{API_BASE}/api/v1/telegram/status") + if response.status_code == 200: + data = response.json() + if data.get("connected"): + print(" ✅ Telegram: Connected") + return True + else: + print(" ⚠️ Telegram: Not Connected (but endpoint reachable)") + return True # 端點可達即可 + else: + print(f" ❌ Telegram: {response.status_code}") + return False + except Exception as e: + print(f" ⚠️ Telegram: {e} (endpoint may not exist)") + return True # 不影響整體測試 + + +async def main(): + print("=" * 60) + print("🚀 AWOOOI 告警鏈路 E2E Smoke Test") + print(f" 時間: {datetime.now().isoformat()}") + print(f" 目標: {API_BASE}") + print("=" * 60) + + results = await asyncio.gather( + test_health_endpoint(), + test_alertmanager_webhook(), + test_sentry_webhook(), + test_telegram_connectivity(), + ) + + print("=" * 60) + passed = sum(results) + total = len(results) + + if passed == total: + print(f"✅ 全部通過 ({passed}/{total})") + sys.exit(0) + else: + print(f"❌ 部分失敗 ({passed}/{total})") + sys.exit(1) + + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### Step F-2: 整合到 CD 
Pipeline (30min) + +```yaml +# .github/workflows/cd.yaml +# 新增 smoke test 步驟 + +jobs: + deploy: + # ... 現有步驟 ... + + - name: Wait for Pods Ready + run: | + kubectl rollout status deployment/awoooi-api -n awoooi-prod --timeout=5m + + # 🆕 告警鏈路驗證 + - name: Alert Chain Smoke Test + run: | + # 等待服務完全啟動 + sleep 30 + + # 執行 smoke test + python ops/scripts/alert_chain_smoke_test.py + + env: + API_BASE: "http://awoooi-api.awoooi-prod.svc.cluster.local:8000" + + - name: Notify on Smoke Test Failure + if: failure() + run: | + # 直接發送 Telegram 告警 (繞過可能壞掉的 API) + curl -X POST "https://api.telegram.org/bot${TG_BOT_TOKEN}/sendMessage" \ + -d "chat_id=${TG_CHAT_ID}" \ + -d "text=🚨 AWOOOI CD Smoke Test 失敗!告警鏈路可能中斷!" + env: + TG_BOT_TOKEN: ${{ secrets.OPENCLAW_TG_BOT_TOKEN }} + TG_CHAT_ID: ${{ secrets.OPENCLAW_TG_CHAT_ID }} +``` + +### Step F-3: 建立鏈路監控告警 (30min) + +```yaml +# k8s/monitoring/alert-chain-monitor.yaml +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: alert-chain-monitor + namespace: monitoring +spec: + groups: + - name: alert_chain + rules: + # Alertmanager Webhook 無回應 + - alert: AlertChainBroken_Alertmanager + expr: | + sum(rate(http_requests_total{ + path="/api/v1/webhooks/alertmanager", + status!="200" + }[5m])) > 0 + or + absent(http_requests_total{path="/api/v1/webhooks/alertmanager"}) + for: 10m + labels: + severity: critical + service: alert-chain + annotations: + summary: "Alertmanager Webhook 鏈路異常" + description: "告警無法送達 AWOOOI API" + + # Sentry Webhook 無回應 + - alert: AlertChainBroken_Sentry + expr: | + sum(rate(http_requests_total{ + path="/api/v1/webhooks/sentry/error", + status!="200" + }[5m])) > 0 + for: 10m + labels: + severity: warning + service: alert-chain + annotations: + summary: "Sentry Webhook 鏈路異常" + + # 長時間無告警 (可能鏈路斷了) + - alert: NoAlertsReceivedLong + expr: | + time() - max(awoooi_last_alert_received_timestamp) > 7200 + for: 5m + labels: + severity: warning + service: alert-chain + annotations: + summary: "2 
小時內未收到任何告警" + description: "可能是告警鏈路問題或系統異常穩定" +``` + +### Step F-4: 新增 Metrics (30min) + +```python +# apps/api/src/core/metrics.py +# 新增告警鏈路 metrics + +from prometheus_client import Counter, Gauge, Histogram +import time + +# 最後收到告警的時間戳 +LAST_ALERT_RECEIVED = Gauge( + 'awoooi_last_alert_received_timestamp', + 'Timestamp of last received alert', +) + +# 告警接收計數 +ALERTS_RECEIVED = Counter( + 'awoooi_alerts_received_total', + 'Total alerts received', + ['source', 'status'] +) + +# Webhook 處理延遲 +WEBHOOK_LATENCY = Histogram( + 'awoooi_webhook_latency_seconds', + 'Webhook processing latency', + ['webhook_type'], + buckets=[0.1, 0.5, 1, 2, 5, 10, 30] +) + + +def record_alert_received(source: str, status: str = "accepted"): + """記錄收到告警""" + LAST_ALERT_RECEIVED.set(time.time()) + ALERTS_RECEIVED.labels(source=source, status=status).inc() +``` + +--- + +## Phase G: Learning Service (3h) + +### Step G-1: 建立 learning_service.py (1.5h) + +```python +# apps/api/src/services/learning_service.py +""" +異常學習服務 - 從解決方案中學習 +================================ +2026-03-29 ogt: 監控戰略規劃 Section 9.4 實作 + +功能: +1. 記錄每次修復的效果 +2. 計算各動作的成功率 +3. 推薦最佳修復方案 +4. 
自動更新 Playbook +""" + +import json +from datetime import datetime +from typing import Optional + +import redis.asyncio as redis +import structlog + +from src.services.anomaly_counter import get_anomaly_counter +from src.services.playbook_service import get_playbook_service + +logger = structlog.get_logger(__name__) + + +class LearningService: + """ + 學習每次修復的效果,自動更新 Playbook + """ + + # 學習門檻: 需要至少 N 次數據才能推薦 + MIN_SAMPLES = 5 + + # 成功率門檻: 高於此值才會被推薦 + SUCCESS_RATE_THRESHOLD = 0.6 + + # Tier 對應的動作 + TIER_ACTIONS = { + 1: ['restart_pod', 'restart_container', 'delete_pod'], + 2: ['scale_up', 'increase_memory', 'increase_cpu', 'adjust_limits'], + 3: ['apply_hotfix', 'update_config', 'patch_deployment', 'rollback'], + 4: ['create_issue', 'notify_team', 'schedule_fix', 'manual_intervention'], + } + + def __init__(self, redis_client: redis.Redis): + self.redis = redis_client + + async def record_repair_result( + self, + anomaly_key: str, + repair_action: str, + success: bool, + root_cause: Optional[str] = None, + fix_description: Optional[str] = None, + execution_time_seconds: Optional[float] = None, + ): + """ + 記錄修復結果,用於學習 + + Args: + anomaly_key: 異常 key + repair_action: 修復動作 + success: 是否成功 + root_cause: 根因 (如果找到) + fix_description: 修復說明 + execution_time_seconds: 執行時間 + """ + # 1. 記錄到 AnomalyCounter + counter = get_anomaly_counter() + await counter.record_repair_attempt(anomaly_key, repair_action, success) + + # 2. 記錄詳細學習數據 + learning_key = f"learning:repair:{anomaly_key}:{repair_action}" + record = { + 'success': success, + 'root_cause': root_cause, + 'fix_description': fix_description, + 'execution_time': execution_time_seconds, + 'timestamp': datetime.now().isoformat(), + } + + await self.redis.lpush(learning_key, json.dumps(record)) + await self.redis.ltrim(learning_key, 0, 99) # 保留最近 100 次 + await self.redis.expire(learning_key, 90 * 24 * 3600) # 90 天 + + # 3. 
# NOTE(review): this chunk opens inside `record_repair_result`, whose `def` line lies above
# the visible range. Its closing statements are kept here as a verbatim reference listing,
# because they cannot stand alone without that header; re-attach them to the method when
# splicing this section back under the class.
#
#         # If a root cause was found and the repair succeeded, consider updating the Playbook
#         if success and root_cause:
#             await self._consider_playbook_update(
#                 anomaly_key=anomaly_key,
#                 repair_action=repair_action,
#                 root_cause=root_cause,
#                 fix_description=fix_description,
#             )
#
#         logger.info(
#             "learning_recorded",
#             anomaly_key=anomaly_key,
#             action=repair_action,
#             success=success,
#             has_root_cause=root_cause is not None,
#         )

# NOTE(review): every function below that takes `self` is a method of `LearningService`
# (class header above this chunk). They are laid out at column 0 only because the chunk
# itself starts there; indent them one level when placing them under the class.


async def get_recommended_fix(self, anomaly_key: str) -> dict:
    """Recommend the best repair action for an anomaly from learned history.

    Only actions with at least ``self.MIN_SAMPLES`` recorded executions and a success
    rate of at least ``self.SUCCESS_RATE_THRESHOLD`` are considered. Candidates are
    ranked by ``success_rate * log(total + 1)`` (descending); ties go to the lower tier.
    When there is no history, or no action qualifies, ``_default_recommendation()`` is
    returned instead.

    Returns:
        A dict shaped like::

            {
                'action': 'scale_up',
                'confidence': 0.85,          # success rate of the chosen action
                'tier': 2,
                'based_on': '12 次歷史數據',
                'avg_execution_time': 45.2,
                'alternatives': [...],       # up to two runner-up actions
            }
    """
    # Imported once per call instead of once per loop iteration. Kept local because the
    # file-level import block is outside this chunk.
    import math

    counter = get_anomaly_counter()
    all_stats = await counter.get_all_repair_stats(anomaly_key)
    if not all_stats:
        return self._default_recommendation()

    # Score every action that has enough samples and a good enough success rate.
    scored_actions = []
    for action, stats in all_stats.items():
        if stats['total'] < self.MIN_SAMPLES:
            continue
        success_rate = stats['success_rate']
        if success_rate < self.SUCCESS_RATE_THRESHOLD:
            continue

        # Weighting: success rate * log(sample count + 1)
        score = success_rate * math.log(stats['total'] + 1)
        avg_time = await self._get_avg_execution_time(anomaly_key, action)

        scored_actions.append({
            'action': action,
            'score': score,
            'success_rate': success_rate,
            'total_samples': stats['total'],
            'tier': self._get_action_tier(action),
            'avg_execution_time': avg_time,
        })

    if not scored_actions:
        return self._default_recommendation()

    # Highest weighted score first; equal scores are ordered by the lower tier.
    scored_actions.sort(key=lambda x: (-x['score'], x['tier']))
    best, *runners_up = scored_actions

    return {
        'action': best['action'],
        'confidence': best['success_rate'],
        'tier': best['tier'],
        'based_on': f"{best['total_samples']} 次歷史數據",
        'avg_execution_time': best['avg_execution_time'],
        'alternatives': [
            {'action': a['action'], 'confidence': a['success_rate'], 'tier': a['tier']}
            for a in runners_up[:2]
        ],
    }


async def _get_avg_execution_time(self, anomaly_key: str, action: str) -> float:
    """Return the mean ``execution_time`` over list entries 0..19 for this action.

    Entries are read from the Redis list ``learning:repair:{anomaly_key}:{action}`` and
    parsed as JSON. Entries whose ``execution_time`` is missing or falsy are ignored.
    Returns ``0.0`` when no usable entry exists.
    """
    learning_key = f"learning:repair:{anomaly_key}:{action}"
    records = await self.redis.lrange(learning_key, 0, 19)  # the latest 20 attempts

    # NOTE(review): a stored duration of exactly 0 is skipped by the truthiness test.
    # That is kept as-is, since 0 may be what the writer stores for "not recorded".
    durations = [
        value
        for value in (json.loads(raw).get('execution_time') for raw in records)
        if value
    ]
    return sum(durations) / len(durations) if durations else 0.0


async def _consider_playbook_update(
    self,
    anomaly_key: str,
    repair_action: str,
    root_cause: str,
    fix_description: str,
) -> None:
    """Create or refresh the Playbook entry for an anomaly when the data supports it.

    The Playbook is written only when all of the following hold:

    1. the action has at least 5 recorded executions (``stats['total']``);
    2. its success rate is at least 80%;
    3. no Playbook exists for the anomaly yet, or the existing one has a strictly lower
       success rate than this action.
    """
    counter = get_anomaly_counter()
    stats = await counter.get_repair_success_rate(anomaly_key, repair_action)

    if stats['total'] < 5 or stats['success_rate'] < 0.8:
        return

    playbook_service = get_playbook_service()
    existing = await playbook_service.find_by_anomaly_key(anomaly_key)
    if existing and existing.success_rate >= stats['success_rate']:
        return  # the stored Playbook is already at least as good

    await playbook_service.create_or_update(
        anomaly_key=anomaly_key,
        root_cause=root_cause,
        fix_action=repair_action,
        fix_description=fix_description,
        success_rate=stats['success_rate'],
        total_executions=stats['total'],
        source='auto_learning',
    )

    logger.info(
        "playbook_auto_updated",
        anomaly_key=anomaly_key,
        action=repair_action,
        success_rate=stats['success_rate'],
    )


def _get_action_tier(self, action: str) -> int:
    """Return the tier whose entry in ``self.TIER_ACTIONS`` contains *action* (1 if none)."""
    return next(
        (tier for tier, actions in self.TIER_ACTIONS.items() if action in actions),
        1,  # default: Tier 1
    )


def _default_recommendation(self) -> dict:
    """Return the fixed fallback recommendation used when no history qualifies."""
    return {
        'action': 'restart_pod',
        'confidence': 0.3,
        'tier': 1,
        'based_on': '無歷史數據,使用預設',
        'avg_execution_time': 30.0,
        'alternatives': [
            {'action': 'delete_pod', 'confidence': 0.3, 'tier': 1},
        ],
    }


async def get_learning_summary(self, anomaly_key: str) -> dict:
    """Summarise what has been learned about one anomaly.

    Returns:
        A dict shaped like::

            {
                'anomaly_key': 'abc123',
                'total_occurrences': 15,
                'total_repair_attempts': 8,
                'overall_success_rate': 0.625,
                'actions_tried': ['restart_pod', 'scale_up'],
                'best_action': {'action': 'scale_up', 'success_rate': 0.75},
                'learning_status': 'learning',
            }

        ``learning_status`` is one of:

        * ``'insufficient'`` - fewer than 3 repair attempts;
        * ``'learning'``     - 3 to 9 attempts;
        * ``'excellent'``    - 10+ attempts and overall success rate >= 0.8;
        * ``'sufficient'``   - 10+ attempts otherwise.

        ``best_action`` is ``None`` unless some action has at least 3 attempts and a
        success rate above zero.
    """
    counter = get_anomaly_counter()

    # Occurrence count is the cardinality of the anomaly's timeline sorted set.
    # NOTE(review): if old entries are trimmed from that set, this is the number still
    # retained, not an all-time total - confirm against AnomalyCounter.
    timeline_key = f"anomaly:timeline:{anomaly_key}"
    total_occurrences = await self.redis.zcard(timeline_key)

    # Tolerate a falsy result the same way get_recommended_fix does.
    all_stats = (await counter.get_all_repair_stats(anomaly_key)) or {}

    total_attempts = sum(s['total'] for s in all_stats.values())
    total_success = sum(s['success'] for s in all_stats.values())
    overall_rate = total_success / total_attempts if total_attempts > 0 else 0.0

    # Best action: highest success rate among actions tried at least 3 times.
    best_action = None
    best_rate = 0
    for action, stats in all_stats.items():
        if stats['total'] >= 3 and stats['success_rate'] > best_rate:
            best_rate = stats['success_rate']
            best_action = {'action': action, 'success_rate': best_rate}

    if total_attempts < 3:
        status = 'insufficient'
    elif total_attempts < 10:
        status = 'learning'
    elif overall_rate >= 0.8:
        status = 'excellent'
    else:
        status = 'sufficient'

    return {
        'anomaly_key': anomaly_key,
        'total_occurrences': total_occurrences,
        'total_repair_attempts': total_attempts,
        'overall_success_rate': overall_rate,
        'actions_tried': list(all_stats.keys()),
        'best_action': best_action,
        'learning_status': status,
    }


# =============================================================================
# Singleton
# =============================================================================
_learning_service: "LearningService | None" = None


def get_learning_service() -> "LearningService":
    """Return the process-wide LearningService, creating it on first use."""
    global _learning_service
    if _learning_service is None:
        # Imported lazily, at first use, as in the original.
        from src.core.redis import get_redis_client
        _learning_service = LearningService(get_redis_client())
    return _learning_service
```

### Step G-2: 整合到 auto_repair_service.py (1h)

```python
# apps/api/src/services/auto_repair_service.py
# Changes to the repair-execution flow

import time

from src.services.learning_service import get_learning_service


class AutoRepairService:
    async def execute_repair(
        self,
        incident_id: str,
        anomaly_key: str,
        repair_action: str,
        dry_run: bool = False,
    ) -> AutoRepairResult:
        """
        Execute a repair and record the outcome as learning data.

        Dry runs are executed but never recorded, so they cannot distort the
        success-rate statistics that drive recommendations.
        """
        learning = get_learning_service()
        # Monotonic clock: durations must not be affected by wall-clock adjustments.
        started = time.monotonic()

        # 1. Execute the repair. Only this call sits inside the try block, so a failure
        #    while *recording* a successful repair is never logged as a failed repair.
        try:
            result = await self._do_execute(repair_action, ...)
        except Exception as exc:
            # Record the failed attempt, then let the original error propagate.
            # NOTE(review): if this write itself raises, it replaces the repair error;
            # consider wrapping it in a logged best-effort guard.
            if not dry_run:
                await learning.record_repair_result(
                    anomaly_key=anomaly_key,
                    repair_action=repair_action,
                    success=False,
                    fix_description=str(exc),
                    execution_time_seconds=time.monotonic() - started,
                )
            raise

        # 2. Record the learning data
        if not dry_run:
            await learning.record_repair_result(
                anomaly_key=anomaly_key,
                repair_action=repair_action,
                success=result.success,
                root_cause=getattr(result, 'root_cause', None),
                fix_description=result.message,
                execution_time_seconds=time.monotonic() - started,
            )

        return result

    async def get_smart_recommendation(self, anomaly_key: str) -> dict:
        """
        Return a smart repair recommendation (AI analysis combined with learned history).
        """
        learning = get_learning_service()

        # 1. Get the learned recommendation
        learned = await learning.get_recommended_fix(anomaly_key)

        # 2. If the learned confidence is high, use it directly
        if learned['confidence'] >= 0.8:
            return {
                'source': 'learning',
                'recommendation': learned,
            }

        # 3. Otherwise combine it with AI analysis
        #    (calls OpenClaw for a suggestion)
        ai_recommendation = await self._get_ai_recommendation(anomaly_key)

        # 4. Merge the recommendations
        return {
            'source': 'hybrid',
            'learning': learned,
            'ai': ai_recommendation,
            'final_recommendation': self._merge_recommendations(learned, ai_recommendation),
        }
```

### Step G-3: 新增 API 端點 (30min)

```python
# apps/api/src/api/v1/learning.py
"""
學習系統 API
"""

from fastapi import APIRouter
from src.services.learning_service import get_learning_service

router = APIRouter(prefix="/learning", tags=["Learning"])


@router.get("/summary/{anomaly_key}")
async def get_learning_summary(anomaly_key: str):
    """取得異常學習摘要"""
    learning = get_learning_service()
    return await learning.get_learning_summary(anomaly_key)


@router.get("/recommendation/{anomaly_key}")
async def get_recommendation(anomaly_key: str):
    """取得修復推薦"""
    learning = get_learning_service()
    return await learning.get_recommended_fix(anomaly_key)
```

---

## 完整實作清單總覽

| Phase | 項目 | 工時 | 優先級 | 依賴 |
|-------|------|------|--------|------|
| A | AnomalyCounter | 4h | P0 | Redis |
| B | Database Exporters | 3h | P0 | Docker |
| C | Incident 頻率欄位 | 2h | P0 | Phase A |
| D | Sentry Comment | 1h | P1 | Sentry Token |
| E | SigNoz 告警 | 2h | P1 | SigNoz |
| F | Alert Chain E2E | 2h | P0 | Phase A |
| G | Learning Service | 3h | P1 | Phase A, C |

**總工時**: 17h (約 2-3 天)

---

## 執行順序建議

```
Day 1 (8h):
  ├─ Phase A: AnomalyCounter (4h) ✅
  ├─ Phase B: Database Exporters (3h) ✅
  └─ Phase F: Alert Chain E2E (部分, 1h) ✅

Day 2 (6h):
  ├─ Phase C: Incident 頻率 (2h) ✅
  ├─ Phase D: Sentry Comment (1h) ✅
  └─ Phase G: Learning Service (3h) ✅

Day 3 (3h):
  ├─ Phase E: SigNoz 告警 (2h) ✅
  └─ Phase F: Alert Chain E2E (完成, 1h) ✅
```