From 7b2f585244202d18d8381e9035ce6b3ebcd7b98b Mon Sep 17 00:00:00 2001
From: OG T <ogt@WOOOMacMiniM4.local>
Date: Sun, 29 Mar 2026 10:23:04 +0800
Subject: [PATCH] =?UTF-8?q?docs:=20=E5=AE=8C=E6=95=B4=E7=9B=A3=E6=8E=A7?=
 =?UTF-8?q?=E5=AF=A6=E6=96=BD=E6=AD=A5=E9=A9=9F=20(7=20Phase=20=E8=A9=B3?=
 =?UTF-8?q?=E7=B4=B0=E6=96=87=E6=AA=94)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase A: AnomalyCounter 服務 (4h)
- Redis Sorted Set 滑動窗口計數
- 頻率閾值告警 (REPEAT/ESCALATE/PERMANENT_FIX)
- Tier 決策邏輯整合

Phase B: Database Exporters (3h)
- pg_exporter: 連接池/慢查詢/鎖等待/膨脹監控
- redis_exporter: 記憶體/命中率/驅逐監控
- 15+ 告警規則

Phase C: Incident 頻率欄位 (2h)
- IncidentFrequencyStats 模型
- 告警聚合邏輯 (10 分鐘窗口)
- 前端頻率顯示

Phase D: Sentry Comment 回寫 (1h)
- 完成 TODO 實作
- Sentry API Token 配置

Phase E: SigNoz 告警規則 (2h)
- Error Rate / Latency 告警
- Trace 異常檢測
- SigNoz Webhook Handler

Phase F: Alert Chain E2E (2h)
- Smoke Test 腳本
- CD Pipeline 整合
- 鏈路監控告警

Phase G: Learning Service (3h)
- 修復效果學習
- 成功率計算
- Playbook 自動更新

總工時: 17h (2-3 天)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 .../IMPLEMENTATION_STEPS_ANOMALY_COUNTER.md   |  699 ++++++++++
 ...IMPLEMENTATION_STEPS_DATABASE_EXPORTERS.md |  522 ++++++++
 ...IMPLEMENTATION_STEPS_INCIDENT_FREQUENCY.md |  511 ++++++++
 .../IMPLEMENTATION_STEPS_REMAINING_PHASES.md  | 1168 +++++++++++++++++
 4 files changed, 2900 insertions(+)
 create mode 100644 docs/proposals/IMPLEMENTATION_STEPS_ANOMALY_COUNTER.md
 create mode 100644 docs/proposals/IMPLEMENTATION_STEPS_DATABASE_EXPORTERS.md
 create mode 100644 docs/proposals/IMPLEMENTATION_STEPS_INCIDENT_FREQUENCY.md
 create mode 100644 docs/proposals/IMPLEMENTATION_STEPS_REMAINING_PHASES.md

diff --git a/docs/proposals/IMPLEMENTATION_STEPS_ANOMALY_COUNTER.md b/docs/proposals/IMPLEMENTATION_STEPS_ANOMALY_COUNTER.md
new file mode 100644
index 00000000..6e965b56
--- /dev/null
+++ b/docs/proposals/IMPLEMENTATION_STEPS_ANOMALY_COUNTER.md
@@ -0,0 +1,699 @@
+# AnomalyCounter 服務實施步驟
+
+> **優先級**: P0
+> **預估工時**: 4h
+> **目標**: 建立異常頻率追蹤能力
+
+---
+
+## Step 1: 建立 anomaly_counter.py (1h)
+
+### 1.1 建立檔案
+
+```bash
+touch apps/api/src/services/anomaly_counter.py
+```
+
+### 1.2 實作 AnomalyCounter 類別
+
+```python
+# apps/api/src/services/anomaly_counter.py
+"""
+異常頻率統計服務
+================================
+2026-03-29 ogt: 監控戰略規劃 Section 9 實作
+
+使用 Redis Sorted Set 實作滑動窗口計數:
+- ZADD anomaly:timeline:{key} {timestamp} {timestamp}
+- ZCOUNT anomaly:timeline:{key} {start} +inf
+- ZREMRANGEBYSCORE anomaly:timeline:{key} -inf {cutoff}
+"""
+
+import hashlib
+import json
+from datetime import datetime, timedelta
+from typing import NamedTuple
+
+import redis.asyncio as redis
+import structlog
+
+logger = structlog.get_logger(__name__)
+
+
+class AnomalyFrequency(NamedTuple):
+    """異常頻率資料"""
+    anomaly_key: str
+    count_1h: int
+    count_24h: int
+    count_7d: int
+    count_30d: int
+    first_seen: datetime
+    last_seen: datetime
+    auto_repair_count: int
+    permanent_fix_applied: bool
+    escalation_level: str | None  # None, REPEAT, ESCALATE, PERMANENT_FIX
+
+
+class AnomalyCounter:
+    """
+    異常計數器 - 追蹤每種異常的發生頻率
+
+    閾值配置 (目前為類別常數 THRESHOLDS；以下環境變數覆寫尚未實作):
+    - ANOMALY_REPEAT_THRESHOLD: 3 (預設)
+    - ANOMALY_ESCALATE_THRESHOLD: 5 (預設)
+    - ANOMALY_PERMANENT_FIX_THRESHOLD: 10 (預設)
+    """
+
+    THRESHOLDS = {
+        'REPEAT': 3,          # 3 次 → 重複告警
+        'ESCALATE': 5,        # 5 次 → 人工介入
+        'PERMANENT_FIX': 10,  # 10 次 → 必須永久修復
+    }
+
+    # Redis Key 前綴
+    PREFIX_TIMELINE = "anomaly:timeline:"
+    PREFIX_REPAIR_COUNT = "anomaly:repair_count:"
+    PREFIX_PERMANENT_FIX = "anomaly:permanent_fix:"
+    PREFIX_METADATA = "anomaly:metadata:"
+
+    def __init__(self, redis_client: redis.Redis):
+        self.redis = redis_client
+
+    @staticmethod
+    def _hash_signature(signature: dict) -> str:
+        """
+        生成異常簽名的 hash key
+
+        簽名欄位:
+        - alert_name: 告警名稱 (e.g., PodCrashLoopBackOff)
+        - service: 服務名稱 (e.g., awoooi-api)
+        - namespace: K8s 命名空間 (e.g., awoooi-prod)
+        - error_type: 錯誤類型 (e.g., OOMKilled)
+        """
+        # 只取關鍵欄位，忽略時間戳等易變欄位
+        key_fields = {
+            'alert_name': signature.get('alert_name', signature.get('alertname', '')),
+            'service': signature.get('service', signature.get('job', '')),
+            'namespace': signature.get('namespace', ''),
+            'error_type': signature.get('error_type', signature.get('reason', '')),
+        }
+        # 排序確保一致性
+        canonical = json.dumps(key_fields, sort_keys=True)
+        return hashlib.sha256(canonical.encode()).hexdigest()[:16]
+
+    async def record_anomaly(self, anomaly_signature: dict) -> AnomalyFrequency:
+        """
+        記錄一次異常發生
+
+        Args:
+            anomaly_signature: 異常簽名字典
+
+        Returns:
+            AnomalyFrequency: 當前頻率統計
+        """
+        anomaly_key = self._hash_signature(anomaly_signature)
+        now = datetime.now()
+        timestamp = now.timestamp()
+        timeline_key = f"{self.PREFIX_TIMELINE}{anomaly_key}"
+
+        # 1. 添加到 Sorted Set (score = timestamp, member = timestamp string)
+        await self.redis.zadd(timeline_key, {str(timestamp): timestamp})
+
+        # 2. 清理過期數據 (30 天前)
+        cutoff_30d = (now - timedelta(days=30)).timestamp()
+        await self.redis.zremrangebyscore(timeline_key, '-inf', cutoff_30d)
+
+        # 3. 設置 TTL (35 天，比清理週期長一點)
+        await self.redis.expire(timeline_key, 35 * 24 * 3600)
+
+        # 4. 計算各時間窗口的計數
+        count_1h = await self.redis.zcount(
+            timeline_key,
+            (now - timedelta(hours=1)).timestamp(),
+            '+inf'
+        )
+        count_24h = await self.redis.zcount(
+            timeline_key,
+            (now - timedelta(hours=24)).timestamp(),
+            '+inf'
+        )
+        count_7d = await self.redis.zcount(
+            timeline_key,
+            (now - timedelta(days=7)).timestamp(),
+            '+inf'
+        )
+        count_30d = await self.redis.zcount(
+            timeline_key,
+            cutoff_30d,
+            '+inf'
+        )
+
+        # 5. 取得首次/最近時間
+        first_seen_data = await self.redis.zrange(timeline_key, 0, 0, withscores=True)
+        last_seen_data = await self.redis.zrange(timeline_key, -1, -1, withscores=True)
+
+        first_seen = datetime.fromtimestamp(first_seen_data[0][1]) if first_seen_data else now
+        last_seen = datetime.fromtimestamp(last_seen_data[0][1]) if last_seen_data else now
+
+        # 6. 讀取修復統計
+        auto_repair_count = int(await self.redis.get(f"{self.PREFIX_REPAIR_COUNT}{anomaly_key}") or 0)
+        permanent_fix = await self.redis.get(f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}") in (b'1', '1')
+
+        # 7. 儲存 metadata (首次記錄時)
+        metadata_key = f"{self.PREFIX_METADATA}{anomaly_key}"
+        if not await self.redis.exists(metadata_key):
+            await self.redis.hset(metadata_key, mapping={
+                'signature': json.dumps(anomaly_signature),
+                'first_seen': now.isoformat(),
+            })
+            await self.redis.expire(metadata_key, 35 * 24 * 3600)
+
+        # 8. 判斷升級等級
+        escalation_level = self._get_escalation_level(count_24h)
+
+        freq = AnomalyFrequency(
+            anomaly_key=anomaly_key,
+            count_1h=count_1h,
+            count_24h=count_24h,
+            count_7d=count_7d,
+            count_30d=count_30d,
+            first_seen=first_seen,
+            last_seen=last_seen,
+            auto_repair_count=auto_repair_count,
+            permanent_fix_applied=permanent_fix,
+            escalation_level=escalation_level,
+        )
+
+        # 9. 記錄日誌
+        logger.info(
+            "anomaly_recorded",
+            anomaly_key=anomaly_key,
+            count_1h=count_1h,
+            count_24h=count_24h,
+            count_30d=count_30d,
+            escalation_level=escalation_level,
+        )
+
+        return freq
+
+    def _get_escalation_level(self, count_24h: int) -> str | None:
+        """判斷升級等級"""
+        if count_24h >= self.THRESHOLDS['PERMANENT_FIX']:
+            return 'PERMANENT_FIX'
+        elif count_24h >= self.THRESHOLDS['ESCALATE']:
+            return 'ESCALATE'
+        elif count_24h >= self.THRESHOLDS['REPEAT']:
+            return 'REPEAT'
+        return None
+
+    async def record_repair_attempt(self, anomaly_key: str, action: str, success: bool):
+        """
+        記錄修復嘗試
+
+        Args:
+            anomaly_key: 異常 key
+            action: 修復動作 (e.g., restart_pod, scale_up)
+            success: 是否成功
+        """
+        repair_key = f"{self.PREFIX_REPAIR_COUNT}{anomaly_key}"
+
+        # 遞增修復嘗試次數
+        await self.redis.incr(repair_key)
+        await self.redis.expire(repair_key, 35 * 24 * 3600)
+
+        # 記錄修復歷史 (用於學習)
+        history_key = f"anomaly:repair_history:{anomaly_key}"
+        await self.redis.lpush(history_key, json.dumps({
+            'action': action,
+            'success': success,
+            'timestamp': datetime.now().isoformat(),
+        }))
+        await self.redis.ltrim(history_key, 0, 99)  # 只保留最近 100 次
+        await self.redis.expire(history_key, 35 * 24 * 3600)
+
+        logger.info(
+            "repair_attempt_recorded",
+            anomaly_key=anomaly_key,
+            action=action,
+            success=success,
+        )
+
+    async def mark_permanent_fix_applied(self, anomaly_key: str, fix_description: str):
+        """
+        標記已套用永久修復
+
+        Args:
+            anomaly_key: 異常 key
+            fix_description: 修復說明
+        """
+        await self.redis.set(f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}", '1')
+        await self.redis.expire(f"{self.PREFIX_PERMANENT_FIX}{anomaly_key}", 90 * 24 * 3600)  # 90 天
+
+        # 記錄修復詳情
+        metadata_key = f"{self.PREFIX_METADATA}{anomaly_key}"
+        await self.redis.hset(metadata_key, mapping={
+            'permanent_fix_applied': 'true',
+            'permanent_fix_description': fix_description,
+            'permanent_fix_time': datetime.now().isoformat(),
+        })
+
+        logger.info(
+            "permanent_fix_marked",
+            anomaly_key=anomaly_key,
+            fix_description=fix_description,
+        )
+
+    async def get_repair_success_rate(self, anomaly_key: str, action: str) -> dict:
+        """
+        取得特定動作的修復成功率
+
+        Returns:
+            {
+                'action': 'restart_pod',
+                'total': 10,
+                'success': 3,
+                'success_rate': 0.3,
+            }
+        """
+        history_key = f"anomaly:repair_history:{anomaly_key}"
+        history = await self.redis.lrange(history_key, 0, -1)
+
+        total = 0
+        success = 0
+
+        for item in history:
+            data = json.loads(item)
+            if data['action'] == action:
+                total += 1
+                if data['success']:
+                    success += 1
+
+        return {
+            'action': action,
+            'total': total,
+            'success': success,
+            'success_rate': success / total if total > 0 else 0.0,
+        }
+
+    async def get_all_repair_stats(self, anomaly_key: str) -> dict[str, dict]:
+        """
+        取得所有修復動作的統計
+
+        Returns:
+            {
+                'restart_pod': {'total': 10, 'success': 3, 'success_rate': 0.3},
+                'scale_up': {'total': 2, 'success': 1, 'success_rate': 0.5},
+            }
+        """
+        history_key = f"anomaly:repair_history:{anomaly_key}"
+        history = await self.redis.lrange(history_key, 0, -1)
+
+        stats: dict[str, dict] = {}
+
+        for item in history:
+            data = json.loads(item)
+            action = data['action']
+
+            if action not in stats:
+                stats[action] = {'total': 0, 'success': 0}
+
+            stats[action]['total'] += 1
+            if data['success']:
+                stats[action]['success'] += 1
+
+        # 計算成功率
+        for action, s in stats.items():
+            s['success_rate'] = s['success'] / s['total'] if s['total'] > 0 else 0.0
+
+        return stats
+
+
+# =============================================================================
+# Singleton 模式
+# =============================================================================
+_anomaly_counter: AnomalyCounter | None = None
+
+
+def get_anomaly_counter() -> AnomalyCounter:
+    """取得 AnomalyCounter 實例"""
+    global _anomaly_counter
+    if _anomaly_counter is None:
+        from src.core.redis import get_redis_client
+        _anomaly_counter = AnomalyCounter(get_redis_client())
+    return _anomaly_counter
+```
+
+---
+
+## Step 2: 整合到 alertmanager_webhook.py (1h)
+
+### 2.1 在收到告警時記錄頻率
+
+```python
+# apps/api/src/api/v1/alertmanager_webhook.py
+# 在 handle_alertmanager 函數中新增
+
+from src.services.anomaly_counter import get_anomaly_counter
+
+async def handle_alertmanager(request: Request, background_tasks: BackgroundTasks):
+    # ... 現有代碼 ...
+
+    # 🆕 記錄異常頻率
+    anomaly_counter = get_anomaly_counter()
+    for alert in alerts:
+        anomaly_signature = {
+            'alert_name': alert.get('labels', {}).get('alertname'),
+            'service': alert.get('labels', {}).get('job'),
+            'namespace': alert.get('labels', {}).get('namespace'),
+            'error_type': alert.get('labels', {}).get('reason'),
+        }
+        freq = await anomaly_counter.record_anomaly(anomaly_signature)
+
+        # 將頻率資訊傳遞給後續處理
+        alert['_anomaly_frequency'] = freq._asdict()
+
+    # ... 繼續現有流程 ...
+```
+
+### 2.2 在 Telegram 告警中顯示頻率
+
+```python
+# apps/api/src/services/telegram_gateway.py
+# 修改 send_approval_card 方法，新增頻率資訊
+
+async def send_approval_card(
+    self,
+    approval_id: str,
+    risk_level: str,
+    resource_name: str,
+    root_cause: str,
+    suggested_action: str,
+    primary_responsibility: str,
+    confidence: float,
+    namespace: str,
+    anomaly_frequency: dict | None = None,  # 🆕 新增參數
+):
+    # ... 現有代碼 ...
+
+    # 🆕 頻率資訊區塊
+    frequency_section = ""
+    if anomaly_frequency and anomaly_frequency.get('count_24h', 0) > 1:
+        freq = anomaly_frequency
+        escalation_emoji = {
+            None: "",
+            'REPEAT': "⚠️",
+            'ESCALATE': "🔴",
+            'PERMANENT_FIX': "🚨",
+        }.get(freq.get('escalation_level'), "")
+
+        frequency_section = f"""
+📊 頻率統計 {escalation_emoji}:
+  • 1小時: {freq.get('count_1h', 0)} 次
+  • 24小時: {freq.get('count_24h', 0)} 次
+  • 7天: {freq.get('count_7d', 0)} 次
+  • 30天: {freq.get('count_30d', 0)} 次
+  • 修復嘗試: {freq.get('auto_repair_count', 0)} 次
+"""
+        if freq.get('escalation_level'):
+            frequency_section += f"  🔺 升級建議: {freq['escalation_level']}\n"
+
+    # 插入到告警卡片中
+    # ...
+```
+
+---
+
+## Step 3: 整合到 sentry_webhook.py (30min)
+
+### 3.1 Sentry 告警也要記錄頻率
+
+```python
+# apps/api/src/api/v1/sentry_webhook.py
+# 在 analyze_and_comment 函數中新增
+
+from src.services.anomaly_counter import get_anomaly_counter
+
+async def analyze_and_comment(
+    error_context: dict,
+    issue_id: str,
+    project_slug: str
+):
+    # 🆕 記錄異常頻率
+    anomaly_counter = get_anomaly_counter()
+    anomaly_signature = {
+        'alert_name': 'sentry_error',
+        'service': error_context.get('project', 'unknown'),
+        'error_type': error_context.get('title', 'unknown'),
+        'culprit': error_context.get('culprit', 'unknown'),
+    }
+    freq = await anomaly_counter.record_anomaly(anomaly_signature)
+
+    # 傳遞給 Telegram 告警
+    await send_sentry_telegram_alert(
+        error_context=error_context,
+        analysis=analysis,
+        approval_id=approval_id,
+        anomaly_frequency=freq._asdict(),  # 🆕
+    )
+```
+
+---
+
+## Step 4: 整合到 auto_repair_service.py (1h)
+
+### 4.1 修復前檢查頻率，決定 Tier
+
+```python
+# apps/api/src/services/auto_repair_service.py
+# 新增 Tier 決策邏輯
+
+from src.services.anomaly_counter import get_anomaly_counter, AnomalyFrequency
+
+class AutoRepairService:
+    async def determine_repair_tier(
+        self,
+        anomaly_key: str,
+        frequency: AnomalyFrequency,
+    ) -> int:
+        """
+        根據頻率決定修復 Tier
+
+        Returns:
+            1: 臨時修復 (重啟)
+            2: 緩解修復 (擴容)
+            3: 根因修復 (配置變更)
+            4: 架構修復 (需開發)
+        """
+        # 取得修復歷史
+        counter = get_anomaly_counter()
+        stats = await counter.get_all_repair_stats(anomaly_key)
+
+        # 計算重啟次數
+        restart_count = stats.get('restart_pod', {}).get('total', 0)
+        restart_count += stats.get('restart_container', {}).get('total', 0)
+
+        # Tier 決策邏輯
+        if frequency.permanent_fix_applied:
+            return 4  # 已有永久修復但仍出問題 → 需架構級修復
+
+        if frequency.escalation_level == 'PERMANENT_FIX':
+            return 3  # 24h 內 ≥10 次 → 根因修復
+
+        if frequency.escalation_level == 'ESCALATE':
+            return 2  # 24h 內 ≥5 次 → 緩解修復
+
+        if restart_count >= 2:
+            return 2  # 已重啟 2 次 → 升級到緩解
+
+        return 1  # 預設臨時修復
+
+    async def get_tier_actions(self, tier: int) -> list[str]:
+        """
+        根據 Tier 返回可用修復動作
+        """
+        TIER_ACTIONS = {
+            1: ['restart_pod', 'restart_container'],
+            2: ['scale_up', 'increase_memory', 'adjust_limits'],
+            3: ['apply_hotfix', 'update_config', 'patch_deployment'],
+            4: ['create_issue', 'notify_team', 'schedule_fix'],
+        }
+        return TIER_ACTIONS.get(tier, TIER_ACTIONS[1])
+```
+
+### 4.2 修復後記錄結果
+
+```python
+# apps/api/src/services/auto_repair_service.py
+# 在執行修復後
+
+async def execute_repair(self, ...):
+    # ... 執行修復 ...
+
+    # 🆕 記錄修復嘗試
+    counter = get_anomaly_counter()
+    await counter.record_repair_attempt(
+        anomaly_key=anomaly_key,
+        action=repair_action,
+        success=result.success,
+    )
+
+    # 如果是 Tier 3 永久修復成功
+    if tier == 3 and result.success:
+        await counter.mark_permanent_fix_applied(
+            anomaly_key=anomaly_key,
+            fix_description=f"Applied {repair_action}: {result.message}",
+        )
+```
+
+---
+
+## Step 5: 單元測試 (30min)
+
+### 5.1 建立測試檔案
+
+```python
+# apps/api/tests/test_anomaly_counter.py
+"""
+AnomalyCounter 單元測試
+"""
+
+import pytest
+from datetime import datetime, timedelta
+from unittest.mock import AsyncMock, MagicMock
+from src.services.anomaly_counter import AnomalyCounter, AnomalyFrequency
+
+
+@pytest.fixture
+def mock_redis():
+    """模擬 Redis 客戶端"""
+    redis = AsyncMock()
+    redis.zadd = AsyncMock()
+    redis.zremrangebyscore = AsyncMock()
+    redis.expire = AsyncMock()
+    redis.zcount = AsyncMock(return_value=5)
+    redis.zrange = AsyncMock(return_value=[(b'123', 1234567890.0)])
+    redis.get = AsyncMock(return_value=None)
+    redis.exists = AsyncMock(return_value=False)
+    redis.hset = AsyncMock()
+    return redis
+
+
+@pytest.fixture
+def counter(mock_redis):
+    return AnomalyCounter(mock_redis)
+
+
+class TestHashSignature:
+    def test_same_input_same_hash(self):
+        sig1 = {'alert_name': 'PodCrash', 'service': 'api'}
+        sig2 = {'alert_name': 'PodCrash', 'service': 'api'}
+        assert AnomalyCounter._hash_signature(sig1) == AnomalyCounter._hash_signature(sig2)
+
+    def test_different_input_different_hash(self):
+        sig1 = {'alert_name': 'PodCrash', 'service': 'api'}
+        sig2 = {'alert_name': 'PodCrash', 'service': 'web'}
+        assert AnomalyCounter._hash_signature(sig1) != AnomalyCounter._hash_signature(sig2)
+
+    def test_ignores_extra_fields(self):
+        sig1 = {'alert_name': 'PodCrash', 'service': 'api'}
+        sig2 = {'alert_name': 'PodCrash', 'service': 'api', 'timestamp': '2026-01-01'}
+        assert AnomalyCounter._hash_signature(sig1) == AnomalyCounter._hash_signature(sig2)
+
+
+class TestEscalationLevel:
+    def test_no_escalation(self, counter):
+        assert counter._get_escalation_level(2) is None
+
+    def test_repeat_level(self, counter):
+        assert counter._get_escalation_level(3) == 'REPEAT'
+        assert counter._get_escalation_level(4) == 'REPEAT'
+
+    def test_escalate_level(self, counter):
+        assert counter._get_escalation_level(5) == 'ESCALATE'
+        assert counter._get_escalation_level(9) == 'ESCALATE'
+
+    def test_permanent_fix_level(self, counter):
+        assert counter._get_escalation_level(10) == 'PERMANENT_FIX'
+        assert counter._get_escalation_level(100) == 'PERMANENT_FIX'
+
+
+class TestRecordAnomaly:
+    @pytest.mark.asyncio
+    async def test_records_to_redis(self, counter, mock_redis):
+        sig = {'alert_name': 'PodCrash', 'service': 'api'}
+        freq = await counter.record_anomaly(sig)
+
+        # 驗證 Redis 操作
+        mock_redis.zadd.assert_called_once()
+        mock_redis.zremrangebyscore.assert_called_once()
+        mock_redis.expire.assert_called()
+
+        # 驗證返回值
+        assert isinstance(freq, AnomalyFrequency)
+        assert freq.count_1h == 5  # mock 返回值
+```
+
+---
+
+## Step 6: 部署驗證 (30min)
+
+### 6.1 本地測試
+
+```bash
+cd apps/api
+pytest tests/test_anomaly_counter.py -v
+```
+
+### 6.2 整合測試
+
+```bash
+# 啟動本地 Redis
+docker run -d --name test-redis -p 6380:6379 redis:7
+
+# 手動測試
+python -c "
+import asyncio
+from src.services.anomaly_counter import AnomalyCounter
+import redis.asyncio as redis
+
+async def test():
+    r = redis.Redis(host='localhost', port=6380)
+    counter = AnomalyCounter(r)
+
+    # 記錄 5 次異常
+    for i in range(5):
+        freq = await counter.record_anomaly({'alert_name': 'TestAlert', 'service': 'test'})
+        print(f'Count: {freq.count_24h}, Level: {freq.escalation_level}')
+
+asyncio.run(test())
+"
+```
+
+### 6.3 預期輸出
+
+```
+Count: 1, Level: None
+Count: 2, Level: None
+Count: 3, Level: REPEAT
+Count: 4, Level: REPEAT
+Count: 5, Level: ESCALATE
+```
+
+---
+
+## 交付物清單
+
+| 檔案 | 狀態 | 說明 |
+|------|------|------|
+| `apps/api/src/services/anomaly_counter.py` | 🆕 新建 | 核心服務 |
+| `apps/api/src/api/v1/alertmanager_webhook.py` | 📝 修改 | 整合頻率追蹤 |
+| `apps/api/src/api/v1/sentry_webhook.py` | 📝 修改 | 整合頻率追蹤 |
+| `apps/api/src/services/telegram_gateway.py` | 📝 修改 | 顯示頻率資訊 |
+| `apps/api/src/services/auto_repair_service.py` | 📝 修改 | Tier 決策 |
+| `apps/api/tests/test_anomaly_counter.py` | 🆕 新建 | 單元測試 |
+
+---
+
+**預估總工時**: 4h
+**前置依賴**: Redis (已有)
+**後續工作**: Phase B 資料庫 Exporter
diff --git a/docs/proposals/IMPLEMENTATION_STEPS_DATABASE_EXPORTERS.md b/docs/proposals/IMPLEMENTATION_STEPS_DATABASE_EXPORTERS.md
new file mode 100644
index 00000000..daebc509
--- /dev/null
+++ b/docs/proposals/IMPLEMENTATION_STEPS_DATABASE_EXPORTERS.md
@@ -0,0 +1,522 @@
+# 資料庫 Exporter 部署實施步驟
+
+> **優先級**: P0
+> **預估工時**: 3h
+> **目標**: PostgreSQL 與 Redis 完整監控覆蓋
+
+---
+
+## 現狀分析
+
+| 服務 | 當前監控 | 缺失指標 |
+|------|---------|---------|
+| PostgreSQL | ❌ 零 | 連接數、慢查詢、鎖等待、複製延遲 |
+| Redis | ❌ 零 | 記憶體使用、命中率、命令延遲、驅逐率 |
+
+---
+
+## Phase B-1: PostgreSQL Exporter (1.5h)
+
+### Step 1: 建立 Docker Compose 配置 (15min)
+
+```yaml
+# ops/monitoring/docker-compose.exporters.yaml
+# 2026-03-29 ogt: 資料庫監控 Exporter
+# 部署位置: 192.168.0.188 (pg 主機)
+
+version: '3.8'
+
+services:
+  # ==========================================================================
+  # PostgreSQL Exporter
+  # ==========================================================================
+  postgres-exporter:
+    image: prometheuscommunity/postgres-exporter:v0.15.0
+    container_name: postgres-exporter
+    restart: unless-stopped
+    ports:
+      - "9187:9187"
+    environment:
+      DATA_SOURCE_NAME: "postgresql://postgres:${POSTGRES_PASSWORD}@postgres:5432/awoooi?sslmode=disable"
+      PG_EXPORTER_EXTEND_QUERY_PATH: "/etc/postgres_exporter/queries.yaml"
+    volumes:
+      - ./postgres-exporter-queries.yaml:/etc/postgres_exporter/queries.yaml:ro
+    networks:
+      - monitoring
+    depends_on:
+      - postgres
+    labels:
+      - "prometheus.scrape=true"
+      - "prometheus.port=9187"
+
+  # ==========================================================================
+  # Redis Exporter
+  # ==========================================================================
+  redis-exporter:
+    image: oliver006/redis_exporter:v1.58.0
+    container_name: redis-exporter
+    restart: unless-stopped
+    ports:
+      - "9121:9121"
+    environment:
+      REDIS_ADDR: "redis://redis:6379"
+      REDIS_PASSWORD: "${REDIS_PASSWORD}"
+    networks:
+      - monitoring
+    depends_on:
+      - redis
+    labels:
+      - "prometheus.scrape=true"
+      - "prometheus.port=9121"
+
+networks:
+  monitoring:
+    external: true
+```
+
+### Step 2: 自訂 PostgreSQL 查詢 (15min)
+
+```yaml
+# ops/monitoring/postgres-exporter-queries.yaml
+# 自訂查詢 - 擴展預設指標
+
+# ==========================================================================
+# 連接池監控
+# ==========================================================================
+pg_stat_activity_count:
+  query: |
+    SELECT
+      datname,
+      state,
+      count(*) as count
+    FROM pg_stat_activity
+    WHERE datname IS NOT NULL
+    GROUP BY datname, state
+  metrics:
+    - datname:
+        usage: "LABEL"
+        description: "Database name"
+    - state:
+        usage: "LABEL"
+        description: "Connection state"
+    - count:
+        usage: "GAUGE"
+        description: "Number of connections"
+
+# ==========================================================================
+# 慢查詢監控 (> 1 秒)
+# ==========================================================================
+pg_slow_queries:
+  query: |
+    SELECT
+      datname,
+      usename,
+      count(*) as slow_query_count
+    FROM pg_stat_activity
+    WHERE state = 'active'
+      AND query_start < now() - interval '1 second'
+      AND query NOT LIKE 'SELECT pg_%'
+    GROUP BY datname, usename
+  metrics:
+    - datname:
+        usage: "LABEL"
+        description: "Database name"
+    - usename:
+        usage: "LABEL"
+        description: "User name"
+    - slow_query_count:
+        usage: "GAUGE"
+        description: "Number of slow queries (> 1s)"
+
+# ==========================================================================
+# 鎖等待監控
+# ==========================================================================
+pg_locks_waiting:
+  query: |
+    SELECT
+      COALESCE(d.datname, '') AS datname,
+      l.mode,
+      count(*) as waiting_count
+    FROM pg_locks l LEFT JOIN pg_database d ON d.oid = l.database
+    WHERE NOT l.granted
+    GROUP BY d.datname, l.mode
+  metrics:
+    - datname:
+        usage: "LABEL"
+        description: "Database name"
+    - mode:
+        usage: "LABEL"
+        description: "Lock mode"
+    - waiting_count:
+        usage: "GAUGE"
+        description: "Number of locks waiting"
+
+# ==========================================================================
+# 表膨脹估算 (Dead Tuples)
+# ==========================================================================
+pg_stat_user_tables_bloat:
+  query: |
+    SELECT
+      schemaname,
+      relname,
+      n_dead_tup,
+      n_live_tup,
+      CASE WHEN n_live_tup > 0
+        THEN round(100.0 * n_dead_tup / n_live_tup, 2)
+        ELSE 0
+      END as dead_tuple_ratio
+    FROM pg_stat_user_tables
+    WHERE n_live_tup > 1000
+    ORDER BY n_dead_tup DESC
+    LIMIT 20
+  metrics:
+    - schemaname:
+        usage: "LABEL"
+        description: "Schema name"
+    - relname:
+        usage: "LABEL"
+        description: "Table name"
+    - n_dead_tup:
+        usage: "GAUGE"
+        description: "Dead tuples"
+    - n_live_tup:
+        usage: "GAUGE"
+        description: "Live tuples"
+    - dead_tuple_ratio:
+        usage: "GAUGE"
+        description: "Dead tuple percentage"
+
+# ==========================================================================
+# 資料庫大小
+# ==========================================================================
+pg_database_size_bytes:
+  query: |
+    SELECT
+      datname,
+      pg_database_size(datname) as size_bytes
+    FROM pg_database
+    WHERE datname NOT IN ('template0', 'template1')
+  metrics:
+    - datname:
+        usage: "LABEL"
+        description: "Database name"
+    - size_bytes:
+        usage: "GAUGE"
+        description: "Database size in bytes"
+```
+
+### Step 3: 建立 Prometheus Scrape 配置 (15min)
+
+```yaml
+# k8s/monitoring/prometheus-scrape-exporters.yaml
+# 新增到 Prometheus ConfigMap
+
+# PostgreSQL Exporter
+- job_name: 'postgres-exporter'
+  static_configs:
+    - targets: ['192.168.0.188:9187']
+  relabel_configs:
+    - source_labels: [__address__]
+      target_label: instance
+      replacement: 'postgres-primary'
+
+# Redis Exporter
+- job_name: 'redis-exporter'
+  static_configs:
+    - targets: ['192.168.0.188:9121']
+  relabel_configs:
+    - source_labels: [__address__]
+      target_label: instance
+      replacement: 'redis-primary'
+```
+
+### Step 4: 建立告警規則 (30min)
+
+```yaml
+# k8s/monitoring/database-alerts.yaml
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: database-alerts
+  namespace: monitoring
+  labels:
+    app: prometheus
+spec:
+  groups:
+    # =========================================================================
+    # PostgreSQL 告警
+    # =========================================================================
+    - name: postgresql
+      rules:
+        # 連接池即將耗盡
+        - alert: PostgreSQLConnectionPoolNearLimit
+          expr: |
+            sum(pg_stat_activity_count{state="active"}) by (datname)
+            /
+            scalar(max(pg_settings_max_connections))
+            > 0.8
+          for: 5m
+          labels:
+            severity: warning
+            service: postgres
+            owner: infra-team
+          annotations:
+            summary: "PostgreSQL 連接池使用率 > 80%"
+            description: "Database {{ $labels.datname }} 連接使用率: {{ $value | humanizePercentage }}"
+            auto_repair: "analyze_connection_leak"
+
+        # 連接池耗盡
+        - alert: PostgreSQLConnectionPoolExhausted
+          expr: |
+            sum(pg_stat_activity_count{state="active"}) by (datname)
+            /
+            scalar(max(pg_settings_max_connections))
+            > 0.95
+          for: 2m
+          labels:
+            severity: critical
+            service: postgres
+            owner: infra-team
+          annotations:
+            summary: "PostgreSQL 連接池即將耗盡"
+            description: "Database {{ $labels.datname }} 連接使用率 > 95%"
+            auto_repair: "restart_api_pods"
+
+        # 慢查詢告警
+        - alert: PostgreSQLSlowQueries
+          expr: pg_slow_queries_slow_query_count > 5
+          for: 5m
+          labels:
+            severity: warning
+            service: postgres
+            owner: backend-team
+          annotations:
+            summary: "PostgreSQL 慢查詢數量過多"
+            description: "User {{ $labels.usename }} 有 {{ $value }} 個慢查詢"
+            auto_repair: "analyze_slow_queries"
+
+        # 鎖等待告警
+        - alert: PostgreSQLLockWaiting
+          expr: sum(pg_locks_waiting_waiting_count) > 10
+          for: 2m
+          labels:
+            severity: warning
+            service: postgres
+            owner: backend-team
+          annotations:
+            summary: "PostgreSQL 鎖等待過多"
+            description: "{{ $value }} 個查詢正在等待鎖"
+
+        # 表膨脹告警
+        - alert: PostgreSQLTableBloat
+          expr: pg_stat_user_tables_bloat_dead_tuple_ratio > 20
+          for: 30m
+          labels:
+            severity: warning
+            service: postgres
+            owner: infra-team
+          annotations:
+            summary: "PostgreSQL 表膨脹"
+            description: "Table {{ $labels.relname }} dead tuple 比例: {{ $value }}%"
+            auto_repair: "schedule_vacuum"
+
+        # PostgreSQL Down
+        - alert: PostgreSQLDown
+          expr: pg_up == 0
+          for: 1m
+          labels:
+            severity: critical
+            service: postgres
+            owner: infra-team
+          annotations:
+            summary: "PostgreSQL 無法連線"
+            auto_repair: "restart_postgres_container"
+
+    # =========================================================================
+    # Redis 告警
+    # =========================================================================
+    - name: redis
+      rules:
+        # 記憶體使用過高
+        - alert: RedisMemoryHigh
+          expr: |
+            redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.85
+          for: 5m
+          labels:
+            severity: warning
+            service: redis
+            owner: infra-team
+          annotations:
+            summary: "Redis 記憶體使用 > 85%"
+            description: "Redis 記憶體: {{ $value | humanizePercentage }}"
+            auto_repair: "analyze_redis_keys"
+
+        # 記憶體即將耗盡
+        - alert: RedisMemoryCritical
+          expr: |
+            redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.95
+          for: 2m
+          labels:
+            severity: critical
+            service: redis
+            owner: infra-team
+          annotations:
+            summary: "Redis 記憶體即將耗盡"
+            description: "Redis 記憶體使用 > 95%"
+            auto_repair: "flush_expired_keys"
+
+        # 快取命中率過低
+        - alert: RedisCacheHitRateLow
+          expr: |
+            rate(redis_keyspace_hits_total[5m])
+            /
+            (rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m]))
+            < 0.8
+          for: 15m
+          labels:
+            severity: warning
+            service: redis
+            owner: backend-team
+          annotations:
+            summary: "Redis 快取命中率 < 80%"
+            description: "命中率: {{ $value | humanizePercentage }}"
+
+        # 連接數過高
+        - alert: RedisConnectionsHigh
+          expr: redis_connected_clients > 500
+          for: 5m
+          labels:
+            severity: warning
+            service: redis
+            owner: infra-team
+          annotations:
+            summary: "Redis 連接數過高"
+            description: "連接數: {{ $value }}"
+
+        # Key 驅逐告警
+        - alert: RedisEvictedKeys
+          expr: rate(redis_evicted_keys_total[5m]) > 100
+          for: 5m
+          labels:
+            severity: warning
+            service: redis
+            owner: backend-team
+          annotations:
+            summary: "Redis Key 驅逐頻繁"
+            description: "每秒驅逐 {{ $value }} 個 key"
+            auto_repair: "increase_redis_memory"
+
+        # Redis Down
+        - alert: RedisDown
+          expr: redis_up == 0
+          for: 1m
+          labels:
+            severity: critical
+            service: redis
+            owner: infra-team
+          annotations:
+            summary: "Redis 無法連線"
+            auto_repair: "restart_redis_container"
+```
+
+### Step 5: 部署腳本 (15min)
+
+```bash
+#!/bin/bash
+# ops/monitoring/deploy-exporters.sh
+# 部署資料庫 Exporter 到 192.168.0.188
+
+set -euo pipefail
+
+HOST="192.168.0.188"
+DEPLOY_DIR="/opt/monitoring/exporters"
+
+echo "=== 部署資料庫 Exporter ==="
+
+# 1. 建立目錄
+ssh $HOST "mkdir -p $DEPLOY_DIR"
+
+# 2. 複製配置
+scp ops/monitoring/docker-compose.exporters.yaml $HOST:$DEPLOY_DIR/docker-compose.yaml
+scp ops/monitoring/postgres-exporter-queries.yaml $HOST:$DEPLOY_DIR/
+
+# 3. 啟動 Exporter 容器 (docker compose 會自動讀取 $DEPLOY_DIR/.env，需事先放置於遠端主機)
+ssh $HOST "cd $DEPLOY_DIR && docker compose up -d"
+
+# 4. 驗證
+echo "等待服務啟動..."
+sleep 10
+
+echo "驗證 PostgreSQL Exporter..."
+curl -s http://$HOST:9187/metrics | head -5
+
+echo "驗證 Redis Exporter..."
+curl -s http://$HOST:9121/metrics | head -5
+
+# 5. 更新 Prometheus 配置
+echo "更新 Prometheus scrape 配置..."
+kubectl apply -f k8s/monitoring/prometheus-scrape-exporters.yaml
+
+# 6. 部署告警規則
+echo "部署告警規則..."
+kubectl apply -f k8s/monitoring/database-alerts.yaml
+
+# 7. 重載 Prometheus
+kubectl rollout restart deployment/prometheus -n monitoring
+
+echo "=== 部署完成 ==="
+echo "PostgreSQL Exporter: http://$HOST:9187/metrics"
+echo "Redis Exporter: http://$HOST:9121/metrics"
+```
+
+---
+
+## Phase B-2: 驗證清單 (30min)
+
+### 驗證 Prometheus Targets
+
+```bash
+# 檢查 targets 是否 UP
+curl -s http://192.168.0.120:30090/api/v1/targets | jq '.data.activeTargets[] | select(.labels.job | contains("exporter")) | {job: .labels.job, health: .health}'
+```
+
+預期輸出:
+```json
+{"job": "postgres-exporter", "health": "up"}
+{"job": "redis-exporter", "health": "up"}
+```
+
+### 驗證關鍵指標
+
+```bash
+# PostgreSQL 連接數
+curl -s http://192.168.0.188:9187/metrics | grep pg_stat_activity_count
+
+# Redis 記憶體
+curl -s http://192.168.0.188:9121/metrics | grep redis_memory_used_bytes
+```
+
+### 觸發測試告警
+
+```bash
+# 模擬連接池壓力測試
+pgbench -c 80 -j 4 -T 60 -h 192.168.0.188 -U postgres awoooi
+```
+
+---
+
+## 交付物清單
+
+| 檔案 | 狀態 | 說明 |
+|------|------|------|
+| `ops/monitoring/docker-compose.exporters.yaml` | 🆕 | Exporter 容器配置 |
+| `ops/monitoring/postgres-exporter-queries.yaml` | 🆕 | 自訂 PG 查詢 |
+| `k8s/monitoring/prometheus-scrape-exporters.yaml` | 🆕 | Scrape 配置 |
+| `k8s/monitoring/database-alerts.yaml` | 🆕 | 告警規則 |
+| `ops/monitoring/deploy-exporters.sh` | 🆕 | 部署腳本 |
+
+---
+
+**預估總工時**: 3h
+**部署位置**: 192.168.0.188
+**依賴**: Docker Compose, 現有 PostgreSQL/Redis
diff --git a/docs/proposals/IMPLEMENTATION_STEPS_INCIDENT_FREQUENCY.md b/docs/proposals/IMPLEMENTATION_STEPS_INCIDENT_FREQUENCY.md
new file mode 100644
index 00000000..85ce2ceb
--- /dev/null
+++ b/docs/proposals/IMPLEMENTATION_STEPS_INCIDENT_FREQUENCY.md
@@ -0,0 +1,511 @@
+# Incident 模型頻率欄位實施步驟
+
+> **優先級**: P0
+> **預估工時**: 2h
+> **目標**: Incident 支援頻率統計與聚合
+
+---
+
+## 現狀分析
+
+| 模型 | hit_count | frequency | escalation |
+|------|-----------|-----------|------------|
+| Approval | ✅ 有 | ❌ 無 | ❌ 無 |
+| Incident | ❌ 無 | ❌ 無 | ❌ 無 |
+
+---
+
+## Step 1: 更新 Incident 模型 (30min)
+
+### 1.1 新增欄位
+
+```python
+# apps/api/src/models/incident.py
+# 在 Incident 類別中新增以下欄位
+
+from datetime import datetime
+from typing import Optional
+from pydantic import BaseModel, Field
+
+
+class IncidentFrequencyStats(BaseModel):
+    """事件頻率統計"""
+    anomaly_key: str = Field(..., description="異常簽名 hash")
+    count_1h: int = Field(default=0, description="1 小時內發生次數")
+    count_24h: int = Field(default=0, description="24 小時內發生次數")
+    count_7d: int = Field(default=0, description="7 天內發生次數")
+    count_30d: int = Field(default=0, description="30 天內發生次數")
+    first_seen: datetime = Field(default_factory=datetime.now)
+    last_seen: datetime = Field(default_factory=datetime.now)
+    escalation_level: Optional[str] = Field(
+        default=None,
+        description="升級等級: REPEAT, ESCALATE, PERMANENT_FIX"
+    )
+
+
+class IncidentRepairStats(BaseModel):
+    """修復嘗試統計"""
+    total_attempts: int = Field(default=0, description="總修復嘗試次數")
+    successful_attempts: int = Field(default=0, description="成功次數")
+    last_repair_action: Optional[str] = Field(default=None, description="最近修復動作")
+    last_repair_time: Optional[datetime] = Field(default=None)
+    repair_history: list[dict] = Field(
+        default_factory=list,
+        description="修復歷史: [{action, success, timestamp}]"
+    )
+    recommended_tier: int = Field(
+        default=1,
+        description="建議修復 Tier: 1=重啟, 2=緩解, 3=根因, 4=架構"
+    )
+
+
+# 在 Incident 模型中新增
+class Incident(BaseModel):
+    # ... 現有欄位 ...
+
+    # 🆕 頻率統計
+    frequency_stats: Optional[IncidentFrequencyStats] = Field(
+        default=None,
+        description="異常頻率統計"
+    )
+
+    # 🆕 修復統計
+    repair_stats: Optional[IncidentRepairStats] = Field(
+        default=None,
+        description="修復嘗試統計"
+    )
+
+    # 🆕 聚合控制
+    is_aggregated: bool = Field(
+        default=False,
+        description="是否為聚合告警 (同一問題多次觸發)"
+    )
+    aggregated_count: int = Field(
+        default=1,
+        description="聚合次數 (窗口期內的觸發次數)"
+    )
+    aggregation_window_start: Optional[datetime] = Field(
+        default=None,
+        description="聚合窗口開始時間"
+    )
+```
+
+### 1.2 資料庫遷移 (如使用 SQLAlchemy + Alembic)
+
+```python
+# apps/api/src/db/migrations/add_incident_frequency.py
+"""
+新增 Incident 頻率欄位
+2026-03-29 ogt: 監控戰略規劃
+"""
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects.postgresql import JSONB
+
+
+def upgrade():
+    # 新增 frequency_stats JSONB 欄位
+    op.add_column(
+        'incidents',
+        sa.Column('frequency_stats', JSONB, nullable=True)
+    )
+
+    # 新增 repair_stats JSONB 欄位
+    op.add_column(
+        'incidents',
+        sa.Column('repair_stats', JSONB, nullable=True)
+    )
+
+    # 新增聚合欄位
+    op.add_column(
+        'incidents',
+        sa.Column('is_aggregated', sa.Boolean, server_default=sa.false(), nullable=False)
+    )
+    op.add_column(
+        'incidents',
+        sa.Column('aggregated_count', sa.Integer, server_default='1', nullable=False)
+    )
+    op.add_column(
+        'incidents',
+        sa.Column('aggregation_window_start', sa.DateTime, nullable=True)
+    )
+
+    # 建立索引 (用於查詢重複事件)
+    op.create_index(
+        'ix_incidents_frequency_anomaly_key',
+        'incidents',
+        [sa.text("(frequency_stats->>'anomaly_key')")]
+    )
+
+
+def downgrade():
+    op.drop_index('ix_incidents_frequency_anomaly_key')
+    op.drop_column('incidents', 'aggregation_window_start')
+    op.drop_column('incidents', 'aggregated_count')
+    op.drop_column('incidents', 'is_aggregated')
+    op.drop_column('incidents', 'repair_stats')
+    op.drop_column('incidents', 'frequency_stats')
+```
+
+---
+
+## Step 2: 更新 IncidentService (45min)
+
+### 2.1 新增聚合邏輯
+
+```python
+# apps/api/src/services/incident_service.py
+# 新增或修改以下方法
+
+from src.services.anomaly_counter import get_anomaly_counter, AnomalyFrequency
+from src.models.incident import IncidentFrequencyStats, IncidentRepairStats
+
+
+class IncidentService:
+    # 聚合窗口 (10 分鐘內同一問題不建新 Incident)
+    AGGREGATION_WINDOW_MINUTES = 10
+
+    async def create_or_aggregate_incident(
+        self,
+        alert_data: dict,
+        analysis_result: dict | None = None,
+    ) -> tuple[Incident, bool]:
+        """
+        建立或聚合 Incident
+
+        Returns:
+            tuple[Incident, bool]: (Incident, is_new)
+            - is_new=True: 新建的 Incident
+            - is_new=False: 聚合到現有 Incident
+        """
+        # 1. 記錄到 AnomalyCounter
+        anomaly_counter = get_anomaly_counter()
+        anomaly_signature = self._extract_signature(alert_data)
+        frequency = await anomaly_counter.record_anomaly(anomaly_signature)
+
+        # 2. 檢查是否有可聚合的現有 Incident
+        existing = await self._find_aggregatable_incident(
+            anomaly_key=frequency.anomaly_key,
+            window_minutes=self.AGGREGATION_WINDOW_MINUTES,
+        )
+
+        if existing:
+            # 聚合到現有 Incident
+            return await self._aggregate_to_existing(existing, frequency), False
+        else:
+            # 建立新 Incident
+            return await self._create_new_incident(
+                alert_data=alert_data,
+                frequency=frequency,
+                analysis_result=analysis_result,
+            ), True
+
+    async def _find_aggregatable_incident(
+        self,
+        anomaly_key: str,
+        window_minutes: int,
+    ) -> Incident | None:
+        """
+        查找可聚合的現有 Incident
+
+        條件:
+        1. 相同 anomaly_key
+        2. 在聚合窗口內
+        3. 狀態為 OPEN 或 ANALYZING
+        """
+        cutoff = datetime.now() - timedelta(minutes=window_minutes)
+
+        # Redis 快速查詢
+        cache_key = f"incident:aggregation:{anomaly_key}"
+        cached_id = await self.redis.get(cache_key)
+
+        if cached_id:
+            incident = await self.get_by_id(cached_id.decode())
+            if incident and incident.status in ['OPEN', 'ANALYZING']:
+                return incident
+
+        # 資料庫查詢 (fallback)
+        # ... 實作資料庫查詢 ...
+
+        return None
+
+    async def _aggregate_to_existing(
+        self,
+        incident: Incident,
+        frequency: AnomalyFrequency,
+    ) -> Incident:
+        """
+        聚合到現有 Incident
+        """
+        # 更新聚合計數
+        incident.aggregated_count += 1
+        incident.is_aggregated = True
+
+        # 更新頻率統計
+        incident.frequency_stats = IncidentFrequencyStats(
+            anomaly_key=frequency.anomaly_key,
+            count_1h=frequency.count_1h,
+            count_24h=frequency.count_24h,
+            count_7d=frequency.count_7d,
+            count_30d=frequency.count_30d,
+            first_seen=frequency.first_seen,
+            last_seen=frequency.last_seen,
+            escalation_level=frequency.escalation_level,
+        )
+
+        # 更新修復建議 Tier
+        if incident.repair_stats:
+            incident.repair_stats.recommended_tier = await self._calculate_tier(frequency)
+
+        # 儲存
+        await self.update(incident)
+
+        logger.info(
+            "incident_aggregated",
+            incident_id=str(incident.id),
+            aggregated_count=incident.aggregated_count,
+            escalation_level=frequency.escalation_level,
+        )
+
+        return incident
+
+    async def _create_new_incident(
+        self,
+        alert_data: dict,
+        frequency: AnomalyFrequency,
+        analysis_result: dict | None,
+    ) -> Incident:
+        """
+        建立新 Incident (含頻率統計)
+        """
+        # 計算建議 Tier
+        recommended_tier = await self._calculate_tier(frequency)
+
+        incident = Incident(
+            # ... 現有欄位 ...
+            frequency_stats=IncidentFrequencyStats(
+                anomaly_key=frequency.anomaly_key,
+                count_1h=frequency.count_1h,
+                count_24h=frequency.count_24h,
+                count_7d=frequency.count_7d,
+                count_30d=frequency.count_30d,
+                first_seen=frequency.first_seen,
+                last_seen=frequency.last_seen,
+                escalation_level=frequency.escalation_level,
+            ),
+            repair_stats=IncidentRepairStats(
+                recommended_tier=recommended_tier,
+            ),
+            is_aggregated=False,
+            aggregated_count=1,
+            aggregation_window_start=datetime.now(),
+        )
+
+        # 儲存
+        await self.create(incident)
+
+        # 設置聚合快取 (10 分鐘)
+        cache_key = f"incident:aggregation:{frequency.anomaly_key}"
+        await self.redis.setex(cache_key, 600, str(incident.id))
+
+        return incident
+
+    async def _calculate_tier(self, frequency: AnomalyFrequency) -> int:
+        """
+        根據頻率計算建議修復 Tier
+        """
+        # 取得修復歷史
+        counter = get_anomaly_counter()
+        stats = await counter.get_all_repair_stats(frequency.anomaly_key)
+
+        restart_count = stats.get('restart_pod', {}).get('total', 0)
+        restart_count += stats.get('restart_container', {}).get('total', 0)
+
+        if frequency.permanent_fix_applied:
+            return 4  # 已有永久修復但仍出問題
+        if frequency.escalation_level == 'PERMANENT_FIX':
+            return 3  # 24h ≥10 次
+        if frequency.escalation_level == 'ESCALATE':
+            return 2  # 24h ≥5 次
+        if restart_count >= 2:
+            return 2  # 已重啟 2 次
+        return 1
+
+    def _extract_signature(self, alert_data: dict) -> dict:
+        """
+        從告警資料提取異常簽名
+        """
+        labels = alert_data.get('labels', {})
+        return {
+            'alert_name': labels.get('alertname', ''),
+            'service': labels.get('job', labels.get('service', '')),
+            'namespace': labels.get('namespace', ''),
+            'error_type': labels.get('reason', labels.get('error_type', '')),
+        }
+
+    async def record_repair_result(
+        self,
+        incident_id: str,
+        action: str,
+        success: bool,
+    ):
+        """
+        記錄修復結果到 Incident
+        """
+        incident = await self.get_by_id(incident_id)
+        if not incident:
+            return
+
+        # 更新 repair_stats
+        if not incident.repair_stats:
+            incident.repair_stats = IncidentRepairStats()
+
+        incident.repair_stats.total_attempts += 1
+        if success:
+            incident.repair_stats.successful_attempts += 1
+
+        incident.repair_stats.last_repair_action = action
+        incident.repair_stats.last_repair_time = datetime.now()
+        incident.repair_stats.repair_history.append({
+            'action': action,
+            'success': success,
+            'timestamp': datetime.now().isoformat(),
+        })
+
+        # 只保留最近 20 次
+        incident.repair_stats.repair_history = incident.repair_stats.repair_history[-20:]
+
+        # 同步到 AnomalyCounter
+        if incident.frequency_stats:
+            counter = get_anomaly_counter()
+            await counter.record_repair_attempt(
+                anomaly_key=incident.frequency_stats.anomaly_key,
+                action=action,
+                success=success,
+            )
+
+        await self.update(incident)
+```
+
+---
+
+## Step 3: 更新 alertmanager_webhook.py (30min)
+
+### 3.1 使用新的聚合方法
+
+```python
+# apps/api/src/api/v1/alertmanager_webhook.py
+# 修改告警處理邏輯
+
+@router.post("/alertmanager")
+async def handle_alertmanager(
+    request: Request,
+    background_tasks: BackgroundTasks,
+):
+    payload = await request.json()
+    alerts = payload.get("alerts", [])
+
+    for alert in alerts:
+        if alert.get("status") == "firing":
+            # 🆕 使用聚合方法
+            incident_service = get_incident_service()
+            incident, is_new = await incident_service.create_or_aggregate_incident(
+                alert_data=alert,
+            )
+
+            if is_new:
+                # 新 Incident: 觸發 AI 分析 + Telegram
+                background_tasks.add_task(
+                    analyze_and_notify,
+                    incident=incident,
+                    alert_data=alert,
+                )
+            else:
+                # 聚合 Incident: 只更新統計，不重複通知
+                # (除非達到升級閾值)
+                if incident.frequency_stats.escalation_level in ['ESCALATE', 'PERMANENT_FIX']:
+                    background_tasks.add_task(
+                        send_escalation_notification,
+                        incident=incident,
+                    )
+
+    return {"status": "ok", "processed": len(alerts)}
+```
+
+---
+
+## Step 4: 前端顯示頻率資訊 (15min)
+
+### 4.1 Incident 卡片新增頻率區塊
+
+```typescript
+// apps/web/src/components/incidents/IncidentCard.tsx
+// 新增頻率統計顯示
+
+interface FrequencyStatsProps {
+  stats: {
+    count_1h: number;
+    count_24h: number;
+    count_7d: number;
+    count_30d: number;
+    escalation_level: string | null;
+  };
+}
+
+function FrequencyStats({ stats }: FrequencyStatsProps) {
+  const escalationColors = {
+    REPEAT: 'text-yellow-500',
+    ESCALATE: 'text-orange-500',
+    PERMANENT_FIX: 'text-red-500',
+  };
+
+  return (
+    <div className="mt-4 p-3 bg-gray-50 rounded-lg">
+      <h4 className="text-sm font-medium text-gray-700 mb-2">
+        📊 頻率統計
+        {stats.escalation_level && (
+          <span className={`ml-2 ${escalationColors[stats.escalation_level]}`}>
+            ⚠️ {stats.escalation_level}
+          </span>
+        )}
+      </h4>
+      <div className="grid grid-cols-4 gap-2 text-sm">
+        <div>
+          <span className="text-gray-500">1h:</span>
+          <span className="ml-1 font-medium">{stats.count_1h}</span>
+        </div>
+        <div>
+          <span className="text-gray-500">24h:</span>
+          <span className="ml-1 font-medium">{stats.count_24h}</span>
+        </div>
+        <div>
+          <span className="text-gray-500">7d:</span>
+          <span className="ml-1 font-medium">{stats.count_7d}</span>
+        </div>
+        <div>
+          <span className="text-gray-500">30d:</span>
+          <span className="ml-1 font-medium">{stats.count_30d}</span>
+        </div>
+      </div>
+    </div>
+  );
+}
+```
+
+---
+
+## 交付物清單
+
+| 檔案 | 狀態 | 說明 |
+|------|------|------|
+| `apps/api/src/models/incident.py` | 📝 修改 | 新增頻率欄位 |
+| `apps/api/src/db/migrations/add_incident_frequency.py` | 🆕 | DB 遷移 |
+| `apps/api/src/services/incident_service.py` | 📝 修改 | 聚合邏輯 |
+| `apps/api/src/api/v1/alertmanager_webhook.py` | 📝 修改 | 使用聚合 |
+| `apps/web/src/components/incidents/IncidentCard.tsx` | 📝 修改 | 頻率顯示 |
+
+---
+
+**預估總工時**: 2h
+**前置依賴**: Phase A (AnomalyCounter)
diff --git a/docs/proposals/IMPLEMENTATION_STEPS_REMAINING_PHASES.md b/docs/proposals/IMPLEMENTATION_STEPS_REMAINING_PHASES.md
new file mode 100644
index 00000000..1ce7aa57
--- /dev/null
+++ b/docs/proposals/IMPLEMENTATION_STEPS_REMAINING_PHASES.md
@@ -0,0 +1,1168 @@
+# 剩餘 Phase 實施步驟 (D-G)
+
+> **總工時**: 8h (D 1h + E 2h + F 2h + G 3h)
+> **優先級**: P0-P1
+
+---
+
+## Phase D: Sentry Comment 回寫 (1h)
+
+### 現狀
+
+```python
+# sentry_webhook.py:290-302 - 目前是 TODO
+# TODO: 需要 Sentry API Token
+logger.info(f"Would post comment to issue {issue_id}...")
+```
+
+### Step D-1: 取得 Sentry API Token (10min)
+
+```bash
+# 在 Sentry Self-Hosted 管理後台
+# Settings → API Tokens → Create New Token
+# 權限: project:read, event:write (Sentry 沒有 issue:write scope；Issue/Comment 操作屬於 event:* scope)
+
+# 儲存到 K8s Secret
+kubectl create secret generic sentry-api-token \
+  --from-literal=SENTRY_API_TOKEN=your_token_here \
+  -n awoooi-prod
+```
+
+### Step D-2: 實作 Comment 回寫 (30min)
+
+```python
+# apps/api/src/api/v1/sentry_webhook.py
+# 完成 post_sentry_comment 實作
+
+import os
+
+SENTRY_API_TOKEN = os.getenv("SENTRY_API_TOKEN")
+SENTRY_API_URL = "http://192.168.0.110:9000"
+
+
+async def post_sentry_comment(
+    project_slug: str,
+    issue_id: str,
+    analysis: ErrorAnalysisResult,
+):
+    """
+    回寫分析結果到 Sentry Issue Comment
+
+    API: POST /api/0/issues/{issue_id}/comments/
+    Docs: https://docs.sentry.io/api/events/create-a-comment/
+    """
+    if not SENTRY_API_TOKEN:
+        logger.warning("SENTRY_API_TOKEN not configured, skipping comment")
+        return
+
+    comment_text = f"""## 🧠 AI 錯誤分析 (by {analysis.analyzed_by})
+
+**根本原因 (Root Cause)**
+{analysis.root_cause}
+
+**影響範圍 (Impact)**
+{analysis.impact}
+
+**建議修復 (Fix Suggestion)**
+~~~
+{analysis.fix_suggestion}
+~~~
+
+**預防措施 (Prevention)**
+{analysis.prevention}
+
+---
+*分析信心度: {analysis.confidence:.0%} | 分析時間: {now_taipei_iso()}*
+*Powered by AWOOOI + OpenClaw*
+"""
+
+    try:
+        async with httpx.AsyncClient(timeout=30.0) as client:
+            response = await client.post(
+                f"{SENTRY_API_URL}/api/0/issues/{issue_id}/comments/",
+                headers={
+                    "Authorization": f"Bearer {SENTRY_API_TOKEN}",
+                    "Content-Type": "application/json",
+                },
+                json={"text": comment_text}
+            )
+
+            if response.status_code == 201:
+                logger.info(
+                    "sentry_comment_posted",
+                    issue_id=issue_id,
+                    comment_length=len(comment_text),
+                )
+            else:
+                logger.warning(
+                    "sentry_comment_failed",
+                    issue_id=issue_id,
+                    status=response.status_code,
+                    response=response.text[:200],
+                )
+
+    except Exception as e:
+        logger.exception("sentry_comment_error", issue_id=issue_id, error=str(e))
+```
+
+### Step D-3: 更新 K8s Deployment (10min)
+
+```yaml
+# k8s/awoooi-prod/03-secrets.yaml
+# 新增 Sentry API Token (套用前需以 envsubst 代入 ${SENTRY_API_TOKEN}；若已用 Step D-1 的 kubectl create secret 建立則可略過)
+
+---
+apiVersion: v1
+kind: Secret
+metadata:
+  name: sentry-api-token
+  namespace: awoooi-prod
+type: Opaque
+stringData:
+  SENTRY_API_TOKEN: "${SENTRY_API_TOKEN}"
+```
+
+```yaml
+# k8s/awoooi-prod/04-deployment-api.yaml
+# 掛載環境變數
+
+env:
+  - name: SENTRY_API_TOKEN
+    valueFrom:
+      secretKeyRef:
+        name: sentry-api-token
+        key: SENTRY_API_TOKEN
+```
+
+### Step D-4: 驗證 (10min)
+
+```bash
+# 手動觸發測試
+curl -X POST http://localhost:8000/api/v1/webhooks/sentry/error \
+  -H "Content-Type: application/json" \
+  -d '{
+    "action": "triggered",
+    "data": {
+      "issue": {
+        "id": "12345",
+        "title": "Test Error",
+        "level": "error",
+        "project": {"slug": "awoooi-api"}
+      }
+    }
+  }'
+
+# 檢查 Sentry Issue 是否有 Comment
+```
+
+---
+
+## Phase E: SignOz 告警規則 (2h)
+
+### 現狀分析
+
+- SignOz 只做資料收集，無告警輸出
+- Error Rate / Latency 異常無法即時通知
+
+### Step E-1: SignOz 告警配置 (1h)
+
+```yaml
+# signoz/alerting/rules.yaml
+# SignOz 自訂告警規則
+
+groups:
+  # =========================================================================
+  # API Error Rate 告警
+  # =========================================================================
+  - name: api_errors
+    rules:
+      - alert: APIHighErrorRate
+        expr: |
+          sum(rate(signoz_spans_total{
+            service_name="awoooi-api",
+            status_code=~"5.."
+          }[5m])) by (service_name)
+          /
+          sum(rate(signoz_spans_total{
+            service_name="awoooi-api"
+          }[5m])) by (service_name)
+          > 0.05
+        for: 5m
+        labels:
+          severity: critical
+          source: signoz
+        annotations:
+          summary: "API 錯誤率 > 5%"
+          description: "服務 {{ $labels.service_name }} 錯誤率: {{ $value | humanizePercentage }}"
+          webhook: "http://awoooi-api.awoooi-prod:8000/api/v1/webhooks/signoz/alert"
+
+  # =========================================================================
+  # Latency 告警
+  # =========================================================================
+  - name: latency
+    rules:
+      - alert: APIHighLatencyP99
+        expr: |
+          histogram_quantile(0.99,
+            sum(rate(signoz_spans_duration_bucket{
+              service_name="awoooi-api"
+            }[5m])) by (le, service_name)
+          ) > 2
+        for: 5m
+        labels:
+          severity: warning
+          source: signoz
+        annotations:
+          summary: "API P99 延遲 > 2s"
+          description: "服務 {{ $labels.service_name }} P99: {{ $value }}s"
+
+      - alert: APIHighLatencyP95
+        expr: |
+          histogram_quantile(0.95,
+            sum(rate(signoz_spans_duration_bucket{
+              service_name="awoooi-api"
+            }[5m])) by (le, service_name)
+          ) > 1
+        for: 10m
+        labels:
+          severity: warning
+          source: signoz
+        annotations:
+          summary: "API P95 延遲 > 1s"
+
+  # =========================================================================
+  # Trace 異常告警
+  # =========================================================================
+  - name: traces
+    rules:
+      - alert: NoTracesReceived
+        expr: |
+          sum(rate(signoz_spans_total[15m])) == 0
+        for: 15m
+        labels:
+          severity: warning
+          source: signoz
+        annotations:
+          summary: "15 分鐘內無 Trace 數據"
+          description: "可能是 OTEL Collector 或應用程式問題"
+
+      - alert: HighSpanDropRate
+        expr: |
+          sum(rate(otelcol_exporter_send_failed_spans[5m]))
+          /
+          sum(rate(otelcol_exporter_sent_spans[5m]))
+          > 0.01
+        for: 5m
+        labels:
+          severity: warning
+          source: signoz
+        annotations:
+          summary: "Span 丟棄率 > 1%"
+```
+
+### Step E-2: 建立 SignOz Webhook Handler (30min)
+
+```python
+# apps/api/src/api/v1/signoz_webhook.py
+"""
+SignOz 告警 Webhook Handler
+"""
+
+from fastapi import APIRouter, Request, BackgroundTasks
+import structlog
+
+from src.services.incident_service import get_incident_service
+from src.services.telegram_gateway import get_telegram_gateway
+
+logger = structlog.get_logger(__name__)
+router = APIRouter(prefix="/webhooks/signoz", tags=["SignOz Webhook"])
+
+
+@router.post("/alert")
+async def handle_signoz_alert(
+    request: Request,
+    background_tasks: BackgroundTasks,
+):
+    """
+    處理 SignOz 告警
+
+    SignOz 告警格式:
+    {
+        "alertname": "APIHighErrorRate",
+        "status": "firing",
+        "labels": {...},
+        "annotations": {...},
+        "startsAt": "2026-03-29T10:00:00Z"
+    }
+    """
+    payload = await request.json()
+    logger.info("signoz_alert_received", payload=payload)
+
+    alert_name = payload.get("alertname")
+    status = payload.get("status")
+
+    if status != "firing":
+        return {"status": "ignored", "reason": "not firing"}
+
+    # 轉換為標準告警格式
+    normalized = {
+        "labels": {
+            "alertname": alert_name,
+            "source": "signoz",
+            **payload.get("labels", {}),
+        },
+        "annotations": payload.get("annotations", {}),
+        "startsAt": payload.get("startsAt"),
+    }
+
+    # 建立 Incident
+    incident_service = get_incident_service()
+    incident, is_new = await incident_service.create_or_aggregate_incident(
+        alert_data=normalized,
+    )
+
+    if is_new:
+        # 發送 Telegram
+        background_tasks.add_task(
+            notify_signoz_alert,
+            incident=incident,
+            alert_data=normalized,
+        )
+
+    return {
+        "status": "accepted",
+        "incident_id": str(incident.id),
+        "is_new": is_new,
+    }
+
+
+async def notify_signoz_alert(incident, alert_data: dict):
+    """發送 SignOz 告警到 Telegram"""
+    telegram = get_telegram_gateway()
+    await telegram.initialize()
+
+    annotations = alert_data.get("annotations", {})
+
+    await telegram.send_alert_card(
+        title=f"📊 SignOz: {alert_data['labels']['alertname']}",
+        severity=alert_data['labels'].get('severity', 'warning'),
+        description=annotations.get('description', annotations.get('summary', '')),
+        source="signoz",
+        incident_id=str(incident.id),
+    )
+```
+
+### Step E-3: 註冊路由 (10min)
+
+```python
+# apps/api/src/main.py
+from src.api.v1 import signoz_webhook
+
+app.include_router(signoz_webhook.router, prefix="/api/v1")
+```
+
+### Step E-4: 部署告警規則 (20min)
+
+```bash
+# 複製規則到 SignOz
+scp signoz/alerting/rules.yaml 192.168.0.188:/opt/signoz/config/alerting/
+
+# 重啟 SignOz Query Service
+ssh 192.168.0.188 "docker restart signoz-query-service"
+
+# 驗證規則載入
+curl http://192.168.0.188:3301/api/v3/alerts/rules
+```
+
+---
+
+## Phase F: 告警鏈路 E2E 驗證 (2h)
+
+### 現狀問題
+
+- 2026-03-26: 路徑錯誤導致 2 天無告警
+- 部署後無自動驗證機制
+
+### Step F-1: 建立 Smoke Test 腳本 (30min)
+
+```python
+#!/usr/bin/env python3
+# ops/scripts/alert_chain_smoke_test.py
+"""
+告警鏈路端到端驗證
+
+執行:
+  python ops/scripts/alert_chain_smoke_test.py
+
+驗證項目:
+1. Alertmanager Webhook 可達
+2. Sentry Webhook 可達
+3. SignOz Webhook 可達
+4. Telegram 發送成功
+5. Approval 建立成功
+"""
+
+import asyncio
+import httpx
+import sys
+from datetime import datetime
+
+
+API_BASE = "http://awoooi-api.awoooi-prod.svc.cluster.local:8000"
+# 本地測試用
+# API_BASE = "http://localhost:8000"
+
+TIMEOUT = 30
+
+
+async def test_alertmanager_webhook() -> bool:
+    """測試 Alertmanager Webhook"""
+    print("🔍 Testing Alertmanager Webhook...")
+
+    test_payload = {
+        "version": "4",
+        "status": "firing",
+        "alerts": [{
+            "status": "firing",
+            "labels": {
+                "alertname": "E2E_SMOKE_TEST",
+                "severity": "info",
+                "service": "smoke-test",
+                "namespace": "test",
+            },
+            "annotations": {
+                "summary": "E2E Smoke Test - 請忽略",
+                "description": f"自動測試 @ {datetime.now().isoformat()}",
+            },
+            "startsAt": datetime.now().astimezone().isoformat(),
+        }]
+    }
+
+    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
+        try:
+            response = await client.post(
+                f"{API_BASE}/api/v1/webhooks/alertmanager",
+                json=test_payload,
+            )
+            if response.status_code == 200:
+                print("  ✅ Alertmanager Webhook: OK")
+                return True
+            else:
+                print(f"  ❌ Alertmanager Webhook: {response.status_code}")
+                print(f"     Response: {response.text[:200]}")
+                return False
+        except Exception as e:
+            print(f"  ❌ Alertmanager Webhook: {e}")
+            return False
+
+
+async def test_sentry_webhook() -> bool:
+    """POST a synthetic Sentry issue event; True only on HTTP 200 with status accepted/deduplicated."""
+    print("🔍 Testing Sentry Webhook...")
+
+    test_payload = {
+        "action": "triggered",
+        "data": {
+            "issue": {
+                "id": "smoke-test-" + datetime.now().strftime("%Y%m%d%H%M%S"),
+                "title": "E2E Smoke Test Error",
+                "level": "error",
+                "culprit": "smoke_test.py:test",
+                "project": {"slug": "awoooi-api"},
+                "firstSeen": datetime.now().isoformat(),
+                "count": 1,
+            },
+            "event": {
+                "message": "E2E Smoke Test - 請忽略",
+                "platform": "python",
+            },
+        },
+    }
+
+    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
+        try:
+            response = await client.post(
+                f"{API_BASE}/api/v1/webhooks/sentry/error",
+                json=test_payload,
+            )
+            if response.status_code == 200:
+                if response.json().get("status") in ("accepted", "deduplicated"):
+                    print("  ✅ Sentry Webhook: OK")
+                    return True
+            print(f"  ❌ Sentry Webhook: {response.status_code}")
+            print(f"     Response: {response.text[:200]}")  # body shows why a 200 was still rejected
+            return False
+        except Exception as e:
+            print(f"  ❌ Sentry Webhook: {e}")
+            return False
+
+
+async def test_health_endpoint() -> bool:
+    """測試 Health Endpoint"""
+    print("🔍 Testing Health Endpoint...")
+
+    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
+        try:
+            response = await client.get(f"{API_BASE}/api/v1/health")
+            if response.status_code == 200:
+                print("  ✅ Health: OK")
+                return True
+            else:
+                print(f"  ❌ Health: {response.status_code}")
+                return False
+        except Exception as e:
+            print(f"  ❌ Health: {e}")
+            return False
+
+
+async def test_telegram_connectivity() -> bool:
+    """測試 Telegram 連通性"""
+    print("🔍 Testing Telegram Connectivity...")
+
+    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
+        try:
+            # 透過內部 API 檢查 Telegram 狀態
+            response = await client.get(f"{API_BASE}/api/v1/telegram/status")
+            if response.status_code == 200:
+                data = response.json()
+                if data.get("connected"):
+                    print("  ✅ Telegram: Connected")
+                    return True
+                else:
+                    print("  ⚠️ Telegram: Not Connected (but endpoint reachable)")
+                    return True  # 端點可達即可
+            else:
+                print(f"  ❌ Telegram: {response.status_code}")
+                return False
+        except Exception as e:
+            print(f"  ⚠️ Telegram: {e} (endpoint may not exist)")
+            return True  # 不影響整體測試
+
+
+async def main():
+    print("=" * 60)
+    print("🚀 AWOOOI 告警鏈路 E2E Smoke Test")
+    print(f"   時間: {datetime.now().isoformat()}")
+    print(f"   目標: {API_BASE}")
+    print("=" * 60)
+
+    results = await asyncio.gather(
+        test_health_endpoint(),
+        test_alertmanager_webhook(),
+        test_sentry_webhook(),
+        test_telegram_connectivity(),
+    )
+
+    print("=" * 60)
+    passed = sum(results)
+    total = len(results)
+
+    if passed == total:
+        print(f"✅ 全部通過 ({passed}/{total})")
+        sys.exit(0)
+    else:
+        print(f"❌ 部分失敗 ({passed}/{total})")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+### Step F-2: 整合到 CD Pipeline (30min)
+
+```yaml
+# .github/workflows/cd.yaml
+# 新增 smoke test 步驟
+
+jobs:
+  deploy:
+    # ... existing job settings (runs-on, etc.); append the steps below to the existing steps ...
+    steps:
+    - name: Wait for Pods Ready
+      run: |
+        kubectl rollout status deployment/awoooi-api -n awoooi-prod --timeout=5m
+
+    # 🆕 告警鏈路驗證
+    - name: Alert Chain Smoke Test
+      run: |
+        # 等待服務完全啟動
+        sleep 30
+
+        # 執行 smoke test
+        python ops/scripts/alert_chain_smoke_test.py
+
+      env:
+        API_BASE: "http://awoooi-api.awoooi-prod.svc.cluster.local:8000"
+
+    - name: Notify on Smoke Test Failure
+      if: failure()
+      run: |
+        # 直接發送 Telegram 告警 (繞過可能壞掉的 API)
+        curl -X POST "https://api.telegram.org/bot${TG_BOT_TOKEN}/sendMessage" \
+          -d "chat_id=${TG_CHAT_ID}" \
+          -d "text=🚨 AWOOOI CD Smoke Test 失敗！告警鏈路可能中斷！"
+      env:
+        TG_BOT_TOKEN: ${{ secrets.OPENCLAW_TG_BOT_TOKEN }}
+        TG_CHAT_ID: ${{ secrets.OPENCLAW_TG_CHAT_ID }}
+```
+
+### Step F-3: 建立鏈路監控告警 (30min)
+
+```yaml
+# k8s/monitoring/alert-chain-monitor.yaml
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: alert-chain-monitor
+  namespace: monitoring
+spec:
+  groups:
+    - name: alert_chain
+      rules:
+        # Alertmanager Webhook 無回應
+        - alert: AlertChainBroken_Alertmanager
+          expr: |
+            sum(rate(http_requests_total{
+              path="/api/v1/webhooks/alertmanager",
+              status!="200"
+            }[5m])) > 0
+            or
+            absent(http_requests_total{path="/api/v1/webhooks/alertmanager"})
+          for: 10m
+          labels:
+            severity: critical
+            service: alert-chain
+          annotations:
+            summary: "Alertmanager Webhook 鏈路異常"
+            description: "告警無法送達 AWOOOI API"
+
+        # Sentry Webhook 無回應
+        - alert: AlertChainBroken_Sentry
+          expr: |
+            sum(rate(http_requests_total{
+              path="/api/v1/webhooks/sentry/error",
+              status!="200"
+            }[5m])) > 0
+          for: 10m
+          labels:
+            severity: warning
+            service: alert-chain
+          annotations:
+            summary: "Sentry Webhook 鏈路異常"
+
+        # 長時間無告警 (可能鏈路斷了)
+        - alert: NoAlertsReceivedLong
+          expr: |
+            time() - max(awoooi_last_alert_received_timestamp) > 7200
+          for: 5m
+          labels:
+            severity: warning
+            service: alert-chain
+          annotations:
+            summary: "2 小時內未收到任何告警"
+            description: "可能是告警鏈路問題或系統異常穩定"
+```
+
+### Step F-4: 新增 Metrics (30min)
+
+```python
+# apps/api/src/core/metrics.py
+# 新增告警鏈路 metrics
+
+from prometheus_client import Counter, Gauge, Histogram
+import time
+
+# Last-alert timestamp, seeded at startup: an unset Gauge exports 0, which would trip NoAlertsReceivedLong after every restart.
+LAST_ALERT_RECEIVED = Gauge(
+    'awoooi_last_alert_received_timestamp', 'Timestamp of last received alert',
+)
+LAST_ALERT_RECEIVED.set_to_current_time()
+
+# 告警接收計數
+ALERTS_RECEIVED = Counter(
+    'awoooi_alerts_received_total',
+    'Total alerts received',
+    ['source', 'status']
+)
+
+# Webhook 處理延遲
+WEBHOOK_LATENCY = Histogram(
+    'awoooi_webhook_latency_seconds',
+    'Webhook processing latency',
+    ['webhook_type'],
+    buckets=[0.1, 0.5, 1, 2, 5, 10, 30]
+)
+
+
+def record_alert_received(source: str, status: str = "accepted"):
+    """記錄收到告警"""
+    LAST_ALERT_RECEIVED.set(time.time())
+    ALERTS_RECEIVED.labels(source=source, status=status).inc()
+```
+
+---
+
+## Phase G: Learning Service (3h)
+
+### Step G-1: 建立 learning_service.py (1.5h)
+
+```python
+# apps/api/src/services/learning_service.py
+"""
+異常學習服務 - 從解決方案中學習
+================================
+2026-03-29 ogt: 監控戰略規劃 Section 9.4 實作
+
+功能:
+1. 記錄每次修復的效果
+2. 計算各動作的成功率
+3. 推薦最佳修復方案
+4. 自動更新 Playbook
+"""
+
+import json
+from datetime import datetime
+from typing import Optional
+
+import redis.asyncio as redis
+import structlog
+
+from src.services.anomaly_counter import get_anomaly_counter
+from src.services.playbook_service import get_playbook_service
+
+logger = structlog.get_logger(__name__)
+
+
+class LearningService:
+    """
+    學習每次修復的效果，自動更新 Playbook
+    """
+
+    # 學習門檻: 需要至少 N 次數據才能推薦
+    MIN_SAMPLES = 5
+
+    # 成功率門檻: 高於此值才會被推薦
+    SUCCESS_RATE_THRESHOLD = 0.6
+
+    # Tier 對應的動作
+    TIER_ACTIONS = {
+        1: ['restart_pod', 'restart_container', 'delete_pod'],
+        2: ['scale_up', 'increase_memory', 'increase_cpu', 'adjust_limits'],
+        3: ['apply_hotfix', 'update_config', 'patch_deployment', 'rollback'],
+        4: ['create_issue', 'notify_team', 'schedule_fix', 'manual_intervention'],
+    }
+
+    def __init__(self, redis_client: redis.Redis):
+        self.redis = redis_client
+
+    async def record_repair_result(
+        self,
+        anomaly_key: str,
+        repair_action: str,
+        success: bool,
+        root_cause: Optional[str] = None,
+        fix_description: Optional[str] = None,
+        execution_time_seconds: Optional[float] = None,
+    ):
+        """
+        記錄修復結果，用於學習
+
+        Args:
+            anomaly_key: 異常 key
+            repair_action: 修復動作
+            success: 是否成功
+            root_cause: 根因 (如果找到)
+            fix_description: 修復說明
+            execution_time_seconds: 執行時間
+        """
+        # 1. 記錄到 AnomalyCounter
+        counter = get_anomaly_counter()
+        await counter.record_repair_attempt(anomaly_key, repair_action, success)
+
+        # 2. 記錄詳細學習數據
+        learning_key = f"learning:repair:{anomaly_key}:{repair_action}"
+        record = {
+            'success': success,
+            'root_cause': root_cause,
+            'fix_description': fix_description,
+            'execution_time': execution_time_seconds,
+            'timestamp': datetime.now().isoformat(),
+        }
+
+        await self.redis.lpush(learning_key, json.dumps(record))
+        await self.redis.ltrim(learning_key, 0, 99)  # 保留最近 100 次
+        await self.redis.expire(learning_key, 90 * 24 * 3600)  # 90 天
+
+        # 3. 如果找到根因且修復成功，考慮更新 Playbook
+        if success and root_cause:
+            await self._consider_playbook_update(
+                anomaly_key=anomaly_key,
+                repair_action=repair_action,
+                root_cause=root_cause,
+                fix_description=fix_description,
+            )
+
+        logger.info(
+            "learning_recorded",
+            anomaly_key=anomaly_key,
+            action=repair_action,
+            success=success,
+            has_root_cause=root_cause is not None,
+        )
+
+    async def get_recommended_fix(self, anomaly_key: str) -> dict:
+        """
+        Recommend the best repair action for an anomaly from recorded repair stats.
+
+        Only actions with at least MIN_SAMPLES attempts and a success rate of at
+        least SUCCESS_RATE_THRESHOLD are ranked, by success_rate * ln(total + 1);
+        equal scores are ordered by lower tier. Returns _default_recommendation()
+        when there are no stats or no action qualifies.
+
+        Returns:
+            dict with keys 'action', 'confidence' (success rate of the best action),
+            'tier', 'based_on', 'avg_execution_time' and 'alternatives' (up to two
+            runner-up actions, each with 'action', 'confidence' and 'tier').
+        """
+        import math  # hoisted: was re-imported inside the scoring loop
+
+        counter = get_anomaly_counter()
+        all_stats = await counter.get_all_repair_stats(anomaly_key)
+
+        if not all_stats:
+            return self._default_recommendation()
+
+        # Score every action that passes both learning thresholds.
+        scored_actions = []
+        for action, stats in all_stats.items():
+            # Too few samples to trust the success rate.
+            if stats['total'] < self.MIN_SAMPLES:
+                continue
+            success_rate = stats['success_rate']
+            if success_rate < self.SUCCESS_RATE_THRESHOLD:
+                continue
+
+            scored_actions.append({
+                'action': action,
+                # Weight by log(sample count) so well-tested actions rank higher.
+                'score': success_rate * math.log(stats['total'] + 1),
+                'success_rate': success_rate,
+                'total_samples': stats['total'],
+                'tier': self._get_action_tier(action),
+                'avg_execution_time': await self._get_avg_execution_time(anomaly_key, action),
+            })
+
+        if not scored_actions:
+            return self._default_recommendation()
+
+        # Highest score first; on equal score prefer the lower tier.
+        scored_actions.sort(key=lambda x: (-x['score'], x['tier']))
+
+        best = scored_actions[0]
+        alternatives = scored_actions[1:3]  # slicing yields [] when there is no runner-up
+
+        return {
+            'action': best['action'],
+            'confidence': best['success_rate'],
+            'tier': best['tier'],
+            'based_on': f"{best['total_samples']} 次歷史數據",
+            'avg_execution_time': best['avg_execution_time'],
+            'alternatives': [
+                {'action': a['action'], 'confidence': a['success_rate'], 'tier': a['tier']}
+                for a in alternatives
+            ],
+        }
+
+    async def _get_avg_execution_time(self, anomaly_key: str, action: str) -> float:
+        """Return the mean execution time over the 20 newest learning records, or 0.0 if none has one."""
+        learning_key = f"learning:repair:{anomaly_key}:{action}"
+        records = await self.redis.lrange(learning_key, 0, 19)  # records are LPUSHed, so 0..19 are the newest
+
+        # Compare against None, not truthiness: a legitimately recorded 0.0 must count.
+        times = [
+            t
+            for t in (json.loads(r).get('execution_time') for r in records)
+            if t is not None
+        ]
+        return sum(times) / len(times) if times else 0.0
+
+    async def _consider_playbook_update(
+        self,
+        anomaly_key: str,
+        repair_action: str,
+        root_cause: str,
+        fix_description: Optional[str],
+    ):
+        """
+        Create or update the Playbook entry when this action has proven reliable.
+
+        Conditions (as implemented below):
+        1. At least 5 recorded attempts (total attempts, not only successes)
+        2. Success rate of the action >= 80%
+        3. No existing Playbook, or the existing one has a lower success rate
+        """
+        counter = get_anomaly_counter()
+        stats = await counter.get_repair_success_rate(anomaly_key, repair_action)
+
+        if stats['total'] >= 5 and stats['success_rate'] >= 0.8:
+            # Look up an existing Playbook for this anomaly
+            playbook_service = get_playbook_service()
+            existing = await playbook_service.find_by_anomaly_key(anomaly_key)
+
+            if not existing or existing.success_rate < stats['success_rate']:
+                # Create the Playbook, or replace the weaker existing one
+                await playbook_service.create_or_update(
+                    anomaly_key=anomaly_key,
+                    root_cause=root_cause,
+                    fix_action=repair_action,
+                    fix_description=fix_description,
+                    success_rate=stats['success_rate'],
+                    total_executions=stats['total'],
+                    source='auto_learning',
+                )
+
+                logger.info(
+                    "playbook_auto_updated",
+                    anomaly_key=anomaly_key,
+                    action=repair_action,
+                    success_rate=stats['success_rate'],
+                )
+
+    def _get_action_tier(self, action: str) -> int:
+        """取得動作的 Tier"""
+        for tier, actions in self.TIER_ACTIONS.items():
+            if action in actions:
+                return tier
+        return 1  # 預設 Tier 1
+
+    def _default_recommendation(self) -> dict:
+        """預設推薦 (無歷史數據時)"""
+        return {
+            'action': 'restart_pod',
+            'confidence': 0.3,
+            'tier': 1,
+            'based_on': '無歷史數據，使用預設',
+            'avg_execution_time': 30.0,
+            'alternatives': [
+                {'action': 'delete_pod', 'confidence': 0.3, 'tier': 1},
+            ],
+        }
+
+    async def get_learning_summary(self, anomaly_key: str) -> dict:
+        """
+        Build a learning summary for one anomaly key.
+
+        Returns:
+            {
+                'anomaly_key': 'abc123',
+                'total_occurrences': 15,
+                'total_repair_attempts': 8,
+                'overall_success_rate': 0.625,
+                'actions_tried': ['restart_pod', 'scale_up'],
+                'best_action': {'action': 'scale_up', 'success_rate': 0.75},
+                'learning_status': 'sufficient',  # insufficient, learning, sufficient, excellent
+            }
+        """
+        counter = get_anomaly_counter()
+
+        # Occurrence count: read straight from the Redis sorted set (simplified).
+        # NOTE(review): assumes AnomalyCounter stores occurrences under this key — confirm.
+        timeline_key = f"anomaly:timeline:{anomaly_key}"
+        total_occurrences = await self.redis.zcard(timeline_key)
+
+        # Repair stats for every action tried on this anomaly
+        all_stats = await counter.get_all_repair_stats(anomaly_key)
+
+        total_attempts = sum(s['total'] for s in all_stats.values())
+        total_success = sum(s['success'] for s in all_stats.values())
+        overall_rate = total_success / total_attempts if total_attempts > 0 else 0
+
+        # Best action: highest success rate among actions with at least 3 attempts
+        best_action = None
+        best_rate = 0
+        for action, stats in all_stats.items():
+            if stats['total'] >= 3 and stats['success_rate'] > best_rate:
+                best_rate = stats['success_rate']
+                best_action = {'action': action, 'success_rate': best_rate}
+
+        # Learning status: < 3 attempts, then < 10 attempts, then split on the overall rate
+        if total_attempts < 3:
+            status = 'insufficient'
+        elif total_attempts < 10:
+            status = 'learning'
+        elif overall_rate >= 0.8:
+            status = 'excellent'
+        else:
+            status = 'sufficient'
+
+        return {
+            'anomaly_key': anomaly_key,
+            'total_occurrences': total_occurrences,
+            'total_repair_attempts': total_attempts,
+            'overall_success_rate': overall_rate,
+            'actions_tried': list(all_stats.keys()),
+            'best_action': best_action,
+            'learning_status': status,
+        }
+
+
+# =============================================================================
+# Singleton
+# =============================================================================
+_learning_service: LearningService | None = None
+
+
+def get_learning_service() -> LearningService:
+    """取得 LearningService 實例"""
+    global _learning_service
+    if _learning_service is None:
+        from src.core.redis import get_redis_client
+        _learning_service = LearningService(get_redis_client())
+    return _learning_service
+```
+
+### Step G-2: 整合到 auto_repair_service.py (1h)
+
+```python
+# apps/api/src/services/auto_repair_service.py
+# 修改執行修復的流程
+
+from src.services.learning_service import get_learning_service
+import time
+
+
+class AutoRepairService:
+    async def execute_repair(
+        self,
+        incident_id: str,
+        anomaly_key: str,
+        repair_action: str,
+        dry_run: bool = False,
+    ) -> AutoRepairResult:
+        """
+        Execute a repair action and record its outcome for the learning service.
+
+        Only the repair itself sits in the try block. Previously a failure while
+        recording a *successful* repair fell into the except branch, which recorded
+        the repair again as failed. Exceptions from _do_execute are re-raised.
+        """
+        # NOTE(review): dry_run outcomes are recorded as learning data too — confirm intended.
+        learning = get_learning_service()
+        start_time = time.monotonic()  # monotonic clock: immune to wall-clock adjustments
+
+        try:
+            result = await self._do_execute(repair_action, ...)
+        except Exception as e:
+            # Record the failed attempt, then propagate the original exception.
+            await learning.record_repair_result(
+                anomaly_key=anomaly_key,
+                repair_action=repair_action,
+                success=False,
+                fix_description=str(e),
+                execution_time_seconds=time.monotonic() - start_time,
+            )
+            raise
+
+        await learning.record_repair_result(
+            anomaly_key=anomaly_key,
+            repair_action=repair_action,
+            success=result.success,
+            root_cause=getattr(result, 'root_cause', None),
+            fix_description=result.message,
+            execution_time_seconds=time.monotonic() - start_time,
+        )
+        return result
+
+    async def get_smart_recommendation(self, anomaly_key: str) -> dict:
+        """
+        取得智慧修復建議 (結合 AI 分析 + 歷史學習)
+        """
+        learning = get_learning_service()
+
+        # 1. 取得學習推薦
+        learned = await learning.get_recommended_fix(anomaly_key)
+
+        # 2. 如果學習信心度高，直接使用
+        if learned['confidence'] >= 0.8:
+            return {
+                'source': 'learning',
+                'recommendation': learned,
+            }
+
+        # 3. 否則結合 AI 分析
+        # (呼叫 OpenClaw 取得建議)
+        ai_recommendation = await self._get_ai_recommendation(anomaly_key)
+
+        # 4. 合併推薦
+        return {
+            'source': 'hybrid',
+            'learning': learned,
+            'ai': ai_recommendation,
+            'final_recommendation': self._merge_recommendations(learned, ai_recommendation),
+        }
+```
+
+### Step G-3: 新增 API 端點 (30min)
+
+```python
+# apps/api/src/api/v1/learning.py
+"""
+學習系統 API
+"""
+
+from fastapi import APIRouter
+from src.services.learning_service import get_learning_service
+
+router = APIRouter(prefix="/learning", tags=["Learning"])
+
+
+@router.get("/summary/{anomaly_key}")
+async def get_learning_summary(anomaly_key: str):
+    """取得異常學習摘要"""
+    learning = get_learning_service()
+    return await learning.get_learning_summary(anomaly_key)
+
+
+@router.get("/recommendation/{anomaly_key}")
+async def get_recommendation(anomaly_key: str):
+    """取得修復推薦"""
+    learning = get_learning_service()
+    return await learning.get_recommended_fix(anomaly_key)
+```
+
+---
+
+## 完整實作清單總覽
+
+| Phase | 項目 | 工時 | 優先級 | 依賴 |
+|-------|------|------|--------|------|
+| A | AnomalyCounter | 4h | P0 | Redis |
+| B | Database Exporters | 3h | P0 | Docker |
+| C | Incident 頻率欄位 | 2h | P0 | Phase A |
+| D | Sentry Comment | 1h | P1 | Sentry Token |
+| E | SignOz 告警 | 2h | P1 | SignOz |
+| F | Alert Chain E2E | 2h | P0 | Phase A |
+| G | Learning Service | 3h | P1 | Phase A, C |
+
+**總工時**: 17h (約 2-3 天)
+
+---
+
+## 執行順序建議
+
+```
+Day 1 (8h):
+  ├─ Phase A: AnomalyCounter (4h) ✅
+  ├─ Phase B: Database Exporters (3h) ✅
+  └─ Phase F: Alert Chain E2E (部分, 1h) ✅
+
+Day 2 (6h):
+  ├─ Phase C: Incident 頻率 (2h) ✅
+  ├─ Phase D: Sentry Comment (1h) ✅
+  └─ Phase G: Learning Service (3h) ✅
+
+Day 3 (3h):
+  ├─ Phase E: SignOz 告警 (2h) ✅
+  └─ Phase F: Alert Chain E2E (完成, 1h) ✅
+```