awoooi/apps/api/src/services/stats_service.py

"""
Stats Service - Phase 17 P0 Router 層違規修復
=============================================

封裝統計 API 的快取邏輯與資料庫查詢，消除 Router 層直接存取 Redis/DB。

功能:
- 快取包裝器 (Redis)
- 統計計算 (透過 SQLAlchemy)

符合 leWOOOgo 積木化規範:
- Router -> Service -> Redis/Repository

@author Claude Code (首席架構師)
@version 2.0.0
@date 2026-03-28 (台北時間)
@see feedback_lewooogo_modular_enforcement.md
"""

import json
from collections.abc import Callable, Coroutine
from datetime import datetime, timedelta
from typing import Any, Protocol, runtime_checkable

import structlog
from sqlalchemy import func, select

from src.core.redis_client import get_redis
from src.db.base import get_db_context
from src.db.models import IncidentRecord
from src.models.incident import IncidentStatus

logger = structlog.get_logger(__name__)

# 快取 TTL (秒)
STATS_CACHE_TTL = 300  # 5 分鐘


# =============================================================================
# Protocol (Interface)
# =============================================================================


@runtime_checkable
class IStatsService(Protocol):
    """
    統計服務介面

    Phase 17 P1: 定義 Protocol 供依賴注入
    """

    async def get_incident_summary(
        self, days: int = 30
    ) -> dict[str, Any]:
        """取得事件總覽統計"""
        ...

    async def get_resolution_stats(
        self, days: int = 30
    ) -> dict[str, Any]:
        """取得解決時間統計"""
        ...

    async def get_ai_performance(
        self, days: int = 30
    ) -> dict[str, Any]:
        """取得 AI 效能統計"""
        ...

    async def get_affected_services(
        self, days: int = 30, limit: int = 10
    ) -> list[dict[str, Any]]:
        """取得受影響服務排名"""
        ...

    async def get_incident_trends(
        self, days: int = 30, period: str = "daily"
    ) -> dict[str, Any]:
        """取得事件趨勢"""
        ...

    async def get_feedback_summary(
        self, days: int = 30
    ) -> dict[str, Any]:
        """取得人類回饋摘要"""
        ...


# =============================================================================
# Implementation
# =============================================================================


class StatsService:
    """
    統計服務實作

    封裝統計 API 的快取邏輯與資料庫查詢
    """

    # -------------------------------------------------------------------------
    # 快取相關
    # -------------------------------------------------------------------------

    async def get_cached_or_compute(
        self,
        cache_key: str,
        compute_fn: Callable[[], Coroutine[Any, Any, dict[str, Any]]],
        ttl: int = STATS_CACHE_TTL,
    ) -> dict[str, Any]:
        """
        快取包裝器: 先查 Redis，沒有則計算並快取

        Phase 17: 從 Router 層遷移至 Service 層
        """
        redis_client = get_redis()

        # 嘗試從快取取得
        try:
            cached = await redis_client.get(cache_key)
            if cached:
                logger.debug("stats_cache_hit", key=cache_key)
                return json.loads(cached)
        except Exception as e:
            logger.warning("stats_cache_read_error", key=cache_key, error=str(e))

        # 計算結果
        result = await compute_fn()

        # 寫入快取
        try:
            await redis_client.set(cache_key, json.dumps(result), ex=ttl)
            logger.debug("stats_cache_set", key=cache_key, ttl=ttl)
        except Exception as e:
            logger.warning("stats_cache_write_error", key=cache_key, error=str(e))

        return result

    async def invalidate_cache(self, cache_key: str) -> bool:
        """清除指定快取"""
        redis_client = get_redis()
        try:
            await redis_client.delete(cache_key)
            logger.info("stats_cache_invalidated", key=cache_key)
            return True
        except Exception as e:
            logger.warning("stats_cache_invalidate_error", key=cache_key, error=str(e))
            return False

    # -------------------------------------------------------------------------
    # 統計查詢 (Phase 17 P1: 從 Router 層遷移)
    # -------------------------------------------------------------------------

    async def get_incident_summary(self, days: int = 30) -> dict[str, Any]:
        """
        取得事件總覽統計

        包含: 總事件數、狀態分佈、嚴重度分佈、解決率
        """
        cache_key = f"stats:incident_summary:{days}"

        async def compute() -> dict[str, Any]:
            async with get_db_context() as db:
                since = datetime.utcnow() - timedelta(days=days)

                # 總數
                total_result = await db.execute(
                    select(func.count(IncidentRecord.incident_id)).where(
                        IncidentRecord.created_at >= since
                    )
                )
                total = total_result.scalar() or 0

                # 狀態分佈
                status_result = await db.execute(
                    select(IncidentRecord.status, func.count(IncidentRecord.incident_id))
                    .where(IncidentRecord.created_at >= since)
                    .group_by(IncidentRecord.status)
                )
                status_dist = [
                    {"status": str(row[0]), "count": row[1]}
                    for row in status_result.all()
                ]

                # 嚴重度分佈
                severity_result = await db.execute(
                    select(IncidentRecord.severity, func.count(IncidentRecord.incident_id))
                    .where(IncidentRecord.created_at >= since)
                    .group_by(IncidentRecord.severity)
                )
                severity_dist = [
                    {"severity": str(row[0]), "count": row[1]}
                    for row in severity_result.all()
                ]

                # 解決率
                resolved_result = await db.execute(
                    select(func.count(IncidentRecord.incident_id)).where(
                        IncidentRecord.created_at >= since,
                        IncidentRecord.status.in_(
                            [IncidentStatus.RESOLVED, IncidentStatus.CLOSED]
                        ),
                    )
                )
                resolved_count = resolved_result.scalar() or 0
                resolved_rate = (resolved_count / total * 100) if total > 0 else 0.0

                # 平均告警聚合數
                signals_result = await db.execute(
                    select(func.avg(func.json_array_length(IncidentRecord.signals))).where(
                        IncidentRecord.created_at >= since
                    )
                )
                avg_signals = signals_result.scalar() or 0.0

                logger.info(
                    "stats_incident_summary",
                    total=total,
                    resolved_rate=resolved_rate,
                    days=days,
                )

                return {
                    "total_incidents": total,
                    "status_distribution": status_dist,
                    "severity_distribution": severity_dist,
                    "resolved_rate": round(resolved_rate, 2),
                    "avg_signals_per_incident": round(float(avg_signals), 2),
                }

        return await self.get_cached_or_compute(cache_key, compute)

    async def get_resolution_stats(self, days: int = 30) -> dict[str, Any]:
        """
        取得解決時間統計

        計算: 平均、P50、P95、最快、最慢解決時間
        """
        cache_key = f"stats:resolution:{days}"

        async def compute() -> dict[str, Any]:
            async with get_db_context() as db:
                since = datetime.utcnow() - timedelta(days=days)

                result = await db.execute(
                    select(
                        IncidentRecord.created_at,
                        IncidentRecord.resolved_at,
                    ).where(
                        IncidentRecord.created_at >= since,
                        IncidentRecord.resolved_at.isnot(None),
                    )
                )
                rows = result.all()

                if not rows:
                    return {
                        "avg_minutes": None,
                        "p50_minutes": None,
                        "p95_minutes": None,
                        "fastest_minutes": None,
                        "slowest_minutes": None,
                        "sample_size": 0,
                    }

                durations = []
                for row in rows:
                    if row.resolved_at and row.created_at:
                        delta = row.resolved_at - row.created_at
                        durations.append(delta.total_seconds() / 60)

                if not durations:
                    return {
                        "avg_minutes": None,
                        "p50_minutes": None,
                        "p95_minutes": None,
                        "fastest_minutes": None,
                        "slowest_minutes": None,
                        "sample_size": 0,
                    }

                durations.sort()
                n = len(durations)

                return {
                    "avg_minutes": round(sum(durations) / n, 2),
                    "p50_minutes": round(durations[n // 2], 2),
                    "p95_minutes": round(durations[min(int(n * 0.95), n - 1)], 2),
                    "fastest_minutes": round(min(durations), 2),
                    "slowest_minutes": round(max(durations), 2),
                    "sample_size": n,
                }

        return await self.get_cached_or_compute(cache_key, compute)

    async def get_ai_performance(self, days: int = 30) -> dict[str, Any]:
        """
        取得 AI 提案效能統計

        評估: 提案執行率、成功率、有效性評分
        """
        cache_key = f"stats:ai_performance:{days}"

        async def compute() -> dict[str, Any]:
            async with get_db_context() as db:
                since = datetime.utcnow() - timedelta(days=days)

                result = await db.execute(
                    select(IncidentRecord.outcome).where(
                        IncidentRecord.created_at >= since,
                        IncidentRecord.outcome.isnot(None),
                    )
                )
                outcomes = [row[0] for row in result.all() if row[0]]

                total = len(outcomes)
                executed = sum(1 for o in outcomes if o.get("proposal_executed"))
                success = sum(
                    1 for o in outcomes if o.get("proposal_executed") and o.get("execution_success")
                )

                effectiveness_dist: dict[int, int] = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
                scores = []
                for o in outcomes:
                    score = o.get("effectiveness_score")
                    if score and 1 <= score <= 5:
                        effectiveness_dist[score] += 1
                        scores.append(score)

                avg_effectiveness = sum(scores) / len(scores) if scores else None

                return {
                    "total_proposals": total,
                    "executed_count": executed,
                    "execution_rate": round((executed / total * 100) if total > 0 else 0, 2),
                    "success_count": success,
                    "success_rate": round((success / executed * 100) if executed > 0 else 0, 2),
                    "avg_effectiveness": round(avg_effectiveness, 2) if avg_effectiveness else None,
                    "effectiveness_distribution": effectiveness_dist,
                }

        return await self.get_cached_or_compute(cache_key, compute)

    async def get_affected_services(
        self, days: int = 30, limit: int = 10
    ) -> list[dict[str, Any]]:
        """
        取得最常受影響的服務排名
        """
        cache_key = f"stats:affected_services:{days}:{limit}"

        async def compute() -> dict[str, Any]:
            async with get_db_context() as db:
                since = datetime.utcnow() - timedelta(days=days)

                result = await db.execute(
                    select(
                        IncidentRecord.affected_services,
                        IncidentRecord.severity,
                    ).where(IncidentRecord.created_at >= since)
                )

                service_stats: dict[str, dict[str, Any]] = {}
                for row in result.all():
                    services = row[0] or []
                    severity = str(row[1])
                    for svc in services:
                        if svc not in service_stats:
                            service_stats[svc] = {"count": 0, "severity": {}}
                        service_stats[svc]["count"] += 1
                        service_stats[svc]["severity"][severity] = (
                            service_stats[svc]["severity"].get(severity, 0) + 1
                        )

                sorted_services = sorted(
                    service_stats.items(), key=lambda x: x[1]["count"], reverse=True
                )[:limit]

                return {
                    "services": [
                        {
                            "service": svc,
                            "incident_count": stats["count"],
                            "severity_breakdown": stats["severity"],
                        }
                        for svc, stats in sorted_services
                    ]
                }

        result = await self.get_cached_or_compute(cache_key, compute)
        return result.get("services", [])

    async def get_incident_trends(
        self, days: int = 30, period: str = "daily"
    ) -> dict[str, Any]:
        """
        取得事件趨勢數據 (SQL GROUP BY 優化版)
        """
        cache_key = f"stats:incident_trends:{days}:{period}"

        async def compute() -> dict[str, Any]:
            async with get_db_context() as db:
                since = datetime.utcnow() - timedelta(days=days)

                trunc_unit = {"daily": "day", "weekly": "week", "monthly": "month"}.get(
                    period, "day"
                )

                result = await db.execute(
                    select(
                        func.date_trunc(trunc_unit, IncidentRecord.created_at).label("period"),
                        func.count(IncidentRecord.incident_id).label("count"),
                    )
                    .where(IncidentRecord.created_at >= since)
                    .group_by(func.date_trunc(trunc_unit, IncidentRecord.created_at))
                    .order_by(func.date_trunc(trunc_unit, IncidentRecord.created_at))
                )
                rows = result.all()

                trend_data = []
                for row in rows:
                    if row.period:
                        if period == "daily":
                            date_str = row.period.strftime("%Y-%m-%d")
                        elif period == "weekly":
                            date_str = row.period.strftime("%Y-W%W")
                        else:
                            date_str = row.period.strftime("%Y-%m")
                        trend_data.append({"date": date_str, "count": row.count})

                logger.info(
                    "stats_incident_trends",
                    period=period,
                    days=days,
                    data_points=len(trend_data),
                )

                return {"period": period, "data": trend_data}

        return await self.get_cached_or_compute(cache_key, compute)

    async def get_feedback_summary(self, days: int = 30) -> dict[str, Any]:
        """
        取得人類回饋統計
        """
        cache_key = f"stats:feedback_summary:{days}"

        async def compute() -> dict[str, Any]:
            async with get_db_context() as db:
                since = datetime.utcnow() - timedelta(days=days)

                result = await db.execute(
                    select(IncidentRecord.outcome).where(
                        IncidentRecord.created_at >= since,
                        IncidentRecord.outcome.isnot(None),
                    )
                )
                outcomes = [row[0] for row in result.all() if row[0]]

                positive = 0
                neutral = 0
                negative = 0
                themes: dict[str, int] = {}

                for o in outcomes:
                    score = o.get("effectiveness_score") or o.get("feedback_score")
                    if score:
                        if score >= 4:
                            positive += 1
                        elif score == 3:
                            neutral += 1
                        else:
                            negative += 1

                    notes = o.get("learning_notes") or o.get("notes") or ""
                    if notes:
                        notes_lower = notes.lower()
                        theme_keywords = {
                            "timeout": ["timeout", "超時", "timed out", "deadline"],
                            "latency": ["latency", "延遲", "slow", "慢", "p99", "p95"],
                            "memory": ["memory", "記憶體", "oom", "heap", "內存"],
                            "cpu": ["cpu", "處理器", "high load", "負載"],
                            "network": ["network", "網路", "dns", "connection refused"],
                            "connection": ["connection", "連線", "socket", "tcp"],
                            "disk": ["disk", "磁碟", "storage", "io", "iops"],
                            "database": ["database", "資料庫", "db", "query", "deadlock"],
                            "pod": ["pod", "container", "restart", "crashloop"],
                            "scaling": ["scale", "擴容", "replica", "hpa"],
                            "error": ["error", "錯誤", "exception", "fail"],
                            "config": ["config", "配置", "env", "secret"],
                        }
                        for theme, keywords in theme_keywords.items():
                            if any(kw in notes_lower for kw in keywords):
                                themes[theme] = themes.get(theme, 0) + 1

                sorted_themes = sorted(themes.items(), key=lambda x: x[1], reverse=True)[:5]
                common_themes = [t[0] for t in sorted_themes]

                total = positive + neutral + negative

                logger.info(
                    "stats_feedback_summary",
                    total=total,
                    positive=positive,
                    negative=negative,
                    days=days,
                )

                return {
                    "total_feedback": total,
                    "positive_count": positive,
                    "neutral_count": neutral,
                    "negative_count": negative,
                    "common_themes": common_themes,
                }

        return await self.get_cached_or_compute(cache_key, compute)


# =============================================================================
# Dependency Injection
# =============================================================================

_stats_service: StatsService | None = None


def get_stats_service() -> StatsService:
    """取得 StatsService 實例 (Singleton)"""
    global _stats_service
    if _stats_service is None:
        _stats_service = StatsService()
    return _stats_service