# awoooi/apps/api/src/services/flywheel_stats_service.py

"""
Flywheel Stats Service — ADR-074 M1 + ADR-073-C C1

飛輪健康度指標計算服務：
  - 供 Prometheus Exporter（M1）抓取
  - 供前端 /api/v1/stats/flywheel 即時顯示（C1）

Metrics:
  awoooi_flywheel_playbook_count          目標 ≥ 20
  awoooi_flywheel_execution_success_rate  目標 ≥ 0.3
  awoooi_flywheel_km_unvectorized_count   目標 = 0
  awoooi_flywheel_alertname_null_rate     目標 = 0
  awoooi_flywheel_incidents_stuck         目標 = 0

2026-04-12 ogt (ADR-074 M1 + ADR-073-C C1)
"""

from __future__ import annotations

import json
from datetime import datetime, timedelta
from typing import Any

import structlog
from sqlalchemy import func, select, text

from src.core.redis_client import get_redis
from src.db.base import get_db_context
from src.db.models import IncidentRecord, KnowledgeEntryRecord
from src.models.incident import IncidentStatus
from src.utils.timezone import now_taipei

logger = structlog.get_logger(__name__)

# Redis key prefix (kept consistent with playbook_repository.py)
_PLAYBOOK_KEY_PREFIX = "playbook:"

# 2026-05-02 ogt + Claude Sonnet 4.6 — Bug 2 fix (false W-3 alert on fresh deploy)
# execution_success_rate needs a minimum sample size to be statistically meaningful.
# With an empty Redis (fresh deploy / restart) total_exec=0 → rate=0.0, and
# watchdog W-3 fired a false alert immediately.
# Fix: report None when total_exec < FLYWHEEL_MIN_SAMPLE; the watchdog skips the
# W-3 check when it sees None.
# TODO: move to settings later (hardcoded for now to keep config changes out of
# the scope of this round)
FLYWHEEL_MIN_SAMPLE = 10

# Names of the six flywheel nodes
FLYWHEEL_NODES = [
    "monitoring",
    "deduplication",
    "diagnosis",
    "reasoning",
    "execution",
    "learning",
]


# =============================================================================
# 核心指標資料結構
# =============================================================================


class FlywheelMetrics:
    """飛輪健康度指標快照"""

    def __init__(
        self,
        playbook_count: int,
        execution_success_rate: float | None,
        km_unvectorized_count: int,
        alertname_null_rate: float,
        incidents_stuck: int,
        today_processed: int,
        flywheel_conversions_today: int,
        km_vectorized_rate: float,
        node_stats: dict[str, Any],
        current_flow: list[dict[str, Any]],
        computed_at: datetime,
    ) -> None:
        # 2026-05-02 ogt + Claude Sonnet 4.6 — Bug 2 修復
        # execution_success_rate 為 None 時表示樣本不足（< FLYWHEEL_MIN_SAMPLE），
        # watchdog W-3 應跳過該檢查，避免 fresh deploy 假告警
        self.playbook_count = playbook_count
        self.execution_success_rate = execution_success_rate
        self.km_unvectorized_count = km_unvectorized_count
        self.alertname_null_rate = alertname_null_rate
        self.incidents_stuck = incidents_stuck
        self.today_processed = today_processed
        self.flywheel_conversions_today = flywheel_conversions_today
        self.km_vectorized_rate = km_vectorized_rate
        self.node_stats = node_stats
        self.current_flow = current_flow
        self.computed_at = computed_at
        self.type4_count: int = 0  # TYPE-4 incidents 數（ADR-073-C C2）

    def to_prometheus_lines(self) -> str:
        """輸出 Prometheus text format"""
        ts = int(self.computed_at.timestamp() * 1000)
        # 2026-05-02 ogt + Claude Opus 4.7 — Bug 2 後續修復（critic P0-1 連鎖修復）
        # sentinel 用 NaN 而非 -1.0：Prometheus 對 NaN 比較永遠回 false，
        # 既有 alert rule `awoooi_flywheel_execution_success_rate < 0.1` 自然不會被
        # sentinel 觸發；同時 Grafana 渲染為「無資料」gap，比 -1 spike 直觀。
        # 前次嘗試 -1.0 會讓 ops/monitoring/alerts.yml:775 等 3 份 prom rule
        # 在 fresh deploy 後 2h 必噴 FlywheelExecutionSuccessLow 假告警，跟 watchdog skip 自相矛盾。
        rate_str = (
            f"{self.execution_success_rate:.4f}"
            if self.execution_success_rate is not None
            else "NaN"
        )
        lines = [
            "# HELP awoooi_flywheel_playbook_count Total approved playbooks in Redis",
            "# TYPE awoooi_flywheel_playbook_count gauge",
            f"awoooi_flywheel_playbook_count {self.playbook_count} {ts}",
            "",
            "# HELP awoooi_flywheel_execution_success_rate Auto-repair success rate (0-1), NaN=insufficient sample",
            "# TYPE awoooi_flywheel_execution_success_rate gauge",
            f"awoooi_flywheel_execution_success_rate {rate_str} {ts}",
            "",
            "# HELP awoooi_flywheel_km_unvectorized_count KM entries not yet vectorized",
            "# TYPE awoooi_flywheel_km_unvectorized_count gauge",
            f"awoooi_flywheel_km_unvectorized_count {self.km_unvectorized_count} {ts}",
            "",
            "# HELP awoooi_flywheel_alertname_null_rate Fraction of incidents with null alertname",
            "# TYPE awoooi_flywheel_alertname_null_rate gauge",
            f"awoooi_flywheel_alertname_null_rate {self.alertname_null_rate:.4f} {ts}",
            "",
            "# HELP awoooi_flywheel_incidents_stuck Incidents stuck in INVESTIGATING > 24h",
            "# TYPE awoooi_flywheel_incidents_stuck gauge",
            f"awoooi_flywheel_incidents_stuck {self.incidents_stuck} {ts}",
            "",
            "# HELP awoooi_flywheel_km_vectorized_rate Fraction of KM entries vectorized",
            "# TYPE awoooi_flywheel_km_vectorized_rate gauge",
            f"awoooi_flywheel_km_vectorized_rate {self.km_vectorized_rate:.4f} {ts}",
        ]
        return "\n".join(lines) + "\n"

    def to_flywheel_api_dict(self) -> dict[str, Any]:
        """輸出 /api/v1/stats/flywheel 格式"""
        return {
            "nodes": self.node_stats,
            "current_flow": self.current_flow,
            "type4_count": self.type4_count,
            "computed_at": self.computed_at.isoformat(),
        }

    def to_summary_api_dict(self) -> dict[str, Any]:
        """輸出 /api/v1/stats/summary 格式"""
        return {
            "playbook_count": self.playbook_count,
            "execution_success_rate": round(self.execution_success_rate, 4) if self.execution_success_rate is not None else None,
            "today_processed": self.today_processed,
            "flywheel_conversions_today": self.flywheel_conversions_today,
            "km_vectorized_rate": round(self.km_vectorized_rate, 4),
            "km_unvectorized_count": self.km_unvectorized_count,
            "alertname_null_rate": round(self.alertname_null_rate, 4),
            "incidents_stuck": self.incidents_stuck,
            "computed_at": self.computed_at.isoformat(),
        }


# =============================================================================
# FlywheelStatsService
# =============================================================================


class FlywheelStatsService:
    """
    Computes the flywheel health metrics.

    ADR-074 M1: scraped by Prometheus via /metrics/flywheel
    ADR-073-C C1: displayed by the frontend via /api/v1/stats/flywheel
    """

    async def compute(self) -> FlywheelMetrics:
        """Compute every flywheel metric in one pass and return the snapshot.

        The three helpers run sequentially. Each one logs its own failures and
        falls back to neutral defaults instead of raising.
        """
        now = now_taipei()

        playbook_count, execution_success_rate = await self._playbook_stats()
        (
            km_unvectorized_count,
            km_vectorized_rate,
            flywheel_conversions_today,
        ) = await self._km_stats(now)
        (
            alertname_null_rate,
            incidents_stuck,
            today_processed,
            node_stats,
            current_flow,
            type4_count,
        ) = await self._incident_stats(now)

        metrics = FlywheelMetrics(
            playbook_count=playbook_count,
            execution_success_rate=execution_success_rate,
            km_unvectorized_count=km_unvectorized_count,
            alertname_null_rate=alertname_null_rate,
            incidents_stuck=incidents_stuck,
            today_processed=today_processed,
            flywheel_conversions_today=flywheel_conversions_today,
            km_vectorized_rate=km_vectorized_rate,
            node_stats=node_stats,
            current_flow=current_flow,
            computed_at=now,
        )
        metrics.type4_count = type4_count
        return metrics

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    async def _playbook_stats(self) -> tuple[int, float | None]:
        """Return ``(approved_playbook_count, execution_success_rate)``.

        The playbook count comes from Redis. The success rate is taken from
        the ``auto_repair_executions`` table when it has any rows, otherwise
        from the success/failure counters stored on the Redis playbooks.

        2026-05-02 ogt + Claude Sonnet 4.6 — Bug 2 fix (false W-3 alert on
        fresh deploy): the rate is ``None`` when fewer than
        FLYWHEEL_MIN_SAMPLE executions are available, so watchdog W-3 can skip
        its check instead of alerting after every restart.

        Returns:
            ``(count, rate)``; ``rate`` is ``None`` when the sample is
            insufficient. Any unexpected failure (e.g. Redis unavailable) is
            logged and reported as ``(0, None)``.
        """
        try:
            redis = get_redis()
            count = 0
            total_exec = 0
            total_success = 0

            async for key in redis.scan_iter(match=f"{_PLAYBOOK_KEY_PREFIX}PB-*", count=200):
                raw = await redis.get(key)
                if not raw:
                    continue
                try:
                    pb = json.loads(raw)
                    approved = pb.get("status", "") == "approved"
                    success_count = int(pb.get("success_count", 0) or 0)
                    failure_count = int(pb.get("failure_count", 0) or 0)
                except (ValueError, KeyError, AttributeError, TypeError):
                    # Malformed entry (invalid JSON, non-object payload or
                    # non-numeric counters): skip only this key rather than
                    # letting it abort the scan and zero out every metric.
                    continue
                # Totals are updated only after the whole entry parsed, so a
                # bad entry never contributes partially.
                if approved:
                    count += 1
                total_exec += success_count + failure_count
                total_success += success_count

            # 2026-05-06 ogt + Codex:
            # The source of truth for the execution success rate is
            # auto_repair_executions. The Redis playbook success_count /
            # failure_count can lag behind when the write-back path breaks,
            # which made governance / heartbeat conclude "the flywheel is not
            # executing".
            # 2026-05-29 Codex:
            # Low traffic over 24h is not a broken data pipeline; when the 24h
            # window does not reach the minimum sample, use the steadier 7d
            # window to avoid long-running FlywheelExecutionRateMissing false
            # positives.
            try:
                async with get_db_context() as db:
                    row = await db.execute(
                        text("""
                            SELECT
                                COUNT(*) FILTER (WHERE success IS TRUE) AS success,
                                COUNT(*) AS total
                            FROM auto_repair_executions
                            WHERE created_at >= NOW() - interval '24 hours'
                        """)
                    )
                    repair_stats = row.one()
                    db_total_exec = int(repair_stats.total or 0)
                    if db_total_exec >= FLYWHEEL_MIN_SAMPLE:
                        db_total_success = int(repair_stats.success or 0)
                        return count, db_total_success / db_total_exec

                    fallback_row = await db.execute(
                        text("""
                            SELECT
                                COUNT(*) FILTER (WHERE success IS TRUE) AS success,
                                COUNT(*) AS total
                            FROM auto_repair_executions
                            WHERE created_at >= NOW() - interval '7 days'
                        """)
                    )
                    fallback_stats = fallback_row.one()
                    fallback_total = int(fallback_stats.total or 0)
                    if fallback_total >= FLYWHEEL_MIN_SAMPLE:
                        fallback_success = int(fallback_stats.success or 0)
                        return count, fallback_success / fallback_total
                    if db_total_exec > 0 or fallback_total > 0:
                        # The table has rows but too few of them: report
                        # "insufficient sample" rather than falling back to
                        # the possibly stale Redis counters.
                        return count, None
            except Exception:
                # Best effort: fall through to the Redis counters, but keep
                # the failure reason in the log (still at warning level).
                logger.warning(
                    "flywheel_stats_auto_repair_execution_query_failed",
                    exc_info=True,
                )

            if total_exec < FLYWHEEL_MIN_SAMPLE:
                # Insufficient sample (including an empty Redis): return None
                # so the caller skips the W-3 alert decision.
                return count, None
            rate = total_success / total_exec
            return count, rate

        except Exception:
            logger.exception("flywheel_stats_playbook_error")
            return 0, None

    async def _km_stats(self, now: datetime) -> tuple[int, float, int]:
        """Return KM vectorization figures and today's conversions (PostgreSQL).

        Args:
            now: Current time; "today" starts at its midnight.

        Returns:
            ``(unvectorized_count, vectorized_rate, conversions_today)``.
            ``vectorized_rate`` is 0.0 when there are no KM entries. Any
            failure is logged and reported as ``(0, 0.0, 0)``.
        """
        try:
            today_start = now.replace(hour=0, minute=0, second=0, microsecond=0)

            async with get_db_context() as db:
                # Unvectorized count (embedding IS NULL = not vectorized).
                # 2026-04-15 ogt: the KnowledgeEntryRecord ORM model does not
                # declare the embedding column (pgvector), so raw SQL is used
                # to avoid an AttributeError.
                unvectorized_q = await db.execute(
                    text("SELECT COUNT(*) FROM knowledge_entries WHERE embedding IS NULL")
                )
                unvectorized = unvectorized_q.scalar_one_or_none() or 0

                # Total number of KM entries
                total_q = await db.execute(select(func.count(KnowledgeEntryRecord.id)))
                total = total_q.scalar_one_or_none() or 0

                vectorized_rate = (total - unvectorized) / total if total > 0 else 0.0

                # Today's conversions (KM entries created today)
                conversions_q = await db.execute(
                    select(func.count()).where(
                        KnowledgeEntryRecord.created_at >= today_start
                    )
                )
                conversions_today = conversions_q.scalar_one_or_none() or 0

            return unvectorized, vectorized_rate, conversions_today

        except Exception:
            logger.exception("flywheel_stats_km_error")
            return 0, 0.0, 0

    async def _incident_stats(
        self, now: datetime
    ) -> tuple[float, int, int, dict[str, Any], list[dict[str, Any]], int]:
        """Return the incident-derived metrics (PostgreSQL).

        Args:
            now: Current time; drives the "today", "last hour" and
                "stuck for more than 24h" cut-offs.

        Returns:
            ``(alertname_null_rate, incidents_stuck, today_processed,
            node_stats, current_flow, type4_count)``. Any failure is logged
            and reported as zeros, every node marked ``"unknown"`` and an
            empty flow.
        """
        try:
            today_start = now.replace(hour=0, minute=0, second=0, microsecond=0)
            stuck_threshold = now - timedelta(hours=24)
            recent_1h = now - timedelta(hours=1)

            async with get_db_context() as db:
                # Fraction of incidents whose alertname is NULL
                total_q = await db.execute(select(func.count(IncidentRecord.incident_id)))
                total = total_q.scalar_one_or_none() or 0

                null_q = await db.execute(
                    select(func.count()).where(IncidentRecord.alertname.is_(None))
                )
                null_count = null_q.scalar_one_or_none() or 0
                alertname_null_rate = null_count / total if total > 0 else 0.0

                # Stuck incidents (INVESTIGATING for more than 24h)
                stuck_q = await db.execute(
                    select(func.count()).where(
                        IncidentRecord.status == IncidentStatus.INVESTIGATING,
                        IncidentRecord.created_at <= stuck_threshold,
                    )
                )
                incidents_stuck = stuck_q.scalar_one_or_none() or 0

                # TYPE-4 incident count (ADR-073-C C2 — used by the frontend
                # hasType4 check)
                # 2026-04-12 ogt
                type4_q = await db.execute(
                    select(func.count()).where(
                        IncidentRecord.notification_type == "TYPE-4",
                        IncidentRecord.status == IncidentStatus.INVESTIGATING,
                    )
                )
                type4_count = type4_q.scalar_one_or_none() or 0

                # Incidents created today
                today_q = await db.execute(
                    select(func.count()).where(
                        IncidentRecord.created_at >= today_start
                    )
                )
                today_processed = today_q.scalar_one_or_none() or 0

                # Node status input: incidents created in the last hour
                recent_q = await db.execute(
                    select(func.count()).where(
                        IncidentRecord.created_at >= recent_1h
                    )
                )
                count_1h = recent_q.scalar_one_or_none() or 0

                # Successful automatic executions today (text match on the
                # serialized outcome column)
                success_q = await db.execute(
                    text(
                        "SELECT COUNT(*) FROM incidents WHERE created_at >= :today"
                        " AND outcome::text LIKE '%execution_success%true%'"
                    ),
                    {"today": today_start},
                )
                exec_success_today = success_q.scalar_one_or_none() or 0

                # Current flow (the 10 most recent active incidents)
                active_q = await db.execute(
                    select(
                        IncidentRecord.incident_id,
                        IncidentRecord.alertname,
                        IncidentRecord.status,
                        IncidentRecord.created_at,
                    )
                    .where(
                        IncidentRecord.status.in_([
                            IncidentStatus.INVESTIGATING.value,
                            IncidentStatus.MITIGATING.value,
                        ])
                    )
                    .order_by(IncidentRecord.created_at.desc())
                    .limit(10)
                )
                active_rows = active_q.fetchall()

            current_flow = [
                {
                    "incident_id": row.incident_id,
                    "alertname": row.alertname or "unknown",
                    "current_node": _status_to_node(row.status),
                    "ts": row.created_at.isoformat() if row.created_at else None,
                }
                for row in active_rows
            ]

            # Only "monitoring" derives its status from data; the other nodes
            # are reported with a fixed "active" status.
            node_stats = {
                "monitoring": {
                    "status": "active" if count_1h > 0 else "idle",
                    "count_1h": count_1h,
                },
                "deduplication": {
                    "status": "active",
                    "dedup_window_min": 30,
                },
                "diagnosis": {
                    "status": "active",
                    "mcp_providers_used": ["k8s", "ssh", "prometheus"],
                },
                "reasoning": {
                    "status": "active",
                    "today_processed": today_processed,
                },
                "execution": {
                    "status": "active",
                    "success_today": exec_success_today,
                },
                "learning": {
                    "status": "active",
                },
            }

            return alertname_null_rate, incidents_stuck, today_processed, node_stats, current_flow, type4_count

        except Exception:
            logger.exception("flywheel_stats_incident_error")
            return 0.0, 0, 0, {n: {"status": "unknown"} for n in FLYWHEEL_NODES}, [], 0


def _status_to_node(status: str) -> str:
    """Map an incident status value to the flywheel node it belongs to.

    Statuses without an explicit mapping are attributed to "reasoning".
    """
    status_node_pairs = (
        (IncidentStatus.INVESTIGATING, "diagnosis"),
        (IncidentStatus.MITIGATING, "execution"),
        (IncidentStatus.RESOLVED, "learning"),
        (IncidentStatus.CLOSED, "learning"),
    )
    node_by_status = {member.value: node for member, node in status_node_pairs}
    if status in node_by_status:
        return node_by_status[status]
    return "reasoning"


# =============================================================================
# DI 工廠
# =============================================================================

# Lazily created, process-wide service instance.
_instance: FlywheelStatsService | None = None


def get_flywheel_stats_service() -> FlywheelStatsService:
    """Return the shared FlywheelStatsService, creating it on first use."""
    global _instance
    if _instance is not None:
        return _instance
    _instance = FlywheelStatsService()
    return _instance