# awoooi/apps/api/src/services/aiops_kpi_service.py

"""
AIOps KPI Service — ADR-090 + MASTER §7.1
==========================================
Router 層呼叫本 Service 取得 KPI 全景,Router 禁直接存取 DB (leWOOOgo 積木化鐵律).

2026-04-19 ogt + Claude Opus 4.7 (1M context) Asia/Taipei
"""
from __future__ import annotations

import math
from typing import Any

from sqlalchemy import text as _sql

from src.db.base import get_db_context
from src.utils.timezone import now_taipei


class AiopsKpiService:
    """組裝 AI 自主化成熟度 KPI 全景."""

    async def get_snapshot(self) -> dict[str, Any]:
        """Assemble the complete KPI snapshot in a single call.

        The five sections are fetched one after another through a single
        ``get_db_context()`` block; the autonomy score is computed from them
        after that block has exited.

        Returns:
            A dict with ``generated_at`` (``now_taipei()`` rendered as an
            ISO-8601 string), the five section dicts (``asset_inventory``,
            ``coverage_kpi``, ``rule_quality``, ``capacity_health``,
            ``automation_flow_24h``) and ``ai_autonomy_score``.
        """
        async with get_db_context() as session:
            # Dict displays evaluate their values in order, so the queries
            # still run sequentially on the one session.
            sections: dict[str, Any] = {
                "asset_inventory": await self._fetch_asset_inventory(session),
                "coverage_kpi": await self._fetch_coverage_kpi(session),
                "rule_quality": await self._fetch_rule_quality(session),
                "capacity_health": await self._fetch_capacity_health(session),
                "automation_flow_24h": await self._fetch_automation_flow_24h(session),
            }

        # Insertion order of ``sections`` matches the scorer's positional order.
        score = self._compute_autonomy_score(*sections.values())

        snapshot: dict[str, Any] = {"generated_at": now_taipei().isoformat()}
        snapshot.update(sections)
        snapshot["ai_autonomy_score"] = score
        return snapshot

    async def _fetch_asset_inventory(self, db) -> dict[str, Any]:
        """Count active assets per type and describe the latest successful scan.

        Args:
            db: Session-like object whose awaitable ``execute`` runs the SQL.

        Returns:
            A dict with ``by_type`` (asset type -> number of active assets),
            ``total`` (sum of those counts) and ``last_scan`` (summary of the
            most recently ended successful discovery run, or ``None`` when
            the query returns no row).
        """
        type_result = await db.execute(_sql("""
            SELECT asset_type, count(*) AS cnt
            FROM asset_inventory
            WHERE lifecycle_state = 'active'
            GROUP BY asset_type
            ORDER BY cnt DESC
        """))
        counts: dict[str, int] = {}
        for record in type_result.fetchall():
            counts[record.asset_type] = int(record.cnt)

        scan_result = await db.execute(_sql("""
            SELECT run_id, ended_at, total_assets, new_assets, modified_assets, duration_ms
            FROM asset_discovery_run
            WHERE status = 'success'
            ORDER BY ended_at DESC LIMIT 1
        """))
        latest = scan_result.one_or_none()

        scan_summary: dict[str, Any] | None
        if latest:
            finished = latest.ended_at
            scan_summary = {
                "run_id": str(latest.run_id),
                "ended_at": finished.isoformat() if finished else None,
                "total_assets": latest.total_assets,
                "new_assets": latest.new_assets,
                "modified_assets": latest.modified_assets,
                "duration_ms": latest.duration_ms,
            }
        else:
            scan_summary = None

        return {
            "by_type": counts,
            "total": sum(counts.values()),
            "last_scan": scan_summary,
        }

    async def _fetch_coverage_kpi(self, db) -> dict[str, Any]:
        """Summarise coverage status per dimension for the latest successful run.

        Args:
            db: Session-like object whose awaitable ``execute`` runs the SQL.

        Returns:
            A dict with ``by_dimension`` (dimension -> status -> count),
            ``green_ratio_per_dim`` (share of ``"green"`` entries per
            dimension, rounded to 4 decimals, ``0.0`` when a dimension has no
            entries) and ``overall_green_ratio`` (unweighted mean of the
            per-dimension ratios, rounded to 4 decimals, ``0.0`` when there
            are no dimensions).
        """
        result = await db.execute(_sql("""
            SELECT dimension, coverage_status, count(*) AS cnt
            FROM asset_coverage_snapshot
            WHERE run_id = (
                SELECT run_id FROM asset_discovery_run
                WHERE status = 'success' ORDER BY ended_at DESC LIMIT 1
            )
            GROUP BY dimension, coverage_status
            ORDER BY dimension, coverage_status
        """))

        status_counts: dict[str, dict[str, int]] = {}
        for record in result.fetchall():
            bucket = status_counts.get(record.dimension)
            if bucket is None:
                bucket = status_counts[record.dimension] = {}
            bucket[record.coverage_status] = int(record.cnt)

        green_ratios: dict[str, float] = {}
        for dimension, counts in status_counts.items():
            population = sum(counts.values())
            if population:
                green_ratios[dimension] = round(counts.get("green", 0) / population, 4)
            else:
                # Nothing counted for this dimension: avoid dividing by zero.
                green_ratios[dimension] = 0.0

        if green_ratios:
            overall = round(sum(green_ratios.values()) / len(green_ratios), 4)
        else:
            overall = 0.0

        return {
            "by_dimension": status_counts,
            "green_ratio_per_dim": green_ratios,
            "overall_green_ratio": overall,
        }

    async def _fetch_rule_quality(self, db) -> dict[str, Any]:
        """Summarise the alert-rule catalog and list its noisiest rules.

        Args:
            db: Session-like object whose awaitable ``execute`` runs the SQL.

        Returns:
            Catalog-wide counters (``total``, ``with_fires``,
            ``noisy_above_0_5``, ``deprecated``, ``ai_generated``) plus
            ``top_noisy``: at most five non-deprecated rules with a non-NULL
            noise rate, highest noise rate first.
        """
        counters_result = await db.execute(_sql("""
            SELECT
                count(*) AS total,
                count(*) FILTER (WHERE last_fired_at IS NOT NULL) AS with_fires,
                count(*) FILTER (WHERE noise_rate > 0.5) AS noisy,
                count(*) FILTER (WHERE review_status = 'deprecated') AS deprecated,
                count(*) FILTER (WHERE source = 'ai_generated') AS ai_generated
            FROM alert_rule_catalog
        """))
        counters = counters_result.one()

        noisy_result = await db.execute(_sql("""
            SELECT rule_name, severity, true_positive_count AS tp, false_positive_count AS fp,
                   noise_rate, last_fired_at
            FROM alert_rule_catalog
            WHERE noise_rate IS NOT NULL
              AND review_status IS DISTINCT FROM 'deprecated'
            ORDER BY noise_rate DESC, true_positive_count + false_positive_count DESC
            LIMIT 5
        """))
        worst_offenders: list[dict[str, Any]] = []
        for rule in noisy_result.fetchall():
            fired_at = rule.last_fired_at
            worst_offenders.append({
                "rule_name": rule.rule_name,
                "severity": rule.severity,
                # NULL counters are reported as 0.
                "tp": int(rule.tp or 0),
                "fp": int(rule.fp or 0),
                "noise_rate": float(rule.noise_rate) if rule.noise_rate else 0.0,
                "last_fired_at": fired_at.isoformat() if fired_at else None,
            })

        # Output key -> column alias of the summary query.
        counter_columns = (
            ("total", "total"),
            ("with_fires", "with_fires"),
            ("noisy_above_0_5", "noisy"),
            ("deprecated", "deprecated"),
            ("ai_generated", "ai_generated"),
        )
        report: dict[str, Any] = {
            key: int(getattr(counters, column) or 0)
            for key, column in counter_columns
        }
        report["top_noisy"] = worst_offenders
        return report

    async def _fetch_capacity_health(self, db) -> dict[str, Any]:
        """Report one capacity snapshot per host plus recent violation count.

        The snapshot query keeps a single row per host (``DISTINCT ON (host)``
        ordered by ``captured_at DESC``).

        Args:
            db: Session-like object whose awaitable ``execute`` runs the SQL.

        Returns:
            A dict with ``hosts`` (one entry per host with verdict, CPU /
            memory / swap usage, capture time and reasoning), ``by_verdict``
            (verdict -> number of hosts, NULL or empty verdicts counted under
            ``"unknown"``) and ``violations_7d`` (capacity violation events
            detected in the last 7 days). Usage fields are ``None`` only when
            the stored value is NULL; a stored 0 is reported as ``0.0``.
        """
        rows = await db.execute(_sql("""
            SELECT DISTINCT ON (host)
                host, ai_verdict, cpu_used_pct, mem_used_pct, swap_used_pct,
                captured_at, ai_reasoning
            FROM host_capacity_snapshot
            ORDER BY host, captured_at DESC
        """))
        hosts = [
            {
                "host": r.host,
                "ai_verdict": r.ai_verdict,
                # Compare against None explicitly: a reading of 0 (e.g. unused
                # swap) is falsy and must not be mistaken for a missing value.
                "cpu_used_pct": float(r.cpu_used_pct) if r.cpu_used_pct is not None else None,
                "mem_used_pct": float(r.mem_used_pct) if r.mem_used_pct is not None else None,
                "swap_used_pct": float(r.swap_used_pct) if r.swap_used_pct is not None else None,
                "captured_at": r.captured_at.isoformat() if r.captured_at is not None else None,
                "reasoning": r.ai_reasoning,
            }
            for r in rows.fetchall()
        ]

        by_verdict: dict[str, int] = {}
        for h in hosts:
            key = h["ai_verdict"] or "unknown"
            by_verdict[key] = by_verdict.get(key, 0) + 1

        violations = await db.execute(_sql("""
            SELECT count(*) AS cnt FROM capacity_violation_event
            WHERE detected_at > NOW() - INTERVAL '7 days'
        """))
        return {
            "hosts": hosts,
            "by_verdict": by_verdict,
            "violations_7d": int(violations.scalar() or 0),
        }

    async def _fetch_automation_flow_24h(self, db) -> dict[str, Any]:
        """Aggregate the automation operation log of the last 24 hours.

        Args:
            db: Session-like object whose awaitable ``execute`` runs the SQL.

        Returns:
            A dict with ``detail`` (one entry per operation type / actor /
            status combination, largest count first as ordered by the query),
            ``by_actor`` and ``by_operation_type`` (summed counts) and
            ``total`` (sum over all operations).
        """
        result = await db.execute(_sql("""
            SELECT operation_type, actor, status, count(*) AS cnt
            FROM automation_operation_log
            WHERE created_at > NOW() - INTERVAL '24 hours'
            GROUP BY operation_type, actor, status
            ORDER BY cnt DESC
        """))

        detail: list[dict[str, Any]] = []
        per_actor: dict[str, int] = {}
        per_operation: dict[str, int] = {}
        for record in result.fetchall():
            entry = {
                "operation_type": record.operation_type,
                "actor": record.actor,
                "status": record.status,
                "count": int(record.cnt),
            }
            detail.append(entry)

            occurrences = entry["count"]
            actor = entry["actor"]
            operation = entry["operation_type"]
            per_actor[actor] = per_actor.get(actor, 0) + occurrences
            per_operation[operation] = per_operation.get(operation, 0) + occurrences

        return {
            "detail": detail,
            "by_actor": per_actor,
            "by_operation_type": per_operation,
            "total": sum(per_operation.values()),
        }

    def _compute_autonomy_score(
        self,
        inventory: dict[str, Any],
        coverage: dict[str, Any],
        rule_quality: dict[str, Any],
        capacity: dict[str, Any],
        aol_flow: dict[str, Any],
    ) -> dict[str, Any]:
        """Compute the AI autonomy score (0-100) from five 20-point sub-scores.

        Sub-scores:
            * ``asset_coverage``: ``overall_green_ratio`` scaled to 20.
            * ``rule_quality``: 20 minus the share of noisy rules scaled to
              20; a missing or zero rule total is treated as 1 so the
              division is always defined.
            * ``capacity_health``: 20 minus 10 per ``critical`` host and 3 per
              ``warning`` host, never below 0.
            * ``automation_flow``: logarithmic in the 24 h operation count,
              reaching the full 20 points at 100 operations.
            * ``ai_diversity``: one point per AI-generated rule plus one point
              per distinct operation type (at most 10 from operation types),
              capped at 20.

        Args:
            inventory: Asset inventory section. Part of the signature but not
                read by the current formula.
            coverage: Coverage section; ``overall_green_ratio`` is read.
            rule_quality: Rule section; ``total``, ``noisy_above_0_5`` and
                ``ai_generated`` are read.
            capacity: Capacity section; ``by_verdict`` is read.
            aol_flow: Automation flow section; ``total`` and
                ``by_operation_type`` are read.

        Returns:
            A dict with ``total`` (rounded to 1 decimal), ``grade``
            (``"mature"`` >= 90, ``"in_progress"`` >= 70, ``"starter"`` >= 50,
            otherwise ``"initial"``), ``breakdown`` (sub-scores rounded to 2
            decimals, always floats) and ``max`` (``100.0``).
        """
        max_part = 20.0

        score_coverage = min(max_part, coverage.get("overall_green_ratio", 0.0) * max_part)

        # ``or 1`` also covers an explicit total of 0 (empty catalog).
        total_rules = rule_quality.get("total", 1) or 1
        noisy = rule_quality.get("noisy_above_0_5", 0)
        score_rule = max(0.0, max_part - noisy / total_rules * max_part)

        by_verdict = capacity.get("by_verdict", {})
        critical = by_verdict.get("critical", 0)
        warning = by_verdict.get("warning", 0)
        deductions = min(max_part, critical * 10 + warning * 3)
        # Float arithmetic throughout so the sub-score has one type whether or
        # not any deduction applies.
        score_capacity = max(0.0, max_part - deductions)

        total_ops_24h = aol_flow.get("total", 0)
        if total_ops_24h > 0:
            # log10(n + 1) / log10(101) equals 1.0 at n == 100.
            score_flow = min(
                max_part,
                math.log10(total_ops_24h + 1) / math.log10(101) * max_part,
            )
        else:
            score_flow = 0.0

        ai_rules = rule_quality.get("ai_generated", 0)
        op_types = len(aol_flow.get("by_operation_type", {}))
        score_diversity = min(max_part, ai_rules * 1.0 + min(op_types, 10))

        total = score_coverage + score_rule + score_capacity + score_flow + score_diversity

        # Grade the value that is actually reported: grading the unrounded sum
        # could yield e.g. total 90.0 together with grade "in_progress" when
        # the raw sum is 89.96 or carries floating-point noise just below a
        # threshold.
        reported_total = round(total, 1)
        if reported_total >= 90:
            grade = "mature"
        elif reported_total >= 70:
            grade = "in_progress"
        elif reported_total >= 50:
            grade = "starter"
        else:
            grade = "initial"

        return {
            "total": reported_total,
            "grade": grade,
            "breakdown": {
                "asset_coverage": round(score_coverage, 2),
                "rule_quality": round(score_rule, 2),
                "capacity_health": round(score_capacity, 2),
                "automation_flow": round(score_flow, 2),
                "ai_diversity": round(score_diversity, 2),
            },
            "max": 100.0,
        }


# Module-level cache for the instance handed out by get_aiops_kpi_service();
# stays None until the first call.
_singleton: AiopsKpiService | None = None


def get_aiops_kpi_service() -> AiopsKpiService:
    """Return the shared ``AiopsKpiService``, creating it on first use."""
    global _singleton
    service = _singleton
    if service is None:
        service = AiopsKpiService()
        _singleton = service
    return service