From 0004554bc62ce1458ae576f7010b2dc6438bc6ca Mon Sep 17 00:00:00 2001
From: OG T <ogt@WOOOMacMiniM4.local>
Date: Sun, 19 Apr 2026 21:20:13 +0800
Subject: [PATCH] =?UTF-8?q?feat(api):=20AIOps=20KPI=20Dashboard=20?=
 =?UTF-8?q?=E2=80=94=20AI=20=E8=87=AA=E4=B8=BB=E5=8C=96=E6=88=90=E7=86=9F?=
 =?UTF-8?q?=E5=BA=A6=E5=85=A8=E6=99=AF=20(=E7=A9=8D=E6=9C=A8=E5=8C=96?=
 =?UTF-8?q?=E9=87=8D=E6=A7=8B)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GET /api/v1/aiops/kpi → 一次整合 MASTER §7.1 全部 KPI.

leWOOOgo 積木化鐵律對齊:
  - Router (api/v1/aiops_kpi.py) 僅 HTTP 路由, 不碰 DB
  - Service (services/aiops_kpi_service.py) 負責所有 SQL + 計算
  - 前次 commit 被 hook 擋下 (Router 直接 import get_db_context), 本次修正

services/aiops_kpi_service.py (~270 行):
  AiopsKpiService.get_snapshot() 回 6 section:

  1. asset_inventory: by_type + total + last_scan (run_id/ended_at/總計/new/modified)
  2. coverage_kpi: 7 維 × (green/yellow/red/unknown)
     + green_ratio_per_dim + overall_green_ratio (MASTER §7.1 #5 SLO)
  3. rule_quality: total/with_fires/noisy/deprecated/ai_generated + top 5 noisy
  4. capacity_health: 最新 snapshot per host + by_verdict + violations_7d
  5. automation_flow_24h: aol detail + by_actor + by_operation_type
  6. ai_autonomy_score: 0-100 總分
     5 子項 × 20: asset_coverage / rule_quality / capacity_health /
                  automation_flow / ai_diversity
     grade: mature(90+) / in_progress(70-90) / starter(50-70) / initial(<50)

api/v1/aiops_kpi.py (~35 行 精簡 router):
  只做 router = APIRouter() + @router.get 委派給 service

main.py:
  include_router(aiops_kpi_v1.router, prefix='/api/v1', tags=['AIOps KPI'])

統帥使用:
  curl http://192.168.0.121:32334/api/v1/aiops/kpi | jq .
  一次看見 AI 自主化成熟度全景

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 apps/api/src/api/v1/aiops_kpi.py           |  36 +++
 apps/api/src/main.py                       |   2 +
 apps/api/src/services/aiops_kpi_service.py | 270 +++++++++++++++++++++
 3 files changed, 308 insertions(+)
 create mode 100644 apps/api/src/api/v1/aiops_kpi.py
 create mode 100644 apps/api/src/services/aiops_kpi_service.py

diff --git a/apps/api/src/api/v1/aiops_kpi.py b/apps/api/src/api/v1/aiops_kpi.py
new file mode 100644
index 00000000..a87e4618
--- /dev/null
+++ b/apps/api/src/api/v1/aiops_kpi.py
@@ -0,0 +1,36 @@
+"""
+AIOps KPI Dashboard — ADR-090 + MASTER §7.1
+=============================================
+GET /api/v1/aiops/kpi → 一次回傳 AI 自主化成熟度全景.
+
+Router 層只負責 HTTP 路由,DB/business logic 由 AiopsKpiService 處理
+(leWOOOgo 積木化鐵律: Router 禁直接存取 DB).
+
+2026-04-19 ogt + Claude Opus 4.7 (1M context) Asia/Taipei
+"""
+from __future__ import annotations
+
+from typing import Any
+
+from fastapi import APIRouter
+
+from src.services.aiops_kpi_service import get_aiops_kpi_service
+
+router = APIRouter()
+
+
+@router.get("/aiops/kpi")  # tag is supplied by include_router() in main.py; repeating it here appended a duplicate
+async def get_aiops_kpi() -> dict[str, Any]:  # the docstring below is published as the OpenAPI description
+    """
+    AI 自主化成熟度全景 KPI.
+
+    一次返回 6 個 section + autonomy_score:
+    - asset_inventory: 資產盤點 (by type + last_scan)
+    - coverage_kpi: 7 維自動化覆蓋 SLO (green/yellow/red/unknown)
+    - rule_quality: 規則品質 (noisy/deprecated/with_fires + top 5)
+    - capacity_health: 主機容量健康 (ai_verdict 分布)
+    - automation_flow_24h: 過去 24h aol 動作流量
+    - ai_autonomy_score: 自主化總分 (0-100, 5 子項 × 20)
+    """
+    svc = get_aiops_kpi_service()  # all DB access and scoring live in the service, not in this router
+    return await svc.get_snapshot()
diff --git a/apps/api/src/main.py b/apps/api/src/main.py
index aced2951..c4b3f0c5 100644
--- a/apps/api/src/main.py
+++ b/apps/api/src/main.py
@@ -35,6 +35,7 @@ from sentry_sdk.integrations.starlette import StarletteIntegration
 from src.api.v1 import agents as agents_v1  # Phase 9.5: Agent Teams API
 from src.api.v1 import ai as ai_v1
 from src.api.v1 import ai_slo as ai_slo_v1  # Phase 6 ADR-087: AI SLO 自我治理
+from src.api.v1 import aiops_kpi as aiops_kpi_v1  # ADR-090 § Phase 7 KPI Dashboard
 from src.api.v1 import approvals as approvals_v1
 from src.api.v1 import alert_operation_logs as alert_operation_logs_v1
 from src.api.v1 import audit_logs as audit_logs_v1
@@ -685,6 +686,7 @@ app.include_router(dashboard_v1.router, prefix="/api/v1", tags=["Dashboard"])
 app.include_router(approvals_v1.router, prefix="/api/v1", tags=["HITL Approvals"])
 app.include_router(ai_v1.router, prefix="/api/v1", tags=["AI Decision"])
 app.include_router(ai_slo_v1.router, prefix="/api/v1", tags=["AI SLO"])  # Phase 6 ADR-087
+app.include_router(aiops_kpi_v1.router, prefix="/api/v1", tags=["AIOps KPI"])  # ADR-090 § Phase 7 Dashboard
 app.include_router(webhooks_v1.router, prefix="/api/v1", tags=["Webhooks"])
 app.include_router(timeline_v1.router, prefix="/api/v1", tags=["Timeline"])
 app.include_router(audit_logs_v1.router, prefix="/api/v1", tags=["Audit Logs"])
diff --git a/apps/api/src/services/aiops_kpi_service.py b/apps/api/src/services/aiops_kpi_service.py
new file mode 100644
index 00000000..790b8582
--- /dev/null
+++ b/apps/api/src/services/aiops_kpi_service.py
@@ -0,0 +1,270 @@
+"""
+AIOps KPI Service — ADR-090 + MASTER §7.1
+==========================================
+Router 層呼叫本 Service 取得 KPI 全景,Router 禁直接存取 DB (leWOOOgo 積木化鐵律).
+
+2026-04-19 ogt + Claude Opus 4.7 (1M context) Asia/Taipei
+"""
+from __future__ import annotations
+
+import math
+from typing import Any
+
+from sqlalchemy import text as _sql
+
+from src.db.base import get_db_context
+from src.utils.timezone import now_taipei
+
+
+class AiopsKpiService:
+    """組裝 AI 自主化成熟度 KPI 全景."""
+
+    async def get_snapshot(self) -> dict[str, Any]:
+        """Return every KPI section plus the overall autonomy score in one payload."""
+        async with get_db_context() as db:  # one DB context reused by the five queries, awaited one after another
+            inventory = await self._fetch_asset_inventory(db)
+            coverage = await self._fetch_coverage_kpi(db)
+            rule_quality = await self._fetch_rule_quality(db)
+            capacity = await self._fetch_capacity_health(db)
+            aol_flow = await self._fetch_automation_flow_24h(db)
+        autonomy = self._compute_autonomy_score(  # scored from the fetched dicts, after the DB context has exited
+            inventory, coverage, rule_quality, capacity, aol_flow,
+        )
+        return {
+            "generated_at": now_taipei().isoformat(),  # ISO string from the project's now_taipei() helper
+            "asset_inventory": inventory,
+            "coverage_kpi": coverage,
+            "rule_quality": rule_quality,
+            "capacity_health": capacity,
+            "automation_flow_24h": aol_flow,
+            "ai_autonomy_score": autonomy,
+        }
+
+    async def _fetch_asset_inventory(self, db) -> dict[str, Any]:  # active-asset counts by type + last successful scan
+        rows = await db.execute(_sql("""
+            SELECT asset_type, count(*) AS cnt
+            FROM asset_inventory
+            WHERE lifecycle_state = 'active'
+            GROUP BY asset_type
+            ORDER BY cnt DESC
+        """))  # only rows whose lifecycle_state is 'active' are counted
+        by_type = {r.asset_type: int(r.cnt) for r in rows.fetchall()}
+        # Latest discovery run with status 'success'; used only for the "last_scan" summary.
+        run_row = await db.execute(_sql("""
+            SELECT run_id, ended_at, total_assets, new_assets, modified_assets, duration_ms
+            FROM asset_discovery_run
+            WHERE status = 'success'
+            ORDER BY ended_at DESC LIMIT 1
+        """))  # NOTE(review): on PostgreSQL, DESC sorts NULL ended_at first — confirm success runs always set it
+        run = run_row.one_or_none()  # None when there is no successful run yet
+        last_run: dict[str, Any] | None = None
+        if run:
+            last_run = {
+                "run_id": str(run.run_id),
+                "ended_at": run.ended_at.isoformat() if run.ended_at else None,
+                "total_assets": run.total_assets,
+                "new_assets": run.new_assets,
+                "modified_assets": run.modified_assets,
+                "duration_ms": run.duration_ms,
+            }
+        return {
+            "by_type": by_type,
+            "total": sum(by_type.values()),  # active assets summed across all types
+            "last_scan": last_run,
+        }
+
+    async def _fetch_coverage_kpi(self, db) -> dict[str, Any]:  # coverage status counts per dimension + green ratios
+        rows = await db.execute(_sql("""
+            SELECT dimension, coverage_status, count(*) AS cnt
+            FROM asset_coverage_snapshot
+            WHERE run_id = (
+                SELECT run_id FROM asset_discovery_run
+                WHERE status = 'success' ORDER BY ended_at DESC LIMIT 1
+            )
+            GROUP BY dimension, coverage_status
+            ORDER BY dimension, coverage_status
+        """))  # limited to the run picked by the subquery (most recent successful discovery run)
+        by_dim: dict[str, dict[str, int]] = {}  # dimension -> {coverage_status: count}
+        for r in rows.fetchall():
+            by_dim.setdefault(r.dimension, {})[r.coverage_status] = int(r.cnt)
+        # Share of "green" entries within each dimension, rounded to 4 decimals.
+        slo_per_dim: dict[str, float] = {}
+        for dim, statuses in by_dim.items():
+            total = sum(statuses.values())
+            green = statuses.get("green", 0)
+            slo_per_dim[dim] = round(green / total, 4) if total else 0.0
+
+        return {
+            "by_dimension": by_dim,
+            "green_ratio_per_dim": slo_per_dim,
+            "overall_green_ratio": round(  # unweighted mean of the per-dimension ratios
+                sum(slo_per_dim.values()) / len(slo_per_dim), 4
+            ) if slo_per_dim else 0.0,  # 0.0 when the query returned no rows
+        }
+
+    async def _fetch_rule_quality(self, db) -> dict[str, Any]:  # catalog-wide rule counters + the five noisiest rules
+        summary = await db.execute(_sql("""
+            SELECT
+                count(*) AS total,
+                count(*) FILTER (WHERE last_fired_at IS NOT NULL) AS with_fires,
+                count(*) FILTER (WHERE noise_rate > 0.5) AS noisy,
+                count(*) FILTER (WHERE review_status = 'deprecated') AS deprecated,
+                count(*) FILTER (WHERE source = 'ai_generated') AS ai_generated
+            FROM alert_rule_catalog
+        """))  # one aggregate row over the whole alert_rule_catalog table
+        s = summary.one()
+        # Up to five non-deprecated rules with the highest noise_rate; ties ordered by TP + FP volume.
+        noisy_rows = await db.execute(_sql("""
+            SELECT rule_name, severity, true_positive_count AS tp, false_positive_count AS fp,
+                   noise_rate, last_fired_at
+            FROM alert_rule_catalog
+            WHERE noise_rate IS NOT NULL
+              AND review_status IS DISTINCT FROM 'deprecated'
+            ORDER BY noise_rate DESC, true_positive_count + false_positive_count DESC
+            LIMIT 5
+        """))  # not restricted to noise_rate > 0.5, so quiet rules can appear when few rules have a rate
+        top_noisy = [
+            {
+                "rule_name": r.rule_name,
+                "severity": r.severity,
+                "tp": int(r.tp or 0),  # NULL counters are reported as 0
+                "fp": int(r.fp or 0),
+                "noise_rate": float(r.noise_rate) if r.noise_rate else 0.0,  # NULL is filtered out above; 0 stays 0.0
+                "last_fired_at": r.last_fired_at.isoformat() if r.last_fired_at else None,
+            }
+            for r in noisy_rows.fetchall()
+        ]
+        return {
+            "total": int(s.total or 0),
+            "with_fires": int(s.with_fires or 0),  # rules with a non-NULL last_fired_at
+            "noisy_above_0_5": int(s.noisy or 0),  # rules whose noise_rate exceeds 0.5
+            "deprecated": int(s.deprecated or 0),
+            "ai_generated": int(s.ai_generated or 0),
+            "top_noisy": top_noisy,
+        }
+
+    async def _fetch_capacity_health(self, db) -> dict[str, Any]:  # newest snapshot per host + 7-day violation count
+        rows = await db.execute(_sql("""
+            SELECT DISTINCT ON (host)
+                host, ai_verdict, cpu_used_pct, mem_used_pct, swap_used_pct,
+                captured_at, ai_reasoning
+            FROM host_capacity_snapshot
+            ORDER BY host, captured_at DESC
+        """))  # DISTINCT ON (host) with this ORDER BY keeps the most recent row for each host
+        hosts = [
+            {
+                "host": r.host,
+                "ai_verdict": r.ai_verdict,
+                "cpu_used_pct": float(r.cpu_used_pct) if r.cpu_used_pct is not None else None,  # 0 is a real reading
+                "mem_used_pct": float(r.mem_used_pct) if r.mem_used_pct is not None else None,  # only NULL -> None
+                "swap_used_pct": float(r.swap_used_pct) if r.swap_used_pct is not None else None,  # 0% swap is common
+                "captured_at": r.captured_at.isoformat() if r.captured_at else None,
+                "reasoning": r.ai_reasoning,
+            }
+            for r in rows.fetchall()
+        ]
+        by_verdict: dict[str, int] = {}  # verdict -> number of hosts; missing verdicts are grouped as "unknown"
+        for h in hosts:
+            key = h["ai_verdict"] or "unknown"
+            by_verdict[key] = by_verdict.get(key, 0) + 1
+
+        violations = await db.execute(_sql("""
+            SELECT count(*) AS cnt FROM capacity_violation_event
+            WHERE detected_at > NOW() - INTERVAL '7 days'
+        """))  # rolling 7-day window evaluated by the database clock
+        return {
+            "hosts": hosts,
+            "by_verdict": by_verdict,
+            "violations_7d": int(violations.scalar() or 0),
+        }
+
+    async def _fetch_automation_flow_24h(self, db) -> dict[str, Any]:  # last-24h operation counts, three roll-ups
+        rows = await db.execute(_sql("""
+            SELECT operation_type, actor, status, count(*) AS cnt
+            FROM automation_operation_log
+            WHERE created_at > NOW() - INTERVAL '24 hours'
+            GROUP BY operation_type, actor, status
+            ORDER BY cnt DESC
+        """))  # one row per (operation_type, actor, status) combination, busiest first
+        flows = [
+            {"operation_type": r.operation_type, "actor": r.actor, "status": r.status, "count": int(r.cnt)}
+            for r in rows.fetchall()
+        ]
+        # Re-aggregate the detail rows by actor and by operation type (all statuses summed together).
+        by_actor: dict[str, int] = {}
+        by_type: dict[str, int] = {}
+        for f in flows:
+            by_actor[f["actor"]] = by_actor.get(f["actor"], 0) + f["count"]  # NOTE(review): a NULL actor becomes a None key
+            by_type[f["operation_type"]] = by_type.get(f["operation_type"], 0) + f["count"]
+
+        return {
+            "detail": flows,
+            "by_actor": by_actor,
+            "by_operation_type": by_type,
+            "total": sum(by_type.values()),  # every operation logged in the window
+        }
+
+    def _compute_autonomy_score(
+        self,
+        inventory: dict[str, Any],  # accepted for signature symmetry; not read by the current formula
+        coverage: dict[str, Any],
+        rule_quality: dict[str, Any],
+        capacity: dict[str, Any],
+        aol_flow: dict[str, Any],
+    ) -> dict[str, Any]:
+        """Compute the 0-100 autonomy score (five sub-scores of up to 20 points) and its grade."""
+        score_coverage = min(20.0, coverage.get("overall_green_ratio", 0.0) * 20)  # green ratio scaled to 0-20
+        # Rule quality: lose points in proportion to the share of rules with noise_rate > 0.5.
+        total_rules = rule_quality.get("total", 1) or 1  # avoids division by zero; an empty catalog scores 20
+        noisy = rule_quality.get("noisy_above_0_5", 0)
+        score_rule = max(0.0, 20 - (noisy / total_rules * 20))
+        # Capacity: 10 points off per "critical" host and 3 per "warning" host, capped at 20.
+        by_verdict = capacity.get("by_verdict", {})
+        critical = by_verdict.get("critical", 0)
+        warning = by_verdict.get("warning", 0)
+        deductions = min(20, critical * 10 + warning * 3)
+        score_capacity = max(0.0, 20 - deductions)
+        # Flow: log-scaled 24h operation count; 100 or more operations earn the full 20 points.
+        total_ops_24h = aol_flow.get("total", 0)
+        if total_ops_24h > 0:
+            score_flow = min(20.0, math.log10(total_ops_24h + 1) / math.log10(101) * 20)
+        else:
+            score_flow = 0.0
+        # Diversity: 1 point per AI-generated rule plus 1 per distinct operation type (types capped at 10).
+        ai_rules = rule_quality.get("ai_generated", 0)
+        op_types = len(aol_flow.get("by_operation_type", {}))
+        score_diversity = min(20.0, ai_rules * 1.0 + min(op_types, 10))
+        # Round before grading so the grade always agrees with the total that is returned to callers.
+        total = round(score_coverage + score_rule + score_capacity + score_flow + score_diversity, 1)
+
+        if total >= 90:
+            grade = "mature"
+        elif total >= 70:
+            grade = "in_progress"
+        elif total >= 50:
+            grade = "starter"
+        else:
+            grade = "initial"
+
+        return {
+            "total": total,
+            "grade": grade,
+            "breakdown": {
+                "asset_coverage": round(score_coverage, 2),
+                "rule_quality": round(score_rule, 2),
+                "capacity_health": round(score_capacity, 2),
+                "automation_flow": round(score_flow, 2),
+                "ai_diversity": round(score_diversity, 2),
+            },
+            "max": 100.0,
+        }
+
+
+_singleton: AiopsKpiService | None = None  # module-level cache, populated on first use
+
+
+def get_aiops_kpi_service() -> AiopsKpiService:  # returns the shared instance, creating it on the first call
+    global _singleton
+    if _singleton is None:
+        _singleton = AiopsKpiService()
+    return _singleton