@router.get(
    "/flywheel",
    summary="飛輪六節點即時狀態(ADR-073-C C1)",
    response_model=None,
)
async def get_flywheel_stats(svc: FlywheelStatsDep) -> dict[str, Any]:
    """
    Live status of the six flywheel nodes plus the alerts currently in flight.

    Feeds real data to the front-end flywheel animation component.

    Returns:
        The ``to_flywheel_api_dict()`` rendering of a freshly computed
        metrics snapshot.
    """
    snapshot = await svc.compute()
    payload = snapshot.to_flywheel_api_dict()
    return payload
@router.get(
    "/flywheel/metrics",
    summary="Prometheus 飛輪健康度指標(ADR-074 M1)",
    response_class=PlainTextResponse,
)
async def get_flywheel_prometheus_metrics(svc: FlywheelStatsDep) -> PlainTextResponse:
    """
    Flywheel health indicators in the Prometheus text exposition format.

    Prometheus scrape target: /api/v1/stats/flywheel/metrics

    Metrics:
        awoooi_flywheel_playbook_count
        awoooi_flywheel_execution_success_rate
        awoooi_flywheel_km_unvectorized_count
        awoooi_flywheel_alertname_null_rate
        awoooi_flywheel_incidents_stuck
        awoooi_flywheel_km_vectorized_rate
    """
    snapshot = await svc.compute()
    exposition = snapshot.to_prometheus_lines()
    # The "version=0.0.4" media type is what identifies the payload as the
    # Prometheus text format to the scraper.
    return PlainTextResponse(
        content=exposition,
        media_type="text/plain; version=0.0.4; charset=utf-8",
    )
class FlywheelMetrics:
    """Point-in-time snapshot of the flywheel health indicators.

    The snapshot is a plain value holder; it can be rendered three ways:

    * ``to_prometheus_lines``  - Prometheus text exposition format
    * ``to_flywheel_api_dict`` - payload for ``/api/v1/stats/flywheel``
    * ``to_summary_api_dict``  - payload for ``/api/v1/stats/summary``
    """

    # One row per exported gauge:
    #   (metric name, attribute holding the value, HELP text, format spec)
    # Ratios are rendered with four decimals, counters as plain integers.
    _PROMETHEUS_GAUGES = (
        (
            "awoooi_flywheel_playbook_count",
            "playbook_count",
            "Total approved playbooks in Redis",
            "",
        ),
        (
            "awoooi_flywheel_execution_success_rate",
            "execution_success_rate",
            "Auto-repair success rate (0-1)",
            ".4f",
        ),
        (
            "awoooi_flywheel_km_unvectorized_count",
            "km_unvectorized_count",
            "KM entries not yet vectorized",
            "",
        ),
        (
            "awoooi_flywheel_alertname_null_rate",
            "alertname_null_rate",
            "Fraction of incidents with null alertname",
            ".4f",
        ),
        (
            "awoooi_flywheel_incidents_stuck",
            "incidents_stuck",
            "Incidents stuck in INVESTIGATING > 24h",
            "",
        ),
        (
            "awoooi_flywheel_km_vectorized_rate",
            "km_vectorized_rate",
            "Fraction of KM entries vectorized",
            ".4f",
        ),
    )

    def __init__(
        self,
        playbook_count: int,
        execution_success_rate: float,
        km_unvectorized_count: int,
        alertname_null_rate: float,
        incidents_stuck: int,
        today_processed: int,
        flywheel_conversions_today: int,
        km_vectorized_rate: float,
        node_stats: dict[str, Any],
        current_flow: list[dict[str, Any]],
        computed_at: datetime,
    ) -> None:
        """Store the already-computed indicator values verbatim."""
        self.playbook_count = playbook_count
        self.execution_success_rate = execution_success_rate
        self.km_unvectorized_count = km_unvectorized_count
        self.alertname_null_rate = alertname_null_rate
        self.incidents_stuck = incidents_stuck
        self.today_processed = today_processed
        self.flywheel_conversions_today = flywheel_conversions_today
        self.km_vectorized_rate = km_vectorized_rate
        self.node_stats = node_stats
        self.current_flow = current_flow
        self.computed_at = computed_at

    def to_prometheus_lines(self) -> str:
        """Render the gauges in the Prometheus text exposition format.

        Each metric family is emitted as ``# HELP`` / ``# TYPE`` / sample,
        families are separated by one blank line, and the document ends with
        a newline.

        Samples deliberately carry no explicit timestamp: Prometheus stamps
        them at scrape time, and series that ship their own timestamps are
        excluded from staleness marking.
        """
        families = []
        for metric, attr, help_text, spec in self._PROMETHEUS_GAUGES:
            value = format(getattr(self, attr), spec)
            families.append(
                "\n".join(
                    (
                        f"# HELP {metric} {help_text}",
                        f"# TYPE {metric} gauge",
                        f"{metric} {value}",
                    )
                )
            )
        return "\n\n".join(families) + "\n"

    def to_flywheel_api_dict(self) -> dict[str, Any]:
        """Build the response body for ``/api/v1/stats/flywheel``."""
        return {
            "nodes": self.node_stats,
            "current_flow": self.current_flow,
            "computed_at": self.computed_at.isoformat(),
        }

    def to_summary_api_dict(self) -> dict[str, Any]:
        """Build the response body for ``/api/v1/stats/summary``.

        Ratios are rounded to four decimals; counters are passed through.
        """
        return {
            "playbook_count": self.playbook_count,
            "execution_success_rate": round(self.execution_success_rate, 4),
            "today_processed": self.today_processed,
            "flywheel_conversions_today": self.flywheel_conversions_today,
            "km_vectorized_rate": round(self.km_vectorized_rate, 4),
            "km_unvectorized_count": self.km_unvectorized_count,
            "alertname_null_rate": round(self.alertname_null_rate, 4),
            "incidents_stuck": self.incidents_stuck,
            "computed_at": self.computed_at.isoformat(),
        }
async def _playbook_stats(self) -> tuple[int, float]:
    """Count approved playbooks and derive the execution success rate from Redis.

    Scans every key matching ``playbook:PB-*``, decodes the JSON payload and
    accumulates ``execution_count`` / ``success_count``.

    A single unusable entry (invalid JSON, a payload that is not a JSON
    object, or non-numeric counters) is logged and skipped, so that one bad
    key cannot zero out the metrics for the whole scan.

    Returns:
        ``(approved_count, success_rate)``. The rate is ``0.0`` when no
        executions were recorded. ``(0, 0.0)`` is returned when Redis itself
        fails; the error is logged, not raised.
    """
    try:
        redis = get_redis()
        approved = 0
        total_exec = 0
        total_success = 0

        async for key in redis.scan_iter(match=f"{_PLAYBOOK_KEY_PREFIX}PB-*", count=200):
            raw = await redis.get(key)
            if not raw:
                continue
            try:
                pb = json.loads(raw)
                if not isinstance(pb, dict):
                    raise TypeError("playbook payload is not a JSON object")
                # "or 0" also maps an explicit null counter to zero.
                exec_count = int(pb.get("execution_count") or 0)
                success_count = int(pb.get("success_count") or 0)
            except (ValueError, TypeError):
                # json.JSONDecodeError is a ValueError subclass.
                logger.warning("flywheel_stats_playbook_malformed", key=key)
                continue

            if pb.get("status", "") == "approved":
                approved += 1
            # NOTE(review): counters are tallied for every playbook, not only
            # approved ones - the reviewed diff had lost its indentation, so
            # confirm this matches the intended scoping.
            total_exec += exec_count
            total_success += success_count

        rate = total_success / total_exec if total_exec > 0 else 0.0
        return approved, rate

    except Exception:
        # Best effort: the stats endpoints must stay up when Redis is down.
        logger.exception("flywheel_stats_playbook_error")
        return 0, 0.0
async def _incident_stats(
    self, now: datetime
) -> tuple[float, int, int, dict[str, Any], list[dict[str, Any]]]:
    """Compute the incident-derived indicators in one DB session.

    Args:
        now: Reference time; "today" starts at its midnight, "stuck" means
            created at least 24 hours before it, "recent" means the last hour.

    Returns:
        ``(alertname_null_rate, incidents_stuck, today_processed,
        node_stats, current_flow)``. On any error the failure is logged and
        zeros, ``{"status": "unknown"}`` for every node and an empty flow are
        returned instead of raising.
    """
    # Matches only when the key's own value is true. The previous
    # LIKE '%execution_success%true%' also counted rows such as
    # {"execution_success": false, "notified": true}.
    # NOTE(review): assumes `outcomes` serialises the flag as a JSON boolean
    # (or the string "true") directly after the key - confirm against the schema.
    success_pattern = r'"execution_success"\s*:\s*"?true'

    try:
        today_start = now.replace(hour=0, minute=0, second=0, microsecond=0)
        stuck_threshold = now - timedelta(hours=24)
        recent_1h = now - timedelta(hours=1)

        async with get_db_context() as db:
            # Fraction of incidents whose alertname is NULL.
            total_q = await db.execute(select(func.count(IncidentRecord.id)))
            total = total_q.scalar_one_or_none() or 0

            null_q = await db.execute(
                select(func.count()).where(IncidentRecord.alertname.is_(None))
            )
            null_count = null_q.scalar_one_or_none() or 0
            alertname_null_rate = null_count / total if total > 0 else 0.0

            # Incidents still INVESTIGATING that were created over 24h ago.
            stuck_q = await db.execute(
                select(func.count()).where(
                    IncidentRecord.status == IncidentStatus.INVESTIGATING.value,
                    IncidentRecord.created_at <= stuck_threshold,
                )
            )
            incidents_stuck = stuck_q.scalar_one_or_none() or 0

            # Incidents created today.
            today_q = await db.execute(
                select(func.count()).where(
                    IncidentRecord.created_at >= today_start
                )
            )
            today_processed = today_q.scalar_one_or_none() or 0

            # Incidents created in the last hour (drives the monitoring node).
            recent_q = await db.execute(
                select(func.count()).where(
                    IncidentRecord.created_at >= recent_1h
                )
            )
            count_1h = recent_q.scalar_one_or_none() or 0

            # Incidents created today whose outcomes record a successful execution.
            # The pattern is a bind parameter so its ':' and '\' are never
            # interpreted by SQLAlchemy's text() parser.
            success_q = await db.execute(
                text(
                    "SELECT COUNT(*) FROM incidents WHERE created_at >= :today"
                    " AND outcomes::text ~ :success_pattern"
                ),
                {"today": today_start, "success_pattern": success_pattern},
            )
            exec_success_today = success_q.scalar_one_or_none() or 0

            # Current flow: the 10 most recently created active incidents.
            active_q = await db.execute(
                select(
                    IncidentRecord.incident_id,
                    IncidentRecord.alertname,
                    IncidentRecord.status,
                    IncidentRecord.created_at,
                )
                .where(
                    IncidentRecord.status.in_([
                        IncidentStatus.INVESTIGATING.value,
                        IncidentStatus.MITIGATING.value,
                    ])
                )
                .order_by(IncidentRecord.created_at.desc())
                .limit(10)
            )
            active_rows = active_q.fetchall()

        current_flow = [
            {
                "incident_id": row.incident_id,
                "alertname": row.alertname or "unknown",
                "current_node": _status_to_node(row.status),
                "ts": row.created_at.isoformat() if row.created_at else None,
            }
            for row in active_rows
        ]

        # Only "monitoring" derives its status from data; the other nodes
        # report a fixed "active" status.
        node_stats = {
            "monitoring": {
                "status": "active" if count_1h > 0 else "idle",
                "count_1h": count_1h,
            },
            "deduplication": {
                "status": "active",
                "dedup_window_min": 30,
            },
            "diagnosis": {
                "status": "active",
                "mcp_providers_used": ["k8s", "ssh", "prometheus"],
            },
            "reasoning": {
                "status": "active",
                "today_processed": today_processed,
            },
            "execution": {
                "status": "active",
                "success_today": exec_success_today,
            },
            "learning": {
                "status": "active",
            },
        }

        return alertname_null_rate, incidents_stuck, today_processed, node_stats, current_flow

    except Exception:
        # Best effort: the stats endpoints must stay up when the DB is down.
        logger.exception("flywheel_stats_incident_error")
        return 0.0, 0, 0, {n: {"status": "unknown"} for n in FLYWHEEL_NODES}, []
def get_flywheel_stats_service() -> FlywheelStatsService:
    """Return the shared FlywheelStatsService, creating it on first use.

    Used as the FastAPI dependency factory; the instance is cached in the
    module-level ``_instance`` so later calls return the same object.
    """
    global _instance
    if _instance is not None:
        return _instance
    _instance = FlywheelStatsService()
    return _instance
description: "執行成功率 {{ $value | humanizePercentage }},低於健康基線 10%。" + runbook: "檢查 decision_manager 日誌,確認 target 解析和 SSH MCP 狀態" + + # P0: KM 大量未向量化 → RAG 無法使用歷史案例 + - alert: FlywheelKMVectorizationLow + expr: awoooi_flywheel_km_unvectorized_count > 10 + for: 30m + labels: + severity: warning + alert_category: infrastructure + notification_type: TYPE-3 + annotations: + summary: "飛輪 KM 未向量化數量 > 10" + description: "{{ $value }} 筆 KM 條目尚未向量化,RAG 查詢品質下降。" + runbook: "執行 scripts/batch_vectorize_km.py 或檢查每日 CronJob 狀態" + + # P1: alertname NULL 率異常 + - alert: FlywheelAlertnameNullHigh + expr: awoooi_flywheel_alertname_null_rate > 0.05 + for: 30m + labels: + severity: warning + alert_category: infrastructure + notification_type: TYPE-3 + annotations: + summary: "飛輪 alertname NULL 率超過 5%" + description: "alertname NULL 率 {{ $value | humanizePercentage }},影響路由準確性。" + runbook: "執行 scripts/backfill_alertname.py 回填" + + # P1: Incident 卡住超過 24 小時 + - alert: FlywheelIncidentsStuck + expr: awoooi_flywheel_incidents_stuck > 5 + for: 10m + labels: + severity: warning + alert_category: infrastructure + notification_type: TYPE-3 + annotations: + summary: "{{ $value }} 筆 Incident 卡在 INVESTIGATING 超過 24 小時" + description: "大量 Incident 未推進,可能是決策引擎或 Telegram 通知阻塞。" + + - name: awoooi_host_connectivity + interval: 60s + rules: + + # P0: 主機間網路分區 + - alert: HostNetworkPartition + expr: probe_success{job="host-connectivity"} == 0 + for: 5m + labels: + severity: critical + alert_category: infrastructure + notification_type: TYPE-3 + annotations: + summary: "主機 {{ $labels.instance }} 無法連通" + description: "TCP probe 到 {{ $labels.instance }} 失敗超過 5 分鐘,可能發生網路分區。" + runbook: "SSH 檢查路由和防火牆規則" diff --git a/k8s/monitoring/prometheus.yml b/k8s/monitoring/prometheus.yml index 5f716740..a51efcbf 100644 --- a/k8s/monitoring/prometheus.yml +++ b/k8s/monitoring/prometheus.yml @@ -156,6 +156,37 @@ scrape_configs: service: 'awoooi-api' env: 'prod' + # === ADR-074 M1: 飛輪健康度指標 (2026-04-12 ogt) === + - job_name: 
'awoooi-flywheel' + scrape_interval: 5m + metrics_path: /api/v1/stats/flywheel/metrics + static_configs: + - targets: ['192.168.0.125:32334'] + labels: + host: '125' + service: 'awoooi-flywheel' + env: 'prod' + + # === ADR-074 M2: 主機間網路連通性 (2026-04-12 ogt) === + - job_name: 'host-connectivity' + scrape_interval: 60s + metrics_path: /probe + params: + module: [tcp_connect] + static_configs: + - targets: + - 192.168.0.110:22 + - 192.168.0.188:22 + - 192.168.0.120:6443 + - 192.168.0.121:6443 + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: 192.168.0.188:9115 + # === Sprint 5.2 Plan B: PostgreSQL Exporter (2026-04-08 Claude Sonnet 4.6) === - job_name: 'postgres-exporter' scrape_interval: 30s