476 lines
20 KiB
Python
476 lines
20 KiB
Python
"""
|
||
Flywheel Stats Service — ADR-074 M1 + ADR-073-C C1
|
||
|
||
飛輪健康度指標計算服務:
|
||
- 供 Prometheus Exporter(M1)抓取
|
||
- 供前端 /api/v1/stats/flywheel 即時顯示(C1)
|
||
|
||
Metrics:
|
||
awoooi_flywheel_playbook_count 目標 ≥ 20
|
||
awoooi_flywheel_execution_success_rate 目標 ≥ 0.3
|
||
awoooi_flywheel_km_unvectorized_count 目標 = 0
|
||
awoooi_flywheel_alertname_null_rate 目標 = 0
|
||
awoooi_flywheel_incidents_stuck 目標 = 0
|
||
|
||
2026-04-12 ogt (ADR-074 M1 + ADR-073-C C1)
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import json
|
||
from datetime import datetime, timedelta
|
||
from typing import Any
|
||
|
||
import structlog
|
||
from sqlalchemy import func, select, text
|
||
|
||
from src.core.redis_client import get_redis
|
||
from src.db.base import get_db_context
|
||
from src.db.models import IncidentRecord, KnowledgeEntryRecord
|
||
from src.models.incident import IncidentStatus
|
||
from src.utils.timezone import now_taipei
|
||
|
||
logger = structlog.get_logger(__name__)
|
||
|
||
# Redis key prefix (kept identical to the one used by playbook_repository.py).
_PLAYBOOK_KEY_PREFIX = "playbook:"

# 2026-05-02 ogt + Claude Sonnet 4.6 — Bug 2 fix (W-3 false alarm on fresh deploy).
# execution_success_rate is only statistically meaningful above a minimum number
# of samples. With an empty Redis (fresh deploy / restart) total_exec=0 produced
# rate=0.0, which tripped watchdog W-3 immediately. Below FLYWHEEL_MIN_SAMPLE
# executions the rate is reported as None and the watchdog skips the W-3 check.
# TODO: move to settings (hard-coded for now to keep config changes out of scope).
FLYWHEEL_MIN_SAMPLE = 10

# Names of the six flywheel nodes.
FLYWHEEL_NODES = [
    "monitoring", "deduplication", "diagnosis",
    "reasoning", "execution", "learning",
]
|
||
|
||
|
||
# =============================================================================
# Core metric data structure
# =============================================================================
|
||
|
||
|
||
class FlywheelMetrics:
    """Snapshot of the flywheel health indicators.

    The snapshot can be rendered as Prometheus text exposition
    (`to_prometheus_lines`) or as the JSON payloads of the stats API
    (`to_flywheel_api_dict`, `to_summary_api_dict`).
    """

    def __init__(
        self,
        playbook_count: int,
        execution_success_rate: float | None,
        km_unvectorized_count: int,
        alertname_null_rate: float,
        incidents_stuck: int,
        today_processed: int,
        flywheel_conversions_today: int,
        km_vectorized_rate: float,
        node_stats: dict[str, Any],
        current_flow: list[dict[str, Any]],
        computed_at: datetime,
        *,
        type4_count: int = 0,
    ) -> None:
        """Store the computed indicators.

        Args:
            playbook_count: Number of approved playbooks.
            execution_success_rate: Auto-repair success rate in [0, 1], or
                None when the sample is too small to be meaningful
                (2026-05-02 Bug 2 fix: watchdog W-3 must skip the check in
                that case to avoid false alarms right after a fresh deploy).
            km_unvectorized_count: Knowledge entries without an embedding.
            alertname_null_rate: Fraction of incidents with a null alertname.
            incidents_stuck: Incidents stuck in INVESTIGATING for > 24h.
            today_processed: Incidents created today.
            flywheel_conversions_today: Knowledge entries created today.
            km_vectorized_rate: Fraction of knowledge entries vectorized.
            node_stats: Per-node status payload for the flywheel API.
            current_flow: Recent active incidents for the flywheel API.
            computed_at: Moment the snapshot was taken.
            type4_count: Number of TYPE-4 incidents (ADR-073-C C2). Defaults
                to 0; the attribute may still be assigned after construction.
        """
        self.playbook_count = playbook_count
        self.execution_success_rate = execution_success_rate
        self.km_unvectorized_count = km_unvectorized_count
        self.alertname_null_rate = alertname_null_rate
        self.incidents_stuck = incidents_stuck
        self.today_processed = today_processed
        self.flywheel_conversions_today = flywheel_conversions_today
        self.km_vectorized_rate = km_vectorized_rate
        self.node_stats = node_stats
        self.current_flow = current_flow
        self.computed_at = computed_at
        self.type4_count: int = type4_count  # TYPE-4 incident count (ADR-073-C C2)

    def to_prometheus_lines(self) -> str:
        """Render the snapshot in Prometheus text format.

        Every sample carries `computed_at` as an epoch-millisecond timestamp.
        """
        ts = int(self.computed_at.timestamp() * 1000)
        # 2026-05-02 ogt + Claude Opus 4.7 — Bug 2 follow-up (critic P0-1).
        # The "insufficient sample" sentinel is NaN rather than -1.0: Prometheus
        # comparisons against NaN are always false, so the existing alert rule
        # `awoooi_flywheel_execution_success_rate < 0.1` is never triggered by
        # the sentinel, and Grafana renders it as a no-data gap instead of a
        # -1 spike. The earlier -1.0 attempt made ops/monitoring/alerts.yml:775
        # and two other rule files fire FlywheelExecutionSuccessLow about 2h
        # after every fresh deploy, contradicting the watchdog skip.
        if self.execution_success_rate is not None:
            rate_str = f"{self.execution_success_rate:.4f}"
        else:
            rate_str = "NaN"

        lines = [
            "# HELP awoooi_flywheel_playbook_count Total approved playbooks in Redis",
            "# TYPE awoooi_flywheel_playbook_count gauge",
            f"awoooi_flywheel_playbook_count {self.playbook_count} {ts}",
            "",
            "# HELP awoooi_flywheel_execution_success_rate Auto-repair success rate (0-1), NaN=insufficient sample",
            "# TYPE awoooi_flywheel_execution_success_rate gauge",
            f"awoooi_flywheel_execution_success_rate {rate_str} {ts}",
            "",
            "# HELP awoooi_flywheel_km_unvectorized_count KM entries not yet vectorized",
            "# TYPE awoooi_flywheel_km_unvectorized_count gauge",
            f"awoooi_flywheel_km_unvectorized_count {self.km_unvectorized_count} {ts}",
            "",
            "# HELP awoooi_flywheel_alertname_null_rate Fraction of incidents with null alertname",
            "# TYPE awoooi_flywheel_alertname_null_rate gauge",
            f"awoooi_flywheel_alertname_null_rate {self.alertname_null_rate:.4f} {ts}",
            "",
            "# HELP awoooi_flywheel_incidents_stuck Incidents stuck in INVESTIGATING > 24h",
            "# TYPE awoooi_flywheel_incidents_stuck gauge",
            f"awoooi_flywheel_incidents_stuck {self.incidents_stuck} {ts}",
            "",
            "# HELP awoooi_flywheel_km_vectorized_rate Fraction of KM entries vectorized",
            "# TYPE awoooi_flywheel_km_vectorized_rate gauge",
            f"awoooi_flywheel_km_vectorized_rate {self.km_vectorized_rate:.4f} {ts}",
        ]
        return "\n".join(lines) + "\n"

    def to_flywheel_api_dict(self) -> dict[str, Any]:
        """Return the payload served by /api/v1/stats/flywheel."""
        return {
            "nodes": self.node_stats,
            "current_flow": self.current_flow,
            "type4_count": self.type4_count,
            "computed_at": self.computed_at.isoformat(),
        }

    def to_summary_api_dict(self) -> dict[str, Any]:
        """Return the payload served by /api/v1/stats/summary.

        Rates are rounded to four decimals; `execution_success_rate` stays
        None when the sample was insufficient.
        """
        success_rate = self.execution_success_rate
        return {
            "playbook_count": self.playbook_count,
            "execution_success_rate": round(success_rate, 4) if success_rate is not None else None,
            "today_processed": self.today_processed,
            "flywheel_conversions_today": self.flywheel_conversions_today,
            "km_vectorized_rate": round(self.km_vectorized_rate, 4),
            "km_unvectorized_count": self.km_unvectorized_count,
            "alertname_null_rate": round(self.alertname_null_rate, 4),
            "incidents_stuck": self.incidents_stuck,
            "computed_at": self.computed_at.isoformat(),
        }
|
||
|
||
|
||
# =============================================================================
# FlywheelStatsService
# =============================================================================
|
||
|
||
|
||
class FlywheelStatsService:
    """Compute the flywheel health indicators.

    ADR-074 M1: scraped by Prometheus through /metrics/flywheel.
    ADR-073-C C1: displayed by the frontend through /api/v1/stats/flywheel.
    """

    async def compute(self) -> FlywheelMetrics:
        """Compute every flywheel indicator in one pass and return the snapshot."""
        now = now_taipei()

        playbook_count, execution_success_rate = await self._playbook_stats()
        (
            km_unvectorized_count,
            km_vectorized_rate,
            flywheel_conversions_today,
        ) = await self._km_stats(now)
        (
            alertname_null_rate,
            incidents_stuck,
            today_processed,
            node_stats,
            current_flow,
            type4_count,
        ) = await self._incident_stats(now)

        metrics = FlywheelMetrics(
            playbook_count=playbook_count,
            execution_success_rate=execution_success_rate,
            km_unvectorized_count=km_unvectorized_count,
            alertname_null_rate=alertname_null_rate,
            incidents_stuck=incidents_stuck,
            today_processed=today_processed,
            flywheel_conversions_today=flywheel_conversions_today,
            km_vectorized_rate=km_vectorized_rate,
            node_stats=node_stats,
            current_flow=current_flow,
            computed_at=now,
        )
        metrics.type4_count = type4_count
        return metrics

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _tally_playbook(raw: str | bytes) -> tuple[bool, int, int] | None:
        """Parse one Redis playbook payload.

        Returns:
            `(is_approved, success_count, failure_count)`, or None when the
            payload is not a JSON object or its counters are not numeric.
            Missing or null counters count as 0.
        """
        try:
            playbook = json.loads(raw)
        except (TypeError, ValueError):  # JSONDecodeError is a ValueError
            return None
        if not isinstance(playbook, dict):
            return None
        try:
            successes = int(playbook.get("success_count") or 0)
            failures = int(playbook.get("failure_count") or 0)
        except (TypeError, ValueError):
            return None
        return playbook.get("status", "") == "approved", successes, failures

    async def _playbook_stats(self) -> tuple[int, float | None]:
        """Return the approved playbook count and the execution success rate.

        The playbook count comes from Redis. The success rate is taken from
        the `auto_repair_executions` table when it holds data, and from the
        Redis playbook counters otherwise.

        2026-05-02 ogt + Claude Sonnet 4.6 — Bug 2 fix (W-3 false alarm on
        fresh deploy): the rate is None when fewer than FLYWHEEL_MIN_SAMPLE
        executions are available, so watchdog W-3 skips the check instead of
        alerting after every restart.

        Returns:
            `(approved_playbook_count, success_rate_or_None)`; `(0, None)` when
            the computation fails.
        """
        try:
            redis = get_redis()
            count = 0
            total_exec = 0
            total_success = 0

            async for key in redis.scan_iter(match=f"{_PLAYBOOK_KEY_PREFIX}PB-*", count=200):
                raw = await redis.get(key)
                if not raw:
                    continue
                tally = self._tally_playbook(raw)
                if tally is None:
                    # Malformed payload: skip it rather than discard every stat.
                    continue
                approved, successes, failures = tally
                if approved:
                    count += 1
                total_exec += successes + failures
                total_success += successes

            # 2026-05-06 ogt + Codex:
            # auto_repair_executions is the source of truth for the success
            # rate. The Redis success_count/failure_count fields fall behind
            # when the write-back chain breaks, which made governance /
            # heartbeat conclude that the flywheel was not executing.
            # 2026-05-29 Codex:
            # Low traffic over 24h is not a broken pipeline; when the 24h
            # window is below the minimum sample, use a 7d window instead so
            # FlywheelExecutionRateMissing does not keep misfiring.
            try:
                async with get_db_context() as db:
                    row = await db.execute(
                        text("""
                            SELECT
                                COUNT(*) FILTER (WHERE success IS TRUE) AS success,
                                COUNT(*) AS total
                            FROM auto_repair_executions
                            WHERE created_at >= NOW() - interval '24 hours'
                        """)
                    )
                    repair_stats = row.one()
                    db_total_exec = int(repair_stats.total or 0)
                    if db_total_exec >= FLYWHEEL_MIN_SAMPLE:
                        db_total_success = int(repair_stats.success or 0)
                        return count, db_total_success / db_total_exec

                    fallback_row = await db.execute(
                        text("""
                            SELECT
                                COUNT(*) FILTER (WHERE success IS TRUE) AS success,
                                COUNT(*) AS total
                            FROM auto_repair_executions
                            WHERE created_at >= NOW() - interval '7 days'
                        """)
                    )
                    fallback_stats = fallback_row.one()
                    fallback_total = int(fallback_stats.total or 0)
                    if fallback_total >= FLYWHEEL_MIN_SAMPLE:
                        fallback_success = int(fallback_stats.success or 0)
                        return count, fallback_success / fallback_total
                    if db_total_exec > 0 or fallback_total > 0:
                        # The table has data but too little of it: report
                        # "insufficient sample" instead of the Redis counters.
                        return count, None
            except Exception:
                # Best effort: fall through to the Redis counters, but keep the
                # traceback so the failure can be diagnosed.
                logger.warning(
                    "flywheel_stats_auto_repair_execution_query_failed",
                    exc_info=True,
                )

            if total_exec < FLYWHEEL_MIN_SAMPLE:
                # Insufficient sample (including an empty Redis): None tells
                # the caller to skip the W-3 alert evaluation.
                return count, None
            return count, total_success / total_exec

        except Exception:
            logger.exception("flywheel_stats_playbook_error")
            return 0, None

    async def _km_stats(self, now: datetime) -> tuple[int, float, int]:
        """Return knowledge-base vectorization figures from the database.

        Args:
            now: Reference time; its midnight is the start of "today".

        Returns:
            `(unvectorized_count, vectorized_rate, conversions_today)`;
            `(0, 0.0, 0)` when the computation fails.
        """
        try:
            today_start = now.replace(hour=0, minute=0, second=0, microsecond=0)

            async with get_db_context() as db:
                # Unvectorized entries (embedding IS NULL).
                # 2026-04-15 ogt: the KnowledgeEntryRecord ORM model does not
                # declare the pgvector `embedding` column, so raw SQL is used
                # to avoid an AttributeError.
                unvectorized_q = await db.execute(
                    text("SELECT COUNT(*) FROM knowledge_entries WHERE embedding IS NULL")
                )
                unvectorized = unvectorized_q.scalar_one_or_none() or 0

                # Total number of entries.
                total_q = await db.execute(select(func.count(KnowledgeEntryRecord.id)))
                total = total_q.scalar_one_or_none() or 0

                vectorized_rate = (total - unvectorized) / total if total > 0 else 0.0

                # Conversions today (knowledge entries created today).
                conversions_q = await db.execute(
                    select(func.count()).where(
                        KnowledgeEntryRecord.created_at >= today_start
                    )
                )
                conversions_today = conversions_q.scalar_one_or_none() or 0

            return unvectorized, vectorized_rate, conversions_today

        except Exception:
            logger.exception("flywheel_stats_km_error")
            return 0, 0.0, 0

    async def _incident_stats(
        self, now: datetime
    ) -> tuple[float, int, int, dict[str, Any], list[dict[str, Any]], int]:
        """Return the incident-derived indicators.

        Args:
            now: Reference time for the "today", 24h and 1h windows.

        Returns:
            `(alertname_null_rate, incidents_stuck, today_processed,
            node_stats, current_flow, type4_count)`. On failure every count is
            zero, the flow is empty and each node reports status "unknown".
        """
        try:
            today_start = now.replace(hour=0, minute=0, second=0, microsecond=0)
            stuck_threshold = now - timedelta(hours=24)
            recent_1h = now - timedelta(hours=1)

            async with get_db_context() as db:
                # Fraction of incidents with a null alertname.
                total_q = await db.execute(select(func.count(IncidentRecord.incident_id)))
                total = total_q.scalar_one_or_none() or 0

                null_q = await db.execute(
                    select(func.count()).where(IncidentRecord.alertname.is_(None))
                )
                null_count = null_q.scalar_one_or_none() or 0
                alertname_null_rate = null_count / total if total > 0 else 0.0

                # Stuck incidents (INVESTIGATING and created more than 24h ago).
                # NOTE(review): the count queries compare against the enum
                # member while the active-flow query below uses `.value`;
                # presumably both bind to the same column value — confirm.
                stuck_q = await db.execute(
                    select(func.count()).where(
                        IncidentRecord.status == IncidentStatus.INVESTIGATING,
                        IncidentRecord.created_at <= stuck_threshold,
                    )
                )
                incidents_stuck = stuck_q.scalar_one_or_none() or 0

                # TYPE-4 incidents (ADR-073-C C2 — feeds the frontend hasType4
                # check). 2026-04-12 ogt
                type4_q = await db.execute(
                    select(func.count()).where(
                        IncidentRecord.notification_type == "TYPE-4",
                        IncidentRecord.status == IncidentStatus.INVESTIGATING,
                    )
                )
                type4_count = type4_q.scalar_one_or_none() or 0

                # Incidents created today.
                today_q = await db.execute(
                    select(func.count()).where(
                        IncidentRecord.created_at >= today_start
                    )
                )
                today_processed = today_q.scalar_one_or_none() or 0

                # Incidents created in the last hour (drives the monitoring node).
                recent_q = await db.execute(
                    select(func.count()).where(
                        IncidentRecord.created_at >= recent_1h
                    )
                )
                count_1h = recent_q.scalar_one_or_none() or 0

                # Successful automatic executions today, matched textually on
                # the serialized `outcome` column.
                success_q = await db.execute(
                    text(
                        "SELECT COUNT(*) FROM incidents WHERE created_at >= :today"
                        " AND outcome::text LIKE '%execution_success%true%'"
                    ),
                    {"today": today_start},
                )
                exec_success_today = success_q.scalar_one_or_none() or 0

                # Current flow: the 10 most recent active incidents.
                active_q = await db.execute(
                    select(
                        IncidentRecord.incident_id,
                        IncidentRecord.alertname,
                        IncidentRecord.status,
                        IncidentRecord.created_at,
                    )
                    .where(
                        IncidentRecord.status.in_([
                            IncidentStatus.INVESTIGATING.value,
                            IncidentStatus.MITIGATING.value,
                        ])
                    )
                    .order_by(IncidentRecord.created_at.desc())
                    .limit(10)
                )
                active_rows = active_q.fetchall()

            current_flow = [
                {
                    "incident_id": row.incident_id,
                    "alertname": row.alertname or "unknown",
                    "current_node": _status_to_node(row.status),
                    "ts": row.created_at.isoformat() if row.created_at else None,
                }
                for row in active_rows
            ]

            # Only the monitoring node derives its status from data; the other
            # nodes are reported as "active" unconditionally.
            node_stats = {
                "monitoring": {
                    "status": "active" if count_1h > 0 else "idle",
                    "count_1h": count_1h,
                },
                "deduplication": {
                    "status": "active",
                    "dedup_window_min": 30,
                },
                "diagnosis": {
                    "status": "active",
                    "mcp_providers_used": ["k8s", "ssh", "prometheus"],
                },
                "reasoning": {
                    "status": "active",
                    "today_processed": today_processed,
                },
                "execution": {
                    "status": "active",
                    "success_today": exec_success_today,
                },
                "learning": {
                    "status": "active",
                },
            }

            return alertname_null_rate, incidents_stuck, today_processed, node_stats, current_flow, type4_count

        except Exception:
            logger.exception("flywheel_stats_incident_error")
            return 0.0, 0, 0, {n: {"status": "unknown"} for n in FLYWHEEL_NODES}, [], 0
|
||
|
||
|
||
def _status_to_node(status: str) -> str:
    """Translate an incident status value into the flywheel node name.

    INVESTIGATING maps to "diagnosis", MITIGATING to "execution", RESOLVED and
    CLOSED to "learning"; any other status maps to "reasoning".
    """
    node_by_status = dict(
        (
            (IncidentStatus.INVESTIGATING.value, "diagnosis"),
            (IncidentStatus.MITIGATING.value, "execution"),
            (IncidentStatus.RESOLVED.value, "learning"),
            (IncidentStatus.CLOSED.value, "learning"),
        )
    )
    try:
        return node_by_status[status]
    except KeyError:
        return "reasoning"
|
||
|
||
|
||
# =============================================================================
# DI factory
# =============================================================================
|
||
|
||
# Process-wide service instance, created on first request.
_instance: FlywheelStatsService | None = None


def get_flywheel_stats_service() -> FlywheelStatsService:
    """Return the shared FlywheelStatsService, creating it on first use."""
    global _instance
    service = _instance
    if service is None:
        service = FlywheelStatsService()
        _instance = service
    return service
|