Files
awoooi/apps/api/src/services/flywheel_stats_service.py
Your Name d7db0faa4d
All checks were successful
CD Pipeline / tests (push) Successful in 1m31s
Code Review / ai-code-review (push) Successful in 12s
CD Pipeline / build-and-deploy (push) Successful in 4m5s
CD Pipeline / post-deploy-checks (push) Successful in 1m59s
fix(api): stabilize flywheel success rate window
2026-05-29 11:33:29 +08:00

476 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Flywheel Stats Service — ADR-074 M1 + ADR-073-C C1

Computes the flywheel health metrics:
- scraped by the Prometheus exporter (M1)
- displayed live by the frontend via /api/v1/stats/flywheel (C1)

Metrics:
    awoooi_flywheel_playbook_count           target ≥ 20
    awoooi_flywheel_execution_success_rate   target ≥ 0.3
    awoooi_flywheel_km_unvectorized_count    target = 0
    awoooi_flywheel_alertname_null_rate      target = 0
    awoooi_flywheel_incidents_stuck          target = 0

2026-04-12 ogt (ADR-074 M1 + ADR-073-C C1)
"""
from __future__ import annotations
import json
from datetime import datetime, timedelta
from typing import Any
import structlog
from sqlalchemy import func, select, text
from src.core.redis_client import get_redis
from src.db.base import get_db_context
from src.db.models import IncidentRecord, KnowledgeEntryRecord
from src.models.incident import IncidentStatus
from src.utils.timezone import now_taipei
# Module-level structured logger shared by everything in this file.
logger = structlog.get_logger(__name__)
# Redis key prefix (kept consistent with playbook_repository.py)
_PLAYBOOK_KEY_PREFIX = "playbook:"
# 2026-05-02 ogt + Claude Sonnet 4.6 — Bug 2 fix (W-3 false alert on fresh deploy)
# execution_success_rate needs a minimum number of samples to be statistically meaningful.
# With an empty Redis (fresh deploy / restart) total_exec=0 → rate=0.0 → watchdog W-3
# fired a false alert immediately.
# Fix: return None when total_exec < FLYWHEEL_MIN_SAMPLE; the watchdog treats None as
# "skip the W-3 check".
# TODO: move to settings later (hard-coded for now so this round does not touch config)
FLYWHEEL_MIN_SAMPLE = 10
# Names of the six flywheel nodes
FLYWHEEL_NODES = [
"monitoring",
"deduplication",
"diagnosis",
"reasoning",
"execution",
"learning",
]
# =============================================================================
# 核心指標資料結構
# =============================================================================
class FlywheelMetrics:
    """Snapshot of the flywheel health metrics at one point in time.

    The snapshot can be rendered as Prometheus text exposition
    (``to_prometheus_lines``) or as the JSON payloads of the
    ``/api/v1/stats/flywheel`` and ``/api/v1/stats/summary`` endpoints.
    """

    def __init__(
        self,
        playbook_count: int,
        execution_success_rate: float | None,
        km_unvectorized_count: int,
        alertname_null_rate: float,
        incidents_stuck: int,
        today_processed: int,
        flywheel_conversions_today: int,
        km_vectorized_rate: float,
        node_stats: dict[str, Any],
        current_flow: list[dict[str, Any]],
        computed_at: datetime,
        type4_count: int = 0,
    ) -> None:
        """Store the metric values.

        Args:
            playbook_count: Number of approved playbooks.
            execution_success_rate: Auto-repair success rate in ``[0, 1]``, or
                ``None`` when there are too few samples to compute one.
            km_unvectorized_count: Knowledge entries without an embedding.
            alertname_null_rate: Fraction of incidents whose alertname is NULL.
            incidents_stuck: Incidents stuck in INVESTIGATING for over 24h.
            today_processed: Incidents created today.
            flywheel_conversions_today: Knowledge entries created today.
            km_vectorized_rate: Fraction of knowledge entries with an embedding.
            node_stats: Per-node status payload for the flywheel endpoint.
            current_flow: Recent active incidents for the flywheel endpoint.
            computed_at: Moment the snapshot was computed.
            type4_count: Number of TYPE-4 incidents (ADR-073-C C2). Optional;
                defaults to 0 and may still be assigned after construction.
        """
        # 2026-05-02 ogt + Claude Sonnet 4.6 — Bug 2 fix
        # execution_success_rate is None when the sample is too small
        # (< FLYWHEEL_MIN_SAMPLE); watchdog W-3 should then skip its check so a
        # fresh deploy does not raise a false alert.
        self.playbook_count = playbook_count
        self.execution_success_rate = execution_success_rate
        self.km_unvectorized_count = km_unvectorized_count
        self.alertname_null_rate = alertname_null_rate
        self.incidents_stuck = incidents_stuck
        self.today_processed = today_processed
        self.flywheel_conversions_today = flywheel_conversions_today
        self.km_vectorized_rate = km_vectorized_rate
        self.node_stats = node_stats
        self.current_flow = current_flow
        self.computed_at = computed_at
        # TYPE-4 incident count (ADR-073-C C2)
        self.type4_count: int = type4_count

    def to_prometheus_lines(self) -> str:
        """Render the snapshot in the Prometheus text exposition format.

        Every sample carries ``computed_at`` as a millisecond timestamp.

        Returns:
            The exposition text, terminated by a newline.
        """
        ts = int(self.computed_at.timestamp() * 1000)
        # 2026-05-02 ogt + Claude Opus 4.7 — Bug 2 follow-up (critic P0-1 chain fix)
        # The "insufficient sample" sentinel is NaN rather than -1.0: Prometheus
        # comparisons against NaN are always false, so the existing alert rule
        # `awoooi_flywheel_execution_success_rate < 0.1` is never triggered by
        # the sentinel, and Grafana renders a "no data" gap instead of a -1 spike.
        # The earlier -1.0 attempt made ops/monitoring/alerts.yml:775 and the
        # other Prometheus rule files (3 in total) fire a false
        # FlywheelExecutionSuccessLow alert 2h after every fresh deploy,
        # contradicting the watchdog skip.
        rate_str = (
            f"{self.execution_success_rate:.4f}"
            if self.execution_success_rate is not None
            else "NaN"
        )
        lines = [
            "# HELP awoooi_flywheel_playbook_count Total approved playbooks in Redis",
            "# TYPE awoooi_flywheel_playbook_count gauge",
            f"awoooi_flywheel_playbook_count {self.playbook_count} {ts}",
            "",
            "# HELP awoooi_flywheel_execution_success_rate Auto-repair success rate (0-1), NaN=insufficient sample",
            "# TYPE awoooi_flywheel_execution_success_rate gauge",
            f"awoooi_flywheel_execution_success_rate {rate_str} {ts}",
            "",
            "# HELP awoooi_flywheel_km_unvectorized_count KM entries not yet vectorized",
            "# TYPE awoooi_flywheel_km_unvectorized_count gauge",
            f"awoooi_flywheel_km_unvectorized_count {self.km_unvectorized_count} {ts}",
            "",
            "# HELP awoooi_flywheel_alertname_null_rate Fraction of incidents with null alertname",
            "# TYPE awoooi_flywheel_alertname_null_rate gauge",
            f"awoooi_flywheel_alertname_null_rate {self.alertname_null_rate:.4f} {ts}",
            "",
            "# HELP awoooi_flywheel_incidents_stuck Incidents stuck in INVESTIGATING > 24h",
            "# TYPE awoooi_flywheel_incidents_stuck gauge",
            f"awoooi_flywheel_incidents_stuck {self.incidents_stuck} {ts}",
            "",
            "# HELP awoooi_flywheel_km_vectorized_rate Fraction of KM entries vectorized",
            "# TYPE awoooi_flywheel_km_vectorized_rate gauge",
            f"awoooi_flywheel_km_vectorized_rate {self.km_vectorized_rate:.4f} {ts}",
        ]
        return "\n".join(lines) + "\n"

    def to_flywheel_api_dict(self) -> dict[str, Any]:
        """Return the payload for ``/api/v1/stats/flywheel``."""
        return {
            "nodes": self.node_stats,
            "current_flow": self.current_flow,
            "type4_count": self.type4_count,
            "computed_at": self.computed_at.isoformat(),
        }

    def to_summary_api_dict(self) -> dict[str, Any]:
        """Return the payload for ``/api/v1/stats/summary``.

        Rates are rounded to four decimals; ``execution_success_rate`` stays
        ``None`` when it was not computed.
        """
        return {
            "playbook_count": self.playbook_count,
            "execution_success_rate": (
                round(self.execution_success_rate, 4)
                if self.execution_success_rate is not None
                else None
            ),
            "today_processed": self.today_processed,
            "flywheel_conversions_today": self.flywheel_conversions_today,
            "km_vectorized_rate": round(self.km_vectorized_rate, 4),
            "km_unvectorized_count": self.km_unvectorized_count,
            "alertname_null_rate": round(self.alertname_null_rate, 4),
            "incidents_stuck": self.incidents_stuck,
            "computed_at": self.computed_at.isoformat(),
        }
# =============================================================================
# FlywheelStatsService
# =============================================================================
class FlywheelStatsService:
    """Compute the flywheel health metrics.

    ADR-074 M1: scraped by Prometheus through /metrics/flywheel.
    ADR-073-C C1: displayed by the frontend through /api/v1/stats/flywheel.
    """

    async def compute(self) -> FlywheelMetrics:
        """Compute every flywheel metric once and return the snapshot.

        The playbook, knowledge-entry and incident figures are gathered by
        separate helpers. Each helper catches its own errors and returns
        neutral defaults, so a failure in one source does not abort the rest.
        """
        now = now_taipei()
        playbook_count, execution_success_rate = await self._playbook_stats()
        (
            km_unvectorized_count,
            km_vectorized_rate,
            flywheel_conversions_today,
        ) = await self._km_stats(now)
        (
            alertname_null_rate,
            incidents_stuck,
            today_processed,
            node_stats,
            current_flow,
            type4_count,
        ) = await self._incident_stats(now)
        metrics = FlywheelMetrics(
            playbook_count=playbook_count,
            execution_success_rate=execution_success_rate,
            km_unvectorized_count=km_unvectorized_count,
            alertname_null_rate=alertname_null_rate,
            incidents_stuck=incidents_stuck,
            today_processed=today_processed,
            flywheel_conversions_today=flywheel_conversions_today,
            km_vectorized_rate=km_vectorized_rate,
            node_stats=node_stats,
            current_flow=current_flow,
            computed_at=now,
        )
        metrics.type4_count = type4_count
        return metrics

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    async def _playbook_stats(self) -> tuple[int, float | None]:
        """Return the approved-playbook count and the execution success rate.

        2026-05-02 ogt + Claude Sonnet 4.6 — Bug 2 fix (W-3 false alert on
        fresh deploy): the rate is ``None`` when fewer than
        FLYWHEEL_MIN_SAMPLE executions are available, so watchdog W-3 skips
        its check instead of alerting after every restart.

        The rate comes from ``auto_repair_executions`` when that table has
        rows in the inspected windows; otherwise the Redis playbook counters
        are used.

        Returns:
            ``(playbook_count, success_rate_or_None)``; ``(0, None)`` when the
            Redis scan itself fails.
        """
        try:
            count, total_exec, total_success = await self._scan_redis_playbooks()
            decided, db_rate = await self._db_execution_rate()
            if decided:
                return count, db_rate
            if total_exec < FLYWHEEL_MIN_SAMPLE:
                # Not enough samples (including an empty Redis): return None so
                # the caller skips the W-3 alert evaluation.
                return count, None
            return count, total_success / total_exec
        except Exception:
            logger.exception("flywheel_stats_playbook_error")
            return 0, None

    async def _scan_redis_playbooks(self) -> tuple[int, int, int]:
        """Scan the Redis playbooks and tally approvals and execution counters.

        Entries whose payload is not a JSON object are skipped individually,
        so one corrupt key cannot zero the whole metric.

        Returns:
            ``(approved_count, total_executions, total_successes)``.
        """
        redis = get_redis()
        count = 0
        total_exec = 0
        total_success = 0
        async for key in redis.scan_iter(match=f"{_PLAYBOOK_KEY_PREFIX}PB-*", count=200):
            raw = await redis.get(key)
            if not raw:
                continue
            try:
                pb = json.loads(raw)
            except (ValueError, TypeError):
                # ValueError covers JSONDecodeError and undecodable bytes.
                continue
            if not isinstance(pb, dict):
                continue
            if pb.get("status", "") == "approved":
                count += 1
            # NOTE(review): counters are summed for every playbook regardless of
            # status — confirm this matches the intended nesting.
            try:
                success_count = int(pb.get("success_count") or 0)
                failure_count = int(pb.get("failure_count") or 0)
            except (ValueError, TypeError):
                # Non-numeric counters: ignore this entry's executions only.
                continue
            total_exec += success_count + failure_count
            total_success += success_count
        return count, total_exec, total_success

    async def _db_execution_rate(self) -> tuple[bool, float | None]:
        """Derive the execution success rate from ``auto_repair_executions``.

        2026-05-06 ogt + Codex: auto_repair_executions is the source of truth
        for the success rate. The Redis success_count/failure_count lag behind
        whenever the write-back path breaks, which made governance / heartbeat
        conclude that the flywheel was not executing.
        2026-05-29 Codex: low traffic over 24h is not a broken pipeline; when
        the 24h window is below the minimum sample, a 7d window is used
        instead so FlywheelExecutionRateMissing does not misfire for long
        periods.

        Returns:
            ``(decided, rate)``. ``decided`` is True when the table had rows in
            either window; ``rate`` is then the success ratio, or ``None`` when
            even the 7d window holds fewer than FLYWHEEL_MIN_SAMPLE rows.
            ``(False, None)`` means the table was empty or the query failed,
            and the caller should fall back to the Redis counters.
        """
        try:
            async with get_db_context() as db:
                row = await db.execute(
                    text("""
                        SELECT
                            COUNT(*) FILTER (WHERE success IS TRUE) AS success,
                            COUNT(*) AS total
                        FROM auto_repair_executions
                        WHERE created_at >= NOW() - interval '24 hours'
                    """)
                )
                repair_stats = row.one()
                db_total_exec = int(repair_stats.total or 0)
                if db_total_exec >= FLYWHEEL_MIN_SAMPLE:
                    db_total_success = int(repair_stats.success or 0)
                    return True, db_total_success / db_total_exec
                fallback_row = await db.execute(
                    text("""
                        SELECT
                            COUNT(*) FILTER (WHERE success IS TRUE) AS success,
                            COUNT(*) AS total
                        FROM auto_repair_executions
                        WHERE created_at >= NOW() - interval '7 days'
                    """)
                )
                fallback_stats = fallback_row.one()
                fallback_total = int(fallback_stats.total or 0)
                if fallback_total >= FLYWHEEL_MIN_SAMPLE:
                    fallback_success = int(fallback_stats.success or 0)
                    return True, fallback_success / fallback_total
                if db_total_exec > 0 or fallback_total > 0:
                    return True, None
        except Exception:
            # Best effort: keep going with the Redis counters, but record why
            # the database path was unavailable.
            logger.warning(
                "flywheel_stats_auto_repair_execution_query_failed",
                exc_info=True,
            )
        return False, None

    async def _km_stats(self, now: datetime) -> tuple[int, float, int]:
        """Return knowledge-entry vectorization figures (from PostgreSQL).

        Args:
            now: Reference time; its midnight marks the start of "today".

        Returns:
            ``(unvectorized_count, vectorized_rate, conversions_today)``, or
            ``(0, 0.0, 0)`` when the queries fail.
        """
        try:
            today_start = now.replace(hour=0, minute=0, second=0, microsecond=0)
            async with get_db_context() as db:
                # Unvectorized count (embedding IS NULL = not vectorized).
                # 2026-04-15 ogt: the KnowledgeEntryRecord ORM does not declare
                # the embedding column (pgvector), so raw SQL is used to avoid
                # an AttributeError.
                unvectorized_q = await db.execute(
                    text("SELECT COUNT(*) FROM knowledge_entries WHERE embedding IS NULL")
                )
                unvectorized = unvectorized_q.scalar_one_or_none() or 0
                # Total number of entries
                total_q = await db.execute(select(func.count(KnowledgeEntryRecord.id)))
                total = total_q.scalar_one_or_none() or 0
                vectorized_rate = (total - unvectorized) / total if total > 0 else 0.0
                # Conversions today (knowledge entries created today)
                conversions_q = await db.execute(
                    select(func.count()).where(
                        KnowledgeEntryRecord.created_at >= today_start
                    )
                )
                conversions_today = conversions_q.scalar_one_or_none() or 0
            return unvectorized, vectorized_rate, conversions_today
        except Exception:
            logger.exception("flywheel_stats_km_error")
            return 0, 0.0, 0

    async def _incident_stats(
        self, now: datetime
    ) -> tuple[float, int, int, dict[str, Any], list[dict[str, Any]], int]:
        """Return the incident-derived metrics.

        Args:
            now: Reference time for the "today", "last hour" and "older than
                24h" cut-offs.

        Returns:
            ``(alertname_null_rate, incidents_stuck, today_processed,
            node_stats, current_flow, type4_count)``. On failure every number
            is zero, every node status is ``"unknown"`` and the flow is empty.
        """
        try:
            today_start = now.replace(hour=0, minute=0, second=0, microsecond=0)
            stuck_threshold = now - timedelta(hours=24)
            recent_1h = now - timedelta(hours=1)
            async with get_db_context() as db:
                # alertname NULL rate
                total_q = await db.execute(select(func.count(IncidentRecord.incident_id)))
                total = total_q.scalar_one_or_none() or 0
                null_q = await db.execute(
                    select(func.count()).where(IncidentRecord.alertname.is_(None))
                )
                null_count = null_q.scalar_one_or_none() or 0
                alertname_null_rate = null_count / total if total > 0 else 0.0
                # Stuck incidents (INVESTIGATING for more than 24h)
                stuck_q = await db.execute(
                    select(func.count()).where(
                        IncidentRecord.status == IncidentStatus.INVESTIGATING,
                        IncidentRecord.created_at <= stuck_threshold,
                    )
                )
                incidents_stuck = stuck_q.scalar_one_or_none() or 0
                # TYPE-4 incident count (ADR-073-C C2 — lets the frontend
                # decide hasType4)
                # 2026-04-12 ogt
                type4_q = await db.execute(
                    select(func.count()).where(
                        IncidentRecord.notification_type == "TYPE-4",
                        IncidentRecord.status == IncidentStatus.INVESTIGATING,
                    )
                )
                type4_count = type4_q.scalar_one_or_none() or 0
                # Incidents created today
                today_q = await db.execute(
                    select(func.count()).where(
                        IncidentRecord.created_at >= today_start
                    )
                )
                today_processed = today_q.scalar_one_or_none() or 0
                # Incidents created in the last hour (drives the monitoring node)
                recent_q = await db.execute(
                    select(func.count()).where(
                        IncidentRecord.created_at >= recent_1h
                    )
                )
                count_1h = recent_q.scalar_one_or_none() or 0
                # Successful automatic executions today (text match on outcome)
                success_q = await db.execute(
                    text(
                        "SELECT COUNT(*) FROM incidents WHERE created_at >= :today"
                        " AND outcome::text LIKE '%execution_success%true%'"
                    ),
                    {"today": today_start},
                )
                exec_success_today = success_q.scalar_one_or_none() or 0
                # Current flow (the 10 most recent active incidents)
                active_q = await db.execute(
                    select(
                        IncidentRecord.incident_id,
                        IncidentRecord.alertname,
                        IncidentRecord.status,
                        IncidentRecord.created_at,
                    )
                    .where(
                        IncidentRecord.status.in_([
                            IncidentStatus.INVESTIGATING.value,
                            IncidentStatus.MITIGATING.value,
                        ])
                    )
                    .order_by(IncidentRecord.created_at.desc())
                    .limit(10)
                )
                active_rows = active_q.fetchall()
            current_flow = [
                {
                    "incident_id": row.incident_id,
                    "alertname": row.alertname or "unknown",
                    "current_node": _status_to_node(row.status),
                    "ts": row.created_at.isoformat() if row.created_at else None,
                }
                for row in active_rows
            ]
            # Only "monitoring" derives its status from data; the other nodes
            # report a fixed "active" plus whatever figure is available.
            node_stats = {
                "monitoring": {
                    "status": "active" if count_1h > 0 else "idle",
                    "count_1h": count_1h,
                },
                "deduplication": {
                    "status": "active",
                    "dedup_window_min": 30,
                },
                "diagnosis": {
                    "status": "active",
                    "mcp_providers_used": ["k8s", "ssh", "prometheus"],
                },
                "reasoning": {
                    "status": "active",
                    "today_processed": today_processed,
                },
                "execution": {
                    "status": "active",
                    "success_today": exec_success_today,
                },
                "learning": {
                    "status": "active",
                },
            }
            return (
                alertname_null_rate,
                incidents_stuck,
                today_processed,
                node_stats,
                current_flow,
                type4_count,
            )
        except Exception:
            logger.exception("flywheel_stats_incident_error")
            return 0.0, 0, 0, {n: {"status": "unknown"} for n in FLYWHEEL_NODES}, [], 0
def _status_to_node(status: str) -> str:
    """Map an incident status to the flywheel node that is handling it.

    Args:
        status: Incident status, as the raw string value or as an
            ``IncidentStatus`` member.

    Returns:
        ``"diagnosis"``, ``"execution"`` or ``"learning"`` for the known
        statuses; ``"reasoning"`` for anything else.
    """
    mapping = {
        IncidentStatus.INVESTIGATING.value: "diagnosis",
        IncidentStatus.MITIGATING.value: "execution",
        IncidentStatus.RESOLVED.value: "learning",
        IncidentStatus.CLOSED.value: "learning",
    }
    # The queries in this file compare the status column against both enum
    # members and their `.value` strings, so normalise to the value before
    # the lookup; plain strings pass through unchanged.
    key = getattr(status, "value", status)
    return mapping.get(key, "reasoning")
# =============================================================================
# DI 工廠
# =============================================================================
# Process-wide service instance, created on first request.
_instance: FlywheelStatsService | None = None


def get_flywheel_stats_service() -> FlywheelStatsService:
    """Return the shared FlywheelStatsService, creating it on first use."""
    global _instance
    if _instance is not None:
        return _instance
    _instance = FlywheelStatsService()
    return _instance