feat(adr-074): M1 飛輪健康度 Exporter + M2 主機網路監控
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled

ADR-074 M1:
  - FlywheelStatsService: 計算6項飛輪指標(Playbook數/成功率/KM未向量化數/KM向量化率/alertname NULL率/卡住數)
  - GET /api/v1/stats/flywheel — 六節點即時狀態(C1 前端用)
  - GET /api/v1/stats/summary — KPI 面板數據(C1 前端用)
  - GET /api/v1/stats/flywheel/metrics — Prometheus text format
  - flywheel-alerts.yaml: 5條告警規則(FlywheelPlaybookZero/ExecutionSuccessLow/KMVectorizationLow/AlertnameNullHigh/IncidentsStuck)
  - prometheus.yml: awoooi-flywheel scrape job(5分鐘間隔)

ADR-074 M2:
  - prometheus.yml: host-connectivity Blackbox TCP probe(110:22/188:22/120:6443/121:6443)
  - flywheel-alerts.yaml: HostNetworkPartition 告警規則

597 unit tests passed

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-12 15:31:01 +08:00
parent 4e952ab57f
commit 16d682346a
4 changed files with 586 additions and 1 deletions

View File

@@ -19,14 +19,16 @@
# @see feedback_lewooogo_modular_enforcement.md # @see feedback_lewooogo_modular_enforcement.md
# ============================================================================= # =============================================================================
from typing import Annotated from typing import Annotated, Any
from fastapi import APIRouter, Depends, Query from fastapi import APIRouter, Depends, Query
from fastapi.responses import PlainTextResponse
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from src.services.stats_service import StatsService, get_stats_service from src.services.stats_service import StatsService, get_stats_service
from src.services.k3s_monitor_service import K3sMonitorService, get_k3s_monitor_service from src.services.k3s_monitor_service import K3sMonitorService, get_k3s_monitor_service
from src.services.weekly_report_service import WeeklyReportService, get_weekly_report_service from src.services.weekly_report_service import WeeklyReportService, get_weekly_report_service
from src.services.flywheel_stats_service import FlywheelStatsService, get_flywheel_stats_service
router = APIRouter(prefix="/stats", tags=["Statistics"]) router = APIRouter(prefix="/stats", tags=["Statistics"])
@@ -489,3 +491,64 @@ async def get_disposition_stats() -> DispositionResponse:
import structlog import structlog
structlog.get_logger(__name__).warning("disposition_stats_error", error=str(e)) structlog.get_logger(__name__).warning("disposition_stats_error", error=str(e))
return DispositionResponse(summary=DispositionSummary()) return DispositionResponse(summary=DispositionSummary())
# =============================================================================
# ADR-073-C C1 + ADR-074 M1 — 飛輪健康度 API
# 2026-04-12 ogt
# =============================================================================
FlywheelStatsDep = Annotated[FlywheelStatsService, Depends(get_flywheel_stats_service)]
@router.get(
    "/flywheel",
    response_model=None,
    summary="飛輪六節點即時狀態ADR-073-C C1",
)
async def get_flywheel_stats(svc: FlywheelStatsDep) -> dict[str, Any]:
    """
    Return the live status of the six flywheel nodes plus the alerts
    currently flowing through them.

    A fresh metrics snapshot is computed on every call; the payload feeds
    the front-end flywheel animation component with real data.
    """
    snapshot = await svc.compute()
    payload = snapshot.to_flywheel_api_dict()
    return payload
@router.get(
    "/summary",
    response_model=None,
    summary="飛輪 KPI 摘要ADR-073-C C1",
)
async def get_flywheel_summary(svc: FlywheelStatsDep) -> dict[str, Any]:
    """
    Return the flywheel KPI panel data: playbook count, success rate,
    incidents processed today and KM vectorization rate.

    A fresh metrics snapshot is computed on every call; the payload feeds
    the three KPI cards in the top-right corner of the front end.
    """
    snapshot = await svc.compute()
    payload = snapshot.to_summary_api_dict()
    return payload
@router.get(
    "/flywheel/metrics",
    response_class=PlainTextResponse,
    summary="Prometheus 飛輪健康度指標ADR-074 M1",
)
async def get_flywheel_prometheus_metrics(svc: FlywheelStatsDep) -> PlainTextResponse:
    """
    Expose the flywheel health metrics in Prometheus text format.

    Prometheus scrape target: /api/v1/stats/flywheel/metrics

    Metrics:
        awoooi_flywheel_playbook_count
        awoooi_flywheel_execution_success_rate
        awoooi_flywheel_km_unvectorized_count
        awoooi_flywheel_alertname_null_rate
        awoooi_flywheel_incidents_stuck
        awoooi_flywheel_km_vectorized_rate
    """
    snapshot = await svc.compute()
    exposition = snapshot.to_prometheus_lines()
    # Content type of the Prometheus text exposition format, version 0.0.4.
    return PlainTextResponse(
        content=exposition,
        media_type="text/plain; version=0.0.4; charset=utf-8",
    )

View File

@@ -0,0 +1,386 @@
"""
Flywheel Stats Service — ADR-074 M1 + ADR-073-C C1
飛輪健康度指標計算服務:
- 供 Prometheus Exporter(M1)抓取
- 供前端 /api/v1/stats/flywheel 即時顯示(C1)
Metrics:
awoooi_flywheel_playbook_count 目標 ≥ 20
awoooi_flywheel_execution_success_rate 目標 ≥ 0.3
awoooi_flywheel_km_unvectorized_count 目標 = 0
awoooi_flywheel_alertname_null_rate 目標 = 0
awoooi_flywheel_incidents_stuck 目標 = 0
2026-04-12 ogt (ADR-074 M1 + ADR-073-C C1)
"""
from __future__ import annotations
import json
from datetime import datetime, timedelta
from typing import Any
import structlog
from sqlalchemy import func, select, text
from src.core.redis_client import get_redis
from src.db.base import get_db_context
from src.db.models import IncidentRecord, KnowledgeEntryRecord
from src.models.incident import IncidentStatus
from src.utils.timezone import now_taipei
logger = structlog.get_logger(__name__)
# Redis key prefix(與 playbook_repository.py 一致)
_PLAYBOOK_KEY_PREFIX = "playbook:"
# 飛輪六節點名稱
FLYWHEEL_NODES = [
"monitoring",
"deduplication",
"diagnosis",
"reasoning",
"execution",
"learning",
]
# =============================================================================
# 核心指標資料結構
# =============================================================================
class FlywheelMetrics:
    """Point-in-time snapshot of the flywheel health indicators.

    Holds the scalar gauges, the per-node status mapping and the list of
    currently active incidents, and renders them for the Prometheus
    exporter and for the two JSON endpoints.
    """

    def __init__(
        self,
        playbook_count: int,
        execution_success_rate: float,
        km_unvectorized_count: int,
        alertname_null_rate: float,
        incidents_stuck: int,
        today_processed: int,
        flywheel_conversions_today: int,
        km_vectorized_rate: float,
        node_stats: dict[str, Any],
        current_flow: list[dict[str, Any]],
        computed_at: datetime,
    ) -> None:
        # Scalar gauges.
        self.playbook_count = playbook_count
        self.execution_success_rate = execution_success_rate
        self.km_unvectorized_count = km_unvectorized_count
        self.alertname_null_rate = alertname_null_rate
        self.incidents_stuck = incidents_stuck
        self.today_processed = today_processed
        self.flywheel_conversions_today = flywheel_conversions_today
        self.km_vectorized_rate = km_vectorized_rate
        # Structured payloads for the flywheel endpoint.
        self.node_stats = node_stats
        self.current_flow = current_flow
        # Moment the snapshot was taken.
        self.computed_at = computed_at

    def to_prometheus_lines(self) -> str:
        """Render the six gauges as Prometheus text exposition format.

        Every sample carries the snapshot time as an explicit timestamp in
        milliseconds; metric sections are separated by one blank line and
        the text ends with a newline.
        """
        stamp = int(self.computed_at.timestamp() * 1000)
        # (metric name, HELP text, already-formatted sample value)
        gauges = (
            (
                "awoooi_flywheel_playbook_count",
                "Total approved playbooks in Redis",
                f"{self.playbook_count}",
            ),
            (
                "awoooi_flywheel_execution_success_rate",
                "Auto-repair success rate (0-1)",
                f"{self.execution_success_rate:.4f}",
            ),
            (
                "awoooi_flywheel_km_unvectorized_count",
                "KM entries not yet vectorized",
                f"{self.km_unvectorized_count}",
            ),
            (
                "awoooi_flywheel_alertname_null_rate",
                "Fraction of incidents with null alertname",
                f"{self.alertname_null_rate:.4f}",
            ),
            (
                "awoooi_flywheel_incidents_stuck",
                "Incidents stuck in INVESTIGATING > 24h",
                f"{self.incidents_stuck}",
            ),
            (
                "awoooi_flywheel_km_vectorized_rate",
                "Fraction of KM entries vectorized",
                f"{self.km_vectorized_rate:.4f}",
            ),
        )
        sections = [
            "\n".join(
                (
                    f"# HELP {metric} {help_text}",
                    f"# TYPE {metric} gauge",
                    f"{metric} {rendered} {stamp}",
                )
            )
            for metric, help_text, rendered in gauges
        ]
        return "\n\n".join(sections) + "\n"

    def to_flywheel_api_dict(self) -> dict[str, Any]:
        """Build the response body of /api/v1/stats/flywheel."""
        return dict(
            nodes=self.node_stats,
            current_flow=self.current_flow,
            computed_at=self.computed_at.isoformat(),
        )

    def to_summary_api_dict(self) -> dict[str, Any]:
        """Build the response body of /api/v1/stats/summary.

        Rates are rounded to four decimal places.
        """
        return dict(
            playbook_count=self.playbook_count,
            execution_success_rate=round(self.execution_success_rate, 4),
            today_processed=self.today_processed,
            flywheel_conversions_today=self.flywheel_conversions_today,
            km_vectorized_rate=round(self.km_vectorized_rate, 4),
            km_unvectorized_count=self.km_unvectorized_count,
            alertname_null_rate=round(self.alertname_null_rate, 4),
            incidents_stuck=self.incidents_stuck,
            computed_at=self.computed_at.isoformat(),
        )
# =============================================================================
# FlywheelStatsService
# =============================================================================
class FlywheelStatsService:
    """
    Compute the flywheel health metrics.

    ADR-074 M1: scraped by Prometheus via /api/v1/stats/flywheel/metrics.
    ADR-073-C C1: shown by the front end via /api/v1/stats/flywheel.

    Each data-source helper is best-effort: on any exception it logs and
    returns zeroed / "unknown" placeholders instead of raising, so
    ``compute`` always yields a snapshot.
    """

    async def compute(self) -> "FlywheelMetrics":
        """Compute every flywheel metric and return them as one snapshot."""
        now = now_taipei()
        playbook_count, execution_success_rate = await self._playbook_stats()
        (
            km_unvectorized_count,
            km_vectorized_rate,
            flywheel_conversions_today,
        ) = await self._km_stats(now)
        (
            alertname_null_rate,
            incidents_stuck,
            today_processed,
            node_stats,
            current_flow,
        ) = await self._incident_stats(now)
        return FlywheelMetrics(
            playbook_count=playbook_count,
            execution_success_rate=execution_success_rate,
            km_unvectorized_count=km_unvectorized_count,
            alertname_null_rate=alertname_null_rate,
            incidents_stuck=incidents_stuck,
            today_processed=today_processed,
            flywheel_conversions_today=flywheel_conversions_today,
            km_vectorized_rate=km_vectorized_rate,
            node_stats=node_stats,
            current_flow=current_flow,
            computed_at=now,
        )

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _tally_playbook(raw: Any) -> tuple[int, float, float]:
        """Extract the counters of one stored playbook payload.

        Args:
            raw: JSON text (``str`` or ``bytes``) as read from Redis.

        Returns:
            ``(approved, executions, successes)`` where ``approved`` is 1
            when the playbook status is ``"approved"`` and 0 otherwise.
            A payload that is not valid JSON or not a JSON object yields
            ``(0, 0, 0)``; a counter that is missing, null or non-numeric
            counts as 0.
        """
        try:
            pb = json.loads(raw)
        except (TypeError, ValueError):
            # ValueError covers JSONDecodeError and undecodable bytes;
            # TypeError covers payloads that are not str/bytes.
            return 0, 0, 0
        if not isinstance(pb, dict):
            # Valid JSON but not an object (e.g. a list or a bare string).
            return 0, 0, 0

        def _count(value: Any) -> float:
            return value if isinstance(value, (int, float)) else 0

        approved = 1 if pb.get("status", "") == "approved" else 0
        # NOTE(review): execution counters are summed for every parseable
        # playbook, not only approved ones — confirm this is intended.
        executions = _count(pb.get("execution_count"))
        successes = _count(pb.get("success_count"))
        return approved, executions, successes

    async def _playbook_stats(self) -> tuple[int, float]:
        """Playbook count and execution success rate (from Redis).

        Returns:
            ``(approved_playbook_count, success_rate)``. The rate is total
            successes divided by total executions, or 0.0 when nothing has
            been executed. ``(0, 0.0)`` when Redis access fails.
        """
        try:
            redis = get_redis()
            approved_total = 0
            total_exec = 0
            total_success = 0
            async for key in redis.scan_iter(match=f"{_PLAYBOOK_KEY_PREFIX}PB-*", count=200):
                raw = await redis.get(key)
                if not raw:
                    continue
                # A malformed entry contributes nothing instead of aborting
                # the scan and zeroing the whole metric.
                approved, executions, successes = self._tally_playbook(raw)
                approved_total += approved
                total_exec += executions
                total_success += successes
            rate = total_success / total_exec if total_exec > 0 else 0.0
            return approved_total, rate
        except Exception:
            logger.exception("flywheel_stats_playbook_error")
            return 0, 0.0

    async def _km_stats(self, now: datetime) -> tuple[int, float, int]:
        """KM vectorization figures and today's conversions (from PostgreSQL).

        Args:
            now: Reference time; "today" starts at midnight of this value.

        Returns:
            ``(unvectorized_count, vectorized_rate, conversions_today)``.
            The rate is 0.0 when there are no KM entries. ``(0, 0.0, 0)``
            when the queries fail.
        """
        try:
            today_start = now.replace(hour=0, minute=0, second=0, microsecond=0)
            async with get_db_context() as db:
                # Entries not yet vectorized.
                unvectorized_q = await db.execute(
                    select(func.count()).where(KnowledgeEntryRecord.vectorized.is_(False))
                )
                unvectorized = unvectorized_q.scalar_one_or_none() or 0
                # Total number of entries.
                total_q = await db.execute(select(func.count(KnowledgeEntryRecord.id)))
                total = total_q.scalar_one_or_none() or 0
                vectorized_rate = (total - unvectorized) / total if total > 0 else 0.0
                # Conversions today = KM entries created today.
                conversions_q = await db.execute(
                    select(func.count()).where(
                        KnowledgeEntryRecord.created_at >= today_start
                    )
                )
                conversions_today = conversions_q.scalar_one_or_none() or 0
            return unvectorized, vectorized_rate, conversions_today
        except Exception:
            logger.exception("flywheel_stats_km_error")
            return 0, 0.0, 0

    async def _incident_stats(
        self, now: datetime
    ) -> tuple[float, int, int, dict[str, Any], list[dict[str, Any]]]:
        """Incident-derived metrics, node statuses and the current flow.

        Args:
            now: Reference time for the "today", "last hour" and
                "older than 24h" windows.

        Returns:
            ``(alertname_null_rate, incidents_stuck, today_processed,
            node_stats, current_flow)``. On failure the numbers are zero,
            every node reports ``{"status": "unknown"}`` and the flow is
            empty.
        """
        try:
            today_start = now.replace(hour=0, minute=0, second=0, microsecond=0)
            stuck_threshold = now - timedelta(hours=24)
            recent_1h = now - timedelta(hours=1)
            async with get_db_context() as db:
                # Fraction of incidents whose alertname is NULL.
                total_q = await db.execute(select(func.count(IncidentRecord.id)))
                total = total_q.scalar_one_or_none() or 0
                null_q = await db.execute(
                    select(func.count()).where(IncidentRecord.alertname.is_(None))
                )
                null_count = null_q.scalar_one_or_none() or 0
                alertname_null_rate = null_count / total if total > 0 else 0.0
                # Stuck incidents: still INVESTIGATING and created over 24h ago.
                stuck_q = await db.execute(
                    select(func.count()).where(
                        IncidentRecord.status == IncidentStatus.INVESTIGATING.value,
                        IncidentRecord.created_at <= stuck_threshold,
                    )
                )
                incidents_stuck = stuck_q.scalar_one_or_none() or 0
                # Incidents created today.
                today_q = await db.execute(
                    select(func.count()).where(
                        IncidentRecord.created_at >= today_start
                    )
                )
                today_processed = today_q.scalar_one_or_none() or 0
                # Incidents created in the last hour (drives the monitoring node).
                recent_q = await db.execute(
                    select(func.count()).where(
                        IncidentRecord.created_at >= recent_1h
                    )
                )
                count_1h = recent_q.scalar_one_or_none() or 0
                # Successful automatic executions today, detected by a text
                # match on the serialized ``outcomes`` column.
                success_q = await db.execute(
                    text(
                        "SELECT COUNT(*) FROM incidents WHERE created_at >= :today"
                        " AND outcomes::text LIKE '%execution_success%true%'"
                    ),
                    {"today": today_start},
                )
                exec_success_today = success_q.scalar_one_or_none() or 0
                # Current flow: the 10 most recently created active incidents.
                active_q = await db.execute(
                    select(
                        IncidentRecord.incident_id,
                        IncidentRecord.alertname,
                        IncidentRecord.status,
                        IncidentRecord.created_at,
                    )
                    .where(
                        IncidentRecord.status.in_([
                            IncidentStatus.INVESTIGATING.value,
                            IncidentStatus.MITIGATING.value,
                        ])
                    )
                    .order_by(IncidentRecord.created_at.desc())
                    .limit(10)
                )
                active_rows = active_q.fetchall()
            current_flow = [
                {
                    "incident_id": row.incident_id,
                    "alertname": row.alertname or "unknown",
                    "current_node": _status_to_node(row.status),
                    "ts": row.created_at.isoformat() if row.created_at else None,
                }
                for row in active_rows
            ]
            # Only the monitoring node status is data-driven; the other
            # nodes report a fixed "active" status.
            node_stats = {
                "monitoring": {
                    "status": "active" if count_1h > 0 else "idle",
                    "count_1h": count_1h,
                },
                "deduplication": {
                    "status": "active",
                    "dedup_window_min": 30,
                },
                "diagnosis": {
                    "status": "active",
                    "mcp_providers_used": ["k8s", "ssh", "prometheus"],
                },
                "reasoning": {
                    "status": "active",
                    "today_processed": today_processed,
                },
                "execution": {
                    "status": "active",
                    "success_today": exec_success_today,
                },
                "learning": {
                    "status": "active",
                },
            }
            return alertname_null_rate, incidents_stuck, today_processed, node_stats, current_flow
        except Exception:
            logger.exception("flywheel_stats_incident_error")
            return 0.0, 0, 0, {n: {"status": "unknown"} for n in FLYWHEEL_NODES}, []
def _status_to_node(status: str) -> str:
    """Return the flywheel node on which an incident with ``status`` is shown.

    INVESTIGATING maps to "diagnosis", MITIGATING to "execution", RESOLVED
    and CLOSED to "learning"; any other status falls back to "reasoning".
    """
    if status == IncidentStatus.INVESTIGATING.value:
        return "diagnosis"
    if status == IncidentStatus.MITIGATING.value:
        return "execution"
    if status in (IncidentStatus.RESOLVED.value, IncidentStatus.CLOSED.value):
        return "learning"
    return "reasoning"
# =============================================================================
# DI 工廠
# =============================================================================
# Lazily created, module-wide service instance shared by all callers.
_instance: FlywheelStatsService | None = None


def get_flywheel_stats_service() -> FlywheelStatsService:
    """Return the shared FlywheelStatsService, creating it on first use."""
    global _instance
    if _instance is not None:
        return _instance
    _instance = FlywheelStatsService()
    return _instance

View File

@@ -0,0 +1,105 @@
# =============================================================================
# 飛輪健康度告警規則 — ADR-074 M1
# =============================================================================
# Prometheus PrometheusRule CRD — 飛輪自監控告警
# 數據來源:/api/v1/stats/flywheel/metricsawoooi-flywheel scrape job
#
# 部署kubectl apply -f k8s/monitoring/flywheel-alerts.yaml
#
# 2026-04-12 ogt (ADR-074 M1)
# =============================================================================
# PrometheusRule with two groups: flywheel self-monitoring alerts, evaluated
# every 5m, and a host connectivity alert, evaluated every 60s.
# NOTE(review): no rule here covers the flywheel scrape target itself being
# down; when the metrics are absent none of the expressions below match.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: flywheel-alerts
  namespace: monitoring
  labels:
    release: prometheus
    app: prometheus
spec:
  groups:
    - name: awoooi_flywheel_health
      interval: 5m
      rules:
        # P0: no playbooks at all -> the flywheel learning node is not working
        - alert: FlywheelPlaybookZero
          expr: awoooi_flywheel_playbook_count == 0
          for: 1h
          labels:
            severity: critical
            alert_category: infrastructure
            notification_type: TYPE-3
          annotations:
            summary: "飛輪 Playbook 數量為 0"
            description: "Playbook 數量持續 1 小時為 0飛輪學習節點完全失效。"
            runbook: "執行 scripts/cold_start_playbooks.py 冷啟動"
        # P0: execution success rate extremely low
        # NOTE(review): marked P0 in this comment but severity is "warning" — confirm.
        - alert: FlywheelExecutionSuccessLow
          expr: awoooi_flywheel_execution_success_rate < 0.1
          for: 2h
          labels:
            severity: warning
            alert_category: infrastructure
            notification_type: TYPE-3
          annotations:
            summary: "飛輪自動修復成功率低於 10%"
            description: "執行成功率 {{ $value | humanizePercentage }},低於健康基線 10%。"
            runbook: "檢查 decision_manager 日誌,確認 target 解析和 SSH MCP 狀態"
        # P0: many KM entries not vectorized -> RAG cannot use historical cases
        # NOTE(review): marked P0 in this comment but severity is "warning" — confirm.
        - alert: FlywheelKMVectorizationLow
          expr: awoooi_flywheel_km_unvectorized_count > 10
          for: 30m
          labels:
            severity: warning
            alert_category: infrastructure
            notification_type: TYPE-3
          annotations:
            summary: "飛輪 KM 未向量化數量 > 10"
            description: "{{ $value }} 筆 KM 條目尚未向量化RAG 查詢品質下降。"
            runbook: "執行 scripts/batch_vectorize_km.py 或檢查每日 CronJob 狀態"
        # P1: abnormal rate of incidents with a NULL alertname
        - alert: FlywheelAlertnameNullHigh
          expr: awoooi_flywheel_alertname_null_rate > 0.05
          for: 30m
          labels:
            severity: warning
            alert_category: infrastructure
            notification_type: TYPE-3
          annotations:
            summary: "飛輪 alertname NULL 率超過 5%"
            description: "alertname NULL 率 {{ $value | humanizePercentage }},影響路由準確性。"
            runbook: "執行 scripts/backfill_alertname.py 回填"
        # P1: incidents stuck for more than 24 hours
        # (this is the only rule in the file without a runbook annotation)
        - alert: FlywheelIncidentsStuck
          expr: awoooi_flywheel_incidents_stuck > 5
          for: 10m
          labels:
            severity: warning
            alert_category: infrastructure
            notification_type: TYPE-3
          annotations:
            summary: "{{ $value }} 筆 Incident 卡在 INVESTIGATING 超過 24 小時"
            description: "大量 Incident 未推進,可能是決策引擎或 Telegram 通知阻塞。"
    - name: awoooi_host_connectivity
      interval: 60s
      rules:
        # P0: network partition between hosts, based on the probe_success
        # series of the "host-connectivity" scrape job
        - alert: HostNetworkPartition
          expr: probe_success{job="host-connectivity"} == 0
          for: 5m
          labels:
            severity: critical
            alert_category: infrastructure
            notification_type: TYPE-3
          annotations:
            summary: "主機 {{ $labels.instance }} 無法連通"
            description: "TCP probe 到 {{ $labels.instance }} 失敗超過 5 分鐘,可能發生網路分區。"
            runbook: "SSH 檢查路由和防火牆規則"

View File

@@ -156,6 +156,37 @@ scrape_configs:
service: 'awoooi-api' service: 'awoooi-api'
env: 'prod' env: 'prod'
# === ADR-074 M1: 飛輪健康度指標 (2026-04-12 ogt) ===
- job_name: 'awoooi-flywheel'
  # Keep the interval well below Prometheus' default 5m lookback window.
  # At exactly 5m any scrape jitter or a single failed scrape leaves a gap
  # in which the series returns no data, which resets the `for:` timers of
  # the flywheel alert rules.
  scrape_interval: 2m
  metrics_path: /api/v1/stats/flywheel/metrics
  static_configs:
    - targets: ['192.168.0.125:32334']
      labels:
        host: '125'
        service: 'awoooi-flywheel'
        env: 'prod'
# === ADR-074 M2: 主機間網路連通性 (2026-04-12 ogt) ===
# TCP connectivity probes against four host:port targets, scraped every 60s
# through the /probe endpoint with the tcp_connect module.
- job_name: 'host-connectivity'
  scrape_interval: 60s
  metrics_path: /probe
  params:
    module: [tcp_connect]
  static_configs:
    - targets:
        - 192.168.0.110:22
        - 192.168.0.188:22
        - 192.168.0.120:6443
        - 192.168.0.121:6443
  relabel_configs:
    # Pass the listed target to the prober as the ?target= URL parameter.
    - source_labels: [__address__]
      target_label: __param_target
    # Keep the probed host:port as the instance label.
    - source_labels: [__param_target]
      target_label: instance
    # Send the actual scrape to the prober at 192.168.0.188:9115
    # (presumably the Blackbox exporter — confirm).
    # NOTE(review): the prober runs on 192.168.0.188, which is also one of the
    # probed targets; if that host is unreachable every scrape fails and no
    # probe_success sample is produced — confirm this gap is acceptable.
    - target_label: __address__
      replacement: 192.168.0.188:9115
# === Sprint 5.2 Plan B: PostgreSQL Exporter (2026-04-08 Claude Sonnet 4.6) === # === Sprint 5.2 Plan B: PostgreSQL Exporter (2026-04-08 Claude Sonnet 4.6) ===
- job_name: 'postgres-exporter' - job_name: 'postgres-exporter'
scrape_interval: 30s scrape_interval: 30s