feat(adr-074): M1 飛輪健康度 Exporter + M2 主機網路監控
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
ADR-074 M1: - FlywheelStatsService: 計算6項飛輪指標(Playbook數/成功率/KM向量化/alertname NULL/卡住數) - GET /api/v1/stats/flywheel — 六節點即時狀態(C1 前端用) - GET /api/v1/stats/summary — KPI 面板數據(C1 前端用) - GET /api/v1/stats/flywheel/metrics — Prometheus text format - flywheel-alerts.yaml: 5條告警規則(FlywheelPlaybookZero/ExecutionSuccessLow/KMVectorizationLow/AlertnameNullHigh/IncidentsStuck) - prometheus.yml: awoooi-flywheel scrape job(5分鐘間隔) ADR-074 M2: - prometheus.yml: host-connectivity Blackbox TCP probe(110:22/188:22/120:6443/121:6443) - flywheel-alerts.yaml: HostNetworkPartition 告警規則 597 unit tests passed Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -19,14 +19,16 @@
|
||||
# @see feedback_lewooogo_modular_enforcement.md
|
||||
# =============================================================================
|
||||
|
||||
from typing import Annotated
|
||||
from typing import Annotated, Any
|
||||
|
||||
from fastapi import APIRouter, Depends, Query
|
||||
from fastapi.responses import PlainTextResponse
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from src.services.stats_service import StatsService, get_stats_service
|
||||
from src.services.k3s_monitor_service import K3sMonitorService, get_k3s_monitor_service
|
||||
from src.services.weekly_report_service import WeeklyReportService, get_weekly_report_service
|
||||
from src.services.flywheel_stats_service import FlywheelStatsService, get_flywheel_stats_service
|
||||
|
||||
router = APIRouter(prefix="/stats", tags=["Statistics"])
|
||||
|
||||
@@ -489,3 +491,64 @@ async def get_disposition_stats() -> DispositionResponse:
|
||||
import structlog
|
||||
structlog.get_logger(__name__).warning("disposition_stats_error", error=str(e))
|
||||
return DispositionResponse(summary=DispositionSummary())
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# ADR-073-C C1 + ADR-074 M1 — 飛輪健康度 API
|
||||
# 2026-04-12 ogt
|
||||
# =============================================================================
|
||||
|
||||
FlywheelStatsDep = Annotated[FlywheelStatsService, Depends(get_flywheel_stats_service)]
|
||||
|
||||
|
||||
@router.get(
    "/flywheel",
    response_model=None,
    summary="飛輪六節點即時狀態(ADR-073-C C1)",
)
async def get_flywheel_stats(svc: FlywheelStatsDep) -> dict[str, Any]:
    """
    Live state of the six flywheel nodes plus the alerts currently in flight.

    Feeds real data to the frontend flywheel animation component.
    """
    # A fresh snapshot is computed on every request.
    snapshot = await svc.compute()
    payload = snapshot.to_flywheel_api_dict()
    return payload
|
||||
|
||||
|
||||
@router.get(
    "/summary",
    response_model=None,
    summary="飛輪 KPI 摘要(ADR-073-C C1)",
)
async def get_flywheel_summary(svc: FlywheelStatsDep) -> dict[str, Any]:
    """
    Flywheel KPI panel data: playbook count, success rate, incidents handled
    today, and KM vectorization rate.

    Feeds real data to the KPI cards in the top-right corner of the frontend.
    """
    # A fresh snapshot is computed on every request.
    snapshot = await svc.compute()
    payload = snapshot.to_summary_api_dict()
    return payload
|
||||
|
||||
|
||||
@router.get(
    "/flywheel/metrics",
    response_class=PlainTextResponse,
    summary="Prometheus 飛輪健康度指標(ADR-074 M1)",
)
async def get_flywheel_prometheus_metrics(svc: FlywheelStatsDep) -> PlainTextResponse:
    """
    Flywheel health indicators in Prometheus text format.

    Prometheus scrape target: /api/v1/stats/flywheel/metrics

    Metrics:
      awoooi_flywheel_playbook_count
      awoooi_flywheel_execution_success_rate
      awoooi_flywheel_km_unvectorized_count
      awoooi_flywheel_alertname_null_rate
      awoooi_flywheel_incidents_stuck
      awoooi_flywheel_km_vectorized_rate
    """
    snapshot = await svc.compute()
    exposition = snapshot.to_prometheus_lines()
    # Content type advertises version 0.0.4 of the text exposition format.
    return PlainTextResponse(
        media_type="text/plain; version=0.0.4; charset=utf-8",
        content=exposition,
    )
|
||||
|
||||
386
apps/api/src/services/flywheel_stats_service.py
Normal file
386
apps/api/src/services/flywheel_stats_service.py
Normal file
@@ -0,0 +1,386 @@
|
||||
"""
|
||||
Flywheel Stats Service — ADR-074 M1 + ADR-073-C C1
|
||||
|
||||
飛輪健康度指標計算服務:
|
||||
- 供 Prometheus Exporter(M1)抓取
|
||||
- 供前端 /api/v1/stats/flywheel 即時顯示(C1)
|
||||
|
||||
Metrics:
|
||||
awoooi_flywheel_playbook_count 目標 ≥ 20
|
||||
awoooi_flywheel_execution_success_rate 目標 ≥ 0.3
|
||||
awoooi_flywheel_km_unvectorized_count 目標 = 0
|
||||
awoooi_flywheel_alertname_null_rate 目標 = 0
|
||||
awoooi_flywheel_incidents_stuck 目標 = 0
|
||||
|
||||
2026-04-12 ogt (ADR-074 M1 + ADR-073-C C1)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any
|
||||
|
||||
import structlog
|
||||
from sqlalchemy import func, select, text
|
||||
|
||||
from src.core.redis_client import get_redis
|
||||
from src.db.base import get_db_context
|
||||
from src.db.models import IncidentRecord, KnowledgeEntryRecord
|
||||
from src.models.incident import IncidentStatus
|
||||
from src.utils.timezone import now_taipei
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
# Redis key prefix(與 playbook_repository.py 一致)
|
||||
_PLAYBOOK_KEY_PREFIX = "playbook:"
|
||||
|
||||
# 飛輪六節點名稱
|
||||
FLYWHEEL_NODES = [
|
||||
"monitoring",
|
||||
"deduplication",
|
||||
"diagnosis",
|
||||
"reasoning",
|
||||
"execution",
|
||||
"learning",
|
||||
]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 核心指標資料結構
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class FlywheelMetrics:
    """Point-in-time snapshot of the flywheel health indicators.

    Instances are plain value holders; the ``to_*`` methods render the same
    snapshot for the Prometheus endpoint and the two JSON endpoints.
    """

    def __init__(
        self,
        playbook_count: int,
        execution_success_rate: float,
        km_unvectorized_count: int,
        alertname_null_rate: float,
        incidents_stuck: int,
        today_processed: int,
        flywheel_conversions_today: int,
        km_vectorized_rate: float,
        node_stats: dict[str, Any],
        current_flow: list[dict[str, Any]],
        computed_at: datetime,
    ) -> None:
        # When the snapshot was taken; used as the sample timestamp and in JSON.
        self.computed_at = computed_at

        # Playbook indicators.
        self.playbook_count = playbook_count
        self.execution_success_rate = execution_success_rate

        # Knowledge-entry (KM) indicators.
        self.km_unvectorized_count = km_unvectorized_count
        self.km_vectorized_rate = km_vectorized_rate
        self.flywheel_conversions_today = flywheel_conversions_today

        # Incident indicators.
        self.alertname_null_rate = alertname_null_rate
        self.incidents_stuck = incidents_stuck
        self.today_processed = today_processed

        # Structures passed through unchanged to the /flywheel JSON payload.
        self.node_stats = node_stats
        self.current_flow = current_flow

    def to_prometheus_lines(self) -> str:
        """Render the snapshot in Prometheus text exposition format.

        Every metric is a gauge. Each sample carries ``computed_at`` as an
        explicit timestamp in milliseconds since the epoch. Rates are printed
        with four decimals. Metric groups are separated by one blank line and
        the text ends with a newline.
        """
        epoch_ms = int(self.computed_at.timestamp() * 1000)

        # (metric name, HELP text, rendered sample value), in output order.
        gauges = (
            (
                "awoooi_flywheel_playbook_count",
                "Total approved playbooks in Redis",
                f"{self.playbook_count}",
            ),
            (
                "awoooi_flywheel_execution_success_rate",
                "Auto-repair success rate (0-1)",
                f"{self.execution_success_rate:.4f}",
            ),
            (
                "awoooi_flywheel_km_unvectorized_count",
                "KM entries not yet vectorized",
                f"{self.km_unvectorized_count}",
            ),
            (
                "awoooi_flywheel_alertname_null_rate",
                "Fraction of incidents with null alertname",
                f"{self.alertname_null_rate:.4f}",
            ),
            (
                "awoooi_flywheel_incidents_stuck",
                "Incidents stuck in INVESTIGATING > 24h",
                f"{self.incidents_stuck}",
            ),
            (
                "awoooi_flywheel_km_vectorized_rate",
                "Fraction of KM entries vectorized",
                f"{self.km_vectorized_rate:.4f}",
            ),
        )

        sections = []
        for metric, help_text, rendered in gauges:
            sections.append(
                f"# HELP {metric} {help_text}\n"
                f"# TYPE {metric} gauge\n"
                f"{metric} {rendered} {epoch_ms}\n"
            )
        # Each section already ends in a newline, so joining on "\n" leaves
        # exactly one blank line between metric groups.
        return "\n".join(sections)

    def to_flywheel_api_dict(self) -> dict[str, Any]:
        """Build the JSON payload served by /api/v1/stats/flywheel."""
        payload: dict[str, Any] = {}
        payload["nodes"] = self.node_stats
        payload["current_flow"] = self.current_flow
        payload["computed_at"] = self.computed_at.isoformat()
        return payload

    def to_summary_api_dict(self) -> dict[str, Any]:
        """Build the JSON payload served by /api/v1/stats/summary.

        Rates are rounded to four decimals; counts are passed through as-is.
        """
        success_rate = round(self.execution_success_rate, 4)
        vectorized_rate = round(self.km_vectorized_rate, 4)
        null_rate = round(self.alertname_null_rate, 4)
        stamp = self.computed_at.isoformat()
        return {
            "playbook_count": self.playbook_count,
            "execution_success_rate": success_rate,
            "today_processed": self.today_processed,
            "flywheel_conversions_today": self.flywheel_conversions_today,
            "km_vectorized_rate": vectorized_rate,
            "km_unvectorized_count": self.km_unvectorized_count,
            "alertname_null_rate": null_rate,
            "incidents_stuck": self.incidents_stuck,
            "computed_at": stamp,
        }
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# FlywheelStatsService
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class FlywheelStatsService:
    """
    Computes the flywheel health indicators.

    ADR-074 M1: scraped by Prometheus via /api/v1/stats/flywheel/metrics.
    ADR-073-C C1: displayed by the frontend via /api/v1/stats/flywheel.
    """

    async def compute(self) -> "FlywheelMetrics":
        """Compute every flywheel indicator and return them as one snapshot.

        The three data sources (playbooks, KM entries, incidents) are queried
        one after another. Each helper handles its own failures by logging and
        returning zero/placeholder values, so this method does not raise for
        errors inside those helpers.
        """
        now = now_taipei()

        playbook_count, execution_success_rate = await self._playbook_stats()
        (
            km_unvectorized_count,
            km_vectorized_rate,
            flywheel_conversions_today,
        ) = await self._km_stats(now)
        (
            alertname_null_rate,
            incidents_stuck,
            today_processed,
            node_stats,
            current_flow,
        ) = await self._incident_stats(now)

        return FlywheelMetrics(
            playbook_count=playbook_count,
            execution_success_rate=execution_success_rate,
            km_unvectorized_count=km_unvectorized_count,
            alertname_null_rate=alertname_null_rate,
            incidents_stuck=incidents_stuck,
            today_processed=today_processed,
            flywheel_conversions_today=flywheel_conversions_today,
            km_vectorized_rate=km_vectorized_rate,
            node_stats=node_stats,
            current_flow=current_flow,
            computed_at=now,
        )

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _parse_playbook(raw: Any) -> "tuple[bool, Any, Any] | None":
        """Decode one stored playbook into ``(is_approved, executions, successes)``.

        Returns ``None`` when ``raw`` is not valid JSON or does not decode to
        a JSON object, so the caller can skip that entry. A missing or null
        counter counts as 0. If either counter is not a number, both are
        reported as 0 so that a half-valid pair cannot skew the success rate;
        the approval flag is still reported.
        """
        try:
            playbook = json.loads(raw)
        except (ValueError, TypeError):
            # ValueError covers JSONDecodeError and undecodable bytes;
            # TypeError covers values that are not str/bytes at all.
            return None
        if not isinstance(playbook, dict):
            return None

        is_approved = playbook.get("status", "") == "approved"
        exec_count = playbook.get("execution_count", 0) or 0
        success_count = playbook.get("success_count", 0) or 0
        counters_are_numeric = isinstance(exec_count, (int, float)) and isinstance(
            success_count, (int, float)
        )
        if not counters_are_numeric:
            exec_count, success_count = 0, 0
        return is_approved, exec_count, success_count

    async def _playbook_stats(self) -> tuple[int, float]:
        """Return ``(approved playbook count, execution success rate)`` from Redis.

        Only playbooks whose status is "approved" are counted, while the
        success rate sums the counters of every parseable playbook. Entries
        that cannot be parsed are skipped individually rather than failing the
        whole scan. If Redis itself fails, the error is logged and
        ``(0, 0.0)`` is returned.
        """
        try:
            redis = get_redis()
            count = 0
            total_exec = 0
            total_success = 0

            async for key in redis.scan_iter(match=f"{_PLAYBOOK_KEY_PREFIX}PB-*", count=200):
                raw = await redis.get(key)
                if not raw:
                    continue
                parsed = self._parse_playbook(raw)
                if parsed is None:
                    # One malformed entry must not zero the metrics for all others.
                    continue
                is_approved, exec_count, success_count = parsed
                if is_approved:
                    count += 1
                total_exec += exec_count
                total_success += success_count

            # No recorded executions is reported as a rate of 0.0.
            rate = total_success / total_exec if total_exec > 0 else 0.0
            return count, rate

        except Exception:
            logger.exception("flywheel_stats_playbook_error")
            return 0, 0.0

    async def _km_stats(self, now: datetime) -> tuple[int, float, int]:
        """Return ``(unvectorized count, vectorized rate, entries created today)``.

        All three values come from the knowledge-entry table. "Today" starts
        at midnight of ``now``. On any error the failure is logged and
        ``(0, 0.0, 0)`` is returned.
        """
        try:
            today_start = now.replace(hour=0, minute=0, second=0, microsecond=0)

            async with get_db_context() as db:
                # Entries not yet vectorized.
                unvectorized_q = await db.execute(
                    select(func.count()).where(KnowledgeEntryRecord.vectorized.is_(False))
                )
                unvectorized = unvectorized_q.scalar_one_or_none() or 0

                # Total number of entries.
                total_q = await db.execute(select(func.count(KnowledgeEntryRecord.id)))
                total = total_q.scalar_one_or_none() or 0

                # An empty table is reported as a rate of 0.0.
                vectorized_rate = (total - unvectorized) / total if total > 0 else 0.0

                # Conversions today, measured as KM entries created today.
                conversions_q = await db.execute(
                    select(func.count()).where(
                        KnowledgeEntryRecord.created_at >= today_start
                    )
                )
                conversions_today = conversions_q.scalar_one_or_none() or 0

            return unvectorized, vectorized_rate, conversions_today

        except Exception:
            logger.exception("flywheel_stats_km_error")
            return 0, 0.0, 0

    async def _incident_stats(
        self, now: datetime
    ) -> tuple[float, int, int, dict[str, Any], list[dict[str, Any]]]:
        """Return the incident-derived indicators.

        The tuple is ``(alertname null rate, stuck count, incidents created
        today, per-node status dict, current flow list)``. On any error the
        failure is logged and zeros are returned, with every node marked
        "unknown" and an empty flow. Note that these zeros are the same values
        a healthy system would report.
        """
        try:
            today_start = now.replace(hour=0, minute=0, second=0, microsecond=0)
            stuck_threshold = now - timedelta(hours=24)
            recent_1h = now - timedelta(hours=1)

            async with get_db_context() as db:
                # Fraction of all incidents whose alertname is NULL.
                total_q = await db.execute(select(func.count(IncidentRecord.id)))
                total = total_q.scalar_one_or_none() or 0

                null_q = await db.execute(
                    select(func.count()).where(IncidentRecord.alertname.is_(None))
                )
                null_count = null_q.scalar_one_or_none() or 0
                alertname_null_rate = null_count / total if total > 0 else 0.0

                # Stuck incidents: still INVESTIGATING and created over 24h ago.
                stuck_q = await db.execute(
                    select(func.count()).where(
                        IncidentRecord.status == IncidentStatus.INVESTIGATING.value,
                        IncidentRecord.created_at <= stuck_threshold,
                    )
                )
                incidents_stuck = stuck_q.scalar_one_or_none() or 0

                # "Processed today" is measured as incidents created today.
                today_q = await db.execute(
                    select(func.count()).where(
                        IncidentRecord.created_at >= today_start
                    )
                )
                today_processed = today_q.scalar_one_or_none() or 0

                # Incidents created in the last hour drive the monitoring node.
                recent_q = await db.execute(
                    select(func.count()).where(
                        IncidentRecord.created_at >= recent_1h
                    )
                )
                count_1h = recent_q.scalar_one_or_none() or 0

                # Successful automatic executions today.
                # NOTE(review): this is a substring match on the textual form
                # of `outcomes`, so it depends on how that column serializes;
                # confirm against the schema.
                success_q = await db.execute(
                    text(
                        "SELECT COUNT(*) FROM incidents WHERE created_at >= :today"
                        " AND outcomes::text LIKE '%execution_success%true%'"
                    ),
                    {"today": today_start},
                )
                exec_success_today = success_q.scalar_one_or_none() or 0

                # Current flow: the 10 most recently created active incidents.
                active_q = await db.execute(
                    select(
                        IncidentRecord.incident_id,
                        IncidentRecord.alertname,
                        IncidentRecord.status,
                        IncidentRecord.created_at,
                    )
                    .where(
                        IncidentRecord.status.in_([
                            IncidentStatus.INVESTIGATING.value,
                            IncidentStatus.MITIGATING.value,
                        ])
                    )
                    .order_by(IncidentRecord.created_at.desc())
                    .limit(10)
                )
                active_rows = active_q.fetchall()

            current_flow = [
                {
                    "incident_id": row.incident_id,
                    "alertname": row.alertname or "unknown",
                    "current_node": _status_to_node(row.status),
                    "ts": row.created_at.isoformat() if row.created_at else None,
                }
                for row in active_rows
            ]

            # Only "monitoring" has a computed status; the other nodes are
            # hard-coded to "active" with static or pass-through details.
            node_stats = {
                "monitoring": {
                    "status": "active" if count_1h > 0 else "idle",
                    "count_1h": count_1h,
                },
                "deduplication": {
                    "status": "active",
                    "dedup_window_min": 30,
                },
                "diagnosis": {
                    "status": "active",
                    "mcp_providers_used": ["k8s", "ssh", "prometheus"],
                },
                "reasoning": {
                    "status": "active",
                    "today_processed": today_processed,
                },
                "execution": {
                    "status": "active",
                    "success_today": exec_success_today,
                },
                "learning": {
                    "status": "active",
                },
            }

            return alertname_null_rate, incidents_stuck, today_processed, node_stats, current_flow

        except Exception:
            logger.exception("flywheel_stats_incident_error")
            return 0.0, 0, 0, {n: {"status": "unknown"} for n in FLYWHEEL_NODES}, []
|
||||
|
||||
|
||||
def _status_to_node(status: str) -> str:
    """Translate an incident status value into the flywheel node it maps to.

    INVESTIGATING maps to "diagnosis", MITIGATING to "execution", and both
    RESOLVED and CLOSED to "learning". Any other value maps to "reasoning".
    """
    if status == IncidentStatus.INVESTIGATING.value:
        return "diagnosis"
    if status == IncidentStatus.MITIGATING.value:
        return "execution"
    if status in (IncidentStatus.RESOLVED.value, IncidentStatus.CLOSED.value):
        return "learning"
    # Fallback for every status not listed above.
    return "reasoning"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# DI 工廠
|
||||
# =============================================================================
|
||||
|
||||
_instance: FlywheelStatsService | None = None
|
||||
|
||||
|
||||
def get_flywheel_stats_service() -> FlywheelStatsService:
    """Return the module-level FlywheelStatsService, creating it on first call.

    Used as the FastAPI dependency factory; later calls reuse the instance
    stored in the module global ``_instance``.
    """
    global _instance
    if _instance is not None:
        return _instance
    _instance = FlywheelStatsService()
    return _instance
|
||||
105
k8s/monitoring/flywheel-alerts.yaml
Normal file
105
k8s/monitoring/flywheel-alerts.yaml
Normal file
@@ -0,0 +1,105 @@
|
||||
# =============================================================================
|
||||
# 飛輪健康度告警規則 — ADR-074 M1
|
||||
# =============================================================================
|
||||
# Prometheus PrometheusRule CRD — 飛輪自監控告警
|
||||
# 數據來源:/api/v1/stats/flywheel/metrics(awoooi-flywheel scrape job)
|
||||
#
|
||||
# 部署:kubectl apply -f k8s/monitoring/flywheel-alerts.yaml
|
||||
#
|
||||
# 2026-04-12 ogt (ADR-074 M1)
|
||||
# =============================================================================
|
||||
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: flywheel-alerts
|
||||
namespace: monitoring
|
||||
labels:
|
||||
release: prometheus
|
||||
app: prometheus
|
||||
spec:
|
||||
groups:
|
||||
- name: awoooi_flywheel_health
|
||||
interval: 5m
|
||||
rules:
|
||||
|
||||
# P0: Playbook 完全沒有 → 飛輪學習節點失效
|
||||
- alert: FlywheelPlaybookZero
|
||||
expr: awoooi_flywheel_playbook_count == 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-3
|
||||
annotations:
|
||||
summary: "飛輪 Playbook 數量為 0"
|
||||
description: "Playbook 數量持續 1 小時為 0,飛輪學習節點完全失效。"
|
||||
runbook: "執行 scripts/cold_start_playbooks.py 冷啟動"
|
||||
|
||||
# P0: 執行成功率極低
|
||||
- alert: FlywheelExecutionSuccessLow
|
||||
expr: awoooi_flywheel_execution_success_rate < 0.1
|
||||
for: 2h
|
||||
labels:
|
||||
severity: warning
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-3
|
||||
annotations:
|
||||
summary: "飛輪自動修復成功率低於 10%"
|
||||
description: "執行成功率 {{ $value | humanizePercentage }},低於健康基線 10%。"
|
||||
runbook: "檢查 decision_manager 日誌,確認 target 解析和 SSH MCP 狀態"
|
||||
|
||||
# P0: KM 大量未向量化 → RAG 無法使用歷史案例
|
||||
- alert: FlywheelKMVectorizationLow
|
||||
expr: awoooi_flywheel_km_unvectorized_count > 10
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-3
|
||||
annotations:
|
||||
summary: "飛輪 KM 未向量化數量 > 10"
|
||||
description: "{{ $value }} 筆 KM 條目尚未向量化,RAG 查詢品質下降。"
|
||||
runbook: "執行 scripts/batch_vectorize_km.py 或檢查每日 CronJob 狀態"
|
||||
|
||||
# P1: alertname NULL 率異常
|
||||
- alert: FlywheelAlertnameNullHigh
|
||||
expr: awoooi_flywheel_alertname_null_rate > 0.05
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-3
|
||||
annotations:
|
||||
summary: "飛輪 alertname NULL 率超過 5%"
|
||||
description: "alertname NULL 率 {{ $value | humanizePercentage }},影響路由準確性。"
|
||||
runbook: "執行 scripts/backfill_alertname.py 回填"
|
||||
|
||||
# P1: Incident 卡住超過 24 小時
|
||||
- alert: FlywheelIncidentsStuck
|
||||
expr: awoooi_flywheel_incidents_stuck > 5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-3
|
||||
annotations:
|
||||
summary: "{{ $value }} 筆 Incident 卡在 INVESTIGATING 超過 24 小時"
|
||||
description: "大量 Incident 未推進,可能是決策引擎或 Telegram 通知阻塞。"
|
||||
|
||||
- name: awoooi_host_connectivity
|
||||
interval: 60s
|
||||
rules:
|
||||
|
||||
# P0: 主機間網路分區
|
||||
- alert: HostNetworkPartition
|
||||
expr: probe_success{job="host-connectivity"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
alert_category: infrastructure
|
||||
notification_type: TYPE-3
|
||||
annotations:
|
||||
summary: "主機 {{ $labels.instance }} 無法連通"
|
||||
description: "TCP probe 到 {{ $labels.instance }} 失敗超過 5 分鐘,可能發生網路分區。"
|
||||
runbook: "SSH 檢查路由和防火牆規則"
|
||||
@@ -156,6 +156,37 @@ scrape_configs:
|
||||
service: 'awoooi-api'
|
||||
env: 'prod'
|
||||
|
||||
# === ADR-074 M1: 飛輪健康度指標 (2026-04-12 ogt) ===
|
||||
- job_name: 'awoooi-flywheel'
|
||||
scrape_interval: 5m
|
||||
metrics_path: /api/v1/stats/flywheel/metrics
|
||||
static_configs:
|
||||
- targets: ['192.168.0.125:32334']
|
||||
labels:
|
||||
host: '125'
|
||||
service: 'awoooi-flywheel'
|
||||
env: 'prod'
|
||||
|
||||
# === ADR-074 M2: 主機間網路連通性 (2026-04-12 ogt) ===
|
||||
- job_name: 'host-connectivity'
|
||||
scrape_interval: 60s
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module: [tcp_connect]
|
||||
static_configs:
|
||||
- targets:
|
||||
- 192.168.0.110:22
|
||||
- 192.168.0.188:22
|
||||
- 192.168.0.120:6443
|
||||
- 192.168.0.121:6443
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- target_label: __address__
|
||||
replacement: 192.168.0.188:9115
|
||||
|
||||
# === Sprint 5.2 Plan B: PostgreSQL Exporter (2026-04-08 Claude Sonnet 4.6) ===
|
||||
- job_name: 'postgres-exporter'
|
||||
scrape_interval: 30s
|
||||
|
||||
Reference in New Issue
Block a user