Files
awoooi/apps/api/src/core/metrics.py
OG T d89f0520f9 fix(api): 修復 34 個 Ruff lint 錯誤
- 自動修復 import 排序、unused imports
- 手動修復 raise from、isinstance union、unused variable
- scripts/ 暫時保留 (非 CI 阻擋)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-29 15:27:49 +08:00

181 lines
5.7 KiB
Python

"""
AWOOOI Alert Chain Metrics
===========================
ADR-037 Wave A.5: 告警鏈路 Prometheus 指標
用於監控告警鏈路健康狀態:
- Webhook 請求計數與延遲
- 告警處理成功率
- 異常頻率統計
版本: v1.0
建立: 2026-03-29 (台北時區)
建立者: Claude Code (Phase 21 ADR-037)
"""
from prometheus_client import Counter, Gauge, Histogram
# =============================================================================
# Webhook Metrics (alert sources: Alertmanager / Sentry / SigNoz)
# =============================================================================

# Number of webhook requests received, split by sender and result.
#   source: alertmanager / sentry / signoz
#   status: success / error
WEBHOOK_REQUESTS_TOTAL = Counter(
    name="awoooi_webhook_requests_total",
    documentation="Total webhook requests received",
    labelnames=["source", "status"],
)

# Webhook processing latency per source. Bucket upper bounds run from 0.1 to
# 30; the metric name declares the unit as seconds.
WEBHOOK_LATENCY_HISTOGRAM = Histogram(
    name="awoooi_webhook_latency_seconds",
    documentation="Webhook processing latency in seconds",
    labelnames=["source"],
    buckets=[0.1, 0.5, 1, 2, 5, 10, 30],
)
# =============================================================================
# Alert Processing Metrics
# =============================================================================

# Alerts that went through processing, split by source, severity and result.
#   outcome: incident_created / deduped / ignored
ALERT_PROCESSED_TOTAL = Counter(
    name="awoooi_alerts_processed_total",
    documentation="Total alerts processed",
    labelnames=["source", "severity", "outcome"],
)

# Incidents opened as a result of an alert, split by source and severity.
INCIDENT_CREATED_TOTAL = Counter(
    name="awoooi_incidents_created_total",
    documentation="Total incidents created from alerts",
    labelnames=["source", "severity"],
)

# Telegram notification attempts, split by source and result.
#   status: success / failed
TELEGRAM_NOTIFICATIONS_TOTAL = Counter(
    name="awoooi_telegram_notifications_total",
    documentation="Total Telegram notifications sent",
    labelnames=["source", "status"],
)
# =============================================================================
# Anomaly Counter Metrics (ADR-037)
# =============================================================================

# Anomalies recorded, split by alert name and service.
ANOMALY_RECORDED_TOTAL = Counter(
    name="awoooi_anomaly_recorded_total",
    documentation="Total anomalies recorded to counter",
    labelnames=["alert_name", "service"],
)

# Anomaly escalations, split by escalation level.
#   level: REPEAT / ESCALATE / PERMANENT_FIX
ANOMALY_ESCALATION_TOTAL = Counter(
    name="awoooi_anomaly_escalation_total",
    documentation="Total anomaly escalations",
    labelnames=["level"],
)

# Current 24-hour frequency for one anomaly key.
# NOTE(review): no helper in the visible part of this file sets this gauge;
# confirm whether it is updated from another module.
ANOMALY_FREQUENCY_GAUGE = Gauge(
    name="awoooi_anomaly_frequency_24h",
    documentation="Current 24h anomaly frequency",
    labelnames=["anomaly_key"],
)
# =============================================================================
# Auto Repair Metrics
# =============================================================================

# Auto repair attempts, split by action, tier and result.
#   outcome: success / failed / skipped
AUTO_REPAIR_ATTEMPTS_TOTAL = Counter(
    name="awoooi_auto_repair_attempts_total",
    documentation="Total auto repair attempts",
    labelnames=["action", "tier", "outcome"],
)

# Success rate per repair action.
# NOTE(review): no helper in the visible part of this file sets this gauge;
# confirm where it is computed.
AUTO_REPAIR_SUCCESS_RATE = Gauge(
    name="awoooi_auto_repair_success_rate",
    documentation="Auto repair success rate by action",
    labelnames=["action"],
)
# =============================================================================
# Alert Chain Health Metrics
# =============================================================================

# Timestamp of the most recent successful alert chain completion, per source.
ALERT_CHAIN_LAST_SUCCESS = Gauge(
    name="awoooi_alert_chain_last_success_timestamp",
    documentation="Last successful alert chain completion timestamp",
    labelnames=["source"],
)

# Health flag per source: 1 means healthy, 0 means unhealthy.
ALERT_CHAIN_HEALTHY = Gauge(
    name="awoooi_alert_chain_healthy",
    documentation="Alert chain health status (1=healthy, 0=unhealthy)",
    labelnames=["source"],
)
# =============================================================================
# Sentry Comment Metrics
# =============================================================================

# Sentry comment postings, split by result.
#   status: success / failed / skipped
SENTRY_COMMENT_TOTAL = Counter(
    name="awoooi_sentry_comment_total",
    documentation="Total Sentry comments posted",
    labelnames=["status"],
)
# =============================================================================
# Learning Service Metrics (ADR-037 Phase G)
# =============================================================================

# Actions that were skipped because of a low success rate, per action.
LEARNING_SKIP_TOTAL = Counter(
    name="awoooi_learning_skip_total",
    documentation="Actions skipped due to low success rate",
    labelnames=["action"],
)
# =============================================================================
# Helper Functions
# =============================================================================
def record_webhook_request(source: str, status: str, latency: float) -> None:
    """Record metrics for one webhook request.

    Increments the request counter for the ``(source, status)`` pair and
    observes ``latency`` on the latency histogram for ``source``.

    Args:
        source: Value of the ``source`` label on both metrics.
        status: Value of the ``status`` label on the request counter.
        latency: Value observed on the latency histogram (the metric name
            declares the unit as seconds).
    """
    request_counter = WEBHOOK_REQUESTS_TOTAL.labels(source=source, status=status)
    request_counter.inc()

    latency_histogram = WEBHOOK_LATENCY_HISTOGRAM.labels(source=source)
    latency_histogram.observe(latency)
def record_alert_processed(source: str, severity: str, outcome: str) -> None:
    """Record that one alert has been processed.

    Increments the processed-alerts counter for the given label combination.

    Args:
        source: Value of the ``source`` label.
        severity: Value of the ``severity`` label.
        outcome: Value of the ``outcome`` label.
    """
    label_values = {"source": source, "severity": severity, "outcome": outcome}
    ALERT_PROCESSED_TOTAL.labels(**label_values).inc()
def record_telegram_notification(source: str, success: bool) -> None:
    """Record the result of one Telegram notification attempt.

    Args:
        source: Value of the ``source`` label.
        success: Truthy maps to status ``"success"``, falsy to ``"failed"``.
    """
    if success:
        status = "success"
    else:
        status = "failed"
    TELEGRAM_NOTIFICATIONS_TOTAL.labels(source=source, status=status).inc()
def record_anomaly(
    alert_name: str,
    service: str,
    frequency_24h: int,
    escalation_level: str | None,
) -> None:
    """Record anomaly frequency metrics.

    Always increments the recorded-anomaly counter for
    ``(alert_name, service)``. When ``escalation_level`` is truthy, also
    increments the escalation counter for that level.

    Args:
        alert_name: Value of the ``alert_name`` label.
        service: Value of the ``service`` label.
        frequency_24h: Accepted but not read by this function.
            NOTE(review): ANOMALY_FREQUENCY_GAUGE looks like the intended
            destination, but nothing here sets it - confirm whether that is
            deliberate.
        escalation_level: Value of the ``level`` label; ``None`` or an empty
            string means no escalation is recorded.
    """
    ANOMALY_RECORDED_TOTAL.labels(alert_name=alert_name, service=service).inc()

    # Nothing more to record when the anomaly did not escalate.
    if not escalation_level:
        return
    ANOMALY_ESCALATION_TOTAL.labels(level=escalation_level).inc()
def record_auto_repair(action: str, tier: int, success: bool) -> None:
    """Record the result of one auto repair attempt.

    Args:
        action: Value of the ``action`` label.
        tier: Repair tier; converted with ``str()`` to form the ``tier`` label.
        success: Truthy maps to outcome ``"success"``, falsy to ``"failed"``.
    """
    if success:
        outcome = "success"
    else:
        outcome = "failed"

    attempts = AUTO_REPAIR_ATTEMPTS_TOTAL.labels(
        action=action,
        tier=str(tier),
        outcome=outcome,
    )
    attempts.inc()
def record_alert_chain_success(source: str) -> None:
    """Record a successful alert chain completion for ``source``.

    Sets the last-success gauge to the current ``time.time()`` value and sets
    the health gauge to 1.

    Args:
        source: Value of the ``source`` label on both gauges.
    """
    import time

    last_success_gauge = ALERT_CHAIN_LAST_SUCCESS.labels(source=source)
    last_success_gauge.set(time.time())

    health_gauge = ALERT_CHAIN_HEALTHY.labels(source=source)
    health_gauge.set(1)
def record_alert_chain_failure(source: str) -> None:
    """Record an alert chain failure for ``source``.

    Sets the health gauge to 0. The last-success timestamp gauge is not
    modified.

    Args:
        source: Value of the ``source`` label on the health gauge.
    """
    health_gauge = ALERT_CHAIN_HEALTHY.labels(source=source)
    health_gauge.set(0)