- 自動修復 import 排序、unused imports - 手動修復 raise from、isinstance union、unused variable - scripts/ 暫時保留 (非 CI 阻擋) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
181 lines
5.7 KiB
Python
181 lines
5.7 KiB
Python
"""
AWOOOI Alert Chain Metrics
===========================
ADR-037 Wave A.5: Prometheus metrics for the alert chain

Used to monitor the health of the alert chain:
- Webhook request counts and latency
- Alert processing success rate
- Anomaly frequency statistics

Version: v1.0
Created: 2026-03-29 (Taipei time zone)
Author: Claude Code (Phase 21 ADR-037)
"""
|
|
|
|
from prometheus_client import Counter, Gauge, Histogram
|
|
|
|
# =============================================================================
|
|
# Webhook Metrics (alert sources: Alertmanager/Sentry/SigNoz)
|
|
# =============================================================================
|
|
|
|
# Counter of webhook requests received, labelled by source and status.
# source: alertmanager/sentry/signoz, status: success/error
WEBHOOK_REQUESTS_TOTAL = Counter(
    name="awoooi_webhook_requests_total",
    documentation="Total webhook requests received",
    labelnames=("source", "status"),
)

# Histogram of webhook processing latency (seconds), labelled by source.
# The explicit bucket bounds replace the client library's defaults.
WEBHOOK_LATENCY_HISTOGRAM = Histogram(
    name="awoooi_webhook_latency_seconds",
    documentation="Webhook processing latency in seconds",
    labelnames=("source",),
    buckets=(0.1, 0.5, 1, 2, 5, 10, 30),
)
|
|
|
|
# =============================================================================
|
|
# Alert Processing Metrics (告警處理)
|
|
# =============================================================================
|
|
|
|
# Counter of processed alerts, labelled by source, severity and outcome.
# outcome: incident_created/deduped/ignored
ALERT_PROCESSED_TOTAL = Counter(
    name="awoooi_alerts_processed_total",
    documentation="Total alerts processed",
    labelnames=("source", "severity", "outcome"),
)

# Counter of incidents created from alerts, labelled by source and severity.
INCIDENT_CREATED_TOTAL = Counter(
    name="awoooi_incidents_created_total",
    documentation="Total incidents created from alerts",
    labelnames=("source", "severity"),
)

# Counter of Telegram notifications, labelled by source and status.
# status: success/failed
TELEGRAM_NOTIFICATIONS_TOTAL = Counter(
    name="awoooi_telegram_notifications_total",
    documentation="Total Telegram notifications sent",
    labelnames=("source", "status"),
)
|
|
|
|
# =============================================================================
|
|
# Anomaly Counter Metrics (ADR-037)
|
|
# =============================================================================
|
|
|
|
# Counter of anomalies recorded, labelled by alert name and service.
ANOMALY_RECORDED_TOTAL = Counter(
    name="awoooi_anomaly_recorded_total",
    documentation="Total anomalies recorded to counter",
    labelnames=("alert_name", "service"),
)

# Counter of anomaly escalations, labelled by escalation level.
# level: REPEAT/ESCALATE/PERMANENT_FIX
ANOMALY_ESCALATION_TOTAL = Counter(
    name="awoooi_anomaly_escalation_total",
    documentation="Total anomaly escalations",
    labelnames=("level",),
)

# Gauge holding the current 24h anomaly frequency, labelled by anomaly key.
ANOMALY_FREQUENCY_GAUGE = Gauge(
    name="awoooi_anomaly_frequency_24h",
    documentation="Current 24h anomaly frequency",
    labelnames=("anomaly_key",),
)
|
|
|
|
# =============================================================================
|
|
# Auto Repair Metrics
|
|
# =============================================================================
|
|
|
|
# Counter of auto repair attempts, labelled by action, tier and outcome.
# outcome: success/failed/skipped
AUTO_REPAIR_ATTEMPTS_TOTAL = Counter(
    name="awoooi_auto_repair_attempts_total",
    documentation="Total auto repair attempts",
    labelnames=("action", "tier", "outcome"),
)

# Gauge for the auto repair success rate, labelled by action.
AUTO_REPAIR_SUCCESS_RATE = Gauge(
    name="awoooi_auto_repair_success_rate",
    documentation="Auto repair success rate by action",
    labelnames=("action",),
)
|
|
|
|
# =============================================================================
|
|
# Alert Chain Health Metrics
|
|
# =============================================================================
|
|
|
|
# Gauge holding the timestamp of the last successful alert chain completion,
# labelled by source.
ALERT_CHAIN_LAST_SUCCESS = Gauge(
    name="awoooi_alert_chain_last_success_timestamp",
    documentation="Last successful alert chain completion timestamp",
    labelnames=("source",),
)

# Gauge used as a health flag per source (1=healthy, 0=unhealthy).
ALERT_CHAIN_HEALTHY = Gauge(
    name="awoooi_alert_chain_healthy",
    documentation="Alert chain health status (1=healthy, 0=unhealthy)",
    labelnames=("source",),
)
|
|
|
|
# =============================================================================
|
|
# Sentry Comment Metrics
|
|
# =============================================================================
|
|
|
|
# Counter of Sentry comments posted, labelled by status.
# status: success/failed/skipped
SENTRY_COMMENT_TOTAL = Counter(
    name="awoooi_sentry_comment_total",
    documentation="Total Sentry comments posted",
    labelnames=("status",),
)
|
|
|
|
# =============================================================================
|
|
# Learning Service Metrics (ADR-037 Phase G)
|
|
# =============================================================================
|
|
|
|
# Counter of actions skipped due to low success rate, labelled by action.
LEARNING_SKIP_TOTAL = Counter(
    name="awoooi_learning_skip_total",
    documentation="Actions skipped due to low success rate",
    labelnames=("action",),
)
|
|
|
|
|
|
# =============================================================================
|
|
# Helper Functions
|
|
# =============================================================================
|
|
|
|
def record_webhook_request(source: str, status: str, latency: float) -> None:
    """Record the metrics for a single webhook request.

    Increments the webhook request counter and observes the latency in the
    webhook latency histogram.

    Args:
        source: Value for the ``source`` label on both metrics.
        status: Value for the ``status`` label on the request counter.
        latency: Value observed into the latency histogram (the metric is
            declared in seconds).
    """
    request_counter = WEBHOOK_REQUESTS_TOTAL.labels(source=source, status=status)
    request_counter.inc()

    latency_histogram = WEBHOOK_LATENCY_HISTOGRAM.labels(source=source)
    latency_histogram.observe(latency)
|
|
|
|
|
|
def record_alert_processed(source: str, severity: str, outcome: str) -> None:
    """Record that one alert has been processed.

    Increments the processed-alerts counter for the given label combination.

    Args:
        source: Value for the ``source`` label.
        severity: Value for the ``severity`` label.
        outcome: Value for the ``outcome`` label.
    """
    label_values = {"source": source, "severity": severity, "outcome": outcome}
    ALERT_PROCESSED_TOTAL.labels(**label_values).inc()
|
|
|
|
|
|
def record_telegram_notification(source: str, success: bool) -> None:
    """Record the result of one Telegram notification.

    Args:
        source: Value for the ``source`` label.
        success: Truthy values are counted under status ``"success"``,
            falsy values under ``"failed"``.
    """
    if success:
        status = "success"
    else:
        status = "failed"

    notification_counter = TELEGRAM_NOTIFICATIONS_TOTAL.labels(
        source=source, status=status
    )
    notification_counter.inc()
|
|
|
|
|
|
def record_anomaly(alert_name: str, service: str, frequency_24h: int, escalation_level: str | None) -> None:
    """Record anomaly metrics: occurrence, current 24h frequency, escalation.

    Args:
        alert_name: Value for the ``alert_name`` label on the recorded counter.
        service: Value for the ``service`` label on the recorded counter.
        frequency_24h: Current 24h frequency; published to the anomaly
            frequency gauge. ``None`` is tolerated and leaves the gauge
            untouched.
        escalation_level: Value for the ``level`` label on the escalation
            counter. When falsy (``None`` or empty), no escalation is counted.
    """
    ANOMALY_RECORDED_TOTAL.labels(alert_name=alert_name, service=service).inc()

    # Publish the 24h frequency; without this the parameter is dead and the
    # gauge is never populated by this helper.
    # NOTE(review): the "<alert_name>:<service>" anomaly_key format is an
    # assumption -- confirm it matches the key used by the anomaly counter
    # and any dashboards before relying on it.
    if frequency_24h is not None:
        anomaly_key = f"{alert_name}:{service}"
        ANOMALY_FREQUENCY_GAUGE.labels(anomaly_key=anomaly_key).set(frequency_24h)

    if escalation_level:
        ANOMALY_ESCALATION_TOTAL.labels(level=escalation_level).inc()
|
|
|
|
|
|
def record_auto_repair(action: str, tier: int, success: bool) -> None:
    """Record one auto repair attempt.

    Args:
        action: Value for the ``action`` label.
        tier: Tier of the attempt; converted with ``str()`` for the ``tier``
            label.
        success: Truthy values are counted under outcome ``"success"``,
            falsy values under ``"failed"``.
    """
    if success:
        outcome = "success"
    else:
        outcome = "failed"

    tier_label = str(tier)
    attempt_counter = AUTO_REPAIR_ATTEMPTS_TOTAL.labels(
        action=action, tier=tier_label, outcome=outcome
    )
    attempt_counter.inc()
|
|
|
|
|
|
def record_alert_chain_success(source: str) -> None:
    """Record a successful alert chain completion for ``source``.

    Sets the last-success gauge to the current ``time.time()`` value and sets
    the health gauge to 1.

    Args:
        source: Value for the ``source`` label on both gauges.
    """
    import time

    completed_at = time.time()
    ALERT_CHAIN_LAST_SUCCESS.labels(source=source).set(completed_at)

    health_gauge = ALERT_CHAIN_HEALTHY.labels(source=source)
    health_gauge.set(1)
|
|
|
|
|
|
def record_alert_chain_failure(source: str) -> None:
    """Record an alert chain failure for ``source``.

    Sets the health gauge to 0; the last-success timestamp gauge is not
    modified.

    Args:
        source: Value for the ``source`` label on the health gauge.
    """
    health_gauge = ALERT_CHAIN_HEALTHY.labels(source=source)
    health_gauge.set(0)
|