Some checks failed
CD Pipeline / tests (push) Successful in 1m28s
Code Review / ai-code-review (push) Successful in 29s
Type Sync Check / check-type-sync (push) Failing after 2m41s
CD Pipeline / build-and-deploy (push) Successful in 8m40s
CD Pipeline / post-deploy-checks (push) Successful in 3m10s
355 lines
12 KiB
Python
355 lines
12 KiB
Python
"""
AWOOOI Alert Chain Metrics
===========================
ADR-037 Wave A.5: Prometheus metrics for the alert chain

Used to monitor the health of the alert chain:
- Webhook request counts and latency
- Alert processing success rate
- Anomaly frequency statistics

Version: v1.0
Created: 2026-03-29 (Taipei time zone)
Author: Claude Code (Phase 21 ADR-037)
"""
|
||
|
||
from prometheus_client import Counter, Gauge, Histogram
|
||
|
||
# =============================================================================
# Webhook Metrics (alert sources: Alertmanager/Sentry/SignOz)
# =============================================================================

# Labels -- source: alertmanager/sentry/signoz, status: success/error
WEBHOOK_REQUESTS_TOTAL = Counter(
    name="awoooi_webhook_requests_total",
    documentation="Total webhook requests received",
    labelnames=("source", "status"),
)

# Per-source latency distribution; bucket upper bounds are in seconds.
WEBHOOK_LATENCY_HISTOGRAM = Histogram(
    name="awoooi_webhook_latency_seconds",
    documentation="Webhook processing latency in seconds",
    labelnames=("source",),
    buckets=(0.1, 0.5, 1, 2, 5, 10, 30),
)
|
||
|
||
# =============================================================================
# Alert Processing Metrics (alert processing)
# =============================================================================

# Labels -- outcome: incident_created/deduped/ignored
ALERT_PROCESSED_TOTAL = Counter(
    name="awoooi_alerts_processed_total",
    documentation="Total alerts processed",
    labelnames=("source", "severity", "outcome"),
)

INCIDENT_CREATED_TOTAL = Counter(
    name="awoooi_incidents_created_total",
    documentation="Total incidents created from alerts",
    labelnames=("source", "severity"),
)

# Labels -- status: success/failed
TELEGRAM_NOTIFICATIONS_TOTAL = Counter(
    name="awoooi_telegram_notifications_total",
    documentation="Total Telegram notifications sent",
    labelnames=("source", "status"),
)
|
||
|
||
# =============================================================================
# Anomaly Counter Metrics (ADR-037)
# =============================================================================

ANOMALY_RECORDED_TOTAL = Counter(
    name="awoooi_anomaly_recorded_total",
    documentation="Total anomalies recorded to counter",
    labelnames=("alert_name", "service"),
)

# Labels -- level: REPEAT/ESCALATE/PERMANENT_FIX
ANOMALY_ESCALATION_TOTAL = Counter(
    name="awoooi_anomaly_escalation_total",
    documentation="Total anomaly escalations",
    labelnames=("level",),
)

ANOMALY_FREQUENCY_GAUGE = Gauge(
    name="awoooi_anomaly_frequency_24h",
    documentation="Current 24h anomaly frequency",
    labelnames=("anomaly_key",),
)
|
||
|
||
# =============================================================================
# Auto Repair Metrics
# =============================================================================

# Labels -- outcome: success/failed/skipped
AUTO_REPAIR_ATTEMPTS_TOTAL = Counter(
    name="awoooi_auto_repair_attempts_total",
    documentation="Total auto repair attempts",
    labelnames=("action", "tier", "outcome"),
)

AUTO_REPAIR_SUCCESS_RATE = Gauge(
    name="awoooi_auto_repair_success_rate",
    documentation="Auto repair success rate by action",
    labelnames=("action",),
)
|
||
|
||
# =============================================================================
# Alert Chain Health Metrics
# =============================================================================

ALERT_CHAIN_LAST_SUCCESS = Gauge(
    name="awoooi_alert_chain_last_success_timestamp",
    documentation="Last successful alert chain completion timestamp",
    labelnames=("source",),
)

ALERT_CHAIN_HEALTHY = Gauge(
    name="awoooi_alert_chain_healthy",
    documentation="Alert chain health status (1=healthy, 0=unhealthy)",
    labelnames=("source",),
)
|
||
|
||
# =============================================================================
# Sentry Comment Metrics
# =============================================================================

# Labels -- status: success/failed/skipped
SENTRY_COMMENT_TOTAL = Counter(
    name="awoooi_sentry_comment_total",
    documentation="Total Sentry comments posted",
    labelnames=("status",),
)
|
||
|
||
# =============================================================================
# Learning Service Metrics (ADR-037 Phase G)
# =============================================================================

LEARNING_SKIP_TOTAL = Counter(
    name="awoooi_learning_skip_total",
    documentation="Actions skipped due to low success rate",
    labelnames=("action",),
)
|
||
|
||
|
||
# =============================================================================
# Ollama disaster-recovery metrics (P2.3, 2026-04-26 Taipei time zone)
# Author: Claude Sonnet 4.6 (tool-expert, P2.3)
#
# Corresponding alert rules: ops/monitoring/ollama_health_rules.yaml
#
# Used in:
#   - ollama_failover_manager.py: OLLAMA_FAILOVER_TRIGGERED_TOTAL, AI_ROUTER_PROVIDER_TOTAL
#   - ollama_auto_recovery.py: OLLAMA_RECOVERY_TRIGGERED_TOTAL
#   - ollama_health_monitor.py: OLLAMA_HEALTH_STATUS
#   - main.py lifespan / background task: GEMINI_DAILY_CALL_COUNT, GEMINI_DAILY_QUOTA
#
# Backlog (to be added separately once designed):
#   - ollama_inference_duration_seconds (Histogram) — needs an observe() call inside _check_inference()
#   - post_execution_verification_failed_total / _total — needs to be added in auto_repair_service.py
# =============================================================================

OLLAMA_FAILOVER_TRIGGERED_TOTAL = Counter(
    name="ollama_failover_triggered_total",
    documentation="Ollama failover events (primary switched away from ollama_111)",
    labelnames=("from_provider", "to_provider"),
)

OLLAMA_RECOVERY_TRIGGERED_TOTAL = Counter(
    name="ollama_recovery_triggered_total",
    documentation="Ollama auto-recovery events (primary switched back to ollama_111)",
    labelnames=("from_provider",),
)

# Labels -- host: "111" or "188"
OLLAMA_HEALTH_STATUS = Gauge(
    name="ollama_health_status",
    documentation="Ollama instance health (1=healthy, 0=not_healthy/offline)",
    labelnames=("host",),
)

OLLAMA_CURRENT_PRIMARY_IS_OLLAMA = Gauge(
    name="ollama_current_primary_is_ollama",
    documentation="Whether the current primary AI provider is ollama_111 (1=yes, 0=no)",
)

AI_ROUTER_PROVIDER_TOTAL = Counter(
    name="ai_router_selected_provider_total",
    documentation="AI router provider selection count (all routing decisions)",
    labelnames=("provider",),
)

GEMINI_DAILY_CALL_COUNT = Gauge(
    name="gemini_daily_call_count",
    documentation="Gemini API calls made today (read from Redis ollama:gemini_daily_count:{date})",
)

GEMINI_DAILY_QUOTA = Gauge(
    name="gemini_daily_quota",
    documentation="Gemini API daily call quota (from settings.GEMINI_DAILY_QUOTA)",
)
|
||
|
||
# =============================================================================
# DIAGNOSE Fallback Metrics (A2 INC-20260425, 2026-04-27 Taipei time zone)
# Author: Claude Sonnet 4.6 (fullstack-engineer, A2)
#
# Background: in INC-20260425, after a NIM timeout the fallback to Ollama on
#       CPU took 238s and caused a second timeout.
#       The commander approved the combined A+B fix; A2 removes Ollama and adds
#       a fallback counter metric. Threshold alerts are defined by a separate
#       Prometheus rule (out of scope for this task).
#
# Used in:
#   - ai_router.py: record_diagnose_fallback() is called when the executor fallback triggers
#
# Suggested alerts (for reference when designing the Prometheus rule):
#   rate(aiops_diagnose_fallback_total[1m]) > 0.5  → warning
#   rate(aiops_diagnose_fallback_total[5m]) > 0.2  → critical
# =============================================================================

AIOPS_DIAGNOSE_FALLBACK_TOTAL = Counter(
    name="aiops_diagnose_fallback_total",
    documentation="DIAGNOSE intent fallback events (from_provider → to_provider)",
    labelnames=("from_provider", "to_provider"),
)

# 2026-04-27 Claude Sonnet 4.6: F6 — counter for metric write failures
# Trigger: the diagnose_fallback_metric_failed except branch in ai_router.py
# Purpose: lets Prometheus observe whether the metric pipeline has problems
#          (silent swallow upgraded to warning + counter)
# Alert reference: rate(aiops_diagnose_fallback_metric_error_total[5m]) > 0
#                  → investigate the metrics.py import chain
AIOPS_DIAGNOSE_FALLBACK_METRIC_ERROR_TOTAL = Counter(
    name="aiops_diagnose_fallback_metric_error_total",
    documentation="Failures when writing aiops_diagnose_fallback_total metric (indicates metric pipeline issue)",
)
|
||
|
||
|
||
def record_diagnose_fallback(from_provider: str, to_provider: str) -> None:
    """Record a DIAGNOSE fallback event (counted per provider pair).

    2026-04-27 Claude Sonnet 4.6: A2 INC-20260425
    Caller: the DIAGNOSE fallback path of AIRouterExecutor.execute() in
    ai_router.py.

    Args:
        from_provider: Name of the provider that failed (e.g. "openclaw_nemo").
        to_provider: Name of the next provider to try (e.g. "gemini").
    """
    # Resolve the labelled child first, then bump it by one.
    pair_counter = AIOPS_DIAGNOSE_FALLBACK_TOTAL.labels(
        from_provider=from_provider,
        to_provider=to_provider,
    )
    pair_counter.inc()
|
||
|
||
|
||
# =============================================================================
# P3.1-T1 Tier-1 three-service integration metrics (2026-04-27 Taipei time zone)
# Author: Claude Sonnet 4.6 (P3.1-T1)
#
# ROLLBACK_EXECUTED_TOTAL: rollback_manager integrated into auto_repair_service._verify_and_learn
# RESOURCE_RESOLVE_TOTAL: resource_resolver integrated into approval_execution.execute_approved_action
# =============================================================================

ROLLBACK_EXECUTED_TOTAL = Counter(
    name="rollback_executed_total",
    documentation="K8s rollback executions triggered by PostExecutionVerifier failure",
    labelnames=("status", "reason"),
)

# Labels -- result: hit / miss / suggestion / error
RESOURCE_RESOLVE_TOTAL = Counter(
    name="resource_resolve_total",
    documentation="Resource resolver attempts in approval execution",
    labelnames=("result",),
)
|
||
|
||
# =============================================================================
# ADR-100 / ADR-104 Flywheel Emitter Metrics
# =============================================================================

PLAYBOOK_GENERATION_TOTAL = Counter(
    name="playbook_generation_total",
    documentation="LLM Playbook generation and governance outcomes",
    labelnames=("outcome", "source"),
)

# NOTE: this is a Gauge even though the exported name ends in "_total".
PLAYBOOK_STATUS_TOTAL = Gauge(
    name="playbook_status_total",
    documentation="Playbook lifecycle status observations from generation/governance",
    labelnames=("status", "source"),
)
|
||
|
||
|
||
def record_playbook_generation(outcome: str, source: str) -> None:
    """Increment the Playbook generation/governance counter by one.

    Args:
        outcome: Value for the ``outcome`` label.
        source: Value for the ``source`` label.
    """
    generation_counter = PLAYBOOK_GENERATION_TOTAL.labels(
        outcome=outcome,
        source=source,
    )
    generation_counter.inc()
|
||
|
||
|
||
def observe_playbook_status(status: str, source: str) -> None:
    """Mark a Playbook lifecycle status as observed for the given source.

    Sets the ``(status, source)`` series of ``PLAYBOOK_STATUS_TOTAL`` to 1.

    Args:
        status: Value for the ``status`` label.
        source: Value for the ``source`` label.
    """
    # NOTE(review): this only ever writes 1; nothing here resets series for
    # statuses observed earlier. Whether another component clears them is not
    # visible from this function — confirm before relying on "latest status".
    status_gauge = PLAYBOOK_STATUS_TOTAL.labels(status=status, source=source)
    status_gauge.set(1)
|
||
|
||
# =============================================================================
# Solver MCP Registry Metrics (H2, 2026-04-27 Taipei time zone)
# Author: Claude Sonnet 4.6 (fullstack-engineer, B1 Fix Round)
#
# H2+vuln-V3+V4 — registry health monitoring
# Attack scenario: when the registry fails to load, the LLM can invent
#                  arbitrary mcp_tool values → bypassing the allowlist
# Mitigation: on load failure immediately set status=error so that a
#             Prometheus alert can be configured
#
# Used in:
#   - solver_agent.py: updated after _load_mcp_tool_registry() is called
#
# status values:
#   ok    — load succeeded, registry contains ≥ 1 action
#   empty — load succeeded but the registry is empty (empty YAML)
#   error — load failed (file missing, malformed YAML, etc.)
# =============================================================================

# Labels -- status: ok / empty / error
SOLVER_MCP_REGISTRY_LOADED = Gauge(
    name="aiops_solver_mcp_registry_loaded",
    documentation="MCP registry load status for Solver Agent (1=active status, 0=inactive)",
    labelnames=("status",),
)
|
||
|
||
|
||
# =============================================================================
|
||
# Helper Functions
|
||
# =============================================================================
|
||
|
||
def record_webhook_request(source: str, status: str, latency: float) -> None:
    """Record the metrics for one webhook request.

    Increments the request counter for ``(source, status)`` and then records
    ``latency`` on the per-source latency histogram.

    Args:
        source: Value for the ``source`` label.
        status: Value for the ``status`` label.
        latency: Value to observe; the histogram is declared in seconds.
    """
    request_counter = WEBHOOK_REQUESTS_TOTAL.labels(source=source, status=status)
    request_counter.inc()

    latency_histogram = WEBHOOK_LATENCY_HISTOGRAM.labels(source=source)
    latency_histogram.observe(latency)
|
||
|
||
|
||
def record_alert_processed(source: str, severity: str, outcome: str) -> None:
    """Record the metric for one processed alert.

    Args:
        source: Value for the ``source`` label.
        severity: Value for the ``severity`` label.
        outcome: Value for the ``outcome`` label.
    """
    label_values = {"source": source, "severity": severity, "outcome": outcome}
    ALERT_PROCESSED_TOTAL.labels(**label_values).inc()
|
||
|
||
|
||
def record_telegram_notification(source: str, success: bool) -> None:
    """Record the metric for one Telegram notification attempt.

    Args:
        source: Value for the ``source`` label.
        success: Truthy maps to status "success", falsy to "failed".
    """
    if success:
        status = "success"
    else:
        status = "failed"

    notification_counter = TELEGRAM_NOTIFICATIONS_TOTAL.labels(
        source=source,
        status=status,
    )
    notification_counter.inc()
|
||
|
||
|
||
def record_anomaly(
    alert_name: str,
    service: str,
    frequency_24h: int,
    escalation_level: str | None,
) -> None:
    """Record anomaly metrics.

    Always increments the recorded-anomaly counter for
    ``(alert_name, service)``; additionally increments the escalation counter
    when ``escalation_level`` is truthy.

    Args:
        alert_name: Value for the ``alert_name`` label.
        service: Value for the ``service`` label.
        frequency_24h: Accepted for interface compatibility; not used by this
            function.
        escalation_level: Value for the ``level`` label; ``None`` or an empty
            string means no escalation is counted.
    """
    # NOTE(review): frequency_24h is never read and ANOMALY_FREQUENCY_GAUGE is
    # not updated here. If the gauge is meant to be fed from this helper, the
    # anomaly_key format needs to be confirmed first.
    ANOMALY_RECORDED_TOTAL.labels(alert_name=alert_name, service=service).inc()

    if not escalation_level:
        return

    ANOMALY_ESCALATION_TOTAL.labels(level=escalation_level).inc()
|
||
|
||
|
||
def record_auto_repair(action: str, tier: int, success: bool) -> None:
    """Record the metric for one auto repair attempt.

    Args:
        action: Value for the ``action`` label.
        tier: Repair tier; converted with ``str()`` for the ``tier`` label.
        success: Truthy maps to outcome "success", falsy to "failed".
    """
    if success:
        outcome = "success"
    else:
        outcome = "failed"

    label_values = {"action": action, "tier": str(tier), "outcome": outcome}
    AUTO_REPAIR_ATTEMPTS_TOTAL.labels(**label_values).inc()
|
||
|
||
|
||
def record_alert_chain_success(source: str) -> None:
    """Record a successful completion of the alert chain.

    Stores the current ``time.time()`` value as the last-success timestamp for
    ``source`` and sets the health gauge for ``source`` to 1.

    Args:
        source: Value for the ``source`` label.
    """
    import time

    last_success_gauge = ALERT_CHAIN_LAST_SUCCESS.labels(source=source)
    last_success_gauge.set(time.time())

    health_gauge = ALERT_CHAIN_HEALTHY.labels(source=source)
    health_gauge.set(1)
|
||
|
||
|
||
def record_alert_chain_failure(source: str) -> None:
    """Record an alert chain failure by setting the health gauge to 0.

    Args:
        source: Value for the ``source`` label.
    """
    health_gauge = ALERT_CHAIN_HEALTHY.labels(source=source)
    health_gauge.set(0)
|