""" AWOOOI Alert Chain Metrics =========================== ADR-037 Wave A.5: 告警鏈路 Prometheus 指標 用於監控告警鏈路健康狀態: - Webhook 請求計數與延遲 - 告警處理成功率 - 異常頻率統計 版本: v1.0 建立: 2026-03-29 (台北時區) 建立者: Claude Code (Phase 21 ADR-037) """ from prometheus_client import Counter, Gauge, Histogram # ============================================================================= # Webhook Metrics (告警來源: Alertmanager/Sentry/SignOz) # ============================================================================= WEBHOOK_REQUESTS_TOTAL = Counter( "awoooi_webhook_requests_total", "Total webhook requests received", ["source", "status"], # source: alertmanager/sentry/signoz, status: success/error ) WEBHOOK_LATENCY_HISTOGRAM = Histogram( "awoooi_webhook_latency_seconds", "Webhook processing latency in seconds", ["source"], buckets=[0.1, 0.5, 1, 2, 5, 10, 30], ) # ============================================================================= # Alert Processing Metrics (告警處理) # ============================================================================= ALERT_PROCESSED_TOTAL = Counter( "awoooi_alerts_processed_total", "Total alerts processed", ["source", "severity", "outcome"], # outcome: incident_created/deduped/ignored ) INCIDENT_CREATED_TOTAL = Counter( "awoooi_incidents_created_total", "Total incidents created from alerts", ["source", "severity"], ) TELEGRAM_NOTIFICATIONS_TOTAL = Counter( "awoooi_telegram_notifications_total", "Total Telegram notifications sent", ["source", "status"], # status: success/failed ) # ============================================================================= # Anomaly Counter Metrics (ADR-037) # ============================================================================= ANOMALY_RECORDED_TOTAL = Counter( "awoooi_anomaly_recorded_total", "Total anomalies recorded to counter", ["alert_name", "service"], ) ANOMALY_ESCALATION_TOTAL = Counter( "awoooi_anomaly_escalation_total", "Total anomaly escalations", ["level"], # level: REPEAT/ESCALATE/PERMANENT_FIX ) 
# One series per anomaly key, holding the current 24h frequency.
ANOMALY_FREQUENCY_GAUGE = Gauge(
    "awoooi_anomaly_frequency_24h",
    "Current 24h anomaly frequency",
    labelnames=["anomaly_key"],
)

# =============================================================================
# Auto Repair Metrics
# =============================================================================

# Labels -- outcome: success/failed/skipped
AUTO_REPAIR_ATTEMPTS_TOTAL = Counter(
    "awoooi_auto_repair_attempts_total",
    "Total auto repair attempts",
    labelnames=["action", "tier", "outcome"],
)

AUTO_REPAIR_SUCCESS_RATE = Gauge(
    "awoooi_auto_repair_success_rate",
    "Auto repair success rate by action",
    labelnames=["action"],
)

# =============================================================================
# Alert Chain Health Metrics
# =============================================================================

ALERT_CHAIN_LAST_SUCCESS = Gauge(
    "awoooi_alert_chain_last_success_timestamp",
    "Last successful alert chain completion timestamp",
    labelnames=["source"],
)

ALERT_CHAIN_HEALTHY = Gauge(
    "awoooi_alert_chain_healthy",
    "Alert chain health status (1=healthy, 0=unhealthy)",
    labelnames=["source"],
)

# =============================================================================
# Sentry Comment Metrics
# =============================================================================

# Labels -- status: success/failed/skipped
SENTRY_COMMENT_TOTAL = Counter(
    "awoooi_sentry_comment_total",
    "Total Sentry comments posted",
    labelnames=["status"],
)

# =============================================================================
# Learning Service Metrics (ADR-037 Phase G)
# =============================================================================

LEARNING_SKIP_TOTAL = Counter(
    "awoooi_learning_skip_total",
    "Actions skipped due to low success rate",
    labelnames=["action"],
)

# =============================================================================
# Ollama disaster-recovery metrics (P2.3, 2026-04-26 Taipei time zone)
# Author: Claude Sonnet 4.6 (tool-expert, P2.3)
#
# Matching alert rules: ops/monitoring/ollama_health_rules.yaml
#
# Used by:
#   - ollama_failover_manager.py: OLLAMA_FAILOVER_TRIGGERED_TOTAL,
#     AI_ROUTER_PROVIDER_TOTAL
#   - ollama_auto_recovery.py: OLLAMA_RECOVERY_TRIGGERED_TOTAL
#   - ollama_health_monitor.py: OLLAMA_HEALTH_STATUS
#   - main.py lifespan / background task: GEMINI_DAILY_CALL_COUNT,
#     GEMINI_DAILY_QUOTA
#
# Backlog (to be added separately once designed):
#   - ollama_inference_duration_seconds (Histogram) — needs an observe() call
#     inside _check_inference()
#   - post_execution_verification_failed_total / _total — needs to be added
#     in auto_repair_service.py
# =============================================================================

OLLAMA_FAILOVER_TRIGGERED_TOTAL = Counter(
    "ollama_failover_triggered_total",
    "Ollama failover events (primary switched away from ollama_111)",
    labelnames=["from_provider", "to_provider"],
)

OLLAMA_RECOVERY_TRIGGERED_TOTAL = Counter(
    "ollama_recovery_triggered_total",
    "Ollama auto-recovery events (primary switched back to ollama_111)",
    labelnames=["from_provider"],
)

# Labels -- host: "111" or "188"
OLLAMA_HEALTH_STATUS = Gauge(
    "ollama_health_status",
    "Ollama instance health (1=healthy, 0=not_healthy/offline)",
    labelnames=["host"],
)

# Unlabelled gauge: a single 1/0 flag.
OLLAMA_CURRENT_PRIMARY_IS_OLLAMA = Gauge(
    "ollama_current_primary_is_ollama",
    "Whether the current primary AI provider is ollama_111 (1=yes, 0=no)",
)

AI_ROUTER_PROVIDER_TOTAL = Counter(
    "ai_router_selected_provider_total",
    "AI router provider selection count (all routing decisions)",
    labelnames=["provider"],
)

GEMINI_DAILY_CALL_COUNT = Gauge(
    "gemini_daily_call_count",
    "Gemini API calls made today (read from Redis ollama:gemini_daily_count:{date})",
)

GEMINI_DAILY_QUOTA = Gauge(
    "gemini_daily_quota",
    "Gemini API daily call quota (from settings.GEMINI_DAILY_QUOTA)",
)

# =============================================================================
# DIAGNOSE Fallback Metrics (A2 INC-20260425, 2026-04-27 Taipei time zone)
# Author: Claude Sonnet 4.6 (fullstack-engineer, A2)
#
# Background: after the INC-20260425 NIM timeout, the fallback to Ollama on
#   CPU took 238s and caused a second timeout. The commander approved the
#   combined A+B fix; A2 removes Ollama and adds a fallback-count metric.
#   Threshold alerts are defined by a separate Prometheus rule (out of scope
#   for this task).
#
# Used by:
#   - ai_router.py: record_diagnose_fallback() is called when the executor
#     fallback is triggered
#
# Alerting suggestions (for reference when designing the Prometheus rule):
#   rate(aiops_diagnose_fallback_total[1m]) > 0.5 → warning
#   rate(aiops_diagnose_fallback_total[5m]) > 0.2 → critical
# =============================================================================

AIOPS_DIAGNOSE_FALLBACK_TOTAL = Counter(
    "aiops_diagnose_fallback_total",
    "DIAGNOSE intent fallback events (from_provider → to_provider)",
    labelnames=["from_provider", "to_provider"],
)

# 2026-04-27 Claude Sonnet 4.6: F6 — counter for metric write failures
# Trigger: the diagnose_fallback_metric_failed except branch in ai_router.py
# Purpose: lets Prometheus observe whether the metric pipeline itself has a
#          problem (silent swallow upgraded to warning + counter)
# Alert reference: rate(aiops_diagnose_fallback_metric_error_total[5m]) > 0
#                  → investigate the metrics.py import chain
AIOPS_DIAGNOSE_FALLBACK_METRIC_ERROR_TOTAL = Counter(
    "aiops_diagnose_fallback_metric_error_total",
    "Failures when writing aiops_diagnose_fallback_total metric (indicates metric pipeline issue)",
)


def record_diagnose_fallback(from_provider: str, to_provider: str) -> None:
    """Count one DIAGNOSE fallback event for a (from, to) provider pair.

    2026-04-27 Claude Sonnet 4.6: A2 INC-20260425
    Caller: the DIAGNOSE fallback path of AIRouterExecutor.execute() in
    ai_router.py.

    Args:
        from_provider: Name of the provider that failed (e.g. "openclaw_nemo").
        to_provider: Name of the next provider to try (e.g. "gemini").
    """
    pair_labels = {"from_provider": from_provider, "to_provider": to_provider}
    AIOPS_DIAGNOSE_FALLBACK_TOTAL.labels(**pair_labels).inc()


# =============================================================================
# P3.1-T1 Tier-1 three-service integration metrics (2026-04-27 Taipei time zone)
# Author: Claude Sonnet 4.6 (P3.1-T1)
#
# ROLLBACK_EXECUTED_TOTAL: rollback_manager integrated into
#   auto_repair_service._verify_and_learn
# RESOURCE_RESOLVE_TOTAL: resource_resolver integrated into
#   approval_execution.execute_approved_action
# =============================================================================

ROLLBACK_EXECUTED_TOTAL = Counter(
    "rollback_executed_total",
    "K8s rollback executions triggered by PostExecutionVerifier failure",
    labelnames=["status", "reason"],
)

# Labels -- result: hit / miss / suggestion / error
RESOURCE_RESOLVE_TOTAL = Counter(
    "resource_resolve_total",
    "Resource resolver attempts in approval execution",
    labelnames=["result"],
)

# =============================================================================
# ADR-100 / ADR-104 Flywheel Emitter Metrics
# =============================================================================

PLAYBOOK_GENERATION_TOTAL = Counter(
    "playbook_generation_total",
    "LLM Playbook generation and governance outcomes",
    labelnames=["outcome", "source"],
)

# NOTE(review): this is a Gauge whose name ends in `_total`, a suffix that
# Prometheus naming conventions use for counters. Renaming would change the
# exported series name, so it is deliberately left as-is here.
PLAYBOOK_STATUS_TOTAL = Gauge(
    "playbook_status_total",
    "Playbook lifecycle status observations from generation/governance",
    labelnames=["status", "source"],
)


def record_playbook_generation(outcome: str, source: str) -> None:
    """Increment the Playbook generation/governance counter for one outcome."""
    generation_counter = PLAYBOOK_GENERATION_TOTAL.labels(
        outcome=outcome,
        source=source,
    )
    generation_counter.inc()


def observe_playbook_status(status: str, source: str) -> None:
    """Mark the given Playbook lifecycle status as observed for a source.

    The (status, source) series is set to 1. This function never writes 0,
    so a series observed earlier keeps its value unless something else
    resets it.
    """
    status_gauge = PLAYBOOK_STATUS_TOTAL.labels(status=status, source=source)
    status_gauge.set(1)


# =============================================================================
# Solver MCP Registry Metrics (H2, 2026-04-27 Taipei time zone)
# Author: Claude Sonnet 4.6 (fullstack-engineer, B1 Fix Round)
#
# H2+vuln-V3+V4 — registry health monitoring
# Attack scenario: when the registry fails to load, the LLM can invent an
#   arbitrary mcp_tool → bypasses the allowlist
# Mitigation: on load failure, immediately set status=error so that a
#   Prometheus alert can be configured on it
#
# Used by:
#   - solver_agent.py: updated after _load_mcp_tool_registry() is called
#
# status values:
#   ok    — load succeeded, registry contains ≥ 1 action
#   empty — load succeeded but the registry is empty (empty YAML)
#   error — load failed (file missing, malformed YAML, etc.)
# =============================================================================

# Labels -- status: ok / empty / error
SOLVER_MCP_REGISTRY_LOADED = Gauge(
    "aiops_solver_mcp_registry_loaded",
    "MCP registry load status for Solver Agent (1=active status, 0=inactive)",
    ["status"],
)


# =============================================================================
# Helper Functions
# =============================================================================


def record_webhook_request(source: str, status: str, latency: float) -> None:
    """Record one webhook request: bump the request counter and observe latency.

    Args:
        source: Webhook source label.
        status: Request status label.
        latency: Value observed on the latency histogram (the metric is
            declared in seconds).
    """
    WEBHOOK_REQUESTS_TOTAL.labels(source=source, status=status).inc()
    WEBHOOK_LATENCY_HISTOGRAM.labels(source=source).observe(latency)


def record_alert_processed(source: str, severity: str, outcome: str) -> None:
    """Increment the processed-alert counter for (source, severity, outcome)."""
    ALERT_PROCESSED_TOTAL.labels(
        source=source, severity=severity, outcome=outcome
    ).inc()


def record_telegram_notification(source: str, success: bool) -> None:
    """Increment the Telegram notification counter.

    Args:
        source: Notification source label.
        success: Mapped to the ``status`` label: "success" when true,
            "failed" otherwise.
    """
    status = "success" if success else "failed"
    TELEGRAM_NOTIFICATIONS_TOTAL.labels(source=source, status=status).inc()


def record_anomaly(
    alert_name: str,
    service: str,
    frequency_24h: int,
    escalation_level: str | None,
    anomaly_key: str | None = None,
) -> None:
    """Record anomaly metrics: occurrence, optional frequency, optional escalation.

    Previously ``frequency_24h`` was accepted but never used, so this helper
    could not feed ANOMALY_FREQUENCY_GAUGE at all. The gauge is labelled by
    ``anomaly_key``, which the original signature did not carry, so it is now
    an optional argument; callers that do not pass it see unchanged behaviour.

    Args:
        alert_name: ``alert_name`` label for the anomaly counter.
        service: ``service`` label for the anomaly counter.
        frequency_24h: Current 24h frequency; written to the frequency gauge
            only when ``anomaly_key`` is provided.
        escalation_level: When truthy, the escalation counter is incremented
            with this value as the ``level`` label.
        anomaly_key: Optional ``anomaly_key`` label value for the frequency
            gauge. When omitted or empty, the gauge is left untouched.
    """
    ANOMALY_RECORDED_TOTAL.labels(alert_name=alert_name, service=service).inc()

    # The key format is owned by the caller; it is not derived here to avoid
    # creating series that disagree with keys used elsewhere.
    if anomaly_key:
        ANOMALY_FREQUENCY_GAUGE.labels(anomaly_key=anomaly_key).set(frequency_24h)

    if escalation_level:
        ANOMALY_ESCALATION_TOTAL.labels(level=escalation_level).inc()


def record_auto_repair(action: str, tier: int, success: bool) -> None:
    """Increment the auto-repair attempt counter.

    Args:
        action: Repair action label.
        tier: Repair tier; converted to ``str`` for use as a label value.
        success: Mapped to the ``outcome`` label: "success" when true,
            "failed" otherwise.
    """
    outcome = "success" if success else "failed"
    AUTO_REPAIR_ATTEMPTS_TOTAL.labels(
        action=action, tier=str(tier), outcome=outcome
    ).inc()


def record_alert_chain_success(source: str) -> None:
    """Record a successful alert chain run for ``source``.

    Stores the current ``time.time()`` value as the last-success timestamp
    and sets the health gauge to 1.
    """
    # Kept as a function-local import, as in the original code.
    import time

    ALERT_CHAIN_LAST_SUCCESS.labels(source=source).set(time.time())
    ALERT_CHAIN_HEALTHY.labels(source=source).set(1)


def record_alert_chain_failure(source: str) -> None:
    """Record an alert chain failure for ``source`` by setting health to 0."""
    ALERT_CHAIN_HEALTHY.labels(source=source).set(0)