Files
awoooi/apps/api/src/core/metrics.py
Your Name 6e04fe9c8a
Some checks failed
CD Pipeline / tests (push) Successful in 1m28s
Code Review / ai-code-review (push) Successful in 29s
Type Sync Check / check-type-sync (push) Failing after 2m41s
CD Pipeline / build-and-deploy (push) Successful in 8m40s
CD Pipeline / post-deploy-checks (push) Successful in 3m10s
feat(playbook): generate drafts with local llm
2026-04-30 23:04:58 +08:00

355 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
AWOOOI Alert Chain Metrics
===========================
ADR-037 Wave A.5: 告警鏈路 Prometheus 指標
用於監控告警鏈路健康狀態:
- Webhook 請求計數與延遲
- 告警處理成功率
- 異常頻率統計
版本: v1.0
建立: 2026-03-29 (台北時區)
建立者: Claude Code (Phase 21 ADR-037)
"""
from prometheus_client import Counter, Gauge, Histogram
# =============================================================================
# Webhook Metrics (alert sources: Alertmanager / Sentry / SignOz)
# =============================================================================

# Number of webhook requests received, split by sender and result.
WEBHOOK_REQUESTS_TOTAL = Counter(
    name="awoooi_webhook_requests_total",
    documentation="Total webhook requests received",
    # source: alertmanager/sentry/signoz, status: success/error
    labelnames=("source", "status"),
)

# Webhook processing latency (seconds) per sender, with explicit bucket bounds.
WEBHOOK_LATENCY_HISTOGRAM = Histogram(
    name="awoooi_webhook_latency_seconds",
    documentation="Webhook processing latency in seconds",
    labelnames=("source",),
    buckets=(0.1, 0.5, 1, 2, 5, 10, 30),
)
# =============================================================================
# Alert Processing Metrics
# =============================================================================

# Alerts processed, labelled by source, severity and processing outcome.
ALERT_PROCESSED_TOTAL = Counter(
    name="awoooi_alerts_processed_total",
    documentation="Total alerts processed",
    # outcome: incident_created/deduped/ignored
    labelnames=("source", "severity", "outcome"),
)

# Incidents created from alerts, labelled by source and severity.
INCIDENT_CREATED_TOTAL = Counter(
    name="awoooi_incidents_created_total",
    documentation="Total incidents created from alerts",
    labelnames=("source", "severity"),
)

# Telegram notifications, labelled by source and delivery status.
TELEGRAM_NOTIFICATIONS_TOTAL = Counter(
    name="awoooi_telegram_notifications_total",
    documentation="Total Telegram notifications sent",
    # status: success/failed
    labelnames=("source", "status"),
)
# =============================================================================
# Anomaly Counter Metrics (ADR-037)
# =============================================================================

# Anomalies recorded, labelled by alert name and service.
ANOMALY_RECORDED_TOTAL = Counter(
    name="awoooi_anomaly_recorded_total",
    documentation="Total anomalies recorded to counter",
    labelnames=("alert_name", "service"),
)

# Anomaly escalations, labelled by escalation level.
ANOMALY_ESCALATION_TOTAL = Counter(
    name="awoooi_anomaly_escalation_total",
    documentation="Total anomaly escalations",
    # level: REPEAT/ESCALATE/PERMANENT_FIX
    labelnames=("level",),
)

# Current 24h anomaly frequency per anomaly key.
# NOTE(review): none of the recording helpers reviewed alongside this
# definition writes to this gauge — confirm it is updated elsewhere.
ANOMALY_FREQUENCY_GAUGE = Gauge(
    name="awoooi_anomaly_frequency_24h",
    documentation="Current 24h anomaly frequency",
    labelnames=("anomaly_key",),
)
# =============================================================================
# Auto Repair Metrics
# =============================================================================

# Auto repair attempts, labelled by action, tier and outcome.
AUTO_REPAIR_ATTEMPTS_TOTAL = Counter(
    name="awoooi_auto_repair_attempts_total",
    documentation="Total auto repair attempts",
    # outcome: success/failed/skipped
    labelnames=("action", "tier", "outcome"),
)

# Auto repair success rate, one series per action.
AUTO_REPAIR_SUCCESS_RATE = Gauge(
    name="awoooi_auto_repair_success_rate",
    documentation="Auto repair success rate by action",
    labelnames=("action",),
)
# =============================================================================
# Alert Chain Health Metrics
# =============================================================================

# Timestamp of the last successful alert chain completion, per source.
ALERT_CHAIN_LAST_SUCCESS = Gauge(
    name="awoooi_alert_chain_last_success_timestamp",
    documentation="Last successful alert chain completion timestamp",
    labelnames=("source",),
)

# Health flag per source: 1 means healthy, 0 means unhealthy.
ALERT_CHAIN_HEALTHY = Gauge(
    name="awoooi_alert_chain_healthy",
    documentation="Alert chain health status (1=healthy, 0=unhealthy)",
    labelnames=("source",),
)
# =============================================================================
# Sentry Comment Metrics
# =============================================================================

# Sentry comments posted, labelled by posting status.
SENTRY_COMMENT_TOTAL = Counter(
    name="awoooi_sentry_comment_total",
    documentation="Total Sentry comments posted",
    # status: success/failed/skipped
    labelnames=("status",),
)
# =============================================================================
# Learning Service Metrics (ADR-037 Phase G)
# =============================================================================

# Actions skipped because of a low success rate, one series per action.
LEARNING_SKIP_TOTAL = Counter(
    name="awoooi_learning_skip_total",
    documentation="Actions skipped due to low success rate",
    labelnames=("action",),
)
# =============================================================================
# Ollama failover / disaster-recovery metrics (P2.3, 2026-04-26 Taipei time)
# Author: Claude Sonnet 4.6 (tool-expert, P2.3)
#
# Matching alert rules: ops/monitoring/ollama_health_rules.yaml
#
# Used by:
# - ollama_failover_manager.py: OLLAMA_FAILOVER_TRIGGERED_TOTAL, AI_ROUTER_PROVIDER_TOTAL
# - ollama_auto_recovery.py: OLLAMA_RECOVERY_TRIGGERED_TOTAL
# - ollama_health_monitor.py: OLLAMA_HEALTH_STATUS
# - main.py lifespan / background task: GEMINI_DAILY_CALL_COUNT, GEMINI_DAILY_QUOTA
#
# Backlog (to be added separately once designed):
# - ollama_inference_duration_seconds (Histogram) — needs an observe() call
#   inside _check_inference()
# - post_execution_verification_failed_total / _total — to be added in
#   auto_repair_service.py
# =============================================================================

# Failover events, labelled by the provider left and the provider switched to.
OLLAMA_FAILOVER_TRIGGERED_TOTAL = Counter(
    name="ollama_failover_triggered_total",
    documentation="Ollama failover events (primary switched away from ollama_111)",
    labelnames=("from_provider", "to_provider"),
)

# Auto-recovery events, labelled by the provider that was active before recovery.
OLLAMA_RECOVERY_TRIGGERED_TOTAL = Counter(
    name="ollama_recovery_triggered_total",
    documentation="Ollama auto-recovery events (primary switched back to ollama_111)",
    labelnames=("from_provider",),
)

# Per-host health flag.
OLLAMA_HEALTH_STATUS = Gauge(
    name="ollama_health_status",
    documentation="Ollama instance health (1=healthy, 0=not_healthy/offline)",
    # host: "111" or "188"
    labelnames=("host",),
)

# Unlabelled flag: 1 when the current primary AI provider is ollama_111.
OLLAMA_CURRENT_PRIMARY_IS_OLLAMA = Gauge(
    name="ollama_current_primary_is_ollama",
    documentation="Whether the current primary AI provider is ollama_111 (1=yes, 0=no)",
)

# Routing decisions made by the AI router, one series per selected provider.
AI_ROUTER_PROVIDER_TOTAL = Counter(
    name="ai_router_selected_provider_total",
    documentation="AI router provider selection count (all routing decisions)",
    labelnames=("provider",),
)

# Unlabelled gauge holding today's Gemini call count.
GEMINI_DAILY_CALL_COUNT = Gauge(
    name="gemini_daily_call_count",
    documentation="Gemini API calls made today (read from Redis ollama:gemini_daily_count:{date})",
)

# Unlabelled gauge holding the configured Gemini daily quota.
GEMINI_DAILY_QUOTA = Gauge(
    name="gemini_daily_quota",
    documentation="Gemini API daily call quota (from settings.GEMINI_DAILY_QUOTA)",
)
# =============================================================================
# DIAGNOSE Fallback Metrics (A2 INC-20260425, 2026-04-27 Taipei time)
# Author: Claude Sonnet 4.6 (fullstack-engineer, A2)
#
# Background: in INC-20260425, after a NIM timeout the request fell back to
# Ollama on CPU (238s), which caused a second timeout.
# The combined A+B fix was approved; A2 removes Ollama and adds a fallback
# counter metric.
# Threshold alerting is defined by a separate Prometheus rule (out of scope
# for this task).
#
# Used by:
# - ai_router.py: record_diagnose_fallback() is called when the executor
#   fallback is triggered
#
# Alerting suggestions (reference for Prometheus rule design):
# rate(aiops_diagnose_fallback_total[1m]) > 0.5 → warning
# rate(aiops_diagnose_fallback_total[5m]) > 0.2 → critical
# =============================================================================

# DIAGNOSE fallback events, one series per (from_provider, to_provider) pair.
AIOPS_DIAGNOSE_FALLBACK_TOTAL = Counter(
    name="aiops_diagnose_fallback_total",
    documentation="DIAGNOSE intent fallback events (from_provider → to_provider)",
    labelnames=("from_provider", "to_provider"),
)

# 2026-04-27 Claude Sonnet 4.6: F6 — counter for metric write failures.
# Triggered from: the diagnose_fallback_metric_failed except branch in ai_router.py
# Purpose: let Prometheus observe whether the metric pipeline itself has a
# problem (the silent swallow was upgraded to a warning plus this counter).
# Alert hint: rate(aiops_diagnose_fallback_metric_error_total[5m]) > 0
# → investigate the metrics.py import chain
AIOPS_DIAGNOSE_FALLBACK_METRIC_ERROR_TOTAL = Counter(
    name="aiops_diagnose_fallback_metric_error_total",
    documentation="Failures when writing aiops_diagnose_fallback_total metric (indicates metric pipeline issue)",
)
def record_diagnose_fallback(from_provider: str, to_provider: str) -> None:
    """Count one DIAGNOSE fallback event for a provider pair.

    2026-04-27 Claude Sonnet 4.6: A2 INC-20260425
    Caller: the DIAGNOSE fallback path of AIRouterExecutor.execute() in
    ai_router.py.

    Args:
        from_provider: Name of the provider that failed, e.g. "openclaw_nemo".
        to_provider: Name of the next provider to try, e.g. "gemini".
    """
    pair_counter = AIOPS_DIAGNOSE_FALLBACK_TOTAL.labels(
        from_provider=from_provider,
        to_provider=to_provider,
    )
    pair_counter.inc()
# =============================================================================
# P3.1-T1 Tier-1 three-service integration metrics (2026-04-27 Taipei time)
# Author: Claude Sonnet 4.6 (P3.1-T1)
#
# ROLLBACK_EXECUTED_TOTAL: rollback_manager integrated into
#   auto_repair_service._verify_and_learn
# RESOURCE_RESOLVE_TOTAL: resource_resolver integrated into
#   approval_execution.execute_approved_action
# =============================================================================

# Rollback executions, labelled by status and reason.
ROLLBACK_EXECUTED_TOTAL = Counter(
    name="rollback_executed_total",
    documentation="K8s rollback executions triggered by PostExecutionVerifier failure",
    labelnames=("status", "reason"),
)

# Resource resolver attempts, labelled by result.
RESOURCE_RESOLVE_TOTAL = Counter(
    name="resource_resolve_total",
    documentation="Resource resolver attempts in approval execution",
    # hit / miss / suggestion / error
    labelnames=("result",),
)
# =============================================================================
# ADR-100 / ADR-104 Flywheel Emitter Metrics
# =============================================================================

# Playbook generation/governance outcomes, labelled by outcome and source.
PLAYBOOK_GENERATION_TOTAL = Counter(
    name="playbook_generation_total",
    documentation="LLM Playbook generation and governance outcomes",
    labelnames=("outcome", "source"),
)

# Playbook lifecycle status observations, labelled by status and source.
# Note that this is a Gauge even though its name ends in "_total".
PLAYBOOK_STATUS_TOTAL = Gauge(
    name="playbook_status_total",
    documentation="Playbook lifecycle status observations from generation/governance",
    labelnames=("status", "source"),
)
def record_playbook_generation(outcome: str, source: str) -> None:
    """Increment the Playbook generation/governance counter by one.

    Args:
        outcome: Value for the ``outcome`` label.
        source: Value for the ``source`` label.
    """
    outcome_counter = PLAYBOOK_GENERATION_TOTAL.labels(
        outcome=outcome,
        source=source,
    )
    outcome_counter.inc()
def observe_playbook_status(status: str, source: str) -> None:
    """Mark a Playbook lifecycle status as observed for a source.

    Sets the gauge series for this (status, source) pair to 1.

    NOTE(review): this function never resets other status series to 0, so
    a previously observed status keeps reporting 1 — confirm that is intended.

    Args:
        status: Value for the ``status`` label.
        source: Value for the ``source`` label.
    """
    status_gauge = PLAYBOOK_STATUS_TOTAL.labels(status=status, source=source)
    status_gauge.set(1)
# =============================================================================
# Solver MCP Registry Metrics (H2, 2026-04-27 Taipei time)
# Author: Claude Sonnet 4.6 (fullstack-engineer, B1 Fix Round)
#
# H2+vuln-V3+V4 — registry health monitoring
# Attack scenario: if the registry fails to load, the LLM could invent an
#   arbitrary mcp_tool and bypass the allow-list.
# Mitigation: on load failure, immediately set status=error so that a
#   Prometheus alert can be configured.
#
# Used by:
# - solver_agent.py: updated after _load_mcp_tool_registry() is called
#
# status values:
# ok    — loaded successfully, registry contains >= 1 action
# empty — loaded successfully but the registry is empty (empty YAML)
# error — load failed (file missing, malformed YAML, etc.)
# =============================================================================

# One series per status value; 1 marks the active status, 0 an inactive one.
SOLVER_MCP_REGISTRY_LOADED = Gauge(
    name="aiops_solver_mcp_registry_loaded",
    documentation="MCP registry load status for Solver Agent (1=active status, 0=inactive)",
    # ok / empty / error
    labelnames=("status",),
)
# =============================================================================
# Helper Functions
# =============================================================================
def record_webhook_request(source: str, status: str, latency: float) -> None:
    """Record one webhook request: count it and observe its latency.

    Args:
        source: Value for the ``source`` label on both metrics.
        status: Value for the ``status`` label on the request counter.
        latency: Value observed into the latency histogram, whose metric is
            declared in seconds.
    """
    request_counter = WEBHOOK_REQUESTS_TOTAL.labels(source=source, status=status)
    request_counter.inc()

    latency_histogram = WEBHOOK_LATENCY_HISTOGRAM.labels(source=source)
    latency_histogram.observe(latency)
def record_alert_processed(source: str, severity: str, outcome: str) -> None:
    """Increment the processed-alert counter for one alert.

    Args:
        source: Value for the ``source`` label.
        severity: Value for the ``severity`` label.
        outcome: Value for the ``outcome`` label.
    """
    label_values = {"source": source, "severity": severity, "outcome": outcome}
    ALERT_PROCESSED_TOTAL.labels(**label_values).inc()
def record_telegram_notification(source: str, success: bool) -> None:
    """Increment the Telegram notification counter for one send attempt.

    Args:
        source: Value for the ``source`` label.
        success: Truthy maps to status "success", falsy to "failed".
    """
    if success:
        status = "success"
    else:
        status = "failed"
    notification_counter = TELEGRAM_NOTIFICATIONS_TOTAL.labels(
        source=source,
        status=status,
    )
    notification_counter.inc()
def record_anomaly(
    alert_name: str,
    service: str,
    frequency_24h: int,
    escalation_level: str | None,
) -> None:
    """Record one anomaly and, when present, its escalation.

    Always increments the anomaly counter for (alert_name, service). The
    escalation counter is incremented only when ``escalation_level`` is truthy.

    NOTE(review): ``frequency_24h`` is accepted but not used here, and this
    function does not update ANOMALY_FREQUENCY_GAUGE — confirm whether the
    gauge should be set here and which ``anomaly_key`` format it expects.

    Args:
        alert_name: Value for the ``alert_name`` label.
        service: Value for the ``service`` label.
        frequency_24h: Currently unused (see note above).
        escalation_level: Value for the ``level`` label; None or an empty
            string skips the escalation counter.
    """
    recorded_counter = ANOMALY_RECORDED_TOTAL.labels(
        alert_name=alert_name,
        service=service,
    )
    recorded_counter.inc()

    if not escalation_level:
        return
    ANOMALY_ESCALATION_TOTAL.labels(level=escalation_level).inc()
def record_auto_repair(action: str, tier: int, success: bool) -> None:
    """Increment the auto repair attempt counter for one attempt.

    Args:
        action: Value for the ``action`` label.
        tier: Converted with ``str()`` and used as the ``tier`` label.
        success: Truthy maps to outcome "success", falsy to "failed".
    """
    if success:
        outcome = "success"
    else:
        outcome = "failed"
    tier_label = str(tier)
    attempt_counter = AUTO_REPAIR_ATTEMPTS_TOTAL.labels(
        action=action,
        tier=tier_label,
        outcome=outcome,
    )
    attempt_counter.inc()
def record_alert_chain_success(source: str) -> None:
    """Record a successful alert chain completion for a source.

    Sets the last-success gauge to the current ``time.time()`` value and the
    health gauge to 1.

    Args:
        source: Value for the ``source`` label on both gauges.
    """
    import time

    last_success_gauge = ALERT_CHAIN_LAST_SUCCESS.labels(source=source)
    last_success_gauge.set(time.time())

    health_gauge = ALERT_CHAIN_HEALTHY.labels(source=source)
    health_gauge.set(1)
def record_alert_chain_failure(source: str) -> None:
    """Record an alert chain failure for a source.

    Sets the health gauge to 0; the last-success timestamp is left untouched.

    Args:
        source: Value for the ``source`` label.
    """
    health_gauge = ALERT_CHAIN_HEALTHY.labels(source=source)
    health_gauge.set(0)