# Patch: adds the SourceProviderIngestionStale alert rule to
# ops/monitoring/alerts-unified.yml, inserted just before AlertChainUnhealthy.
#
# Rule summary:
#   - Per `source` (sentry, signoz), subtracts the newest
#     awoooi_alert_chain_last_success_timestamp sample from time(); the
#     condition holds once the gap exceeds 86400 seconds and must persist
#     for 15m before the alert fires.
#   - Labelled severity=warning with auto_repair "false".
#
# NOTE(review): the expression only covers sources that currently expose the
# timestamp series; presumably a source whose series is missing produces no
# alert here -- confirm whether a companion absent() rule is wanted.
# NOTE(review): leading whitespace inside the hunk was reconstructed from the
# usual groups -> rules -> alert nesting; verify against the target file
# before applying.
diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml
index 66294877..59895c58 100644
--- a/ops/monitoring/alerts-unified.yml
+++ b/ops/monitoring/alerts-unified.yml
@@ -651,6 +651,24 @@ groups:
           summary: "2 小時內未收到任何告警 ({{ $labels.source }})"
           description: "可能是告警鏈路問題,請執行 Smoke Test"
 
+      - alert: SourceProviderIngestionStale
+        expr: |
+          time() - max by (source) (
+            awoooi_alert_chain_last_success_timestamp{source=~"sentry|signoz"}
+          ) > 86400
+        for: 15m
+        labels:
+          severity: warning
+          layer: k8s
+          component: source-ingestion
+          team: platform
+          auto_repair: "false"
+          alert_category: "alertchain_provider_freshness"
+        annotations:
+          summary: "{{ $labels.source }} source ingestion 超過 24 小時未更新"
+          description: "{{ $labels.source }} webhook endpoint 可能仍健康,但 AwoooP source dossier 已超過 24 小時沒有新事件。這是 provider ingestion / upstream smoke / correlation freshness 缺口,不是 Alertmanager 主鏈路故障。"
+          runbook: "先查 /api/v1/webhooks/{{ $labels.source }}/health,再查 /api/v1/platform/events/dossier/coverage?provider={{ $labels.source }};若 endpoint OK 但 latest stale,檢查上游 Sentry/SignOz notification channel 或排程 smoke。"
+
       - alert: AlertChainUnhealthy
         expr: awoooi_alert_chain_healthy == 0
         for: 5m