From ae9d0b7385463d58a367b61bf08ed75bb3000f7f Mon Sep 17 00:00:00 2001
From: Your Name
Date: Wed, 20 May 2026 19:19:21 +0800
Subject: [PATCH] feat(monitoring): alert on stale source provider ingestion

---
NOTE(review): line breaks and indentation in this patch were reconstructed
from a whitespace-collapsed copy. Hunk line counts match the @@ headers and
the diffstat, but leading whitespace on YAML/shell lines is a best guess.
Verify it against the target files (or regenerate the patch from the commit)
before applying; `git apply --ignore-whitespace` only relaxes context lines.

 k8s/monitoring/alert-chain-monitor.yaml | 19 +++++++++++++++++++
 ops/monitoring/alerts-unified.yml       | 18 ++++++++++++++++++
 ops/monitoring/alerts.yml               | 18 ++++++++++++++++++
 scripts/ops/deploy-alerts.sh            |  9 ++++++++-
 4 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/k8s/monitoring/alert-chain-monitor.yaml b/k8s/monitoring/alert-chain-monitor.yaml
index 06f81c3c..70728fe5 100644
--- a/k8s/monitoring/alert-chain-monitor.yaml
+++ b/k8s/monitoring/alert-chain-monitor.yaml
@@ -98,6 +98,25 @@ spec:
             summary: "Alertmanager 主鏈路 2 小時內未收到告警"
             description: "Alertmanager 是固定主鏈路;Sentry/SignOz 沉默不代表鏈路故障,錯誤率另有 AlertChainBroken_* 規則監控。請執行 Smoke Test: python scripts/alert_chain_smoke_test.py"
 
+        # -----------------------------------------------------------------
+        # 外部 provider source ingestion 新鮮度
+        # -----------------------------------------------------------------
+        - alert: SourceProviderIngestionStale
+          expr: |
+            time() - max by (source) (
+              awoooi_alert_chain_last_success_timestamp{source=~"sentry|signoz"}
+            ) > 86400
+          for: 15m
+          labels:
+            severity: warning
+            service: alert-chain
+            component: source-ingestion
+            team: platform
+          annotations:
+            summary: "{{ $labels.source }} source ingestion 超過 24 小時未更新"
+            description: "{{ $labels.source }} webhook endpoint 可能仍健康,但 AwoooP source dossier 已超過 24 小時沒有新事件。這是 provider ingestion / upstream smoke / correlation freshness 缺口,不是 Alertmanager 主鏈路故障。"
+            runbook_url: "https://awoooi.internal/runbooks/alert-chain"
+
         # -----------------------------------------------------------------
         # 告警鏈路健康狀態
         # -----------------------------------------------------------------
diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml
index 22c7f7a6..56163c6e 100644
--- a/ops/monitoring/alerts-unified.yml
+++ b/ops/monitoring/alerts-unified.yml
@@ -524,6 +524,24 @@ groups:
           summary: "Alertmanager 主鏈路 2 小時內未收到告警"
           description: "Alertmanager 是固定主鏈路;Sentry/SignOz 沉默不代表鏈路故障,錯誤率另有 AlertChainBroken_* 規則監控。請執行 Smoke Test"
 
+      - alert: SourceProviderIngestionStale
+        expr: |
+          time() - max by (source) (
+            awoooi_alert_chain_last_success_timestamp{source=~"sentry|signoz"}
+          ) > 86400
+        for: 15m
+        labels:
+          severity: warning
+          layer: k8s
+          component: source-ingestion
+          team: platform
+          auto_repair: "false"
+          alert_category: "alertchain_provider_freshness"
+        annotations:
+          summary: "{{ $labels.source }} source ingestion 超過 24 小時未更新"
+          description: "{{ $labels.source }} webhook endpoint 可能仍健康,但 AwoooP source dossier 已超過 24 小時沒有新事件。這是 provider ingestion / upstream smoke / correlation freshness 缺口,不是 Alertmanager 主鏈路故障。"
+          runbook: "先查 /api/v1/webhooks/{{ $labels.source }}/health,再查 /api/v1/platform/events/dossier/coverage?provider={{ $labels.source }};若 endpoint OK 但 latest stale,檢查上游 Sentry/SignOz notification channel 或排程 smoke。"
+
       - alert: AlertChainUnhealthy
         expr: awoooi_alert_chain_healthy == 0
         for: 5m
diff --git a/ops/monitoring/alerts.yml b/ops/monitoring/alerts.yml
index a6b0611a..8a004f98 100644
--- a/ops/monitoring/alerts.yml
+++ b/ops/monitoring/alerts.yml
@@ -524,6 +524,24 @@ groups:
           summary: "Alertmanager 主鏈路 2 小時內未收到告警"
           description: "Alertmanager 是固定主鏈路;Sentry/SignOz 沉默不代表鏈路故障,錯誤率另有 AlertChainBroken_* 規則監控。請執行 Smoke Test"
 
+      - alert: SourceProviderIngestionStale
+        expr: |
+          time() - max by (source) (
+            awoooi_alert_chain_last_success_timestamp{source=~"sentry|signoz"}
+          ) > 86400
+        for: 15m
+        labels:
+          severity: warning
+          layer: k8s
+          component: source-ingestion
+          team: platform
+          auto_repair: "false"
+          alert_category: "alertchain_provider_freshness"
+        annotations:
+          summary: "{{ $labels.source }} source ingestion 超過 24 小時未更新"
+          description: "{{ $labels.source }} webhook endpoint 可能仍健康,但 AwoooP source dossier 已超過 24 小時沒有新事件。這是 provider ingestion / upstream smoke / correlation freshness 缺口,不是 Alertmanager 主鏈路故障。"
+          runbook: "先查 /api/v1/webhooks/{{ $labels.source }}/health,再查 /api/v1/platform/events/dossier/coverage?provider={{ $labels.source }};若 endpoint OK 但 latest stale,檢查上游 Sentry/SignOz notification channel 或排程 smoke。"
+
       - alert: AlertChainUnhealthy
         expr: awoooi_alert_chain_healthy == 0
         for: 5m
diff --git a/scripts/ops/deploy-alerts.sh b/scripts/ops/deploy-alerts.sh
index 051b2665..8a593d46 100755
--- a/scripts/ops/deploy-alerts.sh
+++ b/scripts/ops/deploy-alerts.sh
@@ -110,8 +110,15 @@ if [[ "$NO_ALERTS_QUERY" != *'source="alertmanager"'* ]]; then
 fi
 log "✅ NoAlertsReceived2Hours query 已限制 alertmanager 主鏈路"
 
+SOURCE_PROVIDER_STALE_QUERY=$(ssh wooo@${TARGET_HOST} "curl -s ${PROMETHEUS_URL}/api/v1/rules" | python3 -c "import sys,json; r=json.load(sys.stdin); print(next((x.get('query','') for g in r['data']['groups'] for x in g['rules'] if x.get('name') == 'SourceProviderIngestionStale'), ''))")
+if [[ "$SOURCE_PROVIDER_STALE_QUERY" != *'source=~"sentry|signoz"'* ]]; then
+    echo "ERROR: SourceProviderIngestionStale query 未限制 Sentry/SignOz provider freshness: ${SOURCE_PROVIDER_STALE_QUERY}"
+    exit 1
+fi
+log "✅ SourceProviderIngestionStale query 已限制 Sentry/SignOz provider freshness"
+
 # 驗證關鍵規則存在
-KEY_RULES=("SentryDown" "HarborDown" "GiteaDown" "OpenClawDown" "AlertmanagerDown" "AlertChainUnhealthy")
+KEY_RULES=("SentryDown" "HarborDown" "GiteaDown" "OpenClawDown" "AlertmanagerDown" "AlertChainUnhealthy" "SourceProviderIngestionStale")
 for rule in "${KEY_RULES[@]}"; do
     EXISTS=$(ssh wooo@${TARGET_HOST} "curl -s ${PROMETHEUS_URL}/api/v1/rules | python3 -c \"import sys,json; r=json.load(sys.stdin); names=[x['name'] for g in r['data']['groups'] for x in g['rules']]; print('OK' if '$rule' in names else 'MISSING')\"")
     if [ "$EXISTS" = "OK" ]; then