feat(monitoring): alert on stale source provider ingestion
All checks were successful
Code Review / ai-code-review (push) Successful in 10s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 25s
CD Pipeline / tests (push) Successful in 3m26s
CD Pipeline / build-and-deploy (push) Successful in 3m38s
CD Pipeline / post-deploy-checks (push) Successful in 1m25s

This commit is contained in:
Your Name
2026-05-20 19:19:21 +08:00
parent 4a9d76d29e
commit ae9d0b7385
4 changed files with 63 additions and 1 deletions

View File

@@ -98,6 +98,25 @@ spec:
summary: "Alertmanager 主鏈路 2 小時內未收到告警"
description: "Alertmanager 是固定主鏈路Sentry/SignOz 沉默不代表鏈路故障,錯誤率另有 AlertChainBroken_* 規則監控。請執行 Smoke Test: python scripts/alert_chain_smoke_test.py"
# -----------------------------------------------------------------
# External provider source-ingestion freshness
# -----------------------------------------------------------------
# SourceProviderIngestionStale: warning-severity alert on stale provider ingestion.
# For each `source` label value matching sentry or signoz, the expression takes
# the newest awoooi_alert_chain_last_success_timestamp and compares its age
# (time() minus that timestamp) against 86400 seconds (24 hours). The condition
# must hold for 15m before the alert fires.
# The annotations tell the responder that this is a provider-ingestion freshness
# gap rather than a failure of the Alertmanager main chain.
# NOTE(review): when no series exists for a source, the comparison yields no
# sample and the alert stays silent — confirm whether an absent() guard is wanted.
- alert: SourceProviderIngestionStale
expr: |
time() - max by (source) (
awoooi_alert_chain_last_success_timestamp{source=~"sentry|signoz"}
) > 86400
for: 15m
labels:
severity: warning
service: alert-chain
component: source-ingestion
team: platform
annotations:
summary: "{{ $labels.source }} source ingestion 超過 24 小時未更新"
description: "{{ $labels.source }} webhook endpoint 可能仍健康,但 AwoooP source dossier 已超過 24 小時沒有新事件。這是 provider ingestion / upstream smoke / correlation freshness 缺口,不是 Alertmanager 主鏈路故障。"
runbook_url: "https://awoooi.internal/runbooks/alert-chain"
# -----------------------------------------------------------------
# 告警鏈路健康狀態
# -----------------------------------------------------------------

View File

@@ -524,6 +524,24 @@ groups:
summary: "Alertmanager 主鏈路 2 小時內未收到告警"
description: "Alertmanager 是固定主鏈路Sentry/SignOz 沉默不代表鏈路故障,錯誤率另有 AlertChainBroken_* 規則監控。請執行 Smoke Test"
# SourceProviderIngestionStale: warning-severity alert on stale provider ingestion.
# For each `source` label value matching sentry or signoz, the expression takes
# the newest awoooi_alert_chain_last_success_timestamp and compares its age
# (time() minus that timestamp) against 86400 seconds (24 hours). The condition
# must hold for 15m before the alert fires.
# The `runbook` annotation is free-text triage guidance (which health and
# coverage endpoints to query for the affected source), not a URL.
# NOTE(review): when no series exists for a source, the comparison yields no
# sample and the alert stays silent — confirm whether an absent() guard is wanted.
- alert: SourceProviderIngestionStale
expr: |
time() - max by (source) (
awoooi_alert_chain_last_success_timestamp{source=~"sentry|signoz"}
) > 86400
for: 15m
labels:
severity: warning
layer: k8s
component: source-ingestion
team: platform
auto_repair: "false"
alert_category: "alertchain_provider_freshness"
annotations:
summary: "{{ $labels.source }} source ingestion 超過 24 小時未更新"
description: "{{ $labels.source }} webhook endpoint 可能仍健康,但 AwoooP source dossier 已超過 24 小時沒有新事件。這是 provider ingestion / upstream smoke / correlation freshness 缺口,不是 Alertmanager 主鏈路故障。"
runbook: "先查 /api/v1/webhooks/{{ $labels.source }}/health再查 /api/v1/platform/events/dossier/coverage?provider={{ $labels.source }};若 endpoint OK 但 latest stale檢查上游 Sentry/SignOz notification channel 或排程 smoke。"
- alert: AlertChainUnhealthy
expr: awoooi_alert_chain_healthy == 0
for: 5m

View File

@@ -524,6 +524,24 @@ groups:
summary: "Alertmanager 主鏈路 2 小時內未收到告警"
description: "Alertmanager 是固定主鏈路Sentry/SignOz 沉默不代表鏈路故障,錯誤率另有 AlertChainBroken_* 規則監控。請執行 Smoke Test"
# SourceProviderIngestionStale: warning-severity alert on stale provider ingestion.
# For each `source` label value matching sentry or signoz, the expression takes
# the newest awoooi_alert_chain_last_success_timestamp and compares its age
# (time() minus that timestamp) against 86400 seconds (24 hours). The condition
# must hold for 15m before the alert fires.
# The `runbook` annotation is free-text triage guidance (which health and
# coverage endpoints to query for the affected source), not a URL.
# NOTE(review): when no series exists for a source, the comparison yields no
# sample and the alert stays silent — confirm whether an absent() guard is wanted.
- alert: SourceProviderIngestionStale
expr: |
time() - max by (source) (
awoooi_alert_chain_last_success_timestamp{source=~"sentry|signoz"}
) > 86400
for: 15m
labels:
severity: warning
layer: k8s
component: source-ingestion
team: platform
auto_repair: "false"
alert_category: "alertchain_provider_freshness"
annotations:
summary: "{{ $labels.source }} source ingestion 超過 24 小時未更新"
description: "{{ $labels.source }} webhook endpoint 可能仍健康,但 AwoooP source dossier 已超過 24 小時沒有新事件。這是 provider ingestion / upstream smoke / correlation freshness 缺口,不是 Alertmanager 主鏈路故障。"
runbook: "先查 /api/v1/webhooks/{{ $labels.source }}/health再查 /api/v1/platform/events/dossier/coverage?provider={{ $labels.source }};若 endpoint OK 但 latest stale檢查上游 Sentry/SignOz notification channel 或排程 smoke。"
- alert: AlertChainUnhealthy
expr: awoooi_alert_chain_healthy == 0
for: 5m

View File

@@ -110,8 +110,15 @@ if [[ "$NO_ALERTS_QUERY" != *'source="alertmanager"'* ]]; then
fi
log "✅ NoAlertsReceived2Hours query 已限制 alertmanager 主鏈路"
# Read the deployed SourceProviderIngestionStale expression back from the
# Prometheus rules API. curl runs on the target host over SSH; the JSON is
# parsed locally and yields an empty string when no rule of that name is loaded.
SOURCE_PROVIDER_STALE_QUERY=$(
  ssh wooo@${TARGET_HOST} "curl -s ${PROMETHEUS_URL}/api/v1/rules" \
    | python3 -c "import sys,json; r=json.load(sys.stdin); print(next((x.get('query','') for g in r['data']['groups'] for x in g['rules'] if x.get('name') == 'SourceProviderIngestionStale'), ''))"
)
# The deployed query must still carry the sentry|signoz source matcher;
# anything else (including an empty result) aborts the deployment check.
case "$SOURCE_PROVIDER_STALE_QUERY" in
  *'source=~"sentry|signoz"'*)
    log "✅ SourceProviderIngestionStale query 已限制 Sentry/SignOz provider freshness"
    ;;
  *)
    echo "ERROR: SourceProviderIngestionStale query 未限制 Sentry/SignOz provider freshness: ${SOURCE_PROVIDER_STALE_QUERY}"
    exit 1
    ;;
esac
# 驗證關鍵規則存在
KEY_RULES=("SentryDown" "HarborDown" "GiteaDown" "OpenClawDown" "AlertmanagerDown" "AlertChainUnhealthy")
KEY_RULES=("SentryDown" "HarborDown" "GiteaDown" "OpenClawDown" "AlertmanagerDown" "AlertChainUnhealthy" "SourceProviderIngestionStale")
for rule in "${KEY_RULES[@]}"; do
EXISTS=$(ssh wooo@${TARGET_HOST} "curl -s ${PROMETHEUS_URL}/api/v1/rules | python3 -c \"import sys,json; r=json.load(sys.stdin); names=[x['name'] for g in r['data']['groups'] for x in g['rules']]; print('OK' if '$rule' in names else 'MISSING')\"")
if [ "$EXISTS" = "OK" ]; then