feat(monitoring): alert on stale source provider ingestion
All checks were successful
Code Review / ai-code-review (push) Successful in 10s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 25s
CD Pipeline / tests (push) Successful in 3m26s
CD Pipeline / build-and-deploy (push) Successful in 3m38s
CD Pipeline / post-deploy-checks (push) Successful in 1m25s
All checks were successful
Code Review / ai-code-review (push) Successful in 10s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 25s
CD Pipeline / tests (push) Successful in 3m26s
CD Pipeline / build-and-deploy (push) Successful in 3m38s
CD Pipeline / post-deploy-checks (push) Successful in 1m25s
This commit is contained in:
@@ -98,6 +98,25 @@ spec:
|
||||
summary: "Alertmanager 主鏈路 2 小時內未收到告警"
|
||||
description: "Alertmanager 是固定主鏈路;Sentry/SignOz 沉默不代表鏈路故障,錯誤率另有 AlertChainBroken_* 規則監控。請執行 Smoke Test: python scripts/alert_chain_smoke_test.py"
|
||||
|
||||
# -----------------------------------------------------------------
|
||||
# 外部 provider source ingestion 新鮮度
|
||||
# -----------------------------------------------------------------
|
||||
- alert: SourceProviderIngestionStale
|
||||
expr: |
|
||||
time() - max by (source) (
|
||||
awoooi_alert_chain_last_success_timestamp{source=~"sentry|signoz"}
|
||||
) > 86400
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
service: alert-chain
|
||||
component: source-ingestion
|
||||
team: platform
|
||||
annotations:
|
||||
summary: "{{ $labels.source }} source ingestion 超過 24 小時未更新"
|
||||
description: "{{ $labels.source }} webhook endpoint 可能仍健康,但 AwoooP source dossier 已超過 24 小時沒有新事件。這是 provider ingestion / upstream smoke / correlation freshness 缺口,不是 Alertmanager 主鏈路故障。"
|
||||
runbook_url: "https://awoooi.internal/runbooks/alert-chain"
|
||||
|
||||
# -----------------------------------------------------------------
|
||||
# 告警鏈路健康狀態
|
||||
# -----------------------------------------------------------------
|
||||
|
||||
@@ -524,6 +524,24 @@ groups:
|
||||
summary: "Alertmanager 主鏈路 2 小時內未收到告警"
|
||||
description: "Alertmanager 是固定主鏈路;Sentry/SignOz 沉默不代表鏈路故障,錯誤率另有 AlertChainBroken_* 規則監控。請執行 Smoke Test"
|
||||
|
||||
- alert: SourceProviderIngestionStale
|
||||
expr: |
|
||||
time() - max by (source) (
|
||||
awoooi_alert_chain_last_success_timestamp{source=~"sentry|signoz"}
|
||||
) > 86400
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: k8s
|
||||
component: source-ingestion
|
||||
team: platform
|
||||
auto_repair: "false"
|
||||
alert_category: "alertchain_provider_freshness"
|
||||
annotations:
|
||||
summary: "{{ $labels.source }} source ingestion 超過 24 小時未更新"
|
||||
description: "{{ $labels.source }} webhook endpoint 可能仍健康,但 AwoooP source dossier 已超過 24 小時沒有新事件。這是 provider ingestion / upstream smoke / correlation freshness 缺口,不是 Alertmanager 主鏈路故障。"
|
||||
runbook: "先查 /api/v1/webhooks/{{ $labels.source }}/health,再查 /api/v1/platform/events/dossier/coverage?provider={{ $labels.source }};若 endpoint OK 但 latest stale,檢查上游 Sentry/SignOz notification channel 或排程 smoke。"
|
||||
|
||||
- alert: AlertChainUnhealthy
|
||||
expr: awoooi_alert_chain_healthy == 0
|
||||
for: 5m
|
||||
|
||||
@@ -524,6 +524,24 @@ groups:
|
||||
summary: "Alertmanager 主鏈路 2 小時內未收到告警"
|
||||
description: "Alertmanager 是固定主鏈路;Sentry/SignOz 沉默不代表鏈路故障,錯誤率另有 AlertChainBroken_* 規則監控。請執行 Smoke Test"
|
||||
|
||||
- alert: SourceProviderIngestionStale
|
||||
expr: |
|
||||
time() - max by (source) (
|
||||
awoooi_alert_chain_last_success_timestamp{source=~"sentry|signoz"}
|
||||
) > 86400
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
layer: k8s
|
||||
component: source-ingestion
|
||||
team: platform
|
||||
auto_repair: "false"
|
||||
alert_category: "alertchain_provider_freshness"
|
||||
annotations:
|
||||
summary: "{{ $labels.source }} source ingestion 超過 24 小時未更新"
|
||||
description: "{{ $labels.source }} webhook endpoint 可能仍健康,但 AwoooP source dossier 已超過 24 小時沒有新事件。這是 provider ingestion / upstream smoke / correlation freshness 缺口,不是 Alertmanager 主鏈路故障。"
|
||||
runbook: "先查 /api/v1/webhooks/{{ $labels.source }}/health,再查 /api/v1/platform/events/dossier/coverage?provider={{ $labels.source }};若 endpoint OK 但 latest stale,檢查上游 Sentry/SignOz notification channel 或排程 smoke。"
|
||||
|
||||
- alert: AlertChainUnhealthy
|
||||
expr: awoooi_alert_chain_healthy == 0
|
||||
for: 5m
|
||||
|
||||
@@ -110,8 +110,15 @@ if [[ "$NO_ALERTS_QUERY" != *'source="alertmanager"'* ]]; then
|
||||
fi
|
||||
log "✅ NoAlertsReceived2Hours query 已限制 alertmanager 主鏈路"
|
||||
|
||||
SOURCE_PROVIDER_STALE_QUERY=$(ssh wooo@${TARGET_HOST} "curl -s ${PROMETHEUS_URL}/api/v1/rules" | python3 -c "import sys,json; r=json.load(sys.stdin); print(next((x.get('query','') for g in r['data']['groups'] for x in g['rules'] if x.get('name') == 'SourceProviderIngestionStale'), ''))")
|
||||
if [[ "$SOURCE_PROVIDER_STALE_QUERY" != *'source=~"sentry|signoz"'* ]]; then
|
||||
echo "ERROR: SourceProviderIngestionStale query 未限制 Sentry/SignOz provider freshness: ${SOURCE_PROVIDER_STALE_QUERY}"
|
||||
exit 1
|
||||
fi
|
||||
log "✅ SourceProviderIngestionStale query 已限制 Sentry/SignOz provider freshness"
|
||||
|
||||
# 驗證關鍵規則存在
|
||||
-KEY_RULES=("SentryDown" "HarborDown" "GiteaDown" "OpenClawDown" "AlertmanagerDown" "AlertChainUnhealthy")
|
||||
+KEY_RULES=("SentryDown" "HarborDown" "GiteaDown" "OpenClawDown" "AlertmanagerDown" "AlertChainUnhealthy" "SourceProviderIngestionStale")
|
||||
for rule in "${KEY_RULES[@]}"; do
|
||||
EXISTS=$(ssh wooo@${TARGET_HOST} "curl -s ${PROMETHEUS_URL}/api/v1/rules | python3 -c \"import sys,json; r=json.load(sys.stdin); names=[x['name'] for g in r['data']['groups'] for x in g['rules']]; print('OK' if '$rule' in names else 'MISSING')\"")
|
||||
if [ "$EXISTS" = "OK" ]; then
|
||||
|
||||
Reference in New Issue
Block a user