Files
awoooi/scripts/ops/deploy-alerts.sh
Your Name d6d2719e02
Some checks failed
Code Review / ai-code-review (push) Has been cancelled
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 29s
fix(alerts): deploy drift guard with canonical rules
2026-05-29 11:14:12 +08:00

165 lines
7.3 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# scripts/ops/deploy-alerts.sh
# Deploy the unified alert rules to the Prometheus instance on host 110.
# 2026-04-05 Claude Code: Sprint 1 automated deployment
# Usage: bash scripts/ops/deploy-alerts.sh [--dry-run]
# Abort on the first failing command, and treat a failure in any pipeline
# stage as a failure of the whole pipeline.
set -eo pipefail
# Local artifacts to ship. The paths are relative, so they resolve against the
# directory the script is invoked from.
ALERT_RULES_FILE="ops/monitoring/alerts-unified.yml"
SLO_RULES_FILE="ops/monitoring/slo-rules.yml"
DRIFT_GUARD_SCRIPT="scripts/ops/prometheus-rule-drift-guard.sh"
# Remote host and destination paths. The alert rules file is copied to both
# TARGET_ALERTS_PATH and TARGET_ALERTS_CANONICAL_PATH.
TARGET_HOST="192.168.0.110"
TARGET_ALERTS_PATH="/home/wooo/monitoring/alerts.yml"
TARGET_ALERTS_CANONICAL_PATH="/home/wooo/monitoring/alerts-unified.canonical.yml"
TARGET_SLO_PATH="/home/wooo/monitoring/slo-rules.yml"
TARGET_DRIFT_GUARD_PATH="/home/wooo/scripts/prometheus-rule-drift-guard.sh"
# Base URL of the Prometheus HTTP API; it is queried via curl run on TARGET_HOST.
PROMETHEUS_URL="http://${TARGET_HOST}:9090"
# First positional argument; "--dry-run" switches the script to report-only mode.
DRY_RUN="${1:-}"
# Print a message to stdout prefixed with the current wall-clock time.
# Arguments: all arguments are joined with spaces to form the message.
# Outputs:   "[HH:MM:SS] <message>" on stdout.
log() {
  local stamp
  stamp="$(date '+%H:%M:%S')"
  echo "[${stamp}] $*"
}
# Print the SHA-256 hex digest of a local file.
# Arguments: $1 - path of the file to hash.
# Outputs:   the digest alone (no file name) on stdout.
# Uses sha256sum when it is installed and falls back to `shasum -a 256`
# otherwise.
file_sha() {
  local target="$1"
  local -a hasher=(shasum -a 256)
  if command -v sha256sum >/dev/null 2>&1; then
    hasher=(sha256sum)
  fi
  # Both tools print "<digest>  <name>"; keep only the first column.
  "${hasher[@]}" "$target" | awk '{print $1}'
}
# Print the SHA-256 hex digest of a file that lives on TARGET_HOST.
# Globals:   TARGET_HOST (read)
# Arguments: $1 - path of the file on the remote host.
# Outputs:   the digest alone on stdout.
# The hashing command runs remotely over ssh and prefers sha256sum, falling
# back to `shasum -a 256` when sha256sum is not installed there.
remote_file_sha() {
  local remote_path="$1"
  local hash_cmd
  # "\$1" stays escaped so that the remote awk receives its own $1 field
  # reference; the path itself is expanded locally into the command text.
  hash_cmd="if command -v sha256sum >/dev/null 2>&1; then"
  hash_cmd+=" sha256sum '${remote_path}' | awk '{print \$1}';"
  hash_cmd+=" else"
  hash_cmd+=" shasum -a 256 '${remote_path}' | awk '{print \$1}';"
  hash_cmd+=" fi"
  ssh "wooo@${TARGET_HOST}" "$hash_cmd"
}
# Make sure every local artifact exists before anything touches the remote host.
for file in "$ALERT_RULES_FILE" "$SLO_RULES_FILE" "$DRIFT_GUARD_SCRIPT"; do
  if [ ! -f "$file" ]; then
    echo "ERROR: $file not found"
    exit 1
  fi
done
# Validate YAML syntax with python3 (PyYAML), falling back to ruby.
# The path is handed to the interpreter as an argument instead of being
# spliced into the program text, so a quote or backslash in the path cannot
# break or alter the snippet. Parser stderr is silenced on purpose: a failing
# python3 attempt is expected whenever only ruby is available.
for file in "$ALERT_RULES_FILE" "$SLO_RULES_FILE"; do
  if python3 -c 'import sys, yaml; yaml.safe_load(open(sys.argv[1]))' "$file" 2>/dev/null; then
    :
  elif ruby -e 'require "yaml"; YAML.load_file(ARGV[0])' "$file" 2>/dev/null; then
    :
  else
    echo "ERROR: YAML syntax error or no YAML parser available: $file"
    exit 1
  fi
done
log "✅ YAML 語法驗證通過"
# Dry-run mode: report what would be deployed, print rule counts, and stop
# before any remote command is issued.

# Log the planned copies plus the number of alert / recording rules found in
# the local files.
# Globals: ALERT_RULES_FILE, SLO_RULES_FILE, DRIFT_GUARD_SCRIPT, TARGET_HOST,
#          TARGET_ALERTS_PATH, TARGET_ALERTS_CANONICAL_PATH, TARGET_SLO_PATH,
#          TARGET_DRIFT_GUARD_PATH (all read)
# Outputs: log lines on stdout.
print_dry_run_plan() {
  local alert_count slo_record_count slo_alert_count
  log "DRY RUN: would deploy $ALERT_RULES_FILE to ${TARGET_HOST}:${TARGET_ALERTS_PATH}"
  log "DRY RUN: would deploy $ALERT_RULES_FILE to ${TARGET_HOST}:${TARGET_ALERTS_CANONICAL_PATH}"
  log "DRY RUN: would deploy $SLO_RULES_FILE to ${TARGET_HOST}:${TARGET_SLO_PATH}"
  log "DRY RUN: would deploy $DRIFT_GUARD_SCRIPT to ${TARGET_HOST}:${TARGET_DRIFT_GUARD_PATH}"
  # grep -c prints 0 but exits 1 when nothing matches; "|| true" keeps a
  # legitimate zero count from aborting the script under `set -e`.
  alert_count=$(grep -c "alert:" "$ALERT_RULES_FILE" || true)
  slo_record_count=$(grep -c "record:" "$SLO_RULES_FILE" || true)
  slo_alert_count=$(grep -c "alert:" "$SLO_RULES_FILE" || true)
  # Braces are required: without them "$ALERT_COUNTSLO"-style run-ons expand
  # an unrelated (unset) variable and the counts vanish from the message.
  log "告警規則數量: ${alert_count}, SLO recording: ${slo_record_count}, SLO alerts: ${slo_alert_count}"
}

if [ "$DRY_RUN" = "--dry-run" ]; then
  print_dry_run_plan
  exit 0
fi
# Back up the currently deployed files on the target host.
# Each file is copied next to itself with a ".bak.<timestamp>" suffix; the
# timestamp is produced by `date` on the remote side. A failed copy is
# tolerated via "2>/dev/null || true".
backup_cmd=""
for remote_path in \
  "$TARGET_ALERTS_PATH" \
  "$TARGET_ALERTS_CANONICAL_PATH" \
  "$TARGET_SLO_PATH" \
  "$TARGET_DRIFT_GUARD_PATH"; do
  # "\$(date ...)" is escaped so it is evaluated by the remote shell.
  backup_cmd+="cp ${remote_path} ${remote_path}.bak.\$(date +%Y%m%d%H%M%S) 2>/dev/null || true; "
done
# Drop the trailing separator and run all copies in a single ssh session.
ssh "wooo@${TARGET_HOST}" "${backup_cmd%; }"
log "✅ 現有規則已備份"
# Copy the new rule files and the drift-guard script to the target host.

# Copy one local file to a path on TARGET_HOST.
# Arguments: $1 - local source path, $2 - remote destination path.
push_to_target() {
  scp "$1" "wooo@${TARGET_HOST}:$2"
}

# The alert rules file is written to two destinations: the live rules path and
# the canonical copy.
push_to_target "$ALERT_RULES_FILE" "$TARGET_ALERTS_PATH"
push_to_target "$ALERT_RULES_FILE" "$TARGET_ALERTS_CANONICAL_PATH"
push_to_target "$SLO_RULES_FILE" "$TARGET_SLO_PATH"
push_to_target "$DRIFT_GUARD_SCRIPT" "$TARGET_DRIFT_GUARD_PATH"
# Make the drift-guard script executable on the remote side.
ssh "wooo@${TARGET_HOST}" "chmod 0755 ${TARGET_DRIFT_GUARD_PATH}"
log "✅ 規則已複製到 ${TARGET_HOST}"
# Verify that every file on the target host is byte-identical to its local
# source by comparing SHA-256 digests.

# Abort the script when a remote digest differs from the local one.
# Arguments: $1 - file label used in the error message,
#            $2 - local digest, $3 - remote digest.
require_matching_sha() {
  local label="$1" local_sha="$2" remote_sha="$3"
  [ "$local_sha" != "$remote_sha" ] || return 0
  echo "ERROR: 遠端 ${label} hash 不一致 local=${local_sha} remote=${remote_sha}"
  exit 1
}

# Local digests.
LOCAL_ALERTS_SHA="$(file_sha "$ALERT_RULES_FILE")"
LOCAL_SLO_SHA="$(file_sha "$SLO_RULES_FILE")"
LOCAL_DRIFT_GUARD_SHA="$(file_sha "$DRIFT_GUARD_SCRIPT")"
# Remote digests.
REMOTE_ALERTS_SHA="$(remote_file_sha "$TARGET_ALERTS_PATH")"
REMOTE_ALERTS_CANONICAL_SHA="$(remote_file_sha "$TARGET_ALERTS_CANONICAL_PATH")"
REMOTE_SLO_SHA="$(remote_file_sha "$TARGET_SLO_PATH")"
REMOTE_DRIFT_GUARD_SHA="$(remote_file_sha "$TARGET_DRIFT_GUARD_PATH")"

# Both remote copies of the alert rules are checked against the same local file.
require_matching_sha "alerts.yml" "$LOCAL_ALERTS_SHA" "$REMOTE_ALERTS_SHA"
require_matching_sha "alerts-unified.canonical.yml" "$LOCAL_ALERTS_SHA" "$REMOTE_ALERTS_CANONICAL_SHA"
require_matching_sha "slo-rules.yml" "$LOCAL_SLO_SHA" "$REMOTE_SLO_SHA"
require_matching_sha "prometheus-rule-drift-guard.sh" "$LOCAL_DRIFT_GUARD_SHA" "$REMOTE_DRIFT_GUARD_SHA"
log "✅ 遠端規則 hash 驗證通過"
# Reload Prometheus so it picks up the freshly copied rule files.
# --fail makes curl exit non-zero on an HTTP error response, so a rejected
# reload stops the deployment here (via `set -e`) instead of being silently
# ignored; -S keeps curl's error message visible despite -s.
ssh wooo@${TARGET_HOST} "curl -fsS -X POST ${PROMETHEUS_URL}/-/reload"
# Give Prometheus a moment to finish loading before querying the rules API.
sleep 3
# Count the rules Prometheus reports as loaded (summed over all groups).
RULE_COUNT=$(ssh wooo@${TARGET_HOST} "curl -s ${PROMETHEUS_URL}/api/v1/rules | python3 -c \"import sys,json; r=json.load(sys.stdin); print(sum(len(g['rules']) for g in r['data']['groups']))\"")
log "Prometheus 已載入 ${RULE_COUNT} 條規則"
# Guard against a non-numeric value first: `[ "$x" -lt 25 ]` errors out on
# such input, and inside an `if` that error is treated as "false", which would
# skip the sanity check entirely.
if ! [[ "$RULE_COUNT" =~ ^[0-9]+$ ]]; then
  echo "ERROR: could not parse rule count from Prometheus: '${RULE_COUNT}'"
  exit 1
fi
if [ "$RULE_COUNT" -lt 25 ]; then
  echo "ERROR: 規則數量異常 ($RULE_COUNT < 25),請檢查"
  exit 1
fi
# Check that two specific rules were loaded with the expected label filters in
# their query expressions.

# Print the query expression of a loaded rule, or an empty line when no rule
# with that name is loaded.
# Arguments: $1 - rule name to look up.
# The rules JSON is fetched by curl on TARGET_HOST and parsed by a local python3.
loaded_rule_query() {
  ssh "wooo@${TARGET_HOST}" "curl -s ${PROMETHEUS_URL}/api/v1/rules" |
    python3 -c "import sys,json; r=json.load(sys.stdin); print(next((x.get('query','') for g in r['data']['groups'] for x in g['rules'] if x.get('name') == sys.argv[1]), ''))" "$1"
}

NO_ALERTS_QUERY=$(loaded_rule_query "NoAlertsReceived2Hours")
case "$NO_ALERTS_QUERY" in
  *'source="alertmanager"'*)
    ;;
  *)
    echo "ERROR: NoAlertsReceived2Hours query 未限制 alertmanager 主鏈路: ${NO_ALERTS_QUERY}"
    exit 1
    ;;
esac
log "✅ NoAlertsReceived2Hours query 已限制 alertmanager 主鏈路"

SOURCE_PROVIDER_STALE_QUERY=$(loaded_rule_query "SourceProviderIngestionStale")
case "$SOURCE_PROVIDER_STALE_QUERY" in
  *'source=~"sentry|signoz"'*)
    ;;
  *)
    echo "ERROR: SourceProviderIngestionStale query 未限制 Sentry/SignOz provider freshness: ${SOURCE_PROVIDER_STALE_QUERY}"
    exit 1
    ;;
esac
log "✅ SourceProviderIngestionStale query 已限制 Sentry/SignOz provider freshness"
# Verify that every key alert rule and SLO rule is loaded in Prometheus.
KEY_RULES=("SentryDown" "HarborDown" "GiteaDown" "OpenClawDown" "AlertmanagerDown" "AlertChainUnhealthy" "SourceProviderIngestionStale")
KEY_SLO_RULES=("sli:autonomy_rate:5m" "sli:decision_accuracy:5m" "sli:confidence_calibration:1h" "sli:km_growth_rate:24h" "SLO_KMGrowthRate_Critical")
# Fetch the loaded rule names once (one name per line) instead of opening a
# new SSH session and re-downloading the same rules payload for every name.
LOADED_RULE_NAMES=$(ssh wooo@${TARGET_HOST} "curl -s ${PROMETHEUS_URL}/api/v1/rules | python3 -c \"import sys,json; r=json.load(sys.stdin); names=[x['name'] for g in r['data']['groups'] for x in g['rules']]; print(*names, sep=chr(10))\"")
for rule in "${KEY_RULES[@]}" "${KEY_SLO_RULES[@]}"; do
  # -F: literal match (rule names contain ':'); -x: whole-line match only, so
  # a name that is merely a substring of another rule does not count.
  if grep -Fxq -- "$rule" <<<"$LOADED_RULE_NAMES"; then
    log "$rule"
  else
    echo "$rule 未找到"
    exit 1
  fi
done
log "🎉 部署完成!所有關鍵規則已生效"