#!/bin/bash
# scripts/ops/deploy-alerts.sh
# Deploy the unified alert rules to the Prometheus instance on host 110.
# 2026-04-05 Claude Code: Sprint 1 automated deployment
# Usage: bash scripts/ops/deploy-alerts.sh [--dry-run]
#
# Steps:
#   1. Check that the local rule files and the drift-guard script exist.
#   2. Validate the YAML syntax of both rule files (python3, else ruby).
#   3. With --dry-run: print what would be deployed plus rule counts, then exit.
#   4. Back up the current remote files (best effort), copy the new ones, and
#      compare SHA-256 hashes of the local and remote copies.
#   5. Ask Prometheus to reload, then check the loaded rules through
#      /api/v1/rules (total count, two query guards, required rule names).
#
# Requirements: ssh/scp access as wooo@TARGET_HOST, curl on the target host,
# python3 locally (JSON parsing), sha256sum or shasum on both sides.

set -euo pipefail

readonly ALERT_RULES_FILE="ops/monitoring/alerts-unified.yml"
readonly SLO_RULES_FILE="ops/monitoring/slo-rules.yml"
readonly DRIFT_GUARD_SCRIPT="scripts/ops/prometheus-rule-drift-guard.sh"
readonly TARGET_HOST="192.168.0.110"
readonly TARGET_ALERTS_PATH="/home/wooo/monitoring/alerts.yml"
readonly TARGET_ALERTS_CANONICAL_PATH="/home/wooo/monitoring/alerts-unified.canonical.yml"
readonly TARGET_SLO_PATH="/home/wooo/monitoring/slo-rules.yml"
readonly TARGET_DRIFT_GUARD_PATH="/home/wooo/scripts/prometheus-rule-drift-guard.sh"
readonly PROMETHEUS_URL="http://${TARGET_HOST}:9090"
readonly SSH_TARGET="wooo@${TARGET_HOST}"
# Fewer loaded rules than this after the reload is treated as a failed deploy.
readonly MIN_RULE_COUNT=25
# Alerting rules that must be loaded after the reload.
readonly -a KEY_RULES=(
  "SentryDown" "HarborDown" "GiteaDown" "OpenClawDown"
  "AlertmanagerDown" "AlertChainUnhealthy" "SourceProviderIngestionStale"
)
# SLO recording/alerting rules that must be loaded after the reload.
readonly -a KEY_SLO_RULES=(
  "sli:autonomy_rate:5m" "sli:decision_accuracy:5m"
  "sli:confidence_calibration:1h" "sli:km_growth_rate:24h"
  "SLO_KMGrowthRate_Critical"
)
DRY_RUN="${1:-}"

# Print a timestamped progress message to stdout.
log() { echo "[$(date '+%H:%M:%S')] $*"; }

# Print a message to stderr and abort the script with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the SHA-256 hex digest of local file $1 (sha256sum, else shasum).
file_sha() {
  if command -v sha256sum >/dev/null 2>&1; then
    sha256sum "$1" | awk '{print $1}'
  else
    shasum -a 256 "$1" | awk '{print $1}'
  fi
}

# Print the SHA-256 hex digest of file $1 on the target host.
# $1 is expanded locally into the remote command; \$1 stays literal for awk.
remote_file_sha() {
  ssh "$SSH_TARGET" "\
    if command -v sha256sum >/dev/null 2>&1; then \
      sha256sum '$1' | awk '{print \$1}'; \
    else \
      shasum -a 256 '$1' | awk '{print \$1}'; \
    fi"
}

# Return 0 if file $1 parses as YAML with python3 (PyYAML) or, failing that,
# ruby. The file name is passed as an argument, never spliced into the code.
# Parser stderr is discarded on purpose: the first parser may simply be absent.
validate_yaml() {
  local file=$1
  if python3 -c 'import sys, yaml; yaml.safe_load(open(sys.argv[1]))' "$file" 2>/dev/null; then
    return 0
  fi
  if ruby -e 'require "yaml"; YAML.load_file(ARGV[0])' "$file" 2>/dev/null; then
    return 0
  fi
  return 1
}

# Print how many lines of file $2 contain the fixed string $1.
# grep -c exits 1 when the count is 0; that is not an error here, so only a
# run that produced no usable count is reported as a failure.
count_matches() {
  local count
  count=$(grep -cF -- "$1" "$2") || [[ "$count" == "0" ]] || return 1
  printf '%s\n' "$count"
}

# Describe what a real run would copy and how many rules it carries.
print_dry_run_summary() {
  local alert_count slo_record_count slo_alert_count
  log "DRY RUN: would deploy $ALERT_RULES_FILE to ${TARGET_HOST}:${TARGET_ALERTS_PATH}"
  log "DRY RUN: would deploy $ALERT_RULES_FILE to ${TARGET_HOST}:${TARGET_ALERTS_CANONICAL_PATH}"
  log "DRY RUN: would deploy $SLO_RULES_FILE to ${TARGET_HOST}:${TARGET_SLO_PATH}"
  log "DRY RUN: would deploy $DRIFT_GUARD_SCRIPT to ${TARGET_HOST}:${TARGET_DRIFT_GUARD_PATH}"
  alert_count=$(count_matches "alert:" "$ALERT_RULES_FILE")
  slo_record_count=$(count_matches "record:" "$SLO_RULES_FILE")
  slo_alert_count=$(count_matches "alert:" "$SLO_RULES_FILE")
  log "告警規則數量: $alert_count 條;SLO recording: $slo_record_count 條;SLO alerts: $slo_alert_count 條"
}

# Copy each currently deployed file to <file>.bak.<timestamp> on the target.
# Best effort by design: a file that does not exist yet is skipped silently.
# A single timestamp is used so all backups of one run share the same suffix.
backup_remote_files() {
  ssh "$SSH_TARGET" "\
    ts=\$(date +%Y%m%d%H%M%S); \
    for f in \
      '${TARGET_ALERTS_PATH}' \
      '${TARGET_ALERTS_CANONICAL_PATH}' \
      '${TARGET_SLO_PATH}' \
      '${TARGET_DRIFT_GUARD_PATH}'; do \
      cp \"\${f}\" \"\${f}.bak.\${ts}\" 2>/dev/null || true; \
    done"
}

# Copy the rule files and the drift-guard script to the target host.
# The alert rules are written to both the live and the canonical path.
copy_files_to_target() {
  scp "$ALERT_RULES_FILE" "${SSH_TARGET}:${TARGET_ALERTS_PATH}"
  scp "$ALERT_RULES_FILE" "${SSH_TARGET}:${TARGET_ALERTS_CANONICAL_PATH}"
  scp "$SLO_RULES_FILE" "${SSH_TARGET}:${TARGET_SLO_PATH}"
  scp "$DRIFT_GUARD_SCRIPT" "${SSH_TARGET}:${TARGET_DRIFT_GUARD_PATH}"
  ssh "$SSH_TARGET" "chmod 0755 ${TARGET_DRIFT_GUARD_PATH}"
}

# Abort unless the remote file $2 hashes to the expected digest $1.
# $3 is the file label used in the error message.
verify_remote_hash() {
  local expected=$1 remote_path=$2 label=$3
  local remote_sha
  remote_sha=$(remote_file_sha "$remote_path")
  if [[ "$expected" != "$remote_sha" ]]; then
    die "ERROR: 遠端 ${label} hash 不一致 local=${expected} remote=${remote_sha}"
  fi
}

# Compare every deployed file against its local source.
verify_remote_hashes() {
  local alerts_sha slo_sha drift_guard_sha
  alerts_sha=$(file_sha "$ALERT_RULES_FILE")
  slo_sha=$(file_sha "$SLO_RULES_FILE")
  drift_guard_sha=$(file_sha "$DRIFT_GUARD_SCRIPT")
  verify_remote_hash "$alerts_sha" "$TARGET_ALERTS_PATH" "alerts.yml"
  verify_remote_hash "$alerts_sha" "$TARGET_ALERTS_CANONICAL_PATH" "alerts-unified.canonical.yml"
  verify_remote_hash "$slo_sha" "$TARGET_SLO_PATH" "slo-rules.yml"
  verify_remote_hash "$drift_guard_sha" "$TARGET_DRIFT_GUARD_PATH" "prometheus-rule-drift-guard.sh"
}

# Ask Prometheus to reload its configuration and give it time to settle.
# -f makes curl exit non-zero on an HTTP error status, so a rejected reload
# aborts the deploy instead of being verified against the previous rule set.
reload_prometheus() {
  ssh "$SSH_TARGET" "curl -fsS -X POST ${PROMETHEUS_URL}/-/reload" \
    || die "ERROR: Prometheus reload request failed (${PROMETHEUS_URL}/-/reload)"
  sleep 3
}

# Read the /api/v1/rules JSON on stdin; print the total number of rules.
count_loaded_rules() {
  python3 -c 'import json, sys; print(sum(len(g["rules"]) for g in json.load(sys.stdin)["data"]["groups"]))'
}

# Read the /api/v1/rules JSON on stdin; print one rule name per line.
list_loaded_rule_names() {
  python3 -c 'import json, sys; print("\n".join(r.get("name", "") for g in json.load(sys.stdin)["data"]["groups"] for r in g["rules"]))'
}

# Read the /api/v1/rules JSON on stdin; print the query of the first rule
# named $1, or an empty line when no such rule is loaded.
loaded_rule_query() {
  python3 -c 'import json, sys; print(next((r.get("query", "") for g in json.load(sys.stdin)["data"]["groups"] for r in g["rules"] if r.get("name") == sys.argv[1]), ""))' "$1"
}

# Abort unless every rule name in $2.. appears in the newline-separated
# list of loaded names given as $1 (exact, whole-line match).
require_rules() {
  local loaded_names=$1
  shift
  local rule
  for rule in "$@"; do
    if grep -Fxq -- "$rule" <<<"$loaded_names"; then
      log "✅ $rule"
    else
      die "❌ $rule 未找到"
    fi
  done
}

# Fetch the loaded rules once and run every post-deploy check against them.
verify_loaded_rules() {
  local rules_json rule_count loaded_names query

  rules_json=$(ssh "$SSH_TARGET" "curl -fsS ${PROMETHEUS_URL}/api/v1/rules") \
    || die "ERROR: failed to fetch ${PROMETHEUS_URL}/api/v1/rules"

  # Verify the number of loaded rules.
  rule_count=$(printf '%s' "$rules_json" | count_loaded_rules) \
    || die "ERROR: could not parse the /api/v1/rules response"
  log "Prometheus 已載入 ${rule_count} 條規則"
  # Guard the numeric comparison: a non-numeric value must fail, not pass.
  if [[ ! "$rule_count" =~ ^[0-9]+$ ]] || ((rule_count < MIN_RULE_COUNT)); then
    die "ERROR: 規則數量異常 ($rule_count < ${MIN_RULE_COUNT}),請檢查"
  fi

  # NoAlertsReceived2Hours must be scoped to the alertmanager source.
  query=$(printf '%s' "$rules_json" | loaded_rule_query "NoAlertsReceived2Hours")
  if [[ "$query" != *'source="alertmanager"'* ]]; then
    die "ERROR: NoAlertsReceived2Hours query 未限制 alertmanager 主鏈路: ${query}"
  fi
  log "✅ NoAlertsReceived2Hours query 已限制 alertmanager 主鏈路"

  # SourceProviderIngestionStale must be scoped to the sentry/signoz sources.
  query=$(printf '%s' "$rules_json" | loaded_rule_query "SourceProviderIngestionStale")
  if [[ "$query" != *'source=~"sentry|signoz"'* ]]; then
    die "ERROR: SourceProviderIngestionStale query 未限制 Sentry/SignOz provider freshness: ${query}"
  fi
  log "✅ SourceProviderIngestionStale query 已限制 Sentry/SignOz provider freshness"

  # Verify that the key rules exist.
  loaded_names=$(printf '%s' "$rules_json" | list_loaded_rule_names)
  require_rules "$loaded_names" "${KEY_RULES[@]}"
  require_rules "$loaded_names" "${KEY_SLO_RULES[@]}"
}

# Entry point: validate inputs, then either report (dry run) or deploy.
main() {
  local file

  # Reject anything but the documented flag so that a mistyped --dry-run
  # cannot turn into a real deployment.
  case "$DRY_RUN" in
    "" | --dry-run) ;;
    *)
      printf 'ERROR: unknown argument: %s\n' "$DRY_RUN" >&2
      printf 'Usage: bash scripts/ops/deploy-alerts.sh [--dry-run]\n' >&2
      exit 2
      ;;
  esac

  # Make sure the input files exist.
  for file in "$ALERT_RULES_FILE" "$SLO_RULES_FILE" "$DRIFT_GUARD_SCRIPT"; do
    [[ -f "$file" ]] || die "ERROR: $file not found"
  done

  # Validate YAML syntax.
  for file in "$ALERT_RULES_FILE" "$SLO_RULES_FILE"; do
    validate_yaml "$file" \
      || die "ERROR: YAML syntax error or no YAML parser available: $file"
  done
  log "✅ YAML 語法驗證通過"

  # Dry-run mode.
  if [[ "$DRY_RUN" == "--dry-run" ]]; then
    print_dry_run_summary
    exit 0
  fi

  # Back up the existing rules.
  backup_remote_files
  log "✅ 現有規則已備份"

  # Deploy the new rules.
  copy_files_to_target
  log "✅ 規則已複製到 ${TARGET_HOST}"

  verify_remote_hashes
  log "✅ 遠端規則 hash 驗證通過"

  # Reload Prometheus.
  reload_prometheus

  verify_loaded_rules
  log "🎉 部署完成!所有關鍵規則已生效"
}

main "$@"