Files
awoooi/scripts/ops/deploy-alerts.sh
Your Name d2a4a17969
Some checks failed
Code Review / ai-code-review (push) Successful in 22s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 25s
CD Pipeline / tests (push) Successful in 1m11s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
fix(governance): stabilize adr100 km growth slo
2026-05-14 19:33:52 +08:00

99 lines
3.7 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# scripts/ops/deploy-alerts.sh
#
# Deploys the unified alert rules and the SLO rules to the Prometheus
# instance on host 110.
# 2026-04-05 Claude Code: Sprint 1 automated deployment
#
# Usage: bash scripts/ops/deploy-alerts.sh [--dry-run]

# Abort on the first failing command or failing pipeline stage.
set -o errexit
set -o pipefail

# Optional first argument; empty when the script is called without one.
DRY_RUN="${1:-}"

# Local rule files (paths are relative to the current working directory).
ALERT_RULES_FILE="ops/monitoring/alerts-unified.yml"
SLO_RULES_FILE="ops/monitoring/slo-rules.yml"

# Deployment target: host, Prometheus base URL and remote file locations.
TARGET_HOST="192.168.0.110"
PROMETHEUS_URL="http://${TARGET_HOST}:9090"
TARGET_ALERTS_PATH="/home/wooo/monitoring/alerts.yml"
TARGET_SLO_PATH="/home/wooo/monitoring/slo-rules.yml"

# Print all arguments as a single line prefixed with a [HH:MM:SS] timestamp.
log() {
  printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$*"
}
# Make sure both rule files are present before doing anything else.
for rules_file in "$ALERT_RULES_FILE" "$SLO_RULES_FILE"; do
  [ -f "$rules_file" ] && continue
  echo "ERROR: $rules_file not found"
  exit 1
done
# Validate the YAML syntax of both rule files.

# Succeed when file $1 parses as YAML. python3 (with the yaml module) is
# tried first, ruby second.
# The path is handed to the interpreter as an argument instead of being
# spliced into the program text, so a path containing a quote character
# cannot break or alter the validation code.
# Interpreter stderr is discarded on purpose: a missing parser and a real
# syntax error are both reported through the caller's single error message.
yaml_is_valid() {
  local path=$1
  python3 -c 'import sys, yaml
with open(sys.argv[1]) as fh:
    yaml.safe_load(fh)' "$path" 2>/dev/null \
    || ruby -e 'require "yaml"; YAML.load_file(ARGV[0])' -- "$path" 2>/dev/null
}

for file in "$ALERT_RULES_FILE" "$SLO_RULES_FILE"; do
  if ! yaml_is_valid "$file"; then
    echo "ERROR: YAML syntax error or no YAML parser available: $file"
    exit 1
  fi
done
log "✅ YAML 語法驗證通過"
# Dry-run mode: report what would be deployed, then exit without contacting
# the target host.

# Print how many lines of file $2 contain the fixed string $1.
# grep -c exits 1 when nothing matches (while still printing 0); "|| true"
# keeps a zero count from aborting the script under `set -e`.
count_matches() {
  grep -c -F -- "$1" "$2" || true
}

# Log the planned copy operations followed by a one-line summary of how many
# alert and recording rules the local files contain.
dry_run_report() {
  local alert_count slo_record_count slo_alert_count
  log "DRY RUN: would deploy $ALERT_RULES_FILE to ${TARGET_HOST}:${TARGET_ALERTS_PATH}"
  log "DRY RUN: would deploy $SLO_RULES_FILE to ${TARGET_HOST}:${TARGET_SLO_PATH}"
  alert_count=$(count_matches "alert:" "$ALERT_RULES_FILE")
  slo_record_count=$(count_matches "record:" "$SLO_RULES_FILE")
  slo_alert_count=$(count_matches "alert:" "$SLO_RULES_FILE")
  # Braces and explicit separators delimit each variable name; text placed
  # directly after an unbraced name would be read as part of that name and
  # the count would silently expand to nothing.
  log "告警規則數量: ${alert_count}, SLO recording: ${slo_record_count}, SLO alerts: ${slo_alert_count}"
}

if [ "$DRY_RUN" = "--dry-run" ]; then
  dry_run_report
  exit 0
fi
# Back up the rule files currently on the target host.
# One "cp <file> <file>.bak.<timestamp>" command is built per remote path and
# all of them are sent in a single ssh call. The timestamp is evaluated on
# the remote side (the "$" is escaped locally). A failing cp, for example
# because the file does not exist yet, is deliberately ignored.
remote_backup_cmd=""
for remote_path in "$TARGET_ALERTS_PATH" "$TARGET_SLO_PATH"; do
  remote_backup_cmd+="cp ${remote_path} ${remote_path}.bak.\$(date +%Y%m%d%H%M%S) 2>/dev/null || true; "
done
ssh "wooo@${TARGET_HOST}" "$remote_backup_cmd"
log "✅ 現有規則已備份"
# Deploy the new rules: copy each local rule file over its remote counterpart.
scp_target="wooo@${TARGET_HOST}"
scp "$ALERT_RULES_FILE" "${scp_target}:${TARGET_ALERTS_PATH}"
scp "$SLO_RULES_FILE" "${scp_target}:${TARGET_SLO_PATH}"
log "✅ 規則已複製到 ${TARGET_HOST}"
# Reload Prometheus by POSTing to its reload endpoint from the target host.
# --fail (-f) makes curl exit non-zero on an HTTP error response, so a
# rejected reload stops the script (via `set -e`) instead of being silently
# ignored; -S keeps curl's error message visible even though -s hides the
# progress meter.
ssh "wooo@${TARGET_HOST}" "curl -fsS -X POST ${PROMETHEUS_URL}/-/reload"
# Short pause before the rule checks that follow.
sleep 3
# Verify the number of loaded rules.
# The rules API is queried on the target host and the remote python snippet
# prints the sum of len(group['rules']) over all groups.
RULE_COUNT=$(ssh "wooo@${TARGET_HOST}" "curl -s ${PROMETHEUS_URL}/api/v1/rules | python3 -c \"import sys,json; r=json.load(sys.stdin); print(sum(len(g['rules']) for g in r['data']['groups']))\"")
log "Prometheus 已載入 ${RULE_COUNT} 條規則"
# Reject anything that is not a plain non-negative integer first: with such a
# value `[ ... -lt 25 ]` reports an error, `if` treats that as "false", and
# the sanity check below would be skipped without any failure.
if ! [[ "$RULE_COUNT" =~ ^[0-9]+$ ]]; then
  echo "ERROR: rule count is not a number: '${RULE_COUNT}'"
  exit 1
fi
# Fewer than 25 loaded rules is treated as a failed deployment.
if [ "$RULE_COUNT" -lt 25 ]; then
  echo "ERROR: 規則數量異常 ($RULE_COUNT < 25),請檢查"
  exit 1
fi
# Verify that the key alert rules and key SLO rules are loaded.
#
# The names of all loaded rules are fetched once (one ssh + curl + python
# round trip) and every required name is then checked locally, instead of
# opening a new ssh connection per rule. The remote python snippet prints
# one rule name per line; chr(10) is a newline written without a backslash so
# the command needs no extra escaping through the two shell layers.
LOADED_RULE_NAMES=$(ssh "wooo@${TARGET_HOST}" "curl -s ${PROMETHEUS_URL}/api/v1/rules | python3 -c \"import sys,json; r=json.load(sys.stdin); print(*[x['name'] for g in r['data']['groups'] for x in g['rules']], sep=chr(10))\"")

# Check every rule name given as an argument against LOADED_RULE_NAMES.
# A name that is present is logged; the first missing name is reported and
# terminates the script with exit status 1.
require_rules() {
  local rule
  for rule in "$@"; do
    # -F: literal string, -x: whole-line match, i.e. exact name equality.
    if grep -Fxq -- "$rule" <<<"$LOADED_RULE_NAMES"; then
      log "$rule"
    else
      echo "$rule 未找到"
      exit 1
    fi
  done
}

KEY_RULES=("SentryDown" "HarborDown" "GiteaDown" "OpenClawDown" "AlertmanagerDown" "AlertChainUnhealthy")
require_rules "${KEY_RULES[@]}"

KEY_SLO_RULES=("sli:autonomy_rate:5m" "sli:decision_accuracy:5m" "sli:confidence_calibration:1h" "sli:km_growth_rate:24h" "SLO_KMGrowthRate_Critical")
require_rules "${KEY_SLO_RULES[@]}"

log "🎉 部署完成!所有關鍵規則已生效"