From 4956fbb849c2a103f6bb23965ee5ee9ccd6bdcb4 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Wed, 20 May 2026 13:26:24 +0800
Subject: [PATCH] fix(monitoring): verify alert rule deploy content

---
Review notes (ignored by git am):
  - Leading indentation was reconstructed from a whitespace-flattened copy of
    this patch (4 spaces assumed). If the context lines do not match the
    target file, apply with `git apply --ignore-whitespace`.
  - The added lines were revised during review, so the post-image blob id on
    the index line below no longer describes the resulting file.

 scripts/ops/deploy-alerts.sh | 68 ++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/scripts/ops/deploy-alerts.sh b/scripts/ops/deploy-alerts.sh
index dffff073..051b2665 100755
--- a/scripts/ops/deploy-alerts.sh
+++ b/scripts/ops/deploy-alerts.sh
@@ -16,6 +16,30 @@ DRY_RUN="${1:-}"
 
 log() { echo "[$(date '+%H:%M:%S')] $*"; }
 
+# Print the SHA-256 hex digest of the local file $1.
+# Uses sha256sum when available, otherwise falls back to shasum -a 256.
+file_sha() {
+    if command -v sha256sum >/dev/null 2>&1; then
+        sha256sum "$1" | awk '{print $1}'
+    else
+        shasum -a 256 "$1" | awk '{print $1}'
+    fi
+}
+
+# Print the SHA-256 hex digest of file $1 on ${TARGET_HOST} (via ssh as wooo).
+# The path is shell-escaped with printf %q before being embedded in the
+# remote command, so spaces or quotes in it cannot break out of the command.
+remote_file_sha() {
+    local remote_path
+    printf -v remote_path '%q' "$1"
+    ssh "wooo@${TARGET_HOST}" "\
+        if command -v sha256sum >/dev/null 2>&1; then \
+            sha256sum ${remote_path} | awk '{print \$1}'; \
+        else \
+            shasum -a 256 ${remote_path} | awk '{print \$1}'; \
+        fi"
+}
+
 # 確認檔案存在
 for file in "$ALERT_RULES_FILE" "$SLO_RULES_FILE"; do
     if [ ! -f "$file" ]; then
@@ -59,6 +83,29 @@ scp "$ALERT_RULES_FILE" wooo@${TARGET_HOST}:${TARGET_ALERTS_PATH}
 scp "$SLO_RULES_FILE" wooo@${TARGET_HOST}:${TARGET_SLO_PATH}
 log "✅ 規則已複製到 ${TARGET_HOST}"
 
+# Compare local and remote SHA-256 digests of both rule files before the
+# Prometheus reload. An empty digest is rejected explicitly: two empty strings
+# would otherwise compare equal and the check would pass without verifying.
+LOCAL_ALERTS_SHA="$(file_sha "$ALERT_RULES_FILE")"
+REMOTE_ALERTS_SHA="$(remote_file_sha "$TARGET_ALERTS_PATH")"
+LOCAL_SLO_SHA="$(file_sha "$SLO_RULES_FILE")"
+REMOTE_SLO_SHA="$(remote_file_sha "$TARGET_SLO_PATH")"
+for digest in "$LOCAL_ALERTS_SHA" "$REMOTE_ALERTS_SHA" "$LOCAL_SLO_SHA" "$REMOTE_SLO_SHA"; do
+    if [ -z "$digest" ]; then
+        echo "ERROR: 無法取得規則檔 hash (local 或 remote 為空)，無法驗證部署內容" >&2
+        exit 1
+    fi
+done
+if [ "$LOCAL_ALERTS_SHA" != "$REMOTE_ALERTS_SHA" ]; then
+    echo "ERROR: 遠端 alerts.yml hash 不一致 local=${LOCAL_ALERTS_SHA} remote=${REMOTE_ALERTS_SHA}" >&2
+    exit 1
+fi
+if [ "$LOCAL_SLO_SHA" != "$REMOTE_SLO_SHA" ]; then
+    echo "ERROR: 遠端 slo-rules.yml hash 不一致 local=${LOCAL_SLO_SHA} remote=${REMOTE_SLO_SHA}" >&2
+    exit 1
+fi
+log "✅ 遠端規則 hash 驗證通過"
+
 # Reload Prometheus
 ssh wooo@${TARGET_HOST} "curl -s -X POST ${PROMETHEUS_URL}/-/reload"
 sleep 3
@@ -72,6 +119,27 @@ if [ "$RULE_COUNT" -lt 25 ]; then
     exit 1
 fi
 
+# Read the expression of the NoAlertsReceived2Hours rule from the Prometheus
+# rules API. The Python helper prints an empty string when the response is not
+# valid JSON, carries no rule groups, or does not contain the rule.
+NO_ALERTS_QUERY=$(ssh "wooo@${TARGET_HOST}" "curl -s ${PROMETHEUS_URL}/api/v1/rules" | python3 -c "
+import json, sys
+try:
+    groups = json.load(sys.stdin).get('data', {}).get('groups', [])
+except (ValueError, AttributeError):
+    groups = []
+print(next((r.get('query', '') for g in groups for r in g.get('rules', []) if r.get('name') == 'NoAlertsReceived2Hours'), ''))
+")
+if [ -z "$NO_ALERTS_QUERY" ]; then
+    echo "ERROR: 找不到 NoAlertsReceived2Hours 規則或無法讀取 Prometheus rules API" >&2
+    exit 1
+fi
+if [[ "$NO_ALERTS_QUERY" != *'source="alertmanager"'* ]]; then
+    echo "ERROR: NoAlertsReceived2Hours query 未限制 alertmanager 主鏈路: ${NO_ALERTS_QUERY}" >&2
+    exit 1
+fi
+log "✅ NoAlertsReceived2Hours query 已限制 alertmanager 主鏈路"
+
 # 驗證關鍵規則存在
 KEY_RULES=("SentryDown" "HarborDown" "GiteaDown" "OpenClawDown" "AlertmanagerDown" "AlertChainUnhealthy")
 for rule in "${KEY_RULES[@]}"; do