fix(governance): stabilize adr100 km growth slo
Some checks failed
Code Review / ai-code-review (push) Successful in 22s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 25s
CD Pipeline / tests (push) Successful in 1m11s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled

This commit is contained in:
Your Name
2026-05-14 19:33:52 +08:00
parent cdb8bf6802
commit d2a4a17969
9 changed files with 267 additions and 30 deletions

View File

@@ -6,45 +6,57 @@
# Strict mode: stop on the first failing command or failing pipeline stage.
set -e -o pipefail

# Local rule files handled by this script.
ALERT_RULES_FILE="ops/monitoring/alerts-unified.yml"
SLO_RULES_FILE="ops/monitoring/slo-rules.yml"
# Older name for the alert rules file; same value as ALERT_RULES_FILE.
RULES_FILE="ops/monitoring/alerts-unified.yml"

# Target host and the paths the rule files are copied to.
TARGET_HOST="192.168.0.110"
TARGET_ALERTS_PATH="/home/wooo/monitoring/alerts.yml"
TARGET_SLO_PATH="/home/wooo/monitoring/slo-rules.yml"
# Older name for the alert rules destination; same value as TARGET_ALERTS_PATH.
TARGET_PATH="/home/wooo/monitoring/alerts.yml"

# Prometheus endpoint, derived from the target host.
PROMETHEUS_URL="http://${TARGET_HOST}:9090"

# First command-line argument (empty when not given).
DRY_RUN="${1:-}"

# Print all arguments as one message, prefixed with the current time (HH:MM:SS).
log() {
  printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$*"
}
# Confirm that every rule file exists locally; stop at the first one missing.
for file in "$RULES_FILE" "$ALERT_RULES_FILE" "$SLO_RULES_FILE"; do
  [[ -f "$file" ]] && continue
  echo "ERROR: $file not found"
  exit 1
done
# Validate the YAML syntax of each rule file before anything is deployed.

#######################################
# Check that a file parses as YAML.
# Tries python3 with the yaml module first and falls back to ruby.
# The path is passed as a program argument instead of being pasted into the
# interpreter source, so quotes or other special characters in a path cannot
# break (or alter) the one-liner.
# Arguments: $1 - path of the YAML file to check
# Returns:   0 if either parser accepted the file, non-zero otherwise
#######################################
yaml_is_valid() {
  local path=$1
  # Parser stderr is silenced on purpose: a missing interpreter/module and a
  # real syntax error are both reported by the caller with one message.
  python3 -c 'import sys, yaml
with open(sys.argv[1]) as fh:
    yaml.safe_load(fh)' "$path" 2>/dev/null && return 0
  ruby -e "require 'yaml'; YAML.load_file(ARGV[0])" "$path" 2>/dev/null
}

# RULES_FILE holds the same path as ALERT_RULES_FILE, so checking these two
# files covers everything that gets deployed.
for file in "$ALERT_RULES_FILE" "$SLO_RULES_FILE"; do
  if ! yaml_is_valid "$file"; then
    echo "ERROR: YAML syntax error or no YAML parser available: $file" >&2
    exit 1
  fi
done
log "✅ YAML 語法驗證通過"
# Dry-run mode: report what would be deployed and the rule counts, then exit
# before any ssh/scp command runs.
if [[ "$DRY_RUN" == "--dry-run" ]]; then
  # RULES_FILE / TARGET_PATH hold the same values as ALERT_RULES_FILE /
  # TARGET_ALERTS_PATH, so the alert rules file is reported once.
  log "DRY RUN: would deploy $ALERT_RULES_FILE to ${TARGET_HOST}:${TARGET_ALERTS_PATH}"
  log "DRY RUN: would deploy $SLO_RULES_FILE to ${TARGET_HOST}:${TARGET_SLO_PATH}"

  # grep -c prints 0 but exits with status 1 when nothing matches; `|| true`
  # keeps `set -e` from aborting the dry run on a legitimate zero count.
  ALERT_COUNT=$(grep -c "alert:" "$ALERT_RULES_FILE" || true)
  SLO_RECORD_COUNT=$(grep -c "record:" "$SLO_RULES_FILE" || true)
  SLO_ALERT_COUNT=$(grep -c "alert:" "$SLO_RULES_FILE" || true)

  # Braces keep each variable name separate from the text that follows it.
  log "告警規則數量: ${ALERT_COUNT}, SLO recording: ${SLO_RECORD_COUNT}, SLO alerts: ${SLO_ALERT_COUNT}"
  exit 0
fi
# Back up the rule files currently on the target host.
# Best effort by design: cp errors are silenced and `|| true` keeps a failed
# copy from failing the remote command. TARGET_PATH holds the same path as
# TARGET_ALERTS_PATH, so the alerts file is backed up once. Both backups share
# one timestamp (evaluated on the remote side) so they can be matched later.
ssh "wooo@${TARGET_HOST}" "\
ts=\$(date +%Y%m%d%H%M%S); \
cp ${TARGET_ALERTS_PATH} ${TARGET_ALERTS_PATH}.bak.\${ts} 2>/dev/null || true; \
cp ${TARGET_SLO_PATH} ${TARGET_SLO_PATH}.bak.\${ts} 2>/dev/null || true"
log "✅ 現有規則已備份"
# Copy the new rule files to the target host.
# RULES_FILE / TARGET_PATH hold the same values as ALERT_RULES_FILE /
# TARGET_ALERTS_PATH, so the alerts file is uploaded once. Under `set -e` a
# failed scp stops the script before the success message is logged.
scp "$ALERT_RULES_FILE" "wooo@${TARGET_HOST}:${TARGET_ALERTS_PATH}"
scp "$SLO_RULES_FILE" "wooo@${TARGET_HOST}:${TARGET_SLO_PATH}"
log "✅ 規則已複製到 ${TARGET_HOST}"
# Reload Prometheus
@@ -72,4 +84,15 @@ for rule in "${KEY_RULES[@]}"; do
fi
done
# Verify that every key SLO rule is present in Prometheus' rule list.
KEY_SLO_RULES=("sli:autonomy_rate:5m" "sli:decision_accuracy:5m" "sli:confidence_calibration:1h" "sli:km_growth_rate:24h" "SLO_KMGrowthRate_Critical")

# Fetch the rule names once (one ssh round trip) instead of once per rule.
# curl and python3 run on the target host; python prints one rule name per
# line (chr(10) is a newline, which avoids backslash escapes across two shells).
# A failed fetch is reported explicitly rather than aborting silently under
# `set -e`.
if ! LOADED_RULE_NAMES=$(ssh "wooo@${TARGET_HOST}" "curl -s ${PROMETHEUS_URL}/api/v1/rules | python3 -c \"import sys,json; r=json.load(sys.stdin); print(*(x['name'] for g in r['data']['groups'] for x in g['rules']), sep=chr(10))\""); then
  echo "ERROR: could not read rules from ${PROMETHEUS_URL}/api/v1/rules" >&2
  exit 1
fi

for rule in "${KEY_SLO_RULES[@]}"; do
  # -F: literal string, -x: whole-line match, so only an exact rule name counts.
  if grep -Fxq -- "$rule" <<<"$LOADED_RULE_NAMES"; then
    log "$rule"
  else
    echo "$rule 未找到" >&2
    exit 1
  fi
done
log "🎉 部署完成！所有關鍵規則已生效"