Files
awoooi/scripts/ops/deploy-alertmanager-config.sh
Your Name ee2cc2bfc3
Some checks failed
CD Pipeline / tests (push) Failing after 1m23s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Successful in 15s
fix(alerts): 收斂 Telegram 告警到 SRE 戰情室
2026-06-12 11:06:16 +08:00

145 lines
4.7 KiB
Bash
Executable File

#!/usr/bin/env bash
# Render and deploy ops/alertmanager/alertmanager.yml to the 110 Docker Alertmanager.
#
# This script keeps the live direct-Telegram emergency route aligned with Git:
# - inject Telegram bot token and SRE group chat id from K8s secret or env
# - validate with amtool before touching the live config
# - back up the live file
# - keep the bind-mounted live file inode and readable permissions intact
# - reload Alertmanager with SIGHUP
#
# Usage:
#   bash scripts/ops/deploy-alertmanager-config.sh [--dry-run]
#
# Optional env (defaults shown):
#   TARGET_HOST=192.168.0.110
#   TARGET_USER=wooo
#   TARGET_PATH=/home/wooo/monitoring/alertmanager.yml
#   K8S_HOST=192.168.0.120
#   K8S_USER=wooo
#   K8S_NAMESPACE=awoooi-prod
#   K8S_SECRET=awoooi-secrets
#   TELEGRAM_BOT_TOKEN=...   (when set, the K8s secret lookup is skipped)
#   SRE_GROUP_CHAT_ID=...    (when set, the K8s secret lookup is skipped)
# Fail fast: abort on command errors, unset variables, and failures anywhere in a pipeline.
set -euo pipefail

# Resolve the repository root relative to this script so it can be run from any directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
CONFIG_TEMPLATE="${REPO_ROOT}/ops/alertmanager/alertmanager.yml"

# SSH target that hosts the Alertmanager container; each value is overridable via env.
TARGET_HOST="${TARGET_HOST:-192.168.0.110}"
TARGET_USER="${TARGET_USER:-wooo}"
TARGET_PATH="${TARGET_PATH:-/home/wooo/monitoring/alertmanager.yml}"

# SSH target and Kubernetes coordinates used to look up secrets; overridable via env.
K8S_HOST="${K8S_HOST:-192.168.0.120}"
K8S_USER="${K8S_USER:-wooo}"
K8S_NAMESPACE="${K8S_NAMESPACE:-awoooi-prod}"
K8S_SECRET="${K8S_SECRET:-awoooi-secrets}"

# First positional argument: empty (deploy) or "--dry-run" (validate only).
DRY_RUN="${1:-}"
# Reject anything else up front: a mistyped flag such as "--dryrun" would otherwise
# fail the later "--dry-run" comparison and silently perform a live deployment.
case "$DRY_RUN" in
  '' | --dry-run) ;;
  *)
    printf 'ERROR: unknown argument: %s (usage: %s [--dry-run])\n' "$DRY_RUN" "${0##*/}" >&2
    exit 2
    ;;
esac
# Print a progress message to stdout, prefixed with the current time as [HH:MM:SS].
# Arguments: all arguments are joined with spaces to form the message.
log() {
  local message="$*"
  printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$message"
}
# Report a fatal error on stderr and terminate the script with status 1.
# Arguments: all arguments are joined with spaces to form the message.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}
# Decode base64 text read from stdin and print the decoded text, with surrounding
# whitespace stripped, followed by a newline. Fails (non-zero) when python3 cannot
# decode the input.
decode_b64() {
  python3 -c '
import base64
import sys

encoded = sys.stdin.read()
decoded = base64.b64decode(encoded).decode().strip()
print(decoded)
'
}
# Fetch one key of the configured Kubernetes secret, still base64-encoded.
# Globals:   K8S_USER, K8S_HOST, K8S_NAMESPACE, K8S_SECRET (read)
# Arguments: $1 - name of the key under the secret's .data
# Outputs:   whatever the remote kubectl prints for that key (stdout); stderr is discarded
# Returns:   the exit status of ssh
secret_key_b64() {
  local secret_key="$1"
  local -a ssh_opts=(-o BatchMode=yes -o ConnectTimeout=8)
  local remote_cmd
  remote_cmd="sudo -n kubectl -n '${K8S_NAMESPACE}' get secret '${K8S_SECRET}' -o jsonpath='{.data.${secret_key}}'"
  ssh "${ssh_opts[@]}" "${K8S_USER}@${K8S_HOST}" "$remote_cmd" 2>/dev/null
}
# Resolve a secret value, preferring an explicit override over the Kubernetes secret.
# Arguments: $1  - override value (typically from the environment); used as-is when non-empty
#            $2+ - candidate key names, tried in order via secret_key_b64
# Outputs:   the resolved value on stdout
# Returns:   0 when a non-empty value was found, 1 otherwise
read_secret_first_available() {
  local env_value="$1"
  shift
  if [[ -n "$env_value" ]]; then
    printf '%s' "$env_value"
    return 0
  fi

  local key raw decoded
  for key in "$@"; do
    # A failed lookup just means this key is unavailable; move on to the next one.
    raw="$(secret_key_b64 "$key" || true)"
    [[ -n "$raw" ]] || continue
    # Only accept a key whose value decodes cleanly to something non-empty. Without
    # these checks a corrupt or blank entry would be reported as success with an
    # empty value, and the remaining candidate keys would never be tried.
    decoded="$(printf '%s' "$raw" | decode_b64)" || continue
    [[ -n "$decoded" ]] || continue
    # Trailing newline kept so the output matches what decode_b64 itself emits.
    printf '%s\n' "$decoded"
    return 0
  done
  return 1
}
# Refuse to continue without the template that will be rendered.
[[ -f "$CONFIG_TEMPLATE" ]] || die "template not found: ${CONFIG_TEMPLATE}"

# Bot token: environment override first, then the known key names in the K8s secret.
TELEGRAM_BOT_TOKEN="$(
  read_secret_first_available \
    "${TELEGRAM_BOT_TOKEN:-}" \
    OPENCLAW_TG_BOT_TOKEN \
    OPENCLAW_BOT_TOKEN \
    TELEGRAM_BOT_TOKEN \
    TG_BOT_TOKEN
)" || die "missing Telegram bot token; set TELEGRAM_BOT_TOKEN or add one of the known keys to ${K8S_SECRET}"
# The token is substituted verbatim into the rendered config, so reject empty values
# and anything containing whitespace or other unexpected characters. The message
# deliberately does not echo the token.
[[ "$TELEGRAM_BOT_TOKEN" =~ ^[0-9]+:[A-Za-z0-9_-]+$ ]] \
  || die "TELEGRAM_BOT_TOKEN is empty or malformed; expected <bot id>:<token> with no whitespace"

# Chat id of the SRE group: environment override first, then the K8s secret.
SRE_GROUP_CHAT_ID="$(
  read_secret_first_available \
    "${SRE_GROUP_CHAT_ID:-}" \
    SRE_GROUP_CHAT_ID
)" || die "missing SRE_GROUP_CHAT_ID"
[[ "$SRE_GROUP_CHAT_ID" =~ ^-?[0-9]+$ ]] || die "SRE_GROUP_CHAT_ID must be a Telegram numeric chat id"

# Exported so the rendering step can read both values from its environment.
export TELEGRAM_BOT_TOKEN SRE_GROUP_CHAT_ID
# Render the template into a private temporary file that is deleted on any exit path.
tmp_rendered="$(mktemp)"
trap 'rm -f "$tmp_rendered"' EXIT
chmod 600 "$tmp_rendered"

# The secrets are read from the environment rather than passed as arguments.
python3 - "$CONFIG_TEMPLATE" "$tmp_rendered" <<'PY'
import os
import sys
from pathlib import Path

template_path, output_path = (Path(arg) for arg in sys.argv[1:3])

# (placeholder in the template, environment variable holding its value), applied in order.
SUBSTITUTIONS = (
    ("TELEGRAM_BOT_TOKEN_PLACEHOLDER", "TELEGRAM_BOT_TOKEN"),
    ("SRE_GROUP_CHAT_ID_PLACEHOLDER", "SRE_GROUP_CHAT_ID"),
)

rendered = template_path.read_text()
for placeholder, env_name in SUBSTITUTIONS:
    rendered = rendered.replace(placeholder, os.environ[env_name])

# Abort if any placeholder text is still present after substitution.
if any(placeholder in rendered for placeholder, _ in SUBSTITUTIONS):
    raise SystemExit("unreplaced secret placeholder remains in rendered config")

output_path.write_text(rendered)
PY
log "Validating rendered config with live Alertmanager amtool on ${TARGET_HOST}"
# Stream the rendered config into the running container and check it with that
# container's amtool. The scratch copy contains the bot token, so it is created
# under a restrictive umask (after removing any stale copy) and always deleted
# afterwards; amtool's exit status is preserved so a failed check still aborts here.
# \$ is escaped so that $? and $rc are evaluated by the shell inside the container.
ssh -o BatchMode=yes -o ConnectTimeout=8 "${TARGET_USER}@${TARGET_HOST}" \
  "docker exec -i alertmanager sh -c 'umask 077; rm -f /tmp/alertmanager-rendered.yml; cat >/tmp/alertmanager-rendered.yml && amtool check-config /tmp/alertmanager-rendered.yml; rc=\$?; rm -f /tmp/alertmanager-rendered.yml; exit \$rc'" \
  < "$tmp_rendered"

# In dry-run mode stop after validation, before anything on the target is modified.
if [[ "$DRY_RUN" == "--dry-run" ]]; then
  log "DRY RUN: rendered config validated; not deploying"
  exit 0
fi
log "Uploading rendered config to ${TARGET_HOST}:${TARGET_PATH}"
# Stage the secret-bearing file next to the live config instead of under a
# predictable name in world-writable /tmp. Any stale staged file is removed first
# so the restrictive umask applies to a freshly created file.
ssh -o BatchMode=yes -o ConnectTimeout=8 "${TARGET_USER}@${TARGET_HOST}" \
  "umask 077 && rm -f -- '${TARGET_PATH}.new' && cat > '${TARGET_PATH}.new'" < "$tmp_rendered"

# The heredoc below is expanded locally: \$ marks what the remote shell must evaluate.
ssh -o BatchMode=yes -o ConnectTimeout=8 "${TARGET_USER}@${TARGET_HOST}" "bash -s" <<REMOTE
set -euo pipefail
target='${TARGET_PATH}'
staged="\${target}.new"
# Remove the staged copy on every exit path, including failures.
trap 'rm -f -- "\$staged"' EXIT
backup="\${target}.bak.\$(date +%Y%m%d%H%M%S)"
cp -- "\$target" "\$backup"
# Alertmanager bind-mounts a single file, so overwrite the existing inode in place
# rather than renaming a replacement over it, then restore readable permissions
# for the container user. If the write or the in-container validation fails, put
# the backup back so an invalid config is never left on disk.
if ! {
  cat -- "\$staged" > "\$target" &&
    chmod 0644 "\$target" &&
    docker exec alertmanager amtool check-config /etc/alertmanager/alertmanager.yml
}; then
  echo "ERROR: new config could not be written or failed validation; restoring \$backup" >&2
  cat -- "\$backup" > "\$target"
  exit 1
fi
# SIGHUP asks the running Alertmanager to reload its configuration.
docker kill -s HUP alertmanager >/dev/null
sleep 2
docker inspect alertmanager --format 'status={{.State.Status}} started={{.State.StartedAt}}'
echo "backup=\$backup"
REMOTE
log "Alertmanager config deployed and reloaded"