REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
MONITOR_SCRIPT="${REPO_ROOT}/scripts/ops/docker-health-monitor.sh"
TARGET="${1:-all}"

# Private key used for every ssh/scp connection. Overridable from the
# environment so a runner with a dedicated deploy key needs no code change;
# defaults to the path this script has always used.
SSH_KEY="${SSH_KEY:-${HOME}/.ssh/id_rsa}"

# Host-key policy passed to ssh/scp as StrictHostKeyChecking=<value>.
# NOTE(review): the default "no" skips host-key verification entirely, which
# leaves the deploy open to man-in-the-middle on the LAN. It is kept as the
# default for backward compatibility; export
# SSH_STRICT_HOST_KEY_CHECKING=accept-new (OpenSSH >= 7.6) to harden.
SSH_STRICT_HOST_KEY_CHECKING="${SSH_STRICT_HOST_KEY_CHECKING:-no}"

#######################################
# Print the login user for a target host.
# Host 192.168.0.188 logs in as "ollama"; every other host as "wooo".
# Arguments: $1 - target host (IP or name)
# Outputs:   user name on stdout
#######################################
ssh_user() {
  local host="$1"
  case "$host" in
    192.168.0.188) printf '%s\n' "ollama" ;;
    *) printf '%s\n' "wooo" ;;
  esac
}

#######################################
# Run a command on a target host over a direct ssh connection.
# Globals:   SSH_KEY, SSH_STRICT_HOST_KEY_CHECKING (read)
# Arguments: $1 - target host; remaining args - remote command
# Returns:   2 if the host is empty, otherwise the exit status of ssh
#######################################
ssh_cmd() {
  if [[ -z "${1:-}" ]]; then
    printf 'ssh_cmd: host is required\n' >&2
    return 2
  fi
  local host="$1"
  shift
  local user
  user=$(ssh_user "$host")
  ssh -i "$SSH_KEY" \
    -o "StrictHostKeyChecking=${SSH_STRICT_HOST_KEY_CHECKING}" \
    "${user}@${host}" "$@"
}

#######################################
# Copy a local file to a target host over a direct scp connection.
# Globals:   SSH_KEY, SSH_STRICT_HOST_KEY_CHECKING (read)
# Arguments: $1 - local source path
#            $2 - target host
#            $3 - destination path on the target host
# Returns:   2 if the host is empty, otherwise the exit status of scp
#######################################
scp_cmd() {
  local src="$1"
  local host="${2:-}"
  local dst="${3:-}"
  if [[ -z "$host" ]]; then
    printf 'scp_cmd: host is required\n' >&2
    return 2
  fi
  local user
  user=$(ssh_user "$host")
  scp -i "$SSH_KEY" \
    -o "StrictHostKeyChecking=${SSH_STRICT_HOST_KEY_CHECKING}" \
    "$src" "${user}@${host}:${dst}"
}
# Repair tiers (per ADR-060):
#   automatic docker restart : ordinary application containers
#                              (not DB / Redis / monitoring stack)
#   automatic docker start   : prometheus / grafana / alertmanager
#                              (start instead of restart, to protect the WAL)
#   alert only, never restart: postgres / redis / clickhouse / DB-class
#
# Flow: Detect -> AutoRepair -> Report (Intent -> Action -> Result)

set -euo pipefail

: "${TELEGRAM_BOT_TOKEN:=}"
: "${TELEGRAM_CHAT_ID:=}"
: "${LOG_FILE:=/var/log/docker-health-monitor.log}"
# Minimum number of seconds between two handling attempts for one container.
: "${SEND_COOLDOWN_SECONDS:=300}"
# Directory holding one "<container>.cooldown" timestamp file per container.
: "${COOLDOWN_DIR:=/tmp/docker-health-monitor-cooldown}"
# Comma-separated container names to ignore; entries may be glob patterns.
: "${EXCLUDE_CONTAINERS:=signoz-telemetrystore-migrator,signoz-clickhouse,signoz-init-clickhouse}"

mkdir -p "$COOLDOWN_DIR"

# Containers that must never be restarted automatically (substring match).
# Databases, caches and the ClickHouse store: alert only.
READONLY_PATTERNS=(
  "postgres" "momo-db" "langfuse-db" "harbor-db" "sentry-postgres"
  "redis" "harbor-redis" "sentry-redis" "signoz-clickhouse"
)

# Monitoring-stack containers: use `docker start` (not restart) to protect the WAL.
SAFE_START_PATTERNS=(
  "prometheus" "grafana" "alertmanager"
)

# ─── Helpers ─────────────────────────────────────────────────────────────────

#######################################
# Write a timestamped log line.
# Arguments: all arguments are joined into the message
# Outputs:   "[YYYY-MM-DD HH:MM:SS +ZZZZ] message" on stdout
#######################################
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S %z')" "$*"
}

#######################################
# Record "now" (epoch seconds) as the last handling time of a container.
# Globals:   COOLDOWN_DIR (read)
# Arguments: $1 - container name
#######################################
set_cooldown() {
  local container="$1"
  date +%s > "${COOLDOWN_DIR}/${container}.cooldown"
}

#######################################
# Test whether a container name contains any of the given substrings.
# Arguments: $1 - container name; remaining args - substrings to look for
# Returns:   0 if at least one non-empty substring occurs in the name,
#            1 otherwise (including when no substrings are given)
#######################################
matches_pattern() {
  local name="$1"
  shift
  local pattern
  # Iterate "$@" directly: an empty list is then simply zero iterations,
  # with no empty-array expansion that older bash rejects under `set -u`.
  for pattern in "$@"; do
    # An empty substring would match every name and silently put every
    # container into the list being tested, so it is ignored.
    [[ -z "$pattern" ]] && continue
    case "$name" in
      *"${pattern}"*) return 0 ;;
    esac
  done
  return 1
}

# ─── Direct Telegram fallback ────────────────────────────────────────────────

#######################################
# Best-effort delivery of a message straight to the Telegram Bot API.
# Does nothing when the bot token or chat id is not configured. Failures
# are deliberately swallowed so a notification problem never aborts the scan.
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID (read)
# Arguments: $1 - message text (sent with parse_mode=HTML)
# Returns:   always 0
#######################################
send_telegram_direct() {
  local message="$1"
  if [[ -z "$TELEGRAM_BOT_TOKEN" || -z "$TELEGRAM_CHAT_ID" ]]; then
    return 0
  fi
  # Form-encoded fields let curl escape the text. Splicing the message into
  # a hand-built JSON string broke on quotes, backslashes and newlines, and
  # the alert was then dropped without any trace.
  # --max-time keeps an unreachable Telegram from stalling the cron run.
  curl -s --max-time 10 -X POST \
    "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
    --data-urlencode "text=${message}" \
    --data-urlencode "parse_mode=HTML" \
    > /dev/null 2>&1 || true
}
# ─── Auto-repair logic ───────────────────────────────────────────────────────

#######################################
# Apply the tiered repair policy to one failing container and report the
# outcome through report_to_awoooi.
#
#   1. Name matches READONLY_PATTERNS   -> no action, report alert_only/skipped.
#   2. Name matches SAFE_START_PATTERNS -> `docker start` only (never restart).
#      If the container was detected as "unhealthy" it is still running, so
#      `docker start` would be a no-op that exits 0; reporting that as a
#      successful repair would be false, so this case is alert_only/skipped.
#   3. Anything else                    -> `docker restart`.
#
# Globals:   READONLY_PATTERNS, SAFE_START_PATTERNS (read)
# Arguments: $1 - container name
#            $2 - detected status: unhealthy | exited | dead
# Returns:   status of the last report_to_awoooi call
#######################################
attempt_repair() {
  local container="$1"
  local detected_status="$2"

  # 1. Never-restart list: alert only.
  if matches_pattern "$container" "${READONLY_PATTERNS[@]}"; then
    log "ALERT_ONLY: ${container} 在禁止重啟清單,僅告警"
    report_to_awoooi "$container" "$detected_status" "alert_only" "skipped"
    return
  fi

  # 2. Monitoring stack: docker start only (protects the WAL).
  if matches_pattern "$container" "${SAFE_START_PATTERNS[@]}"; then
    if [[ "$detected_status" == "unhealthy" ]]; then
      # Still running: start cannot help and restart is not allowed here.
      log "ALERT_ONLY: ${container} 屬監控棧且仍在執行中 (unhealthy),docker start 無效,僅告警"
      report_to_awoooi "$container" "$detected_status" "alert_only" "skipped"
      return
    fi
    log "SAFE_START: ${container} 屬監控棧,執行 docker start"
    if docker start "$container" > /dev/null 2>&1; then
      log "SUCCESS: docker start ${container}"
      report_to_awoooi "$container" "$detected_status" "started" "success"
    else
      log "FAILED: docker start ${container}"
      report_to_awoooi "$container" "$detected_status" "started" "failed"
    fi
    return
  fi

  # 3. Ordinary container: docker restart.
  log "AUTO_REPAIR: docker restart ${container}"
  if docker restart "$container" > /dev/null 2>&1; then
    log "SUCCESS: docker restart ${container}"
    report_to_awoooi "$container" "$detected_status" "restarted" "success"
  else
    log "FAILED: docker restart ${container}"
    report_to_awoooi "$container" "$detected_status" "restarted" "failed"
  fi
}
# ─── Main ────────────────────────────────────────────────────────────────────

#######################################
# Entry point for one monitoring pass: log a start banner that names this
# host, scan the containers once via check_containers, then log completion.
#######################################
main() {
  local host_label
  # `|| true`: a failing hostname call must not abort the run, matching the
  # inline command substitution this replaces.
  host_label="$(hostname)" || true
  log "=== docker-health-monitor 啟動 (自動修復模式) on ${host_label} ==="
  check_containers
  log "=== 掃描完成 ==="
}