feat(ops): Sprint 5.2 docker-health-monitor 升級為自動修復模式
舊版: 純感知層 (L4-6),只送 Webhook,修復由 API 執行 新版: 感知 + 自動修復 + 回報 修復分級 (ADR-060): - 一般容器: docker restart - 監控棧 (prometheus/grafana/alertmanager): docker start (保護 WAL) - DB/Redis/ClickHouse: 僅告警,禁止重啟 已部署到: - 192.168.0.110 ~/awoooi-ops/docker-health-monitor.sh - 192.168.0.188 ~/awoooi-ops/docker-health-monitor.sh - 兩台 cron */5 * * * * 運行中 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -24,22 +24,32 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
|
||||
MONITOR_SCRIPT="${REPO_ROOT}/scripts/ops/docker-health-monitor.sh"
|
||||
TARGET="${1:-all}"
|
||||
|
||||
SSH_KEY="${HOME}/.ssh/deploy_key"
|
||||
JUMP_HOST="wooo@192.168.0.121"
|
||||
SSH_KEY="${HOME}/.ssh/id_rsa"
|
||||
|
||||
# Print the SSH login user for the host given as $1:
# "ollama" for 192.168.0.188, "wooo" for every other host.
|
||||
ssh_user() {
|
||||
local host="$1"
|
||||
case "$host" in
|
||||
192.168.0.188) echo "ollama" ;;
|
||||
*) echo "wooo" ;;
|
||||
esac
|
||||
}
|
||||
|
||||
# 透過 K3s master 跳板連到 110/188
|
||||
ssh_cmd() {
|
||||
local host="$1"
|
||||
shift
|
||||
ssh -i "$SSH_KEY" -J "$JUMP_HOST" -o StrictHostKeyChecking=no "wooo@${host}" "$@"
|
||||
local user
|
||||
user=$(ssh_user "$host")
|
||||
ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no "${user}@${host}" "$@"
|
||||
}
|
||||
|
||||
scp_cmd() {
|
||||
local src="$1"
|
||||
local host="$2"
|
||||
local dst="$3"
|
||||
# 先透過 scp 到跳板,再 ssh 到目標(act runner 限制)
|
||||
scp -i "$SSH_KEY" -o StrictHostKeyChecking=no -J "$JUMP_HOST" "$src" "wooo@${host}:${dst}"
|
||||
local user
|
||||
user=$(ssh_user "$host")
|
||||
scp -i "$SSH_KEY" -o StrictHostKeyChecking=no "$src" "${user}@${host}:${dst}"
|
||||
}
|
||||
|
||||
deploy_to_host() {
|
||||
|
||||
@@ -1,13 +1,17 @@
|
||||
#!/usr/bin/env bash
|
||||
# docker-health-monitor.sh
|
||||
# Sprint 5.1 L4-6: 純感知層(偵測→送 Webhook,禁止任何修復動作)
|
||||
# Sprint 5.2 Plan A: 感知 + 自動修復 + 回報
|
||||
#
|
||||
# 部署: cron */5 * * * * /opt/awoooi-ops/docker-health-monitor.sh >> /var/log/docker-health-monitor.log 2>&1
|
||||
# 設定: /etc/awoooi-ops/secrets.env
|
||||
# 撰寫: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
|
||||
# 架構裁示: Route B — 腳本只感知,所有修復決策由 AWOOOI API 執行(ADR-062)
|
||||
# 注意: 禁止在此腳本中執行 docker restart / docker start
|
||||
# 所有修復動作由 AWOOOI API Guardrail + Playbook + Approval 鏈路處理
|
||||
# 撰寫: Claude Sonnet 4.6 / 2026-04-09 Asia/Taipei
|
||||
#
|
||||
# 修復分級 (ADR-060 統帥裁示):
|
||||
# 自動 docker restart: 一般應用容器 (非 DB/Redis/監控棧)
|
||||
# 自動 docker start: prometheus / grafana / alertmanager (保護 WAL)
|
||||
# 僅告警 (禁止重啟): postgres / redis / clickhouse / DB 類
|
||||
#
|
||||
# 流程: Detect → AutoRepair → Report (Intent→Action→Result 三段)
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
@@ -22,22 +26,30 @@ fi
|
||||
: "${TELEGRAM_BOT_TOKEN:=}"
|
||||
: "${TELEGRAM_CHAT_ID:=}"
|
||||
: "${LOG_FILE:=/var/log/docker-health-monitor.log}"
|
||||
# 冷卻期:避免同一容器在短時間內重複發送 webhook(去重,非修復冷卻)
|
||||
: "${SEND_COOLDOWN_SECONDS:=300}"
|
||||
: "${COOLDOWN_DIR:=/tmp/docker-health-monitor-cooldown}"
|
||||
# 排除清單:逗號分隔,支援 glob(如 signoz-*)
|
||||
# 用途:init containers、永久停用服務、已知 exited 但不需告警的容器
|
||||
: "${EXCLUDE_CONTAINERS:=signoz-telemetrystore-migrator,signoz-clickhouse,signoz-init-clickhouse}"
|
||||
|
||||
mkdir -p "$COOLDOWN_DIR"
|
||||
|
||||
# ─── 禁止自動重啟的容器 (模式匹配) ─────────────────────────────────────────
|
||||
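# DB / cache / ClickHouse containers — alert only, never auto-restarted (the monitoring stack is handled separately via SAFE_START_PATTERNS)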
|
||||
READONLY_PATTERNS=(
|
||||
"postgres" "momo-db" "langfuse-db" "harbor-db" "sentry-postgres"
|
||||
"redis" "harbor-redis" "sentry-redis" "signoz-clickhouse"
|
||||
)
|
||||
|
||||
# 監控棧容器:用 docker start(非 restart),保護 WAL
|
||||
SAFE_START_PATTERNS=(
|
||||
"prometheus" "grafana" "alertmanager"
|
||||
)
|
||||
|
||||
# ─── 工具函數 ────────────────────────────────────────────────────────────────
|
||||
# Print all arguments to stdout on one line, prefixed with a local
# "[YYYY-MM-DD HH:MM:SS +zzzz]" timestamp.
log() {
|
||||
echo "[$(date '+%Y-%m-%d %H:%M:%S %z')] $*"
|
||||
}
|
||||
|
||||
# 發送冷卻期檢查(避免同一容器短時間重複送 webhook)
|
||||
is_in_send_cooldown() {
|
||||
is_in_cooldown() {
|
||||
local container="$1"
|
||||
local cooldown_file="${COOLDOWN_DIR}/${container}.cooldown"
|
||||
if [[ -f "$cooldown_file" ]]; then
|
||||
@@ -46,43 +58,56 @@ is_in_send_cooldown() {
|
||||
now=$(date +%s)
|
||||
elapsed=$(( now - last_sent ))
|
||||
if (( elapsed < SEND_COOLDOWN_SECONDS )); then
|
||||
log "COOLDOWN: ${container} 距上次通知 ${elapsed}s,跳過(冷卻期 ${SEND_COOLDOWN_SECONDS}s)"
|
||||
log "COOLDOWN: ${container} 距上次處理 ${elapsed}s,跳過(冷卻 ${SEND_COOLDOWN_SECONDS}s)"
|
||||
return 0
|
||||
fi
|
||||
fi
|
||||
return 1
|
||||
}
|
||||
|
||||
set_send_cooldown() {
|
||||
set_cooldown() {
|
||||
local container="$1"
|
||||
date +%s > "${COOLDOWN_DIR}/${container}.cooldown"
|
||||
}
|
||||
|
||||
# Fallback:AWOOOI API down 時直接呼叫 Telegram Bot API
|
||||
# Test whether a container name contains any of the given substrings.
#
# Arguments:
#   $1   - container name to test
#   $2.. - literal substrings (not globs); empty strings are ignored
# Returns:
#   0 if the name contains at least one non-empty substring, 1 otherwise
#   (including when no substrings are given).
matches_pattern() {
  local name="$1"
  shift

  # Iterate "$@" directly: expanding an empty copied array would abort under
  # `set -u` on bash < 4.4, and the loop variable must be local so it does
  # not clobber a caller's `pattern`.
  local pattern
  for pattern in "$@"; do
    # An empty substring would match every name; skip it.
    if [[ -z "$pattern" ]]; then
      continue
    fi
    # The quoted right-hand side is matched literally, not as a glob.
    if [[ "$name" == *"$pattern"* ]]; then
      return 0
    fi
  done
  return 1
}
|
||||
|
||||
# ─── Telegram 直發 Fallback ──────────────────────────────────────────────────
|
||||
send_telegram_direct() {
|
||||
local message="$1"
|
||||
if [[ -z "$TELEGRAM_BOT_TOKEN" || -z "$TELEGRAM_CHAT_ID" ]]; then
|
||||
log "WARN: Telegram 未設定,跳過 Fallback"
|
||||
return 0
|
||||
fi
|
||||
[[ -z "$TELEGRAM_BOT_TOKEN" || -z "$TELEGRAM_CHAT_ID" ]] && return 0
|
||||
curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{\"chat_id\":\"${TELEGRAM_CHAT_ID}\",\"text\":\"${message}\",\"parse_mode\":\"HTML\"}" \
|
||||
> /dev/null 2>&1 || true
|
||||
}
|
||||
|
||||
# 傳送 Alertmanager 格式 Webhook 到 AWOOOI API
|
||||
# 使用現有端點 /api/v1/webhooks/alertmanager(內網免 HMAC)
|
||||
send_to_awoooi() {
|
||||
# ─── 回報到 AWOOOI API ───────────────────────────────────────────────────────
|
||||
report_to_awoooi() {
|
||||
local container="$1"
|
||||
local status="$2" # unhealthy | exited | dead
|
||||
local detected_status="$2" # unhealthy | exited | dead
|
||||
local repair_action="$3" # restarted | started | alert_only | failed
|
||||
local repair_result="$4" # success | failed | skipped
|
||||
local hostname
|
||||
hostname=$(hostname)
|
||||
|
||||
local now_ts
|
||||
now_ts=$(date -u '+%Y-%m-%dT%H:%M:%SZ')
|
||||
|
||||
# 組裝 Alertmanager 格式 JSON(符合現有 AlertmanagerPayload schema)
|
||||
# auto_repair label 根據實際動作設定
|
||||
local auto_repair_label="false"
|
||||
[[ "$repair_result" == "success" ]] && auto_repair_label="true"
|
||||
|
||||
local payload
|
||||
payload=$(cat <<JSON
|
||||
{
|
||||
@@ -97,12 +122,14 @@ send_to_awoooi() {
|
||||
"host": "${hostname}",
|
||||
"layer": "docker",
|
||||
"severity": "warning",
|
||||
"auto_repair": "true",
|
||||
"auto_repair": "${auto_repair_label}",
|
||||
"repair_action": "${repair_action}",
|
||||
"repair_result": "${repair_result}",
|
||||
"source": "docker-health-monitor"
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "容器 ${container} 狀態異常: ${status}",
|
||||
"description": "主機 ${hostname} 容器 ${container} 偵測狀態=${status},由 docker-health-monitor 感知層回報"
|
||||
"summary": "容器 ${container} 狀態=${detected_status} → 修復=${repair_action}(${repair_result})",
|
||||
"description": "主機 ${hostname} | 偵測=${detected_status} | 動作=${repair_action} | 結果=${repair_result}"
|
||||
},
|
||||
"startsAt": "${now_ts}"
|
||||
}]
|
||||
@@ -116,15 +143,52 @@ JSON
|
||||
-H "Content-Type: application/json" \
|
||||
-d "$payload" \
|
||||
--connect-timeout 10 \
|
||||
--max-time 60 2>/dev/null) || http_code="0"
|
||||
--max-time 30 2>/dev/null) || http_code="0"
|
||||
|
||||
if [[ "$http_code" == "200" || "$http_code" == "202" ]]; then
|
||||
log "SENT: ${container} 狀態=${status} → AWOOOI API (${http_code})"
|
||||
set_send_cooldown "$container"
|
||||
log "REPORTED: ${container} repair=${repair_action}(${repair_result}) → API (${http_code})"
|
||||
else
|
||||
log "WARN: AWOOOI API 回應 ${http_code},Fallback 到 Telegram Bot API"
|
||||
send_telegram_direct "🚨 [docker-health-monitor Fallback] 主機: ${hostname} 容器: ${container} 狀態: ${status} (API 不可達,請人工處理)"
|
||||
set_send_cooldown "$container"
|
||||
log "WARN: API 回應 ${http_code},Fallback Telegram"
|
||||
local emoji="🔧"
|
||||
[[ "$repair_result" == "failed" ]] && emoji="❌"
|
||||
[[ "$repair_action" == "alert_only" ]] && emoji="⚠️"
|
||||
send_telegram_direct "${emoji} [docker-health-monitor] 主機: ${hostname} 容器: ${container} 狀態: ${detected_status} 修復: ${repair_action} → ${repair_result} (API 不可達)"
|
||||
fi
|
||||
}
|
||||
|
||||
# ─── 自動修復邏輯 ────────────────────────────────────────────────────────────
|
||||
# Apply the tiered repair policy to one container and report the outcome
# through report_to_awoooi.
#
# Arguments:
#   $1 - container name
#   $2 - detected status (unhealthy | exited | dead)
#
# Tiers, checked in order:
#   1. Name matches READONLY_PATTERNS   -> no action; report alert_only/skipped.
#   2. Name matches SAFE_START_PATTERNS -> `docker start` only (never restart).
#      `docker start` does nothing to a container that is already running and
#      still exits 0, so a running-but-unhealthy container is reported as
#      alert_only/skipped instead of a false started/success.
#   3. Anything else                    -> `docker restart`.
attempt_repair() {
  local container="$1"
  local detected_status="$2"

  # Tier 1: containers that must never be restarted automatically.
  if matches_pattern "$container" "${READONLY_PATTERNS[@]}"; then
    log "ALERT_ONLY: ${container} 在禁止重啟清單,僅告警"
    report_to_awoooi "$container" "$detected_status" "alert_only" "skipped"
    return
  fi

  # Tier 2: monitoring stack, start only.
  if matches_pattern "$container" "${SAFE_START_PATTERNS[@]}"; then
    # If inspect fails (e.g. the container vanished), fall through to
    # `docker start`, which then fails and is reported as such.
    local running
    running=$(docker inspect --format '{{.State.Running}}' "$container" 2>/dev/null) \
      || running="unknown"
    if [[ "$running" == "true" ]]; then
      log "ALERT_ONLY: ${container} 屬監控棧且仍在運行 (${detected_status}),docker start 無作用,僅告警"
      report_to_awoooi "$container" "$detected_status" "alert_only" "skipped"
      return
    fi

    log "SAFE_START: ${container} 屬監控棧,執行 docker start"
    if docker start "$container" > /dev/null 2>&1; then
      log "SUCCESS: docker start ${container}"
      report_to_awoooi "$container" "$detected_status" "started" "success"
    else
      log "FAILED: docker start ${container}"
      report_to_awoooi "$container" "$detected_status" "started" "failed"
    fi
    return
  fi

  # Tier 3: ordinary application containers.
  log "AUTO_REPAIR: docker restart ${container}"
  if docker restart "$container" > /dev/null 2>&1; then
    log "SUCCESS: docker restart ${container}"
    report_to_awoooi "$container" "$detected_status" "restarted" "success"
  else
    log "FAILED: docker restart ${container}"
    report_to_awoooi "$container" "$detected_status" "restarted" "failed"
  fi
}
|
||||
|
||||
@@ -133,51 +197,39 @@ check_containers() {
|
||||
local hostname
|
||||
hostname=$(hostname)
|
||||
|
||||
# 取得所有容器(含停止的)
|
||||
while IFS=$'\t' read -r container_id container_name state health; do
|
||||
# 跳過 header 或空行
|
||||
[[ -z "$container_name" ]] && continue
|
||||
|
||||
# 排除清單檢查(EXCLUDE_CONTAINERS 逗號分隔)
|
||||
# 排除清單
|
||||
local excluded=false
|
||||
IFS=',' read -ra EXCLUDES <<< "$EXCLUDE_CONTAINERS"
|
||||
for pattern in "${EXCLUDES[@]}"; do
|
||||
pattern="${pattern// /}" # trim spaces
|
||||
pattern="${pattern// /}"
|
||||
[[ -z "$pattern" ]] && continue
|
||||
# shellcheck disable=SC2254
|
||||
case "$container_name" in
|
||||
$pattern) excluded=true; break ;;
|
||||
esac
|
||||
done
|
||||
if $excluded; then
|
||||
continue
|
||||
fi
|
||||
$excluded && continue
|
||||
|
||||
local needs_alert=false
|
||||
local needs_action=false
|
||||
local detected_status=""
|
||||
|
||||
# 偵測 exited / dead
|
||||
if [[ "$state" == "exited" || "$state" == "dead" ]]; then
|
||||
needs_alert=true
|
||||
needs_action=true
|
||||
detected_status="$state"
|
||||
fi
|
||||
|
||||
# 偵測 unhealthy(health check 存在且失敗)
|
||||
if [[ "$health" == "unhealthy" ]]; then
|
||||
needs_alert=true
|
||||
needs_action=true
|
||||
detected_status="unhealthy"
|
||||
fi
|
||||
|
||||
if $needs_alert; then
|
||||
log "DETECTED: ${container_name} 狀態=${detected_status} on ${hostname}"
|
||||
|
||||
# 冷卻期去重
|
||||
if is_in_send_cooldown "$container_name"; then
|
||||
continue
|
||||
fi
|
||||
|
||||
# 送 Webhook — 只感知,不修復
|
||||
send_to_awoooi "$container_name" "$detected_status"
|
||||
if $needs_action; then
|
||||
log "DETECTED: ${container_name} state=${state} health=${health} on ${hostname}"
|
||||
is_in_cooldown "$container_name" && continue
|
||||
set_cooldown "$container_name"
|
||||
attempt_repair "$container_name" "$detected_status"
|
||||
fi
|
||||
done < <(docker ps -a --format '{{.ID}}\t{{.Names}}\t{{.State}}\t{{.Status}}' | \
|
||||
awk -F'\t' '{
|
||||
@@ -190,7 +242,7 @@ check_containers() {
|
||||
|
||||
# ─── Main ───────────────────────────────────────────────────────────────────
|
||||
main() {
|
||||
log "=== docker-health-monitor 感知層啟動 (純感知,禁止修復) ==="
|
||||
log "=== docker-health-monitor 啟動 (自動修復模式) on $(hostname) ==="
|
||||
check_containers
|
||||
log "=== 掃描完成 ==="
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user