feat(ops): Sprint 5.2 docker-health-monitor 升級為自動修復模式

舊版: 純感知層 (L4-6),只送 Webhook,修復由 API 執行
新版: 感知 + 自動修復 + 回報

修復分級 (ADR-060):
- 一般容器: docker restart
- 監控棧 (prometheus/grafana/alertmanager): docker start (保護 WAL)
- DB/Redis/ClickHouse: 僅告警,禁止重啟

已部署到:
- 192.168.0.110 ~/awoooi-ops/docker-health-monitor.sh
- 192.168.0.188 ~/awoooi-ops/docker-health-monitor.sh
- 兩台 cron */5 * * * * 運行中

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-09 11:59:11 +08:00
parent b43e1f1818
commit 8d0042ed29
2 changed files with 124 additions and 62 deletions

View File

@@ -24,22 +24,32 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
# Path of the health monitor script inside the repository checkout.
MONITOR_SCRIPT="${REPO_ROOT}/scripts/ops/docker-health-monitor.sh"
# Target selector taken from the first CLI argument; defaults to "all".
TARGET="${1:-all}"
# NOTE(review): SSH_KEY is assigned twice in this span, so the later value
# (id_rsa) is the one in effect. This looks like the old and new line of a
# change shown together — confirm only one assignment exists in the real file.
SSH_KEY="${HOME}/.ssh/deploy_key"
# Jump host handed to ssh/scp through the -J option.
JUMP_HOST="wooo@192.168.0.121"
SSH_KEY="${HOME}/.ssh/id_rsa"
# Print the SSH login name for a target host on stdout.
#   $1 - target host address
# 192.168.0.188 logs in as "ollama"; every other host (e.g. .110) as "wooo".
ssh_user() {
  local target="$1"
  if [[ "$target" == "192.168.0.188" ]]; then
    echo "ollama"
  else
    echo "wooo"
  fi
}
# Run a command on a target host over SSH.
#   $1   - target host address
#   $2.. - command and arguments, passed through to ssh unchanged
# Host key checking is disabled (StrictHostKeyChecking=no).
# Original note: connect to 110/188 through the K3s master jump host.
# NOTE(review): two ssh invocations are visible below — one through
# -J "$JUMP_HOST" as wooo@, one direct using the login from ssh_user. As
# written both would run. This looks like the old and new line of a change
# shown together; confirm only one remains and whether the jump-host note
# above still applies.
ssh_cmd() {
local host="$1"
shift
ssh -i "$SSH_KEY" -J "$JUMP_HOST" -o StrictHostKeyChecking=no "wooo@${host}" "$@"
local user
user=$(ssh_user "$host")
ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no "${user}@${host}" "$@"
}
# Copy a local file to a target host with scp.
#   $1 - local source path
#   $2 - target host address
#   $3 - destination path on the target host
# Host key checking is disabled (StrictHostKeyChecking=no).
# NOTE(review): two scp invocations are visible below — one through
# -J "$JUMP_HOST" as wooo@, one direct using the login from ssh_user. As
# written the file would be copied twice. This looks like the old and new
# line of a change shown together; confirm only one remains.
scp_cmd() {
local src="$1"
local host="$2"
local dst="$3"
# Original note: scp to the jump host first, then ssh to the target (act runner limitation).
scp -i "$SSH_KEY" -o StrictHostKeyChecking=no -J "$JUMP_HOST" "$src" "wooo@${host}:${dst}"
local user
user=$(ssh_user "$host")
scp -i "$SSH_KEY" -o StrictHostKeyChecking=no "$src" "${user}@${host}:${dst}"
}
deploy_to_host() {

View File

@@ -1,13 +1,17 @@
#!/usr/bin/env bash
# docker-health-monitor.sh
# Sprint 5.1 L4-6: 感知層(偵測→送 Webhook,禁止任何修復動作)
# Sprint 5.2 Plan A: 感知 + 自動修復 + 回報
#
# 部署: cron */5 * * * * /opt/awoooi-ops/docker-health-monitor.sh >> /var/log/docker-health-monitor.log 2>&1
# 設定: /etc/awoooi-ops/secrets.env
# 撰寫: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
# 架構裁示: Route B — 腳本只感知,所有修復決策由 AWOOOI API 執行(ADR-062)
# 注意: 禁止在此腳本中執行 docker restart / docker start
# 所有修復動作由 AWOOOI API Guardrail + Playbook + Approval 鏈路處理
# 撰寫: Claude Sonnet 4.6 / 2026-04-09 Asia/Taipei
#
# 修復分級 (ADR-060 統帥裁示):
# 自動 docker restart: 一般應用容器 (非 DB/Redis/監控棧)
# 自動 docker start: prometheus / grafana / alertmanager (保護 WAL)
# 僅告警 (禁止重啟): postgres / redis / clickhouse / DB 類
#
# 流程: Detect → AutoRepair → Report (Intent→Action→Result 三段)
set -euo pipefail
@@ -22,22 +26,30 @@ fi
# Telegram credentials; default to empty when not provided by the environment.
: "${TELEGRAM_BOT_TOKEN:=}"
: "${TELEGRAM_CHAT_ID:=}"
# Default log file path.
: "${LOG_FILE:=/var/log/docker-health-monitor.log}"
# Cool-down window in seconds. Original note: avoids sending the webhook for
# the same container repeatedly within a short time (de-duplication, not a
# repair cool-down).
: "${SEND_COOLDOWN_SECONDS:=300}"
# Directory holding one "<container>.cooldown" timestamp file per container.
# NOTE(review): this is a fixed, predictable path under /tmp; if the script
# runs as a privileged user, consider a directory that other users cannot
# pre-create (e.g. under /var/lib or /run) — confirm the deployment context.
: "${COOLDOWN_DIR:=/tmp/docker-health-monitor-cooldown}"
# Exclude list: comma separated, glob patterns supported (e.g. signoz-*).
# Intended for init containers, permanently disabled services, and containers
# known to be exited that need no alert.
: "${EXCLUDE_CONTAINERS:=signoz-telemetrystore-migrator,signoz-clickhouse,signoz-init-clickhouse}"
# Make sure the cool-down directory exists before any timestamp is written.
mkdir -p "$COOLDOWN_DIR"
# ─── Containers that must never be auto-restarted (pattern match) ───────────
# Databases / caches / ClickHouse — alert only, never restarted automatically.
# Entries are matched as plain substrings of the container name by
# matches_pattern, so "postgres" also covers e.g. sentry-postgres.
# "clickhouse" is listed generically because the repair policy makes every
# ClickHouse container alert-only, not just signoz-clickhouse.
READONLY_PATTERNS=(
  "postgres" "momo-db" "langfuse-db" "harbor-db" "sentry-postgres"
  "redis" "harbor-redis" "sentry-redis" "signoz-clickhouse" "clickhouse"
)
# Monitoring stack containers: repaired with docker start (never docker
# restart) to protect the WAL.
SAFE_START_PATTERNS=(
  "prometheus" "grafana" "alertmanager"
)
# ─── 工具函數 ────────────────────────────────────────────────────────────────
# Write one log line to stdout: "[<local timestamp with UTC offset>] <message>".
#   $@ - message words, joined into a single line
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S %z')" "$*"
}
# 發送冷卻期檢查(避免同一容器短時間重複送 webhook)
is_in_send_cooldown() {
is_in_cooldown() {
local container="$1"
local cooldown_file="${COOLDOWN_DIR}/${container}.cooldown"
if [[ -f "$cooldown_file" ]]; then
@@ -46,43 +58,56 @@ is_in_send_cooldown() {
now=$(date +%s)
elapsed=$(( now - last_sent ))
if (( elapsed < SEND_COOLDOWN_SECONDS )); then
log "COOLDOWN: ${container} 距上次通知 ${elapsed}s跳過冷卻 ${SEND_COOLDOWN_SECONDS}s"
log "COOLDOWN: ${container} 距上次處理 ${elapsed}s跳過冷卻 ${SEND_COOLDOWN_SECONDS}s"
return 0
fi
fi
return 1
}
# Record the current time (epoch seconds) for a container by writing it to
# "${COOLDOWN_DIR}/<container>.cooldown", overwriting any previous value.
#   $1 - container name
# NOTE(review): two function headers (set_send_cooldown, set_cooldown) appear
# back to back below, which leaves the outer one unclosed. This looks like a
# rename shown as old and new line together — confirm only one header exists
# in the real file.
set_send_cooldown() {
set_cooldown() {
local container="$1"
date +%s > "${COOLDOWN_DIR}/${container}.cooldown"
}
# Fallback:AWOOOI API down 時直接呼叫 Telegram Bot API
# Tell whether a container name matches any entry of a pattern list.
#   $1   - container name
#   $2.. - patterns, treated as literal substrings (not globs)
# Returns 0 when the name contains at least one pattern, 1 otherwise
# (including when no patterns are given).
matches_pattern() {
  local name="$1"
  shift
  # Keep the loop variable local so a caller's own "pattern" variable is not
  # overwritten; iterating "$@" directly also works with zero patterns under
  # `set -u` on every bash version.
  local pattern
  for pattern in "$@"; do
    # An empty pattern would match every name; ignore it.
    if [[ -z "$pattern" ]]; then
      continue
    fi
    if [[ "$name" == *"$pattern"* ]]; then
      return 0
    fi
  done
  return 1
}
# ─── Telegram direct-send fallback ───────────────────────────────────────────
# Post a message straight to the Telegram Bot API sendMessage endpoint.
#   $1 - message text (sent with parse_mode HTML)
# Returns 0 without sending when TELEGRAM_BOT_TOKEN or TELEGRAM_CHAT_ID is
# empty. curl output is discarded and curl failures are ignored (|| true), so
# delivery is best-effort.
# NOTE(review): the message is interpolated into the JSON body without
# escaping; a double quote or backslash in it would produce invalid JSON —
# confirm callers only pass safe text.
# NOTE(review): the empty-credentials check appears twice below (an if-block
# that logs a warning, then a one-line guard that is unreachable in practice).
# This looks like the old and new form of a change shown together — confirm
# which one remains.
send_telegram_direct() {
local message="$1"
if [[ -z "$TELEGRAM_BOT_TOKEN" || -z "$TELEGRAM_CHAT_ID" ]]; then
log "WARN: Telegram 未設定,跳過 Fallback"
return 0
fi
[[ -z "$TELEGRAM_BOT_TOKEN" || -z "$TELEGRAM_CHAT_ID" ]] && return 0
curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
-H "Content-Type: application/json" \
-d "{\"chat_id\":\"${TELEGRAM_CHAT_ID}\",\"text\":\"${message}\",\"parse_mode\":\"HTML\"}" \
> /dev/null 2>&1 || true
}
# 傳送 Alertmanager 格式 Webhook 到 AWOOOI API
# 使用現有端點 /api/v1/webhooks/alertmanager(內網免 HMAC)
send_to_awoooi() {
# ─── 回報到 AWOOOI API ───────────────────────────────────────────────────────
report_to_awoooi() {
local container="$1"
local status="$2" # unhealthy | exited | dead
local detected_status="$2" # unhealthy | exited | dead
local repair_action="$3" # restarted | started | alert_only | failed
local repair_result="$4" # success | failed | skipped
local hostname
hostname=$(hostname)
local now_ts
now_ts=$(date -u '+%Y-%m-%dT%H:%M:%SZ')
# 組裝 Alertmanager 格式 JSON符合現有 AlertmanagerPayload schema
# auto_repair label 根據實際動作設定
local auto_repair_label="false"
[[ "$repair_result" == "success" ]] && auto_repair_label="true"
local payload
payload=$(cat <<JSON
{
@@ -97,12 +122,14 @@ send_to_awoooi() {
"host": "${hostname}",
"layer": "docker",
"severity": "warning",
"auto_repair": "true",
"auto_repair": "${auto_repair_label}",
"repair_action": "${repair_action}",
"repair_result": "${repair_result}",
"source": "docker-health-monitor"
},
"annotations": {
"summary": "容器 ${container} 狀態異常: ${status}",
"description": "主機 ${hostname} 容器 ${container} 偵測狀態=${status},由 docker-health-monitor 感知層回報"
"summary": "容器 ${container} 狀態=${detected_status} → 修復=${repair_action}(${repair_result})",
"description": "主機 ${hostname} | 偵測=${detected_status} | 動作=${repair_action} | 結果=${repair_result}"
},
"startsAt": "${now_ts}"
}]
@@ -116,15 +143,52 @@ JSON
-H "Content-Type: application/json" \
-d "$payload" \
--connect-timeout 10 \
--max-time 60 2>/dev/null) || http_code="0"
--max-time 30 2>/dev/null) || http_code="0"
if [[ "$http_code" == "200" || "$http_code" == "202" ]]; then
log "SENT: ${container} 狀態=${status} AWOOOI API (${http_code})"
set_send_cooldown "$container"
log "REPORTED: ${container} repair=${repair_action}(${repair_result}) → API (${http_code})"
else
log "WARN: AWOOOI API 回應 ${http_code}Fallback Telegram Bot API"
send_telegram_direct "🚨 [docker-health-monitor Fallback]&#10;主機: ${hostname}&#10;容器: ${container}&#10;狀態: ${status}&#10;(API 不可達,請人工處理)"
set_send_cooldown "$container"
log "WARN: API 回應 ${http_code}Fallback Telegram"
local emoji="🔧"
[[ "$repair_result" == "failed" ]] && emoji="❌"
[[ "$repair_action" == "alert_only" ]] && emoji="⚠️"
send_telegram_direct "${emoji} [docker-health-monitor]&#10;主機: ${hostname}&#10;容器: ${container}&#10;狀態: ${detected_status}&#10;修復: ${repair_action}${repair_result}&#10;(API 不可達)"
fi
}
# ─── 自動修復邏輯 ────────────────────────────────────────────────────────────
# Decide and execute the repair for one abnormal container, then report the
# outcome through report_to_awoooi.
#   $1 - container name
#   $2 - detected status: unhealthy | exited | dead
# Repair tiers, checked in this order:
#   1. name matches READONLY_PATTERNS   -> alert only, nothing is executed
#   2. name matches SAFE_START_PATTERNS -> docker start (never docker restart)
#   3. anything else                    -> docker restart
attempt_repair() {
  local container="$1"
  local detected_status="$2"

  # 1. Never-restart list: report and stop.
  if matches_pattern "$container" "${READONLY_PATTERNS[@]}"; then
    log "ALERT_ONLY: ${container} 在禁止重啟清單,僅告警"
    report_to_awoooi "$container" "$detected_status" "alert_only" "skipped"
    return
  fi

  # 2. Monitoring stack: docker start only.
  if matches_pattern "$container" "${SAFE_START_PATTERNS[@]}"; then
    # docker start only acts on stopped containers. A container detected as
    # "unhealthy" is still running, so docker start would do nothing yet exit
    # 0, and the run would be reported as a successful repair. Restart is not
    # allowed for this tier, so report it as alert-only instead.
    if [[ "$detected_status" == "unhealthy" ]]; then
      log "ALERT_ONLY: ${container} is a running (unhealthy) monitoring container; docker start would be a no-op, alert only"
      report_to_awoooi "$container" "$detected_status" "alert_only" "skipped"
      return
    fi

    log "SAFE_START: ${container} 屬監控棧,執行 docker start"
    if docker start "$container" >> /dev/null 2>&1; then
      log "SUCCESS: docker start ${container}"
      report_to_awoooi "$container" "$detected_status" "started" "success"
    else
      log "FAILED: docker start ${container}"
      report_to_awoooi "$container" "$detected_status" "started" "failed"
    fi
    return
  fi

  # 3. Ordinary containers: docker restart.
  log "AUTO_REPAIR: docker restart ${container}"
  if docker restart "$container" >> /dev/null 2>&1; then
    log "SUCCESS: docker restart ${container}"
    report_to_awoooi "$container" "$detected_status" "restarted" "success"
  else
    log "FAILED: docker restart ${container}"
    report_to_awoooi "$container" "$detected_status" "restarted" "failed"
  fi
}
@@ -133,51 +197,39 @@ check_containers() {
local hostname
hostname=$(hostname)
# 取得所有容器(含停止的)
while IFS=$'\t' read -r container_id container_name state health; do
# 跳過 header 或空行
[[ -z "$container_name" ]] && continue
# 排除清單檢查EXCLUDE_CONTAINERS 逗號分隔)
# 排除清單
local excluded=false
IFS=',' read -ra EXCLUDES <<< "$EXCLUDE_CONTAINERS"
for pattern in "${EXCLUDES[@]}"; do
pattern="${pattern// /}" # trim spaces
pattern="${pattern// /}"
[[ -z "$pattern" ]] && continue
# shellcheck disable=SC2254
case "$container_name" in
$pattern) excluded=true; break ;;
esac
done
if $excluded; then
continue
fi
$excluded && continue
local needs_alert=false
local needs_action=false
local detected_status=""
# 偵測 exited / dead
if [[ "$state" == "exited" || "$state" == "dead" ]]; then
needs_alert=true
needs_action=true
detected_status="$state"
fi
# 偵測 unhealthyhealth check 存在且失敗)
if [[ "$health" == "unhealthy" ]]; then
needs_alert=true
needs_action=true
detected_status="unhealthy"
fi
if $needs_alert; then
log "DETECTED: ${container_name} 狀態=${detected_status} on ${hostname}"
# 冷卻期去重
if is_in_send_cooldown "$container_name"; then
continue
fi
# 送 Webhook — 只感知,不修復
send_to_awoooi "$container_name" "$detected_status"
if $needs_action; then
log "DETECTED: ${container_name} state=${state} health=${health} on ${hostname}"
is_in_cooldown "$container_name" && continue
set_cooldown "$container_name"
attempt_repair "$container_name" "$detected_status"
fi
done < <(docker ps -a --format '{{.ID}}\t{{.Names}}\t{{.State}}\t{{.Status}}' | \
awk -F'\t' '{
@@ -190,7 +242,7 @@ check_containers() {
# ─── Main ───────────────────────────────────────────────────────────────────
# Entry point: log a start banner, scan containers via check_containers, then
# log a completion banner.
# NOTE(review): two start banners are logged below with contradictory wording
# (pure sensing / repair forbidden vs. auto-repair mode). This looks like the
# old and new line of a change shown together — confirm only the auto-repair
# banner remains.
main() {
log "=== docker-health-monitor 感知層啟動 (純感知,禁止修復) ==="
log "=== docker-health-monitor 啟動 (自動修復模式) on $(hostname) ==="
check_containers
log "=== 掃描完成 ==="
}