From 1fb0c0ca906c0d42249c4adcf63382b4d930fd49 Mon Sep 17 00:00:00 2001
From: OG T
Date: Thu, 9 Apr 2026 14:11:50 +0800
Subject: [PATCH] =?UTF-8?q?fix(auto-repair):=20Bug=20#5+#6=20=E2=80=94=20S?=
 =?UTF-8?q?SH=20binary=20+=20affected=5Fservices=20=E5=8C=B9=E9=85=8D?=
 =?UTF-8?q?=E4=BF=AE=E6=AD=A3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bug #5 (webhooks.py): target_resource 現在優先用 component label
- SentryDown alert 有 labels.component="sentry"
- 舊邏輯: labels.instance="192.168.0.110:9000" → Playbook affected_services 不匹配
- 新邏輯: component → pod → instance → alertname

Bug #6 (Dockerfile): python:3.11-slim 無 openssh-client
- SSH_COMMAND Playbook 執行路徑調用 asyncio.create_subprocess_exec("ssh", ...)
- image 沒有 ssh binary → 所有 SSH 修復必然失敗
- 修正: 在 production stage 安裝 openssh-client

服務清單: 補 sentry 主服務到 service-registry.yaml (AUTO 級別)

Co-Authored-By: Claude Sonnet 4.6
---
 apps/api/Dockerfile              | 4 ++++
 apps/api/src/api/v1/webhooks.py  | 10 +++++++++-
 ops/config/service-registry.yaml | 7 +++++++
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/apps/api/Dockerfile b/apps/api/Dockerfile
index 420d1413..220b66a4 100644
--- a/apps/api/Dockerfile
+++ b/apps/api/Dockerfile
@@ -56,6 +56,10 @@ COPY apps/api/models.json ./models.json
 # 2026-04-09 ogt: 規則引擎配置 — alert_rule_engine.py 從此檔載入規則
 COPY apps/api/alert_rules.yaml ./alert_rules.yaml
 
+# Install openssh-client — SSH_COMMAND Playbook 執行路徑需要 ssh binary
+# (2026-04-09 Claude Sonnet 4.6 Asia/Taipei, Bug #6 修正 — python:3.11-slim 無 openssh-client)
+RUN apt-get update && apt-get install -y --no-install-recommends openssh-client && rm -rf /var/lib/apt/lists/*
+
 # Create non-root user
 RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app
 USER appuser
diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py
index 3ebda8e6..72ea1fe3 100644
--- a/apps/api/src/api/v1/webhooks.py
+++ b/apps/api/src/api/v1/webhooks.py
@@ -1197,7 +1197,15 @@ async def alertmanager_webhook(
             "warning"
         )
 
-        target_resource = alert.labels.get("pod") or alert.labels.get("instance") or alertname
+        # 優先用 component label(Docker 層告警用 component,如 SentryDown → "sentry")
+        # 次優 pod(K8s 告警),再次 instance(blackbox probe),最後 alertname
+        # (2026-04-09 Claude Sonnet 4.6 Asia/Taipei, Bug #5 修正 — affected_services 匹配 Playbook)
+        target_resource = (
+            alert.labels.get("component")
+            or alert.labels.get("pod")
+            or alert.labels.get("instance")
+            or alertname
+        )
         namespace = alert.labels.get("namespace", "default")
         message = alert.annotations.get("summary") or alert.annotations.get("description") or alertname
 
diff --git a/ops/config/service-registry.yaml b/ops/config/service-registry.yaml
index f68ed681..afb5c162 100644
--- a/ops/config/service-registry.yaml
+++ b/ops/config/service-registry.yaml
@@ -157,6 +157,13 @@ services:
     stateful_level: AUTO
     containers: ["blackbox-exporter"]
 
+  - name: sentry
+    display_name: "Sentry (錯誤追蹤)"
+    host: "192.168.0.110"
+    stateful_level: AUTO
+    reason: "Web server 無狀態,docker compose up -d 即可恢復"
+    containers: ["sentry-web", "sentry-worker", "sentry-cron"]
+
   - name: langfuse
     display_name: "Langfuse (LLMOps)"
     host: "192.168.0.110"