diff --git a/apps/api/migrations/sprint51_alert_log_events.sql b/apps/api/migrations/sprint51_alert_log_events.sql
new file mode 100644
index 00000000..499831fc
--- /dev/null
+++ b/apps/api/migrations/sprint51_alert_log_events.sql
@@ -0,0 +1,18 @@
+-- apps/api/migrations/sprint51_alert_log_events.sql
+-- Sprint 5.1 M-003: alert_operation_log ENUM 擴充
+-- 執行者: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
+-- ⚠️ ENUM ADD VALUE 不可 rollback,執行前確認已備份
+-- 說明: 新增 8 個 event_type 支援 Guardrail / Pre-flight / MultiSig / 備份追蹤
+
+-- NOTE: deliberately NOT wrapped in BEGIN/COMMIT.
+-- PostgreSQL < 12 rejects ALTER TYPE ... ADD VALUE inside a transaction block, and on
+-- newer versions a value added inside a transaction cannot be used until that
+-- transaction has committed. Every statement below is idempotent (IF NOT EXISTS),
+-- so the script can simply be re-run after a partial failure.
+
+ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'GUARDRAIL_BLOCKED';
+ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'PRE_FLIGHT_PASSED';
+ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'PRE_FLIGHT_FAILED';
+ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'BACKUP_TRIGGERED';
+ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'BACKUP_COMPLETED';
+ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'BACKUP_FAILED';
+ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'APPROVAL_ESCALATED';
+ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'CHANGE_APPLIED';
diff --git a/apps/api/migrations/sprint51_approval_multisig.sql b/apps/api/migrations/sprint51_approval_multisig.sql
new file mode 100644
index 00000000..bb92f824
--- /dev/null
+++ b/apps/api/migrations/sprint51_approval_multisig.sql
@@ -0,0 +1,31 @@
+-- apps/api/migrations/sprint51_approval_multisig.sql
+-- Sprint 5.1 M-002: MultiSig 雙簽核支援
+-- 執行者: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
+-- 說明: approval_records 新增 approval_level / approval_votes / required_votes
+
+BEGIN;
+
+-- Add the MultiSig columns. IF NOT EXISTS keeps the script re-runnable; when a column
+-- is newly created its DEFAULT also populates the existing rows.
+ALTER TABLE approval_records
+    ADD COLUMN IF NOT EXISTS approval_level VARCHAR(20)
+        DEFAULT 'standard'
+        CHECK (approval_level IN ('standard', 'critical')),
+    ADD COLUMN IF NOT EXISTS approval_votes JSONB
+        DEFAULT '[]'::jsonb,
+    ADD COLUMN IF NOT EXISTS required_votes INTEGER
+        DEFAULT 1;
+
+COMMENT ON COLUMN approval_records.approval_level IS
+    'standard=1票審核, critical=2票MultiSig';
+COMMENT ON COLUMN approval_records.approval_votes IS
+    'JSON array: [{"user_id": "123", "voted_at": "2026-04-08T...", "action": "approve"}]';
+COMMENT ON COLUMN approval_records.required_votes IS
+    'standard=1, critical=2';
+
+-- Backfill existing rows (backward compatibility). Each column is filled
+-- independently so a row that is NULL in only one of them is repaired too,
+-- and values that are already present are left untouched.
+UPDATE approval_records
+SET approval_level = COALESCE(approval_level, 'standard'),
+    required_votes = COALESCE(required_votes, 1),
+    approval_votes = COALESCE(approval_votes, '[]'::jsonb)
+WHERE approval_level IS NULL
+   OR required_votes IS NULL
+   OR approval_votes IS NULL;
+
+-- The ORM model declares all three columns nullable=False; enforce the same
+-- invariant in the database so the schema and the model cannot drift apart.
+ALTER TABLE approval_records
+    ALTER COLUMN approval_level SET NOT NULL,
+    ALTER COLUMN approval_votes SET NOT NULL,
+    ALTER COLUMN required_votes SET NOT NULL;
+
+COMMIT;
diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py
index 9b45072b..0b87427b 100644
--- a/apps/api/src/api/v1/webhooks.py
+++ b/apps/api/src/api/v1/webhooks.py
@@ -218,7 +218,9 @@ async def _try_auto_repair_background(
)
return
- # 記錄自動修復觸發
+ # 記錄自動修復觸發 (Sprint 5.1 Q10: 加入 Langfuse trace_id 追蹤)
+ # (2026-04-08 Claude Sonnet 4.6 Asia/Taipei,ADR-062)
+ _langfuse_trace_id = getattr(incident, "langfuse_trace_id", None)
await op_log.append(
"AUTO_REPAIR_TRIGGERED",
incident_id=incident_id,
@@ -231,6 +233,11 @@ async def _try_auto_repair_background(
"playbook_name": decision.playbook.name,
"similarity_score": decision.similarity_score,
"risk_level": decision.risk_level.value if decision.risk_level else None,
+ "langfuse_trace_id": _langfuse_trace_id,
+ "langfuse_url": (
+ f"http://192.168.0.110:3100/trace/{_langfuse_trace_id}"
+ if _langfuse_trace_id else None
+ ),
},
)
@@ -1084,6 +1091,31 @@ async def alertmanager_webhook(
alert = firing_alerts[0]
alert_id = f"alert-{now_taipei().strftime('%Y%m%d%H%M%S')}"
+ # ==========================================================================
+ # Sprint 5.1 L4-2: ALERT_RECEIVED 溯源記錄 + auto_repair flag 讀取
+ # (2026-04-08 Claude Sonnet 4.6 Asia/Taipei,ADR-062 Q9)
+ # ==========================================================================
+ _alert_labels = alert.labels or {}
+ _alertname_for_log = _alert_labels.get("alertname", "UnknownAlert")
+ # Q9: auto_repair flag — Rule=false 強制 HITL(不觸發自動修復背景任務)
+ _can_auto_repair_by_rule = _alert_labels.get("auto_repair", "true").lower() == "true"
+ try:
+ _op_log = get_alert_operation_log_repository()
+ await _op_log.append(
+ "ALERT_RECEIVED",
+ actor="alertmanager",
+ action_detail=f"收到告警: {_alertname_for_log}",
+ context={
+ "source": "alertmanager",
+ "alert_id": alert_id,
+ "alertname": _alertname_for_log,
+ "labels": _alert_labels,
+ "auto_repair_flag": _can_auto_repair_by_rule,
+ },
+ )
+ except Exception as _log_err:
+ logger.warning("alert_received_log_failed", error=str(_log_err))
+
# ==========================================================================
# Alert Normalizer: 轉換 Alertmanager 格式 → AWOOOI AlertPayload
# ==========================================================================
@@ -1326,15 +1358,31 @@ async def alertmanager_webhook(
# 2026-04-05 ogt: 自動修復評估 (ADR-058 閉環)
# Incident 建立後立即評估是否可自動修復
# P2 以下 + 高品質 Playbook + 低風險 → 背景自動執行
+ # Sprint 5.1 Q9: auto_repair=false 旗標 → 強制 HITL,不觸發背景任務
+ # (2026-04-08 Claude Sonnet 4.6 Asia/Taipei,ADR-062)
# ================================================================
- background_tasks.add_task(
- _try_auto_repair_background,
- incident_id=incident_id,
- approval_id=str(approval.id),
- alert_type=alert_type,
- target_resource=target_resource,
- namespace=namespace,
- )
+ if _can_auto_repair_by_rule:
+ background_tasks.add_task(
+ _try_auto_repair_background,
+ incident_id=incident_id,
+ approval_id=str(approval.id),
+ alert_type=alert_type,
+ target_resource=target_resource,
+ namespace=namespace,
+ )
+ else:
+ # auto_repair=false → 記錄 GUARDRAIL_BLOCKED,不觸發自動修復
+ _op_log_rule = get_alert_operation_log_repository()
+ background_tasks.add_task(
+ _op_log_rule.append,
+ "GUARDRAIL_BLOCKED",
+ incident_id=incident_id,
+ approval_id=str(approval.id),
+ actor="prometheus-rule",
+ action_detail=f"Prometheus rule 設定 auto_repair=false,強制人工審核: {alertname}",
+ success=False,
+ context={"alertname": alertname, "auto_repair_flag": False},
+ )
# 推送 Telegram
background_tasks.add_task(
diff --git a/apps/api/src/db/models.py b/apps/api/src/db/models.py
index 2b891865..7dc9f918 100644
--- a/apps/api/src/db/models.py
+++ b/apps/api/src/db/models.py
@@ -124,6 +124,26 @@ class ApprovalRecord(Base):
comment="Last time this alert pattern was seen",
)
+ # Sprint 5.1 MultiSig 雙簽核支援 (2026-04-08 Claude Sonnet 4.6 Asia/Taipei,ADR-062 Q3)
+ approval_level: Mapped[str] = mapped_column(
+ String(20),
+ default="standard",
+ nullable=False,
+ comment="standard=1票審核, critical=2票MultiSig",
+ )
+ approval_votes: Mapped[list[dict[str, Any]]] = mapped_column(
+ JSON,
+ default=list,
+ nullable=False,
+ comment="[{user_id, voted_at, action}]",
+ )
+ required_votes: Mapped[int] = mapped_column(
+ Integer,
+ default=1,
+ nullable=False,
+ comment="standard=1, critical=2",
+ )
+
# 2026-04-06 ogt: Phase 26 — 關聯 Incident ID
# Playbook 萃取和 KM 寫入必須知道 incident_id,不能靠文字解析
incident_id: Mapped[str | None] = mapped_column(
diff --git a/apps/api/src/models/playbook.py b/apps/api/src/models/playbook.py
index b7f98508..3088510f 100644
--- a/apps/api/src/models/playbook.py
+++ b/apps/api/src/models/playbook.py
@@ -212,6 +212,20 @@ class Playbook(BaseModel):
tags: list[str] = Field(default_factory=list, description="標籤")
notes: str | None = Field(None, description="人工補充說明")
+ # === Sprint 5.1 資料安全護欄 (2026-04-08 Claude Sonnet 4.6 Asia/Taipei) ===
+ requires_approval_level: str = Field(
+ default="auto",
+ description="auto=直接執行, standard=1票, critical=2票MultiSig(由 Service Registry 決定)",
+ )
+ stateful_targets: list[str] = Field(
+ default_factory=list,
+ description="此 Playbook 操作的 Stateful 服務清單,對應 service-registry.yaml",
+ )
+ requires_pre_backup: bool = Field(
+ default=False,
+ description="執行前是否需要 Pre-flight 備份檢查",
+ )
+
# === 時間軸 ===
created_at: datetime = Field(default_factory=now_taipei)
updated_at: datetime = Field(default_factory=now_taipei)
diff --git a/apps/api/src/repositories/alert_operation_log_repository.py b/apps/api/src/repositories/alert_operation_log_repository.py
index c6a94bdf..b6f0aaec 100644
--- a/apps/api/src/repositories/alert_operation_log_repository.py
+++ b/apps/api/src/repositories/alert_operation_log_repository.py
@@ -22,7 +22,9 @@ from src.db.models import AlertOperationLog
logger = structlog.get_logger(__name__)
# 合法的 event_type 值 (對應 DB ENUM)
+# Sprint 5.1 新增 8 個 (2026-04-08 Claude Sonnet 4.6 Asia/Taipei)
ALERT_EVENT_TYPES = {
+ # 原有 10 個
"ALERT_RECEIVED",
"TELEGRAM_SENT",
"USER_ACTION",
@@ -33,6 +35,15 @@ ALERT_EVENT_TYPES = {
"RESOLVED",
"SILENCED",
"ESCALATED",
+ # Sprint 5.1 Guardrail / Pre-flight / MultiSig / 備份追蹤
+ "GUARDRAIL_BLOCKED",
+ "PRE_FLIGHT_PASSED",
+ "PRE_FLIGHT_FAILED",
+ "BACKUP_TRIGGERED",
+ "BACKUP_COMPLETED",
+ "BACKUP_FAILED",
+ "APPROVAL_ESCALATED",
+ "CHANGE_APPLIED",
}
diff --git a/apps/api/src/services/auto_repair_service.py b/apps/api/src/services/auto_repair_service.py
index d48bdc7e..266b2001 100644
--- a/apps/api/src/services/auto_repair_service.py
+++ b/apps/api/src/services/auto_repair_service.py
@@ -192,6 +192,32 @@ class AutoRepairService:
blocked_by="GLOBAL_GUARDRAIL",
)
+ # 0.5 Sprint 5.1 Guardrail: Service Registry 服務分級檢查
+ # (2026-04-08 Claude Sonnet 4.6 Asia/Taipei,ADR-062)
+ # 全域熔斷之後、嚴重度之前,BLOCK 等級直接拒絕
+ try:
+ from src.services.service_registry import StatefulLevel, get_service_registry
+ _registry = get_service_registry()
+ _service_name = (incident.target_resource or "") if hasattr(incident, "target_resource") else ""
+ if not _service_name and incident.affected_services:
+ _service_name = incident.affected_services[0]
+ _stateful_level = _registry.get_stateful_level(_service_name)
+ if _stateful_level == StatefulLevel.BLOCK:
+ logger.warning(
+ "auto_repair_blocked_guardrail",
+ incident_id=incident.incident_id,
+ service_name=_service_name,
+ stateful_level="BLOCK",
+ )
+ return AutoRepairDecision(
+ can_auto_repair=False,
+ reason=f"GUARDRAIL_BLOCK: 服務 '{_service_name}' 屬於禁止自動修復清單(資料安全,見 service-registry.yaml)",
+ blocked_by="SERVICE_REGISTRY_BLOCK",
+ )
+ except Exception as _guardrail_err:
+ logger.error("guardrail_check_failed", error=str(_guardrail_err))
+ # 保守原則:失敗時繼續(不阻擋,但記錄)
+
# 1. 檢查 Incident 嚴重度
if incident.severity and incident.severity.value in ["P0", "P1"]:
logger.info(
diff --git a/apps/api/src/services/preflight_service.py b/apps/api/src/services/preflight_service.py
new file mode 100644
index 00000000..837d9104
--- /dev/null
+++ b/apps/api/src/services/preflight_service.py
@@ -0,0 +1,116 @@
+# apps/api/src/services/preflight_service.py
+# Pre-flight 安全檢查服務 (Q2/Q4 決策)
+# 撰寫: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
+# 架構: leWOOOgo 積木化,依賴 ServiceRegistryClient + VeleroClient
+# 參考: ADR-062
+
+from __future__ import annotations
+
+import logging
+import time
+from dataclasses import dataclass
+from enum import Enum
+
+from .service_registry import ServiceRegistryClient, get_service_registry
+from .velero_client import VeleroClient, get_velero_client
+
+logger = logging.getLogger(__name__)
+
+
+class PreflightResult(str, Enum):
+    """Outcome codes produced by a pre-flight check (string-valued enum)."""
+
+    PASS = "PASS"  # latest backup is within the allowed age
+    ABORT_BACKUP_EXPIRED = "ABORT_BACKUP_EXPIRED"  # latest backup is older than the allowed age
+    ABORT_HIGH_IO = "ABORT_HIGH_IO"  # alert was classified as a CPU/IO high-load alert
+    SKIP = "SKIP"  # the service does not need a pre-flight check
+
+
+@dataclass
+class PreflightReport:
+    """Result of a single pre-flight check."""
+
+    # Outcome code of the check.
+    result: PreflightResult
+    # Age of the latest backup in hours; None when the backup age was not queried.
+    backup_age_hours: float | None = None
+    # Name of the emergency backup that was started; None when none was started.
+    backup_name_triggered: str | None = None
+    # Human-readable explanation of the result.
+    reason: str = ""
+
+
+class PreflightService:
+    """Pre-flight safety check executed before a repair touches a service.
+
+    - Only services whose registry entry has ``requires_pre_backup=True`` are checked;
+      every other service yields ``SKIP``.
+    - An expired backup aborts the repair and starts an emergency backup
+      (the backup itself is not awaited).
+    - Alerts classified as CPU/IO high-load abort without starting a backup (Q4),
+      unless the ``block_backup_on_high_io`` backup policy is set to false.
+    """
+
+    # Alert-name *tokens* (not substrings) that mark a CPU/IO high-load alert.
+    _HIGH_IO_KEYWORDS = frozenset(
+        {"cpu", "io", "iowait", "iops", "diskio", "disk", "load", "memory"}
+    )
+
+    def __init__(
+        self,
+        registry: ServiceRegistryClient | None = None,
+        velero: VeleroClient | None = None,
+    ) -> None:
+        """Use the given collaborators, falling back to the module-level singletons."""
+        self._registry = registry or get_service_registry()
+        self._velero = velero or get_velero_client()
+
+    async def check(
+        self,
+        service_name: str,
+        alert_labels: dict | None = None,
+    ) -> PreflightReport:
+        """Run the pre-flight check for ``service_name``.
+
+        Args:
+            service_name: Registry name (or container alias) of the target service.
+            alert_labels: Prometheus alert labels; ``alertname`` is used to detect
+                CPU/IO high-load alerts.
+
+        Returns:
+            A ``PreflightReport`` whose ``result`` is SKIP, ABORT_HIGH_IO, PASS or
+            ABORT_BACKUP_EXPIRED.
+        """
+        info = self._registry.get_service(service_name)
+        if info is None or not info.requires_pre_backup:
+            return PreflightReport(result=PreflightResult.SKIP, reason="服務不需要 Pre-flight")
+
+        policies = self._registry.get_backup_policies()
+
+        # Q4: never start a backup while the alert itself reports CPU/IO pressure.
+        # The policy flag defaults to True, so a registry without the key behaves
+        # exactly as before.
+        if policies.get("block_backup_on_high_io", True) and self._is_high_io_alert(alert_labels):
+            logger.warning(f"Pre-flight: {service_name} 屬於 CPU/IO 高負載告警,跳過備份觸發")
+            return PreflightReport(
+                result=PreflightResult.ABORT_HIGH_IO,
+                reason="告警類型為 CPU/IO 高負載,禁止觸發備份(Q4 決策)",
+            )
+
+        max_age = policies.get("velero_max_age_hours", 4)
+
+        age = await self._velero.get_latest_backup_age_hours()
+        if age <= max_age:
+            return PreflightReport(
+                result=PreflightResult.PASS,
+                backup_age_hours=age,
+                reason=f"備份時間正常 ({age:.1f}h < {max_age}h)",
+            )
+
+        # Backup expired -> start an emergency backup and abort the repair.
+        backup_name = f"emergency-preflight-{int(time.time())}"
+        triggered = await self._velero.trigger_emergency_backup(backup_name)
+        return PreflightReport(
+            result=PreflightResult.ABORT_BACKUP_EXPIRED,
+            backup_age_hours=age,
+            backup_name_triggered=backup_name if triggered else None,
+            reason=(
+                f"備份過期 ({age:.1f}h > {max_age}h)。"
+                f"{'緊急備份已啟動: ' + backup_name if triggered else '緊急備份啟動失敗,請人工處理'}"
+            ),
+        )
+
+    def _is_high_io_alert(self, labels: dict | None) -> bool:
+        """Return True when the ``alertname`` label names a CPU/IO high-load alert.
+
+        The alert name is split into words (CamelCase, snake_case, kebab-case, digits)
+        and whole words are compared against ``_HIGH_IO_KEYWORDS``. Plain substring
+        matching would misclassify names such as ``HighConnectionCount`` ("io") or
+        ``PayloadTooLarge`` ("load"). A missing or non-string ``alertname`` counts
+        as not high-load.
+        """
+        if not labels:
+            return False
+        import re  # local import: this module does not import ``re`` at file level
+
+        alert_name = str(labels.get("alertname") or "")
+        # Acronym runs ("CPU", "IO"), capitalised/lower-case words, and digit runs.
+        words = re.findall(r"[A-Z]+(?![a-z])|[A-Z]?[a-z]+|\d+", alert_name)
+        return not self._HIGH_IO_KEYWORDS.isdisjoint(word.lower() for word in words)
+
+
+_preflight_service: PreflightService | None = None
+
+
+def get_preflight_service() -> PreflightService:
+    """Return the shared PreflightService, building it on the first call."""
+    global _preflight_service
+    if _preflight_service is not None:
+        return _preflight_service
+    _preflight_service = PreflightService()
+    return _preflight_service
+
+
+def set_preflight_service(service: PreflightService) -> None:
+    """Replace the module-level PreflightService singleton (test injection hook, P4 convention)."""
+    global _preflight_service
+    _preflight_service = service
diff --git a/apps/api/src/services/service_registry.py b/apps/api/src/services/service_registry.py
new file mode 100644
index 00000000..c15472b1
--- /dev/null
+++ b/apps/api/src/services/service_registry.py
@@ -0,0 +1,124 @@
+# apps/api/src/services/service_registry.py
+# Service Registry Client — 讀取 ops/config/service-registry.yaml
+# 撰寫: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
+# 架構: leWOOOgo 積木化,純 Service 層,無 Router/DB 依賴
+# 參考: ADR-062, ADR-063
+
+from __future__ import annotations
+
+import logging
+from enum import Enum
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+logger = logging.getLogger(__name__)
+
+# Location of the registry YAML, relative to the repository root.
+_REGISTRY_RELATIVE_PATH = Path("ops") / "config" / "service-registry.yaml"
+
+
+def _find_default_registry_path() -> Path:
+    """Locate ``ops/config/service-registry.yaml`` by walking up from this file.
+
+    A fixed ``parents[N]`` index is fragile: for ``apps/api/src/services/<file>`` the
+    repository root is ``parents[4]`` (``parents[5]`` is one level above it), and in a
+    shallow deployment layout indexing past the filesystem root raises ``IndexError``
+    at import time. Searching every ancestor works for both layouts.
+
+    Returns:
+        The first existing candidate; otherwise the expected repo-root location, so
+        the "file not found" error logged at load time shows a meaningful path.
+    """
+    ancestors = Path(__file__).resolve().parents
+    for ancestor in ancestors:
+        candidate = ancestor / _REGISTRY_RELATIVE_PATH
+        if candidate.is_file():
+            return candidate
+    # Not found anywhere: fall back to the repo root (parents[4]) when the path is
+    # deep enough, else to the top-most ancestor. Never raises.
+    fallback_root = ancestors[4] if len(ancestors) > 4 else ancestors[len(ancestors) - 1]
+    return fallback_root / _REGISTRY_RELATIVE_PATH
+
+
+_DEFAULT_REGISTRY_PATH = _find_default_registry_path()
+
+
+class StatefulLevel(str, Enum):
+    """Protection tier of a service (string-valued enum)."""
+
+    BLOCK = "BLOCK"  # forbidden; alert only
+    CRITICAL_HITL = "CRITICAL_HITL"  # 2-vote MultiSig
+    STANDARD_HITL = "STANDARD_HITL"  # 1 vote
+    AUTO = "AUTO"  # executed automatically
+
+
+class ServiceInfo:
+    """One entry of the registry's ``services`` list, exposed as attributes.
+
+    ``name`` is mandatory (a missing key raises ``KeyError``); every other field
+    falls back to a default. An unrecognised ``stateful_level`` value raises
+    ``ValueError`` from the ``StatefulLevel`` constructor.
+    """
+
+    def __init__(self, data: dict[str, Any]) -> None:
+        lookup = data.get
+        service_name: str = data["name"]
+
+        self.name: str = service_name
+        self.display_name: str = lookup("display_name", service_name)
+        self.host: str = lookup("host", "unknown")
+        raw_level = lookup("stateful_level", "AUTO")
+        self.stateful_level: StatefulLevel = StatefulLevel(raw_level)
+        self.reason: str = lookup("reason", "")
+        self.alert_only: bool = lookup("alert_only", False)
+        self.requires_pre_backup: bool = lookup("requires_pre_backup", False)
+        self.restart_command: str = lookup("restart_command", "docker restart")
+        self.containers: list[str] = lookup("containers", [])
+
+
+class ServiceRegistryClient:
+    """Read-only client for ``ops/config/service-registry.yaml``.
+
+    Answers "how protected is this service?" queries. The file is parsed lazily on
+    the first lookup and exactly once per instance (a failed load is not retried).
+    Design rule: read only, never write; on failure, and for unknown services, the
+    level falls back to AUTO so the guardrail never blocks the alert pipeline.
+    """
+
+    def __init__(self, registry_path: Path | None = None) -> None:
+        """Remember the YAML path (default: the repository registry); nothing is read yet."""
+        self._path = registry_path or _DEFAULT_REGISTRY_PATH
+        self._services: dict[str, ServiceInfo] = {}
+        self._backup_policies: dict[str, Any] = {}
+        self._multisig_config: dict[str, Any] = {}
+        self._loaded = False
+
+    def _load(self) -> None:
+        """Parse the registry file once and build the name/container index."""
+        if self._loaded:
+            return
+        # Set the flag up front so that a failed load is not retried on every lookup.
+        self._loaded = True
+        try:
+            # The registry contains non-ASCII text: never rely on the locale encoding.
+            with open(self._path, encoding="utf-8") as f:
+                data = yaml.safe_load(f)
+        except Exception as e:
+            logger.error(f"Service Registry 載入失敗: {e},所有服務 fallback AUTO")
+            return
+        if not isinstance(data, dict):
+            # An empty file parses to None; a top-level list/scalar is not a registry.
+            logger.error(f"Service Registry 載入失敗: {self._path} 不是 mapping,所有服務 fallback AUTO")
+            return
+
+        aliases: dict[str, ServiceInfo] = {}
+        for svc in data.get("services") or []:
+            try:
+                info = ServiceInfo(svc)
+            except Exception as e:
+                # One malformed entry must not strip protection from every entry after it.
+                logger.error(f"Service Registry: skipping invalid service entry {svc!r}: {e}")
+                continue
+            self._services[info.name] = info
+            for container in info.containers:
+                aliases.setdefault(container, info)
+        # Container names are a secondary index: an alias never shadows a service's own name.
+        for container, info in aliases.items():
+            self._services.setdefault(container, info)
+
+        self._backup_policies = data.get("backup_policies") or {}
+        self._multisig_config = data.get("multisig") or {}
+        logger.info(f"Service Registry 載入完成: {len(self._services)} 個服務")
+
+    def get_service(self, name: str) -> ServiceInfo | None:
+        """Return the entry for a service name or container name, or None when unknown."""
+        self._load()
+        return self._services.get(name)
+
+    def get_stateful_level(self, service_name: str) -> StatefulLevel:
+        """Return the protection tier of ``service_name``; unknown services are AUTO."""
+        info = self.get_service(service_name)
+        if info is None:
+            logger.warning(f"未知服務 '{service_name}',fallback AUTO")
+            return StatefulLevel.AUTO
+        return info.stateful_level
+
+    def is_blocked(self, service_name: str) -> bool:
+        """Return True when the service is in the BLOCK tier."""
+        return self.get_stateful_level(service_name) == StatefulLevel.BLOCK
+
+    def requires_multisig(self, service_name: str) -> bool:
+        """Return True when the service is in the CRITICAL_HITL tier."""
+        return self.get_stateful_level(service_name) == StatefulLevel.CRITICAL_HITL
+
+    def get_required_votes(self, service_name: str) -> int:
+        """Return the number of approval votes needed for ``service_name``.
+
+        CRITICAL_HITL uses ``multisig.critical_required_votes`` (default 2); every
+        other tier uses ``multisig.standard_required_votes`` (default 1).
+        """
+        level = self.get_stateful_level(service_name)  # triggers the lazy load
+        if level == StatefulLevel.CRITICAL_HITL:
+            return self._multisig_config.get("critical_required_votes", 2)
+        return self._multisig_config.get("standard_required_votes", 1)
+
+    def get_backup_policies(self) -> dict[str, Any]:
+        """Return the ``backup_policies`` mapping (empty when absent or not loaded)."""
+        self._load()
+        return self._backup_policies
+
+    def get_restart_command(self, service_name: str) -> str:
+        """Return the restart command of the service; ``docker restart`` when unknown."""
+        info = self.get_service(service_name)
+        return info.restart_command if info else "docker restart"
+
+
+# Process-wide singleton instance
+_registry_client: ServiceRegistryClient | None = None
+
+
+def get_service_registry() -> ServiceRegistryClient:
+    """Return the shared ServiceRegistryClient, building it on the first call."""
+    global _registry_client
+    if _registry_client is not None:
+        return _registry_client
+    _registry_client = ServiceRegistryClient()
+    return _registry_client
+
+
+def set_service_registry(client: ServiceRegistryClient) -> None:
+    """Replace the module-level ServiceRegistryClient singleton (test injection hook, P4 convention)."""
+    global _registry_client
+    _registry_client = client
diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py
index 95c5cfeb..4d267ee2 100644
--- a/apps/api/src/services/telegram_gateway.py
+++ b/apps/api/src/services/telegram_gateway.py
@@ -2654,6 +2654,128 @@ class TelegramGateway:
f"⚠️ 重診觸發失敗: {html.escape(str(e)[:100])}"
)
+ # =========================================================================
+ # Sprint 5.1 T1-T6: Data Safety Guardrail 通知場景
+ # (2026-04-08 Claude Sonnet 4.6 Asia/Taipei,ADR-062)
+ # =========================================================================
+
+    async def send_guardrail_blocked(
+        self,
+        service_name: str,
+        alertname: str,
+        reason: str,
+    ) -> None:
+        """T1 (GUARDRAIL_BLOCKED): announce that auto-repair was refused for a BLOCK-tier service.
+
+        All three arguments are HTML-escaped before being placed in the message.
+        """
+        divider = "━━━━━━━━━━━━━━━━━"
+        lines = [
+            "🚫 [服務保護] 自動修復已阻擋",
+            divider,
+            f"服務: {html.escape(service_name)}",
+            f"告警: {html.escape(alertname)}",
+            f"原因: {html.escape(reason)}",
+            divider,
+            "⚠️ 請人工評估並手動處理",
+        ]
+        await self.send_notification("\n".join(lines))
+
+    async def send_preflight_failed(
+        self,
+        service_name: str,
+        backup_age_hours: float,
+        max_age_hours: float,
+        backup_name: str | None,
+    ) -> None:
+        """T2 (PRE_FLIGHT_FAILED + BACKUP_TRIGGERED): backup is too old, repair is paused.
+
+        ``backup_name`` is the emergency backup that was started; a falsy value
+        means the emergency backup could not be started.
+        """
+        if backup_name:
+            backup_status = f"緊急備份: 已啟動 {html.escape(backup_name)}"
+        else:
+            backup_status = "緊急備份: 啟動失敗,請人工處理"
+
+        divider = "━━━━━━━━━━━━━━━━━"
+        lines = [
+            "⏸ [Pre-flight 阻擋] 備份已過期,修復暫停",
+            divider,
+            f"服務: {html.escape(service_name)}",
+            f"備份距今: {backup_age_hours:.1f} 小時(上限 {max_age_hours:.0f} 小時)",
+            f"{backup_status}",
+            divider,
+            "請等待備份完成後,人工重新評估修復方案",
+        ]
+        await self.send_notification("\n".join(lines))
+
+    async def send_backup_result(
+        self,
+        backup_name: str,
+        success: bool,
+        error_msg: str | None = None,
+    ) -> None:
+        """T3 (BACKUP_COMPLETED / BACKUP_FAILED): report how the emergency backup ended.
+
+        ``error_msg`` is only used on failure; a missing or empty message is shown
+        as a generic "unknown error" text.
+        """
+        if not success:
+            err = html.escape(error_msg or "未知錯誤")
+            lines = [
+                "❌ 緊急備份失敗",
+                f"備份: {html.escape(backup_name)}",
+                f"錯誤: {err}",
+                "請人工介入,備份異常",
+            ]
+        else:
+            lines = [
+                "✅ 緊急備份完成",
+                f"備份: {html.escape(backup_name)}",
+                "可繼續手動執行修復",
+            ]
+        await self.send_notification("\n".join(lines))
+
+    async def send_multisig_waiting(
+        self,
+        action: str,
+        service_name: str,
+        votes_received: int,
+        votes_required: int,
+        approval_id: str,
+    ) -> None:
+        """T4 (APPROVAL_ESCALATED): the first vote is in, a second approver is still needed."""
+        divider = "━━━━━━━━━━━━━━━━━"
+        lines = [
+            "🔐 [MultiSig] 等待第 2 票授權",
+            divider,
+            f"操作: {html.escape(action)}",
+            f"服務: {html.escape(service_name)}",
+            "風險: CRITICAL(HITL 雙簽)",
+            f"已獲授權: {votes_received}/{votes_required} 票",
+            f"審核 ID: {html.escape(approval_id)}",
+            divider,
+            "請第二位審核者登入確認",
+        ]
+        await self.send_notification("\n".join(lines))
+
+    async def send_multisig_approved(
+        self,
+        action: str,
+        service_name: str,
+    ) -> None:
+        """T5: announce that the MultiSig approval is complete (2/2) and execution starts."""
+        lines = [
+            "✅ [MultiSig 完成] 雙簽授權通過",
+            f"操作: {html.escape(action)}",
+            f"服務: {html.escape(service_name)}",
+            "授權: 2/2 票 開始執行...",
+        ]
+        await self.send_notification("\n".join(lines))
+
+    async def send_change_applied(
+        self,
+        operator: str,
+        action_description: str,
+        timestamp: str,
+    ) -> None:
+        """T6 (CHANGE_APPLIED): record a manual change in the notification channel."""
+        lines = [
+            "📝 [變更記錄] 手動操作已記錄",
+            "━━━━━━━━━━━━━━━━━",
+            f"操作者: {html.escape(operator)}",
+            f"動作: {html.escape(action_description)}",
+            f"時間: {html.escape(timestamp)}",
+        ]
+        await self.send_notification("\n".join(lines))
+
async def send_notification(
self,
text: str,
diff --git a/apps/api/src/services/velero_client.py b/apps/api/src/services/velero_client.py
new file mode 100644
index 00000000..7ad411d2
--- /dev/null
+++ b/apps/api/src/services/velero_client.py
@@ -0,0 +1,113 @@
+# apps/api/src/services/velero_client.py
+# Velero Backup 查詢客戶端 (kubectl 方式,Q7 決策)
+# 撰寫: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
+# 架構: leWOOOgo 積木化,純 Service 層
+# 參考: ADR-062
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import time
+from datetime import UTC, datetime
+
+logger = logging.getLogger(__name__)
+
+# Namespace passed to kubectl with -n for every Velero call.
+_VELERO_NAMESPACE = "velero"
+# Upper bound for a single kubectl invocation, in seconds.
+_KUBECTL_TIMEOUT = 30  # seconds
+
+
+class VeleroClient:
+    """Query and trigger Velero backups by shelling out to ``kubectl``.
+
+    Design rule: every failure is reported as "backup is expired" (conservative).
+    """
+
+    # Age reported when the real age cannot be determined (treated as badly expired).
+    _UNKNOWN_AGE_HOURS = 999.0
+    # Namespaces included in an emergency backup.
+    _EMERGENCY_BACKUP_NAMESPACES = ("awoooi-prod",)
+
+    async def get_latest_backup_age_hours(self) -> float:
+        """Return the age in hours of the most recent ``Completed`` backup.
+
+        Returns ``999.0`` (badly expired, forces an abort) when no completed backup
+        exists or when the query fails or times out.
+        """
+        try:
+            # Custom resources cannot be filtered with --field-selector on arbitrary
+            # fields such as status.phase (the API server rejects the request), so
+            # fetch all backups and filter on the phase client-side.
+            result = await asyncio.wait_for(
+                self._run_kubectl(
+                    ["get", "backups.velero.io", "-n", _VELERO_NAMESPACE, "-o", "json"]
+                ),
+                timeout=_KUBECTL_TIMEOUT,
+            )
+            data = json.loads(result)
+
+            completions: list[tuple[datetime, str]] = []
+            for item in data.get("items") or []:
+                status = item.get("status") or {}
+                completion_ts = status.get("completionTimestamp")
+                if status.get("phase") != "Completed" or not completion_ts:
+                    continue
+                completed_at = datetime.fromisoformat(completion_ts.replace("Z", "+00:00"))
+                completions.append((completed_at, completion_ts))
+
+            if not completions:
+                logger.warning("Velero: 找不到任何 Completed 備份")
+                return self._UNKNOWN_AGE_HOURS
+
+            # Compare parsed datetimes, not raw strings, to pick the newest backup.
+            completed_at, completion_ts = max(completions)
+            age = (datetime.now(UTC) - completed_at).total_seconds() / 3600
+            logger.info(f"Velero 最近備份: {completion_ts},距今 {age:.1f} 小時")
+            return age
+
+        except asyncio.TimeoutError:
+            logger.error("Velero kubectl 查詢超時")
+            return self._UNKNOWN_AGE_HOURS
+        except Exception as e:
+            logger.error(f"Velero 查詢失敗: {e}")
+            return self._UNKNOWN_AGE_HOURS
+
+    async def trigger_emergency_backup(self, backup_name: str | None = None) -> bool:
+        """Start an emergency backup without waiting for it to finish.
+
+        ``kubectl`` has no ``create backup`` sub-command (that syntax belongs to the
+        ``velero`` CLI), so a ``Backup`` custom resource is created from a manifest
+        piped to ``kubectl create -f -``.
+
+        Args:
+            backup_name: Name of the Backup resource; defaults to ``emergency-<epoch>``.
+
+        Returns:
+            True when the Backup resource was created, False on any failure.
+        """
+        name = backup_name or f"emergency-{int(time.time())}"
+        manifest = json.dumps(
+            {
+                "apiVersion": "velero.io/v1",
+                "kind": "Backup",
+                "metadata": {"name": name, "namespace": _VELERO_NAMESPACE},
+                "spec": {"includedNamespaces": list(self._EMERGENCY_BACKUP_NAMESPACES)},
+            }
+        )
+        try:
+            await asyncio.wait_for(
+                self._run_kubectl(["create", "-f", "-"], input_data=manifest),
+                timeout=_KUBECTL_TIMEOUT,
+            )
+            logger.info(f"Velero 緊急備份已啟動: {name}")
+            return True
+        except Exception as e:
+            logger.error(f"Velero 緊急備份失敗: {e}")
+            return False
+
+    async def _run_kubectl(self, args: list[str], input_data: str | None = None) -> str:
+        """Run ``kubectl <args>`` and return its stdout.
+
+        Args:
+            args: Arguments passed to kubectl (no shell is involved).
+            input_data: Optional text written to kubectl's stdin.
+
+        Raises:
+            RuntimeError: kubectl exited with a non-zero status.
+        """
+        proc = await asyncio.create_subprocess_exec(
+            "kubectl", *args,
+            stdin=asyncio.subprocess.PIPE if input_data is not None else None,
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+        )
+        try:
+            stdout, stderr = await proc.communicate(
+                input_data.encode() if input_data is not None else None
+            )
+        except asyncio.CancelledError:
+            # asyncio.wait_for() cancels this coroutine on timeout; without an explicit
+            # kill the kubectl child process would keep running in the background.
+            if proc.returncode is None:
+                try:
+                    proc.kill()
+                except ProcessLookupError:
+                    pass  # the process exited in the meantime
+                await proc.wait()
+            raise
+        if proc.returncode != 0:
+            raise RuntimeError(f"kubectl 失敗: {stderr.decode(errors='replace')}")
+        return stdout.decode()
+
+
+_velero_client: VeleroClient | None = None
+
+
+def get_velero_client() -> VeleroClient:
+    """Return the shared VeleroClient, building it on the first call."""
+    global _velero_client
+    if _velero_client is not None:
+        return _velero_client
+    _velero_client = VeleroClient()
+    return _velero_client
+
+
+def set_velero_client(client: VeleroClient) -> None:
+    """Replace the module-level VeleroClient singleton (test injection hook, P4 convention)."""
+    global _velero_client
+    _velero_client = client
diff --git a/k8s/rbac/api-velero-reader.yaml b/k8s/rbac/api-velero-reader.yaml
new file mode 100644
index 00000000..47490b88
--- /dev/null
+++ b/k8s/rbac/api-velero-reader.yaml
@@ -0,0 +1,36 @@
+# k8s/rbac/api-velero-reader.yaml
+# API Pod 讀取 Velero backup 資源的 RBAC
+# Sprint 5.1 K-001 / 2026-04-08 Asia/Taipei
+# 說明: awoooi-executor ServiceAccount 需要讀取 velero namespace 的 backup 資源
+# 用於 Pre-flight Check 查詢最近備份時間(Q7 決策:kubectl 方式)
+# 注意: ServiceAccount 名稱為 awoooi-executor(非 awoooi-api,經 L0 確認)
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: awoooi-velero-backup-reader
+  labels:
+    app: awoooi
+    component: api
+    sprint: "5.1"
+rules:
+  # get/list: the Pre-flight check reads the latest backup's completion time.
+  # create:   the Pre-flight check starts an emergency backup when the latest one
+  #           has expired; without this verb that request is always rejected.
+  - apiGroups: ["velero.io"]
+    resources: ["backups"]
+    verbs: ["get", "list", "create"]
+---
+# Binds the awoooi-velero-backup-reader ClusterRole to the awoooi-executor
+# ServiceAccount in the awoooi-prod namespace.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: awoooi-velero-backup-reader
+  labels:
+    app: awoooi
+    component: api
+    sprint: "5.1"
+subjects:
+  - kind: ServiceAccount
+    name: awoooi-executor
+    namespace: awoooi-prod
+roleRef:
+  kind: ClusterRole
+  name: awoooi-velero-backup-reader
+  apiGroup: rbac.authorization.k8s.io
diff --git a/ops/config/service-registry.yaml b/ops/config/service-registry.yaml
new file mode 100644
index 00000000..f68ed681
--- /dev/null
+++ b/ops/config/service-registry.yaml
@@ -0,0 +1,201 @@
+# ops/config/service-registry.yaml
+# Service Registry — 服務 Stateful 分級清單
+# 版本: 1.0.0
+# 建立: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
+# 維護: 修改需 PR + 統帥審核,禁止直接 push
+# 說明:
+# BLOCK = 系統禁止自動修復,僅告警(資料風險最高)
+# CRITICAL_HITL = 允許 Playbook,但需 MultiSig 2票
+# STANDARD_HITL = 允許 Playbook,需 1票審核
+# AUTO = 允許自動執行(無狀態服務)
+# 參考: ADR-062, ADR-063
+
+services:
+ # ─── BLOCK:系統禁止(連 Playbook 都不提供)────────────────────────────
+ - name: postgres
+ display_name: "PostgreSQL 主庫 (awoooi_prod)"
+ host: "192.168.0.188"
+ stateful_level: BLOCK
+ reason: "主要業務資料庫,重啟可能導致 WAL 截斷、事務回滾"
+ alert_only: true
+ containers: ["postgres"]
+
+ - name: momo-db
+ display_name: "PostgreSQL (momo_db)"
+ host: "192.168.0.188"
+ stateful_level: BLOCK
+ reason: "momo 產品資料庫,禁止自動操作"
+ alert_only: true
+ containers: ["momo-db"]
+
+ - name: langfuse-db
+ display_name: "PostgreSQL (Langfuse)"
+ host: "192.168.0.110"
+ stateful_level: BLOCK
+ reason: "LLM trace 資料庫,重啟導致追蹤資料遺失"
+ alert_only: true
+ containers: ["langfuse-db"]
+
+ - name: harbor-db
+ display_name: "PostgreSQL (Harbor Registry)"
+ host: "192.168.0.110"
+ stateful_level: BLOCK
+ reason: "Harbor Registry 資料庫,重啟可能損壞 image layer 索引"
+ alert_only: true
+ containers: ["harbor-db"]
+
+ - name: sentry-postgres
+ display_name: "PostgreSQL (Sentry)"
+ host: "192.168.0.110"
+ stateful_level: BLOCK
+ reason: "Sentry 錯誤追蹤資料庫"
+ alert_only: true
+ containers: ["sentry-postgres"]
+
+ - name: signoz-clickhouse
+ display_name: "ClickHouse (SignOz)"
+ host: "192.168.0.188"
+ stateful_level: BLOCK
+ reason: "列欄式 OLAP 資料庫,寫入中重啟可能損壞列欄檔案"
+ alert_only: true
+ containers: ["signoz-clickhouse"]
+
+ # ─── CRITICAL_HITL:高風險,需 MultiSig 2票 ──────────────────────────
+ - name: redis
+ display_name: "Redis (AWOOOI)"
+ host: "192.168.0.188"
+ stateful_level: CRITICAL_HITL
+ reason: "AWOOOI 依賴 Redis 做冪等鎖與快取,重啟丟失鎖狀態"
+ requires_pre_backup: false
+ containers: ["redis"]
+
+ - name: harbor-redis
+ display_name: "Redis (Harbor)"
+ host: "192.168.0.110"
+ stateful_level: CRITICAL_HITL
+ reason: "Harbor session 快取"
+ containers: ["harbor-redis"]
+
+ - name: sentry-redis
+ display_name: "Redis (Sentry)"
+ host: "192.168.0.110"
+ stateful_level: CRITICAL_HITL
+ reason: "Sentry 任務佇列"
+ containers: ["sentry-redis"]
+
+ - name: gitea
+ display_name: "Gitea (程式碼倉庫)"
+ host: "192.168.0.110"
+ stateful_level: CRITICAL_HITL
+ reason: "restart 會殺掉活躍 SSH session,Git push 中斷可能損壞 working copy"
+ requires_pre_backup: false
+ containers: ["gitea"]
+
+ - name: harbor
+ display_name: "Harbor (Container Registry)"
+ host: "192.168.0.110"
+ stateful_level: CRITICAL_HITL
+ reason: "重啟中斷 pull/push;GC 進行中重啟可能損壞 layer"
+ requires_pre_backup: false
+ containers: ["harbor-core", "harbor-jobservice", "harbor-portal"]
+
+ - name: minio
+ display_name: "MinIO (物件存儲)"
+ host: "192.168.0.188"
+ stateful_level: CRITICAL_HITL
+ reason: "寫入中重啟可能導致 multipart upload 中斷"
+ requires_pre_backup: false
+ containers: ["minio"]
+
+ # ─── STANDARD_HITL:中風險,需 1票審核 ──────────────────────────────
+ - name: prometheus
+ display_name: "Prometheus"
+ host: "192.168.0.110"
+ stateful_level: STANDARD_HITL
+ reason: "有 TSDB WAL,exited 狀態用 docker start(非 restart)"
+ restart_command: "docker start"
+ containers: ["prometheus"]
+
+ - name: grafana
+ display_name: "Grafana"
+ host: "192.168.0.110"
+ stateful_level: STANDARD_HITL
+ reason: "有 SQLite 設定儲存,exited 用 docker start"
+ restart_command: "docker start"
+ containers: ["grafana"]
+
+ - name: alertmanager
+ display_name: "Alertmanager"
+ host: "192.168.0.110"
+ stateful_level: STANDARD_HITL
+ reason: "有 silence 狀態,exited 用 docker start"
+ restart_command: "docker start"
+ containers: ["alertmanager"]
+
+ # ─── AUTO:無狀態,允許自動修復 ──────────────────────────────────────
+ - name: nginx
+ display_name: "Nginx (反向代理)"
+ host: "192.168.0.110"
+ stateful_level: AUTO
+ containers: ["nginx", "nginx-188"]
+
+ - name: awoooi-api
+ display_name: "AWOOOI API (K3s)"
+ host: "k3s"
+ stateful_level: AUTO
+ containers: []
+
+ - name: awoooi-web
+ display_name: "AWOOOI Web (K3s)"
+ host: "k3s"
+ stateful_level: AUTO
+ containers: []
+
+ - name: blackbox-exporter
+ display_name: "Blackbox Exporter"
+ host: "192.168.0.110"
+ stateful_level: AUTO
+ containers: ["blackbox-exporter"]
+
+ - name: langfuse
+ display_name: "Langfuse (LLMOps)"
+ host: "192.168.0.110"
+ stateful_level: AUTO
+ containers: ["langfuse-web", "langfuse-worker"]
+
+ - name: ollama
+ display_name: "Ollama (Local LLM)"
+ host: "192.168.0.188"
+ stateful_level: AUTO
+ containers: ["ollama"]
+
+ - name: momo-app
+ display_name: "momo Web App"
+ host: "192.168.0.188"
+ stateful_level: AUTO
+ containers: ["momo-app"]
+
+ - name: tsenyang-website
+ display_name: "Tsenyang Website"
+ host: "192.168.0.188"
+ stateful_level: AUTO
+ containers: ["tsenyang-website"]
+
+ - name: stock-platform
+ display_name: "Stock Platform"
+ host: "192.168.0.110"
+ stateful_level: AUTO
+ containers: ["stock-platform"]
+
+# ─── 備份策略參考 ────────────────────────────────────────────────────────
+backup_policies:
+ velero_max_age_hours: 4 # Velero 備份過期閾值(Q2 決策)
+ emergency_backup_timeout: 600 # 緊急備份超時秒數
+ block_backup_on_high_io: true # CPU/IO > 80% 時禁止觸發備份(Q4 決策)
+ io_threshold_percent: 80
+
+# ─── MultiSig 設定 ───────────────────────────────────────────────────────
+multisig:
+ critical_required_votes: 2 # CRITICAL_HITL 需要幾票
+ standard_required_votes: 1 # STANDARD_HITL 需要幾票
+ vote_expiry_minutes: 30 # 投票有效期
diff --git a/scripts/ops/docker-health-monitor.sh b/scripts/ops/docker-health-monitor.sh
index 44493ef1..4c2f42a2 100755
--- a/scripts/ops/docker-health-monitor.sh
+++ b/scripts/ops/docker-health-monitor.sh
@@ -1,11 +1,13 @@
#!/usr/bin/env bash
# docker-health-monitor.sh
-# Plan A: Docker 容器健康監控 + 自動修復
+# Sprint 5.1 L4-6: 純感知層(偵測→送 Webhook,禁止任何修復動作)
#
# 部署: cron */5 * * * * /opt/awoooi-ops/docker-health-monitor.sh >> /var/log/docker-health-monitor.log 2>&1
# 設定: /etc/awoooi-ops/secrets.env
# 撰寫: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
-# 首席架構師裁示: Intent→Action→Result 三段式,禁止靜默修復
+# 架構裁示: Route B — 腳本只感知,所有修復決策由 AWOOOI API 執行(ADR-062)
+# 注意: 禁止在此腳本中執行 docker restart / docker start
+# 所有修復動作由 AWOOOI API Guardrail + Playbook + Approval 鏈路處理
set -euo pipefail
@@ -19,48 +21,45 @@ fi
: "${AWOOOI_API_URL:=https://awoooi.wooo.work}"
: "${TELEGRAM_BOT_TOKEN:=}"
: "${TELEGRAM_CHAT_ID:=}"
-: "${WEBHOOK_HMAC_SECRET:=}"
-: "${COOLDOWN_SECONDS:=300}"
: "${LOG_FILE:=/var/log/docker-health-monitor.log}"
+# Send cooldown (seconds): de-duplicates webhooks so the same container is not re-reported within a short window (dedupe only, not a repair cooldown)
+: "${SEND_COOLDOWN_SECONDS:=300}"
: "${COOLDOWN_DIR:=/tmp/docker-health-monitor-cooldown}"
mkdir -p "$COOLDOWN_DIR"
-# ─── 排除清單(禁止自動修復)───────────────────────────────────────────────
-# 判斷方式: echo ":list:" | grep -q ":name:"
-# 分類一:資料庫 — 禁止 restart
-EXCLUDED_DB_LIST=":postgres:momo-db:langfuse-db:harbor-db:sentry-postgres:signoz-clickhouse:"
-# 分類二:Redis — 禁止 restart
-EXCLUDED_REDIS_LIST=":redis:harbor-redis:sentry-redis:"
-# 分類三:監控棧 exited → docker start(保護 WAL)
-MONITORING_START_ONLY_LIST=":prometheus:grafana:alertmanager:"
-# 分類四:監控棧 其他 → 僅告警
-EXCLUDED_MONITORING_LIST=":blackbox-exporter:signoz-otel-collector:"
-# 分類五:關鍵系統 — 永遠禁止(Gitea restart 會殺活躍 SSH)
-EXCLUDED_CRITICAL_LIST=":gitea:"
-
# ─── 工具函數 ────────────────────────────────────────────────────────────────
log() {
echo "[$(date '+%Y-%m-%d %H:%M:%S %z')] $*"
}
-in_list() {
- local name=":${1}:"
- local list="$2"
- [[ "$list" == *"$name"* ]]
+# Send-cooldown check: de-duplicates webhooks for a single container.
+# Args:    $1 - container name (selects ${COOLDOWN_DIR}/<name>.cooldown)
+# Returns: 0 if the container was notified less than SEND_COOLDOWN_SECONDS
+#          ago (caller should skip it); 1 otherwise, including when the marker
+#          file is missing, unreadable, empty, or not a plain epoch integer.
+is_in_send_cooldown() {
+ local container="$1"
+ local cooldown_file="${COOLDOWN_DIR}/${container}.cooldown"
+ if [[ -f "$cooldown_file" ]]; then
+ local last_sent now elapsed
+ last_sent=$(cat "$cooldown_file" 2>/dev/null) || last_sent=""
+ # Admit digits only before doing arithmetic: an empty or corrupt marker
+ # makes $(( )) error out under `set -euo pipefail`, and bash arithmetic
+ # evaluates text as expressions, so file content must never reach it raw.
+ if [[ ! "$last_sent" =~ ^[0-9]+$ ]]; then
+ log "WARN: ${container} 冷卻檔內容無效,視為不在冷卻期"
+ return 1
+ fi
+ now=$(date +%s)
+ # 10# forces base 10 so a zero-padded value is not parsed as octal.
+ elapsed=$(( now - 10#$last_sent ))
+ if (( elapsed < SEND_COOLDOWN_SECONDS )); then
+ log "COOLDOWN: ${container} 距上次通知 ${elapsed}s,跳過(冷卻期 ${SEND_COOLDOWN_SECONDS}s)"
+ return 0
+ fi
+ fi
+ return 1
}
-# 計算 HMAC-SHA256 簽章
-sign_payload() {
- local payload="$1"
- printf '%s' "$payload" | openssl dgst -sha256 -hmac "$WEBHOOK_HMAC_SECRET" -binary | xxd -p -c 256
+# Record the current epoch second as the last-notified time for container $1.
+set_send_cooldown() {
+ local marker="${COOLDOWN_DIR}/${1}.cooldown"
+ date +%s > "$marker"
}
-# 傳送 Telegram(Fallback:AWOOOI API down 時直接呼叫 Bot API)
+# Fallback:AWOOOI API down 時直接呼叫 Telegram Bot API
send_telegram_direct() {
local message="$1"
if [[ -z "$TELEGRAM_BOT_TOKEN" || -z "$TELEGRAM_CHAT_ID" ]]; then
- log "WARN: Telegram 未設定,跳過通知"
+ log "WARN: Telegram 未設定,跳過 Fallback"
return 0
fi
curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
@@ -69,186 +68,113 @@ send_telegram_direct() {
> /dev/null 2>&1 || true
}
-# 傳送 AWOOOI Webhook(若失敗則 Fallback 至 Telegram Bot API)
-send_awoooi_alert() {
- local title="$1"
- local message="$2"
- local severity="${3:-WARNING}"
- local source="docker-health-monitor"
+# Send an Alertmanager-format webhook to the AWOOOI API ($1 = container name, $2 = detected state)
+# Uses the existing endpoint /api/v1/webhooks/alertmanager (no HMAC required on the internal network)
+send_to_awoooi() {
+ local container="$1"
+ local status="$2" # unhealthy | exited | dead
+ local hostname
+ hostname=$(hostname)
+ local now_ts
+ now_ts=$(date -u '+%Y-%m-%dT%H:%M:%SZ')
+
+ # Build the Alertmanager-format JSON (matching the existing AlertmanagerPayload schema)
local payload
- payload=$(printf '{"title":"%s","message":"%s","severity":"%s","source":"%s","labels":{"monitor":"docker-health-monitor","plan":"A"}}' \
- "$title" "$message" "$severity" "$source")
-
- local timestamp
- timestamp=$(date -u +%s)
- local signature
- signature=$(sign_payload "${timestamp}${payload}")
+ payload=$(cat </dev/null) || http_code="0"
- if [[ "$http_code" != "200" && "$http_code" != "202" ]]; then
+ if [[ "$http_code" == "200" || "$http_code" == "202" ]]; then
+ log "SENT: ${container} 狀態=${status} → AWOOOI API (${http_code})"
+ set_send_cooldown "$container"
+ else
log "WARN: AWOOOI API 回應 ${http_code},Fallback 到 Telegram Bot API"
- send_telegram_direct "[docker-health-monitor Fallback]
${title}
${message}"
+ send_telegram_direct "🚨 [docker-health-monitor Fallback]
主機: ${hostname}
容器: ${container}
狀態: ${status}
(API 不可達,請人工處理)"
+ set_send_cooldown "$container"
fi
}
-# 冷卻期檢查(避免同一容器短時間重複修復)
-is_in_cooldown() {
- local container="$1"
- local cooldown_file="${COOLDOWN_DIR}/${container}.cooldown"
- if [[ -f "$cooldown_file" ]]; then
- local last_repair
- last_repair=$(cat "$cooldown_file")
- local now
- now=$(date +%s)
- local elapsed=$(( now - last_repair ))
- if (( elapsed < COOLDOWN_SECONDS )); then
- log "COOLDOWN: ${container} 仍在冷卻期 (${elapsed}s / ${COOLDOWN_SECONDS}s)"
- return 0
- fi
- fi
- return 1
-}
-
-set_cooldown() {
- local container="$1"
- date +%s > "${COOLDOWN_DIR}/${container}.cooldown"
-}
-
-# ─── 核心:處理不健康容器 ───────────────────────────────────────────────────
-handle_unhealthy_container() {
- local container="$1"
- local status="$2" # unhealthy | exited | dead
+# ─── Core: scan all containers ──────────────────────────────────────────────
+check_containers() {
local hostname
hostname=$(hostname)
- log "DETECTED: ${container} 狀態=${status} on ${hostname}"
+ # List every container, including stopped ones
+ while IFS=$'\t' read -r container_id container_name state health; do
+ # Skip header or blank lines
+ [[ -z "$container_name" ]] && continue
- # ── 排除清單判斷 ─────────────────────────────────────────────────────────
-
- if in_list "$container" "$EXCLUDED_CRITICAL_LIST"; then
- log "SKIP: ${container} 屬於關鍵排除清單 (Gitea),僅告警"
- send_awoooi_alert \
- "[${hostname}] 關鍵服務異常: ${container}" \
- "容器 ${container} 狀態=${status}。此服務禁止自動修復,請人工處理!" \
- "CRITICAL"
- return
- fi
-
- if in_list "$container" "$EXCLUDED_DB_LIST"; then
- log "SKIP: ${container} 屬於資料庫排除清單,僅告警"
- send_awoooi_alert \
- "[${hostname}] 資料庫容器異常: ${container}" \
- "容器 ${container} 狀態=${status}。資料庫禁止自動修復,需人工介入!" \
- "CRITICAL"
- return
- fi
-
- if in_list "$container" "$EXCLUDED_REDIS_LIST"; then
- log "SKIP: ${container} 屬於 Redis 排除清單,僅告警"
- send_awoooi_alert \
- "[${hostname}] Redis 容器異常: ${container}" \
- "容器 ${container} 狀態=${status}。Redis 禁止自動修復,需人工介入!" \
- "CRITICAL"
- return
- fi
-
- if in_list "$container" "$EXCLUDED_MONITORING_LIST"; then
- log "SKIP: ${container} 屬於監控棧排除清單,僅告警"
- send_awoooi_alert \
- "[${hostname}] 監控元件異常: ${container}" \
- "容器 ${container} 狀態=${status}。請人工處理。" \
- "WARNING"
- return
- fi
-
- # ── 冷卻期判斷 ────────────────────────────────────────────────────────────
- if is_in_cooldown "$container"; then
- log "SKIP: ${container} 在冷卻期內,跳過本次修復"
- return
- fi
-
- # ── 決定修復動作 ─────────────────────────────────────────────────────────
- local action_cmd="docker restart"
- local action_desc="docker restart"
- if in_list "$container" "$MONITORING_START_ONLY_LIST" && [[ "$status" == "exited" ]]; then
- action_cmd="docker start"
- action_desc="docker start(保護 WAL,非 restart)"
- fi
-
- # ── Phase 1: Intent(決策意圖通知)──────────────────────────────────────
- log "INTENT: 即將對 ${container} 執行 ${action_desc}"
- send_awoooi_alert \
- "[${hostname}] 自動修復 Intent: ${container}" \
- "偵測到容器 ${container} 狀態=${status}。即將執行 ${action_desc},2 秒後開始修復。" \
- "WARNING"
-
- sleep 2
-
- # ── Phase 2: Action(執行修復)──────────────────────────────────────────
- log "ACTION: 執行 ${action_cmd} ${container}"
- set_cooldown "$container"
-
- local repair_ok=false
- if $action_cmd "$container" >> "$LOG_FILE" 2>&1; then
- repair_ok=true
- fi
-
- # ── Phase 3: Result(執行結果通知)──────────────────────────────────────
- if $repair_ok; then
- log "RESULT: ${container} 修復成功"
- send_awoooi_alert \
- "[${hostname}] 自動修復成功: ${container}" \
- "容器 ${container} 已透過 ${action_desc} 成功恢復。原狀態=${status}。" \
- "INFO"
- else
- log "RESULT: ${container} 修復失敗!需人工介入"
- send_awoooi_alert \
- "[${hostname}] 自動修復失敗: ${container}" \
- "容器 ${container} 執行 ${action_desc} 失敗!原狀態=${status}。需人工介入!" \
- "CRITICAL"
- fi
-}
-
-# ─── 主流程 ──────────────────────────────────────────────────────────────────
-main() {
- log "===== docker-health-monitor 啟動 (host=$(hostname)) ====="
-
- # 取得所有容器狀態
- # docker ps -a 格式: Names / Health / State
- while IFS=$'\t' read -r name health_status container_status; do
- [[ -z "$name" ]] && continue
-
- local needs_repair=false
+ local needs_alert=false
local detected_status=""
- if [[ "$health_status" == "unhealthy" ]]; then
- needs_repair=true
+ # Detect exited / dead
+ if [[ "$state" == "exited" || "$state" == "dead" ]]; then
+ needs_alert=true
+ detected_status="$state"
+ fi
+
+ # Detect unhealthy (a health check exists and is failing)
+ if [[ "$health" == "unhealthy" ]]; then
+ needs_alert=true
detected_status="unhealthy"
- elif [[ "$container_status" == "exited" || "$container_status" == "dead" ]]; then
- needs_repair=true
- detected_status="$container_status"
- elif [[ "$health_status" == "starting" ]]; then
- log "INFO: ${name} health=starting,等待中跳過"
- continue
fi
- if $needs_repair; then
- handle_unhealthy_container "$name" "$detected_status"
+ if $needs_alert; then
+ log "DETECTED: ${container_name} 狀態=${detected_status} on ${hostname}"
+
+ # De-duplicate via the send cooldown
+ if is_in_send_cooldown "$container_name"; then
+ continue
+ fi
+
+ # Send the webhook — detect only, never repair
+ send_to_awoooi "$container_name" "$detected_status"
fi
+ done < <(docker ps -a --format '{{.ID}}\t{{.Names}}\t{{.State}}\t{{.Status}}' | \
+ awk -F'\t' '{
+ health = ""
+ if ($4 ~ /\(unhealthy\)/) health = "unhealthy"
+ else if ($4 ~ /\(healthy\)/) health = "healthy"
+ print $1 "\t" $2 "\t" $3 "\t" health
+ }')
+}
- done < <(docker ps -a --format '{{.Names}} {{.Health}} {{.State}}' 2>/dev/null)
-
- log "===== docker-health-monitor 完成 ====="
+# ─── Main: log a start banner, run check_containers, log completion ─────────
+main() {
+ log "=== docker-health-monitor 感知層啟動 (純感知,禁止修復) ==="
+ check_containers
+ log "=== 掃描完成 ==="
}
main "$@"