From 88696dba9b0efd4e5d067b81ce078cf8266f0047 Mon Sep 17 00:00:00 2001 From: OG T Date: Wed, 8 Apr 2026 16:24:09 +0800 Subject: [PATCH] =?UTF-8?q?feat(sprint5.1):=20Data=20Safety=20Guardrails?= =?UTF-8?q?=20=E5=85=A8=E9=8F=88=E8=B7=AF=E6=95=B4=E5=90=88=20(L1-L5)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Layer 0 - K8s RBAC: - k8s/rbac/api-velero-reader.yaml: awoooi-executor SA Velero backup reader Layer 1 - DB Migration (已在 188 執行): - M-002: approval_records 新增 approval_level/votes/required_votes - M-003: alert_event_type ENUM 新增 8 個值 Layer 2 - IaC: - ops/config/service-registry.yaml: 全服務 Stateful 分級清單 (BLOCK/CRITICAL_HITL/STANDARD_HITL/AUTO) Layer 3 - Python Services: - service_registry.py: 讀取 YAML,提供 is_blocked/requires_multisig/get_required_votes - velero_client.py: kubectl 查詢 Velero 備份年齡,失敗 fallback 999h - preflight_service.py: Pre-flight 安全檢查 (Q2/Q4 決策) Layer 1-M001 - Playbook model: - playbook.py: 新增 requires_approval_level/stateful_targets/requires_pre_backup Layer 4 - 業務邏輯: - alert_operation_log_repository.py: 新增 8 個 event_type (Guardrail/Pre-flight/MultiSig/備份) - auto_repair_service.py: 注入 Service Registry Guardrail 檢查 (BLOCK → 直接拒絕) - webhooks.py: ALERT_RECEIVED 溯源記錄 + auto_repair flag Q9 + Langfuse trace_id Q10 - db/models.py: ApprovalRecord 同步 approval_level/votes/required_votes 欄位 - docker-health-monitor.sh: 純感知層改造(移除所有 docker restart 邏輯) Layer 5 - Telegram 通知: - telegram_gateway.py: T1-T6 六個新通知方法 (Guardrail/Pre-flight/Backup/MultiSig/ChangeApplied) 參考: ADR-062 Data Safety Guardrails, ADR-063 Service Registry IaC Co-Authored-By: Claude Sonnet 4.6 --- .../migrations/sprint51_alert_log_events.sql | 18 ++ .../migrations/sprint51_approval_multisig.sql | 31 ++ apps/api/src/api/v1/webhooks.py | 66 +++- apps/api/src/db/models.py | 20 ++ apps/api/src/models/playbook.py | 14 + .../alert_operation_log_repository.py | 11 + apps/api/src/services/auto_repair_service.py | 26 ++ 
apps/api/src/services/preflight_service.py | 116 +++++++ apps/api/src/services/service_registry.py | 124 ++++++++ apps/api/src/services/telegram_gateway.py | 122 ++++++++ apps/api/src/services/velero_client.py | 113 +++++++ k8s/rbac/api-velero-reader.yaml | 36 +++ ops/config/service-registry.yaml | 201 ++++++++++++ scripts/ops/docker-health-monitor.sh | 290 +++++++----------- 14 files changed, 997 insertions(+), 191 deletions(-) create mode 100644 apps/api/migrations/sprint51_alert_log_events.sql create mode 100644 apps/api/migrations/sprint51_approval_multisig.sql create mode 100644 apps/api/src/services/preflight_service.py create mode 100644 apps/api/src/services/service_registry.py create mode 100644 apps/api/src/services/velero_client.py create mode 100644 k8s/rbac/api-velero-reader.yaml create mode 100644 ops/config/service-registry.yaml diff --git a/apps/api/migrations/sprint51_alert_log_events.sql b/apps/api/migrations/sprint51_alert_log_events.sql new file mode 100644 index 00000000..499831fc --- /dev/null +++ b/apps/api/migrations/sprint51_alert_log_events.sql @@ -0,0 +1,18 @@ +-- apps/api/migrations/sprint51_alert_log_events.sql +-- Sprint 5.1 M-003: alert_operation_log ENUM 擴充 +-- 執行者: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei +-- ⚠️ ENUM ADD VALUE 不可 rollback,執行前確認已備份 +-- 說明: 新增 8 個 event_type 支援 Guardrail / Pre-flight / MultiSig / 備份追蹤 + +BEGIN; + +ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'GUARDRAIL_BLOCKED'; +ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'PRE_FLIGHT_PASSED'; +ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'PRE_FLIGHT_FAILED'; +ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'BACKUP_TRIGGERED'; +ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'BACKUP_COMPLETED'; +ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'BACKUP_FAILED'; +ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'APPROVAL_ESCALATED'; +ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'CHANGE_APPLIED'; + +COMMIT; diff --git 
a/apps/api/migrations/sprint51_approval_multisig.sql b/apps/api/migrations/sprint51_approval_multisig.sql new file mode 100644 index 00000000..bb92f824 --- /dev/null +++ b/apps/api/migrations/sprint51_approval_multisig.sql @@ -0,0 +1,31 @@ +-- apps/api/migrations/sprint51_approval_multisig.sql +-- Sprint 5.1 M-002: MultiSig 雙簽核支援 +-- 執行者: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei +-- 說明: approval_records 新增 approval_level / approval_votes / required_votes + +BEGIN; + +ALTER TABLE approval_records + ADD COLUMN IF NOT EXISTS approval_level VARCHAR(20) + DEFAULT 'standard' + CHECK (approval_level IN ('standard', 'critical')), + ADD COLUMN IF NOT EXISTS approval_votes JSONB + DEFAULT '[]'::jsonb, + ADD COLUMN IF NOT EXISTS required_votes INTEGER + DEFAULT 1; + +COMMENT ON COLUMN approval_records.approval_level IS + 'standard=1票審核, critical=2票MultiSig'; +COMMENT ON COLUMN approval_records.approval_votes IS + 'JSON array: [{"user_id": "123", "voted_at": "2026-04-08T...", "action": "approve"}]'; +COMMENT ON COLUMN approval_records.required_votes IS + 'standard=1, critical=2'; + +-- 現有記錄回填(向後相容) +UPDATE approval_records +SET approval_level = 'standard', + required_votes = 1, + approval_votes = '[]'::jsonb +WHERE approval_level IS NULL; + +COMMIT; diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py index 9b45072b..0b87427b 100644 --- a/apps/api/src/api/v1/webhooks.py +++ b/apps/api/src/api/v1/webhooks.py @@ -218,7 +218,9 @@ async def _try_auto_repair_background( ) return - # 記錄自動修復觸發 + # 記錄自動修復觸發 (Sprint 5.1 Q10: 加入 Langfuse trace_id 追蹤) + # (2026-04-08 Claude Sonnet 4.6 Asia/Taipei,ADR-062) + _langfuse_trace_id = getattr(incident, "langfuse_trace_id", None) await op_log.append( "AUTO_REPAIR_TRIGGERED", incident_id=incident_id, @@ -231,6 +233,11 @@ async def _try_auto_repair_background( "playbook_name": decision.playbook.name, "similarity_score": decision.similarity_score, "risk_level": decision.risk_level.value if decision.risk_level else None, 
+ "langfuse_trace_id": _langfuse_trace_id, + "langfuse_url": ( + f"http://192.168.0.110:3100/trace/{_langfuse_trace_id}" + if _langfuse_trace_id else None + ), }, ) @@ -1084,6 +1091,31 @@ async def alertmanager_webhook( alert = firing_alerts[0] alert_id = f"alert-{now_taipei().strftime('%Y%m%d%H%M%S')}" + # ========================================================================== + # Sprint 5.1 L4-2: ALERT_RECEIVED 溯源記錄 + auto_repair flag 讀取 + # (2026-04-08 Claude Sonnet 4.6 Asia/Taipei,ADR-062 Q9) + # ========================================================================== + _alert_labels = alert.labels or {} + _alertname_for_log = _alert_labels.get("alertname", "UnknownAlert") + # Q9: auto_repair flag — Rule=false 強制 HITL(不觸發自動修復背景任務) + _can_auto_repair_by_rule = _alert_labels.get("auto_repair", "true").lower() == "true" + try: + _op_log = get_alert_operation_log_repository() + await _op_log.append( + "ALERT_RECEIVED", + actor="alertmanager", + action_detail=f"收到告警: {_alertname_for_log}", + context={ + "source": "alertmanager", + "alert_id": alert_id, + "alertname": _alertname_for_log, + "labels": _alert_labels, + "auto_repair_flag": _can_auto_repair_by_rule, + }, + ) + except Exception as _log_err: + logger.warning("alert_received_log_failed", error=str(_log_err)) + # ========================================================================== # Alert Normalizer: 轉換 Alertmanager 格式 → AWOOOI AlertPayload # ========================================================================== @@ -1326,15 +1358,31 @@ async def alertmanager_webhook( # 2026-04-05 ogt: 自動修復評估 (ADR-058 閉環) # Incident 建立後立即評估是否可自動修復 # P2 以下 + 高品質 Playbook + 低風險 → 背景自動執行 + # Sprint 5.1 Q9: auto_repair=false 旗標 → 強制 HITL,不觸發背景任務 + # (2026-04-08 Claude Sonnet 4.6 Asia/Taipei,ADR-062) # ================================================================ - background_tasks.add_task( - _try_auto_repair_background, - incident_id=incident_id, - approval_id=str(approval.id), - alert_type=alert_type, - 
target_resource=target_resource, - namespace=namespace, - ) + if _can_auto_repair_by_rule: + background_tasks.add_task( + _try_auto_repair_background, + incident_id=incident_id, + approval_id=str(approval.id), + alert_type=alert_type, + target_resource=target_resource, + namespace=namespace, + ) + else: + # auto_repair=false → 記錄 GUARDRAIL_BLOCKED,不觸發自動修復 + _op_log_rule = get_alert_operation_log_repository() + background_tasks.add_task( + _op_log_rule.append, + "GUARDRAIL_BLOCKED", + incident_id=incident_id, + approval_id=str(approval.id), + actor="prometheus-rule", + action_detail=f"Prometheus rule 設定 auto_repair=false,強制人工審核: {alertname}", + success=False, + context={"alertname": alertname, "auto_repair_flag": False}, + ) # 推送 Telegram background_tasks.add_task( diff --git a/apps/api/src/db/models.py b/apps/api/src/db/models.py index 2b891865..7dc9f918 100644 --- a/apps/api/src/db/models.py +++ b/apps/api/src/db/models.py @@ -124,6 +124,26 @@ class ApprovalRecord(Base): comment="Last time this alert pattern was seen", ) + # Sprint 5.1 MultiSig 雙簽核支援 (2026-04-08 Claude Sonnet 4.6 Asia/Taipei,ADR-062 Q3) + approval_level: Mapped[str] = mapped_column( + String(20), + default="standard", + nullable=False, + comment="standard=1票審核, critical=2票MultiSig", + ) + approval_votes: Mapped[list[dict[str, Any]]] = mapped_column( + JSON, + default=list, + nullable=False, + comment="[{user_id, voted_at, action}]", + ) + required_votes: Mapped[int] = mapped_column( + Integer, + default=1, + nullable=False, + comment="standard=1, critical=2", + ) + # 2026-04-06 ogt: Phase 26 — 關聯 Incident ID # Playbook 萃取和 KM 寫入必須知道 incident_id,不能靠文字解析 incident_id: Mapped[str | None] = mapped_column( diff --git a/apps/api/src/models/playbook.py b/apps/api/src/models/playbook.py index b7f98508..3088510f 100644 --- a/apps/api/src/models/playbook.py +++ b/apps/api/src/models/playbook.py @@ -212,6 +212,20 @@ class Playbook(BaseModel): tags: list[str] = Field(default_factory=list, description="標籤") 
notes: str | None = Field(None, description="人工補充說明") + # === Sprint 5.1 資料安全護欄 (2026-04-08 Claude Sonnet 4.6 Asia/Taipei) === + requires_approval_level: str = Field( + default="auto", + description="auto=直接執行, standard=1票, critical=2票MultiSig(由 Service Registry 決定)", + ) + stateful_targets: list[str] = Field( + default_factory=list, + description="此 Playbook 操作的 Stateful 服務清單,對應 service-registry.yaml", + ) + requires_pre_backup: bool = Field( + default=False, + description="執行前是否需要 Pre-flight 備份檢查", + ) + # === 時間軸 === created_at: datetime = Field(default_factory=now_taipei) updated_at: datetime = Field(default_factory=now_taipei) diff --git a/apps/api/src/repositories/alert_operation_log_repository.py b/apps/api/src/repositories/alert_operation_log_repository.py index c6a94bdf..b6f0aaec 100644 --- a/apps/api/src/repositories/alert_operation_log_repository.py +++ b/apps/api/src/repositories/alert_operation_log_repository.py @@ -22,7 +22,9 @@ from src.db.models import AlertOperationLog logger = structlog.get_logger(__name__) # 合法的 event_type 值 (對應 DB ENUM) +# Sprint 5.1 新增 8 個 (2026-04-08 Claude Sonnet 4.6 Asia/Taipei) ALERT_EVENT_TYPES = { + # 原有 10 個 "ALERT_RECEIVED", "TELEGRAM_SENT", "USER_ACTION", @@ -33,6 +35,15 @@ ALERT_EVENT_TYPES = { "RESOLVED", "SILENCED", "ESCALATED", + # Sprint 5.1 Guardrail / Pre-flight / MultiSig / 備份追蹤 + "GUARDRAIL_BLOCKED", + "PRE_FLIGHT_PASSED", + "PRE_FLIGHT_FAILED", + "BACKUP_TRIGGERED", + "BACKUP_COMPLETED", + "BACKUP_FAILED", + "APPROVAL_ESCALATED", + "CHANGE_APPLIED", } diff --git a/apps/api/src/services/auto_repair_service.py b/apps/api/src/services/auto_repair_service.py index d48bdc7e..266b2001 100644 --- a/apps/api/src/services/auto_repair_service.py +++ b/apps/api/src/services/auto_repair_service.py @@ -192,6 +192,32 @@ class AutoRepairService: blocked_by="GLOBAL_GUARDRAIL", ) + # 0.5 Sprint 5.1 Guardrail: Service Registry 服務分級檢查 + # (2026-04-08 Claude Sonnet 4.6 Asia/Taipei,ADR-062) + # 全域熔斷之後、嚴重度之前,BLOCK 等級直接拒絕 + try: + 
from src.services.service_registry import StatefulLevel, get_service_registry + _registry = get_service_registry() + _service_name = (incident.target_resource or "") if hasattr(incident, "target_resource") else "" + if not _service_name and incident.affected_services: + _service_name = incident.affected_services[0] + _stateful_level = _registry.get_stateful_level(_service_name) + if _stateful_level == StatefulLevel.BLOCK: + logger.warning( + "auto_repair_blocked_guardrail", + incident_id=incident.incident_id, + service_name=_service_name, + stateful_level="BLOCK", + ) + return AutoRepairDecision( + can_auto_repair=False, + reason=f"GUARDRAIL_BLOCK: 服務 '{_service_name}' 屬於禁止自動修復清單(資料安全,見 service-registry.yaml)", + blocked_by="SERVICE_REGISTRY_BLOCK", + ) + except Exception as _guardrail_err: + logger.error("guardrail_check_failed", error=str(_guardrail_err)) + # 保守原則:失敗時繼續(不阻擋,但記錄) + # 1. 檢查 Incident 嚴重度 if incident.severity and incident.severity.value in ["P0", "P1"]: logger.info( diff --git a/apps/api/src/services/preflight_service.py b/apps/api/src/services/preflight_service.py new file mode 100644 index 00000000..837d9104 --- /dev/null +++ b/apps/api/src/services/preflight_service.py @@ -0,0 +1,116 @@ +# apps/api/src/services/preflight_service.py +# Pre-flight 安全檢查服務 (Q2/Q4 決策) +# 撰寫: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei +# 架構: leWOOOgo 積木化,依賴 ServiceRegistryClient + VeleroClient +# 參考: ADR-062 + +from __future__ import annotations + +import logging +import time +from dataclasses import dataclass +from enum import Enum + +from .service_registry import ServiceRegistryClient, get_service_registry +from .velero_client import VeleroClient, get_velero_client + +logger = logging.getLogger(__name__) + + +class PreflightResult(str, Enum): + PASS = "PASS" + ABORT_BACKUP_EXPIRED = "ABORT_BACKUP_EXPIRED" + ABORT_HIGH_IO = "ABORT_HIGH_IO" + SKIP = "SKIP" # 服務不需要 Pre-flight + + +@dataclass +class PreflightReport: + result: PreflightResult + backup_age_hours: float | 
None = None + backup_name_triggered: str | None = None + reason: str = "" + + +class PreflightService: + """ + Pre-flight 安全檢查 + - 只有 requires_pre_backup=True 的服務才觸發 + - 備份過期 → Abort + 觸發緊急備份(非同步) + - CPU/IO 高負載告警 → 禁止觸發備份(Q4) + """ + + def __init__( + self, + registry: ServiceRegistryClient | None = None, + velero: VeleroClient | None = None, + ) -> None: + self._registry = registry or get_service_registry() + self._velero = velero or get_velero_client() + + async def check( + self, + service_name: str, + alert_labels: dict | None = None, + ) -> PreflightReport: + """ + 執行 Pre-flight 檢查 + alert_labels: Prometheus 告警標籤,用於判斷 CPU/IO 負載 + """ + info = self._registry.get_service(service_name) + if info is None or not info.requires_pre_backup: + return PreflightReport(result=PreflightResult.SKIP, reason="服務不需要 Pre-flight") + + # Q4: CPU/IO 高負載告警時禁止觸發備份 + if self._is_high_io_alert(alert_labels): + logger.warning(f"Pre-flight: {service_name} 屬於 CPU/IO 高負載告警,跳過備份觸發") + return PreflightReport( + result=PreflightResult.ABORT_HIGH_IO, + reason="告警類型為 CPU/IO 高負載,禁止觸發備份(Q4 決策)", + ) + + policies = self._registry.get_backup_policies() + max_age = policies.get("velero_max_age_hours", 4) + + age = await self._velero.get_latest_backup_age_hours() + if age <= max_age: + return PreflightReport( + result=PreflightResult.PASS, + backup_age_hours=age, + reason=f"備份時間正常 ({age:.1f}h < {max_age}h)", + ) + + # 備份過期 → 觸發緊急備份 + Abort + backup_name = f"emergency-preflight-{int(time.time())}" + triggered = await self._velero.trigger_emergency_backup(backup_name) + return PreflightReport( + result=PreflightResult.ABORT_BACKUP_EXPIRED, + backup_age_hours=age, + backup_name_triggered=backup_name if triggered else None, + reason=( + f"備份過期 ({age:.1f}h > {max_age}h)。" + f"{'緊急備份已啟動: ' + backup_name if triggered else '緊急備份啟動失敗,請人工處理'}" + ), + ) + + def _is_high_io_alert(self, labels: dict | None) -> bool: + if not labels: + return False + alert_name = labels.get("alertname", "").lower() + return 
any(kw in alert_name for kw in ["cpu", "io", "disk", "load", "memory"]) + + +_preflight_service: PreflightService | None = None + + +def get_preflight_service() -> PreflightService: + global _preflight_service + if _preflight_service is None: + _preflight_service = PreflightService() + return _preflight_service + + +def set_preflight_service(service: PreflightService) -> None: + """測試注入用 (P4 規範)""" + global _preflight_service + _preflight_service = service diff --git a/apps/api/src/services/service_registry.py b/apps/api/src/services/service_registry.py new file mode 100644 index 00000000..c15472b1 --- /dev/null +++ b/apps/api/src/services/service_registry.py @@ -0,0 +1,124 @@ +# apps/api/src/services/service_registry.py +# Service Registry Client — 讀取 ops/config/service-registry.yaml +# 撰寫: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei +# 架構: leWOOOgo 積木化,純 Service 層,無 Router/DB 依賴 +# 參考: ADR-062, ADR-063 + +from __future__ import annotations + +import logging +from enum import Enum +from pathlib import Path +from typing import Any + +import yaml + +logger = logging.getLogger(__name__) + +# Registry YAML path: for apps/api/src/services/<this file>, parents[4] is the repo root (parents[5] would be one level above it) +_DEFAULT_REGISTRY_PATH = Path(__file__).parents[4] / "ops" / "config" / "service-registry.yaml" + + +class StatefulLevel(str, Enum): + BLOCK = "BLOCK" # 禁止,僅告警 + CRITICAL_HITL = "CRITICAL_HITL" # 2 票 MultiSig + STANDARD_HITL = "STANDARD_HITL" # 1 票 + AUTO = "AUTO" # 自動執行 + + +class ServiceInfo: + def __init__(self, data: dict[str, Any]) -> None: + self.name: str = data["name"] + self.display_name: str = data.get("display_name", self.name) + self.host: str = data.get("host", "unknown") + self.stateful_level: StatefulLevel = StatefulLevel(data.get("stateful_level", "AUTO")) + self.reason: str = data.get("reason", "") + self.alert_only: bool = data.get("alert_only", False) + self.requires_pre_backup: bool = data.get("requires_pre_backup", False) + self.restart_command: str = data.get("restart_command", "docker restart") + self.containers: list[str] = 
data.get("containers", []) + + +class ServiceRegistryClient: + """ + Service Registry 客戶端 + 讀取 ops/config/service-registry.yaml,提供服務 Stateful 分級查詢 + 設計原則: 純讀取,不寫入;失敗時 fallback AUTO(防護不應阻擋告警流程) + """ + + def __init__(self, registry_path: Path | None = None) -> None: + self._path = registry_path or _DEFAULT_REGISTRY_PATH + self._services: dict[str, ServiceInfo] = {} + self._backup_policies: dict[str, Any] = {} + self._multisig_config: dict[str, Any] = {} + self._loaded = False + + def _load(self) -> None: + if self._loaded: + return + try: + with open(self._path, encoding="utf-8") as f: # registry holds non-ASCII text; do not depend on the locale encoding + data = yaml.safe_load(f) or {} # an empty YAML file parses to None + for svc in data.get("services", []): + info = ServiceInfo(svc) + self._services[info.name] = info + # 也按 container 名稱建立索引 + for container in info.containers: + self._services[container] = info + self._backup_policies = data.get("backup_policies", {}) + self._multisig_config = data.get("multisig", {}) + self._loaded = True + logger.info(f"Service Registry 載入完成: {len(self._services)} 個服務") + except Exception as e: + logger.error(f"Service Registry 載入失敗: {e},所有服務 fallback AUTO") + self._loaded = True # 防止重複嘗試 + + def get_service(self, name: str) -> ServiceInfo | None: + self._load() + return self._services.get(name) + + def get_stateful_level(self, service_name: str) -> StatefulLevel: + """查詢服務分級,未知服務 fallback AUTO""" + info = self.get_service(service_name) + if info is None: + logger.warning(f"未知服務 '{service_name}',fallback AUTO") + return StatefulLevel.AUTO + return info.stateful_level + + def is_blocked(self, service_name: str) -> bool: + return self.get_stateful_level(service_name) == StatefulLevel.BLOCK + + def requires_multisig(self, service_name: str) -> bool: + return self.get_stateful_level(service_name) == StatefulLevel.CRITICAL_HITL + + def get_required_votes(self, service_name: str) -> int: + self._load() + level = self.get_stateful_level(service_name) + if level == StatefulLevel.CRITICAL_HITL: + return self._multisig_config.get("critical_required_votes", 2) + 
return self._multisig_config.get("standard_required_votes", 1) + + def get_backup_policies(self) -> dict[str, Any]: + self._load() + return self._backup_policies + + def get_restart_command(self, service_name: str) -> str: + info = self.get_service(service_name) + return info.restart_command if info else "docker restart" + + +# Singleton +_registry_client: ServiceRegistryClient | None = None + + +def get_service_registry() -> ServiceRegistryClient: + global _registry_client + if _registry_client is None: + _registry_client = ServiceRegistryClient() + return _registry_client + + +def set_service_registry(client: ServiceRegistryClient) -> None: + """測試注入用 (P4 規範)""" + global _registry_client + _registry_client = client diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py index 95c5cfeb..4d267ee2 100644 --- a/apps/api/src/services/telegram_gateway.py +++ b/apps/api/src/services/telegram_gateway.py @@ -2654,6 +2654,128 @@ class TelegramGateway: f"⚠️ 重診觸發失敗: {html.escape(str(e)[:100])}" ) + # ========================================================================= + # Sprint 5.1 T1-T6: Data Safety Guardrail 通知場景 + # (2026-04-08 Claude Sonnet 4.6 Asia/Taipei,ADR-062) + # ========================================================================= + + async def send_guardrail_blocked( + self, + service_name: str, + alertname: str, + reason: str, + ) -> None: + """T1: GUARDRAIL_BLOCKED — 服務屬於 BLOCK 等級,禁止自動修復""" + text = ( + "🚫 [服務保護] 自動修復已阻擋\n" + "━━━━━━━━━━━━━━━━━\n" + f"服務: {html.escape(service_name)}\n" + f"告警: {html.escape(alertname)}\n" + f"原因: {html.escape(reason)}\n" + "━━━━━━━━━━━━━━━━━\n" + "⚠️ 請人工評估並手動處理" + ) + await self.send_notification(text) + + async def send_preflight_failed( + self, + service_name: str, + backup_age_hours: float, + max_age_hours: float, + backup_name: str | None, + ) -> None: + """T2: PRE_FLIGHT_FAILED + BACKUP_TRIGGERED — 備份過期,修復暫停""" + backup_status = ( + f"緊急備份: 已啟動 {html.escape(backup_name)}" 
+ if backup_name + else "緊急備份: 啟動失敗,請人工處理" + ) + text = ( + "⏸ [Pre-flight 阻擋] 備份已過期,修復暫停\n" + "━━━━━━━━━━━━━━━━━\n" + f"服務: {html.escape(service_name)}\n" + f"備份距今: {backup_age_hours:.1f} 小時(上限 {max_age_hours:.0f} 小時)\n" + f"{backup_status}\n" + "━━━━━━━━━━━━━━━━━\n" + "請等待備份完成後,人工重新評估修復方案" + ) + await self.send_notification(text) + + async def send_backup_result( + self, + backup_name: str, + success: bool, + error_msg: str | None = None, + ) -> None: + """T3: BACKUP_COMPLETED / BACKUP_FAILED — 緊急備份結果""" + if success: + text = ( + "✅ 緊急備份完成\n" + f"備份: {html.escape(backup_name)}\n" + "可繼續手動執行修復" + ) + else: + err = html.escape(error_msg or "未知錯誤") + text = ( + "❌ 緊急備份失敗\n" + f"備份: {html.escape(backup_name)}\n" + f"錯誤: {err}\n" + "請人工介入,備份異常" + ) + await self.send_notification(text) + + async def send_multisig_waiting( + self, + action: str, + service_name: str, + votes_received: int, + votes_required: int, + approval_id: str, + ) -> None: + """T4: APPROVAL_ESCALATED — 第 1 票完成,等待第 2 票""" + text = ( + "🔐 [MultiSig] 等待第 2 票授權\n" + "━━━━━━━━━━━━━━━━━\n" + f"操作: {html.escape(action)}\n" + f"服務: {html.escape(service_name)}\n" + f"風險: CRITICAL(HITL 雙簽)\n" + f"已獲授權: {votes_received}/{votes_required} 票\n" + f"審核 ID: {html.escape(approval_id)}\n" + "━━━━━━━━━━━━━━━━━\n" + "請第二位審核者登入確認" + ) + await self.send_notification(text) + + async def send_multisig_approved( + self, + action: str, + service_name: str, + ) -> None: + """T5: MultiSig 完成(2/2)""" + text = ( + "✅ [MultiSig 完成] 雙簽授權通過\n" + f"操作: {html.escape(action)}\n" + f"服務: {html.escape(service_name)}\n" + "授權: 2/2 票 開始執行..." 
+ ) + await self.send_notification(text) + + async def send_change_applied( + self, + operator: str, + action_description: str, + timestamp: str, + ) -> None: + """T6: CHANGE_APPLIED — 手動變更記錄""" + text = ( + "📝 [變更記錄] 手動操作已記錄\n" + "━━━━━━━━━━━━━━━━━\n" + f"操作者: {html.escape(operator)}\n" + f"動作: {html.escape(action_description)}\n" + f"時間: {html.escape(timestamp)}" + ) + await self.send_notification(text) + async def send_notification( self, text: str, diff --git a/apps/api/src/services/velero_client.py b/apps/api/src/services/velero_client.py new file mode 100644 index 00000000..7ad411d2 --- /dev/null +++ b/apps/api/src/services/velero_client.py @@ -0,0 +1,113 @@ +# apps/api/src/services/velero_client.py +# Velero Backup 查詢客戶端 (kubectl 方式,Q7 決策) +# 撰寫: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei +# 架構: leWOOOgo 積木化,純 Service 層 +# 參考: ADR-062 + +from __future__ import annotations + +import asyncio +import json +import logging +import time +from datetime import UTC, datetime + +logger = logging.getLogger(__name__) + +_VELERO_NAMESPACE = "velero" +_KUBECTL_TIMEOUT = 30 # 秒 + + +class VeleroClient: + """ + 透過 kubectl 查詢 Velero 備份狀態 + 設計原則: 失敗時 fallback「假設備份過期」(保守原則) + """ + + async def get_latest_backup_age_hours(self) -> float: + """ + 查詢最近一次 Completed 備份距今幾小時 + 失敗時返回 999.0(視為嚴重過期,觸發 Abort) + """ + try: + result = await asyncio.wait_for( + self._run_kubectl( + ["get", "backups.velero.io", "-n", _VELERO_NAMESPACE, + "-o", "json"] # no --field-selector: custom resources only support metadata.name/namespace selectors + ), + timeout=_KUBECTL_TIMEOUT, + ) + data = json.loads(result) + items = [b for b in data.get("items", []) if b.get("status", {}).get("phase") == "Completed"] # filter the phase client-side + if not items: + logger.warning("Velero: 找不到任何 Completed 備份") + return 999.0 + + latest = max( + items, + key=lambda x: x.get("status", {}).get("completionTimestamp", ""), + ) + completion_ts = latest["status"].get("completionTimestamp", "") + if not completion_ts: + return 999.0 + + completed_at = datetime.fromisoformat(completion_ts.replace("Z", "+00:00")) + age = (datetime.now(UTC) - 
completed_at).total_seconds() / 3600 + logger.info(f"Velero 最近備份: {completion_ts},距今 {age:.1f} 小時") + return age + + except asyncio.TimeoutError: + logger.error("Velero kubectl 查詢超時") + return 999.0 + except Exception as e: + logger.error(f"Velero 查詢失敗: {e}") + return 999.0 + + async def trigger_emergency_backup(self, backup_name: str | None = None) -> bool: + """ + 觸發緊急備份(非同步,不等待完成) + 返回 True 表示指令已成功發送 + """ + name = backup_name or f"emergency-{int(time.time())}" + try: + await asyncio.wait_for( + self._run_kubectl([ + "create", "backup", name, + "-n", _VELERO_NAMESPACE, + "--include-namespaces", "awoooi-prod", + "--wait=false", + ]), + timeout=_KUBECTL_TIMEOUT, + ) + logger.info(f"Velero 緊急備份已啟動: {name}") + return True + except Exception as e: + logger.error(f"Velero 緊急備份失敗: {e}") + return False + + async def _run_kubectl(self, args: list[str]) -> str: + proc = await asyncio.create_subprocess_exec( + "kubectl", *args, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await proc.communicate() + if proc.returncode != 0: + raise RuntimeError(f"kubectl 失敗: {stderr.decode()}") + return stdout.decode() + + +_velero_client: VeleroClient | None = None + + +def get_velero_client() -> VeleroClient: + global _velero_client + if _velero_client is None: + _velero_client = VeleroClient() + return _velero_client + + +def set_velero_client(client: VeleroClient) -> None: + """測試注入用 (P4 規範)""" + global _velero_client + _velero_client = client diff --git a/k8s/rbac/api-velero-reader.yaml b/k8s/rbac/api-velero-reader.yaml new file mode 100644 index 00000000..47490b88 --- /dev/null +++ b/k8s/rbac/api-velero-reader.yaml @@ -0,0 +1,36 @@ +# k8s/rbac/api-velero-reader.yaml +# API Pod 讀取 Velero backup 資源的 RBAC +# Sprint 5.1 K-001 / 2026-04-08 Asia/Taipei +# 說明: awoooi-executor ServiceAccount 需要讀取 velero namespace 的 backup 資源 +# 用於 Pre-flight Check 查詢最近備份時間(Q7 決策:kubectl 方式) +# 注意: ServiceAccount 名稱為 awoooi-executor(非 awoooi-api,經 L0 確認) +--- 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: awoooi-velero-backup-reader + labels: + app: awoooi + component: api + sprint: "5.1" +rules: + - apiGroups: ["velero.io"] + resources: ["backups"] + verbs: ["get", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: awoooi-velero-backup-reader + labels: + app: awoooi + component: api + sprint: "5.1" +subjects: + - kind: ServiceAccount + name: awoooi-executor + namespace: awoooi-prod +roleRef: + kind: ClusterRole + name: awoooi-velero-backup-reader + apiGroup: rbac.authorization.k8s.io diff --git a/ops/config/service-registry.yaml b/ops/config/service-registry.yaml new file mode 100644 index 00000000..f68ed681 --- /dev/null +++ b/ops/config/service-registry.yaml @@ -0,0 +1,201 @@ +# ops/config/service-registry.yaml +# Service Registry — 服務 Stateful 分級清單 +# 版本: 1.0.0 +# 建立: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei +# 維護: 修改需 PR + 統帥審核,禁止直接 push +# 說明: +# BLOCK = 系統禁止自動修復,僅告警(資料風險最高) +# CRITICAL_HITL = 允許 Playbook,但需 MultiSig 2票 +# STANDARD_HITL = 允許 Playbook,需 1票審核 +# AUTO = 允許自動執行(無狀態服務) +# 參考: ADR-062, ADR-063 + +services: + # ─── BLOCK:系統禁止(連 Playbook 都不提供)──────────────────────────── + - name: postgres + display_name: "PostgreSQL 主庫 (awoooi_prod)" + host: "192.168.0.188" + stateful_level: BLOCK + reason: "主要業務資料庫,重啟可能導致 WAL 截斷、事務回滾" + alert_only: true + containers: ["postgres"] + + - name: momo-db + display_name: "PostgreSQL (momo_db)" + host: "192.168.0.188" + stateful_level: BLOCK + reason: "momo 產品資料庫,禁止自動操作" + alert_only: true + containers: ["momo-db"] + + - name: langfuse-db + display_name: "PostgreSQL (Langfuse)" + host: "192.168.0.110" + stateful_level: BLOCK + reason: "LLM trace 資料庫,重啟導致追蹤資料遺失" + alert_only: true + containers: ["langfuse-db"] + + - name: harbor-db + display_name: "PostgreSQL (Harbor Registry)" + host: "192.168.0.110" + stateful_level: BLOCK + reason: "Harbor Registry 資料庫,重啟可能損壞 image layer 索引" + alert_only: 
true + containers: ["harbor-db"] + + - name: sentry-postgres + display_name: "PostgreSQL (Sentry)" + host: "192.168.0.110" + stateful_level: BLOCK + reason: "Sentry 錯誤追蹤資料庫" + alert_only: true + containers: ["sentry-postgres"] + + - name: signoz-clickhouse + display_name: "ClickHouse (SignOz)" + host: "192.168.0.188" + stateful_level: BLOCK + reason: "列欄式 OLAP 資料庫,寫入中重啟可能損壞列欄檔案" + alert_only: true + containers: ["signoz-clickhouse"] + + # ─── CRITICAL_HITL:高風險,需 MultiSig 2票 ────────────────────────── + - name: redis + display_name: "Redis (AWOOOI)" + host: "192.168.0.188" + stateful_level: CRITICAL_HITL + reason: "AWOOOI 依賴 Redis 做冪等鎖與快取,重啟丟失鎖狀態" + requires_pre_backup: false + containers: ["redis"] + + - name: harbor-redis + display_name: "Redis (Harbor)" + host: "192.168.0.110" + stateful_level: CRITICAL_HITL + reason: "Harbor session 快取" + containers: ["harbor-redis"] + + - name: sentry-redis + display_name: "Redis (Sentry)" + host: "192.168.0.110" + stateful_level: CRITICAL_HITL + reason: "Sentry 任務佇列" + containers: ["sentry-redis"] + + - name: gitea + display_name: "Gitea (程式碼倉庫)" + host: "192.168.0.110" + stateful_level: CRITICAL_HITL + reason: "restart 會殺掉活躍 SSH session,Git push 中斷可能損壞 working copy" + requires_pre_backup: false + containers: ["gitea"] + + - name: harbor + display_name: "Harbor (Container Registry)" + host: "192.168.0.110" + stateful_level: CRITICAL_HITL + reason: "重啟中斷 pull/push;GC 進行中重啟可能損壞 layer" + requires_pre_backup: false + containers: ["harbor-core", "harbor-jobservice", "harbor-portal"] + + - name: minio + display_name: "MinIO (物件存儲)" + host: "192.168.0.188" + stateful_level: CRITICAL_HITL + reason: "寫入中重啟可能導致 multipart upload 中斷" + requires_pre_backup: false + containers: ["minio"] + + # ─── STANDARD_HITL:中風險,需 1票審核 ────────────────────────────── + - name: prometheus + display_name: "Prometheus" + host: "192.168.0.110" + stateful_level: STANDARD_HITL + reason: "有 TSDB WAL,exited 狀態用 docker start(非 restart)" + restart_command: "docker 
start" + containers: ["prometheus"] + + - name: grafana + display_name: "Grafana" + host: "192.168.0.110" + stateful_level: STANDARD_HITL + reason: "有 SQLite 設定儲存,exited 用 docker start" + restart_command: "docker start" + containers: ["grafana"] + + - name: alertmanager + display_name: "Alertmanager" + host: "192.168.0.110" + stateful_level: STANDARD_HITL + reason: "有 silence 狀態,exited 用 docker start" + restart_command: "docker start" + containers: ["alertmanager"] + + # ─── AUTO:無狀態,允許自動修復 ────────────────────────────────────── + - name: nginx + display_name: "Nginx (反向代理)" + host: "192.168.0.110" + stateful_level: AUTO + containers: ["nginx", "nginx-188"] + + - name: awoooi-api + display_name: "AWOOOI API (K3s)" + host: "k3s" + stateful_level: AUTO + containers: [] + + - name: awoooi-web + display_name: "AWOOOI Web (K3s)" + host: "k3s" + stateful_level: AUTO + containers: [] + + - name: blackbox-exporter + display_name: "Blackbox Exporter" + host: "192.168.0.110" + stateful_level: AUTO + containers: ["blackbox-exporter"] + + - name: langfuse + display_name: "Langfuse (LLMOps)" + host: "192.168.0.110" + stateful_level: AUTO + containers: ["langfuse-web", "langfuse-worker"] + + - name: ollama + display_name: "Ollama (Local LLM)" + host: "192.168.0.188" + stateful_level: AUTO + containers: ["ollama"] + + - name: momo-app + display_name: "momo Web App" + host: "192.168.0.188" + stateful_level: AUTO + containers: ["momo-app"] + + - name: tsenyang-website + display_name: "Tsenyang Website" + host: "192.168.0.188" + stateful_level: AUTO + containers: ["tsenyang-website"] + + - name: stock-platform + display_name: "Stock Platform" + host: "192.168.0.110" + stateful_level: AUTO + containers: ["stock-platform"] + +# ─── 備份策略參考 ──────────────────────────────────────────────────────── +backup_policies: + velero_max_age_hours: 4 # Velero 備份過期閾值(Q2 決策) + emergency_backup_timeout: 600 # 緊急備份超時秒數 + block_backup_on_high_io: true # CPU/IO > 80% 時禁止觸發備份(Q4 決策) + 
io_threshold_percent: 80 + +# ─── MultiSig 設定 ─────────────────────────────────────────────────────── +multisig: + critical_required_votes: 2 # CRITICAL_HITL 需要幾票 + standard_required_votes: 1 # STANDARD_HITL 需要幾票 + vote_expiry_minutes: 30 # 投票有效期 diff --git a/scripts/ops/docker-health-monitor.sh b/scripts/ops/docker-health-monitor.sh index 44493ef1..4c2f42a2 100755 --- a/scripts/ops/docker-health-monitor.sh +++ b/scripts/ops/docker-health-monitor.sh @@ -1,11 +1,13 @@ #!/usr/bin/env bash # docker-health-monitor.sh -# Plan A: Docker 容器健康監控 + 自動修復 +# Sprint 5.1 L4-6: 純感知層(偵測→送 Webhook,禁止任何修復動作) # # 部署: cron */5 * * * * /opt/awoooi-ops/docker-health-monitor.sh >> /var/log/docker-health-monitor.log 2>&1 # 設定: /etc/awoooi-ops/secrets.env # 撰寫: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei -# 首席架構師裁示: Intent→Action→Result 三段式,禁止靜默修復 +# 架構裁示: Route B — 腳本只感知,所有修復決策由 AWOOOI API 執行(ADR-062) +# 注意: 禁止在此腳本中執行 docker restart / docker start +# 所有修復動作由 AWOOOI API Guardrail + Playbook + Approval 鏈路處理 set -euo pipefail @@ -19,48 +21,45 @@ fi : "${AWOOOI_API_URL:=https://awoooi.wooo.work}" : "${TELEGRAM_BOT_TOKEN:=}" : "${TELEGRAM_CHAT_ID:=}" -: "${WEBHOOK_HMAC_SECRET:=}" -: "${COOLDOWN_SECONDS:=300}" : "${LOG_FILE:=/var/log/docker-health-monitor.log}" +# 冷卻期:避免同一容器在短時間內重複發送 webhook(去重,非修復冷卻) +: "${SEND_COOLDOWN_SECONDS:=300}" : "${COOLDOWN_DIR:=/tmp/docker-health-monitor-cooldown}" mkdir -p "$COOLDOWN_DIR" -# ─── 排除清單(禁止自動修復)─────────────────────────────────────────────── -# 判斷方式: echo ":list:" | grep -q ":name:" -# 分類一:資料庫 — 禁止 restart -EXCLUDED_DB_LIST=":postgres:momo-db:langfuse-db:harbor-db:sentry-postgres:signoz-clickhouse:" -# 分類二:Redis — 禁止 restart -EXCLUDED_REDIS_LIST=":redis:harbor-redis:sentry-redis:" -# 分類三:監控棧 exited → docker start(保護 WAL) -MONITORING_START_ONLY_LIST=":prometheus:grafana:alertmanager:" -# 分類四:監控棧 其他 → 僅告警 -EXCLUDED_MONITORING_LIST=":blackbox-exporter:signoz-otel-collector:" -# 分類五:關鍵系統 — 永遠禁止(Gitea restart 會殺活躍 SSH) -EXCLUDED_CRITICAL_LIST=":gitea:" - # 
─── 工具函數 ──────────────────────────────────────────────────────────────── log() { echo "[$(date '+%Y-%m-%d %H:%M:%S %z')] $*" } -in_list() { - local name=":${1}:" - local list="$2" - [[ "$list" == *"$name"* ]] +# 發送冷卻期檢查(避免同一容器短時間重複送 webhook) +is_in_send_cooldown() { + local container="$1" + local cooldown_file="${COOLDOWN_DIR}/${container}.cooldown" + if [[ -f "$cooldown_file" ]]; then + local last_sent now elapsed + last_sent=$(cat "$cooldown_file") + now=$(date +%s) + elapsed=$(( now - last_sent )) + if (( elapsed < SEND_COOLDOWN_SECONDS )); then + log "COOLDOWN: ${container} 距上次通知 ${elapsed}s,跳過(冷卻期 ${SEND_COOLDOWN_SECONDS}s)" + return 0 + fi + fi + return 1 } -# 計算 HMAC-SHA256 簽章 -sign_payload() { - local payload="$1" - printf '%s' "$payload" | openssl dgst -sha256 -hmac "$WEBHOOK_HMAC_SECRET" -binary | xxd -p -c 256 +set_send_cooldown() { + local container="$1" + date +%s > "${COOLDOWN_DIR}/${container}.cooldown" } -# 傳送 Telegram(Fallback:AWOOOI API down 時直接呼叫 Bot API) +# Fallback:AWOOOI API down 時直接呼叫 Telegram Bot API send_telegram_direct() { local message="$1" if [[ -z "$TELEGRAM_BOT_TOKEN" || -z "$TELEGRAM_CHAT_ID" ]]; then - log "WARN: Telegram 未設定,跳過通知" + log "WARN: Telegram 未設定,跳過 Fallback" return 0 fi curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \ @@ -69,186 +68,113 @@ send_telegram_direct() { > /dev/null 2>&1 || true } -# 傳送 AWOOOI Webhook(若失敗則 Fallback 至 Telegram Bot API) -send_awoooi_alert() { - local title="$1" - local message="$2" - local severity="${3:-WARNING}" - local source="docker-health-monitor" +# 傳送 Alertmanager 格式 Webhook 到 AWOOOI API +# 使用現有端點 /api/v1/webhooks/alertmanager(內網免 HMAC) +send_to_awoooi() { + local container="$1" + local status="$2" # unhealthy | exited | dead + local hostname + hostname=$(hostname) + local now_ts + now_ts=$(date -u '+%Y-%m-%dT%H:%M:%SZ') + + # 組裝 Alertmanager 格式 JSON(符合現有 AlertmanagerPayload schema) local payload - payload=$(printf 
'{"title":"%s","message":"%s","severity":"%s","source":"%s","labels":{"monitor":"docker-health-monitor","plan":"A"}}' \ - "$title" "$message" "$severity" "$source") - - local timestamp - timestamp=$(date -u +%s) - local signature - signature=$(sign_payload "${timestamp}${payload}") + payload=$(cat </dev/null) || http_code="0" - if [[ "$http_code" != "200" && "$http_code" != "202" ]]; then + if [[ "$http_code" == "200" || "$http_code" == "202" ]]; then + log "SENT: ${container} 狀態=${status} → AWOOOI API (${http_code})" + set_send_cooldown "$container" + else log "WARN: AWOOOI API 回應 ${http_code},Fallback 到 Telegram Bot API" - send_telegram_direct "[docker-health-monitor Fallback] ${title} ${message}" + send_telegram_direct "🚨 [docker-health-monitor Fallback] 主機: ${hostname} 容器: ${container} 狀態: ${status} (API 不可達,請人工處理)" + set_send_cooldown "$container" fi } -# 冷卻期檢查(避免同一容器短時間重複修復) -is_in_cooldown() { - local container="$1" - local cooldown_file="${COOLDOWN_DIR}/${container}.cooldown" - if [[ -f "$cooldown_file" ]]; then - local last_repair - last_repair=$(cat "$cooldown_file") - local now - now=$(date +%s) - local elapsed=$(( now - last_repair )) - if (( elapsed < COOLDOWN_SECONDS )); then - log "COOLDOWN: ${container} 仍在冷卻期 (${elapsed}s / ${COOLDOWN_SECONDS}s)" - return 0 - fi - fi - return 1 -} - -set_cooldown() { - local container="$1" - date +%s > "${COOLDOWN_DIR}/${container}.cooldown" -} - -# ─── 核心:處理不健康容器 ─────────────────────────────────────────────────── -handle_unhealthy_container() { - local container="$1" - local status="$2" # unhealthy | exited | dead +# ─── 核心:掃描所有容器 ───────────────────────────────────────────────────── +check_containers() { local hostname hostname=$(hostname) - log "DETECTED: ${container} 狀態=${status} on ${hostname}" + # 取得所有容器(含停止的) + while IFS=$'\t' read -r container_id container_name state health; do + # 跳過 header 或空行 + [[ -z "$container_name" ]] && continue - # ── 排除清單判斷 ───────────────────────────────────────────────────────── 
- - if in_list "$container" "$EXCLUDED_CRITICAL_LIST"; then - log "SKIP: ${container} 屬於關鍵排除清單 (Gitea),僅告警" - send_awoooi_alert \ - "[${hostname}] 關鍵服務異常: ${container}" \ - "容器 ${container} 狀態=${status}。此服務禁止自動修復,請人工處理!" \ - "CRITICAL" - return - fi - - if in_list "$container" "$EXCLUDED_DB_LIST"; then - log "SKIP: ${container} 屬於資料庫排除清單,僅告警" - send_awoooi_alert \ - "[${hostname}] 資料庫容器異常: ${container}" \ - "容器 ${container} 狀態=${status}。資料庫禁止自動修復,需人工介入!" \ - "CRITICAL" - return - fi - - if in_list "$container" "$EXCLUDED_REDIS_LIST"; then - log "SKIP: ${container} 屬於 Redis 排除清單,僅告警" - send_awoooi_alert \ - "[${hostname}] Redis 容器異常: ${container}" \ - "容器 ${container} 狀態=${status}。Redis 禁止自動修復,需人工介入!" \ - "CRITICAL" - return - fi - - if in_list "$container" "$EXCLUDED_MONITORING_LIST"; then - log "SKIP: ${container} 屬於監控棧排除清單,僅告警" - send_awoooi_alert \ - "[${hostname}] 監控元件異常: ${container}" \ - "容器 ${container} 狀態=${status}。請人工處理。" \ - "WARNING" - return - fi - - # ── 冷卻期判斷 ──────────────────────────────────────────────────────────── - if is_in_cooldown "$container"; then - log "SKIP: ${container} 在冷卻期內,跳過本次修復" - return - fi - - # ── 決定修復動作 ───────────────────────────────────────────────────────── - local action_cmd="docker restart" - local action_desc="docker restart" - if in_list "$container" "$MONITORING_START_ONLY_LIST" && [[ "$status" == "exited" ]]; then - action_cmd="docker start" - action_desc="docker start(保護 WAL,非 restart)" - fi - - # ── Phase 1: Intent(決策意圖通知)────────────────────────────────────── - log "INTENT: 即將對 ${container} 執行 ${action_desc}" - send_awoooi_alert \ - "[${hostname}] 自動修復 Intent: ${container}" \ - "偵測到容器 ${container} 狀態=${status}。即將執行 ${action_desc},2 秒後開始修復。" \ - "WARNING" - - sleep 2 - - # ── Phase 2: Action(執行修復)────────────────────────────────────────── - log "ACTION: 執行 ${action_cmd} ${container}" - set_cooldown "$container" - - local repair_ok=false - if $action_cmd "$container" >> "$LOG_FILE" 2>&1; then - repair_ok=true - fi - - 
# ── Phase 3: Result(執行結果通知)────────────────────────────────────── - if $repair_ok; then - log "RESULT: ${container} 修復成功" - send_awoooi_alert \ - "[${hostname}] 自動修復成功: ${container}" \ - "容器 ${container} 已透過 ${action_desc} 成功恢復。原狀態=${status}。" \ - "INFO" - else - log "RESULT: ${container} 修復失敗!需人工介入" - send_awoooi_alert \ - "[${hostname}] 自動修復失敗: ${container}" \ - "容器 ${container} 執行 ${action_desc} 失敗!原狀態=${status}。需人工介入!" \ - "CRITICAL" - fi -} - -# ─── 主流程 ────────────────────────────────────────────────────────────────── -main() { - log "===== docker-health-monitor 啟動 (host=$(hostname)) =====" - - # 取得所有容器狀態 - # docker ps -a 格式: Names / Health / State - while IFS=$'\t' read -r name health_status container_status; do - [[ -z "$name" ]] && continue - - local needs_repair=false + local needs_alert=false local detected_status="" - if [[ "$health_status" == "unhealthy" ]]; then - needs_repair=true + # 偵測 exited / dead + if [[ "$state" == "exited" || "$state" == "dead" ]]; then + needs_alert=true + detected_status="$state" + fi + + # 偵測 unhealthy(health check 存在且失敗) + if [[ "$health" == "unhealthy" ]]; then + needs_alert=true detected_status="unhealthy" - elif [[ "$container_status" == "exited" || "$container_status" == "dead" ]]; then - needs_repair=true - detected_status="$container_status" - elif [[ "$health_status" == "starting" ]]; then - log "INFO: ${name} health=starting,等待中跳過" - continue fi - if $needs_repair; then - handle_unhealthy_container "$name" "$detected_status" + if $needs_alert; then + log "DETECTED: ${container_name} 狀態=${detected_status} on ${hostname}" + + # 冷卻期去重 + if is_in_send_cooldown "$container_name"; then + continue + fi + + # 送 Webhook — 只感知,不修復 + send_to_awoooi "$container_name" "$detected_status" fi + done < <(docker ps -a --format '{{.ID}}\t{{.Names}}\t{{.State}}\t{{.Status}}' | \ + awk -F'\t' '{ + health = "" + if ($4 ~ /\(unhealthy\)/) health = "unhealthy" + else if ($4 ~ /\(healthy\)/) health = "healthy" + print $1 "\t" $2 "\t" $3 
"\t" health + }') +} - done < <(docker ps -a --format '{{.Names}} {{.Health}} {{.State}}' 2>/dev/null) - - log "===== docker-health-monitor 完成 =====" +# ─── Main ─────────────────────────────────────────────────────────────────── +main() { + log "=== docker-health-monitor 感知層啟動 (純感知,禁止修復) ===" + check_containers + log "=== 掃描完成 ===" } main "$@"