feat(sprint5.1): Data Safety Guardrails 全鏈路整合 (L1-L5)
Layer 0 - K8s RBAC: - k8s/rbac/api-velero-reader.yaml: awoooi-executor SA Velero backup reader Layer 1 - DB Migration (已在 188 執行): - M-002: approval_records 新增 approval_level/votes/required_votes - M-003: alert_event_type ENUM 新增 8 個值 Layer 2 - IaC: - ops/config/service-registry.yaml: 全服務 Stateful 分級清單 (BLOCK/CRITICAL_HITL/STANDARD_HITL/AUTO) Layer 3 - Python Services: - service_registry.py: 讀取 YAML,提供 is_blocked/requires_multisig/get_required_votes - velero_client.py: kubectl 查詢 Velero 備份年齡,失敗 fallback 999h - preflight_service.py: Pre-flight 安全檢查 (Q2/Q4 決策) Layer 1-M001 - Playbook model: - playbook.py: 新增 requires_approval_level/stateful_targets/requires_pre_backup Layer 4 - 業務邏輯: - alert_operation_log_repository.py: 新增 8 個 event_type (Guardrail/Pre-flight/MultiSig/備份) - auto_repair_service.py: 注入 Service Registry Guardrail 檢查 (BLOCK → 直接拒絕) - webhooks.py: ALERT_RECEIVED 溯源記錄 + auto_repair flag Q9 + Langfuse trace_id Q10 - db/models.py: ApprovalRecord 同步 approval_level/votes/required_votes 欄位 - docker-health-monitor.sh: 純感知層改造(移除所有 docker restart 邏輯) Layer 5 - Telegram 通知: - telegram_gateway.py: T1-T6 六個新通知方法 (Guardrail/Pre-flight/Backup/MultiSig/ChangeApplied) 參考: ADR-062 Data Safety Guardrails, ADR-063 Service Registry IaC Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
18
apps/api/migrations/sprint51_alert_log_events.sql
Normal file
18
apps/api/migrations/sprint51_alert_log_events.sql
Normal file
@@ -0,0 +1,18 @@
|
||||
-- apps/api/migrations/sprint51_alert_log_events.sql
-- Sprint 5.1 M-003: extend the alert_event_type ENUM used by alert_operation_log.
-- Author: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
-- WARNING: enum values cannot be removed once added; confirm a backup exists first.
-- Purpose: add 8 event types for Guardrail / Pre-flight / MultiSig / backup tracking.
--
-- Deliberately NOT wrapped in BEGIN/COMMIT: PostgreSQL 11 and earlier reject
-- ALTER TYPE ... ADD VALUE inside a transaction block, and on newer versions the
-- new values are unusable until the transaction commits anyway. Every statement
-- is idempotent (IF NOT EXISTS), so re-running after a partial failure is safe.

ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'GUARDRAIL_BLOCKED';
ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'PRE_FLIGHT_PASSED';
ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'PRE_FLIGHT_FAILED';
ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'BACKUP_TRIGGERED';
ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'BACKUP_COMPLETED';
ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'BACKUP_FAILED';
ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'APPROVAL_ESCALATED';
ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'CHANGE_APPLIED';
|
||||
31
apps/api/migrations/sprint51_approval_multisig.sql
Normal file
31
apps/api/migrations/sprint51_approval_multisig.sql
Normal file
@@ -0,0 +1,31 @@
|
||||
-- apps/api/migrations/sprint51_approval_multisig.sql
-- Sprint 5.1 M-002: MultiSig (two-approver) support.
-- Author: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
-- Purpose: add approval_level / approval_votes / required_votes to approval_records.
-- The whole file is idempotent and safe to re-run on a database where an earlier
-- revision of this migration was already applied.

BEGIN;

ALTER TABLE approval_records
    ADD COLUMN IF NOT EXISTS approval_level VARCHAR(20)
        DEFAULT 'standard'
        CHECK (approval_level IN ('standard', 'critical')),
    ADD COLUMN IF NOT EXISTS approval_votes JSONB
        DEFAULT '[]'::jsonb,
    ADD COLUMN IF NOT EXISTS required_votes INTEGER
        DEFAULT 1;

COMMENT ON COLUMN approval_records.approval_level IS
    'standard=1票審核, critical=2票MultiSig';
COMMENT ON COLUMN approval_records.approval_votes IS
    'JSON array: [{"user_id": "123", "voted_at": "2026-04-08T...", "action": "approve"}]';
COMMENT ON COLUMN approval_records.required_votes IS
    'standard=1, critical=2';

-- Backfill existing rows (backward compatibility).
-- Each column is filled independently so that a value already present in one
-- column is never overwritten just because another column is NULL.
UPDATE approval_records
SET approval_level = COALESCE(approval_level, 'standard'),
    required_votes = COALESCE(required_votes, 1),
    approval_votes = COALESCE(approval_votes, '[]'::jsonb)
WHERE approval_level IS NULL
   OR required_votes IS NULL
   OR approval_votes IS NULL;

-- The ORM model declares all three columns as non-nullable; enforce the same
-- invariant in the database now that no NULLs remain. SET NOT NULL is a no-op
-- when the constraint is already in place.
ALTER TABLE approval_records
    ALTER COLUMN approval_level SET NOT NULL,
    ALTER COLUMN approval_votes SET NOT NULL,
    ALTER COLUMN required_votes SET NOT NULL;

COMMIT;
|
||||
@@ -218,7 +218,9 @@ async def _try_auto_repair_background(
|
||||
)
|
||||
return
|
||||
|
||||
# 記錄自動修復觸發
|
||||
# 記錄自動修復觸發 (Sprint 5.1 Q10: 加入 Langfuse trace_id 追蹤)
|
||||
# (2026-04-08 Claude Sonnet 4.6 Asia/Taipei,ADR-062)
|
||||
_langfuse_trace_id = getattr(incident, "langfuse_trace_id", None)
|
||||
await op_log.append(
|
||||
"AUTO_REPAIR_TRIGGERED",
|
||||
incident_id=incident_id,
|
||||
@@ -231,6 +233,11 @@ async def _try_auto_repair_background(
|
||||
"playbook_name": decision.playbook.name,
|
||||
"similarity_score": decision.similarity_score,
|
||||
"risk_level": decision.risk_level.value if decision.risk_level else None,
|
||||
"langfuse_trace_id": _langfuse_trace_id,
|
||||
"langfuse_url": (
|
||||
f"http://192.168.0.110:3100/trace/{_langfuse_trace_id}"
|
||||
if _langfuse_trace_id else None
|
||||
),
|
||||
},
|
||||
)
|
||||
|
||||
@@ -1084,6 +1091,31 @@ async def alertmanager_webhook(
|
||||
alert = firing_alerts[0]
|
||||
alert_id = f"alert-{now_taipei().strftime('%Y%m%d%H%M%S')}"
|
||||
|
||||
# ==========================================================================
|
||||
# Sprint 5.1 L4-2: ALERT_RECEIVED 溯源記錄 + auto_repair flag 讀取
|
||||
# (2026-04-08 Claude Sonnet 4.6 Asia/Taipei,ADR-062 Q9)
|
||||
# ==========================================================================
|
||||
_alert_labels = alert.labels or {}
|
||||
_alertname_for_log = _alert_labels.get("alertname", "UnknownAlert")
|
||||
# Q9: auto_repair flag — Rule=false 強制 HITL(不觸發自動修復背景任務)
|
||||
_can_auto_repair_by_rule = _alert_labels.get("auto_repair", "true").lower() == "true"
|
||||
try:
|
||||
_op_log = get_alert_operation_log_repository()
|
||||
await _op_log.append(
|
||||
"ALERT_RECEIVED",
|
||||
actor="alertmanager",
|
||||
action_detail=f"收到告警: {_alertname_for_log}",
|
||||
context={
|
||||
"source": "alertmanager",
|
||||
"alert_id": alert_id,
|
||||
"alertname": _alertname_for_log,
|
||||
"labels": _alert_labels,
|
||||
"auto_repair_flag": _can_auto_repair_by_rule,
|
||||
},
|
||||
)
|
||||
except Exception as _log_err:
|
||||
logger.warning("alert_received_log_failed", error=str(_log_err))
|
||||
|
||||
# ==========================================================================
|
||||
# Alert Normalizer: 轉換 Alertmanager 格式 → AWOOOI AlertPayload
|
||||
# ==========================================================================
|
||||
@@ -1326,15 +1358,31 @@ async def alertmanager_webhook(
|
||||
# 2026-04-05 ogt: 自動修復評估 (ADR-058 閉環)
|
||||
# Incident 建立後立即評估是否可自動修復
|
||||
# P2 以下 + 高品質 Playbook + 低風險 → 背景自動執行
|
||||
# Sprint 5.1 Q9: auto_repair=false 旗標 → 強制 HITL,不觸發背景任務
|
||||
# (2026-04-08 Claude Sonnet 4.6 Asia/Taipei,ADR-062)
|
||||
# ================================================================
|
||||
background_tasks.add_task(
|
||||
_try_auto_repair_background,
|
||||
incident_id=incident_id,
|
||||
approval_id=str(approval.id),
|
||||
alert_type=alert_type,
|
||||
target_resource=target_resource,
|
||||
namespace=namespace,
|
||||
)
|
||||
if _can_auto_repair_by_rule:
|
||||
background_tasks.add_task(
|
||||
_try_auto_repair_background,
|
||||
incident_id=incident_id,
|
||||
approval_id=str(approval.id),
|
||||
alert_type=alert_type,
|
||||
target_resource=target_resource,
|
||||
namespace=namespace,
|
||||
)
|
||||
else:
|
||||
# auto_repair=false → 記錄 GUARDRAIL_BLOCKED,不觸發自動修復
|
||||
_op_log_rule = get_alert_operation_log_repository()
|
||||
background_tasks.add_task(
|
||||
_op_log_rule.append,
|
||||
"GUARDRAIL_BLOCKED",
|
||||
incident_id=incident_id,
|
||||
approval_id=str(approval.id),
|
||||
actor="prometheus-rule",
|
||||
action_detail=f"Prometheus rule 設定 auto_repair=false,強制人工審核: {alertname}",
|
||||
success=False,
|
||||
context={"alertname": alertname, "auto_repair_flag": False},
|
||||
)
|
||||
|
||||
# 推送 Telegram
|
||||
background_tasks.add_task(
|
||||
|
||||
@@ -124,6 +124,26 @@ class ApprovalRecord(Base):
|
||||
comment="Last time this alert pattern was seen",
|
||||
)
|
||||
|
||||
# Sprint 5.1 MultiSig 雙簽核支援 (2026-04-08 Claude Sonnet 4.6 Asia/Taipei,ADR-062 Q3)
|
||||
approval_level: Mapped[str] = mapped_column(
|
||||
String(20),
|
||||
default="standard",
|
||||
nullable=False,
|
||||
comment="standard=1票審核, critical=2票MultiSig",
|
||||
)
|
||||
approval_votes: Mapped[list[dict[str, Any]]] = mapped_column(
|
||||
JSON,
|
||||
default=list,
|
||||
nullable=False,
|
||||
comment="[{user_id, voted_at, action}]",
|
||||
)
|
||||
required_votes: Mapped[int] = mapped_column(
|
||||
Integer,
|
||||
default=1,
|
||||
nullable=False,
|
||||
comment="standard=1, critical=2",
|
||||
)
|
||||
|
||||
# 2026-04-06 ogt: Phase 26 — 關聯 Incident ID
|
||||
# Playbook 萃取和 KM 寫入必須知道 incident_id,不能靠文字解析
|
||||
incident_id: Mapped[str | None] = mapped_column(
|
||||
|
||||
@@ -212,6 +212,20 @@ class Playbook(BaseModel):
|
||||
tags: list[str] = Field(default_factory=list, description="標籤")
|
||||
notes: str | None = Field(None, description="人工補充說明")
|
||||
|
||||
# === Sprint 5.1 資料安全護欄 (2026-04-08 Claude Sonnet 4.6 Asia/Taipei) ===
|
||||
requires_approval_level: str = Field(
|
||||
default="auto",
|
||||
description="auto=直接執行, standard=1票, critical=2票MultiSig(由 Service Registry 決定)",
|
||||
)
|
||||
stateful_targets: list[str] = Field(
|
||||
default_factory=list,
|
||||
description="此 Playbook 操作的 Stateful 服務清單,對應 service-registry.yaml",
|
||||
)
|
||||
requires_pre_backup: bool = Field(
|
||||
default=False,
|
||||
description="執行前是否需要 Pre-flight 備份檢查",
|
||||
)
|
||||
|
||||
# === 時間軸 ===
|
||||
created_at: datetime = Field(default_factory=now_taipei)
|
||||
updated_at: datetime = Field(default_factory=now_taipei)
|
||||
|
||||
@@ -22,7 +22,9 @@ from src.db.models import AlertOperationLog
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
# 合法的 event_type 值 (對應 DB ENUM)
|
||||
# Sprint 5.1 新增 8 個 (2026-04-08 Claude Sonnet 4.6 Asia/Taipei)
|
||||
ALERT_EVENT_TYPES = {
|
||||
# 原有 10 個
|
||||
"ALERT_RECEIVED",
|
||||
"TELEGRAM_SENT",
|
||||
"USER_ACTION",
|
||||
@@ -33,6 +35,15 @@ ALERT_EVENT_TYPES = {
|
||||
"RESOLVED",
|
||||
"SILENCED",
|
||||
"ESCALATED",
|
||||
# Sprint 5.1 Guardrail / Pre-flight / MultiSig / 備份追蹤
|
||||
"GUARDRAIL_BLOCKED",
|
||||
"PRE_FLIGHT_PASSED",
|
||||
"PRE_FLIGHT_FAILED",
|
||||
"BACKUP_TRIGGERED",
|
||||
"BACKUP_COMPLETED",
|
||||
"BACKUP_FAILED",
|
||||
"APPROVAL_ESCALATED",
|
||||
"CHANGE_APPLIED",
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -192,6 +192,32 @@ class AutoRepairService:
|
||||
blocked_by="GLOBAL_GUARDRAIL",
|
||||
)
|
||||
|
||||
# 0.5 Sprint 5.1 Guardrail: Service Registry 服務分級檢查
|
||||
# (2026-04-08 Claude Sonnet 4.6 Asia/Taipei,ADR-062)
|
||||
# 全域熔斷之後、嚴重度之前,BLOCK 等級直接拒絕
|
||||
try:
|
||||
from src.services.service_registry import StatefulLevel, get_service_registry
|
||||
_registry = get_service_registry()
|
||||
_service_name = (incident.target_resource or "") if hasattr(incident, "target_resource") else ""
|
||||
if not _service_name and incident.affected_services:
|
||||
_service_name = incident.affected_services[0]
|
||||
_stateful_level = _registry.get_stateful_level(_service_name)
|
||||
if _stateful_level == StatefulLevel.BLOCK:
|
||||
logger.warning(
|
||||
"auto_repair_blocked_guardrail",
|
||||
incident_id=incident.incident_id,
|
||||
service_name=_service_name,
|
||||
stateful_level="BLOCK",
|
||||
)
|
||||
return AutoRepairDecision(
|
||||
can_auto_repair=False,
|
||||
reason=f"GUARDRAIL_BLOCK: 服務 '{_service_name}' 屬於禁止自動修復清單(資料安全,見 service-registry.yaml)",
|
||||
blocked_by="SERVICE_REGISTRY_BLOCK",
|
||||
)
|
||||
except Exception as _guardrail_err:
|
||||
logger.error("guardrail_check_failed", error=str(_guardrail_err))
|
||||
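# NOTE: this is fail-open, not conservative — if the guardrail check itself errors, evaluation continues and the error is only logged.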
|
||||
|
||||
# 1. 檢查 Incident 嚴重度
|
||||
if incident.severity and incident.severity.value in ["P0", "P1"]:
|
||||
logger.info(
|
||||
|
||||
116
apps/api/src/services/preflight_service.py
Normal file
116
apps/api/src/services/preflight_service.py
Normal file
@@ -0,0 +1,116 @@
|
||||
# apps/api/src/services/preflight_service.py
|
||||
# Pre-flight 安全檢查服務 (Q2/Q4 決策)
|
||||
# 撰寫: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
|
||||
# 架構: leWOOOgo 積木化,依賴 ServiceRegistryClient + VeleroClient
|
||||
# 參考: ADR-062
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
|
||||
from .service_registry import ServiceRegistryClient, get_service_registry
|
||||
from .velero_client import VeleroClient, get_velero_client
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PreflightResult(str, Enum):
    """Outcome of a pre-flight safety check (string-valued enum)."""

    # Backup is recent enough; the repair may proceed.
    PASS = "PASS"
    # Latest backup is older than the configured maximum age.
    ABORT_BACKUP_EXPIRED = "ABORT_BACKUP_EXPIRED"
    # The alert itself signals CPU/IO pressure, so no backup is triggered.
    ABORT_HIGH_IO = "ABORT_HIGH_IO"
    # The service does not require a pre-flight check.
    SKIP = "SKIP"
|
||||
|
||||
|
||||
@dataclass
class PreflightReport:
    """Result record returned by a pre-flight check."""

    # Overall verdict of the check.
    result: PreflightResult
    # Age of the newest backup in hours; None when no age was looked up.
    backup_age_hours: float | None = None
    # Name of the emergency backup that was started, if any.
    backup_name_triggered: str | None = None
    # Human-readable explanation of the verdict.
    reason: str = ""
|
||||
|
||||
|
||||
class PreflightService:
    """
    Pre-flight safety check run before a repair touches a stateful service.

    - Only services whose registry entry has requires_pre_backup=True are checked.
    - Stale backup -> abort and start an emergency backup (not awaited to completion).
    - CPU/IO high-load alerts -> abort without triggering a backup (decision Q4).
    """

    # Keywords matched as substrings of the lower-cased alert name.
    _LOAD_SUBSTRINGS = ("cpu", "disk", "load", "memory")
    # "io" is far too short for substring matching (it occurs inside
    # "replication", "connection", "version", ...), so IO is matched only as a
    # whole word after splitting the alert name into tokens.
    _IO_TOKENS = frozenset({"io", "iops", "iowait"})

    def __init__(
        self,
        registry: ServiceRegistryClient | None = None,
        velero: VeleroClient | None = None,
    ) -> None:
        """Use the injected collaborators, falling back to the module singletons."""
        self._registry = registry or get_service_registry()
        self._velero = velero or get_velero_client()

    async def check(
        self,
        service_name: str,
        alert_labels: dict | None = None,
    ) -> PreflightReport:
        """
        Run the pre-flight check for ``service_name``.

        Args:
            service_name: Name looked up in the service registry.
            alert_labels: Prometheus alert labels; only ``alertname`` is read,
                to decide whether the alert is a CPU/IO high-load alert.

        Returns:
            A PreflightReport with result SKIP, ABORT_HIGH_IO, PASS or
            ABORT_BACKUP_EXPIRED.
        """
        info = self._registry.get_service(service_name)
        if info is None or not info.requires_pre_backup:
            return PreflightReport(result=PreflightResult.SKIP, reason="服務不需要 Pre-flight")

        # Q4: never start a backup while the alert itself reports CPU/IO pressure.
        if self._is_high_io_alert(alert_labels):
            logger.warning(f"Pre-flight: {service_name} 屬於 CPU/IO 高負載告警,跳過備份觸發")
            return PreflightReport(
                result=PreflightResult.ABORT_HIGH_IO,
                reason="告警類型為 CPU/IO 高負載,禁止觸發備份(Q4 決策)",
            )

        policies = self._registry.get_backup_policies()
        max_age = policies.get("velero_max_age_hours", 4)

        age = await self._velero.get_latest_backup_age_hours()
        if age <= max_age:
            return PreflightReport(
                result=PreflightResult.PASS,
                backup_age_hours=age,
                # The comparison is inclusive, so the message says "<=".
                reason=f"備份時間正常 ({age:.1f}h <= {max_age}h)",
            )

        # Backup is stale: start an emergency backup and abort the repair.
        backup_name = f"emergency-preflight-{int(time.time())}"
        triggered = await self._velero.trigger_emergency_backup(backup_name)
        return PreflightReport(
            result=PreflightResult.ABORT_BACKUP_EXPIRED,
            backup_age_hours=age,
            backup_name_triggered=backup_name if triggered else None,
            reason=(
                f"備份過期 ({age:.1f}h > {max_age}h)。"
                f"{'緊急備份已啟動: ' + backup_name if triggered else '緊急備份啟動失敗,請人工處理'}"
            ),
        )

    def _is_high_io_alert(self, labels: dict | None) -> bool:
        """
        Return True when the ``alertname`` label looks like a CPU/IO/disk/load/memory alert.

        A missing label set, a missing ``alertname`` or a None value all yield False.
        """
        import re  # local import: keeps this block self-contained

        if not labels:
            return False
        alert_name = str(labels.get("alertname") or "")
        lowered = alert_name.lower()
        if any(keyword in lowered for keyword in self._LOAD_SUBSTRINGS):
            return True
        # Split CamelCase / snake_case / kebab-case into words, keeping acronym
        # runs such as "IO" in "HostIOWait" as their own token.
        tokens = re.findall(r"[A-Z]+(?![a-z])|[A-Z]?[a-z]+|\d+", alert_name)
        return any(token.lower() in self._IO_TOKENS for token in tokens)
|
||||
|
||||
|
||||
# Module-level singleton, created lazily on first access.
_preflight_service: PreflightService | None = None


def get_preflight_service() -> PreflightService:
    """Return the shared PreflightService, constructing it on first use."""
    global _preflight_service
    if _preflight_service is not None:
        return _preflight_service
    _preflight_service = PreflightService()
    return _preflight_service
|
||||
|
||||
|
||||
def set_preflight_service(service: PreflightService) -> None:
    """Replace the shared PreflightService instance (test injection hook, P4 convention)."""
    globals()["_preflight_service"] = service
|
||||
124
apps/api/src/services/service_registry.py
Normal file
124
apps/api/src/services/service_registry.py
Normal file
@@ -0,0 +1,124 @@
|
||||
# apps/api/src/services/service_registry.py
|
||||
# Service Registry Client — 讀取 ops/config/service-registry.yaml
|
||||
# 撰寫: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
|
||||
# 架構: leWOOOgo 積木化,純 Service 層,無 Router/DB 依賴
|
||||
# 參考: ADR-062, ADR-063
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import yaml
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# YAML 路徑(相對於 repo root)
|
||||
_DEFAULT_REGISTRY_PATH = Path(__file__).parents[5] / "ops" / "config" / "service-registry.yaml"
|
||||
|
||||
|
||||
class StatefulLevel(str, Enum):
    """Protection tier assigned to a service in the registry (string-valued enum)."""

    # Automation forbidden; alert only.
    BLOCK = "BLOCK"
    # Requires two MultiSig votes.
    CRITICAL_HITL = "CRITICAL_HITL"
    # Requires one vote.
    STANDARD_HITL = "STANDARD_HITL"
    # May be executed automatically.
    AUTO = "AUTO"
|
||||
|
||||
|
||||
class ServiceInfo:
    """One service entry parsed from service-registry.yaml."""

    def __init__(self, data: dict[str, Any]) -> None:
        """
        Build the entry from its YAML mapping.

        Raises:
            KeyError: if ``name`` is missing.
            ValueError: if ``stateful_level`` is not a known StatefulLevel.
        """
        self.name: str = data["name"]
        self.display_name: str = data.get("display_name", self.name)
        self.host: str = data.get("host", "unknown")
        # Accept "block" / " Block " as well as "BLOCK": a casing slip in the
        # YAML must not make a protected entry unparseable.
        raw_level = data.get("stateful_level", "AUTO")
        self.stateful_level: StatefulLevel = StatefulLevel(str(raw_level).strip().upper())
        self.reason: str = data.get("reason", "")
        self.alert_only: bool = data.get("alert_only", False)
        self.requires_pre_backup: bool = data.get("requires_pre_backup", False)
        self.restart_command: str = data.get("restart_command", "docker restart")
        # An empty `containers:` key parses to None; normalise it to an empty
        # list so callers can always iterate.
        self.containers: list[str] = list(data.get("containers") or [])
|
||||
|
||||
|
||||
class ServiceRegistryClient:
    """
    Read-only client for ops/config/service-registry.yaml.

    Answers stateful-level queries per service. By design a load failure or an
    unknown service falls back to AUTO so that the guardrail never blocks the
    alert pipeline itself.
    """

    def __init__(self, registry_path: Path | None = None) -> None:
        """Remember the registry path; the file is read lazily on first query."""
        self._path = registry_path or _DEFAULT_REGISTRY_PATH
        self._services: dict[str, ServiceInfo] = {}
        self._backup_policies: dict[str, Any] = {}
        self._multisig_config: dict[str, Any] = {}
        self._loaded = False

    def _load(self) -> None:
        """Parse the registry file once; later calls are no-ops, even after a failure."""
        if self._loaded:
            return
        # Set first so a broken file is not re-read on every lookup.
        self._loaded = True
        try:
            # The registry contains non-ASCII text; do not rely on the platform default encoding.
            with open(self._path, encoding="utf-8") as f:
                data = yaml.safe_load(f) or {}
            if not isinstance(data, dict):
                raise ValueError("top-level YAML node is not a mapping")
        except Exception as e:
            logger.error(f"Service Registry 載入失敗: {e},所有服務 fallback AUTO")
            return

        parsed: list[ServiceInfo] = []
        for raw in data.get("services") or []:
            try:
                parsed.append(ServiceInfo(raw))
            except Exception as e:
                # One malformed entry must not drop every entry after it.
                logger.error(f"Service Registry: skipping malformed entry {raw!r}: {e}")

        # Index by service name first, then by container name. Container
        # aliases never displace a real service name or an earlier alias.
        for info in parsed:
            self._services[info.name] = info
        for info in parsed:
            for container in info.containers:
                self._services.setdefault(container, info)

        self._backup_policies = data.get("backup_policies") or {}
        self._multisig_config = data.get("multisig") or {}
        logger.info(f"Service Registry 載入完成: {len(parsed)} 個服務")

    def get_service(self, name: str) -> ServiceInfo | None:
        """Return the entry registered under a service or container name, or None."""
        self._load()
        return self._services.get(name)

    def get_stateful_level(self, service_name: str) -> StatefulLevel:
        """Return the service's protection tier; unknown services fall back to AUTO."""
        info = self.get_service(service_name)
        if info is None:
            logger.warning(f"未知服務 '{service_name}',fallback AUTO")
            return StatefulLevel.AUTO
        return info.stateful_level

    def is_blocked(self, service_name: str) -> bool:
        """True when the service is in the BLOCK tier."""
        return self.get_stateful_level(service_name) == StatefulLevel.BLOCK

    def requires_multisig(self, service_name: str) -> bool:
        """True when the service is in the CRITICAL_HITL tier."""
        return self.get_stateful_level(service_name) == StatefulLevel.CRITICAL_HITL

    def get_required_votes(self, service_name: str) -> int:
        """Return the configured vote count: critical (default 2) for CRITICAL_HITL, else standard (default 1)."""
        self._load()
        level = self.get_stateful_level(service_name)
        if level == StatefulLevel.CRITICAL_HITL:
            return self._multisig_config.get("critical_required_votes", 2)
        return self._multisig_config.get("standard_required_votes", 1)

    def get_backup_policies(self) -> dict[str, Any]:
        """Return the ``backup_policies`` section of the registry (empty if absent)."""
        self._load()
        return self._backup_policies

    def get_restart_command(self, service_name: str) -> str:
        """Return the service's restart command, defaulting to "docker restart"."""
        info = self.get_service(service_name)
        return info.restart_command if info else "docker restart"
|
||||
|
||||
|
||||
# Module-level singleton, created lazily on first access.
_registry_client: ServiceRegistryClient | None = None


def get_service_registry() -> ServiceRegistryClient:
    """Return the shared ServiceRegistryClient, constructing it on first use."""
    global _registry_client
    if _registry_client is not None:
        return _registry_client
    _registry_client = ServiceRegistryClient()
    return _registry_client
|
||||
|
||||
|
||||
def set_service_registry(client: ServiceRegistryClient) -> None:
    """Replace the shared ServiceRegistryClient instance (test injection hook, P4 convention)."""
    globals()["_registry_client"] = client
|
||||
@@ -2654,6 +2654,128 @@ class TelegramGateway:
|
||||
f"⚠️ 重診觸發失敗: {html.escape(str(e)[:100])}"
|
||||
)
|
||||
|
||||
# =========================================================================
|
||||
# Sprint 5.1 T1-T6: Data Safety Guardrail 通知場景
|
||||
# (2026-04-08 Claude Sonnet 4.6 Asia/Taipei,ADR-062)
|
||||
# =========================================================================
|
||||
|
||||
async def send_guardrail_blocked(
    self,
    service_name: str,
    alertname: str,
    reason: str,
) -> None:
    """T1: GUARDRAIL_BLOCKED — notify that auto-repair was refused for a protected service."""
    divider = "━━━━━━━━━━━━━━━━━"
    # All caller-supplied values are HTML-escaped before being embedded.
    rows = [
        "🚫 <b>[服務保護] 自動修復已阻擋</b>",
        divider,
        f"服務: <code>{html.escape(service_name)}</code>",
        f"告警: <code>{html.escape(alertname)}</code>",
        f"原因: {html.escape(reason)}",
        divider,
        "⚠️ 請人工評估並手動處理",
    ]
    await self.send_notification("\n".join(rows))
|
||||
|
||||
async def send_preflight_failed(
    self,
    service_name: str,
    backup_age_hours: float,
    max_age_hours: float,
    backup_name: str | None,
) -> None:
    """T2: PRE_FLIGHT_FAILED + BACKUP_TRIGGERED — backup is stale, repair is paused.

    ``backup_name`` is the emergency backup that was started; a falsy value
    means starting it failed.
    """
    if backup_name:
        backup_line = f"緊急備份: 已啟動 <code>{html.escape(backup_name)}</code>"
    else:
        backup_line = "緊急備份: <b>啟動失敗</b>,請人工處理"

    divider = "━━━━━━━━━━━━━━━━━"
    rows = [
        "⏸ <b>[Pre-flight 阻擋] 備份已過期,修復暫停</b>",
        divider,
        f"服務: <code>{html.escape(service_name)}</code>",
        f"備份距今: {backup_age_hours:.1f} 小時(上限 {max_age_hours:.0f} 小時)",
        backup_line,
        divider,
        "請等待備份完成後,人工重新評估修復方案",
    ]
    await self.send_notification("\n".join(rows))
|
||||
|
||||
async def send_backup_result(
    self,
    backup_name: str,
    success: bool,
    error_msg: str | None = None,
) -> None:
    """T3: BACKUP_COMPLETED / BACKUP_FAILED — report the emergency backup outcome."""
    backup_row = f"備份: <code>{html.escape(backup_name)}</code>"
    if success:
        rows = [
            "✅ <b>緊急備份完成</b>",
            backup_row,
            "可繼續手動執行修復",
        ]
    else:
        # A missing or empty error message is shown as a generic placeholder.
        detail = html.escape(error_msg or "未知錯誤")
        rows = [
            "❌ <b>緊急備份失敗</b>",
            backup_row,
            f"錯誤: {detail}",
            "請人工介入,備份異常",
        ]
    await self.send_notification("\n".join(rows))
|
||||
|
||||
async def send_multisig_waiting(
    self,
    action: str,
    service_name: str,
    votes_received: int,
    votes_required: int,
    approval_id: str,
) -> None:
    """T4: APPROVAL_ESCALATED — some votes are in; announce which vote is awaited next.

    The awaited vote number is derived from ``votes_received`` instead of being
    fixed at 2, so the message stays correct when more than two votes are
    required. For the common 1-of-2 case the text is unchanged.
    """
    next_vote = votes_received + 1
    # Keep the original wording for the second approver; use a neutral phrase otherwise.
    closing = "請第二位審核者登入確認" if next_vote == 2 else "請下一位審核者登入確認"
    divider = "━━━━━━━━━━━━━━━━━"
    rows = [
        f"🔐 <b>[MultiSig] 等待第 {next_vote} 票授權</b>",
        divider,
        f"操作: {html.escape(action)}",
        f"服務: <code>{html.escape(service_name)}</code>",
        "風險: CRITICAL(HITL 雙簽)",
        f"已獲授權: {votes_received}/{votes_required} 票",
        f"審核 ID: <code>{html.escape(approval_id)}</code>",
        divider,
        closing,
    ]
    await self.send_notification("\n".join(rows))
|
||||
|
||||
async def send_multisig_approved(
    self,
    action: str,
    service_name: str,
    votes_required: int = 2,
) -> None:
    """T5: MultiSig complete — every required vote has been collected.

    Args:
        action: Description of the approved operation (HTML-escaped here).
        service_name: Target service (HTML-escaped here).
        votes_required: Vote threshold that was reached. Defaults to 2, which
            reproduces the previous fixed "2/2" text.
    """
    rows = [
        "✅ <b>[MultiSig 完成] 雙簽授權通過</b>",
        f"操作: {html.escape(action)}",
        f"服務: <code>{html.escape(service_name)}</code>",
        f"授權: {votes_required}/{votes_required} 票 開始執行...",
    ]
    await self.send_notification("\n".join(rows))
|
||||
|
||||
async def send_change_applied(
    self,
    operator: str,
    action_description: str,
    timestamp: str,
) -> None:
    """T6: CHANGE_APPLIED — record that a manual change was made."""
    # Every field comes from the caller and is HTML-escaped before sending.
    who, what, when = (html.escape(value) for value in (operator, action_description, timestamp))
    rows = [
        "📝 <b>[變更記錄] 手動操作已記錄</b>",
        "━━━━━━━━━━━━━━━━━",
        f"操作者: {who}",
        f"動作: {what}",
        f"時間: {when}",
    ]
    await self.send_notification("\n".join(rows))
|
||||
|
||||
async def send_notification(
|
||||
self,
|
||||
text: str,
|
||||
|
||||
113
apps/api/src/services/velero_client.py
Normal file
113
apps/api/src/services/velero_client.py
Normal file
@@ -0,0 +1,113 @@
|
||||
# apps/api/src/services/velero_client.py
|
||||
# Velero Backup 查詢客戶端 (kubectl 方式,Q7 決策)
|
||||
# 撰寫: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
|
||||
# 架構: leWOOOgo 積木化,純 Service 層
|
||||
# 參考: ADR-062
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from datetime import UTC, datetime
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_VELERO_NAMESPACE = "velero"
|
||||
_KUBECTL_TIMEOUT = 30 # 秒
|
||||
|
||||
|
||||
class VeleroClient:
|
||||
"""
|
||||
透過 kubectl 查詢 Velero 備份狀態
|
||||
設計原則: 失敗時 fallback「假設備份過期」(保守原則)
|
||||
"""
|
||||
|
||||
async def get_latest_backup_age_hours(self) -> float:
|
||||
"""
|
||||
查詢最近一次 Completed 備份距今幾小時
|
||||
失敗時返回 999.0(視為嚴重過期,觸發 Abort)
|
||||
"""
|
||||
try:
|
||||
result = await asyncio.wait_for(
|
||||
self._run_kubectl(
|
||||
["get", "backup", "-n", _VELERO_NAMESPACE,
|
||||
"-o", "json", "--field-selector", "status.phase=Completed"]
|
||||
),
|
||||
timeout=_KUBECTL_TIMEOUT,
|
||||
)
|
||||
data = json.loads(result)
|
||||
items = data.get("items", [])
|
||||
if not items:
|
||||
logger.warning("Velero: 找不到任何 Completed 備份")
|
||||
return 999.0
|
||||
|
||||
latest = max(
|
||||
items,
|
||||
key=lambda x: x.get("status", {}).get("completionTimestamp", ""),
|
||||
)
|
||||
completion_ts = latest["status"].get("completionTimestamp", "")
|
||||
if not completion_ts:
|
||||
return 999.0
|
||||
|
||||
completed_at = datetime.fromisoformat(completion_ts.replace("Z", "+00:00"))
|
||||
age = (datetime.now(UTC) - completed_at).total_seconds() / 3600
|
||||
logger.info(f"Velero 最近備份: {completion_ts},距今 {age:.1f} 小時")
|
||||
return age
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
logger.error("Velero kubectl 查詢超時")
|
||||
return 999.0
|
||||
except Exception as e:
|
||||
logger.error(f"Velero 查詢失敗: {e}")
|
||||
return 999.0
|
||||
|
||||
async def trigger_emergency_backup(self, backup_name: str | None = None) -> bool:
|
||||
"""
|
||||
觸發緊急備份(非同步,不等待完成)
|
||||
返回 True 表示指令已成功發送
|
||||
"""
|
||||
name = backup_name or f"emergency-{int(time.time())}"
|
||||
try:
|
||||
await asyncio.wait_for(
|
||||
self._run_kubectl([
|
||||
"create", "backup", name,
|
||||
"-n", _VELERO_NAMESPACE,
|
||||
"--include-namespaces", "awoooi-prod",
|
||||
"--wait=false",
|
||||
]),
|
||||
timeout=_KUBECTL_TIMEOUT,
|
||||
)
|
||||
logger.info(f"Velero 緊急備份已啟動: {name}")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Velero 緊急備份失敗: {e}")
|
||||
return False
|
||||
|
||||
async def _run_kubectl(self, args: list[str]) -> str:
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
"kubectl", *args,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE,
|
||||
)
|
||||
stdout, stderr = await proc.communicate()
|
||||
if proc.returncode != 0:
|
||||
raise RuntimeError(f"kubectl 失敗: {stderr.decode()}")
|
||||
return stdout.decode()
|
||||
|
||||
|
||||
# Module-level singleton, created lazily on first access.
_velero_client: VeleroClient | None = None


def get_velero_client() -> VeleroClient:
    """Return the shared VeleroClient, constructing it on first use."""
    global _velero_client
    if _velero_client is not None:
        return _velero_client
    _velero_client = VeleroClient()
    return _velero_client
|
||||
|
||||
|
||||
def set_velero_client(client: VeleroClient) -> None:
    """Replace the shared VeleroClient instance (test injection hook, P4 convention)."""
    globals()["_velero_client"] = client
|
||||
36
k8s/rbac/api-velero-reader.yaml
Normal file
36
k8s/rbac/api-velero-reader.yaml
Normal file
@@ -0,0 +1,36 @@
|
||||
# k8s/rbac/api-velero-reader.yaml
# RBAC that lets the API pod work with Velero Backup resources.
# Sprint 5.1 K-001 / 2026-04-08 Asia/Taipei
# Purpose: the awoooi-executor ServiceAccount reads Backup objects for the
#          Pre-flight check (decision Q7: kubectl-based lookup of the latest backup).
# Note: the ServiceAccount is awoooi-executor (not awoooi-api; confirmed at L0).
# NOTE(review): the Pre-flight path also creates an emergency Backup when the
#               latest one is stale, which needs the "create" verb. Object names
#               are kept as-is so the existing binding is updated in place.
#               Confirm that granting create to this ServiceAccount is intended.
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: awoooi-velero-backup-reader
  labels:
    app: awoooi
    component: api
    sprint: "5.1"
rules:
  - apiGroups: ["velero.io"]
    resources: ["backups"]
    # get/list: look up the latest backup; create: start an emergency backup.
    verbs: ["get", "list", "create"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: awoooi-velero-backup-reader
  labels:
    app: awoooi
    component: api
    sprint: "5.1"
subjects:
  - kind: ServiceAccount
    name: awoooi-executor
    namespace: awoooi-prod
roleRef:
  kind: ClusterRole
  name: awoooi-velero-backup-reader
  apiGroup: rbac.authorization.k8s.io
|
||||
201
ops/config/service-registry.yaml
Normal file
201
ops/config/service-registry.yaml
Normal file
@@ -0,0 +1,201 @@
|
||||
# ops/config/service-registry.yaml
|
||||
# Service Registry — 服務 Stateful 分級清單
|
||||
# 版本: 1.0.0
|
||||
# 建立: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
|
||||
# 維護: 修改需 PR + 統帥審核,禁止直接 push
|
||||
# 說明:
|
||||
# BLOCK = 系統禁止自動修復,僅告警(資料風險最高)
|
||||
# CRITICAL_HITL = 允許 Playbook,但需 MultiSig 2票
|
||||
# STANDARD_HITL = 允許 Playbook,需 1票審核
|
||||
# AUTO = 允許自動執行(無狀態服務)
|
||||
# 參考: ADR-062, ADR-063
|
||||
|
||||
services:
|
||||
# ─── BLOCK:系統禁止(連 Playbook 都不提供)────────────────────────────
|
||||
- name: postgres
|
||||
display_name: "PostgreSQL 主庫 (awoooi_prod)"
|
||||
host: "192.168.0.188"
|
||||
stateful_level: BLOCK
|
||||
reason: "主要業務資料庫,重啟可能導致 WAL 截斷、事務回滾"
|
||||
alert_only: true
|
||||
containers: ["postgres"]
|
||||
|
||||
- name: momo-db
|
||||
display_name: "PostgreSQL (momo_db)"
|
||||
host: "192.168.0.188"
|
||||
stateful_level: BLOCK
|
||||
reason: "momo 產品資料庫,禁止自動操作"
|
||||
alert_only: true
|
||||
containers: ["momo-db"]
|
||||
|
||||
- name: langfuse-db
|
||||
display_name: "PostgreSQL (Langfuse)"
|
||||
host: "192.168.0.110"
|
||||
stateful_level: BLOCK
|
||||
reason: "LLM trace 資料庫,重啟導致追蹤資料遺失"
|
||||
alert_only: true
|
||||
containers: ["langfuse-db"]
|
||||
|
||||
- name: harbor-db
|
||||
display_name: "PostgreSQL (Harbor Registry)"
|
||||
host: "192.168.0.110"
|
||||
stateful_level: BLOCK
|
||||
reason: "Harbor Registry 資料庫,重啟可能損壞 image layer 索引"
|
||||
alert_only: true
|
||||
containers: ["harbor-db"]
|
||||
|
||||
- name: sentry-postgres
|
||||
display_name: "PostgreSQL (Sentry)"
|
||||
host: "192.168.0.110"
|
||||
stateful_level: BLOCK
|
||||
reason: "Sentry 錯誤追蹤資料庫"
|
||||
alert_only: true
|
||||
containers: ["sentry-postgres"]
|
||||
|
||||
- name: signoz-clickhouse
|
||||
display_name: "ClickHouse (SignOz)"
|
||||
host: "192.168.0.188"
|
||||
stateful_level: BLOCK
|
||||
reason: "列欄式 OLAP 資料庫,寫入中重啟可能損壞列欄檔案"
|
||||
alert_only: true
|
||||
containers: ["signoz-clickhouse"]
|
||||
|
||||
# ─── CRITICAL_HITL:高風險,需 MultiSig 2票 ──────────────────────────
|
||||
- name: redis
|
||||
display_name: "Redis (AWOOOI)"
|
||||
host: "192.168.0.188"
|
||||
stateful_level: CRITICAL_HITL
|
||||
reason: "AWOOOI 依賴 Redis 做冪等鎖與快取,重啟丟失鎖狀態"
|
||||
requires_pre_backup: false
|
||||
containers: ["redis"]
|
||||
|
||||
- name: harbor-redis
|
||||
display_name: "Redis (Harbor)"
|
||||
host: "192.168.0.110"
|
||||
stateful_level: CRITICAL_HITL
|
||||
reason: "Harbor session 快取"
|
||||
containers: ["harbor-redis"]
|
||||
|
||||
- name: sentry-redis
|
||||
display_name: "Redis (Sentry)"
|
||||
host: "192.168.0.110"
|
||||
stateful_level: CRITICAL_HITL
|
||||
reason: "Sentry 任務佇列"
|
||||
containers: ["sentry-redis"]
|
||||
|
||||
- name: gitea
|
||||
display_name: "Gitea (程式碼倉庫)"
|
||||
host: "192.168.0.110"
|
||||
stateful_level: CRITICAL_HITL
|
||||
reason: "restart 會殺掉活躍 SSH session,Git push 中斷可能損壞 working copy"
|
||||
requires_pre_backup: false
|
||||
containers: ["gitea"]
|
||||
|
||||
- name: harbor
|
||||
display_name: "Harbor (Container Registry)"
|
||||
host: "192.168.0.110"
|
||||
stateful_level: CRITICAL_HITL
|
||||
reason: "重啟中斷 pull/push;GC 進行中重啟可能損壞 layer"
|
||||
requires_pre_backup: false
|
||||
containers: ["harbor-core", "harbor-jobservice", "harbor-portal"]
|
||||
|
||||
- name: minio
|
||||
display_name: "MinIO (物件存儲)"
|
||||
host: "192.168.0.188"
|
||||
stateful_level: CRITICAL_HITL
|
||||
reason: "寫入中重啟可能導致 multipart upload 中斷"
|
||||
requires_pre_backup: false
|
||||
containers: ["minio"]
|
||||
|
||||
# ─── STANDARD_HITL:中風險,需 1票審核 ──────────────────────────────
|
||||
- name: prometheus
|
||||
display_name: "Prometheus"
|
||||
host: "192.168.0.110"
|
||||
stateful_level: STANDARD_HITL
|
||||
reason: "有 TSDB WAL,exited 狀態用 docker start(非 restart)"
|
||||
restart_command: "docker start"
|
||||
containers: ["prometheus"]
|
||||
|
||||
- name: grafana
|
||||
display_name: "Grafana"
|
||||
host: "192.168.0.110"
|
||||
stateful_level: STANDARD_HITL
|
||||
reason: "有 SQLite 設定儲存,exited 用 docker start"
|
||||
restart_command: "docker start"
|
||||
containers: ["grafana"]
|
||||
|
||||
- name: alertmanager
|
||||
display_name: "Alertmanager"
|
||||
host: "192.168.0.110"
|
||||
stateful_level: STANDARD_HITL
|
||||
reason: "有 silence 狀態,exited 用 docker start"
|
||||
restart_command: "docker start"
|
||||
containers: ["alertmanager"]
|
||||
|
||||
# ─── AUTO:無狀態,允許自動修復 ──────────────────────────────────────
|
||||
- name: nginx
|
||||
display_name: "Nginx (反向代理)"
|
||||
host: "192.168.0.110"
|
||||
stateful_level: AUTO
|
||||
containers: ["nginx", "nginx-188"]
|
||||
|
||||
- name: awoooi-api
|
||||
display_name: "AWOOOI API (K3s)"
|
||||
host: "k3s"
|
||||
stateful_level: AUTO
|
||||
containers: []
|
||||
|
||||
- name: awoooi-web
|
||||
display_name: "AWOOOI Web (K3s)"
|
||||
host: "k3s"
|
||||
stateful_level: AUTO
|
||||
containers: []
|
||||
|
||||
- name: blackbox-exporter
|
||||
display_name: "Blackbox Exporter"
|
||||
host: "192.168.0.110"
|
||||
stateful_level: AUTO
|
||||
containers: ["blackbox-exporter"]
|
||||
|
||||
- name: langfuse
|
||||
display_name: "Langfuse (LLMOps)"
|
||||
host: "192.168.0.110"
|
||||
stateful_level: AUTO
|
||||
containers: ["langfuse-web", "langfuse-worker"]
|
||||
|
||||
- name: ollama
|
||||
display_name: "Ollama (Local LLM)"
|
||||
host: "192.168.0.188"
|
||||
stateful_level: AUTO
|
||||
containers: ["ollama"]
|
||||
|
||||
- name: momo-app
|
||||
display_name: "momo Web App"
|
||||
host: "192.168.0.188"
|
||||
stateful_level: AUTO
|
||||
containers: ["momo-app"]
|
||||
|
||||
- name: tsenyang-website
|
||||
display_name: "Tsenyang Website"
|
||||
host: "192.168.0.188"
|
||||
stateful_level: AUTO
|
||||
containers: ["tsenyang-website"]
|
||||
|
||||
- name: stock-platform
|
||||
display_name: "Stock Platform"
|
||||
host: "192.168.0.110"
|
||||
stateful_level: AUTO
|
||||
containers: ["stock-platform"]
|
||||
|
||||
# ─── 備份策略參考 ────────────────────────────────────────────────────────
|
||||
backup_policies:
|
||||
velero_max_age_hours: 4 # Velero 備份過期閾值(Q2 決策)
|
||||
emergency_backup_timeout: 600 # 緊急備份超時秒數
|
||||
block_backup_on_high_io: true # CPU/IO > 80% 時禁止觸發備份(Q4 決策)
|
||||
io_threshold_percent: 80
|
||||
|
||||
# ─── MultiSig 設定 ───────────────────────────────────────────────────────
|
||||
multisig:
|
||||
critical_required_votes: 2 # CRITICAL_HITL 需要幾票
|
||||
standard_required_votes: 1 # STANDARD_HITL 需要幾票
|
||||
vote_expiry_minutes: 30 # 投票有效期
|
||||
@@ -1,11 +1,13 @@
|
||||
#!/usr/bin/env bash
|
||||
# docker-health-monitor.sh
|
||||
# Plan A: Docker 容器健康監控 + 自動修復
|
||||
# Sprint 5.1 L4-6: 純感知層(偵測→送 Webhook,禁止任何修復動作)
|
||||
#
|
||||
# 部署: cron */5 * * * * /opt/awoooi-ops/docker-health-monitor.sh >> /var/log/docker-health-monitor.log 2>&1
|
||||
# 設定: /etc/awoooi-ops/secrets.env
|
||||
# 撰寫: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
|
||||
# 首席架構師裁示: Intent→Action→Result 三段式,禁止靜默修復
|
||||
# 架構裁示: Route B — 腳本只感知,所有修復決策由 AWOOOI API 執行(ADR-062)
|
||||
# 注意: 禁止在此腳本中執行 docker restart / docker start
|
||||
# 所有修復動作由 AWOOOI API Guardrail + Playbook + Approval 鏈路處理
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
@@ -19,48 +21,45 @@ fi
|
||||
: "${AWOOOI_API_URL:=https://awoooi.wooo.work}"
|
||||
: "${TELEGRAM_BOT_TOKEN:=}"
|
||||
: "${TELEGRAM_CHAT_ID:=}"
|
||||
: "${WEBHOOK_HMAC_SECRET:=}"
|
||||
: "${COOLDOWN_SECONDS:=300}"
|
||||
: "${LOG_FILE:=/var/log/docker-health-monitor.log}"
|
||||
# 冷卻期:避免同一容器在短時間內重複發送 webhook(去重,非修復冷卻)
|
||||
: "${SEND_COOLDOWN_SECONDS:=300}"
|
||||
: "${COOLDOWN_DIR:=/tmp/docker-health-monitor-cooldown}"
|
||||
|
||||
mkdir -p "$COOLDOWN_DIR"
|
||||
|
||||
# ─── 排除清單(禁止自動修復)───────────────────────────────────────────────
|
||||
# 判斷方式: echo ":list:" | grep -q ":name:"
|
||||
# 分類一:資料庫 — 禁止 restart
|
||||
EXCLUDED_DB_LIST=":postgres:momo-db:langfuse-db:harbor-db:sentry-postgres:signoz-clickhouse:"
|
||||
# 分類二:Redis — 禁止 restart
|
||||
EXCLUDED_REDIS_LIST=":redis:harbor-redis:sentry-redis:"
|
||||
# 分類三:監控棧 exited → docker start(保護 WAL)
|
||||
MONITORING_START_ONLY_LIST=":prometheus:grafana:alertmanager:"
|
||||
# 分類四:監控棧 其他 → 僅告警
|
||||
EXCLUDED_MONITORING_LIST=":blackbox-exporter:signoz-otel-collector:"
|
||||
# 分類五:關鍵系統 — 永遠禁止(Gitea restart 會殺活躍 SSH)
|
||||
EXCLUDED_CRITICAL_LIST=":gitea:"
|
||||
|
||||
# ─── 工具函數 ────────────────────────────────────────────────────────────────
|
||||
# Print a message to stdout prefixed with a local timestamp in the form
# "[YYYY-MM-DD HH:MM:SS +ZZZZ] ". All arguments are emitted as one line ("$*").
log() {
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S %z')" "$*"
}
|
||||
|
||||
in_list() {
|
||||
local name=":${1}:"
|
||||
local list="$2"
|
||||
[[ "$list" == *"$name"* ]]
|
||||
# Send-cooldown check: de-duplicates webhooks so the same container is not
# reported repeatedly within SEND_COOLDOWN_SECONDS.
#
# Arguments:
#   $1 - container name; its marker file is ${COOLDOWN_DIR}/<name>.cooldown
# Returns:
#   0 if a notification for this container was recorded less than
#     SEND_COOLDOWN_SECONDS ago (the caller should skip sending);
#   1 otherwise (no marker, unreadable or corrupt marker, marker dated in the
#     future, or cooldown elapsed).
is_in_send_cooldown() {
    local container="$1"
    local cooldown_file="${COOLDOWN_DIR}/${container}.cooldown"
    local last_sent now elapsed

    [[ -f "$cooldown_file" ]] || return 1

    # The marker can vanish between the -f test and the read, or be empty if a
    # previous run was interrupted right after the redirect truncated it.
    # Neither case may abort the whole scan under `set -euo pipefail`.
    last_sent=$(cat "$cooldown_file" 2>/dev/null) || last_sent=""
    if [[ ! "$last_sent" =~ ^[0-9]+$ ]]; then
        log "WARN: ${container} 冷卻檔內容無效,視為不在冷卻期"
        return 1
    fi

    now=$(date +%s)
    # 10# forces base 10 so a zero-padded value is never parsed as octal.
    elapsed=$(( now - 10#$last_sent ))

    # A negative elapsed time means the marker is dated in the future (e.g. the
    # clock was stepped back); treat it as expired instead of suppressing
    # alerts until the clock catches up.
    if (( elapsed >= 0 && elapsed < SEND_COOLDOWN_SECONDS )); then
        log "COOLDOWN: ${container} 距上次通知 ${elapsed}s,跳過(冷卻期 ${SEND_COOLDOWN_SECONDS}s)"
        return 0
    fi
    return 1
}
|
||||
|
||||
# 計算 HMAC-SHA256 簽章
|
||||
sign_payload() {
|
||||
local payload="$1"
|
||||
printf '%s' "$payload" | openssl dgst -sha256 -hmac "$WEBHOOK_HMAC_SECRET" -binary | xxd -p -c 256
|
||||
# Record the current time (epoch seconds) as the last-notified time for a
# container.
# Arguments:
#   $1 - container name; the marker is written to ${COOLDOWN_DIR}/<name>.cooldown
set_send_cooldown() {
    local marker="${COOLDOWN_DIR}/${1}.cooldown"
    date +%s > "$marker"
}
|
||||
|
||||
# 傳送 Telegram(Fallback:AWOOOI API down 時直接呼叫 Bot API)
|
||||
# Fallback:AWOOOI API down 時直接呼叫 Telegram Bot API
|
||||
send_telegram_direct() {
|
||||
local message="$1"
|
||||
if [[ -z "$TELEGRAM_BOT_TOKEN" || -z "$TELEGRAM_CHAT_ID" ]]; then
|
||||
log "WARN: Telegram 未設定,跳過通知"
|
||||
log "WARN: Telegram 未設定,跳過 Fallback"
|
||||
return 0
|
||||
fi
|
||||
curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
|
||||
@@ -69,186 +68,113 @@ send_telegram_direct() {
|
||||
> /dev/null 2>&1 || true
|
||||
}
|
||||
|
||||
# 傳送 AWOOOI Webhook(若失敗則 Fallback 至 Telegram Bot API)
|
||||
send_awoooi_alert() {
|
||||
local title="$1"
|
||||
local message="$2"
|
||||
local severity="${3:-WARNING}"
|
||||
local source="docker-health-monitor"
|
||||
# 傳送 Alertmanager 格式 Webhook 到 AWOOOI API
|
||||
# 使用現有端點 /api/v1/webhooks/alertmanager(內網免 HMAC)
|
||||
send_to_awoooi() {
|
||||
local container="$1"
|
||||
local status="$2" # unhealthy | exited | dead
|
||||
local hostname
|
||||
hostname=$(hostname)
|
||||
|
||||
local now_ts
|
||||
now_ts=$(date -u '+%Y-%m-%dT%H:%M:%SZ')
|
||||
|
||||
# 組裝 Alertmanager 格式 JSON(符合現有 AlertmanagerPayload schema)
|
||||
local payload
|
||||
payload=$(printf '{"title":"%s","message":"%s","severity":"%s","source":"%s","labels":{"monitor":"docker-health-monitor","plan":"A"}}' \
|
||||
"$title" "$message" "$severity" "$source")
|
||||
|
||||
local timestamp
|
||||
timestamp=$(date -u +%s)
|
||||
local signature
|
||||
signature=$(sign_payload "${timestamp}${payload}")
|
||||
payload=$(cat <<JSON
|
||||
{
|
||||
"version": "4",
|
||||
"groupKey": "docker-health-${hostname}-${container}",
|
||||
"status": "firing",
|
||||
"alerts": [{
|
||||
"status": "firing",
|
||||
"labels": {
|
||||
"alertname": "DockerContainerUnhealthy",
|
||||
"container": "${container}",
|
||||
"host": "${hostname}",
|
||||
"layer": "docker",
|
||||
"severity": "warning",
|
||||
"auto_repair": "true",
|
||||
"source": "docker-health-monitor"
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "容器 ${container} 狀態異常: ${status}",
|
||||
"description": "主機 ${hostname} 容器 ${container} 偵測狀態=${status},由 docker-health-monitor 感知層回報"
|
||||
},
|
||||
"startsAt": "${now_ts}"
|
||||
}]
|
||||
}
|
||||
JSON
|
||||
)
|
||||
|
||||
local http_code
|
||||
http_code=$(curl -s -o /dev/null -w "%{http_code}" \
|
||||
-X POST "${AWOOOI_API_URL}/api/v1/webhooks/custom-alert" \
|
||||
-X POST "${AWOOOI_API_URL}/api/v1/webhooks/alertmanager" \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Timestamp: ${timestamp}" \
|
||||
-H "X-Signature: sha256=${signature}" \
|
||||
-d "$payload" \
|
||||
--connect-timeout 10 \
|
||||
--max-time 30 2>/dev/null) || http_code="0"
|
||||
|
||||
if [[ "$http_code" != "200" && "$http_code" != "202" ]]; then
|
||||
if [[ "$http_code" == "200" || "$http_code" == "202" ]]; then
|
||||
log "SENT: ${container} 狀態=${status} → AWOOOI API (${http_code})"
|
||||
set_send_cooldown "$container"
|
||||
else
|
||||
log "WARN: AWOOOI API 回應 ${http_code},Fallback 到 Telegram Bot API"
|
||||
send_telegram_direct "[docker-health-monitor Fallback] ${title} ${message}"
|
||||
send_telegram_direct "🚨 [docker-health-monitor Fallback] 主機: ${hostname} 容器: ${container} 狀態: ${status} (API 不可達,請人工處理)"
|
||||
set_send_cooldown "$container"
|
||||
fi
|
||||
}
|
||||
|
||||
# 冷卻期檢查(避免同一容器短時間重複修復)
|
||||
is_in_cooldown() {
|
||||
local container="$1"
|
||||
local cooldown_file="${COOLDOWN_DIR}/${container}.cooldown"
|
||||
if [[ -f "$cooldown_file" ]]; then
|
||||
local last_repair
|
||||
last_repair=$(cat "$cooldown_file")
|
||||
local now
|
||||
now=$(date +%s)
|
||||
local elapsed=$(( now - last_repair ))
|
||||
if (( elapsed < COOLDOWN_SECONDS )); then
|
||||
log "COOLDOWN: ${container} 仍在冷卻期 (${elapsed}s / ${COOLDOWN_SECONDS}s)"
|
||||
return 0
|
||||
fi
|
||||
fi
|
||||
return 1
|
||||
}
|
||||
|
||||
set_cooldown() {
|
||||
local container="$1"
|
||||
date +%s > "${COOLDOWN_DIR}/${container}.cooldown"
|
||||
}
|
||||
|
||||
# ─── 核心:處理不健康容器 ───────────────────────────────────────────────────
|
||||
handle_unhealthy_container() {
|
||||
local container="$1"
|
||||
local status="$2" # unhealthy | exited | dead
|
||||
# ─── 核心:掃描所有容器 ─────────────────────────────────────────────────────
|
||||
check_containers() {
|
||||
local hostname
|
||||
hostname=$(hostname)
|
||||
|
||||
log "DETECTED: ${container} 狀態=${status} on ${hostname}"
|
||||
# 取得所有容器(含停止的)
|
||||
while IFS=$'\t' read -r container_id container_name state health; do
|
||||
# 跳過 header 或空行
|
||||
[[ -z "$container_name" ]] && continue
|
||||
|
||||
# ── 排除清單判斷 ─────────────────────────────────────────────────────────
|
||||
|
||||
if in_list "$container" "$EXCLUDED_CRITICAL_LIST"; then
|
||||
log "SKIP: ${container} 屬於關鍵排除清單 (Gitea),僅告警"
|
||||
send_awoooi_alert \
|
||||
"[${hostname}] 關鍵服務異常: ${container}" \
|
||||
"容器 ${container} 狀態=${status}。此服務禁止自動修復,請人工處理!" \
|
||||
"CRITICAL"
|
||||
return
|
||||
fi
|
||||
|
||||
if in_list "$container" "$EXCLUDED_DB_LIST"; then
|
||||
log "SKIP: ${container} 屬於資料庫排除清單,僅告警"
|
||||
send_awoooi_alert \
|
||||
"[${hostname}] 資料庫容器異常: ${container}" \
|
||||
"容器 ${container} 狀態=${status}。資料庫禁止自動修復,需人工介入!" \
|
||||
"CRITICAL"
|
||||
return
|
||||
fi
|
||||
|
||||
if in_list "$container" "$EXCLUDED_REDIS_LIST"; then
|
||||
log "SKIP: ${container} 屬於 Redis 排除清單,僅告警"
|
||||
send_awoooi_alert \
|
||||
"[${hostname}] Redis 容器異常: ${container}" \
|
||||
"容器 ${container} 狀態=${status}。Redis 禁止自動修復,需人工介入!" \
|
||||
"CRITICAL"
|
||||
return
|
||||
fi
|
||||
|
||||
if in_list "$container" "$EXCLUDED_MONITORING_LIST"; then
|
||||
log "SKIP: ${container} 屬於監控棧排除清單,僅告警"
|
||||
send_awoooi_alert \
|
||||
"[${hostname}] 監控元件異常: ${container}" \
|
||||
"容器 ${container} 狀態=${status}。請人工處理。" \
|
||||
"WARNING"
|
||||
return
|
||||
fi
|
||||
|
||||
# ── 冷卻期判斷 ────────────────────────────────────────────────────────────
|
||||
if is_in_cooldown "$container"; then
|
||||
log "SKIP: ${container} 在冷卻期內,跳過本次修復"
|
||||
return
|
||||
fi
|
||||
|
||||
# ── 決定修復動作 ─────────────────────────────────────────────────────────
|
||||
local action_cmd="docker restart"
|
||||
local action_desc="docker restart"
|
||||
if in_list "$container" "$MONITORING_START_ONLY_LIST" && [[ "$status" == "exited" ]]; then
|
||||
action_cmd="docker start"
|
||||
action_desc="docker start(保護 WAL,非 restart)"
|
||||
fi
|
||||
|
||||
# ── Phase 1: Intent(決策意圖通知)──────────────────────────────────────
|
||||
log "INTENT: 即將對 ${container} 執行 ${action_desc}"
|
||||
send_awoooi_alert \
|
||||
"[${hostname}] 自動修復 Intent: ${container}" \
|
||||
"偵測到容器 ${container} 狀態=${status}。即將執行 ${action_desc},2 秒後開始修復。" \
|
||||
"WARNING"
|
||||
|
||||
sleep 2
|
||||
|
||||
# ── Phase 2: Action(執行修復)──────────────────────────────────────────
|
||||
log "ACTION: 執行 ${action_cmd} ${container}"
|
||||
set_cooldown "$container"
|
||||
|
||||
local repair_ok=false
|
||||
if $action_cmd "$container" >> "$LOG_FILE" 2>&1; then
|
||||
repair_ok=true
|
||||
fi
|
||||
|
||||
# ── Phase 3: Result(執行結果通知)──────────────────────────────────────
|
||||
if $repair_ok; then
|
||||
log "RESULT: ${container} 修復成功"
|
||||
send_awoooi_alert \
|
||||
"[${hostname}] 自動修復成功: ${container}" \
|
||||
"容器 ${container} 已透過 ${action_desc} 成功恢復。原狀態=${status}。" \
|
||||
"INFO"
|
||||
else
|
||||
log "RESULT: ${container} 修復失敗!需人工介入"
|
||||
send_awoooi_alert \
|
||||
"[${hostname}] 自動修復失敗: ${container}" \
|
||||
"容器 ${container} 執行 ${action_desc} 失敗!原狀態=${status}。需人工介入!" \
|
||||
"CRITICAL"
|
||||
fi
|
||||
}
|
||||
|
||||
# ─── 主流程 ──────────────────────────────────────────────────────────────────
|
||||
main() {
|
||||
log "===== docker-health-monitor 啟動 (host=$(hostname)) ====="
|
||||
|
||||
# 取得所有容器狀態
|
||||
# docker ps -a 格式: Names / Health / State
|
||||
while IFS=$'\t' read -r name health_status container_status; do
|
||||
[[ -z "$name" ]] && continue
|
||||
|
||||
local needs_repair=false
|
||||
local needs_alert=false
|
||||
local detected_status=""
|
||||
|
||||
if [[ "$health_status" == "unhealthy" ]]; then
|
||||
needs_repair=true
|
||||
# 偵測 exited / dead
|
||||
if [[ "$state" == "exited" || "$state" == "dead" ]]; then
|
||||
needs_alert=true
|
||||
detected_status="$state"
|
||||
fi
|
||||
|
||||
# 偵測 unhealthy(health check 存在且失敗)
|
||||
if [[ "$health" == "unhealthy" ]]; then
|
||||
needs_alert=true
|
||||
detected_status="unhealthy"
|
||||
elif [[ "$container_status" == "exited" || "$container_status" == "dead" ]]; then
|
||||
needs_repair=true
|
||||
detected_status="$container_status"
|
||||
elif [[ "$health_status" == "starting" ]]; then
|
||||
log "INFO: ${name} health=starting,等待中跳過"
|
||||
continue
|
||||
fi
|
||||
|
||||
if $needs_repair; then
|
||||
handle_unhealthy_container "$name" "$detected_status"
|
||||
if $needs_alert; then
|
||||
log "DETECTED: ${container_name} 狀態=${detected_status} on ${hostname}"
|
||||
|
||||
# 冷卻期去重
|
||||
if is_in_send_cooldown "$container_name"; then
|
||||
continue
|
||||
fi
|
||||
|
||||
# 送 Webhook — 只感知,不修復
|
||||
send_to_awoooi "$container_name" "$detected_status"
|
||||
fi
|
||||
done < <(docker ps -a --format '{{.ID}}\t{{.Names}}\t{{.State}}\t{{.Status}}' | \
|
||||
awk -F'\t' '{
|
||||
health = ""
|
||||
if ($4 ~ /\(unhealthy\)/) health = "unhealthy"
|
||||
else if ($4 ~ /\(healthy\)/) health = "healthy"
|
||||
print $1 "\t" $2 "\t" $3 "\t" health
|
||||
}')
|
||||
}
|
||||
|
||||
done < <(docker ps -a --format '{{.Names}} {{.Health}} {{.State}}' 2>/dev/null)
|
||||
|
||||
log "===== docker-health-monitor 完成 ====="
|
||||
# ─── Main ───────────────────────────────────────────────────────────────────
|
||||
# Entry point: logs a start banner, runs one detection pass via
# check_containers, then logs a completion banner. Arguments are ignored.
main() {
    local banner_start="=== docker-health-monitor 感知層啟動 (純感知,禁止修復) ==="
    local banner_done="=== 掃描完成 ==="

    log "$banner_start"
    check_containers
    log "$banner_done"
}
|
||||
|
||||
main "$@"
|
||||
|
||||
Reference in New Issue
Block a user