feat(sprint5.1): Data Safety Guardrails 全鏈路整合 (L1-L5)
Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 1m33s
Type Sync Check / check-type-sync (push) Failing after 58s

Layer 0 - K8s RBAC:
  - k8s/rbac/api-velero-reader.yaml: awoooi-executor SA Velero backup reader

Layer 1 - DB Migration (已在 188 執行):
  - M-002: approval_records 新增 approval_level/votes/required_votes
  - M-003: alert_event_type ENUM 新增 8 個值

Layer 2 - IaC:
  - ops/config/service-registry.yaml: 全服務 Stateful 分級清單 (BLOCK/CRITICAL_HITL/STANDARD_HITL/AUTO)

Layer 3 - Python Services:
  - service_registry.py: 讀取 YAML,提供 is_blocked/requires_multisig/get_required_votes
  - velero_client.py: kubectl 查詢 Velero 備份年齡,失敗 fallback 999h
  - preflight_service.py: Pre-flight 安全檢查 (Q2/Q4 決策)

Layer 1-M001 - Playbook model:
  - playbook.py: 新增 requires_approval_level/stateful_targets/requires_pre_backup

Layer 4 - 業務邏輯:
  - alert_operation_log_repository.py: 新增 8 個 event_type (Guardrail/Pre-flight/MultiSig/備份)
  - auto_repair_service.py: 注入 Service Registry Guardrail 檢查 (BLOCK → 直接拒絕)
  - webhooks.py: ALERT_RECEIVED 溯源記錄 + auto_repair flag Q9 + Langfuse trace_id Q10
  - db/models.py: ApprovalRecord 同步 approval_level/votes/required_votes 欄位
  - docker-health-monitor.sh: 純感知層改造(移除所有 docker restart 邏輯)

Layer 5 - Telegram 通知:
  - telegram_gateway.py: T1-T6 六個新通知方法 (Guardrail/Pre-flight/Backup/MultiSig/ChangeApplied)

參考: ADR-062 Data Safety Guardrails, ADR-063 Service Registry IaC

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-08 16:24:09 +08:00
parent 6f7a4be2c7
commit 88696dba9b
14 changed files with 997 additions and 191 deletions

View File

@@ -0,0 +1,18 @@
-- apps/api/migrations/sprint51_alert_log_events.sql
-- Sprint 5.1 M-003: extend the alert_event_type ENUM used by alert_operation_log.
-- Executed by: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
-- WARNING: ENUM ADD VALUE cannot be rolled back; confirm a backup exists before executing.
-- Purpose: add 8 event_type values for Guardrail / Pre-flight / MultiSig / backup tracking.
-- Every statement uses IF NOT EXISTS, so values that already exist are skipped.
-- NOTE(review): PostgreSQL releases before 12 are documented to reject
-- ALTER TYPE ... ADD VALUE inside a transaction block, and newer releases do not
-- allow the new values to be used until after COMMIT — confirm the target server
-- version before relying on the BEGIN/COMMIT wrapper below.
BEGIN;
ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'GUARDRAIL_BLOCKED';
ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'PRE_FLIGHT_PASSED';
ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'PRE_FLIGHT_FAILED';
ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'BACKUP_TRIGGERED';
ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'BACKUP_COMPLETED';
ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'BACKUP_FAILED';
ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'APPROVAL_ESCALATED';
ALTER TYPE alert_event_type ADD VALUE IF NOT EXISTS 'CHANGE_APPLIED';
COMMIT;

View File

@@ -0,0 +1,31 @@
-- apps/api/migrations/sprint51_approval_multisig.sql
-- Sprint 5.1 M-002: MultiSig (two-signature) approval support.
-- Executed by: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
-- Purpose: add approval_level / approval_votes / required_votes to approval_records.
-- NOTE(review): the three columns are added without NOT NULL, while the
-- ApprovalRecord ORM model declares them nullable=False — confirm whether a
-- NOT NULL constraint should be added as well.
BEGIN;
-- Each column is added only if it is missing, with a default that matches the
-- single-vote ("standard") flow; approval_level is restricted by a CHECK constraint.
ALTER TABLE approval_records
ADD COLUMN IF NOT EXISTS approval_level VARCHAR(20)
DEFAULT 'standard'
CHECK (approval_level IN ('standard', 'critical')),
ADD COLUMN IF NOT EXISTS approval_votes JSONB
DEFAULT '[]'::jsonb,
ADD COLUMN IF NOT EXISTS required_votes INTEGER
DEFAULT 1;
-- Column descriptions stored in the database catalog.
COMMENT ON COLUMN approval_records.approval_level IS
'standard=1票審核, critical=2票MultiSig';
COMMENT ON COLUMN approval_records.approval_votes IS
'JSON array: [{"user_id": "123", "voted_at": "2026-04-08T...", "action": "approve"}]';
COMMENT ON COLUMN approval_records.required_votes IS
'standard=1, critical=2';
-- Backfill existing rows (backward compatibility).
-- Only rows whose approval_level is NULL are updated; a row with a non-NULL
-- approval_level but NULL approval_votes / required_votes is left untouched.
UPDATE approval_records
SET approval_level = 'standard',
required_votes = 1,
approval_votes = '[]'::jsonb
WHERE approval_level IS NULL;
COMMIT;

View File

@@ -218,7 +218,9 @@ async def _try_auto_repair_background(
)
return
# 記錄自動修復觸發
# 記錄自動修復觸發 (Sprint 5.1 Q10: 加入 Langfuse trace_id 追蹤)
# (2026-04-08 Claude Sonnet 4.6 Asia/TaipeiADR-062)
_langfuse_trace_id = getattr(incident, "langfuse_trace_id", None)
await op_log.append(
"AUTO_REPAIR_TRIGGERED",
incident_id=incident_id,
@@ -231,6 +233,11 @@ async def _try_auto_repair_background(
"playbook_name": decision.playbook.name,
"similarity_score": decision.similarity_score,
"risk_level": decision.risk_level.value if decision.risk_level else None,
"langfuse_trace_id": _langfuse_trace_id,
"langfuse_url": (
f"http://192.168.0.110:3100/trace/{_langfuse_trace_id}"
if _langfuse_trace_id else None
),
},
)
@@ -1084,6 +1091,31 @@ async def alertmanager_webhook(
alert = firing_alerts[0]
alert_id = f"alert-{now_taipei().strftime('%Y%m%d%H%M%S')}"
# ==========================================================================
# Sprint 5.1 L4-2: ALERT_RECEIVED 溯源記錄 + auto_repair flag 讀取
# (2026-04-08 Claude Sonnet 4.6 Asia/TaipeiADR-062 Q9)
# ==========================================================================
_alert_labels = alert.labels or {}
_alertname_for_log = _alert_labels.get("alertname", "UnknownAlert")
# Q9: auto_repair flag — Rule=false 強制 HITL不觸發自動修復背景任務
_can_auto_repair_by_rule = _alert_labels.get("auto_repair", "true").lower() == "true"
try:
_op_log = get_alert_operation_log_repository()
await _op_log.append(
"ALERT_RECEIVED",
actor="alertmanager",
action_detail=f"收到告警: {_alertname_for_log}",
context={
"source": "alertmanager",
"alert_id": alert_id,
"alertname": _alertname_for_log,
"labels": _alert_labels,
"auto_repair_flag": _can_auto_repair_by_rule,
},
)
except Exception as _log_err:
logger.warning("alert_received_log_failed", error=str(_log_err))
# ==========================================================================
# Alert Normalizer: 轉換 Alertmanager 格式 → AWOOOI AlertPayload
# ==========================================================================
@@ -1326,15 +1358,31 @@ async def alertmanager_webhook(
# 2026-04-05 ogt: 自動修復評估 (ADR-058 閉環)
# Incident 建立後立即評估是否可自動修復
# P2 以下 + 高品質 Playbook + 低風險 → 背景自動執行
# Sprint 5.1 Q9: auto_repair=false 旗標 → 強制 HITL不觸發背景任務
# (2026-04-08 Claude Sonnet 4.6 Asia/TaipeiADR-062)
# ================================================================
background_tasks.add_task(
_try_auto_repair_background,
incident_id=incident_id,
approval_id=str(approval.id),
alert_type=alert_type,
target_resource=target_resource,
namespace=namespace,
)
if _can_auto_repair_by_rule:
background_tasks.add_task(
_try_auto_repair_background,
incident_id=incident_id,
approval_id=str(approval.id),
alert_type=alert_type,
target_resource=target_resource,
namespace=namespace,
)
else:
# auto_repair=false → 記錄 GUARDRAIL_BLOCKED不觸發自動修復
_op_log_rule = get_alert_operation_log_repository()
background_tasks.add_task(
_op_log_rule.append,
"GUARDRAIL_BLOCKED",
incident_id=incident_id,
approval_id=str(approval.id),
actor="prometheus-rule",
action_detail=f"Prometheus rule 設定 auto_repair=false強制人工審核: {alertname}",
success=False,
context={"alertname": alertname, "auto_repair_flag": False},
)
# 推送 Telegram
background_tasks.add_task(

View File

@@ -124,6 +124,26 @@ class ApprovalRecord(Base):
comment="Last time this alert pattern was seen",
)
# Sprint 5.1 MultiSig 雙簽核支援 (2026-04-08 Claude Sonnet 4.6 Asia/TaipeiADR-062 Q3)
approval_level: Mapped[str] = mapped_column(
String(20),
default="standard",
nullable=False,
comment="standard=1票審核, critical=2票MultiSig",
)
approval_votes: Mapped[list[dict[str, Any]]] = mapped_column(
JSON,
default=list,
nullable=False,
comment="[{user_id, voted_at, action}]",
)
required_votes: Mapped[int] = mapped_column(
Integer,
default=1,
nullable=False,
comment="standard=1, critical=2",
)
# 2026-04-06 ogt: Phase 26 — 關聯 Incident ID
# Playbook 萃取和 KM 寫入必須知道 incident_id不能靠文字解析
incident_id: Mapped[str | None] = mapped_column(

View File

@@ -212,6 +212,20 @@ class Playbook(BaseModel):
tags: list[str] = Field(default_factory=list, description="標籤")
notes: str | None = Field(None, description="人工補充說明")
# === Sprint 5.1 資料安全護欄 (2026-04-08 Claude Sonnet 4.6 Asia/Taipei) ===
requires_approval_level: str = Field(
default="auto",
description="auto=直接執行, standard=1票, critical=2票MultiSig由 Service Registry 決定)",
)
stateful_targets: list[str] = Field(
default_factory=list,
description="此 Playbook 操作的 Stateful 服務清單,對應 service-registry.yaml",
)
requires_pre_backup: bool = Field(
default=False,
description="執行前是否需要 Pre-flight 備份檢查",
)
# === 時間軸 ===
created_at: datetime = Field(default_factory=now_taipei)
updated_at: datetime = Field(default_factory=now_taipei)

View File

@@ -22,7 +22,9 @@ from src.db.models import AlertOperationLog
logger = structlog.get_logger(__name__)
# 合法的 event_type 值 (對應 DB ENUM)
# Sprint 5.1 新增 8 個 (2026-04-08 Claude Sonnet 4.6 Asia/Taipei)
ALERT_EVENT_TYPES = {
# 原有 10 個
"ALERT_RECEIVED",
"TELEGRAM_SENT",
"USER_ACTION",
@@ -33,6 +35,15 @@ ALERT_EVENT_TYPES = {
"RESOLVED",
"SILENCED",
"ESCALATED",
# Sprint 5.1 Guardrail / Pre-flight / MultiSig / 備份追蹤
"GUARDRAIL_BLOCKED",
"PRE_FLIGHT_PASSED",
"PRE_FLIGHT_FAILED",
"BACKUP_TRIGGERED",
"BACKUP_COMPLETED",
"BACKUP_FAILED",
"APPROVAL_ESCALATED",
"CHANGE_APPLIED",
}

View File

@@ -192,6 +192,32 @@ class AutoRepairService:
blocked_by="GLOBAL_GUARDRAIL",
)
# 0.5 Sprint 5.1 Guardrail: Service Registry 服務分級檢查
# (2026-04-08 Claude Sonnet 4.6 Asia/TaipeiADR-062)
# 全域熔斷之後、嚴重度之前BLOCK 等級直接拒絕
try:
from src.services.service_registry import StatefulLevel, get_service_registry
_registry = get_service_registry()
_service_name = (incident.target_resource or "") if hasattr(incident, "target_resource") else ""
if not _service_name and incident.affected_services:
_service_name = incident.affected_services[0]
_stateful_level = _registry.get_stateful_level(_service_name)
if _stateful_level == StatefulLevel.BLOCK:
logger.warning(
"auto_repair_blocked_guardrail",
incident_id=incident.incident_id,
service_name=_service_name,
stateful_level="BLOCK",
)
return AutoRepairDecision(
can_auto_repair=False,
reason=f"GUARDRAIL_BLOCK: 服務 '{_service_name}' 屬於禁止自動修復清單(資料安全,見 service-registry.yaml",
blocked_by="SERVICE_REGISTRY_BLOCK",
)
except Exception as _guardrail_err:
logger.error("guardrail_check_failed", error=str(_guardrail_err))
# 保守原則:失敗時繼續(不阻擋,但記錄)
# 1. 檢查 Incident 嚴重度
if incident.severity and incident.severity.value in ["P0", "P1"]:
logger.info(

View File

@@ -0,0 +1,116 @@
# apps/api/src/services/preflight_service.py
# Pre-flight 安全檢查服務 (Q2/Q4 決策)
# 撰寫: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
# 架構: leWOOOgo 積木化,依賴 ServiceRegistryClient + VeleroClient
# 參考: ADR-062
from __future__ import annotations
import logging
import time
from dataclasses import dataclass
from enum import Enum
from .service_registry import ServiceRegistryClient, get_service_registry
from .velero_client import VeleroClient, get_velero_client
logger = logging.getLogger(__name__)
class PreflightResult(str, Enum):
    """Possible outcomes of a pre-flight safety check (string-valued enum)."""

    PASS = "PASS"
    ABORT_BACKUP_EXPIRED = "ABORT_BACKUP_EXPIRED"
    ABORT_HIGH_IO = "ABORT_HIGH_IO"
    # The service does not need a pre-flight check.
    SKIP = "SKIP"
@dataclass
class PreflightReport:
    """Outcome of one pre-flight check, as returned by PreflightService.check()."""

    # Overall verdict of the check.
    result: PreflightResult
    # Age of the backup in hours as reported by the Velero client; None when not queried.
    backup_age_hours: float | None = None
    # Name of the emergency backup that was requested; None when none was started.
    backup_name_triggered: str | None = None
    # Human-readable explanation of the verdict.
    reason: str = ""
class PreflightService:
    """
    Pre-flight safety check that runs before a repair touches a service.

    - Only services whose registry entry sets requires_pre_backup=True are checked.
    - A stale backup aborts the repair and requests an emergency backup.
    - A CPU/IO high-load alert aborts without requesting a backup (decision Q4),
      unless the ``block_backup_on_high_io`` backup policy is explicitly disabled.
    """

    # Keywords that are unambiguous enough to match anywhere in the alert name.
    _HIGH_LOAD_SUBSTRINGS = ("cpu", "disk", "memory")
    # Short keywords that must match a whole word of the alert name: as bare
    # substrings, "io" hits "Connection"/"Exception"/"Version" and "load" hits
    # "Upload"/"Download".
    _HIGH_LOAD_WORDS = frozenset({"io", "iowait", "iops", "load"})

    def __init__(
        self,
        registry: ServiceRegistryClient | None = None,
        velero: VeleroClient | None = None,
    ) -> None:
        """Use the injected collaborators, or fall back to the module singletons."""
        self._registry = registry or get_service_registry()
        self._velero = velero or get_velero_client()

    async def check(
        self,
        service_name: str,
        alert_labels: dict | None = None,
    ) -> PreflightReport:
        """
        Run the pre-flight check for one service.

        Args:
            service_name: Registry name (or container name) of the target service.
            alert_labels: Prometheus alert labels; the ``alertname`` label is used
                to decide whether the alert is a CPU/IO high-load alert.

        Returns:
            A PreflightReport whose ``result`` is SKIP, ABORT_HIGH_IO, PASS or
            ABORT_BACKUP_EXPIRED.
        """
        info = self._registry.get_service(service_name)
        if info is None or not info.requires_pre_backup:
            return PreflightReport(result=PreflightResult.SKIP, reason="服務不需要 Pre-flight")

        policies = self._registry.get_backup_policies()

        # Q4: never start a backup while the alert itself reports CPU/IO pressure.
        # The policy flag defaults to True so a registry without it behaves as before.
        if policies.get("block_backup_on_high_io", True) and self._is_high_io_alert(alert_labels):
            logger.warning(f"Pre-flight: {service_name} 屬於 CPU/IO 高負載告警,跳過備份觸發")
            return PreflightReport(
                result=PreflightResult.ABORT_HIGH_IO,
                reason="告警類型為 CPU/IO 高負載禁止觸發備份Q4 決策)",
            )

        max_age = policies.get("velero_max_age_hours", 4)
        age = await self._velero.get_latest_backup_age_hours()
        if age <= max_age:
            return PreflightReport(
                result=PreflightResult.PASS,
                backup_age_hours=age,
                reason=f"備份時間正常 ({age:.1f}h < {max_age}h)",
            )

        # Backup is too old: request an emergency backup and abort the repair.
        backup_name = f"emergency-preflight-{int(time.time())}"
        triggered = await self._velero.trigger_emergency_backup(backup_name)
        return PreflightReport(
            result=PreflightResult.ABORT_BACKUP_EXPIRED,
            backup_age_hours=age,
            backup_name_triggered=backup_name if triggered else None,
            reason=(
                f"備份過期 ({age:.1f}h > {max_age}h)。"
                f"{'緊急備份已啟動: ' + backup_name if triggered else '緊急備份啟動失敗,請人工處理'}"
            ),
        )

    def _is_high_io_alert(self, labels: dict | None) -> bool:
        """
        Tell whether the alert name indicates CPU / IO / disk / load / memory pressure.

        "cpu", "disk" and "memory" match anywhere in the name (case-insensitive).
        "io", "iowait", "iops" and "load" must match a whole word, where words are
        obtained by splitting the name on CamelCase boundaries and non-letters.
        A missing, empty or None ``alertname`` yields False.
        """
        import re  # local import: the module's import block does not provide re

        if not labels:
            return False
        alert_name = str(labels.get("alertname") or "")
        lowered = alert_name.lower()
        if any(keyword in lowered for keyword in self._HIGH_LOAD_SUBSTRINGS):
            return True
        # Split "DiskIOHigh" -> Disk, IO, High and "node_load1" -> node, load, 1.
        words = re.findall(r"[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|\d+", alert_name)
        return any(word.lower() in self._HIGH_LOAD_WORDS for word in words)
# Shared instance, created lazily by get_preflight_service().
_preflight_service: PreflightService | None = None


def get_preflight_service() -> PreflightService:
    """Return the shared PreflightService, constructing it on the first call."""
    global _preflight_service
    if _preflight_service is not None:
        return _preflight_service
    _preflight_service = PreflightService()
    return _preflight_service


def set_preflight_service(service: PreflightService) -> None:
    """Replace the shared instance; meant for injecting a test double (P4 convention)."""
    global _preflight_service
    _preflight_service = service

View File

@@ -0,0 +1,124 @@
# apps/api/src/services/service_registry.py
# Service Registry Client — 讀取 ops/config/service-registry.yaml
# 撰寫: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
# 架構: leWOOOgo 積木化,純 Service 層,無 Router/DB 依賴
# 參考: ADR-062, ADR-063
from __future__ import annotations
import logging
from enum import Enum
from pathlib import Path
from typing import Any
import yaml
logger = logging.getLogger(__name__)
def _find_default_registry_path() -> Path:
    """
    Locate ops/config/service-registry.yaml relative to this source file.

    Every ancestor directory of this file is probed, nearest first, and the
    first one that actually contains the YAML wins. When none does, the path
    under the conventional repo root is returned so that the loader reports a
    clear "file not found" instead of this module failing at import time.
    """
    relative = Path("ops") / "config" / "service-registry.yaml"
    ancestors = list(Path(__file__).resolve().parents)
    for ancestor in ancestors:
        candidate = ancestor / relative
        if candidate.is_file():
            return candidate
    # For <root>/apps/api/src/services/service_registry.py the repo root is
    # parents[4] (services, src, api, apps, <root>); parents[5] would be one
    # level above the repository. Shallow layouts fall back to the top-most
    # ancestor instead of raising IndexError.
    root = ancestors[4] if len(ancestors) > 4 else ancestors[len(ancestors) - 1]
    return root / relative


# Default YAML location (under the repo root).
_DEFAULT_REGISTRY_PATH = _find_default_registry_path()
class StatefulLevel(str, Enum):
    """Protection tier assigned to a service in the registry (string-valued enum)."""

    # Forbidden: alert only.
    BLOCK = "BLOCK"
    # Two-vote MultiSig approval.
    CRITICAL_HITL = "CRITICAL_HITL"
    # Single-vote approval.
    STANDARD_HITL = "STANDARD_HITL"
    # Executed automatically.
    AUTO = "AUTO"
class ServiceInfo:
    """Attribute view over one mapping from the registry's ``services`` list."""

    def __init__(self, data: dict[str, Any]) -> None:
        """
        Copy the known keys of ``data`` onto attributes, applying defaults.

        Raises:
            KeyError: when the mandatory ``name`` key is missing.
            ValueError: when ``stateful_level`` is not a StatefulLevel value.
        """
        read = data.get
        # "name" is the only key without a default.
        self.name: str = data["name"]
        self.display_name: str = read("display_name", self.name)
        self.host: str = read("host", "unknown")
        # Entries that omit the level are treated as AUTO.
        self.stateful_level: StatefulLevel = StatefulLevel(read("stateful_level", "AUTO"))
        self.reason: str = read("reason", "")
        self.alert_only: bool = read("alert_only", False)
        self.requires_pre_backup: bool = read("requires_pre_backup", False)
        self.restart_command: str = read("restart_command", "docker restart")
        self.containers: list[str] = read("containers", [])
class ServiceRegistryClient:
    """
    Read-only client for ops/config/service-registry.yaml.

    Answers "which protection tier does this service have?" lookups.
    Design principle: never writes; when the registry cannot be read, lookups
    fall back to AUTO so that the guardrail does not block the alert flow.
    """

    def __init__(self, registry_path: Path | None = None) -> None:
        """Remember the YAML path; the file is read lazily on first lookup."""
        self._path = registry_path or _DEFAULT_REGISTRY_PATH
        self._services: dict[str, ServiceInfo] = {}
        self._backup_policies: dict[str, Any] = {}
        self._multisig_config: dict[str, Any] = {}
        self._loaded = False

    def _load(self) -> None:
        """
        Parse the YAML once and index services by name and by container name.

        A file that cannot be read or parsed leaves the registry empty. A single
        malformed service entry is logged and skipped, so the remaining entries
        still load.
        """
        if self._loaded:
            return
        # Flag first: a failed load must not be retried on every lookup.
        self._loaded = True
        try:
            # The registry contains non-ASCII text; do not depend on the locale default.
            with open(self._path, encoding="utf-8") as f:
                data = yaml.safe_load(f)
        except Exception as e:
            logger.error(f"Service Registry 載入失敗: {e},所有服務 fallback AUTO")
            return
        if not isinstance(data, dict):
            # Empty document or a non-mapping top level.
            logger.error(
                f"Service Registry 載入失敗: 內容不是 mapping ({type(data).__name__}),所有服務 fallback AUTO"
            )
            return

        for svc in data.get("services") or []:
            try:
                info = ServiceInfo(svc)
            except Exception as e:
                logger.error(f"Service Registry 條目無效,已略過: {svc!r} ({e})")
                continue
            self._services[info.name] = info
            # Also index by container name so either identifier resolves.
            for container in info.containers or []:
                self._services[container] = info
        self._backup_policies = data.get("backup_policies") or {}
        self._multisig_config = data.get("multisig") or {}
        logger.info(f"Service Registry 載入完成: {len(self._services)} 個服務")

    def get_service(self, name: str) -> ServiceInfo | None:
        """Return the entry registered under a service or container name, else None."""
        self._load()
        return self._services.get(name)

    def get_stateful_level(self, service_name: str) -> StatefulLevel:
        """Return the service's tier; unknown services fall back to AUTO (with a warning)."""
        info = self.get_service(service_name)
        if info is None:
            logger.warning(f"未知服務 '{service_name}'fallback AUTO")
            return StatefulLevel.AUTO
        return info.stateful_level

    def is_blocked(self, service_name: str) -> bool:
        """True when the service is in the BLOCK tier."""
        return self.get_stateful_level(service_name) == StatefulLevel.BLOCK

    def requires_multisig(self, service_name: str) -> bool:
        """True when the service is in the CRITICAL_HITL tier."""
        return self.get_stateful_level(service_name) == StatefulLevel.CRITICAL_HITL

    def get_required_votes(self, service_name: str) -> int:
        """Votes needed for the service: the critical count for CRITICAL_HITL, else the standard count."""
        self._load()
        level = self.get_stateful_level(service_name)
        if level == StatefulLevel.CRITICAL_HITL:
            return self._multisig_config.get("critical_required_votes", 2)
        return self._multisig_config.get("standard_required_votes", 1)

    def get_backup_policies(self) -> dict[str, Any]:
        """Return the ``backup_policies`` mapping from the YAML (empty when unavailable)."""
        self._load()
        return self._backup_policies

    def get_restart_command(self, service_name: str) -> str:
        """Return the service's restart command, or "docker restart" for unknown services."""
        info = self.get_service(service_name)
        return info.restart_command if info else "docker restart"
# Process-wide shared client, created lazily by get_service_registry().
_registry_client: ServiceRegistryClient | None = None


def get_service_registry() -> ServiceRegistryClient:
    """Return the shared ServiceRegistryClient, constructing it on the first call."""
    global _registry_client
    if _registry_client is not None:
        return _registry_client
    _registry_client = ServiceRegistryClient()
    return _registry_client


def set_service_registry(client: ServiceRegistryClient) -> None:
    """Replace the shared client; meant for injecting a test double (P4 convention)."""
    global _registry_client
    _registry_client = client

View File

@@ -2654,6 +2654,128 @@ class TelegramGateway:
f"⚠️ 重診觸發失敗: {html.escape(str(e)[:100])}"
)
# =========================================================================
# Sprint 5.1 T1-T6: Data Safety Guardrail 通知場景
# (2026-04-08 Claude Sonnet 4.6 Asia/TaipeiADR-062)
# =========================================================================
async def send_guardrail_blocked(
    self,
    service_name: str,
    alertname: str,
    reason: str,
) -> None:
    """T1 (GUARDRAIL_BLOCKED): tell operators that auto-repair was refused for a service.

    All three arguments are HTML-escaped before being embedded in the message.
    """
    rule = "━━━━━━━━━━━━━━━━━"
    safe_service = html.escape(service_name)
    safe_alert = html.escape(alertname)
    safe_reason = html.escape(reason)
    message_lines = [
        "🚫 <b>[服務保護] 自動修復已阻擋</b>",
        rule,
        f"服務: <code>{safe_service}</code>",
        f"告警: <code>{safe_alert}</code>",
        f"原因: {safe_reason}",
        rule,
        "⚠️ 請人工評估並手動處理",
    ]
    await self.send_notification("\n".join(message_lines))
async def send_preflight_failed(
    self,
    service_name: str,
    backup_age_hours: float,
    max_age_hours: float,
    backup_name: str | None,
) -> None:
    """T2 (PRE_FLIGHT_FAILED + BACKUP_TRIGGERED): backup is stale, repair is paused.

    ``backup_name`` is the emergency backup that was started; a falsy value
    makes the message report that starting the backup failed.
    """
    if backup_name:
        backup_status = f"緊急備份: 已啟動 <code>{html.escape(backup_name)}</code>"
    else:
        backup_status = "緊急備份: <b>啟動失敗</b>,請人工處理"

    rule = "━━━━━━━━━━━━━━━━━"
    message_lines = [
        "⏸ <b>[Pre-flight 阻擋] 備份已過期,修復暫停</b>",
        rule,
        f"服務: <code>{html.escape(service_name)}</code>",
        f"備份距今: {backup_age_hours:.1f} 小時(上限 {max_age_hours:.0f} 小時)",
        backup_status,
        rule,
        "請等待備份完成後,人工重新評估修復方案",
    ]
    await self.send_notification("\n".join(message_lines))
async def send_backup_result(
    self,
    backup_name: str,
    success: bool,
    error_msg: str | None = None,
) -> None:
    """T3 (BACKUP_COMPLETED / BACKUP_FAILED): report how the emergency backup ended.

    On failure a missing ``error_msg`` is shown as "未知錯誤".
    """
    backup_line = f"備份: <code>{html.escape(backup_name)}</code>"
    if not success:
        failure_lines = [
            "❌ <b>緊急備份失敗</b>",
            backup_line,
            f"錯誤: {html.escape(error_msg or '未知錯誤')}",
            "請人工介入,備份異常",
        ]
        await self.send_notification("\n".join(failure_lines))
        return
    success_lines = [
        "✅ <b>緊急備份完成</b>",
        backup_line,
        "可繼續手動執行修復",
    ]
    await self.send_notification("\n".join(success_lines))
async def send_multisig_waiting(
    self,
    action: str,
    service_name: str,
    votes_received: int,
    votes_required: int,
    approval_id: str,
) -> None:
    """T4 (APPROVAL_ESCALATED): first vote is in, a second approver is still needed.

    ``action``, ``service_name`` and ``approval_id`` are HTML-escaped; the vote
    counters are rendered as ``received/required``.
    """
    rule = "━━━━━━━━━━━━━━━━━"
    progress = f"{votes_received}/{votes_required}"
    message_lines = [
        "🔐 <b>[MultiSig] 等待第 2 票授權</b>",
        rule,
        f"操作: {html.escape(action)}",
        f"服務: <code>{html.escape(service_name)}</code>",
        "風險: CRITICALHITL 雙簽)",
        f"已獲授權: {progress}",
        f"審核 ID: <code>{html.escape(approval_id)}</code>",
        rule,
        "請第二位審核者登入確認",
    ]
    await self.send_notification("\n".join(message_lines))
async def send_multisig_approved(
    self,
    action: str,
    service_name: str,
    votes_received: int = 2,
    votes_required: int = 2,
) -> None:
    """T5: MultiSig approval is complete and execution is starting.

    Args:
        action: Description of the approved operation (HTML-escaped).
        service_name: Target service (HTML-escaped).
        votes_received: Votes collected; defaults to 2 so existing callers
            still produce the original "2/2" text.
        votes_required: Votes that were required; defaults to 2. Callers should
            pass the real value when the registry's required vote count is not 2.
    """
    text = (
        "✅ <b>[MultiSig 完成] 雙簽授權通過</b>\n"
        f"操作: {html.escape(action)}\n"
        f"服務: <code>{html.escape(service_name)}</code>\n"
        f"授權: {votes_received}/{votes_required} 票 開始執行..."
    )
    await self.send_notification(text)
async def send_change_applied(
    self,
    operator: str,
    action_description: str,
    timestamp: str,
) -> None:
    """T6 (CHANGE_APPLIED): record a manual change in the notification channel.

    All three arguments are HTML-escaped before being embedded in the message.
    """
    fields = (
        ("操作者", operator),
        ("動作", action_description),
        ("時間", timestamp),
    )
    message_lines = [
        "📝 <b>[變更記錄] 手動操作已記錄</b>",
        "━━━━━━━━━━━━━━━━━",
    ]
    for label, value in fields:
        message_lines.append(f"{label}: {html.escape(value)}")
    await self.send_notification("\n".join(message_lines))
async def send_notification(
self,
text: str,

View File

@@ -0,0 +1,113 @@
# apps/api/src/services/velero_client.py
# Velero Backup 查詢客戶端 (kubectl 方式Q7 決策)
# 撰寫: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
# 架構: leWOOOgo 積木化,純 Service 層
# 參考: ADR-062
from __future__ import annotations
import asyncio
import json
import logging
import time
from datetime import UTC, datetime
logger = logging.getLogger(__name__)
_VELERO_NAMESPACE = "velero"
_KUBECTL_TIMEOUT = 30 # 秒
class VeleroClient:
"""
透過 kubectl 查詢 Velero 備份狀態
設計原則: 失敗時 fallback「假設備份過期」保守原則
"""
async def get_latest_backup_age_hours(self) -> float:
"""
查詢最近一次 Completed 備份距今幾小時
失敗時返回 999.0(視為嚴重過期,觸發 Abort
"""
try:
result = await asyncio.wait_for(
self._run_kubectl(
["get", "backup", "-n", _VELERO_NAMESPACE,
"-o", "json", "--field-selector", "status.phase=Completed"]
),
timeout=_KUBECTL_TIMEOUT,
)
data = json.loads(result)
items = data.get("items", [])
if not items:
logger.warning("Velero: 找不到任何 Completed 備份")
return 999.0
latest = max(
items,
key=lambda x: x.get("status", {}).get("completionTimestamp", ""),
)
completion_ts = latest["status"].get("completionTimestamp", "")
if not completion_ts:
return 999.0
completed_at = datetime.fromisoformat(completion_ts.replace("Z", "+00:00"))
age = (datetime.now(UTC) - completed_at).total_seconds() / 3600
logger.info(f"Velero 最近備份: {completion_ts},距今 {age:.1f} 小時")
return age
except asyncio.TimeoutError:
logger.error("Velero kubectl 查詢超時")
return 999.0
except Exception as e:
logger.error(f"Velero 查詢失敗: {e}")
return 999.0
async def trigger_emergency_backup(self, backup_name: str | None = None) -> bool:
"""
觸發緊急備份(非同步,不等待完成)
返回 True 表示指令已成功發送
"""
name = backup_name or f"emergency-{int(time.time())}"
try:
await asyncio.wait_for(
self._run_kubectl([
"create", "backup", name,
"-n", _VELERO_NAMESPACE,
"--include-namespaces", "awoooi-prod",
"--wait=false",
]),
timeout=_KUBECTL_TIMEOUT,
)
logger.info(f"Velero 緊急備份已啟動: {name}")
return True
except Exception as e:
logger.error(f"Velero 緊急備份失敗: {e}")
return False
async def _run_kubectl(self, args: list[str]) -> str:
proc = await asyncio.create_subprocess_exec(
"kubectl", *args,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
stdout, stderr = await proc.communicate()
if proc.returncode != 0:
raise RuntimeError(f"kubectl 失敗: {stderr.decode()}")
return stdout.decode()
# Shared instance, created lazily by get_velero_client().
_velero_client: VeleroClient | None = None


def get_velero_client() -> VeleroClient:
    """Return the shared VeleroClient, constructing it on the first call."""
    global _velero_client
    if _velero_client is not None:
        return _velero_client
    _velero_client = VeleroClient()
    return _velero_client


def set_velero_client(client: VeleroClient) -> None:
    """Replace the shared client; meant for injecting a test double (P4 convention)."""
    global _velero_client
    _velero_client = client

View File

@@ -0,0 +1,36 @@
# k8s/rbac/api-velero-reader.yaml
# API Pod 讀取 Velero backup 資源的 RBAC
# Sprint 5.1 K-001 / 2026-04-08 Asia/Taipei
# 說明: awoooi-executor ServiceAccount 需要讀取 velero namespace 的 backup 資源
# 用於 Pre-flight Check 查詢最近備份時間(Q7 決策:kubectl 方式)
# 注意: ServiceAccount 名稱為 awoooi-executor(非 awoooi-api,經 L0 確認)
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: awoooi-velero-backup-reader
labels:
app: awoooi
component: api
sprint: "5.1"
rules:
- apiGroups: ["velero.io"]
resources: ["backups"]
verbs: ["get", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: awoooi-velero-backup-reader
labels:
app: awoooi
component: api
sprint: "5.1"
subjects:
- kind: ServiceAccount
name: awoooi-executor
namespace: awoooi-prod
roleRef:
kind: ClusterRole
name: awoooi-velero-backup-reader
apiGroup: rbac.authorization.k8s.io

View File

@@ -0,0 +1,201 @@
# ops/config/service-registry.yaml
# Service Registry — 服務 Stateful 分級清單
# 版本: 1.0.0
# 建立: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
# 維護: 修改需 PR + 統帥審核,禁止直接 push
# 說明:
# BLOCK = 系統禁止自動修復,僅告警(資料風險最高)
# CRITICAL_HITL = 允許 Playbook但需 MultiSig 2票
# STANDARD_HITL = 允許 Playbook需 1票審核
# AUTO = 允許自動執行(無狀態服務)
# 參考: ADR-062, ADR-063
services:
# ─── BLOCK(系統禁止,連 Playbook 都不提供)────────────────────────────
- name: postgres
display_name: "PostgreSQL 主庫 (awoooi_prod)"
host: "192.168.0.188"
stateful_level: BLOCK
reason: "主要業務資料庫,重啟可能導致 WAL 截斷、事務回滾"
alert_only: true
containers: ["postgres"]
- name: momo-db
display_name: "PostgreSQL (momo_db)"
host: "192.168.0.188"
stateful_level: BLOCK
reason: "momo 產品資料庫,禁止自動操作"
alert_only: true
containers: ["momo-db"]
- name: langfuse-db
display_name: "PostgreSQL (Langfuse)"
host: "192.168.0.110"
stateful_level: BLOCK
reason: "LLM trace 資料庫,重啟導致追蹤資料遺失"
alert_only: true
containers: ["langfuse-db"]
- name: harbor-db
display_name: "PostgreSQL (Harbor Registry)"
host: "192.168.0.110"
stateful_level: BLOCK
reason: "Harbor Registry 資料庫,重啟可能損壞 image layer 索引"
alert_only: true
containers: ["harbor-db"]
- name: sentry-postgres
display_name: "PostgreSQL (Sentry)"
host: "192.168.0.110"
stateful_level: BLOCK
reason: "Sentry 錯誤追蹤資料庫"
alert_only: true
containers: ["sentry-postgres"]
- name: signoz-clickhouse
display_name: "ClickHouse (SignOz)"
host: "192.168.0.188"
stateful_level: BLOCK
reason: "列欄式 OLAP 資料庫,寫入中重啟可能損壞列欄檔案"
alert_only: true
containers: ["signoz-clickhouse"]
# ─── CRITICAL_HITL(高風險,需 MultiSig 2票)──────────────────────────
- name: redis
display_name: "Redis (AWOOOI)"
host: "192.168.0.188"
stateful_level: CRITICAL_HITL
reason: "AWOOOI 依賴 Redis 做冪等鎖與快取,重啟丟失鎖狀態"
requires_pre_backup: false
containers: ["redis"]
- name: harbor-redis
display_name: "Redis (Harbor)"
host: "192.168.0.110"
stateful_level: CRITICAL_HITL
reason: "Harbor session 快取"
containers: ["harbor-redis"]
- name: sentry-redis
display_name: "Redis (Sentry)"
host: "192.168.0.110"
stateful_level: CRITICAL_HITL
reason: "Sentry 任務佇列"
containers: ["sentry-redis"]
- name: gitea
display_name: "Gitea (程式碼倉庫)"
host: "192.168.0.110"
stateful_level: CRITICAL_HITL
reason: "restart 會殺掉活躍 SSH sessionGit push 中斷可能損壞 working copy"
requires_pre_backup: false
containers: ["gitea"]
- name: harbor
display_name: "Harbor (Container Registry)"
host: "192.168.0.110"
stateful_level: CRITICAL_HITL
reason: "重啟中斷 pull/pushGC 進行中重啟可能損壞 layer"
requires_pre_backup: false
containers: ["harbor-core", "harbor-jobservice", "harbor-portal"]
- name: minio
display_name: "MinIO (物件存儲)"
host: "192.168.0.188"
stateful_level: CRITICAL_HITL
reason: "寫入中重啟可能導致 multipart upload 中斷"
requires_pre_backup: false
containers: ["minio"]
# ─── STANDARD_HITL(中風險,需 1票審核)──────────────────────────────
- name: prometheus
display_name: "Prometheus"
host: "192.168.0.110"
stateful_level: STANDARD_HITL
reason: "有 TSDB WALexited 狀態用 docker start非 restart"
restart_command: "docker start"
containers: ["prometheus"]
- name: grafana
display_name: "Grafana"
host: "192.168.0.110"
stateful_level: STANDARD_HITL
reason: "有 SQLite 設定儲存exited 用 docker start"
restart_command: "docker start"
containers: ["grafana"]
- name: alertmanager
display_name: "Alertmanager"
host: "192.168.0.110"
stateful_level: STANDARD_HITL
reason: "有 silence 狀態exited 用 docker start"
restart_command: "docker start"
containers: ["alertmanager"]
# ─── AUTO(無狀態,允許自動修復)──────────────────────────────────────
- name: nginx
display_name: "Nginx (反向代理)"
host: "192.168.0.110"
stateful_level: AUTO
containers: ["nginx", "nginx-188"]
- name: awoooi-api
display_name: "AWOOOI API (K3s)"
host: "k3s"
stateful_level: AUTO
containers: []
- name: awoooi-web
display_name: "AWOOOI Web (K3s)"
host: "k3s"
stateful_level: AUTO
containers: []
- name: blackbox-exporter
display_name: "Blackbox Exporter"
host: "192.168.0.110"
stateful_level: AUTO
containers: ["blackbox-exporter"]
- name: langfuse
display_name: "Langfuse (LLMOps)"
host: "192.168.0.110"
stateful_level: AUTO
containers: ["langfuse-web", "langfuse-worker"]
- name: ollama
display_name: "Ollama (Local LLM)"
host: "192.168.0.188"
stateful_level: AUTO
containers: ["ollama"]
- name: momo-app
display_name: "momo Web App"
host: "192.168.0.188"
stateful_level: AUTO
containers: ["momo-app"]
- name: tsenyang-website
display_name: "Tsenyang Website"
host: "192.168.0.188"
stateful_level: AUTO
containers: ["tsenyang-website"]
- name: stock-platform
display_name: "Stock Platform"
host: "192.168.0.110"
stateful_level: AUTO
containers: ["stock-platform"]
# ─── 備份策略參考 ────────────────────────────────────────────────────────
backup_policies:
velero_max_age_hours: 4 # Velero 備份過期閾值Q2 決策)
emergency_backup_timeout: 600 # 緊急備份超時秒數
block_backup_on_high_io: true # CPU/IO > 80% 時禁止觸發備份Q4 決策)
io_threshold_percent: 80
# ─── MultiSig 設定 ───────────────────────────────────────────────────────
multisig:
critical_required_votes: 2 # CRITICAL_HITL 需要幾票
standard_required_votes: 1 # STANDARD_HITL 需要幾票
vote_expiry_minutes: 30 # 投票有效期

View File

@@ -1,11 +1,13 @@
#!/usr/bin/env bash
# docker-health-monitor.sh
# Plan A: Docker 容器健康監控 + 自動修復
# Sprint 5.1 L4-6: 純感知層(偵測→送 Webhook禁止任何修復動作
#
# 部署: cron */5 * * * * /opt/awoooi-ops/docker-health-monitor.sh >> /var/log/docker-health-monitor.log 2>&1
# 設定: /etc/awoooi-ops/secrets.env
# 撰寫: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
# 首席架構裁示: Intent→Action→Result 三段式,禁止靜默修復
# 架構裁示: Route B — 腳本只感知,所有修復決策由 AWOOOI API 執行ADR-062
# 注意: 禁止在此腳本中執行 docker restart / docker start
# 所有修復動作由 AWOOOI API Guardrail + Playbook + Approval 鏈路處理
set -euo pipefail
@@ -19,48 +21,45 @@ fi
: "${AWOOOI_API_URL:=https://awoooi.wooo.work}"
: "${TELEGRAM_BOT_TOKEN:=}"
: "${TELEGRAM_CHAT_ID:=}"
: "${WEBHOOK_HMAC_SECRET:=}"
: "${COOLDOWN_SECONDS:=300}"
: "${LOG_FILE:=/var/log/docker-health-monitor.log}"
# 冷卻期:避免同一容器在短時間內重複發送 webhook去重非修復冷卻
: "${SEND_COOLDOWN_SECONDS:=300}"
: "${COOLDOWN_DIR:=/tmp/docker-health-monitor-cooldown}"
mkdir -p "$COOLDOWN_DIR"
# ─── 排除清單(禁止自動修復)───────────────────────────────────────────────
# 判斷方式: echo ":list:" | grep -q ":name:"
# 分類一:資料庫 — 禁止 restart
EXCLUDED_DB_LIST=":postgres:momo-db:langfuse-db:harbor-db:sentry-postgres:signoz-clickhouse:"
# 分類二Redis — 禁止 restart
EXCLUDED_REDIS_LIST=":redis:harbor-redis:sentry-redis:"
# 分類三:監控棧 exited → docker start保護 WAL
MONITORING_START_ONLY_LIST=":prometheus:grafana:alertmanager:"
# 分類四:監控棧 其他 → 僅告警
EXCLUDED_MONITORING_LIST=":blackbox-exporter:signoz-otel-collector:"
# 分類五:關鍵系統 — 永遠禁止Gitea restart 會殺活躍 SSH
EXCLUDED_CRITICAL_LIST=":gitea:"
# ─── 工具函數 ────────────────────────────────────────────────────────────────
# Write one log line to stdout: "[<local time with UTC offset>] <all arguments>".
log() {
  printf '%s\n' "[$(date '+%Y-%m-%d %H:%M:%S %z')] $*"
}
in_list() {
local name=":${1}:"
local list="$2"
[[ "$list" == *"$name"* ]]
#######################################
# Send-cooldown check: avoid sending the webhook for the same container again
# within a short time.
# A stamp file that is empty, unreadable, non-numeric, or dated in the future
# counts as "not in cooldown", so a corrupt file can neither abort the script
# nor suppress alerts indefinitely.
# Globals:   COOLDOWN_DIR (read), SEND_COOLDOWN_SECONDS (read)
# Arguments: $1 - container name
# Outputs:   one line via log() when the container is still cooling down
# Returns:   0 if the last send is within the cooldown window, 1 otherwise
#######################################
is_in_send_cooldown() {
  local container="$1"
  local cooldown_file="${COOLDOWN_DIR}/${container}.cooldown"
  local last_sent="" now elapsed

  [[ -f "$cooldown_file" ]] || return 1

  # read returns non-zero at EOF without a newline but still fills the variable.
  { IFS= read -r last_sent < "$cooldown_file"; } 2>/dev/null || true
  [[ "$last_sent" =~ ^[0-9]+$ ]] || return 1

  now=$(date +%s)
  # 10# forces base 10 so a leading zero is not parsed as octal.
  elapsed=$(( now - 10#$last_sent ))
  if (( elapsed >= 0 && elapsed < SEND_COOLDOWN_SECONDS )); then
    log "COOLDOWN: ${container} 距上次通知 ${elapsed}s跳過冷卻期 ${SEND_COOLDOWN_SECONDS}s"
    return 0
  fi
  return 1
}
# 計算 HMAC-SHA256 簽章
sign_payload() {
local payload="$1"
printf '%s' "$payload" | openssl dgst -sha256 -hmac "$WEBHOOK_HMAC_SECRET" -binary | xxd -p -c 256
# Record "now" (Unix seconds) as the last send time for a container.
# Globals:   COOLDOWN_DIR (read)
# Arguments: $1 - container name
set_send_cooldown() {
  local container="$1"
  local stamp_file="${COOLDOWN_DIR}/${container}.cooldown"
  printf '%s\n' "$(date +%s)" > "$stamp_file"
}
# 傳送 TelegramFallbackAWOOOI API down 時直接呼叫 Bot API
# FallbackAWOOOI API down 時直接呼叫 Telegram Bot API
send_telegram_direct() {
local message="$1"
if [[ -z "$TELEGRAM_BOT_TOKEN" || -z "$TELEGRAM_CHAT_ID" ]]; then
log "WARN: Telegram 未設定,跳過通知"
log "WARN: Telegram 未設定,跳過 Fallback"
return 0
fi
curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
@@ -69,186 +68,113 @@ send_telegram_direct() {
> /dev/null 2>&1 || true
}
# 傳送 AWOOOI Webhook (若失敗則 Fallback 至 Telegram Bot API)
send_awoooi_alert() {
local title="$1"
local message="$2"
local severity="${3:-WARNING}"
local source="docker-health-monitor"
# 傳送 Alertmanager 格式 Webhook 到 AWOOOI API
# 使用現有端點 /api/v1/webhooks/alertmanager內網免 HMAC
send_to_awoooi() {
local container="$1"
local status="$2" # unhealthy | exited | dead
local hostname
hostname=$(hostname)
local now_ts
now_ts=$(date -u '+%Y-%m-%dT%H:%M:%SZ')
# 組裝 Alertmanager 格式 JSON符合現有 AlertmanagerPayload schema
local payload
payload=$(printf '{"title":"%s","message":"%s","severity":"%s","source":"%s","labels":{"monitor":"docker-health-monitor","plan":"A"}}' \
"$title" "$message" "$severity" "$source")
local timestamp
timestamp=$(date -u +%s)
local signature
signature=$(sign_payload "${timestamp}${payload}")
payload=$(cat <<JSON
{
"version": "4",
"groupKey": "docker-health-${hostname}-${container}",
"status": "firing",
"alerts": [{
"status": "firing",
"labels": {
"alertname": "DockerContainerUnhealthy",
"container": "${container}",
"host": "${hostname}",
"layer": "docker",
"severity": "warning",
"auto_repair": "true",
"source": "docker-health-monitor"
},
"annotations": {
"summary": "容器 ${container} 狀態異常: ${status}",
"description": "主機 ${hostname} 容器 ${container} 偵測狀態=${status},由 docker-health-monitor 感知層回報"
},
"startsAt": "${now_ts}"
}]
}
JSON
)
local http_code
http_code=$(curl -s -o /dev/null -w "%{http_code}" \
-X POST "${AWOOOI_API_URL}/api/v1/webhooks/custom-alert" \
-X POST "${AWOOOI_API_URL}/api/v1/webhooks/alertmanager" \
-H "Content-Type: application/json" \
-H "X-Timestamp: ${timestamp}" \
-H "X-Signature: sha256=${signature}" \
-d "$payload" \
--connect-timeout 10 \
--max-time 30 2>/dev/null) || http_code="0"
if [[ "$http_code" != "200" && "$http_code" != "202" ]]; then
if [[ "$http_code" == "200" || "$http_code" == "202" ]]; then
log "SENT: ${container} 狀態=${status} → AWOOOI API (${http_code})"
set_send_cooldown "$container"
else
log "WARN: AWOOOI API 回應 ${http_code}Fallback 到 Telegram Bot API"
send_telegram_direct "[docker-health-monitor Fallback]&#10;${title}&#10;${message}"
send_telegram_direct "🚨 [docker-health-monitor Fallback]&#10;主機: ${hostname}&#10;容器: ${container}&#10;狀態: ${status}&#10;(API 不可達,請人工處理)"
set_send_cooldown "$container"
fi
}
#######################################
# Cooldown check: avoids repeatedly repairing the same container within a
# short window.
# Globals:   COOLDOWN_DIR (read), COOLDOWN_SECONDS (read)
# Arguments: $1 - container name
# Outputs:   log lines via log()
# Returns:   0 if the container is still inside the cooldown window,
#            1 otherwise.
#######################################
is_in_cooldown() {
  local container="$1"
  local cooldown_file="${COOLDOWN_DIR}/${container}.cooldown"
  local last_repair now elapsed

  # No stamp file means nothing has been recorded for this container.
  [[ -f "$cooldown_file" ]] || return 1

  last_repair=$(<"$cooldown_file")

  # An empty or corrupt stamp would otherwise break the arithmetic below;
  # treat it as expired instead of failing.
  if [[ ! "$last_repair" =~ ^[0-9]+$ ]]; then
    log "WARN: ${container} cooldown stamp is empty or invalid, ignoring it"
    return 1
  fi

  now=$(date +%s)
  # 10# forces base 10 so a stamp with leading zeros is not parsed as octal.
  elapsed=$(( now - 10#$last_repair ))

  # A negative elapsed value means the stamp lies in the future (clock moved
  # backwards); such a stamp is treated as expired.
  if (( elapsed >= 0 && elapsed < COOLDOWN_SECONDS )); then
    log "COOLDOWN: ${container} 仍在冷卻期 (${elapsed}s / ${COOLDOWN_SECONDS}s)"
    return 0
  fi
  return 1
}
# Record "now" (epoch seconds) in the cooldown stamp file of a container.
# Globals:   COOLDOWN_DIR (read)
# Arguments: $1 - container name
# Outputs:   overwrites ${COOLDOWN_DIR}/<container>.cooldown
set_cooldown() {
  local container="$1"
  local stamp_file="${COOLDOWN_DIR}/${container}.cooldown"
  local epoch_now
  epoch_now=$(date +%s)
  printf '%s\n' "$epoch_now" > "$stamp_file"
}
# ─── 核心:處理不健康容器 ───────────────────────────────────────────────────
handle_unhealthy_container() {
local container="$1"
local status="$2" # unhealthy | exited | dead
# ─── 核心:掃描所有容器 ─────────────────────────────────────────────────────
check_containers() {
local hostname
hostname=$(hostname)
log "DETECTED: ${container} 狀態=${status} on ${hostname}"
# 取得所有容器(含停止的)
while IFS=$'\t' read -r container_id container_name state health; do
# 跳過 header 或空行
[[ -z "$container_name" ]] && continue
# ── 排除清單判斷 ─────────────────────────────────────────────────────────
if in_list "$container" "$EXCLUDED_CRITICAL_LIST"; then
log "SKIP: ${container} 屬於關鍵排除清單 (Gitea),僅告警"
send_awoooi_alert \
"[${hostname}] 關鍵服務異常: ${container}" \
"容器 ${container} 狀態=${status}。此服務禁止自動修復,請人工處理!" \
"CRITICAL"
return
fi
if in_list "$container" "$EXCLUDED_DB_LIST"; then
log "SKIP: ${container} 屬於資料庫排除清單,僅告警"
send_awoooi_alert \
"[${hostname}] 資料庫容器異常: ${container}" \
"容器 ${container} 狀態=${status}。資料庫禁止自動修復,需人工介入!" \
"CRITICAL"
return
fi
if in_list "$container" "$EXCLUDED_REDIS_LIST"; then
log "SKIP: ${container} 屬於 Redis 排除清單,僅告警"
send_awoooi_alert \
"[${hostname}] Redis 容器異常: ${container}" \
"容器 ${container} 狀態=${status}。Redis 禁止自動修復,需人工介入!" \
"CRITICAL"
return
fi
if in_list "$container" "$EXCLUDED_MONITORING_LIST"; then
log "SKIP: ${container} 屬於監控棧排除清單,僅告警"
send_awoooi_alert \
"[${hostname}] 監控元件異常: ${container}" \
"容器 ${container} 狀態=${status}。請人工處理。" \
"WARNING"
return
fi
# ── 冷卻期判斷 ────────────────────────────────────────────────────────────
if is_in_cooldown "$container"; then
log "SKIP: ${container} 在冷卻期內,跳過本次修復"
return
fi
# ── 決定修復動作 ─────────────────────────────────────────────────────────
local action_cmd="docker restart"
local action_desc="docker restart"
if in_list "$container" "$MONITORING_START_ONLY_LIST" && [[ "$status" == "exited" ]]; then
action_cmd="docker start"
action_desc="docker start保護 WAL非 restart"
fi
# ── Phase 1: Intent決策意圖通知──────────────────────────────────────
log "INTENT: 即將對 ${container} 執行 ${action_desc}"
send_awoooi_alert \
"[${hostname}] 自動修復 Intent: ${container}" \
"偵測到容器 ${container} 狀態=${status}。即將執行 ${action_desc}2 秒後開始修復。" \
"WARNING"
sleep 2
# ── Phase 2: Action執行修復──────────────────────────────────────────
log "ACTION: 執行 ${action_cmd} ${container}"
set_cooldown "$container"
local repair_ok=false
if $action_cmd "$container" >> "$LOG_FILE" 2>&1; then
repair_ok=true
fi
# ── Phase 3: Result執行結果通知──────────────────────────────────────
if $repair_ok; then
log "RESULT: ${container} 修復成功"
send_awoooi_alert \
"[${hostname}] 自動修復成功: ${container}" \
"容器 ${container} 已透過 ${action_desc} 成功恢復。原狀態=${status}" \
"INFO"
else
log "RESULT: ${container} 修復失敗!需人工介入"
send_awoooi_alert \
"[${hostname}] 自動修復失敗: ${container}" \
"容器 ${container} 執行 ${action_desc} 失敗!原狀態=${status}。需人工介入!" \
"CRITICAL"
fi
}
# ─── 主流程 ──────────────────────────────────────────────────────────────────
main() {
log "===== docker-health-monitor 啟動 (host=$(hostname)) ====="
# 取得所有容器狀態
# docker ps -a 格式: Names / Health / State
while IFS=$'\t' read -r name health_status container_status; do
[[ -z "$name" ]] && continue
local needs_repair=false
local needs_alert=false
local detected_status=""
if [[ "$health_status" == "unhealthy" ]]; then
needs_repair=true
# 偵測 exited / dead
if [[ "$state" == "exited" || "$state" == "dead" ]]; then
needs_alert=true
detected_status="$state"
fi
# 偵測 unhealthy (health check 存在且失敗)
if [[ "$health" == "unhealthy" ]]; then
needs_alert=true
detected_status="unhealthy"
elif [[ "$container_status" == "exited" || "$container_status" == "dead" ]]; then
needs_repair=true
detected_status="$container_status"
elif [[ "$health_status" == "starting" ]]; then
log "INFO: ${name} health=starting等待中跳過"
continue
fi
if $needs_repair; then
handle_unhealthy_container "$name" "$detected_status"
if $needs_alert; then
log "DETECTED: ${container_name} 狀態=${detected_status} on ${hostname}"
# 冷卻期去重
if is_in_send_cooldown "$container_name"; then
continue
fi
# 送 Webhook — 只感知,不修復
send_to_awoooi "$container_name" "$detected_status"
fi
done < <(docker ps -a --format '{{.ID}}\t{{.Names}}\t{{.State}}\t{{.Status}}' | \
awk -F'\t' '{
health = ""
if ($4 ~ /\(unhealthy\)/) health = "unhealthy"
else if ($4 ~ /\(healthy\)/) health = "healthy"
print $1 "\t" $2 "\t" $3 "\t" health
}')
}
done < <(docker ps -a --format '{{.Names}} {{.Health}} {{.State}}' 2>/dev/null)
log "===== docker-health-monitor 完成 ====="
# ─── Main ───────────────────────────────────────────────────────────────────
# Entry point: logs a start banner, runs one container scan through
# check_containers, then logs a completion banner.
main() {
  local -r start_banner="=== docker-health-monitor 感知層啟動 (純感知,禁止修復) ==="
  local -r done_banner="=== 掃描完成 ==="

  log "$start_banner"
  check_containers
  log "$done_banner"
}
main "$@"