Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
NetworkPolicy: 新增 192.168.0.188:22 egress — repair-bot-188.sh 執行路徑 service-registry.yaml: 新增 signoz/bitan-app (AUTO, 188主機) 修復覆蓋: Bug #11 補完 (188 SSH) + 188 服務分級覆蓋 E2E 驗證: MoWoooWorkDown → SSH → REPAIR_OK:momo-app (3791ms) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
222 lines
7.3 KiB
YAML
222 lines
7.3 KiB
YAML
# ops/config/service-registry.yaml
# Service Registry — stateful-tier classification of services
# Version: 1.0.0
# Created: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
# Maintenance: changes require a PR plus commander (統帥) review; direct pushes are forbidden
# Levels:
#   BLOCK          = automatic repair is forbidden; alert only (highest data risk)
#   CRITICAL_HITL  = Playbook allowed, but requires 2 MultiSig votes
#   STANDARD_HITL  = Playbook allowed, requires 1 review vote
#   AUTO           = automatic execution allowed (stateless services)
# References: ADR-062, ADR-063

services:
# ─── BLOCK: forbidden by the system (not even a Playbook is offered) ────────
# Every entry in this tier sets alert_only: true.
- name: postgres
  display_name: "PostgreSQL 主庫 (awoooi_prod)"
  host: "192.168.0.188"
  stateful_level: BLOCK
  reason: "主要業務資料庫,重啟可能導致 WAL 截斷、事務回滾"
  alert_only: true
  containers: ["postgres"]

- name: momo-db
  display_name: "PostgreSQL (momo_db)"
  host: "192.168.0.188"
  stateful_level: BLOCK
  reason: "momo 產品資料庫,禁止自動操作"
  alert_only: true
  containers: ["momo-db"]

- name: langfuse-db
  display_name: "PostgreSQL (Langfuse)"
  host: "192.168.0.110"
  stateful_level: BLOCK
  reason: "LLM trace 資料庫,重啟導致追蹤資料遺失"
  alert_only: true
  containers: ["langfuse-db"]

- name: harbor-db
  display_name: "PostgreSQL (Harbor Registry)"
  host: "192.168.0.110"
  stateful_level: BLOCK
  reason: "Harbor Registry 資料庫,重啟可能損壞 image layer 索引"
  alert_only: true
  containers: ["harbor-db"]

- name: sentry-postgres
  display_name: "PostgreSQL (Sentry)"
  host: "192.168.0.110"
  stateful_level: BLOCK
  reason: "Sentry 錯誤追蹤資料庫"
  alert_only: true
  containers: ["sentry-postgres"]

- name: signoz-clickhouse
  display_name: "ClickHouse (SignOz)"
  host: "192.168.0.188"
  stateful_level: BLOCK
  reason: "列欄式 OLAP 資料庫,寫入中重啟可能損壞列欄檔案"
  alert_only: true
  containers: ["signoz-clickhouse"]

# ─── CRITICAL_HITL: high risk, requires 2 MultiSig votes ────────────────────
- name: redis
  display_name: "Redis (AWOOOI)"
  host: "192.168.0.188"
  stateful_level: CRITICAL_HITL
  reason: "AWOOOI 依賴 Redis 做冪等鎖與快取,重啟丟失鎖狀態"
  requires_pre_backup: false
  containers: ["redis"]

# NOTE(review): harbor-redis and sentry-redis omit requires_pre_backup, while
# redis, gitea, harbor and minio in this tier set it explicitly to false.
# Confirm the consumer's default for a missing key is the intended behavior.
- name: harbor-redis
  display_name: "Redis (Harbor)"
  host: "192.168.0.110"
  stateful_level: CRITICAL_HITL
  reason: "Harbor session 快取"
  containers: ["harbor-redis"]

- name: sentry-redis
  display_name: "Redis (Sentry)"
  host: "192.168.0.110"
  stateful_level: CRITICAL_HITL
  reason: "Sentry 任務佇列"
  containers: ["sentry-redis"]

- name: gitea
  display_name: "Gitea (程式碼倉庫)"
  host: "192.168.0.110"
  stateful_level: CRITICAL_HITL
  reason: "restart 會殺掉活躍 SSH session,Git push 中斷可能損壞 working copy"
  requires_pre_backup: false
  containers: ["gitea"]

- name: harbor
  display_name: "Harbor (Container Registry)"
  host: "192.168.0.110"
  stateful_level: CRITICAL_HITL
  reason: "重啟中斷 pull/push;GC 進行中重啟可能損壞 layer"
  requires_pre_backup: false
  containers: ["harbor-core", "harbor-jobservice", "harbor-portal"]

- name: minio
  display_name: "MinIO (物件存儲)"
  host: "192.168.0.188"
  stateful_level: CRITICAL_HITL
  reason: "寫入中重啟可能導致 multipart upload 中斷"
  requires_pre_backup: false
  containers: ["minio"]

# ─── STANDARD_HITL: medium risk, requires 1 review vote ─────────────────────
# Every entry in this tier sets restart_command to "docker start"; the reason
# strings say an exited container is brought back with start, not restart.
- name: prometheus
  display_name: "Prometheus"
  host: "192.168.0.110"
  stateful_level: STANDARD_HITL
  reason: "有 TSDB WAL,exited 狀態用 docker start(非 restart)"
  restart_command: "docker start"
  containers: ["prometheus"]

- name: grafana
  display_name: "Grafana"
  host: "192.168.0.110"
  stateful_level: STANDARD_HITL
  reason: "有 SQLite 設定儲存,exited 用 docker start"
  restart_command: "docker start"
  containers: ["grafana"]

- name: alertmanager
  display_name: "Alertmanager"
  host: "192.168.0.110"
  stateful_level: STANDARD_HITL
  reason: "有 silence 狀態,exited 用 docker start"
  restart_command: "docker start"
  containers: ["alertmanager"]

# ─── AUTO: stateless, automatic repair allowed ──────────────────────────────
# NOTE(review): nginx declares host 192.168.0.110 but also lists a container
# named nginx-188 — confirm which host that container actually runs on.
- name: nginx
  display_name: "Nginx (反向代理)"
  host: "192.168.0.110"
  stateful_level: AUTO
  containers: ["nginx", "nginx-188"]

# The two K3s entries use the literal host "k3s" and list no containers.
- name: awoooi-api
  display_name: "AWOOOI API (K3s)"
  host: "k3s"
  stateful_level: AUTO
  containers: []

- name: awoooi-web
  display_name: "AWOOOI Web (K3s)"
  host: "k3s"
  stateful_level: AUTO
  containers: []

- name: blackbox-exporter
  display_name: "Blackbox Exporter"
  host: "192.168.0.110"
  stateful_level: AUTO
  containers: ["blackbox-exporter"]

- name: sentry
  display_name: "Sentry (錯誤追蹤)"
  host: "192.168.0.110"
  stateful_level: AUTO
  reason: "Web server 無狀態,docker compose up -d 即可恢復"
  containers: ["sentry-web", "sentry-worker", "sentry-cron"]

- name: langfuse
  display_name: "Langfuse (LLMOps)"
  host: "192.168.0.110"
  stateful_level: AUTO
  containers: ["langfuse-web", "langfuse-worker"]

- name: signoz
  display_name: "SignOz (APM)"
  host: "192.168.0.188"
  stateful_level: AUTO
  reason: "APM 無狀態查詢層,docker compose up -d 可恢復(ClickHouse 資料不受影響)"
  containers: ["signoz"]

- name: bitan-app
  display_name: "Bitan App"
  host: "192.168.0.188"
  stateful_level: AUTO
  containers: ["bitan-app"]

- name: ollama
  display_name: "Ollama (Local LLM)"
  host: "192.168.0.188"
  stateful_level: AUTO
  containers: ["ollama"]

- name: momo-app
  display_name: "momo Web App"
  host: "192.168.0.188"
  stateful_level: AUTO
  containers: ["momo-app"]

- name: tsenyang-website
  display_name: "Tsenyang Website"
  host: "192.168.0.188"
  stateful_level: AUTO
  containers: ["tsenyang-website"]

- name: stock-platform
  display_name: "Stock Platform"
  host: "192.168.0.110"
  stateful_level: AUTO
  containers: ["stock-platform"]

# ─── Backup policy reference ────────────────────────────────────────────────
backup_policies:
  # Velero backup expiry threshold, in hours (Q2 decision).
  velero_max_age_hours: 4
  # Emergency backup timeout, in seconds.
  emergency_backup_timeout: 600
  # Do not trigger a backup while CPU/IO is above 80% (Q4 decision).
  block_backup_on_high_io: true
  # NOTE(review): presumably the threshold used by block_backup_on_high_io —
  # confirm against the consumer.
  io_threshold_percent: 80

# ─── MultiSig settings ──────────────────────────────────────────────────────
multisig:
  # Number of votes required for CRITICAL_HITL services.
  critical_required_votes: 2
  # Number of votes required for STANDARD_HITL services.
  standard_required_votes: 1
  # How long a vote stays valid, in minutes.
  vote_expiry_minutes: 30