diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml
index fb10c118..cc3f20ee 100644
--- a/.gitea/workflows/cd.yaml
+++ b/.gitea/workflows/cd.yaml
@@ -299,6 +299,13 @@ jobs:
             "export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -"
           echo "✅ ConfigMap 已更新"
 
+          # Step 1c: Apply Service Registry ConfigMap (Sprint 5.1 Guardrail)
+          # 2026-04-08 Claude Sonnet 4.6: provides service-registry.yaml for mounting into the container under /app/ops/config/
+          cat k8s/awoooi-prod/15-service-registry-configmap.yaml | \
+            ssh -i ~/.ssh/deploy_key wooo@192.168.0.121 \
+            "export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -"
+          echo "✅ Service Registry ConfigMap 已更新"
+
           # Step 1b: Apply Deployment yamls (套用 volumes/resources/probe 等非 image 設定)
           # 2026-04-05 Claude Code: 確保 deployment 結構變更(如 SSH key mount)持久化到 K8s
           # C3 修正 2026-04-05: 先 sed 替換 IMAGE_TAG_PLACEHOLDER 為正確 sha,
diff --git a/k8s/awoooi-prod/06-deployment-api.yaml b/k8s/awoooi-prod/06-deployment-api.yaml
index 54441bac..4a2fae5b 100644
--- a/k8s/awoooi-prod/06-deployment-api.yaml
+++ b/k8s/awoooi-prod/06-deployment-api.yaml
@@ -60,6 +60,14 @@ spec:
             - name: repair-known-hosts
               mountPath: /etc/repair-known-hosts
               readOnly: true
+            # 2026-04-08 Claude Sonnet 4.6: Sprint 5.1 Guardrail — service registry YAML
+            # Mounted under /app/ops/config/ so that _find_registry_path() can locate it
+            # NOTE(review): Kubernetes does not refresh subPath ConfigMap mounts in place;
+            # pods need a restart to pick up registry changes — confirm the rollout covers this.
+            - name: service-registry
+              mountPath: /app/ops/config/service-registry.yaml
+              subPath: service-registry.yaml
+              readOnly: true
           resources:
             requests:
               cpu: "200m"
@@ -114,6 +122,12 @@ spec:
           secret:
             secretName: awoooi-repair-known-hosts
             optional: true
+        # 2026-04-08 Claude Sonnet 4.6: Sprint 5.1 Guardrail — service registry
+        # NOTE(review): no `optional: true` here (unlike the secret volume above), so pods
+        # cannot start until this ConfigMap exists — confirm this fail-closed behaviour is intended.
+        - name: service-registry
+          configMap:
+            name: service-registry
 
 ---
 apiVersion: v1
diff --git a/k8s/awoooi-prod/15-service-registry-configmap.yaml b/k8s/awoooi-prod/15-service-registry-configmap.yaml
new file mode 100644
index 00000000..fefaaaf2
--- /dev/null
+++ b/k8s/awoooi-prod/15-service-registry-configmap.yaml
@@ -0,0 +1,219 @@
+# k8s/awoooi-prod/15-service-registry-configmap.yaml
+# Service Registry ConfigMap — mounts ops/config/service-registry.yaml into the K8s container
+# Created: 2026-04-08 Claude Sonnet 4.6
+# Purpose: fix the Docker container failing to find service-registry.yaml, which made the Guardrail degrade to AUTO
+# Mount path: /app/ops/config/service-registry.yaml
+# References: ADR-062, ADR-063
+# NOTE(review): the data below appears to be an embedded copy of ops/config/service-registry.yaml;
+# confirm which file is the source of truth and keep the two in sync.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: service-registry
+  namespace: awoooi-prod
+  labels:
+    app: awoooi
+    component: service-registry
+data:
+  service-registry.yaml: |
+    # ops/config/service-registry.yaml
+    # Service Registry — 服務 Stateful 分級清單
+    # 版本: 1.0.0
+    # 建立: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
+    # 維護: 修改需 PR + 統帥審核,禁止直接 push
+    # 說明:
+    #   BLOCK         = 系統禁止自動修復,僅告警(資料風險最高)
+    #   CRITICAL_HITL = 允許 Playbook,但需 MultiSig 2票
+    #   STANDARD_HITL = 允許 Playbook,需 1票審核
+    #   AUTO          = 允許自動執行(無狀態服務)
+    # 參考: ADR-062, ADR-063
+
+    services:
+      # ─── BLOCK:系統禁止(連 Playbook 都不提供)────────────────────────────
+      - name: postgres
+        display_name: "PostgreSQL 主庫 (awoooi_prod)"
+        host: "192.168.0.188"
+        stateful_level: BLOCK
+        reason: "主要業務資料庫,重啟可能導致 WAL 截斷、事務回滾"
+        alert_only: true
+        containers: ["postgres"]
+
+      - name: momo-db
+        display_name: "PostgreSQL (momo_db)"
+        host: "192.168.0.188"
+        stateful_level: BLOCK
+        reason: "momo 產品資料庫,禁止自動操作"
+        alert_only: true
+        containers: ["momo-db"]
+
+      - name: langfuse-db
+        display_name: "PostgreSQL (Langfuse)"
+        host: "192.168.0.110"
+        stateful_level: BLOCK
+        reason: "LLM trace 資料庫,重啟導致追蹤資料遺失"
+        alert_only: true
+        containers: ["langfuse-db"]
+
+      - name: harbor-db
+        display_name: "PostgreSQL (Harbor Registry)"
+        host: "192.168.0.110"
+        stateful_level: BLOCK
+        reason: "Harbor Registry 資料庫,重啟可能損壞 image layer 索引"
+        alert_only: true
+        containers: ["harbor-db"]
+
+      - name: sentry-postgres
+        display_name: "PostgreSQL (Sentry)"
+        host: "192.168.0.110"
+        stateful_level: BLOCK
+        reason: "Sentry 錯誤追蹤資料庫"
+        alert_only: true
+        containers: ["sentry-postgres"]
+
+      - name: signoz-clickhouse
+        display_name: "ClickHouse (SignOz)"
+        host: "192.168.0.188"
+        stateful_level: BLOCK
+        reason: "列欄式 OLAP 資料庫,寫入中重啟可能損壞列欄檔案"
+        alert_only: true
+        containers: ["signoz-clickhouse"]
+
+      # ─── CRITICAL_HITL:高風險,需 MultiSig 2票 ──────────────────────────
+      - name: redis
+        display_name: "Redis (AWOOOI)"
+        host: "192.168.0.188"
+        stateful_level: CRITICAL_HITL
+        reason: "AWOOOI 依賴 Redis 做冪等鎖與快取,重啟丟失鎖狀態"
+        requires_pre_backup: false
+        containers: ["redis"]
+
+      - name: harbor-redis
+        display_name: "Redis (Harbor)"
+        host: "192.168.0.110"
+        stateful_level: CRITICAL_HITL
+        reason: "Harbor session 快取"
+        containers: ["harbor-redis"]
+
+      - name: sentry-redis
+        display_name: "Redis (Sentry)"
+        host: "192.168.0.110"
+        stateful_level: CRITICAL_HITL
+        reason: "Sentry 任務佇列"
+        containers: ["sentry-redis"]
+
+      - name: gitea
+        display_name: "Gitea (程式碼倉庫)"
+        host: "192.168.0.110"
+        stateful_level: CRITICAL_HITL
+        reason: "restart 會殺掉活躍 SSH session,Git push 中斷可能損壞 working copy"
+        requires_pre_backup: false
+        containers: ["gitea"]
+
+      - name: harbor
+        display_name: "Harbor (Container Registry)"
+        host: "192.168.0.110"
+        stateful_level: CRITICAL_HITL
+        reason: "重啟中斷 pull/push;GC 進行中重啟可能損壞 layer"
+        requires_pre_backup: false
+        containers: ["harbor-core", "harbor-jobservice", "harbor-portal"]
+
+      - name: minio
+        display_name: "MinIO (物件存儲)"
+        host: "192.168.0.188"
+        stateful_level: CRITICAL_HITL
+        reason: "寫入中重啟可能導致 multipart upload 中斷"
+        requires_pre_backup: false
+        containers: ["minio"]
+
+      # ─── STANDARD_HITL:中風險,需 1票審核 ──────────────────────────────
+      - name: prometheus
+        display_name: "Prometheus"
+        host: "192.168.0.110"
+        stateful_level: STANDARD_HITL
+        reason: "有 TSDB WAL,exited 狀態用 docker start(非 restart)"
+        restart_command: "docker start"
+        containers: ["prometheus"]
+
+      - name: grafana
+        display_name: "Grafana"
+        host: "192.168.0.110"
+        stateful_level: STANDARD_HITL
+        reason: "有 SQLite 設定儲存,exited 用 docker start"
+        restart_command: "docker start"
+        containers: ["grafana"]
+
+      - name: alertmanager
+        display_name: "Alertmanager"
+        host: "192.168.0.110"
+        stateful_level: STANDARD_HITL
+        reason: "有 silence 狀態,exited 用 docker start"
+        restart_command: "docker start"
+        containers: ["alertmanager"]
+
+      # ─── AUTO:無狀態,允許自動修復 ──────────────────────────────────────
+      - name: nginx
+        display_name: "Nginx (反向代理)"
+        host: "192.168.0.110"
+        stateful_level: AUTO
+        containers: ["nginx", "nginx-188"]
+
+      - name: awoooi-api
+        display_name: "AWOOOI API (K3s)"
+        host: "k3s"
+        stateful_level: AUTO
+        containers: []
+
+      - name: awoooi-web
+        display_name: "AWOOOI Web (K3s)"
+        host: "k3s"
+        stateful_level: AUTO
+        containers: []
+
+      - name: blackbox-exporter
+        display_name: "Blackbox Exporter"
+        host: "192.168.0.110"
+        stateful_level: AUTO
+        containers: ["blackbox-exporter"]
+
+      - name: langfuse
+        display_name: "Langfuse (LLMOps)"
+        host: "192.168.0.110"
+        stateful_level: AUTO
+        containers: ["langfuse-web", "langfuse-worker"]
+
+      - name: ollama
+        display_name: "Ollama (Local LLM)"
+        host: "192.168.0.188"
+        stateful_level: AUTO
+        containers: ["ollama"]
+
+      - name: momo-app
+        display_name: "momo Web App"
+        host: "192.168.0.188"
+        stateful_level: AUTO
+        containers: ["momo-app"]
+
+      - name: tsenyang-website
+        display_name: "Tsenyang Website"
+        host: "192.168.0.188"
+        stateful_level: AUTO
+        containers: ["tsenyang-website"]
+
+      - name: stock-platform
+        display_name: "Stock Platform"
+        host: "192.168.0.110"
+        stateful_level: AUTO
+        containers: ["stock-platform"]
+
+    # ─── 備份策略參考 ────────────────────────────────────────────────────────
+    backup_policies:
+      velero_max_age_hours: 4
+      emergency_backup_timeout: 600
+      block_backup_on_high_io: true
+      io_threshold_percent: 80
+
+    # ─── MultiSig 設定 ───────────────────────────────────────────────────────
+    multisig:
+      critical_required_votes: 2
+      standard_required_votes: 1
+      vote_expiry_minutes: 30