243 lines
9.4 KiB
YAML
243 lines
9.4 KiB
YAML
# AWOOOI Backend (FastAPI) Deployment
# Owner: CIO
# Version: v1.1
# Date: 2026-03-20
# Updated: 2026-03-28 - added startupProbe + revisionHistoryLimit: 3 (Phase K0.5/K0.7)

# Deployment for the AWOOOI FastAPI backend: two replicas of the `api`
# container in the awoooi-prod namespace, rolled out with RollingUpdate.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: awoooi-api
  namespace: awoooi-prod
  labels:
    app: awoooi-api
    system: awoooi
    environment: prod
spec:
  replicas: 2
  # Keep only the three most recent ReplicaSets available for rollback.
  revisionHistoryLimit: 3
  selector:
    matchLabels:
      app: awoooi-api
      system: awoooi
      environment: prod
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1
      # 2026-05-24 Codex: allow one unavailable replica so a rollout can
      # replace a bad old ReplicaSet instead of deadlocking at 1/2 when
      # probes regress.
      maxUnavailable: 1
  template:
    metadata:
      labels:
        app: awoooi-api
        system: awoooi
        environment: prod
    spec:
      # Phase 7: run under the RBAC ServiceAccount (least privilege).
      serviceAccountName: awoooi-executor
      automountServiceAccountToken: true
      # 2026-04-09 Claude Sonnet 4.6 Asia/Taipei: fix for Bug #12.
      # fsGroup=1000 lets appuser (1000) read the repair-ssh-key Secret that
      # is mounted with defaultMode=0400. SSH requires the key to be
      # owner-only (0400/0600); a 0444 key is rejected.
      securityContext:
        fsGroup: 1000
      containers:
        - name: api
          # The image tag is injected by CI/CD (format: {sha}-{run_id}).
          # Harbor registry: host 110 (192.168.0.110:5000).
          image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
          imagePullPolicy: Always
          ports:
            - containerPort: 8000
              name: http
          envFrom:
            - configMapRef:
                name: awoooi-config
            - secretRef:
                name: awoooi-secrets
          # 2026-04-12 ogt: env takes precedence over envFrom, so the entries
          # below override specific ConfigMap values. Used for values that
          # were live-patched and had to be synced back to Git.
          env:
            - name: USE_AI_ROUTER
              value: "true"
            # 2026-04-15 ogt: re-enabled — asyncio.wait_for=120s, Ollama
            # responses are now awaited.
            - name: ENABLE_NEMOTRON_COLLABORATION
              value: "true"
            # NOTE(review): 55 here vs. the 120s mentioned above — confirm
            # which timeout actually bounds the Nemotron call.
            - name: NEMOTRON_TIMEOUT_SECONDS
              value: "55"
            # NOTE(review): replicas is 2, so both pods receive this flag —
            # confirm the app ensures only one instance polls Telegram.
            - name: TELEGRAM_ENABLE_POLLING
              value: "true"
            # 2026-05-25 Codex: GCP-A via the 110 proxy; health cooldown
            # protects against noisy offline probes.
            - name: OLLAMA_URL
              value: "http://192.168.0.110:11435"
            # 2026-05-25 Codex: GCP-B via the 110 proxy; used only after
            # GCP-A is unavailable.
            - name: OLLAMA_SECONDARY_URL
              value: "http://192.168.0.110:11436"
            # 2026-05-25 Codex: host 111 via the 110 proxy, tried before Gemini.
            - name: OLLAMA_FALLBACK_URL
              value: "http://192.168.0.110:11437"
            # Gemini is only a fallback after GCP-A -> GCP-B -> 111 all fail.
            - name: ALERT_AI_ALLOW_CLOUD_FALLBACK
              value: "true"
            # Alert diagnosis must go through GCP-A -> GCP-B -> 111 first.
            - name: ALERT_AI_ENFORCE_OLLAMA_FIRST
              value: "true"
            # 2026-05-05 Codex: alerts aim at resolving the problem, so
            # waiting for a deep diagnosis is acceptable.
            - name: ALERT_OLLAMA_MODEL
              value: "qwen3:14b"
            # 2026-05-05 Codex: keeps the health probe from loading
            # qwen2.5 7B and polluting the GCP alert lane.
            - name: OLLAMA_HEALTH_CHECK_MODEL
              value: "gemma3:4b"
            - name: OLLAMA_EMBEDDING_MODEL
              value: "bge-m3:latest"
            - name: OPENCLAW_DEFAULT_MODEL
              value: "qwen2.5:7b-instruct"
            - name: OPENCLAW_TIMEOUT
              value: "120"
            - name: OLLAMA_DIAGNOSE_TIMEOUT_SECONDS
              value: "300"
            - name: INCIDENT_LLM_TIMEOUT_SECONDS
              value: "360"
            - name: AGENT_DEBATE_GLOBAL_TIMEOUT_SEC
              value: "420"
            - name: AGENT_DIAGNOSTICIAN_TIMEOUT_SEC
              value: "100"
            - name: AGENT_SOLVER_TIMEOUT_SEC
              value: "80"
            # Same path as the prometheus-multiproc volumeMount below.
            - name: PROMETHEUS_MULTIPROC_DIR
              value: "/tmp/awoooi-prometheus-multiproc"
            - name: ENABLE_AWOOOP_ANSIBLE_CHECK_MODE_WORKER
              value: "true"
            - name: AWOOOP_ANSIBLE_CHECK_MODE_INTERVAL_SECONDS
              value: "300"
            - name: AWOOOP_ANSIBLE_CHECK_MODE_BATCH_LIMIT
              value: "1"
            - name: AWOOOP_ANSIBLE_CHECK_MODE_TIMEOUT_SECONDS
              value: "180"
            - name: AWOOOP_ANSIBLE_CHECK_MODE_STARTUP_SLEEP_SECONDS
              value: "120"
            - name: AWOOOP_ANSIBLE_CHECK_MODE_TRANSPORT_PROFILE
              value: "ssh_mcp"
            # The next two paths match the ssh-mcp-key volumeMounts below.
            - name: AWOOOP_ANSIBLE_CHECK_MODE_SSH_KEY_PATH
              value: "/run/secrets/ssh_mcp_key"
            - name: AWOOOP_ANSIBLE_CHECK_MODE_KNOWN_HOSTS_PATH
              value: "/etc/ssh-mcp/known_hosts"
            - name: AWOOOP_ANSIBLE_CHECK_MODE_CANDIDATE_MAX_AGE_HOURS
              value: "24"
            - name: AWOOOP_ANSIBLE_CHECK_MODE_TRANSPORT_COOLDOWN_SECONDS
              value: "21600"
          volumeMounts:
            # 2026-04-05 Claude Code: Sprint 3 — SSH key for HostRepairAgent.
            - name: repair-ssh-key
              mountPath: /etc/repair-ssh
              readOnly: true
            # 2026-04-06 Claude Code: Sprint 3 Security Fix A1 — known_hosts.
            # Mounted in its own directory so it does not collide with the
            # repair-ssh-key mountPath.
            - name: repair-known-hosts
              mountPath: /etc/repair-known-hosts
              readOnly: true
            # 2026-04-08 Claude Sonnet 4.6: Sprint 5.1 Guardrail — service
            # registry YAML, mounted under /app/ops/config/ so that
            # _find_registry_path() can locate it.
            - name: service-registry
              mountPath: /app/ops/config/service-registry.yaml
              subPath: service-registry.yaml
              readOnly: true
            # MCP Phase 2a (2026-04-11 Claude Sonnet 4.6): SSH MCP key.
            #   ssh_mcp_key -> /run/secrets/ssh_mcp_key (SSH_KEY_PATH in ssh_provider.py)
            #   known_hosts -> /etc/ssh-mcp/known_hosts (SSH_MCP_KNOWN_HOSTS_FILE)
            - name: ssh-mcp-key
              mountPath: /run/secrets/ssh_mcp_key
              subPath: ssh_mcp_key
              readOnly: true
            - name: ssh-mcp-key
              mountPath: /etc/ssh-mcp/known_hosts
              subPath: known_hosts
              readOnly: true
            - name: prometheus-multiproc
              mountPath: /tmp/awoooi-prometheus-multiproc
          resources:
            requests:
              cpu: "200m"
              memory: "512Mi"
            limits:
              cpu: "1"
              memory: "1Gi"
          livenessProbe:
            httpGet:
              # 2026-05-24 Codex: K8s probes must stay lightweight. The full
              # health endpoint checks Ollama/OpenClaw/SigNoz and can exceed
              # the kubelet timeout when an external provider is degraded.
              path: /api/v1/health/live
              port: 8000
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
          readinessProbe:
            httpGet:
              path: /api/v1/health/ready
              port: 8000
            initialDelaySeconds: 5
            periodSeconds: 5
            timeoutSeconds: 3
            failureThreshold: 3
          # 2026-06-03 Codex: DB bootstrap DDL + background worker wiring can
          # exceed 60s during rolling deploys. Keep liveness strict after
          # startup, but let cold start finish instead of killing the pod
          # mid-bootstrap. Budget: 60 failures x 5s period = up to 300s.
          startupProbe:
            httpGet:
              path: /api/v1/health/live
              port: 8000
            initialDelaySeconds: 5
            periodSeconds: 5
            timeoutSeconds: 5
            failureThreshold: 60
      # Anti-affinity: prefer to place replicas on different nodes
      # (soft preference, keyed on the node hostname).
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchLabels:
                    app: awoooi-api
                topologyKey: kubernetes.io/hostname
      volumes:
        # 2026-04-05 Claude Code: Sprint 3 — repair SSH key.
        - name: repair-ssh-key
          secret:
            secretName: awoooi-repair-ssh-key
            # Octal 0400 = decimal 256 = r-------- (owner read-only).
            defaultMode: 0400
        # 2026-04-06 Claude Code: Sprint 3 Security Fix A1.
        # optional: true — the Secret is created on the first CD run, so the
        # Pod does not block waiting for it.
        - name: repair-known-hosts
          secret:
            secretName: awoooi-repair-known-hosts
            optional: true
        # 2026-04-08 Claude Sonnet 4.6: Sprint 5.1 Guardrail — service registry.
        - name: service-registry
          configMap:
            name: service-registry
        # MCP Phase 2a (2026-04-11 Claude Sonnet 4.6): SSH MCP key.
        # optional: true — SSH MCP is off by default; the Pod does not block
        # when the Secret is missing.
        - name: ssh-mcp-key
          secret:
            secretName: ssh-mcp-key
            defaultMode: 0400
            optional: true
        # Scratch directory backing PROMETHEUS_MULTIPROC_DIR.
        - name: prometheus-multiproc
          emptyDir: {}

---
# NodePort Service for the AWOOOI API: forwards service port 8000 (node port
# 32334) to port 8000 of pods labelled app: awoooi-api.
apiVersion: v1
kind: Service
metadata:
  name: awoooi-api-svc
  namespace: awoooi-prod
  labels:
    app: awoooi-api
spec:
  type: NodePort
  selector:
    app: awoooi-api
  ports:
    - name: http
      port: 8000
      targetPort: 8000
      nodePort: 32334