Files
awoooi/k8s/awoooi-prod/06-deployment-api.yaml
Your Name 6432e47770
All checks were successful
CD Pipeline / tests (push) Successful in 1m41s
Code Review / ai-code-review (push) Successful in 17s
CD Pipeline / build-and-deploy (push) Successful in 5m10s
CD Pipeline / post-deploy-checks (push) Successful in 1m59s
fix(ops): stabilize api rollout source correlation smoke
2026-06-03 08:08:48 +08:00

243 lines
9.4 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# AWOOOI Backend (FastAPI) Deployment
# Owner: CIO
# Version: v1.1
# Date: 2026-03-20
# Updated: 2026-03-28 - added startupProbe + revisionHistoryLimit: 3 (Phase K0.5/K0.7)
apiVersion: apps/v1
kind: Deployment
metadata:
  name: awoooi-api
  namespace: awoooi-prod
  labels:
    app: awoooi-api
    system: awoooi
    environment: prod
spec:
  replicas: 2
  revisionHistoryLimit: 3
  # The selector must match the pod template labels below.
  selector:
    matchLabels:
      app: awoooi-api
      environment: prod
      system: awoooi
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1
      # 2026-05-24 Codex: allow one unavailable replica so rollout can replace
      # a bad old ReplicaSet instead of deadlocking at 1/2 when probes regress.
      maxUnavailable: 1
  template:
    metadata:
      labels:
        app: awoooi-api
        system: awoooi
        environment: prod
    spec:
      # Phase 7: use an RBAC ServiceAccount (least privilege)
      serviceAccountName: awoooi-executor
      automountServiceAccountToken: true
      # 2026-04-09 Claude Sonnet 4.6 Asia/Taipei: Bug #12 fix
      # fsGroup=1000 lets appuser(1000) read the repair-ssh-key Secret that is
      # mounted with defaultMode=0400. SSH requires the key to be owner-only
      # (0400/0600); 0444 is rejected.
      securityContext:
        fsGroup: 1000
      containers:
        - name: api
          # Image tag is injected dynamically by CI/CD (format: {sha}-{run_id})
          # Harbor registry: host 110 (192.168.0.110:5000)
          image: 192.168.0.110:5000/library/api:IMAGE_TAG_PLACEHOLDER
          imagePullPolicy: Always
          ports:
            - containerPort: 8000
              name: http
          envFrom:
            - configMapRef:
                name: awoooi-config
            - secretRef:
                name: awoooi-secrets
          # 2026-04-12 ogt: env takes precedence over envFrom - overrides
          # specific ConfigMap values.
          # Note: Kubernetes env: wins over envFrom:; used for values that were
          # live-patched and then have to be synced back to Git.
          env:
            - name: USE_AI_ROUTER
              value: "true"
            - name: ENABLE_NEMOTRON_COLLABORATION
              # 2026-04-15 ogt: re-enabled - asyncio.wait_for=120s, the Ollama
              # call now waits for the response
              value: "true"
            - name: NEMOTRON_TIMEOUT_SECONDS
              value: "55"
            # NOTE(review): replicas is 2, so both pods run with polling
            # enabled - confirm the app ensures a single active poller.
            - name: TELEGRAM_ENABLE_POLLING
              value: "true"
            - name: OLLAMA_URL
              # 2026-05-25 Codex: GCP-A via 110 proxy; health cooldown protects
              # noisy offline probes
              value: "http://192.168.0.110:11435"
            - name: OLLAMA_SECONDARY_URL
              # 2026-05-25 Codex: GCP-B via 110 proxy; fallback only after
              # GCP-A is unavailable
              value: "http://192.168.0.110:11436"
            - name: OLLAMA_FALLBACK_URL
              # 2026-05-25 Codex: 111 via 110 proxy before Gemini
              value: "http://192.168.0.110:11437"
            - name: ALERT_AI_ALLOW_CLOUD_FALLBACK
              # Gemini is only the fallback after GCP-A -> GCP-B -> 111 have
              # all failed
              value: "true"
            - name: ALERT_AI_ENFORCE_OLLAMA_FIRST
              # Alert diagnosis must try GCP-A -> GCP-B -> 111 first
              value: "true"
            - name: ALERT_OLLAMA_MODEL
              # 2026-05-05 Codex: alerts aim to resolve the problem, so waiting
              # for a deep diagnosis is acceptable
              value: "qwen3:14b"
            - name: OLLAMA_HEALTH_CHECK_MODEL
              # 2026-05-05 Codex: keep the health probe from loading
              # qwen2.5 7B and polluting the GCP alert lane
              value: "gemma3:4b"
            - name: OLLAMA_EMBEDDING_MODEL
              value: "bge-m3:latest"
            - name: OPENCLAW_DEFAULT_MODEL
              value: "qwen2.5:7b-instruct"
            - name: OPENCLAW_TIMEOUT
              value: "120"
            - name: OLLAMA_DIAGNOSE_TIMEOUT_SECONDS
              value: "300"
            - name: INCIDENT_LLM_TIMEOUT_SECONDS
              value: "360"
            - name: AGENT_DEBATE_GLOBAL_TIMEOUT_SEC
              value: "420"
            - name: AGENT_DIAGNOSTICIAN_TIMEOUT_SEC
              value: "100"
            - name: AGENT_SOLVER_TIMEOUT_SEC
              value: "80"
            # Same path as the prometheus-multiproc emptyDir mount below.
            - name: PROMETHEUS_MULTIPROC_DIR
              value: "/tmp/awoooi-prometheus-multiproc"
            - name: ENABLE_AWOOOP_ANSIBLE_CHECK_MODE_WORKER
              value: "true"
            - name: AWOOOP_ANSIBLE_CHECK_MODE_INTERVAL_SECONDS
              value: "300"
            - name: AWOOOP_ANSIBLE_CHECK_MODE_BATCH_LIMIT
              value: "1"
            - name: AWOOOP_ANSIBLE_CHECK_MODE_TIMEOUT_SECONDS
              value: "180"
            - name: AWOOOP_ANSIBLE_CHECK_MODE_STARTUP_SLEEP_SECONDS
              value: "120"
            - name: AWOOOP_ANSIBLE_CHECK_MODE_TRANSPORT_PROFILE
              value: "ssh_mcp"
            # The two paths below match the ssh-mcp-key volumeMounts.
            - name: AWOOOP_ANSIBLE_CHECK_MODE_SSH_KEY_PATH
              value: "/run/secrets/ssh_mcp_key"
            - name: AWOOOP_ANSIBLE_CHECK_MODE_KNOWN_HOSTS_PATH
              value: "/etc/ssh-mcp/known_hosts"
            - name: AWOOOP_ANSIBLE_CHECK_MODE_CANDIDATE_MAX_AGE_HOURS
              value: "24"
            - name: AWOOOP_ANSIBLE_CHECK_MODE_TRANSPORT_COOLDOWN_SECONDS
              value: "21600"
          # 2026-04-05 Claude Code: Sprint 3 - mount the SSH key used by
          # HostRepairAgent
          volumeMounts:
            - name: repair-ssh-key
              mountPath: /etc/repair-ssh
              readOnly: true
            # 2026-04-06 Claude Code: Sprint 3 Security Fix A1 - known_hosts
            # Mounted in its own directory to avoid clashing with the
            # repair-ssh-key mountPath.
            - name: repair-known-hosts
              mountPath: /etc/repair-known-hosts
              readOnly: true
            # 2026-04-08 Claude Sonnet 4.6: Sprint 5.1 Guardrail - service
            # registry YAML. Mounted under /app/ops/config/ so that
            # _find_registry_path() can find it.
            - name: service-registry
              mountPath: /app/ops/config/service-registry.yaml
              subPath: service-registry.yaml
              readOnly: true
            # MCP Phase 2a (2026-04-11 Claude Sonnet 4.6): SSH MCP key
            # ssh_mcp_key -> /run/secrets/ssh_mcp_key (SSH_KEY_PATH in ssh_provider.py)
            # known_hosts -> /etc/ssh-mcp/known_hosts (SSH_MCP_KNOWN_HOSTS_FILE)
            - name: ssh-mcp-key
              mountPath: /run/secrets/ssh_mcp_key
              subPath: ssh_mcp_key
              readOnly: true
            - name: ssh-mcp-key
              mountPath: /etc/ssh-mcp/known_hosts
              subPath: known_hosts
              readOnly: true
            - name: prometheus-multiproc
              mountPath: /tmp/awoooi-prometheus-multiproc
          resources:
            requests:
              cpu: "200m"
              memory: "512Mi"
            limits:
              cpu: "1"
              memory: "1Gi"
          livenessProbe:
            httpGet:
              # 2026-05-24 Codex: K8s probes must stay lightweight. The full
              # health endpoint checks Ollama/OpenClaw/SigNoz and can exceed
              # kubelet timeout when an external provider is degraded.
              path: /api/v1/health/live
              port: 8000
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
          readinessProbe:
            httpGet:
              path: /api/v1/health/ready
              port: 8000
            initialDelaySeconds: 5
            periodSeconds: 5
            timeoutSeconds: 3
            failureThreshold: 3
          # 2026-06-03 Codex: DB bootstrap DDL + background worker wiring can exceed
          # 60s during rolling deploys. Keep liveness strict after startup, but allow
          # cold-start to finish instead of killing the pod mid-bootstrap.
          # Budget: 5s initial delay + 60 failures x 5s period = about 305s.
          startupProbe:
            httpGet:
              path: /api/v1/health/live
              port: 8000
            initialDelaySeconds: 5
            periodSeconds: 5
            timeoutSeconds: 5
            failureThreshold: 60
      # Anti-affinity - spread replicas across different nodes (soft preference)
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchLabels:
                    app: awoooi-api
                topologyKey: kubernetes.io/hostname
      # 2026-04-05 Claude Code: Sprint 3 - repair SSH key (defaultMode 0400)
      volumes:
        - name: repair-ssh-key
          secret:
            secretName: awoooi-repair-ssh-key
            # Octal 0400 = decimal 256 = r-------- (owner read-only).
            # Keep this an unquoted integer; the API field is numeric.
            defaultMode: 0400
        # 2026-04-06 Claude Code: Sprint 3 Security Fix A1
        # optional: true - the Secret is created on the first CD run, so the
        # Pod does not block waiting for it.
        - name: repair-known-hosts
          secret:
            secretName: awoooi-repair-known-hosts
            optional: true
        # 2026-04-08 Claude Sonnet 4.6: Sprint 5.1 Guardrail - service registry
        - name: service-registry
          configMap:
            name: service-registry
        # MCP Phase 2a (2026-04-11 Claude Sonnet 4.6): SSH MCP key
        # optional: true - SSH MCP is disabled by default; the Pod does not
        # block when the Secret does not exist.
        - name: ssh-mcp-key
          secret:
            secretName: ssh-mcp-key
            defaultMode: 0400
            optional: true
        - name: prometheus-multiproc
          emptyDir: {}
---
# NodePort Service for the awoooi-api pods.
apiVersion: v1
kind: Service
metadata:
  name: awoooi-api-svc
  namespace: awoooi-prod
  labels:
    app: awoooi-api
spec:
  type: NodePort
  # Selects pods by the app label only.
  selector:
    app: awoooi-api
  ports:
    # Service port 8000 -> container port 8000, exposed on every node at 32334.
    - port: 8000
      targetPort: 8000
      nodePort: 32334
      name: http