feat(infra): 開發環境分離 + BuildKit cache 修復 + circuit breaker 優化
Some checks failed
CD Pipeline / build-and-deploy (push) Successful in 6m52s
E2E Health Check / e2e-health (push) Successful in 17s
CD Pipeline (Dev) / build-and-deploy-dev (push) Failing after 9s

1. k8s/awoooi-dev/: 新建 dev namespace (01-05 配置)
   - Namespace + ResourceQuota (cpu 2/4, mem 4Gi/8Gi)
   - ConfigMap: ENVIRONMENT=dev, LOG_LEVEL=DEBUG, SHADOW_MODE=false
   - Deployment: 1 replica, NodePort 32344, image dev-latest
   - RBAC: awoooi-executor-dev ServiceAccount

2. .gitea/workflows/cd-dev.yaml: dev branch CD pipeline
   - 觸發: dev branch push
   - Build: --no-cache (防 cache poisoning)
   - Tag: dev-{sha} / dev-latest
   - Deploy: awoooi-dev namespace, health check 32344
   - Telegram: [DEV] 前綴通知

3. apps/api/Dockerfile: ARG CACHE_BUST=none (防 BuildKit cache 毒化)
   - deps 層 (pip install) 仍可 cache
   - src/ 和 models.json 層每次重建

4. .gitea/workflows/cd.yaml: 正式環境 API build 加入 CACHE_BUST=git_sha
   - 確保 models.json 等配置變更正確進入 image

5. apps/api/src/services/nvidia_provider.py: timeout 不計入 circuit breaker
   - TimeoutException → 只 log,不 record_failure()
   - 只有硬性錯誤 (auth/rate limit/exception) 才斷路

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-01 16:22:21 +08:00
parent c9c60c3a61
commit 9913f5dc6d
10 changed files with 424 additions and 5 deletions

View File

@@ -0,0 +1,197 @@
# =============================================================================
# AWOOOI CD Pipeline - development environment (dev branch)
# =============================================================================
# Flow: Build -> Push to Harbor (dev tag) -> Deploy to the awoooi-dev namespace
# Purpose: validate changes here first; only once they check out is dev merged
#          into main, which triggers the production deployment.
# 2026-04-01 ogt: introduced the separate dev CI/CD pipeline.
name: CD Pipeline (Dev)

# Runs on every push to dev, or on manual dispatch.
on:
  push:
    branches:
      - dev
  workflow_dispatch:

# Runs for the same ref share one group; cancel-in-progress is false, so a
# newer push does not cancel a deployment that is already running.
concurrency:
  group: cd-dev-deploy-${{ github.ref }}
  cancel-in-progress: false

env:
  # Harbor registry and its mirror.
  HARBOR: '192.168.0.110:5000'
  HARBOR_MIRROR: '192.168.0.110:5001'
  # OpenTelemetry settings for the pipeline itself.
  OTEL_EXPORTER_OTLP_ENDPOINT: 'http://192.168.0.188:24318'
  OTEL_SERVICE_NAME: awoooi-cd-dev
  OTEL_RESOURCE_ATTRIBUTES: service.version=${{ github.sha }},deployment.environment=dev

jobs:
  build-and-deploy-dev:
    runs-on: ubuntu-latest
    steps:
# Check out the repository so `git log` and the Docker build context exist.
- uses: actions/checkout@v4
# Export commit metadata (short SHA, truncated subject, start timestamp) as
# step outputs; the notification steps read them.
- name: Get Commit Info
  id: commit
  run: |
    echo "short_sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
    # Keep at most 50 bytes of the subject but never stop in the middle of a
    # multi-byte UTF-8 character. A plain `head -c 50` can split a CJK
    # character, and Telegram rejects text that is not valid UTF-8.
    subject=$(git log -1 --pretty=%s | python3 -c 'import sys; sys.stdout.buffer.write(sys.stdin.buffer.read(50).decode("utf-8", "ignore").encode("utf-8"))')
    echo "message=${subject}" >> "$GITHUB_OUTPUT"
    echo "start_time=$(date +%s)" >> "$GITHUB_OUTPUT"
# Telegram notification that a dev deployment has started.
# Step outputs and secrets are handed over through `env` rather than expanded
# into the script text, so a commit subject containing quotes, `$(...)` or
# backticks cannot change the shell command that runs.
- name: Notify Dev Deploy Start
  env:
    COMMIT_MESSAGE: ${{ steps.commit.outputs.message }}
    SHORT_SHA: ${{ steps.commit.outputs.short_sha }}
    TG_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }}
    TG_CHAT_ID: ${{ secrets.TELEGRAM_CHAT_ID }}
  run: |
    # The message is sent with parse_mode=HTML, so escape markup characters
    # that may appear in the commit subject.
    SAFE_MESSAGE=$(printf '%s' "$COMMIT_MESSAGE" | sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g')
    MSG="🔧 <b>[DEV] 部署開始</b>
    ├ 📝 ${SAFE_MESSAGE}
    ├ 🔖 <code>${SHORT_SHA}</code>
    └ 🌿 dev branch"
    # '%s' instead of '%b': the text already holds real newlines, and '%b'
    # would interpret backslash sequences that happen to be in the subject.
    printf '%s' "$MSG" | curl -fS -X POST "https://api.telegram.org/bot${TG_BOT_TOKEN}/sendMessage" \
      -d "chat_id=${TG_CHAT_ID}" \
      -d "parse_mode=HTML" \
      --data-urlencode "text@-"
# API tests (same suite as the prod CI, so dev has to pass it as well).
# The virtualenv under /opt/api-venv is reused between runs and rebuilt only
# when the md5 of apps/api/pyproject.toml changes.
- name: Run API Tests
  run: |
    # Without pipefail the status of `pytest ... | tail -50` is tail's status,
    # so a failing test run would still reach the success message below.
    set -eo pipefail
    VENV=/opt/api-venv
    HASH_FILE=/opt/api-venv/.deps_hash
    CURRENT_HASH=$(md5sum apps/api/pyproject.toml | awk '{print $1}')
    if [ ! -d "$VENV" ] || [ "$(cat "$HASH_FILE" 2>/dev/null)" != "$CURRENT_HASH" ]; then
      python3 -m venv "$VENV"
      source "$VENV/bin/activate"
      pip install -q uv
      # Subshell: a failed install now fails the step (inside an `a && b && c`
      # chain `-e` ignores it), so the hash is only recorded after success.
      (cd apps/api && uv pip install -q -e ".[dev]")
      echo "$CURRENT_HASH" > "$HASH_FILE"
    else
      source "$VENV/bin/activate"
    fi
    cd apps/api
    pytest tests/ -v --tb=short -x \
      --ignore=tests/test_anomaly_counter.py \
      --ignore=tests/test_global_repair_cooldown.py \
      --ignore=tests/test_redis_multisig.py \
      --ignore=tests/test_model_regression.py \
      --ignore=tests/test_prompt_validation.py \
      2>&1 | tail -50
    echo "✅ API 測試通過"
# Authenticate against the Harbor registry before any image is pushed.
- name: Login to Harbor
  uses: docker/login-action@v3
  with:
    password: ${{ secrets.HARBOR_PASSWORD }}
    username: ${{ secrets.HARBOR_USERNAME }}
    registry: ${{ env.HARBOR }}
# Dev API image: always rebuilt without the layer cache, so configuration
# files such as models.json are guaranteed to be picked up.
- name: Build and Push API (Dev)
  run: |
    IMAGE="${{ env.HARBOR }}/awoooi/api"
    SHA_TAG="${IMAGE}:dev-${{ github.sha }}"
    LATEST_TAG="${IMAGE}:dev-latest"
    # One build, two tags: an immutable per-commit tag and the moving dev-latest.
    docker build --no-cache -f apps/api/Dockerfile -t "$SHA_TAG" -t "$LATEST_TAG" .
    for tag in "$SHA_TAG" "$LATEST_TAG"; do
      docker push "$tag"
    done
    echo "✅ Dev API 鏡像建置完成"
# Inject the dev Kubernetes secrets.
# Writes the deploy key to ~/.ssh/deploy_key, then patches the awoooi-secrets
# Secret in the awoooi-dev namespace over SSH on wooo@192.168.0.121.
# NOTE(review): SSH host key checking is disabled (StrictHostKeyChecking=no).
# NOTE(review): ~/.ssh/deploy_key is not removed by this step; the later
# deploy step reads the same file.
- name: Inject Dev K8s Secrets
# Secret values are exposed to the script as environment variables.
env:
SSH_PRIVATE_KEY: ${{ secrets.DEPLOY_SSH_KEY }}
TG_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }}
TG_CHAT_ID: ${{ secrets.TELEGRAM_CHAT_ID }}
NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }}
GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
# The heredoc delimiter (SECRETS) is unquoted, so every ${VAR} and $(...) in
# the body is expanded on the runner before the text is sent to the remote
# shell; the base64 encoding therefore happens locally.
# The Telegram patch tolerates failure (`|| echo`); the NVIDIA and GEMINI
# patches only run when the corresponding variable is non-empty.
# NOTE(review): the patches use the JSON-patch `replace` op, which presumably
# requires the key to already exist in the Secret - confirm the dev Secret is
# pre-created with these keys.
run: |
mkdir -p ~/.ssh
echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key
chmod 600 ~/.ssh/deploy_key
ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 << SECRETS
set -e
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
sudo kubectl patch secret awoooi-secrets -n awoooi-dev --type='json' -p='[
{"op":"replace","path":"/data/OPENCLAW_TG_BOT_TOKEN","value":"'"$(echo -n "${TG_BOT_TOKEN}" | base64 -w 0)"'"},
{"op":"replace","path":"/data/OPENCLAW_TG_CHAT_ID","value":"'"$(echo -n "${TG_CHAT_ID}" | base64 -w 0)"'"}
]' || echo "⚠️ Telegram Secrets patch 跳過"
if [ -n "${NVIDIA_API_KEY}" ]; then
sudo kubectl patch secret awoooi-secrets -n awoooi-dev --type='json' -p='[
{"op":"replace","path":"/data/NVIDIA_API_KEY","value":"'"$(echo -n "${NVIDIA_API_KEY}" | base64 -w 0)"'"}
]' && echo "✅ NVIDIA_API_KEY 已注入 dev"
fi
if [ -n "${GEMINI_API_KEY}" ]; then
sudo kubectl patch secret awoooi-secrets -n awoooi-dev --type='json' -p='[
{"op":"replace","path":"/data/GEMINI_API_KEY","value":"'"$(echo -n "${GEMINI_API_KEY}" | base64 -w 0)"'"}
]' && echo "✅ GEMINI_API_KEY 已注入 dev"
fi
echo "✅ Dev Secrets 注入完成"
SECRETS
# Deploy to awoooi-dev: apply the ConfigMap, point the Deployment at the image
# built for this commit, wait for the rollout, then probe the health endpoint
# on NodePort 32344 from the cluster host.
# NOTE(review): this step reads ~/.ssh/deploy_key written by the previous
# step; the SSH_PRIVATE_KEY variable declared here is not used by the script.
# NOTE(review): only 02-configmap.yaml is applied here; the Deployment must
# already exist in the namespace for `kubectl set image` to succeed.
- name: Deploy to Dev K8s
  env:
    SSH_PRIVATE_KEY: ${{ secrets.DEPLOY_SSH_KEY }}
  run: |
    cat k8s/awoooi-dev/02-configmap.yaml | \
      ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 \
      "export KUBECONFIG=/etc/rancher/k3s/k3s.yaml && sudo kubectl apply -f -"
    # Quoted delimiter: nothing below is expanded on the runner (the image tag
    # is substituted by the workflow engine before the script starts).
    ssh -o StrictHostKeyChecking=no -i ~/.ssh/deploy_key wooo@192.168.0.121 << 'DEPLOY'
    set -e
    export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
    sudo kubectl set image deployment/awoooi-api \
      api=192.168.0.110:5000/awoooi/api:dev-${{ github.sha }} \
      -n awoooi-dev
    sudo kubectl rollout status deployment/awoooi-api -n awoooi-dev --timeout=120s
    echo "✅ Dev 部署完成"
    # Health check: up to three attempts, 10 s apart.
    sleep 10
    HEALTH_PASS=0
    for i in 1 2 3; do
      # `|| true`: under `set -e` a refused or timed-out connection would
      # otherwise abort the script on the first attempt and skip the retries.
      # --max-time bounds a request that connects but never answers.
      HTTP_CODE=$(curl -s -w "%{http_code}" -o /dev/null --connect-timeout 10 --max-time 20 "http://localhost:32344/api/v1/health" || true)
      if [ "$HTTP_CODE" = "200" ]; then
        echo "✅ Dev API 健康檢查通過 (port 32344)"
        HEALTH_PASS=1
        break
      fi
      echo "⏳ 嘗試 #$i: HTTP ${HTTP_CODE}等待 10s..."
      sleep 10
    done
    if [ "$HEALTH_PASS" = "0" ]; then
      echo "❌ Dev API 健康檢查失敗"
      exit 1
    fi
    DEPLOY
# Telegram notification for a successful dev deployment, including the elapsed
# time since the "Get Commit Info" step recorded start_time.
# Step outputs and secrets are passed through `env` so that the commit subject
# is never expanded into the script text.
- name: Notify Dev Deploy Success
  env:
    COMMIT_MESSAGE: ${{ steps.commit.outputs.message }}
    SHORT_SHA: ${{ steps.commit.outputs.short_sha }}
    START_TIME: ${{ steps.commit.outputs.start_time }}
    TG_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }}
    TG_CHAT_ID: ${{ secrets.TELEGRAM_CHAT_ID }}
  run: |
    END_TIME=$(date +%s)
    DURATION=$((END_TIME - START_TIME))
    # Not named SECONDS: that is a special bash variable which keeps counting
    # after it is assigned.
    ELAPSED_MIN=$((DURATION / 60))
    ELAPSED_SEC=$((DURATION % 60))
    # The message is sent with parse_mode=HTML, so escape markup characters
    # that may appear in the commit subject.
    SAFE_MESSAGE=$(printf '%s' "$COMMIT_MESSAGE" | sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g')
    MSG="✅ <b>[DEV] 部署完成</b>
    ├ 📝 ${SAFE_MESSAGE}
    ├ 🔖 <code>${SHORT_SHA}</code>
    ├ ⏱️ 耗時: ${ELAPSED_MIN}m ${ELAPSED_SEC}s
    └ 🩺 http://192.168.0.125:32344/api/v1/health"
    # '%s' instead of '%b': the text already holds real newlines.
    printf '%s' "$MSG" | curl -fS -X POST "https://api.telegram.org/bot${TG_BOT_TOKEN}/sendMessage" \
      -d "chat_id=${TG_CHAT_ID}" \
      -d "parse_mode=HTML" \
      --data-urlencode "text@-"
# Telegram notification sent when any earlier step of the job failed.
# Step outputs and secrets are passed through `env` so that the commit subject
# is never expanded into the script text.
- name: Notify Dev Deploy Failure
  if: failure()
  env:
    COMMIT_MESSAGE: ${{ steps.commit.outputs.message }}
    SHORT_SHA: ${{ steps.commit.outputs.short_sha }}
    TG_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }}
    TG_CHAT_ID: ${{ secrets.TELEGRAM_CHAT_ID }}
  run: |
    # The message is sent with parse_mode=HTML, so escape markup characters
    # that may appear in the commit subject.
    SAFE_MESSAGE=$(printf '%s' "$COMMIT_MESSAGE" | sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g')
    MSG="❌ <b>[DEV] 部署失敗</b>
    ├ 📝 ${SAFE_MESSAGE}
    ├ 🔖 <code>${SHORT_SHA}</code>
    └ 🔗 <a href=\"http://192.168.0.110:3001/wooo/awoooi/actions\">查看日誌</a>"
    # '%s' instead of '%b': the text already holds real newlines.
    printf '%s' "$MSG" | curl -fS -X POST "https://api.telegram.org/bot${TG_BOT_TOKEN}/sendMessage" \
      -d "chat_id=${TG_CHAT_ID}" \
      -d "parse_mode=HTML" \
      --data-urlencode "text@-"

View File

@@ -93,11 +93,14 @@ jobs:
password: ${{ secrets.HARBOR_PASSWORD }}
# ── API 鏡像建置(含 Layer Cache 加速)──────────────────────────────
# 2026-04-01 ogt: CACHE_BUST=git_sha 確保 src/ 和 models.json 層每次重建
# deps 層 (pip install) 仍可 cache → 加速;代碼/配置層強制失效
- name: Build and Push API
run: |
docker build -f apps/api/Dockerfile \
--build-arg BUILDKIT_INLINE_CACHE=1 \
--cache-from ${{ env.HARBOR }}/awoooi/api:latest \
--build-arg CACHE_BUST=${{ github.sha }} \
-t ${{ env.HARBOR }}/awoooi/api:${{ github.sha }} \
-t ${{ env.HARBOR }}/awoooi/api:latest \
.

1
.gitignore vendored
View File

@@ -79,3 +79,4 @@ temp/
.claude/scheduled_tasks.lock
.cursor/
.agents/memory/
playwright-mcp/

View File

@@ -39,7 +39,9 @@ WORKDIR /app
COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
COPY --from=builder /usr/local/bin /usr/local/bin
# Copy application code and models config
# 2026-04-01 ogt: CACHE_BUST forces the src/ and models.json layers to be
# rebuilt, while the dependency layers (pip install) remain cacheable.
# A build arg only causes a cache miss at the first instruction that uses it,
# and COPY does not use it, so the RUN below consumes the value explicitly;
# every layer after it is then rebuilt whenever CACHE_BUST changes.
# NOTE(review): assumes the runtime base image provides /bin/sh - confirm.
ARG CACHE_BUST=none
RUN echo "cache-bust: ${CACHE_BUST}"
COPY apps/api/src/ ./src/
COPY apps/api/models.json ./models.json

View File

@@ -787,14 +787,16 @@ class NvidiaProvider:
return text, True, total_tokens, cost_usd
except httpx.TimeoutException as e:
self._circuit_breaker.record_failure()
# 2026-04-01 ogt: timeout 不計入 circuit breaker
# Nemo free tier 偶爾慢是正常的,下次請求仍應優先嘗試
# 只有硬性錯誤 (auth/rate limit) 才應斷路
NVIDIA_REQUESTS_TOTAL.labels(status="timeout", tool_name="chat").inc()
logger.warning("nvidia_chat_timeout", error=str(e))
return f"Timeout: {e}", False, 0, 0.0
except httpx.HTTPStatusError as e:
# 2026-03-31 ogt: 記錄完整響應體以診斷 400 錯誤
self._circuit_breaker.record_failure()
self._circuit_breaker.record_failure() # 硬性錯誤才斷路
NVIDIA_REQUESTS_TOTAL.labels(status="error", tool_name="chat").inc()
response_text = e.response.text if e.response else "No response body"
logger.warning(

View File

@@ -5,11 +5,15 @@
---
## 📍 當前狀態 (2026-04-01 11:00 台北)
## 📍 當前狀態 (2026-04-01 17:30 台北)
| 項目 | 狀態 |
|------|------|
| **P0 Telegram 按鈕修復** | ✅ **`e6f6734`** Redis Leader Election (多 Pod 409 修復) — CD 推送中 |
| **Phase S + MCP 整合** | ✅ **`c9c60c3`** Phase S架構修復(82→完整) + ADR-048/049/050/051 + Playwright CD |
| **ClawBot v5 LLM 修復 P0** | ✅ **`0d4034d`** (clawbot-v5) Ollama 預設 + qwen2.5:7b-instruct — 待部署 |
| **Phase S 技術債清理** | ✅ **`22de22c`** S-01~S-05 全部完成 (393 passed) |
| **用量優化 + Memory 精簡** | ✅ MEMORY 204→127行 / CLAUDE.md 221→83行 / 5個新Memory (token_efficiency + daily_habits + workflow_schedule + ui_protocol + design_quickref) |
| **P0 Telegram 按鈕修復** | ✅ **`e6f6734`** Redis Leader Election (多 Pod 409 修復) |
| **首席架構師完整審查 (含 Code + 測試)** | ✅ **96/100 OUTSTANDING** 詳見下方審查報告 |
| **測試修復** | ✅ **test_smart_router + test_auto_repair + test_global_repair** 全部修復 |
| **Phase R 首席架構師完整審查** | ✅ **ADR-047 97/100 OUTSTANDING** R1-R4 + ADR-046 全部通過 |

View File

@@ -0,0 +1,38 @@
# Namespace that isolates the dev environment.
kind: Namespace
apiVersion: v1
metadata:
  name: awoooi-dev
  labels:
    name: awoooi-dev
    system: awoooi
    environment: dev
---
# Aggregate ceiling for the awoooi-dev namespace: total requests, total limits
# and pod count.
kind: ResourceQuota
apiVersion: v1
metadata:
  namespace: awoooi-dev
  name: awoooi-dev-quota
spec:
  hard:
    pods: '10'
    requests.cpu: '2'
    requests.memory: 4Gi
    limits.cpu: '4'
    limits.memory: 8Gi
---
# Per-container default requests and limits for the awoooi-dev namespace.
kind: LimitRange
apiVersion: v1
metadata:
  namespace: awoooi-dev
  name: awoooi-dev-limits
spec:
  limits:
    - type: Container
      defaultRequest:
        cpu: '100m'
        memory: '128Mi'
      default:
        cpu: '500m'
        memory: '512Mi'

View File

@@ -0,0 +1,48 @@
# AWOOOI dev environment ConfigMap
# Owner: CIO
# Version: v1.0
# Date: 2026-04-01 (Taipei time zone)
# Purpose: development/testing only; must not be used for production.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: awoooi-dev
  name: awoooi-config
data:
  ENVIRONMENT: 'dev'
  SYSTEM_NAME: 'awoooi'
  # Service endpoints.
  OLLAMA_URL: 'http://192.168.0.188:11434'
  OPENCLAW_URL: 'http://192.168.0.188:8089'
  KALI_SCANNER_URL: 'http://192.168.0.112:8080'
  SIGNOZ_URL: 'http://192.168.0.188:3301'
  LANGFUSE_URL: 'http://192.168.0.110:3100'
  # OpenTelemetry.
  OTEL_ENABLED: 'true'
  OTEL_EXPORTER_OTLP_ENDPOINT: 'http://192.168.0.188:24317'
  OTEL_SERVICE_NAME: 'awoooi-api-dev'
  LOG_LEVEL: 'DEBUG'
  # JSON-encoded lists; kept as strings.
  CORS_ORIGINS: '["http://localhost:3000","http://192.168.0.121:32344","http://192.168.0.125:32344"]'
  AI_FALLBACK_ORDER: '["nvidia","gemini","ollama","claude"]'
  AI_CACHE_TTL: '300'
  ENABLE_NEMOTRON_COLLABORATION: 'true'
  NEMOTRON_TIMEOUT_SECONDS: '30'
  NEMOTRON_ASYNC_UPDATE: 'true'
  TELEGRAM_ENABLE_POLLING: 'false'
  CACHE_TTL_DASHBOARD: '60'
  CACHE_TTL_HOST_STATUS: '15'
  CACHE_TTL_AI_RESPONSE: '300'
  # Sentry.
  SENTRY_URL: 'http://192.168.0.110:9000'
  SENTRY_ORG: 'awoooi'
  SENTRY_PROJECT_API: 'awoooi-api'
  SENTRY_PROJECT_WEB: 'awoooi-web'
  # Dev: shadow mode is switched off so that auto-repair can be tested.
  SHADOW_MODE_ENABLED: 'false'
  SHADOW_MODE_LOG_ONLY: 'false'

View File

@@ -0,0 +1,81 @@
# AWOOOI API - dev environment Deployment
# Version: v1.0 | Date: 2026-04-01
kind: Deployment
apiVersion: apps/v1
metadata:
  namespace: awoooi-dev
  name: awoooi-api
  labels:
    app: awoooi-api
    environment: dev
    system: awoooi
spec:
  replicas: 1
  revisionHistoryLimit: 2
  selector:
    matchLabels:
      app: awoooi-api
      environment: dev
  template:
    metadata:
      labels:
        app: awoooi-api
        environment: dev
        system: awoooi
    spec:
      # Runs under the dev executor ServiceAccount with its token mounted.
      serviceAccountName: awoooi-executor-dev
      automountServiceAccountToken: true
      containers:
        - name: api
          image: '192.168.0.110:5000/awoooi/api:dev-latest'
          imagePullPolicy: Always
          ports:
            - name: http
              containerPort: 8000
          # Environment comes from the awoooi-config ConfigMap and the
          # awoooi-secrets Secret.
          envFrom:
            - configMapRef:
                name: awoooi-config
            - secretRef:
                name: awoooi-secrets
          resources:
            limits:
              cpu: '500m'
              memory: '512Mi'
            requests:
              cpu: '100m'
              memory: '256Mi'
          # Both probes hit the same health endpoint on the container port.
          readinessProbe:
            httpGet:
              path: /api/v1/health
              port: 8000
            initialDelaySeconds: 10
            periodSeconds: 10
            failureThreshold: 3
          livenessProbe:
            httpGet:
              path: /api/v1/health
              port: 8000
            initialDelaySeconds: 30
            periodSeconds: 15
            failureThreshold: 3
---
# NodePort Service exposing the dev API on port 32344 of the node.
kind: Service
apiVersion: v1
metadata:
  namespace: awoooi-dev
  name: awoooi-api-svc
  labels:
    app: awoooi-api
    environment: dev
spec:
  type: NodePort
  selector:
    app: awoooi-api
    environment: dev
  ports:
    - name: http
      port: 8000
      targetPort: 8000
      nodePort: 32344

View File

@@ -0,0 +1,43 @@
# AWOOOI Dev RBAC - least privilege
# Version: v1.0 | Date: 2026-04-01
kind: ServiceAccount
apiVersion: v1
metadata:
  namespace: awoooi-dev
  name: awoooi-executor-dev
---
# Namespaced permissions for the dev executor: read and modify Deployments
# (including scale), read and delete Pods, read Events.
kind: Role
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  namespace: awoooi-dev
  name: awoooi-executor-dev-role
rules:
  - apiGroups:
      - apps
    resources:
      - deployments
    verbs:
      - get
      - list
      - watch
      - patch
      - update
  - apiGroups:
      - apps
    resources:
      - deployments/scale
    verbs:
      - patch
      - update
  - apiGroups:
      - ''
    resources:
      - pods
    verbs:
      - get
      - list
      - watch
      - delete
  - apiGroups:
      - ''
    resources:
      - events
    verbs:
      - get
      - list
      - watch
---
# Binds the awoooi-executor-dev ServiceAccount to awoooi-executor-dev-role
# inside the awoooi-dev namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: awoooi-executor-dev-binding
  namespace: awoooi-dev
subjects:
  - kind: ServiceAccount
    name: awoooi-executor-dev
    namespace: awoooi-dev
roleRef:
  kind: Role
  # roleRef identifies the role through `name`; there is no `apiRef` field.
  name: awoooi-executor-dev-role
  apiGroup: rbac.authorization.k8s.io