From 8eaf2acb0d11faa7ebce475e15d74c26d8dca302 Mon Sep 17 00:00:00 2001 From: OG T Date: Mon, 23 Mar 2026 10:59:20 +0800 Subject: [PATCH] docs(skills): add guardrails and dry-run principles - Skill 03: Add proposal guardrails (forbidden commands, namespace binding) - Skill 04: Add idempotency and garbage collection awareness - Skill 05: Add dry-run first principle for destructive operations Co-Authored-By: Claude Opus 4.5 --- .../skills/03-openclaw-cognitive-expert.md | 69 +++++++++++++++++++ .agents/skills/04-awoooi-devops-commander.md | 58 ++++++++++++++++ .agents/skills/05-awoooi-sre-qa.md | 60 ++++++++++++++++ 3 files changed, 187 insertions(+) diff --git a/.agents/skills/03-openclaw-cognitive-expert.md b/.agents/skills/03-openclaw-cognitive-expert.md index 53ff9fb6..27d2f631 100644 --- a/.agents/skills/03-openclaw-cognitive-expert.md +++ b/.agents/skills/03-openclaw-cognitive-expert.md @@ -128,6 +128,75 @@ await redis_client.xack("stream:awoooi_signals", "awoooi_workers", message_id) --- +## 🚨 提案防爆圈 Guardrails (2026-03-23 首席架構師指令) + +> **目的**: AI 生成的修復提案可能包含危險指令,必須裝上保險絲 + +### 鐵律 1: 毀滅性指令黑名單 + +```python +# ❌ 絕對禁止出現在 AI 生成的提案中 +FORBIDDEN_COMMANDS = [ + "rm -rf /", + "rm -rf /*", + "DROP DATABASE", + "DROP TABLE", + "TRUNCATE", + "kubectl delete namespace", + "kubectl delete -A", + "> /dev/sda", + "mkfs", + ":(){:|:&};:", # Fork bomb +] + +def validate_proposal_safety(script: str) -> bool: + """提案安全檢查 - 必須在生成後立即執行""" + for forbidden in FORBIDDEN_COMMANDS: + if forbidden.lower() in script.lower(): + logger.critical("dangerous_command_blocked", command=forbidden) + return False + return True +``` + +### 鐵律 2: K8s Namespace 強制綁定 + +```bash +# ✅ 正確: 所有 K8s 指令必須帶 Namespace +kubectl rollout restart deployment/awoooi-api -n awoooi-prod + +# ❌ 禁止: 無 Namespace (可能影響其他系統) +kubectl rollout restart deployment/awoooi-api + +# ❌ 絕對禁止: 跨越到系統 Namespace +kubectl delete pod xxx -n kube-system +kubectl delete pod xxx -n default +``` + +### 鐵律 3: 提案白名單 Namespace + +```python 
+ALLOWED_NAMESPACES = ["awoooi-prod", "awoooi-dev"] + +def validate_namespace(command: str) -> bool: + """檢查 K8s 指令是否在許可 Namespace 內""" + if "kubectl" in command: + for ns in ALLOWED_NAMESPACES: + if f"-n {ns}" in command or f"--namespace={ns}" in command: + return True + return False # 無 Namespace 或不在白名單 + return True +``` + +### 檢查清單 (每個提案生成後) + +| 檢查項目 | 通過條件 | +|---------|---------| +| 毀滅性指令掃描 | 黑名單指令數量 = 0 | +| Namespace 驗證 | 所有 kubectl 指令帶白名單 Namespace (`-n awoooi-prod` 或 `-n awoooi-dev`) | +| 權限邊界 | 不涉及 kube-system, default | + +--- + ## 參考文檔 - `apps/api/src/services/incident_engine.py`: 聚合引擎 diff --git a/.agents/skills/04-awoooi-devops-commander.md b/.agents/skills/04-awoooi-devops-commander.md index 411905f4..1bae33c7 100644 --- a/.agents/skills/04-awoooi-devops-commander.md +++ b/.agents/skills/04-awoooi-devops-commander.md @@ -231,6 +231,64 @@ kubectl patch secret awoooi-secrets -n awoooi-prod --type='json' \ --- +## 🚨 冪等性與垃圾回收 (2026-03-23 首席架構師指令) + +> **教訓**: 50 個殭屍消費者告訴我們 - 容器會死,但它在外部系統留下的垃圾不會自己消失 + +### 鐵律 1: 重啟前思考殘留狀態 + +```bash +# ❌ 危險: 直接重啟,不考慮外部狀態 +kubectl rollout restart deployment/awoooi-worker -n awoooi-prod + +# ✅ 正確: 重啟前先清理外部殘留 +# Step 1: 檢查是否有殭屍消費者 +ssh ollama@192.168.0.188 "docker exec clawbot-redis redis-cli -n 10 \ + XINFO GROUPS stream:awoooi_signals" + +# Step 2: 如果 consumers 數量異常高,先清理 (注意: XGROUP DESTROY 會連同未 ACK 的 Pending 訊息一併刪除,執行前先用 XPENDING 確認無積壓) +ssh ollama@192.168.0.188 "docker exec clawbot-redis redis-cli -n 10 \ + XGROUP DESTROY stream:awoooi_signals awoooi_workers" + +# Step 3: 重啟後,Consumer Group 會自動重建 +kubectl rollout restart deployment/awoooi-worker -n awoooi-prod +``` + +### 鐵律 2: Stateful 服務必須實作清理機制 + +| 服務類型 | 殘留風險 | 清理機制 | +|---------|---------|---------| +| Redis Consumer | 殭屍消費者 | graceful shutdown + XGROUP DELCONSUMER | +| PostgreSQL Connection | 連線池殘留 | pool.dispose() on shutdown | +| K8s ConfigMap/Secret | 孤立配置 | ownerReferences 綁定 | +| Harbor Image | 舊版本堆積 | retention policy 自動清理 | + +### 鐵律 3: 失敗重試的冪等性檢查 + +```python +# ✅ 正確: 直接創建並容忍 BUSYGROUP (冪等) +async def ensure_consumer_group(): + try: + await 
redis.xgroup_create(stream, group, mkstream=True) + except ResponseError as e: + if "BUSYGROUP" not in str(e): # 已存在不是錯誤 + raise + +# ❌ 危險: 不檢查直接創建 (非冪等) +await redis.xgroup_create(stream, group) # 重試會失敗 +``` + +### 檢查清單 (每次重啟/重建前) + +| 檢查項目 | 動作 | +|---------|------| +| Redis Consumer Group | `XINFO GROUPS` 確認 consumers 數量合理 | +| DB Connection Pool | 確認無 idle connection 洩漏 | +| K8s ReplicaSet | 確認舊 RS replicas=0 | +| Pending Messages | `XPENDING` 確認無積壓 | + +--- + ## 參考文檔 - `k8s/awoooi-prod/`: K8s Manifests diff --git a/.agents/skills/05-awoooi-sre-qa.md b/.agents/skills/05-awoooi-sre-qa.md index 34ea65ec..0e89c55e 100644 --- a/.agents/skills/05-awoooi-sre-qa.md +++ b/.agents/skills/05-awoooi-sre-qa.md @@ -298,6 +298,66 @@ const signApproval = async () => { --- +## 🚨 Dry-Run 先行原則 (2026-03-23 首席架構師指令) + +> **目的**: 對於破壞性操作,事後驗證來不及 - 必須事前模擬 + +### 鐵律 1: K8s 變更必須先 Dry-Run + +```bash +# ✅ 正確: 先 dry-run 確認變更範圍 +kubectl apply -f k8s/awoooi-prod/deployment.yaml --dry-run=client +kubectl apply -f k8s/awoooi-prod/deployment.yaml --dry-run=server # 更嚴格 + +# 確認輸出無誤後,才真正執行 +kubectl apply -f k8s/awoooi-prod/deployment.yaml + +# ❌ 禁止: 直接套用到 Prod +kubectl apply -f k8s/awoooi-prod/deployment.yaml # 沒有先 dry-run +``` + +### 鐵律 2: 資料庫變更必須先預覽 + +```sql +-- ✅ 正確: 先用 SELECT 確認影響範圍 +SELECT COUNT(*) FROM incidents WHERE created_at < '2025-01-01'; +-- 確認數量合理後才執行 +DELETE FROM incidents WHERE created_at < '2025-01-01'; + +-- ❌ 禁止: 直接 DELETE/UPDATE +DELETE FROM incidents WHERE ... 
-- 沒有先確認影響範圍 +``` + +### 鐵律 3: 提案給統帥前必須附帶 Dry-Run 結果 + +```markdown +## Tier 2/3 提案模板 + +### 擬執行指令 +kubectl rollout restart deployment/awoooi-api -n awoooi-prod + +### Dry-Run 結果 +deployment.apps/awoooi-api restarted (dry-run) + +### 影響範圍 +- 2 個 API Pod 會滾動重啟 +- 預計停機時間: 0 秒 (Rolling Update) + +### 回滾方案 +kubectl rollout undo deployment/awoooi-api -n awoooi-prod +``` + +### 檢查清單 (向統帥提案前) + +| 檢查項目 | 必須附帶 | +|---------|---------| +| K8s YAML 變更 | `--dry-run=client` 輸出 | +| kubectl delete | 影響的 Pod/資源清單 | +| 資料庫變更 | SELECT COUNT 結果 | +| Image 更新 | 新舊 Image Tag 對比 | + +--- + ## 參考文檔 - `apps/web/playwright.config.ts`: Playwright 設定