From 6dc03c9a552c8f5f3c4dadec8227c91305ea5ffe Mon Sep 17 00:00:00 2001
From: OG T
Date: Sun, 12 Apr 2026 14:20:52 +0800
Subject: [PATCH] =?UTF-8?q?fix(argocd)+feat(flywheel):=20Phase=201=20?=
 =?UTF-8?q?=E5=AE=8C=E6=88=90=20=E2=80=94=20ArgoCD=20image=20=E6=96=B7?=
 =?UTF-8?q?=E8=B7=AF=E4=BF=AE=E5=BE=A9=20+=20=E5=86=B7=E5=95=9F=E5=8B=95?=
 =?UTF-8?q?=E8=85=B3=E6=9C=AC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. k8s/argocd/awoooi-prod-app.yaml: 移除 Deployment image ignoreDifferences
   - 原設計造成 CD 更新 kustomization.yaml 後 ArgoCD 不更新 image
   - 修復後 GitOps 閉環恢復正常

2. scripts/cold_start_playbooks.py: ADR-073 Phase 1 Step 8 — 生成 15 個基礎 Playbook
   (K8s/Docker/DB/Infra) 執行結果: Playbooks 0 → 15

3. scripts/batch_vectorize_km.py: ADR-073 Phase 1 Step 9 — 批次向量化 KM
   執行結果: 711/713 embedding IS NOT NULL

Phase 1 全部完成,飛輪已解封:
- Pod 運行 105998d(含 8be87b0 所有修復)
- debounce 30min + alertname NULL 修復 + _collect_mcp_context 啟用
- 15 Playbooks + 711 KM 向量化

Co-Authored-By: Claude Sonnet 4.6
---
 docs/LOGBOOK.md                 |  23 ++-
 k8s/argocd/awoooi-prod-app.yaml |  10 +-
 scripts/batch_vectorize_km.py   |  95 +++++++++
 scripts/cold_start_playbooks.py | 344 ++++++++++++++++++++++++++++++++
 4 files changed, 462 insertions(+), 10 deletions(-)
 create mode 100644 scripts/batch_vectorize_km.py
 create mode 100644 scripts/cold_start_playbooks.py

diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md
index cb381895..863ef5c8 100644
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -6,11 +6,24 @@
 
 ---
 
-## 📍 當前狀態 (2026-04-12 下午 — 整合規格書寫入,等待統帥批准實作)
+## 📍 當前狀態 (2026-04-12 深夜 — Phase 1 全部完成,Phase 2 待執行)
 
-**系統狀態**: ADR-073/074 整合規格書完成 ✅ | 飛輪四層解決方案定案 | 等待批准後進入實作
+**系統狀態**: Phase 1 全部 9 步驟完成 ✅ | Pod 運行 105998d ✅ | 15 Playbooks ✅ | 711/713 KM 向量化 ✅
 
-**系統狀態**: 所有 Backlog 清零 ✅ | SSH MCP 188/110 連通 ✅ | 人工操作清單清零(A-3 bitan P3 暫緩)
+**Phase 1 完成清單**:
+- ✅ 1-1~1-3: Harbor 確認 + kustomization→105998d + ArgoCD sync
+- ✅ 1-4: Pod image 105998d 已驗證
+- ✅ 1-5: `_collect_mcp_context` 存在 Pod
+- ✅ 1-6: debounce 5→30 min
+- ✅ 1-7: alertname NULL 根因修復(signals JSONB alias)
+- ✅ 1-8: cold_start_playbooks.py — Playbooks 0→15
+- ✅ 1-9: batch_vectorize_km.py — 711/713 KM 向量化
+
+**架構修復**:
+- ArgoCD ignoreDifferences 移除(image 更新路徑修通)
+- B5 CI break-glass(TODO 2026-04-13 恢復)
+
+**下一步**: Phase 2(DB Migration + classify_alert_early)
 
 ---
 
@@ -41,6 +54,10 @@
 | 2026-04-12 | docs(adr): ADR-073 補充 ADR-071 整合工作序 + ADR-074 Sprint | f2b427d |
 | 2026-04-12 | docs(spec): v2.2 §15 Subsystem 1 四階段路線圖(截圖定案)| d3ddaaf |
 | 2026-04-12 | docs(spec): 規格書 v2.0 — 四階段細化實施步驟 + 防偏差守則(等待批准)| — |
+| 2026-04-12 | fix(flywheel): Phase 1 — kustomization→8be87b0 + debounce 30min + alertname NULL | 7c4b36c |
+| 2026-04-12 | fix(ci): Break-Glass — B5 flaky PG test bypass,解封 P0 飛輪部署 | 105998d |
+| 2026-04-12 | fix(argocd): 移除 ignoreDifferences image,修復 GitOps image 更新斷路 | — |
+| 2026-04-12 | feat(flywheel): Phase 1 完成 — 15 Playbooks 冷啟動 ✅ | 711/713 KM 向量化 ✅ |
 
 ---
 
diff --git a/k8s/argocd/awoooi-prod-app.yaml b/k8s/argocd/awoooi-prod-app.yaml
index a67071fd..5ca9453d 100644
--- a/k8s/argocd/awoooi-prod-app.yaml
+++ b/k8s/argocd/awoooi-prod-app.yaml
@@ -49,14 +49,10 @@ spec:
         maxDuration: 3m
 
   # 忽略差異項目
+  # ADR-073 修復 (2026-04-12 ogt): 移除 Deployment image ignoreDifferences
+  # 原設計意圖: 防止 drift 偵測;但副作用是 CD 更新 kustomization.yaml 後 ArgoCD 不更新 image
+  # 現在 kustomization.yaml newTag 是 image 更新的唯一驅動,必須讓 ArgoCD 讀取並 apply
   ignoreDifferences:
-    # kustomization.yaml 中的 IMAGE_TAG_PLACEHOLDER 由 CD 動態注入,
-    # 不計入 ArgoCD drift 偵測(否則每次 push 前都會顯示 OutOfSync)
-    - group: apps
-      kind: Deployment
-      jsonPointers:
-        - /spec/template/spec/containers/0/image
-        - /spec/template/spec/containers/1/image
     # secrets 由 CD 管理,不進入 ArgoCD drift 偵測
     - group: ""
       kind: Secret
diff --git a/scripts/batch_vectorize_km.py b/scripts/batch_vectorize_km.py
new file mode 100644
index 00000000..77694e70
--- /dev/null
+++ b/scripts/batch_vectorize_km.py
@@ -0,0 +1,95 @@
+"""
+batch_vectorize_km.py — ADR-073 Phase 1 Step 9
+
+Batch-vectorize every KnowledgeEntry whose embedding IS NULL.
+
+Usage (inside the K8s Pod):
+    # Through the AWOOOI API embed-all endpoint
+    python3 scripts/batch_vectorize_km.py --via-api
+
+    # Call the service directly (must run inside the Pod)
+    python3 scripts/batch_vectorize_km.py
+
+2026-04-12 Claude Sonnet 4.6 (ADR-073 Phase 1)
+"""
+import asyncio
+import asyncpg
+import os
+import sys
+import httpx
+
+VIA_API = "--via-api" in sys.argv
+DRY_RUN = "--dry-run" in sys.argv
+DATABASE_URL = os.environ.get("DATABASE_URL", "").replace("postgresql+asyncpg://", "postgresql://")
+API_BASE = os.environ.get("API_BASE", "http://localhost:8000")
+
+
+async def check_status():
+    """Return (total, null_embedding) row counts from knowledge_entries."""
+    conn = await asyncpg.connect(DATABASE_URL)
+    try:
+        total = await conn.fetchval("SELECT count(*) FROM knowledge_entries")
+        null_emb = await conn.fetchval("SELECT count(*) FROM knowledge_entries WHERE embedding IS NULL")
+    finally:
+        # Close the connection even when a query raises.
+        await conn.close()
+    return total, null_emb
+
+
+async def via_api():
+    """POST to /api/v1/knowledge/embed-all and print the response."""
+    print(f"呼叫 {API_BASE}/api/v1/knowledge/embed-all ...")
+    async with httpx.AsyncClient(timeout=300.0) as client:
+        resp = await client.post(f"{API_BASE}/api/v1/knowledge/embed-all")
+        if resp.status_code == 200:
+            result = resp.json()
+            print(f"✅ API 回應: {result}")
+        else:
+            print(f"❌ API 錯誤 {resp.status_code}: {resp.text}")
+
+
+async def via_service():
+    """Call KnowledgeService.embed_all_entries() directly (in-process)."""
+    sys.path.insert(0, "/app")
+    from src.services.knowledge_service import get_knowledge_service
+
+    service = get_knowledge_service()
+    result = await service.embed_all_entries()
+    print(f"✅ 向量化結果: {result}")
+
+
+async def main():
+    """Show pending counts, run the embedding (API or service), then re-check."""
+    if not DATABASE_URL and not VIA_API:
+        print("ERROR: DATABASE_URL 未設定,請加 --via-api 使用 API 模式")
+        sys.exit(1)
+
+    # null_emb stays None when the pre-check is skipped, so --dry-run can still
+    # print a summary instead of failing on an unassigned name (--via-api --dry-run).
+    null_emb = None
+    if DATABASE_URL and (DRY_RUN or not VIA_API):
+        total, null_emb = await check_status()
+        print(f"knowledge_entries: total={total}, embedding IS NULL={null_emb}")
+        if null_emb == 0:
+            print("✅ 所有條目已向量化,無需執行")
+            return
+
+    if DRY_RUN:
+        pending = "?" if null_emb is None else null_emb
+        print(f"[DRY RUN] 將向量化 {pending} 筆 KM 條目")
+        print("使用 --via-api 透過 API 執行,或不加參數直接執行 Service")
+        return
+
+    if VIA_API:
+        await via_api()
+    else:
+        await via_service()
+
+    # Confirm the state after the run
+    if DATABASE_URL:
+        total, null_emb = await check_status()
+        print(f"\n完成後狀態: total={total}, embedding IS NULL={null_emb}, vectorized={total - null_emb}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/scripts/cold_start_playbooks.py b/scripts/cold_start_playbooks.py
new file mode 100644
index 00000000..02641cd0
--- /dev/null
+++ b/scripts/cold_start_playbooks.py
@@ -0,0 +1,344 @@
+"""
+cold_start_playbooks.py — ADR-073 Phase 1 Step 8
+
+Flywheel cold start: pre-seed 15+ baseline Playbooks covering the most common alert types.
+Data sources:
+  1. EXECUTION_SUCCESS records (2 entries)
+  2. Standard remediation templates for known alert types
+
+Usage:
+    python3 scripts/cold_start_playbooks.py --dry-run  # preview
+    python3 scripts/cold_start_playbooks.py            # write to the database
+
+2026-04-12 Claude Sonnet 4.6 (ADR-073 Phase 1)
+"""
+import asyncio
+import asyncpg
+import json
+import os
+import sys
+import uuid
+from datetime import datetime, timedelta, timezone
+
+DRY_RUN = "--dry-run" in sys.argv
+DATABASE_URL = os.environ.get("DATABASE_URL", "").replace("postgresql+asyncpg://", "postgresql://")
+
+if not DATABASE_URL:
+    print("ERROR: DATABASE_URL 未設定")
+    sys.exit(1)
+
+
+def now_taipei():
+    """Return the current time as an aware datetime at a fixed UTC+8 offset (Asia/Taipei)."""
+    return datetime.now(timezone(timedelta(hours=8)))
+
+
+# main() writes name, description, repair_steps and estimated_minutes, and stores
+# symptom_alertnames + severity_range together as symptom_pattern. alert_type and
+# source_approval_id are not read by main(); category only appears in the --dry-run listing.
+PLAYBOOK_TEMPLATES = [
+    {
+        "name": "Kubernetes Pod CrashLoopBackOff 修復",
+        "alert_type": "KubePodCrashLooping",
+        "description": "Pod 持續崩潰重啟,透過 rollout restart 觸發重新部署",
+        "category": "kubernetes",
+        "repair_steps": [
+            {"command": "kubectl get pod {target} -n {namespace} -o yaml", "purpose": "查看 Pod 狀態和事件"},
+            {"command": "kubectl logs {target} -n {namespace} --previous --tail=100", "purpose": "查看崩潰前日誌"},
+            {"command": "kubectl rollout restart deployment/{deployment} -n {namespace}", "purpose": "重新部署修復"},
+        ],
+        "estimated_minutes": 5,
+        "symptom_alertnames": ["KubePodCrashLooping", "KubePodNotReady"],
+        "severity_range": ["P1", "P2"],
+    },
+    {
+        "name": "Kubernetes Deployment 重新啟動",
+        "alert_type": "KubeDeploymentReplicasMismatch",
+        "description": "Deployment replicas 不符預期,執行 rollout restart 恢復",
+        "category": "kubernetes",
+        "repair_steps": [
+            {"command": "kubectl get deployment {target} -n {namespace}", "purpose": "確認 Deployment 狀態"},
+            {"command": "kubectl rollout restart deployment/{target} -n {namespace}", "purpose": "重新部署"},
+            {"command": "kubectl rollout status deployment/{target} -n {namespace} --timeout=120s", "purpose": "等待部署完成"},
+        ],
+        "estimated_minutes": 5,
+        "symptom_alertnames": ["KubeDeploymentReplicasMismatch", "KubeDeploymentGenerationMismatch"],
+        "severity_range": ["P1", "P2"],
+    },
+    {
+        "name": "AWOOOI API 服務重啟",
+        "alert_type": "KubePodCrashLooping",
+        "description": "awoooi-api 服務異常,重新啟動 deployment",
+        "category": "kubernetes",
+        "repair_steps": [
+            {"command": "kubectl rollout restart deployment/awoooi-api -n awoooi-prod", "purpose": "重啟 API 服務"},
+            {"command": "kubectl rollout status deployment/awoooi-api -n awoooi-prod --timeout=120s", "purpose": "等待就緒"},
+        ],
+        "estimated_minutes": 3,
+        "symptom_alertnames": ["KubePodCrashLooping"],
+        "severity_range": ["P1"],
+        "source_approval_id": "003e3eb2-7e58-47fd-bd24-c9e1fdf6cb1a",  # EXECUTION_SUCCESS
+    },
+    {
+        "name": "AWOOOI Worker 服務重啟",
+        "alert_type": "KubePodCrashLooping",
+        "description": "awoooi-worker 服務異常,重新啟動 deployment",
+        "category": "kubernetes",
+        "repair_steps": [
+            {"command": "kubectl rollout restart deployment/awoooi-worker -n awoooi-prod", "purpose": "重啟 Worker"},
+            {"command": "kubectl rollout status deployment/awoooi-worker -n awoooi-prod --timeout=120s", "purpose": "等待就緒"},
+        ],
+        "estimated_minutes": 3,
+        "symptom_alertnames": ["KubePodCrashLooping"],
+        "severity_range": ["P1"],
+        "source_approval_id": "6d2449ec-fd98-4c3a-a559-62247838efca",  # EXECUTION_SUCCESS
+    },
+    {
+        "name": "節點 CPU 使用率過高 — 告警確認",
+        "alert_type": "HostHighCpuLoad",
+        "description": "主機 CPU 長時間高使用率,確認原因並通知人工處理",
+        "category": "infrastructure",
+        "repair_steps": [
+            {"command": "ssh {target} 'top -b -n 1 | head -20'", "purpose": "查看 CPU 消耗 top 進程"},
+            {"command": "ssh {target} 'ps aux --sort=-%cpu | head -10'", "purpose": "確認高 CPU 進程"},
+        ],
+        "estimated_minutes": 10,
+        "symptom_alertnames": ["HostHighCpuLoad", "NodeHighCpuUsage"],
+        "severity_range": ["P2", "P3"],
+    },
+    {
+        "name": "節點記憶體不足 — 清理 Cache",
+        "alert_type": "HostOutOfMemory",
+        "description": "主機記憶體不足,清理 Page Cache 釋放空間",
+        "category": "infrastructure",
+        "repair_steps": [
+            {"command": "ssh {target} 'free -h'", "purpose": "確認記憶體狀況"},
+            {"command": "ssh {target} 'sudo sync && sudo sysctl vm.drop_caches=3'", "purpose": "清理 Page Cache"},
+            {"command": "ssh {target} 'free -h'", "purpose": "確認記憶體已釋放"},
+        ],
+        "estimated_minutes": 5,
+        "symptom_alertnames": ["HostOutOfMemory", "NodeMemoryPressure"],
+        "severity_range": ["P1", "P2"],
+    },
+    {
+        "name": "磁碟使用率過高 — 清理舊日誌",
+        "alert_type": "HostOutOfDiskSpace",
+        "description": "磁碟空間不足,清理舊日誌和臨時文件",
+        "category": "infrastructure",
+        "repair_steps": [
+            {"command": "ssh {target} 'df -h'", "purpose": "確認磁碟使用狀況"},
+            {"command": "ssh {target} 'sudo journalctl --vacuum-time=7d'", "purpose": "清理 7 天前的系統日誌"},
+            # NOTE(review): --volumes also prunes unused volumes; confirm no data lives there before automating.
+            {"command": "ssh {target} 'docker system prune -f --volumes 2>/dev/null || true'", "purpose": "清理 Docker 未用資源"},
+            {"command": "ssh {target} 'df -h'", "purpose": "確認空間已釋放"},
+        ],
+        "estimated_minutes": 10,
+        "symptom_alertnames": ["HostOutOfDiskSpace", "NodeDiskPressure"],
+        "severity_range": ["P1", "P2"],
+    },
+    {
+        "name": "Docker 容器異常 — 重啟",
+        "alert_type": "DockerContainerUnhealthy",
+        "description": "Docker 容器健康檢查失敗,重啟恢復服務",
+        "category": "infrastructure",
+        "repair_steps": [
+            {"command": "ssh {host} 'docker ps -a | grep {target}'", "purpose": "確認容器狀態"},
+            {"command": "ssh {host} 'docker logs {target} --tail=50'", "purpose": "查看容器日誌"},
+            {"command": "ssh {host} 'docker restart {target}'", "purpose": "重啟容器"},
+        ],
+        "estimated_minutes": 5,
+        "symptom_alertnames": ["DockerContainerUnhealthy", "DockerContainerOOMKilled"],
+        "severity_range": ["P1", "P2"],
+    },
+    {
+        "name": "Docker 容器停止 — 啟動",
+        "alert_type": "DockerContainerNotRunning",
+        "description": "Docker 容器意外停止,重新啟動服務",
+        "category": "infrastructure",
+        "repair_steps": [
+            {"command": "ssh {host} 'docker ps -a | grep {target}'", "purpose": "確認容器狀態"},
+            {"command": "ssh {host} 'docker start {target}'", "purpose": "啟動容器"},
+            {"command": "ssh {host} 'docker ps | grep {target}'", "purpose": "確認容器已運行"},
+        ],
+        "estimated_minutes": 3,
+        "symptom_alertnames": ["DockerContainerNotRunning", "DockerContainerStopped"],
+        "severity_range": ["P1", "P2"],
+    },
+    {
+        "name": "PostgreSQL 服務恢復",
+        "alert_type": "PostgreSQLDown",
+        "description": "PostgreSQL 服務下線,嘗試重啟恢復",
+        "category": "database",
+        "repair_steps": [
+            {"command": "ssh {host} 'docker ps | grep postgres'", "purpose": "確認 Postgres 容器狀態"},
+            {"command": "ssh {host} 'docker restart postgres'", "purpose": "重啟 Postgres"},
+            {"command": "ssh {host} 'docker exec postgres pg_isready'", "purpose": "確認 Postgres 已就緒"},
+        ],
+        "estimated_minutes": 10,
+        "symptom_alertnames": ["PostgreSQLDown", "PostgresqlDown"],
+        "severity_range": ["P0", "P1"],
+    },
+    {
+        "name": "Redis 服務恢復",
+        "alert_type": "RedisDown",
+        "description": "Redis 服務下線,嘗試重啟恢復",
+        "category": "database",
+        "repair_steps": [
+            {"command": "ssh {host} 'docker ps | grep redis'", "purpose": "確認 Redis 容器狀態"},
+            {"command": "ssh {host} 'docker restart redis'", "purpose": "重啟 Redis"},
+            {"command": "ssh {host} 'docker exec redis redis-cli ping'", "purpose": "確認 Redis 已就緒"},
+        ],
+        "estimated_minutes": 5,
+        "symptom_alertnames": ["RedisDown", "RedisMissedSlaves"],
+        "severity_range": ["P0", "P1"],
+    },
+    {
+        "name": "K3s 節點 NotReady — 重啟 K3s",
+        "alert_type": "KubeNodeNotReady",
+        "description": "K3s 節點 NotReady,重啟 k3s service",
+        "category": "kubernetes",
+        "repair_steps": [
+            {"command": "ssh {target} 'sudo systemctl status k3s'", "purpose": "確認 k3s 服務狀態"},
+            {"command": "ssh {target} 'sudo systemctl restart k3s'", "purpose": "重啟 k3s"},
+            {"command": "kubectl get nodes", "purpose": "確認節點狀態恢復"},
+        ],
+        "estimated_minutes": 15,
+        "symptom_alertnames": ["KubeNodeNotReady", "KubeNodeUnreachable"],
+        "severity_range": ["P0", "P1"],
+    },
+    {
+        "name": "SSL 憑證即將到期 — 通知更新",
+        "alert_type": "SSLCertExpiringSoon",
+        "description": "SSL 憑證即將過期,通知人工更新",
+        "category": "security",
+        "repair_steps": [
+            {"command": "echo 'SSL 憑證即將過期,需人工更新'", "purpose": "記錄問題"},
+        ],
+        "estimated_minutes": 1,
+        "symptom_alertnames": ["SSLCertExpiringSoon"],
+        "severity_range": ["P2", "P3"],
+    },
+    {
+        "name": "ArgoCD 同步失敗 — 重試",
+        "alert_type": "ArgoCDAppSyncFailed",
+        "description": "ArgoCD 應用同步失敗,觸發重新同步",
+        "category": "kubernetes",
+        "repair_steps": [
+            {"command": "argocd app sync {target} --force", "purpose": "強制重新同步"},
+            {"command": "argocd app wait {target} --timeout 120", "purpose": "等待同步完成"},
+        ],
+        "estimated_minutes": 10,
+        "symptom_alertnames": ["ArgoCDAppSyncFailed", "ArgoCDAppOutOfSync"],
+        "severity_range": ["P1", "P2"],
+    },
+    {
+        "name": "備份失敗 — 手動確認",
+        "alert_type": "HostBackupFailed",
+        "description": "定期備份失敗,需人工確認備份狀態",
+        "category": "operations",
+        "repair_steps": [
+            {"command": "ssh {target} 'ls -lh /backup/ | tail -10'", "purpose": "確認最近備份文件"},
+        ],
+        "estimated_minutes": 5,
+        "symptom_alertnames": ["HostBackupFailed", "BackupFailed", "VeleroBackupFailed"],
+        "severity_range": ["P2"],
+    },
+]
+
+
+async def main():
+    """Seed the playbooks table from PLAYBOOK_TEMPLATES.
+
+    With --dry-run the templates are only listed. Otherwise each template is
+    inserted unless a 'cold_start' playbook with the same name already exists,
+    so running the script again does not create duplicates.
+    """
+    conn = await asyncpg.connect(DATABASE_URL)
+    try:
+        # Current number of playbooks
+        current = await conn.fetchval("SELECT count(*) FROM playbooks")
+        print(f"當前 playbooks: {current}")
+
+        if DRY_RUN:
+            print(f"\n[DRY RUN] 將新增 {len(PLAYBOOK_TEMPLATES)} 個 Playbook:")
+            for i, t in enumerate(PLAYBOOK_TEMPLATES, 1):
+                print(f" {i:2d}. [{t['category']}] {t['name']}")
+            print("\n使用 --dry-run 以外的方式執行以實際寫入")
+            return
+
+        ts = now_taipei()
+        inserted = 0
+        existing = 0
+        failed = 0
+
+        for tmpl in PLAYBOOK_TEMPLATES:
+            playbook_id = f"PB-COLD-{str(uuid.uuid4())[:8].upper()}"
+            repair_steps = json.dumps(tmpl["repair_steps"])
+            symptom_pattern = json.dumps({
+                "alertnames": tmpl["symptom_alertnames"],
+                "severity_range": tmpl["severity_range"],
+            })
+
+            try:
+                # playbook_id is random, so ON CONFLICT can never match a row from
+                # an earlier run; look the template up by name instead.
+                already = await conn.fetchval(
+                    "SELECT 1 FROM playbooks WHERE name = $1 AND source = 'cold_start'",
+                    tmpl["name"],
+                )
+                if already:
+                    existing += 1
+                    print(f" ⏭ 已存在,略過: {tmpl['name']}")
+                    continue
+
+                status = await conn.execute(
+                    """
+                    INSERT INTO playbooks (
+                        playbook_id, name, description, status, source,
+                        repair_steps, symptom_pattern,
+                        estimated_duration_minutes, ai_confidence,
+                        success_count, failure_count,
+                        created_at, updated_at
+                    ) VALUES (
+                        $1, $2, $3, 'active', 'cold_start',
+                        $4::jsonb, $5::jsonb,
+                        $6, 0.6,
+                        0, 0,
+                        $7, $7
+                    )
+                    ON CONFLICT DO NOTHING
+                    """,
+                    playbook_id,
+                    tmpl["name"],
+                    tmpl["description"],
+                    repair_steps,
+                    symptom_pattern,
+                    tmpl["estimated_minutes"],
+                    ts,
+                )
+            except Exception as e:
+                # Best effort: report this template and keep seeding the rest.
+                failed += 1
+                print(f" ❌ 失敗: {tmpl['name']} — {e}")
+                continue
+
+            # asyncpg returns the command tag ("INSERT 0 <rows>"); zero rows means
+            # ON CONFLICT suppressed the insert, so it must not count as inserted.
+            if status.split()[-1] == "1":
+                inserted += 1
+                print(f" ✅ 寫入: {tmpl['name']}")
+            else:
+                existing += 1
+                print(f" ⏭ 已存在,略過: {tmpl['name']}")
+
+        final = await conn.fetchval("SELECT count(*) FROM playbooks")
+        print(f"\n✅ 完成: 新增 {inserted} 個 Playbook(失敗 {failed} 個)")
+        print(f" 已存在略過: {existing} 個")
+        print(f" Playbooks 總數: {final}")
+    finally:
+        # Always release the connection, including on errors and --dry-run.
+        await conn.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())