diff --git a/apps/api/src/agents/solver_agent.py b/apps/api/src/agents/solver_agent.py index 06dcbc6c..2a800cbc 100644 --- a/apps/api/src/agents/solver_agent.py +++ b/apps/api/src/agents/solver_agent.py @@ -667,66 +667,75 @@ class SolverAgent(BaseAgent): 告警類別:{_safe_category} 診斷信心:{_confidence_pct} {_inventory_section}{_non_k8s_warning}{_mcp_section} -你的工作:為此根因提出 1-3 個修復候選方案,同時輸出 0-3 個結構化 recommended_actions。 +你的工作:依照根因假設,提出 1-3 個針對性修復方案,同時輸出 0-3 個結構化 recommended_actions。 + +⚠️ 核心規則:修復方案必須對應根因,禁止無腦重啟 +- HostDisk 類(磁碟滿)→ 先查大檔(du -sh)、清 log(journalctl --vacuum)、查 df -h,最後才考慮擴容 +- HostCPU / CPU 競爭 → 先查兇手進程(top -bn1 / ps aux)、找具體進程名,再決定是否重啟 +- OOM / 記憶體 → 先查 Pod 記憶體(kubectl top pods)、查 OOM log,再重啟 +- NetworkLatency → 先查連線狀態(ss -tp / ping / traceroute) +- DatabaseConnection → 先查連線池(pg_stat_activity)、DB log,再重啟 +- K8s Pod Crash → 先查 Pod log(kubectl logs),再重啟 +- 只有在診斷明確指向 crash/OOM/deadlock 時才用重啟;資源類問題(磁碟/CPU)優先用診斷+清理命令 candidates 格式規則: -- action 欄位必須是真實的 kubectl 命令(不可用自然語言描述) -- 目標資源格式:deployment/,命名空間統一用 awoooi-prod +- action 欄位:K8s 問題用 kubectl 命令;主機層問題(HostDisk/HostCPU 等)用 ssh 診斷命令 +- 主機 IP:192.168.0.188(AI+Web)/ 192.168.0.110(主服務)/ 192.168.0.111(Ollama) - 每個方案必須評估 blast_radius(0-100 影響範圍)和 rollback_cost(0-100 回滾難度) blast_radius 參考: +- 診斷命令(df/du/top/ps/kubectl top)= 0 +- SSH 清理 log(journalctl --vacuum / find -delete)= 5 - kubectl rollout restart deployment = 10 - kubectl scale deployment --replicas=N = 15 - kubectl rollout undo deployment = 25 -- kubectl apply -f = 40 - kubectl delete deployment = 75 -- kubectl delete pvc = 95 recommended_actions 規則(北極星 §1.1 修復多樣性): -- 不要全部是 restart 類動作,至少 1 個是查看 log 或診斷類(低風險) +- 第一個動作必須是診斷/查看類(低風險),讓 SRE 先確認情況 +- 不要全部是 restart 類動作 - mcp_provider 必須是以下之一:k8s | ssh | prometheus | signoz | database | internal - risk 必須是以下之一:low | medium | high | critical -- critical risk 的動作必須在 reasoning 說明原因 - params 中可使用模板:{{labels.namespace}} / {{labels.pod}} / {{incident_id}} -以 JSON 回覆: +以 JSON 回覆(範例為 HostDisk 場景,根據根因假設替換): {{ "candidates": [ {{ - "action": "kubectl rollout restart deployment/awoooi-api -n awoooi-prod", - "blast_radius": 10, - "rollback_cost": 5, - "confidence": 0.8, - "rationale": "重啟可清除 OOM 導致的記憶體碎片化" - }}, - {{ - "action": "kubectl top pods -n awoooi-prod --sort-by=memory", + "action": "ssh user@192.168.0.110 'df -h && du -sh /var/log/* 2>/dev/null | sort -rh | head -20'", "blast_radius": 0, "rollback_cost": 0, - "confidence": 0.9, - "rationale": "先確認哪個 Pod 記憶體使用最高再決定操作" + "confidence": 0.95, + "rationale": "先確認磁碟使用情況和最大目錄,找出根因後再決定清理方式" + }}, + {{ + "action": "ssh user@192.168.0.110 'journalctl --vacuum-time=7d && find /tmp -mtime +7 -delete'", + "blast_radius": 5, + "rollback_cost": 5, + "confidence": 0.8, + "rationale": "清理 7 天前 journal log 和 /tmp 舊檔,釋放磁碟空間" }} ], "recommended_actions": [ {{ - "name": "check_pod_logs", - "label": "查 Pod Log", - "emoji": "📋", - "mcp_provider": "k8s", - "mcp_tool": "k8s_get_pod_logs", - "params": {{"namespace": "awoooi-prod", "pod": "{{labels.pod}}", "tail_lines": "50"}}, + "name": "check_disk_usage", + "label": "查磁碟用量", + "emoji": "💾", + "mcp_provider": "ssh", + "mcp_tool": "ssh_exec", + "params": {{"host": "192.168.0.110", "command": "df -h && du -sh /var/log/* 2>/dev/null | sort -rh | head -10"}}, "risk": "low", - "reasoning": "先查 log 確認 OOM 根因,避免盲目重啟" + "reasoning": "診斷磁碟使用分布,找出佔用大戶" }}, {{ - "name": "k8s_restart", - "label": "重啟", - "emoji": "🔄", - "mcp_provider": "k8s", - "mcp_tool": "kubectl_restart", - "params": {{"namespace": "awoooi-prod", "deployment": "{{labels.deployment}}"}}, - "risk": "medium", - "reasoning": "確認 OOM 後重啟清除記憶體碎片" + "name": "clean_old_logs", + "label": "清舊 Log", + "emoji": "🗑️", + "mcp_provider": "ssh", + "mcp_tool": "ssh_exec", + "params": {{"host": "192.168.0.110", "command": "journalctl --vacuum-time=7d"}}, + "risk": "low", + "reasoning": "清理 7 天前 journal log,安全可回滾" }} ] }}""" diff --git a/k8s/awoooi-prod/06-deployment-api.yaml b/k8s/awoooi-prod/06-deployment-api.yaml index c80aee10..9ea43816 100644 --- a/k8s/awoooi-prod/06-deployment-api.yaml +++ b/k8s/awoooi-prod/06-deployment-api.yaml @@ -75,6 +75,8 @@ spec: value: "120" - name: AGENT_DIAGNOSTICIAN_TIMEOUT_SEC value: "100" + - name: AGENT_SOLVER_TIMEOUT_SEC + value: "80" # 2026-04-05 Claude Code: Sprint 3 — 掛載 SSH key 供 HostRepairAgent 使用 volumeMounts: - name: repair-ssh-key