diff --git a/apps/api/src/core/prompts.py b/apps/api/src/core/prompts.py index 66f94c3b..6dec0473 100644 --- a/apps/api/src/core/prompts.py +++ b/apps/api/src/core/prompts.py @@ -120,6 +120,24 @@ The `alertname` field is your PRIMARY signal. Use it to determine the problem ty **NEVER** use `kubectl rollout restart deployment/awoooi-prod` for database, storage, or network alerts. Make `action_title` describe the ACTUAL problem from alertname (not generic "自動修復 AWOOOI 服務"). +## 🧪 Evidence-First Protocol (CRITICAL — overrides intuition) + +If the prompt contains a `` block, you MUST: +1. **Read it first** before forming any hypothesis. +2. **Quote specific lines** from the evidence in your `reasoning` to show you used it. +3. **Never contradict** the evidence — if kubectl shows 2 pods running, do NOT say pods are down. +4. **Adjust confidence** based on evidence quality: + - Evidence clearly confirms root cause → 0.80–0.95 + - Evidence partially supports → 0.60–0.79 + - No evidence or contradictory → 0.30–0.59 (set `primary_responsibility = "COLLAB"`) + +## 🔍 Skepticism Rules + +- **Forbidden**: Recommending `kubectl rollout restart` when evidence shows the pod is healthy. +- **Forbidden**: Claiming OOM without memory metrics proving it. +- **Forbidden**: Setting `confidence > 0.75` when `` is absent or shows "error". +- If you have no concrete evidence, set `suggested_action = "INVESTIGATE"` and provide a diagnostic `kubectl_command` (get/describe/logs/top only). + ## 🔥 Short Example: High CPU -> SCALE_DEPLOYMENT, HPA, risk_level=medium Please carefully justify your confidence between 0.0 and 1.0 (e.g. 0.82) based on symptoms and metrics. diff --git a/apps/api/src/services/consensus_engine.py b/apps/api/src/services/consensus_engine.py index 164fee1e..4253078e 100644 --- a/apps/api/src/services/consensus_engine.py +++ b/apps/api/src/services/consensus_engine.py @@ -192,26 +192,26 @@ class SREAgent(ExpertAgent): alert_names = " ".join([s.alert_name.lower() for s in incident.signals]) target = incident.affected_services[0] if incident.affected_services else "unknown" - # SRE 規則引擎 + # SRE 規則引擎 — confidence 依關鍵字明確度定 if any(kw in alert_names for kw in ["crash", "restart", "oom", "killed"]): action = "重新啟動服務以恢復穩定性" kubectl = f"kubectl rollout restart deployment/{target} -n awoooi-prod" - confidence = 0.0 # 🔴 規則匹配,非 AI 分析 + confidence = 0.72 # 明確崩潰訊號,規則高可信 risk = "medium" elif any(kw in alert_names for kw in ["latency", "slow", "timeout"]): action = "擴展副本數以分散負載" kubectl = f"kubectl scale deployment/{target} --replicas=3 -n awoooi-prod" - confidence = 0.0 # 🔴 規則匹配,非 AI 分析 + confidence = 0.65 # 效能問題,可能多因,中等可信 risk = "low" elif any(kw in alert_names for kw in ["cpu", "memory", "resource"]): action = "調整資源限制或擴展副本" kubectl = f"kubectl scale deployment/{target} --replicas=2 -n awoooi-prod" - confidence = 0.0 # 🔴 規則匹配,非 AI 分析 + confidence = 0.68 # 資源告警,指標明確 risk = "medium" else: action = "進行安全重啟以排除未知問題" kubectl = f"kubectl rollout restart deployment/{target} -n awoooi-prod" - confidence = 0.0 # 🔴 規則匹配,非 AI 分析 + confidence = 0.45 # 無明確訊號,低可信保守處理 risk = "medium" return AgentOpinion( @@ -236,7 +236,6 @@ class SecurityAgent(ExpertAgent): async def analyze(self, incident: Incident) -> AgentOpinion: """資安視角分析""" - _target = incident.affected_services[0] if incident.affected_services else "unknown" alert_names = " ".join([s.alert_name.lower() for s in incident.signals]) # 資安掃描 @@ -250,11 +249,11 @@ class SecurityAgent(ExpertAgent): if security_concerns: action = "建議先隔離受影響服務,啟用 NetworkPolicy 限制" - confidence = 0.0 # 🔴 規則匹配,非 AI 分析 + confidence = 0.80 # 安全關鍵字強命中,資安規則高可信 risk = "critical" else: action = "無明顯資安風險,建議 SRE 處理" - confidence = 0.0 # 🔴 規則匹配,非 AI 分析 + confidence = 0.60 # 排除確認,中等可信 risk = "low" return AgentOpinion( @@ -289,7 +288,7 @@ class CostAgent(ExpertAgent): agent_type=self.agent_type, action=action, reasoning="FinOps 分析: 使用 HPA 可在負載降低後自動縮減,相比固定擴容可節省約 40% 成本", - confidence=0.0, # 🔴 規則匹配,非 AI 分析 + confidence=0.55, # 通用建議,非症狀驅動,保守可信 risk_assessment="成本風險: low,使用 HPA 可自動調節", kubectl_command=kubectl, priority=4, @@ -313,11 +312,11 @@ class PerformanceAgent(ExpertAgent): if any(kw in alert_names for kw in ["latency", "p99", "slow"]): action = "建議增加資源限制並啟用 PodDisruptionBudget" kubectl = f"kubectl patch deployment/{target} -n awoooi-prod -p '{{\"spec\":{{\"template\":{{\"spec\":{{\"containers\":[{{\"name\":\"{target}\",\"resources\":{{\"limits\":{{\"cpu\":\"2\",\"memory\":\"2Gi\"}}}}}}]}}}}}}}}'" - confidence = 0.0 # 🔴 規則匹配,非 AI 分析 + confidence = 0.70 # 效能關鍵字明確命中 else: action = "當前效能指標正常,建議觀察" kubectl = None - confidence = 0.0 # 🔴 規則匹配,非 AI 分析 + confidence = 0.50 # 無效能異常,不確定,低權重 return AgentOpinion( agent_type=self.agent_type, @@ -483,7 +482,7 @@ class ConsensusEngine: """將 action 正規化到類別""" action_lower = action.lower() - if any(kw in action_lower for kw in ["重啟", "restart"]): + if any(kw in action_lower for kw in ["重啟", "重新啟動", "restart"]): return "RESTART" elif any(kw in action_lower for kw in ["擴展", "scale", "副本"]): return "SCALE" diff --git a/apps/api/src/services/openclaw.py b/apps/api/src/services/openclaw.py index 98898626..fe844fad 100644 --- a/apps/api/src/services/openclaw.py +++ b/apps/api/src/services/openclaw.py @@ -1360,19 +1360,26 @@ Trace URL: {signoz_trace_url} else "\n\n## ⚠️ 無法取得叢集清單,target_resource 請依 alertname 推斷,勿編造。\n" ) + # P2.1 fix 2026-04-24 ogt + Claude Sonnet 4.6: 提取 MCP evidence_summary 注入 prompt + # diagnosis_context 由 decision_manager 在呼叫前填入(pre_decision_investigator 產出) + _raw_evidence = alert_context.get("diagnosis_context", "") or "" + if _raw_evidence and not _raw_evidence.startswith(""): + _raw_evidence = f"\n{_raw_evidence}\n" + evidence_section = f"\n\n## 🔬 MCP 實時環境證據\n{_raw_evidence}\n" if _raw_evidence else "" + # 格式化告警為 Prompt (2026-03-31 ogt: 強力截斷以符合 NVIDIA 4K 限制) # 優先保留 System Prompt,截斷 Alert Data - available_len = 3500 - len(OPENCLAW_SYSTEM_PROMPT) - len(signoz_context) - len(k8s_section) + available_len = 3500 - len(OPENCLAW_SYSTEM_PROMPT) - len(signoz_context) - len(k8s_section) - len(evidence_section) if available_len < 500: # 如果 SignOz 太長,也截斷它 signoz_context = signoz_context[:500] + "... (truncated)" - available_len = 3500 - len(OPENCLAW_SYSTEM_PROMPT) - len(signoz_context) - len(k8s_section) + available_len = 3500 - len(OPENCLAW_SYSTEM_PROMPT) - len(signoz_context) - len(k8s_section) - len(evidence_section) alert_json = json.dumps(alert_context, ensure_ascii=False, indent=2) if len(alert_json) > available_len: alert_json = alert_json[:available_len] + "... (truncated)" - full_prompt = OPENCLAW_SYSTEM_PROMPT + signoz_context + k8s_section + "\n\n## Alert Data:\n" + alert_json + full_prompt = OPENCLAW_SYSTEM_PROMPT + signoz_context + k8s_section + evidence_section + "\n\n## Alert Data:\n" + alert_json logger.info( "openclaw_alert_analysis_start",