fix(aiops-p2): P2.1 LLM品質三修 — Evidence-First + consensus confidence + raw_evidence注入

根因:
- consensus_engine 四 ExpertAgent confidence=0.0 → 加權投票 total=0 → 永遠返回 NO_ACTION
- prompts.py 無 Evidence-First 指令 → LLM 靠記憶推理，無真實環境約束
- openclaw.py analyze_alert 建 prompt 未注入 MCP evidence (diagnosis_context)

修復:
- consensus_engine: SRE/Security/Cost/Performance 依訊號強度設 0.45~0.80 confidence
- consensus_engine: _normalize_action 加「重新啟動」別名 → RESTART
- consensus_engine: SecurityAgent 移除未使用的 _target 變數
- prompts.py: 加 Evidence-First Protocol + Skepticism Rules 區塊
- openclaw.py: analyze_alert 提取 diagnosis_context → <raw_evidence> 注入 full_prompt

驗證: consensus score 從 0.0 → 0.744（CrashLoop 測試案例）

P2.1 fix 2026-04-24 ogt + Claude Sonnet 4.6

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-24 15:52:25 +08:00
parent 359a6ee495
commit bb5f16f8ef
3 changed files with 39 additions and 15 deletions
--- a/apps/api/src/core/prompts.py
+++ b/apps/api/src/core/prompts.py
@@ -120,6 +120,24 @@ The `alertname` field is your PRIMARY signal. Use it to determine the problem ty
 **NEVER** use `kubectl rollout restart deployment/awoooi-prod` for database, storage, or network alerts.
 Make `action_title` describe the ACTUAL problem from alertname (not generic "自動修復 AWOOOI 服務").

+## 🧪 Evidence-First Protocol (CRITICAL — overrides intuition)
+
+If the prompt contains a `<raw_evidence>` block, you MUST:
+1. **Read it first** before forming any hypothesis.
+2. **Quote specific lines** from the evidence in your `reasoning` to show you used it.
+3. **Never contradict** the evidence — if kubectl shows 2 pods running, do NOT say pods are down.
+4. **Adjust confidence** based on evidence quality:
+   - Evidence clearly confirms root cause → 0.80–0.95
+   - Evidence partially supports → 0.60–0.79
+   - No evidence or contradictory → 0.30–0.59 (set `primary_responsibility = "COLLAB"`)
+
+## 🔍 Skepticism Rules
+
+- **Forbidden**: Recommending `kubectl rollout restart` when evidence shows the pod is healthy.
+- **Forbidden**: Claiming OOM without memory metrics proving it.
+- **Forbidden**: Setting `confidence > 0.75` when `<raw_evidence>` is absent or shows "error".
+- If you have no concrete evidence, set `suggested_action = "INVESTIGATE"` and provide a diagnostic `kubectl_command` (get/describe/logs/top only).
+
 ## 🔥 Short Example: High CPU -> SCALE_DEPLOYMENT, HPA, risk_level=medium
 Please carefully justify your confidence between 0.0 and 1.0 (e.g. 0.82) based on symptoms and metrics.

--- a/apps/api/src/services/consensus_engine.py
+++ b/apps/api/src/services/consensus_engine.py
@@ -192,26 +192,26 @@ class SREAgent(ExpertAgent):
        alert_names = " ".join([s.alert_name.lower() for s in incident.signals])
        target = incident.affected_services[0] if incident.affected_services else "unknown"

-        # SRE 規則引擎
+        # SRE 規則引擎 — confidence 依關鍵字明確度定
        if any(kw in alert_names for kw in ["crash", "restart", "oom", "killed"]):
            action = "重新啟動服務以恢復穩定性"
            kubectl = f"kubectl rollout restart deployment/{target} -n awoooi-prod"
-            confidence = 0.0  # 🔴 規則匹配，非 AI 分析
+            confidence = 0.72  # 明確崩潰訊號，規則高可信
            risk = "medium"
        elif any(kw in alert_names for kw in ["latency", "slow", "timeout"]):
            action = "擴展副本數以分散負載"
            kubectl = f"kubectl scale deployment/{target} --replicas=3 -n awoooi-prod"
-            confidence = 0.0  # 🔴 規則匹配，非 AI 分析
+            confidence = 0.65  # 效能問題，可能多因，中等可信
            risk = "low"
        elif any(kw in alert_names for kw in ["cpu", "memory", "resource"]):
            action = "調整資源限制或擴展副本"
            kubectl = f"kubectl scale deployment/{target} --replicas=2 -n awoooi-prod"
-            confidence = 0.0  # 🔴 規則匹配，非 AI 分析
+            confidence = 0.68  # 資源告警，指標明確
            risk = "medium"
        else:
            action = "進行安全重啟以排除未知問題"
            kubectl = f"kubectl rollout restart deployment/{target} -n awoooi-prod"
-            confidence = 0.0  # 🔴 規則匹配，非 AI 分析
+            confidence = 0.45  # 無明確訊號，低可信保守處理
            risk = "medium"

        return AgentOpinion(
@@ -236,7 +236,6 @@ class SecurityAgent(ExpertAgent):

    async def analyze(self, incident: Incident) -> AgentOpinion:
        """資安視角分析"""
-        _target = incident.affected_services[0] if incident.affected_services else "unknown"
        alert_names = " ".join([s.alert_name.lower() for s in incident.signals])

        # 資安掃描
@@ -250,11 +249,11 @@ class SecurityAgent(ExpertAgent):

        if security_concerns:
            action = "建議先隔離受影響服務，啟用 NetworkPolicy 限制"
-            confidence = 0.0  # 🔴 規則匹配，非 AI 分析
+            confidence = 0.80  # 安全關鍵字強命中，資安規則高可信
            risk = "critical"
        else:
            action = "無明顯資安風險，建議 SRE 處理"
-            confidence = 0.0  # 🔴 規則匹配，非 AI 分析
+            confidence = 0.60  # 排除確認，中等可信
            risk = "low"

        return AgentOpinion(
@@ -289,7 +288,7 @@ class CostAgent(ExpertAgent):
            agent_type=self.agent_type,
            action=action,
            reasoning="FinOps 分析: 使用 HPA 可在負載降低後自動縮減，相比固定擴容可節省約 40% 成本",
-            confidence=0.0,  # 🔴 規則匹配，非 AI 分析
+            confidence=0.55,  # 通用建議，非症狀驅動，保守可信
            risk_assessment="成本風險: low，使用 HPA 可自動調節",
            kubectl_command=kubectl,
            priority=4,
@@ -313,11 +312,11 @@ class PerformanceAgent(ExpertAgent):
        if any(kw in alert_names for kw in ["latency", "p99", "slow"]):
            action = "建議增加資源限制並啟用 PodDisruptionBudget"
            kubectl = f"kubectl patch deployment/{target} -n awoooi-prod -p '{{\"spec\":{{\"template\":{{\"spec\":{{\"containers\":[{{\"name\":\"{target}\",\"resources\":{{\"limits\":{{\"cpu\":\"2\",\"memory\":\"2Gi\"}}}}}}]}}}}}}}}'"
-            confidence = 0.0  # 🔴 規則匹配，非 AI 分析
+            confidence = 0.70  # 效能關鍵字明確命中
        else:
            action = "當前效能指標正常，建議觀察"
            kubectl = None
-            confidence = 0.0  # 🔴 規則匹配，非 AI 分析
+            confidence = 0.50  # 無效能異常，不確定，低權重

        return AgentOpinion(
            agent_type=self.agent_type,
@@ -483,7 +482,7 @@ class ConsensusEngine:
        """將 action 正規化到類別"""
        action_lower = action.lower()

-        if any(kw in action_lower for kw in ["重啟", "restart"]):
+        if any(kw in action_lower for kw in ["重啟", "重新啟動", "restart"]):
            return "RESTART"
        elif any(kw in action_lower for kw in ["擴展", "scale", "副本"]):
            return "SCALE"
--- a/apps/api/src/services/openclaw.py
+++ b/apps/api/src/services/openclaw.py
@@ -1360,19 +1360,26 @@ Trace URL: {signoz_trace_url}
            else "\n\n## ⚠️ 無法取得叢集清單，target_resource 請依 alertname 推斷，勿編造。\n"
        )

+        # P2.1 fix 2026-04-24 ogt + Claude Sonnet 4.6: 提取 MCP evidence_summary 注入 prompt
+        # diagnosis_context 由 decision_manager 在呼叫前填入（pre_decision_investigator 產出）
+        _raw_evidence = alert_context.get("diagnosis_context", "") or ""
+        if _raw_evidence and not _raw_evidence.startswith("<raw_evidence>"):
+            _raw_evidence = f"<raw_evidence>\n{_raw_evidence}\n</raw_evidence>"
+        evidence_section = f"\n\n## 🔬 MCP 實時環境證據\n{_raw_evidence}\n" if _raw_evidence else ""
+
        # 格式化告警為 Prompt (2026-03-31 ogt: 強力截斷以符合 NVIDIA 4K 限制)
        # 優先保留 System Prompt，截斷 Alert Data
-        available_len = 3500 - len(OPENCLAW_SYSTEM_PROMPT) - len(signoz_context) - len(k8s_section)
+        available_len = 3500 - len(OPENCLAW_SYSTEM_PROMPT) - len(signoz_context) - len(k8s_section) - len(evidence_section)
        if available_len < 500:
             # 如果 SignOz 太長，也截斷它
             signoz_context = signoz_context[:500] + "... (truncated)"
-             available_len = 3500 - len(OPENCLAW_SYSTEM_PROMPT) - len(signoz_context) - len(k8s_section)
+             available_len = 3500 - len(OPENCLAW_SYSTEM_PROMPT) - len(signoz_context) - len(k8s_section) - len(evidence_section)

        alert_json = json.dumps(alert_context, ensure_ascii=False, indent=2)
        if len(alert_json) > available_len:
            alert_json = alert_json[:available_len] + "... (truncated)"

-        full_prompt = OPENCLAW_SYSTEM_PROMPT + signoz_context + k8s_section + "\n\n## Alert Data:\n" + alert_json
+        full_prompt = OPENCLAW_SYSTEM_PROMPT + signoz_context + k8s_section + evidence_section + "\n\n## Alert Data:\n" + alert_json

        logger.info(
            "openclaw_alert_analysis_start",