fix(aiops-p2): P2.1 LLM品質三修 — Evidence-First + consensus confidence + raw_evidence注入
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled

根因:
- consensus_engine 四 ExpertAgent confidence=0.0 → 加權投票 total=0 → 永遠返回 NO_ACTION
- prompts.py 無 Evidence-First 指令 → LLM 靠記憶推理,無真實環境約束
- openclaw.py analyze_alert 建 prompt 未注入 MCP evidence (diagnosis_context)

修復:
- consensus_engine: SRE/Security/Cost/Performance 依訊號強度設 0.45~0.80 confidence
- consensus_engine: _normalize_action 加「重新啟動」別名 → RESTART
- consensus_engine: SecurityAgent 移除未使用的 _target 變數
- prompts.py: 加 Evidence-First Protocol + Skepticism Rules 區塊
- openclaw.py: analyze_alert 提取 diagnosis_context → <raw_evidence> 注入 full_prompt

驗證: consensus score 從 0.0 → 0.744(CrashLoop 測試案例)

P2.1 fix 2026-04-24 ogt + Claude Sonnet 4.6

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Your Name
2026-04-24 15:52:25 +08:00
parent 359a6ee495
commit bb5f16f8ef
3 changed files with 39 additions and 15 deletions

View File

@@ -120,6 +120,24 @@ The `alertname` field is your PRIMARY signal. Use it to determine the problem ty
**NEVER** use `kubectl rollout restart deployment/awoooi-prod` for database, storage, or network alerts.
Make `action_title` describe the ACTUAL problem from alertname (not generic "自動修復 AWOOOI 服務").
## 🧪 Evidence-First Protocol (CRITICAL — overrides intuition)
If the prompt contains a `<raw_evidence>` block, you MUST:
1. **Read it first** before forming any hypothesis.
2. **Quote specific lines** from the evidence in your `reasoning` to show you used it.
3. **Never contradict** the evidence — if kubectl shows 2 pods running, do NOT say pods are down.
4. **Adjust confidence** based on evidence quality:
- Evidence clearly confirms root cause → 0.80–0.95
- Evidence partially supports → 0.60–0.79
- No evidence or contradictory → 0.30–0.59 (set `primary_responsibility = "COLLAB"`)
## 🔍 Skepticism Rules
- **Forbidden**: Recommending `kubectl rollout restart` when evidence shows the pod is healthy.
- **Forbidden**: Claiming OOM without memory metrics proving it.
- **Forbidden**: Setting `confidence > 0.75` when `<raw_evidence>` is absent or shows "error".
- If you have no concrete evidence, set `suggested_action = "INVESTIGATE"` and provide a diagnostic `kubectl_command` (get/describe/logs/top only).
## 🔥 Short Example: High CPU -> SCALE_DEPLOYMENT, HPA, risk_level=medium
Please carefully justify your confidence between 0.0 and 1.0 (e.g. 0.82) based on symptoms and metrics.

View File

@@ -192,26 +192,26 @@ class SREAgent(ExpertAgent):
alert_names = " ".join([s.alert_name.lower() for s in incident.signals])
target = incident.affected_services[0] if incident.affected_services else "unknown"
# SRE 規則引擎
# SRE 規則引擎 — confidence 依關鍵字明確度定
if any(kw in alert_names for kw in ["crash", "restart", "oom", "killed"]):
action = "重新啟動服務以恢復穩定性"
kubectl = f"kubectl rollout restart deployment/{target} -n awoooi-prod"
confidence = 0.0 # 🔴 規則匹配,非 AI 分析
confidence = 0.72 # 明確崩潰訊號,規則高可信
risk = "medium"
elif any(kw in alert_names for kw in ["latency", "slow", "timeout"]):
action = "擴展副本數以分散負載"
kubectl = f"kubectl scale deployment/{target} --replicas=3 -n awoooi-prod"
confidence = 0.0 # 🔴 規則匹配,非 AI 分析
confidence = 0.65 # 效能問題,可能多因,中等可信
risk = "low"
elif any(kw in alert_names for kw in ["cpu", "memory", "resource"]):
action = "調整資源限制或擴展副本"
kubectl = f"kubectl scale deployment/{target} --replicas=2 -n awoooi-prod"
confidence = 0.0 # 🔴 規則匹配,非 AI 分析
confidence = 0.68 # 資源告警,指標明確
risk = "medium"
else:
action = "進行安全重啟以排除未知問題"
kubectl = f"kubectl rollout restart deployment/{target} -n awoooi-prod"
confidence = 0.0 # 🔴 規則匹配,非 AI 分析
confidence = 0.45 # 無明確訊號,低可信保守處理
risk = "medium"
return AgentOpinion(
@@ -236,7 +236,6 @@ class SecurityAgent(ExpertAgent):
async def analyze(self, incident: Incident) -> AgentOpinion:
"""資安視角分析"""
_target = incident.affected_services[0] if incident.affected_services else "unknown"
alert_names = " ".join([s.alert_name.lower() for s in incident.signals])
# 資安掃描
@@ -250,11 +249,11 @@ class SecurityAgent(ExpertAgent):
if security_concerns:
action = "建議先隔離受影響服務,啟用 NetworkPolicy 限制"
confidence = 0.0 # 🔴 規則匹配,非 AI 分析
confidence = 0.80 # 安全關鍵字強命中,資安規則高可信
risk = "critical"
else:
action = "無明顯資安風險,建議 SRE 處理"
confidence = 0.0 # 🔴 規則匹配,非 AI 分析
confidence = 0.60 # 排除確認,中等可信
risk = "low"
return AgentOpinion(
@@ -289,7 +288,7 @@ class CostAgent(ExpertAgent):
agent_type=self.agent_type,
action=action,
reasoning="FinOps 分析: 使用 HPA 可在負載降低後自動縮減,相比固定擴容可節省約 40% 成本",
confidence=0.0, # 🔴 規則匹配,非 AI 分析
confidence=0.55, # 通用建議,非症狀驅動,保守可信
risk_assessment="成本風險: low使用 HPA 可自動調節",
kubectl_command=kubectl,
priority=4,
@@ -313,11 +312,11 @@ class PerformanceAgent(ExpertAgent):
if any(kw in alert_names for kw in ["latency", "p99", "slow"]):
action = "建議增加資源限制並啟用 PodDisruptionBudget"
kubectl = f"kubectl patch deployment/{target} -n awoooi-prod -p '{{\"spec\":{{\"template\":{{\"spec\":{{\"containers\":[{{\"name\":\"{target}\",\"resources\":{{\"limits\":{{\"cpu\":\"2\",\"memory\":\"2Gi\"}}}}}}]}}}}}}}}'"
confidence = 0.0 # 🔴 規則匹配,非 AI 分析
confidence = 0.70 # 效能關鍵字明確命中
else:
action = "當前效能指標正常,建議觀察"
kubectl = None
confidence = 0.0 # 🔴 規則匹配,非 AI 分析
confidence = 0.50 # 無效能異常,不確定,低權重
return AgentOpinion(
agent_type=self.agent_type,
@@ -483,7 +482,7 @@ class ConsensusEngine:
"""將 action 正規化到類別"""
action_lower = action.lower()
if any(kw in action_lower for kw in ["重啟", "restart"]):
if any(kw in action_lower for kw in ["重啟", "重新啟動", "restart"]):
return "RESTART"
elif any(kw in action_lower for kw in ["擴展", "scale", "副本"]):
return "SCALE"

View File

@@ -1360,19 +1360,26 @@ Trace URL: {signoz_trace_url}
else "\n\n## ⚠️ 無法取得叢集清單target_resource 請依 alertname 推斷,勿編造。\n"
)
# P2.1 fix 2026-04-24 ogt + Claude Sonnet 4.6: 提取 MCP evidence_summary 注入 prompt
# diagnosis_context 由 decision_manager 在呼叫前填入(pre_decision_investigator 產出)
_raw_evidence = alert_context.get("diagnosis_context", "") or ""
if _raw_evidence and not _raw_evidence.startswith("<raw_evidence>"):
_raw_evidence = f"<raw_evidence>\n{_raw_evidence}\n</raw_evidence>"
evidence_section = f"\n\n## 🔬 MCP 實時環境證據\n{_raw_evidence}\n" if _raw_evidence else ""
# 格式化告警為 Prompt (2026-03-31 ogt: 強力截斷以符合 NVIDIA 4K 限制)
# 優先保留 System Prompt,截斷 Alert Data
available_len = 3500 - len(OPENCLAW_SYSTEM_PROMPT) - len(signoz_context) - len(k8s_section)
available_len = 3500 - len(OPENCLAW_SYSTEM_PROMPT) - len(signoz_context) - len(k8s_section) - len(evidence_section)
if available_len < 500:
# 如果 SignOz 太長,也截斷它
signoz_context = signoz_context[:500] + "... (truncated)"
available_len = 3500 - len(OPENCLAW_SYSTEM_PROMPT) - len(signoz_context) - len(k8s_section)
available_len = 3500 - len(OPENCLAW_SYSTEM_PROMPT) - len(signoz_context) - len(k8s_section) - len(evidence_section)
alert_json = json.dumps(alert_context, ensure_ascii=False, indent=2)
if len(alert_json) > available_len:
alert_json = alert_json[:available_len] + "... (truncated)"
full_prompt = OPENCLAW_SYSTEM_PROMPT + signoz_context + k8s_section + "\n\n## Alert Data:\n" + alert_json
full_prompt = OPENCLAW_SYSTEM_PROMPT + signoz_context + k8s_section + evidence_section + "\n\n## Alert Data:\n" + alert_json
logger.info(
"openclaw_alert_analysis_start",