fix(aiops-p2): P2.1 LLM品質三修 — Evidence-First + consensus confidence + raw_evidence注入
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
根因:
- consensus_engine 四 ExpertAgent confidence=0.0 → 加權投票 total=0 → 永遠返回 NO_ACTION
- prompts.py 無 Evidence-First 指令 → LLM 靠記憶推理,無真實環境約束
- openclaw.py analyze_alert 建 prompt 未注入 MCP evidence (diagnosis_context)

修復:
- consensus_engine: SRE/Security/Cost/Performance 依訊號強度設 0.45~0.80 confidence
- consensus_engine: _normalize_action 加「重新啟動」別名 → RESTART
- consensus_engine: SecurityAgent 移除未使用的 _target 變數
- prompts.py: 加 Evidence-First Protocol + Skepticism Rules 區塊
- openclaw.py: analyze_alert 提取 diagnosis_context → <raw_evidence> 注入 full_prompt

驗證: consensus score 從 0.0 → 0.744(CrashLoop 測試案例)

P2.1 fix 2026-04-24 ogt + Claude Sonnet 4.6
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -120,6 +120,24 @@ The `alertname` field is your PRIMARY signal. Use it to determine the problem ty
|
||||
**NEVER** use `kubectl rollout restart deployment/awoooi-prod` for database, storage, or network alerts.
|
||||
Make `action_title` describe the ACTUAL problem from alertname (not generic "自動修復 AWOOOI 服務").
|
||||
|
||||
## 🧪 Evidence-First Protocol (CRITICAL — overrides intuition)
|
||||
|
||||
If the prompt contains a `<raw_evidence>` block, you MUST:
|
||||
1. **Read it first** before forming any hypothesis.
|
||||
2. **Quote specific lines** from the evidence in your `reasoning` to show you used it.
|
||||
3. **Never contradict** the evidence — if kubectl shows 2 pods running, do NOT say pods are down.
|
||||
4. **Adjust confidence** based on evidence quality:
|
||||
- Evidence clearly confirms root cause → 0.80–0.95
|
||||
- Evidence partially supports → 0.60–0.79
|
||||
- No evidence or contradictory → 0.30–0.59 (set `primary_responsibility = "COLLAB"`)
|
||||
|
||||
## 🔍 Skepticism Rules
|
||||
|
||||
- **Forbidden**: Recommending `kubectl rollout restart` when evidence shows the pod is healthy.
|
||||
- **Forbidden**: Claiming OOM without memory metrics proving it.
|
||||
- **Forbidden**: Setting `confidence > 0.75` when `<raw_evidence>` is absent or shows "error".
|
||||
- If you have no concrete evidence, set `suggested_action = "INVESTIGATE"` and provide a diagnostic `kubectl_command` (get/describe/logs/top only).
|
||||
|
||||
## 🔥 Short Example: High CPU -> SCALE_DEPLOYMENT, HPA, risk_level=medium
|
||||
Please carefully justify your confidence between 0.0 and 1.0 (e.g. 0.82) based on symptoms and metrics.
|
||||
|
||||
|
||||
@@ -192,26 +192,26 @@ class SREAgent(ExpertAgent):
|
||||
alert_names = " ".join([s.alert_name.lower() for s in incident.signals])
|
||||
target = incident.affected_services[0] if incident.affected_services else "unknown"
|
||||
|
||||
# SRE 規則引擎
|
||||
# SRE 規則引擎 — confidence 依關鍵字明確度定
|
||||
if any(kw in alert_names for kw in ["crash", "restart", "oom", "killed"]):
|
||||
action = "重新啟動服務以恢復穩定性"
|
||||
kubectl = f"kubectl rollout restart deployment/{target} -n awoooi-prod"
|
||||
confidence = 0.0 # 🔴 規則匹配,非 AI 分析
|
||||
confidence = 0.72 # 明確崩潰訊號,規則高可信
|
||||
risk = "medium"
|
||||
elif any(kw in alert_names for kw in ["latency", "slow", "timeout"]):
|
||||
action = "擴展副本數以分散負載"
|
||||
kubectl = f"kubectl scale deployment/{target} --replicas=3 -n awoooi-prod"
|
||||
confidence = 0.0 # 🔴 規則匹配,非 AI 分析
|
||||
confidence = 0.65 # 效能問題,可能多因,中等可信
|
||||
risk = "low"
|
||||
elif any(kw in alert_names for kw in ["cpu", "memory", "resource"]):
|
||||
action = "調整資源限制或擴展副本"
|
||||
kubectl = f"kubectl scale deployment/{target} --replicas=2 -n awoooi-prod"
|
||||
confidence = 0.0 # 🔴 規則匹配,非 AI 分析
|
||||
confidence = 0.68 # 資源告警,指標明確
|
||||
risk = "medium"
|
||||
else:
|
||||
action = "進行安全重啟以排除未知問題"
|
||||
kubectl = f"kubectl rollout restart deployment/{target} -n awoooi-prod"
|
||||
confidence = 0.0 # 🔴 規則匹配,非 AI 分析
|
||||
confidence = 0.45 # 無明確訊號,低可信保守處理
|
||||
risk = "medium"
|
||||
|
||||
return AgentOpinion(
|
||||
@@ -236,7 +236,6 @@ class SecurityAgent(ExpertAgent):
|
||||
|
||||
async def analyze(self, incident: Incident) -> AgentOpinion:
|
||||
"""資安視角分析"""
|
||||
_target = incident.affected_services[0] if incident.affected_services else "unknown"
|
||||
alert_names = " ".join([s.alert_name.lower() for s in incident.signals])
|
||||
|
||||
# 資安掃描
|
||||
@@ -250,11 +249,11 @@ class SecurityAgent(ExpertAgent):
|
||||
|
||||
if security_concerns:
|
||||
action = "建議先隔離受影響服務,啟用 NetworkPolicy 限制"
|
||||
confidence = 0.0 # 🔴 規則匹配,非 AI 分析
|
||||
confidence = 0.80 # 安全關鍵字強命中,資安規則高可信
|
||||
risk = "critical"
|
||||
else:
|
||||
action = "無明顯資安風險,建議 SRE 處理"
|
||||
confidence = 0.0 # 🔴 規則匹配,非 AI 分析
|
||||
confidence = 0.60 # 排除確認,中等可信
|
||||
risk = "low"
|
||||
|
||||
return AgentOpinion(
|
||||
@@ -289,7 +288,7 @@ class CostAgent(ExpertAgent):
|
||||
agent_type=self.agent_type,
|
||||
action=action,
|
||||
reasoning="FinOps 分析: 使用 HPA 可在負載降低後自動縮減,相比固定擴容可節省約 40% 成本",
|
||||
confidence=0.0, # 🔴 規則匹配,非 AI 分析
|
||||
confidence=0.55, # 通用建議,非症狀驅動,保守可信
|
||||
risk_assessment="成本風險: low,使用 HPA 可自動調節",
|
||||
kubectl_command=kubectl,
|
||||
priority=4,
|
||||
@@ -313,11 +312,11 @@ class PerformanceAgent(ExpertAgent):
|
||||
if any(kw in alert_names for kw in ["latency", "p99", "slow"]):
|
||||
action = "建議增加資源限制並啟用 PodDisruptionBudget"
|
||||
kubectl = f"kubectl patch deployment/{target} -n awoooi-prod -p '{{\"spec\":{{\"template\":{{\"spec\":{{\"containers\":[{{\"name\":\"{target}\",\"resources\":{{\"limits\":{{\"cpu\":\"2\",\"memory\":\"2Gi\"}}}}}}]}}}}}}}}'"
|
||||
confidence = 0.0 # 🔴 規則匹配,非 AI 分析
|
||||
confidence = 0.70 # 效能關鍵字明確命中
|
||||
else:
|
||||
action = "當前效能指標正常,建議觀察"
|
||||
kubectl = None
|
||||
confidence = 0.0 # 🔴 規則匹配,非 AI 分析
|
||||
confidence = 0.50 # 無效能異常,不確定,低權重
|
||||
|
||||
return AgentOpinion(
|
||||
agent_type=self.agent_type,
|
||||
@@ -483,7 +482,7 @@ class ConsensusEngine:
|
||||
"""將 action 正規化到類別"""
|
||||
action_lower = action.lower()
|
||||
|
||||
if any(kw in action_lower for kw in ["重啟", "restart"]):
|
||||
if any(kw in action_lower for kw in ["重啟", "重新啟動", "restart"]):
|
||||
return "RESTART"
|
||||
elif any(kw in action_lower for kw in ["擴展", "scale", "副本"]):
|
||||
return "SCALE"
|
||||
|
||||
@@ -1360,19 +1360,26 @@ Trace URL: {signoz_trace_url}
|
||||
else "\n\n## ⚠️ 無法取得叢集清單,target_resource 請依 alertname 推斷,勿編造。\n"
|
||||
)
|
||||
|
||||
# P2.1 fix 2026-04-24 ogt + Claude Sonnet 4.6: 提取 MCP evidence_summary 注入 prompt
|
||||
# diagnosis_context 由 decision_manager 在呼叫前填入(pre_decision_investigator 產出)
|
||||
_raw_evidence = alert_context.get("diagnosis_context", "") or ""
|
||||
if _raw_evidence and not _raw_evidence.startswith("<raw_evidence>"):
|
||||
_raw_evidence = f"<raw_evidence>\n{_raw_evidence}\n</raw_evidence>"
|
||||
evidence_section = f"\n\n## 🔬 MCP 實時環境證據\n{_raw_evidence}\n" if _raw_evidence else ""
|
||||
|
||||
# 格式化告警為 Prompt (2026-03-31 ogt: 強力截斷以符合 NVIDIA 4K 限制)
|
||||
# 優先保留 System Prompt,截斷 Alert Data
|
||||
available_len = 3500 - len(OPENCLAW_SYSTEM_PROMPT) - len(signoz_context) - len(k8s_section)
|
||||
available_len = 3500 - len(OPENCLAW_SYSTEM_PROMPT) - len(signoz_context) - len(k8s_section) - len(evidence_section)
|
||||
if available_len < 500:
|
||||
# 如果 SignOz 太長,也截斷它
|
||||
signoz_context = signoz_context[:500] + "... (truncated)"
|
||||
available_len = 3500 - len(OPENCLAW_SYSTEM_PROMPT) - len(signoz_context) - len(k8s_section)
|
||||
available_len = 3500 - len(OPENCLAW_SYSTEM_PROMPT) - len(signoz_context) - len(k8s_section) - len(evidence_section)
|
||||
|
||||
alert_json = json.dumps(alert_context, ensure_ascii=False, indent=2)
|
||||
if len(alert_json) > available_len:
|
||||
alert_json = alert_json[:available_len] + "... (truncated)"
|
||||
|
||||
full_prompt = OPENCLAW_SYSTEM_PROMPT + signoz_context + k8s_section + "\n\n## Alert Data:\n" + alert_json
|
||||
full_prompt = OPENCLAW_SYSTEM_PROMPT + signoz_context + k8s_section + evidence_section + "\n\n## Alert Data:\n" + alert_json
|
||||
|
||||
logger.info(
|
||||
"openclaw_alert_analysis_start",
|
||||
|
||||
Reference in New Issue
Block a user