From ded93cbba3dc66be67b8edde300c5d3ac772199d Mon Sep 17 00:00:00 2001 From: OG T Date: Thu, 16 Apr 2026 16:26:07 +0800 Subject: [PATCH] =?UTF-8?q?fix(aiops):=20=E4=BF=AE=E5=BE=A9=20evidence=20?= =?UTF-8?q?=E7=A9=BA=E7=99=BD=20=E2=86=92=20AI=20ABSTAIN=20=E5=95=8F?= =?UTF-8?q?=E9=A1=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 問題: - signal.alert_name 在頂層,但 _get_alertname() 從 labels["alertname"] 讀 → 空字串 - 所有 sensor 失敗時 evidence_summary 只有 120 字元,AI 無法分析 → ABSTAIN - labels 為空時 AI 根本不知道是什麼告警 修復: 1. _get_alertname(): 優先讀 signal.alert_name,fallback labels["alertname"] 2. _get_labels(): 自動補 alertname 到 labels dict 3. EvidenceSnapshot.alert_info: 新增告警基礎欄位(sensors=0 時的最小情報) 4. build_summary(): alert_info 永遠放在最前,讓 AI 至少知道告警類型+嚴重度 Co-Authored-By: Claude Sonnet 4.6 --- apps/api/src/services/evidence_snapshot.py | 7 ++++ .../src/services/pre_decision_investigator.py | 35 +++++++++++++++++-- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/apps/api/src/services/evidence_snapshot.py b/apps/api/src/services/evidence_snapshot.py index 85bb784a..ec7e136d 100644 --- a/apps/api/src/services/evidence_snapshot.py +++ b/apps/api/src/services/evidence_snapshot.py @@ -77,6 +77,9 @@ class EvidenceSnapshot: schema_version: str = SCHEMA_VERSION collected_at: datetime = field(default_factory=now_taipei) + # 告警基礎資訊(sensors=0 時的最小情報,2026-04-16 ogt + Claude Sonnet 4.6) + alert_info: dict[str, Any] | None = None + # 8D 感官數據 k8s_state: dict[str, Any] | None = None # D1 recent_logs: str | None = None # D2 (sanitized) @@ -134,6 +137,10 @@ class EvidenceSnapshot: """ parts: list[str] = [] + # 告警基礎資訊永遠放在最前(sensors=0 時也要讓 AI 知道是什麼告警) + if self.alert_info: + parts.append(f"[告警資訊] {self.alert_info}") + if self.k8s_state: parts.append(f"[K8s狀態] {self.k8s_state}") if self.recent_logs: diff --git a/apps/api/src/services/pre_decision_investigator.py b/apps/api/src/services/pre_decision_investigator.py index 253dde59..9e728ce1 100644 --- a/apps/api/src/services/pre_decision_investigator.py +++ b/apps/api/src/services/pre_decision_investigator.py @@ -106,6 +106,22 @@ class PreDecisionInvestigator: snapshot = EvidenceSnapshot(incident_id=incident_id) snapshot.sensors_attempted = len(tools) + # 告警基礎資訊:sensors=0 時 AI 至少知道是什麼告警 + # 2026-04-16 ogt + Claude Sonnet 4.6: 修復空 evidence → ABSTAIN 問題 + sigs = getattr(incident, "signals", []) or [] + sig0 = sigs[0] if sigs else None + snapshot.alert_info = { + "alert_name": alertname or getattr(incident, "alertname", "") or "", + "severity": str(getattr(incident, "severity", "")), + "affected_services": getattr(incident, "affected_services", []) or [], + "labels": labels, + "annotations": ( + ({k: v for k, v in (sig0.annotations or {}).items()} if sig0 else {}) + ), + "source": getattr(sig0, "source", "") if sig0 else "", + "incident_id": incident_id, + } + # 3. 並行蒐集(整體 INVESTIGATOR_TIMEOUT_SEC 保護) try: await asyncio.wait_for( @@ -343,13 +359,26 @@ def _fill_snapshot_dimension( def _get_alertname(incident: "Incident") -> str: if incident.signals: - return incident.signals[0].labels.get("alertname", "") - return "" + sig = incident.signals[0] + # alert_name 在 Signal 頂層欄位,labels["alertname"] 是 Prometheus 慣例但可能為空 + return ( + getattr(sig, "alert_name", "") + or sig.labels.get("alertname", "") + or getattr(incident, "alertname", "") + or "" + ) + return getattr(incident, "alertname", "") or "" def _get_labels(incident: "Incident") -> dict[str, Any]: if incident.signals: - return incident.signals[0].labels + sig = incident.signals[0] + labels = sig.labels or {} + # 若 labels 缺少 alertname,補上頂層的 alert_name + if "alertname" not in labels and getattr(sig, "alert_name", ""): + labels = dict(labels) + labels["alertname"] = sig.alert_name + return labels return {}