fix(aiops): 修復 evidence 空白 → AI ABSTAIN 問題
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 32m33s
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 32m33s
問題:
- signal.alert_name 在頂層,但 _get_alertname() 從 labels["alertname"] 讀 → 空字串
- 所有 sensor 失敗時 evidence_summary 只有 120 字元,AI 無法分析 → ABSTAIN
- labels 為空時 AI 根本不知道是什麼告警

修復:
1. _get_alertname(): 優先讀 signal.alert_name,fallback labels["alertname"]
2. _get_labels(): 自動補 alertname 到 labels dict
3. EvidenceSnapshot.alert_info: 新增告警基礎欄位(sensors=0 時的最小情報)
4. build_summary(): alert_info 永遠放在最前,讓 AI 至少知道告警類型+嚴重度

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -77,6 +77,9 @@ class EvidenceSnapshot:
|
|||||||
schema_version: str = SCHEMA_VERSION
|
schema_version: str = SCHEMA_VERSION
|
||||||
collected_at: datetime = field(default_factory=now_taipei)
|
collected_at: datetime = field(default_factory=now_taipei)
|
||||||
|
|
||||||
|
# 告警基礎資訊(sensors=0 時的最小情報,2026-04-16 ogt + Claude Sonnet 4.6)
|
||||||
|
alert_info: dict[str, Any] | None = None
|
||||||
|
|
||||||
# 8D 感官數據
|
# 8D 感官數據
|
||||||
k8s_state: dict[str, Any] | None = None # D1
|
k8s_state: dict[str, Any] | None = None # D1
|
||||||
recent_logs: str | None = None # D2 (sanitized)
|
recent_logs: str | None = None # D2 (sanitized)
|
||||||
@@ -134,6 +137,10 @@ class EvidenceSnapshot:
|
|||||||
"""
|
"""
|
||||||
parts: list[str] = []
|
parts: list[str] = []
|
||||||
|
|
||||||
|
# 告警基礎資訊永遠放在最前(sensors=0 時也要讓 AI 知道是什麼告警)
|
||||||
|
if self.alert_info:
|
||||||
|
parts.append(f"[告警資訊] {self.alert_info}")
|
||||||
|
|
||||||
if self.k8s_state:
|
if self.k8s_state:
|
||||||
parts.append(f"[K8s狀態] {self.k8s_state}")
|
parts.append(f"[K8s狀態] {self.k8s_state}")
|
||||||
if self.recent_logs:
|
if self.recent_logs:
|
||||||
|
|||||||
@@ -106,6 +106,22 @@ class PreDecisionInvestigator:
|
|||||||
snapshot = EvidenceSnapshot(incident_id=incident_id)
|
snapshot = EvidenceSnapshot(incident_id=incident_id)
|
||||||
snapshot.sensors_attempted = len(tools)
|
snapshot.sensors_attempted = len(tools)
|
||||||
|
|
||||||
|
# 告警基礎資訊:sensors=0 時 AI 至少知道是什麼告警
|
||||||
|
# 2026-04-16 ogt + Claude Sonnet 4.6: 修復空 evidence → ABSTAIN 問題
|
||||||
|
sigs = getattr(incident, "signals", []) or []
|
||||||
|
sig0 = sigs[0] if sigs else None
|
||||||
|
snapshot.alert_info = {
|
||||||
|
"alert_name": alertname or getattr(incident, "alertname", "") or "",
|
||||||
|
"severity": str(getattr(incident, "severity", "")),
|
||||||
|
"affected_services": getattr(incident, "affected_services", []) or [],
|
||||||
|
"labels": labels,
|
||||||
|
"annotations": (
|
||||||
|
({k: v for k, v in (sig0.annotations or {}).items()} if sig0 else {})
|
||||||
|
),
|
||||||
|
"source": getattr(sig0, "source", "") if sig0 else "",
|
||||||
|
"incident_id": incident_id,
|
||||||
|
}
|
||||||
|
|
||||||
# 3. 並行蒐集(整體 INVESTIGATOR_TIMEOUT_SEC 保護)
|
# 3. 並行蒐集(整體 INVESTIGATOR_TIMEOUT_SEC 保護)
|
||||||
try:
|
try:
|
||||||
await asyncio.wait_for(
|
await asyncio.wait_for(
|
||||||
@@ -343,13 +359,26 @@ def _fill_snapshot_dimension(
|
|||||||
|
|
||||||
def _get_alertname(incident: "Incident") -> str:
|
def _get_alertname(incident: "Incident") -> str:
|
||||||
if incident.signals:
|
if incident.signals:
|
||||||
return incident.signals[0].labels.get("alertname", "")
|
sig = incident.signals[0]
|
||||||
return ""
|
# alert_name 在 Signal 頂層欄位,labels["alertname"] 是 Prometheus 慣例但可能為空
|
||||||
|
return (
|
||||||
|
getattr(sig, "alert_name", "")
|
||||||
|
or sig.labels.get("alertname", "")
|
||||||
|
or getattr(incident, "alertname", "")
|
||||||
|
or ""
|
||||||
|
)
|
||||||
|
return getattr(incident, "alertname", "") or ""
|
||||||
|
|
||||||
|
|
||||||
def _get_labels(incident: "Incident") -> dict[str, Any]:
|
def _get_labels(incident: "Incident") -> dict[str, Any]:
|
||||||
if incident.signals:
|
if incident.signals:
|
||||||
return incident.signals[0].labels
|
sig = incident.signals[0]
|
||||||
|
labels = sig.labels or {}
|
||||||
|
# 若 labels 缺少 alertname,補上頂層的 alert_name
|
||||||
|
if "alertname" not in labels and getattr(sig, "alert_name", ""):
|
||||||
|
labels = dict(labels)
|
||||||
|
labels["alertname"] = sig.alert_name
|
||||||
|
return labels
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user