fix(aiops): 修復 evidence 空白 → AI ABSTAIN 問題
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 32m33s
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 32m33s
問題:
- signal.alert_name 在頂層,但 _get_alertname() 只從 labels["alertname"] 讀取 → 得到空字串
- 所有 sensor 失敗時 evidence_summary 只有 120 字元,AI 無法分析 → ABSTAIN
- labels 為空時,AI 根本不知道是什麼告警

修復:
1. _get_alertname(): 優先讀 signal.alert_name,fallback 到 labels["alertname"]
2. _get_labels(): 自動補 alertname 到 labels dict
3. EvidenceSnapshot.alert_info: 新增告警基礎欄位(sensors=0 時的最小情報)
4. build_summary(): alert_info 永遠放在最前,讓 AI 至少知道告警類型+嚴重度

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -77,6 +77,9 @@ class EvidenceSnapshot:
|
||||
schema_version: str = SCHEMA_VERSION
|
||||
collected_at: datetime = field(default_factory=now_taipei)
|
||||
|
||||
# 告警基礎資訊(sensors=0 時的最小情報,2026-04-16 ogt + Claude Sonnet 4.6)
|
||||
alert_info: dict[str, Any] | None = None
|
||||
|
||||
# 8D 感官數據
|
||||
k8s_state: dict[str, Any] | None = None # D1
|
||||
recent_logs: str | None = None # D2 (sanitized)
|
||||
@@ -134,6 +137,10 @@ class EvidenceSnapshot:
|
||||
"""
|
||||
parts: list[str] = []
|
||||
|
||||
# 告警基礎資訊永遠放在最前(sensors=0 時也要讓 AI 知道是什麼告警)
|
||||
if self.alert_info:
|
||||
parts.append(f"[告警資訊] {self.alert_info}")
|
||||
|
||||
if self.k8s_state:
|
||||
parts.append(f"[K8s狀態] {self.k8s_state}")
|
||||
if self.recent_logs:
|
||||
|
||||
@@ -106,6 +106,22 @@ class PreDecisionInvestigator:
|
||||
snapshot = EvidenceSnapshot(incident_id=incident_id)
|
||||
snapshot.sensors_attempted = len(tools)
|
||||
|
||||
# 告警基礎資訊:sensors=0 時 AI 至少知道是什麼告警
|
||||
# 2026-04-16 ogt + Claude Sonnet 4.6: 修復空 evidence → ABSTAIN 問題
|
||||
sigs = getattr(incident, "signals", []) or []
|
||||
sig0 = sigs[0] if sigs else None
|
||||
snapshot.alert_info = {
|
||||
"alert_name": alertname or getattr(incident, "alertname", "") or "",
|
||||
"severity": str(getattr(incident, "severity", "")),
|
||||
"affected_services": getattr(incident, "affected_services", []) or [],
|
||||
"labels": labels,
|
||||
"annotations": (
|
||||
({k: v for k, v in (sig0.annotations or {}).items()} if sig0 else {})
|
||||
),
|
||||
"source": getattr(sig0, "source", "") if sig0 else "",
|
||||
"incident_id": incident_id,
|
||||
}
|
||||
|
||||
# 3. 並行蒐集(整體 INVESTIGATOR_TIMEOUT_SEC 保護)
|
||||
try:
|
||||
await asyncio.wait_for(
|
||||
@@ -343,13 +359,26 @@ def _fill_snapshot_dimension(
|
||||
|
||||
def _get_alertname(incident: "Incident") -> str:
|
||||
if incident.signals:
|
||||
return incident.signals[0].labels.get("alertname", "")
|
||||
return ""
|
||||
sig = incident.signals[0]
|
||||
# alert_name 在 Signal 頂層欄位,labels["alertname"] 是 Prometheus 慣例但可能為空
|
||||
return (
|
||||
getattr(sig, "alert_name", "")
|
||||
or sig.labels.get("alertname", "")
|
||||
or getattr(incident, "alertname", "")
|
||||
or ""
|
||||
)
|
||||
return getattr(incident, "alertname", "") or ""
|
||||
|
||||
|
||||
def _get_labels(incident: "Incident") -> dict[str, Any]:
|
||||
if incident.signals:
|
||||
return incident.signals[0].labels
|
||||
sig = incident.signals[0]
|
||||
labels = sig.labels or {}
|
||||
# 若 labels 缺少 alertname,補上頂層的 alert_name
|
||||
if "alertname" not in labels and getattr(sig, "alert_name", ""):
|
||||
labels = dict(labels)
|
||||
labels["alertname"] = sig.alert_name
|
||||
return labels
|
||||
return {}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user