fix(aiops): 修復 evidence 空白 → AI ABSTAIN 問題
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 32m33s

問題:
- signal.alert_name 在頂層,但 _get_alertname() 從 labels["alertname"] 讀 → 空字串
- 所有 sensor 失敗時 evidence_summary 只有 120 字元,AI 無法分析 → ABSTAIN
- labels 為空時 AI 根本不知道是什麼告警

修復:
1. _get_alertname(): 優先讀 signal.alert_name,fallback labels["alertname"]
2. _get_labels(): 自動補 alertname 到 labels dict
3. EvidenceSnapshot.alert_info: 新增告警基礎欄位(sensors=0 時的最小情報)
4. build_summary(): alert_info 永遠放在最前,讓 AI 至少知道告警類型+嚴重度

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-16 16:26:07 +08:00
parent 588b0d745b
commit ded93cbba3
2 changed files with 39 additions and 3 deletions

View File

@@ -77,6 +77,9 @@ class EvidenceSnapshot:
schema_version: str = SCHEMA_VERSION
collected_at: datetime = field(default_factory=now_taipei)
# 告警基礎資訊(sensors=0 時的最小情報)— 2026-04-16 ogt + Claude Sonnet 4.6
alert_info: dict[str, Any] | None = None
# 8D 感官數據
k8s_state: dict[str, Any] | None = None # D1
recent_logs: str | None = None # D2 (sanitized)
@@ -134,6 +137,10 @@ class EvidenceSnapshot:
"""
parts: list[str] = []
# 告警基礎資訊永遠放在最前(sensors=0 時也要讓 AI 知道是什麼告警)
if self.alert_info:
parts.append(f"[告警資訊] {self.alert_info}")
if self.k8s_state:
parts.append(f"[K8s狀態] {self.k8s_state}")
if self.recent_logs:

View File

@@ -106,6 +106,22 @@ class PreDecisionInvestigator:
snapshot = EvidenceSnapshot(incident_id=incident_id)
snapshot.sensors_attempted = len(tools)
# 告警基礎資訊(sensors=0 時 AI 至少知道是什麼告警)
# 2026-04-16 ogt + Claude Sonnet 4.6: 修復空 evidence → ABSTAIN 問題
sigs = getattr(incident, "signals", []) or []
sig0 = sigs[0] if sigs else None
snapshot.alert_info = {
"alert_name": alertname or getattr(incident, "alertname", "") or "",
"severity": str(getattr(incident, "severity", "")),
"affected_services": getattr(incident, "affected_services", []) or [],
"labels": labels,
"annotations": (
({k: v for k, v in (sig0.annotations or {}).items()} if sig0 else {})
),
"source": getattr(sig0, "source", "") if sig0 else "",
"incident_id": incident_id,
}
# 3. 並行蒐集(整體 INVESTIGATOR_TIMEOUT_SEC 保護)
try:
await asyncio.wait_for(
@@ -343,13 +359,26 @@ def _fill_snapshot_dimension(
def _get_alertname(incident: "Incident") -> str:
    """Return the alert name for *incident*, or ``""`` when none is available.

    Lookup order (the first truthy value wins):

    1. the ``alert_name`` attribute of the first signal,
    2. ``labels["alertname"]`` of the first signal,
    3. the ``alertname`` attribute of the incident itself.

    Args:
        incident: Object exposing ``signals`` (a sequence, possibly empty)
            and optionally ``alertname``.

    Returns:
        The resolved alert name, or an empty string.
    """
    incident_level = getattr(incident, "alertname", "") or ""
    if not incident.signals:
        return incident_level

    sig = incident.signals[0]
    # ``labels`` may be None or empty; treat both as "no labels" so the
    # lookup never raises (mirrors the guard used in _get_labels).
    labels = sig.labels or {}
    # The signal-level ``alert_name`` takes priority because the
    # ``alertname`` label can be absent or empty.
    return (
        getattr(sig, "alert_name", "")
        or labels.get("alertname", "")
        or incident_level
    )
def _get_labels(incident: "Incident") -> dict[str, Any]:
    """Return the labels of the incident's first signal, with ``alertname`` set.

    When the labels have no ``alertname`` entry, or an empty one, and the
    signal carries a truthy ``alert_name`` attribute, a copy of the labels
    is returned with ``alertname`` filled in from that attribute. Otherwise
    the signal's own labels mapping is returned as-is.

    Args:
        incident: Object exposing ``signals`` (a sequence, possibly empty).

    Returns:
        The label mapping, or ``{}`` when there are no signals or no labels.
    """
    if not incident.signals:
        return {}

    sig = incident.signals[0]
    labels = sig.labels or {}
    alert_name = getattr(sig, "alert_name", "")
    # Back-fill when the label is missing *or* present but empty.
    if alert_name and not labels.get("alertname"):
        # Build a new dict so the signal's own labels are never mutated.
        labels = {**labels, "alertname": alert_name}
    return labels