fix(decision): derive telegram dedup from incident signals
This commit is contained in:
@@ -55,6 +55,20 @@ def _fire_and_forget(coro) -> asyncio.Task:
|
||||
return task
|
||||
|
||||
|
||||
def _incident_alertname_for_dedup(incident: Incident) -> str:
    """Return a stable alert name for Telegram fingerprint dedup.

    Only the first entry of ``incident.signals`` is consulted. Candidates are
    tried in this order and the first one that is still non-empty after
    stripping surrounding whitespace wins:

    1. ``signal.labels["alertname"]``
    2. ``signal.alert_name``
    3. ``signal.annotations["summary"]``
    4. ``signal.annotations["description"]``
    5. ``incident.incident_id``

    Args:
        incident: Incident whose first signal (if any) names the alert.

    Returns:
        The chosen name with surrounding whitespace removed, or ``"unknown"``
        when every candidate is missing or blank. The result is always a
        non-empty ``str``, so callers can normalise it without guarding.
    """
    candidates: list[object] = []

    signals = getattr(incident, "signals", None) or []
    if signals:
        signal = signals[0]
        # Treat a missing/None mapping as empty so a sparsely populated signal
        # degrades to the next candidate instead of raising in the push path.
        labels = getattr(signal, "labels", None) or {}
        annotations = getattr(signal, "annotations", None) or {}
        candidates += [
            labels.get("alertname"),
            getattr(signal, "alert_name", None),
            annotations.get("summary"),
            annotations.get("description"),
        ]
    candidates.append(getattr(incident, "incident_id", None))

    for candidate in candidates:
        if not candidate:
            continue
        # A whitespace-only value would otherwise be selected and then collapse
        # to an empty fingerprint segment once the caller strips it.
        name = str(candidate).strip()
        if name:
            return name

    # Same placeholder the callers use for a missing target service.
    return "unknown"
|
||||
|
||||
|
||||
def _phase2_fallback_reason(package: Any) -> str | None:
|
||||
"""Return why a Phase 2 package should continue to Playbook/LLM fallback.
|
||||
|
||||
@@ -212,7 +226,7 @@ async def _push_decision_to_telegram(
|
||||
# 改成 alertname+target 構造的 fingerprint key + TTL 86400s,同症狀共用 dedup。
|
||||
# Incident 真正 RESOLVED/CLOSED 時走 line 220-226 的 status check 提早 return,不影響復發偵測。
|
||||
redis = get_redis()
|
||||
_alertname_fp = (incident.title or "unknown").strip().lower().replace(" ", "_")[:60]
|
||||
_alertname_fp = _incident_alertname_for_dedup(incident).strip().lower().replace(" ", "_")[:60]
|
||||
_target_fp = (
|
||||
incident.affected_services[0] if incident.affected_services else "unknown"
|
||||
).lower()[:40]
|
||||
@@ -3236,7 +3250,7 @@ class DecisionManager:
|
||||
# 與 _push_decision_to_telegram() 使用相同的 fingerprint key,避免 pod restart resend 路徑繞過 dedup。
|
||||
# 原本 telegram_sent:{incident_id} TTL 600s 早就過期 → 重啟必重發;
|
||||
# 改 fingerprint + 24h TTL → 同症狀 24h 內任何 INC ID 都不會重推。
|
||||
_alertname_fp = (getattr(incident, "title", None) or "unknown").strip().lower().replace(" ", "_")[:60]
|
||||
_alertname_fp = _incident_alertname_for_dedup(incident).strip().lower().replace(" ", "_")[:60]
|
||||
_affected = getattr(incident, "affected_services", None) or []
|
||||
_target_fp = (_affected[0] if _affected else "unknown").lower()[:40]
|
||||
dedup_key = f"telegram_sent:fp:{_alertname_fp}:{_target_fp}"
|
||||
|
||||
35
apps/api/tests/test_decision_manager_telegram_dedup.py
Normal file
35
apps/api/tests/test_decision_manager_telegram_dedup.py
Normal file
@@ -0,0 +1,35 @@
|
||||
from datetime import UTC, datetime
|
||||
|
||||
from src.models.incident import Incident, Severity, Signal
|
||||
from src.services.decision_manager import _incident_alertname_for_dedup
|
||||
|
||||
|
||||
def test_telegram_dedup_alertname_uses_signal_not_missing_title() -> None:
    """The dedup name is taken from the signal's ``alertname`` label."""
    labelled_signal = Signal(
        alert_name="FallbackAlertName",
        severity=Severity.P3,
        source="alertmanager",
        fired_at=datetime.now(UTC),
        labels={"alertname": "DockerContainerMemoryLimitPressure"},
    )
    incident = Incident(
        incident_id="INC-TEST",
        severity=Severity.P3,
        signals=[labelled_signal],
        affected_services=["node-exporter-110"],
    )

    # Premise of the regression: the incident exposes no ``title`` attribute.
    assert not hasattr(incident, "title")

    # The label must win over the signal's own ``alert_name``.
    dedup_name = _incident_alertname_for_dedup(incident)
    assert dedup_name == "DockerContainerMemoryLimitPressure"
|
||||
|
||||
|
||||
def test_telegram_dedup_alertname_falls_back_to_incident_id() -> None:
    """With no signals attached, the incident id itself is the dedup name."""
    expected_name = "INC-NO-SIGNAL"
    signalless_incident = Incident(
        incident_id="INC-NO-SIGNAL",
        severity=Severity.P3,
        signals=[],
        affected_services=[],
    )

    assert _incident_alertname_for_dedup(signalless_incident) == expected_name
|
||||
@@ -6,6 +6,15 @@
|
||||
|
||||
---
|
||||
|
||||
## 2026-05-06 | Decision Telegram dedup no longer reads missing Incident.title
|
||||
|
||||
**背景**:新 Ollama-first 部署後,production log 顯示 alert diagnosis 已走 `ollama_gcp_a -> ollama_gcp_b -> ollama_local -> gemini` 且 `phase24_ai_router_used` provider=`ollama`,但 DecisionManager 推送 Telegram decision card 時出現 `telegram_decision_push_failed: 'Incident' object has no attribute 'title'`。
|
||||
|
||||
**本次修補**:
|
||||
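- 新增 `_incident_alertname_for_dedup()`,Telegram fingerprint dedup 改為只看第一個 signal,依序從 `signal.labels["alertname"] -> signal.alert_name -> signal.annotations["summary"] -> signal.annotations["description"] -> incident_id` 取第一個非空值;沒有任何 signal 時直接使用 `incident_id`。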
|
||||
- `_push_decision_to_telegram()` 與 stale READY token resend 共用同一個 dedup helper,避免兩條路徑再次漂移。
|
||||
- 補 `test_decision_manager_telegram_dedup.py`,鎖住 `Incident` 無 `title` 欄位時仍能產出 alertname fingerprint。
|
||||
|
||||
## 2026-05-05 | Alert diagnosis prioritizes resolution over speed
|
||||
|
||||
**背景**:統帥明確修正策略:告警不是為了快速發卡片,而是為了把問題想清楚並完成 AI 自動化解決;GCP-A/GCP-B 有 SSD,可承擔深度診斷等待時間,Gemini 只能作 GCP-A → GCP-B → 111 全失敗後的備援。
|
||||
|
||||
Reference in New Issue
Block a user