From 5ed396e390130faf60fb4bdbf8114365508eb558 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 6 May 2026 00:19:35 +0800 Subject: [PATCH] fix(decision): derive telegram dedup from incident signals --- apps/api/src/services/decision_manager.py | 18 ++++++++-- .../test_decision_manager_telegram_dedup.py | 35 +++++++++++++++++++ docs/LOGBOOK.md | 9 +++++ 3 files changed, 60 insertions(+), 2 deletions(-) create mode 100644 apps/api/tests/test_decision_manager_telegram_dedup.py diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index 68e9b48b..c3a83588 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -55,6 +55,20 @@ def _fire_and_forget(coro) -> asyncio.Task: return task +def _incident_alertname_for_dedup(incident: Incident) -> str: + """Return a stable alert name for Telegram fingerprint dedup.""" + if incident.signals: + signal = incident.signals[0] + return ( + signal.labels.get("alertname") + or signal.alert_name + or signal.annotations.get("summary") + or signal.annotations.get("description") + or incident.incident_id + ) + return incident.incident_id + + def _phase2_fallback_reason(package: Any) -> str | None: """Return why a Phase 2 package should continue to Playbook/LLM fallback. @@ -212,7 +226,7 @@ async def _push_decision_to_telegram( # 改成 alertname+target 構造的 fingerprint key + TTL 86400s,同症狀共用 dedup。 # Incident 真正 RESOLVED/CLOSED 時走 line 220-226 的 status check 提早 return,不影響復發偵測。 redis = get_redis() - _alertname_fp = (incident.title or "unknown").strip().lower().replace(" ", "_")[:60] + _alertname_fp = _incident_alertname_for_dedup(incident).strip().lower().replace(" ", "_")[:60] _target_fp = ( incident.affected_services[0] if incident.affected_services else "unknown" ).lower()[:40] @@ -3236,7 +3250,7 @@ class DecisionManager: # 與 line 217-218 同邏輯,避免 pod restart resend 路徑繞過 fingerprint dedup。 # 原本 telegram_sent:{incident_id} TTL 600s 早就過期 → 重啟必重發; # 改 fingerprint + 24h TTL → 同症狀 24h 內任何 INC ID 都不會重推。 - _alertname_fp = (getattr(incident, "title", None) or "unknown").strip().lower().replace(" ", "_")[:60] + _alertname_fp = _incident_alertname_for_dedup(incident).strip().lower().replace(" ", "_")[:60] _affected = getattr(incident, "affected_services", None) or [] _target_fp = (_affected[0] if _affected else "unknown").lower()[:40] dedup_key = f"telegram_sent:fp:{_alertname_fp}:{_target_fp}" diff --git a/apps/api/tests/test_decision_manager_telegram_dedup.py b/apps/api/tests/test_decision_manager_telegram_dedup.py new file mode 100644 index 00000000..84557021 --- /dev/null +++ b/apps/api/tests/test_decision_manager_telegram_dedup.py @@ -0,0 +1,35 @@ +from datetime import UTC, datetime + +from src.models.incident import Incident, Severity, Signal +from src.services.decision_manager import _incident_alertname_for_dedup + + +def test_telegram_dedup_alertname_uses_signal_not_missing_title() -> None: + incident = Incident( + incident_id="INC-TEST", + severity=Severity.P3, + signals=[ + Signal( + alert_name="FallbackAlertName", + severity=Severity.P3, + source="alertmanager", + fired_at=datetime.now(UTC), + labels={"alertname": "DockerContainerMemoryLimitPressure"}, + ) + ], + affected_services=["node-exporter-110"], + ) + + assert not hasattr(incident, "title") + assert _incident_alertname_for_dedup(incident) == "DockerContainerMemoryLimitPressure" + + +def test_telegram_dedup_alertname_falls_back_to_incident_id() -> None: + incident = Incident( + incident_id="INC-NO-SIGNAL", + severity=Severity.P3, + signals=[], + affected_services=[], + ) + + assert _incident_alertname_for_dedup(incident) == "INC-NO-SIGNAL" diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 0cdbc4c7..663d27b1 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -6,6 +6,15 @@ --- +## 2026-05-06 | Decision Telegram dedup no longer reads missing Incident.title + +**背景**:新 Ollama-first 部署後,production log 顯示 alert diagnosis 已走 `ollama_gcp_a -> ollama_gcp_b -> ollama_local -> gemini` 且 `phase24_ai_router_used` provider=`ollama`,但 DecisionManager 推送 Telegram decision card 時出現 `telegram_decision_push_failed: 'Incident' object has no attribute 'title'`。 + +**本次修補**: +- 新增 `_incident_alertname_for_dedup()`,Telegram fingerprint dedup 改從 `signal.labels.alertname -> signal.alert_name -> signal.annotations -> incident_id` 取值。 +- `_push_decision_to_telegram()` 與 stale READY token resend 共用同一個 dedup helper,避免兩條路徑再次漂移。 +- 補 `test_decision_manager_telegram_dedup.py`,鎖住 `Incident` 無 `title` 欄位時仍能產出 alertname fingerprint。 + ## 2026-05-05 | Alert diagnosis prioritizes resolution over speed **背景**:統帥明確修正策略:告警不是為了快速發卡片,而是為了把問題想清楚並完成 AI 自動化解決;GCP-A/GCP-B 有 SSD,可承擔深度診斷等待時間,Gemini 只能作 GCP-A → GCP-B → 111 全失敗後的備援。