fix(decision): TYPE-1 告警重複洗版兩個根因修復
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
根因 1: TYPE-1 bypass 在 existing_token 檢查之前執行
  → 每次 get_or_create_decision() 不管 token 是否存在,都直接推 TG
  → 修復: existing_token 檢查提前到 TYPE-1 bypass 之前(統一入口)

根因 2: TYPE-1 token TTL 僅 3600s
  → 1h 後 token 過期,下次掃描重新建立並再推 TG
  → 修復: TYPE-1 token TTL 提升至 86400s (24h)

影響: HostBackupFailed 等 TYPE-1 告警每個 incident 只推 1 次(24h 內)

2026-04-16 ogt + Claude Sonnet 4.6

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1188,6 +1188,18 @@ class DecisionManager:
|
||||
"""
|
||||
_redis_client = get_redis()
|
||||
|
||||
# 1. 先檢查現有 token(所有類型統一入口)
|
||||
# 2026-04-16 ogt + Claude Sonnet 4.6: 修復 TYPE-1 bypass 未檢查 existing token 導致
|
||||
# HostBackupFailed 等告警重複洗版 — existing token 檢查必須在 TYPE-1 bypass 前執行
|
||||
existing_token = await self._find_existing_token(incident.incident_id)
|
||||
if existing_token:
|
||||
# READY 或 EXECUTING 狀態: 直接返回
|
||||
if existing_token.state in (DecisionState.READY, DecisionState.EXECUTING):
|
||||
return existing_token
|
||||
# COMPLETED 狀態: 直接返回,避免重複建立 decision 導致 Telegram 轟炸
|
||||
if existing_token.state == DecisionState.COMPLETED:
|
||||
return existing_token
|
||||
|
||||
# ADR-073 Phase 3-1: TYPE-1 triage guard — 純資訊告警跳過 LLM 分析
|
||||
# classify_alert_early() 已在 webhook 入口設定 notification_type
|
||||
# TYPE-1 (info/backup/heartbeat) 不需 AI 推理,直接推 Telegram 後返回
|
||||
@@ -1207,7 +1219,9 @@ class DecisionManager:
|
||||
"description": "純資訊通知,無需操作",
|
||||
},
|
||||
)
|
||||
await self._save_token(_info_token)
|
||||
# 2026-04-16 ogt + Claude Sonnet 4.6: TYPE-1 token TTL 24h 防洗版
|
||||
# 原 3600s 導致每小時重推同一 HostBackupFailed/TYPE-1 告警
|
||||
await self._save_token(_info_token, ttl=86400)
|
||||
_fire_and_forget(_push_decision_to_telegram(incident, _info_token.proposal_data))
|
||||
logger.info(
|
||||
"decision_type1_bypass",
|
||||
@@ -1216,16 +1230,6 @@ class DecisionManager:
|
||||
)
|
||||
return _info_token
|
||||
|
||||
# 1. 檢查現有 token
|
||||
existing_token = await self._find_existing_token(incident.incident_id)
|
||||
if existing_token:
|
||||
# READY 或 EXECUTING 狀態: 直接返回
|
||||
if existing_token.state in (DecisionState.READY, DecisionState.EXECUTING):
|
||||
return existing_token
|
||||
# COMPLETED 狀態: 直接返回,避免重複建立 decision 導致 Telegram 轟炸
|
||||
if existing_token.state == DecisionState.COMPLETED:
|
||||
return existing_token
|
||||
|
||||
# 2. 建立新 token
|
||||
token = DecisionToken(
|
||||
token=f"DEC-{uuid4().hex[:12].upper()}",
|
||||
@@ -2383,8 +2387,12 @@ class DecisionManager:
|
||||
except Exception as e:
|
||||
logger.warning("decision_chain_persist_failed", incident_id=incident_id, error=str(e))
|
||||
|
||||
async def _save_token(self, token: DecisionToken) -> None:
|
||||
"""儲存決策令牌到 Redis"""
|
||||
async def _save_token(self, token: DecisionToken, ttl: int = DECISION_TOKEN_TTL) -> None:
|
||||
"""儲存決策令牌到 Redis
|
||||
|
||||
ttl: 過期秒數,預設 DECISION_TOKEN_TTL (3600s)
|
||||
TYPE-1 純資訊通知使用 86400s (24h) 防重複洗版
|
||||
"""
|
||||
import json
|
||||
redis_client = get_redis()
|
||||
key = f"{DECISION_TOKEN_PREFIX}{token.token}"
|
||||
@@ -2392,7 +2400,7 @@ class DecisionManager:
|
||||
await redis_client.set(
|
||||
key,
|
||||
json.dumps(token.to_dict()),
|
||||
ex=DECISION_TOKEN_TTL,
|
||||
ex=ttl,
|
||||
)
|
||||
|
||||
async def get_token(self, token_id: str) -> DecisionToken | None:
|
||||
|
||||
Reference in New Issue
Block a user