fix(decision): TYPE-1 告警重複洗版兩個根因修復
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled

根因 1: TYPE-1 bypass 在 existing_token 檢查之前執行
→ 每次呼叫 get_or_create_decision() 時,不論 token 是否已存在,都會直接推送 Telegram (TG)
→ 修復: existing_token 檢查提前到 TYPE-1 bypass 之前(統一入口)

根因 2: TYPE-1 token TTL 僅 3600s
→ 1h 後 token 過期,下次掃描重新建立並再推 TG
→ 修復: TYPE-1 token TTL 提升至 86400s (24h)

影響: HostBackupFailed 等 TYPE-1 告警,每個 incident 在 24h 內只會推送 1 次 Telegram 通知

2026-04-16 ogt + Claude Sonnet 4.6

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-16 02:49:31 +08:00
parent 62bcc50770
commit 5a3a649f8a

View File

@@ -1188,6 +1188,18 @@ class DecisionManager:
"""
_redis_client = get_redis()
# 1. 先檢查現有 token(所有類型統一入口)
# 2026-04-16 ogt + Claude Sonnet 4.6: 修復 TYPE-1 bypass 未檢查 existing token 導致
# HostBackupFailed 等告警重複洗版 — existing token 檢查必須在 TYPE-1 bypass 前執行
existing_token = await self._find_existing_token(incident.incident_id)
if existing_token:
# READY 或 EXECUTING 狀態: 直接返回
if existing_token.state in (DecisionState.READY, DecisionState.EXECUTING):
return existing_token
# COMPLETED 狀態: 直接返回,避免重複建立 decision 導致 Telegram 轟炸
if existing_token.state == DecisionState.COMPLETED:
return existing_token
# ADR-073 Phase 3-1: TYPE-1 triage guard — 純資訊告警跳過 LLM 分析
# classify_alert_early() 已在 webhook 入口設定 notification_type
# TYPE-1 (info/backup/heartbeat) 不需 AI 推理,直接推 Telegram 後返回
@@ -1207,7 +1219,9 @@ class DecisionManager:
"description": "純資訊通知,無需操作",
},
)
await self._save_token(_info_token)
# 2026-04-16 ogt + Claude Sonnet 4.6: TYPE-1 token TTL 24h 防洗版
# 原 3600s 導致每小時重推同一 HostBackupFailed/TYPE-1 告警
await self._save_token(_info_token, ttl=86400)
_fire_and_forget(_push_decision_to_telegram(incident, _info_token.proposal_data))
logger.info(
"decision_type1_bypass",
@@ -1216,16 +1230,6 @@ class DecisionManager:
)
return _info_token
# 1. 檢查現有 token
existing_token = await self._find_existing_token(incident.incident_id)
if existing_token:
# READY 或 EXECUTING 狀態: 直接返回
if existing_token.state in (DecisionState.READY, DecisionState.EXECUTING):
return existing_token
# COMPLETED 狀態: 直接返回,避免重複建立 decision 導致 Telegram 轟炸
if existing_token.state == DecisionState.COMPLETED:
return existing_token
# 2. 建立新 token
token = DecisionToken(
token=f"DEC-{uuid4().hex[:12].upper()}",
@@ -2383,8 +2387,12 @@ class DecisionManager:
except Exception as e:
logger.warning("decision_chain_persist_failed", incident_id=incident_id, error=str(e))
async def _save_token(self, token: DecisionToken) -> None:
"""儲存決策令牌到 Redis"""
async def _save_token(self, token: DecisionToken, ttl: int = DECISION_TOKEN_TTL) -> None:
"""儲存決策令牌到 Redis
ttl: 過期秒數,預設 DECISION_TOKEN_TTL (3600s)
TYPE-1 純資訊通知使用 86400s (24h) 防重複洗版
"""
import json
redis_client = get_redis()
key = f"{DECISION_TOKEN_PREFIX}{token.token}"
@@ -2392,7 +2400,7 @@ class DecisionManager:
await redis_client.set(
key,
json.dumps(token.to_dict()),
ex=DECISION_TOKEN_TTL,
ex=ttl,
)
async def get_token(self, token_id: str) -> DecisionToken | None: