fix(watchdog): 修復 META SYSTEM 重複告警 — violation_codes 穩定 dedup
All checks were successful
Code Review / ai-code-review (push) Successful in 1m3s

根因:violations 字串含動態浮點數(mean_trust/low_ratio),每次微變 → SHA256 不同 → dedup 失效
修法:新增 violation_codes list(穩定 W-code 格式),dedup 計算只用 violation_codes
     violations 保持含動態值(顯示用),Telegram 通知照常顯示完整資訊

W-6 Trust Drift dedup key: W6:trust_drift:low_count={N}(不含浮點數)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Your Name
2026-05-05 00:06:38 +08:00
parent 40badc42cf
commit 10e665a540

View File

@@ -68,7 +68,13 @@ async def run_ai_slo_watchdog_loop() -> None:
async def _check_once() -> None:
# violations = 顯示用(含動態數值,送 Telegram)
# violation_codes = dedup 用(穩定 W-code,不含動態數值)
# 2026-05-04 ogt: 分離 dedup key 與顯示字串
# 根因:W-2/3/5/6 字串含動態數字(count/ratio/score),每次微變 → 不同 SHA256 → dedup 失效
# 修法:dedup 用穩定 violation_codes(W-N:type 格式),Telegram 照常顯示動態值
violations: list[str] = []
violation_codes: list[str] = []
# W-1: AI SLO 違反(決策品質 7d 滾動)
try:
@@ -77,6 +83,7 @@ async def _check_once() -> None:
if report.any_violated:
violated = [m.name for m in report.metrics if m.violated]
violations.append(f"SLO 違反: {', '.join(violated)}")
violation_codes.append(f"W1:slo_violated:{','.join(sorted(violated))}")
except Exception as e:
logger.warning("watchdog_w1_slo_check_failed", error=str(e))
@@ -91,6 +98,7 @@ async def _check_once() -> None:
violations.append(
f"{silent_count} 個 PENDING 告警超 30 分鐘未送達 Telegram未曾發送非 TTL 過期)"
)
violation_codes.append("W2:tg_silence")
except Exception as e:
logger.warning("watchdog_w2_tg_silence_check_failed", error=str(e))
@@ -114,10 +122,12 @@ async def _check_once() -> None:
violations.append(
"飛輪執行成功率資料管線無流量uptime > 30min 仍無樣本)"
)
violation_codes.append("W3:flywheel_no_data")
elif metrics and metrics.execution_success_rate < _FLYWHEEL_SUCCESS_MIN:
violations.append(
f"飛輪執行成功率 {metrics.execution_success_rate:.1%} < {_FLYWHEEL_SUCCESS_MIN:.0%}"
)
violation_codes.append("W3:flywheel_low_rate")
except Exception as e:
logger.warning("watchdog_w3_flywheel_check_failed", error=str(e))
@@ -139,8 +149,10 @@ async def _check_once() -> None:
violations.append(
"Playbook 表為空 — 初始化失敗或表被清空uptime > 30min 仍 0 筆)"
)
violation_codes.append("W4:playbook_table_empty")
elif approved_count == 0:
violations.append("無 APPROVED Playbook — 自動修復鏈路斷裂evolver 可能全部封存)")
violation_codes.append("W4:no_approved_playbook")
except Exception as e:
logger.warning("watchdog_w4_playbook_check_failed", error=str(e))
@@ -154,6 +166,7 @@ async def _check_once() -> None:
violations.append(
f"Agent Debate 失敗導致 {stuck_count} 個告警分析卡住PENDING + description='待分析' 超過 1 小時)"
)
violation_codes.append("W5:stuck_analysis")
except Exception as e:
logger.warning("watchdog_w5_stuck_analysis_check_failed", error=str(e))
@@ -171,6 +184,7 @@ async def _check_once() -> None:
f"Trust Drift 偵測到 {dist.low_count} 個 Playbook 信任度低落"
f"low_ratio: {dist.low_ratio:.1%}mean_trust: {dist.mean_trust:.2f}"
)
violation_codes.append(f"W6:trust_drift:low_count={dist.low_count}")
except Exception as e:
logger.warning("watchdog_w6_trust_drift_check_failed", error=str(e))
@@ -178,11 +192,12 @@ async def _check_once() -> None:
logger.debug("ai_slo_watchdog_all_ok", checks=6)
return
# 去重:violations 相同內容 1 小時內不重複發
# 2026-05-04 ogt: 修正 hash() 非確定性 bug — Python hash() 每次 process 重啟值不同
# (PYTHONHASHSEED 隨機化),導致每次 rollout 都繞過 dedup。改用 hashlib.sha256 確保跨 pod/重啟一致。
# 去重:用穩定 violation_codes 計算 SHA256,避免動態數值(ratio/score)造成每次不同 hash
# 2026-05-04 ogt: dedup 分離顯示字串與 dedup key
# 根因:violations 字串含動態數字(count/ratio/score),每次微變 → SHA256 不同 → dedup 失效
# 修法:violation_codes 只含 W-code + 穩定類型,不含浮點數值
import hashlib
_content = "|".join(sorted(violations))
_content = "|".join(sorted(violation_codes))
dedup_hash = hashlib.sha256(_content.encode()).hexdigest()[:12]
dedup_key = f"watchdog:alert:{dedup_hash}"
redis = get_redis()