fix(watchdog): 修復 META SYSTEM 重複告警 — violation_codes 穩定 dedup
All checks were successful
Code Review / ai-code-review (push) Successful in 1m3s
All checks were successful
Code Review / ai-code-review (push) Successful in 1m3s
根因:violations 字串含動態浮點數(mean_trust/low_ratio),每次微變 → SHA256 不同 → dedup 失效
修法:新增 violation_codes list(穩定 W-code 格式),dedup 計算只用 violation_codes
violations 保持含動態值(顯示用),Telegram 通知照常顯示完整資訊
W-6 Trust Drift dedup key: W6:trust_drift:low_count={N}(不含浮點數)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -68,7 +68,13 @@ async def run_ai_slo_watchdog_loop() -> None:
|
||||
|
||||
|
||||
async def _check_once() -> None:
|
||||
# violations = 顯示用(含動態數值,送 Telegram)
|
||||
# violation_codes = dedup 用(穩定 W-code,不含浮點數值;W-1 附違反指標名稱、W-6 附整數 low_count)
|
||||
# 2026-05-04 ogt: 分離 dedup key 與顯示字串
|
||||
# 根因:W-2/3/5/6 字串含動態數字(count/ratio/score),每次微變 → 不同 SHA256 → dedup 失效
|
||||
# 修法:dedup 用穩定 violation_codes(WN:type[:detail] 格式,例如 W2:tg_silence),Telegram 照常顯示動態值
|
||||
violations: list[str] = []
|
||||
violation_codes: list[str] = []
|
||||
|
||||
# W-1: AI SLO 違反(決策品質 7d 滾動)
|
||||
try:
|
||||
@@ -77,6 +83,7 @@ async def _check_once() -> None:
|
||||
if report.any_violated:
|
||||
violated = [m.name for m in report.metrics if m.violated]
|
||||
violations.append(f"SLO 違反: {', '.join(violated)}")
|
||||
violation_codes.append(f"W1:slo_violated:{','.join(sorted(violated))}")
|
||||
except Exception as e:
|
||||
logger.warning("watchdog_w1_slo_check_failed", error=str(e))
|
||||
|
||||
@@ -91,6 +98,7 @@ async def _check_once() -> None:
|
||||
violations.append(
|
||||
f"{silent_count} 個 PENDING 告警超 30 分鐘未送達 Telegram(未曾發送,非 TTL 過期)"
|
||||
)
|
||||
violation_codes.append("W2:tg_silence")
|
||||
except Exception as e:
|
||||
logger.warning("watchdog_w2_tg_silence_check_failed", error=str(e))
|
||||
|
||||
@@ -114,10 +122,12 @@ async def _check_once() -> None:
|
||||
violations.append(
|
||||
"飛輪執行成功率資料管線無流量(uptime > 30min 仍無樣本)"
|
||||
)
|
||||
violation_codes.append("W3:flywheel_no_data")
|
||||
elif metrics and metrics.execution_success_rate < _FLYWHEEL_SUCCESS_MIN:
|
||||
violations.append(
|
||||
f"飛輪執行成功率 {metrics.execution_success_rate:.1%} < {_FLYWHEEL_SUCCESS_MIN:.0%}"
|
||||
)
|
||||
violation_codes.append("W3:flywheel_low_rate")
|
||||
except Exception as e:
|
||||
logger.warning("watchdog_w3_flywheel_check_failed", error=str(e))
|
||||
|
||||
@@ -139,8 +149,10 @@ async def _check_once() -> None:
|
||||
violations.append(
|
||||
"Playbook 表為空 — 初始化失敗或表被清空(uptime > 30min 仍 0 筆)"
|
||||
)
|
||||
violation_codes.append("W4:playbook_table_empty")
|
||||
elif approved_count == 0:
|
||||
violations.append("無 APPROVED Playbook — 自動修復鏈路斷裂(evolver 可能全部封存)")
|
||||
violation_codes.append("W4:no_approved_playbook")
|
||||
except Exception as e:
|
||||
logger.warning("watchdog_w4_playbook_check_failed", error=str(e))
|
||||
|
||||
@@ -154,6 +166,7 @@ async def _check_once() -> None:
|
||||
violations.append(
|
||||
f"Agent Debate 失敗導致 {stuck_count} 個告警分析卡住(PENDING + description='待分析' 超過 1 小時)"
|
||||
)
|
||||
violation_codes.append("W5:stuck_analysis")
|
||||
except Exception as e:
|
||||
logger.warning("watchdog_w5_stuck_analysis_check_failed", error=str(e))
|
||||
|
||||
@@ -171,6 +184,7 @@ async def _check_once() -> None:
|
||||
f"Trust Drift 偵測到 {dist.low_count} 個 Playbook 信任度低落"
|
||||
f"(low_ratio: {dist.low_ratio:.1%},mean_trust: {dist.mean_trust:.2f})"
|
||||
)
|
||||
violation_codes.append(f"W6:trust_drift:low_count={dist.low_count}")
|
||||
except Exception as e:
|
||||
logger.warning("watchdog_w6_trust_drift_check_failed", error=str(e))
|
||||
|
||||
@@ -178,11 +192,12 @@ async def _check_once() -> None:
|
||||
logger.debug("ai_slo_watchdog_all_ok", checks=6)
|
||||
return
|
||||
|
||||
# 去重:violations 相同內容 1 小時內不重複發
|
||||
# 2026-05-04 ogt: 修正 hash() 非確定性 bug — Python hash() 每次 process 重啟值不同
|
||||
# (PYTHONHASHSEED 隨機化),導致每次 rollout 都繞過 dedup。改用 hashlib.sha256 確保跨 pod/重啟一致。
|
||||
# 去重:用穩定 violation_codes 計算 SHA256,避免動態數值(ratio/score)造成每次不同 hash
|
||||
# 2026-05-04 ogt: dedup 分離顯示字串與 dedup key
|
||||
# 根因:violations 字串含動態數字(count/ratio/score),每次微變 → SHA256 不同 → dedup 失效
|
||||
# 修法:violation_codes 只含 W-code + 穩定類型(W-1 另附違反指標名稱、W-6 另附整數 low_count),不含浮點數值
|
||||
import hashlib
|
||||
_content = "|".join(sorted(violations))
|
||||
_content = "|".join(sorted(violation_codes))
|
||||
dedup_hash = hashlib.sha256(_content.encode()).hexdigest()[:12]
|
||||
dedup_key = f"watchdog:alert:{dedup_hash}"
|
||||
redis = get_redis()
|
||||
|
||||
Reference in New Issue
Block a user