From 10e665a540019a9b5a331c2bd6c9fbfa6b09e91f Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 5 May 2026 00:06:38 +0800 Subject: [PATCH] =?UTF-8?q?fix(watchdog):=20=E4=BF=AE=E5=BE=A9=20META=20SY?= =?UTF-8?q?STEM=20=E9=87=8D=E8=A4=87=E5=91=8A=E8=AD=A6=20=E2=80=94=20viola?= =?UTF-8?q?tion=5Fcodes=20=E7=A9=A9=E5=AE=9A=20dedup?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 根因:violations 字串含動態浮點數(mean_trust/low_ratio),每次微變 → SHA256 不同 → dedup 失效 修法:新增 violation_codes list(穩定 W-code 格式),dedup 計算只用 violation_codes violations 保持含動態值(顯示用),Telegram 通知照常顯示完整資訊 W-6 Trust Drift dedup key: W6:trust_drift:low_count={N}(不含浮點數) Co-Authored-By: Claude Sonnet 4.6 --- apps/api/src/jobs/ai_slo_watchdog_job.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/apps/api/src/jobs/ai_slo_watchdog_job.py b/apps/api/src/jobs/ai_slo_watchdog_job.py index 6f2551d0..a0ab180b 100644 --- a/apps/api/src/jobs/ai_slo_watchdog_job.py +++ b/apps/api/src/jobs/ai_slo_watchdog_job.py @@ -68,7 +68,13 @@ async def run_ai_slo_watchdog_loop() -> None: async def _check_once() -> None: + # violations = 顯示用(含動態數值,送 Telegram) + # violation_codes = dedup 用(穩定 W-code,不含動態數值) + # 2026-05-04 ogt: 分離 dedup key 與顯示字串 + # 根因:W-2/3/5/6 字串含動態數字(count/ratio/score),每次微變 → 不同 SHA256 → dedup 失效 + # 修法:dedup 用穩定 violation_codes(W-N:type 格式),Telegram 照常顯示動態值 violations: list[str] = [] + violation_codes: list[str] = [] # W-1: AI SLO 違反(決策品質 7d 滾動) try: @@ -77,6 +83,7 @@ async def _check_once() -> None: if report.any_violated: violated = [m.name for m in report.metrics if m.violated] violations.append(f"SLO 違反: {', '.join(violated)}") + violation_codes.append(f"W1:slo_violated:{','.join(sorted(violated))}") except Exception as e: logger.warning("watchdog_w1_slo_check_failed", error=str(e)) @@ -91,6 +98,7 @@ async def _check_once() -> None: violations.append( f"{silent_count} 個 PENDING 告警超 30 分鐘未送達 Telegram(未曾發送,非 TTL 過期)" ) + 
violation_codes.append("W2:tg_silence") except Exception as e: logger.warning("watchdog_w2_tg_silence_check_failed", error=str(e)) @@ -114,10 +122,12 @@ async def _check_once() -> None: violations.append( "飛輪執行成功率資料管線無流量(uptime > 30min 仍無樣本)" ) + violation_codes.append("W3:flywheel_no_data") elif metrics and metrics.execution_success_rate < _FLYWHEEL_SUCCESS_MIN: violations.append( f"飛輪執行成功率 {metrics.execution_success_rate:.1%} < {_FLYWHEEL_SUCCESS_MIN:.0%}" ) + violation_codes.append("W3:flywheel_low_rate") except Exception as e: logger.warning("watchdog_w3_flywheel_check_failed", error=str(e)) @@ -139,8 +149,10 @@ async def _check_once() -> None: violations.append( "Playbook 表為空 — 初始化失敗或表被清空(uptime > 30min 仍 0 筆)" ) + violation_codes.append("W4:playbook_table_empty") elif approved_count == 0: violations.append("無 APPROVED Playbook — 自動修復鏈路斷裂(evolver 可能全部封存)") + violation_codes.append("W4:no_approved_playbook") except Exception as e: logger.warning("watchdog_w4_playbook_check_failed", error=str(e)) @@ -154,6 +166,7 @@ async def _check_once() -> None: violations.append( f"Agent Debate 失敗導致 {stuck_count} 個告警分析卡住(PENDING + description='待分析' 超過 1 小時)" ) + violation_codes.append("W5:stuck_analysis") except Exception as e: logger.warning("watchdog_w5_stuck_analysis_check_failed", error=str(e)) @@ -171,6 +184,7 @@ async def _check_once() -> None: f"Trust Drift 偵測到 {dist.low_count} 個 Playbook 信任度低落" f"(low_ratio: {dist.low_ratio:.1%},mean_trust: {dist.mean_trust:.2f})" ) + violation_codes.append(f"W6:trust_drift:low_count={dist.low_count}") except Exception as e: logger.warning("watchdog_w6_trust_drift_check_failed", error=str(e)) @@ -178,11 +192,12 @@ async def _check_once() -> None: logger.debug("ai_slo_watchdog_all_ok", checks=6) return - # 去重:violations 相同內容 1 小時內不重複發 - # 2026-05-04 ogt: 修正 hash() 非確定性 bug — Python hash() 每次 process 重啟值不同 - # (PYTHONHASHSEED 隨機化),導致每次 rollout 都繞過 dedup。改用 hashlib.sha256 確保跨 pod/重啟一致。 + # 去重:用穩定 violation_codes 計算 
SHA256,避免動態數值(ratio/score)造成每次不同 hash + # 2026-05-04 ogt: dedup 分離顯示字串與 dedup key + # 根因:violations 字串含動態數字(count/ratio/score),每次微變 → SHA256 不同 → dedup 失效 + # 修法:violation_codes 只含 W-code + 穩定類型(W-1 另附排序後的違反指標名稱,W-6 另附 low_count;這兩者變動時仍會產生新的 dedup key),不含浮點數值 import hashlib - _content = "|".join(sorted(violations)) + _content = "|".join(sorted(violation_codes)) dedup_hash = hashlib.sha256(_content.encode()).hexdigest()[:12] dedup_key = f"watchdog:alert:{dedup_hash}" redis = get_redis()