From aa4ccec429419fc68eeb992f73f27962d9e85fa1 Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 5 May 2026 10:31:53 +0800 Subject: [PATCH] =?UTF-8?q?fix(watchdog):=20ADR-092=20B4=20=E2=80=94=20?= =?UTF-8?q?=E4=B8=89=E5=B1=A4=E4=BF=AE=E5=BE=A9=E6=B6=88=E9=99=A4=20META?= =?UTF-8?q?=20SYSTEM=20=E9=87=8D=E8=A4=87=E5=91=8A=E8=AD=A6=20+=20Ollama?= =?UTF-8?q?=20=E8=B7=AF=E7=94=B1=E5=BC=B7=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 問題根因(debugger 全景徹查): 1. Prod 仍跑舊版代碼(ec013f66 後的修法未部署 → 告警字串仍含舊格式) 2. replicas=2 時 Pod 間 grace period 不共享 → violation_codes 分歧 → 不同 SHA256 → dedup 失效 3. 新 Pod 啟動立即執行 _check_once() → rollout 時多發一波 4. W6 violation_codes 含動態 low_count → count 微變繞過 dedup 修復(A2/A3/W6/C1/C2): - A2:run_ai_slo_watchdog_loop 加 90s leading sleep,避免 rollout 立即觸發 - A3:_grace_active() 改為 Redis cluster-shared(watchdog:cluster_grace, ex=1800s, nx=True) 消除 Pod 間 grace period 不一致;Redis 故障時 fallback 為 process-local monotonic - W6:violation_codes 移除動態 low_count,改為穩定 "W6:trust_drift" - C1:ollama_auto_recovery.py recovered_host 改動態 label(依 URL port 判斷 GCP-A/B/Local) - C2:ConfigMap OLLAMA_FALLBACK_URL 改走 110:11437 nginx proxy,三層容災統一架構 Co-Authored-By: Claude Sonnet 4.6 --- apps/api/src/jobs/ai_slo_watchdog_job.py | 51 +++++++++++++++---- apps/api/src/services/ollama_auto_recovery.py | 13 ++++- k8s/awoooi-prod/04-configmap.yaml | 7 +-- 3 files changed, 57 insertions(+), 14 deletions(-) diff --git a/apps/api/src/jobs/ai_slo_watchdog_job.py b/apps/api/src/jobs/ai_slo_watchdog_job.py index a0ab180b..223b8ea3 100644 --- a/apps/api/src/jobs/ai_slo_watchdog_job.py +++ b/apps/api/src/jobs/ai_slo_watchdog_job.py @@ -6,6 +6,11 @@ ADR-092 (2026-04-20 ogt + Claude Opus 4.7 Asia/Taipei) ADR-092 B3 (2026-04-24 ogt + Claude Sonnet 4.6 Asia/Taipei): W-2 修復:改用 telegram_message_id IS NULL 判斷真正靜默,排除 tg_sent TTL 過期誤判 W-5 新增:Agent Debate 失敗導致告警卡在分析中(description='待分析') +ADR-092 B4 (2026-05-05 ogt + Claude Sonnet 4.6 Asia/Taipei): + A2 修復:新 Pod 啟動後 90s leading sleep,避免 rollout 時立即觸發告警 + A3 修復:grace period 改為 Redis cluster-shared(watchdog:cluster_grace), + 消除 replicas=2 時 Pod 間 grace period 不一致造成 violation_codes 分歧 + W6 修復:dedup key 移除動態 low_count,改為穩定 "W6:trust_drift" 檢查項目: W-1 AI SLO 違反(決策品質,7d 滾動) @@ -13,6 +18,7 @@ ADR-092 B3 (2026-04-24 ogt + Claude Sonnet 4.6 Asia/Taipei): W-3 飛輪 execution_success_rate 低落(< 30%) W-4 無 APPROVED Playbook(自動修復鏈路斷裂) W-5 Agent Debate 失敗(PENDING 告警 description='待分析' 超過 1 小時) + W-6 Trust Drift 偵測(Playbook 信任度漂移) 任一異常 → send_meta_alert(TYPE-8M,flywheel_health) 去重:Redis watchdog:alert:{dedup_hash} TTL 1h,避免每 15 分鐘重複洗版 @@ -45,20 +51,46 @@ _STUCK_ANALYSIS_THRESHOLD = 3 # Agent Debate 失敗導致卡住的告警門檻 # 啟動寬限期:30 分鐘內可 skip「資料還沒到」噪音;超過寬限期仍空 = 真資料管線斷,必須告警 # 不可單獨用 skip 吞告警 — 一定要配對打「初始化期過、資料應該來但沒來」新告警 _INIT_GRACE_SEC = 1800 +# 2026-05-05 ogt A3:_PROCESS_START 僅作 Redis 故障時的 fallback _PROCESS_START = time.monotonic() +# 2026-05-05 ogt A2:新 Pod 啟動 leading sleep,避免 rollout 時立即觸發告警 +# 90s < dedup TTL(3600s),不影響正常告警時效 +_STARTUP_SLEEP_SEC = 90 -def _grace_active() -> bool: - """啟動 30 分鐘內為寬限期;超過後資料缺失必須告警""" - return (time.monotonic() - _PROCESS_START) < _INIT_GRACE_SEC +# Redis key for cluster-shared grace period(A3) +_GRACE_REDIS_KEY = "watchdog:cluster_grace" + + +async def _is_grace_active() -> bool: + """ + 叢集級別啟動寬限期(A3 修復)。 + 第一個 Pod 執行時 SET nx=True,後續 Pod SET 失敗但 key 仍存在。 + key TTL = _INIT_GRACE_SEC(30min);到期後 grace 結束。 + Redis 故障時降級為 process-local monotonic 判斷(fail-safe)。 + 2026-05-05 ogt + Claude Sonnet 4.6 — ADR-092 B4 + """ + try: + redis = get_redis() + await redis.set(_GRACE_REDIS_KEY, "1", nx=True, ex=_INIT_GRACE_SEC) + return bool(await redis.exists(_GRACE_REDIS_KEY)) + except Exception: + return (time.monotonic() - _PROCESS_START) < _INIT_GRACE_SEC async def run_ai_slo_watchdog_loop() -> None: """ 永久迴圈:每 15 分鐘自健診,異常時發送 TYPE-8M Meta-System 告警。 由 main.py lifespan 透過 asyncio.create_task() 啟動。 + A2:先 sleep 90s 再開始第一次 check,避免新 Pod 上線立即觸發告警。 """ - logger.info("ai_slo_watchdog_started", interval_sec=_INTERVAL_SEC) + logger.info( + "ai_slo_watchdog_started", + interval_sec=_INTERVAL_SEC, + startup_sleep_sec=_STARTUP_SLEEP_SEC, + ) + # A2 修復:Leading sleep — 讓服務先穩定,避免 rollout 時立即觸發 + await asyncio.sleep(_STARTUP_SLEEP_SEC) while True: try: await _check_once() @@ -75,6 +107,8 @@ async def _check_once() -> None: # 修法:dedup 用穩定 violation_codes(W-N:type 格式),Telegram 照常顯示動態值 violations: list[str] = [] violation_codes: list[str] = [] + # A3 修復:cluster-shared grace period,單次查詢供所有 W-check 使用,避免 Pod 間不一致 + grace = await _is_grace_active() # W-1: AI SLO 違反(決策品質 7d 滾動) try: @@ -112,11 +146,10 @@ async def _check_once() -> None: from src.services.flywheel_stats_service import FlywheelStatsService metrics = await FlywheelStatsService().compute() if metrics and metrics.execution_success_rate is None: - if _grace_active(): + if grace: logger.debug( "watchdog_w3_init_grace_skip", reason="execution_sample_below_min", - uptime_sec=int(time.monotonic() - _PROCESS_START), ) else: violations.append( @@ -139,11 +172,10 @@ async def _check_once() -> None: try: approved_count, total_playbook_count = await _count_approved_playbooks() if total_playbook_count == 0: - if _grace_active(): + if grace: logger.info( "watchdog_w4_init_grace_skip", reason="playbook_table_empty_likely_initializing", - uptime_sec=int(time.monotonic() - _PROCESS_START), ) else: violations.append( @@ -184,7 +216,8 @@ async def _check_once() -> None: f"Trust Drift 偵測到 {dist.low_count} 個 Playbook 信任度低落" f"(low_ratio: {dist.low_ratio:.1%},mean_trust: {dist.mean_trust:.2f})" ) - violation_codes.append(f"W6:trust_drift:low_count={dist.low_count}") + # 2026-05-05 ogt W6 修復:移除動態 low_count,避免 count 微變繞過 dedup + violation_codes.append("W6:trust_drift") except Exception as e: logger.warning("watchdog_w6_trust_drift_check_failed", error=str(e)) diff --git a/apps/api/src/services/ollama_auto_recovery.py b/apps/api/src/services/ollama_auto_recovery.py index e46fadc5..9b3dc4d8 100644 --- a/apps/api/src/services/ollama_auto_recovery.py +++ b/apps/api/src/services/ollama_auto_recovery.py @@ -423,12 +423,21 @@ class OllamaAutoRecoveryService: if alerter is None: from src.services.failover_alerter import get_failover_alerter alerter = get_failover_alerter() - # 2026-05-03 ogt: ADR-110 — recovered_host 動態,顯示恢復的實際主機 URL + # 2026-05-05 ogt C1 修復:動態判斷 provider label,避免 hardcode "GCP-A" + # 依 OLLAMA_URL port 判斷:11435=GCP-A、11436=GCP-B、11437/111=Local _recovered_url = getattr(self._settings, "OLLAMA_URL", "Ollama Primary") + if "11435" in _recovered_url: + _provider_label = "GCP-A" + elif "11436" in _recovered_url: + _provider_label = "GCP-B" + elif "11437" in _recovered_url or "192.168.0.111" in _recovered_url: + _provider_label = "Local" + else: + _provider_label = "Ollama" await alerter.alert_recovery({ "from": from_provider, "to": "ollama", - "recovered_host": f"GCP-A {_recovered_url}", + "recovered_host": f"{_provider_label} {_recovered_url}", "stable_count": stable_count, "recovery_time": recovery_time.isoformat(), }) diff --git a/k8s/awoooi-prod/04-configmap.yaml b/k8s/awoooi-prod/04-configmap.yaml index e5a0e6b7..036c3886 100644 --- a/k8s/awoooi-prod/04-configmap.yaml +++ b/k8s/awoooi-prod/04-configmap.yaml @@ -18,12 +18,13 @@ data: # 2026-04-16 ogt + Claude Sonnet 4.6: 改指向 111(GPU 機,RTX) # 188 = CPU-only Ollama,推理極慢(>60s);111 有 GPU,avg 10s # 2026-05-04 ogt: ADR-110 三層容災正式路由(nginx proxy 架設完成後恢復 GCP 優先) - # GCP-A(via 110:11435) → GCP-B(via 110:11436) → 111 兜底 - # 110 nginx proxy 轉發:11435 → 34.143.170.20:11434, 11436 → 34.21.145.224:11434 + # GCP-A(via 110:11435) → GCP-B(via 110:11436) → Local(via 110:11437) 統一走 nginx proxy + # 110 nginx proxy 轉發:11435→GCP-A, 11436→GCP-B, 11437→192.168.0.111:11434 # K8s pods 不可直連 GCP:11434(NetworkPolicy 外網 egress 只開 443) + # 2026-05-05 ogt C2 修復:Local fallback 改走 110:11437 統一 nginx proxy,增強可觀測性 OLLAMA_URL: "http://192.168.0.110:11435" OLLAMA_SECONDARY_URL: "http://192.168.0.110:11436" - OLLAMA_FALLBACK_URL: "http://192.168.0.111:11434" + OLLAMA_FALLBACK_URL: "http://192.168.0.110:11437" OPENCLAW_URL: "http://192.168.0.188:8088" KALI_SCANNER_URL: "http://192.168.0.112:8080" SIGNOZ_URL: "http://192.168.0.188:3301"