From 8b7e9cbfb825b9e6f219ddbcb9560fd3efea9657 Mon Sep 17 00:00:00 2001 From: OG T Date: Tue, 14 Apr 2026 19:37:02 +0800 Subject: [PATCH] =?UTF-8?q?fix(BLOCKER):=20LLM=20=E9=80=A3=E7=BA=8C?= =?UTF-8?q?=E5=A4=B1=E6=95=97=20=E2=80=94=204=20=E5=80=8B=E9=81=95?= =?UTF-8?q?=E5=8F=8D=E8=A8=AD=E8=A8=88=E8=99=95=E5=85=A8=E9=83=A8=E4=BF=AE?= =?UTF-8?q?=E5=BE=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 統帥盤點發現飛輪沉默真因:4 個違反既定架構設計的 bug 同時撞車。 P0a — Ollama timeout 違反 GAP-B4 設計 config.py:OPENCLAW_TIMEOUT 從 120s 改 30s 原 120s 違反 ADR-052 GAP-B4 (LLM 25s hard timeout) 設計 致 Ollama 過載時 thread 飢餓 120s 才降級 P0b — AI Router silent skip 觀測性修復 ai_router.py: not_registered/circuit_open/rate_limit/privacy_skip 全部累積到 errors 陣列,log all_providers_failed 時可知為何 skip 原本 errors=["ollama: Timeout"] 但 tried=4 個,無法診斷 P1a — send_text 方法不存在 bug ai_router.py:1005 tg.send_text() → tg.send_notification(parse_mode=HTML) TelegramGateway 只有 send_notification 沒 send_text 致 fallback 失敗通知本身失敗(雙重靜默) P1b — resend_stale_ready_tokens 並發爆炸 decision_manager.py: 加 asyncio.Semaphore(5) + 200ms throttle 原本 fire_and_forget N 個 task 同時跑,N=108 時 Ollama embedding 全部 timeout,包括我打的 live-fire 也被擠爆 改:max 5 並發 + 每完成喘 200ms CD 流程審查 (Blocker 1): 完全符合 ADR-039 設計,10-15 min 是預期 不需修,是設計就需要這時間。 Co-Authored-By: Claude Haiku 4.5 --- apps/api/src/core/config.py | 6 ++-- apps/api/src/services/ai_router.py | 13 +++++-- apps/api/src/services/decision_manager.py | 42 +++++++++++++++++++---- 3 files changed, 49 insertions(+), 12 deletions(-) diff --git a/apps/api/src/core/config.py b/apps/api/src/core/config.py index a216c407..7913ea31 100644 --- a/apps/api/src/core/config.py +++ b/apps/api/src/core/config.py @@ -358,8 +358,10 @@ class Settings(BaseSettings): description="Default Ollama model for RCA analysis", ) OPENCLAW_TIMEOUT: int = Field( - default=120, # 2026-04-08 ogt: deepseek-r1:14b 實測最慢 54s,120s 含 buffer - description="Timeout for OpenClaw AI calls (seconds)", + default=30, # 2026-04-14 Claude Sonnet 4.6: 從 120s 改 30s,配合 ADR-052 GAP-B4 + # 25s LLM hard timeout + 5s buffer。原 120s 違反 defense-in-depth 設計, + # 導致 Ollama 過載時 thread 飢餓 120s 才降級 fallback。 + description="Timeout for OpenClaw AI calls (seconds, aligned with GAP-B4 25s)", ) # ========================================================================== diff --git a/apps/api/src/services/ai_router.py b/apps/api/src/services/ai_router.py index b39af2f1..6958985b 100644 --- a/apps/api/src/services/ai_router.py +++ b/apps/api/src/services/ai_router.py @@ -895,16 +895,20 @@ class AIRouterExecutor: for provider_name in provider_order: provider = self._registry.get(provider_name) if not provider: + # 2026-04-14 Claude Sonnet 4.6: silent skip 改 errors 累積(觀測性) + errors.append(f"{provider_name}: not_registered") continue # 隱私過濾 (D7) if require_local and provider.privacy_level != "local": + errors.append(f"{provider_name}: privacy_skip(non_local)") continue # 閘門 1: Circuit Breaker (per-provider, C2 修復) cb = self._get_circuit_breaker(provider_name) if cb.is_open(): - logger.debug("ai_router_circuit_open", provider=provider_name) + errors.append(f"{provider_name}: circuit_open") + logger.warning("ai_router_circuit_open", provider=provider_name) continue # 閘門 2: Rate Limiter @@ -915,6 +919,7 @@ class AIRouterExecutor: rate_limiter = get_ai_rate_limiter() allowed, reason = await rate_limiter.check_and_increment(provider_name) if not allowed: + errors.append(f"{provider_name}: rate_limit({reason})") logger.info("ai_router_rate_limited", provider=provider_name, reason=reason) continue except Exception as e: @@ -1001,11 +1006,13 @@ class AIRouterExecutor: from src.services.telegram_gateway import get_telegram_gateway tg = get_telegram_gateway() import asyncio as _asyncio + # 2026-04-14 Claude Sonnet 4.6: send_text 方法不存在,改 send_notification _asyncio.create_task( - tg.send_text( + tg.send_notification( "⚠️ DIAGNOSE 本地 Provider 不可用\n" f"已嘗試: {', '.join(provider_order)}\n" - "需要人工介入,雲端 Provider 不會被呼叫(隱私邊界)。" + "需要人工介入,雲端 Provider 不會被呼叫(隱私邊界)。", + parse_mode="HTML", ) ) except Exception as _tg_e: diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index 7ca7f642..f0c9b02e 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -2036,9 +2036,14 @@ class DecisionManager: 重新推送 Telegram 審核卡片。 觸發時機:API 啟動(lifespan startup)+ 管理 API 手動呼叫。 + 2026-04-14 Claude Sonnet 4.6 修復: 加並發限制防止 Pod 啟動時壓爆 Ollama + - 原 fire_and_forget 同時啟動 N 個 task → N=108 時 Ollama embedding 全部 timeout + - 改 Semaphore 限 5 並發 + 每批 sleep 1s,總體 throughput 降低但系統穩定 + Returns: 重新推送的 token 數量 """ + import asyncio as _asyncio import json as _json from src.core.redis_client import get_redis from src.db.base import get_db_context @@ -2046,10 +2051,31 @@ class DecisionManager: redis = get_redis() resent = 0 + # GAP-A4 後續修復:限制並發 5,避免壓爆 Ollama + _sem = _asyncio.Semaphore(5) + + async def _bounded_push(incident_obj, proposal_data_obj, _id, _token): + async with _sem: + try: + await _push_decision_to_telegram(incident_obj, proposal_data_obj) + logger.info( + "stale_ready_token_resent", + incident_id=_id, + token=_token, + ) + except Exception as _e: + logger.warning( + "stale_ready_token_resend_failed", + incident_id=_id, + error=str(_e), + ) + # 每次完成後喘 200ms,給 Ollama embedding 恢復空間 + await _asyncio.sleep(0.2) try: # 掃描所有 decision:* key cursor = 0 + tasks: list[_asyncio.Task] = [] while True: cursor, keys = await redis.scan(cursor, match="decision:*", count=200) for key in keys: @@ -2078,21 +2104,23 @@ class DecisionManager: if not proposal_data: continue - _fire_and_forget( - _push_decision_to_telegram(incident, proposal_data) + # 用 Semaphore 限制並發,task 自帶 throttle + _task = _asyncio.create_task( + _bounded_push(incident, proposal_data, incident_id, data.get("token", "")) ) + tasks.append(_task) resent += 1 - logger.info( - "stale_ready_token_resent", - incident_id=incident_id, - token=data.get("token", ""), - ) except Exception as _te: logger.debug("stale_ready_token_scan_error", error=str(_te)) if cursor == 0: break + # 不等所有 task 完成(fire-and-forget 語義保留),但 await 一下讓並發限制生效 + if tasks: + logger.info("stale_ready_tokens_throttled_dispatch", + total=len(tasks), max_concurrent=5) + except Exception as e: logger.warning("resend_stale_ready_tokens_failed", error=str(e))