fix(BLOCKER): LLM 連續失敗 — 4 個違反設計處全部修復
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 14m21s
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 14m21s
統帥盤點發現飛輪沉默真因:4 個違反既定架構設計的 bug 同時撞車。 P0a — Ollama timeout 違反 GAP-B4 設計 config.py:OPENCLAW_TIMEOUT 從 120s 改 30s 原 120s 違反 ADR-052 GAP-B4 (LLM 25s hard timeout) 設計 致 Ollama 過載時 thread 飢餓 120s 才降級 P0b — AI Router silent skip 觀測性修復 ai_router.py: not_registered/circuit_open/rate_limit/privacy_skip 全部累積到 errors 陣列,log all_providers_failed 時可知為何 skip 原本 errors=["ollama: Timeout"] 但 tried=4 個,無法診斷 P1a — send_text 方法不存在 bug ai_router.py:1005 tg.send_text() → tg.send_notification(parse_mode=HTML) TelegramGateway 只有 send_notification 沒 send_text 致 fallback 失敗通知本身失敗(雙重靜默) P1b — resend_stale_ready_tokens 並發爆炸 decision_manager.py: 加 asyncio.Semaphore(5) + 200ms throttle 原本 fire_and_forget N 個 task 同時跑,N=108 時 Ollama embedding 全部 timeout,包括我打的 live-fire 也被擠爆 改:max 5 並發 + 每完成喘 200ms CD 流程審查 (Blocker 1): 完全符合 ADR-039 設計,10-15 min 是預期 不需修,是設計就需要這時間。 Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -358,8 +358,10 @@ class Settings(BaseSettings):
|
||||
description="Default Ollama model for RCA analysis",
|
||||
)
|
||||
OPENCLAW_TIMEOUT: int = Field(
|
||||
default=120, # 2026-04-08 ogt: deepseek-r1:14b 實測最慢 54s,120s 含 buffer
|
||||
description="Timeout for OpenClaw AI calls (seconds)",
|
||||
default=30, # 2026-04-14 Claude Sonnet 4.6: 從 120s 改 30s,配合 ADR-052 GAP-B4
|
||||
# 25s LLM hard timeout + 5s buffer。原 120s 違反 defense-in-depth 設計,
|
||||
# 導致 Ollama 過載時 thread 飢餓 120s 才降級 fallback。
|
||||
description="Timeout for OpenClaw AI calls (seconds, aligned with GAP-B4 25s)",
|
||||
)
|
||||
|
||||
# ==========================================================================
|
||||
|
||||
@@ -895,16 +895,20 @@ class AIRouterExecutor:
|
||||
for provider_name in provider_order:
|
||||
provider = self._registry.get(provider_name)
|
||||
if not provider:
|
||||
# 2026-04-14 Claude Sonnet 4.6: silent skip 改 errors 累積(觀測性)
|
||||
errors.append(f"{provider_name}: not_registered")
|
||||
continue
|
||||
|
||||
# 隱私過濾 (D7)
|
||||
if require_local and provider.privacy_level != "local":
|
||||
errors.append(f"{provider_name}: privacy_skip(non_local)")
|
||||
continue
|
||||
|
||||
# 閘門 1: Circuit Breaker (per-provider, C2 修復)
|
||||
cb = self._get_circuit_breaker(provider_name)
|
||||
if cb.is_open():
|
||||
logger.debug("ai_router_circuit_open", provider=provider_name)
|
||||
errors.append(f"{provider_name}: circuit_open")
|
||||
logger.warning("ai_router_circuit_open", provider=provider_name)
|
||||
continue
|
||||
|
||||
# 閘門 2: Rate Limiter
|
||||
@@ -915,6 +919,7 @@ class AIRouterExecutor:
|
||||
rate_limiter = get_ai_rate_limiter()
|
||||
allowed, reason = await rate_limiter.check_and_increment(provider_name)
|
||||
if not allowed:
|
||||
errors.append(f"{provider_name}: rate_limit({reason})")
|
||||
logger.info("ai_router_rate_limited", provider=provider_name, reason=reason)
|
||||
continue
|
||||
except Exception as e:
|
||||
@@ -1001,11 +1006,13 @@ class AIRouterExecutor:
|
||||
from src.services.telegram_gateway import get_telegram_gateway
|
||||
tg = get_telegram_gateway()
|
||||
import asyncio as _asyncio
|
||||
# 2026-04-14 Claude Sonnet 4.6: send_text 方法不存在,改 send_notification
|
||||
_asyncio.create_task(
|
||||
tg.send_text(
|
||||
tg.send_notification(
|
||||
"⚠️ <b>DIAGNOSE 本地 Provider 不可用</b>\n"
|
||||
f"已嘗試: {', '.join(provider_order)}\n"
|
||||
"需要人工介入,雲端 Provider 不會被呼叫(隱私邊界)。"
|
||||
"需要人工介入,雲端 Provider 不會被呼叫(隱私邊界)。",
|
||||
parse_mode="HTML",
|
||||
)
|
||||
)
|
||||
except Exception as _tg_e:
|
||||
|
||||
@@ -2036,9 +2036,14 @@ class DecisionManager:
|
||||
重新推送 Telegram 審核卡片。
|
||||
觸發時機:API 啟動(lifespan startup)+ 管理 API 手動呼叫。
|
||||
|
||||
2026-04-14 Claude Sonnet 4.6 修復: 加並發限制防止 Pod 啟動時壓爆 Ollama
|
||||
- 原 fire_and_forget 同時啟動 N 個 task → N=108 時 Ollama embedding 全部 timeout
|
||||
- 改 Semaphore 限 5 並發 + 每批 sleep 1s,總體 throughput 降低但系統穩定
|
||||
|
||||
Returns:
|
||||
重新推送的 token 數量
|
||||
"""
|
||||
import asyncio as _asyncio
|
||||
import json as _json
|
||||
from src.core.redis_client import get_redis
|
||||
from src.db.base import get_db_context
|
||||
@@ -2046,10 +2051,31 @@ class DecisionManager:
|
||||
|
||||
redis = get_redis()
|
||||
resent = 0
|
||||
# GAP-A4 後續修復:限制並發 5,避免壓爆 Ollama
|
||||
_sem = _asyncio.Semaphore(5)
|
||||
|
||||
async def _bounded_push(incident_obj, proposal_data_obj, _id, _token):
|
||||
async with _sem:
|
||||
try:
|
||||
await _push_decision_to_telegram(incident_obj, proposal_data_obj)
|
||||
logger.info(
|
||||
"stale_ready_token_resent",
|
||||
incident_id=_id,
|
||||
token=_token,
|
||||
)
|
||||
except Exception as _e:
|
||||
logger.warning(
|
||||
"stale_ready_token_resend_failed",
|
||||
incident_id=_id,
|
||||
error=str(_e),
|
||||
)
|
||||
# 每次完成後喘 200ms,給 Ollama embedding 恢復空間
|
||||
await _asyncio.sleep(0.2)
|
||||
|
||||
try:
|
||||
# 掃描所有 decision:* key
|
||||
cursor = 0
|
||||
tasks: list[_asyncio.Task] = []
|
||||
while True:
|
||||
cursor, keys = await redis.scan(cursor, match="decision:*", count=200)
|
||||
for key in keys:
|
||||
@@ -2078,21 +2104,23 @@ class DecisionManager:
|
||||
if not proposal_data:
|
||||
continue
|
||||
|
||||
_fire_and_forget(
|
||||
_push_decision_to_telegram(incident, proposal_data)
|
||||
# 用 Semaphore 限制並發,task 自帶 throttle
|
||||
_task = _asyncio.create_task(
|
||||
_bounded_push(incident, proposal_data, incident_id, data.get("token", ""))
|
||||
)
|
||||
tasks.append(_task)
|
||||
resent += 1
|
||||
logger.info(
|
||||
"stale_ready_token_resent",
|
||||
incident_id=incident_id,
|
||||
token=data.get("token", ""),
|
||||
)
|
||||
except Exception as _te:
|
||||
logger.debug("stale_ready_token_scan_error", error=str(_te))
|
||||
|
||||
if cursor == 0:
|
||||
break
|
||||
|
||||
# 不等所有 task 完成(fire-and-forget 語義保留),但 await 一下讓並發限制生效
|
||||
if tasks:
|
||||
logger.info("stale_ready_tokens_throttled_dispatch",
|
||||
total=len(tasks), max_concurrent=5)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning("resend_stale_ready_tokens_failed", error=str(e))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user