fix(BLOCKER): LLM 連續失敗 — 4 個違反設計處全部修復
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 14m21s

統帥盤點發現飛輪沉默真因:4 個違反既定架構設計的 bug 同時撞車。

P0a — Ollama timeout 違反 GAP-B4 設計
  config.py:OPENCLAW_TIMEOUT 從 120s 改 30s
  原 120s 違反 ADR-052 GAP-B4 (LLM 25s hard timeout) 設計
  致 Ollama 過載時 thread 飢餓 120s 才降級

P0b — AI Router silent skip 觀測性修復
  ai_router.py: not_registered/circuit_open/rate_limit/privacy_skip
  全部累積到 errors 陣列,log all_providers_failed 時可知為何 skip
  原本 errors=["ollama: Timeout"] 但 tried=4 個,無法診斷

P1a — send_text 方法不存在 bug
  ai_router.py:1005 tg.send_text() → tg.send_notification(parse_mode=HTML)
  TelegramGateway 只有 send_notification 沒 send_text
  致 fallback 失敗通知本身失敗(雙重靜默)

P1b — resend_stale_ready_tokens 並發爆炸
  decision_manager.py: 加 asyncio.Semaphore(5) + 200ms throttle
  原本 fire_and_forget N 個 task 同時跑,N=108 時 Ollama embedding
  全部 timeout,包括我打的 live-fire 也被擠爆
  改:max 5 並發 + 每完成喘 200ms

CD 流程審查 (Blocker 1): 完全符合 ADR-039 設計,10-15 min 是預期
不需修,是設計就需要這時間。

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-14 19:37:02 +08:00
parent 35736315ce
commit 8b7e9cbfb8
3 changed files with 49 additions and 12 deletions

View File

@@ -358,8 +358,10 @@ class Settings(BaseSettings):
description="Default Ollama model for RCA analysis",
)
OPENCLAW_TIMEOUT: int = Field(
default=120, # 2026-04-08 ogt: deepseek-r1:14b 實測最慢 54s120s 含 buffer
description="Timeout for OpenClaw AI calls (seconds)",
default=30, # 2026-04-14 Claude Sonnet 4.6: 從 120s 改 30s配合 ADR-052 GAP-B4
# 25s LLM hard timeout + 5s buffer。原 120s 違反 defense-in-depth 設計,
# 導致 Ollama 過載時 thread 飢餓 120s 才降級 fallback。
description="Timeout for OpenClaw AI calls (seconds, aligned with GAP-B4 25s)",
)
# ==========================================================================

View File

@@ -895,16 +895,20 @@ class AIRouterExecutor:
for provider_name in provider_order:
provider = self._registry.get(provider_name)
if not provider:
# 2026-04-14 Claude Sonnet 4.6: silent skip 改 errors 累積(觀測性)
errors.append(f"{provider_name}: not_registered")
continue
# 隱私過濾 (D7)
if require_local and provider.privacy_level != "local":
errors.append(f"{provider_name}: privacy_skip(non_local)")
continue
# 閘門 1: Circuit Breaker (per-provider, C2 修復)
cb = self._get_circuit_breaker(provider_name)
if cb.is_open():
logger.debug("ai_router_circuit_open", provider=provider_name)
errors.append(f"{provider_name}: circuit_open")
logger.warning("ai_router_circuit_open", provider=provider_name)
continue
# 閘門 2: Rate Limiter
@@ -915,6 +919,7 @@ class AIRouterExecutor:
rate_limiter = get_ai_rate_limiter()
allowed, reason = await rate_limiter.check_and_increment(provider_name)
if not allowed:
errors.append(f"{provider_name}: rate_limit({reason})")
logger.info("ai_router_rate_limited", provider=provider_name, reason=reason)
continue
except Exception as e:
@@ -1001,11 +1006,13 @@ class AIRouterExecutor:
from src.services.telegram_gateway import get_telegram_gateway
tg = get_telegram_gateway()
import asyncio as _asyncio
# 2026-04-14 Claude Sonnet 4.6: send_text 方法不存在,改 send_notification
_asyncio.create_task(
tg.send_text(
tg.send_notification(
"⚠️ <b>DIAGNOSE 本地 Provider 不可用</b>\n"
f"已嘗試: {', '.join(provider_order)}\n"
"需要人工介入,雲端 Provider 不會被呼叫(隱私邊界)。"
"需要人工介入,雲端 Provider 不會被呼叫(隱私邊界)。",
parse_mode="HTML",
)
)
except Exception as _tg_e:

View File

@@ -2036,9 +2036,14 @@ class DecisionManager:
重新推送 Telegram 審核卡片。
觸發時機API 啟動lifespan startup+ 管理 API 手動呼叫。
2026-04-14 Claude Sonnet 4.6 修復: 加並發限制防止 Pod 啟動時壓爆 Ollama
- 原 fire_and_forget 同時啟動 N 個 task → N=108 時 Ollama embedding 全部 timeout
- 改 Semaphore 限 5 並發 + 每批 sleep 1s總體 throughput 降低但系統穩定
Returns:
重新推送的 token 數量
"""
import asyncio as _asyncio
import json as _json
from src.core.redis_client import get_redis
from src.db.base import get_db_context
@@ -2046,10 +2051,31 @@ class DecisionManager:
redis = get_redis()
resent = 0
# GAP-A4 後續修復:限制並發 5避免壓爆 Ollama
_sem = _asyncio.Semaphore(5)
async def _bounded_push(incident_obj, proposal_data_obj, _id, _token):
async with _sem:
try:
await _push_decision_to_telegram(incident_obj, proposal_data_obj)
logger.info(
"stale_ready_token_resent",
incident_id=_id,
token=_token,
)
except Exception as _e:
logger.warning(
"stale_ready_token_resend_failed",
incident_id=_id,
error=str(_e),
)
# 每次完成後喘 200ms給 Ollama embedding 恢復空間
await _asyncio.sleep(0.2)
try:
# 掃描所有 decision:* key
cursor = 0
tasks: list[_asyncio.Task] = []
while True:
cursor, keys = await redis.scan(cursor, match="decision:*", count=200)
for key in keys:
@@ -2078,21 +2104,23 @@ class DecisionManager:
if not proposal_data:
continue
_fire_and_forget(
_push_decision_to_telegram(incident, proposal_data)
# 用 Semaphore 限制並發task 自帶 throttle
_task = _asyncio.create_task(
_bounded_push(incident, proposal_data, incident_id, data.get("token", ""))
)
tasks.append(_task)
resent += 1
logger.info(
"stale_ready_token_resent",
incident_id=incident_id,
token=data.get("token", ""),
)
except Exception as _te:
logger.debug("stale_ready_token_scan_error", error=str(_te))
if cursor == 0:
break
# 不等所有 task 完成fire-and-forget 語義保留),但 await 一下讓並發限制生效
if tasks:
logger.info("stale_ready_tokens_throttled_dispatch",
total=len(tasks), max_concurrent=5)
except Exception as e:
logger.warning("resend_stale_ready_tokens_failed", error=str(e))