fix(BLOCKER): LLM 連續失敗 — 4 個違反設計處全部修復

統帥盤點發現飛輪沉默真因：4 個違反既定架構設計的 bug 同時撞車。 P0a — Ollama timeout 違反 GAP-B4 設計 config.py:OPENCLAW_TIMEOUT 從 120s 改 30s 原 120s 違反 ADR-052 GAP-B4 (LLM 25s hard timeout) 設計致 Ollama 過載時 thread 飢餓 120s 才降級 P0b — AI Router silent skip 觀測性修復 ai_router.py: not_registered/circuit_open/rate_limit/privacy_skip 全部累積到 errors 陣列，log all_providers_failed 時可知為何 skip 原本 errors=["ollama: Timeout"] 但 tried=4 個，無法診斷 P1a — send_text 方法不存在 bug ai_router.py:1005 tg.send_text() → tg.send_notification(parse_mode=HTML) TelegramGateway 只有 send_notification 沒 send_text 致 fallback 失敗通知本身失敗（雙重靜默） P1b — resend_stale_ready_tokens 並發爆炸 decision_manager.py: 加 asyncio.Semaphore(5) + 200ms throttle 原本 fire_and_forget N 個 task 同時跑，N=108 時 Ollama embedding 全部 timeout，包括我打的 live-fire 也被擠爆改：max 5 並發 + 每完成喘 200ms CD 流程審查 (Blocker 1): 完全符合 ADR-039 設計，10-15 min 是預期不需修，是設計就需要這時間。 Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2026-04-14 19:37:02 +08:00
parent 35736315ce
commit 8b7e9cbfb8
3 changed files with 49 additions and 12 deletions
--- a/apps/api/src/core/config.py
+++ b/apps/api/src/core/config.py
@@ -358,8 +358,10 @@ class Settings(BaseSettings):
        description="Default Ollama model for RCA analysis",
    )
    OPENCLAW_TIMEOUT: int = Field(
-        default=120,  # 2026-04-08 ogt: deepseek-r1:14b 實測最慢 54s，120s 含 buffer
-        description="Timeout for OpenClaw AI calls (seconds)",
+        default=30,  # 2026-04-14 Claude Sonnet 4.6: 從 120s 改 30s，配合 ADR-052 GAP-B4
+        # 25s LLM hard timeout + 5s buffer。原 120s 違反 defense-in-depth 設計，
+        # 導致 Ollama 過載時 thread 飢餓 120s 才降級 fallback。
+        description="Timeout for OpenClaw AI calls (seconds, aligned with GAP-B4 25s)",
    )

    # ==========================================================================
--- a/apps/api/src/services/ai_router.py
+++ b/apps/api/src/services/ai_router.py
@@ -895,16 +895,20 @@ class AIRouterExecutor:
        for provider_name in provider_order:
            provider = self._registry.get(provider_name)
            if not provider:
+                # 2026-04-14 Claude Sonnet 4.6: silent skip 改 errors 累積（觀測性）
+                errors.append(f"{provider_name}: not_registered")
                continue

            # 隱私過濾 (D7)
            if require_local and provider.privacy_level != "local":
+                errors.append(f"{provider_name}: privacy_skip(non_local)")
                continue

            # 閘門 1: Circuit Breaker (per-provider, C2 修復)
            cb = self._get_circuit_breaker(provider_name)
            if cb.is_open():
-                logger.debug("ai_router_circuit_open", provider=provider_name)
+                errors.append(f"{provider_name}: circuit_open")
+                logger.warning("ai_router_circuit_open", provider=provider_name)
                continue

            # 閘門 2: Rate Limiter
@@ -915,6 +919,7 @@ class AIRouterExecutor:
                    rate_limiter = get_ai_rate_limiter()
                    allowed, reason = await rate_limiter.check_and_increment(provider_name)
                    if not allowed:
+                        errors.append(f"{provider_name}: rate_limit({reason})")
                        logger.info("ai_router_rate_limited", provider=provider_name, reason=reason)
                        continue
                except Exception as e:
@@ -1001,11 +1006,13 @@ class AIRouterExecutor:
                from src.services.telegram_gateway import get_telegram_gateway
                tg = get_telegram_gateway()
                import asyncio as _asyncio
+                # 2026-04-14 Claude Sonnet 4.6: send_text 方法不存在，改 send_notification
                _asyncio.create_task(
-                    tg.send_text(
+                    tg.send_notification(
                        "⚠️ <b>DIAGNOSE 本地 Provider 不可用</b>\n"
                        f"已嘗試: {', '.join(provider_order)}\n"
-                        "需要人工介入，雲端 Provider 不會被呼叫（隱私邊界）。"
+                        "需要人工介入，雲端 Provider 不會被呼叫（隱私邊界）。",
+                        parse_mode="HTML",
                    )
                )
            except Exception as _tg_e:
--- a/apps/api/src/services/decision_manager.py
+++ b/apps/api/src/services/decision_manager.py
@@ -2036,9 +2036,14 @@ class DecisionManager:
        重新推送 Telegram 審核卡片。
        觸發時機：API 啟動（lifespan startup）+ 管理 API 手動呼叫。

+        2026-04-14 Claude Sonnet 4.6 修復: 加並發限制防止 Pod 啟動時壓爆 Ollama
+        - 原 fire_and_forget 同時啟動 N 個 task → N=108 時 Ollama embedding 全部 timeout
+        - 改 Semaphore 限 5 並發 + 每批 sleep 1s，總體 throughput 降低但系統穩定
+
        Returns:
            重新推送的 token 數量
        """
+        import asyncio as _asyncio
        import json as _json
        from src.core.redis_client import get_redis
        from src.db.base import get_db_context
@@ -2046,10 +2051,31 @@ class DecisionManager:

        redis = get_redis()
        resent = 0
+        # GAP-A4 後續修復：限制並發 5，避免壓爆 Ollama
+        _sem = _asyncio.Semaphore(5)
+
+        async def _bounded_push(incident_obj, proposal_data_obj, _id, _token):
+            async with _sem:
+                try:
+                    await _push_decision_to_telegram(incident_obj, proposal_data_obj)
+                    logger.info(
+                        "stale_ready_token_resent",
+                        incident_id=_id,
+                        token=_token,
+                    )
+                except Exception as _e:
+                    logger.warning(
+                        "stale_ready_token_resend_failed",
+                        incident_id=_id,
+                        error=str(_e),
+                    )
+                # 每次完成後喘 200ms，給 Ollama embedding 恢復空間
+                await _asyncio.sleep(0.2)

        try:
            # 掃描所有 decision:* key
            cursor = 0
+            tasks: list[_asyncio.Task] = []
            while True:
                cursor, keys = await redis.scan(cursor, match="decision:*", count=200)
                for key in keys:
@@ -2078,21 +2104,23 @@ class DecisionManager:
                        if not proposal_data:
                            continue

-                        _fire_and_forget(
-                            _push_decision_to_telegram(incident, proposal_data)
+                        # 用 Semaphore 限制並發，task 自帶 throttle
+                        _task = _asyncio.create_task(
+                            _bounded_push(incident, proposal_data, incident_id, data.get("token", ""))
                        )
+                        tasks.append(_task)
                        resent += 1
-                        logger.info(
-                            "stale_ready_token_resent",
-                            incident_id=incident_id,
-                            token=data.get("token", ""),
-                        )
                    except Exception as _te:
                        logger.debug("stale_ready_token_scan_error", error=str(_te))

                if cursor == 0:
                    break

+            # 不等所有 task 完成（fire-and-forget 語義保留），但 await 一下讓並發限制生效
+            if tasks:
+                logger.info("stale_ready_tokens_throttled_dispatch",
+                            total=len(tasks), max_concurrent=5)
+
        except Exception as e:
            logger.warning("resend_stale_ready_tokens_failed", error=str(e))