From 36754a8a844b82742f822bde3557d336a6234739 Mon Sep 17 00:00:00 2001 From: OG T Date: Tue, 14 Apr 2026 20:38:00 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20Bug=20A=20=E8=A8=BA=E6=96=B7=20+=20Bug?= =?UTF-8?q?=20B=20=E7=9C=9F=E4=BF=AE=20=E2=80=94=20LLM=20120s/130s=20?= =?UTF-8?q?=E7=A1=AC=E7=B7=A8=20=E2=86=92=20OPENCLAW=5FTIMEOUT?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 殘留兩個深層 bug 處理: Bug A (approval.incident_id 仍 NULL) — 加診斷 - update_incident_id 加 rowcount 檢查 - 若 UPDATE 0 rows affected → warning log (id 型別 mismatch 或 session 不同步) - 手動 UPDATE 測試通過 → DB/permissions 正常,問題在應用層 - 等 CD 部署後 live-fire 觀察 log 診斷真因 Bug B (LLM 仍 2m6s >> 30s) — 真修 openclaw.py 兩處硬編 timeout: - line 146 httpx client default: 120.0s → settings.OPENCLAW_TIMEOUT (30s) - line 348 /analyze/incident POST: 130.0s → settings.OPENCLAW_TIMEOUT (30s) GAP-B4 commit dd0a778 只修了 ai_providers/ollama.py 但 openclaw.py 自己的 httpx client 和 endpoint call 沒改 這就是為什麼 Live-fire #2-#7 都卡 120s+ 的真因 回歸測試: 125/125 (dispatcher + a4 + classify + grouping) Co-Authored-By: Claude Haiku 4.5 --- apps/api/src/services/approval_db.py | 22 +++++++++++++++++++++- apps/api/src/services/openclaw.py | 13 ++++++++++--- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/apps/api/src/services/approval_db.py b/apps/api/src/services/approval_db.py index 9ae36fff..677d2fea 100644 --- a/apps/api/src/services/approval_db.py +++ b/apps/api/src/services/approval_db.py @@ -606,13 +606,33 @@ class ApprovalDBService: """ 2026-04-06 ogt: Phase 26 — 回寫 incident_id 到 approval_records 讓 Playbook 萃取和 KM 寫入能找到對應的 Incident + + 2026-04-14 Claude Sonnet 4.6 診斷: Live-fire #7 發現 approval.incident_id 仍 NULL + 加 rowcount 檢查,若 0 rows affected 則 log warning """ async with get_db_context() as db: - await db.execute( + result = await db.execute( update(ApprovalRecord) .where(ApprovalRecord.id == str(approval_id)) .values(incident_id=incident_id) ) + rowcount = result.rowcount if hasattr(result, 
"rowcount") else -1 + if rowcount == 0: + # 找不到對應 approval — 可能 id 型別或 session 不同步 + logger.warning( + "update_incident_id_zero_rows", + approval_id=str(approval_id), + approval_id_type=type(approval_id).__name__, + incident_id=incident_id, + reason="UPDATE 0 rows affected — approval 不存在或 id mismatch", + ) + else: + logger.info( + "update_incident_id_success", + approval_id=str(approval_id), + incident_id=incident_id, + rowcount=rowcount, + ) async def update_telegram_message( self, incident_id: str, telegram_message_id: int, telegram_chat_id: int | None = None diff --git a/apps/api/src/services/openclaw.py b/apps/api/src/services/openclaw.py index 5ef58e18..ee887690 100644 --- a/apps/api/src/services/openclaw.py +++ b/apps/api/src/services/openclaw.py @@ -140,10 +140,15 @@ class OpenClawService: self._signoz = get_signoz_client() async def _get_client(self) -> httpx.AsyncClient: - """取得 HTTP 客戶端""" + """取得 HTTP 客戶端 + + 2026-04-14 Claude Sonnet 4.6: 從硬編 120s 改用 OPENCLAW_TIMEOUT 設定 (30s) + 對齊 ADR-052 GAP-B4 的 25s + 5s buffer 設計。原 120s 違反 defense-in-depth。 + """ if self._http_client is None or self._http_client.is_closed: + _t = float(settings.OPENCLAW_TIMEOUT) self._http_client = httpx.AsyncClient( - timeout=httpx.Timeout(120.0, connect=10.0), + timeout=httpx.Timeout(_t, connect=10.0), ) return self._http_client @@ -342,10 +347,12 @@ class OpenClawService: "affected_services": affected_services, "expert_context": _to_serializable(expert_context) if expert_context else None, } + # 2026-04-14 Claude Sonnet 4.6: 從硬編 130s 改用 OPENCLAW_TIMEOUT + # 原 130s 讓 LLM 能卡 2m10s,超過 Ollama 真實返回時間(P95 54s) resp = await client.post( f"{settings.OPENCLAW_URL}/api/v1/analyze/incident", json=payload, - timeout=httpx.Timeout(130.0, connect=5.0), + timeout=httpx.Timeout(float(settings.OPENCLAW_TIMEOUT), connect=5.0), ) resp.raise_for_status() data = resp.json()