async def _call_nemotron_tools_via_gemini(
    self,
    incident_id: str,
    reasoning: str,
    target_resource: str,
    suggested_action: str,
    namespace: str = "awoooi-prod",
) -> dict:
    """Ask Gemini to stand in for NIM and produce a tool-calling plan.

    This is the fallback used after every Nemotron/NIM retry has failed
    (2026-04-08 ogt: the only fallback after a NIM timeout; must not be
    skipped). Gemini is prompted for a single JSON object describing one
    deployment-level action, and that reply is converted into the same
    ``tools`` structure the Nemotron path fills in on the proposal.

    Args:
        incident_id: Incident identifier; embedded in the prompt and logs.
        reasoning: Analysis summary; only the first 300 characters are sent.
            ``None`` is tolerated and treated as an empty summary.
        target_resource: Resource under investigation; used as the
            deployment name when Gemini does not supply one.
        suggested_action: Action proposed by the upstream analysis.
        namespace: Kubernetes namespace; used when Gemini does not echo one.

    Returns:
        ``{"tools": [...], "validation": str, "latency_ms": float}``.
        ``tools`` is empty when Gemini is unavailable, answers ``no_action``,
        names a tool outside the allow-list, or returns unparseable output.
        This method does not raise; failures are reported via ``validation``.
    """
    import json as _json
    import time as _time

    # Only the tools offered in the prompt may be marked valid; anything else
    # is model output we have no reason to trust.
    allowed_tools = ("restart_deployment", "scale_deployment")

    # perf_counter is monotonic, so latency cannot go negative or jump when
    # the wall clock is adjusted.
    started = _time.perf_counter()

    # `reasoning` may arrive as None when the proposal carries an explicit
    # null; slicing None would raise before the fallback even starts.
    summary = (reasoning or "")[:300]

    prompt = f"""你是 K8s SRE 專家。根據以下分析,輸出對應的 kubectl 操作指令(JSON 格式)。

Incident ID: {incident_id}
目標資源: {target_resource}
Namespace: {namespace}
建議操作: {suggested_action}
分析摘要: {summary}

請輸出以下 JSON 格式(只輸出 JSON,不要其他文字):
{{
  "tool_name": "restart_deployment 或 scale_deployment 或 no_action",
  "deployment_name": "部署名稱",
  "namespace": "{namespace}",
  "reason": "一句話說明原因"
}}"""

    try:
        text, success, _, _ = await self._call_gemini(prompt)
        latency_ms = (_time.perf_counter() - started) * 1000

        if not success:
            logger.warning("nemotron_gemini_fallback_failed", incident_id=incident_id, error=text)
            return {"tools": [], "validation": "❌ NIM + Gemini 均不可用", "latency_ms": latency_ms}

        # LLM replies are often wrapped in a Markdown code fence or padded
        # with prose even when told to emit bare JSON, so parse only the
        # outermost {...} span when one exists.
        raw = (text or "").strip()
        first, last = raw.find("{"), raw.rfind("}")
        payload = raw[first:last + 1] if 0 <= first < last else raw
        data = _json.loads(payload)
        if not isinstance(data, dict):
            raise ValueError("Gemini reply is not a JSON object")

        tool_name = data.get("tool_name") or "no_action"

        if tool_name != "no_action" and tool_name not in allowed_tools:
            # Never stamp an unknown tool as valid; surface it as a failure.
            logger.warning(
                "nemotron_gemini_fallback_failed",
                incident_id=incident_id,
                error=f"unknown tool: {str(tool_name)[:50]}",
            )
            return {
                "tools": [],
                "validation": f"❌ Gemini 代理回傳未知工具: {str(tool_name)[:50]}",
                "latency_ms": latency_ms,
            }

        tools = []
        if tool_name != "no_action":
            tools.append({
                "tool": tool_name,
                "args": {
                    # `or` (not a .get default) so empty strings and nulls
                    # from the model also fall back to the caller's values.
                    "deployment_name": data.get("deployment_name") or target_resource,
                    "namespace": data.get("namespace") or namespace,
                },
                "valid": True,
            })

        logger.info(
            "nemotron_gemini_fallback_success",
            incident_id=incident_id,
            tool=tool_name,
            latency_ms=latency_ms,
        )
        return {
            "tools": tools,
            "validation": "✅ Gemini 代理驗證通過",
            "latency_ms": latency_ms,
        }

    except Exception as e:
        # Deliberate best-effort boundary: this is the last fallback, so any
        # transport or parsing failure is logged and reported, not raised.
        latency_ms = (_time.perf_counter() - started) * 1000
        logger.error("nemotron_gemini_fallback_error", incident_id=incident_id, error=str(e))
        return {"tools": [], "validation": f"❌ Gemini 代理失敗: {str(e)[:50]}", "latency_ms": latency_ms}
glob(如 signoz-*) +# 用途:init containers、永久停用服務、已知 exited 但不需告警的容器 +: "${EXCLUDE_CONTAINERS:=signoz-telemetrystore-migrator,signoz-clickhouse,signoz-init-clickhouse}" mkdir -p "$COOLDOWN_DIR" @@ -135,6 +138,21 @@ check_containers() { # 跳過 header 或空行 [[ -z "$container_name" ]] && continue + # 排除清單檢查(EXCLUDE_CONTAINERS 逗號分隔) + local excluded=false + IFS=',' read -ra EXCLUDES <<< "$EXCLUDE_CONTAINERS" + for pattern in "${EXCLUDES[@]}"; do + pattern="${pattern// /}" # trim spaces + [[ -z "$pattern" ]] && continue + # shellcheck disable=SC2254 + case "$container_name" in + $pattern) excluded=true; break ;; + esac + done + if $excluded; then + continue + fi + local needs_alert=false local detected_status=""