2743 lines
113 KiB
Python
2743 lines
113 KiB
Python
"""
|
||
OpenClaw AI Decision Engine - True LLM + SignOz Integration
|
||
============================================================
|
||
Phase 5: OpenClaw 實體化升級 (2026-03-21)
|
||
統帥校正: SignOz 為唯一全能視力中心
|
||
|
||
Features:
|
||
- 真實 LLM SDK 整合 (告警預設 Ollama GCP-A → GCP-B → 111 → Gemini)
|
||
- SignOz Gold Metrics 即時擷取 (P99/Error/RPS)
|
||
- AIOps Agent 專業人格 (K8s 維運 + SRE RCA 專精)
|
||
- 強制結構化 JSON 輸出 (符合 API 契約)
|
||
- 動態告警上下文注入 + SignOz 數據
|
||
- Shadow Mode 調優指令生成 (日誌輸出,不執行)
|
||
|
||
防禦性工程鐵律:
|
||
- Zero Trust: 預設不信任 LLM 輸出,必須通過 Pydantic 驗證
|
||
- Edge Case: 網路失敗、解析失敗、超時處理
|
||
- SignOz 失敗時優雅降級 (不阻塞主流程)
|
||
"""
|
||
|
||
import hashlib
|
||
import json
|
||
import random
|
||
import re
|
||
import time
|
||
from datetime import datetime
|
||
|
||
import httpx
|
||
import structlog
|
||
|
||
from src.core.config import settings
|
||
from src.core.prompts import NEMOTRON_SYSTEM_PROMPT, OPENCLAW_SYSTEM_PROMPT
|
||
from src.core.redis_client import get_redis
|
||
from src.models.ai import (
|
||
AIRiskLevel,
|
||
OpenClawDecision,
|
||
SuggestedAction,
|
||
)
|
||
from src.services.langfuse_client import langfuse_trace
|
||
from src.services.model_registry import get_model_registry
|
||
from src.services.ollama_failover_manager import get_ollama_failover_manager
|
||
from src.services.signoz_client import GoldMetrics, get_signoz_client
|
||
from src.utils.k8s_naming import normalize_resource_name
|
||
from src.utils.timezone import now_taipei_iso
|
||
|
||
logger = structlog.get_logger(__name__)
|
||
|
||
|
||
# =============================================================================
|
||
# AIOps Agent System Prompt (專業人格 + 仲裁邏輯 + SignOz 數據)
|
||
# =============================================================================
|
||
|
||
# 責任矩陣定義
|
||
RESPONSIBILITY_MATRIX = {
|
||
"FE": "前端團隊 (Frontend)",
|
||
"BE": "後端團隊 (Backend)",
|
||
"INFRA": "基礎設施團隊 (Infrastructure/SRE)",
|
||
"DB": "資料庫團隊 (Database/DBA)",
|
||
"COLLAB": "協同處理 (需多團隊會診)",
|
||
}
|
||
|
||
# 信心度閾值
|
||
CONFIDENCE_THRESHOLD_COLLAB = 0.70 # 低於此閾值自動標記為 COLLAB
|
||
|
||
# OPENCLAW_SYSTEM_PROMPT 已移至 src/core/prompts.py (Phase 17 P2 改進)
|
||
|
||
|
||
# =============================================================================
|
||
# LLM Analysis Result - Using Pydantic for Schema Enforcement
|
||
# =============================================================================
|
||
|
||
# We use OpenClawDecision from models/ai.py for Pydantic validation
|
||
# This alias is for backwards compatibility
|
||
LLMAnalysisResult = OpenClawDecision
|
||
|
||
|
||
# =============================================================================
|
||
# kubectl_command 回填 helper
|
||
# 2026-04-09 Claude Sonnet 4.6: I2 架構Review修復 — 補齊所有 tool 類型,消除兩處重複邏輯 (M3)
|
||
# =============================================================================
|
||
|
||
def _backfill_kubectl_command(proposal: dict, tools: list) -> None:
|
||
"""將 AI tool call 結果回填為可執行的 kubectl_command。
|
||
proposal["kubectl_command"] 若已有值則不覆蓋(LLM 直接填的優先)。
|
||
"""
|
||
if not tools or proposal.get("kubectl_command"):
|
||
return
|
||
_t = tools[0]
|
||
_tool_name = _t.get("tool", "")
|
||
_args = _t.get("args", {})
|
||
_ns = _args.get("namespace", proposal.get("namespace", "awoooi-prod"))
|
||
|
||
if _tool_name == "restart_deployment":
|
||
_deploy = _args.get("deployment_name", proposal.get("target_resource", ""))
|
||
if _deploy:
|
||
proposal["kubectl_command"] = f"kubectl rollout restart deployment/{_deploy} -n {_ns}"
|
||
elif _tool_name == "delete_pod":
|
||
_pod = _args.get("pod_name", "")
|
||
if _pod:
|
||
proposal["kubectl_command"] = f"kubectl delete pod {_pod} -n {_ns}"
|
||
elif _tool_name == "scale_deployment":
|
||
_deploy = _args.get("deployment_name", "")
|
||
_replicas = _args.get("replicas", 2)
|
||
if _deploy:
|
||
proposal["kubectl_command"] = f"kubectl scale deployment/{_deploy} --replicas={_replicas} -n {_ns}"
|
||
elif _tool_name == "delete_deployment":
|
||
_deploy = _args.get("deployment_name", "")
|
||
if _deploy:
|
||
proposal["kubectl_command"] = f"kubectl delete deployment/{_deploy} -n {_ns}"
|
||
elif _tool_name == "drain_node":
|
||
_node = _args.get("node_name", "")
|
||
if _node:
|
||
proposal["kubectl_command"] = f"kubectl drain {_node} --ignore-daemonsets --delete-emptydir-data"
|
||
elif _tool_name == "cordon_node":
|
||
_node = _args.get("node_name", "")
|
||
if _node:
|
||
proposal["kubectl_command"] = f"kubectl cordon {_node}"
|
||
elif _tool_name == "delete_service":
|
||
_svc = _args.get("service_name", "")
|
||
if _svc:
|
||
proposal["kubectl_command"] = f"kubectl delete service/{_svc} -n {_ns}"
|
||
|
||
|
||
# =============================================================================
|
||
# OpenClaw Service
|
||
# =============================================================================
|
||
|
||
def _build_alert_cache_context_hash(alert_context: dict | None) -> str:
|
||
"""Build a stable LLM cache scope for repeat alerts without dynamic annotations."""
|
||
|
||
if not alert_context:
|
||
return ""
|
||
|
||
alertname = alert_context.get("alertname") or alert_context.get("alert_type", "")
|
||
category = alert_context.get("alert_category", "")
|
||
namespace = alert_context.get("namespace", "")
|
||
target = alert_context.get("target_resource", "")
|
||
severity = alert_context.get("severity", "")
|
||
fingerprint = alert_context.get("fingerprint", "")
|
||
return f"{alertname}:{category}:{namespace}:{target}:{severity}:{fingerprint}"
|
||
|
||
|
||
class OpenClawService:
|
||
"""
|
||
OpenClaw AI 決策服務 - True LLM + SignOz Integration
|
||
|
||
實作 AI_FALLBACK_ORDER 備援機制。
|
||
告警/incident 上下文預設套用成本防線,只允許 Ollama GCP-A → GCP-B → 111。
|
||
|
||
新增 SignOz 整合:
|
||
- 自動擷取 Gold Metrics
|
||
- 數據驅動的 RCA 分析
|
||
- 動態 Trace URL 生成
|
||
"""
|
||
|
||
def __init__(self):
|
||
self._http_client: httpx.AsyncClient | None = None
|
||
self._signoz = get_signoz_client()
|
||
|
||
async def _get_client(self) -> httpx.AsyncClient:
|
||
"""取得 HTTP 客戶端
|
||
|
||
2026-04-14 Claude Sonnet 4.6: 從硬編 120s 改用 OPENCLAW_TIMEOUT 設定 (30s)
|
||
對齊 ADR-052 GAP-B4 的 25s + 5s buffer 設計。原 120s 違反 defense-in-depth。
|
||
"""
|
||
if self._http_client is None or self._http_client.is_closed:
|
||
_t = float(settings.OPENCLAW_TIMEOUT)
|
||
self._http_client = httpx.AsyncClient(
|
||
timeout=httpx.Timeout(_t, connect=10.0),
|
||
)
|
||
return self._http_client
|
||
|
||
async def close(self) -> None:
|
||
"""關閉連線"""
|
||
if self._http_client:
|
||
await self._http_client.aclose()
|
||
self._http_client = None
|
||
|
||
def _is_incident_alert_context(self, alert_context: dict | None) -> bool:
|
||
"""Return true when a request came from the alert/incident automation path."""
|
||
if not alert_context:
|
||
return False
|
||
alert_keys = {
|
||
"alert_type",
|
||
"alertname",
|
||
"alert_name",
|
||
"fingerprint",
|
||
"incident_id",
|
||
"severity",
|
||
"signals",
|
||
"target_resource",
|
||
}
|
||
return any(key in alert_context for key in alert_keys)
|
||
|
||
def _cloud_fallback_allowed_for_alert(self, alert_context: dict | None) -> bool:
|
||
"""Cloud fallback is allowed after the ordered Ollama lane for alerts."""
|
||
if alert_context and alert_context.get("allow_cloud_fallback") is False:
|
||
return False
|
||
if not self._is_incident_alert_context(alert_context):
|
||
return True
|
||
return bool(getattr(settings, "ALERT_AI_ALLOW_CLOUD_FALLBACK", True))
|
||
|
||
def _alert_enforces_ollama_first(self, alert_context: dict | None) -> bool:
|
||
"""Alert and AI-governance lanes must try GCP-A/GCP-B/111 before Gemini backup."""
|
||
return (
|
||
bool(alert_context)
|
||
and (
|
||
bool(alert_context.get("enforce_ollama_first"))
|
||
or self._is_incident_alert_context(alert_context)
|
||
)
|
||
and bool(getattr(settings, "ALERT_AI_ENFORCE_OLLAMA_FIRST", True))
|
||
)
|
||
|
||
async def _resolve_alert_provider_order(
|
||
self,
|
||
task_type: str = "diagnose",
|
||
alert_context: dict | None = None,
|
||
cloud_provider_order: list[str] | None = None,
|
||
) -> list[str]:
|
||
"""Resolve GCP-A/GCP-B/111, then OpenClaw/Nemo before Gemini backup."""
|
||
provider_order: list[str] = []
|
||
try:
|
||
route = await get_ollama_failover_manager().select_provider(task_type=task_type)
|
||
provider_order = [
|
||
endpoint.provider_name
|
||
for endpoint in route.all_endpoints_in_order()
|
||
if endpoint.provider_name.startswith("ollama")
|
||
]
|
||
except Exception as route_error:
|
||
logger.warning(
|
||
"alert_ollama_route_lookup_failed",
|
||
error=str(route_error),
|
||
task_type=task_type,
|
||
)
|
||
|
||
if not provider_order:
|
||
provider_order = ["ollama_gcp_a", "ollama_gcp_b", "ollama_local"]
|
||
|
||
deduped: list[str] = []
|
||
for provider_name in provider_order:
|
||
if provider_name and provider_name not in deduped:
|
||
deduped.append(provider_name)
|
||
|
||
if not self._alert_enforces_ollama_first(alert_context):
|
||
return deduped
|
||
|
||
ollama_order = {"ollama_gcp_a": 0, "ollama_gcp_b": 1, "ollama_local": 2}
|
||
ordered_ollama = [
|
||
provider_name
|
||
for provider_name in deduped
|
||
if provider_name in ollama_order
|
||
]
|
||
ordered_ollama.sort(key=lambda provider_name: ollama_order[provider_name])
|
||
if not ordered_ollama:
|
||
ordered_ollama = ["ollama_gcp_a", "ollama_gcp_b", "ollama_local"]
|
||
|
||
if not self._cloud_fallback_allowed_for_alert(alert_context):
|
||
return ordered_ollama
|
||
|
||
cloud_aliases = {"nvidia": "openclaw_nemo"}
|
||
cloud_candidates = {
|
||
cloud_aliases.get(provider_name, provider_name)
|
||
for provider_name in (cloud_provider_order or [])
|
||
}
|
||
try:
|
||
from src.services.ai_control import is_provider_disabled
|
||
if await is_provider_disabled("openclaw_nemo"):
|
||
cloud_candidates.discard("openclaw_nemo")
|
||
else:
|
||
cloud_candidates.add("openclaw_nemo")
|
||
except Exception as control_error:
|
||
logger.warning("alert_openclaw_nemo_control_check_failed", error=str(control_error))
|
||
# Gemini remains the final paid backup, but alert traffic should use
|
||
# OpenClaw/Nemo first whenever the router control plane has not disabled it.
|
||
cloud_candidates.add("gemini")
|
||
cloud_backup = [
|
||
provider_name
|
||
for provider_name in ("openclaw_nemo", "gemini")
|
||
if provider_name in cloud_candidates
|
||
]
|
||
|
||
return ordered_ollama + cloud_backup
|
||
|
||
# =========================================================================
|
||
# SignOz Integration
|
||
# =========================================================================
|
||
|
||
async def get_signoz_context(
|
||
self,
|
||
service_name: str,
|
||
namespace: str = "default",
|
||
alert_timestamp: datetime | None = None,
|
||
) -> tuple[GoldMetrics | None, str]:
|
||
"""
|
||
擷取 SignOz 上下文數據
|
||
|
||
Returns:
|
||
(GoldMetrics, trace_url) or (None, fallback_url)
|
||
"""
|
||
try:
|
||
metrics = await self._signoz.get_gold_metrics(
|
||
service_name=service_name,
|
||
namespace=namespace,
|
||
time_window_minutes=10,
|
||
)
|
||
|
||
trace_url = self._signoz.generate_trace_url(
|
||
service_name=service_name,
|
||
alert_timestamp=alert_timestamp,
|
||
window_minutes=5,
|
||
)
|
||
|
||
logger.info(
|
||
"signoz_context_fetched",
|
||
service=service_name,
|
||
rps=metrics.rps,
|
||
error_rate=metrics.error_rate,
|
||
p99_latency=metrics.p99_latency_ms,
|
||
)
|
||
|
||
return metrics, trace_url
|
||
|
||
except Exception as e:
|
||
logger.warning(
|
||
"signoz_context_fetch_failed",
|
||
service=service_name,
|
||
error=str(e),
|
||
)
|
||
# 降級: 返回 None 和靜態 URL
|
||
fallback_url = f"{settings.SIGNOZ_URL}/traces?service={service_name}"
|
||
return None, fallback_url
|
||
|
||
def generate_auto_tuning_command(
|
||
self,
|
||
alert_type: str,
|
||
target_resource: str,
|
||
namespace: str,
|
||
metrics: GoldMetrics | None = None,
|
||
) -> dict:
|
||
"""
|
||
根據告警類型和 SignOz 數據生成調優指令
|
||
|
||
Shadow Mode: 僅生成指令,不執行
|
||
|
||
Phase 18.1.6: 整合 K8s 資源名稱驗證 (ADR-016)
|
||
|
||
Returns:
|
||
{command: str, description: str, type: str}
|
||
"""
|
||
# Phase 18.1.6: 先正規化資源名稱
|
||
normalized = normalize_resource_name(target_resource, namespace)
|
||
|
||
if not normalized.is_k8s_resource:
|
||
# 非 K8s 資源,返回提示訊息
|
||
logger.info(
|
||
"non_k8s_resource_detected",
|
||
original=target_resource,
|
||
note=normalized.note,
|
||
)
|
||
return {
|
||
"type": "MANUAL",
|
||
"command": f"# 非 K8s 資源: {target_resource}",
|
||
"description": f"此資源不在 K8s 中,需人工處理。{normalized.note or ''}",
|
||
}
|
||
|
||
# 使用正規化後的名稱
|
||
resolved_name = normalized.normalized or target_resource
|
||
resolved_ns = normalized.namespace or namespace
|
||
|
||
if normalized.confidence < 0.8:
|
||
logger.warning(
|
||
"low_confidence_resource_name",
|
||
original=target_resource,
|
||
resolved=resolved_name,
|
||
confidence=normalized.confidence,
|
||
)
|
||
|
||
# 根據告警類型選擇調優策略 (使用正規化後的名稱)
|
||
if "cpu" in alert_type.lower() or "high_cpu" in alert_type.lower():
|
||
# CPU 高 → 擴容或調整 limit
|
||
if metrics and metrics.rps > 100:
|
||
# 高流量場景 → HPA
|
||
return {
|
||
"type": "HPA",
|
||
"command": f"kubectl autoscale deployment {resolved_name} --cpu-percent=70 --min=2 --max=10 -n {resolved_ns}",
|
||
"description": f"SignOz RPS={metrics.rps:.0f},配置 HPA 應對流量波動",
|
||
}
|
||
else:
|
||
# 低流量但 CPU 高 → 調整資源
|
||
return {
|
||
"type": "RESOURCE_LIMIT",
|
||
"command": f"kubectl set resources deployment/{resolved_name} --limits=cpu=2000m -n {resolved_ns}",
|
||
"description": "增加 CPU limit 緩解資源競爭",
|
||
}
|
||
|
||
elif "memory" in alert_type.lower() or "oom" in alert_type.lower():
|
||
return {
|
||
"type": "RESOURCE_LIMIT",
|
||
"command": f"kubectl set resources deployment/{resolved_name} --limits=memory=1Gi -n {resolved_ns}",
|
||
"description": "增加 Memory limit 防止 OOM",
|
||
}
|
||
|
||
elif "pod_crash" in alert_type.lower() or "crash" in alert_type.lower():
|
||
return {
|
||
"type": "RESTART",
|
||
"command": f"kubectl rollout restart deployment/{resolved_name} -n {resolved_ns}",
|
||
"description": "滾動重啟清除異常狀態",
|
||
}
|
||
|
||
elif "latency" in alert_type.lower() or "slow" in alert_type.lower():
|
||
if metrics and metrics.p99_latency_ms > 500:
|
||
return {
|
||
"type": "SCALE",
|
||
"command": f"kubectl scale deployment {resolved_name} --replicas=+2 -n {resolved_ns}",
|
||
"description": f"SignOz P99={metrics.p99_latency_ms:.0f}ms,擴容分散負載",
|
||
}
|
||
else:
|
||
return {
|
||
"type": "CACHE",
|
||
"command": "# 檢查 Redis 連線池配置",
|
||
"description": "建議增加緩存層減少後端壓力",
|
||
}
|
||
|
||
else:
|
||
# 通用: 滾動重啟
|
||
return {
|
||
"type": "RESTART",
|
||
"command": f"kubectl rollout restart deployment/{resolved_name} -n {resolved_ns}",
|
||
"description": "滾動重啟恢復服務",
|
||
}
|
||
|
||
# =========================================================================
|
||
# =========================================================================
|
||
# OpenClaw (Nemo) 委派仲裁 — 架構鐵律
|
||
# 2026-04-01 ogt: AWOOOI API 不直接打 LLM,委派給 OpenClaw (192.168.0.188:8089)
|
||
# =========================================================================
|
||
|
||
async def _call_openclaw_analyze(
|
||
self,
|
||
incident_id: str,
|
||
severity: str,
|
||
signals: list[dict],
|
||
affected_services: list[str],
|
||
expert_context: dict | None = None,
|
||
) -> dict | None:
|
||
"""
|
||
委派 Incident RCA 給 OpenClaw (Nemo) API
|
||
|
||
Returns:
|
||
proposal_dict if success, None if failed (fallback to direct LLM)
|
||
"""
|
||
try:
|
||
client = await self._get_client()
|
||
import json as _json
|
||
def _to_serializable(obj):
|
||
if isinstance(obj, dict):
|
||
return {k: _to_serializable(v) for k, v in obj.items()}
|
||
if isinstance(obj, list):
|
||
return [_to_serializable(i) for i in obj]
|
||
try:
|
||
_json.dumps(obj)
|
||
return obj
|
||
except (TypeError, ValueError):
|
||
return str(obj)
|
||
|
||
payload = {
|
||
"incident_id": incident_id,
|
||
"severity": severity,
|
||
"signals": _to_serializable(signals[:5]),
|
||
"affected_services": affected_services,
|
||
"expert_context": _to_serializable(expert_context) if expert_context else None,
|
||
}
|
||
# 2026-04-14 Claude Sonnet 4.6: 從硬編 130s 改用 OPENCLAW_TIMEOUT
|
||
# 原 130s 讓 LLM 能卡 2m10s,超過 Ollama 真實返回時間(P95 54s)
|
||
resp = await client.post(
|
||
f"{settings.OPENCLAW_URL}/api/v1/analyze/incident",
|
||
json=payload,
|
||
timeout=httpx.Timeout(float(settings.OPENCLAW_TIMEOUT), connect=5.0),
|
||
)
|
||
resp.raise_for_status()
|
||
data = resp.json()
|
||
|
||
# 驗證必要欄位
|
||
if not data.get("action_title") or not data.get("risk_level"):
|
||
logger.warning("openclaw_analyze_invalid_response", incident_id=incident_id)
|
||
return None
|
||
|
||
try:
|
||
confidence_value = float(data.get("confidence", 0.0))
|
||
except (TypeError, ValueError):
|
||
confidence_value = 0.0
|
||
if data.get("degraded") or data.get("provider") == "openclaw_degraded" or confidence_value < 0.3:
|
||
logger.warning(
|
||
"openclaw_analyze_degraded_response",
|
||
incident_id=incident_id,
|
||
provider=data.get("provider"),
|
||
confidence=data.get("confidence"),
|
||
)
|
||
return None
|
||
|
||
logger.info(
|
||
"openclaw_analyze_success",
|
||
incident_id=incident_id,
|
||
confidence=data.get("confidence", 0),
|
||
risk_level=data.get("risk_level"),
|
||
)
|
||
|
||
# 轉換為 AWOOOI API proposal dict 格式
|
||
return {
|
||
"action": data.get("action_title", "AI 分析"),
|
||
"description": data.get("description", ""),
|
||
"kubectl_command": data.get("kubectl_command") or "",
|
||
"target_resource": data.get("target_resource", "unknown"),
|
||
"namespace": data.get("namespace", "awoooi-prod"),
|
||
"risk_level": data.get("risk_level", "medium"),
|
||
"reasoning": data.get("reasoning", ""),
|
||
"confidence": data.get("confidence", 0.8),
|
||
"primary_responsibility": data.get("primary_responsibility", "INFRA"),
|
||
"optimization_suggestions": data.get("optimization_suggestions", []),
|
||
"signoz_correlation": data.get("signoz_correlation", ""),
|
||
"from_cache": False,
|
||
"provider": "openclaw_nemo",
|
||
"ai_tokens": 0,
|
||
"ai_cost": 0.0,
|
||
}
|
||
|
||
except Exception as e:
|
||
logger.warning(
|
||
"openclaw_analyze_failed",
|
||
incident_id=incident_id,
|
||
error=str(e),
|
||
reason="fallback to direct LLM",
|
||
)
|
||
return None
|
||
|
||
# =========================================================================
|
||
# [ARCHIVED Phase 24 B4 — 2026-04-03 ogt]
|
||
# 以下三個方法 (_call_ollama/_call_gemini/_call_claude) 為舊版 fallback chain
|
||
# 新路徑: USE_AI_ROUTER=true → _call_with_fallback → AIRouterExecutor (ai_router.py)
|
||
# 新 Provider 實作: apps/api/src/services/ai_providers/ (OllamaProvider/GeminiProvider/ClaudeProvider)
|
||
# 回滾保留: USE_AI_ROUTER=false 時仍由 _call_with_fallback (line ~993) 呼叫此區塊
|
||
# 完整移除時機: Phase 24 完整驗收後 (ADR-052 D11)
|
||
# =========================================================================
|
||
|
||
async def _call_ollama(self, prompt: str, *, ollama_only: bool = False) -> tuple[str, bool]:
|
||
"""
|
||
呼叫 Ollama (支援 JSON Mode)。
|
||
|
||
USE_AI_ROUTER=true 正常會走 AIRouterExecutor;這裡是 legacy safety-net。
|
||
2026-05-05 Codex: safety-net 也必須遵守 ADR-110 三層 Ollama
|
||
路由,告警路徑預設只允許 GCP-A/GCP-B/111,不能只打 OLLAMA_URL 後直接掉 Gemini。
|
||
"""
|
||
try:
|
||
client = await self._get_client()
|
||
|
||
# 從 ModelRegistry 取得模型配置
|
||
registry = get_model_registry()
|
||
model_name = registry.get_model("ollama", "rca")
|
||
if ollama_only:
|
||
model_name = getattr(settings, "ALERT_OLLAMA_MODEL", "qwen3:14b")
|
||
options = registry.get_provider_options("ollama")
|
||
timeout_seconds = max(
|
||
float(settings.OPENCLAW_TIMEOUT),
|
||
float(getattr(settings, "OLLAMA_DIAGNOSE_TIMEOUT_SECONDS", settings.OPENCLAW_TIMEOUT)),
|
||
)
|
||
|
||
endpoints: list[tuple[str, str]] = []
|
||
try:
|
||
route = await get_ollama_failover_manager().select_provider()
|
||
endpoints = [
|
||
(endpoint.provider_name, endpoint.url)
|
||
for endpoint in route.all_endpoints_in_order()
|
||
if endpoint.provider_name.startswith("ollama") and endpoint.url
|
||
]
|
||
except Exception as route_error:
|
||
logger.warning(
|
||
"legacy_ollama_route_lookup_failed",
|
||
error=str(route_error),
|
||
)
|
||
|
||
if not endpoints:
|
||
configured_endpoints = [
|
||
("ollama_gcp_a", settings.OLLAMA_URL),
|
||
("ollama_gcp_b", getattr(settings, "OLLAMA_SECONDARY_URL", "")),
|
||
("ollama_local", getattr(settings, "OLLAMA_FALLBACK_URL", "")),
|
||
]
|
||
seen_urls: set[str] = set()
|
||
endpoints = []
|
||
for provider_name, endpoint_url in configured_endpoints:
|
||
if endpoint_url and endpoint_url not in seen_urls:
|
||
endpoints.append((provider_name, endpoint_url))
|
||
seen_urls.add(endpoint_url)
|
||
|
||
if ollama_only:
|
||
allowed_provider_order = {"ollama_gcp_a": 0, "ollama_gcp_b": 1, "ollama_local": 2}
|
||
endpoints = [
|
||
(provider_name, endpoint_url)
|
||
for provider_name, endpoint_url in endpoints
|
||
if provider_name in allowed_provider_order
|
||
]
|
||
endpoints.sort(key=lambda item: allowed_provider_order[item[0]])
|
||
if not endpoints:
|
||
endpoints = [
|
||
("ollama_gcp_a", settings.OLLAMA_URL),
|
||
("ollama_gcp_b", getattr(settings, "OLLAMA_SECONDARY_URL", "")),
|
||
("ollama_local", getattr(settings, "OLLAMA_FALLBACK_URL", "")),
|
||
]
|
||
endpoints = [
|
||
(provider_name, endpoint_url)
|
||
for provider_name, endpoint_url in endpoints
|
||
if endpoint_url
|
||
]
|
||
|
||
last_error = ""
|
||
for provider_name, endpoint_url in endpoints:
|
||
try:
|
||
logger.info(
|
||
"ollama_request_start",
|
||
provider=provider_name,
|
||
url=f"{endpoint_url}/api/generate",
|
||
prompt_length=len(prompt),
|
||
)
|
||
|
||
response = await client.post(
|
||
f"{endpoint_url}/api/generate",
|
||
json={
|
||
"model": model_name,
|
||
"prompt": prompt,
|
||
"stream": False,
|
||
"format": "json", # 強制 JSON 輸出
|
||
"options": {
|
||
"num_predict": options.get("num_predict", 1024),
|
||
"temperature": options.get("temperature", 0.1),
|
||
"top_p": options.get("top_p", 0.9),
|
||
},
|
||
},
|
||
timeout=httpx.Timeout(timeout_seconds, connect=10.0),
|
||
)
|
||
|
||
logger.info(
|
||
"ollama_response_received",
|
||
provider=provider_name,
|
||
status_code=response.status_code,
|
||
)
|
||
|
||
response.raise_for_status()
|
||
data = response.json()
|
||
result = data.get("response", "")
|
||
|
||
logger.info(
|
||
"ollama_response_parsed",
|
||
provider=provider_name,
|
||
response_length=len(result),
|
||
)
|
||
|
||
return result, True
|
||
except httpx.TimeoutException as e:
|
||
last_error = f"{provider_name} timeout: {e}"
|
||
logger.warning("ollama_timeout", provider=provider_name, error=str(e))
|
||
except Exception as e:
|
||
last_error = f"{provider_name} failed: {e}"
|
||
logger.warning(
|
||
"ollama_call_failed",
|
||
provider=provider_name,
|
||
error=str(e),
|
||
error_type=type(e).__name__,
|
||
)
|
||
|
||
return last_error or "all Ollama endpoints failed", False
|
||
|
||
except httpx.TimeoutException as e:
|
||
logger.warning("ollama_timeout", error=str(e))
|
||
return f"Timeout: {e}", False
|
||
|
||
except Exception as e:
|
||
logger.warning(
|
||
"ollama_call_failed",
|
||
error=str(e),
|
||
error_type=type(e).__name__,
|
||
)
|
||
return str(e), False
|
||
|
||
async def _call_gemini(self, prompt: str) -> tuple[str, bool, int, float]:
|
||
"""
|
||
呼叫 Google Gemini (支援 JSON Mode)
|
||
|
||
Returns:
|
||
tuple: (response_text, success, total_tokens, cost_usd)
|
||
- response_text: LLM 回應文本
|
||
- success: 是否成功
|
||
- total_tokens: 使用的 Token 總數
|
||
- cost_usd: 預估成本 (USD)
|
||
|
||
2026-03-29 ogt: 加入 Token/Cost 追蹤
|
||
"""
|
||
if not settings.GEMINI_API_KEY:
|
||
return "GEMINI_API_KEY not configured", False, 0, 0.0
|
||
|
||
try:
|
||
client = await self._get_client()
|
||
|
||
# 從 ModelRegistry 取得模型配置
|
||
registry = get_model_registry()
|
||
model_name = registry.get_model("gemini", "rca")
|
||
|
||
response = await client.post(
|
||
f"https://generativelanguage.googleapis.com/v1beta/models/{model_name}:generateContent",
|
||
headers={"x-goog-api-key": settings.GEMINI_API_KEY},
|
||
json={
|
||
"contents": [{"parts": [{"text": prompt}]}],
|
||
"generationConfig": {
|
||
"temperature": 0.1,
|
||
"maxOutputTokens": 2048,
|
||
"responseMimeType": "application/json", # 強制 JSON 輸出
|
||
},
|
||
},
|
||
timeout=30.0,
|
||
)
|
||
response.raise_for_status()
|
||
data = response.json()
|
||
text = data["candidates"][0]["content"]["parts"][0]["text"]
|
||
|
||
# 2026-03-29 ogt: 擷取 Token 使用量
|
||
usage_metadata = data.get("usageMetadata", {})
|
||
prompt_tokens = usage_metadata.get("promptTokenCount", 0)
|
||
completion_tokens = usage_metadata.get("candidatesTokenCount", 0)
|
||
total_tokens = usage_metadata.get("totalTokenCount", prompt_tokens + completion_tokens)
|
||
|
||
# Gemini 1.5 Flash 定價 (per 1M tokens)
|
||
# Input: $0.075 / 1M, Output: $0.30 / 1M
|
||
cost_usd = (prompt_tokens * 0.000000075) + (completion_tokens * 0.0000003)
|
||
|
||
logger.info(
|
||
"gemini_response_received",
|
||
response_length=len(text),
|
||
prompt_tokens=prompt_tokens,
|
||
completion_tokens=completion_tokens,
|
||
total_tokens=total_tokens,
|
||
cost_usd=f"${cost_usd:.6f}",
|
||
)
|
||
return text, True, total_tokens, cost_usd
|
||
|
||
except Exception as e:
|
||
logger.warning("gemini_call_failed", error=str(e))
|
||
return str(e), False, 0, 0.0
|
||
|
||
async def _call_claude(self, prompt: str) -> tuple[str, bool]:
|
||
"""
|
||
呼叫 Anthropic Claude (使用 Tool Use 強制 JSON)
|
||
"""
|
||
if not settings.CLAUDE_API_KEY:
|
||
return "CLAUDE_API_KEY not configured", False
|
||
|
||
try:
|
||
client = await self._get_client()
|
||
|
||
# Claude 使用 Tool Use 強制結構化輸出
|
||
response = await client.post(
|
||
"https://api.anthropic.com/v1/messages",
|
||
headers={
|
||
"x-api-key": settings.CLAUDE_API_KEY,
|
||
"anthropic-version": "2023-06-01",
|
||
"content-type": "application/json",
|
||
},
|
||
json={
|
||
"model": get_model_registry().get_model("claude", "rca"),
|
||
"max_tokens": 2048,
|
||
"messages": [{"role": "user", "content": prompt}],
|
||
"tools": [{
|
||
"name": "submit_analysis",
|
||
"description": "Submit the RCA analysis result in structured format",
|
||
"input_schema": {
|
||
"type": "object",
|
||
"properties": {
|
||
"action_title": {"type": "string"},
|
||
"description": {"type": "string"},
|
||
"suggested_action": {"type": "string", "enum": ["RESTART_DEPLOYMENT", "DELETE_POD", "SCALE_DEPLOYMENT", "APPLY_HPA", "TUNE_RESOURCES", "INVESTIGATE", "OBSERVE", "NO_ACTION"]},
|
||
"kubectl_command": {"type": "string"},
|
||
"target_resource": {"type": "string"},
|
||
"namespace": {"type": "string"},
|
||
"risk_level": {"type": "string", "enum": ["low", "medium", "critical"]},
|
||
"blast_radius": {
|
||
"type": "object",
|
||
"properties": {
|
||
"affected_pods": {"type": "integer"},
|
||
"estimated_downtime": {"type": "string"},
|
||
"related_services": {"type": "array", "items": {"type": "string"}},
|
||
"data_impact": {"type": "string", "enum": ["NONE", "READ_ONLY", "WRITE", "DESTRUCTIVE"]}
|
||
},
|
||
"required": ["affected_pods", "estimated_downtime", "related_services", "data_impact"]
|
||
},
|
||
"reasoning": {"type": "string"},
|
||
"deviation_analysis": {"type": "string"},
|
||
"confidence": {"type": "number"},
|
||
"affected_services": {"type": "array", "items": {"type": "string"}}
|
||
},
|
||
"required": ["action_title", "description", "suggested_action", "kubectl_command", "target_resource", "namespace", "risk_level", "blast_radius", "reasoning", "confidence"]
|
||
}
|
||
}],
|
||
"tool_choice": {"type": "tool", "name": "submit_analysis"},
|
||
},
|
||
timeout=30.0,
|
||
)
|
||
response.raise_for_status()
|
||
data = response.json()
|
||
|
||
# 從 Tool Use 回應中提取 JSON
|
||
for block in data.get("content", []):
|
||
if block.get("type") == "tool_use" and block.get("name") == "submit_analysis":
|
||
tool_input = block.get("input", {})
|
||
logger.info("claude_tool_use_response", input_keys=list(tool_input.keys()))
|
||
return json.dumps(tool_input), True
|
||
|
||
# Fallback: 嘗試從 text 內容提取
|
||
for block in data.get("content", []):
|
||
if block.get("type") == "text":
|
||
return block.get("text", ""), True
|
||
|
||
return "No valid response from Claude", False
|
||
|
||
except Exception as e:
|
||
logger.warning("claude_call_failed", error=str(e))
|
||
return str(e), False
|
||
|
||
# 2026-03-29 ogt: _call_nvidia 已移至 nvidia_provider.py (ARCHIVED)
|
||
# 符合模組化規範 - 所有 NVIDIA API 呼叫統一由 NvidiaProvider / OpenClawNemoProvider 處理
|
||
|
||
# =========================================================================
|
||
# [END ARCHIVED Phase 24 B4]
|
||
# =========================================================================
|
||
|
||
# =========================================================================
|
||
# Mock LLM - Intelligent Fallback with SignOz Data
|
||
# =========================================================================
|
||
|
||
def _generate_mock_response(
|
||
self,
|
||
alert_context: dict,
|
||
signoz_metrics: GoldMetrics | None = None,
|
||
) -> str:
|
||
"""
|
||
規則引擎降級回應 (v8.0) — 生產用途,不是假數據
|
||
|
||
從 alert_rules.yaml 載入規則進行匹配,AI 分析失敗時的正式降級路徑。
|
||
命中 generic_fallback 時會回傳 rule_id="generic_fallback",
|
||
由上層 async 方法(_call_with_fallback)觸發 auto_generate_rule() 學習新規則。
|
||
|
||
Returns:
|
||
(json_str, rule_id) tuple
|
||
|
||
2026-04-09 ogt: 重構為規則引擎,移除 if/elif 硬編碼
|
||
2026-04-09 ogt: S2-4 架構師審查 — 修正 Mock 語意混淆,澄清為規則引擎生產路徑
|
||
"""
|
||
from src.services.alert_rule_engine import match_rule
|
||
|
||
time.sleep(random.uniform(0.3, 0.8)) # 模擬思考延遲
|
||
|
||
# SignOz 數據整合
|
||
signoz_correlation = "SignOz 數據擷取中..."
|
||
if signoz_metrics:
|
||
signoz_correlation = (
|
||
f"RPS={signoz_metrics.rps:.1f} ({signoz_metrics.rps_trend}), "
|
||
f"Error={signoz_metrics.error_rate:.2f}%, "
|
||
f"P99={signoz_metrics.p99_latency_ms:.0f}ms"
|
||
)
|
||
|
||
mock_response = match_rule(alert_context)
|
||
if mock_response is None:
|
||
# match_rule 不應該回傳 None(有通用兜底),但防禦性處理
|
||
alert_type = alert_context.get("alert_type", "custom")
|
||
target = alert_context.get("target_resource", "unknown")
|
||
namespace = alert_context.get("namespace", "awoooi-prod")
|
||
mock_response = {
|
||
"action_title": f"重新啟動 {target} 服務",
|
||
"description": f"⚙️ 規則匹配: {target} 發生異常,需進一步診斷確認根因。",
|
||
"suggested_action": "RESTART_DEPLOYMENT",
|
||
"kubectl_command": f"kubectl rollout restart deployment/{target} -n {namespace}",
|
||
"target_resource": target,
|
||
"namespace": namespace,
|
||
"risk_level": "medium",
|
||
"blast_radius": {
|
||
"affected_pods": 1,
|
||
"estimated_downtime": "5-15 min",
|
||
"related_services": [target],
|
||
"data_impact": "NONE",
|
||
},
|
||
"primary_responsibility": "COLLAB",
|
||
"responsibility_reasoning": "告警資訊不足,建議多團隊協同排查",
|
||
"secondary_teams": ["BE", "INFRA"],
|
||
"optimization_suggestions": [],
|
||
"reasoning": f"[規則匹配] 根據告警 {alert_type} 先重啟恢復服務。",
|
||
"deviation_analysis": "監控指標顯示異常偏離基準線",
|
||
"confidence": 0.0,
|
||
"affected_services": [target],
|
||
"signoz_correlation": signoz_correlation,
|
||
}
|
||
|
||
# 補充 SignOz 關聯資訊(規則引擎不持有 signoz_metrics)
|
||
mock_response["signoz_correlation"] = signoz_correlation
|
||
if signoz_metrics:
|
||
mock_response["description"] += f" {signoz_metrics.to_summary()}"
|
||
|
||
rule_id = mock_response.get("rule_id", "unknown")
|
||
logger.info(
|
||
"mock_llm_response_generated",
|
||
rule_id=rule_id,
|
||
action_title=mock_response["action_title"],
|
||
risk_level=mock_response["risk_level"],
|
||
primary_responsibility=mock_response["primary_responsibility"],
|
||
confidence=mock_response["confidence"],
|
||
signoz_integrated=signoz_metrics is not None,
|
||
is_mock=True,
|
||
)
|
||
|
||
# 2026-04-09 ogt: rule_id 回傳給上層 async 方法觸發自動規則生成
|
||
# 不在此 sync 方法中呼叫 asyncio,避免 event loop 混用問題 (S1-1 架構師審查)
|
||
return json.dumps(mock_response), rule_id
|
||
|
||
# =========================================================================
|
||
# LLM Cache Layer (憲法要求: 嚴禁無快取裸奔)
|
||
# =========================================================================
|
||
|
||
def _generate_cache_key(self, prompt: str, context_hash: str = "") -> str:
|
||
"""
|
||
生成 LLM 快取鍵
|
||
|
||
有告警上下文時,使用 prompt family + 穩定告警維度,避免 annotations /
|
||
SignOz 即時數值讓同一告警每 20 秒打穿快取;沒有上下文時仍用完整 prompt。
|
||
"""
|
||
if context_hash:
|
||
prompt_family_source = (
|
||
"openclaw_alert_analysis"
|
||
if "## Alert Data:" in prompt
|
||
else prompt[:512]
|
||
)
|
||
prompt_family = hashlib.sha256(prompt_family_source.encode()).hexdigest()[:8]
|
||
content = f"{prompt_family}:{context_hash}"
|
||
else:
|
||
content = prompt
|
||
hash_digest = hashlib.sha256(content.encode()).hexdigest()[:16]
|
||
return f"llm_cache:{hash_digest}"
|
||
|
||
async def _call_with_cache(
|
||
self,
|
||
prompt: str,
|
||
alert_context: dict | None = None,
|
||
signoz_metrics: GoldMetrics | None = None,
|
||
cache_ttl: int = 3600, # 1 hour default
|
||
) -> tuple[str, str, bool, bool, int, float]:
|
||
"""
|
||
帶快取的 LLM 呼叫包裝器
|
||
|
||
憲法條款: 必須使用快取保護算力資源
|
||
|
||
Args:
|
||
prompt: LLM prompt
|
||
alert_context: 告警上下文
|
||
signoz_metrics: SignOz 指標
|
||
cache_ttl: 快取存活時間 (秒)
|
||
|
||
Returns:
|
||
(response, provider, success, from_cache, total_tokens, cost_usd)
|
||
|
||
2026-03-29 ogt: 加入 Token/Cost 追蹤
|
||
"""
|
||
# 生成快取鍵 (基於 prompt + alert_context hash)
|
||
# 2026-04-16 ogt + Claude Sonnet 4.6: 修復 — alertname 才是主要識別符
|
||
# 舊版用 alert_type:target_resource → 不同告警 (e.g. PostgreSQLDiskGrowth vs PodCrashLoop)
|
||
# 在 alert_type="custom" 時共用同一快取鍵 → 全部回傳相同 LLM 結果
|
||
context_hash = _build_alert_cache_context_hash(alert_context)
|
||
|
||
cache_key = self._generate_cache_key(prompt, context_hash)
|
||
|
||
# 1. 嘗試從快取讀取
|
||
try:
|
||
redis_client = get_redis()
|
||
cached = await redis_client.get(cache_key)
|
||
if cached:
|
||
cached_data = json.loads(cached)
|
||
logger.info(
|
||
"llm_cache_hit",
|
||
cache_key=cache_key[:20],
|
||
provider=cached_data.get("provider", "cached"),
|
||
)
|
||
return (
|
||
cached_data["response"],
|
||
f"{cached_data['provider']}_cached",
|
||
True,
|
||
True, # from_cache
|
||
0, # tokens (cache hit, no new tokens)
|
||
0.0, # cost (cache hit, no cost)
|
||
)
|
||
except Exception as e:
|
||
logger.warning("llm_cache_read_failed", error=str(e))
|
||
|
||
# 2. Cache Miss - 呼叫 LLM
|
||
logger.info("llm_cache_miss", cache_key=cache_key[:20])
|
||
response, provider, success, total_tokens, cost_usd = await self._call_with_fallback(
|
||
prompt, alert_context, signoz_metrics
|
||
)
|
||
|
||
# 3. 成功則寫入快取
|
||
if success:
|
||
try:
|
||
redis_client = get_redis()
|
||
cache_data = {
|
||
"response": response,
|
||
"provider": provider,
|
||
"cached_at": now_taipei_iso(),
|
||
}
|
||
await redis_client.set(
|
||
cache_key,
|
||
json.dumps(cache_data, ensure_ascii=False),
|
||
ex=cache_ttl,
|
||
)
|
||
logger.info(
|
||
"llm_cache_write",
|
||
cache_key=cache_key[:20],
|
||
provider=provider,
|
||
ttl=cache_ttl,
|
||
)
|
||
except Exception as e:
|
||
logger.warning("llm_cache_write_failed", error=str(e))
|
||
|
||
return response, provider, success, False, total_tokens, cost_usd # from_cache=False
|
||
|
||
# =========================================================================
|
||
# Public LLM Interface (ILLMProvider Protocol)
|
||
# =========================================================================
|
||
|
||
async def call(
|
||
self,
|
||
prompt: str,
|
||
alert_context: dict | None = None,
|
||
) -> tuple[str, str, bool]:
|
||
"""
|
||
呼叫 LLM (ILLMProvider Protocol 實作)
|
||
|
||
#39 Error Analyzer Agent 使用此方法
|
||
|
||
Args:
|
||
prompt: 完整的 prompt
|
||
alert_context: 可選的告警上下文(含 incident_id/signals,
|
||
供 OPENCLAW_NEMO provider 使用結構化 API)
|
||
|
||
Returns:
|
||
(response, provider, success)
|
||
|
||
2026-04-16 ogt + Claude Sonnet 4.6: 修復 — _call_with_fallback 回傳 5 值,
|
||
call() 回傳 3 值,避免呼叫端 (diagnostician_agent:107) ValueError
|
||
2026-04-16 ogt + Claude Sonnet 4.6: 加入 alert_context — 讓 diagnostician
|
||
能傳 snapshot 結構化資料給 OPENCLAW_NEMO,避免 prompt 被截斷為 garbage
|
||
"""
|
||
response, provider, success, _tokens, _cost = await self._call_with_fallback(
|
||
prompt, alert_context=alert_context
|
||
)
|
||
return response, provider, success
|
||
|
||
# =========================================================================
|
||
# Fallback Chain
|
||
# =========================================================================
|
||
|
||
async def _call_with_fallback(
|
||
self,
|
||
prompt: str,
|
||
alert_context: dict | None = None,
|
||
signoz_metrics: GoldMetrics | None = None,
|
||
) -> tuple[str, str, bool, int, float]:
|
||
"""
|
||
依 AI_FALLBACK_ORDER 順序呼叫 AI
|
||
|
||
若 MOCK_MODE=True,直接回傳模擬結果。
|
||
若所有 Provider 失敗,fallback 到 Mock。
|
||
|
||
Returns:
|
||
tuple: (response, provider, success, total_tokens, cost_usd)
|
||
|
||
Phase 15.1: 整合 Langfuse LLMOps 追蹤
|
||
2026-03-29 ogt: 加入 Token/Cost 追蹤
|
||
2026-04-02 ogt: Phase 24 ADR-052 絞殺者包裝 — USE_AI_ROUTER 新舊並存
|
||
"""
|
||
# =================================================================
|
||
# Phase 24 ADR-052: 絞殺者分支 (Strangler Fig)
|
||
# USE_AI_ROUTER=true → 新 AIRouterExecutor 路由
|
||
# USE_AI_ROUTER=false → 舊 if/else fallback chain (現狀)
|
||
# 回滾: kubectl set env deployment/awoooi-api USE_AI_ROUTER=false
|
||
# Phase 24 C: Redis 狀態覆蓋 env var (/ai router on/off)
|
||
# =================================================================
|
||
# Redis 狀態優先 (Phase 24 C — 2026-04-03 ogt)
|
||
_use_ai_router = settings.USE_AI_ROUTER
|
||
try:
|
||
from src.services.ai_control import get_ai_router_enabled
|
||
_redis_override = await get_ai_router_enabled()
|
||
if _redis_override is not None:
|
||
_use_ai_router = _redis_override
|
||
except Exception:
|
||
pass
|
||
|
||
if _use_ai_router:
|
||
try:
|
||
# 2026-04-02 ogt: C2 修復 — 呼叫 AIRouter.route() 智慧路由 (非靜態 order)
|
||
# D1 意圖分類路由、D7 隱私保護 (DIAGNOSE/CODE_REVIEW 強制 local) 生效
|
||
from src.services.ai_router import (
|
||
IntentType,
|
||
get_ai_executor,
|
||
get_ai_router,
|
||
)
|
||
router = get_ai_router()
|
||
executor = get_ai_executor()
|
||
|
||
# Step 1: 取得路由決策 (含意圖分類 + 複雜度評分)
|
||
decision = await router.route(prompt, alert_context)
|
||
|
||
# Step 2: 從 RoutingDecision 建立 provider_order (主 + fallback)
|
||
# Phase 24 C: Redis primary_provider 覆蓋路由決策
|
||
provider_order = [decision.selected_provider.value] + [
|
||
p.value for p, _ in decision.fallback_chain
|
||
if p.value != decision.selected_provider.value
|
||
]
|
||
try:
|
||
from src.services.ai_control import (
|
||
get_primary_provider,
|
||
is_provider_disabled,
|
||
)
|
||
_primary = await get_primary_provider()
|
||
if _primary and _primary != decision.selected_provider.value:
|
||
# 把 primary 移到首位 (保留原始 fallback)
|
||
provider_order = [_primary] + [p for p in provider_order if p != _primary]
|
||
# 過濾被停用的 Provider
|
||
# C2 修復 (2026-04-03 首席架構師審查): Python 3.11 不支援 list comprehension 中 await
|
||
_filtered = []
|
||
for _p in provider_order:
|
||
if not await is_provider_disabled(_p):
|
||
_filtered.append(_p)
|
||
if _filtered:
|
||
provider_order = _filtered
|
||
except Exception as _e:
|
||
logger.warning("ai_control_override_failed", error=str(_e))
|
||
|
||
if self._alert_enforces_ollama_first(alert_context):
|
||
original_provider_order = list(provider_order)
|
||
provider_order = await self._resolve_alert_provider_order(
|
||
task_type=decision.intent.value if decision.intent else "diagnose",
|
||
alert_context=alert_context,
|
||
cloud_provider_order=original_provider_order,
|
||
)
|
||
logger.info(
|
||
"alert_ollama_first_provider_order",
|
||
original_provider_order=original_provider_order,
|
||
provider_order=provider_order,
|
||
cloud_fallback_allowed=self._cloud_fallback_allowed_for_alert(alert_context),
|
||
)
|
||
|
||
# Step 3: D7 隱私 — CODE_REVIEW 強制 local
|
||
# 2026-04-15 ogt: DIAGNOSE 移除 require_local(v4.3 決策:NIM 為主力,無隱私問題)
|
||
# ai_router.py v4.3 已明確:「NIM 從 Phase 22 起就是主力,無隱私問題」
|
||
# require_local=True 對 DIAGNOSE 只會讓所有 provider 被 privacy_skip → 永遠失敗
|
||
require_local = decision.intent in (IntentType.CODE_REVIEW,)
|
||
|
||
# 2026-04-29 ogt + Claude Code: 注入 task_type 讓 Ollama 用正確 timeout
|
||
# 根因: ai_providers/ollama.py:77 讀 context["task_type"] 決定 timeout
|
||
# - "diagnose"/"force_local" → OLLAMA_DIAGNOSE_TIMEOUT_SECONDS
|
||
# - 其他/未注入 → OPENCLAW_TIMEOUT=30s(不夠 qwen2.5:7b 推理)
|
||
# webhooks alert_context 從未注入 task_type → Ollama fallback 永遠 30s timeout
|
||
# 對齊 decision.intent 後 Ollama fallback 真正能跑完
|
||
exec_context = dict(alert_context) if alert_context else {}
|
||
if decision.intent == IntentType.DIAGNOSE:
|
||
exec_context["task_type"] = "diagnose"
|
||
if self._alert_enforces_ollama_first(alert_context):
|
||
exec_context.setdefault("task_type", "diagnose")
|
||
exec_context.setdefault("ollama_model", getattr(settings, "ALERT_OLLAMA_MODEL", "qwen3:14b"))
|
||
exec_context["allow_gcp_heavy_model"] = bool(
|
||
exec_context.get("allow_gcp_heavy_model", True)
|
||
)
|
||
exec_context["alert_requires_ollama_before_cloud"] = True
|
||
|
||
result = await executor.execute(
|
||
prompt=prompt,
|
||
provider_order=provider_order,
|
||
context=exec_context,
|
||
cache_ttl=3600,
|
||
require_local=require_local,
|
||
)
|
||
logger.info(
|
||
"phase24_ai_router_used",
|
||
provider=result.provider,
|
||
success=result.success,
|
||
latency_ms=round(result.latency_ms, 1),
|
||
intent=decision.intent.value,
|
||
routing_reason=decision.routing_reason,
|
||
)
|
||
return result.raw_response, result.provider, result.success, result.tokens, result.cost_usd
|
||
except Exception as e:
|
||
# AIRouter 失敗時 fallback 到舊路徑 (安全網)
|
||
logger.warning("phase24_ai_router_fallback_to_legacy", error=str(e))
|
||
|
||
# Mock Mode: 開發測試用
|
||
if settings.MOCK_MODE:
|
||
logger.info("mock_mode_enabled", using="mock_llm")
|
||
_mock_json, _rule_id = self._generate_mock_response(alert_context or {}, signoz_metrics)
|
||
if _rule_id == "generic_fallback":
|
||
import asyncio
|
||
|
||
from src.services.alert_rule_engine import auto_generate_rule
|
||
try:
|
||
asyncio.create_task(auto_generate_rule(
|
||
alert_context or {},
|
||
ollama_url=settings.OLLAMA_URL,
|
||
model=settings.OPENCLAW_DEFAULT_MODEL,
|
||
gemini_api_key=(
|
||
getattr(settings, "GEMINI_API_KEY", "")
|
||
if self._cloud_fallback_allowed_for_alert(alert_context)
|
||
else ""
|
||
),
|
||
))
|
||
except Exception as _e:
|
||
logger.warning("auto_rule_trigger_failed", error=str(_e))
|
||
return _mock_json, "mock", True, 0, 0.0
|
||
|
||
# Phase 15.1 + 15.3: Langfuse 追蹤整合 + SignOz Deep Linking
|
||
with langfuse_trace(
|
||
"openclaw_fallback_chain",
|
||
metadata={
|
||
"prompt_length": len(prompt),
|
||
"fallback_order": settings.AI_FALLBACK_ORDER,
|
||
"alert_fingerprint": (alert_context or {}).get("fingerprint", "unknown"),
|
||
},
|
||
) as trace:
|
||
# Phase 15.3: SignOz → Langfuse 反向連結
|
||
# 在當前 OTEL span 中記錄 Langfuse trace_id
|
||
if trace.langfuse_trace_id:
|
||
from opentelemetry import trace as otel_trace
|
||
|
||
from src.core.deep_linking import DeepLinking
|
||
|
||
current_span = otel_trace.get_current_span()
|
||
if current_span:
|
||
current_span.set_attribute("langfuse.trace_id", trace.langfuse_trace_id)
|
||
current_span.set_attribute(
|
||
"langfuse.trace_url",
|
||
DeepLinking.langfuse_trace_url(trace.langfuse_trace_id),
|
||
)
|
||
|
||
# Phase 13.2: Rate Limiter 整合 (2026-03-26)
|
||
# 防止雲端 API 用量暴衝,超限自動降級
|
||
from src.services.ai_rate_limiter import get_ai_rate_limiter
|
||
rate_limiter = get_ai_rate_limiter()
|
||
|
||
legacy_provider_order = list(settings.AI_FALLBACK_ORDER)
|
||
if self._alert_enforces_ollama_first(alert_context):
|
||
legacy_provider_order = ["ollama"]
|
||
if self._cloud_fallback_allowed_for_alert(alert_context):
|
||
legacy_provider_order.append("gemini")
|
||
logger.info(
|
||
"legacy_alert_ollama_first_provider_order",
|
||
provider_order=legacy_provider_order,
|
||
cloud_fallback_allowed=self._cloud_fallback_allowed_for_alert(alert_context),
|
||
)
|
||
|
||
for provider in legacy_provider_order:
|
||
# Rate Limit 檢查 (nvidia/gemini/claude 需檢查,ollama 不限)
|
||
# 2026-03-30 ogt: 加入 nvidia (RPM=5 限制)
|
||
if provider in ("nvidia", "gemini", "claude"):
|
||
allowed, reason = await rate_limiter.check_and_increment(provider)
|
||
if not allowed:
|
||
logger.warning(
|
||
"ai_rate_limit_skip",
|
||
provider=provider,
|
||
reason=reason,
|
||
)
|
||
continue # 跳過此 provider,嘗試下一個
|
||
|
||
logger.info("ai_provider_attempt", provider=provider)
|
||
|
||
start_time = time.time()
|
||
model_name = self._get_model_name(provider)
|
||
|
||
# 2026-03-29 ogt: Gemini 回傳 4 值 (含 token/cost),其他 Provider 補 0
|
||
total_tokens = 0
|
||
cost_usd = 0.0
|
||
|
||
if provider == "ollama":
|
||
response, success = await self._call_ollama(
|
||
prompt,
|
||
ollama_only=self._alert_enforces_ollama_first(alert_context),
|
||
)
|
||
elif provider == "gemini":
|
||
response, success, total_tokens, cost_usd = await self._call_gemini(prompt)
|
||
elif provider == "nvidia":
|
||
# 2026-03-29 ogt: 使用 NvidiaProvider.chat() (模組化規範)
|
||
from src.services.nvidia_provider import get_nvidia_provider
|
||
nvidia_provider = get_nvidia_provider()
|
||
response, success, total_tokens, cost_usd = await nvidia_provider.chat(prompt, use_json_mode=True)
|
||
elif provider == "claude":
|
||
response, success = await self._call_claude(prompt)
|
||
else:
|
||
logger.warning("unknown_ai_provider", provider=provider)
|
||
continue
|
||
|
||
latency_ms = (time.time() - start_time) * 1000
|
||
|
||
# Langfuse: 記錄每次 LLM 呼叫
|
||
trace.generation(
|
||
name=f"{provider}_call",
|
||
model=model_name,
|
||
input=prompt[:500], # 截斷避免過長
|
||
output=response[:500] if success else f"ERROR: {response[:200]}",
|
||
metadata={
|
||
"success": success,
|
||
"latency_ms": round(latency_ms, 2),
|
||
"provider": provider,
|
||
"total_tokens": total_tokens,
|
||
"cost_usd": cost_usd,
|
||
},
|
||
)
|
||
|
||
if success:
|
||
logger.info(
|
||
"ai_provider_success",
|
||
provider=provider,
|
||
latency_ms=latency_ms,
|
||
total_tokens=total_tokens,
|
||
cost_usd=f"${cost_usd:.6f}",
|
||
)
|
||
# Langfuse: 記錄成功評分
|
||
trace.score(name="provider_success", value=1.0, comment=f"Success via {provider}")
|
||
|
||
# 2026-03-29 ogt: 記錄累積成本 (Gemini/Claude)
|
||
if cost_usd > 0:
|
||
await rate_limiter.record_cost(provider, cost_usd)
|
||
|
||
return response, provider, True, total_tokens, cost_usd
|
||
|
||
logger.warning("ai_provider_failed_fallback", provider=provider, latency_ms=latency_ms)
|
||
|
||
# 所有 Provider 失敗時,fallback 到 Mock (優雅降級)
|
||
logger.warning("all_providers_failed_using_mock", fallback="mock_llm")
|
||
trace.score(name="provider_success", value=0.0, comment="All providers failed, using mock")
|
||
_mock_json, _rule_id = self._generate_mock_response(alert_context or {}, signoz_metrics)
|
||
if _rule_id == "generic_fallback":
|
||
import asyncio
|
||
|
||
from src.services.alert_rule_engine import auto_generate_rule
|
||
try:
|
||
asyncio.create_task(auto_generate_rule(
|
||
alert_context or {},
|
||
ollama_url=settings.OLLAMA_URL,
|
||
model=settings.OPENCLAW_DEFAULT_MODEL,
|
||
gemini_api_key=(
|
||
getattr(settings, "GEMINI_API_KEY", "")
|
||
if self._cloud_fallback_allowed_for_alert(alert_context)
|
||
else ""
|
||
),
|
||
))
|
||
except Exception as _e:
|
||
logger.warning("auto_rule_trigger_failed", error=str(_e))
|
||
return _mock_json, "mock_fallback", True, 0, 0.0
|
||
|
||
def _get_model_name(self, provider: str) -> str:
|
||
"""取得 provider 對應的模型名稱 (從 ModelRegistry)"""
|
||
registry = get_model_registry()
|
||
return registry.get_model(provider, "rca")
|
||
|
||
# =========================================================================
|
||
# Response Parsing (防禦性解析)
|
||
# =========================================================================
|
||
|
||
def _extract_json_from_response(self, text: str) -> str | None:
|
||
"""從 LLM 回應中提取 JSON (含啟發式修補)"""
|
||
# 0. 清理開頭結尾空白
|
||
text = text.strip()
|
||
if not text:
|
||
return None
|
||
|
||
# 1. 嘗試直接解析
|
||
try:
|
||
json.loads(text)
|
||
return text
|
||
except json.JSONDecodeError:
|
||
pass
|
||
|
||
# 2. 嘗試從 markdown code block 提取
|
||
patterns = [
|
||
r"```json\s*([\s\S]*?)\s*```",
|
||
r"```\s*([\s\S]*?)\s*```",
|
||
r"(\{[\s\S]*\})", # 貪婪匹配最大括號對
|
||
]
|
||
|
||
for pattern in patterns:
|
||
match = re.search(pattern, text)
|
||
if match:
|
||
candidate = match.group(1) if "(" in pattern else match.group(0)
|
||
candidate = candidate.strip()
|
||
try:
|
||
json.loads(candidate)
|
||
return candidate
|
||
except json.JSONDecodeError:
|
||
# 3. 啟發式修補: 如果結尾缺少括號,嘗試補齊
|
||
if candidate.startswith("{") and not candidate.endswith("}"):
|
||
for i in range(1, 5): # 嘗試補 1-5 個括號/引號
|
||
try:
|
||
repaired = candidate + '"' * (i - 1) + "}" * i
|
||
json.loads(repaired)
|
||
logger.info("json_repaired_heuristically", level=i)
|
||
return repaired
|
||
except json.JSONDecodeError:
|
||
continue
|
||
continue
|
||
|
||
# 4. 極端情況: 找出最後一個有效 key
|
||
if "{" in text:
|
||
start_idx = text.find("{")
|
||
candidate = text[start_idx:]
|
||
# 暴力去除非法尾綴 (如 \t\t...)
|
||
candidate = re.sub(r"[ \t\r\n]+$", "", candidate)
|
||
if not candidate.endswith("}"):
|
||
candidate += '"}' # 嘗試最簡單的閉合
|
||
try:
|
||
json.loads(candidate)
|
||
return candidate
|
||
except json.JSONDecodeError:
|
||
pass
|
||
|
||
return None
|
||
|
||
def _validate_deployment_inventory(
|
||
self,
|
||
result: "OpenClawDecision | None",
|
||
k8s_inventory: str,
|
||
k8s_ns: str,
|
||
) -> None:
|
||
"""
|
||
2026-04-19 ogt + Claude Opus 4.7 (抽取自 analyze_alert):
|
||
幻覺 deployment 名偵測與降級。雙路徑共用(analyze_alert + generate_incident_proposal)。
|
||
|
||
根因: NEMOTRON 即使 prompt 含 inventory 仍會拿 namespace 當 deployment 名
|
||
→ 執行 kubectl rollout restart deployment/awoooi-prod → "not found"
|
||
修復: 正則抽出 kubectl 指令的 deployment 名,對照 inventory 白名單;
|
||
不在白名單 → 降級為 NO_ACTION + 轉純調查 get deploy + 信心 0。
|
||
"""
|
||
if not result or not k8s_inventory:
|
||
return
|
||
_inventory_names = {n.strip() for n in k8s_inventory.split(",") if n.strip()}
|
||
if not _inventory_names:
|
||
return
|
||
_kcmd = (result.kubectl_command or "").lower()
|
||
import re as _re
|
||
_m = _re.search(r"deployment[/\s]+([a-z0-9][a-z0-9-]*)", _kcmd)
|
||
if not _m:
|
||
return
|
||
_deploy_guess = _m.group(1)
|
||
if _deploy_guess in _inventory_names:
|
||
return
|
||
|
||
logger.warning(
|
||
"openclaw_deployment_hallucination_detected",
|
||
hallucinated=_deploy_guess,
|
||
inventory=sorted(_inventory_names),
|
||
original_kubectl_cmd=result.kubectl_command,
|
||
original_action=(
|
||
result.suggested_action.value
|
||
if hasattr(result.suggested_action, "value")
|
||
else str(result.suggested_action)
|
||
),
|
||
namespace=k8s_ns,
|
||
)
|
||
# 降級為安全調查動作,不執行破壞性操作
|
||
try:
|
||
result.kubectl_command = f"kubectl get deploy -n {k8s_ns}"
|
||
except Exception:
|
||
pass
|
||
try:
|
||
result.target_resource = "unknown(hallucinated)"
|
||
except Exception:
|
||
pass
|
||
try:
|
||
result.suggested_action = SuggestedAction.NO_ACTION
|
||
except Exception:
|
||
pass
|
||
try:
|
||
result.action_title = f"[安全降級] 調查 {k8s_ns} 真實資源狀態"
|
||
except Exception:
|
||
pass
|
||
try:
|
||
result.description = (
|
||
f"[安全降級] 原 LLM 建議的 deployment '{_deploy_guess}' 不在叢集 inventory "
|
||
f"({', '.join(sorted(_inventory_names))})。"
|
||
f"已降級為純調查動作(kubectl get deploy),請手動確認實際問題資源。"
|
||
)
|
||
except Exception:
|
||
pass
|
||
try:
|
||
result.confidence = 0.0
|
||
except Exception:
|
||
pass
|
||
# 2026-04-21 ogt: 降級後風險強制 LOW,避免 NO_ACTION 因原 HIGH/CRITICAL risk
|
||
# 留在 PENDING 等 Telegram 批准(已無執行內容,等待毫無意義)
|
||
try:
|
||
result.risk_level = AIRiskLevel.LOW
|
||
except Exception:
|
||
pass
|
||
|
||
def _parse_analysis_result(self, raw_response: str) -> OpenClawDecision | None:
|
||
"""
|
||
解析 LLM 分析結果 - 使用 Pydantic Schema Enforcement
|
||
|
||
關鍵:blast_radius 為 REQUIRED,使用 AIBlastRadius Pydantic 模型驗證
|
||
"""
|
||
json_str = self._extract_json_from_response(raw_response)
|
||
if not json_str:
|
||
logger.error("json_extraction_failed", raw_response=raw_response[:200])
|
||
return None
|
||
|
||
try:
|
||
data = json.loads(json_str)
|
||
|
||
# Step 1: 確保 blast_radius 存在且為正確格式
|
||
if "blast_radius" not in data or not isinstance(data["blast_radius"], dict):
|
||
data["blast_radius"] = {
|
||
"affected_pods": 1,
|
||
"estimated_downtime": "~30s",
|
||
"related_services": data.get("affected_services", []),
|
||
"data_impact": "NONE"
|
||
}
|
||
else:
|
||
# 確保 blast_radius 內的必填欄位存在
|
||
br = data["blast_radius"]
|
||
if "affected_pods" not in br:
|
||
br["affected_pods"] = 1
|
||
if "estimated_downtime" not in br:
|
||
br["estimated_downtime"] = "~30s"
|
||
if "related_services" not in br:
|
||
br["related_services"] = data.get("affected_services", [])
|
||
if "data_impact" not in br:
|
||
br["data_impact"] = "NONE"
|
||
|
||
# Step 2: 填補其他可選欄位
|
||
if "action_title" not in data:
|
||
data["action_title"] = data.get("action", "未知操作")
|
||
if "target_resource" not in data:
|
||
data["target_resource"] = "unknown"
|
||
if "suggested_action" not in data:
|
||
data["suggested_action"] = "NO_ACTION"
|
||
|
||
# Step 2.5: 2026-04-01 Claude Code - 斷片補全 (信心度必須誠實)
|
||
# 🔴 禁止填入假信心度!截斷 = 0.0,讓 auto-approve 正確判斷
|
||
# 2026-04-12 ogt: NIM/Ollama 有時回傳字串 "0.85",先嘗試 parse 再判斷
|
||
if "confidence" in data and isinstance(data["confidence"], str):
|
||
try:
|
||
data["confidence"] = float(data["confidence"])
|
||
except (ValueError, TypeError):
|
||
data.pop("confidence", None)
|
||
if "confidence" not in data or not isinstance(data["confidence"], int | float):
|
||
data["confidence"] = 0.0 # 截斷/缺失 → 0.0,不可偽造
|
||
if "risk_level" not in data:
|
||
data["risk_level"] = "low"
|
||
# 2026-04-19 ogt + Claude Opus 4.7 修 AP-3:
|
||
# primary_responsibility 有時 LLM 填空字串/None → resp_display 顯示「❓ 未知」
|
||
# 強制正規化: 空/None/不在白名單 → 用 kubectl 有無推 INFRA 或 BE (非「未知」)
|
||
_valid_resp = {"FE", "BE", "INFRA", "DB", "COLLAB"}
|
||
_cur_resp = str(data.get("primary_responsibility") or "").strip().upper()
|
||
if _cur_resp not in _valid_resp:
|
||
data["primary_responsibility"] = "INFRA" if "kubectl" in str(data) else "BE"
|
||
if "suggested_action" not in data:
|
||
data["suggested_action"] = "RESTART_DEPLOYMENT" if "restart" in str(data).lower() else "NO_ACTION"
|
||
if "reasoning" not in data:
|
||
data["reasoning"] = "AI 產出欄位缺失,系統自動補全以維持運作。"
|
||
|
||
# Step 3: 使用 Pydantic 驗證 (會自動正規化 risk_level, data_impact 等)
|
||
decision = OpenClawDecision(**data)
|
||
|
||
logger.info(
|
||
"pydantic_validation_success",
|
||
action_title=decision.action_title,
|
||
risk_level=decision.risk_level.value,
|
||
blast_radius_pods=decision.blast_radius.affected_pods,
|
||
)
|
||
|
||
return decision
|
||
|
||
except Exception as e:
|
||
logger.error(
|
||
"pydantic_validation_failed",
|
||
error=str(e),
|
||
json_str=json_str[:300],
|
||
)
|
||
return None
|
||
|
||
# =========================================================================
|
||
# Main Analysis Methods
|
||
# =========================================================================
|
||
|
||
async def analyze_alert(
|
||
self,
|
||
alert_context: dict,
|
||
) -> tuple[LLMAnalysisResult | None, str, str, GoldMetrics | None, str, int, float]:
|
||
"""
|
||
分析告警並產生 RCA 結果 (含 SignOz 整合)
|
||
|
||
Args:
|
||
alert_context: 告警上下文 (alert_type, severity, target_resource, etc.)
|
||
|
||
Returns:
|
||
(analysis_result, ai_provider, raw_response, signoz_metrics, signoz_trace_url, total_tokens, cost_usd)
|
||
|
||
2026-03-29 ogt: 加入 Token/Cost 追蹤
|
||
"""
|
||
# Step 0: 擷取 SignOz 上下文
|
||
service_name = alert_context.get("target_resource", "unknown")
|
||
namespace = alert_context.get("namespace", "default")
|
||
|
||
signoz_metrics, signoz_trace_url = await self.get_signoz_context(
|
||
service_name=service_name,
|
||
namespace=namespace,
|
||
)
|
||
|
||
# 將 SignOz 數據加入 prompt
|
||
signoz_context = ""
|
||
if signoz_metrics:
|
||
signoz_context = f"""
|
||
## 📊 SignOz Real-time Metrics (Last 10 min)
|
||
{signoz_metrics.to_summary()}
|
||
|
||
Trace URL: {signoz_trace_url}
|
||
"""
|
||
|
||
# Step 0.5: 擷取 K8s 叢集真實資源清單(Checkpoint-2 webhook path)
|
||
# 2026-04-17 ogt + Claude Sonnet 4.6: 防止 NemoTron 幻覺 deployment/awoooi-service
|
||
# 根因:webhook path 沒有叢集上下文 → LLM 盲猜資源名稱 → kubectl not found → trust 0 永遠
|
||
# 修復:每次分析前先拉真實 Deployment 清單,注入 prompt 強制 LLM 對齊
|
||
_k8s_ns = alert_context.get("namespace", "awoooi-prod")
|
||
_k8s_inventory = await _fetch_k8s_inventory_for_openclaw(namespace=_k8s_ns)
|
||
k8s_section = (
|
||
f"\n\n## 🔒 叢集實際資源清單({_k8s_ns})\n"
|
||
f"kubectl_command 與 target_resource **必須**從以下名稱選擇,不可自行編造:\n"
|
||
f"{_k8s_inventory}\n"
|
||
if _k8s_inventory
|
||
else "\n\n## ⚠️ 無法取得叢集清單,target_resource 請依 alertname 推斷,勿編造。\n"
|
||
)
|
||
|
||
# P2.1 fix 2026-04-24 ogt + Claude Sonnet 4.6: 提取 MCP evidence_summary 注入 prompt
|
||
# diagnosis_context 由 decision_manager 在呼叫前填入(pre_decision_investigator 產出)
|
||
_raw_evidence = alert_context.get("diagnosis_context", "") or ""
|
||
if _raw_evidence and not _raw_evidence.startswith("<raw_evidence>"):
|
||
_raw_evidence = f"<raw_evidence>\n{_raw_evidence}\n</raw_evidence>"
|
||
evidence_section = f"\n\n## 🔬 MCP 實時環境證據\n{_raw_evidence}\n" if _raw_evidence else ""
|
||
|
||
# 格式化告警為 Prompt (2026-03-31 ogt: 強力截斷以符合 NVIDIA 4K 限制)
|
||
# 優先保留 System Prompt,截斷 Alert Data
|
||
available_len = 3500 - len(OPENCLAW_SYSTEM_PROMPT) - len(signoz_context) - len(k8s_section) - len(evidence_section)
|
||
if available_len < 500:
|
||
# 如果 SignOz 太長,也截斷它
|
||
signoz_context = signoz_context[:500] + "... (truncated)"
|
||
available_len = 3500 - len(OPENCLAW_SYSTEM_PROMPT) - len(signoz_context) - len(k8s_section) - len(evidence_section)
|
||
|
||
alert_json = json.dumps(alert_context, ensure_ascii=False, indent=2)
|
||
if len(alert_json) > available_len:
|
||
alert_json = alert_json[:available_len] + "... (truncated)"
|
||
|
||
full_prompt = OPENCLAW_SYSTEM_PROMPT + signoz_context + k8s_section + evidence_section + "\n\n## Alert Data:\n" + alert_json
|
||
|
||
logger.info(
|
||
"openclaw_alert_analysis_start",
|
||
alert_type=alert_context.get("alert_type"),
|
||
target=alert_context.get("target_resource"),
|
||
signoz_available=signoz_metrics is not None,
|
||
)
|
||
|
||
# 呼叫 LLM (使用快取層保護算力)
|
||
raw_response, provider, success, from_cache, total_tokens, cost_usd = await self._call_with_cache(
|
||
full_prompt,
|
||
alert_context,
|
||
signoz_metrics,
|
||
cache_ttl=1800, # 30 min for alert analysis
|
||
)
|
||
|
||
if not success:
|
||
logger.error("openclaw_all_providers_failed")
|
||
return None, provider, raw_response, signoz_metrics, signoz_trace_url, 0, 0.0
|
||
|
||
if from_cache:
|
||
logger.info("openclaw_using_cached_response", provider=provider)
|
||
|
||
logger.info(
|
||
"openclaw_llm_response_received",
|
||
provider=provider,
|
||
response_length=len(raw_response),
|
||
)
|
||
|
||
# 解析結果
|
||
result = self._parse_analysis_result(raw_response)
|
||
|
||
# 2026-04-18 → 2026-04-19: 幻覺 deployment 名偵測與降級 (共用 helper)
|
||
self._validate_deployment_inventory(result, _k8s_inventory, _k8s_ns)
|
||
|
||
if result:
|
||
logger.info(
|
||
"openclaw_analysis_complete",
|
||
action_title=result.action_title,
|
||
risk_level=result.risk_level,
|
||
confidence=result.confidence,
|
||
provider=provider,
|
||
signoz_integrated=signoz_metrics is not None,
|
||
)
|
||
else:
|
||
logger.warning(
|
||
"openclaw_analysis_parse_failed",
|
||
raw_response=raw_response[:300],
|
||
)
|
||
|
||
return result, provider, raw_response, signoz_metrics, signoz_trace_url, total_tokens, cost_usd
|
||
|
||
# =========================================================================
|
||
# Phase 6.4: LLM Proposal Generation
|
||
# =========================================================================
|
||
|
||
async def generate_incident_proposal(
|
||
self,
|
||
incident_id: str,
|
||
severity: str,
|
||
signals: list[dict],
|
||
affected_services: list[str],
|
||
expert_context: dict | None = None,
|
||
) -> tuple[dict | None, str, bool]:
|
||
"""
|
||
為 Incident 生成 LLM-based 修復提案
|
||
|
||
Phase 6.4: 賦予大腦「生成解決方案」的思考能力
|
||
2026-03-27: 整合 Expert System 診斷上下文
|
||
|
||
Args:
|
||
incident_id: Incident ID
|
||
severity: 嚴重度 (P0/P1/P2/P3)
|
||
signals: 關聯的告警訊號
|
||
affected_services: 受影響服務
|
||
expert_context: Expert System 初步診斷 (可選)
|
||
- initial_diagnosis: 規則匹配結果
|
||
- diagnosis_description: 診斷描述
|
||
- suggested_diagnosis_commands: 建議診斷指令
|
||
- expert_confidence: 信心分數
|
||
- requires_human_review: 是否需人工介入
|
||
|
||
Returns:
|
||
(proposal_dict, provider, success)
|
||
proposal_dict 包含:
|
||
- action: 建議動作
|
||
- description: 動作描述
|
||
- kubectl_command: kubectl 指令
|
||
- risk_level: 風險等級
|
||
- reasoning: LLM 推理過程
|
||
"""
|
||
# 建構 prompt (2026-03-31 ogt: Nemotron-mini context 較小,限制數量與長度)
|
||
signal_summary = "\n".join([
|
||
f"- {s.get('alert_name', 'unknown')}: {str(s.get('description', 'N/A'))[:100]}..."
|
||
for s in signals[:3] # 最多 3 筆,每筆最多 100 字元
|
||
])
|
||
|
||
target = affected_services[0] if affected_services else "unknown-service"
|
||
|
||
# 擷取 SignOz 指標
|
||
signoz_metrics, signoz_trace_url = await self.get_signoz_context(
|
||
service_name=target,
|
||
namespace="awoooi-prod",
|
||
)
|
||
|
||
signoz_context = ""
|
||
if signoz_metrics:
|
||
signoz_context = f"""
|
||
## 📊 SignOz Real-time Metrics
|
||
{signoz_metrics.to_summary()}
|
||
"""
|
||
|
||
# 2026-03-27: 整合 Expert System 診斷上下文
|
||
# 2026-03-26: ADR-030 Phase 2 - 加入 K8s/SignOz 診斷上下文
|
||
expert_diagnosis_context = ""
|
||
if expert_context:
|
||
diagnosis_cmds = expert_context.get("suggested_diagnosis_commands", [])
|
||
diagnosis_cmds_str = "\n".join([f" - `{cmd}`" for cmd in diagnosis_cmds]) if diagnosis_cmds else " - (無)"
|
||
|
||
# ADR-030: 加入完整診斷上下文 (如果有),並限制長度以符合 4K Context
|
||
full_diagnosis = str(expert_context.get("diagnosis_context", ""))[:800]
|
||
if len(str(expert_context.get("diagnosis_context", ""))) > 800:
|
||
full_diagnosis += "... (truncated)"
|
||
diagnosis_signals = expert_context.get("diagnosis_signals", [])
|
||
signals_summary = ""
|
||
if diagnosis_signals:
|
||
signals_summary = "\n".join([
|
||
f" - [{s.get('severity', 'info').upper()}] {s.get('source', 'unknown')}: {s.get('message', 'N/A')[:100]}"
|
||
for s in diagnosis_signals[:5]
|
||
])
|
||
|
||
expert_diagnosis_context = f"""
|
||
## 🔍 Expert System Initial Diagnosis
|
||
- **Matched Rule**: {expert_context.get('initial_diagnosis', 'unknown')}
|
||
- **Diagnosis**: {expert_context.get('diagnosis_description', 'N/A')}
|
||
- **Confidence**: {expert_context.get('expert_confidence', 0.0):.0%}
|
||
- **Requires Human Review**: {'Yes' if expert_context.get('requires_human_review') else 'No'}
|
||
- **Suggested Diagnosis Commands**:
|
||
{diagnosis_cmds_str}
|
||
|
||
{f'''## 🩺 K8s/SignOz Deep Diagnosis (ADR-030)
|
||
{full_diagnosis}
|
||
|
||
### Diagnosis Signals
|
||
{signals_summary if signals_summary else " - (No signals detected)"}
|
||
''' if full_diagnosis else ''}
|
||
|
||
**IMPORTANT**: The Expert System and Diagnostic Aggregator have provided context.
|
||
Consider this data but apply your own analysis. If Expert says "human review required",
|
||
provide diagnostic guidance rather than automated fixes.
|
||
"""
|
||
|
||
# 2026-03-31 ogt: 針對 NVIDIA Nemo-4B 使用超精簡 Prompt
|
||
registry = get_model_registry()
|
||
is_nemo = "nvidia" in (registry.get_model("nvidia", "rca") or "").lower()
|
||
base_prompt = NEMOTRON_SYSTEM_PROMPT if is_nemo else OPENCLAW_SYSTEM_PROMPT
|
||
|
||
proposal_prompt = f"""{base_prompt}
|
||
|
||
{signoz_context}
|
||
{expert_diagnosis_context}
|
||
|
||
## 🚨 Incident Context
|
||
- **Incident ID**: {incident_id}
|
||
- **Severity**: {severity}
|
||
- **Affected Services**: {', '.join(affected_services)}
|
||
- **Signal Count**: {len(signals)}
|
||
|
||
## 📋 Alert Signals
|
||
{signal_summary}
|
||
|
||
## 🎯 Your Task
|
||
Based on the above incident, signals, and Expert System diagnosis, generate a remediation proposal.
|
||
You MUST respond with ONLY valid JSON following the schema above.
|
||
Focus on:
|
||
1. Root cause analysis based on signals, SignOz data, and Expert diagnosis
|
||
2. Specific kubectl command to remediate (or diagnostic command if root cause unclear)
|
||
3. Risk assessment for the proposed action
|
||
4. Preventive recommendations
|
||
5. If Expert System flagged "human review required", prioritize diagnostic commands over fixes
|
||
"""
|
||
|
||
logger.info(
|
||
"proposal_generation_start",
|
||
incident_id=incident_id,
|
||
severity=severity,
|
||
signal_count=len(signals),
|
||
signoz_available=signoz_metrics is not None,
|
||
)
|
||
|
||
# 2026-04-01 ogt: 架構鐵律 — OpenClaw (Nemo) 是 AI 大腦,優先委派仲裁
|
||
# AWOOOI K8s Pod 不直接打 Ollama/NVIDIA,避免並發逾時
|
||
openclaw_result = await self._call_openclaw_analyze(
|
||
incident_id, severity, signals, affected_services, expert_context
|
||
)
|
||
if openclaw_result is not None:
|
||
return openclaw_result, "openclaw_nemo", True
|
||
|
||
# 使用快取呼叫 LLM
|
||
alert_context = {
|
||
"incident_id": incident_id,
|
||
"alert_type": signals[0].get("alert_name", "incident") if signals else "incident",
|
||
"target_resource": target,
|
||
"severity": severity,
|
||
}
|
||
|
||
# 2026-03-29 ogt: 修復 tuple unpacking (Token/Cost 追蹤)
|
||
raw_response, provider, success, from_cache, ai_tokens, ai_cost = await self._call_with_cache(
|
||
proposal_prompt,
|
||
alert_context,
|
||
signoz_metrics,
|
||
cache_ttl=3600, # 1 hour for proposals
|
||
)
|
||
|
||
if not success:
|
||
logger.error(
|
||
"proposal_generation_failed",
|
||
incident_id=incident_id,
|
||
provider=provider,
|
||
)
|
||
return None, provider, False
|
||
|
||
# 解析 LLM 結果
|
||
result = self._parse_analysis_result(raw_response)
|
||
|
||
# 2026-04-19 ogt + Claude Opus 4.7: 同 analyze_alert 也需幻覺驗證
|
||
# 此路徑沒有 inventory 預抓,動態抓
|
||
_k8s_ns_for_validate = alert_context.get("namespace", "awoooi-prod") if "alert_context" in dir() else "awoooi-prod"
|
||
try:
|
||
_k8s_inv = await _fetch_k8s_inventory_for_openclaw(namespace=_k8s_ns_for_validate)
|
||
except Exception:
|
||
_k8s_inv = ""
|
||
self._validate_deployment_inventory(result, _k8s_inv, _k8s_ns_for_validate)
|
||
|
||
if result:
|
||
logger.info(
|
||
"proposal_generation_complete",
|
||
incident_id=incident_id,
|
||
action_title=result.action_title,
|
||
risk_level=result.risk_level,
|
||
provider=provider,
|
||
from_cache=from_cache,
|
||
)
|
||
|
||
# 轉換為 proposal dict (optimization_suggestions 是 list[dict])
|
||
# 2026-03-29 ogt: 加入 ai_tokens/ai_cost 追蹤
|
||
proposal_dict = {
|
||
"action": result.action_title,
|
||
"description": result.description,
|
||
"kubectl_command": result.kubectl_command,
|
||
"target_resource": result.target_resource,
|
||
"namespace": result.namespace,
|
||
"risk_level": result.risk_level,
|
||
"reasoning": result.reasoning,
|
||
"confidence": result.confidence,
|
||
"primary_responsibility": result.primary_responsibility,
|
||
"optimization_suggestions": [
|
||
{
|
||
"type": s.get("type", "UNKNOWN"),
|
||
"description": s.get("description", ""),
|
||
"kubectl_or_config": s.get("kubectl_or_config", ""),
|
||
}
|
||
for s in result.optimization_suggestions
|
||
],
|
||
"signoz_correlation": result.signoz_correlation,
|
||
"from_cache": from_cache,
|
||
"provider": provider,
|
||
"model": self._get_model_name(provider), # 2026-04-04 ogt: 底層模型名稱
|
||
"ai_tokens": ai_tokens,
|
||
"ai_cost": ai_cost,
|
||
}
|
||
return proposal_dict, provider, True
|
||
|
||
logger.warning(
|
||
"proposal_parse_failed",
|
||
incident_id=incident_id,
|
||
raw_response=raw_response[:300],
|
||
)
|
||
return None, provider, False
|
||
|
||
# =========================================================================
|
||
# Phase 22: OpenClaw + Nemotron 協作 (ADR-044)
|
||
# 2026-03-31 Claude Code: 統帥批准實作
|
||
# =========================================================================
|
||
|
||
async def _maybe_run_openclaw_agent_loop_shadow(
|
||
self,
|
||
*,
|
||
proposal: dict,
|
||
incident_id: str,
|
||
severity: str,
|
||
signals: list[dict],
|
||
affected_services: list[str],
|
||
expert_context: dict | None = None,
|
||
) -> None:
|
||
"""
|
||
ADR-105 P1: read-only Agent Loop shadow investigation.
|
||
|
||
This is intentionally non-decisive: it proves MCP tool_use/audit wiring
|
||
with local models and read-only tools, then falls back silently to the
|
||
regular proposal when disabled or unavailable.
|
||
"""
|
||
if not settings.ENABLE_OPENCLAW_AGENT_LOOP_SHADOW:
|
||
return
|
||
|
||
try:
|
||
from src.plugins.mcp.registry import get_provider_registry
|
||
from src.services.ai_providers.agent_loop import AgentToolExecutor
|
||
from src.services.ai_providers.permissions import is_read_only_tool
|
||
from src.services.ai_router import get_ai_registry
|
||
|
||
ai_registry = get_ai_registry()
|
||
provider = ai_registry.get("ollama") or ai_registry.get("ollama_local")
|
||
if provider is None or not hasattr(provider, "analyze_with_tools"):
|
||
logger.warning(
|
||
"openclaw_agent_loop_shadow_skipped",
|
||
incident_id=incident_id,
|
||
reason="no_local_tool_provider",
|
||
)
|
||
return
|
||
|
||
mcp_registry = get_provider_registry()
|
||
providers = {p.name: p for p in mcp_registry.all()}
|
||
allowed_servers = {"kubernetes", "prometheus", "signoz", "database", "rag", "grafana"}
|
||
available_tools = []
|
||
for mcp_provider in providers.values():
|
||
if mcp_provider.name not in allowed_servers:
|
||
continue
|
||
try:
|
||
provider_tools = await mcp_provider.list_tools()
|
||
except Exception as exc:
|
||
logger.warning(
|
||
"openclaw_agent_loop_tool_list_failed",
|
||
incident_id=incident_id,
|
||
provider=mcp_provider.name,
|
||
error=str(exc),
|
||
)
|
||
continue
|
||
available_tools.extend(
|
||
tool for tool in provider_tools
|
||
if tool.server_name in allowed_servers and is_read_only_tool(tool)
|
||
)
|
||
|
||
if not available_tools:
|
||
logger.warning(
|
||
"openclaw_agent_loop_shadow_skipped",
|
||
incident_id=incident_id,
|
||
reason="no_readonly_tools",
|
||
)
|
||
return
|
||
|
||
executor = AgentToolExecutor(
|
||
available_tools=available_tools,
|
||
providers=providers,
|
||
agent_role="openclaw",
|
||
incident_id=incident_id,
|
||
flywheel_node="reason",
|
||
)
|
||
shadow_prompt = self._build_agent_loop_shadow_prompt(
|
||
proposal=proposal,
|
||
incident_id=incident_id,
|
||
severity=severity,
|
||
signals=signals,
|
||
affected_services=affected_services,
|
||
expert_context=expert_context,
|
||
)
|
||
result = await provider.analyze_with_tools(
|
||
prompt=shadow_prompt,
|
||
available_tools=available_tools,
|
||
tool_executor=executor.execute,
|
||
max_iterations=settings.OPENCLAW_AGENT_LOOP_MAX_ITERATIONS,
|
||
agent_role="openclaw",
|
||
context={
|
||
"incident_id": incident_id,
|
||
"severity": severity,
|
||
"task_type": "diagnose",
|
||
},
|
||
)
|
||
structured_shadow = self._parse_agent_loop_shadow_response(result.raw_response or "")
|
||
proposal["agent_loop_shadow"] = {
|
||
"enabled": True,
|
||
"success": result.success,
|
||
"provider": result.provider,
|
||
"tokens": result.tokens,
|
||
"latency_ms": round(result.latency_ms, 1),
|
||
"error": result.error,
|
||
"decision_impact": "none",
|
||
"structured": structured_shadow,
|
||
"confidence_delta": structured_shadow.get("confidence_delta", 0.0),
|
||
"preview": (result.raw_response or "")[:700],
|
||
}
|
||
logger.info(
|
||
"openclaw_agent_loop_shadow_complete",
|
||
incident_id=incident_id,
|
||
provider=result.provider,
|
||
success=result.success,
|
||
tools_available=len(available_tools),
|
||
latency_ms=round(result.latency_ms, 1),
|
||
confidence_delta=structured_shadow.get("confidence_delta", 0.0),
|
||
parse_status=structured_shadow.get("parse_status"),
|
||
)
|
||
except Exception as exc:
|
||
logger.warning(
|
||
"openclaw_agent_loop_shadow_failed",
|
||
incident_id=incident_id,
|
||
error=str(exc),
|
||
)
|
||
|
||
@classmethod
|
||
def _parse_agent_loop_shadow_response(cls, raw_response: str) -> dict:
|
||
"""
|
||
Normalize read-only Agent Loop output into durable metadata.
|
||
|
||
The shadow result is intentionally non-decisive. Downstream code can
|
||
inspect this structure for quality review, but it must not override the
|
||
main proposal until ADR-105 canary graduation.
|
||
"""
|
||
text = (raw_response or "").strip()
|
||
if not text:
|
||
return {
|
||
"parse_status": "empty",
|
||
"root_cause_check": "",
|
||
"evidence_used": [],
|
||
"confidence_delta": 0.0,
|
||
"missing_evidence": [],
|
||
"human_or_ai_next_step": "",
|
||
}
|
||
|
||
payload = cls._extract_json_object(text)
|
||
if not isinstance(payload, dict):
|
||
return {
|
||
"parse_status": "unparsed",
|
||
"root_cause_check": "",
|
||
"evidence_used": [],
|
||
"confidence_delta": 0.0,
|
||
"missing_evidence": [],
|
||
"human_or_ai_next_step": "",
|
||
"raw_preview": text[:700],
|
||
}
|
||
|
||
return {
|
||
"parse_status": "ok",
|
||
"root_cause_check": cls._clip_shadow_text(payload.get("root_cause_check"), max_chars=500),
|
||
"evidence_used": cls._coerce_shadow_list(payload.get("evidence_used"), max_items=5),
|
||
"confidence_delta": cls._coerce_agent_loop_confidence_delta(
|
||
payload.get("confidence_delta", 0.0)
|
||
),
|
||
"missing_evidence": cls._coerce_shadow_list(payload.get("missing_evidence"), max_items=5),
|
||
"human_or_ai_next_step": cls._clip_shadow_text(
|
||
payload.get("human_or_ai_next_step"), max_chars=500
|
||
),
|
||
}
|
||
|
||
@staticmethod
|
||
def _extract_json_object(text: str) -> dict | None:
|
||
"""Extract the first JSON object from plain or fenced LLM output."""
|
||
candidates = [text]
|
||
fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, flags=re.DOTALL | re.IGNORECASE)
|
||
if fenced:
|
||
candidates.insert(0, fenced.group(1))
|
||
object_match = re.search(r"\{.*\}", text, flags=re.DOTALL)
|
||
if object_match:
|
||
candidates.append(object_match.group(0))
|
||
|
||
for candidate in candidates:
|
||
try:
|
||
parsed = json.loads(candidate)
|
||
except (TypeError, json.JSONDecodeError):
|
||
continue
|
||
if isinstance(parsed, dict):
|
||
return parsed
|
||
return None
|
||
|
||
@staticmethod
|
||
def _clip_shadow_text(value: object, *, max_chars: int) -> str:
|
||
if value is None:
|
||
return ""
|
||
return str(value).strip()[:max_chars]
|
||
|
||
@classmethod
|
||
def _coerce_shadow_list(cls, value: object, *, max_items: int) -> list[str]:
|
||
if value is None:
|
||
return []
|
||
if isinstance(value, list):
|
||
items = value
|
||
else:
|
||
items = [value]
|
||
normalized = []
|
||
for item in items:
|
||
clipped = cls._clip_shadow_text(item, max_chars=240)
|
||
if clipped:
|
||
normalized.append(clipped)
|
||
if len(normalized) >= max_items:
|
||
break
|
||
return normalized
|
||
|
||
@staticmethod
|
||
def _coerce_agent_loop_confidence_delta(value: object) -> float:
|
||
"""
|
||
Keep canary deltas conservative: metadata may lower confidence later,
|
||
but positive boosts are recorded as 0 until the shadow path graduates.
|
||
"""
|
||
try:
|
||
delta = float(value)
|
||
except (TypeError, ValueError):
|
||
return 0.0
|
||
return round(max(min(delta, 0.0), -0.15), 3)
|
||
|
||
def _build_agent_loop_shadow_prompt(
|
||
self,
|
||
*,
|
||
proposal: dict,
|
||
incident_id: str,
|
||
severity: str,
|
||
signals: list[dict],
|
||
affected_services: list[str],
|
||
expert_context: dict | None = None,
|
||
) -> str:
|
||
"""Build a compact read-only investigation prompt for Agent Loop shadow mode."""
|
||
return f"""你是 OpenClaw 的唯讀 shadow investigator。你可以使用 MCP 工具查證,但不得要求任何寫入、重啟、刪除或通知動作。
|
||
|
||
請只回傳 JSON:
|
||
{{
|
||
"root_cause_check": "你對目前根因的查證結論",
|
||
"evidence_used": ["最多 5 條具體證據"],
|
||
"confidence_delta": -0.1,
|
||
"missing_evidence": ["還缺什麼證據"],
|
||
"human_or_ai_next_step": "下一個安全步驟"
|
||
}}
|
||
|
||
Incident: {incident_id}
|
||
Severity: {severity}
|
||
Affected services: {json.dumps(affected_services, ensure_ascii=False)}
|
||
Signals: {json.dumps(signals[:5], ensure_ascii=False, default=str)}
|
||
Current proposal: {json.dumps(proposal, ensure_ascii=False, default=str)}
|
||
Expert context: {json.dumps(expert_context or {}, ensure_ascii=False, default=str)}
|
||
"""
|
||
|
||
async def generate_incident_proposal_with_tools(
|
||
self,
|
||
incident_id: str,
|
||
severity: str,
|
||
signals: list[dict],
|
||
affected_services: list[str],
|
||
expert_context: dict | None = None,
|
||
) -> tuple[dict | None, str, bool]:
|
||
"""
|
||
Phase 22: OpenClaw + Nemotron 協作生成修復提案
|
||
|
||
架構:
|
||
- OpenClaw = 仲裁者 (Arbitrator) - 決定「為什麼」和「風險等級」
|
||
- Nemotron = 執行者 (Executor) - 決定「怎麼做」和「具體指令」
|
||
|
||
觸發條件:
|
||
- LOW 風險 → 僅 OpenClaw,跳過 Nemotron
|
||
- MEDIUM/HIGH/CRITICAL → OpenClaw + Nemotron 雙軌
|
||
|
||
Args:
|
||
incident_id: Incident ID
|
||
severity: 嚴重度 (P0/P1/P2/P3)
|
||
signals: 關聯的告警訊號
|
||
affected_services: 受影響服務
|
||
expert_context: Expert System 初步診斷 (可選)
|
||
|
||
Returns:
|
||
(proposal_dict, provider, success)
|
||
proposal_dict 新增:
|
||
- nemotron_enabled: bool
|
||
- nemotron_tools: list[dict] (如果啟用)
|
||
- nemotron_validation: str
|
||
- nemotron_latency_ms: float
|
||
"""
|
||
# Feature Flag 檢查
|
||
if not settings.ENABLE_NEMOTRON_COLLABORATION:
|
||
logger.info(
|
||
"nemotron_collaboration_disabled",
|
||
incident_id=incident_id,
|
||
reason="Feature flag disabled",
|
||
)
|
||
return await self.generate_incident_proposal(
|
||
incident_id, severity, signals, affected_services, expert_context
|
||
)
|
||
|
||
# Step 1: OpenClaw 仲裁
|
||
proposal, provider, success = await self.generate_incident_proposal(
|
||
incident_id, severity, signals, affected_services, expert_context
|
||
)
|
||
|
||
if not success or proposal is None:
|
||
return proposal, provider, success
|
||
|
||
await self._maybe_run_openclaw_agent_loop_shadow(
|
||
proposal=proposal,
|
||
incident_id=incident_id,
|
||
severity=severity,
|
||
signals=signals,
|
||
affected_services=affected_services,
|
||
expert_context=expert_context,
|
||
)
|
||
|
||
# Step 2: 判斷是否需要 Nemotron
|
||
risk_level = proposal.get("risk_level", "low").lower()
|
||
if risk_level == "low":
|
||
proposal["nemotron_enabled"] = False
|
||
logger.info(
|
||
"nemotron_skipped_low_risk",
|
||
incident_id=incident_id,
|
||
risk_level=risk_level,
|
||
)
|
||
return proposal, provider, True
|
||
|
||
# Step 3: 呼叫 Nemotron Tool Calling — 🔴 必須等到有結果,不可跳過
|
||
# 2026-04-07 ogt: 統帥指示 Nemotron 不能跳過,必須等到處理完成
|
||
logger.info(
|
||
"nemotron_collaboration_start",
|
||
incident_id=incident_id,
|
||
risk_level=risk_level,
|
||
)
|
||
|
||
max_retries = 2
|
||
last_error = None
|
||
for attempt in range(1, max_retries + 1):
|
||
try:
|
||
nemotron_result = await self._call_nemotron_tools(
|
||
incident_id=incident_id,
|
||
reasoning=proposal.get("reasoning", ""),
|
||
target_resource=proposal.get("target_resource", ""),
|
||
suggested_action=proposal.get("action", ""),
|
||
namespace=proposal.get("namespace", "awoooi-prod"),
|
||
)
|
||
|
||
proposal["nemotron_enabled"] = True
|
||
proposal["nemotron_tools"] = nemotron_result.get("tools", [])
|
||
proposal["nemotron_validation"] = nemotron_result.get("validation", "⏳ 驗證中")
|
||
proposal["nemotron_latency_ms"] = nemotron_result.get("latency_ms", 0.0)
|
||
proposal["nemotron_tool_model"] = nemotron_result.get("tool_model", "")
|
||
proposal["nemotron_tool_backend"] = nemotron_result.get("tool_backend", "")
|
||
|
||
# 2026-04-09 Claude Sonnet 4.6: 將 Nemotron tool call 回填為 kubectl_command
|
||
# 根本問題修復:approval_records.action 需要可執行指令才能被 parse_operation_from_action 解析
|
||
_backfill_kubectl_command(proposal, proposal["nemotron_tools"])
|
||
|
||
logger.info(
|
||
"nemotron_collaboration_complete",
|
||
incident_id=incident_id,
|
||
tools_count=len(proposal["nemotron_tools"]),
|
||
validation=proposal["nemotron_validation"],
|
||
latency_ms=proposal["nemotron_latency_ms"],
|
||
attempt=attempt,
|
||
)
|
||
last_error = None
|
||
break # 成功,跳出重試迴圈
|
||
|
||
except Exception as e:
|
||
last_error = e
|
||
logger.warning(
|
||
"nemotron_collaboration_retry",
|
||
incident_id=incident_id,
|
||
error=str(e),
|
||
attempt=attempt,
|
||
max_retries=max_retries,
|
||
)
|
||
if attempt < max_retries:
|
||
import asyncio
|
||
await asyncio.sleep(2) # 重試前等 2 秒
|
||
|
||
# 重試全部失敗 — fallback 到 Gemini 模擬 tool calling
|
||
# 2026-04-08 ogt: NIM 完全不可用時,改用 Gemini 產生執行方案(不可跳過)
|
||
if last_error is not None:
|
||
logger.error(
|
||
"nemotron_collaboration_exhausted",
|
||
incident_id=incident_id,
|
||
error=str(last_error),
|
||
retries=max_retries,
|
||
)
|
||
logger.info("nemotron_fallback_gemini_start", incident_id=incident_id)
|
||
gemini_fallback_result = await self._call_nemotron_tools_via_gemini(
|
||
incident_id=incident_id,
|
||
reasoning=proposal.get("reasoning", ""),
|
||
target_resource=proposal.get("target_resource", ""),
|
||
suggested_action=proposal.get("action", ""),
|
||
namespace=proposal.get("namespace", "awoooi-prod"),
|
||
)
|
||
proposal["nemotron_enabled"] = True
|
||
proposal["nemotron_tools"] = gemini_fallback_result.get("tools", [])
|
||
proposal["nemotron_validation"] = gemini_fallback_result.get("validation", "⚠️ Gemini 代理")
|
||
proposal["nemotron_latency_ms"] = gemini_fallback_result.get("latency_ms", 0.0)
|
||
proposal["nemotron_tool_model"] = "gemini-fallback"
|
||
proposal["nemotron_tool_backend"] = "Gemini 雲端"
|
||
|
||
# 2026-04-09 Claude Sonnet 4.6: Gemini fallback 同樣回填 kubectl_command
|
||
_backfill_kubectl_command(proposal, proposal["nemotron_tools"])
|
||
|
||
return proposal, provider, True
|
||
|
||
async def _call_nemotron_tools(
|
||
self,
|
||
incident_id: str,
|
||
reasoning: str,
|
||
target_resource: str,
|
||
suggested_action: str,
|
||
namespace: str = "awoooi-prod",
|
||
) -> dict:
|
||
"""
|
||
呼叫 Nemotron 執行 Tool Calling
|
||
|
||
Args:
|
||
incident_id: Incident ID
|
||
reasoning: OpenClaw 推理結果
|
||
target_resource: 目標資源名稱
|
||
suggested_action: OpenClaw 建議的操作
|
||
namespace: K8s namespace
|
||
|
||
Returns:
|
||
{
|
||
"tools": [{"tool": str, "args": dict, "valid": bool}],
|
||
"validation": str,
|
||
"latency_ms": float
|
||
}
|
||
"""
|
||
import asyncio
|
||
|
||
from src.services.nvidia_provider import get_nvidia_provider
|
||
|
||
nvidia = get_nvidia_provider()
|
||
start_time = time.time()
|
||
|
||
# 建構 Tool Calling prompt
|
||
# 2026-04-09 Claude Code: 明確指示 deployment_name,防止 Nemotron 填 placeholder
|
||
tool_prompt = f"""根據以下 AI 分析結果,生成對應的 kubectl 操作指令:
|
||
|
||
## Incident 上下文
|
||
- Incident ID: {incident_id}
|
||
- 目標資源 (deployment_name): {target_resource}
|
||
- Namespace: {namespace}
|
||
|
||
## OpenClaw 分析
|
||
- 建議操作: {suggested_action}
|
||
- 推理過程: {reasoning[:500]}
|
||
|
||
## 你的任務
|
||
使用提供的工具生成 kubectl 操作。
|
||
**重要**: deployment_name 必須填入 "{target_resource}",不可使用 placeholder。
|
||
"""
|
||
|
||
# 定義可用 Tools (K8s 操作)
|
||
k8s_tools = [
|
||
{
|
||
"type": "function",
|
||
"function": {
|
||
"name": "restart_deployment",
|
||
"description": "重啟 Deployment (rollout restart)",
|
||
"parameters": {
|
||
"type": "object",
|
||
"properties": {
|
||
"deployment_name": {"type": "string"},
|
||
"namespace": {"type": "string", "default": "awoooi-prod"},
|
||
},
|
||
"required": ["deployment_name"],
|
||
},
|
||
},
|
||
},
|
||
{
|
||
"type": "function",
|
||
"function": {
|
||
"name": "scale_deployment",
|
||
"description": "調整 Deployment 副本數",
|
||
"parameters": {
|
||
"type": "object",
|
||
"properties": {
|
||
"deployment_name": {"type": "string"},
|
||
"replicas": {"type": "integer"},
|
||
"namespace": {"type": "string", "default": "awoooi-prod"},
|
||
},
|
||
"required": ["deployment_name", "replicas"],
|
||
},
|
||
},
|
||
},
|
||
{
|
||
"type": "function",
|
||
"function": {
|
||
"name": "delete_pod",
|
||
"description": "刪除 Pod (強制重建)",
|
||
"parameters": {
|
||
"type": "object",
|
||
"properties": {
|
||
"pod_name": {"type": "string"},
|
||
"namespace": {"type": "string", "default": "awoooi-prod"},
|
||
},
|
||
"required": ["pod_name"],
|
||
},
|
||
},
|
||
},
|
||
]
|
||
|
||
try:
|
||
# 2026-04-07 ogt: 統帥指示不可跳過 Nemotron,用 120 秒寬裕超時
|
||
timeout = 120
|
||
|
||
result = await asyncio.wait_for(
|
||
nvidia.tool_call(
|
||
messages=[{"role": "user", "content": tool_prompt}],
|
||
tools=k8s_tools,
|
||
),
|
||
timeout=timeout,
|
||
)
|
||
|
||
latency_ms = (time.time() - start_time) * 1000
|
||
|
||
# 解析 Tool Calling 結果
|
||
tools = []
|
||
validation_passed = True
|
||
|
||
if result and hasattr(result, "tool_calls") and result.tool_calls:
|
||
for tc in result.tool_calls:
|
||
tool_entry = {
|
||
"tool": tc.tool_name if hasattr(tc, "tool_name") else str(tc.get("name", "unknown")),
|
||
"args": tc.arguments if hasattr(tc, "arguments") else tc.get("arguments", {}),
|
||
"valid": tc.valid if hasattr(tc, "valid") else True,
|
||
}
|
||
tools.append(tool_entry)
|
||
if not tool_entry["valid"]:
|
||
validation_passed = False
|
||
elif result and isinstance(result, dict) and result.get("tool_calls"):
|
||
for tc in result["tool_calls"]:
|
||
tool_entry = {
|
||
"tool": tc.get("name", "unknown"),
|
||
"args": tc.get("arguments", {}),
|
||
"valid": True,
|
||
}
|
||
tools.append(tool_entry)
|
||
|
||
# 2026-04-09 Claude Code: 修正 Nemotron 回傳 placeholder 問題
|
||
# 若 deployment_name 是 placeholder 或空值,用 target_resource 覆蓋
|
||
_PLACEHOLDERS = {"<deployment_name>", "<target>", "", "unknown", "null", "None"}
|
||
for t in tools:
|
||
args = t.get("args", {})
|
||
if isinstance(args, dict):
|
||
dn = args.get("deployment_name", "")
|
||
if not dn or str(dn).strip("<>") in _PLACEHOLDERS or dn.startswith("<"):
|
||
args["deployment_name"] = target_resource
|
||
logger.info(
|
||
"nemotron_placeholder_corrected",
|
||
incident_id=incident_id,
|
||
original=dn,
|
||
corrected=target_resource,
|
||
)
|
||
|
||
validation_status = "✅ 驗證通過" if validation_passed and tools else "❌ 驗證失敗"
|
||
|
||
return {
|
||
"tools": tools,
|
||
"validation": validation_status,
|
||
"latency_ms": latency_ms,
|
||
}
|
||
|
||
except TimeoutError:
|
||
latency_ms = (time.time() - start_time) * 1000
|
||
logger.error(
|
||
"nemotron_tool_call_timeout",
|
||
incident_id=incident_id,
|
||
timeout_seconds=timeout,
|
||
)
|
||
# 超時也拋出,讓外層重試
|
||
raise
|
||
|
||
except Exception as e:
|
||
latency_ms = (time.time() - start_time) * 1000
|
||
logger.error(
|
||
"nemotron_tool_call_error",
|
||
incident_id=incident_id,
|
||
error=str(e),
|
||
)
|
||
raise
|
||
|
||
async def _call_nemotron_tools_via_gemini(
|
||
self,
|
||
incident_id: str,
|
||
reasoning: str,
|
||
target_resource: str,
|
||
suggested_action: str,
|
||
namespace: str = "awoooi-prod",
|
||
) -> dict:
|
||
"""
|
||
NIM 完全不可用時,由 Gemini 代理產生 tool calling 執行方案。
|
||
2026-04-08 ogt: NIM timeout 後的唯一 fallback,不可跳過。
|
||
|
||
Returns: {"tools": [...], "validation": str, "latency_ms": float}
|
||
"""
|
||
import time as _time
|
||
start_time = _time.time()
|
||
|
||
prompt = f"""你是 K8s SRE 專家。根據以下分析,輸出對應的 kubectl 操作指令(JSON 格式)。
|
||
|
||
Incident ID: {incident_id}
|
||
目標資源: {target_resource}
|
||
Namespace: {namespace}
|
||
建議操作: {suggested_action}
|
||
分析摘要: {reasoning[:300]}
|
||
|
||
請輸出以下 JSON 格式(只輸出 JSON,不要其他文字):
|
||
{{
|
||
"tool_name": "restart_deployment 或 scale_deployment 或 no_action",
|
||
"deployment_name": "部署名稱",
|
||
"namespace": "{namespace}",
|
||
"reason": "一句話說明原因"
|
||
}}"""
|
||
|
||
try:
|
||
text, success, _, _ = await self._call_gemini(prompt)
|
||
latency_ms = (_time.time() - start_time) * 1000
|
||
|
||
if not success:
|
||
logger.warning("nemotron_gemini_fallback_failed", incident_id=incident_id, error=text)
|
||
return {"tools": [], "validation": "❌ NIM + Gemini 均不可用", "latency_ms": latency_ms}
|
||
|
||
import json as _json
|
||
data = _json.loads(text)
|
||
tool_name = data.get("tool_name", "no_action")
|
||
tools = []
|
||
if tool_name != "no_action":
|
||
tools = [{
|
||
"tool": tool_name,
|
||
"args": {
|
||
"deployment_name": data.get("deployment_name", target_resource),
|
||
"namespace": data.get("namespace", namespace),
|
||
},
|
||
"valid": True,
|
||
}]
|
||
|
||
logger.info(
|
||
"nemotron_gemini_fallback_success",
|
||
incident_id=incident_id,
|
||
tool=tool_name,
|
||
latency_ms=latency_ms,
|
||
)
|
||
return {
|
||
"tools": tools,
|
||
"validation": "✅ Gemini 代理驗證通過",
|
||
"latency_ms": latency_ms,
|
||
}
|
||
|
||
except Exception as e:
|
||
latency_ms = (_time.time() - start_time) * 1000
|
||
logger.error("nemotron_gemini_fallback_error", incident_id=incident_id, error=str(e))
|
||
return {"tools": [], "validation": f"❌ Gemini 代理失敗: {str(e)[:50]}", "latency_ms": latency_ms}
|
||
|
||
# =========================================================================
|
||
# Shadow Mode Auto-Tuning
|
||
# =========================================================================
|
||
|
||
async def execute_auto_tuning(
|
||
self,
|
||
approval_id: str,
|
||
kubectl_command: str,
|
||
description: str,
|
||
) -> dict:
|
||
"""
|
||
執行自動調優 (Shadow Mode: 僅日誌輸出)
|
||
|
||
統帥鐵律: Shadow Mode 下嚴禁實際執行 K8s 命令
|
||
|
||
Args:
|
||
approval_id: 簽核單 ID
|
||
kubectl_command: kubectl 指令
|
||
description: 操作描述
|
||
|
||
Returns:
|
||
{executed: bool, shadow_mode: bool, command: str, log: str}
|
||
"""
|
||
if settings.SHADOW_MODE_ENABLED:
|
||
# Shadow Mode: 僅記錄,不執行
|
||
log_message = f"[SHADOW MODE] AI 生成的調優指令:{kubectl_command}"
|
||
logger.info(
|
||
"shadow_mode_auto_tuning",
|
||
approval_id=approval_id,
|
||
command=kubectl_command,
|
||
description=description,
|
||
executed=False,
|
||
)
|
||
print(f"\n{'='*60}")
|
||
print(log_message)
|
||
print(f"描述: {description}")
|
||
print(f"簽核單: {approval_id}")
|
||
print(f"{'='*60}\n")
|
||
|
||
return {
|
||
"executed": False,
|
||
"shadow_mode": True,
|
||
"command": kubectl_command,
|
||
"description": description,
|
||
"log": log_message,
|
||
}
|
||
else:
|
||
# 生產模式: 實際執行 (需要額外安全檢查)
|
||
logger.warning(
|
||
"auto_tuning_execution_attempted",
|
||
approval_id=approval_id,
|
||
command=kubectl_command,
|
||
message="Production execution not yet implemented - requires multi-sig approval",
|
||
)
|
||
return {
|
||
"executed": False,
|
||
"shadow_mode": False,
|
||
"command": kubectl_command,
|
||
"description": description,
|
||
"log": "Production execution requires multi-sig approval",
|
||
}
|
||
|
||
|
||
# =============================================================================
|
||
# Singleton
|
||
# =============================================================================
|
||
|
||
_openclaw: OpenClawService | None = None
|
||
|
||
|
||
def get_openclaw() -> OpenClawService:
|
||
"""取得全域 OpenClaw 實例"""
|
||
global _openclaw
|
||
if _openclaw is None:
|
||
_openclaw = OpenClawService()
|
||
return _openclaw
|
||
|
||
|
||
async def close_openclaw() -> None:
|
||
"""關閉 OpenClaw 連線"""
|
||
global _openclaw
|
||
if _openclaw:
|
||
await _openclaw.close()
|
||
_openclaw = None
|
||
|
||
|
||
async def _fetch_k8s_inventory_for_openclaw(
|
||
namespace: str = "awoooi-prod",
|
||
timeout_sec: float = 3.0,
|
||
) -> str:
|
||
"""
|
||
取得 K8s 叢集實際 Deployment/StatefulSet 清單,注入 analyze_alert prompt。
|
||
|
||
2026-04-17 ogt + Claude Sonnet 4.6 (Checkpoint-2 webhook path):
|
||
- 根因:NemoTron 在 webhook path 收不到叢集清單 → 幻覺 deployment/awoooi-service
|
||
- 修復:analyze_alert 前拉取真實資源名,注入 prompt,強制 LLM 從清單選擇
|
||
- 超時/失敗 → 返回 ""(prompt 仍正常但無鎖定效果,不中斷主流程)
|
||
- 只執行唯讀 get 指令,不修改叢集
|
||
|
||
Returns:
|
||
"awoooi-api, awoooi-web, ..." 格式字串,失敗時返回 ""
|
||
"""
|
||
import asyncio as _asyncio
|
||
|
||
import structlog as _structlog
|
||
_logger = _structlog.get_logger(__name__)
|
||
try:
|
||
cmd = (
|
||
f"kubectl get deployments,statefulsets -n {namespace} "
|
||
"-o jsonpath='{.items[*].metadata.name}' 2>/dev/null"
|
||
)
|
||
proc = await _asyncio.create_subprocess_shell(
|
||
cmd,
|
||
stdout=_asyncio.subprocess.PIPE,
|
||
stderr=_asyncio.subprocess.PIPE,
|
||
)
|
||
try:
|
||
stdout, _ = await _asyncio.wait_for(proc.communicate(), timeout=timeout_sec)
|
||
except TimeoutError:
|
||
proc.kill()
|
||
_logger.warning("k8s_inventory_timeout_openclaw", namespace=namespace)
|
||
return ""
|
||
raw = (stdout or b"").decode("utf-8", errors="replace").strip()
|
||
if not raw:
|
||
return ""
|
||
names = [n.strip() for n in raw.split() if n.strip()]
|
||
return ", ".join(names)
|
||
except Exception as _e:
|
||
_logger.warning("k8s_inventory_failed_openclaw", namespace=namespace, error=str(_e))
|
||
return ""
|
||
|
||
|
||
# =============================================================================
|
||
# Phase 5 + SignOz Integration Complete
|
||
# =============================================================================
|