實作智能自動修復系統的資料收集層: 1. k8s_diagnostics.py - K8s 診斷服務 - Pod Events/Logs/ResourceUsage 收集 - CrashLoopBackOff/OOM/ImagePull 偵測 - 非同步並行收集 + 錯誤容忍 2. diagnosis_aggregator.py - 診斷聚合器 - 整合 K8s + SignOz + Expert Rules - DiagnosisContext 提供結構化 LLM Prompt - DiagnosisSignal 信號分析 3. decision_manager.py - 決策引擎整合 - Step 2.5 加入診斷收集 - 傳遞 diagnosis_context 給 LLM 4. openclaw.py - LLM Prompt 增強 - 整合 K8s/SignOz 深度診斷上下文 - 支援 diagnosis_signals 摘要 ADR-030 架構: 診斷先行,根因分析,非盲目重啟 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
655 lines
22 KiB
Python
655 lines
22 KiB
Python
"""
|
||
K8s Diagnostics Service - Phase 2 資料收集強化
|
||
==============================================
|
||
ADR-030: 智能自動修復系統
|
||
|
||
提供 K8s 診斷資料收集:
|
||
- Pod Events (kubectl get events)
|
||
- Pod Logs (kubectl logs)
|
||
- Resource Usage (kubectl top)
|
||
|
||
設計原則:
|
||
- 非同步執行,不阻塞主流程
|
||
- 錯誤容忍,單一失敗不影響整體
|
||
- 結果快取,避免重複查詢
|
||
|
||
版本: v1.0
|
||
建立: 2026-03-27 (台北時區)
|
||
"""
|
||
|
||
from dataclasses import dataclass, field
|
||
from datetime import UTC, datetime
|
||
from enum import Enum
|
||
from typing import Any
|
||
|
||
import structlog
|
||
|
||
from src.core.config import settings
|
||
|
||
# Module-level structured logger shared by everything in this file.
logger = structlog.get_logger(__name__)

# kubernetes_asyncio is imported lazily (see _get_k8s_client) so that importing
# this module does not fail when the package is not installed.
_k8s_client = None  # cached ``kubernetes_asyncio.client`` module once initialised
_k8s_config_loaded = False  # flips to True after cluster configuration was loaded
|
||
|
||
|
||
async def _get_k8s_client():
    """Return the lazily imported ``kubernetes_asyncio.client`` module.

    On first use the cluster configuration is loaded: the in-cluster
    configuration is tried first, and ``settings.KUBECONFIG_PATH`` is used as a
    fallback when that raises ``ConfigException``. The module is cached in
    ``_k8s_client`` for subsequent calls.

    Returns:
        The ``kubernetes_asyncio.client`` module, or ``None`` when the import
        or the configuration loading failed (the failure is logged).
    """
    global _k8s_client, _k8s_config_loaded

    if _k8s_client is None:
        try:
            from kubernetes_asyncio import client as k8s_module
            from kubernetes_asyncio import config as k8s_config

            if not _k8s_config_loaded:
                try:
                    # Prefer the in-cluster configuration.
                    k8s_config.load_incluster_config()
                except k8s_config.ConfigException:
                    # Fall back to a kubeconfig file.
                    kubeconfig_path = settings.KUBECONFIG_PATH
                    await k8s_config.load_kube_config(config_file=kubeconfig_path)
                    logger.info("k8s_diagnostics_kubeconfig", path=kubeconfig_path)
                else:
                    logger.info("k8s_diagnostics_incluster_config")
                _k8s_config_loaded = True

            _k8s_client = k8s_module
        except Exception as exc:
            logger.error("k8s_diagnostics_init_failed", error=str(exc))
            return None

    return _k8s_client
|
||
|
||
|
||
# =============================================================================
|
||
# Data Models
|
||
# =============================================================================
|
||
|
||
|
||
class EventType(str, Enum):
    """Type of a Kubernetes event, using the same spelling as the event's ``type`` field."""

    NORMAL = "Normal"
    WARNING = "Warning"
|
||
|
||
|
||
@dataclass
|
||
class K8sEvent:
|
||
"""K8s Event 資料"""
|
||
|
||
type: EventType
|
||
reason: str
|
||
message: str
|
||
count: int
|
||
first_timestamp: datetime | None
|
||
last_timestamp: datetime | None
|
||
source_component: str
|
||
involved_object: str
|
||
|
||
def to_dict(self) -> dict[str, Any]:
|
||
return {
|
||
"type": self.type.value,
|
||
"reason": self.reason,
|
||
"message": self.message,
|
||
"count": self.count,
|
||
"first_timestamp": self.first_timestamp.isoformat() if self.first_timestamp else None,
|
||
"last_timestamp": self.last_timestamp.isoformat() if self.last_timestamp else None,
|
||
"source_component": self.source_component,
|
||
"involved_object": self.involved_object,
|
||
}
|
||
|
||
def is_warning(self) -> bool:
|
||
return self.type == EventType.WARNING
|
||
|
||
def is_recent(self, minutes: int = 30) -> bool:
|
||
"""檢查是否為最近的事件"""
|
||
if not self.last_timestamp:
|
||
return False
|
||
age = datetime.now(UTC) - self.last_timestamp
|
||
return age.total_seconds() < minutes * 60
|
||
|
||
|
||
@dataclass
|
||
class ResourceUsage:
|
||
"""資源使用量"""
|
||
|
||
cpu_millicores: int # 毫核 (1000m = 1 core)
|
||
memory_bytes: int # Bytes
|
||
cpu_limit_millicores: int | None = None
|
||
memory_limit_bytes: int | None = None
|
||
|
||
@property
|
||
def cpu_percent(self) -> float | None:
|
||
"""CPU 使用率 (相對於 limit)"""
|
||
if self.cpu_limit_millicores:
|
||
return (self.cpu_millicores / self.cpu_limit_millicores) * 100
|
||
return None
|
||
|
||
@property
|
||
def memory_percent(self) -> float | None:
|
||
"""Memory 使用率 (相對於 limit)"""
|
||
if self.memory_limit_bytes:
|
||
return (self.memory_bytes / self.memory_limit_bytes) * 100
|
||
return None
|
||
|
||
@property
|
||
def memory_mb(self) -> float:
|
||
"""Memory in MB"""
|
||
return self.memory_bytes / (1024 * 1024)
|
||
|
||
def to_dict(self) -> dict[str, Any]:
|
||
return {
|
||
"cpu_millicores": self.cpu_millicores,
|
||
"cpu_percent": round(self.cpu_percent, 1) if self.cpu_percent else None,
|
||
"memory_bytes": self.memory_bytes,
|
||
"memory_mb": round(self.memory_mb, 1),
|
||
"memory_percent": round(self.memory_percent, 1) if self.memory_percent else None,
|
||
}
|
||
|
||
def is_cpu_high(self, threshold: float = 80.0) -> bool:
|
||
"""CPU 使用率是否過高"""
|
||
return self.cpu_percent is not None and self.cpu_percent > threshold
|
||
|
||
def is_memory_high(self, threshold: float = 80.0) -> bool:
|
||
"""Memory 使用率是否過高"""
|
||
return self.memory_percent is not None and self.memory_percent > threshold
|
||
|
||
|
||
@dataclass
class PodStatus:
    """Detailed status of a pod.

    ``container_statuses`` holds one dict per container with a ``"state"``
    sub-dict whose ``"waiting"`` / ``"terminated"`` entries are either a dict
    or ``None``.
    """

    name: str
    namespace: str
    phase: str  # Pending, Running, Succeeded, Failed, Unknown
    ready: bool
    restart_count: int
    container_statuses: list[dict[str, Any]] = field(default_factory=list)
    conditions: list[dict[str, Any]] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return the status as a plain dict."""
        return {
            "name": self.name,
            "namespace": self.namespace,
            "phase": self.phase,
            "ready": self.ready,
            "restart_count": self.restart_count,
            "container_statuses": self.container_statuses,
            "conditions": self.conditions,
        }

    def is_healthy(self) -> bool:
        """Return True when the pod is in phase ``Running`` and ready."""
        return self.phase == "Running" and self.ready

    def _waiting_reasons(self) -> list[str]:
        """Collect the ``waiting.reason`` of every container that has one.

        ``state`` and ``waiting`` may be missing or explicitly ``None`` (a
        running container is stored with ``"waiting": None``), so both are
        normalised to an empty dict before being read.
        """
        reasons: list[str] = []
        for container in self.container_statuses:
            state = container.get("state") or {}
            waiting = state.get("waiting") or {}
            reason = waiting.get("reason")
            if reason:
                reasons.append(reason)
        return reasons

    def is_crash_loop(self) -> bool:
        """Return True when any container is waiting in ``CrashLoopBackOff``."""
        return "CrashLoopBackOff" in self._waiting_reasons()

    def is_image_pull_error(self) -> bool:
        """Return True when any container is waiting on an image pull failure."""
        pull_failures = ("ImagePullBackOff", "ErrImagePull", "ErrImageNeverPull")
        return any(reason in pull_failures for reason in self._waiting_reasons())
|
||
|
||
|
||
@dataclass
|
||
class K8sDiagnostics:
|
||
"""K8s 診斷資料彙總"""
|
||
|
||
pod_name: str
|
||
namespace: str
|
||
collected_at: datetime = field(default_factory=lambda: datetime.now(UTC))
|
||
|
||
# 診斷資料
|
||
pod_status: PodStatus | None = None
|
||
events: list[K8sEvent] = field(default_factory=list)
|
||
logs: str = ""
|
||
previous_logs: str = ""
|
||
resource_usage: ResourceUsage | None = None
|
||
|
||
# 錯誤記錄
|
||
errors: list[str] = field(default_factory=list)
|
||
|
||
def to_dict(self) -> dict[str, Any]:
|
||
return {
|
||
"pod_name": self.pod_name,
|
||
"namespace": self.namespace,
|
||
"collected_at": self.collected_at.isoformat(),
|
||
"pod_status": self.pod_status.to_dict() if self.pod_status else None,
|
||
"events": [e.to_dict() for e in self.events],
|
||
"logs_length": len(self.logs),
|
||
"previous_logs_length": len(self.previous_logs),
|
||
"resource_usage": self.resource_usage.to_dict() if self.resource_usage else None,
|
||
"errors": self.errors,
|
||
}
|
||
|
||
@property
|
||
def warning_events(self) -> list[K8sEvent]:
|
||
"""取得警告類型的事件"""
|
||
return [e for e in self.events if e.is_warning()]
|
||
|
||
@property
|
||
def recent_events(self) -> list[K8sEvent]:
|
||
"""取得最近 30 分鐘的事件"""
|
||
return [e for e in self.events if e.is_recent(30)]
|
||
|
||
def get_diagnosis_summary(self) -> str:
|
||
"""產生診斷摘要"""
|
||
lines = []
|
||
|
||
if self.pod_status:
|
||
lines.append(f"Pod Phase: {self.pod_status.phase}")
|
||
lines.append(f"Ready: {self.pod_status.ready}")
|
||
lines.append(f"Restart Count: {self.pod_status.restart_count}")
|
||
|
||
if self.pod_status.is_crash_loop():
|
||
lines.append("WARNING: CrashLoopBackOff detected!")
|
||
if self.pod_status.is_image_pull_error():
|
||
lines.append("WARNING: Image Pull Error detected!")
|
||
|
||
if self.resource_usage:
|
||
if self.resource_usage.is_cpu_high():
|
||
lines.append(f"WARNING: High CPU usage ({self.resource_usage.cpu_percent:.1f}%)")
|
||
if self.resource_usage.is_memory_high():
|
||
lines.append(f"WARNING: High Memory usage ({self.resource_usage.memory_percent:.1f}%)")
|
||
|
||
warning_count = len(self.warning_events)
|
||
if warning_count > 0:
|
||
lines.append(f"Warning Events: {warning_count}")
|
||
for e in self.warning_events[:3]: # 最多顯示 3 個
|
||
lines.append(f" - {e.reason}: {e.message[:100]}")
|
||
|
||
if self.errors:
|
||
lines.append(f"Collection Errors: {len(self.errors)}")
|
||
|
||
return "\n".join(lines) if lines else "No issues detected"
|
||
|
||
|
||
# =============================================================================
|
||
# K8s Diagnostics Service
|
||
# =============================================================================
|
||
|
||
|
||
class K8sDiagnosticsService:
|
||
"""
|
||
K8s 診斷資料收集服務
|
||
|
||
功能:
|
||
- 取得 Pod Events
|
||
- 取得 Pod Logs (current + previous)
|
||
- 取得 Resource Usage (via metrics-server)
|
||
- 取得 Pod Status
|
||
|
||
設計:
|
||
- 非同步並行收集
|
||
- 單一失敗不影響整體
|
||
- 結果包含錯誤資訊
|
||
"""
|
||
|
||
def __init__(self, default_namespace: str = "awoooi-prod"):
|
||
self.default_namespace = default_namespace
|
||
|
||
async def collect_diagnostics(
|
||
self,
|
||
pod_name: str,
|
||
namespace: str | None = None,
|
||
include_logs: bool = True,
|
||
include_previous_logs: bool = True,
|
||
log_tail_lines: int = 100,
|
||
) -> K8sDiagnostics:
|
||
"""
|
||
收集完整的 K8s 診斷資料
|
||
|
||
Args:
|
||
pod_name: Pod 名稱 (可以是部分名稱,會自動匹配)
|
||
namespace: Namespace (預設 awoooi-prod)
|
||
include_logs: 是否包含日誌
|
||
include_previous_logs: 是否包含上一次容器的日誌
|
||
log_tail_lines: 日誌行數
|
||
|
||
Returns:
|
||
K8sDiagnostics 包含所有收集到的資料
|
||
"""
|
||
ns = namespace or self.default_namespace
|
||
diagnostics = K8sDiagnostics(pod_name=pod_name, namespace=ns)
|
||
|
||
client = await _get_k8s_client()
|
||
if not client:
|
||
diagnostics.errors.append("K8s client initialization failed")
|
||
return diagnostics
|
||
|
||
# 先找到實際的 Pod 名稱 (支援部分匹配)
|
||
actual_pod_name = await self._find_pod(client, pod_name, ns)
|
||
if not actual_pod_name:
|
||
diagnostics.errors.append(f"Pod not found: {pod_name}")
|
||
return diagnostics
|
||
|
||
diagnostics.pod_name = actual_pod_name
|
||
|
||
# 並行收集所有資料
|
||
import asyncio
|
||
|
||
tasks = [
|
||
self._get_pod_status(client, actual_pod_name, ns),
|
||
self._get_pod_events(client, actual_pod_name, ns),
|
||
]
|
||
|
||
if include_logs:
|
||
tasks.append(self._get_pod_logs(client, actual_pod_name, ns, log_tail_lines, previous=False))
|
||
if include_previous_logs:
|
||
tasks.append(self._get_pod_logs(client, actual_pod_name, ns, log_tail_lines, previous=True))
|
||
|
||
# 資源使用量需要 metrics-server
|
||
tasks.append(self._get_resource_usage(client, actual_pod_name, ns))
|
||
|
||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||
|
||
# 處理結果
|
||
idx = 0
|
||
|
||
# Pod Status
|
||
if isinstance(results[idx], Exception):
|
||
diagnostics.errors.append(f"Pod status: {results[idx]}")
|
||
else:
|
||
diagnostics.pod_status = results[idx]
|
||
idx += 1
|
||
|
||
# Events
|
||
if isinstance(results[idx], Exception):
|
||
diagnostics.errors.append(f"Events: {results[idx]}")
|
||
else:
|
||
diagnostics.events = results[idx] or []
|
||
idx += 1
|
||
|
||
# Logs
|
||
if include_logs:
|
||
if isinstance(results[idx], Exception):
|
||
diagnostics.errors.append(f"Logs: {results[idx]}")
|
||
else:
|
||
diagnostics.logs = results[idx] or ""
|
||
idx += 1
|
||
|
||
if include_previous_logs:
|
||
if isinstance(results[idx], Exception):
|
||
# Previous logs 失敗很常見 (沒有 previous container)
|
||
pass
|
||
else:
|
||
diagnostics.previous_logs = results[idx] or ""
|
||
idx += 1
|
||
|
||
# Resource Usage
|
||
if isinstance(results[idx], Exception):
|
||
diagnostics.errors.append(f"Resource usage: {results[idx]}")
|
||
else:
|
||
diagnostics.resource_usage = results[idx]
|
||
|
||
logger.info(
|
||
"k8s_diagnostics_collected",
|
||
pod_name=actual_pod_name,
|
||
namespace=ns,
|
||
has_status=diagnostics.pod_status is not None,
|
||
events_count=len(diagnostics.events),
|
||
logs_length=len(diagnostics.logs),
|
||
has_resource_usage=diagnostics.resource_usage is not None,
|
||
errors_count=len(diagnostics.errors),
|
||
)
|
||
|
||
return diagnostics
|
||
|
||
async def _find_pod(
|
||
self,
|
||
client,
|
||
pod_name: str,
|
||
namespace: str,
|
||
) -> str | None:
|
||
"""找到實際的 Pod 名稱 (支援部分匹配)"""
|
||
try:
|
||
v1 = client.CoreV1Api()
|
||
|
||
# 先嘗試精確匹配
|
||
try:
|
||
await v1.read_namespaced_pod(name=pod_name, namespace=namespace)
|
||
return pod_name
|
||
except client.exceptions.ApiException as e:
|
||
if e.status != 404:
|
||
raise
|
||
|
||
# 部分匹配 (用於 Deployment Pod 名稱)
|
||
pods = await v1.list_namespaced_pod(namespace=namespace)
|
||
for pod in pods.items:
|
||
if pod_name in pod.metadata.name:
|
||
return pod.metadata.name
|
||
|
||
return None
|
||
|
||
except Exception as e:
|
||
logger.warning("k8s_find_pod_failed", pod_name=pod_name, error=str(e))
|
||
return None
|
||
|
||
async def _get_pod_status(
|
||
self,
|
||
client,
|
||
pod_name: str,
|
||
namespace: str,
|
||
) -> PodStatus | None:
|
||
"""取得 Pod 狀態"""
|
||
try:
|
||
v1 = client.CoreV1Api()
|
||
pod = await v1.read_namespaced_pod(name=pod_name, namespace=namespace)
|
||
|
||
# 計算 restart count
|
||
restart_count = 0
|
||
container_statuses = []
|
||
if pod.status.container_statuses:
|
||
for cs in pod.status.container_statuses:
|
||
restart_count += cs.restart_count or 0
|
||
container_statuses.append({
|
||
"name": cs.name,
|
||
"ready": cs.ready,
|
||
"restart_count": cs.restart_count,
|
||
"state": {
|
||
"running": cs.state.running is not None,
|
||
"waiting": {
|
||
"reason": cs.state.waiting.reason if cs.state.waiting else None,
|
||
"message": cs.state.waiting.message if cs.state.waiting else None,
|
||
} if cs.state.waiting else None,
|
||
"terminated": {
|
||
"reason": cs.state.terminated.reason if cs.state.terminated else None,
|
||
"exit_code": cs.state.terminated.exit_code if cs.state.terminated else None,
|
||
} if cs.state.terminated else None,
|
||
},
|
||
})
|
||
|
||
# Ready 條件
|
||
ready = False
|
||
conditions = []
|
||
if pod.status.conditions:
|
||
for c in pod.status.conditions:
|
||
conditions.append({
|
||
"type": c.type,
|
||
"status": c.status,
|
||
"reason": c.reason,
|
||
"message": c.message,
|
||
})
|
||
if c.type == "Ready" and c.status == "True":
|
||
ready = True
|
||
|
||
return PodStatus(
|
||
name=pod_name,
|
||
namespace=namespace,
|
||
phase=pod.status.phase,
|
||
ready=ready,
|
||
restart_count=restart_count,
|
||
container_statuses=container_statuses,
|
||
conditions=conditions,
|
||
)
|
||
|
||
except Exception as e:
|
||
logger.warning("k8s_get_pod_status_failed", pod_name=pod_name, error=str(e))
|
||
raise
|
||
|
||
async def _get_pod_events(
|
||
self,
|
||
client,
|
||
pod_name: str,
|
||
namespace: str,
|
||
limit: int = 20,
|
||
) -> list[K8sEvent]:
|
||
"""取得 Pod 相關 Events"""
|
||
try:
|
||
v1 = client.CoreV1Api()
|
||
|
||
# 取得該 namespace 的所有 events,然後過濾
|
||
field_selector = f"involvedObject.name={pod_name}"
|
||
events = await v1.list_namespaced_event(
|
||
namespace=namespace,
|
||
field_selector=field_selector,
|
||
limit=limit,
|
||
)
|
||
|
||
result = []
|
||
for e in events.items:
|
||
result.append(K8sEvent(
|
||
type=EventType(e.type) if e.type else EventType.NORMAL,
|
||
reason=e.reason or "",
|
||
message=e.message or "",
|
||
count=e.count or 1,
|
||
first_timestamp=e.first_timestamp.replace(tzinfo=UTC) if e.first_timestamp else None,
|
||
last_timestamp=e.last_timestamp.replace(tzinfo=UTC) if e.last_timestamp else None,
|
||
source_component=e.source.component if e.source else "",
|
||
involved_object=f"{e.involved_object.kind}/{e.involved_object.name}" if e.involved_object else "",
|
||
))
|
||
|
||
# 按最後時間排序
|
||
result.sort(key=lambda x: x.last_timestamp or datetime.min.replace(tzinfo=UTC), reverse=True)
|
||
return result
|
||
|
||
except Exception as e:
|
||
logger.warning("k8s_get_pod_events_failed", pod_name=pod_name, error=str(e))
|
||
raise
|
||
|
||
async def _get_pod_logs(
|
||
self,
|
||
client,
|
||
pod_name: str,
|
||
namespace: str,
|
||
tail_lines: int = 100,
|
||
previous: bool = False,
|
||
) -> str:
|
||
"""取得 Pod 日誌"""
|
||
try:
|
||
v1 = client.CoreV1Api()
|
||
logs = await v1.read_namespaced_pod_log(
|
||
name=pod_name,
|
||
namespace=namespace,
|
||
tail_lines=tail_lines,
|
||
previous=previous,
|
||
)
|
||
return logs or ""
|
||
|
||
except Exception as e:
|
||
# Previous logs 失敗很常見
|
||
if not previous:
|
||
logger.warning("k8s_get_pod_logs_failed", pod_name=pod_name, error=str(e))
|
||
raise
|
||
|
||
async def _get_resource_usage(
|
||
self,
|
||
client,
|
||
pod_name: str,
|
||
namespace: str,
|
||
) -> ResourceUsage | None:
|
||
"""取得資源使用量 (需要 metrics-server)"""
|
||
try:
|
||
# 使用 CustomObjectsApi 查詢 metrics
|
||
custom_api = client.CustomObjectsApi()
|
||
metrics = await custom_api.get_namespaced_custom_object(
|
||
group="metrics.k8s.io",
|
||
version="v1beta1",
|
||
namespace=namespace,
|
||
plural="pods",
|
||
name=pod_name,
|
||
)
|
||
|
||
# 解析 metrics
|
||
total_cpu = 0
|
||
total_memory = 0
|
||
for container in metrics.get("containers", []):
|
||
usage = container.get("usage", {})
|
||
cpu = usage.get("cpu", "0")
|
||
memory = usage.get("memory", "0")
|
||
|
||
# 解析 CPU (可能是 "100m" 或 "1")
|
||
if cpu.endswith("n"):
|
||
total_cpu += int(cpu[:-1]) // 1000000 # nano to milli
|
||
elif cpu.endswith("m"):
|
||
total_cpu += int(cpu[:-1])
|
||
else:
|
||
total_cpu += int(float(cpu) * 1000)
|
||
|
||
# 解析 Memory (可能是 "100Mi", "1Gi", "1000000Ki")
|
||
if memory.endswith("Ki"):
|
||
total_memory += int(memory[:-2]) * 1024
|
||
elif memory.endswith("Mi"):
|
||
total_memory += int(memory[:-2]) * 1024 * 1024
|
||
elif memory.endswith("Gi"):
|
||
total_memory += int(memory[:-2]) * 1024 * 1024 * 1024
|
||
else:
|
||
total_memory += int(memory)
|
||
|
||
# 取得 limits (from pod spec)
|
||
v1 = client.CoreV1Api()
|
||
pod = await v1.read_namespaced_pod(name=pod_name, namespace=namespace)
|
||
|
||
cpu_limit = None
|
||
memory_limit = None
|
||
for container in pod.spec.containers:
|
||
if container.resources and container.resources.limits:
|
||
limits = container.resources.limits
|
||
if "cpu" in limits:
|
||
cpu_str = limits["cpu"]
|
||
if cpu_str.endswith("m"):
|
||
cpu_limit = (cpu_limit or 0) + int(cpu_str[:-1])
|
||
else:
|
||
cpu_limit = (cpu_limit or 0) + int(float(cpu_str) * 1000)
|
||
if "memory" in limits:
|
||
mem_str = limits["memory"]
|
||
if mem_str.endswith("Mi"):
|
||
memory_limit = (memory_limit or 0) + int(mem_str[:-2]) * 1024 * 1024
|
||
elif mem_str.endswith("Gi"):
|
||
memory_limit = (memory_limit or 0) + int(mem_str[:-2]) * 1024 * 1024 * 1024
|
||
|
||
return ResourceUsage(
|
||
cpu_millicores=total_cpu,
|
||
memory_bytes=total_memory,
|
||
cpu_limit_millicores=cpu_limit,
|
||
memory_limit_bytes=memory_limit,
|
||
)
|
||
|
||
except Exception as e:
|
||
logger.warning("k8s_get_resource_usage_failed", pod_name=pod_name, error=str(e))
|
||
raise
|
||
|
||
|
||
# =============================================================================
|
||
# Singleton
|
||
# =============================================================================
|
||
|
||
# Process-wide service instance, created on first request.
_diagnostics_service: K8sDiagnosticsService | None = None


def get_k8s_diagnostics_service() -> K8sDiagnosticsService:
    """Return the shared ``K8sDiagnosticsService``, creating it on first use."""
    global _diagnostics_service

    service = _diagnostics_service
    if service is None:
        service = K8sDiagnosticsService()
        _diagnostics_service = service
    return service
|