""" Context Gatherer - K8s Log Collection & Cleaning ================================================= Phase 5.2.1: 日誌清洗模組 Features: - K8s Pod 日誌收集 - ERROR Only 過濾原則 (首席架構師要求) - 雜訊過濾 (DEBUG/INFO 清除) - 結構化上下文輸出 防禦性工程鐵律: - 只餵給 Ollama 純淨的戰訊,不含雜訊 - 過濾 DEBUG/INFO 標籤 - 限制 Context 長度避免 Token 浪費 """ import re from dataclasses import dataclass, field from datetime import datetime from typing import Any import structlog from src.core.config import settings logger = structlog.get_logger(__name__) # ============================================================================= # Log Level Filter - ERROR Only Principle # ============================================================================= class LogLevelFilter: """ 日誌等級過濾器 - ERROR Only 原則 首席架構師要求: - 僅保留 ERROR, FATAL, CRITICAL, WARN, WARNING - 過濾 DEBUG, INFO, TRACE, VERBOSE - 使用 Regex 精準匹配日誌等級標籤 """ # 允許的日誌等級 (從 config 加載) ALLOWED_LEVELS = settings.CONTEXT_LOG_LEVELS # 禁止的日誌等級 (明確排除) FORBIDDEN_LEVELS = ["DEBUG", "INFO", "TRACE", "VERBOSE", "FINE", "FINER", "FINEST"] # ========================================================================== # 核心 Regex 過濾器 # ========================================================================== # Pattern 1: 標準日誌格式 [LEVEL] 或 LEVEL: # 匹配: [INFO], [DEBUG], INFO:, DEBUG:, level=INFO, level=debug # 新增: timestamp-prefixed 格式 (2024-03-21T10:15:23.456Z INFO [...]) LEVEL_PATTERN = re.compile( r""" (?: \[(?PDEBUG|INFO|TRACE|VERBOSE)\] | # [DEBUG], [INFO] \b(?PDEBUG|INFO|TRACE|VERBOSE): | # DEBUG:, INFO: \blevel\s*[=:]\s*["']?(?PDEBUG|INFO|TRACE|VERBOSE)["']? | # level=DEBUG, level="INFO" \b(?PDEBUG|INFO|TRACE|VERBOSE)\s+\[ # timestamp DEBUG [...], timestamp INFO [...] ) """, re.IGNORECASE | re.VERBOSE ) # Pattern 2: 允許的日誌等級 (用於正向匹配) # 新增: 支援 timestamp-prefixed 格式 (2024-03-21T10:16:45.123Z ERROR [...]) ALLOWED_PATTERN = re.compile( r""" (?: \[(?PERROR|FATAL|CRITICAL|WARN|WARNING)\] | \b(?PERROR|FATAL|CRITICAL|WARN|WARNING): | \blevel\s*[=:]\s*["']?(?PERROR|FATAL|CRITICAL|WARN|WARNING)["']? | \b(?PERROR|FATAL|CRITICAL|WARN|WARNING)\s+\[ ) """, re.IGNORECASE | re.VERBOSE ) # Pattern 3: Kubernetes 事件格式 # 匹配: Warning, Normal (K8s Event Types) K8S_EVENT_PATTERN = re.compile( r"^\s*(?PWarning|Error)\s+", re.IGNORECASE ) # Pattern 4: Stacktrace 行 (保留) STACKTRACE_PATTERN = re.compile( r""" (?: ^\s+at\s+ | # Java stacktrace ^\s+File\s+".*",\s+line\s+ | # Python traceback ^Traceback\s+\(most\s+recent | # Python traceback header ^\s+\d+:\s+0x[0-9a-f]+ | # Go stacktrace ^panic: # Go panic ) """, re.IGNORECASE | re.VERBOSE ) @classmethod def is_allowed(cls, line: str) -> bool: """ 判斷日誌行是否應該保留 規則: 1. 包含 ERROR/FATAL/CRITICAL/WARN → 保留 2. 包含 DEBUG/INFO/TRACE → 過濾 3. 是 Stacktrace → 保留 4. K8s Warning/Error 事件 → 保留 5. 其他 → 過濾 (保守策略) Returns: bool: True = 保留, False = 過濾 """ line = line.strip() # 空行過濾 if not line: return False # Rule 1: 明確禁止的等級 → 過濾 if cls.LEVEL_PATTERN.search(line): return False # Rule 2: 允許的等級 → 保留 if cls.ALLOWED_PATTERN.search(line): return True # Rule 3: Stacktrace → 保留 if cls.STACKTRACE_PATTERN.search(line): return True # Rule 4: K8s Warning/Error 事件 → 保留 if cls.K8S_EVENT_PATTERN.search(line): return True # Rule 5: 預設過濾 (ERROR Only 原則) # 這是保守策略,避免雜訊 return False @classmethod def filter_logs(cls, logs: str) -> str: """ 過濾日誌字串,僅保留 ERROR 等級 Args: logs: 原始日誌字串 (多行) Returns: str: 過濾後的日誌字串 """ lines = logs.split("\n") filtered = [] # 追蹤 Stacktrace 狀態 in_stacktrace = False for line in lines: # Stacktrace 延續判斷 if in_stacktrace: if cls.STACKTRACE_PATTERN.search(line) or line.startswith((" ", "\t")): filtered.append(line) continue else: in_stacktrace = False # 進入 Stacktrace if "Traceback" in line or "panic:" in line or line.strip().startswith("at "): in_stacktrace = True filtered.append(line) continue # 標準過濾 if cls.is_allowed(line): filtered.append(line) return "\n".join(filtered) @classmethod def get_filter_stats(cls, original: str, filtered: str) -> dict: """ 取得過濾統計資訊 """ original_lines = len(original.split("\n")) filtered_lines = len(filtered.split("\n")) removed_lines = original_lines - filtered_lines removal_rate = (removed_lines / original_lines * 100) if original_lines > 0 else 0 return { "original_lines": original_lines, "filtered_lines": filtered_lines, "removed_lines": removed_lines, "removal_rate_percent": round(removal_rate, 1), } # ============================================================================= # Context Gatherer # ============================================================================= @dataclass class K8sContext: """K8s 上下文資料結構""" namespace: str resource_name: str resource_type: str pod_status: dict[str, Any] = field(default_factory=dict) deployment_status: dict[str, Any] = field(default_factory=dict) recent_events: list[dict[str, Any]] = field(default_factory=list) filtered_logs: str = "" log_filter_stats: dict[str, Any] = field(default_factory=dict) gathered_at: str = field(default_factory=lambda: datetime.utcnow().isoformat()) class ContextGatherer: """ 上下文收集器 - 為 Ollama 準備乾淨的分析資料 職責: 1. 收集 K8s Pod/Deployment 狀態 2. 收集最近事件 3. 收集並清洗日誌 (ERROR Only) 4. 組裝結構化上下文 """ def __init__(self): self._k8s_client = None self._initialized = False async def initialize(self) -> bool: """初始化 K8s 連線""" try: from pathlib import Path from kubernetes_asyncio import client from kubernetes_asyncio.config import load_kube_config kubeconfig_path = Path(settings.KUBECONFIG_PATH) if not kubeconfig_path.is_absolute(): kubeconfig_path = Path(__file__).parent.parent.parent / settings.KUBECONFIG_PATH if not kubeconfig_path.exists(): logger.warning("kubeconfig_not_found", path=str(kubeconfig_path)) return False await load_kube_config(config_file=str(kubeconfig_path)) self._k8s_client = client self._initialized = True logger.info("context_gatherer_initialized") return True except Exception as e: logger.error("context_gatherer_init_failed", error=str(e)) return False async def gather_pod_logs( self, pod_name: str, namespace: str = "default", tail_lines: int | None = None, ) -> tuple[str, dict]: """ 收集並清洗 Pod 日誌 Args: pod_name: Pod 名稱 namespace: Namespace tail_lines: 取最後 N 行 (預設從 config) Returns: (filtered_logs, filter_stats) """ tail_lines = tail_lines or settings.CONTEXT_MAX_LINES if not self._initialized: await self.initialize() if not self._initialized: return "[K8s not connected]", {"error": "K8s not initialized"} try: core_v1 = self._k8s_client.CoreV1Api() # 取得原始日誌 raw_logs = await core_v1.read_namespaced_pod_log( name=pod_name, namespace=namespace, tail_lines=tail_lines, ) # 清洗日誌 (ERROR Only) filtered_logs = LogLevelFilter.filter_logs(raw_logs) filter_stats = LogLevelFilter.get_filter_stats(raw_logs, filtered_logs) logger.info( "pod_logs_filtered", pod=pod_name, namespace=namespace, **filter_stats, ) return filtered_logs, filter_stats except Exception as e: logger.error("gather_pod_logs_failed", pod=pod_name, error=str(e)) return f"[Error gathering logs: {e}]", {"error": str(e)} async def gather_context( self, resource_name: str, namespace: str = "default", resource_type: str = "pod", ) -> K8sContext: """ 收集完整的 K8s 上下文 Args: resource_name: 資源名稱 namespace: Namespace resource_type: 資源類型 (pod/deployment) Returns: K8sContext: 結構化上下文 """ context = K8sContext( namespace=namespace, resource_name=resource_name, resource_type=resource_type, ) if not self._initialized: await self.initialize() if not self._initialized: context.filtered_logs = "[K8s not connected - using mock context]" return context try: core_v1 = self._k8s_client.CoreV1Api() apps_v1 = self._k8s_client.AppsV1Api() # 1. Pod 狀態 if resource_type == "pod": try: pod = await core_v1.read_namespaced_pod( name=resource_name, namespace=namespace, ) context.pod_status = { "phase": pod.status.phase, "restart_count": sum( c.restart_count for c in (pod.status.container_statuses or []) ), "conditions": [ c.type for c in (pod.status.conditions or []) if c.status == "True" ], } except Exception as e: logger.warning("gather_pod_status_failed", error=str(e)) # 2. Deployment 狀態 if resource_type in ["pod", "deployment"]: try: deploy_name = resource_name.rsplit("-", 2)[0] if resource_type == "pod" else resource_name deploy = await apps_v1.read_namespaced_deployment( name=deploy_name, namespace=namespace, ) context.deployment_status = { "replicas": deploy.spec.replicas, "ready_replicas": deploy.status.ready_replicas or 0, "available_replicas": deploy.status.available_replicas or 0, } except Exception as e: logger.warning("gather_deployment_status_failed", error=str(e)) # 3. 最近事件 try: events = await core_v1.list_namespaced_event( namespace=namespace, field_selector=f"involvedObject.name={resource_name}", ) context.recent_events = [ { "type": e.type, "reason": e.reason, "message": e.message[:100] if e.message else "", "count": e.count, } for e in sorted( events.items, key=lambda x: x.last_timestamp or x.event_time, reverse=True, )[:5] ] except Exception as e: logger.warning("gather_events_failed", error=str(e)) # 4. 清洗日誌 if resource_type == "pod": filtered_logs, filter_stats = await self.gather_pod_logs( resource_name, namespace ) context.filtered_logs = filtered_logs context.log_filter_stats = filter_stats logger.info( "context_gathered", resource=resource_name, namespace=namespace, events_count=len(context.recent_events), ) return context except Exception as e: logger.error("gather_context_failed", error=str(e)) return context def format_for_llm(self, context: K8sContext) -> str: """ 將上下文格式化為 LLM 可讀格式 Args: context: K8sContext 物件 Returns: str: 格式化的上下文字串 """ parts = [ "## K8s Context", f"- **Resource**: {context.resource_type}/{context.resource_name}", f"- **Namespace**: {context.namespace}", f"- **Gathered At**: {context.gathered_at}", ] if context.pod_status: parts.append("\n### Pod Status") parts.append(f"- Phase: {context.pod_status.get('phase', 'Unknown')}") parts.append(f"- Restart Count: {context.pod_status.get('restart_count', 0)}") parts.append(f"- Conditions: {', '.join(context.pod_status.get('conditions', []))}") if context.deployment_status: parts.append("\n### Deployment Status") parts.append(f"- Replicas: {context.deployment_status.get('replicas', 0)}") parts.append(f"- Ready: {context.deployment_status.get('ready_replicas', 0)}") parts.append(f"- Available: {context.deployment_status.get('available_replicas', 0)}") if context.recent_events: parts.append("\n### Recent Events") for event in context.recent_events: parts.append(f"- [{event['type']}] {event['reason']}: {event['message']}") if context.filtered_logs: parts.append("\n### Filtered Logs (ERROR Only)") parts.append("```") parts.append(context.filtered_logs[:2000]) # 限制長度 if len(context.filtered_logs) > 2000: parts.append("... (truncated)") parts.append("```") if context.log_filter_stats: stats = context.log_filter_stats parts.append(f"\n*Log Filter Stats: {stats.get('filtered_lines', 0)}/{stats.get('original_lines', 0)} lines kept ({stats.get('removal_rate_percent', 0)}% removed)*") return "\n".join(parts) # ============================================================================= # Singleton # ============================================================================= _gatherer: ContextGatherer | None = None def get_context_gatherer() -> ContextGatherer: """取得全域 ContextGatherer 實例""" global _gatherer if _gatherer is None: _gatherer = ContextGatherer() return _gatherer