# awoooi/apps/api/src/services/context_gatherer.py

"""
Context Gatherer - K8s Log Collection & Cleaning
=================================================
Phase 5.2.1: 日誌清洗模組

Features:
- K8s Pod 日誌收集
- ERROR Only 過濾原則 (首席架構師要求)
- 雜訊過濾 (DEBUG/INFO 清除)
- 結構化上下文輸出

防禦性工程鐵律:
- 只餵給 Ollama 純淨的戰訊，不含雜訊
- 過濾 DEBUG/INFO 標籤
- 限制 Context 長度避免 Token 浪費
"""

import re
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any

import structlog

from src.core.config import settings

# Module-level structlog logger; call sites in this module pass an event name
# plus keyword fields (e.g. logger.info("pod_logs_filtered", pod=...)).
logger = structlog.get_logger(__name__)


# =============================================================================
# Log Level Filter - ERROR Only Principle
# =============================================================================

class LogLevelFilter:
    """
    Log-level filter implementing the "ERROR only" principle.

    Policy:
    - Keep lines tagged ERROR, FATAL, CRITICAL, WARN or WARNING.
    - Drop lines tagged DEBUG, INFO, TRACE or VERBOSE.
    - Keep stack-trace lines and Kubernetes Warning/Error event lines.
    - Drop everything else (conservative default, avoids noise).

    Level tags are recognised by regex in four spellings: ``[LEVEL]``,
    ``LEVEL:``, ``level=LEVEL`` (also ``"level": "LEVEL"``) and ``LEVEL [``
    (the timestamp-prefixed form). Matching is case-insensitive.
    """

    # Allowed log levels, loaded from config.
    # NOTE(review): the regexes below hard-code their own level lists and do
    # not read this value - confirm whether they are meant to stay in sync.
    ALLOWED_LEVELS = settings.CONTEXT_LOG_LEVELS

    # Levels that are explicitly excluded.
    # NOTE(review): FINE/FINER/FINEST are listed here but are not part of
    # LEVEL_PATTERN.
    FORBIDDEN_LEVELS = ["DEBUG", "INFO", "TRACE", "VERBOSE", "FINE", "FINER", "FINEST"]

    # ==========================================================================
    # Core regex filters
    # ==========================================================================

    # Pattern 1: tags of levels that must be dropped.
    # Matches: [INFO], [DEBUG], INFO:, DEBUG:, level=INFO, level=debug,
    # "level": "info" (JSON / dict style), and the timestamp-prefixed form
    # (2024-03-21T10:15:23.456Z INFO [...]).
    LEVEL_PATTERN = re.compile(
        r"""
        (?:
            \[(?P<bracket_level>DEBUG|INFO|TRACE|VERBOSE)\]  |  # [DEBUG], [INFO]
            \b(?P<colon_level>DEBUG|INFO|TRACE|VERBOSE):     |  # DEBUG:, INFO:
            \blevel["']?\s*[=:]\s*["']?(?P<kv_level>DEBUG|INFO|TRACE|VERBOSE)["']?  |  # level=DEBUG, level="INFO", "level": "info"
            \b(?P<space_level>DEBUG|INFO|TRACE|VERBOSE)\s+\[  # timestamp DEBUG [...], timestamp INFO [...]
        )
        """,
        re.IGNORECASE | re.VERBOSE
    )

    # Pattern 2: tags of levels that must be kept (positive match).
    # Supports the same spellings as Pattern 1, including the
    # timestamp-prefixed form (2024-03-21T10:16:45.123Z ERROR [...]).
    ALLOWED_PATTERN = re.compile(
        r"""
        (?:
            \[(?P<bracket_level>ERROR|FATAL|CRITICAL|WARN|WARNING)\]  |
            \b(?P<colon_level>ERROR|FATAL|CRITICAL|WARN|WARNING):     |
            \blevel["']?\s*[=:]\s*["']?(?P<kv_level>ERROR|FATAL|CRITICAL|WARN|WARNING)["']?  |
            \b(?P<space_level>ERROR|FATAL|CRITICAL|WARN|WARNING)\s+\[
        )
        """,
        re.IGNORECASE | re.VERBOSE
    )

    # Pattern 3: Kubernetes event lines.
    # Matches lines that begin (after optional whitespace) with the event
    # type "Warning" or "Error"; "Normal" events are intentionally not matched.
    K8S_EVENT_PATTERN = re.compile(
        r"^\s*(?P<event_type>Warning|Error)\s+",
        re.IGNORECASE
    )

    # Pattern 4: stack-trace lines (kept).
    # Most alternatives require LEADING WHITESPACE, so this pattern must be
    # applied to the un-stripped line.
    STACKTRACE_PATTERN = re.compile(
        r"""
        (?:
            ^\s+at\s+                    |  # Java stacktrace
            ^\s+File\s+".*",\s+line\s+   |  # Python traceback
            ^Traceback\s+\(most\s+recent |  # Python traceback header
            ^\s+\d+:\s+0x[0-9a-f]+       |  # Go stacktrace
            ^panic:                          # Go panic
        )
        """,
        re.IGNORECASE | re.VERBOSE
    )

    @classmethod
    def is_allowed(cls, line: str) -> bool:
        """
        Decide whether a single log line should be kept.

        Rules, in order:
        1. If the line carries both a forbidden and an allowed level tag, the
           tag that appears FIRST decides. The real level tag is normally at
           the start of the line, so an ERROR line whose message mentions
           "info:" is kept, and an INFO line whose message mentions "error:"
           is dropped.
        2. Only a forbidden tag (DEBUG/INFO/TRACE/VERBOSE) -> drop.
        3. Only an allowed tag (ERROR/FATAL/CRITICAL/WARN/WARNING) -> keep.
        4. Stack-trace line -> keep.
        5. K8s Warning/Error event line -> keep.
        6. Anything else, including blank lines -> drop.

        Args:
            line: One log line, with or without its original indentation.

        Returns:
            bool: True to keep the line, False to drop it.
        """
        # Keep the indentation around: the stack-frame patterns depend on it.
        raw = line.rstrip()
        stripped = raw.strip()

        # Blank lines are always dropped.
        if not stripped:
            return False

        forbidden = cls.LEVEL_PATTERN.search(stripped)
        allowed = cls.ALLOWED_PATTERN.search(stripped)

        # Rule 1: both kinds of tag present -> the earliest one wins.
        if forbidden is not None and allowed is not None:
            return allowed.start() < forbidden.start()

        # Rule 2: explicitly forbidden level.
        if forbidden is not None:
            return False

        # Rule 3: allowed level.
        if allowed is not None:
            return True

        # Rule 4: stack trace. Check the indented form first (frame lines),
        # then the stripped form so indented "Traceback"/"panic:" headers are
        # still recognised.
        if cls.STACKTRACE_PATTERN.search(raw) or cls.STACKTRACE_PATTERN.search(stripped):
            return True

        # Rule 5: K8s Warning/Error event.
        if cls.K8S_EVENT_PATTERN.search(stripped):
            return True

        # Rule 6: drop by default ("ERROR only" principle).
        return False

    @classmethod
    def filter_logs(cls, logs: str) -> str:
        """
        Filter a multi-line log string down to the lines worth keeping.

        On top of the per-line rules of ``is_allowed`` this tracks stack-trace
        state: once a trace starts (a line containing "Traceback" or "panic:",
        or starting with "at "), every following line that matches
        STACKTRACE_PATTERN or is indented is kept. For a trace opened by a
        "Traceback" line, the first un-indented line that ends it (the
        ``ExceptionType: message`` summary) is kept as well, unless it carries
        a forbidden level tag.

        Args:
            logs: Raw log text, lines separated by "\\n".

        Returns:
            str: The kept lines joined with "\\n" ("" when nothing is kept).
        """
        kept: list[str] = []

        in_stacktrace = False
        # True while inside a trace that was opened by a "Traceback" line.
        python_traceback = False

        for line in logs.split("\n"):
            closes_python_traceback = False

            if in_stacktrace:
                # Continuation of the current trace.
                if cls.STACKTRACE_PATTERN.search(line) or line.startswith((" ", "\t")):
                    kept.append(line)
                    continue
                in_stacktrace = False
                closes_python_traceback = python_traceback
                python_traceback = False

            # Start of a new trace.
            if "Traceback" in line or "panic:" in line or line.strip().startswith("at "):
                in_stacktrace = True
                python_traceback = "Traceback" in line
                kept.append(line)
                continue

            # Exception summary that terminates a Python traceback. Without
            # this the trace would be kept but the actual error message lost.
            if (
                closes_python_traceback
                and line.strip()
                and not cls.LEVEL_PATTERN.search(line)
            ):
                kept.append(line)
                continue

            # Standard per-line filtering.
            if cls.is_allowed(line):
                kept.append(line)

        return "\n".join(kept)

    @staticmethod
    def _count_lines(text: str) -> int:
        """Count lines in *text*: "" has 0 lines; a trailing newline adds none."""
        if not text:
            return 0
        return text.count("\n") + (0 if text.endswith("\n") else 1)

    @classmethod
    def get_filter_stats(cls, original: str, filtered: str) -> dict:
        """
        Summarise how much a filtering pass removed.

        Args:
            original: The raw log text.
            filtered: The text returned by ``filter_logs`` for ``original``.

        Returns:
            dict: ``original_lines``, ``filtered_lines``, ``removed_lines``
            and ``removal_rate_percent`` (rounded to one decimal; 0 when the
            original text is empty).
        """
        original_lines = cls._count_lines(original)
        filtered_lines = cls._count_lines(filtered)
        removed_lines = original_lines - filtered_lines
        removal_rate = (removed_lines / original_lines * 100) if original_lines > 0 else 0.0

        return {
            "original_lines": original_lines,
            "filtered_lines": filtered_lines,
            "removed_lines": removed_lines,
            "removal_rate_percent": round(removal_rate, 1),
        }


# =============================================================================
# Context Gatherer
# =============================================================================

def _utc_now_iso() -> str:
    """Return the current UTC time as a naive ISO-8601 string (no UTC offset suffix)."""
    # Function-scope import: the module only imports `datetime` at file level.
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware "now"
    # and drop tzinfo so the emitted string keeps the exact same format.
    return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()


@dataclass
class K8sContext:
    """Structured K8s context handed to the LLM formatter."""

    # Identity of the inspected resource.
    namespace: str
    resource_name: str
    resource_type: str
    # Filled by ContextGatherer.gather_context; empty when collection fails.
    pod_status: dict[str, Any] = field(default_factory=dict)
    deployment_status: dict[str, Any] = field(default_factory=dict)
    recent_events: list[dict[str, Any]] = field(default_factory=list)
    # Log text after LogLevelFilter, or a bracketed placeholder message.
    filtered_logs: str = ""
    log_filter_stats: dict[str, Any] = field(default_factory=dict)
    # Creation time of this object, UTC, ISO-8601 without an offset suffix.
    gathered_at: str = field(default_factory=_utc_now_iso)


class ContextGatherer:
    """
    Context gatherer that prepares clean analysis input for Ollama.

    Responsibilities:
    1. Collect K8s Pod / Deployment status.
    2. Collect recent events.
    3. Collect and clean logs ("ERROR only").
    4. Assemble a structured context and render it for the LLM.
    """

    # Limits applied while shaping the context (same values as before, named).
    _MAX_EVENTS = 5
    _MAX_EVENT_MESSAGE_CHARS = 100
    _MAX_LOG_CHARS = 2000

    def __init__(self):
        # The kubernetes_asyncio `client` module, set by initialize().
        self._k8s_client = None
        self._initialized = False

    async def initialize(self) -> bool:
        """
        Load the kubeconfig and remember the K8s client module.

        A relative ``settings.KUBECONFIG_PATH`` is resolved against the
        directory three levels above this file.

        Returns:
            bool: True on success; False when the kubeconfig file is missing
            or loading fails (the failure is logged, not raised).
        """
        try:
            from pathlib import Path

            from kubernetes_asyncio import client
            from kubernetes_asyncio.config import load_kube_config

            kubeconfig_path = Path(settings.KUBECONFIG_PATH)
            if not kubeconfig_path.is_absolute():
                kubeconfig_path = Path(__file__).parent.parent.parent / settings.KUBECONFIG_PATH

            if not kubeconfig_path.exists():
                logger.warning("kubeconfig_not_found", path=str(kubeconfig_path))
                return False

            await load_kube_config(config_file=str(kubeconfig_path))
            self._k8s_client = client
            self._initialized = True

            logger.info("context_gatherer_initialized")
            return True

        except Exception as e:
            logger.error("context_gatherer_init_failed", error=str(e))
            return False

    async def gather_pod_logs(
        self,
        pod_name: str,
        namespace: str = "default",
        tail_lines: int | None = None,
    ) -> tuple[str, dict]:
        """
        Fetch a pod's logs and clean them with LogLevelFilter.

        Args:
            pod_name: Pod name.
            namespace: Namespace of the pod.
            tail_lines: Number of trailing lines to fetch; falls back to
                ``settings.CONTEXT_MAX_LINES`` when None (or 0).

        Returns:
            ``(filtered_logs, filter_stats)``. When K8s is unavailable or the
            request fails, the first item is a bracketed message and the
            second is ``{"error": ...}``; nothing is raised.
        """
        tail_lines = tail_lines or settings.CONTEXT_MAX_LINES

        if not self._initialized:
            await self.initialize()

        if not self._initialized:
            return "[K8s not connected]", {"error": "K8s not initialized"}

        try:
            # An explicit ApiClient used as an async context manager is closed
            # on exit. CoreV1Api() without arguments would create an implicit
            # client on every call that nobody closes.
            async with self._k8s_client.ApiClient() as api_client:
                core_v1 = self._k8s_client.CoreV1Api(api_client)

                # Fetch the raw logs.
                raw_logs = await core_v1.read_namespaced_pod_log(
                    name=pod_name,
                    namespace=namespace,
                    tail_lines=tail_lines,
                )

            # Clean the logs ("ERROR only").
            filtered_logs = LogLevelFilter.filter_logs(raw_logs)
            filter_stats = LogLevelFilter.get_filter_stats(raw_logs, filtered_logs)

            logger.info(
                "pod_logs_filtered",
                pod=pod_name,
                namespace=namespace,
                **filter_stats,
            )

            return filtered_logs, filter_stats

        except Exception as e:
            logger.error("gather_pod_logs_failed", pod=pod_name, error=str(e))
            return f"[Error gathering logs: {e}]", {"error": str(e)}

    async def gather_context(
        self,
        resource_name: str,
        namespace: str = "default",
        resource_type: str = "pod",
    ) -> K8sContext:
        """
        Collect the full K8s context for one resource.

        Each sub-step (pod status, deployment status, events) is best-effort:
        a failure is logged as a warning and leaves that part of the context
        at its default value.

        Args:
            resource_name: Resource name.
            namespace: Namespace of the resource.
            resource_type: Resource type ("pod" or "deployment").

        Returns:
            K8sContext: The structured context; possibly only partially
            filled. Never raises.
        """
        context = K8sContext(
            namespace=namespace,
            resource_name=resource_name,
            resource_type=resource_type,
        )

        if not self._initialized:
            await self.initialize()

        if not self._initialized:
            context.filtered_logs = "[K8s not connected - using mock context]"
            return context

        try:
            # One explicitly managed client shared by both API groups; it is
            # closed when the block exits.
            async with self._k8s_client.ApiClient() as api_client:
                core_v1 = self._k8s_client.CoreV1Api(api_client)
                apps_v1 = self._k8s_client.AppsV1Api(api_client)

                # 1. Pod status
                if resource_type == "pod":
                    await self._collect_pod_status(core_v1, context)

                # 2. Deployment status
                if resource_type in ("pod", "deployment"):
                    await self._collect_deployment_status(apps_v1, context)

                # 3. Recent events
                await self._collect_recent_events(core_v1, context)

            # 4. Cleaned logs (gather_pod_logs manages its own client)
            if resource_type == "pod":
                filtered_logs, filter_stats = await self.gather_pod_logs(
                    resource_name, namespace
                )
                context.filtered_logs = filtered_logs
                context.log_filter_stats = filter_stats

            logger.info(
                "context_gathered",
                resource=resource_name,
                namespace=namespace,
                events_count=len(context.recent_events),
            )

            return context

        except Exception as e:
            logger.error("gather_context_failed", error=str(e))
            return context

    async def _collect_pod_status(self, core_v1, context: K8sContext) -> None:
        """Fill ``context.pod_status`` from the pod named ``context.resource_name``."""
        try:
            pod = await core_v1.read_namespaced_pod(
                name=context.resource_name,
                namespace=context.namespace,
            )
            context.pod_status = {
                "phase": pod.status.phase,
                "restart_count": sum(
                    c.restart_count for c in (pod.status.container_statuses or [])
                ),
                # Only the condition types whose status is "True".
                "conditions": [
                    c.type for c in (pod.status.conditions or []) if c.status == "True"
                ],
            }
        except Exception as e:
            logger.warning("gather_pod_status_failed", error=str(e))

    async def _collect_deployment_status(self, apps_v1, context: K8sContext) -> None:
        """Fill ``context.deployment_status`` for the owning / named deployment."""
        try:
            if context.resource_type == "pod":
                # Drop the last two "-"-separated segments of the pod name.
                # NOTE(review): presumably these are the ReplicaSet hash and
                # pod suffix; names of another shape yield a wrong deployment
                # name and the lookup below just logs a warning.
                deploy_name = context.resource_name.rsplit("-", 2)[0]
            else:
                deploy_name = context.resource_name

            deploy = await apps_v1.read_namespaced_deployment(
                name=deploy_name,
                namespace=context.namespace,
            )
            context.deployment_status = {
                "replicas": deploy.spec.replicas,
                "ready_replicas": deploy.status.ready_replicas or 0,
                "available_replicas": deploy.status.available_replicas or 0,
            }
        except Exception as e:
            logger.warning("gather_deployment_status_failed", error=str(e))

    @staticmethod
    def _event_sort_key(event) -> tuple:
        """Sort key for events: those without any timestamp order before all others."""
        timestamp = event.last_timestamp or event.event_time
        # The leading flag keeps None from being compared with a datetime,
        # which would raise TypeError and lose every event.
        return (timestamp is not None, timestamp)

    async def _collect_recent_events(self, core_v1, context: K8sContext) -> None:
        """Fill ``context.recent_events`` with the newest events of the resource."""
        try:
            events = await core_v1.list_namespaced_event(
                namespace=context.namespace,
                field_selector=f"involvedObject.name={context.resource_name}",
            )
            newest_first = sorted(
                events.items or [],
                key=self._event_sort_key,
                reverse=True,
            )
            context.recent_events = [
                {
                    "type": e.type,
                    "reason": e.reason,
                    "message": e.message[: self._MAX_EVENT_MESSAGE_CHARS] if e.message else "",
                    "count": e.count,
                }
                for e in newest_first[: self._MAX_EVENTS]
            ]
        except Exception as e:
            logger.warning("gather_events_failed", error=str(e))

    def format_for_llm(self, context: K8sContext) -> str:
        """
        Render a context as Markdown-style text for the LLM.

        Sections whose data is empty are omitted. Log text is cut to the
        first 2000 characters. The filter-stats footer is only written when
        the stats describe a real filtering pass, i.e. not when they are an
        ``{"error": ...}`` record from a failed log fetch.

        Args:
            context: The K8sContext to render.

        Returns:
            str: The formatted context.
        """
        parts = [
            "## K8s Context",
            f"- **Resource**: {context.resource_type}/{context.resource_name}",
            f"- **Namespace**: {context.namespace}",
            f"- **Gathered At**: {context.gathered_at}",
        ]

        if context.pod_status:
            parts.append("\n### Pod Status")
            parts.append(f"- Phase: {context.pod_status.get('phase', 'Unknown')}")
            parts.append(f"- Restart Count: {context.pod_status.get('restart_count', 0)}")
            parts.append(f"- Conditions: {', '.join(context.pod_status.get('conditions', []))}")

        if context.deployment_status:
            parts.append("\n### Deployment Status")
            parts.append(f"- Replicas: {context.deployment_status.get('replicas', 0)}")
            parts.append(f"- Ready: {context.deployment_status.get('ready_replicas', 0)}")
            parts.append(f"- Available: {context.deployment_status.get('available_replicas', 0)}")

        if context.recent_events:
            parts.append("\n### Recent Events")
            for event in context.recent_events:
                parts.append(f"- [{event['type']}] {event['reason']}: {event['message']}")

        if context.filtered_logs:
            parts.append("\n### Filtered Logs (ERROR Only)")
            parts.append("```")
            parts.append(context.filtered_logs[: self._MAX_LOG_CHARS])  # cap prompt size
            if len(context.filtered_logs) > self._MAX_LOG_CHARS:
                parts.append("... (truncated)")
            parts.append("```")

            stats = context.log_filter_stats
            # An error record has no line counts; printing "0/0 lines kept"
            # for it would misreport a failed fetch as an empty log.
            if stats and "error" not in stats:
                parts.append(f"\n*Log Filter Stats: {stats.get('filtered_lines', 0)}/{stats.get('original_lines', 0)} lines kept ({stats.get('removal_rate_percent', 0)}% removed)*")

        return "\n".join(parts)


# =============================================================================
# Singleton
# =============================================================================

# Process-wide ContextGatherer instance; created on first request.
_gatherer: ContextGatherer | None = None


def get_context_gatherer() -> ContextGatherer:
    """Return the shared ContextGatherer, creating it on the first call."""
    global _gatherer
    if _gatherer is not None:
        return _gatherer
    _gatherer = ContextGatherer()
    return _gatherer