Files
awoooi/apps/api/src/services/context_gatherer.py
OG T 6f049877fc fix(lint): ruff auto-fix + lewooogo-core src 加入 git
- Python: ruff --fix 修復 280 個 lint 錯誤
- lewooogo-core: src/ 目錄未追蹤,導致 CI eslint 失敗

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-23 23:51:37 +08:00

487 lines
16 KiB
Python

"""
Context Gatherer - K8s Log Collection & Cleaning
=================================================
Phase 5.2.1: 日誌清洗模組
Features:
- K8s Pod 日誌收集
- ERROR Only 過濾原則 (首席架構師要求)
- 雜訊過濾 (DEBUG/INFO 清除)
- 結構化上下文輸出
防禦性工程鐵律:
- 只餵給 Ollama 純淨的戰訊,不含雜訊
- 過濾 DEBUG/INFO 標籤
- 限制 Context 長度避免 Token 浪費
"""
import re
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any
import structlog
from src.core.config import settings
logger = structlog.get_logger(__name__)
# =============================================================================
# Log Level Filter - ERROR Only Principle
# =============================================================================
class LogLevelFilter:
    """Line-level log filter implementing the "ERROR only" principle.

    Kept: lines tagged ERROR / FATAL / CRITICAL / WARN / WARNING, stack-trace
    lines, and Kubernetes ``Warning`` / ``Error`` event lines.
    Dropped: lines tagged DEBUG / INFO / TRACE / VERBOSE and every line that
    matches none of the patterns (conservative default).
    """

    # Allowed levels as configured.
    # NOTE: the compiled patterns below hard-code their level names and do not
    # read this list.
    ALLOWED_LEVELS = settings.CONTEXT_LOG_LEVELS

    # Levels that are explicitly excluded.
    # NOTE: FINE / FINER / FINEST are listed here but are not part of
    # LEVEL_PATTERN.
    FORBIDDEN_LEVELS = ["DEBUG", "INFO", "TRACE", "VERBOSE", "FINE", "FINER", "FINEST"]

    # Pattern 1: forbidden level tags.
    # Matches [INFO], INFO:, level=INFO / level="info", and the
    # timestamp-prefixed form "2024-03-21T10:15:23.456Z INFO [...]".
    LEVEL_PATTERN = re.compile(
        r"""
        (?:
            \[(?P<bracket_level>DEBUG|INFO|TRACE|VERBOSE)\]                          |  # [DEBUG], [INFO]
            \b(?P<colon_level>DEBUG|INFO|TRACE|VERBOSE):                             |  # DEBUG:, INFO:
            \blevel\s*[=:]\s*["']?(?P<kv_level>DEBUG|INFO|TRACE|VERBOSE)["']?        |  # level=DEBUG, level="INFO"
            \b(?P<space_level>DEBUG|INFO|TRACE|VERBOSE)\s+\[                            # timestamp DEBUG [...]
        )
        """,
        re.IGNORECASE | re.VERBOSE,
    )

    # Pattern 2: allowed level tags (same four shapes as Pattern 1).
    ALLOWED_PATTERN = re.compile(
        r"""
        (?:
            \[(?P<bracket_level>ERROR|FATAL|CRITICAL|WARN|WARNING)\]                     |
            \b(?P<colon_level>ERROR|FATAL|CRITICAL|WARN|WARNING):                        |
            \blevel\s*[=:]\s*["']?(?P<kv_level>ERROR|FATAL|CRITICAL|WARN|WARNING)["']?   |
            \b(?P<space_level>ERROR|FATAL|CRITICAL|WARN|WARNING)\s+\[
        )
        """,
        re.IGNORECASE | re.VERBOSE,
    )

    # Pattern 3: Kubernetes event lines that start with "Warning" or "Error".
    K8S_EVENT_PATTERN = re.compile(
        r"^\s*(?P<event_type>Warning|Error)\s+",
        re.IGNORECASE,
    )

    # Pattern 4: stack-trace lines (always kept).
    STACKTRACE_PATTERN = re.compile(
        r"""
        (?:
            ^\s+at\s+                     |  # Java stack frame
            ^\s+File\s+".*",\s+line\s+    |  # Python traceback frame
            ^Traceback\s+\(most\s+recent  |  # Python traceback header
            ^\s+\d+:\s+0x[0-9a-f]+        |  # Go stack frame
            ^panic:                          # Go panic
        )
        """,
        re.IGNORECASE | re.VERBOSE,
    )

    @classmethod
    def _first_level_is_allowed(cls, line: str) -> bool | None:
        """Classify *line* by the level tag that appears first in it.

        Returns:
            True when the earliest tag is an allowed level, False when it is a
            forbidden level, None when the line carries no level tag at all.
        """
        forbidden = cls.LEVEL_PATTERN.search(line)
        allowed = cls.ALLOWED_PATTERN.search(line)
        if forbidden is None and allowed is None:
            return None
        if forbidden is None:
            return True
        if allowed is None:
            return False
        # Both kinds occur: the tag nearest the start of the line is the
        # line's own level; later hits are message text such as "pod info:".
        return allowed.start() < forbidden.start()

    @staticmethod
    def _count_lines(text: str) -> int:
        """Count lines in *text*; empty text is 0 and a trailing newline adds none."""
        if not text:
            return 0
        return text.count("\n") + (0 if text.endswith("\n") else 1)

    @classmethod
    def is_allowed(cls, line: str) -> bool:
        """Decide whether a single log line should be kept.

        Rules, in order:
        1. Blank line -> drop.
        2. Line carries a level tag -> the first tag decides
           (ERROR/FATAL/CRITICAL/WARN/WARNING keep, DEBUG/INFO/TRACE/VERBOSE drop).
        3. Stack-trace line -> keep.
        4. Kubernetes Warning/Error event line -> keep.
        5. Anything else -> drop (conservative default).

        Args:
            line: One log line, with or without surrounding whitespace.

        Returns:
            True to keep the line, False to drop it.
        """
        stripped = line.strip()
        if not stripped:
            return False

        verdict = cls._first_level_is_allowed(stripped)
        if verdict is not None:
            return verdict

        # Most stack-frame alternatives require leading whitespace, so the
        # untouched line must be tested; the stripped form is tested as well so
        # an indented "Traceback" / "panic:" header is still recognised.
        if cls.STACKTRACE_PATTERN.search(line) or cls.STACKTRACE_PATTERN.search(stripped):
            return True

        return cls.K8S_EVENT_PATTERN.search(line) is not None

    @classmethod
    def filter_logs(cls, logs: str) -> str:
        """Filter a multi-line log string down to the lines worth analysing.

        Besides the per-line rules of ``is_allowed``, this tracks stack traces:
        once a trace starts, every following stack-frame or indented line is
        kept, and the unindented line that closes a Python traceback (the
        ``ExcType: message`` summary) is kept too.

        Args:
            logs: Raw log text, lines separated by ``"\\n"``.

        Returns:
            The kept lines joined with ``"\\n"``.
        """
        kept: list[str] = []
        in_stacktrace = False
        # True while the open trace was started by a Python "Traceback" header.
        python_trace = False

        for line in logs.split("\n"):
            if in_stacktrace:
                if cls.STACKTRACE_PATTERN.search(line) or line.startswith((" ", "\t")):
                    python_trace = python_trace or line.startswith("Traceback")
                    kept.append(line)
                    continue

                in_stacktrace = False
                if python_trace:
                    python_trace = False
                    # A Python traceback ends with an unindented exception
                    # summary; keep it unless it is clearly a low-level log line.
                    if line.strip() and cls._first_level_is_allowed(line.strip()) is not False:
                        kept.append(line)
                        continue

            # A new trace begins.
            if "Traceback" in line or "panic:" in line or line.strip().startswith("at "):
                in_stacktrace = True
                python_trace = "Traceback" in line
                kept.append(line)
                continue

            if cls.is_allowed(line):
                kept.append(line)

        return "\n".join(kept)

    @classmethod
    def get_filter_stats(cls, original: str, filtered: str) -> dict[str, Any]:
        """Summarise how much a filtering pass removed.

        Args:
            original: The raw log text.
            filtered: The text returned by ``filter_logs`` for *original*.

        Returns:
            A dict with ``original_lines``, ``filtered_lines``,
            ``removed_lines`` and ``removal_rate_percent`` (one decimal place;
            0 when *original* is empty).
        """
        original_lines = cls._count_lines(original)
        filtered_lines = cls._count_lines(filtered)
        removed_lines = original_lines - filtered_lines
        removal_rate = (removed_lines / original_lines * 100) if original_lines > 0 else 0
        return {
            "original_lines": original_lines,
            "filtered_lines": filtered_lines,
            "removed_lines": removed_lines,
            "removal_rate_percent": round(removal_rate, 1),
        }
# =============================================================================
# Context Gatherer
# =============================================================================
def _utc_now_iso() -> str:
    """Return the current UTC time as a naive ISO-8601 string.

    ``datetime.utcnow()`` is deprecated since Python 3.12; this produces the
    same offset-less format from a timezone-aware clock reading.
    """
    from datetime import timezone

    return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()


@dataclass
class K8sContext:
    """Structured Kubernetes context gathered for one resource."""

    # Identity of the resource the context describes.
    namespace: str
    resource_name: str
    resource_type: str

    # Status summaries; empty until populated by the gatherer.
    pod_status: dict[str, Any] = field(default_factory=dict)
    deployment_status: dict[str, Any] = field(default_factory=dict)
    recent_events: list[dict[str, Any]] = field(default_factory=list)

    # Filtered log text and the statistics of the filtering pass.
    filtered_logs: str = ""
    log_filter_stats: dict[str, Any] = field(default_factory=dict)

    # UTC creation time as an ISO-8601 string without an offset suffix.
    gathered_at: str = field(default_factory=_utc_now_iso)
class ContextGatherer:
"""
上下文收集器 - 為 Ollama 準備乾淨的分析資料
職責:
1. 收集 K8s Pod/Deployment 狀態
2. 收集最近事件
3. 收集並清洗日誌 (ERROR Only)
4. 組裝結構化上下文
"""
def __init__(self):
self._k8s_client = None
self._initialized = False
async def initialize(self) -> bool:
"""初始化 K8s 連線"""
try:
from pathlib import Path
from kubernetes_asyncio import client
from kubernetes_asyncio.config import load_kube_config
kubeconfig_path = Path(settings.KUBECONFIG_PATH)
if not kubeconfig_path.is_absolute():
kubeconfig_path = Path(__file__).parent.parent.parent / settings.KUBECONFIG_PATH
if not kubeconfig_path.exists():
logger.warning("kubeconfig_not_found", path=str(kubeconfig_path))
return False
await load_kube_config(config_file=str(kubeconfig_path))
self._k8s_client = client
self._initialized = True
logger.info("context_gatherer_initialized")
return True
except Exception as e:
logger.error("context_gatherer_init_failed", error=str(e))
return False
async def gather_pod_logs(
self,
pod_name: str,
namespace: str = "default",
tail_lines: int | None = None,
) -> tuple[str, dict]:
"""
收集並清洗 Pod 日誌
Args:
pod_name: Pod 名稱
namespace: Namespace
tail_lines: 取最後 N 行 (預設從 config)
Returns:
(filtered_logs, filter_stats)
"""
tail_lines = tail_lines or settings.CONTEXT_MAX_LINES
if not self._initialized:
await self.initialize()
if not self._initialized:
return "[K8s not connected]", {"error": "K8s not initialized"}
try:
core_v1 = self._k8s_client.CoreV1Api()
# 取得原始日誌
raw_logs = await core_v1.read_namespaced_pod_log(
name=pod_name,
namespace=namespace,
tail_lines=tail_lines,
)
# 清洗日誌 (ERROR Only)
filtered_logs = LogLevelFilter.filter_logs(raw_logs)
filter_stats = LogLevelFilter.get_filter_stats(raw_logs, filtered_logs)
logger.info(
"pod_logs_filtered",
pod=pod_name,
namespace=namespace,
**filter_stats,
)
return filtered_logs, filter_stats
except Exception as e:
logger.error("gather_pod_logs_failed", pod=pod_name, error=str(e))
return f"[Error gathering logs: {e}]", {"error": str(e)}
async def gather_context(
self,
resource_name: str,
namespace: str = "default",
resource_type: str = "pod",
) -> K8sContext:
"""
收集完整的 K8s 上下文
Args:
resource_name: 資源名稱
namespace: Namespace
resource_type: 資源類型 (pod/deployment)
Returns:
K8sContext: 結構化上下文
"""
context = K8sContext(
namespace=namespace,
resource_name=resource_name,
resource_type=resource_type,
)
if not self._initialized:
await self.initialize()
if not self._initialized:
context.filtered_logs = "[K8s not connected - using mock context]"
return context
try:
core_v1 = self._k8s_client.CoreV1Api()
apps_v1 = self._k8s_client.AppsV1Api()
# 1. Pod 狀態
if resource_type == "pod":
try:
pod = await core_v1.read_namespaced_pod(
name=resource_name,
namespace=namespace,
)
context.pod_status = {
"phase": pod.status.phase,
"restart_count": sum(
c.restart_count for c in (pod.status.container_statuses or [])
),
"conditions": [
c.type for c in (pod.status.conditions or []) if c.status == "True"
],
}
except Exception as e:
logger.warning("gather_pod_status_failed", error=str(e))
# 2. Deployment 狀態
if resource_type in ["pod", "deployment"]:
try:
deploy_name = resource_name.rsplit("-", 2)[0] if resource_type == "pod" else resource_name
deploy = await apps_v1.read_namespaced_deployment(
name=deploy_name,
namespace=namespace,
)
context.deployment_status = {
"replicas": deploy.spec.replicas,
"ready_replicas": deploy.status.ready_replicas or 0,
"available_replicas": deploy.status.available_replicas or 0,
}
except Exception as e:
logger.warning("gather_deployment_status_failed", error=str(e))
# 3. 最近事件
try:
events = await core_v1.list_namespaced_event(
namespace=namespace,
field_selector=f"involvedObject.name={resource_name}",
)
context.recent_events = [
{
"type": e.type,
"reason": e.reason,
"message": e.message[:100] if e.message else "",
"count": e.count,
}
for e in sorted(
events.items,
key=lambda x: x.last_timestamp or x.event_time,
reverse=True,
)[:5]
]
except Exception as e:
logger.warning("gather_events_failed", error=str(e))
# 4. 清洗日誌
if resource_type == "pod":
filtered_logs, filter_stats = await self.gather_pod_logs(
resource_name, namespace
)
context.filtered_logs = filtered_logs
context.log_filter_stats = filter_stats
logger.info(
"context_gathered",
resource=resource_name,
namespace=namespace,
events_count=len(context.recent_events),
)
return context
except Exception as e:
logger.error("gather_context_failed", error=str(e))
return context
def format_for_llm(self, context: K8sContext) -> str:
"""
將上下文格式化為 LLM 可讀格式
Args:
context: K8sContext 物件
Returns:
str: 格式化的上下文字串
"""
parts = [
"## K8s Context",
f"- **Resource**: {context.resource_type}/{context.resource_name}",
f"- **Namespace**: {context.namespace}",
f"- **Gathered At**: {context.gathered_at}",
]
if context.pod_status:
parts.append("\n### Pod Status")
parts.append(f"- Phase: {context.pod_status.get('phase', 'Unknown')}")
parts.append(f"- Restart Count: {context.pod_status.get('restart_count', 0)}")
parts.append(f"- Conditions: {', '.join(context.pod_status.get('conditions', []))}")
if context.deployment_status:
parts.append("\n### Deployment Status")
parts.append(f"- Replicas: {context.deployment_status.get('replicas', 0)}")
parts.append(f"- Ready: {context.deployment_status.get('ready_replicas', 0)}")
parts.append(f"- Available: {context.deployment_status.get('available_replicas', 0)}")
if context.recent_events:
parts.append("\n### Recent Events")
for event in context.recent_events:
parts.append(f"- [{event['type']}] {event['reason']}: {event['message']}")
if context.filtered_logs:
parts.append("\n### Filtered Logs (ERROR Only)")
parts.append("```")
parts.append(context.filtered_logs[:2000]) # 限制長度
if len(context.filtered_logs) > 2000:
parts.append("... (truncated)")
parts.append("```")
if context.log_filter_stats:
stats = context.log_filter_stats
parts.append(f"\n*Log Filter Stats: {stats.get('filtered_lines', 0)}/{stats.get('original_lines', 0)} lines kept ({stats.get('removal_rate_percent', 0)}% removed)*")
return "\n".join(parts)
# =============================================================================
# Singleton
# =============================================================================
_gatherer: ContextGatherer | None = None


def get_context_gatherer() -> ContextGatherer:
    """Return the module-wide ``ContextGatherer``, creating it on first call."""
    global _gatherer
    # Fast path: an instance was already created by an earlier call.
    if _gatherer is not None:
        return _gatherer
    _gatherer = ContextGatherer()
    return _gatherer