# Commit note:
# - Python: `ruff --fix` fixed 280 lint errors.
# - lewooogo-core: the src/ directory was untracked, which caused the CI eslint job to fail.
# Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
# File info: 487 lines, 16 KiB, Python
"""
Context Gatherer - K8s Log Collection & Cleaning
=================================================
Phase 5.2.1: Log cleaning module

Features:
- K8s Pod log collection
- ERROR-only filtering principle (required by the chief architect)
- Noise filtering (DEBUG/INFO removal)
- Structured context output

Defensive engineering rules:
- Feed Ollama only clean signal, free of noise
- Filter out DEBUG/INFO tags
- Limit the context length to avoid wasting tokens
"""
|
|
|
|
import re
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime
|
|
from typing import Any
|
|
|
|
import structlog
|
|
|
|
from src.core.config import settings
|
|
|
|
# Module-level structlog logger, named after this module.
logger = structlog.get_logger(__name__)
|
|
|
|
|
|
# =============================================================================
# Log Level Filter - ERROR Only Principle
# =============================================================================
|
|
|
|
class LogLevelFilter:
    """
    Log-level filter implementing the "ERROR only" principle.

    Requirements from the chief architect:
    - Keep only ERROR, FATAL, CRITICAL, WARN and WARNING lines.
    - Drop DEBUG, INFO, TRACE and VERBOSE lines.
    - Detect the level tag with regular expressions.
    """

    # Allowed log levels (loaded from config).
    # NOTE: informational only - the patterns below hard-code their level names.
    ALLOWED_LEVELS = settings.CONTEXT_LOG_LEVELS

    # Forbidden log levels (explicitly excluded).
    # NOTE: LEVEL_PATTERN only recognises the first four of these as tags; the
    # remaining ones are still dropped by the "filter by default" rule.
    FORBIDDEN_LEVELS = ["DEBUG", "INFO", "TRACE", "VERBOSE", "FINE", "FINER", "FINEST"]

    # ==========================================================================
    # Core regex filters
    # ==========================================================================

    # Pattern 1: forbidden level tags.
    # Matches: [INFO], [DEBUG], INFO:, DEBUG:, level=INFO, level=debug
    # and the timestamp-prefixed form (2024-03-21T10:15:23.456Z INFO [...]).
    LEVEL_PATTERN = re.compile(
        r"""
        (?:
            \[(?P<bracket_level>DEBUG|INFO|TRACE|VERBOSE)\] |                     # [DEBUG], [INFO]
            \b(?P<colon_level>DEBUG|INFO|TRACE|VERBOSE): |                        # DEBUG:, INFO:
            \blevel\s*[=:]\s*["']?(?P<kv_level>DEBUG|INFO|TRACE|VERBOSE)["']? |   # level=DEBUG, level="INFO"
            \b(?P<space_level>DEBUG|INFO|TRACE|VERBOSE)\s+\[                      # timestamp DEBUG [...], timestamp INFO [...]
        )
        """,
        re.IGNORECASE | re.VERBOSE,
    )

    # Pattern 2: allowed level tags (positive match), same four shapes as above,
    # including the timestamp-prefixed form (2024-03-21T10:16:45.123Z ERROR [...]).
    ALLOWED_PATTERN = re.compile(
        r"""
        (?:
            \[(?P<bracket_level>ERROR|FATAL|CRITICAL|WARN|WARNING)\] |
            \b(?P<colon_level>ERROR|FATAL|CRITICAL|WARN|WARNING): |
            \blevel\s*[=:]\s*["']?(?P<kv_level>ERROR|FATAL|CRITICAL|WARN|WARNING)["']? |
            \b(?P<space_level>ERROR|FATAL|CRITICAL|WARN|WARNING)\s+\[
        )
        """,
        re.IGNORECASE | re.VERBOSE,
    )

    # Pattern 3: Kubernetes event lines that start with "Warning" or "Error".
    K8S_EVENT_PATTERN = re.compile(
        r"^\s*(?P<event_type>Warning|Error)\s+",
        re.IGNORECASE,
    )

    # Pattern 4: stack-trace lines (kept). Most alternatives require leading
    # indentation, so this pattern must be applied to the *unstripped* line.
    STACKTRACE_PATTERN = re.compile(
        r"""
        (?:
            ^\s+at\s+ |                      # Java stacktrace
            ^\s+File\s+".*",\s+line\s+ |     # Python traceback
            ^Traceback\s+\(most\s+recent |   # Python traceback header
            ^\s+\d+:\s+0x[0-9a-f]+ |         # Go stacktrace
            ^panic:                          # Go panic
        )
        """,
        re.IGNORECASE | re.VERBOSE,
    )

    @classmethod
    def is_allowed(cls, line: str) -> bool:
        """
        Decide whether a single log line should be kept.

        Rules, in evaluation order:
        1. Blank line -> drop.
        2. Line carries a level tag -> the tag that appears first decides:
           DEBUG/INFO/TRACE/VERBOSE -> drop, ERROR/FATAL/CRITICAL/WARN -> keep.
        3. Stack-trace line -> keep.
        4. K8s Warning/Error event -> keep.
        5. Anything else -> drop (conservative default).

        Args:
            line: One raw log line; leading indentation is significant for
                stack-trace detection.

        Returns:
            bool: True = keep, False = drop.
        """
        stripped = line.strip()
        if not stripped:
            return False

        forbidden = cls.LEVEL_PATTERN.search(stripped)
        allowed = cls.ALLOWED_PATTERN.search(stripped)
        if forbidden and allowed:
            # Both kinds of tag are present (e.g. "ERROR: cannot read info: ...").
            # The earliest tag is the line's real level; later ones are message text.
            return allowed.start() < forbidden.start()
        if forbidden:
            return False
        if allowed:
            return True

        # Test the raw line: the frame patterns are anchored on leading
        # whitespace, which the stripped copy no longer has.
        if cls.STACKTRACE_PATTERN.search(line):
            return True

        if cls.K8S_EVENT_PATTERN.search(stripped):
            return True

        # Default: drop (ERROR-only principle, avoids noise).
        return False

    @classmethod
    def filter_logs(cls, logs: str) -> str:
        """
        Filter a multi-line log string, keeping only error-relevant lines.

        Stack traces are kept as a block: once a trace starts, every following
        indented (or stack-trace shaped) line is kept. For Python tracebacks the
        first unindented line after the frames - the "ExcType: message"
        summary - is kept as well unless it carries a forbidden level tag.

        Args:
            logs: Raw log text, lines separated by "\\n".

        Returns:
            str: The kept lines joined with "\\n".
        """
        kept: list[str] = []
        in_stacktrace = False
        in_python_traceback = False

        for line in logs.split("\n"):
            if in_stacktrace:
                # Continuation of the current stack trace.
                if cls.STACKTRACE_PATTERN.search(line) or line.startswith((" ", "\t")):
                    kept.append(line)
                    continue
                in_stacktrace = False
                if in_python_traceback:
                    in_python_traceback = False
                    # A Python traceback ends with an unindented exception
                    # summary, which has no level tag of its own.
                    if line.strip() and not cls.LEVEL_PATTERN.search(line):
                        kept.append(line)
                        continue

            # Start of a new stack trace.
            if "Traceback" in line or "panic:" in line or line.strip().startswith("at "):
                in_stacktrace = True
                in_python_traceback = "Traceback (most recent call last)" in line
                kept.append(line)
                continue

            # Regular per-line filtering.
            if cls.is_allowed(line):
                kept.append(line)

        return "\n".join(kept)

    @staticmethod
    def _count_lines(text: str) -> int:
        """Count lines in *text*; empty text has 0 lines, a trailing newline adds none."""
        if not text:
            return 0
        return text.count("\n") + (0 if text.endswith("\n") else 1)

    @classmethod
    def get_filter_stats(cls, original: str, filtered: str) -> dict:
        """
        Compute line-count statistics for a filtering run.

        Args:
            original: The raw log text.
            filtered: The text returned by filter_logs() for *original*.

        Returns:
            dict: original_lines, filtered_lines, removed_lines and
            removal_rate_percent (rounded to one decimal).
        """
        original_lines = cls._count_lines(original)
        filtered_lines = cls._count_lines(filtered)
        removed_lines = original_lines - filtered_lines
        removal_rate = (removed_lines / original_lines * 100) if original_lines > 0 else 0.0

        return {
            "original_lines": original_lines,
            "filtered_lines": filtered_lines,
            "removed_lines": removed_lines,
            "removal_rate_percent": round(removal_rate, 1),
        }
|
|
|
|
|
|
# =============================================================================
# Context Gatherer
# =============================================================================
|
|
|
|
def _utc_now_iso() -> str:
    """Return the current UTC time as a naive ISO-8601 string.

    Produces the same format as the deprecated ``datetime.utcnow().isoformat()``
    (no UTC offset suffix).
    """
    # Local import: the file-level import only brings in ``datetime``.
    from datetime import timezone

    return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()


@dataclass
class K8sContext:
    """Structured K8s context for one resource."""

    # Identity of the resource the context was gathered for.
    namespace: str
    resource_name: str
    resource_type: str
    # Status summaries; empty until filled in by the gatherer.
    pod_status: dict[str, Any] = field(default_factory=dict)
    deployment_status: dict[str, Any] = field(default_factory=dict)
    recent_events: list[dict[str, Any]] = field(default_factory=list)
    # Filtered log text and the statistics of that filtering run.
    filtered_logs: str = ""
    log_filter_stats: dict[str, Any] = field(default_factory=dict)
    # Creation time of this object (UTC, ISO-8601, no offset suffix).
    gathered_at: str = field(default_factory=_utc_now_iso)
|
|
|
|
|
|
class ContextGatherer:
|
|
"""
|
|
上下文收集器 - 為 Ollama 準備乾淨的分析資料
|
|
|
|
職責:
|
|
1. 收集 K8s Pod/Deployment 狀態
|
|
2. 收集最近事件
|
|
3. 收集並清洗日誌 (ERROR Only)
|
|
4. 組裝結構化上下文
|
|
"""
|
|
|
|
def __init__(self):
|
|
self._k8s_client = None
|
|
self._initialized = False
|
|
|
|
async def initialize(self) -> bool:
|
|
"""初始化 K8s 連線"""
|
|
try:
|
|
from pathlib import Path
|
|
|
|
from kubernetes_asyncio import client
|
|
from kubernetes_asyncio.config import load_kube_config
|
|
|
|
kubeconfig_path = Path(settings.KUBECONFIG_PATH)
|
|
if not kubeconfig_path.is_absolute():
|
|
kubeconfig_path = Path(__file__).parent.parent.parent / settings.KUBECONFIG_PATH
|
|
|
|
if not kubeconfig_path.exists():
|
|
logger.warning("kubeconfig_not_found", path=str(kubeconfig_path))
|
|
return False
|
|
|
|
await load_kube_config(config_file=str(kubeconfig_path))
|
|
self._k8s_client = client
|
|
self._initialized = True
|
|
|
|
logger.info("context_gatherer_initialized")
|
|
return True
|
|
|
|
except Exception as e:
|
|
logger.error("context_gatherer_init_failed", error=str(e))
|
|
return False
|
|
|
|
async def gather_pod_logs(
|
|
self,
|
|
pod_name: str,
|
|
namespace: str = "default",
|
|
tail_lines: int | None = None,
|
|
) -> tuple[str, dict]:
|
|
"""
|
|
收集並清洗 Pod 日誌
|
|
|
|
Args:
|
|
pod_name: Pod 名稱
|
|
namespace: Namespace
|
|
tail_lines: 取最後 N 行 (預設從 config)
|
|
|
|
Returns:
|
|
(filtered_logs, filter_stats)
|
|
"""
|
|
tail_lines = tail_lines or settings.CONTEXT_MAX_LINES
|
|
|
|
if not self._initialized:
|
|
await self.initialize()
|
|
|
|
if not self._initialized:
|
|
return "[K8s not connected]", {"error": "K8s not initialized"}
|
|
|
|
try:
|
|
core_v1 = self._k8s_client.CoreV1Api()
|
|
|
|
# 取得原始日誌
|
|
raw_logs = await core_v1.read_namespaced_pod_log(
|
|
name=pod_name,
|
|
namespace=namespace,
|
|
tail_lines=tail_lines,
|
|
)
|
|
|
|
# 清洗日誌 (ERROR Only)
|
|
filtered_logs = LogLevelFilter.filter_logs(raw_logs)
|
|
filter_stats = LogLevelFilter.get_filter_stats(raw_logs, filtered_logs)
|
|
|
|
logger.info(
|
|
"pod_logs_filtered",
|
|
pod=pod_name,
|
|
namespace=namespace,
|
|
**filter_stats,
|
|
)
|
|
|
|
return filtered_logs, filter_stats
|
|
|
|
except Exception as e:
|
|
logger.error("gather_pod_logs_failed", pod=pod_name, error=str(e))
|
|
return f"[Error gathering logs: {e}]", {"error": str(e)}
|
|
|
|
async def gather_context(
|
|
self,
|
|
resource_name: str,
|
|
namespace: str = "default",
|
|
resource_type: str = "pod",
|
|
) -> K8sContext:
|
|
"""
|
|
收集完整的 K8s 上下文
|
|
|
|
Args:
|
|
resource_name: 資源名稱
|
|
namespace: Namespace
|
|
resource_type: 資源類型 (pod/deployment)
|
|
|
|
Returns:
|
|
K8sContext: 結構化上下文
|
|
"""
|
|
context = K8sContext(
|
|
namespace=namespace,
|
|
resource_name=resource_name,
|
|
resource_type=resource_type,
|
|
)
|
|
|
|
if not self._initialized:
|
|
await self.initialize()
|
|
|
|
if not self._initialized:
|
|
context.filtered_logs = "[K8s not connected - using mock context]"
|
|
return context
|
|
|
|
try:
|
|
core_v1 = self._k8s_client.CoreV1Api()
|
|
apps_v1 = self._k8s_client.AppsV1Api()
|
|
|
|
# 1. Pod 狀態
|
|
if resource_type == "pod":
|
|
try:
|
|
pod = await core_v1.read_namespaced_pod(
|
|
name=resource_name,
|
|
namespace=namespace,
|
|
)
|
|
context.pod_status = {
|
|
"phase": pod.status.phase,
|
|
"restart_count": sum(
|
|
c.restart_count for c in (pod.status.container_statuses or [])
|
|
),
|
|
"conditions": [
|
|
c.type for c in (pod.status.conditions or []) if c.status == "True"
|
|
],
|
|
}
|
|
except Exception as e:
|
|
logger.warning("gather_pod_status_failed", error=str(e))
|
|
|
|
# 2. Deployment 狀態
|
|
if resource_type in ["pod", "deployment"]:
|
|
try:
|
|
deploy_name = resource_name.rsplit("-", 2)[0] if resource_type == "pod" else resource_name
|
|
deploy = await apps_v1.read_namespaced_deployment(
|
|
name=deploy_name,
|
|
namespace=namespace,
|
|
)
|
|
context.deployment_status = {
|
|
"replicas": deploy.spec.replicas,
|
|
"ready_replicas": deploy.status.ready_replicas or 0,
|
|
"available_replicas": deploy.status.available_replicas or 0,
|
|
}
|
|
except Exception as e:
|
|
logger.warning("gather_deployment_status_failed", error=str(e))
|
|
|
|
# 3. 最近事件
|
|
try:
|
|
events = await core_v1.list_namespaced_event(
|
|
namespace=namespace,
|
|
field_selector=f"involvedObject.name={resource_name}",
|
|
)
|
|
context.recent_events = [
|
|
{
|
|
"type": e.type,
|
|
"reason": e.reason,
|
|
"message": e.message[:100] if e.message else "",
|
|
"count": e.count,
|
|
}
|
|
for e in sorted(
|
|
events.items,
|
|
key=lambda x: x.last_timestamp or x.event_time,
|
|
reverse=True,
|
|
)[:5]
|
|
]
|
|
except Exception as e:
|
|
logger.warning("gather_events_failed", error=str(e))
|
|
|
|
# 4. 清洗日誌
|
|
if resource_type == "pod":
|
|
filtered_logs, filter_stats = await self.gather_pod_logs(
|
|
resource_name, namespace
|
|
)
|
|
context.filtered_logs = filtered_logs
|
|
context.log_filter_stats = filter_stats
|
|
|
|
logger.info(
|
|
"context_gathered",
|
|
resource=resource_name,
|
|
namespace=namespace,
|
|
events_count=len(context.recent_events),
|
|
)
|
|
|
|
return context
|
|
|
|
except Exception as e:
|
|
logger.error("gather_context_failed", error=str(e))
|
|
return context
|
|
|
|
def format_for_llm(self, context: K8sContext) -> str:
|
|
"""
|
|
將上下文格式化為 LLM 可讀格式
|
|
|
|
Args:
|
|
context: K8sContext 物件
|
|
|
|
Returns:
|
|
str: 格式化的上下文字串
|
|
"""
|
|
parts = [
|
|
"## K8s Context",
|
|
f"- **Resource**: {context.resource_type}/{context.resource_name}",
|
|
f"- **Namespace**: {context.namespace}",
|
|
f"- **Gathered At**: {context.gathered_at}",
|
|
]
|
|
|
|
if context.pod_status:
|
|
parts.append("\n### Pod Status")
|
|
parts.append(f"- Phase: {context.pod_status.get('phase', 'Unknown')}")
|
|
parts.append(f"- Restart Count: {context.pod_status.get('restart_count', 0)}")
|
|
parts.append(f"- Conditions: {', '.join(context.pod_status.get('conditions', []))}")
|
|
|
|
if context.deployment_status:
|
|
parts.append("\n### Deployment Status")
|
|
parts.append(f"- Replicas: {context.deployment_status.get('replicas', 0)}")
|
|
parts.append(f"- Ready: {context.deployment_status.get('ready_replicas', 0)}")
|
|
parts.append(f"- Available: {context.deployment_status.get('available_replicas', 0)}")
|
|
|
|
if context.recent_events:
|
|
parts.append("\n### Recent Events")
|
|
for event in context.recent_events:
|
|
parts.append(f"- [{event['type']}] {event['reason']}: {event['message']}")
|
|
|
|
if context.filtered_logs:
|
|
parts.append("\n### Filtered Logs (ERROR Only)")
|
|
parts.append("```")
|
|
parts.append(context.filtered_logs[:2000]) # 限制長度
|
|
if len(context.filtered_logs) > 2000:
|
|
parts.append("... (truncated)")
|
|
parts.append("```")
|
|
|
|
if context.log_filter_stats:
|
|
stats = context.log_filter_stats
|
|
parts.append(f"\n*Log Filter Stats: {stats.get('filtered_lines', 0)}/{stats.get('original_lines', 0)} lines kept ({stats.get('removal_rate_percent', 0)}% removed)*")
|
|
|
|
return "\n".join(parts)
|
|
|
|
|
|
# =============================================================================
# Singleton
# =============================================================================
|
|
|
|
# Lazily created module-wide instance; stays None until first requested.
_gatherer: ContextGatherer | None = None


def get_context_gatherer() -> ContextGatherer:
    """Return the global ContextGatherer instance, creating it on first use."""
    global _gatherer
    if _gatherer is not None:
        return _gatherer
    _gatherer = ContextGatherer()
    return _gatherer
|