Files
ewoooc/services/browse_sh_tool.py

173 lines
5.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""browse.sh CLI 的可選執行 wrapper。
正式爬蟲仍以既有 Python/API client 為準;browse.sh 只用於動態頁面診斷、
selector 探勘與 network trace,避免把外部 CLI 變成 scheduler 的硬依賴。
"""
from __future__ import annotations
import os
import shutil
import subprocess
from dataclasses import dataclass
from typing import Mapping, Sequence
# Name of the environment variable that may hold an explicit path to the
# browse CLI; BrowseShTool.resolve_cli_path reads it when no cli_path was
# passed to the constructor, before falling back to a PATH lookup.
BROWSE_SH_CLI_ENV = "BROWSE_SH_CLI"
# Default timeout, in seconds, for a BrowseShTool.run invocation; used as the
# default value of the constructor's timeout_seconds parameter.
DEFAULT_TIMEOUT_SECONDS = 90
@dataclass(frozen=True)
class BrowseShAvailability:
    """Immutable outcome of probing whether the browse CLI can be used."""

    # True when the probe command ran and exited successfully.
    available: bool
    # Command line used for the probe; empty when no CLI path was resolved.
    command: tuple[str, ...]
    # Human-readable explanation when the CLI is not available.
    reason: str = ""
    # Version text reported by the CLI when the probe succeeded.
    version: str = ""

    def as_dict(self) -> dict:
        """Return the fields as a plain dict, with ``command`` as a list."""
        snapshot = {"available": self.available, "command": [*self.command]}
        snapshot.update(reason=self.reason, version=self.version)
        return snapshot
@dataclass(frozen=True)
class BrowseShResult:
    """Immutable record of one attempted browse CLI invocation."""

    # True only when the process ran to completion with exit code 0.
    ok: bool
    # Full command line that was (or would have been) executed.
    command: tuple[str, ...]
    # Captured standard output / standard error of the child process.
    stdout: str = ""
    stderr: str = ""
    # Exit code of the child; None when no exit code was obtained.
    returncode: int | None = None
    # True when the invocation was aborted because it exceeded its timeout.
    timed_out: bool = False
    # Explanation of why the CLI could not be used, if that was the case.
    unavailable_reason: str = ""

    def as_dict(self) -> dict:
        """Return the fields as a plain dict, with ``command`` as a list."""
        payload = {"ok": self.ok, "command": [*self.command]}
        payload.update(
            stdout=self.stdout,
            stderr=self.stderr,
            returncode=self.returncode,
            timed_out=self.timed_out,
            unavailable_reason=self.unavailable_reason,
        )
        return payload
class BrowseShTool:
    """Minimal, defensive wrapper around the ``browse`` CLI.

    The anticipated failure modes (CLI not installed, not launchable,
    non-zero exit status, timeout) are reported through the returned
    ``BrowseShAvailability`` / ``BrowseShResult`` value rather than raised.
    """

    # Upper bound, in seconds, for the ``--version`` probe in ``availability``.
    _VERSION_PROBE_TIMEOUT_SECONDS = 8

    def __init__(
        self,
        cli_path: str | None = None,
        timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
        env: Mapping[str, str] | None = None,
    ) -> None:
        """Store the configuration; no process is started here.

        Args:
            cli_path: Explicit path to the CLI; takes precedence over the
                ``BROWSE_SH_CLI`` environment variable and the PATH lookup.
            timeout_seconds: Default timeout used by ``run``.
            env: Extra environment variables layered on top of ``os.environ``
                for every child process. The mapping is copied.
        """
        self.cli_path = cli_path
        self.timeout_seconds = timeout_seconds
        self.env = dict(env or {})

    def resolve_cli_path(self) -> str | None:
        """Return the CLI path to use, or ``None`` when none can be found.

        Resolution order: constructor ``cli_path``, then the environment
        variable named by ``BROWSE_SH_CLI_ENV``, then ``shutil.which("browse")``.
        An explicit override is returned as given, without checking that it
        exists.
        """
        override = self.cli_path or os.getenv(BROWSE_SH_CLI_ENV)
        if override:
            return override
        return shutil.which("browse")

    def build_command(self, args: Sequence[str]) -> tuple[str, ...]:
        """Return ``(cli_path, *args)`` with every argument coerced to ``str``.

        Returns an empty tuple when no CLI path can be resolved.
        """
        cli_path = self.resolve_cli_path()
        if not cli_path:
            return tuple()
        return (cli_path, *(str(arg) for arg in args))

    def _child_env(self) -> dict[str, str]:
        """Return the current process environment overlaid with ``self.env``."""
        return {**os.environ, **self.env}

    @staticmethod
    def _as_text(stream: str | bytes | None) -> str:
        """Normalise captured output to ``str``.

        ``subprocess.TimeoutExpired`` may expose captured output as ``bytes``
        even when the process was started with ``text=True``, and as ``None``
        when nothing was captured.
        """
        if not stream:
            return ""
        if isinstance(stream, bytes):
            import locale

            # Use the locale encoding that text mode defaults to; partial
            # output may end mid-character, so undecodable bytes are replaced.
            return stream.decode(locale.getpreferredencoding(False), errors="replace")
        return stream

    def availability(self) -> BrowseShAvailability:
        """Probe the CLI by running ``<cli> --version``.

        Returns:
            A ``BrowseShAvailability`` whose ``available`` flag is True only
            when the probe exits with status 0; ``version`` then carries the
            stripped stdout (or stderr when stdout is empty). Otherwise
            ``reason`` explains the failure.
        """
        command = self.build_command(("--version",))
        if not command:
            return BrowseShAvailability(
                available=False,
                command=tuple(),
                reason="browse CLI 未安裝;請先安裝並確認 PATH 可找到 browse。",
            )
        try:
            completed = subprocess.run(
                command,
                capture_output=True,
                check=False,
                env=self._child_env(),
                text=True,
                timeout=self._VERSION_PROBE_TIMEOUT_SECONDS,
            )
        except FileNotFoundError:
            return BrowseShAvailability(False, command, "browse CLI 路徑不存在。")
        except subprocess.TimeoutExpired:
            return BrowseShAvailability(False, command, "browse --version 執行逾時。")
        except OSError as exc:
            return BrowseShAvailability(False, command, f"browse CLI 無法啟動:{exc}")
        stdout = (completed.stdout or "").strip()
        stderr = (completed.stderr or "").strip()
        if completed.returncode != 0:
            reason = stderr or stdout or f"browse --version 回傳 {completed.returncode}"
            return BrowseShAvailability(False, command, reason)
        return BrowseShAvailability(True, command, version=stdout or stderr)

    def run(
        self,
        args: Sequence[str],
        timeout_seconds: int | None = None,
        require_available: bool = True,
    ) -> BrowseShResult:
        """Run the CLI with ``args`` and capture its output.

        Args:
            args: Arguments appended after the CLI path.
            timeout_seconds: Per-call timeout; a falsy value (``None`` or 0)
                falls back to ``self.timeout_seconds``.
            require_available: When True, ``availability()`` is called first
                and the command is not run if the probe fails.

        Returns:
            A ``BrowseShResult``. ``ok`` is True only for exit status 0.
            On timeout ``timed_out`` is True and any partial output is
            returned as text. When the CLI is missing or cannot be launched,
            ``unavailable_reason`` is set.
        """
        command = self.build_command(args)
        if not command:
            return BrowseShResult(
                ok=False,
                command=tuple(),
                unavailable_reason="browse CLI 未安裝;此工具只會略過,不影響正式爬蟲。",
            )
        if require_available:
            availability = self.availability()
            if not availability.available:
                return BrowseShResult(
                    ok=False,
                    command=command,
                    unavailable_reason=availability.reason,
                )
        try:
            completed = subprocess.run(
                command,
                capture_output=True,
                check=False,
                env=self._child_env(),
                text=True,
                timeout=timeout_seconds or self.timeout_seconds,
            )
        except subprocess.TimeoutExpired as exc:
            # exc.stdout / exc.stderr can be bytes despite text=True; convert
            # them so the str-typed result fields stay str.
            return BrowseShResult(
                ok=False,
                command=command,
                stdout=self._as_text(exc.stdout),
                stderr=self._as_text(exc.stderr),
                timed_out=True,
            )
        except OSError as exc:
            return BrowseShResult(
                ok=False,
                command=command,
                stderr=str(exc),
                unavailable_reason=str(exc),
            )
        return BrowseShResult(
            ok=completed.returncode == 0,
            command=command,
            stdout=completed.stdout or "",
            stderr=completed.stderr or "",
            returncode=completed.returncode,
        )

    def run_skill(self, skill_name: str, *skill_args: str, timeout_seconds: int | None = None) -> BrowseShResult:
        """Run ``<cli> <skill_name> <skill_args...>`` via ``run``.

        The availability probe is always performed, because ``run`` is called
        with its default ``require_available=True``.
        """
        return self.run((skill_name, *skill_args), timeout_seconds=timeout_seconds)