173 lines
5.4 KiB
Python
173 lines
5.4 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""browse.sh CLI 的可選執行 wrapper。
|
||
|
||
正式爬蟲仍以既有 Python/API client 為準;browse.sh 只用於動態頁面診斷、
|
||
selector 探勘與 network trace,避免把外部 CLI 變成 scheduler 的硬依賴。
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import os
|
||
import shutil
|
||
import subprocess
|
||
from dataclasses import dataclass
|
||
from typing import Mapping, Sequence
|
||
|
||
|
||
# Name of the environment variable that may hold an explicit path to the
# browse CLI executable.
BROWSE_SH_CLI_ENV: str = "BROWSE_SH_CLI"

# Default upper bound, in seconds, for a single browse CLI invocation.
DEFAULT_TIMEOUT_SECONDS: int = 90
|
||
|
||
|
||
@dataclass(frozen=True)
class BrowseShAvailability:
    """Immutable outcome of probing whether the browse CLI can be used.

    Attributes:
        available: Whether the CLI was judged usable.
        command: The probe command line (empty when no CLI path was found).
        reason: Human-readable explanation when the CLI is not usable.
        version: Version text reported by the CLI when it is usable.
    """

    available: bool
    command: tuple[str, ...]
    reason: str = ""
    version: str = ""

    def as_dict(self) -> dict:
        """Return the fields as a plain dict, with ``command`` as a list."""
        payload = dict(
            available=self.available,
            command=[*self.command],
            reason=self.reason,
            version=self.version,
        )
        return payload
|
||
|
||
|
||
@dataclass(frozen=True)
class BrowseShResult:
    """Immutable outcome of one browse CLI invocation.

    Attributes:
        ok: Whether the invocation is considered successful.
        command: The command line that was (or would have been) executed.
        stdout: Captured standard output, empty when nothing was captured.
        stderr: Captured standard error, empty when nothing was captured.
        returncode: Exit status of the process, or ``None`` when unknown.
        timed_out: Whether the invocation was aborted by a timeout.
        unavailable_reason: Explanation when the CLI could not be used.
    """

    ok: bool
    command: tuple[str, ...]
    stdout: str = ""
    stderr: str = ""
    returncode: int | None = None
    timed_out: bool = False
    unavailable_reason: str = ""

    def as_dict(self) -> dict:
        """Return the fields as a plain dict, with ``command`` as a list."""
        payload = dict(
            ok=self.ok,
            command=[*self.command],
            stdout=self.stdout,
            stderr=self.stderr,
            returncode=self.returncode,
            timed_out=self.timed_out,
            unavailable_reason=self.unavailable_reason,
        )
        return payload
|
||
|
||
|
||
class BrowseShTool:
    """Minimal safe wrapper around the ``browse`` CLI.

    Every failure mode (CLI missing, not startable, non-zero exit, timeout) is
    reported through the returned ``BrowseShAvailability`` / ``BrowseShResult``
    value instead of being raised, so callers can simply skip the tool when it
    is not usable.
    """

    # Upper bound, in seconds, for the ``--version`` availability probe.
    VERSION_PROBE_TIMEOUT_SECONDS = 8

    def __init__(
        self,
        cli_path: str | None = None,
        timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
        env: Mapping[str, str] | None = None,
    ) -> None:
        """Store the configuration; no process is started here.

        Args:
            cli_path: Explicit path to the CLI; takes precedence over the
                ``BROWSE_SH_CLI`` environment variable and the ``PATH`` lookup.
            timeout_seconds: Default timeout applied by :meth:`run`.
            env: Extra environment variables layered on top of ``os.environ``
                for child processes. The mapping is copied.
        """
        self.cli_path = cli_path
        self.timeout_seconds = timeout_seconds
        self.env = dict(env or {})

    def _child_env(self) -> dict[str, str]:
        """Return the child environment: ``os.environ`` overlaid by ``self.env``."""
        return {**os.environ, **self.env}

    @staticmethod
    def _as_text(captured: str | bytes | None) -> str:
        """Normalize captured process output to ``str``.

        ``subprocess.TimeoutExpired`` exposes captured output as ``bytes`` even
        when the process was started with ``text=True``, and as ``None`` when
        nothing was captured. Bytes are decoded with the locale's preferred
        encoding and given the same newline translation text mode applies.
        """
        if not captured:
            return ""
        if isinstance(captured, bytes):
            import locale

            decoded = captured.decode(
                locale.getpreferredencoding(False), errors="replace"
            )
            return decoded.replace("\r\n", "\n").replace("\r", "\n")
        return captured

    def resolve_cli_path(self) -> str | None:
        """Return the CLI path to use, or ``None`` when none can be found.

        Lookup order: the explicit ``cli_path``, the ``BROWSE_SH_CLI``
        environment variable, then ``browse`` on ``PATH``. An override is
        returned as-is without checking that it exists.
        """
        override = self.cli_path or os.getenv(BROWSE_SH_CLI_ENV)
        if override:
            return override
        return shutil.which("browse")

    def build_command(self, args: Sequence[str]) -> tuple[str, ...]:
        """Return ``(cli_path, *args)`` with every argument stringified.

        Returns an empty tuple when no CLI path can be resolved.
        """
        cli_path = self.resolve_cli_path()
        if not cli_path:
            return tuple()
        return (cli_path, *(str(arg) for arg in args))

    def availability(self) -> BrowseShAvailability:
        """Probe the CLI by running ``--version`` and report the outcome.

        The CLI counts as available only when the probe exits with status 0;
        the reported version is its stdout, falling back to stderr.
        """
        command = self.build_command(("--version",))
        if not command:
            return BrowseShAvailability(
                available=False,
                command=tuple(),
                reason="browse CLI 未安裝;請先安裝並確認 PATH 可找到 browse。",
            )
        try:
            completed = subprocess.run(
                command,
                capture_output=True,
                check=False,
                env=self._child_env(),
                text=True,
                timeout=self.VERSION_PROBE_TIMEOUT_SECONDS,
            )
        except FileNotFoundError:
            # Must precede the generic OSError handler (it is a subclass).
            return BrowseShAvailability(False, command, "browse CLI 路徑不存在。")
        except subprocess.TimeoutExpired:
            return BrowseShAvailability(False, command, "browse --version 執行逾時。")
        except OSError as exc:
            return BrowseShAvailability(False, command, f"browse CLI 無法啟動:{exc}")

        stdout = (completed.stdout or "").strip()
        stderr = (completed.stderr or "").strip()
        if completed.returncode != 0:
            reason = stderr or stdout or f"browse --version 回傳 {completed.returncode}"
            return BrowseShAvailability(False, command, reason)
        return BrowseShAvailability(True, command, version=stdout or stderr)

    def run(
        self,
        args: Sequence[str],
        timeout_seconds: int | None = None,
        require_available: bool = True,
    ) -> BrowseShResult:
        """Run the CLI with ``args`` and capture its output.

        Args:
            args: Arguments appended after the CLI path.
            timeout_seconds: Per-call timeout; a falsy value (``None`` or
                ``0``) falls back to the instance default.
            require_available: When true, run the :meth:`availability` probe
                first and return early if it fails.

        Returns:
            A ``BrowseShResult``; ``ok`` is true only when the process exited
            with status 0. Timeouts and start-up failures are reported in the
            result rather than raised.
        """
        command = self.build_command(args)
        if not command:
            return BrowseShResult(
                ok=False,
                command=tuple(),
                unavailable_reason="browse CLI 未安裝;此工具只會略過,不影響正式爬蟲。",
            )
        if require_available:
            availability = self.availability()
            if not availability.available:
                return BrowseShResult(
                    ok=False,
                    command=command,
                    unavailable_reason=availability.reason,
                )

        try:
            completed = subprocess.run(
                command,
                capture_output=True,
                check=False,
                env=self._child_env(),
                text=True,
                timeout=timeout_seconds or self.timeout_seconds,
            )
        except subprocess.TimeoutExpired as exc:
            # Partial output on timeout may be bytes despite text=True;
            # normalize so the result's str fields stay str.
            return BrowseShResult(
                ok=False,
                command=command,
                stdout=self._as_text(exc.stdout),
                stderr=self._as_text(exc.stderr),
                timed_out=True,
            )
        except OSError as exc:
            return BrowseShResult(
                ok=False,
                command=command,
                stderr=str(exc),
                unavailable_reason=str(exc),
            )

        return BrowseShResult(
            ok=completed.returncode == 0,
            command=command,
            stdout=completed.stdout or "",
            stderr=completed.stderr or "",
            returncode=completed.returncode,
        )

    def run_skill(self, skill_name: str, *skill_args: str, timeout_seconds: int | None = None) -> BrowseShResult:
        """Run ``browse <skill_name> <skill_args...>`` via :meth:`run`."""
        return self.run((skill_name, *skill_args), timeout_seconds=timeout_seconds)
|