# agent-bounty-protocol/scripts/security_scan.py

import re
import subprocess
from pathlib import Path

# Absolute form of the current working directory. It is used as the cwd for
# the git subprocess and as the base that tracked file paths are joined onto.
root = Path(".").resolve()

# Matches a single-line assignment such as `NAME = value`, `NAME: value` or
# `export NAME=value`. Group 1 is the identifier, group 2 the rest of the line.
SENSITIVE_KEY_RE = re.compile(r"^\s*(?:export\s+)?([A-Za-z_][A-Za-z0-9_]*)\s*[:=]\s*(.+)$")

# Lowercase fragments; a key is treated as sensitive when its lowercased name
# contains any of them (substring match, so "auth" also matches "oauth_url").
SENSITIVE_KEYWORDS = (
    "api_key",
    "apikey",
    "token",
    "secret",
    "password",
    "passwd",
    "private_key",
    "webhook",
    "client_secret",
    "auth",
    "credential",
    "access_token",
)

# Telegram-style bot token: 6-12 digits, a colon, then 35+ URL-safe characters.
TELEGRAM_BOT_TOKEN_RE = re.compile(r"\b\d{6,12}:[A-Za-z0-9_-]{35,}\b")

# Tokens recognisable by a fixed vendor prefix (e.g. GitHub `ghp_` /
# `github_pat_`, GitLab `glpat-`, Slack `xox?-`), matched case-insensitively.
# The anchors must be written `\b` inside the raw strings: `\\b` would demand
# a literal backslash followed by "b" and the pattern would never match a
# real token.
KNOWN_TOKEN_RE = re.compile(
    r"(?i)\b("
    r"github_pat_[A-Za-z0-9_]{20,}|"
    r"ghp_[A-Za-z0-9_]{30,}|"
    r"gho_[A-Za-z0-9_]{20,}|"
    r"glpat-[A-Za-z0-9_-]{20,}|"
    r"pk_live_[A-Za-z0-9]{20,}|"
    r"sk_live_[A-Za-z0-9]{20,}|"
    r"sk_test_[A-Za-z0-9]{20,}|"
    r"xox[baprs]-[A-Za-z0-9-]{10,}|"
    r"xapp-[A-Za-z0-9]{30,}|"
    r"vk_[A-Za-z0-9]{20,}|"
    r"AIza[0-9A-Za-z_-]{35,}"
    r")\b"
)

# Three dot-separated base64url segments whose first segment starts with "eyJ"
# (the base64 encoding of `{"`), i.e. the shape of a JSON Web Token.
JWT_RE = re.compile(r"\beyJ[a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+\b")

# http(s) URL carrying `user:password@` userinfo before the host.
URL_CRED_RE = re.compile(r"https?://[^/\s:@]+:[^/@\s]+@[^\s]+")

# Lowercase fragments that mark a value as a template/placeholder rather than
# a real secret; they are searched for inside the lowercased value.
PLACEHOLDER_MARKERS = (
    "${",
    "process.env.",
    "import.meta.env.",
    "your-",
    "your_",
    "<your",
    "placeholder",
    "example",
    "sample",
    "changeme",
)

# File extensions (lowercase, with leading dot) that are never scanned:
# images, video, archives, compiled artefacts, and documentation/plain text.
SKIP_SUFFIXES = {
    ".png",
    ".jpg",
    ".jpeg",
    ".gif",
    ".webp",
    ".ico",
    ".mp4",
    ".mov",
    ".zip",
    ".jar",
    ".pyc",
    ".wasm",
    ".node",
    ".dylib",
    ".so",
    ".pdf",
    ".md",
    ".mdx",
    ".rst",
    ".txt",
}

# Directory fragments; a file is skipped when its lowercased POSIX-style path
# contains any of them.
SKIP_PATH_SUBSTRINGS = (
    "/prisma/generated/",
    "/runtime/",
    "/dist/",
    "/build/",
    "/coverage/",
)


def run(cmd: list[str]) -> str:
    """Run *cmd* from the scan root and return its standard output as text.

    Standard error is discarded. A non-zero exit status raises
    ``subprocess.CalledProcessError``.
    """
    # check_output is run(..., stdout=PIPE, check=True).stdout in one call.
    return subprocess.check_output(
        cmd,
        text=True,
        stderr=subprocess.DEVNULL,
        cwd=str(root),
    )


def clean_value(raw: str) -> str:
    """Normalise the right-hand side of an assignment for secret detection.

    Surrounding whitespace, a trailing ``# comment`` and one pair of matching
    outer quotes (single, double or backtick) are removed.

    A ``#`` that sits inside a quoted value is part of the value, not a
    comment: ``"p@ss#word"`` yields ``p@ss#word`` instead of being truncated
    at the ``#``.

    Args:
        raw: Text found after the ``=`` / ``:`` of an assignment.

    Returns:
        The cleaned value; may be an empty string.
    """
    quote_chars = ("'", '"', "`")
    value = raw.strip()

    # Quoted value: take everything up to the matching closing quote, provided
    # only whitespace or a comment follows it. Anything else after the closing
    # quote (e.g. `"a" + "b"`) falls through to the generic handling below.
    if value[:1] in quote_chars:
        closing = value.find(value[0], 1)
        if closing != -1:
            trailer = value[closing + 1:].lstrip()
            if not trailer or trailer.startswith("#"):
                return value[1:closing].strip()

    # Unquoted value: drop an inline comment, but keep a value that is itself
    # nothing but a comment so the caller sees it unchanged.
    if "#" in value and not value.startswith("#"):
        value = value.split("#", 1)[0].rstrip()

    if len(value) >= 2 and value[0] == value[-1] and value[0] in quote_chars:
        return value[1:-1].strip()
    return value


def is_placeholder(value: str) -> bool:
    """Return True when *value* looks like a template slot, not a real secret.

    Placeholders are: blank values, an empty pair of quotes, anything wrapped
    in ``<...>``, anything starting with ``$``, and any value containing one
    of ``PLACEHOLDER_MARKERS`` (compared case-insensitively).
    """
    candidate = value.strip()

    # Blank, or nothing but an empty pair of quotes.
    if candidate in ("", "\"\"", "''", "``"):
        return True

    # `candidate` is non-empty from here on, so indexing is safe.
    angle_wrapped = candidate[0] == "<" and candidate[-1] == ">"
    if angle_wrapped or candidate[0] == "$":
        return True

    lowered = candidate.lower()
    for marker in PLACEHOLDER_MARKERS:
        if marker in lowered:
            return True
    return False


def is_expression(value: str) -> bool:
    """Return True when *value* contains a character typical of code.

    Brackets, arithmetic operators, a pipe or a space all count; such values
    are treated as expressions rather than literal secrets.
    """
    for marker in ("(", ")", "{", "}", "[", "]", "+", "-", "*", "/", " ", "|"):
        if marker in value:
            return True
    return False


def is_likely_secret_value(value: str) -> bool:
    """Heuristically decide whether *value* is a hard-coded secret.

    Checks run in this order:

    1. Known credential formats (Telegram token, vendor-prefixed token, JWT,
       URL with embedded credentials) are accepted immediately.
    2. Values that look like code expressions are rejected.
    3. Remaining values must be at least 24 characters and match one of the
       generic "long random-looking string" shapes.

    Args:
        value: Cleaned right-hand side of an assignment.

    Returns:
        True if the value should be reported as a likely secret.
    """
    if not value:
        return False

    # Known formats are tested before the expression filter because they
    # legitimately contain "-" and "/" (Slack/GitLab tokens, JWT segments,
    # URLs). Filtering first would reject them all and leave the URL check
    # unreachable.
    if TELEGRAM_BOT_TOKEN_RE.fullmatch(value):
        return True
    if KNOWN_TOKEN_RE.search(value):
        return True
    if JWT_RE.fullmatch(value):
        return True
    if URL_CRED_RE.search(value):
        return True

    # Anything else that looks like code (calls, arithmetic, spaces, ...) is
    # not a literal secret.
    if is_expression(value):
        return False

    raw = value.strip().strip("'\"")
    if len(raw) < 24:
        return False

    # Long purely alphanumeric run (hex digests, opaque API keys).
    if re.fullmatch(r"[A-Za-z0-9]{40,}", raw):
        return True

    # Shorter key-like strings must show enough distinct characters to rule
    # out padding such as "aaaaaaaa...".
    if re.fullmatch(r"[A-Za-z0-9_./-]{30,}", raw) and len(set(raw)) >= 12:
        return True

    if re.fullmatch(r"[A-Za-z0-9+/._-]{40,}", raw) and len(set(raw)) >= 14:
        return True

    return False


def has_sensitive_key(key: str) -> bool:
    """Return True when the key name contains a sensitive keyword.

    The comparison is a case-insensitive substring match against
    ``SENSITIVE_KEYWORDS``.
    """
    normalized = key.lower()
    for keyword in SENSITIVE_KEYWORDS:
        if keyword in normalized:
            return True
    return False


# Ask git for the tracked files. "-z" makes git emit NUL-terminated, verbatim
# paths; without it, paths with non-ASCII or special characters come back
# C-quoted (e.g. "caf\303\251.env"), fail the is_file() check below and would
# silently go unscanned.
file_list_text = run(["git", "ls-files", "-z"])
files = [p for p in file_list_text.split("\0") if p]

# Each finding is (file path, 1-based line number, display value, reason).
# HIGH findings fail the scan; warnings are reported but do not.
findings_high: list[tuple[str, int, str, str]] = []
findings_warn: list[tuple[str, int, str, str]] = []

# Scan every tracked file line by line and collect findings.
for rel in files:
    path = root / rel
    if not path.is_file():
        continue

    # Apply the directory skip rules to the repo-relative path only. Matching
    # the absolute path would skip every file (and pass the scan vacuously)
    # whenever the checkout itself lives under e.g. ".../build/...". The
    # leading "/" lets the "/dir/" patterns match a top-level directory too.
    rel_path = "/" + Path(rel).as_posix().lower()
    if any(skip in rel_path for skip in SKIP_PATH_SUBSTRINGS):
        continue
    if path.suffix.lower() in SKIP_SUFFIXES:
        continue

    # Large files are skipped; unreadable files are skipped best-effort.
    try:
        if path.stat().st_size > 300_000:
            continue
    except OSError:
        continue

    try:
        # errors="ignore" drops undecodable bytes, so only I/O errors remain.
        text = path.read_text(encoding="utf-8", errors="ignore")
    except OSError:
        continue

    # These depend only on the file name, so compute them once per file.
    is_env_file = path.name.startswith(".env")
    is_env_example = ".example" in path.name.lower()

    for idx, line in enumerate(text.splitlines(), start=1):
        stripped = line.strip()
        # Blank lines and "#" comment lines are never inspected.
        if not stripped or stripped.startswith("#"):
            continue

        # A URL with embedded credentials is reported on its own; nothing
        # else on that line is examined.
        url_match = URL_CRED_RE.search(line)
        if url_match:
            # NOTE(review): the raw match (including the password) is what
            # gets printed in the report - consider redacting it.
            findings_high.append((str(path), idx, url_match.group(0), "embedded user:pass credentials in URL"))
            continue

        token_match = (
            KNOWN_TOKEN_RE.search(line)
            or TELEGRAM_BOT_TOKEN_RE.search(line)
            or JWT_RE.search(line)
        )
        if token_match:
            # No `continue`: the same line may additionally be reported by
            # the assignment checks below.
            findings_high.append((str(path), idx, token_match.group(0), "known credential token format"))

        if is_env_file:
            # .env files: split on the first "=" instead of using the
            # assignment regex, so keys are taken verbatim.
            if "=" not in line:
                continue
            env_key, _, env_value = line.partition("=")
            key = env_key.strip()
            value = clean_value(env_value)
            if is_placeholder(value):
                continue
            if has_sensitive_key(key) and is_likely_secret_value(value):
                # Secrets in *.example files are only a warning.
                if is_env_example:
                    findings_warn.append((str(path), idx, f"{key}=***", "environment example secret-like literal value"))
                else:
                    findings_high.append((str(path), idx, f"{key}=***", "environment secret-like literal value"))
            continue

        # Any other file: look for `name = value` / `name: value` assignments.
        m = SENSITIVE_KEY_RE.match(line)
        if not m:
            continue

        key = m.group(1)
        value = clean_value(m.group(2))
        if is_placeholder(value) or not has_sensitive_key(key):
            continue
        if is_likely_secret_value(value):
            findings_high.append((str(path), idx, f"{key}=***", "hardcoded credential-like assignment"))

# Report phase: HIGH findings first, then warnings, then the verdict.
scan_failed = bool(findings_high)

if scan_failed:
    print("[security-scan] FOUND HIGH-RISK credential findings:")
    for location, lineno, shown, why in findings_high:
        print(f"- HIGH {location}:{lineno}: {why}: {shown}")

if findings_warn:
    print("[security-scan] WARNINGS (manual review suggested):")
    for location, lineno, shown, why in findings_warn:
        print(f"- WARN {location}:{lineno}: {why}: {shown}")

# Only HIGH findings fail the scan; warnings alone still pass.
if not scan_failed:
    print("[security-scan] passed")
else:
    raise SystemExit(1)