"""Scan git-tracked files for hard-coded credentials.

Every file listed by ``git ls-files`` is read line by line and checked for
URLs with embedded ``user:pass`` credentials, well-known token formats, and
assignments whose key name looks sensitive and whose value looks like a
literal secret.  High-risk findings make the script exit with status 1.
"""

import re
import subprocess
from pathlib import Path

root = Path(".").resolve()

# ``KEY = value`` / ``KEY: value`` / ``export KEY=value`` at the start of a line.
SENSITIVE_KEY_RE = re.compile(r"^\s*(?:export\s+)?([A-Za-z_][A-Za-z0-9_]*)\s*[:=]\s*(.+)$")

# Substrings (compared lowercase) that mark a key name as credential-related.
SENSITIVE_KEYWORDS = (
    "api_key",
    "apikey",
    "token",
    "secret",
    "password",
    "passwd",
    "private_key",
    "webhook",
    "client_secret",
    "auth",
    "credential",
    "access_token",
)

TELEGRAM_BOT_TOKEN_RE = re.compile(r"\b\d{6,12}:[A-Za-z0-9_-]{35,}\b")

# The word boundaries must be written ``\b`` inside a raw string.  The previous
# ``\\b`` matched a literal backslash followed by "b", so no real token could
# ever match this pattern.
KNOWN_TOKEN_RE = re.compile(
    r"(?i)\b("
    r"github_pat_[A-Za-z0-9_]{20,}|"
    r"ghp_[A-Za-z0-9_]{30,}|"
    r"gho_[A-Za-z0-9_]{20,}|"
    r"glpat-[A-Za-z0-9_-]{20,}|"
    r"pk_live_[A-Za-z0-9]{20,}|"
    r"sk_live_[A-Za-z0-9]{20,}|"
    r"sk_test_[A-Za-z0-9]{20,}|"
    r"xox[baprs]-[A-Za-z0-9-]{10,}|"
    r"xapp-[A-Za-z0-9]{30,}|"
    r"vk_[A-Za-z0-9]{20,}|"
    r"AIza[0-9A-Za-z_-]{35,}"
    r")\b"
)

JWT_RE = re.compile(r"\beyJ[a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+\b")
URL_CRED_RE = re.compile(r"https?://[^/\s:@]+:[^/@\s]+@[^\s]+")

# Lowercase substrings that mark a value as a template/placeholder.
# NOTE(review): the text this block was rebuilt from is truncated right after
# "your_" (the next entry started with "<"); any further markers could not be
# recovered and must be restored from version control.
PLACEHOLDER_MARKERS = (
    "${",
    "process.env.",
    "import.meta.env.",
    "your-",
    "your_",
)

# NOTE(review): the original contents of the two skip lists were lost in the
# same truncated span.  They are left empty (nothing is skipped) rather than
# guessed -- restore the real values from version control.
# Matched as substrings of the lowercased absolute POSIX path of each file.
SKIP_PATH_SUBSTRINGS: tuple[str, ...] = ()
# Matched against ``path.suffix.lower()``.
SKIP_SUFFIXES: frozenset[str] = frozenset()

# Files larger than this many bytes are not scanned.
MAX_FILE_BYTES = 300_000

# (path, line number, displayed value, reason)
Finding = tuple[str, int, str, str]


def run(cmd: list[str]) -> str:
    """Run *cmd* in the repository root and return its stdout as text.

    stderr is discarded.  Raises ``subprocess.CalledProcessError`` when the
    command exits with a non-zero status.
    """
    result = subprocess.run(
        cmd,
        check=True,
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
        cwd=str(root),
    )
    return result.stdout


def clean_value(raw: str) -> str:
    """Return *raw* without a trailing ``#`` comment and without matching outer quotes."""
    value = raw.strip()
    # Cut at the first "#" unless the whole value starts with one.  This does
    # not look at quoting, so a "#" inside a quoted string is cut as well.
    if "#" in value and not value.lstrip().startswith("#"):
        value = value.split("#", 1)[0].rstrip()
    if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"', "`"):
        return value[1:-1].strip()
    return value


def is_placeholder(value: str) -> bool:
    """Return True when *value* is empty, a variable reference or a template marker."""
    v = value.strip()
    if not v:
        return True
    if v in ('""', "''", "``"):
        return True
    if v.startswith("<") and v.endswith(">"):
        return True
    if v.startswith("$"):
        return True
    low = v.lower()
    return any(marker in low for marker in PLACEHOLDER_MARKERS)


def is_expression(value: str) -> bool:
    """Return True when *value* contains a character typical of code rather than a literal."""
    expr_markers = ("(", ")", "{", "}", "[", "]", "+", "-", "*", "/", " ", "|")
    return any(m in value for m in expr_markers)


def is_likely_secret_value(value: str) -> bool:
    """Return True when *value* looks like a literal secret.

    Known token formats are accepted directly; otherwise the value must be a
    long run of token-like characters with enough distinct characters.
    """
    if not value:
        return False
    # NOTE(review): this check runs first and rejects every value containing
    # "-" or "/", so the URL check below can never match here (URLs are still
    # caught by the line-level scan in scan_text).  Order kept as-is.
    if is_expression(value):
        return False
    if TELEGRAM_BOT_TOKEN_RE.fullmatch(value):
        return True
    if KNOWN_TOKEN_RE.search(value):
        return True
    if JWT_RE.fullmatch(value):
        return True
    if URL_CRED_RE.search(value):
        return True

    raw = value.strip().strip("'\"")
    if len(raw) < 24:
        return False
    if re.fullmatch(r"[A-Za-z0-9]{40,}", raw):
        return True
    if re.fullmatch(r"[A-Za-z0-9_./-]{30,}", raw) and len(set(raw)) >= 12:
        return True
    if re.fullmatch(r"[A-Za-z0-9+/._-]{40,}", raw) and len(set(raw)) >= 14:
        return True
    return False


def has_sensitive_key(key: str) -> bool:
    """Return True when *key* contains one of SENSITIVE_KEYWORDS (case-insensitive)."""
    lower = key.lower()
    return any(keyword in lower for keyword in SENSITIVE_KEYWORDS)


def scan_text(display_path: str, file_name: str, text: str) -> tuple[list[Finding], list[Finding]]:
    """Scan the contents of one file.

    Args:
        display_path: Path recorded in each finding.
        file_name: Base name of the file; names starting with ``.env`` are
            parsed as ``KEY=value`` files, and names containing ``.example``
            downgrade their assignment findings to warnings.
        text: Full text of the file.

    Returns:
        ``(high, warn)`` lists of ``(path, line_no, value, reason)`` tuples.
    """
    high: list[Finding] = []
    warn: list[Finding] = []
    is_env_file = file_name.startswith(".env")
    is_example = ".example" in file_name.lower()

    for idx, line in enumerate(text.splitlines(), start=1):
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            continue

        url_match = URL_CRED_RE.search(line)
        if url_match:
            high.append((display_path, idx, url_match.group(0), "embedded user:pass credentials in URL"))
            continue

        token_match = (
            KNOWN_TOKEN_RE.search(line)
            or TELEGRAM_BOT_TOKEN_RE.search(line)
            or JWT_RE.search(line)
        )
        if token_match:
            # No ``continue``: the assignment checks below still run, so the
            # same line can be reported a second time.
            high.append((display_path, idx, token_match.group(0), "known credential token format"))

        if is_env_file:
            if "=" not in line:
                continue
            env_key, _, env_value = line.partition("=")
            key = env_key.strip()
            value = clean_value(env_value)
            if is_placeholder(value):
                continue
            if has_sensitive_key(key) and is_likely_secret_value(value):
                if is_example:
                    warn.append((display_path, idx, f"{key}=***", "environment example secret-like literal value"))
                else:
                    high.append((display_path, idx, f"{key}=***", "environment secret-like literal value"))
            continue

        m = SENSITIVE_KEY_RE.match(line)
        if not m:
            continue
        key = m.group(1)
        value = clean_value(m.group(2))
        if is_placeholder(value) or not has_sensitive_key(key):
            continue
        if is_likely_secret_value(value):
            high.append((display_path, idx, f"{key}=***", "hardcoded credential-like assignment"))

    return high, warn


def main() -> int:
    """Scan every git-tracked file, print the findings and return the exit status.

    Returns 1 when at least one high-risk finding exists, otherwise 0.
    """
    file_list_text = run(["git", "ls-files"])
    files = [p for p in file_list_text.splitlines() if p.strip()]

    findings_high: list[Finding] = []
    findings_warn: list[Finding] = []

    for rel in files:
        path = root / rel
        if not path.is_file():
            continue

        # NOTE(review): this is the absolute path, so directories above the
        # repository root also take part in the substring match.
        rel_path = path.as_posix().lower()
        if any(skip in rel_path for skip in SKIP_PATH_SUBSTRINGS):
            continue
        if path.suffix.lower() in SKIP_SUFFIXES:
            continue

        try:
            if path.stat().st_size > MAX_FILE_BYTES:
                continue
            # errors="ignore" drops undecodable bytes, so only I/O errors
            # can be raised here.
            text = path.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            continue

        high, warn = scan_text(str(path), path.name, text)
        findings_high.extend(high)
        findings_warn.extend(warn)

    if findings_high:
        print("[security-scan] FOUND HIGH-RISK credential findings:")
        for path_text, line_no, value, reason in findings_high:
            print(f"- HIGH {path_text}:{line_no}: {reason}: {value}")

    if findings_warn:
        print("[security-scan] WARNINGS (manual review suggested):")
        for path_text, line_no, value, reason in findings_warn:
            print(f"- WARN {path_text}:{line_no}: {reason}: {value}")

    if findings_high:
        return 1

    print("[security-scan] passed")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())