#!/usr/bin/env python3
"""High-confidence secret pattern check for operational documents.

This check intentionally scans documentation and workflow metadata.
It allows documented placeholder formats such as nvapi-... or <token>,
but blocks likely real tokens, private keys, and long literal credentials.

Usage: pass files and/or directories as positional arguments; with no
arguments the ``docs`` and ``.gitea`` directories are scanned.  The process
exits with status 1 when at least one finding is reported, 0 otherwise.
"""

from __future__ import annotations

import argparse
import re
import sys
from dataclasses import dataclass
from pathlib import Path

# Scanned when no paths are given on the command line.
DEFAULT_TARGETS = [Path("docs"), Path(".gitea")]

# Only files with one of these suffixes (compared case-insensitively) are read.
DOC_SUFFIXES = {".md", ".json", ".yml", ".yaml"}


@dataclass(frozen=True)
class SecretPattern:
    """A named regular expression describing one kind of credential."""

    name: str  # label printed in findings
    regex: re.Pattern[str]  # applied to each line with ``finditer``


# Order matters: when two patterns match exactly the same span of a line,
# only the first one listed here is reported, so keep the more specific
# pattern (e.g. anthropic_key) ahead of the broader one (openai_key).
PATTERNS = [
    SecretPattern("pem_private_key", re.compile(r"-----BEGIN [A-Z ]*PRIVATE KEY-----")),
    SecretPattern("github_token", re.compile(r"\bgh[pousr]_[A-Za-z0-9_]{20,}\b")),
    SecretPattern("gitlab_token", re.compile(r"\bglpat-[A-Za-z0-9_-]{20,}\b")),
    SecretPattern("slack_token", re.compile(r"\bxox[baprs]-[A-Za-z0-9-]{20,}\b")),
    SecretPattern("anthropic_key", re.compile(r"\bsk-ant-api03-[A-Za-z0-9_-]{20,}\b")),
    SecretPattern("openai_key", re.compile(r"\bsk-(?:proj-)?[A-Za-z0-9_-]{32,}\b")),
    SecretPattern("google_api_key", re.compile(r"\bAIza[0-9A-Za-z_-]{30,}\b")),
    SecretPattern("nvidia_key", re.compile(r"\bnvapi-[0-9A-Za-z_-]{30,}\b")),
    SecretPattern("telegram_bot_token", re.compile(r"\b\d{8,12}:[A-Za-z0-9_-]{30,}\b")),
    SecretPattern(
        "jwt",
        re.compile(r"\beyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b"),
    ),
    SecretPattern("aws_access_key", re.compile(r"\bAKIA[0-9A-Z]{16}\b")),
    SecretPattern(
        "gitea_token_assignment",
        re.compile(r"\bGITEA_TOKEN\s*=\s*[\"'][A-Za-z0-9]{32,}[\"']"),
    ),
    SecretPattern(
        "authorization_token_literal",
        re.compile(r"\bAuthorization:\s*token\s+[A-Za-z0-9]{32,}\b", re.I),
    ),
]

# Substrings that mark a matched value as a documented placeholder.  They are
# compared against the lower-cased match, so keep every entry lower-case.
# "${" is already covered by "$" and is kept only for readability.
# NOTE(review): "ζ–°ηš„" looks like mis-decoded UTF-8 (possibly a CJK word
# sitting next to "取得") -- confirm the intended text before changing it.
_PLACEHOLDER_FRAGMENTS = (
    "...",
    "<",
    ">",
    "change_me",
    "redacted",
    "example",
    "placeholder",
    "vault-item-id",
    "your_",
    "${",
    "$",
    "ζ–°ηš„",
    "取得",
)

# Provider prefixes stripped before the filler check.  The first matching
# entry wins, so longer "sk-" variants must stay ahead of the bare "sk-".
_FILLER_PREFIXES = ("nvapi-", "sk-ant-api03-", "sk-proj-", "sk-", "AIza")

# A token body made only of these characters (after lower-casing) is filler.
_FILLER_CHARS = frozenset("x0_-.")


def _has_doc_suffix(path: Path) -> bool:
    """Return True when *path* has a scannable suffix, ignoring letter case."""
    return path.suffix.lower() in DOC_SUFFIXES


def iter_files(paths: list[Path]) -> list[Path]:
    """Collect the documentation files to scan.

    Each entry of *paths* may be a file (kept if its suffix is in
    ``DOC_SUFFIXES``) or a directory (searched recursively for such files).
    Entries that do not exist, and files with other suffixes, are skipped
    without an error.

    Returns:
        The unique matching paths in sorted order.
    """
    found: set[Path] = set()
    for path in paths:
        if path.is_file():
            if _has_doc_suffix(path):
                found.add(path)
        elif path.is_dir():
            found.update(
                candidate
                for candidate in path.rglob("*")
                if candidate.is_file() and _has_doc_suffix(candidate)
            )
    # A single sort at the end gives deterministic output.
    return sorted(found)


def is_placeholder(value: str) -> bool:
    """Return True when a matched *value* is a placeholder, not a secret.

    A value counts as a placeholder when either:

    * its lower-cased form contains one of ``_PLACEHOLDER_FRAGMENTS``; or
    * after removing the first matching prefix from ``_FILLER_PREFIXES``
      (if any), the remainder is non-empty and consists solely of filler
      characters: ``x``/``X``, ``0``, ``_``, ``-`` and ``.``.
    """
    lowered = value.lower()
    if any(fragment in lowered for fragment in _PLACEHOLDER_FRAGMENTS):
        return True

    body = value
    for prefix in _FILLER_PREFIXES:
        if value.startswith(prefix):
            body = value[len(prefix):]
            break
    return bool(body) and set(body.lower()) <= _FILLER_CHARS


def masked(value: str) -> str:
    """Return *value* abbreviated for display.

    Values longer than 14 characters are shown as their first six and last
    four characters joined by ``...``; shorter values are returned unchanged.
    """
    if len(value) <= 14:
        return value
    return f"{value[:6]}...{value[-4:]}"


def _scan_file(path: Path) -> list[str]:
    """Return the finding lines for one file.

    Undecodable bytes are replaced rather than raising, so any file that can
    be opened is scanned.  A file that cannot be read yields one
    ``read_error`` finding at line 0.
    """
    try:
        lines = path.read_text(encoding="utf-8", errors="replace").splitlines()
    except OSError as exc:
        return [f"{path}:0 read_error {exc}"]

    findings: list[str] = []
    for lineno, line in enumerate(lines, start=1):
        # Spans already reported on this line: a token matched by two
        # patterns over the same span is reported once, under the pattern
        # that comes first in PATTERNS.
        reported_spans: set[tuple[int, int]] = set()
        for pattern in PATTERNS:
            for match in pattern.regex.finditer(line):
                value = match.group(0)
                span = match.span()
                if span in reported_spans or is_placeholder(value):
                    continue
                reported_spans.add(span)
                findings.append(f"{path}:{lineno} {pattern.name} {masked(value)}")
    return findings


def main(argv: list[str] | None = None) -> int:
    """Scan the requested paths and print the result.

    Args:
        argv: Command-line arguments without the program name.  ``None``
            (the default) makes argparse read ``sys.argv``.

    Returns:
        1 if any finding (including a read error) was reported, else 0.
    """
    parser = argparse.ArgumentParser(
        description="Scan operational docs for likely real secrets."
    )
    parser.add_argument("paths", nargs="*", type=Path, default=DEFAULT_TARGETS)
    args = parser.parse_args(argv)

    scanned_files = iter_files(args.paths)
    findings: list[str] = []
    for path in scanned_files:
        findings.extend(_scan_file(path))

    if findings:
        print("DOC_SECRET_SANITY_BLOCKED")
        for finding in findings:
            print(finding)
        return 1

    print(f"DOC_SECRET_SANITY_OK scanned_files={len(scanned_files)}")
    return 0


if __name__ == "__main__":
    sys.exit(main())