#!/usr/bin/env python3
"""Build local read-only workflow / secret-name inventory evidence.

The collector reads only repository files that are already present in local
working trees:

- .github/workflows/*
- .gitea/workflows/*
- CODEOWNERS / .github/CODEOWNERS / .gitea/CODEOWNERS

It does not call GitHub or Gitea APIs, does not read .env files, does not read
repository secret stores, and does not collect secret values. The output keeps
only secret names referenced by workflow expressions such as
`${{ secrets.NAME }}`.
"""

from __future__ import annotations

import argparse
import json
import re
from pathlib import Path

# `secrets.NAME` or `secrets['NAME']`. The look-behind rejects identifiers that
# merely end in "secrets" (e.g. `steps.mysecrets.outputs`), which would
# otherwise be reported as a secret called "outputs".
SECRET_RE = re.compile(
    r"(?<![\w.-])secrets(?:\.([A-Za-z_][A-Za-z0-9_]*)|\[['\"]([^'\"]+)['\"]\])"
)
# DOTALL so that an expression wrapped over several lines is still seen.
EXPRESSION_RE = re.compile(r"\$\{\{(.+?)\}\}", re.DOTALL)
NAME_RE = re.compile(r"^name:\s*(.+?)\s*$", re.MULTILINE)
# Not used by the helpers below any more; kept so existing importers keep working.
TOP_LEVEL_KEY_RE = re.compile(r"^([A-Za-z_][A-Za-z0-9_-]*):")
INDENTED_KEY_RE = re.compile(r"^\s{2,}([A-Za-z_][A-Za-z0-9_-]*):")
# Anchored to the start of a line and limited to that line: the value group
# must not swallow the following line, and keys such as `target_environment:`
# or commented-out lines must not match.
RUNS_ON_RE = re.compile(r"^[ \t]*runs-on:[ \t]*(.*)$")
ENVIRONMENT_RE = re.compile(r"^[ \t]*environment:[ \t]*(.*)$")
# Top-level `on:` key, optionally quoted (`"on":` / `'on':`).
ON_KEY_RE = re.compile(r"""^(?:on|"on"|'on')[ \t]*:(.*)$""")
# A YAML comment starts at `#` when it is the first character or follows whitespace.
COMMENT_RE = re.compile(r"(?:^|\s)#.*$")

STILL_FORBIDDEN = [
    "collect secret value",
    "read .env or secret store",
    "modify workflow",
    "modify webhook",
    "rotate secret",
    "create GitHub repo",
    "sync refs",
    "switch GitHub primary",
    "disable Gitea",
]


def parse_repo_spec(value: str) -> dict[str, str]:
    """Split one ``--repo`` argument into its six named fields.

    Args:
        value: ``key|path|github_repo|source_key|scope_status|risk``.

    Returns:
        A dict with the six fields; ``repo_path`` is expanded and resolved to
        an absolute path string.

    Raises:
        ValueError: If ``value`` does not contain exactly six ``|``-separated parts.
    """
    parts = value.split("|")
    if len(parts) != 6:
        raise ValueError(
            "--repo format must be key|path|github_repo|source_key|scope_status|risk"
        )
    key, path, github_repo, source_key, scope_status, risk = parts
    return {
        "repo_key": key,
        "repo_path": str(Path(path).expanduser().resolve()),
        "github_repo": github_repo,
        "source_key": source_key,
        "scope_status": scope_status,
        "risk": risk,
    }


def strip_quotes(value: str) -> str:
    """Trim whitespace and stray commas, then drop one pair of matching quotes."""
    value = value.strip().strip(",")
    if (value.startswith('"') and value.endswith('"')) or (
        value.startswith("'") and value.endswith("'")
    ):
        return value[1:-1]
    return value


def parse_inline_list(value: str) -> list[str]:
    """Parse a YAML scalar or flow sequence (``[a, b]``) into a list of strings.

    Empty items are dropped; an empty ``value`` yields an empty list.
    """
    value = strip_quotes(value.strip())
    if value.startswith("[") and value.endswith("]"):
        return [
            strip_quotes(item)
            for item in value[1:-1].split(",")
            if strip_quotes(item)
        ]
    return [value] if value else []


def first_scalar(pattern: re.Pattern[str], text: str, default: str = "") -> str:
    """Return the unquoted first capture group of ``pattern`` in ``text``.

    ``default`` is returned when the pattern does not match.
    """
    match = pattern.search(text)
    return strip_quotes(match.group(1)) if match else default


def _strip_comment(value: str) -> str:
    """Remove a trailing YAML comment from a single-line value and trim it."""
    return COMMENT_RE.sub("", value).strip()


def _indent_width(line: str) -> int:
    """Return the number of leading whitespace characters of ``line``."""
    return len(line) - len(line.lstrip())


def _list_item(stripped: str) -> str | None:
    """Return the text of a YAML block-sequence entry, or ``None`` if it is not one."""
    if stripped == "-":
        return ""
    if stripped.startswith("- "):
        return stripped[2:].strip()
    return None


def _key_blocks(
    pattern: re.Pattern[str], text: str
) -> list[tuple[str, list[str]]]:
    """Collect ``(inline_value, child_lines)`` for every line matching ``pattern``.

    ``inline_value`` is the comment-stripped text after the key on the same
    line. ``child_lines`` is only filled when the inline value is empty; it
    holds the stripped, comment-free lines nested under the key (more deeply
    indented, or sequence entries at the key's own indentation).
    """
    lines = text.splitlines()
    blocks: list[tuple[str, list[str]]] = []
    for index, line in enumerate(lines):
        match = pattern.match(line)
        if not match:
            continue
        inline = _strip_comment(match.group(1))
        children: list[str] = []
        if not inline:
            key_indent = _indent_width(line)
            for child in lines[index + 1 :]:
                stripped = child.strip()
                if not stripped or stripped.startswith("#"):
                    continue
                indent = _indent_width(child)
                same_level_item = (
                    indent == key_indent and _list_item(stripped) is not None
                )
                if indent <= key_indent and not same_level_item:
                    break  # back at the key's level: the nested block is over
                children.append(_strip_comment(stripped))
        blocks.append((inline, children))
    return blocks


def extract_on_block(lines: list[str]) -> list[str]:
    """Return the trigger names declared by the workflow's top-level ``on:`` key.

    Supports the inline forms (``on: push``, ``on: [push, pull_request]``),
    the mapping form and the block-sequence form. Only direct children of
    ``on:`` are reported, so nested keys such as ``branches`` or ``types`` are
    not mistaken for triggers.

    Args:
        lines: The workflow file split into lines.

    Returns:
        Inline values in declaration order; block values sorted and
        de-duplicated. An empty list when no ``on:`` key is found.
    """
    for index, line in enumerate(lines):
        key_match = ON_KEY_RE.match(line)
        if not key_match:
            continue
        inline = _strip_comment(key_match.group(1))
        if inline:
            return parse_inline_list(inline)
        triggers: list[str] = []
        child_indent = -1  # indentation of the first child; -1 until seen
        for next_line in lines[index + 1 :]:
            stripped = next_line.strip()
            if not stripped or stripped.startswith("#"):
                continue
            indent = _indent_width(next_line)
            item = _list_item(stripped)
            if indent == 0 and item is None:
                break  # next top-level key (or document marker) ends the block
            if child_indent < 0:
                child_indent = indent
            if indent != child_indent:
                continue  # deeper nesting belongs to a trigger's own settings
            if item is not None:
                name = strip_quotes(_strip_comment(item))
                if name:
                    triggers.append(name)
                continue
            trigger_match = INDENTED_KEY_RE.match(next_line)
            if trigger_match:
                triggers.append(trigger_match.group(1))
        return sorted(set(triggers))
    return []


def extract_runs_on(text: str) -> list[str]:
    """Return the sorted, unique runner labels found under ``runs-on:`` keys.

    Handles inline scalars, inline flow sequences, block sequences and the
    ``labels:`` entry of the mapping form. Values that reference ``secrets.``
    are dropped.
    """
    labels: list[str] = []
    for inline, children in _key_blocks(RUNS_ON_RE, text):
        if inline:
            labels.extend(parse_inline_list(inline))
            continue
        for child in children:
            item = _list_item(child)
            if item is not None:
                labels.extend(parse_inline_list(item))
            elif child.startswith("labels:"):
                labels.extend(parse_inline_list(child.split(":", 1)[1]))
    return sorted({label for label in labels if label and "secrets." not in label})


def extract_environments(text: str) -> list[str]:
    """Return the sorted, unique environment names found under ``environment:`` keys.

    Handles the scalar form (``environment: staging``) and the block mapping
    form (a nested ``name:`` entry). Inline flow mappings (``{ ... }``) and
    values that reference ``secrets.`` are skipped.
    """
    names: list[str] = []
    for inline, children in _key_blocks(ENVIRONMENT_RE, text):
        if inline:
            candidates = [inline]
        else:
            candidates = [
                child.split(":", 1)[1].strip()
                for child in children
                if child.startswith("name:")
            ]
        for value in candidates:
            if value and not value.startswith("{") and "secrets." not in value:
                names.extend(parse_inline_list(value))
    return sorted(set(names))


def extract_secret_names(text: str) -> list[str]:
    """Return the sorted secret names referenced inside ``${{ ... }}`` expressions.

    Only names are collected; nothing outside an expression is inspected.
    """
    names: set[str] = set()
    for expression in EXPRESSION_RE.findall(text):
        if "secrets" not in expression:
            continue
        for match in SECRET_RE.finditer(expression):
            # Exactly one of the two groups (dot form / bracket form) is set.
            names.update(name for name in match.groups() if name)
    return sorted(names)


def provider_for(path: Path) -> str:
    """Classify ``path`` as ``gitea``, ``github`` or ``unknown`` from its components.

    ``.gitea`` takes precedence when both directory names occur in the path.
    """
    parts = path.parts
    if ".gitea" in parts:
        return "gitea"
    if ".github" in parts:
        return "github"
    return "unknown"


def workflow_files(repo: Path) -> list[Path]:
    """List ``*.yml`` / ``*.yaml`` files under the repo's Gitea and GitHub workflow dirs.

    Gitea files come first; each directory's files are sorted by path.
    """
    files: list[Path] = []
    for base in [repo / ".gitea" / "workflows", repo / ".github" / "workflows"]:
        if not base.is_dir():
            continue
        files.extend(
            path
            for path in sorted(base.rglob("*"))
            if path.is_file() and path.suffix.lower() in {".yml", ".yaml"}
        )
    return files


def codeowners_files(repo: Path) -> list[Path]:
    """Return the CODEOWNERS files that exist at the locations this tool checks."""
    candidates = [
        repo / "CODEOWNERS",
        repo / ".github" / "CODEOWNERS",
        repo / ".gitea" / "CODEOWNERS",
    ]
    return [path for path in candidates if path.is_file()]


def workflow_item(repo: Path, path: Path) -> dict[str, object]:
    """Summarise one workflow file as names only (no values are read out).

    Args:
        repo: Root of the local working tree.
        path: Workflow file located inside ``repo``.

    Returns:
        A dict with provider, repo-relative path, display name (file stem when
        no ``name:`` is declared), triggers, runner labels, environments and
        referenced secret names.
    """
    text = path.read_text(encoding="utf-8", errors="replace")
    rel = path.relative_to(repo)
    return {
        # Classify from the repo-relative path so that a parent directory of
        # the checkout named `.gitea` / `.github` cannot skew the result.
        "provider": provider_for(rel),
        "workflow_file_path": str(rel),
        "workflow_display_name": first_scalar(NAME_RE, text, path.stem),
        "trigger_names": extract_on_block(text.splitlines()),
        "runner_label_names": extract_runs_on(text),
        "environment_names": extract_environments(text),
        "referenced_secret_names": extract_secret_names(text),
    }


def codeowners_item(repo: Path, path: Path) -> dict[str, object]:
    """Collect the distinct ``@owner`` tokens of one CODEOWNERS file.

    The first token of each rule line is the path pattern and is skipped.
    Tokens inside a trailing ``#`` comment are ignored.
    """
    owners: set[str] = set()
    text = path.read_text(encoding="utf-8", errors="replace")
    for line in text.splitlines():
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            continue
        for token in stripped.split()[1:]:
            if token.startswith("#"):
                break  # rest of the line is an inline comment
            if token.startswith("@"):
                owners.add(token)
    return {
        "codeowners_path": str(path.relative_to(repo)),
        "owner_tokens": sorted(owners),
        "owner_token_count": len(owners),
    }
# Inventory lanes that local files cannot answer; every repo entry lists them.
API_REQUIRED_LANES = (
    "webhook_inventory",
    "deploy_key_inventory",
    "branch_protection_inventory",
    "repository_secret_name_parity",
)


def _unique_names(items: list[dict[str, object]], field: str) -> list[str]:
    """Return the sorted, de-duplicated string entries of ``item[field]`` over ``items``."""
    return sorted(
        {
            name
            for item in items
            for name in item[field]
            if isinstance(name, str)
        }
    )


def repo_item(spec: dict[str, str]) -> dict[str, object]:
    """Build the inventory entry for one repository spec.

    Args:
        spec: Parsed ``--repo`` fields; ``repo_path`` is the local working tree.

    Returns:
        The spec fields plus ``local_status``, per-file workflow and CODEOWNERS
        summaries, aggregated secret / runner / environment names, and the
        fixed ``api_required_lanes`` and ``still_forbidden`` lists.
        ``local_status`` is ``missing_local_repo`` when ``repo_path`` is not a
        directory, ``partial_local_evidence`` when any workflow or CODEOWNERS
        file was found, otherwise ``local_repo_visible_no_workflow_files``.
    """
    repo = Path(spec["repo_path"])
    workflows: list[dict[str, object]] = []
    codeowners: list[dict[str, object]] = []
    # A regular file at repo_path is not a working tree, so require a directory.
    if not repo.is_dir():
        local_status = "missing_local_repo"
    else:
        workflows = [workflow_item(repo, path) for path in workflow_files(repo)]
        codeowners = [codeowners_item(repo, path) for path in codeowners_files(repo)]
        if workflows or codeowners:
            local_status = "partial_local_evidence"
        else:
            local_status = "local_repo_visible_no_workflow_files"
    return {
        **spec,
        "local_status": local_status,
        "workflow_files": workflows,
        "codeowners_files": codeowners,
        "referenced_secret_names": _unique_names(workflows, "referenced_secret_names"),
        "runner_label_names": _unique_names(workflows, "runner_label_names"),
        "environment_names": _unique_names(workflows, "environment_names"),
        # Fresh lists per entry so no two entries share one mutable object.
        "api_required_lanes": list(API_REQUIRED_LANES),
        "still_forbidden": list(STILL_FORBIDDEN),
    }


def build_snapshot(args: argparse.Namespace) -> dict[str, object]:
    """Assemble the full evidence document from the parsed CLI arguments.

    Args:
        args: Namespace providing ``repo`` (list of ``--repo`` specs) and ``date``.

    Returns:
        The JSON-serialisable snapshot: fixed schema/status fields, a summary
        of counts, the unique secret and runner names, and one entry per repo.

    Raises:
        ValueError: If a ``--repo`` spec is malformed (from ``parse_repo_spec``).
    """
    repos = [repo_item(parse_repo_spec(value)) for value in args.repo]
    all_workflows = [item for repo in repos for item in repo["workflow_files"]]
    secret_names = _unique_names(repos, "referenced_secret_names")
    runner_labels = _unique_names(repos, "runner_label_names")
    local_evidence_repos = [
        repo for repo in repos if repo["local_status"] == "partial_local_evidence"
    ]
    return {
        "schema_version": "source_control_workflow_secret_name_local_evidence_v1",
        "status": "draft_partial_local_evidence",
        "date": args.date,
        "mode": "local_read_only_redacted_inventory",
        "runtime_execution_authorized": False,
        "source_contract": "source_control_workflow_secret_name_inventory_v1",
        "summary": {
            "candidate_repo_count": len(repos),
            "local_repo_visible_count": sum(
                1 for repo in repos if repo["local_status"] != "missing_local_repo"
            ),
            "local_evidence_repo_count": len(local_evidence_repos),
            "workflow_file_count": len(all_workflows),
            "gitea_workflow_file_count": sum(
                1 for item in all_workflows if item["provider"] == "gitea"
            ),
            "github_workflow_file_count": sum(
                1 for item in all_workflows if item["provider"] == "github"
            ),
            "codeowners_file_count": sum(
                len(repo["codeowners_files"]) for repo in repos
            ),
            "unique_secret_name_count": len(secret_names),
            "runner_label_count": len(runner_labels),
            "secret_value_collection_allowed": False,
            "secret_value_detected": False,
            "runtime_actions_authorized": False,
            "action_buttons_allowed": False,
        },
        "unique_secret_names": secret_names,
        "runner_label_names": runner_labels,
        "repos": repos,
        "redaction_rules": [
            "只保存 workflow 內引用的 secret 名稱,不保存 secret value。",
            "不讀取 .env、secrets、private key、runner registration token 或 webhook secret。",
            "不呼叫 GitHub / Gitea API,因此 webhook、deploy key、branch protection 與 repository secret parity 仍需後續 redacted export 或 read-only API evidence。",
            "任何含 raw secret/token/private key 的 payload 都必須拒收並進 quarantine。",
        ],
        "forbidden_actions": list(STILL_FORBIDDEN),
    }


def write_json(path: Path, payload: dict[str, object]) -> None:
    """Write ``payload`` to ``path`` as indented UTF-8 JSON with a trailing newline.

    Missing parent directories are created; non-ASCII text is written verbatim.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    path.write_text(serialized + "\n", encoding="utf-8")


def main() -> int:
    """Parse the command line, build the snapshot and write it to ``--output``.

    Returns:
        ``0`` on success. A malformed ``--repo`` value ends the process through
        ``parser.error`` (usage message, exit status 2).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    # NOTE(review): the default is a fixed date, not "today" - confirm this is intended.
    parser.add_argument("--date", default="2026-05-13")
    parser.add_argument("--repo", action="append", required=True)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()
    # Validate specs up front so a typo yields a usage error, not a traceback.
    try:
        for value in args.repo:
            parse_repo_spec(value)
    except ValueError as exc:
        parser.error(str(exc))
    write_json(Path(args.output), build_snapshot(args))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())