#!/usr/bin/env python3
|
||
"""Build local read-only workflow / secret-name inventory evidence.
|
||
|
||
The collector reads only repository files that are already present in local
|
||
working trees:
|
||
|
||
- .github/workflows/*
|
||
- .gitea/workflows/*
|
||
- CODEOWNERS / .github/CODEOWNERS
|
||
|
||
It does not call GitHub or Gitea APIs, does not read .env files, does not read
|
||
repository secret stores, and does not collect secret values. The output keeps
|
||
only secret names referenced by workflow expressions such as
|
||
`${{ secrets.NAME }}`.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
import re
|
||
from pathlib import Path
|
||
|
||
|
||
# Secret reference inside an expression: ``secrets.NAME`` (group 1) or
# ``secrets['NAME']`` / ``secrets["NAME"]`` (group 2).
SECRET_RE = re.compile(
    r"secrets(?:\.([A-Za-z_][A-Za-z0-9_]*)|\[['\"]([^'\"]+)['\"]\])"
)
# Body of a ``${{ ... }}`` expression. DOTALL lets an expression that is
# wrapped across several lines (e.g. inside a YAML block scalar) match too.
EXPRESSION_RE = re.compile(r"\$\{\{(.+?)\}\}", re.DOTALL)
# Top-level ``name:`` scalar. Only spaces/tabs may follow the colon so that an
# empty ``name:`` never swallows the following line as its value.
NAME_RE = re.compile(r"^name:[ \t]*([^\r\n]+?)[ \t]*\r?$", re.MULTILINE)
# Key at column 0 (ends an indented block).
TOP_LEVEL_KEY_RE = re.compile(r"^([A-Za-z_][A-Za-z0-9_-]*):")
# Key indented by at least two whitespace characters.
INDENTED_KEY_RE = re.compile(r"^\s{2,}([A-Za-z_][A-Za-z0-9_-]*):")
# Inline ``runs-on:`` / ``environment:`` values. The value must sit on the same
# line as the key; ``\s*`` would cross the newline and capture the first line
# of a block-style value (for example ``- self-hosted`` or ``name: prod``).
RUNS_ON_RE = re.compile(r"runs-on:[ \t]*([^\r\n]+)")
ENVIRONMENT_RE = re.compile(r"environment:[ \t]*([^\r\n]+)")

# Actions this collector never performs; echoed verbatim into the output.
STILL_FORBIDDEN = [
    "collect secret value",
    "read .env or secret store",
    "modify workflow",
    "modify webhook",
    "rotate secret",
    "create GitHub repo",
    "sync refs",
    "switch GitHub primary",
    "disable Gitea",
]
|
||
|
||
|
||
def parse_repo_spec(value: str) -> dict[str, str]:
    """Parse one ``--repo`` argument into a repository spec mapping.

    Args:
        value: Six ``|``-separated fields in the order
            ``key|path|github_repo|source_key|scope_status|risk``.

    Returns:
        A dict with the keys ``repo_key``, ``repo_path``, ``github_repo``,
        ``source_key``, ``scope_status`` and ``risk``. ``repo_path`` is
        user-expanded and resolved to an absolute path string.

    Raises:
        ValueError: If ``value`` does not contain exactly six fields.
    """
    fields = value.split("|")
    if len(fields) != 6:
        raise ValueError(
            "--repo format must be key|path|github_repo|source_key|scope_status|risk"
        )
    names = (
        "repo_key",
        "repo_path",
        "github_repo",
        "source_key",
        "scope_status",
        "risk",
    )
    spec = dict(zip(names, fields))
    # Normalise the path so later existence checks do not depend on the cwd.
    spec["repo_path"] = str(Path(spec["repo_path"]).expanduser().resolve())
    return spec
|
||
|
||
|
||
def strip_quotes(value: str) -> str:
    """Trim whitespace and edge commas, then drop one pair of matching quotes.

    Args:
        value: Raw scalar text, e.g. one item of an inline YAML list.

    Returns:
        The text without surrounding whitespace, without leading/trailing
        commas and, when it both starts and ends with the same quote
        character (``"`` or ``'``), without that outer pair.
    """
    cleaned = value.strip().strip(",")
    for quote in ('"', "'"):
        if cleaned.startswith(quote) and cleaned.endswith(quote):
            return cleaned[1:-1]
    return cleaned
|
||
|
||
|
||
def parse_inline_list(value: str) -> list[str]:
    """Split a YAML scalar or inline ``[a, b]`` list into its items.

    Args:
        value: Text that follows a key on the same line.

    Returns:
        For a bracketed value, the comma-separated items with quotes removed
        and empty items dropped. Otherwise a one-element list holding the
        unquoted value, or an empty list when nothing is left.
    """
    text = strip_quotes(value.strip())
    if not (text.startswith("[") and text.endswith("]")):
        return [text] if text else []
    entries = (strip_quotes(part) for part in text[1:-1].split(","))
    return [entry for entry in entries if entry]
|
||
|
||
|
||
def first_scalar(pattern: re.Pattern[str], text: str, default: str = "") -> str:
    """Return the unquoted first capture group of the first match.

    Args:
        pattern: Compiled regex with at least one capture group.
        text: Text to search.
        default: Value returned when ``pattern`` does not match.

    Returns:
        Group 1 of the first match passed through ``strip_quotes``, or
        ``default`` when there is no match.
    """
    found = pattern.search(text)
    if found is None:
        return default
    return strip_quotes(found.group(1))
|
||
|
||
|
||
# Direct child key of the ``on:`` mapping, optionally quoted.
_TRIGGER_KEY_RE = re.compile(r"""^["']?([A-Za-z_][A-Za-z0-9_-]*)["']?\s*:""")


def extract_on_block(lines: list[str]) -> list[str]:
    """Return the trigger names declared by the workflow's top-level ``on`` key.

    Handles the inline forms (``on: push``, ``on: [push, pull_request]``), the
    mapping form and the block-sequence form. The key may be written quoted
    (``"on":``). Only entries that are *direct* children of ``on`` count as
    triggers; deeper keys such as ``branches``, ``paths`` or ``inputs`` are
    trigger configuration and are skipped.

    Args:
        lines: The workflow file split into lines.

    Returns:
        Inline values in file order, or the sorted unique names of the block
        form. An empty list when no top-level ``on`` key exists.
    """
    for index, line in enumerate(lines):
        key, separator, remainder = line.partition(":")
        # The key must start at column 0; indentation is deliberately kept.
        if not separator or key.rstrip() not in {"on", '"on"', "'on'"}:
            continue

        inline = remainder.strip()
        # A trailing ``# comment`` is not part of the value.
        inline = "" if inline.startswith("#") else inline.split(" #", 1)[0].strip()
        if inline:
            return parse_inline_list(inline)

        triggers: set[str] = set()
        child_indent: int | None = None
        for next_line in lines[index + 1 :]:
            content = next_line.strip()
            if not content or content.startswith("#"):
                continue
            indent = len(next_line) - len(next_line.lstrip())
            is_item = content == "-" or content.startswith("- ")
            # A non-sequence line at column 0 starts the next top-level key.
            if indent == 0 and not is_item:
                break
            if child_indent is None:
                # The first content line fixes the indent of direct children.
                child_indent = indent
            if indent < child_indent:
                break
            if indent > child_indent:
                continue  # nested configuration of a trigger
            if is_item:
                name = content[1:].split(" #", 1)[0].strip().strip("'\"")
                if name:
                    triggers.add(name)
                continue
            key_match = _TRIGGER_KEY_RE.match(content)
            if key_match:
                triggers.add(key_match.group(1))
        return sorted(triggers)
    return []
|
||
|
||
|
||
def extract_runs_on(text: str) -> list[str]:
    """Collect the runner labels named by ``runs-on`` keys.

    Supported forms:

    * inline scalar or list: ``runs-on: ubuntu-latest`` / ``runs-on: [a, b]``
    * block sequence: ``runs-on:`` followed by ``- label`` lines
    * mapping with ``labels:`` (inline value or nested ``- label`` lines)

    The text is processed line by line so a value is never read from the line
    after the key by accident. Commented-out lines are ignored.

    Args:
        text: Full workflow file content.

    Returns:
        Sorted unique labels. Labels containing ``secrets.`` are dropped.
    """
    labels: set[str] = set()
    lines = text.splitlines()
    for index, line in enumerate(lines):
        stripped = line.strip()
        if stripped.startswith("#"):
            continue
        match = RUNS_ON_RE.search(line)
        value = match.group(1).split("#", 1)[0].strip() if match else ""
        if value:
            labels.update(parse_inline_list(value))
            continue
        # Only a bare ``runs-on:`` (optionally with a comment) opens a block.
        if stripped.split("#", 1)[0].strip() != "runs-on:":
            continue

        key_indent = len(line) - len(line.lstrip())
        for follow in lines[index + 1 :]:
            entry = follow.strip()
            if not entry or entry.startswith("#"):
                continue
            indent = len(follow) - len(follow.lstrip())
            is_item = entry.startswith("- ")
            # YAML allows sequence items at the same indent as their key; any
            # other line at or left of the key belongs to the next key.
            if indent < key_indent or (indent == key_indent and not is_item):
                break
            if is_item:
                item = entry[2:]
            elif entry.startswith("labels:"):
                item = entry[len("labels:") :]
            else:
                continue  # e.g. ``group:`` in the mapping form
            labels.update(parse_inline_list(item.split("#", 1)[0].strip()))
    return sorted(label for label in labels if label and "secrets." not in label)
|
||
|
||
|
||
def extract_environments(text: str) -> list[str]:
    """Collect the deployment environment names referenced by ``environment``.

    Supported forms:

    * inline: ``environment: production``
    * mapping: ``environment:`` followed by an indented ``name: production``

    The text is processed line by line so the mapping form is not mistaken
    for an inline value. Commented-out lines are ignored. Inline values that
    start with ``{`` and any value containing ``secrets.`` are skipped.

    Args:
        text: Full workflow file content.

    Returns:
        Sorted unique environment names.
    """
    names: set[str] = set()
    lines = text.splitlines()
    for index, line in enumerate(lines):
        stripped = line.strip()
        if stripped.startswith("#"):
            continue
        match = ENVIRONMENT_RE.search(line)
        value = match.group(1).split("#", 1)[0].strip() if match else ""
        if value:
            if not value.startswith("{") and "secrets." not in value:
                names.update(parse_inline_list(value))
            continue
        # Only a bare ``environment:`` (optionally with a comment) opens a block.
        if stripped.split("#", 1)[0].strip() != "environment:":
            continue

        key_indent = len(line) - len(line.lstrip())
        for follow in lines[index + 1 :]:
            entry = follow.strip()
            if not entry or entry.startswith("#"):
                continue
            indent = len(follow) - len(follow.lstrip())
            if indent <= key_indent:
                break  # left the environment mapping
            if entry.startswith("name:"):
                name = entry[len("name:") :].split("#", 1)[0].strip()
                if name and "secrets." not in name:
                    names.update(parse_inline_list(name))
                break
    return sorted(names)
|
||
|
||
|
||
def extract_secret_names(text: str) -> list[str]:
    """Return the secret names referenced inside ``${{ ... }}`` expressions.

    Only names are collected; this function never sees or stores secret
    values. Both ``secrets.NAME`` and ``secrets['NAME']`` spellings count.

    Args:
        text: Full workflow file content.

    Returns:
        Sorted unique secret names.
    """
    found: set[str] = set()
    for expression in EXPRESSION_RE.findall(text):
        # Cheap substring test before running the more expensive regex.
        if "secrets" in expression:
            found.update(
                name
                for reference in SECRET_RE.finditer(expression)
                for name in reference.groups()
                if name
            )
    return sorted(found)
|
||
|
||
|
||
def provider_for(path: Path) -> str:
    """Classify a workflow file by the provider directory that contains it.

    The path components are scanned from the right, so the marker directory
    closest to the file decides. A ``.gitea`` or ``.github`` component higher
    up (for example a repository that is itself named ``.github``, or a
    checkout that lives under a ``.gitea`` directory) cannot override it.

    Args:
        path: Path of the workflow file, absolute or relative.

    Returns:
        ``"gitea"``, ``"github"`` or ``"unknown"``.
    """
    providers = {".gitea": "gitea", ".github": "github"}
    for part in reversed(path.parts):
        if part in providers:
            return providers[part]
    return "unknown"
|
||
|
||
|
||
def workflow_files(repo: Path) -> list[Path]:
    """List the YAML workflow files found in a local working tree.

    Searches ``.gitea/workflows`` first and ``.github/workflows`` second,
    recursively, keeping regular files whose suffix is ``.yml`` or ``.yaml``
    (case-insensitive).

    Args:
        repo: Root directory of the repository.

    Returns:
        Matching paths, sorted within each provider directory.
    """
    suffixes = (".yml", ".yaml")
    found: list[Path] = []
    for provider_dir in (".gitea", ".github"):
        base = repo / provider_dir / "workflows"
        if base.exists():
            found.extend(
                candidate
                for candidate in sorted(base.rglob("*"))
                if candidate.is_file() and candidate.suffix.lower() in suffixes
            )
    return found
|
||
|
||
|
||
def codeowners_files(repo: Path) -> list[Path]:
    """List the CODEOWNERS files present in a local working tree.

    Checks the repository root, ``.github/``, ``.gitea/`` and ``docs/``.
    ``docs/CODEOWNERS`` is included because both GitHub and Gitea accept a
    CODEOWNERS file in that directory.

    Args:
        repo: Root directory of the repository.

    Returns:
        The candidate paths that exist as regular files, in the order above.
    """
    candidates = (
        repo / "CODEOWNERS",
        repo / ".github" / "CODEOWNERS",
        repo / ".gitea" / "CODEOWNERS",
        repo / "docs" / "CODEOWNERS",
    )
    # is_file() is already False for missing paths and for directories.
    return [path for path in candidates if path.is_file()]
|
||
|
||
|
||
def workflow_item(repo: Path, path: Path) -> dict[str, object]:
    """Summarise one workflow file as a redacted inventory record.

    The file is decoded as UTF-8 with undecodable bytes replaced, so a badly
    encoded workflow does not abort the run.

    Args:
        repo: Root directory of the repository.
        path: Workflow file located under ``repo``.

    Returns:
        A dict with the provider, the repo-relative file path, the display
        name (falling back to the file stem), trigger names, runner labels,
        environment names and referenced secret names.
    """
    content = path.read_text(encoding="utf-8", errors="replace")
    relative_path = path.relative_to(repo)
    display_name = first_scalar(NAME_RE, content, path.stem)

    item: dict[str, object] = {}
    item["provider"] = provider_for(path)
    item["workflow_file_path"] = str(relative_path)
    item["workflow_display_name"] = display_name
    item["trigger_names"] = extract_on_block(content.splitlines())
    item["runner_label_names"] = extract_runs_on(content)
    item["environment_names"] = extract_environments(content)
    item["referenced_secret_names"] = extract_secret_names(content)
    return item
|
||
|
||
|
||
def codeowners_item(repo: Path, path: Path) -> dict[str, object]:
    """Summarise the ``@owner`` tokens of one CODEOWNERS file.

    On each rule line the first token is the path pattern; the following
    tokens that start with ``@`` are owners. Blank lines and comment lines
    are skipped, and everything from an inline ``#`` comment onwards is
    ignored so an ``@mention`` inside a comment is not counted as an owner.

    Args:
        repo: Root directory of the repository.
        path: CODEOWNERS file located under ``repo``.

    Returns:
        A dict with the repo-relative path, the sorted unique owner tokens
        and their count.
    """
    owners: set[str] = set()
    text = path.read_text(encoding="utf-8", errors="replace")
    for line in text.splitlines():
        tokens = line.split()
        if not tokens or tokens[0].startswith("#"):
            continue
        for token in tokens[1:]:
            if token.startswith("#"):
                break  # the rest of the line is an inline comment
            if token.startswith("@"):
                owners.add(token)
    return {
        "codeowners_path": str(path.relative_to(repo)),
        "owner_tokens": sorted(owners),
        "owner_token_count": len(owners),
    }
|
||
|
||
|
||
def repo_item(spec: dict[str, str]) -> dict[str, object]:
    """Build the inventory record for one candidate repository.

    Args:
        spec: Repository spec as produced by ``parse_repo_spec``; its
            ``repo_path`` entry names the local working tree.

    Returns:
        The spec fields followed by ``local_status``, the per-file workflow
        and CODEOWNERS records, the repository-wide sorted unique secret
        names, runner labels and environment names, the inventory lanes that
        still need API evidence, and the forbidden-action list.
        ``local_status`` is ``missing_local_repo`` when the path does not
        exist, ``partial_local_evidence`` when at least one workflow or
        CODEOWNERS file was found, and
        ``local_repo_visible_no_workflow_files`` otherwise.
    """
    root = Path(spec["repo_path"])
    workflows: list[dict[str, object]] = []
    codeowners: list[dict[str, object]] = []

    if not root.exists():
        status = "missing_local_repo"
    else:
        workflows.extend(workflow_item(root, found) for found in workflow_files(root))
        codeowners.extend(
            codeowners_item(root, found) for found in codeowners_files(root)
        )
        if workflows or codeowners:
            status = "partial_local_evidence"
        else:
            status = "local_repo_visible_no_workflow_files"

    def collect(field: str) -> list[str]:
        # Sorted union of one list-valued field across all workflow records.
        return sorted(
            {
                name
                for workflow in workflows
                for name in workflow[field]
                if isinstance(name, str)
            }
        )

    return {
        **spec,
        "local_status": status,
        "workflow_files": workflows,
        "codeowners_files": codeowners,
        "referenced_secret_names": collect("referenced_secret_names"),
        "runner_label_names": collect("runner_label_names"),
        "environment_names": collect("environment_names"),
        "api_required_lanes": [
            "webhook_inventory",
            "deploy_key_inventory",
            "branch_protection_inventory",
            "repository_secret_name_parity",
        ],
        "still_forbidden": STILL_FORBIDDEN,
    }
|
||
|
||
|
||
def build_snapshot(args: argparse.Namespace) -> dict[str, object]:
    """Assemble the complete evidence snapshot for all ``--repo`` arguments.

    Args:
        args: Parsed command-line namespace; ``args.repo`` holds the raw repo
            specs and ``args.date`` the snapshot date string.

    Returns:
        The JSON-serialisable snapshot: schema/status metadata, a summary of
        counts and fixed safety flags, the unique secret names and runner
        labels across all repositories, the per-repository records, the
        redaction rules and the forbidden-action list.

    Raises:
        ValueError: Propagated from ``parse_repo_spec`` for a malformed spec.
    """
    repos = [repo_item(parse_repo_spec(raw)) for raw in args.repo]
    all_workflows = [item for entry in repos for item in entry["workflow_files"]]

    def union_of(field: str) -> list[str]:
        # Sorted union of one list-valued field across all repositories.
        merged: set[str] = set()
        for entry in repos:
            merged.update(name for name in entry[field] if isinstance(name, str))
        return sorted(merged)

    def provider_total(provider: str) -> int:
        return len([item for item in all_workflows if item["provider"] == provider])

    secret_names = union_of("referenced_secret_names")
    runner_labels = union_of("runner_label_names")
    visible_total = len(
        [entry for entry in repos if entry["local_status"] != "missing_local_repo"]
    )
    evidence_total = len(
        [entry for entry in repos if entry["local_status"] == "partial_local_evidence"]
    )
    codeowners_total = sum(len(entry["codeowners_files"]) for entry in repos)

    summary = {
        "candidate_repo_count": len(repos),
        "local_repo_visible_count": visible_total,
        "local_evidence_repo_count": evidence_total,
        "workflow_file_count": len(all_workflows),
        "gitea_workflow_file_count": provider_total("gitea"),
        "github_workflow_file_count": provider_total("github"),
        "codeowners_file_count": codeowners_total,
        "unique_secret_name_count": len(secret_names),
        "runner_label_count": len(runner_labels),
        "secret_value_collection_allowed": False,
        "secret_value_detected": False,
        "runtime_actions_authorized": False,
        "action_buttons_allowed": False,
    }
    return {
        "schema_version": "source_control_workflow_secret_name_local_evidence_v1",
        "status": "draft_partial_local_evidence",
        "date": args.date,
        "mode": "local_read_only_redacted_inventory",
        "runtime_execution_authorized": False,
        "source_contract": "source_control_workflow_secret_name_inventory_v1",
        "summary": summary,
        "unique_secret_names": secret_names,
        "runner_label_names": runner_labels,
        "repos": repos,
        "redaction_rules": [
            "只保存 workflow 內引用的 secret 名稱,不保存 secret value。",
            "不讀取 .env、secrets、private key、runner registration token 或 webhook secret。",
            "不呼叫 GitHub / Gitea API,因此 webhook、deploy key、branch protection 與 repository secret parity 仍需後續 redacted export 或 read-only API evidence。",
            "任何含 raw secret/token/private key 的 payload 都必須拒收並進 quarantine。",
        ],
        "forbidden_actions": STILL_FORBIDDEN,
    }
|
||
|
||
|
||
def write_json(path: Path, payload: dict[str, object]) -> None:
    """Write ``payload`` to ``path`` as indented UTF-8 JSON.

    Missing parent directories are created. Non-ASCII characters are written
    as-is rather than escaped, and the file ends with a newline.

    Args:
        path: Destination file; an existing file is overwritten.
        payload: JSON-serialisable mapping.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(serialized + "\n")
|
||
|
||
|
||
def main(argv: list[str] | None = None) -> int:
    """Run the collector from the command line.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Returns:
        ``0`` after the snapshot has been written. Invalid arguments,
        including a malformed ``--repo`` spec, end the process through
        argparse's usual usage error.
    """

    def repo_spec(value: str) -> str:
        # Validate early so a malformed spec is reported as a usage error
        # carrying the expected format. The raw string is returned unchanged
        # because build_snapshot parses it itself.
        try:
            parse_repo_spec(value)
        except ValueError as exc:
            raise argparse.ArgumentTypeError(str(exc)) from exc
        return value

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--date", default="2026-05-13")
    parser.add_argument("--repo", action="append", required=True, type=repo_spec)
    parser.add_argument("--output", required=True)
    args = parser.parse_args(argv)
    write_json(Path(args.output), build_snapshot(args))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|