# awoooi/scripts/security/source-control-workflow-secret-name-local-inventory.py

#!/usr/bin/env python3
"""Build local read-only workflow / secret-name inventory evidence.

The collector reads only repository files that are already present in local
working trees:

- .github/workflows/*
- .gitea/workflows/*
- CODEOWNERS / .github/CODEOWNERS / .gitea/CODEOWNERS

It does not call GitHub or Gitea APIs, does not read .env files, does not read
repository secret stores, and does not collect secret values. The output keeps
only secret names referenced by workflow expressions such as
`${{ secrets.NAME }}`.
"""

from __future__ import annotations

import argparse
import json
import re
from pathlib import Path


# Reference to the `secrets` context inside an expression, in property form
# (`secrets.NAME`, group 1) or index form (`secrets['NAME']`, group 2).
SECRET_RE = re.compile(
    r"secrets(?:\.([A-Za-z_][A-Za-z0-9_]*)|\[['\"]([^'\"]+)['\"]\])"
)
# Body of a `${{ ... }}` expression. DOTALL lets an expression that wraps
# across several lines be captured instead of being silently skipped.
EXPRESSION_RE = re.compile(r"\$\{\{(.+?)\}\}", re.DOTALL)
# Top-level `name:` scalar. `[ \t]*` (not `\s*`) keeps the value on the same
# line, so an empty `name:` cannot swallow the following line.
NAME_RE = re.compile(r"^name:[ \t]*(.+?)\s*$", re.MULTILINE)
# A key starting in column 0, e.g. `jobs:`.
TOP_LEVEL_KEY_RE = re.compile(r"^([A-Za-z_][A-Za-z0-9_-]*):")
# A key indented by at least two whitespace characters.
INDENTED_KEY_RE = re.compile(r"^\s{2,}([A-Za-z_][A-Za-z0-9_-]*):")
# `runs-on:` / `environment:` with a value on the same line. Anchored to the
# start of a line so comments (`# runs-on: ...`) and longer keys
# (`DEPLOY_environment:`) do not match; `[ \t]*` prevents the capture from
# jumping to the next line when the key has a block value.
RUNS_ON_RE = re.compile(r"^[ \t]*runs-on:[ \t]*(.+)", re.MULTILINE)
ENVIRONMENT_RE = re.compile(r"^[ \t]*environment:[ \t]*(.+)", re.MULTILINE)

# Action labels copied verbatim into the output: once per repo entry under
# "still_forbidden" and once at snapshot level under "forbidden_actions".
# The same list object is shared by every entry, so treat it as read-only.
STILL_FORBIDDEN = [
    "collect secret value",
    "read .env or secret store",
    "modify workflow",
    "modify webhook",
    "rotate secret",
    "create GitHub repo",
    "sync refs",
    "switch GitHub primary",
    "disable Gitea",
]


def parse_repo_spec(value: str) -> dict[str, str]:
    """Split one ``--repo`` argument into its six named fields.

    The argument is pipe-delimited:
    ``key|path|github_repo|source_key|scope_status|risk``. The path field is
    user-expanded and resolved to an absolute path string; the other fields
    are passed through unchanged.

    Raises:
        ValueError: if the argument does not contain exactly six fields.
    """
    fields = value.split("|")
    if len(fields) != 6:
        raise ValueError(
            "--repo format must be key|path|github_repo|source_key|scope_status|risk"
        )
    field_names = (
        "repo_key",
        "repo_path",
        "github_repo",
        "source_key",
        "scope_status",
        "risk",
    )
    spec = dict(zip(field_names, fields))
    # Re-assigning an existing key keeps its position, so output order is stable.
    spec["repo_path"] = str(Path(spec["repo_path"]).expanduser().resolve())
    return spec


def strip_quotes(value: str) -> str:
    """Trim whitespace, then outer commas, then one pair of matching quotes.

    Only a value that both starts and ends with the same quote character
    (single or double) is unwrapped; anything else is returned as trimmed.
    """
    trimmed = value.strip().strip(",")
    for quote in ('"', "'"):
        if trimmed.startswith(quote) and trimmed.endswith(quote):
            return trimmed[1:-1]
    return trimmed


def parse_inline_list(value: str) -> list[str]:
    """Turn a YAML-ish inline value into a list of unquoted strings.

    A bracketed flow list (``[a, "b"]``) is split on commas, with each item
    unquoted and empty items dropped. Any other non-empty value becomes a
    one-element list; an empty value yields an empty list.
    """
    cleaned = strip_quotes(value.strip())
    if not (cleaned.startswith("[") and cleaned.endswith("]")):
        return [cleaned] if cleaned else []
    items: list[str] = []
    for raw_item in cleaned[1:-1].split(","):
        item = strip_quotes(raw_item)
        if item:
            items.append(item)
    return items


def first_scalar(pattern: re.Pattern[str], text: str, default: str = "") -> str:
    """Return the unquoted first capture group of *pattern*'s first match.

    Falls back to *default* (returned as-is) when *pattern* does not match
    anywhere in *text*.
    """
    found = pattern.search(text)
    if found is None:
        return default
    return strip_quotes(found.group(1))


# Top-level `on` key, bare or quoted, with whatever follows the colon.
_ON_HEADER_RE = re.compile(r"""^(?:on|"on"|'on')[ \t]*:(.*)$""")
# One direct child of `on:`: a mapping key (`push:`) or a sequence item
# (`- push`), optionally quoted. Applied to the whitespace-stripped line.
_ON_ENTRY_RE = re.compile(
    r"""^(?:-[ \t]+)?["']?([A-Za-z_][A-Za-z0-9_-]*)["']?[ \t]*(?::|#|$)"""
)


def extract_on_block(lines: list[str]) -> list[str]:
    """Return the trigger (event) names declared by a workflow's ``on`` key.

    Args:
        lines: The workflow file split into lines.

    Returns:
        For an inline value (``on: push`` or ``on: [push, pull_request]``) the
        parsed items in file order. For a block value, the sorted unique names
        of the direct children only. An empty list if no ``on`` key is found.

    Only entries at the indentation of the first child are collected, so
    nested settings such as ``branches:`` or ``types:`` are not reported as
    triggers. Both the mapping form (``push:``) and the sequence form
    (``- push``) are recognised, but they are not mixed: whichever form the
    first child uses is the only one accepted.
    """
    for index, line in enumerate(lines):
        header = _ON_HEADER_RE.match(line)
        if header is None:
            continue
        # Drop a trailing comment so `on: # triggers` falls through to the
        # block form and `on: push # note` yields just `push`.
        inline = header.group(1).split("#", 1)[0].strip()
        if inline:
            return parse_inline_list(inline)

        triggers: set[str] = set()
        entry_indent = None
        sequence_form = False
        for next_line in lines[index + 1 :]:
            body = next_line.strip()
            if not body or body.startswith("#"):
                continue
            indent = len(next_line) - len(next_line.lstrip())
            is_item = body.startswith("-")
            # Anything else in column 0 is the next top-level key; a `- item`
            # in column 0 may still belong to `on:` (YAML allows that).
            if indent == 0 and not is_item:
                break
            if entry_indent is None:
                entry_indent, sequence_form = indent, is_item
            if indent != entry_indent or is_item != sequence_form:
                continue  # nested configuration of a trigger, not a trigger
            entry = _ON_ENTRY_RE.match(body)
            if entry:
                triggers.add(entry.group(1))
        return sorted(triggers)
    return []


def _runs_on_sequence_items(lines: list[str], key_index: int) -> list[str]:
    """Collect the ``- item`` lines that directly follow a bare ``runs-on:`` key."""
    key_line = lines[key_index]
    key_indent = len(key_line) - len(key_line.lstrip())
    items: list[str] = []
    for line in lines[key_index + 1 :]:
        entry = line.split("#", 1)[0].strip()
        if not entry:
            continue  # blank or comment-only line
        indent = len(line) - len(line.lstrip())
        # The sequence ends at the first line that is not an item or that
        # dedents past the key.
        if indent < key_indent or not entry.startswith("- "):
            break
        items.append(entry[2:].strip())
    return items


def extract_runs_on(text: str) -> list[str]:
    """Return the sorted unique runner labels named by ``runs-on`` keys.

    Handles a same-line scalar (``runs-on: ubuntu-latest``), a flow list
    (``runs-on: [self-hosted, linux]``) and a block sequence (one ``- label``
    per following line). The mapping form (``group:`` / ``labels:``) is not
    expanded. Commented-out lines are ignored, and any label containing
    ``secrets.`` is dropped from the result.
    """
    labels: list[str] = []
    lines = text.splitlines()
    for index, line in enumerate(lines):
        if line.lstrip().startswith("#"):
            continue
        if line.split("#", 1)[0].strip() == "runs-on:":
            # Key with no same-line value: labels are on the following lines.
            for item in _runs_on_sequence_items(lines, index):
                labels.extend(parse_inline_list(item))
            continue
        # Matching one line at a time keeps the capture from spilling onto
        # the next line.
        match = RUNS_ON_RE.search(line)
        if match:
            value = match.group(1).split("#", 1)[0].strip()
            labels.extend(parse_inline_list(value))
    return sorted({label for label in labels if label and "secrets." not in label})


def _environment_block_name(lines: list[str], key_index: int) -> str:
    """Return the ``name:`` value nested under a bare ``environment:`` key, or ""."""
    key_line = lines[key_index]
    key_indent = len(key_line) - len(key_line.lstrip())
    for line in lines[key_index + 1 :]:
        entry = line.split("#", 1)[0].strip()
        if not entry:
            continue  # blank or comment-only line
        if len(line) - len(line.lstrip()) <= key_indent:
            break  # left the environment mapping
        if entry.startswith("name:"):
            return entry[len("name:") :].strip()
    return ""


def extract_environments(text: str) -> list[str]:
    """Return the sorted unique environment names referenced by *text*.

    Handles the scalar form (``environment: production``) and the mapping
    form, where the name is given by a nested ``name:`` key. Commented-out
    lines are ignored. Values that start with ``{`` (inline mappings) or
    contain ``secrets.`` are skipped.
    """
    names: list[str] = []
    lines = text.splitlines()
    for index, line in enumerate(lines):
        if line.lstrip().startswith("#"):
            continue
        if line.split("#", 1)[0].strip() == "environment:":
            # Key with no same-line value: look for the nested `name:`.
            value = _environment_block_name(lines, index)
        else:
            # Matching one line at a time keeps the capture from spilling
            # onto the next line.
            match = ENVIRONMENT_RE.search(line)
            value = match.group(1).split("#", 1)[0].strip() if match else ""
        if value and not value.startswith("{") and "secrets." not in value:
            names.extend(parse_inline_list(value))
    return sorted(set(names))


def extract_secret_names(text: str) -> list[str]:
    """Return the sorted unique secret names referenced in ``${{ ... }}`` expressions.

    Only names are collected, taken from ``secrets.NAME`` or
    ``secrets['NAME']`` references inside an expression body.
    """
    found: set[str] = set()
    for expression in EXPRESSION_RE.findall(text):
        if "secrets" in expression:
            for hit in SECRET_RE.finditer(expression):
                # Exactly one of the two capture groups is set per match.
                found.update(group for group in hit.groups() if group)
    return sorted(found)


def provider_for(path: Path) -> str:
    """Classify a workflow file as ``"gitea"``, ``"github"`` or ``"unknown"``.

    The decision is made by the ``.gitea`` / ``.github`` path component
    closest to the file. Walking from the file upwards keeps an unrelated
    ancestor directory that happens to be called ``.gitea`` or ``.github``
    (for example the folder the repository was cloned into) from overriding
    the directory the workflow actually lives in.
    """
    for part in reversed(path.parts):
        if part == ".gitea":
            return "gitea"
        if part == ".github":
            return "github"
    return "unknown"


def workflow_files(repo: Path) -> list[Path]:
    """List the ``.yml`` / ``.yaml`` files under the repo's workflow directories.

    ``.gitea/workflows`` is searched first, then ``.github/workflows``; each
    directory is walked recursively and its entries are returned in sorted
    order. Missing directories are skipped. The suffix check is
    case-insensitive.
    """
    roots = (repo / ".gitea" / "workflows", repo / ".github" / "workflows")
    return [
        candidate
        for root in roots
        if root.exists()
        for candidate in sorted(root.rglob("*"))
        if candidate.is_file() and candidate.suffix.lower() in {".yml", ".yaml"}
    ]


def codeowners_files(repo: Path) -> list[Path]:
    """Return the CODEOWNERS files present in the repo.

    Checked locations, in order: the repo root, ``.github/`` and ``.gitea/``.
    Only paths that are regular files are returned.
    """
    present: list[Path] = []
    for folder in (repo, repo / ".github", repo / ".gitea"):
        candidate = folder / "CODEOWNERS"
        # is_file() is already False for a path that does not exist.
        if candidate.is_file():
            present.append(candidate)
    return present


def workflow_item(repo: Path, path: Path) -> dict[str, object]:
    """Summarise one workflow file as a dictionary of names only.

    The file is read as UTF-8 with undecodable bytes replaced. The result
    holds the provider, the path relative to *repo*, the display name
    (falling back to the file stem), and the trigger, runner-label,
    environment and secret names found in the text.
    """
    content = path.read_text(encoding="utf-8", errors="replace")
    relative_path = path.relative_to(repo)
    display_name = first_scalar(NAME_RE, content, path.stem)

    item: dict[str, object] = {}
    item["provider"] = provider_for(path)
    item["workflow_file_path"] = str(relative_path)
    item["workflow_display_name"] = display_name
    item["trigger_names"] = extract_on_block(content.splitlines())
    item["runner_label_names"] = extract_runs_on(content)
    item["environment_names"] = extract_environments(content)
    item["referenced_secret_names"] = extract_secret_names(content)
    return item


def codeowners_item(repo: Path, path: Path) -> dict[str, object]:
    """Summarise one CODEOWNERS file as its set of ``@``-prefixed owner tokens.

    Args:
        repo: Repository root, used to report the file path relatively.
        path: The CODEOWNERS file to read (UTF-8, undecodable bytes replaced).

    Returns:
        A dict with the relative path, the sorted unique owner tokens and
        their count.

    Blank lines and full-line comments are skipped. On each rule line the
    first token (the path pattern) is ignored, and scanning stops at the
    first later token that starts with ``#``, so ``@mentions`` inside a
    trailing comment are not counted as owners. Owners not written with a
    leading ``@`` (for example e-mail addresses) are not collected.
    """
    owners: set[str] = set()
    text = path.read_text(encoding="utf-8", errors="replace")
    for line in text.splitlines():
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            continue
        for token in stripped.split()[1:]:
            if token.startswith("#"):
                break  # rest of the line is an inline comment
            if token.startswith("@"):
                owners.add(token)
    return {
        "codeowners_path": str(path.relative_to(repo)),
        "owner_tokens": sorted(owners),
        "owner_token_count": len(owners),
    }


def repo_item(spec: dict[str, str]) -> dict[str, object]:
    """Build the inventory entry for one repository spec.

    The entry starts with every field of *spec* and adds the local status,
    the per-workflow and per-CODEOWNERS summaries, and the sorted union of
    secret, runner-label and environment names across the repo's workflows.

    ``local_status`` is ``"missing_local_repo"`` when the path does not exist
    (no files are read in that case), ``"partial_local_evidence"`` when at
    least one workflow or CODEOWNERS file was found, and
    ``"local_repo_visible_no_workflow_files"`` otherwise.
    """
    repo = Path(spec["repo_path"])
    repo_present = repo.exists()

    workflows: list[dict[str, object]] = []
    codeowners: list[dict[str, object]] = []
    if repo_present:
        workflows = [workflow_item(repo, path) for path in workflow_files(repo)]
        codeowners = [codeowners_item(repo, path) for path in codeowners_files(repo)]

    def merged(field: str) -> list[str]:
        # Sorted union of one list-valued field across all workflow entries.
        return sorted(
            {
                value
                for workflow in workflows
                for value in workflow[field]
                if isinstance(value, str)
            }
        )

    if not repo_present:
        local_status = "missing_local_repo"
    elif workflows or codeowners:
        local_status = "partial_local_evidence"
    else:
        local_status = "local_repo_visible_no_workflow_files"

    return {
        **spec,
        "local_status": local_status,
        "workflow_files": workflows,
        "codeowners_files": codeowners,
        "referenced_secret_names": merged("referenced_secret_names"),
        "runner_label_names": merged("runner_label_names"),
        "environment_names": merged("environment_names"),
        "api_required_lanes": [
            "webhook_inventory",
            "deploy_key_inventory",
            "branch_protection_inventory",
            "repository_secret_name_parity",
        ],
        "still_forbidden": STILL_FORBIDDEN,
    }


def build_snapshot(args: argparse.Namespace) -> dict[str, object]:
    """Assemble the full snapshot document from the parsed CLI arguments.

    Reads ``args.repo`` (a list of ``--repo`` specs) and ``args.date``. Each
    spec is turned into a repo entry; the summary section counts repos,
    workflow files per provider and CODEOWNERS files, and reports the sorted
    union of secret names and runner labels across all repos.
    """
    repos = [repo_item(parse_repo_spec(value)) for value in args.repo]

    # One provider string per workflow file, across every repo.
    providers = [
        item["provider"] for repo in repos for item in repo["workflow_files"]
    ]
    statuses = [repo["local_status"] for repo in repos]

    secret_name_set: set[str] = set()
    runner_label_set: set[str] = set()
    codeowners_count = 0
    for repo in repos:
        codeowners_count += len(repo["codeowners_files"])
        secret_name_set.update(
            name for name in repo["referenced_secret_names"] if isinstance(name, str)
        )
        runner_label_set.update(
            name for name in repo["runner_label_names"] if isinstance(name, str)
        )
    secret_names = sorted(secret_name_set)
    runner_labels = sorted(runner_label_set)

    summary = {
        "candidate_repo_count": len(repos),
        "local_repo_visible_count": sum(
            1 for status in statuses if status != "missing_local_repo"
        ),
        "local_evidence_repo_count": statuses.count("partial_local_evidence"),
        "workflow_file_count": len(providers),
        "gitea_workflow_file_count": providers.count("gitea"),
        "github_workflow_file_count": providers.count("github"),
        "codeowners_file_count": codeowners_count,
        "unique_secret_name_count": len(secret_names),
        "runner_label_count": len(runner_labels),
        "secret_value_collection_allowed": False,
        "secret_value_detected": False,
        "runtime_actions_authorized": False,
        "action_buttons_allowed": False,
    }
    redaction_rules = [
        "只保存 workflow 內引用的 secret 名稱，不保存 secret value。",
        "不讀取 .env、secrets、private key、runner registration token 或 webhook secret。",
        "不呼叫 GitHub / Gitea API，因此 webhook、deploy key、branch protection 與 repository secret parity 仍需後續 redacted export 或 read-only API evidence。",
        "任何含 raw secret/token/private key 的 payload 都必須拒收並進 quarantine。",
    ]
    return {
        "schema_version": "source_control_workflow_secret_name_local_evidence_v1",
        "status": "draft_partial_local_evidence",
        "date": args.date,
        "mode": "local_read_only_redacted_inventory",
        "runtime_execution_authorized": False,
        "source_contract": "source_control_workflow_secret_name_inventory_v1",
        "summary": summary,
        "unique_secret_names": secret_names,
        "runner_label_names": runner_labels,
        "repos": repos,
        "redaction_rules": redaction_rules,
        "forbidden_actions": STILL_FORBIDDEN,
    }


def write_json(path: Path, payload: dict[str, object]) -> None:
    """Write *payload* to *path* as indented UTF-8 JSON with a trailing newline.

    Missing parent directories are created. Non-ASCII characters are written
    as-is rather than escaped.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # Serialise fully before opening the file so a serialisation error
    # cannot leave a truncated file behind.
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(serialized + "\n")


def main() -> int:
    """Parse the command line, build the snapshot and write it to ``--output``.

    ``--repo`` may be given several times and is required, as is
    ``--output``; ``--date`` is optional. Returns 0 on success.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--date", default="2026-05-13")
    cli.add_argument("--repo", action="append", required=True)
    cli.add_argument("--output", required=True)
    options = cli.parse_args()

    snapshot = build_snapshot(options)
    write_json(Path(options.output), snapshot)
    return 0


# Run the collector only when executed as a script; main()'s return value
# becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())