#!/usr/bin/env python3
"""
Phase-0 deterministic pre-scan for the /review-repo command. Python standard library only.

Emits a JSON object to stdout:
  - inventory: roles, adrs, runbooks, playbooks, scripts (the shard list)
  - findings:  exact, no-judgement issues (markers, broken refs, unencrypted vaults)

The *judgement* review — contradictions, design-conformance, stale intent — is done by
the /review-repo fan-out reviewers, NOT here. This script only catches the cheap, exact
things so the reviewers can focus on what needs reasoning.

Usage: python3 scripts/repo-scan.py [repo_root] > scan.json
"""

import json
import os
import re
import sys

ROOT = os.path.abspath(sys.argv[1] if len(sys.argv) > 1 else ".")

# Directory names that are never descended into.
PRUNE = {".git", ".venv", ".collections", ".ansible", ".worktrees",
         ".pytest_cache", "node_modules", "__pycache__"}
SKIP_PREFIX = os.path.join("docs", "reviews")  # don't scan our own reports
# Only files with these extensions are scanned line by line.
SOURCE_EXTS = {".yml", ".yaml", ".j2", ".py", ".sh", ".md", ".tf", ".cfg", ".ini"}

# NOTE(review): the four definitions below were garbled in the reviewed copy of
# this file (everything between "(?<" and the ">" of a "<>*${}" literal was
# lost). They are reconstructed from how scan() uses them; confirm the exact
# patterns against version control before relying on them.
#
# Group 1 is the marker word; the lookbehind stops matches inside identifiers.
MARKER_RE = re.compile(r"(?<!\w)(TODO|FIXME|XXX|HACK)\b")
# Group 1 is the three-digit ADR number, compared against adr_numbers().
ADR_REF_RE = re.compile(r"\bADR-(\d{3})\b")
# Group 0 is a repo-relative path under one of the inventoried directories.
# Placeholder characters are part of the match so templated paths are consumed
# whole and then rejected via PLACEHOLDER instead of being checked truncated.
PATH_REF_RE = re.compile(r"\b(?:roles|playbooks|docs|scripts)/[\w./<>*${}-]+")
# A reference containing any of these characters is a template, not a real path.
PLACEHOLDER = set("<>*${}")

# Leading three-digit number of an ADR file name, e.g. "012-" in "012-foo.md".
_ADR_FILE_RE = re.compile(r"(\d{3})-")


def walk_files():
    """Yield the path of every file under ROOT, skipping PRUNE directories.

    Directories and files are visited in sorted order so the scan output is
    reproducible regardless of filesystem enumeration order.
    """
    for dirpath, dirnames, filenames in os.walk(ROOT):
        # Slice assignment mutates the list os.walk iterates, which is what
        # prevents descent into the pruned directories.
        dirnames[:] = sorted(d for d in dirnames if d not in PRUNE)
        for f in sorted(filenames):
            yield os.path.join(dirpath, f)


def rel(path):
    """Return *path* relative to ROOT."""
    return os.path.relpath(path, ROOT)


def inventory():
    """Return the shard list: sorted entry names of the well-known directories.

    The result maps "roles", "adrs", "runbooks", "playbooks" and "scripts" to
    lists of entry names; a missing directory yields an empty list.
    """

    def listdir(*parts, want_dirs=False, suffixes=None):
        """List entries of ROOT/<parts>, optionally filtered.

        want_dirs keeps only directories; suffixes (a tuple of strings) keeps
        only names ending with one of them.
        """
        d = os.path.join(ROOT, *parts)
        if not os.path.isdir(d):
            return []
        out = []
        for e in sorted(os.listdir(d)):
            if want_dirs and not os.path.isdir(os.path.join(d, e)):
                continue
            if suffixes and not e.endswith(suffixes):
                continue
            out.append(e)
        return out

    return {
        "roles": listdir("roles", want_dirs=True),
        "adrs": listdir("docs", "decisions", suffixes=(".md",)),
        "runbooks": listdir("docs", "runbooks", suffixes=(".md",)),
        "playbooks": listdir("playbooks", suffixes=(".yml", ".yaml")),
        "scripts": listdir("scripts"),
    }


def adr_numbers():
    """Return the set of three-digit ADR numbers (as strings) in docs/decisions.

    A number is taken from any entry whose name starts with three digits and a
    hyphen. Returns an empty set when the directory does not exist.
    """
    dec = os.path.join(ROOT, "docs", "decisions")
    if not os.path.isdir(dec):
        return set()
    matches = (_ADR_FILE_RE.match(f) for f in os.listdir(dec))
    return {m.group(1) for m in matches if m}


def _is_own_report(rpath):
    """Return True if *rpath* is SKIP_PREFIX itself or lies beneath it.

    The match is on a path-component boundary so that siblings which merely
    share the prefix (for example "docs/reviews-old") are still scanned.
    """
    return rpath == SKIP_PREFIX or rpath.startswith(SKIP_PREFIX + os.sep)


def _vault_finding(path, rpath):
    """Return a finding if the vault file at *path* holds plaintext, else None.

    A file counts as plaintext when it does not start with the $ANSIBLE_VAULT
    header and has at least one line that is not blank, a comment, or "---".
    Unreadable files yield None.
    """
    try:
        with open(path, encoding="utf-8", errors="replace") as fh:
            text = fh.read()
    except OSError:
        return None
    if text.startswith("$ANSIBLE_VAULT"):
        return None
    real = [ln for ln in text.splitlines()
            if ln.strip() and not ln.lstrip().startswith("#") and ln.strip() != "---"]
    if not real:
        return None
    return {"check": "vault-unencrypted", "severity": "high", "path": rpath, "line": 1,
            "detail": "vault.yml is not ansible-vault encrypted but has content"}


def _line_findings(rpath, lineno, line, adrs):
    """Return the findings for one source line: marker, ADR refs, path refs."""
    found = []

    markers = sorted({m.group(1) for m in MARKER_RE.finditer(line)})
    if markers:
        found.append({"check": "marker", "severity": "low", "path": rpath, "line": lineno,
                      "detail": f"{'/'.join(markers)}: {line.strip()[:120]}"})

    for m in ADR_REF_RE.finditer(line):
        if m.group(1) not in adrs:
            found.append({"check": "broken-adr-ref", "severity": "medium",
                          "path": rpath, "line": lineno,
                          "detail": f"references ADR-{m.group(1)} (no such file)"})

    for m in PATH_REF_RE.finditer(line):
        # Skip paths that are part of a URL (e.g. a backend API endpoint): look
        # at the text between the last whitespace and the start of the match.
        token_start = max(line.rfind(" ", 0, m.start()), line.rfind("\t", 0, m.start())) + 1
        if "://" in line[token_start:m.start()]:
            continue
        # Trailing punctuation belongs to the surrounding prose, not the path.
        ref = m.group(0).rstrip(".,);:`'\"")
        if any(c in ref for c in PLACEHOLDER):
            continue
        if not os.path.exists(os.path.join(ROOT, ref)):
            found.append({"check": "broken-path-ref", "severity": "medium",
                          "path": rpath, "line": lineno,
                          "detail": f"references '{ref}' which does not exist"})
    return found


def scan():
    """Walk the repository and return the list of deterministic findings.

    Each finding is a dict with the keys "check", "severity", "path" (relative
    to ROOT), "line" (1-based) and "detail". Files under SKIP_PREFIX are
    ignored; vault.yml files are only checked for encryption and never scanned
    line by line; unreadable files are skipped silently.
    """
    findings = []
    adrs = adr_numbers()
    for path in walk_files():
        rpath = rel(path)
        if _is_own_report(rpath):
            continue

        if os.path.basename(path) == "vault.yml":
            finding = _vault_finding(path, rpath)
            if finding:
                findings.append(finding)
            continue

        if os.path.splitext(path)[1] not in SOURCE_EXTS:
            continue
        try:
            with open(path, encoding="utf-8", errors="replace") as fh:
                lines = fh.readlines()
        except OSError:
            continue
        for i, line in enumerate(lines, 1):
            findings.extend(_line_findings(rpath, i, line, adrs))
    return findings


def main():
    """Write the scan result as JSON to stdout and a one-line summary to stderr."""
    result = {"root": ROOT, "inventory": inventory(), "findings": scan()}
    json.dump(result, sys.stdout, indent=2)
    sys.stdout.write("\n")

    counts = {}
    for f in result["findings"]:
        counts[f["check"]] = counts.get(f["check"], 0) + 1
    summary = ", ".join(f"{k}={v}" for k, v in sorted(counts.items())) or "no deterministic findings"
    print(f"repo-scan: {len(result['inventory']['roles'])} roles, "
          f"{len(result['inventory']['adrs'])} ADRs; {summary}", file=sys.stderr)


if __name__ == "__main__":
    main()