boma/scripts/repo-scan.py

#!/usr/bin/env python3
"""
Phase-0 deterministic pre-scan for the /review-repo command.

Python standard library only. Emits a JSON object to stdout:
  - inventory: roles, adrs, runbooks, playbooks, scripts  (the shard list)
  - findings:  exact, no-judgement issues (markers, broken refs, unencrypted vaults)

The *judgement* review — contradictions, design-conformance, stale intent — is done
by the /review-repo fan-out reviewers, NOT here. This script only catches the cheap,
exact things so the reviewers can focus on what needs reasoning.

Usage:  python3 scripts/repo-scan.py [repo_root]  > scan.json
"""
import json
import os
import re
import sys

# Absolute path of the repository to scan: the first CLI argument when one is
# given, otherwise the current working directory.
ROOT = os.path.abspath(sys.argv[1]) if len(sys.argv) > 1 else os.path.abspath(".")

# Directory names the file walk never descends into.
PRUNE = {
    ".git",
    ".venv",
    ".collections",
    ".ansible",
    ".worktrees",
    ".pytest_cache",
    "node_modules",
    "__pycache__",
}
# Relative location of the generated review reports; excluded so the scanner
# does not report on its own output.
SKIP_PREFIX = os.path.join("docs", "reviews")
# File extensions that get the line-by-line checks.
SOURCE_EXTS = {
    ".yml",
    ".yaml",
    ".j2",
    ".py",
    ".sh",
    ".md",
    ".tf",
    ".cfg",
    ".ini",
}

# Marker words as whole words. The lookarounds reject a word that directly
# follows "(" or "|", or is directly followed by "|" or ")", so a regex
# alternation listing the words (such as this pattern) is not reported.
MARKER_RE = re.compile(r"(?<![(|])\b(TODO|FIXME|XXX|HACK)\b(?![|)])")
# Architecture-decision references; group 1 is the three-digit number.
ADR_REF_RE = re.compile(r"\bADR-(\d{3})\b")
# Repo-relative path references that start with one of the known top-level directories.
PATH_REF_RE = re.compile(r"(?:docs|scripts|roles|inventories|terraform|playbooks)/[\w./-]+")
# Characters that mark a matched path as a template or glob rather than a literal path.
PLACEHOLDER = {"<", ">", "*", "$", "{", "}"}


def walk_files():
    """Yield the joined path of every file below ROOT, skipping pruned directories."""
    for parent, subdirs, names in os.walk(ROOT):
        # Slice-assign in place: os.walk consults this same list to decide
        # which subdirectories to descend into next.
        subdirs[:] = [sub for sub in subdirs if sub not in PRUNE]
        yield from (os.path.join(parent, name) for name in names)


def rel(path):
    """Return *path* rewritten relative to ROOT."""
    return os.path.relpath(path, start=ROOT)


def inventory():
    """Return the shard inventory: sorted entry names for each reviewed area.

    The result maps "roles", "adrs", "runbooks", "playbooks" and "scripts" to
    a list of entry names (not full paths). An area whose directory does not
    exist maps to an empty list.
    """

    def entries(*parts, want_dirs=False, suffixes=None):
        """List sorted names in ROOT/<parts>, optionally filtered by kind and suffix."""
        base = os.path.join(ROOT, *parts)
        if not os.path.isdir(base):
            return []
        return [
            entry
            for entry in sorted(os.listdir(base))
            if (not want_dirs or os.path.isdir(os.path.join(base, entry)))
            and (not suffixes or entry.endswith(suffixes))
        ]

    markdown = (".md",)
    yaml_files = (".yml", ".yaml")
    return {
        "roles": entries("roles", want_dirs=True),
        "adrs": entries("docs", "decisions", suffixes=markdown),
        "runbooks": entries("docs", "runbooks", suffixes=markdown),
        "playbooks": entries("playbooks", suffixes=yaml_files),
        "scripts": entries("scripts"),
    }


def adr_numbers():
    """Return the set of three-digit prefixes of entries in the decisions directory.

    An entry contributes its number when its name starts with exactly three
    digits followed by a hyphen. The numbers are kept as strings. A missing
    directory yields an empty set.
    """
    decisions_dir = os.path.join(ROOT, "docs", "decisions")
    if not os.path.isdir(decisions_dir):
        return set()
    hits = (re.match(r"(\d{3})-", entry) for entry in os.listdir(decisions_dir))
    return {hit.group(1) for hit in hits if hit}


def scan():
    """Run the deterministic checks over every file below ROOT.

    Checks performed:
      - "vault-unencrypted" (high): a file named vault.yml that does not start
        with the ansible-vault header yet contains at least one line that is
        not blank, a comment, or a bare document separator.
      - "marker" (low): a line containing one or more marker words.
      - "broken-adr-ref" (medium): a decision reference whose number is not
        among the numbers returned by adr_numbers().
      - "broken-path-ref" (medium): a repo-relative path reference that does
        not exist under ROOT.

    Files inside the review-reports directory are skipped, as are files that
    cannot be opened. The last three checks only apply to files whose
    extension is in SOURCE_EXTS.

    Returns:
        list[dict]: one dict per finding with the keys "check", "severity",
        "path" (relative to ROOT), "line" (1-based) and "detail", in the
        order the files were walked.
    """
    findings = []
    adrs = adr_numbers()
    # Compare against the reports directory as a whole path component; a bare
    # string-prefix test would also skip siblings whose name merely begins
    # with the same characters.
    skip_dir = SKIP_PREFIX + os.sep
    for path in walk_files():
        rpath = rel(path)
        if rpath == SKIP_PREFIX or rpath.startswith(skip_dir):
            continue
        name = os.path.basename(path)

        if name == "vault.yml":
            try:
                # Context manager so the handle is closed promptly instead of
                # being left to the garbage collector.
                with open(path, encoding="utf-8", errors="replace") as fh:
                    text = fh.read()
            except OSError:
                continue
            if not text.startswith("$ANSIBLE_VAULT"):
                # An unencrypted vault only matters if it actually holds data.
                real = [ln for ln in text.splitlines()
                        if ln.strip() and not ln.lstrip().startswith("#") and ln.strip() != "---"]
                if real:
                    findings.append({"check": "vault-unencrypted", "severity": "high",
                                     "path": rpath, "line": 1,
                                     "detail": "vault.yml is not ansible-vault encrypted but has content"})
            # Vault files never get the line-by-line checks below.
            continue

        if os.path.splitext(path)[1] not in SOURCE_EXTS:
            continue
        try:
            with open(path, encoding="utf-8", errors="replace") as fh:
                lines = fh.readlines()
        except OSError:
            continue

        for i, line in enumerate(lines, 1):
            # One finding per line, listing each distinct marker word once.
            markers = sorted(set(m.group(1) for m in MARKER_RE.finditer(line)))
            if markers:
                findings.append({"check": "marker", "severity": "low", "path": rpath,
                                 "line": i, "detail": f"{'/'.join(markers)}: {line.strip()[:120]}"})
            for m in ADR_REF_RE.finditer(line):
                if m.group(1) not in adrs:
                    findings.append({"check": "broken-adr-ref", "severity": "medium", "path": rpath,
                                     "line": i, "detail": f"references ADR-{m.group(1)} (no such file)"})
            for m in PATH_REF_RE.finditer(line):
                # Skip paths that are part of a URL (e.g. a backend API endpoint).
                # token_prefix is the text between the last space/tab before
                # the match and the match itself.
                token_prefix = line[max(line.rfind(" ", 0, m.start()),
                                        line.rfind("\t", 0, m.start())) + 1:m.start()]
                if "://" in token_prefix:
                    continue
                # Drop trailing punctuation the regex swallowed from the prose.
                ref = m.group(0).rstrip(".,);:`'\"")
                # Templated or globbed paths cannot be checked on disk.
                if any(c in ref for c in PLACEHOLDER):
                    continue
                if not os.path.exists(os.path.join(ROOT, ref)):
                    findings.append({"check": "broken-path-ref", "severity": "medium", "path": rpath,
                                     "line": i, "detail": f"references '{ref}' which does not exist"})
    return findings


def main():
    """Write the scan result as JSON to stdout and a one-line summary to stderr."""
    shards = inventory()
    found = scan()
    result = {"root": ROOT, "inventory": shards, "findings": found}
    json.dump(result, sys.stdout, indent=2)
    sys.stdout.write("\n")

    # Tally findings per check name for the human-readable summary.
    per_check = {}
    for finding in found:
        check = finding["check"]
        per_check[check] = per_check.get(check, 0) + 1
    parts = [f"{check}={total}" for check, total in sorted(per_check.items())]
    summary = ", ".join(parts) if parts else "no deterministic findings"

    role_count = len(shards["roles"])
    adr_count = len(shards["adrs"])
    print(f"repo-scan: {role_count} roles, {adr_count} ADRs; {summary}", file=sys.stderr)


# Run the scan only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
Add /review-repo command with deterministic pre-scan and reviews store New on-demand repo audit: scripts/repo-scan.py does the cheap deterministic checks (markers, broken refs, unencrypted vaults) and inventory; the command fans out judgement reviewers across four dimensions, applies only safe/obvious fixes, and writes a tracked report to docs/reviews/. Cron + email deferred. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> 2026-05-30 18:56:01 +02:00			`#!/usr/bin/env python3`
			`"""`
			`Phase-0 deterministic pre-scan for the /review-repo command.`

			`Python standard library only. Emits a JSON object to stdout:`
			`- inventory: roles, adrs, runbooks, playbooks, scripts (the shard list)`
			`- findings: exact, no-judgement issues (markers, broken refs, unencrypted vaults)`

			`The judgement review — contradictions, design-conformance, stale intent — is done`
			`by the /review-repo fan-out reviewers, NOT here. This script only catches the cheap,`
			`exact things so the reviewers can focus on what needs reasoning.`

			`Usage: python3 scripts/repo-scan.py [repo_root] > scan.json`
			`"""`
			`import json`
			`import os`
			`import re`
			`import sys`

			`ROOT = os.path.abspath(sys.argv[1] if len(sys.argv) > 1 else ".")`

			`PRUNE = {".git", ".venv", ".collections", ".ansible", ".worktrees",`
			`".pytest_cache", "node_modules", "__pycache__"}`
			`SKIP_PREFIX = os.path.join("docs", "reviews") # don't scan our own reports`
			`SOURCE_EXTS = {".yml", ".yaml", ".j2", ".py", ".sh", ".md", ".tf", ".cfg", ".ini"}`

			`MARKER_RE = re.compile(r"(?<![(\|])\b(TODO\|FIXME\|XXX\|HACK)\b(?![\|)])")`
			`ADR_REF_RE = re.compile(r"\bADR-(\d{3})\b")`
			`PATH_REF_RE = re.compile(r"(?:docs\|scripts\|roles\|inventories\|terraform\|playbooks)/[\w./-]+")`
			`PLACEHOLDER = set("<>*${}")`


			`def walk_files():`
			`for dirpath, dirnames, filenames in os.walk(ROOT):`
			`dirnames[:] = [d for d in dirnames if d not in PRUNE]`
			`for f in filenames:`
			`yield os.path.join(dirpath, f)`


			`def rel(path):`
			`return os.path.relpath(path, ROOT)`


			`def inventory():`
			`def listdir(*parts, want_dirs=False, suffixes=None):`
			`d = os.path.join(ROOT, *parts)`
			`if not os.path.isdir(d):`
			`return []`
			`out = []`
			`for e in sorted(os.listdir(d)):`
			`full = os.path.join(d, e)`
			`if want_dirs and not os.path.isdir(full):`
			`continue`
			`if suffixes and not e.endswith(suffixes):`
			`continue`
			`out.append(e)`
			`return out`

			`return {`
			`"roles": listdir("roles", want_dirs=True),`
			`"adrs": listdir("docs", "decisions", suffixes=(".md",)),`
			`"runbooks": listdir("docs", "runbooks", suffixes=(".md",)),`
			`"playbooks": listdir("playbooks", suffixes=(".yml", ".yaml")),`
			`"scripts": listdir("scripts"),`
			`}`


			`def adr_numbers():`
			`dec = os.path.join(ROOT, "docs", "decisions")`
			`nums = set()`
			`if os.path.isdir(dec):`
			`for f in os.listdir(dec):`
			`m = re.match(r"(\d{3})-", f)`
			`if m:`
			`nums.add(m.group(1))`
			`return nums`


			`def scan():`
			`findings = []`
			`adrs = adr_numbers()`
			`for path in walk_files():`
			`rpath = rel(path)`
			`if rpath.startswith(SKIP_PREFIX):`
			`continue`
			`name = os.path.basename(path)`

			`if name == "vault.yml":`
			`try:`
			`text = open(path, encoding="utf-8", errors="replace").read()`
			`except OSError:`
			`continue`
			`if not text.startswith("$ANSIBLE_VAULT"):`
			`real = [ln for ln in text.splitlines()`
			`if ln.strip() and not ln.lstrip().startswith("#") and ln.strip() != "---"]`
			`if real:`
			`findings.append({"check": "vault-unencrypted", "severity": "high",`
			`"path": rpath, "line": 1,`
			`"detail": "vault.yml is not ansible-vault encrypted but has content"})`
			`continue`

			`if os.path.splitext(path)[1] not in SOURCE_EXTS:`
			`continue`
			`try:`
			`lines = open(path, encoding="utf-8", errors="replace").readlines()`
			`except OSError:`
			`continue`

			`for i, line in enumerate(lines, 1):`
			`markers = sorted(set(m.group(1) for m in MARKER_RE.finditer(line)))`
			`if markers:`
			`findings.append({"check": "marker", "severity": "low", "path": rpath,`
			`"line": i, "detail": f"{'/'.join(markers)}: {line.strip()[:120]}"})`
			`for m in ADR_REF_RE.finditer(line):`
			`if m.group(1) not in adrs:`
			`findings.append({"check": "broken-adr-ref", "severity": "medium", "path": rpath,`
			`"line": i, "detail": f"references ADR-{m.group(1)} (no such file)"})`
			`for m in PATH_REF_RE.finditer(line):`
			`# Skip paths that are part of a URL (e.g. a backend API endpoint).`
			`token_prefix = line[max(line.rfind(" ", 0, m.start()),`
			`line.rfind("\t", 0, m.start())) + 1:m.start()]`
			`if "://" in token_prefix:`
			`continue`
			ref = m.group(0).rstrip(".,);:`'\"")
			`if any(c in ref for c in PLACEHOLDER):`
			`continue`
			`if not os.path.exists(os.path.join(ROOT, ref)):`
			`findings.append({"check": "broken-path-ref", "severity": "medium", "path": rpath,`
			`"line": i, "detail": f"references '{ref}' which does not exist"})`
			`return findings`


			`def main():`
			`result = {"root": ROOT, "inventory": inventory(), "findings": scan()}`
			`json.dump(result, sys.stdout, indent=2)`
			`sys.stdout.write("\n")`
			`counts = {}`
			`for f in result["findings"]:`
			`counts[f["check"]] = counts.get(f["check"], 0) + 1`
			`summary = ", ".join(f"{k}={v}" for k, v in sorted(counts.items())) or "no deterministic findings"`
			`print(f"repo-scan: {len(result['inventory']['roles'])} roles, "`
			`f"{len(result['inventory']['adrs'])} ADRs; {summary}", file=sys.stderr)`


			`if __name__ == "__main__":`
			`main()`