First /review-repo run on boma. Hardened repo-scan.py (no TODO.md/prose false positives). Applied 7 safe fixes (DNS staleness x2, STATUS factual correction, hosts.yml path generalisation, trunk-based wording x2, scripts/README). Recorded the run and 17 open findings in docs/reviews/2026-05-30-*. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
150 lines
5.7 KiB
Python
150 lines
5.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Phase-0 deterministic pre-scan for the /review-repo command.

Python standard library only. Emits a JSON object to stdout:
  - inventory: roles, adrs, runbooks, playbooks, scripts (the shard list)
  - findings:  exact, no-judgement issues (markers, broken refs, unencrypted vaults)

The *judgement* review — contradictions, design-conformance, stale intent — is done
by the /review-repo fan-out reviewers, NOT here. This script only catches the cheap,
exact things so the reviewers can focus on what needs reasoning.

Usage: python3 scripts/repo-scan.py [repo_root] > scan.json
|
|
"""
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
|
|
# Absolute path of the repository to scan: the first CLI argument when one is
# given, otherwise the current working directory.
ROOT = os.path.abspath(sys.argv[1]) if len(sys.argv) > 1 else os.path.abspath(".")

# Directory names the file walk never descends into.
PRUNE = {
    ".git",
    ".venv",
    ".collections",
    ".ansible",
    ".worktrees",
    ".pytest_cache",
    "node_modules",
    "__pycache__",
}

# Relative location of previous review reports; don't scan our own reports.
SKIP_PREFIX = os.path.join("docs", "reviews")

# File extensions whose contents are scanned line by line.
SOURCE_EXTS = {
    ".yml",
    ".yaml",
    ".j2",
    ".py",
    ".sh",
    ".md",
    ".tf",
    ".cfg",
    ".ini",
}

# Marker words, but NOT when part of a regex alternation `(TODO|...)` or a
# filename like `TODO.md` / `docs/TODO.md`.
MARKER_RE = re.compile(r"(?<![(|/])\b(TODO|FIXME|XXX|HACK)\b(?![|)]|\.\w)")

# "ADR-" followed by exactly three digits; group 1 is the number.
ADR_REF_RE = re.compile(r"\bADR-(\d{3})\b")

# A path-like token rooted at one of the repository's top-level directories.
PATH_REF_RE = re.compile(r"(?:docs|scripts|roles|inventories|terraform|playbooks)/[\w./-]+")

# Characters that mark a reference as a template/glob rather than a literal path.
PLACEHOLDER = {"<", ">", "*", "$", "{", "}"}
|
|
|
|
|
|
def walk_files():
    """Yield the full path of every file below ROOT, skipping PRUNE directories."""
    for folder, subdirs, names in os.walk(ROOT):
        # Slice-assign in place so os.walk itself stops descending into
        # the pruned directories.
        kept = [sub for sub in subdirs if sub not in PRUNE]
        subdirs[:] = kept
        yield from (os.path.join(folder, name) for name in names)
|
|
|
|
|
|
def rel(path):
    """Return *path* expressed relative to ROOT."""
    relative = os.path.relpath(path, start=ROOT)
    return relative
|
|
|
|
|
|
def inventory():
    """Build the shard inventory of the repository.

    Returns:
        dict: keys ``roles``, ``adrs``, ``runbooks``, ``playbooks`` and
        ``scripts``, each mapped to a sorted list of entry names found in
        the corresponding directory under ROOT. A directory that does not
        exist yields an empty list.
    """

    def entries(*segments, want_dirs=False, suffixes=None):
        # Sorted names inside ROOT/<segments>, optionally restricted to
        # sub-directories and/or to names ending with one of `suffixes`.
        base = os.path.join(ROOT, *segments)
        if not os.path.isdir(base):
            return []
        return [
            entry
            for entry in sorted(os.listdir(base))
            if (not want_dirs or os.path.isdir(os.path.join(base, entry)))
            and (not suffixes or entry.endswith(suffixes))
        ]

    markdown = (".md",)
    shards = {}
    shards["roles"] = entries("roles", want_dirs=True)
    shards["adrs"] = entries("docs", "decisions", suffixes=markdown)
    shards["runbooks"] = entries("docs", "runbooks", suffixes=markdown)
    shards["playbooks"] = entries("playbooks", suffixes=(".yml", ".yaml"))
    shards["scripts"] = entries("scripts")
    return shards
|
|
|
|
|
|
def adr_numbers():
    """Return the set of three-digit ADR numbers present in docs/decisions.

    A number is taken from any entry whose name starts with three digits
    followed by a hyphen. The numbers are returned as strings (e.g. ``"007"``).
    An empty set is returned when the directory does not exist.
    """
    decisions_dir = os.path.join(ROOT, "docs", "decisions")
    if not os.path.isdir(decisions_dir):
        return set()
    hits = (re.match(r"(\d{3})-", fname) for fname in os.listdir(decisions_dir))
    return {hit.group(1) for hit in hits if hit}
|
|
|
|
|
|
# Spans in which a path reference is considered intentional: `inline code`
# and markdown link targets "](...)". Compiled once instead of per line.
_BACKTICK_RE = re.compile(r"`([^`]+)`")
_LINK_TARGET_RE = re.compile(r"\]\(([^)]+)\)")


def _vault_finding(rpath, text):
    """Return a vault-unencrypted finding for *text*, or None if it is fine."""
    if text.startswith("$ANSIBLE_VAULT"):
        return None
    for ln in text.splitlines():
        stripped = ln.strip()
        # Blank lines, comments and a bare document marker are not content.
        if stripped and not stripped.startswith("#") and stripped != "---":
            return {"check": "vault-unencrypted", "severity": "high",
                    "path": rpath, "line": 1,
                    "detail": "vault.yml is not ansible-vault encrypted but has content"}
    return None


def _path_ref_in(cand):
    """Return the first literal path reference in *cand*, or None.

    The placeholder test is applied to the whole whitespace-delimited token
    that contains the match. PATH_REF_RE can only match word characters,
    ``.``, ``/`` and ``-``, so testing the matched text itself could never
    see a placeholder and templated/globbed paths (``docs/x-*``,
    ``roles/foo/{{ item }}``) were reported as broken after being truncated.
    """
    for token in cand.split():
        pm = PATH_REF_RE.search(token)
        if pm is None:
            continue
        if any(c in token for c in PLACEHOLDER):
            return None
        return pm.group(0).rstrip(".,);:`'\"")
    return None


def _line_findings(rpath, lineno, line, adrs):
    """Return the marker / ADR / path findings for a single source line."""
    found = []

    markers = sorted({m.group(1) for m in MARKER_RE.finditer(line)})
    if markers:
        found.append({"check": "marker", "severity": "low", "path": rpath,
                      "line": lineno, "detail": f"{'/'.join(markers)}: {line.strip()[:120]}"})

    for m in ADR_REF_RE.finditer(line):
        if m.group(1) not in adrs:
            found.append({"check": "broken-adr-ref", "severity": "medium", "path": rpath,
                          "line": lineno, "detail": f"references ADR-{m.group(1)} (no such file)"})

    # Only check path-like references that appear inside backticks or a
    # markdown link target — bare prose ("roles/docs") is not a real path.
    for cand in _BACKTICK_RE.findall(line) + _LINK_TARGET_RE.findall(line):
        if "://" in cand:  # skip URLs
            continue
        ref = _path_ref_in(cand)
        if ref is not None and not os.path.exists(os.path.join(ROOT, ref)):
            found.append({"check": "broken-path-ref", "severity": "medium", "path": rpath,
                          "line": lineno, "detail": f"references '{ref}' which does not exist"})
    return found


def scan():
    """Run every deterministic check over the repository and return the findings.

    Each file yielded by walk_files() is checked as follows:

    - ``vault-unencrypted``: a ``vault.yml`` that does not start with
      ``$ANSIBLE_VAULT`` yet has at least one line that is not blank, a ``#``
      comment or a bare ``---``.
    - ``marker``: a TODO/FIXME/XXX/HACK word matched by MARKER_RE.
    - ``broken-adr-ref``: an ``ADR-NNN`` mention whose number is not in
      adr_numbers().
    - ``broken-path-ref``: a path inside backticks or a markdown link target
      that does not exist under ROOT.

    Files inside the SKIP_PREFIX directory are ignored, only files with a
    SOURCE_EXTS extension get the line checks, and files that cannot be
    opened are skipped silently (best effort).

    Returns:
        list[dict]: one dict per finding with the keys ``check``,
        ``severity``, ``path`` (relative to ROOT), ``line`` (1-based) and
        ``detail``.
    """
    findings = []
    adrs = adr_numbers()
    # Match on the directory boundary so siblings such as "docs/reviews.md"
    # or "docs/reviews-old/" are still scanned.
    skip_dir = SKIP_PREFIX + os.sep

    for path in walk_files():
        rpath = rel(path)
        if rpath.startswith(skip_dir):
            continue

        if os.path.basename(path) == "vault.yml":
            try:
                with open(path, encoding="utf-8", errors="replace") as fh:
                    text = fh.read()
            except OSError:
                continue
            finding = _vault_finding(rpath, text)
            if finding is not None:
                findings.append(finding)
            continue

        if os.path.splitext(path)[1] not in SOURCE_EXTS:
            continue
        try:
            with open(path, encoding="utf-8", errors="replace") as fh:
                lines = fh.readlines()
        except OSError:
            continue

        for lineno, line in enumerate(lines, 1):
            findings.extend(_line_findings(rpath, lineno, line, adrs))
    return findings
|
|
|
|
|
|
def main():
    """Write the scan result as JSON to stdout and a one-line summary to stderr."""
    shards = inventory()
    issues = scan()
    report = {"root": ROOT, "inventory": shards, "findings": issues}
    json.dump(report, sys.stdout, indent=2)
    sys.stdout.write("\n")

    # Tally findings per check name for the human-readable summary.
    tally = {}
    for issue in issues:
        kind = issue["check"]
        tally[kind] = tally.get(kind, 0) + 1
    parts = [f"{kind}={total}" for kind, total in sorted(tally.items())]
    summary = ", ".join(parts) if parts else "no deterministic findings"

    role_count = len(shards["roles"])
    adr_count = len(shards["adrs"])
    # The summary goes to stderr so stdout stays pure JSON.
    print(f"repo-scan: {role_count} roles, {adr_count} ADRs; {summary}", file=sys.stderr)


if __name__ == "__main__":
    main()
|