#!/usr/bin/env python3
"""
Phase-0 deterministic pre-scan for the /review-repo command. Python standard library only.

Emits a JSON object to stdout:
  - inventory: roles, adrs, runbooks, playbooks, scripts (the shard list)
  - findings:  exact, no-judgement issues (markers, broken refs, unencrypted vaults)

The *judgement* review — contradictions, design-conformance, stale intent — is done
by the /review-repo fan-out reviewers, NOT here. This script only catches the cheap,
exact things so the reviewers can focus on what needs reasoning.

Usage:  python3 scripts/repo-scan.py [repo_root] > scan.json
"""
import json
import os
import re
import sys

ROOT = os.path.abspath(sys.argv[1] if len(sys.argv) > 1 else ".")

PRUNE = {".git", ".venv", ".collections", ".ansible", ".worktrees",
         ".pytest_cache", "node_modules", "__pycache__"}
SKIP_PREFIX = os.path.join("docs", "reviews")  # don't scan our own reports
SOURCE_EXTS = {".yml", ".yaml", ".j2", ".py", ".sh", ".md", ".tf", ".cfg", ".ini"}

# Marker words, but NOT when part of a regex alternation `(TODO|...)`, a filename
# like `TODO.md` / `docs/TODO.md`, or a numbered backlog reference like `TODO 8.2`
# / `TODO item 16` / `TODO #3` (those point at the backlog, they are not code markers).
# NOTE(review): the original pattern was lost to extraction damage; this one is
# rebuilt from the comment above. The exact word list is an assumption — confirm
# against version control before relying on it.
MARKER_RE = re.compile(
    r"(?<![\w(|/])"                          # not inside a word, alternation or path
    r"\b(TODO|FIXME|XXX|HACK)\b"
    r"(?![|)]|\.md\b|\s+(?:item\s+)?#?\d)")  # not `X|`, `X)`, `X.md`, `X 8`, `X item 8`, `X #8`

# A reference to a numbered ADR; group(1) is the three-digit number.
# NOTE(review): reconstructed from its use in scan() — confirm.
ADR_REF_RE = re.compile(r"\bADR-(\d{3})\b")

# A repo-relative path under one of the top-level directories this scanner knows.
# The character class deliberately excludes placeholder characters so the match
# stops right before `<x>` / `${x}` (scan() relies on that).
# NOTE(review): reconstructed — the set of top-level directories is an assumption.
PATH_REF_RE = re.compile(r"(?<![\w./-])(?:roles|playbooks|scripts|docs)/[\w./-]+")

# Characters that mark a templated (not literal) path. Must be a set, not a str:
# scan() tests a possibly-empty slice for membership, and "" is "in" every str.
# NOTE(review): reconstructed from the surviving fragment `*${}")` — confirm.
PLACEHOLDER = set("<>*${}")

# Stale-deferred detection: ADR "Deferred/Open" entries that another file describes
# as resolved, but which aren't marked resolved in place. (See docs/FRICTION.md.)
RESOLVE_MARK_RE = re.compile(r"\b(?:RESOLVED|DECIDED)\b", re.I)
LIST_ITEM_RE = re.compile(r"^\s*(\d+\.|[-*+])\s+(.*)")
# An external "this resolves ADR-NNN deferred #K" style reference.
DEFER_REF_RE = re.compile(r"ADR-(\d{3})\D{0,40}?deferred\D{0,12}?(\d+)", re.I)
RESOLVE_WORD_RE = re.compile(r"\b(?:resolv\w*|decid\w*|address\w*|complet\w*|done)\b", re.I)

# ADR-structure check (ADR-023): numbered ADRs must carry the four mandatory
# sections and a parseable Status line. Presence only — section ORDER is a
# template-demonstrated convention, not machine-enforced.
ADR_FILE_RE = re.compile(r"^\d{3}-.*\.md$")
ADR_REQUIRED_SECTIONS = ("Status", "Context", "Decision", "Consequences")
ADR_STATUS_LINE_RE = re.compile(
    r"^(Proposed \(\d{4}-\d{2}-\d{2}\)"
    r"|Accepted \(\d{4}-\d{2}-\d{2}\)"
    r"|Superseded by ADR-\d{3}"
    r"|Deprecated \(\d{4}-\d{2}-\d{2}\))")

# Per-line helper patterns, compiled once instead of on every scanned line.
_HEADING_RE = re.compile(r"#{1,6}\s+(.*)")
_H2_WORD_RE = re.compile(r"^##\s+(\w+)")
_ADR_NUM_RE = re.compile(r"(\d{3})-")
_BACKTICK_RE = re.compile(r"`([^`]+)`")
_LINK_TARGET_RE = re.compile(r"\]\(([^)]+)\)")
_DATE_TEMPLATE_RE = re.compile(r"YYYY|MM-DD")
_REVIEWS_DIR_RE = re.compile(r"(?:^|/)reviews/")


def _is_defer_heading(text):
    """Return True when a heading's text introduces a deferred/open-items section."""
    t = text.strip().lower()
    return (t.startswith(("deferred", "unresolved"))
            or "open question" in t
            or "open issue" in t)


def _defer_subject(item_text):
    """Return a short subject for a deferred list item.

    Prefers the first **bold** span; otherwise takes the text before the first
    spaced dash or colon. Whitespace is collapsed and decoration is trimmed.
    """
    bold = re.search(r"\*\*(.+?)\*\*", item_text)
    if bold:
        subject = bold.group(1)
    else:
        subject = re.split(r"\s+[—–-]\s+|:", item_text, maxsplit=1)[0]
    return re.sub(r"\s+", " ", subject).strip(" *_`~—–-:.")


def deferred_findings(adr_files, defer_refs):
    """adr_files: {rel_path: [lines]} for docs/decisions/*.md.
    defer_refs: [(adr, ordinal, path, line, has_resolve_word)] gathered repo-wide.
    Emits one informational `open-deferred-item` per open entry, and a
    `stale-deferred` contradiction when another file describes that entry as resolved."""
    out = []
    for rpath, lines in sorted(adr_files.items()):
        base = os.path.basename(rpath)
        madr = _ADR_NUM_RE.match(base)
        adr_num = madr.group(1) if madr else None
        # Non-numbered files (templates, indexes) have no ADR number; name the
        # file instead of printing "ADR-None".
        label = f"ADR-{adr_num}" if adr_num else base
        in_defer = False
        for i, raw in enumerate(lines, 1):
            hm = _HEADING_RE.match(raw)
            if hm:
                # Every heading (any level) opens or closes the deferred region.
                in_defer = _is_defer_heading(hm.group(1))
                continue
            if not in_defer:
                continue
            im = LIST_ITEM_RE.match(raw)
            if not im:
                continue
            marker, item_text = im.group(1), im.group(2)
            # self-marked resolved (inline RESOLVED/DECIDED or ~~strikethrough~~) → fine
            if RESOLVE_MARK_RE.search(raw) or item_text.lstrip().startswith("~~"):
                continue
            # "3." → 3; bullet markers ("-", "*", "+") have no ordinal.
            digits = marker[:-1]
            ordinal = int(digits) if digits.isdigit() else None
            subject = _defer_subject(item_text)
            tag = f" #{ordinal}" if ordinal else ""
            out.append({"check": "open-deferred-item", "severity": "low",
                        "path": rpath, "line": i,
                        "detail": f"open deferred item{tag} in {label}: "
                                  f"'{subject[:80]}' — confirm not resolved by a later ADR/STATUS"})
            if not (adr_num and ordinal):
                continue
            for ra, rk, rp, rl, has_res in defer_refs:
                # Only references from OTHER files that use a "resolved"-style word count.
                if ra == adr_num and rk == ordinal and rp != rpath and has_res:
                    out.append({"check": "stale-deferred", "severity": "medium",
                                "path": rpath, "line": i,
                                "detail": f"ADR-{adr_num} deferred #{ordinal} "
                                          f"('{subject[:60]}') is described as resolved at "
                                          f"{rp}:{rl}, but is not marked RESOLVED in place"})
    return out


def adr_structure_findings(adr_files):
    """adr_files: {rel_path: [lines]} for docs/decisions/*.md. Flags numbered ADRs
    (NNN-*.md) missing a mandatory section or whose Status section has no parseable
    lifecycle line. Non-numbered files (e.g. adr-template.md) are skipped.
    Section order is NOT checked (ADR-023)."""
    out = []
    for rpath, lines in sorted(adr_files.items()):
        if not ADR_FILE_RE.match(os.path.basename(rpath)):
            continue
        # First word of each level-2 heading → 0-based index of its first occurrence.
        headings = {}
        for idx, line in enumerate(lines):
            m = _H2_WORD_RE.match(line)
            if m:
                headings.setdefault(m.group(1), idx)
        missing = [s for s in ADR_REQUIRED_SECTIONS if s not in headings]
        if missing:
            out.append({"check": "adr-structure", "severity": "medium",
                        "path": rpath, "line": 1,
                        "detail": f"missing mandatory section(s): {', '.join(missing)}"})
        if "Status" not in headings:
            continue
        status_at = headings["Status"]
        # The status is the first non-blank line before the next level-2 heading.
        status_text = ""
        for line in lines[status_at + 1:]:
            if line.startswith("## "):
                break
            if line.strip():
                status_text = line.strip()
                break
        if not ADR_STATUS_LINE_RE.match(status_text):
            out.append({"check": "adr-structure", "severity": "medium",
                        "path": rpath, "line": status_at + 1,
                        "detail": "Status not parseable (want 'Proposed (YYYY-MM-DD)', "
                                  "'Accepted (YYYY-MM-DD)', 'Superseded by ADR-NNN', or "
                                  "'Deprecated (YYYY-MM-DD)'); "
                                  f"got: {status_text[:60]!r}"})
    return out


def walk_files():
    """Yield the path of every file under ROOT, skipping PRUNE directories.

    Directory and file names are visited in sorted order: os.walk's native order
    depends on the filesystem, and this scan's output is meant to be deterministic.
    """
    for dirpath, dirnames, filenames in os.walk(ROOT):
        # In-place assignment so os.walk both prunes and follows the sorted order.
        dirnames[:] = sorted(d for d in dirnames if d not in PRUNE)
        for f in sorted(filenames):
            yield os.path.join(dirpath, f)


def rel(path):
    """Return `path` relative to ROOT."""
    return os.path.relpath(path, ROOT)


def inventory():
    """Return the shard list: sorted entry names of the well-known repo directories.

    A directory that does not exist contributes an empty list.
    """
    def listdir(*parts, want_dirs=False, suffixes=None):
        d = os.path.join(ROOT, *parts)
        if not os.path.isdir(d):
            return []
        out = []
        for e in sorted(os.listdir(d)):
            if want_dirs and not os.path.isdir(os.path.join(d, e)):
                continue
            if suffixes and not e.endswith(suffixes):
                continue
            out.append(e)
        return out

    return {
        "roles": listdir("roles", want_dirs=True),
        "adrs": listdir("docs", "decisions", suffixes=(".md",)),
        "runbooks": listdir("docs", "runbooks", suffixes=(".md",)),
        "playbooks": listdir("playbooks", suffixes=(".yml", ".yaml")),
        "scripts": listdir("scripts"),
    }


def adr_numbers():
    """Return the set of three-digit ADR numbers (as strings) in docs/decisions."""
    dec = os.path.join(ROOT, "docs", "decisions")
    if not os.path.isdir(dec):
        return set()
    matches = (_ADR_NUM_RE.match(f) for f in os.listdir(dec))
    return {m.group(1) for m in matches if m}


def _is_under(rpath, prefix):
    """True when `rpath` is `prefix` or lies beneath it, compared by whole path
    components (so `docs/reviews-old/x.md` is NOT under `docs/reviews`)."""
    return rpath == prefix or rpath.startswith(prefix + os.sep)


def _vault_findings(path, rpath):
    """Return a `vault-unencrypted` finding if this vault.yml holds plaintext content."""
    try:
        with open(path, encoding="utf-8", errors="replace") as fh:
            text = fh.read()
    except OSError:
        return []
    if text.startswith("$ANSIBLE_VAULT"):
        return []
    # Comments, blank lines and a bare document marker do not count as content.
    real = [ln for ln in text.splitlines()
            if ln.strip() and not ln.lstrip().startswith("#") and ln.strip() != "---"]
    if not real:
        return []
    return [{"check": "vault-unencrypted", "severity": "high",
             "path": rpath, "line": 1,
             "detail": "vault.yml is not ansible-vault encrypted but has content"}]


def _path_ref_findings(rpath, lineno, line):
    """Return `broken-path-ref` findings for path-like references on one line."""
    out = []
    # Only check path-like references that appear inside backticks or a
    # markdown link target — bare prose ("roles/docs") is not a real path.
    for cand in _BACKTICK_RE.findall(line) + _LINK_TARGET_RE.findall(line):
        if "://" in cand:  # skip URLs
            continue
        pm = PATH_REF_RE.search(cand)
        if not pm:
            continue
        ref = pm.group(0).rstrip(".,);:`'\"")
        if any(c in ref for c in PLACEHOLDER):
            continue
        # Skip template / generated-report paths — not real broken refs:
        #  - a placeholder (<x>, ${x}) immediately follows the matched path
        #    (the regex stops at the placeholder, so it isn't caught above)
        #  - a date-template token (YYYY-MM-DD) appears in the path
        #  - the path is under a generated-report `reviews/` directory
        if (cand[pm.end():pm.end() + 1] in PLACEHOLDER
                or _DATE_TEMPLATE_RE.search(ref)
                or _REVIEWS_DIR_RE.search(ref)):
            continue
        if not os.path.exists(os.path.join(ROOT, ref)):
            out.append({"check": "broken-path-ref", "severity": "medium",
                        "path": rpath, "line": lineno,
                        "detail": f"references '{ref}' which does not exist"})
    return out


def scan():
    """Walk the repo and return the list of deterministic findings.

    Each finding is a dict with keys check / severity / path / line / detail.
    Unreadable files are skipped rather than aborting the scan.
    """
    findings = []
    adrs = adr_numbers()
    adr_files = {}   # docs/decisions/*.md → lines, for deferred-section parsing
    defer_refs = []  # repo-wide "resolves ADR-NNN deferred #K" references
    decisions_dir = os.path.join("docs", "decisions")

    for path in walk_files():
        rpath = rel(path)
        if _is_under(rpath, SKIP_PREFIX):
            continue

        if os.path.basename(path) == "vault.yml":
            findings.extend(_vault_findings(path, rpath))
            continue

        if os.path.splitext(path)[1] not in SOURCE_EXTS:
            continue
        try:
            with open(path, encoding="utf-8", errors="replace") as fh:
                lines = fh.readlines()
        except OSError:
            continue

        if _is_under(rpath, decisions_dir) and rpath.endswith(".md"):
            adr_files[rpath] = lines

        for i, line in enumerate(lines, 1):
            for m in DEFER_REF_RE.finditer(line):
                defer_refs.append((m.group(1), int(m.group(2)), rpath, i,
                                   bool(RESOLVE_WORD_RE.search(line))))

            # One finding per line, listing each distinct marker word once.
            markers = sorted({m.group(1) for m in MARKER_RE.finditer(line)})
            if markers:
                findings.append({"check": "marker", "severity": "low",
                                 "path": rpath, "line": i,
                                 "detail": f"{'/'.join(markers)}: {line.strip()[:120]}"})

            for m in ADR_REF_RE.finditer(line):
                if m.group(1) not in adrs:
                    findings.append({"check": "broken-adr-ref", "severity": "medium",
                                     "path": rpath, "line": i,
                                     "detail": f"references ADR-{m.group(1)} (no such file)"})

            findings.extend(_path_ref_findings(rpath, i, line))

    findings.extend(deferred_findings(adr_files, defer_refs))
    findings.extend(adr_structure_findings(adr_files))
    return findings


def main():
    """Write the scan result as JSON to stdout and a one-line summary to stderr."""
    result = {"root": ROOT, "inventory": inventory(), "findings": scan()}
    json.dump(result, sys.stdout, indent=2)
    sys.stdout.write("\n")
    counts = {}
    for f in result["findings"]:
        counts[f["check"]] = counts.get(f["check"], 0) + 1
    summary = ", ".join(f"{k}={v}" for k, v in sorted(counts.items())) or "no deterministic findings"
    print(f"repo-scan: {len(result['inventory']['roles'])} roles, "
          f"{len(result['inventory']['adrs'])} ADRs; {summary}", file=sys.stderr)


if __name__ == "__main__":
    main()