repo-scan: cut broken-path-ref + marker false positives

- broken-path-ref: skip template/generated-report paths — a placeholder
  (<service>) immediately following the match, a YYYY-MM-DD date token, or a
  path under a generated-report reviews/ dir (14 -> 0 on the current tree).
- marker: skip numbered-backlog references (`TODO 8.2`, `TODO-3.1`, `TODO (2.2`,
  `TODO item 16`), which point at the backlog rather than at code markers
  (35 -> 2; the remaining two are literal "TODO:" strings in a plan doc). Real
  code markers (TODO:, FIXME, etc.) are still caught — verified with a
  synthetic fixture.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
sjat 2026-06-05 20:37:40 +02:00
parent db76be2a63
commit 568729e7bd

View file

@ -24,9 +24,11 @@ PRUNE = {".git", ".venv", ".collections", ".ansible", ".worktrees",
# Relative path `docs/reviews`, joined with the platform's path separator.
SKIP_PREFIX = os.path.join("docs", "reviews") # don't scan our own reports
# File extensions (leading dot included, lower-case).
# NOTE(review): presumably the extensions of the files the scan reads — the
# code that consumes this set is outside this view; confirm.
SOURCE_EXTS = {".yml", ".yaml", ".j2", ".py", ".sh", ".md", ".tf", ".cfg", ".ini"}
# Marker words, but NOT when part of a regex alternation `(TODO|...)` or a filename
# like `TODO.md` / `docs/TODO.md`.
# NOTE(review): this name is rebound by the assignment further down, so if both
# statements run, this earlier pattern has no effect. This listing looks like a
# diff hunk showing the previous and the new definition side by side — confirm.
MARKER_RE = re.compile(r"(?<![(|/])\b(TODO|FIXME|XXX|HACK)\b(?![|)]|\.\w)")
# Marker words, but NOT when part of a regex alternation `(TODO|...)`, a filename
# like `TODO.md` / `docs/TODO.md`, or a numbered backlog reference like `TODO 8.2`
# / `TODO item 16` / `TODO #3` (those point at the backlog, they are not code markers).
#
# How the pattern is put together:
#   (?<![(|/])   the word is not directly preceded by `(`, `|` or `/`
#   (?![|)]|\.\w|...)   the word is not directly followed by `|` or `)`, nor by
#                `.` plus a word character (a file extension), nor by the
#                numbered-reference tail described next
#   [\s\-]*\(?\s*(?:item\s+)?#?\d   optional whitespace/hyphens, an optional `(`,
#                an optional `item `, an optional `#`, then a digit
# Matching is case-sensitive (no flags are passed).
# NOTE(review): the numbered-reference tail applies to all four marker words
# and to any digit that follows, so text such as `FIXME 2 callers` or
# `TODO - 3 cases left` is also skipped — confirm that this is acceptable.
MARKER_RE = re.compile(
r"(?<![(|/])\b(TODO|FIXME|XXX|HACK)\b(?![|)]|\.\w|[\s\-]*\(?\s*(?:item\s+)?#?\d)")
# `ADR-` followed by exactly three digits, delimited by word boundaries; group 1
# captures the three digits.
ADR_REF_RE = re.compile(r"\bADR-(\d{3})\b")
# One of the listed directory names, a `/`, then one or more word characters,
# dots, slashes or hyphens. The match stops at any other character (e.g. `<`,
# `$`, whitespace) and may end in a trailing `.` because dots are in the class.
# There is no leading word boundary, so the directory name can also match as
# the tail of a longer word.
PATH_REF_RE = re.compile(r"(?:docs|scripts|roles|inventories|terraform|playbooks)/[\w./-]+")
# The individual characters `<`, `>`, `*`, `$`, `{` and `}` as a set of
# one-character strings.
PLACEHOLDER = set("<>*${}")
@ -198,6 +200,15 @@ def scan():
ref = pm.group(0).rstrip(".,);:`'\"")
if any(c in ref for c in PLACEHOLDER):
continue
# Skip template / generated-report paths — not real broken refs:
# - a placeholder (<service>, ${x}) immediately follows the matched path
# (the regex stops at the placeholder, so it isn't caught above)
# - a date-template token (`YYYY` or `MM-DD`, as in YYYY-MM-DD) appears in the path
# - the path is under a generated-report `reviews/` directory
if (cand[pm.end():pm.end() + 1] in PLACEHOLDER
or re.search(r"YYYY|MM-DD", ref)
or re.search(r"(?:^|/)reviews/", ref)):
continue
if not os.path.exists(os.path.join(ROOT, ref)):
findings.append({"check": "broken-path-ref", "severity": "medium", "path": rpath,
"line": i, "detail": f"references '{ref}' which does not exist"})