#!/usr/bin/env python3
"""
Phase-0 deterministic pre-scan for the /review-repo command.

Python standard library only. Emits a JSON object to stdout:
  - inventory: roles, adrs, runbooks, playbooks, scripts  (the shard list)
  - findings:  exact, no-judgement issues (markers, broken refs, unencrypted vaults)

The *judgement* review — contradictions, design-conformance, stale intent — is done
by the /review-repo fan-out reviewers, NOT here. This script only catches the cheap,
exact things so the reviewers can focus on what needs reasoning.

Usage:  python3 scripts/repo-scan.py [repo_root]  > scan.json
"""
import json
import os
import re
import sys

# Repository root to scan: first CLI argument if given, otherwise the current
# directory. Resolved to an absolute path once, at import time.
ROOT = os.path.abspath(sys.argv[1] if len(sys.argv) > 1 else ".")

# Directory names that walk_files() never descends into (VCS data, virtualenvs,
# caches, vendored dependencies).
PRUNE = {".git", ".venv", ".collections", ".ansible", ".worktrees",
         ".pytest_cache", "node_modules", "__pycache__"}
SKIP_PREFIX = os.path.join("docs", "reviews")  # don't scan our own reports
# Only files with one of these extensions get the line-by-line checks in scan()
# (vault.yml is handled separately, before this filter is applied).
SOURCE_EXTS = {".yml", ".yaml", ".j2", ".py", ".sh", ".md", ".tf", ".cfg", ".ini"}

# Marker words, but NOT when part of a regex alternation `(TODO|...)`, a filename
# like `TODO.md` / `docs/TODO.md`, or a numbered backlog reference like `TODO 8.2`
# / `TODO item 16` / `TODO #3` (those point at the backlog, they are not code markers).
MARKER_RE = re.compile(
    r"(?<![(|/])\b(TODO|FIXME|XXX|HACK)\b(?![|)]|\.\w|[\s\-]*\(?\s*(?:item\s+)?#?\d)")
# An "ADR-NNN" mention; group 1 is the three-digit number as a string.
ADR_REF_RE = re.compile(r"\bADR-(\d{3})\b")
# A repo-relative path starting with one of the known top-level directories.
PATH_REF_RE = re.compile(r"(?:docs|scripts|roles|inventories|terraform|playbooks)/[\w./-]+")
# Characters that mark a path reference as a template or glob rather than a
# literal path; scan() skips references containing (or followed by) any of them.
PLACEHOLDER = set("<>*${}")

# Stale-deferred detection: ADR "Deferred/Open" entries that another file describes
# as resolved, but which aren't marked resolved in place. (See docs/FRICTION.md.)
# Case-insensitive inline marker that an ADR list item is already resolved.
RESOLVE_MARK_RE = re.compile(r"\b(?:RESOLVED|DECIDED)\b", re.I)
# A markdown list item: group 1 is the marker ("3." or a bullet), group 2 the text.
LIST_ITEM_RE = re.compile(r"^\s*(\d+\.|[-*+])\s+(.*)")
# An external "this resolves ADR-NNN deferred #K" style reference.
# Group 1 is the ADR number, group 2 the ordinal K. At most 40 non-digit
# characters may separate "ADR-NNN" from "deferred", and at most 12 non-digit
# characters may separate "deferred" from the ordinal.
DEFER_REF_RE = re.compile(r"ADR-(\d{3})\D{0,40}?deferred\D{0,12}?(\d+)", re.I)
# Words on the same line as a DEFER_REF_RE hit that indicate the item was resolved.
RESOLVE_WORD_RE = re.compile(r"\b(?:resolv\w*|decid\w*|address\w*|complet\w*|done)\b", re.I)

# ADR-structure check (ADR-023): numbered ADRs must carry the four mandatory
# sections and a parseable Status line. Presence only — section ORDER is a
# template-demonstrated convention, not machine-enforced.
ADR_FILE_RE = re.compile(r"^\d{3}-.*\.md$")
ADR_REQUIRED_SECTIONS = ("Status", "Context", "Decision", "Consequences")
# Accepted lifecycle lines. Used with .match(), so only the START of the status
# line has to fit; trailing text after the recognised form is tolerated.
ADR_STATUS_LINE_RE = re.compile(
    r"^(Proposed \(\d{4}-\d{2}-\d{2}\)"
    r"|Accepted \(\d{4}-\d{2}-\d{2}\)"
    r"|Superseded by ADR-\d{3}"
    r"|Deprecated \(\d{4}-\d{2}-\d{2}\))")


def _is_defer_heading(text):
    t = text.strip().lower()
    return (t.startswith("deferred") or t.startswith("unresolved")
            or "open question" in t or "open issue" in t)


def _defer_subject(item_text):
    m = re.search(r"\*\*(.+?)\*\*", item_text)
    s = m.group(1) if m else re.split(r"\s+[—–-]\s+|:", item_text, maxsplit=1)[0]
    return re.sub(r"\s+", " ", s).strip(" *_`~—–-:.")


def deferred_findings(adr_files, defer_refs):
    """Report open "deferred" list items in ADR files.

    Args:
        adr_files: {rel_path: [lines]} for docs/decisions/*.md.
        defer_refs: [(adr, ordinal, path, line, has_resolve_word)] gathered
            repo-wide; ``adr`` is the three-digit string, ``ordinal`` an int.

    Returns:
        A list of finding dicts. Each list item under a deferred/open heading
        that is not self-marked resolved yields one informational
        `open-deferred-item`. It additionally yields one `stale-deferred` per
        reference in ANOTHER file that mentions the same ADR number and ordinal
        on a line containing a resolve word.
    """
    # Index the "described as resolved" references once, keyed by (adr, ordinal),
    # instead of rescanning every reference for every open item. Insertion order
    # is kept, so findings come out in the same order as a linear scan would give.
    resolved_refs = {}
    for ref_adr, ref_ordinal, ref_path, ref_line, has_resolve_word in defer_refs:
        if has_resolve_word:
            resolved_refs.setdefault((ref_adr, ref_ordinal), []).append((ref_path, ref_line))

    out = []
    for rpath, lines in sorted(adr_files.items()):
        madr = re.match(r"(\d{3})-", os.path.basename(rpath))
        adr_num = madr.group(1) if madr else None
        # Files without an NNN- prefix are labelled by file name so the message
        # never reads "ADR-None".
        label = f"ADR-{adr_num}" if adr_num else os.path.basename(rpath)
        in_defer = False
        for i, raw in enumerate(lines, 1):
            hm = re.match(r"#{1,6}\s+(.*)", raw)
            if hm:
                # Every heading (any level) starts a new section; only deferred /
                # open-question sections are inspected.
                in_defer = _is_defer_heading(hm.group(1))
                continue
            if not in_defer:
                continue
            im = LIST_ITEM_RE.match(raw)
            if not im:
                continue
            marker, item_text = im.group(1), im.group(2)
            # self-marked resolved (inline RESOLVED/DECIDED or ~~strikethrough~~) → fine
            if RESOLVE_MARK_RE.search(raw) or item_text.lstrip().startswith("~~"):
                continue
            # "3." → 3; bullet markers ("-", "*", "+") have no ordinal.
            ordinal = int(marker[:-1]) if marker[:-1].isdigit() else None
            subject = _defer_subject(item_text)
            tag = f" #{ordinal}" if ordinal else ""
            out.append({"check": "open-deferred-item", "severity": "low", "path": rpath,
                        "line": i, "detail": f"open deferred item{tag} in {label}: "
                        f"'{subject[:80]}' — confirm not resolved by a later ADR/STATUS"})
            if adr_num and ordinal:
                for rp, rl in resolved_refs.get((adr_num, ordinal), ()):
                    # A reference inside the ADR itself is not an external contradiction.
                    if rp == rpath:
                        continue
                    out.append({"check": "stale-deferred", "severity": "medium",
                                "path": rpath, "line": i,
                                "detail": f"ADR-{adr_num} deferred #{ordinal} "
                                f"('{subject[:60]}') is described as resolved at "
                                f"{rp}:{rl}, but is not marked RESOLVED in place"})
    return out


def adr_structure_findings(adr_files):
    """Check numbered ADR files for mandatory sections and a parseable Status.

    adr_files: {rel_path: [lines]} for docs/decisions/*.md.
    Only files named NNN-*.md are inspected; anything else (e.g.
    adr-template.md) is skipped. A finding is emitted when a mandatory
    ``## Section`` heading is absent, and another when the first non-blank line
    under ``## Status`` does not start with a recognised lifecycle form.
    Section order is NOT checked (ADR-023).
    """
    findings = []
    for rel_path in sorted(adr_files):
        if ADR_FILE_RE.match(os.path.basename(rel_path)) is None:
            continue
        lines = adr_files[rel_path]

        # First word of each level-2 heading → 0-based index of its first occurrence.
        first_seen = {}
        for idx, text in enumerate(lines):
            heading = re.match(r"^##\s+(\w+)", text)
            if heading and heading.group(1) not in first_seen:
                first_seen[heading.group(1)] = idx

        absent = [name for name in ADR_REQUIRED_SECTIONS if name not in first_seen]
        if absent:
            findings.append({"check": "adr-structure", "severity": "medium",
                             "path": rel_path, "line": 1,
                             "detail": f"missing mandatory section(s): {', '.join(absent)}"})

        status_idx = first_seen.get("Status")
        if status_idx is None:
            continue

        # First non-blank line of the Status section, or "" if the section is
        # empty; the section ends at the next "## " heading.
        status_text = ""
        for text in lines[status_idx + 1:]:
            if text.startswith("## "):
                break
            stripped = text.strip()
            if stripped:
                status_text = stripped
                break

        if ADR_STATUS_LINE_RE.match(status_text):
            continue
        findings.append({"check": "adr-structure", "severity": "medium",
                         "path": rel_path, "line": status_idx + 1,
                         "detail": "Status not parseable (want 'Proposed (YYYY-MM-DD)', "
                                   "'Accepted (YYYY-MM-DD)', 'Superseded by ADR-NNN', or "
                                   "'Deprecated (YYYY-MM-DD)'); "
                                   f"got: {status_text[:60]!r}"})
    return findings


def walk_files():
    """Yield the path of every file under ROOT, skipping PRUNE directories.

    Directories and files are visited in sorted name order. os.walk hands back
    entries in whatever order the filesystem lists them, so without sorting the
    order of findings would differ between machines and runs.
    """
    for dirpath, dirnames, filenames in os.walk(ROOT):
        # Slice-assign in place: os.walk (top-down) only honours pruning and
        # reordering done on the very list object it handed out.
        dirnames[:] = sorted(d for d in dirnames if d not in PRUNE)
        for f in sorted(filenames):
            yield os.path.join(dirpath, f)


def rel(path):
    """Return *path* expressed relative to the scan root (ROOT)."""
    return os.path.relpath(path, start=ROOT)


def inventory():
    """Build the shard inventory: the names found in each well-known directory.

    Returns a dict with keys roles, adrs, runbooks, playbooks and scripts, each
    a sorted list of entry names (not paths). A directory that does not exist
    contributes an empty list.
    """
    def entries(*parts, want_dirs=False, suffixes=None):
        # Sorted names directly inside ROOT/<parts>, optionally restricted to
        # sub-directories and/or to names ending in one of `suffixes`.
        base = os.path.join(ROOT, *parts)
        if not os.path.isdir(base):
            return []
        return [
            name for name in sorted(os.listdir(base))
            if (not want_dirs or os.path.isdir(os.path.join(base, name)))
            and (not suffixes or name.endswith(suffixes))
        ]

    markdown = (".md",)
    return {
        "roles": entries("roles", want_dirs=True),
        "adrs": entries("docs", "decisions", suffixes=markdown),
        "runbooks": entries("docs", "runbooks", suffixes=markdown),
        "playbooks": entries("playbooks", suffixes=(".yml", ".yaml")),
        "scripts": entries("scripts"),
    }


def adr_numbers():
    """Return the set of three-digit ADR numbers (as strings) that have a file.

    A number counts when an entry directly inside ROOT/docs/decisions has a
    name starting with ``NNN-``. The set is empty when that directory is missing.
    """
    decisions = os.path.join(ROOT, "docs", "decisions")
    if not os.path.isdir(decisions):
        return set()
    prefixes = (re.match(r"(\d{3})-", entry) for entry in os.listdir(decisions))
    return {hit.group(1) for hit in prefixes if hit}


def scan():
    """Walk the repository and collect every deterministic finding.

    For each file yielded by walk_files() (files under the SKIP_PREFIX directory
    are ignored):
      - vault-unencrypted: a ``vault.yml`` that does not start with
        ``$ANSIBLE_VAULT`` yet has content other than blank lines, comments and
        ``---``.
      - marker: TODO/FIXME/XXX/HACK as matched by MARKER_RE.
      - broken-adr-ref: an ``ADR-NNN`` mention whose number is not in adr_numbers().
      - broken-path-ref: a path inside backticks or a markdown link target that
        does not exist under ROOT.
    Markdown files under docs/decisions are additionally collected and handed to
    deferred_findings() and adr_structure_findings().

    Unreadable files (OSError) are skipped silently: the scan is best-effort.

    Returns:
        A list of finding dicts with keys check, severity, path, line, detail.
    """
    findings = []
    adrs = adr_numbers()
    adr_files = {}        # docs/decisions/*.md → lines, for deferred-section parsing
    defer_refs = []       # repo-wide "resolves ADR-NNN deferred #K" references
    # Trailing separator anchors the prefix tests on a directory boundary, so
    # siblings such as "docs/reviews-old/" or "docs/decisions-archive/" are not
    # mistaken for the real directories.
    skip_dir = SKIP_PREFIX + os.sep
    decisions_dir = os.path.join("docs", "decisions") + os.sep
    for path in walk_files():
        rpath = rel(path)
        if rpath.startswith(skip_dir):
            continue
        name = os.path.basename(path)

        if name == "vault.yml":
            try:
                with open(path, encoding="utf-8", errors="replace") as fh:
                    text = fh.read()
            except OSError:
                continue
            if not text.startswith("$ANSIBLE_VAULT"):
                # Only flag a plaintext vault that actually holds something.
                real = [ln for ln in text.splitlines()
                        if ln.strip() and not ln.lstrip().startswith("#") and ln.strip() != "---"]
                if real:
                    findings.append({"check": "vault-unencrypted", "severity": "high",
                                     "path": rpath, "line": 1,
                                     "detail": "vault.yml is not ansible-vault encrypted but has content"})
            # vault.yml never gets the line-level checks below.
            continue

        if os.path.splitext(path)[1] not in SOURCE_EXTS:
            continue
        try:
            with open(path, encoding="utf-8", errors="replace") as fh:
                lines = fh.readlines()
        except OSError:
            continue

        if rpath.startswith(decisions_dir) and rpath.endswith(".md"):
            adr_files[rpath] = lines

        for i, line in enumerate(lines, 1):
            for m in DEFER_REF_RE.finditer(line):
                defer_refs.append((m.group(1), int(m.group(2)), rpath, i,
                                   bool(RESOLVE_WORD_RE.search(line))))
            # One finding per line, listing each distinct marker word once.
            markers = sorted(set(m.group(1) for m in MARKER_RE.finditer(line)))
            if markers:
                findings.append({"check": "marker", "severity": "low", "path": rpath,
                                 "line": i, "detail": f"{'/'.join(markers)}: {line.strip()[:120]}"})
            for m in ADR_REF_RE.finditer(line):
                if m.group(1) not in adrs:
                    findings.append({"check": "broken-adr-ref", "severity": "medium", "path": rpath,
                                     "line": i, "detail": f"references ADR-{m.group(1)} (no such file)"})
            # Only check path-like references that appear inside backticks or a
            # markdown link target — bare prose ("roles/docs") is not a real path.
            for cand in re.findall(r"`([^`]+)`", line) + re.findall(r"\]\(([^)]+)\)", line):
                if "://" in cand:  # skip URLs
                    continue
                pm = PATH_REF_RE.search(cand)
                if not pm:
                    continue
                ref = pm.group(0).rstrip(".,);:`'\"")
                if any(c in ref for c in PLACEHOLDER):
                    continue
                # Skip template / generated-report paths — not real broken refs:
                #  - a placeholder (<service>, ${x}) immediately follows the matched path
                #    (the regex stops at the placeholder, so it isn't caught above)
                #  - a date-template token (YYYY-MM-DD) appears in the path
                #  - the path is under a generated-report `reviews/` directory
                if (cand[pm.end():pm.end() + 1] in PLACEHOLDER
                        or re.search(r"YYYY|MM-DD", ref)
                        or re.search(r"(?:^|/)reviews/", ref)):
                    continue
                if not os.path.exists(os.path.join(ROOT, ref)):
                    findings.append({"check": "broken-path-ref", "severity": "medium", "path": rpath,
                                     "line": i, "detail": f"references '{ref}' which does not exist"})
    findings.extend(deferred_findings(adr_files, defer_refs))
    findings.extend(adr_structure_findings(adr_files))
    return findings


def main():
    """Run the scan, print the JSON report to stdout and a summary to stderr."""
    # Inventory is gathered before the scan, matching the report's key order.
    shards = inventory()
    found = scan()
    report = {"root": ROOT, "inventory": shards, "findings": found}
    print(json.dumps(report, indent=2))

    # Tally findings per check name for the one-line human summary.
    tally = {}
    for item in found:
        check = item["check"]
        tally[check] = tally.get(check, 0) + 1
    parts = [f"{check}={tally[check]}" for check in sorted(tally)]
    summary = ", ".join(parts) or "no deterministic findings"

    # The summary goes to stderr so that stdout stays pure JSON.
    print(f"repo-scan: {len(shards['roles'])} roles, "
          f"{len(shards['adrs'])} ADRs; {summary}", file=sys.stderr)


# Run the scan only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()