diff --git a/README.md b/README.md index cf7dfd6..c6ec866 100644 --- a/README.md +++ b/README.md @@ -108,6 +108,13 @@ See `Makefile` for the full list of targets. - Control / AI-worker host (`ubongo`): `docs/decisions/015-control-host.md` - Mesh VPN (NetBird): `docs/decisions/016-mesh-vpn.md` - Service-UI verification (Level 4): `docs/decisions/017-service-ui-verification.md` +- Logging & log integrity: `docs/decisions/018-logging.md` +- Tagging & run-targeting: `docs/decisions/019-tagging.md` +- Firewall strategy: `docs/decisions/020-firewall.md` +- Operational access: `docs/decisions/021-operational-access.md` +- Backup & disaster recovery: `docs/decisions/022-backup.md` +- ADR structure & lifecycle: `docs/decisions/023-adr-structure.md` +- Reverse proxy (Caddy): `docs/decisions/024-reverse-proxy.md` (CLAUDE.md carries the full cross-referenced table, including the runbooks and security/testing docs.) diff --git a/docs/CAPABILITIES.md b/docs/CAPABILITIES.md index 5a2caa5..636234a 100644 --- a/docs/CAPABILITIES.md +++ b/docs/CAPABILITIES.md @@ -24,9 +24,9 @@ decisions this frame enables. 
| Capability | Candidate service(s) | Tier | Commitment | What it does | Notes / open | |---|---|---|---|---|---| -| Reverse proxy / TLS | Traefik | P | core | Edge routing + ACME certs for everything exposed | Spin-up order names it (TODO 12) | +| Reverse proxy / TLS | Caddy (ADR-024) | P | core | Edge routing + ACME certs for everything exposed | Spin-up order names it (TODO 12) | | Internal DNS | `dns` role → dns1/dns2 | P | core | Authoritative internal zone (ADR-007) | Ansible-rendered zone | -| Public DNS | `public_dns` role → Gandi LiveDNS | P | core | wingu.me zone as code (ADR-007) | anti-spoof baseline; mesh/LAN-only default; apply pending | +| Public DNS | `public_dns` role → Gandi LiveDNS | P | core | wingu.me zone as code (ADR-007) | anti-spoof baseline; mesh/LAN-only default; applied (M1) | | VPN / remote access | NetBird (self-hosted on `askari`) | P | core | Secure mesh remote access to `srv`/`mgmt` | **Decided (ADR-016):** NetBird mesh replaces ADR-007 OPNsense WireGuard | | Service portal / dashboard | Homepage | A | candidate | One landing page listing all services — a "what does what" front door | Gap surfaced by V4; fits boma's legibility goal | diff --git a/docs/reviews/2026-06-14-findings.json b/docs/reviews/2026-06-14-findings.json new file mode 100644 index 0000000..f043393 --- /dev/null +++ b/docs/reviews/2026-06-14-findings.json @@ -0,0 +1,76 @@ +{ + "date": "2026-06-14", + "reviewed_commit": "e346137", + "fixes_commit": null, + "mode": "on-demand", + "counts": { + "auto_fixed": 11, + "open": 29, + "scan": { + "broken-adr-ref": 4, + "broken-path-ref": 2, + "marker": 14, + "open-deferred-item": 5, + "stale-deferred": 0 + } + }, + "deferral_checklist": { + "adr-011-open-items": "all 5 ('Open questions': Proxmox snapshot driver, exact cadences, health-check harness home, classification home, staging-first) confirmed genuinely still open. 
ADR-011 is still Proposed/unbuilt; the same questions are echoed open in docs/TODO.md item 16; no later ADR or STATUS decides any of them. No stale-deferred.", + "stale_deferred_found": 0 + }, + "scan_false_positives": [ + {"check": "broken-adr-ref", "location": "tests/test_repo_scan.py:10,43; docs/superpowers/plans/2026-06-10-adr-structure.md:50,83", "why": "ADR-099/ADR-100 are intentional test fixtures exercising the scanner's bad-ref detection."}, + {"check": "broken-path-ref", "location": "docs/superpowers/plans/2026-06-14-m4b-netbird.md:28,56", "why": "roles/netbird/ is referenced by the M4b implementation plan for a role to be scaffolded via make new-role; forward-looking plan for unbuilt work, not a dead ref."}, + {"check": "marker", "location": "docs/decisions/019-tagging.md:14 + docs/superpowers/plans/* + docs/superpowers/specs/*", "why": "019-tagging.md:14 is prose discussing 'over-tagging' as a concept ('the TODO explicitly warns against...'), not an actionable TODO. The 13 superpowers markers are historical planning artifacts (commit-message TODOs, plan steps)."} + ], + "auto_fixed": [ + {"id": "AF1", "dimension": "drift", "severity": "high", "location": "roles/reverse_proxy/meta/main.yml:4-6", "description": "meta description said 'ACME DNS-01 TLS via Gandi ... builds the custom image on-host (caddy-dns/gandi)' — but the role is now vanilla Caddy + HTTP-01 (commit b7e919d dropped the custom image); README/defaults/compose/STATUS all reflect vanilla. Only meta was stale and contradicted the code.", "fix": "rewrote description to 'Vanilla Caddy reverse proxy (ADR-024); TLS via ACME HTTP-01 for public hosts. 
Routes from reverse_proxy__routes, managed via Docker Compose.'", "tag": "new"}, + {"id": "AF2", "dimension": "cruft", "severity": "medium", "location": "roles/README.md:11-15", "description": "Current-state paragraph said base hardening (SSH/fail2ban), auditd, packages, users 'not yet built' and docker_host 'scaffolded but has no tasks yet' — but STATUS records the hardening concern built+tested+applied to askari, and docker_host/reverse_proxy/public_dns all built.", "fix": "rewrote to: base firewall+hardening built (hardening applied to askari), docker_host/reverse_proxy/public_dns/dev_env built; auditd/packages/users pending.", "tag": "recurring"}, + {"id": "AF3", "dimension": "drift", "severity": "medium", "location": "playbooks/README.md:6-13", "description": "site.yml note said docker_host 'scaffolded with no tasks yet' (now installs Docker engine) and the file omitted dns.yml and offsite.yml entirely.", "fix": "reworded site.yml note (base firewall+hardening, no cluster docker hosts yet) and added dns.yml + offsite.yml bullets.", "tag": "new"}, + {"id": "AF4", "dimension": "cruft", "severity": "low", "location": "roles/public_dns/README.md:7-9", "description": "'the anti-spoof baseline now; askari in M4' — M4a is done; askari + *.askari records are applied.", "fix": "updated to note askari.wingu.me + *.askari wildcard applied in M4a.", "tag": "new"}, + {"id": "AF5", "dimension": "cruft", "severity": "low", "location": "scripts/README.md:17", "description": "Helper-script list omitted check-tags.py, which exists and is run by make lint (ADR-019).", "fix": "added a check-tags.py bullet.", "tag": "new"}, + {"id": "AF6", "dimension": "drift", "severity": "medium", "location": "terraform/README.md:7-15", "description": "Top-level terraform README omitted modules/hetzner_vm and environments/offsite — the only built+applied TF environment (askari).", "fix": "added hetzner_vm + offsite env bullets; scoped 'not yet init'ed' to the Proxmox envs.", "tag": "new"}, + 
{"id": "AF7", "dimension": "cruft", "severity": "low", "location": "terraform/environments/offsite/providers.tf:1", "description": "Verified-stamp said 'cax11@hel1' but the deployed server is cx23 (CAX11 out of stock).", "fix": "stamp now reads cx23@hel1.", "tag": "new"}, + {"id": "AF8", "dimension": "cruft", "severity": "low", "location": "terraform/modules/hetzner_vm/variables.tf:7", "description": "server_type description example was 'e.g. cax11 (ARM)'; the only consumer uses cx23.", "fix": "example now 'e.g. cx23 (x86) or cax11 (ARM)'.", "tag": "new"}, + {"id": "AF9", "dimension": "drift", "severity": "medium", "location": "inventories/production/group_vars/all/public_dns.yml:16-17", "description": "Comment on the *.askari wildcard said 'Caddy gets a *.askari.wingu.me cert via DNS-01 (M4a)' — M4a uses HTTP-01 (the wildcard A record itself is still legitimately needed for name resolution).", "fix": "comment now says per-host certs via ACME HTTP-01 (M4a).", "tag": "new"}, + {"id": "AF10", "dimension": "drift", "severity": "high", "location": "docs/CAPABILITIES.md:27,29", "description": "Capability table named Traefik as the reverse-proxy candidate (ADR-024 chose Caddy, built+applied) and marked public DNS 'apply pending' (applied 2026-06-14).", "fix": "reverse-proxy row -> 'Caddy (ADR-024)'; public DNS note -> 'applied (M1)'. 
(The V4-history Traefik mention at line 134 is correct and left as-is.)", "tag": "new"}, + {"id": "AF11", "dimension": "cruft", "severity": "low", "location": "README.md:110-119", "description": "README 'Documentation' ADR list stopped at ADR-017; ADR-018..024 exist.", "fix": "extended the list through ADR-024 (logging, tagging, firewall, access, backup, ADR-structure, reverse-proxy).", "tag": "recurring"} + ], + "open": [ + {"id": "O1", "dimension": "drift", "severity": "high", "location": "STATUS.md:41 (+ 45-48) ↔ STATUS.md:33-34", "description": "The 'Scaffolded but empty — NOT implemented' table still lists roles/docker_host as 'Scaffolded, no tasks ... applying it is a no-op', and the trailing prose (45-48) repeats it. This contradicts STATUS.md:33-34 ('Built + applied', installs Docker CE + compose) and the actual roles/docker_host/tasks/main.yml. An internal STATUS contradiction; one side is plainly correct (docker_host is built).", "suggested_fix": "Remove/rewrite the docker_host row in the 'Scaffolded but empty' table and the 45-48 paragraph: docker_host now installs the Docker engine; only its deferred daemon-hardening + nftables.d scope (ADR-004/020) remains. Report (STATUS is the operator's ground-truth doc — reword deliberately).", "tag": "new", "auto_fixable": false}, + {"id": "O2", "dimension": "consistency", "severity": "high", "location": "docs/decisions/004-docker-model.md:105,131 ↔ docs/decisions/022-backup.md", "description": "ADR-004 states twice that 'Backup strategy is defined separately (not in scope of this repo)'. ADR-022 defines a full in-repo backup/DR doctrine (restic, fisi pull node, per-service backup__* + BACKUP.md). Direct ADR↔ADR scope contradiction.", "suggested_fix": "Reword ADR-004's lines to point at ADR-022 (backup is now in-repo scope) and cross-link, per ADR-023's no-silent-reversal rule. 
Design decision — report.", "tag": "recurring", "auto_fixable": false}, + {"id": "O3", "dimension": "consistency", "severity": "high", "location": "docs/decisions/024-reverse-proxy.md (Consequences) ↔ 008-testing.md:70; 017-service-ui-verification.md:27,88; 019-tagging.md:52", "description": "ADR-024's Consequences claim 'ADR-017 prose that mentioned Traefik is updated to read Caddy'. That update was NOT done: ADR-017:27,88 still say 'Traefik + Authentik'; ADR-008:70 'Traefik + Authentik SSO flow'; ADR-019:52 'Traefik routes, Authentik'. The doc set still designs around Traefik while ADR-024 overclaims the reconciliation was completed.", "suggested_fix": "Replace Traefik with Caddy (ADR-024) in ADR-008:70, ADR-017:27,88, ADR-019:52, OR soften ADR-024's Consequences to 'to be updated'. ADR prose = design docs — report (not auto-fixed).", "tag": "new", "auto_fixable": false}, + {"id": "O4", "dimension": "conformance", "severity": "high", "location": "docs/decisions/023-adr-structure.md:7-8,77-80 ↔ 016-mesh-vpn.md:3; 017-service-ui-verification.md:3; 018-logging.md:3", "description": "ADR-023 §2 mandates ## Status as the first section and §6 explicitly claims ADRs 001–018 were retroactively restructured to lead with Status (calling out 016–018). But ADR-016/017/018 still open with ## Context, Status buried late (016:~92, 017:~66, 018:~73). ADR-023's own conformance claim is contradicted by three in-scope files. (Older ADRs 001–010 lead with Status but place Decision/Consequences after topical sections — an accepted presentational trade-off per ADR-023 §5/§6.)", "suggested_fix": "Either add a top-of-file ## Status section to ADR-016/017/018 (move the existing build-state line up), or correct ADR-023 §6 to exclude them. 
Reordering judgement — report.", "tag": "recurring", "auto_fixable": false}, + {"id": "O5", "dimension": "consistency", "severity": "medium", "location": "docs/decisions/004-docker-model.md:48-50", "description": "The service-role file table (the canonical standard) lists only README/SECURITY/VERIFY; it omits ACCESS.md (ADR-021) and BACKUP.md (ADR-022), both of which CLAUDE.md + those ADRs mandate as required per-service-role files.", "suggested_fix": "Add ACCESS.md (ADR-021) and BACKUP.md (ADR-022, stateful) rows to ADR-004's file table.", "tag": "recurring", "auto_fixable": false}, + {"id": "O6", "dimension": "drift", "severity": "medium", "location": "docs/decisions/002-security.md:82", "description": "References 'make deploy PLAYBOOK=upgrade' as the deliberate full-upgrade mechanism, but no upgrade.yml exists (only bootstrap/dns/offsite/site/workstation) and ADR-011 is still Proposed/unbuilt — stated without the '(planned)' caveat ADR-002 uses for its other unbuilt controls.", "suggested_fix": "Add a '(planned — ADR-011, not yet built)' caveat to the upgrade line, or drop the concrete command until upgrade.yml exists.", "tag": "recurring", "auto_fixable": false}, + {"id": "O7", "dimension": "drift", "severity": "medium", "location": "docs/CAPABILITIES.md:150-155 ↔ STATUS.md:29", "description": "CAPABILITIES still lists nvim/kitty/tmux among 'Confirmed exclusions' boma 'deliberately does not' have, but the dev_env role (built+applied to ubongo) installs neovim + tmux. 
(The reverse-proxy/public-DNS rows in this file were auto-fixed in AF10; this exclusions block was left because it needs a scoped carve-out, not a token swap.)", "suggested_fix": "Scope the exclusion to managed cluster/server hosts and note the control/dev host (ubongo, ADR-015) runs an interactive dev_env, or drop nvim/tmux from the list.", "tag": "recurring", "auto_fixable": false}, + {"id": "O8", "dimension": "conformance", "severity": "medium", "location": "roles/dev_env/tasks/main.yml (include_tasks per_user.yml) + roles/dev_env/tasks/per_user.yml:4-9", "description": "per_user.yml's getent + set_fact dev_env__home preflight is untagged, and the include_tasks that pulls it in carries no 'apply: tags:'. base/tasks/main.yml documents and guards exactly this gotcha with apply: tags:; dev_env does not. A partial --tags users or --tags config run selects only the include statement (running nothing) or, if made tag-aware, skips the set_fact and fails the dependent [config] tasks on an undefined dev_env__home. Against ADR-019's concern-runnable-in-isolation intent.", "suggested_fix": "Add apply: tags: [users, config] to the per_user.yml include (mirroring base), and tag the getent+set_fact with 'always' (or the union [users, config]).", "tag": "recurring", "auto_fixable": false}, + {"id": "O9", "dimension": "drift", "severity": "medium", "location": "inventories/production/hosts.yml:1-17", "description": "Header claims 'Generated from Terraform outputs: make tf-inventory TF_ENV=production', but the file is hand-maintained: it carries the manual control host (ubongo) and omits the offsite_hosts group that tf_to_inventory.py always emits (VALID_GROUPS). 
Running tf-inventory against the empty production env would DROP ubongo and ADD offsite_hosts, so the header misrepresents how the file is managed.", "suggested_fix": "Make the header honest (hand-maintained for the manual control-node exception while production TF has no VMs; offsite hosts live in offsite.yml), and reconcile the declared group set with tf_to_inventory.py. Do NOT hand-regenerate hosts.yml in a way that drops ubongo.", "tag": "recurring", "auto_fixable": false}, + {"id": "O10", "dimension": "consistency", "severity": "medium", "location": "inventories/production/group_vars/all/vars.yml:42 + hosts.yml:12 ↔ docs/decisions/007-network.md", "description": "ubongo's address is 10.20.10.151 (control host_var + base__firewall_control_addr), but ADR-007 defines srv as 10.20.0.0/24 (network__srv_subnet) and mgmt as 10.10.0.0/24 — 10.20.10.151 is in neither, and ADR-007's addressing tables don't record where the physical control node lives. base__firewall_control_addr (ADR-021 recovery path) depends on this being right.", "suggested_fix": "Add ubongo to ADR-007's addressing table (which VLAN/segment 10.20.10.151 belongs to, clearly outside srv 10.20.0.0/24), or correct the address. Confirm the real address with the operator first.", "tag": "recurring", "auto_fixable": false}, + {"id": "O11", "dimension": "consistency", "severity": "medium", "location": "terraform/environments/{staging,production}/terraform.tfvars.example:9-11 + variables.tf:5", "description": "Proxmox node naming uses 'pve01' (two-digit) in both tfvars.example files and the proxmox_endpoint var descriptions; ADR-007 defines single-digit node names pve0/pve1/pve2, and internal FQDNs as <node>.boma.<domain>. Example contradicts the naming convention.", "suggested_fix": "Align example values with ADR-007 (proxmox_node = pve0; endpoint = https://pve0.boma.<domain>:8006/). 
Verify the intended node name with the operator before changing — report rather than auto-fix.", "tag": "recurring", "auto_fixable": false}, + {"id": "O12", "dimension": "conformance", "severity": "medium", "location": "roles/reverse_proxy/ (missing SECURITY.md, VERIFY.md, ACCESS.md, BACKUP.md)", "description": "CLAUDE.md requires every service role to carry SECURITY.md (ADR-002/004), VERIFY.md (ADR-008/017), ACCESS.md (ADR-021), and a stateful BACKUP.md (ADR-022); a stateless service records backup__state: false with a reason. reverse_proxy is the first real built+applied service role (askari, M4a) but ships only README.md. (Judgement recorded: public_dns is exempt — it runs on the control node against an external DNS API, provisioning no host-resident service/port, so it is not a 'service' role in the ADR-004 sense.)", "suggested_fix": "Add the four files from docs/security|testing|access|backup/ templates. BACKUP.md can declare backup__state: false (Caddy state = re-issuable ACME certs).", "tag": "new", "auto_fixable": false}, + {"id": "O13", "dimension": "consistency", "severity": "low", "location": "docs/decisions/012-hardware-capacity.md; 013-heritage-v4.md:77; 015-control-host.md; 016-mesh-vpn.md; 017-service-ui-verification.md; 018-logging.md", "description": "Inconsistent cross-reference convention: ADRs 014/019/020/021/022/023 + adr-template use a dedicated '## Related' section, while 012/013/015/016/017/018 use an inline 'See also:' prose line (placed mid-document in 016/017/018). ADR-023 §3 names ## Related as the optional section; 'See also:' is an undocumented variant.", "suggested_fix": "Convert the 'See also:' prose into ## Related sections (after Consequences) in ADR-012/013/015/016/017/018 for uniformity. 
Cosmetic.", "tag": "recurring", "auto_fixable": false}, + {"id": "O14", "dimension": "consistency", "severity": "low", "location": "docs/README.md:4-8; inventories/README.md", "description": "docs/README.md lists only decisions/ + runbooks/ (omits security/testing/access/backup/hardware/reviews); inventories/README.md omits the offsite_hosts group documented in CLAUDE.md. Both narrower than current reality.", "suggested_fix": "Add the missing subdir rows / note offsite_hosts, or explicitly defer to the canonical list in the repo README / CLAUDE.md.", "tag": "recurring", "auto_fixable": false}, + {"id": "O15", "dimension": "drift", "severity": "medium", "location": "docs/runbooks/new-host.md:82,114-138 (Part E)", "description": "Part E (control node ubongo) still instructs 'ssh ansible@' / an ansible-user flow, but STATUS records ubongo is deliberately managed as the operator account sjat (group_vars/control ansible_user: sjat) with the ansible-user bootstrap listed as Pending.", "suggested_fix": "Update Part E to reflect ubongo managed as sjat (no ansible user yet), the ansible-user bootstrap a pending item per STATUS.md.", "tag": "recurring", "auto_fixable": false}, + {"id": "O16", "dimension": "consistency", "severity": "low", "location": "roles/dev_env/files/dotfiles/zsh/.zshrc:28,55", "description": "Shipped .zshrc hard-codes alias rclone=\"/usr/bin/rclone\" (rclone not installed by dev_env) and 'eval \"$(direnv hook zsh)\"' unguarded (unlike the guarded oh-my-posh block) — heritage fisi/V4 carryovers. 
If direnv is dropped from dev_env__packages, every shell startup errors.", "suggested_fix": "Drop the rclone alias and guard the direnv hook with 'command -v direnv', or document direnv as a hard dependency of the shipped .zshrc.", "tag": "recurring", "auto_fixable": false}, + {"id": "O17", "dimension": "consistency", "severity": "low", "location": "roles/dev_env/tasks/oh_my_posh.yml:15-26", "description": "The zen.toml theme-directory + deploy tasks render config to disk but carry no 'config' tag, while analogous dotfile tasks in per_user.yml are tagged config — inconsistent concern tagging within the role.", "suggested_fix": "Add tags: [config] to the zen.toml directory + deploy tasks.", "tag": "recurring", "auto_fixable": false}, + {"id": "O18", "dimension": "drift", "severity": "medium", "location": "docs/decisions/007-network.md:159,167,186 + 009-provisioning-handoff.md:114 + 016-mesh-vpn.md:90 ↔ 007-network.md:174,184", "description": "Internal-zone name is inconsistent across the doc set: ADR-007:159/167/186, ADR-009:114, ADR-016:90 call it 'boma.baobab.band', while ADR-007:174/184 says infra is '.boma.wingu.me' and the internal zone 'will be renamed to boma.wingu.me' (Phase 2). M1 moved boma's home to wingu.me. A reader can't tell which domain the unbuilt dns role should render.", "suggested_fix": "State the transitional state in one authoritative place (current = boma.baobab.band, target = boma.wingu.me in Phase 2), or align all references on the target. Report.", "tag": "new", "auto_fixable": false}, + {"id": "O19", "dimension": "consistency", "severity": "low", "location": "docs/decisions/009-provisioning-handoff.md:122", "description": "M1 retired 'nyumbani' as a naming tier (ROADMAP:70, ADR-007:176). ADR-009:122 still uses 'forgejo.nyumbani.baobab.band' as the worked example of internal-zone data the dns role would render. 
(Note: STATUS:19 + ADR-003/008/010 use the same name for the LIVE legacy Forgejo host, which is legitimately legacy infra — distinguish.)", "suggested_fix": "Update the ADR-009:122 example to a non-nyumbani name consistent with the retired-nyumbani decision; annotate the legacy Forgejo references as intentionally legacy where they remain.", "tag": "recurring", "auto_fixable": false}, + {"id": "O20", "dimension": "drift", "severity": "low", "location": "docs/ROADMAP.md:82-83", "description": "ROADMAP M2 still describes askari as 'CAX11 ARM / Helsinki', but STATUS records it provisioned as cx23/x86 (CAX11/ARM out of stock EU-wide on 2026-06-14). M3/M4 sections got DONE notes; M2's spec line wasn't corrected.", "suggested_fix": "Update ROADMAP M2 to note askari shipped as cx23/x86 (CAX11 unavailable), or add a DONE note mirroring M3/M4.", "tag": "new", "auto_fixable": false}, + {"id": "O21", "dimension": "drift", "severity": "low", "location": "docs/decisions/020-firewall.md:91-93", "description": "ADR-020 says askari's Hetzner Cloud Firewall 'NetBird ports (UDP 3478 + TCP 80/443) will be added in M4 when the coordinator role is built' — but M4a is DONE and the firewall already opens 80/443/3478. 
Future-tense is stale; only the netbird role (M4b) remains.", "suggested_fix": "Update ADR-020 to past tense (80/443/3478 opened in M4a); keep the netbird coordinator role (M4b) caveated as unbuilt.", "tag": "new", "auto_fixable": false}, + {"id": "O22", "dimension": "consistency", "severity": "low", "location": "docs/decisions/024-reverse-proxy.md:60-92", "description": "ADR-024 is internally inconsistent post-revision: the revised Status note says askari ships HTTP-01 with vanilla Caddy (custom-image DNS-01 deferred to Phase 2), but Decision §2 still asserts boma builds/maintains the custom xcaddy+gandi image, §3 says 'fronts the NetBird stack on askari (M4)' (M4b unbuilt), and Consequences still lists 'a custom Caddy image must be built/pushed/kept current' as a present obligation.", "suggested_fix": "Scope the custom-image obligation (§2, Consequences) to the deferred Phase-2 DNS-01 path; soften §3 to reflect that M4a ships a test vhost and the NetBird front-end is M4b. Report (touches decision substance).", "tag": "new", "auto_fixable": false}, + {"id": "O23", "dimension": "consistency", "severity": "low", "location": "docs/decisions/001-architecture.md:50 + 016-mesh-vpn.md:87 ↔ docs/ROADMAP.md:116", "description": "The future NetBird service role is named 'netbird_coordinator' in ADR-001:50 + ADR-016:87 (coordinator framing also in STATUS), but ROADMAP M4b:116 calls it 'the netbird service role'. make new-role creates one directory name; the committed names will mismatch the actual role at build time. (The M4b plan at docs/superpowers/plans/2026-06-14-m4b-netbird.md also uses 'netbird'.)", "suggested_fix": "Settle one role name and align ADR-001/016, ROADMAP, and the M4b plan before scaffolding.", "tag": "new", "auto_fixable": false}, + {"id": "O24", "dimension": "consistency", "severity": "low", "location": "docs/decisions/024-reverse-proxy.md:22 ↔ docs/ROADMAP.md:71", "description": "ADR-024 describes the M1 ACME DNS-01 wildcard as '*.boma.' 
(infra subdomain), while ROADMAP:71 specifies '*.' (apex). Different name spaces — the cert's actual SAN coverage for unexposed services is ambiguous across the two docs.", "suggested_fix": "Align the wildcard scope (decide *.wingu.me vs *.boma.wingu.me vs both) and state it identically in ADR-024 and ROADMAP.", "tag": "new", "auto_fixable": false}, + {"id": "O25", "dimension": "consistency", "severity": "low", "location": "roles/reverse_proxy/molecule/default/verify.yml:11,22; roles/public_dns/molecule/default/verify.yml:12", "description": "Molecule verify tasks use tags: [verify], which is not in the tests/tags.yml vocabulary (concerns/special/opt_ins/playbooks). check-tags.py exempts molecule/ paths so the linter doesn't flag it, and 4 roles use this de-facto convention — but it's an out-of-vocabulary tag the ADR-019 standard doesn't sanction.", "suggested_fix": "Either drop the tags from molecule verify tasks (the linter ignores molecule anyway) or add 'verify' as a sanctioned testing-only tag in tests/tags.yml with an ADR-019 note. Repo-wide convention call.", "tag": "new", "auto_fixable": false}, + {"id": "O26", "dimension": "consistency", "severity": "low", "location": "roles/reverse_proxy/templates/Caddyfile.j2:1; docker-compose.yml.j2:1", "description": "Neither rendered template carries an {{ ansible_managed }} header, though ADR-024 §1.2 cites 'one ansible_managed header' as a Caddy advantage. 
(No template in the repo currently uses ansible_managed — consistent with current practice but inconsistent with the ADR's stated intent.)", "suggested_fix": "Add a commented '# {{ ansible_managed }}' header to both templates (and ideally adopt the convention repo-wide).", "tag": "new", "auto_fixable": false}, + {"id": "O27", "dimension": "consistency", "severity": "low", "location": "inventories/production/group_vars/all/reverse_proxy.yml", "description": "reverse_proxy production vars live in group_vars/all/ (every host) though the role only runs on offsite_hosts via offsite.yml; CLAUDE.md establishes an offsite_hosts/ group_vars dir for askari-specific config, which doesn't exist on disk. Harmless today (only askari imports the role) but broader scope than intended.", "suggested_fix": "Consider moving reverse_proxy.yml (and the offsite firewall opens) to group_vars/offsite_hosts/ for scope clarity, or leave if intentionally global. Judgement call.", "tag": "new", "auto_fixable": false}, + {"id": "O28", "dimension": "drift", "severity": "low", "location": "scripts/capacity-scan.py:133", "description": "capacity-scan.py cross-checks workload hostnames only against inventories/<env>/hosts.yml. askari lives in inventories/production/offsite.yml, not hosts.yml, so the drift cross-check never sees it. Minor (capacity is intent-based today) but a latent gap as offsite hosts grow.", "suggested_fix": "Also read offsite.yml (or glob inventories/<env>/*.yml host files) so offsite_hosts are included.", "tag": "new", "auto_fixable": false}, + {"id": "O29", "dimension": "consistency", "severity": "low", "location": "inventories/production/offsite.yml:1-16 ↔ inventories/production/hosts.yml:7-16", "description": "offsite.yml (generated by tf-inventory-offsite) re-declares control/docker_hosts/proxmox_hosts with empty host maps because tf_to_inventory.py always emits all four VALID_GROUPS — duplicating groups in hosts.yml in the same inventory dir. 
Ansible merges them harmlessly, but the duplication/merge is undocumented.", "suggested_fix": "Document in inventories/README.md that offsite.yml is a second generated inventory file merged with hosts.yml, or have tf_to_inventory.py emit only non-empty groups for offsite. Leave as-is if intended; just document.", "tag": "new", "auto_fixable": false} + ], + "prior_resolved": [ + {"id": "O1@2026-06-11", "description": "make lint RED on main (site.yml imported nonexistent docker_host role)", "status": "resolved — docker_host scaffolded (03d33f8) then built (456c27d); make lint green this run."}, + {"id": "O10@2026-06-11", "description": "README ADR list stopped early (recurring)", "status": "resolved — auto-fixed this run (AF11), extended through ADR-024."}, + {"id": "O17@2026-06-11", "description": "empty handlers/main.yml scaffold artifacts in base/dev_env", "status": "resolved (accepted) — treated as an intentional make new-role scaffold convention; not re-raised."}, + {"id": "O2,O3,O4,O5,O6,O7,O8,O9,O11,O12,O13,O14,O15,O16,O18@2026-06-11", "description": "ADR-004 backup scope; ADR-004 ACCESS/BACKUP table; CAPABILITIES nvim/tmux; ADR-002 upgrade caveat; hosts.yml offsite_hosts; new-host Part E; dev_env set_fact tag; ubongo subnet; ADR section order; ADR-007 example; .zshrc rclone/direnv; oh_my_posh config tag; tfvars pve01; See-also vs Related; docs/inventories README narrowness", "status": "still open — carried forward as O2,O5,O7,O6,O9,O15,O8,O10,O4,O18/O19,O16,O17,O11,O13,O14 respectively (renumbered)."} + ] +} diff --git a/docs/reviews/2026-06-14-review.md b/docs/reviews/2026-06-14-review.md new file mode 100644 index 0000000..0fc6adf --- /dev/null +++ b/docs/reviews/2026-06-14-review.md @@ -0,0 +1,157 @@ +# Repo review — 2026-06-14 + +- **Reviewed commit:** `e346137` (docs(plan): M4b — NetBird coordinator service role) +- **Mode:** on-demand (interactive — auto-fixes applied + committed) +- **Previous run:** 2026-06-11 (`67f2aba`) +- **`make lint`:** green 
before and after fixes (260 files, profile production; check-tags OK). + +## Summary + +A lot shipped since the last review (M4a: `docker_host` Docker engine, `reverse_proxy` +Caddy applied to askari; offsite Terraform env live; ADR-024). Most findings this run are +the predictable **docs-lagging-the-build** kind — stale "not built yet" notes, a +reverse-proxy that switched from DNS-01/custom-image to vanilla HTTP-01 leaving stale +descriptions behind, and the **Traefik→Caddy** rename only half-propagated through the +ADR set. The previous run's blocker (O1, `make lint` RED) is **resolved**. + +### Counts + +| Dimension | High | Medium | Low | Total | +|---|---|---|---|---| +| Cruft / staleness | 0 | 0 | 0 | 0 | +| Design conformance | 1 | 2 | 0 | 3 | +| Consistency & intent | 2 | 3 | 12 | 17 | +| Docs-vs-reality drift | 1 | 5 | 3 | 9 | +| **Open total** | **4** | **10** | **15** | **29** | + +Plus **11 auto-fixes applied** (2 high, 4 medium, 5 low). + +### Phase-0 scan + +`repo-scan.py`: 5 roles, 25 ADRs · broken-adr-ref=4, broken-path-ref=2, marker=14, +open-deferred-item=5, **stale-deferred=0**. Every scan finding is a known false-positive +(test fixtures ADR-099/100; the `roles/netbird/` references in the M4b *plan* for unbuilt +work; superpowers planning artifacts; `019-tagging.md:14` is prose about "over-tagging", +not a TODO). Details in the findings JSON. + +### Deferral checklist + +All 5 ADR-011 "Open questions" (Proxmox snapshot driver, exact cadences, health-check +harness home, classification home, staging-first) confirmed **genuinely still open** — +ADR-011 is still Proposed/unbuilt, the same questions sit open in `docs/TODO.md` item 16, +and no later ADR or STATUS decides any of them. **No stale-deferred** (same as last run). + +## Auto-fixes applied + +All safe/obvious (stale text contradicting code/reality, partial enumerations, broken +descriptions) — no logic, variable, secret, or task-order changes. 
+ +| ID | Sev | File | What | +|---|---|---|---| +| AF1 | high | `roles/reverse_proxy/meta/main.yml` | description still said DNS-01 + custom on-host image → rewrote to vanilla Caddy + HTTP-01 (matches the role since b7e919d) | +| AF2 | med | `roles/README.md` | base hardening + docker_host/reverse_proxy/public_dns build-state was stale → reconciled with STATUS | +| AF3 | med | `playbooks/README.md` | stale "docker_host has no tasks" note; added missing `dns.yml` + `offsite.yml` bullets | +| AF4 | low | `roles/public_dns/README.md` | "askari in M4" → askari + `*.askari` records applied in M4a | +| AF5 | low | `scripts/README.md` | added the missing `check-tags.py` entry (run by `make lint`) | +| AF6 | med | `terraform/README.md` | added `modules/hetzner_vm` + `environments/offsite` (the one applied env) | +| AF7 | low | `terraform/environments/offsite/providers.tf` | verified-stamp `cax11@hel1` → `cx23@hel1` (actual server) | +| AF8 | low | `terraform/modules/hetzner_vm/variables.tf` | `server_type` example `cax11 (ARM)` → `cx23 (x86) or cax11 (ARM)` | +| AF9 | med | `inventories/production/group_vars/all/public_dns.yml` | wildcard comment "cert via DNS-01" → ACME HTTP-01 (M4a) | +| AF10 | high | `docs/CAPABILITIES.md` | reverse-proxy candidate `Traefik` → `Caddy (ADR-024)`; public DNS "apply pending" → "applied (M1)" | +| AF11 | low | `README.md` | Documentation ADR list extended ADR-017 → ADR-024 | + +## Open findings (prioritised) + +### High + +- **O1 — drift — STATUS.md:41 (+45-48) ↔ 33-34** *(new)*: docker_host still appears in + the "Scaffolded but empty — NOT implemented" table as a no-op, contradicting its own + "Built + applied" rows and the real tasks file. Reword the scaffold row + closing + paragraph (left for the operator — STATUS is the ground-truth doc). +- **O2 — consistency — ADR-004:105,131 ↔ ADR-022** *(recurring)*: ADR-004 says backup is + "not in scope of this repo"; ADR-022 defines a full in-repo backup doctrine. 
Repoint + ADR-004 at ADR-022 (ADR↔ADR design decision — report). +- **O3 — consistency — ADR-024 Consequences ↔ ADR-008:70/017:27,88/019:52** *(new)*: + ADR-024 claims it updated ADR-017's Traefik prose to Caddy; it didn't, and ADR-008/019 + still say Traefik too. Either finish the rename or soften ADR-024's claim. +- **O4 — conformance — ADR-023:7-8,77-80 ↔ ADR-016/017/018** *(recurring)*: ADR-023 + claims ADRs 001–018 were restructured to lead with `## Status`, but 016/017/018 still + open with `## Context` and bury Status. Fix the three ADRs or correct ADR-023 §6. + +### Medium + +- **O5 — ADR-004:48-50** *(recurring)*: service-role file table omits ACCESS.md + + BACKUP.md rows (now mandated by CLAUDE.md/ADR-021/022). +- **O6 — ADR-002:82** *(recurring)*: `make deploy PLAYBOOK=upgrade` cited as real, but no + `upgrade.yml` exists and ADR-011 is unbuilt — needs a `(planned)` caveat. +- **O7 — CAPABILITIES:150-155 ↔ STATUS:29** *(recurring)*: nvim/tmux listed as a + "confirmed exclusion" while `dev_env` installs them on ubongo; needs a control-host + carve-out (not a token swap, so left from AF10). +- **O8 — dev_env tasks (include_tasks + per_user.yml:4-9)** *(recurring)*: untagged + `set_fact dev_env__home` preflight + include without `apply: tags:`; a partial + `--tags users|config` run breaks (base guards this; dev_env doesn't). +- **O9 — inventories/production/hosts.yml** *(recurring)*: header claims TF-generated but + it's hand-maintained (carries ubongo, omits offsite_hosts); `tf-inventory` would drop + ubongo. Make the header honest. +- **O10 — group_vars/all/vars.yml:42 ↔ ADR-007** *(recurring)*: ubongo `10.20.10.151` is + in no ADR-007 subnet and undocumented; `base__firewall_control_addr` depends on it. +- **O11 — terraform tfvars.example (both envs)** *(recurring)*: `pve01` vs ADR-007's + `pve0`; verify the real node name before changing. 
+- **O12 — roles/reverse_proxy/** *(new)*: first built+applied service role, but missing + SECURITY/VERIFY/ACCESS/BACKUP.md. (Recorded judgement: public_dns is exempt — control- + node external-API role, not a host service.) +- **O15 — runbooks/new-host.md Part E** *(recurring)*: still describes an `ansible` user + on ubongo; STATUS says ubongo is managed as `sjat` (ansible-user bootstrap pending). +- **O18 — ADR-007/009/016 internal-zone name** *(new)*: `boma.baobab.band` vs target + `boma.wingu.me` used inconsistently across the doc set after M1; state the transition + in one place. + +### Low + +O13 (See-also vs `## Related` in ADR-012/013/015/016/017/018 — recurring), O14 +(docs/README + inventories/README narrow enumerations — recurring), O16 (.zshrc rclone +alias + unguarded direnv hook — recurring), O17 (oh_my_posh zen.toml tasks missing +`config` tag — recurring), O19 (ADR-009:122 `nyumbani` example after retirement — +recurring), O20 (ROADMAP M2 CAX11/ARM vs cx23/x86 — new), O21 (ADR-020 "ports will be +added in M4" stale; already opened in M4a — new), O22 (ADR-024 body still asserts custom- +image obligation contradicting its revised Status — new), O23 (`netbird_coordinator` vs +`netbird` role name across ADRs/ROADMAP/plan — new), O24 (`*.boma.` vs +`*.` wildcard scope ADR-024 vs ROADMAP — new), O25 (`tags: [verify]` out of +the ADR-019 vocabulary in molecule verify — new), O26 (reverse_proxy templates lack +`ansible_managed` header — new), O27 (reverse_proxy vars in `group_vars/all/` not +`offsite_hosts/` — new), O28 (capacity-scan.py ignores `offsite.yml` — new), O29 +(offsite.yml duplicates empty groups from hosts.yml, undocumented merge — new). + +Full detail + suggested fixes in `2026-06-14-findings.json`. + +## Themes worth a deliberate pass + +1. **Finish the Traefik→Caddy rename** (O3, and ADR-024 over-claimed it was done). One + sweep across ADR-008/017/019 closes it. +2. 
**STATUS docker_host self-contradiction** (O1) — quick, but it's the ground-truth doc. +3. **ADR-024 internal consistency** (O22) — the role went vanilla/HTTP-01 but the ADR + body still mandates the custom image; reconcile §2/§3/Consequences with its own Status. +4. **dev_env tag-isolation** (O8) — the one real conformance bug with runtime impact; + mirror base's `apply: tags:` guard. +5. **First service-role doc quartet** (O12) — reverse_proxy is the template for every + future service role; getting SECURITY/VERIFY/ACCESS/BACKUP.md right now pays forward. + +## Follow-up prompt + +> Work the open findings from `docs/reviews/2026-06-14-review.md`. Priority order: +> (1) **O1** — fix the STATUS.md docker_host contradiction (it's built+applied, not a +> no-op; reword the "Scaffolded but empty" row + the 45-48 paragraph). +> (2) **O3 + O22** — finish the Traefik→Caddy rename in ADR-008:70, ADR-017:27,88, +> ADR-019:52, and reconcile ADR-024's body (§2 custom image, §3 NetBird, Consequences) +> with its own revised HTTP-01 Status note. +> (3) **O2 + O5** — repoint ADR-004's "backup not in scope" line at ADR-022 and add +> ACCESS.md + BACKUP.md rows to its service-role file table. +> (4) **O8** — add `apply: tags: [users, config]` to dev_env's per_user.yml include and +> tag the `dev_env__home` set_fact `always`; add a Molecule assertion that a partial +> `--tags config` run still resolves the home dir. +> (5) **O12** — author the four service-role doc files for `roles/reverse_proxy/` from the +> templates (BACKUP.md = `backup__state: false`, re-issuable certs). +> (6) **O4** — restructure ADR-016/017/018 to lead with `## Status`, or correct ADR-023 §6. +> Then the medium drift items (O6 upgrade caveat, O7 nvim/tmux carve-out, O9 hosts.yml +> header, O15 new-host Part E, O18 internal-zone naming). Run `make lint` after each +> batch; commit per CLAUDE.md git conventions. 
diff --git a/docs/reviews/latest.md b/docs/reviews/latest.md index fe0d268..0fc6adf 100644 --- a/docs/reviews/latest.md +++ b/docs/reviews/latest.md @@ -1,161 +1,157 @@ -# Repo review — 2026-06-11 +# Repo review — 2026-06-14 -- **Reviewed commit:** `67f2aba` (main) -- **Mode:** on-demand (interactive) -- **Previous run:** `2026-06-05` (commit `f566fd1`) -- **Process:** Phase 0 deterministic scan → 5 parallel shard reviewers + 1 cross-cutting - reviewer → synthesis, deferral-checklist resolution, prior-run diff → safe auto-fixes. +- **Reviewed commit:** `e346137` (docs(plan): M4b — NetBird coordinator service role) +- **Mode:** on-demand (interactive — auto-fixes applied + committed) +- **Previous run:** 2026-06-11 (`67f2aba`) +- **`make lint`:** green before and after fixes (260 files, profile production; check-tags OK). ## Summary -| | High | Medium | Low | Total | +A lot shipped since the last review (M4a: `docker_host` Docker engine, `reverse_proxy` +Caddy applied to askari; offsite Terraform env live; ADR-024). Most findings this run are +the predictable **docs-lagging-the-build** kind — stale "not built yet" notes, a +reverse-proxy that switched from DNS-01/custom-image to vanilla HTTP-01 leaving stale +descriptions behind, and the **Traefik→Caddy** rename only half-propagated through the +ADR set. The previous run's blocker (O1, `make lint` RED) is **resolved**. + +### Counts + +| Dimension | High | Medium | Low | Total | |---|---|---|---|---| -| **Auto-fixed** | 1 | 2 | 2 | 5 | -| **Open (report-only)** | 2 | 7 | 9 | 18 | +| Cruft / staleness | 0 | 0 | 0 | 0 | +| Design conformance | 1 | 2 | 2 | 5 | +| Consistency & intent | 2 | 2 | 9 | 13 | +| Docs-vs-reality drift | 1 | 4 | 5 | 10 | +| **Open total** | **4** | **8** | **16** | **29** | -By dimension (open): conformance 3 · consistency 8 · drift 6 · cruft 1. +Plus **11 auto-fixes applied** (2 high, 4 medium, 5 low). 
-**Headline:** `make lint` is currently **red on `main`** — `playbooks/site.yml` imports the -not-yet-existent `docker_host` role (confirmed at clean HEAD, unrelated to this run's -edits). That breaks CLAUDE.md's "main must always work" / "Never skip lint" contract and -is the top open finding (O1). The bulk of the rest is documentation drift created by the -recent `base` (firewall) + `dev_env` build wave: several READMEs/playbook notes still -described the roles as "empty / not built." Those were the safe auto-fixes. +### Phase-0 scan -**Good news:** 7 of the 12 open findings from the 2026-06-05 run are confirmed resolved -(VERIFY.md row + runbook step, backend.tf relabel, askari group naming, ADR-014 -reproducibility, CAPABILITIES Level-4 row, TODO 3.10). The deferral checklist is clean — -**0 stale-deferred** this run (the recurring miss logged in FRICTION.md did not recur). +`repo-scan.py`: 5 roles, 25 ADRs · broken-adr-ref=4, broken-path-ref=2, marker=14, +open-deferred-item=5, **stale-deferred=0**. Every scan finding is a known false-positive +(test fixtures ADR-099/100; the `roles/netbird/` references in the M4b *plan* for unbuilt +work; superpowers planning artifacts; `019-tagging.md:14` is prose about "over-tagging", +not a TODO). Details in the findings JSON. + +### Deferral checklist + +All 5 ADR-011 "Open questions" (Proxmox snapshot driver, exact cadences, health-check +harness home, classification home, staging-first) confirmed **genuinely still open** — +ADR-011 is still Proposed/unbuilt, the same questions sit open in `docs/TODO.md` item 16, +and no later ADR or STATUS decides any of them. **No stale-deferred** (same as last run). ## Auto-fixes applied -Markdown / YAML-comment only; no runtime behaviour, logic, vars, or task order touched. +All safe/obvious (stale text contradicting code/reality, partial enumerations, broken +descriptions) — no logic, variable, secret, or task-order changes. 
-| ID | Sev | File(s) | What | +| ID | Sev | File | What | |---|---|---|---| -| AF1 | high | `roles/README.md` | Rewrote stale "base & docker_host are empty untracked dirs, site.yml would fail on a clean clone" → base partially built (firewall), docker_host not yet created, dev_env built+applied. | -| AF2 | med | `playbooks/site.yml` | NOTE no longer claims base is unbuilt / "fails on a clean clone"; now reflects firewall-only base + missing docker_host. | -| AF3 | med | `playbooks/README.md` | Dropped the "currently a no-op" claim; added a `workstation.yml` bullet. | -| AF4 | low | `README.md` | Added `docs/access/`, `docs/backup/`, `roles/dev_env/`, `playbooks/workstation.yml` to the project-structure tree. | -| AF5 | low | `docs/decisions/016-mesh-vpn.md`, `docs/decisions/020-firewall.md` | Added the reciprocal `ADR-021` cross-reference that ADR-021 says it amended in. | - -> `make lint` was re-run after the fixes: it fails **only** on the pre-existing -> `docker_host` syntax-check (O1), identical to clean HEAD. No auto-fix introduced or -> changed any lint result, so none were reverted. 
+| AF1 | high | `roles/reverse_proxy/meta/main.yml` | description still said DNS-01 + custom on-host image → rewrote to vanilla Caddy + HTTP-01 (matches the role since b7e919d) | +| AF2 | med | `roles/README.md` | base hardening + docker_host/reverse_proxy/public_dns build-state was stale → reconciled with STATUS | +| AF3 | med | `playbooks/README.md` | stale "docker_host has no tasks" note; added missing `dns.yml` + `offsite.yml` bullets | +| AF4 | low | `roles/public_dns/README.md` | "askari in M4" → askari + `*.askari` records applied in M4a | +| AF5 | low | `scripts/README.md` | added the missing `check-tags.py` entry (run by `make lint`) | +| AF6 | med | `terraform/README.md` | added `modules/hetzner_vm` + `environments/offsite` (the one applied env) | +| AF7 | low | `terraform/environments/offsite/providers.tf` | verified-stamp `cax11@hel1` → `cx23@hel1` (actual server) | +| AF8 | low | `terraform/modules/hetzner_vm/variables.tf` | `server_type` example `cax11 (ARM)` → `cx23 (x86) or cax11 (ARM)` | +| AF9 | med | `inventories/production/group_vars/all/public_dns.yml` | wildcard comment "cert via DNS-01" → ACME HTTP-01 (M4a) | +| AF10 | high | `docs/CAPABILITIES.md` | reverse-proxy candidate `Traefik` → `Caddy (ADR-024)`; public DNS "apply pending" → "applied (M1)" | +| AF11 | low | `README.md` | Documentation ADR list extended ADR-017 → ADR-024 | ## Open findings (prioritised) ### High -- **O1 — `make lint` is red on `main`** · `playbooks/site.yml:18` · *conformance* - site.yml imports the `docker_host` role, which does not exist, so ansible-lint's - syntax-check fails on a clean checkout. Violates "main must always work" + "Never skip - lint" (pre-commit would block every commit unless bypassed). - *Fix (judgement):* guard/skip the docker_host play until the role exists, scaffold a - stub via `make new-role NAME=docker_host`, or exclude site.yml from syntax-check until - built — and record the choice. 
**new** - -- **O2 — ADR-004 ↔ ADR-022 backup-scope contradiction** · - `docs/decisions/004-docker-model.md:105` · *consistency* - ADR-004 says "Backup strategy is defined separately (not in scope of this repo)"; - ADR-022 defines a full in-repo backup strategy. Per ADR-023 (no silent reversals), - update ADR-004's line to defer to ADR-022 and cross-link. Design decision — report. **new** +- **O1 — drift — STATUS.md:41 (+45-48) ↔ 33-34** *(new)*: docker_host still appears in + the "Scaffolded but empty — NOT implemented" table as a no-op, contradicting its own + "Built + applied" rows and the real tasks file. Reword the scaffold row + closing + paragraph (left for the operator — STATUS is the ground-truth doc). +- **O2 — consistency — ADR-004:105,131 ↔ ADR-022** *(recurring)*: ADR-004 says backup is + "not in scope of this repo"; ADR-022 defines a full in-repo backup doctrine. Repoint + ADR-004 at ADR-022 (ADR↔ADR design decision — report). +- **O3 — consistency — ADR-024 Consequences ↔ ADR-008:70/017:27,88/019:52** *(new)*: + ADR-024 claims it updated ADR-017's Traefik prose to Caddy; it didn't, and ADR-008/019 + still say Traefik too. Either finish the rename or soften ADR-024's claim. +- **O4 — conformance — ADR-023:7-8,77-80 ↔ ADR-016/017/018** *(recurring)*: ADR-023 + claims ADRs 001–018 were restructured to lead with `## Status`, but 016/017/018 still + open with `## Context` and bury Status. Fix the three ADRs or correct ADR-023 §6. ### Medium -- **O3 — ADR-004 service-role file table missing ACCESS.md + BACKUP.md** · - `docs/decisions/004-docker-model.md:48` · *consistency* — CLAUDE.md + ADR-021/022 now - mandate both for service roles; the canonical table lists only SECURITY.md + VERIFY.md. - (Prior "missing VERIFY.md" is resolved; this is the next evolution.) 
**new** -- **O4 — CAPABILITIES nvim/tmux exclusion ↔ dev_env built** · - `docs/CAPABILITIES.md:149` · *consistency* — listed as a confirmed exclusion - ("server-only"), but `dev_env` (built+applied to ubongo) installs exactly that. Carve - out the control-node/AI-worker exception (ADR-015). **new** -- **O5 — phantom `make deploy PLAYBOOK=upgrade`** · `docs/decisions/002-security.md:82` · - *drift* — no `upgrade.yml` exists; ADR-011 is unbuilt. Add a "(planned)" caveat. **new** -- **O6 — hosts.yml stubs missing `offsite_hosts` group** · - `inventories/{production,staging}/hosts.yml` · *drift* — the generator emits it (one of - four VALID_GROUPS); the hand-stubs predate the standard. Regenerate via - `make tf-inventory` (don't hand-edit). (Prior "askari group unnamed" is resolved.) **new** -- **O7 — new-host runbook Part E vs ubongo reality** · `docs/runbooks/new-host.md:81-130` - · *drift* — instructs creating an `ansible` user / `ssh ansible@`; STATUS records ubongo - is managed as `sjat`, ansible-user bootstrap pending. **new** -- **O8 — dev_env untagged `set_fact` under tagged consumers** · - `roles/dev_env/tasks/per_user.yml:2-9` · *conformance* — partial `--tags users|config` - runs skip the `dev_env__home` set_fact and fail. Tag the preflight `[users, config]` or - `always`. **new** -- **O9 — ubongo address outside ADR-007 subnets** · `STATUS.md:31 ↔ 007-network.md` · - *drift* — 10.20.10.151 is in neither srv (10.20.0.0/24) nor mgmt (10.10.0.0/24); - `base__firewall_control_addr` depends on it. Already a tracked follow-up in the - ubongo-build plan. Reconcile address or ADR-007. **new** +- **O5 — ADR-004:48-50** *(recurring)*: service-role file table omits ACCESS.md + + BACKUP.md rows (now mandated by CLAUDE.md/ADR-021/022). +- **O6 — ADR-002:82** *(recurring)*: `make deploy PLAYBOOK=upgrade` cited as real, but no + `upgrade.yml` exists and ADR-011 is unbuilt — needs a `(planned)` caveat. 
+- **O7 — CAPABILITIES:150-155 ↔ STATUS:29** *(recurring)*: nvim/tmux listed as a + "confirmed exclusion" while `dev_env` installs them on ubongo; needs a control-host + carve-out (not a token swap, so left from AF10). +- **O8 — dev_env tasks (include_tasks + per_user.yml:4-9)** *(recurring)*: untagged + `set_fact dev_env__home` preflight + include without `apply: tags:`; a partial + `--tags users|config` run breaks (base guards this; dev_env doesn't). +- **O9 — inventories/production/hosts.yml** *(recurring)*: header claims TF-generated but + it's hand-maintained (carries ubongo, omits offsite_hosts); `tf-inventory` would drop + ubongo. Make the header honest. +- **O10 — group_vars/all/vars.yml:42 ↔ ADR-007** *(recurring)*: ubongo `10.20.10.151` is + in no ADR-007 subnet and undocumented; `base__firewall_control_addr` depends on it. +- **O11 — terraform tfvars.example (both envs)** *(recurring)*: `pve01` vs ADR-007's + `pve0`; verify the real node name before changing. +- **O12 — roles/reverse_proxy/** *(new)*: first built+applied service role, but missing + SECURITY/VERIFY/ACCESS/BACKUP.md. (Recorded judgement: public_dns is exempt — control- + node external-API role, not a host service.) +- **O15 — runbooks/new-host.md Part E** *(recurring)*: still describes an `ansible` user + on ubongo; STATUS says ubongo is managed as `sjat` (ansible-user bootstrap pending). +- **O18 — ADR-007/009/016 internal-zone name** *(new)*: `boma.baobab.band` vs target + `boma.wingu.me` used inconsistently across the doc set after M1; state the transition + in one place. ### Low -- **O10 — README ADR list stops at 017** · `README.md:104` · *drift* — 018–023 exist; - extend or trim to a pointer. **recurring** (evolved from prior O3) -- **O11 — ADR section-order vs ADR-023 §2** · `008:3, 014:98, 016:91, 017:66, 018:73` · - *conformance* — Status-not-first / Decision-late; passes lint (order not gated) but not - the standard. Presentational restructure. 
**new** -- **O12 — ADR-007 FQDN convention vs its own example** · `007-network.md:160` · - *consistency* — `.baobab.band` vs `forgejo.nyumbani.baobab.band`; ties to open - TODO 4 (split-horizon). **new** -- **O13 — dev_env `.zshrc` heritage carryovers** · - `roles/dev_env/files/dotfiles/zsh/.zshrc:28,55` · *consistency* — hard-coded - `/usr/bin/rclone` alias (not installed by the role) + unguarded `direnv` hook. **new** -- **O14 — oh_my_posh config tasks untagged** · `roles/dev_env/tasks/oh_my_posh.yml:15-26` - · *consistency* — inconsistent `config` tagging vs per_user.yml. **new** -- **O15 — tfvars.example `pve01` vs ADR-007 `pve0`** · - `terraform/environments/*/terraform.tfvars.example:9` · *consistency* — verify the real - node name, then align. **new** -- **O16 — ADR-013/015 "See also:" vs `## Related`** · *consistency* — stylistic; convert - for uniformity. **new** -- **O17 — empty scaffold `handlers/main.yml`** · `roles/{dev_env,base}/handlers/main.yml` - · *cruft* — confirm convention or delete. **new** -- **O18 — docs/README.md + inventories/README.md narrower than reality** · *consistency* - — omit several real subdirs / the offsite_hosts group. 
**new** +O13 (See-also vs `## Related` in ADR-012/013/015/016/017/018 — recurring), O14 +(docs/README + inventories/README narrow enumerations — recurring), O16 (.zshrc rclone +alias + unguarded direnv hook — recurring), O17 (oh_my_posh zen.toml tasks missing +`config` tag — recurring), O19 (ADR-009:122 `nyumbani` example after retirement — +recurring), O20 (ROADMAP M2 CAX11/ARM vs cx23/x86 — new), O21 (ADR-020 "ports will be +added in M4" stale; already opened in M4a — new), O22 (ADR-024 body still asserts custom- +image obligation contradicting its revised Status — new), O23 (`netbird_coordinator` vs +`netbird` role name across ADRs/ROADMAP/plan — new), O24 (`*.boma.` vs +`*.` wildcard scope ADR-024 vs ROADMAP — new), O25 (`tags: [verify]` out of +the ADR-019 vocabulary in molecule verify — new), O26 (reverse_proxy templates lack +`ansible_managed` header — new), O27 (reverse_proxy vars in `group_vars/all/` not +`offsite_hosts/` — new), O28 (capacity-scan.py ignores `offsite.yml` — new), O29 +(offsite.yml duplicates empty groups from hosts.yml, undocumented merge — new). -## Deferral checklist (Phase 2) +Full detail + suggested fixes in `2026-06-14-findings.json`. -| Source | Items | Verdict | -|---|---|---| -| ADR-011 Deferred/Open | 5 (snapshot driver, cadences, health-check harness home, classification home, staging-first) | **All genuinely still open** — cross-checked against later ADRs + TODO 16. None silently resolved. | -| ADR-015 Deferred | #1 mesh VPN, #2 service-UI, #3 build | **All marked RESOLVED in place** (ADR-016 / ADR-017 / 2026-06-11 build). | +## Themes worth a deliberate pass -**Stale-deferred found: 0.** The recurring FRICTION.md miss did not recur this run. +1. **Finish the Traefik→Caddy rename** (O3, and ADR-024 over-claimed it was done). One + sweep across ADR-008/017/019 closes it. +2. **STATUS docker_host self-contradiction** (O1) — quick, but it's the ground-truth doc. +3. 
**ADR-024 internal consistency** (O22) — the role went vanilla/HTTP-01 but the ADR + body still mandates the custom image; reconcile §2/§3/Consequences with its own Status. +4. **dev_env tag-isolation** (O8) — the one real conformance bug with runtime impact; + mirror base's `apply: tags:` guard. +5. **First service-role doc quartet** (O12) — reverse_proxy is the template for every + future service role; getting SECURITY/VERIFY/ACCESS/BACKUP.md right now pays forward. -## Scan false positives (folded in, not actionable) +## Follow-up prompt -- `broken-path-ref STATUS.md:38` — STATUS legitimately documents `roles/docker_host/` as - "Not in git." (intentional reference to an unbuilt role). -- `broken-adr-ref` ×4 — `ADR-099`/`ADR-100` in `tests/test_repo_scan.py` and the - adr-structure plan are intentional **test fixtures** for the scanner's bad-ref check. -- `marker` ×14 — all in `docs/superpowers/{plans,specs}/*` (historical commit-message - TODOs / plan steps) or prose discussing "over-tagging" as a concept. Not cruft. - -## Prior-run diff (vs 2026-06-05) - -**Resolved (7):** O1 VERIFY.md row · O2 new-role VERIFY step · O4 askari group naming · -O5 backend.tf relabel · O6 ADR-014 reproducibility · O11 CAPABILITIES Level-4 row · -O12 TODO 3.10. **Partial:** O3 (docs tree fixed in AF4; ADR-list carried as O10). -**Not re-detected (verify next run):** O7–O10 (ADR-011 still Proposed). - -## Follow-up prompt (copy-paste) - -> Act on the open findings from `docs/reviews/2026-06-11-review.md`. Priority order: -> 1. **O1 (high):** `make lint` is red on `main` — `playbooks/site.yml` imports the -> non-existent `docker_host` role. Pick an interim posture (guard/skip the play, or -> `make new-role NAME=docker_host` to scaffold a stub, or exclude from syntax-check -> until built) so the trunk lints clean again, and record the choice in STATUS.md. -> 2. 
**O2 (high):** Resolve the ADR-004 ↔ ADR-022 backup-scope contradiction — -> update ADR-004's "not in scope of this repo" line to defer to ADR-022 (per ADR-023's -> no-silent-reversal rule) and cross-link. -> 3. **O3:** Add ACCESS.md + BACKUP.md rows to ADR-004's service-role file table. -> 4. **O4:** Reconcile CAPABILITIES' nvim/tmux exclusion with the built `dev_env` role -> (carve out the ubongo control-node exception). -> 5. **O8 (conformance):** Tag the `dev_env__home` preflight `set_fact` so partial -> `--tags users|config` runs don't fail. -> 6. **O6 / O9:** Regenerate the inventory stubs to include `offsite_hosts`; reconcile -> ubongo's 10.20.10.151 against ADR-007's subnets (or amend ADR-007). -> 7. Sweep the low-severity doc items (O5 caveat, O7 runbook, O10 ADR list, O11 ADR -> section order, O12–O18) as a single docs-hygiene batch. -> Run `make lint` before committing; commit per CLAUDE.md git conventions. +> Work the open findings from `docs/reviews/2026-06-14-review.md`. Priority order: +> (1) **O1** — fix the STATUS.md docker_host contradiction (it's built+applied, not a +> no-op; reword the "Scaffolded but empty" row + the 45-48 paragraph). +> (2) **O3 + O22** — finish the Traefik→Caddy rename in ADR-008:70, ADR-017:27,88, +> ADR-019:52, and reconcile ADR-024's body (§2 custom image, §3 NetBird, Consequences) +> with its own revised HTTP-01 Status note. +> (3) **O2 + O5** — repoint ADR-004's "backup not in scope" line at ADR-022 and add +> ACCESS.md + BACKUP.md rows to its service-role file table. +> (4) **O8** — add `apply: tags: [users, config]` to dev_env's per_user.yml include and +> tag the `dev_env__home` set_fact `always`; add a Molecule assertion that a partial +> `--tags config` run still resolves the home dir. +> (5) **O12** — author the four service-role doc files for `roles/reverse_proxy/` from the +> templates (BACKUP.md = `backup__state: false`, re-issuable certs). 
+> (6) **O4** — restructure ADR-016/017/018 to lead with `## Status`, or correct ADR-023 §6. +> Then the medium drift items (O6 upgrade caveat, O7 nvim/tmux carve-out, O9 hosts.yml +> header, O15 new-host Part E, O18 internal-zone naming). Run `make lint` after each +> batch; commit per CLAUDE.md git conventions. diff --git a/inventories/production/group_vars/all/public_dns.yml b/inventories/production/group_vars/all/public_dns.yml index 5d9ce02..0366d08 100644 --- a/inventories/production/group_vars/all/public_dns.yml +++ b/inventories/production/group_vars/all/public_dns.yml @@ -13,8 +13,8 @@ public_dns__records: # askari (off-site host, TF-provisioned M2) — public A so it's reachable by name + # for future ACME on *.askari.wingu.me. Mesh/LAN-only home services never appear here. - {record: askari, type: A, values: ["77.42.120.136"], ttl: 1800} - # Wildcard for askari's services (test/netbird/...) → same host; Caddy gets a - # *.askari.wingu.me cert via DNS-01 (M4a). + # Wildcard for askari's services (test/netbird/...) → same host; Caddy gets + # per-host certs via ACME HTTP-01 (M4a). - {record: "*.askari", type: A, values: ["77.42.120.136"], ttl: 1800} # Absent — Gandi's auto-seeded defaults we don't want (purged once, idempotent thereafter). diff --git a/playbooks/README.md b/playbooks/README.md index 096a6b0..171ba88 100644 --- a/playbooks/README.md +++ b/playbooks/README.md @@ -4,10 +4,15 @@ Top-level orchestration playbooks. No inline vars — configuration comes from `group_vars/` / `host_vars/` (see CLAUDE.md). - `site.yml` — full standard state: applies `base` to all hosts and `docker_host` - to docker hosts. **Note:** `base` is only partially built (its `firewall` concern) - and `docker_host` is scaffolded with no tasks yet, so this is incomplete — see `STATUS.md`. + to docker hosts. **Note:** `base` is only partially built (its `firewall` + + `hardening` concerns) and the cluster has no docker hosts yet, so this is + incomplete — see `STATUS.md`. 
- `workstation.yml` — applies the `dev_env` role (interactive developer environment) to the `control` group; built and applied to `ubongo` (see `STATUS.md`). +- `dns.yml` — manages the public DNS zone (wingu.me) at Gandi LiveDNS via the + `public_dns` role; runs from the control node against an external API. +- `offsite.yml` — off-site hosts (`askari`): `docker_host` (Docker engine) + + `reverse_proxy` (Caddy). NetBird coordinator appended in M4b. - `bootstrap.yml` — first-run setup for a host that may not have Python yet; self-contained (does not depend on the roles). diff --git a/roles/README.md b/roles/README.md index e75ce7d..e378820 100644 --- a/roles/README.md +++ b/roles/README.md @@ -8,8 +8,9 @@ Each role must have: a `molecule/default/` scenario (Debian 13), a populated `README.md`, and a filled-in `meta/main.yml`. Conventions: CLAUDE.md and `docs/runbooks/new-role.md`. -Current state: `base` is **partially built** — its `firewall` concern (nftables) is -implemented and tested; the other concerns (SSH hardening, fail2ban, auditd, packages, -users) are not yet built. `docker_host` is **scaffolded but has no tasks yet**. `dev_env` (interactive -developer environment) is built and applied. See `STATUS.md` for the authoritative -breakdown. +Current state: `base` is **partially built** — its `firewall` (nftables) and +`hardening` (SSH key-only + fail2ban) concerns are implemented, tested, and the +hardening concern is applied to `askari`; the remaining concerns (auditd, packages, +users) are not yet built. `docker_host` (Docker engine + Compose), `reverse_proxy` +(Caddy), `public_dns` (Gandi), and `dev_env` are built. See `STATUS.md` for the +authoritative breakdown. diff --git a/roles/public_dns/README.md b/roles/public_dns/README.md index 9200951..b23d854 100644 --- a/roles/public_dns/README.md +++ b/roles/public_dns/README.md @@ -5,8 +5,8 @@ Manages boma's public DNS zone (**wingu.me**) at **Gandi LiveDNS** as code, via name on purpose. 
Run from the control node: `make check/deploy PLAYBOOK=dns`. Mesh/LAN-only by default — only deliberate public records live in the zone (the -anti-spoof baseline now; `askari` in M4). Everything else is reached over LAN/mesh and -never appears here. +anti-spoof baseline plus `askari.wingu.me` + the `*.askari` wildcard, applied in M4a). +Everything else is reached over LAN/mesh and never appears here. ## Data (in `group_vars/all/public_dns.yml`) diff --git a/roles/reverse_proxy/meta/main.yml b/roles/reverse_proxy/meta/main.yml index 2f5c7bb..713b184 100644 --- a/roles/reverse_proxy/meta/main.yml +++ b/roles/reverse_proxy/meta/main.yml @@ -2,8 +2,8 @@ galaxy_info: author: sjat description: >- - Caddy reverse proxy with ACME DNS-01 TLS via Gandi (ADR-024). Builds the - custom image on-host (caddy-dns/gandi) and manages it via Docker Compose. + Vanilla Caddy reverse proxy (ADR-024); TLS via ACME HTTP-01 for public + hosts. Routes from reverse_proxy__routes, managed via Docker Compose. license: MIT min_ansible_version: "2.17" platforms: diff --git a/scripts/README.md b/scripts/README.md index b61acaa..8be4d11 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -14,6 +14,9 @@ exception: `check-vault.py` is a vault tool that needs the ansible venv (PyYAML `rbw`. Wired as `vault_password_file` (ADR-002). - `check-vault-encrypted.sh` — pre-commit guard: fails if a `vault.yml` holds plaintext secrets. +- `check-tags.py` — enforces the closed tag vocabulary (`tests/tags.yml`) and that + each role import in a play carries its role-name tag. Invoked by `make lint`. See + **ADR-019**. - `repo-scan.py` — Phase-0 deterministic scan for `/review-repo` (markers, broken refs, unencrypted vaults, inventory). - `capacity-scan.py` — deterministic capacity facts for `/capacity-review`: parses diff --git a/terraform/README.md b/terraform/README.md index 0541049..0510f48 100644 --- a/terraform/README.md +++ b/terraform/README.md @@ -5,9 +5,13 @@ destroying Proxmox VMs. 
It writes no DNS records and configures nothing inside a VM; Ansible owns all of that. - `modules/proxmox_vm/` — reusable VM module (Proxmox only). -- `environments/{staging,production}/` — separate state per environment. Add a VM by - editing `local.vms` in that env's `main.tf`, then `make tf-plan` → `tf-apply` → - `tf-inventory`. +- `modules/hetzner_vm/` — reusable VM module (Hetzner Cloud: server + firewall + + SSH key + cloud-init). +- `environments/{staging,production}/` — separate state per environment (Proxmox). + Add a VM by editing `local.vms` in that env's `main.tf`, then `make tf-plan` → + `tf-apply` → `tf-inventory`. Not yet `terraform init`ed. +- `environments/offsite/` — the off-site Hetzner host (`askari`); the one + **applied** environment. Use `make tf-* TF_ENV=offsite` and `tf-inventory-offsite`. Rationale: **ADR-006**. Handoff to Ansible: **ADR-009**. Secrets via `TF_VAR_*` -only — never in `.tfvars`. Not yet `terraform init`ed — see `STATUS.md`. +only — never in `.tfvars`. See `STATUS.md` for what is provisioned. diff --git a/terraform/environments/offsite/providers.tf b/terraform/environments/offsite/providers.tf index 2837a18..0f83908 100644 --- a/terraform/environments/offsite/providers.tf +++ b/terraform/environments/offsite/providers.tf @@ -1,4 +1,4 @@ -# verified: hetznercloud/hcloud 1.65.0 · debian-13 image · cax11@hel1 · terraform-registry · 2026-06-14 +# verified: hetznercloud/hcloud 1.65.0 · debian-13 image · cx23@hel1 · terraform-registry · 2026-06-14 terraform { required_version = ">= 1.9" diff --git a/terraform/modules/hetzner_vm/variables.tf b/terraform/modules/hetzner_vm/variables.tf index faf92e2..15b3ba4 100644 --- a/terraform/modules/hetzner_vm/variables.tf +++ b/terraform/modules/hetzner_vm/variables.tf @@ -4,7 +4,7 @@ variable "name" { } variable "server_type" { - description = "Hetzner server type, e.g. cax11 (ARM)" + description = "Hetzner server type, e.g. cx23 (x86) or cax11 (ARM)" type = string }