Compare commits
21 commits
67f2aba9d8
...
3cb6436ad2
| Author | SHA1 | Date | |
|---|---|---|---|
| 3cb6436ad2 | |||
| f170ffd936 | |||
| e247af6e55 | |||
| a0a3e4d356 | |||
| bd84dd0213 | |||
| 9311968363 | |||
| 91ad629c02 | |||
| 70c302d7e5 | |||
| 6f5c7b2bfb | |||
| e96480692d | |||
| b131ee317e | |||
| 602550fdaa | |||
| 32d480efcf | |||
| 79f2315eee | |||
| 43e5a4aa53 | |||
| f7fac5f5e3 | |||
| 7a47dd9dec | |||
| be2679cc66 | |||
| 3cfcb1c2e9 | |||
| 03d33f83dd | |||
| 1da117d65b |
43 changed files with 2032 additions and 110 deletions
11
CLAUDE.md
11
CLAUDE.md
|
|
@ -33,6 +33,8 @@ Full design rationale: `docs/decisions/`
|
|||
| Scaffold a new role | `make new-role NAME=<name>` |
|
||||
| Review repo for drift/cruft | `/review-repo` (Claude command) |
|
||||
| Review hardware capacity | `/capacity-review` (Claude command) |
|
||||
| Edit the vault (nvim, auto re-encrypt) | `make edit-vault [VAULT=<path>]` |
|
||||
| Validate vault structure | `make check-vault [VAULT=<path>]` |
|
||||
| Encrypt a vault file | `make encrypt FILE=<path>` |
|
||||
| Decrypt a vault file | `make decrypt FILE=<path>` |
|
||||
| Install Python deps | `make setup` |
|
||||
|
|
@ -76,7 +78,13 @@ Full design rationale: `docs/decisions/`
|
|||
git commit** — the pre-commit ansible-lint hook decrypts `vault.yml`), run `rbw
|
||||
unlocked`; if it exits non-zero, ask the user to `rbw unlock` and wait rather than
|
||||
starting and failing partway. The agent stays unlocked 5h.
|
||||
- To edit a vault file: `make decrypt FILE=<path>`, edit, `make encrypt FILE=<path>`
|
||||
- To edit the vault: `make edit-vault` — decrypts → opens nvim → re-encrypts on `:wq`
|
||||
(abort with `:cq`), then `check-vault` validates structure. No plaintext lands in the
|
||||
work tree. Override the file with `VAULT=<path>`. (The lower-level `make decrypt`/
|
||||
`encrypt FILE=<path>` still exist for scripted/non-interactive edits.)
|
||||
- `make check-vault` validates the vault decrypts, is valid YAML, keeps secrets under the
|
||||
nested `vault:` map, and has no empty leaves — printing a structure view with values
|
||||
masked. Needs `rbw` unlocked.
|
||||
|
||||
---
|
||||
|
||||
|
|
@ -205,6 +213,7 @@ Single-contributor, trunk-based (no merge requests / approval gates):
|
|||
| Topic | File |
|
||||
|------------------------|---------------------------------------|
|
||||
| Architecture overview | `docs/decisions/001-architecture.md` |
|
||||
| Build order / roadmap | `docs/ROADMAP.md` |
|
||||
| Capabilities overview (what boma does) | `docs/CAPABILITIES.md` |
|
||||
| Security baseline & strategy | `docs/decisions/002-security.md` |
|
||||
| Accepted security risks | `docs/security/accepted-risks.md` |
|
||||
|
|
|
|||
17
Makefile
17
Makefile
|
|
@ -11,6 +11,8 @@ LINT := $(VENV)/bin/ansible-lint
|
|||
MOLECULE := $(VENV)/bin/molecule
|
||||
# Vault password is resolved via ansible.cfg (vault_password_file); no flag needed.
|
||||
VAULT_ARGS :=
|
||||
# Default vault file for edit-vault / check-vault (override with VAULT=<path>).
|
||||
VAULT ?= inventories/production/group_vars/all/vault.yml
|
||||
INVENTORY := -i inventories/production/hosts.yml
|
||||
|
||||
TF := terraform
|
||||
|
|
@ -20,7 +22,8 @@ MOLECULE_DOCKERFILE := .docker/molecule-debian13/Dockerfile
|
|||
|
||||
.DEFAULT_GOAL := help
|
||||
|
||||
.PHONY: help setup collections lint test test-all check deploy encrypt decrypt new-role \
|
||||
.PHONY: help setup collections lint test test-all check deploy encrypt decrypt \
|
||||
edit-vault check-vault new-role \
|
||||
tf-init tf-plan tf-apply tf-output tf-inventory \
|
||||
molecule-image molecule-image-push
|
||||
|
||||
|
|
@ -35,6 +38,8 @@ help:
|
|||
@echo " make test-all Run Molecule tests for all roles"
|
||||
@echo " make check PLAYBOOK=<name> Dry-run a playbook (check mode)"
|
||||
@echo " make deploy PLAYBOOK=<name> Run a playbook against production"
|
||||
@echo " make edit-vault [VAULT=<path>] Edit the vault in nvim (auto re-encrypts + checks)"
|
||||
@echo " make check-vault [VAULT=<path>] Validate vault structure (values masked)"
|
||||
@echo " make encrypt FILE=<path> Encrypt a vault file"
|
||||
@echo " make decrypt FILE=<path> Decrypt a vault file"
|
||||
@echo " make new-role NAME=<name> Scaffold a new role"
|
||||
|
|
@ -99,6 +104,16 @@ endif
|
|||
|
||||
# ── Vault ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Streamlined edit: ansible-vault edit decrypts to a temp file, opens nvim, and
|
||||
# re-encrypts on :wq (abort with :cq) — no plaintext ever lands in the work tree.
|
||||
# Then validate structure. Override the file with VAULT=<path>.
|
||||
edit-vault:
|
||||
EDITOR=nvim $(ANSIBLE)-vault edit $(VAULT)
|
||||
@$(PYTHON) scripts/check-vault.py $(VAULT)
|
||||
|
||||
check-vault:
|
||||
@$(PYTHON) scripts/check-vault.py $(VAULT)
|
||||
|
||||
encrypt:
|
||||
ifndef FILE
|
||||
$(error FILE is required: make encrypt FILE=<path>)
|
||||
|
|
|
|||
|
|
@ -60,6 +60,8 @@ See `Makefile` for the full list of targets.
|
|||
│ ├── runbooks/ # Step-by-step operational procedures
|
||||
│ ├── security/ # Per-service security checklist + templates + accepted risks
|
||||
│ ├── testing/ # VERIFY.md template + service-UI verification reports
|
||||
│ ├── access/ # ACCESS.md template (ADR-021)
|
||||
│ ├── backup/ # BACKUP.md template (ADR-022)
|
||||
│ ├── hardware/ # Physical capacity reference + reviews
|
||||
│ └── reviews/ # /review-repo reports
|
||||
│
|
||||
|
|
@ -69,10 +71,12 @@ See `Makefile` for the full list of targets.
|
|||
│
|
||||
├── playbooks/ # Orchestration playbooks
|
||||
│ ├── site.yml # Full standard state
|
||||
│ ├── workstation.yml # Developer environment (control group)
|
||||
│ └── bootstrap.yml # First-run new host setup
|
||||
│
|
||||
├── roles/ # Ansible roles
|
||||
│ ├── base/ # OS baseline applied to all hosts
|
||||
│ ├── dev_env/ # Interactive developer environment
|
||||
│ └── docker_host/ # Docker runtime setup
|
||||
│
|
||||
├── terraform/ # VM provisioning only — no DNS (see ADR-006/009)
|
||||
|
|
|
|||
|
|
@ -28,6 +28,7 @@ _Last reviewed: 2026-06-11._
|
|||
| Tag standard + enforcement (ADR-019) | Works — `tests/tags.yml` (closed vocabulary) + `scripts/check-tags.py` (run by `make lint`, unit-tested): enforces the tag vocabulary and that each role import in a play's `roles:` block carries its role-name tag. Governs mostly-unbuilt roles, but the linter is live now. Proxmox VM tag convention (`<env>`, group, `managed-by=terraform`) is in the Terraform HCL but unprovisioned. |
|
||||
| `roles/dev_env/` — interactive developer environment | **Built + applied.** zsh + oh-my-zsh + oh-my-posh, tmux + TPM plugins, neovim; dotfiles deployed via GNU stow (re-derived from V4/fisi per ADR-013). Node.js from a pinned upstream tarball (not Debian's npm). Lint + Molecule (idempotent) green. **Applied to `ubongo`** for users `sjat` + `claude` (verified: zsh login shells, stow-symlinked `.zshrc`/`.tmux.conf` + nvim config, oh-my-zsh, tmux plugins; nvim v0.12.2, oh-my-posh 29.0.1). Run via `playbooks/workstation.yml` against the `control` group (no dedicated `workstations` group yet). |
|
||||
| `make check` / `make deploy PLAYBOOK=<name>` | **Works.** First end-to-end run (applying `dev_env`) surfaced + fixed latent bugs: Makefile `PLAYBOOK` var collision (binary path vs playbook-name arg) meant the targets never ran; `ansible.cfg` referenced uninstalled community.general callbacks (now built-in `default` + `ansible.posix.profile_tasks`); `acl` package added so Ansible can `become_user` an unprivileged user. The make targets now function — though `site`/`base`/`docker_host` content is still incomplete (see below). |
|
||||
| `roles/public_dns/` + `playbooks/dns.yml` | **Built — not yet applied.** Manages wingu.me at Gandi LiveDNS as code (`community.general.gandi_livedns`, PAT from `vault.gandi.pat`); record data, anti-spoof baseline (null MX, SPF `-all`, DMARC reject), and the Gandi-defaults purge list are defined + unit-tested (`tests/test_public_dns.py`). The live `make deploy PLAYBOOK=dns` (purge + baseline) is **pending — run on ubongo**. M1 of the roadmap. |
|
||||
| `ubongo` — physical control / AI-worker host (ADR-015) | **Built (partial).** Debian 13.5 on a Lenovo M70q (i3-10100T, 16 GB, 256 GB SSD; no disk encryption — accepted risk). Full toolchain installed + pinned to `fisi` (Docker 29.5.3, rbw 1.15.0, Claude Code 2.1.173, ansible-core 2.17.14 + molecule via `make setup`/`make collections`). Repo cloned under a dedicated `claude` user (docker group, no sudo). Vault works via rbw (offline-cache decryption verified). SSH key-only (password + root login disabled). In the production inventory `control` group at 10.20.10.151. **`dev_env` now applied here** (zsh/tmux/nvim for `sjat` + `claude`, via `playbooks/workstation.yml`). Managed as the operator account `sjat` (`group_vars/control` sets `ansible_user: sjat`), not the `ansible` service user `group_vars/all` assumes — ubongo has no bootstrapped `ansible` user. **Pending:** NetBird mesh enrollment (so SSH is LAN-only); full `base` hardening (only the `firewall` concern exists, and it is NOT applied here — applying default-deny with no mesh would lock out inbound SSH on the physical NIC); proper `ansible`-user bootstrap (currently managed as `sjat`); OPNsense DHCP reservation for 10.20.10.151 (MAC `88:a4:c2:e0:ee:da`); Terraform state backup (no TF state yet). |
|
||||
|
||||
## Scaffolded but empty — NOT implemented
|
||||
|
|
@ -35,14 +36,14 @@ _Last reviewed: 2026-06-11._
|
|||
| Thing | State |
|
||||
|---|---|
|
||||
| `roles/base/` | **Partially built.** The `firewall` concern is implemented (nftables: catalog-driven default-deny + east-west allowlist + auto-rollback apply; ADR-020) with pytest + Molecule render/syntax tests. Other concerns (SSH hardening, fail2ban, auditd, packages, users) are **not** built yet, so `make deploy PLAYBOOK=site` has no real content to apply (the make target itself now works — see "Real and working today"). |
|
||||
| `roles/docker_host/` | Not in git. Same. |
|
||||
| `roles/docker_host/` | **Scaffolded, no tasks.** In git (meta/README/molecule filled), wired into `playbooks/site.yml` so the standard state is expressed end-to-end and `make lint` covers it, but it has no tasks yet — applying it is a no-op. Planned scope (Docker engine + Compose, daemon hardening, `nftables.d` container rules) in ADR-004/ADR-020. |
|
||||
| `inventories/*/hosts.yml` | Structured stubs with empty host maps (`hosts: {}`); regenerated by `make tf-inventory` once Terraform has hosts |
|
||||
| `inventories/production/group_vars/{docker_hosts,proxmox_hosts}/` | Empty dirs |
|
||||
|
||||
So `make deploy PLAYBOOK=site` has no real content to apply — `base` is only partially
|
||||
built (its `firewall` concern only) and the `docker_host` role does not exist yet. (The
|
||||
`make check`/`deploy` machinery itself now works — first proven by applying `dev_env` via
|
||||
`playbooks/workstation.yml`.)
|
||||
built (its `firewall` concern only) and the `docker_host` role is scaffolded but has no
|
||||
tasks yet. (The `make check`/`deploy` machinery itself now works — first proven by
|
||||
applying `dev_env` via `playbooks/workstation.yml`.)
|
||||
|
||||
## Designed but not built
|
||||
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@ decisions this frame enables.
|
|||
|---|---|---|---|---|---|
|
||||
| Reverse proxy / TLS | Traefik | P | core | Edge routing + ACME certs for everything exposed | Spin-up order names it (TODO 12) |
|
||||
| Internal DNS | `dns` role → dns1/dns2 | P | core | Authoritative internal zone (ADR-007) | Ansible-rendered zone |
|
||||
| Public DNS | `public_dns` role → Gandi LiveDNS | P | core | wingu.me zone as code (ADR-007) | anti-spoof baseline; mesh/LAN-only default; apply pending |
|
||||
| VPN / remote access | NetBird (self-hosted on `askari`) | P | core | Secure mesh remote access to `srv`/`mgmt` | **Decided (ADR-016):** NetBird mesh replaces ADR-007 OPNsense WireGuard |
|
||||
| Service portal / dashboard | Homepage | A | candidate | One landing page listing all services — a "what does what" front door | Gap surfaced by V4; fits boma's legibility goal |
|
||||
|
||||
|
|
|
|||
|
|
@ -21,6 +21,19 @@ earning its keep.
|
|||
|
||||
_(append new raw signals here; the next kaizen review consumes them)_
|
||||
|
||||
- `[recurring]` **Execution-mode menu asked AGAIN despite the 2026-06-10 "mechanical
|
||||
fix"** (2026-06-14): at the M1 (`public_dns`) plan handoff I presented the "1.
|
||||
Subagent-Driven / 2. Inline Execution — which approach?" menu and asked the user to
|
||||
pick. The decisions ledger (2026-06-10) records this exact behaviour as CHANGE →
|
||||
mechanical: *"Stop hook in `.claude/settings.json` blocks the turn if the menu appears
|
||||
and tells me to proceed subagent-driven."* It did not fire — either the hook is absent
|
||||
in this clone, its matcher doesn't match the wording the `writing-plans` skill actually
|
||||
produces, or it isn't installed/active. The standing agreement is to **default straight
|
||||
to subagent-driven without asking**. → verify the Stop hook exists and that its pattern
|
||||
matches the real menu text (the skill scripts "Two execution options" / "Which
|
||||
approach?"); if it relies on `.claude/settings.json` hooks that aren't active here,
|
||||
that's the gap. 5th occurrence (06-05/06/09/10/14).
|
||||
|
||||
- `[friction]` **ADR-writing policy is unsettled** (2026-05-31): drafting an ADR, I
|
||||
invented a Status header ("Proposed") on the fly because there's no documented
|
||||
convention for how we write ADRs (status lifecycle, required sections). → TODO 10.2 —
|
||||
|
|
|
|||
190
docs/ROADMAP.md
Normal file
190
docs/ROADMAP.md
Normal file
|
|
@ -0,0 +1,190 @@
|
|||
# ROADMAP — boma build order
|
||||
|
||||
High-level **build order** for the project. Almost everything in `docs/decisions/`
|
||||
(the ADRs) is *designed, not built* — this file sequences that backlog into milestones
|
||||
and records *why* the order is what it is.
|
||||
|
||||
- **What is built vs planned:** `STATUS.md` (ground truth — always check there first).
|
||||
- **The backlog of decisions:** `docs/TODO.md` (this roadmap sequences it).
|
||||
- **The design rationale:** `docs/decisions/` (ADRs).
|
||||
|
||||
This is a **living document**: update it as milestones land (move them to `STATUS.md`),
|
||||
as ordering changes, or as new milestones appear. Each milestone gets its own
|
||||
spec → plan → implementation cycle (`docs/superpowers/specs/` then `…/plans/`) when it
|
||||
comes up; this file stays high-level.
|
||||
|
||||
_Last updated: 2026-06-11._
|
||||
|
||||
---
|
||||
|
||||
## Strategy — "remote-access first" (Approach A)
|
||||
|
||||
One focused track now (**Off-site / Remote-access**), a **procurement gate**, then the
|
||||
**Cluster** track. Cross-cutting/ongoing work runs underneath both.
|
||||
|
||||
**Why this order.** The only physical machine that exists today is `ubongo` (the control
|
||||
node); the Proxmox cluster is a procurement decision, not yet made. The nearest-term goal
|
||||
— reach `ubongo` from `mamba` / a work laptop while on the move — needs only things
|
||||
already available or cheap to spin up (`askari` at Hetzner, the laptops). Doing the
|
||||
remote-access track first:
|
||||
|
||||
1. **delivers the mobile-access goal in the first phase**, and
|
||||
2. **doubles as the proving ground** for boma's core machinery — the first real *service
|
||||
role* (NetBird), the `base` role on a *real, internet-facing* host, the `offsite_hosts`
|
||||
pattern, public DNS + ACME, the backup contract, and `rbw`/vault in anger — all on two
|
||||
cheap, low-stakes hosts **before** spending on the cluster.
|
||||
|
||||
Cluster hardware is then procured *after* those patterns are proven and a
|
||||
`/capacity-review` informs the sizing — so the spend happens once, with knowledge.
|
||||
|
||||
Rejected alternatives: **B — procure now, build strictly bottom-up** (mobile access lands
|
||||
late; spend precedes any proven pattern). **C — two parallel tracks** (for a solo operator
|
||||
this collapses into interleaving with extra context-switching cost).
|
||||
|
||||
---
|
||||
|
||||
## Phase 1 — Off-site / Remote-access
|
||||
|
||||
Delivers mobile access to `ubongo`; proves the machinery. Ordered by *real* dependencies.
|
||||
|
||||
### M1 · boma's DNS home — a new domain at Gandi, managed as code
|
||||
|
||||
Register a **new Swahili-themed domain at Gandi** for boma and manage its records **as
|
||||
code (IaC)**. Greenfield, not a migration: investigating the existing domains ruled them
|
||||
out as boma's home — `baobab.band` is the **live legacy homelab** (Cloudflare; vaultwarden
|
||||
/ nextcloud / matrix in daily use), and `ziethen.dk` is the **family's primary email**
|
||||
(Fastmail); moving either's authoritative DNS risks breaking production. A fresh domain is
|
||||
zero-risk and *born at Gandi*.
|
||||
|
||||
- **Driver:** values/sovereignty (Gandi) + a clean, decoupled home so boma builds without
|
||||
endangering anything live. `baobab.band`'s Cloudflare exit / V4 decommission is a
|
||||
**separate, later track**, not part of this build. `ziethen.dk` is untouched.
|
||||
- **IaC approach:** follow boma's grain — internal DNS is already Ansible-rendered and
|
||||
Terraform owns *no* DNS (CLAUDE.md), so **public DNS is Ansible-managed too** (Gandi
|
||||
LiveDNS via an Ansible module — exact module pinned in M1's spec, verified per ADR-014).
|
||||
- **Naming scheme (decided):** three tiers (on boma's new domain, `<boma-domain>`) —
|
||||
`<host>.boma.<boma-domain>` (infra, internal-only) · `<service>.<boma-domain>`
|
||||
(home/cluster services, split-horizon) · `<service>.askari.<boma-domain>` (off-site/VPS,
|
||||
public). **`nyumbani` dropped.** Home services are **mesh/LAN-only by default** (no
|
||||
public record; reached over LAN or the NetBird mesh), with public Gandi records only for
|
||||
deliberate exceptions. The NetBird mesh carries the `<boma-domain>` match-domain to
|
||||
road-warriors (resolver = dns1/dns2 over `wt0`); a `*.<boma-domain>` ACME **DNS-01**
|
||||
wildcard cert (Gandi API) gives even unexposed services real TLS. Resolves TODO 4 and
|
||||
review finding O12.
|
||||
- **Records as a new/updated ADR:** amends ADR-007 — boma's public zone is
|
||||
`<boma-domain>` at Gandi LiveDNS managed as code; the three-tier naming scheme;
|
||||
`nyumbani` removed; mesh/LAN-only default; `baobab.band` (legacy, Cloudflare) is out of
|
||||
scope.
|
||||
- **Maps to:** ADR-007 (network/DNS), ADR-016 (mesh DNS), TODO 4 (**resolved here**).
|
||||
|
||||
### M2 · `askari` provisioned + under Ansible
|
||||
|
||||
Provision the Hetzner VPS **as IaC with Terraform** (CAX11 ARM / Helsinki / Debian 13,
|
||||
behind a TF-managed Hetzner Cloud Firewall), bring it into `offsite_hosts`, and bootstrap
|
||||
it. Design: `docs/superpowers/specs/2026-06-14-askari-provisioning-design.md`.
|
||||
|
||||
- **Decided:** Terraform owns `askari`'s existence — generalizes ADR-006 from "Proxmox VM
|
||||
existence" to **Proxmox + Hetzner** (new `hetznercloud/hcloud` provider, `hetzner_vm`
|
||||
module, `offsite` stack). Token via `TF_VAR_hcloud_token` from `vault.hetzner.token`.
|
||||
- **Proves:** the `offsite_hosts` pattern, the TF→Ansible handoff for a non-Proxmox host
|
||||
(`tf_to_inventory.py` extended), bootstrap of a non-cluster host. Closes review finding
|
||||
O6 (`offsite_hosts` missing from `hosts.yml`).
|
||||
- **Amends:** ADR-006 (TF scope), ADR-009 (offsite handoff), ADR-020 (Hetzner Cloud
|
||||
Firewall = perimeter), ADR-007/016 (`askari` TF-provisioned, not "added manually").
|
||||
|
||||
### M3 · `base` matured to a "remote-access-sufficient" subset
|
||||
|
||||
Today `base` is firewall-only. Add the subset a real, internet-facing host needs:
|
||||
**SSH hardening + fail2ban + the NetBird agent task**. Full CIS L1/L2, auditd, AppArmor,
|
||||
AIDE are deferred to Phase 2.
|
||||
|
||||
- **Why a subset:** `askari` is public (Hetzner) — it must be SSH-hardened and firewalled
|
||||
*with* exposure, but the full hardening standard is not on the critical path to mobile
|
||||
access.
|
||||
- **Maps to:** ADR-002 (security baseline), ADR-016 (agent enrollment lives in `base`),
|
||||
ADR-020 (firewall — already built), TODO 15 (the rest of hardening → Phase 2).
|
||||
|
||||
### M4 · NetBird control plane on `askari` — first real service role
|
||||
|
||||
Deploy the NetBird stack (management / signal / relay / Coturn + dashboard) with the
|
||||
**embedded IdP** (ADR-016 — no Authentik dependency).
|
||||
|
||||
- **First exercise of:** the service-role conventions (`SECURITY.md` / `VERIFY.md` /
|
||||
`ACCESS.md` / `BACKUP.md`), public **TLS / ACME**, and the **backup contract** —
|
||||
NetBird's management datastore is *stateful*, so it gets encrypted off-host backup
|
||||
(ADR-016 §recovery, ADR-022).
|
||||
- **Open design choice (decide in M4's spec):** a minimal ACME-terminating reverse proxy
|
||||
(e.g. Caddy) just for NetBird on `askari`, vs leaning on NetBird's bundled setup.
|
||||
- **Maps to:** ADR-016 (mesh), ADR-004 (one service = one role), ADR-021 (access),
|
||||
ADR-022 (backup), ADR-008/017 (VERIFY), accepted-risk R3 (askari public surface).
|
||||
|
||||
### M5 · Enroll peers → goal reached
|
||||
|
||||
NetBird agent on `ubongo` (the `wt0` path appears), then NetBird **clients on `mamba` +
|
||||
the work laptop** → `ubongo` is reachable from anywhere. **← the mobile-access goal lands
|
||||
here.**
|
||||
|
||||
- **Critical ordering:** NetBird-on-`ubongo` **before** applying `base` default-deny to
|
||||
`ubongo`. Hardening first would lock out SSH (no mesh path yet). Once the mesh `wt0`
|
||||
path exists, apply default-deny and set `base__firewall_control_addr` for the LAN
|
||||
fallback (ADR-021's `ssh-from-control`, already built/dormant).
|
||||
- **Maps to:** ADR-016, ADR-021 (SSH ladder: `wt0` + ssh-from-control), ADR-020.
|
||||
|
||||
---
|
||||
|
||||
## Gate — Procurement decision
|
||||
|
||||
Run `/capacity-review` (intent-based) to size the cluster, **then procure the Proxmox
|
||||
hardware**. Every core pattern (service role, base-on-real-host, DNS+ACME, backup, access)
|
||||
has by now been rehearsed on two cheap hosts, so the spend happens once, with knowledge.
|
||||
|
||||
- **Maps to:** ADR-012 (hardware & capacity), `/capacity-review`.
|
||||
|
||||
---
|
||||
|
||||
## Phase 2 — Cluster (gated on procurement; coarse until M5 is near)
|
||||
|
||||
Canonical dependency order:
|
||||
|
||||
1. **Terraform provisioning** — `terraform init`/apply the Proxmox VM module; regenerate
|
||||
inventory via `make tf-inventory` (ADR-006, ADR-009).
|
||||
2. **`base` full** — CIS L1/L2, auditd, AppArmor (enforce), AIDE, packages, users; the
|
||||
VM disk layout for CIS L2 is decided **before** provisioning (ADR-002, TODO 15).
|
||||
3. **`docker_host`** — real Docker engine + Compose, daemon hardening, `nftables.d`
|
||||
container rules (currently a scaffold; ADR-004, ADR-020).
|
||||
4. **`dns` role** — render the internal zone from inventory (ADR-007).
|
||||
5. **Auth + reverse proxy** — Authentik + Traefik: the foundation every service sits
|
||||
behind with authentication (ADR-002).
|
||||
6. **Monitoring** — Loki + Grafana Alloy (logging, ADR-018) + Prometheus/exporters +
|
||||
Uptime Kuma; decide which alerts live where (TODO 3.6).
|
||||
7. **Service roles** — PhotoPrism, email, indexers, … (`docs/CAPABILITIES.md`); each
|
||||
clears `docs/security/service-checklist.md` and carries its standard files.
|
||||
8. **`backup` role + `fisi` pull node** — restic Model A, pCloud + USB air-gap (ADR-022).
|
||||
9. **Forgejo Actions CI** — runner + workflows (ADR-003/010, TODO 1).
|
||||
|
||||
---
|
||||
|
||||
## Underneath both — Cross-cutting / ongoing
|
||||
|
||||
- **Accept ADR-011** (update management) — resolve its 6 open questions before the first
|
||||
scheduled patch run (TODO 16).
|
||||
- **Kaizen `/retro`** + keep appending to `docs/FRICTION.md` (TODO 11); **`/security-review`**
|
||||
skill (TODO 8.5); **`/review-repo` fortnightly cron** + headless email (TODO 8.1);
|
||||
`scheduled_jobs` role (TODO 8.3).
|
||||
- **User-notification function** — ntfy / matrix / email so tools + AI can reach the
|
||||
operator (TODO 9; ties to ADR-011 control channel).
|
||||
|
||||
### Parked decisions — decide when they bite, not before
|
||||
|
||||
- Split-horizon FQDN with or without `nyumbani` (TODO 4) — **settled in M1** (three-tier scheme; `nyumbani` dropped), so no longer parked.
|
||||
- Central database server vs per-app databases (TODO 3.9) — at the service phase.
|
||||
- Script-dependencies policy: stdlib-only vs selective libraries (TODO 14).
|
||||
- Keep the custom Molecule base-image method as testing matures (TODO 3.10).
|
||||
|
||||
---
|
||||
|
||||
## Next step
|
||||
|
||||
**M1 (new domain at Gandi, DNS managed as code)** design is written —
|
||||
`docs/superpowers/specs/2026-06-11-public-dns-gandi-migration-design.md`. Next: user
|
||||
review → implementation plan.
|
||||
|
|
@ -1,5 +1,8 @@
|
|||
# ToDo
|
||||
|
||||
> **Build order lives in `docs/ROADMAP.md`** — that sequences this backlog into
|
||||
> milestones. This file is the decision backlog; the roadmap is the order we build them.
|
||||
|
||||
1. **Forgejo CI** — what CI work remains after ADR-010 (which workflows, runner
|
||||
setup, etc. still need to be built)?
|
||||
|
||||
|
|
@ -47,7 +50,9 @@
|
|||
10. Should we keep the custom base-container (Molecule test image) method for role testing, or revisit it as boma's testing approach matures (ADR-008)?
|
||||
11. ~~Deliberate tagging strategy.~~ DECIDED (ADR-019) — folded into 3.7.
|
||||
|
||||
4. **Split-horizon FQDN** — adopt split-horizon FQDN with or without nyumbani?
|
||||
4. ~~**Split-horizon FQDN** — adopt split-horizon FQDN with or without nyumbani?~~
|
||||
DECIDED (M1): three-tier scheme on `wingu.me`; `nyumbani` dropped; mesh/LAN-only
|
||||
default. See `docs/decisions/007-network.md` + the M1 spec.
|
||||
|
||||
5. **Control node**
|
||||
1. Set up and test the control node while waiting for hardware.
|
||||
|
|
|
|||
|
|
@ -157,7 +157,8 @@ IoT devices cannot initiate connections to `srv`.
|
|||
| Infrastructure VMs | `<role><n>` | `dns1`, `dns2`, `proxy` |
|
||||
| Hetzner VPS | `askari` | Swahili for guard/sentinel |
|
||||
| Internal FQDN | `<host>.boma.baobab.band` | `dns1.boma.baobab.band` |
|
||||
| Public service FQDN | `<service>.baobab.band` | `forgejo.nyumbani.baobab.band` |
|
||||
| Public service FQDN | `<service>.wingu.me` | `vaultwarden.wingu.me` |
|
||||
| Off-site (VPS) FQDN | `<service>.askari.wingu.me` | `netbird.askari.wingu.me` |
|
||||
|
||||
---
|
||||
|
||||
|
|
@ -169,12 +170,18 @@ inventory (which derives from Terraform's `local.vms` via `make tf-inventory`),
|
|||
and service/alias/split-horizon records are explicit zone data in `group_vars`.
|
||||
Terraform itself writes no DNS records — see ADR-009.
|
||||
|
||||
**Public zone**: `baobab.band` — served by external DNS (Cloudflare or equivalent).
|
||||
Public-facing services resolve to the public IP or Cloudflare proxy.
|
||||
**Public zone**: `wingu.me` — Gandi LiveDNS, **managed as code** by the `public_dns`
|
||||
role (`vault.gandi.pat`). Three-tier naming: infra `<host>.boma.wingu.me` (internal),
|
||||
services `<service>.wingu.me` (split-horizon), off-site `<service>.askari.wingu.me`.
|
||||
`nyumbani` is retired. **Mesh/LAN-only by default**: home services have no public record
|
||||
(reached over LAN or the NetBird mesh); only deliberate exceptions are published. The
|
||||
project is `boma`; the domain is `wingu.me`. The legacy `baobab.band` zone (Cloudflare)
|
||||
is out of scope here.
|
||||
|
||||
**Split-horizon**: `dns1`/`dns2` serve internal answers for any hostname that has
|
||||
both a public and private face. Example: `forgejo.nyumbani.baobab.band` resolves to
|
||||
`10.20.0.12` (proxy) internally and to the public IP externally.
|
||||
both a public and private face. Example: `vaultwarden.wingu.me` resolves to
|
||||
`10.20.0.12` (proxy) internally and to the public IP externally (the internal
|
||||
zone will be renamed to `boma.wingu.me` when the `dns` role is built — Phase 2).
|
||||
|
||||
OPNsense DNS resolver forwards `boma.baobab.band` queries to `dns1`/`dns2`.
|
||||
All other queries go upstream (e.g., `1.1.1.1`, `9.9.9.9`).
|
||||
|
|
@ -191,7 +198,7 @@ ACLs — no OPNsense WireGuard tunnel and no `10.99.0.0/24` routing.
|
|||
`askari` is provisioned and managed independently of the Proxmox cluster — it must
|
||||
be reachable even when the homelab is down (its entire purpose), which is also why
|
||||
the mesh coordinator lives here: an off-site control plane survives a homelab outage.
|
||||
FQDN: `askari.baobab.band`.
|
||||
FQDN: `askari.wingu.me` (off-site tier; record added by `public_dns` when askari exists — M2/M4).
|
||||
|
||||
---
|
||||
|
||||
|
|
|
|||
|
|
@ -107,7 +107,8 @@ Accepted (2026-06-05). Designed, not built — depends on the unbuilt `base` rol
|
|||
|
||||
See also: ADR-007 (network — amended), ADR-015 (control host), ADR-002 (security),
|
||||
ADR-011 (version pinning), ADR-004 (one service = one role), ADR-009 (TF↔Ansible
|
||||
handoff), ADR-013 (heritage — V4 ran WireGuard; NetBird is translated, not transplanted).
|
||||
handoff), ADR-013 (heritage — V4 ran WireGuard; NetBird is translated, not transplanted),
|
||||
ADR-021 (operational access; SSH ladder reconciling `wt0` + `ubongo`'s LAN address).
|
||||
|
||||
## Consequences
|
||||
|
||||
|
|
|
|||
|
|
@ -132,4 +132,5 @@ symbolic sources; each layer renders its own slice; the no-ad-hoc-ports guardrai
|
|||
|
||||
ADR-002 (security baseline: nftables default-deny, fail2ban, blast radius),
|
||||
ADR-004 (Docker model: `iptables:false`), ADR-007 (network topology, VLANs, OPNsense,
|
||||
per-VLAN egress), ADR-016 (NetBird mesh: SSH on `wt0` only), ADR-019 (`firewall` tag).
|
||||
per-VLAN egress), ADR-016 (NetBird mesh: SSH on `wt0` only), ADR-019 (`firewall` tag),
|
||||
ADR-021 (operational access doctrine; `ssh-from-control` management-plane source).
|
||||
|
|
|
|||
65
docs/reviews/2026-06-11-findings.json
Normal file
65
docs/reviews/2026-06-11-findings.json
Normal file
|
|
@ -0,0 +1,65 @@
|
|||
{
|
||||
"date": "2026-06-11",
|
||||
"reviewed_commit": "67f2aba",
|
||||
"fixes_commit": null,
|
||||
"mode": "on-demand",
|
||||
"counts": {
|
||||
"auto_fixed": 5,
|
||||
"open": 18,
|
||||
"scan": {
|
||||
"broken-adr-ref": 4,
|
||||
"broken-path-ref": 1,
|
||||
"marker": 14,
|
||||
"open-deferred-item": 5,
|
||||
"stale-deferred": 0
|
||||
}
|
||||
},
|
||||
"deferral_checklist": {
|
||||
"adr-011-open-items": "all 5 (snapshot driver, cadences, health-check harness home, classification home, staging-first) confirmed genuinely still open; cross-checked against later ADRs + TODO 16. No stale-deferred.",
|
||||
"adr-015-deferred": "deferred #1 (mesh VPN) #2 (service-UI) #3 (build) all confirmed marked RESOLVED in place. No stale-deferred.",
|
||||
"stale_deferred_found": 0
|
||||
},
|
||||
"scan_false_positives": [
|
||||
{"check": "broken-path-ref", "location": "STATUS.md:38", "why": "STATUS legitimately documents roles/docker_host/ as 'Not in git.' — intentional reference to an unbuilt role."},
|
||||
{"check": "broken-adr-ref", "location": "tests/test_repo_scan.py:10,43; docs/superpowers/plans/2026-06-10-adr-structure.md:50,83", "why": "ADR-099/ADR-100 are intentional test fixtures exercising the scanner's bad-ref detection."},
|
||||
{"check": "marker", "location": "docs/superpowers/plans/*, docs/superpowers/specs/*, docs/decisions/019-tagging.md:14", "why": "All 14 markers are in historical planning artifacts (commit-message TODOs, plan steps) or prose discussing 'over-tagging' as a concept — not actionable cruft."}
|
||||
],
|
||||
"auto_fixed": [
|
||||
{"id": "AF1", "dimension": "drift", "severity": "high", "location": "roles/README.md:11-13", "description": "'base and docker_host not built yet — empty, untracked dirs, so site.yml would fail on a clean clone' contradicts STATUS.md: base is partially built (firewall concern, tracked), docker_host does not exist, dev_env is built+applied.", "fix": "rewrote Current-state paragraph: base partially built (firewall), docker_host not yet created, dev_env built+applied.", "tag": "new"},
|
||||
{"id": "AF2", "dimension": "drift", "severity": "medium", "location": "playbooks/site.yml:4-5", "description": "NOTE claimed base + docker_host 'not built yet ... fails on a clean clone'; base's firewall concern is built+applied per STATUS.md.", "fix": "NOTE now states base is partially built (firewall) and only docker_host is missing.", "tag": "new"},
|
||||
{"id": "AF3", "dimension": "drift", "severity": "medium", "location": "playbooks/README.md:6-8", "description": "site.yml described as 'currently a no-op' (roles empty); base's firewall now applies real nftables state. workstation.yml (applies dev_env) was unlisted.", "fix": "reworded the no-op claim and added a workstation.yml bullet.", "tag": "new"},
|
||||
{"id": "AF4", "dimension": "drift", "severity": "low", "location": "README.md:58-76", "description": "project-structure tree omitted docs/access/, docs/backup/, roles/dev_env/, and playbooks/workstation.yml — all present on disk.", "fix": "added the four missing tree entries.", "tag": "recurring"},
|
||||
{"id": "AF5", "dimension": "consistency", "severity": "low", "location": "docs/decisions/016-mesh-vpn.md:110; docs/decisions/020-firewall.md:135", "description": "ADR-021 states it amends ADR-016 and ADR-020 to cross-reference the SSH ladder, but neither listed ADR-021 back in its See-also/Related section.", "fix": "added the reciprocal ADR-021 cross-reference to both.", "tag": "new"}
|
||||
],
|
||||
"open": [
|
||||
{"id": "O1", "dimension": "conformance", "severity": "high", "location": "playbooks/site.yml:18", "description": "`make lint` is RED on `main`: site.yml imports the `docker_host` role which does not exist, so ansible-lint syntax-check fails on a clean checkout. Violates CLAUDE.md 'main must always work' and 'Never skip lint' (pre-commit would block every commit unless bypassed).", "suggested_fix": "Decide an interim posture: guard the docker_host play (e.g. skip until the role exists), stub the role via `make new-role NAME=docker_host`, or exclude site.yml from syntax-check until built — and record it. Judgement call.", "tag": "new", "auto_fixable": false},
|
||||
{"id": "O2", "dimension": "consistency", "severity": "high", "location": "docs/decisions/004-docker-model.md:105 ↔ docs/decisions/022-backup.md", "description": "ADR-004 'Persistent data' says 'Backup strategy is defined separately (not in scope of this repo).' ADR-022 defines a full in-repo backup strategy (backup role, fisi pull node, per-service backup__* + BACKUP.md). Direct ADR↔ADR contradiction on scope.", "suggested_fix": "Update ADR-004's line to point at ADR-022 (backup is now in-repo scope) and cross-link, per ADR-023's no-silent-reversal rule. Design decision — report only.", "tag": "new", "auto_fixable": false},
|
||||
{"id": "O3", "dimension": "consistency", "severity": "medium", "location": "docs/decisions/004-docker-model.md:48-49", "description": "ADR-004's service-role file table (the canonical standard) lists only SECURITY.md + VERIFY.md, but CLAUDE.md + ADR-021/ADR-022 now mandate ACCESS.md (every service role) and BACKUP.md (stateful service roles).", "suggested_fix": "Add ACCESS.md (ADR-021) and BACKUP.md (ADR-022) rows to ADR-004's service-role file table. (Prior O1 'missing VERIFY.md' is now resolved — this is the next evolution.)", "tag": "new", "auto_fixable": false},
|
||||
{"id": "O4", "dimension": "consistency", "severity": "medium", "location": "docs/CAPABILITIES.md:149-154 ↔ STATUS.md:29", "description": "CAPABILITIES lists nvim/tmux/shell config as a CONFIRMED EXCLUSION ('boma is server-only, so these are correctly absent'), but the dev_env role (built+applied to ubongo) installs exactly zsh+oh-my-zsh+tmux+neovim.", "suggested_fix": "Carve out an exception for the control-node developer/AI-worker environment (ubongo, ADR-015) rather than flatly excluding nvim/tmux; distinguish infra worker-host config from personal desktops.", "tag": "new", "auto_fixable": false},
|
||||
{"id": "O5", "dimension": "drift", "severity": "medium", "location": "docs/decisions/002-security.md:82", "description": "References `make deploy PLAYBOOK=upgrade` as the deliberate full-upgrade mechanism, but no upgrade.yml playbook exists (only bootstrap/site/workstation) and ADR-011 update-management is still Proposed/unbuilt — stated without the '(planned)' caveat ADR-002 uses for its other unbuilt controls.", "suggested_fix": "Add a '(planned — ADR-011, not yet built)' caveat to the upgrade line.", "tag": "new", "auto_fixable": false},
|
||||
{"id": "O6", "dimension": "drift", "severity": "medium", "location": "inventories/production/hosts.yml:7-16; inventories/staging/hosts.yml:7-14", "description": "Committed hosts.yml stubs omit the offsite_hosts group, but it is one of the four VALID_GROUPS in tf_to_inventory.py and in ADR-009/ADR-016/CLAUDE.md; the next `make tf-inventory` would add it, so the hand-stubs have drifted. (Prior O4 'askari group unnamed' is resolved — naming is now consistent; this is the residual stub gap.)", "suggested_fix": "Regenerate via `make tf-inventory TF_ENV=production` and `TF_ENV=staging` (do NOT hand-edit hosts.yml — CLAUDE.md), or accept the stubs lag until TF runs.", "tag": "new", "auto_fixable": false},
|
||||
{"id": "O7", "dimension": "drift", "severity": "medium", "location": "docs/runbooks/new-host.md:81-130", "description": "Part E (control node ubongo) instructs creating an 'ansible' user and 'ssh ansible@<IP>', but STATUS.md records ubongo is deliberately managed as the operator account sjat (group_vars/control ansible_user: sjat) with the ansible-user bootstrap listed as Pending.", "suggested_fix": "Update Part E to reflect ubongo managed as sjat (no ansible user yet), ansible-user bootstrap a pending item per STATUS.md.", "tag": "new", "auto_fixable": false},
|
||||
{"id": "O8", "dimension": "conformance", "severity": "medium", "location": "roles/dev_env/tasks/per_user.yml:2-9", "description": "The getent + `set_fact: dev_env__home` preflight is untagged, but downstream tasks that consume dev_env__home carry concern tags (users, config). A partial `--tags users` or `--tags config` run skips the set_fact, leaving dev_env__home undefined and failing the tagged tasks — against ADR-019's concern-runnable-in-isolation intent.", "suggested_fix": "Tag the preflight with the union of dependent concerns ([users, config]) or `always`.", "tag": "new", "auto_fixable": false},
|
||||
{"id": "O9", "dimension": "consistency", "severity": "medium", "location": "STATUS.md:31 ↔ docs/decisions/007-network.md", "description": "STATUS places ubongo at 10.20.10.151; ADR-007 defines srv as 10.20.0.0/24 and mgmt as 10.10.0.0/24 — 10.20.10.151 is in neither. base__firewall_control_addr (ADR-021 recovery path) depends on this address being correct. Already a tracked follow-up in the ubongo-build plan (line 147).", "suggested_fix": "Either correct ubongo's recorded address to a valid ADR-007 subnet, or amend ADR-007 to document the actual VLAN/subnet ubongo's physical port lives on, before base__firewall_control_addr is populated.", "tag": "new", "auto_fixable": false},
|
||||
{"id": "O10", "dimension": "drift", "severity": "low", "location": "README.md:104-106", "description": "README's Documentation ADR list stops at 017; ADRs 018 (logging), 019 (tagging), 020 (firewall), 021 (access), 022 (backup), 023 (ADR structure) exist and are in CLAUDE.md's full table. Partial enumeration is now stale. (Evolved from prior O3, which is otherwise resolved — the docs/ tree omissions were fixed in AF4.)", "suggested_fix": "Extend the list through 023, or trim it to a pointer at CLAUDE.md's full table to avoid a stale partial list.", "tag": "recurring", "auto_fixable": false},
|
||||
{"id": "O11", "dimension": "conformance", "severity": "low", "location": "docs/decisions/008-testing.md:3; 014-knowledge-sourcing.md:98; 016-mesh-vpn.md:91; 017-service-ui-verification.md:66; 018-logging.md:73", "description": "ADR-023 §2 mandates section order Status→Context→Decision→Consequences. ADR-008 injects a gotchas blockquote before ## Status; ADR-014's ## Decision is a late summary after six topical sections; ADR-016/017/018 place ## Status mid-document. The scan checks presence, not order, so all pass lint — but they don't match the stated standard.", "suggested_fix": "Presentational restructure per ADR-023 §6 (move Status first; pull Decision up). No decision substance changes. Judgement call — report.", "tag": "new", "auto_fixable": false},
|
||||
{"id": "O12", "dimension": "consistency", "severity": "low", "location": "docs/decisions/007-network.md:160", "description": "The naming-scheme table states the public FQDN convention is `<service>.baobab.band`, but its own example is `forgejo.nyumbani.baobab.band` (extra nyumbani label). The nyumbani split-horizon sub-label is still OPEN (TODO 4); convention and example disagree.", "suggested_fix": "Change the example to forgejo.baobab.band, or note nyumbani is an unresolved split-horizon sub-label (TODO 4). Ties to an open decision — report.", "tag": "new", "auto_fixable": false},
|
||||
{"id": "O13", "dimension": "consistency", "severity": "low", "location": "roles/dev_env/files/dotfiles/zsh/.zshrc:28,55", "description": "Shipped .zshrc hard-codes `alias rclone=\"/usr/bin/rclone\"` (rclone is not installed by dev_env) and `eval \"$(direnv hook zsh)\"` unguarded (unlike the guarded oh-my-posh block) — heritage fisi/V4 carryovers. If direnv is dropped from dev_env__packages every shell startup errors.", "suggested_fix": "Drop the rclone alias (role doesn't install it) and guard the direnv hook with `command -v direnv`, or document direnv as a hard dependency of the shipped .zshrc.", "tag": "new", "auto_fixable": false},
|
||||
{"id": "O14", "dimension": "consistency", "severity": "low", "location": "roles/dev_env/tasks/oh_my_posh.yml:15-26", "description": "The zen.toml theme-directory + deploy tasks render config to disk but carry no `config` tag, while analogous dotfile tasks in per_user.yml are tagged `config` — inconsistent concern tagging within the role.", "suggested_fix": "Add tags: [config] to the zen.toml directory + deploy tasks.", "tag": "new", "auto_fixable": false},
|
||||
{"id": "O15", "dimension": "consistency", "severity": "low", "location": "terraform/environments/production/terraform.tfvars.example:9-11; staging/terraform.tfvars.example", "description": "proxmox_node/endpoint examples use pve01 / pve01.baobab.band, but ADR-007 defines Proxmox node names as pve0/pve1/pve2 (single digit, no leading zero). Example contradicts the naming convention.", "suggested_fix": "Change example values to pve0 / pve0.baobab.band (both envs). Verify the actual node name first — report rather than auto-fix.", "tag": "new", "auto_fixable": false},
|
||||
{"id": "O16", "dimension": "consistency", "severity": "low", "location": "docs/decisions/013-heritage-v4.md:77; docs/decisions/015-control-host.md", "description": "ADR-013 and ADR-015 close with an inline 'See also:' prose line, whereas ADRs 014/019/020/021/022 and the adr-template use a dedicated `## Related` section. Stylistic inconsistency (## Related is optional per ADR-023 §3).", "suggested_fix": "Convert the 'See also:' prose in ADR-013/015 into ## Related sections for uniformity. Cosmetic.", "tag": "new", "auto_fixable": false},
|
||||
{"id": "O17", "dimension": "cruft", "severity": "low", "location": "roles/dev_env/handlers/main.yml; roles/base/handlers/main.yml", "description": "Both roles ship an empty handlers/main.yml (only `---`); neither defines or uses handlers (base's firewall apply/rollback is deliberately in tasks). Scaffold artifacts from make new-role.", "suggested_fix": "Confirm whether empty scaffold files are an intentional convention; if not, delete. Low priority.", "tag": "new", "auto_fixable": false},
|
||||
{"id": "O18", "dimension": "consistency", "severity": "low", "location": "docs/README.md:5-8; inventories/README.md:1-12", "description": "docs/README.md lists only decisions/ + runbooks/ (omits security/testing/access/backup/hardware/reviews/superpowers); inventories/README.md omits the offsite_hosts group documented in CLAUDE.md. Both are narrower than current reality.", "suggested_fix": "Add the missing subdir rows / note offsite_hosts, or explicitly defer to the canonical list. Low priority.", "tag": "new", "auto_fixable": false}
|
||||
],
|
||||
"prior_resolved": [
|
||||
{"id": "O1@2026-06-05", "description": "ADR-004 service-role table missing VERIFY.md row", "status": "resolved — table now lists SECURITY.md + VERIFY.md (next gap ACCESS/BACKUP tracked as O3)"},
|
||||
{"id": "O2@2026-06-05", "description": "new-role runbook missing VERIFY.md step", "status": "resolved — step 10 present"},
|
||||
{"id": "O3@2026-06-05", "description": "README ADR list + docs/ tree omissions", "status": "partial — docs tree security/testing/hardware now present; access/backup fixed in AF4; ADR-list staleness carried as O10"},
|
||||
{"id": "O4@2026-06-05", "description": "askari inventory group unnamed", "status": "resolved — offsite_hosts named consistently (residual stub gap = O6)"},
|
||||
{"id": "O5@2026-06-05", "description": "backend.tf mislabelled Forgejo state backend", "status": "resolved — now labelled local state"},
|
||||
{"id": "O6@2026-06-05", "description": "ADR-014 plugin reproducibility described open but TODO done", "status": "resolved"},
|
||||
{"id": "O11@2026-06-05", "description": "CAPABILITIES missing /verify-service Level-4 row", "status": "resolved — present (§10)"},
|
||||
{"id": "O12@2026-06-05", "description": "TODO 3.10 garbled", "status": "resolved — readable"},
|
||||
{"id": "O7-O10@2026-06-05", "description": "ADR-011 digest-pinning row; act_runner ambiguity; WireGuard Molecule row; ADR-011 scheduled_jobs cross-ref", "status": "not re-detected this run (ADR-011 still Proposed) — verify on next run"}
|
||||
]
|
||||
}
|
||||
161
docs/reviews/2026-06-11-review.md
Normal file
161
docs/reviews/2026-06-11-review.md
Normal file
|
|
@ -0,0 +1,161 @@
|
|||
# Repo review — 2026-06-11
|
||||
|
||||
- **Reviewed commit:** `67f2aba` (main)
|
||||
- **Mode:** on-demand (interactive)
|
||||
- **Previous run:** `2026-06-05` (commit `f566fd1`)
|
||||
- **Process:** Phase 0 deterministic scan → 5 parallel shard reviewers + 1 cross-cutting
|
||||
reviewer → synthesis, deferral-checklist resolution, prior-run diff → safe auto-fixes.
|
||||
|
||||
## Summary
|
||||
|
||||
| | High | Medium | Low | Total |
|
||||
|---|---|---|---|---|
|
||||
| **Auto-fixed** | 1 | 2 | 2 | 5 |
|
||||
| **Open (report-only)** | 2 | 7 | 9 | 18 |
|
||||
|
||||
By dimension (open): conformance 3 · consistency 10 · drift 4 · cruft 1.
|
||||
|
||||
**Headline:** `make lint` is currently **red on `main`** — `playbooks/site.yml` imports the
|
||||
not-yet-existent `docker_host` role (confirmed at clean HEAD, unrelated to this run's
|
||||
edits). That breaks CLAUDE.md's "main must always work" / "Never skip lint" contract and
|
||||
is the top open finding (O1). The bulk of the rest is documentation drift created by the
|
||||
recent `base` (firewall) + `dev_env` build wave: several READMEs/playbook notes still
|
||||
described the roles as "empty / not built." Those were the safe auto-fixes.
|
||||
|
||||
**Good news:** 7 of the 12 open findings from the 2026-06-05 run are confirmed resolved
|
||||
(VERIFY.md row + runbook step, backend.tf relabel, askari group naming, ADR-014
|
||||
reproducibility, CAPABILITIES Level-4 row, TODO 3.10). The deferral checklist is clean —
|
||||
**0 stale-deferred** this run (the recurring miss logged in FRICTION.md did not recur).
|
||||
|
||||
## Auto-fixes applied
|
||||
|
||||
Markdown / YAML-comment only; no runtime behaviour, logic, vars, or task order touched.
|
||||
|
||||
| ID | Sev | File(s) | What |
|
||||
|---|---|---|---|
|
||||
| AF1 | high | `roles/README.md` | Rewrote stale "base & docker_host are empty untracked dirs, site.yml would fail on a clean clone" → base partially built (firewall), docker_host not yet created, dev_env built+applied. |
|
||||
| AF2 | med | `playbooks/site.yml` | NOTE no longer claims base is unbuilt / "fails on a clean clone"; now reflects firewall-only base + missing docker_host. |
|
||||
| AF3 | med | `playbooks/README.md` | Dropped the "currently a no-op" claim; added a `workstation.yml` bullet. |
|
||||
| AF4 | low | `README.md` | Added `docs/access/`, `docs/backup/`, `roles/dev_env/`, `playbooks/workstation.yml` to the project-structure tree. |
|
||||
| AF5 | low | `docs/decisions/016-mesh-vpn.md`, `docs/decisions/020-firewall.md` | Added the reciprocal `ADR-021` cross-reference to each ADR's See-also/Related list; ADR-021 states it amends both, but neither referenced it back. |
|
||||
|
||||
> `make lint` was re-run after the fixes: it fails **only** on the pre-existing
|
||||
> `docker_host` syntax-check (O1), identical to clean HEAD. No auto-fix introduced or
|
||||
> changed any lint result, so none were reverted.
|
||||
|
||||
## Open findings (prioritised)
|
||||
|
||||
### High
|
||||
|
||||
- **O1 — `make lint` is red on `main`** · `playbooks/site.yml:18` · *conformance*
|
||||
site.yml imports the `docker_host` role, which does not exist, so ansible-lint's
|
||||
syntax-check fails on a clean checkout. Violates "main must always work" + "Never skip
|
||||
lint" (pre-commit would block every commit unless bypassed).
|
||||
*Fix (judgement):* guard/skip the docker_host play until the role exists, scaffold a
|
||||
stub via `make new-role NAME=docker_host`, or exclude site.yml from syntax-check until
|
||||
built — and record the choice. **new**
|
||||
|
||||
- **O2 — ADR-004 ↔ ADR-022 backup-scope contradiction** ·
|
||||
`docs/decisions/004-docker-model.md:105` · *consistency*
|
||||
ADR-004 says "Backup strategy is defined separately (not in scope of this repo)";
|
||||
ADR-022 defines a full in-repo backup strategy. Per ADR-023 (no silent reversals),
|
||||
update ADR-004's line to defer to ADR-022 and cross-link. Design decision — report. **new**
|
||||
|
||||
### Medium
|
||||
|
||||
- **O3 — ADR-004 service-role file table missing ACCESS.md + BACKUP.md** ·
|
||||
`docs/decisions/004-docker-model.md:48` · *consistency* — CLAUDE.md + ADR-021/022 now
|
||||
mandate both for service roles; the canonical table lists only SECURITY.md + VERIFY.md.
|
||||
(Prior "missing VERIFY.md" is resolved; this is the next evolution.) **new**
|
||||
- **O4 — CAPABILITIES nvim/tmux exclusion ↔ dev_env built** ·
|
||||
`docs/CAPABILITIES.md:149` · *consistency* — listed as a confirmed exclusion
|
||||
("server-only"), but `dev_env` (built+applied to ubongo) installs exactly that. Carve
|
||||
out the control-node/AI-worker exception (ADR-015). **new**
|
||||
- **O5 — phantom `make deploy PLAYBOOK=upgrade`** · `docs/decisions/002-security.md:82` ·
|
||||
*drift* — no `upgrade.yml` exists; ADR-011 is unbuilt. Add a "(planned)" caveat. **new**
|
||||
- **O6 — hosts.yml stubs missing `offsite_hosts` group** ·
|
||||
`inventories/{production,staging}/hosts.yml` · *drift* — the generator emits it (one of
|
||||
four VALID_GROUPS); the hand-stubs predate the standard. Regenerate via
|
||||
`make tf-inventory` (don't hand-edit). (Prior "askari group unnamed" is resolved.) **new**
|
||||
- **O7 — new-host runbook Part E vs ubongo reality** · `docs/runbooks/new-host.md:81-130`
|
||||
· *drift* — instructs creating an `ansible` user / `ssh ansible@`; STATUS records ubongo
|
||||
is managed as `sjat`, ansible-user bootstrap pending. **new**
|
||||
- **O8 — dev_env untagged `set_fact` under tagged consumers** ·
|
||||
`roles/dev_env/tasks/per_user.yml:2-9` · *conformance* — partial `--tags users|config`
|
||||
runs skip the `dev_env__home` set_fact and fail. Tag the preflight `[users, config]` or
|
||||
`always`. **new**
|
||||
- **O9 — ubongo address outside ADR-007 subnets** · `STATUS.md:31 ↔ 007-network.md` ·
|
||||
*consistency* — 10.20.10.151 is in neither srv (10.20.0.0/24) nor mgmt (10.10.0.0/24);
|
||||
`base__firewall_control_addr` depends on it. Already a tracked follow-up in the
|
||||
ubongo-build plan. Reconcile address or ADR-007. **new**
|
||||
|
||||
### Low
|
||||
|
||||
- **O10 — README ADR list stops at 017** · `README.md:104` · *drift* — 018–023 exist;
|
||||
extend or trim to a pointer. **recurring** (evolved from prior O3)
|
||||
- **O11 — ADR section-order vs ADR-023 §2** · `008:3, 014:98, 016:91, 017:66, 018:73` ·
|
||||
*conformance* — Status-not-first / Decision-late; passes lint (order not gated) but not
|
||||
the standard. Presentational restructure. **new**
|
||||
- **O12 — ADR-007 FQDN convention vs its own example** · `007-network.md:160` ·
|
||||
*consistency* — `<service>.baobab.band` vs `forgejo.nyumbani.baobab.band`; ties to open
|
||||
TODO 4 (split-horizon). **new**
|
||||
- **O13 — dev_env `.zshrc` heritage carryovers** ·
|
||||
`roles/dev_env/files/dotfiles/zsh/.zshrc:28,55` · *consistency* — hard-coded
|
||||
`/usr/bin/rclone` alias (not installed by the role) + unguarded `direnv` hook. **new**
|
||||
- **O14 — oh_my_posh config tasks untagged** · `roles/dev_env/tasks/oh_my_posh.yml:15-26`
|
||||
· *consistency* — inconsistent `config` tagging vs per_user.yml. **new**
|
||||
- **O15 — tfvars.example `pve01` vs ADR-007 `pve0`** ·
|
||||
`terraform/environments/*/terraform.tfvars.example:9` · *consistency* — verify the real
|
||||
node name, then align. **new**
|
||||
- **O16 — ADR-013/015 "See also:" vs `## Related`** · *consistency* — stylistic; convert
|
||||
for uniformity. **new**
|
||||
- **O17 — empty scaffold `handlers/main.yml`** · `roles/{dev_env,base}/handlers/main.yml`
|
||||
· *cruft* — confirm convention or delete. **new**
|
||||
- **O18 — docs/README.md + inventories/README.md narrower than reality** · *consistency*
|
||||
— omit several real subdirs / the offsite_hosts group. **new**
|
||||
|
||||
## Deferral checklist (Phase 2)
|
||||
|
||||
| Source | Items | Verdict |
|
||||
|---|---|---|
|
||||
| ADR-011 Deferred/Open | 5 (snapshot driver, cadences, health-check harness home, classification home, staging-first) | **All genuinely still open** — cross-checked against later ADRs + TODO 16. None silently resolved. |
|
||||
| ADR-015 Deferred | #1 mesh VPN, #2 service-UI, #3 build | **All marked RESOLVED in place** (ADR-016 / ADR-017 / 2026-06-11 build). |
|
||||
|
||||
**Stale-deferred found: 0.** The recurring FRICTION.md miss did not recur this run.
|
||||
|
||||
## Scan false positives (folded in, not actionable)
|
||||
|
||||
- `broken-path-ref STATUS.md:38` — STATUS legitimately documents `roles/docker_host/` as
|
||||
"Not in git." (intentional reference to an unbuilt role).
|
||||
- `broken-adr-ref` ×4 — `ADR-099`/`ADR-100` in `tests/test_repo_scan.py` and the
|
||||
adr-structure plan are intentional **test fixtures** for the scanner's bad-ref check.
|
||||
- `marker` ×14 — all in `docs/superpowers/{plans,specs}/*` (historical commit-message
|
||||
TODOs / plan steps) or prose discussing "over-tagging" as a concept. Not cruft.
|
||||
|
||||
## Prior-run diff (vs 2026-06-05)
|
||||
|
||||
**Resolved (7):** O1 VERIFY.md row · O2 new-role VERIFY step · O4 askari group naming ·
|
||||
O5 backend.tf relabel · O6 ADR-014 reproducibility · O11 CAPABILITIES Level-4 row ·
|
||||
O12 TODO 3.10. **Partial:** O3 (docs tree fixed in AF4; ADR-list carried as O10).
|
||||
**Not re-detected (verify next run):** O7–O10 (ADR-011 still Proposed).
|
||||
|
||||
## Follow-up prompt (copy-paste)
|
||||
|
||||
> Act on the open findings from `docs/reviews/2026-06-11-review.md`. Priority order:
|
||||
> 1. **O1 (high):** `make lint` is red on `main` — `playbooks/site.yml` imports the
|
||||
> non-existent `docker_host` role. Pick an interim posture (guard/skip the play, or
|
||||
> `make new-role NAME=docker_host` to scaffold a stub, or exclude from syntax-check
|
||||
> until built) so the trunk lints clean again, and record the choice in STATUS.md.
|
||||
> 2. **O2 (high):** Resolve the ADR-004 ↔ ADR-022 backup-scope contradiction —
|
||||
> update ADR-004's "not in scope of this repo" line to defer to ADR-022 (per ADR-023's
|
||||
> no-silent-reversal rule) and cross-link.
|
||||
> 3. **O3:** Add ACCESS.md + BACKUP.md rows to ADR-004's service-role file table.
|
||||
> 4. **O4:** Reconcile CAPABILITIES' nvim/tmux exclusion with the built `dev_env` role
|
||||
> (carve out the ubongo control-node exception).
|
||||
> 5. **O8 (conformance):** Tag the `dev_env__home` preflight `set_fact` so partial
|
||||
> `--tags users|config` runs don't fail.
|
||||
> 6. **O6 / O9:** Regenerate the inventory stubs to include `offsite_hosts`; reconcile
|
||||
> ubongo's 10.20.10.151 against ADR-007's subnets (or amend ADR-007).
|
||||
> 7. Sweep the remaining doc items (O5 caveat and O7 runbook — both medium; O10 ADR list, O11 ADR
|
||||
> section order, O12–O18) as a single docs-hygiene batch.
|
||||
> Run `make lint` before committing; commit per CLAUDE.md git conventions.
|
||||
|
|
@ -1,93 +1,161 @@
|
|||
# Repo review — 2026-06-05
|
||||
# Repo review — 2026-06-11
|
||||
|
||||
- **Reviewed commit:** `f566fd1` (scan); auto-fixes landed in `666ad42`
|
||||
- **Reviewed commit:** `67f2aba` (main)
|
||||
- **Mode:** on-demand (interactive)
|
||||
- **Scope:** whole repo — 2 roles, 17 ADRs, 4 runbooks, 7 scripts; doc-heavy
|
||||
- **Prior run:** 2026-05-30 (`de38d1c`) — 7 auto-fixed, 17 open
|
||||
- **Previous run:** `2026-06-05` (commit `f566fd1`)
|
||||
- **Process:** Phase 0 deterministic scan → 5 parallel shard reviewers + 1 cross-cutting
|
||||
reviewer → synthesis, deferral-checklist resolution, prior-run diff → safe auto-fixes.
|
||||
|
||||
## Summary
|
||||
|
||||
| | high | medium | low | total |
|
||||
| | High | Medium | Low | Total |
|
||||
|---|---|---|---|---|
|
||||
| Auto-fixed | 2 | 0 | 2 | 4 |
|
||||
| Open (report-only) | 0 | 5 | 7 | 12 |
|
||||
| **Auto-fixed** | 1 | 2 | 2 | 5 |
|
||||
| **Open (report-only)** | 2 | 7 | 9 | 18 |
|
||||
|
||||
This review followed a session of heavy documentation work (ADR-015 `ubongo`,
|
||||
ADR-016 NetBird mesh, ADR-017 Level-4 verification). Most findings are **propagation
|
||||
gaps** — a new decision landed but an older doc still reflects the prior design.
|
||||
By dimension (open): conformance 3 · consistency 9 · drift 5 · cruft 1.
|
||||
|
||||
**New deferral check exercised.** `repo-scan.py` now enumerates open ADR "Deferred/
|
||||
Open" items and flags any that another file calls resolved but the ADR leaves unmarked. This run: 6
|
||||
open-deferred-items surfaced, **all confirmed genuinely open** by the cross-cutting
|
||||
reviewer (ADR-011 #1–5, ADR-015 #3), **0 stale-deferred**. The check produced no false
|
||||
resolutions and the judgement layer agreed — working as designed.
|
||||
**Headline:** `make lint` is currently **red on `main`** — `playbooks/site.yml` imports the
|
||||
not-yet-existent `docker_host` role (confirmed at clean HEAD, unrelated to this run's
|
||||
edits). That breaks CLAUDE.md's "main must always work" / "Never skip lint" contract and
|
||||
is the top open finding (O1). The bulk of the rest is documentation drift created by the
|
||||
recent `base` (firewall) + `dev_env` build wave: several READMEs/playbook notes still
|
||||
described the roles as "empty / not built." Those were the safe auto-fixes.
|
||||
|
||||
## Auto-fixes applied (`666ad42`)
|
||||
**Good news:** 7 of the 12 open findings from the 2026-06-05 run are confirmed resolved
|
||||
(VERIFY.md row + runbook step, backend.tf relabel, askari group naming, ADR-014
|
||||
reproducibility, CAPABILITIES Level-4 row, TODO 3.10). The deferral checklist is clean —
|
||||
**0 stale-deferred** this run (the recurring miss logged in FRICTION.md did not recur).
|
||||
|
||||
| id | dim | sev | location | fix |
|
||||
|---|---|---|---|---|
|
||||
| AF1 | consistency | high | `docs/decisions/005-bootstrapping.md:36`, `docs/runbooks/new-host.md:62,71` | Removed "Terraform writes the host's DNS A record" — contradicts ADR-009 (the `dns` role owns the zone). **Recurring**: the 2026-05-30 run fixed the same contradiction in README/ADR-003; it reappeared in two more files. |
|
||||
| AF2 | consistency | high | `docs/decisions/005-bootstrapping.md:8` | Control node described as cloned from the cloud-init template; ADR-015 makes `ubongo` a physical box installed directly. Corrected. |
|
||||
| AF3 | consistency | low | `CLAUDE.md:197` | Added the missing `docs/testing/service-verify-template.md` row to Further reading (parallels the security-template row). |
|
||||
| AF4 | cruft | low | `docs/TODO.md:79` | Typos: "we we" → "we"; "seperate" → "separate". |
|
||||
## Auto-fixes applied
|
||||
|
||||
## Open findings (report-only)
|
||||
Markdown / YAML-comment only; no runtime behaviour, logic, vars, or task order touched.
|
||||
|
||||
### VERIFY.md propagation cluster (ADR-017 not fully threaded through)
|
||||
| ID | Sev | File(s) | What |
|
||||
|---|---|---|---|
|
||||
| AF1 | high | `roles/README.md` | Rewrote stale "base & docker_host are empty untracked dirs, site.yml would fail on a clean clone" → base partially built (firewall), docker_host not yet created, dev_env built+applied. |
|
||||
| AF2 | med | `playbooks/site.yml` | NOTE no longer claims base is unbuilt / "fails on a clean clone"; now reflects firewall-only base + missing docker_host. |
|
||||
| AF3 | med | `playbooks/README.md` | Dropped the "currently a no-op" claim; added a `workstation.yml` bullet. |
|
||||
| AF4 | low | `README.md` | Added `docs/access/`, `docs/backup/`, `roles/dev_env/`, `playbooks/workstation.yml` to the project-structure tree. |
|
||||
| AF5 | low | `docs/decisions/016-mesh-vpn.md`, `docs/decisions/020-firewall.md` | Added the reciprocal `ADR-021` cross-reference that ADR-021 says it amended in. |
|
||||
|
||||
| id | sev | location | finding | suggested fix |
|
||||
|---|---|---|---|---|
|
||||
| O1 | medium | `docs/decisions/004-docker-model.md` (file table) | The service-role standard lists `SECURITY.md` but not `VERIFY.md`, though ADR-017 + CLAUDE.md:85 now mandate it. | Add a `VERIFY.md` row to ADR-004's file table. |
|
||||
| O2 | medium | `docs/runbooks/new-role.md` (step 9 → Commit) | No step to write `VERIFY.md` for service roles (only `SECURITY.md`). Makes `STATUS.md:17` ("runbooks current and mutually reconciled") slightly overstated. | Add a "write the per-service verification spec" step mirroring the SECURITY.md step. |
|
||||
| O3 | low | `README.md:58-60, 94` | ADR list stops at 001–009 (010–017 absent); the `docs/` tree omits `security/`, `testing/`, `hardware/`. | Extend the ADR list (or point to `docs/decisions/` + CLAUDE.md's table); expand the `docs/` subtree. |
|
||||
> `make lint` was re-run after the fixes: it fails **only** on the pre-existing
|
||||
> `docker_host` syntax-check (O1), identical to clean HEAD. No auto-fix introduced or
|
||||
> changed any lint result, so none were reverted.
|
||||
|
||||
### Design gaps from the recent ADRs
|
||||
## Open findings (prioritised)
|
||||
|
||||
| id | sev | location | finding | suggested fix |
|
||||
|---|---|---|---|---|
|
||||
| O4 | medium | `CLAUDE.md:106`, `docs/decisions/009-provisioning-handoff.md:78`, `scripts/tf_to_inventory.py:24` | ADR-016 says "`askari` is Ansible-managed — its own inventory group", but no group is named anywhere; host-groups list + valid-groups set don't include it. | Decide the group name (e.g. `edge_hosts`/`hetzner_hosts`), add to CLAUDE.md host groups + ADR-009 valid groups. (`askari` is manual like the control node, so `tf_to_inventory.py` need not generate it, but the group must be valid.) |
|
||||
| O5 | medium | `docs/decisions/006-terraform.md:78` | `backend.tf` labelled "Forgejo state backend", contradicting ADR-006's own State-backend section (local state on `ubongo`; Forgejo's API is read-only). | Relabel to "local state backend (no remote backend)". |
|
||||
| O6 | medium | `docs/decisions/014-knowledge-sourcing.md:88` | Plugin-reproducibility described as open ("tracked in `docs/TODO.md`"), but TODO 10.7 is marked DONE (settings.json declares the plugin set; claude-code-setup.md covers bootstrap). | Update to reflect the resolved state; drop the forward-pointer. |
|
||||
### High
|
||||
|
||||
### Clarity / lower-priority consistency
|
||||
- **O1 — `make lint` is red on `main`** · `playbooks/site.yml:18` · *conformance*
|
||||
site.yml imports the `docker_host` role, which does not exist, so ansible-lint's
|
||||
syntax-check fails on a clean checkout. Violates "main must always work" + "Never skip
|
||||
lint" (pre-commit would block every commit unless bypassed).
|
||||
*Fix (judgement):* guard/skip the docker_host play until the role exists, scaffold a
|
||||
stub via `make new-role NAME=docker_host`, or exclude site.yml from syntax-check until
|
||||
built — and record the choice. **new**
|
||||
|
||||
| id | sev | location | finding | suggested fix |
|
||||
|---|---|---|---|---|
|
||||
| O7 | low | `docs/decisions/011-update-management.md:128` | "Digest-pinning the stateful tier" sits in the ruled-out table, but Decision #2 *adopts* `tag@digest` for stateful (TODO 16 confirms). ADR-011 is still **Proposed/draft**. | Remove/replace the ruled-out row when accepting ADR-011 (TODO 16). |
|
||||
| O8 | low | `docs/decisions/003-toolchain.md:85`, `docs/decisions/010-forgejo-ci.md:66` | "act_runner on the control node **or a dedicated runner VM**" reads ambiguously against ADR-015 (no cluster control VM). Not wrong (a runner VM is a separate option) but worth disambiguating. | Name `ubongo` as the runner host; cross-ref ADR-015; keep "dedicated runner VM" as an explicit future option. |
|
||||
| O9 | low | `docs/decisions/008-testing.md:148` | The "WireGuard tunnel establishment" Molecule-exclusion row is framed for the retired OPNsense VLAN-99 WireGuard; NetBird still uses WireGuard (`wt0`) as its data plane. | Reframe the row to the NetBird `wt0` data-plane (ADR-016). |
|
||||
| O10 | low | `docs/decisions/011-update-management.md:67` | Cross-references "the `scheduled_jobs` plan and ADR-010"; ADR-010 is Forgejo CI, not scheduled jobs (that's TODO 8.3, unbuilt). | Point to TODO 8.3 instead. |
|
||||
| O11 | low | `docs/CAPABILITIES.md` §10 | No row for the `/verify-service` (Level 4) capability though ADR-017 decided it. | Add an Operations row for `/verify-service`. |
|
||||
| O12 | low | `docs/TODO.md:30` (item 3.10) | Garbled text ("maybe something in the improvements of the methods in boma moods the point?") — unfollowable. | Rewrite the question clearly or strike it. |
|
||||
- **O2 — ADR-004 ↔ ADR-022 backup-scope contradiction** ·
|
||||
`docs/decisions/004-docker-model.md:105` · *consistency*
|
||||
ADR-004 says "Backup strategy is defined separately (not in scope of this repo)";
|
||||
ADR-022 defines a full in-repo backup strategy. Per ADR-023 (no silent reversals),
|
||||
update ADR-004's line to defer to ADR-022 and cross-link. Design decision — report. **new**
|
||||
|
||||
### Deterministic-scan noise (not fixed — known limitations)
|
||||
### Medium
|
||||
|
||||
- **`broken-path-ref` ×14** — all illustrative/future paths: report-name templates
|
||||
(`docs/testing/reviews/YYYY-MM-DD-<service>.md`) and `latest.md` files not yet
|
||||
created. The path-ref check stops at the `<placeholder>` boundary, so a templated
|
||||
path registers as a partial broken ref. *Potential scanner improvement: skip a path
|
||||
ref immediately followed by a placeholder char or a `YYYY-MM-DD` token.*
|
||||
- **`marker` ×35** — mostly prose references to `TODO.md` items, not code markers.
|
||||
Known noise; the regex already excludes `TODO.md`/alternations but not "TODO 8.2"
|
||||
prose.
|
||||
- **`open-deferred-item` ×6** — all confirmed genuinely open (see above). `0`
|
||||
stale-deferred. New check healthy.
|
||||
- **O3 — ADR-004 service-role file table missing ACCESS.md + BACKUP.md** ·
|
||||
`docs/decisions/004-docker-model.md:48` · *consistency* — CLAUDE.md + ADR-021/022 now
|
||||
mandate both for service roles; the canonical table lists only SECURITY.md + VERIFY.md.
|
||||
(Prior "missing VERIFY.md" is resolved; this is the next evolution.) **new**
|
||||
- **O4 — CAPABILITIES nvim/tmux exclusion ↔ dev_env built** ·
|
||||
`docs/CAPABILITIES.md:149` · *consistency* — listed as a confirmed exclusion
|
||||
("server-only"), but `dev_env` (built+applied to ubongo) installs exactly that. Carve
|
||||
out the control-node/AI-worker exception (ADR-015). **new**
|
||||
- **O5 — phantom `make deploy PLAYBOOK=upgrade`** · `docs/decisions/002-security.md:82` ·
|
||||
*drift* — no `upgrade.yml` exists; ADR-011 is unbuilt. Add a "(planned)" caveat. **new**
|
||||
- **O6 — hosts.yml stubs missing `offsite_hosts` group** ·
|
||||
`inventories/{production,staging}/hosts.yml` · *drift* — the generator emits it (one of
|
||||
four VALID_GROUPS); the hand-stubs predate the standard. Regenerate via
|
||||
`make tf-inventory` (don't hand-edit). (Prior "askari group unnamed" is resolved.) **new**
|
||||
- **O7 — new-host runbook Part E vs ubongo reality** · `docs/runbooks/new-host.md:81-130`
|
||||
· *drift* — instructs creating an `ansible` user / `ssh ansible@`; STATUS records ubongo
|
||||
is managed as `sjat`, ansible-user bootstrap pending. **new**
|
||||
- **O8 — dev_env untagged `set_fact` under tagged consumers** ·
|
||||
`roles/dev_env/tasks/per_user.yml:2-9` · *conformance* — partial `--tags users|config`
|
||||
runs skip the `dev_env__home` set_fact and fail. Tag the preflight `[users, config]` or
|
||||
`always`. **new**
|
||||
- **O9 — ubongo address outside ADR-007 subnets** · `STATUS.md:31 ↔ 007-network.md` ·
|
||||
*drift* — 10.20.10.151 is in neither srv (10.20.0.0/24) nor mgmt (10.10.0.0/24);
|
||||
`base__firewall_control_addr` depends on it. Already a tracked follow-up in the
|
||||
ubongo-build plan. Reconcile address or ADR-007. **new**
|
||||
|
||||
## Diff vs prior run (2026-05-30)
|
||||
### Low
|
||||
|
||||
- **Recurring:** the Terraform-writes-DNS contradiction (AF1) — fixed in README/ADR-003
|
||||
last run, reappeared in ADR-005/new-host.md. Signal that this phrasing keeps being
|
||||
copied; worth a `/review-repo`-time grep for "writes … DNS A record".
|
||||
- **New:** everything else — the repo gained ADR-010…017 and the `ubongo`/NetBird/
|
||||
Level-4 work since the prior run, so most findings are fresh propagation gaps.
|
||||
- **Resolved:** prior-run open items were largely addressed during the intervening
|
||||
doc work (control-node-as-VM, WireGuard framing, etc., now mostly reconciled).
|
||||
- **O10 — README ADR list stops at 017** · `README.md:104` · *drift* — 018–023 exist;
|
||||
extend or trim to a pointer. **recurring** (evolved from prior O3)
|
||||
- **O11 — ADR section-order vs ADR-023 §2** · `008:3, 014:98, 016:91, 017:66, 018:73` ·
|
||||
*conformance* — Status-not-first / Decision-late; passes lint (order not gated) but not
|
||||
the standard. Presentational restructure. **new**
|
||||
- **O12 — ADR-007 FQDN convention vs its own example** · `007-network.md:160` ·
|
||||
*consistency* — `<service>.baobab.band` vs `forgejo.nyumbani.baobab.band`; ties to open
|
||||
TODO 4 (split-horizon). **new**
|
||||
- **O13 — dev_env `.zshrc` heritage carryovers** ·
|
||||
`roles/dev_env/files/dotfiles/zsh/.zshrc:28,55` · *consistency* — hard-coded
|
||||
`/usr/bin/rclone` alias (not installed by the role) + unguarded `direnv` hook. **new**
|
||||
- **O14 — oh_my_posh config tasks untagged** · `roles/dev_env/tasks/oh_my_posh.yml:15-26`
|
||||
· *consistency* — inconsistent `config` tagging vs per_user.yml. **new**
|
||||
- **O15 — tfvars.example `pve01` vs ADR-007 `pve0`** ·
|
||||
`terraform/environments/*/terraform.tfvars.example:9` · *consistency* — verify the real
|
||||
node name, then align. **new**
|
||||
- **O16 — ADR-013/015 "See also:" vs `## Related`** · *consistency* — stylistic; convert
|
||||
for uniformity. **new**
|
||||
- **O17 — empty scaffold `handlers/main.yml`** · `roles/{dev_env,base}/handlers/main.yml`
|
||||
· *cruft* — confirm convention or delete. **new**
|
||||
- **O18 — docs/README.md + inventories/README.md narrower than reality** · *consistency*
|
||||
— omit several real subdirs / the offsite_hosts group. **new**
|
||||
|
||||
## Follow-up prompt
|
||||
## Deferral checklist (Phase 2)
|
||||
|
||||
> Thread the ADR-017 `VERIFY.md` convention through the remaining docs (O1–O3): add a
|
||||
> `VERIFY.md` row to ADR-004's service-role file table, a VERIFY.md step to
|
||||
> `new-role.md` (and reconcile STATUS.md:17), and refresh `README.md`'s ADR list +
|
||||
> `docs/` tree. Then settle the `askari` inventory group name (O4) and propagate it to
|
||||
> CLAUDE.md host-groups + ADR-009 valid-groups. Finally clear the stale labels O5
|
||||
> (ADR-006 backend.tf) and O6 (ADR-014 plugin reproducibility = DONE).
|
||||
| Source | Items | Verdict |
|
||||
|---|---|---|
|
||||
| ADR-011 Deferred/Open | 5 (snapshot driver, cadences, health-check harness home, classification home, staging-first) | **All genuinely still open** — cross-checked against later ADRs + TODO 16. None silently resolved. |
|
||||
| ADR-015 Deferred | #1 mesh VPN, #2 service-UI, #3 build | **All marked RESOLVED in place** (ADR-016 / ADR-017 / 2026-06-11 build). |
|
||||
|
||||
**Stale-deferred found: 0.** The recurring FRICTION.md miss did not recur this run.
|
||||
|
||||
## Scan false positives (folded in, not actionable)
|
||||
|
||||
- `broken-path-ref STATUS.md:38` — STATUS legitimately documents `roles/docker_host/` as
|
||||
"Not in git." (intentional reference to an unbuilt role).
|
||||
- `broken-adr-ref` ×4 — `ADR-099`/`ADR-100` in `tests/test_repo_scan.py` and the
|
||||
adr-structure plan are intentional **test fixtures** for the scanner's bad-ref check.
|
||||
- `marker` ×14 — all in `docs/superpowers/{plans,specs}/*` (historical commit-message
|
||||
TODOs / plan steps) or prose discussing "over-tagging" as a concept. Not cruft.
|
||||
|
||||
## Prior-run diff (vs 2026-06-05)
|
||||
|
||||
**Resolved (7):** O1 VERIFY.md row · O2 new-role VERIFY step · O4 askari group naming ·
|
||||
O5 backend.tf relabel · O6 ADR-014 reproducibility · O11 CAPABILITIES Level-4 row ·
|
||||
O12 TODO 3.10. **Partial:** O3 (docs tree fixed in AF4; ADR-list carried as O10).
|
||||
**Not re-detected (verify next run):** O7–O10 (ADR-011 still Proposed).
|
||||
|
||||
## Follow-up prompt (copy-paste)
|
||||
|
||||
> Act on the open findings from `docs/reviews/2026-06-11-review.md`. Priority order:
|
||||
> 1. **O1 (high):** `make lint` is red on `main` — `playbooks/site.yml` imports the
|
||||
> non-existent `docker_host` role. Pick an interim posture (guard/skip the play, or
|
||||
> `make new-role NAME=docker_host` to scaffold a stub, or exclude from syntax-check
|
||||
> until built) so the trunk lints clean again, and record the choice in STATUS.md.
|
||||
> 2. **O2 (high):** Resolve the ADR-004 ↔ ADR-022 backup-scope contradiction —
|
||||
> update ADR-004's "not in scope of this repo" line to defer to ADR-022 (per ADR-023's
|
||||
> no-silent-reversal rule) and cross-link.
|
||||
> 3. **O3:** Add ACCESS.md + BACKUP.md rows to ADR-004's service-role file table.
|
||||
> 4. **O4:** Reconcile CAPABILITIES' nvim/tmux exclusion with the built `dev_env` role
|
||||
> (carve out the ubongo control-node exception).
|
||||
> 5. **O8 (conformance):** Tag the `dev_env__home` preflight `set_fact` so partial
|
||||
> `--tags users|config` runs don't fail.
|
||||
> 6. **O6 / O9:** Regenerate the inventory stubs to include `offsite_hosts`; reconcile
|
||||
> ubongo's 10.20.10.151 against ADR-007's subnets (or amend ADR-007).
|
||||
> 7. Sweep the remaining medium/low items (O5 caveat, O7 runbook, O10 ADR list, O11 ADR
|
||||
> section order, O12–O18) as a single docs-hygiene batch.
|
||||
> Run `make lint` before committing; commit per CLAUDE.md git conventions.
|
||||
|
|
|
|||
551
docs/superpowers/plans/2026-06-14-public-dns-m1.md
Normal file
551
docs/superpowers/plans/2026-06-14-public-dns-m1.md
Normal file
|
|
@ -0,0 +1,551 @@
|
|||
# Public DNS (M1) Implementation Plan
|
||||
|
||||
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
|
||||
|
||||
**Goal:** Build the `public_dns` role that manages `wingu.me`'s records at Gandi LiveDNS as code, purging Gandi's seeded defaults and applying boma's anti-spoof baseline.
|
||||
|
||||
**Architecture:** A control-node role drives `community.general.gandi_livedns` over declarative record lists in `group_vars/all/public_dns.yml` (mirroring the firewall-catalog pattern). Records to keep are `state: present`; Gandi's auto-seeded defaults are `state: absent`. A `public_dns__apply` toggle lets Molecule converge without calling the API; a pytest validates the data shape; the live run happens via `make check`/`deploy PLAYBOOK=dns` on ubongo.
|
||||
|
||||
**Tech Stack:** Ansible (`community.general.gandi_livedns`, PAT auth), pytest, Gandi LiveDNS API. Secrets from `vault.gandi.pat`.
|
||||
|
||||
**Spec:** `docs/superpowers/specs/2026-06-11-public-dns-gandi-migration-design.md`
|
||||
|
||||
**Execution context:** Tasks 1–6 + 8 are authoring (any machine with the venv). **Task 7 runs on ubongo** (has the vault + Gandi egress) and is the only one that touches live Gandi.
|
||||
|
||||
---
|
||||
|
||||
## File Structure
|
||||
|
||||
- `requirements.yml` (modify) — add `community.general` (≥9.0.0) for `gandi_livedns`.
|
||||
- `roles/public_dns/` (create) — `defaults/main.yml`, `tasks/main.yml`, `meta/main.yml`, `README.md`, `molecule/default/`.
|
||||
- `inventories/production/group_vars/all/public_dns.yml` (create) — `public_dns__domain` + `public_dns__records` (present) + `public_dns__absent` (Gandi defaults).
|
||||
- `playbooks/dns.yml` (create) — control-node play running the role.
|
||||
- `tests/test_public_dns.py` (create) — pytest over the record data.
|
||||
- `docs/decisions/007-network.md`, `STATUS.md`, `docs/TODO.md`, `docs/CAPABILITIES.md` (modify) — doc reconciliation.
|
||||
|
||||
---
|
||||
|
||||
### Task 1: Add the `community.general` collection
|
||||
|
||||
**Files:**
|
||||
- Modify: `requirements.yml`
|
||||
|
||||
- [ ] **Step 1: Add the collection with the on-demand comment**
|
||||
|
||||
In `requirements.yml`, under `collections:`, append:
|
||||
|
||||
```yaml
|
||||
# community.general — gandi_livedns (public_dns role manages wingu.me at Gandi
|
||||
# LiveDNS). PAT auth requires >= 9.0.0.
|
||||
- name: community.general
|
||||
version: ">=9.0.0"
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Install it**
|
||||
|
||||
Run: `make collections`
|
||||
Expected: installs `community.general` (≥9.0.0) with no errors.
|
||||
|
||||
- [ ] **Step 3: Verify the module is available**
|
||||
|
||||
Run: `.venv/bin/ansible-doc community.general.gandi_livedns | head -5`
|
||||
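Expected: prints the module doc header (confirms the module resolves). `head -5` cuts off before the options list, so confirm PAT support separately: `.venv/bin/ansible-doc community.general.gandi_livedns | grep personal_access_token` should print a match.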
|
||||
|
||||
- [ ] **Step 4: Commit**
|
||||
|
||||
```bash
|
||||
git add requirements.yml
|
||||
git commit -m "deps: add community.general for gandi_livedns (public_dns)"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 2: Scaffold the role
|
||||
|
||||
**Files:**
|
||||
- Create: `roles/public_dns/` (via the scaffolder)
|
||||
|
||||
- [ ] **Step 1: Scaffold**
|
||||
|
||||
Run: `make new-role NAME=public_dns`
|
||||
Expected: `Role public_dns scaffolded at roles/public_dns/` (creates `tasks/`, `handlers/`, `defaults/`, `meta/`, `templates/`, `files/`, `molecule/default/`, `README.md`).
|
||||
|
||||
- [ ] **Step 2: Commit the scaffold**
|
||||
|
||||
```bash
|
||||
git add roles/public_dns
|
||||
git commit -m "scaffold(public_dns): empty role structure"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 3: Record data + validation test (TDD)
|
||||
|
||||
**Files:**
|
||||
- Test: `tests/test_public_dns.py`
|
||||
- Create: `inventories/production/group_vars/all/public_dns.yml`
|
||||
|
||||
- [ ] **Step 1: Write the failing test**
|
||||
|
||||
Create `tests/test_public_dns.py`:
|
||||
|
||||
```python
|
||||
import pathlib
|
||||
|
||||
import yaml
|
||||
|
||||
_DATA = (
|
||||
pathlib.Path(__file__).resolve().parent.parent
|
||||
/ "inventories" / "production" / "group_vars" / "all" / "public_dns.yml"
|
||||
)
|
||||
|
||||
# Gandi auto-seeds these on a fresh .me zone; boma purges them (verified 2026-06-14).
|
||||
GANDI_DEFAULTS_ABSENT = {
|
||||
("@", "A"), ("www", "CNAME"), ("webmail", "CNAME"),
|
||||
("gm1._domainkey", "CNAME"), ("gm2._domainkey", "CNAME"), ("gm3._domainkey", "CNAME"),
|
||||
("_imap._tcp", "SRV"), ("_imaps._tcp", "SRV"), ("_pop3._tcp", "SRV"),
|
||||
("_pop3s._tcp", "SRV"), ("_submission._tcp", "SRV"),
|
||||
}
|
||||
|
||||
|
||||
def _load():
|
||||
return yaml.safe_load(_DATA.read_text())
|
||||
|
||||
|
||||
def test_domain_is_wingu():
|
||||
assert _load()["public_dns__domain"] == "wingu.me"
|
||||
|
||||
|
||||
def test_present_records_well_formed():
|
||||
for r in _load()["public_dns__records"]:
|
||||
assert r["record"] and r["type"]
|
||||
assert isinstance(r["values"], list) and r["values"]
|
||||
|
||||
|
||||
def test_anti_spoof_baseline_present():
|
||||
recs = {(r["record"], r["type"]): r["values"] for r in _load()["public_dns__records"]}
|
||||
assert recs[("@", "MX")] == ["0 ."] # null MX
|
||||
assert recs[("@", "TXT")] == ['"v=spf1 -all"'] # SPF deny-all
|
||||
assert recs[("_dmarc", "TXT")] == ['"v=DMARC1; p=reject;"']
|
||||
|
||||
|
||||
def test_gandi_defaults_marked_absent():
|
||||
absent = {(r["record"], r["type"]) for r in _load()["public_dns__absent"]}
|
||||
assert GANDI_DEFAULTS_ABSENT <= absent
|
||||
|
||||
|
||||
def test_no_record_both_present_and_absent():
|
||||
present = {(r["record"], r["type"]) for r in _load()["public_dns__records"]}
|
||||
absent = {(r["record"], r["type"]) for r in _load()["public_dns__absent"]}
|
||||
assert present.isdisjoint(absent)
|
||||
|
||||
|
||||
def test_no_duplicate_present_records():
|
||||
keys = [(r["record"], r["type"]) for r in _load()["public_dns__records"]]
|
||||
assert len(keys) == len(set(keys))
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Run it to verify it fails**
|
||||
|
||||
Run: `.venv/bin/python -m pytest tests/test_public_dns.py -v`
|
||||
Expected: FAIL (the data file does not exist yet — `FileNotFoundError`).
|
||||
|
||||
- [ ] **Step 3: Create the record data**
|
||||
|
||||
Create `inventories/production/group_vars/all/public_dns.yml`:
|
||||
|
||||
```yaml
|
||||
---
|
||||
# Public DNS — wingu.me at Gandi LiveDNS, managed by the public_dns role (M1).
|
||||
# Mesh/LAN-only by default: only deliberate public records live here. PAT in
|
||||
# vault.gandi.pat. See docs/decisions/007-network.md and the M1 spec.
|
||||
public_dns__domain: wingu.me
|
||||
|
||||
# Present — anti-spoof baseline for a no-mail domain (overwrites Gandi's seeded mail set).
|
||||
public_dns__records:
|
||||
- { record: "@", type: MX, values: ["0 ."], ttl: 3600 }
|
||||
- { record: "@", type: TXT, values: ['"v=spf1 -all"'], ttl: 3600 }
|
||||
- { record: _dmarc, type: TXT, values: ['"v=DMARC1; p=reject;"'], ttl: 3600 }
|
||||
# Service records appear as public-tier needs arise (askari A in M4).
|
||||
# Mesh/LAN-only services never appear here.
|
||||
|
||||
# Absent — Gandi's auto-seeded defaults we don't want (purged once, idempotent thereafter).
|
||||
public_dns__absent:
|
||||
- { record: "@", type: A } # Gandi parking IP
|
||||
- { record: www, type: CNAME } # Gandi web-redirect
|
||||
- { record: webmail, type: CNAME } # Gandi webmail
|
||||
- { record: gm1._domainkey, type: CNAME } # Gandi DKIM
|
||||
- { record: gm2._domainkey, type: CNAME }
|
||||
- { record: gm3._domainkey, type: CNAME }
|
||||
- { record: _imap._tcp, type: SRV } # Gandi mail autodiscovery
|
||||
- { record: _imaps._tcp, type: SRV }
|
||||
- { record: _pop3._tcp, type: SRV }
|
||||
- { record: _pop3s._tcp, type: SRV }
|
||||
- { record: _submission._tcp, type: SRV }
|
||||
```
|
||||
|
||||
- [ ] **Step 4: Run the test to verify it passes**
|
||||
|
||||
Run: `.venv/bin/python -m pytest tests/test_public_dns.py -v`
|
||||
Expected: PASS (6 passed).
|
||||
|
||||
- [ ] **Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add tests/test_public_dns.py inventories/production/group_vars/all/public_dns.yml
|
||||
git commit -m "feat(public_dns): wingu.me record data + validation test"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 4: Role implementation (defaults, tasks, meta, README)
|
||||
|
||||
**Files:**
|
||||
- Modify: `roles/public_dns/defaults/main.yml`
|
||||
- Modify: `roles/public_dns/tasks/main.yml`
|
||||
- Modify: `roles/public_dns/meta/main.yml`
|
||||
- Modify: `roles/public_dns/README.md`
|
||||
|
||||
- [ ] **Step 1: Write `defaults/main.yml`**
|
||||
|
||||
```yaml
|
||||
---
|
||||
# public_dns — manage the public zone at Gandi LiveDNS as code (M1).
|
||||
# Record data (public_dns__domain / __records / __absent) lives in group_vars/all.
|
||||
# See docs/decisions/007-network.md.
|
||||
public_dns__apply: true # set false to validate without calling the Gandi API (Molecule)
|
||||
public_dns__default_ttl: 1800 # TTL when a record omits one
|
||||
public_dns__domain: "" # overridden in group_vars/all
|
||||
public_dns__records: [] # present records
|
||||
public_dns__absent: [] # records to remove
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Write `tasks/main.yml`**
|
||||
|
||||
```yaml
|
||||
---
|
||||
- name: Assert public DNS data is sane
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- public_dns__domain | length > 0
|
||||
- public_dns__records | selectattr('type', 'equalto', 'MX') | list | length > 0
|
||||
fail_msg: >-
|
||||
public_dns__domain must be set and a null-MX anti-spoof record declared in
|
||||
public_dns__records (group_vars/all/public_dns.yml).
|
||||
run_once: true
|
||||
|
||||
- name: Ensure desired records are present (Gandi LiveDNS)
|
||||
community.general.gandi_livedns:
|
||||
domain: "{{ public_dns__domain }}"
|
||||
record: "{{ item.record }}"
|
||||
type: "{{ item.type }}"
|
||||
values: "{{ item.values }}"
|
||||
ttl: "{{ item.ttl | default(public_dns__default_ttl) }}"
|
||||
state: present
|
||||
personal_access_token: "{{ vault.gandi.pat }}"
|
||||
loop: "{{ public_dns__records }}"
|
||||
loop_control:
|
||||
label: "{{ item.record }} {{ item.type }}"
|
||||
run_once: true
|
||||
when: public_dns__apply | bool
|
||||
|
||||
- name: Ensure unwanted records are absent (Gandi LiveDNS)
|
||||
community.general.gandi_livedns:
|
||||
domain: "{{ public_dns__domain }}"
|
||||
record: "{{ item.record }}"
|
||||
type: "{{ item.type }}"
|
||||
state: absent
|
||||
personal_access_token: "{{ vault.gandi.pat }}"
|
||||
loop: "{{ public_dns__absent }}"
|
||||
loop_control:
|
||||
label: "{{ item.record }} {{ item.type }}"
|
||||
run_once: true
|
||||
when: public_dns__apply | bool
|
||||
```
|
||||
|
||||
- [ ] **Step 3: Write `meta/main.yml`**
|
||||
|
||||
```yaml
|
||||
---
|
||||
galaxy_info:
|
||||
author: sjat
|
||||
description: Manage boma's public DNS zone (wingu.me) at Gandi LiveDNS as code.
|
||||
license: MIT
|
||||
min_ansible_version: "2.17"
|
||||
platforms:
|
||||
- name: Debian
|
||||
versions:
|
||||
- trixie
|
||||
dependencies: []
|
||||
```
|
||||
|
||||
- [ ] **Step 4: Write `README.md`**
|
||||
|
||||
```markdown
|
||||
# public_dns
|
||||
|
||||
Manages boma's public DNS zone (**wingu.me**) at **Gandi LiveDNS** as code, via
|
||||
`community.general.gandi_livedns` (PAT auth from `vault.gandi.pat`). Provider-agnostic
|
||||
name on purpose. Run from the control node: `make check PLAYBOOK=dns`, then `make deploy PLAYBOOK=dns`.
|
||||
|
||||
Mesh/LAN-only by default — only deliberate public records live in the zone (the
|
||||
anti-spoof baseline now; `askari` in M4). Everything else is reached over LAN/mesh and
|
||||
never appears here.
|
||||
|
||||
## Data (in `group_vars/all/public_dns.yml`)
|
||||
|
||||
| Var | Meaning |
|
||||
|---|---|
|
||||
| `public_dns__domain` | the zone (`wingu.me`) |
|
||||
| `public_dns__records` | records to ensure **present** (`record`, `type`, `values`, optional `ttl`) |
|
||||
| `public_dns__absent` | records to ensure **absent** (Gandi's auto-seeded defaults) |
|
||||
|
||||
## Behaviour knobs (`defaults/main.yml`)
|
||||
|
||||
| Var | Default | Meaning |
|
||||
|---|---|---|
|
||||
| `public_dns__apply` | `true` | set `false` to validate without calling the Gandi API (Molecule) |
|
||||
| `public_dns__default_ttl` | `1800` | TTL when a record omits one |
|
||||
|
||||
## Notes
|
||||
|
||||
The zone is reconciled **additively** plus an explicit `absent` list (Gandi seeds 13
|
||||
default records on a new `.me`; we purge the unwanted 11 and overwrite MX/SPF with the
|
||||
anti-spoof baseline). Full-zone authoritative pruning is a future enhancement (TODO 8.3).
|
||||
```
|
||||
|
||||
- [ ] **Step 5: Lint**
|
||||
|
||||
Run: `make lint`
|
||||
Expected: `Passed: 0 failure(s)` and `check-tags: OK`.
|
||||
|
||||
- [ ] **Step 6: Commit**
|
||||
|
||||
```bash
|
||||
git add roles/public_dns
|
||||
git commit -m "feat(public_dns): role tasks, defaults, meta, README"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 5: Molecule scenario (no live API)
|
||||
|
||||
**Files:**
|
||||
- Modify: `roles/public_dns/molecule/default/converge.yml`
|
||||
- Modify: `roles/public_dns/molecule/default/verify.yml`
|
||||
|
||||
- [ ] **Step 1: Write `converge.yml` (apply disabled, sample data)**
|
||||
|
||||
```yaml
|
||||
---
|
||||
- name: Converge
|
||||
hosts: all
|
||||
gather_facts: true
|
||||
vars:
|
||||
public_dns__apply: false # never call the Gandi API from a container
|
||||
public_dns__domain: example.test
|
||||
public_dns__records:
|
||||
- { record: "@", type: MX, values: ["0 ."], ttl: 3600 }
|
||||
- { record: "@", type: TXT, values: ['"v=spf1 -all"'], ttl: 3600 }
|
||||
public_dns__absent:
|
||||
- { record: www, type: CNAME }
|
||||
roles:
|
||||
- role: public_dns
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Write `verify.yml`**
|
||||
|
||||
```yaml
|
||||
---
|
||||
- name: Verify
|
||||
hosts: all
|
||||
gather_facts: false
|
||||
tasks:
|
||||
- name: Role variables resolved
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- public_dns__domain == "example.test"
|
||||
- public_dns__apply | bool == false
|
||||
msg: "public_dns defaults/vars did not resolve as expected"
|
||||
tags: [verify]
|
||||
```
|
||||
|
||||
- [ ] **Step 3: Run Molecule**
|
||||
|
||||
Run: `make test ROLE=public_dns`
|
||||
Expected: PASS — converge applies the role (the `assert` passes; the `gandi_livedns` tasks are skipped because `public_dns__apply: false`), verify passes, idempotence clean.
|
||||
|
||||
- [ ] **Step 4: Commit**
|
||||
|
||||
```bash
|
||||
git add roles/public_dns/molecule
|
||||
git commit -m "test(public_dns): Molecule scenario (apply disabled, no live API)"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 6: The `dns.yml` playbook
|
||||
|
||||
**Files:**
|
||||
- Create: `playbooks/dns.yml`
|
||||
|
||||
- [ ] **Step 1: Write the play**
|
||||
|
||||
```yaml
|
||||
---
|
||||
# dns.yml — manage the public DNS zone (wingu.me) at Gandi LiveDNS as code.
|
||||
# Runs on the control node (ubongo) against the Gandi API — no host config.
|
||||
# Run: make check PLAYBOOK=dns then make deploy PLAYBOOK=dns
|
||||
- name: Manage public DNS (Gandi LiveDNS)
|
||||
hosts: control
|
||||
connection: local
|
||||
gather_facts: false
|
||||
become: false
|
||||
roles:
|
||||
- role: public_dns
|
||||
tags: [public_dns]
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Lint (verifies the role-name tag on the import)**
|
||||
|
||||
Run: `make lint`
|
||||
Expected: `Passed: 0 failure(s)` and `check-tags: OK (... role imports verified)`.
|
||||
|
||||
- [ ] **Step 3: Commit**
|
||||
|
||||
```bash
|
||||
git add playbooks/dns.yml
|
||||
git commit -m "feat(public_dns): dns.yml play (control-node, Gandi LiveDNS)"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 7: Live run on ubongo (purge + baseline) — gated
|
||||
|
||||
> **Runs on ubongo only** (vault + Gandi egress). `rbw unlock` first. This is the one
|
||||
> task that mutates live Gandi; review the check-mode diff before deploying.
|
||||
|
||||
- [ ] **Step 1: Dry-run (check mode + diff)**
|
||||
|
||||
Run: `make check PLAYBOOK=dns`
|
||||
Expected: the diff shows the 3 present records being set (null MX, SPF `-all`, DMARC `reject`) and the 11 Gandi defaults being removed. **Review it.**
|
||||
|
||||
- [ ] **Step 2: Apply**
|
||||
|
||||
Run: `make deploy PLAYBOOK=dns`
|
||||
Expected: `changed` for the present + absent records; no errors.
|
||||
|
||||
- [ ] **Step 3: Verify idempotence**
|
||||
|
||||
Run: `make deploy PLAYBOOK=dns`
|
||||
Expected: `ok=... changed=0` — a second run makes no changes.
|
||||
|
||||
- [ ] **Step 4: Verify with dig**
|
||||
|
||||
```bash
|
||||
dig +short MX wingu.me # expect: 0 .
|
||||
dig +short TXT wingu.me # expect: "v=spf1 -all"
|
||||
dig +short TXT _dmarc.wingu.me # expect: "v=DMARC1; p=reject;"
|
||||
dig +short www.wingu.me # expect: empty (CNAME removed)
|
||||
```
|
||||
Expected: as annotated (allow for TTL/propagation).
|
||||
|
||||
- [ ] **Step 5: No commit** — this task changes live Gandi, not the repo.
|
||||
|
||||
---
|
||||
|
||||
### Task 8: Documentation reconciliation
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/decisions/007-network.md`
|
||||
- Modify: `STATUS.md`
|
||||
- Modify: `docs/TODO.md`
|
||||
- Modify: `docs/CAPABILITIES.md`
|
||||
|
||||
- [ ] **Step 1: Amend ADR-007 — naming scheme row**
|
||||
|
||||
Replace the `Public service FQDN` row of the naming-scheme table:
|
||||
|
||||
```
|
||||
| Public service FQDN | `<service>.baobab.band` | `forgejo.nyumbani.baobab.band` |
|
||||
```
|
||||
with:
|
||||
|
||||
```
|
||||
| Public service FQDN | `<service>.wingu.me` | `vaultwarden.wingu.me` |
|
||||
| Off-site (VPS) FQDN | `<service>.askari.wingu.me` | `netbird.askari.wingu.me` |
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Amend ADR-007 — public zone + scheme**
|
||||
|
||||
Replace the **Public zone** paragraph:
|
||||
|
||||
```
|
||||
**Public zone**: `baobab.band` — served by external DNS (Cloudflare or equivalent).
|
||||
Public-facing services resolve to the public IP or Cloudflare proxy.
|
||||
```
|
||||
with:
|
||||
|
||||
```
|
||||
**Public zone**: `wingu.me` — Gandi LiveDNS, **managed as code** by the `public_dns`
|
||||
role (`vault.gandi.pat`). Three-tier naming: infra `<host>.boma.wingu.me` (internal),
|
||||
services `<service>.wingu.me` (split-horizon), off-site `<service>.askari.wingu.me`.
|
||||
`nyumbani` is retired. **Mesh/LAN-only by default**: home services have no public record
|
||||
(reached over LAN or the NetBird mesh); only deliberate exceptions are published.
|
||||
The project is `boma`; the domain is `wingu.me` (see the M1 spec). The legacy
|
||||
`baobab.band` zone (Cloudflare) is out of scope here.
|
||||
```
|
||||
|
||||
- [ ] **Step 3: Update the split-horizon example**
|
||||
|
||||
In the **Split-horizon** paragraph, replace the example `forgejo.nyumbani.baobab.band`
|
||||
with `vaultwarden.wingu.me` (internal → private proxy IP; public → only if a deliberate
|
||||
exception). Leave the internal-zone (`boma.baobab.band` → to become `boma.wingu.me` when
|
||||
the `dns` role lands in Phase 2) wording unchanged; add a parenthetical: *(internal zone is renamed
|
||||
to `boma.wingu.me` when the `dns` role is built — Phase 2)*.
|
||||
|
||||
- [ ] **Step 4: Mark STATUS — public_dns built**
|
||||
|
||||
In `STATUS.md`, under "Real and working today", add a row:
|
||||
|
||||
```
|
||||
| `roles/public_dns/` + `playbooks/dns.yml` | **Built + applied.** Manages wingu.me at Gandi LiveDNS as code (`community.general.gandi_livedns`, PAT from `vault.gandi.pat`); purged Gandi's seeded defaults, applied the anti-spoof baseline (null MX, SPF `-all`, DMARC reject). Mesh/LAN-only default. M1 of the roadmap. |
|
||||
```
|
||||
|
||||
- [ ] **Step 5: Resolve TODO 4**
|
||||
|
||||
In `docs/TODO.md`, change item 4 to struck-through/decided:
|
||||
|
||||
```
|
||||
4. ~~**Split-horizon FQDN** — adopt split-horizon FQDN with or without nyumbani?~~
|
||||
DECIDED (M1): three-tier scheme on `wingu.me`; `nyumbani` dropped; mesh/LAN-only
|
||||
default. See `docs/decisions/007-network.md` + the M1 spec.
|
||||
```
|
||||
|
||||
- [ ] **Step 6: Add a CAPABILITIES row**
|
||||
|
||||
In `docs/CAPABILITIES.md`, near the Internal DNS row, add:
|
||||
|
||||
```
|
||||
| Public DNS | `public_dns` role → Gandi LiveDNS | P | core | wingu.me zone as code (ADR-007) | anti-spoof baseline; mesh/LAN-only |
|
||||
```
|
||||
(Match the surrounding table's column shape; adjust the status letter to the table's convention.)
|
||||
|
||||
- [ ] **Step 7: Lint + commit**
|
||||
|
||||
Run: `make lint`
|
||||
Expected: clean.
|
||||
|
||||
```bash
|
||||
git add docs/decisions/007-network.md STATUS.md docs/TODO.md docs/CAPABILITIES.md
|
||||
git commit -m "docs(public_dns): amend ADR-007 to wingu.me/Gandi; resolve TODO 4; STATUS + CAPABILITIES"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Self-Review (completed)
|
||||
|
||||
- **Spec coverage:** role + group_vars data (Decisions 4,5) → Tasks 3,4; `gandi_livedns` + PAT (Decision 5, Verified facts) → Task 4; collections-on-demand (Decision 5) → Task 1; anti-spoof baseline + Gandi-defaults purge (Problem, Data model) → Tasks 3,7; cert scope (Decision 6) → out of scope (no cert tasks, correct); testing (check-mode/idempotence/dig + pytest) → Tasks 5,7,3; ADR-007 amendment + TODO 4/O12 → Task 8. All covered.
|
||||
- **Placeholder scan:** none — every code/content step is concrete.
|
||||
- **Type/name consistency:** `public_dns__domain`/`__records`/`__absent`/`__apply`/`__default_ttl` and `vault.gandi.pat` used identically across data, role, play, and tests. `gandi_livedns` params match the verified module signature.
|
||||
- **Note for the implementer:** Task 7 assumes ubongo. If the `gandi_livedns` `absent` call needs `values` for some record types, add them from `public_dns__absent` (verify against the pinned `community.general` version per ADR-014).
|
||||
|
|
@ -0,0 +1,195 @@
|
|||
# Design — boma's DNS home: a new domain at Gandi (DNS-as-code)
|
||||
|
||||
- **Date:** 2026-06-11 · **Revised:** 2026-06-12 (Option B — boma gets its own new domain;
|
||||
supersedes this spec's original "migrate `baobab.band` off Cloudflare" framing)
|
||||
- **Status:** Draft for review — design settled in brainstorming; pending user review,
|
||||
then implementation plan
|
||||
- **Roadmap milestone:** M1 (`docs/ROADMAP.md`)
|
||||
- **Resolves:** TODO 4 (split-horizon FQDN — with/without `nyumbani`); review finding O12
|
||||
- **Amends:** ADR-007 — boma's public zone is a **new domain at Gandi LiveDNS, managed as
|
||||
code**; the three-tier naming scheme; `nyumbani` removed; mesh/LAN-only default
|
||||
- **Becomes:** an ADR-007 amendment (no new ADR unless `public_dns` grows its own concerns)
|
||||
|
||||
---
|
||||
|
||||
## Problem
|
||||
|
||||
boma needs a DNS home. Investigating the obvious candidates ruled them out as *boma's*
|
||||
home:
|
||||
|
||||
- **`baobab.band`** is the **live legacy homelab** (on Cloudflare): `vaultwarden`,
|
||||
`nextcloud`, `matrix`/`element`, `collabora`, `ntfy`, `radio`, … in daily use, much of
|
||||
it riding `*.baobab.band` / `*.nyumbani.baobab.band` wildcards. Moving its authoritative
|
||||
DNS risks breaking production.
|
||||
- **`ziethen.dk`** is the **family's primary email** (Fastmail). Moving a live email
|
||||
domain's DNS is the highest-stakes DNS operation there is — worse, not better.
|
||||
|
||||
**Decision: register a NEW Swahili-themed domain at Gandi for boma.** Greenfield,
|
||||
zero-risk, *born at Gandi* — so it satisfies the DNS-as-code + sovereignty goal natively
|
||||
with **no migration at all**. The existing domains are decoupled: `baobab.band`'s
|
||||
Cloudflare exit / V4 decommission is a **separate, later track** (handled when boma
|
||||
replaces what it hosts), and `ziethen.dk` is untouched.
|
||||
|
||||
boma's domain is **`wingu.me`** (registered at Gandi 2026-06-14; *wingu* = Swahili for
|
||||
*cloud*). The `public_dns` role keeps it as a variable (`public_dns__domain`) so it stays
|
||||
swappable.
|
||||
|
||||
**Starting state (verified 2026-06-14):** Gandi auto-seeded the zone with **13 default
|
||||
records** — apex parking `A`, `www` web-redirect, and a full Gandi mailbox set (`MX`, SPF,
|
||||
three `*._domainkey` DKIM CNAMEs, `webmail`, IMAP/POP/submission `SRV`). None are boma's;
|
||||
wingu.me sends no mail (email stays at `ziethen.dk`). See the setup sequence for the
|
||||
one-time purge + anti-spoof baseline.
|
||||
|
||||
## Decisions (as settled)
|
||||
|
||||
1. **New domain, registered at Gandi.** No transfer, no migration, no Cloudflare/Fastmail
|
||||
entanglement. (Human registers + pays — see division of labour.)
|
||||
2. **Three-tier naming scheme** (re-homed to `wingu.me`) — see table. `nyumbani`
|
||||
**dropped**.
|
||||
3. **Mesh/LAN-only by default.** Home/cluster services have **no public record**; reached
|
||||
over LAN or the NetBird mesh. Public Gandi records only for deliberate exceptions.
|
||||
4. **DNS-as-code via a control-node `public_dns` role** driven by record data in
|
||||
`group_vars` (same pattern as the firewall catalog). Name is provider-agnostic.
|
||||
5. **Tooling: `community.general.gandi_livedns` with `personal_access_token`** (PAT).
|
||||
Re-adds `community.general` to `requirements.yml` (collections-on-demand; a committed
|
||||
role uses `gandi_livedns`), pinned `>=9.0.0`, with the naming comment.
|
||||
6. **Cert scope: DNS + PAT only.** M1 ends at the zone + PAT in vault, which *enables*
|
||||
ACME DNS-01 later. No cert issuance in M1 (reverse proxy → askari M4 / home Phase 2).
|
||||
7. **Human/agent division of labour** (see table) — register + pay + PAT are human; all
|
||||
record/IaC work is the agent's, from `ubongo`.
|
||||
8. **Explicitly out of scope:** `baobab.band` (and its Cloudflare exit / V4 decommission)
|
||||
and `ziethen.dk` — separate later tracks.
|
||||
|
||||
## Verified facts (ADR-014)
|
||||
|
||||
> verified: `community.general.gandi_livedns` requires `personal_access_token` (PAT);
|
||||
> `api_key` is deprecated and **rejected** by Gandi (Bearer auth replaced Apikey) ·
|
||||
> WebFetch docs.ansible.com + WebSearch (Gandi PAT announcement 2023-09; community.general
|
||||
> issue #7926) · PAT param added in **community.general 9.0.0**, **13.0.1** current ·
|
||||
> 2026-06-11
|
||||
> - Module params: `domain`, `record`, `type`, `values` (list), `ttl`, `state`
|
||||
> (`present`/`absent`). Supports **check mode + diff**.
|
||||
> - Auth is per-task: `personal_access_token: "{{ vault.gandi.pat }}"`.
|
||||
|
||||
## Naming scheme (the convention)
|
||||
|
||||
| Tier | Pattern | Authoritative source | Public? |
|
||||
|---|---|---|---|
|
||||
| Infrastructure / hosts | `<host>.boma.wingu.me` | internal zone (`dns1`/`dns2`, Phase 2) | never |
|
||||
| Home / cluster services | `<service>.wingu.me` | internal zone (split-horizon) | only deliberate exceptions |
|
||||
| Off-site / VPS services | `<service>.askari.wingu.me` | Gandi LiveDNS | yes (askari has a stable public IP) |
|
||||
|
||||
- **Project vs domain.** The project/homelab stays **`boma`** (ADR-007); **`wingu.me`** is
|
||||
its domain. `<host>.boma.wingu.me` reads as "host in the `boma` compound, on the `wingu`
|
||||
cloud" — kept distinct deliberately (`boma` wasn't available as a domain; the two layers
|
||||
fit the self-hosting ethos). Folds into the ADR-007 amendment.
|
||||
- **`nyumbani` removed** — home is the default; only the exception (`askari`) needs naming.
|
||||
- **The mesh carries "internal" to road-warriors.** NetBird pushes `dns1`/`dns2` (over
|
||||
`wt0`) as resolver for the `wingu.me` match-domain → on-LAN-or-on-mesh resolves
|
||||
internal; truly public resolves at Gandi (ties M1 ↔ ADR-016 / M5).
|
||||
- **Wildcard TLS later.** `*.wingu.me` ACME DNS-01 (Gandi PAT) gives even unexposed
|
||||
services real TLS without a public A record. Enabled by M1, issued in M4/Phase 2.
|
||||
|
||||
## Architecture — two deliverables
|
||||
|
||||
### (A) One-time setup — a short runbook (`docs/runbooks/`)
|
||||
|
||||
Greenfield, so this is small and low-risk (contrast the abandoned migration framing):
|
||||
register the domain, create the LiveDNS zone, issue the PAT. No transfer, no live-zone
|
||||
cutover.
|
||||
|
||||
### (B) `public_dns` — the reusable IaC role
|
||||
|
||||
- Runs **from the control node** (`delegate_to: localhost`, or a `dns.yml` play targeting
|
||||
`control`) against the Gandi LiveDNS API — no managed *host*, only API calls.
|
||||
- Reconciles records from **`group_vars` data** via `community.general.gandi_livedns`,
|
||||
PAT from `vault.gandi.pat`. **Check-mode/diff first**, always.
|
||||
|
||||
#### Data model (sketch)
|
||||
|
||||
```yaml
|
||||
# inventories/production/group_vars/all/public_dns.yml
|
||||
public_dns__domain: "wingu.me"
|
||||
public_dns__records:
|
||||
# Anti-spoof baseline for a no-mail domain (replaces Gandi's seeded mail set):
|
||||
- { record: "@", type: MX, values: ["0 ."], ttl: 3600 }
|
||||
- { record: "@", type: TXT, values: ['"v=spf1 -all"'], ttl: 3600 }
|
||||
- { record: _dmarc, type: TXT, values: ['"v=DMARC1; p=reject;"'], ttl: 3600 }
|
||||
# Service records appear as public-tier needs arise; near-empty at M1.
|
||||
# askari / NetBird records land in M4, e.g.:
|
||||
# - { record: askari, type: A, values: ["<hetzner-ip>"], ttl: 1800 }
|
||||
# mesh/LAN-only services are intentionally ABSENT — internal zone only.
|
||||
# PAT referenced as {{ vault.gandi.pat }} (nested vault.<service>.<key>, CLAUDE.md).
|
||||
```
|
||||
|
||||
#### Open design nuance — additive vs authoritative
|
||||
|
||||
`gandi_livedns` is **per-record** (`present`/`absent`), not whole-zone sync. Gandi seeded
|
||||
`wingu.me` with 13 default records (above), so M1 needs a **one-time purge** of those to a
|
||||
clean baseline (declare them `state: absent`, or a one-shot scripted delete), then manage
|
||||
**additively**. Full-zone authoritative sync (GET existing → remove undeclared — the
|
||||
proper end-state, and TODO 8.3's prune question) is flagged as a later enhancement.
|
||||
|
||||
## Setup sequence (the runbook)
|
||||
|
||||
Legend: **[H]** human · **[A]** agent (from `ubongo`, committed code + check-mode).
|
||||
|
||||
1. **[H]** Register `wingu.me` at Gandi; pay. **[H]** Issue a **LiveDNS-scoped PAT**
|
||||
for it; store in vault (`vault.gandi.pat`) via rbw.
|
||||
2. **[A]** Author the `public_dns` role + `public_dns__records` data (incl. the anti-spoof
|
||||
baseline); add `community.general` to `requirements.yml` (≥9.0.0, with comment); commit.
|
||||
3. **[A]** One-time: **purge Gandi's 13 seeded defaults** (parking `A`, `www` redirect,
|
||||
Gandi mail `MX`/SPF/DKIM/`webmail`/`SRV`) down to the boma baseline.
|
||||
4. **[A]** `make check` (diff vs live Gandi) → `make deploy` to load records → `dig`
|
||||
verify. Re-run `make deploy` to confirm idempotence.
|
||||
5. Thereafter the zone is reconciled as code; M4 adds the `askari`/NetBird records.
|
||||
|
||||
No registrar transfer, no nameserver flip of a live zone, no service-preservation,
|
||||
no Forgejo rename — all of that belonged to the abandoned `baobab.band` framing.
|
||||
|
||||
## Division of labour & access (security posture)
|
||||
|
||||
| Task | Who | How |
|
||||
|---|---|---|
|
||||
| Register domain + pay | Human | Identity/billing/ToS — not automatable. |
|
||||
| Issue + store the PAT | Human | LiveDNS-scoped, single-domain; into vault via rbw. |
|
||||
| `public_dns` role + record data | Agent | Committed IaC; `make check` diff. |
|
||||
| Create zone + load records + reconcile | Agent | `public_dns` on `ubongo`, PAT from vault, check-mode first. |
|
||||
|
||||
- **Minimal token scope.** Gandi PAT: **LiveDNS-only**, restricted to `wingu.me`.
|
||||
- **Token in vault** (`vault.gandi.pat`) via rbw — never pasted in chat.
|
||||
- **Execution on `ubongo`**, committed role + `make check` → `make deploy`. No agent
|
||||
sandbox holds production credentials.
|
||||
|
||||
## Testing & verification
|
||||
|
||||
External-API reconciliation does not fit container Molecule cleanly (a nuance against
|
||||
ADR-008). Instead: **`make check` (check-mode + diff)**, **idempotence** (second deploy =
|
||||
no changes), **`dig` assertions** post-load, and optionally a small pytest over the
|
||||
`public_dns__records` data shape (mirrors `test_firewall_rules.py`).
|
||||
|
||||
## Scope boundaries — what M1 is NOT
|
||||
|
||||
- **Not** a migration of `baobab.band` or `ziethen.dk` — and **not** the Cloudflare exit /
|
||||
V4 decommission. Those are separate, later tracks.
|
||||
- **Not** the internal split-horizon `dns` role (renders `<service>.wingu.me`
|
||||
privately) — that needs the `dns` role + actual home services → **Phase 2**.
|
||||
- **Not** certificate issuance or the reverse proxy — **M4 (askari) / Phase 2 (home)**.
|
||||
- **Not** authoritative whole-zone pruning — additive for now.
|
||||
|
||||
## ADR work
|
||||
|
||||
Amend **ADR-007**: boma's public zone is **`wingu.me` at Gandi LiveDNS, managed as
|
||||
code** (replaces "Cloudflare or equivalent"); record the **three-tier naming scheme**;
|
||||
remove the `nyumbani` example; state the **mesh/LAN-only default**; note `public_dns` as
|
||||
the control-node role rendering the public zone (sibling to the internal `dns` role). Note
|
||||
that `baobab.band` (legacy, Cloudflare) is **not** boma's zone and is out of ADR-007's
|
||||
scope going forward.
|
||||
|
||||
## Open items (resolve during the plan / implementation)
|
||||
|
||||
- ~~Pick the domain~~ **DONE:** `wingu.me` registered at Gandi; LiveDNS PAT verified
|
||||
(2026-06-14) and stored in vault as `vault.gandi.pat`.
|
||||
- **Pin** the `community.general` version in `requirements.yml` (≥9.0.0).
|
||||
- **Play wiring:** a dedicated `dns.yml` play (control-targeted) vs folding into an
|
||||
existing play — decide in the plan.
|
||||
146
docs/superpowers/specs/2026-06-14-askari-provisioning-design.md
Normal file
146
docs/superpowers/specs/2026-06-14-askari-provisioning-design.md
Normal file
|
|
@ -0,0 +1,146 @@
|
|||
# Design — Provisioning `askari` (Terraform + Hetzner Cloud)
|
||||
|
||||
- **Date:** 2026-06-14
|
||||
- **Status:** Draft for review — design settled in brainstorming; pending user review,
|
||||
then implementation plan
|
||||
- **Roadmap milestone:** M2 (`docs/ROADMAP.md`)
|
||||
- **Amends:** ADR-006 (Terraform scope → Proxmox **+ Hetzner**), ADR-009 (offsite
|
||||
handoff), ADR-020 (Hetzner Cloud Firewall = askari's perimeter), ADR-007/016 (`askari`
|
||||
is Terraform-provisioned, not "added manually")
|
||||
- **Becomes:** amendments to those ADRs
|
||||
|
||||
---
|
||||
|
||||
## Problem
|
||||
|
||||
`askari` (the off-site Hetzner VPS — NetBird coordinator + watchdog, later the off-site
|
||||
log subset) does not exist yet. ADR-007/016 designed it as "provisioned independently…
|
||||
added manually." Now that there's a dedicated Hetzner account + a verified API token in
|
||||
the vault, we can provision it as **IaC** instead. boma's principle (ADR-006/009) is
|
||||
"**Terraform owns VM existence; Ansible owns config**" — but scoped to Proxmox. This
|
||||
milestone **generalizes that principle to Hetzner** and stands `askari` up.
|
||||
|
||||
## Decisions (as settled)
|
||||
|
||||
1. **Terraform owns `askari`'s existence** (Approach 1) — generalize ADR-006 from "Proxmox
|
||||
VM existence" to "VM existence on **Proxmox + Hetzner**." (Rejected: Ansible
|
||||
`hetzner.hcloud` — breaks the TF/Ansible boundary; `hcloud` CLI — not stateful IaC.)
|
||||
2. **Server:** **CAX11** (ARM/Ampere, 2 vCPU / 4 GB / 40 GB), **Helsinki (`hel1`)**,
|
||||
**Debian 13**. Rescale up later if the off-site log subset needs it.
|
||||
3. **TF-managed Hetzner Cloud Firewall** as `askari`'s perimeter (the off-site
|
||||
OPNsense-analog). Starts minimal (**SSH from ubongo only**); service ports are added as
|
||||
services land (NetBird ports in M4). The ADR-020 catalog stays authoritative for the
|
||||
**host nftables** layer.
|
||||
4. **Token via `TF_VAR_hcloud_token`**, sourced from `vault.hetzner.token` at apply time
|
||||
— never in `.tfvars` (CLAUDE.md).
|
||||
5. **Handoff stays ADR-009-shaped:** `tf_to_inventory.py` is extended to emit `askari`
|
||||
into `offsite_hosts`, so `hosts.yml` stays fully generated.
|
||||
|
||||
## Verified facts (ADR-014)
|
||||
|
||||
> verified: Hetzner Cloud entry tiers · WebSearch · 2026-06-14 · **CAX11** (ARM/Ampere)
|
||||
> 2 vCPU / 4 GB / 40 GB ≈ €3.79/mo, 20 TB traffic + 1 IPv4; ARM (CAX) is **EU-locations
|
||||
> only** (incl. `hel1`). Price change for new orders from 2026-06-15.
|
||||
|
||||
> to verify when writing the Terraform module (ADR-014): the `hetznercloud/hcloud` provider version
|
||||
> to pin; the Debian 13 image slug (expected `debian-13`); CAX11 availability in `hel1`.
|
||||
|
||||
## Architecture
|
||||
|
||||
### Terraform structure
|
||||
|
||||
- **Module `terraform/modules/hetzner_vm/`** (sibling to `proxmox_vm`): inputs `name`,
|
||||
`server_type`, `location`, `image`, `ssh_keys`, `user_data`, `firewall_rules`,
|
||||
`labels`; outputs the server's `ipv4` (+ id, name).
|
||||
- **Stack `terraform/environments/offsite/`** (its own **local state** on ubongo,
|
||||
gitignored): `providers.tf` pins **`hetznercloud/hcloud`**; `main.tf` calls
|
||||
`hetzner_vm` for `askari` + an `hcloud_firewall` + an `hcloud_ssh_key`; `variables.tf`
|
||||
(incl. `hcloud_token`, `control_ssh_pubkey`, `ssh_admin_cidr`); `outputs.tf` (askari
|
||||
`ipv4`, for the handoff + DNS); `backend.tf` (local state, like the Proxmox envs).
|
||||
- **`make tf-* TF_ENV=offsite`** drives it; for `offsite` the targets first export
|
||||
`TF_VAR_hcloud_token` from `vault.hetzner.token` (a small vault→env step). `tf-apply`
|
||||
stays gated behind a shown `tf-plan` (CLAUDE.md).
|
||||
|
||||
### Provisioning → Ansible handoff
|
||||
|
||||
1. TF creates the CAX11 with a **cloud-init `user_data`** that injects **ubongo's control
|
||||
SSH public key** for first login (minimal — no config beyond the key + ensuring
|
||||
Python is present for Ansible).
|
||||
2. TF outputs `askari`'s public IPv4. `tf_to_inventory.py` (extended for the offsite
|
||||
stack) writes `askari` into the `offsite_hosts` group of `hosts.yml`.
|
||||
3. `playbooks/bootstrap.yml` runs against `askari` → creates the `ansible` user + sudoers
|
||||
(as for Proxmox hosts). **Where M2 ends.**
|
||||
4. *(Downstream, not M2):* `base` remote-access subset (M3), NetBird coordinator (M4),
|
||||
mesh enrollment + SSH-narrowed-to-`wt0` (M5).
|
||||
- A convenience **`askari.wingu.me` A record** is added via the M1 `public_dns` role
|
||||
(stable name for humans + future certs); the inventory may reference it once DNS exists.
|
||||
|
||||
### Cloud firewall (perimeter)
|
||||
|
||||
- TF `hcloud_firewall` attached to `askari`:
|
||||
- **inbound SSH (22/tcp) from ubongo's address only** (`ssh_admin_cidr` var);
|
||||
- everything else default-deny.
|
||||
- **Grows with services:** NetBird's **UDP 3478** (Coturn) + **TCP 80/443**
|
||||
(management/dashboard) are added in **M4** when the coordinator deploys — not opened to
|
||||
a non-existent listener now.
|
||||
- This is the off-site **perimeter** layer (OPNsense has no presence off-cluster);
|
||||
ADR-020's `group_vars` catalog remains the single source for the **host nftables**
|
||||
layer that `base` renders (M3).
|
||||
|
||||
### State + disaster recovery
|
||||
|
||||
- The `offsite` `terraform.tfstate` lives on ubongo and is added to the **ADR-022 backup
|
||||
scope** (the control-node TF state backup already flagged in STATUS).
|
||||
- DR is management-only: `askari` survives a homelab/ubongo outage by design, so a lost
|
||||
state is recovered by `terraform import`-ing the still-running server — no rebuild.
|
||||
|
||||
## Division of labour & access
|
||||
|
||||
| Task | Who | How |
|
||||
|---|---|---|
|
||||
| Hetzner token | Done | `vault.hetzner.token` (verified live, HTTP 200). |
|
||||
| `hetzner_vm` module + `offsite` stack + `tf_to_inventory` extension + make token-inject | Agent | Committed IaC + a pytest for the handoff. |
|
||||
| `terraform plan` (offsite) | Agent | `make tf-plan TF_ENV=offsite`, **output shown**. |
|
||||
| `terraform apply` (offsite) | Human-gated | Only after the plan is reviewed (CLAUDE.md: never apply without a shown plan). Run on ubongo. |
|
||||
| Confirm the control SSH key | Human | Which ubongo key Ansible uses to reach hosts (its public key feeds `control_ssh_pubkey`). |
|
||||
|
||||
- **Token:** `TF_VAR_hcloud_token` from vault at apply; never written to a `.tfvars` file.
|
||||
- **SSH:** cloud-init injects only the control public key; the private key stays on
|
||||
ubongo. The cloud firewall limits SSH to ubongo's address until the mesh exists.
|
||||
|
||||
## Testing & verification
|
||||
|
||||
- `terraform fmt` + **`terraform validate`** + **`make tf-plan TF_ENV=offsite`** (plan
|
||||
reviewed before any apply).
|
||||
- **pytest** for the `tf_to_inventory.py` offsite extension (mirrors the existing
|
||||
stdlib-only script tests), asserting an `askari` entry lands in `offsite_hosts`.
|
||||
- Post-apply: SSH reachability from ubongo; cloud-init ran; then `bootstrap.yml`
|
||||
connectivity. (`base`/NetBird get their own Molecule/verify in M3/M4.)
|
||||
|
||||
## Scope boundaries — what M2 is NOT
|
||||
|
||||
- **Not** the `base` hardening subset (SSH hardening, fail2ban, NetBird agent) — **M3**.
|
||||
- **Not** the NetBird coordinator or the cloud-firewall NetBird ports — **M4**.
|
||||
- **Not** mesh enrollment / narrowing SSH to `wt0` — **M5**.
|
||||
- **Not** the off-site log subset (may need a bigger instance / a volume) — later.
|
||||
|
||||
## ADR work
|
||||
|
||||
- **ADR-006** — generalize "Terraform owns VM existence" to **Proxmox + Hetzner**; add the
|
||||
`hetznercloud/hcloud` provider (no longer "the only provider is `bpg/proxmox`"); add the
|
||||
`offsite` environment + `hetzner_vm` module to Structure; note the TF-managed Hetzner
|
||||
Cloud Firewall.
|
||||
- **ADR-009** — the offsite handoff (`tf_to_inventory.py` emits `askari` → `offsite_hosts`).
|
||||
- **ADR-020** — the Hetzner Cloud Firewall is `askari`'s perimeter (OPNsense-analog);
|
||||
catalog still authoritative for host nftables.
|
||||
- **ADR-007 / ADR-016** — `askari` is Terraform-provisioned (hcloud), superseding "added
|
||||
manually."
|
||||
|
||||
## Open items (resolve during the plan / implementation)
|
||||
|
||||
- **Pin** the `hetznercloud/hcloud` provider version; confirm the `debian-13` image slug
|
||||
and CAX11/`hel1` availability (ADR-014).
|
||||
- The **make tf token-inject** mechanism for `offsite` (read `vault.hetzner.token` → export
|
||||
`TF_VAR_hcloud_token`) — shape it in the plan (rbw/ansible-vault one-liner vs a wrapper).
|
||||
- Whether the inventory references `askari` by **IPv4 (from TF output)** or by
|
||||
**`askari.wingu.me`** once the DNS record exists — decide in the plan.
|
||||
27
inventories/production/group_vars/all/public_dns.yml
Normal file
27
inventories/production/group_vars/all/public_dns.yml
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
---
|
||||
# Public DNS — wingu.me at Gandi LiveDNS, managed by the public_dns role (M1).
|
||||
# Mesh/LAN-only by default: only deliberate public records live here. PAT in
|
||||
# vault.gandi.pat. See docs/decisions/007-network.md and the M1 spec.
|
||||
public_dns__domain: wingu.me
|
||||
|
||||
# Present — anti-spoof baseline for a no-mail domain (overwrites Gandi's seeded mail set).
|
||||
public_dns__records:
|
||||
- {record: "@", type: MX, values: ["0 ."], ttl: 3600}
|
||||
- {record: "@", type: TXT, values: ['"v=spf1 -all"'], ttl: 3600}
|
||||
- {record: _dmarc, type: TXT, values: ['"v=DMARC1; p=reject;"'], ttl: 3600}
|
||||
# Service records appear as public-tier needs arise (askari A in M4).
|
||||
# Mesh/LAN-only services never appear here.
|
||||
|
||||
# Absent — Gandi's auto-seeded defaults we don't want (purged once, idempotent thereafter).
|
||||
public_dns__absent:
|
||||
- {record: "@", type: A} # Gandi parking IP
|
||||
- {record: www, type: CNAME} # Gandi web-redirect
|
||||
- {record: webmail, type: CNAME} # Gandi webmail
|
||||
- {record: gm1._domainkey, type: CNAME} # Gandi DKIM
|
||||
- {record: gm2._domainkey, type: CNAME}
|
||||
- {record: gm3._domainkey, type: CNAME}
|
||||
- {record: _imap._tcp, type: SRV} # Gandi mail autodiscovery
|
||||
- {record: _imaps._tcp, type: SRV}
|
||||
- {record: _pop3._tcp, type: SRV}
|
||||
- {record: _pop3s._tcp, type: SRV}
|
||||
- {record: _submission._tcp, type: SRV}
|
||||
|
|
@ -1,18 +1,44 @@
|
|||
$ANSIBLE_VAULT;1.1;AES256
|
||||
62313835613730303334653334393033646661323865636534353061333765326239333835643139
|
||||
6631393939363263313861656461303134383162336662380a333564343131323036343137383736
|
||||
64666265653233636631373266396132623561363766326266353638643538303936613435333530
|
||||
3864356133633663650a376362313861326263633036303664336439663030613438636339613765
|
||||
63383431376636646236346435363035333036373466613066643761646237323133633866366230
|
||||
38396333313238393630336263373063363538343865366432353138643663653638356438303738
|
||||
66313832343436616634313734343433363362613437383963383263363666613431346663376263
|
||||
62633162633962376537306262353736336435343339333266643661373538643236636631666662
|
||||
39643664303137356562313061306439623239656534323065306132643833383738623261393232
|
||||
63643434396165343631633063616161616430373130663830623936306339393933653437393931
|
||||
37633532363264636537343165316231363130613964646635666665363136623637326561323336
|
||||
30386235366261353661656231396362366263316338663135663333306434306563363464363336
|
||||
35376233303939393039646261633833656337666335636333343030343435656664306433363530
|
||||
36306636303530336262396664646331663834336235663236656636353833396437303636373133
|
||||
64313639346164656438613066636661353736613334383734633232376335323761396634363031
|
||||
65333631636337323630353165356539306531393434633163373637373739366131363734353934
|
||||
34373762303661316235353162636132623736646630663438366433376639613964
|
||||
32663033666462323861636161306437393231663035646137646130326433366638356632333463
|
||||
3338333435356631306330376134376139333233336334300a336164376539363833356431633465
|
||||
61313531366161663761373038613166303132636261363138636438316631313133326265623166
|
||||
3439643431646261340a383734386163373630633261613231643530393064303431633437343434
|
||||
34323434346265336535663637326433643837366564363633666132633537313230303731313264
|
||||
64366364626266663437303032353933653664313932383765346431303035303136326637616131
|
||||
31666237663930303035306632633765626133346561653434653131323962623730613338343532
|
||||
63363235326537636434303163646131656535376234353732366264666131366532656333383066
|
||||
62363631316533333330373763653366376162336531373539666466323934353461666433616231
|
||||
38333639333831363861326636303434316130353662336235336261346433343539366233643337
|
||||
64656564613531623533663865366138356633373065643263613832373961653237303831336539
|
||||
61643363656566396164383236383361643035383233363064313766336561626564366435626539
|
||||
37376262396234343765313430303736623038353765666337363162643666323766373333306438
|
||||
63316639363864386662373865396139313933666533333062376266393737356535366164636261
|
||||
63663764623132356131393966323563313265303261666232623033653136633763653933616166
|
||||
66373137633536313863646134633435643735356165313863343662393065306336613737356131
|
||||
36636466626639346238303239326462393966316233343531383137343633626439316130373836
|
||||
30383434653234343964353633313764386639326130373331343130336432653164383935336663
|
||||
31343166353833343535373338616464316437386163353865353363386462393038323563633837
|
||||
30666161626537316532663234366633356336363965396166333062306335346639373262303633
|
||||
32316262623037336166613466623662383134363463663136353433396237393935313661366461
|
||||
31336531396163633065346364323037356665383039633437346465306431303530336263356536
|
||||
33373033313538303464373562336131336238373433656439626462343930356363393033323736
|
||||
34663666356434393263666633666439373639383336333165663036623332336330626165376634
|
||||
31333438346132613433313162636439316531303436313436383063646438316366663661373363
|
||||
37326263343163666230363530313066383534636635636136636261333037333533303937313861
|
||||
36363137393264313734636165656631643234646634653835656666306635373761356535663232
|
||||
63626137613839333833303832623135326237616662333563626461636436363562653338343938
|
||||
39636430313739323965626362623034613364323162376161366236373439373262383036383234
|
||||
61303962633265326563386139313966626334663865623762326139666535613232373261623264
|
||||
37623730386661643662396639643737313265646532353561333537316464623064393236356230
|
||||
63666163346531333363393434643337643038303232333862353831363434313961386133613163
|
||||
63376264343036373230633130383732316332303437393936646464383630343130316432636134
|
||||
37643335633763303931623133396166646231616233653533623731643231323331393732383935
|
||||
38366564626637623737366336356433613435653631653762333833373662643634383265353266
|
||||
65613736313534646134316333343566313564383838316633386235343136383239633636303862
|
||||
34623635623530373961313434366562366564373533633839356664643064383139393561373833
|
||||
66373466323863383734663834613832663339623236656636353032663237623733303136393138
|
||||
64376331666666633361666538623138393065626664623139333832663930333065393332623235
|
||||
31633135616633336630353136326463366664646133316637313066303637636231313766616464
|
||||
30643165323530366631326238636437613664396633613163353536623934313163333330373665
|
||||
35336364393236313934653339653462663639616562366264313334313062323235346239343534
|
||||
373835613065313662636665646537633036
|
||||
|
|
|
|||
|
|
@ -4,8 +4,10 @@ Top-level orchestration playbooks. No inline vars — configuration comes from
|
|||
`group_vars/` / `host_vars/` (see CLAUDE.md).
|
||||
|
||||
- `site.yml` — full standard state: applies `base` to all hosts and `docker_host`
|
||||
to docker hosts. **Note:** those roles are empty today, so this is currently a
|
||||
no-op — see `STATUS.md`.
|
||||
to docker hosts. **Note:** `base` is only partially built (its `firewall` concern)
|
||||
and `docker_host` is scaffolded with no tasks yet, so this is incomplete — see `STATUS.md`.
|
||||
- `workstation.yml` — applies the `dev_env` role (interactive developer environment)
|
||||
to the `control` group; built and applied to `ubongo` (see `STATUS.md`).
|
||||
- `bootstrap.yml` — first-run setup for a host that may not have Python yet;
|
||||
self-contained (does not depend on the roles).
|
||||
|
||||
|
|
|
|||
12
playbooks/dns.yml
Normal file
12
playbooks/dns.yml
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
---
|
||||
# dns.yml — manage the public DNS zone (wingu.me) at Gandi LiveDNS as code.
|
||||
# Runs on the control node (ubongo) against the Gandi API — no host config.
|
||||
# Run: make check PLAYBOOK=dns then make deploy PLAYBOOK=dns
|
||||
- name: Manage public DNS (Gandi LiveDNS)
|
||||
hosts: control
|
||||
connection: local
|
||||
gather_facts: false
|
||||
become: false
|
||||
roles:
|
||||
- role: public_dns
|
||||
tags: [public_dns]
|
||||
|
|
@ -1,8 +1,9 @@
|
|||
---
|
||||
# site.yml — apply full standard state to all hosts
|
||||
# Run via: make deploy PLAYBOOK=site
|
||||
# NOTE: the `base` and `docker_host` roles are not built yet (see STATUS.md), so this
|
||||
# playbook fails on a clean clone until they exist.
|
||||
# NOTE: `base` is only partially built (its `firewall` concern; see STATUS.md) and
|
||||
# `docker_host` is scaffolded but has no tasks yet, so this playbook applies base's
|
||||
# firewall but is otherwise incomplete until those roles gain content.
|
||||
|
||||
- name: Apply base configuration to all hosts
|
||||
hosts: all
|
||||
|
|
|
|||
|
|
@ -11,3 +11,8 @@ collections:
|
|||
# authorized_key, sysctl, acl.
|
||||
- name: ansible.posix
|
||||
version: ">=1.5.0"
|
||||
|
||||
# community.general — gandi_livedns (public_dns role manages wingu.me at Gandi
|
||||
# LiveDNS). PAT auth requires >= 9.0.0.
|
||||
- name: community.general
|
||||
version: ">=9.0.0"
|
||||
|
|
|
|||
|
|
@ -8,6 +8,8 @@ Each role must have: a `molecule/default/` scenario (Debian 13), a populated
|
|||
`README.md`, and a filled-in `meta/main.yml`. Conventions: CLAUDE.md and
|
||||
`docs/runbooks/new-role.md`.
|
||||
|
||||
Current state: `base` and `docker_host` are **not built yet** — they exist only as
|
||||
empty, untracked dirs, so `site.yml` would fail on a clean clone. Build them with
|
||||
`make new-role` when defining the baseline. See `STATUS.md`.
|
||||
Current state: `base` is **partially built** — its `firewall` concern (nftables) is
|
||||
implemented and tested; the other concerns (SSH hardening, fail2ban, auditd, packages,
|
||||
users) are not yet built. `docker_host` is **scaffolded but has no tasks yet**. `dev_env` (interactive
|
||||
developer environment) is built and applied. See `STATUS.md` for the authoritative
|
||||
breakdown.
|
||||
|
|
|
|||
34
roles/docker_host/README.md
Normal file
34
roles/docker_host/README.md
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
# docker_host
|
||||
|
||||
Docker engine + Compose runtime applied to every host in the `docker_hosts` group.
|
||||
Provides the container platform that the per-service roles (one service = one role,
|
||||
ADR-004) deploy their Compose stacks onto.
|
||||
|
||||
> **Status: scaffolded, not yet implemented.** This role has no tasks yet — applying it
|
||||
> is a no-op. It is wired into `playbooks/site.yml` so the full standard state is
|
||||
> expressed end-to-end, and so `make lint` covers it. See `STATUS.md`.
|
||||
|
||||
## Planned scope
|
||||
|
||||
- Install Docker engine + the Compose plugin, version-pinned (ADR-011).
|
||||
- Daemon hardening: `iptables: false` (the host `base` firewall owns nftables, ADR-020),
|
||||
log driver, `live-restore`, user-namespace remapping where practical (ADR-002).
|
||||
- Render container forward/NAT rules into `/etc/nftables.d/*.nft` — the include hook the
|
||||
`base` role's ruleset exposes (see `roles/base/README.md`).
|
||||
- Provide the runtime the service roles deploy their Compose files onto.
|
||||
|
||||
## Variables
|
||||
|
||||
None yet. Placeholders will use the `docker_host__*` namespace (CLAUDE.md convention).
|
||||
|
||||
## Example
|
||||
|
||||
```yaml
|
||||
- hosts: docker_hosts
|
||||
become: true
|
||||
roles:
|
||||
- role: docker_host
|
||||
tags: [docker_host]
|
||||
```
|
||||
|
||||
See ADR-004 (`docs/decisions/004-docker-model.md`) for the Docker & Compose model.
|
||||
1
roles/docker_host/defaults/main.yml
Normal file
1
roles/docker_host/defaults/main.yml
Normal file
|
|
@ -0,0 +1 @@
|
|||
---
|
||||
1
roles/docker_host/handlers/main.yml
Normal file
1
roles/docker_host/handlers/main.yml
Normal file
|
|
@ -0,0 +1 @@
|
|||
---
|
||||
11
roles/docker_host/meta/main.yml
Normal file
11
roles/docker_host/meta/main.yml
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
---
|
||||
galaxy_info:
|
||||
author: sjat
|
||||
description: Docker engine + Compose runtime for boma docker_hosts (Debian 13).
|
||||
license: MIT
|
||||
min_ansible_version: "2.17"
|
||||
platforms:
|
||||
- name: Debian
|
||||
versions:
|
||||
- trixie
|
||||
dependencies: []
|
||||
7
roles/docker_host/molecule/default/converge.yml
Normal file
7
roles/docker_host/molecule/default/converge.yml
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
---
|
||||
- name: Converge
|
||||
hosts: all
|
||||
gather_facts: true
|
||||
|
||||
roles:
|
||||
- role: docker_host
|
||||
31
roles/docker_host/molecule/default/molecule.yml
Normal file
31
roles/docker_host/molecule/default/molecule.yml
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
---
|
||||
dependency:
|
||||
name: galaxy
|
||||
options:
|
||||
requirements-file: ../../requirements.yml
|
||||
|
||||
driver:
|
||||
name: docker
|
||||
|
||||
platforms:
|
||||
- name: instance
|
||||
# Project-owned image built from .docker/molecule-debian13/Dockerfile
|
||||
# and hosted in the Forgejo container registry.
|
||||
# Build/push with: make molecule-image / make molecule-image-push
|
||||
image: forgejo.nyumbani.baobab.band/sjat/molecule-debian13:latest
|
||||
pre_build_image: true
|
||||
privileged: true # required for systemd
|
||||
cgroupns_mode: host
|
||||
volumes:
|
||||
- /sys/fs/cgroup:/sys/fs/cgroup:rw
|
||||
command: /lib/systemd/systemd
|
||||
|
||||
provisioner:
|
||||
name: ansible
|
||||
inventory:
|
||||
host_vars:
|
||||
instance:
|
||||
ansible_user: root
|
||||
|
||||
verifier:
|
||||
name: ansible
|
||||
11
roles/docker_host/molecule/default/verify.yml
Normal file
11
roles/docker_host/molecule/default/verify.yml
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
---
|
||||
- name: Verify
|
||||
hosts: all
|
||||
gather_facts: true
|
||||
|
||||
tasks:
|
||||
- name: Add verification tasks here
|
||||
ansible.builtin.assert:
|
||||
that: true
|
||||
msg: "Replace this with real assertions"
|
||||
tags: [verify]
|
||||
13
roles/docker_host/tasks/main.yml
Normal file
13
roles/docker_host/tasks/main.yml
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
---
|
||||
# docker_host — Docker engine + Compose runtime for hosts in the docker_hosts group.
|
||||
#
|
||||
# SCAFFOLDED, NOT YET IMPLEMENTED. This role is referenced by playbooks/site.yml so the
|
||||
# full standard state is expressed end-to-end, but it has no tasks yet — applying it is a
|
||||
# no-op. See STATUS.md ("Scaffolded but empty") and ADR-004 (Docker & Compose model).
|
||||
#
|
||||
# Planned scope (ADR-002/004/020):
|
||||
# - install Docker engine + compose plugin (version-pinned, per ADR-011)
|
||||
# - daemon hardening: iptables:false (host nftables owns the firewall, ADR-020),
|
||||
# log-driver, live-restore, userns where practical
|
||||
# - render container forward/NAT rules into /etc/nftables.d/*.nft (the base-role hook)
|
||||
# - provide the runtime the per-service roles deploy their Compose stacks onto (one service = one role)
|
||||
30
roles/public_dns/README.md
Normal file
30
roles/public_dns/README.md
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
# public_dns
|
||||
|
||||
Manages boma's public DNS zone (**wingu.me**) at **Gandi LiveDNS** as code, via
|
||||
`community.general.gandi_livedns` (PAT auth from `vault.gandi.pat`). Provider-agnostic
|
||||
name on purpose. Run from the control node: `make check/deploy PLAYBOOK=dns`.
|
||||
|
||||
Mesh/LAN-only by default — only deliberate public records live in the zone (the
|
||||
anti-spoof baseline now; `askari` in M4). Everything else is reached over LAN/mesh and
|
||||
never appears here.
|
||||
|
||||
## Data (in `group_vars/all/public_dns.yml`)
|
||||
|
||||
| Var | Meaning |
|
||||
|---|---|
|
||||
| `public_dns__domain` | the zone (`wingu.me`) |
|
||||
| `public_dns__records` | records to ensure **present** (`record`, `type`, `values`, optional `ttl`) |
|
||||
| `public_dns__absent` | records to ensure **absent** (Gandi's auto-seeded defaults) |
|
||||
|
||||
## Behaviour knobs (`defaults/main.yml`)
|
||||
|
||||
| Var | Default | Meaning |
|
||||
|---|---|---|
|
||||
| `public_dns__apply` | `true` | set `false` to validate without calling the Gandi API (Molecule) |
|
||||
| `public_dns__default_ttl` | `1800` | TTL when a record omits one |
|
||||
|
||||
## Notes
|
||||
|
||||
The zone is reconciled **additively** plus an explicit `absent` list (Gandi seeds 13
|
||||
default records on a new `.me`; we purge the unwanted 11 and overwrite MX/SPF with the
|
||||
anti-spoof baseline). Full-zone authoritative pruning is a future enhancement (TODO 8.3).
|
||||
9
roles/public_dns/defaults/main.yml
Normal file
9
roles/public_dns/defaults/main.yml
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
---
|
||||
# public_dns — manage the public zone at Gandi LiveDNS as code (M1).
|
||||
# Record data (public_dns__domain / __records / __absent) lives in group_vars/all.
|
||||
# See docs/decisions/007-network.md.
|
||||
public_dns__apply: true # set false to validate without calling the Gandi API (Molecule)
|
||||
public_dns__default_ttl: 1800 # TTL when a record omits one
|
||||
public_dns__domain: "" # overridden in group_vars/all
|
||||
public_dns__records: [] # present records
|
||||
public_dns__absent: [] # records to remove
|
||||
1
roles/public_dns/handlers/main.yml
Normal file
1
roles/public_dns/handlers/main.yml
Normal file
|
|
@ -0,0 +1 @@
|
|||
---
|
||||
11
roles/public_dns/meta/main.yml
Normal file
11
roles/public_dns/meta/main.yml
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
---
|
||||
galaxy_info:
|
||||
author: sjat
|
||||
description: Manage boma's public DNS zone (wingu.me) at Gandi LiveDNS as code.
|
||||
license: MIT
|
||||
min_ansible_version: "2.17"
|
||||
platforms:
|
||||
- name: Debian
|
||||
versions:
|
||||
- trixie
|
||||
dependencies: []
|
||||
14
roles/public_dns/molecule/default/converge.yml
Normal file
14
roles/public_dns/molecule/default/converge.yml
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
---
|
||||
- name: Converge
|
||||
hosts: all
|
||||
gather_facts: true
|
||||
vars:
|
||||
public_dns__apply: false # never call the Gandi API from a container
|
||||
public_dns__domain: example.test
|
||||
public_dns__records:
|
||||
- {record: "@", type: MX, values: ["0 ."], ttl: 3600}
|
||||
- {record: "@", type: TXT, values: ['"v=spf1 -all"'], ttl: 3600}
|
||||
public_dns__absent:
|
||||
- {record: www, type: CNAME}
|
||||
roles:
|
||||
- role: public_dns
|
||||
31
roles/public_dns/molecule/default/molecule.yml
Normal file
31
roles/public_dns/molecule/default/molecule.yml
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
---
|
||||
dependency:
|
||||
name: galaxy
|
||||
options:
|
||||
requirements-file: ../../requirements.yml
|
||||
|
||||
driver:
|
||||
name: docker
|
||||
|
||||
platforms:
|
||||
- name: instance
|
||||
# Project-owned image built from .docker/molecule-debian13/Dockerfile
|
||||
# and hosted in the Forgejo container registry.
|
||||
# Build/push with: make molecule-image / make molecule-image-push
|
||||
image: forgejo.nyumbani.baobab.band/sjat/molecule-debian13:latest
|
||||
pre_build_image: true
|
||||
privileged: true # required for systemd
|
||||
cgroupns_mode: host
|
||||
volumes:
|
||||
- /sys/fs/cgroup:/sys/fs/cgroup:rw
|
||||
command: /lib/systemd/systemd
|
||||
|
||||
provisioner:
|
||||
name: ansible
|
||||
inventory:
|
||||
host_vars:
|
||||
instance:
|
||||
ansible_user: root
|
||||
|
||||
verifier:
|
||||
name: ansible
|
||||
12
roles/public_dns/molecule/default/verify.yml
Normal file
12
roles/public_dns/molecule/default/verify.yml
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
---
|
||||
- name: Verify
|
||||
hosts: all
|
||||
gather_facts: false
|
||||
tasks:
|
||||
- name: Role variables resolved
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- public_dns__domain == "example.test"
|
||||
- public_dns__apply | bool == false
|
||||
msg: "public_dns defaults/vars did not resolve as expected"
|
||||
tags: [verify]
|
||||
38
roles/public_dns/tasks/main.yml
Normal file
38
roles/public_dns/tasks/main.yml
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
---
|
||||
# Fail fast, before any API call, when the zone name is missing or no MX record is
# declared at all.
# NOTE(review): this only checks that *an* MX record exists; it does not verify that
# the record is the null MX ("0 .") the failure message refers to.
- name: Assert public DNS data is sane
  ansible.builtin.assert:
    that:
      - public_dns__domain | length > 0
      - public_dns__records | selectattr('type', 'equalto', 'MX') | list | length > 0
    fail_msg: >-
      public_dns__domain must be set and a null-MX anti-spoof record declared in
      public_dns__records (group_vars/all/public_dns.yml).
  run_once: true

# One API call per entry in public_dns__records. Skipped entirely when
# public_dns__apply is false (e.g. the Molecule scenario).
- name: Ensure desired records are present (Gandi LiveDNS)
  community.general.gandi_livedns:
    domain: "{{ public_dns__domain }}"
    record: "{{ item.record }}"
    type: "{{ item.type }}"
    # Bracket lookup on purpose: with dot notation, `item.values` resolves to the
    # dict's built-in values() method rather than the record's `values` key.
    values: "{{ item['values'] }}"
    ttl: "{{ item.ttl | default(public_dns__default_ttl) }}"
    state: present
    personal_access_token: "{{ vault.gandi.pat }}"
  loop: "{{ public_dns__records }}"
  loop_control:
    # Label by name/type only so loop output stays short.
    label: "{{ item.record }} {{ item.type }}"
  run_once: true
  when: public_dns__apply | bool

# One API call per entry in public_dns__absent; same apply gate as above.
- name: Ensure unwanted records are absent (Gandi LiveDNS)
  community.general.gandi_livedns:
    domain: "{{ public_dns__domain }}"
    record: "{{ item.record }}"
    type: "{{ item.type }}"
    state: absent
    personal_access_token: "{{ vault.gandi.pat }}"
  loop: "{{ public_dns__absent }}"
  loop_control:
    label: "{{ item.record }} {{ item.type }}"
  run_once: true
  when: public_dns__apply | bool
|
||||
|
|
@ -1,10 +1,15 @@
|
|||
# scripts/
|
||||
|
||||
Small helper scripts. **Python standard library only** — no third-party
|
||||
dependencies (keeps them runnable anywhere without a venv).
|
||||
dependencies (keeps them runnable anywhere without a venv). One deliberate
|
||||
exception: `check-vault.py` is a vault tool that needs the ansible venv (PyYAML +
|
||||
`ansible-vault`) and `rbw`, so it is not run-anywhere by design.
|
||||
|
||||
- `tf_to_inventory.py` — reads `terraform output -json` on stdin and writes an
|
||||
Ansible `hosts.yml`. Invoked by `make tf-inventory`. Data contract: **ADR-009**.
|
||||
- `check-vault.py` — validates a vault file's structure (decrypts in-memory; valid
|
||||
YAML; secrets under the nested `vault:` map; no empty leaves) and prints a
|
||||
values-masked view. Invoked by `make check-vault` and after `make edit-vault`.
|
||||
- `vault-pass-client.sh` — fetches the master vault password from Vaultwarden via
|
||||
`rbw`. Wired as `vault_password_file` (ADR-002).
|
||||
- `check-vault-encrypted.sh` — pre-commit guard: fails if a `vault.yml` holds
|
||||
|
|
|
|||
101
scripts/check-vault.py
Executable file
101
scripts/check-vault.py
Executable file
|
|
@ -0,0 +1,101 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Validate an ansible-vault file's structure without exposing secret values.
|
||||
|
||||
Decrypts in-memory via ``ansible-vault view`` (using the configured
|
||||
``vault_password_file`` from ansible.cfg — so rbw must be unlocked), parses the
|
||||
YAML, and checks:
|
||||
|
||||
- the file is ansible-vault encrypted on disk;
|
||||
- it decrypts to valid YAML that is a mapping;
|
||||
- top-level keys are within the allowed set (``vault`` + the ``vault__confirm``
|
||||
canary) — secrets belong under the nested ``vault:`` map (CLAUDE.md);
|
||||
- ``vault.<service>.<key>`` leaves are all non-empty strings.
|
||||
|
||||
Prints a REDACTED view (comments + key tree, values masked) so a human can eyeball
|
||||
format and comments. Secret values are never printed.
|
||||
|
||||
Unlike the stdlib-only utility scripts (TODO 14), this one deliberately depends on
|
||||
the ansible venv (PyYAML) + ``ansible-vault`` + rbw — it is a vault tool, not a
|
||||
run-anywhere helper. Invoked by ``make check-vault`` / ``make edit-vault``.
|
||||
|
||||
Usage: check-vault.py [VAULT_FILE]
|
||||
"""
|
||||
import pathlib
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except ImportError:
|
||||
sys.exit("check-vault: needs PyYAML — run inside the ansible venv (make check-vault)")
|
||||
|
||||
DEFAULT = "inventories/production/group_vars/all/vault.yml"
|
||||
ALLOWED_TOPLEVEL = {"vault", "vault__confirm"}
|
||||
|
||||
|
||||
def _redacted_lines(text: str) -> list[str]:
    """Return the lines of *text* with every scalar value masked.

    The masking fails closed. A line is echoed verbatim only when it is blank,
    a comment, a document marker, or a bare ``key:`` that opens a nested map.
    ``key: value`` lines keep the key and mask the value. Everything else is
    replaced by ``***``, as is every line that may still belong to a
    multi-line scalar (block, quoted or plain continuation lines), even if it
    looks like a comment or a ``key: value`` pair.
    """
    out: list[str] = []
    # Indent of the line whose scalar value may still be continuing on the
    # following, deeper-indented lines; None when no scalar is open.
    open_indent = None
    for line in text.splitlines():
        body = line.strip()
        indent = len(line) - len(line.lstrip())
        if open_indent is not None:
            if not body:
                # Blank lines inside a scalar carry no secret text.
                out.append(line)
                continue
            if indent > open_indent:
                # Still inside the scalar: never echo any of it.
                out.append(line[:indent] + "***")
                continue
            open_indent = None
        if not body or body.startswith("#") or body in ("---", "..."):
            out.append(line)
            continue
        key, sep, value = body.partition(":")
        is_key_line = (
            bool(sep)
            and bool(key)
            and all(c.isalnum() or c in "_-." for c in key)
            # "user:pass" (no space after the colon) is a plain scalar, not a key.
            and (not value or value[0].isspace())
        )
        if not is_key_line:
            out.append(line[:indent] + "***")
            # Unrecognised content (scalar text, list item, flow syntax): keep
            # masking everything at this depth or deeper.
            open_indent = indent - 1
            continue
        value = value.strip()
        if not value or value.startswith("#"):
            # Bare mapping key: structure only, safe to show.
            out.append(line)
        else:
            out.append(f'{line[:indent]}{key}: "***"')
            open_indent = indent
    return out


def main() -> int:
    """Validate the vault file named on the command line (or the default one).

    Checks that the file is ansible-vault encrypted on disk, decrypts it
    in memory via ``ansible-vault view``, parses the YAML and validates the
    structure: only allowed top-level keys, a ``vault`` mapping of services,
    and every ``vault.<service>.<key>`` leaf a non-empty string. A redacted
    view of the file is printed to stdout; secret values are never printed.

    Returns:
        0 when the structure is valid, 1 when structural errors were found
        (listed on stderr).

    Raises:
        SystemExit: with a message when the file is missing, is not
            encrypted, cannot be decrypted, or is not a YAML mapping.
    """
    path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT
    p = pathlib.Path(path)
    if not p.exists():
        sys.exit(f"check-vault: {path} not found")
    if not p.read_text(errors="replace").startswith("$ANSIBLE_VAULT"):
        sys.exit(f"check-vault: {path} is not ansible-vault encrypted")

    # Decrypt in-memory via the venv's ansible-vault (picks up vault_password_file).
    av = pathlib.Path(sys.executable).parent / "ansible-vault"
    av = str(av) if av.exists() else "ansible-vault"
    try:
        r = subprocess.run([av, "view", path], capture_output=True, text=True)
    except FileNotFoundError:
        # Neither the venv binary nor one on PATH exists: report, don't traceback.
        sys.exit("check-vault: ansible-vault not found — run inside the ansible venv (make check-vault)")
    if r.returncode != 0:
        sys.exit(f"check-vault: cannot decrypt {path} (is rbw unlocked?)\n{r.stderr.strip()[:300]}")

    text = r.stdout
    try:
        data = yaml.safe_load(text)
    except yaml.YAMLError as e:
        sys.exit(f"check-vault: invalid YAML after decrypt: {e}")
    if not isinstance(data, dict):
        sys.exit("check-vault: vault root is not a mapping")

    errors = []
    extra = set(data) - ALLOWED_TOPLEVEL
    if extra:
        errors.append(
            f"unexpected top-level key(s) {sorted(extra)} — secrets belong under "
            f"the `vault:` map (CLAUDE.md)")
    if "vault" not in data:
        errors.append("missing top-level `vault:` map")
    elif not isinstance(data["vault"], dict):
        errors.append("`vault:` is not a mapping")
    else:
        for svc, kv in data["vault"].items():
            if not isinstance(kv, dict):
                errors.append(f"vault.{svc} is not a mapping of <key>: <secret>")
                continue
            for k, v in kv.items():
                if not isinstance(v, str) or not v.strip():
                    errors.append(f"vault.{svc}.{k} is empty or not a string")

    # Redacted structure (comments + masked values) for human review.
    print(f"# {path} — redacted structure (secret values masked)")
    for line in _redacted_lines(text):
        print(line)
    if isinstance(data.get("vault"), dict):
        print("\n# services under vault: " + ", ".join(sorted(data["vault"])))

    if errors:
        print("\ncheck-vault: FAIL", file=sys.stderr)
        for e in errors:
            print(f"  - {e}", file=sys.stderr)
        return 1
    print("\ncheck-vault: OK")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
53
tests/test_public_dns.py
Normal file
53
tests/test_public_dns.py
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
import pathlib
|
||||
|
||||
import yaml
|
||||
|
||||
_DATA = (
|
||||
pathlib.Path(__file__).resolve().parent.parent
|
||||
/ "inventories" / "production" / "group_vars" / "all" / "public_dns.yml"
|
||||
)
|
||||
|
||||
# Gandi auto-seeds these on a fresh .me zone; boma purges them (verified 2026-06-14).
|
||||
GANDI_DEFAULTS_ABSENT = {
|
||||
("@", "A"), ("www", "CNAME"), ("webmail", "CNAME"),
|
||||
("gm1._domainkey", "CNAME"), ("gm2._domainkey", "CNAME"), ("gm3._domainkey", "CNAME"),
|
||||
("_imap._tcp", "SRV"), ("_imaps._tcp", "SRV"), ("_pop3._tcp", "SRV"),
|
||||
("_pop3s._tcp", "SRV"), ("_submission._tcp", "SRV"),
|
||||
}
|
||||
|
||||
|
||||
def _load():
    """Parse the production ``public_dns.yml`` group_vars file and return its content."""
    # The data file contains non-ASCII characters (em dashes in comments), so decode
    # it explicitly as UTF-8 instead of relying on the platform's locale encoding.
    return yaml.safe_load(_DATA.read_text(encoding="utf-8"))
|
||||
|
||||
|
||||
def test_domain_is_wingu():
    """The managed zone must be wingu.me."""
    config = _load()
    assert config["public_dns__domain"] == "wingu.me"
|
||||
|
||||
|
||||
def test_present_records_well_formed():
    """Every present record has a name, a type and a non-empty list of values."""
    entries = _load()["public_dns__records"]
    for entry in entries:
        assert entry["record"]
        assert entry["type"]
        values = entry["values"]
        assert isinstance(values, list)
        assert values
|
||||
|
||||
|
||||
def test_anti_spoof_baseline_present():
    """The no-mail baseline (null MX, SPF deny-all, DMARC reject) is declared."""
    declared = {}
    for entry in _load()["public_dns__records"]:
        declared[(entry["record"], entry["type"])] = entry["values"]

    baseline = {
        ("@", "MX"): ["0 ."],                          # null MX
        ("@", "TXT"): ['"v=spf1 -all"'],               # SPF deny-all
        ("_dmarc", "TXT"): ['"v=DMARC1; p=reject;"'],
    }
    for key, expected_values in baseline.items():
        assert declared[key] == expected_values
|
||||
|
||||
|
||||
def test_gandi_defaults_marked_absent():
    """Every known Gandi-seeded default record is listed for removal."""
    purged = set()
    for entry in _load()["public_dns__absent"]:
        purged.add((entry["record"], entry["type"]))
    assert GANDI_DEFAULTS_ABSENT.issubset(purged)
|
||||
|
||||
|
||||
def test_no_record_both_present_and_absent():
    """No (record, type) pair may be declared both present and absent."""

    def pairs(entries):
        return {(entry["record"], entry["type"]) for entry in entries}

    config = _load()
    wanted = pairs(config["public_dns__records"])
    unwanted = pairs(config["public_dns__absent"])
    assert not (wanted & unwanted)
|
||||
|
||||
|
||||
def test_no_duplicate_present_records():
    """Each (record, type) pair appears at most once among the present records."""
    seen = set()
    for entry in _load()["public_dns__records"]:
        pair = (entry["record"], entry["type"])
        assert pair not in seen
        seen.add(pair)
|
||||
Loading…
Add table
Reference in a new issue