95 changed files with 225 additions and 6663 deletions
--- a/.ansible-lint
+++ b/.ansible-lint
@ -6,7 +6,6 @@ exclude_paths:
  - .venv/
  - .collections/
  - .scaffold/
  - tests/integration/.run/   # transient harness run dir (gitignored, generated)
  - "**/vault.yml"  # ansible-vault encrypted — not lintable YAML
 # Warn only (don't fail) on these rules during initial setup
--- a/.claude/hooks/guard-execution-mode-menu.sh
+++ b/.claude/hooks/guard-execution-mode-menu.sh
@ -6,12 +6,7 @@
 #   1. The execution-mode menu — writing-plans / subagent-driven-development script a
 #      "Subagent-Driven vs Inline Execution — which approach?" menu at the plan→execution
 #      handoff. boma's standing preference is to NEVER present it and proceed
-#      subagent-driven. (Recorded by the 2026-06-10 kaizen review; the 2026-06-17 review
+#      subagent-driven. (Recorded by the 2026-06-10 kaizen review.)
 #      widened the matcher to also catch free-form *prose* re-asks of the same choice —
 #      e.g. "which execution approach?" — which the literal-menu matcher missed. The
 #      sibling push-vs-not re-ask is deliberately NOT hooked: a genuine "should I push?"
 #      is sometimes legitimate, so it stays a soft default via the
 #      dont-reask-settled-defaults memory rather than a hard block.)
 #   2. The brainstorming spec-review gate — the brainstorming skill scripts "Spec written
 #      and committed … please review it before … the implementation plan." The standing
 #      agreement is to move directly from the committed spec to writing-plans. (Recorded
@ -44,11 +39,7 @@ text=$(jq -rs '
 low="${text,,}"
 if [[ "$low" == *"inline execution"* \
-      && ( "$low" == *"which approach"* || "$low" == *"two execution options"* ) ]] \
+   && ( "$low" == *"which approach"* || "$low" == *"two execution options"* ) ]]; then
   || [[ "$low" == *"subagent-driven or inline"* || "$low" == *"inline or subagent"* ]] \
   || [[ "$low" == *"subagent-driven vs inline"* || "$low" == *"subagent vs inline"* \
         || "$low" == *"inline vs subagent"* ]] \
   || [[ "$low" == *"execution approach"* && "$low" == *"?"* ]]; then
  cat <<'JSON'
 {"decision":"block","reason":"Execution-mode menu detected in your final message. boma standing preference (docs/FRICTION.md + always-subagent-driven-execution memory): never present the subagent-driven-vs-inline menu. Drop the menu and proceed with subagent-driven execution directly (superpowers:subagent-driven-development)."}
 JSON
--- a/.claude/hooks/guard-vault-preflight.sh
+++ b/.claude/hooks/guard-vault-preflight.sh
@ -1,16 +1,12 @@
 #!/usr/bin/env bash
 #
-# PreToolUse guard (Bash): block `git commit` ONLY when the rbw vault agent is locked
+# PreToolUse guard (Bash): block `git commit` when the rbw vault agent is locked.
-# AND the commit would actually need the vault. The pre-commit ansible-lint hook decrypts
+# The pre-commit ansible-lint hook decrypts vault.yml via rbw, so a commit while
-# vault.yml via rbw — but it is scoped (`files: ^(roles|playbooks|inventories)/.*\.ya?ml$`,
+# locked fails deep with a confusing error. This catches it early with a clear fix.
 # always_run:false), so a docs-/config-only commit never triggers it and needs no vault.
 # (2026-06-17 kaizen, docs/FRICTION.md: the old guard blocked *every* locked commit, so a
 # docs-only commit got snagged needing a vault password it never uses.)
 #
-# Fails OPEN: blocks only on a definitive "Ansible content staged AND rbw locked" signal.
+# Fails OPEN: only blocks on a definitive "rbw present AND not unlocked" signal.
-# rbw missing, not a plain `git commit`, `--no-verify`, or no Ansible content staged → allow.
+# If rbw is missing, the command isn't a plain `git commit`, or `--no-verify` is
-# When unsure it errs toward blocking (asking for an unlock is cheap; a deep pre-commit
+# used, the action is allowed.
 # failure is not).
 #
 set -uo pipefail
@ -26,25 +22,14 @@ case "$cmd" in
 esac
 command -v rbw >/dev/null 2>&1 || exit 0   # rbw not installed — allow
 rbw unlocked >/dev/null 2>&1 && exit 0      # unlocked — allow
-# rbw is LOCKED. Only block if this commit would run the vault-decrypting ansible-lint
+if rbw unlocked >/dev/null 2>&1; then
-# hook — i.e. staged content matches its `files:` scope. Mirror that regex exactly.
+  exit 0                                    # unlocked — allow
-ANSIBLE_RE='^(roles|playbooks|inventories)/.*\.ya?ml$'
+fi
-cd "${CLAUDE_PROJECT_DIR:-.}" 2>/dev/null || exit 0
+# rbw present but not unlocked (locked or agent not running) — the commit would
-files=$(git diff --cached --name-only 2>/dev/null) || exit 0
+# fail in the pre-commit hook, so block early with guidance.
 # `git commit -a/--all` also sweeps in modified tracked files that aren't staged yet.
 # (Substring match — errs toward including them, which only ever over-blocks. Safe.)
 case " $cmd " in
  *" -a"*|*"--all"*) files="$files"$'\n'"$(git diff --name-only 2>/dev/null)" ;;
 esac
 # No Ansible content in the fileset → ansible-lint hook won't run → no vault needed → allow.
 printf '%s\n' "$files" | grep -Eq "$ANSIBLE_RE" || exit 0
 # Ansible content staged AND rbw locked — the commit would fail deep in pre-commit. Block.
 cat <<'JSON'
-{"hookSpecificOutput":{"hookEventName":"PreToolUse","permissionDecision":"deny","permissionDecisionReason":"rbw is locked and this commit stages Ansible content — the pre-commit ansible-lint hook needs the vault password to decrypt vault.yml. Run: rbw unlock  (docs-/config-only commits are exempt and won't hit this guard.)"}}
+{"hookSpecificOutput":{"hookEventName":"PreToolUse","permissionDecision":"deny","permissionDecisionReason":"rbw is locked — the pre-commit ansible-lint hook needs the vault password to decrypt vault.yml. Run: rbw unlock"}}
 JSON
 exit 0
--- a/.claude/settings.json
+++ b/.claude/settings.json
@ -69,10 +69,5 @@
        ]
      }
    ]
  },
  "statusLine": {
    "type": "command",
    "command": "bash \"${CLAUDE_PROJECT_DIR:-.}/.claude/statusline.sh\"",
    "padding": 0
  }
 }
--- a/.claude/statusline.sh
+++ b/.claude/statusline.sh
@ -1,63 +0,0 @@
 #!/usr/bin/env bash
 #
 # Claude Code statusLine — shows working dir, model, and context-window usage.
 # Wired via .claude/settings.json (statusLine.command). Receives the statusLine
 # JSON on stdin; first stdout line is rendered (ANSI colour supported).
 #
 # Context usage comes straight from the input JSON — no transcript parsing:
 #   .context_window.used_percentage     pre-calculated % of the window in use (input side)
 #   .context_window.context_window_size window size in tokens (1000000 for the 1M models)
 # verified: Claude Code statusLine schema · code.claude.com/docs/en/statusline · 2026-06-17
 #
 # Output shape:  <dir> · <model> · <10-cell bar> <pct>% ctx/<window label>
 #
 # Fails soft: any parse problem prints nothing and exits 0 (never breaks the prompt).
 set -uo pipefail
 input=$(cat 2>/dev/null) || exit 0
 command -v jq >/dev/null 2>&1 || exit 0
 # jq emits ONE line: pct<US>window<US>dir-basename<US>model-name, where <US> is the
 # ASCII unit separator (0x1f). used_percentage is preferred, else the percentage is
 # derived from current_usage, else 0.
 #
 # Why not plain tabs: tab is an IFS *whitespace* character, so `read` collapses runs of
 # tabs and silently drops empty fields. An empty dir basename (cwd "/", a trailing
 # slash, or a missing field) would shift the model name into $dir. @tsv is still used
 # for its escaping (embedded tabs/newlines become two-character escapes, keeping the
 # record on a single line); only the separators are swapped for 0x1f afterwards.
 parsed=$(printf '%s' "$input" | jq -r '
  (.workspace.current_dir // .cwd // "" | sub(".*/"; "")) as $dir
  | (.model.display_name // "?")                          as $model
  | (.context_window.context_window_size // 200000)       as $win
  | (
      if (.context_window.used_percentage // null) != null then
        .context_window.used_percentage
      elif (.context_window.current_usage // null) != null then
        ((.context_window.current_usage.input_tokens
          + (.context_window.current_usage.cache_creation_input_tokens // 0)
          + (.context_window.current_usage.cache_read_input_tokens // 0)) / $win * 100)
      else 0 end | floor
    ) as $pct
  | [$pct, $win, $dir, $model] | @tsv | split("\t") | join("\u001f")
 ' 2>/dev/null) || exit 0
 [ -z "$parsed" ] && exit 0
 # 0x1f is not IFS whitespace, so empty fields survive the split.
 IFS=$'\037' read -r pct win dir model <<<"$parsed"
 # Honour the fail-soft contract: the arithmetic and integer tests below would error
 # out (or abort under `set -u`) on a non-integer value, so bail out quietly instead.
 [[ "$pct" =~ ^-?[0-9]{1,9}$ && "$win" =~ ^[0-9]{1,15}$ ]] || exit 0
 # Human window label: 1000000 -> 1M, 200000 -> 200k, else Nk.
 case "$win" in
  1000000) wlabel="1M" ;;
  *)       wlabel="$((win / 1000))k" ;;
 esac
 # Colour the bar/percentage by pressure: green <70, yellow 70–89, red >=90.
 if   [ "$pct" -ge 90 ]; then col=$'\033[31m'   # red
 elif [ "$pct" -ge 70 ]; then col=$'\033[33m'   # yellow
 else                        col=$'\033[32m'    # green
 fi
 dim=$'\033[2m'; rst=$'\033[0m'
 # 10-cell bar; clamp fill to [0,10] so an over-100 (or negative) reading can't overflow.
 filled=$((pct / 10)); [ "$filled" -gt 10 ] && filled=10; [ "$filled" -lt 0 ] && filled=0
 bar=""
 for ((i = 0; i < 10; i++)); do
  if [ "$i" -lt "$filled" ]; then bar+="█"; else bar+="░"; fi
 done
 printf '%s%s%s · %s · %s%s %d%%%s %sctx/%s%s\n' \
  "$dim" "$dir" "$rst" \
  "$model" \
  "$col" "$bar" "$pct" "$rst" \
  "$dim" "$wlabel" "$rst"
--- a/.gitignore
+++ b/.gitignore
@ -34,6 +34,3 @@ terraform/**/terraform.tfvars
 # Service-UI verification screenshots (kept locally on ubongo, not committed — ADR-017)
 .verify-runs/
 # Integration-test transient run dir (ADR-025); diagnostics live under ~/integration-runs
 tests/integration/.run/
--- a/.yamllint
+++ b/.yamllint
@ -24,5 +24,4 @@ ignore: |
  .venv/
  .collections/
  .scaffold/
  tests/integration/.run/
  **/vault.yml
--- a/CLAUDE.md
+++ b/CLAUDE.md
@ -43,8 +43,6 @@ Full design rationale: `docs/decisions/`
 | Terraform plan                | `make tf-plan [TF_ENV=staging]`                  |
 | Terraform apply               | `make tf-apply [TF_ENV=staging]`                 |
 | Regenerate Ansible inventory  | `make tf-inventory TF_ENV=<staging\|production>` |
 | Integration-test a host on a local VM | `make test-integration HOST=<name> [CERTS=…]` |
 | Clean up integration test VMs | `make test-integration-clean`                   |
 **Always `tf-plan` before `tf-apply`. Always `check` before `deploy`. Never skip lint.**
@ -258,10 +256,7 @@ Single-contributor, trunk-based (no merge requests / approval gates):
 | Backup & disaster recovery | `docs/decisions/022-backup.md`        |
 | ADR structure & lifecycle | `docs/decisions/023-adr-structure.md`     |
 | Reverse proxy (Caddy)  | `docs/decisions/024-reverse-proxy.md`     |
 | Local VM integration testing (ADR-025) | `docs/decisions/025-local-vm-integration-testing.md` |
 | Integration testing runbook | `docs/runbooks/integration-testing.md`   |
 | Adding a new role      | `docs/runbooks/new-role.md`           |
 | Adding a new host      | `docs/runbooks/new-host.md`           |
 | Enrolling a NetBird client (laptop/phone) | `docs/runbooks/netbird-client.md` |
 | Rotating vault secrets | `docs/runbooks/rotate-secrets.md`     |
 | Claude Code setup (per machine) | `docs/runbooks/claude-code-setup.md` |
--- a/38
+++ b/38
@ -23,11 +23,6 @@ MOLECULE_DOCKERFILE := .docker/molecule-debian13/Dockerfile
 # (the Go module proxy 403s Hetzner IPs); push the pinned tag to the Forgejo registry.
 CADDY_IMAGE     := forgejo.nyumbani.baobab.band/sjat/caddy-gandi:2.11.4
 CADDY_DOCKERFILE := .docker/caddy-gandi/Dockerfile
 # Forgejo container registry (same host/user as the image tags above). `make registry-login`
 # logs the Docker daemon in using vault.forgejo.registry_token (2026-06-17 kaizen) so image
 # pushes are agent-completable non-interactively.
 REGISTRY_HOST   := forgejo.nyumbani.baobab.band
 REGISTRY_USER   := sjat
 # For TF_ENV=offsite, source the Hetzner token from the vault into the environment
 # (rbw must be unlocked). Read in-memory; never written to a tfvars file (CLAUDE.md).
@ -39,11 +34,10 @@ endif
 .DEFAULT_GOAL := help
-.PHONY: help setup collections lint test test-all test-integration test-integration-clean \
+.PHONY: help setup collections lint test test-all check deploy encrypt decrypt \
        check deploy encrypt decrypt \
        edit-vault check-vault new-role \
        tf-init tf-plan tf-apply tf-output tf-inventory tf-inventory-offsite \
-        molecule-image molecule-image-push caddy-image caddy-image-push registry-login
+        molecule-image molecule-image-push caddy-image caddy-image-push
 help:
 	@echo ""
@ -54,10 +48,8 @@ help:
 	@echo "  make lint                          Run yamllint + ansible-lint"
 	@echo "  make test ROLE=<name>              Run Molecule tests for a role"
 	@echo "  make test-all                      Run Molecule tests for all roles"
-	@echo "  make test-integration HOST=<name> [CERTS=internal|le-staging] [KEEP=1]   Run ADR-025 integration cycle against a VM"
+	@echo "  make check PLAYBOOK=<name> [LIMIT=<host>] [TAGS=<tags>]   Dry-run a playbook (check mode)"
-	@echo "  make test-integration-clean        Prune stale integration-test VM snapshots"
+	@echo "  make deploy PLAYBOOK=<name> [LIMIT=<host>] [TAGS=<tags>]  Run a playbook against production"
 	@echo "  make check PLAYBOOK=<name> [LIMIT=<host>] [TAGS=<tags>] [EXTRA=<args>]   Dry-run a playbook (check mode)"
 	@echo "  make deploy PLAYBOOK=<name> [LIMIT=<host>] [TAGS=<tags>] [EXTRA=<args>]  Run a playbook against production"
 	@echo "  make edit-vault [VAULT=<path>]     Edit the vault in nvim (auto re-encrypts + checks)"
 	@echo "  make check-vault [VAULT=<path>]    Validate vault structure (values masked)"
 	@echo "  make encrypt FILE=<path>           Encrypt a vault file"
@ -77,7 +69,6 @@ help:
 	@echo "  make molecule-image-push      Push the test image to the Forgejo registry"
 	@echo "  make caddy-image              Build the custom Caddy + Gandi DNS-01 image (run on ubongo)"
 	@echo "  make caddy-image-push         Push the Caddy image to the Forgejo registry"
 	@echo "  make registry-login           Log Docker into the Forgejo registry (vaulted token)"
 	@echo ""
 # ── Environment setup ─────────────────────────────────────────────────────────
@ -112,29 +103,19 @@ test-all:
 	  cd $$role && PATH="$(CURDIR)/$(VENV)/bin:$$PATH" molecule test; cd ../..; \
 	done
 test-integration:
 ifndef HOST
 	$(error HOST is required: make test-integration HOST=<name> [CERTS=internal|le-staging] [KEEP=1])
 endif
 	PATH="$(CURDIR)/$(VENV)/bin:$$PATH" $(PYTHON) scripts/integration-vm.py cycle \
 	  --host $(HOST) $(if $(CERTS),--certs $(CERTS)) $(if $(KEEP),--keep)
 test-integration-clean:
 	PATH="$(CURDIR)/$(VENV)/bin:$$PATH" $(PYTHON) scripts/integration-vm.py prune
 # ── Playbook execution ────────────────────────────────────────────────────────
 check:
 ifndef PLAYBOOK
 	$(error PLAYBOOK is required: make check PLAYBOOK=<name>)
 endif
-	$(PLAYBOOK_BIN) $(INVENTORY) $(VAULT_ARGS) $(if $(LIMIT),--limit $(LIMIT)) $(if $(TAGS),--tags $(TAGS)) $(EXTRA) --check --diff playbooks/$(PLAYBOOK).yml
+	$(PLAYBOOK_BIN) $(INVENTORY) $(VAULT_ARGS) $(if $(LIMIT),--limit $(LIMIT)) $(if $(TAGS),--tags $(TAGS)) --check --diff playbooks/$(PLAYBOOK).yml
 deploy:
 ifndef PLAYBOOK
 	$(error PLAYBOOK is required: make deploy PLAYBOOK=<name>)
 endif
-	$(PLAYBOOK_BIN) $(INVENTORY) $(VAULT_ARGS) $(if $(LIMIT),--limit $(LIMIT)) $(if $(TAGS),--tags $(TAGS)) $(EXTRA) playbooks/$(PLAYBOOK).yml
+	$(PLAYBOOK_BIN) $(INVENTORY) $(VAULT_ARGS) $(if $(LIMIT),--limit $(LIMIT)) $(if $(TAGS),--tags $(TAGS)) playbooks/$(PLAYBOOK).yml
 # ── Vault ─────────────────────────────────────────────────────────────────────
@ -178,13 +159,6 @@ caddy-image:
 caddy-image-push: caddy-image
 	docker push $(CADDY_IMAGE)
 # Log the local Docker daemon into the Forgejo registry using the vaulted token, so the
 # *-image-push targets above are agent-completable non-interactively (rbw must be unlocked).
 registry-login:
 	@ANSIBLE_VAULT="$(ANSIBLE)-vault" PYTHON="$(PYTHON)" VAULT="$(VAULT)" \
 	  REGISTRY_HOST="$(REGISTRY_HOST)" REGISTRY_USER="$(REGISTRY_USER)" \
 	  bash scripts/registry-login.sh
 # ── Terraform ─────────────────────────────────────────────────────────────────
 tf-init:
--- a/STATUS.md
+++ b/STATUS.md
@ -5,7 +5,7 @@ This repo is partly aspirational: the ADRs in `docs/decisions/` describe the
 truth. **Before relying on a role, provider, or pipeline existing, check here.**
 If something is listed as "designed, not built", do not assume it works.
-_Last reviewed: 2026-06-19._
+_Last reviewed: 2026-06-14._
 ## Real and working today
@ -30,8 +30,8 @@ _Last reviewed: 2026-06-19._
 | `roles/dev_env/` — interactive developer environment | **Built + applied.** zsh + oh-my-zsh + oh-my-posh, tmux + TPM plugins, neovim; dotfiles deployed via GNU stow (re-derived from V4/fisi per ADR-013). Node.js from a pinned upstream tarball (not Debian's npm). Lint + Molecule (idempotent) green. **Applied to `ubongo`** for users `sjat` + `claude` (verified: zsh login shells, stow-symlinked `.zshrc`/`.tmux.conf` + nvim config, oh-my-zsh, tmux plugins; nvim v0.12.2, oh-my-posh 29.0.1). Run via `playbooks/workstation.yml` against the `control` group (no dedicated `workstations` group yet). |
 | `make check` / `make deploy PLAYBOOK=<name>` | **Works.** First end-to-end run (applying `dev_env`) surfaced + fixed latent bugs: Makefile `PLAYBOOK` var collision (binary path vs playbook-name arg) meant the targets never ran; `ansible.cfg` referenced uninstalled community.general callbacks (now built-in `default` + `ansible.posix.profile_tasks`); `acl` package added so Ansible can `become_user` an unprivileged user. The make targets now function — though `site`/`base`/`docker_host` content is still incomplete (see below). |
 | `roles/public_dns/` + `playbooks/dns.yml` | **Built + applied.** Manages wingu.me at Gandi LiveDNS as code (`community.general.gandi_livedns`, PAT from `vault.gandi.pat`); record data, anti-spoof baseline (SPF `-all` + DMARC reject), and the Gandi-defaults purge are defined + unit-tested (`tests/test_public_dns.py`). **Applied to wingu.me (2026-06-14):** purged Gandi's 13 seeded defaults; zone now holds only the SPF + DMARC TXT records; idempotent re-run clean. No null-MX (Gandi rejects `0 .`) — the MX is removed, so no MX + no apex A = no mail. M1 of the roadmap. |
-| `ubongo` — physical control / AI-worker host (ADR-015) | **Built (partial).** Debian 13.5 on a Lenovo M70q (i3-10100T, 16 GB, 256 GB SSD; no disk encryption — accepted risk). Full toolchain installed + pinned to `fisi` (Docker 29.5.3, rbw 1.15.0, Claude Code 2.1.173, ansible-core 2.17.14 + molecule via `make setup`/`make collections`). Repo cloned under a dedicated `claude` user (docker + libvirt groups, **`NOPASSWD:ALL` sudo** — ADR-015 amended 2026-06-18; operator `sjat` uses password-required sudo via `sudo` group; the former `sjat-ansible` NOPASSWD drop-in removed 2026-06-18). Vault works via rbw (offline-cache decryption verified). SSH key-only (password + root login disabled). In the production inventory `control` group at 10.20.10.151. **`dev_env` now applied here** (zsh/tmux/nvim for `sjat` + `claude`, via `playbooks/workstation.yml`). Managed as the operator account `sjat` (`group_vars/control` sets `ansible_user: sjat`), not the `ansible` service user `group_vars/all` assumes — ubongo has no bootstrapped `ansible` user. **NetBird mesh-enrolled (M5, 2026-06-17):** `wt0` up at `100.99.146.14` via the `base` `mesh` concern. **`base` firewall applied (mesh-hardening 2/3, 2026-06-19):** INPUT-only default-deny — input locked to `wt0` + ssh-from-control (`10.20.10.151`) + workstations (`10.20.10.50` mamba, `10.20.10.17`); forward `accept` (Docker/libvirt-NAT safe). Live-verified (SSH self-path + Docker egress, after a post-apply `restart docker` — base's flush wipes Docker nat, FRICTION); **real-host reboot-validated (2026-06-19):** after an operator reboot, the `policy drop` input chain + full allow-list re-applied on boot and the `wt0` mesh + SSH self-path came back clean. `claude` now self-SSHes (ad-hoc `authorized_keys` grant so the agent can run SSH-based deploys with the auto-rollback safety; fold into the control-node bootstrap). 
**Pending:** full `base` hardening (auditd/CIS); proper `ansible`-user bootstrap (currently managed as `sjat`); OPNsense DHCP reservations (10.20.10.151 MAC `88:a4:c2:e0:ee:da` + the `.50`/`.17` workstation leases); Terraform state backup (now relevant — the offsite tfstate exists). |
+| `ubongo` — physical control / AI-worker host (ADR-015) | **Built (partial).** Debian 13.5 on a Lenovo M70q (i3-10100T, 16 GB, 256 GB SSD; no disk encryption — accepted risk). Full toolchain installed + pinned to `fisi` (Docker 29.5.3, rbw 1.15.0, Claude Code 2.1.173, ansible-core 2.17.14 + molecule via `make setup`/`make collections`). Repo cloned under a dedicated `claude` user (docker group, no sudo). Vault works via rbw (offline-cache decryption verified). SSH key-only (password + root login disabled). In the production inventory `control` group at 10.20.10.151. **`dev_env` now applied here** (zsh/tmux/nvim for `sjat` + `claude`, via `playbooks/workstation.yml`). Managed as the operator account `sjat` (`group_vars/control` sets `ansible_user: sjat`), not the `ansible` service user `group_vars/all` assumes — ubongo has no bootstrapped `ansible` user. **NetBird mesh-enrolled (M5, 2026-06-17):** `wt0` up at `100.99.146.14` via the `base` `mesh` concern; agent management now works because `claude`'s SSH key was added to `sjat`'s `authorized_keys` and `sjat` was granted `NOPASSWD` sudo (`/etc/sudoers.d/sjat-ansible`) — the interim until the proper `ansible`-user bootstrap. **Pending:** full `base` hardening (only `firewall` exists, NOT applied here — default-deny is the deferred mesh-hardening step now that `wt0` exists); proper `ansible`-user bootstrap (currently managed as `sjat`); OPNsense DHCP reservation for 10.20.10.151 (MAC `88:a4:c2:e0:ee:da`); Terraform state backup (now relevant — the offsite tfstate exists). |
-| `askari` — off-site Hetzner VPS (ADR-007/016, M2) | **Built + applied.** Provisioned by Terraform (`environments/offsite`, `hetznercloud/hcloud`) as **cx23 / hel1 / Debian 13.5** (CAX11/ARM was out of stock EU-wide on 2026-06-14 → cx23 is same-spec x86, cheaper). cloud-init created the `ansible` user + passwordless sudo; a TF-managed Hetzner Cloud Firewall allows SSH only from ubongo's WAN (`91.226.145.80`). Reachable from ubongo (`ansible offsite_hosts -m ping` ✓), in the `offsite_hosts` inventory (generated `offsite.yml`), published at `askari.wingu.me` → `77.42.120.136`. **SSH-hardened + fail2ban (M3).** **Docker + Caddy reverse proxy (M4a):** `docker_host` + `reverse_proxy` (vanilla Caddy, HTTP-01) applied; `https://test.askari.wingu.me` serves a valid Let's Encrypt cert ✓ (firewall opens 80/443/3478). **NetBird coordinator (M4b):** `netbird_coordinator` deployed — dashboard live at `https://netbird.askari.wingu.me` (valid LE cert), management API behind embedded Dex (401 unauth), STUN on 3478/udp. **NetBird peer (M5, 2026-06-17):** also enrolled as a mesh agent (`base` `mesh` concern) — `wt0` at `100.99.226.39`, Management+Signal Connected; the agent coexists with the coordinator. **Mesh-hardening redesign applied + live reboot-validated (2026-06-20):** `base` INPUT-only nftables default-deny (`inet filter` input `policy drop`; forward `accept`, Docker-safe via a post-apply `restart docker`), SSH `wt0`-primary + a permanent WAN break-glass (ubongo's WAN `91.226.145.80`; the Hetzner console is the OOB ultimate fallback), managed over `wt0`; `netbird_coordinator` geolocation disabled (`NB_DISABLE_GEOLOCATION`) so a no-egress boot can't FATAL it. A real reboot recovered **unattended** — firewall persisted, Docker forwarding + public services (Caddy 80/443, STUN 3478) up, coordinator geo-disabled (no FATAL), `wt0`/mesh (Management+Signal Connected) + both SSH paths back. 
**Pending:** offsite tfstate backup (ADR-022); relay-SPOF reduction (next mesh-hardening sub-project — `ubongo→askari` is currently `Relayed` through askari's own relay). |
+| `askari` — off-site Hetzner VPS (ADR-007/016, M2) | **Built + applied.** Provisioned by Terraform (`environments/offsite`, `hetznercloud/hcloud`) as **cx23 / hel1 / Debian 13.5** (CAX11/ARM was out of stock EU-wide on 2026-06-14 → cx23 is same-spec x86, cheaper). cloud-init created the `ansible` user + passwordless sudo; a TF-managed Hetzner Cloud Firewall allows SSH only from ubongo's WAN (`91.226.145.80`). Reachable from ubongo (`ansible offsite_hosts -m ping` ✓), in the `offsite_hosts` inventory (generated `offsite.yml`), published at `askari.wingu.me` → `77.42.120.136`. **SSH-hardened + fail2ban (M3).** **Docker + Caddy reverse proxy (M4a):** `docker_host` + `reverse_proxy` (vanilla Caddy, HTTP-01) applied; `https://test.askari.wingu.me` serves a valid Let's Encrypt cert ✓ (firewall opens 80/443/3478). **NetBird coordinator (M4b):** `netbird_coordinator` deployed — dashboard live at `https://netbird.askari.wingu.me` (valid LE cert), management API behind embedded Dex (401 unauth), STUN on 3478/udp. **NetBird peer (M5, 2026-06-17):** also enrolled as a mesh agent (`base` `mesh` concern) — `wt0` at `100.99.226.39`, Management+Signal Connected; the agent coexists with the coordinator. **Pending:** host firewall + moving askari's SSH onto `wt0` (deferred mesh-hardening; the Hetzner Cloud Firewall is its perimeter until then), offsite tfstate backup (ADR-022). |
 | `roles/docker_host/` (Docker engine) + `roles/reverse_proxy/` (Caddy, ADR-024) | **Built + applied** (askari, M4a). `docker_host` installs Docker CE + compose; `reverse_proxy` is boma's standard Caddy proxy (HTTP-01 for public hosts; routes from `reverse_proxy__routes`). **DNS-01 for mesh/LAN-only services is now built + proven (2026-06-15):** custom `caddy-gandi` image (`.docker/caddy-gandi/`, `make caddy-image`, pinned caddy-dns/gandi v1.1.0 → Bearer PAT), enabled per-instance via `reverse_proxy__acme_dns_provider: gandi` + `reverse_proxy__image`. Verified end-to-end — a real wildcard cert issued via LE **staging** + Gandi DNS-01 with `vault.gandi.pat`. M4a's deferral (version skew + Hetzner-IP build) is closed; image **pending registry push** (`make caddy-image-push` needs `docker login`). The `reverse_proxy` Caddyfile is bind-mounted as a **directory** (`./caddy` → `/etc/caddy`) so atomic re-renders are visible in-container and `caddy reload` actually applies new routes (a single-file mount pinned the stale inode). |
 | `roles/netbird_coordinator/` — NetBird control plane (ADR-016, M4b) | **Built + applied (askari, 2026-06-16). boma's FIRST real service role.** Self-hosted NetBird **v0.72.4**: a single combined `netbird-server` container (management + signal + relay + STUN + **embedded Dex IdP** at `/oauth2`) + `dashboard:v2.39.0`, on the shared `boma` network behind the M4a Caddy via gRPC-h2c + WebSocket + path routing (`reverse_proxy__routes` gained a raw-`caddy` route type). Secrets `vault.netbird.{auth_secret,datastore_key}` (self-generated). Carries the full service-role file set (SECURITY/VERIFY/ACCESS/BACKUP) — **first stateful role** (`backup__state: true`; encrypted SQLite at `/var/lib/netbird`, off-site backup pending `fisi`/ADR-022). **Verified live:** dashboard 200 + valid LE cert, `/api` 401 (auth-gated, routes OK), STUN up. **Not yet configured:** first-boot `/setup` admin + peer enrolment = M5. |
@ -39,7 +39,7 @@ _Last reviewed: 2026-06-19._
 | Thing | State |
 |---|---|
-| `roles/base/` | **Partially built.** Concerns built: `firewall` (nftables: catalog-driven default-deny + east-west allowlist + auto-rollback apply; ADR-020) and **`hardening`** (M3: sshd drop-in key-only + `PermitRootLogin no`, fail2ban sshd jail 5/1h; ADR-002) — both pytest/Molecule-tested. The **`hardening`** concern is **applied to askari** (`make deploy PLAYBOOK=site LIMIT=askari TAGS=hardening`). The `firewall` concern is **applied to ubongo** (mesh-hardening 2/3, 2026-06-19) **and askari** (mesh-hardening redesign, 2026-06-20) — both INPUT-only default-deny via the `base__firewall_input_only` knob (input default-deny + `wt0`/ssh-from-control/`base__firewall_admin_addrs` allow-list; forward left `accept` so Docker/libvirt-NAT survive), both **live reboot-validated**. On a Docker host (askari) base's `flush ruleset` wipes Docker's nat, so the cutover follows the firewall apply with a `restart docker` to rebuild it (FRICTION). Not built: auditd, packages, users (Phase 2 / TODO 15). The `mesh` concern also pins the coordinator FQDN in `/etc/hosts` (`base__mesh_coordinator_pin`) so a local-DNS hiccup can't strand the mesh — **applied + live on ubongo (2026-06-20)**: `getent hosts netbird.askari.wingu.me` → `77.42.120.136`, mesh unaffected. The single-coordinator SPOF is an accepted availability risk (R8, ADR-016 availability amendment). |
+| `roles/base/` | **Partially built.** Concerns built: `firewall` (nftables: catalog-driven default-deny + east-west allowlist + auto-rollback apply; ADR-020) and **`hardening`** (M3: sshd drop-in key-only + `PermitRootLogin no`, fail2ban sshd jail 5/1h; ADR-002) — both pytest/Molecule-tested. The **`hardening`** concern is **applied to askari** (`make deploy PLAYBOOK=site LIMIT=askari TAGS=hardening`). The `firewall` concern is built but **not yet applied** to any host (mesh-gated to avoid lockout — M5). Not built: auditd, packages, users (Phase 2 / TODO 15). |
 | `inventories/*/hosts.yml` | Structured stubs with empty host maps (`hosts: {}`); regenerated by `make tf-inventory` once Terraform has hosts |
 | `inventories/production/group_vars/{docker_hosts,proxmox_hosts}/` | Empty dirs |
@ -50,7 +50,7 @@ daemon hardening + `nftables.d` container rules, ADR-004/ADR-020 — is still pe
 A `make deploy PLAYBOOK=site` run now applies real content — `base` (its `firewall` +
 `hardening` concerns) plus a functional `docker_host` (Docker engine) on docker hosts —
 but in practice it is still limited: the production cluster has no docker hosts yet, and
-`base`'s `firewall` concern is now applied to `ubongo` (control) but not yet to cluster docker hosts (none exist), so a full cluster `site` run does not
+`base`'s `firewall` concern is mesh-gated until M5, so a full cluster `site` run does not
 yet exist. (The `make check`/`deploy` machinery itself works — first proven by applying
 `dev_env` via `playbooks/workstation.yml`, then `base`/`docker_host`/`reverse_proxy` on
 askari.)
@ -70,7 +70,7 @@ askari.)
 | CIS hardening (Debian L1+L2 + Docker) | ADR-002 / TODO 15 | Implemented by the (unbuilt) `base`/`docker_host` roles; brings AppArmor + AIDE as baseline. L2 partitions affect VM provisioning (ADR-006) |
 | Network IDS + security alerting | ADR-002 / TODO 15 | Suricata on OPNsense + AIDE/`auditd`/`fail2ban` alerting into the monitoring stack; not built |
 | NetBird mesh — coordinator on `askari` | ADR-016 | **BUILT + applied (M4b, 2026-06-16)** — moved up to "Real and working today" (`roles/netbird_coordinator/`). Self-hosted control plane on askari; replaces ADR-007 WireGuard. Mesh **peer enrolment = M5** (next row). |
-| NetBird agent enrollment in `base` | ADR-016 | **BUILT + applied (M5, 2026-06-17).** The `base` `mesh` concern (opt-in `base__mesh_enabled`) installs the pinned NetBird agent + runs `netbird up` with the reusable scoped key from `vault.netbird.setup_key`. Applied to **askari (`100.99.226.39`) + ubongo (`100.99.146.14`)** — both Management+Signal Connected; ubongo↔askari mesh ping verified. Enrollment is **additive** — the "SSH only on `wt0`" firewall lockdown is the deferred mesh-hardening follow-on, NOT applied. **Road-warrior clients (`mamba` + work laptop) enrolled (2026-06-17) → `ubongo` reachable from anywhere: the mobile-access goal is met and Phase 1 (remote access) is COMPLETE.** Client enrollment runbook: `docs/runbooks/netbird-client.md`. |
+| NetBird agent enrollment in `base` | ADR-016 | **BUILT + applied (M5, 2026-06-17).** The `base` `mesh` concern (opt-in `base__mesh_enabled`) installs the pinned NetBird agent + runs `netbird up` with the reusable scoped key from `vault.netbird.setup_key`. Applied to **askari (`100.99.226.39`) + ubongo (`100.99.146.14`)** — both Management+Signal Connected; ubongo↔askari mesh ping verified. Enrollment is **additive** — the "SSH only on `wt0`" firewall lockdown is the deferred mesh-hardening follow-on, NOT applied. Road-warrior clients (laptops) are operator-enrolled. |
 | Service-UI verification (Level 4) | ADR-017 / ADR-008 | **Design RESOLVED** (ADR-017 + spec + plan); resolves ADR-015 deferred #2. `/verify-service` skill + `VERIFY.md` template + standards are authorable and present. **Build pending:** running needs ubongo + `playwright` plugin + Authentik + a staging deploy. |
 | Logging pipeline (Loki + Alloy + off-site subset) | ADR-018 | **Design RESOLVED** (ADR-018 + spec). All logs → on-cluster Loki; security subset write-only off-site to askari. **Build pending:** Alloy in `base`, `loki`/`grafana` service roles, OPNsense syslog — none built. |
 | Security alerting (AIDE/auditd/fail2ban/Suricata + log-silence) | ADR-002 / ADR-018 | Wired into Grafana on the Loki stack. Designed; depends on the logging pipeline + metrics stack (TODO 3.6). |
@ -81,18 +81,6 @@ askari.)
 | Backup `backup` role + `backup_hosts` group | ADR-022 | Does not exist. Pull node (`fisi`), restic repo, rclone→pCloud, USB air-gap — Plan 2. |
 | Per-service `backup__*` contract + `BACKUP.md` | ADR-022 | Convention defined; inert until service roles exist to declare against. |
 ## Integration test harness (ADR-025)
 | Thing | State |
 |---|---|
 | `roles/integration_test/` | **Built** — installs/enables libvirt+QEMU+virtinst on `control` group hosts; adds `sjat`/`claude` to `libvirt` group; creates image-cache dir. Lint clean; applied live to ubongo (substrate installed); molecule scenario present, not run in the build env. |
 | `scripts/integration-vm.py` | **Built** — stdlib-only lifecycle driver over `virsh`/`virt-install`/`cloud-localds`: `up / apply / reboot / assert / cycle / down / prune / console`. Lazily ensures the golden Debian-13 genericcloud image. pytest clean (transient-inventory generation, var/overlay merge, `--certs` mapping, DHCP-lease parsing, resource-guard math). |
 | `tests/integration/` (profile, verify, overrides) | **Built** — "be askari" profile + var overlay + `verify.yml` outcome assertions (Docker active, forward-chain accepts present, published-port DNAT alive). Validated end-to-end by the RED→GREEN acceptance run. |
 | `make test-integration` / `make test-integration-clean` | **Built** — wired into `Makefile`. |
 | ADR-025 | **Accepted (2026-06-18)** — decision recorded, approach A, cert tiers, safety invariants, UEFI boot requirement, and claude-sudo dependency documented. |
 | **RED/GREEN acceptance (ubongo live pass)** | **PASSED (2026-06-18).** A throwaway KVM VM on ubongo reproduced the 2026-06-17 incident (base nftables forward default-deny kills Docker forwarding on reboot) = RED. Applying the `docker_host` container-forward drop-in and rebooting survived = GREEN. Nine shakedown findings captured in `docs/FRICTION.md`; key learnings (UEFI boot, claude sudo) recorded in ADR-025. `docs/TODO.md` item 2.4 closed. |
 | `le-staging` cert validation | **Pending** — wired in v1 but not yet exercised on a real VM (separate from the RED/GREEN acceptance gate). |
 ## Keeping this honest
 Update this file whenever you build, stub, or remove something. It is the first
--- a/docs/FRICTION.md
+++ b/docs/FRICTION.md
@ -22,259 +22,83 @@ earning its keep.
 _(append new raw signals here; the next kaizen review consumes them)_
- `[friction]` **Re-asked settled defaults (push + subagent-driven) at the plan→execute handoff**
+- `[friction]` **Image push to the Forgejo registry fails with `no basic auth
-  (2026-06-19): despite the standing preference (memory `dont-reask-settled-defaults`: push to
+  credentials`** (2026-06-15): `make caddy-image-push` (and `molecule-image-push`) fail
-  origin as off-machine backup **and** go subagent-driven, both WITHOUT asking), I again asked the
+  unless the Docker daemon on ubongo has an interactive `docker login
-  operator "which execution approach?" and "want me to push?". The `writing-plans` skill scripts
+  forgejo.nyumbani.baobab.band` session — and those creds are **not in vault** (only
-  that handoff question ("Which approach?"), and confirming a push felt natural — both overrode the
+  `gandi` + `hetzner` are), so an agent can't complete a push non-interactively. The
-  memory. → at the writing-plans → execution handoff, default to subagent-driven execution and push
+  build half is fully automatable; the push half silently requires a human. → candidate:
-  to origin without a confirmation gate; reserve questions for genuine forks. Recurrence of an
+  document the `docker login` step in `docs/runbooks/claude-code-setup.md`, **or** store
-  already-recorded signal — treat the skill's scripted "Which approach?" as pre-answered
+  a scoped Forgejo registry token in vault + a `make registry-login` target (login via
-  (subagent-driven) for this operator.
+  `--password-stdin`, `no_log`) so pushes are agent-completable like every other
  vault-backed action.
-<!-- The six below are from the 2026-06-17 mesh-hardening-1/3 incident: applying base's
+- `[gotcha]` **Single-file Docker bind mount + atomic config rewrite = stale config in
-nftables default-deny + wt0-only sshd to askari (the off-site Docker host that ALSO runs
+  the running container** (2026-06-16): `reverse_proxy` bind-mounted the Caddyfile as a
-the NetBird coordinator) took it down on reboot; recovery needed the Hetzner console +
+  single file; `ansible.builtin.template` writes atomically (temp + rename → new inode),
-a WAN-SSH break-glass. Spec/plan: docs/superpowers/{specs,plans}/2026-06-17-mesh-hardening-askari-ssh-wt0*. -->
+  so the running container kept the OLD inode and `caddy reload` (in-container, no restart)
  re-read stale config and silently no-op'd (`"config is unchanged"`). The NetBird route
  never loaded → Caddy never requested its cert; surfaced only by a TLS handshake failure.
  Fix: mount the config **directory** (`./caddy` → `/etc/caddy`) — directory mounts reflect
  inode swaps, so live reload works (proven on askari). NOTE the sibling case: NetBird also
  single-file-mounts `config.yaml`, but its handler does `docker compose restart` (not an
  in-container reload), and a restart DOES re-resolve the bind mount (verified: 0 before,
  1 after) — so restart-based roles are safe; only in-place-reload roles need the dir mount.
  → candidate gotcha doc (`docs/testing/gotchas.md`): "reload-in-place needs a directory
  mount; restart-based roles are fine with a single-file mount."
- `[gotcha]` **`base`'s nftables `forward policy drop` breaks Docker hosts on reboot**
+- `[friction]` **`make check` always fails on the first-ever deploy of a compose service
-  (2026-06-17): `base/templates/nftables.conf.j2` sets `chain forward { ... policy drop; }`.
+  role** (2026-06-16): in check mode the "ensure base_dir" task is reported-but-not-run, so
-  On a Docker host, container traffic is *forwarded* (published-port DNAT → container, and
+  the later `community.docker.docker_compose_v2` up fails with `"…is not a directory"`
-  inter-container over the bridge), so the drop kills it. It worked right after `make
+  (missing `project_src`). Not a defect — a real deploy creates the dir — but it means the
-  deploy` (Docker's runtime rules coexisted) but after a reboot nftables loaded our
+  CLAUDE.md "always `make check` before `make deploy`" step is guaranteed-red for any brand
-  default-deny *before* Docker, breaking WAN→Caddy and Caddy→coordinator → the public
+  new stateful role, which erodes trust in the check. → candidate: guard the compose-up with
-  services and the mesh went down. The `docker_host` "`nftables.d` container-forward rules"
+  `not ansible_check_mode` (clean "skipped" in dry-run; compose can't be meaningfully
-  that would make this Docker-safe are explicitly **pending** (STATUS.md). → the `base`
+  dry-run before first deploy anyway), OR document the one-time expected failure. Decide one.
  firewall (`base__firewall_apply`) must NOT be applied to any Docker host until
  `docker_host` ships the container-forward rules; add a guard/check (a Docker host with
  `firewall_apply: true` and no container-forward drop-in is a misconfiguration), and the
  firewall design (ADR-020) should state the Docker-host dependency explicitly.
- `[gotcha]` **`ip_nonlocal_bind` did NOT beat the sshd boot-race** (2026-06-17): the
+- `[recurring]` **Re-asked the operator about settled defaults — push + execution mode**
-  mesh-hardening plan bound sshd `ListenAddress` to the `wt0` IP and set
+  (2026-06-17): at the M5 plan handoff I asked (a) whether to push to origin and (b) which
-  `net.ipv4.ip_nonlocal_bind=1` so sshd could bind the mesh IP before `wt0` exists at
+  execution mode (subagent-driven vs inline) — both already settled: CLAUDE.md says push to
-  boot. In practice the console still showed sshd *"could not assign the address"* at boot
+  `origin` often (off-machine backup), and TODO 10.5 / the standing agreement is "always
-  — so the protection did not work as designed, and because `wt0` never came up (the
+  subagent-driven" (there's even `guard-execution-mode-menu.sh`). Same shape as the 5×
-  coordinator was down), sshd had no listener at all → no SSH path. → the entire
+  "execution-mode menu asked AGAIN" ledger entries — but this time the ask was my own
-  "sshd listens on `wt0` only" premise is unsound without (a) a *verified* boot-race fix
+  free-form prose ("want those pushed now?", "which execution approach?"), which the
-  and (b) a guaranteed non-mesh break-glass. Re-investigate why `ip_nonlocal_bind` didn't
+  existing menu-text matcher does NOT catch (it keys on the writing-plans menu's literal
-  help (ordering vs the sysctl drop-in load? the sysctl not applied before sshd start?),
+  text). → the gap is that the guard only matches that literal menu; free-form re-asks slip
-  or drop ListenAddress-on-mesh entirely and rely on the host firewall for SSH scoping.
+  through. Candidate: widen the Stop-hook matcher to also flag prose re-asks of
  push-vs-not / subagent-vs-inline, since prose reminders have already failed this many
  times. Default behaviour: **push as backup and proceed subagent-driven without asking.**
- `[gotcha]` **The coordinator host can't bootstrap the mesh it depends on** (2026-06-17):
+- `[friction]` **A docs-only commit still tripped the `rbw`-locked pre-commit guard**
-  `askari` runs the NetBird coordinator AND is a mesh peer. After a reboot its NetBird
+  (2026-06-17): committing only `docs/superpowers/specs/*.md` (no ansible content) was
-  agent needs the coordinator (a local container) to be serving to bring up `wt0` — but
+  blocked needing the vault password, although the 2026-06-10 kaizen fix scoped the
-  the coordinator wasn't healthy, so `wt0` never came up. Circular. Combined with sshd
+  pre-commit `ansible-lint` hook (`always_run: false` + `files:` ansible content) so
-  being `wt0`-only, the host was reachable only via the Hetzner console. → the
+  docs-/config-only commits skip it and need no vault. So either the hook's `files:`
-  coordinator host must keep a **non-mesh management path always** (don't move its SSH onto
+  pattern still matches `docs/**` (or `.md`), or a blanket pre-commit step needs the
-  `wt0`), or the mesh-hardening must treat the coordinator host as a special case. General
+  vault regardless. → check `.pre-commit-config.yaml`'s `files:`/`exclude:` against the
-  rule: never make a host's only management path depend on a service that host itself
+  spec/plan paths; docs-only commits should not require `rbw`.
  hosts.
- `[gotcha]` **NetBird `netbird-server` FATAL-loops on the geolocation DB download with no
+- `[friction]` **The agent can't manage `ubongo` (the control node it runs ON) without
-  egress** (2026-06-17): on startup the combined `netbird-server:0.72.4` tries to download
+  the operator granting access** (2026-06-17): enrolling `ubongo` in the mesh needed two
-  the GeoLite2 DB from `pkgs.netbird.io` and treats failure as **FATAL** (crash-loop) — so
+  manual operator grants because the agent runs as `claude` (no sudo) but the inventory
-  any loss of container egress (here: Docker NAT masquerade wiped when `nftables` was
+  manages `ubongo` as `sjat`: (1) `claude`'s SSH key added to `sjat`'s `authorized_keys`
-  flushed, not re-added by a plain `restart docker`) takes the whole control plane down.
+  (`Permission denied (publickey)` otherwise), then (2) `NOPASSWD` sudo for `sjat`
-  Recovery was `restart docker` (rebuild NAT) → force-recreate the container so it could
+  (`Missing sudo password` otherwise). So the "AI-worker control node" (ADR-015) can drive
-  download. → for the `netbird_coordinator` role: pre-seed/persist the geo DB in the data
+  the whole fleet but not itself, unattended. This is the **pending `ansible`-user
-  dir (or pin a local copy), or disable the geolocation requirement, so a transient egress
+  bootstrap** gap (STATUS) biting in practice. → the proper fix is ubongo's bootstrap to a
-  blip can't FATAL the coordinator. Note for the firewall design: container egress (NAT)
+  key-trusted, NOPASSWD `ansible` (or `sjat`) management identity as part of `base`/its
-  is fragile across `nft flush` + reboot.
+  control-node recipe, so control-node self-management doesn't need ad-hoc operator grants.
- `[friction]` **No off-site coordinator backup turned a 2-minute restore into a long live
+- `[recurring]` **ADRs claim cross-doc reconciliation they didn't actually perform**
-  recovery** (2026-06-17): the NetBird coordinator's stateful store (`/var/lib/netbird`,
+  (2026-06-14): ADR-024's Status + Consequences asserted "ADR-017 prose that mentioned
-  encrypted SQLite) has **no off-site backup yet** (ADR-022 `backup` role pending,
+  Traefik is updated to read Caddy" — but ADR-008/017/019 + CAPABILITIES still said
-  flagged in STATUS as the coordinator's deferred backup). During the incident there was a
+  Traefik; the rename was left half-done across the doc set and the ADR over-claimed its
-  real fear the unclean reboots had corrupted the store, with no restore path. It turned
+  own follow-through. Surfaced only by a full-repo `grep Traefik` during `/review-repo`.
-  out to be a runtime/egress issue, not corruption — but the absence of a backup made the
+  Same shape as the deferred-decision-goes-stale signal (a decision lands in one place,
-  whole recovery higher-stakes. → prioritise the ADR-022 backup contract for the
+  its promised ripple edits don't). → candidate `repo-scan.py` check: when an ADR's text
-  `netbird_coordinator` store ahead of the rest of the backup role; a recent off-host copy
+  asserts "X is updated to Y" / supersedes a named tool, flag remaining occurrences of the
-  would have made "rebuild askari from scratch" a safe option.
+  old name (or verify the claimed edit landed) — the structural cousin of `stale-deferred`.
-
+  (KEEP-OPEN per the 2026-06-14 `/kaizen` run — it's its own build task.)
 - `[friction]` **The plan tested reboot-recovery AFTER removing the break-glass**
  (2026-06-17): the mesh-hardening plan's live cutover closed the WAN `:22` (step 5)
  *before* the reboot-resilience test (step 7), so the one fallback path was gone exactly
  when the reboot exposed the boot-race + Docker-firewall bugs. → sequencing rule for
  lockout-risky cutovers: **validate reboot-recovery while the old access path is still
  open**, and only retire the break-glass once recovery (incl. a reboot) is proven.
  Generalises beyond this milestone — a candidate line in the new-host / hardening runbooks.
 <!-- The below are from the 2026-06-18 ADR-025 build: standing up the local-VM integration
 harness on ubongo and shaking it down against real KVM (spec/plan in docs/superpowers/). -->
 - `[gotcha]` **Debian 13 genericcloud boot-loops under legacy BIOS/SeaBIOS** (2026-06-18):
  `virt-install --import` of the genericcloud qcow2 with the default (SeaBIOS) firmware
  triple-faults at the real-mode kernel handoff — GRUB loops, no "Decompressing Linux", no
  DHCP lease. The symptom (no network) pointed away from the cause (firmware). → boot test
  VMs via **UEFI** (`virt-install --boot uefi`; OVMF→efistub).
 - `[friction]` **The no-sudo `claude` model blocked diagnosing a failed VM** (2026-06-18):
  under ADR-015 `claude` had no sudo, so when the VM wouldn't network there was no way to
  introspect it (serial logs are `root:0600`, libguestfs not installed, mounting needs
  root). Diagnosis was fully blocked until the operator granted `claude` sudo. → DECISION:
  `claude` gets `NOPASSWD:ALL` (reverses ADR-015's "no local sudo"); compensating control
  is auditd/Loki attribution (already in ADR-015). Amend ADR-015/ADR-021 + accepted-risks;
  codify the sudoers drop-in in Ansible.
 - `[gotcha]` **Non-root `virsh`/`virt-install` default to `qemu:///session`** (2026-06-18):
  the substrate (NAT net, /dev/kvm) lives on `qemu:///system`. → pin
  `LIBVIRT_DEFAULT_URI=qemu:///system` in the driver.
 - `[gotcha]` **`qemu:///system` (libvirt-qemu) can't traverse `/home`** (2026-06-18): VM
  disk/seed/console under the repo/home failed "Permission denied (search permissions for
  /home/claude)". → put per-VM artifacts in a system-readable dir (`/var/lib/boma-integration`,
  group libvirt); the inventory (read by ansible as the user) can stay in the repo.
 - `[gotcha]` **`ansible-playbook -i <dir>/` parses sibling non-inventory files as INI**
  (2026-06-18): pointing `-i` at a run-dir holding a state file + qcow2s made the directory
  inventory loader parse the state file as INI → phantom hosts INCLUDING the real `askari`
  (with its real vars), breaking the single-host isolation invariant. → point `-i` at the
  single `hosts.yml`. Caught by the holistic cross-file review BEFORE any hardware run.
 - `[gotcha]` **Jinja `{%- -%}` + ansible `trim_blocks=True` double-strip newlines**
  (2026-06-18): a template edit used `{%- -%}`, reviewed by rendering with RAW jinja2
  (trim_blocks=False) which looked fine; ansible (trim_blocks=True) then collapsed the
  rendered Caddyfile onto single lines → caddy crash-looped on invalid config. → verify
  templates with ansible's whitespace (trim_blocks=True), not raw jinja2; prefer plain
  `{% %}` at column 0 (the repo's existing style).
 - `[gotcha]` **Fresh cloud images have empty apt lists** (2026-06-18): `apt install
  nftables` failed "No package matching 'nftables' is available" on a fresh genericcloud
  VM whose cloud-init had `package_update: false`. → `package_update: true` AND block on
  `cloud-init status --wait` before applying.
 - `[gotcha]` **base's default-deny firewall drops SSH to a NAT'd VM unless the gateway is
  allowed** (2026-06-18): the driver reaches the VM via the libvirt-NAT gateway
  (192.168.150.1). `ct established,related accept` saves the in-flight apply connection,
  but a fresh post-reboot SSH is dropped without an explicit allow. → test overlay sets
  `base__firewall_control_addr` to the NAT gateway.
 - `[recurring]` **Real-hardware shakedown and static review each caught what the other
  couldn't** (2026-06-18): the qemu-URI, storage-path, UEFI, apt-list, and caddy-render
  bugs ALL surfaced only on a live KVM run; the phantom-host inventory bug surfaced only in
  the holistic cross-file review. → for infra this novel, budget for BOTH an adversarial
  cross-file review AND a real-hardware run; neither alone would have shipped it working.
 <!-- From the 2026-06-19 mesh-hardening-2/3 design (ubongo INPUT-only default-deny). -->
 - `[friction]` **Raw DHCP leases pinned in ubongo's host firewall (admin-addr SSH allows)**
  (2026-06-19): mesh-hardening 2/3 lets the operator workstations reach ubongo's LAN SSH by
  *raw lease* — `base__firewall_admin_addrs: ["10.20.10.50" (mamba), "10.20.10.17"]` — because
  there is no DHCP reservation yet (OPNsense isn't managed as code). A lease reassignment
  silently moves the allow to whatever host next holds the IP (still SSH-key-gated) and drops
  the workstation's *LAN* path (mesh still works, so never a full lockout). → when
  OPNsense-as-code lands (ADR-020 perimeter / TODO 3.5), replace both with **MAC-pinned DHCP
  reservations** (`10.20.10.17` = MAC `bc:0f:f3:c8:4a:8a`; mamba's MAC TBD) and allow the
  reserved IPs. Spec: `docs/superpowers/specs/2026-06-19-mesh-hardening-ubongo-default-deny-design.md`.
 - `[gotcha]` **`make test-integration` on ubongo fails (`qemu-img` "Permission denied") when
  the agent session predates the `libvirt` group grant** (2026-06-19): the `integration_test`
  role adds `claude` to `libvirt`+`kvm` and makes the cache dir `/var/lib/boma-integration`
  `root:libvirt 2775` — correct — but a `claude` session whose shell started *before* that
  grant carries a stale process group set (`id` → `claude,docker` only, no `libvirt`), so
  `qemu-img create` of the VM overlay into the group-owned dir is denied. `virsh`/`virt-install`
  still work (they reach system libvirtd via polkit/socket, and the real KVM runs server-side
  as `libvirt-qemu`), so ONLY claude's own file-writes break. Unblock without restarting the
  session: **`sg libvirt -c 'make test-integration HOST=<name>'`** (claude needs only `libvirt`
  for the dir; `kvm` is server-side; note `sg` adds one group, not the full set). → self-heal
  in `scripts/integration-vm.py`: if the `libvirt` gid is absent from `os.getgroups()`, re-exec
  under `sg libvirt` (or have the Makefile target do it), so a stale-session agent never hits
  this opaque symptom. New agent sessions pick the groups up on login, so it's a stale-session
  transient — but high-confusion, worth self-healing.
 - `[friction]` **No standard for when the agent may run local-VM integration tests on ubongo
  without asking** (2026-06-19): `make test-integration HOST=<name>` spins an ISOLATED throwaway
  KVM VM (its own libvirt NAT; never touches the real host's firewall/network; guards:
  one-VM-at-a-time + a 4 GiB free-RAM floor + auto-destroy on success), so it is safe and
  self-contained — yet the agent paused for a go-ahead before running it (mesh-hardening 2/3,
  Task 4). The operator wants a STANDARD that pre-authorises VM-testing on ubongo so the agent
  just runs it. → decide + record the rule: e.g. a `.claude/settings.json` permission allow for
  `make test-integration*` / `scripts/integration-vm.py` (and the `sg libvirt -c '…'` form per
  the gotcha above), plus a CLAUDE.md line distinguishing the pre-authorised isolated VM tests
  from the genuinely-gated live steps (`make deploy` to real hosts, host reboots, cutovers —
  still need a go-ahead). Ties to the `test-risky-infra-before-live-deploy` +
  `dont-reask-settled-defaults` memories + ADR-025.
 - `[gotcha]` **Molecule covers only the `input_only`-OFF (forward drop) branch of the base
  firewall** (2026-06-19): mesh-hardening 2/3 added `base__firewall_input_only` (forward policy
  drop↔accept). The `default` Molecule scenario renders ONE fixture, set to the secure default
  (drop) — so the fast `make test ROLE=base` gate locks the drop default (security-critical for
  service hosts) but does NOT exercise the `=true` → forward-`accept` rendering; only `make
  test-integration HOST=ubongo` does (passed GREEN). An in-converge re-render can't cheaply
  cover it (role defaults aren't in scope outside the role run). → decide in kaizen: a second
  Molecule scenario (`molecule/input-only/`) asserting forward `policy accept`, vs accepting the
  integration-only coverage. Final-review finding; not a cutover blocker (the accept branch is a
  literal, and a var-name break would fail the drop branch too → caught).
 - `[gotcha]` **Applying base's firewall to a Docker host flushes Docker's nat → container
  egress dies until `restart docker`** (2026-06-19, mesh-hardening 2/3 live cutover): base's
  `nftables.conf.j2` starts with `flush ruleset`, which wipes ALL tables incl. Docker's
  `ip nat`/`ip filter` (+ libvirt's). On ubongo I chose INPUT-only so `forward` stays `accept`
  — yet the apply STILL broke CONTAINER egress: `docker pull` worked (dockerd uses HOST egress)
  but a container `ping` FAILED — the masquerade (SNAT) was gone, so replies couldn't return.
  `forward accept` permits forwarding but can't replace the missing nat. The spec's "input-only
  keeps Docker egress working" was therefore **incomplete**, and the local-VM harness couldn't
  catch it (the test VM runs no Docker). Fix on the live host: `systemctl restart docker`
  re-adds its `ip nat`/`ip filter` (egress restored; coexists fine with base's `inet filter`).
  On REBOOT it self-heals (dockerd re-adds nat on boot; `forward accept` doesn't block — unlike
  the 2026-06-17 `forward drop` incident). → (1) any cutover/runbook applying base firewall to a
  Docker host MUST `restart docker` + check container egress after the apply; (2) the pending
  `docker_host` nftables integration should own re-adding/persisting Docker's rules so base's
  `flush` is safe; (3) the firewall final-review checklist should include "does the host run
  Docker/libvirt? the flush wipes their nat."
 <!-- From the 2026-06-19 mesh-hardening 3/3 (askari INPUT-only integration gate). -->
 - `[gotcha]` **`inet filter` default-deny blocks libvirt dnsmasq DHCP — silent, hard to diagnose**
  (2026-06-19, task-3 integration gate): when `base__firewall_input_only: true` is applied to
  ubongo, the `table inet filter { chain input { policy drop; } }` blocks DHCP packets that arrive
  via the libvirt bridge (`virbr-boma`). In nftables, every base chain registered on the same
  hook evaluates the packet independently: an `accept` verdict only ends evaluation of the base
  chain that issued it, whereas a `drop` in any chain is final. So an `accept` in
  `table ip filter LIBVIRT_INP` does NOT prevent
  `table inet filter` from seeing and dropping the same packet. VMs never got DHCP leases (dnsmasq
  socket confirmed by strace to never receive POLLIN despite tcpdump seeing the packet on
  `virbr-boma`). Diagnosed by temporarily changing `inet filter input` to `policy accept` → fd=3
  immediately fired. Fix: `/etc/nftables.d/10-libvirt-boma.nft` drop-in adding
  `iifname "virbr-boma" accept` (survives service restarts via `include "/etc/nftables.d/*.nft"`).
  → The `base` role's template needs a `base__firewall_trusted_bridges` variable so this is
  encoded at the Ansible level, not in a manual host drop-in. Every host that runs Docker or
  libvirt and also has `base__firewall_input_only: true` needs an analogous exception.
 - `[gotcha]` **libvirt `leaseshelper` PID-file permission: `virPidFileReleasePath` unlinks
  `/run/leaseshelper.pid` after EVERY call; the `nobody` user cannot recreate it** (2026-06-19, task-3
  integration gate): dnsmasq runs as `nobody`; `libvirt_leaseshelper` is its `--dhcp-script`. The
  helper acquires a PID-file mutex at `/run/leaseshelper.pid`, but `virPidFileReleasePath`
  UNLINKS the file on exit. `/run/` is `root:root 755`, so `nobody` cannot create the file after the
  first unlink → every subsequent `add` call fails with `errno=13` (`EACCES`), and dnsmasq silently
  drops the DHCP grant (no log, no error to the client). Fix: a setuid-root C wrapper at
  `/usr/lib/libvirt/libvirt_leaseshelper` (original moved to `.real`) that pre-creates
  `/run/leaseshelper.pid` owned by `nobody`, then drops privileges and execs the real helper. The
  root dnsmasq fork calls the wrapper; the setuid bit gives it permission to write in `/run/`; once
  it is back at the `nobody` uid the PID file remains. Also: `/var/lib/libvirt/dnsmasq/` must be `nobody:nogroup 775`
  so leaseshelper can update `virbr-boma.status`. This fix is host-local on ubongo and NOT in
  Ansible — encode it in an `integration_test` role task (or a libvirt role) before the harness
  can be safely re-deployed.
 - `[gotcha]` **cloud-init rejects underscores in `local-hostname` → silently skips
  network-config → VM never gets DHCP** (2026-06-19, task-3 integration gate): setting
  `local-hostname: boma-it-askari_inputonly-<uuid>` caused cloud-init-local to consider the
  hostname invalid and skip writing the network-config to the system. Systemd-networkd then
  used the genericcloud default (no DHCP), so VMs got only IPv6 link-local. Fix in
  `scripts/integration-vm.py`: `name.replace("_", "-")` in the meta-data hostname (disk paths
  and virsh domain names keep the original underscore). Sanitization rule: RFC-952 hostnames
  allow hyphens, not underscores.
 - `[friction]` **Molecule Docker image can't `apt install` → roles with real package tasks
  have no Molecule substrate coverage** (2026-06-19): the Docker Molecule image ships with
  cleared apt-lists and no internet access, so any role whose core work is `apt install` —
  `base`, `docker_host`, `integration_test` — cannot cover its package/substrate tasks in
  Molecule. Those tasks are validated only by `make test-integration` (ADR-025, real KVM).
  The gap is systemic: it affects every role with non-trivial package or system-level setup.
  → systematization idea: provide a Molecule image or driver that can install packages (e.g.
  a custom Docker image with pre-seeded apt-lists, or a `prepare.yml` that pre-installs
  packages from a local cache), or an alternative driver (e.g. `molecule-libvirt` using the
  same KVM harness), so substrate tasks get real Molecule unit coverage rather than relying
  entirely on the integration harness.
 ---
@ -282,29 +106,6 @@ harness on ubongo and shaking it down against real KVM (spec/plan in docs/superp
 Consumed signals and where their resolution now lives. Newest first.
 ### 2026-06-17
 Second `/kaizen` run. 7 signals triaged; all 7 consumed (0 kept open). Two heavier items
 (the `rename-incomplete` scan check and the Forgejo registry-login path) were built by
 parallel subagents and verified against the diff. **Bias-to-remove note:** one PARK
 (the ubongo self-management gap — out-of-phase, already tracked in STATUS) and zero
 REMOVE; the rest accreted (migrate/change). None of the open signals were `[unused]`
 *tooling*, so there was nothing to delete — the only reductive move available was parking
 the out-of-phase build. **Cadence:** healthy — 3 days after the first run, every signal
 0–2 days old except the one carried over from 2026-06-14; the "recurring ≥3" nudge in
 `scripts/friction-scan.py` didn't fire this pass (all recurrence counts were 1), so the
 thresholds need no change.
 | Signal (first seen) | Verdict | Resolution / where it lives now |
 |---|---|---|
 | ADRs claim cross-doc reconciliation they didn't perform (06-14) | SYSTEMATIZE | New `rename-incomplete` check in `scripts/repo-scan.py` (+7 tests): when a numbered ADR announces a rename `Old`→`New`, flag any design-doc line where `Old` still appears in present tense (skips the announcing ADR, lines also naming `New`, and historical/negation cues; rejects `ADR-NNN` tokens as terms). 0 findings on the current tree — the Traefik→Caddy ripple edits have landed. Structural cousin of `stale-deferred`; run by `/review-repo`. (Was KEEP-OPEN on 2026-06-14 — now built.) |
 | Image push to the Forgejo registry needs an interactive `docker login` (06-15) | SYSTEMATIZE → vault | Vault-backed login path so pushes are agent-completable: `vault.forgejo.registry_token` stub (CHANGEME, operator-minted) + `scripts/registry-login.sh` (reads the token, `docker login --password-stdin`, never echoes it) + `make registry-login` + a prereq note in `docs/runbooks/claude-code-setup.md`. Works once the operator fills the token via `make edit-vault`. |
 | Single-file bind mount + atomic rewrite = stale config (06-16) | SYSTEMATIZE | → `docs/testing/gotchas.md` — "Single-file bind mount + atomic rewrite = stale config (reload-in-place only)": `template` writes a new inode, a single-file bind mount pins the old one, so an in-container reload reads stale config. Mount the config *directory* for reload-in-place roles; restart-based roles are fine with a single-file mount. |
 | `make check` always fails on the first-ever deploy of a compose service role (06-16) | CHANGE | `check_mode: false` on the `state: directory` scaffold tasks in `roles/reverse_proxy` + `roles/netbird_coordinator`, so the base dirs exist under `--check` and the rest of the dry-run (templates + compose) evaluates instead of failing on a missing `project_src`. Inert under converge → Molecule unchanged. |
 | Re-asked settled defaults — push + execution mode, in prose (06-17) | CHANGE (exec) + ACCEPTED (push) | Widened `.claude/hooks/guard-execution-mode-menu.sh` to also catch free-form *prose* re-asks of the subagent-vs-inline choice (`"which execution approach?"`, `"subagent vs inline"`, …), not just the literal menu; tested. The push re-ask stays a soft default via the `dont-reask-settled-defaults` memory — a genuine "should I push?" is sometimes legitimate, so it is deliberately not hard-blocked. |
 | Docs-only commit tripped the rbw-locked pre-commit guard (06-17) | CHANGE | Root cause was NOT the ansible-lint `files:` scope (innocent) — it was `.claude/hooks/guard-vault-preflight.sh` blocking *every* locked `git commit`. Rewrote it to inspect the staged set (`git diff --cached`, plus `-a`/`--all`) and block only when Ansible content (`^(roles\|playbooks\|inventories)/.*\.ya?ml$`) is staged; docs-/config-only commits are now exempt. Fail-safe to block when unsure. Tested. |
 | Agent can't self-manage `ubongo` (the control node it runs on) without operator grants (06-17) | PARK | The knowledge already lives in `STATUS.md` (control-node row: the interim `claude`-key + `sjat` NOPASSWD grants, and **Pending:** the proper `ansible`-user bootstrap) and the `ubongo-self-sufficiency` memory. Out-of-phase — the fix is the control-node bootstrap recipe, a tracked future build. **Resurrection trigger:** when building ubongo's `base` hardening / `ansible`-user bootstrap, fold in key-trusted NOPASSWD self-management so control-node self-management needs no ad-hoc operator grants. |
 ### 2026-06-14
 First `/kaizen` run (dogfood). 12 signals triaged; 11 consumed, 1 kept open (#13 above —
--- a/docs/ROADMAP.md
+++ b/docs/ROADMAP.md
@ -13,7 +13,7 @@ as ordering changes, or as new milestones appear. Each milestone gets its own
 spec → plan → implementation cycle (`docs/superpowers/specs/` then `…/plans/`) when it
 comes up; this file stays high-level.
-_Last updated: 2026-06-19._
+_Last updated: 2026-06-11._
 ---
@ -43,10 +43,9 @@ this collapses into interleaving with extra context-switching cost).
 ---
-## Phase 1 — Off-site / Remote-access — ✅ COMPLETE (2026-06-17)
+## Phase 1 — Off-site / Remote-access
 Delivers mobile access to `ubongo`; proves the machinery. Ordered by *real* dependencies.
 All milestones (M1–M5) done; the mobile-access goal is met. Next: the Procurement gate.
 ### M1 · boma's DNS home — a new domain at Gandi, managed as code
@ -135,14 +134,14 @@ Dashboard live at `https://netbird.askari.wingu.me` (valid LE cert); `/api` auth
 - **Maps to:** ADR-016 (mesh), ADR-004 (one service = one role), ADR-021 (access),
  ADR-022 (backup), ADR-008/017 (VERIFY), accepted-risk R3 (askari public surface).
-### M5 · Enroll peers → goal reached — ✅ DONE (2026-06-17)
+### M5 · Enroll peers → goal reached — ✅ infra done (2026-06-17); laptops = operator step
 The `base` `mesh` concern enrolled **`ubongo` (`100.99.146.14`) + `askari`
 (`100.99.226.39`)** as NetBird peers — both Management+Signal Connected, the ubongo↔askari
 mesh link ping-verified. NetBird ships a default **Allow-All** peer policy, so any enrolled
-peer reaches `ubongo` over `wt0`. The road-warrior clients (**`mamba` + the work laptop**)
+peer can already reach `ubongo` over `wt0`. **Remaining (operator):** install the NetBird
-are enrolled (operator, via `docs/runbooks/netbird-client.md`) → **`ubongo` is reachable
+client on `mamba` + the work laptop and log in → `ubongo` reachable from anywhere. **← the
-from anywhere. ← the mobile-access goal is met; Phase 1 is complete.**
+mobile-access goal lands when the laptops join.**
 - **Deferred to a "mesh-hardening" follow-on** (was folded into M5; split out as the
  lockout-risky part): apply `base` nftables **default-deny** to `ubongo` + set
@ -206,22 +205,6 @@ Canonical dependency order:
 ## Next step
-**Phase 1 complete (M1–M5); mesh-hardening: ubongo (2/3) DONE 2026-06-19, askari redesign DONE 2026-06-20.**
+**M1 (Gandi DNS migration, IaC)** design is written —
-Both hosts now run INPUT-only nftables default-deny (`base__firewall_input_only`), live reboot-validated.
+`docs/superpowers/specs/2026-06-11-public-dns-gandi-migration-design.md`. Next: user
-askari's redesign (spec/plan `docs/superpowers/{specs,plans}/2026-06-19-mesh-hardening-askari-redesign*`)
+review → implementation plan.
 applied INPUT-only default-deny + `wt0`-primary SSH + a permanent WAN break-glass + a geo-disabled
 coordinator; a real reboot recovered unattended. Remaining mesh-hardening sub-projects:
 1. ~~`ubongo` nftables default-deny + `ssh-from-control`~~ → **DONE (2026-06-19).**
 2. ~~**redesign** `askari`'s SSH → `wt0`~~ → **DONE (2026-06-20)** — boot-race, coordinator-bootstrap
   chicken-egg, and Docker-nat-flush all resolved + live reboot-validated.
 3. ~~**askari relay-SPOF reduction**~~ → **DONE (2026-06-20)** — assessed + **accepted** as a
   documented availability risk (R8 + ADR-016 availability amendment): the blast radius is
   narrow (LAN/intra-cluster/local traffic never touch askari), so no P2P / second relay /
   second coordinator was warranted. Hardened the one real gap — a managed-host coordinator-FQDN
   DNS pin (`base__mesh_coordinator_pin`). The coordinator off-site backup gap is handed to ADR-022.
 4. **NetBird ACL off Allow-All** to scoped policies (open mechanism question — no headless API path).
 5. **ADR-022 backup kickoff** — off-site backup of the `netbird_coordinator` store (named in R8 /
   BACKUP.md) as the first slice of the backup role (restic + the `fisi` pull node).
 **Then** the Procurement gate (`/capacity-review` → buy Proxmox hardware) opens Phase 2.
--- a/docs/TODO.md
+++ b/docs/TODO.md
@ -17,7 +17,6 @@
      calls, curl pulls of web products, log reviews. Headless browsing → ADR-017
      (`/verify-service`); the API/curl/log-review siblings remain open.
   3. ~~Standard for test users + manual-test instructions.~~ → ADR-017.
   4. ~~Local VM integration testing on ubongo.~~ → ADR-025 / `make test-integration` (built + RED→GREEN validated 2026-06-18).
 3. **Building services**
   1. ~~Decide how to manage logs.~~ → ADR-018.
@ -85,13 +84,6 @@
    5. ~~Always subagent-driven?~~ → DECIDED: yes (standing agreement; enforced by `.claude/hooks/guard-execution-mode-menu.sh`).
    6. When the AI deploys (i.e. runs playbooks etc.), should we define a methodology so that it does not have to poll all the time or review all the output? Perhaps something about the MAKE method could provide only the relevant feedback?
    7. ~~Reproducible agent toolchain.~~ → `.claude/settings.json` + `docs/runbooks/claude-code-setup.md`.
    8. **Screenshot hand-off to the agent.** Give the operator a smooth way to hand the
       agent a screenshot (e.g. of a Hetzner/VNC console during an incident) — the agent
       can already read image files; the gap is the hand-off. During the 2026-06-17
       incident the only diagnostic channel was console screenshots, copied manually to
       `/tmp` and `find`-located. Options: a known drop path the agent checks (e.g.
       `~/screenshots/`), a small `screenshot`/paste helper or slash-command, or a
       clipboard→file convention. Cheap, high-value for incident work.
 11. **Kaizen loop** — `/kaizen` built (STATUS).
    1. ~~Build the loop command.~~ → `/kaizen` (`scripts/friction-scan.py` + `.claude/commands/kaizen.md`; spec `docs/superpowers/specs/2026-06-14-kaizen-command-design.md`).
@ -128,7 +120,6 @@
    6. Supply-chain hygiene: enforce tiered image pinning (stateful `tag@digest`;
       stateless rolling tags — ADR-011) + official/verified images via the service
       checklist; revisit active scanning (Trivy/Grype) once a triage stack exists (R1).
    7. Is our network setup as it should be? I am not sure whether all traffic between ubongo and the nodes goes via askari. What if askari breaks — will the rest still work?
 16. **ADR-011 (update management) — resolve open questions + accept.** Committed as
    **Proposed**; resolve before marking Accepted:
--- a/docs/decisions/008-testing.md
+++ b/docs/decisions/008-testing.md
@ -154,7 +154,6 @@ Level 2 (staging) or Level 3 (external). This is a conscious, documented decisio
 | Capability | Reason not testable in Molecule |
 |---|---|
 | `nftables` rule loading | Requires `nf_tables` kernel module; not available in Docker |
 | **Reboot-survivability / host-firewall × Docker interaction / boot-ordering** | **Requires a real kernel reboot — the class that caused the 2026-06-17 mesh-hardening incident. Now covered by local VM integration testing (ADR-025).** |
 | NetBird mesh data plane (`wt0` WireGuard interface) | Requires the `wireguard` kernel module; Molecule checks only that the agent is installed/configured (ADR-016) |
 | `unattended-upgrades` behaviour | Installs correctly; actual upgrade behaviour requires a real apt environment |
 | DHCP behaviour (OPNsense) | OPNsense is managed by Ansible but not testable in a container |
@ -166,11 +165,6 @@ For the above, Molecule tests only what it can: that the relevant packages are
 installed, that configuration files render correctly, and that services are enabled.
 Behavioural correctness is confirmed on staging.
 **ADR-025 is the concrete build of Level 2/3** — local VM integration testing on
 ubongo (libvirt/KVM, throwaway overlay VMs, stdlib-only driver). It specifically
 targets the reboot-survivability / host-firewall × Docker / boot-ordering class that
 Molecule structurally cannot reach. See `docs/decisions/025-local-vm-integration-testing.md`.
 ---
 ### CI pipeline
--- a/docs/decisions/015-control-host.md
+++ b/docs/decisions/015-control-host.md
@ -2,10 +2,7 @@
 ## Status
-Accepted (2026-06-05). **Amended 2026-06-18:** the `claude` AI-worker account now has
+Accepted (2026-06-05)
 `NOPASSWD:ALL` sudo on `ubongo` — reversing the original "no local sudo" sub-decision.
 The amendment is recorded in §Access & security below; rationale and accepted risk are
 in ADR-021 and `docs/security/accepted-risks.md` (R7).
 ## Context
@ -46,12 +43,8 @@ points at this physical box. This *strengthens* the ADR-009 control-node excepti
 it is genuinely outside Terraform's world, not a VM pretending to be the exception.
 Every other host stays a Terraform-managed VM exactly as designed.
-`ubongo` runs **plain Debian 13** (the `base` role applies). It is not a production
+`ubongo` runs **plain Debian 13** (the `base` role applies). It is not a hypervisor
-hypervisor and runs no `docker_host` services. It does run **ephemeral KVM test VMs**
+and runs no `docker_host` services.
 as part of its local-test-runner role (ADR-025 — local VM integration testing): one
 throwaway VM at a time (~3 GiB RAM), against ~13 GiB free of the 16 GiB sized here.
 This is not a production workload — it is the concrete implementation of ADR-008 Level
 2/3, and the resource guard enforces one-at-a-time to stay within the RAM ceiling.
 ### Hardware target
@ -91,38 +84,12 @@ Manual, on bare metal:
  only** — key-only, with password auth and root login disabled — until the NetBird mesh
  (ADR-016) is stood up.
 - **AI-worker identity:** `ubongo` runs the AI worker under a dedicated,
-  password-locked `claude` user (in the `docker` and `libvirt` groups; **`NOPASSWD:ALL`
+  password-locked `claude` user (in the `docker` group for Molecule; **no local sudo** —
-  sudo** via a repo-managed drop-in — see amendment below). It is reached via `sudo -iu
+  boma deploys reach the fleet over SSH as the `ansible` user, not via local root). It is
-  claude` or its own SSH key. The rationale is **attribution + revocation, not
+  reached via `sudo -iu claude` or its own SSH key. The rationale is **attribution +
-  containment**: auditd/Loki (ADR-018) can separate human from agent actions, and the
+  revocation, not containment**: auditd/Loki (ADR-018) can separate human from agent
-  account/key can be revoked without touching the operator's access. (ADR-021 left the
+  actions, and the account/key can be revoked without touching the operator's access.
-  on-`ubongo` agent identity unspecified; this records it.)
+  (ADR-021 left the on-`ubongo` agent identity unspecified; this records it.)
  **Amendment (2026-06-18) — `claude` now has `NOPASSWD:ALL` sudo.**
  > **Superseded by [ADR-025](025-local-vm-integration-testing.md)** (per ADR-023 §4): the
  > "no local sudo" sub-decision is reversed. The shakedown that necessitated it is ADR-025;
  > the resulting two-account access model is ADR-021; the accepted risk is R7.
  During the
  integration-testing harness shakedown, the original "no local sudo" sub-decision was
  reversed. No-sudo blocked the AI-worker from diagnosing a failed VM: `virsh`,
  `virt-install`, `cloud-localds`, `journalctl`, `nft` — nearly all low-level
  diagnostic commands — require root. The AI-worker must autonomously spin up,
  inspect, and tear down test VMs without operator hand-holding; that is the harness's
  core value proposition. Compensating controls make the risk acceptable:
  1. `claude`'s password is **locked** (no interactive login, no `su claude` without the
     operator's own credentials) — `NOPASSWD` sudo is the *only* sudo path.
  2. `auditd` + Loki attribution (ADR-018) separates human from agent root actions.
  3. The drop-in is **repo-managed** via `base__ai_worker_user` — revocable in one commit
     and one deploy.
  4. Single-operator homelab: everything in git, off-machine backups (ADR-022).
  The operator (`sjat`) uses **password-required sudo** via the `sudo` group; their
  former `NOPASSWD` drop-in was removed 2026-06-18 as redundant once `claude` had sudo
  (least-privilege cleanup). The accepted risk is registered as R7 in
  `docs/security/accepted-risks.md`. ADR-021 records the resulting sudo model for both
  accounts.
 - **Disk encryption:** `ubongo`'s SSD is **not encrypted at rest** — the SanDisk X600 is
  TCG-Opal-capable but Opal is unused. This is an accepted risk recorded in
  `docs/security/accepted-risks.md` (control-node disk not encrypted at rest),
--- a/docs/decisions/016-mesh-vpn.md
+++ b/docs/decisions/016-mesh-vpn.md
@ -85,9 +85,8 @@ allocated for it.
 - **Bootstrap order:** stand up the coordinator on `askari` → enroll `ubongo` →
  `base` enrolls the fleet.
 - **Coordinator survival:** off-site on `askari` ⇒ mesh survives a homelab outage.
-  NetBird's management datastore is **intended** to be backed up encrypted off `askari`
+  NetBird's management datastore is backed up encrypted off `askari` (synced to
-  (synced to `ubongo`/`mamba`; not yet built — see the Availability amendment / R8); peers
+  `ubongo`/`mamba`); peers keep last-known config through a brief coordinator outage.
  keep last-known config through a brief coordinator outage.
 - **`askari` is Ansible-managed:** its own inventory group `offsite_hosts` — provisioned
  as **Terraform IaC** (`hetznercloud/hcloud`), managed independently of the Proxmox
  cluster (its own provider + local state). Ansible configuration: `base` role, plus a
@ -117,7 +116,7 @@ allocated for it.
  address as a mesh-independent secondary path, so a mesh/coordinator outage never
  blocks on-LAN SSH and Ansible stays off the mesh (Security; Recovery & operations).
 - The mesh survives a homelab outage because the coordinator is off-site on `askari`,
-  with its management datastore **intended** to be backed up encrypted off `askari` (not yet built — see the Availability amendment / R8) and peers keeping
+  with its management datastore backed up encrypted off `askari` and peers keeping
  last-known config through a brief coordinator outage (Recovery & operations).
 - Choosing NetBird over plain OPNsense WireGuard, Tailscale, Tailscale+Headscale, an
  on-cluster coordinator, a `ubongo` subnet router, and a standalone IdP gains
@ -126,38 +125,6 @@ allocated for it.
 - Implementation is pending: the role tasks land only once the unbuilt `base` role and
  service-role machinery exist (Status).
 ## Availability — an `askari` outage (amendment 2026-06-20)
 The coordinator is deliberately **single** (one off-site host). Recorded here so its
 availability envelope is explicit; accepted as **R8** (`docs/security/accepted-risks.md`).
 The mesh is **not** a default gateway — `wt0` routes only the overlay CIDR (`100.99.0.0/16`);
 normal traffic uses the host's default route. So an `askari` outage has a **narrow blast
 radius**:
 | Traffic | `askari` down |
 |---|---|
 | LAN device → LAN service (direct / via reverse proxy) | unaffected |
 | node ↔ node over LAN IPs (cluster) | unaffected |
 | node ↔ node same-LAN over mesh IPs | unaffected (direct P2P) |
 | **road-warrior → `ubongo` (remote, relayed)** | **breaks** |
 | mesh control plane (new enrol / ACL change / re-handshake) | pauses |
 Only remote (off-LAN) mesh access to peers is lost, and only when off-LAN **and** `askari`
 is down simultaneously. On-LAN access to `ubongo` never depends on the mesh (Recovery &
 operations, above).
 **Recovery:** rebuild the coordinator (`/setup` + re-enrol peers, M5) or restore from backup
 once ADR-022 lands; the `netbird_coordinator` store backup is the **next sub-project** (its
 gap is named in R8 and `BACKUP.md`). Client/road-warrior break-glass (reliable resolvers +
 the coordinator-FQDN `/etc/hosts` pin) is in `docs/runbooks/netbird-client.md`; managed mesh
 hosts get the same pin via `base__mesh_coordinator_pin`.
 **Not pursued** (deliberately, given the narrow blast radius): direct P2P (punctures the
 default-deny posture; only helps established sessions), a second relay (needs another public
 host / reintroduces the home public surface), a second coordinator (unsupported by
 self-hosted NetBird; against this ADR).
 ## Related
 ADR-007 (network — amended), ADR-015 (control host), ADR-002 (security),
--- a/docs/decisions/021-operational-access.md
+++ b/docs/decisions/021-operational-access.md
@ -3,9 +3,7 @@
 ## Status
 Accepted (2026-06-09). Resolves TODO 7.2 (what to set up on hosts given direct access
-will be rare) and TODO 3.2 (the service admin-API access question). **Amended
+will be rare) and TODO 3.2 (the service admin-API access question).
 2026-06-18:** the on-`ubongo` sudo model for the two local accounts is now settled
 (see §Sudo model on `ubongo` below).
 **Doctrine ADR.** It pins the operational-access doctrine, the declarative `access__*`
 data model, the rendered `ACCESS.md` record, and the `/check-access` verifier. It does
@ -165,36 +163,6 @@ exists and `/check-access` is green (or a deviation is recorded in `accepted-ris
 No scaffold change — same manual-copy-plus-review pattern the sibling records
 (`SECURITY.md`/`VERIFY.md`) use.
 ### Sudo model on `ubongo` (amendment 2026-06-18)
 The original ADR left on-`ubongo` local sudo unspecified. The integration-testing
 harness shakedown settled it:
 | Account | Role | Sudo |
 |---|---|---|
 | `claude` | Automated AI-worker | `NOPASSWD:ALL` via repo-managed drop-in (`base__ai_worker_user`) |
 | `sjat` | Human operator | Password-required sudo via the `sudo` group |
 **Rationale for `claude NOPASSWD`.** No-sudo blocked the AI-worker from diagnosing a
 failed test VM: `virsh`, `virt-install`, `cloud-localds`, `nft`, `journalctl` —
 almost every low-level diagnostic tool — require root. The harness's core value is
 autonomous spin-up → apply → reboot → assert → diagnose; that loop collapses without
 local root access.
 **Compensating controls (R7 in `docs/security/accepted-risks.md`):**
 - `claude`'s password is locked — `NOPASSWD` is the account's *only* sudo path; no
  interactive login is possible.
 - `auditd` + Loki attribution (ADR-018) separates human from agent root actions in the
  audit trail.
 - The drop-in is repo-managed and revocable in one commit + one deploy.
 - Single-operator homelab; everything in git; off-machine backups (ADR-022).
 **`sjat` NOPASSWD removed.** The operator's former `NOPASSWD` drop-in
 (`/etc/sudoers.d/sjat-ansible`, added as an interim measure during M5 NetBird
 enrolment) was removed 2026-06-18. It was redundant once `claude` held sudo, and its
 removal restores least-privilege for the human operator. `sjat` retains full sudo
 capability via the `sudo` group (password required).
 ## Consequences
 - Every host and service has at least one documented, verifiable way in — and a verifier
--- a/docs/decisions/025-local-vm-integration-testing.md
+++ b/docs/decisions/025-local-vm-integration-testing.md
@ -1,180 +0,0 @@
 # ADR-025 — Local VM integration testing on ubongo
 ## Status
 Accepted (2026-06-18). Implements ADR-008 Level 2/3 (deferred for lack of hosts; now
 viable on ubongo). **RED→GREEN acceptance PASSED on real hardware (2026-06-18):** a
 throwaway KVM VM on ubongo reproduced the 2026-06-17 incident (base's nftables forward
 default-deny kills Docker forwarding on reboot) — RED — and survived the reboot once
 the `docker_host` container-forward drop-in was applied — GREEN. Two shakedown
 learnings added below.
 ## Context
 Molecule (ADR-008 Level 1) tests each role in a single Docker container: one
 `converge`, no real kernel netfilter, no real Docker daemon in the loop, and **no
 reboot**. That structurally cannot catch an entire class of bug — reboot-survivability,
 host-firewall × Docker interaction, and boot-ordering — which is exactly the class
 that caused the **2026-06-17 mesh-hardening incident**.
 During that incident, `base`'s nftables `forward { policy drop; }` killed the askari
 Docker host **on reboot**: nftables loaded its default-deny before Docker, breaking
 published-port DNAT and inter-container forwarding. Public services and the mesh went
 down. It had worked right after `make deploy`, when Docker's runtime rules still
 coexisted. `ip_nonlocal_bind` also failed to beat the sshd boot-race, leaving the mesh
 listener absent at boot. Recovery required the Hetzner console and a WAN-SSH
 break-glass. Molecule had passed.
 ADR-008's Level 2/3 was deferred "for lack of hosts." ubongo breaks that deferral:
 > verified: ubongo KVM capability · Bash (2026-06-18 session) · `/dev/kvm` present +
 > accessible (kvm group), Intel VT-x (`vmx`) enabled, 8 vCPU (i3-10100T), ~13 GiB RAM
 > free of 16, ~198 GiB disk free; libvirt/QEMU/Vagrant **not yet installed** ·
 > 2026-06-18.
 ## Decision
 ### 1. Virtualisation approach: libvirt/KVM directly (Approach A)
 A golden Debian-13 genericcloud qcow2 is cached locally on ubongo. Each run boots an
 ephemeral qcow2 **overlay** backed by it (the golden image is never mutated), seeded
 via cloud-init NoCloud, driven by a **stdlib-only** Python driver
 (`scripts/integration-vm.py`) over `virsh` / `virt-install` / `cloud-localds`. No
 `libvirt-python` dependency — the driver stays portable and the role stays lean.
 ### 2. Fidelity envelope
 The bugs are **post-boot**, not in the provisioning path. A lightweight local hypervisor
 is sufficient: real OS, real kernel netfilter, real Docker daemon, real published-port
 DNAT, a **real reboot**, and the coordinator running inside the VM (so the VM forms its
 own one-node mesh, reproducing the circular bootstrap). The Proxmox provisioning chrome
 is not mirrored.
 ### 3. Scope: one throwaway VM at a time, instantiated from real inventory
 The first profile is **"be askari"** — a single box running Docker host + NetBird
 coordinator + mesh peer, mirroring the host whose incident motivates this work. The
 mechanism is generic: swap the profile to "be" any inventory host. Multi-VM topologies
 are a deferred extension.
 ### 4. Acceptance: self-validating against the real failure
 The harness is accepted when it can, on a local VM:
 1. Apply `base` (firewall on, no `docker_host` container-forward drop-in) to a Docker
   host, reboot, and observe the **2026-06-17 breakage** (Docker forwarding dead,
   services down). If the host instead survives the reboot (no breakage observed),
   the harness is not faithful.
 2. Apply the `docker_host` container-forward fix, re-run, and **survive the reboot**.
 ### 5. Tiered cert fidelity via a `--certs` knob
 DNS-01 is what makes real certs possible without public inbound (validation is
 out-of-band via a Gandi TXT record; the VM needs only outbound to ACME + Gandi, which
 the isolated NAT network provides):
 | Tier | Description | Default? |
 |---|---|---|
 | `internal` | Caddy `tls internal` — zero deps, instant. For incident repro and runs where certs are not under test. | Yes |
 | `le-staging` | Real DNS-01 ACME against Let's Encrypt **staging** — real caddy-gandi path, real cert files/renewal, untrusted root, effectively no rate limits. | Built in v1; use when testing the ACME/cert path. |
 | `le-prod-wildcard` | A real trusted `*.test.wingu.me` wildcard, **issued once, persisted on ubongo, reused** across runs. | On-demand only. Accepted risk recorded as R6 in `docs/security/accepted-risks.md`. |
 A deliberate "no-egress" failure scenario (reproducing FRICTION 2026-06-17 #4 —
 `netbird-server` FATAL-loops on GeoLite2 download when egress is lost) forces
 `internal`, since ACME requires egress.
 ### 6. The toolchain is Ansible-managed
 A new non-service role (`integration_test`, `control` group) installs and enables
 libvirt + QEMU + virtinst reproducibly. The driver manages the golden image lazily on
 first run (keeping the role lean; no fiddly download/refresh logic in Ansible). The
 repo owns ubongo's state.
 ### 7. Stubs live in an overlay file, never in the real inventory
 Transient inventory entries for the test VM are generated at runtime as a single-host
 file. Stubs (cert tier, in-VM coordinator endpoint, VM connection details) live in
 `tests/integration/overrides/<host>.yml` — an explicit, reviewable overlay. The real
 inventory is never touched, so `make tf-inventory` and "don't edit inventory directly"
 stay intact.
 ## Consequences
 - **Reconciles ADR-015:** ubongo runs ephemeral KVM test VMs as part of its
  local-test-runner role — it is still not a production hypervisor. A default VM
  (~2 vCPU / 3 GiB / 20 GiB thin overlay) against ~13 GiB free is comfortable; the
  driver enforces **one integration VM at a time** (resource guard, name-prefix
  `boma-it-*`) and refuses to start below a free-RAM threshold.
 - **Operationalises the standing rule:** "firewall/sshd/boot changes must be tested on
  a real VM with a real reboot before they touch a live host" (FRICTION 2026-06-17 #6)
  becomes a concrete, runnable step documented in `docs/runbooks/integration-testing.md`.
 - **Accepted risk R6:** `le-prod-wildcard` runs pass the production Gandi PAT
  (`vault.gandi.pat`) to an ephemeral local VM and write transient `_acme-challenge`
  TXT records into the real `wingu.me` zone. Scope: on-demand only; `internal` is the
  default tier and `le-staging` is the routine choice for exercising the ACME path.
  Compensating controls: ephemeral VM, isolated NAT network, TXT records
  auto-removed by Caddy after validation.
 - **Three safety invariants** make the test tool itself safe:
  1. The transient inventory contains only the test VM — no real host is ever in scope.
  2. "Be askari" points NetBird at the in-VM coordinator — the VM forms its own one-node
     mesh; it never enrols in the real mesh.
  3. Test VMs sit on an isolated libvirt NAT network — outbound NAT for ACME/image pulls
     only, not reachable to the LAN (`10.20.x`) or the real mesh.
 - **Diagnostics on failure** (catching a bug is the point): failure keeps the VM and
  dumps `nft list ruleset`, `docker ps`, `ss -tlnp`, `journalctl -b`,
  `systemd-analyze critical-chain`. `make test-integration-clean` reaps all `boma-it-*`
  orphans. Diagnostics land in gitignored `~/integration-runs/<ts>-<host>/`.
 - **Future pinch:** concurrency with the Level-4 Chromium/Playwright stack (ADR-017)
  competes for ubongo RAM. The resource guard is the v1 answer — one integration VM at a
  time; don't run alongside a heavy Level-4 session. Revisit at `/capacity-review`.
 ## Scope
 **In scope:** reboot-survivability, host-firewall × Docker interaction, boot-ordering,
 cert/ACME paths, mesh bootstrap on one box.
 **Out of scope (v1):** multi-VM mini-cluster (inter-host mesh dataplane); CI gate
 (this is an interactive, agent-driven pre-deploy check; CI stays lint + Molecule per
 ADR-008/010); the Proxmox provisioning path (the bugs live in the boot/kernel/Docker
 layer, not provisioning).
 ## What was ruled out
 | Option | Reason |
 |---|---|
 | **Proxmox VE nested on ubongo** | Highest fidelity including the provisioning step, but heavy (nested virt, RAM), in tension with ADR-015, and the incident bugs do not live in provisioning. |
 | **Vagrant + vagrant-libvirt** | Mature lifecycle/snapshots, but adds the Ruby/Vagrant ecosystem + a fragile plugin; boxes drift from the real Debian cloud image; the reboot→assert sequence still needs custom logic. |
 | **terraform-provider-libvirt** | Declarative and reuses TF, but poor at the imperative apply→reboot→re-apply test sequence; adds throwaway state; blurs ADR-006's "TF owns *production* VM existence on Proxmox" boundary. |
 ## Verified facts (ADR-014)
 - verified: ubongo KVM capability · Bash · `/dev/kvm` present + accessible (kvm group),
  Intel VT-x (`vmx`) enabled, 8 vCPU (i3-10100T), ~13 GiB RAM free of 16, ~198 GiB
  disk free · 2026-06-18.
 ## Shakedown learnings (2026-06-18 live run)
 Two findings from the RED→GREEN acceptance run that affect anyone operating the harness:
 1. **Boot firmware: UEFI required.** The Debian 13 genericcloud image triple-faults
   under legacy BIOS/SeaBIOS and does not reach the kernel. Boot the VM with UEFI
   (`virt-install --boot uefi`; `ovmf` package). The driver does this by default; note
   it here so the requirement is findable.
 2. **`claude` sudo is load-bearing.** VM management (`virsh`, `virt-install`,
   `cloud-localds`) and offline diagnostics (`nft list ruleset`, `journalctl -b`,
   `systemd-analyze critical-chain`) all require root. The harness assumes the
   AI-worker has `NOPASSWD:ALL` sudo on `ubongo` — settled as the ADR-015 amendment
   (2026-06-18) and registered as R7 in `docs/security/accepted-risks.md`. A `claude`
   account without sudo will block the harness at the first `virsh` call.
 The nine full shakedown findings (including the UEFI boot-loop) are in
 `docs/FRICTION.md`.
 ## Related
 - ADR-006 — Terraform owns production VM existence (boundary this ADR respects).
 - ADR-008 — Testing methodology (Levels 1–4); this ADR is the concrete build of Level 2/3.
 - ADR-015 — Control host (ubongo); this ADR reconciles "not a hypervisor" with ephemeral test VMs. **Supersedes** ADR-015's "no local sudo" sub-decision for the AI-worker — the shakedown necessitated `claude` NOPASSWD sudo (ADR-023 §4; access model in ADR-021, risk R7).
 - ADR-016 — Mesh VPN; the "be askari" profile includes the coordinator role.
 - ADR-020 — Firewall strategy; firewall × Docker interaction is what this harness tests.
 - ADR-021 — Operational access; sudo model for `claude` and `sjat` on `ubongo`.
 - ADR-024 — Reverse proxy (Caddy); cert tiers exercise the DNS-01 ACME path.
--- a/docs/hardware/reference.md
+++ b/docs/hardware/reference.md
@ -25,7 +25,7 @@
 - **Storage:** 256 GB SanDisk X600 SATA 2.5" SSD (model SD9TB8W256G1001; TCG Opal-capable, Opal unused — no disk encryption)
 - **NICs:** wired GbE, interface eno1, MAC 88:a4:c2:e0:ee:da
 - **BIOS:** Lenovo M2WKT5AA (2023-06-20)
- **Notes:** always-on; control plane + AI-worker (dedicated `claude` user) + local test runner (Molecule/Docker) per ADR-015; not a Proxmox guest; remote access currently LAN SSH only (mesh deferred). Also runs **one ephemeral KVM integration test VM** (~3 GiB RAM) at a time per ADR-025 — the resource guard enforces one-at-a-time; do not run a test-integration cycle alongside a heavy Level-4 browser session (Chromium/Playwright).
+- **Notes:** always-on; control plane + AI-worker (dedicated `claude` user) + local test runner (Molecule/Docker) per ADR-015; not a Proxmox guest; remote access currently LAN SSH only (mesh deferred)
 ### fisi (backup node — outside the cluster; provisional)
 - **Model / form factor:** HP Elite 600 G9 (tower)
--- a/docs/runbooks/claude-code-setup.md
+++ b/docs/runbooks/claude-code-setup.md
@ -50,13 +50,6 @@ Don't install these until their trigger lands — then add them here and to
 - **The venv-activate hook** — this repo expects the Python `.venv` active for Bash
  commands. If you use the user-level `~/.claude/hooks/activate-venv.sh` pattern,
  replicate it; otherwise `source .venv/bin/activate` per session after `make setup`.
 - **Forgejo registry login (for image pushes)** — `make caddy-image-push` /
  `molecule-image-push` need the Docker daemon authenticated to
  `forgejo.nyumbani.baobab.band`. Run **`make registry-login`** once per machine: it reads
  `vault.forgejo.registry_token` from the vault and does `docker login --password-stdin`
  (no interactive prompt, so an agent can complete a push). The token is operator-minted
  (Forgejo → Settings → Applications → Generate Token, package read+write) and set via
  `make edit-vault`; until then `registry-login` prints how to obtain it. (2026-06-17 kaizen.)
 ## 4. A note on user-level settings
--- a/docs/runbooks/integration-testing.md
+++ b/docs/runbooks/integration-testing.md
@ -1,229 +0,0 @@
 # Runbook — Local VM integration testing
 ## When to use this
 Run a local VM integration test before deploying any change that touches:
 - **nftables / firewall rules** (the `firewall` concern of `base`)
 - **sshd configuration** (listener address, port, key types, `base` hardening)
 - **boot ordering or kernel parameters** (systemd units, sysctl)
 - **Docker host networking** (`docker_host` DNAT rules, published-port forwarding, `daemon.json`)
 These are the change classes that Molecule (ADR-008 Level 1) cannot catch: they require
 a real kernel reboot to surface. This harness is the concrete tool for ADR-008 Level 2/3
 (see ADR-025) and directly operationalises two standing rules:
 - **"Test risky infra before live deploy"** (standing rule, ubongo memory) — firewall/sshd/boot changes must be tested on a real VM with a real reboot before touching a live host.
 - **FRICTION 2026-06-17 #6 — validate reboot-recovery before retiring the break-glass** — the lesson crystallised from the mesh-hardening incident: confirm the host recovers from reboot *while you still have the break-glass open*, not after.
 You do not need this runbook for pure-config changes (template rendering, package lists, user management) — Molecule covers those.
 ---
 ## First-deploy (one-time setup)
 The `integration_test` role installs libvirt + QEMU + virtinst on ubongo and adds the
 operator accounts (`sjat`, `claude`) to the `libvirt` and `kvm` groups.
 ```bash
 make deploy PLAYBOOK=site LIMIT=ubongo TAGS=integration_test
 ```
 **Re-login after this run** — group membership changes do not take effect in the current
 session. The driver (`scripts/integration-vm.py`) requires both `libvirt` and `kvm`
 group membership to create and manage VMs.
 The golden Debian-13 genericcloud qcow2 image is downloaded lazily on the first run
 (one-time cost, ~500 MB); subsequent runs reuse the cached image.
 ---
 ## Running a cycle
 ### Makefile interface (recommended)
 ```bash
 # Full cycle (provision → apply → reboot → assert → teardown on pass)
 make test-integration HOST=askari
 # With a specific cert tier
 make test-integration HOST=askari CERTS=le-staging
 # Keep the VM alive after the run (for manual inspection)
 make test-integration HOST=askari KEEP=1
 # Destroy all orphan integration VMs (name-prefix boma-it-*)
 make test-integration-clean
 ```
 `HOST` is a hostname from the production inventory (the profile
 `tests/integration/profiles/<host>.json` must exist — see "Adding a new profile"
 below). `CERTS` defaults to `internal`.
 ### Lower-level driver
 The driver (`scripts/integration-vm.py`) exposes individual lifecycle steps for manual
 or scripted use:
 | Sub-command | What it does |
 |---|---|
 | `up` | Ensure golden image → create ephemeral overlay → cloud-init seed → boot |
 | `apply` | Run the site playbook against the transient inventory (real apply) |
 | `reboot` | `virsh reboot` + wait for a verified reboot (boot-id change) — the step Molecule cannot do |
 | `assert` | Run `tests/integration/verify.yml` (outcome assertions) |
 | `cycle` | `up` → `apply` → `reboot` → `assert` → `down` (default: destroy on pass) |
 | `down` | Destroy the VM + overlay |
 | `prune` | Destroy all `boma-it-*` VMs + overlays (orphan cleanup) |
 | `console` | Print the VM's captured serial-console log |
 ```bash
 # Example: step through manually
 python3 scripts/integration-vm.py up --host askari
 python3 scripts/integration-vm.py apply --host askari
 python3 scripts/integration-vm.py reboot --host askari
 python3 scripts/integration-vm.py assert --host askari
 python3 scripts/integration-vm.py down --host askari
 ```
 ---
 ## Cert tiers
 | Tier | Flag | Use when |
 |---|---|---|
 | `internal` | `CERTS=internal` (default) | Incident repro, firewall/sshd/boot changes where certs are not under test. Zero deps, instant. |
 | `le-staging` | `CERTS=le-staging` | Testing the Caddy DNS-01 ACME path, cert renewal logic, or the `caddy-gandi` plugin. Real cert files, untrusted root, effectively no rate limits. Requires `vault.gandi.pat`. |
 | `le-prod-wildcard` | `CERTS=le-prod-wildcard` | Verifying TLS behaviour with a real trusted cert. On-demand only — accepted risk R6 (`docs/security/accepted-risks.md`): the production Gandi PAT reaches an ephemeral VM and transient TXT records are written into the real `wingu.me` zone. |
 > A deliberate "no-egress" scenario (reproducing FRICTION 2026-06-17 #4 — the
 > `netbird-server` GeoLite2 FATAL-loop when NAT masquerade is wiped) **must** use
 > `CERTS=internal`: the egress loss is the fault being simulated, and ACME requires egress.
 ---
 ## Diagnostics and inspecting a failed VM
 ### Where diagnostics land
 Diagnostics from every run are captured in:
 ```
 ~/integration-runs/<timestamp>-<host>/
 ```
 This directory is gitignored. On a failed assert step, the driver dumps:
 - `nft list ruleset` — the live nftables state at failure
 - `docker ps -a` — container states
 - `ss -tlnp` — listening sockets
 - `journalctl -b` — full boot log
 - `systemd-analyze critical-chain` — boot timing
 - Serial console capture (on boot/SSH failure — the automated equivalent of the Hetzner
  console, addressing FRICTION 2026-06-17 #5)
 The agent reads these directly from `~/integration-runs/` — no manual download needed.
 ### Inspecting a kept or failed VM
 When a run fails or when `KEEP=1` is passed, the VM is left running. Connect to it:
 ```bash
 # Serial console (no SSH needed — useful when SSH is the fault)
 python3 scripts/integration-vm.py console --host askari
 # or directly:
 virsh console boma-it-askari
 # Exit with Ctrl-]
 # SSH (as the ansible user, IP from virsh)
 virsh domifaddr boma-it-askari --source lease
 ssh ansible@<IP>
 # List all integration VMs
 virsh list --all | grep boma-it-
 ```
 ### Cleanup
 ```bash
 # Destroy a specific VM
 python3 scripts/integration-vm.py down --host askari
 # Reap all orphans
 make test-integration-clean
 # or:
 python3 scripts/integration-vm.py prune
 ```
 ---
 ## Safety invariants
 These make the test tool itself safe — the harness cannot reach or modify production:
 1. **Single-host transient inventory** — the playbook apply runs against a generated
   single-host inventory (`ansible_host=<VM lease IP>`). No real host is ever in scope.
 2. **In-VM coordinator only** — "be askari" points NetBird at the coordinator running
   inside the VM itself (localhost endpoint). The VM forms its own one-node mesh; it
   never enrols in the real NetBird mesh.
 3. **Isolated NAT network** — test VMs sit on a dedicated libvirt NAT network.
   Outbound NAT provides ACME/image-pull access, but the VM is not reachable from
   the LAN (`10.20.x`) or the real mesh.
 ---
 ## Resource constraints
 The default VM profile is ~2 vCPU / 3 GiB RAM / 20 GiB thin-provisioned overlay. The
 driver enforces **one integration VM at a time** (refusing to start if another
 `boma-it-*` VM is already running) and refuses to start below the free-RAM threshold
 (~13 GiB available on ubongo at baseline, per ADR-025).
 **Do not run a test-integration cycle alongside a Level-4 browser session**
 (Chromium/Playwright, ADR-017) — both compete for ubongo RAM. The resource guard is the
 enforcement mechanism, not a suggestion.
 ---
 ## Adding a new profile
 To make the harness "be" a different host:
 1. Create `tests/integration/profiles/<hostname>.json` — specifies which roles to apply
   and base VM sizing for that host.
 2. Create `tests/integration/overrides/<hostname>.yml` — the explicit stub overlay:
   cert tier, in-VM coordinator endpoint (if the host runs the coordinator),
   `ansible_host` placeholder, and any other variables that must differ from the real
   inventory (e.g. public DNS → local resolution, geo-DB disable for coordinator).
 3. Add assertions to `tests/integration/verify.yml` (or extend an existing task with a
   `when: inventory_hostname == '<hostname>'` guard) for any host-specific outcomes.
 4. Run `make test-integration HOST=<hostname>` to validate the new profile.
 All stubs must be explicit in the overlay — the real inventory is never edited.
 ---
 ## Reproducing the 2026-06-17 incident
 The acceptance test for the harness (ADR-025) deliberately reproduces the incident:
 1. Run with today's `base` (firewall on, no `docker_host` container-forward drop-in):
   ```bash
   make test-integration HOST=askari CERTS=internal
   ```
   The assert step **must FAIL** after reboot (Docker forwarding dead, published ports
   unreachable). If it passes, the harness is not faithful.
 2. Implement the `docker_host` container-forward rules (FRICTION 2026-06-17 #1 fix) and
   re-run. The assert step **must PASS** across the reboot.
 This round-trip proves: (a) the harness faithfully reproduces the incident, and (b) the
 fix survives a real reboot.
 ---
 ## Related
 - ADR-025 — decision record for this harness (approach, cert tiers, safety invariants)
 - ADR-008 — testing methodology; this is Level 2/3
 - `docs/security/accepted-risks.md` R6 — `le-prod-wildcard` accepted risk
 - `docs/FRICTION.md` — 2026-06-17 signals that motivated this runbook
--- a/docs/runbooks/netbird-client.md
+++ b/docs/runbooks/netbird-client.md
@ -1,144 +0,0 @@
 # Runbook — Enrolling a NetBird client (road-warrior device)
 Joins a **client/road-warrior device** (laptop, desktop, phone) to the boma NetBird mesh
 so it can reach `ubongo` and other peers from anywhere. The self-hosted coordinator is on
 `askari` (ADR-016, M4b); enrollment lands a device on the `100.64.0.0/10` overlay.
 > **Hosts vs clients.** Managed **Linux hosts** join via the `base` role's `mesh` concern
 > (`base__mesh_enabled: true` + the reusable key in `vault.netbird.setup_key`) — see
 > ADR-016 / the `base` README, *not* this runbook. This runbook is for **user devices**
 > that are not managed with Ansible.
 verified: NetBird client install + self-hosted `--management-url` flow · docs.netbird.io
 (`/get-started/install/windows`, `/get-started/cli`) · 2026-06-17
 ## Prerequisites
 - The coordinator's first-boot `/setup` admin exists and you can log in at
  `https://netbird.askari.wingu.me`.
 - **Auth, pick one:**
  - **SSO** (recommended for a personal device) — your dashboard account; no secret to copy.
  - **Setup key** — dashboard → **Settings → Setup Keys** → a reusable key (mint a
    client-specific one for clean ACL grouping, or reuse the existing reusable key).
 - Local **admin rights** on the device (the client installs a service).
 - **Coordinator facts:** management URL `https://netbird.askari.wingu.me`; `ubongo`
  = `100.99.146.14` (`ubongo.netbird.selfhosted`); `askari` = `100.99.226.39`.
 ---
 ## Part A — Windows 11
 1. **Install:** download + run the MSI **https://pkgs.netbird.io/windows/msi/x64**
   (official x64 client; installs the tray app + the `netbird` service).
 2. **Connect** from an **elevated** Windows Terminal / PowerShell ("Run as administrator"):
   ```powershell
   netbird up --management-url https://netbird.askari.wingu.me
   ```
   A browser opens — sign in with your dashboard account. (If SSO won't open a browser,
   use a setup key: `netbird up --setup-key <KEY> --management-url https://netbird.askari.wingu.me`.)
 3. Proceed to **Part C** (verify).
 ---
 ## Part B — Other platforms (same management URL)
 - **macOS / Linux desktop:** install the client (macOS: NetBird app / Homebrew; Linux:
  `pkgs.netbird.io` per the distro — same apt/rpm flow as `base`'s `mesh` concern), then
  `netbird up --management-url https://netbird.askari.wingu.me` (Linux: prefix `sudo`).
 - **Android / iOS:** install the **NetBird** app, then in **Settings → Advanced /
  Server** set the management server to `https://netbird.askari.wingu.me` **before**
  logging in; connect and complete the SSO login. (Setup keys are supported in-app too.)
 ---
 ## Part C — Verify + use
 ```sh
 netbird status        # expect: Management: Connected, Signal: Connected, a 100.x NetBird IP
 netbird status -d     # peer detail — ubongo (100.99.146.14) + askari (100.99.226.39) listed
 ```
 Reach `ubongo` over the mesh:
 ```sh
 ssh sjat@100.99.146.14        # or: ssh sjat@ubongo.netbird.selfhosted
 ```
 **SSH auth is separate from the mesh:** `ubongo` is key-only (passwords disabled), so the
 device needs an SSH key authorised for `sjat@ubongo`. The mesh provides the network path;
 the SSH key provides auth.
 ---
 ## Troubleshooting — mesh drops / SSH to `ubongo` times out
 Symptom: SSH to `ubongo` (or any peer) times out for minutes and recovers on its own;
 `netbird status` shows **Management/Signal: Disconnected** or peers stuck **Connecting**.
 verified: client DNS/relay behaviour + NRPT scope read from a 0.72.4 debug bundle;
 mitigations per docs.netbird.io (`/manage/dns/troubleshooting`,
 `/help/troubleshooting-client`) · 2026-06-18
 **1. Triage — is it your device or the coordinator?** On the device:
 ```sh
 netbird status -d                     # Management/Signal Connected? peers P2P/Relayed?
 nslookup netbird.askari.wingu.me      # coordinator FQDN
 nslookup pkgs.netbird.io              # a PUBLIC name — control test
 ```
 If the relay/handshake errors say `lookup netbird.askari.wingu.me: no such host` **and**
 a *public* name (`pkgs.netbird.io`) also fails to resolve, your **local resolver is
 dead** — the coordinator and `ubongo` are almost certainly fine. NetBird only manages
 `*.netbird.selfhosted` resolution (a single NRPT rule), so it is **not** the cause.
 Confirm from the other side if you can: the dashboard shows peer *last-seen*; `askari`/
 `ubongo` staying green ⇒ the fault is your device's network.
 **Why it cascades:** NetBird re-resolves the coordinator FQDN on every reconnect. A
 network transition (Wi-Fi ↔ phone hotspot, sleep/wake) that briefly kills DNS means it
 can't reach management/signal/relay — and since `ubongo` is **relay-only** (below), there
 is no direct path to fall back to, so SSH dies until DNS recovers.
 **2. Make the device resilient:**
 - **Reliable resolvers** — set the device's DNS to public resolvers (`1.1.1.1`, `8.8.8.8`)
  rather than a network-handed or homelab-internal resolver that's unreachable off-LAN.
  Windows: inspect with `Get-DnsClientServerAddress`.
 - **Pin the coordinator** so a DNS hiccup can't strand the client — add to the hosts file
  (`C:\Windows\System32\drivers\etc\hosts` as admin, or `/etc/hosts`):
  ```
  77.42.120.136  netbird.askari.wingu.me
  ```
  `askari`'s stable WAN IP; TLS still validates on the hostname. Removes the multi-minute
  reconnect deadlocks.
 **3. Break-glass — reach `ubongo` without the mesh.** When the mesh is down you still need
 a way in. On the home LAN, go straight to `ubongo`'s wired address (bypasses the mesh and
 coordinator DNS entirely):
 ```sh
 ssh sjat@10.20.10.151        # ubongo eno1 (LAN) — verify this works from your device NOW
 ```
 > ⚠️ This works **today** only because SSH on `ubongo` is not yet restricted to `wt0`.
 > When the deferred mesh-hardening lands (SSH only on `wt0`), this path closes
 > unless a break-glass SSH rule is added to the firewall catalog. That hardening **must**
 > keep a non-mesh break-glass (catalog SSH rule from a trusted LAN/admin source) — else a
 > DNS/mesh outage = full lockout. (ADR-021 break-glass.)
 **Why `ubongo` is relay-only (and P2P is not the fix).** Peers connect to `ubongo` as
 `Relayed`, never `P2P`: its `nftables` default-deny drops the inbound UDP that ICE
 hole-punching needs (egress is open, so STUN itself succeeds). This is the **intended
 current posture** — P2P / NAT-traversal is the *deferred mesh-hardening* (ADR-016/020,
 STATUS.md). Enabling it needs a firewall-catalog UDP entry **plus** an `accepted-risks.md`
 deviation or ADR amendment, and OPNsense NAT work — and it would **not** have prevented a
 DNS-driven outage (a re-handshake still needs signal, which needs DNS). Tracked as future
 hardening, not a quick fix.
 ---
 ## Notes
 - **Split-tunnel:** NetBird routes only the `100.x` overlay by default — normal/work
  networking is unaffected.
 - **Persistence:** the service auto-starts on boot and reconnects; the tray app has
  Connect/Disconnect; CLI `netbird down` / `netbird up` (no flags after first setup).
 - **Troubleshooting** — *"failed while getting Management Service public key"* / won't
  register: confirm `https://netbird.askari.wingu.me` loads in a browser from the device
  (DNS + TLS + the gRPC routing through Caddy are reachable), the URL is exact, and the
  terminal is elevated. For peers stuck Disconnected/Connecting or SSH-to-`ubongo`
  timeouts that recover on their own, see **Troubleshooting — mesh drops** above.
 - **Removing a device:** `netbird down` then uninstall; revoke its peer in the dashboard
  (and the setup key if one-off).
--- a/docs/runbooks/new-host.md
+++ b/docs/runbooks/new-host.md
@ -109,13 +109,6 @@ make check PLAYBOOK=site
 # Should report no changes
 ```
 > **Pre-flight before lockout-risky changes (firewall / sshd / boot):** before applying
 > any change that touches nftables rules, SSH configuration, or boot ordering, run
 > `make test-integration HOST=<name>` and confirm reboot-recovery on the local VM
 > **while the break-glass (Proxmox console / Hetzner console) is still open**. Do not
 > retire the break-glass until the integration test passes. See
 > `docs/runbooks/integration-testing.md` and ADR-025.
 ---
 ## Part E — Control node (`ubongo`, manual exception)
--- a/docs/runbooks/new-role.md
+++ b/docs/runbooks/new-role.md
@ -114,20 +114,7 @@ reason and gets no `BACKUP.md`. Once the backup node exists, `/check-backup <rol
 proves the declared state is captured — part of the service-clearance gate
 (`docs/security/service-checklist.md`).
-### 13. Pre-flight for lockout-risky roles
+### 13. Commit
 If the new role touches nftables rules, SSH configuration, or boot ordering, run a
 local VM integration test and confirm reboot-recovery **before** deploying to a live
 host and while the host's break-glass (Proxmox console / Hetzner console) is still
 open:
 ```bash
 make test-integration HOST=<target-host>
 ```
 See `docs/runbooks/integration-testing.md` and ADR-025.
 ### 14. Commit
 ```bash
 git checkout -b role/<rolename>
--- a/docs/security/accepted-risks.md
+++ b/docs/security/accepted-risks.md
@ -18,11 +18,8 @@ revisit (trigger).
 | R3 | **Self-hosted mesh control plane is a public target on `askari`** — the NetBird coordinator (ADR-016) exposes a management API + dashboard (TCP 80/443) and STUN (UDP 3478) on `askari`'s public IP; the management API controls the whole mesh (NetBird v0.72.4 embeds STUN in the combined server — no separate Coturn) | Self-hosting means **no third-party trust** and an off-site control plane that survives a homelab outage (boma's sovereignty ethos). Residual surface is on `askari` (already a public VPS) and is mitigated: TLS + embedded-IdP login, source-IP restriction where practical, `base` hardening, version-pinned NetBird (ADR-011) patched on boma's cadence | A coordinator compromise or unpatched NetBird CVE; the management plane is reachable without auth/IP-limits; the operational burden makes a hosted coordinator worth reconsidering |
 | R4 | **No cryptographic WORM for logs** — shipped logs are append-only via Loki's push API and copied off-site to `askari` (ADR-018), but the stored chunks are not object-locked/immutable; a root-on-`askari` attacker could edit history | Append-only push + off-site copy already defeats the realistic threat (a host attacker covering tracks survives even full-cluster compromise). True WORM (object-lock) is forensic-grade cost for boma's opportunistic threat model (R1) | Threat model shifts toward targeted/forensic; a regulatory/evidentiary need appears; `askari` itself is assessed as a likely target |
 | R5 | **No disk encryption on `ubongo`** — the control node's SSD (SanDisk X600 256 GB, TCG-Opal-capable but Opal unused) is unencrypted at rest, so it holds recovery-critical secrets in plaintext: the Ansible Vault password's `rbw` local cache and (future) Terraform state. Physical theft of the box would expose them | `ubongo` is always-on in a physically controlled location; compensating controls are a **BIOS supervisor password** and **disabled external/USB + PXE boot** (an attacker cannot trivially boot another OS to read the disk), and the offline-recoverable design means the irreducible root secret (Vaultwarden master password) is never stored on the box anyway. Full-disk encryption was weighed against the always-on/unattended-reboot requirement (LUKS+TPM auto-unlock or passphrase) and deferred for simplicity at this trust level | `ubongo` is relocated to a less-trusted physical location; the box starts holding additional high-value secrets; or a reinstall onto LUKS (TPM-sealed) is undertaken |
 | R6 | **`le-prod-wildcard` integration runs** — when `CERTS=le-prod-wildcard` is passed to `make test-integration`, the production Gandi PAT (`vault.gandi.pat`) is passed to an ephemeral local test VM via the var overlay, and transient `_acme-challenge` TXT records are written into the real `wingu.me` DNS zone to satisfy the Let's Encrypt DNS-01 challenge. A compromised or long-lived test VM could exfiltrate the PAT; the real zone is briefly (seconds) modified | Scope is **on-demand only** — `internal` is the default cert tier (`CERTS=internal`; `le-staging` is for ACME-path testing); `le-prod-wildcard` is an explicit opt-in. Compensating controls: the VM is ephemeral and destroyed on success; it sits on an isolated libvirt NAT network (no LAN/mesh access); TXT records are auto-removed by Caddy immediately after validation; the PAT is not persisted inside the VM after the run. ADR-025 documents the cert-tier design and the three isolation invariants | The PAT is exfiltrated from a test VM; the `wingu.me` zone shows unexpected records; a `CERTS=le-prod-wildcard` run must be audited or the tier must be revoked |
 | R7 | **`claude` AI-worker has `NOPASSWD:ALL` sudo on `ubongo`** — the automated AI-worker account can execute any command as root on the control node without a password prompt. A compromised or misbehaving agent session could make arbitrary root-level changes to ubongo | The account is **password-locked** (no interactive `claude` login; `NOPASSWD` sudo is the account's only escalation path, so there is no "su to claude + sudo" attack). `auditd` + Loki attribution (ADR-018) logs every `sudo` invocation with the originating user. The drop-in (`/etc/sudoers.d/claude-ai-worker`) is repo-managed via `base__ai_worker_user` — revocable in one commit + one deploy. Single-operator homelab; all changes in git; off-machine backups (ADR-022). Full rationale: ADR-015 amendment (2026-06-18) + ADR-021 §Sudo model. | The AI-worker executes a destructive action that cannot be rolled back via git; the account key is compromised; the threat model shifts toward targeted remote attackers |
 | R8 | **Single off-site mesh coordinator is an availability SPOF for remote mesh access** — `askari` hosts the only NetBird management/signal/relay (ADR-016); while askari is down, every *relayed* peer (all of `ubongo`'s, by the deliberate default-deny posture) loses remote mesh reachability and the control plane pauses. The `netbird_coordinator` store also has **no off-site backup yet** (BACKUP.md), so an askari loss loses mesh control-plane state until rebuilt | Inherent to ADR-016's deliberate single off-site coordinator (sovereignty; survives a homelab outage). **Narrow blast radius:** the mesh is not a gateway (`wt0` routes only `100.99.0.0/16`) — LAN, intra-cluster, and local-service traffic are unaffected; only remote/off-LAN mesh access breaks, and only when off-LAN *and* askari is down at once. askari is a reliable always-on VPS; mitigations: client + managed-host coordinator-FQDN DNS pin (`base__mesh_coordinator_pin`; runbook), documented `/setup` rebuild | askari proves unreliable; the cluster grows to depend on the mesh for intra-node traffic; remote mesh access becomes business-critical; or the ADR-022 backup role lands (closes the state-loss half) |
-_Last reviewed: 2026-06-20. The prior gaps (full CIS hardening, SELinux/AppArmor,
+_Last reviewed: 2026-06-11. The prior gaps (full CIS hardening, SELinux/AppArmor,
 IDS) were re-challenged and **adopted rather than accepted**: CIS Debian L1+L2 + CIS
 Docker, AppArmor (enforce), AIDE file-integrity, and Suricata network IDS are now
 part of the security strategy (ADR-002). See STATUS.md / `docs/TODO.md` for build
--- a/docs/superpowers/plans/2026-06-17-mesh-hardening-askari-ssh-wt0.md
+++ b/docs/superpowers/plans/2026-06-17-mesh-hardening-askari-ssh-wt0.md
@ -1,466 +0,0 @@
 # Mesh-hardening 1/3 — askari SSH onto wt0 — Implementation Plan
 > **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
 **Goal:** Make askari's SSH reachable only over the NetBird mesh (`wt0`) and close the WAN `:22` surface at both the host nftables layer and the Hetzner Cloud Firewall, without dropping askari's public services.
 **Architecture:** Three enforcement layers — (1) sshd `ListenAddress` bound to the live `wt0` IP (fail-closed, `ip_nonlocal_bind` to beat the post-boot bind race); (2) the base role's catalog-driven nftables default-deny (SSH already restricted to `wt0` via `base__firewall_mgmt_interface`; add a `public` zone + askari service entries so 80/443/3478 survive); (3) Terraform drops the Hetzner Cloud Firewall WAN `:22` rule. Tasks 1–4 are code (subagent-driven, each Molecule/lint/plan-verified). Task 5 is the live, operator-supervised cutover on the real host.
 **Tech Stack:** Ansible (role `base`, FQCN), nftables, Molecule on Debian 13, `ansible.posix.sysctl`, pytest (filter unit tests), Terraform (`hcloud` provider).
 **Spec:** `docs/superpowers/specs/2026-06-17-mesh-hardening-askari-ssh-wt0-design.md`
 **Conventions:** `make lint` and `make test ROLE=base` before each commit; `make check` before `make deploy`; `make tf-plan` before `make tf-apply`; never hand-edit the generated `offsite.yml`; rbw unlocked for commits touching ansible content.
 ---
 ### Task 1: base role — sshd `ListenAddress` on wt0 + `ip_nonlocal_bind` (fail-closed)
 **Files:**
 - Modify: `roles/base/defaults/main.yml`
 - Modify: `roles/base/tasks/ssh.yml`
 - Modify: `roles/base/templates/sshd_hardening.conf.j2`
 - Modify: `roles/base/molecule/default/converge.yml` (fixture)
 - Modify: `roles/base/molecule/default/verify.yml` (assertions = the test)
 - [ ] **Step 1: Write the failing test (extend Molecule verify)**
 In `roles/base/molecule/default/verify.yml`, add these tasks after the existing "Sshd drop-in present and config valid" block:
 ```yaml
    - name: ListenAddress bound to the fixture mesh IP (mesh-only mode)
      ansible.builtin.command: grep -q '^ListenAddress 100.99.0.1$' /etc/ssh/sshd_config.d/10-boma.conf
      changed_when: false
    - name: ip_nonlocal_bind sysctl drop-in is present
      ansible.builtin.command: grep -q '^net.ipv4.ip_nonlocal_bind = 1' /etc/sysctl.d/60-boma-nonlocal-bind.conf
      changed_when: false
    - name: ip_nonlocal_bind is live in this netns
      ansible.builtin.command: sysctl -n net.ipv4.ip_nonlocal_bind
      register: _nonlocal
      changed_when: false
      failed_when: _nonlocal.stdout | trim != '1'
 ```
 - [ ] **Step 2: Add the fixture that drives it (Molecule converge)**
 In `roles/base/molecule/default/converge.yml`, add to the `vars:` block (alongside the existing `base__mesh_*`):
 ```yaml
    base__ssh_listen_mesh_only: true
    base__ssh_listen_addr: "100.99.0.1"   # fixture mesh IP (no wt0 in the container)
 ```
 - [ ] **Step 3: Run the test to verify it fails**
 Run: `make test ROLE=base`
 Expected: FAIL — converge errors or verify fails (`ListenAddress` not rendered; sysctl drop-in absent), because the feature isn't implemented yet.
 - [ ] **Step 4: Add the defaults**
 In `roles/base/defaults/main.yml`, after the `base__ssh_authorised_keys: []` line (end of the hardening block), add:
 ```yaml
 # SSH listen-on-mesh (mesh-hardening 1/3, ADR-016/021). Opt-in: when true, sshd binds
 # ListenAddress to this host's mesh IP only (not the WAN). The IP comes from the live wt0
 # fact (ansible_facts.wt0.ipv4.address); base__ssh_listen_addr overrides it. ip_nonlocal_bind
 # lets sshd bind the mesh IP before wt0 exists at boot. Fails closed: the play asserts a
 # non-empty address rather than silently listening on all interfaces.
 base__ssh_listen_mesh_only: false
 base__ssh_listen_addr: ""
 ```
 - [ ] **Step 5: Resolve + assert + sysctl in `ssh.yml`**
 In `roles/base/tasks/ssh.yml`, insert these tasks at the TOP of the file (before "Ensure openssh-server is installed"):
 ```yaml
 - name: Resolve the sshd mesh listen address (override, else live wt0 fact)
  ansible.builtin.set_fact:
    base__ssh_listen_addr_resolved: >-
      {{ base__ssh_listen_addr
         or ansible_facts.get('wt0', {}).get('ipv4', {}).get('address', '') }}
  when: base__ssh_listen_mesh_only | bool
 - name: Fail closed — refuse to render sshd without a known mesh address
  ansible.builtin.assert:
    that:
      - base__ssh_listen_addr_resolved | length > 0
    fail_msg: >-
      base__ssh_listen_mesh_only is true but no mesh address resolved (set
      base__ssh_listen_addr or ensure wt0 is up so its fact is gathered). Refusing to
      render sshd ListenAddress empty (which would listen on ALL interfaces).
  when: base__ssh_listen_mesh_only | bool
 - name: Allow sshd to bind the mesh IP before wt0 exists at boot
  ansible.posix.sysctl:
    name: net.ipv4.ip_nonlocal_bind
    value: "1"
    sysctl_set: true
    state: present
    reload: true
    sysctl_file: /etc/sysctl.d/60-boma-nonlocal-bind.conf
  when: base__ssh_listen_mesh_only | bool
 ```
 - [ ] **Step 6: Render the conditional `ListenAddress`**
 In `roles/base/templates/sshd_hardening.conf.j2`, append after the existing `KbdInteractiveAuthentication no` line:
 ```jinja
 {% if base__ssh_listen_mesh_only | bool %}
 ListenAddress {{ base__ssh_listen_addr_resolved }}
 {% endif %}
 ```
 - [ ] **Step 7: Run the test to verify it passes**
 Run: `make test ROLE=base`
 Expected: PASS — converge succeeds; verify confirms `ListenAddress 100.99.0.1`, the sysctl drop-in, and the live value `1`.
 > **Checkpoint (environmental):** if `make test` fails on the sysctl task because the Molecule container can't write `net.ipv4.ip_nonlocal_bind`, add `sysctls: {net.ipv4.ip_nonlocal_bind: "0"}` to the platform in `roles/base/molecule/default/molecule.yml` (pre-creates the namespaced sysctl so the task can set it), then re-run. Note the change in the commit.
 - [ ] **Step 8: Lint**
 Run: `make lint`
 Expected: `Passed: 0 failure(s)` and `check-tags: OK`.
 - [ ] **Step 9: Commit**
 ```bash
 git add roles/base/defaults/main.yml roles/base/tasks/ssh.yml \
        roles/base/templates/sshd_hardening.conf.j2 \
        roles/base/molecule/default/converge.yml roles/base/molecule/default/verify.yml
 git commit -m "feat(base): opt-in sshd ListenAddress on the mesh IP (fail-closed)
 base__ssh_listen_mesh_only binds sshd to the live wt0 IP only, with
 ip_nonlocal_bind to beat the post-boot bind race and a fail-closed assert so an
 unresolved address never silently listens on all interfaces. Molecule covers
 the render + sysctl. Mesh-hardening 1/3 (ADR-016/021).
 Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
 ```
 ---
 ### Task 2: firewall catalog — `public` zone + askari's public services
 **Files:**
 - Modify: `inventories/production/group_vars/all/firewall.yml`
 - Modify: `roles/base/molecule/default/converge.yml` (fixture: public-zone rule)
 - Modify: `roles/base/molecule/default/verify.yml` (assert the 0.0.0.0/0 rule)
 - Test: `tests/test_firewall_rules.py` (unit: a `public` zone resolves to `0.0.0.0/0`)
 Rationale: `base__firewall_mgmt_interface` already accepts `:22` on `wt0`. The gap is that the catalog is empty and has no "anywhere" source, so applying default-deny to askari would drop 80/443/3478. We add a `public` zone (`0.0.0.0/0`) and askari's service ingress.
 - [ ] **Step 1: Write the failing unit test**
 In `tests/test_firewall_rules.py`, add:
 ```python
 def test_public_zone_resolves_to_anywhere():
    catalog = {"web": {"host": "askari",
                       "ingress": [{"from": "public", "port": 443, "proto": "tcp"}]}}
    zones = {"public": "0.0.0.0/0"}
    rules = rs.resolve_firewall_rules(catalog, zones, "askari",
                                      {"askari": {"ansible_host": "100.99.226.39"}}, {})
    assert rules == [{"proto": "tcp", "port": 443, "sources": ["0.0.0.0/0"]}]
 ```
 (Module is loaded by the existing importlib shim at the top of the test file as `rs`. If the filter is imported under a different alias there, match it.)
 - [ ] **Step 2: Run it to verify it fails (or passes trivially)**
 Run: `.venv/bin/python -m pytest tests/test_firewall_rules.py -q`
 Expected: this test PASSES immediately if the filter already resolves arbitrary zones (it does — `_resolve_source` treats any `zones` key generically). That is fine: the unit test documents/locks the `public`-zone contract. If it fails, fix the filter. Either way it must end green.
 - [ ] **Step 3: Add the Molecule fixture (public-zone rule)**
 In `roles/base/molecule/default/converge.yml`, under `firewall_zones:` add `public: 0.0.0.0/0`, and under `firewall_catalog:` add:
 ```yaml
      netbird_stun:
        host: instance
        ingress:
          - { from: public, port: 3478, proto: udp }
 ```
 - [ ] **Step 4: Add the Molecule assertion (the test)**
 In `roles/base/molecule/default/verify.yml`, after the photoprism assertion block, add:
 ```yaml
    - name: Assert the public->stun:3478/udp ingress rule (0.0.0.0/0 source)
      ansible.builtin.assert:
        that:
          - "'0.0.0.0/0' in nft"
          - "'udp dport 3478 accept' in nft"
        fail_msg: "missing public->3478/udp rule for netbird_stun"
 ```
 - [ ] **Step 5: Run the tests**
 Run: `make test ROLE=base` then `.venv/bin/python -m pytest tests/test_firewall_rules.py -q`
 Expected: both PASS (the rendered ruleset now contains the `0.0.0.0/0 ... udp dport 3478 accept` rule).
 - [ ] **Step 6: Populate the real catalog**
 In `inventories/production/group_vars/all/firewall.yml`, replace the `firewall_zones`/`firewall_catalog` blocks with:
 ```yaml
 # Zone → subnet (from ADR-007). `public` = the WAN (anywhere) for deliberately public
 # off-site services (askari); home/cluster services use the internal zones only.
 firewall_zones:
  mgmt: 10.10.0.0/24
  srv: 10.20.0.0/24
  lan: 10.30.0.0/24
  iot: 10.40.0.0/24
  guest: 10.50.0.0/24
  public: 0.0.0.0/0
 # Service catalog: <name> → placement (host | group | hosts) + ingress[].
 # askari's public surface (ADR-024 Caddy + ADR-016 NetBird STUN). NOTE: the host
 # nftables template renders IPv4 source rules only; askari is reached via its A record
 # (no AAAA), so IPv4-only public rules are sufficient (see the spec's IPv6 note).
 firewall_catalog:
  reverse_proxy:
    host: askari
    ingress:
      - { from: public, port: 80, proto: tcp }
      - { from: public, port: 443, proto: tcp }
  netbird_stun:
    host: askari
    ingress:
      - { from: public, port: 3478, proto: udp }
 ```
 - [ ] **Step 7: Lint**
 Run: `make lint`
 Expected: clean pass (`check-tags: OK`).
 - [ ] **Step 8: Commit**
 ```bash
 git add inventories/production/group_vars/all/firewall.yml \
        roles/base/molecule/default/converge.yml roles/base/molecule/default/verify.yml \
        tests/test_firewall_rules.py
 git commit -m "feat(firewall): public zone + askari's public services in the catalog
 Adds a public (0.0.0.0/0) zone and askari's Caddy (80/443) + NetBird STUN
 (3478/udp) ingress so the base nftables default-deny does not drop the live
 public services when applied to askari. Molecule + filter unit test cover the
 public-zone rendering. Mesh-hardening 1/3 (ADR-020/024/016).
 Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
 ```
 ---
 ### Task 3: inventory — point Ansible at wt0 + enable mesh-only SSH on askari
 **Files:**
 - Create: `inventories/production/host_vars/askari.yml`
 - Modify: `inventories/production/group_vars/offsite_hosts/vars.yml`
 - [ ] **Step 1: Create the host_var override**
 Create `inventories/production/host_vars/askari.yml`:
 ```yaml
 ---
 # Manage askari over the NetBird mesh (wt0), not its WAN IP. This OVERRIDES the
 # TF-generated inventories/production/offsite.yml (ansible_host = 77.42.120.136); host_vars
 # outrank the generated inventory and are NOT touched by `make tf-inventory-offsite`.
 # Mesh-hardening 1/3 — once SSH is wt0-only, the WAN IP is no longer reachable for SSH.
 ansible_host: 100.99.226.39   # askari's wt0 address (NetBird, M5)
 ```
 - [ ] **Step 2: Enable mesh-only SSH for offsite hosts**
 In `inventories/production/group_vars/offsite_hosts/vars.yml`, replace the file body with:
 ```yaml
 ---
 # Off-site hosts (askari). askari runs the NetBird coordinator AND is a mesh peer
 # (ADR-016, M5). Mesh-hardening 1/3 (2026-06-17): SSH is moved onto wt0 — sshd binds the
 # mesh IP only (base__ssh_listen_mesh_only) and the base nftables default-deny applies
 # (base__firewall_apply defaults true; SSH allowed on wt0 via base__firewall_mgmt_interface,
 # public services via the catalog). base__mesh_enabled stays true (precondition from M5).
 base__mesh_enabled: true
 base__ssh_listen_mesh_only: true
 ```
 - [ ] **Step 3: Verify the override resolves**
 Run: `.venv/bin/ansible-inventory -i inventories/production/ --host askari 2>/dev/null | grep ansible_host`
 Expected: `"ansible_host": "100.99.226.39"` (the host_var wins over the generated `offsite.yml`).
 - [ ] **Step 4: Lint**
 Run: `make lint`
 Expected: clean pass.
 - [ ] **Step 5: Commit**
 ```bash
 git add inventories/production/host_vars/askari.yml \
        inventories/production/group_vars/offsite_hosts/vars.yml
 git commit -m "feat(inventory): manage askari over wt0 + enable mesh-only SSH
 host_vars/askari.yml points ansible_host at the wt0 IP (overriding the generated
 offsite.yml); offsite_hosts sets base__ssh_listen_mesh_only. Mesh-hardening 1/3.
 Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
 ```
 ---
 ### Task 4: Terraform — retire the Hetzner WAN `:22` rule
 **Files:**
 - Modify: `terraform/modules/hetzner_vm/main.tf`
 - Modify: `terraform/modules/hetzner_vm/variables.tf`
 - Modify: `terraform/environments/offsite/main.tf`
 This task makes the SSH rule conditional and sets askari's admin CIDRs to empty (mesh-only). The live `tf-plan`/`tf-apply` happens in Task 5 — here we only change + format/validate the code.
 - [ ] **Step 1: Gate the SSH rule on a non-empty CIDR list**
 In `terraform/modules/hetzner_vm/main.tf`, replace the static SSH `rule { ... }` block (the one with `port = "22"`) with a dynamic block:
 ```hcl
  # SSH from the control node only — and only when admin CIDRs are set. An empty
  # ssh_admin_cidrs removes the WAN :22 rule entirely (mesh-only SSH; reach the host over
  # wt0, break-glass = Hetzner console). Mesh-hardening 1/3.
  dynamic "rule" {
    for_each = length(var.ssh_admin_cidrs) > 0 ? [1] : []
    content {
      direction  = "in"
      protocol   = "tcp"
      port       = "22"
      source_ips = var.ssh_admin_cidrs
    }
  }
 ```
 - [ ] **Step 2: Default the variable to empty**
 In `terraform/modules/hetzner_vm/variables.tf`, change the `ssh_admin_cidrs` variable to default to an empty list:
 ```hcl
 variable "ssh_admin_cidrs" {
  description = "Source CIDRs allowed to reach SSH over the WAN. Empty = no WAN SSH rule (mesh-only)."
  type        = list(string)
  default     = []
 }
 ```
 - [ ] **Step 3: Set askari to mesh-only SSH**
 In `terraform/environments/offsite/main.tf`, change the `ssh_admin_cidrs` argument in the `module "askari"` block to:
 ```hcl
  ssh_admin_cidrs = [] # mesh-only: SSH is reached over wt0; WAN :22 retired (mesh-hardening 1/3)
 ```
 - [ ] **Step 4: Format + validate**
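 Run: `(cd terraform/environments/offsite && terraform fmt -recursive ../.. && terraform validate)` — the subshell keeps your working directory at the repo root even if `fmt` or `validate` fails, so the repo-relative `git add` in Step 5 still works.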
 Expected: `fmt` lists any files it reformatted (stage those as well in Step 5); `validate` prints `Success! The configuration is valid.` (offsite is already `init`ed — it has live state.)
 - [ ] **Step 5: Commit**
 ```bash
 git add terraform/modules/hetzner_vm/main.tf terraform/modules/hetzner_vm/variables.tf \
        terraform/environments/offsite/main.tf
 git commit -m "feat(tf/offsite): retire askari's WAN :22 (mesh-only SSH)
 The Hetzner Cloud Firewall SSH rule is now conditional on a non-empty
 ssh_admin_cidrs (default []); askari sets it empty so the WAN :22 rule is
 removed on the next apply. SSH is reached over wt0; break-glass is the Hetzner
 console. Apply is the live cutover (Task 5). Mesh-hardening 1/3.
 Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
 ```
 ---
 ### Task 5: Live staged cutover (operator-supervised — NOT a subagent task)
 > This task touches the real askari over the network and is lockout-risky. Run it
 > interactively with the operator, in order, verifying each step before the next. The
 > firewall's auto-rollback timer + `wait_for_connection` over wt0 is the safety net; the
 > Hetzner web console is the ultimate break-glass. Do NOT hand this to an unattended agent.
 - [ ] **Step 1: Pre-check the mesh SSH path (before any change)**
 Run: `.venv/bin/ansible askari -i inventories/production/ -m ping`
 Expected: `SUCCESS` — confirms Ansible reaches askari over `wt0` (Tasks 1–3 are merged, so `ansible_host` is now `100.99.226.39`). If this fails, STOP — the mesh path must work before closing the WAN.
 - [ ] **Step 2: Dry-run the base apply (firewall + sshd)**
 Run: `make check PLAYBOOK=site LIMIT=askari TAGS=firewall,hardening`
 Expected: shows the nftables ruleset diff (default-deny + wt0 SSH + public 80/443/3478) and the sshd drop-in diff (`ListenAddress 100.99.226.39`); no errors. Review that the public service rules are present (so they won't be dropped).
 - [ ] **Step 3: Apply the host firewall + sshd (auto-rollback armed)**
 Run: `make deploy PLAYBOOK=site LIMIT=askari TAGS=firewall,hardening`
 Expected: the firewall concern arms the rollback timer, applies, resets the connection, and `wait_for_connection` succeeds over wt0; sshd reloads with the mesh ListenAddress. If connectivity is lost, the timer auto-reverts the ruleset within `base__firewall_rollback_timeout` (45 s).
 - [ ] **Step 4: Verify the public services (WAN SSH is still allowed at the cloud edge until Step 5)**
 ```bash
 curl -sSf -o /dev/null -w '%{http_code}\n' https://test.askari.wingu.me   # expect 200
 curl -sSf -o /dev/null -w '%{http_code}\n' https://netbird.askari.wingu.me # expect 200
 ```
 Expected: both `200` (valid certs); the host firewall did not drop the public services. (WAN `:22` is now dropped by the host nftables, but the Hetzner FW still allows it until Step 5 — that's fine.)
 - [ ] **Step 5: Retire the Hetzner WAN `:22` — plan, review, apply**
 Run: `make tf-plan TF_ENV=offsite`
 Expected: the plan shows the SSH firewall rule being **destroyed** (and nothing else of substance). Review it.
 Then: `make tf-apply TF_ENV=offsite`
 Expected: apply succeeds; the WAN `:22` rule is gone.
 - [ ] **Step 6: Verify the end-state (out-of-band)**
 From an OFF-MESH host (e.g. the operator's laptop with NetBird disconnected, or a quick check from askari's perspective):
 ```bash
 nc -vz -w5 77.42.120.136 22   # expect: refused / timeout (WAN SSH closed)
 nc -vz -w5 77.42.120.136 443  # expect: open (public service intact)
 ```
 And from ubongo over the mesh: `.venv/bin/ansible askari -i inventories/production/ -m ping` → `SUCCESS`.
 - [ ] **Step 7: Reboot resilience check (optional but recommended)**
 Reboot askari from the Hetzner console; after it comes back, confirm `ansible askari -m ping` succeeds over wt0 without intervention (proves `ip_nonlocal_bind` beat the post-boot bind race).
 - [ ] **Step 8: Update STATUS + ROADMAP**
 - In `STATUS.md`, update the askari row: SSH is now wt0-only; the host nftables default-deny is applied; the Hetzner WAN `:22` is retired. Move "host firewall + moving askari's SSH onto wt0" out of *Pending*.
 - In `docs/ROADMAP.md`, mark mesh-hardening sub-project 1 (askari SSH→wt0) done; next is sub-project 2 (ubongo default-deny).
 ```bash
 git add STATUS.md docs/ROADMAP.md
 git commit -m "docs: askari SSH moved onto wt0 (mesh-hardening 1/3 done)
 Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
 ```
 - [ ] **Step 9: Push**
 Run: `git push origin main`
 ---
 ## Self-review (against the spec)
 - **§ three layers** → Task 1 (sshd ListenAddress), Task 2 (nftables catalog; SSH-on-wt0 pre-existing via `base__firewall_mgmt_interface`), Task 4 (Hetzner WAN :22). ✓
 - **§ boot-race fix** (`ip_nonlocal_bind` + fail-closed assert + live wt0 fact) → Task 1 Steps 4–6. ✓
 - **§ new code/vars** (`base__ssh_listen_mesh_only`, `base__ssh_listen_addr`, host_vars/askari.yml, offsite flag, catalog, TF) → Tasks 1–4. ✓
 - **§ staged cutover** → Task 5 Steps 1–6, with the firewall auto-rollback as the gate. ✓
 - **§ testing** → Molecule render asserts (ListenAddress, sysctl, public-zone rule) + filter unit test + live out-of-band checks. The fail-closed assert is implemented in the role but is not covered by an automated test; to spot-check it, temporarily blank `base__ssh_listen_addr` in the converge fixture, confirm `make test ROLE=base` fails on the assert, then revert (manual, not automated — a deliberate-failure Molecule scenario is non-idiomatic). ✓
 - **§ risks/rollback** → auto-rollback timer (Task 5 Step 3), `ip_nonlocal_bind` (Task 1), Hetzner console break-glass, re-addable TF rule. ✓
 - **IPv6 note** → recorded in the catalog comment (Task 2 Step 6); acceptable because askari has only an A record.
--- a/docs/superpowers/plans/2026-06-18-local-vm-integration-testing.md
+++ b/docs/superpowers/plans/2026-06-18-local-vm-integration-testing.md
--- a/docs/superpowers/plans/2026-06-19-mesh-hardening-askari-redesign.md
+++ b/docs/superpowers/plans/2026-06-19-mesh-hardening-askari-redesign.md
@ -1,409 +0,0 @@
 # Mesh-hardening redesign (askari) — Implementation Plan
 > **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
 **Goal:** Harden askari's inbound surface with the proven ubongo INPUT-only default-deny pattern (SSH scoped by `iifname "wt0"` + a permanent WAN break-glass), and make the NetBird coordinator survive a no-egress startup — reboot-safe, no boot-race, no lockout.
 **Architecture:** Mirror mesh-hardening 2/3 (ubongo): `base` firewall INPUT-only (`base__firewall_input_only: true`, forward stays `policy accept` so Docker forwarding/NAT survive), **no** sshd `ListenAddress` change (the firewall, not sshd, scopes `:22`). The coordinator-host exception: WAN `:22` stays open from ubongo's static WAN IP as the always-available non-mesh break-glass (the Hetzner console is the ultimate fallback). A `netbird_coordinator` change disables geolocation so a transient egress loss can't FATAL the control plane. Validate firewall reboot-safety on a throwaway VM (ADR-025 harness) GREEN before a supervised live cutover.
 **Tech Stack:** Ansible (`base`, `netbird_coordinator` roles), nftables, Docker Compose, Molecule (Debian 13), the `scripts/integration-vm.py` ADR-025 harness, NetBird self-hosted `netbird-server:0.72.4`.
 **Spec:** `docs/superpowers/specs/2026-06-19-mesh-hardening-askari-redesign-design.md`
 ## Global Constraints
 - **FQCN always** (`ansible.builtin.*`); role defaults use the `rolename__var` namespace.
 - **No sshd `ListenAddress` change** — `base__ssh_listen_mesh_only` stays `false` everywhere here (this is what sidesteps the 2026-06-17 boot-race).
 - **WAN `:22` is never closed** — no Terraform / Hetzner-Cloud-Firewall change in this plan.
 - **`base__firewall_input_only: true` on askari** — the forward chain must stay `policy accept` (Docker host). Never apply a forward-`drop` firewall to askari.
 - **ubongo's WAN IP is `91.226.145.80`** (operator-confirmed static 2026-06-19) — the break-glass anchor.
 - **askari `wt0` IP is `100.99.226.39`**; askari domain `netbird.askari.wingu.me`.
 - **Before any commit:** `rbw unlocked` must succeed (the pre-commit hook decrypts `vault.yml`); run `make lint` and it must be clean.
 - **Tags:** import each role at play level with its role-name tag; only use concern tags from `tests/tags.yml`.
 - **Harness GREEN before live** (Task 3 before Task 4). The live cutover (Task 4) is **operator-gated** — never run autonomously.
 ---
 ### Task 1: Disable geolocation in `netbird_coordinator` (FRICTION 2026-06-17 #4)
 Make the control plane survive a startup with no container egress: NetBird's combined server downloads the GeoLite2 DB at boot and treats failure as FATAL. boma uses no geo posture (ACL is Allow-All), so disable geolocation entirely via the documented env var. TDD'd through the role's render-only Molecule scenario.
 > verified: NetBird self-hosted geolocation knobs (`NB_DISABLE_GEOLOCATION`, `disableGeoliteUpdate`, GeoLite2 pre-seed) · WebFetch · docs.netbird.io/selfhosted/geo-support · 2026-06-19 — *from a docs summary; the live "healthy with egress blocked" check in Task 4 is the real gate, with a concrete pre-seed fallback there.*
 **Files:**
 - Modify: `roles/netbird_coordinator/defaults/main.yml` (add the knob)
 - Modify: `roles/netbird_coordinator/templates/docker-compose.yml.j2:14-27` (add `environment:` to `netbird-server`)
 - Test: `roles/netbird_coordinator/molecule/default/verify.yml:21-32` (assert the rendered compose)
 - Modify: `roles/netbird_coordinator/README.md` (one line documenting the knob)
 **Interfaces:**
 - Produces: role default `netbird_coordinator__disable_geolocation` (bool, default `true`); rendered compose env `NB_DISABLE_GEOLOCATION: "true"` on the `netbird-server` service.
 - [ ] **Step 1: Write the failing Molecule assertion**
 Append to `roles/netbird_coordinator/molecule/default/verify.yml` (after the existing compose-tags assert, inside the same `tasks:` list):
 ```yaml
    - name: Assert geolocation is disabled (FRICTION 2026-06-17 #4 — no geo-DB download FATAL)
      ansible.builtin.assert:
        that:
          - "'NB_DISABLE_GEOLOCATION: \"true\"' in (_compose.content | b64decode)"
        fail_msg: >-
          compose must set NB_DISABLE_GEOLOCATION=true so a no-egress startup can't FATAL
          the coordinator on the GeoLite2 download
        success_msg: "geolocation disabled in compose"
 ```
 - [ ] **Step 2: Run Molecule to verify it fails**
 Run: `make test ROLE=netbird_coordinator`
 Expected: FAIL at "Assert geolocation is disabled" — the rendered compose has no `NB_DISABLE_GEOLOCATION`.
 - [ ] **Step 3: Add the default knob**
 Add to `roles/netbird_coordinator/defaults/main.yml` (after line 7, the `__domain` line):
 ```yaml
 # Disable NetBird's GeoLite2 geolocation (download + lookups). boma uses no geo posture
 # (ACL is Allow-All), and the combined server treats a failed GeoLite2 download as FATAL —
 # so a transient egress loss (NAT wiped on `nft flush`, or the boot window before Docker
 # re-adds NAT) would crash-loop the whole control plane (FRICTION 2026-06-17 #4). Disabling
 # removes that dependency. Revisit if a future ACL sub-project wants geo-based posture.
 netbird_coordinator__disable_geolocation: true
 ```
 - [ ] **Step 4: Render the env in the compose template**
 In `roles/netbird_coordinator/templates/docker-compose.yml.j2`, add an `environment:` block to the `netbird-server` service, immediately after its `command:` line (line 18):
 ```yaml
    environment:
      # Disable geolocation so a no-egress startup can't FATAL the control plane
      # (FRICTION 2026-06-17 #4). boma uses no geo posture (ACL Allow-All).
      NB_DISABLE_GEOLOCATION: "{{ netbird_coordinator__disable_geolocation | string | lower }}"
 ```
 - [ ] **Step 5: Run Molecule to verify it passes**
 Run: `make test ROLE=netbird_coordinator`
 Expected: PASS — all asserts green, including "geolocation disabled in compose"; Molecule idempotence clean.
 - [ ] **Step 6: Document the knob**
 Add one line to `roles/netbird_coordinator/README.md` under its variables/defaults section:
 ```markdown
 - `netbird_coordinator__disable_geolocation` (default `true`) — sets `NB_DISABLE_GEOLOCATION` so a no-egress startup can't FATAL the server on the GeoLite2 download (FRICTION 2026-06-17 #4).
 ```
 - [ ] **Step 7: Lint and commit**
 ```bash
 rbw unlocked && make lint
 git add roles/netbird_coordinator/defaults/main.yml \
        roles/netbird_coordinator/templates/docker-compose.yml.j2 \
        roles/netbird_coordinator/molecule/default/verify.yml \
        roles/netbird_coordinator/README.md
 git commit -m "feat(netbird_coordinator): disable geolocation so no-egress startup can't FATAL the control plane" \
           -m "Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
 ```
 ---
 ### Task 2: Enable askari's host firewall (INPUT-only) + WAN break-glass + manage over `wt0`
 Flip askari from "firewall not applied" to the redesigned INPUT-only default-deny, add the permanent WAN break-glass source, and point Ansible at the mesh. Pure inventory change — validated by lint + inventory resolution (the firewall *behavior* is proven in Task 3).
 **Files:**
 - Modify: `inventories/production/group_vars/offsite_hosts/vars.yml` (replace the whole file body)
 - Create: `inventories/production/host_vars/askari.yml`
 **Interfaces:**
 - Consumes: `base` knobs `base__firewall_apply`, `base__firewall_input_only`, `base__firewall_admin_addrs`, `base__ssh_listen_mesh_only`, `base__mesh_enabled` (all defined in `roles/base/defaults/main.yml`).
 - Produces: askari resolves `ansible_host: 100.99.226.39`, `base__firewall_apply: true`, `base__firewall_input_only: true`, `base__firewall_admin_addrs: ["91.226.145.80"]`.
 - [ ] **Step 1: Rewrite the offsite group_vars**
 Replace the body of `inventories/production/group_vars/offsite_hosts/vars.yml` with:
 ```yaml
 ---
 # Off-site hosts (askari). askari runs the NetBird coordinator AND is a mesh peer
 # (ADR-016, M5).
 #
 # Mesh-hardening REDESIGN (2026-06-19): the 2026-06-17 attempt was backed out (forward
 # `policy drop` broke Docker on reboot; wt0-only sshd left no break-glass; ip_nonlocal_bind
 # did not beat the boot-race). The redesign mirrors the proven ubongo 2/3 pattern:
 #   - INPUT-only default-deny (base__firewall_input_only) — forward stays `policy accept`
 #     so Docker container forwarding/NAT survive a reboot;
 #   - SSH scoped by the host firewall (iifname wt0 + admin-addr), NOT a sshd ListenAddress
 #     change — base__ssh_listen_mesh_only stays false, so there is no boot-race;
 #   - WAN :22 is DELIBERATELY left open from ubongo's WAN IP (base__firewall_admin_addrs)
 #     as the permanent non-mesh break-glass — the coordinator-host exception (a host's only
 #     management path must never depend on a service that host itself hosts).
 # Spec: docs/superpowers/specs/2026-06-19-mesh-hardening-askari-redesign-design.md
 base__mesh_enabled: true
 base__firewall_apply: true
 base__firewall_input_only: true     # forward stays `policy accept` → Docker-safe
 base__ssh_listen_mesh_only: false   # no sshd ListenAddress change → no boot-race
 base__firewall_admin_addrs:
  - 91.226.145.80   # ubongo's (static) WAN IP — the permanent non-mesh SSH break-glass
 ```
 - [ ] **Step 2: Create the askari host_vars to manage over the mesh**
 Create `inventories/production/host_vars/askari.yml`:
 ```yaml
 ---
 # Manage askari over the NetBird mesh (wt0). Overrides the TF-generated WAN `ansible_host`
 # in offsite.yml (host_vars are NOT regenerated by tf_to_inventory.py). The WAN :22 path
 # (Hetzner Cloud Firewall + base__firewall_admin_addrs = ubongo's WAN) stays as the
 # break-glass; the Hetzner web console is the IP-independent ultimate fallback.
 # Spec: docs/superpowers/specs/2026-06-19-mesh-hardening-askari-redesign-design.md
 ansible_host: 100.99.226.39
 ```
 - [ ] **Step 3: Verify the inventory resolves**
 Run: `ansible-inventory -i inventories/production --host askari`
 Expected: JSON shows `"ansible_host": "100.99.226.39"`, `"base__firewall_apply": true`, `"base__firewall_input_only": true`, and `"base__firewall_admin_addrs": ["91.226.145.80"]`.
 - [ ] **Step 4: Lint**
 Run: `rbw unlocked && make lint`
 Expected: clean (no yamllint/ansible-lint errors).
 - [ ] **Step 5: Commit**
 ```bash
 git add inventories/production/group_vars/offsite_hosts/vars.yml \
        inventories/production/host_vars/askari.yml
 git commit -m "feat(inventory): askari INPUT-only firewall + WAN break-glass + manage over wt0" \
           -m "Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
 ```
 ---
 ### Task 3: Integration harness "askari_inputonly" profile — the reboot-safety GREEN gate
 Prove on a throwaway VM (ADR-025) that the redesigned firewall is reboot-safe BEFORE touching the real host: INPUT default-deny + forward accept + the admin-addr break-glass + published-port DNAT all survive a reboot. New profile (keeps the existing `askari` profile, which validates the `docker_host` container-forward drop-in path, intact).
 **Files:**
 - Create: `tests/integration/profiles/askari_inputonly.json`
 - Create: `tests/integration/overrides/askari_inputonly.yml`
 - Modify: `tests/integration/verify.yml` (allow-list + a new profile branch)
 **Interfaces:**
 - Consumes: the `scripts/integration-vm.py` harness; `make test-integration HOST=<profile>` maps `HOST` to `profiles/<HOST>.json` (a profile name, not a production inventory host).
 - Produces: profile `askari_inputonly` with `integration_profile: askari_inputonly`.
 - [ ] **Step 1: Add the new profile to the verify allow-list and a failing branch**
 In `tests/integration/verify.yml`, change the allow-list assert (line 14) from:
 ```yaml
          - integration_profile in ['askari', 'ubongo']
 ```
 to:
 ```yaml
          - integration_profile in ['askari', 'askari_inputonly', 'ubongo']
 ```
 and update its `fail_msg` (line 15) to `"integration_profile must be set in the profile overlay (askari|askari_inputonly|ubongo)"`. Then append this block to the `tasks:` list (after the ubongo block):
 ```yaml
    # ── askari_inputonly profile — the mesh-hardening REDESIGN (2026-06-19) ──
    # INPUT-only default-deny on a Docker host: input policy drop, forward policy ACCEPT
    # (Docker-safe), SSH via the admin-addr break-glass, published-port DNAT survives reboot.
    - name: (askari_inputonly) Read the live nftables ruleset
      when: integration_profile == 'askari_inputonly'
      ansible.builtin.command: nft list ruleset
      register: _nft_io
      changed_when: false
    - name: (askari_inputonly) INPUT default-deny, forward permissive, admin-addr break-glass
      when: integration_profile == 'askari_inputonly'
      ansible.builtin.assert:
        that:
          - "'hook input priority filter; policy drop;' in _nft_io.stdout"
          - "'hook forward priority filter; policy accept;' in _nft_io.stdout"
          - "'ip saddr 192.168.150.1 tcp dport 22 accept' in _nft_io.stdout"
        fail_msg: >-
          askari_inputonly: expected input policy drop, forward policy accept (input-only),
          and the admin-addr break-glass (192.168.150.1) SSH allow in the live ruleset.
    - name: (askari_inputonly) Gather service facts
      when: integration_profile == 'askari_inputonly'
      ansible.builtin.service_facts:
    - name: (askari_inputonly) Docker daemon is active
      when: integration_profile == 'askari_inputonly'
      ansible.builtin.assert:
        that: "ansible_facts.services['docker.service'].state == 'running'"
        fail_msg: "docker.service is not running"
    - name: (askari_inputonly) Published port answers from the controller (DNAT + forward alive)
      when: integration_profile == 'askari_inputonly'
      delegate_to: localhost
      become: false
      ansible.builtin.uri:
        url: "http://{{ ansible_host }}/"
        follow_redirects: none
        status_code: [200, 301, 308, 404, 502, 503]
        timeout: 10
      register: _probe_io
      retries: 5
      delay: 6
      until: _probe_io is succeeded
 ```
 - [ ] **Step 2: Create the profile descriptor**
 Create `tests/integration/profiles/askari_inputonly.json`:
 ```json
 {
  "groups": ["offsite_hosts"],
  "applies": [
    {"playbook": "site.yml", "tags": ["base"]},
    {"playbook": "offsite.yml", "tags": ["docker_host", "reverse_proxy"]}
  ],
  "extra_vars_files": ["overrides/askari_inputonly.yml"],
  "mem_mib": 3072,
  "vcpus": 2
 }
 ```
 - [ ] **Step 3: Create the overlay**
 Create `tests/integration/overrides/askari_inputonly.yml`:
 ```yaml
 ---
 # Integration overlay (ADR-025) — the askari mesh-hardening REDESIGN (2026-06-19).
 # Validates INPUT-only default-deny on a Docker host: input policy drop, forward policy
 # accept (Docker-safe), SSH via the admin-addr break-glass, reboot-survivable.
 integration_profile: askari_inputonly
 base__firewall_apply: true
 base__firewall_input_only: true
 # No sshd ListenAddress change — never wt0-only in a throwaway VM.
 base__ssh_listen_mesh_only: false
 # Isolated VM: never touch the real mesh.
 base__mesh_enabled: false
 # The non-mesh SSH break-glass = the admin-addr path the real design uses. Point it at the
 # VM's libvirt-NAT gateway (where the harness connects from), by source IP so it is
 # interface-independent and the default-deny + reboot don't lock out the driver. This
 # mirrors askari's real base__firewall_admin_addrs (ubongo's WAN) in the test topology.
 base__firewall_admin_addrs:
  - 192.168.150.1
 ```
 - [ ] **Step 4: Run the harness — the GREEN gate**
 Run: `make test-integration HOST=askari_inputonly`
 Expected: GREEN. The harness boots a VM, applies `base` (INPUT-only) + `docker_host` + `reverse_proxy`, **reboots**, re-SSHes (proving the admin-addr break-glass survives), then `verify.yml` asserts input `policy drop`, forward `policy accept`, the `192.168.150.1` SSH allow, Docker active, and the published `:80` answering. Clean up: `make test-integration-clean`.
 - [ ] **Step 5: Commit**
 ```bash
 rbw unlocked && make lint
 git add tests/integration/profiles/askari_inputonly.json \
        tests/integration/overrides/askari_inputonly.yml \
        tests/integration/verify.yml
 git commit -m "test(integration): askari_inputonly profile — INPUT-only default-deny reboot gate" \
           -m "Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
 ```
 ---
 ### Task 4: Supervised live cutover + STATUS/ROADMAP update — ⚠️ OPERATOR-GATED
 > **⚠️ DO NOT run this task autonomously.** It changes the live off-site host (lockout risk) and runs `make deploy`. An automated executor must STOP here and hand back to the operator. Preconditions: Tasks 1–3 committed and GREEN; `rbw unlocked`; the **Hetzner web console** open in a browser (the out-of-band ultimate break-glass); the operator present. The WAN `:22` break-glass is never removed, so a fallback path is open throughout (FRICTION 2026-06-17 #6).
 **Files (Step 7 only):**
 - Modify: `STATUS.md` (askari row), `docs/ROADMAP.md` (Next step)
 - [ ] **Step 1: Pre-check both paths are healthy**
 ```bash
 ssh sjat@100.99.226.39 true && echo "wt0 SSH OK"
 ansible askari -i inventories/production -m ping
 curl -sI https://test.askari.wingu.me | head -1
 curl -sI https://netbird.askari.wingu.me | head -1
 ```
 Expected: wt0 SSH OK; ping `pong`; both curls `HTTP/2 200`.
 - [ ] **Step 2: Dry-run the converge (mandatory `check` before `deploy`)**
 ```bash
 make check PLAYBOOK=site LIMIT=askari
 ```
 Expected: changes limited to the `base` firewall (input-only ruleset, admin-addr) + the `netbird_coordinator` compose env (`NB_DISABLE_GEOLOCATION`). Review and show the output before proceeding.
 - [ ] **Step 3: Apply (operator present, console open, auto-rollback armed)**
 ```bash
 make deploy PLAYBOOK=site LIMIT=askari
 ```
 The `base` firewall concern arms the auto-rollback timer (`base__firewall_rollback_timeout: 45`) and reconnects over `wt0` — a bad ruleset reverts itself. Expected: converge OK; SSH-over-`wt0` stays up.
 - [ ] **Step 4: Rebuild NAT and confirm the coordinator is healthy with geo disabled**
 `base`'s `flush ruleset` wipes Docker's nat (FRICTION) — rebuild it, then confirm the control plane:
 ```bash
 ssh sjat@100.99.226.39 'sudo systemctl restart docker'
 ssh sjat@100.99.226.39 'docker ps --format "{{.Names}} {{.Status}}"'
 ssh sjat@100.99.226.39 'docker logs --since 2m netbird-server 2>&1 | grep -iE "geo|fatal" || echo "no geo/fatal log lines"'
 ```
 Expected: `netbird-server` + `netbird-dashboard` Up; no geo-DB FATAL.
 > **Contingency (only if `netbird-server` still FATALs on geolocation):** `NB_DISABLE_GEOLOCATION` was not honored by the pinned image. Pre-seed the DB into the volume instead — `ssh sjat@100.99.226.39 'sudo curl -fSL -o /var/lib/docker/volumes/netbird_data/_data/GeoLite2-City_20260101.mmdb https://pkgs.netbird.io/geolite2/GeoLite2-City.mmdb && sudo docker restart netbird-server'` — and add `disableGeoliteUpdate: true` under `server:` in `config.yaml.j2` so it never re-downloads. Re-verify, then fold the working fix back into the role (amend Task 1).
 - [ ] **Step 5: Verify the new steady state (both SSH paths + services)**
 ```bash
 ssh sjat@100.99.226.39 true && echo "wt0 SSH OK"
 # From ubongo: SSH to askari's WAN IP. ubongo's packets egress via OPNsense, SNAT'd to the
 # WAN IP 91.226.145.80 — matching askari's admin-addr break-glass rule. (No BindAddress:
 # ubongo does not hold 91.226.145.80; OPNsense does.)
 ssh sjat@77.42.120.136 true && echo "WAN break-glass OK"
 curl -sI https://test.askari.wingu.me | head -1
 nc -vz -u 77.42.120.136 3478   # STUN answers
 ```
 Expected: both SSH paths succeed; cert valid; STUN reachable.
 - [ ] **Step 6: Reboot-resilience — the real test (console available)**
 ```bash
 ssh sjat@100.99.226.39 'sudo systemctl reboot'
 # wait ~60s, then from ubongo — no manual intervention:
 sleep 60; ssh sjat@100.99.226.39 'nft list chain inet filter input | grep -E "policy drop|wt0|91.226.145.80"'
 curl -sI https://netbird.askari.wingu.me | head -1
 ssh sjat@100.99.226.39 'docker ps --format "{{.Names}} {{.Status}}"'
 ```
 Expected, unattended: input `policy drop` with the `wt0` + `91.226.145.80` allows; public cert valid; both containers Up; `wt0` SSH back. (If lost: recover via the Hetzner console — the firewall auto-rollback and the WAN break-glass should make that unnecessary.)
 - [ ] **Step 7: Record reality in the ground-truth docs and commit**
 Update `STATUS.md` (the askari row): firewall now **applied** — INPUT-only default-deny, SSH `wt0`-primary + permanent WAN break-glass (ubongo's WAN), managed over `wt0`, geolocation disabled, **reboot-validated**. Update `docs/ROADMAP.md` "Next step": mark the askari SSH→`wt0` redesign **DONE**; the next mesh-hardening sub-project is the **SPOF reduction** (askari relay single-point-of-failure) — confirmed by the `ubongo → askari` `Relayed` finding (2026-06-19).
 ```bash
 rbw unlocked && make lint
 git add STATUS.md docs/ROADMAP.md
 git commit -m "docs(status): mesh-hardening redesign — askari INPUT-only + WAN break-glass applied + reboot-validated" \
           -m "Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
 ```
 ---
 ## Notes / out of scope (carry to the SPOF sub-project)
 - **SPOF reduction is the next sub-project** (operator decision 2026-06-19): `ubongo → askari` is currently `Relayed` through askari's own relay; if askari is down, relayed peers lose the mesh data plane. Its own spec.
 - **NetBird ACL stays Allow-All** — any enrolled peer can reach askari `wt0:22` until a later sub-project.
 - **Full forward-chain hardening** (`docker_host` container-forward drop-in over the `input_only` baseline) — a later tightening; the existing `askari` integration profile already covers that path.
 - **Coordinator off-site backup** (FRICTION 2026-06-17 #5, ADR-022) — still pending; not in scope.
--- a/docs/superpowers/plans/2026-06-19-mesh-hardening-ubongo-default-deny.md
+++ b/docs/superpowers/plans/2026-06-19-mesh-hardening-ubongo-default-deny.md
@ -1,470 +0,0 @@
 # Mesh-hardening 2/3 — ubongo INPUT-only default-deny — Implementation Plan
 > **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
 **Goal:** Apply base's nftables firewall to the control node (ubongo) as an INPUT-only default-deny — hardening its inbound surface — while leaving the forward chain permissive so Docker egress and the libvirt-NAT integration harness keep working, and without any sshd `ListenAddress` change.
 **Architecture:** Two new `base` knobs make the existing firewall concern fit a control node: `base__firewall_input_only` flips the forward chain to `policy accept` (host-local input filtering only), and `base__firewall_admin_addrs` adds operator-workstation LAN sources to the SSH allow-list (alongside `wt0` and `ssh-from-control`). sshd is untouched (nftables does the scoping → no `ip_nonlocal_bind` boot-race). The change is validated on a throwaway VM via the ADR-025 integration harness (a new "be ubongo" profile) before an operator-supervised live cutover whose safety net is the firewall auto-rollback timer plus the permanent on-prem physical console.
 **Tech Stack:** Ansible (role `base`, FQCN), nftables, Jinja2, Molecule on Debian 13, pytest (none new), the ADR-025 integration harness (`scripts/integration-vm.py`, JSON profiles, `-e @` overlays).
 **Spec:** `docs/superpowers/specs/2026-06-19-mesh-hardening-ubongo-default-deny-design.md`
 **Conventions:** `make lint` and `make test ROLE=base` before each commit; `make check` before `make deploy`; never hand-edit the generated `offsite.yml`; `rbw unlocked` for any commit touching Ansible content and for the integration/live applies (the production `group_vars/all/vault.yml` is in inventory scope and gets decrypted at playbook load). Tasks 1–3 are code (subagent-driven, each lint/Molecule-verified). Task 4 is a real-VM validation gate on ubongo. Task 5 is the live, operator-supervised cutover.
 ---
 ## File Structure
 | File | Create/Modify | Responsibility |
 |---|---|---|
 | `roles/base/defaults/main.yml` | Modify | Declare `base__firewall_input_only` + `base__firewall_admin_addrs` (defaults: off / empty). |
 | `roles/base/templates/nftables.conf.j2` | Modify | Conditional forward policy; render an SSH-allow rule per admin address. |
 | `roles/base/molecule/default/converge.yml` | Modify | Fixture: an admin-addr source (input-only stays at its default → forward drop). |
 | `roles/base/molecule/default/verify.yml` | Modify | Assert forward-drop default + the admin-addr rule render. |
 | `inventories/production/group_vars/control/vars.yml` | Modify | Turn the knobs on for ubongo (input-only; mamba's LAN IP). |
 | `tests/integration/overrides/ubongo.yml` | Create | The "be ubongo" overlay (input-only firewall; harness SSH lifeline). |
 | `tests/integration/profiles/ubongo.json` | Create | The "be ubongo" VM profile (group `control`, applies `site.yml:base`). |
 | `tests/integration/overrides/askari.yml` | Modify | Add the `integration_profile` marker (verify is now profile-aware). |
 | `tests/integration/verify.yml` | Modify | Gate the askari (Docker/DNAT) block; add the ubongo (input-only) block + a guard. |
 | `STATUS.md`, `docs/ROADMAP.md` | Modify (Task 5) | Record mesh-hardening 2/3 done. |
 ---
 ### Task 1: base role — `base__firewall_input_only` (forward policy) + `base__firewall_admin_addrs` (LAN SSH allow)
 **Files:**
 - Modify: `roles/base/defaults/main.yml`
 - Modify: `roles/base/templates/nftables.conf.j2`
 - Modify: `roles/base/molecule/default/converge.yml`
 - Modify: `roles/base/molecule/default/verify.yml`
 > **Test strategy (note):** Molecule renders one fixture, so it locks the *secure default* —
 > `input_only` **off** → forward `policy drop` — plus the new admin-addr rule (red→green). The
 > `input_only` **on** → forward `policy accept` path is exercised on a real VM by the
 > integration "be ubongo" profile (Tasks 3–4), whose verify fails red until this template
 > conditional exists. Both branches are covered, across the two test layers.
 - [ ] **Step 1: Write the failing test (extend Molecule verify)**
 In `roles/base/molecule/default/verify.yml`, after the `Assert the docker_host extension hook is present` block, add:
 ```yaml
    - name: Assert the forward chain defaults to policy drop (input_only off)
      ansible.builtin.assert:
        that:
          - "'hook forward priority 0; policy drop;' in nft"
        fail_msg: >-
          forward chain must default to policy drop when base__firewall_input_only is
          false (container isolation stays the norm on real service hosts)
    - name: Assert the admin-addr SSH allow rule (operator workstation on the LAN)
      ansible.builtin.assert:
        that:
          - "'ip saddr 10.30.0.77 tcp dport 22 accept' in nft"
        fail_msg: "missing admin-addr SSH allow rule from base__firewall_admin_addrs"
 ```
 - [ ] **Step 2: Add the fixture that drives it (Molecule converge)**
 In `roles/base/molecule/default/converge.yml`, add to the `vars:` block (after the `base__firewall_control_addr` line):
 ```yaml
    base__firewall_admin_addrs:
      - "10.30.0.77"   # fixture: an operator-workstation LAN source (admin-addr SSH allow)
 ```
 - [ ] **Step 3: Run the test to verify it fails**
 Run: `make test ROLE=base`
 Expected: FAIL on `Assert the admin-addr SSH allow rule` (the template does not consume `base__firewall_admin_addrs` yet, so the `ip saddr 10.30.0.77 …` rule is absent). The forward-drop assertion passes already (the template currently hardcodes `policy drop`).
 - [ ] **Step 4: Add the defaults**
 In `roles/base/defaults/main.yml`, after the `base__firewall_apply: true` line (end of the firewall behaviour block, currently line 13), add:
 ```yaml
 base__firewall_input_only: false     # true → the forward chain is `policy accept` (host-local
                                     # INPUT filtering only). For hosts that forward/route
                                     # container or NAT traffic (the control node's Docker +
                                     # libvirt-NAT) where a forward default-deny would break
                                     # them. Real service hosts keep this false (forward drop).
 base__firewall_admin_addrs: []       # extra LAN source IPs allowed to SSH, besides wt0 +
                                     # ssh-from-control. For an operator workstation reaching
                                     # the host over the LAN (no mesh). Key-gated. (ADR-021)
 ```
 - [ ] **Step 5: Make the forward policy conditional + render the admin-addr rules**
 In `roles/base/templates/nftables.conf.j2`:
 (a) Replace the forward-chain line (currently line 21):
 ```jinja
  chain forward { type filter hook forward priority 0; policy {{ 'accept' if base__firewall_input_only | bool else 'drop' }}; }
 ```
 (b) After the `ssh-from-control` `{% endif %}` (currently line 14) and before the `ip protocol icmp accept` line, add the admin-addr loop:
 ```jinja
 {% for addr in base__firewall_admin_addrs %}
    ip saddr {{ addr }} tcp dport {{ base__firewall_ssh_port }} accept
 {% endfor %}
 ```
 - [ ] **Step 6: Run the test to verify it passes**
 Run: `make test ROLE=base`
 Expected: PASS — converge renders the ruleset; verify confirms the forward chain is `policy drop` (input_only defaults false) and the `ip saddr 10.30.0.77 tcp dport 22 accept` rule is present; all pre-existing assertions stay green.
 - [ ] **Step 7: Lint**
 Run: `make lint`
 Expected: `Passed: 0 failure(s)` and `check-tags: OK`.
 - [ ] **Step 8: Commit**
 ```bash
 git add roles/base/defaults/main.yml roles/base/templates/nftables.conf.j2 \
        roles/base/molecule/default/converge.yml roles/base/molecule/default/verify.yml
 git commit -m "feat(base): input-only forward policy + admin-addr SSH allow
 base__firewall_input_only renders the forward chain policy accept (host-local
 INPUT filtering only) for hosts that forward container/NAT traffic; defaults
 false so real service hosts keep the forward default-deny. base__firewall_admin_addrs
 adds operator-workstation LAN sources to the SSH allow-list alongside wt0 +
 ssh-from-control. Molecule locks the secure default + the admin rule.
 Mesh-hardening 2/3 (ADR-020/021).
 Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
 ```
 ---
 ### Task 2: inventory — enable input-only default-deny + mamba on ubongo (control group)
 **Files:**
 - Modify: `inventories/production/group_vars/control/vars.yml`
 - [ ] **Step 1: Turn the knobs on for the control group**
 Append to `inventories/production/group_vars/control/vars.yml`:
 ```yaml
 # Mesh-hardening 2/3 (2026-06-19, ADR-020/021): apply base's host firewall to ubongo as
 # INPUT-only default-deny — harden the inbound surface, leave the forward chain permissive so
 # Docker egress + the libvirt-NAT integration harness keep working. sshd is unchanged
 # (nftables scopes inbound), so there is no boot-race. Reach ubongo over wt0 (mesh), the
 # ssh-from-control self-path (base__firewall_control_addr, group_vars/all = 10.20.10.151), or
 # mamba on the LAN. Break-glass: the physical console. (base__firewall_apply defaults true.)
 base__firewall_input_only: true
 base__firewall_admin_addrs:
  - "10.20.10.50"   # mamba over the LAN (NetBird off). Raw DHCP lease — revisit with an
                    # OPNsense reservation when OPNsense-as-code lands; backstopped by wt0.
  - "10.20.10.17"   # 2nd operator workstation (MAC bc:0f:f3:c8:4a:8a). Raw lease — ditto.
 ```
 - [ ] **Step 2: Verify the vars resolve for ubongo**
 Run: `.venv/bin/ansible-inventory -i inventories/production/ --host ubongo 2>/dev/null | grep -E 'firewall_input_only|firewall_admin_addrs|10.20.10.(50|17)'`
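 Expected: shows `"base__firewall_input_only": true`, the `"base__firewall_admin_addrs"` key, and both `"10.20.10.50"` and `"10.20.10.17"` entries (the output is pretty-printed JSON, so the list's addresses appear on separate lines).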
 - [ ] **Step 3: Lint**
 Run: `make lint`
 Expected: clean pass (`check-tags: OK`).
 - [ ] **Step 4: Commit**
 ```bash
 git add inventories/production/group_vars/control/vars.yml
 git commit -m "feat(inventory): ubongo gets INPUT-only host firewall + mamba LAN SSH
 Enables base__firewall_input_only on the control group (forward chain stays
 permissive so Docker egress + the integration-test libvirt NAT survive) and
 allows the operator workstations' LAN IPs (mamba 10.20.10.50 + 10.20.10.17;
 raw leases, backstopped by wt0). Mesh-hardening 2/3.
 Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
 ```
 ---
 ### Task 3: integration harness — "be ubongo" profile (overlay + profile + profile-aware verify)
 **Files:**
 - Create: `tests/integration/overrides/ubongo.yml`
 - Create: `tests/integration/profiles/ubongo.json`
 - Modify: `tests/integration/overrides/askari.yml`
 - Modify: `tests/integration/verify.yml`
 - [ ] **Step 1: Create the "be ubongo" overlay**
 Create `tests/integration/overrides/ubongo.yml`:
 ```yaml
 ---
 # Integration-test overlay for the "ubongo" profile (ADR-025). Passed via `-e @`.
 # Exercises mesh-hardening 2/3: base's INPUT-only default-deny on the control node — input
 # chain default-deny, forward chain left permissive (Docker/libvirt-NAT safe), no sshd
 # ListenAddress change (so no boot-race).
 integration_profile: ubongo
 base__firewall_apply: true
 base__firewall_input_only: true        # forward chain renders `policy accept`
 base__firewall_admin_addrs:
  - "192.168.150.98"                   # two representative LAN sources — exercises the
  - "192.168.150.99"                   # admin-addr loop with a multi-entry list (like ubongo)
 # Never wt0-only; never touch the real mesh from a throwaway VM.
 base__ssh_listen_mesh_only: false
 base__mesh_enabled: false
 # Allow SSH from the libvirt-NAT gateway (where the driver/ansible connect from) so the
 # default-deny apply + the reboot don't lock out the harness. By source IP (interface-
 # independent). This is the harness's lifeline; the admin-addr above is only exercised.
 base__firewall_control_addr: "192.168.150.1"
 ```
 - [ ] **Step 2: Create the "be ubongo" VM profile**
 Create `tests/integration/profiles/ubongo.json`:
 ```json
 {
  "groups": ["control"],
  "applies": [
    {"playbook": "site.yml", "tags": ["base"]}
  ],
  "extra_vars_files": ["overrides/ubongo.yml"],
  "mem_mib": 2048,
  "vcpus": 2
 }
 ```
 - [ ] **Step 3: Mark the askari overlay with its profile name**
 In `tests/integration/overrides/askari.yml`, after the two header comment lines (before `base__firewall_apply: true`), add:
 ```yaml
 integration_profile: askari
 ```
 - [ ] **Step 4: Make `verify.yml` profile-aware (the test)**
 Replace the entire contents of `tests/integration/verify.yml` with:
 ```yaml
 ---
 # Integration verify (ADR-025). Outcome-based, profile-aware: the active profile is named by
 # `integration_profile` (set in each profile's overlay). Each profile asserts its own success
 # criteria; an unknown/unset profile fails loudly (never a silent pass).
 - name: Verify the rebooted host
  hosts: all
  become: true
  gather_facts: false
  tasks:
    - name: A known integration_profile must be set (no silent pass)
      ansible.builtin.assert:
        that:
          - integration_profile is defined
          - integration_profile in ['askari', 'ubongo']
        fail_msg: "integration_profile must be set in the profile overlay (askari|ubongo)"
    # ── askari profile — Docker host: published-port forwarding survives the reboot ──
    # The load-bearing check probes the VM's published :80 FROM the controller (ubongo) — if
    # base's forward-drop killed DNAT, this times out (the FRICTION 2026-06-17 #1 bug).
    - name: (askari) Gather service facts
      when: integration_profile == 'askari'
      ansible.builtin.service_facts:
    - name: (askari) Docker daemon is active
      when: integration_profile == 'askari'
      ansible.builtin.assert:
        that: "ansible_facts.services['docker.service'].state == 'running'"
        fail_msg: "docker.service is not running"
    - name: (askari) Forward chain permits container traffic (drop-in loaded)
      when: integration_profile == 'askari'
      ansible.builtin.command: nft list chain inet filter forward
      register: _fwd
      changed_when: false
    - name: (askari) Assert container forwarding is allowed (not pure drop)
      when: integration_profile == 'askari'
      ansible.builtin.assert:
        that: "'accept' in _fwd.stdout"
        fail_msg: >-
          forward chain is pure drop — container forwarding will die on reboot
          (FRICTION 2026-06-17 #1). docker_host container-forward drop-in missing.
    - name: (askari) Published port answers from the controller (DNAT + forward alive)
      when: integration_profile == 'askari'
      delegate_to: localhost
      become: false
      ansible.builtin.uri:
        url: "http://{{ ansible_host }}/"
        follow_redirects: none
        status_code: [200, 301, 308, 404, 502, 503]
        timeout: 10
      register: _probe
      retries: 5
      delay: 6
      until: _probe is succeeded
    # ── ubongo profile — control node: INPUT-only default-deny survives the reboot ──
    # SSH reachability across the reboot is proven by the harness itself (it re-SSHes and
    # checks boot_id changed before this verify runs). Here we assert the ruleset shape.
    - name: (ubongo) Read the live nftables ruleset
      when: integration_profile == 'ubongo'
      ansible.builtin.command: nft list ruleset
      register: _nft
      changed_when: false
    - name: (ubongo) INPUT default-deny, forward permissive, admin-addr allow
      when: integration_profile == 'ubongo'
      ansible.builtin.assert:
        that:
          - "'hook input priority 0; policy drop;' in _nft.stdout"
          - "'hook forward priority 0; policy accept;' in _nft.stdout"
          - "'ip saddr 192.168.150.98 tcp dport 22 accept' in _nft.stdout"
          - "'ip saddr 192.168.150.99 tcp dport 22 accept' in _nft.stdout"
        fail_msg: >-
          ubongo profile: expected input policy drop, forward policy accept (input-only),
          and both admin-addr (192.168.150.98/99) SSH allows in the live ruleset.
 ```
 - [ ] **Step 5: Validate the JSON + lint**
 Run: `.venv/bin/python -m json.tool tests/integration/profiles/ubongo.json >/dev/null && echo OK` then `make lint`
 Expected: `OK`, then a clean lint pass (`check-tags: OK`).
 - [ ] **Step 6: Commit**
 ```bash
 git add tests/integration/overrides/ubongo.yml tests/integration/profiles/ubongo.json \
        tests/integration/overrides/askari.yml tests/integration/verify.yml
 git commit -m "test(integration): add the 'be ubongo' profile (input-only default-deny)
 A control-group VM that applies base with INPUT-only default-deny (forward
 policy accept; admin-addr SSH allow). verify.yml is now profile-aware via an
 integration_profile marker — the askari Docker/DNAT block is gated, and a ubongo
 block asserts input drop + forward accept + the admin-addr rule. Enables
 \`make test-integration HOST=ubongo\`. Mesh-hardening 2/3 (ADR-025).
 Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
 ```
 ---
 ### Task 4: Validate on the integration harness (`make test-integration HOST=ubongo`) — the GREEN gate
 > Runs a throwaway UEFI VM on ubongo: boots it, applies the base role with the ubongo
 > overlay (INPUT-only default-deny), **reboots it**, and asserts the ruleset + SSH-returns.
 > This proves the change survives a reboot before the real control node is ever touched
 > (spec §cutover step 1; FRICTION signal-6). No code change / no commit — a validation gate.
 - [ ] **Step 1: Ensure the vault is unlocked**
 The run loads `inventories/production/group_vars/all/vault.yml` (symlinked into the run dir), which is decrypted at playbook load.
 Run: `rbw unlocked || rbw unlock`
 Expected: exits 0 (unlocked). If it prompts, the operator unlocks.
 - [ ] **Step 2: Run the integration cycle**
 Run: `make test-integration HOST=ubongo`
 Expected (the `cycle`: up → apply → reboot → assert): the VM gets a `192.168.150.x` lease; `site.yml --tags base` applies cleanly; `… rebooted (boot_id changed), SSH back at 192.168.150.x`; then `VERIFY PASSED for boma-it-ubongo-…`. The VM is destroyed on success.
 - [ ] **Step 3: On failure, read the diagnostics**
 If it prints `VERIFY FAILED`, diagnostics are in `~/integration-runs/boma-it-ubongo-<id>/` (`nft.txt`, `console.log`, `journal.txt`). The likely suspects: the admin-addr/forward assertion (Task 1/3 wiring) or SSH not returning post-reboot (the `base__firewall_control_addr: 192.168.150.1` lifeline in the overlay). Fix the implicated task and re-commit; if a VM was left defined, run `make test-integration-clean` first; then re-run Step 2.
 - [ ] **Step 4: Record the result**
 Capture the `VERIFY PASSED` line in the task notes (this is the gate Task 5 step 1 depends on). No commit.
 ---
 ### Task 5: Live staged cutover (operator-supervised — NOT a subagent task)
 > Touches the **real ubongo** (the control node Ansible runs from) and reboots it — lockout-
 > risky. Run it interactively with the operator, in order, verifying each step before the
 > next. The firewall auto-rollback timer (`base__firewall_rollback_timeout`, 45 s) +
 > `wait_for_connection` over the live path is the safety net; the **on-prem physical console**
 > is the permanent break-glass. Do NOT hand this to an unattended agent.
 - [ ] **Step 1: Pre-checks (gate: Task 4 GREEN)**
 - `rbw unlocked || rbw unlock`.
 - SSH to ubongo over `wt0` from a road-warrior succeeds.
 - SSH to ubongo from mamba on the LAN (`10.20.10.50`) succeeds.
 - `.venv/bin/ansible ubongo -i inventories/production/ -m ping` → `SUCCESS` (over `10.20.10.151`).
 - The physical console is reachable. If any path fails, STOP.
 - [ ] **Step 2: Dry-run the firewall apply**
 Run: `make check PLAYBOOK=site LIMIT=ubongo TAGS=firewall`
 Expected: the nftables diff shows `policy drop` on input, `iifname "wt0" … accept`, `ip saddr 10.20.10.151 … accept`, `ip saddr 10.20.10.50 … accept`, `ip saddr 10.20.10.17 … accept`, and the forward chain as `policy accept`. No errors.
 - [ ] **Step 3: Apply the host firewall (auto-rollback armed)**
 Run: `make deploy PLAYBOOK=site LIMIT=ubongo TAGS=firewall`
 Expected: the firewall concern snapshots `/etc/nftables.rollback`, arms the 45 s `systemd-run` revert, applies the ruleset, `reset_connection` → `wait_for_connection` over `10.20.10.151` succeeds, then cancels the timer. If connectivity is lost, the timer reverts the ruleset within 45 s and the console is the fallback.
 - [ ] **Step 4: Verify every path + forwarding still works**
 ```bash
 # from a road-warrior over wt0, and from mamba on the LAN:
 ssh sjat@100.99.146.14 true && echo "wt0 OK"
 ssh sjat@10.20.10.151 true && echo "mamba-LAN OK"   # run from mamba (10.20.10.50)
 # Ansible self-path:
 .venv/bin/ansible ubongo -i inventories/production/ -m ping
 # a LAN host NOT on the allow-list (anything other than 10.20.10.50, 10.20.10.17 or 10.20.10.151) must now be refused/timeout on :22
 # Docker egress (forward chain still permissive):
 docker run --rm busybox wget -qO- https://cloudflare.com/cdn-cgi/trace | head -1
 # libvirt-NAT forwarding intact — a fresh integration VM still reaches apt:
 make test-integration HOST=ubongo   # expect VERIFY PASSED (proves the NAT path survived)
 ```
 Expected: `wt0 OK`, `mamba-LAN OK`, Ansible `SUCCESS`, the disallowed host refused, the Docker egress line returns, and the integration cycle passes.
 - [ ] **Step 5: Reboot resilience — while the console is present (FRICTION signal-6)**
 With the operator at the physical console, reboot ubongo (`sudo systemctl reboot`). After it returns, confirm SSH comes back on all paths **unaided**:
 ```bash
 ssh sjat@100.99.146.14 true && echo "wt0 OK after reboot"
 .venv/bin/ansible ubongo -i inventories/production/ -m ping
 ```
 Expected: SSH returns with no manual intervention (no `ListenAddress`, so nothing to race). Only now is the cutover complete.
 - [ ] **Step 6: Update STATUS + ROADMAP**
 - In `STATUS.md`: in the `roles/base/` row of "Scaffolded but empty", change the firewall note — the `firewall` concern is now **applied to ubongo** as INPUT-only default-deny (it is no longer "not yet applied to any host"); note the `base__firewall_input_only` knob and that the forward default-deny still awaits the `docker_host` drop-in for real service hosts. In the ubongo control-node row, mark the "Pending" default-deny item as done.
 - In `docs/ROADMAP.md`: mark **mesh-hardening sub-project 2 (ubongo default-deny) done**; the remaining follow-ons are sub-project 1 (askari SSH→`wt0` *redesign*) and sub-project 3 (NetBird ACL). Update the "Next step" section accordingly.
 ```bash
 git add STATUS.md docs/ROADMAP.md
 git commit -m "docs: ubongo INPUT-only default-deny applied (mesh-hardening 2/3 done)
 Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
 ```
 - [ ] **Step 7: Push**
 Run: `git push origin main`
 ---
 ## Self-review (against the spec)
 - **§ Design — INPUT-only default-deny** → Task 1 (forward-policy knob) + Task 2 (enabled on ubongo). ✓
 - **§ Design — admin-addrs (operator workstations on LAN)** → Task 1 (`base__firewall_admin_addrs` + template loop) + Task 2 (`10.20.10.50` mamba, `10.20.10.17`). ✓
 - **§ Design — no sshd ListenAddress change** → nothing touches `ssh.yml`/`sshd_hardening.conf.j2`; only nftables. ✓ (verified: Tasks 1–3 file lists exclude them).
 - **§ allow-list** (lo, established, wt0, ssh-from-control, admin-addr, icmp; forward accept) → template already renders lo/established/wt0/control/icmp; Task 1 adds admin-addr + forward-accept. ✓
 - **§ Why-safe (incident signals 1/2/3/6)** → signal 1 (forward accept, Task 1); signal 2 (no ListenAddress); signal 3 (ubongo keeps LAN + console); signal 6 (Task 4 harness reboot + Task 5 step 5 reboot-while-console). ✓
 - **§ New & changed code** (defaults, template, molecule, group_vars/control, integration profile) → Tasks 1–3. ✓
 - **§ admin raw-leases + revisit** → Task 2 comments record both leases + the OPNsense-reservation revisit trigger; backstop (wt0) noted; flagged in `FRICTION.md`. ✓
 - **§ Testing** (Molecule render asserts; `make test-integration HOST=ubongo`; live checks) → Task 1 (Molecule), Task 4 (harness), Task 5 step 4 (live). ✓ Coverage split (default in Molecule, input_only on the VM) noted in Task 1.
 - **§ Staged cutover (signal-6 order)** → Task 5 steps 1–7; reboot-recovery (step 5) precedes nothing that retires a break-glass (the console is permanent). ✓
 - **§ Risks/rollback** → auto-rollback (Task 5 step 3), redundant paths + physical console, raw-lease backstop. ✓
 - **Type/name consistency:** `base__firewall_input_only` (bool) and `base__firewall_admin_addrs` (list) are spelled identically in defaults, template, converge, group_vars, and the overlay. `integration_profile` is spelled identically in both overlays and the three gates in `verify.yml`. ✓
 - **Placeholder scan:** no TBD/TODO; every code/command step shows the actual content. ✓
--- a/docs/superpowers/plans/2026-06-20-mesh-spof-accept-resilience.md
+++ b/docs/superpowers/plans/2026-06-20-mesh-spof-accept-resilience.md
@ -1,237 +0,0 @@
 # Mesh SPOF — accept + targeted resilience — Implementation Plan
 > **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
 **Goal:** Accept askari's single-coordinator SPOF as a documented availability trade-off, and harden the one real gap — local-DNS dependence on managed mesh hosts — with a `base` mesh knob that pins the coordinator FQDN in `/etc/hosts` so a local-DNS hiccup can't strand the mesh.
 **Architecture:** One additive, idempotent `base` `mesh`-concern task (a `/etc/hosts` line via `lineinfile`, gated on a new opt-in knob), Molecule-tested; plus documentation (accepted-risk R8 + an ADR-016 availability amendment + STATUS/ROADMAP). No new infra, no Terraform, no live-deploy gate.
 **Tech Stack:** Ansible (`base` role, `lineinfile`), Molecule (Debian 13), Markdown docs.
 **Spec:** `docs/superpowers/specs/2026-06-20-mesh-spof-accept-resilience-design.md`
 ## Global Constraints
 - **FQCN always** (`ansible.builtin.*`); role defaults use the `rolename__var` namespace.
 - **No new collection** — derive the coordinator FQDN with builtin `regex_replace` (NOT `urlsplit`, which would pull in `community.general`).
 - The pin is **opt-in and additive**: gated on `base__mesh_enabled | bool` AND `base__mesh_coordinator_pin | length > 0`. Empty knob (the default) = a clean no-op. The coordinator host (`askari`/`offsite_hosts`) is **exempt** — leave its pin empty.
 - **askari's coordinator IP = `77.42.120.136`** (stable WAN; the A record for `netbird.askari.wingu.me`); ubongo is in the `control` group.
 - `make lint` clean + `rbw unlocked` before any commit (the pre-commit hook decrypts the vault).
 - **No new infra** — no P2P, no second relay/coordinator, no Terraform. The coordinator off-site backup is **out of scope** (ADR-022 kickoff).
 - Tags: the new task carries the `mesh` concern tag (it belongs to the mesh concern).
 ---
 ### Task 1: `base` mesh coordinator-FQDN `/etc/hosts` pin (DNS-resilience)
 Add an opt-in knob that pins the coordinator FQDN (derived from `base__mesh_management_url`) to a stable IP in `/etc/hosts`, so a managed mesh host survives a local-DNS failure. TDD'd through the role's Molecule scenario (which already exercises the `mesh` concern with `manage: false`).
 **Files:**
 - Modify: `roles/base/defaults/main.yml` (add the knob after the mesh block, ~line 53)
 - Modify: `roles/base/tasks/mesh.yml` (append the pin task)
 - Modify: `roles/base/molecule/default/converge.yml` (add a fixture pin to the vars block)
 - Modify: `roles/base/molecule/default/verify.yml` (assert the rendered `/etc/hosts` line)
 - Modify: `inventories/production/group_vars/control/vars.yml` (set the pin for ubongo)
 **Interfaces:**
 - Produces: role default `base__mesh_coordinator_pin` (string, default `""`); when set + `base__mesh_enabled`, an `/etc/hosts` line `<pin-ip> <fqdn>` where `<fqdn>` is `base__mesh_management_url` minus scheme/port/path.
 - [ ] **Step 1: Write the failing Molecule test (fixture + assertion)**
 In `roles/base/molecule/default/converge.yml`, add one line to the `vars:` block (after `base__mesh_setup_key`, ~line 15):
 ```yaml
    base__mesh_coordinator_pin: "203.0.113.9"   # fixture coordinator IP (TEST-NET-3); pins the FQDN from base__mesh_management_url
 ```
 In `roles/base/molecule/default/verify.yml`, append to the `tasks:` list (after the mesh no-op assertion at the end):
 ```yaml
    - name: Read /etc/hosts (coordinator pin)
      ansible.builtin.slurp:
        src: /etc/hosts
      register: _etchosts
    - name: Assert the coordinator FQDN is pinned to the fixture IP (DNS-resilience / R8)
      ansible.builtin.assert:
        that:
          - "'203.0.113.9 netbird.askari.wingu.me' in (_etchosts.content | b64decode)"
        fail_msg: "base__mesh_coordinator_pin did not render the /etc/hosts coordinator pin"
        success_msg: "coordinator FQDN pinned in /etc/hosts"
 ```
 - [ ] **Step 2: Run Molecule to verify it fails**
 Run: `make test ROLE=base`
 Expected: FAIL at "Assert the coordinator FQDN is pinned…" — no pin task exists yet, so `/etc/hosts` has no such line.
 - [ ] **Step 3: Add the default knob**
 In `roles/base/defaults/main.yml`, after `base__mesh_version` (~line 53), add:
 ```yaml
 # DNS-resilience (ADR-016 availability / accepted-risk R8): when set to the coordinator's
 # stable IP, pin the coordinator FQDN (derived from base__mesh_management_url) in /etc/hosts
 # so a managed mesh host survives a local-DNS hiccup (the 2026-06-18 incident class). Empty
 # = no pin. The coordinator host itself (askari/offsite_hosts) is exempt — leave it empty.
 base__mesh_coordinator_pin: ""
 ```
 - [ ] **Step 4: Add the pin task**
 Append to `roles/base/tasks/mesh.yml`:
 ```yaml
 - name: Pin the NetBird coordinator FQDN in /etc/hosts (DNS-resilience, ADR-016 availability / R8)
  ansible.builtin.lineinfile:
    path: /etc/hosts
    regexp: '\s{{ _coordinator_fqdn | regex_escape }}$'
    line: "{{ base__mesh_coordinator_pin }} {{ _coordinator_fqdn }}"
    state: present
  vars:
    _coordinator_fqdn: "{{ base__mesh_management_url | regex_replace('^https?://', '') | regex_replace('[:/].*', '') }}"
  when:
    - base__mesh_enabled | bool
    - base__mesh_coordinator_pin | length > 0
  tags: [mesh]
 ```
 (`_coordinator_fqdn` strips the scheme then anything from the first `:`/`/` → `netbird.askari.wingu.me`. The `regexp` matches an existing ` <fqdn>` at line end so a changed IP updates in place — idempotent; absent → appended.)
 - [ ] **Step 5: Run Molecule to verify it passes**
 Run: `make test ROLE=base`
 Expected: PASS — the new assertion is green and Molecule idempotence is clean (re-running the pin task reports `ok`, not `changed`). The idempotence pass is what proves the `regexp` matches the line it wrote.
 > Note: the empty-knob no-op (the production default for non-mesh / coordinator hosts) is guaranteed by the `when: base__mesh_coordinator_pin | length > 0` gate, not a separate Molecule case — a single converge can't hold both var-states, and boma uses one default scenario per role. The fixture exercises the meaningful path (rendering + FQDN extraction + idempotence).
 - [ ] **Step 6: Wire the production pin for ubongo**
 In `inventories/production/group_vars/control/vars.yml`, after the `base__mesh_enabled: true` block, add:
 ```yaml
 # DNS-resilience (ADR-016 availability / R8): pin the coordinator FQDN to askari's stable WAN
 # IP in /etc/hosts so a local-DNS hiccup (the 2026-06-18 incident class) can't strand ubongo's
 # mesh. askari (offsite_hosts) is exempt — it reaches the coordinator locally.
 base__mesh_coordinator_pin: "77.42.120.136"
 ```
 - [ ] **Step 7: Lint and commit**
 ```bash
 rbw unlocked && make lint
 git add roles/base/defaults/main.yml roles/base/tasks/mesh.yml \
        roles/base/molecule/default/converge.yml roles/base/molecule/default/verify.yml \
        inventories/production/group_vars/control/vars.yml
 git commit -m "feat(base): pin the NetBird coordinator FQDN in /etc/hosts (mesh DNS-resilience)" \
           -m "Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
 ```
 ---
 ### Task 2: Accept + document the SPOF (R8, ADR-016 amendment, STATUS/ROADMAP)
 Record the single-coordinator SPOF as a conscious, revisitable trade-off and capture the availability analysis + recovery. Pure documentation; references the pin from Task 1.
 **Files:**
 - Modify: `docs/security/accepted-risks.md` (add row R8; bump the review date)
 - Modify: `docs/decisions/016-mesh-vpn.md` (add the availability amendment subsection)
 - Modify: `STATUS.md` (note the SPOF accepted + the coordinator-pin knob)
 - Modify: `docs/ROADMAP.md` (mark sub-project 3 addressed; surface ADR-022 backup + ACL as next)
 - [ ] **Step 1: Add accepted-risk R8**
 In `docs/security/accepted-risks.md`, add this row to the table after R7:
 ```markdown
 | R8 | **Single off-site mesh coordinator is an availability SPOF for remote mesh access** — `askari` hosts the only NetBird management/signal/relay (ADR-016); while askari is down, every *relayed* peer (all of `ubongo`'s, by the deliberate default-deny posture) loses remote mesh reachability and the control plane pauses. The `netbird_coordinator` store also has **no off-site backup yet** (BACKUP.md), so an askari loss loses mesh control-plane state until rebuilt | Inherent to ADR-016's deliberate single off-site coordinator (sovereignty; survives a homelab outage). **Narrow blast radius:** the mesh is not a gateway (`wt0` routes only `100.99.0.0/16`) — LAN, intra-cluster, and local-service traffic are unaffected; only remote/off-LAN mesh access breaks, and only when off-LAN *and* askari is down at once. askari is a reliable always-on VPS; mitigations: client + managed-host coordinator-FQDN DNS pin (`base__mesh_coordinator_pin`; runbook), documented `/setup` rebuild | askari proves unreliable; the cluster grows to depend on the mesh for intra-node traffic; remote mesh access becomes business-critical; or the ADR-022 backup role lands (closes the state-loss half) |
 ```
 Then update the closing line's date: change `_Last reviewed: 2026-06-18.` to `_Last reviewed: 2026-06-20.`
 - [ ] **Step 2: Add the ADR-016 availability amendment**
 In `docs/decisions/016-mesh-vpn.md`, add this subsection immediately before the `## Related` section:
 ```markdown
 ## Availability — an `askari` outage (amendment 2026-06-20)
 The coordinator is deliberately **single** (one off-site host). Recorded here so its
 availability envelope is explicit; accepted as **R8** (`docs/security/accepted-risks.md`).
 The mesh is **not** a default gateway — `wt0` routes only the overlay CIDR (`100.99.0.0/16`);
 normal traffic uses the host's default route. So an `askari` outage has a **narrow blast
 radius**:
 | Traffic | `askari` down |
 |---|---|
 | LAN device → LAN service (direct / via reverse proxy) | unaffected |
 | node ↔ node over LAN IPs (cluster) | unaffected |
 | node ↔ node same-LAN over mesh IPs | unaffected (direct P2P) |
 | **road-warrior → `ubongo` (remote, relayed)** | **breaks** |
 | mesh control plane (new enrol / ACL change / re-handshake) | pauses |
 Only remote (off-LAN) mesh access to peers is lost, and only when off-LAN **and** `askari`
 is down simultaneously. On-LAN access to `ubongo` never depends on the mesh (Recovery &
 operations, above).
 **Recovery:** rebuild the coordinator (`/setup` + re-enrol peers, M5) or restore from backup
 once ADR-022 lands; the `netbird_coordinator` store backup is the **next sub-project** (its
 gap is named in R8 and `BACKUP.md`). Client/road-warrior break-glass (reliable resolvers +
 the coordinator-FQDN `/etc/hosts` pin) is in `docs/runbooks/netbird-client.md`; managed mesh
 hosts get the same pin via `base__mesh_coordinator_pin`.
 **Not pursued** (deliberately, given the narrow blast radius): direct P2P (punctures the
 default-deny posture; only helps established sessions), a second relay (needs another public
 host / reintroduces the home public surface), a second coordinator (unsupported by
 self-hosted NetBird; against this ADR).
 ```
 - [ ] **Step 3: Update STATUS.md**
 In `STATUS.md`, in the `roles/base/` row, append a sentence noting the pin and the accepted SPOF to the end of the firewall/mesh description (before the closing ` |`):
 ```markdown
 The `mesh` concern also pins the coordinator FQDN in `/etc/hosts` (`base__mesh_coordinator_pin`, set for ubongo) so a local-DNS hiccup can't strand the mesh; the single-coordinator SPOF is an accepted availability risk (R8, ADR-016 availability amendment).
 ```
 - [ ] **Step 4: Update ROADMAP.md**
 In `docs/ROADMAP.md`, in the "Remaining mesh-hardening sub-projects" list, change item 3 from the SPOF-reduction "(next)" wording to **DONE**, and make the NetBird ACL the next item. Replace the current items 3–4 block with:
 ```markdown
 3. ~~**askari relay-SPOF reduction**~~ → **DONE (2026-06-20)** — assessed + **accepted** as a
   documented availability risk (R8 + ADR-016 availability amendment): the blast radius is
   narrow (LAN/intra-cluster/local traffic never touch askari), so no P2P / second relay /
   second coordinator was warranted. Hardened the one real gap — a managed-host coordinator-FQDN
   DNS pin (`base__mesh_coordinator_pin`). The coordinator off-site backup gap is handed to ADR-022.
 4. **NetBird ACL off Allow-All** to scoped policies (open mechanism question — no headless API path).
 5. **ADR-022 backup kickoff** — off-site backup of the `netbird_coordinator` store (named in R8 /
   BACKUP.md) as the first slice of the backup role (restic + the `fisi` pull node).
 ```
 - [ ] **Step 5: Consistency check + commit**
 ```bash
 grep -q "^| R8 " docs/security/accepted-risks.md && \
 grep -q "Availability — an .askari. outage" docs/decisions/016-mesh-vpn.md && \
 echo "docs OK"
 ```
 Expected: `docs OK`.
 ```bash
 rbw unlocked
 git add docs/security/accepted-risks.md docs/decisions/016-mesh-vpn.md STATUS.md docs/ROADMAP.md
 git commit -m "docs(security): accept the single-coordinator mesh SPOF (R8) + ADR-016 availability amendment" \
           -m "Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
 ```
 ---
 ## Notes / out of scope
 - **Coordinator off-site backup → ADR-022 kickoff** (next sub-project). Not built here.
 - **Direct P2P / second relay / second coordinator** — deliberately not pursued (spec §Design).
 - No live deploy is required to land this — the pin is additive/idempotent and applies to ubongo on the next routine `base` apply (`make deploy PLAYBOOK=site LIMIT=ubongo`, operator's discretion). Optional post-deploy spot-check: `getent hosts netbird.askari.wingu.me` on ubongo resolves to `77.42.120.136`.
--- a/docs/superpowers/specs/2026-06-17-mesh-hardening-askari-ssh-wt0-design.md
+++ b/docs/superpowers/specs/2026-06-17-mesh-hardening-askari-ssh-wt0-design.md
@ -1,156 +0,0 @@
 # Spec — Mesh-hardening (1 of 3): move askari's SSH onto `wt0`
 Status: Accepted (2026-06-17)
 ## Context & scope
 The **mesh-hardening follow-on** was deferred from M5 (ROADMAP). It was decomposed into
 **three independent sub-projects**, each with its own spec → plan → implementation cycle:
 1. **askari SSH → `wt0`** ← *this spec*
 2. ubongo nftables default-deny + `ssh-from-control` (its own later spec)
 3. NetBird ACL off Allow-All → scoped policies (its own later spec)
 This spec covers **only (1)**. It makes askari's SSH reachable **only over the NetBird mesh
 interface `wt0`** and closes the WAN `:22` surface at both the host and the Hetzner Cloud
 Firewall. It does **not** touch ubongo, the NetBird ACL (stays Allow-All for now — one
 moving access-layer at a time), or askari's public service exposure (Caddy 80/443, NetBird
 STUN 3478 stay on the WAN).
 Current state (STATUS.md): askari is reached at `ansible_host: 77.42.120.136` (WAN, in the
 TF-generated `inventories/production/offsite.yml`); `wt0` is up at `100.99.226.39`
 (Management+Signal Connected, M5); the base nftables `firewall` concern is **built but not
 applied** to askari (the Hetzner Cloud Firewall is its perimeter today); the Hetzner Cloud
 Firewall (`terraform/modules/hetzner_vm`) opens `:22` from `var.ssh_admin_cidrs` plus
 80/443/3478 from anywhere.
 ## Goal / success criteria
 - SSH to askari succeeds over `wt0` (from ubongo) and **fails from any off-mesh source**.
 - The WAN `:22` surface is closed at **both** layers (host nftables = `wt0`-only; Hetzner
  Cloud Firewall drops the `:22` rule).
 - Public services are unaffected: `https://test.askari.wingu.me` and
  `https://netbird.askari.wingu.me` serve valid certs; STUN `3478/udp` still answers.
 - Ansible manages askari over `wt0`.
 - Break-glass is the **Hetzner web console** (out-of-band; works even if the mesh is down).
 - A reboot of askari does **not** lock SSH out (the boot-race below is solved).
 ## Design — three enforcement layers (defense-in-depth)
 1. **sshd** binds `ListenAddress` to askari's `wt0` IP only, so it does not accept on WAN.
 2. **host nftables** (base `firewall` concern, ADR-020): catalog-driven default-deny;
   `:22` allowed only via `iifname "wt0"` (the interface-name match that survives `wt0`
   being absent — see `docs/testing/gotchas.md`); public service ports stay open on WAN.
 3. **Hetzner Cloud Firewall** (Terraform): the `:22` `ssh_admin_cidrs` rule is removed;
   80/443/3478 stay.
 ## The boot-race fix (load-bearing)
 `wt0` is brought up by NetBird **after** boot, so at sshd start the `wt0` IP may not exist
 yet. A plain `ListenAddress 100.99.226.39` would fail to bind → sshd exits → **lockout on
 reboot**. Solution:
 - **`net.ipv4.ip_nonlocal_bind = 1`** via a sysctl drop-in (`ansible.posix.sysctl`,
  persisted under `/etc/sysctl.d/`). This lets sshd bind the `wt0` address even before the
  interface exists; once `wt0` comes up with that IP, traffic is delivered to the existing
  listener — no reload needed.
 - The sshd drop-in **fails closed**: the mesh IP is resolved (see below) and the play
  **asserts it is non-empty** before rendering. An empty `ListenAddress` would silently
  fall back to listening on all interfaces, defeating the restriction — that must never
  render.
 **Mesh-IP source (decided):** the **live `wt0` fact** `ansible_wt0.ipv4.address`, gathered
 at apply time (`wt0` is up during the play, since M5), with a **`host_var` fallback**
 (`base__ssh_listen_addr`, default `""`) and a fail-closed `assert` that one of them yielded
 a non-empty address. Live fact is preferred (correct even if NetBird reassigns the IP);
 the host_var is an explicit override / belt.
 ## New & changed code
 **Role `base` (the `hardening` + `firewall` concerns):**
 - `roles/base/defaults/main.yml` — add:
  - `base__ssh_listen_mesh_only: false` — opt-in; when `true`, sshd binds the mesh IP only.
  - `base__ssh_listen_addr: ""` — optional explicit mesh-IP override (fallback to the
    `ansible_wt0` fact).
 - `roles/base/tasks/ssh.yml` —
  - resolve the mesh IP (`base__ssh_listen_addr` or `ansible_wt0.ipv4.address`) into a fact;
  - `assert` it is non-empty **when** `base__ssh_listen_mesh_only`;
  - set `net.ipv4.ip_nonlocal_bind = 1` (sysctl drop-in) under the same condition.
 - `roles/base/templates/sshd_hardening.conf.j2` — append a conditional
  `ListenAddress {{ resolved_mesh_ip }}` block guarded by `base__ssh_listen_mesh_only`
  (unset → unchanged behaviour: listen on all). Keep the existing `sshd -t` validation.
 **Inventory:**
 - `inventories/production/host_vars/askari.yml` (new) — `ansible_host: 100.99.226.39`
  (overrides the TF-generated `offsite.yml`; host_vars are not regenerated by
  `tf_to_inventory.py`). A header comment explains why.
 - `inventories/production/group_vars/offsite_hosts/vars.yml` — add
  `base__ssh_listen_mesh_only: true`; ensure `base__firewall_apply: true`.
  (`base__mesh_enabled` is already `true` for askari — set in M5 — and is a precondition,
  not a change here.)
 **Firewall catalog** (`inventories/production/group_vars/all/firewall.yml`):
 - Enumerate askari's required ingress so catalog-driven default-deny does **not** drop a
  live public service. Derived from the existing `reverse_proxy` + `netbird_coordinator`
  definitions: `:22/tcp` on the **mesh** zone (`wt0`); `80,443/tcp` + `3478/udp` on the
  **public** zone (WAN). The exact catalog/zone YAML is finalised in the implementation
  plan against the `resolve_firewall_rules` filter's schema.
 **Terraform** (`terraform/environments/offsite` + `terraform/modules/hetzner_vm`):
 - Remove the WAN `:22` ingress rule (e.g. drop `ssh_admin_cidrs` from the firewall, or set
  it empty and guard the rule). Keep 80/443/3478. Applied via `make tf-plan/apply
  TF_ENV=offsite` (plan shown before apply).
 ## Staged cutover — a working path at every step
 1. **Pre-check:** confirm `ssh sjat@100.99.226.39` and an `ansible askari -m ping` forced
   over `wt0` both succeed **before** changing anything.
 2. **Repoint Ansible:** add `host_vars/askari.yml` (`ansible_host` = `wt0` IP); verify
   `ansible askari -m ping` runs over the mesh. WAN `:22` still open as a fallback here.
 3. **Apply `base` (firewall + sshd together):** one `make deploy PLAYBOOK=site LIMIT=askari`
   converge applies catalog default-deny (`:22` on `wt0` + public ports) **and** the sshd
   `ListenAddress`=mesh + `ip_nonlocal_bind` drop-in. The firewall concern's
   `reset_connection` → `wait_for_connection` (now over `wt0`) plus the armed auto-rollback
   timer (`base__firewall_rollback_timeout`, 45 s) is the safety gate — a bad ruleset
   reverts itself. The sshd `reload` cannot drop the in-flight `wt0` session. Verify the
   public services still respond.
 4. **Retire the Hetzner WAN `:22`:** the Terraform change above; `make tf-plan
   TF_ENV=offsite` (review) → `make tf-apply TF_ENV=offsite`. Verify: `wt0` SSH works; off-mesh `nc -vz
   77.42.120.136 22` is refused/times out; `:443` open; STUN answers.
 ## Testing
 - **Molecule** (base `default` scenario; `wt0` absent in-container, `base__firewall_apply:
  false` render-only): assert (a) the rendered nftables allows `:22` via `iifname "wt0"`;
  (b) with `base__ssh_listen_mesh_only: true` + a fixture mesh IP, the sshd drop-in renders
  `ListenAddress <ip>` and `sshd -t` passes; (c) with the flag set but **no** resolvable
  mesh IP, the play **fails closed** (the `assert`); (d) the `ip_nonlocal_bind` sysctl task
  is present. Keep existing firewall/hardening assertions green.
 - **Live, out-of-band:** post-cutover, from an off-mesh host `nc -vz 77.42.120.136 22` →
  refused; `:443` → open; from ubongo over `wt0`, SSH + `ansible -m ping` succeed; reboot
  askari (Hetzner console) and confirm SSH-over-`wt0` returns without intervention.
 ## Risks & rollback
 - **Mid-cutover lockout:** mitigated by the staged order (a path open at each step), the
  firewall auto-rollback timer, and `ansible_host`=`wt0` so the connectivity confirm tests
  the real new path.
 - **Reboot lockout:** mitigated by `ip_nonlocal_bind` (sshd binds `wt0` regardless of
  interface timing) + the fail-closed assert (never silently listen-all).
 - **Default-deny breaks a public service:** mitigated by enumerating all live ingress into
  the catalog and the §Testing service checks; reversible by re-running with
  `base__firewall_apply: false` or widening the catalog.
 - **Ultimate break-glass:** the Hetzner web console (out-of-band). The TF `:22` rule is
  trivially re-addable.
 ## Out of scope / follow-ons
 - ubongo default-deny + `ssh-from-control` (sub-project 2).
 - NetBird ACL off Allow-All (sub-project 3) — until then any enrolled peer can reach
  askari's `wt0:22`; scoping that is sub-project 3's job.
 - `/check-access` (ADR-021) live verification — designed, build still pending.
 - STATUS.md / ROADMAP updates land with the implementation, not this spec.
--- a/docs/superpowers/specs/2026-06-18-local-vm-integration-testing-design.md
+++ b/docs/superpowers/specs/2026-06-18-local-vm-integration-testing-design.md
@ -1,267 +0,0 @@
 # Local VM integration testing on ubongo (design)
 **Status:** Designed, not built. Resolves `docs/TODO.md` item 2.4 (Local VM integration
 testing on ubongo, pre-deploy).
 **Date:** 2026-06-18.
 **Implements:** the concrete build of ADR-008 Level 2/3 (staging/integration), deferred
 for lack of hosts but hostable on ubongo. To be recorded as **ADR-025**.
 ## Context
 Molecule (ADR-008 Level 1) tests each role in a single Docker container: one `converge`,
 no real kernel netfilter, no real Docker daemon in the loop, and **no reboot**. That
 structurally cannot catch an entire class of bug — reboot-survivability, host-firewall ×
 Docker interaction, and boot-ordering — which is exactly the class that caused the
 **2026-06-17 mesh-hardening incident**:
 - `base`'s nftables `forward { policy drop; }` killed the askari Docker host **on reboot**
  (nftables loaded its default-deny *before* Docker, breaking published-port DNAT and
  inter-container forwarding → public services + the mesh went down). It had worked right
  after `make deploy`, when Docker's runtime rules still coexisted. (FRICTION 2026-06-17 #1.)
 - `ip_nonlocal_bind` did **not** beat the sshd boot-race; sshd bound to the `wt0` mesh IP
  had no listener at boot. (FRICTION #2.)
 - The coordinator host could not bootstrap the mesh it itself hosts. (FRICTION #3.)
 - NetBird `netbird-server` FATAL-loops on the GeoLite2 download when egress is lost — and
  egress was lost when `nft flush` wiped Docker's NAT masquerade. (FRICTION #4.)
 Recovery needed the Hetzner console + a WAN-SSH break-glass. The lesson, already crystallised
 as a standing rule: *firewall/sshd/boot changes must be tested on a real VM with a real
 reboot before they touch a live host, and a non-mesh break-glass must be kept.*
 This spec defines a way for the agent to spin up **throwaway KVM VMs locally on ubongo**
 that mirror a target host (real Docker, a real reboot, the real role apply) and validate
 risky infra changes **before** a live deploy. ubongo can host this today:
 > verified: ubongo KVM capability · Bash (this session) · `/dev/kvm` present + accessible
 > (kvm group), Intel VT-x (`vmx`) enabled, 8 vCPU (i3-10100T), ~13 GiB RAM free of 16, ~198
 > GiB disk free; libvirt/QEMU/Vagrant **not yet installed** · 2026-06-18.
 ## Goals
 - Reproduce the 2026-06-17 bug class locally: real OS boot, real Docker, real netfilter,
  the real role apply, a **real reboot**, then outcome assertions.
 - Let the agent drive the full loop autonomously: provision → apply → reboot → assert →
  teardown, with diagnostics captured on failure.
 - Mirror a *real* host from inventory (first profile: "be askari"), so the apply is
  faithful, not synthetic.
 - Be the concrete tool that operationalises the standing "test risky infra before live
  deploy" rule.
 ## Non-goals (v1)
 - Not a production hypervisor on ubongo (reconciles ADR-015 — see Governance).
 - Not nested Proxmox; the provisioning *chrome* (template clone / Terraform) is **not**
  mirrored — every incident bug lives in the boot/kernel/Docker layer, not provisioning.
 - Not a multi-VM mini-cluster; one VM at a time. (All six 2026-06-17 signals occurred on a
  single host that was Docker host + coordinator + mesh peer.) Multi-VM is a later extension.
 - Not a CI gate; this is an interactive, agent-driven pre-deploy check on ubongo (CI stays
  lint + Molecule per ADR-008/010).
 ## Decisions (from the 2026-06-18 brainstorm)
 1. **Virtualisation approach: libvirt/KVM directly (Approach A).** A golden Debian-13
   genericcloud qcow2 cached locally; each run boots an ephemeral qcow2 overlay backed by
   it, seeded via cloud-init NoCloud, driven by a **stdlib-only** Python script over
   `virsh` (no `libvirt-python` dependency). Chosen over Vagrant+vagrant-libvirt (Ruby/plugin
   footprint, box drift from the real cloud image) and terraform-provider-libvirt (poor at
   the imperative apply→reboot→re-apply sequence, throwaway state, blurs ADR-006's prod-VM
   boundary). Lightest footprint on a 15 GiB control node; full control of the reboot step;
   the same Debian cloud image real hosts boot.
 2. **Fidelity envelope: real OS/Docker/netfilter/reboot, not the Proxmox provisioning
   path.** A lightweight local hypervisor is enough because the bugs are post-boot.
 3. **Scope: one throwaway VM at a time, instantiated from a real host's inventory.** First
   profile: **"be askari"** (Docker host + NetBird coordinator + mesh peer on one box). The
   mechanism is generic — later "be" any host by swapping which inventory host it mirrors.
 4. **Acceptance is self-validating against the real failure.** Done = the harness, on a
   local VM, applies `base` (firewall on) to a Docker host, reboots, and **observes the
   2026-06-17 breakage** (Docker forwarding dead / services down); then, with the
   `docker_host` container-forward drop-in in place, the same run **survives the reboot**.
   If that first (pre-fix) run passes instead of failing, the harness is not faithful.
 5. **Tiered cert fidelity via a `--certs` knob** (DNS-01 is what makes real certs possible
   with no public inbound — validation is out-of-band via a Gandi TXT record; the VM needs
   only outbound to ACME + Gandi, which the NAT net provides):
   - `internal` (default) — Caddy `tls internal`, zero deps, instant; for the incident repro
     and runs where certs aren't under test.
   - `le-staging` — real DNS-01 ACME against Let's Encrypt **staging**: real caddy-gandi
     path, real cert files/renewal, untrusted root, effectively no rate limits. **Built in v1.**
   - `le-prod-wildcard` — a real trusted `*.test.wingu.me` wildcard, **issued once,
     persisted on ubongo, reused** across runs. Wired in v1 but **on-demand only**; its
     accepted risk is recorded when used (prod Gandi credential reaching an ephemeral VM;
     transient TXT in the real `wingu.me` zone). A deliberate "no-egress" failure scenario
     (to reproduce FRICTION #4) forces `internal`, since ACME needs egress.
 6. **The toolchain is Ansible-managed**, not hand-installed: a new non-service role
   (`integration_test`, `control` group) installs/enables libvirt+QEMU reproducibly. The
   repo owns ubongo's state. The driver manages *images* lazily on first run (keeps the role
   lean; avoids fiddly download/refresh logic in Ansible).
 7. **Stubs live in an overlay file, never in the real inventory** — so `make tf-inventory`
   and "don't edit inventory directly" stay intact, and every stub is explicit and reviewable.
 8. **A new ADR-025** records this decision (approach + alternatives + cert tiers); ADR-008
   gains a pointer and redirects its "what Molecule does NOT test" gaps here.
 ## Architecture — five isolated components
 | # | Component | Purpose | Location |
 |---|-----------|---------|----------|
 | 1 | **`integration_test` role** (non-service, `control` group) | Install/enable libvirt+QEMU+virtinst, add `sjat`/`claude` to `libvirt` group, create the image-cache dir, drop the driver. Idempotent, Molecule-tested. | `roles/integration_test/` |
 | 2 | **`integration-vm.py` driver** | Stdlib-only lifecycle over `virsh`: `up / apply / reboot / assert / cycle / reset / down / prune / console`. Lazily ensures the golden image (download + checksum). | `scripts/integration-vm.py` |
 | 3 | **Profiles + var overlays** | Make a VM "become" a host: pull that host's real group_vars/host_vars + layer a small explicit overlay (cert tier, in-VM coordinator endpoint, VM connection). | `tests/integration/overrides/<host>.yml` |
 | 4 | **Verify playbook** | Outcome-based post-reboot assertions (Docker up, published-port DNAT alive, `nft` sane, service responds, `wt0` up), reusing Molecule's `verify.yml` philosophy. | `tests/integration/verify.yml` |
 | 5 | **Makefile target** | `make test-integration HOST=<name> [CERTS=...] [KEEP=1]` → `cycle`; `make test-integration-clean` → `prune`. Documented in CLAUDE.md's command table. | `Makefile` |
 ## Lifecycle / data flow
 `make test-integration HOST=askari` drives:
 ```
 1. ensure golden image    Debian-13 genericcloud qcow2, cached + checksum-verified
 2. ephemeral overlay      qcow2 backed by golden (throwaway; never mutate golden)
 3. cloud-init NoCloud      seed hostname + ansible user + ubongo's SSH key + NIC
 4. virt-install --import   boot on an isolated libvirt NAT net (DHCP IP + outbound NAT)
 5. wait for SSH            IP via `virsh domifaddr --source lease` (guest-agent optional)
 6. transient inventory     askari's real vars + ansible_host=<lease IP> + stub overlay
 7. ansible-playbook site   THE REAL APPLY (base + docker_host + reverse_proxy + coordinator)
 8. [snapshot post-apply]   optional reset point for fast re-runs
 9. virsh reboot ──────────┐  ← the step Molecule structurally cannot do
 10. wait for SSH           ┘
 11. ansible-playbook verify outcome assertions; THIS is where the incident surfaces
 12. report + teardown       pass/fail; on fail keep VM + dump diagnostics; else destroy overlay
 ```
 Steps 1–7 build a real Docker daemon with real published-port DNAT to break; step 9 is a
 real kernel reboot, so nftables loads default-deny before Docker exactly as on askari.
 ## Fidelity boundary & cert tiers
 **Faithful where the bug lives:** real kernel, real netfilter, real Docker with
 published-port DNAT, the real role apply, a real reboot, and the coordinator running *inside
 the VM* so the VM is its own mesh peer — reproducing the circular mesh-bootstrap (FRICTION #3)
 on one box.
 **Stubbed where it needs the public internet** (explicit, in the overlay): LE certs via the
 `--certs` knob (Decision 5); public DNS (`askari.wingu.me`) → local resolution; NetBird
 geo-DB → pre-seeded or requirement disabled (which is *also* the FRICTION #4 fix, so the
 harness can test both the FATAL-loop and its remedy).
 ## Acceptance test (self-validating)
 1. Run the cycle on **today's** `base` (firewall on, no `docker_host` container-forward
   drop-in) → **step 11 must FAIL after reboot** (Docker forwarding dead, services down).
 2. Implement the `docker_host` container-forward rules (the pending fix STATUS.md names) →
   re-run → **step 11 must PASS across the reboot.**
 **Scope boundary:** the *harness* is this plan's deliverable. The `docker_host`
 container-forward fix is a separate work item (FRICTION 2026-06-17 #1). v1's acceptance
 deliberately spans both, because a credible harness must demonstrate **both** a true-negative
 (red on the broken state) and a true-positive (green on the fixed state) — otherwise we have
 only ever watched the assert go red. The plan decides sequencing: build the small
 `docker_host` drop-in as the green-half of acceptance, or consume it if built separately
 first. Minimum credible v1 is the red half (faithful reproduction); full acceptance is red→green.
 This one round-trip proves the harness reproduces the incident, the fix works, and the loop
 can be trusted for the next risky change before it touches a live host.
 ## Robustness, isolation & teardown
 **Failure leaves evidence** (catching a bug is the point):
 | Step fails | Behaviour |
 |---|---|
 | Golden image (1) | Fail fast, clear message; image cached (one-time cost) |
 | Boot / first SSH (4–5) | **Capture serial console to a log file**, fail with its tail — the automated equivalent of the Hetzner console (ties to TODO 10.8) |
 | Apply (7) | Keep VM, surface Ansible output, dump diagnostics |
 | **No SSH after reboot (9–10)** | The classic incident signature; FAIL, keep VM, capture console — the harness *succeeding* |
 | Assert (11) | FAIL, keep VM, dump post-mortem: `nft list ruleset`, `docker ps`, `ss -tlnp`, `journalctl -b`, `systemd-analyze critical-chain`; exit non-zero |
 Diagnostics land in gitignored `~/integration-runs/<ts>-<host>/` (same pattern as ADR-017's
 screenshot dir; the agent reads them directly).
 **Three safety invariants** (these make the testing tool itself safe):
 1. **The transient inventory contains only the test VM** — no real host is ever in scope;
   the apply is `--limit`ed to the VM.
 2. **"Be askari" points NetBird at the in-VM coordinator (localhost)** — the VM forms its
   own one-node mesh; it never enrolls in the real mesh.
 3. **Test VMs sit on an isolated libvirt NAT net** — outbound NAT for ACME/image pulls, but
   not reachable to the LAN (`10.20.x`) or the real mesh.
 **Resource guard** (ubongo's 15 GiB ceiling, ADR-015/012): default VM ≈ 2 vCPU / 3 GiB / 20
 GiB thin overlay; the driver refuses to start below a free-RAM threshold and enforces **one
 integration VM at a time** (name-prefix `boma-it-*`). **Teardown:** success destroys domain +
 overlay; failure keeps them and prints how to inspect; `make test-integration-clean` reaps
 all `boma-it-*` orphans. An optional post-apply **snapshot** lets `reset` re-run
 reboot+assert without re-applying (fast iteration on a fix).
 ## Testing the tester
 - **pytest** on the driver's pure logic: transient-inventory generation, var/overlay merge,
  `--certs`→overlay mapping, DHCP-lease parsing, resource-guard math (mock `virsh`). Joins
  boma's existing pytest suite.
 - **Molecule** (Docker) on the `integration_test` role: asserts libvirt/qemu/virtinst
  installed, `libvirtd` enabled, users in `libvirt` group, driver present. (Cannot run
  KVM-in-Docker — the documented Molecule limitation.)
 - **End-to-end self-test = the acceptance test above**, run manually on first build and
  recorded in the runbook.
 ## Governance & documentation touch-points
 - **ADR-025 "Local VM integration testing"** — decision, approach A, rejected alternatives
  (Proxmox-nested / Vagrant / TF-libvirt), cert tiers.
 - **ADR-008** — pointer to ADR-025; redirect its "what Molecule does NOT test" gaps
  (nftables loading, mesh dataplane) to this level.
 - **ADR-015** — one-line reconciliation: "not a hypervisor" → runs *ephemeral KVM test VMs*
  as part of its local-test-runner role (still not a production hypervisor); note the
  test-VM RAM load.
 - **`docs/security/accepted-risks.md`** — the `le-prod-wildcard` risk (prod Gandi credential
  → ephemeral VM; transient TXT in real `wingu.me`).
 - **CLAUDE.md** command table + **`docs/runbooks/integration-testing.md`** (run a cycle,
  cert knobs, where diagnostics land, inspecting a kept failed VM, pruning) + **STATUS.md**
  entry. The runbook's pre-flight line operationalises FRICTION #6 (*validate
  reboot-recovery before retiring the break-glass*).
 ## Capacity
 One VM (~3 GiB) against ~13 GiB free is comfortable. The only future pinch is concurrency
 with the Level-4 Chromium/Playwright stack (ADR-017) — handled by the resource guard +
 "one at a time." Add a note to `docs/hardware/reference.md`; revisit at `/capacity-review`.
 ## Alternatives considered
 - **Proxmox VE nested on ubongo** — highest fidelity incl. the provisioning step, but heavy
  (nested virt, RAM), in tension with ADR-015, and the incident bugs don't live in
  provisioning. Rejected.
 - **Vagrant + vagrant-libvirt** — mature lifecycle/snapshots, but adds the Ruby/Vagrant
  ecosystem + a fragile plugin, boxes drift from the real Debian cloud image, and the
  reboot→assert sequence still needs custom logic. Rejected.
 - **terraform-provider-libvirt** — declarative and reuses TF, but poor at the imperative
  apply→reboot→re-apply test sequence, adds throwaway state, and blurs ADR-006's
  "TF owns *production* VM existence on Proxmox" boundary. Rejected.
 ## Open questions / deferred
 - **Multi-VM mini-staging** (inter-host mesh/dataplane) — design the driver + NAT net so a
  topology is an additive extension; out of scope for v1.
 - **Interplay with the Level-4 browser stack** — both want ubongo RAM; the resource guard is
  the v1 answer, revisit when Level 4 is built.
 - **Snapshot strategy depth** — v1 ships clone-and-destroy + an optional post-apply snapshot;
  richer snapshot trees deferred.
 ## Knowledge to verify at plan stage (ADR-014)
 These are from memory / unverified and must be confirmed against version-matched docs before
 the plan asserts them:
 - Exact `virt-install --import` flags and the cloud-init **NoCloud** seed format on the
  Debian-13 libvirt stack.
 - Whether the Debian-13 genericcloud image ships `qemu-guest-agent` (IP can come from the
  DHCP lease regardless — guest-agent is an optimisation, not a requirement).
 - Let's Encrypt **rate limits** (prod vs staging) — to confirm "issue the wildcard once,
  reuse" stays within limits.
 - The `caddy-dns/gandi` DNS-01 configuration and pinned version already used by
  `reverse_proxy`, and whether the Gandi LiveDNS API key can be scoped to `test.wingu.me`.
 - libvirt default vs a dedicated isolated NAT network on Debian-13 (`virsh net-*`).
--- a/docs/superpowers/specs/2026-06-19-mesh-hardening-askari-redesign-design.md
+++ b/docs/superpowers/specs/2026-06-19-mesh-hardening-askari-redesign-design.md
@ -1,216 +0,0 @@
 # Spec — Mesh-hardening redesign: askari SSH `wt0`-primary + permanent WAN break-glass
 Status: Accepted (2026-06-19)
 ## Context & scope
 The **mesh-hardening follow-on** (deferred from M5) was decomposed into three independent
 sub-projects, each with its own spec → plan → implementation cycle. Progress so far:
 1. ~~askari SSH → `wt0`~~ — **attempted 2026-06-17, BACKED OUT** after it took askari down
   on reboot (spec/plan `docs/superpowers/{specs,plans}/2026-06-17-mesh-hardening-askari-ssh-wt0*`).
 2. ubongo nftables INPUT-only default-deny — **DONE 2026-06-19**, reboot-validated
   (`base__firewall_input_only`).
 3. NetBird ACL off Allow-All → scoped policies — not started.
 This spec is the **redesign of (1)**. The operator sequencing decision (2026-06-19) is:
 do this redesign **first**, then a separate sub-project to reduce askari's
 single-point-of-failure (SPOF) role. **This spec covers only the redesign of (1).** The SPOF
 reduction is the named follow-on (its own later spec).
 ### Why the 2026-06-17 attempt was backed out
 Four hazards, drawn from the six 2026-06-17 signals recorded in `docs/FRICTION.md`:
 1. **`base`'s `forward policy drop` breaks Docker hosts on reboot** — nftables loaded
   default-deny before Docker, so container forwarding/NAT (WAN→Caddy, Caddy→coordinator)
   died after reboot.
 2. **`ip_nonlocal_bind` did NOT beat the sshd boot-race** — binding sshd `ListenAddress`
   to the `wt0` IP still failed at boot ("could not assign the address"); and because
   `wt0` never came up, sshd had no listener at all.
 3. **The coordinator host can't bootstrap the mesh it depends on** — askari runs the
   NetBird coordinator *and* is a mesh peer; its agent needs the local coordinator container
   healthy to bring up `wt0`. After an unclean reboot the coordinator was down → `wt0`
   never came up → with SSH `wt0`-only, the host was reachable only via the Hetzner console.
   General rule: *never make a host's only management path depend on a service that host
   itself hosts.*
 4. **The coordinator FATAL-loops on the geolocation-DB download with no egress** — a
   transient loss of container egress (here: NAT wiped by `nft flush`) crash-loops the whole
   control plane.
 ### What changed since 2026-06-17 (enablers this redesign relies on)
 - `docker_host` **container-forward nftables drop-in** (`172ae37`) — reboot-safe Docker
  forwarding (available as a later tightening; not required by this pass).
 - **`base__firewall_input_only`** — input-only default-deny, forward chain stays
  `policy accept` (Docker-safe). **Proven on ubongo and reboot-validated 2026-06-19.**
 - The **ADR-025 integration harness** — reproduces a host's boot on a throwaway local VM,
  so reboot-safety is proven GREEN before the real host is touched.
 ## Goal / success criteria
 - askari's host nftables firewall is **applied at last** (`base__firewall_apply: true`),
  INPUT-only default-deny — matching ubongo.
 - **Normal management is over the mesh:** `ansible_host` resolves to askari's `wt0` IP
  (`100.99.226.39`); SSH-over-`wt0` and `ansible askari -m ping` over the mesh both succeed.
 - **A permanent non-mesh break-glass survives a mesh/coordinator outage**, via two
  independent channels:
  - the **Hetzner web console** (out-of-band; always works, IP-independent); and
  - **WAN `:22` reachable only from ubongo's WAN IP (`91.226.145.80`)**, enforced at *both*
    the host nftables layer (`base__firewall_admin_addrs`) and the Hetzner Cloud Firewall.
  WAN `:22` is **deliberately NOT closed** — the coordinator-host exception (FRICTION #3).
 - **askari survives a reboot under the new firewall, unattended:** Docker forwarding/NAT
  intact, `https://test.askari.wingu.me` + `https://netbird.askari.wingu.me` serve valid
  certs, STUN `3478/udp` answers, the coordinator container is healthy (geo-DB no longer
  FATAL), `wt0` returns, SSH is reachable over both `wt0` and the WAN break-glass.
 - **No sshd `ListenAddress` change** (`base__ssh_listen_mesh_only` stays `false`) — this is
  what sidesteps the boot-race that sank the 2026-06-17 attempt.
 ## Design — mirror ubongo 2/3, with the coordinator-host exception
 The host firewall does the SSH scoping; sshd is left listening on all interfaces. This is
 the ubongo 2/3 pattern, which is proven and reboot-validated.
 1. **`base` firewall, INPUT-only default-deny** (`base__firewall_apply: true`,
   `base__firewall_input_only: true`): the input chain defaults to `drop`; the forward chain
   stays `policy accept` so Docker container forwarding/NAT and published-port DNAT keep
   working across a reboot. Allowed ingress:
   - `:22/tcp` via `iifname "wt0"` (the interface-name match that survives `wt0` being
     absent at boot — `base__firewall_mgmt_interface: wt0`);
   - `:22/tcp` from `91.226.145.80` (ubongo's WAN — the break-glass; via
     `base__firewall_admin_addrs`);
   - the public service surface from the catalog: `80,443/tcp` + `3478/udp` (WAN).
 2. **No sshd change.** `base__ssh_listen_mesh_only` stays `false`; sshd keeps listening on
   all interfaces. The firewall, not sshd, restricts where `:22` is reachable. There is no
   `ListenAddress`, hence no `ip_nonlocal_bind`, hence no boot-race.
 3. **The Hetzner Cloud Firewall is unchanged** — the `:22`-from-ubongo rule stays (the
   2026-06-17 attempt removed it; this redesign keeps it as the perimeter break-glass).
 4. **Coordinator geo-DB robustness** — make the `netbird_coordinator` control plane survive
   a transient egress loss (the nat-flush window on apply, and the boot window before Docker
   re-adds its NAT), so the coordinator stays healthy and `wt0` can come back. One of:
   - **pre-seed** the GeoLite2 DB into the persistent `netbird_data:/var/lib/netbird` volume
     so netbird-server finds it locally and never needs to download; or
   - **disable / make non-fatal** the geolocation requirement in `config.yaml.j2`.
   The exact v0.72.4 mechanism is verified against NetBird's pinned docs at plan time
   (ADR-014) — the design fixes the *intent* (a transient egress blip must not FATAL the
   control plane); the plan fixes the *knob*.
 ### Rejected alternatives (these are the 2026-06-17 failures)
 - sshd `ListenAddress = wt0 IP` + `ip_nonlocal_bind` → boot-race; did not bind. **Out.**
 - `forward policy drop` on a Docker host → broke forwarding on reboot. **Out** (use
  `input_only`; the `docker_host` container-forward drop-in is a later tightening).
 - Close WAN `:22` entirely → coordinator host left console-only on a bad reboot. **Out**
  (keep WAN `:22`-from-ubongo as the always-available non-mesh path).
 ### How each 2026-06-17 failure is answered
 | 2026-06-17 failure | Fix in this design |
 |---|---|
 | `forward drop` killed Docker on reboot | `base__firewall_input_only: true` → forward stays `accept` |
 | `ip_nonlocal_bind` sshd boot-race | no sshd `ListenAddress` change; firewall scopes `:22` by `iifname "wt0"` |
 | coordinator chicken-egg / lockout | permanent WAN `:22`-from-ubongo + Hetzner console; management never depends on a service askari hosts |
 | coordinator geo-DB FATAL-loop | pre-seed / non-fatal geo so a transient egress blip can't crash the control plane |
 ## New & changed code
 **Inventory:**
 - `inventories/production/group_vars/offsite_hosts/vars.yml` —
  - `base__firewall_apply: true` (was `false`);
  - `base__firewall_input_only: true` (new — forward stays `accept`, Docker-safe);
  - `base__firewall_admin_addrs: ["91.226.145.80"]` (new — ubongo's WAN, the break-glass;
    comment states what it is and why a coordinator host keeps a non-mesh path);
  - `base__ssh_listen_mesh_only: false` stays (explicit — no boot-race);
  - rewrite the header backout note → "redesigned 2026-06-19: `wt0`-primary + permanent WAN
    break-glass; see this spec."
 - `inventories/production/host_vars/askari.yml` (**new**) — `ansible_host: 100.99.226.39`
  (the `wt0` IP), so Ansible manages askari over the mesh. Overrides the TF-generated WAN
  `ansible_host` in `offsite.yml` (host_vars are not regenerated by `tf_to_inventory.py`).
  Header comment explains why.
 **Role `netbird_coordinator`:**
 - The geo-DB robustness change above (`templates/config.yaml.j2` and/or a pre-seed task; the
   `templates/docker-compose.yml.j2` volume already persists `/var/lib/netbird`), with
  Molecule/verify coverage that the control plane comes up without external geo egress.
 **Firewall catalog** (`inventories/production/group_vars/all/firewall.yml`):
 - **No change.** It already enumerates askari's public ingress (`reverse_proxy` 80/443,
  `netbird_stun` 3478/udp). `:22` is handled by the `base` firewall's built-in SSH rules
  (`mgmt_interface` `wt0` + `admin_addrs`), not the catalog.
 **Terraform / Hetzner Cloud Firewall:**
 - **No change.** The WAN `:22`-from-ubongo rule stays (the perimeter half of the break-glass).
 **sshd:**
 - **No change.**
 ## Validation
 ### Harness-first GREEN gate (ADR-025) — before any live change
 A "be askari" integration profile (Docker host + a coordinator-like container on the shared
 network + `base__firewall_input_only` + `admin_addrs`), driven through `make
 test-integration HOST=askari` (reusing the existing profile/overlay/verify pattern):
 - input chain default-deny with `:22` accepted via `iifname "wt0"` **and** from the
  break-glass admin address; forward chain `policy accept`;
 - published-port DNAT + NAT masquerade survive a **reboot** (the RED→GREEN reboot cycle);
 - the coordinator-like container comes up healthy with **no external geo egress**;
 - SSH path returns after reboot.
 This must be GREEN before the live cutover.
 ### Live cutover — supervised, console open, break-glass never removed
 Sequencing rule (FRICTION #6): validate reboot-recovery while a fallback path is still open.
 Because the WAN break-glass is *never* removed in this design, that invariant holds by
 construction.
 1. **Pre-check:** `ssh sjat@100.99.226.39` (over `wt0`) and `ansible askari -m ping` (forced
   over `wt0`) both succeed; public services + STUN healthy.
 2. **Repoint Ansible:** add `host_vars/askari.yml` (`ansible_host` = `wt0` IP); confirm
   `ansible askari -m ping` runs over the mesh.
 3. **Apply `base` (+ the geo-DB fix):** one `make deploy PLAYBOOK=site LIMIT=askari`
   converge applies INPUT-only default-deny with the `wt0` + admin-addr SSH allow and the
   coordinator robustness change. The firewall concern's armed auto-rollback
   (`base__firewall_rollback_timeout: 45`) reverts a bad ruleset. Then a post-apply
   `restart docker` rebuilds NAT (base's `flush ruleset` wipes Docker's nat — FRICTION); the
   coordinator now survives the egress window thanks to the geo-DB fix.
 4. **Verify the new steady state:** public services serve valid certs; STUN answers; SSH
   over `wt0` works; SSH over the WAN break-glass (`91.226.145.80` → `:22`) works.
 5. **Reboot resilience (the real test):** reboot askari (Hetzner console available) and
   confirm — with no intervention — Docker forwarding/NAT, public services, the coordinator,
   `wt0`, and SSH (both paths) all return.
 ## Risks & rollback
 - **ubongo's WAN IP anchors the break-glass.** If it is dynamic and rotates, the host
  `admin_addrs` rule and the Hetzner FW rule must be updated. The **Hetzner console** is the
  IP-independent ultimate break-glass. (Confirmed static by the operator 2026-06-19; it is
  also already the Hetzner FW assumption today.)
 - **Mid-cutover lockout:** mitigated by the staged order (a path open at each step), the
  firewall auto-rollback timer, `ansible_host` = `wt0` (the confirm tests the real new path),
  and the WAN break-glass that is never removed.
 - **Reboot lockout:** mitigated by `iifname "wt0"` scoping (no sshd boot-race), the WAN
  break-glass, the geo-DB fix (coordinator survives the egress window), and harness GREEN.
 - **Default-deny breaks a public service:** mitigated by the catalog already enumerating all
  live ingress and the §Validation service checks; reversible via `base__firewall_apply:
  false`.
 - **Ultimate break-glass:** the Hetzner web console (out-of-band).
 ## Out of scope / follow-ons
 - **SPOF reduction (the next sub-project)** — reduce askari's single-point-of-failure role
  (currently `ubongo → askari` is `Relayed` through askari's own relay; if askari is down the
  mesh data plane for relayed peers is down). Its own spec, after this.
 - **NetBird ACL off Allow-All** — until then any enrolled peer can reach askari's `wt0:22`;
  scoping that is a separate sub-project.
 - **Full forward-chain hardening** — the `docker_host` container-forward drop-in (full
  forward default-deny, reboot-safe) as a later tightening over the `input_only` baseline.
 - **Coordinator off-site backup** (FRICTION #5, ADR-022) — still pending; noted, not in scope.
 - STATUS.md / ROADMAP updates land with the implementation, not this spec.
--- a/docs/superpowers/specs/2026-06-19-mesh-hardening-ubongo-default-deny-design.md
+++ b/docs/superpowers/specs/2026-06-19-mesh-hardening-ubongo-default-deny-design.md
@ -1,203 +0,0 @@
 # Spec — Mesh-hardening (2 of 3): ubongo INPUT-only default-deny + `ssh-from-control`
 Status: Accepted (2026-06-19)
 ## Context & scope
 The **mesh-hardening follow-on** (deferred from M5, ROADMAP) was decomposed into three
 independent sub-projects, each its own spec → plan → implementation cycle:
 1. askari SSH → `wt0` — spec/plan written 2026-06-17, **attempted and backed out the same day**
   (the incident; six lessons in `FRICTION.md`). Needs a redesign — **not** this spec.
 2. **ubongo nftables default-deny + `ssh-from-control`** ← *this spec*
 3. NetBird ACL off Allow-All → scoped policies (its own later spec; open mechanism question —
   no headless API path).
 ROADMAP (re-ordered after the 2026-06-17 incident) puts **ubongo first**: it is the clean,
 low-risk case — a physical box with a permanent console break-glass, and *not* the coordinator
 host that the incident proved you must not corner.
 This spec hardens **ubongo's inbound surface only**. It does **not** change sshd's
 `ListenAddress` (so no boot-race), does **not** apply a forward-chain default-deny (so Docker +
 the libvirt NAT keep working), and does **not** touch askari or the NetBird ACL.
 Current state (verified on ubongo, 2026-06-19): **no host firewall** — sshd listens on
 `0.0.0.0:22`, reachable from LAN, mesh, and anything routable; only Docker's + libvirt's own
 `iptables-nft` tables exist. Interfaces: `eno1` `10.20.10.151` (LAN, = `ansible_host`), `wt0`
 `100.99.146.14` (mesh), `docker0` (one container, no published ports), `virbr-boma`
 `192.168.150.1/24` (the libvirt NAT that `make test-integration` uses), `ip_forward=1`.
 ## Goal / success criteria
 - SSH to ubongo succeeds over **`wt0`** (road-warriors, askari), from **mamba on the LAN**
  (`10.20.10.50`), and via the **`ssh-from-control` self-path** (Ansible; source `10.20.10.151`).
 - SSH from any **other** LAN source is **dropped** (default-deny on `input`).
 - **Docker container egress and `make test-integration` (libvirt NAT) keep working** — the
  forward chain is untouched.
 - A **reboot** does not lock SSH out (no `ListenAddress`, so no bind race).
 - Break-glass is the **on-prem physical console** (permanent, non-mesh). The live apply is
  additionally gated by the firewall **auto-rollback** timer.
 ## Design
 Apply base's nftables `firewall` concern to ubongo, with two adjustments and one deliberate
 non-change:
 1. **INPUT-only default-deny.** The `input` chain keeps `policy drop` with the guaranteed
   management plane: `lo`, `established,related`, ICMP, SSH on `wt0`, and SSH from
   `ssh-from-control` (`10.20.10.151`). We add **two operator-workstation sources** (mamba,
   `10.20.10.50`, and a second workstation, `10.20.10.17`) via a new
   `base__firewall_admin_addrs` list. Everything else on `eno1` drops.
 2. **Forward chain left permissive.** base hardcodes `chain forward { … policy drop; }` for
   inter-container isolation. On ubongo that would break Docker egress **and** the libvirt NAT
   the integration harness depends on — the same class of failure that sank askari (FRICTION
   2026-06-17, signal 1). A new `base__firewall_input_only` knob renders the forward chain
   `policy accept` instead. Docker's and libvirt's own `iptables-nft` forward rules continue to
   apply (separate tables); base simply does not add a default-deny on top.
 3. **No sshd `ListenAddress` change.** sshd keeps listening on `0.0.0.0:22`; nftables does all
   inbound scoping. This deliberately avoids the `ip_nonlocal_bind` boot-race that broke askari
   (FRICTION signal 2) — there is nothing to bind before `wt0` exists.
 Resulting `input` allow-list:
 ```
 iif "lo" accept
 ct state established,related accept
 ct state invalid drop
 iifname "wt0" tcp dport 22 accept            # mesh (road-warriors, askari)
 ip saddr 10.20.10.151 tcp dport 22 accept    # ssh-from-control (Ansible self) — group_vars/all
 ip saddr 10.20.10.50  tcp dport 22 accept    # mamba on the LAN     — base__firewall_admin_addrs
 ip saddr 10.20.10.17  tcp dport 22 accept    # 2nd operator wkstn   — base__firewall_admin_addrs
 ip protocol icmp accept ; ip6 nexthdr ipv6-icmp accept
 # (no catalog services on ubongo) → default drop
 chain forward: policy accept                 # Docker + libvirt-NAT forwarding preserved
 ```
 ## Why ubongo is the safe case (maps to the 2026-06-17 incident)
 - **Signal 1** (forward-drop breaks Docker hosts): sidestepped — INPUT-only leaves forwarding alone.
 - **Signal 2** (`ip_nonlocal_bind` boot-race): sidestepped — no `ListenAddress`; sshd binds nothing new.
 - **Signal 3** (a host's only mgmt path must not depend on a service it hosts): satisfied —
  ubongo is not the coordinator and keeps three independent paths (mesh, LAN, physical console).
 - **Signal 6** (recovery tested after the break-glass was removed): the physical console is
  permanent (nothing to retire), and reboot-recovery is proven on a throwaway VM first.
 ## New & changed code
 **Role `base`:**
 - `roles/base/defaults/main.yml` — add:
  - `base__firewall_input_only: false` — when true, the forward chain is `policy accept`
    (host-local input filtering only), for hosts that route/forward container or NAT traffic
    (e.g. the control node's Docker + libvirt-NAT) where a forward default-deny would break them.
  - `base__firewall_admin_addrs: []` — extra LAN source IPs allowed to SSH (besides `wt0` +
    `ssh-from-control`); for an operator workstation reaching the host over the LAN. Key-gated.
 - `roles/base/templates/nftables.conf.j2`:
  - the forward line (currently line 21) →
    `chain forward { type filter hook forward priority 0; policy {{ "accept" if base__firewall_input_only | bool else "drop" }}; }`
  - after the `ssh-from-control` block (currently lines 12-14), add a loop:
    `{% for addr in base__firewall_admin_addrs %}` →
    `ip saddr {{ addr }} tcp dport {{ base__firewall_ssh_port }} accept`
 - `roles/base/molecule/default/{converge,verify}.yml` — fixture sets `input_only: true` + an
  `admin_addrs` entry; assert (a) `forward` renders `policy accept`, (b) the admin-addr accept
  rule renders, (c) existing input default-deny + `wt0` + control-addr assertions stay green.
 **Inventory** (`inventories/production/group_vars/control/vars.yml`, append):
 ```yaml
 # Mesh-hardening 2/3 (2026-06-19, ADR-020/021): apply base's host firewall to ubongo as
 # INPUT-only default-deny — harden the inbound surface, leave the forward chain permissive so
 # Docker egress + the libvirt-NAT integration harness keep working. sshd is unchanged
 # (nftables scopes inbound), so there is no boot-race. Reach ubongo over wt0, the
 # ssh-from-control self-path (base__firewall_control_addr in group_vars/all), or mamba on the
 # LAN. Break-glass: the physical console.
 base__firewall_input_only: true
 base__firewall_admin_addrs:
  - "10.20.10.50"   # mamba over the LAN (NetBird off). Raw DHCP lease — see note below.
  - "10.20.10.17"   # a 2nd operator workstation (MAC bc:0f:f3:c8:4a:8a). Raw lease — ditto.
 # base__firewall_apply defaults true; base__firewall_control_addr (= ubongo's own 10.20.10.151)
 # is set in group_vars/all and covers Ansible's self-connection.
 ```
 **Integration harness** (ADR-025) — a "be ubongo" profile, mirroring "be askari":
 - `tests/integration/overrides/ubongo.yml` — `firewall_apply: true`, `input_only: true`,
  `admin_addrs: ["192.168.150.99"]` (a representative LAN addr to exercise the rule),
  `firewall_control_addr: "192.168.150.1"` (the libvirt-NAT gateway = the harness's own SSH
  path, so the apply + reboot don't lock it out), `ssh_listen_mesh_only: false`,
  `mesh_enabled: false`.
 - `tests/integration/profiles/ubongo.json` — mirror `profiles/askari.json` (VM resources/image).
 - `tests/integration/verify.yml` — make the assertions **profile-aware** (gated on the active
  profile, since `verify.yml` is shared): for ubongo assert `input` policy drop, `forward`
  policy **accept**, and the admin-addr rule present. Reachability across the reboot is the
  harness's existing cycle. The askari assertions (Docker/forward-DNAT) must **not** run for the
  ubongo profile, nor vice-versa.
 Enables `make test-integration HOST=ubongo`.
 ## The admin-addrs — deliberately interim values
 `base__firewall_admin_addrs: ["10.20.10.50", "10.20.10.17"]` are the operator workstations'
 **current raw DHCP leases** (mamba + a second box), not reservations (operator decision,
 2026-06-19). Both share the operator's `sjat` SSH key. Caveats, accepted for now:
 - **Lease drift:** if DHCP reassigns either IP, the rule allows whatever host then holds it
  (still SSH-key-gated, so low risk) and that workstation loses its *LAN* path. **Backstop:**
  the workstations also reach ubongo over `wt0` (mesh), so they are never cut off — only the
  off-mesh LAN convenience lapses until the IP is corrected.
 - **Revisit trigger (flagged for follow-up):** when OPNsense-as-code lands (ADR-020 perimeter /
  TODO 3.5), replace both raw leases with **MAC-pinned DHCP reservations** (`10.20.10.17` =
  MAC `bc:0f:f3:c8:4a:8a`) and allow the reserved addresses. Recorded as a `FRICTION.md` open
  signal so the next `/kaizen` surfaces it.
 ## Testing
 - **Molecule** (base `default`, render-only, `firewall_apply: false`): the new forward-accept +
  admin-addr assertions above, with existing assertions green.
 - **Integration harness** (`make test-integration HOST=ubongo`): on a throwaway UEFI VM, apply
  the ubongo overlay, assert the ruleset shape, and prove **SSH survives a reboot** from an
  allowed source (the existing assert/cycle). This is the gate before touching the real control
  node.
 - **Live** (during cutover): SSH over `wt0` ✓, from mamba LAN ✓, Ansible self-ping ✓; SSH from a
  disallowed LAN host dropped ✓; `docker run …` egress ✓; a fresh `make test-integration`
  still spins a VM (libvirt NAT intact) ✓.
 ## Staged cutover (operator-supervised — lockout-aware, FRICTION signal-6 order)
 ubongo is managed as `sjat` (password sudo), so the live apply needs the operator present
 anyway. The physical console is open throughout.
 1. **Harness GREEN:** `make test-integration HOST=ubongo` passes (incl. the reboot).
 2. **Pre-check the real paths** *before* applying: SSH over `wt0`, SSH from mamba
   (`10.20.10.50`), `ansible ubongo -m ping`. Confirm the physical console is reachable.
 3. **Dry-run:** `make check PLAYBOOK=site LIMIT=ubongo TAGS=firewall` — review the nftables diff
   (input default-deny + `wt0` + `10.20.10.151` + `10.20.10.50` + `10.20.10.17`; forward `policy accept`).
 4. **Apply (auto-rollback armed):** `make deploy PLAYBOOK=site LIMIT=ubongo TAGS=firewall` — the
   firewall concern snapshots, arms the 45 s revert, applies, `reset_connection` →
   `wait_for_connection` over the live path (`10.20.10.151`), then cancels the timer. A bad
   ruleset reverts itself; the console is the ultimate fallback.
 5. **Verify** every path + Docker egress + a fresh integration-VM spin (above).
 6. **Reboot ubongo; confirm SSH returns on all paths unaided** (console present). Only now is it
   done — recovery is proven *while the break-glass is still there*.
 7. **Docs:** update `STATUS.md` (ubongo row: input-only default-deny applied) and `ROADMAP.md`
   (mesh-hardening 2/3 done; next is sub-project 1 askari redesign or 3 NetBird ACL).
 ## Risks & rollback
 - **Self-referential apply** (ubongo runs Ansible against itself): mitigated by the auto-rollback
  timer, the `wait_for_connection` over the real path, three redundant allowed sources, and the
  permanent physical console. ubongo cannot be bricked.
 - **Raw-lease fragility:** documented above; backstopped by the mesh path; revisit with OPNsense.
 - **No new container isolation** (forward stays accept): accepted — ubongo is a single-tenant
  control node, not a service host; Docker/libvirt keep their own forward rules. The forward
  default-deny remains the norm for real service hosts (`base__firewall_input_only: false`).
 ## Out of scope / follow-ons
 - askari SSH → `wt0` redesign (sub-project 1) — needs the boot-race + coordinator-bootstrap
  resolved; folds in the coordinator-robustness (geo-DB FATAL-loop) + off-site backup lessons.
 - NetBird ACL off Allow-All (sub-project 3) — open mechanism question (no headless API path).
 - OPNsense DHCP reservations for the admin workstations (`10.20.10.50` mamba, `10.20.10.17`)
  and ubongo — replace the raw leases with MAC-pinned reservations; flagged in `FRICTION.md`,
  with OPNsense-as-code.
 - Forward-chain container isolation on ubongo — deliberately not done here.
 - `STATUS.md` / `ROADMAP.md` edits land with the implementation, not this spec.
--- a/docs/superpowers/specs/2026-06-20-mesh-spof-accept-resilience-design.md
+++ b/docs/superpowers/specs/2026-06-20-mesh-spof-accept-resilience-design.md
@ -1,163 +0,0 @@
 # Spec — Mesh-hardening (SPOF): accept the single-coordinator SPOF + targeted resilience
 Status: Accepted (2026-06-20)
 ## Context & scope
 The **mesh-hardening follow-on** decomposed into independent sub-projects (ROADMAP). Progress:
 1. ~~ubongo nftables INPUT-only default-deny~~ — **DONE 2026-06-19**.
 2. ~~askari SSH → `wt0` redesign~~ — **DONE 2026-06-20** (live reboot-validated).
 3. **askari relay-SPOF reduction** ← *this spec*.
 4. NetBird ACL off Allow-All — not started.
 `askari` runs boma's **single** self-hosted NetBird coordinator (management + signal + relay +
 STUN, one combined container) **and** is a mesh peer (ADR-016). Because `ubongo`'s INPUT-only
 default-deny drops the inbound UDP that ICE hole-punching needs, `ubongo`'s peers are always
 **`Relayed`** through askari's own relay (intentional posture — `docs/runbooks/netbird-client.md`,
 the `ubongo-relay-only` finding). So askari is a single point of failure for **relayed mesh
 traffic**.
 ### The decisive finding — the blast radius is narrow
 The mesh (`wt0`) is **not** a default gateway. Verified on ubongo (2026-06-20):
 ```
 wt0 routes ONLY 100.99.0.0/16   ·   default route via 10.20.10.1 dev eno1   ·   Networks: -  (no subnet-routes/exit-node)
 ```
 So an askari outage affects **only** traffic addressed to a peer's `100.99.x.x` mesh IP over the
 relay:
 | Traffic | askari down |
 |---|---|
 | LAN device → LAN service (direct or via reverse proxy) | unaffected |
 | node ↔ node over LAN IPs (future cluster) | unaffected |
 | node ↔ node same-LAN over mesh IPs | unaffected (direct P2P, local ICE candidate) |
 | **road-warrior → ubongo (remote, relayed)** | **breaks** |
 | mesh control plane (new enrol / ACL change / re-handshake) | pauses |
 Nothing on the LAN and no future intra-cluster traffic depends on askari. The only loss is
 **remote (off-LAN) mesh access to peers** — and only when off-LAN *and* askari is down at once.
 ### Why we are not "fixing" the SPOF with new infrastructure
 - **A second coordinator** is not supported by self-hosted NetBird (single management/signal) and
  contradicts ADR-016's deliberate single off-site coordinator.
 - **Direct P2P** only helps already-established sessions (re-handshakes still need askari's
  signal), and enabling it punctures `ubongo`'s deliberate default-deny (a firewall-catalog UDP
  entry + an `accepted-risks` deviation + OPNsense NAT) — cost out of proportion to a narrow,
  rare failure.
 - **A second relay** needs another publicly-reachable host; a relay at home reintroduces the
  public home surface ADR-016's off-site coordinator exists to avoid.
 Given a reliable always-on VPS and boma's 2–5-host scale, the sound engineering choice is to
 **accept the SPOF as a conscious, documented trade-off** and harden only the two spots real
 incidents point to.
 ## Goal / success criteria
 - The single-coordinator SPOF is **explicitly accepted and documented** (register entry + an
  ADR-016 availability analysis + recovery), so the trade-off is revisitable, not forgotten.
 - **Managed mesh hosts survive a local-DNS hiccup:** `ubongo` (and future managed mesh hosts)
  resolve the coordinator FQDN even when their resolver dies on a transition, mirroring the
  client-side fix already in the runbook.
 - **No new infrastructure** — no P2P, no second relay, no second coordinator, no Terraform.
 - The coordinator **off-site backup gap** is named in the accepted risk and explicitly handed to
  the next sub-project (ADR-022), not built here.
 ## Design
 ### (a) Accepted-risk `R8` — `docs/security/accepted-risks.md`
 Add one row to the register (owned by ADR-002):
 - **Risk:** *Single off-site mesh coordinator is an availability SPOF for remote mesh access* —
  askari hosts the only management/signal/relay (ADR-016); a relayed peer (all of ubongo's) loses
  remote mesh reachability while askari is down, and the control plane pauses. The
  `netbird_coordinator` store has **no off-site backup yet** (BACKUP.md), so an askari loss also
  loses mesh control-plane state until rebuilt.
 - **Rationale:** inherent to ADR-016's deliberate single off-site coordinator (sovereignty,
  survives a homelab outage); **narrow blast radius** (above table — LAN/intra-cluster/local
  unaffected); askari is a reliable always-on VPS; mitigations exist (client + managed-host DNS
  pin; documented rebuild).
 - **Revisit trigger:** askari proves unreliable; the cluster grows to depend on the mesh for
  intra-node traffic; remote mesh access becomes business-critical; or the ADR-022 backup role
  lands (closes the state-loss half).
 R8 is the **availability** complement to R3 (which covers askari as a *security* target).
 ### (b) ADR-016 amendment — an "Availability — an askari outage" subsection
 A short subsection capturing: the blast-radius table; that the SPOF is an accepted property
 (→ R8); and the **recovery procedure** — rebuild the coordinator (`/setup` + re-enrol peers, M5)
 or restore from backup once ADR-022 lands; client/road-warrior break-glass already in
 `docs/runbooks/netbird-client.md`; on-LAN access to ubongo never depends on the mesh (ADR-016
 recovery model). Recorded as a dated amendment; ADR-016's status stays Accepted.
 ### (c) DNS-resilience — pin the coordinator FQDN on managed mesh hosts (`base` `mesh` concern)
 The 2026-06-18 outage was a client failing to resolve `netbird.askari.wingu.me` on a network
 transition; the client fix (public resolvers + an `/etc/hosts` pin to askari's stable WAN IP) is
 already in the runbook. The gap: **managed** mesh hosts have no equivalent. Add to `base`'s `mesh`
 concern (`roles/base/tasks/mesh.yml`):
 - New default `base__mesh_coordinator_pin: ""` (empty → no pin; opt-in).
 - When set (and `base__mesh_enabled`), render an `/etc/hosts` entry mapping the coordinator FQDN
  — derived from `base__mesh_management_url` via the `urlsplit('hostname')` filter, **not** a
  duplicated literal — to `base__mesh_coordinator_pin`, idempotently (a marker-scoped
  `blockinfile`/`lineinfile`).
 - Set `base__mesh_coordinator_pin` to askari's static WAN IP for managed mesh hosts that depend
  on the coordinator (ubongo via the `control` group_vars; future cluster groups as they appear).
  The **coordinator host itself (askari) is exempt** (a pin would point its own FQDN at its own
  WAN IP, which needs NAT hairpin, and as a server it already has stable DNS); the plan confirms
  the exact group_vars placement and the askari exemption.
 The pin is safe because askari's WAN IP is static (operator-confirmed); rendering it from a single
 inventory variable keeps it maintainable if it ever changes.
 ## New & changed code/docs
 - `docs/security/accepted-risks.md` — add row **R8**; bump the "Last reviewed" date.
 - `docs/decisions/016-mesh-vpn.md` — add the dated "Availability — an askari outage" amendment
  subsection (blast-radius table + recovery + R8 cross-ref).
 - `roles/base/defaults/main.yml` — add `base__mesh_coordinator_pin: ""` with a comment.
 - `roles/base/tasks/mesh.yml` — add the `/etc/hosts` coordinator-pin task (gated on
  `base__mesh_enabled` + a non-empty pin; FQDN from `urlsplit`).
 - `inventories/production/group_vars/control/vars.yml` — set `base__mesh_coordinator_pin` to
  askari's WAN IP for ubongo.
 - `roles/base/molecule/default/{converge,verify}.yml` — assert that with the pin set + a fixture
  FQDN the `/etc/hosts` entry renders, and that an empty pin renders nothing (no-op).
 - `STATUS.md` / `docs/ROADMAP.md` — mark sub-project 3 done; surface ADR-022 (coordinator backup)
  as the next item. (Land with the implementation, not this spec.)
 ## Testing
 - **Molecule** (`base` default scenario): (1) `base__mesh_coordinator_pin: ""` → no `/etc/hosts`
  coordinator line (default no-op); (2) pin set + a fixture `base__mesh_management_url` → exactly
  one idempotent `<ip> <fqdn>` line, FQDN correctly extracted by `urlsplit`. Existing
  firewall/hardening/mesh assertions stay green.
 - **No live deploy required for acceptance** — the pin is additive and idempotent; it lands on
  ubongo on the next routine `base` apply. (Optional spot-check: `getent hosts
  netbird.askari.wingu.me` on ubongo resolves to the pinned IP.)
 ## Risks & rollback
 - **Stale pin if askari's WAN IP changes** — mitigated by rendering from one inventory variable
  (single edit) and askari's IP being static; the pin is removable by clearing the knob + a
  re-apply.
 - **Over-pinning the coordinator host** — askari is explicitly exempt (hairpin/DNS); the
  exemption is enforced by setting the pin only in the group_vars of hosts that need it.
 - **Accepting the SPOF** is itself the residual risk — bounded by the narrow blast radius, the
  documented recovery, and R8's revisit triggers.
 ## Out of scope / follow-ons
 - **Coordinator off-site backup → ADR-022 kickoff (the next sub-project).** Named in R8 and
  `BACKUP.md` as the open gap; building it means ADR-022's pull-node (`fisi`) + restic design, not
  throwaway plumbing here.
 - **Direct P2P / NAT-traversal** — deferred posture change (default-deny puncture + OPNsense NAT +
  governance); explicitly not pursued here.
 - **A second relay / second coordinator** — ruled out above (infra cost / not supported / against
  ADR-016).
 - **NetBird ACL off Allow-All** — separate sub-project (4).
--- a/docs/testing/gotchas.md
+++ b/docs/testing/gotchas.md
@ -70,21 +70,3 @@ testing surprise is worth remembering past the session that hit it.
  plus review. Only a real (or `--check`) call against the API surfaces them.
 - → Treat a **check-mode run against the real API as a required gate** for such roles, or
  build a render-only assertion that materializes and inspects the rendered module args.
 ## Single-file bind mount + atomic rewrite = stale config (reload-in-place only)
 - **`ansible.builtin.template` writes atomically** (temp file + rename → a *new inode*). A
  Docker **single-file** bind mount pins the *old* inode, so a container that reloads
  config **in place** (no restart) keeps reading the stale file. Live hit: `reverse_proxy`
  bind-mounted the Caddyfile as a single file; `caddy reload` (in-container) re-read the
  old inode and silently no-op'd (`"config is unchanged"`). The new NetBird route never
  loaded → Caddy never requested its cert → surfaced only as a downstream TLS handshake
  failure.
 - **Fix for reload-in-place roles: bind-mount the config *directory*, not the file**
  (`./caddy` → `/etc/caddy`). Directory mounts reflect the inode swap, so the reload sees
  the new file (proven on askari).
 - **Restart-based roles are fine with a single-file mount.** Sibling case: `netbird`
  single-file-mounts `config.yaml`, but its handler does `docker compose restart` (not an
  in-container reload), and a **restart re-resolves the bind mount** (verified: route
  count 0 before, 1 after). Rule of thumb: **reload-in-place needs a directory mount;
  restart-based roles don't.**
--- a/inventories/production/group_vars/all/firewall.yml
+++ b/inventories/production/group_vars/all/firewall.yml
@ -2,27 +2,14 @@
 # Shared firewall topology — single source of truth for the host nftables layer
 # (base role) and OPNsense (future). See docs/decisions/020-firewall.md.
-# Zone → subnet (from ADR-007). `public` = the WAN (anywhere) for deliberately public
+# Zone → subnet (from ADR-007).
 # off-site services (askari); home/cluster services use the internal zones only.
 firewall_zones:
  mgmt: 10.10.0.0/24
  srv: 10.20.0.0/24
  lan: 10.30.0.0/24
  iot: 10.40.0.0/24
  guest: 10.50.0.0/24
  public: 0.0.0.0/0
 # Service catalog: <name> → placement (host | group | hosts) + ingress[].
-# askari's public surface (ADR-024 Caddy + ADR-016 NetBird STUN). NOTE: the host
+# Empty until services are built; hosts still get default-deny + the management plane.
-# nftables template renders IPv4 source rules only; askari is reached via its A record
+firewall_catalog: {}
 # (no AAAA), so IPv4-only public rules are sufficient (see the spec's IPv6 note).
 firewall_catalog:
  reverse_proxy:
    host: askari
    ingress:
      - { from: public, port: 80, proto: tcp }
      - { from: public, port: 443, proto: tcp }
  netbird_stun:
    host: askari
    ingress:
      - { from: public, port: 3478, proto: udp }
--- a/inventories/production/group_vars/all/vault.yml
+++ b/inventories/production/group_vars/all/vault.yml
@ -1,108 +1,86 @@
 $ANSIBLE_VAULT;1.1;AES256
-33393537643265363864656666343435633766306366316363663337363630636231646436656530
+32313030663934353361336234373562303537356334346238663836373238366136356331363761
-3032316362373533636163366562396563613735663335370a373239666261633263353963643632
+6337323031666565663430303562646565303533653531640a636662373939363632383838613431
-30396263343765396435376539323833623933353563333363383337366535616365393730643239
+38313365626365373539653266326661393765333737386161666165666534636562353165386537
-3034313633323963630a376334343134306138636234613438373866633730373737623863396463
+3934633033383966360a323965333139643764326236396635383863353437313966326665373537
-32646538636261363363636439626131643865306130623164656366663739333464393564663836
+65396564393130303030643861663964383436396561643666623837306366346333306430306238
-35313431383834383133386335376661346465613465346537353863363836663936393035646366
+66656136626566626262373037623531623633313664376166376161363336353930636538323339
-36393833326437363034646532313263383931316432316132396633333330623035636162626230
+38386564333432353363353663643539343765373662643836646666626339353539323033386230
-31643232306664386364666332396439633934303434636633353262396535396161303361643730
+31613165373035363533383862366638353035653836303737656534623361313064616365643131
-36636230323834393435376263326537326262396366633130623530303637333032613838373938
+64386165653835366137353339396364313661656333333635616338346561363765353934343162
-33623333383539613763646136663466643536653734386263346661653838613034356631353733
+64346462656566376539643030656461363161393936623332373632653731303031393437316636
-39393632616236626566613364356364323434313737383530333364323333383036613039323865
+36626165306161336262356161666531323336343663643661626365396437383230613636356530
-33396561626564353063623238656663386331323832613832323837346136346330613337393862
+62326363383138643162316464396666623332366434336462363531363836313833366237396464
-32356537623934363232373034373838643961343131336263663339643264613366383466613366
+38323635353238653432626361383434646538326531356333393337643066373262663462656466
-30303764626437313065316636633938323035303332356262646661636139653630633565636538
+65373036653265616137666533373930333239303732623832353337343434636434616562336135
-33613861663836333664623433636134663538663065323964383036616430336631636433646562
+38666137353266353130303235616362323633353735373163336138633838633738393637633964
-36393835363838303463356565386365623464326631363339363164396338366531386161646633
+66623866353265316336336566663034306664656365643832616232313732626464316563636335
-64663333633437353335336530306537353038356664623231666362633861376262613564643262
+63653930626565636630326661626561366539303964373933653437356537343361626438313439
-62393061353865333839386232626361663165623038306366363033383333306139316633343266
+35643165636662643463616337323063343633306536346538623331333365366533653634343538
-36356361613438663332653638376262346363613661623633316231316661353166366663616664
+63623261636366303261373338633939363338316463303065613436396163616537666265623439
-36323461653034666131386166333335393438376631326338386635623762663666316461643935
+31383361646531633863623230616635646138653630383537366335633030343530383735616435
-31326638303766626437393634666531303766326539303939343433393066623933623532636166
+35656464393432313563303030626133383761303763653530653837313930303034353136353237
-30636463383237393366306630323739333161373666643962363235613133316361383437643162
+37376366623836646236363062633938666135326631376235323061666465373865396235643937
-33623764383762373539373130333563383636386563373330613633373065333235376166373464
+32633736656539356332336237646137303534343337353139383637623165353338623566666535
-66653635343665656366383439333433366364663734396239326635653839386662323563663465
+30643134303235633362383064376234366235363262396362613731373364306362303634613138
-66633235303738303464383139323163303562643765623166316536363835653362633863646261
+39366230366262363237656631646361356464393266656166386337303663313136666261633836
-30393833316135656462326438633432363965356134396531383465333834346436363235336639
+32306132323239343539396232316564326361626462366561313561393635393233653633646431
-62663566646632383333613036666431326362346464666530383439373132346531316464613533
+39313039313139616262396334613035333633326135346365333537373138396535633137353832
-30663062373066623961316237623933663862613433636461373931643866306564313863613334
+63636335613237623234646234653435616635356637343964656463383864366534363438343938
-37353935343637383133316263663661363463383335636463643932323534393861326635613136
+39626364653832373062323434316134653831336534383934346231656533643435306465393065
-66326664653234636465353539616432386435633838373436333633366336623233363732363262
+31653731653438646361363732303664626438663533393837356562376633643933376132616236
-66666231643661333161613733643234383331616162386136346538373439613430326437333966
+65393432633831313433323930383736316630626230373963653536396637363436643136363962
-33623739626263616235373438303333666237336537626639316561306438373534653161643533
+37326534343237363961326438376137663034356532376433376461363337333562646136616462
-37303533653238346565396562376266666265646666623661393039383961376466396337656636
+61636131376264393236376532356539376536643632623864656331656630353362623133303830
-38343730663837653638653239333334333735666431633639353234326264656462633164346566
+34633461633539643262353263376363613566343261373930623139626364653232363538353330
-61316331623964393763616630633861326236333862653565373931303264316462303932633166
+33633634363232653439656236303262373265613762373165646131383537623438383835383962
-37363735336266316431303464386232353430636566303637393530663435363536323364346236
+33383931626136313036366562363732396561633631643561646536653665333733383261363833
-38376631396465303937656562386166306165316432653133623534336338636233383763666234
+66356461663965373234393237323037356331333339643931313936313234323432613563306630
-36626431316534353462356131316161316162313439326266376438316134653433656335643632
+33306638663839363565636661653830316265393639313065313062666534303039326465373636
-39653434366464613066666161626334643634353337376166323130353564313961626265373337
+64363033323837313030353132383562343337326366626635663439396231393537313932643337
-36636163636538323134353431336166373266333934366462373662323762643061336335646264
+30663031323231313938366436343735326165326433656633336465316630383961626664303536
-38343765343237386665623563383733633264316630326433666663373739373666623030366534
+38633964326431643362626631656131303539613033323039393630353766386339346363663362
-30363366386563326430333465383362633630646633393466623231333366653837643262336134
+33323034396136356362313163376438393739373738366363623636623634316537313461373066
-34396163326335666534366334643539623439336133626232353565306562636564656565646164
+38613062656231363532663133333438663535666566356336316266383763623765346237663838
-34323136663164306466666430613039306231336134323165363736666262356639396438306537
+64336435353437373264346561363265643339306532383539306363653564356362313430333066
-35633439636435313833626432643832636566366633653161326534303234316632393166396533
+65633733633938343830303537383231303036326132376263363531626565633664343038356661
-66613334323533373234393731383034323039656462333833646339353530636466303437643136
+31336139663061656437633138373438663966616338343565396562306638346437353730643664
-64383465353136386435626539353032363632303432633830343365396634336534383761353131
+30373133373863626137313062643062393035653463653231653465333166633063353137633538
-39343438633837316336633934373132306136393635373933623939623863663465316164313966
+62383331303164343236343539396461623738396234653333356632313664616263623061363563
-39323365373438343533623365653761323034633661373339616239346465643639306230636139
+34323165306533323362376161346364316135333535626261353730666131643938306366326263
-35313330363838366436666436313864346232396339613362333866646531363162303238373936
+31313934633137623638316534383234376333396131303034633636323037363732383263326335
-62353536613763313432636662353362313232373261313865636366373366306137373339333439
+32393766343161386537333062643434333333363538323366363231336666383161373432383563
-31376333666538303733353962323239323536643034663662323330373165326433616431383163
+65613537366139643032336230303133623431376231646662643666373532636565393639373930
-61616130653939313535396438346162313038616134323837336634366433303866656361376165
+65336630616462353837666431616662636635333532393331326539306233363539396266653239
-31393331343738373133313764656539626139643630303730343439613137623930356362333634
+31303031303330396632386131623134313536313433623064356636333230373962643339363736
-38663334396335396166383761663866613565643130366135623634343838623739333365653364
+30396130353466373136643935646436613636376636323530643031653334303863376432646534
-32396666306166643130353163323036663831613436376562383865306538653763336332353632
+39343165356232346539366233373135326338343663356164616265336235623332646365633466
-39346138353463316662376363333835386166393836666462323161376633336635356664376133
+35393533373663393762376332396136336236616635616535313336613034346436363665356565
-38326639323932373635653139613165616432336136363866383764393930373732633533306433
+32636536336634613531393434613435613962653862343737373237623261373836386663343831
-63303834386131626366633465393235613432363337386139656561333464303637353539653935
+66656135323838636638353963646638326531343635653937306230323237343933626135356533
-36616538376663383236386561383339616332306137623731626537343765643637343232303230
+66356263636438633164386535333762616438626439343462393833393731643037396662653737
-35393438636361353965353166633833316162376463376338353830386131666238626138666165
+31666361656530383437396230393663616133383764316437623939663631396561343266383766
-37646438663561343831643431303434333138666664373634363038653964363335646165343163
+62373636663631393637393763613337356337633264366434346561343263373931323335643135
-62613938636663613063383338326437333739386137316535366235366261383162656663636130
+31366661623137353336666630633365663764646234343035313130663562636361623532643461
-35613938333763333633636565306239356161383731643864373830646438303137306465376231
+63333961333338623966396662656262323830396439633337663431663235663962666238356630
-35356334303233343634653936323966653961616630633061643765373430386362376437656535
+30353331313462653061373638666235653938623931366466666164343566623238333237353265
-39306630393466343232663632656133356531663935643137353439333261316632653762323232
+30373064353132366634623966306632303832306630383637623465323134633133656333303964
-36323964636534326561626133323530643639386563623435656535386562633635633339343938
+35646637316236303364393363323137616132326437623238336631313530663230333362623633
-35343536646565353936326362623930313739386163383765326330316335636139326665653339
+34383032376538366464363032343262656164376166386237383563613630336666633965653730
-37333030383438363231333663616565303434303334643234353239313837656563363861656661
+64373236396564363164643637623736626532396630313131356563333238643665356166323837
-36313166666566393737636231373634363132623066376437323532353861336338373462323539
+31626338623665623165643763623661666439626435643237336433646132666366623661393832
-35306135363835653733356634646332346461643236613263376664343537333531313561333035
+37306533613966663936373061613331633934623462343236626234306130383738343631303231
-34393938643561613231666434386331393966353730343634343437353566343263653038316430
+32326339323738323537333363313538373266623363363636633462356234363466393263316235
-62336333373336633164626132346534616139333830336535666135613833623734353563353732
+39663033303165656366396334306535643361646663373935303230376466366632373563303231
-31326139386336346332363565303333353135663732613765313034356433363932346263386164
+64323264653036333039663965646630653934376239653236323063656137373830623563336463
-33343636333039346339356261623037316334623236653736386362323536386134633665383237
+37343461373737313539316361623763373733653930626532393565333938333761323631303332
-39393665646231313734393963336634393563366134373233663036363830363265656663646361
+39663530303439616561356561666532653762343339323435636164376664373731343132666539
-64353063653362383435623931343133623434356139363430613935346363386139373134306164
+63626637346563393765303065646564643661636130396439323736343764333633373331653333
-37343931363931613834316665343662393533383730663364396338623933663766396130646566
+66633465343433303038623638323965636533666639643266353163353436393036336639336133
-30626339616537373337303338613931303938323032666634666337626361376130396631376236
+32646664363565326539643763653832313336663262313634343635616333613434373333323036
-35373766366637313661616335383739616636373166366332336161316466323731383836643263
+61366435376265336638326132333439613431353633653762653836386235643965366436363866
-30623564353934636561323734666663623363666365323734633030643664643232633638636437
+35626664393139386337353335343930306130356335623131646261656434303966656431623231
-63373664383863333032383739386238353239666162656436646439356239336266393966366434
+66643730393430363838626434663933613536343533316262373564666665373663336363623166
-38613437353931633138343865313831303264653732313764336564623065613339326239356232
+63363037373634383961373035633239646235316137363036333765313864643365396165643432
-32376536616635346536633361663463663231636566333062636261653761383664646639646335
+36623465313036376261393566383539336638363836633232656136656533396663323366313062
-31656236343930386135346266353533393035646265383437313763653530666136653433353964
+64616632373333313466356362336234346564373832316433373963623263316635
 30326434323038643565356239646533323134356361656365656339383635303065633537656532
 64633663653138653439623238636532373265386362643238646433616531343962343762623238
 30663966666434643361313835373835633064376536636436636465383763356663313862393138
 62346431663864316335386433396535386137366462666334623837666233626661333565613766
 35656264383936326638613431646236643131396337626231326565653233393061643530333830
 37396130303862613034393332623665376464353831366562353865373065336366393939623036
 30633637336564326466326562653966633265343062616536363738363239626637373730643839
 30336238363535373664643463353035313735633635666562653063386139366464626432633931
 33393436393435386637333135356630373464646634346364326164303038393664313864623633
 62383733366430373535633531356162666164653030326232336137633630346237386230323166
 37346365373632636639363833366461663265313235633663616432643835646133626365616531
 31646531643134633531353039343832643336373735343264653437373662633465613861646630
 34323131306236343566343736326264663339363537346539353434303866343036303761656566
 33386438343539656535306330346433643636343063336433323061313762613839633665363063
 66363233343337626631323038363336636335333965353636356436373031356262343734386565
 62396436303238373837373334663130396631373034356462393931653935633161356633383131
 37376130333232383235633765366636653330376663343566343833323861313236623333653834
 61363261326266353935333738626530396433306331326339623533393738663437343131656462
 61396533636334613363363161646366326631643138313161393438303261336537383733343630
 35383739353136613162326630383961623463626561313033613664643931366435326635383838
 30333066396132396238633837316636373062316264336530326133623465346264356530363537
 643734623039346364383038363937353764
--- a/inventories/production/group_vars/control/vars.yml
+++ b/inventories/production/group_vars/control/vars.yml
@ -12,28 +12,7 @@ dev_env__users:
 # group only.
 ansible_user: sjat
 # ubongo's AI-worker; passwordless sudo for the claude user (ADR-015 amended).
 base__ai_worker_user: claude
 # ubongo is a NetBird mesh peer (ADR-016, M5) — enrol the agent via base's `mesh` concern.
 # Enrollment only; the host firewall default-deny stays deferred (the mesh-hardening
 # follow-on), so this brings up wt0 without changing SSH exposure.
 base__mesh_enabled: true
 # Mesh-hardening 2/3 (2026-06-19, ADR-020/021): apply base's host firewall to ubongo as
 # INPUT-only default-deny — harden the inbound surface, leave the forward chain permissive so
 # Docker egress + the libvirt-NAT integration harness keep working. sshd is unchanged
 # (nftables scopes inbound), so there is no boot-race. Reach ubongo over wt0 (mesh), the
 # ssh-from-control self-path (base__firewall_control_addr, group_vars/all = 10.20.10.151), or
 # mamba on the LAN. Break-glass: the physical console. (base__firewall_apply defaults true.)
 base__firewall_input_only: true
 # DNS-resilience (ADR-016 availability / R8): pin the coordinator FQDN to askari's stable WAN
 # IP in /etc/hosts so a local-DNS hiccup (the 2026-06-18 incident class) can't strand ubongo's
 # mesh. askari (offsite_hosts) is exempt — it reaches the coordinator locally.
 base__mesh_coordinator_pin: "77.42.120.136"
 base__firewall_admin_addrs:
  - "10.20.10.50"   # mamba over the LAN (NetBird off). Raw DHCP lease — revisit with an
                    # OPNsense reservation when OPNsense-as-code lands; backstopped by wt0.
  - "10.20.10.17"   # 2nd operator workstation (MAC bc:0f:f3:c8:4a:8a). Raw lease — ditto.
--- a/inventories/production/group_vars/offsite_hosts/vars.yml
+++ b/inventories/production/group_vars/offsite_hosts/vars.yml
@ -1,21 +1,6 @@
 ---
 # Off-site hosts (askari). askari runs the NetBird coordinator AND is a mesh peer
-# (ADR-016, M5).
+# (ADR-016, M5) — enrol the agent via base's `mesh` concern. Enrollment only; the
-#
+# host firewall default-deny + moving askari's SSH onto wt0 stay deferred to the
-# Mesh-hardening REDESIGN (2026-06-19): the 2026-06-17 attempt was backed out (forward
+# mesh-hardening follow-on.
 # `policy drop` broke Docker on reboot; wt0-only sshd left no break-glass; ip_nonlocal_bind
 # did not beat the boot-race). The redesign mirrors the proven ubongo 2/3 pattern:
 #   - INPUT-only default-deny (base__firewall_input_only) — forward stays `policy accept`
 #     so Docker container forwarding/NAT survive a reboot;
 #   - SSH scoped by the host firewall (iifname wt0 + admin-addr), NOT a sshd ListenAddress
 #     change — base__ssh_listen_mesh_only stays false, so there is no boot-race;
 #   - WAN :22 is DELIBERATELY left open from ubongo's WAN IP (base__firewall_admin_addrs)
 #     as the permanent non-mesh break-glass — the coordinator-host exception (a host's only
 #     management path must never depend on a service that host itself hosts).
 # Spec: docs/superpowers/specs/2026-06-19-mesh-hardening-askari-redesign-design.md
 base__mesh_enabled: true
 base__firewall_apply: true
 base__firewall_input_only: true     # forward stays `policy accept` → Docker-safe
 base__ssh_listen_mesh_only: false   # no sshd ListenAddress change → no boot-race
 base__firewall_admin_addrs:
  - 91.226.145.80   # ubongo's (static) WAN IP — the permanent non-mesh SSH break-glass
--- a/inventories/production/host_vars/askari.yml
+++ b/inventories/production/host_vars/askari.yml
@ -1,7 +0,0 @@
 ---
 # Manage askari over the NetBird mesh (wt0). Overrides the TF-generated WAN `ansible_host`
 # in offsite.yml (host_vars are NOT regenerated by tf_to_inventory.py). The WAN :22 path
 # (Hetzner Cloud Firewall + base__firewall_admin_addrs = ubongo's WAN) stays as the
 # break-glass; the Hetzner web console is the IP-independent ultimate fallback.
 # Spec: docs/superpowers/specs/2026-06-19-mesh-hardening-askari-redesign-design.md
 ansible_host: 100.99.226.39
--- a/playbooks/workstation.yml
+++ b/playbooks/workstation.yml
@ -8,5 +8,3 @@
  roles:
    - role: dev_env
      tags: [dev_env]
    - role: integration_test
      tags: [integration_test]
--- a/roles/base/defaults/main.yml
+++ b/roles/base/defaults/main.yml
@ -11,14 +11,6 @@ base__firewall_rollback_timeout: 45  # seconds before the auto-revert fires on a
 base__firewall_confirm_timeout: 20   # seconds to re-establish a fresh connection post-apply
 base__firewall_dropin_dir: /etc/nftables.d
 base__firewall_apply: true           # set false to render+validate without applying (CI/Molecule)
 base__firewall_input_only: false     # true → the forward chain is `policy accept` (host-local
                                     # INPUT filtering only). For hosts that forward/route
                                     # container or NAT traffic (the control node's Docker +
                                     # libvirt-NAT) where a forward default-deny would break
                                     # them. Real service hosts keep this false (forward drop).
 base__firewall_admin_addrs: []       # extra LAN source IPs allowed to SSH, besides wt0 +
                                     # ssh-from-control. For an operator workstation reaching
                                     # the host over the LAN (no mesh). Key-gated. (ADR-021)
 # SSH hardening + fail2ban (ADR-002) — `hardening` concern.
 base__ssh_password_authentication: "no"
@ -29,19 +21,6 @@ base__fail2ban_findtime: 10m
 # base__ssh_authorised_keys lives in group_vars/all/vars.yml (per-person control keys).
 base__ssh_authorised_keys: []
 # SSH listen-on-mesh (mesh-hardening 1/3, ADR-016/021). Opt-in: when true, sshd binds
 # ListenAddress to this host's mesh IP only (not the WAN). The IP comes from the live wt0
 # fact (ansible_facts.wt0.ipv4.address); base__ssh_listen_addr overrides it. ip_nonlocal_bind
 # lets sshd bind the mesh IP before wt0 exists at boot. Fails closed: the play asserts a
 # non-empty address rather than silently listening on all interfaces.
 base__ssh_listen_mesh_only: false
 base__ssh_listen_addr: ""
 # The automation/AI-worker user granted passwordless sudo (ADR-015 amended / ADR-021).
 # Empty = no AI-worker sudo. Set per-group (e.g. group_vars/control: claude). The user's
 # password should be locked so NOPASSWD is its only sudo path; actions are auditd-attributed.
 base__ai_worker_user: ""
 # NetBird mesh agent enrollment (ADR-016). Opt-in: default off so applying `base` to a
 # host not on the mesh is a no-op for this concern. The live actions (apt install over
 # the network, `netbird up` against the coordinator) are additionally gated by
@ -51,9 +30,3 @@ base__mesh_manage: true
 base__mesh_management_url: "https://netbird.askari.wingu.me"
 base__mesh_setup_key: "{{ vault.netbird.setup_key }}"
 base__mesh_version: "0.72.4"   # match the coordinator; exact apt pin confirmed on-host at deploy
 # DNS-resilience (ADR-016 availability / accepted-risk R8): when set to the coordinator's
 # stable IP, pin the coordinator FQDN (derived from base__mesh_management_url) in /etc/hosts
 # so a managed mesh host survives a local-DNS hiccup (the 2026-06-18 incident class). Empty
 # = no pin. The coordinator host itself (askari/offsite_hosts) is exempt — leave it empty.
 base__mesh_coordinator_pin: ""
--- a/roles/base/molecule/default/converge.yml
+++ b/roles/base/molecule/default/converge.yml
@ -6,21 +6,15 @@
  vars:
    base__firewall_apply: false
    base__firewall_control_addr: 10.10.0.99   # test control-node LAN address
    base__firewall_admin_addrs:
      - "10.30.0.77"   # fixture: an operator-workstation LAN source (admin-addr SSH allow)
    # Exercise the mesh concern's include path with the live actions gated off, so it
    # runs hermetically (no coordinator/key needed) and must be a clean no-op.
    base__mesh_enabled: true
    base__mesh_manage: false
    base__mesh_setup_key: "dummy-molecule-key"
    base__mesh_coordinator_pin: "203.0.113.9"   # fixture IP (TEST-NET-3); pins FQDN from base__mesh_management_url
    base__ssh_listen_mesh_only: true
    base__ssh_listen_addr: "100.99.0.1"   # fixture mesh IP (no wt0 in the container)
    firewall_zones:
      lan: 10.30.0.0/24
      srv: 10.20.0.0/24
      mgmt: 10.10.0.0/24
      public: 0.0.0.0/0
    firewall_catalog:
      reverse_proxy:
        host: instance
@ -30,9 +24,5 @@
        host: instance
        ingress:
          - { from: srv, port: 2342, proto: tcp }
      netbird_stun:
        host: instance
        ingress:
          - { from: public, port: 3478, proto: udp }
  roles:
    - role: base
--- a/roles/base/molecule/default/molecule.yml
+++ b/roles/base/molecule/default/molecule.yml
@ -19,16 +19,6 @@ platforms:
    volumes:
      - /sys/fs/cgroup:/sys/fs/cgroup:rw
    command: /lib/systemd/systemd
    # Pre-create the namespaced sysctl so ansible.posix.sysctl can set it (mesh-hardening 1/3).
    # The container image lacks procps so the sysctl binary is absent; we also install it in
    # prepare.yml. This entry ensures the value exists in the container's netns at startup.
    sysctls:
      net.ipv4.ip_nonlocal_bind: "0"
    # ubongo's /etc/resolv.conf points to the NetBird mesh DNS (100.99.x.x), which Docker
    # containers can't reach (no wt0). Override to a public resolver so prepare.yml apt tasks
    # can update the cache and install packages.
    dns_servers:
      - 8.8.8.8
 provisioner:
  name: ansible
--- a/roles/base/molecule/default/prepare.yml
+++ b/roles/base/molecule/default/prepare.yml
@ -1,11 +0,0 @@
 ---
 - name: Prepare
  hosts: all
  become: true
  gather_facts: false
  tasks:
    - name: Install procps so ansible.posix.sysctl can find the sysctl binary
      ansible.builtin.apt:
        name: procps
        state: present
        update_cache: true
--- a/roles/base/molecule/default/verify.yml
+++ b/roles/base/molecule/default/verify.yml
@ -38,33 +38,12 @@
          - "'tcp dport 2342 accept' in nft"
        fail_msg: "missing srv->2342 rule for photoprism"
    - name: Assert the public->stun:3478/udp ingress rule (0.0.0.0/0 source)
      ansible.builtin.assert:
        that:
          - "'0.0.0.0/0' in nft"
          - "'udp dport 3478 accept' in nft"
        fail_msg: "missing public->3478/udp rule for netbird_stun"
    - name: Assert the docker_host extension hook is present
      ansible.builtin.assert:
        that:
          - "'include \"/etc/nftables.d/*.nft\"' in nft"
        fail_msg: "missing drop-in include hook"
    - name: Assert the forward chain defaults to policy drop (input_only off)
      ansible.builtin.assert:
        that:
          - "'hook forward priority 0; policy drop;' in nft"
        fail_msg: >-
          forward chain must default to policy drop when base__firewall_input_only is
          false (container isolation stays the norm on real service hosts)
    - name: Assert the admin-addr SSH allow rule (operator workstation on the LAN)
      ansible.builtin.assert:
        that:
          - "'ip saddr 10.30.0.77 tcp dport 22 accept' in nft"
        fail_msg: "missing admin-addr SSH allow rule from base__firewall_admin_addrs"
    - name: Syntax-check the rendered ruleset (no apply)
      ansible.builtin.command: nft -c -f /etc/nftables.conf
      changed_when: false
@ -79,18 +58,6 @@
      ansible.builtin.command: grep -q '^\[sshd\]' /etc/fail2ban/jail.d/sshd.local
      changed_when: false
    - name: ListenAddress bound to the fixture mesh IP (mesh-only mode)
      ansible.builtin.command: grep -q '^ListenAddress 100.99.0.1$' /etc/ssh/sshd_config.d/10-boma.conf
      changed_when: false
    - name: Sysctl drop-in for ip_nonlocal_bind is present
      ansible.builtin.command: grep -q '^net.ipv4.ip_nonlocal_bind=1' /etc/sysctl.d/60-boma-nonlocal-bind.conf
      changed_when: false
    - name: Kernel ip_nonlocal_bind is live in this netns
      ansible.builtin.command: sysctl -n net.ipv4.ip_nonlocal_bind
      register: _nonlocal
      changed_when: false
      failed_when: _nonlocal.stdout | trim != '1'
    # mesh concern: enabled but manage=false must be a clean no-op (no install/enrol)
    - name: Check whether netbird got installed
      ansible.builtin.command: which netbird
@ -103,14 +70,3 @@
          - _nb.rc != 0
        fail_msg: "netbird must not be installed when base__mesh_manage is false"
        success_msg: "mesh concern is a clean no-op under manage=false"
    - name: Read /etc/hosts (coordinator pin)
      ansible.builtin.slurp:
        src: /etc/hosts
      register: _etchosts
    - name: Assert the coordinator FQDN is pinned to the fixture IP (DNS-resilience / R8)
      ansible.builtin.assert:
        that:
          - "'203.0.113.9 netbird.askari.wingu.me' in (_etchosts.content | b64decode)"  # slurp content is always base64
        fail_msg: "base__mesh_coordinator_pin did not render the /etc/hosts coordinator pin"
        success_msg: "coordinator FQDN pinned in /etc/hosts"
--- a/roles/base/tasks/main.yml
+++ b/roles/base/tasks/main.yml
@ -23,13 +23,6 @@
      tags: [hardening]
  tags: [hardening]
 - name: AI-worker operational access (sudoers drop-in)
  ansible.builtin.include_tasks:
    file: operational_access.yml
    apply:
      tags: [users]
  tags: [users]
 - name: NetBird mesh enrollment
  ansible.builtin.include_tasks:
    file: mesh.yml
--- a/roles/base/tasks/mesh.yml
+++ b/roles/base/tasks/mesh.yml
@ -64,19 +64,3 @@
    - "'Management: Connected' not in (_netbird_status.stdout | default(''))"
  no_log: true   # setup key is on the argv
  tags: [mesh]
 - name: Pin the NetBird coordinator FQDN in /etc/hosts (DNS-resilience, ADR-016 availability / R8)
  ansible.builtin.lineinfile:
    path: /etc/hosts
    regexp: '^\S+\s+{{ _coordinator_fqdn | regex_escape }}\s*$'
    line: "{{ base__mesh_coordinator_pin }} {{ _coordinator_fqdn }}"
    state: present
    # /etc/hosts is bind-mounted in the Docker Molecule container (atomic rename → EBUSY);
    # this is a fallback only — production VMs still write atomically.
    unsafe_writes: true
  vars:
    _coordinator_fqdn: "{{ base__mesh_management_url | regex_replace('^https?://', '') | regex_replace('[:/].*', '') }}"
  when:
    - base__mesh_enabled | bool
    - base__mesh_coordinator_pin | length > 0
  tags: [mesh]
--- a/roles/base/tasks/operational_access.yml
+++ b/roles/base/tasks/operational_access.yml
@ -1,11 +0,0 @@
 ---
 - name: Grant the AI-worker user passwordless sudo (ADR-015 amended / ADR-021)
  ansible.builtin.copy:
    content: "{{ base__ai_worker_user }} ALL=(ALL) NOPASSWD:ALL\n"
    dest: "/etc/sudoers.d/{{ base__ai_worker_user }}-ai-worker"
    owner: root
    group: root
    mode: "0440"
    validate: "visudo -cf %s"
  when: base__ai_worker_user | length > 0
  tags: [users]
--- a/roles/base/tasks/ssh.yml
+++ b/roles/base/tasks/ssh.yml
@ -1,31 +1,4 @@
 ---
 - name: Resolve the sshd mesh listen address (override, else live wt0 fact)
  ansible.builtin.set_fact:
    base__ssh_listen_addr_resolved: >-
      {{ base__ssh_listen_addr
         or ansible_facts.get('wt0', {}).get('ipv4', {}).get('address', '') }}
  when: base__ssh_listen_mesh_only | bool
 - name: Fail closed — refuse to render sshd without a known mesh address
  ansible.builtin.assert:
    that:
      - base__ssh_listen_addr_resolved | length > 0
    fail_msg: >-
      base__ssh_listen_mesh_only is true but no mesh address resolved (set
      base__ssh_listen_addr or ensure wt0 is up so its fact is gathered). Refusing to
      render sshd ListenAddress empty (which would listen on ALL interfaces).
  when: base__ssh_listen_mesh_only | bool
 - name: Allow sshd to bind the mesh IP before wt0 exists at boot
  ansible.posix.sysctl:
    name: net.ipv4.ip_nonlocal_bind
    value: "1"
    sysctl_set: true
    state: present
    reload: true
    sysctl_file: /etc/sysctl.d/60-boma-nonlocal-bind.conf
  when: base__ssh_listen_mesh_only | bool
 - name: Ensure openssh-server is installed
  ansible.builtin.apt:
    name: openssh-server
--- a/roles/base/templates/nftables.conf.j2
+++ b/roles/base/templates/nftables.conf.j2
@ -12,16 +12,13 @@ table inet filter {
 {% if base__firewall_control_addr %}
    ip saddr {{ base__firewall_control_addr }} tcp dport {{ base__firewall_ssh_port }} accept
 {% endif %}
 {% for addr in base__firewall_admin_addrs %}
    ip saddr {{ addr }} tcp dport {{ base__firewall_ssh_port }} accept
 {% endfor %}
    ip protocol icmp accept
    ip6 nexthdr ipv6-icmp accept
 {% for r in base__firewall_resolved %}
    ip saddr { {{ r.sources | join(', ') }} } {{ r.proto }} dport {{ r.port }} accept
 {% endfor %}
  }
-  chain forward { type filter hook forward priority 0; policy {{ 'accept' if base__firewall_input_only | bool else 'drop' }}; }
+  chain forward { type filter hook forward priority 0; policy drop; }
  chain output  { type filter hook output  priority 0; policy accept; }
 }
--- a/roles/base/templates/sshd_hardening.conf.j2
+++ b/roles/base/templates/sshd_hardening.conf.j2
@ -3,6 +3,3 @@ PasswordAuthentication {{ base__ssh_password_authentication }}
 PermitRootLogin {{ base__ssh_permit_root_login }}
 PubkeyAuthentication yes
 KbdInteractiveAuthentication no
 {% if base__ssh_listen_mesh_only | bool %}
 ListenAddress {{ base__ssh_listen_addr_resolved }}
 {% endif %}
--- a/roles/docker_host/defaults/main.yml
+++ b/roles/docker_host/defaults/main.yml
@ -1,16 +1,8 @@
 ---
-# Docker engine install (ADR-004). Cluster-specific daemon hardening is deferred to when
+# Docker engine install (ADR-004). Cluster-specific daemon hardening + nftables.d
-# the cluster exists.
+# integration are deferred to when the cluster + host firewall exist.
 docker_host__packages:
  - docker-ce
  - docker-ce-cli
  - containerd.io
  - docker-compose-plugin
 # Container-forward nftables drop-in (FRICTION 2026-06-17 #1 / ADR-025). base's inet-filter
 # forward chain is `policy drop`; on a Docker host that kills published-port DNAT + inter-
 # container forwarding ON REBOOT (nftables loads default-deny before dockerd). This drop-in
 # (loaded via base's /etc/nftables.d/*.nft include) appends the accepts so a rebooted Docker
 # host keeps forwarding. Only meaningful where base__firewall_apply is true.
 docker_host__forward_dropin: true
 docker_host__nftables_dropin_dir: /etc/nftables.d   # must match base__firewall_dropin_dir
--- a/roles/docker_host/tasks/main.yml
+++ b/roles/docker_host/tasks/main.yml
@ -37,22 +37,3 @@
    state: present
    update_cache: true
  tags: [packages]
 - name: Ensure the nftables drop-in dir exists (for the container-forward rules)
  ansible.builtin.file:
    path: "{{ docker_host__nftables_dropin_dir }}"
    state: directory
    mode: "0755"
  when: docker_host__forward_dropin | bool
  tags: [firewall]
 - name: Install the container-forward nftables drop-in (reboot-safe Docker forwarding)
  ansible.builtin.template:
    src: 10-docker-forward.nft.j2
    dest: "{{ docker_host__nftables_dropin_dir }}/10-docker-forward.nft"
    mode: "0644"
  when: docker_host__forward_dropin | bool
  # Not reloaded here: a running host already forwards via Docker's runtime rules, so the
  # drop-in only needs to protect the NEXT boot (loaded by nftables.service). Reloading nft
  # now would flush Docker's NAT (FRICTION 2026-06-17 #4); the boot loads it cleanly.
  tags: [firewall]
--- a/roles/docker_host/templates/10-docker-forward.nft.j2
+++ b/roles/docker_host/templates/10-docker-forward.nft.j2
@ -1,14 +0,0 @@
 # {{ ansible_managed }}
 # Allow container forwarding through base's default-deny forward chain (ADR-025 / FRICTION
 # 2026-06-17 #1). Appended to base's `table inet filter` / `chain forward` via the
 # /etc/nftables.d/*.nft include, and loaded by nftables.service at boot — exactly when the
 # bug bit (default-deny forward loading before dockerd on reboot).
 table inet filter {
  chain forward {
    ct state established,related accept
    iifname "docker0" accept
    oifname "docker0" accept
    iifname "br-*" accept
    oifname "br-*" accept
  }
 }
--- a/roles/integration_test/README.md
+++ b/roles/integration_test/README.md
@ -1,35 +0,0 @@
 # integration_test
 Installs the KVM/libvirt substrate on the control node (`ubongo`) so the agent
 can boot throwaway Debian VMs for local integration testing (ADR-025).
 This is a **non-service** role — no SECURITY/VERIFY/ACCESS/BACKUP files are
 required. It does **not** make ubongo a production hypervisor; it only provides
 the tooling needed to spin up short-lived test VMs (see ADR-015).
 ## Target group
 `control` (i.e. `ubongo`)
 ## What it does
 1. Installs QEMU/KVM, libvirt daemon + clients, `virt-install`, and
   cloud-image tools (`cloud-image-utils`, `genisoimage`).
 2. Enables and starts `libvirtd`.
 3. Adds the configured users (`sjat`, `claude`) to the `libvirt` and `kvm`
   groups so VMs can be managed without `sudo`.
 4. Creates `/var/lib/boma-integration` (owned `root:libvirt`, mode `2775`) as
   the cache directory for golden images and overlays.
 ## Defaults
 | Variable                      | Default                       | Purpose                          |
 |-------------------------------|-------------------------------|----------------------------------|
 | `integration_test__packages`  | see `defaults/main.yml`       | APT packages to install          |
 | `integration_test__users`     | `[sjat, claude]`              | Users granted libvirt/kvm access |
 | `integration_test__cache_dir` | `/var/lib/boma-integration`   | Image/overlay cache directory    |
 ## Related decisions
 - [ADR-025](../../docs/decisions/025-local-vm-integration-testing.md) — local VM integration testing
 - [ADR-015](../../docs/decisions/015-control-host.md) — control host scope (ubongo is not a hypervisor)
--- a/roles/integration_test/defaults/main.yml
+++ b/roles/integration_test/defaults/main.yml
@ -1,20 +0,0 @@
 ---
 # integration_test — installs the local KVM/libvirt substrate on the control node
 # (ubongo) so the agent can run throwaway VM integration tests (ADR-025). Non-service
 # role; applied to the `control` group. Not a production hypervisor (ADR-015).
 integration_test__packages:
  - qemu-system-x86      # KVM
  - qemu-utils           # qemu-img (overlays)
  - libvirt-daemon-system
  - libvirt-clients      # virsh
  - virt-install         # virt-install (trixie: the real pkg; `virtinst` is transitional)
  - cloud-image-utils    # cloud-localds (NoCloud seed)
  - genisoimage          # cloud-localds fallback
 # Users granted libvirt/kvm access (run VMs without sudo).
 integration_test__users:
  - sjat
  - claude
 # Where the golden image + overlays live (outside the repo).
 integration_test__cache_dir: "/var/lib/boma-integration"
 # nftables drop-in dir — must match base__firewall_dropin_dir (base role default: /etc/nftables.d)
 integration_test__nftables_dropin_dir: /etc/nftables.d
--- a/roles/integration_test/handlers/main.yml
+++ b/roles/integration_test/handlers/main.yml
@ -1,15 +0,0 @@
 ---
 - name: Reload nftables
  ansible.builtin.service:
    name: nftables
    state: reloaded
  listen: "integration_test | reload nftables"
  register: _nft_reload
  # nftables is absent from the Molecule Docker container; ignore "not found" errors there.
  # On real hosts where base has applied nftables, failures propagate normally.
  failed_when:
    - _nft_reload.failed
    - >-
      'Could not find the requested service nftables' not in (_nft_reload.msg | default(''))
      and 'nftables.service not found' not in (_nft_reload.msg | default(''))
      and 'Unit nftables.service not found' not in (_nft_reload.msg | default(''))
--- a/roles/integration_test/meta/main.yml
+++ b/roles/integration_test/meta/main.yml
@ -1,14 +0,0 @@
 ---
 galaxy_info:
  author: sjat
  description: >-
    Installs the KVM/libvirt substrate on the control node (ubongo) to enable
    local VM integration testing (ADR-025). Non-service role; not a production
    hypervisor (ADR-015).
  license: MIT
  min_ansible_version: "2.17"
  platforms:
    - name: Debian
      versions:
        - trixie
 dependencies: []
--- a/roles/integration_test/molecule/default/converge.yml
+++ b/roles/integration_test/molecule/default/converge.yml
@ -1,21 +0,0 @@
 ---
 # KVM/libvirt APT packages cannot be installed in the Docker Molecule container
 # (no internet; KVM unusable in a container). This converge exercises only the
 # nftables drop-in rendering via tasks_from, which IS meaningful in a container.
 # The full role (packages/libvirt) is exercised by make test-integration.
 #
 # Coverage split:
 #   Docker Molecule (this file): nftables drop-in rendering only.
 #   make test-integration (ADR-025, real KVM): libvirt/KVM package install, cache
 #     dir creation, and end-to-end VM lifecycle — the role's substrate tasks.
 # The Docker scenario intentionally covers only the firewall drop-in; substrate
 # coverage lives in the real-KVM integration harness, not here.
 - name: Converge
  hosts: all
  become: true
  gather_facts: true
  tasks:
    - name: Include integration_test firewall tasks
      ansible.builtin.include_role:
        name: integration_test
        tasks_from: firewall.yml
--- a/roles/integration_test/molecule/default/molecule.yml
+++ b/roles/integration_test/molecule/default/molecule.yml
@ -1,31 +0,0 @@
 ---
 dependency:
  name: galaxy
  options:
    requirements-file: ../../requirements.yml
 driver:
  name: docker
 platforms:
  - name: instance
    # Project-owned image built from .docker/molecule-debian13/Dockerfile
    # and hosted in the Forgejo container registry.
    # Build/push with: make molecule-image / make molecule-image-push
    image: forgejo.nyumbani.baobab.band/sjat/molecule-debian13:latest
    pre_build_image: true
    privileged: true          # required for systemd
    cgroupns_mode: host
    volumes:
      - /sys/fs/cgroup:/sys/fs/cgroup:rw
    command: /lib/systemd/systemd
 provisioner:
  name: ansible
  inventory:
    host_vars:
      instance:
        ansible_user: root
 verifier:
  name: ansible
--- a/roles/integration_test/molecule/default/prepare.yml
+++ b/roles/integration_test/molecule/default/prepare.yml
@ -1,14 +0,0 @@
 ---
 # The Molecule Docker image ships with /var/lib/apt/lists/ cleared to minimise size.
 # KVM/libvirt packages cannot be installed in a container; converge only includes the
 # role's firewall.yml tasks. Pre-create /etc/nftables.d so the drop-in template task succeeds.
 - name: Prepare
  hosts: all
  become: true
  gather_facts: false
  tasks:
    - name: Create nftables drop-in dir (normally created by the config task)
      ansible.builtin.file:
        path: /etc/nftables.d
        state: directory
        mode: "0755"
--- a/roles/integration_test/molecule/default/verify.yml
+++ b/roles/integration_test/molecule/default/verify.yml
@ -1,18 +0,0 @@
 ---
 # Package-install and cache-dir tasks are not run (converge includes only the role's
 # firewall.yml tasks; KVM/libvirt packages cannot be fetched in the Docker container).
 # This scenario verifies the nftables drop-in renders correctly.
 - name: Verify
  hosts: all
  become: true
  gather_facts: false
  tasks:
    - name: Read the libvirt bridge nftables drop-in
      ansible.builtin.slurp:
        src: /etc/nftables.d/10-libvirt-boma.nft
      register: _dropin
    - name: Assert drop-in contains virbr-boma accept rule
      ansible.builtin.assert:
        that:
          - "'virbr-boma' in (_dropin.content | b64decode)"
          - "'accept' in (_dropin.content | b64decode)"
--- a/roles/integration_test/tasks/firewall.yml
+++ b/roles/integration_test/tasks/firewall.yml
@ -1,8 +0,0 @@
 ---
 - name: Install the libvirt bridge nftables drop-in (virbr-boma input allow)
  ansible.builtin.template:
    src: 10-libvirt-boma.nft.j2
    dest: "{{ integration_test__nftables_dropin_dir }}/10-libvirt-boma.nft"
    mode: "0644"
  notify: "integration_test | reload nftables"
  tags: [firewall]
--- a/roles/integration_test/tasks/main.yml
+++ b/roles/integration_test/tasks/main.yml
@ -1,35 +0,0 @@
 ---
 - name: Install the KVM/libvirt substrate
  ansible.builtin.apt:
    name: "{{ integration_test__packages }}"
    state: present
    update_cache: true
    cache_valid_time: 3600
  tags: [packages]
 - name: Enable and start libvirtd
  ansible.builtin.systemd:
    name: libvirtd
    enabled: true
    state: started
  tags: [config]
 - name: Grant users libvirt + kvm access
  ansible.builtin.user:
    name: "{{ item }}"
    groups: [libvirt, kvm]
    append: true
  loop: "{{ integration_test__users }}"
  tags: [users]
 - name: Create the integration cache dir
  ansible.builtin.file:
    path: "{{ integration_test__cache_dir }}"
    state: directory
    owner: root
    group: libvirt
    mode: "2775"
  tags: [config]
 - name: Import firewall tasks
  ansible.builtin.import_tasks: firewall.yml
--- a/roles/integration_test/templates/10-libvirt-boma.nft.j2
+++ b/roles/integration_test/templates/10-libvirt-boma.nft.j2
@ -1,12 +0,0 @@
 # {{ ansible_managed }}
 # Allow all traffic arriving on the libvirt integration bridge (notably guest DHCP/DNS)
 # to pass base's inet filter input default-deny chain (ADR-025). nftables multi-table
 # semantics mean libvirt's own `ip filter` table accept is not enough — base's
 # `inet filter` input policy drop still drops bridge traffic without this drop-in.
 #
 # Bridge name "virbr-boma" must match NET_XML in scripts/integration-vm.py.
 table inet filter {
  chain input {
    iifname "virbr-boma" accept
  }
 }
--- a/roles/netbird_coordinator/README.md
+++ b/roles/netbird_coordinator/README.md
@ -46,7 +46,6 @@ upstream support; WS/gRPC need long timeouts (Caddy sets none by default).
 | `netbird_coordinator__domain` | `netbird.askari.wingu.me` | Public hostname; feeds `exposedAddress`, the OIDC issuer, redirect URIs, and the dashboard endpoints |
 | `netbird_coordinator__trusted_proxies` | `["172.16.0.0/12"]` | Source ranges NetBird trusts `X-Forwarded-*` from (`server.reverseProxy.trustedHTTPProxies`). Must cover Caddy's source IP on the boma network — verify the actual bridge subnet at deploy |
 | `netbird_coordinator__manage` | `true` | Set `false` in Molecule to render templates without a Docker daemon |
 | `netbird_coordinator__disable_geolocation` | `true` | sets `NB_DISABLE_GEOLOCATION` so a no-egress startup can't FATAL the server on the GeoLite2 download (FRICTION 2026-06-17 #4) |
 Production overrides live in `inventories/production/group_vars/`.
--- a/roles/netbird_coordinator/defaults/main.yml
+++ b/roles/netbird_coordinator/defaults/main.yml
@ -6,13 +6,6 @@ netbird_coordinator__dashboard_image: "netbirdio/dashboard:v2.39.0"
 netbird_coordinator__base_dir: /opt/services/netbird
 netbird_coordinator__domain: netbird.askari.wingu.me
 # Disable NetBird's GeoLite2 geolocation (download + lookups). boma uses no geo posture
 # (ACL is Allow-All), and the combined server treats a failed GeoLite2 download as FATAL —
 # so a transient egress loss (NAT wiped on `nft flush`, or the boot window before Docker
 # re-adds NAT) would crash-loop the whole control plane (FRICTION 2026-06-17 #4). Disabling
 # removes that dependency. Revisit if a future ACL sub-project wants geo-based posture.
 netbird_coordinator__disable_geolocation: true
 # Source IP ranges Caddy fronts NetBird from, rendered into config.yaml
 # server.reverseProxy.trustedHTTPProxies. NetBird trusts X-Forwarded-* only from
 # these. MUST cover the Caddy container's source IP on the boma Docker network —
--- a/roles/netbird_coordinator/molecule/default/verify.yml
+++ b/roles/netbird_coordinator/molecule/default/verify.yml
@ -30,12 +30,3 @@
          - "'v2.39.0' in (_compose.content | b64decode)"
        fail_msg: "docker-compose.yml is missing pinned image tags"
        success_msg: "docker-compose.yml pins both image tags"
    - name: "Assert geolocation is disabled (FRICTION 2026-06-17 #4 — no geo-DB download FATAL)"
      ansible.builtin.assert:
        that:
          - "'NB_DISABLE_GEOLOCATION: \"true\"' in (_compose.content | b64decode)"
        fail_msg: >-
          compose must set NB_DISABLE_GEOLOCATION=true so a no-egress startup can't FATAL
          the coordinator on the GeoLite2 download
        success_msg: "geolocation disabled in compose"
--- a/roles/netbird_coordinator/tasks/main.yml
+++ b/roles/netbird_coordinator/tasks/main.yml
@ -4,8 +4,6 @@
    path: "{{ netbird_coordinator__base_dir }}"
    state: directory
    mode: "0750"
  # create the scaffold even in --check so dry-run can evaluate templates + compose (idempotent mkdir)
  check_mode: false
  tags: [config]
 - name: Render the combined server config
--- a/roles/netbird_coordinator/templates/docker-compose.yml.j2
+++ b/roles/netbird_coordinator/templates/docker-compose.yml.j2
@ -16,10 +16,6 @@ services:
    container_name: netbird-server
    restart: unless-stopped
    command: ["--config", "/etc/netbird/config.yaml"]
    environment:
      # Disable geolocation so a no-egress startup can't FATAL the control plane
      # (FRICTION 2026-06-17 #4). boma uses no geo posture (ACL Allow-All).
      NB_DISABLE_GEOLOCATION: "{{ netbird_coordinator__disable_geolocation | string | lower }}"
    ports:
      - "3478:3478/udp"
    volumes:
--- a/roles/reverse_proxy/defaults/main.yml
+++ b/roles/reverse_proxy/defaults/main.yml
@ -35,7 +35,3 @@ access__api:  # noqa: var-naming[no-role-prefix]
 # DNS-01; no manual steps). Residual risk: Let's Encrypt rate limits on rapid re-issuance.
 backup__service: reverse_proxy  # noqa: var-naming[no-role-prefix]
 backup__state: false  # noqa: var-naming[no-role-prefix]
 # Integration-test / staging cert knobs (ADR-025). Default off = production behaviour.
 reverse_proxy__tls_internal: false   # true => every site uses Caddy's self-signed CA
 reverse_proxy__acme_ca: ""           # set to the LE staging directory URL to use staging
--- a/roles/reverse_proxy/tasks/main.yml
+++ b/roles/reverse_proxy/tasks/main.yml
@ -4,8 +4,6 @@
    path: "{{ reverse_proxy__base_dir }}"
    state: directory
    mode: "0750"
  # create the scaffold even in --check so dry-run can evaluate templates + compose (idempotent mkdir)
  check_mode: false
  tags: [config]
 - name: Ensure the Caddy config directory exists
@ -13,8 +11,6 @@
    path: "{{ reverse_proxy__base_dir }}/caddy"
    state: directory
    mode: "0750"
  # create the scaffold even in --check so dry-run can evaluate templates + compose (idempotent mkdir)
  check_mode: false
  tags: [config]
 # Render into a directory that is bind-mounted whole (./caddy -> /etc/caddy). Mounting
--- a/roles/reverse_proxy/templates/Caddyfile.j2
+++ b/roles/reverse_proxy/templates/Caddyfile.j2
@ -1,9 +1,6 @@
 # {{ ansible_managed }}
 {
  email {{ reverse_proxy__acme_email }}
 {% if reverse_proxy__acme_ca %}
  acme_ca {{ reverse_proxy__acme_ca }}
 {% endif %}
 {% if reverse_proxy__acme_dns_provider == 'gandi' %}
  # ACME DNS-01 via Gandi (mesh/LAN-only hosts, incl. wildcard certs). Token is the
  # Gandi PAT, injected from the env file as a Bearer token (ADR-024). Needs the custom
@ -13,9 +10,6 @@
 }
 {% for r in reverse_proxy__routes %}
 {{ r['host'] }} {
 {% if reverse_proxy__tls_internal %}
  tls internal
 {% endif %}
 {% if r['caddy'] is defined %}
 {{ r['caddy'] | trim | indent(2, first=true) }}
 {% elif r['upstream'] is defined %}
--- a/scripts/integration-vm.py
+++ b/scripts/integration-vm.py
@ -1,462 +0,0 @@
 #!/usr/bin/env python3
 """boma local-VM integration test harness driver (ADR-025).
 Stdlib-only by convention (TODO-14): never imports a YAML library. The transient
 inventory is emitted via string templates; stubs/cert-tiers reach Ansible as
 `-e @<file>` extra-vars; profile metadata is JSON. Talks to libvirt via `virsh`.
 """
 import argparse
 import hashlib
 import json
 import os
 import pathlib
 import re
 import subprocess
 import sys
 import time
 import urllib.request
 import uuid
 # Repository root: two levels up from this file (scripts/integration-vm.py).
 REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent
 # Directory holding the golden image plus per-VM overlay/seed/console files.
 # Overridable through the BOMA_IT_CACHE environment variable.
 CACHE_DIR = pathlib.Path(os.environ.get("BOMA_IT_CACHE", "/var/lib/boma-integration"))
 # Golden cloud image and the checksum list it is verified against (see ensure_image).
 IMAGE_URL = "https://cloud.debian.org/images/cloud/trixie/latest/debian-13-genericcloud-amd64.qcow2"
 SHA_URL = "https://cloud.debian.org/images/cloud/trixie/latest/SHA512SUMS"
 IMAGE_NAME = "debian-13-genericcloud-amd64.qcow2"
 # libvirt network used by the test VMs; defined from NET_XML by net_ensure() when missing.
 NET_NAME = "boma-it"
 NET_XML = """<network>
  <name>boma-it</name>
  <forward mode='nat'/>
  <bridge name='virbr-boma' stp='on' delay='0'/>
  <ip address='192.168.150.1' netmask='255.255.255.0'>
    <dhcp><range start='192.168.150.10' end='192.168.150.254'/></dhcp>
  </ip>
 </network>
 """
 # Every integration VM name starts with this prefix; up() and prune() match on it.
 NAME_PREFIX = "boma-it-"
 # Transient per-run state (generated inventory, cloud-init inputs, "current" marker).
 RUN_DIR = REPO_ROOT / "tests" / "integration" / ".run"
 # Diagnostics bundles written by dump_diagnostics() land under this directory.
 DIAG_ROOT = pathlib.Path.home() / "integration-runs"
 PROFILE_DIR = REPO_ROOT / "tests" / "integration" / "profiles"
 INTEG_DIR = REPO_ROOT / "tests" / "integration"
 CERT_DIR = REPO_ROOT / "tests" / "integration" / "certs"
 # Defaults for the VM size arguments of up().
 DEFAULT_MEM_MIB = 3072
 DEFAULT_VCPUS = 2
 # up() refuses to start a VM when less than this much memory is available.
 MIN_FREE_MIB = 4096
 # Accepted values for --certs; each maps to CERT_DIR/<tier>.yml (see cert_file).
 VALID_TIERS = ("internal", "le-staging", "le-prod-wildcard")
 # Target the SYSTEM libvirtd — where the substrate, /dev/kvm, and the NAT network live.
 # Without this, a non-root caller's bare virsh/virt-install default to qemu:///session.
 os.environ.setdefault("LIBVIRT_DEFAULT_URI", "qemu:///system")
 def vm_name(host, suffix=None):
    """Build the libvirt domain name for an integration VM.

    The result is NAME_PREFIX, the host, a dash and the suffix. When `suffix` is
    missing or falsy, the first 8 hex digits of a fresh UUID4 are used instead.
    """
    if not suffix:
        suffix = uuid.uuid4().hex[:8]
    return "{}{}-{}".format(NAME_PREFIX, host, suffix)
 def free_mib(meminfo_text):
    """Return the MemAvailable value found in /proc/meminfo-style text, in MiB.

    The kB figure is floor-divided by 1024. Returns 0 when the text has no
    MemAvailable line.
    """
    match = re.search(r"^MemAvailable:\s+(\d+)\s+kB", meminfo_text, re.MULTILINE)
    if match is None:
        return 0
    kib = int(match.group(1))
    return kib // 1024
 def parse_lease_ip(domifaddr_output):
    """Extract the first IPv4 address from `virsh domifaddr` output.

    Looks for the token "ipv4" followed by whitespace and a dotted quad; returns
    the dotted quad as a string, or None when nothing matches.
    """
    hit = re.search(r"ipv4\s+(\d+\.\d+\.\d+\.\d+)", domifaddr_output)
    if hit is None:
        return None
    return hit.group(1)
 def render_meta_data(instance_id, hostname):
    """Render the cloud-init meta-data document: instance-id and local-hostname lines."""
    entries = [
        f"instance-id: {instance_id}",
        f"local-hostname: {hostname}",
    ]
    return "\n".join(entries) + "\n"
 def render_user_data(ssh_pubkey, ansible_user):
    """Render the cloud-init user-data document for a test VM.

    Declares one user (`ansible_user`) with passwordless sudo, a bash shell and
    `ssh_pubkey` as its only authorized key, disables SSH password auth and
    requests a package index update.
    """
    doc = [
        "#cloud-config",
        "users:",
        f"  - name: {ansible_user}",
        "    sudo: 'ALL=(ALL) NOPASSWD:ALL'",
        "    shell: /bin/bash",
        "    ssh_authorized_keys:",
        f"      - {ssh_pubkey}",
        "ssh_pwauth: false",
        "package_update: true",
    ]
    return "".join(entry + "\n" for entry in doc)
 def cert_file(tier):
    """Return the extra-vars file for a certificate tier: CERT_DIR/<tier>.yml.

    Raises ValueError when `tier` is not one of VALID_TIERS.
    """
    if tier in VALID_TIERS:
        return CERT_DIR / f"{tier}.yml"
    raise ValueError(f"unknown cert tier: {tier}")
 def profile_path(host):
    """Return the path of the JSON profile for `host`: PROFILE_DIR/<host>.json."""
    filename = "{}.json".format(host)
    return PROFILE_DIR / filename
 def render_run_hosts(name, ip, ansible_user, groups):
    """Render the transient single-host YAML inventory as a string.

    The one host `name` is listed under every group in `groups` (duplicates are
    dropped, first-seen order kept), each time with its address, login user and
    SSH options that disable host-key checking.
    """
    header = [
        "---",
        "# Generated by scripts/integration-vm.py — transient, gitignored. Do not edit.",
        "# Single test host ONLY (safety invariant: no real host is ever in scope).",
        "all:",
        "  children:",
    ]
    # Identical for every group, so build it once.
    host_block = [
        "      hosts:",
        f"        {name}:",
        f"          ansible_host: {ip}",
        f"          ansible_user: {ansible_user}",
        # Test VMs reuse IP addresses, so a known_hosts entry left by an earlier VM
        # at the same address would otherwise block Ansible; skip host-key caching.
        "          ansible_ssh_common_args: >-",
        "            -o StrictHostKeyChecking=no",
        "            -o UserKnownHostsFile=/dev/null",
    ]
    body = []
    for group in dict.fromkeys(groups):
        body.append(f"    {group}:")
        body.extend(host_block)
    return "\n".join(header + body) + "\n"
 def sh(cmd, check=True, capture=False, **kw):
    """Run a command given in list form, after echoing it to stderr.

    `check` and `capture` are forwarded to subprocess.run as `check` and
    `capture_output`; output is always handled as text. Any further keyword
    arguments are passed through. Returns the CompletedProcess.
    """
    rendered = " ".join(map(str, cmd))
    print("+ " + rendered, file=sys.stderr)
    return subprocess.run(cmd, check=check, capture_output=capture, text=True, **kw)
 def _expected_sha(sha_text, filename):
    for line in sha_text.splitlines():
        parts = line.split()
        if len(parts) == 2 and parts[1].lstrip("*") == filename:
            return parts[0]
    return None
 def ensure_image():
    """Return the path of the cached golden image, downloading it on first use.

    When CACHE_DIR/IMAGE_NAME already exists it is returned as-is. Otherwise the
    image is downloaded to a ".part" sibling, its SHA512 is compared with the
    entry for IMAGE_NAME published at SHA_URL, and only a verified file is renamed
    into place.

    Raises SystemExit when the checksum list has no entry for the image or the
    digest does not match. Download errors propagate unchanged; in every failure
    case the partial file is removed first.
    """
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    img = CACHE_DIR / IMAGE_NAME
    if img.exists():
        return img
    print(f"Downloading {IMAGE_URL} ...", file=sys.stderr)
    tmp = img.with_suffix(".part")
    try:
        urllib.request.urlretrieve(IMAGE_URL, tmp)
        # Close the HTTP response deterministically rather than leaking the socket.
        with urllib.request.urlopen(SHA_URL) as resp:
            sha_text = resp.read().decode()
    except BaseException:
        # A failed or interrupted transfer must not leave a truncated file behind.
        tmp.unlink(missing_ok=True)
        raise
    want = _expected_sha(sha_text, IMAGE_NAME)
    if not want:
        tmp.unlink(missing_ok=True)
        raise SystemExit(f"checksum for {IMAGE_NAME} not found at {SHA_URL}")
    h = hashlib.sha512()
    with open(tmp, "rb") as fh:
        # Hash in 1 MiB chunks so the whole image is never held in memory.
        for chunk in iter(lambda: fh.read(1 << 20), b""):
            h.update(chunk)
    if h.hexdigest() != want:
        tmp.unlink(missing_ok=True)
        raise SystemExit("golden image SHA512 mismatch — refusing to use it")
    tmp.rename(img)
    return img
 def net_ensure():
    """Make sure the NET_NAME libvirt network is defined and active.

    If `virsh net-info` fails, the network is defined from NET_XML (written to
    RUN_DIR/net.xml) and marked autostart. It is then started unless net-info
    already reports it active.
    """
    probe = sh(["virsh", "net-info", NET_NAME], check=False, capture=True)
    already_defined = probe.returncode == 0
    if not already_defined:
        RUN_DIR.mkdir(parents=True, exist_ok=True)
        xml_path = RUN_DIR / "net.xml"
        xml_path.write_text(NET_XML)
        sh(["virsh", "net-define", str(xml_path)])
        sh(["virsh", "net-autostart", NET_NAME])
    info = sh(["virsh", "net-info", NET_NAME], capture=True).stdout
    if re.search(r"Active:\s+yes", info) is None:
        sh(["virsh", "net-start", NET_NAME])
 def _ssh_pubkey():
    for cand in ("id_ed25519.pub", "id_rsa.pub"):
        p = pathlib.Path.home() / ".ssh" / cand
        if p.exists():
            return p.read_text().strip()
    raise SystemExit("no SSH public key found in ~/.ssh")
 def up(host, name=None, mem_mib=DEFAULT_MEM_MIB, vcpus=DEFAULT_VCPUS):
    """Create and boot a fresh integration VM, wait until it is reachable; return (name, ip).

    Refuses to start (SystemExit) when /proc/meminfo reports less than MIN_FREE_MIB
    available, or when a running libvirt domain already carries NAME_PREFIX.
    Otherwise it ensures the golden image and the NAT network exist, creates a qcow2
    overlay and a cloud-init seed image in CACHE_DIR, runs virt-install, waits for an
    IP address and for SSH, waits for cloud-init to finish, and records
    name / ip / host (one per line) in RUN_DIR/"current".

    host:    profile/host label, used for the generated name and the "current" marker.
    name:    explicit domain name; generated with vm_name(host) when omitted.
    mem_mib: VM memory in MiB, passed to virt-install --memory.
    vcpus:   VM vCPU count, passed to virt-install --vcpus.
    """
    free = free_mib(pathlib.Path("/proc/meminfo").read_text())
    if free < MIN_FREE_MIB:
        raise SystemExit(f"refusing to start: only {free} MiB free (< {MIN_FREE_MIB})")
    running = sh(["virsh", "list", "--name"], capture=True).stdout.split()
    if any(n.startswith(NAME_PREFIX) for n in running):
        raise SystemExit("an integration VM is already running (one at a time); "
                         "run `integration-vm prune` first")
    name = name or vm_name(host)
    img = ensure_image()
    net_ensure()
    RUN_DIR.mkdir(parents=True, exist_ok=True)
    # VM disk/seed/console must live where the SYSTEM hypervisor (libvirt-qemu) can reach
    # them — NOT under the repo/home (qemu cannot traverse /home/claude). CACHE_DIR is
    # group-libvirt + world-traversable (created by the integration_test role).
    overlay = CACHE_DIR / f"{name}.qcow2"
    sh(["qemu-img", "create", "-f", "qcow2", "-F", "qcow2", "-b", str(img), str(overlay)])
    (RUN_DIR / "user-data").write_text(render_user_data(_ssh_pubkey(), "ansible"))
    # cloud-init rejects underscores in local-hostname (causes init-local to skip
    # writing the network config → VM never gets a DHCP lease). Sanitize VM name
    # for use as hostname without affecting disk paths or virsh domain names.
    (RUN_DIR / "meta-data").write_text(render_meta_data(f"iid-{name}", name.replace("_", "-")))
    seed = CACHE_DIR / f"{name}-seed.img"
    # Force DHCP on the VM NIC — don't rely on the genericcloud image's network fallback.
    # Use explicit renderer + interface name to avoid a netplan 1.1.2 generation issue:
    # `match.name: en*` with a named key (e.g. `primary`) produces a .network file that
    # networkd loads but never DHCPs (no DHCP4 messages, just IPv6LL). Using the real
    # interface name `enp1s0` (all virtio NICs in these KVM VMs are named enp1s0) and
    # `renderer: networkd` bypasses the bug.
    (RUN_DIR / "network-config").write_text(
        'version: 2\n'
        'renderer: networkd\n'
        'ethernets:\n'
        '  enp1s0:\n'
        '    dhcp4: true\n')
    sh(["cloud-localds", "--network-config", str(RUN_DIR / "network-config"),
        str(seed), str(RUN_DIR / "user-data"), str(RUN_DIR / "meta-data")])
    console = CACHE_DIR / f"{name}-console.log"
    # virt-install has a `#!/usr/bin/env python3` shebang; the Makefile prepends .venv/bin to
    # PATH (so the venv's ansible tools resolve), which would hijack virt-install into the
    # isolated venv — it lacks system PyGObject (`gi`) and crashes. Strip the venv from PATH
    # for this system tool so its shebang finds /usr/bin/python3 (which has gi). Ansible is
    # invoked via its absolute .venv path elsewhere, so it is unaffected.
    sys_path = ":".join(p for p in os.environ.get("PATH", "").split(":")
                        if "/.venv/bin" not in p)
    sh(["virt-install", "--name", name, "--memory", str(mem_mib), "--vcpus", str(vcpus),
        "--boot", "uefi",   # genericcloud triple-faults on legacy BIOS handoff; UEFI boots
        "--import",
        "--disk", f"path={overlay},format=qcow2",
        "--disk", f"path={seed},device=cdrom",
        "--network", f"network={NET_NAME}",
        "--osinfo", "debian13",
        "--graphics", "none",
        "--serial", f"file,path={console}",
        "--noautoconsole"],
       env=dict(os.environ, PATH=sys_path))
    ip = wait_for_ip(name)
    wait_for_ssh(ip, "ansible")
    # Block until cloud-init finishes (incl. apt-get update) so apply sees a ready system.
    # check=False: a non-zero cloud-init status does not abort the bring-up here.
    sh(["ssh", "-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null",
        f"ansible@{ip}", "sudo cloud-init status --wait"], check=False)
    # Marker file read back by _read_current(), down() and console().
    (RUN_DIR / "current").write_text(f"{name}\n{ip}\n{host}\n")
    print(f"VM {name} up at {ip}")
    return name, ip
 def wait_for_ip(name, timeout=120):
    """Poll libvirt until domain `name` reports an IPv4 address, and return it.

    Each round queries `virsh domifaddr` with --source lease and then --source arp
    (the ARP source reads the host neighbour table, so it works without the lease
    helper); both outputs are parsed by parse_lease_ip. Rounds are 4 seconds
    apart. Raises SystemExit once `timeout` seconds have passed with no address.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        for src in ("lease", "arp"):
            probe = sh(["virsh", "domifaddr", name, "--source", src],
                       check=False, capture=True)
            address = parse_lease_ip(probe.stdout)
            if address:
                return address
        time.sleep(4)
    raise SystemExit(f"timed out waiting for {name} to get a DHCP lease — "
                     "VM left defined; run `integration-vm prune` to remove it")
 def wait_for_ssh(ip, user, timeout=180):
    """Block until `ssh user@ip true` succeeds, retrying every 5 seconds.

    Host-key checking is disabled and each attempt uses a 5 second connect
    timeout. Raises SystemExit after `timeout` seconds without a successful login.
    """
    probe_cmd = ["ssh", "-o", "StrictHostKeyChecking=no",
                 "-o", "UserKnownHostsFile=/dev/null", "-o", "ConnectTimeout=5",
                 f"{user}@{ip}", "true"]
    deadline = time.time() + timeout
    while time.time() < deadline:
        attempt = sh(probe_cmd, check=False, capture=True)
        if attempt.returncode == 0:
            return
        time.sleep(5)
    raise SystemExit(f"timed out waiting for SSH to {ip} — "
                     "VM left defined; run `integration-vm prune` to remove it")
 def _read_current():
    """Return (name, ip, host) of the current VM from RUN_DIR/"current".

    The marker file is written by up() as three lines: name, ip, host.

    Raises SystemExit with guidance when the marker is missing (no VM has been
    brought up) or holds fewer than three lines, instead of surfacing a raw
    FileNotFoundError / IndexError traceback.
    """
    marker = RUN_DIR / "current"
    try:
        fields = marker.read_text().splitlines()
    except FileNotFoundError:
        raise SystemExit(f"no current integration VM ({marker} is missing); "
                         "run `integration-vm up` first") from None
    if len(fields) < 3:
        raise SystemExit(f"{marker} is malformed (expected name, ip and host lines); "
                         "run `integration-vm prune` and start again")
    return fields[0], fields[1], fields[2]   # name, ip, host
 def write_run_inventory(name, ip, groups):
    """Write the transient inventory into RUN_DIR.

    Produces RUN_DIR/hosts.yml for the single test host and (re)creates the
    RUN_DIR/group_vars symlink pointing at the production group_vars directory.
    Raises SystemExit if group_vars exists but is not a symlink.
    """
    RUN_DIR.mkdir(parents=True, exist_ok=True)
    inventory_text = render_run_hosts(name, ip, "ansible", groups)
    (RUN_DIR / "hosts.yml").write_text(inventory_text)
    gv_link = RUN_DIR / "group_vars"
    if not gv_link.is_symlink() and gv_link.exists():
        raise SystemExit(f"{gv_link} exists and is not a symlink; remove it manually")
    if gv_link.is_symlink():
        # Drop the stale link so symlink_to() below does not fail on an existing path.
        gv_link.unlink()
    gv_link.symlink_to(REPO_ROOT / "inventories" / "production" / "group_vars")
 def apply(host, certs):
    """Run every playbook step of `host`'s profile against the current VM.

    Regenerates the transient inventory, then runs ansible-playbook once per entry
    in the profile's "applies" list, limited to the VM and restricted to the step's
    tags when it has any. The profile's extra-vars files and the file for the
    `certs` tier are passed to each run as `-e @file`.
    """
    name, ip, _ = _read_current()
    profile = json.loads(profile_path(host).read_text())
    write_run_inventory(name, ip, profile["groups"])
    extra_args = [arg
                  for rel in profile.get("extra_vars_files", [])
                  for arg in ("-e", f"@{INTEG_DIR / rel}")]
    extra_args.extend(["-e", f"@{cert_file(certs)}"])
    inventory = str(RUN_DIR / "hosts.yml")
    for step in profile["applies"]:
        argv = [".venv/bin/ansible-playbook", "-i", inventory,
                f"playbooks/{step['playbook']}", "--limit", name]
        tags = step.get("tags")
        if tags:
            argv.extend(["--tags", ",".join(tags)])
        sh(argv + extra_args, cwd=str(REPO_ROOT))
    print(f"applied {host} profile to {name}")
 def _boot_id(ip, user):
    """Read /proc/sys/kernel/random/boot_id from the VM over SSH.

    Returns the stripped value, or None when the SSH command exits non-zero.
    """
    remote = sh(["ssh", "-o", "StrictHostKeyChecking=no",
                 "-o", "UserKnownHostsFile=/dev/null", "-o", "ConnectTimeout=5",
                 f"{user}@{ip}", "cat /proc/sys/kernel/random/boot_id"],
                check=False, capture=True)
    if remote.returncode != 0:
        return None
    return remote.stdout.strip()
 def wait_for_reboot(ip, user, before_boot_id, timeout=240):
    """Wait until the VM answers SSH with a boot_id different from `before_boot_id`.

    Polls every 5 seconds; a readable but unchanged boot_id means the pre-reboot
    system is still answering. Raises SystemExit after `timeout` seconds.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        current = _boot_id(ip, user)
        rebooted = bool(current) and current != before_boot_id
        if rebooted:
            return
        time.sleep(5)
    raise SystemExit(f"timed out waiting for {ip} to reboot (boot_id unchanged) — "
                     "VM left defined; run `integration-vm prune` to remove it")
 def reboot_vm():
    """Reboot the current VM through libvirt and confirm it really rebooted.

    The boot_id is read before `virsh reboot`, and wait_for_reboot() then waits
    for it to change. Raises SystemExit when the pre-reboot boot_id cannot be
    read: with no baseline, the first answer from the still-running pre-reboot
    sshd would be mistaken for a completed reboot.
    """
    name, ip, _ = _read_current()
    before = _boot_id(ip, "ansible")
    if before is None:
        raise SystemExit(f"could not read boot_id from {ip} before rebooting {name} — "
                         "refusing to reboot because the reboot could not be verified")
    sh(["virsh", "reboot", name])
    wait_for_reboot(ip, "ansible", before)
    print(f"{name} rebooted (boot_id changed), SSH back at {ip}")
 def run_assert(host, certs):
    """Run tests/integration/verify.yml against the current VM.

    Uses the same inventory and extra-vars as apply(). On a non-zero playbook exit
    it collects diagnostics and raises SystemExit; otherwise it prints a pass line.
    """
    name, ip, _ = _read_current()
    profile = json.loads(profile_path(host).read_text())
    write_run_inventory(name, ip, profile["groups"])
    extra_args = [arg
                  for rel in profile.get("extra_vars_files", [])
                  for arg in ("-e", f"@{INTEG_DIR / rel}")]
    extra_args.extend(["-e", f"@{cert_file(certs)}"])
    argv = [".venv/bin/ansible-playbook", "-i", str(RUN_DIR / "hosts.yml"),
            "tests/integration/verify.yml", "--limit", name]
    outcome = sh(argv + extra_args, cwd=str(REPO_ROOT), check=False)
    if outcome.returncode == 0:
        print(f"VERIFY PASSED for {name}")
        return
    dump_diagnostics(name, ip)
    raise SystemExit(f"VERIFY FAILED for {name} — diagnostics in {DIAG_ROOT}")
 def dump_diagnostics(name, ip):
    """Collect a diagnostics bundle for VM `name` into DIAG_ROOT/<name>/.

    Runs a fixed set of commands on the VM over SSH (with sudo) and stores each
    command's stdout+stderr as <label>.txt. If the VM's serial console log exists
    in CACHE_DIR, a copy is added as console.log. SSH failures are tolerated.
    """
    bundle = DIAG_ROOT / name
    bundle.mkdir(parents=True, exist_ok=True)
    ssh_prefix = ["ssh", "-o", "StrictHostKeyChecking=no",
                  "-o", "UserKnownHostsFile=/dev/null",
                  f"ansible@{ip}"]
    probes = {
        "nft": "nft list ruleset",
        "docker": "docker ps -a",
        "ss": "ss -tlnp",
        "journal": "journalctl -b --no-pager",
        "critical-chain": "systemd-analyze critical-chain",
    }
    for label, remote_cmd in probes.items():
        result = sh(ssh_prefix + ["sudo " + remote_cmd], check=False, capture=True)
        text = (result.stdout or "") + (result.stderr or "")
        (bundle / f"{label}.txt").write_text(text)
    serial_log = CACHE_DIR / f"{name}-console.log"
    if serial_log.exists():
        # The serial log is root:0600 (created by libvirt), so it is read through
        # sudo (ADR-015: the claude worker has sudo) and re-written as a
        # worker-owned copy inside the bundle.
        dumped = sh(["sudo", "cat", str(serial_log)], check=False, capture=True)
        (bundle / "console.log").write_text(dumped.stdout or "")
    print(f"diagnostics written to {bundle}", file=sys.stderr)
 def _destroy(name):
    """Stop and undefine domain `name`, then delete its files.

    Both virsh calls are best-effort (failures ignored). Afterwards every path in
    RUN_DIR and CACHE_DIR whose name starts with `name` is unlinked.
    """
    for virsh_args in (["destroy", name], ["undefine", name, "--nvram"]):
        sh(["virsh", *virsh_args], check=False)
    pattern = f"{name}*"
    for directory in (RUN_DIR, CACHE_DIR):
        for leftover in directory.glob(pattern):
            leftover.unlink(missing_ok=True)
 def down(host=None, keep=False):
    """Destroy the current VM, unless `keep` is set.

    With `keep` true nothing is touched. Otherwise, when RUN_DIR/"current" exists,
    the VM named on its first line is destroyed and the marker removed. `host` is
    accepted but not used.
    """
    if keep:
        print("--keep: leaving the VM running for inspection")
        return
    marker = RUN_DIR / "current"
    if not marker.exists():
        return
    name = marker.read_text().splitlines()[0]
    _destroy(name)
    marker.unlink(missing_ok=True)
    print(f"destroyed {name}")
 def prune():
    """Destroy every libvirt domain (running or not) whose name starts with NAME_PREFIX.

    Also removes the RUN_DIR/"current" marker if present.
    """
    listing = sh(["virsh", "list", "--all", "--name"], capture=True).stdout
    ours = [dom for dom in listing.split() if dom.startswith(NAME_PREFIX)]
    for dom in ours:
        _destroy(dom)
        print(f"pruned {dom}")
    (RUN_DIR / "current").unlink(missing_ok=True)
 def console():
    """Print the serial console log of the current VM, or say that none exists.

    The log is read through `sudo cat`; a failing read prints an empty line.
    """
    current_name = (RUN_DIR / "current").read_text().splitlines()[0]
    serial_log = CACHE_DIR / f"{current_name}-console.log"
    if not serial_log.exists():
        print(f"no console log at {serial_log}")
        return
    dumped = sh(["sudo", "cat", str(serial_log)], check=False, capture=True)
    print(dumped.stdout or "")
 def cycle(host, certs, keep=False, no_reboot=False):
    """Run the full test cycle: up, apply, optional reboot, verify, then teardown.

    On success the VM is destroyed unless `keep` is set. On any failure the VM is
    left running, a hint is printed to stderr, and the original exception
    propagates.
    """
    try:
        up(host)
        apply(host, certs)
        if not no_reboot:
            reboot_vm()
        run_assert(host, certs)
    except BaseException:
        # Covers SystemExit and KeyboardInterrupt too: never tear down a failed VM.
        print("FAILED — VM left up for inspection; `integration-vm prune` to clean.",
              file=sys.stderr)
        raise
    if not keep:
        down(host)
 # Maps each CLI subcommand name to a callable that takes the parsed argparse
 # namespace. main() returns the callable's result, which is handed to sys.exit().
 DISPATCH = {
    # up() returns (name, ip); indexing the helper tuple makes this entry yield None.
    "up": lambda a: (up(a.host), None)[1],
    "apply": lambda a: apply(a.host, a.certs),
    "reboot": lambda a: reboot_vm(),
    "assert": lambda a: run_assert(a.host, a.certs),
    "down": lambda a: down(a.host, a.keep),
    "console": lambda a: console(),
    "prune": lambda a: prune(),
    "cycle": lambda a: cycle(a.host, a.certs, a.keep, a.no_reboot),
 }
 def main(argv=None):
    """Parse the command line and run the selected subcommand.

    Every subcommand except `prune` takes --host (required), --certs, --keep and
    --no-reboot. Returns whatever the DISPATCH handler returns.
    """
    parser = argparse.ArgumentParser(prog="integration-vm", description=__doc__)
    commands = parser.add_subparsers(dest="cmd", required=True)
    for verb in ("up", "apply", "reboot", "assert", "cycle", "down", "console"):
        verb_parser = commands.add_parser(verb)
        verb_parser.add_argument("--host", required=True)
        verb_parser.add_argument("--certs", choices=VALID_TIERS, default="internal")
        verb_parser.add_argument("--keep", action="store_true")
        verb_parser.add_argument("--no-reboot", action="store_true")
    commands.add_parser("prune")
    namespace = parser.parse_args(argv)
    handler = DISPATCH[namespace.cmd]
    return handler(namespace)
 if __name__ == "__main__":  # pragma: no cover
    sys.exit(main())
--- a/scripts/registry-login.sh
+++ b/scripts/registry-login.sh
@ -1,32 +0,0 @@
 #!/usr/bin/env bash
 #
 # registry-login.sh — log the local Docker daemon into the Forgejo container registry
 # with a token kept in the Ansible vault, so registry pushes (make caddy-image-push /
 # molecule-image-push) can be completed non-interactively by an agent.
 # (Background: 2026-06-17 kaizen, docs/FRICTION.md — pushes used to need an interactive
 # `docker login` because the credentials were not in the vault.)
 #
 # How it works: `ansible-vault view` decrypts the vault (rbw must be unlocked), a small
 # Python/YAML snippet extracts vault.forgejo.registry_token, and the value is piped to
 # `docker login --password-stdin`. The token is never put on argv, never written to
 # disk and never echoed (do not add `set -x`).
 #
 # Environment overrides (defaults assume the repo root with the venv present):
 #   ANSIBLE_VAULT, PYTHON, VAULT, REGISTRY_HOST, REGISTRY_USER
 #
 set -euo pipefail
 : "${ANSIBLE_VAULT:=.venv/bin/ansible-vault}"
 : "${PYTHON:=.venv/bin/python}"
 : "${VAULT:=inventories/production/group_vars/all/vault.yml}"
 : "${REGISTRY_HOST:=forgejo.nyumbani.baobab.band}"
 : "${REGISTRY_USER:=sjat}"
 # Prints the token with no trailing newline, or nothing when any level of the
 # vault -> forgejo -> registry_token path is missing or empty.
 extract_token='import sys, yaml; d = yaml.safe_load(sys.stdin) or {}; print((((d.get("vault") or {}).get("forgejo") or {}).get("registry_token")) or "", end="")'
 token="$("$ANSIBLE_VAULT" view "$VAULT" | "$PYTHON" -c "$extract_token")"
 if [[ -z "$token" || "$token" == "CHANGEME" ]]; then
  printf '%s\n' \
   "registry-login: vault.forgejo.registry_token is unset or still CHANGEME." \
   "  Mint a Forgejo token (Settings -> Applications -> Generate Token, with package" \
   "  read+write scope, user $REGISTRY_USER) and set it via:  make edit-vault" >&2
  exit 1
 fi
 printf '%s' "$token" | docker login "$REGISTRY_HOST" -u "$REGISTRY_USER" --password-stdin
--- a/scripts/repo-scan.py
+++ b/scripts/repo-scan.py
@ -41,42 +41,6 @@ LIST_ITEM_RE = re.compile(r"^\s*(\d+\.|[-*+])\s+(.*)")
 DEFER_REF_RE = re.compile(r"ADR-(\d{3})\D{0,40}?deferred\D{0,12}?(\d+)", re.I)
 RESOLVE_WORD_RE = re.compile(r"\b(?:resolv\w*|decid\w*|address\w*|complet\w*|done)\b", re.I)
 # Rename-incomplete detection: an ADR announces a rename/supersession of a named
 # term (Old → New); verify the OLD name no longer lingers in the design-doc set.
 # (The structural cousin of stale-deferred — see docs/FRICTION.md, ADR-024.)
 # A "specific" name is a backticked token or a capitalised proper-noun/identifier;
 # common connective words are rejected so they can't be mistaken for a tool name.
 _NAME = r"(?:`[^`]+`|[A-Z][A-Za-z0-9_+.-]{2,})"
 RENAME_STOPWORDS = {
    "was", "were", "the", "this", "that", "with", "from", "into", "and", "but",
    "for", "are", "has", "had", "been", "now", "not", "all", "any", "use", "used",
    "via", "per", "its", "our", "one", "two", "old", "new", "phase", "step",
    "adr", "read", "name", "term", "tool", "prose", "roadmap",
 }
 # Trigger forms — each captures (old, new) as raw name tokens; the connective words
 # are case-insensitive but the names must still satisfy _NAME (specific tokens).
 RENAME_ASSERT_RES = (
    # renamed X to Y
    re.compile(rf"renamed\s+(?:from\s+)?({_NAME})\s+to\s+({_NAME})", re.I),
    # replaced X with Y
    re.compile(rf"replac\w*\s+({_NAME})\s+with\s+({_NAME})", re.I),
    # superseded X with/by Y
    re.compile(rf"supersed\w*\s+({_NAME})\s+(?:with|by)\s+({_NAME})", re.I),
    # X ... (is/are/was/were/been) updated to read Y
    re.compile(rf"({_NAME})\b.{{0,40}}?\b(?:is|are|was|were|been)?\s*"
               rf"updated\s+to\s+read\s+[\"']?({_NAME})", re.I),
    # X → Y  /  X -> Y  on a line that also carries a rename/supersede/update cue
    re.compile(rf"({_NAME})\s*(?:->|→)\s*({_NAME})"),
 )
 RENAME_ARROW_RES = (RENAME_ASSERT_RES[-1],)  # arrow forms need a cue word on the line
 RENAME_CUE_RE = re.compile(r"\b(?:renam\w*|replac\w*|supersed\w*|updated|rename)\b", re.I)
 # Historical / negation cues — a lingering OLD name on such a line is legitimate
 # history, not a missed ripple edit, so it is skipped.
 RENAME_HIST_RE = re.compile(
    r"\b(?:was|were|formerly|previously|no longer|instead of|rather than|reject\w*|"
    r"reconsider\w*|supersed\w*|deprecat\w*|legacy|history|heritage|V4|"
    r"actually ran|used to)\b", re.I)
 # ADR-structure check (ADR-023): numbered ADRs must carry the four mandatory
 # sections and a parseable Status line. Presence only — section ORDER is a
 # template-demonstrated convention, not machine-enforced.
@ -178,84 +142,6 @@ def adr_structure_findings(adr_files):
    return out
 def _clean_name(tok):
    """Normalise a captured name token; return the bare name or None.

    Surrounding whitespace, backticks/quotes and trailing sentence punctuation are
    stripped. The token is rejected (None) when the result is empty, contains a
    space, is a stopword, is an ADR reference such as ADR-017, or is not an
    identifier-like string of at least three characters.
    """
    name = tok.strip().strip("`\"'").strip().rstrip(".,;:!?)")
    rejected = (
        not name
        or " " in name
        or name.lower() in RENAME_STOPWORDS
        # An ADR reference points at a document; it is never the renamed term itself
        # ("the ADR-017 prose ... is updated to read Caddy" must not yield ADR-017).
        or re.fullmatch(r"ADR-\d{3}", name) is not None
        # Re-check the identifier shape after stripping.
        or re.fullmatch(r"[A-Za-z0-9_+.-]{3,}", name) is None
    )
    return None if rejected else name
 def _rename_assertion(line):
    """Find a tight Old -> New rename assertion in one ADR line.

    Tries each pattern of RENAME_ASSERT_RES in order and returns (old, new) for
    the first one that yields two distinct, acceptable names; returns None
    otherwise. Deliberately conservative: precision over recall.
    """
    for pattern in RENAME_ASSERT_RES:
        hit = pattern.search(line)
        if hit is None:
            continue
        # A bare arrow only counts when the line also carries a rename cue word.
        is_arrow = pattern in RENAME_ARROW_RES
        if is_arrow and not RENAME_CUE_RE.search(line):
            continue
        old_name = _clean_name(hit.group(1))
        new_name = _clean_name(hit.group(2))
        if old_name and new_name and old_name != new_name:
            return old_name, new_name
    return None
 def rename_incomplete_findings(adr_files, extra_docs):
    """Flag design-doc lines that still use a name an ADR says was renamed.

    adr_files:  {rel_path: [lines]} for docs/decisions/*.md; only numbered ADRs
                (file names matching ADR_FILE_RE) may make rename assertions.
    extra_docs: {rel_path: [lines]} for CAPABILITIES.md / ROADMAP.md.

    For each assertion 'Old' -> 'New', every line of the design-doc set (adr_files
    plus extra_docs) where 'Old' appears as a case-sensitive whole word is
    reported, except lines in the announcing ADR itself, lines that also contain
    'New', and lines carrying a historical/negation cue (RENAME_HIST_RE).

    Returns a list of finding dicts (check, severity, path, line, detail). An ADR
    that states the same rename on several lines contributes one assertion only,
    so a lingering occurrence is reported once rather than once per restatement.
    """
    out = []
    # The design-doc set we search: all decisions/*.md plus the two extra docs.
    doc_set = dict(adr_files)
    doc_set.update(extra_docs)
    docs_sorted = sorted(doc_set.items())  # loop-invariant; sort once
    # Collect assertions only from numbered ADRs (NNN-*.md).
    assertions = []  # (adr_num, announcer_path, old, new)
    for rpath, lines in sorted(adr_files.items()):
        base = os.path.basename(rpath)
        if not ADR_FILE_RE.match(base):
            continue
        adr_num = base[:3]
        for line in lines:
            parsed = _rename_assertion(line)
            if parsed:
                assertions.append((adr_num, rpath, parsed[0], parsed[1]))
    # De-duplicate while keeping first-seen order.
    for adr_num, announcer, old, new in dict.fromkeys(assertions):
        old_re = re.compile(r"\b" + re.escape(old) + r"\b")  # case-sensitive whole word
        for rpath, lines in docs_sorted:
            if rpath == announcer:            # the ADR that made the claim is exempt
                continue
            for i, raw in enumerate(lines, 1):
                if not old_re.search(raw):
                    continue
                if new in raw:                # rename is being explained on this line
                    continue
                if RENAME_HIST_RE.search(raw):  # legitimate history / negation
                    continue
                out.append({"check": "rename-incomplete", "severity": "medium",
                            "path": rpath, "line": i,
                            "detail": f"ADR-{adr_num} announced rename '{old}' -> "
                            f"'{new}' but '{old}' still appears here; confirm the "
                            "ripple edit landed or soften the ADR claim"})
    return out
 def walk_files():
    for dirpath, dirnames, filenames in os.walk(ROOT):
        dirnames[:] = [d for d in dirnames if d not in PRUNE]
@ -306,11 +192,8 @@ def scan():
    findings = []
    adrs = adr_numbers()
    adr_files = {}        # docs/decisions/*.md → lines, for deferred-section parsing
    extra_docs = {}       # CAPABILITIES.md / ROADMAP.md → lines, for rename-incomplete
    defer_refs = []       # repo-wide "resolves ADR-NNN deferred #K" references
    decisions_dir = os.path.join("docs", "decisions")
    rename_extra = {os.path.join("docs", "CAPABILITIES.md"),
                    os.path.join("docs", "ROADMAP.md")}
    for path in walk_files():
        rpath = rel(path)
        if rpath.startswith(SKIP_PREFIX):
@ -340,8 +223,6 @@ def scan():
        if rpath.startswith(decisions_dir) and rpath.endswith(".md"):
            adr_files[rpath] = lines
        if rpath in rename_extra:
            extra_docs[rpath] = lines
        for i, line in enumerate(lines, 1):
            for m in DEFER_REF_RE.finditer(line):
@ -380,7 +261,6 @@ def scan():
                                     "line": i, "detail": f"references '{ref}' which does not exist"})
    findings.extend(deferred_findings(adr_files, defer_refs))
    findings.extend(adr_structure_findings(adr_files))
    findings.extend(rename_incomplete_findings(adr_files, extra_docs))
    return findings
--- a/terraform/environments/offsite/main.tf
+++ b/terraform/environments/offsite/main.tf
@ -11,7 +11,7 @@ module "askari" {
  location           = "hel1" # Helsinki
  image              = "debian-13"
  ansible_ssh_pubkey = var.ansible_ssh_pubkey
-  ssh_admin_cidrs    = ["91.226.145.80/32"] # TEMP (incident recovery 2026-06-17): re-open WAN :22 to ubongo only; re-close once the firewall/Docker + boot-race issues are fixed
+  ssh_admin_cidrs    = var.ssh_admin_cidrs
  public_web         = true # Caddy 80/443 + NetBird 3478 (M4)
  labels = {
    env        = "offsite"
--- a/terraform/modules/hetzner_vm/main.tf
+++ b/terraform/modules/hetzner_vm/main.tf
@ -26,18 +26,13 @@ resource "hcloud_ssh_key" "ansible" {
 resource "hcloud_firewall" "this" {
  name = "${var.name}-fw"
-  # SSH from the control node only — and only when admin CIDRs are set. An empty
+  # SSH from the control node only.
-  # ssh_admin_cidrs removes the WAN :22 rule entirely (mesh-only SSH; reach the host over
+  rule {
  # wt0, break-glass = Hetzner console). Mesh-hardening 1/3.
  dynamic "rule" {
    for_each = length(var.ssh_admin_cidrs) > 0 ? [1] : []
    content {
    direction  = "in"
    protocol   = "tcp"
    port       = "22"
    source_ips = var.ssh_admin_cidrs
  }
  }
  # Public web (Caddy 80/443) + NetBird STUN/TURN (3478/udp) — only when public_web
  # (ADR-024, M4). Host nftables stays catalog-driven (ADR-020).
--- a/terraform/modules/hetzner_vm/variables.tf
+++ b/terraform/modules/hetzner_vm/variables.tf
@ -24,9 +24,8 @@ variable "ansible_ssh_pubkey" {
 }
 variable "ssh_admin_cidrs" {
-  description = "Source CIDRs allowed to reach SSH over the WAN. Empty = no WAN SSH rule (mesh-only)."
+  description = "Source CIDRs allowed to reach SSH (e.g. ubongo's address/32)"
  type        = list(string)
  default     = []
 }
 variable "public_web" {
--- a/tests/integration/certs/internal.yml
+++ b/tests/integration/certs/internal.yml
@ -1,2 +0,0 @@
 ---
 reverse_proxy__tls_internal: true
--- a/tests/integration/certs/le-prod-wildcard.yml
+++ b/tests/integration/certs/le-prod-wildcard.yml
@ -1,6 +0,0 @@
 ---
 # On-demand only. Records an accepted risk (ADR-025 / accepted-risks.md): the prod
 # Gandi PAT reaches an ephemeral VM and transient TXT records land in the real wingu.me.
 reverse_proxy__tls_internal: false
 reverse_proxy__acme_dns_provider: gandi
 reverse_proxy__acme_ca: ""
--- a/tests/integration/certs/le-staging.yml
+++ b/tests/integration/certs/le-staging.yml
@ -1,4 +0,0 @@
 ---
 reverse_proxy__tls_internal: false
 reverse_proxy__acme_dns_provider: gandi
 reverse_proxy__acme_ca: "https://acme-staging-v02.api.letsencrypt.org/directory"
--- a/tests/integration/overrides/askari.yml
+++ b/tests/integration/overrides/askari.yml
@ -1,13 +0,0 @@
 ---
 # Integration-test overlay for the "askari" profile (ADR-025). Passed via `-e @`.
 # Reproduces the 2026-06-17 incident: apply base's nftables default-deny to a Docker host.
 integration_profile: askari
 base__firewall_apply: true
 # Keep a break-glass: sshd stays on all interfaces (never wt0-only in a throwaway VM).
 base__ssh_listen_mesh_only: false
 # The VM is isolated; it must never touch the real mesh.
 base__mesh_enabled: false
 # Allow SSH from the VM's libvirt-NAT gateway (where the driver/ansible connects from),
 # so base's default-deny firewall + the reboot don't lock out the harness. By source IP,
 # so it's interface-independent. Overrides askari's real control addr for the test only.
 base__firewall_control_addr: "192.168.150.1"
--- a/tests/integration/overrides/askari_inputonly.yml
+++ b/tests/integration/overrides/askari_inputonly.yml
@ -1,17 +0,0 @@
 ---
 # Integration overlay (ADR-025) — the askari mesh-hardening REDESIGN (2026-06-19).
 # Validates INPUT-only default-deny on a Docker host: input policy drop, forward policy
 # accept (Docker-safe), SSH via the admin-addr break-glass, reboot-survivable.
 integration_profile: askari_inputonly
 base__firewall_apply: true
 base__firewall_input_only: true
 # No sshd ListenAddress change — never wt0-only in a throwaway VM.
 base__ssh_listen_mesh_only: false
 # Isolated VM: never touch the real mesh.
 base__mesh_enabled: false
 # The non-mesh SSH break-glass = the admin-addr path the real design uses. Point it at the
 # VM's libvirt-NAT gateway (where the harness connects from), by source IP so it is
 # interface-independent and the default-deny + reboot don't lock out the driver. This
 # mirrors askari's real base__firewall_admin_addrs (ubongo's WAN) in the test topology.
 base__firewall_admin_addrs:
  - 192.168.150.1
--- a/tests/integration/overrides/ubongo.yml
+++ b/tests/integration/overrides/ubongo.yml
@ -1,18 +0,0 @@
 ---
 # Integration-test overlay for the "ubongo" profile (ADR-025). Passed via `-e @`.
 # Exercises mesh-hardening 2/3: base's INPUT-only default-deny on the control node — input
 # chain default-deny, forward chain left permissive (Docker/libvirt-NAT safe), no sshd
 # ListenAddress change (so no boot-race).
 # Names the active profile; verify.yml gates its ubongo checks on this value.
 integration_profile: ubongo
 base__firewall_apply: true
 base__firewall_input_only: true        # forward chain renders `policy accept`
 # verify.yml asserts an SSH allow rule for each of the two addresses listed here.
 base__firewall_admin_addrs:
  - "192.168.150.98"                   # two representative LAN sources — exercises the
  - "192.168.150.99"                   # admin-addr loop with a multi-entry list (like ubongo)
 # Never wt0-only; never touch the real mesh from a throwaway VM.
 base__ssh_listen_mesh_only: false
 base__mesh_enabled: false
 # Allow SSH from the libvirt-NAT gateway (where the driver/ansible connect from) so the
 # default-deny apply + the reboot don't lock out the harness. By source IP (interface-
 # independent). This is the harness's lifeline; the admin-addr above is only exercised.
 base__firewall_control_addr: "192.168.150.1"
--- a/tests/integration/profiles/askari.json
+++ b/tests/integration/profiles/askari.json
@ -1,10 +0,0 @@
 {
  "groups": ["offsite_hosts"],
  "applies": [
    {"playbook": "site.yml", "tags": ["base"]},
    {"playbook": "offsite.yml", "tags": ["docker_host", "reverse_proxy"]}
  ],
  "extra_vars_files": ["overrides/askari.yml"],
  "mem_mib": 3072,
  "vcpus": 2
 }
--- a/tests/integration/profiles/askari_inputonly.json
+++ b/tests/integration/profiles/askari_inputonly.json
@ -1,10 +0,0 @@
 {
  "groups": ["offsite_hosts"],
  "applies": [
    {"playbook": "site.yml", "tags": ["base"]},
    {"playbook": "offsite.yml", "tags": ["docker_host", "reverse_proxy"]}
  ],
  "extra_vars_files": ["overrides/askari_inputonly.yml"],
  "mem_mib": 3072,
  "vcpus": 2
 }
--- a/tests/integration/profiles/ubongo.json
+++ b/tests/integration/profiles/ubongo.json
@ -1,9 +0,0 @@
 {
  "groups": ["control"],
  "applies": [
    {"playbook": "site.yml", "tags": ["base"]}
  ],
  "extra_vars_files": ["overrides/ubongo.yml"],
  "mem_mib": 2048,
  "vcpus": 2
 }
--- a/tests/integration/verify.yml
+++ b/tests/integration/verify.yml
@ -1,129 +0,0 @@
 ---
 # Integration verify (ADR-025). Outcome-based, profile-aware: the active profile is named by
 # `integration_profile` (set in each profile's overlay). Each profile asserts its own success
 # criteria; an unknown/unset profile fails loudly (never a silent pass).
 - name: Verify the rebooted host
  hosts: all
  become: true
  gather_facts: false
  tasks:
    # Gatekeeper: every later task is conditioned on integration_profile, so a value outside
    # the known set would skip them all. Reject it up front instead.
    - name: A known integration_profile must be set (no silent pass)
      ansible.builtin.assert:
        fail_msg: "integration_profile must be set in the profile overlay (askari|askari_inputonly|ubongo)"
        that:
          - integration_profile is defined
          - integration_profile in ['askari', 'askari_inputonly', 'ubongo']
    # ── askari profile — Docker host: published-port forwarding survives the reboot ──
    # The load-bearing check probes the VM's published :80 FROM the controller (ubongo) — if
    # base's forward-drop killed DNAT, this times out (the FRICTION 2026-06-17 #1 bug).
    # Collect service state into ansible_facts.services for the Docker check that follows.
    - name: (askari) Gather service facts
      ansible.builtin.service_facts:
      when: integration_profile == 'askari'
    - name: (askari) Docker daemon is active
      when: integration_profile == 'askari'
      ansible.builtin.assert:
        that:
          # Check presence first: indexing a unit that service_facts did not report raises
          # a templating error instead of producing the fail_msg below.
          - "'docker.service' in ansible_facts.services"
          - "ansible_facts.services['docker.service'].state == 'running'"
        fail_msg: "docker.service is not running"
    # Read-only query of the inet filter forward chain; the listing is checked by the next task.
    - name: (askari) Forward chain permits container traffic (drop-in loaded)
      ansible.builtin.command: nft list chain inet filter forward
      register: _fwd
      changed_when: false
      when: integration_profile == 'askari'
    # NOTE(review): this is a substring test — any occurrence of `accept` in the chain
    # listing (a rule or the chain policy) satisfies it.
    - name: (askari) Assert container forwarding is allowed (not pure drop)
      when: integration_profile == 'askari'
      ansible.builtin.assert:
        that: "'accept' in _fwd.stdout"
        fail_msg: >-
          forward chain is pure drop — container forwarding will die on reboot
          (FRICTION 2026-06-17 #1). docker_host container-forward drop-in missing.
    - name: (askari) Published port answers from the controller (DNAT + forward alive)
      when: integration_profile == 'askari'
      # Run the probe on the controller, unprivileged, so the request reaches the VM's
      # published port from outside the VM.
      delegate_to: localhost
      become: false
      ansible.builtin.uri:
        # Probe :80 (plain HTTP) — any answer proves the published-port DNAT + forward path
        # is alive. Don't follow caddy's HTTP->HTTPS redirect (its `tls internal` has no
        # cert for a bare-IP HTTPS request); the 308 itself proves the path works.
        url: "http://{{ ansible_host }}/"
        follow_redirects: none
        status_code: [200, 301, 308, 404, 502, 503]
        timeout: 10
      register: _probe
      # Retry with a 6 s pause until the probe succeeds or the retries are exhausted.
      retries: 5
      delay: 6
      until: _probe is succeeded
    # ── ubongo profile — control node: INPUT-only default-deny survives the reboot ──
    # SSH reachability across the reboot is proven by the harness itself (it re-SSHes and
    # checks boot_id changed before this verify runs). Here we assert the ruleset shape.
    # Read-only dump of the live ruleset; the next task asserts on its text.
    - name: (ubongo) Read the live nftables ruleset
      ansible.builtin.command: nft list ruleset
      register: _nft
      changed_when: false
      when: integration_profile == 'ubongo'
    - name: (ubongo) INPUT default-deny, forward permissive, lifeline + admin-addr allow
      when: integration_profile == 'ubongo'
      ansible.builtin.assert:
        that:
          # live `nft list ruleset` prints the SYMBOLIC priority (`filter` = 0), unlike the
          # rendered /etc/nftables.conf (`priority 0`) that the Molecule scenario asserts against.
          - "'hook input priority filter; policy drop;' in _nft.stdout"
          - "'hook forward priority filter; policy accept;' in _nft.stdout"
          # the ssh-from-control lifeline (base__firewall_control_addr) — the reconnect path
          - "'ip saddr 192.168.150.1 tcp dport 22 accept' in _nft.stdout"
          # both base__firewall_admin_addrs entries from the ubongo overlay
          - "'ip saddr 192.168.150.98 tcp dport 22 accept' in _nft.stdout"
          - "'ip saddr 192.168.150.99 tcp dport 22 accept' in _nft.stdout"
        fail_msg: >-
          ubongo profile: expected input policy drop, forward policy accept (input-only),
          the ssh-from-control lifeline (192.168.150.1), and both admin-addr
          (192.168.150.98/99) SSH allows in the live ruleset.
    # ── askari_inputonly profile — the mesh-hardening REDESIGN (2026-06-19) ──
    # INPUT-only default-deny on a Docker host: input policy drop, forward policy ACCEPT
    # (Docker-safe), SSH via the admin-addr break-glass, published-port DNAT survives reboot.
    # Read-only dump of the live ruleset; the next task asserts on its text.
    - name: (askari_inputonly) Read the live nftables ruleset
      ansible.builtin.command: nft list ruleset
      register: _nft_io
      changed_when: false
      when: integration_profile == 'askari_inputonly'
    - name: (askari_inputonly) INPUT default-deny, forward permissive, admin-addr break-glass
      when: integration_profile == 'askari_inputonly'
      ansible.builtin.assert:
        that:
          # live `nft list ruleset` prints the symbolic priority name (`filter`), hence the
          # literal strings matched here.
          - "'hook input priority filter; policy drop;' in _nft_io.stdout"
          - "'hook forward priority filter; policy accept;' in _nft_io.stdout"
          # the admin-addr break-glass (base__firewall_admin_addrs in the askari_inputonly overlay)
          - "'ip saddr 192.168.150.1 tcp dport 22 accept' in _nft_io.stdout"
        fail_msg: >-
          askari_inputonly: expected input policy drop, forward policy accept (input-only),
          and the admin-addr break-glass (192.168.150.1) SSH allow in the live ruleset.
    # Collect service state into ansible_facts.services for the Docker check that follows.
    - name: (askari_inputonly) Gather service facts
      ansible.builtin.service_facts:
      when: integration_profile == 'askari_inputonly'
    - name: (askari_inputonly) Docker daemon is active
      when: integration_profile == 'askari_inputonly'
      ansible.builtin.assert:
        that:
          # Check presence first: indexing a unit that service_facts did not report raises
          # a templating error instead of producing the fail_msg below.
          - "'docker.service' in ansible_facts.services"
          - "ansible_facts.services['docker.service'].state == 'running'"
        fail_msg: "docker.service is not running"
    - name: (askari_inputonly) Published port answers from the controller (DNAT + forward alive)
      when: integration_profile == 'askari_inputonly'
      # Run the probe on the controller, unprivileged, so the request reaches the VM's
      # published port from outside the VM.
      delegate_to: localhost
      become: false
      ansible.builtin.uri:
        # Plain-HTTP probe of :80; any of the listed status codes counts as an answer, and
        # redirects are deliberately not followed.
        url: "http://{{ ansible_host }}/"
        follow_redirects: none
        status_code: [200, 301, 308, 404, 502, 503]
        timeout: 10
      register: _probe_io
      # Retry with a 6 s pause until the probe succeeds or the retries are exhausted.
      retries: 5
      delay: 6
      until: _probe_io is succeeded
--- a/tests/test_firewall_rules.py
+++ b/tests/test_firewall_rules.py
@ -97,12 +97,3 @@ def test_ingress_missing_port_raises():
    cat = {"svc": {"host": "docker01", "ingress": [{"from": "lan"}]}}
    with pytest.raises(ValueError):
        fr.resolve_firewall_rules(cat, ZONES, "docker01", HOSTVARS, GROUPS)
 def test_public_zone_resolves_to_anywhere():
    """An ingress from the `public` zone resolves to that zone's value, 0.0.0.0/0."""
    ingress = {"from": "public", "port": 443, "proto": "tcp"}
    services = {"web": {"host": "askari", "ingress": [ingress]}}
    zone_map = {"public": "0.0.0.0/0"}
    host_vars = {"askari": {"ansible_host": "100.99.226.39"}}
    resolved = fr.resolve_firewall_rules(services, zone_map, "askari", host_vars, {})
    expected = [{"proto": "tcp", "port": 443, "sources": ["0.0.0.0/0"]}]
    assert resolved == expected
--- a/tests/test_friction_scan.py
+++ b/tests/test_friction_scan.py
@ -123,8 +123,5 @@ def test_nudge_line_overdue_on_age():
 def test_load_signals_reads_real_friction_file():
    path = os.path.join(os.path.dirname(__file__), "..", "docs", "FRICTION.md")
    sigs = fs.load_signals(path, TODAY)
-    # May legitimately be empty right after a /kaizen pass consumes every open signal —
+    assert len(sigs) >= 1
    # an empty Open-signals section is the goal state, not a failure. Assert the function
    # parses the real file into well-formed signals (validity holds vacuously when empty).
    assert isinstance(sigs, list)
    assert all(s["tag"] in {"friction", "gotcha", "recurring", "unused"} for s in sigs)
--- a/tests/test_integration_vm.py
+++ b/tests/test_integration_vm.py
@ -1,106 +0,0 @@
 import importlib.util
 import pathlib
 import types
 import pytest
 # Load scripts/integration-vm.py by file path and bind it as `ivm`; the hyphen in the
 # filename means it cannot be pulled in with a plain `import` statement.
 _PATH = pathlib.Path(__file__).resolve().parent.parent / "scripts" / "integration-vm.py"
 _spec = importlib.util.spec_from_file_location("integration_vm", _PATH)
 ivm = importlib.util.module_from_spec(_spec)
 # Execute the module body so its definitions become attributes of `ivm`.
 _spec.loader.exec_module(ivm)
 def test_valid_tiers():
    """VALID_TIERS is exactly these three tiers, in this order."""
    expected_tiers = ("internal", "le-staging", "le-prod-wildcard")
    assert ivm.VALID_TIERS == expected_tiers
 def test_vm_name_prefix_and_suffix():
    """An explicit suffix is appended verbatim after the `boma-it-askari-` prefix."""
    name = ivm.vm_name("askari", "ab12cd34")
    assert name == "boma-it-askari-ab12cd34"
 def test_vm_name_generates_suffix():
    """Without a suffix argument, vm_name appends an 8-character final segment."""
    generated = ivm.vm_name("askari")
    assert generated.startswith("boma-it-askari-")
    tail = generated.rsplit("-", 1)[-1]
    assert len(tail) == 8
 def test_free_mib_parses_memavailable():
    """free_mib reports the MemAvailable figure in MiB (8388608 kB -> 8192)."""
    meminfo = "".join((
        "MemTotal:       16331156 kB\n",
        "MemAvailable:    8388608 kB\n",
    ))
    assert ivm.free_mib(meminfo) == 8192
 def test_parse_lease_ip_extracts_ipv4():
    """The IPv4 address is pulled from the table row with its /24 suffix removed."""
    lease_table = "".join((
        " Name       MAC address          Protocol     Address\n",
        "-------------------------------------------------------------------\n",
        " vnet0      52:54:00:aa:bb:cc    ipv4         192.168.150.42/24\n",
    ))
    assert ivm.parse_lease_ip(lease_table) == "192.168.150.42"
 def test_parse_lease_ip_none_when_absent():
    """Output with no address row yields None."""
    result = ivm.parse_lease_ip("no leases\n")
    assert result is None
 def test_parse_lease_ip_format_is_source_agnostic():
    """parse_lease_ip handles the shared `virsh domifaddr` table layout.

    `--source arp` and `--source lease` print the same table, so this only shows the
    regex is format-agnostic. The behavioural arp fallback inside wait_for_ip is
    covered by test_wait_for_ip_falls_back_to_arp.
    """
    arp_table = "".join((
        " Name       MAC address          Protocol     Address\n",
        "-------------------------------------------------------------------\n",
        " vnet0      52:54:00:de:ad:be    ipv4         192.168.150.73/24\n",
    ))
    assert ivm.parse_lease_ip(arp_table) == "192.168.150.73"
 def test_wait_for_ip_falls_back_to_arp(monkeypatch):
    """wait_for_ip returns the arp-sourced address when the lease query is empty.

    wait_for_ip polls virsh domifaddr with --source lease first, then --source arp.
    The stubbed `sh` answers with an empty stdout unless the command mentions "arp".
    """
    arp_table = "".join((
        " Name       MAC address          Protocol     Address\n",
        "-------------------------------------------------------------------\n",
        " vnet0      52:54:00:aa:bb:cc    ipv4         192.168.150.142/24\n",
    ))
    def stub_sh(cmd, **kwargs):
        return types.SimpleNamespace(stdout=arp_table if "arp" in cmd else "")
    monkeypatch.setattr(ivm, "sh", stub_sh)
    # Neutralise the polling delay so the test does not actually wait.
    monkeypatch.setattr(ivm.time, "sleep", lambda _: None)
    assert ivm.wait_for_ip("dummy") == "192.168.150.142"
 def test_meta_data_has_instance_and_hostname():
    """Rendered meta-data carries both the instance-id and the local-hostname lines."""
    rendered = ivm.render_meta_data("iid-askari-x", "boma-it-askari-x")
    for expected_line in ("instance-id: iid-askari-x", "local-hostname: boma-it-askari-x"):
        assert expected_line in rendered
 def test_user_data_injects_key_and_ansible_user():
    """Rendered user-data opens with #cloud-config and names the user, key and NOPASSWD:ALL."""
    pubkey = "ssh-ed25519 AAAA... claude@ubongo"
    rendered = ivm.render_user_data(pubkey, "ansible")
    assert rendered.startswith("#cloud-config")
    for fragment in ("name: ansible", pubkey, "NOPASSWD:ALL"):
        assert fragment in rendered
 def test_cert_file_valid_tier():
    """A valid tier maps to `<tier>.yml` inside a directory named `certs`."""
    cert_path = ivm.cert_file("le-staging")
    assert cert_path.name == "le-staging.yml"
    assert cert_path.parent.name == "certs"
 def test_cert_file_rejects_bad_tier():
    """An unrecognised tier name raises ValueError."""
    unknown_tier = "bogus"
    with pytest.raises(ValueError):
        ivm.cert_file(unknown_tier)
 def test_render_run_hosts_single_host_in_groups():
    """The rendered inventory lists the VM under its group with host and user vars."""
    vm = "boma-it-askari-x"
    inventory = ivm.render_run_hosts(vm, "192.168.150.42", "ansible", ["offsite_hosts"])
    expected_fragments = (
        "offsite_hosts:",
        "boma-it-askari-x:",
        "ansible_host: 192.168.150.42",
        "ansible_user: ansible",
    )
    for fragment in expected_fragments:
        assert fragment in inventory
    # With the VM's own key stripped out, no bare `askari:` host entry may remain.
    without_vm_key = inventory.replace("boma-it-askari-x:", "")
    assert "askari:" not in without_vm_key
 def test_free_mib_returns_zero_when_absent():
    """Input without a MemAvailable line yields 0."""
    meminfo_without_available = "MemTotal:    16384 kB\n"
    assert ivm.free_mib(meminfo_without_available) == 0
 def test_render_run_hosts_multiple_groups():
    """Every requested group appears as a key in the rendered inventory."""
    groups = ["offsite_hosts", "docker_hosts"]
    inventory = ivm.render_run_hosts("boma-it-x-1", "192.168.150.5", "ansible", groups)
    for group_key in ("offsite_hosts:", "docker_hosts:"):
        assert group_key in inventory
 def test_render_run_hosts_dedups_groups():
    """A group listed twice is rendered only once."""
    repeated = ["docker_hosts", "docker_hosts"]
    inventory = ivm.render_run_hosts("boma-it-x-1", "192.168.150.5", "ansible", repeated)
    occurrences = inventory.count("docker_hosts:")
    assert occurrences == 1
--- a/tests/test_repo_scan.py
+++ b/tests/test_repo_scan.py
@ -57,99 +57,3 @@ def test_non_numbered_file_is_skipped():
    bare = ["# ADR template\n", "\n", "## Status\n", "\n", "<!-- hint -->\n"]
    out = _checks(rs.adr_structure_findings({"docs/decisions/adr-template.md": bare}))
    assert out == []
 # --- rename-incomplete -------------------------------------------------------
 def _renames(findings):
    return [f for f in findings if f["check"] == "rename-incomplete"]
 def test_rename_incomplete_flags_lingering_old_name():
    """An ADR announces `Foo` -> `Bar`; another decisions file still using Foo is flagged."""
    decisions = {
        "docs/decisions/050-rename.md": [
            "## Decision\n", "We renamed `Foo` to `Bar` across the design docs.\n"],
        "docs/decisions/030-other.md": [
            "The Foo proxy renders config from the catalog.\n"],
    }
    no_extra_docs = {}  # extra_docs (CAPABILITIES/ROADMAP) — none here
    flagged = _renames(rs.rename_incomplete_findings(decisions, no_extra_docs))
    assert len(flagged) == 1
    hit = flagged[0]
    assert hit["path"] == "docs/decisions/030-other.md"
    assert hit["line"] == 1
    assert hit["severity"] == "medium"
    assert "Foo" in hit["detail"] and "Bar" in hit["detail"]
 def test_rename_incomplete_clean_rename_has_no_findings():
    """The rename is announced and no other doc still mentions Foo: nothing is flagged."""
    rename_adr = ["## Decision\n", "We renamed `Foo` to `Bar` across the design docs.\n"]
    updated_doc = ["The Bar proxy renders config from the catalog.\n"]
    decisions = {
        "docs/decisions/050-rename.md": rename_adr,
        "docs/decisions/030-other.md": updated_doc,
    }
    flagged = _renames(rs.rename_incomplete_findings(decisions, {}))
    assert flagged == []
 def test_rename_incomplete_skips_historical_cue_line():
    """Foo lingering only on lines with a historical/negation cue is not flagged."""
    rename_adr = ["## Decision\n", "We renamed `Foo` to `Bar` across the design docs.\n"]
    historical_doc = [
        "Foo was rejected; we run Bar now.\n",
        "The history of Foo informs the choice.\n",
    ]
    decisions = {
        "docs/decisions/050-rename.md": rename_adr,
        "docs/decisions/030-other.md": historical_doc,
    }
    flagged = _renames(rs.rename_incomplete_findings(decisions, {}))
    assert flagged == []
 def test_rename_incomplete_skips_announcing_adr_itself():
    """The announcing ADR has to mention Foo, so it must not be flagged against itself."""
    announcing_adr = [
        "## Decision\n",
        "We renamed `Foo` to `Bar`.\n",
        "Operators who configured Foo should switch their habits.\n",
    ]
    decisions = {"docs/decisions/050-rename.md": announcing_adr}
    flagged = _renames(rs.rename_incomplete_findings(decisions, {}))
    assert flagged == []
 def test_rename_incomplete_skips_line_naming_new_term():
    """A line naming both Foo and Bar is explaining the rename, so it is skipped."""
    rename_adr = ["## Decision\n", "We renamed `Foo` to `Bar`.\n"]
    explaining_doc = ["Foo is being phased out for Bar in this paragraph.\n"]
    decisions = {
        "docs/decisions/050-rename.md": rename_adr,
        "docs/decisions/030-other.md": explaining_doc,
    }
    flagged = _renames(rs.rename_incomplete_findings(decisions, {}))
    assert flagged == []
 def test_rename_incomplete_searches_extra_docs():
    """A lingering old name in CAPABILITIES.md (an extra_docs file) is flagged."""
    decisions = {
        "docs/decisions/050-rename.md": ["## Decision\n", "We renamed `Foo` to `Bar`.\n"],
    }
    extra_docs = {"docs/CAPABILITIES.md": ["The Foo proxy is what we deploy.\n"]}
    flagged = _renames(rs.rename_incomplete_findings(decisions, extra_docs))
    assert len(flagged) == 1
    assert flagged[0]["path"] == "docs/CAPABILITIES.md"
 def test_rename_incomplete_ignores_ambiguous_adr_pointer_assertion():
    """An `ADR-017` doc pointer in a rename-like sentence must not produce findings."""
    # "the ADR-017 prose ... is updated to read Caddy" must NOT parse ADR-017 as the
    # old name (it is a doc pointer). With ADR-017 rejected, no assertion → no finding,
    # even though 'ADR-017' appears in many other docs.
    adr_files = {
        "docs/decisions/024-reverse-proxy.md": [
            "## Consequences\n",
            '- ADR-017 prose that mentioned Traefik is updated to read "Caddy".\n'],
        "docs/decisions/008-testing.md": [
            "Level 4 UI verification follows ADR-017.\n"],
    }
    out = _renames(rs.rename_incomplete_findings(adr_files, {}))
    assert out == []