Compare commits

..

No commits in common. "main" and "feat/m5-mesh-enrollment" have entirely different histories.

95 changed files with 225 additions and 6663 deletions

View file

@ -6,7 +6,6 @@ exclude_paths:
- .venv/ - .venv/
- .collections/ - .collections/
- .scaffold/ - .scaffold/
- tests/integration/.run/ # transient harness run dir (gitignored, generated)
- "**/vault.yml" # ansible-vault encrypted — not lintable YAML - "**/vault.yml" # ansible-vault encrypted — not lintable YAML
# Warn only (don't fail) on these rules during initial setup # Warn only (don't fail) on these rules during initial setup

View file

@ -6,12 +6,7 @@
# 1. The execution-mode menu — writing-plans / subagent-driven-development script a # 1. The execution-mode menu — writing-plans / subagent-driven-development script a
# "Subagent-Driven vs Inline Execution — which approach?" menu at the plan→execution # "Subagent-Driven vs Inline Execution — which approach?" menu at the plan→execution
# handoff. boma's standing preference is to NEVER present it and proceed # handoff. boma's standing preference is to NEVER present it and proceed
# subagent-driven. (Recorded by the 2026-06-10 kaizen review; the 2026-06-17 review # subagent-driven. (Recorded by the 2026-06-10 kaizen review.)
# widened the matcher to also catch free-form *prose* re-asks of the same choice —
# e.g. "which execution approach?" — which the literal-menu matcher missed. The
# sibling push-vs-not re-ask is deliberately NOT hooked: a genuine "should I push?"
# is sometimes legitimate, so it stays a soft default via the
# dont-reask-settled-defaults memory rather than a hard block.)
# 2. The brainstorming spec-review gate — the brainstorming skill scripts "Spec written # 2. The brainstorming spec-review gate — the brainstorming skill scripts "Spec written
# and committed … please review it before … the implementation plan." The standing # and committed … please review it before … the implementation plan." The standing
# agreement is to move directly from the committed spec to writing-plans. (Recorded # agreement is to move directly from the committed spec to writing-plans. (Recorded
@ -44,11 +39,7 @@ text=$(jq -rs '
low="${text,,}" low="${text,,}"
if [[ "$low" == *"inline execution"* \ if [[ "$low" == *"inline execution"* \
&& ( "$low" == *"which approach"* || "$low" == *"two execution options"* ) ]] \ && ( "$low" == *"which approach"* || "$low" == *"two execution options"* ) ]]; then
|| [[ "$low" == *"subagent-driven or inline"* || "$low" == *"inline or subagent"* ]] \
|| [[ "$low" == *"subagent-driven vs inline"* || "$low" == *"subagent vs inline"* \
|| "$low" == *"inline vs subagent"* ]] \
|| [[ "$low" == *"execution approach"* && "$low" == *"?"* ]]; then
cat <<'JSON' cat <<'JSON'
{"decision":"block","reason":"Execution-mode menu detected in your final message. boma standing preference (docs/FRICTION.md + always-subagent-driven-execution memory): never present the subagent-driven-vs-inline menu. Drop the menu and proceed with subagent-driven execution directly (superpowers:subagent-driven-development)."} {"decision":"block","reason":"Execution-mode menu detected in your final message. boma standing preference (docs/FRICTION.md + always-subagent-driven-execution memory): never present the subagent-driven-vs-inline menu. Drop the menu and proceed with subagent-driven execution directly (superpowers:subagent-driven-development)."}
JSON JSON

View file

@ -1,16 +1,12 @@
#!/usr/bin/env bash #!/usr/bin/env bash
# #
# PreToolUse guard (Bash): block `git commit` ONLY when the rbw vault agent is locked # PreToolUse guard (Bash): block `git commit` when the rbw vault agent is locked.
# AND the commit would actually need the vault. The pre-commit ansible-lint hook decrypts # The pre-commit ansible-lint hook decrypts vault.yml via rbw, so a commit while
# vault.yml via rbw — but it is scoped (`files: ^(roles|playbooks|inventories)/.*\.ya?ml$`, # locked fails deep with a confusing error. This catches it early with a clear fix.
# always_run:false), so a docs-/config-only commit never triggers it and needs no vault.
# (2026-06-17 kaizen, docs/FRICTION.md: the old guard blocked *every* locked commit, so a
# docs-only commit got snagged needing a vault password it never uses.)
# #
# Fails OPEN: blocks only on a definitive "Ansible content staged AND rbw locked" signal. # Fails OPEN: only blocks on a definitive "rbw present AND not unlocked" signal.
# rbw missing, not a plain `git commit`, `--no-verify`, or no Ansible content staged → allow. # If rbw is missing, the command isn't a plain `git commit`, or `--no-verify` is
# When unsure it errs toward blocking (asking for an unlock is cheap; a deep pre-commit # used, the action is allowed.
# failure is not).
# #
set -uo pipefail set -uo pipefail
@ -26,25 +22,14 @@ case "$cmd" in
esac esac
command -v rbw >/dev/null 2>&1 || exit 0 # rbw not installed — allow command -v rbw >/dev/null 2>&1 || exit 0 # rbw not installed — allow
rbw unlocked >/dev/null 2>&1 && exit 0 # unlocked — allow
# rbw is LOCKED. Only block if this commit would run the vault-decrypting ansible-lint if rbw unlocked >/dev/null 2>&1; then
# hook — i.e. staged content matches its `files:` scope. Mirror that regex exactly. exit 0 # unlocked — allow
ANSIBLE_RE='^(roles|playbooks|inventories)/.*\.ya?ml$' fi
cd "${CLAUDE_PROJECT_DIR:-.}" 2>/dev/null || exit 0 # rbw present but not unlocked (locked or agent not running) — the commit would
files=$(git diff --cached --name-only 2>/dev/null) || exit 0 # fail in the pre-commit hook, so block early with guidance.
# `git commit -a/--all` also sweeps in modified tracked files that aren't staged yet.
# (Substring match — errs toward including them, which only ever over-blocks. Safe.)
case " $cmd " in
*" -a"*|*"--all"*) files="$files"$'\n'"$(git diff --name-only 2>/dev/null)" ;;
esac
# No Ansible content in the fileset → ansible-lint hook won't run → no vault needed → allow.
printf '%s\n' "$files" | grep -Eq "$ANSIBLE_RE" || exit 0
# Ansible content staged AND rbw locked — the commit would fail deep in pre-commit. Block.
cat <<'JSON' cat <<'JSON'
{"hookSpecificOutput":{"hookEventName":"PreToolUse","permissionDecision":"deny","permissionDecisionReason":"rbw is locked and this commit stages Ansible content — the pre-commit ansible-lint hook needs the vault password to decrypt vault.yml. Run: rbw unlock (docs-/config-only commits are exempt and won't hit this guard.)"}} {"hookSpecificOutput":{"hookEventName":"PreToolUse","permissionDecision":"deny","permissionDecisionReason":"rbw is locked — the pre-commit ansible-lint hook needs the vault password to decrypt vault.yml. Run: rbw unlock"}}
JSON JSON
exit 0 exit 0

View file

@ -69,10 +69,5 @@
] ]
} }
] ]
},
"statusLine": {
"type": "command",
"command": "bash \"${CLAUDE_PROJECT_DIR:-.}/.claude/statusline.sh\"",
"padding": 0
} }
} }

View file

@ -1,63 +0,0 @@
#!/usr/bin/env bash
#
# Claude Code statusLine — shows working dir, model, and context-window usage.
# Wired via .claude/settings.json (statusLine.command). Receives the statusLine
# JSON on stdin; first stdout line is rendered (ANSI colour supported).
#
# Context usage comes straight from the input JSON — no transcript parsing:
#   .context_window.used_percentage      pre-calculated % of the window in use (input side)
#   .context_window.context_window_size  window size in tokens (1000000 for the 1M models)
# verified: Claude Code statusLine schema · code.claude.com/docs/en/statusline · 2026-06-17
#
# Fails soft: any parse problem prints nothing and exits 0 (never breaks the prompt).
set -uo pipefail

#######################################
# Extract the four display fields from the statusLine JSON.
# Inputs:   statusLine JSON on stdin
# Outputs:  one record on stdout: pct<US>window<US>dir-basename<US>model-name
#           (used_percentage preferred, else derived from current_usage, else 0).
# Returns:  jq's exit status (non-zero on unparsable input)
#######################################
statusline_parse() {
  # Fields are joined with US (0x1f), NOT tab: tab is IFS *whitespace*, so
  # `read` collapses consecutive tabs and an empty dir would shift the model
  # name into the dir slot. A non-whitespace separator keeps empty fields, and
  # unlike @tsv it does not backslash-escape the values.
  jq -r '
    (.workspace.current_dir // .cwd // "" | sub(".*/"; "")) as $dir
    | (.model.display_name // "?") as $model
    | (.context_window.context_window_size // 200000) as $win
    | (
        if (.context_window.used_percentage // null) != null then
          .context_window.used_percentage
        elif (.context_window.current_usage // null) != null then
          ((.context_window.current_usage.input_tokens
            + (.context_window.current_usage.cache_creation_input_tokens // 0)
            + (.context_window.current_usage.cache_read_input_tokens // 0)) / $win * 100)
        else 0 end | floor
      ) as $pct
    | [$pct, $win, $dir, $model] | map(tostring) | join("\u001f")
  ' 2>/dev/null
}

#######################################
# Render the status line from already-parsed fields.
# Arguments: $1 - used percentage (integer)
#            $2 - context window size in tokens (non-negative integer)
#            $3 - working-directory basename (may be empty)
#            $4 - model display name
# Outputs:   the coloured status line on stdout; nothing when $1/$2 are not
#            integers (fail soft rather than emit shell arithmetic errors).
# Returns:   0 on the fail-soft path, else printf's status
#######################################
statusline_format() {
  local pct=${1-} win=${2-} dir=${3-} model=${4-}
  local wlabel col filled i bar=""
  local dim=$'\033[2m' rst=$'\033[0m'

  # Everything below does integer arithmetic on these two; bail out quietly on
  # anything else (e.g. a fractional window size or exponent notation).
  [[ $pct =~ ^-?[0-9]+$ && $win =~ ^[0-9]+$ ]] || return 0

  # Human window label: 1000000 -> 1M, 200000 -> 200k, else Nk.
  case "$win" in
    1000000) wlabel="1M" ;;
    *) wlabel="$((win / 1000))k" ;;
  esac

  # Colour the bar/percentage by pressure: green <70, yellow 70-89, red >=90.
  if ((pct >= 90)); then
    col=$'\033[31m' # red
  elif ((pct >= 70)); then
    col=$'\033[33m' # yellow
  else
    col=$'\033[32m' # green
  fi

  # 10-cell bar; clamp fill to [0,10] so an over-100 reading can't overflow.
  filled=$((pct / 10))
  ((filled > 10)) && filled=10
  ((filled < 0)) && filled=0
  for ((i = 0; i < 10; i++)); do
    if ((i < filled)); then bar+="█"; else bar+="░"; fi
  done

  printf '%s%s%s · %s · %s%s %d%%%s %sctx/%s%s\n' \
    "$dim" "$dir" "$rst" \
    "$model" \
    "$col" "$bar" "$pct" "$rst" \
    "$dim" "$wlabel" "$rst"
}

#######################################
# Entry point: read the statusLine JSON from stdin and print the status line.
# Every failure path (unreadable stdin, jq missing, jq error, empty result)
# prints nothing and returns 0.
#######################################
statusline_main() {
  local input parsed pct win dir model
  local sep=$'\x1f' # must match the join() separator in statusline_parse

  input=$(cat 2>/dev/null) || return 0
  command -v jq >/dev/null 2>&1 || return 0
  parsed=$(printf '%s' "$input" | statusline_parse) || return 0
  [[ -n $parsed ]] || return 0

  IFS=$sep read -r pct win dir model <<<"$parsed"
  statusline_format "$pct" "$win" "$dir" "$model"
}

statusline_main

3
.gitignore vendored
View file

@ -34,6 +34,3 @@ terraform/**/terraform.tfvars
# Service-UI verification screenshots (kept locally on ubongo, not committed — ADR-017) # Service-UI verification screenshots (kept locally on ubongo, not committed — ADR-017)
.verify-runs/ .verify-runs/
# Integration-test transient run dir (ADR-025); diagnostics live under ~/integration-runs
tests/integration/.run/

View file

@ -24,5 +24,4 @@ ignore: |
.venv/ .venv/
.collections/ .collections/
.scaffold/ .scaffold/
tests/integration/.run/
**/vault.yml **/vault.yml

View file

@ -43,8 +43,6 @@ Full design rationale: `docs/decisions/`
| Terraform plan | `make tf-plan [TF_ENV=staging]` | | Terraform plan | `make tf-plan [TF_ENV=staging]` |
| Terraform apply | `make tf-apply [TF_ENV=staging]` | | Terraform apply | `make tf-apply [TF_ENV=staging]` |
| Regenerate Ansible inventory | `make tf-inventory TF_ENV=<staging\|production>` | | Regenerate Ansible inventory | `make tf-inventory TF_ENV=<staging\|production>` |
| Integration-test a host on a local VM | `make test-integration HOST=<name> [CERTS=…]` |
| Clean up integration test VMs | `make test-integration-clean` |
**Always `tf-plan` before `tf-apply`. Always `check` before `deploy`. Never skip lint.** **Always `tf-plan` before `tf-apply`. Always `check` before `deploy`. Never skip lint.**
@ -258,10 +256,7 @@ Single-contributor, trunk-based (no merge requests / approval gates):
| Backup & disaster recovery | `docs/decisions/022-backup.md` | | Backup & disaster recovery | `docs/decisions/022-backup.md` |
| ADR structure & lifecycle | `docs/decisions/023-adr-structure.md` | | ADR structure & lifecycle | `docs/decisions/023-adr-structure.md` |
| Reverse proxy (Caddy) | `docs/decisions/024-reverse-proxy.md` | | Reverse proxy (Caddy) | `docs/decisions/024-reverse-proxy.md` |
| Local VM integration testing (ADR-025) | `docs/decisions/025-local-vm-integration-testing.md` |
| Integration testing runbook | `docs/runbooks/integration-testing.md` |
| Adding a new role | `docs/runbooks/new-role.md` | | Adding a new role | `docs/runbooks/new-role.md` |
| Adding a new host | `docs/runbooks/new-host.md` | | Adding a new host | `docs/runbooks/new-host.md` |
| Enrolling a NetBird client (laptop/phone) | `docs/runbooks/netbird-client.md` |
| Rotating vault secrets | `docs/runbooks/rotate-secrets.md` | | Rotating vault secrets | `docs/runbooks/rotate-secrets.md` |
| Claude Code setup (per machine) | `docs/runbooks/claude-code-setup.md` | | Claude Code setup (per machine) | `docs/runbooks/claude-code-setup.md` |

View file

@ -23,11 +23,6 @@ MOLECULE_DOCKERFILE := .docker/molecule-debian13/Dockerfile
# (the Go module proxy 403s Hetzner IPs); push the pinned tag to the Forgejo registry. # (the Go module proxy 403s Hetzner IPs); push the pinned tag to the Forgejo registry.
CADDY_IMAGE := forgejo.nyumbani.baobab.band/sjat/caddy-gandi:2.11.4 CADDY_IMAGE := forgejo.nyumbani.baobab.band/sjat/caddy-gandi:2.11.4
CADDY_DOCKERFILE := .docker/caddy-gandi/Dockerfile CADDY_DOCKERFILE := .docker/caddy-gandi/Dockerfile
# Forgejo container registry (same host/user as the image tags above). `make registry-login`
# logs the Docker daemon in using vault.forgejo.registry_token (2026-06-17 kaizen) so image
# pushes are agent-completable non-interactively.
REGISTRY_HOST := forgejo.nyumbani.baobab.band
REGISTRY_USER := sjat
# For TF_ENV=offsite, source the Hetzner token from the vault into the environment # For TF_ENV=offsite, source the Hetzner token from the vault into the environment
# (rbw must be unlocked). Read in-memory; never written to a tfvars file (CLAUDE.md). # (rbw must be unlocked). Read in-memory; never written to a tfvars file (CLAUDE.md).
@ -39,11 +34,10 @@ endif
.DEFAULT_GOAL := help .DEFAULT_GOAL := help
.PHONY: help setup collections lint test test-all test-integration test-integration-clean \ .PHONY: help setup collections lint test test-all check deploy encrypt decrypt \
check deploy encrypt decrypt \
edit-vault check-vault new-role \ edit-vault check-vault new-role \
tf-init tf-plan tf-apply tf-output tf-inventory tf-inventory-offsite \ tf-init tf-plan tf-apply tf-output tf-inventory tf-inventory-offsite \
molecule-image molecule-image-push caddy-image caddy-image-push registry-login molecule-image molecule-image-push caddy-image caddy-image-push
help: help:
@echo "" @echo ""
@ -54,10 +48,8 @@ help:
@echo " make lint Run yamllint + ansible-lint" @echo " make lint Run yamllint + ansible-lint"
@echo " make test ROLE=<name> Run Molecule tests for a role" @echo " make test ROLE=<name> Run Molecule tests for a role"
@echo " make test-all Run Molecule tests for all roles" @echo " make test-all Run Molecule tests for all roles"
@echo " make test-integration HOST=<name> [CERTS=internal|le-staging] [KEEP=1] Run ADR-025 integration cycle against a VM" @echo " make check PLAYBOOK=<name> [LIMIT=<host>] [TAGS=<tags>] Dry-run a playbook (check mode)"
@echo " make test-integration-clean Prune stale integration-test VM snapshots" @echo " make deploy PLAYBOOK=<name> [LIMIT=<host>] [TAGS=<tags>] Run a playbook against production"
@echo " make check PLAYBOOK=<name> [LIMIT=<host>] [TAGS=<tags>] [EXTRA=<args>] Dry-run a playbook (check mode)"
@echo " make deploy PLAYBOOK=<name> [LIMIT=<host>] [TAGS=<tags>] [EXTRA=<args>] Run a playbook against production"
@echo " make edit-vault [VAULT=<path>] Edit the vault in nvim (auto re-encrypts + checks)" @echo " make edit-vault [VAULT=<path>] Edit the vault in nvim (auto re-encrypts + checks)"
@echo " make check-vault [VAULT=<path>] Validate vault structure (values masked)" @echo " make check-vault [VAULT=<path>] Validate vault structure (values masked)"
@echo " make encrypt FILE=<path> Encrypt a vault file" @echo " make encrypt FILE=<path> Encrypt a vault file"
@ -77,7 +69,6 @@ help:
@echo " make molecule-image-push Push the test image to the Forgejo registry" @echo " make molecule-image-push Push the test image to the Forgejo registry"
@echo " make caddy-image Build the custom Caddy + Gandi DNS-01 image (run on ubongo)" @echo " make caddy-image Build the custom Caddy + Gandi DNS-01 image (run on ubongo)"
@echo " make caddy-image-push Push the Caddy image to the Forgejo registry" @echo " make caddy-image-push Push the Caddy image to the Forgejo registry"
@echo " make registry-login Log Docker into the Forgejo registry (vaulted token)"
@echo "" @echo ""
# ── Environment setup ───────────────────────────────────────────────────────── # ── Environment setup ─────────────────────────────────────────────────────────
@ -112,29 +103,19 @@ test-all:
cd $$role && PATH="$(CURDIR)/$(VENV)/bin:$$PATH" molecule test; cd ../..; \ cd $$role && PATH="$(CURDIR)/$(VENV)/bin:$$PATH" molecule test; cd ../..; \
done done
test-integration:
ifndef HOST
$(error HOST is required: make test-integration HOST=<name> [CERTS=internal|le-staging] [KEEP=1])
endif
PATH="$(CURDIR)/$(VENV)/bin:$$PATH" $(PYTHON) scripts/integration-vm.py cycle \
--host $(HOST) $(if $(CERTS),--certs $(CERTS)) $(if $(KEEP),--keep)
test-integration-clean:
PATH="$(CURDIR)/$(VENV)/bin:$$PATH" $(PYTHON) scripts/integration-vm.py prune
# ── Playbook execution ──────────────────────────────────────────────────────── # ── Playbook execution ────────────────────────────────────────────────────────
check: check:
ifndef PLAYBOOK ifndef PLAYBOOK
$(error PLAYBOOK is required: make check PLAYBOOK=<name>) $(error PLAYBOOK is required: make check PLAYBOOK=<name>)
endif endif
$(PLAYBOOK_BIN) $(INVENTORY) $(VAULT_ARGS) $(if $(LIMIT),--limit $(LIMIT)) $(if $(TAGS),--tags $(TAGS)) $(EXTRA) --check --diff playbooks/$(PLAYBOOK).yml $(PLAYBOOK_BIN) $(INVENTORY) $(VAULT_ARGS) $(if $(LIMIT),--limit $(LIMIT)) $(if $(TAGS),--tags $(TAGS)) --check --diff playbooks/$(PLAYBOOK).yml
deploy: deploy:
ifndef PLAYBOOK ifndef PLAYBOOK
$(error PLAYBOOK is required: make deploy PLAYBOOK=<name>) $(error PLAYBOOK is required: make deploy PLAYBOOK=<name>)
endif endif
$(PLAYBOOK_BIN) $(INVENTORY) $(VAULT_ARGS) $(if $(LIMIT),--limit $(LIMIT)) $(if $(TAGS),--tags $(TAGS)) $(EXTRA) playbooks/$(PLAYBOOK).yml $(PLAYBOOK_BIN) $(INVENTORY) $(VAULT_ARGS) $(if $(LIMIT),--limit $(LIMIT)) $(if $(TAGS),--tags $(TAGS)) playbooks/$(PLAYBOOK).yml
# ── Vault ───────────────────────────────────────────────────────────────────── # ── Vault ─────────────────────────────────────────────────────────────────────
@ -178,13 +159,6 @@ caddy-image:
caddy-image-push: caddy-image caddy-image-push: caddy-image
docker push $(CADDY_IMAGE) docker push $(CADDY_IMAGE)
# Log the local Docker daemon into the Forgejo registry using the vaulted token, so the
# *-image-push targets above are agent-completable non-interactively (rbw must be unlocked).
registry-login:
@ANSIBLE_VAULT="$(ANSIBLE)-vault" PYTHON="$(PYTHON)" VAULT="$(VAULT)" \
REGISTRY_HOST="$(REGISTRY_HOST)" REGISTRY_USER="$(REGISTRY_USER)" \
bash scripts/registry-login.sh
# ── Terraform ───────────────────────────────────────────────────────────────── # ── Terraform ─────────────────────────────────────────────────────────────────
tf-init: tf-init:

View file

@ -5,7 +5,7 @@ This repo is partly aspirational: the ADRs in `docs/decisions/` describe the
truth. **Before relying on a role, provider, or pipeline existing, check here.** truth. **Before relying on a role, provider, or pipeline existing, check here.**
If something is listed as "designed, not built", do not assume it works. If something is listed as "designed, not built", do not assume it works.
_Last reviewed: 2026-06-19._ _Last reviewed: 2026-06-14._
## Real and working today ## Real and working today
@ -30,8 +30,8 @@ _Last reviewed: 2026-06-19._
| `roles/dev_env/` — interactive developer environment | **Built + applied.** zsh + oh-my-zsh + oh-my-posh, tmux + TPM plugins, neovim; dotfiles deployed via GNU stow (re-derived from V4/fisi per ADR-013). Node.js from a pinned upstream tarball (not Debian's npm). Lint + Molecule (idempotent) green. **Applied to `ubongo`** for users `sjat` + `claude` (verified: zsh login shells, stow-symlinked `.zshrc`/`.tmux.conf` + nvim config, oh-my-zsh, tmux plugins; nvim v0.12.2, oh-my-posh 29.0.1). Run via `playbooks/workstation.yml` against the `control` group (no dedicated `workstations` group yet). | | `roles/dev_env/` — interactive developer environment | **Built + applied.** zsh + oh-my-zsh + oh-my-posh, tmux + TPM plugins, neovim; dotfiles deployed via GNU stow (re-derived from V4/fisi per ADR-013). Node.js from a pinned upstream tarball (not Debian's npm). Lint + Molecule (idempotent) green. **Applied to `ubongo`** for users `sjat` + `claude` (verified: zsh login shells, stow-symlinked `.zshrc`/`.tmux.conf` + nvim config, oh-my-zsh, tmux plugins; nvim v0.12.2, oh-my-posh 29.0.1). Run via `playbooks/workstation.yml` against the `control` group (no dedicated `workstations` group yet). |
| `make check` / `make deploy PLAYBOOK=<name>` | **Works.** First end-to-end run (applying `dev_env`) surfaced + fixed latent bugs: Makefile `PLAYBOOK` var collision (binary path vs playbook-name arg) meant the targets never ran; `ansible.cfg` referenced uninstalled community.general callbacks (now built-in `default` + `ansible.posix.profile_tasks`); `acl` package added so Ansible can `become_user` an unprivileged user. The make targets now function — though `site`/`base`/`docker_host` content is still incomplete (see below). | | `make check` / `make deploy PLAYBOOK=<name>` | **Works.** First end-to-end run (applying `dev_env`) surfaced + fixed latent bugs: Makefile `PLAYBOOK` var collision (binary path vs playbook-name arg) meant the targets never ran; `ansible.cfg` referenced uninstalled community.general callbacks (now built-in `default` + `ansible.posix.profile_tasks`); `acl` package added so Ansible can `become_user` an unprivileged user. The make targets now function — though `site`/`base`/`docker_host` content is still incomplete (see below). |
| `roles/public_dns/` + `playbooks/dns.yml` | **Built + applied.** Manages wingu.me at Gandi LiveDNS as code (`community.general.gandi_livedns`, PAT from `vault.gandi.pat`); record data, anti-spoof baseline (SPF `-all` + DMARC reject), and the Gandi-defaults purge are defined + unit-tested (`tests/test_public_dns.py`). **Applied to wingu.me (2026-06-14):** purged Gandi's 13 seeded defaults; zone now holds only the SPF + DMARC TXT records; idempotent re-run clean. No null-MX (Gandi rejects `0 .`) — the MX is removed, so no MX + no apex A = no mail. M1 of the roadmap. | | `roles/public_dns/` + `playbooks/dns.yml` | **Built + applied.** Manages wingu.me at Gandi LiveDNS as code (`community.general.gandi_livedns`, PAT from `vault.gandi.pat`); record data, anti-spoof baseline (SPF `-all` + DMARC reject), and the Gandi-defaults purge are defined + unit-tested (`tests/test_public_dns.py`). **Applied to wingu.me (2026-06-14):** purged Gandi's 13 seeded defaults; zone now holds only the SPF + DMARC TXT records; idempotent re-run clean. No null-MX (Gandi rejects `0 .`) — the MX is removed, so no MX + no apex A = no mail. M1 of the roadmap. |
| `ubongo` — physical control / AI-worker host (ADR-015) | **Built (partial).** Debian 13.5 on a Lenovo M70q (i3-10100T, 16 GB, 256 GB SSD; no disk encryption — accepted risk). Full toolchain installed + pinned to `fisi` (Docker 29.5.3, rbw 1.15.0, Claude Code 2.1.173, ansible-core 2.17.14 + molecule via `make setup`/`make collections`). Repo cloned under a dedicated `claude` user (docker + libvirt groups, **`NOPASSWD:ALL` sudo** — ADR-015 amended 2026-06-18; operator `sjat` uses password-required sudo via `sudo` group; the former `sjat-ansible` NOPASSWD drop-in removed 2026-06-18). Vault works via rbw (offline-cache decryption verified). SSH key-only (password + root login disabled). In the production inventory `control` group at 10.20.10.151. **`dev_env` now applied here** (zsh/tmux/nvim for `sjat` + `claude`, via `playbooks/workstation.yml`). Managed as the operator account `sjat` (`group_vars/control` sets `ansible_user: sjat`), not the `ansible` service user `group_vars/all` assumes — ubongo has no bootstrapped `ansible` user. **NetBird mesh-enrolled (M5, 2026-06-17):** `wt0` up at `100.99.146.14` via the `base` `mesh` concern. **`base` firewall applied (mesh-hardening 2/3, 2026-06-19):** INPUT-only default-deny — input locked to `wt0` + ssh-from-control (`10.20.10.151`) + workstations (`10.20.10.50` mamba, `10.20.10.17`); forward `accept` (Docker/libvirt-NAT safe). Live-verified (SSH self-path + Docker egress, after a post-apply `restart docker` — base's flush wipes Docker nat, FRICTION); **real-host reboot-validated (2026-06-19):** after an operator reboot, the `policy drop` input chain + full allow-list re-applied on boot and the `wt0` mesh + SSH self-path came back clean. `claude` now self-SSHes (ad-hoc `authorized_keys` grant so the agent can run SSH-based deploys with the auto-rollback safety; fold into the control-node bootstrap). 
**Pending:** full `base` hardening (auditd/CIS); proper `ansible`-user bootstrap (currently managed as `sjat`); OPNsense DHCP reservations (10.20.10.151 MAC `88:a4:c2:e0:ee:da` + the `.50`/`.17` workstation leases); Terraform state backup (now relevant — the offsite tfstate exists). | | `ubongo` — physical control / AI-worker host (ADR-015) | **Built (partial).** Debian 13.5 on a Lenovo M70q (i3-10100T, 16 GB, 256 GB SSD; no disk encryption — accepted risk). Full toolchain installed + pinned to `fisi` (Docker 29.5.3, rbw 1.15.0, Claude Code 2.1.173, ansible-core 2.17.14 + molecule via `make setup`/`make collections`). Repo cloned under a dedicated `claude` user (docker group, no sudo). Vault works via rbw (offline-cache decryption verified). SSH key-only (password + root login disabled). In the production inventory `control` group at 10.20.10.151. **`dev_env` now applied here** (zsh/tmux/nvim for `sjat` + `claude`, via `playbooks/workstation.yml`). Managed as the operator account `sjat` (`group_vars/control` sets `ansible_user: sjat`), not the `ansible` service user `group_vars/all` assumes — ubongo has no bootstrapped `ansible` user. **NetBird mesh-enrolled (M5, 2026-06-17):** `wt0` up at `100.99.146.14` via the `base` `mesh` concern; agent management now works because `claude`'s SSH key was added to `sjat`'s `authorized_keys` and `sjat` was granted `NOPASSWD` sudo (`/etc/sudoers.d/sjat-ansible`) — the interim until the proper `ansible`-user bootstrap. **Pending:** full `base` hardening (only `firewall` exists, NOT applied here — default-deny is the deferred mesh-hardening step now that `wt0` exists); proper `ansible`-user bootstrap (currently managed as `sjat`); OPNsense DHCP reservation for 10.20.10.151 (MAC `88:a4:c2:e0:ee:da`); Terraform state backup (now relevant — the offsite tfstate exists). |
| `askari` — off-site Hetzner VPS (ADR-007/016, M2) | **Built + applied.** Provisioned by Terraform (`environments/offsite`, `hetznercloud/hcloud`) as **cx23 / hel1 / Debian 13.5** (CAX11/ARM was out of stock EU-wide on 2026-06-14 → cx23 is same-spec x86, cheaper). cloud-init created the `ansible` user + passwordless sudo; a TF-managed Hetzner Cloud Firewall allows SSH only from ubongo's WAN (`91.226.145.80`). Reachable from ubongo (`ansible offsite_hosts -m ping` ✓), in the `offsite_hosts` inventory (generated `offsite.yml`), published at `askari.wingu.me``77.42.120.136`. **SSH-hardened + fail2ban (M3).** **Docker + Caddy reverse proxy (M4a):** `docker_host` + `reverse_proxy` (vanilla Caddy, HTTP-01) applied; `https://test.askari.wingu.me` serves a valid Let's Encrypt cert ✓ (firewall opens 80/443/3478). **NetBird coordinator (M4b):** `netbird_coordinator` deployed — dashboard live at `https://netbird.askari.wingu.me` (valid LE cert), management API behind embedded Dex (401 unauth), STUN on 3478/udp. **NetBird peer (M5, 2026-06-17):** also enrolled as a mesh agent (`base` `mesh` concern) — `wt0` at `100.99.226.39`, Management+Signal Connected; the agent coexists with the coordinator. **Mesh-hardening redesign applied + live reboot-validated (2026-06-20):** `base` INPUT-only nftables default-deny (`inet filter` input `policy drop`; forward `accept`, Docker-safe via a post-apply `restart docker`), SSH `wt0`-primary + a permanent WAN break-glass (ubongo's WAN `91.226.145.80`; the Hetzner console is the OOB ultimate fallback), managed over `wt0`; `netbird_coordinator` geolocation disabled (`NB_DISABLE_GEOLOCATION`) so a no-egress boot can't FATAL it. A real reboot recovered **unattended** — firewall persisted, Docker forwarding + public services (Caddy 80/443, STUN 3478) up, coordinator geo-disabled (no FATAL), `wt0`/mesh (Management+Signal Connected) + both SSH paths back. 
**Pending:** offsite tfstate backup (ADR-022); relay-SPOF reduction (next mesh-hardening sub-project — `ubongo→askari` is currently `Relayed` through askari's own relay). | | `askari` — off-site Hetzner VPS (ADR-007/016, M2) | **Built + applied.** Provisioned by Terraform (`environments/offsite`, `hetznercloud/hcloud`) as **cx23 / hel1 / Debian 13.5** (CAX11/ARM was out of stock EU-wide on 2026-06-14 → cx23 is same-spec x86, cheaper). cloud-init created the `ansible` user + passwordless sudo; a TF-managed Hetzner Cloud Firewall allows SSH only from ubongo's WAN (`91.226.145.80`). Reachable from ubongo (`ansible offsite_hosts -m ping` ✓), in the `offsite_hosts` inventory (generated `offsite.yml`), published at `askari.wingu.me``77.42.120.136`. **SSH-hardened + fail2ban (M3).** **Docker + Caddy reverse proxy (M4a):** `docker_host` + `reverse_proxy` (vanilla Caddy, HTTP-01) applied; `https://test.askari.wingu.me` serves a valid Let's Encrypt cert ✓ (firewall opens 80/443/3478). **NetBird coordinator (M4b):** `netbird_coordinator` deployed — dashboard live at `https://netbird.askari.wingu.me` (valid LE cert), management API behind embedded Dex (401 unauth), STUN on 3478/udp. **NetBird peer (M5, 2026-06-17):** also enrolled as a mesh agent (`base` `mesh` concern) — `wt0` at `100.99.226.39`, Management+Signal Connected; the agent coexists with the coordinator. **Pending:** host firewall + moving askari's SSH onto `wt0` (deferred mesh-hardening; the Hetzner Cloud Firewall is its perimeter until then), offsite tfstate backup (ADR-022). |
| `roles/docker_host/` (Docker engine) + `roles/reverse_proxy/` (Caddy, ADR-024) | **Built + applied** (askari, M4a). `docker_host` installs Docker CE + compose; `reverse_proxy` is boma's standard Caddy proxy (HTTP-01 for public hosts; routes from `reverse_proxy__routes`). **DNS-01 for mesh/LAN-only services is now built + proven (2026-06-15):** custom `caddy-gandi` image (`.docker/caddy-gandi/`, `make caddy-image`, pinned caddy-dns/gandi v1.1.0 → Bearer PAT), enabled per-instance via `reverse_proxy__acme_dns_provider: gandi` + `reverse_proxy__image`. Verified end-to-end — a real wildcard cert issued via LE **staging** + Gandi DNS-01 with `vault.gandi.pat`. M4a's deferral (version skew + Hetzner-IP build) is closed; image **pending registry push** (`make caddy-image-push` needs `docker login`). The `reverse_proxy` Caddyfile is bind-mounted as a **directory** (`./caddy``/etc/caddy`) so atomic re-renders are visible in-container and `caddy reload` actually applies new routes (a single-file mount pinned the stale inode). | | `roles/docker_host/` (Docker engine) + `roles/reverse_proxy/` (Caddy, ADR-024) | **Built + applied** (askari, M4a). `docker_host` installs Docker CE + compose; `reverse_proxy` is boma's standard Caddy proxy (HTTP-01 for public hosts; routes from `reverse_proxy__routes`). **DNS-01 for mesh/LAN-only services is now built + proven (2026-06-15):** custom `caddy-gandi` image (`.docker/caddy-gandi/`, `make caddy-image`, pinned caddy-dns/gandi v1.1.0 → Bearer PAT), enabled per-instance via `reverse_proxy__acme_dns_provider: gandi` + `reverse_proxy__image`. Verified end-to-end — a real wildcard cert issued via LE **staging** + Gandi DNS-01 with `vault.gandi.pat`. M4a's deferral (version skew + Hetzner-IP build) is closed; image **pending registry push** (`make caddy-image-push` needs `docker login`). 
The `reverse_proxy` Caddyfile is bind-mounted as a **directory** (`./caddy` → `/etc/caddy`) so atomic re-renders are visible in-container and `caddy reload` actually applies new routes (a single-file mount pinned the stale inode). |
| `roles/netbird_coordinator/` — NetBird control plane (ADR-016, M4b) | **Built + applied (askari, 2026-06-16). boma's FIRST real service role.** Self-hosted NetBird **v0.72.4**: a single combined `netbird-server` container (management + signal + relay + STUN + **embedded Dex IdP** at `/oauth2`) + `dashboard:v2.39.0`, on the shared `boma` network behind the M4a Caddy via gRPC-h2c + WebSocket + path routing (`reverse_proxy__routes` gained a raw-`caddy` route type). Secrets `vault.netbird.{auth_secret,datastore_key}` (self-generated). Carries the full service-role file set (SECURITY/VERIFY/ACCESS/BACKUP) — **first stateful role** (`backup__state: true`; encrypted SQLite at `/var/lib/netbird`, off-site backup pending `fisi`/ADR-022). **Verified live:** dashboard 200 + valid LE cert, `/api` 401 (auth-gated, routes OK), STUN up. **Not yet configured:** first-boot `/setup` admin + peer enrolment = M5. | | `roles/netbird_coordinator/` — NetBird control plane (ADR-016, M4b) | **Built + applied (askari, 2026-06-16). boma's FIRST real service role.** Self-hosted NetBird **v0.72.4**: a single combined `netbird-server` container (management + signal + relay + STUN + **embedded Dex IdP** at `/oauth2`) + `dashboard:v2.39.0`, on the shared `boma` network behind the M4a Caddy via gRPC-h2c + WebSocket + path routing (`reverse_proxy__routes` gained a raw-`caddy` route type). Secrets `vault.netbird.{auth_secret,datastore_key}` (self-generated). Carries the full service-role file set (SECURITY/VERIFY/ACCESS/BACKUP) — **first stateful role** (`backup__state: true`; encrypted SQLite at `/var/lib/netbird`, off-site backup pending `fisi`/ADR-022). **Verified live:** dashboard 200 + valid LE cert, `/api` 401 (auth-gated, routes OK), STUN up. **Not yet configured:** first-boot `/setup` admin + peer enrolment = M5. |
@ -39,7 +39,7 @@ _Last reviewed: 2026-06-19._
| Thing | State | | Thing | State |
|---|---| |---|---|
| `roles/base/` | **Partially built.** Concerns built: `firewall` (nftables: catalog-driven default-deny + east-west allowlist + auto-rollback apply; ADR-020) and **`hardening`** (M3: sshd drop-in key-only + `PermitRootLogin no`, fail2ban sshd jail 5/1h; ADR-002) — both pytest/Molecule-tested. The **`hardening`** concern is **applied to askari** (`make deploy PLAYBOOK=site LIMIT=askari TAGS=hardening`). The `firewall` concern is **applied to ubongo** (mesh-hardening 2/3, 2026-06-19) **and askari** (mesh-hardening redesign, 2026-06-20) — both INPUT-only default-deny via the `base__firewall_input_only` knob (input default-deny + `wt0`/ssh-from-control/`base__firewall_admin_addrs` allow-list; forward left `accept` so Docker/libvirt-NAT survive), both **live reboot-validated**. On a Docker host (askari) base's `flush ruleset` wipes Docker's nat, so the cutover follows the firewall apply with a `restart docker` to rebuild it (FRICTION). Not built: auditd, packages, users (Phase 2 / TODO 15). The `mesh` concern also pins the coordinator FQDN in `/etc/hosts` (`base__mesh_coordinator_pin`) so a local-DNS hiccup can't strand the mesh — **applied + live on ubongo (2026-06-20)**: `getent hosts netbird.askari.wingu.me` → `77.42.120.136`, mesh unaffected. The single-coordinator SPOF is an accepted availability risk (R8, ADR-016 availability amendment). | | `roles/base/` | **Partially built.** Concerns built: `firewall` (nftables: catalog-driven default-deny + east-west allowlist + auto-rollback apply; ADR-020) and **`hardening`** (M3: sshd drop-in key-only + `PermitRootLogin no`, fail2ban sshd jail 5/1h; ADR-002) — both pytest/Molecule-tested. The **`hardening`** concern is **applied to askari** (`make deploy PLAYBOOK=site LIMIT=askari TAGS=hardening`). The `firewall` concern is built but **not yet applied** to any host (mesh-gated to avoid lockout — M5). Not built: auditd, packages, users (Phase 2 / TODO 15). |
| `inventories/*/hosts.yml` | Structured stubs with empty host maps (`hosts: {}`); regenerated by `make tf-inventory` once Terraform has hosts | | `inventories/*/hosts.yml` | Structured stubs with empty host maps (`hosts: {}`); regenerated by `make tf-inventory` once Terraform has hosts |
| `inventories/production/group_vars/{docker_hosts,proxmox_hosts}/` | Empty dirs | | `inventories/production/group_vars/{docker_hosts,proxmox_hosts}/` | Empty dirs |
@ -50,7 +50,7 @@ daemon hardening + `nftables.d` container rules, ADR-004/ADR-020 — is still pe
A `make deploy PLAYBOOK=site` run now applies real content — `base` (its `firewall` + A `make deploy PLAYBOOK=site` run now applies real content — `base` (its `firewall` +
`hardening` concerns) plus a functional `docker_host` (Docker engine) on docker hosts — `hardening` concerns) plus a functional `docker_host` (Docker engine) on docker hosts —
but in practice it is still limited: the production cluster has no docker hosts yet, and but in practice it is still limited: the production cluster has no docker hosts yet, and
`base`'s `firewall` concern is now applied to `ubongo` (control) but not yet to cluster docker hosts (none exist), so a full cluster `site` run does not `base`'s `firewall` concern is mesh-gated until M5, so a full cluster `site` run does not
yet exist. (The `make check`/`deploy` machinery itself works — first proven by applying yet exist. (The `make check`/`deploy` machinery itself works — first proven by applying
`dev_env` via `playbooks/workstation.yml`, then `base`/`docker_host`/`reverse_proxy` on `dev_env` via `playbooks/workstation.yml`, then `base`/`docker_host`/`reverse_proxy` on
askari.) askari.)
@ -70,7 +70,7 @@ askari.)
| CIS hardening (Debian L1+L2 + Docker) | ADR-002 / TODO 15 | Implemented by the (unbuilt) `base`/`docker_host` roles; brings AppArmor + AIDE as baseline. L2 partitions affect VM provisioning (ADR-006) | | CIS hardening (Debian L1+L2 + Docker) | ADR-002 / TODO 15 | Implemented by the (unbuilt) `base`/`docker_host` roles; brings AppArmor + AIDE as baseline. L2 partitions affect VM provisioning (ADR-006) |
| Network IDS + security alerting | ADR-002 / TODO 15 | Suricata on OPNsense + AIDE/`auditd`/`fail2ban` alerting into the monitoring stack; not built | | Network IDS + security alerting | ADR-002 / TODO 15 | Suricata on OPNsense + AIDE/`auditd`/`fail2ban` alerting into the monitoring stack; not built |
| NetBird mesh — coordinator on `askari` | ADR-016 | **BUILT + applied (M4b, 2026-06-16)** — moved up to "Real and working today" (`roles/netbird_coordinator/`). Self-hosted control plane on askari; replaces ADR-007 WireGuard. Mesh **peer enrolment = M5** (next row). | | NetBird mesh — coordinator on `askari` | ADR-016 | **BUILT + applied (M4b, 2026-06-16)** — moved up to "Real and working today" (`roles/netbird_coordinator/`). Self-hosted control plane on askari; replaces ADR-007 WireGuard. Mesh **peer enrolment = M5** (next row). |
| NetBird agent enrollment in `base` | ADR-016 | **BUILT + applied (M5, 2026-06-17).** The `base` `mesh` concern (opt-in `base__mesh_enabled`) installs the pinned NetBird agent + runs `netbird up` with the reusable scoped key from `vault.netbird.setup_key`. Applied to **askari (`100.99.226.39`) + ubongo (`100.99.146.14`)** — both Management+Signal Connected; ubongo↔askari mesh ping verified. Enrollment is **additive** — the "SSH only on `wt0`" firewall lockdown is the deferred mesh-hardening follow-on, NOT applied. **Road-warrior clients (`mamba` + work laptop) enrolled (2026-06-17) → `ubongo` reachable from anywhere: the mobile-access goal is met and Phase 1 (remote access) is COMPLETE.** Client enrollment runbook: `docs/runbooks/netbird-client.md`. | | NetBird agent enrollment in `base` | ADR-016 | **BUILT + applied (M5, 2026-06-17).** The `base` `mesh` concern (opt-in `base__mesh_enabled`) installs the pinned NetBird agent + runs `netbird up` with the reusable scoped key from `vault.netbird.setup_key`. Applied to **askari (`100.99.226.39`) + ubongo (`100.99.146.14`)** — both Management+Signal Connected; ubongo↔askari mesh ping verified. Enrollment is **additive** — the "SSH only on `wt0`" firewall lockdown is the deferred mesh-hardening follow-on, NOT applied. Road-warrior clients (laptops) are operator-enrolled. |
| Service-UI verification (Level 4) | ADR-017 / ADR-008 | **Design RESOLVED** (ADR-017 + spec + plan); resolves ADR-015 deferred #2. `/verify-service` skill + `VERIFY.md` template + standards are authorable and present. **Build pending:** running needs ubongo + `playwright` plugin + Authentik + a staging deploy. | | Service-UI verification (Level 4) | ADR-017 / ADR-008 | **Design RESOLVED** (ADR-017 + spec + plan); resolves ADR-015 deferred #2. `/verify-service` skill + `VERIFY.md` template + standards are authorable and present. **Build pending:** running needs ubongo + `playwright` plugin + Authentik + a staging deploy. |
| Logging pipeline (Loki + Alloy + off-site subset) | ADR-018 | **Design RESOLVED** (ADR-018 + spec). All logs → on-cluster Loki; security subset write-only off-site to askari. **Build pending:** Alloy in `base`, `loki`/`grafana` service roles, OPNsense syslog — none built. | | Logging pipeline (Loki + Alloy + off-site subset) | ADR-018 | **Design RESOLVED** (ADR-018 + spec). All logs → on-cluster Loki; security subset write-only off-site to askari. **Build pending:** Alloy in `base`, `loki`/`grafana` service roles, OPNsense syslog — none built. |
| Security alerting (AIDE/auditd/fail2ban/Suricata + log-silence) | ADR-002 / ADR-018 | Wired into Grafana on the Loki stack. Designed; depends on the logging pipeline + metrics stack (TODO 3.6). | | Security alerting (AIDE/auditd/fail2ban/Suricata + log-silence) | ADR-002 / ADR-018 | Wired into Grafana on the Loki stack. Designed; depends on the logging pipeline + metrics stack (TODO 3.6). |
@ -81,18 +81,6 @@ askari.)
| Backup `backup` role + `backup_hosts` group | ADR-022 | Does not exist. Pull node (`fisi`), restic repo, rclone→pCloud, USB air-gap — Plan 2. | | Backup `backup` role + `backup_hosts` group | ADR-022 | Does not exist. Pull node (`fisi`), restic repo, rclone→pCloud, USB air-gap — Plan 2. |
| Per-service `backup__*` contract + `BACKUP.md` | ADR-022 | Convention defined; inert until service roles exist to declare against. | | Per-service `backup__*` contract + `BACKUP.md` | ADR-022 | Convention defined; inert until service roles exist to declare against. |
## Integration test harness (ADR-025)
| Thing | State |
|---|---|
| `roles/integration_test/` | **Built** — installs/enables libvirt+QEMU+virtinst on `control` group hosts; adds `sjat`/`claude` to `libvirt` group; creates image-cache dir. Lint clean; applied live to ubongo (substrate installed); molecule scenario present, not run in the build env. |
| `scripts/integration-vm.py` | **Built** — stdlib-only lifecycle driver over `virsh`/`virt-install`/`cloud-localds`: `up / apply / reboot / assert / cycle / down / prune / console`. Lazily ensures the golden Debian-13 genericcloud image. pytest clean (transient-inventory generation, var/overlay merge, `--certs` mapping, DHCP-lease parsing, resource-guard math). |
| `tests/integration/` (profile, verify, overrides) | **Built** — "be askari" profile + var overlay + `verify.yml` outcome assertions (Docker active, forward-chain accepts present, published-port DNAT alive). Validated end-to-end by the RED→GREEN acceptance run. |
| `make test-integration` / `make test-integration-clean` | **Built** — wired into `Makefile`. |
| ADR-025 | **Accepted (2026-06-18)** — decision recorded, approach A, cert tiers, safety invariants, UEFI boot requirement, and claude-sudo dependency documented. |
| **RED/GREEN acceptance (ubongo live pass)** | **PASSED (2026-06-18).** A throwaway KVM VM on ubongo reproduced the 2026-06-17 incident (base nftables forward default-deny kills Docker forwarding on reboot) = RED. Applying the `docker_host` container-forward drop-in and rebooting survived = GREEN. Nine shakedown findings captured in `docs/FRICTION.md`; key learnings (UEFI boot, claude sudo) recorded in ADR-025. `docs/TODO.md` item 2.4 closed. |
| `le-staging` cert validation | **Pending** — wired in v1 but not yet exercised on a real VM (separate from the RED/GREEN acceptance gate). |
## Keeping this honest ## Keeping this honest
Update this file whenever you build, stub, or remove something. It is the first Update this file whenever you build, stub, or remove something. It is the first

View file

@ -22,259 +22,83 @@ earning its keep.
_(append new raw signals here; the next kaizen review consumes them)_ _(append new raw signals here; the next kaizen review consumes them)_
- `[friction]` **Re-asked settled defaults (push + subagent-driven) at the plan→execute handoff** - `[friction]` **Image push to the Forgejo registry fails with `no basic auth
(2026-06-19): despite the standing preference (memory `dont-reask-settled-defaults`: push to credentials`** (2026-06-15): `make caddy-image-push` (and `molecule-image-push`) fail
origin as off-machine backup **and** go subagent-driven, both WITHOUT asking), I again asked the unless the Docker daemon on ubongo has an interactive `docker login
operator "which execution approach?" and "want me to push?". The `writing-plans` skill scripts forgejo.nyumbani.baobab.band` session — and those creds are **not in vault** (only
that handoff question ("Which approach?"), and confirming a push felt natural — both overrode the `gandi` + `hetzner` are), so an agent can't complete a push non-interactively. The
memory. → at the writing-plans → execution handoff, default to subagent-driven execution and push build half is fully automatable; the push half silently requires a human. → candidate:
to origin without a confirmation gate; reserve questions for genuine forks. Recurrence of an document the `docker login` step in `docs/runbooks/claude-code-setup.md`, **or** store
already-recorded signal — treat the skill's scripted "Which approach?" as pre-answered a scoped Forgejo registry token in vault + a `make registry-login` target (login via
(subagent-driven) for this operator. `--password-stdin`, `no_log`) so pushes are agent-completable like every other
vault-backed action.
<!-- The six below are from the 2026-06-17 mesh-hardening-1/3 incident: applying base's - `[gotcha]` **Single-file Docker bind mount + atomic config rewrite = stale config in
nftables default-deny + wt0-only sshd to askari (the off-site Docker host that ALSO runs the running container** (2026-06-16): `reverse_proxy` bind-mounted the Caddyfile as a
the NetBird coordinator) took it down on reboot; recovery needed the Hetzner console + single file; `ansible.builtin.template` writes atomically (temp + rename → new inode),
a WAN-SSH break-glass. Spec/plan: docs/superpowers/{specs,plans}/2026-06-17-mesh-hardening-askari-ssh-wt0*. --> so the running container kept the OLD inode and `caddy reload` (in-container, no restart)
re-read stale config and silently no-op'd (`"config is unchanged"`). The NetBird route
never loaded → Caddy never requested its cert; surfaced only by a TLS handshake failure.
Fix: mount the config **directory** (`./caddy` → `/etc/caddy`) — directory mounts reflect
inode swaps, so live reload works (proven on askari). NOTE the sibling case: NetBird also
single-file-mounts `config.yaml`, but its handler does `docker compose restart` (not an
in-container reload), and a restart DOES re-resolve the bind mount (verified: 0 before,
1 after) — so restart-based roles are safe; only in-place-reload roles need the dir mount.
→ candidate gotcha doc (`docs/testing/gotchas.md`): "reload-in-place needs a directory
mount; restart-based roles are fine with a single-file mount."
- `[gotcha]` **`base`'s nftables `forward policy drop` breaks Docker hosts on reboot** - `[friction]` **`make check` always fails on the first-ever deploy of a compose service
(2026-06-17): `base/templates/nftables.conf.j2` sets `chain forward { ... policy drop; }`. role** (2026-06-16): in check mode the "ensure base_dir" task is reported-but-not-run, so
On a Docker host, container traffic is *forwarded* (published-port DNAT → container, and the later `community.docker.docker_compose_v2` up fails with `"…is not a directory"`
inter-container over the bridge), so the drop kills it. It worked right after `make (missing `project_src`). Not a defect — a real deploy creates the dir — but it means the
deploy` (Docker's runtime rules coexisted) but after a reboot nftables loaded our CLAUDE.md "always `make check` before `make deploy`" step is guaranteed-red for any brand
default-deny *before* Docker, breaking WAN→Caddy and Caddy→coordinator → the public new stateful role, which erodes trust in the check. → candidate: guard the compose-up with
services and the mesh went down. The `docker_host` "`nftables.d` container-forward rules" `not ansible_check_mode` (clean "skipped" in dry-run; compose can't be meaningfully
that would make this Docker-safe are explicitly **pending** (STATUS.md). → the `base` dry-run before first deploy anyway), OR document the one-time expected failure. Decide one.
firewall (`base__firewall_apply`) must NOT be applied to any Docker host until
`docker_host` ships the container-forward rules; add a guard/check (a Docker host with
`firewall_apply: true` and no container-forward drop-in is a misconfiguration), and the
firewall design (ADR-020) should state the Docker-host dependency explicitly.
- `[gotcha]` **`ip_nonlocal_bind` did NOT beat the sshd boot-race** (2026-06-17): the - `[recurring]` **Re-asked the operator about settled defaults — push + execution mode**
mesh-hardening plan bound sshd `ListenAddress` to the `wt0` IP and set (2026-06-17): at the M5 plan handoff I asked (a) whether to push to origin and (b) which
`net.ipv4.ip_nonlocal_bind=1` so sshd could bind the mesh IP before `wt0` exists at execution mode (subagent-driven vs inline) — both already settled: CLAUDE.md says push to
boot. In practice the console still showed sshd *"could not assign the address"* at boot `origin` often (off-machine backup), and TODO 10.5 / the standing agreement is "always
— so the protection did not work as designed, and because `wt0` never came up (the subagent-driven" (there's even `guard-execution-mode-menu.sh`). Same shape as the 5×
coordinator was down), sshd had no listener at all → no SSH path. → the entire "execution-mode menu asked AGAIN" ledger entries — but this time the ask was my own
"sshd listens on `wt0` only" premise is unsound without (a) a *verified* boot-race fix free-form prose ("want those pushed now?", "which execution approach?"), which the
and (b) a guaranteed non-mesh break-glass. Re-investigate why `ip_nonlocal_bind` didn't existing menu-text matcher does NOT catch (it keys on the writing-plans menu's literal
help (ordering vs the sysctl drop-in load? the sysctl not applied before sshd start?), text). → the gap is that the guard only matches that literal menu; free-form re-asks slip
or drop ListenAddress-on-mesh entirely and rely on the host firewall for SSH scoping. through. Candidate: widen the Stop-hook matcher to also flag prose re-asks of
push-vs-not / subagent-vs-inline, since prose reminders have already failed this many
times. Default behaviour: **push as backup and proceed subagent-driven without asking.**
- `[gotcha]` **The coordinator host can't bootstrap the mesh it depends on** (2026-06-17): - `[friction]` **A docs-only commit still tripped the `rbw`-locked pre-commit guard**
`askari` runs the NetBird coordinator AND is a mesh peer. After a reboot its NetBird (2026-06-17): committing only `docs/superpowers/specs/*.md` (no ansible content) was
agent needs the coordinator (a local container) to be serving to bring up `wt0` — but blocked needing the vault password, although the 2026-06-10 kaizen fix scoped the
the coordinator wasn't healthy, so `wt0` never came up. Circular. Combined with sshd pre-commit `ansible-lint` hook (`always_run: false` + `files:` ansible content) so
being `wt0`-only, the host was reachable only via the Hetzner console. → the docs-/config-only commits skip it and need no vault. So either the hook's `files:`
coordinator host must keep a **non-mesh management path always** (don't move its SSH onto pattern still matches `docs/**` (or `.md`), or a blanket pre-commit step needs the
`wt0`), or the mesh-hardening must treat the coordinator host as a special case. General vault regardless. → check `.pre-commit-config.yaml`'s `files:`/`exclude:` against the
rule: never make a host's only management path depend on a service that host itself spec/plan paths; docs-only commits should not require `rbw`.
hosts.
- `[gotcha]` **NetBird `netbird-server` FATAL-loops on the geolocation DB download with no - `[friction]` **The agent can't manage `ubongo` (the control node it runs ON) without
egress** (2026-06-17): on startup the combined `netbird-server:0.72.4` tries to download the operator granting access** (2026-06-17): enrolling `ubongo` in the mesh needed two
the GeoLite2 DB from `pkgs.netbird.io` and treats failure as **FATAL** (crash-loop) — so manual operator grants because the agent runs as `claude` (no sudo) but the inventory
any loss of container egress (here: Docker NAT masquerade wiped when `nftables` was manages `ubongo` as `sjat`: (1) `claude`'s SSH key added to `sjat`'s `authorized_keys`
flushed, not re-added by a plain `restart docker`) takes the whole control plane down. (`Permission denied (publickey)` otherwise), then (2) `NOPASSWD` sudo for `sjat`
Recovery was `restart docker` (rebuild NAT) → force-recreate the container so it could (`Missing sudo password` otherwise). So the "AI-worker control node" (ADR-015) can drive
download. → for the `netbird_coordinator` role: pre-seed/persist the geo DB in the data the whole fleet but not itself, unattended. This is the **pending `ansible`-user
dir (or pin a local copy), or disable the geolocation requirement, so a transient egress bootstrap** gap (STATUS) biting in practice. → the proper fix is ubongo's bootstrap to a
blip can't FATAL the coordinator. Note for the firewall design: container egress (NAT) key-trusted, NOPASSWD `ansible` (or `sjat`) management identity as part of `base`/its
is fragile across `nft flush` + reboot. control-node recipe, so control-node self-management doesn't need ad-hoc operator grants.
- `[friction]` **No off-site coordinator backup turned a 2-minute restore into a long live - `[recurring]` **ADRs claim cross-doc reconciliation they didn't actually perform**
recovery** (2026-06-17): the NetBird coordinator's stateful store (`/var/lib/netbird`, (2026-06-14): ADR-024's Status + Consequences asserted "ADR-017 prose that mentioned
encrypted SQLite) has **no off-site backup yet** (ADR-022 `backup` role pending, Traefik is updated to read Caddy" — but ADR-008/017/019 + CAPABILITIES still said
flagged in STATUS as the coordinator's deferred backup). During the incident there was a Traefik; the rename was left half-done across the doc set and the ADR over-claimed its
real fear the unclean reboots had corrupted the store, with no restore path. It turned own follow-through. Surfaced only by a full-repo `grep Traefik` during `/review-repo`.
out to be a runtime/egress issue, not corruption — but the absence of a backup made the Same shape as the deferred-decision-goes-stale signal (a decision lands in one place,
whole recovery higher-stakes. → prioritise the ADR-022 backup contract for the its promised ripple edits don't). → candidate `repo-scan.py` check: when an ADR's text
`netbird_coordinator` store ahead of the rest of the backup role; a recent off-host copy asserts "X is updated to Y" / supersedes a named tool, flag remaining occurrences of the
would have made "rebuild askari from scratch" a safe option. old name (or verify the claimed edit landed) — the structural cousin of `stale-deferred`.
(KEEP-OPEN per the 2026-06-14 `/kaizen` run — it's its own build task.)
- `[friction]` **The plan tested reboot-recovery AFTER removing the break-glass**
(2026-06-17): the mesh-hardening plan's live cutover closed the WAN `:22` (step 5)
*before* the reboot-resilience test (step 7), so the one fallback path was gone exactly
when the reboot exposed the boot-race + Docker-firewall bugs. → sequencing rule for
lockout-risky cutovers: **validate reboot-recovery while the old access path is still
open**, and only retire the break-glass once recovery (incl. a reboot) is proven.
Generalises beyond this milestone — a candidate line in the new-host / hardening runbooks.
<!-- The below are from the 2026-06-18 ADR-025 build: standing up the local-VM integration
harness on ubongo and shaking it down against real KVM (spec/plan in docs/superpowers/). -->
- `[gotcha]` **Debian 13 genericcloud boot-loops under legacy BIOS/SeaBIOS** (2026-06-18):
`virt-install --import` of the genericcloud qcow2 with the default (SeaBIOS) firmware
triple-faults at the real-mode kernel handoff — GRUB loops, no "Decompressing Linux", no
DHCP lease. The symptom (no network) pointed away from the cause (firmware). → boot test
VMs via **UEFI** (`virt-install --boot uefi`; OVMF→efistub).
- `[friction]` **The no-sudo `claude` model blocked diagnosing a failed VM** (2026-06-18):
under ADR-015 `claude` had no sudo, so when the VM wouldn't network there was no way to
introspect it (serial logs are `root:0600`, libguestfs not installed, mounting needs
root). Diagnosis was fully blocked until the operator granted `claude` sudo. → DECISION:
`claude` gets `NOPASSWD:ALL` (reverses ADR-015's "no local sudo"); compensating control
is auditd/Loki attribution (already in ADR-015). Amend ADR-015/ADR-021 + accepted-risks;
codify the sudoers drop-in in Ansible.
- `[gotcha]` **Non-root `virsh`/`virt-install` default to `qemu:///session`** (2026-06-18):
the substrate (NAT net, /dev/kvm) lives on `qemu:///system`. → pin
`LIBVIRT_DEFAULT_URI=qemu:///system` in the driver.
- `[gotcha]` **`qemu:///system` (libvirt-qemu) can't traverse `/home`** (2026-06-18): VM
disk/seed/console under the repo/home failed "Permission denied (search permissions for
/home/claude)". → put per-VM artifacts in a system-readable dir (`/var/lib/boma-integration`,
group libvirt); the inventory (read by ansible as the user) can stay in the repo.
- `[gotcha]` **`ansible-playbook -i <dir>/` parses sibling non-inventory files as INI**
(2026-06-18): pointing `-i` at a run-dir holding a state file + qcow2s made the directory
inventory loader parse the state file as INI → phantom hosts INCLUDING the real `askari`
(with its real vars), breaking the single-host isolation invariant. → point `-i` at the
single `hosts.yml`. Caught by the holistic cross-file review BEFORE any hardware run.
- `[gotcha]` **Jinja `{%- -%}` + ansible `trim_blocks=True` double-strip newlines**
(2026-06-18): a template edit used `{%- -%}`, reviewed by rendering with RAW jinja2
(trim_blocks=False) which looked fine; ansible (trim_blocks=True) then collapsed the
rendered Caddyfile onto single lines → caddy crash-looped on invalid config. → verify
templates with ansible's whitespace (trim_blocks=True), not raw jinja2; prefer plain
`{% %}` at column 0 (the repo's existing style).
- `[gotcha]` **Fresh cloud images have empty apt lists** (2026-06-18): `apt install
nftables` failed "No package matching 'nftables' is available" on a fresh genericcloud
VM whose cloud-init had `package_update: false`. → `package_update: true` AND block on
`cloud-init status --wait` before applying.
- `[gotcha]` **base's default-deny firewall drops SSH to a NAT'd VM unless the gateway is
allowed** (2026-06-18): the driver reaches the VM via the libvirt-NAT gateway
(192.168.150.1). `ct established,related accept` saves the in-flight apply connection,
but a fresh post-reboot SSH is dropped without an explicit allow. → test overlay sets
`base__firewall_control_addr` to the NAT gateway.
- `[recurring]` **Real-hardware shakedown and static review each caught what the other
couldn't** (2026-06-18): the qemu-URI, storage-path, UEFI, apt-list, and caddy-render
bugs ALL surfaced only on a live KVM run; the phantom-host inventory bug surfaced only in
the holistic cross-file review. → for infra this novel, budget for BOTH an adversarial
cross-file review AND a real-hardware run; neither alone would have shipped it working.
<!-- From the 2026-06-19 mesh-hardening-2/3 design (ubongo INPUT-only default-deny). -->
- `[friction]` **Raw DHCP leases pinned in ubongo's host firewall (admin-addr SSH allows)**
(2026-06-19): mesh-hardening 2/3 lets the operator workstations reach ubongo's LAN SSH by
*raw lease* — `base__firewall_admin_addrs: ["10.20.10.50" (mamba), "10.20.10.17"]` — because
there is no DHCP reservation yet (OPNsense isn't managed as code). A lease reassignment
silently moves the allow to whatever host next holds the IP (still SSH-key-gated) and drops
the workstation's *LAN* path (mesh still works, so never a full lockout). → when
OPNsense-as-code lands (ADR-020 perimeter / TODO 3.5), replace both with **MAC-pinned DHCP
reservations** (`10.20.10.17` = MAC `bc:0f:f3:c8:4a:8a`; mamba's MAC TBD) and allow the
reserved IPs. Spec: `docs/superpowers/specs/2026-06-19-mesh-hardening-ubongo-default-deny-design.md`.
- `[gotcha]` **`make test-integration` on ubongo fails (`qemu-img` "Permission denied") when
the agent session predates the `libvirt` group grant** (2026-06-19): the `integration_test`
role adds `claude` to `libvirt`+`kvm` and makes the cache dir `/var/lib/boma-integration`
`root:libvirt 2775` — correct — but a `claude` session whose shell started *before* that
grant carries a stale process group set (`id` → `claude,docker` only, no `libvirt`), so
`qemu-img create` of the VM overlay into the group-owned dir is denied. `virsh`/`virt-install`
still work (they reach system libvirtd via polkit/socket, and the real KVM runs server-side
as `libvirt-qemu`), so ONLY claude's own file-writes break. Unblock without restarting the
session: **`sg libvirt -c 'make test-integration HOST=<name>'`** (claude needs only `libvirt`
for the dir; `kvm` is server-side; note `sg` adds one group, not the full set). → self-heal
in `scripts/integration-vm.py`: if the `libvirt` gid is absent from `os.getgroups()`, re-exec
under `sg libvirt` (or have the Makefile target do it), so a stale-session agent never hits
this opaque symptom. New agent sessions pick the groups up on login, so it's a stale-session
transient — but high-confusion, worth self-healing.
- `[friction]` **No standard for when the agent may run local-VM integration tests on ubongo
without asking** (2026-06-19): `make test-integration HOST=<name>` spins an ISOLATED throwaway
KVM VM (its own libvirt NAT; never touches the real host's firewall/network; guards:
one-VM-at-a-time + a 4 GiB free-RAM floor + auto-destroy on success), so it is safe and
self-contained — yet the agent paused for a go-ahead before running it (mesh-hardening 2/3,
Task 4). The operator wants a STANDARD that pre-authorises VM-testing on ubongo so the agent
just runs it. → decide + record the rule: e.g. a `.claude/settings.json` permission allow for
`make test-integration*` / `scripts/integration-vm.py` (and the `sg libvirt -c '…'` form per
the gotcha above), plus a CLAUDE.md line distinguishing the pre-authorised isolated VM tests
from the genuinely-gated live steps (`make deploy` to real hosts, host reboots, cutovers —
still need a go-ahead). Ties to the `test-risky-infra-before-live-deploy` +
`dont-reask-settled-defaults` memories + ADR-025.
- `[gotcha]` **Molecule covers only the `input_only`-OFF (forward drop) branch of the base
firewall** (2026-06-19): mesh-hardening 2/3 added `base__firewall_input_only` (forward policy
drop↔accept). The `default` Molecule scenario renders ONE fixture, set to the secure default
(drop) — so the fast `make test ROLE=base` gate locks the drop default (security-critical for
service hosts) but does NOT exercise the `=true` → forward-`accept` rendering; only `make
test-integration HOST=ubongo` does (passed GREEN). An in-converge re-render can't cheaply
cover it (role defaults aren't in scope outside the role run). → decide in kaizen: a second
Molecule scenario (`molecule/input-only/`) asserting forward `policy accept`, vs accepting the
integration-only coverage. Final-review finding; not a cutover blocker (the accept branch is a
literal, and a var-name break would fail the drop branch too → caught).
- `[gotcha]` **Applying base's firewall to a Docker host flushes Docker's nat → container
egress dies until `restart docker`** (2026-06-19, mesh-hardening 2/3 live cutover): base's
`nftables.conf.j2` starts with `flush ruleset`, which wipes ALL tables incl. Docker's
`ip nat`/`ip filter` (+ libvirt's). On ubongo I chose INPUT-only so `forward` stays `accept`
— yet the apply STILL broke CONTAINER egress: `docker pull` worked (dockerd uses HOST egress)
but a container `ping` FAILED — the masquerade (SNAT) was gone, so replies couldn't return.
`forward accept` permits forwarding but can't replace the missing nat. The spec's "input-only
keeps Docker egress working" was therefore **incomplete**, and the local-VM harness couldn't
catch it (the test VM runs no Docker). Fix on the live host: `systemctl restart docker`
re-adds its `ip nat`/`ip filter` (egress restored; coexists fine with base's `inet filter`).
On REBOOT it self-heals (dockerd re-adds nat on boot; `forward accept` doesn't block — unlike
the 2026-06-17 `forward drop` incident). → (1) any cutover/runbook applying base firewall to a
Docker host MUST `restart docker` + check container egress after the apply; (2) the pending
`docker_host` nftables integration should own re-adding/persisting Docker's rules so base's
`flush` is safe; (3) the firewall final-review checklist should include "does the host run
Docker/libvirt? the flush wipes their nat."
<!-- From the 2026-06-19 mesh-hardening 3/3 (askari INPUT-only integration gate). -->
- `[gotcha]` **`inet filter` default-deny blocks libvirt dnsmasq DHCP — silent, hard to diagnose**
(2026-06-19, task-3 integration gate): when `base__firewall_input_only: true` is applied to
ubongo, the `table inet filter { chain input { policy drop; } }` blocks DHCP packets that arrive
via the libvirt bridge (`virbr-boma`). In nftables, multiple tables at the same hook priority all
run independently; an `accept` verdict in `table ip filter LIBVIRT_INP` does NOT prevent
`table inet filter` from seeing and dropping the same packet. VMs never got DHCP leases (dnsmasq
socket confirmed by strace to never receive POLLIN despite tcpdump seeing the packet on
`virbr-boma`). Diagnosed by temporarily changing `inet filter input` to `policy accept` → fd=3
immediately fired. Fix: `/etc/nftables.d/10-libvirt-boma.nft` drop-in adding
`iifname "virbr-boma" accept` (survives service restarts via `include "/etc/nftables.d/*.nft"`).
→ The `base` role's template needs a `base__firewall_trusted_bridges` variable so this is
encoded at the Ansible level, not in a manual host drop-in. Every host that runs Docker or
libvirt and also has `base__firewall_input_only: true` needs an analogous exception.
- `[gotcha]` **libvirt `leaseshelper` PID-file permission: `virPidFileReleasePath` unlinks
`/run/leaseshelper.pid` after EVERY call; nobody cannot recreate it** (2026-06-19, task-3
integration gate): dnsmasq runs as nobody; `libvirt_leaseshelper` is its `--dhcp-script`. The
helper acquires a PID-file mutex at `/run/leaseshelper.pid`, but `virPidFileReleasePath`
UNLINKS the file on exit. `/run/` is `root:root 755`, so nobody cannot create the file after the
first unlink → every subsequent `add` call fails with `errno=13`, dnsmasq silently drops the
DHCP grant (no log, no error to the client). Fix: suid root C wrapper at
`/usr/lib/libvirt/libvirt_leaseshelper` (original moved to `.real`) that pre-creates
`/run/leaseshelper.pid` owned by nobody, then drops privileges and execs the real helper. The
root dnsmasq fork calls the wrapper; suid gives it permission to touch `/run/`; on return to
nobody uid the PID file stays. Also: `/var/lib/libvirt/dnsmasq/` must be `nobody:nogroup 775`
so leaseshelper can update `virbr-boma.status`. This fix is host-local on ubongo and NOT in
Ansible — encode it in an `integration_test` role task (or a libvirt role) before the harness
can be safely re-deployed.
- `[gotcha]` **cloud-init rejects underscores in `local-hostname` → silently skips
network-config → VM never gets DHCP** (2026-06-19, task-3 integration gate): setting
`local-hostname: boma-it-askari_inputonly-<uuid>` caused cloud-init-local to consider the
hostname invalid and skip writing the network-config to the system. Systemd-networkd then
used the genericcloud default (no DHCP), so VMs got only IPv6 link-local. Fix in
`scripts/integration-vm.py`: `name.replace("_", "-")` in the meta-data hostname (disk paths
and virsh domain names keep the original underscore). Sanitization rule: RFC-952 hostnames
allow hyphens, not underscores.
- `[friction]` **Molecule Docker image can't `apt install` → roles with real package tasks
have no Molecule substrate coverage** (2026-06-19): the Docker Molecule image ships with
cleared apt-lists and no internet access, so any role whose core work is `apt install` —
`base`, `docker_host`, `integration_test` — cannot cover its package/substrate tasks in
Molecule. Those tasks are validated only by `make test-integration` (ADR-025, real KVM).
The gap is systemic: it affects every role with non-trivial package or system-level setup.
→ systematization idea: provide a Molecule image or driver that can install packages (e.g.
a custom Docker image with pre-seeded apt-lists, or a `prepare.yml` that pre-installs
packages from a local cache), or an alternative driver (e.g. `molecule-libvirt` using the
same KVM harness), so substrate tasks get real Molecule unit coverage rather than relying
entirely on the integration harness.
--- ---
@ -282,29 +106,6 @@ harness on ubongo and shaking it down against real KVM (spec/plan in docs/superp
Consumed signals and where their resolution now lives. Newest first. Consumed signals and where their resolution now lives. Newest first.
### 2026-06-17
Second `/kaizen` run. 7 signals triaged; all 7 consumed (0 kept open). Two heavier items
(the `rename-incomplete` scan check and the Forgejo registry-login path) were built by
parallel subagents and verified against the diff. **Bias-to-remove note:** one PARK
(the ubongo self-management gap — out-of-phase, already tracked in STATUS) and zero
REMOVE; the rest accreted (migrate/change). None of the open signals were `[unused]`
*tooling*, so there was nothing to delete — the only reductive move available was parking
the out-of-phase build. **Cadence:** healthy — 3 days after the first run, every signal
0–2 days old except the one carried over from 2026-06-14; the "recurring ≥3" nudge in
`scripts/friction-scan.py` didn't fire this pass (all recurrence counts were 1), so the
thresholds need no change.
| Signal (first seen) | Verdict | Resolution / where it lives now |
|---|---|---|
| ADRs claim cross-doc reconciliation they didn't perform (06-14) | SYSTEMATIZE | New `rename-incomplete` check in `scripts/repo-scan.py` (+7 tests): when a numbered ADR announces a rename `Old` → `New`, flag any design-doc line where `Old` still appears in present tense (skips the announcing ADR, lines also naming `New`, and historical/negation cues; rejects `ADR-NNN` tokens as terms). 0 findings on the current tree — the Traefik→Caddy ripple edits have landed. Structural cousin of `stale-deferred`; run by `/review-repo`. (Was KEEP-OPEN on 2026-06-14 — now built.) |
| Image push to the Forgejo registry needs an interactive `docker login` (06-15) | SYSTEMATIZE → vault | Vault-backed login path so pushes are agent-completable: `vault.forgejo.registry_token` stub (CHANGEME, operator-minted) + `scripts/registry-login.sh` (reads the token, `docker login --password-stdin`, never echoes it) + `make registry-login` + a prereq note in `docs/runbooks/claude-code-setup.md`. Works once the operator fills the token via `make edit-vault`. |
| Single-file bind mount + atomic rewrite = stale config (06-16) | SYSTEMATIZE | → `docs/testing/gotchas.md` — "Single-file bind mount + atomic rewrite = stale config (reload-in-place only)": `template` writes a new inode, a single-file bind mount pins the old one, so an in-container reload reads stale config. Mount the config *directory* for reload-in-place roles; restart-based roles are fine with a single-file mount. |
| `make check` always fails on the first-ever deploy of a compose service role (06-16) | CHANGE | `check_mode: false` on the `state: directory` scaffold tasks in `roles/reverse_proxy` + `roles/netbird_coordinator`, so the base dirs exist under `--check` and the rest of the dry-run (templates + compose) evaluates instead of failing on a missing `project_src`. Inert under converge → Molecule unchanged. |
| Re-asked settled defaults — push + execution mode, in prose (06-17) | CHANGE (exec) + ACCEPTED (push) | Widened `.claude/hooks/guard-execution-mode-menu.sh` to also catch free-form *prose* re-asks of the subagent-vs-inline choice (`"which execution approach?"`, `"subagent vs inline"`, …), not just the literal menu; tested. The push re-ask stays a soft default via the `dont-reask-settled-defaults` memory — a genuine "should I push?" is sometimes legitimate, so it is deliberately not hard-blocked. |
| Docs-only commit tripped the rbw-locked pre-commit guard (06-17) | CHANGE | Root cause was NOT the ansible-lint `files:` scope (innocent) — it was `.claude/hooks/guard-vault-preflight.sh` blocking *every* locked `git commit`. Rewrote it to inspect the staged set (`git diff --cached`, plus `-a`/`--all`) and block only when Ansible content (`^(roles\|playbooks\|inventories)/.*\.ya?ml$`) is staged; docs-/config-only commits are now exempt. Fail-safe to block when unsure. Tested. |
| Agent can't self-manage `ubongo` (the control node it runs on) without operator grants (06-17) | PARK | The knowledge already lives in `STATUS.md` (control-node row: the interim `claude`-key + `sjat` NOPASSWD grants, and **Pending:** the proper `ansible`-user bootstrap) and the `ubongo-self-sufficiency` memory. Out-of-phase — the fix is the control-node bootstrap recipe, a tracked future build. **Resurrection trigger:** when building ubongo's `base` hardening / `ansible`-user bootstrap, fold in key-trusted NOPASSWD self-management so control-node self-management needs no ad-hoc operator grants. |
### 2026-06-14 ### 2026-06-14
First `/kaizen` run (dogfood). 12 signals triaged; 11 consumed, 1 kept open (#13 above — First `/kaizen` run (dogfood). 12 signals triaged; 11 consumed, 1 kept open (#13 above —

View file

@ -13,7 +13,7 @@ as ordering changes, or as new milestones appear. Each milestone gets its own
spec → plan → implementation cycle (`docs/superpowers/specs/` then `…/plans/`) when it spec → plan → implementation cycle (`docs/superpowers/specs/` then `…/plans/`) when it
comes up; this file stays high-level. comes up; this file stays high-level.
_Last updated: 2026-06-19._ _Last updated: 2026-06-11._
--- ---
@ -43,10 +43,9 @@ this collapses into interleaving with extra context-switching cost).
--- ---
## Phase 1 — Off-site / Remote-access — ✅ COMPLETE (2026-06-17) ## Phase 1 — Off-site / Remote-access
Delivers mobile access to `ubongo`; proves the machinery. Ordered by *real* dependencies. Delivers mobile access to `ubongo`; proves the machinery. Ordered by *real* dependencies.
All milestones (M1–M5) done; the mobile-access goal is met. Next: the Procurement gate.
### M1 · boma's DNS home — a new domain at Gandi, managed as code ### M1 · boma's DNS home — a new domain at Gandi, managed as code
@ -135,14 +134,14 @@ Dashboard live at `https://netbird.askari.wingu.me` (valid LE cert); `/api` auth
- **Maps to:** ADR-016 (mesh), ADR-004 (one service = one role), ADR-021 (access), - **Maps to:** ADR-016 (mesh), ADR-004 (one service = one role), ADR-021 (access),
ADR-022 (backup), ADR-008/017 (VERIFY), accepted-risk R3 (askari public surface). ADR-022 (backup), ADR-008/017 (VERIFY), accepted-risk R3 (askari public surface).
### M5 · Enroll peers → goal reached — ✅ DONE (2026-06-17) ### M5 · Enroll peers → goal reached — ✅ infra done (2026-06-17); laptops = operator step
The `base` `mesh` concern enrolled **`ubongo` (`100.99.146.14`) + `askari` The `base` `mesh` concern enrolled **`ubongo` (`100.99.146.14`) + `askari`
(`100.99.226.39`)** as NetBird peers — both Management+Signal Connected, the ubongo↔askari (`100.99.226.39`)** as NetBird peers — both Management+Signal Connected, the ubongo↔askari
mesh link ping-verified. NetBird ships a default **Allow-All** peer policy, so any enrolled mesh link ping-verified. NetBird ships a default **Allow-All** peer policy, so any enrolled
peer reaches `ubongo` over `wt0`. The road-warrior clients (**`mamba` + the work laptop**) peer can already reach `ubongo` over `wt0`. **Remaining (operator):** install the NetBird
are enrolled (operator, via `docs/runbooks/netbird-client.md`) → **`ubongo` is reachable client on `mamba` + the work laptop and log in → `ubongo` reachable from anywhere. **← the
from anywhere. ← the mobile-access goal is met; Phase 1 is complete.** mobile-access goal lands when the laptops join.**
- **Deferred to a "mesh-hardening" follow-on** (was folded into M5; split out as the - **Deferred to a "mesh-hardening" follow-on** (was folded into M5; split out as the
lockout-risky part): apply `base` nftables **default-deny** to `ubongo` + set lockout-risky part): apply `base` nftables **default-deny** to `ubongo` + set
@ -206,22 +205,6 @@ Canonical dependency order:
## Next step ## Next step
**Phase 1 complete (M1–M5); mesh-hardening: ubongo (2/3) DONE 2026-06-19, askari redesign DONE 2026-06-20.** **M1 (Gandi DNS migration, IaC)** design is written —
Both hosts now run INPUT-only nftables default-deny (`base__firewall_input_only`), live reboot-validated. `docs/superpowers/specs/2026-06-11-public-dns-gandi-migration-design.md`. Next: user
askari's redesign (spec/plan `docs/superpowers/{specs,plans}/2026-06-19-mesh-hardening-askari-redesign*`) review → implementation plan.
applied INPUT-only default-deny + `wt0`-primary SSH + a permanent WAN break-glass + a geo-disabled
coordinator; a real reboot recovered unattended. Remaining mesh-hardening sub-projects:
1. ~~`ubongo` nftables default-deny + `ssh-from-control`~~ → **DONE (2026-06-19).**
2. ~~**redesign** `askari`'s SSH → `wt0`~~ → **DONE (2026-06-20)** — boot-race, coordinator-bootstrap
chicken-egg, and Docker-nat-flush all resolved + live reboot-validated.
3. ~~**askari relay-SPOF reduction**~~ → **DONE (2026-06-20)** — assessed + **accepted** as a
documented availability risk (R8 + ADR-016 availability amendment): the blast radius is
narrow (LAN/intra-cluster/local traffic never touch askari), so no P2P / second relay /
second coordinator was warranted. Hardened the one real gap — a managed-host coordinator-FQDN
DNS pin (`base__mesh_coordinator_pin`). The coordinator off-site backup gap is handed to ADR-022.
4. **NetBird ACL off Allow-All** to scoped policies (open mechanism question — no headless API path).
5. **ADR-022 backup kickoff** — off-site backup of the `netbird_coordinator` store (named in R8 /
BACKUP.md) as the first slice of the backup role (restic + the `fisi` pull node).
**Then** the Procurement gate (`/capacity-review` → buy Proxmox hardware) opens Phase 2.

View file

@ -17,7 +17,6 @@
calls, curl pulls of web products, log reviews. Headless browsing → ADR-017 calls, curl pulls of web products, log reviews. Headless browsing → ADR-017
(`/verify-service`); the API/curl/log-review siblings remain open. (`/verify-service`); the API/curl/log-review siblings remain open.
3. ~~Standard for test users + manual-test instructions.~~ → ADR-017. 3. ~~Standard for test users + manual-test instructions.~~ → ADR-017.
4. ~~Local VM integration testing on ubongo.~~ → ADR-025 / `make test-integration` (built + RED→GREEN validated 2026-06-18).
3. **Building services** 3. **Building services**
1. ~~Decide how to manage logs.~~ → ADR-018. 1. ~~Decide how to manage logs.~~ → ADR-018.
@ -85,13 +84,6 @@
5. ~~Always subagent-driven?~~ → DECIDED: yes (standing agreement; enforced by `.claude/hooks/guard-execution-mode-menu.sh`). 5. ~~Always subagent-driven?~~ → DECIDED: yes (standing agreement; enforced by `.claude/hooks/guard-execution-mode-menu.sh`).
6. When AI deploys, i.e. runs playbooks etc., should we make a methodology so that it does not have to poll all the time or review all the output. Perhaps something about the MAKE method could provide only the relevant feedback? 6. When AI deploys, i.e. runs playbooks etc., should we make a methodology so that it does not have to poll all the time or review all the output. Perhaps something about the MAKE method could provide only the relevant feedback?
7. ~~Reproducible agent toolchain.~~ → `.claude/settings.json` + `docs/runbooks/claude-code-setup.md`. 7. ~~Reproducible agent toolchain.~~ → `.claude/settings.json` + `docs/runbooks/claude-code-setup.md`.
8. **Screenshot hand-off to the agent.** Give the operator a smooth way to hand the
agent a screenshot (e.g. of a Hetzner/VNC console during an incident) — the agent
can already read image files; the gap is the hand-off. During the 2026-06-17
incident the only diagnostic channel was console screenshots, copied manually to
`/tmp` and `find`-located. Options: a known drop path the agent checks (e.g.
`~/screenshots/`), a small `screenshot`/paste helper or slash-command, or a
clipboard→file convention. Cheap, high-value for incident work.
11. **Kaizen loop** — `/kaizen` built (STATUS). 11. **Kaizen loop** — `/kaizen` built (STATUS).
1. ~~Build the loop command.~~ → `/kaizen` (`scripts/friction-scan.py` + `.claude/commands/kaizen.md`; spec `docs/superpowers/specs/2026-06-14-kaizen-command-design.md`).
@ -128,7 +120,6 @@
6. Supply-chain hygiene: enforce tiered image pinning (stateful `tag@digest`; 6. Supply-chain hygiene: enforce tiered image pinning (stateful `tag@digest`;
stateless rolling tags — ADR-011) + official/verified images via the service stateless rolling tags — ADR-011) + official/verified images via the service
checklist; revisit active scanning (Trivy/Grype) once a triage stack exists (R1). checklist; revisit active scanning (Trivy/Grype) once a triage stack exists (R1).
7. Is our network setup as it should be? I am not sure if all traffic between ubongo and notes goes via askari? what if askari breaks - will the rest work?
16. **ADR-011 (update management) — resolve open questions + accept.** Committed as 16. **ADR-011 (update management) — resolve open questions + accept.** Committed as
**Proposed**; resolve before marking Accepted: **Proposed**; resolve before marking Accepted:

View file

@ -154,7 +154,6 @@ Level 2 (staging) or Level 3 (external). This is a conscious, documented decisio
| Capability | Reason not testable in Molecule | | Capability | Reason not testable in Molecule |
|---|---| |---|---|
| `nftables` rule loading | Requires `nf_tables` kernel module; not available in Docker | | `nftables` rule loading | Requires `nf_tables` kernel module; not available in Docker |
| **Reboot-survivability / host-firewall × Docker interaction / boot-ordering** | **Requires a real kernel reboot — the class that caused the 2026-06-17 mesh-hardening incident. Now covered by local VM integration testing (ADR-025).** |
| NetBird mesh data plane (`wt0` WireGuard interface) | Requires the `wireguard` kernel module; Molecule checks only that the agent is installed/configured (ADR-016) | | NetBird mesh data plane (`wt0` WireGuard interface) | Requires the `wireguard` kernel module; Molecule checks only that the agent is installed/configured (ADR-016) |
| `unattended-upgrades` behaviour | Installs correctly; actual upgrade behaviour requires a real apt environment | | `unattended-upgrades` behaviour | Installs correctly; actual upgrade behaviour requires a real apt environment |
| DHCP behaviour (OPNsense) | OPNsense is managed by Ansible but not testable in a container | | DHCP behaviour (OPNsense) | OPNsense is managed by Ansible but not testable in a container |
@ -166,11 +165,6 @@ For the above, Molecule tests only what it can: that the relevant packages are
installed, that configuration files render correctly, and that services are enabled. installed, that configuration files render correctly, and that services are enabled.
Behavioural correctness is confirmed on staging. Behavioural correctness is confirmed on staging.
**ADR-025 is the concrete build of Level 2/3** — local VM integration testing on
ubongo (libvirt/KVM, throwaway overlay VMs, stdlib-only driver). It specifically
targets the reboot-survivability / host-firewall × Docker / boot-ordering class that
Molecule structurally cannot reach. See `docs/decisions/025-local-vm-integration-testing.md`.
--- ---
### CI pipeline ### CI pipeline

View file

@ -2,10 +2,7 @@
## Status ## Status
Accepted (2026-06-05). **Amended 2026-06-18:** the `claude` AI-worker account now has Accepted (2026-06-05)
`NOPASSWD:ALL` sudo on `ubongo` — reversing the original "no local sudo" sub-decision.
The amendment is recorded in §Access & security below; rationale and accepted risk are
in ADR-021 and `docs/security/accepted-risks.md` (R7).
## Context ## Context
@ -46,12 +43,8 @@ points at this physical box. This *strengthens* the ADR-009 control-node excepti
it is genuinely outside Terraform's world, not a VM pretending to be the exception. it is genuinely outside Terraform's world, not a VM pretending to be the exception.
Every other host stays a Terraform-managed VM exactly as designed. Every other host stays a Terraform-managed VM exactly as designed.
`ubongo` runs **plain Debian 13** (the `base` role applies). It is not a production `ubongo` runs **plain Debian 13** (the `base` role applies). It is not a hypervisor
hypervisor and runs no `docker_host` services. It does run **ephemeral KVM test VMs** and runs no `docker_host` services.
as part of its local-test-runner role (ADR-025 — local VM integration testing): one
throwaway VM at a time (~3 GiB RAM), against ~13 GiB free of the 16 GiB sized here.
This is not a production workload — it is the concrete implementation of ADR-008 Level
2/3, and the resource guard enforces one-at-a-time to stay within the RAM ceiling.
### Hardware target ### Hardware target
@ -91,38 +84,12 @@ Manual, on bare metal:
only** — key-only, with password auth and root login disabled — until the NetBird mesh only** — key-only, with password auth and root login disabled — until the NetBird mesh
(ADR-016) is stood up. (ADR-016) is stood up.
- **AI-worker identity:** `ubongo` runs the AI worker under a dedicated, - **AI-worker identity:** `ubongo` runs the AI worker under a dedicated,
password-locked `claude` user (in the `docker` and `libvirt` groups; **`NOPASSWD:ALL` password-locked `claude` user (in the `docker` group for Molecule; **no local sudo**
sudo** via a repo-managed drop-in — see amendment below). It is reached via `sudo -iu boma deploys reach the fleet over SSH as the `ansible` user, not via local root). It is
claude` or its own SSH key. The rationale is **attribution + revocation, not reached via `sudo -iu claude` or its own SSH key. The rationale is **attribution +
containment**: auditd/Loki (ADR-018) can separate human from agent actions, and the revocation, not containment**: auditd/Loki (ADR-018) can separate human from agent
account/key can be revoked without touching the operator's access. (ADR-021 left the actions, and the account/key can be revoked without touching the operator's access.
on-`ubongo` agent identity unspecified; this records it.) (ADR-021 left the on-`ubongo` agent identity unspecified; this records it.)
**Amendment (2026-06-18) — `claude` now has `NOPASSWD:ALL` sudo.**
> **Superseded by [ADR-025](025-local-vm-integration-testing.md)** (per ADR-023 §4): the
> "no local sudo" sub-decision is reversed. The shakedown that necessitated it is ADR-025;
> the resulting two-account access model is ADR-021; the accepted risk is R7.
During the
integration-testing harness shakedown, the original "no local sudo" sub-decision was
reversed. No-sudo blocked the AI-worker from diagnosing a failed VM: `virsh`,
`virt-install`, `cloud-localds`, `journalctl`, `nft` — nearly all low-level
diagnostic commands — require root. The AI-worker must autonomously spin up,
inspect, and tear down test VMs without operator hand-holding; that is the harness's
core value proposition. Compensating controls make the risk acceptable:
1. `claude`'s password is **locked** (no interactive login, no `su claude` without the
operator's own credentials) — `NOPASSWD` sudo is the *only* sudo path.
2. `auditd` + Loki attribution (ADR-018) separates human from agent root actions.
3. The drop-in is **repo-managed** via `base__ai_worker_user` — revocable in one commit
and one deploy.
4. Single-operator homelab: everything in git, off-machine backups (ADR-022).
The operator (`sjat`) uses **password-required sudo** via the `sudo` group; their
former `NOPASSWD` drop-in was removed 2026-06-18 as redundant once `claude` had sudo
(least-privilege cleanup). The accepted risk is registered as R7 in
`docs/security/accepted-risks.md`. ADR-021 records the resulting sudo model for both
accounts.
- **Disk encryption:** `ubongo`'s SSD is **not encrypted at rest** — the SanDisk X600 is - **Disk encryption:** `ubongo`'s SSD is **not encrypted at rest** — the SanDisk X600 is
TCG-Opal-capable but Opal is unused. This is an accepted risk recorded in TCG-Opal-capable but Opal is unused. This is an accepted risk recorded in
`docs/security/accepted-risks.md` (control-node disk not encrypted at rest), `docs/security/accepted-risks.md` (control-node disk not encrypted at rest),

View file

@ -85,9 +85,8 @@ allocated for it.
- **Bootstrap order:** stand up the coordinator on `askari` → enroll `ubongo` - **Bootstrap order:** stand up the coordinator on `askari` → enroll `ubongo`
→ `base` enrolls the fleet. → `base` enrolls the fleet.
- **Coordinator survival:** off-site on `askari` ⇒ mesh survives a homelab outage. - **Coordinator survival:** off-site on `askari` ⇒ mesh survives a homelab outage.
NetBird's management datastore is **intended** to be backed up encrypted off `askari` NetBird's management datastore is backed up encrypted off `askari` (synced to
(synced to `ubongo`/`mamba`; not yet built — see the Availability amendment / R8); peers `ubongo`/`mamba`); peers keep last-known config through a brief coordinator outage.
keep last-known config through a brief coordinator outage.
- **`askari` is Ansible-managed:** its own inventory group `offsite_hosts` — provisioned - **`askari` is Ansible-managed:** its own inventory group `offsite_hosts` — provisioned
as **Terraform IaC** (`hetznercloud/hcloud`), managed independently of the Proxmox as **Terraform IaC** (`hetznercloud/hcloud`), managed independently of the Proxmox
cluster (its own provider + local state). Ansible configuration: `base` role, plus a cluster (its own provider + local state). Ansible configuration: `base` role, plus a
@ -117,7 +116,7 @@ allocated for it.
address as a mesh-independent secondary path, so a mesh/coordinator outage never address as a mesh-independent secondary path, so a mesh/coordinator outage never
blocks on-LAN SSH and Ansible stays off the mesh (Security; Recovery & operations). blocks on-LAN SSH and Ansible stays off the mesh (Security; Recovery & operations).
- The mesh survives a homelab outage because the coordinator is off-site on `askari`, - The mesh survives a homelab outage because the coordinator is off-site on `askari`,
with its management datastore **intended** to be backed up encrypted off `askari` (not yet built — see the Availability amendment / R8) and peers keeping with its management datastore backed up encrypted off `askari` and peers keeping
last-known config through a brief coordinator outage (Recovery & operations). last-known config through a brief coordinator outage (Recovery & operations).
- Choosing NetBird over plain OPNsense WireGuard, Tailscale, Tailscale+Headscale, an - Choosing NetBird over plain OPNsense WireGuard, Tailscale, Tailscale+Headscale, an
on-cluster coordinator, a `ubongo` subnet router, and a standalone IdP gains on-cluster coordinator, a `ubongo` subnet router, and a standalone IdP gains
@ -126,38 +125,6 @@ allocated for it.
- Implementation is pending: the role tasks land only once the unbuilt `base` role and - Implementation is pending: the role tasks land only once the unbuilt `base` role and
service-role machinery exist (Status). service-role machinery exist (Status).
## Availability — an `askari` outage (amendment 2026-06-20)
The coordinator is deliberately **single** (one off-site host). Recorded here so its
availability envelope is explicit; accepted as **R8** (`docs/security/accepted-risks.md`).
The mesh is **not** a default gateway — `wt0` routes only the overlay CIDR (`100.99.0.0/16`);
normal traffic uses the host's default route. So an `askari` outage has a **narrow blast
radius**:
| Traffic | `askari` down |
|---|---|
| LAN device → LAN service (direct / via reverse proxy) | unaffected |
| node ↔ node over LAN IPs (cluster) | unaffected |
| node ↔ node same-LAN over mesh IPs | unaffected (direct P2P) |
| **road-warrior → `ubongo` (remote, relayed)** | **breaks** |
| mesh control plane (new enrol / ACL change / re-handshake) | pauses |
Only remote (off-LAN) mesh access to peers is lost, and only when off-LAN **and** `askari`
is down simultaneously. On-LAN access to `ubongo` never depends on the mesh (Recovery &
operations, above).
**Recovery:** rebuild the coordinator (`/setup` + re-enrol peers, M5) or restore from backup
once ADR-022 lands; the `netbird_coordinator` store backup is the **next sub-project** (its
gap is named in R8 and `BACKUP.md`). Client/road-warrior break-glass (reliable resolvers +
the coordinator-FQDN `/etc/hosts` pin) is in `docs/runbooks/netbird-client.md`; managed mesh
hosts get the same pin via `base__mesh_coordinator_pin`.
**Not pursued** (deliberately, given the narrow blast radius): direct P2P (punctures the
default-deny posture; only helps established sessions), a second relay (needs another public
host / reintroduces the home public surface), a second coordinator (unsupported by
self-hosted NetBird; against this ADR).
## Related ## Related
ADR-007 (network — amended), ADR-015 (control host), ADR-002 (security), ADR-007 (network — amended), ADR-015 (control host), ADR-002 (security),

View file

@ -3,9 +3,7 @@
## Status ## Status
Accepted (2026-06-09). Resolves TODO 7.2 (what to set up on hosts given direct access Accepted (2026-06-09). Resolves TODO 7.2 (what to set up on hosts given direct access
will be rare) and TODO 3.2 (the service admin-API access question). **Amended will be rare) and TODO 3.2 (the service admin-API access question).
2026-06-18:** the on-`ubongo` sudo model for the two local accounts is now settled
(see §Sudo model on `ubongo` below).
**Doctrine ADR.** It pins the operational-access doctrine, the declarative `access__*` **Doctrine ADR.** It pins the operational-access doctrine, the declarative `access__*`
data model, the rendered `ACCESS.md` record, and the `/check-access` verifier. It does data model, the rendered `ACCESS.md` record, and the `/check-access` verifier. It does
@ -165,36 +163,6 @@ exists and `/check-access` is green (or a deviation is recorded in `accepted-ris
No scaffold change — same manual-copy-plus-review pattern the sibling records No scaffold change — same manual-copy-plus-review pattern the sibling records
(`SECURITY.md`/`VERIFY.md`) use. (`SECURITY.md`/`VERIFY.md`) use.
### Sudo model on `ubongo` (amendment 2026-06-18)
The original ADR left on-`ubongo` local sudo unspecified. The integration-testing
harness shakedown settled it:
| Account | Role | Sudo |
|---|---|---|
| `claude` | Automated AI-worker | `NOPASSWD:ALL` via repo-managed drop-in (`base__ai_worker_user`) |
| `sjat` | Human operator | Password-required sudo via the `sudo` group |
**Rationale for `claude NOPASSWD`.** No-sudo blocked the AI-worker from diagnosing a
failed test VM: `virsh`, `virt-install`, `cloud-localds`, `nft`, `journalctl` —
almost every low-level diagnostic tool — require root. The harness's core value is
autonomous spin-up → apply → reboot → assert → diagnose; that loop collapses without
local root access.
**Compensating controls (R7 in `docs/security/accepted-risks.md`):**
- `claude`'s password is locked — `NOPASSWD` is the account's *only* sudo path; no
interactive login is possible.
- `auditd` + Loki attribution (ADR-018) separates human from agent root actions in the
audit trail.
- The drop-in is repo-managed and revocable in one commit + one deploy.
- Single-operator homelab; everything in git; off-machine backups (ADR-022).
**`sjat` NOPASSWD removed.** The operator's former `NOPASSWD` drop-in
(`/etc/sudoers.d/sjat-ansible`, added as an interim measure during M5 NetBird
enrolment) was removed 2026-06-18. It was redundant once `claude` held sudo, and its
removal restores least-privilege for the human operator. `sjat` retains full sudo
capability via the `sudo` group (password required).
## Consequences ## Consequences
- Every host and service has at least one documented, verifiable way in — and a verifier - Every host and service has at least one documented, verifiable way in — and a verifier

View file

@ -1,180 +0,0 @@
# ADR-025 — Local VM integration testing on ubongo
## Status
Accepted (2026-06-18). Implements ADR-008 Level 2/3 (deferred for lack of hosts; now
viable on ubongo). **RED→GREEN acceptance PASSED on real hardware (2026-06-18):** a
throwaway KVM VM on ubongo reproduced the 2026-06-17 incident (base's nftables forward
default-deny kills Docker forwarding on reboot) — RED — and survived the reboot once
the `docker_host` container-forward drop-in was applied — GREEN. Two shakedown
learnings added below.
## Context
Molecule (ADR-008 Level 1) tests each role in a single Docker container: one
`converge`, no real kernel netfilter, no real Docker daemon in the loop, and **no
reboot**. That structurally cannot catch an entire class of bug — reboot-survivability,
host-firewall × Docker interaction, and boot-ordering — which is exactly the class
that caused the **2026-06-17 mesh-hardening incident**.
During that incident, `base`'s nftables `forward { policy drop; }` killed the askari
Docker host **on reboot**: nftables loaded its default-deny before Docker, breaking
published-port DNAT and inter-container forwarding. Public services and the mesh went
down. It had worked right after `make deploy`, when Docker's runtime rules still
coexisted. `ip_nonlocal_bind` also failed to beat the sshd boot-race, leaving the mesh
listener absent at boot. Recovery required the Hetzner console and a WAN-SSH
break-glass. Molecule had passed.
ADR-008's Level 2/3 was deferred "for lack of hosts." ubongo breaks that deferral:
> verified: ubongo KVM capability · Bash (2026-06-18 session) · `/dev/kvm` present +
> accessible (kvm group), Intel VT-x (`vmx`) enabled, 8 vCPU (i3-10100T), ~13 GiB RAM
> free of 16, ~198 GiB disk free; libvirt/QEMU/Vagrant **not yet installed** ·
> 2026-06-18.
## Decision
### 1. Virtualisation approach: libvirt/KVM directly (Approach A)
A golden Debian-13 genericcloud qcow2 is cached locally on ubongo. Each run boots an
ephemeral qcow2 **overlay** backed by it (the golden image is never mutated), seeded
via cloud-init NoCloud, driven by a **stdlib-only** Python driver
(`scripts/integration-vm.py`) over `virsh` / `virt-install` / `cloud-localds`. No
`libvirt-python` dependency — the driver stays portable and the role stays lean.
### 2. Fidelity envelope
The bugs are **post-boot**, not in the provisioning path. A lightweight local hypervisor
is sufficient: real OS, real kernel netfilter, real Docker daemon, real published-port
DNAT, a **real reboot**, and the coordinator running inside the VM (so the VM forms its
own one-node mesh, reproducing the circular bootstrap). The Proxmox provisioning chrome
is not mirrored.
### 3. Scope: one throwaway VM at a time, instantiated from real inventory
The first profile is **"be askari"** — a single box running Docker host + NetBird
coordinator + mesh peer, mirroring the host whose incident motivates this work. The
mechanism is generic: swap the profile to "be" any inventory host. Multi-VM topologies
are a deferred extension.
### 4. Acceptance: self-validating against the real failure
The harness is accepted when it can, on a local VM:
1. Apply `base` (firewall on, no `docker_host` container-forward drop-in) to a Docker
host, reboot, and observe the **2026-06-17 breakage** (Docker forwarding dead,
services down). If step 1 passes, the harness is not faithful.
2. Apply the `docker_host` container-forward fix, re-run, and **survive the reboot**.
### 5. Tiered cert fidelity via a `--certs` knob
DNS-01 is what makes real certs possible without public inbound (validation is
out-of-band via a Gandi TXT record; the VM needs only outbound to ACME + Gandi, which
the isolated NAT network provides):
| Tier | Description | Default? |
|---|---|---|
| `internal` | Caddy `tls internal` — zero deps, instant. For incident repro and runs where certs are not under test. | Yes |
| `le-staging` | Real DNS-01 ACME against Let's Encrypt **staging** — real caddy-gandi path, real cert files/renewal, untrusted root, effectively no rate limits. | Built in v1; use when testing the ACME/cert path. |
| `le-prod-wildcard` | A real trusted `*.test.wingu.me` wildcard, **issued once, persisted on ubongo, reused** across runs. | On-demand only. Accepted risk recorded as R6 in `docs/security/accepted-risks.md`. |
A deliberate "no-egress" failure scenario (reproducing FRICTION 2026-06-17 #4 —
`netbird-server` FATAL-loops on GeoLite2 download when egress is lost) forces
`internal`, since ACME requires egress.
### 6. The toolchain is Ansible-managed
A new non-service role (`integration_test`, `control` group) installs and enables
libvirt + QEMU + virtinst reproducibly. The driver manages the golden image lazily on
first run (keeping the role lean; no fiddly download/refresh logic in Ansible). The
repo owns ubongo's state.
### 7. Stubs live in an overlay file, never in the real inventory
Transient inventory entries for the test VM are generated at runtime as a single-host
file. Stubs (cert tier, in-VM coordinator endpoint, VM connection details) live in
`tests/integration/overrides/<host>.yml` — an explicit, reviewable overlay. The real
inventory is never touched, so `make tf-inventory` and "don't edit inventory directly"
stay intact.
## Consequences
- **Reconciles ADR-015:** ubongo runs ephemeral KVM test VMs as part of its
local-test-runner role — it is still not a production hypervisor. A default VM
(~2 vCPU / 3 GiB / 20 GiB thin overlay) against ~13 GiB free is comfortable; the
driver enforces **one integration VM at a time** (resource guard, name-prefix
`boma-it-*`) and refuses to start below a free-RAM threshold.
- **Operationalises the standing rule:** "firewall/sshd/boot changes must be tested on
a real VM with a real reboot before they touch a live host" (FRICTION 2026-06-17 #6)
becomes a concrete, runnable step documented in `docs/runbooks/integration-testing.md`.
- **Accepted risk R6:** `le-prod-wildcard` runs pass the production Gandi PAT
(`vault.gandi.pat`) to an ephemeral local VM and write transient `_acme-challenge`
TXT records into the real `wingu.me` zone. Scope: on-demand only; `le-staging` is the
default. Compensating controls: ephemeral VM, isolated NAT network, TXT records
auto-removed by Caddy after validation.
- **Three safety invariants** make the test tool itself safe:
1. The transient inventory contains only the test VM — no real host is ever in scope.
2. "Be askari" points NetBird at the in-VM coordinator — the VM forms its own one-node
mesh; it never enrols in the real mesh.
3. Test VMs sit on an isolated libvirt NAT network — outbound NAT for ACME/image pulls
only, not reachable to the LAN (`10.20.x`) or the real mesh.
- **Diagnostics on failure** (catching a bug is the point): failure keeps the VM and
dumps `nft list ruleset`, `docker ps`, `ss -tlnp`, `journalctl -b`,
`systemd-analyze critical-chain`. `make test-integration-clean` reaps all `boma-it-*`
orphans. Diagnostics land in gitignored `~/integration-runs/<ts>-<host>/`.
- **Future pinch:** concurrency with the Level-4 Chromium/Playwright stack (ADR-017)
competes for ubongo RAM. The resource guard is the v1 answer — one integration VM at a
time; don't run alongside a heavy Level-4 session. Revisit at `/capacity-review`.
## Scope
**In scope:** reboot-survivability, host-firewall × Docker interaction, boot-ordering,
cert/ACME paths, mesh bootstrap on one box.
**Out of scope (v1):** multi-VM mini-cluster (inter-host mesh dataplane); CI gate
(this is an interactive, agent-driven pre-deploy check; CI stays lint + Molecule per
ADR-008/010); the Proxmox provisioning path (the bugs live in the boot/kernel/Docker
layer, not provisioning).
## What was ruled out
| Option | Reason |
|---|---|
| **Proxmox VE nested on ubongo** | Highest fidelity including the provisioning step, but heavy (nested virt, RAM), in tension with ADR-015, and the incident bugs do not live in provisioning. |
| **Vagrant + vagrant-libvirt** | Mature lifecycle/snapshots, but adds the Ruby/Vagrant ecosystem + a fragile plugin; boxes drift from the real Debian cloud image; the reboot→assert sequence still needs custom logic. |
| **terraform-provider-libvirt** | Declarative and reuses TF, but poor at the imperative apply→reboot→re-apply test sequence; adds throwaway state; blurs ADR-006's "TF owns *production* VM existence on Proxmox" boundary. |
## Verified facts (ADR-014)
- verified: ubongo KVM capability · Bash · `/dev/kvm` present + accessible (kvm group),
Intel VT-x (`vmx`) enabled, 8 vCPU (i3-10100T), ~13 GiB RAM free of 16, ~198 GiB
disk free · 2026-06-18.
## Shakedown learnings (2026-06-18 live run)
Two findings from the RED→GREEN acceptance run that affect anyone operating the harness:
1. **Boot firmware: UEFI required.** The Debian 13 genericcloud image triple-faults
under legacy BIOS/SeaBIOS and does not reach the kernel. Boot the VM with UEFI
(`virt-install --boot uefi`; `ovmf` package). The driver does this by default; note
it here so the requirement is findable.
2. **`claude` sudo is load-bearing.** VM management (`virsh`, `virt-install`,
`cloud-localds`) and offline diagnostics (`nft list ruleset`, `journalctl -b`,
`systemd-analyze critical-chain`) all require root. The harness assumes the
AI-worker has `NOPASSWD:ALL` sudo on `ubongo` — settled as the ADR-015 amendment
(2026-06-18) and registered as R7 in `docs/security/accepted-risks.md`. A `claude`
account without sudo will block the harness at the first `virsh` call.
The nine full shakedown findings (including the UEFI boot-loop) are in
`docs/FRICTION.md`.
## Related
- ADR-006 — Terraform owns production VM existence (boundary this ADR respects).
- ADR-008 — Testing methodology (Levels 1–4); this ADR is the concrete build of Level 2/3.
- ADR-015 — Control host (ubongo); this ADR reconciles "not a hypervisor" with ephemeral test VMs. **Supersedes** ADR-015's "no local sudo" sub-decision for the AI-worker — the shakedown necessitated `claude` NOPASSWD sudo (ADR-023 §4; access model in ADR-021, risk R7).
- ADR-016 — Mesh VPN; the "be askari" profile includes the coordinator role.
- ADR-020 — Firewall strategy; firewall × Docker interaction is what this harness tests.
- ADR-021 — Operational access; sudo model for `claude` and `sjat` on `ubongo`.
- ADR-024 — Reverse proxy (Caddy); cert tiers exercise the DNS-01 ACME path.

View file

@ -25,7 +25,7 @@
- **Storage:** 256 GB SanDisk X600 SATA 2.5" SSD (model SD9TB8W256G1001; TCG Opal-capable, Opal unused — no disk encryption) - **Storage:** 256 GB SanDisk X600 SATA 2.5" SSD (model SD9TB8W256G1001; TCG Opal-capable, Opal unused — no disk encryption)
- **NICs:** wired GbE, interface eno1, MAC 88:a4:c2:e0:ee:da - **NICs:** wired GbE, interface eno1, MAC 88:a4:c2:e0:ee:da
- **BIOS:** Lenovo M2WKT5AA (2023-06-20) - **BIOS:** Lenovo M2WKT5AA (2023-06-20)
- **Notes:** always-on; control plane + AI-worker (dedicated `claude` user) + local test runner (Molecule/Docker) per ADR-015; not a Proxmox guest; remote access currently LAN SSH only (mesh deferred). Also runs **one ephemeral KVM integration test VM** (~3 GiB RAM) at a time per ADR-025 — the resource guard enforces one-at-a-time; do not run a test-integration cycle alongside a heavy Level-4 browser session (Chromium/Playwright). - **Notes:** always-on; control plane + AI-worker (dedicated `claude` user) + local test runner (Molecule/Docker) per ADR-015; not a Proxmox guest; remote access currently LAN SSH only (mesh deferred)
### fisi (backup node — outside the cluster; provisional) ### fisi (backup node — outside the cluster; provisional)
- **Model / form factor:** HP Elite 600 G9 (tower) - **Model / form factor:** HP Elite 600 G9 (tower)

View file

@ -50,13 +50,6 @@ Don't install these until their trigger lands — then add them here and to
- **The venv-activate hook** — this repo expects the Python `.venv` active for Bash - **The venv-activate hook** — this repo expects the Python `.venv` active for Bash
commands. If you use the user-level `~/.claude/hooks/activate-venv.sh` pattern, commands. If you use the user-level `~/.claude/hooks/activate-venv.sh` pattern,
replicate it; otherwise `source .venv/bin/activate` per session after `make setup`. replicate it; otherwise `source .venv/bin/activate` per session after `make setup`.
- **Forgejo registry login (for image pushes)** — `make caddy-image-push` /
`molecule-image-push` need the Docker daemon authenticated to
`forgejo.nyumbani.baobab.band`. Run **`make registry-login`** once per machine: it reads
`vault.forgejo.registry_token` from the vault and does `docker login --password-stdin`
(no interactive prompt, so an agent can complete a push). The token is operator-minted
(Forgejo → Settings → Applications → Generate Token, package read+write) and set via
`make edit-vault`; until then `registry-login` prints how to obtain it. (2026-06-17 kaizen.)
## 4. A note on user-level settings ## 4. A note on user-level settings

View file

@ -1,229 +0,0 @@
# Runbook — Local VM integration testing
## When to use this
Run a local VM integration test before deploying any change that touches:
- **nftables / firewall rules** (the `firewall` concern of `base`)
- **sshd configuration** (listener address, port, key types, `base` hardening)
- **boot ordering or kernel parameters** (systemd units, sysctl)
- **Docker host networking** (`docker_host` DNAT rules, published-port forwarding, `daemon.json`)
These are the change classes that Molecule (ADR-008 Level 1) cannot catch: they require
a real kernel reboot to surface. This harness is the concrete tool for ADR-008 Level 2/3
(see ADR-025) and directly operationalises two standing rules:
- **"Test risky infra before live deploy"** (standing rule, ubongo memory) — firewall/sshd/boot changes must be tested on a real VM with a real reboot before touching a live host.
- **FRICTION 2026-06-17 #6 — validate reboot-recovery before retiring the break-glass** — the lesson crystallised from the mesh-hardening incident: confirm the host recovers from reboot *while you still have the break-glass open*, not after.
You do not need this runbook for pure-config changes (template rendering, package lists, user management) — Molecule covers those.
---
## First-deploy (one-time setup)
The `integration_test` role installs libvirt + QEMU + virtinst on ubongo and adds the
operator accounts (`sjat`, `claude`) to the `libvirt` and `kvm` groups.
```bash
make deploy PLAYBOOK=site LIMIT=ubongo TAGS=integration_test
```
**Re-login after this run** — group membership changes do not take effect in the current
session. The driver (`scripts/integration-vm.py`) requires both `libvirt` and `kvm`
group membership to create and manage VMs.
The golden Debian-13 genericcloud qcow2 image is downloaded lazily on the first run
(one-time cost, ~500 MB); subsequent runs reuse the cached image.
---
## Running a cycle
### Makefile interface (recommended)
```bash
# Full cycle (provision → apply → reboot → assert → teardown on pass)
make test-integration HOST=askari
# With a specific cert tier
make test-integration HOST=askari CERTS=le-staging
# Keep the VM alive after the run (for manual inspection)
make test-integration HOST=askari KEEP=1
# Destroy all orphan integration VMs (name-prefix boma-it-*)
make test-integration-clean
```
`HOST` is a hostname from the production inventory (the profile
`tests/integration/profiles/<host>.json` must exist — see Adding a new profile below).
`CERTS` defaults to `internal`.
### Lower-level driver
The driver (`scripts/integration-vm.py`) exposes individual lifecycle steps for manual
or scripted use:
| Sub-command | What it does |
|---|---|
| `up` | Ensure golden image → create ephemeral overlay → cloud-init seed → boot |
| `apply` | Run the site playbook against the transient inventory (real apply) |
| `reboot` | `virsh reboot` + wait for a verified reboot (boot-id change) — the step Molecule cannot do |
| `assert` | Run `tests/integration/verify.yml` (outcome assertions) |
| `cycle` | `up` → `apply` → `reboot` → `assert` → `down` (default: destroy on pass) |
| `down` | Destroy the VM + overlay |
| `prune` | Destroy all `boma-it-*` VMs + overlays (orphan cleanup) |
| `console` | Print the VM's captured serial-console log |
```bash
# Example: step through manually
python3 scripts/integration-vm.py up --host askari
python3 scripts/integration-vm.py apply --host askari
python3 scripts/integration-vm.py reboot --host askari
python3 scripts/integration-vm.py assert --host askari
python3 scripts/integration-vm.py down --host askari
```
---
## Cert tiers
| Tier | Flag | Use when |
|---|---|---|
| `internal` | `CERTS=internal` (default) | Incident repro, firewall/sshd/boot changes where certs are not under test. Zero deps, instant. |
| `le-staging` | `CERTS=le-staging` | Testing the Caddy DNS-01 ACME path, cert renewal logic, or the `caddy-gandi` plugin. Real cert files, untrusted root, effectively no rate limits. Requires `vault.gandi.pat`. |
| `le-prod-wildcard` | `CERTS=le-prod-wildcard` | Verifying TLS behaviour with a real trusted cert. On-demand only — accepted risk R6 (`docs/security/accepted-risks.md`): the production Gandi PAT reaches an ephemeral VM and transient TXT records are written into the real `wingu.me` zone. |
> A deliberate "no-egress" scenario (reproducing FRICTION 2026-06-17 #4 — the
> `netbird-server` GeoLite2 FATAL-loop when NAT masquerade is wiped) **must** use
> `CERTS=internal`: the egress loss is the fault being simulated, and ACME requires egress.
---
## Diagnostics and inspecting a failed VM
### Where diagnostics land
Diagnostics from every run are captured in:
```
~/integration-runs/<timestamp>-<host>/
```
This directory is gitignored. On a failed assert step, the driver dumps:
- `nft list ruleset` — the live nftables state at failure
- `docker ps -a` — container states
- `ss -tlnp` — listening sockets
- `journalctl -b` — full boot log
- `systemd-analyze critical-chain` — boot timing
- Serial console capture (on boot/SSH failure — the automated equivalent of the Hetzner
console, addressing FRICTION 2026-06-17 #5)
The agent reads these directly from `~/integration-runs/` — no manual download needed.
### Inspecting a kept or failed VM
When a run fails or when `KEEP=1` is passed, the VM is left running. Connect to it:
```bash
# Serial console (no SSH needed — useful when SSH is the fault)
python3 scripts/integration-vm.py console --host askari
# or directly:
virsh console boma-it-askari
# Exit with Ctrl-]
# SSH (as the ansible user, IP from virsh)
virsh domifaddr boma-it-askari --source lease
ssh ansible@<IP>
# List all integration VMs
virsh list --all | grep boma-it-
```
### Cleanup
```bash
# Destroy a specific VM
python3 scripts/integration-vm.py down --host askari
# Reap all orphans
make test-integration-clean
# or:
python3 scripts/integration-vm.py prune
```
---
## Safety invariants
These make the test tool itself safe — the harness cannot reach or modify production:
1. **Single-host transient inventory** — the playbook apply runs against a generated
single-host inventory (`ansible_host=<VM lease IP>`). No real host is ever in scope.
2. **In-VM coordinator only** — "be askari" points NetBird at the coordinator running
inside the VM itself (localhost endpoint). The VM forms its own one-node mesh; it
never enrols in the real NetBird mesh.
3. **Isolated NAT network** — test VMs sit on a dedicated libvirt NAT network.
Outbound NAT provides ACME/image-pull access, but the VM is not reachable from
the LAN (`10.20.x`) or the real mesh.
---
## Resource constraints
The default VM profile is ~2 vCPU / 3 GiB RAM / 20 GiB thin-provisioned overlay. The
driver enforces **one integration VM at a time** (refusing to start if another
`boma-it-*` VM is already running) and refuses to start below the free-RAM threshold
(~13 GiB available on ubongo at baseline, per ADR-025).
**Do not run a test-integration cycle alongside a Level-4 browser session**
(Chromium/Playwright, ADR-017) — both compete for ubongo RAM. The resource guard is the
enforcement mechanism, not a suggestion.
---
## Adding a new profile
To make the harness "be" a different host:
1. Create `tests/integration/profiles/<hostname>.json` — specifies which roles to apply
and base VM sizing for that host.
2. Create `tests/integration/overrides/<hostname>.yml` — the explicit stub overlay:
cert tier, in-VM coordinator endpoint (if the host runs the coordinator),
`ansible_host` placeholder, and any other variables that must differ from the real
inventory (e.g. public DNS → local resolution, geo-DB disable for coordinator).
3. Add assertions to `tests/integration/verify.yml` (or extend an existing task with a
`when: inventory_hostname == '<hostname>'` guard) for any host-specific outcomes.
4. Run `make test-integration HOST=<hostname>` to validate the new profile.
All stubs must be explicit in the overlay — the real inventory is never edited.
---
## Reproducing the 2026-06-17 incident
The acceptance test for the harness (ADR-025) deliberately reproduces the incident:
1. Run with today's `base` (firewall on, no `docker_host` container-forward drop-in):
```bash
make test-integration HOST=askari CERTS=internal
```
The assert step **must FAIL** after reboot (Docker forwarding dead, published ports
unreachable). If it passes, the harness is not faithful.
2. Implement the `docker_host` container-forward rules (FRICTION 2026-06-17 #1 fix) and
re-run. The assert step **must PASS** across the reboot.
This round-trip proves: (a) the harness faithfully reproduces the incident, and (b) the
fix survives a real reboot.
---
## Related
- ADR-025 — decision record for this harness (approach, cert tiers, safety invariants)
- ADR-008 — testing methodology; this is Level 2/3
- `docs/security/accepted-risks.md` R6 — `le-prod-wildcard` accepted risk
- `docs/FRICTION.md` — 2026-06-17 signals that motivated this runbook

View file

@ -1,144 +0,0 @@
# Runbook — Enrolling a NetBird client (road-warrior device)
Joins a **client/road-warrior device** (laptop, desktop, phone) to the boma NetBird mesh
so it can reach `ubongo` and other peers from anywhere. The self-hosted coordinator is on
`askari` (ADR-016, M4b); enrollment lands a device on the `100.64.0.0/10` overlay.
> **Hosts vs clients.** Managed **Linux hosts** join via the `base` role's `mesh` concern
> (`base__mesh_enabled: true` + the reusable key in `vault.netbird.setup_key`) — see
> ADR-016 / the `base` README, *not* this runbook. This runbook is for **user devices**
> NetBird doesn't manage with Ansible.
verified: NetBird client install + self-hosted `--management-url` flow · docs.netbird.io
(`/get-started/install/windows`, `/get-started/cli`) · 2026-06-17
## Prerequisites
- The coordinator's first-boot `/setup` admin exists and you can log in at
`https://netbird.askari.wingu.me`.
- **Auth, pick one:**
- **SSO** (recommended for a personal device) — your dashboard account; no secret to copy.
- **Setup key** — dashboard → **Settings → Setup Keys** → a reusable key (mint a
client-specific one for clean ACL grouping, or reuse the existing reusable key).
- Local **admin rights** on the device (the client installs a service).
- **Coordinator facts:** management URL `https://netbird.askari.wingu.me`; `ubongo`
= `100.99.146.14` (`ubongo.netbird.selfhosted`); `askari` = `100.99.226.39`.
---
## Part A — Windows 11
1. **Install:** download + run the MSI **https://pkgs.netbird.io/windows/msi/x64**
(official x64 client; installs the tray app + the `netbird` service).
2. **Connect** from an **elevated** Windows Terminal / PowerShell ("Run as administrator"):
```powershell
netbird up --management-url https://netbird.askari.wingu.me
```
A browser opens — sign in with your dashboard account. (SSO won't open a browser?
use a key: `netbird up --setup-key <KEY> --management-url https://netbird.askari.wingu.me`.)
3. Proceed to **Part C** (verify).
---
## Part B — Other platforms (same management URL)
- **macOS / Linux desktop:** install the client (macOS: NetBird app / Homebrew; Linux:
`pkgs.netbird.io` per the distro — same apt/rpm flow as `base`'s `mesh` concern), then
`netbird up --management-url https://netbird.askari.wingu.me` (Linux: prefix `sudo`).
- **Android / iOS:** install the **NetBird** app, then in **Settings → Advanced /
Server** set the management server to `https://netbird.askari.wingu.me` **before**
logging in; connect and complete the SSO login. (Setup keys are supported in-app too.)
---
## Part C — Verify + use
```sh
netbird status # expect: Management: Connected, Signal: Connected, a 100.x NetBird IP
netbird status -d # peer detail — ubongo (100.99.146.14) + askari (100.99.226.39) listed
```
Reach `ubongo` over the mesh:
```sh
ssh sjat@100.99.146.14 # or: ssh sjat@ubongo.netbird.selfhosted
```
**SSH auth is separate from the mesh:** `ubongo` is key-only (passwords disabled), so the
device needs an SSH key authorised for `sjat@ubongo`. The mesh provides the network path;
the SSH key provides auth.
---
## Troubleshooting — mesh drops / SSH to `ubongo` times out
Symptom: SSH to `ubongo` (or any peer) times out for minutes and recovers on its own;
`netbird status` shows **Management/Signal: Disconnected** or peers stuck **Connecting**.
verified: client DNS/relay behaviour + NRPT scope read from a 0.72.4 debug bundle;
mitigations per docs.netbird.io (`/manage/dns/troubleshooting`,
`/help/troubleshooting-client`) · 2026-06-18
**1. Triage — is it your device or the coordinator?** On the device:
```sh
netbird status -d # Management/Signal Connected? peers P2P/Relayed?
nslookup netbird.askari.wingu.me # coordinator FQDN
nslookup pkgs.netbird.io # a PUBLIC name — control test
```
If the relay/handshake errors say `lookup netbird.askari.wingu.me: no such host` **and**
a *public* name (`pkgs.netbird.io`) also fails to resolve, your **local resolver is
dead** — the coordinator and `ubongo` are almost certainly fine. NetBird only manages
`*.netbird.selfhosted` resolution (a single NRPT rule), so it is **not** the cause.
Confirm from the other side if you can: the dashboard shows peer *last-seen*; `askari`/
`ubongo` staying green ⇒ the fault is your device's network.
**Why it cascades:** NetBird re-resolves the coordinator FQDN on every reconnect. A
network transition (Wi-Fi ↔ phone hotspot, sleep/wake) that briefly kills DNS means it
can't reach management/signal/relay — and since `ubongo` is **relay-only** (below), there
is no direct path to fall back to, so SSH dies until DNS recovers.
**2. Make the device resilient:**
- **Reliable resolvers** — set the device's DNS to public resolvers (`1.1.1.1`, `8.8.8.8`)
rather than a network-handed or homelab-internal resolver that's unreachable off-LAN.
Windows: inspect with `Get-DnsClientServerAddress`.
- **Pin the coordinator** so a DNS hiccup can't strand the client — add to the hosts file
(`C:\Windows\System32\drivers\etc\hosts` as admin, or `/etc/hosts`):
```
77.42.120.136 netbird.askari.wingu.me
```
`askari`'s stable WAN IP; TLS still validates on the hostname. Removes the multi-minute
reconnect deadlocks.
**3. Break-glass — reach `ubongo` without the mesh.** When the mesh is down you still need
a way in. On the home LAN, go straight to `ubongo`'s wired address (bypasses the mesh and
coordinator DNS entirely):
```sh
ssh sjat@10.20.10.151 # ubongo eno1 (LAN) — verify this works from your device NOW
```
> ⚠️ This works **today** only because `ubongo`'s host-firewall default-deny is not yet
> applied. When the deferred mesh-hardening lands (SSH only on `wt0`), this path closes
> unless a break-glass SSH rule is added to the firewall catalog. That hardening **must**
> keep a non-mesh break-glass (catalog SSH rule from a trusted LAN/admin source) — else a
> DNS/mesh outage = full lockout. (ADR-021 break-glass.)
**Why `ubongo` is relay-only (and P2P is not the fix).** Peers connect to `ubongo` as
`Relayed`, never `P2P`: its `nftables` default-deny drops the inbound UDP that ICE
hole-punching needs (egress is open, so STUN itself succeeds). This is the **intended
current posture** — P2P / NAT-traversal is the *deferred mesh-hardening* (ADR-016/020,
STATUS.md). Enabling it needs a firewall-catalog UDP entry **plus** an `accepted-risks.md`
deviation or ADR amendment, and OPNsense NAT work — and it would **not** have prevented a
DNS-driven outage (a re-handshake still needs signal, which needs DNS). Tracked as future
hardening, not a quick fix.
---
## Notes
- **Split-tunnel:** NetBird routes only the `100.x` overlay by default — normal/work
networking is unaffected.
- **Persistence:** the service auto-starts on boot and reconnects; the tray app has
Connect/Disconnect; CLI `netbird down` / `netbird up` (no flags after first setup).
- **Troubleshooting** — *"failed while getting Management Service public key"* / won't
register: confirm `https://netbird.askari.wingu.me` loads in a browser from the device
(DNS + TLS + the gRPC routing through Caddy are reachable), the URL is exact, and the
terminal is elevated. For peers stuck Disconnected/Connecting or SSH-to-`ubongo`
timeouts that recover on their own, see **Troubleshooting — mesh drops** above.
- **Removing a device:** `netbird down` then uninstall; revoke its peer in the dashboard
(and the setup key if one-off).

View file

@ -109,13 +109,6 @@ make check PLAYBOOK=site
# Should report no changes # Should report no changes
``` ```
> **Pre-flight before lockout-risky changes (firewall / sshd / boot):** before applying
> any change that touches nftables rules, SSH configuration, or boot ordering, run
> `make test-integration HOST=<name>` and confirm reboot-recovery on the local VM
> **while the break-glass (Proxmox console / Hetzner console) is still open**. Do not
> retire the break-glass until the integration test passes. See
> `docs/runbooks/integration-testing.md` and ADR-025.
--- ---
## Part E — Control node (`ubongo`, manual exception) ## Part E — Control node (`ubongo`, manual exception)

View file

@ -114,20 +114,7 @@ reason and gets no `BACKUP.md`. Once the backup node exists, `/check-backup <rol
proves the declared state is captured — part of the service-clearance gate proves the declared state is captured — part of the service-clearance gate
(`docs/security/service-checklist.md`). (`docs/security/service-checklist.md`).
### 13. Pre-flight for lockout-risky roles ### 13. Commit
If the new role touches nftables rules, SSH configuration, or boot ordering, run a
local VM integration test and confirm reboot-recovery **before** deploying to a live
host and while the host's break-glass (Proxmox console / Hetzner console) is still
open:
```bash
make test-integration HOST=<target-host>
```
See `docs/runbooks/integration-testing.md` and ADR-025.
### 14. Commit
```bash ```bash
git checkout -b role/<rolename> git checkout -b role/<rolename>

View file

@ -18,11 +18,8 @@ revisit (trigger).
| R3 | **Self-hosted mesh control plane is a public target on `askari`** — the NetBird coordinator (ADR-016) exposes a management API + dashboard (TCP 80/443) and STUN (UDP 3478) on `askari`'s public IP; the management API controls the whole mesh (NetBird v0.72.4 embeds STUN in the combined server — no separate Coturn) | Self-hosting means **no third-party trust** and an off-site control plane that survives a homelab outage (boma's sovereignty ethos). Residual surface is on `askari` (already a public VPS) and is mitigated: TLS + embedded-IdP login, source-IP restriction where practical, `base` hardening, version-pinned NetBird (ADR-011) patched on boma's cadence | A coordinator compromise or unpatched NetBird CVE; the management plane is reachable without auth/IP-limits; the operational burden makes a hosted coordinator worth reconsidering | | R3 | **Self-hosted mesh control plane is a public target on `askari`** — the NetBird coordinator (ADR-016) exposes a management API + dashboard (TCP 80/443) and STUN (UDP 3478) on `askari`'s public IP; the management API controls the whole mesh (NetBird v0.72.4 embeds STUN in the combined server — no separate Coturn) | Self-hosting means **no third-party trust** and an off-site control plane that survives a homelab outage (boma's sovereignty ethos). Residual surface is on `askari` (already a public VPS) and is mitigated: TLS + embedded-IdP login, source-IP restriction where practical, `base` hardening, version-pinned NetBird (ADR-011) patched on boma's cadence | A coordinator compromise or unpatched NetBird CVE; the management plane is reachable without auth/IP-limits; the operational burden makes a hosted coordinator worth reconsidering |
| R4 | **No cryptographic WORM for logs** — shipped logs are append-only via Loki's push API and copied off-site to `askari` (ADR-018), but the stored chunks are not object-locked/immutable; a root-on-`askari` attacker could edit history | Append-only push + off-site copy already defeats the realistic threat (a host attacker covering tracks survives even full-cluster compromise). True WORM (object-lock) is forensic-grade cost for boma's opportunistic threat model (R1) | Threat model shifts toward targeted/forensic; a regulatory/evidentiary need appears; `askari` itself is assessed as a likely target | | R4 | **No cryptographic WORM for logs** — shipped logs are append-only via Loki's push API and copied off-site to `askari` (ADR-018), but the stored chunks are not object-locked/immutable; a root-on-`askari` attacker could edit history | Append-only push + off-site copy already defeats the realistic threat (a host attacker covering tracks survives even full-cluster compromise). True WORM (object-lock) is forensic-grade cost for boma's opportunistic threat model (R1) | Threat model shifts toward targeted/forensic; a regulatory/evidentiary need appears; `askari` itself is assessed as a likely target |
| R5 | **No disk encryption on `ubongo`** — the control node's SSD (SanDisk X600 256 GB, TCG-Opal-capable but Opal unused) is unencrypted at rest, so it holds recovery-critical secrets in plaintext: the Ansible Vault password's `rbw` local cache and (future) Terraform state. Physical theft of the box would expose them | `ubongo` is always-on in a physically controlled location; compensating controls are a **BIOS supervisor password** and **disabled external/USB + PXE boot** (an attacker cannot trivially boot another OS to read the disk), and the offline-recoverable design means the irreducible root secret (Vaultwarden master password) is never stored on the box anyway. Full-disk encryption was weighed against the always-on/unattended-reboot requirement (LUKS+TPM auto-unlock or passphrase) and deferred for simplicity at this trust level | `ubongo` is relocated to a less-trusted physical location; the box starts holding additional high-value secrets; or a reinstall onto LUKS (TPM-sealed) is undertaken | | R5 | **No disk encryption on `ubongo`** — the control node's SSD (SanDisk X600 256 GB, TCG-Opal-capable but Opal unused) is unencrypted at rest, so it holds recovery-critical secrets in plaintext: the Ansible Vault password's `rbw` local cache and (future) Terraform state. Physical theft of the box would expose them | `ubongo` is always-on in a physically controlled location; compensating controls are a **BIOS supervisor password** and **disabled external/USB + PXE boot** (an attacker cannot trivially boot another OS to read the disk), and the offline-recoverable design means the irreducible root secret (Vaultwarden master password) is never stored on the box anyway. 
Full-disk encryption was weighed against the always-on/unattended-reboot requirement (LUKS+TPM auto-unlock or passphrase) and deferred for simplicity at this trust level | `ubongo` is relocated to a less-trusted physical location; the box starts holding additional high-value secrets; or a reinstall onto LUKS (TPM-sealed) is undertaken |
| R6 | **`le-prod-wildcard` integration runs** — when `CERTS=le-prod-wildcard` is passed to `make test-integration`, the production Gandi PAT (`vault.gandi.pat`) is passed to an ephemeral local test VM via the var overlay, and transient `_acme-challenge` TXT records are written into the real `wingu.me` DNS zone to satisfy the Let's Encrypt DNS-01 challenge. A compromised or long-lived test VM could exfiltrate the PAT; the real zone is briefly (seconds) modified | Scope is **on-demand only** — `le-staging` is the default cert tier (`CERTS=internal` for incident repro); `le-prod-wildcard` is an explicit opt-in. Compensating controls: the VM is ephemeral and destroyed on success; it sits on an isolated libvirt NAT network (no LAN/mesh access); TXT records are auto-removed by Caddy immediately after validation; the PAT is not persisted inside the VM after the run. ADR-025 documents the cert-tier design and the three isolation invariants | The PAT is exfiltrated from a test VM; the `wingu.me` zone shows unexpected records; a `CERTS=le-prod-wildcard` run must be audited or the tier must be revoked |
| R7 | **`claude` AI-worker has `NOPASSWD:ALL` sudo on `ubongo`** — the automated AI-worker account can execute any command as root on the control node without a password prompt. A compromised or misbehaving agent session could make arbitrary root-level changes to ubongo | The account is **password-locked** (no interactive `claude` login; `NOPASSWD` sudo is the account's only escalation path, so there is no "su to claude + sudo" attack). `auditd` + Loki attribution (ADR-018) logs every `sudo` invocation with the originating user. The drop-in (`/etc/sudoers.d/claude-ai-worker`) is repo-managed via `base__ai_worker_user` — revocable in one commit + one deploy. Single-operator homelab; all changes in git; off-machine backups (ADR-022). Full rationale: ADR-015 amendment (2026-06-18) + ADR-021 §Sudo model. | The AI-worker executes a destructive action that cannot be rolled back via git; the account key is compromised; the threat model shifts toward targeted remote attackers |
| R8 | **Single off-site mesh coordinator is an availability SPOF for remote mesh access** — `askari` hosts the only NetBird management/signal/relay (ADR-016); while askari is down, every *relayed* peer (all of `ubongo`'s, by the deliberate default-deny posture) loses remote mesh reachability and the control plane pauses. The `netbird_coordinator` store also has **no off-site backup yet** (BACKUP.md), so an askari loss loses mesh control-plane state until rebuilt | Inherent to ADR-016's deliberate single off-site coordinator (sovereignty; survives a homelab outage). **Narrow blast radius:** the mesh is not a gateway (`wt0` routes only `100.99.0.0/16`) — LAN, intra-cluster, and local-service traffic are unaffected; only remote/off-LAN mesh access breaks, and only when off-LAN *and* askari is down at once. askari is a reliable always-on VPS; mitigations: client + managed-host coordinator-FQDN DNS pin (`base__mesh_coordinator_pin`; runbook), documented `/setup` rebuild | askari proves unreliable; the cluster grows to depend on the mesh for intra-node traffic; remote mesh access becomes business-critical; or the ADR-022 backup role lands (closes the state-loss half) |
_Last reviewed: 2026-06-20. The prior gaps (full CIS hardening, SELinux/AppArmor, _Last reviewed: 2026-06-11. The prior gaps (full CIS hardening, SELinux/AppArmor,
IDS) were re-challenged and **adopted rather than accepted**: CIS Debian L1+L2 + CIS IDS) were re-challenged and **adopted rather than accepted**: CIS Debian L1+L2 + CIS
Docker, AppArmor (enforce), AIDE file-integrity, and Suricata network IDS are now Docker, AppArmor (enforce), AIDE file-integrity, and Suricata network IDS are now
part of the security strategy (ADR-002). See STATUS.md / `docs/TODO.md` for build part of the security strategy (ADR-002). See STATUS.md / `docs/TODO.md` for build

View file

@ -1,466 +0,0 @@
# Mesh-hardening 1/3 — askari SSH onto wt0 — Implementation Plan
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
**Goal:** Make askari's SSH reachable only over the NetBird mesh (`wt0`) and close the WAN `:22` surface at both the host nftables layer and the Hetzner Cloud Firewall, without dropping askari's public services.
**Architecture:** Three enforcement layers — (1) sshd `ListenAddress` bound to the live `wt0` IP (fail-closed, `ip_nonlocal_bind` to beat the post-boot bind race); (2) the base role's catalog-driven nftables default-deny (SSH already restricted to `wt0` via `base__firewall_mgmt_interface`; add a `public` zone + askari service entries so 80/443/3478 survive); (3) Terraform drops the Hetzner Cloud Firewall WAN `:22` rule. Tasks 1–4 are code (subagent-driven, each Molecule/lint/plan-verified). Task 5 is the live, operator-supervised cutover on the real host.
**Tech Stack:** Ansible (role `base`, FQCN), nftables, Molecule on Debian 13, `ansible.posix.sysctl`, pytest (filter unit tests), Terraform (`hcloud` provider).
**Spec:** `docs/superpowers/specs/2026-06-17-mesh-hardening-askari-ssh-wt0-design.md`
**Conventions:** `make lint` and `make test ROLE=base` before each commit; `make check` before `make deploy`; `make tf-plan` before `make tf-apply`; never hand-edit the generated `offsite.yml`; rbw unlocked for commits touching ansible content.
---
### Task 1: base role — sshd `ListenAddress` on wt0 + `ip_nonlocal_bind` (fail-closed)
**Files:**
- Modify: `roles/base/defaults/main.yml`
- Modify: `roles/base/tasks/ssh.yml`
- Modify: `roles/base/templates/sshd_hardening.conf.j2`
- Modify: `roles/base/molecule/default/converge.yml` (fixture)
- Modify: `roles/base/molecule/default/verify.yml` (assertions = the test)
- [ ] **Step 1: Write the failing test (extend Molecule verify)**
In `roles/base/molecule/default/verify.yml`, add these tasks after the existing "Sshd drop-in present and config valid" block:
```yaml
- name: ListenAddress bound to the fixture mesh IP (mesh-only mode)
ansible.builtin.command: grep -q '^ListenAddress 100.99.0.1$' /etc/ssh/sshd_config.d/10-boma.conf
changed_when: false
- name: ip_nonlocal_bind sysctl drop-in is present
ansible.builtin.command: grep -q '^net.ipv4.ip_nonlocal_bind = 1' /etc/sysctl.d/60-boma-nonlocal-bind.conf
changed_when: false
- name: ip_nonlocal_bind is live in this netns
ansible.builtin.command: sysctl -n net.ipv4.ip_nonlocal_bind
register: _nonlocal
changed_when: false
failed_when: _nonlocal.stdout | trim != '1'
```
- [ ] **Step 2: Add the fixture that drives it (Molecule converge)**
In `roles/base/molecule/default/converge.yml`, add to the `vars:` block (alongside the existing `base__mesh_*`):
```yaml
base__ssh_listen_mesh_only: true
base__ssh_listen_addr: "100.99.0.1" # fixture mesh IP (no wt0 in the container)
```
- [ ] **Step 3: Run the test to verify it fails**
Run: `make test ROLE=base`
Expected: FAIL — converge errors or verify fails (`ListenAddress` not rendered; sysctl drop-in absent), because the feature isn't implemented yet.
- [ ] **Step 4: Add the defaults**
In `roles/base/defaults/main.yml`, after the `base__ssh_authorised_keys: []` line (end of the hardening block), add:
```yaml
# SSH listen-on-mesh (mesh-hardening 1/3, ADR-016/021). Opt-in: when true, sshd binds
# ListenAddress to this host's mesh IP only (not the WAN). The IP comes from the live wt0
# fact (ansible_facts.wt0.ipv4.address); base__ssh_listen_addr overrides it. ip_nonlocal_bind
# lets sshd bind the mesh IP before wt0 exists at boot. Fails closed: the play asserts a
# non-empty address rather than silently listening on all interfaces.
base__ssh_listen_mesh_only: false
base__ssh_listen_addr: ""
```
- [ ] **Step 5: Resolve + assert + sysctl in `ssh.yml`**
In `roles/base/tasks/ssh.yml`, insert these tasks at the TOP of the file (before "Ensure openssh-server is installed"):
```yaml
- name: Resolve the sshd mesh listen address (override, else live wt0 fact)
ansible.builtin.set_fact:
base__ssh_listen_addr_resolved: >-
{{ base__ssh_listen_addr
or ansible_facts.get('wt0', {}).get('ipv4', {}).get('address', '') }}
when: base__ssh_listen_mesh_only | bool
- name: Fail closed — refuse to render sshd without a known mesh address
ansible.builtin.assert:
that:
- base__ssh_listen_addr_resolved | length > 0
fail_msg: >-
base__ssh_listen_mesh_only is true but no mesh address resolved (set
base__ssh_listen_addr or ensure wt0 is up so its fact is gathered). Refusing to
render sshd ListenAddress empty (which would listen on ALL interfaces).
when: base__ssh_listen_mesh_only | bool
- name: Allow sshd to bind the mesh IP before wt0 exists at boot
ansible.posix.sysctl:
name: net.ipv4.ip_nonlocal_bind
value: "1"
sysctl_set: true
state: present
reload: true
sysctl_file: /etc/sysctl.d/60-boma-nonlocal-bind.conf
when: base__ssh_listen_mesh_only | bool
```
- [ ] **Step 6: Render the conditional `ListenAddress`**
In `roles/base/templates/sshd_hardening.conf.j2`, append after the existing `KbdInteractiveAuthentication no` line:
```jinja
{% if base__ssh_listen_mesh_only | bool %}
ListenAddress {{ base__ssh_listen_addr_resolved }}
{% endif %}
```
- [ ] **Step 7: Run the test to verify it passes**
Run: `make test ROLE=base`
Expected: PASS — converge succeeds; verify confirms `ListenAddress 100.99.0.1`, the sysctl drop-in, and the live value `1`.
> **Checkpoint (environmental):** if `make test` fails on the sysctl task because the Molecule container can't write `net.ipv4.ip_nonlocal_bind`, add `sysctls: {net.ipv4.ip_nonlocal_bind: "0"}` to the platform in `roles/base/molecule/default/molecule.yml` (pre-creates the namespaced sysctl so the task can set it), then re-run. Note the change in the commit.
- [ ] **Step 8: Lint**
Run: `make lint`
Expected: `Passed: 0 failure(s)` and `check-tags: OK`.
- [ ] **Step 9: Commit**
```bash
git add roles/base/defaults/main.yml roles/base/tasks/ssh.yml \
roles/base/templates/sshd_hardening.conf.j2 \
roles/base/molecule/default/converge.yml roles/base/molecule/default/verify.yml
git commit -m "feat(base): opt-in sshd ListenAddress on the mesh IP (fail-closed)
base__ssh_listen_mesh_only binds sshd to the live wt0 IP only, with
ip_nonlocal_bind to beat the post-boot bind race and a fail-closed assert so an
unresolved address never silently listens on all interfaces. Molecule covers
the render + sysctl. Mesh-hardening 1/3 (ADR-016/021).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
```
---
### Task 2: firewall catalog — `public` zone + askari's public services
**Files:**
- Modify: `inventories/production/group_vars/all/firewall.yml`
- Modify: `roles/base/molecule/default/converge.yml` (fixture: public-zone rule)
- Modify: `roles/base/molecule/default/verify.yml` (assert the 0.0.0.0/0 rule)
- Test: `tests/test_firewall_rules.py` (unit: a `public` zone resolves to `0.0.0.0/0`)
Rationale: `base__firewall_mgmt_interface` already accepts `:22` on `wt0`. The gap is that the catalog is empty and has no "anywhere" source, so applying default-deny to askari would drop 80/443/3478. We add a `public` zone (`0.0.0.0/0`) and askari's service ingress.
- [ ] **Step 1: Write the failing unit test**
In `tests/test_firewall_rules.py`, add:
```python
def test_public_zone_resolves_to_anywhere():
catalog = {"web": {"host": "askari",
"ingress": [{"from": "public", "port": 443, "proto": "tcp"}]}}
zones = {"public": "0.0.0.0/0"}
rules = rs.resolve_firewall_rules(catalog, zones, "askari",
{"askari": {"ansible_host": "100.99.226.39"}}, {})
assert rules == [{"proto": "tcp", "port": 443, "sources": ["0.0.0.0/0"]}]
```
(Module is loaded by the existing importlib shim at the top of the test file as `rs`. If the filter is imported under a different alias there, match it.)
- [ ] **Step 2: Run it to verify it fails (or passes trivially)**
Run: `.venv/bin/python -m pytest tests/test_firewall_rules.py -q`
Expected: this test PASSES immediately if the filter already resolves arbitrary zones (it does — `_resolve_source` treats any `zones` key generically). That is fine: the unit test documents/locks the `public`-zone contract. If it fails, fix the filter. Either way it must end green.
- [ ] **Step 3: Add the Molecule fixture (public-zone rule)**
In `roles/base/molecule/default/converge.yml`, under `firewall_zones:` add `public: 0.0.0.0/0`, and under `firewall_catalog:` add:
```yaml
netbird_stun:
host: instance
ingress:
- { from: public, port: 3478, proto: udp }
```
- [ ] **Step 4: Add the Molecule assertion (the test)**
In `roles/base/molecule/default/verify.yml`, after the photoprism assertion block, add:
```yaml
- name: Assert the public->stun:3478/udp ingress rule (0.0.0.0/0 source)
ansible.builtin.assert:
that:
- "'0.0.0.0/0' in nft"
- "'udp dport 3478 accept' in nft"
fail_msg: "missing public->3478/udp rule for netbird_stun"
```
- [ ] **Step 5: Run the tests**
Run: `make test ROLE=base` then `.venv/bin/python -m pytest tests/test_firewall_rules.py -q`
Expected: both PASS (the rendered ruleset now contains the `0.0.0.0/0 ... udp dport 3478 accept` rule).
- [ ] **Step 6: Populate the real catalog**
In `inventories/production/group_vars/all/firewall.yml`, replace the `firewall_zones`/`firewall_catalog` blocks with:
```yaml
# Zone → subnet (from ADR-007). `public` = the WAN (anywhere) for deliberately public
# off-site services (askari); home/cluster services use the internal zones only.
firewall_zones:
mgmt: 10.10.0.0/24
srv: 10.20.0.0/24
lan: 10.30.0.0/24
iot: 10.40.0.0/24
guest: 10.50.0.0/24
public: 0.0.0.0/0
# Service catalog: <name> → placement (host | group | hosts) + ingress[].
# askari's public surface (ADR-024 Caddy + ADR-016 NetBird STUN). NOTE: the host
# nftables template renders IPv4 source rules only; askari is reached via its A record
# (no AAAA), so IPv4-only public rules are sufficient (see the spec's IPv6 note).
firewall_catalog:
reverse_proxy:
host: askari
ingress:
- { from: public, port: 80, proto: tcp }
- { from: public, port: 443, proto: tcp }
netbird_stun:
host: askari
ingress:
- { from: public, port: 3478, proto: udp }
```
- [ ] **Step 7: Lint**
Run: `make lint`
Expected: clean pass (`check-tags: OK`).
- [ ] **Step 8: Commit**
```bash
git add inventories/production/group_vars/all/firewall.yml \
roles/base/molecule/default/converge.yml roles/base/molecule/default/verify.yml \
tests/test_firewall_rules.py
git commit -m "feat(firewall): public zone + askari's public services in the catalog
Adds a public (0.0.0.0/0) zone and askari's Caddy (80/443) + NetBird STUN
(3478/udp) ingress so the base nftables default-deny does not drop the live
public services when applied to askari. Molecule + filter unit test cover the
public-zone rendering. Mesh-hardening 1/3 (ADR-020/024/016).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
```
---
### Task 3: inventory — point Ansible at wt0 + enable mesh-only SSH on askari
**Files:**
- Create: `inventories/production/host_vars/askari.yml`
- Modify: `inventories/production/group_vars/offsite_hosts/vars.yml`
- [ ] **Step 1: Create the host_var override**
Create `inventories/production/host_vars/askari.yml`:
```yaml
---
# Manage askari over the NetBird mesh (wt0), not its WAN IP. This OVERRIDES the
# TF-generated inventories/production/offsite.yml (ansible_host = 77.42.120.136); host_vars
# outrank the generated inventory and are NOT touched by `make tf-inventory-offsite`.
# Mesh-hardening 1/3 — once SSH is wt0-only, the WAN IP is no longer reachable for SSH.
ansible_host: 100.99.226.39 # askari's wt0 address (NetBird, M5)
```
- [ ] **Step 2: Enable mesh-only SSH for offsite hosts**
In `inventories/production/group_vars/offsite_hosts/vars.yml`, replace the file body with:
```yaml
---
# Off-site hosts (askari). askari runs the NetBird coordinator AND is a mesh peer
# (ADR-016, M5). Mesh-hardening 1/3 (2026-06-17): SSH is moved onto wt0 — sshd binds the
# mesh IP only (base__ssh_listen_mesh_only) and the base nftables default-deny applies
# (base__firewall_apply defaults true; SSH allowed on wt0 via base__firewall_mgmt_interface,
# public services via the catalog). base__mesh_enabled stays true (precondition from M5).
base__mesh_enabled: true
base__ssh_listen_mesh_only: true
```
- [ ] **Step 3: Verify the override resolves**
Run: `.venv/bin/ansible-inventory -i inventories/production/ --host askari 2>/dev/null | grep ansible_host`
Expected: `"ansible_host": "100.99.226.39"` (the host_var wins over the generated `offsite.yml`).
- [ ] **Step 4: Lint**
Run: `make lint`
Expected: clean pass.
- [ ] **Step 5: Commit**
```bash
git add inventories/production/host_vars/askari.yml \
inventories/production/group_vars/offsite_hosts/vars.yml
git commit -m "feat(inventory): manage askari over wt0 + enable mesh-only SSH
host_vars/askari.yml points ansible_host at the wt0 IP (overriding the generated
offsite.yml); offsite_hosts sets base__ssh_listen_mesh_only. Mesh-hardening 1/3.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
```
---
### Task 4: Terraform — retire the Hetzner WAN `:22` rule
**Files:**
- Modify: `terraform/modules/hetzner_vm/main.tf`
- Modify: `terraform/modules/hetzner_vm/variables.tf`
- Modify: `terraform/environments/offsite/main.tf`
This task makes the SSH rule conditional and sets askari's admin CIDRs to empty (mesh-only). The live `tf-plan`/`tf-apply` happens in Task 5 — here we only change + format/validate the code.
- [ ] **Step 1: Gate the SSH rule on a non-empty CIDR list**
In `terraform/modules/hetzner_vm/main.tf`, replace the static SSH `rule { ... }` block (the one with `port = "22"`) with a dynamic block:
```hcl
# SSH from the control node only — and only when admin CIDRs are set. An empty
# ssh_admin_cidrs removes the WAN :22 rule entirely (mesh-only SSH; reach the host over
# wt0, break-glass = Hetzner console). Mesh-hardening 1/3.
dynamic "rule" {
for_each = length(var.ssh_admin_cidrs) > 0 ? [1] : []
content {
direction = "in"
protocol = "tcp"
port = "22"
source_ips = var.ssh_admin_cidrs
}
}
```
- [ ] **Step 2: Default the variable to empty**
In `terraform/modules/hetzner_vm/variables.tf`, change the `ssh_admin_cidrs` variable to default to an empty list:
```hcl
variable "ssh_admin_cidrs" {
description = "Source CIDRs allowed to reach SSH over the WAN. Empty = no WAN SSH rule (mesh-only)."
type = list(string)
default = []
}
```
- [ ] **Step 3: Set askari to mesh-only SSH**
In `terraform/environments/offsite/main.tf`, change the `ssh_admin_cidrs` argument in the `module "askari"` block to:
```hcl
ssh_admin_cidrs = [] # mesh-only: SSH is reached over wt0; WAN :22 retired (mesh-hardening 1/3)
```
- [ ] **Step 4: Format + validate**
Run: `cd terraform/environments/offsite && terraform fmt -recursive ../.. && terraform validate && cd -`
Expected: `fmt` lists any reformatted files (re-add them); `validate` prints `Success! The configuration is valid.` (offsite is already `init`ed — it has live state.)
- [ ] **Step 5: Commit**
```bash
git add terraform/modules/hetzner_vm/main.tf terraform/modules/hetzner_vm/variables.tf \
terraform/environments/offsite/main.tf
git commit -m "feat(tf/offsite): retire askari's WAN :22 (mesh-only SSH)
The Hetzner Cloud Firewall SSH rule is now conditional on a non-empty
ssh_admin_cidrs (default []); askari sets it empty so the WAN :22 rule is
removed on the next apply. SSH is reached over wt0; break-glass is the Hetzner
console. Apply is the live cutover (Task 5). Mesh-hardening 1/3.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
```
---
### Task 5: Live staged cutover (operator-supervised — NOT a subagent task)
> This task touches the real askari over the network and is lockout-risky. Run it
> interactively with the operator, in order, verifying each step before the next. The
> firewall's auto-rollback timer + `wait_for_connection` over wt0 is the safety net; the
> Hetzner web console is the ultimate break-glass. Do NOT hand this to an unattended agent.
- [ ] **Step 1: Pre-check the mesh SSH path (before any change)**
Run: `.venv/bin/ansible askari -i inventories/production/ -m ping`
Expected: `SUCCESS` — confirms Ansible reaches askari over `wt0` (Tasks 1–3 are merged, so `ansible_host` is now `100.99.226.39`). If this fails, STOP — the mesh path must work before closing the WAN.
- [ ] **Step 2: Dry-run the base apply (firewall + sshd)**
Run: `make check PLAYBOOK=site LIMIT=askari TAGS=firewall,hardening`
Expected: shows the nftables ruleset diff (default-deny + wt0 SSH + public 80/443/3478) and the sshd drop-in diff (`ListenAddress 100.99.226.39`); no errors. Review that the public service rules are present (so they won't be dropped).
- [ ] **Step 3: Apply the host firewall + sshd (auto-rollback armed)**
Run: `make deploy PLAYBOOK=site LIMIT=askari TAGS=firewall,hardening`
Expected: the firewall concern arms the rollback timer, applies, resets the connection, and `wait_for_connection` succeeds over wt0; sshd reloads with the mesh ListenAddress. If connectivity is lost, the timer auto-reverts the ruleset within `base__firewall_rollback_timeout` (45 s).
- [ ] **Step 4: Verify services + WAN SSH still open at the cloud edge**
```bash
curl -sSf -o /dev/null -w '%{http_code}\n' https://test.askari.wingu.me # expect 200
curl -sSf -o /dev/null -w '%{http_code}\n' https://netbird.askari.wingu.me # expect 200
```
Expected: both `200` (valid certs); the host firewall did not drop the public services. (WAN `:22` is now dropped by the host nftables, but the Hetzner FW still allows it until Step 5 — that's fine.)
- [ ] **Step 5: Retire the Hetzner WAN `:22` — plan, review, apply**
Run: `make tf-plan TF_ENV=offsite`
Expected: the plan shows the SSH firewall rule being **destroyed** (and nothing else of substance). Review it.
Then: `make tf-apply TF_ENV=offsite`
Expected: apply succeeds; the WAN `:22` rule is gone.
- [ ] **Step 6: Verify the end-state (out-of-band)**
From an OFF-MESH host (e.g. the operator's laptop with NetBird disconnected, or a quick check from askari's perspective):
```bash
nc -vz -w5 77.42.120.136 22 # expect: refused / timeout (WAN SSH closed)
nc -vz -w5 77.42.120.136 443 # expect: open (public service intact)
```
And from ubongo over the mesh: `.venv/bin/ansible askari -i inventories/production/ -m ping` → `SUCCESS`.
- [ ] **Step 7: Reboot resilience check (optional but recommended)**
Reboot askari from the Hetzner console; after it comes back, confirm `ansible askari -m ping` succeeds over wt0 without intervention (proves `ip_nonlocal_bind` beat the post-boot bind race).
- [ ] **Step 8: Update STATUS + ROADMAP**
- In `STATUS.md`, update the askari row: SSH is now wt0-only; the host nftables default-deny is applied; the Hetzner WAN `:22` is retired. Move "host firewall + moving askari's SSH onto wt0" out of *Pending*.
- In `docs/ROADMAP.md`, mark mesh-hardening sub-project 1 (askari SSH→wt0) done; next is sub-project 2 (ubongo default-deny).
```bash
git add STATUS.md docs/ROADMAP.md
git commit -m "docs: askari SSH moved onto wt0 (mesh-hardening 1/3 done)
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
```
- [ ] **Step 9: Push**
Run: `git push origin main`
---
## Self-review (against the spec)
- **§ three layers** → Task 1 (sshd ListenAddress), Task 2 (nftables catalog; SSH-on-wt0 pre-existing via `base__firewall_mgmt_interface`), Task 4 (Hetzner WAN :22). ✓
- **§ boot-race fix** (`ip_nonlocal_bind` + fail-closed assert + live wt0 fact) → Task 1 Steps 4–6. ✓
- **§ new code/vars** (`base__ssh_listen_mesh_only`, `base__ssh_listen_addr`, host_vars/askari.yml, offsite flag, catalog, TF) → Tasks 1–4. ✓
- **§ staged cutover** → Task 5 Steps 1–6, with the firewall auto-rollback as the gate. ✓
- **§ testing** → Molecule render asserts (ListenAddress, sysctl, public-zone rule) + filter unit test + live out-of-band checks. The fail-closed assert is exercised by code; to spot-check it, temporarily blank `base__ssh_listen_addr` in the converge fixture and confirm `make test ROLE=base` fails on the assert, then revert (manual, not automated — a deliberate-failure Molecule scenario is non-idiomatic). ✓
- **§ risks/rollback** → auto-rollback timer (Task 5 Step 3), `ip_nonlocal_bind` (Task 1), Hetzner console break-glass, re-addable TF rule. ✓
- **IPv6 note** → recorded in the catalog comment (Task 2 Step 6); acceptable because askari has only an A record.

View file

@ -1,409 +0,0 @@
# Mesh-hardening redesign (askari) — Implementation Plan
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
**Goal:** Harden askari's inbound surface with the proven ubongo INPUT-only default-deny pattern (SSH scoped by `iifname "wt0"` + a permanent WAN break-glass), and make the NetBird coordinator survive a no-egress startup — reboot-safe, no boot-race, no lockout.
**Architecture:** Mirror mesh-hardening 2/3 (ubongo): `base` firewall INPUT-only (`base__firewall_input_only: true`, forward stays `policy accept` so Docker forwarding/NAT survive), **no** sshd `ListenAddress` change (the firewall, not sshd, scopes `:22`). The coordinator-host exception: WAN `:22` stays open from ubongo's static WAN IP as the always-available non-mesh break-glass (the Hetzner console is the ultimate fallback). A `netbird_coordinator` change disables geolocation so a transient egress loss can't FATAL the control plane. Validate firewall reboot-safety on a throwaway VM (ADR-025 harness) GREEN before a supervised live cutover.
**Tech Stack:** Ansible (`base`, `netbird_coordinator` roles), nftables, Docker Compose, Molecule (Debian 13), the `scripts/integration-vm.py` ADR-025 harness, NetBird self-hosted `netbird-server:0.72.4`.
**Spec:** `docs/superpowers/specs/2026-06-19-mesh-hardening-askari-redesign-design.md`
## Global Constraints
- **FQCN always** (`ansible.builtin.*`); role defaults use the `rolename__var` namespace.
- **No sshd `ListenAddress` change** — `base__ssh_listen_mesh_only` stays `false` everywhere here (this is what sidesteps the 2026-06-17 boot-race).
- **WAN `:22` is never closed** — no Terraform / Hetzner-Cloud-Firewall change in this plan.
- **`base__firewall_input_only: true` on askari** — the forward chain must stay `policy accept` (Docker host). Never apply a forward-`drop` firewall to askari.
- **ubongo's WAN IP is `91.226.145.80`** (operator-confirmed static 2026-06-19) — the break-glass anchor.
- **askari `wt0` IP is `100.99.226.39`**; askari domain `netbird.askari.wingu.me`.
- **Before any commit:** `rbw unlocked` must succeed (the pre-commit hook decrypts `vault.yml`); run `make lint` and it must be clean.
- **Tags:** import each role at play level with its role-name tag; only use concern tags from `tests/tags.yml`.
- **Harness GREEN before live** (Task 3 before Task 4). The live cutover (Task 4) is **operator-gated** — never run autonomously.
---
### Task 1: Disable geolocation in `netbird_coordinator` (FRICTION 2026-06-17 #4)
Make the control plane survive a startup with no container egress: NetBird's combined server downloads the GeoLite2 DB at boot and treats failure as FATAL. boma uses no geo posture (ACL is Allow-All), so disable geolocation entirely via the documented env var. TDD'd through the role's render-only Molecule scenario.
> verified: NetBird self-hosted geolocation knobs (`NB_DISABLE_GEOLOCATION`, `disableGeoliteUpdate`, GeoLite2 pre-seed) · WebFetch · docs.netbird.io/selfhosted/geo-support · 2026-06-19 — *from a docs summary; the live "healthy with egress blocked" check in Task 4 is the real gate, with a concrete pre-seed fallback there.*
**Files:**
- Modify: `roles/netbird_coordinator/defaults/main.yml` (add the knob)
- Modify: `roles/netbird_coordinator/templates/docker-compose.yml.j2:14-27` (add `environment:` to `netbird-server`)
- Test: `roles/netbird_coordinator/molecule/default/verify.yml:21-32` (assert the rendered compose)
- Modify: `roles/netbird_coordinator/README.md` (one line documenting the knob)
**Interfaces:**
- Produces: role default `netbird_coordinator__disable_geolocation` (bool, default `true`); rendered compose env `NB_DISABLE_GEOLOCATION: "true"` on the `netbird-server` service.
- [ ] **Step 1: Write the failing Molecule assertion**
Append to `roles/netbird_coordinator/molecule/default/verify.yml` (after the existing compose-tags assert, inside the same `tasks:` list):
```yaml
- name: Assert geolocation is disabled (FRICTION 2026-06-17 #4 — no geo-DB download FATAL)
ansible.builtin.assert:
that:
- "'NB_DISABLE_GEOLOCATION: \"true\"' in (_compose.content | b64decode)"
fail_msg: >-
compose must set NB_DISABLE_GEOLOCATION=true so a no-egress startup can't FATAL
the coordinator on the GeoLite2 download
success_msg: "geolocation disabled in compose"
```
- [ ] **Step 2: Run Molecule to verify it fails**
Run: `make test ROLE=netbird_coordinator`
Expected: FAIL at "Assert geolocation is disabled" — the rendered compose has no `NB_DISABLE_GEOLOCATION`.
- [ ] **Step 3: Add the default knob**
Add to `roles/netbird_coordinator/defaults/main.yml` (after line 7, the `__domain` line):
```yaml
# Disable NetBird's GeoLite2 geolocation (download + lookups). boma uses no geo posture
# (ACL is Allow-All), and the combined server treats a failed GeoLite2 download as FATAL —
# so a transient egress loss (NAT wiped on `nft flush`, or the boot window before Docker
# re-adds NAT) would crash-loop the whole control plane (FRICTION 2026-06-17 #4). Disabling
# removes that dependency. Revisit if a future ACL sub-project wants geo-based posture.
netbird_coordinator__disable_geolocation: true
```
- [ ] **Step 4: Render the env in the compose template**
In `roles/netbird_coordinator/templates/docker-compose.yml.j2`, add an `environment:` block to the `netbird-server` service, immediately after its `command:` line (line 18):
```yaml
environment:
# Disable geolocation so a no-egress startup can't FATAL the control plane
# (FRICTION 2026-06-17 #4). boma uses no geo posture (ACL Allow-All).
NB_DISABLE_GEOLOCATION: "{{ netbird_coordinator__disable_geolocation | string | lower }}"
```
- [ ] **Step 5: Run Molecule to verify it passes**
Run: `make test ROLE=netbird_coordinator`
Expected: PASS — all asserts green, including "geolocation disabled in compose"; Molecule idempotence clean.
- [ ] **Step 6: Document the knob**
Add one line to `roles/netbird_coordinator/README.md` under its variables/defaults section:
```markdown
- `netbird_coordinator__disable_geolocation` (default `true`) — sets `NB_DISABLE_GEOLOCATION` so a no-egress startup can't FATAL the server on the GeoLite2 download (FRICTION 2026-06-17 #4).
```
- [ ] **Step 7: Lint and commit**
```bash
rbw unlocked && make lint
git add roles/netbird_coordinator/defaults/main.yml \
roles/netbird_coordinator/templates/docker-compose.yml.j2 \
roles/netbird_coordinator/molecule/default/verify.yml \
roles/netbird_coordinator/README.md
git commit -m "feat(netbird_coordinator): disable geolocation so no-egress startup can't FATAL the control plane" \
-m "Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
```
---
### Task 2: Enable askari's host firewall (INPUT-only) + WAN break-glass + manage over `wt0`
Flip askari from "firewall not applied" to the redesigned INPUT-only default-deny, add the permanent WAN break-glass source, and point Ansible at the mesh. Pure inventory change — validated by lint + inventory resolution (the firewall *behavior* is proven in Task 3).
**Files:**
- Modify: `inventories/production/group_vars/offsite_hosts/vars.yml` (replace the whole file body)
- Create: `inventories/production/host_vars/askari.yml`
**Interfaces:**
- Consumes: `base` knobs `base__firewall_apply`, `base__firewall_input_only`, `base__firewall_admin_addrs`, `base__ssh_listen_mesh_only`, `base__mesh_enabled` (all defined in `roles/base/defaults/main.yml`).
- Produces: askari resolves `ansible_host: 100.99.226.39`, `base__firewall_apply: true`, `base__firewall_input_only: true`, `base__firewall_admin_addrs: ["91.226.145.80"]`.
- [ ] **Step 1: Rewrite the offsite group_vars**
Replace the body of `inventories/production/group_vars/offsite_hosts/vars.yml` with:
```yaml
---
# Off-site hosts (askari). askari runs the NetBird coordinator AND is a mesh peer
# (ADR-016, M5).
#
# Mesh-hardening REDESIGN (2026-06-19): the 2026-06-17 attempt was backed out (forward
# `policy drop` broke Docker on reboot; wt0-only sshd left no break-glass; ip_nonlocal_bind
# did not beat the boot-race). The redesign mirrors the proven ubongo 2/3 pattern:
# - INPUT-only default-deny (base__firewall_input_only) — forward stays `policy accept`
# so Docker container forwarding/NAT survive a reboot;
# - SSH scoped by the host firewall (iifname wt0 + admin-addr), NOT a sshd ListenAddress
# change — base__ssh_listen_mesh_only stays false, so there is no boot-race;
# - WAN :22 is DELIBERATELY left open from ubongo's WAN IP (base__firewall_admin_addrs)
# as the permanent non-mesh break-glass — the coordinator-host exception (a host's only
# management path must never depend on a service that host itself hosts).
# Spec: docs/superpowers/specs/2026-06-19-mesh-hardening-askari-redesign-design.md
base__mesh_enabled: true
base__firewall_apply: true
base__firewall_input_only: true # forward stays `policy accept` → Docker-safe
base__ssh_listen_mesh_only: false # no sshd ListenAddress change → no boot-race
base__firewall_admin_addrs:
- 91.226.145.80 # ubongo's (static) WAN IP — the permanent non-mesh SSH break-glass
```
- [ ] **Step 2: Create the askari host_vars to manage over the mesh**
Create `inventories/production/host_vars/askari.yml`:
```yaml
---
# Manage askari over the NetBird mesh (wt0). Overrides the TF-generated WAN `ansible_host`
# in offsite.yml (host_vars are NOT regenerated by tf_to_inventory.py). The WAN :22 path
# (Hetzner Cloud Firewall + base__firewall_admin_addrs = ubongo's WAN) stays as the
# break-glass; the Hetzner web console is the IP-independent ultimate fallback.
# Spec: docs/superpowers/specs/2026-06-19-mesh-hardening-askari-redesign-design.md
ansible_host: 100.99.226.39
```
- [ ] **Step 3: Verify the inventory resolves**
Run: `ansible-inventory -i inventories/production --host askari`
Expected: JSON shows `"ansible_host": "100.99.226.39"`, `"base__firewall_apply": true`, `"base__firewall_input_only": true`, and `"base__firewall_admin_addrs": ["91.226.145.80"]`.
- [ ] **Step 4: Lint**
Run: `rbw unlocked && make lint`
Expected: clean (no yamllint/ansible-lint errors).
- [ ] **Step 5: Commit**
```bash
git add inventories/production/group_vars/offsite_hosts/vars.yml \
inventories/production/host_vars/askari.yml
git commit -m "feat(inventory): askari INPUT-only firewall + WAN break-glass + manage over wt0" \
-m "Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
```
---
### Task 3: Integration harness "askari_inputonly" profile — the reboot-safety GREEN gate
Prove on a throwaway VM (ADR-025) that the redesigned firewall is reboot-safe BEFORE touching the real host: INPUT default-deny + forward accept + the admin-addr break-glass + published-port DNAT all survive a reboot. New profile (keeps the existing `askari` profile, which validates the `docker_host` container-forward drop-in path, intact).
**Files:**
- Create: `tests/integration/profiles/askari_inputonly.json`
- Create: `tests/integration/overrides/askari_inputonly.yml`
- Modify: `tests/integration/verify.yml` (allow-list + a new profile branch)
**Interfaces:**
- Consumes: the `scripts/integration-vm.py` harness; `make test-integration HOST=<profile>` maps `HOST` to `profiles/<HOST>.json` (a profile name, not a production inventory host).
- Produces: profile `askari_inputonly` with `integration_profile: askari_inputonly`.
- [ ] **Step 1: Add the new profile to the verify allow-list and a failing branch**
In `tests/integration/verify.yml`, change the allow-list assert (line 14) from:
```yaml
- integration_profile in ['askari', 'ubongo']
```
to:
```yaml
- integration_profile in ['askari', 'askari_inputonly', 'ubongo']
```
and update its `fail_msg` (line 15) to `"integration_profile must be set in the profile overlay (askari|askari_inputonly|ubongo)"`. Then append this block to the `tasks:` list (after the ubongo block):
```yaml
# ── askari_inputonly profile — the mesh-hardening REDESIGN (2026-06-19) ──
# INPUT-only default-deny on a Docker host: input policy drop, forward policy ACCEPT
# (Docker-safe), SSH via the admin-addr break-glass, published-port DNAT survives reboot.
- name: (askari_inputonly) Read the live nftables ruleset
when: integration_profile == 'askari_inputonly'
ansible.builtin.command: nft list ruleset
register: _nft_io
changed_when: false
- name: (askari_inputonly) INPUT default-deny, forward permissive, admin-addr break-glass
when: integration_profile == 'askari_inputonly'
ansible.builtin.assert:
that:
- "'hook input priority filter; policy drop;' in _nft_io.stdout"
- "'hook forward priority filter; policy accept;' in _nft_io.stdout"
- "'ip saddr 192.168.150.1 tcp dport 22 accept' in _nft_io.stdout"
fail_msg: >-
askari_inputonly: expected input policy drop, forward policy accept (input-only),
and the admin-addr break-glass (192.168.150.1) SSH allow in the live ruleset.
- name: (askari_inputonly) Gather service facts
when: integration_profile == 'askari_inputonly'
ansible.builtin.service_facts:
- name: (askari_inputonly) Docker daemon is active
when: integration_profile == 'askari_inputonly'
ansible.builtin.assert:
that: "ansible_facts.services['docker.service'].state == 'running'"
fail_msg: "docker.service is not running"
- name: (askari_inputonly) Published port answers from the controller (DNAT + forward alive)
when: integration_profile == 'askari_inputonly'
delegate_to: localhost
become: false
ansible.builtin.uri:
url: "http://{{ ansible_host }}/"
follow_redirects: none
status_code: [200, 301, 308, 404, 502, 503]
timeout: 10
register: _probe_io
retries: 5
delay: 6
until: _probe_io is succeeded
```
- [ ] **Step 2: Create the profile descriptor**
Create `tests/integration/profiles/askari_inputonly.json`:
```json
{
"groups": ["offsite_hosts"],
"applies": [
{"playbook": "site.yml", "tags": ["base"]},
{"playbook": "offsite.yml", "tags": ["docker_host", "reverse_proxy"]}
],
"extra_vars_files": ["overrides/askari_inputonly.yml"],
"mem_mib": 3072,
"vcpus": 2
}
```
- [ ] **Step 3: Create the overlay**
Create `tests/integration/overrides/askari_inputonly.yml`:
```yaml
---
# Integration overlay (ADR-025) — the askari mesh-hardening REDESIGN (2026-06-19).
# Validates INPUT-only default-deny on a Docker host: input policy drop, forward policy
# accept (Docker-safe), SSH via the admin-addr break-glass, reboot-survivable.
integration_profile: askari_inputonly
base__firewall_apply: true
base__firewall_input_only: true
# No sshd ListenAddress change — never wt0-only in a throwaway VM.
base__ssh_listen_mesh_only: false
# Isolated VM: never touch the real mesh.
base__mesh_enabled: false
# The non-mesh SSH break-glass = the admin-addr path the real design uses. Point it at the
# VM's libvirt-NAT gateway (where the harness connects from), by source IP so it is
# interface-independent and the default-deny + reboot don't lock out the driver. This
# mirrors askari's real base__firewall_admin_addrs (ubongo's WAN) in the test topology.
base__firewall_admin_addrs:
- 192.168.150.1
```
- [ ] **Step 4: Run the harness — the GREEN gate**
Run: `make test-integration HOST=askari_inputonly`
Expected: GREEN. The harness boots a VM, applies `base` (INPUT-only) + `docker_host` + `reverse_proxy`, **reboots**, re-SSHes (proving the admin-addr break-glass survives), then `verify.yml` asserts input `policy drop`, forward `policy accept`, the `192.168.150.1` SSH allow, Docker active, and the published `:80` answering. Clean up: `make test-integration-clean`.
- [ ] **Step 5: Commit**
```bash
rbw unlocked && make lint
git add tests/integration/profiles/askari_inputonly.json \
tests/integration/overrides/askari_inputonly.yml \
tests/integration/verify.yml
git commit -m "test(integration): askari_inputonly profile — INPUT-only default-deny reboot gate" \
-m "Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
```
---
### Task 4: Supervised live cutover + STATUS/ROADMAP update — ⚠️ OPERATOR-GATED
> **⚠️ DO NOT run this task autonomously.** It changes the live off-site host (lockout risk) and runs `make deploy`. An automated executor must STOP here and hand back to the operator. Preconditions: Tasks 1–3 committed and GREEN; `rbw unlocked`; the **Hetzner web console** open in a browser (the out-of-band ultimate break-glass); the operator present. The WAN `:22` break-glass is never removed, so a fallback path is open throughout (FRICTION 2026-06-17 #6).
**Files (Step 7 only):**
- Modify: `STATUS.md` (askari row), `docs/ROADMAP.md` (Next step)
- [ ] **Step 1: Pre-check both paths are healthy**
```bash
ssh sjat@100.99.226.39 true && echo "wt0 SSH OK"
ansible askari -i inventories/production -m ping
curl -sI https://test.askari.wingu.me | head -1
curl -sI https://netbird.askari.wingu.me | head -1
```
Expected: wt0 SSH OK; ping `pong`; both curls `HTTP/2 200`.
- [ ] **Step 2: Dry-run the converge (mandatory `check` before `deploy`)**
```bash
make check PLAYBOOK=site LIMIT=askari
```
Expected: changes limited to the `base` firewall (input-only ruleset, admin-addr) + the `netbird_coordinator` compose env (`NB_DISABLE_GEOLOCATION`). Review and show the output before proceeding.
- [ ] **Step 3: Apply (operator present, console open, auto-rollback armed)**
```bash
make deploy PLAYBOOK=site LIMIT=askari
```
The `base` firewall concern arms the auto-rollback timer (`base__firewall_rollback_timeout: 45`) and reconnects over `wt0` — a bad ruleset reverts itself. Expected: converge OK; SSH-over-`wt0` stays up.
- [ ] **Step 4: Rebuild NAT and confirm the coordinator is healthy with geo disabled**
`base`'s `flush ruleset` wipes Docker's nat (FRICTION) — rebuild it, then confirm the control plane:
```bash
ssh sjat@100.99.226.39 'sudo systemctl restart docker'
ssh sjat@100.99.226.39 'docker ps --format "{{.Names}} {{.Status}}"'
ssh sjat@100.99.226.39 'docker logs --since 2m netbird-server 2>&1 | grep -iE "geo|fatal" || echo "no geo/fatal log lines"'
```
Expected: `netbird-server` + `netbird-dashboard` Up; no geo-DB FATAL.
> **Contingency (only if `netbird-server` still FATALs on geolocation):** `NB_DISABLE_GEOLOCATION` was not honored by the pinned image. Pre-seed the DB into the volume instead — `ssh sjat@100.99.226.39 'sudo curl -fSL -o /var/lib/docker/volumes/netbird_data/_data/GeoLite2-City_20260101.mmdb https://pkgs.netbird.io/geolite2/GeoLite2-City.mmdb && sudo docker restart netbird-server'` — and add `disableGeoliteUpdate: true` under `server:` in `config.yaml.j2` so it never re-downloads. Re-verify, then fold the working fix back into the role (amend Task 1).
- [ ] **Step 5: Verify the new steady state (both SSH paths + services)**
```bash
ssh sjat@100.99.226.39 true && echo "wt0 SSH OK"
# From ubongo: SSH to askari's WAN IP. ubongo's packets egress via OPNsense, SNAT'd to the
# WAN IP 91.226.145.80 — matching askari's admin-addr break-glass rule. (No BindAddress:
# ubongo does not hold 91.226.145.80; OPNsense does.)
ssh sjat@77.42.120.136 true && echo "WAN break-glass OK"
curl -sI https://test.askari.wingu.me | head -1
nc -vz -u 77.42.120.136 3478 # STUN answers
```
Expected: both SSH paths succeed; cert valid; STUN reachable.
- [ ] **Step 6: Reboot-resilience — the real test (console available)**
```bash
ssh sjat@100.99.226.39 'sudo systemctl reboot'
# wait ~60s, then from ubongo — no manual intervention:
sleep 60; ssh sjat@100.99.226.39 'nft list chain inet filter input | grep -E "policy drop|wt0|91.226.145.80"'
curl -sI https://netbird.askari.wingu.me | head -1
ssh sjat@100.99.226.39 'docker ps --format "{{.Names}} {{.Status}}"'
```
Expected, unattended: input `policy drop` with the `wt0` + `91.226.145.80` allows; public cert valid; both containers Up; `wt0` SSH back. (If lost: recover via the Hetzner console — the firewall auto-rollback and the WAN break-glass should make that unnecessary.)
- [ ] **Step 7: Record reality in the ground-truth docs and commit**
Update `STATUS.md` (the askari row): firewall now **applied** — INPUT-only default-deny, SSH `wt0`-primary + permanent WAN break-glass (ubongo's WAN), managed over `wt0`, geolocation disabled, **reboot-validated**. Update `docs/ROADMAP.md` "Next step": mark the askari SSH→`wt0` redesign **DONE**; the next mesh-hardening sub-project is the **SPOF reduction** (askari relay single-point-of-failure) — confirmed by the `ubongo → askari` `Relayed` finding (2026-06-19).
```bash
rbw unlocked && make lint
git add STATUS.md docs/ROADMAP.md
git commit -m "docs(status): mesh-hardening redesign — askari INPUT-only + WAN break-glass applied + reboot-validated" \
-m "Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
```
---
## Notes / out of scope (carry to the SPOF sub-project)
- **SPOF reduction is the next sub-project** (operator decision 2026-06-19): `ubongo → askari` is currently `Relayed` through askari's own relay; if askari is down, relayed peers lose the mesh data plane. Its own spec.
- **NetBird ACL stays Allow-All** — any enrolled peer can reach askari `wt0:22` until a later sub-project.
- **Full forward-chain hardening** (`docker_host` container-forward drop-in over the `input_only` baseline) — a later tightening; the existing `askari` integration profile already covers that path.
- **Coordinator off-site backup** (FRICTION 2026-06-17 #5, ADR-022) — still pending; not in scope.

View file

@ -1,470 +0,0 @@
# Mesh-hardening 2/3 — ubongo INPUT-only default-deny — Implementation Plan
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
**Goal:** Apply base's nftables firewall to the control node (ubongo) as an INPUT-only default-deny — hardening its inbound surface — while leaving the forward chain permissive so Docker egress and the libvirt-NAT integration harness keep working, and without any sshd `ListenAddress` change.
**Architecture:** Two new `base` knobs make the existing firewall concern fit a control node: `base__firewall_input_only` flips the forward chain to `policy accept` (host-local input filtering only), and `base__firewall_admin_addrs` adds operator-workstation LAN sources to the SSH allow-list (alongside `wt0` and `ssh-from-control`). sshd is untouched (nftables does the scoping → no `ip_nonlocal_bind` boot-race). The change is validated on a throwaway VM via the ADR-025 integration harness (a new "be ubongo" profile) before an operator-supervised live cutover whose safety net is the firewall auto-rollback timer plus the permanent on-prem physical console.
**Tech Stack:** Ansible (role `base`, FQCN), nftables, Jinja2, Molecule on Debian 13, pytest (none new), the ADR-025 integration harness (`scripts/integration-vm.py`, JSON profiles, `-e @` overlays).
**Spec:** `docs/superpowers/specs/2026-06-19-mesh-hardening-ubongo-default-deny-design.md`
**Conventions:** `make lint` and `make test ROLE=base` before each commit; `make check` before `make deploy`; never hand-edit the generated `offsite.yml`; `rbw unlocked` for any commit touching Ansible content and for the integration/live applies (the production `group_vars/all/vault.yml` is in inventory scope and gets decrypted at playbook load). Tasks 1–3 are code (subagent-driven, each lint/Molecule-verified). Task 4 is a real-VM validation gate on ubongo. Task 5 is the live, operator-supervised cutover.
---
## File Structure
| File | Create/Modify | Responsibility |
|---|---|---|
| `roles/base/defaults/main.yml` | Modify | Declare `base__firewall_input_only` + `base__firewall_admin_addrs` (defaults: off / empty). |
| `roles/base/templates/nftables.conf.j2` | Modify | Conditional forward policy; render an SSH-allow rule per admin address. |
| `roles/base/molecule/default/converge.yml` | Modify | Fixture: an admin-addr source (input-only stays at its default → forward drop). |
| `roles/base/molecule/default/verify.yml` | Modify | Assert forward-drop default + the admin-addr rule render. |
| `inventories/production/group_vars/control/vars.yml` | Modify | Turn the knobs on for ubongo (input-only; mamba's LAN IP). |
| `tests/integration/overrides/ubongo.yml` | Create | The "be ubongo" overlay (input-only firewall; harness SSH lifeline). |
| `tests/integration/profiles/ubongo.json` | Create | The "be ubongo" VM profile (group `control`, applies `site.yml:base`). |
| `tests/integration/overrides/askari.yml` | Modify | Add the `integration_profile` marker (verify is now profile-aware). |
| `tests/integration/verify.yml` | Modify | Gate the askari (Docker/DNAT) block; add the ubongo (input-only) block + a guard. |
| `STATUS.md`, `docs/ROADMAP.md` | Modify (Task 5) | Record mesh-hardening 2/3 done. |
---
### Task 1: base role — `base__firewall_input_only` (forward policy) + `base__firewall_admin_addrs` (LAN SSH allow)
**Files:**
- Modify: `roles/base/defaults/main.yml`
- Modify: `roles/base/templates/nftables.conf.j2`
- Modify: `roles/base/molecule/default/converge.yml`
- Modify: `roles/base/molecule/default/verify.yml`
> **Test strategy (note):** Molecule renders one fixture, so it locks the *secure default* —
> `input_only` **off** → forward `policy drop` — plus the new admin-addr rule (red→green). The
> `input_only` **on** → forward `policy accept` path is exercised on a real VM by the
> integration "be ubongo" profile (Tasks 3–4), whose verify fails red until this template
> conditional exists. Both branches are covered, across the two test layers.
- [ ] **Step 1: Write the failing test (extend Molecule verify)**
In `roles/base/molecule/default/verify.yml`, after the `Assert the docker_host extension hook is present` block, add:
```yaml
- name: Assert the forward chain defaults to policy drop (input_only off)
ansible.builtin.assert:
that:
- "'hook forward priority 0; policy drop;' in nft"
fail_msg: >-
forward chain must default to policy drop when base__firewall_input_only is
false (container isolation stays the norm on real service hosts)
- name: Assert the admin-addr SSH allow rule (operator workstation on the LAN)
ansible.builtin.assert:
that:
- "'ip saddr 10.30.0.77 tcp dport 22 accept' in nft"
fail_msg: "missing admin-addr SSH allow rule from base__firewall_admin_addrs"
```
- [ ] **Step 2: Add the fixture that drives it (Molecule converge)**
In `roles/base/molecule/default/converge.yml`, add to the `vars:` block (after the `base__firewall_control_addr` line):
```yaml
base__firewall_admin_addrs:
- "10.30.0.77" # fixture: an operator-workstation LAN source (admin-addr SSH allow)
```
- [ ] **Step 3: Run the test to verify it fails**
Run: `make test ROLE=base`
Expected: FAIL on `Assert the admin-addr SSH allow rule` (the template does not consume `base__firewall_admin_addrs` yet, so the `ip saddr 10.30.0.77 …` rule is absent). The forward-drop assertion passes already (the template currently hardcodes `policy drop`).
- [ ] **Step 4: Add the defaults**
In `roles/base/defaults/main.yml`, after the `base__firewall_apply: true` line (end of the firewall behaviour block, currently line 13), add:
```yaml
base__firewall_input_only: false # true → the forward chain is `policy accept` (host-local
# INPUT filtering only). For hosts that forward/route
# container or NAT traffic (the control node's Docker +
# libvirt-NAT) where a forward default-deny would break
# them. Real service hosts keep this false (forward drop).
base__firewall_admin_addrs: [] # extra LAN source IPs allowed to SSH, besides wt0 +
# ssh-from-control. For an operator workstation reaching
# the host over the LAN (no mesh). Key-gated. (ADR-021)
```
- [ ] **Step 5: Make the forward policy conditional + render the admin-addr rules**
In `roles/base/templates/nftables.conf.j2`:
(a) Replace the forward-chain line (currently line 21):
```jinja
chain forward { type filter hook forward priority 0; policy {{ 'accept' if base__firewall_input_only | bool else 'drop' }}; }
```
(b) After the `ssh-from-control` `{% endif %}` (currently line 14) and before the `ip protocol icmp accept` line, add the admin-addr loop:
```jinja
{% for addr in base__firewall_admin_addrs %}
ip saddr {{ addr }} tcp dport {{ base__firewall_ssh_port }} accept
{% endfor %}
```
- [ ] **Step 6: Run the test to verify it passes**
Run: `make test ROLE=base`
Expected: PASS — converge renders the ruleset; verify confirms the forward chain is `policy drop` (input_only defaults false) and the `ip saddr 10.30.0.77 tcp dport 22 accept` rule is present; all pre-existing assertions stay green.
- [ ] **Step 7: Lint**
Run: `make lint`
Expected: `Passed: 0 failure(s)` and `check-tags: OK`.
- [ ] **Step 8: Commit**
```bash
git add roles/base/defaults/main.yml roles/base/templates/nftables.conf.j2 \
roles/base/molecule/default/converge.yml roles/base/molecule/default/verify.yml
git commit -m "feat(base): input-only forward policy + admin-addr SSH allow
base__firewall_input_only renders the forward chain policy accept (host-local
INPUT filtering only) for hosts that forward container/NAT traffic; defaults
false so real service hosts keep the forward default-deny. base__firewall_admin_addrs
adds operator-workstation LAN sources to the SSH allow-list alongside wt0 +
ssh-from-control. Molecule locks the secure default + the admin rule.
Mesh-hardening 2/3 (ADR-020/021).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
```
---
### Task 2: inventory — enable input-only default-deny + mamba on ubongo (control group)
**Files:**
- Modify: `inventories/production/group_vars/control/vars.yml`
- [ ] **Step 1: Turn the knobs on for the control group**
Append to `inventories/production/group_vars/control/vars.yml`:
```yaml
# Mesh-hardening 2/3 (2026-06-19, ADR-020/021): apply base's host firewall to ubongo as
# INPUT-only default-deny — harden the inbound surface, leave the forward chain permissive so
# Docker egress + the libvirt-NAT integration harness keep working. sshd is unchanged
# (nftables scopes inbound), so there is no boot-race. Reach ubongo over wt0 (mesh), the
# ssh-from-control self-path (base__firewall_control_addr, group_vars/all = 10.20.10.151), or
# mamba on the LAN. Break-glass: the physical console. (base__firewall_apply defaults true.)
base__firewall_input_only: true
base__firewall_admin_addrs:
- "10.20.10.50" # mamba over the LAN (NetBird off). Raw DHCP lease — revisit with an
# OPNsense reservation when OPNsense-as-code lands; backstopped by wt0.
- "10.20.10.17" # 2nd operator workstation (MAC bc:0f:f3:c8:4a:8a). Raw lease — ditto.
```
- [ ] **Step 2: Verify the vars resolve for ubongo**
Run: `.venv/bin/ansible-inventory -i inventories/production/ --host ubongo 2>/dev/null | grep -E 'firewall_input_only|firewall_admin_addrs|10.20.10.(50|17)'`
Expected: shows `"base__firewall_input_only": true` and `"base__firewall_admin_addrs": ["10.20.10.50", "10.20.10.17"]`.
- [ ] **Step 3: Lint**
Run: `make lint`
Expected: clean pass (`check-tags: OK`).
- [ ] **Step 4: Commit**
```bash
git add inventories/production/group_vars/control/vars.yml
git commit -m "feat(inventory): ubongo gets INPUT-only host firewall + mamba LAN SSH
Enables base__firewall_input_only on the control group (forward chain stays
permissive so Docker egress + the integration-test libvirt NAT survive) and
allows the operator workstations' LAN IPs (mamba 10.20.10.50 + 10.20.10.17;
raw leases, backstopped by wt0). Mesh-hardening 2/3.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
```
---
### Task 3: integration harness — "be ubongo" profile (overlay + profile + profile-aware verify)
**Files:**
- Create: `tests/integration/overrides/ubongo.yml`
- Create: `tests/integration/profiles/ubongo.json`
- Modify: `tests/integration/overrides/askari.yml`
- Modify: `tests/integration/verify.yml`
- [ ] **Step 1: Create the "be ubongo" overlay**
Create `tests/integration/overrides/ubongo.yml`:
```yaml
---
# Integration-test overlay for the "ubongo" profile (ADR-025). Passed via `-e @`.
# Exercises mesh-hardening 2/3: base's INPUT-only default-deny on the control node — input
# chain default-deny, forward chain left permissive (Docker/libvirt-NAT safe), no sshd
# ListenAddress change (so no boot-race).
integration_profile: ubongo
base__firewall_apply: true
base__firewall_input_only: true # forward chain renders `policy accept`
base__firewall_admin_addrs:
- "192.168.150.98" # two representative LAN sources — exercises the
- "192.168.150.99" # admin-addr loop with a multi-entry list (like ubongo)
# Never wt0-only; never touch the real mesh from a throwaway VM.
base__ssh_listen_mesh_only: false
base__mesh_enabled: false
# Allow SSH from the libvirt-NAT gateway (where the driver/ansible connect from) so the
# default-deny apply + the reboot don't lock out the harness. By source IP (interface-
# independent). This is the harness's lifeline; the admin-addr above is only exercised.
base__firewall_control_addr: "192.168.150.1"
```
- [ ] **Step 2: Create the "be ubongo" VM profile**
Create `tests/integration/profiles/ubongo.json`:
```json
{
"groups": ["control"],
"applies": [
{"playbook": "site.yml", "tags": ["base"]}
],
"extra_vars_files": ["overrides/ubongo.yml"],
"mem_mib": 2048,
"vcpus": 2
}
```
- [ ] **Step 3: Mark the askari overlay with its profile name**
In `tests/integration/overrides/askari.yml`, after the two header comment lines (before `base__firewall_apply: true`), add:
```yaml
integration_profile: askari
```
- [ ] **Step 4: Make `verify.yml` profile-aware (the test)**
Replace the entire contents of `tests/integration/verify.yml` with:
```yaml
---
# Integration verify (ADR-025). Outcome-based, profile-aware: the active profile is named by
# `integration_profile` (set in each profile's overlay). Each profile asserts its own success
# criteria; an unknown/unset profile fails loudly (never a silent pass).
- name: Verify the rebooted host
hosts: all
become: true
gather_facts: false
tasks:
- name: A known integration_profile must be set (no silent pass)
ansible.builtin.assert:
that:
- integration_profile is defined
- integration_profile in ['askari', 'ubongo']
fail_msg: "integration_profile must be set in the profile overlay (askari|ubongo)"
# ── askari profile — Docker host: published-port forwarding survives the reboot ──
# The load-bearing check probes the VM's published :80 FROM the controller (ubongo) — if
# base's forward-drop killed DNAT, this times out (the FRICTION 2026-06-17 #1 bug).
- name: (askari) Gather service facts
when: integration_profile == 'askari'
ansible.builtin.service_facts:
- name: (askari) Docker daemon is active
when: integration_profile == 'askari'
ansible.builtin.assert:
that: "ansible_facts.services['docker.service'].state == 'running'"
fail_msg: "docker.service is not running"
- name: (askari) Forward chain permits container traffic (drop-in loaded)
when: integration_profile == 'askari'
ansible.builtin.command: nft list chain inet filter forward
register: _fwd
changed_when: false
- name: (askari) Assert container forwarding is allowed (not pure drop)
when: integration_profile == 'askari'
ansible.builtin.assert:
that: "'accept' in _fwd.stdout"
fail_msg: >-
forward chain is pure drop — container forwarding will die on reboot
(FRICTION 2026-06-17 #1). docker_host container-forward drop-in missing.
- name: (askari) Published port answers from the controller (DNAT + forward alive)
when: integration_profile == 'askari'
delegate_to: localhost
become: false
ansible.builtin.uri:
url: "http://{{ ansible_host }}/"
follow_redirects: none
status_code: [200, 301, 308, 404, 502, 503]
timeout: 10
register: _probe
retries: 5
delay: 6
until: _probe is succeeded
# ── ubongo profile — control node: INPUT-only default-deny survives the reboot ──
# SSH reachability across the reboot is proven by the harness itself (it re-SSHes and
# checks boot_id changed before this verify runs). Here we assert the ruleset shape.
- name: (ubongo) Read the live nftables ruleset
when: integration_profile == 'ubongo'
ansible.builtin.command: nft list ruleset
register: _nft
changed_when: false
- name: (ubongo) INPUT default-deny, forward permissive, admin-addr allow
when: integration_profile == 'ubongo'
ansible.builtin.assert:
that:
- "'hook input priority 0; policy drop;' in _nft.stdout"
- "'hook forward priority 0; policy accept;' in _nft.stdout"
- "'ip saddr 192.168.150.98 tcp dport 22 accept' in _nft.stdout"
- "'ip saddr 192.168.150.99 tcp dport 22 accept' in _nft.stdout"
fail_msg: >-
ubongo profile: expected input policy drop, forward policy accept (input-only),
and both admin-addr (192.168.150.98/99) SSH allows in the live ruleset.
```
- [ ] **Step 5: Validate the JSON + lint**
Run: `.venv/bin/python -m json.tool tests/integration/profiles/ubongo.json >/dev/null && echo OK` then `make lint`
Expected: `OK`, then a clean lint pass (`check-tags: OK`).
- [ ] **Step 6: Commit**
```bash
git add tests/integration/overrides/ubongo.yml tests/integration/profiles/ubongo.json \
tests/integration/overrides/askari.yml tests/integration/verify.yml
git commit -m "test(integration): add the 'be ubongo' profile (input-only default-deny)
A control-group VM that applies base with INPUT-only default-deny (forward
policy accept; admin-addr SSH allow). verify.yml is now profile-aware via an
integration_profile marker — the askari Docker/DNAT block is gated, and a ubongo
block asserts input drop + forward accept + the admin-addr rule. Enables
\`make test-integration HOST=ubongo\`. Mesh-hardening 2/3 (ADR-025).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
```
---
### Task 4: Validate on the integration harness (`make test-integration HOST=ubongo`) — the GREEN gate
> Runs a throwaway UEFI VM on ubongo: boots it, applies the base role with the ubongo
> overlay (INPUT-only default-deny), **reboots it**, and asserts the ruleset + SSH-returns.
> This proves the change survives a reboot before the real control node is ever touched
> (spec §cutover step 1; FRICTION signal-6). No code change / no commit — a validation gate.
- [ ] **Step 1: Ensure the vault is unlocked**
The run loads `inventories/production/group_vars/all/vault.yml` (symlinked into the run dir), which is decrypted at playbook load.
Run: `rbw unlocked || rbw unlock`
Expected: exits 0 (unlocked). If it prompts, the operator unlocks.
- [ ] **Step 2: Run the integration cycle**
Run: `make test-integration HOST=ubongo`
Expected (the `cycle`: up → apply → reboot → assert): the VM gets a `192.168.150.x` lease; `site.yml --tags base` applies cleanly; `… rebooted (boot_id changed), SSH back at 192.168.150.x`; then `VERIFY PASSED for boma-it-ubongo-…`. The VM is destroyed on success.
- [ ] **Step 3: On failure, read the diagnostics**
If it prints `VERIFY FAILED`, diagnostics are in `~/integration-runs/boma-it-ubongo-<id>/` (`nft.txt`, `console.log`, `journal.txt`). The likely suspects: the admin-addr/forward assertion (Task 1/3 wiring) or SSH not returning post-reboot (the `base__firewall_control_addr: 192.168.150.1` lifeline in the overlay). Fix the implicated task, re-commit, and re-run Step 2. Re-run `make test-integration-clean` first if a VM was left defined.
- [ ] **Step 4: Record the result**
Capture the `VERIFY PASSED` line in the task notes (this is the gate Task 5 step 1 depends on). No commit.
---
### Task 5: Live staged cutover (operator-supervised — NOT a subagent task)
> Touches the **real ubongo** (the control node Ansible runs from) and reboots it — lockout-
> risky. Run it interactively with the operator, in order, verifying each step before the
> next. The firewall auto-rollback timer (`base__firewall_rollback_timeout`, 45 s) +
> `wait_for_connection` over the live path is the safety net; the **on-prem physical console**
> is the permanent break-glass. Do NOT hand this to an unattended agent.
- [ ] **Step 1: Pre-checks (gate: Task 4 GREEN)**
- `rbw unlocked || rbw unlock`.
- SSH to ubongo over `wt0` from a road-warrior succeeds.
- SSH to ubongo from mamba on the LAN (`10.20.10.50`) succeeds.
- `.venv/bin/ansible ubongo -i inventories/production/ -m ping` → `SUCCESS` (over `10.20.10.151`).
- The physical console is reachable. If any path fails, STOP.
- [ ] **Step 2: Dry-run the firewall apply**
Run: `make check PLAYBOOK=site LIMIT=ubongo TAGS=firewall`
Expected: the nftables diff shows `policy drop` on input, `iifname "wt0" … accept`, `ip saddr 10.20.10.151 … accept`, `ip saddr 10.20.10.50 … accept`, and the forward chain as `policy accept`. No errors.
- [ ] **Step 3: Apply the host firewall (auto-rollback armed)**
Run: `make deploy PLAYBOOK=site LIMIT=ubongo TAGS=firewall`
Expected: the firewall concern snapshots `/etc/nftables.rollback`, arms the 45 s `systemd-run` revert, applies the ruleset, `reset_connection` → `wait_for_connection` over `10.20.10.151` succeeds, then cancels the timer. If connectivity is lost, the timer reverts the ruleset within 45 s and the console is the fallback.
- [ ] **Step 4: Verify every path + forwarding still works**
```bash
# from a road-warrior over wt0, and from mamba on the LAN:
ssh sjat@100.99.146.14 true && echo "wt0 OK"
ssh sjat@10.20.10.151 true && echo "mamba-LAN OK" # run from mamba (10.20.10.50)
# Ansible self-path:
.venv/bin/ansible ubongo -i inventories/production/ -m ping
# a disallowed LAN host (any address NOT in base__firewall_admin_addrs — i.e. not 10.20.10.50 / 10.20.10.17) must now be refused/timeout on :22
# Docker egress (forward chain still permissive):
docker run --rm busybox wget -qO- https://cloudflare.com/cdn-cgi/trace | head -1
# libvirt-NAT forwarding intact — a fresh integration VM still reaches apt:
make test-integration HOST=ubongo # expect VERIFY PASSED (proves the NAT path survived)
```
Expected: `wt0 OK`, `mamba-LAN OK`, Ansible `SUCCESS`, the disallowed host refused, the Docker egress line returns, and the integration cycle passes.
- [ ] **Step 5: Reboot resilience — while the console is present (FRICTION signal-6)**
With the operator at the physical console, reboot ubongo (`sudo systemctl reboot`). After it returns, confirm SSH comes back on all paths **unaided**:
```bash
ssh sjat@100.99.146.14 true && echo "wt0 OK after reboot"
.venv/bin/ansible ubongo -i inventories/production/ -m ping
```
Expected: SSH returns with no manual intervention (no `ListenAddress`, so nothing to race). Only now is the cutover complete.
- [ ] **Step 6: Update STATUS + ROADMAP**
- In `STATUS.md`: in the `roles/base/` row of "Scaffolded but empty", change the firewall note — the `firewall` concern is now **applied to ubongo** as INPUT-only default-deny (it is no longer "not yet applied to any host"); note the `base__firewall_input_only` knob and that the forward default-deny still awaits the `docker_host` drop-in for real service hosts. Add the ubongo control-node row's "Pending" item for default-deny → done.
- In `docs/ROADMAP.md`: mark **mesh-hardening sub-project 2 (ubongo default-deny) done**; the remaining follow-on is sub-project 1 (askari SSH→`wt0` *redesign*) and sub-project 3 (NetBird ACL). Update the "Next step" section accordingly.
```bash
git add STATUS.md docs/ROADMAP.md
git commit -m "docs: ubongo INPUT-only default-deny applied (mesh-hardening 2/3 done)
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
```
- [ ] **Step 7: Push**
Run: `git push origin main`
---
## Self-review (against the spec)
- **§ Design — INPUT-only default-deny** → Task 1 (forward-policy knob) + Task 2 (enabled on ubongo). ✓
- **§ Design — admin-addrs (operator workstations on LAN)** → Task 1 (`base__firewall_admin_addrs` + template loop) + Task 2 (`10.20.10.50` mamba, `10.20.10.17`). ✓
- **§ Design — no sshd ListenAddress change** → nothing touches `ssh.yml`/`sshd_hardening.conf.j2`; only nftables. ✓ (verified: Tasks 1–3 file lists exclude them).
- **§ allow-list** (lo, established, wt0, ssh-from-control, admin-addr, icmp; forward accept) → template already renders lo/established/wt0/control/icmp; Task 1 adds admin-addr + forward-accept. ✓
- **§ Why-safe (incident signals 1/2/3/6)** → signal 1 (forward accept, Task 1); signal 2 (no ListenAddress); signal 3 (ubongo keeps LAN + console); signal 6 (Task 4 harness reboot + Task 5 step 5 reboot-while-console). ✓
- **§ New & changed code** (defaults, template, molecule, group_vars/control, integration profile) → Tasks 1–3. ✓
- **§ admin raw-leases + revisit** → Task 2 comments record both leases + the OPNsense-reservation revisit trigger; backstop (wt0) noted; flagged in `FRICTION.md`. ✓
- **§ Testing** (Molecule render asserts; `make test-integration HOST=ubongo`; live checks) → Task 1 (Molecule), Task 4 (harness), Task 5 step 4 (live). ✓ Coverage split (default in Molecule, input_only on the VM) noted in Task 1.
- **§ Staged cutover (signal-6 order)** → Task 5 steps 1–7; reboot-recovery (step 5) precedes nothing that retires a break-glass (the console is permanent). ✓
- **§ Risks/rollback** → auto-rollback (Task 5 step 3), redundant paths + physical console, raw-lease backstop. ✓
- **Type/name consistency:** `base__firewall_input_only` (bool) and `base__firewall_admin_addrs` (list) are spelled identically in defaults, template, converge, group_vars, and the overlay. `integration_profile` is spelled identically in both overlays and the three gates in `verify.yml`. ✓
- **Placeholder scan:** no TBD/TODO; every code/command step shows the actual content. ✓

View file

@ -1,237 +0,0 @@
# Mesh SPOF — accept + targeted resilience — Implementation Plan
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
**Goal:** Accept askari's single-coordinator SPOF as a documented availability trade-off, and harden the one real gap — a `base` mesh knob that pins the coordinator FQDN in `/etc/hosts` on managed mesh hosts so a local-DNS hiccup can't strand the mesh.
**Architecture:** One additive, idempotent `base` `mesh`-concern task (a `/etc/hosts` line via `lineinfile`, gated on a new opt-in knob), Molecule-tested; plus documentation (accepted-risk R8 + an ADR-016 availability amendment + STATUS/ROADMAP). No new infra, no Terraform, no live-deploy gate.
**Tech Stack:** Ansible (`base` role, `lineinfile`), Molecule (Debian 13), Markdown docs.
**Spec:** `docs/superpowers/specs/2026-06-20-mesh-spof-accept-resilience-design.md`
## Global Constraints
- **FQCN always** (`ansible.builtin.*`); role defaults use the `rolename__var` namespace.
- **No new collection** — derive the coordinator FQDN with builtin `regex_replace` (NOT `urlsplit`, which would pull in `community.general`).
- The pin is **opt-in and additive**: gated on `base__mesh_enabled | bool` AND `base__mesh_coordinator_pin | length > 0`. Empty knob (the default) = a clean no-op. The coordinator host (`askari`/`offsite_hosts`) is **exempt** — leave its pin empty.
- **askari's coordinator IP = `77.42.120.136`** (stable WAN; the A record for `netbird.askari.wingu.me`); ubongo is in the `control` group.
- `make lint` clean + `rbw unlocked` before any commit (the pre-commit hook decrypts the vault).
- **No new infra** — no P2P, no second relay/coordinator, no Terraform. The coordinator off-site backup is **out of scope** (ADR-022 kickoff).
- Tags: the new task carries the `mesh` concern tag (it belongs to the mesh concern).
---
### Task 1: `base` mesh coordinator-FQDN `/etc/hosts` pin (DNS-resilience)
Add an opt-in knob that pins the coordinator FQDN (derived from `base__mesh_management_url`) to a stable IP in `/etc/hosts`, so a managed mesh host survives a local-DNS failure. TDD'd through the role's Molecule scenario (which already exercises the `mesh` concern with `manage: false`).
**Files:**
- Modify: `roles/base/defaults/main.yml` (add the knob after the mesh block, ~line 53)
- Modify: `roles/base/tasks/mesh.yml` (append the pin task)
- Modify: `roles/base/molecule/default/converge.yml` (add a fixture pin to the vars block)
- Modify: `roles/base/molecule/default/verify.yml` (assert the rendered `/etc/hosts` line)
- Modify: `inventories/production/group_vars/control/vars.yml` (set the pin for ubongo)
**Interfaces:**
- Produces: role default `base__mesh_coordinator_pin` (string, default `""`); when set + `base__mesh_enabled`, an `/etc/hosts` line `<pin-ip> <fqdn>` where `<fqdn>` is `base__mesh_management_url` minus scheme/port/path.
- [ ] **Step 1: Write the failing Molecule test (fixture + assertion)**
In `roles/base/molecule/default/converge.yml`, add one line to the `vars:` block (after `base__mesh_setup_key`, ~line 15):
```yaml
base__mesh_coordinator_pin: "203.0.113.9" # fixture coordinator IP (TEST-NET-3); pins the FQDN from base__mesh_management_url
```
In `roles/base/molecule/default/verify.yml`, append to the `tasks:` list (after the mesh no-op assertion at the end):
```yaml
- name: Read /etc/hosts (coordinator pin)
ansible.builtin.slurp:
src: /etc/hosts
register: _etchosts
- name: Assert the coordinator FQDN is pinned to the fixture IP (DNS-resilience / R8)
ansible.builtin.assert:
that:
- "'203.0.113.9 netbird.askari.wingu.me' in (_etchosts.content | b64decode)"
fail_msg: "base__mesh_coordinator_pin did not render the /etc/hosts coordinator pin"
success_msg: "coordinator FQDN pinned in /etc/hosts"
```
- [ ] **Step 2: Run Molecule to verify it fails**
Run: `make test ROLE=base`
Expected: FAIL at "Assert the coordinator FQDN is pinned…" — no pin task exists yet, so `/etc/hosts` has no such line.
- [ ] **Step 3: Add the default knob**
In `roles/base/defaults/main.yml`, after `base__mesh_version` (~line 53), add:
```yaml
# DNS-resilience (ADR-016 availability / accepted-risk R8): when set to the coordinator's
# stable IP, pin the coordinator FQDN (derived from base__mesh_management_url) in /etc/hosts
# so a managed mesh host survives a local-DNS hiccup (the 2026-06-18 incident class). Empty
# = no pin. The coordinator host itself (askari/offsite_hosts) is exempt — leave it empty.
base__mesh_coordinator_pin: ""
```
- [ ] **Step 4: Add the pin task**
Append to `roles/base/tasks/mesh.yml`:
```yaml
- name: Pin the NetBird coordinator FQDN in /etc/hosts (DNS-resilience, ADR-016 availability / R8)
ansible.builtin.lineinfile:
path: /etc/hosts
regexp: '\s{{ _coordinator_fqdn | regex_escape }}$'
line: "{{ base__mesh_coordinator_pin }} {{ _coordinator_fqdn }}"
state: present
vars:
_coordinator_fqdn: "{{ base__mesh_management_url | regex_replace('^https?://', '') | regex_replace('[:/].*', '') }}"
when:
- base__mesh_enabled | bool
- base__mesh_coordinator_pin | length > 0
tags: [mesh]
```
(`_coordinator_fqdn` strips the scheme then anything from the first `:`/`/` → `netbird.askari.wingu.me`. The `regexp` matches an existing ` <fqdn>` at line end so a changed IP updates in place — idempotent; absent → appended.)
- [ ] **Step 5: Run Molecule to verify it passes**
Run: `make test ROLE=base`
Expected: PASS — the new assertion is green and Molecule idempotence is clean (re-running the pin task reports `ok`, not `changed`). The idempotence pass is what proves the `regexp` matches the line it wrote.
> Note: the empty-knob no-op (the production default for non-mesh / coordinator hosts) is guaranteed by the `when: base__mesh_coordinator_pin | length > 0` gate, not a separate Molecule case — a single converge can't hold both var-states, and boma uses one default scenario per role. The fixture exercises the meaningful path (rendering + FQDN extraction + idempotence).
- [ ] **Step 6: Wire the production pin for ubongo**
In `inventories/production/group_vars/control/vars.yml`, after the `base__mesh_enabled: true` block, add:
```yaml
# DNS-resilience (ADR-016 availability / R8): pin the coordinator FQDN to askari's stable WAN
# IP in /etc/hosts so a local-DNS hiccup (the 2026-06-18 incident class) can't strand ubongo's
# mesh. askari (offsite_hosts) is exempt — it reaches the coordinator locally.
base__mesh_coordinator_pin: "77.42.120.136"
```
- [ ] **Step 7: Lint and commit**
```bash
rbw unlocked && make lint
git add roles/base/defaults/main.yml roles/base/tasks/mesh.yml \
roles/base/molecule/default/converge.yml roles/base/molecule/default/verify.yml \
inventories/production/group_vars/control/vars.yml
git commit -m "feat(base): pin the NetBird coordinator FQDN in /etc/hosts (mesh DNS-resilience)" \
-m "Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
```
---
### Task 2: Accept + document the SPOF (R8, ADR-016 amendment, STATUS/ROADMAP)
Record the single-coordinator SPOF as a conscious, revisitable trade-off and capture the availability analysis + recovery. Pure documentation; references the pin from Task 1.
**Files:**
- Modify: `docs/security/accepted-risks.md` (add row R8; bump the review date)
- Modify: `docs/decisions/016-mesh-vpn.md` (add the availability amendment subsection)
- Modify: `STATUS.md` (note the SPOF accepted + the coordinator-pin knob)
- Modify: `docs/ROADMAP.md` (mark sub-project 3 addressed; surface ADR-022 backup + ACL as next)
- [ ] **Step 1: Add accepted-risk R8**
In `docs/security/accepted-risks.md`, add this row to the table after R7:
```markdown
| R8 | **Single off-site mesh coordinator is an availability SPOF for remote mesh access** — `askari` hosts the only NetBird management/signal/relay (ADR-016); while askari is down, every *relayed* peer (all of `ubongo`'s, by the deliberate default-deny posture) loses remote mesh reachability and the control plane pauses. The `netbird_coordinator` store also has **no off-site backup yet** (BACKUP.md), so an askari loss loses mesh control-plane state until rebuilt | Inherent to ADR-016's deliberate single off-site coordinator (sovereignty; survives a homelab outage). **Narrow blast radius:** the mesh is not a gateway (`wt0` routes only `100.99.0.0/16`) — LAN, intra-cluster, and local-service traffic are unaffected; only remote/off-LAN mesh access breaks, and only when off-LAN *and* askari is down at once. askari is a reliable always-on VPS; mitigations: client + managed-host coordinator-FQDN DNS pin (`base__mesh_coordinator_pin`; runbook), documented `/setup` rebuild | askari proves unreliable; the cluster grows to depend on the mesh for intra-node traffic; remote mesh access becomes business-critical; or the ADR-022 backup role lands (closes the state-loss half) |
```
Then update the closing line's date: change `_Last reviewed: 2026-06-18.` to `_Last reviewed: 2026-06-20.`
- [ ] **Step 2: Add the ADR-016 availability amendment**
In `docs/decisions/016-mesh-vpn.md`, add this subsection immediately before the `## Related` section:
```markdown
## Availability — an `askari` outage (amendment 2026-06-20)
The coordinator is deliberately **single** (one off-site host). Recorded here so its
availability envelope is explicit; accepted as **R8** (`docs/security/accepted-risks.md`).
The mesh is **not** a default gateway — `wt0` routes only the overlay CIDR (`100.99.0.0/16`);
normal traffic uses the host's default route. So an `askari` outage has a **narrow blast
radius**:
| Traffic | `askari` down |
|---|---|
| LAN device → LAN service (direct / via reverse proxy) | unaffected |
| node ↔ node over LAN IPs (cluster) | unaffected |
| node ↔ node same-LAN over mesh IPs | unaffected (direct P2P) |
| **road-warrior → `ubongo` (remote, relayed)** | **breaks** |
| mesh control plane (new enrol / ACL change / re-handshake) | pauses |
Only remote (off-LAN) mesh access to peers is lost, and only when off-LAN **and** `askari`
is down simultaneously. On-LAN access to `ubongo` never depends on the mesh (Recovery &
operations, above).
**Recovery:** rebuild the coordinator (`/setup` + re-enrol peers, M5) or restore from backup
once ADR-022 lands; the `netbird_coordinator` store backup is the **next sub-project** (its
gap is named in R8 and `BACKUP.md`). Client/road-warrior break-glass (reliable resolvers +
the coordinator-FQDN `/etc/hosts` pin) is in `docs/runbooks/netbird-client.md`; managed mesh
hosts get the same pin via `base__mesh_coordinator_pin`.
**Not pursued** (deliberately, given the narrow blast radius): direct P2P (punctures the
default-deny posture; only helps established sessions), a second relay (needs another public
host / reintroduces the home public surface), a second coordinator (unsupported by
self-hosted NetBird; against this ADR).
```
- [ ] **Step 3: Update STATUS.md**
In `STATUS.md`, in the `roles/base/` row, append to the end of the firewall/mesh description (before the closing ` |`): a sentence noting the pin and the accepted SPOF:
```markdown
The `mesh` concern also pins the coordinator FQDN in `/etc/hosts` (`base__mesh_coordinator_pin`, set for ubongo) so a local-DNS hiccup can't strand the mesh; the single-coordinator SPOF is an accepted availability risk (R8, ADR-016 availability amendment).
```
- [ ] **Step 4: Update ROADMAP.md**
In `docs/ROADMAP.md`, in the "Remaining mesh-hardening sub-projects" list, change item 3 from the SPOF-reduction "(next)" wording to **DONE**, and make the NetBird ACL the next item. Replace the current items 3–4 block with:
```markdown
3. ~~**askari relay-SPOF reduction**~~ → **DONE (2026-06-20)** — assessed + **accepted** as a
documented availability risk (R8 + ADR-016 availability amendment): the blast radius is
narrow (LAN/intra-cluster/local traffic never touch askari), so no P2P / second relay /
second coordinator was warranted. Hardened the one real gap — a managed-host coordinator-FQDN
DNS pin (`base__mesh_coordinator_pin`). The coordinator off-site backup gap is handed to ADR-022.
4. **NetBird ACL off Allow-All** to scoped policies (open mechanism question — no headless API path).
5. **ADR-022 backup kickoff** — off-site backup of the `netbird_coordinator` store (named in R8 /
BACKUP.md) as the first slice of the backup role (restic + the `fisi` pull node).
```
- [ ] **Step 5: Consistency check + commit**
```bash
grep -q "^| R8 " docs/security/accepted-risks.md && \
grep -q "Availability — an .askari. outage" docs/decisions/016-mesh-vpn.md && \
echo "docs OK"
```
Expected: `docs OK`.
```bash
rbw unlocked
git add docs/security/accepted-risks.md docs/decisions/016-mesh-vpn.md STATUS.md docs/ROADMAP.md
git commit -m "docs(security): accept the single-coordinator mesh SPOF (R8) + ADR-016 availability amendment" \
-m "Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
```
---
## Notes / out of scope
- **Coordinator off-site backup → ADR-022 kickoff** (next sub-project). Not built here.
- **Direct P2P / second relay / second coordinator** — deliberately not pursued (spec §Design).
- No live deploy is required to land this — the pin is additive/idempotent and applies to ubongo on the next routine `base` apply (`make deploy PLAYBOOK=site LIMIT=ubongo`, operator's discretion). Optional post-deploy spot-check: `getent hosts netbird.askari.wingu.me` on ubongo resolves to `77.42.120.136`.

View file

@ -1,156 +0,0 @@
# Spec — Mesh-hardening (1 of 3): move askari's SSH onto `wt0`
Status: Accepted (2026-06-17)
## Context & scope
The **mesh-hardening follow-on** was deferred from M5 (ROADMAP). It was decomposed into
**three independent sub-projects**, each with its own spec → plan → implementation cycle:
1. **askari SSH → `wt0`** ← *this spec*
2. ubongo nftables default-deny + `ssh-from-control` (its own later spec)
3. NetBird ACL off Allow-All → scoped policies (its own later spec)
This spec covers **only (1)**. It makes askari's SSH reachable **only over the NetBird mesh
interface `wt0`** and closes the WAN `:22` surface at both the host and the Hetzner Cloud
Firewall. It does **not** touch ubongo, the NetBird ACL (stays Allow-All for now — one
moving access-layer at a time), or askari's public service exposure (Caddy 80/443, NetBird
STUN 3478 stay on the WAN).
Current state (STATUS.md): askari is reached at `ansible_host: 77.42.120.136` (WAN, in the
TF-generated `inventories/production/offsite.yml`); `wt0` is up at `100.99.226.39`
(Management+Signal Connected, M5); the base nftables `firewall` concern is **built but not
applied** to askari (the Hetzner Cloud Firewall is its perimeter today); the Hetzner Cloud
Firewall (`terraform/modules/hetzner_vm`) opens `:22` from `var.ssh_admin_cidrs` plus
80/443/3478 from anywhere.
## Goal / success criteria
- SSH to askari succeeds over `wt0` (from ubongo) and **fails from any off-mesh source**.
- The WAN `:22` surface is closed at **both** layers (host nftables = `wt0`-only; Hetzner
Cloud Firewall drops the `:22` rule).
- Public services are unaffected: `https://test.askari.wingu.me` and
`https://netbird.askari.wingu.me` serve valid certs; STUN `3478/udp` still answers.
- Ansible manages askari over `wt0`.
- Break-glass is the **Hetzner web console** (out-of-band; works even if the mesh is down).
- A reboot of askari does **not** lock SSH out (the boot-race below is solved).
## Design — three enforcement layers (defense-in-depth)
1. **sshd** binds `ListenAddress` to askari's `wt0` IP only, so it does not accept on WAN.
2. **host nftables** (base `firewall` concern, ADR-020): catalog-driven default-deny;
`:22` allowed only via `iifname "wt0"` (the interface-name match that survives `wt0`
being absent — see `docs/testing/gotchas.md`); public service ports stay open on WAN.
3. **Hetzner Cloud Firewall** (Terraform): the `:22` `ssh_admin_cidrs` rule is removed;
80/443/3478 stay.
## The boot-race fix (load-bearing)
`wt0` is brought up by NetBird **after** boot, so at sshd start the `wt0` IP may not exist
yet. A plain `ListenAddress 100.99.226.39` would fail to bind → sshd exits → **lockout on
reboot**. Solution:
- **`net.ipv4.ip_nonlocal_bind = 1`** via a sysctl drop-in (`ansible.posix.sysctl`,
persisted under `/etc/sysctl.d/`). This lets sshd bind the `wt0` address even before the
interface exists; once `wt0` comes up with that IP, traffic is delivered to the existing
listener — no reload needed.
- The sshd drop-in **fails closed**: the mesh IP is resolved (see below) and the play
**asserts it is non-empty** before rendering. An empty `ListenAddress` would silently
fall back to listening on all interfaces, defeating the restriction — that must never
render.
**Mesh-IP source (decided):** the **live `wt0` fact** `ansible_wt0.ipv4.address`, gathered
at apply time (`wt0` is up during the play, since M5), with a **`host_var` fallback**
(`base__ssh_listen_addr`, default `""`) and a fail-closed `assert` that one of them yielded
a non-empty address. Live fact is preferred (correct even if NetBird reassigns the IP);
the host_var is an explicit override / belt.
## New & changed code
**Role `base` (the `hardening` + `firewall` concerns):**
- `roles/base/defaults/main.yml` — add:
- `base__ssh_listen_mesh_only: false` — opt-in; when `true`, sshd binds the mesh IP only.
- `base__ssh_listen_addr: ""` — optional explicit mesh-IP override (fallback to the
`ansible_wt0` fact).
- `roles/base/tasks/ssh.yml`
- resolve the mesh IP (`base__ssh_listen_addr` or `ansible_wt0.ipv4.address`) into a fact;
- `assert` it is non-empty **when** `base__ssh_listen_mesh_only`;
- set `net.ipv4.ip_nonlocal_bind = 1` (sysctl drop-in) under the same condition.
- `roles/base/templates/sshd_hardening.conf.j2` — append a conditional
`ListenAddress {{ resolved_mesh_ip }}` block guarded by `base__ssh_listen_mesh_only`
(unset → unchanged behaviour: listen on all). Keep the existing `sshd -t` validation.
**Inventory:**
- `inventories/production/host_vars/askari.yml` (new) — `ansible_host: 100.99.226.39`
(overrides the TF-generated `offsite.yml`; host_vars are not regenerated by
`tf_to_inventory.py`). A header comment explains why.
- `inventories/production/group_vars/offsite_hosts/vars.yml` — add
`base__ssh_listen_mesh_only: true`; ensure `base__firewall_apply: true`.
(`base__mesh_enabled` is already `true` for askari — set in M5 — and is a precondition,
not a change here.)
**Firewall catalog** (`inventories/production/group_vars/all/firewall.yml`):
- Enumerate askari's required ingress so catalog-driven default-deny does **not** drop a
live public service. Derived from the existing `reverse_proxy` + `netbird_coordinator`
definitions: `:22/tcp` on the **mesh** zone (`wt0`); `80,443/tcp` + `3478/udp` on the
**public** zone (WAN). The exact catalog/zone YAML is finalised in the implementation
plan against the `resolve_firewall_rules` filter's schema.
**Terraform** (`terraform/environments/offsite` + `terraform/modules/hetzner_vm`):
- Remove the WAN `:22` ingress rule (e.g. drop `ssh_admin_cidrs` from the firewall, or set
it empty and guard the rule). Keep 80/443/3478. Applied via `make tf-plan/apply
TF_ENV=offsite` (plan shown before apply).
## Staged cutover — a working path at every step
1. **Pre-check:** confirm `ssh sjat@100.99.226.39` and an `ansible askari -m ping` forced
over `wt0` both succeed **before** changing anything.
2. **Repoint Ansible:** add `host_vars/askari.yml` (`ansible_host` = `wt0` IP); verify
`ansible askari -m ping` runs over the mesh. WAN `:22` still open as a fallback here.
3. **Apply `base` (firewall + sshd together):** one `make deploy PLAYBOOK=site LIMIT=askari`
converge applies catalog default-deny (`:22` on `wt0` + public ports) **and** the sshd
`ListenAddress`=mesh + `ip_nonlocal_bind` drop-in. The firewall concern's
`reset_connection` → `wait_for_connection` (now over `wt0`) plus the armed auto-rollback
timer (`base__firewall_rollback_timeout`, 45 s) is the safety gate — a bad ruleset
reverts itself. The sshd `reload` cannot drop the in-flight `wt0` session. Verify the
public services still respond.
4. **Retire the Hetzner WAN `:22`:** the Terraform change above; `make tf-plan
TF_ENV=offsite` (review) → `make tf-apply`. Verify: `wt0` SSH works; off-mesh `nc -vz
77.42.120.136 22` is refused/times out; `:443` open; STUN answers.
## Testing
- **Molecule** (base `default` scenario; `wt0` absent in-container, `base__firewall_apply:
false` render-only): assert (a) the rendered nftables allows `:22` via `iifname "wt0"`;
(b) with `base__ssh_listen_mesh_only: true` + a fixture mesh IP, the sshd drop-in renders
`ListenAddress <ip>` and `sshd -t` passes; (c) with the flag set but **no** resolvable
mesh IP, the play **fails closed** (the `assert`); (d) the `ip_nonlocal_bind` sysctl task
is present. Keep existing firewall/hardening assertions green.
- **Live, out-of-band:** post-cutover, from an off-mesh host `nc -vz 77.42.120.136 22` →
refused; `:443` → open; from ubongo over `wt0`, SSH + `ansible -m ping` succeed; reboot
askari (Hetzner console) and confirm SSH-over-`wt0` returns without intervention.
## Risks & rollback
- **Mid-cutover lockout:** mitigated by the staged order (a path open at each step), the
firewall auto-rollback timer, and `ansible_host`=`wt0` so the connectivity confirm tests
the real new path.
- **Reboot lockout:** mitigated by `ip_nonlocal_bind` (sshd binds `wt0` regardless of
interface timing) + the fail-closed assert (never silently listen-all).
- **Default-deny breaks a public service:** mitigated by enumerating all live ingress into
the catalog and the §Testing service checks; reversible by re-running with
`base__firewall_apply: false` or widening the catalog.
- **Ultimate break-glass:** the Hetzner web console (out-of-band). The TF `:22` rule is
trivially re-addable.
## Out of scope / follow-ons
- ubongo default-deny + `ssh-from-control` (sub-project 2).
- NetBird ACL off Allow-All (sub-project 3) — until then any enrolled peer can reach
askari's `wt0:22`; scoping that is sub-project 3's job.
- `/check-access` (ADR-021) live verification — designed, build still pending.
- STATUS.md / ROADMAP updates land with the implementation, not this spec.

View file

@ -1,267 +0,0 @@
# Local VM integration testing on ubongo (design)
**Status:** Designed, not built. Resolves `docs/TODO.md` item 2.4 (Local VM integration
testing on ubongo, pre-deploy).
**Date:** 2026-06-18.
**Implements:** the concrete build of ADR-008 Level 2/3 (staging/integration), deferred
for lack of hosts but hostable on ubongo. To be recorded as **ADR-025**.
## Context
Molecule (ADR-008 Level 1) tests each role in a single Docker container: one `converge`,
no real kernel netfilter, no real Docker daemon in the loop, and **no reboot**. That
structurally cannot catch an entire class of bug — reboot-survivability, host-firewall ×
Docker interaction, and boot-ordering — which is exactly the class that caused the
**2026-06-17 mesh-hardening incident**:
- `base`'s nftables `forward { policy drop; }` killed the askari Docker host **on reboot**
(nftables loaded its default-deny *before* Docker, breaking published-port DNAT and
inter-container forwarding → public services + the mesh went down). It had worked right
after `make deploy`, when Docker's runtime rules still coexisted. (FRICTION 2026-06-17 #1.)
- `ip_nonlocal_bind` did **not** beat the sshd boot-race; sshd bound to the `wt0` mesh IP
had no listener at boot. (FRICTION #2.)
- The coordinator host could not bootstrap the mesh it itself hosts. (FRICTION #3.)
- NetBird `netbird-server` FATAL-loops on the GeoLite2 download when egress is lost — and
egress was lost when `nft flush` wiped Docker's NAT masquerade. (FRICTION #4.)
Recovery needed the Hetzner console + a WAN-SSH break-glass. The lesson, already crystallised
as a standing rule: *firewall/sshd/boot changes must be tested on a real VM with a real
reboot before they touch a live host, and a non-mesh break-glass must be kept.*
This spec defines a way for the agent to spin up **throwaway KVM VMs locally on ubongo**
that mirror a target host (real Docker, a real reboot, the real role apply) and validate
risky infra changes **before** a live deploy. ubongo can host this today:
> verified: ubongo KVM capability · Bash (this session) · `/dev/kvm` present + accessible
> (kvm group), Intel VT-x (`vmx`) enabled, 8 vCPU (i3-10100T), ~13 GiB RAM free of 16, ~198
> GiB disk free; libvirt/QEMU/Vagrant **not yet installed** · 2026-06-18.
## Goals
- Reproduce the 2026-06-17 bug class locally: real OS boot, real Docker, real netfilter,
the real role apply, a **real reboot**, then outcome assertions.
- Let the agent drive the full loop autonomously: provision → apply → reboot → assert →
teardown, with diagnostics captured on failure.
- Mirror a *real* host from inventory (first profile: "be askari"), so the apply is
faithful, not synthetic.
- Be the concrete tool that operationalises the standing "test risky infra before live
deploy" rule.
## Non-goals (v1)
- Not a production hypervisor on ubongo (reconciles ADR-015 — see Governance).
- Not nested Proxmox; the provisioning *chrome* (template clone / Terraform) is **not**
mirrored — every incident bug lives in the boot/kernel/Docker layer, not provisioning.
- Not a multi-VM mini-cluster; one VM at a time. (All six 2026-06-17 signals occurred on a
single host that was Docker host + coordinator + mesh peer.) Multi-VM is a later extension.
- Not a CI gate; this is an interactive, agent-driven pre-deploy check on ubongo (CI stays
lint + Molecule per ADR-008/010).
## Decisions (from the 2026-06-18 brainstorm)
1. **Virtualisation approach: libvirt/KVM directly (Approach A).** A golden Debian-13
genericcloud qcow2 cached locally; each run boots an ephemeral qcow2 overlay backed by
it, seeded via cloud-init NoCloud, driven by a **stdlib-only** Python script over
`virsh` (no `libvirt-python` dependency). Chosen over Vagrant+vagrant-libvirt (Ruby/plugin
footprint, box drift from the real cloud image) and terraform-provider-libvirt (poor at
the imperative apply→reboot→re-apply sequence, throwaway state, blurs ADR-006's prod-VM
boundary). Lightest footprint on a 15 GiB control node; full control of the reboot step;
the same Debian cloud image real hosts boot.
2. **Fidelity envelope: real OS/Docker/netfilter/reboot, not the Proxmox provisioning
path.** A lightweight local hypervisor is enough because the bugs are post-boot.
3. **Scope: one throwaway VM at a time, instantiated from a real host's inventory.** First
profile: **"be askari"** (Docker host + NetBird coordinator + mesh peer on one box). The
mechanism is generic — later "be" any host by swapping which inventory host it mirrors.
4. **Acceptance is self-validating against the real failure.** Done = the harness, on a
local VM, applies `base` (firewall on) to a Docker host, reboots, and **observes the
2026-06-17 breakage** (Docker forwarding dead / services down); then, with the
`docker_host` container-forward drop-in in place, the same run **survives the reboot**.
If the first (unfixed) run passes after the reboot, the harness is not faithful.
5. **Tiered cert fidelity via a `--certs` knob** (DNS-01 is what makes real certs possible
with no public inbound — validation is out-of-band via a Gandi TXT record; the VM needs
only outbound to ACME + Gandi, which the NAT net provides):
- `internal` (default) — Caddy `tls internal`, zero deps, instant; for the incident repro
and runs where certs aren't under test.
- `le-staging` — real DNS-01 ACME against Let's Encrypt **staging**: real caddy-gandi
path, real cert files/renewal, untrusted root, effectively no rate limits. **Built in v1.**
- `le-prod-wildcard` — a real trusted `*.test.wingu.me` wildcard, **issued once,
persisted on ubongo, reused** across runs. Wired in v1 but **on-demand only**; its
accepted risk is recorded when used (prod Gandi credential reaching an ephemeral VM;
transient TXT in the real `wingu.me` zone). A deliberate "no-egress" failure scenario
(to reproduce FRICTION #4) forces `internal`, since ACME needs egress.
6. **The toolchain is Ansible-managed**, not hand-installed: a new non-service role
(`integration_test`, `control` group) installs/enables libvirt+QEMU reproducibly. The
repo owns ubongo's state. The driver manages *images* lazily on first run (keeps the role
lean; avoids fiddly download/refresh logic in Ansible).
7. **Stubs live in an overlay file, never in the real inventory** — so `make tf-inventory`
and "don't edit inventory directly" stay intact, and every stub is explicit and reviewable.
8. **A new ADR-025** records this decision (approach + alternatives + cert tiers); ADR-008
gains a pointer and redirects its "what Molecule does NOT test" gaps here.
## Architecture — five isolated components
| # | Component | Purpose | Location |
|---|-----------|---------|----------|
| 1 | **`integration_test` role** (non-service, `control` group) | Install/enable libvirt+QEMU+virtinst, add `sjat`/`claude` to `libvirt` group, create the image-cache dir, drop the driver. Idempotent, Molecule-tested. | `roles/integration_test/` |
| 2 | **`integration-vm.py` driver** | Stdlib-only lifecycle over `virsh`: `up / apply / reboot / assert / cycle / reset / down / prune / console`. Lazily ensures the golden image (download + checksum). | `scripts/integration-vm.py` |
| 3 | **Profiles + var overlays** | Make a VM "become" a host: pull that host's real group_vars/host_vars + layer a small explicit overlay (cert tier, in-VM coordinator endpoint, VM connection). | `tests/integration/overrides/<host>.yml` |
| 4 | **Verify playbook** | Outcome-based post-reboot assertions (Docker up, published-port DNAT alive, `nft` sane, service responds, `wt0` up), reusing Molecule's `verify.yml` philosophy. | `tests/integration/verify.yml` |
| 5 | **Makefile target** | `make test-integration HOST=<name> [CERTS=...] [KEEP=1]` → `cycle`; `make test-integration-clean` → `prune`. Documented in CLAUDE.md's command table. | `Makefile` |
## Lifecycle / data flow
`make test-integration HOST=askari` drives:
```
1. ensure golden image Debian-13 genericcloud qcow2, cached + checksum-verified
2. ephemeral overlay qcow2 backed by golden (throwaway; never mutate golden)
3. cloud-init NoCloud seed hostname + ansible user + ubongo's SSH key + NIC
4. virt-install --import boot on an isolated libvirt NAT net (DHCP IP + outbound NAT)
5. wait for SSH IP via `virsh domifaddr --source lease` (guest-agent optional)
6. transient inventory askari's real vars + ansible_host=<lease IP> + stub overlay
7. ansible-playbook site THE REAL APPLY (base + docker_host + reverse_proxy + coordinator)
8. [snapshot post-apply] optional reset point for fast re-runs
9. virsh reboot ──────────┐ ← the step Molecule structurally cannot do
10. wait for SSH ┘
11. ansible-playbook verify outcome assertions; THIS is where the incident surfaces
12. report + teardown pass/fail; on fail keep VM + dump diagnostics; else destroy overlay
```
Steps 1–7 build a real Docker daemon with real published-port DNAT to break; step 9 is a
real kernel reboot, so nftables loads default-deny before Docker exactly as on askari.
## Fidelity boundary & cert tiers
**Faithful where the bug lives:** real kernel, real netfilter, real Docker with
published-port DNAT, the real role apply, a real reboot, and the coordinator running *inside
the VM* so the VM is its own mesh peer — reproducing the circular mesh-bootstrap (FRICTION #3)
on one box.
**Stubbed where it needs the public internet** (explicit, in the overlay): LE certs via the
`--certs` knob (Decision 5); public DNS (`askari.wingu.me`) → local resolution; NetBird
geo-DB → pre-seeded or requirement disabled (which is *also* the FRICTION #4 fix, so the
harness can test both the FATAL-loop and its remedy).
## Acceptance test (self-validating)
1. Run the cycle on **today's** `base` (firewall on, no `docker_host` container-forward
drop-in) → **step 11 must FAIL after reboot** (Docker forwarding dead, services down).
2. Implement the `docker_host` container-forward rules (the pending fix STATUS.md names) →
re-run → **step 11 must PASS across the reboot.**
**Scope boundary:** the *harness* is this plan's deliverable. The `docker_host`
container-forward fix is a separate work item (FRICTION 2026-06-17 #1). v1's acceptance
deliberately spans both, because a credible harness must demonstrate **both** a true-negative
(red on the broken state) and a true-positive (green on the fixed state) — otherwise we have
only ever watched the assert go red. The plan decides sequencing: build the small
`docker_host` drop-in as the green-half of acceptance, or consume it if built separately
first. Minimum credible v1 is the red half (faithful reproduction); full acceptance is red→green.
This one round-trip proves the harness reproduces the incident, the fix works, and the loop
can be trusted for the next risky change before it touches a live host.
## Robustness, isolation & teardown
**Failure leaves evidence** (catching a bug is the point):
| Step fails | Behaviour |
|---|---|
| Golden image (1) | Fail fast, clear message; image cached (one-time cost) |
| Boot / first SSH (4–5) | **Capture serial console to a log file**, fail with its tail — the automated equivalent of the Hetzner console (ties to TODO 10.8) |
| Apply (7) | Keep VM, surface Ansible output, dump diagnostics |
| **No SSH after reboot (9–10)** | The classic incident signature; FAIL, keep VM, capture console — the harness *succeeding* |
| Assert (11) | FAIL, keep VM, dump post-mortem: `nft list ruleset`, `docker ps`, `ss -tlnp`, `journalctl -b`, `systemd-analyze critical-chain`; exit non-zero |
Diagnostics land in gitignored `~/integration-runs/<ts>-<host>/` (same pattern as ADR-017's
screenshot dir; the agent reads them directly).
**Three safety invariants** (these make the testing tool itself safe):
1. **The transient inventory contains only the test VM** — no real host is ever in scope;
the apply is `--limit`ed to the VM.
2. **"Be askari" points NetBird at the in-VM coordinator (localhost)** — the VM forms its
own one-node mesh; it never enrolls in the real mesh.
3. **Test VMs sit on an isolated libvirt NAT net** — outbound NAT for ACME/image pulls, but
not reachable to the LAN (`10.20.x`) or the real mesh.
**Resource guard** (ubongo's 15 GiB ceiling, ADR-015/012): default VM ≈ 2 vCPU / 3 GiB / 20
GiB thin overlay; the driver refuses to start below a free-RAM threshold and enforces **one
integration VM at a time** (name-prefix `boma-it-*`). **Teardown:** success destroys domain +
overlay; failure keeps them and prints how to inspect; `make test-integration-clean` reaps
all `boma-it-*` orphans. An optional post-apply **snapshot** lets `reset` re-run
reboot+assert without re-applying (fast iteration on a fix).
## Testing the tester
- **pytest** on the driver's pure logic: transient-inventory generation, var/overlay merge,
`--certs`→overlay mapping, DHCP-lease parsing, resource-guard math (mock `virsh`). Joins
boma's existing pytest suite.
- **Molecule** (Docker) on the `integration_test` role: asserts libvirt/qemu/virtinst
installed, `libvirtd` enabled, users in `libvirt` group, driver present. (Cannot run
KVM-in-Docker — the documented Molecule limitation.)
- **End-to-end self-test = the acceptance test above**, run manually on first build and
recorded in the runbook.
## Governance & documentation touch-points
- **ADR-025 "Local VM integration testing"** — decision, approach A, rejected alternatives
(Proxmox-nested / Vagrant / TF-libvirt), cert tiers.
- **ADR-008** — pointer to ADR-025; redirect its "what Molecule does NOT test" gaps
(nftables loading, mesh dataplane) to this level.
- **ADR-015** — one-line reconciliation: "not a hypervisor" → runs *ephemeral KVM test VMs*
as part of its local-test-runner role (still not a production hypervisor); note the
test-VM RAM load.
- **`docs/security/accepted-risks.md`** — the `le-prod-wildcard` risk (prod Gandi credential
→ ephemeral VM; transient TXT in real `wingu.me`).
- **CLAUDE.md** command table + **`docs/runbooks/integration-testing.md`** (run a cycle,
cert knobs, where diagnostics land, inspecting a kept failed VM, pruning) + **STATUS.md**
entry. The runbook's pre-flight line operationalises FRICTION #6 (*validate
reboot-recovery before retiring the break-glass*).
## Capacity
One VM (~3 GiB) against ~13 GiB free is comfortable. The only future pinch is concurrency
with the Level-4 Chromium/Playwright stack (ADR-017) — handled by the resource guard +
"one at a time." Add a note to `docs/hardware/reference.md`; revisit at `/capacity-review`.
## Alternatives considered
- **Proxmox VE nested on ubongo** — highest fidelity incl. the provisioning step, but heavy
(nested virt, RAM), in tension with ADR-015, and the incident bugs don't live in
provisioning. Rejected.
- **Vagrant + vagrant-libvirt** — mature lifecycle/snapshots, but adds the Ruby/Vagrant
ecosystem + a fragile plugin, boxes drift from the real Debian cloud image, and the
reboot→assert sequence still needs custom logic. Rejected.
- **terraform-provider-libvirt** — declarative and reuses TF, but poor at the imperative
apply→reboot→re-apply test sequence, adds throwaway state, and blurs ADR-006's
"TF owns *production* VM existence on Proxmox" boundary. Rejected.
## Open questions / deferred
- **Multi-VM mini-staging** (inter-host mesh/dataplane) — design the driver + NAT net so a
topology is an additive extension; out of scope for v1.
- **Interplay with the Level-4 browser stack** — both want ubongo RAM; the resource guard is
the v1 answer, revisit when Level 4 is built.
- **Snapshot strategy depth** — v1 ships clone-and-destroy + an optional post-apply snapshot;
richer snapshot trees deferred.
## Knowledge to verify at plan stage (ADR-014)
These are from memory / unverified and must be confirmed against version-matched docs before
the plan asserts them:
- Exact `virt-install --import` flags and the cloud-init **NoCloud** seed format on the
Debian-13 libvirt stack.
- Whether the Debian-13 genericcloud image ships `qemu-guest-agent` (IP can come from the
DHCP lease regardless — guest-agent is an optimisation, not a requirement).
- Let's Encrypt **rate limits** (prod vs staging) — to confirm "issue the wildcard once,
reuse" stays within limits.
- The `caddy-dns/gandi` DNS-01 configuration and pinned version already used by
`reverse_proxy`, and whether the Gandi LiveDNS API key can be scoped to `test.wingu.me`.
- libvirt default vs a dedicated isolated NAT network on Debian-13 (`virsh net-*`).

View file

@ -1,216 +0,0 @@
# Spec — Mesh-hardening redesign: askari SSH `wt0`-primary + permanent WAN break-glass
Status: Accepted (2026-06-19)
## Context & scope
The **mesh-hardening follow-on** (deferred from M5) was decomposed into three independent
sub-projects, each with its own spec → plan → implementation cycle. Progress so far:
1. ~~askari SSH → `wt0`~~ — **attempted 2026-06-17, BACKED OUT** after it took askari down
on reboot (spec/plan `docs/superpowers/{specs,plans}/2026-06-17-mesh-hardening-askari-ssh-wt0*`).
2. ubongo nftables INPUT-only default-deny — **DONE 2026-06-19**, reboot-validated
(`base__firewall_input_only`).
3. NetBird ACL off Allow-All → scoped policies — not started.
This spec is the **redesign of (1)**. The operator sequencing decision (2026-06-19) is:
do this redesign **first**, then a separate sub-project to reduce askari's
single-point-of-failure (SPOF) role. **This spec covers only the redesign of (1).** The SPOF
reduction is the named follow-on (its own later spec).
### Why the 2026-06-17 attempt was backed out
Four hazards, drawn from the six 2026-06-17 signals recorded in `docs/FRICTION.md`:
1. **`base`'s `forward policy drop` breaks Docker hosts on reboot** — nftables loaded
default-deny before Docker, so container forwarding/NAT (WAN→Caddy, Caddy→coordinator)
died after reboot.
2. **`ip_nonlocal_bind` did NOT beat the sshd boot-race** — binding sshd `ListenAddress`
to the `wt0` IP still failed at boot ("could not assign the address"); and because
`wt0` never came up, sshd had no listener at all.
3. **The coordinator host can't bootstrap the mesh it depends on** — askari runs the
NetBird coordinator *and* is a mesh peer; its agent needs the local coordinator container
healthy to bring up `wt0`. After an unclean reboot the coordinator was down → `wt0`
never came up → with SSH `wt0`-only, the host was reachable only via the Hetzner console.
General rule: *never make a host's only management path depend on a service that host
itself hosts.*
4. **The coordinator FATAL-loops on the geolocation-DB download with no egress** — a
transient loss of container egress (here: NAT wiped by `nft flush`) crash-loops the whole
control plane.
### What changed since 2026-06-17 (enablers this redesign relies on)
- `docker_host` **container-forward nftables drop-in** (`172ae37`) — reboot-safe Docker
forwarding (available as a later tightening; not required by this pass).
- **`base__firewall_input_only`** — input-only default-deny, forward chain stays
`policy accept` (Docker-safe). **Proven on ubongo and reboot-validated 2026-06-19.**
- The **ADR-025 integration harness** — reproduces a host's boot on a throwaway local VM,
so reboot-safety is proven GREEN before the real host is touched.
## Goal / success criteria
- askari's host nftables firewall is **applied at last** (`base__firewall_apply: true`),
INPUT-only default-deny — matching ubongo.
- **Normal management is over the mesh:** `ansible_host` resolves to askari's `wt0` IP
(`100.99.226.39`); SSH-over-`wt0` and `ansible askari -m ping` over the mesh both succeed.
- **A permanent non-mesh break-glass survives a mesh/coordinator outage**, via two
independent channels:
- the **Hetzner web console** (out-of-band; always works, IP-independent); and
- **WAN `:22` reachable only from ubongo's WAN IP (`91.226.145.80`)**, enforced at *both*
the host nftables layer (`base__firewall_admin_addrs`) and the Hetzner Cloud Firewall.
WAN `:22` is **deliberately NOT closed** — the coordinator-host exception (FRICTION #3).
- **askari survives a reboot under the new firewall, unattended:** Docker forwarding/NAT
intact, `https://test.askari.wingu.me` + `https://netbird.askari.wingu.me` serve valid
certs, STUN `3478/udp` answers, the coordinator container is healthy (geo-DB no longer
FATAL), `wt0` returns, SSH is reachable over both `wt0` and the WAN break-glass.
- **No sshd `ListenAddress` change** (`base__ssh_listen_mesh_only` stays `false`) — this is
what sidesteps the boot-race that sank the 2026-06-17 attempt.
## Design — mirror ubongo 2/3, with the coordinator-host exception
The host firewall does the SSH scoping; sshd is left listening on all interfaces. This is
the ubongo 2/3 pattern, which is proven and reboot-validated.
1. **`base` firewall, INPUT-only default-deny** (`base__firewall_apply: true`,
`base__firewall_input_only: true`): the input chain defaults to `drop`; the forward chain
stays `policy accept` so Docker container forwarding/NAT and published-port DNAT keep
working across a reboot. Allowed ingress:
- `:22/tcp` via `iifname "wt0"` (the interface-name match that survives `wt0` being
absent at boot — `base__firewall_mgmt_interface: wt0`);
- `:22/tcp` from `91.226.145.80` (ubongo's WAN — the break-glass; via
`base__firewall_admin_addrs`);
- the public service surface from the catalog: `80,443/tcp` + `3478/udp` (WAN).
2. **No sshd change.** `base__ssh_listen_mesh_only` stays `false`; sshd keeps listening on
all interfaces. The firewall, not sshd, restricts where `:22` is reachable. There is no
`ListenAddress`, hence no `ip_nonlocal_bind`, hence no boot-race.
3. **The Hetzner Cloud Firewall is unchanged** — the `:22`-from-ubongo rule stays (the
2026-06-17 attempt removed it; this redesign keeps it as the perimeter break-glass).
4. **Coordinator geo-DB robustness** — make the `netbird_coordinator` control plane survive
a transient egress loss (the nat-flush window on apply, and the boot window before Docker
re-adds its NAT), so the coordinator stays healthy and `wt0` can come back. One of:
- **pre-seed** the GeoLite2 DB into the persistent `netbird_data:/var/lib/netbird` volume
so netbird-server finds it locally and never needs to download; or
- **disable / make non-fatal** the geolocation requirement in `config.yaml.j2`.
The exact v0.72.4 mechanism is verified against NetBird's pinned docs at plan time
(ADR-014) — the design fixes the *intent* (a transient egress blip must not FATAL the
control plane); the plan fixes the *knob*.
### Rejected alternatives (these are the 2026-06-17 failures)
- sshd `ListenAddress = wt0 IP` + `ip_nonlocal_bind` → boot-race; did not bind. **Out.**
- `forward policy drop` on a Docker host → broke forwarding on reboot. **Out** (use
`input_only`; the `docker_host` container-forward drop-in is a later tightening).
- Close WAN `:22` entirely → coordinator host left console-only on a bad reboot. **Out**
(keep WAN `:22`-from-ubongo as the always-available non-mesh path).
### How each 2026-06-17 failure is answered
| 2026-06-17 failure | Fix in this design |
|---|---|
| `forward drop` killed Docker on reboot | `base__firewall_input_only: true` → forward stays `accept` |
| `ip_nonlocal_bind` sshd boot-race | no sshd `ListenAddress` change; firewall scopes `:22` by `iifname "wt0"` |
| coordinator chicken-egg / lockout | permanent WAN `:22`-from-ubongo + Hetzner console; management never depends on a service askari hosts |
| coordinator geo-DB FATAL-loop | pre-seed / non-fatal geo so a transient egress blip can't crash the control plane |
## New & changed code
**Inventory:**
- `inventories/production/group_vars/offsite_hosts/vars.yml`
- `base__firewall_apply: true` (was `false`);
- `base__firewall_input_only: true` (new — forward stays `accept`, Docker-safe);
- `base__firewall_admin_addrs: ["91.226.145.80"]` (new — ubongo's WAN, the break-glass;
comment states what it is and why a coordinator host keeps a non-mesh path);
- `base__ssh_listen_mesh_only: false` stays (explicit — no boot-race);
- rewrite the header backout note → "redesigned 2026-06-19: `wt0`-primary + permanent WAN
break-glass; see this spec."
- `inventories/production/host_vars/askari.yml` (**new**) — `ansible_host: 100.99.226.39`
(the `wt0` IP), so Ansible manages askari over the mesh. Overrides the TF-generated WAN
`ansible_host` in `offsite.yml` (host_vars are not regenerated by `tf_to_inventory.py`).
Header comment explains why.
**Role `netbird_coordinator`:**
- The geo-DB robustness change above (`templates/config.yaml.j2` and/or a pre-seed task; the
`templates/docker-compose.yml.j2` volume already persists `/var/lib/netbird`), with
Molecule/verify coverage that the control plane comes up without external geo egress.
**Firewall catalog** (`inventories/production/group_vars/all/firewall.yml`):
- **No change.** It already enumerates askari's public ingress (`reverse_proxy` 80/443,
`netbird_stun` 3478/udp). `:22` is handled by the `base` firewall's built-in SSH rules
(`mgmt_interface` `wt0` + `admin_addrs`), not the catalog.
**Terraform / Hetzner Cloud Firewall:**
- **No change.** The WAN `:22`-from-ubongo rule stays (the perimeter half of the break-glass).
**sshd:**
- **No change.**
## Validation
### Harness-first GREEN gate (ADR-025) — before any live change
A "be askari" integration profile (Docker host + a coordinator-like container on the shared
network + `base__firewall_input_only` + `admin_addrs`), driven through `make
test-integration HOST=askari` (reusing the existing profile/overlay/verify pattern):
- input chain default-deny with `:22` accepted via `iifname "wt0"` **and** from the
break-glass admin address; forward chain `policy accept`;
- published-port DNAT + NAT masquerade survive a **reboot** (the RED→GREEN reboot cycle);
- the coordinator-like container comes up healthy with **no external geo egress**;
- SSH path returns after reboot.
This must be GREEN before the live cutover.
### Live cutover — supervised, console open, break-glass never removed
Sequencing rule (FRICTION #6): validate reboot-recovery while a fallback path is still open.
Because the WAN break-glass is *never* removed in this design, that invariant holds by
construction.
1. **Pre-check:** `ssh sjat@100.99.226.39` (over `wt0`) and `ansible askari -m ping` (forced
over `wt0`) both succeed; public services + STUN healthy.
2. **Repoint Ansible:** add `host_vars/askari.yml` (`ansible_host` = `wt0` IP); confirm
`ansible askari -m ping` runs over the mesh.
3. **Apply `base` (+ the geo-DB fix):** one `make deploy PLAYBOOK=site LIMIT=askari`
converge applies INPUT-only default-deny with the `wt0` + admin-addr SSH allow and the
coordinator robustness change. The firewall concern's armed auto-rollback
(`base__firewall_rollback_timeout: 45`) reverts a bad ruleset. Then a post-apply
`restart docker` rebuilds NAT (base's `flush ruleset` wipes Docker's nat — FRICTION); the
coordinator now survives the egress window thanks to the geo-DB fix.
4. **Verify the new steady state:** public services serve valid certs; STUN answers; SSH
over `wt0` works; SSH over the WAN break-glass (`91.226.145.80` → `:22`) works.
5. **Reboot resilience (the real test):** reboot askari (Hetzner console available) and
confirm — with no intervention — Docker forwarding/NAT, public services, the coordinator,
`wt0`, and SSH (both paths) all return.
## Risks & rollback
- **ubongo's WAN IP anchors the break-glass.** If it is dynamic and rotates, the host
`admin_addrs` rule and the Hetzner FW rule must be updated. The **Hetzner console** is the
IP-independent ultimate break-glass. (Confirmed static by the operator 2026-06-19; it is
also already the Hetzner FW assumption today.)
- **Mid-cutover lockout:** mitigated by the staged order (a path open at each step), the
firewall auto-rollback timer, `ansible_host` = `wt0` (the confirm tests the real new path),
and the WAN break-glass that is never removed.
- **Reboot lockout:** mitigated by `iifname "wt0"` scoping (no sshd boot-race), the WAN
break-glass, the geo-DB fix (coordinator survives the egress window), and harness GREEN.
- **Default-deny breaks a public service:** mitigated by the catalog already enumerating all
live ingress and the §Validation service checks; reversible via `base__firewall_apply:
false`.
- **Ultimate break-glass:** the Hetzner web console (out-of-band).
## Out of scope / follow-ons
- **SPOF reduction (the next sub-project)** — reduce askari's single-point-of-failure role
(currently `ubongo → askari` is `Relayed` through askari's own relay; if askari is down the
mesh data plane for relayed peers is down). Its own spec, after this.
- **NetBird ACL off Allow-All** — until then any enrolled peer can reach askari's `wt0:22`;
scoping that is a separate sub-project.
- **Full forward-chain hardening** — the `docker_host` container-forward drop-in (full
forward default-deny, reboot-safe) as a later tightening over the `input_only` baseline.
- **Coordinator off-site backup** (FRICTION #5, ADR-022) — still pending; noted, not in scope.
- STATUS.md / ROADMAP updates land with the implementation, not this spec.

View file

@ -1,203 +0,0 @@
# Spec — Mesh-hardening (2 of 3): ubongo INPUT-only default-deny + `ssh-from-control`
Status: Accepted (2026-06-19)
## Context & scope
The **mesh-hardening follow-on** (deferred from M5, ROADMAP) was decomposed into three
independent sub-projects, each its own spec → plan → implementation cycle:
1. askari SSH → `wt0` — spec/plan written 2026-06-17, **attempted and backed out the same day**
(the incident; six lessons in `FRICTION.md`). Needs a redesign — **not** this spec.
2. **ubongo nftables default-deny + `ssh-from-control`** ← *this spec*
3. NetBird ACL off Allow-All → scoped policies (its own later spec; open mechanism question —
no headless API path).
ROADMAP (re-ordered after the 2026-06-17 incident) puts **ubongo first**: it is the clean,
low-risk case — a physical box with a permanent console break-glass, and *not* the coordinator
host that the incident proved you must not corner.
This spec hardens **ubongo's inbound surface only**. It does **not** change sshd's
`ListenAddress` (so no boot-race), does **not** apply a forward-chain default-deny (so Docker +
the libvirt NAT keep working), and does **not** touch askari or the NetBird ACL.
Current state (verified on ubongo, 2026-06-19): **no host firewall** — sshd listens on
`0.0.0.0:22`, reachable from LAN, mesh, and anything routable; only Docker's + libvirt's own
`iptables-nft` tables exist. Interfaces: `eno1` `10.20.10.151` (LAN, = `ansible_host`), `wt0`
`100.99.146.14` (mesh), `docker0` (one container, no published ports), `virbr-boma`
`192.168.150.1/24` (the libvirt NAT that `make test-integration` uses), `ip_forward=1`.
## Goal / success criteria
- SSH to ubongo succeeds over **`wt0`** (road-warriors, askari), from **mamba on the LAN**
(`10.20.10.50`), and via the **`ssh-from-control` self-path** (Ansible; source `10.20.10.151`).
- SSH from any **other** LAN source is **dropped** (default-deny on `input`).
- **Docker container egress and `make test-integration` (libvirt NAT) keep working** — the
forward chain is untouched.
- A **reboot** does not lock SSH out (no `ListenAddress`, so no bind race).
- Break-glass is the **on-prem physical console** (permanent, non-mesh). The live apply is
additionally gated by the firewall **auto-rollback** timer.
## Design
Apply base's nftables `firewall` concern to ubongo, with two adjustments and one deliberate
non-change:
1. **INPUT-only default-deny.** The `input` chain keeps `policy drop` with the guaranteed
management plane: `lo`, `established,related`, ICMP, SSH on `wt0`, and SSH from
`ssh-from-control` (`10.20.10.151`). We add the **operator-workstation sources** (mamba,
`10.20.10.50`, and a second workstation, `10.20.10.17`) via a new `base__firewall_admin_addrs` list. Everything else on `eno1` drops.
2. **Forward chain left permissive.** base hardcodes `chain forward { … policy drop; }` for
inter-container isolation. On ubongo that would break Docker egress **and** the libvirt NAT
the integration harness depends on — the same class of failure that sank askari (FRICTION
2026-06-17, signal 1). A new `base__firewall_input_only` knob renders the forward chain
`policy accept` instead. Docker's and libvirt's own `iptables-nft` forward rules continue to
apply (separate tables); base simply does not add a default-deny on top.
3. **No sshd `ListenAddress` change.** sshd keeps listening on `0.0.0.0:22`; nftables does all
inbound scoping. This deliberately avoids the `ip_nonlocal_bind` boot-race that broke askari
(FRICTION signal 2) — there is nothing to bind before `wt0` exists.
Resulting `input` allow-list:
```
iif "lo" accept
ct state established,related accept
ct state invalid drop
iifname "wt0" tcp dport 22 accept # mesh (road-warriors, askari)
ip saddr 10.20.10.151 tcp dport 22 accept # ssh-from-control (Ansible self) — group_vars/all
ip saddr 10.20.10.50 tcp dport 22 accept # mamba on the LAN — base__firewall_admin_addrs
ip saddr 10.20.10.17 tcp dport 22 accept # 2nd operator wkstn — base__firewall_admin_addrs
ip protocol icmp accept ; ip6 nexthdr ipv6-icmp accept
# (no catalog services on ubongo) → default drop
chain forward: policy accept # Docker + libvirt-NAT forwarding preserved
```
## Why ubongo is the safe case (maps to the 2026-06-17 incident)
- **Signal 1** (forward-drop breaks Docker hosts): sidestepped — INPUT-only leaves forwarding alone.
- **Signal 2** (`ip_nonlocal_bind` boot-race): sidestepped — no `ListenAddress`; sshd binds nothing new.
- **Signal 3** (a host's only mgmt path must not depend on a service it hosts): satisfied —
ubongo is not the coordinator and keeps three independent paths (mesh, LAN, physical console).
- **Signal 6** (recovery tested after the break-glass was removed): the physical console is
permanent (nothing to retire), and reboot-recovery is proven on a throwaway VM first.
## New & changed code
**Role `base`:**
- `roles/base/defaults/main.yml` — add:
- `base__firewall_input_only: false` — when true, the forward chain is `policy accept`
(host-local input filtering only), for hosts that route/forward container or NAT traffic
(e.g. the control node's Docker + libvirt-NAT) where a forward default-deny would break them.
- `base__firewall_admin_addrs: []` — extra LAN source IPs allowed to SSH (besides `wt0` +
`ssh-from-control`); for an operator workstation reaching the host over the LAN. Key-gated.
- `roles/base/templates/nftables.conf.j2`:
- the forward line (currently line 21) →
`chain forward { type filter hook forward priority 0; policy {{ "accept" if base__firewall_input_only | bool else "drop" }}; }`
- after the `ssh-from-control` block (currently lines 12-14), add a loop:
`{% for addr in base__firewall_admin_addrs %}`
`ip saddr {{ addr }} tcp dport {{ base__firewall_ssh_port }} accept`
`{% endfor %}`
- `roles/base/molecule/default/{converge,verify}.yml` — fixture sets `input_only: true` + an
`admin_addrs` entry; assert (a) `forward` renders `policy accept`, (b) the admin-addr accept
rule renders, (c) existing input default-deny + `wt0` + control-addr assertions stay green.
**Inventory** (`inventories/production/group_vars/control/vars.yml`, append):
```yaml
# Mesh-hardening 2/3 (2026-06-19, ADR-020/021): apply base's host firewall to ubongo as
# INPUT-only default-deny — harden the inbound surface, leave the forward chain permissive so
# Docker egress + the libvirt-NAT integration harness keep working. sshd is unchanged
# (nftables scopes inbound), so there is no boot-race. Reach ubongo over wt0, the
# ssh-from-control self-path (base__firewall_control_addr in group_vars/all), or mamba on the
# LAN. Break-glass: the physical console.
base__firewall_input_only: true
base__firewall_admin_addrs:
- "10.20.10.50" # mamba over the LAN (NetBird off). Raw DHCP lease — see note below.
- "10.20.10.17" # a 2nd operator workstation (MAC bc:0f:f3:c8:4a:8a). Raw lease — ditto.
# base__firewall_apply defaults true; base__firewall_control_addr (= ubongo's own 10.20.10.151)
# is set in group_vars/all and covers Ansible's self-connection.
```
**Integration harness** (ADR-025) — a "be ubongo" profile, mirroring "be askari":
- `tests/integration/overrides/ubongo.yml` — `firewall_apply: true`, `input_only: true`,
`admin_addrs: ["192.168.150.99"]` (a representative LAN addr to exercise the rule),
`firewall_control_addr: "192.168.150.1"` (the libvirt-NAT gateway = the harness's own SSH
path, so the apply + reboot don't lock it out), `ssh_listen_mesh_only: false`,
`mesh_enabled: false`.
- `tests/integration/profiles/ubongo.json` — mirror `profiles/askari.json` (VM resources/image).
- `tests/integration/verify.yml` — make the assertions **profile-aware** (gated on the active
profile, since `verify.yml` is shared): for ubongo assert `input` policy drop, `forward`
policy **accept**, and the admin-addr rule present. Reachability across the reboot is the
harness's existing cycle. The askari assertions (Docker/forward-DNAT) must **not** run for the
ubongo profile, nor vice-versa.
Enables `make test-integration HOST=ubongo`.
## The admin-addrs — deliberately interim values
`base__firewall_admin_addrs: ["10.20.10.50", "10.20.10.17"]` are the operator workstations'
**current raw DHCP leases** (mamba + a second box), not reservations (operator decision,
2026-06-19). Both share the operator's `sjat` SSH key. Caveats, accepted for now:
- **Lease drift:** if DHCP reassigns either IP, the rule allows whatever host then holds it
(still SSH-key-gated, so low risk) and that workstation loses its *LAN* path. **Backstop:**
the workstations also reach ubongo over `wt0` (mesh), so they are never cut off — only the
off-mesh LAN convenience lapses until the IP is corrected.
- **Revisit trigger (flagged for follow-up):** when OPNsense-as-code lands (ADR-020 perimeter /
TODO 3.5), replace both raw leases with **MAC-pinned DHCP reservations** (`10.20.10.17` =
MAC `bc:0f:f3:c8:4a:8a`) and allow the reserved addresses. Recorded as a `FRICTION.md` open
signal so the next `/kaizen` surfaces it.
## Testing
- **Molecule** (base `default`, render-only, `firewall_apply: false`): the new forward-accept +
admin-addr assertions above, with existing assertions green.
- **Integration harness** (`make test-integration HOST=ubongo`): on a throwaway UEFI VM, apply
the ubongo overlay, assert the ruleset shape, and prove **SSH survives a reboot** from an
allowed source (the existing assert/cycle). This is the gate before touching the real control
node.
- **Live** (during cutover): SSH over `wt0` ✓, from mamba LAN ✓, Ansible self-ping ✓; SSH from a
disallowed LAN host dropped ✓; `docker run … ` egress ✓; a fresh `make test-integration`
still spins a VM (libvirt NAT intact) ✓.
## Staged cutover (operator-supervised — lockout-aware, FRICTION signal-6 order)
ubongo is managed as `sjat` (password sudo), so the live apply needs the operator present
anyway. The physical console is open throughout.
1. **Harness GREEN:** `make test-integration HOST=ubongo` passes (incl. the reboot).
2. **Pre-check the real paths** *before* applying: SSH over `wt0`, SSH from mamba
(`10.20.10.50`), `ansible ubongo -m ping`. Confirm the physical console is reachable.
3. **Dry-run:** `make check PLAYBOOK=site LIMIT=ubongo TAGS=firewall` — review the nftables diff
(input default-deny + `wt0` + `10.20.10.151` + `10.20.10.50` + `10.20.10.17`; forward `policy accept`).
4. **Apply (auto-rollback armed):** `make deploy PLAYBOOK=site LIMIT=ubongo TAGS=firewall` — the
firewall concern snapshots, arms the 45 s revert, applies, `reset_connection` →
`wait_for_connection` over the live path (`10.20.10.151`), then cancels the timer. A bad
ruleset reverts itself; the console is the ultimate fallback.
5. **Verify** every path + Docker egress + a fresh integration-VM spin (above).
6. **Reboot ubongo; confirm SSH returns on all paths unaided** (console present). Only now is it
done — recovery is proven *while the break-glass is still there*.
7. **Docs:** update `STATUS.md` (ubongo row: input-only default-deny applied) and `ROADMAP.md`
(mesh-hardening 2/3 done; next is sub-project 1 askari redesign or 3 NetBird ACL).
## Risks & rollback
- **Self-referential apply** (ubongo runs Ansible against itself): mitigated by the auto-rollback
timer, the `wait_for_connection` over the real path, three redundant allowed sources, and the
permanent physical console. ubongo cannot be bricked.
- **Raw-lease fragility:** documented above; backstopped by the mesh path; revisit with OPNsense.
- **No new container isolation** (forward stays accept): accepted — ubongo is a single-tenant
control node, not a service host; Docker/libvirt keep their own forward rules. The forward
default-deny remains the norm for real service hosts (`base__firewall_input_only: false`).
## Out of scope / follow-ons
- askari SSH → `wt0` redesign (sub-project 1) — needs the boot-race + coordinator-bootstrap
resolved; folds in the coordinator-robustness (geo-DB FATAL-loop) + off-site backup lessons.
- NetBird ACL off Allow-All (sub-project 3) — open mechanism question (no headless API path).
- OPNsense DHCP reservations for the admin workstations (`10.20.10.50` mamba, `10.20.10.17`)
and ubongo — replace the raw leases with MAC-pinned reservations; flagged in `FRICTION.md`,
with OPNsense-as-code.
- Forward-chain container isolation on ubongo — deliberately not done here.
- `STATUS.md` / `ROADMAP.md` edits land with the implementation, not this spec.

View file

@ -1,163 +0,0 @@
# Spec — Mesh-hardening (SPOF): accept the single-coordinator SPOF + targeted resilience
Status: Accepted (2026-06-20)
## Context & scope
The **mesh-hardening follow-on** decomposed into independent sub-projects (ROADMAP). Progress:
1. ~~ubongo nftables INPUT-only default-deny~~ — **DONE 2026-06-19**.
2. ~~askari SSH → `wt0` redesign~~ — **DONE 2026-06-20** (live reboot-validated).
3. **askari relay-SPOF reduction** ← *this spec*.
4. NetBird ACL off Allow-All — not started.
`askari` runs boma's **single** self-hosted NetBird coordinator (management + signal + relay +
STUN, one combined container) **and** is a mesh peer (ADR-016). Because `ubongo`'s INPUT-only
default-deny drops the inbound UDP that ICE hole-punching needs, `ubongo`'s peers are always
**`Relayed`** through askari's own relay (intentional posture — `docs/runbooks/netbird-client.md`,
the `ubongo-relay-only` finding). So askari is a single point of failure for **relayed mesh
traffic**.
### The decisive finding — the blast radius is narrow
The mesh (`wt0`) is **not** a default gateway. Verified on ubongo (2026-06-20):
```
wt0 routes ONLY 100.99.0.0/16 · default route via 10.20.10.1 dev eno1 · Networks: - (no subnet-routes/exit-node)
```
So an askari outage affects **only** traffic addressed to a peer's `100.99.x.x` mesh IP over the
relay:
| Traffic | askari down |
|---|---|
| LAN device → LAN service (direct or via reverse proxy) | unaffected |
| node ↔ node over LAN IPs (future cluster) | unaffected |
| node ↔ node same-LAN over mesh IPs | unaffected (direct P2P, local ICE candidate) |
| **road-warrior → ubongo (remote, relayed)** | **breaks** |
| mesh control plane (new enrol / ACL change / re-handshake) | pauses |
Nothing on the LAN and no future intra-cluster traffic depends on askari. The only loss is
**remote (off-LAN) mesh access to peers** — and only when off-LAN *and* askari is down at once.
### Why we are not "fixing" the SPOF with new infrastructure
- **A second coordinator** is not supported by self-hosted NetBird (single management/signal) and
contradicts ADR-016's deliberate single off-site coordinator.
- **Direct P2P** only helps already-established sessions (re-handshakes still need askari's
signal), and enabling it punctures `ubongo`'s deliberate default-deny (a firewall-catalog UDP
entry + an `accepted-risks` deviation + OPNsense NAT) — cost out of proportion to a narrow,
rare failure.
- **A second relay** needs another publicly-reachable host; a relay at home reintroduces the
public home surface ADR-016's off-site coordinator exists to avoid.
Given a reliable always-on VPS and boma's 25-host scale, the sound engineering choice is to
**accept the SPOF as a conscious, documented trade-off** and harden only the two spots real
incidents point to.
## Goal / success criteria
- The single-coordinator SPOF is **explicitly accepted and documented** (register entry + an
ADR-016 availability analysis + recovery), so the trade-off is revisitable, not forgotten.
- **Managed mesh hosts survive a local-DNS hiccup:** `ubongo` (and future managed mesh hosts)
resolve the coordinator FQDN even when their resolver dies on a transition, mirroring the
client-side fix already in the runbook.
- **No new infrastructure** — no P2P, no second relay, no second coordinator, no Terraform.
- The coordinator **off-site backup gap** is named in the accepted risk and explicitly handed to
the next sub-project (ADR-022), not built here.
## Design
### (a) Accepted-risk `R8` — `docs/security/accepted-risks.md`
Add one row to the register (owned by ADR-002):
- **Risk:** *Single off-site mesh coordinator is an availability SPOF for remote mesh access* —
askari hosts the only management/signal/relay (ADR-016); a relayed peer (all of ubongo's) loses
remote mesh reachability while askari is down, and the control plane pauses. The
`netbird_coordinator` store has **no off-site backup yet** (BACKUP.md), so an askari loss also
loses mesh control-plane state until rebuilt.
- **Rationale:** inherent to ADR-016's deliberate single off-site coordinator (sovereignty,
survives a homelab outage); **narrow blast radius** (above table — LAN/intra-cluster/local
unaffected); askari is a reliable always-on VPS; mitigations exist (client + managed-host DNS
pin; documented rebuild).
- **Revisit trigger:** askari proves unreliable; the cluster grows to depend on the mesh for
intra-node traffic; remote mesh access becomes business-critical; or the ADR-022 backup role
lands (closes the state-loss half).
R8 is the **availability** complement to R3 (which covers askari as a *security* target).
### (b) ADR-016 amendment — an "Availability — an askari outage" subsection
A short subsection capturing: the blast-radius table; that the SPOF is an accepted property
(→ R8); and the **recovery procedure** — rebuild the coordinator (`/setup` + re-enrol peers, M5)
or restore from backup once ADR-022 lands; client/road-warrior break-glass already in
`docs/runbooks/netbird-client.md`; on-LAN access to ubongo never depends on the mesh (ADR-016
recovery model). Recorded as an amendment (dated), ADR-016 status stays Accepted.
### (c) DNS-resilience — pin the coordinator FQDN on managed mesh hosts (`base` `mesh` concern)
The 2026-06-18 outage was a client failing to resolve `netbird.askari.wingu.me` on a network
transition; the client fix (public resolvers + an `/etc/hosts` pin to askari's stable WAN IP) is
already in the runbook. The gap: **managed** mesh hosts have no equivalent. Add to `base`'s `mesh`
concern (`roles/base/tasks/mesh.yml`):
- New default `base__mesh_coordinator_pin: ""` (empty → no pin; opt-in).
- When set (and `base__mesh_enabled`), render an `/etc/hosts` entry mapping the coordinator FQDN
— derived from `base__mesh_management_url` via the `urlsplit('hostname')` filter, **not** a
duplicated literal — to `base__mesh_coordinator_pin`, idempotently (a marker-scoped
`blockinfile`/`lineinfile`).
- Set `base__mesh_coordinator_pin` to askari's static WAN IP for managed mesh hosts that depend
on the coordinator (ubongo via the `control` group_vars; future cluster groups as they appear).
The **coordinator host itself (askari) is exempt** (it would point its own FQDN at its own WAN
IP — needs NAT hairpin and is a server with stable DNS); the plan confirms the exact group_vars
placement and the askari exemption.
The pin is safe because askari's WAN IP is static (operator-confirmed); rendering it from a single
inventory variable keeps it maintainable if it ever changes.
## New & changed code/docs
- `docs/security/accepted-risks.md` — add row **R8**; bump the "Last reviewed" date.
- `docs/decisions/016-mesh-vpn.md` — add the dated "Availability — an askari outage" amendment
subsection (blast-radius table + recovery + R8 cross-ref).
- `roles/base/defaults/main.yml` — add `base__mesh_coordinator_pin: ""` with a comment.
- `roles/base/tasks/mesh.yml` — add the `/etc/hosts` coordinator-pin task (gated on
`base__mesh_enabled` + a non-empty pin; FQDN from `urlsplit`).
- `inventories/production/group_vars/control/vars.yml` — set `base__mesh_coordinator_pin` to
askari's WAN IP for ubongo.
- `roles/base/molecule/default/{converge,verify}.yml` — assert that with the pin set + a fixture
FQDN the `/etc/hosts` entry renders, and that an empty pin renders nothing (no-op).
- `STATUS.md` / `docs/ROADMAP.md` — mark sub-project 3 done; surface ADR-022 (coordinator backup)
as the next item. (Land with the implementation, not this spec.)
## Testing
- **Molecule** (`base` default scenario): (1) `base__mesh_coordinator_pin: ""` → no `/etc/hosts`
coordinator line (default no-op); (2) pin set + a fixture `base__mesh_management_url` → exactly
one idempotent `<ip> <fqdn>` line, FQDN correctly extracted by `urlsplit`. Existing
firewall/hardening/mesh assertions stay green.
- **No live deploy required for acceptance** — the pin is additive and idempotent; it lands on
ubongo on the next routine `base` apply. (Optional spot-check: `getent hosts
netbird.askari.wingu.me` on ubongo resolves to the pinned IP.)
## Risks & rollback
- **Stale pin if askari's WAN IP changes** — mitigated by rendering from one inventory variable
(single edit) and askari's IP being static; the pin is removable by clearing the knob + a
re-apply.
- **Over-pinning the coordinator host** — askari is explicitly exempt (hairpin/DNS), set in
group_vars scope.
- **Accepting the SPOF** is itself the residual risk — bounded by the narrow blast radius, the
documented recovery, and R8's revisit triggers.
## Out of scope / follow-ons
- **Coordinator off-site backup → ADR-022 kickoff (the next sub-project).** Named in R8 and
`BACKUP.md` as the open gap; building it means ADR-022's pull-node (`fisi`) + restic design, not
throwaway plumbing here.
- **Direct P2P / NAT-traversal** — deferred posture change (default-deny puncture + OPNsense NAT +
governance); explicitly not pursued here.
- **A second relay / second coordinator** — ruled out above (infra cost / not supported / against
ADR-016).
- **NetBird ACL off Allow-All** — separate sub-project (4).

View file

@ -70,21 +70,3 @@ testing surprise is worth remembering past the session that hit it.
plus review. Only a real (or `--check`) call against the API surfaces them. plus review. Only a real (or `--check`) call against the API surfaces them.
- → Treat a **check-mode run against the real API as a required gate** for such roles, or - → Treat a **check-mode run against the real API as a required gate** for such roles, or
build a render-only assertion that materializes and inspects the rendered module args. build a render-only assertion that materializes and inspects the rendered module args.
## Single-file bind mount + atomic rewrite = stale config (reload-in-place only)
- **`ansible.builtin.template` writes atomically** (temp file + rename → a *new inode*). A
Docker **single-file** bind mount pins the *old* inode, so a container that reloads
config **in place** (no restart) keeps reading the stale file. Live hit: `reverse_proxy`
bind-mounted the Caddyfile as a single file; `caddy reload` (in-container) re-read the
old inode and silently no-op'd (`"config is unchanged"`). The new NetBird route never
loaded → Caddy never requested its cert → surfaced only as a downstream TLS handshake
failure.
- **Fix for reload-in-place roles: bind-mount the config *directory*, not the file**
(`./caddy` → `/etc/caddy`). Directory mounts reflect the inode swap, so the reload sees
the new file (proven on askari).
- **Restart-based roles are fine with a single-file mount.** Sibling case: `netbird`
single-file-mounts `config.yaml`, but its handler does `docker compose restart` (not an
in-container reload), and a **restart re-resolves the bind mount** (verified: route
count 0 before, 1 after). Rule of thumb: **reload-in-place needs a directory mount;
restart-based roles don't.**

View file

@ -2,27 +2,14 @@
# Shared firewall topology — single source of truth for the host nftables layer # Shared firewall topology — single source of truth for the host nftables layer
# (base role) and OPNsense (future). See docs/decisions/020-firewall.md. # (base role) and OPNsense (future). See docs/decisions/020-firewall.md.
# Zone → subnet (from ADR-007). `public` = the WAN (anywhere) for deliberately public # Zone → subnet (from ADR-007).
# off-site services (askari); home/cluster services use the internal zones only.
firewall_zones: firewall_zones:
mgmt: 10.10.0.0/24 mgmt: 10.10.0.0/24
srv: 10.20.0.0/24 srv: 10.20.0.0/24
lan: 10.30.0.0/24 lan: 10.30.0.0/24
iot: 10.40.0.0/24 iot: 10.40.0.0/24
guest: 10.50.0.0/24 guest: 10.50.0.0/24
public: 0.0.0.0/0
# Service catalog: <name> → placement (host | group | hosts) + ingress[]. # Service catalog: <name> → placement (host | group | hosts) + ingress[].
# askari's public surface (ADR-024 Caddy + ADR-016 NetBird STUN). NOTE: the host # Empty until services are built; hosts still get default-deny + the management plane.
# nftables template renders IPv4 source rules only; askari is reached via its A record firewall_catalog: {}
# (no AAAA), so IPv4-only public rules are sufficient (see the spec's IPv6 note).
firewall_catalog:
reverse_proxy:
host: askari
ingress:
- { from: public, port: 80, proto: tcp }
- { from: public, port: 443, proto: tcp }
netbird_stun:
host: askari
ingress:
- { from: public, port: 3478, proto: udp }

View file

@ -1,108 +1,86 @@
$ANSIBLE_VAULT;1.1;AES256 $ANSIBLE_VAULT;1.1;AES256
33393537643265363864656666343435633766306366316363663337363630636231646436656530 32313030663934353361336234373562303537356334346238663836373238366136356331363761
3032316362373533636163366562396563613735663335370a373239666261633263353963643632 6337323031666565663430303562646565303533653531640a636662373939363632383838613431
30396263343765396435376539323833623933353563333363383337366535616365393730643239 38313365626365373539653266326661393765333737386161666165666534636562353165386537
3034313633323963630a376334343134306138636234613438373866633730373737623863396463 3934633033383966360a323965333139643764326236396635383863353437313966326665373537
32646538636261363363636439626131643865306130623164656366663739333464393564663836 65396564393130303030643861663964383436396561643666623837306366346333306430306238
35313431383834383133386335376661346465613465346537353863363836663936393035646366 66656136626566626262373037623531623633313664376166376161363336353930636538323339
36393833326437363034646532313263383931316432316132396633333330623035636162626230 38386564333432353363353663643539343765373662643836646666626339353539323033386230
31643232306664386364666332396439633934303434636633353262396535396161303361643730 31613165373035363533383862366638353035653836303737656534623361313064616365643131
36636230323834393435376263326537326262396366633130623530303637333032613838373938 64386165653835366137353339396364313661656333333635616338346561363765353934343162
33623333383539613763646136663466643536653734386263346661653838613034356631353733 64346462656566376539643030656461363161393936623332373632653731303031393437316636
39393632616236626566613364356364323434313737383530333364323333383036613039323865 36626165306161336262356161666531323336343663643661626365396437383230613636356530
33396561626564353063623238656663386331323832613832323837346136346330613337393862 62326363383138643162316464396666623332366434336462363531363836313833366237396464
32356537623934363232373034373838643961343131336263663339643264613366383466613366 38323635353238653432626361383434646538326531356333393337643066373262663462656466
30303764626437313065316636633938323035303332356262646661636139653630633565636538 65373036653265616137666533373930333239303732623832353337343434636434616562336135
33613861663836333664623433636134663538663065323964383036616430336631636433646562 38666137353266353130303235616362323633353735373163336138633838633738393637633964
36393835363838303463356565386365623464326631363339363164396338366531386161646633 66623866353265316336336566663034306664656365643832616232313732626464316563636335
64663333633437353335336530306537353038356664623231666362633861376262613564643262 63653930626565636630326661626561366539303964373933653437356537343361626438313439
62393061353865333839386232626361663165623038306366363033383333306139316633343266 35643165636662643463616337323063343633306536346538623331333365366533653634343538
36356361613438663332653638376262346363613661623633316231316661353166366663616664 63623261636366303261373338633939363338316463303065613436396163616537666265623439
36323461653034666131386166333335393438376631326338386635623762663666316461643935 31383361646531633863623230616635646138653630383537366335633030343530383735616435
31326638303766626437393634666531303766326539303939343433393066623933623532636166 35656464393432313563303030626133383761303763653530653837313930303034353136353237
30636463383237393366306630323739333161373666643962363235613133316361383437643162 37376366623836646236363062633938666135326631376235323061666465373865396235643937
33623764383762373539373130333563383636386563373330613633373065333235376166373464 32633736656539356332336237646137303534343337353139383637623165353338623566666535
66653635343665656366383439333433366364663734396239326635653839386662323563663465 30643134303235633362383064376234366235363262396362613731373364306362303634613138
66633235303738303464383139323163303562643765623166316536363835653362633863646261 39366230366262363237656631646361356464393266656166386337303663313136666261633836
30393833316135656462326438633432363965356134396531383465333834346436363235336639 32306132323239343539396232316564326361626462366561313561393635393233653633646431
62663566646632383333613036666431326362346464666530383439373132346531316464613533 39313039313139616262396334613035333633326135346365333537373138396535633137353832
30663062373066623961316237623933663862613433636461373931643866306564313863613334 63636335613237623234646234653435616635356637343964656463383864366534363438343938
37353935343637383133316263663661363463383335636463643932323534393861326635613136 39626364653832373062323434316134653831336534383934346231656533643435306465393065
66326664653234636465353539616432386435633838373436333633366336623233363732363262 31653731653438646361363732303664626438663533393837356562376633643933376132616236
66666231643661333161613733643234383331616162386136346538373439613430326437333966 65393432633831313433323930383736316630626230373963653536396637363436643136363962
33623739626263616235373438303333666237336537626639316561306438373534653161643533 37326534343237363961326438376137663034356532376433376461363337333562646136616462
37303533653238346565396562376266666265646666623661393039383961376466396337656636 61636131376264393236376532356539376536643632623864656331656630353362623133303830
38343730663837653638653239333334333735666431633639353234326264656462633164346566 34633461633539643262353263376363613566343261373930623139626364653232363538353330
61316331623964393763616630633861326236333862653565373931303264316462303932633166 33633634363232653439656236303262373265613762373165646131383537623438383835383962
37363735336266316431303464386232353430636566303637393530663435363536323364346236 33383931626136313036366562363732396561633631643561646536653665333733383261363833
38376631396465303937656562386166306165316432653133623534336338636233383763666234 66356461663965373234393237323037356331333339643931313936313234323432613563306630
36626431316534353462356131316161316162313439326266376438316134653433656335643632 33306638663839363565636661653830316265393639313065313062666534303039326465373636
39653434366464613066666161626334643634353337376166323130353564313961626265373337 64363033323837313030353132383562343337326366626635663439396231393537313932643337
36636163636538323134353431336166373266333934366462373662323762643061336335646264 30663031323231313938366436343735326165326433656633336465316630383961626664303536
38343765343237386665623563383733633264316630326433666663373739373666623030366534 38633964326431643362626631656131303539613033323039393630353766386339346363663362
30363366386563326430333465383362633630646633393466623231333366653837643262336134 33323034396136356362313163376438393739373738366363623636623634316537313461373066
34396163326335666534366334643539623439336133626232353565306562636564656565646164 38613062656231363532663133333438663535666566356336316266383763623765346237663838
34323136663164306466666430613039306231336134323165363736666262356639396438306537 64336435353437373264346561363265643339306532383539306363653564356362313430333066
35633439636435313833626432643832636566366633653161326534303234316632393166396533 65633733633938343830303537383231303036326132376263363531626565633664343038356661
66613334323533373234393731383034323039656462333833646339353530636466303437643136 31336139663061656437633138373438663966616338343565396562306638346437353730643664
64383465353136386435626539353032363632303432633830343365396634336534383761353131 30373133373863626137313062643062393035653463653231653465333166633063353137633538
39343438633837316336633934373132306136393635373933623939623863663465316164313966 62383331303164343236343539396461623738396234653333356632313664616263623061363563
39323365373438343533623365653761323034633661373339616239346465643639306230636139 34323165306533323362376161346364316135333535626261353730666131643938306366326263
35313330363838366436666436313864346232396339613362333866646531363162303238373936 31313934633137623638316534383234376333396131303034633636323037363732383263326335
62353536613763313432636662353362313232373261313865636366373366306137373339333439 32393766343161386537333062643434333333363538323366363231336666383161373432383563
31376333666538303733353962323239323536643034663662323330373165326433616431383163 65613537366139643032336230303133623431376231646662643666373532636565393639373930
61616130653939313535396438346162313038616134323837336634366433303866656361376165 65336630616462353837666431616662636635333532393331326539306233363539396266653239
31393331343738373133313764656539626139643630303730343439613137623930356362333634 31303031303330396632386131623134313536313433623064356636333230373962643339363736
38663334396335396166383761663866613565643130366135623634343838623739333365653364 30396130353466373136643935646436613636376636323530643031653334303863376432646534
32396666306166643130353163323036663831613436376562383865306538653763336332353632 39343165356232346539366233373135326338343663356164616265336235623332646365633466
39346138353463316662376363333835386166393836666462323161376633336635356664376133 35393533373663393762376332396136336236616635616535313336613034346436363665356565
38326639323932373635653139613165616432336136363866383764393930373732633533306433 32636536336634613531393434613435613962653862343737373237623261373836386663343831
63303834386131626366633465393235613432363337386139656561333464303637353539653935 66656135323838636638353963646638326531343635653937306230323237343933626135356533
36616538376663383236386561383339616332306137623731626537343765643637343232303230 66356263636438633164386535333762616438626439343462393833393731643037396662653737
35393438636361353965353166633833316162376463376338353830386131666238626138666165 31666361656530383437396230393663616133383764316437623939663631396561343266383766
37646438663561343831643431303434333138666664373634363038653964363335646165343163 62373636663631393637393763613337356337633264366434346561343263373931323335643135
62613938636663613063383338326437333739386137316535366235366261383162656663636130 31366661623137353336666630633365663764646234343035313130663562636361623532643461
35613938333763333633636565306239356161383731643864373830646438303137306465376231 63333961333338623966396662656262323830396439633337663431663235663962666238356630
35356334303233343634653936323966653961616630633061643765373430386362376437656535 30353331313462653061373638666235653938623931366466666164343566623238333237353265
39306630393466343232663632656133356531663935643137353439333261316632653762323232 30373064353132366634623966306632303832306630383637623465323134633133656333303964
36323964636534326561626133323530643639386563623435656535386562633635633339343938 35646637316236303364393363323137616132326437623238336631313530663230333362623633
35343536646565353936326362623930313739386163383765326330316335636139326665653339 34383032376538366464363032343262656164376166386237383563613630336666633965653730
37333030383438363231333663616565303434303334643234353239313837656563363861656661 64373236396564363164643637623736626532396630313131356563333238643665356166323837
36313166666566393737636231373634363132623066376437323532353861336338373462323539 31626338623665623165643763623661666439626435643237336433646132666366623661393832
35306135363835653733356634646332346461643236613263376664343537333531313561333035 37306533613966663936373061613331633934623462343236626234306130383738343631303231
34393938643561613231666434386331393966353730343634343437353566343263653038316430 32326339323738323537333363313538373266623363363636633462356234363466393263316235
62336333373336633164626132346534616139333830336535666135613833623734353563353732 39663033303165656366396334306535643361646663373935303230376466366632373563303231
31326139386336346332363565303333353135663732613765313034356433363932346263386164 64323264653036333039663965646630653934376239653236323063656137373830623563336463
33343636333039346339356261623037316334623236653736386362323536386134633665383237 37343461373737313539316361623763373733653930626532393565333938333761323631303332
39393665646231313734393963336634393563366134373233663036363830363265656663646361 39663530303439616561356561666532653762343339323435636164376664373731343132666539
64353063653362383435623931343133623434356139363430613935346363386139373134306164 63626637346563393765303065646564643661636130396439323736343764333633373331653333
37343931363931613834316665343662393533383730663364396338623933663766396130646566 66633465343433303038623638323965636533666639643266353163353436393036336639336133
30626339616537373337303338613931303938323032666634666337626361376130396631376236 32646664363565326539643763653832313336663262313634343635616333613434373333323036
35373766366637313661616335383739616636373166366332336161316466323731383836643263 61366435376265336638326132333439613431353633653762653836386235643965366436363866
30623564353934636561323734666663623363666365323734633030643664643232633638636437 35626664393139386337353335343930306130356335623131646261656434303966656431623231
63373664383863333032383739386238353239666162656436646439356239336266393966366434 66643730393430363838626434663933613536343533316262373564666665373663336363623166
38613437353931633138343865313831303264653732313764336564623065613339326239356232 63363037373634383961373035633239646235316137363036333765313864643365396165643432
32376536616635346536633361663463663231636566333062636261653761383664646639646335 36623465313036376261393566383539336638363836633232656136656533396663323366313062
31656236343930386135346266353533393035646265383437313763653530666136653433353964 64616632373333313466356362336234346564373832316433373963623263316635
30326434323038643565356239646533323134356361656365656339383635303065633537656532
64633663653138653439623238636532373265386362643238646433616531343962343762623238
30663966666434643361313835373835633064376536636436636465383763356663313862393138
62346431663864316335386433396535386137366462666334623837666233626661333565613766
35656264383936326638613431646236643131396337626231326565653233393061643530333830
37396130303862613034393332623665376464353831366562353865373065336366393939623036
30633637336564326466326562653966633265343062616536363738363239626637373730643839
30336238363535373664643463353035313735633635666562653063386139366464626432633931
33393436393435386637333135356630373464646634346364326164303038393664313864623633
62383733366430373535633531356162666164653030326232336137633630346237386230323166
37346365373632636639363833366461663265313235633663616432643835646133626365616531
31646531643134633531353039343832643336373735343264653437373662633465613861646630
34323131306236343566343736326264663339363537346539353434303866343036303761656566
33386438343539656535306330346433643636343063336433323061313762613839633665363063
66363233343337626631323038363336636335333965353636356436373031356262343734386565
62396436303238373837373334663130396631373034356462393931653935633161356633383131
37376130333232383235633765366636653330376663343566343833323861313236623333653834
61363261326266353935333738626530396433306331326339623533393738663437343131656462
61396533636334613363363161646366326631643138313161393438303261336537383733343630
35383739353136613162326630383961623463626561313033613664643931366435326635383838
30333066396132396238633837316636373062316264336530326133623465346264356530363537
643734623039346364383038363937353764

View file

@ -12,28 +12,7 @@ dev_env__users:
# group only. # group only.
ansible_user: sjat ansible_user: sjat
# ubongo's AI-worker; passwordless sudo for the claude user (ADR-015 amended).
base__ai_worker_user: claude
# ubongo is a NetBird mesh peer (ADR-016, M5) — enrol the agent via base's `mesh` concern. # ubongo is a NetBird mesh peer (ADR-016, M5) — enrol the agent via base's `mesh` concern.
# Enrollment only; the host firewall default-deny stays deferred (the mesh-hardening # Enrollment only; the host firewall default-deny stays deferred (the mesh-hardening
# follow-on), so this brings up wt0 without changing SSH exposure. # follow-on), so this brings up wt0 without changing SSH exposure.
base__mesh_enabled: true base__mesh_enabled: true
# Mesh-hardening 2/3 (2026-06-19, ADR-020/021): apply base's host firewall to ubongo as
# INPUT-only default-deny — harden the inbound surface, leave the forward chain permissive so
# Docker egress + the libvirt-NAT integration harness keep working. sshd is unchanged
# (nftables scopes inbound), so there is no boot-race. Reach ubongo over wt0 (mesh), the
# ssh-from-control self-path (base__firewall_control_addr, group_vars/all = 10.20.10.151), or
# mamba on the LAN. Break-glass: the physical console. (base__firewall_apply defaults true.)
base__firewall_input_only: true
# DNS-resilience (ADR-016 availability / R8): pin the coordinator FQDN to askari's stable WAN
# IP in /etc/hosts so a local-DNS hiccup (the 2026-06-18 incident class) can't strand ubongo's
# mesh. askari (offsite_hosts) is exempt — it reaches the coordinator locally.
base__mesh_coordinator_pin: "77.42.120.136"
base__firewall_admin_addrs:
- "10.20.10.50" # mamba over the LAN (NetBird off). Raw DHCP lease — revisit with an
# OPNsense reservation when OPNsense-as-code lands; backstopped by wt0.
- "10.20.10.17" # 2nd operator workstation (MAC bc:0f:f3:c8:4a:8a). Raw lease — ditto.

View file

@ -1,21 +1,6 @@
--- ---
# Off-site hosts (askari). askari runs the NetBird coordinator AND is a mesh peer # Off-site hosts (askari). askari runs the NetBird coordinator AND is a mesh peer
# (ADR-016, M5). # (ADR-016, M5) — enrol the agent via base's `mesh` concern. Enrollment only; the
# # host firewall default-deny + moving askari's SSH onto wt0 stay deferred to the
# Mesh-hardening REDESIGN (2026-06-19): the 2026-06-17 attempt was backed out (forward # mesh-hardening follow-on.
# `policy drop` broke Docker on reboot; wt0-only sshd left no break-glass; ip_nonlocal_bind
# did not beat the boot-race). The redesign mirrors the proven ubongo 2/3 pattern:
# - INPUT-only default-deny (base__firewall_input_only) — forward stays `policy accept`
# so Docker container forwarding/NAT survive a reboot;
# - SSH scoped by the host firewall (iifname wt0 + admin-addr), NOT a sshd ListenAddress
# change — base__ssh_listen_mesh_only stays false, so there is no boot-race;
# - WAN :22 is DELIBERATELY left open from ubongo's WAN IP (base__firewall_admin_addrs)
# as the permanent non-mesh break-glass — the coordinator-host exception (a host's only
# management path must never depend on a service that host itself hosts).
# Spec: docs/superpowers/specs/2026-06-19-mesh-hardening-askari-redesign-design.md
base__mesh_enabled: true base__mesh_enabled: true
base__firewall_apply: true
base__firewall_input_only: true # forward stays `policy accept` → Docker-safe
base__ssh_listen_mesh_only: false # no sshd ListenAddress change → no boot-race
base__firewall_admin_addrs:
- 91.226.145.80 # ubongo's (static) WAN IP — the permanent non-mesh SSH break-glass

View file

@ -1,7 +0,0 @@
---
# Manage askari over the NetBird mesh (wt0). Overrides the TF-generated WAN `ansible_host`
# in offsite.yml (host_vars are NOT regenerated by tf_to_inventory.py). The WAN :22 path
# (Hetzner Cloud Firewall + base__firewall_admin_addrs = ubongo's WAN) stays as the
# break-glass; the Hetzner web console is the IP-independent ultimate fallback.
# Spec: docs/superpowers/specs/2026-06-19-mesh-hardening-askari-redesign-design.md
ansible_host: 100.99.226.39

View file

@ -8,5 +8,3 @@
roles: roles:
- role: dev_env - role: dev_env
tags: [dev_env] tags: [dev_env]
- role: integration_test
tags: [integration_test]

View file

@ -11,14 +11,6 @@ base__firewall_rollback_timeout: 45 # seconds before the auto-revert fires on a
base__firewall_confirm_timeout: 20 # seconds to re-establish a fresh connection post-apply base__firewall_confirm_timeout: 20 # seconds to re-establish a fresh connection post-apply
base__firewall_dropin_dir: /etc/nftables.d base__firewall_dropin_dir: /etc/nftables.d
base__firewall_apply: true # set false to render+validate without applying (CI/Molecule) base__firewall_apply: true # set false to render+validate without applying (CI/Molecule)
base__firewall_input_only: false # true → the forward chain is `policy accept` (host-local
# INPUT filtering only). For hosts that forward/route
# container or NAT traffic (the control node's Docker +
# libvirt-NAT) where a forward default-deny would break
# them. Real service hosts keep this false (forward drop).
base__firewall_admin_addrs: [] # extra LAN source IPs allowed to SSH, besides wt0 +
# ssh-from-control. For an operator workstation reaching
# the host over the LAN (no mesh). Key-gated. (ADR-021)
# SSH hardening + fail2ban (ADR-002) — `hardening` concern. # SSH hardening + fail2ban (ADR-002) — `hardening` concern.
base__ssh_password_authentication: "no" base__ssh_password_authentication: "no"
@ -29,19 +21,6 @@ base__fail2ban_findtime: 10m
# base__ssh_authorised_keys lives in group_vars/all/vars.yml (per-person control keys). # base__ssh_authorised_keys lives in group_vars/all/vars.yml (per-person control keys).
base__ssh_authorised_keys: [] base__ssh_authorised_keys: []
# SSH listen-on-mesh (mesh-hardening 1/3, ADR-016/021). Opt-in: when true, sshd binds
# ListenAddress to this host's mesh IP only (not the WAN). The IP comes from the live wt0
# fact (ansible_facts.wt0.ipv4.address); base__ssh_listen_addr overrides it. ip_nonlocal_bind
# lets sshd bind the mesh IP before wt0 exists at boot. Fails closed: the play asserts a
# non-empty address rather than silently listening on all interfaces.
base__ssh_listen_mesh_only: false
base__ssh_listen_addr: ""
# The automation/AI-worker user granted passwordless sudo (ADR-015 amended / ADR-021).
# Empty = no AI-worker sudo. Set per-group (e.g. group_vars/control: claude). The user's
# password should be locked so NOPASSWD is its only sudo path; actions are auditd-attributed.
base__ai_worker_user: ""
# NetBird mesh agent enrollment (ADR-016). Opt-in: default off so applying `base` to a # NetBird mesh agent enrollment (ADR-016). Opt-in: default off so applying `base` to a
# host not on the mesh is a no-op for this concern. The live actions (apt install over # host not on the mesh is a no-op for this concern. The live actions (apt install over
# the network, `netbird up` against the coordinator) are additionally gated by # the network, `netbird up` against the coordinator) are additionally gated by
@ -51,9 +30,3 @@ base__mesh_manage: true
base__mesh_management_url: "https://netbird.askari.wingu.me" base__mesh_management_url: "https://netbird.askari.wingu.me"
base__mesh_setup_key: "{{ vault.netbird.setup_key }}" base__mesh_setup_key: "{{ vault.netbird.setup_key }}"
base__mesh_version: "0.72.4" # match the coordinator; exact apt pin confirmed on-host at deploy base__mesh_version: "0.72.4" # match the coordinator; exact apt pin confirmed on-host at deploy
# DNS-resilience (ADR-016 availability / accepted-risk R8): when set to the coordinator's
# stable IP, pin the coordinator FQDN (derived from base__mesh_management_url) in /etc/hosts
# so a managed mesh host survives a local-DNS hiccup (the 2026-06-18 incident class). Empty
# = no pin. The coordinator host itself (askari/offsite_hosts) is exempt — leave it empty.
base__mesh_coordinator_pin: ""

View file

@ -6,21 +6,15 @@
vars: vars:
base__firewall_apply: false base__firewall_apply: false
base__firewall_control_addr: 10.10.0.99 # test control-node LAN address base__firewall_control_addr: 10.10.0.99 # test control-node LAN address
base__firewall_admin_addrs:
- "10.30.0.77" # fixture: an operator-workstation LAN source (admin-addr SSH allow)
# Exercise the mesh concern's include path with the live actions gated off, so it # Exercise the mesh concern's include path with the live actions gated off, so it
# runs hermetically (no coordinator/key needed) and must be a clean no-op. # runs hermetically (no coordinator/key needed) and must be a clean no-op.
base__mesh_enabled: true base__mesh_enabled: true
base__mesh_manage: false base__mesh_manage: false
base__mesh_setup_key: "dummy-molecule-key" base__mesh_setup_key: "dummy-molecule-key"
base__mesh_coordinator_pin: "203.0.113.9" # fixture IP (TEST-NET-3); pins FQDN from base__mesh_management_url
base__ssh_listen_mesh_only: true
base__ssh_listen_addr: "100.99.0.1" # fixture mesh IP (no wt0 in the container)
firewall_zones: firewall_zones:
lan: 10.30.0.0/24 lan: 10.30.0.0/24
srv: 10.20.0.0/24 srv: 10.20.0.0/24
mgmt: 10.10.0.0/24 mgmt: 10.10.0.0/24
public: 0.0.0.0/0
firewall_catalog: firewall_catalog:
reverse_proxy: reverse_proxy:
host: instance host: instance
@ -30,9 +24,5 @@
host: instance host: instance
ingress: ingress:
- { from: srv, port: 2342, proto: tcp } - { from: srv, port: 2342, proto: tcp }
netbird_stun:
host: instance
ingress:
- { from: public, port: 3478, proto: udp }
roles: roles:
- role: base - role: base

View file

@ -19,16 +19,6 @@ platforms:
volumes: volumes:
- /sys/fs/cgroup:/sys/fs/cgroup:rw - /sys/fs/cgroup:/sys/fs/cgroup:rw
command: /lib/systemd/systemd command: /lib/systemd/systemd
# Pre-create the namespaced sysctl so ansible.posix.sysctl can set it (mesh-hardening 1/3).
# The container image lacks procps so the sysctl binary is absent; we also install it in
# prepare.yml. This entry ensures the value exists in the container's netns at startup.
sysctls:
net.ipv4.ip_nonlocal_bind: "0"
# ubongo's /etc/resolv.conf points to the NetBird mesh DNS (100.99.x.x), which Docker
# containers can't reach (no wt0). Override to a public resolver so prepare.yml apt tasks
# can update the cache and install packages.
dns_servers:
- 8.8.8.8
provisioner: provisioner:
name: ansible name: ansible

View file

@ -1,11 +0,0 @@
---
- name: Prepare
hosts: all
become: true
gather_facts: false
tasks:
- name: Install procps so ansible.posix.sysctl can find the sysctl binary
ansible.builtin.apt:
name: procps
state: present
update_cache: true

View file

@ -38,33 +38,12 @@
- "'tcp dport 2342 accept' in nft" - "'tcp dport 2342 accept' in nft"
fail_msg: "missing srv->2342 rule for photoprism" fail_msg: "missing srv->2342 rule for photoprism"
- name: Assert the public->stun:3478/udp ingress rule (0.0.0.0/0 source)
ansible.builtin.assert:
that:
- "'0.0.0.0/0' in nft"
- "'udp dport 3478 accept' in nft"
fail_msg: "missing public->3478/udp rule for netbird_stun"
- name: Assert the docker_host extension hook is present - name: Assert the docker_host extension hook is present
ansible.builtin.assert: ansible.builtin.assert:
that: that:
- "'include \"/etc/nftables.d/*.nft\"' in nft" - "'include \"/etc/nftables.d/*.nft\"' in nft"
fail_msg: "missing drop-in include hook" fail_msg: "missing drop-in include hook"
- name: Assert the forward chain defaults to policy drop (input_only off)
ansible.builtin.assert:
that:
- "'hook forward priority 0; policy drop;' in nft"
fail_msg: >-
forward chain must default to policy drop when base__firewall_input_only is
false (container isolation stays the norm on real service hosts)
- name: Assert the admin-addr SSH allow rule (operator workstation on the LAN)
ansible.builtin.assert:
that:
- "'ip saddr 10.30.0.77 tcp dport 22 accept' in nft"
fail_msg: "missing admin-addr SSH allow rule from base__firewall_admin_addrs"
- name: Syntax-check the rendered ruleset (no apply) - name: Syntax-check the rendered ruleset (no apply)
ansible.builtin.command: nft -c -f /etc/nftables.conf ansible.builtin.command: nft -c -f /etc/nftables.conf
changed_when: false changed_when: false
@ -79,18 +58,6 @@
ansible.builtin.command: grep -q '^\[sshd\]' /etc/fail2ban/jail.d/sshd.local ansible.builtin.command: grep -q '^\[sshd\]' /etc/fail2ban/jail.d/sshd.local
changed_when: false changed_when: false
- name: ListenAddress bound to the fixture mesh IP (mesh-only mode)
ansible.builtin.command: grep -q '^ListenAddress 100.99.0.1$' /etc/ssh/sshd_config.d/10-boma.conf
changed_when: false
- name: Sysctl drop-in for ip_nonlocal_bind is present
ansible.builtin.command: grep -q '^net.ipv4.ip_nonlocal_bind=1' /etc/sysctl.d/60-boma-nonlocal-bind.conf
changed_when: false
- name: Kernel ip_nonlocal_bind is live in this netns
ansible.builtin.command: sysctl -n net.ipv4.ip_nonlocal_bind
register: _nonlocal
changed_when: false
failed_when: _nonlocal.stdout | trim != '1'
# mesh concern: enabled but manage=false must be a clean no-op (no install/enrol) # mesh concern: enabled but manage=false must be a clean no-op (no install/enrol)
- name: Check whether netbird got installed - name: Check whether netbird got installed
ansible.builtin.command: which netbird ansible.builtin.command: which netbird
@ -103,14 +70,3 @@
- _nb.rc != 0 - _nb.rc != 0
fail_msg: "netbird must not be installed when base__mesh_manage is false" fail_msg: "netbird must not be installed when base__mesh_manage is false"
success_msg: "mesh concern is a clean no-op under manage=false" success_msg: "mesh concern is a clean no-op under manage=false"
- name: Read /etc/hosts (coordinator pin)
ansible.builtin.slurp:
src: /etc/hosts
register: _etchosts
- name: Assert the coordinator FQDN is pinned to the fixture IP (DNS-resilience / R8)
ansible.builtin.assert:
that:
- "'203.0.113.9 netbird.askari.wingu.me' in (_etchosts.content | b64decode)" # slurp content is always base64
fail_msg: "base__mesh_coordinator_pin did not render the /etc/hosts coordinator pin"
success_msg: "coordinator FQDN pinned in /etc/hosts"

View file

@ -23,13 +23,6 @@
tags: [hardening] tags: [hardening]
tags: [hardening] tags: [hardening]
- name: AI-worker operational access (sudoers drop-in)
ansible.builtin.include_tasks:
file: operational_access.yml
apply:
tags: [users]
tags: [users]
- name: NetBird mesh enrollment - name: NetBird mesh enrollment
ansible.builtin.include_tasks: ansible.builtin.include_tasks:
file: mesh.yml file: mesh.yml

View file

@ -64,19 +64,3 @@
- "'Management: Connected' not in (_netbird_status.stdout | default(''))" - "'Management: Connected' not in (_netbird_status.stdout | default(''))"
no_log: true # setup key is on the argv no_log: true # setup key is on the argv
tags: [mesh] tags: [mesh]
- name: Pin the NetBird coordinator FQDN in /etc/hosts (DNS-resilience, ADR-016 availability / R8)
ansible.builtin.lineinfile:
path: /etc/hosts
regexp: '^\S+\s+{{ _coordinator_fqdn | regex_escape }}\s*$'
line: "{{ base__mesh_coordinator_pin }} {{ _coordinator_fqdn }}"
state: present
# /etc/hosts is bind-mounted in the Docker Molecule container (atomic rename → EBUSY);
# this is a fallback only — production VMs still write atomically.
unsafe_writes: true
vars:
_coordinator_fqdn: "{{ base__mesh_management_url | regex_replace('^https?://', '') | regex_replace('[:/].*', '') }}"
when:
- base__mesh_enabled | bool
- base__mesh_coordinator_pin | length > 0
tags: [mesh]

View file

@ -1,11 +0,0 @@
---
- name: Grant the AI-worker user passwordless sudo (ADR-015 amended / ADR-021)
ansible.builtin.copy:
content: "{{ base__ai_worker_user }} ALL=(ALL) NOPASSWD:ALL\n"
dest: "/etc/sudoers.d/{{ base__ai_worker_user }}-ai-worker"
owner: root
group: root
mode: "0440"
validate: "visudo -cf %s"
when: base__ai_worker_user | length > 0
tags: [users]

View file

@ -1,31 +1,4 @@
--- ---
- name: Resolve the sshd mesh listen address (override, else live wt0 fact)
ansible.builtin.set_fact:
base__ssh_listen_addr_resolved: >-
{{ base__ssh_listen_addr
or ansible_facts.get('wt0', {}).get('ipv4', {}).get('address', '') }}
when: base__ssh_listen_mesh_only | bool
- name: Fail closed — refuse to render sshd without a known mesh address
ansible.builtin.assert:
that:
- base__ssh_listen_addr_resolved | length > 0
fail_msg: >-
base__ssh_listen_mesh_only is true but no mesh address resolved (set
base__ssh_listen_addr or ensure wt0 is up so its fact is gathered). Refusing to
render sshd ListenAddress empty (which would listen on ALL interfaces).
when: base__ssh_listen_mesh_only | bool
- name: Allow sshd to bind the mesh IP before wt0 exists at boot
ansible.posix.sysctl:
name: net.ipv4.ip_nonlocal_bind
value: "1"
sysctl_set: true
state: present
reload: true
sysctl_file: /etc/sysctl.d/60-boma-nonlocal-bind.conf
when: base__ssh_listen_mesh_only | bool
- name: Ensure openssh-server is installed - name: Ensure openssh-server is installed
ansible.builtin.apt: ansible.builtin.apt:
name: openssh-server name: openssh-server

View file

@ -12,16 +12,13 @@ table inet filter {
{% if base__firewall_control_addr %} {% if base__firewall_control_addr %}
ip saddr {{ base__firewall_control_addr }} tcp dport {{ base__firewall_ssh_port }} accept ip saddr {{ base__firewall_control_addr }} tcp dport {{ base__firewall_ssh_port }} accept
{% endif %} {% endif %}
{% for addr in base__firewall_admin_addrs %}
ip saddr {{ addr }} tcp dport {{ base__firewall_ssh_port }} accept
{% endfor %}
ip protocol icmp accept ip protocol icmp accept
ip6 nexthdr ipv6-icmp accept ip6 nexthdr ipv6-icmp accept
{% for r in base__firewall_resolved %} {% for r in base__firewall_resolved %}
ip saddr { {{ r.sources | join(', ') }} } {{ r.proto }} dport {{ r.port }} accept ip saddr { {{ r.sources | join(', ') }} } {{ r.proto }} dport {{ r.port }} accept
{% endfor %} {% endfor %}
} }
chain forward { type filter hook forward priority 0; policy {{ 'accept' if base__firewall_input_only | bool else 'drop' }}; } chain forward { type filter hook forward priority 0; policy drop; }
chain output { type filter hook output priority 0; policy accept; } chain output { type filter hook output priority 0; policy accept; }
} }

View file

@ -3,6 +3,3 @@ PasswordAuthentication {{ base__ssh_password_authentication }}
PermitRootLogin {{ base__ssh_permit_root_login }} PermitRootLogin {{ base__ssh_permit_root_login }}
PubkeyAuthentication yes PubkeyAuthentication yes
KbdInteractiveAuthentication no KbdInteractiveAuthentication no
{% if base__ssh_listen_mesh_only | bool %}
ListenAddress {{ base__ssh_listen_addr_resolved }}
{% endif %}

View file

@ -1,16 +1,8 @@
--- ---
# Docker engine install (ADR-004). Cluster-specific daemon hardening is deferred to when # Docker engine install (ADR-004). Cluster-specific daemon hardening + nftables.d
# the cluster exists. # integration are deferred to when the cluster + host firewall exist.
docker_host__packages: docker_host__packages:
- docker-ce - docker-ce
- docker-ce-cli - docker-ce-cli
- containerd.io - containerd.io
- docker-compose-plugin - docker-compose-plugin
# Container-forward nftables drop-in (FRICTION 2026-06-17 #1 / ADR-025). base's inet-filter
# forward chain is `policy drop`; on a Docker host that kills published-port DNAT + inter-
# container forwarding ON REBOOT (nftables loads default-deny before dockerd). This drop-in
# (loaded via base's /etc/nftables.d/*.nft include) appends the accepts so a rebooted Docker
# host keeps forwarding. Only meaningful where base__firewall_apply is true.
docker_host__forward_dropin: true
docker_host__nftables_dropin_dir: /etc/nftables.d # must match base__firewall_dropin_dir

View file

@ -37,22 +37,3 @@
state: present state: present
update_cache: true update_cache: true
tags: [packages] tags: [packages]
- name: Ensure the nftables drop-in dir exists (for the container-forward rules)
ansible.builtin.file:
path: "{{ docker_host__nftables_dropin_dir }}"
state: directory
mode: "0755"
when: docker_host__forward_dropin | bool
tags: [firewall]
- name: Install the container-forward nftables drop-in (reboot-safe Docker forwarding)
ansible.builtin.template:
src: 10-docker-forward.nft.j2
dest: "{{ docker_host__nftables_dropin_dir }}/10-docker-forward.nft"
mode: "0644"
when: docker_host__forward_dropin | bool
# Not reloaded here: a running host already forwards via Docker's runtime rules, so the
# drop-in only needs to protect the NEXT boot (loaded by nftables.service). Reloading nft
# now would flush Docker's NAT (FRICTION 2026-06-17 #4); the boot loads it cleanly.
tags: [firewall]

View file

@ -1,14 +0,0 @@
# {{ ansible_managed }}
# Allow container forwarding through base's default-deny forward chain (ADR-025 / FRICTION
# 2026-06-17 #1). Appended to base's `table inet filter` / `chain forward` via the
# /etc/nftables.d/*.nft include, and loaded by nftables.service at boot — exactly when the
# bug bit (default-deny forward loading before dockerd on reboot).
table inet filter {
chain forward {
ct state established,related accept
iifname "docker0" accept
oifname "docker0" accept
iifname "br-*" accept
oifname "br-*" accept
}
}

View file

@ -1,35 +0,0 @@
# integration_test
Installs the KVM/libvirt substrate on the control node (`ubongo`) so the agent
can boot throwaway Debian VMs for local integration testing (ADR-025).
This is a **non-service** role — no SECURITY/VERIFY/ACCESS/BACKUP files are
required. It does **not** make ubongo a production hypervisor; it only provides
the tooling needed to spin up short-lived test VMs (see ADR-015).
## Target group
`control` (i.e. `ubongo`)
## What it does
1. Installs QEMU/KVM, libvirt daemon + clients, `virt-install`, and
cloud-image tools (`cloud-image-utils`, `genisoimage`).
2. Enables and starts `libvirtd`.
3. Adds the configured users (`sjat`, `claude`) to the `libvirt` and `kvm`
groups so VMs can be managed without `sudo`.
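4. Creates `/var/lib/boma-integration` (owned `root:libvirt`, mode `2775`) as
the cache directory for golden images and overlays.
5. Installs the `10-libvirt-boma.nft` nftables drop-in, which accepts input arriving
on the `virbr-boma` libvirt bridge, and notifies an nftables reload.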
## Defaults
| Variable | Default | Purpose |
|-------------------------------|-------------------------------|----------------------------------|
| `integration_test__packages` | see `defaults/main.yml` | APT packages to install |
| `integration_test__users` | `[sjat, claude]` | Users granted libvirt/kvm access |
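| `integration_test__cache_dir` | `/var/lib/boma-integration` | Image/overlay cache directory |
| `integration_test__nftables_dropin_dir` | `/etc/nftables.d` | nftables drop-in directory (must match `base__firewall_dropin_dir`) |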
## Related decisions
- [ADR-025](../../docs/decisions/025-local-vm-integration-testing.md) — local VM integration testing
- [ADR-015](../../docs/decisions/015-control-host.md) — control host scope (ubongo is not a hypervisor)

View file

@ -1,20 +0,0 @@
---
# integration_test — installs the local KVM/libvirt substrate on the control node
# (ubongo) so the agent can run throwaway VM integration tests (ADR-025). Non-service
# role; applied to the `control` group. Not a production hypervisor (ADR-015).
integration_test__packages:
- qemu-system-x86 # KVM
- qemu-utils # qemu-img (overlays)
- libvirt-daemon-system
- libvirt-clients # virsh
- virt-install # virt-install (trixie: the real pkg; `virtinst` is transitional)
- cloud-image-utils # cloud-localds (NoCloud seed)
- genisoimage # cloud-localds fallback
# Users granted libvirt/kvm access (run VMs without sudo).
integration_test__users:
- sjat
- claude
# Where the golden image + overlays live (outside the repo).
integration_test__cache_dir: "/var/lib/boma-integration"
# nftables drop-in dir — must match base__firewall_dropin_dir (base role default: /etc/nftables.d)
integration_test__nftables_dropin_dir: /etc/nftables.d

View file

@ -1,15 +0,0 @@
---
- name: Reload nftables
ansible.builtin.service:
name: nftables
state: reloaded
listen: "integration_test | reload nftables"
register: _nft_reload
# nftables is absent from the Molecule Docker container; ignore "not found" errors there.
# On real hosts where base has applied nftables, failures propagate normally.
failed_when:
- _nft_reload.failed
- >-
'Could not find the requested service nftables' not in (_nft_reload.msg | default(''))
and 'nftables.service not found' not in (_nft_reload.msg | default(''))
and 'Unit nftables.service not found' not in (_nft_reload.msg | default(''))

View file

@ -1,14 +0,0 @@
---
galaxy_info:
author: sjat
description: >-
Installs the KVM/libvirt substrate on the control node (ubongo) to enable
local VM integration testing (ADR-025). Non-service role; not a production
hypervisor (ADR-015).
license: MIT
min_ansible_version: "2.17"
platforms:
- name: Debian
versions:
- trixie
dependencies: []

View file

@ -1,21 +0,0 @@
---
# KVM/libvirt APT packages cannot be installed in the Docker Molecule container
# (no internet; KVM unusable in a container). This converge exercises only the
# nftables drop-in rendering via tasks_from, which IS meaningful in a container.
# The full role (packages/libvirt) is exercised by make test-integration.
#
# Coverage split:
# Docker Molecule (this file): nftables drop-in rendering only.
# make test-integration (ADR-025, real KVM): libvirt/KVM package install, cache
# dir creation, and end-to-end VM lifecycle — the role's substrate tasks.
# The Docker scenario intentionally covers only the firewall drop-in; substrate
# coverage lives in the real-KVM integration harness, not here.
- name: Converge
hosts: all
become: true
gather_facts: true
tasks:
- name: Include integration_test firewall tasks
ansible.builtin.include_role:
name: integration_test
tasks_from: firewall.yml

View file

@ -1,31 +0,0 @@
---
dependency:
name: galaxy
options:
requirements-file: ../../requirements.yml
driver:
name: docker
platforms:
- name: instance
# Project-owned image built from .docker/molecule-debian13/Dockerfile
# and hosted in the Forgejo container registry.
# Build/push with: make molecule-image / make molecule-image-push
image: forgejo.nyumbani.baobab.band/sjat/molecule-debian13:latest
pre_build_image: true
privileged: true # required for systemd
cgroupns_mode: host
volumes:
- /sys/fs/cgroup:/sys/fs/cgroup:rw
command: /lib/systemd/systemd
provisioner:
name: ansible
inventory:
host_vars:
instance:
ansible_user: root
verifier:
name: ansible

View file

@ -1,14 +0,0 @@
---
# The Molecule Docker image ships with /var/lib/apt/lists/ cleared to minimise size.
# KVM/libvirt packages cannot be installed in a container; converge only includes the
# role's firewall.yml tasks (via tasks_from). Pre-create /etc/nftables.d so the drop-in
# template task succeeds.
- name: Prepare
hosts: all
become: true
gather_facts: false
tasks:
- name: Create nftables drop-in dir (normally created by the config task)
ansible.builtin.file:
path: /etc/nftables.d
state: directory
mode: "0755"

View file

@ -1,18 +0,0 @@
---
# Package-install and cache-dir tasks are not run (converge includes only the role's
# firewall.yml tasks via tasks_from; KVM/libvirt packages cannot be fetched in the
# Docker container). This scenario verifies the nftables drop-in renders correctly.
- name: Verify
hosts: all
become: true
gather_facts: false
tasks:
- name: Read the libvirt bridge nftables drop-in
ansible.builtin.slurp:
src: /etc/nftables.d/10-libvirt-boma.nft
register: _dropin
- name: Assert drop-in contains virbr-boma accept rule
ansible.builtin.assert:
that:
- "'virbr-boma' in (_dropin.content | b64decode)"
- "'accept' in (_dropin.content | b64decode)"

View file

@ -1,8 +0,0 @@
---
- name: Install the libvirt bridge nftables drop-in (virbr-boma input allow)
ansible.builtin.template:
src: 10-libvirt-boma.nft.j2
dest: "{{ integration_test__nftables_dropin_dir }}/10-libvirt-boma.nft"
mode: "0644"
notify: "integration_test | reload nftables"
tags: [firewall]

View file

@ -1,35 +0,0 @@
---
- name: Install the KVM/libvirt substrate
ansible.builtin.apt:
name: "{{ integration_test__packages }}"
state: present
update_cache: true
cache_valid_time: 3600
tags: [packages]
- name: Enable and start libvirtd
ansible.builtin.systemd:
name: libvirtd
enabled: true
state: started
tags: [config]
- name: Grant users libvirt + kvm access
ansible.builtin.user:
name: "{{ item }}"
groups: [libvirt, kvm]
append: true
loop: "{{ integration_test__users }}"
tags: [users]
- name: Create the integration cache dir
ansible.builtin.file:
path: "{{ integration_test__cache_dir }}"
state: directory
owner: root
group: libvirt
mode: "2775"
tags: [config]
- name: Import firewall tasks
ansible.builtin.import_tasks: firewall.yml

View file

@ -1,12 +0,0 @@
# {{ ansible_managed }}
# Allow DHCP/DNS traffic arriving on the libvirt integration bridge to pass base's
# inet filter input default-deny chain (ADR-025). nftables multi-table semantics mean
# libvirt's own `ip filter` table accept is not enough — base's `inet filter` input
# policy drop kills bridge traffic first without this drop-in.
#
# Bridge name "virbr-boma" must match NET_XML in scripts/integration-vm.py.
table inet filter {
chain input {
iifname "virbr-boma" accept
}
}

View file

@ -46,7 +46,6 @@ upstream support; WS/gRPC need long timeouts (Caddy sets none by default).
| `netbird_coordinator__domain` | `netbird.askari.wingu.me` | Public hostname; feeds `exposedAddress`, the OIDC issuer, redirect URIs, and the dashboard endpoints | | `netbird_coordinator__domain` | `netbird.askari.wingu.me` | Public hostname; feeds `exposedAddress`, the OIDC issuer, redirect URIs, and the dashboard endpoints |
| `netbird_coordinator__trusted_proxies` | `["172.16.0.0/12"]` | Source ranges NetBird trusts `X-Forwarded-*` from (`server.reverseProxy.trustedHTTPProxies`). Must cover Caddy's source IP on the boma network — verify the actual bridge subnet at deploy | | `netbird_coordinator__trusted_proxies` | `["172.16.0.0/12"]` | Source ranges NetBird trusts `X-Forwarded-*` from (`server.reverseProxy.trustedHTTPProxies`). Must cover Caddy's source IP on the boma network — verify the actual bridge subnet at deploy |
| `netbird_coordinator__manage` | `true` | Set `false` in Molecule to render templates without a Docker daemon | | `netbird_coordinator__manage` | `true` | Set `false` in Molecule to render templates without a Docker daemon |
| `netbird_coordinator__disable_geolocation` | `true` | sets `NB_DISABLE_GEOLOCATION` so a no-egress startup can't FATAL the server on the GeoLite2 download (FRICTION 2026-06-17 #4) |
Production overrides live in `inventories/production/group_vars/`. Production overrides live in `inventories/production/group_vars/`.

View file

@ -6,13 +6,6 @@ netbird_coordinator__dashboard_image: "netbirdio/dashboard:v2.39.0"
netbird_coordinator__base_dir: /opt/services/netbird netbird_coordinator__base_dir: /opt/services/netbird
netbird_coordinator__domain: netbird.askari.wingu.me netbird_coordinator__domain: netbird.askari.wingu.me
# Disable NetBird's GeoLite2 geolocation (download + lookups). boma uses no geo posture
# (ACL is Allow-All), and the combined server treats a failed GeoLite2 download as FATAL —
# so a transient egress loss (NAT wiped on `nft flush`, or the boot window before Docker
# re-adds NAT) would crash-loop the whole control plane (FRICTION 2026-06-17 #4). Disabling
# removes that dependency. Revisit if a future ACL sub-project wants geo-based posture.
netbird_coordinator__disable_geolocation: true
# Source IP ranges Caddy fronts NetBird from, rendered into config.yaml # Source IP ranges Caddy fronts NetBird from, rendered into config.yaml
# server.reverseProxy.trustedHTTPProxies. NetBird trusts X-Forwarded-* only from # server.reverseProxy.trustedHTTPProxies. NetBird trusts X-Forwarded-* only from
# these. MUST cover the Caddy container's source IP on the boma Docker network — # these. MUST cover the Caddy container's source IP on the boma Docker network —

View file

@ -30,12 +30,3 @@
- "'v2.39.0' in (_compose.content | b64decode)" - "'v2.39.0' in (_compose.content | b64decode)"
fail_msg: "docker-compose.yml is missing pinned image tags" fail_msg: "docker-compose.yml is missing pinned image tags"
success_msg: "docker-compose.yml pins both image tags" success_msg: "docker-compose.yml pins both image tags"
- name: "Assert geolocation is disabled (FRICTION 2026-06-17 #4 — no geo-DB download FATAL)"
ansible.builtin.assert:
that:
- "'NB_DISABLE_GEOLOCATION: \"true\"' in (_compose.content | b64decode)"
fail_msg: >-
compose must set NB_DISABLE_GEOLOCATION=true so a no-egress startup can't FATAL
the coordinator on the GeoLite2 download
success_msg: "geolocation disabled in compose"

View file

@ -4,8 +4,6 @@
path: "{{ netbird_coordinator__base_dir }}" path: "{{ netbird_coordinator__base_dir }}"
state: directory state: directory
mode: "0750" mode: "0750"
# create the scaffold even in --check so dry-run can evaluate templates + compose (idempotent mkdir)
check_mode: false
tags: [config] tags: [config]
- name: Render the combined server config - name: Render the combined server config

View file

@ -16,10 +16,6 @@ services:
container_name: netbird-server container_name: netbird-server
restart: unless-stopped restart: unless-stopped
command: ["--config", "/etc/netbird/config.yaml"] command: ["--config", "/etc/netbird/config.yaml"]
environment:
# Disable geolocation so a no-egress startup can't FATAL the control plane
# (FRICTION 2026-06-17 #4). boma uses no geo posture (ACL Allow-All).
NB_DISABLE_GEOLOCATION: "{{ netbird_coordinator__disable_geolocation | string | lower }}"
ports: ports:
- "3478:3478/udp" - "3478:3478/udp"
volumes: volumes:

View file

@ -35,7 +35,3 @@ access__api: # noqa: var-naming[no-role-prefix]
# DNS-01; no manual steps). Residual risk: Let's Encrypt rate limits on rapid re-issuance. # DNS-01; no manual steps). Residual risk: Let's Encrypt rate limits on rapid re-issuance.
backup__service: reverse_proxy # noqa: var-naming[no-role-prefix] backup__service: reverse_proxy # noqa: var-naming[no-role-prefix]
backup__state: false # noqa: var-naming[no-role-prefix] backup__state: false # noqa: var-naming[no-role-prefix]
# Integration-test / staging cert knobs (ADR-025). Default off = production behaviour.
reverse_proxy__tls_internal: false # true => every site uses Caddy's self-signed CA
reverse_proxy__acme_ca: "" # set to the LE staging directory URL to use staging

View file

@ -4,8 +4,6 @@
path: "{{ reverse_proxy__base_dir }}" path: "{{ reverse_proxy__base_dir }}"
state: directory state: directory
mode: "0750" mode: "0750"
# create the scaffold even in --check so dry-run can evaluate templates + compose (idempotent mkdir)
check_mode: false
tags: [config] tags: [config]
- name: Ensure the Caddy config directory exists - name: Ensure the Caddy config directory exists
@ -13,8 +11,6 @@
path: "{{ reverse_proxy__base_dir }}/caddy" path: "{{ reverse_proxy__base_dir }}/caddy"
state: directory state: directory
mode: "0750" mode: "0750"
# create the scaffold even in --check so dry-run can evaluate templates + compose (idempotent mkdir)
check_mode: false
tags: [config] tags: [config]
# Render into a directory that is bind-mounted whole (./caddy -> /etc/caddy). Mounting # Render into a directory that is bind-mounted whole (./caddy -> /etc/caddy). Mounting

View file

@ -1,9 +1,6 @@
# {{ ansible_managed }} # {{ ansible_managed }}
{ {
email {{ reverse_proxy__acme_email }} email {{ reverse_proxy__acme_email }}
{% if reverse_proxy__acme_ca %}
acme_ca {{ reverse_proxy__acme_ca }}
{% endif %}
{% if reverse_proxy__acme_dns_provider == 'gandi' %} {% if reverse_proxy__acme_dns_provider == 'gandi' %}
# ACME DNS-01 via Gandi (mesh/LAN-only hosts, incl. wildcard certs). Token is the # ACME DNS-01 via Gandi (mesh/LAN-only hosts, incl. wildcard certs). Token is the
# Gandi PAT, injected from the env file as a Bearer token (ADR-024). Needs the custom # Gandi PAT, injected from the env file as a Bearer token (ADR-024). Needs the custom
@ -13,9 +10,6 @@
} }
{% for r in reverse_proxy__routes %} {% for r in reverse_proxy__routes %}
{{ r['host'] }} { {{ r['host'] }} {
{% if reverse_proxy__tls_internal %}
tls internal
{% endif %}
{% if r['caddy'] is defined %} {% if r['caddy'] is defined %}
{{ r['caddy'] | trim | indent(2, first=true) }} {{ r['caddy'] | trim | indent(2, first=true) }}
{% elif r['upstream'] is defined %} {% elif r['upstream'] is defined %}

View file

@ -1,462 +0,0 @@
#!/usr/bin/env python3
"""boma local-VM integration test harness driver (ADR-025).
Stdlib-only by convention (TODO-14): never imports a YAML library. The transient
inventory is emitted via string templates; stubs/cert-tiers reach Ansible as
`-e @<file>` extra-vars; profile metadata is JSON. Talks to libvirt via `virsh`.
"""
import argparse
import hashlib
import json
import os
import pathlib
import re
import subprocess
import sys
import time
import urllib.request
import uuid
# Repository root: two directory levels above this script's resolved path.
REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent
# Where the golden image, per-VM overlays, seed images and console logs are kept.
# Overridable through the BOMA_IT_CACHE environment variable.
CACHE_DIR = pathlib.Path(os.environ.get("BOMA_IT_CACHE", "/var/lib/boma-integration"))
# Golden cloud image and the SHA512SUMS file it is verified against (see ensure_image).
IMAGE_URL = "https://cloud.debian.org/images/cloud/trixie/latest/debian-13-genericcloud-amd64.qcow2"
SHA_URL = "https://cloud.debian.org/images/cloud/trixie/latest/SHA512SUMS"
IMAGE_NAME = "debian-13-genericcloud-amd64.qcow2"
# libvirt network used by the test VMs; net_ensure() defines it from NET_XML when missing.
NET_NAME = "boma-it"
NET_XML = """<network>
<name>boma-it</name>
<forward mode='nat'/>
<bridge name='virbr-boma' stp='on' delay='0'/>
<ip address='192.168.150.1' netmask='255.255.255.0'>
<dhcp><range start='192.168.150.10' end='192.168.150.254'/></dhcp>
</ip>
</network>
"""
# Prefix of every VM name generated by vm_name(); up() uses it to detect a running test VM.
NAME_PREFIX = "boma-it-"
# Transient per-run directory: generated inventory, cloud-init inputs, the `current` state file.
RUN_DIR = REPO_ROOT / "tests" / "integration" / ".run"
# Root directory for the diagnostics bundles written by dump_diagnostics().
DIAG_ROOT = pathlib.Path.home() / "integration-runs"
# Per-host JSON profiles (<host>.json), see profile_path().
PROFILE_DIR = REPO_ROOT / "tests" / "integration" / "profiles"
# Base directory that profile `extra_vars_files` entries are resolved against.
INTEG_DIR = REPO_ROOT / "tests" / "integration"
# Per-tier extra-vars files (<tier>.yml), see cert_file().
CERT_DIR = REPO_ROOT / "tests" / "integration" / "certs"
# Default VM sizing used by up().
DEFAULT_MEM_MIB = 3072
DEFAULT_VCPUS = 2
# up() refuses to start a VM when MemAvailable is below this many MiB.
MIN_FREE_MIB = 4096
# Certificate tiers accepted by cert_file().
VALID_TIERS = ("internal", "le-staging", "le-prod-wildcard")
# Target the SYSTEM libvirtd — where the substrate, /dev/kvm, and the NAT network live.
# Without this, a non-root caller's bare virsh/virt-install default to qemu:///session.
# setdefault: an explicitly exported LIBVIRT_DEFAULT_URI is left untouched.
os.environ.setdefault("LIBVIRT_DEFAULT_URI", "qemu:///system")
def vm_name(host, suffix=None):
    """Build the libvirt domain name for `host`: NAME_PREFIX + host + "-" + suffix.

    A falsy `suffix` is replaced by the first 8 hex characters of a random UUID4.
    """
    if not suffix:
        suffix = uuid.uuid4().hex[:8]
    return NAME_PREFIX + f"{host}-{suffix}"
def free_mib(meminfo_text):
    """Return the MemAvailable value of /proc/meminfo-style text in whole MiB.

    Returns 0 when no `MemAvailable: <n> kB` line is present.
    """
    match = re.search(r"^MemAvailable:\s+(\d+)\s+kB", meminfo_text, re.MULTILINE)
    if match is None:
        return 0
    available_kib = int(match.group(1))
    return available_kib // 1024
def parse_lease_ip(domifaddr_output):
    """Extract the first IPv4 address from `virsh domifaddr` output, or None if absent."""
    found = re.search(r"ipv4\s+(\d+\.\d+\.\d+\.\d+)", domifaddr_output)
    if not found:
        return None
    return found.group(1)
def render_meta_data(instance_id, hostname):
    """Render the cloud-init meta-data document (instance-id + local-hostname)."""
    entries = (("instance-id", instance_id), ("local-hostname", hostname))
    return "".join(f"{key}: {value}\n" for key, value in entries)
def render_user_data(ssh_pubkey, ansible_user):
    """Render the cloud-init user-data (#cloud-config) document for a test VM.

    Creates one user (`ansible_user`) with passwordless sudo, a bash shell and
    `ssh_pubkey` as its only authorized key; disables SSH password auth and
    requests a package index update on first boot.

    The nesting below is significant YAML: the user's attributes must be indented
    under the `- name:` list item, and the key under `ssh_authorized_keys:`.
    """
    return (
        "#cloud-config\n"
        "users:\n"
        f"  - name: {ansible_user}\n"
        "    sudo: 'ALL=(ALL) NOPASSWD:ALL'\n"
        "    shell: /bin/bash\n"
        "    ssh_authorized_keys:\n"
        f"      - {ssh_pubkey}\n"
        "ssh_pwauth: false\n"
        "package_update: true\n"
    )
def cert_file(tier):
    """Return the extra-vars file path for a certificate tier.

    Raises ValueError when `tier` is not one of VALID_TIERS.
    """
    if tier in VALID_TIERS:
        return CERT_DIR / f"{tier}.yml"
    raise ValueError(f"unknown cert tier: {tier}")
def profile_path(host):
    """Return the path of the JSON profile for `host` inside PROFILE_DIR."""
    return PROFILE_DIR.joinpath(f"{host}.json")
def render_run_hosts(name, ip, ansible_user, groups):
    """Render the transient single-host Ansible inventory as YAML text.

    The one test host `name` (reachable at `ip` as `ansible_user`) is placed in
    every group of `groups`; duplicate group names are emitted once, in first-seen
    order. The result follows the Ansible YAML inventory layout
    all -> children -> <group> -> hosts -> <host> -> vars, so the indentation of
    each emitted line is significant.
    """
    lines = [
        "---",
        "# Generated by scripts/integration-vm.py — transient, gitignored. Do not edit.",
        "# Single test host ONLY (safety invariant: no real host is ever in scope).",
        "all:",
        "  children:",
    ]
    for g in dict.fromkeys(groups):  # de-duplicate while keeping order
        lines += [
            f"    {g}:",
            "      hosts:",
            f"        {name}:",
            f"          ansible_host: {ip}",
            f"          ansible_user: {ansible_user}",
            # Integration VMs reuse IPs; bypass host-key caching so stale
            # known_hosts entries (from prior runs with a different VM at
            # the same IP) do not block the Ansible apply step.
            "          ansible_ssh_common_args: >-",
            "            -o StrictHostKeyChecking=no",
            "            -o UserKnownHostsFile=/dev/null",
        ]
    return "\n".join(lines) + "\n"
def sh(cmd, check=True, capture=False, **kw):
    """Run `cmd` (an argv list) via subprocess.run in text mode and return the result.

    The command line is echoed to stderr first, prefixed with "+ ". `check` and
    `capture` map to subprocess.run's `check` and `capture_output`; any other
    keyword arguments are forwarded unchanged.
    """
    rendered = " ".join(map(str, cmd))
    print("+ " + rendered, file=sys.stderr)
    return subprocess.run(cmd, check=check, capture_output=capture, text=True, **kw)
def _expected_sha(sha_text, filename):
for line in sha_text.splitlines():
parts = line.split()
if len(parts) == 2 and parts[1].lstrip("*") == filename:
return parts[0]
return None
def ensure_image():
    """Return the path of the cached golden image, downloading it on first use.

    When CACHE_DIR/IMAGE_NAME already exists it is returned as-is. Otherwise the
    image is downloaded to a `.part` file, its SHA512 is compared against the
    entry for IMAGE_NAME in the file at SHA_URL, and only a verified download is
    renamed into place. Exits (SystemExit) and removes the partial file when the
    checksum entry is missing or does not match.
    """
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    img = CACHE_DIR / IMAGE_NAME
    if img.exists():
        return img
    print(f"Downloading {IMAGE_URL} ...", file=sys.stderr)
    tmp = img.with_suffix(".part")
    urllib.request.urlretrieve(IMAGE_URL, tmp)
    # Close the HTTP response deterministically instead of leaking the connection.
    with urllib.request.urlopen(SHA_URL) as resp:
        sha_text = resp.read().decode()
    want = _expected_sha(sha_text, IMAGE_NAME)
    if not want:
        tmp.unlink(missing_ok=True)
        raise SystemExit(f"checksum for {IMAGE_NAME} not found at {SHA_URL}")
    h = hashlib.sha512()
    with open(tmp, "rb") as fh:
        # Hash in 1 MiB chunks so the image is never loaded into memory whole.
        for chunk in iter(lambda: fh.read(1 << 20), b""):
            h.update(chunk)
    if h.hexdigest() != want:
        tmp.unlink(missing_ok=True)
        raise SystemExit("golden image SHA512 mismatch — refusing to use it")
    tmp.rename(img)
    return img
def net_ensure():
    """Make sure the NET_NAME libvirt network is defined and active.

    If `virsh net-info` fails, the network is defined from NET_XML (written to
    RUN_DIR/net.xml) and marked autostart. It is then started unless net-info
    already reports it as active.
    """
    info_cmd = ["virsh", "net-info", NET_NAME]
    defined = sh(info_cmd, check=False, capture=True).returncode == 0
    if not defined:
        RUN_DIR.mkdir(parents=True, exist_ok=True)
        xml_path = RUN_DIR / "net.xml"
        xml_path.write_text(NET_XML)
        sh(["virsh", "net-define", str(xml_path)])
        sh(["virsh", "net-autostart", NET_NAME])
    info = sh(info_cmd, capture=True).stdout
    if re.search(r"Active:\s+yes", info) is None:
        sh(["virsh", "net-start", NET_NAME])
def _ssh_pubkey():
    """Return the caller's SSH public key text (whitespace-stripped).

    Prefers ~/.ssh/id_ed25519.pub, then ~/.ssh/id_rsa.pub; exits (SystemExit)
    when neither file exists.
    """
    for filename in ("id_ed25519.pub", "id_rsa.pub"):
        keyfile = pathlib.Path.home().joinpath(".ssh", filename)
        if not keyfile.exists():
            continue
        return keyfile.read_text().strip()
    raise SystemExit("no SSH public key found in ~/.ssh")
def up(host, name=None, mem_mib=DEFAULT_MEM_MIB, vcpus=DEFAULT_VCPUS):
    """Boot one throwaway integration VM for `host` and wait until it is usable.

    Steps: refuse when MemAvailable is below MIN_FREE_MIB or another NAME_PREFIX
    VM is already running; ensure the golden image and libvirt network exist;
    create a qcow2 overlay and a cloud-init seed image in CACHE_DIR; start the VM
    with virt-install; wait for an IP, for SSH, and for cloud-init to finish;
    record "name / ip / host" in RUN_DIR/current.

    Args:
        host: profile/host label, used for the generated VM name and state file.
        name: explicit VM name; generated via vm_name(host) when falsy.
        mem_mib: VM memory in MiB.
        vcpus: number of virtual CPUs.

    Returns:
        (name, ip) of the running VM.

    Raises:
        SystemExit: on insufficient memory, an already-running integration VM,
            or a timeout waiting for the IP address / SSH.
    """
    free = free_mib(pathlib.Path("/proc/meminfo").read_text())
    if free < MIN_FREE_MIB:
        raise SystemExit(f"refusing to start: only {free} MiB free (< {MIN_FREE_MIB})")
    running = sh(["virsh", "list", "--name"], capture=True).stdout.split()
    if any(n.startswith(NAME_PREFIX) for n in running):
        raise SystemExit("an integration VM is already running (one at a time); "
                         "run `integration-vm prune` first")
    name = name or vm_name(host)
    img = ensure_image()
    net_ensure()
    RUN_DIR.mkdir(parents=True, exist_ok=True)
    # VM disk/seed/console must live where the SYSTEM hypervisor (libvirt-qemu) can reach
    # them — NOT under the repo/home (qemu cannot traverse /home/claude). CACHE_DIR is
    # group-libvirt + world-traversable (created by the integration_test role).
    overlay = CACHE_DIR / f"{name}.qcow2"
    sh(["qemu-img", "create", "-f", "qcow2", "-F", "qcow2", "-b", str(img), str(overlay)])
    (RUN_DIR / "user-data").write_text(render_user_data(_ssh_pubkey(), "ansible"))
    # cloud-init rejects underscores in local-hostname (causes init-local to skip
    # writing the network config → VM never gets a DHCP lease). Sanitize VM name
    # for use as hostname without affecting disk paths or virsh domain names.
    (RUN_DIR / "meta-data").write_text(render_meta_data(f"iid-{name}", name.replace("_", "-")))
    seed = CACHE_DIR / f"{name}-seed.img"
    # Force DHCP on the VM NIC — don't rely on the genericcloud image's network fallback.
    # Use explicit renderer + interface name to avoid a netplan 1.1.2 generation issue:
    # `match.name: en*` with a named key (e.g. `primary`) produces a .network file that
    # networkd loads but never DHCPs (no DHCP4 messages, just IPv6LL). Using the real
    # interface name `enp1s0` (all virtio NICs in these KVM VMs are named enp1s0) and
    # `renderer: networkd` bypasses the bug.
    # The nesting is significant YAML: `dhcp4` must be a child of `enp1s0`, which must
    # be a child of `ethernets`.
    (RUN_DIR / "network-config").write_text(
        'version: 2\n'
        'renderer: networkd\n'
        'ethernets:\n'
        '  enp1s0:\n'
        '    dhcp4: true\n')
    sh(["cloud-localds", "--network-config", str(RUN_DIR / "network-config"),
        str(seed), str(RUN_DIR / "user-data"), str(RUN_DIR / "meta-data")])
    console = CACHE_DIR / f"{name}-console.log"
    # virt-install has a `#!/usr/bin/env python3` shebang; the Makefile prepends .venv/bin to
    # PATH (so the venv's ansible tools resolve), which would hijack virt-install into the
    # isolated venv — it lacks system PyGObject (`gi`) and crashes. Strip the venv from PATH
    # for this system tool so its shebang finds /usr/bin/python3 (which has gi). Ansible is
    # invoked via its absolute .venv path elsewhere, so it is unaffected.
    sys_path = ":".join(p for p in os.environ.get("PATH", "").split(":")
                        if "/.venv/bin" not in p)
    sh(["virt-install", "--name", name, "--memory", str(mem_mib), "--vcpus", str(vcpus),
        "--boot", "uefi",  # genericcloud triple-faults on legacy BIOS handoff; UEFI boots
        "--import",
        "--disk", f"path={overlay},format=qcow2",
        "--disk", f"path={seed},device=cdrom",
        "--network", f"network={NET_NAME}",
        "--osinfo", "debian13",
        "--graphics", "none",
        "--serial", f"file,path={console}",
        "--noautoconsole"],
       env=dict(os.environ, PATH=sys_path))
    ip = wait_for_ip(name)
    wait_for_ssh(ip, "ansible")
    # Block until cloud-init finishes (incl. apt-get update) so apply sees a ready system.
    # Best effort (check=False): a non-zero cloud-init status does not abort the run.
    sh(["ssh", "-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null",
        f"ansible@{ip}", "sudo cloud-init status --wait"], check=False)
    (RUN_DIR / "current").write_text(f"{name}\n{ip}\n{host}\n")
    print(f"VM {name} up at {ip}")
    return name, ip
def wait_for_ip(name, timeout=120):
    """Poll `virsh domifaddr` until VM `name` has an IPv4 address, and return it.

    Each round queries the `lease` source first (fastest when leaseshelper works)
    and then the `arp` source (host neighbour table — no privileged helper needed,
    populated once the VM sends traffic). Both produce output parse_lease_ip
    understands. Sleeps 4 s between rounds; exits (SystemExit) after `timeout`
    seconds without an address.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        for source in ("lease", "arp"):
            probe = sh(["virsh", "domifaddr", name, "--source", source],
                       check=False, capture=True)
            address = parse_lease_ip(probe.stdout)
            if address:
                return address
        time.sleep(4)
    raise SystemExit(f"timed out waiting for {name} to get a DHCP lease — "
                     "VM left defined; run `integration-vm prune` to remove it")
def wait_for_ssh(ip, user, timeout=180):
    """Block until `ssh user@ip true` succeeds.

    Retries every 5 s (each attempt with a 5 s connect timeout and host-key
    checking disabled); exits (SystemExit) after `timeout` seconds without a
    successful login.
    """
    end = time.time() + timeout
    while time.time() < end:
        r = sh(["ssh", "-o", "StrictHostKeyChecking=no",
                "-o", "UserKnownHostsFile=/dev/null", "-o", "ConnectTimeout=5",
                f"{user}@{ip}", "true"], check=False, capture=True)
        if r.returncode == 0:
            return
        time.sleep(5)
    # The " — " separator keeps the address and the hint from running together.
    raise SystemExit(f"timed out waiting for SSH to {ip} — "
                     "VM left defined; run `integration-vm prune` to remove it")
def _read_current():
    """Read RUN_DIR/current and return its first three lines as (name, ip, host)."""
    state_lines = (RUN_DIR / "current").read_text().splitlines()
    name = state_lines[0]
    ip = state_lines[1]
    host = state_lines[2]
    return name, ip, host
def write_run_inventory(name, ip, groups):
    """Write the transient inventory for the test VM into RUN_DIR.

    Creates RUN_DIR/hosts.yml (single host, user `ansible`) and (re)creates the
    RUN_DIR/group_vars symlink pointing at the production group_vars. Exits
    (SystemExit) if group_vars exists but is not a symlink.
    """
    RUN_DIR.mkdir(parents=True, exist_ok=True)
    inventory_text = render_run_hosts(name, ip, "ansible", groups)
    (RUN_DIR / "hosts.yml").write_text(inventory_text)
    link = RUN_DIR / "group_vars"
    if link.is_symlink():
        # Replace a symlink left behind by an earlier run.
        link.unlink()
    elif link.exists():
        raise SystemExit(f"{link} exists and is not a symlink; remove it manually")
    link.symlink_to(REPO_ROOT / "inventories" / "production" / "group_vars")
def apply(host, certs):
    """Apply the `host` profile to the current VM.

    Loads the JSON profile, writes the transient inventory, then runs each
    playbook listed under the profile's `applies` (optionally restricted to the
    step's `tags`), limited to the VM. The profile's `extra_vars_files` and the
    `certs` tier file are passed as `-e @file` extra-vars to every run.
    """
    name, ip, _ = _read_current()
    profile = json.loads(profile_path(host).read_text())
    write_run_inventory(name, ip, profile["groups"])
    extra_vars = [
        arg
        for rel in profile.get("extra_vars_files", [])
        for arg in ("-e", f"@{INTEG_DIR / rel}")
    ]
    extra_vars.extend(("-e", f"@{cert_file(certs)}"))
    inventory = str(RUN_DIR / "hosts.yml")
    for step in profile["applies"]:
        command = [".venv/bin/ansible-playbook", "-i", inventory,
                   f"playbooks/{step['playbook']}", "--limit", name]
        tags = step.get("tags")
        if tags:
            command.extend(["--tags", ",".join(tags)])
        command.extend(extra_vars)
        sh(command, cwd=str(REPO_ROOT))
    print(f"applied {host} profile to {name}")
def _boot_id(ip, user):
    """Return the kernel boot_id of `user@ip` read over SSH, or None if SSH fails."""
    ssh_cmd = [
        "ssh",
        "-o", "StrictHostKeyChecking=no",
        "-o", "UserKnownHostsFile=/dev/null",
        "-o", "ConnectTimeout=5",
        f"{user}@{ip}",
        "cat /proc/sys/kernel/random/boot_id",
    ]
    result = sh(ssh_cmd, check=False, capture=True)
    if result.returncode != 0:
        return None
    return result.stdout.strip()
def wait_for_reboot(ip, user, before_boot_id, timeout=240):
    """Confirm a REAL reboot: SSH back up AND boot_id changed (not the pre-reboot sshd).

    Polls every 5 s; exits (SystemExit) when no differing, non-empty boot_id is
    seen within `timeout` seconds.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        current = _boot_id(ip, user)
        rebooted = bool(current) and current != before_boot_id
        if rebooted:
            return
        time.sleep(5)
    raise SystemExit(f"timed out waiting for {ip} to reboot (boot_id unchanged) — "
                     "VM left defined; run `integration-vm prune` to remove it")
def reboot_vm():
    """Reboot the current VM via `virsh reboot` and wait until the reboot is confirmed.

    The pre-reboot boot_id is the baseline wait_for_reboot compares against. If it
    cannot be read, ANY boot_id seen afterwards (including the still-running
    pre-reboot system's) would count as "changed", so the function fails up front
    instead of reporting an unverified reboot.

    Raises:
        SystemExit: if the baseline boot_id cannot be read, or the reboot is not
            confirmed in time.
    """
    name, ip, _ = _read_current()
    before = _boot_id(ip, "ansible")
    if not before:
        raise SystemExit(f"cannot read boot_id from {ip} before rebooting {name} — "
                         "refusing to reboot (the reboot could not be verified)")
    sh(["virsh", "reboot", name])
    wait_for_reboot(ip, "ansible", before)
    print(f"{name} rebooted (boot_id changed), SSH back at {ip}")
def run_assert(host, certs):
    """Run ``tests/integration/verify.yml`` against the current VM.

    Uses the same inventory and extra-vars files as ``apply`` (profile
    overlays, then the cert-tier file). On a non-zero playbook exit it dumps
    diagnostics and raises SystemExit; otherwise it prints a pass line.
    """
    name, ip, _ = _read_current()
    profile = json.loads(profile_path(host).read_text())
    write_run_inventory(name, ip, profile["groups"])

    vars_files = [INTEG_DIR / rel for rel in profile.get("extra_vars_files", [])]
    vars_files.append(cert_file(certs))
    extra_args = []
    for vars_file in vars_files:
        extra_args.extend(["-e", f"@{vars_file}"])

    argv = [".venv/bin/ansible-playbook", "-i", str(RUN_DIR / "hosts.yml"),
            "tests/integration/verify.yml", "--limit", name]
    argv.extend(extra_args)
    # check=False: the failure path must collect diagnostics before exiting.
    result = sh(argv, cwd=str(REPO_ROOT), check=False)
    if result.returncode == 0:
        print(f"VERIFY PASSED for {name}")
        return
    dump_diagnostics(name, ip)
    raise SystemExit(f"VERIFY FAILED for {name} — diagnostics in {DIAG_ROOT}")
def dump_diagnostics(name, ip):
    """Collect a best-effort diagnostics bundle for VM ``name`` into DIAG_ROOT/name.

    Each probe runs ``sudo <cmd>`` over SSH as ``ansible@ip`` and its
    stdout+stderr is written to ``<label>.txt``. Probes never raise
    (check=False): a failing probe still leaves a file containing whatever
    ssh reported. The VM's serial console log, when present in CACHE_DIR, is
    copied in as ``console.log``.
    """
    d = DIAG_ROOT / name
    d.mkdir(parents=True, exist_ok=True)
    for label, cmd in [
        ("nft", "nft list ruleset"),
        ("docker", "docker ps -a"),
        ("ss", "ss -tlnp"),
        ("journal", "journalctl -b --no-pager"),
        ("critical-chain", "systemd-analyze critical-chain"),
    ]:
        # ConnectTimeout matches _boot_id: diagnostics run exactly when the VM
        # may be unreachable, and without it every probe would block on the
        # OS-level TCP connect timeout.
        r = sh(["ssh", "-o", "StrictHostKeyChecking=no",
                "-o", "UserKnownHostsFile=/dev/null",
                "-o", "ConnectTimeout=5",
                f"ansible@{ip}", "sudo " + cmd], check=False, capture=True)
        (d / f"{label}.txt").write_text((r.stdout or "") + (r.stderr or ""))
    console = CACHE_DIR / f"{name}-console.log"
    if console.exists():
        # The serial log is root:0600 (libvirt-created); read it via sudo (ADR-015: the
        # claude worker has sudo) and write a worker-owned copy into the bundle.
        r = sh(["sudo", "cat", str(console)], check=False, capture=True)
        (d / "console.log").write_text(r.stdout or "")
    print(f"diagnostics written to {d}", file=sys.stderr)
def _destroy(name):
    """Tear down libvirt domain ``name`` and delete its files.

    ``virsh destroy`` and ``virsh undefine --nvram`` are both best-effort
    (check=False). Afterwards every entry in RUN_DIR and CACHE_DIR whose
    name starts with ``name`` is unlinked.
    """
    for virsh_args in (["destroy", name], ["undefine", name, "--nvram"]):
        sh(["virsh", *virsh_args], check=False)
    pattern = f"{name}*"
    for base in (RUN_DIR, CACHE_DIR):
        for leftover in base.glob(pattern):
            leftover.unlink(missing_ok=True)
def down(host=None, keep=False):
    """Destroy the VM recorded in RUN_DIR/current, unless ``keep`` is set.

    ``host`` is accepted for CLI symmetry but not used here. When no
    ``current`` marker exists this is a silent no-op.
    """
    if keep:
        print("--keep: leaving the VM running for inspection")
        return
    marker = RUN_DIR / "current"
    if not marker.exists():
        return
    # The first line of the marker is the VM name.
    name = marker.read_text().splitlines()[0]
    _destroy(name)
    marker.unlink(missing_ok=True)
    print(f"destroyed {name}")
def prune():
    """Destroy every libvirt domain whose name starts with NAME_PREFIX.

    Lists all domains (running or not) with ``virsh list --all --name``,
    then removes the ``current`` marker.
    """
    listing = sh(["virsh", "list", "--all", "--name"], capture=True)
    for domain in listing.stdout.split():
        if not domain.startswith(NAME_PREFIX):
            continue
        _destroy(domain)
        print(f"pruned {domain}")
    (RUN_DIR / "current").unlink(missing_ok=True)
def console():
    """Print the serial console log of the current VM.

    The log in CACHE_DIR is read through ``sudo cat``. If the log file does
    not exist, a one-line notice is printed instead.

    Raises SystemExit with a readable message when no current VM is recorded
    (missing or empty ``current`` marker), rather than surfacing a raw
    FileNotFoundError/IndexError traceback.
    """
    marker = RUN_DIR / "current"
    lines = marker.read_text().splitlines() if marker.exists() else []
    if not lines:
        raise SystemExit(
            f"no current VM recorded at {marker}; run `integration-vm up --host <profile>` first")
    name = lines[0]
    log = CACHE_DIR / f"{name}-console.log"
    if log.exists():
        print(sh(["sudo", "cat", str(log)], check=False, capture=True).stdout or "")
    else:
        print(f"no console log at {log}")
def cycle(host, certs, keep=False, no_reboot=False):
    """Full run: up -> apply -> (reboot) -> assert, then tear down on success.

    On any failure the exception propagates and the VM is deliberately left
    in place for inspection. On success the VM is destroyed unless ``keep``.
    """
    succeeded = False
    try:
        up(host)
        apply(host, certs)
        if not no_reboot:
            reboot_vm()
        run_assert(host, certs)
        succeeded = True
    finally:
        if not succeeded:
            print("FAILED — VM left up for inspection; `integration-vm prune` to clean.",
                  file=sys.stderr)
        elif not keep:
            down(host)
# Sub-command name -> handler taking the parsed argparse namespace.
DISPATCH = {
    # up()'s return value is discarded so main() hands None to sys.exit().
    "up": lambda ns: (up(ns.host), None)[-1],
    "apply": lambda ns: apply(ns.host, ns.certs),
    "reboot": lambda ns: reboot_vm(),
    "assert": lambda ns: run_assert(ns.host, ns.certs),
    "down": lambda ns: down(ns.host, ns.keep),
    "console": lambda ns: console(),
    "prune": lambda ns: prune(),
    "cycle": lambda ns: cycle(ns.host, ns.certs, ns.keep, ns.no_reboot),
}
def main(argv=None):
    """Parse the ``integration-vm`` command line and run the chosen sub-command.

    Every sub-command except ``prune`` takes the same options: ``--host``
    (required), ``--certs``, ``--keep`` and ``--no-reboot``. Returns whatever
    the handler in DISPATCH returns.
    """
    parser = argparse.ArgumentParser(prog="integration-vm", description=__doc__)
    subparsers = parser.add_subparsers(dest="cmd", required=True)
    for command in ("up", "apply", "reboot", "assert", "cycle", "down", "console"):
        cmd_parser = subparsers.add_parser(command)
        cmd_parser.add_argument("--host", required=True)
        cmd_parser.add_argument("--certs", choices=VALID_TIERS, default="internal")
        cmd_parser.add_argument("--keep", action="store_true")
        cmd_parser.add_argument("--no-reboot", action="store_true")
    subparsers.add_parser("prune")
    namespace = parser.parse_args(argv)
    handler = DISPATCH[namespace.cmd]
    return handler(namespace)
if __name__ == "__main__": # pragma: no cover
sys.exit(main())

View file

@ -1,32 +0,0 @@
#!/usr/bin/env bash
#
# Log the local Docker daemon into the Forgejo container registry using a token stored in
# the Ansible vault — so registry pushes (make caddy-image-push / molecule-image-push) are
# agent-completable non-interactively, like every other vault-backed action.
# (2026-06-17 kaizen, docs/FRICTION.md: the push half silently needed an interactive
# `docker login`; the creds weren't in the vault, so an agent couldn't complete a push.)
#
# Reads vault.forgejo.registry_token from the vault (rbw must be unlocked) and pipes it to
# `docker login --password-stdin`. The token never lands on argv or on disk and is never
# echoed (no `set -x`). Binaries/paths are overridable via env so the Makefile can pass the
# venv ansible-vault/python; defaults work when run from the repo root with the venv present.
#
set -euo pipefail

# Overridable tool paths and registry coordinates (defaults assume the repo root).
ANSIBLE_VAULT="${ANSIBLE_VAULT:-.venv/bin/ansible-vault}"
PYTHON="${PYTHON:-.venv/bin/python}"
VAULT="${VAULT:-inventories/production/group_vars/all/vault.yml}"
REGISTRY_HOST="${REGISTRY_HOST:-forgejo.nyumbani.baobab.band}"
REGISTRY_USER="${REGISTRY_USER:-sjat}"

# Reads the decrypted vault YAML on stdin and prints vault.forgejo.registry_token
# (empty string when any level is missing), without a trailing newline.
extract_token='import sys, yaml; d = yaml.safe_load(sys.stdin) or {}; print((((d.get("vault") or {}).get("forgejo") or {}).get("registry_token")) or "", end="")'

# pipefail + errexit: a failed vault decrypt aborts the script here.
token="$("$ANSIBLE_VAULT" view "$VAULT" | "$PYTHON" -c "$extract_token")"

if [[ -z "$token" || "$token" == "CHANGEME" ]]; then
  printf '%s\n' \
    "registry-login: vault.forgejo.registry_token is unset or still CHANGEME." \
    "  Mint a Forgejo token (Settings -> Applications -> Generate Token, with package" \
    "  read+write scope, user $REGISTRY_USER) and set it via: make edit-vault" >&2
  exit 1
fi

# printf is a builtin, so the token reaches docker via stdin only (never argv).
printf '%s' "$token" | docker login "$REGISTRY_HOST" -u "$REGISTRY_USER" --password-stdin

View file

@ -41,42 +41,6 @@ LIST_ITEM_RE = re.compile(r"^\s*(\d+\.|[-*+])\s+(.*)")
DEFER_REF_RE = re.compile(r"ADR-(\d{3})\D{0,40}?deferred\D{0,12}?(\d+)", re.I) DEFER_REF_RE = re.compile(r"ADR-(\d{3})\D{0,40}?deferred\D{0,12}?(\d+)", re.I)
RESOLVE_WORD_RE = re.compile(r"\b(?:resolv\w*|decid\w*|address\w*|complet\w*|done)\b", re.I) RESOLVE_WORD_RE = re.compile(r"\b(?:resolv\w*|decid\w*|address\w*|complet\w*|done)\b", re.I)
# Rename-incomplete detection: an ADR announces a rename/supersession of a named
# term (Old → New); verify the OLD name no longer lingers in the design-doc set.
# (The structural cousin of stale-deferred — see docs/FRICTION.md, ADR-024.)
# A "specific" name is a backticked token or a capitalised proper-noun/identifier;
# common connective words are rejected so they can't be mistaken for a tool name.
_NAME = r"(?:`[^`]+`|[A-Z][A-Za-z0-9_+.-]{2,})"
RENAME_STOPWORDS = {
    "was", "were", "the", "this", "that", "with", "from", "into", "and", "but",
    "for", "are", "has", "had", "been", "now", "not", "all", "any", "use", "used",
    "via", "per", "its", "our", "one", "two", "old", "new", "phase", "step",
    "adr", "read", "name", "term", "tool", "prose", "roadmap",
}
# Trigger forms — each captures (old, new) as raw name tokens. Only the connective
# words are case-insensitive, via scoped inline flags `(?i:...)`. A pattern-wide
# re.I would also make `[A-Z]` in _NAME match lowercase letters, silently dropping
# the "capitalised" requirement; the names must still satisfy _NAME exactly.
RENAME_ASSERT_RES = (
    # renamed X to Y
    re.compile(rf"(?i:renamed)\s+(?:(?i:from)\s+)?({_NAME})\s+(?i:to)\s+({_NAME})"),
    # replaced X with Y
    re.compile(rf"(?i:replac\w*)\s+({_NAME})\s+(?i:with)\s+({_NAME})"),
    # superseded X with/by Y
    re.compile(rf"(?i:supersed\w*)\s+({_NAME})\s+(?i:with|by)\s+({_NAME})"),
    # X ... (is/are/was/were/been) updated to read Y
    re.compile(rf"({_NAME})\b.{{0,40}}?\b(?i:is|are|was|were|been)?\s*"
               rf"(?i:updated\s+to\s+read)\s+[\"']?({_NAME})"),
    # X → Y / X -> Y on a line that also carries a rename/supersede/update cue
    re.compile(rf"({_NAME})\s*(?:->|→)\s*({_NAME})"),
)
RENAME_ARROW_RES = (RENAME_ASSERT_RES[-1],)  # arrow forms need a cue word on the line
RENAME_CUE_RE = re.compile(r"\b(?:renam\w*|replac\w*|supersed\w*|updated|rename)\b", re.I)
# Historical / negation cues — a lingering OLD name on such a line is legitimate
# history, not a missed ripple edit, so it is skipped.
RENAME_HIST_RE = re.compile(
    r"\b(?:was|were|formerly|previously|no longer|instead of|rather than|reject\w*|"
    r"reconsider\w*|supersed\w*|deprecat\w*|legacy|history|heritage|V4|"
    r"actually ran|used to)\b", re.I)
# ADR-structure check (ADR-023): numbered ADRs must carry the four mandatory # ADR-structure check (ADR-023): numbered ADRs must carry the four mandatory
# sections and a parseable Status line. Presence only — section ORDER is a # sections and a parseable Status line. Presence only — section ORDER is a
# template-demonstrated convention, not machine-enforced. # template-demonstrated convention, not machine-enforced.
@ -178,84 +142,6 @@ def adr_structure_findings(adr_files):
return out return out
def _clean_name(tok):
"""Strip backticks/quotes from a captured name token. Return the bare name, or
None if it is not a 'specific' token (empty, multi-word, or a stopword)."""
s = tok.strip().strip("`\"'").strip()
s = s.rstrip(".,;:!?)") # trailing sentence punctuation is not part of the name
if not s or " " in s:
return None
if s.lower() in RENAME_STOPWORDS:
return None
# An ADR reference (ADR-017) is a document pointer, never the renamed *term* — a
# sentence like "the ADR-017 prose ... is updated to read Caddy" must not parse
# ADR-017 as the old name. Reject it so such lines skip (precision >> recall).
if re.fullmatch(r"ADR-\d{3}", s):
return None
# Must be backtick-able identifier or a capitalised proper noun (the _NAME shape
# already enforced this on capture; this is the after-stripping re-check).
if not re.fullmatch(r"[A-Za-z0-9_+.-]{3,}", s):
return None
return s
def _rename_assertion(line):
    """Extract a tight Old→New rename assertion from one ADR line.

    Tries each trigger pattern in order and returns the first ``(old, new)``
    pair of cleaned, distinct specific names, or None. Conservative by
    design: precision >> recall.
    """
    for pattern in RENAME_ASSERT_RES:
        hit = pattern.search(line)
        if hit is None:
            continue
        # Arrow forms are too generic on their own: require a cue word too.
        if pattern in RENAME_ARROW_RES and RENAME_CUE_RE.search(line) is None:
            continue
        old, new = (_clean_name(raw) for raw in hit.group(1, 2))
        if old and new and old != new:
            return old, new
    return None
def rename_incomplete_findings(adr_files, extra_docs):
    """Flag design-doc lines where a renamed term's OLD name still lingers.

    adr_files:  {rel_path: [lines]} for docs/decisions/*.md (the numbered ADRs
                make the assertions).
    extra_docs: {rel_path: [lines]} for CAPABILITIES.md / ROADMAP.md.

    When a numbered ADR announces a rename 'Old' -> 'New', every line in the
    design-doc set where 'Old' still appears as a whole word yields a finding.
    Skipped: the announcing ADR itself, lines that also contain 'New', and
    lines carrying a historical/negation cue.

    Returns a list of finding dicts (check, severity, path, line, detail).
    """
    out = []
    # The design-doc set we search: all decisions/*.md plus the two extra docs.
    doc_set = dict(adr_files)
    doc_set.update(extra_docs)
    # Collect assertions only from numbered ADRs (NNN-*.md).
    assertions = []  # (adr_num, announcer_path, old, new)
    for rpath, lines in sorted(adr_files.items()):
        base = os.path.basename(rpath)
        if not ADR_FILE_RE.match(base):
            continue
        adr_num = base[:3]
        for line in lines:
            parsed = _rename_assertion(line)
            if parsed:
                assertions.append((adr_num, rpath, parsed[0], parsed[1]))
    for adr_num, announcer, old, new in assertions:
        # Case-sensitive whole-word match. Lookarounds instead of \b: a name may
        # begin or end with a non-word character (".env", "C++"), and \b next to
        # such a character only matches when a word character is adjacent, so
        # the name would be missed in ordinary prose.
        old_re = re.compile(r"(?<!\w)" + re.escape(old) + r"(?!\w)")
        for rpath, lines in sorted(doc_set.items()):
            if rpath == announcer:  # the ADR that made the claim is exempt
                continue
            for i, raw in enumerate(lines, 1):
                if not old_re.search(raw):
                    continue
                if new in raw:  # rename is being explained on this line
                    continue
                if RENAME_HIST_RE.search(raw):  # legitimate history / negation
                    continue
                out.append({"check": "rename-incomplete", "severity": "medium",
                            "path": rpath, "line": i,
                            "detail": f"ADR-{adr_num} announced rename '{old}' -> "
                                      f"'{new}' but '{old}' still appears here; confirm the "
                                      "ripple edit landed or soften the ADR claim"})
    return out
def walk_files(): def walk_files():
for dirpath, dirnames, filenames in os.walk(ROOT): for dirpath, dirnames, filenames in os.walk(ROOT):
dirnames[:] = [d for d in dirnames if d not in PRUNE] dirnames[:] = [d for d in dirnames if d not in PRUNE]
@ -306,11 +192,8 @@ def scan():
findings = [] findings = []
adrs = adr_numbers() adrs = adr_numbers()
adr_files = {} # docs/decisions/*.md → lines, for deferred-section parsing adr_files = {} # docs/decisions/*.md → lines, for deferred-section parsing
extra_docs = {} # CAPABILITIES.md / ROADMAP.md → lines, for rename-incomplete
defer_refs = [] # repo-wide "resolves ADR-NNN deferred #K" references defer_refs = [] # repo-wide "resolves ADR-NNN deferred #K" references
decisions_dir = os.path.join("docs", "decisions") decisions_dir = os.path.join("docs", "decisions")
rename_extra = {os.path.join("docs", "CAPABILITIES.md"),
os.path.join("docs", "ROADMAP.md")}
for path in walk_files(): for path in walk_files():
rpath = rel(path) rpath = rel(path)
if rpath.startswith(SKIP_PREFIX): if rpath.startswith(SKIP_PREFIX):
@ -340,8 +223,6 @@ def scan():
if rpath.startswith(decisions_dir) and rpath.endswith(".md"): if rpath.startswith(decisions_dir) and rpath.endswith(".md"):
adr_files[rpath] = lines adr_files[rpath] = lines
if rpath in rename_extra:
extra_docs[rpath] = lines
for i, line in enumerate(lines, 1): for i, line in enumerate(lines, 1):
for m in DEFER_REF_RE.finditer(line): for m in DEFER_REF_RE.finditer(line):
@ -380,7 +261,6 @@ def scan():
"line": i, "detail": f"references '{ref}' which does not exist"}) "line": i, "detail": f"references '{ref}' which does not exist"})
findings.extend(deferred_findings(adr_files, defer_refs)) findings.extend(deferred_findings(adr_files, defer_refs))
findings.extend(adr_structure_findings(adr_files)) findings.extend(adr_structure_findings(adr_files))
findings.extend(rename_incomplete_findings(adr_files, extra_docs))
return findings return findings

View file

@ -11,7 +11,7 @@ module "askari" {
location = "hel1" # Helsinki location = "hel1" # Helsinki
image = "debian-13" image = "debian-13"
ansible_ssh_pubkey = var.ansible_ssh_pubkey ansible_ssh_pubkey = var.ansible_ssh_pubkey
ssh_admin_cidrs = ["91.226.145.80/32"] # TEMP (incident recovery 2026-06-17): re-open WAN :22 to ubongo only; re-close once the firewall/Docker + boot-race issues are fixed ssh_admin_cidrs = var.ssh_admin_cidrs
public_web = true # Caddy 80/443 + NetBird 3478 (M4) public_web = true # Caddy 80/443 + NetBird 3478 (M4)
labels = { labels = {
env = "offsite" env = "offsite"

View file

@ -26,18 +26,13 @@ resource "hcloud_ssh_key" "ansible" {
resource "hcloud_firewall" "this" { resource "hcloud_firewall" "this" {
name = "${var.name}-fw" name = "${var.name}-fw"
# SSH from the control node only and only when admin CIDRs are set. An empty # SSH from the control node only.
# ssh_admin_cidrs removes the WAN :22 rule entirely (mesh-only SSH; reach the host over rule {
# wt0, break-glass = Hetzner console). Mesh-hardening 1/3.
dynamic "rule" {
for_each = length(var.ssh_admin_cidrs) > 0 ? [1] : []
content {
direction = "in" direction = "in"
protocol = "tcp" protocol = "tcp"
port = "22" port = "22"
source_ips = var.ssh_admin_cidrs source_ips = var.ssh_admin_cidrs
} }
}
# Public web (Caddy 80/443) + NetBird STUN/TURN (3478/udp) only when public_web # Public web (Caddy 80/443) + NetBird STUN/TURN (3478/udp) only when public_web
# (ADR-024, M4). Host nftables stays catalog-driven (ADR-020). # (ADR-024, M4). Host nftables stays catalog-driven (ADR-020).

View file

@ -24,9 +24,8 @@ variable "ansible_ssh_pubkey" {
} }
variable "ssh_admin_cidrs" { variable "ssh_admin_cidrs" {
description = "Source CIDRs allowed to reach SSH over the WAN. Empty = no WAN SSH rule (mesh-only)." description = "Source CIDRs allowed to reach SSH (e.g. ubongo's address/32)"
type = list(string) type = list(string)
default = []
} }
variable "public_web" { variable "public_web" {

View file

@ -1,2 +0,0 @@
---
reverse_proxy__tls_internal: true

View file

@ -1,6 +0,0 @@
---
# On-demand only. Records an accepted risk (ADR-025 / accepted-risks.md): the prod
# Gandi PAT reaches an ephemeral VM and transient TXT records land in the real wingu.me.
reverse_proxy__tls_internal: false
reverse_proxy__acme_dns_provider: gandi
reverse_proxy__acme_ca: ""

View file

@ -1,4 +0,0 @@
---
reverse_proxy__tls_internal: false
reverse_proxy__acme_dns_provider: gandi
reverse_proxy__acme_ca: "https://acme-staging-v02.api.letsencrypt.org/directory"

View file

@ -1,13 +0,0 @@
---
# Integration-test overlay for the "askari" profile (ADR-025). Passed via `-e @`.
# Reproduces the 2026-06-17 incident: apply base's nftables default-deny to a Docker host.
integration_profile: askari
base__firewall_apply: true
# Keep a break-glass: sshd stays on all interfaces (never wt0-only in a throwaway VM).
base__ssh_listen_mesh_only: false
# The VM is isolated; it must never touch the real mesh.
base__mesh_enabled: false
# Allow SSH from the VM's libvirt-NAT gateway (where the driver/ansible connects from),
# so base's default-deny firewall + the reboot don't lock out the harness. By source IP,
# so it's interface-independent. Overrides askari's real control addr for the test only.
base__firewall_control_addr: "192.168.150.1"

View file

@ -1,17 +0,0 @@
---
# Integration overlay (ADR-025) — the askari mesh-hardening REDESIGN (2026-06-19).
# Validates INPUT-only default-deny on a Docker host: input policy drop, forward policy
# accept (Docker-safe), SSH via the admin-addr break-glass, reboot-survivable.
integration_profile: askari_inputonly
base__firewall_apply: true
base__firewall_input_only: true
# No sshd ListenAddress change — never wt0-only in a throwaway VM.
base__ssh_listen_mesh_only: false
# Isolated VM: never touch the real mesh.
base__mesh_enabled: false
# The non-mesh SSH break-glass = the admin-addr path the real design uses. Point it at the
# VM's libvirt-NAT gateway (where the harness connects from), by source IP so it is
# interface-independent and the default-deny + reboot don't lock out the driver. This
# mirrors askari's real base__firewall_admin_addrs (ubongo's WAN) in the test topology.
base__firewall_admin_addrs:
- 192.168.150.1

View file

@ -1,18 +0,0 @@
---
# Integration-test overlay for the "ubongo" profile (ADR-025). Passed via `-e @`.
# Exercises mesh-hardening 2/3: base's INPUT-only default-deny on the control node — input
# chain default-deny, forward chain left permissive (Docker/libvirt-NAT safe), no sshd
# ListenAddress change (so no boot-race).
integration_profile: ubongo
base__firewall_apply: true
base__firewall_input_only: true # forward chain renders `policy accept`
base__firewall_admin_addrs:
- "192.168.150.98" # two representative LAN sources — exercises the
- "192.168.150.99" # admin-addr loop with a multi-entry list (like ubongo)
# Never wt0-only; never touch the real mesh from a throwaway VM.
base__ssh_listen_mesh_only: false
base__mesh_enabled: false
# Allow SSH from the libvirt-NAT gateway (where the driver/ansible connect from) so the
# default-deny apply + the reboot don't lock out the harness. By source IP (interface-
# independent). This is the harness's lifeline; the admin-addr above is only exercised.
base__firewall_control_addr: "192.168.150.1"

View file

@ -1,10 +0,0 @@
{
"groups": ["offsite_hosts"],
"applies": [
{"playbook": "site.yml", "tags": ["base"]},
{"playbook": "offsite.yml", "tags": ["docker_host", "reverse_proxy"]}
],
"extra_vars_files": ["overrides/askari.yml"],
"mem_mib": 3072,
"vcpus": 2
}

View file

@ -1,10 +0,0 @@
{
"groups": ["offsite_hosts"],
"applies": [
{"playbook": "site.yml", "tags": ["base"]},
{"playbook": "offsite.yml", "tags": ["docker_host", "reverse_proxy"]}
],
"extra_vars_files": ["overrides/askari_inputonly.yml"],
"mem_mib": 3072,
"vcpus": 2
}

View file

@ -1,9 +0,0 @@
{
"groups": ["control"],
"applies": [
{"playbook": "site.yml", "tags": ["base"]}
],
"extra_vars_files": ["overrides/ubongo.yml"],
"mem_mib": 2048,
"vcpus": 2
}

View file

@ -1,129 +0,0 @@
---
# Integration verify (ADR-025). Outcome-based, profile-aware: the active profile is named by
# `integration_profile` (set in each profile's overlay). Each profile asserts its own success
# criteria; an unknown/unset profile fails loudly (never a silent pass).
- name: Verify the rebooted host
hosts: all
become: true
gather_facts: false
tasks:
- name: A known integration_profile must be set (no silent pass)
ansible.builtin.assert:
that:
- integration_profile is defined
- integration_profile in ['askari', 'askari_inputonly', 'ubongo']
fail_msg: "integration_profile must be set in the profile overlay (askari|askari_inputonly|ubongo)"
# ── askari profile — Docker host: published-port forwarding survives the reboot ──
# The load-bearing check probes the VM's published :80 FROM the controller (ubongo) — if
# base's forward-drop killed DNAT, this times out (the FRICTION 2026-06-17 #1 bug).
- name: (askari) Gather service facts
when: integration_profile == 'askari'
ansible.builtin.service_facts:
- name: (askari) Docker daemon is active
when: integration_profile == 'askari'
ansible.builtin.assert:
that: "ansible_facts.services['docker.service'].state == 'running'"
fail_msg: "docker.service is not running"
- name: (askari) Forward chain permits container traffic (drop-in loaded)
when: integration_profile == 'askari'
ansible.builtin.command: nft list chain inet filter forward
register: _fwd
changed_when: false
- name: (askari) Assert container forwarding is allowed (not pure drop)
when: integration_profile == 'askari'
ansible.builtin.assert:
that: "'accept' in _fwd.stdout"
fail_msg: >-
forward chain is pure drop — container forwarding will die on reboot
(FRICTION 2026-06-17 #1). docker_host container-forward drop-in missing.
- name: (askari) Published port answers from the controller (DNAT + forward alive)
when: integration_profile == 'askari'
delegate_to: localhost
become: false
ansible.builtin.uri:
# Probe :80 (plain HTTP) — any answer proves the published-port DNAT + forward path
# is alive. Don't follow caddy's HTTP->HTTPS redirect (its `tls internal` has no
# cert for a bare-IP HTTPS request); the 308 itself proves the path works.
url: "http://{{ ansible_host }}/"
follow_redirects: none
status_code: [200, 301, 308, 404, 502, 503]
timeout: 10
register: _probe
retries: 5
delay: 6
until: _probe is succeeded
# ── ubongo profile — control node: INPUT-only default-deny survives the reboot ──
# SSH reachability across the reboot is proven by the harness itself (it re-SSHes and
# checks boot_id changed before this verify runs). Here we assert the ruleset shape.
- name: (ubongo) Read the live nftables ruleset
when: integration_profile == 'ubongo'
ansible.builtin.command: nft list ruleset
register: _nft
changed_when: false
- name: (ubongo) INPUT default-deny, forward permissive, lifeline + admin-addr allow
when: integration_profile == 'ubongo'
ansible.builtin.assert:
that:
# live `nft list ruleset` prints the SYMBOLIC priority (`filter` = 0), unlike the
# rendered /etc/nftables.conf (`priority 0`) that the Molecule scenario asserts against.
- "'hook input priority filter; policy drop;' in _nft.stdout"
- "'hook forward priority filter; policy accept;' in _nft.stdout"
# the ssh-from-control lifeline (base__firewall_control_addr) — the reconnect path
- "'ip saddr 192.168.150.1 tcp dport 22 accept' in _nft.stdout"
- "'ip saddr 192.168.150.98 tcp dport 22 accept' in _nft.stdout"
- "'ip saddr 192.168.150.99 tcp dport 22 accept' in _nft.stdout"
fail_msg: >-
ubongo profile: expected input policy drop, forward policy accept (input-only),
the ssh-from-control lifeline (192.168.150.1), and both admin-addr
(192.168.150.98/99) SSH allows in the live ruleset.
# ── askari_inputonly profile — the mesh-hardening REDESIGN (2026-06-19) ──
# INPUT-only default-deny on a Docker host: input policy drop, forward policy ACCEPT
# (Docker-safe), SSH via the admin-addr break-glass, published-port DNAT survives reboot.
- name: (askari_inputonly) Read the live nftables ruleset
when: integration_profile == 'askari_inputonly'
ansible.builtin.command: nft list ruleset
register: _nft_io
changed_when: false
- name: (askari_inputonly) INPUT default-deny, forward permissive, admin-addr break-glass
when: integration_profile == 'askari_inputonly'
ansible.builtin.assert:
that:
- "'hook input priority filter; policy drop;' in _nft_io.stdout"
- "'hook forward priority filter; policy accept;' in _nft_io.stdout"
- "'ip saddr 192.168.150.1 tcp dport 22 accept' in _nft_io.stdout"
fail_msg: >-
askari_inputonly: expected input policy drop, forward policy accept (input-only),
and the admin-addr break-glass (192.168.150.1) SSH allow in the live ruleset.
- name: (askari_inputonly) Gather service facts
when: integration_profile == 'askari_inputonly'
ansible.builtin.service_facts:
- name: (askari_inputonly) Docker daemon is active
when: integration_profile == 'askari_inputonly'
ansible.builtin.assert:
that: "ansible_facts.services['docker.service'].state == 'running'"
fail_msg: "docker.service is not running"
- name: (askari_inputonly) Published port answers from the controller (DNAT + forward alive)
when: integration_profile == 'askari_inputonly'
delegate_to: localhost
become: false
ansible.builtin.uri:
url: "http://{{ ansible_host }}/"
follow_redirects: none
status_code: [200, 301, 308, 404, 502, 503]
timeout: 10
register: _probe_io
retries: 5
delay: 6
until: _probe_io is succeeded

View file

@ -97,12 +97,3 @@ def test_ingress_missing_port_raises():
cat = {"svc": {"host": "docker01", "ingress": [{"from": "lan"}]}} cat = {"svc": {"host": "docker01", "ingress": [{"from": "lan"}]}}
with pytest.raises(ValueError): with pytest.raises(ValueError):
fr.resolve_firewall_rules(cat, ZONES, "docker01", HOSTVARS, GROUPS) fr.resolve_firewall_rules(cat, ZONES, "docker01", HOSTVARS, GROUPS)
def test_public_zone_resolves_to_anywhere():
    """An ingress from the 'public' zone resolves to that zone's CIDR."""
    ingress = {"from": "public", "port": 443, "proto": "tcp"}
    resolved = fr.resolve_firewall_rules(
        {"web": {"host": "askari", "ingress": [ingress]}},
        {"public": "0.0.0.0/0"},
        "askari",
        {"askari": {"ansible_host": "100.99.226.39"}},
        {},
    )
    assert resolved == [{"proto": "tcp", "port": 443, "sources": ["0.0.0.0/0"]}]

View file

@ -123,8 +123,5 @@ def test_nudge_line_overdue_on_age():
def test_load_signals_reads_real_friction_file(): def test_load_signals_reads_real_friction_file():
path = os.path.join(os.path.dirname(__file__), "..", "docs", "FRICTION.md") path = os.path.join(os.path.dirname(__file__), "..", "docs", "FRICTION.md")
sigs = fs.load_signals(path, TODAY) sigs = fs.load_signals(path, TODAY)
# May legitimately be empty right after a /kaizen pass consumes every open signal — assert len(sigs) >= 1
# an empty Open-signals section is the goal state, not a failure. Assert the function
# parses the real file into well-formed signals (validity holds vacuously when empty).
assert isinstance(sigs, list)
assert all(s["tag"] in {"friction", "gotcha", "recurring", "unused"} for s in sigs) assert all(s["tag"] in {"friction", "gotcha", "recurring", "unused"} for s in sigs)

View file

@ -1,106 +0,0 @@
import importlib.util
import pathlib
import types
import pytest
_PATH = pathlib.Path(__file__).resolve().parent.parent / "scripts" / "integration-vm.py"
_spec = importlib.util.spec_from_file_location("integration_vm", _PATH)
ivm = importlib.util.module_from_spec(_spec)
_spec.loader.exec_module(ivm)
def test_valid_tiers():
assert ivm.VALID_TIERS == ("internal", "le-staging", "le-prod-wildcard")
def test_vm_name_prefix_and_suffix():
assert ivm.vm_name("askari", "ab12cd34") == "boma-it-askari-ab12cd34"
def test_vm_name_generates_suffix():
    """Without an explicit suffix, vm_name appends a generated 8-character one."""
    generated = ivm.vm_name("askari")
    suffix = generated.split("-")[-1]
    assert generated.startswith("boma-it-askari-") and len(suffix) == 8
def test_free_mib_parses_memavailable():
sample = "MemTotal: 16331156 kB\nMemAvailable: 8388608 kB\n"
assert ivm.free_mib(sample) == 8192
def test_parse_lease_ip_extracts_ipv4():
out = (" Name MAC address Protocol Address\n"
"-------------------------------------------------------------------\n"
" vnet0 52:54:00:aa:bb:cc ipv4 192.168.150.42/24\n")
assert ivm.parse_lease_ip(out) == "192.168.150.42"
def test_parse_lease_ip_none_when_absent():
assert ivm.parse_lease_ip("no leases\n") is None
def test_parse_lease_ip_format_is_source_agnostic():
# virsh domifaddr --source arp output format is identical to --source lease;
# this test only proves the regex is format-agnostic (both sources produce the
# same table). The behavioral arp-fallback in wait_for_ip is covered by
# test_wait_for_ip_falls_back_to_arp below.
out = (" Name MAC address Protocol Address\n"
"-------------------------------------------------------------------\n"
" vnet0 52:54:00:de:ad:be ipv4 192.168.150.73/24\n")
assert ivm.parse_lease_ip(out) == "192.168.150.73"
def test_wait_for_ip_falls_back_to_arp(monkeypatch):
# wait_for_ip polls virsh domifaddr with --source lease first, then --source arp.
# Simulate lease returning empty (no DHCP lease yet) and arp returning a real address.
arp_out = (" Name MAC address Protocol Address\n"
"-------------------------------------------------------------------\n"
" vnet0 52:54:00:aa:bb:cc ipv4 192.168.150.142/24\n")
def fake_sh(cmd, **kwargs):
if "arp" in cmd:
return types.SimpleNamespace(stdout=arp_out)
return types.SimpleNamespace(stdout="")
monkeypatch.setattr(ivm, "sh", fake_sh)
monkeypatch.setattr(ivm.time, "sleep", lambda _: None)
assert ivm.wait_for_ip("dummy") == "192.168.150.142"
def test_meta_data_has_instance_and_hostname():
md = ivm.render_meta_data("iid-askari-x", "boma-it-askari-x")
assert "instance-id: iid-askari-x" in md
assert "local-hostname: boma-it-askari-x" in md
def test_user_data_injects_key_and_ansible_user():
ud = ivm.render_user_data("ssh-ed25519 AAAA... claude@ubongo", "ansible")
assert ud.startswith("#cloud-config")
assert "name: ansible" in ud
assert "ssh-ed25519 AAAA... claude@ubongo" in ud
assert "NOPASSWD:ALL" in ud
def test_cert_file_valid_tier():
p = ivm.cert_file("le-staging")
assert p.name == "le-staging.yml" and p.parent.name == "certs"
def test_cert_file_rejects_bad_tier():
with pytest.raises(ValueError):
ivm.cert_file("bogus")
def test_render_run_hosts_single_host_in_groups():
out = ivm.render_run_hosts("boma-it-askari-x", "192.168.150.42",
"ansible", ["offsite_hosts"])
assert "offsite_hosts:" in out
assert "boma-it-askari-x:" in out
assert "ansible_host: 192.168.150.42" in out
assert "ansible_user: ansible" in out
assert "askari:" not in out.replace("boma-it-askari-x:", "")
def test_free_mib_returns_zero_when_absent():
assert ivm.free_mib("MemTotal: 16384 kB\n") == 0
def test_render_run_hosts_multiple_groups():
out = ivm.render_run_hosts("boma-it-x-1", "192.168.150.5", "ansible",
["offsite_hosts", "docker_hosts"])
assert "offsite_hosts:" in out
assert "docker_hosts:" in out
def test_render_run_hosts_dedups_groups():
out = ivm.render_run_hosts("boma-it-x-1", "192.168.150.5", "ansible",
["docker_hosts", "docker_hosts"])
assert out.count("docker_hosts:") == 1

View file

@ -57,99 +57,3 @@ def test_non_numbered_file_is_skipped():
bare = ["# ADR template\n", "\n", "## Status\n", "\n", "<!-- hint -->\n"] bare = ["# ADR template\n", "\n", "## Status\n", "\n", "<!-- hint -->\n"]
out = _checks(rs.adr_structure_findings({"docs/decisions/adr-template.md": bare})) out = _checks(rs.adr_structure_findings({"docs/decisions/adr-template.md": bare}))
assert out == [] assert out == []
# --- rename-incomplete -------------------------------------------------------
def _renames(findings):
return [f for f in findings if f["check"] == "rename-incomplete"]
def test_rename_incomplete_flags_lingering_old_name():
# ADR announces `Foo` -> `Bar`; another decisions file still says Foo present-tense.
announcer = {"docs/decisions/050-rename.md": [
"## Decision\n", "We renamed `Foo` to `Bar` across the design docs.\n"]}
other = {} # extra_docs (CAPABILITIES/ROADMAP) — none here
lingering = {"docs/decisions/030-other.md": [
"The Foo proxy renders config from the catalog.\n"]}
announcer.update(lingering)
out = _renames(rs.rename_incomplete_findings(announcer, other))
assert len(out) == 1
assert out[0]["path"] == "docs/decisions/030-other.md"
assert out[0]["line"] == 1
assert out[0]["severity"] == "medium"
assert "Foo" in out[0]["detail"] and "Bar" in out[0]["detail"]
def test_rename_incomplete_clean_rename_has_no_findings():
    """No rename-incomplete finding when no other doc still mentions the old name."""
    # The rename announced, and no other doc still mentions Foo.
    adr_files = {
        "docs/decisions/050-rename.md": [
            "## Decision\n", "We renamed `Foo` to `Bar` across the design docs.\n"],
        "docs/decisions/030-other.md": [
            "The Bar proxy renders config from the catalog.\n"],
    }
    out = _renames(rs.rename_incomplete_findings(adr_files, {}))
    assert out == []
def test_rename_incomplete_skips_historical_cue_line():
    """Old-name mentions on lines with a historical/negation cue are not flagged."""
    # Foo lingers only on a line carrying a historical/negation cue → no finding.
    adr_files = {
        "docs/decisions/050-rename.md": [
            "## Decision\n", "We renamed `Foo` to `Bar` across the design docs.\n"],
        "docs/decisions/030-other.md": [
            "Foo was rejected; we run Bar now.\n",
            "The history of Foo informs the choice.\n"],
    }
    out = _renames(rs.rename_incomplete_findings(adr_files, {}))
    assert out == []
def test_rename_incomplete_skips_announcing_adr_itself():
    """The ADR that announces the rename is never reported against itself."""
    # The announcing ADR mentions Foo (it has to) — must not flag itself.
    adr_files = {
        "docs/decisions/050-rename.md": [
            "## Decision\n",
            "We renamed `Foo` to `Bar`.\n",
            "Operators who configured Foo should switch their habits.\n"],
    }
    out = _renames(rs.rename_incomplete_findings(adr_files, {}))
    assert out == []
def test_rename_incomplete_skips_line_naming_new_term():
    """A line that names both the old and the new term is not flagged."""
    # A line that mentions both Foo and Bar is explaining the rename → skipped.
    adr_files = {
        "docs/decisions/050-rename.md": [
            "## Decision\n", "We renamed `Foo` to `Bar`.\n"],
        "docs/decisions/030-other.md": [
            "Foo is being phased out for Bar in this paragraph.\n"],
    }
    out = _renames(rs.rename_incomplete_findings(adr_files, {}))
    assert out == []
def test_rename_incomplete_searches_extra_docs():
    """Files passed as the second (extra docs) argument are scanned for the old name."""
    # A lingering OLD name in CAPABILITIES.md (an extra_docs file) is flagged.
    adr_files = {
        "docs/decisions/050-rename.md": [
            "## Decision\n", "We renamed `Foo` to `Bar`.\n"],
    }
    extra = {"docs/CAPABILITIES.md": ["The Foo proxy is what we deploy.\n"]}
    out = _renames(rs.rename_incomplete_findings(adr_files, extra))
    assert len(out) == 1
    assert out[0]["path"] == "docs/CAPABILITIES.md"
def test_rename_incomplete_ignores_ambiguous_adr_pointer_assertion():
    """An "ADR-NNN prose ... is updated to read X" sentence produces no finding."""
    # "the ADR-017 prose ... is updated to read Caddy" must NOT parse ADR-017 as the
    # old name (it is a doc pointer). With ADR-017 rejected, no assertion → no finding,
    # even though 'ADR-017' appears in many other docs.
    adr_files = {
        "docs/decisions/024-reverse-proxy.md": [
            "## Consequences\n",
            '- ADR-017 prose that mentioned Traefik is updated to read "Caddy".\n'],
        "docs/decisions/008-testing.md": [
            "Level 4 UI verification follows ADR-017.\n"],
    }
    out = _renames(rs.rename_incomplete_findings(adr_files, {}))
    assert out == []