Compare commits
No commits in common. "main" and "feat/m5-mesh-enrollment" have entirely different histories.
main
...
feat/m5-me
95 changed files with 225 additions and 6663 deletions
|
|
@ -6,7 +6,6 @@ exclude_paths:
|
|||
- .venv/
|
||||
- .collections/
|
||||
- .scaffold/
|
||||
- tests/integration/.run/ # transient harness run dir (gitignored, generated)
|
||||
- "**/vault.yml" # ansible-vault encrypted — not lintable YAML
|
||||
|
||||
# Warn only (don't fail) on these rules during initial setup
|
||||
|
|
|
|||
|
|
@ -6,12 +6,7 @@
|
|||
# 1. The execution-mode menu — writing-plans / subagent-driven-development script a
|
||||
# "Subagent-Driven vs Inline Execution — which approach?" menu at the plan→execution
|
||||
# handoff. boma's standing preference is to NEVER present it and proceed
|
||||
# subagent-driven. (Recorded by the 2026-06-10 kaizen review; the 2026-06-17 review
|
||||
# widened the matcher to also catch free-form *prose* re-asks of the same choice —
|
||||
# e.g. "which execution approach?" — which the literal-menu matcher missed. The
|
||||
# sibling push-vs-not re-ask is deliberately NOT hooked: a genuine "should I push?"
|
||||
# is sometimes legitimate, so it stays a soft default via the
|
||||
# dont-reask-settled-defaults memory rather than a hard block.)
|
||||
# subagent-driven. (Recorded by the 2026-06-10 kaizen review.)
|
||||
# 2. The brainstorming spec-review gate — the brainstorming skill scripts "Spec written
|
||||
# and committed … please review it before … the implementation plan." The standing
|
||||
# agreement is to move directly from the committed spec to writing-plans. (Recorded
|
||||
|
|
@ -44,11 +39,7 @@ text=$(jq -rs '
|
|||
low="${text,,}"
|
||||
|
||||
if [[ "$low" == *"inline execution"* \
|
||||
&& ( "$low" == *"which approach"* || "$low" == *"two execution options"* ) ]] \
|
||||
|| [[ "$low" == *"subagent-driven or inline"* || "$low" == *"inline or subagent"* ]] \
|
||||
|| [[ "$low" == *"subagent-driven vs inline"* || "$low" == *"subagent vs inline"* \
|
||||
|| "$low" == *"inline vs subagent"* ]] \
|
||||
|| [[ "$low" == *"execution approach"* && "$low" == *"?"* ]]; then
|
||||
&& ( "$low" == *"which approach"* || "$low" == *"two execution options"* ) ]]; then
|
||||
cat <<'JSON'
|
||||
{"decision":"block","reason":"Execution-mode menu detected in your final message. boma standing preference (docs/FRICTION.md + always-subagent-driven-execution memory): never present the subagent-driven-vs-inline menu. Drop the menu and proceed with subagent-driven execution directly (superpowers:subagent-driven-development)."}
|
||||
JSON
|
||||
|
|
|
|||
|
|
@ -1,16 +1,12 @@
|
|||
#!/usr/bin/env bash
|
||||
#
|
||||
# PreToolUse guard (Bash): block `git commit` ONLY when the rbw vault agent is locked
|
||||
# AND the commit would actually need the vault. The pre-commit ansible-lint hook decrypts
|
||||
# vault.yml via rbw — but it is scoped (`files: ^(roles|playbooks|inventories)/.*\.ya?ml$`,
|
||||
# always_run:false), so a docs-/config-only commit never triggers it and needs no vault.
|
||||
# (2026-06-17 kaizen, docs/FRICTION.md: the old guard blocked *every* locked commit, so a
|
||||
# docs-only commit got snagged needing a vault password it never uses.)
|
||||
# PreToolUse guard (Bash): block `git commit` when the rbw vault agent is locked.
|
||||
# The pre-commit ansible-lint hook decrypts vault.yml via rbw, so a commit while
|
||||
# locked fails deep with a confusing error. This catches it early with a clear fix.
|
||||
#
|
||||
# Fails OPEN: blocks only on a definitive "Ansible content staged AND rbw locked" signal.
|
||||
# rbw missing, not a plain `git commit`, `--no-verify`, or no Ansible content staged → allow.
|
||||
# When unsure it errs toward blocking (asking for an unlock is cheap; a deep pre-commit
|
||||
# failure is not).
|
||||
# Fails OPEN: only blocks on a definitive "rbw present AND not unlocked" signal.
|
||||
# If rbw is missing, the command isn't a plain `git commit`, or `--no-verify` is
|
||||
# used, the action is allowed.
|
||||
#
|
||||
set -uo pipefail
|
||||
|
||||
|
|
@ -26,25 +22,14 @@ case "$cmd" in
|
|||
esac
|
||||
|
||||
command -v rbw >/dev/null 2>&1 || exit 0 # rbw not installed — allow
|
||||
rbw unlocked >/dev/null 2>&1 && exit 0 # unlocked — allow
|
||||
|
||||
# rbw is LOCKED. Only block if this commit would run the vault-decrypting ansible-lint
|
||||
# hook — i.e. staged content matches its `files:` scope. Mirror that regex exactly.
|
||||
ANSIBLE_RE='^(roles|playbooks|inventories)/.*\.ya?ml$'
|
||||
if rbw unlocked >/dev/null 2>&1; then
|
||||
exit 0 # unlocked — allow
|
||||
fi
|
||||
|
||||
cd "${CLAUDE_PROJECT_DIR:-.}" 2>/dev/null || exit 0
|
||||
files=$(git diff --cached --name-only 2>/dev/null) || exit 0
|
||||
# `git commit -a/--all` also sweeps in modified tracked files that aren't staged yet.
|
||||
# (Substring match — errs toward including them, which only ever over-blocks. Safe.)
|
||||
case " $cmd " in
|
||||
*" -a"*|*"--all"*) files="$files"$'\n'"$(git diff --name-only 2>/dev/null)" ;;
|
||||
esac
|
||||
|
||||
# No Ansible content in the fileset → ansible-lint hook won't run → no vault needed → allow.
|
||||
printf '%s\n' "$files" | grep -Eq "$ANSIBLE_RE" || exit 0
|
||||
|
||||
# Ansible content staged AND rbw locked — the commit would fail deep in pre-commit. Block.
|
||||
# rbw present but not unlocked (locked or agent not running) — the commit would
|
||||
# fail in the pre-commit hook, so block early with guidance.
|
||||
cat <<'JSON'
|
||||
{"hookSpecificOutput":{"hookEventName":"PreToolUse","permissionDecision":"deny","permissionDecisionReason":"rbw is locked and this commit stages Ansible content — the pre-commit ansible-lint hook needs the vault password to decrypt vault.yml. Run: rbw unlock (docs-/config-only commits are exempt and won't hit this guard.)"}}
|
||||
{"hookSpecificOutput":{"hookEventName":"PreToolUse","permissionDecision":"deny","permissionDecisionReason":"rbw is locked — the pre-commit ansible-lint hook needs the vault password to decrypt vault.yml. Run: rbw unlock"}}
|
||||
JSON
|
||||
exit 0
|
||||
|
|
|
|||
|
|
@ -69,10 +69,5 @@
|
|||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"statusLine": {
|
||||
"type": "command",
|
||||
"command": "bash \"${CLAUDE_PROJECT_DIR:-.}/.claude/statusline.sh\"",
|
||||
"padding": 0
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,63 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
#
|
||||
# Claude Code statusLine — shows working dir, model, and context-window usage.
|
||||
# Wired via .claude/settings.json (statusLine.command). Receives the statusLine
|
||||
# JSON on stdin; first stdout line is rendered (ANSI colour supported).
|
||||
#
|
||||
# Context usage comes straight from the input JSON — no transcript parsing:
|
||||
# .context_window.used_percentage pre-calculated % of the window in use (input side)
|
||||
# .context_window.context_window_size window size in tokens (1000000 for the 1M models)
|
||||
# verified: Claude Code statusLine schema · code.claude.com/docs/en/statusline · 2026-06-17
|
||||
#
|
||||
# Fails soft: any parse problem prints nothing and exits 0 (never breaks the prompt).
|
||||
set -uo pipefail
|
||||
|
||||
input=$(cat 2>/dev/null) || exit 0
|
||||
command -v jq >/dev/null 2>&1 || exit 0
|
||||
|
||||
# pct<TAB>window<TAB>dir-basename<TAB>model-name (used_percentage preferred,
|
||||
# else derived from current_usage, else 0). @tsv keeps spaces in the dir safe.
|
||||
parsed=$(printf '%s' "$input" | jq -r '
|
||||
(.workspace.current_dir // .cwd // "" | sub(".*/"; "")) as $dir
|
||||
| (.model.display_name // "?") as $model
|
||||
| (.context_window.context_window_size // 200000) as $win
|
||||
| (
|
||||
if (.context_window.used_percentage // null) != null then
|
||||
.context_window.used_percentage
|
||||
elif (.context_window.current_usage // null) != null then
|
||||
((.context_window.current_usage.input_tokens
|
||||
+ (.context_window.current_usage.cache_creation_input_tokens // 0)
|
||||
+ (.context_window.current_usage.cache_read_input_tokens // 0)) / $win * 100)
|
||||
else 0 end | floor
|
||||
) as $pct
|
||||
| [$pct, $win, $dir, $model] | @tsv
|
||||
' 2>/dev/null) || exit 0
|
||||
[ -z "$parsed" ] && exit 0
|
||||
|
||||
IFS=$'\t' read -r pct win dir model <<<"$parsed"
|
||||
|
||||
# Human window label: 1000000 -> 1M, 200000 -> 200k, else Nk.
|
||||
case "$win" in
|
||||
1000000) wlabel="1M" ;;
|
||||
*) wlabel="$((win / 1000))k" ;;
|
||||
esac
|
||||
|
||||
# Colour the bar/percentage by pressure: green <70, yellow 70–89, red >=90.
|
||||
if [ "$pct" -ge 90 ]; then col=$'\033[31m' # red
|
||||
elif [ "$pct" -ge 70 ]; then col=$'\033[33m' # yellow
|
||||
else col=$'\033[32m' # green
|
||||
fi
|
||||
dim=$'\033[2m'; rst=$'\033[0m'
|
||||
|
||||
# 10-cell bar; clamp fill to [0,10] so an over-100 reading can't overflow.
|
||||
filled=$((pct / 10)); [ "$filled" -gt 10 ] && filled=10; [ "$filled" -lt 0 ] && filled=0
|
||||
bar=""
|
||||
for ((i = 0; i < 10; i++)); do
|
||||
if [ "$i" -lt "$filled" ]; then bar+="█"; else bar+="░"; fi
|
||||
done
|
||||
|
||||
printf '%s%s%s · %s · %s%s %d%%%s %sctx/%s%s\n' \
|
||||
"$dim" "$dir" "$rst" \
|
||||
"$model" \
|
||||
"$col" "$bar" "$pct" "$rst" \
|
||||
"$dim" "$wlabel" "$rst"
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
|
|
@ -34,6 +34,3 @@ terraform/**/terraform.tfvars
|
|||
|
||||
# Service-UI verification screenshots (kept locally on ubongo, not committed — ADR-017)
|
||||
.verify-runs/
|
||||
|
||||
# Integration-test transient run dir (ADR-025); diagnostics live under ~/integration-runs
|
||||
tests/integration/.run/
|
||||
|
|
|
|||
|
|
@ -24,5 +24,4 @@ ignore: |
|
|||
.venv/
|
||||
.collections/
|
||||
.scaffold/
|
||||
tests/integration/.run/
|
||||
**/vault.yml
|
||||
|
|
|
|||
|
|
@ -43,8 +43,6 @@ Full design rationale: `docs/decisions/`
|
|||
| Terraform plan | `make tf-plan [TF_ENV=staging]` |
|
||||
| Terraform apply | `make tf-apply [TF_ENV=staging]` |
|
||||
| Regenerate Ansible inventory | `make tf-inventory TF_ENV=<staging\|production>` |
|
||||
| Integration-test a host on a local VM | `make test-integration HOST=<name> [CERTS=…]` |
|
||||
| Clean up integration test VMs | `make test-integration-clean` |
|
||||
|
||||
**Always `tf-plan` before `tf-apply`. Always `check` before `deploy`. Never skip lint.**
|
||||
|
||||
|
|
@ -258,10 +256,7 @@ Single-contributor, trunk-based (no merge requests / approval gates):
|
|||
| Backup & disaster recovery | `docs/decisions/022-backup.md` |
|
||||
| ADR structure & lifecycle | `docs/decisions/023-adr-structure.md` |
|
||||
| Reverse proxy (Caddy) | `docs/decisions/024-reverse-proxy.md` |
|
||||
| Local VM integration testing (ADR-025) | `docs/decisions/025-local-vm-integration-testing.md` |
|
||||
| Integration testing runbook | `docs/runbooks/integration-testing.md` |
|
||||
| Adding a new role | `docs/runbooks/new-role.md` |
|
||||
| Adding a new host | `docs/runbooks/new-host.md` |
|
||||
| Enrolling a NetBird client (laptop/phone) | `docs/runbooks/netbird-client.md` |
|
||||
| Rotating vault secrets | `docs/runbooks/rotate-secrets.md` |
|
||||
| Claude Code setup (per machine) | `docs/runbooks/claude-code-setup.md` |
|
||||
|
|
|
|||
38
Makefile
38
Makefile
|
|
@ -23,11 +23,6 @@ MOLECULE_DOCKERFILE := .docker/molecule-debian13/Dockerfile
|
|||
# (the Go module proxy 403s Hetzner IPs); push the pinned tag to the Forgejo registry.
|
||||
CADDY_IMAGE := forgejo.nyumbani.baobab.band/sjat/caddy-gandi:2.11.4
|
||||
CADDY_DOCKERFILE := .docker/caddy-gandi/Dockerfile
|
||||
# Forgejo container registry (same host/user as the image tags above). `make registry-login`
|
||||
# logs the Docker daemon in using vault.forgejo.registry_token (2026-06-17 kaizen) so image
|
||||
# pushes are agent-completable non-interactively.
|
||||
REGISTRY_HOST := forgejo.nyumbani.baobab.band
|
||||
REGISTRY_USER := sjat
|
||||
|
||||
# For TF_ENV=offsite, source the Hetzner token from the vault into the environment
|
||||
# (rbw must be unlocked). Read in-memory; never written to a tfvars file (CLAUDE.md).
|
||||
|
|
@ -39,11 +34,10 @@ endif
|
|||
|
||||
.DEFAULT_GOAL := help
|
||||
|
||||
.PHONY: help setup collections lint test test-all test-integration test-integration-clean \
|
||||
check deploy encrypt decrypt \
|
||||
.PHONY: help setup collections lint test test-all check deploy encrypt decrypt \
|
||||
edit-vault check-vault new-role \
|
||||
tf-init tf-plan tf-apply tf-output tf-inventory tf-inventory-offsite \
|
||||
molecule-image molecule-image-push caddy-image caddy-image-push registry-login
|
||||
molecule-image molecule-image-push caddy-image caddy-image-push
|
||||
|
||||
help:
|
||||
@echo ""
|
||||
|
|
@ -54,10 +48,8 @@ help:
|
|||
@echo " make lint Run yamllint + ansible-lint"
|
||||
@echo " make test ROLE=<name> Run Molecule tests for a role"
|
||||
@echo " make test-all Run Molecule tests for all roles"
|
||||
@echo " make test-integration HOST=<name> [CERTS=internal|le-staging] [KEEP=1] Run ADR-025 integration cycle against a VM"
|
||||
@echo " make test-integration-clean Prune stale integration-test VM snapshots"
|
||||
@echo " make check PLAYBOOK=<name> [LIMIT=<host>] [TAGS=<tags>] [EXTRA=<args>] Dry-run a playbook (check mode)"
|
||||
@echo " make deploy PLAYBOOK=<name> [LIMIT=<host>] [TAGS=<tags>] [EXTRA=<args>] Run a playbook against production"
|
||||
@echo " make check PLAYBOOK=<name> [LIMIT=<host>] [TAGS=<tags>] Dry-run a playbook (check mode)"
|
||||
@echo " make deploy PLAYBOOK=<name> [LIMIT=<host>] [TAGS=<tags>] Run a playbook against production"
|
||||
@echo " make edit-vault [VAULT=<path>] Edit the vault in nvim (auto re-encrypts + checks)"
|
||||
@echo " make check-vault [VAULT=<path>] Validate vault structure (values masked)"
|
||||
@echo " make encrypt FILE=<path> Encrypt a vault file"
|
||||
|
|
@ -77,7 +69,6 @@ help:
|
|||
@echo " make molecule-image-push Push the test image to the Forgejo registry"
|
||||
@echo " make caddy-image Build the custom Caddy + Gandi DNS-01 image (run on ubongo)"
|
||||
@echo " make caddy-image-push Push the Caddy image to the Forgejo registry"
|
||||
@echo " make registry-login Log Docker into the Forgejo registry (vaulted token)"
|
||||
@echo ""
|
||||
|
||||
# ── Environment setup ─────────────────────────────────────────────────────────
|
||||
|
|
@ -112,29 +103,19 @@ test-all:
|
|||
cd $$role && PATH="$(CURDIR)/$(VENV)/bin:$$PATH" molecule test; cd ../..; \
|
||||
done
|
||||
|
||||
test-integration:
|
||||
ifndef HOST
|
||||
$(error HOST is required: make test-integration HOST=<name> [CERTS=internal|le-staging] [KEEP=1])
|
||||
endif
|
||||
PATH="$(CURDIR)/$(VENV)/bin:$$PATH" $(PYTHON) scripts/integration-vm.py cycle \
|
||||
--host $(HOST) $(if $(CERTS),--certs $(CERTS)) $(if $(KEEP),--keep)
|
||||
|
||||
test-integration-clean:
|
||||
PATH="$(CURDIR)/$(VENV)/bin:$$PATH" $(PYTHON) scripts/integration-vm.py prune
|
||||
|
||||
# ── Playbook execution ────────────────────────────────────────────────────────
|
||||
|
||||
check:
|
||||
ifndef PLAYBOOK
|
||||
$(error PLAYBOOK is required: make check PLAYBOOK=<name>)
|
||||
endif
|
||||
$(PLAYBOOK_BIN) $(INVENTORY) $(VAULT_ARGS) $(if $(LIMIT),--limit $(LIMIT)) $(if $(TAGS),--tags $(TAGS)) $(EXTRA) --check --diff playbooks/$(PLAYBOOK).yml
|
||||
$(PLAYBOOK_BIN) $(INVENTORY) $(VAULT_ARGS) $(if $(LIMIT),--limit $(LIMIT)) $(if $(TAGS),--tags $(TAGS)) --check --diff playbooks/$(PLAYBOOK).yml
|
||||
|
||||
deploy:
|
||||
ifndef PLAYBOOK
|
||||
$(error PLAYBOOK is required: make deploy PLAYBOOK=<name>)
|
||||
endif
|
||||
$(PLAYBOOK_BIN) $(INVENTORY) $(VAULT_ARGS) $(if $(LIMIT),--limit $(LIMIT)) $(if $(TAGS),--tags $(TAGS)) $(EXTRA) playbooks/$(PLAYBOOK).yml
|
||||
$(PLAYBOOK_BIN) $(INVENTORY) $(VAULT_ARGS) $(if $(LIMIT),--limit $(LIMIT)) $(if $(TAGS),--tags $(TAGS)) playbooks/$(PLAYBOOK).yml
|
||||
|
||||
# ── Vault ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
|
@ -178,13 +159,6 @@ caddy-image:
|
|||
caddy-image-push: caddy-image
|
||||
docker push $(CADDY_IMAGE)
|
||||
|
||||
# Log the local Docker daemon into the Forgejo registry using the vaulted token, so the
|
||||
# *-image-push targets above are agent-completable non-interactively (rbw must be unlocked).
|
||||
registry-login:
|
||||
@ANSIBLE_VAULT="$(ANSIBLE)-vault" PYTHON="$(PYTHON)" VAULT="$(VAULT)" \
|
||||
REGISTRY_HOST="$(REGISTRY_HOST)" REGISTRY_USER="$(REGISTRY_USER)" \
|
||||
bash scripts/registry-login.sh
|
||||
|
||||
# ── Terraform ─────────────────────────────────────────────────────────────────
|
||||
|
||||
tf-init:
|
||||
|
|
|
|||
24
STATUS.md
24
STATUS.md
|
|
@ -5,7 +5,7 @@ This repo is partly aspirational: the ADRs in `docs/decisions/` describe the
|
|||
truth. **Before relying on a role, provider, or pipeline existing, check here.**
|
||||
If something is listed as "designed, not built", do not assume it works.
|
||||
|
||||
_Last reviewed: 2026-06-19._
|
||||
_Last reviewed: 2026-06-14._
|
||||
|
||||
## Real and working today
|
||||
|
||||
|
|
@ -30,8 +30,8 @@ _Last reviewed: 2026-06-19._
|
|||
| `roles/dev_env/` — interactive developer environment | **Built + applied.** zsh + oh-my-zsh + oh-my-posh, tmux + TPM plugins, neovim; dotfiles deployed via GNU stow (re-derived from V4/fisi per ADR-013). Node.js from a pinned upstream tarball (not Debian's npm). Lint + Molecule (idempotent) green. **Applied to `ubongo`** for users `sjat` + `claude` (verified: zsh login shells, stow-symlinked `.zshrc`/`.tmux.conf` + nvim config, oh-my-zsh, tmux plugins; nvim v0.12.2, oh-my-posh 29.0.1). Run via `playbooks/workstation.yml` against the `control` group (no dedicated `workstations` group yet). |
|
||||
| `make check` / `make deploy PLAYBOOK=<name>` | **Works.** First end-to-end run (applying `dev_env`) surfaced + fixed latent bugs: Makefile `PLAYBOOK` var collision (binary path vs playbook-name arg) meant the targets never ran; `ansible.cfg` referenced uninstalled community.general callbacks (now built-in `default` + `ansible.posix.profile_tasks`); `acl` package added so Ansible can `become_user` an unprivileged user. The make targets now function — though `site`/`base`/`docker_host` content is still incomplete (see below). |
|
||||
| `roles/public_dns/` + `playbooks/dns.yml` | **Built + applied.** Manages wingu.me at Gandi LiveDNS as code (`community.general.gandi_livedns`, PAT from `vault.gandi.pat`); record data, anti-spoof baseline (SPF `-all` + DMARC reject), and the Gandi-defaults purge are defined + unit-tested (`tests/test_public_dns.py`). **Applied to wingu.me (2026-06-14):** purged Gandi's 13 seeded defaults; zone now holds only the SPF + DMARC TXT records; idempotent re-run clean. No null-MX (Gandi rejects `0 .`) — the MX is removed, so no MX + no apex A = no mail. M1 of the roadmap. |
|
||||
| `ubongo` — physical control / AI-worker host (ADR-015) | **Built (partial).** Debian 13.5 on a Lenovo M70q (i3-10100T, 16 GB, 256 GB SSD; no disk encryption — accepted risk). Full toolchain installed + pinned to `fisi` (Docker 29.5.3, rbw 1.15.0, Claude Code 2.1.173, ansible-core 2.17.14 + molecule via `make setup`/`make collections`). Repo cloned under a dedicated `claude` user (docker + libvirt groups, **`NOPASSWD:ALL` sudo** — ADR-015 amended 2026-06-18; operator `sjat` uses password-required sudo via `sudo` group; the former `sjat-ansible` NOPASSWD drop-in removed 2026-06-18). Vault works via rbw (offline-cache decryption verified). SSH key-only (password + root login disabled). In the production inventory `control` group at 10.20.10.151. **`dev_env` now applied here** (zsh/tmux/nvim for `sjat` + `claude`, via `playbooks/workstation.yml`). Managed as the operator account `sjat` (`group_vars/control` sets `ansible_user: sjat`), not the `ansible` service user `group_vars/all` assumes — ubongo has no bootstrapped `ansible` user. **NetBird mesh-enrolled (M5, 2026-06-17):** `wt0` up at `100.99.146.14` via the `base` `mesh` concern. **`base` firewall applied (mesh-hardening 2/3, 2026-06-19):** INPUT-only default-deny — input locked to `wt0` + ssh-from-control (`10.20.10.151`) + workstations (`10.20.10.50` mamba, `10.20.10.17`); forward `accept` (Docker/libvirt-NAT safe). Live-verified (SSH self-path + Docker egress, after a post-apply `restart docker` — base's flush wipes Docker nat, FRICTION); **real-host reboot-validated (2026-06-19):** after an operator reboot, the `policy drop` input chain + full allow-list re-applied on boot and the `wt0` mesh + SSH self-path came back clean. `claude` now self-SSHes (ad-hoc `authorized_keys` grant so the agent can run SSH-based deploys with the auto-rollback safety; fold into the control-node bootstrap). 
**Pending:** full `base` hardening (auditd/CIS); proper `ansible`-user bootstrap (currently managed as `sjat`); OPNsense DHCP reservations (10.20.10.151 MAC `88:a4:c2:e0:ee:da` + the `.50`/`.17` workstation leases); Terraform state backup (now relevant — the offsite tfstate exists). |
|
||||
| `askari` — off-site Hetzner VPS (ADR-007/016, M2) | **Built + applied.** Provisioned by Terraform (`environments/offsite`, `hetznercloud/hcloud`) as **cx23 / hel1 / Debian 13.5** (CAX11/ARM was out of stock EU-wide on 2026-06-14 → cx23 is same-spec x86, cheaper). cloud-init created the `ansible` user + passwordless sudo; a TF-managed Hetzner Cloud Firewall allows SSH only from ubongo's WAN (`91.226.145.80`). Reachable from ubongo (`ansible offsite_hosts -m ping` ✓), in the `offsite_hosts` inventory (generated `offsite.yml`), published at `askari.wingu.me` → `77.42.120.136`. **SSH-hardened + fail2ban (M3).** **Docker + Caddy reverse proxy (M4a):** `docker_host` + `reverse_proxy` (vanilla Caddy, HTTP-01) applied; `https://test.askari.wingu.me` serves a valid Let's Encrypt cert ✓ (firewall opens 80/443/3478). **NetBird coordinator (M4b):** `netbird_coordinator` deployed — dashboard live at `https://netbird.askari.wingu.me` (valid LE cert), management API behind embedded Dex (401 unauth), STUN on 3478/udp. **NetBird peer (M5, 2026-06-17):** also enrolled as a mesh agent (`base` `mesh` concern) — `wt0` at `100.99.226.39`, Management+Signal Connected; the agent coexists with the coordinator. **Mesh-hardening redesign applied + live reboot-validated (2026-06-20):** `base` INPUT-only nftables default-deny (`inet filter` input `policy drop`; forward `accept`, Docker-safe via a post-apply `restart docker`), SSH `wt0`-primary + a permanent WAN break-glass (ubongo's WAN `91.226.145.80`; the Hetzner console is the OOB ultimate fallback), managed over `wt0`; `netbird_coordinator` geolocation disabled (`NB_DISABLE_GEOLOCATION`) so a no-egress boot can't FATAL it. A real reboot recovered **unattended** — firewall persisted, Docker forwarding + public services (Caddy 80/443, STUN 3478) up, coordinator geo-disabled (no FATAL), `wt0`/mesh (Management+Signal Connected) + both SSH paths back. 
**Pending:** offsite tfstate backup (ADR-022); relay-SPOF reduction (next mesh-hardening sub-project — `ubongo→askari` is currently `Relayed` through askari's own relay). |
|
||||
| `ubongo` — physical control / AI-worker host (ADR-015) | **Built (partial).** Debian 13.5 on a Lenovo M70q (i3-10100T, 16 GB, 256 GB SSD; no disk encryption — accepted risk). Full toolchain installed + pinned to `fisi` (Docker 29.5.3, rbw 1.15.0, Claude Code 2.1.173, ansible-core 2.17.14 + molecule via `make setup`/`make collections`). Repo cloned under a dedicated `claude` user (docker group, no sudo). Vault works via rbw (offline-cache decryption verified). SSH key-only (password + root login disabled). In the production inventory `control` group at 10.20.10.151. **`dev_env` now applied here** (zsh/tmux/nvim for `sjat` + `claude`, via `playbooks/workstation.yml`). Managed as the operator account `sjat` (`group_vars/control` sets `ansible_user: sjat`), not the `ansible` service user `group_vars/all` assumes — ubongo has no bootstrapped `ansible` user. **NetBird mesh-enrolled (M5, 2026-06-17):** `wt0` up at `100.99.146.14` via the `base` `mesh` concern; agent management now works because `claude`'s SSH key was added to `sjat`'s `authorized_keys` and `sjat` was granted `NOPASSWD` sudo (`/etc/sudoers.d/sjat-ansible`) — the interim until the proper `ansible`-user bootstrap. **Pending:** full `base` hardening (only `firewall` exists, NOT applied here — default-deny is the deferred mesh-hardening step now that `wt0` exists); proper `ansible`-user bootstrap (currently managed as `sjat`); OPNsense DHCP reservation for 10.20.10.151 (MAC `88:a4:c2:e0:ee:da`); Terraform state backup (now relevant — the offsite tfstate exists). |
|
||||
| `askari` — off-site Hetzner VPS (ADR-007/016, M2) | **Built + applied.** Provisioned by Terraform (`environments/offsite`, `hetznercloud/hcloud`) as **cx23 / hel1 / Debian 13.5** (CAX11/ARM was out of stock EU-wide on 2026-06-14 → cx23 is same-spec x86, cheaper). cloud-init created the `ansible` user + passwordless sudo; a TF-managed Hetzner Cloud Firewall allows SSH only from ubongo's WAN (`91.226.145.80`). Reachable from ubongo (`ansible offsite_hosts -m ping` ✓), in the `offsite_hosts` inventory (generated `offsite.yml`), published at `askari.wingu.me` → `77.42.120.136`. **SSH-hardened + fail2ban (M3).** **Docker + Caddy reverse proxy (M4a):** `docker_host` + `reverse_proxy` (vanilla Caddy, HTTP-01) applied; `https://test.askari.wingu.me` serves a valid Let's Encrypt cert ✓ (firewall opens 80/443/3478). **NetBird coordinator (M4b):** `netbird_coordinator` deployed — dashboard live at `https://netbird.askari.wingu.me` (valid LE cert), management API behind embedded Dex (401 unauth), STUN on 3478/udp. **NetBird peer (M5, 2026-06-17):** also enrolled as a mesh agent (`base` `mesh` concern) — `wt0` at `100.99.226.39`, Management+Signal Connected; the agent coexists with the coordinator. **Pending:** host firewall + moving askari's SSH onto `wt0` (deferred mesh-hardening; the Hetzner Cloud Firewall is its perimeter until then), offsite tfstate backup (ADR-022). |
|
||||
| `roles/docker_host/` (Docker engine) + `roles/reverse_proxy/` (Caddy, ADR-024) | **Built + applied** (askari, M4a). `docker_host` installs Docker CE + compose; `reverse_proxy` is boma's standard Caddy proxy (HTTP-01 for public hosts; routes from `reverse_proxy__routes`). **DNS-01 for mesh/LAN-only services is now built + proven (2026-06-15):** custom `caddy-gandi` image (`.docker/caddy-gandi/`, `make caddy-image`, pinned caddy-dns/gandi v1.1.0 → Bearer PAT), enabled per-instance via `reverse_proxy__acme_dns_provider: gandi` + `reverse_proxy__image`. Verified end-to-end — a real wildcard cert issued via LE **staging** + Gandi DNS-01 with `vault.gandi.pat`. M4a's deferral (version skew + Hetzner-IP build) is closed; image **pending registry push** (`make caddy-image-push` needs `docker login`). The `reverse_proxy` Caddyfile is bind-mounted as a **directory** (`./caddy` → `/etc/caddy`) so atomic re-renders are visible in-container and `caddy reload` actually applies new routes (a single-file mount pinned the stale inode). |
|
||||
| `roles/netbird_coordinator/` — NetBird control plane (ADR-016, M4b) | **Built + applied (askari, 2026-06-16). boma's FIRST real service role.** Self-hosted NetBird **v0.72.4**: a single combined `netbird-server` container (management + signal + relay + STUN + **embedded Dex IdP** at `/oauth2`) + `dashboard:v2.39.0`, on the shared `boma` network behind the M4a Caddy via gRPC-h2c + WebSocket + path routing (`reverse_proxy__routes` gained a raw-`caddy` route type). Secrets `vault.netbird.{auth_secret,datastore_key}` (self-generated). Carries the full service-role file set (SECURITY/VERIFY/ACCESS/BACKUP) — **first stateful role** (`backup__state: true`; encrypted SQLite at `/var/lib/netbird`, off-site backup pending `fisi`/ADR-022). **Verified live:** dashboard 200 + valid LE cert, `/api` 401 (auth-gated, routes OK), STUN up. **Not yet configured:** first-boot `/setup` admin + peer enrolment = M5. |
|
||||
|
||||
|
|
@ -39,7 +39,7 @@ _Last reviewed: 2026-06-19._
|
|||
|
||||
| Thing | State |
|
||||
|---|---|
|
||||
| `roles/base/` | **Partially built.** Concerns built: `firewall` (nftables: catalog-driven default-deny + east-west allowlist + auto-rollback apply; ADR-020) and **`hardening`** (M3: sshd drop-in key-only + `PermitRootLogin no`, fail2ban sshd jail 5/1h; ADR-002) — both pytest/Molecule-tested. The **`hardening`** concern is **applied to askari** (`make deploy PLAYBOOK=site LIMIT=askari TAGS=hardening`). The `firewall` concern is **applied to ubongo** (mesh-hardening 2/3, 2026-06-19) **and askari** (mesh-hardening redesign, 2026-06-20) — both INPUT-only default-deny via the `base__firewall_input_only` knob (input default-deny + `wt0`/ssh-from-control/`base__firewall_admin_addrs` allow-list; forward left `accept` so Docker/libvirt-NAT survive), both **live reboot-validated**. On a Docker host (askari) base's `flush ruleset` wipes Docker's nat, so the cutover follows the firewall apply with a `restart docker` to rebuild it (FRICTION). Not built: auditd, packages, users (Phase 2 / TODO 15). The `mesh` concern also pins the coordinator FQDN in `/etc/hosts` (`base__mesh_coordinator_pin`) so a local-DNS hiccup can't strand the mesh — **applied + live on ubongo (2026-06-20)**: `getent hosts netbird.askari.wingu.me` → `77.42.120.136`, mesh unaffected. The single-coordinator SPOF is an accepted availability risk (R8, ADR-016 availability amendment). |
|
||||
| `roles/base/` | **Partially built.** Concerns built: `firewall` (nftables: catalog-driven default-deny + east-west allowlist + auto-rollback apply; ADR-020) and **`hardening`** (M3: sshd drop-in key-only + `PermitRootLogin no`, fail2ban sshd jail 5/1h; ADR-002) — both pytest/Molecule-tested. The **`hardening`** concern is **applied to askari** (`make deploy PLAYBOOK=site LIMIT=askari TAGS=hardening`). The `firewall` concern is built but **not yet applied** to any host (mesh-gated to avoid lockout — M5). Not built: auditd, packages, users (Phase 2 / TODO 15). |
|
||||
| `inventories/*/hosts.yml` | Structured stubs with empty host maps (`hosts: {}`); regenerated by `make tf-inventory` once Terraform has hosts |
|
||||
| `inventories/production/group_vars/{docker_hosts,proxmox_hosts}/` | Empty dirs |
|
||||
|
||||
|
|
@ -50,7 +50,7 @@ daemon hardening + `nftables.d` container rules, ADR-004/ADR-020 — is still pe
|
|||
A `make deploy PLAYBOOK=site` run now applies real content — `base` (its `firewall` +
|
||||
`hardening` concerns) plus a functional `docker_host` (Docker engine) on docker hosts —
|
||||
but in practice it is still limited: the production cluster has no docker hosts yet, and
|
||||
`base`'s `firewall` concern is now applied to `ubongo` (control) but not yet to cluster docker hosts (none exist), so a full cluster `site` run does not
|
||||
`base`'s `firewall` concern is mesh-gated until M5, so a full cluster `site` run does not
|
||||
yet exist. (The `make check`/`deploy` machinery itself works — first proven by applying
|
||||
`dev_env` via `playbooks/workstation.yml`, then `base`/`docker_host`/`reverse_proxy` on
|
||||
askari.)
|
||||
|
|
@ -70,7 +70,7 @@ askari.)
|
|||
| CIS hardening (Debian L1+L2 + Docker) | ADR-002 / TODO 15 | Implemented by the (unbuilt) `base`/`docker_host` roles; brings AppArmor + AIDE as baseline. L2 partitions affect VM provisioning (ADR-006) |
|
||||
| Network IDS + security alerting | ADR-002 / TODO 15 | Suricata on OPNsense + AIDE/`auditd`/`fail2ban` alerting into the monitoring stack; not built |
|
||||
| NetBird mesh — coordinator on `askari` | ADR-016 | **BUILT + applied (M4b, 2026-06-16)** — moved up to "Real and working today" (`roles/netbird_coordinator/`). Self-hosted control plane on askari; replaces ADR-007 WireGuard. Mesh **peer enrolment = M5** (next row). |
|
||||
| NetBird agent enrollment in `base` | ADR-016 | **BUILT + applied (M5, 2026-06-17).** The `base` `mesh` concern (opt-in `base__mesh_enabled`) installs the pinned NetBird agent + runs `netbird up` with the reusable scoped key from `vault.netbird.setup_key`. Applied to **askari (`100.99.226.39`) + ubongo (`100.99.146.14`)** — both Management+Signal Connected; ubongo↔askari mesh ping verified. Enrollment is **additive** — the "SSH only on `wt0`" firewall lockdown is the deferred mesh-hardening follow-on, NOT applied. **Road-warrior clients (`mamba` + work laptop) enrolled (2026-06-17) → `ubongo` reachable from anywhere: the mobile-access goal is met and Phase 1 (remote access) is COMPLETE.** Client enrollment runbook: `docs/runbooks/netbird-client.md`. |
|
||||
| NetBird agent enrollment in `base` | ADR-016 | **BUILT + applied (M5, 2026-06-17).** The `base` `mesh` concern (opt-in `base__mesh_enabled`) installs the pinned NetBird agent + runs `netbird up` with the reusable scoped key from `vault.netbird.setup_key`. Applied to **askari (`100.99.226.39`) + ubongo (`100.99.146.14`)** — both Management+Signal Connected; ubongo↔askari mesh ping verified. Enrollment is **additive** — the "SSH only on `wt0`" firewall lockdown is the deferred mesh-hardening follow-on, NOT applied. Road-warrior clients (laptops) are operator-enrolled. |
|
||||
| Service-UI verification (Level 4) | ADR-017 / ADR-008 | **Design RESOLVED** (ADR-017 + spec + plan); resolves ADR-015 deferred #2. `/verify-service` skill + `VERIFY.md` template + standards are authorable and present. **Build pending:** running needs ubongo + `playwright` plugin + Authentik + a staging deploy. |
|
||||
| Logging pipeline (Loki + Alloy + off-site subset) | ADR-018 | **Design RESOLVED** (ADR-018 + spec). All logs → on-cluster Loki; security subset write-only off-site to askari. **Build pending:** Alloy in `base`, `loki`/`grafana` service roles, OPNsense syslog — none built. |
|
||||
| Security alerting (AIDE/auditd/fail2ban/Suricata + log-silence) | ADR-002 / ADR-018 | Wired into Grafana on the Loki stack. Designed; depends on the logging pipeline + metrics stack (TODO 3.6). |
|
||||
|
|
@ -81,18 +81,6 @@ askari.)
|
|||
| Backup `backup` role + `backup_hosts` group | ADR-022 | Does not exist. Pull node (`fisi`), restic repo, rclone→pCloud, USB air-gap — Plan 2. |
|
||||
| Per-service `backup__*` contract + `BACKUP.md` | ADR-022 | Convention defined; inert until service roles exist to declare against. |
|
||||
|
||||
## Integration test harness (ADR-025)
|
||||
|
||||
| Thing | State |
|
||||
|---|---|
|
||||
| `roles/integration_test/` | **Built** — installs/enables libvirt+QEMU+virtinst on `control` group hosts; adds `sjat`/`claude` to `libvirt` group; creates image-cache dir. Lint clean; applied live to ubongo (substrate installed); molecule scenario present, not run in the build env. |
|
||||
| `scripts/integration-vm.py` | **Built** — stdlib-only lifecycle driver over `virsh`/`virt-install`/`cloud-localds`: `up / apply / reboot / assert / cycle / down / prune / console`. Lazily ensures the golden Debian-13 genericcloud image. pytest clean (transient-inventory generation, var/overlay merge, `--certs` mapping, DHCP-lease parsing, resource-guard math). |
|
||||
| `tests/integration/` (profile, verify, overrides) | **Built** — "be askari" profile + var overlay + `verify.yml` outcome assertions (Docker active, forward-chain accepts present, published-port DNAT alive). Validated end-to-end by the RED→GREEN acceptance run. |
|
||||
| `make test-integration` / `make test-integration-clean` | **Built** — wired into `Makefile`. |
|
||||
| ADR-025 | **Accepted (2026-06-18)** — decision recorded, approach A, cert tiers, safety invariants, UEFI boot requirement, and claude-sudo dependency documented. |
|
||||
| **RED/GREEN acceptance (ubongo live pass)** | **PASSED (2026-06-18).** A throwaway KVM VM on ubongo reproduced the 2026-06-17 incident (base nftables forward default-deny kills Docker forwarding on reboot) = RED. With the `docker_host` container-forward drop-in applied, Docker forwarding survived the reboot = GREEN. Nine shakedown findings captured in `docs/FRICTION.md`; key learnings (UEFI boot, claude sudo) recorded in ADR-025. `docs/TODO.md` item 2.4 closed. |
|
||||
| `le-staging` cert validation | **Pending** — wired in v1 but not yet exercised on a real VM (separate from the RED/GREEN acceptance gate). |
|
||||
|
||||
## Keeping this honest
|
||||
|
||||
Update this file whenever you build, stub, or remove something. It is the first
|
||||
|
|
|
|||
341
docs/FRICTION.md
341
docs/FRICTION.md
|
|
@ -22,259 +22,83 @@ earning its keep.
|
|||
|
||||
_(append new raw signals here; the next kaizen review consumes them)_
|
||||
|
||||
- `[friction]` **Re-asked settled defaults (push + subagent-driven) at the plan→execute handoff**
|
||||
(2026-06-19): despite the standing preference (memory `dont-reask-settled-defaults`: push to
|
||||
origin as off-machine backup **and** go subagent-driven, both WITHOUT asking), I again asked the
|
||||
operator "which execution approach?" and "want me to push?". The `writing-plans` skill scripts
|
||||
that handoff question ("Which approach?"), and confirming a push felt natural — both overrode the
|
||||
memory. → at the writing-plans → execution handoff, default to subagent-driven execution and push
|
||||
to origin without a confirmation gate; reserve questions for genuine forks. Recurrence of an
|
||||
already-recorded signal — treat the skill's scripted "Which approach?" as pre-answered
|
||||
(subagent-driven) for this operator.
|
||||
- `[friction]` **Image push to the Forgejo registry fails with `no basic auth
|
||||
credentials`** (2026-06-15): `make caddy-image-push` (and `molecule-image-push`) fail
|
||||
unless the Docker daemon on ubongo has an interactive `docker login
|
||||
forgejo.nyumbani.baobab.band` session — and those creds are **not in vault** (only
|
||||
`gandi` + `hetzner` are), so an agent can't complete a push non-interactively. The
|
||||
build half is fully automatable; the push half silently requires a human. → candidate:
|
||||
document the `docker login` step in `docs/runbooks/claude-code-setup.md`, **or** store
|
||||
a scoped Forgejo registry token in vault + a `make registry-login` target (login via
|
||||
`--password-stdin`, `no_log`) so pushes are agent-completable like every other
|
||||
vault-backed action.
|
||||
|
||||
<!-- The six below are from the 2026-06-17 mesh-hardening-1/3 incident: applying base's
|
||||
nftables default-deny + wt0-only sshd to askari (the off-site Docker host that ALSO runs
|
||||
the NetBird coordinator) took it down on reboot; recovery needed the Hetzner console +
|
||||
a WAN-SSH break-glass. Spec/plan: docs/superpowers/{specs,plans}/2026-06-17-mesh-hardening-askari-ssh-wt0*. -->
|
||||
- `[gotcha]` **Single-file Docker bind mount + atomic config rewrite = stale config in
|
||||
the running container** (2026-06-16): `reverse_proxy` bind-mounted the Caddyfile as a
|
||||
single file; `ansible.builtin.template` writes atomically (temp + rename → new inode),
|
||||
so the running container kept the OLD inode and `caddy reload` (in-container, no restart)
|
||||
re-read stale config and silently no-op'd (`"config is unchanged"`). The NetBird route
|
||||
never loaded → Caddy never requested its cert; surfaced only by a TLS handshake failure.
|
||||
Fix: mount the config **directory** (`./caddy` → `/etc/caddy`) — directory mounts reflect
|
||||
inode swaps, so live reload works (proven on askari). NOTE the sibling case: NetBird also
|
||||
single-file-mounts `config.yaml`, but its handler does `docker compose restart` (not an
|
||||
in-container reload), and a restart DOES re-resolve the bind mount (verified: 0 before,
|
||||
1 after) — so restart-based roles are safe; only in-place-reload roles need the dir mount.
|
||||
→ candidate gotcha doc (`docs/testing/gotchas.md`): "reload-in-place needs a directory
|
||||
mount; restart-based roles are fine with a single-file mount."
|
||||
|
||||
- `[gotcha]` **`base`'s nftables `forward policy drop` breaks Docker hosts on reboot**
|
||||
(2026-06-17): `base/templates/nftables.conf.j2` sets `chain forward { ... policy drop; }`.
|
||||
On a Docker host, container traffic is *forwarded* (published-port DNAT → container, and
|
||||
inter-container over the bridge), so the drop kills it. It worked right after `make
|
||||
deploy` (Docker's runtime rules coexisted) but after a reboot nftables loaded our
|
||||
default-deny *before* Docker, breaking WAN→Caddy and Caddy→coordinator → the public
|
||||
services and the mesh went down. The `docker_host` "`nftables.d` container-forward rules"
|
||||
that would make this Docker-safe are explicitly **pending** (STATUS.md). → the `base`
|
||||
firewall (`base__firewall_apply`) must NOT be applied to any Docker host until
|
||||
`docker_host` ships the container-forward rules; add a guard/check (a Docker host with
|
||||
`firewall_apply: true` and no container-forward drop-in is a misconfiguration), and the
|
||||
firewall design (ADR-020) should state the Docker-host dependency explicitly.
|
||||
- `[friction]` **`make check` always fails on the first-ever deploy of a compose service
|
||||
role** (2026-06-16): in check mode the "ensure base_dir" task is reported-but-not-run, so
|
||||
the later `community.docker.docker_compose_v2` up fails with `"…is not a directory"`
|
||||
(missing `project_src`). Not a defect — a real deploy creates the dir — but it means the
|
||||
CLAUDE.md "always `make check` before `make deploy`" step is guaranteed-red for any brand
|
||||
new stateful role, which erodes trust in the check. → candidate: guard the compose-up with
|
||||
`not ansible_check_mode` (clean "skipped" in dry-run; compose can't be meaningfully
|
||||
dry-run before first deploy anyway), OR document the one-time expected failure. Decide one.
|
||||
|
||||
- `[gotcha]` **`ip_nonlocal_bind` did NOT beat the sshd boot-race** (2026-06-17): the
|
||||
mesh-hardening plan bound sshd `ListenAddress` to the `wt0` IP and set
|
||||
`net.ipv4.ip_nonlocal_bind=1` so sshd could bind the mesh IP before `wt0` exists at
|
||||
boot. In practice the console still showed sshd *"could not assign the address"* at boot
|
||||
— so the protection did not work as designed, and because `wt0` never came up (the
|
||||
coordinator was down), sshd had no listener at all → no SSH path. → the entire
|
||||
"sshd listens on `wt0` only" premise is unsound without (a) a *verified* boot-race fix
|
||||
and (b) a guaranteed non-mesh break-glass. Re-investigate why `ip_nonlocal_bind` didn't
|
||||
help (ordering vs the sysctl drop-in load? the sysctl not applied before sshd start?),
|
||||
or drop ListenAddress-on-mesh entirely and rely on the host firewall for SSH scoping.
|
||||
- `[recurring]` **Re-asked the operator about settled defaults — push + execution mode**
|
||||
(2026-06-17): at the M5 plan handoff I asked (a) whether to push to origin and (b) which
|
||||
execution mode (subagent-driven vs inline) — both already settled: CLAUDE.md says push to
|
||||
`origin` often (off-machine backup), and TODO 10.5 / the standing agreement is "always
|
||||
subagent-driven" (there's even `guard-execution-mode-menu.sh`). Same shape as the 5×
|
||||
"execution-mode menu asked AGAIN" ledger entries — but this time the ask was my own
|
||||
free-form prose ("want those pushed now?", "which execution approach?"), which the
|
||||
existing menu-text matcher does NOT catch (it keys on the writing-plans menu's literal
|
||||
text). → the gap is that the guard only matches that literal menu; free-form re-asks slip
|
||||
through. Candidate: widen the Stop-hook matcher to also flag prose re-asks of
|
||||
push-vs-not / subagent-vs-inline, since prose reminders have already failed this many
|
||||
times. Default behaviour: **push as backup and proceed subagent-driven without asking.**
|
||||
|
||||
- `[gotcha]` **The coordinator host can't bootstrap the mesh it depends on** (2026-06-17):
|
||||
`askari` runs the NetBird coordinator AND is a mesh peer. After a reboot its NetBird
|
||||
agent needs the coordinator (a local container) to be serving to bring up `wt0` — but
|
||||
the coordinator wasn't healthy, so `wt0` never came up. Circular. Combined with sshd
|
||||
being `wt0`-only, the host was reachable only via the Hetzner console. → the
|
||||
coordinator host must keep a **non-mesh management path always** (don't move its SSH onto
|
||||
`wt0`), or the mesh-hardening must treat the coordinator host as a special case. General
|
||||
rule: never make a host's only management path depend on a service that host itself
|
||||
hosts.
|
||||
- `[friction]` **A docs-only commit still tripped the `rbw`-locked pre-commit guard**
|
||||
(2026-06-17): committing only `docs/superpowers/specs/*.md` (no ansible content) was
|
||||
blocked needing the vault password, although the 2026-06-10 kaizen fix scoped the
|
||||
pre-commit `ansible-lint` hook (`always_run: false` + `files:` ansible content) so
|
||||
docs-/config-only commits skip it and need no vault. So either the hook's `files:`
|
||||
pattern still matches `docs/**` (or `.md`), or a blanket pre-commit step needs the
|
||||
vault regardless. → check `.pre-commit-config.yaml`'s `files:`/`exclude:` against the
|
||||
spec/plan paths; docs-only commits should not require `rbw`.
|
||||
|
||||
- `[gotcha]` **NetBird `netbird-server` FATAL-loops on the geolocation DB download with no
|
||||
egress** (2026-06-17): on startup the combined `netbird-server:0.72.4` tries to download
|
||||
the GeoLite2 DB from `pkgs.netbird.io` and treats failure as **FATAL** (crash-loop) — so
|
||||
any loss of container egress (here: Docker NAT masquerade wiped when `nftables` was
|
||||
flushed, not re-added by a plain `restart docker`) takes the whole control plane down.
|
||||
Recovery was `restart docker` (rebuild NAT) → force-recreate the container so it could
|
||||
download. → for the `netbird_coordinator` role: pre-seed/persist the geo DB in the data
|
||||
dir (or pin a local copy), or disable the geolocation requirement, so a transient egress
|
||||
blip can't FATAL the coordinator. Note for the firewall design: container egress (NAT)
|
||||
is fragile across `nft flush` + reboot.
|
||||
- `[friction]` **The agent can't manage `ubongo` (the control node it runs ON) without
|
||||
the operator granting access** (2026-06-17): enrolling `ubongo` in the mesh needed two
|
||||
manual operator grants because the agent runs as `claude` (no sudo) but the inventory
|
||||
manages `ubongo` as `sjat`: (1) `claude`'s SSH key added to `sjat`'s `authorized_keys`
|
||||
(`Permission denied (publickey)` otherwise), then (2) `NOPASSWD` sudo for `sjat`
|
||||
(`Missing sudo password` otherwise). So the "AI-worker control node" (ADR-015) can drive
|
||||
the whole fleet but not itself, unattended. This is the **pending `ansible`-user
|
||||
bootstrap** gap (STATUS) biting in practice. → the proper fix is ubongo's bootstrap to a
|
||||
key-trusted, NOPASSWD `ansible` (or `sjat`) management identity as part of `base`/its
|
||||
control-node recipe, so control-node self-management doesn't need ad-hoc operator grants.
|
||||
|
||||
- `[friction]` **No off-site coordinator backup turned a 2-minute restore into a long live
|
||||
recovery** (2026-06-17): the NetBird coordinator's stateful store (`/var/lib/netbird`,
|
||||
encrypted SQLite) has **no off-site backup yet** (ADR-022 `backup` role pending,
|
||||
flagged in STATUS as the coordinator's deferred backup). During the incident there was a
|
||||
real fear the unclean reboots had corrupted the store, with no restore path. It turned
|
||||
out to be a runtime/egress issue, not corruption — but the absence of a backup made the
|
||||
whole recovery higher-stakes. → prioritise the ADR-022 backup contract for the
|
||||
`netbird_coordinator` store ahead of the rest of the backup role; a recent off-host copy
|
||||
would have made "rebuild askari from scratch" a safe option.
|
||||
|
||||
- `[friction]` **The plan tested reboot-recovery AFTER removing the break-glass**
|
||||
(2026-06-17): the mesh-hardening plan's live cutover closed the WAN `:22` (step 5)
|
||||
*before* the reboot-resilience test (step 7), so the one fallback path was gone exactly
|
||||
when the reboot exposed the boot-race + Docker-firewall bugs. → sequencing rule for
|
||||
lockout-risky cutovers: **validate reboot-recovery while the old access path is still
|
||||
open**, and only retire the break-glass once recovery (incl. a reboot) is proven.
|
||||
Generalises beyond this milestone — a candidate line in the new-host / hardening runbooks.
|
||||
|
||||
<!-- The below are from the 2026-06-18 ADR-025 build: standing up the local-VM integration
|
||||
harness on ubongo and shaking it down against real KVM (spec/plan in docs/superpowers/). -->
|
||||
|
||||
- `[gotcha]` **Debian 13 genericcloud boot-loops under legacy BIOS/SeaBIOS** (2026-06-18):
|
||||
`virt-install --import` of the genericcloud qcow2 with the default (SeaBIOS) firmware
|
||||
triple-faults at the real-mode kernel handoff — GRUB loops, no "Decompressing Linux", no
|
||||
DHCP lease. The symptom (no network) pointed away from the cause (firmware). → boot test
|
||||
VMs via **UEFI** (`virt-install --boot uefi`; OVMF→efistub).
|
||||
|
||||
- `[friction]` **The no-sudo `claude` model blocked diagnosing a failed VM** (2026-06-18):
|
||||
under ADR-015 `claude` had no sudo, so when the VM wouldn't network there was no way to
|
||||
introspect it (serial logs are `root:0600`, libguestfs not installed, mounting needs
|
||||
root). Diagnosis was fully blocked until the operator granted `claude` sudo. → DECISION:
|
||||
`claude` gets `NOPASSWD:ALL` (reverses ADR-015's "no local sudo"); compensating control
|
||||
is auditd/Loki attribution (already in ADR-015). Amend ADR-015/ADR-021 + accepted-risks;
|
||||
codify the sudoers drop-in in Ansible.
|
||||
|
||||
- `[gotcha]` **Non-root `virsh`/`virt-install` default to `qemu:///session`** (2026-06-18):
|
||||
the substrate (NAT net, /dev/kvm) lives on `qemu:///system`. → pin
|
||||
`LIBVIRT_DEFAULT_URI=qemu:///system` in the driver.
|
||||
|
||||
- `[gotcha]` **`qemu:///system` (libvirt-qemu) can't traverse `/home`** (2026-06-18): VM
|
||||
disk/seed/console under the repo/home failed "Permission denied (search permissions for
|
||||
/home/claude)". → put per-VM artifacts in a system-readable dir (`/var/lib/boma-integration`,
|
||||
group libvirt); the inventory (read by ansible as the user) can stay in the repo.
|
||||
|
||||
- `[gotcha]` **`ansible-playbook -i <dir>/` parses sibling non-inventory files as INI**
|
||||
(2026-06-18): pointing `-i` at a run-dir holding a state file + qcow2s made the directory
|
||||
inventory loader parse the state file as INI → phantom hosts INCLUDING the real `askari`
|
||||
(with its real vars), breaking the single-host isolation invariant. → point `-i` at the
|
||||
single `hosts.yml`. Caught by the holistic cross-file review BEFORE any hardware run.
|
||||
|
||||
- `[gotcha]` **Jinja `{%- -%}` + ansible `trim_blocks=True` double-strip newlines**
|
||||
(2026-06-18): a template edit used `{%- -%}`, reviewed by rendering with RAW jinja2
|
||||
(trim_blocks=False) which looked fine; ansible (trim_blocks=True) then collapsed the
|
||||
rendered Caddyfile onto single lines → caddy crash-looped on invalid config. → verify
|
||||
templates with ansible's whitespace (trim_blocks=True), not raw jinja2; prefer plain
|
||||
`{% %}` at column 0 (the repo's existing style).
|
||||
|
||||
- `[gotcha]` **Fresh cloud images have empty apt lists** (2026-06-18): `apt install
|
||||
nftables` failed "No package matching 'nftables' is available" on a fresh genericcloud
|
||||
VM whose cloud-init had `package_update: false`. → `package_update: true` AND block on
|
||||
`cloud-init status --wait` before applying.
|
||||
|
||||
- `[gotcha]` **base's default-deny firewall drops SSH to a NAT'd VM unless the gateway is
|
||||
allowed** (2026-06-18): the driver reaches the VM via the libvirt-NAT gateway
|
||||
(192.168.150.1). `ct established,related accept` saves the in-flight apply connection,
|
||||
but a fresh post-reboot SSH is dropped without an explicit allow. → test overlay sets
|
||||
`base__firewall_control_addr` to the NAT gateway.
|
||||
|
||||
- `[recurring]` **Real-hardware shakedown and static review each caught what the other
|
||||
couldn't** (2026-06-18): the qemu-URI, storage-path, UEFI, apt-list, and caddy-render
|
||||
bugs ALL surfaced only on a live KVM run; the phantom-host inventory bug surfaced only in
|
||||
the holistic cross-file review. → for infra this novel, budget for BOTH an adversarial
|
||||
cross-file review AND a real-hardware run; neither alone would have shipped it working.
|
||||
|
||||
<!-- From the 2026-06-19 mesh-hardening-2/3 design (ubongo INPUT-only default-deny). -->
|
||||
|
||||
- `[friction]` **Raw DHCP leases pinned in ubongo's host firewall (admin-addr SSH allows)**
|
||||
(2026-06-19): mesh-hardening 2/3 lets the operator workstations reach ubongo's LAN SSH by
|
||||
*raw lease* — `base__firewall_admin_addrs: ["10.20.10.50" (mamba), "10.20.10.17"]` — because
|
||||
there is no DHCP reservation yet (OPNsense isn't managed as code). A lease reassignment
|
||||
silently moves the allow to whatever host next holds the IP (still SSH-key-gated) and drops
|
||||
the workstation's *LAN* path (mesh still works, so never a full lockout). → when
|
||||
OPNsense-as-code lands (ADR-020 perimeter / TODO 3.5), replace both with **MAC-pinned DHCP
|
||||
reservations** (`10.20.10.17` = MAC `bc:0f:f3:c8:4a:8a`; mamba's MAC TBD) and allow the
|
||||
reserved IPs. Spec: `docs/superpowers/specs/2026-06-19-mesh-hardening-ubongo-default-deny-design.md`.
|
||||
|
||||
- `[gotcha]` **`make test-integration` on ubongo fails (`qemu-img` "Permission denied") when
|
||||
the agent session predates the `libvirt` group grant** (2026-06-19): the `integration_test`
|
||||
role adds `claude` to `libvirt`+`kvm` and makes the cache dir `/var/lib/boma-integration`
|
||||
`root:libvirt 2775` — correct — but a `claude` session whose shell started *before* that
|
||||
grant carries a stale process group set (`id` → `claude,docker` only, no `libvirt`), so
|
||||
`qemu-img create` of the VM overlay into the group-owned dir is denied. `virsh`/`virt-install`
|
||||
still work (they reach system libvirtd via polkit/socket, and the real KVM runs server-side
|
||||
as `libvirt-qemu`), so ONLY claude's own file-writes break. Unblock without restarting the
|
||||
session: **`sg libvirt -c 'make test-integration HOST=<name>'`** (claude needs only `libvirt`
|
||||
for the dir; `kvm` is server-side; note `sg` adds one group, not the full set). → self-heal
|
||||
in `scripts/integration-vm.py`: if the `libvirt` gid is absent from `os.getgroups()`, re-exec
|
||||
under `sg libvirt` (or have the Makefile target do it), so a stale-session agent never hits
|
||||
this opaque symptom. New agent sessions pick the groups up on login, so it's a stale-session
|
||||
transient — but high-confusion, worth self-healing.
|
||||
|
||||
- `[friction]` **No standard for when the agent may run local-VM integration tests on ubongo
|
||||
without asking** (2026-06-19): `make test-integration HOST=<name>` spins an ISOLATED throwaway
|
||||
KVM VM (its own libvirt NAT; never touches the real host's firewall/network; guards:
|
||||
one-VM-at-a-time + a 4 GiB free-RAM floor + auto-destroy on success), so it is safe and
|
||||
self-contained — yet the agent paused for a go-ahead before running it (mesh-hardening 2/3,
|
||||
Task 4). The operator wants a STANDARD that pre-authorises VM-testing on ubongo so the agent
|
||||
just runs it. → decide + record the rule: e.g. a `.claude/settings.json` permission allow for
|
||||
`make test-integration*` / `scripts/integration-vm.py` (and the `sg libvirt -c '…'` form per
|
||||
the gotcha above), plus a CLAUDE.md line distinguishing the pre-authorised isolated VM tests
|
||||
from the genuinely-gated live steps (`make deploy` to real hosts, host reboots, cutovers —
|
||||
still need a go-ahead). Ties to the `test-risky-infra-before-live-deploy` +
|
||||
`dont-reask-settled-defaults` memories + ADR-025.
|
||||
|
||||
- `[gotcha]` **Molecule covers only the `input_only`-OFF (forward drop) branch of the base
|
||||
firewall** (2026-06-19): mesh-hardening 2/3 added `base__firewall_input_only` (forward policy
|
||||
drop↔accept). The `default` Molecule scenario renders ONE fixture, set to the secure default
|
||||
(drop) — so the fast `make test ROLE=base` gate locks the drop default (security-critical for
|
||||
service hosts) but does NOT exercise the `=true` → forward-`accept` rendering; only `make
|
||||
test-integration HOST=ubongo` does (passed GREEN). An in-converge re-render can't cheaply
|
||||
cover it (role defaults aren't in scope outside the role run). → decide in kaizen: a second
|
||||
Molecule scenario (`molecule/input-only/`) asserting forward `policy accept`, vs accepting the
|
||||
integration-only coverage. Final-review finding; not a cutover blocker (the accept branch is a
|
||||
literal, and a var-name break would fail the drop branch too → caught).
|
||||
|
||||
- `[gotcha]` **Applying base's firewall to a Docker host flushes Docker's nat → container
|
||||
egress dies until `restart docker`** (2026-06-19, mesh-hardening 2/3 live cutover): base's
|
||||
`nftables.conf.j2` starts with `flush ruleset`, which wipes ALL tables incl. Docker's
|
||||
`ip nat`/`ip filter` (+ libvirt's). On ubongo I chose INPUT-only so `forward` stays `accept`
|
||||
— yet the apply STILL broke CONTAINER egress: `docker pull` worked (dockerd uses HOST egress)
|
||||
but a container `ping` FAILED — the masquerade (SNAT) was gone, so replies couldn't return.
|
||||
`forward accept` permits forwarding but can't replace the missing nat. The spec's "input-only
|
||||
keeps Docker egress working" was therefore **incomplete**, and the local-VM harness couldn't
|
||||
catch it (the test VM runs no Docker). Fix on the live host: `systemctl restart docker`
|
||||
re-adds its `ip nat`/`ip filter` (egress restored; coexists fine with base's `inet filter`).
|
||||
On REBOOT it self-heals (dockerd re-adds nat on boot; `forward accept` doesn't block — unlike
|
||||
the 2026-06-17 `forward drop` incident). → (1) any cutover/runbook applying base firewall to a
|
||||
Docker host MUST `restart docker` + check container egress after the apply; (2) the pending
|
||||
`docker_host` nftables integration should own re-adding/persisting Docker's rules so base's
|
||||
`flush` is safe; (3) the firewall final-review checklist should include "does the host run
|
||||
Docker/libvirt? the flush wipes their nat."
|
||||
|
||||
<!-- From the 2026-06-19 mesh-hardening 3/3 (askari INPUT-only integration gate). -->
|
||||
|
||||
- `[gotcha]` **`inet filter` default-deny blocks libvirt dnsmasq DHCP — silent, hard to diagnose**
|
||||
(2026-06-19, task-3 integration gate): when `base__firewall_input_only: true` is applied to
|
||||
ubongo, the `table inet filter { chain input { policy drop; } }` blocks DHCP packets that arrive
|
||||
via the libvirt bridge (`virbr-boma`). In nftables, multiple tables at the same hook priority all
|
||||
run independently; an `accept` verdict in `table ip filter LIBVIRT_INP` does NOT prevent
|
||||
`table inet filter` from seeing and dropping the same packet. VMs never got DHCP leases (dnsmasq
|
||||
socket confirmed by strace to never receive POLLIN despite tcpdump seeing the packet on
|
||||
`virbr-boma`). Diagnosed by temporarily changing `inet filter input` to `policy accept` → fd=3
|
||||
immediately fired. Fix: `/etc/nftables.d/10-libvirt-boma.nft` drop-in adding
|
||||
`iifname "virbr-boma" accept` (survives service restarts via `include "/etc/nftables.d/*.nft"`).
|
||||
→ The `base` role's template needs a `base__firewall_trusted_bridges` variable so this is
|
||||
encoded at the Ansible level, not in a manual host drop-in. Every host that runs Docker or
|
||||
libvirt and also has `base__firewall_input_only: true` needs an analogous exception.
|
||||
|
||||
- `[gotcha]` **libvirt `leaseshelper` PID-file permission: `virPidFileReleasePath` unlinks
|
||||
`/run/leaseshelper.pid` after EVERY call; the `nobody` user cannot recreate it** (2026-06-19, task-3
|
||||
integration gate): dnsmasq runs as the `nobody` user; `libvirt_leaseshelper` is its `--dhcp-script`. The
|
||||
helper acquires a PID-file mutex at `/run/leaseshelper.pid`, but `virPidFileReleasePath`
|
||||
UNLINKS the file on exit. `/run/` is `root:root 755`, so the `nobody` user cannot create the file after the
|
||||
first unlink → every subsequent `add` call fails with `errno=13`, dnsmasq silently drops the
|
||||
DHCP grant (no log, no error to the client). Fix: suid root C wrapper at
|
||||
`/usr/lib/libvirt/libvirt_leaseshelper` (original moved to `.real`) that pre-creates
|
||||
`/run/leaseshelper.pid` owned by nobody, then drops privileges and execs the real helper. The
|
||||
root dnsmasq fork calls the wrapper; suid gives it permission to touch `/run/`; on return to
|
||||
the `nobody` uid the PID file stays in place. Also: `/var/lib/libvirt/dnsmasq/` must be `nobody:nogroup 775`
|
||||
so leaseshelper can update `virbr-boma.status`. This fix is host-local on ubongo and NOT in
|
||||
Ansible — encode it in an `integration_test` role task (or a libvirt role) before the harness
|
||||
can be safely re-deployed.
|
||||
|
||||
- `[gotcha]` **cloud-init rejects underscores in `local-hostname` → silently skips
|
||||
network-config → VM never gets DHCP** (2026-06-19, task-3 integration gate): setting
|
||||
`local-hostname: boma-it-askari_inputonly-<uuid>` caused cloud-init-local to consider the
|
||||
hostname invalid and skip writing the network-config to the system. Systemd-networkd then
|
||||
used the genericcloud default (no DHCP), so VMs got only IPv6 link-local. Fix in
|
||||
`scripts/integration-vm.py`: `name.replace("_", "-")` in the meta-data hostname (disk paths
|
||||
and virsh domain names keep the original underscore). Sanitization rule: RFC-952 hostnames
|
||||
allow hyphens, not underscores.
|
||||
|
||||
- `[friction]` **Molecule Docker image can't `apt install` → roles with real package tasks
|
||||
have no Molecule substrate coverage** (2026-06-19): the Docker Molecule image ships with
|
||||
cleared apt-lists and no internet access, so any role whose core work is `apt install` —
|
||||
`base`, `docker_host`, `integration_test` — cannot cover its package/substrate tasks in
|
||||
Molecule. Those tasks are validated only by `make test-integration` (ADR-025, real KVM).
|
||||
The gap is systemic: it affects every role with non-trivial package or system-level setup.
|
||||
→ systematization idea: provide a Molecule image or driver that can install packages (e.g.
|
||||
a custom Docker image with pre-seeded apt-lists, or a `prepare.yml` that pre-installs
|
||||
packages from a local cache), or an alternative driver (e.g. `molecule-libvirt` using the
|
||||
same KVM harness), so substrate tasks get real Molecule unit coverage rather than relying
|
||||
entirely on the integration harness.
|
||||
- `[recurring]` **ADRs claim cross-doc reconciliation they didn't actually perform**
|
||||
(2026-06-14): ADR-024's Status + Consequences asserted "ADR-017 prose that mentioned
|
||||
Traefik is updated to read Caddy" — but ADR-008/017/019 + CAPABILITIES still said
|
||||
Traefik; the rename was left half-done across the doc set and the ADR over-claimed its
|
||||
own follow-through. Surfaced only by a full-repo `grep Traefik` during `/review-repo`.
|
||||
Same shape as the deferred-decision-goes-stale signal (a decision lands in one place,
|
||||
its promised ripple edits don't). → candidate `repo-scan.py` check: when an ADR's text
|
||||
asserts "X is updated to Y" / supersedes a named tool, flag remaining occurrences of the
|
||||
old name (or verify the claimed edit landed) — the structural cousin of `stale-deferred`.
|
||||
(KEEP-OPEN per the 2026-06-14 `/kaizen` run — it's its own build task.)
|
||||
|
||||
---
|
||||
|
||||
|
|
@ -282,29 +106,6 @@ harness on ubongo and shaking it down against real KVM (spec/plan in docs/superp
|
|||
|
||||
Consumed signals and where their resolution now lives. Newest first.
|
||||
|
||||
### 2026-06-17
|
||||
|
||||
Second `/kaizen` run. 7 signals triaged; all 7 consumed (0 kept open). Two heavier items
|
||||
(the `rename-incomplete` scan check and the Forgejo registry-login path) were built by
|
||||
parallel subagents and verified against the diff. **Bias-to-remove note:** one PARK
|
||||
(the ubongo self-management gap — out-of-phase, already tracked in STATUS) and zero
|
||||
REMOVE; the rest accreted (migrate/change). None of the open signals were `[unused]`
|
||||
*tooling*, so there was nothing to delete — the only reductive move available was parking
|
||||
the out-of-phase build. **Cadence:** healthy — 3 days after the first run, every signal
|
||||
0–2 days old except the one carried over from 2026-06-14; the "recurring ≥3" nudge in
|
||||
`scripts/friction-scan.py` didn't fire this pass (all recurrence counts were 1), so the
|
||||
thresholds need no change.
|
||||
|
||||
| Signal (first seen) | Verdict | Resolution / where it lives now |
|
||||
|---|---|---|
|
||||
| ADRs claim cross-doc reconciliation they didn't perform (06-14) | SYSTEMATIZE | New `rename-incomplete` check in `scripts/repo-scan.py` (+7 tests): when a numbered ADR announces a rename `Old`→`New`, flag any design-doc line where `Old` still appears in present tense (skips the announcing ADR, lines also naming `New`, and historical/negation cues; rejects `ADR-NNN` tokens as terms). 0 findings on the current tree — the Traefik→Caddy ripple edits have landed. Structural cousin of `stale-deferred`; run by `/review-repo`. (Was KEEP-OPEN on 2026-06-14 — now built.) |
|
||||
| Image push to the Forgejo registry needs an interactive `docker login` (06-15) | SYSTEMATIZE → vault | Vault-backed login path so pushes are agent-completable: `vault.forgejo.registry_token` stub (CHANGEME, operator-minted) + `scripts/registry-login.sh` (reads the token, `docker login --password-stdin`, never echoes it) + `make registry-login` + a prereq note in `docs/runbooks/claude-code-setup.md`. Works once the operator fills the token via `make edit-vault`. |
|
||||
| Single-file bind mount + atomic rewrite = stale config (06-16) | SYSTEMATIZE | → `docs/testing/gotchas.md` — "Single-file bind mount + atomic rewrite = stale config (reload-in-place only)": `template` writes a new inode, a single-file bind mount pins the old one, so an in-container reload reads stale config. Mount the config *directory* for reload-in-place roles; restart-based roles are fine with a single-file mount. |
|
||||
| `make check` always fails on the first-ever deploy of a compose service role (06-16) | CHANGE | `check_mode: false` on the `state: directory` scaffold tasks in `roles/reverse_proxy` + `roles/netbird_coordinator`, so the base dirs exist under `--check` and the rest of the dry-run (templates + compose) evaluates instead of failing on a missing `project_src`. Inert under converge → Molecule unchanged. |
|
||||
| Re-asked settled defaults — push + execution mode, in prose (06-17) | CHANGE (exec) + ACCEPTED (push) | Widened `.claude/hooks/guard-execution-mode-menu.sh` to also catch free-form *prose* re-asks of the subagent-vs-inline choice (`"which execution approach?"`, `"subagent vs inline"`, …), not just the literal menu; tested. The push re-ask stays a soft default via the `dont-reask-settled-defaults` memory — a genuine "should I push?" is sometimes legitimate, so it is deliberately not hard-blocked. |
|
||||
| Docs-only commit tripped the rbw-locked pre-commit guard (06-17) | CHANGE | Root cause was NOT the ansible-lint `files:` scope (innocent) — it was `.claude/hooks/guard-vault-preflight.sh` blocking *every* locked `git commit`. Rewrote it to inspect the staged set (`git diff --cached`, plus `-a`/`--all`) and block only when Ansible content (`^(roles\|playbooks\|inventories)/.*\.ya?ml$`) is staged; docs-/config-only commits are now exempt. Fail-safe to block when unsure. Tested. |
|
||||
| Agent can't self-manage `ubongo` (the control node it runs on) without operator grants (06-17) | PARK | The knowledge already lives in `STATUS.md` (control-node row: the interim `claude`-key + `sjat` NOPASSWD grants, and **Pending:** the proper `ansible`-user bootstrap) and the `ubongo-self-sufficiency` memory. Out-of-phase — the fix is the control-node bootstrap recipe, a tracked future build. **Resurrection trigger:** when building ubongo's `base` hardening / `ansible`-user bootstrap, fold in key-trusted NOPASSWD self-management so control-node self-management needs no ad-hoc operator grants. |
|
||||
|
||||
### 2026-06-14
|
||||
|
||||
First `/kaizen` run (dogfood). 12 signals triaged; 11 consumed, 1 kept open (#13 above —
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ as ordering changes, or as new milestones appear. Each milestone gets its own
|
|||
spec → plan → implementation cycle (`docs/superpowers/specs/` then `…/plans/`) when it
|
||||
comes up; this file stays high-level.
|
||||
|
||||
_Last updated: 2026-06-19._
|
||||
_Last updated: 2026-06-11._
|
||||
|
||||
---
|
||||
|
||||
|
|
@ -43,10 +43,9 @@ this collapses into interleaving with extra context-switching cost).
|
|||
|
||||
---
|
||||
|
||||
## Phase 1 — Off-site / Remote-access — ✅ COMPLETE (2026-06-17)
|
||||
## Phase 1 — Off-site / Remote-access
|
||||
|
||||
Delivers mobile access to `ubongo`; proves the machinery. Ordered by *real* dependencies.
|
||||
All milestones (M1–M5) done; the mobile-access goal is met. Next: the Procurement gate.
|
||||
|
||||
### M1 · boma's DNS home — a new domain at Gandi, managed as code
|
||||
|
||||
|
|
@ -135,14 +134,14 @@ Dashboard live at `https://netbird.askari.wingu.me` (valid LE cert); `/api` auth
|
|||
- **Maps to:** ADR-016 (mesh), ADR-004 (one service = one role), ADR-021 (access),
|
||||
ADR-022 (backup), ADR-008/017 (VERIFY), accepted-risk R3 (askari public surface).
|
||||
|
||||
### M5 · Enroll peers → goal reached — ✅ DONE (2026-06-17)
|
||||
### M5 · Enroll peers → goal reached — ✅ infra done (2026-06-17); laptops = operator step
|
||||
|
||||
The `base` `mesh` concern enrolled **`ubongo` (`100.99.146.14`) + `askari`
|
||||
(`100.99.226.39`)** as NetBird peers — both Management+Signal Connected, the ubongo↔askari
|
||||
mesh link ping-verified. NetBird ships a default **Allow-All** peer policy, so any enrolled
|
||||
peer reaches `ubongo` over `wt0`. The road-warrior clients (**`mamba` + the work laptop**)
|
||||
are enrolled (operator, via `docs/runbooks/netbird-client.md`) → **`ubongo` is reachable
|
||||
from anywhere. ← the mobile-access goal is met; Phase 1 is complete.**
|
||||
peer can already reach `ubongo` over `wt0`. **Remaining (operator):** install the NetBird
|
||||
client on `mamba` + the work laptop and log in → `ubongo` reachable from anywhere. **← the
|
||||
mobile-access goal lands when the laptops join.**
|
||||
|
||||
- **Deferred to a "mesh-hardening" follow-on** (was folded into M5; split out as the
|
||||
lockout-risky part): apply `base` nftables **default-deny** to `ubongo` + set
|
||||
|
|
@ -206,22 +205,6 @@ Canonical dependency order:
|
|||
|
||||
## Next step
|
||||
|
||||
**Phase 1 complete (M1–M5); mesh-hardening: ubongo (2/3) DONE 2026-06-19, askari redesign DONE 2026-06-20.**
|
||||
Both hosts now run INPUT-only nftables default-deny (`base__firewall_input_only`), live reboot-validated.
|
||||
askari's redesign (spec/plan `docs/superpowers/{specs,plans}/2026-06-19-mesh-hardening-askari-redesign*`)
|
||||
applied INPUT-only default-deny + `wt0`-primary SSH + a permanent WAN break-glass + a geo-disabled
|
||||
coordinator; a real reboot recovered unattended. Remaining mesh-hardening sub-projects:
|
||||
|
||||
1. ~~`ubongo` nftables default-deny + `ssh-from-control`~~ → **DONE (2026-06-19).**
|
||||
2. ~~**redesign** `askari`'s SSH → `wt0`~~ → **DONE (2026-06-20)** — boot-race, coordinator-bootstrap
|
||||
chicken-egg, and Docker-nat-flush all resolved + live reboot-validated.
|
||||
3. ~~**askari relay-SPOF reduction**~~ → **DONE (2026-06-20)** — assessed + **accepted** as a
|
||||
documented availability risk (R8 + ADR-016 availability amendment): the blast radius is
|
||||
narrow (LAN/intra-cluster/local traffic never touch askari), so no P2P / second relay /
|
||||
second coordinator was warranted. Hardened the one real gap — a managed-host coordinator-FQDN
|
||||
DNS pin (`base__mesh_coordinator_pin`). The coordinator off-site backup gap is handed to ADR-022.
|
||||
4. **NetBird ACL off Allow-All** to scoped policies (open mechanism question — no headless API path).
|
||||
5. **ADR-022 backup kickoff** — off-site backup of the `netbird_coordinator` store (named in R8 /
|
||||
BACKUP.md) as the first slice of the backup role (restic + the `fisi` pull node).
|
||||
|
||||
**Then** the Procurement gate (`/capacity-review` → buy Proxmox hardware) opens Phase 2.
|
||||
**M1 (Gandi DNS migration, IaC)** design is written —
|
||||
`docs/superpowers/specs/2026-06-11-public-dns-gandi-migration-design.md`. Next: user
|
||||
review → implementation plan.
|
||||
|
|
|
|||
|
|
@ -17,7 +17,6 @@
|
|||
calls, curl pulls of web products, log reviews. Headless browsing → ADR-017
|
||||
(`/verify-service`); the API/curl/log-review siblings remain open.
|
||||
3. ~~Standard for test users + manual-test instructions.~~ → ADR-017.
|
||||
4. ~~Local VM integration testing on ubongo.~~ → ADR-025 / `make test-integration` (built + RED→GREEN validated 2026-06-18).
|
||||
|
||||
3. **Building services**
|
||||
1. ~~Decide how to manage logs.~~ → ADR-018.
|
||||
|
|
@ -85,13 +84,6 @@
|
|||
5. ~~Always subagent-driven?~~ → DECIDED: yes (standing agreement; enforced by `.claude/hooks/guard-execution-mode-menu.sh`).
|
||||
6. When the AI deploys (i.e. runs playbooks etc.), should we define a methodology so that it does not have to poll all the time or review all the output? Perhaps the `make` targets could provide only the relevant feedback?
|
||||
7. ~~Reproducible agent toolchain.~~ → `.claude/settings.json` + `docs/runbooks/claude-code-setup.md`.
|
||||
8. **Screenshot hand-off to the agent.** Give the operator a smooth way to hand the
|
||||
agent a screenshot (e.g. of a Hetzner/VNC console during an incident) — the agent
|
||||
can already read image files; the gap is the hand-off. During the 2026-06-17
|
||||
incident the only diagnostic channel was console screenshots, copied manually to
|
||||
`/tmp` and `find`-located. Options: a known drop path the agent checks (e.g.
|
||||
`~/screenshots/`), a small `screenshot`/paste helper or slash-command, or a
|
||||
clipboard→file convention. Cheap, high-value for incident work.
|
||||
|
||||
11. **Kaizen loop** — `/kaizen` built (STATUS).
|
||||
1. ~~Build the loop command.~~ → `/kaizen` (`scripts/friction-scan.py` + `.claude/commands/kaizen.md`; spec `docs/superpowers/specs/2026-06-14-kaizen-command-design.md`).
|
||||
|
|
@ -128,7 +120,6 @@
|
|||
6. Supply-chain hygiene: enforce tiered image pinning (stateful `tag@digest`;
|
||||
stateless rolling tags — ADR-011) + official/verified images via the service
|
||||
checklist; revisit active scanning (Trivy/Grype) once a triage stack exists (R1).
|
||||
7. Is our network setup as it should be? I am not sure whether all traffic between ubongo and notes goes via askari. What if askari breaks — will the rest still work?
|
||||
|
||||
16. **ADR-011 (update management) — resolve open questions + accept.** Committed as
|
||||
**Proposed**; resolve before marking Accepted:
|
||||
|
|
|
|||
|
|
@ -154,7 +154,6 @@ Level 2 (staging) or Level 3 (external). This is a conscious, documented decisio
|
|||
| Capability | Reason not testable in Molecule |
|
||||
|---|---|
|
||||
| `nftables` rule loading | Requires `nf_tables` kernel module; not available in Docker |
|
||||
| **Reboot-survivability / host-firewall × Docker interaction / boot-ordering** | **Requires a real kernel reboot — the class that caused the 2026-06-17 mesh-hardening incident. Now covered by local VM integration testing (ADR-025).** |
|
||||
| NetBird mesh data plane (`wt0` WireGuard interface) | Requires the `wireguard` kernel module; Molecule checks only that the agent is installed/configured (ADR-016) |
|
||||
| `unattended-upgrades` behaviour | Installs correctly; actual upgrade behaviour requires a real apt environment |
|
||||
| DHCP behaviour (OPNsense) | OPNsense is managed by Ansible but not testable in a container |
|
||||
|
|
@ -166,11 +165,6 @@ For the above, Molecule tests only what it can: that the relevant packages are
|
|||
installed, that configuration files render correctly, and that services are enabled.
|
||||
Behavioural correctness is confirmed on staging.
|
||||
|
||||
**ADR-025 is the concrete build of Level 2/3** — local VM integration testing on
|
||||
ubongo (libvirt/KVM, throwaway overlay VMs, stdlib-only driver). It specifically
|
||||
targets the reboot-survivability / host-firewall × Docker / boot-ordering class that
|
||||
Molecule structurally cannot reach. See `docs/decisions/025-local-vm-integration-testing.md`.
|
||||
|
||||
---
|
||||
|
||||
### CI pipeline
|
||||
|
|
|
|||
|
|
@ -2,10 +2,7 @@
|
|||
|
||||
## Status
|
||||
|
||||
Accepted (2026-06-05). **Amended 2026-06-18:** the `claude` AI-worker account now has
|
||||
`NOPASSWD:ALL` sudo on `ubongo` — reversing the original "no local sudo" sub-decision.
|
||||
The amendment is recorded in §Access & security below; rationale and accepted risk are
|
||||
in ADR-021 and `docs/security/accepted-risks.md` (R7).
|
||||
Accepted (2026-06-05)
|
||||
|
||||
## Context
|
||||
|
||||
|
|
@ -46,12 +43,8 @@ points at this physical box. This *strengthens* the ADR-009 control-node excepti
|
|||
it is genuinely outside Terraform's world, not a VM pretending to be the exception.
|
||||
Every other host stays a Terraform-managed VM exactly as designed.
|
||||
|
||||
`ubongo` runs **plain Debian 13** (the `base` role applies). It is not a production
|
||||
hypervisor and runs no `docker_host` services. It does run **ephemeral KVM test VMs**
|
||||
as part of its local-test-runner role (ADR-025 — local VM integration testing): one
|
||||
throwaway VM at a time (~3 GiB RAM), against ~13 GiB free of the 16 GiB sized here.
|
||||
This is not a production workload — it is the concrete implementation of ADR-008 Level
|
||||
2/3, and the resource guard enforces one-at-a-time to stay within the RAM ceiling.
|
||||
`ubongo` runs **plain Debian 13** (the `base` role applies). It is not a hypervisor
|
||||
and runs no `docker_host` services.
|
||||
|
||||
### Hardware target
|
||||
|
||||
|
|
@ -91,38 +84,12 @@ Manual, on bare metal:
|
|||
only** — key-only, with password auth and root login disabled — until the NetBird mesh
|
||||
(ADR-016) is stood up.
|
||||
- **AI-worker identity:** `ubongo` runs the AI worker under a dedicated,
|
||||
password-locked `claude` user (in the `docker` and `libvirt` groups; **`NOPASSWD:ALL`
|
||||
sudo** via a repo-managed drop-in — see amendment below). It is reached via `sudo -iu
|
||||
claude` or its own SSH key. The rationale is **attribution + revocation, not
|
||||
containment**: auditd/Loki (ADR-018) can separate human from agent actions, and the
|
||||
account/key can be revoked without touching the operator's access. (ADR-021 left the
|
||||
on-`ubongo` agent identity unspecified; this records it.)
|
||||
|
||||
**Amendment (2026-06-18) — `claude` now has `NOPASSWD:ALL` sudo.**
|
||||
> **Superseded by [ADR-025](025-local-vm-integration-testing.md)** (per ADR-023 §4): the
|
||||
> "no local sudo" sub-decision is reversed. The shakedown that necessitated it is ADR-025;
|
||||
> the resulting two-account access model is ADR-021; the accepted risk is R7.
|
||||
|
||||
During the
|
||||
integration-testing harness shakedown, the original "no local sudo" sub-decision was
|
||||
reversed. No-sudo blocked the AI-worker from diagnosing a failed VM: `virsh`,
|
||||
`virt-install`, `cloud-localds`, `journalctl`, `nft` — nearly all low-level
|
||||
diagnostic commands — require root. The AI-worker must autonomously spin up,
|
||||
inspect, and tear down test VMs without operator hand-holding; that is the harness's
|
||||
core value proposition. Compensating controls make the risk acceptable:
|
||||
|
||||
1. `claude`'s password is **locked** (no interactive login, no `su claude` without the
|
||||
operator's own credentials) — `NOPASSWD` sudo is the *only* sudo path.
|
||||
2. `auditd` + Loki attribution (ADR-018) separates human from agent root actions.
|
||||
3. The drop-in is **repo-managed** via `base__ai_worker_user` — revocable in one commit
|
||||
and one deploy.
|
||||
4. Single-operator homelab: everything in git, off-machine backups (ADR-022).
|
||||
|
||||
The operator (`sjat`) uses **password-required sudo** via the `sudo` group; their
|
||||
former `NOPASSWD` drop-in was removed 2026-06-18 as redundant once `claude` had sudo
|
||||
(least-privilege cleanup). The accepted risk is registered as R7 in
|
||||
`docs/security/accepted-risks.md`. ADR-021 records the resulting sudo model for both
|
||||
accounts.
|
||||
password-locked `claude` user (in the `docker` group for Molecule; **no local sudo** —
|
||||
boma deploys reach the fleet over SSH as the `ansible` user, not via local root). It is
|
||||
reached via `sudo -iu claude` or its own SSH key. The rationale is **attribution +
|
||||
revocation, not containment**: auditd/Loki (ADR-018) can separate human from agent
|
||||
actions, and the account/key can be revoked without touching the operator's access.
|
||||
(ADR-021 left the on-`ubongo` agent identity unspecified; this records it.)
|
||||
- **Disk encryption:** `ubongo`'s SSD is **not encrypted at rest** — the SanDisk X600 is
|
||||
TCG-Opal-capable but Opal is unused. This is an accepted risk recorded in
|
||||
`docs/security/accepted-risks.md` (control-node disk not encrypted at rest),
|
||||
|
|
|
|||
|
|
@ -85,9 +85,8 @@ allocated for it.
|
|||
- **Bootstrap order:** stand up the coordinator on `askari` → enroll `ubongo` →
|
||||
`base` enrolls the fleet.
|
||||
- **Coordinator survival:** off-site on `askari` ⇒ mesh survives a homelab outage.
|
||||
NetBird's management datastore is **intended** to be backed up encrypted off `askari`
|
||||
(synced to `ubongo`/`mamba`; not yet built — see the Availability amendment / R8); peers
|
||||
keep last-known config through a brief coordinator outage.
|
||||
NetBird's management datastore is backed up encrypted off `askari` (synced to
|
||||
`ubongo`/`mamba`); peers keep last-known config through a brief coordinator outage.
|
||||
- **`askari` is Ansible-managed:** its own inventory group `offsite_hosts` — provisioned
|
||||
as **Terraform IaC** (`hetznercloud/hcloud`), managed independently of the Proxmox
|
||||
cluster (its own provider + local state). Ansible configuration: `base` role, plus a
|
||||
|
|
@ -117,7 +116,7 @@ allocated for it.
|
|||
address as a mesh-independent secondary path, so a mesh/coordinator outage never
|
||||
blocks on-LAN SSH and Ansible stays off the mesh (Security; Recovery & operations).
|
||||
- The mesh survives a homelab outage because the coordinator is off-site on `askari`,
|
||||
with its management datastore **intended** to be backed up encrypted off `askari` (not yet built — see the Availability amendment / R8) and peers keeping
|
||||
with its management datastore backed up encrypted off `askari` and peers keeping
|
||||
last-known config through a brief coordinator outage (Recovery & operations).
|
||||
- Choosing NetBird over plain OPNsense WireGuard, Tailscale, Tailscale+Headscale, an
|
||||
on-cluster coordinator, a `ubongo` subnet router, and a standalone IdP gains
|
||||
|
|
@ -126,38 +125,6 @@ allocated for it.
|
|||
- Implementation is pending: the role tasks land only once the unbuilt `base` role and
|
||||
service-role machinery exist (Status).
|
||||
|
||||
## Availability — an `askari` outage (amendment 2026-06-20)
|
||||
|
||||
The coordinator is deliberately **single** (one off-site host). Recorded here so its
|
||||
availability envelope is explicit; accepted as **R8** (`docs/security/accepted-risks.md`).
|
||||
|
||||
The mesh is **not** a default gateway — `wt0` routes only the overlay CIDR (`100.99.0.0/16`);
|
||||
normal traffic uses the host's default route. So an `askari` outage has a **narrow blast
|
||||
radius**:
|
||||
|
||||
| Traffic | `askari` down |
|
||||
|---|---|
|
||||
| LAN device → LAN service (direct / via reverse proxy) | unaffected |
|
||||
| node ↔ node over LAN IPs (cluster) | unaffected |
|
||||
| node ↔ node same-LAN over mesh IPs | unaffected (direct P2P) |
|
||||
| **road-warrior → `ubongo` (remote, relayed)** | **breaks** |
|
||||
| mesh control plane (new enrol / ACL change / re-handshake) | pauses |
|
||||
|
||||
Only remote (off-LAN) mesh access to peers is lost, and only when the client is off-LAN **and** `askari`
|
||||
is down simultaneously. On-LAN access to `ubongo` never depends on the mesh (Recovery &
|
||||
operations, above).
|
||||
|
||||
**Recovery:** rebuild the coordinator (`/setup` + re-enrol peers, M5) or restore from backup
|
||||
once ADR-022 lands; the `netbird_coordinator` store backup is the **next sub-project** (its
|
||||
gap is named in R8 and `BACKUP.md`). Client/road-warrior break-glass (reliable resolvers +
|
||||
the coordinator-FQDN `/etc/hosts` pin) is in `docs/runbooks/netbird-client.md`; managed mesh
|
||||
hosts get the same pin via `base__mesh_coordinator_pin`.
|
||||
|
||||
**Not pursued** (deliberately, given the narrow blast radius): direct P2P (punctures the
|
||||
default-deny posture; only helps established sessions), a second relay (needs another public
|
||||
host / reintroduces the home public surface), a second coordinator (unsupported by
|
||||
self-hosted NetBird; against this ADR).
|
||||
|
||||
## Related
|
||||
|
||||
ADR-007 (network — amended), ADR-015 (control host), ADR-002 (security),
|
||||
|
|
|
|||
|
|
@ -3,9 +3,7 @@
|
|||
## Status
|
||||
|
||||
Accepted (2026-06-09). Resolves TODO 7.2 (what to set up on hosts given direct access
|
||||
will be rare) and TODO 3.2 (the service admin-API access question). **Amended
|
||||
2026-06-18:** the on-`ubongo` sudo model for the two local accounts is now settled
|
||||
(see §Sudo model on `ubongo` below).
|
||||
will be rare) and TODO 3.2 (the service admin-API access question).
|
||||
|
||||
**Doctrine ADR.** It pins the operational-access doctrine, the declarative `access__*`
|
||||
data model, the rendered `ACCESS.md` record, and the `/check-access` verifier. It does
|
||||
|
|
@ -165,36 +163,6 @@ exists and `/check-access` is green (or a deviation is recorded in `accepted-ris
|
|||
No scaffold change — same manual-copy-plus-review pattern the sibling records
|
||||
(`SECURITY.md`/`VERIFY.md`) use.
|
||||
|
||||
### Sudo model on `ubongo` (amendment 2026-06-18)
|
||||
|
||||
The original ADR left on-`ubongo` local sudo unspecified. The integration-testing
|
||||
harness shakedown settled it:
|
||||
|
||||
| Account | Role | Sudo |
|
||||
|---|---|---|
|
||||
| `claude` | Automated AI-worker | `NOPASSWD:ALL` via repo-managed drop-in (`base__ai_worker_user`) |
|
||||
| `sjat` | Human operator | Password-required sudo via the `sudo` group |
|
||||
|
||||
**Rationale for `claude NOPASSWD`.** No-sudo blocked the AI-worker from diagnosing a
|
||||
failed test VM: `virsh`, `virt-install`, `cloud-localds`, `nft`, `journalctl` —
|
||||
almost every low-level diagnostic tool — require root. The harness's core value is
|
||||
autonomous spin-up → apply → reboot → assert → diagnose; that loop collapses without
|
||||
local root access.
|
||||
|
||||
**Compensating controls (R7 in `docs/security/accepted-risks.md`):**
|
||||
- `claude`'s password is locked — `NOPASSWD` is the account's *only* sudo path; no
|
||||
interactive login is possible.
|
||||
- `auditd` + Loki attribution (ADR-018) separates human from agent root actions in the
|
||||
audit trail.
|
||||
- The drop-in is repo-managed and revocable in one commit + one deploy.
|
||||
- Single-operator homelab; everything in git; off-machine backups (ADR-022).
|
||||
|
||||
**`sjat` NOPASSWD removed.** The operator's former `NOPASSWD` drop-in
|
||||
(`/etc/sudoers.d/sjat-ansible`, added as an interim measure during M5 NetBird
|
||||
enrolment) was removed 2026-06-18. It was redundant once `claude` held sudo, and its
|
||||
removal restores least-privilege for the human operator. `sjat` retains full sudo
|
||||
capability via the `sudo` group (password required).
|
||||
|
||||
## Consequences
|
||||
|
||||
- Every host and service has at least one documented, verifiable way in — and a verifier
|
||||
|
|
|
|||
|
|
@ -1,180 +0,0 @@
|
|||
# ADR-025 — Local VM integration testing on ubongo
|
||||
|
||||
## Status
|
||||
|
||||
Accepted (2026-06-18). Implements ADR-008 Level 2/3 (deferred for lack of hosts; now
|
||||
viable on ubongo). **RED→GREEN acceptance PASSED on real hardware (2026-06-18):** a
|
||||
throwaway KVM VM on ubongo reproduced the 2026-06-17 incident (base's nftables forward
|
||||
default-deny kills Docker forwarding on reboot) — RED — and survived the reboot once
|
||||
the `docker_host` container-forward drop-in was applied — GREEN. Two shakedown
|
||||
learnings added below.
|
||||
|
||||
## Context
|
||||
|
||||
Molecule (ADR-008 Level 1) tests each role in a single Docker container: one
|
||||
`converge`, no real kernel netfilter, no real Docker daemon in the loop, and **no
|
||||
reboot**. That structurally cannot catch an entire class of bug — reboot-survivability,
|
||||
host-firewall × Docker interaction, and boot-ordering — which is exactly the class
|
||||
that caused the **2026-06-17 mesh-hardening incident**.
|
||||
|
||||
During that incident, `base`'s nftables `forward { policy drop; }` killed the askari
|
||||
Docker host **on reboot**: nftables loaded its default-deny before Docker, breaking
|
||||
published-port DNAT and inter-container forwarding. Public services and the mesh went
|
||||
down. It had worked right after `make deploy`, when Docker's runtime rules still
|
||||
coexisted. `ip_nonlocal_bind` also failed to beat the sshd boot-race, leaving the mesh
|
||||
listener absent at boot. Recovery required the Hetzner console and a WAN-SSH
|
||||
break-glass. Molecule had passed.
|
||||
|
||||
ADR-008's Level 2/3 was deferred "for lack of hosts." ubongo breaks that deferral:
|
||||
|
||||
> verified: ubongo KVM capability · Bash (2026-06-18 session) · `/dev/kvm` present +
|
||||
> accessible (kvm group), Intel VT-x (`vmx`) enabled, 8 vCPU (i3-10100T), ~13 GiB RAM
|
||||
> free of 16, ~198 GiB disk free; libvirt/QEMU/Vagrant **not yet installed** ·
|
||||
> 2026-06-18.
|
||||
|
||||
## Decision
|
||||
|
||||
### 1. Virtualisation approach: libvirt/KVM directly (Approach A)
|
||||
|
||||
A golden Debian-13 genericcloud qcow2 is cached locally on ubongo. Each run boots an
|
||||
ephemeral qcow2 **overlay** backed by it (the golden image is never mutated), seeded
|
||||
via cloud-init NoCloud, driven by a **stdlib-only** Python driver
|
||||
(`scripts/integration-vm.py`) over `virsh` / `virt-install` / `cloud-localds`. No
|
||||
`libvirt-python` dependency — the driver stays portable and the role stays lean.
|
||||
|
||||
### 2. Fidelity envelope
|
||||
|
||||
The bugs are **post-boot**, not in the provisioning path. A lightweight local hypervisor
|
||||
is sufficient: real OS, real kernel netfilter, real Docker daemon, real published-port
|
||||
DNAT, a **real reboot**, and the coordinator running inside the VM (so the VM forms its
|
||||
own one-node mesh, reproducing the circular bootstrap). The Proxmox provisioning chrome
|
||||
is not mirrored.
|
||||
|
||||
### 3. Scope: one throwaway VM at a time, instantiated from real inventory
|
||||
|
||||
The first profile is **"be askari"** — a single box running Docker host + NetBird
|
||||
coordinator + mesh peer, mirroring the host whose incident motivates this work. The
|
||||
mechanism is generic: swap the profile to "be" any inventory host. Multi-VM topologies
|
||||
are a deferred extension.
|
||||
|
||||
### 4. Acceptance: self-validating against the real failure
|
||||
|
||||
The harness is accepted when it can, on a local VM:
|
||||
|
||||
1. Apply `base` (firewall on, no `docker_host` container-forward drop-in) to a Docker
|
||||
host, reboot, and observe the **2026-06-17 breakage** (Docker forwarding dead,
|
||||
services down). If the host instead survives this reboot unbroken, the harness is not faithful.
|
||||
2. Apply the `docker_host` container-forward fix, re-run, and **survive the reboot**.
|
||||
|
||||
### 5. Tiered cert fidelity via a `--certs` knob
|
||||
|
||||
DNS-01 is what makes real certs possible without public inbound (validation is
|
||||
out-of-band via a Gandi TXT record; the VM needs only outbound to ACME + Gandi, which
|
||||
the isolated NAT network provides):
|
||||
|
||||
| Tier | Description | Default? |
|
||||
|---|---|---|
|
||||
| `internal` | Caddy `tls internal` — zero deps, instant. For incident repro and runs where certs are not under test. | Yes |
|
||||
| `le-staging` | Real DNS-01 ACME against Let's Encrypt **staging** — real caddy-gandi path, real cert files/renewal, untrusted root, effectively no rate limits. | Built in v1; use when testing the ACME/cert path. |
|
||||
| `le-prod-wildcard` | A real trusted `*.test.wingu.me` wildcard, **issued once, persisted on ubongo, reused** across runs. | On-demand only. Accepted risk recorded as R6 in `docs/security/accepted-risks.md`. |
|
||||
|
||||
A deliberate "no-egress" failure scenario (reproducing FRICTION 2026-06-17 #4 —
|
||||
`netbird-server` FATAL-loops on GeoLite2 download when egress is lost) forces
|
||||
`internal`, since ACME requires egress.
|
||||
|
||||
### 6. The toolchain is Ansible-managed
|
||||
|
||||
A new non-service role (`integration_test`, `control` group) installs and enables
|
||||
libvirt + QEMU + virtinst reproducibly. The driver manages the golden image lazily on
|
||||
first run (keeping the role lean; no fiddly download/refresh logic in Ansible). The
|
||||
repo owns ubongo's state.
|
||||
|
||||
### 7. Stubs live in an overlay file, never in the real inventory
|
||||
|
||||
Transient inventory entries for the test VM are generated at runtime as a single-host
|
||||
file. Stubs (cert tier, in-VM coordinator endpoint, VM connection details) live in
|
||||
`tests/integration/overrides/<host>.yml` — an explicit, reviewable overlay. The real
|
||||
inventory is never touched, so `make tf-inventory` and "don't edit inventory directly"
|
||||
stay intact.
|
||||
|
||||
## Consequences
|
||||
|
||||
- **Reconciles ADR-015:** ubongo runs ephemeral KVM test VMs as part of its
|
||||
local-test-runner role — it is still not a production hypervisor. A default VM
|
||||
(~2 vCPU / 3 GiB / 20 GiB thin overlay) against ~13 GiB free is comfortable; the
|
||||
driver enforces **one integration VM at a time** (resource guard, name-prefix
|
||||
`boma-it-*`) and refuses to start below a free-RAM threshold.
|
||||
- **Operationalises the standing rule:** "firewall/sshd/boot changes must be tested on
|
||||
a real VM with a real reboot before they touch a live host" (FRICTION 2026-06-17 #6)
|
||||
becomes a concrete, runnable step documented in `docs/runbooks/integration-testing.md`.
|
||||
- **Accepted risk R6:** `le-prod-wildcard` runs pass the production Gandi PAT
|
||||
(`vault.gandi.pat`) to an ephemeral local VM and write transient `_acme-challenge`
|
||||
TXT records into the real `wingu.me` zone. Scope: on-demand only; for ACME-path testing, `le-staging` is the
|
||||
default. Compensating controls: ephemeral VM, isolated NAT network, TXT records
|
||||
auto-removed by Caddy after validation.
|
||||
- **Three safety invariants** make the test tool itself safe:
|
||||
1. The transient inventory contains only the test VM — no real host is ever in scope.
|
||||
2. "Be askari" points NetBird at the in-VM coordinator — the VM forms its own one-node
|
||||
mesh; it never enrols in the real mesh.
|
||||
3. Test VMs sit on an isolated libvirt NAT network — outbound NAT for ACME/image pulls
|
||||
only; not reachable from the LAN (`10.20.x`) or the real mesh.
|
||||
- **Diagnostics on failure** (catching a bug is the point): failure keeps the VM and
|
||||
dumps `nft list ruleset`, `docker ps`, `ss -tlnp`, `journalctl -b`,
|
||||
`systemd-analyze critical-chain`. `make test-integration-clean` reaps all `boma-it-*`
|
||||
orphans. Diagnostics land in gitignored `~/integration-runs/<ts>-<host>/`.
|
||||
- **Future pinch:** concurrency with the Level-4 Chromium/Playwright stack (ADR-017)
|
||||
competes for ubongo RAM. The resource guard is the v1 answer — one integration VM at a
|
||||
time; don't run alongside a heavy Level-4 session. Revisit at `/capacity-review`.
|
||||
|
||||
## Scope
|
||||
|
||||
**In scope:** reboot-survivability, host-firewall × Docker interaction, boot-ordering,
|
||||
cert/ACME paths, mesh bootstrap on one box.
|
||||
|
||||
**Out of scope (v1):** multi-VM mini-cluster (inter-host mesh dataplane); CI gate
|
||||
(this is an interactive, agent-driven pre-deploy check; CI stays lint + Molecule per
|
||||
ADR-008/010); the Proxmox provisioning path (the bugs live in the boot/kernel/Docker
|
||||
layer, not provisioning).
|
||||
|
||||
## What was ruled out
|
||||
|
||||
| Option | Reason |
|
||||
|---|---|
|
||||
| **Proxmox VE nested on ubongo** | Highest fidelity including the provisioning step, but heavy (nested virt, RAM), in tension with ADR-015, and the incident bugs do not live in provisioning. |
|
||||
| **Vagrant + vagrant-libvirt** | Mature lifecycle/snapshots, but adds the Ruby/Vagrant ecosystem + a fragile plugin; boxes drift from the real Debian cloud image; the reboot→assert sequence still needs custom logic. |
|
||||
| **terraform-provider-libvirt** | Declarative and reuses TF, but poor at the imperative apply→reboot→re-apply test sequence; adds throwaway state; blurs ADR-006's "TF owns *production* VM existence on Proxmox" boundary. |
|
||||
|
||||
## Verified facts (ADR-014)
|
||||
|
||||
- verified: ubongo KVM capability · Bash · `/dev/kvm` present + accessible (kvm group),
|
||||
Intel VT-x (`vmx`) enabled, 8 vCPU (i3-10100T), ~13 GiB RAM free of 16, ~198 GiB
|
||||
disk free · 2026-06-18.
|
||||
|
||||
## Shakedown learnings (2026-06-18 live run)
|
||||
|
||||
Two findings from the RED→GREEN acceptance run that affect anyone operating the harness:
|
||||
|
||||
1. **Boot firmware: UEFI required.** The Debian 13 genericcloud image triple-faults
|
||||
under legacy BIOS/SeaBIOS and does not reach the kernel. Boot the VM with UEFI
|
||||
(`virt-install --boot uefi`; `ovmf` package). The driver does this by default; note
|
||||
it here so the requirement is findable.
|
||||
|
||||
2. **`claude` sudo is load-bearing.** VM management (`virsh`, `virt-install`,
|
||||
`cloud-localds`) and offline diagnostics (`nft list ruleset`, `journalctl -b`,
|
||||
`systemd-analyze critical-chain`) all require root. The harness assumes the
|
||||
AI-worker has `NOPASSWD:ALL` sudo on `ubongo` — settled as the ADR-015 amendment
|
||||
(2026-06-18) and registered as R7 in `docs/security/accepted-risks.md`. A `claude`
|
||||
account without sudo will block the harness at the first `virsh` call.
|
||||
|
||||
All nine shakedown findings (including the UEFI boot-loop) are in
|
||||
`docs/FRICTION.md`.
|
||||
|
||||
## Related
|
||||
|
||||
- ADR-006 — Terraform owns production VM existence (boundary this ADR respects).
|
||||
- ADR-008 — Testing methodology (Levels 1–4); this ADR is the concrete build of Level 2/3.
|
||||
- ADR-015 — Control host (ubongo); this ADR reconciles "not a hypervisor" with ephemeral test VMs. **Supersedes** ADR-015's "no local sudo" sub-decision for the AI-worker — the shakedown necessitated `claude` NOPASSWD sudo (ADR-023 §4; access model in ADR-021, risk R7).
|
||||
- ADR-016 — Mesh VPN; the "be askari" profile includes the coordinator role.
|
||||
- ADR-020 — Firewall strategy; firewall × Docker interaction is what this harness tests.
|
||||
- ADR-021 — Operational access; sudo model for `claude` and `sjat` on `ubongo`.
|
||||
- ADR-024 — Reverse proxy (Caddy); cert tiers exercise the DNS-01 ACME path.
|
||||
|
|
@ -25,7 +25,7 @@
|
|||
- **Storage:** 256 GB SanDisk X600 SATA 2.5" SSD (model SD9TB8W256G1001; TCG Opal-capable, Opal unused — no disk encryption)
|
||||
- **NICs:** wired GbE, interface eno1, MAC 88:a4:c2:e0:ee:da
|
||||
- **BIOS:** Lenovo M2WKT5AA (2023-06-20)
|
||||
- **Notes:** always-on; control plane + AI-worker (dedicated `claude` user) + local test runner (Molecule/Docker) per ADR-015; not a Proxmox guest; remote access currently LAN SSH only (mesh deferred). Also runs **one ephemeral KVM integration test VM** (~3 GiB RAM) at a time per ADR-025 — the resource guard enforces one-at-a-time; do not run a test-integration cycle alongside a heavy Level-4 browser session (Chromium/Playwright).
|
||||
- **Notes:** always-on; control plane + AI-worker (dedicated `claude` user) + local test runner (Molecule/Docker) per ADR-015; not a Proxmox guest; remote access currently LAN SSH only (mesh deferred)
|
||||
|
||||
### fisi (backup node — outside the cluster; provisional)
|
||||
- **Model / form factor:** HP Elite 600 G9 (tower)
|
||||
|
|
|
|||
|
|
@ -50,13 +50,6 @@ Don't install these until their trigger lands — then add them here and to
|
|||
- **The venv-activate hook** — this repo expects the Python `.venv` active for Bash
|
||||
commands. If you use the user-level `~/.claude/hooks/activate-venv.sh` pattern,
|
||||
replicate it; otherwise `source .venv/bin/activate` per session after `make setup`.
|
||||
- **Forgejo registry login (for image pushes)** — `make caddy-image-push` /
|
||||
`molecule-image-push` need the Docker daemon authenticated to
|
||||
`forgejo.nyumbani.baobab.band`. Run **`make registry-login`** once per machine: it reads
|
||||
`vault.forgejo.registry_token` from the vault and does `docker login --password-stdin`
|
||||
(no interactive prompt, so an agent can complete a push). The token is operator-minted
|
||||
(Forgejo → Settings → Applications → Generate Token, package read+write) and set via
|
||||
`make edit-vault`; until then `registry-login` prints how to obtain it. (2026-06-17 kaizen.)
|
||||
|
||||
## 4. A note on user-level settings
|
||||
|
||||
|
|
|
|||
|
|
@ -1,229 +0,0 @@
|
|||
# Runbook — Local VM integration testing
|
||||
|
||||
## When to use this
|
||||
|
||||
Run a local VM integration test before deploying any change that touches:
|
||||
|
||||
- **nftables / firewall rules** (the `firewall` concern of `base`)
|
||||
- **sshd configuration** (listener address, port, key types, `base` hardening)
|
||||
- **boot ordering or kernel parameters** (systemd units, sysctl)
|
||||
- **Docker host networking** (`docker_host` DNAT rules, published-port forwarding, `daemon.json`)
|
||||
|
||||
These are the change classes that Molecule (ADR-008 Level 1) cannot catch: they require
|
||||
a real kernel reboot to surface. This harness is the concrete tool for ADR-008 Level 2/3
|
||||
(see ADR-025) and directly operationalises two standing rules:
|
||||
|
||||
- **"Test risky infra before live deploy"** (standing rule, ubongo memory) — firewall/sshd/boot changes must be tested on a real VM with a real reboot before touching a live host.
|
||||
- **FRICTION 2026-06-17 #6 — validate reboot-recovery before retiring the break-glass** — the lesson crystallised from the mesh-hardening incident: confirm the host recovers from reboot *while you still have the break-glass open*, not after.
|
||||
|
||||
You do not need this runbook for pure-config changes (template rendering, package lists, user management) — Molecule covers those.
|
||||
|
||||
---
|
||||
|
||||
## First-deploy (one-time setup)
|
||||
|
||||
The `integration_test` role installs libvirt + QEMU + virtinst on ubongo and adds the
|
||||
operator accounts (`sjat`, `claude`) to the `libvirt` and `kvm` groups.
|
||||
|
||||
```bash
|
||||
make deploy PLAYBOOK=site LIMIT=ubongo TAGS=integration_test
|
||||
```
|
||||
|
||||
**Re-login after this run** — group membership changes do not take effect in the current
|
||||
session. The driver (`scripts/integration-vm.py`) requires both `libvirt` and `kvm`
|
||||
group membership to create and manage VMs.
|
||||
|
||||
The golden Debian-13 genericcloud qcow2 image is downloaded lazily on the first run
|
||||
(one-time cost, ~500 MB); subsequent runs reuse the cached image.
|
||||
|
||||
---
|
||||
|
||||
## Running a cycle
|
||||
|
||||
### Makefile interface (recommended)
|
||||
|
||||
```bash
|
||||
# Full cycle (provision → apply → reboot → assert → teardown on pass)
|
||||
make test-integration HOST=askari
|
||||
|
||||
# With a specific cert tier
|
||||
make test-integration HOST=askari CERTS=le-staging
|
||||
|
||||
# Keep the VM alive after the run (for manual inspection)
|
||||
make test-integration HOST=askari KEEP=1
|
||||
|
||||
# Destroy all orphan integration VMs (name-prefix boma-it-*)
|
||||
make test-integration-clean
|
||||
```
|
||||
|
||||
`HOST` is a hostname from the production inventory (the profile
|
||||
`tests/integration/profiles/<host>.json` must exist — see Adding a new profile below). `CERTS` defaults
|
||||
to `internal`.
|
||||
|
||||
### Lower-level driver
|
||||
|
||||
The driver (`scripts/integration-vm.py`) exposes individual lifecycle steps for manual
|
||||
or scripted use:
|
||||
|
||||
| Sub-command | What it does |
|
||||
|---|---|
|
||||
| `up` | Ensure golden image → create ephemeral overlay → cloud-init seed → boot |
|
||||
| `apply` | Run the site playbook against the transient inventory (real apply) |
|
||||
| `reboot` | `virsh reboot` + wait for a verified reboot (boot-id change) — the step Molecule cannot do |
|
||||
| `assert` | Run `tests/integration/verify.yml` (outcome assertions) |
|
||||
| `cycle` | `up` → `apply` → `reboot` → `assert` → `down` (default: destroy on pass) |
|
||||
| `down` | Destroy the VM + overlay |
|
||||
| `prune` | Destroy all `boma-it-*` VMs + overlays (orphan cleanup) |
|
||||
| `console` | Print the VM's captured serial-console log |
|
||||
|
||||
```bash
|
||||
# Example: step through manually
|
||||
python3 scripts/integration-vm.py up --host askari
|
||||
python3 scripts/integration-vm.py apply --host askari
|
||||
python3 scripts/integration-vm.py reboot --host askari
|
||||
python3 scripts/integration-vm.py assert --host askari
|
||||
python3 scripts/integration-vm.py down --host askari
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Cert tiers
|
||||
|
||||
| Tier | Flag | Use when |
|
||||
|---|---|---|
|
||||
| `internal` | `CERTS=internal` (default) | Incident repro, firewall/sshd/boot changes where certs are not under test. Zero deps, instant. |
|
||||
| `le-staging` | `CERTS=le-staging` | Testing the Caddy DNS-01 ACME path, cert renewal logic, or the `caddy-gandi` plugin. Real cert files, untrusted root, effectively no rate limits. Requires `vault.gandi.pat`. |
|
||||
| `le-prod-wildcard` | `CERTS=le-prod-wildcard` | Verifying TLS behaviour with a real trusted cert. On-demand only — accepted risk R6 (`docs/security/accepted-risks.md`): the production Gandi PAT reaches an ephemeral VM and transient TXT records are written into the real `wingu.me` zone. |
|
||||
|
||||
> A deliberate "no-egress" scenario (reproducing FRICTION 2026-06-17 #4 — the
|
||||
> `netbird-server` GeoLite2 FATAL-loop when NAT masquerade is wiped) **must** use
|
||||
> `CERTS=internal`: the egress loss is the fault being simulated, and ACME requires egress.
|
||||
|
||||
---
|
||||
|
||||
## Diagnostics and inspecting a failed VM
|
||||
|
||||
### Where diagnostics land
|
||||
|
||||
Diagnostics from every run are captured in:
|
||||
|
||||
```
|
||||
~/integration-runs/<timestamp>-<host>/
|
||||
```
|
||||
|
||||
This directory is gitignored. On a failed assert step, the driver dumps:
|
||||
|
||||
- `nft list ruleset` — the live nftables state at failure
|
||||
- `docker ps -a` — container states
|
||||
- `ss -tlnp` — listening sockets
|
||||
- `journalctl -b` — full boot log
|
||||
- `systemd-analyze critical-chain` — boot timing
|
||||
- Serial console capture (on boot/SSH failure — the automated equivalent of the Hetzner
|
||||
console, addressing FRICTION 2026-06-17 #5)
|
||||
|
||||
The agent reads these directly from `~/integration-runs/` — no manual download needed.
|
||||
|
||||
### Inspecting a kept or failed VM
|
||||
|
||||
When a run fails or when `KEEP=1` is passed, the VM is left running. Connect to it:
|
||||
|
||||
```bash
|
||||
# Serial console (no SSH needed — useful when SSH is the fault)
|
||||
python3 scripts/integration-vm.py console --host askari
|
||||
# or directly:
|
||||
virsh console boma-it-askari
|
||||
# Exit with Ctrl-]
|
||||
|
||||
# SSH (as the ansible user, IP from virsh)
|
||||
virsh domifaddr boma-it-askari --source lease
|
||||
ssh ansible@<IP>
|
||||
|
||||
# List all integration VMs
|
||||
virsh list --all | grep boma-it-
|
||||
```
|
||||
|
||||
### Cleanup
|
||||
|
||||
```bash
|
||||
# Destroy a specific VM
|
||||
python3 scripts/integration-vm.py down --host askari
|
||||
|
||||
# Reap all orphans
|
||||
make test-integration-clean
|
||||
# or:
|
||||
python3 scripts/integration-vm.py prune
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Safety invariants
|
||||
|
||||
These make the test tool itself safe — the harness cannot reach or modify production:
|
||||
|
||||
1. **Single-host transient inventory** — the playbook apply runs against a generated
|
||||
single-host inventory (`ansible_host=<VM lease IP>`). No real host is ever in scope.
|
||||
2. **In-VM coordinator only** — "be askari" points NetBird at the coordinator running
|
||||
inside the VM itself (localhost endpoint). The VM forms its own one-node mesh; it
|
||||
never enrols in the real NetBird mesh.
|
||||
3. **Isolated NAT network** — test VMs sit on a dedicated libvirt NAT network.
|
||||
Outbound NAT provides ACME/image-pull access, but the VM is not reachable from
|
||||
the LAN (`10.20.x`) or the real mesh.
|
||||
|
||||
---
|
||||
|
||||
## Resource constraints
|
||||
|
||||
The default VM profile is ~2 vCPU / 3 GiB RAM / 20 GiB thin-provisioned overlay. The
|
||||
driver enforces **one integration VM at a time** (refusing to start if another
|
||||
`boma-it-*` VM is already running) and refuses to start below the free-RAM threshold
|
||||
(ubongo has ~13 GiB free at baseline, per ADR-025).
|
||||
|
||||
**Do not run a test-integration cycle alongside a Level-4 browser session**
|
||||
(Chromium/Playwright, ADR-017) — both compete for ubongo RAM. The resource guard is the
|
||||
enforcement mechanism, not a suggestion.
|
||||
|
||||
---
|
||||
|
||||
## Adding a new profile
|
||||
|
||||
To make the harness "be" a different host:
|
||||
|
||||
1. Create `tests/integration/profiles/<hostname>.json` — specifies which roles to apply
|
||||
and base VM sizing for that host.
|
||||
2. Create `tests/integration/overrides/<hostname>.yml` — the explicit stub overlay:
|
||||
cert tier, in-VM coordinator endpoint (if the host runs the coordinator),
|
||||
`ansible_host` placeholder, and any other variables that must differ from the real
|
||||
inventory (e.g. public DNS → local resolution, geo-DB disable for coordinator).
|
||||
3. Add assertions to `tests/integration/verify.yml` (or extend an existing task with a
|
||||
`when: inventory_hostname == '<hostname>'` guard) for any host-specific outcomes.
|
||||
4. Run `make test-integration HOST=<hostname>` to validate the new profile.
|
||||
|
||||
All stubs must be explicit in the overlay — the real inventory is never edited.
|
||||
|
||||
---
|
||||
|
||||
## Reproducing the 2026-06-17 incident
|
||||
|
||||
The acceptance test for the harness (ADR-025) deliberately reproduces the incident:
|
||||
|
||||
1. Run with today's `base` (firewall on, no `docker_host` container-forward drop-in):
|
||||
```bash
|
||||
make test-integration HOST=askari CERTS=internal
|
||||
```
|
||||
The assert step **must FAIL** after reboot (Docker forwarding dead, published ports
|
||||
unreachable). If it passes, the harness is not faithful.
|
||||
|
||||
2. Implement the `docker_host` container-forward rules (FRICTION 2026-06-17 #1 fix) and
|
||||
re-run. The assert step **must PASS** across the reboot.
|
||||
|
||||
This round-trip proves: (a) the harness faithfully reproduces the incident, and (b) the
|
||||
fix survives a real reboot.
|
||||
|
||||
---
|
||||
|
||||
## Related
|
||||
|
||||
- ADR-025 — decision record for this harness (approach, cert tiers, safety invariants)
|
||||
- ADR-008 — testing methodology; this is Level 2/3
|
||||
- `docs/security/accepted-risks.md` R6 — `le-prod-wildcard` accepted risk
|
||||
- `docs/FRICTION.md` — 2026-06-17 signals that motivated this runbook
|
||||
|
|
@ -1,144 +0,0 @@
|
|||
# Runbook — Enrolling a NetBird client (road-warrior device)
|
||||
|
||||
Joins a **client/road-warrior device** (laptop, desktop, phone) to the boma NetBird mesh
|
||||
so it can reach `ubongo` and other peers from anywhere. The self-hosted coordinator is on
|
||||
`askari` (ADR-016, M4b); enrollment lands a device on the `100.64.0.0/10` overlay.
|
||||
|
||||
> **Hosts vs clients.** Managed **Linux hosts** join via the `base` role's `mesh` concern
|
||||
> (`base__mesh_enabled: true` + the reusable key in `vault.netbird.setup_key`) — see
|
||||
> ADR-016 / the `base` README, *not* this runbook. This runbook is for **user devices**
|
||||
> that are not managed with Ansible.
|
||||
|
||||
verified: NetBird client install + self-hosted `--management-url` flow · docs.netbird.io
|
||||
(`/get-started/install/windows`, `/get-started/cli`) · 2026-06-17
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- The coordinator's first-boot `/setup` admin exists and you can log in at
|
||||
`https://netbird.askari.wingu.me`.
|
||||
- **Auth, pick one:**
|
||||
- **SSO** (recommended for a personal device) — your dashboard account; no secret to copy.
|
||||
- **Setup key** — dashboard → **Settings → Setup Keys** → a reusable key (mint a
|
||||
client-specific one for clean ACL grouping, or reuse the existing reusable key).
|
||||
- Local **admin rights** on the device (the client installs a service).
|
||||
- **Coordinator facts:** management URL `https://netbird.askari.wingu.me`; `ubongo`
|
||||
= `100.99.146.14` (`ubongo.netbird.selfhosted`); `askari` = `100.99.226.39`.
|
||||
|
||||
---
|
||||
|
||||
## Part A — Windows 11
|
||||
|
||||
1. **Install:** download + run the MSI **https://pkgs.netbird.io/windows/msi/x64**
|
||||
(official x64 client; installs the tray app + the `netbird` service).
|
||||
2. **Connect** from an **elevated** Windows Terminal / PowerShell ("Run as administrator"):
|
||||
```powershell
|
||||
netbird up --management-url https://netbird.askari.wingu.me
|
||||
```
|
||||
A browser opens — sign in with your dashboard account. (SSO won't open a browser?
|
||||
Use a key: `netbird up --setup-key <KEY> --management-url https://netbird.askari.wingu.me`.)
|
||||
3. Proceed to **Part C** (verify).
|
||||
|
||||
---
|
||||
|
||||
## Part B — Other platforms (same management URL)
|
||||
|
||||
- **macOS / Linux desktop:** install the client (macOS: NetBird app / Homebrew; Linux:
|
||||
`pkgs.netbird.io` per the distro — same apt/rpm flow as `base`'s `mesh` concern), then
|
||||
`netbird up --management-url https://netbird.askari.wingu.me` (Linux: prefix `sudo`).
|
||||
- **Android / iOS:** install the **NetBird** app, then in **Settings → Advanced /
|
||||
Server** set the management server to `https://netbird.askari.wingu.me` **before**
|
||||
logging in; connect and complete the SSO login. (Setup keys are supported in-app too.)
|
||||
|
||||
---
|
||||
|
||||
## Part C — Verify + use
|
||||
|
||||
```sh
|
||||
netbird status # expect: Management: Connected, Signal: Connected, a 100.x NetBird IP
|
||||
netbird status -d # peer detail — ubongo (100.99.146.14) + askari (100.99.226.39) listed
|
||||
```
|
||||
Reach `ubongo` over the mesh:
|
||||
```sh
|
||||
ssh sjat@100.99.146.14 # or: ssh sjat@ubongo.netbird.selfhosted
|
||||
```
|
||||
**SSH auth is separate from the mesh:** `ubongo` is key-only (passwords disabled), so the
|
||||
device needs an SSH key authorised for `sjat@ubongo`. The mesh provides the network path;
|
||||
the SSH key provides auth.
|
||||
|
||||
---
|
||||
|
||||
## Troubleshooting — mesh drops / SSH to `ubongo` times out
|
||||
|
||||
Symptom: SSH to `ubongo` (or any peer) times out for minutes and recovers on its own;
|
||||
`netbird status` shows **Management/Signal: Disconnected** or peers stuck **Connecting**.
|
||||
|
||||
verified: client DNS/relay behaviour + NRPT scope read from a 0.72.4 debug bundle;
|
||||
mitigations per docs.netbird.io (`/manage/dns/troubleshooting`,
|
||||
`/help/troubleshooting-client`) · 2026-06-18
|
||||
|
||||
**1. Triage — is it your device or the coordinator?** On the device:
|
||||
```sh
|
||||
netbird status -d # Management/Signal Connected? peers P2P/Relayed?
|
||||
nslookup netbird.askari.wingu.me # coordinator FQDN
|
||||
nslookup pkgs.netbird.io # a PUBLIC name — control test
|
||||
```
|
||||
If the relay/handshake errors say `lookup netbird.askari.wingu.me: no such host` **and**
|
||||
a *public* name (`pkgs.netbird.io`) also fails to resolve, your **local resolver is
|
||||
dead** — the coordinator and `ubongo` are almost certainly fine. NetBird only manages
|
||||
`*.netbird.selfhosted` resolution (a single NRPT rule), so it is **not** the cause.
|
||||
Confirm from the other side if you can: the dashboard shows peer *last-seen*; `askari`/
|
||||
`ubongo` staying green ⇒ the fault is your device's network.
|
||||
|
||||
**Why it cascades:** NetBird re-resolves the coordinator FQDN on every reconnect. A
|
||||
network transition (Wi-Fi ↔ phone hotspot, sleep/wake) that briefly kills DNS means it
|
||||
can't reach management/signal/relay — and since `ubongo` is **relay-only** (below), there
|
||||
is no direct path to fall back to, so SSH dies until DNS recovers.
|
||||
|
||||
**2. Make the device resilient:**
|
||||
- **Reliable resolvers** — set the device's DNS to public resolvers (`1.1.1.1`, `8.8.8.8`)
|
||||
rather than a network-handed or homelab-internal resolver that's unreachable off-LAN.
|
||||
Windows: inspect with `Get-DnsClientServerAddress`.
|
||||
- **Pin the coordinator** so a DNS hiccup can't strand the client — add to the hosts file
|
||||
(`C:\Windows\System32\drivers\etc\hosts` as admin, or `/etc/hosts`):
|
||||
```
|
||||
77.42.120.136 netbird.askari.wingu.me
|
||||
```
|
||||
`askari`'s stable WAN IP; TLS still validates on the hostname. Removes the multi-minute
|
||||
reconnect deadlocks.
|
||||
|
||||
**3. Break-glass — reach `ubongo` without the mesh.** When the mesh is down you still need
|
||||
a way in. On the home LAN, go straight to `ubongo`'s wired address (bypasses the mesh and
|
||||
coordinator DNS entirely):
|
||||
```sh
|
||||
ssh sjat@10.20.10.151 # ubongo eno1 (LAN) — verify this works from your device NOW
|
||||
```
|
||||
> ⚠️ This works **today** only because `ubongo`'s host-firewall default-deny is not yet
|
||||
> applied. When the deferred mesh-hardening lands (SSH only on `wt0`), this path closes
|
||||
> unless a break-glass SSH rule is added to the firewall catalog. That hardening **must**
|
||||
> keep a non-mesh break-glass (catalog SSH rule from a trusted LAN/admin source) — else a
|
||||
> DNS/mesh outage = full lockout. (ADR-021 break-glass.)
|
||||
|
||||
**Why `ubongo` is relay-only (and P2P is not the fix).** Peers connect to `ubongo` as
|
||||
`Relayed`, never `P2P`: its `nftables` default-deny drops the inbound UDP that ICE
|
||||
hole-punching needs (egress is open, so STUN itself succeeds). This is the **intended
|
||||
current posture** — P2P / NAT-traversal is the *deferred mesh-hardening* (ADR-016/020,
|
||||
STATUS.md). Enabling it needs a firewall-catalog UDP entry **plus** an `accepted-risks.md`
|
||||
deviation or ADR amendment, and OPNsense NAT work — and it would **not** have prevented a
|
||||
DNS-driven outage (a re-handshake still needs signal, which needs DNS). Tracked as future
|
||||
hardening, not a quick fix.
|
||||
|
||||
---
|
||||
|
||||
## Notes
|
||||
|
||||
- **Split-tunnel:** NetBird routes only the `100.x` overlay by default — normal/work
|
||||
networking is unaffected.
|
||||
- **Persistence:** the service auto-starts on boot and reconnects; the tray app has
|
||||
Connect/Disconnect; CLI `netbird down` / `netbird up` (no flags after first setup).
|
||||
- **Troubleshooting** — *"failed while getting Management Service public key"* / won't
|
||||
register: confirm `https://netbird.askari.wingu.me` loads in a browser from the device
|
||||
(DNS + TLS + the gRPC routing through Caddy are reachable), the URL is exact, and the
|
||||
terminal is elevated. For peers stuck Disconnected/Connecting or SSH-to-`ubongo`
|
||||
timeouts that recover on their own, see **Troubleshooting — mesh drops** above.
|
||||
- **Removing a device:** `netbird down` then uninstall; revoke its peer in the dashboard
|
||||
(and the setup key if one-off).
|
||||
|
|
@ -109,13 +109,6 @@ make check PLAYBOOK=site
|
|||
# Should report no changes
|
||||
```
|
||||
|
||||
> **Pre-flight before lockout-risky changes (firewall / sshd / boot):** before applying
|
||||
> any change that touches nftables rules, SSH configuration, or boot ordering, run
|
||||
> `make test-integration HOST=<name>` and confirm reboot-recovery on the local VM
|
||||
> **while the break-glass (Proxmox console / Hetzner console) is still open**. Do not
|
||||
> retire the break-glass until the integration test passes. See
|
||||
> `docs/runbooks/integration-testing.md` and ADR-025.
|
||||
|
||||
---
|
||||
|
||||
## Part E — Control node (`ubongo`, manual exception)
|
||||
|
|
|
|||
|
|
@ -114,20 +114,7 @@ reason and gets no `BACKUP.md`. Once the backup node exists, `/check-backup <rol
|
|||
proves the declared state is captured — part of the service-clearance gate
|
||||
(`docs/security/service-checklist.md`).
|
||||
|
||||
### 13. Pre-flight for lockout-risky roles
|
||||
|
||||
If the new role touches nftables rules, SSH configuration, or boot ordering, run a
|
||||
local VM integration test and confirm reboot-recovery **before** deploying to a live
|
||||
host and while the host's break-glass (Proxmox console / Hetzner console) is still
|
||||
open:
|
||||
|
||||
```bash
|
||||
make test-integration HOST=<target-host>
|
||||
```
|
||||
|
||||
See `docs/runbooks/integration-testing.md` and ADR-025.
|
||||
|
||||
### 14. Commit
|
||||
### 13. Commit
|
||||
|
||||
```bash
|
||||
git checkout -b role/<rolename>
|
||||
|
|
|
|||
|
|
@ -18,11 +18,8 @@ revisit (trigger).
|
|||
| R3 | **Self-hosted mesh control plane is a public target on `askari`** — the NetBird coordinator (ADR-016) exposes a management API + dashboard (TCP 80/443) and STUN (UDP 3478) on `askari`'s public IP; the management API controls the whole mesh (NetBird v0.72.4 embeds STUN in the combined server — no separate Coturn) | Self-hosting means **no third-party trust** and an off-site control plane that survives a homelab outage (boma's sovereignty ethos). Residual surface is on `askari` (already a public VPS) and is mitigated: TLS + embedded-IdP login, source-IP restriction where practical, `base` hardening, version-pinned NetBird (ADR-011) patched on boma's cadence | A coordinator compromise or unpatched NetBird CVE; the management plane is reachable without auth/IP-limits; the operational burden makes a hosted coordinator worth reconsidering |
|
||||
| R4 | **No cryptographic WORM for logs** — shipped logs are append-only via Loki's push API and copied off-site to `askari` (ADR-018), but the stored chunks are not object-locked/immutable; a root-on-`askari` attacker could edit history | Append-only push + off-site copy already defeats the realistic threat (a host attacker covering their tracks), and the off-site copy survives even a full-cluster compromise. True WORM (object-lock) is forensic-grade cost for boma's opportunistic threat model (R1) | Threat model shifts toward targeted/forensic; a regulatory/evidentiary need appears; `askari` itself is assessed as a likely target |
|
||||
| R5 | **No disk encryption on `ubongo`** — the control node's SSD (SanDisk X600 256 GB, TCG-Opal-capable but Opal unused) is unencrypted at rest, so it holds recovery-critical secrets in plaintext: the Ansible Vault password's `rbw` local cache and (future) Terraform state. Physical theft of the box would expose them | `ubongo` is always-on in a physically controlled location; compensating controls are a **BIOS supervisor password** and **disabled external/USB + PXE boot** (an attacker cannot trivially boot another OS to read the disk), and the offline-recoverable design means the irreducible root secret (Vaultwarden master password) is never stored on the box anyway. Full-disk encryption was weighed against the always-on/unattended-reboot requirement (LUKS+TPM auto-unlock or passphrase) and deferred for simplicity at this trust level | `ubongo` is relocated to a less-trusted physical location; the box starts holding additional high-value secrets; or a reinstall onto LUKS (TPM-sealed) is undertaken |
|
||||
| R6 | **`le-prod-wildcard` integration runs** — when `CERTS=le-prod-wildcard` is passed to `make test-integration`, the production Gandi PAT (`vault.gandi.pat`) is passed to an ephemeral local test VM via the var overlay, and transient `_acme-challenge` TXT records are written into the real `wingu.me` DNS zone to satisfy the Let's Encrypt DNS-01 challenge. A compromised or long-lived test VM could exfiltrate the PAT; the real zone is briefly (seconds) modified | Scope is **on-demand only** — `le-staging` is the default cert tier (`CERTS=internal` for incident repro); `le-prod-wildcard` is an explicit opt-in. Compensating controls: the VM is ephemeral and destroyed on success; it sits on an isolated libvirt NAT network (no LAN/mesh access); TXT records are auto-removed by Caddy immediately after validation; the PAT is not persisted inside the VM after the run. ADR-025 documents the cert-tier design and the three isolation invariants | The PAT is exfiltrated from a test VM; the `wingu.me` zone shows unexpected records; a `CERTS=le-prod-wildcard` run must be audited or the tier must be revoked |
|
||||
| R7 | **`claude` AI-worker has `NOPASSWD:ALL` sudo on `ubongo`** — the automated AI-worker account can execute any command as root on the control node without a password prompt. A compromised or misbehaving agent session could make arbitrary root-level changes to ubongo | The account is **password-locked** (no interactive `claude` login; `NOPASSWD` sudo is the account's only escalation path, so there is no "su to claude + sudo" attack). `auditd` + Loki attribution (ADR-018) logs every `sudo` invocation with the originating user. The drop-in (`/etc/sudoers.d/claude-ai-worker`) is repo-managed via `base__ai_worker_user` — revocable in one commit + one deploy. Single-operator homelab; all changes in git; off-machine backups (ADR-022). Full rationale: ADR-015 amendment (2026-06-18) + ADR-021 §Sudo model. | The AI-worker executes a destructive action that cannot be rolled back via git; the account key is compromised; the threat model shifts toward targeted remote attackers |
|
||||
| R8 | **Single off-site mesh coordinator is an availability SPOF for remote mesh access** — `askari` hosts the only NetBird management/signal/relay (ADR-016); while askari is down, every *relayed* peer (all of `ubongo`'s, by the deliberate default-deny posture) loses remote mesh reachability and the control plane pauses. The `netbird_coordinator` store also has **no off-site backup yet** (BACKUP.md), so an askari loss loses mesh control-plane state until rebuilt | Inherent to ADR-016's deliberate single off-site coordinator (sovereignty; survives a homelab outage). **Narrow blast radius:** the mesh is not a gateway (`wt0` routes only `100.99.0.0/16`) — LAN, intra-cluster, and local-service traffic are unaffected; only remote/off-LAN mesh access breaks, and only when off-LAN *and* askari is down at once. askari is a reliable always-on VPS; mitigations: client + managed-host coordinator-FQDN DNS pin (`base__mesh_coordinator_pin`; runbook), documented `/setup` rebuild | askari proves unreliable; the cluster grows to depend on the mesh for intra-node traffic; remote mesh access becomes business-critical; or the ADR-022 backup role lands (closes the state-loss half) |
|
||||
|
||||
_Last reviewed: 2026-06-20. The prior gaps (full CIS hardening, SELinux/AppArmor,
|
||||
_Last reviewed: 2026-06-11. The prior gaps (full CIS hardening, SELinux/AppArmor,
|
||||
IDS) were re-challenged and **adopted rather than accepted**: CIS Debian L1+L2 + CIS
|
||||
Docker, AppArmor (enforce), AIDE file-integrity, and Suricata network IDS are now
|
||||
part of the security strategy (ADR-002). See STATUS.md / `docs/TODO.md` for build
|
||||
|
|
|
|||
|
|
@ -1,466 +0,0 @@
|
|||
# Mesh-hardening 1/3 — askari SSH onto wt0 — Implementation Plan
|
||||
|
||||
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
|
||||
|
||||
**Goal:** Make askari's SSH reachable only over the NetBird mesh (`wt0`) and close the WAN `:22` surface at both the host nftables layer and the Hetzner Cloud Firewall, without dropping askari's public services.
|
||||
|
||||
**Architecture:** Three enforcement layers — (1) sshd `ListenAddress` bound to the live `wt0` IP (fail-closed, `ip_nonlocal_bind` to beat the post-boot bind race); (2) the base role's catalog-driven nftables default-deny (SSH already restricted to `wt0` via `base__firewall_mgmt_interface`; add a `public` zone + askari service entries so 80/443/3478 survive); (3) Terraform drops the Hetzner Cloud Firewall WAN `:22` rule. Tasks 1–4 are code (subagent-driven, each Molecule/lint/plan-verified). Task 5 is the live, operator-supervised cutover on the real host.
|
||||
|
||||
**Tech Stack:** Ansible (role `base`, FQCN), nftables, Molecule on Debian 13, `ansible.posix.sysctl`, pytest (filter unit tests), Terraform (`hcloud` provider).
|
||||
|
||||
**Spec:** `docs/superpowers/specs/2026-06-17-mesh-hardening-askari-ssh-wt0-design.md`
|
||||
|
||||
**Conventions:** `make lint` and `make test ROLE=base` before each commit; `make check` before `make deploy`; `make tf-plan` before `make tf-apply`; never hand-edit the generated `offsite.yml`; rbw unlocked for commits touching ansible content.
|
||||
|
||||
---
|
||||
|
||||
### Task 1: base role — sshd `ListenAddress` on wt0 + `ip_nonlocal_bind` (fail-closed)
|
||||
|
||||
**Files:**
|
||||
- Modify: `roles/base/defaults/main.yml`
|
||||
- Modify: `roles/base/tasks/ssh.yml`
|
||||
- Modify: `roles/base/templates/sshd_hardening.conf.j2`
|
||||
- Modify: `roles/base/molecule/default/converge.yml` (fixture)
|
||||
- Modify: `roles/base/molecule/default/verify.yml` (assertions = the test)
|
||||
|
||||
- [ ] **Step 1: Write the failing test (extend Molecule verify)**
|
||||
|
||||
In `roles/base/molecule/default/verify.yml`, add these tasks after the existing "Sshd drop-in present and config valid" block:
|
||||
|
||||
```yaml
|
||||
- name: ListenAddress bound to the fixture mesh IP (mesh-only mode)
|
||||
ansible.builtin.command: grep -q '^ListenAddress 100.99.0.1$' /etc/ssh/sshd_config.d/10-boma.conf
|
||||
changed_when: false
|
||||
- name: ip_nonlocal_bind sysctl drop-in is present
|
||||
ansible.builtin.command: grep -q '^net.ipv4.ip_nonlocal_bind = 1' /etc/sysctl.d/60-boma-nonlocal-bind.conf
|
||||
changed_when: false
|
||||
- name: ip_nonlocal_bind is live in this netns
|
||||
ansible.builtin.command: sysctl -n net.ipv4.ip_nonlocal_bind
|
||||
register: _nonlocal
|
||||
changed_when: false
|
||||
failed_when: _nonlocal.stdout | trim != '1'
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Add the fixture that drives it (Molecule converge)**
|
||||
|
||||
In `roles/base/molecule/default/converge.yml`, add to the `vars:` block (alongside the existing `base__mesh_*`):
|
||||
|
||||
```yaml
|
||||
base__ssh_listen_mesh_only: true
|
||||
base__ssh_listen_addr: "100.99.0.1" # fixture mesh IP (no wt0 in the container)
|
||||
```
|
||||
|
||||
- [ ] **Step 3: Run the test to verify it fails**
|
||||
|
||||
Run: `make test ROLE=base`
|
||||
Expected: FAIL — converge errors or verify fails (`ListenAddress` not rendered; sysctl drop-in absent), because the feature isn't implemented yet.
|
||||
|
||||
- [ ] **Step 4: Add the defaults**
|
||||
|
||||
In `roles/base/defaults/main.yml`, after the `base__ssh_authorised_keys: []` line (end of the hardening block), add:
|
||||
|
||||
```yaml
|
||||
# SSH listen-on-mesh (mesh-hardening 1/3, ADR-016/021). Opt-in: when true, sshd binds
|
||||
# ListenAddress to this host's mesh IP only (not the WAN). The IP comes from the live wt0
|
||||
# fact (ansible_facts.wt0.ipv4.address); base__ssh_listen_addr overrides it. ip_nonlocal_bind
|
||||
# lets sshd bind the mesh IP before wt0 exists at boot. Fails closed: the play asserts a
|
||||
# non-empty address rather than silently listening on all interfaces.
|
||||
base__ssh_listen_mesh_only: false
|
||||
base__ssh_listen_addr: ""
|
||||
```
|
||||
|
||||
- [ ] **Step 5: Resolve + assert + sysctl in `ssh.yml`**
|
||||
|
||||
In `roles/base/tasks/ssh.yml`, insert these tasks at the TOP of the file (before "Ensure openssh-server is installed"):
|
||||
|
||||
```yaml
|
||||
- name: Resolve the sshd mesh listen address (override, else live wt0 fact)
|
||||
ansible.builtin.set_fact:
|
||||
base__ssh_listen_addr_resolved: >-
|
||||
{{ base__ssh_listen_addr
|
||||
or ansible_facts.get('wt0', {}).get('ipv4', {}).get('address', '') }}
|
||||
when: base__ssh_listen_mesh_only | bool
|
||||
|
||||
- name: Fail closed — refuse to render sshd without a known mesh address
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- base__ssh_listen_addr_resolved | length > 0
|
||||
fail_msg: >-
|
||||
base__ssh_listen_mesh_only is true but no mesh address resolved (set
|
||||
base__ssh_listen_addr or ensure wt0 is up so its fact is gathered). Refusing to
|
||||
render sshd ListenAddress empty (which would listen on ALL interfaces).
|
||||
when: base__ssh_listen_mesh_only | bool
|
||||
|
||||
- name: Allow sshd to bind the mesh IP before wt0 exists at boot
|
||||
ansible.posix.sysctl:
|
||||
name: net.ipv4.ip_nonlocal_bind
|
||||
value: "1"
|
||||
sysctl_set: true
|
||||
state: present
|
||||
reload: true
|
||||
sysctl_file: /etc/sysctl.d/60-boma-nonlocal-bind.conf
|
||||
when: base__ssh_listen_mesh_only | bool
|
||||
```
|
||||
|
||||
- [ ] **Step 6: Render the conditional `ListenAddress`**
|
||||
|
||||
In `roles/base/templates/sshd_hardening.conf.j2`, append after the existing `KbdInteractiveAuthentication no` line:
|
||||
|
||||
```jinja
|
||||
{% if base__ssh_listen_mesh_only | bool %}
|
||||
ListenAddress {{ base__ssh_listen_addr_resolved }}
|
||||
{% endif %}
|
||||
```
|
||||
|
||||
- [ ] **Step 7: Run the test to verify it passes**
|
||||
|
||||
Run: `make test ROLE=base`
|
||||
Expected: PASS — converge succeeds; verify confirms `ListenAddress 100.99.0.1`, the sysctl drop-in, and the live value `1`.
|
||||
|
||||
> **Checkpoint (environmental):** if `make test` fails on the sysctl task because the Molecule container can't write `net.ipv4.ip_nonlocal_bind`, add `sysctls: {net.ipv4.ip_nonlocal_bind: "0"}` to the platform in `roles/base/molecule/default/molecule.yml` (pre-creates the namespaced sysctl so the task can set it), then re-run. Note the change in the commit.
|
||||
|
||||
- [ ] **Step 8: Lint**
|
||||
|
||||
Run: `make lint`
|
||||
Expected: `Passed: 0 failure(s)` and `check-tags: OK`.
|
||||
|
||||
- [ ] **Step 9: Commit**
|
||||
|
||||
```bash
|
||||
git add roles/base/defaults/main.yml roles/base/tasks/ssh.yml \
|
||||
roles/base/templates/sshd_hardening.conf.j2 \
|
||||
roles/base/molecule/default/converge.yml roles/base/molecule/default/verify.yml
|
||||
git commit -m "feat(base): opt-in sshd ListenAddress on the mesh IP (fail-closed)
|
||||
|
||||
base__ssh_listen_mesh_only binds sshd to the live wt0 IP only, with
|
||||
ip_nonlocal_bind to beat the post-boot bind race and a fail-closed assert so an
|
||||
unresolved address never silently listens on all interfaces. Molecule covers
|
||||
the render + sysctl. Mesh-hardening 1/3 (ADR-016/021).
|
||||
|
||||
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 2: firewall catalog — `public` zone + askari's public services
|
||||
|
||||
**Files:**
|
||||
- Modify: `inventories/production/group_vars/all/firewall.yml`
|
||||
- Modify: `roles/base/molecule/default/converge.yml` (fixture: public-zone rule)
|
||||
- Modify: `roles/base/molecule/default/verify.yml` (assert the 0.0.0.0/0 rule)
|
||||
- Test: `tests/test_firewall_rules.py` (unit: a `public` zone resolves to `0.0.0.0/0`)
|
||||
|
||||
Rationale: `base__firewall_mgmt_interface` already accepts `:22` on `wt0`. The gap is that the catalog is empty and has no "anywhere" source, so applying default-deny to askari would drop 80/443/3478. We add a `public` zone (`0.0.0.0/0`) and askari's service ingress.
|
||||
|
||||
- [ ] **Step 1: Write the failing unit test**
|
||||
|
||||
In `tests/test_firewall_rules.py`, add:
|
||||
|
||||
```python
|
||||
def test_public_zone_resolves_to_anywhere():
|
||||
catalog = {"web": {"host": "askari",
|
||||
"ingress": [{"from": "public", "port": 443, "proto": "tcp"}]}}
|
||||
zones = {"public": "0.0.0.0/0"}
|
||||
rules = rs.resolve_firewall_rules(catalog, zones, "askari",
|
||||
{"askari": {"ansible_host": "100.99.226.39"}}, {})
|
||||
assert rules == [{"proto": "tcp", "port": 443, "sources": ["0.0.0.0/0"]}]
|
||||
```
|
||||
|
||||
(Module is loaded by the existing importlib shim at the top of the test file as `rs`. If the filter is imported under a different alias there, match it.)
|
||||
|
||||
- [ ] **Step 2: Run it to verify it fails (or passes trivially)**
|
||||
|
||||
Run: `.venv/bin/python -m pytest tests/test_firewall_rules.py -q`
|
||||
Expected: this test PASSES immediately if the filter already resolves arbitrary zones (it does — `_resolve_source` treats any `zones` key generically). That is fine: the unit test documents/locks the `public`-zone contract. If it fails, fix the filter. Either way it must end green.
|
||||
|
||||
- [ ] **Step 3: Add the Molecule fixture (public-zone rule)**
|
||||
|
||||
In `roles/base/molecule/default/converge.yml`, under `firewall_zones:` add `public: 0.0.0.0/0`, and under `firewall_catalog:` add:
|
||||
|
||||
```yaml
|
||||
netbird_stun:
|
||||
host: instance
|
||||
ingress:
|
||||
- { from: public, port: 3478, proto: udp }
|
||||
```
|
||||
|
||||
- [ ] **Step 4: Add the Molecule assertion (the test)**
|
||||
|
||||
In `roles/base/molecule/default/verify.yml`, after the photoprism assertion block, add:
|
||||
|
||||
```yaml
|
||||
- name: Assert the public->stun:3478/udp ingress rule (0.0.0.0/0 source)
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- "'0.0.0.0/0' in nft"
|
||||
- "'udp dport 3478 accept' in nft"
|
||||
fail_msg: "missing public->3478/udp rule for netbird_stun"
|
||||
```
|
||||
|
||||
- [ ] **Step 5: Run the tests**
|
||||
|
||||
Run: `make test ROLE=base` then `.venv/bin/python -m pytest tests/test_firewall_rules.py -q`
|
||||
Expected: both PASS (the rendered ruleset now contains the `0.0.0.0/0 ... udp dport 3478 accept` rule).
|
||||
|
||||
- [ ] **Step 6: Populate the real catalog**
|
||||
|
||||
In `inventories/production/group_vars/all/firewall.yml`, replace the `firewall_zones`/`firewall_catalog` blocks with:
|
||||
|
||||
```yaml
|
||||
# Zone → subnet (from ADR-007). `public` = the WAN (anywhere) for deliberately public
|
||||
# off-site services (askari); home/cluster services use the internal zones only.
|
||||
firewall_zones:
|
||||
mgmt: 10.10.0.0/24
|
||||
srv: 10.20.0.0/24
|
||||
lan: 10.30.0.0/24
|
||||
iot: 10.40.0.0/24
|
||||
guest: 10.50.0.0/24
|
||||
public: 0.0.0.0/0
|
||||
|
||||
# Service catalog: <name> → placement (host | group | hosts) + ingress[].
|
||||
# askari's public surface (ADR-024 Caddy + ADR-016 NetBird STUN). NOTE: the host
|
||||
# nftables template renders IPv4 source rules only; askari is reached via its A record
|
||||
# (no AAAA), so IPv4-only public rules are sufficient (see the spec's IPv6 note).
|
||||
firewall_catalog:
|
||||
reverse_proxy:
|
||||
host: askari
|
||||
ingress:
|
||||
- { from: public, port: 80, proto: tcp }
|
||||
- { from: public, port: 443, proto: tcp }
|
||||
netbird_stun:
|
||||
host: askari
|
||||
ingress:
|
||||
- { from: public, port: 3478, proto: udp }
|
||||
```
|
||||
|
||||
- [ ] **Step 7: Lint**
|
||||
|
||||
Run: `make lint`
|
||||
Expected: clean pass (`check-tags: OK`).
|
||||
|
||||
- [ ] **Step 8: Commit**
|
||||
|
||||
```bash
|
||||
git add inventories/production/group_vars/all/firewall.yml \
|
||||
roles/base/molecule/default/converge.yml roles/base/molecule/default/verify.yml \
|
||||
tests/test_firewall_rules.py
|
||||
git commit -m "feat(firewall): public zone + askari's public services in the catalog
|
||||
|
||||
Adds a public (0.0.0.0/0) zone and askari's Caddy (80/443) + NetBird STUN
|
||||
(3478/udp) ingress so the base nftables default-deny does not drop the live
|
||||
public services when applied to askari. Molecule + filter unit test cover the
|
||||
public-zone rendering. Mesh-hardening 1/3 (ADR-020/024/016).
|
||||
|
||||
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 3: inventory — point Ansible at wt0 + enable mesh-only SSH on askari
|
||||
|
||||
**Files:**
|
||||
- Create: `inventories/production/host_vars/askari.yml`
|
||||
- Modify: `inventories/production/group_vars/offsite_hosts/vars.yml`
|
||||
|
||||
- [ ] **Step 1: Create the host_var override**
|
||||
|
||||
Create `inventories/production/host_vars/askari.yml`:
|
||||
|
||||
```yaml
|
||||
---
|
||||
# Manage askari over the NetBird mesh (wt0), not its WAN IP. This OVERRIDES the
|
||||
# TF-generated inventories/production/offsite.yml (ansible_host = 77.42.120.136); host_vars
|
||||
# outrank the generated inventory and are NOT touched by `make tf-inventory-offsite`.
|
||||
# Mesh-hardening 1/3 — once SSH is wt0-only, the WAN IP is no longer reachable for SSH.
|
||||
ansible_host: 100.99.226.39 # askari's wt0 address (NetBird, M5)
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Enable mesh-only SSH for offsite hosts**
|
||||
|
||||
In `inventories/production/group_vars/offsite_hosts/vars.yml`, replace the file body with:
|
||||
|
||||
```yaml
|
||||
---
|
||||
# Off-site hosts (askari). askari runs the NetBird coordinator AND is a mesh peer
|
||||
# (ADR-016, M5). Mesh-hardening 1/3 (2026-06-17): SSH is moved onto wt0 — sshd binds the
|
||||
# mesh IP only (base__ssh_listen_mesh_only) and the base nftables default-deny applies
|
||||
# (base__firewall_apply defaults true; SSH allowed on wt0 via base__firewall_mgmt_interface,
|
||||
# public services via the catalog). base__mesh_enabled stays true (precondition from M5).
|
||||
base__mesh_enabled: true
|
||||
base__ssh_listen_mesh_only: true
|
||||
```
|
||||
|
||||
- [ ] **Step 3: Verify the override resolves**
|
||||
|
||||
Run: `.venv/bin/ansible-inventory -i inventories/production/ --host askari 2>/dev/null | grep ansible_host`
|
||||
Expected: `"ansible_host": "100.99.226.39"` (the host_var wins over the generated `offsite.yml`).
|
||||
|
||||
- [ ] **Step 4: Lint**
|
||||
|
||||
Run: `make lint`
|
||||
Expected: clean pass.
|
||||
|
||||
- [ ] **Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add inventories/production/host_vars/askari.yml \
|
||||
inventories/production/group_vars/offsite_hosts/vars.yml
|
||||
git commit -m "feat(inventory): manage askari over wt0 + enable mesh-only SSH
|
||||
|
||||
host_vars/askari.yml points ansible_host at the wt0 IP (overriding the generated
|
||||
offsite.yml); offsite_hosts sets base__ssh_listen_mesh_only. Mesh-hardening 1/3.
|
||||
|
||||
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 4: Terraform — retire the Hetzner WAN `:22` rule
|
||||
|
||||
**Files:**
|
||||
- Modify: `terraform/modules/hetzner_vm/main.tf`
|
||||
- Modify: `terraform/modules/hetzner_vm/variables.tf`
|
||||
- Modify: `terraform/environments/offsite/main.tf`
|
||||
|
||||
This task makes the SSH rule conditional and sets askari's admin CIDRs to empty (mesh-only). The live `tf-plan`/`tf-apply` happens in Task 5 — here we only change + format/validate the code.
|
||||
|
||||
- [ ] **Step 1: Gate the SSH rule on a non-empty CIDR list**
|
||||
|
||||
In `terraform/modules/hetzner_vm/main.tf`, replace the static SSH `rule { ... }` block (the one with `port = "22"`) with a dynamic block:
|
||||
|
||||
```hcl
|
||||
# SSH from the control node only — and only when admin CIDRs are set. An empty
|
||||
# ssh_admin_cidrs removes the WAN :22 rule entirely (mesh-only SSH; reach the host over
|
||||
# wt0, break-glass = Hetzner console). Mesh-hardening 1/3.
|
||||
dynamic "rule" {
|
||||
for_each = length(var.ssh_admin_cidrs) > 0 ? [1] : []
|
||||
content {
|
||||
direction = "in"
|
||||
protocol = "tcp"
|
||||
port = "22"
|
||||
source_ips = var.ssh_admin_cidrs
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Default the variable to empty**
|
||||
|
||||
In `terraform/modules/hetzner_vm/variables.tf`, change the `ssh_admin_cidrs` variable to default to an empty list:
|
||||
|
||||
```hcl
|
||||
variable "ssh_admin_cidrs" {
|
||||
description = "Source CIDRs allowed to reach SSH over the WAN. Empty = no WAN SSH rule (mesh-only)."
|
||||
type = list(string)
|
||||
default = []
|
||||
}
|
||||
```
|
||||
|
||||
- [ ] **Step 3: Set askari to mesh-only SSH**
|
||||
|
||||
In `terraform/environments/offsite/main.tf`, change the `ssh_admin_cidrs` argument in the `module "askari"` block to:
|
||||
|
||||
```hcl
|
||||
ssh_admin_cidrs = [] # mesh-only: SSH is reached over wt0; WAN :22 retired (mesh-hardening 1/3)
|
||||
```
|
||||
|
||||
- [ ] **Step 4: Format + validate**
|
||||
|
||||
Run: `cd terraform/environments/offsite && terraform fmt -recursive ../.. && terraform validate && cd -`
|
||||
Expected: `fmt` lists any files it reformatted (include any such files in the Step 5 `git add`); `validate` prints `Success! The configuration is valid.` (offsite is already `init`ed — it has live state.)
|
||||
|
||||
- [ ] **Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add terraform/modules/hetzner_vm/main.tf terraform/modules/hetzner_vm/variables.tf \
|
||||
terraform/environments/offsite/main.tf
|
||||
git commit -m "feat(tf/offsite): retire askari's WAN :22 (mesh-only SSH)
|
||||
|
||||
The Hetzner Cloud Firewall SSH rule is now conditional on a non-empty
|
||||
ssh_admin_cidrs (default []); askari sets it empty so the WAN :22 rule is
|
||||
removed on the next apply. SSH is reached over wt0; break-glass is the Hetzner
|
||||
console. Apply is the live cutover (Task 5). Mesh-hardening 1/3.
|
||||
|
||||
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 5: Live staged cutover (operator-supervised — NOT a subagent task)
|
||||
|
||||
> This task touches the real askari over the network and is lockout-risky. Run it
|
||||
> interactively with the operator, in order, verifying each step before the next. The
|
||||
> firewall's auto-rollback timer + `wait_for_connection` over wt0 is the safety net; the
|
||||
> Hetzner web console is the ultimate break-glass. Do NOT hand this to an unattended agent.
|
||||
|
||||
- [ ] **Step 1: Pre-check the mesh SSH path (before any change)**
|
||||
|
||||
Run: `.venv/bin/ansible askari -i inventories/production/ -m ping`
|
||||
Expected: `SUCCESS` — confirms Ansible reaches askari over `wt0` (Tasks 1–3 are merged, so `ansible_host` is now `100.99.226.39`). If this fails, STOP — the mesh path must work before closing the WAN.
|
||||
|
||||
- [ ] **Step 2: Dry-run the base apply (firewall + sshd)**
|
||||
|
||||
Run: `make check PLAYBOOK=site LIMIT=askari TAGS=firewall,hardening`
|
||||
Expected: shows the nftables ruleset diff (default-deny + wt0 SSH + public 80/443/3478) and the sshd drop-in diff (`ListenAddress 100.99.226.39`); no errors. Review that the public service rules are present (so they won't be dropped).
|
||||
|
||||
- [ ] **Step 3: Apply the host firewall + sshd (auto-rollback armed)**
|
||||
|
||||
Run: `make deploy PLAYBOOK=site LIMIT=askari TAGS=firewall,hardening`
|
||||
Expected: the firewall concern arms the rollback timer, applies, resets the connection, and `wait_for_connection` succeeds over wt0; sshd reloads with the mesh ListenAddress. If connectivity is lost, the timer auto-reverts the ruleset within `base__firewall_rollback_timeout` (45 s).
|
||||
|
||||
- [ ] **Step 4: Verify public services (WAN SSH is still open at the cloud edge at this point)**
|
||||
|
||||
```bash
|
||||
curl -sSf -o /dev/null -w '%{http_code}\n' https://test.askari.wingu.me # expect 200
|
||||
curl -sSf -o /dev/null -w '%{http_code}\n' https://netbird.askari.wingu.me # expect 200
|
||||
```
|
||||
Expected: both `200` (valid certs); the host firewall did not drop the public services. (WAN `:22` is now dropped by the host nftables, but the Hetzner FW still allows it until Step 5 — that's fine.)
|
||||
|
||||
- [ ] **Step 5: Retire the Hetzner WAN `:22` — plan, review, apply**
|
||||
|
||||
Run: `make tf-plan TF_ENV=offsite`
|
||||
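Expected: the plan shows the SSH (`:22`) firewall rule being **removed** — the rule is a nested block, so expect an in-place update of the firewall rather than a resource destroy — and nothing else of substance. Review it.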
|
||||
|
||||
Then: `make tf-apply TF_ENV=offsite`
|
||||
Expected: apply succeeds; the WAN `:22` rule is gone.
|
||||
|
||||
- [ ] **Step 6: Verify the end-state (out-of-band)**
|
||||
|
||||
From an OFF-MESH host (e.g. the operator's laptop with NetBird disconnected, or any other machine that reaches askari only via its public IP):
|
||||
|
||||
```bash
|
||||
nc -vz -w5 77.42.120.136 22 # expect: refused / timeout (WAN SSH closed)
|
||||
nc -vz -w5 77.42.120.136 443 # expect: open (public service intact)
|
||||
```
|
||||
And from ubongo over the mesh: `.venv/bin/ansible askari -i inventories/production/ -m ping` → `SUCCESS`.
|
||||
|
||||
- [ ] **Step 7: Reboot resilience check (optional but recommended)**
|
||||
|
||||
Reboot askari from the Hetzner console; after it comes back, confirm `ansible askari -m ping` succeeds over wt0 without intervention (proves `ip_nonlocal_bind` beat the post-boot bind race).
|
||||
|
||||
- [ ] **Step 8: Update STATUS + ROADMAP**
|
||||
|
||||
- In `STATUS.md`, update the askari row: SSH is now wt0-only; the host nftables default-deny is applied; the Hetzner WAN `:22` is retired. Move "host firewall + moving askari's SSH onto wt0" out of *Pending*.
|
||||
- In `docs/ROADMAP.md`, mark mesh-hardening sub-project 1 (askari SSH→wt0) done; next is sub-project 2 (ubongo default-deny).
|
||||
|
||||
```bash
|
||||
git add STATUS.md docs/ROADMAP.md
|
||||
git commit -m "docs: askari SSH moved onto wt0 (mesh-hardening 1/3 done)
|
||||
|
||||
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
|
||||
```
|
||||
|
||||
- [ ] **Step 9: Push**
|
||||
|
||||
Run: `git push origin main`
|
||||
|
||||
---
|
||||
|
||||
## Self-review (against the spec)
|
||||
|
||||
- **§ three layers** → Task 1 (sshd ListenAddress), Task 2 (nftables catalog; SSH-on-wt0 pre-existing via `base__firewall_mgmt_interface`), Task 4 (Hetzner WAN :22). ✓
|
||||
- **§ boot-race fix** (`ip_nonlocal_bind` + fail-closed assert + live wt0 fact) → Task 1 Steps 4–6. ✓
|
||||
- **§ new code/vars** (`base__ssh_listen_mesh_only`, `base__ssh_listen_addr`, host_vars/askari.yml, offsite flag, catalog, TF) → Tasks 1–4. ✓
|
||||
- **§ staged cutover** → Task 5 Steps 1–6, with the firewall auto-rollback as the gate. ✓
|
||||
- **§ testing** → Molecule render asserts (ListenAddress, sysctl, public-zone rule) + filter unit test + live out-of-band checks. The fail-closed assert runs on every converge, but only its passing path is covered automatically; to spot-check the failing path, temporarily blank `base__ssh_listen_addr` in the converge fixture and confirm `make test ROLE=base` fails on the assert, then revert (manual, not automated — a deliberate-failure Molecule scenario is non-idiomatic). ✓
|
||||
- **§ risks/rollback** → auto-rollback timer (Task 5 Step 3), `ip_nonlocal_bind` (Task 1), Hetzner console break-glass, re-addable TF rule. ✓
|
||||
- **IPv6 note** → recorded in the catalog comment (Task 2 Step 6); acceptable because askari has only an A record.
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,409 +0,0 @@
|
|||
# Mesh-hardening redesign (askari) — Implementation Plan
|
||||
|
||||
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
|
||||
|
||||
**Goal:** Harden askari's inbound surface with the proven ubongo INPUT-only default-deny pattern (SSH scoped by `iifname "wt0"` + a permanent WAN break-glass), and make the NetBird coordinator survive a no-egress startup — reboot-safe, no boot-race, no lockout.
|
||||
|
||||
**Architecture:** Mirror mesh-hardening 2/3 (ubongo): `base` firewall INPUT-only (`base__firewall_input_only: true`, forward stays `policy accept` so Docker forwarding/NAT survive), **no** sshd `ListenAddress` change (the firewall, not sshd, scopes `:22`). The coordinator-host exception: WAN `:22` stays open from ubongo's static WAN IP as the always-available non-mesh break-glass (the Hetzner console is the ultimate fallback). A `netbird_coordinator` change disables geolocation so a transient egress loss can't FATAL the control plane. Validate firewall reboot-safety on a throwaway VM (ADR-025 harness) GREEN before a supervised live cutover.
|
||||
|
||||
**Tech Stack:** Ansible (`base`, `netbird_coordinator` roles), nftables, Docker Compose, Molecule (Debian 13), the `scripts/integration-vm.py` ADR-025 harness, NetBird self-hosted `netbird-server:0.72.4`.
|
||||
|
||||
**Spec:** `docs/superpowers/specs/2026-06-19-mesh-hardening-askari-redesign-design.md`
|
||||
|
||||
## Global Constraints
|
||||
|
||||
- **FQCN always** (`ansible.builtin.*`); role defaults use the `rolename__var` namespace.
|
||||
- **No sshd `ListenAddress` change** — `base__ssh_listen_mesh_only` stays `false` everywhere here (this is what sidesteps the 2026-06-17 boot-race).
|
||||
- **WAN `:22` is never closed** — no Terraform / Hetzner-Cloud-Firewall change in this plan.
|
||||
- **`base__firewall_input_only: true` on askari** — the forward chain must stay `policy accept` (Docker host). Never apply a forward-`drop` firewall to askari.
|
||||
- **ubongo's WAN IP is `91.226.145.80`** (operator-confirmed static 2026-06-19) — the break-glass anchor.
|
||||
- **askari `wt0` IP is `100.99.226.39`**; askari domain `netbird.askari.wingu.me`.
|
||||
- **Before any commit:** `rbw unlocked` must succeed (the pre-commit hook decrypts `vault.yml`); run `make lint` and it must be clean.
|
||||
- **Tags:** import each role at play level with its role-name tag; only use concern tags from `tests/tags.yml`.
|
||||
- **Harness GREEN before live** (Task 3 before Task 4). The live cutover (Task 4) is **operator-gated** — never run autonomously.
|
||||
|
||||
---
|
||||
|
||||
### Task 1: Disable geolocation in `netbird_coordinator` (FRICTION 2026-06-17 #4)
|
||||
|
||||
Make the control plane survive a startup with no container egress: NetBird's combined server downloads the GeoLite2 DB at boot and treats failure as FATAL. boma uses no geo posture (ACL is Allow-All), so disable geolocation entirely via the documented env var. TDD'd through the role's render-only Molecule scenario.
|
||||
|
||||
> verified: NetBird self-hosted geolocation knobs (`NB_DISABLE_GEOLOCATION`, `disableGeoliteUpdate`, GeoLite2 pre-seed) · WebFetch · docs.netbird.io/selfhosted/geo-support · 2026-06-19 — *from a docs summary; the live "healthy with egress blocked" check in Task 4 is the real gate, with a concrete pre-seed fallback there.*
|
||||
|
||||
**Files:**
|
||||
- Modify: `roles/netbird_coordinator/defaults/main.yml` (add the knob)
|
||||
- Modify: `roles/netbird_coordinator/templates/docker-compose.yml.j2:14-27` (add `environment:` to `netbird-server`)
|
||||
- Test: `roles/netbird_coordinator/molecule/default/verify.yml:21-32` (assert the rendered compose)
|
||||
- Modify: `roles/netbird_coordinator/README.md` (one line documenting the knob)
|
||||
|
||||
**Interfaces:**
|
||||
- Produces: role default `netbird_coordinator__disable_geolocation` (bool, default `true`); rendered compose env `NB_DISABLE_GEOLOCATION: "true"` on the `netbird-server` service.
|
||||
|
||||
- [ ] **Step 1: Write the failing Molecule assertion**
|
||||
|
||||
Append to `roles/netbird_coordinator/molecule/default/verify.yml` (after the existing compose-tags assert, inside the same `tasks:` list):
|
||||
|
||||
```yaml
|
||||
- name: Assert geolocation is disabled (FRICTION 2026-06-17 #4 — no geo-DB download FATAL)
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- "'NB_DISABLE_GEOLOCATION: \"true\"' in (_compose.content | b64decode)"
|
||||
fail_msg: >-
|
||||
compose must set NB_DISABLE_GEOLOCATION=true so a no-egress startup can't FATAL
|
||||
the coordinator on the GeoLite2 download
|
||||
success_msg: "geolocation disabled in compose"
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Run Molecule to verify it fails**
|
||||
|
||||
Run: `make test ROLE=netbird_coordinator`
|
||||
Expected: FAIL at "Assert geolocation is disabled" — the rendered compose has no `NB_DISABLE_GEOLOCATION`.
|
||||
|
||||
- [ ] **Step 3: Add the default knob**
|
||||
|
||||
Add to `roles/netbird_coordinator/defaults/main.yml` (after line 7, the `__domain` line):
|
||||
|
||||
```yaml
|
||||
|
||||
# Disable NetBird's GeoLite2 geolocation (download + lookups). boma uses no geo posture
|
||||
# (ACL is Allow-All), and the combined server treats a failed GeoLite2 download as FATAL —
|
||||
# so a transient egress loss (NAT wiped on `nft flush`, or the boot window before Docker
|
||||
# re-adds NAT) would crash-loop the whole control plane (FRICTION 2026-06-17 #4). Disabling
|
||||
# removes that dependency. Revisit if a future ACL sub-project wants geo-based posture.
|
||||
netbird_coordinator__disable_geolocation: true
|
||||
```
|
||||
|
||||
- [ ] **Step 4: Render the env in the compose template**
|
||||
|
||||
In `roles/netbird_coordinator/templates/docker-compose.yml.j2`, add an `environment:` block to the `netbird-server` service, immediately after its `command:` line (line 18):
|
||||
|
||||
```yaml
|
||||
environment:
|
||||
# Disable geolocation so a no-egress startup can't FATAL the control plane
|
||||
# (FRICTION 2026-06-17 #4). boma uses no geo posture (ACL Allow-All).
|
||||
NB_DISABLE_GEOLOCATION: "{{ netbird_coordinator__disable_geolocation | string | lower }}"
|
||||
```
|
||||
|
||||
- [ ] **Step 5: Run Molecule to verify it passes**
|
||||
|
||||
Run: `make test ROLE=netbird_coordinator`
|
||||
Expected: PASS — all asserts green, including "geolocation disabled in compose"; Molecule idempotence clean.
|
||||
|
||||
- [ ] **Step 6: Document the knob**
|
||||
|
||||
Add one line to `roles/netbird_coordinator/README.md` under its variables/defaults section:
|
||||
|
||||
```markdown
|
||||
- `netbird_coordinator__disable_geolocation` (default `true`) — sets `NB_DISABLE_GEOLOCATION` so a no-egress startup can't FATAL the server on the GeoLite2 download (FRICTION 2026-06-17 #4).
|
||||
```
|
||||
|
||||
- [ ] **Step 7: Lint and commit**
|
||||
|
||||
```bash
|
||||
rbw unlocked && make lint
|
||||
git add roles/netbird_coordinator/defaults/main.yml \
|
||||
roles/netbird_coordinator/templates/docker-compose.yml.j2 \
|
||||
roles/netbird_coordinator/molecule/default/verify.yml \
|
||||
roles/netbird_coordinator/README.md
|
||||
git commit -m "feat(netbird_coordinator): disable geolocation so no-egress startup can't FATAL the control plane" \
|
||||
-m "Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 2: Enable askari's host firewall (INPUT-only) + WAN break-glass + manage over `wt0`
|
||||
|
||||
Flip askari from "firewall not applied" to the redesigned INPUT-only default-deny, add the permanent WAN break-glass source, and point Ansible at the mesh. Pure inventory change — validated by lint + inventory resolution (the firewall *behavior* is proven in Task 3).
|
||||
|
||||
**Files:**
|
||||
- Modify: `inventories/production/group_vars/offsite_hosts/vars.yml` (replace the whole file body)
|
||||
- Create: `inventories/production/host_vars/askari.yml`
|
||||
|
||||
**Interfaces:**
|
||||
- Consumes: `base` knobs `base__firewall_apply`, `base__firewall_input_only`, `base__firewall_admin_addrs`, `base__ssh_listen_mesh_only`, `base__mesh_enabled` (all defined in `roles/base/defaults/main.yml`).
|
||||
- Produces: askari resolves `ansible_host: 100.99.226.39`, `base__firewall_apply: true`, `base__firewall_input_only: true`, `base__firewall_admin_addrs: ["91.226.145.80"]`.
|
||||
|
||||
- [ ] **Step 1: Rewrite the offsite group_vars**
|
||||
|
||||
Replace the body of `inventories/production/group_vars/offsite_hosts/vars.yml` with:
|
||||
|
||||
```yaml
|
||||
---
|
||||
# Off-site hosts (askari). askari runs the NetBird coordinator AND is a mesh peer
|
||||
# (ADR-016, M5).
|
||||
#
|
||||
# Mesh-hardening REDESIGN (2026-06-19): the 2026-06-17 attempt was backed out (forward
|
||||
# `policy drop` broke Docker on reboot; wt0-only sshd left no break-glass; ip_nonlocal_bind
|
||||
# did not beat the boot-race). The redesign mirrors the proven ubongo 2/3 pattern:
|
||||
# - INPUT-only default-deny (base__firewall_input_only) — forward stays `policy accept`
|
||||
# so Docker container forwarding/NAT survive a reboot;
|
||||
# - SSH scoped by the host firewall (iifname wt0 + admin-addr), NOT a sshd ListenAddress
|
||||
# change — base__ssh_listen_mesh_only stays false, so there is no boot-race;
|
||||
# - WAN :22 is DELIBERATELY left open from ubongo's WAN IP (base__firewall_admin_addrs)
|
||||
# as the permanent non-mesh break-glass — the coordinator-host exception (a host's only
|
||||
# management path must never depend on a service that host itself hosts).
|
||||
# Spec: docs/superpowers/specs/2026-06-19-mesh-hardening-askari-redesign-design.md
|
||||
base__mesh_enabled: true
|
||||
base__firewall_apply: true
|
||||
base__firewall_input_only: true # forward stays `policy accept` → Docker-safe
|
||||
base__ssh_listen_mesh_only: false # no sshd ListenAddress change → no boot-race
|
||||
base__firewall_admin_addrs:
|
||||
- 91.226.145.80 # ubongo's (static) WAN IP — the permanent non-mesh SSH break-glass
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Create the askari host_vars to manage over the mesh**
|
||||
|
||||
Create `inventories/production/host_vars/askari.yml`:
|
||||
|
||||
```yaml
|
||||
---
|
||||
# Manage askari over the NetBird mesh (wt0). Overrides the TF-generated WAN `ansible_host`
|
||||
# in offsite.yml (host_vars are NOT regenerated by tf_to_inventory.py). The WAN :22 path
|
||||
# (Hetzner Cloud Firewall + base__firewall_admin_addrs = ubongo's WAN) stays as the
|
||||
# break-glass; the Hetzner web console is the IP-independent ultimate fallback.
|
||||
# Spec: docs/superpowers/specs/2026-06-19-mesh-hardening-askari-redesign-design.md
|
||||
ansible_host: 100.99.226.39
|
||||
```
|
||||
|
||||
- [ ] **Step 3: Verify the inventory resolves**
|
||||
|
||||
Run: `ansible-inventory -i inventories/production --host askari`
|
||||
Expected: JSON shows `"ansible_host": "100.99.226.39"`, `"base__firewall_apply": true`, `"base__firewall_input_only": true`, and `"base__firewall_admin_addrs": ["91.226.145.80"]`.
|
||||
|
||||
- [ ] **Step 4: Lint**
|
||||
|
||||
Run: `rbw unlocked && make lint`
|
||||
Expected: clean (no yamllint/ansible-lint errors).
|
||||
|
||||
- [ ] **Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add inventories/production/group_vars/offsite_hosts/vars.yml \
|
||||
inventories/production/host_vars/askari.yml
|
||||
git commit -m "feat(inventory): askari INPUT-only firewall + WAN break-glass + manage over wt0" \
|
||||
-m "Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 3: Integration harness "askari_inputonly" profile — the reboot-safety GREEN gate
|
||||
|
||||
Prove on a throwaway VM (ADR-025) that the redesigned firewall is reboot-safe BEFORE touching the real host: INPUT default-deny + forward accept + the admin-addr break-glass + published-port DNAT all survive a reboot. New profile (keeps the existing `askari` profile, which validates the `docker_host` container-forward drop-in path, intact).
|
||||
|
||||
**Files:**
|
||||
- Create: `tests/integration/profiles/askari_inputonly.json`
|
||||
- Create: `tests/integration/overrides/askari_inputonly.yml`
|
||||
- Modify: `tests/integration/verify.yml` (allow-list + a new profile branch)
|
||||
|
||||
**Interfaces:**
|
||||
- Consumes: the `scripts/integration-vm.py` harness; `make test-integration HOST=<profile>` maps `HOST` to `profiles/<HOST>.json` (a profile name, not a production inventory host).
|
||||
- Produces: profile `askari_inputonly` with `integration_profile: askari_inputonly`.
|
||||
|
||||
- [ ] **Step 1: Add the new profile to the verify allow-list and a failing branch**
|
||||
|
||||
In `tests/integration/verify.yml`, change the allow-list assert (line 14) from:
|
||||
|
||||
```yaml
|
||||
- integration_profile in ['askari', 'ubongo']
|
||||
```
|
||||
|
||||
to:
|
||||
|
||||
```yaml
|
||||
- integration_profile in ['askari', 'askari_inputonly', 'ubongo']
|
||||
```
|
||||
|
||||
and update its `fail_msg` (line 15) to `"integration_profile must be set in the profile overlay (askari|askari_inputonly|ubongo)"`. Then append this block to the `tasks:` list (after the ubongo block):
|
||||
|
||||
```yaml
|
||||
# ── askari_inputonly profile — the mesh-hardening REDESIGN (2026-06-19) ──
|
||||
# INPUT-only default-deny on a Docker host: input policy drop, forward policy ACCEPT
|
||||
# (Docker-safe), SSH via the admin-addr break-glass, published-port DNAT survives reboot.
|
||||
- name: (askari_inputonly) Read the live nftables ruleset
|
||||
when: integration_profile == 'askari_inputonly'
|
||||
ansible.builtin.command: nft list ruleset
|
||||
register: _nft_io
|
||||
changed_when: false
|
||||
|
||||
- name: (askari_inputonly) INPUT default-deny, forward permissive, admin-addr break-glass
|
||||
when: integration_profile == 'askari_inputonly'
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- "'hook input priority filter; policy drop;' in _nft_io.stdout"
|
||||
- "'hook forward priority filter; policy accept;' in _nft_io.stdout"
|
||||
- "'ip saddr 192.168.150.1 tcp dport 22 accept' in _nft_io.stdout"
|
||||
fail_msg: >-
|
||||
askari_inputonly: expected input policy drop, forward policy accept (input-only),
|
||||
and the admin-addr break-glass (192.168.150.1) SSH allow in the live ruleset.
|
||||
|
||||
- name: (askari_inputonly) Gather service facts
|
||||
when: integration_profile == 'askari_inputonly'
|
||||
ansible.builtin.service_facts:
|
||||
|
||||
- name: (askari_inputonly) Docker daemon is active
|
||||
when: integration_profile == 'askari_inputonly'
|
||||
ansible.builtin.assert:
|
||||
that: "ansible_facts.services['docker.service'].state == 'running'"
|
||||
fail_msg: "docker.service is not running"
|
||||
|
||||
- name: (askari_inputonly) Published port answers from the controller (DNAT + forward alive)
|
||||
when: integration_profile == 'askari_inputonly'
|
||||
delegate_to: localhost
|
||||
become: false
|
||||
ansible.builtin.uri:
|
||||
url: "http://{{ ansible_host }}/"
|
||||
follow_redirects: none
|
||||
status_code: [200, 301, 308, 404, 502, 503]
|
||||
timeout: 10
|
||||
register: _probe_io
|
||||
retries: 5
|
||||
delay: 6
|
||||
until: _probe_io is succeeded
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Create the profile descriptor**
|
||||
|
||||
Create `tests/integration/profiles/askari_inputonly.json`:
|
||||
|
||||
```json
|
||||
{
|
||||
"groups": ["offsite_hosts"],
|
||||
"applies": [
|
||||
{"playbook": "site.yml", "tags": ["base"]},
|
||||
{"playbook": "offsite.yml", "tags": ["docker_host", "reverse_proxy"]}
|
||||
],
|
||||
"extra_vars_files": ["overrides/askari_inputonly.yml"],
|
||||
"mem_mib": 3072,
|
||||
"vcpus": 2
|
||||
}
|
||||
```
|
||||
|
||||
- [ ] **Step 3: Create the overlay**
|
||||
|
||||
Create `tests/integration/overrides/askari_inputonly.yml`:
|
||||
|
||||
```yaml
|
||||
---
|
||||
# Integration overlay (ADR-025) — the askari mesh-hardening REDESIGN (2026-06-19).
|
||||
# Validates INPUT-only default-deny on a Docker host: input policy drop, forward policy
|
||||
# accept (Docker-safe), SSH via the admin-addr break-glass, reboot-survivable.
|
||||
integration_profile: askari_inputonly
|
||||
base__firewall_apply: true
|
||||
base__firewall_input_only: true
|
||||
# No sshd ListenAddress change — never wt0-only in a throwaway VM.
|
||||
base__ssh_listen_mesh_only: false
|
||||
# Isolated VM: never touch the real mesh.
|
||||
base__mesh_enabled: false
|
||||
# The non-mesh SSH break-glass = the admin-addr path the real design uses. Point it at the
|
||||
# VM's libvirt-NAT gateway (where the harness connects from), by source IP so it is
|
||||
# interface-independent and the default-deny + reboot don't lock out the driver. This
|
||||
# mirrors askari's real base__firewall_admin_addrs (ubongo's WAN) in the test topology.
|
||||
base__firewall_admin_addrs:
|
||||
- 192.168.150.1
|
||||
```
|
||||
|
||||
- [ ] **Step 4: Run the harness — the GREEN gate**
|
||||
|
||||
Run: `make test-integration HOST=askari_inputonly`
|
||||
Expected: GREEN. The harness boots a VM, applies `base` (INPUT-only) + `docker_host` + `reverse_proxy`, **reboots**, re-SSHes (proving the admin-addr break-glass survives), then `verify.yml` asserts input `policy drop`, forward `policy accept`, the `192.168.150.1` SSH allow, Docker active, and the published `:80` answering. Clean up: `make test-integration-clean`.
|
||||
|
||||
- [ ] **Step 5: Commit**
|
||||
|
||||
```bash
|
||||
rbw unlocked && make lint
|
||||
git add tests/integration/profiles/askari_inputonly.json \
|
||||
tests/integration/overrides/askari_inputonly.yml \
|
||||
tests/integration/verify.yml
|
||||
git commit -m "test(integration): askari_inputonly profile — INPUT-only default-deny reboot gate" \
|
||||
-m "Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 4: Supervised live cutover + STATUS/ROADMAP update — ⚠️ OPERATOR-GATED
|
||||
|
||||
> **⚠️ DO NOT run this task autonomously.** It changes the live off-site host (lockout risk) and runs `make deploy`. An automated executor must STOP here and hand back to the operator. Preconditions: Tasks 1–3 committed and GREEN; `rbw unlocked`; the **Hetzner web console** open in a browser (the out-of-band ultimate break-glass); the operator present. The WAN `:22` break-glass is never removed, so a fallback path is open throughout (FRICTION 2026-06-17 #6).
|
||||
|
||||
**Files (Step 7 only):**
|
||||
- Modify: `STATUS.md` (askari row), `docs/ROADMAP.md` (Next step)
|
||||
|
||||
- [ ] **Step 1: Pre-check both paths are healthy**
|
||||
|
||||
```bash
|
||||
ssh sjat@100.99.226.39 true && echo "wt0 SSH OK"
|
||||
ansible askari -i inventories/production -m ping
|
||||
curl -sI https://test.askari.wingu.me | head -1
|
||||
curl -sI https://netbird.askari.wingu.me | head -1
|
||||
```
|
||||
Expected: wt0 SSH OK; ping `pong`; both curls `HTTP/2 200`.
|
||||
|
||||
- [ ] **Step 2: Dry-run the converge (mandatory `check` before `deploy`)**
|
||||
|
||||
```bash
|
||||
make check PLAYBOOK=site LIMIT=askari
|
||||
```
|
||||
Expected: changes limited to the `base` firewall (input-only ruleset, admin-addr) + the `netbird_coordinator` compose env (`NB_DISABLE_GEOLOCATION`). Review and show the output before proceeding.
|
||||
|
||||
- [ ] **Step 3: Apply (operator present, console open, auto-rollback armed)**
|
||||
|
||||
```bash
|
||||
make deploy PLAYBOOK=site LIMIT=askari
|
||||
```
|
||||
The `base` firewall concern arms the auto-rollback timer (`base__firewall_rollback_timeout: 45`) and reconnects over `wt0` — a bad ruleset reverts itself. Expected: converge OK; SSH-over-`wt0` stays up.
|
||||
|
||||
- [ ] **Step 4: Rebuild NAT and confirm the coordinator is healthy with geo disabled**
|
||||
|
||||
`base`'s `flush ruleset` wipes Docker's nat (FRICTION) — rebuild it, then confirm the control plane:
|
||||
|
||||
```bash
|
||||
ssh sjat@100.99.226.39 'sudo systemctl restart docker'
|
||||
ssh sjat@100.99.226.39 'docker ps --format "{{.Names}} {{.Status}}"'
|
||||
ssh sjat@100.99.226.39 'docker logs --since 2m netbird-server 2>&1 | grep -iE "geo|fatal" || echo "no geo/fatal log lines"'
|
||||
```
|
||||
Expected: `netbird-server` + `netbird-dashboard` Up; no geo-DB FATAL.
|
||||
|
||||
> **Contingency (only if `netbird-server` still FATALs on geolocation):** `NB_DISABLE_GEOLOCATION` was not honored by the pinned image. Pre-seed the DB into the volume instead — `ssh sjat@100.99.226.39 'sudo curl -fSL -o /var/lib/docker/volumes/netbird_data/_data/GeoLite2-City_20260101.mmdb https://pkgs.netbird.io/geolite2/GeoLite2-City.mmdb && sudo docker restart netbird-server'` — and add `disableGeoliteUpdate: true` under `server:` in `config.yaml.j2` so it never re-downloads. Re-verify, then fold the working fix back into the role (amend Task 1).
|
||||
|
||||
- [ ] **Step 5: Verify the new steady state (both SSH paths + services)**
|
||||
|
||||
```bash
|
||||
ssh sjat@100.99.226.39 true && echo "wt0 SSH OK"
|
||||
# From ubongo: SSH to askari's WAN IP. ubongo's packets egress via OPNsense, SNAT'd to the
|
||||
# WAN IP 91.226.145.80 — matching askari's admin-addr break-glass rule. (No BindAddress:
|
||||
# ubongo does not hold 91.226.145.80; OPNsense does.)
|
||||
ssh sjat@77.42.120.136 true && echo "WAN break-glass OK"
|
||||
curl -sI https://test.askari.wingu.me | head -1
|
||||
nc -vz -u 77.42.120.136 3478   # STUN port probe — for UDP, "succeeded" only means no ICMP reject came back, not proof that STUN answered
|
||||
```
|
||||
Expected: both SSH paths succeed; cert valid; STUN reachable.
|
||||
|
||||
- [ ] **Step 6: Reboot-resilience — the real test (console available)**
|
||||
|
||||
```bash
|
||||
ssh sjat@100.99.226.39 'sudo systemctl reboot'
|
||||
# wait ~60s, then from ubongo — no manual intervention:
|
||||
sleep 60; ssh sjat@100.99.226.39 'sudo nft list chain inet filter input | grep -E "policy drop|wt0|91.226.145.80"'
|
||||
curl -sI https://netbird.askari.wingu.me | head -1
|
||||
ssh sjat@100.99.226.39 'docker ps --format "{{.Names}} {{.Status}}"'
|
||||
```
|
||||
Expected, unattended: input `policy drop` with the `wt0` + `91.226.145.80` allows; public cert valid; both containers Up; `wt0` SSH back. (If lost: recover via the Hetzner console — the firewall auto-rollback and the WAN break-glass should make that unnecessary.)
|
||||
|
||||
- [ ] **Step 7: Record reality in the ground-truth docs and commit**
|
||||
|
||||
Update `STATUS.md` (the askari row): firewall now **applied** — INPUT-only default-deny, SSH `wt0`-primary + permanent WAN break-glass (ubongo's WAN), managed over `wt0`, geolocation disabled, **reboot-validated**. Update `docs/ROADMAP.md` "Next step": mark the askari SSH→`wt0` redesign **DONE**; the next mesh-hardening sub-project is the **SPOF reduction** (askari relay single-point-of-failure) — confirmed by the `ubongo → askari` `Relayed` finding (2026-06-19).
|
||||
|
||||
```bash
|
||||
rbw unlocked && make lint
|
||||
git add STATUS.md docs/ROADMAP.md
|
||||
git commit -m "docs(status): mesh-hardening redesign — askari INPUT-only + WAN break-glass applied + reboot-validated" \
|
||||
-m "Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Notes / out of scope (carry to the SPOF sub-project)
|
||||
|
||||
- **SPOF reduction is the next sub-project** (operator decision 2026-06-19): `ubongo → askari` is currently `Relayed` through askari's own relay; if askari is down, relayed peers lose the mesh data plane. It will be covered by its own spec.
|
||||
- **NetBird ACL stays Allow-All** — any enrolled peer can reach askari `wt0:22` until a later sub-project.
|
||||
- **Full forward-chain hardening** (`docker_host` container-forward drop-in over the `input_only` baseline) — a later tightening; the existing `askari` integration profile already covers that path.
|
||||
- **Coordinator off-site backup** (FRICTION 2026-06-17 #5, ADR-022) — still pending; not in scope.
|
||||
|
|
@ -1,470 +0,0 @@
|
|||
# Mesh-hardening 2/3 — ubongo INPUT-only default-deny — Implementation Plan
|
||||
|
||||
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
|
||||
|
||||
**Goal:** Apply base's nftables firewall to the control node (ubongo) as an INPUT-only default-deny — hardening its inbound surface — while leaving the forward chain permissive so Docker egress and the libvirt-NAT integration harness keep working, and without any sshd `ListenAddress` change.
|
||||
|
||||
**Architecture:** Two new `base` knobs make the existing firewall concern fit a control node: `base__firewall_input_only` flips the forward chain to `policy accept` (host-local input filtering only), and `base__firewall_admin_addrs` adds operator-workstation LAN sources to the SSH allow-list (alongside `wt0` and `ssh-from-control`). sshd is untouched (nftables does the scoping → no `ip_nonlocal_bind` boot-race). The change is validated on a throwaway VM via the ADR-025 integration harness (a new "be ubongo" profile) before an operator-supervised live cutover whose safety net is the firewall auto-rollback timer plus the permanent on-prem physical console.
|
||||
|
||||
**Tech Stack:** Ansible (role `base`, FQCN), nftables, Jinja2, Molecule on Debian 13, pytest (none new), the ADR-025 integration harness (`scripts/integration-vm.py`, JSON profiles, `-e @` overlays).
|
||||
|
||||
**Spec:** `docs/superpowers/specs/2026-06-19-mesh-hardening-ubongo-default-deny-design.md`
|
||||
|
||||
**Conventions:** `make lint` and `make test ROLE=base` before each commit; `make check` before `make deploy`; never hand-edit the generated `offsite.yml`; `rbw unlocked` for any commit touching Ansible content and for the integration/live applies (the production `group_vars/all/vault.yml` is in inventory scope and gets decrypted at playbook load). Tasks 1–3 are code (subagent-driven, each lint/Molecule-verified). Task 4 is a real-VM validation gate on ubongo. Task 5 is the live, operator-supervised cutover.
|
||||
|
||||
---
|
||||
|
||||
## File Structure
|
||||
|
||||
| File | Create/Modify | Responsibility |
|
||||
|---|---|---|
|
||||
| `roles/base/defaults/main.yml` | Modify | Declare `base__firewall_input_only` + `base__firewall_admin_addrs` (defaults: off / empty). |
|
||||
| `roles/base/templates/nftables.conf.j2` | Modify | Conditional forward policy; render an SSH-allow rule per admin address. |
|
||||
| `roles/base/molecule/default/converge.yml` | Modify | Fixture: an admin-addr source (input-only stays at its default → forward drop). |
|
||||
| `roles/base/molecule/default/verify.yml` | Modify | Assert forward-drop default + the admin-addr rule render. |
|
||||
| `inventories/production/group_vars/control/vars.yml` | Modify | Turn the knobs on for ubongo (input-only; mamba's LAN IP). |
|
||||
| `tests/integration/overrides/ubongo.yml` | Create | The "be ubongo" overlay (input-only firewall; harness SSH lifeline). |
|
||||
| `tests/integration/profiles/ubongo.json` | Create | The "be ubongo" VM profile (group `control`, applies `site.yml:base`). |
|
||||
| `tests/integration/overrides/askari.yml` | Modify | Add the `integration_profile` marker (verify is now profile-aware). |
|
||||
| `tests/integration/verify.yml` | Modify | Gate the askari (Docker/DNAT) block; add the ubongo (input-only) block + a guard. |
|
||||
| `STATUS.md`, `docs/ROADMAP.md` | Modify (Task 5) | Record mesh-hardening 2/3 done. |
|
||||
|
||||
---
|
||||
|
||||
### Task 1: base role — `base__firewall_input_only` (forward policy) + `base__firewall_admin_addrs` (LAN SSH allow)
|
||||
|
||||
**Files:**
|
||||
- Modify: `roles/base/defaults/main.yml`
|
||||
- Modify: `roles/base/templates/nftables.conf.j2`
|
||||
- Modify: `roles/base/molecule/default/converge.yml`
|
||||
- Modify: `roles/base/molecule/default/verify.yml`
|
||||
|
||||
> **Test strategy (note):** Molecule renders one fixture, so it locks the *secure default* —
|
||||
> `input_only` **off** → forward `policy drop` — plus the new admin-addr rule (red→green). The
|
||||
> `input_only` **on** → forward `policy accept` path is exercised on a real VM by the
|
||||
> integration "be ubongo" profile (Tasks 3–4), whose verify fails red until this template
|
||||
> conditional exists. Both branches are covered, across the two test layers.
|
||||
|
||||
- [ ] **Step 1: Write the failing test (extend Molecule verify)**
|
||||
|
||||
In `roles/base/molecule/default/verify.yml`, after the `Assert the docker_host extension hook is present` block, add:
|
||||
|
||||
```yaml
|
||||
- name: Assert the forward chain defaults to policy drop (input_only off)
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- "'hook forward priority 0; policy drop;' in nft"
|
||||
fail_msg: >-
|
||||
forward chain must default to policy drop when base__firewall_input_only is
|
||||
false (container isolation stays the norm on real service hosts)
|
||||
|
||||
- name: Assert the admin-addr SSH allow rule (operator workstation on the LAN)
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- "'ip saddr 10.30.0.77 tcp dport 22 accept' in nft"
|
||||
fail_msg: "missing admin-addr SSH allow rule from base__firewall_admin_addrs"
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Add the fixture that drives it (Molecule converge)**
|
||||
|
||||
In `roles/base/molecule/default/converge.yml`, add to the `vars:` block (after the `base__firewall_control_addr` line):
|
||||
|
||||
```yaml
|
||||
base__firewall_admin_addrs:
|
||||
- "10.30.0.77" # fixture: an operator-workstation LAN source (admin-addr SSH allow)
|
||||
```
|
||||
|
||||
- [ ] **Step 3: Run the test to verify it fails**
|
||||
|
||||
Run: `make test ROLE=base`
|
||||
Expected: FAIL on `Assert the admin-addr SSH allow rule` (the template does not consume `base__firewall_admin_addrs` yet, so the `ip saddr 10.30.0.77 …` rule is absent). The forward-drop assertion passes already (the template currently hardcodes `policy drop`).
|
||||
|
||||
- [ ] **Step 4: Add the defaults**
|
||||
|
||||
In `roles/base/defaults/main.yml`, after the `base__firewall_apply: true` line (end of the firewall behaviour block, currently line 13), add:
|
||||
|
||||
```yaml
|
||||
base__firewall_input_only: false # true → the forward chain is `policy accept` (host-local
|
||||
# INPUT filtering only). For hosts that forward/route
|
||||
# container or NAT traffic (the control node's Docker +
|
||||
# libvirt-NAT) where a forward default-deny would break
|
||||
# them. Real service hosts keep this false (forward drop).
|
||||
base__firewall_admin_addrs: [] # extra LAN source IPs allowed to SSH, besides wt0 +
|
||||
# ssh-from-control. For an operator workstation reaching
|
||||
# the host over the LAN (no mesh). Key-gated. (ADR-021)
|
||||
```
|
||||
|
||||
- [ ] **Step 5: Make the forward policy conditional + render the admin-addr rules**
|
||||
|
||||
In `roles/base/templates/nftables.conf.j2`:
|
||||
|
||||
(a) Replace the forward-chain line (currently line 21):
|
||||
|
||||
```jinja
|
||||
chain forward { type filter hook forward priority 0; policy {{ 'accept' if base__firewall_input_only | bool else 'drop' }}; }
|
||||
```
|
||||
|
||||
(b) After the `ssh-from-control` `{% endif %}` (currently line 14) and before the `ip protocol icmp accept` line, add the admin-addr loop:
|
||||
|
||||
```jinja
|
||||
{% for addr in base__firewall_admin_addrs %}
|
||||
ip saddr {{ addr }} tcp dport {{ base__firewall_ssh_port }} accept
|
||||
{% endfor %}
|
||||
```
|
||||
|
||||
- [ ] **Step 6: Run the test to verify it passes**
|
||||
|
||||
Run: `make test ROLE=base`
|
||||
Expected: PASS — converge renders the ruleset; verify confirms the forward chain is `policy drop` (input_only defaults false) and the `ip saddr 10.30.0.77 tcp dport 22 accept` rule is present; all pre-existing assertions stay green.
|
||||
|
||||
- [ ] **Step 7: Lint**
|
||||
|
||||
Run: `make lint`
|
||||
Expected: `Passed: 0 failure(s)` and `check-tags: OK`.
|
||||
|
||||
- [ ] **Step 8: Commit**
|
||||
|
||||
```bash
|
||||
git add roles/base/defaults/main.yml roles/base/templates/nftables.conf.j2 \
|
||||
roles/base/molecule/default/converge.yml roles/base/molecule/default/verify.yml
|
||||
git commit -m "feat(base): input-only forward policy + admin-addr SSH allow
|
||||
|
||||
base__firewall_input_only renders the forward chain policy accept (host-local
|
||||
INPUT filtering only) for hosts that forward container/NAT traffic; defaults
|
||||
false so real service hosts keep the forward default-deny. base__firewall_admin_addrs
|
||||
adds operator-workstation LAN sources to the SSH allow-list alongside wt0 +
|
||||
ssh-from-control. Molecule locks the secure default + the admin rule.
|
||||
Mesh-hardening 2/3 (ADR-020/021).
|
||||
|
||||
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 2: inventory — enable input-only default-deny + mamba on ubongo (control group)
|
||||
|
||||
**Files:**
|
||||
- Modify: `inventories/production/group_vars/control/vars.yml`
|
||||
|
||||
- [ ] **Step 1: Turn the knobs on for the control group**
|
||||
|
||||
Append to `inventories/production/group_vars/control/vars.yml`:
|
||||
|
||||
```yaml
|
||||
|
||||
# Mesh-hardening 2/3 (2026-06-19, ADR-020/021): apply base's host firewall to ubongo as
|
||||
# INPUT-only default-deny — harden the inbound surface, leave the forward chain permissive so
|
||||
# Docker egress + the libvirt-NAT integration harness keep working. sshd is unchanged
|
||||
# (nftables scopes inbound), so there is no boot-race. Reach ubongo over wt0 (mesh), the
|
||||
# ssh-from-control self-path (base__firewall_control_addr, group_vars/all = 10.20.10.151), or
|
||||
# mamba on the LAN. Break-glass: the physical console. (base__firewall_apply defaults true.)
|
||||
base__firewall_input_only: true
|
||||
base__firewall_admin_addrs:
|
||||
- "10.20.10.50" # mamba over the LAN (NetBird off). Raw DHCP lease — revisit with an
|
||||
# OPNsense reservation when OPNsense-as-code lands; backstopped by wt0.
|
||||
- "10.20.10.17" # 2nd operator workstation (MAC bc:0f:f3:c8:4a:8a). Raw lease — ditto.
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Verify the vars resolve for ubongo**
|
||||
|
||||
Run: `.venv/bin/ansible-inventory -i inventories/production/ --host ubongo 2>/dev/null | grep -E 'firewall_input_only|firewall_admin_addrs|10.20.10.(50|17)'`
|
||||
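Expected: shows `"base__firewall_input_only": true` and the `"base__firewall_admin_addrs"` list containing `"10.20.10.50"` and `"10.20.10.17"` (`ansible-inventory` pretty-prints the JSON list across several lines, so each IP matches on its own line).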
|
||||
|
||||
- [ ] **Step 3: Lint**
|
||||
|
||||
Run: `make lint`
|
||||
Expected: clean pass (`check-tags: OK`).
|
||||
|
||||
- [ ] **Step 4: Commit**
|
||||
|
||||
```bash
|
||||
git add inventories/production/group_vars/control/vars.yml
|
||||
git commit -m "feat(inventory): ubongo gets INPUT-only host firewall + mamba LAN SSH
|
||||
|
||||
Enables base__firewall_input_only on the control group (forward chain stays
|
||||
permissive so Docker egress + the integration-test libvirt NAT survive) and
|
||||
allows the operator workstations' LAN IPs (mamba 10.20.10.50 + 10.20.10.17;
|
||||
raw leases, backstopped by wt0). Mesh-hardening 2/3.
|
||||
|
||||
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 3: integration harness — "be ubongo" profile (overlay + profile + profile-aware verify)
|
||||
|
||||
**Files:**
|
||||
- Create: `tests/integration/overrides/ubongo.yml`
|
||||
- Create: `tests/integration/profiles/ubongo.json`
|
||||
- Modify: `tests/integration/overrides/askari.yml`
|
||||
- Modify: `tests/integration/verify.yml`
|
||||
|
||||
- [ ] **Step 1: Create the "be ubongo" overlay**
|
||||
|
||||
Create `tests/integration/overrides/ubongo.yml`:
|
||||
|
||||
```yaml
|
||||
---
|
||||
# Integration-test overlay for the "ubongo" profile (ADR-025). Passed via `-e @`.
|
||||
# Exercises mesh-hardening 2/3: base's INPUT-only default-deny on the control node — input
|
||||
# chain default-deny, forward chain left permissive (Docker/libvirt-NAT safe), no sshd
|
||||
# ListenAddress change (so no boot-race).
|
||||
integration_profile: ubongo
|
||||
base__firewall_apply: true
|
||||
base__firewall_input_only: true # forward chain renders `policy accept`
|
||||
base__firewall_admin_addrs:
|
||||
- "192.168.150.98" # two representative LAN sources — exercises the
|
||||
- "192.168.150.99" # admin-addr loop with a multi-entry list (like ubongo)
|
||||
# Never wt0-only; never touch the real mesh from a throwaway VM.
|
||||
base__ssh_listen_mesh_only: false
|
||||
base__mesh_enabled: false
|
||||
# Allow SSH from the libvirt-NAT gateway (where the driver/ansible connect from) so the
|
||||
# default-deny apply + the reboot don't lock out the harness. By source IP (interface-
|
||||
# independent). This is the harness's lifeline; the admin-addr above is only exercised.
|
||||
base__firewall_control_addr: "192.168.150.1"
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Create the "be ubongo" VM profile**
|
||||
|
||||
Create `tests/integration/profiles/ubongo.json`:
|
||||
|
||||
```json
|
||||
{
|
||||
"groups": ["control"],
|
||||
"applies": [
|
||||
{"playbook": "site.yml", "tags": ["base"]}
|
||||
],
|
||||
"extra_vars_files": ["overrides/ubongo.yml"],
|
||||
"mem_mib": 2048,
|
||||
"vcpus": 2
|
||||
}
|
||||
```
|
||||
|
||||
- [ ] **Step 3: Mark the askari overlay with its profile name**
|
||||
|
||||
In `tests/integration/overrides/askari.yml`, after the two header comment lines (before `base__firewall_apply: true`), add:
|
||||
|
||||
```yaml
|
||||
integration_profile: askari
|
||||
```
|
||||
|
||||
- [ ] **Step 4: Make `verify.yml` profile-aware (the test)**
|
||||
|
||||
Replace the entire contents of `tests/integration/verify.yml` with:
|
||||
|
||||
```yaml
|
||||
---
|
||||
# Integration verify (ADR-025). Outcome-based, profile-aware: the active profile is named by
|
||||
# `integration_profile` (set in each profile's overlay). Each profile asserts its own success
|
||||
# criteria; an unknown/unset profile fails loudly (never a silent pass).
|
||||
- name: Verify the rebooted host
|
||||
hosts: all
|
||||
become: true
|
||||
gather_facts: false
|
||||
tasks:
|
||||
- name: A known integration_profile must be set (no silent pass)
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- integration_profile is defined
|
||||
- integration_profile in ['askari', 'ubongo']
|
||||
fail_msg: "integration_profile must be set in the profile overlay (askari|ubongo)"
|
||||
|
||||
# ── askari profile — Docker host: published-port forwarding survives the reboot ──
|
||||
# The load-bearing check probes the VM's published :80 FROM the controller (ubongo) — if
|
||||
# base's forward-drop killed DNAT, this times out (the FRICTION 2026-06-17 #1 bug).
|
||||
- name: (askari) Gather service facts
|
||||
when: integration_profile == 'askari'
|
||||
ansible.builtin.service_facts:
|
||||
|
||||
- name: (askari) Docker daemon is active
|
||||
when: integration_profile == 'askari'
|
||||
ansible.builtin.assert:
|
||||
that: "ansible_facts.services['docker.service'].state == 'running'"
|
||||
fail_msg: "docker.service is not running"
|
||||
|
||||
- name: (askari) Forward chain permits container traffic (drop-in loaded)
|
||||
when: integration_profile == 'askari'
|
||||
ansible.builtin.command: nft list chain inet filter forward
|
||||
register: _fwd
|
||||
changed_when: false
|
||||
|
||||
- name: (askari) Assert container forwarding is allowed (not pure drop)
|
||||
when: integration_profile == 'askari'
|
||||
ansible.builtin.assert:
|
||||
that: "'accept' in _fwd.stdout"
|
||||
fail_msg: >-
|
||||
forward chain is pure drop — container forwarding will die on reboot
|
||||
(FRICTION 2026-06-17 #1). docker_host container-forward drop-in missing.
|
||||
|
||||
- name: (askari) Published port answers from the controller (DNAT + forward alive)
|
||||
when: integration_profile == 'askari'
|
||||
delegate_to: localhost
|
||||
become: false
|
||||
ansible.builtin.uri:
|
||||
url: "http://{{ ansible_host }}/"
|
||||
follow_redirects: none
|
||||
status_code: [200, 301, 308, 404, 502, 503]
|
||||
timeout: 10
|
||||
register: _probe
|
||||
retries: 5
|
||||
delay: 6
|
||||
until: _probe is succeeded
|
||||
|
||||
# ── ubongo profile — control node: INPUT-only default-deny survives the reboot ──
|
||||
# SSH reachability across the reboot is proven by the harness itself (it re-SSHes and
|
||||
# checks boot_id changed before this verify runs). Here we assert the ruleset shape.
|
||||
- name: (ubongo) Read the live nftables ruleset
|
||||
when: integration_profile == 'ubongo'
|
||||
ansible.builtin.command: nft list ruleset
|
||||
register: _nft
|
||||
changed_when: false
|
||||
|
||||
- name: (ubongo) INPUT default-deny, forward permissive, admin-addr allow
|
||||
when: integration_profile == 'ubongo'
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- "'hook input priority 0; policy drop;' in _nft.stdout"
|
||||
- "'hook forward priority 0; policy accept;' in _nft.stdout"
|
||||
- "'ip saddr 192.168.150.98 tcp dport 22 accept' in _nft.stdout"
|
||||
- "'ip saddr 192.168.150.99 tcp dport 22 accept' in _nft.stdout"
|
||||
fail_msg: >-
|
||||
ubongo profile: expected input policy drop, forward policy accept (input-only),
|
||||
and both admin-addr (192.168.150.98/99) SSH allows in the live ruleset.
|
||||
```
|
||||
|
||||
- [ ] **Step 5: Validate the JSON + lint**
|
||||
|
||||
Run: `.venv/bin/python -m json.tool tests/integration/profiles/ubongo.json >/dev/null && echo OK` then `make lint`
|
||||
Expected: `OK`, then a clean lint pass (`check-tags: OK`).
|
||||
|
||||
- [ ] **Step 6: Commit**
|
||||
|
||||
```bash
|
||||
git add tests/integration/overrides/ubongo.yml tests/integration/profiles/ubongo.json \
|
||||
tests/integration/overrides/askari.yml tests/integration/verify.yml
|
||||
git commit -m "test(integration): add the 'be ubongo' profile (input-only default-deny)
|
||||
|
||||
A control-group VM that applies base with INPUT-only default-deny (forward
|
||||
policy accept; admin-addr SSH allow). verify.yml is now profile-aware via an
|
||||
integration_profile marker — the askari Docker/DNAT block is gated, and a ubongo
|
||||
block asserts input drop + forward accept + the admin-addr rule. Enables
|
||||
\`make test-integration HOST=ubongo\`. Mesh-hardening 2/3 (ADR-025).
|
||||
|
||||
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 4: Validate on the integration harness (`make test-integration HOST=ubongo`) — the GREEN gate
|
||||
|
||||
> Runs a throwaway UEFI VM on ubongo: boots it, applies the base role with the ubongo
|
||||
> overlay (INPUT-only default-deny), **reboots it**, and asserts the ruleset + SSH-returns.
|
||||
> This proves the change survives a reboot before the real control node is ever touched
|
||||
> (spec §cutover step 1; FRICTION signal-6). No code change / no commit — a validation gate.
|
||||
|
||||
- [ ] **Step 1: Ensure the vault is unlocked**
|
||||
|
||||
The run loads `inventories/production/group_vars/all/vault.yml` (symlinked into the run dir), which is decrypted at playbook load.
|
||||
|
||||
Run: `rbw unlocked || rbw unlock`
|
||||
Expected: exits 0 (unlocked). If it prompts, the operator unlocks.
|
||||
|
||||
- [ ] **Step 2: Run the integration cycle**
|
||||
|
||||
Run: `make test-integration HOST=ubongo`
|
||||
Expected (the `cycle`: up → apply → reboot → assert): the VM gets a `192.168.150.x` lease; `site.yml --tags base` applies cleanly; `… rebooted (boot_id changed), SSH back at 192.168.150.x`; then `VERIFY PASSED for boma-it-ubongo-…`. The VM is destroyed on success.
|
||||
|
||||
- [ ] **Step 3: On failure, read the diagnostics**
|
||||
|
||||
If it prints `VERIFY FAILED`, diagnostics are in `~/integration-runs/boma-it-ubongo-<id>/` (`nft.txt`, `console.log`, `journal.txt`). The likely suspects: the admin-addr/forward assertion (Task 1/3 wiring) or SSH not returning post-reboot (the `base__firewall_control_addr: 192.168.150.1` lifeline in the overlay). Fix the implicated task, re-commit, and re-run Step 2. If a VM was left defined, run `make test-integration-clean` first.
|
||||
|
||||
- [ ] **Step 4: Record the result**
|
||||
|
||||
Capture the `VERIFY PASSED` line in the task notes (this is the gate Task 5 step 1 depends on). No commit.
|
||||
|
||||
---
|
||||
|
||||
### Task 5: Live staged cutover (operator-supervised — NOT a subagent task)
|
||||
|
||||
> Touches the **real ubongo** (the control node Ansible runs from) and reboots it — lockout-
|
||||
> risky. Run it interactively with the operator, in order, verifying each step before the
|
||||
> next. The firewall auto-rollback timer (`base__firewall_rollback_timeout`, 45 s) +
|
||||
> `wait_for_connection` over the live path is the safety net; the **on-prem physical console**
|
||||
> is the permanent break-glass. Do NOT hand this to an unattended agent.
|
||||
|
||||
- [ ] **Step 1: Pre-checks (gate: Task 4 GREEN)**
|
||||
|
||||
- `rbw unlocked || rbw unlock`.
|
||||
- SSH to ubongo over `wt0` from a road-warrior succeeds.
|
||||
- SSH to ubongo from mamba on the LAN (`10.20.10.50`) succeeds.
|
||||
- `.venv/bin/ansible ubongo -i inventories/production/ -m ping` → `SUCCESS` (over `10.20.10.151`).
|
||||
- The physical console is reachable. If any path fails, STOP.
|
||||
|
||||
- [ ] **Step 2: Dry-run the firewall apply**
|
||||
|
||||
Run: `make check PLAYBOOK=site LIMIT=ubongo TAGS=firewall`
|
||||
Expected: the nftables diff shows `policy drop` on input, `iifname "wt0" … accept`, `ip saddr 10.20.10.151 … accept`, `ip saddr 10.20.10.50 … accept`, `ip saddr 10.20.10.17 … accept`, and the forward chain as `policy accept`. No errors.
|
||||
|
||||
- [ ] **Step 3: Apply the host firewall (auto-rollback armed)**
|
||||
|
||||
Run: `make deploy PLAYBOOK=site LIMIT=ubongo TAGS=firewall`
|
||||
Expected: the firewall concern snapshots `/etc/nftables.rollback`, arms the 45 s `systemd-run` revert, applies the ruleset, `reset_connection` → `wait_for_connection` over `10.20.10.151` succeeds, then cancels the timer. If connectivity is lost, the timer reverts the ruleset within 45 s and the console is the fallback.
|
||||
|
||||
- [ ] **Step 4: Verify every path + forwarding still works**
|
||||
|
||||
```bash
|
||||
# from a road-warrior over wt0, and from mamba on the LAN:
|
||||
ssh sjat@100.99.146.14 true && echo "wt0 OK"
|
||||
ssh sjat@10.20.10.151 true && echo "mamba-LAN OK" # run from mamba (10.20.10.50)
|
||||
# Ansible self-path:
|
||||
.venv/bin/ansible ubongo -i inventories/production/ -m ping
|
||||
# a disallowed LAN host (any source other than the allow-listed 10.20.10.50 / 10.20.10.17) must now be refused/timeout on :22
|
||||
# Docker egress (forward chain still permissive):
|
||||
docker run --rm busybox wget -qO- https://cloudflare.com/cdn-cgi/trace | head -1
|
||||
# libvirt-NAT forwarding intact — a fresh integration VM still reaches apt:
|
||||
make test-integration HOST=ubongo # expect VERIFY PASSED (proves the NAT path survived)
|
||||
```
|
||||
Expected: `wt0 OK`, `mamba-LAN OK`, Ansible `SUCCESS`, the disallowed host refused, the Docker egress line returns, and the integration cycle passes.
|
||||
|
||||
- [ ] **Step 5: Reboot resilience — while the console is present (FRICTION signal-6)**
|
||||
|
||||
With the operator at the physical console, reboot ubongo (`sudo systemctl reboot`). After it returns, confirm SSH comes back on all paths **unaided**:
|
||||
|
||||
```bash
|
||||
ssh sjat@100.99.146.14 true && echo "wt0 OK after reboot"
|
||||
.venv/bin/ansible ubongo -i inventories/production/ -m ping
|
||||
```
|
||||
Expected: SSH returns with no manual intervention (no `ListenAddress`, so nothing to race). Only now is the cutover complete.
|
||||
|
||||
- [ ] **Step 6: Update STATUS + ROADMAP**
|
||||
|
||||
- In `STATUS.md`: in the `roles/base/` row of "Scaffolded but empty", change the firewall note — the `firewall` concern is now **applied to ubongo** as INPUT-only default-deny (it is no longer "not yet applied to any host"); note the `base__firewall_input_only` knob and that the forward default-deny still awaits the `docker_host` drop-in for real service hosts. In the ubongo control-node row, mark the "Pending" item for default-deny as done.
|
||||
- In `docs/ROADMAP.md`: mark **mesh-hardening sub-project 2 (ubongo default-deny) done**; the remaining follow-ons are sub-project 1 (askari SSH→`wt0` *redesign*) and sub-project 3 (NetBird ACL). Update the "Next step" section accordingly.
|
||||
|
||||
```bash
|
||||
git add STATUS.md docs/ROADMAP.md
|
||||
git commit -m "docs: ubongo INPUT-only default-deny applied (mesh-hardening 2/3 done)
|
||||
|
||||
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
|
||||
```
|
||||
|
||||
- [ ] **Step 7: Push**
|
||||
|
||||
Run: `git push origin main`
|
||||
|
||||
---
|
||||
|
||||
## Self-review (against the spec)
|
||||
|
||||
- **§ Design — INPUT-only default-deny** → Task 1 (forward-policy knob) + Task 2 (enabled on ubongo). ✓
|
||||
- **§ Design — admin-addrs (operator workstations on LAN)** → Task 1 (`base__firewall_admin_addrs` + template loop) + Task 2 (`10.20.10.50` mamba, `10.20.10.17`). ✓
|
||||
- **§ Design — no sshd ListenAddress change** → nothing touches `ssh.yml`/`sshd_hardening.conf.j2`; only nftables. ✓ (verified: Tasks 1–3 file lists exclude them).
|
||||
- **§ allow-list** (lo, established, wt0, ssh-from-control, admin-addr, icmp; forward accept) → template already renders lo/established/wt0/control/icmp; Task 1 adds admin-addr + forward-accept. ✓
|
||||
- **§ Why-safe (incident signals 1/2/3/6)** → signal 1 (forward accept, Task 1); signal 2 (no ListenAddress); signal 3 (ubongo keeps LAN + console); signal 6 (Task 4 harness reboot + Task 5 step 5 reboot-while-console). ✓
|
||||
- **§ New & changed code** (defaults, template, molecule, group_vars/control, integration profile) → Tasks 1–3. ✓
|
||||
- **§ admin raw-leases + revisit** → Task 2 comments record both leases + the OPNsense-reservation revisit trigger; backstop (wt0) noted; flagged in `FRICTION.md`. ✓
|
||||
- **§ Testing** (Molecule render asserts; `make test-integration HOST=ubongo`; live checks) → Task 1 (Molecule), Task 4 (harness), Task 5 step 4 (live). ✓ Coverage split (default in Molecule, input_only on the VM) noted in Task 1.
|
||||
- **§ Staged cutover (signal-6 order)** → Task 5 steps 1–7; reboot-recovery (step 5) precedes nothing that retires a break-glass (the console is permanent). ✓
|
||||
- **§ Risks/rollback** → auto-rollback (Task 5 step 3), redundant paths + physical console, raw-lease backstop. ✓
|
||||
- **Type/name consistency:** `base__firewall_input_only` (bool) and `base__firewall_admin_addrs` (list) are spelled identically in defaults, template, converge, group_vars, and the overlay. `integration_profile` is spelled identically in both overlays and the three gates in `verify.yml`. ✓
|
||||
- **Placeholder scan:** no TBD/TODO; every code/command step shows the actual content. ✓
|
||||
|
|
@ -1,237 +0,0 @@
|
|||
# Mesh SPOF — accept + targeted resilience — Implementation Plan
|
||||
|
||||
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
|
||||
|
||||
**Goal:** Accept askari's single-coordinator SPOF as a documented availability trade-off, and harden the one real gap — a `base` mesh knob that pins the coordinator FQDN in `/etc/hosts` on managed mesh hosts so a local-DNS hiccup can't strand the mesh.
|
||||
|
||||
**Architecture:** One additive, idempotent `base` `mesh`-concern task (a `/etc/hosts` line via `lineinfile`, gated on a new opt-in knob), Molecule-tested; plus documentation (accepted-risk R8 + an ADR-016 availability amendment + STATUS/ROADMAP). No new infra, no Terraform, no live-deploy gate.
|
||||
|
||||
**Tech Stack:** Ansible (`base` role, `lineinfile`), Molecule (Debian 13), Markdown docs.
|
||||
|
||||
**Spec:** `docs/superpowers/specs/2026-06-20-mesh-spof-accept-resilience-design.md`
|
||||
|
||||
## Global Constraints
|
||||
|
||||
- **FQCN always** (`ansible.builtin.*`); role defaults use the `rolename__var` namespace.
|
||||
- **No new collection** — derive the coordinator FQDN with builtin `regex_replace` (NOT `urlsplit`, which would pull in `community.general`).
|
||||
- The pin is **opt-in and additive**: gated on `base__mesh_enabled | bool` AND `base__mesh_coordinator_pin | length > 0`. Empty knob (the default) = a clean no-op. The coordinator host (`askari`/`offsite_hosts`) is **exempt** — leave its pin empty.
|
||||
- **askari's coordinator IP = `77.42.120.136`** (stable WAN; the A record for `netbird.askari.wingu.me`); ubongo is in the `control` group.
|
||||
- `make lint` clean + `rbw unlocked` before any commit (the pre-commit hook decrypts the vault).
|
||||
- **No new infra** — no P2P, no second relay/coordinator, no Terraform. The coordinator off-site backup is **out of scope** (ADR-022 kickoff).
|
||||
- Tags: the new task carries the `mesh` concern tag (it belongs to the mesh concern).
|
||||
|
||||
---
|
||||
|
||||
### Task 1: `base` mesh coordinator-FQDN `/etc/hosts` pin (DNS-resilience)
|
||||
|
||||
Add an opt-in knob that pins the coordinator FQDN (derived from `base__mesh_management_url`) to a stable IP in `/etc/hosts`, so a managed mesh host survives a local-DNS failure. TDD'd through the role's Molecule scenario (which already exercises the `mesh` concern with `manage: false`).
|
||||
|
||||
**Files:**
|
||||
- Modify: `roles/base/defaults/main.yml` (add the knob after the mesh block, ~line 53)
|
||||
- Modify: `roles/base/tasks/mesh.yml` (append the pin task)
|
||||
- Modify: `roles/base/molecule/default/converge.yml` (add a fixture pin to the vars block)
|
||||
- Modify: `roles/base/molecule/default/verify.yml` (assert the rendered `/etc/hosts` line)
|
||||
- Modify: `inventories/production/group_vars/control/vars.yml` (set the pin for ubongo)
|
||||
|
||||
**Interfaces:**
|
||||
- Produces: role default `base__mesh_coordinator_pin` (string, default `""`); when set + `base__mesh_enabled`, an `/etc/hosts` line `<pin-ip> <fqdn>` where `<fqdn>` is `base__mesh_management_url` minus scheme/port/path.
|
||||
|
||||
- [ ] **Step 1: Write the failing Molecule test (fixture + assertion)**
|
||||
|
||||
In `roles/base/molecule/default/converge.yml`, add one line to the `vars:` block (after `base__mesh_setup_key`, ~line 15):
|
||||
|
||||
```yaml
|
||||
base__mesh_coordinator_pin: "203.0.113.9" # fixture coordinator IP (TEST-NET-3); pins the FQDN from base__mesh_management_url
|
||||
```
|
||||
|
||||
In `roles/base/molecule/default/verify.yml`, append to the `tasks:` list (after the mesh no-op assertion at the end):
|
||||
|
||||
```yaml
|
||||
- name: Read /etc/hosts (coordinator pin)
|
||||
ansible.builtin.slurp:
|
||||
src: /etc/hosts
|
||||
register: _etchosts
|
||||
- name: Assert the coordinator FQDN is pinned to the fixture IP (DNS-resilience / R8)
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- "'203.0.113.9 netbird.askari.wingu.me' in (_etchosts.content | b64decode)"
|
||||
fail_msg: "base__mesh_coordinator_pin did not render the /etc/hosts coordinator pin"
|
||||
success_msg: "coordinator FQDN pinned in /etc/hosts"
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Run Molecule to verify it fails**
|
||||
|
||||
Run: `make test ROLE=base`
|
||||
Expected: FAIL at "Assert the coordinator FQDN is pinned…" — no pin task exists yet, so `/etc/hosts` has no such line.
|
||||
|
||||
- [ ] **Step 3: Add the default knob**
|
||||
|
||||
In `roles/base/defaults/main.yml`, after `base__mesh_version` (~line 53), add:
|
||||
|
||||
```yaml
|
||||
|
||||
# DNS-resilience (ADR-016 availability / accepted-risk R8): when set to the coordinator's
|
||||
# stable IP, pin the coordinator FQDN (derived from base__mesh_management_url) in /etc/hosts
|
||||
# so a managed mesh host survives a local-DNS hiccup (the 2026-06-18 incident class). Empty
|
||||
# = no pin. The coordinator host itself (askari/offsite_hosts) is exempt — leave it empty.
|
||||
base__mesh_coordinator_pin: ""
|
||||
```
|
||||
|
||||
- [ ] **Step 4: Add the pin task**
|
||||
|
||||
Append to `roles/base/tasks/mesh.yml`:
|
||||
|
||||
```yaml
|
||||
|
||||
- name: Pin the NetBird coordinator FQDN in /etc/hosts (DNS-resilience, ADR-016 availability / R8)
|
||||
ansible.builtin.lineinfile:
|
||||
path: /etc/hosts
|
||||
regexp: '\s{{ _coordinator_fqdn | regex_escape }}$'
|
||||
line: "{{ base__mesh_coordinator_pin }} {{ _coordinator_fqdn }}"
|
||||
state: present
|
||||
vars:
|
||||
_coordinator_fqdn: "{{ base__mesh_management_url | regex_replace('^https?://', '') | regex_replace('[:/].*', '') }}"
|
||||
when:
|
||||
- base__mesh_enabled | bool
|
||||
- base__mesh_coordinator_pin | length > 0
|
||||
tags: [mesh]
|
||||
```
|
||||
|
||||
(`_coordinator_fqdn` strips the scheme then anything from the first `:`/`/` → `netbird.askari.wingu.me`. The `regexp` matches an existing ` <fqdn>` at line end so a changed IP updates in place — idempotent; absent → appended.)
|
||||
|
||||
- [ ] **Step 5: Run Molecule to verify it passes**
|
||||
|
||||
Run: `make test ROLE=base`
|
||||
Expected: PASS — the new assertion is green and Molecule idempotence is clean (re-running the pin task reports `ok`, not `changed`). The idempotence pass is what proves the `regexp` matches the line it wrote.
|
||||
|
||||
> Note: the empty-knob no-op (the production default for non-mesh / coordinator hosts) is guaranteed by the `when: base__mesh_coordinator_pin | length > 0` gate, not a separate Molecule case — a single converge can't hold both var-states, and boma uses one default scenario per role. The fixture exercises the meaningful path (rendering + FQDN extraction + idempotence).
|
||||
|
||||
- [ ] **Step 6: Wire the production pin for ubongo**
|
||||
|
||||
In `inventories/production/group_vars/control/vars.yml`, after the `base__mesh_enabled: true` block, add:
|
||||
|
||||
```yaml
|
||||
|
||||
# DNS-resilience (ADR-016 availability / R8): pin the coordinator FQDN to askari's stable WAN
|
||||
# IP in /etc/hosts so a local-DNS hiccup (the 2026-06-18 incident class) can't strand ubongo's
|
||||
# mesh. askari (offsite_hosts) is exempt — it reaches the coordinator locally.
|
||||
base__mesh_coordinator_pin: "77.42.120.136"
|
||||
```
|
||||
|
||||
- [ ] **Step 7: Lint and commit**
|
||||
|
||||
```bash
|
||||
rbw unlocked && make lint
|
||||
git add roles/base/defaults/main.yml roles/base/tasks/mesh.yml \
|
||||
roles/base/molecule/default/converge.yml roles/base/molecule/default/verify.yml \
|
||||
inventories/production/group_vars/control/vars.yml
|
||||
git commit -m "feat(base): pin the NetBird coordinator FQDN in /etc/hosts (mesh DNS-resilience)" \
|
||||
-m "Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 2: Accept + document the SPOF (R8, ADR-016 amendment, STATUS/ROADMAP)
|
||||
|
||||
Record the single-coordinator SPOF as a conscious, revisitable trade-off and capture the availability analysis + recovery. Pure documentation; references the pin from Task 1.
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/security/accepted-risks.md` (add row R8; bump the review date)
|
||||
- Modify: `docs/decisions/016-mesh-vpn.md` (add the availability amendment subsection)
|
||||
- Modify: `STATUS.md` (note the SPOF accepted + the coordinator-pin knob)
|
||||
- Modify: `docs/ROADMAP.md` (mark sub-project 3 addressed; surface ADR-022 backup + ACL as next)
|
||||
|
||||
- [ ] **Step 1: Add accepted-risk R8**
|
||||
|
||||
In `docs/security/accepted-risks.md`, add this row to the table after R7:
|
||||
|
||||
```markdown
|
||||
| R8 | **Single off-site mesh coordinator is an availability SPOF for remote mesh access** — `askari` hosts the only NetBird management/signal/relay (ADR-016); while askari is down, every *relayed* peer (all of `ubongo`'s, by the deliberate default-deny posture) loses remote mesh reachability and the control plane pauses. The `netbird_coordinator` store also has **no off-site backup yet** (BACKUP.md), so an askari loss loses mesh control-plane state until rebuilt | Inherent to ADR-016's deliberate single off-site coordinator (sovereignty; survives a homelab outage). **Narrow blast radius:** the mesh is not a gateway (`wt0` routes only `100.99.0.0/16`) — LAN, intra-cluster, and local-service traffic are unaffected; only remote/off-LAN mesh access breaks, and only when off-LAN *and* askari is down at once. askari is a reliable always-on VPS; mitigations: client + managed-host coordinator-FQDN DNS pin (`base__mesh_coordinator_pin`; runbook), documented `/setup` rebuild | askari proves unreliable; the cluster grows to depend on the mesh for intra-node traffic; remote mesh access becomes business-critical; or the ADR-022 backup role lands (closes the state-loss half) |
|
||||
```
|
||||
|
||||
Then update the closing line's date: change `_Last reviewed: 2026-06-18.` to `_Last reviewed: 2026-06-20.`
|
||||
|
||||
- [ ] **Step 2: Add the ADR-016 availability amendment**
|
||||
|
||||
In `docs/decisions/016-mesh-vpn.md`, add this subsection immediately before the `## Related` section:
|
||||
|
||||
```markdown
|
||||
## Availability — an `askari` outage (amendment 2026-06-20)
|
||||
|
||||
The coordinator is deliberately **single** (one off-site host). Recorded here so its
|
||||
availability envelope is explicit; accepted as **R8** (`docs/security/accepted-risks.md`).
|
||||
|
||||
The mesh is **not** a default gateway — `wt0` routes only the overlay CIDR (`100.99.0.0/16`);
|
||||
normal traffic uses the host's default route. So an `askari` outage has a **narrow blast
|
||||
radius**:
|
||||
|
||||
| Traffic | `askari` down |
|
||||
|---|---|
|
||||
| LAN device → LAN service (direct / via reverse proxy) | unaffected |
|
||||
| node ↔ node over LAN IPs (cluster) | unaffected |
|
||||
| node ↔ node same-LAN over mesh IPs | unaffected (direct P2P) |
|
||||
| **road-warrior → `ubongo` (remote, relayed)** | **breaks** |
|
||||
| mesh control plane (new enrol / ACL change / re-handshake) | pauses |
|
||||
|
||||
Only remote (off-LAN) mesh access to peers is lost, and only when off-LAN **and** `askari`
|
||||
is down simultaneously. On-LAN access to `ubongo` never depends on the mesh (Recovery &
|
||||
operations, above).
|
||||
|
||||
**Recovery:** rebuild the coordinator (`/setup` + re-enrol peers, M5) or restore from backup
|
||||
once ADR-022 lands; the `netbird_coordinator` store backup is the **next sub-project** (its
|
||||
gap is named in R8 and `BACKUP.md`). Client/road-warrior break-glass (reliable resolvers +
|
||||
the coordinator-FQDN `/etc/hosts` pin) is in `docs/runbooks/netbird-client.md`; managed mesh
|
||||
hosts get the same pin via `base__mesh_coordinator_pin`.
|
||||
|
||||
**Not pursued** (deliberately, given the narrow blast radius): direct P2P (punctures the
|
||||
default-deny posture; only helps established sessions), a second relay (needs another public
|
||||
host / reintroduces the home public surface), a second coordinator (unsupported by
|
||||
self-hosted NetBird; against this ADR).
|
||||
```
|
||||
|
||||
- [ ] **Step 3: Update STATUS.md**
|
||||
|
||||
In `STATUS.md`, in the `roles/base/` row, append to the end of the firewall/mesh description (before the closing ` |`): a sentence noting the pin and the accepted SPOF:
|
||||
|
||||
```markdown
|
||||
The `mesh` concern also pins the coordinator FQDN in `/etc/hosts` (`base__mesh_coordinator_pin`, set for ubongo) so a local-DNS hiccup can't strand the mesh; the single-coordinator SPOF is an accepted availability risk (R8, ADR-016 availability amendment).
|
||||
```
|
||||
|
||||
- [ ] **Step 4: Update ROADMAP.md**
|
||||
|
||||
In `docs/ROADMAP.md`, in the "Remaining mesh-hardening sub-projects" list, change item 3 from the SPOF-reduction "(next)" wording to **DONE**, and make the NetBird ACL the next item. Replace the current items 3–4 block with:
|
||||
|
||||
```markdown
|
||||
3. ~~**askari relay-SPOF reduction**~~ → **DONE (2026-06-20)** — assessed + **accepted** as a
|
||||
documented availability risk (R8 + ADR-016 availability amendment): the blast radius is
|
||||
narrow (LAN/intra-cluster/local traffic never touch askari), so no P2P / second relay /
|
||||
second coordinator was warranted. Hardened the one real gap — a managed-host coordinator-FQDN
|
||||
DNS pin (`base__mesh_coordinator_pin`). The coordinator off-site backup gap is handed to ADR-022.
|
||||
4. **NetBird ACL off Allow-All** to scoped policies (open mechanism question — no headless API path).
|
||||
5. **ADR-022 backup kickoff** — off-site backup of the `netbird_coordinator` store (named in R8 /
|
||||
BACKUP.md) as the first slice of the backup role (restic + the `fisi` pull node).
|
||||
```
|
||||
|
||||
- [ ] **Step 5: Consistency check + commit**
|
||||
|
||||
```bash
|
||||
grep -q "^| R8 " docs/security/accepted-risks.md && \
|
||||
grep -q "Availability — an .askari. outage" docs/decisions/016-mesh-vpn.md && \
|
||||
echo "docs OK"
|
||||
```
|
||||
Expected: `docs OK`.
|
||||
|
||||
```bash
|
||||
rbw unlocked
|
||||
git add docs/security/accepted-risks.md docs/decisions/016-mesh-vpn.md STATUS.md docs/ROADMAP.md
|
||||
git commit -m "docs(security): accept the single-coordinator mesh SPOF (R8) + ADR-016 availability amendment" \
|
||||
-m "Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Notes / out of scope
|
||||
|
||||
- **Coordinator off-site backup → ADR-022 kickoff** (next sub-project). Not built here.
|
||||
- **Direct P2P / second relay / second coordinator** — deliberately not pursued (spec §Design).
|
||||
- No live deploy is required to land this — the pin is additive/idempotent and applies to ubongo on the next routine `base` apply (`make deploy PLAYBOOK=site LIMIT=ubongo`, operator's discretion). Optional post-deploy spot-check: `getent hosts netbird.askari.wingu.me` on ubongo resolves to `77.42.120.136`.
|
||||
|
|
@ -1,156 +0,0 @@
|
|||
# Spec — Mesh-hardening (1 of 3): move askari's SSH onto `wt0`
|
||||
|
||||
Status: Accepted (2026-06-17)
|
||||
|
||||
## Context & scope
|
||||
|
||||
The **mesh-hardening follow-on** was deferred from M5 (ROADMAP). It was decomposed into
|
||||
**three independent sub-projects**, each with its own spec → plan → implementation cycle:
|
||||
|
||||
1. **askari SSH → `wt0`** ← *this spec*
|
||||
2. ubongo nftables default-deny + `ssh-from-control` (its own later spec)
|
||||
3. NetBird ACL off Allow-All → scoped policies (its own later spec)
|
||||
|
||||
This spec covers **only (1)**. It makes askari's SSH reachable **only over the NetBird mesh
|
||||
interface `wt0`** and closes the WAN `:22` surface at both the host and the Hetzner Cloud
|
||||
Firewall. It does **not** touch ubongo, the NetBird ACL (stays Allow-All for now — one
|
||||
moving access-layer at a time), or askari's public service exposure (Caddy 80/443, NetBird
|
||||
STUN 3478 stay on the WAN).
|
||||
|
||||
Current state (STATUS.md): askari is reached at `ansible_host: 77.42.120.136` (WAN, in the
|
||||
TF-generated `inventories/production/offsite.yml`); `wt0` is up at `100.99.226.39`
|
||||
(Management+Signal Connected, M5); the base nftables `firewall` concern is **built but not
|
||||
applied** to askari (the Hetzner Cloud Firewall is its perimeter today); the Hetzner Cloud
|
||||
Firewall (`terraform/modules/hetzner_vm`) opens `:22` from `var.ssh_admin_cidrs` plus
|
||||
80/443/3478 from anywhere.
|
||||
|
||||
## Goal / success criteria
|
||||
|
||||
- SSH to askari succeeds over `wt0` (from ubongo) and **fails from any off-mesh source**.
|
||||
- The WAN `:22` surface is closed at **both** layers (host nftables = `wt0`-only; Hetzner
|
||||
Cloud Firewall drops the `:22` rule).
|
||||
- Public services are unaffected: `https://test.askari.wingu.me` and
|
||||
`https://netbird.askari.wingu.me` serve valid certs; STUN `3478/udp` still answers.
|
||||
- Ansible manages askari over `wt0`.
|
||||
- Break-glass is the **Hetzner web console** (out-of-band; works even if the mesh is down).
|
||||
- A reboot of askari does **not** lock SSH out (the boot-race below is solved).
|
||||
|
||||
## Design — three enforcement layers (defense-in-depth)
|
||||
|
||||
1. **sshd** binds `ListenAddress` to askari's `wt0` IP only, so it does not accept on WAN.
|
||||
2. **host nftables** (base `firewall` concern, ADR-020): catalog-driven default-deny;
|
||||
`:22` allowed only via `iifname "wt0"` (the interface-name match that survives `wt0`
|
||||
being absent — see `docs/testing/gotchas.md`); public service ports stay open on WAN.
|
||||
3. **Hetzner Cloud Firewall** (Terraform): the `:22` `ssh_admin_cidrs` rule is removed;
|
||||
80/443/3478 stay.
|
||||
|
||||
## The boot-race fix (load-bearing)
|
||||
|
||||
`wt0` is brought up by NetBird **after** boot, so at sshd start the `wt0` IP may not exist
|
||||
yet. A plain `ListenAddress 100.99.226.39` would fail to bind → sshd exits → **lockout on
|
||||
reboot**. Solution:
|
||||
|
||||
- **`net.ipv4.ip_nonlocal_bind = 1`** via a sysctl drop-in (`ansible.posix.sysctl`,
|
||||
persisted under `/etc/sysctl.d/`). This lets sshd bind the `wt0` address even before the
|
||||
interface exists; once `wt0` comes up with that IP, traffic is delivered to the existing
|
||||
listener — no reload needed.
|
||||
- The sshd drop-in **fails closed**: the mesh IP is resolved (see below) and the play
|
||||
**asserts it is non-empty** before rendering. An empty `ListenAddress` would silently
|
||||
fall back to listening on all interfaces, defeating the restriction — that must never
|
||||
render.
|
||||
|
||||
**Mesh-IP source (decided):** the **live `wt0` fact** `ansible_wt0.ipv4.address`, gathered
|
||||
at apply time (`wt0` is up during the play, since M5), with a **`host_var` fallback**
|
||||
(`base__ssh_listen_addr`, default `""`) and a fail-closed `assert` that one of them yielded
|
||||
a non-empty address. Live fact is preferred (correct even if NetBird reassigns the IP);
|
||||
the host_var is an explicit override / belt-and-braces fallback.
|
||||
|
||||
## New & changed code
|
||||
|
||||
**Role `base` (the `hardening` + `firewall` concerns):**
|
||||
|
||||
- `roles/base/defaults/main.yml` — add:
|
||||
- `base__ssh_listen_mesh_only: false` — opt-in; when `true`, sshd binds the mesh IP only.
|
||||
- `base__ssh_listen_addr: ""` — optional explicit mesh-IP override (fallback to the
|
||||
`ansible_wt0` fact).
|
||||
- `roles/base/tasks/ssh.yml` —
|
||||
- resolve the mesh IP (`base__ssh_listen_addr` or `ansible_wt0.ipv4.address`) into a fact;
|
||||
- `assert` it is non-empty **when** `base__ssh_listen_mesh_only`;
|
||||
- set `net.ipv4.ip_nonlocal_bind = 1` (sysctl drop-in) under the same condition.
|
||||
- `roles/base/templates/sshd_hardening.conf.j2` — append a conditional
|
||||
`ListenAddress {{ resolved_mesh_ip }}` block guarded by `base__ssh_listen_mesh_only`
|
||||
(unset → unchanged behaviour: listen on all). Keep the existing `sshd -t` validation.
|
||||
|
||||
**Inventory:**
|
||||
|
||||
- `inventories/production/host_vars/askari.yml` (new) — `ansible_host: 100.99.226.39`
|
||||
(overrides the TF-generated `offsite.yml`; host_vars are not regenerated by
|
||||
`tf_to_inventory.py`). A header comment explains why.
|
||||
- `inventories/production/group_vars/offsite_hosts/vars.yml` — add
|
||||
`base__ssh_listen_mesh_only: true`; ensure `base__firewall_apply: true`.
|
||||
(`base__mesh_enabled` is already `true` for askari — set in M5 — and is a precondition,
|
||||
not a change here.)
|
||||
|
||||
**Firewall catalog** (`inventories/production/group_vars/all/firewall.yml`):
|
||||
|
||||
- Enumerate askari's required ingress so catalog-driven default-deny does **not** drop a
|
||||
live public service. Derived from the existing `reverse_proxy` + `netbird_coordinator`
|
||||
definitions: `:22/tcp` on the **mesh** zone (`wt0`); `80,443/tcp` + `3478/udp` on the
|
||||
**public** zone (WAN). The exact catalog/zone YAML is finalised in the implementation
|
||||
plan against the `resolve_firewall_rules` filter's schema.
|
||||
|
||||
**Terraform** (`terraform/environments/offsite` + `terraform/modules/hetzner_vm`):
|
||||
|
||||
- Remove the WAN `:22` ingress rule (e.g. drop `ssh_admin_cidrs` from the firewall, or set
|
||||
it empty and guard the rule). Keep 80/443/3478. Applied via `make tf-plan/apply
|
||||
TF_ENV=offsite` (plan shown before apply).
|
||||
|
||||
## Staged cutover — a working path at every step
|
||||
|
||||
1. **Pre-check:** confirm `ssh sjat@100.99.226.39` and an `ansible askari -m ping` forced
|
||||
over `wt0` both succeed **before** changing anything.
|
||||
2. **Repoint Ansible:** add `host_vars/askari.yml` (`ansible_host` = `wt0` IP); verify
|
||||
`ansible askari -m ping` runs over the mesh. WAN `:22` still open as a fallback here.
|
||||
3. **Apply `base` (firewall + sshd together):** one `make deploy PLAYBOOK=site LIMIT=askari`
|
||||
converge applies catalog default-deny (`:22` on `wt0` + public ports) **and** the sshd
|
||||
`ListenAddress`=mesh + `ip_nonlocal_bind` drop-in. The firewall concern's
|
||||
`reset_connection` → `wait_for_connection` (now over `wt0`) plus the armed auto-rollback
|
||||
timer (`base__firewall_rollback_timeout`, 45 s) is the safety gate — a bad ruleset
|
||||
reverts itself. The sshd `reload` cannot drop the in-flight `wt0` session. Verify the
|
||||
public services still respond.
|
||||
4. **Retire the Hetzner WAN `:22`:** the Terraform change above; `make tf-plan
|
||||
TF_ENV=offsite` (review) → `make tf-apply`. Verify: `wt0` SSH works; off-mesh `nc -vz
|
||||
77.42.120.136 22` is refused/times out; `:443` open; STUN answers.
|
||||
|
||||
## Testing
|
||||
|
||||
- **Molecule** (base `default` scenario; `wt0` absent in-container, `base__firewall_apply:
|
||||
false` render-only): assert (a) the rendered nftables allows `:22` via `iifname "wt0"`;
|
||||
(b) with `base__ssh_listen_mesh_only: true` + a fixture mesh IP, the sshd drop-in renders
|
||||
`ListenAddress <ip>` and `sshd -t` passes; (c) with the flag set but **no** resolvable
|
||||
mesh IP, the play **fails closed** (the `assert`); (d) the `ip_nonlocal_bind` sysctl task
|
||||
is present. Keep existing firewall/hardening assertions green.
|
||||
- **Live, out-of-band:** post-cutover, from an off-mesh host `nc -vz 77.42.120.136 22` →
|
||||
refused; `:443` → open; from ubongo over `wt0`, SSH + `ansible -m ping` succeed; reboot
|
||||
askari (Hetzner console) and confirm SSH-over-`wt0` returns without intervention.
|
||||
|
||||
## Risks & rollback
|
||||
|
||||
- **Mid-cutover lockout:** mitigated by the staged order (a path open at each step), the
|
||||
firewall auto-rollback timer, and `ansible_host`=`wt0` so the connectivity confirmation tests
|
||||
the real new path.
|
||||
- **Reboot lockout:** mitigated by `ip_nonlocal_bind` (sshd binds `wt0` regardless of
|
||||
interface timing) + the fail-closed assert (never silently listen-all).
|
||||
- **Default-deny breaks a public service:** mitigated by enumerating all live ingress into
|
||||
the catalog and the §Testing service checks; reversible by re-running with
|
||||
`base__firewall_apply: false` or widening the catalog.
|
||||
- **Ultimate break-glass:** the Hetzner web console (out-of-band). The TF `:22` rule is
|
||||
trivially re-addable.
|
||||
|
||||
## Out of scope / follow-ons
|
||||
|
||||
- ubongo default-deny + `ssh-from-control` (sub-project 2).
|
||||
- NetBird ACL off Allow-All (sub-project 3) — until then any enrolled peer can reach
|
||||
askari's `wt0:22`; scoping that is sub-project 3's job.
|
||||
- `/check-access` (ADR-021) live verification — designed, build still pending.
|
||||
- STATUS.md / ROADMAP updates land with the implementation, not this spec.
|
||||
|
|
@ -1,267 +0,0 @@
|
|||
# Local VM integration testing on ubongo (design)
|
||||
|
||||
**Status:** Designed, not built. Resolves `docs/TODO.md` item 2.4 (Local VM integration
|
||||
testing on ubongo, pre-deploy).
|
||||
**Date:** 2026-06-18.
|
||||
**Implements:** the concrete build of ADR-008 Level 2/3 (staging/integration), deferred
|
||||
for lack of hosts but hostable on ubongo. To be recorded as **ADR-025**.
|
||||
|
||||
## Context
|
||||
|
||||
Molecule (ADR-008 Level 1) tests each role in a single Docker container: one `converge`,
|
||||
no real kernel netfilter, no real Docker daemon in the loop, and **no reboot**. That
|
||||
structurally cannot catch an entire class of bug — reboot-survivability, host-firewall ×
|
||||
Docker interaction, and boot-ordering — which is exactly the class that caused the
|
||||
**2026-06-17 mesh-hardening incident**:
|
||||
|
||||
- `base`'s nftables `forward { policy drop; }` killed the askari Docker host **on reboot**
|
||||
(nftables loaded its default-deny *before* Docker, breaking published-port DNAT and
|
||||
inter-container forwarding → public services + the mesh went down). It had worked right
|
||||
after `make deploy`, when Docker's runtime rules still coexisted. (FRICTION 2026-06-17 #1.)
|
||||
- `ip_nonlocal_bind` did **not** beat the sshd boot-race; sshd bound to the `wt0` mesh IP
|
||||
had no listener at boot. (FRICTION #2.)
|
||||
- The coordinator host could not bootstrap the mesh it itself hosts. (FRICTION #3.)
|
||||
- NetBird `netbird-server` FATAL-loops on the GeoLite2 download when egress is lost — and
|
||||
egress was lost when `nft flush` wiped Docker's NAT masquerade. (FRICTION #4.)
|
||||
|
||||
Recovery needed the Hetzner console + a WAN-SSH break-glass. The lesson, already crystallised
|
||||
as a standing rule: *firewall/sshd/boot changes must be tested on a real VM with a real
|
||||
reboot before they touch a live host, and a non-mesh break-glass must be kept.*
|
||||
|
||||
This spec defines a way for the agent to spin up **throwaway KVM VMs locally on ubongo**
|
||||
that mirror a target host (real Docker, a real reboot, the real role apply) and validate
|
||||
risky infra changes **before** a live deploy. ubongo can host this today:
|
||||
|
||||
> verified: ubongo KVM capability · Bash (this session) · `/dev/kvm` present + accessible
|
||||
> (kvm group), Intel VT-x (`vmx`) enabled, 8 vCPU (i3-10100T), ~13 GiB RAM free of 16, ~198
|
||||
> GiB disk free; libvirt/QEMU/Vagrant **not yet installed** · 2026-06-18.
|
||||
|
||||
## Goals
|
||||
|
||||
- Reproduce the 2026-06-17 bug class locally: real OS boot, real Docker, real netfilter,
|
||||
the real role apply, a **real reboot**, then outcome assertions.
|
||||
- Let the agent drive the full loop autonomously: provision → apply → reboot → assert →
|
||||
teardown, with diagnostics captured on failure.
|
||||
- Mirror a *real* host from inventory (first profile: "be askari"), so the apply is
|
||||
faithful, not synthetic.
|
||||
- Be the concrete tool that operationalises the standing "test risky infra before live
|
||||
deploy" rule.
|
||||
|
||||
## Non-goals (v1)
|
||||
|
||||
- Not a production hypervisor on ubongo (reconciles ADR-015 — see Governance).
|
||||
- Not nested Proxmox; the provisioning *chrome* (template clone / Terraform) is **not**
|
||||
mirrored — every incident bug lives in the boot/kernel/Docker layer, not provisioning.
|
||||
- Not a multi-VM mini-cluster; one VM at a time. (All six 2026-06-17 signals occurred on a
|
||||
single host that was Docker host + coordinator + mesh peer.) Multi-VM is a later extension.
|
||||
- Not a CI gate; this is an interactive, agent-driven pre-deploy check on ubongo (CI stays
|
||||
lint + Molecule per ADR-008/010).
|
||||
|
||||
## Decisions (from the 2026-06-18 brainstorm)
|
||||
|
||||
1. **Virtualisation approach: libvirt/KVM directly (Approach A).** A golden Debian-13
|
||||
genericcloud qcow2 cached locally; each run boots an ephemeral qcow2 overlay backed by
|
||||
it, seeded via cloud-init NoCloud, driven by a **stdlib-only** Python script over
|
||||
`virsh` (no `libvirt-python` dependency). Chosen over Vagrant+vagrant-libvirt (Ruby/plugin
|
||||
footprint, box drift from the real cloud image) and terraform-provider-libvirt (poor at
|
||||
the imperative apply→reboot→re-apply sequence, throwaway state, blurs ADR-006's prod-VM
|
||||
boundary). Lightest footprint on a 15 GiB control node; full control of the reboot step;
|
||||
the same Debian cloud image real hosts boot.
|
||||
|
||||
2. **Fidelity envelope: real OS/Docker/netfilter/reboot, not the Proxmox provisioning
|
||||
path.** A lightweight local hypervisor is enough because the bugs are post-boot.
|
||||
|
||||
3. **Scope: one throwaway VM at a time, instantiated from a real host's inventory.** First
|
||||
profile: **"be askari"** (Docker host + NetBird coordinator + mesh peer on one box). The
|
||||
mechanism is generic — later "be" any host by swapping which inventory host it mirrors.
|
||||
|
||||
4. **Acceptance is self-validating against the real failure.** Done = the harness, on a
|
||||
local VM, applies `base` (firewall on) to a Docker host, reboots, and **observes the
|
||||
2026-06-17 breakage** (Docker forwarding dead / services down); then, with the
|
||||
`docker_host` container-forward drop-in in place, the same run **survives the reboot**.
|
||||
If the first run (without the drop-in) passes after the reboot, the harness is not faithful.
|
||||
|
||||
5. **Tiered cert fidelity via a `--certs` knob** (DNS-01 is what makes real certs possible
|
||||
with no public inbound — validation is out-of-band via a Gandi TXT record; the VM needs
|
||||
only outbound to ACME + Gandi, which the NAT net provides):
|
||||
- `internal` (default) — Caddy `tls internal`, zero deps, instant; for the incident repro
|
||||
and runs where certs aren't under test.
|
||||
- `le-staging` — real DNS-01 ACME against Let's Encrypt **staging**: real caddy-gandi
|
||||
path, real cert files/renewal, untrusted root, effectively no rate limits. **Built in v1.**
|
||||
- `le-prod-wildcard` — a real trusted `*.test.wingu.me` wildcard, **issued once,
|
||||
persisted on ubongo, reused** across runs. Wired in v1 but **on-demand only**; its
|
||||
accepted risk is recorded when used (prod Gandi credential reaching an ephemeral VM;
|
||||
transient TXT in the real `wingu.me` zone). A deliberate "no-egress" failure scenario
|
||||
(to reproduce FRICTION #4) forces `internal`, since ACME needs egress.
|
||||
|
||||
6. **The toolchain is Ansible-managed**, not hand-installed: a new non-service role
|
||||
(`integration_test`, `control` group) installs/enables libvirt+QEMU reproducibly. The
|
||||
repo owns ubongo's state. The driver manages *images* lazily on first run (keeps the role
|
||||
lean; avoids fiddly download/refresh logic in Ansible).
|
||||
|
||||
7. **Stubs live in an overlay file, never in the real inventory** — so `make tf-inventory`
|
||||
and "don't edit inventory directly" stay intact, and every stub is explicit and reviewable.
|
||||
|
||||
8. **A new ADR-025** records this decision (approach + alternatives + cert tiers); ADR-008
|
||||
gains a pointer and redirects its "what Molecule does NOT test" gaps here.
|
||||
|
||||
## Architecture — five isolated components
|
||||
|
||||
| # | Component | Purpose | Location |
|
||||
|---|-----------|---------|----------|
|
||||
| 1 | **`integration_test` role** (non-service, `control` group) | Install/enable libvirt+QEMU+virtinst, add `sjat`/`claude` to `libvirt` group, create the image-cache dir, drop the driver. Idempotent, Molecule-tested. | `roles/integration_test/` |
|
||||
| 2 | **`integration-vm.py` driver** | Stdlib-only lifecycle over `virsh`: `up / apply / reboot / assert / cycle / reset / down / prune / console`. Lazily ensures the golden image (download + checksum). | `scripts/integration-vm.py` |
|
||||
| 3 | **Profiles + var overlays** | Make a VM "become" a host: pull that host's real group_vars/host_vars + layer a small explicit overlay (cert tier, in-VM coordinator endpoint, VM connection). | `tests/integration/overrides/<host>.yml` |
|
||||
| 4 | **Verify playbook** | Outcome-based post-reboot assertions (Docker up, published-port DNAT alive, `nft` sane, service responds, `wt0` up), reusing Molecule's `verify.yml` philosophy. | `tests/integration/verify.yml` |
|
||||
| 5 | **Makefile target** | `make test-integration HOST=<name> [CERTS=...] [KEEP=1]` → `cycle`; `make test-integration-clean` → `prune`. Documented in CLAUDE.md's command table. | `Makefile` |
|
||||
|
||||
## Lifecycle / data flow
|
||||
|
||||
`make test-integration HOST=askari` drives:
|
||||
|
||||
```
|
||||
1. ensure golden image Debian-13 genericcloud qcow2, cached + checksum-verified
|
||||
2. ephemeral overlay qcow2 backed by golden (throwaway; never mutate golden)
|
||||
3. cloud-init NoCloud seed hostname + ansible user + ubongo's SSH key + NIC
|
||||
4. virt-install --import boot on an isolated libvirt NAT net (DHCP IP + outbound NAT)
|
||||
5. wait for SSH IP via `virsh domifaddr --source lease` (guest-agent optional)
|
||||
6. transient inventory askari's real vars + ansible_host=<lease IP> + stub overlay
|
||||
7. ansible-playbook site THE REAL APPLY (base + docker_host + reverse_proxy + coordinator)
|
||||
8. [snapshot post-apply] optional reset point for fast re-runs
|
||||
9. virsh reboot ──────────┐ ← the step Molecule structurally cannot do
|
||||
10. wait for SSH ┘
|
||||
11. ansible-playbook verify outcome assertions; THIS is where the incident surfaces
|
||||
12. report + teardown pass/fail; on fail keep VM + dump diagnostics; else destroy overlay
|
||||
```
|
||||
|
||||
Steps 1–7 build a real Docker daemon with real published-port DNAT to break; step 9 is a
|
||||
real kernel reboot, so nftables loads default-deny before Docker exactly as on askari.
|
||||
|
||||
## Fidelity boundary & cert tiers
|
||||
|
||||
**Faithful where the bug lives:** real kernel, real netfilter, real Docker with
|
||||
published-port DNAT, the real role apply, a real reboot, and the coordinator running *inside
|
||||
the VM* so the VM is its own mesh peer — reproducing the circular mesh-bootstrap (FRICTION #3)
|
||||
on one box.
|
||||
|
||||
**Stubbed where it needs the public internet** (explicit, in the overlay): LE certs via the
|
||||
`--certs` knob (Decision 5); public DNS (`askari.wingu.me`) → local resolution; NetBird
|
||||
geo-DB → pre-seeded or requirement disabled (which is *also* the FRICTION #4 fix, so the
|
||||
harness can test both the FATAL-loop and its remedy).
|
||||
|
||||
## Acceptance test (self-validating)
|
||||
|
||||
1. Run the cycle on **today's** `base` (firewall on, no `docker_host` container-forward
|
||||
drop-in) → **step 11 must FAIL after reboot** (Docker forwarding dead, services down).
|
||||
2. Implement the `docker_host` container-forward rules (the pending fix STATUS.md names) →
|
||||
re-run → **step 11 must PASS across the reboot.**
|
||||
|
||||
**Scope boundary:** the *harness* is this plan's deliverable. The `docker_host`
|
||||
container-forward fix is a separate work item (FRICTION 2026-06-17 #1). v1's acceptance
|
||||
deliberately spans both, because a credible harness must demonstrate **both** a true-negative
|
||||
(red on the broken state) and a true-positive (green on the fixed state) — otherwise we have
|
||||
only ever watched the assert go red. The plan decides sequencing: build the small
|
||||
`docker_host` drop-in as the green-half of acceptance, or consume it if built separately
|
||||
first. Minimum credible v1 is the red half (faithful reproduction); full acceptance is red→green.
|
||||
|
||||
This one round-trip proves the harness reproduces the incident, the fix works, and the loop
|
||||
can be trusted for the next risky change before it touches a live host.
|
||||
|
||||
## Robustness, isolation & teardown
|
||||
|
||||
**Failure leaves evidence** (catching a bug is the point):
|
||||
|
||||
| Step fails | Behaviour |
|
||||
|---|---|
|
||||
| Golden image (1) | Fail fast, clear message; image cached (one-time cost) |
|
||||
| Boot / first SSH (4–5) | **Capture serial console to a log file**, fail with its tail — the automated equivalent of the Hetzner console (ties to TODO 10.8) |
|
||||
| Apply (7) | Keep VM, surface Ansible output, dump diagnostics |
|
||||
| **No SSH after reboot (9–10)** | The classic incident signature; FAIL, keep VM, capture console — the harness *succeeding* |
|
||||
| Assert (11) | FAIL, keep VM, dump post-mortem: `nft list ruleset`, `docker ps`, `ss -tlnp`, `journalctl -b`, `systemd-analyze critical-chain`; exit non-zero |
|
||||
|
||||
Diagnostics land in gitignored `~/integration-runs/<ts>-<host>/` (same pattern as ADR-017's
|
||||
screenshot dir; the agent reads them directly).
|
||||
|
||||
**Three safety invariants** (these make the testing tool itself safe):
|
||||
1. **The transient inventory contains only the test VM** — no real host is ever in scope;
|
||||
the apply is `--limit`ed to the VM.
|
||||
2. **"Be askari" points NetBird at the in-VM coordinator (localhost)** — the VM forms its
|
||||
own one-node mesh; it never enrolls in the real mesh.
|
||||
3. **Test VMs sit on an isolated libvirt NAT net** — outbound NAT for ACME/image pulls, but
|
||||
not reachable to the LAN (`10.20.x`) or the real mesh.
|
||||
|
||||
**Resource guard** (ubongo's 15 GiB ceiling, ADR-015/012): default VM ≈ 2 vCPU / 3 GiB / 20
|
||||
GiB thin overlay; the driver refuses to start below a free-RAM threshold and enforces **one
|
||||
integration VM at a time** (name-prefix `boma-it-*`). **Teardown:** success destroys domain +
|
||||
overlay; failure keeps them and prints how to inspect; `make test-integration-clean` reaps
|
||||
all `boma-it-*` orphans. An optional post-apply **snapshot** lets `reset` re-run
|
||||
reboot+assert without re-applying (fast iteration on a fix).
|
||||
|
||||
## Testing the tester
|
||||
|
||||
- **pytest** on the driver's pure logic: transient-inventory generation, var/overlay merge,
|
||||
`--certs`→overlay mapping, DHCP-lease parsing, resource-guard math (mock `virsh`). Joins
|
||||
boma's existing pytest suite.
|
||||
- **Molecule** (Docker) on the `integration_test` role: asserts libvirt/qemu/virtinst
|
||||
installed, `libvirtd` enabled, users in `libvirt` group, driver present. (Cannot run
|
||||
KVM-in-Docker — the documented Molecule limitation.)
|
||||
- **End-to-end self-test = the acceptance test above**, run manually on first build and
|
||||
recorded in the runbook.
|
||||
|
||||
## Governance & documentation touch-points
|
||||
|
||||
- **ADR-025 "Local VM integration testing"** — decision, approach A, rejected alternatives
|
||||
(Proxmox-nested / Vagrant / TF-libvirt), cert tiers.
|
||||
- **ADR-008** — pointer to ADR-025; redirect its "what Molecule does NOT test" gaps
|
||||
(nftables loading, mesh dataplane) to this level.
|
||||
- **ADR-015** — one-line reconciliation: "not a hypervisor" → runs *ephemeral KVM test VMs*
|
||||
as part of its local-test-runner role (still not a production hypervisor); note the
|
||||
test-VM RAM load.
|
||||
- **`docs/security/accepted-risks.md`** — the `le-prod-wildcard` risk (prod Gandi credential
|
||||
→ ephemeral VM; transient TXT in real `wingu.me`).
|
||||
- **CLAUDE.md** command table + **`docs/runbooks/integration-testing.md`** (run a cycle,
|
||||
cert knobs, where diagnostics land, inspecting a kept failed VM, pruning) + **STATUS.md**
|
||||
entry. The runbook's pre-flight line operationalises FRICTION #6 (*validate
|
||||
reboot-recovery before retiring the break-glass*).
|
||||
|
||||
## Capacity
|
||||
|
||||
One VM (~3 GiB) against ~13 GiB free is comfortable. The only future pinch is concurrency
|
||||
with the Level-4 Chromium/Playwright stack (ADR-017) — handled by the resource guard +
|
||||
"one at a time." Add a note to `docs/hardware/reference.md`; revisit at `/capacity-review`.
|
||||
|
||||
## Alternatives considered
|
||||
|
||||
- **Proxmox VE nested on ubongo** — highest fidelity incl. the provisioning step, but heavy
|
||||
(nested virt, RAM), in tension with ADR-015, and the incident bugs don't live in
|
||||
provisioning. Rejected.
|
||||
- **Vagrant + vagrant-libvirt** — mature lifecycle/snapshots, but adds the Ruby/Vagrant
|
||||
ecosystem + a fragile plugin, boxes drift from the real Debian cloud image, and the
|
||||
reboot→assert sequence still needs custom logic. Rejected.
|
||||
- **terraform-provider-libvirt** — declarative and reuses TF, but poor at the imperative
|
||||
apply→reboot→re-apply test sequence, adds throwaway state, and blurs ADR-006's
|
||||
"TF owns *production* VM existence on Proxmox" boundary. Rejected.
|
||||
|
||||
## Open questions / deferred
|
||||
|
||||
- **Multi-VM mini-staging** (inter-host mesh/dataplane) — design the driver + NAT net so a
|
||||
topology is an additive extension; out of scope for v1.
|
||||
- **Interplay with the Level-4 browser stack** — both want ubongo RAM; the resource guard is
|
||||
the v1 answer, revisit when Level 4 is built.
|
||||
- **Snapshot strategy depth** — v1 ships clone-and-destroy + an optional post-apply snapshot;
|
||||
richer snapshot trees deferred.
|
||||
|
||||
## Knowledge to verify at plan stage (ADR-014)
|
||||
|
||||
These are from memory / unverified and must be confirmed against version-matched docs before
|
||||
the plan asserts them:
|
||||
|
||||
- Exact `virt-install --import` flags and the cloud-init **NoCloud** seed format on the
|
||||
Debian-13 libvirt stack.
|
||||
- Whether the Debian-13 genericcloud image ships `qemu-guest-agent` (IP can come from the
|
||||
DHCP lease regardless — guest-agent is an optimisation, not a requirement).
|
||||
- Let's Encrypt **rate limits** (prod vs staging) — to confirm "issue the wildcard once,
|
||||
reuse" stays within limits.
|
||||
- The `caddy-dns/gandi` DNS-01 configuration and pinned version already used by
|
||||
`reverse_proxy`, and whether the Gandi LiveDNS API key can be scoped to `test.wingu.me`.
|
||||
- libvirt default vs a dedicated isolated NAT network on Debian-13 (`virsh net-*`).
|
||||
|
|
@ -1,216 +0,0 @@
|
|||
# Spec — Mesh-hardening redesign: askari SSH `wt0`-primary + permanent WAN break-glass
|
||||
|
||||
Status: Accepted (2026-06-19)
|
||||
|
||||
## Context & scope
|
||||
|
||||
The **mesh-hardening follow-on** (deferred from M5) was decomposed into three independent
|
||||
sub-projects, each with its own spec → plan → implementation cycle. Progress so far:
|
||||
|
||||
1. ~~askari SSH → `wt0`~~ — **attempted 2026-06-17, BACKED OUT** after it took askari down
|
||||
on reboot (spec/plan `docs/superpowers/{specs,plans}/2026-06-17-mesh-hardening-askari-ssh-wt0*`).
|
||||
2. ubongo nftables INPUT-only default-deny — **DONE 2026-06-19**, reboot-validated
|
||||
(`base__firewall_input_only`).
|
||||
3. NetBird ACL off Allow-All → scoped policies — not started.
|
||||
|
||||
This spec is the **redesign of (1)**. The operator sequencing decision (2026-06-19) is:
|
||||
do this redesign **first**, then a separate sub-project to reduce askari's
|
||||
single-point-of-failure (SPOF) role. **This spec covers only the redesign of (1).** The SPOF
|
||||
reduction is the named follow-on (its own later spec).
|
||||
|
||||
### Why the 2026-06-17 attempt was backed out
|
||||
|
||||
Four hazards — four of the six 2026-06-17 signals recorded in `docs/FRICTION.md`:
|
||||
|
||||
1. **`base`'s `forward policy drop` breaks Docker hosts on reboot** — nftables loaded
|
||||
default-deny before Docker, so container forwarding/NAT (WAN→Caddy, Caddy→coordinator)
|
||||
died after reboot.
|
||||
2. **`ip_nonlocal_bind` did NOT beat the sshd boot-race** — binding sshd `ListenAddress`
|
||||
to the `wt0` IP still failed at boot ("could not assign the address"); and because
|
||||
`wt0` never came up, sshd had no listener at all.
|
||||
3. **The coordinator host can't bootstrap the mesh it depends on** — askari runs the
|
||||
NetBird coordinator *and* is a mesh peer; its agent needs the local coordinator container
|
||||
healthy to bring up `wt0`. After an unclean reboot the coordinator was down → `wt0`
|
||||
never came up → with SSH `wt0`-only, the host was reachable only via the Hetzner console.
|
||||
General rule: *never make a host's only management path depend on a service that host
|
||||
itself hosts.*
|
||||
4. **The coordinator FATAL-loops on the geolocation-DB download with no egress** — a
|
||||
transient loss of container egress (here: NAT wiped by `nft flush`) crash-loops the whole
|
||||
control plane.
|
||||
|
||||
### What changed since 2026-06-17 (enablers this redesign relies on)
|
||||
|
||||
- `docker_host` **container-forward nftables drop-in** (`172ae37`) — reboot-safe Docker
|
||||
forwarding (available as a later tightening; not required by this pass).
|
||||
- **`base__firewall_input_only`** — input-only default-deny, forward chain stays
|
||||
`policy accept` (Docker-safe). **Proven on ubongo and reboot-validated 2026-06-19.**
|
||||
- The **ADR-025 integration harness** — reproduces a host's boot on a throwaway local VM,
|
||||
so reboot-safety is proven GREEN before the real host is touched.
|
||||
|
||||
## Goal / success criteria
|
||||
|
||||
- askari's host nftables firewall is **applied at last** (`base__firewall_apply: true`),
|
||||
INPUT-only default-deny — matching ubongo.
|
||||
- **Normal management is over the mesh:** `ansible_host` resolves to askari's `wt0` IP
|
||||
(`100.99.226.39`); SSH-over-`wt0` and `ansible askari -m ping` over the mesh both succeed.
|
||||
- **A permanent non-mesh break-glass survives a mesh/coordinator outage**, via two
|
||||
independent channels:
|
||||
- the **Hetzner web console** (out-of-band; always works, IP-independent); and
|
||||
- **WAN `:22` reachable only from ubongo's WAN IP (`91.226.145.80`)**, enforced at *both*
|
||||
the host nftables layer (`base__firewall_admin_addrs`) and the Hetzner Cloud Firewall.
|
||||
WAN `:22` is **deliberately NOT closed** — the coordinator-host exception (FRICTION #3).
|
||||
- **askari survives a reboot under the new firewall, unattended:** Docker forwarding/NAT
|
||||
intact, `https://test.askari.wingu.me` + `https://netbird.askari.wingu.me` serve valid
|
||||
certs, STUN `3478/udp` answers, the coordinator container is healthy (geo-DB no longer
|
||||
FATAL), `wt0` returns, SSH is reachable over both `wt0` and the WAN break-glass.
|
||||
- **No sshd `ListenAddress` change** (`base__ssh_listen_mesh_only` stays `false`) — this is
|
||||
what sidesteps the boot-race that sank the 2026-06-17 attempt.
|
||||
|
||||
## Design — mirror ubongo 2/3, with the coordinator-host exception
|
||||
|
||||
The host firewall does the SSH scoping; sshd is left listening on all interfaces. This is
|
||||
the ubongo 2/3 pattern, which is proven and reboot-validated.
|
||||
|
||||
1. **`base` firewall, INPUT-only default-deny** (`base__firewall_apply: true`,
|
||||
`base__firewall_input_only: true`): the input chain defaults to `drop`; the forward chain
|
||||
stays `policy accept` so Docker container forwarding/NAT and published-port DNAT keep
|
||||
working across a reboot. Allowed ingress:
|
||||
- `:22/tcp` via `iifname "wt0"` (the interface-name match that survives `wt0` being
|
||||
absent at boot — `base__firewall_mgmt_interface: wt0`);
|
||||
- `:22/tcp` from `91.226.145.80` (ubongo's WAN — the break-glass; via
|
||||
`base__firewall_admin_addrs`);
|
||||
- the public service surface from the catalog: `80,443/tcp` + `3478/udp` (WAN).
|
||||
2. **No sshd change.** `base__ssh_listen_mesh_only` stays `false`; sshd keeps listening on
|
||||
all interfaces. The firewall, not sshd, restricts where `:22` is reachable. There is no
|
||||
`ListenAddress`, hence no `ip_nonlocal_bind`, hence no boot-race.
|
||||
3. **The Hetzner Cloud Firewall is unchanged** — the `:22`-from-ubongo rule stays (the
|
||||
2026-06-17 attempt removed it; this redesign keeps it as the perimeter break-glass).
|
||||
4. **Coordinator geo-DB robustness** — make the `netbird_coordinator` control plane survive
|
||||
a transient egress loss (the nat-flush window on apply, and the boot window before Docker
|
||||
re-adds its NAT), so the coordinator stays healthy and `wt0` can come back. One of:
|
||||
- **pre-seed** the GeoLite2 DB into the persistent `netbird_data:/var/lib/netbird` volume
|
||||
so netbird-server finds it locally and never needs to download; or
|
||||
- **disable / make non-fatal** the geolocation requirement in `config.yaml.j2`.
|
||||
The exact v0.72.4 mechanism is verified against NetBird's pinned docs at plan time
|
||||
(ADR-014) — the design fixes the *intent* (a transient egress blip must not FATAL the
|
||||
control plane); the plan fixes the *knob*.
|
||||
|
||||
### Rejected alternatives (these are the 2026-06-17 failures)
|
||||
|
||||
- sshd `ListenAddress = wt0 IP` + `ip_nonlocal_bind` → boot-race; did not bind. **Out.**
|
||||
- `forward policy drop` on a Docker host → broke forwarding on reboot. **Out** (use
|
||||
`input_only`; the `docker_host` container-forward drop-in is a later tightening).
|
||||
- Close WAN `:22` entirely → coordinator host left console-only on a bad reboot. **Out**
|
||||
(keep WAN `:22`-from-ubongo as the always-available non-mesh path).
|
||||
|
||||
### How each 2026-06-17 failure is answered
|
||||
|
||||
| 2026-06-17 failure | Fix in this design |
|
||||
|---|---|
|
||||
| `forward drop` killed Docker on reboot | `base__firewall_input_only: true` → forward stays `accept` |
|
||||
| `ip_nonlocal_bind` sshd boot-race | no sshd `ListenAddress` change; firewall scopes `:22` by `iifname "wt0"` |
|
||||
| coordinator chicken-egg / lockout | permanent WAN `:22`-from-ubongo + Hetzner console; management never depends on a service askari hosts |
|
||||
| coordinator geo-DB FATAL-loop | pre-seed / non-fatal geo so a transient egress blip can't crash the control plane |
|
||||
|
||||
## New & changed code
|
||||
|
||||
**Inventory:**
|
||||
|
||||
- `inventories/production/group_vars/offsite_hosts/vars.yml` —
|
||||
- `base__firewall_apply: true` (was `false`);
|
||||
- `base__firewall_input_only: true` (new — forward stays `accept`, Docker-safe);
|
||||
- `base__firewall_admin_addrs: ["91.226.145.80"]` (new — ubongo's WAN, the break-glass;
|
||||
comment states what it is and why a coordinator host keeps a non-mesh path);
|
||||
- `base__ssh_listen_mesh_only: false` stays (explicit — no boot-race);
|
||||
- rewrite the header backout note → "redesigned 2026-06-19: `wt0`-primary + permanent WAN
|
||||
break-glass; see this spec."
|
||||
- `inventories/production/host_vars/askari.yml` (**new**) — `ansible_host: 100.99.226.39`
|
||||
(the `wt0` IP), so Ansible manages askari over the mesh. Overrides the TF-generated WAN
|
||||
`ansible_host` in `offsite.yml` (host_vars are not regenerated by `tf_to_inventory.py`).
|
||||
Header comment explains why.
|
||||
|
||||
**Role `netbird_coordinator`:**
|
||||
|
||||
- The geo-DB robustness change above (`templates/config.yaml.j2` and/or a pre-seed task; the
|
||||
`templates/docker-compose.yml.j2` volume already persists `/var/lib/netbird`), with
|
||||
Molecule/verify coverage that the control plane comes up without external geo egress.
|
||||
|
||||
**Firewall catalog** (`inventories/production/group_vars/all/firewall.yml`):
|
||||
|
||||
- **No change.** It already enumerates askari's public ingress (`reverse_proxy` 80/443,
|
||||
`netbird_stun` 3478/udp). `:22` is handled by the `base` firewall's built-in SSH rules
|
||||
(`mgmt_interface` `wt0` + `admin_addrs`), not the catalog.
|
||||
|
||||
**Terraform / Hetzner Cloud Firewall:**
|
||||
|
||||
- **No change.** The WAN `:22`-from-ubongo rule stays (the perimeter half of the break-glass).
|
||||
|
||||
**sshd:**
|
||||
|
||||
- **No change.**
|
||||
|
||||
## Validation
|
||||
|
||||
### Harness-first GREEN gate (ADR-025) — before any live change
|
||||
|
||||
A "be askari" integration profile (Docker host + a coordinator-like container on the shared
|
||||
network + `base__firewall_input_only` + `admin_addrs`), driven through `make
|
||||
test-integration HOST=askari` (reusing the existing profile/overlay/verify pattern):
|
||||
|
||||
- input chain default-deny with `:22` accepted via `iifname "wt0"` **and** from the
|
||||
break-glass admin address; forward chain `policy accept`;
|
||||
- published-port DNAT + NAT masquerade survive a **reboot** (the RED→GREEN reboot cycle);
|
||||
- the coordinator-like container comes up healthy with **no external geo egress**;
|
||||
- SSH path returns after reboot.
|
||||
|
||||
This must be GREEN before the live cutover.
|
||||
|
||||
### Live cutover — supervised, console open, break-glass never removed
|
||||
|
||||
Sequencing rule (FRICTION #6): validate reboot-recovery while a fallback path is still open.
|
||||
Because the WAN break-glass is *never* removed in this design, that invariant holds by
|
||||
construction.
|
||||
|
||||
1. **Pre-check:** `ssh sjat@100.99.226.39` (over `wt0`) and `ansible askari -m ping` (forced
|
||||
over `wt0`) both succeed; public services + STUN healthy.
|
||||
2. **Repoint Ansible:** add `host_vars/askari.yml` (`ansible_host` = `wt0` IP); confirm
|
||||
`ansible askari -m ping` runs over the mesh.
|
||||
3. **Apply `base` (+ the geo-DB fix):** one `make deploy PLAYBOOK=site LIMIT=askari`
|
||||
converge applies INPUT-only default-deny with the `wt0` + admin-addr SSH allow and the
|
||||
coordinator robustness change. The firewall concern's armed auto-rollback
|
||||
(`base__firewall_rollback_timeout: 45`) reverts a bad ruleset. Then a post-apply
|
||||
`restart docker` rebuilds NAT (base's `flush ruleset` wipes Docker's nat — FRICTION); the
|
||||
coordinator now survives the egress window thanks to the geo-DB fix.
|
||||
4. **Verify the new steady state:** public services serve valid certs; STUN answers; SSH
|
||||
over `wt0` works; SSH over the WAN break-glass (`91.226.145.80` → `:22`) works.
|
||||
5. **Reboot resilience (the real test):** reboot askari (Hetzner console available) and
|
||||
confirm — with no intervention — Docker forwarding/NAT, public services, the coordinator,
|
||||
`wt0`, and SSH (both paths) all return.
|
||||
|
||||
## Risks & rollback
|
||||
|
||||
- **ubongo's WAN IP anchors the break-glass.** If it is dynamic and rotates, the host
|
||||
`admin_addrs` rule and the Hetzner FW rule must be updated. The **Hetzner console** is the
|
||||
IP-independent ultimate break-glass. (Confirmed static by the operator 2026-06-19; it is
|
||||
also already the Hetzner FW assumption today.)
|
||||
- **Mid-cutover lockout:** mitigated by the staged order (a path open at each step), the
|
||||
firewall auto-rollback timer, `ansible_host` = `wt0` (so the post-apply connectivity check tests the real new path),
|
||||
and the WAN break-glass that is never removed.
|
||||
- **Reboot lockout:** mitigated by `iifname "wt0"` scoping (no sshd boot-race), the WAN
|
||||
break-glass, the geo-DB fix (coordinator survives the egress window), and harness GREEN.
|
||||
- **Default-deny breaks a public service:** mitigated by the catalog already enumerating all
|
||||
live ingress and the §Validation service checks; reversible via `base__firewall_apply:
|
||||
false`.
|
||||
- **Ultimate break-glass:** the Hetzner web console (out-of-band).
|
||||
|
||||
## Out of scope / follow-ons
|
||||
|
||||
- **SPOF reduction (the next sub-project)** — reduce askari's single-point-of-failure role
|
||||
(currently `ubongo → askari` is `Relayed` through askari's own relay; if askari is down the
|
||||
mesh data plane for relayed peers is down). Its own spec, after this.
|
||||
- **NetBird ACL off Allow-All** — until then any enrolled peer can reach askari's `wt0:22`;
|
||||
scoping that is a separate sub-project.
|
||||
- **Full forward-chain hardening** — the `docker_host` container-forward drop-in (full
|
||||
forward default-deny, reboot-safe) as a later tightening over the `input_only` baseline.
|
||||
- **Coordinator off-site backup** (FRICTION #5, ADR-022) — still pending; noted, not in scope.
|
||||
- STATUS.md / ROADMAP updates land with the implementation, not this spec.
|
||||
|
|
@ -1,203 +0,0 @@
|
|||
# Spec — Mesh-hardening (2 of 3): ubongo INPUT-only default-deny + `ssh-from-control`
|
||||
|
||||
Status: Accepted (2026-06-19)
|
||||
|
||||
## Context & scope
|
||||
|
||||
The **mesh-hardening follow-on** (deferred from M5, ROADMAP) was decomposed into three
|
||||
independent sub-projects, each its own spec → plan → implementation cycle:
|
||||
|
||||
1. askari SSH → `wt0` — spec/plan written 2026-06-17, **attempted and backed out the same day**
|
||||
(the incident; six lessons in `FRICTION.md`). Needs a redesign — **not** this spec.
|
||||
2. **ubongo nftables default-deny + `ssh-from-control`** ← *this spec*
|
||||
3. NetBird ACL off Allow-All → scoped policies (its own later spec; open mechanism question —
|
||||
no headless API path).
|
||||
|
||||
ROADMAP (re-ordered after the 2026-06-17 incident) puts **ubongo first**: it is the clean,
|
||||
low-risk case — a physical box with a permanent console break-glass, and *not* the coordinator
|
||||
host that the incident proved you must not corner.
|
||||
|
||||
This spec hardens **ubongo's inbound surface only**. It does **not** change sshd's
|
||||
`ListenAddress` (so no boot-race), does **not** apply a forward-chain default-deny (so Docker +
|
||||
the libvirt NAT keep working), and does **not** touch askari or the NetBird ACL.
|
||||
|
||||
Current state (verified on ubongo, 2026-06-19): **no host firewall** — sshd listens on
|
||||
`0.0.0.0:22`, reachable from LAN, mesh, and anything routable; only Docker's + libvirt's own
|
||||
`iptables-nft` tables exist. Interfaces: `eno1` `10.20.10.151` (LAN, = `ansible_host`), `wt0`
|
||||
`100.99.146.14` (mesh), `docker0` (one container, no published ports), `virbr-boma`
|
||||
`192.168.150.1/24` (the libvirt NAT that `make test-integration` uses), `ip_forward=1`.
|
||||
|
||||
## Goal / success criteria
|
||||
|
||||
- SSH to ubongo succeeds over **`wt0`** (road-warriors, askari), from **mamba on the LAN**
|
||||
(`10.20.10.50`), and via the **`ssh-from-control` self-path** (Ansible; source `10.20.10.151`).
|
||||
- SSH from any **other** LAN source is **dropped** (default-deny on `input`).
|
||||
- **Docker container egress and `make test-integration` (libvirt NAT) keep working** — the
|
||||
forward chain is untouched.
|
||||
- A **reboot** does not lock SSH out (no `ListenAddress`, so no bind race).
|
||||
- Break-glass is the **on-prem physical console** (permanent, non-mesh). The live apply is
|
||||
additionally gated by the firewall **auto-rollback** timer.
|
||||
|
||||
## Design
|
||||
|
||||
Apply base's nftables `firewall` concern to ubongo, with two adjustments and one deliberate
|
||||
non-change:
|
||||
|
||||
1. **INPUT-only default-deny.** The `input` chain keeps `policy drop` with the guaranteed
|
||||
management plane: `lo`, `established,related`, ICMP, SSH on `wt0`, and SSH from
|
||||
`ssh-from-control` (`10.20.10.151`). We add **two operator-workstation sources** (mamba,
|
||||
`10.20.10.50`, and a second workstation, `10.20.10.17`) via a new `base__firewall_admin_addrs` list. Everything else on `eno1` drops.
|
||||
2. **Forward chain left permissive.** base hardcodes `chain forward { … policy drop; }` for
|
||||
inter-container isolation. On ubongo that would break Docker egress **and** the libvirt NAT
|
||||
the integration harness depends on — the same class of failure that sank askari (FRICTION
|
||||
2026-06-17, signal 1). A new `base__firewall_input_only` knob renders the forward chain
|
||||
`policy accept` instead. Docker's and libvirt's own `iptables-nft` forward rules continue to
|
||||
apply (separate tables); base simply does not add a default-deny on top.
|
||||
3. **No sshd `ListenAddress` change.** sshd keeps listening on `0.0.0.0:22`; nftables does all
|
||||
inbound scoping. This deliberately avoids the `ip_nonlocal_bind` boot-race that broke askari
|
||||
(FRICTION signal 2) — there is nothing to bind before `wt0` exists.
|
||||
|
||||
Resulting `input` allow-list:
|
||||
|
||||
```
|
||||
iif "lo" accept
|
||||
ct state established,related accept
|
||||
ct state invalid drop
|
||||
iifname "wt0" tcp dport 22 accept # mesh (road-warriors, askari)
|
||||
ip saddr 10.20.10.151 tcp dport 22 accept # ssh-from-control (Ansible self) — group_vars/all
|
||||
ip saddr 10.20.10.50 tcp dport 22 accept # mamba on the LAN — base__firewall_admin_addrs
|
||||
ip saddr 10.20.10.17 tcp dport 22 accept # 2nd operator wkstn — base__firewall_admin_addrs
|
||||
ip protocol icmp accept ; ip6 nexthdr ipv6-icmp accept
|
||||
# (no catalog services on ubongo) → default drop
|
||||
chain forward: policy accept # Docker + libvirt-NAT forwarding preserved
|
||||
```
|
||||
|
||||
## Why ubongo is the safe case (maps to the 2026-06-17 incident)
|
||||
|
||||
- **Signal 1** (forward-drop breaks Docker hosts): sidestepped — INPUT-only leaves forwarding alone.
|
||||
- **Signal 2** (`ip_nonlocal_bind` boot-race): sidestepped — no `ListenAddress`; sshd binds nothing new.
|
||||
- **Signal 3** (a host's only mgmt path must not depend on a service it hosts): satisfied —
|
||||
ubongo is not the coordinator and keeps three independent paths (mesh, LAN, physical console).
|
||||
- **Signal 6** (recovery tested after the break-glass was removed): the physical console is
|
||||
permanent (nothing to retire), and reboot-recovery is proven on a throwaway VM first.
|
||||
|
||||
## New & changed code
|
||||
|
||||
**Role `base`:**
|
||||
|
||||
- `roles/base/defaults/main.yml` — add:
|
||||
- `base__firewall_input_only: false` — when true, the forward chain is `policy accept`
|
||||
(host-local input filtering only), for hosts that route/forward container or NAT traffic
|
||||
(e.g. the control node's Docker + libvirt-NAT) where a forward default-deny would break them.
|
||||
- `base__firewall_admin_addrs: []` — extra LAN source IPs allowed to SSH (besides `wt0` +
|
||||
`ssh-from-control`); for an operator workstation reaching the host over the LAN. Key-gated.
|
||||
- `roles/base/templates/nftables.conf.j2`:
|
||||
- the forward line (currently line 21) →
|
||||
`chain forward { type filter hook forward priority 0; policy {{ "accept" if base__firewall_input_only | bool else "drop" }}; }`
|
||||
- after the `ssh-from-control` block (currently lines 12-14), add a loop:
|
||||
`{% for addr in base__firewall_admin_addrs %}` →
|
||||
`ip saddr {{ addr }} tcp dport {{ base__firewall_ssh_port }} accept`
|
||||
- `roles/base/molecule/default/{converge,verify}.yml` — fixture sets `input_only: true` + an
|
||||
`admin_addrs` entry; assert (a) `forward` renders `policy accept`, (b) the admin-addr accept
|
||||
rule renders, (c) existing input default-deny + `wt0` + control-addr assertions stay green.
|
||||
|
||||
**Inventory** (`inventories/production/group_vars/control/vars.yml`, append):
|
||||
|
||||
```yaml
|
||||
# Mesh-hardening 2/3 (2026-06-19, ADR-020/021): apply base's host firewall to ubongo as
|
||||
# INPUT-only default-deny — harden the inbound surface, leave the forward chain permissive so
|
||||
# Docker egress + the libvirt-NAT integration harness keep working. sshd is unchanged
|
||||
# (nftables scopes inbound), so there is no boot-race. Reach ubongo over wt0, the
|
||||
# ssh-from-control self-path (base__firewall_control_addr in group_vars/all), or mamba on the
|
||||
# LAN. Break-glass: the physical console.
|
||||
base__firewall_input_only: true
|
||||
base__firewall_admin_addrs:
|
||||
- "10.20.10.50" # mamba over the LAN (NetBird off). Raw DHCP lease — see note below.
|
||||
- "10.20.10.17" # a 2nd operator workstation (MAC bc:0f:f3:c8:4a:8a). Raw lease — ditto.
|
||||
# base__firewall_apply defaults true; base__firewall_control_addr (= ubongo's own 10.20.10.151)
|
||||
# is set in group_vars/all and covers Ansible's self-connection.
|
||||
```
|
||||
|
||||
**Integration harness** (ADR-025) — a "be ubongo" profile, mirroring "be askari":
|
||||
|
||||
- `tests/integration/overrides/ubongo.yml` — `firewall_apply: true`, `input_only: true`,
|
||||
`admin_addrs: ["192.168.150.99"]` (a representative LAN addr to exercise the rule),
|
||||
`firewall_control_addr: "192.168.150.1"` (the libvirt-NAT gateway = the harness's own SSH
|
||||
path, so the apply + reboot don't lock it out), `ssh_listen_mesh_only: false`,
|
||||
`mesh_enabled: false`.
|
||||
- `tests/integration/profiles/ubongo.json` — mirror `profiles/askari.json` (VM resources/image).
|
||||
- `tests/integration/verify.yml` — make the assertions **profile-aware** (gated on the active
|
||||
profile, since `verify.yml` is shared): for ubongo assert `input` policy drop, `forward`
|
||||
policy **accept**, and the admin-addr rule present. Reachability across the reboot is the
|
||||
harness's existing cycle. The askari assertions (Docker/forward-DNAT) must **not** run for the
|
||||
ubongo profile, nor vice-versa.
|
||||
|
||||
Enables `make test-integration HOST=ubongo`.
|
||||
|
||||
## The admin-addrs — deliberately interim values
|
||||
|
||||
`base__firewall_admin_addrs: ["10.20.10.50", "10.20.10.17"]` are the operator workstations'
|
||||
**current raw DHCP leases** (mamba + a second box), not reservations (operator decision,
|
||||
2026-06-19). Both share the operator's `sjat` SSH key. Caveats, accepted for now:
|
||||
|
||||
- **Lease drift:** if DHCP reassigns either IP, the rule allows whatever host then holds it
|
||||
(still SSH-key-gated, so low risk) and that workstation loses its *LAN* path. **Backstop:**
|
||||
the workstations also reach ubongo over `wt0` (mesh), so they are never cut off — only the
|
||||
off-mesh LAN convenience lapses until the IP is corrected.
|
||||
- **Revisit trigger (flagged for follow-up):** when OPNsense-as-code lands (ADR-020 perimeter /
|
||||
TODO 3.5), replace both raw leases with **MAC-pinned DHCP reservations** (`10.20.10.17` =
|
||||
MAC `bc:0f:f3:c8:4a:8a`) and allow the reserved addresses. Recorded as a `FRICTION.md` open
|
||||
signal so the next `/kaizen` surfaces it.
|
||||
|
||||
## Testing
|
||||
|
||||
- **Molecule** (base `default`, render-only, `firewall_apply: false`): the new forward-accept +
|
||||
admin-addr assertions above, with existing assertions green.
|
||||
- **Integration harness** (`make test-integration HOST=ubongo`): on a throwaway UEFI VM, apply
|
||||
the ubongo overlay, assert the ruleset shape, and prove **SSH survives a reboot** from an
|
||||
allowed source (the existing assert/cycle). This is the gate before touching the real control
|
||||
node.
|
||||
- **Live** (during cutover): SSH over `wt0` ✓, from mamba LAN ✓, Ansible self-ping ✓; SSH from a
|
||||
disallowed LAN host dropped ✓; `docker run … ` egress ✓; a fresh `make test-integration`
|
||||
still spins a VM (libvirt NAT intact) ✓.
|
||||
|
||||
## Staged cutover (operator-supervised — lockout-aware, FRICTION signal-6 order)
|
||||
|
||||
ubongo is managed as `sjat` (password sudo), so the live apply needs the operator present
|
||||
anyway. The physical console is open throughout.
|
||||
|
||||
1. **Harness GREEN:** `make test-integration HOST=ubongo` passes (incl. the reboot).
|
||||
2. **Pre-check the real paths** *before* applying: SSH over `wt0`, SSH from mamba
|
||||
(`10.20.10.50`), `ansible ubongo -m ping`. Confirm the physical console is reachable.
|
||||
3. **Dry-run:** `make check PLAYBOOK=site LIMIT=ubongo TAGS=firewall` — review the nftables diff
|
||||
(input default-deny + `wt0` + `10.20.10.151` + `10.20.10.50` + `10.20.10.17`; forward `policy accept`).
|
||||
4. **Apply (auto-rollback armed):** `make deploy PLAYBOOK=site LIMIT=ubongo TAGS=firewall` — the
|
||||
firewall concern snapshots, arms the 45 s revert, applies, `reset_connection` →
|
||||
`wait_for_connection` over the live path (`10.20.10.151`), then cancels the timer. A bad
|
||||
ruleset reverts itself; the console is the ultimate fallback.
|
||||
5. **Verify** every path + Docker egress + a fresh integration-VM spin (above).
|
||||
6. **Reboot ubongo; confirm SSH returns on all paths unaided** (console present). Only now is it
|
||||
done — recovery is proven *while the break-glass is still there*.
|
||||
7. **Docs:** update `STATUS.md` (ubongo row: input-only default-deny applied) and `ROADMAP.md`
|
||||
(mesh-hardening 2/3 done; next is sub-project 1 askari redesign or 3 NetBird ACL).
|
||||
|
||||
## Risks & rollback
|
||||
|
||||
- **Self-referential apply** (ubongo runs Ansible against itself): mitigated by the auto-rollback
|
||||
timer, the `wait_for_connection` over the real path, three redundant allowed sources, and the
|
||||
permanent physical console. ubongo cannot be bricked.
|
||||
- **Raw-lease fragility:** documented above; backstopped by the mesh path; revisit with OPNsense.
|
||||
- **No new container isolation** (forward stays accept): accepted — ubongo is a single-tenant
|
||||
control node, not a service host; Docker/libvirt keep their own forward rules. The forward
|
||||
default-deny remains the norm for real service hosts (`base__firewall_input_only: false`).
|
||||
|
||||
## Out of scope / follow-ons
|
||||
|
||||
- askari SSH → `wt0` redesign (sub-project 1) — needs the boot-race + coordinator-bootstrap
|
||||
resolved; folds in the coordinator-robustness (geo-DB FATAL-loop) + off-site backup lessons.
|
||||
- NetBird ACL off Allow-All (sub-project 3) — open mechanism question (no headless API path).
|
||||
- OPNsense DHCP reservations for the admin workstations (`10.20.10.50` mamba, `10.20.10.17`)
|
||||
and ubongo — replace the raw leases with MAC-pinned reservations; flagged in `FRICTION.md`,
|
||||
to be done alongside OPNsense-as-code.
|
||||
- Forward-chain container isolation on ubongo — deliberately not done here.
|
||||
- `STATUS.md` / `ROADMAP.md` edits land with the implementation, not this spec.
|
||||
|
|
@ -1,163 +0,0 @@
|
|||
# Spec — Mesh-hardening (SPOF): accept the single-coordinator SPOF + targeted resilience
|
||||
|
||||
Status: Accepted (2026-06-20)
|
||||
|
||||
## Context & scope
|
||||
|
||||
The **mesh-hardening follow-on** was decomposed into independent sub-projects (ROADMAP). Progress:
|
||||
|
||||
1. ~~ubongo nftables INPUT-only default-deny~~ — **DONE 2026-06-19**.
|
||||
2. ~~askari SSH → `wt0` redesign~~ — **DONE 2026-06-20** (live reboot-validated).
|
||||
3. **askari relay-SPOF reduction** ← *this spec*.
|
||||
4. NetBird ACL off Allow-All — not started.
|
||||
|
||||
`askari` runs boma's **single** self-hosted NetBird coordinator (management + signal + relay +
|
||||
STUN, one combined container) **and** is a mesh peer (ADR-016). Because `ubongo`'s INPUT-only
|
||||
default-deny drops the inbound UDP that ICE hole-punching needs, `ubongo`'s peers are always
|
||||
**`Relayed`** through askari's own relay (intentional posture — `docs/runbooks/netbird-client.md`,
|
||||
the `ubongo-relay-only` finding). So askari is a single point of failure for **relayed mesh
|
||||
traffic**.
|
||||
|
||||
### The decisive finding — the blast radius is narrow
|
||||
|
||||
The mesh (`wt0`) is **not** a default gateway. Verified on ubongo (2026-06-20):
|
||||
|
||||
```
|
||||
wt0 routes ONLY 100.99.0.0/16 · default route via 10.20.10.1 dev eno1 · Networks: - (no subnet-routes/exit-node)
|
||||
```
|
||||
|
||||
So an askari outage affects **only** traffic addressed to a peer's `100.99.x.x` mesh IP over the
|
||||
relay:
|
||||
|
||||
| Traffic | askari down |
|
||||
|---|---|
|
||||
| LAN device → LAN service (direct or via reverse proxy) | unaffected |
|
||||
| node ↔ node over LAN IPs (future cluster) | unaffected |
|
||||
| node ↔ node same-LAN over mesh IPs | unaffected (direct P2P, local ICE candidate) |
|
||||
| **road-warrior → ubongo (remote, relayed)** | **breaks** |
|
||||
| mesh control plane (new enrol / ACL change / re-handshake) | pauses |
|
||||
|
||||
Nothing on the LAN and no future intra-cluster traffic depends on askari. The only loss is
|
||||
**remote (off-LAN) mesh access to peers** — and only when off-LAN *and* askari is down at once.
|
||||
|
||||
### Why we are not "fixing" the SPOF with new infrastructure
|
||||
|
||||
- **A second coordinator** is not supported by self-hosted NetBird (single management/signal) and
|
||||
contradicts ADR-016's deliberate single off-site coordinator.
|
||||
- **Direct P2P** only helps already-established sessions (re-handshakes still need askari's
|
||||
signal), and enabling it punctures `ubongo`'s deliberate default-deny (a firewall-catalog UDP
|
||||
entry + an `accepted-risks` deviation + OPNsense NAT) — cost out of proportion to a narrow,
|
||||
rare failure.
|
||||
- **A second relay** needs another publicly-reachable host; a relay at home reintroduces the
|
||||
public home surface ADR-016's off-site coordinator exists to avoid.
|
||||
|
||||
Given a reliable always-on VPS and boma's 2–5-host scale, the sound engineering choice is to
|
||||
**accept the SPOF as a conscious, documented trade-off** and harden only the two spots real
|
||||
incidents point to.
|
||||
|
||||
## Goal / success criteria
|
||||
|
||||
- The single-coordinator SPOF is **explicitly accepted and documented** (register entry + an
|
||||
ADR-016 availability analysis + recovery), so the trade-off is revisitable, not forgotten.
|
||||
- **Managed mesh hosts survive a local-DNS hiccup:** `ubongo` (and future managed mesh hosts)
|
||||
resolve the coordinator FQDN even when their resolver dies on a transition, mirroring the
|
||||
client-side fix already in the runbook.
|
||||
- **No new infrastructure** — no P2P, no second relay, no second coordinator, no Terraform.
|
||||
- The coordinator **off-site backup gap** is named in the accepted risk and explicitly handed to
|
||||
the next sub-project (ADR-022), not built here.
|
||||
|
||||
## Design
|
||||
|
||||
### (a) Accepted-risk `R8` — `docs/security/accepted-risks.md`
|
||||
|
||||
Add one row to the register (owned by ADR-002):
|
||||
|
||||
- **Risk:** *Single off-site mesh coordinator is an availability SPOF for remote mesh access* —
|
||||
askari hosts the only management/signal/relay (ADR-016); a relayed peer (all of ubongo's) loses
|
||||
remote mesh reachability while askari is down, and the control plane pauses. The
|
||||
`netbird_coordinator` store has **no off-site backup yet** (BACKUP.md), so an askari loss also
|
||||
loses mesh control-plane state until rebuilt.
|
||||
- **Rationale:** inherent to ADR-016's deliberate single off-site coordinator (sovereignty,
|
||||
survives a homelab outage); **narrow blast radius** (above table — LAN/intra-cluster/local
|
||||
unaffected); askari is a reliable always-on VPS; mitigations exist (client + managed-host DNS
|
||||
pin; documented rebuild).
|
||||
- **Revisit trigger:** askari proves unreliable; the cluster grows to depend on the mesh for
|
||||
intra-node traffic; remote mesh access becomes business-critical; or the ADR-022 backup role
|
||||
lands (closes the state-loss half).
|
||||
|
||||
R8 is the **availability** complement to R3 (which covers askari as a *security* target).
|
||||
|
||||
### (b) ADR-016 amendment — an "Availability — an askari outage" subsection
|
||||
|
||||
A short subsection capturing: the blast-radius table; that the SPOF is an accepted property
|
||||
(→ R8); and the **recovery procedure** — rebuild the coordinator (`/setup` + re-enrol peers, M5)
|
||||
or restore from backup once ADR-022 lands; client/road-warrior break-glass already in
|
||||
`docs/runbooks/netbird-client.md`; on-LAN access to ubongo never depends on the mesh (ADR-016
|
||||
recovery model). Recorded as a dated amendment; ADR-016's status stays Accepted.
|
||||
|
||||
### (c) DNS-resilience — pin the coordinator FQDN on managed mesh hosts (`base` `mesh` concern)
|
||||
|
||||
The 2026-06-18 outage was a client failing to resolve `netbird.askari.wingu.me` on a network
|
||||
transition; the client fix (public resolvers + an `/etc/hosts` pin to askari's stable WAN IP) is
|
||||
already in the runbook. The gap: **managed** mesh hosts have no equivalent. Add to `base`'s `mesh`
|
||||
concern (`roles/base/tasks/mesh.yml`):
|
||||
|
||||
- New default `base__mesh_coordinator_pin: ""` (empty → no pin; opt-in).
|
||||
- When set (and `base__mesh_enabled`), render an `/etc/hosts` entry mapping the coordinator FQDN
|
||||
— derived from `base__mesh_management_url` via the `urlsplit('hostname')` filter, **not** a
|
||||
duplicated literal — to `base__mesh_coordinator_pin`, idempotently (a marker-scoped
|
||||
`blockinfile`/`lineinfile`).
|
||||
- Set `base__mesh_coordinator_pin` to askari's static WAN IP for managed mesh hosts that depend
|
||||
on the coordinator (ubongo via the `control` group_vars; future cluster groups as they appear).
|
||||
The **coordinator host itself (askari) is exempt** (it would point its own FQDN at its own WAN
|
||||
IP — which needs NAT hairpin — and, as a server, it already has stable DNS); the plan confirms the exact group_vars
|
||||
placement and the askari exemption.
|
||||
|
||||
The pin is safe because askari's WAN IP is static (operator-confirmed); rendering it from a single
|
||||
inventory variable keeps it maintainable if it ever changes.
|
||||
|
||||
## New & changed code/docs
|
||||
|
||||
- `docs/security/accepted-risks.md` — add row **R8**; bump the "Last reviewed" date.
|
||||
- `docs/decisions/016-mesh-vpn.md` — add the dated "Availability — an askari outage" amendment
|
||||
subsection (blast-radius table + recovery + R8 cross-ref).
|
||||
- `roles/base/defaults/main.yml` — add `base__mesh_coordinator_pin: ""` with a comment.
|
||||
- `roles/base/tasks/mesh.yml` — add the `/etc/hosts` coordinator-pin task (gated on
|
||||
`base__mesh_enabled` + a non-empty pin; FQDN from `urlsplit`).
|
||||
- `inventories/production/group_vars/control/vars.yml` — set `base__mesh_coordinator_pin` to
|
||||
askari's WAN IP for ubongo.
|
||||
- `roles/base/molecule/default/{converge,verify}.yml` — assert that with the pin set + a fixture
|
||||
FQDN the `/etc/hosts` entry renders, and that an empty pin renders nothing (no-op).
|
||||
- `STATUS.md` / `docs/ROADMAP.md` — mark sub-project 3 done; surface ADR-022 (coordinator backup)
|
||||
as the next item. (Land with the implementation, not this spec.)
|
||||
|
||||
## Testing
|
||||
|
||||
- **Molecule** (`base` default scenario): (1) `base__mesh_coordinator_pin: ""` → no `/etc/hosts`
|
||||
coordinator line (default no-op); (2) pin set + a fixture `base__mesh_management_url` → exactly
|
||||
one idempotent `<ip> <fqdn>` line, FQDN correctly extracted by `urlsplit`. Existing
|
||||
firewall/hardening/mesh assertions stay green.
|
||||
- **No live deploy required for acceptance** — the pin is additive and idempotent; it lands on
|
||||
ubongo on the next routine `base` apply. (Optional spot-check: `getent hosts
|
||||
netbird.askari.wingu.me` on ubongo resolves to the pinned IP.)
|
||||
|
||||
## Risks & rollback
|
||||
|
||||
- **Stale pin if askari's WAN IP changes** — mitigated by rendering from one inventory variable
|
||||
(single edit) and askari's IP being static; the pin is removable by clearing the knob + a
|
||||
re-apply.
|
||||
- **Over-pinning the coordinator host** — askari is explicitly exempt (hairpin/DNS): the pin is set
|
||||
only at group_vars scope (the `control` group), so askari (`offsite_hosts`) never receives it.
|
||||
- **Accepting the SPOF** is itself the residual risk — bounded by the narrow blast radius, the
|
||||
documented recovery, and R8's revisit triggers.
|
||||
|
||||
## Out of scope / follow-ons
|
||||
|
||||
- **Coordinator off-site backup → ADR-022 kickoff (the next sub-project).** Named in R8 and
|
||||
`BACKUP.md` as the open gap; building it means ADR-022's pull-node (`fisi`) + restic design, not
|
||||
throwaway plumbing here.
|
||||
- **Direct P2P / NAT-traversal** — deferred posture change (default-deny puncture + OPNsense NAT +
|
||||
governance); explicitly not pursued here.
|
||||
- **A second relay / second coordinator** — ruled out above (infra cost / not supported / against
|
||||
ADR-016).
|
||||
- **NetBird ACL off Allow-All** — separate sub-project (4).
|
||||
|
|
@ -70,21 +70,3 @@ testing surprise is worth remembering past the session that hit it.
|
|||
plus review. Only a real (or `--check`) call against the API surfaces them.
|
||||
- → Treat a **check-mode run against the real API as a required gate** for such roles, or
|
||||
build a render-only assertion that materializes and inspects the rendered module args.
|
||||
|
||||
## Single-file bind mount + atomic rewrite = stale config (reload-in-place only)
|
||||
|
||||
- **`ansible.builtin.template` writes atomically** (temp file + rename → a *new inode*). A
|
||||
Docker **single-file** bind mount pins the *old* inode, so a container that reloads
|
||||
config **in place** (no restart) keeps reading the stale file. Live hit: `reverse_proxy`
|
||||
bind-mounted the Caddyfile as a single file; `caddy reload` (in-container) re-read the
|
||||
old inode and silently no-op'd (`"config is unchanged"`). The new NetBird route never
|
||||
loaded → Caddy never requested its cert → surfaced only as a downstream TLS handshake
|
||||
failure.
|
||||
- **Fix for reload-in-place roles: bind-mount the config *directory*, not the file**
|
||||
(`./caddy` → `/etc/caddy`). Directory mounts reflect the inode swap, so the reload sees
|
||||
the new file (proven on askari).
|
||||
- **Restart-based roles are fine with a single-file mount.** Sibling case: `netbird`
|
||||
single-file-mounts `config.yaml`, but its handler does `docker compose restart` (not an
|
||||
in-container reload), and a **restart re-resolves the bind mount** (verified: route
|
||||
count 0 before, 1 after). Rule of thumb: **reload-in-place needs a directory mount;
|
||||
restart-based roles don't.**
|
||||
|
|
|
|||
|
|
@ -2,27 +2,14 @@
|
|||
# Shared firewall topology — single source of truth for the host nftables layer
|
||||
# (base role) and OPNsense (future). See docs/decisions/020-firewall.md.
|
||||
|
||||
# Zone → subnet (from ADR-007). `public` = the WAN (anywhere) for deliberately public
|
||||
# off-site services (askari); home/cluster services use the internal zones only.
|
||||
# Zone → subnet (from ADR-007).
|
||||
firewall_zones:
|
||||
mgmt: 10.10.0.0/24
|
||||
srv: 10.20.0.0/24
|
||||
lan: 10.30.0.0/24
|
||||
iot: 10.40.0.0/24
|
||||
guest: 10.50.0.0/24
|
||||
public: 0.0.0.0/0
|
||||
|
||||
# Service catalog: <name> → placement (host | group | hosts) + ingress[].
|
||||
# askari's public surface (ADR-024 Caddy + ADR-016 NetBird STUN). NOTE: the host
|
||||
# nftables template renders IPv4 source rules only; askari is reached via its A record
|
||||
# (no AAAA), so IPv4-only public rules are sufficient (see the spec's IPv6 note).
|
||||
firewall_catalog:
|
||||
reverse_proxy:
|
||||
host: askari
|
||||
ingress:
|
||||
- { from: public, port: 80, proto: tcp }
|
||||
- { from: public, port: 443, proto: tcp }
|
||||
netbird_stun:
|
||||
host: askari
|
||||
ingress:
|
||||
- { from: public, port: 3478, proto: udp }
|
||||
# Empty until services are built; hosts still get default-deny + the management plane.
|
||||
firewall_catalog: {}
|
||||
|
|
|
|||
|
|
@ -1,108 +1,86 @@
|
|||
$ANSIBLE_VAULT;1.1;AES256
|
||||
33393537643265363864656666343435633766306366316363663337363630636231646436656530
|
||||
3032316362373533636163366562396563613735663335370a373239666261633263353963643632
|
||||
30396263343765396435376539323833623933353563333363383337366535616365393730643239
|
||||
3034313633323963630a376334343134306138636234613438373866633730373737623863396463
|
||||
32646538636261363363636439626131643865306130623164656366663739333464393564663836
|
||||
35313431383834383133386335376661346465613465346537353863363836663936393035646366
|
||||
36393833326437363034646532313263383931316432316132396633333330623035636162626230
|
||||
31643232306664386364666332396439633934303434636633353262396535396161303361643730
|
||||
36636230323834393435376263326537326262396366633130623530303637333032613838373938
|
||||
33623333383539613763646136663466643536653734386263346661653838613034356631353733
|
||||
39393632616236626566613364356364323434313737383530333364323333383036613039323865
|
||||
33396561626564353063623238656663386331323832613832323837346136346330613337393862
|
||||
32356537623934363232373034373838643961343131336263663339643264613366383466613366
|
||||
30303764626437313065316636633938323035303332356262646661636139653630633565636538
|
||||
33613861663836333664623433636134663538663065323964383036616430336631636433646562
|
||||
36393835363838303463356565386365623464326631363339363164396338366531386161646633
|
||||
64663333633437353335336530306537353038356664623231666362633861376262613564643262
|
||||
62393061353865333839386232626361663165623038306366363033383333306139316633343266
|
||||
36356361613438663332653638376262346363613661623633316231316661353166366663616664
|
||||
36323461653034666131386166333335393438376631326338386635623762663666316461643935
|
||||
31326638303766626437393634666531303766326539303939343433393066623933623532636166
|
||||
30636463383237393366306630323739333161373666643962363235613133316361383437643162
|
||||
33623764383762373539373130333563383636386563373330613633373065333235376166373464
|
||||
66653635343665656366383439333433366364663734396239326635653839386662323563663465
|
||||
66633235303738303464383139323163303562643765623166316536363835653362633863646261
|
||||
30393833316135656462326438633432363965356134396531383465333834346436363235336639
|
||||
62663566646632383333613036666431326362346464666530383439373132346531316464613533
|
||||
30663062373066623961316237623933663862613433636461373931643866306564313863613334
|
||||
37353935343637383133316263663661363463383335636463643932323534393861326635613136
|
||||
66326664653234636465353539616432386435633838373436333633366336623233363732363262
|
||||
66666231643661333161613733643234383331616162386136346538373439613430326437333966
|
||||
33623739626263616235373438303333666237336537626639316561306438373534653161643533
|
||||
37303533653238346565396562376266666265646666623661393039383961376466396337656636
|
||||
38343730663837653638653239333334333735666431633639353234326264656462633164346566
|
||||
61316331623964393763616630633861326236333862653565373931303264316462303932633166
|
||||
37363735336266316431303464386232353430636566303637393530663435363536323364346236
|
||||
38376631396465303937656562386166306165316432653133623534336338636233383763666234
|
||||
36626431316534353462356131316161316162313439326266376438316134653433656335643632
|
||||
39653434366464613066666161626334643634353337376166323130353564313961626265373337
|
||||
36636163636538323134353431336166373266333934366462373662323762643061336335646264
|
||||
38343765343237386665623563383733633264316630326433666663373739373666623030366534
|
||||
30363366386563326430333465383362633630646633393466623231333366653837643262336134
|
||||
34396163326335666534366334643539623439336133626232353565306562636564656565646164
|
||||
34323136663164306466666430613039306231336134323165363736666262356639396438306537
|
||||
35633439636435313833626432643832636566366633653161326534303234316632393166396533
|
||||
66613334323533373234393731383034323039656462333833646339353530636466303437643136
|
||||
64383465353136386435626539353032363632303432633830343365396634336534383761353131
|
||||
39343438633837316336633934373132306136393635373933623939623863663465316164313966
|
||||
39323365373438343533623365653761323034633661373339616239346465643639306230636139
|
||||
35313330363838366436666436313864346232396339613362333866646531363162303238373936
|
||||
62353536613763313432636662353362313232373261313865636366373366306137373339333439
|
||||
31376333666538303733353962323239323536643034663662323330373165326433616431383163
|
||||
61616130653939313535396438346162313038616134323837336634366433303866656361376165
|
||||
31393331343738373133313764656539626139643630303730343439613137623930356362333634
|
||||
38663334396335396166383761663866613565643130366135623634343838623739333365653364
|
||||
32396666306166643130353163323036663831613436376562383865306538653763336332353632
|
||||
39346138353463316662376363333835386166393836666462323161376633336635356664376133
|
||||
38326639323932373635653139613165616432336136363866383764393930373732633533306433
|
||||
63303834386131626366633465393235613432363337386139656561333464303637353539653935
|
||||
36616538376663383236386561383339616332306137623731626537343765643637343232303230
|
||||
35393438636361353965353166633833316162376463376338353830386131666238626138666165
|
||||
37646438663561343831643431303434333138666664373634363038653964363335646165343163
|
||||
62613938636663613063383338326437333739386137316535366235366261383162656663636130
|
||||
35613938333763333633636565306239356161383731643864373830646438303137306465376231
|
||||
35356334303233343634653936323966653961616630633061643765373430386362376437656535
|
||||
39306630393466343232663632656133356531663935643137353439333261316632653762323232
|
||||
36323964636534326561626133323530643639386563623435656535386562633635633339343938
|
||||
35343536646565353936326362623930313739386163383765326330316335636139326665653339
|
||||
37333030383438363231333663616565303434303334643234353239313837656563363861656661
|
||||
36313166666566393737636231373634363132623066376437323532353861336338373462323539
|
||||
35306135363835653733356634646332346461643236613263376664343537333531313561333035
|
||||
34393938643561613231666434386331393966353730343634343437353566343263653038316430
|
||||
62336333373336633164626132346534616139333830336535666135613833623734353563353732
|
||||
31326139386336346332363565303333353135663732613765313034356433363932346263386164
|
||||
33343636333039346339356261623037316334623236653736386362323536386134633665383237
|
||||
39393665646231313734393963336634393563366134373233663036363830363265656663646361
|
||||
64353063653362383435623931343133623434356139363430613935346363386139373134306164
|
||||
37343931363931613834316665343662393533383730663364396338623933663766396130646566
|
||||
30626339616537373337303338613931303938323032666634666337626361376130396631376236
|
||||
35373766366637313661616335383739616636373166366332336161316466323731383836643263
|
||||
30623564353934636561323734666663623363666365323734633030643664643232633638636437
|
||||
63373664383863333032383739386238353239666162656436646439356239336266393966366434
|
||||
38613437353931633138343865313831303264653732313764336564623065613339326239356232
|
||||
32376536616635346536633361663463663231636566333062636261653761383664646639646335
|
||||
31656236343930386135346266353533393035646265383437313763653530666136653433353964
|
||||
30326434323038643565356239646533323134356361656365656339383635303065633537656532
|
||||
64633663653138653439623238636532373265386362643238646433616531343962343762623238
|
||||
30663966666434643361313835373835633064376536636436636465383763356663313862393138
|
||||
62346431663864316335386433396535386137366462666334623837666233626661333565613766
|
||||
35656264383936326638613431646236643131396337626231326565653233393061643530333830
|
||||
37396130303862613034393332623665376464353831366562353865373065336366393939623036
|
||||
30633637336564326466326562653966633265343062616536363738363239626637373730643839
|
||||
30336238363535373664643463353035313735633635666562653063386139366464626432633931
|
||||
33393436393435386637333135356630373464646634346364326164303038393664313864623633
|
||||
62383733366430373535633531356162666164653030326232336137633630346237386230323166
|
||||
37346365373632636639363833366461663265313235633663616432643835646133626365616531
|
||||
31646531643134633531353039343832643336373735343264653437373662633465613861646630
|
||||
34323131306236343566343736326264663339363537346539353434303866343036303761656566
|
||||
33386438343539656535306330346433643636343063336433323061313762613839633665363063
|
||||
66363233343337626631323038363336636335333965353636356436373031356262343734386565
|
||||
62396436303238373837373334663130396631373034356462393931653935633161356633383131
|
||||
37376130333232383235633765366636653330376663343566343833323861313236623333653834
|
||||
61363261326266353935333738626530396433306331326339623533393738663437343131656462
|
||||
61396533636334613363363161646366326631643138313161393438303261336537383733343630
|
||||
35383739353136613162326630383961623463626561313033613664643931366435326635383838
|
||||
30333066396132396238633837316636373062316264336530326133623465346264356530363537
|
||||
643734623039346364383038363937353764
|
||||
32313030663934353361336234373562303537356334346238663836373238366136356331363761
|
||||
6337323031666565663430303562646565303533653531640a636662373939363632383838613431
|
||||
38313365626365373539653266326661393765333737386161666165666534636562353165386537
|
||||
3934633033383966360a323965333139643764326236396635383863353437313966326665373537
|
||||
65396564393130303030643861663964383436396561643666623837306366346333306430306238
|
||||
66656136626566626262373037623531623633313664376166376161363336353930636538323339
|
||||
38386564333432353363353663643539343765373662643836646666626339353539323033386230
|
||||
31613165373035363533383862366638353035653836303737656534623361313064616365643131
|
||||
64386165653835366137353339396364313661656333333635616338346561363765353934343162
|
||||
64346462656566376539643030656461363161393936623332373632653731303031393437316636
|
||||
36626165306161336262356161666531323336343663643661626365396437383230613636356530
|
||||
62326363383138643162316464396666623332366434336462363531363836313833366237396464
|
||||
38323635353238653432626361383434646538326531356333393337643066373262663462656466
|
||||
65373036653265616137666533373930333239303732623832353337343434636434616562336135
|
||||
38666137353266353130303235616362323633353735373163336138633838633738393637633964
|
||||
66623866353265316336336566663034306664656365643832616232313732626464316563636335
|
||||
63653930626565636630326661626561366539303964373933653437356537343361626438313439
|
||||
35643165636662643463616337323063343633306536346538623331333365366533653634343538
|
||||
63623261636366303261373338633939363338316463303065613436396163616537666265623439
|
||||
31383361646531633863623230616635646138653630383537366335633030343530383735616435
|
||||
35656464393432313563303030626133383761303763653530653837313930303034353136353237
|
||||
37376366623836646236363062633938666135326631376235323061666465373865396235643937
|
||||
32633736656539356332336237646137303534343337353139383637623165353338623566666535
|
||||
30643134303235633362383064376234366235363262396362613731373364306362303634613138
|
||||
39366230366262363237656631646361356464393266656166386337303663313136666261633836
|
||||
32306132323239343539396232316564326361626462366561313561393635393233653633646431
|
||||
39313039313139616262396334613035333633326135346365333537373138396535633137353832
|
||||
63636335613237623234646234653435616635356637343964656463383864366534363438343938
|
||||
39626364653832373062323434316134653831336534383934346231656533643435306465393065
|
||||
31653731653438646361363732303664626438663533393837356562376633643933376132616236
|
||||
65393432633831313433323930383736316630626230373963653536396637363436643136363962
|
||||
37326534343237363961326438376137663034356532376433376461363337333562646136616462
|
||||
61636131376264393236376532356539376536643632623864656331656630353362623133303830
|
||||
34633461633539643262353263376363613566343261373930623139626364653232363538353330
|
||||
33633634363232653439656236303262373265613762373165646131383537623438383835383962
|
||||
33383931626136313036366562363732396561633631643561646536653665333733383261363833
|
||||
66356461663965373234393237323037356331333339643931313936313234323432613563306630
|
||||
33306638663839363565636661653830316265393639313065313062666534303039326465373636
|
||||
64363033323837313030353132383562343337326366626635663439396231393537313932643337
|
||||
30663031323231313938366436343735326165326433656633336465316630383961626664303536
|
||||
38633964326431643362626631656131303539613033323039393630353766386339346363663362
|
||||
33323034396136356362313163376438393739373738366363623636623634316537313461373066
|
||||
38613062656231363532663133333438663535666566356336316266383763623765346237663838
|
||||
64336435353437373264346561363265643339306532383539306363653564356362313430333066
|
||||
65633733633938343830303537383231303036326132376263363531626565633664343038356661
|
||||
31336139663061656437633138373438663966616338343565396562306638346437353730643664
|
||||
30373133373863626137313062643062393035653463653231653465333166633063353137633538
|
||||
62383331303164343236343539396461623738396234653333356632313664616263623061363563
|
||||
34323165306533323362376161346364316135333535626261353730666131643938306366326263
|
||||
31313934633137623638316534383234376333396131303034633636323037363732383263326335
|
||||
32393766343161386537333062643434333333363538323366363231336666383161373432383563
|
||||
65613537366139643032336230303133623431376231646662643666373532636565393639373930
|
||||
65336630616462353837666431616662636635333532393331326539306233363539396266653239
|
||||
31303031303330396632386131623134313536313433623064356636333230373962643339363736
|
||||
30396130353466373136643935646436613636376636323530643031653334303863376432646534
|
||||
39343165356232346539366233373135326338343663356164616265336235623332646365633466
|
||||
35393533373663393762376332396136336236616635616535313336613034346436363665356565
|
||||
32636536336634613531393434613435613962653862343737373237623261373836386663343831
|
||||
66656135323838636638353963646638326531343635653937306230323237343933626135356533
|
||||
66356263636438633164386535333762616438626439343462393833393731643037396662653737
|
||||
31666361656530383437396230393663616133383764316437623939663631396561343266383766
|
||||
62373636663631393637393763613337356337633264366434346561343263373931323335643135
|
||||
31366661623137353336666630633365663764646234343035313130663562636361623532643461
|
||||
63333961333338623966396662656262323830396439633337663431663235663962666238356630
|
||||
30353331313462653061373638666235653938623931366466666164343566623238333237353265
|
||||
30373064353132366634623966306632303832306630383637623465323134633133656333303964
|
||||
35646637316236303364393363323137616132326437623238336631313530663230333362623633
|
||||
34383032376538366464363032343262656164376166386237383563613630336666633965653730
|
||||
64373236396564363164643637623736626532396630313131356563333238643665356166323837
|
||||
31626338623665623165643763623661666439626435643237336433646132666366623661393832
|
||||
37306533613966663936373061613331633934623462343236626234306130383738343631303231
|
||||
32326339323738323537333363313538373266623363363636633462356234363466393263316235
|
||||
39663033303165656366396334306535643361646663373935303230376466366632373563303231
|
||||
64323264653036333039663965646630653934376239653236323063656137373830623563336463
|
||||
37343461373737313539316361623763373733653930626532393565333938333761323631303332
|
||||
39663530303439616561356561666532653762343339323435636164376664373731343132666539
|
||||
63626637346563393765303065646564643661636130396439323736343764333633373331653333
|
||||
66633465343433303038623638323965636533666639643266353163353436393036336639336133
|
||||
32646664363565326539643763653832313336663262313634343635616333613434373333323036
|
||||
61366435376265336638326132333439613431353633653762653836386235643965366436363866
|
||||
35626664393139386337353335343930306130356335623131646261656434303966656431623231
|
||||
66643730393430363838626434663933613536343533316262373564666665373663336363623166
|
||||
63363037373634383961373035633239646235316137363036333765313864643365396165643432
|
||||
36623465313036376261393566383539336638363836633232656136656533396663323366313062
|
||||
64616632373333313466356362336234346564373832316433373963623263316635
|
||||
|
|
|
|||
|
|
@ -12,28 +12,7 @@ dev_env__users:
|
|||
# group only.
|
||||
ansible_user: sjat
|
||||
|
||||
# ubongo's AI-worker; passwordless sudo for the claude user (ADR-015 amended).
|
||||
base__ai_worker_user: claude
|
||||
|
||||
# ubongo is a NetBird mesh peer (ADR-016, M5) — enrol the agent via base's `mesh` concern.
|
||||
# Enrollment only; the host firewall default-deny stays deferred (the mesh-hardening
|
||||
# follow-on), so this brings up wt0 without changing SSH exposure.
|
||||
base__mesh_enabled: true
|
||||
|
||||
# Mesh-hardening 2/3 (2026-06-19, ADR-020/021): apply base's host firewall to ubongo as
|
||||
# INPUT-only default-deny — harden the inbound surface, leave the forward chain permissive so
|
||||
# Docker egress + the libvirt-NAT integration harness keep working. sshd is unchanged
|
||||
# (nftables scopes inbound), so there is no boot-race. Reach ubongo over wt0 (mesh), the
|
||||
# ssh-from-control self-path (base__firewall_control_addr, group_vars/all = 10.20.10.151), or
|
||||
# mamba on the LAN. Break-glass: the physical console. (base__firewall_apply defaults true.)
|
||||
base__firewall_input_only: true
|
||||
|
||||
# DNS-resilience (ADR-016 availability / R8): pin the coordinator FQDN to askari's stable WAN
|
||||
# IP in /etc/hosts so a local-DNS hiccup (the 2026-06-18 incident class) can't strand ubongo's
|
||||
# mesh. askari (offsite_hosts) is exempt — it reaches the coordinator locally.
|
||||
base__mesh_coordinator_pin: "77.42.120.136"
|
||||
|
||||
base__firewall_admin_addrs:
|
||||
- "10.20.10.50" # mamba over the LAN (NetBird off). Raw DHCP lease — revisit with an
|
||||
# OPNsense reservation when OPNsense-as-code lands; backstopped by wt0.
|
||||
- "10.20.10.17" # 2nd operator workstation (MAC bc:0f:f3:c8:4a:8a). Raw lease — ditto.
|
||||
|
|
|
|||
|
|
@ -1,21 +1,6 @@
|
|||
---
|
||||
# Off-site hosts (askari). askari runs the NetBird coordinator AND is a mesh peer
|
||||
# (ADR-016, M5).
|
||||
#
|
||||
# Mesh-hardening REDESIGN (2026-06-19): the 2026-06-17 attempt was backed out (forward
|
||||
# `policy drop` broke Docker on reboot; wt0-only sshd left no break-glass; ip_nonlocal_bind
|
||||
# did not beat the boot-race). The redesign mirrors the proven ubongo 2/3 pattern:
|
||||
# - INPUT-only default-deny (base__firewall_input_only) — forward stays `policy accept`
|
||||
# so Docker container forwarding/NAT survive a reboot;
|
||||
# - SSH scoped by the host firewall (iifname wt0 + admin-addr), NOT a sshd ListenAddress
|
||||
# change — base__ssh_listen_mesh_only stays false, so there is no boot-race;
|
||||
# - WAN :22 is DELIBERATELY left open from ubongo's WAN IP (base__firewall_admin_addrs)
|
||||
# as the permanent non-mesh break-glass — the coordinator-host exception (a host's only
|
||||
# management path must never depend on a service that host itself hosts).
|
||||
# Spec: docs/superpowers/specs/2026-06-19-mesh-hardening-askari-redesign-design.md
|
||||
# (ADR-016, M5) — enrol the agent via base's `mesh` concern. Enrollment only; the
|
||||
# host firewall default-deny + moving askari's SSH onto wt0 stay deferred to the
|
||||
# mesh-hardening follow-on.
|
||||
base__mesh_enabled: true
|
||||
base__firewall_apply: true
|
||||
base__firewall_input_only: true # forward stays `policy accept` → Docker-safe
|
||||
base__ssh_listen_mesh_only: false # no sshd ListenAddress change → no boot-race
|
||||
base__firewall_admin_addrs:
|
||||
- 91.226.145.80 # ubongo's (static) WAN IP — the permanent non-mesh SSH break-glass
|
||||
|
|
|
|||
|
|
@ -1,7 +0,0 @@
|
|||
---
|
||||
# Manage askari over the NetBird mesh (wt0). Overrides the TF-generated WAN `ansible_host`
|
||||
# in offsite.yml (host_vars are NOT regenerated by tf_to_inventory.py). The WAN :22 path
|
||||
# (Hetzner Cloud Firewall + base__firewall_admin_addrs = ubongo's WAN) stays as the
|
||||
# break-glass; the Hetzner web console is the IP-independent ultimate fallback.
|
||||
# Spec: docs/superpowers/specs/2026-06-19-mesh-hardening-askari-redesign-design.md
|
||||
ansible_host: 100.99.226.39
|
||||
|
|
@ -8,5 +8,3 @@
|
|||
roles:
|
||||
- role: dev_env
|
||||
tags: [dev_env]
|
||||
- role: integration_test
|
||||
tags: [integration_test]
|
||||
|
|
|
|||
|
|
@ -11,14 +11,6 @@ base__firewall_rollback_timeout: 45 # seconds before the auto-revert fires on a
|
|||
base__firewall_confirm_timeout: 20 # seconds to re-establish a fresh connection post-apply
|
||||
base__firewall_dropin_dir: /etc/nftables.d
|
||||
base__firewall_apply: true # set false to render+validate without applying (CI/Molecule)
|
||||
base__firewall_input_only: false # true → the forward chain is `policy accept` (host-local
|
||||
# INPUT filtering only). For hosts that forward/route
|
||||
# container or NAT traffic (the control node's Docker +
|
||||
# libvirt-NAT) where a forward default-deny would break
|
||||
# them. Real service hosts keep this false (forward drop).
|
||||
base__firewall_admin_addrs: [] # extra source IPs (LAN or WAN) allowed to SSH, besides wt0 +
|
||||
# ssh-from-control. For an operator workstation reaching
|
||||
# the host over the LAN (no mesh). Key-gated. (ADR-021)
|
||||
|
||||
# SSH hardening + fail2ban (ADR-002) — `hardening` concern.
|
||||
base__ssh_password_authentication: "no"
|
||||
|
|
@ -29,19 +21,6 @@ base__fail2ban_findtime: 10m
|
|||
# base__ssh_authorised_keys lives in group_vars/all/vars.yml (per-person control keys).
|
||||
base__ssh_authorised_keys: []
|
||||
|
||||
# SSH listen-on-mesh (mesh-hardening 1/3, ADR-016/021). Opt-in: when true, sshd binds
|
||||
# ListenAddress to this host's mesh IP only (not the WAN). The IP comes from the live wt0
|
||||
# fact (ansible_facts.wt0.ipv4.address); base__ssh_listen_addr overrides it. ip_nonlocal_bind
|
||||
# lets sshd bind the mesh IP before wt0 exists at boot. Fails closed: the play asserts a
|
||||
# non-empty address rather than silently listening on all interfaces.
|
||||
base__ssh_listen_mesh_only: false
|
||||
base__ssh_listen_addr: ""
|
||||
|
||||
# The automation/AI-worker user granted passwordless sudo (ADR-015 amended / ADR-021).
|
||||
# Empty = no AI-worker sudo. Set per-group (e.g. group_vars/control: claude). The user's
|
||||
# password should be locked so NOPASSWD is its only sudo path; actions are auditd-attributed.
|
||||
base__ai_worker_user: ""
|
||||
|
||||
# NetBird mesh agent enrollment (ADR-016). Opt-in: default off so applying `base` to a
|
||||
# host not on the mesh is a no-op for this concern. The live actions (apt install over
|
||||
# the network, `netbird up` against the coordinator) are additionally gated by
|
||||
|
|
@ -51,9 +30,3 @@ base__mesh_manage: true
|
|||
base__mesh_management_url: "https://netbird.askari.wingu.me"
|
||||
base__mesh_setup_key: "{{ vault.netbird.setup_key }}"
|
||||
base__mesh_version: "0.72.4" # match the coordinator; exact apt pin confirmed on-host at deploy
|
||||
|
||||
# DNS-resilience (ADR-016 availability / accepted-risk R8): when set to the coordinator's
|
||||
# stable IP, pin the coordinator FQDN (derived from base__mesh_management_url) in /etc/hosts
|
||||
# so a managed mesh host survives a local-DNS hiccup (the 2026-06-18 incident class). Empty
|
||||
# = no pin. The coordinator host itself (askari/offsite_hosts) is exempt — leave it empty.
|
||||
base__mesh_coordinator_pin: ""
|
||||
|
|
|
|||
|
|
@ -6,21 +6,15 @@
|
|||
vars:
|
||||
base__firewall_apply: false
|
||||
base__firewall_control_addr: 10.10.0.99 # test control-node LAN address
|
||||
base__firewall_admin_addrs:
|
||||
- "10.30.0.77" # fixture: an operator-workstation LAN source (admin-addr SSH allow)
|
||||
# Exercise the mesh concern's include path with the live actions gated off, so it
|
||||
# runs hermetically (no coordinator/key needed) and must be a clean no-op.
|
||||
base__mesh_enabled: true
|
||||
base__mesh_manage: false
|
||||
base__mesh_setup_key: "dummy-molecule-key"
|
||||
base__mesh_coordinator_pin: "203.0.113.9" # fixture IP (TEST-NET-3); pins FQDN from base__mesh_management_url
|
||||
base__ssh_listen_mesh_only: true
|
||||
base__ssh_listen_addr: "100.99.0.1" # fixture mesh IP (no wt0 in the container)
|
||||
firewall_zones:
|
||||
lan: 10.30.0.0/24
|
||||
srv: 10.20.0.0/24
|
||||
mgmt: 10.10.0.0/24
|
||||
public: 0.0.0.0/0
|
||||
firewall_catalog:
|
||||
reverse_proxy:
|
||||
host: instance
|
||||
|
|
@ -30,9 +24,5 @@
|
|||
host: instance
|
||||
ingress:
|
||||
- { from: srv, port: 2342, proto: tcp }
|
||||
netbird_stun:
|
||||
host: instance
|
||||
ingress:
|
||||
- { from: public, port: 3478, proto: udp }
|
||||
roles:
|
||||
- role: base
|
||||
|
|
|
|||
|
|
@ -19,16 +19,6 @@ platforms:
|
|||
volumes:
|
||||
- /sys/fs/cgroup:/sys/fs/cgroup:rw
|
||||
command: /lib/systemd/systemd
|
||||
# Pre-create the namespaced sysctl so ansible.posix.sysctl can set it (mesh-hardening 1/3).
|
||||
# The container image lacks procps so the sysctl binary is absent; we also install it in
|
||||
# prepare.yml. This entry ensures the value exists in the container's netns at startup.
|
||||
sysctls:
|
||||
net.ipv4.ip_nonlocal_bind: "0"
|
||||
# ubongo's /etc/resolv.conf points to the NetBird mesh DNS (100.99.x.x), which Docker
|
||||
# containers can't reach (no wt0). Override to a public resolver so prepare.yml apt tasks
|
||||
# can update the cache and install packages.
|
||||
dns_servers:
|
||||
- 8.8.8.8
|
||||
|
||||
provisioner:
|
||||
name: ansible
|
||||
|
|
|
|||
|
|
@ -1,11 +0,0 @@
|
|||
---
|
||||
- name: Prepare
|
||||
hosts: all
|
||||
become: true
|
||||
gather_facts: false
|
||||
tasks:
|
||||
- name: Install procps so ansible.posix.sysctl can find the sysctl binary
|
||||
ansible.builtin.apt:
|
||||
name: procps
|
||||
state: present
|
||||
update_cache: true
|
||||
|
|
@ -38,33 +38,12 @@
|
|||
- "'tcp dport 2342 accept' in nft"
|
||||
fail_msg: "missing srv->2342 rule for photoprism"
|
||||
|
||||
- name: Assert the public->stun:3478/udp ingress rule (0.0.0.0/0 source)
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- "'0.0.0.0/0' in nft"
|
||||
- "'udp dport 3478 accept' in nft"
|
||||
fail_msg: "missing public->3478/udp rule for netbird_stun"
|
||||
|
||||
- name: Assert the docker_host extension hook is present
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- "'include \"/etc/nftables.d/*.nft\"' in nft"
|
||||
fail_msg: "missing drop-in include hook"
|
||||
|
||||
- name: Assert the forward chain defaults to policy drop (input_only off)
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- "'hook forward priority 0; policy drop;' in nft"
|
||||
fail_msg: >-
|
||||
forward chain must default to policy drop when base__firewall_input_only is
|
||||
false (container isolation stays the norm on real service hosts)
|
||||
|
||||
- name: Assert the admin-addr SSH allow rule (operator workstation on the LAN)
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- "'ip saddr 10.30.0.77 tcp dport 22 accept' in nft"
|
||||
fail_msg: "missing admin-addr SSH allow rule from base__firewall_admin_addrs"
|
||||
|
||||
- name: Syntax-check the rendered ruleset (no apply)
|
||||
ansible.builtin.command: nft -c -f /etc/nftables.conf
|
||||
changed_when: false
|
||||
|
|
@ -79,18 +58,6 @@
|
|||
ansible.builtin.command: grep -q '^\[sshd\]' /etc/fail2ban/jail.d/sshd.local
|
||||
changed_when: false
|
||||
|
||||
- name: ListenAddress bound to the fixture mesh IP (mesh-only mode)
|
||||
ansible.builtin.command: grep -q '^ListenAddress 100.99.0.1$' /etc/ssh/sshd_config.d/10-boma.conf
|
||||
changed_when: false
|
||||
- name: Sysctl drop-in for ip_nonlocal_bind is present
|
||||
ansible.builtin.command: grep -q '^net.ipv4.ip_nonlocal_bind=1' /etc/sysctl.d/60-boma-nonlocal-bind.conf
|
||||
changed_when: false
|
||||
- name: Kernel ip_nonlocal_bind is live in this netns
|
||||
ansible.builtin.command: sysctl -n net.ipv4.ip_nonlocal_bind
|
||||
register: _nonlocal
|
||||
changed_when: false
|
||||
failed_when: _nonlocal.stdout | trim != '1'
|
||||
|
||||
# mesh concern: enabled but manage=false must be a clean no-op (no install/enrol)
|
||||
- name: Check whether netbird got installed
|
||||
ansible.builtin.command: which netbird
|
||||
|
|
@ -103,14 +70,3 @@
|
|||
- _nb.rc != 0
|
||||
fail_msg: "netbird must not be installed when base__mesh_manage is false"
|
||||
success_msg: "mesh concern is a clean no-op under manage=false"
|
||||
|
||||
- name: Read /etc/hosts (coordinator pin)
|
||||
ansible.builtin.slurp:
|
||||
src: /etc/hosts
|
||||
register: _etchosts
|
||||
- name: Assert the coordinator FQDN is pinned to the fixture IP (DNS-resilience / R8)
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- "'203.0.113.9 netbird.askari.wingu.me' in (_etchosts.content | b64decode)" # slurp content is always base64
|
||||
fail_msg: "base__mesh_coordinator_pin did not render the /etc/hosts coordinator pin"
|
||||
success_msg: "coordinator FQDN pinned in /etc/hosts"
|
||||
|
|
|
|||
|
|
@ -23,13 +23,6 @@
|
|||
tags: [hardening]
|
||||
tags: [hardening]
|
||||
|
||||
- name: AI-worker operational access (sudoers drop-in)
|
||||
ansible.builtin.include_tasks:
|
||||
file: operational_access.yml
|
||||
apply:
|
||||
tags: [users]
|
||||
tags: [users]
|
||||
|
||||
- name: NetBird mesh enrollment
|
||||
ansible.builtin.include_tasks:
|
||||
file: mesh.yml
|
||||
|
|
|
|||
|
|
@ -64,19 +64,3 @@
|
|||
- "'Management: Connected' not in (_netbird_status.stdout | default(''))"
|
||||
no_log: true # setup key is on the argv
|
||||
tags: [mesh]
|
||||
|
||||
- name: Pin the NetBird coordinator FQDN in /etc/hosts (DNS-resilience, ADR-016 availability / R8)
|
||||
ansible.builtin.lineinfile:
|
||||
path: /etc/hosts
|
||||
regexp: '^\S+\s+{{ _coordinator_fqdn | regex_escape }}\s*$'
|
||||
line: "{{ base__mesh_coordinator_pin }} {{ _coordinator_fqdn }}"
|
||||
state: present
|
||||
# /etc/hosts is bind-mounted in the Docker Molecule container (atomic rename → EBUSY);
|
||||
# this is a fallback only — production VMs still write atomically.
|
||||
unsafe_writes: true
|
||||
vars:
|
||||
_coordinator_fqdn: "{{ base__mesh_management_url | regex_replace('^https?://', '') | regex_replace('[:/].*', '') }}"
|
||||
when:
|
||||
- base__mesh_enabled | bool
|
||||
- base__mesh_coordinator_pin | length > 0
|
||||
tags: [mesh]
|
||||
|
|
|
|||
|
|
@ -1,11 +0,0 @@
|
|||
---
|
||||
- name: Grant the AI-worker user passwordless sudo (ADR-015 amended / ADR-021)
|
||||
ansible.builtin.copy:
|
||||
content: "{{ base__ai_worker_user }} ALL=(ALL) NOPASSWD:ALL\n"
|
||||
dest: "/etc/sudoers.d/{{ base__ai_worker_user }}-ai-worker"
|
||||
owner: root
|
||||
group: root
|
||||
mode: "0440"
|
||||
validate: "visudo -cf %s"
|
||||
when: base__ai_worker_user | length > 0
|
||||
tags: [users]
|
||||
|
|
@ -1,31 +1,4 @@
|
|||
---
|
||||
- name: Resolve the sshd mesh listen address (override, else live wt0 fact)
|
||||
ansible.builtin.set_fact:
|
||||
base__ssh_listen_addr_resolved: >-
|
||||
{{ base__ssh_listen_addr
|
||||
or ansible_facts.get('wt0', {}).get('ipv4', {}).get('address', '') }}
|
||||
when: base__ssh_listen_mesh_only | bool
|
||||
|
||||
- name: Fail closed — refuse to render sshd without a known mesh address
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- base__ssh_listen_addr_resolved | length > 0
|
||||
fail_msg: >-
|
||||
base__ssh_listen_mesh_only is true but no mesh address resolved (set
|
||||
base__ssh_listen_addr or ensure wt0 is up so its fact is gathered). Refusing to
|
||||
render sshd ListenAddress empty (which would listen on ALL interfaces).
|
||||
when: base__ssh_listen_mesh_only | bool
|
||||
|
||||
- name: Allow sshd to bind the mesh IP before wt0 exists at boot
|
||||
ansible.posix.sysctl:
|
||||
name: net.ipv4.ip_nonlocal_bind
|
||||
value: "1"
|
||||
sysctl_set: true
|
||||
state: present
|
||||
reload: true
|
||||
sysctl_file: /etc/sysctl.d/60-boma-nonlocal-bind.conf
|
||||
when: base__ssh_listen_mesh_only | bool
|
||||
|
||||
- name: Ensure openssh-server is installed
|
||||
ansible.builtin.apt:
|
||||
name: openssh-server
|
||||
|
|
|
|||
|
|
@ -12,16 +12,13 @@ table inet filter {
|
|||
{% if base__firewall_control_addr %}
|
||||
ip saddr {{ base__firewall_control_addr }} tcp dport {{ base__firewall_ssh_port }} accept
|
||||
{% endif %}
|
||||
{% for addr in base__firewall_admin_addrs %}
|
||||
ip saddr {{ addr }} tcp dport {{ base__firewall_ssh_port }} accept
|
||||
{% endfor %}
|
||||
ip protocol icmp accept
|
||||
ip6 nexthdr ipv6-icmp accept
|
||||
{% for r in base__firewall_resolved %}
|
||||
ip saddr { {{ r.sources | join(', ') }} } {{ r.proto }} dport {{ r.port }} accept
|
||||
{% endfor %}
|
||||
}
|
||||
chain forward { type filter hook forward priority 0; policy {{ 'accept' if base__firewall_input_only | bool else 'drop' }}; }
|
||||
chain forward { type filter hook forward priority 0; policy drop; }
|
||||
chain output { type filter hook output priority 0; policy accept; }
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -3,6 +3,3 @@ PasswordAuthentication {{ base__ssh_password_authentication }}
|
|||
PermitRootLogin {{ base__ssh_permit_root_login }}
|
||||
PubkeyAuthentication yes
|
||||
KbdInteractiveAuthentication no
|
||||
{% if base__ssh_listen_mesh_only | bool %}
|
||||
ListenAddress {{ base__ssh_listen_addr_resolved }}
|
||||
{% endif %}
|
||||
|
|
|
|||
|
|
@ -1,16 +1,8 @@
|
|||
---
|
||||
# Docker engine install (ADR-004). Cluster-specific daemon hardening is deferred to when
|
||||
# the cluster exists.
|
||||
# Docker engine install (ADR-004). Cluster-specific daemon hardening + nftables.d
|
||||
# integration are deferred to when the cluster + host firewall exist.
|
||||
docker_host__packages:
|
||||
- docker-ce
|
||||
- docker-ce-cli
|
||||
- containerd.io
|
||||
- docker-compose-plugin
|
||||
|
||||
# Container-forward nftables drop-in (FRICTION 2026-06-17 #1 / ADR-025). base's inet-filter
|
||||
# forward chain is `policy drop`; on a Docker host that kills published-port DNAT + inter-
|
||||
# container forwarding ON REBOOT (nftables loads default-deny before dockerd). This drop-in
|
||||
# (loaded via base's /etc/nftables.d/*.nft include) appends the accepts so a rebooted Docker
|
||||
# host keeps forwarding. Only meaningful where base__firewall_apply is true.
|
||||
docker_host__forward_dropin: true
|
||||
docker_host__nftables_dropin_dir: /etc/nftables.d # must match base__firewall_dropin_dir
|
||||
|
|
|
|||
|
|
@ -37,22 +37,3 @@
|
|||
state: present
|
||||
update_cache: true
|
||||
tags: [packages]
|
||||
|
||||
- name: Ensure the nftables drop-in dir exists (for the container-forward rules)
|
||||
ansible.builtin.file:
|
||||
path: "{{ docker_host__nftables_dropin_dir }}"
|
||||
state: directory
|
||||
mode: "0755"
|
||||
when: docker_host__forward_dropin | bool
|
||||
tags: [firewall]
|
||||
|
||||
- name: Install the container-forward nftables drop-in (reboot-safe Docker forwarding)
|
||||
ansible.builtin.template:
|
||||
src: 10-docker-forward.nft.j2
|
||||
dest: "{{ docker_host__nftables_dropin_dir }}/10-docker-forward.nft"
|
||||
mode: "0644"
|
||||
when: docker_host__forward_dropin | bool
|
||||
# Not reloaded here: a running host already forwards via Docker's runtime rules, so the
|
||||
# drop-in only needs to protect the NEXT boot (loaded by nftables.service). Reloading nft
|
||||
# now would flush Docker's NAT (FRICTION 2026-06-17 #4); the boot loads it cleanly.
|
||||
tags: [firewall]
|
||||
|
|
|
|||
|
|
@ -1,14 +0,0 @@
|
|||
# {{ ansible_managed }}
|
||||
# Allow container forwarding through base's default-deny forward chain (ADR-025 / FRICTION
|
||||
# 2026-06-17 #1). Appended to base's `table inet filter` / `chain forward` via the
|
||||
# /etc/nftables.d/*.nft include, and loaded by nftables.service at boot — exactly when the
|
||||
# bug bit (default-deny forward loading before dockerd on reboot).
|
||||
table inet filter {
|
||||
chain forward {
|
||||
ct state established,related accept
|
||||
iifname "docker0" accept
|
||||
oifname "docker0" accept
|
||||
iifname "br-*" accept
|
||||
oifname "br-*" accept
|
||||
}
|
||||
}
|
||||
|
|
@ -1,35 +0,0 @@
|
|||
# integration_test
|
||||
|
||||
Installs the KVM/libvirt substrate on the control node (`ubongo`) so the agent
|
||||
can boot throwaway Debian VMs for local integration testing (ADR-025).
|
||||
|
||||
This is a **non-service** role — no SECURITY/VERIFY/ACCESS/BACKUP files are
|
||||
required. It does **not** make ubongo a production hypervisor; it only provides
|
||||
the tooling needed to spin up short-lived test VMs (see ADR-015).
|
||||
|
||||
## Target group
|
||||
|
||||
`control` (i.e. `ubongo`)
|
||||
|
||||
## What it does
|
||||
|
||||
1. Installs QEMU/KVM, libvirt daemon + clients, `virt-install`, and
|
||||
cloud-image tools (`cloud-image-utils`, `genisoimage`).
|
||||
2. Enables and starts `libvirtd`.
|
||||
3. Adds the configured users (`sjat`, `claude`) to the `libvirt` and `kvm`
|
||||
groups so VMs can be managed without `sudo`.
|
||||
4. Creates `/var/lib/boma-integration` (owned `root:libvirt`, mode `2775`) as
|
||||
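the cache directory for golden images and overlays.
|
||||
5. Installs the `10-libvirt-boma.nft` nftables drop-in, which accepts input arriving on
|
||||
the `virbr-boma` libvirt bridge.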
|
||||
|
||||
## Defaults
|
||||
|
||||
| Variable | Default | Purpose |
|
||||
|-------------------------------|-------------------------------|----------------------------------|
|
||||
| `integration_test__packages` | see `defaults/main.yml` | APT packages to install |
|
||||
| `integration_test__users` | `[sjat, claude]` | Users granted libvirt/kvm access |
|
||||
| `integration_test__cache_dir` | `/var/lib/boma-integration` | Image/overlay cache directory |
|
||||
|
||||
## Related decisions
|
||||
|
||||
- [ADR-025](../../docs/decisions/025-local-vm-integration-testing.md) — local VM integration testing
|
||||
- [ADR-015](../../docs/decisions/015-control-host.md) — control host scope (ubongo is not a hypervisor)
|
||||
|
|
@ -1,20 +0,0 @@
|
|||
---
|
||||
# integration_test — installs the local KVM/libvirt substrate on the control node
|
||||
# (ubongo) so the agent can run throwaway VM integration tests (ADR-025). Non-service
|
||||
# role; applied to the `control` group. Not a production hypervisor (ADR-015).
|
||||
integration_test__packages:
|
||||
- qemu-system-x86 # KVM
|
||||
- qemu-utils # qemu-img (overlays)
|
||||
- libvirt-daemon-system
|
||||
- libvirt-clients # virsh
|
||||
- virt-install # virt-install (trixie: the real pkg; `virtinst` is transitional)
|
||||
- cloud-image-utils # cloud-localds (NoCloud seed)
|
||||
- genisoimage # cloud-localds fallback
|
||||
# Users granted libvirt/kvm access (run VMs without sudo).
|
||||
integration_test__users:
|
||||
- sjat
|
||||
- claude
|
||||
# Where the golden image + overlays live (outside the repo).
|
||||
integration_test__cache_dir: "/var/lib/boma-integration"
|
||||
# nftables drop-in dir — must match base__firewall_dropin_dir (base role default: /etc/nftables.d)
|
||||
integration_test__nftables_dropin_dir: /etc/nftables.d
|
||||
|
|
@ -1,15 +0,0 @@
|
|||
---
|
||||
- name: Reload nftables
|
||||
ansible.builtin.service:
|
||||
name: nftables
|
||||
state: reloaded
|
||||
listen: "integration_test | reload nftables"
|
||||
register: _nft_reload
|
||||
# nftables is absent from the Molecule Docker container; ignore "not found" errors there.
|
||||
# On real hosts where base has applied nftables, failures propagate normally.
|
||||
failed_when:
|
||||
- _nft_reload.failed
|
||||
- >-
|
||||
'Could not find the requested service nftables' not in (_nft_reload.msg | default(''))
|
||||
and 'nftables.service not found' not in (_nft_reload.msg | default(''))
|
||||
and 'Unit nftables.service not found' not in (_nft_reload.msg | default(''))
|
||||
|
|
@ -1,14 +0,0 @@
|
|||
---
|
||||
galaxy_info:
|
||||
author: sjat
|
||||
description: >-
|
||||
Installs the KVM/libvirt substrate on the control node (ubongo) to enable
|
||||
local VM integration testing (ADR-025). Non-service role; not a production
|
||||
hypervisor (ADR-015).
|
||||
license: MIT
|
||||
min_ansible_version: "2.17"
|
||||
platforms:
|
||||
- name: Debian
|
||||
versions:
|
||||
- trixie
|
||||
dependencies: []
|
||||
|
|
@ -1,21 +0,0 @@
|
|||
---
|
||||
# KVM/libvirt APT packages cannot be installed in the Docker Molecule container
|
||||
# (no internet; KVM unusable in a container). This converge exercises only the
|
||||
# nftables drop-in rendering via tasks_from, which IS meaningful in a container.
|
||||
# The full role (packages/libvirt) is exercised by make test-integration.
|
||||
#
|
||||
# Coverage split:
|
||||
# Docker Molecule (this file): nftables drop-in rendering only.
|
||||
# make test-integration (ADR-025, real KVM): libvirt/KVM package install, cache
|
||||
# dir creation, and end-to-end VM lifecycle — the role's substrate tasks.
|
||||
# The Docker scenario intentionally covers only the firewall drop-in; substrate
|
||||
# coverage lives in the real-KVM integration harness, not here.
|
||||
- name: Converge
|
||||
hosts: all
|
||||
become: true
|
||||
gather_facts: true
|
||||
tasks:
|
||||
- name: Include integration_test firewall tasks
|
||||
ansible.builtin.include_role:
|
||||
name: integration_test
|
||||
tasks_from: firewall.yml
|
||||
|
|
@ -1,31 +0,0 @@
|
|||
---
|
||||
dependency:
|
||||
name: galaxy
|
||||
options:
|
||||
requirements-file: ../../requirements.yml
|
||||
|
||||
driver:
|
||||
name: docker
|
||||
|
||||
platforms:
|
||||
- name: instance
|
||||
# Project-owned image built from .docker/molecule-debian13/Dockerfile
|
||||
# and hosted in the Forgejo container registry.
|
||||
# Build/push with: make molecule-image / make molecule-image-push
|
||||
image: forgejo.nyumbani.baobab.band/sjat/molecule-debian13:latest
|
||||
pre_build_image: true
|
||||
privileged: true # required for systemd
|
||||
cgroupns_mode: host
|
||||
volumes:
|
||||
- /sys/fs/cgroup:/sys/fs/cgroup:rw
|
||||
command: /lib/systemd/systemd
|
||||
|
||||
provisioner:
|
||||
name: ansible
|
||||
inventory:
|
||||
host_vars:
|
||||
instance:
|
||||
ansible_user: root
|
||||
|
||||
verifier:
|
||||
name: ansible
|
||||
|
|
@ -1,14 +0,0 @@
|
|||
---
|
||||
# The Molecule Docker image ships with /var/lib/apt/lists/ cleared to minimise size.
|
||||
# KVM/libvirt packages cannot be installed in a container; converge only includes the
|
||||
# role's firewall.yml tasks (tasks_from). Pre-create /etc/nftables.d so the drop-in template task succeeds.
|
||||
- name: Prepare
|
||||
hosts: all
|
||||
become: true
|
||||
gather_facts: false
|
||||
tasks:
|
||||
- name: Create nftables drop-in dir (normally created by the config task)
|
||||
ansible.builtin.file:
|
||||
path: /etc/nftables.d
|
||||
state: directory
|
||||
mode: "0755"
|
||||
|
|
@ -1,18 +0,0 @@
|
|||
---
|
||||
# Package-install and cache-dir tasks are skipped (converge includes only firewall.yml via tasks_from;
|
||||
# KVM/libvirt packages cannot be fetched in the Docker container). This scenario
|
||||
# verifies the nftables drop-in renders correctly.
|
||||
- name: Verify
|
||||
hosts: all
|
||||
become: true
|
||||
gather_facts: false
|
||||
tasks:
|
||||
- name: Read the libvirt bridge nftables drop-in
|
||||
ansible.builtin.slurp:
|
||||
src: /etc/nftables.d/10-libvirt-boma.nft
|
||||
register: _dropin
|
||||
- name: Assert drop-in contains virbr-boma accept rule
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- "'virbr-boma' in (_dropin.content | b64decode)"
|
||||
- "'accept' in (_dropin.content | b64decode)"
|
||||
|
|
@ -1,8 +0,0 @@
|
|||
---
|
||||
- name: Install the libvirt bridge nftables drop-in (virbr-boma input allow)
|
||||
ansible.builtin.template:
|
||||
src: 10-libvirt-boma.nft.j2
|
||||
dest: "{{ integration_test__nftables_dropin_dir }}/10-libvirt-boma.nft"
|
||||
mode: "0644"
|
||||
notify: "integration_test | reload nftables"
|
||||
tags: [firewall]
|
||||
|
|
@ -1,35 +0,0 @@
|
|||
---
|
||||
- name: Install the KVM/libvirt substrate
|
||||
ansible.builtin.apt:
|
||||
name: "{{ integration_test__packages }}"
|
||||
state: present
|
||||
update_cache: true
|
||||
cache_valid_time: 3600
|
||||
tags: [packages]
|
||||
|
||||
- name: Enable and start libvirtd
|
||||
ansible.builtin.systemd:
|
||||
name: libvirtd
|
||||
enabled: true
|
||||
state: started
|
||||
tags: [config]
|
||||
|
||||
- name: Grant users libvirt + kvm access
|
||||
ansible.builtin.user:
|
||||
name: "{{ item }}"
|
||||
groups: [libvirt, kvm]
|
||||
append: true
|
||||
loop: "{{ integration_test__users }}"
|
||||
tags: [users]
|
||||
|
||||
- name: Create the integration cache dir
|
||||
ansible.builtin.file:
|
||||
path: "{{ integration_test__cache_dir }}"
|
||||
state: directory
|
||||
owner: root
|
||||
group: libvirt
|
||||
mode: "2775"
|
||||
tags: [config]
|
||||
|
||||
- name: Import firewall tasks
|
||||
ansible.builtin.import_tasks: firewall.yml
|
||||
|
|
@ -1,12 +0,0 @@
|
|||
# {{ ansible_managed }}
|
||||
# Allow traffic (e.g. DHCP/DNS) arriving on the libvirt integration bridge to pass base's
|
||||
# inet filter input default-deny chain (ADR-025). nftables multi-table semantics mean
|
||||
# libvirt's own `ip filter` table accept is not enough — base's `inet filter` input
|
||||
# policy drop kills bridge traffic first without this drop-in.
|
||||
#
|
||||
# Bridge name "virbr-boma" must match NET_XML in scripts/integration-vm.py.
|
||||
table inet filter {
|
||||
chain input {
|
||||
iifname "virbr-boma" accept
|
||||
}
|
||||
}
|
||||
|
|
@ -46,7 +46,6 @@ upstream support; WS/gRPC need long timeouts (Caddy sets none by default).
|
|||
| `netbird_coordinator__domain` | `netbird.askari.wingu.me` | Public hostname; feeds `exposedAddress`, the OIDC issuer, redirect URIs, and the dashboard endpoints |
|
||||
| `netbird_coordinator__trusted_proxies` | `["172.16.0.0/12"]` | Source ranges NetBird trusts `X-Forwarded-*` from (`server.reverseProxy.trustedHTTPProxies`). Must cover Caddy's source IP on the boma network — verify the actual bridge subnet at deploy |
|
||||
| `netbird_coordinator__manage` | `true` | Set `false` in Molecule to render templates without a Docker daemon |
|
||||
| `netbird_coordinator__disable_geolocation` | `true` | Sets `NB_DISABLE_GEOLOCATION` so a no-egress startup can't FATAL the server on the GeoLite2 download (FRICTION 2026-06-17 #4) |
|
||||
|
||||
Production overrides live in `inventories/production/group_vars/`.
|
||||
|
||||
|
|
|
|||
|
|
@ -6,13 +6,6 @@ netbird_coordinator__dashboard_image: "netbirdio/dashboard:v2.39.0"
|
|||
netbird_coordinator__base_dir: /opt/services/netbird
|
||||
netbird_coordinator__domain: netbird.askari.wingu.me
|
||||
|
||||
# Disable NetBird's GeoLite2 geolocation (download + lookups). boma uses no geo posture
|
||||
# (ACL is Allow-All), and the combined server treats a failed GeoLite2 download as FATAL —
|
||||
# so a transient egress loss (NAT wiped on `nft flush`, or the boot window before Docker
|
||||
# re-adds NAT) would crash-loop the whole control plane (FRICTION 2026-06-17 #4). Disabling
|
||||
# removes that dependency. Revisit if a future ACL sub-project wants geo-based posture.
|
||||
netbird_coordinator__disable_geolocation: true
|
||||
|
||||
# Source IP ranges Caddy fronts NetBird from, rendered into config.yaml
|
||||
# server.reverseProxy.trustedHTTPProxies. NetBird trusts X-Forwarded-* only from
|
||||
# these. MUST cover the Caddy container's source IP on the boma Docker network —
|
||||
|
|
|
|||
|
|
@ -30,12 +30,3 @@
|
|||
- "'v2.39.0' in (_compose.content | b64decode)"
|
||||
fail_msg: "docker-compose.yml is missing pinned image tags"
|
||||
success_msg: "docker-compose.yml pins both image tags"
|
||||
|
||||
- name: "Assert geolocation is disabled (FRICTION 2026-06-17 #4 — no geo-DB download FATAL)"
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- "'NB_DISABLE_GEOLOCATION: \"true\"' in (_compose.content | b64decode)"
|
||||
fail_msg: >-
|
||||
compose must set NB_DISABLE_GEOLOCATION=true so a no-egress startup can't FATAL
|
||||
the coordinator on the GeoLite2 download
|
||||
success_msg: "geolocation disabled in compose"
|
||||
|
|
|
|||
|
|
@ -4,8 +4,6 @@
|
|||
path: "{{ netbird_coordinator__base_dir }}"
|
||||
state: directory
|
||||
mode: "0750"
|
||||
# create the scaffold even in --check so dry-run can evaluate templates + compose (idempotent mkdir)
|
||||
check_mode: false
|
||||
tags: [config]
|
||||
|
||||
- name: Render the combined server config
|
||||
|
|
|
|||
|
|
@ -16,10 +16,6 @@ services:
|
|||
container_name: netbird-server
|
||||
restart: unless-stopped
|
||||
command: ["--config", "/etc/netbird/config.yaml"]
|
||||
environment:
|
||||
# Disable geolocation so a no-egress startup can't FATAL the control plane
|
||||
# (FRICTION 2026-06-17 #4). boma uses no geo posture (ACL Allow-All).
|
||||
NB_DISABLE_GEOLOCATION: "{{ netbird_coordinator__disable_geolocation | string | lower }}"
|
||||
ports:
|
||||
- "3478:3478/udp"
|
||||
volumes:
|
||||
|
|
|
|||
|
|
@ -35,7 +35,3 @@ access__api: # noqa: var-naming[no-role-prefix]
|
|||
# DNS-01; no manual steps). Residual risk: Let's Encrypt rate limits on rapid re-issuance.
|
||||
backup__service: reverse_proxy # noqa: var-naming[no-role-prefix]
|
||||
backup__state: false # noqa: var-naming[no-role-prefix]
|
||||
|
||||
# Integration-test / staging cert knobs (ADR-025). Default off = production behaviour.
|
||||
reverse_proxy__tls_internal: false # true => every site uses Caddy's self-signed CA
|
||||
reverse_proxy__acme_ca: "" # set to the LE staging directory URL to use staging
|
||||
|
|
|
|||
|
|
@ -4,8 +4,6 @@
|
|||
path: "{{ reverse_proxy__base_dir }}"
|
||||
state: directory
|
||||
mode: "0750"
|
||||
# create the scaffold even in --check so dry-run can evaluate templates + compose (idempotent mkdir)
|
||||
check_mode: false
|
||||
tags: [config]
|
||||
|
||||
- name: Ensure the Caddy config directory exists
|
||||
|
|
@ -13,8 +11,6 @@
|
|||
path: "{{ reverse_proxy__base_dir }}/caddy"
|
||||
state: directory
|
||||
mode: "0750"
|
||||
# create the scaffold even in --check so dry-run can evaluate templates + compose (idempotent mkdir)
|
||||
check_mode: false
|
||||
tags: [config]
|
||||
|
||||
# Render into a directory that is bind-mounted whole (./caddy -> /etc/caddy). Mounting
|
||||
|
|
|
|||
|
|
@ -1,9 +1,6 @@
|
|||
# {{ ansible_managed }}
|
||||
{
|
||||
email {{ reverse_proxy__acme_email }}
|
||||
{% if reverse_proxy__acme_ca %}
|
||||
acme_ca {{ reverse_proxy__acme_ca }}
|
||||
{% endif %}
|
||||
{% if reverse_proxy__acme_dns_provider == 'gandi' %}
|
||||
# ACME DNS-01 via Gandi (mesh/LAN-only hosts, incl. wildcard certs). Token is the
|
||||
# Gandi PAT, injected from the env file as a Bearer token (ADR-024). Needs the custom
|
||||
|
|
@ -13,9 +10,6 @@
|
|||
}
|
||||
{% for r in reverse_proxy__routes %}
|
||||
{{ r['host'] }} {
|
||||
{% if reverse_proxy__tls_internal %}
|
||||
tls internal
|
||||
{% endif %}
|
||||
{% if r['caddy'] is defined %}
|
||||
{{ r['caddy'] | trim | indent(2, first=true) }}
|
||||
{% elif r['upstream'] is defined %}
|
||||
|
|
|
|||
|
|
@ -1,462 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""boma local-VM integration test harness driver (ADR-025).
|
||||
|
||||
Stdlib-only by convention (TODO-14): never imports a YAML library. The transient
|
||||
inventory is emitted via string templates; stubs/cert-tiers reach Ansible as
|
||||
`-e @<file>` extra-vars; profile metadata is JSON. Talks to libvirt via `virsh`.
|
||||
"""
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import pathlib
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import urllib.request
|
||||
import uuid
|
||||
|
||||
REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent
|
||||
CACHE_DIR = pathlib.Path(os.environ.get("BOMA_IT_CACHE", "/var/lib/boma-integration"))
|
||||
IMAGE_URL = "https://cloud.debian.org/images/cloud/trixie/latest/debian-13-genericcloud-amd64.qcow2"
|
||||
SHA_URL = "https://cloud.debian.org/images/cloud/trixie/latest/SHA512SUMS"
|
||||
IMAGE_NAME = "debian-13-genericcloud-amd64.qcow2"
|
||||
NET_NAME = "boma-it"
|
||||
NET_XML = """<network>
|
||||
<name>boma-it</name>
|
||||
<forward mode='nat'/>
|
||||
<bridge name='virbr-boma' stp='on' delay='0'/>
|
||||
<ip address='192.168.150.1' netmask='255.255.255.0'>
|
||||
<dhcp><range start='192.168.150.10' end='192.168.150.254'/></dhcp>
|
||||
</ip>
|
||||
</network>
|
||||
"""
|
||||
NAME_PREFIX = "boma-it-"
|
||||
RUN_DIR = REPO_ROOT / "tests" / "integration" / ".run"
|
||||
DIAG_ROOT = pathlib.Path.home() / "integration-runs"
|
||||
PROFILE_DIR = REPO_ROOT / "tests" / "integration" / "profiles"
|
||||
INTEG_DIR = REPO_ROOT / "tests" / "integration"
|
||||
CERT_DIR = REPO_ROOT / "tests" / "integration" / "certs"
|
||||
DEFAULT_MEM_MIB = 3072
|
||||
DEFAULT_VCPUS = 2
|
||||
MIN_FREE_MIB = 4096
|
||||
VALID_TIERS = ("internal", "le-staging", "le-prod-wildcard")
|
||||
|
||||
# Target the SYSTEM libvirtd — where the substrate, /dev/kvm, and the NAT network live.
|
||||
# Without this, a non-root caller's bare virsh/virt-install default to qemu:///session.
|
||||
os.environ.setdefault("LIBVIRT_DEFAULT_URI", "qemu:///system")
|
||||
|
||||
|
||||
def vm_name(host, suffix=None):
    """Build the libvirt domain name for an integration VM.

    The name is NAME_PREFIX, the host, a dash, and `suffix`. When `suffix` is
    falsy, a random 8-hex-character suffix is generated.
    """
    if not suffix:
        suffix = uuid.uuid4().hex[:8]
    return NAME_PREFIX + f"{host}-{suffix}"
|
||||
|
||||
|
||||
def free_mib(meminfo_text):
    """Return the MemAvailable figure from /proc/meminfo-style text, in whole MiB.

    Returns 0 when no MemAvailable line is present.
    """
    found = re.search(r"^MemAvailable:\s+(\d+)\s+kB", meminfo_text, re.MULTILINE)
    if found is None:
        return 0
    kib = int(found.group(1))
    return kib // 1024
|
||||
|
||||
|
||||
def parse_lease_ip(domifaddr_output):
    """Return the first dotted-quad that follows an ``ipv4`` column, or None."""
    hit = re.search(r"ipv4\s+(\d+\.\d+\.\d+\.\d+)", domifaddr_output)
    if hit is None:
        return None
    return hit.group(1)
|
||||
|
||||
|
||||
def render_meta_data(instance_id, hostname):
    """Render the two-line cloud-init meta-data document (instance-id, local-hostname)."""
    fields = (("instance-id", instance_id), ("local-hostname", hostname))
    return "".join(f"{key}: {value}\n" for key, value in fields)
|
||||
|
||||
|
||||
def render_user_data(ssh_pubkey, ansible_user):
    """Render the cloud-init ``#cloud-config`` user-data document.

    Args:
        ssh_pubkey: public key line placed in the user's ssh_authorized_keys.
        ansible_user: login name of the single user to create.

    Returns:
        The user-data text. It creates one passwordless-sudo user with the given
        key, disables SSH password auth, and requests a package index update.
    """
    # The indentation is significant YAML: the user's attributes must sit under the
    # `- name:` list item, and the key must be a list item under ssh_authorized_keys.
    return (
        "#cloud-config\n"
        "users:\n"
        f"  - name: {ansible_user}\n"
        "    sudo: 'ALL=(ALL) NOPASSWD:ALL'\n"
        "    shell: /bin/bash\n"
        "    ssh_authorized_keys:\n"
        f"      - {ssh_pubkey}\n"
        "ssh_pwauth: false\n"
        "package_update: true\n"
    )
|
||||
|
||||
|
||||
def cert_file(tier):
    """Return the path of the extra-vars file for a certificate tier.

    Raises:
        ValueError: if `tier` is not one of VALID_TIERS.
    """
    if tier in VALID_TIERS:
        return CERT_DIR / f"{tier}.yml"
    raise ValueError(f"unknown cert tier: {tier}")
|
||||
|
||||
|
||||
def profile_path(host):
    """Return the path of the JSON profile for `host` inside PROFILE_DIR."""
    filename = f"{host}.json"
    return PROFILE_DIR.joinpath(filename)
|
||||
|
||||
|
||||
def render_run_hosts(name, ip, ansible_user, groups):
    """Render the transient single-host Ansible inventory as YAML text.

    Args:
        name: inventory hostname (the VM's domain name).
        ip: address written as ansible_host.
        ansible_user: login user written as ansible_user.
        groups: group names the host is placed in; duplicates are dropped,
            first-seen order is kept.

    Returns:
        The YAML document, newline-terminated.
    """
    lines = [
        "---",
        "# Generated by scripts/integration-vm.py — transient, gitignored. Do not edit.",
        "# Single test host ONLY (safety invariant: no real host is ever in scope).",
        "all:",
        "  children:",
    ]
    # Indentation is significant YAML: each level nests two spaces deeper so the host
    # lives under <group>.hosts and its variables live under the host.
    for g in dict.fromkeys(groups):
        lines += [
            f"    {g}:",
            "      hosts:",
            f"        {name}:",
            f"          ansible_host: {ip}",
            f"          ansible_user: {ansible_user}",
            # Integration VMs reuse IPs; bypass host-key caching so stale
            # known_hosts entries (from prior runs with a different VM at
            # the same IP) do not block the Ansible apply step.
            "          ansible_ssh_common_args: >-",
            "            -o StrictHostKeyChecking=no",
            "            -o UserKnownHostsFile=/dev/null",
        ]
    return "\n".join(lines) + "\n"
|
||||
|
||||
|
||||
def sh(cmd, check=True, capture=False, **kw):
    """Run a command given in list form and log it to stderr first.

    Args:
        cmd: argv sequence; elements are stringified for the log line only.
        check: passed to subprocess.run (raise CalledProcessError on non-zero exit).
        capture: when true, stdout/stderr are captured as text on the result.
        **kw: forwarded to subprocess.run (e.g. cwd, env).

    Returns:
        The subprocess.CompletedProcess.
    """
    import shlex  # local import: only needed for the log line

    # Shell-quote each argument so the logged line is unambiguous and copy-pasteable
    # even when an argument contains spaces (e.g. a remote ssh command string).
    print("+ " + shlex.join(str(c) for c in cmd), file=sys.stderr)
    return subprocess.run(cmd, check=check,
                          capture_output=capture, text=True, **kw)
|
||||
|
||||
|
||||
def _expected_sha(sha_text, filename):
|
||||
for line in sha_text.splitlines():
|
||||
parts = line.split()
|
||||
if len(parts) == 2 and parts[1].lstrip("*") == filename:
|
||||
return parts[0]
|
||||
return None
|
||||
|
||||
|
||||
def ensure_image():
    """Return the path of the cached golden image, downloading it on first use.

    If the image is not yet in CACHE_DIR it is downloaded to a ``.part`` file,
    verified against the published SHA512SUMS, and only then renamed into place.

    Returns:
        pathlib.Path of the verified image.

    Raises:
        SystemExit: if no checksum is published for the image or the digest differs.
    """
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    img = CACHE_DIR / IMAGE_NAME
    if img.exists():
        return img
    print(f"Downloading {IMAGE_URL} ...", file=sys.stderr)
    tmp = img.with_suffix(".part")
    try:
        urllib.request.urlretrieve(IMAGE_URL, tmp)
        # Close the HTTP response deterministically instead of leaking the socket.
        with urllib.request.urlopen(SHA_URL) as resp:
            sha_text = resp.read().decode()
        want = _expected_sha(sha_text, IMAGE_NAME)
        if not want:
            raise SystemExit(f"checksum for {IMAGE_NAME} not found at {SHA_URL}")
        h = hashlib.sha512()
        with open(tmp, "rb") as fh:
            for chunk in iter(lambda: fh.read(1 << 20), b""):
                h.update(chunk)
        if h.hexdigest() != want:
            raise SystemExit("golden image SHA512 mismatch — refusing to use it")
        tmp.rename(img)
    finally:
        # On any failure (network error, interrupt, bad checksum) drop the partial
        # download; after a successful rename the file is gone and this is a no-op.
        tmp.unlink(missing_ok=True)
    return img
|
||||
|
||||
|
||||
def net_ensure():
    """Make sure the NET_NAME libvirt network is defined and active.

    An unknown network is defined from NET_XML and marked autostart; an inactive
    one is started.
    """
    probe = sh(["virsh", "net-info", NET_NAME], check=False, capture=True)
    known = probe.returncode == 0
    if not known:
        definition = RUN_DIR / "net.xml"
        RUN_DIR.mkdir(parents=True, exist_ok=True)
        definition.write_text(NET_XML)
        for verb, arg in (("net-define", str(definition)), ("net-autostart", NET_NAME)):
            sh(["virsh", verb, arg])
    info = sh(["virsh", "net-info", NET_NAME], capture=True).stdout
    if re.search(r"Active:\s+yes", info) is None:
        sh(["virsh", "net-start", NET_NAME])
|
||||
|
||||
|
||||
def _ssh_pubkey():
    """Return the caller's SSH public key text (ed25519 preferred, then RSA).

    Raises:
        SystemExit: if neither key file exists in ~/.ssh.
    """
    candidates = (pathlib.Path.home() / ".ssh" / fname
                  for fname in ("id_ed25519.pub", "id_rsa.pub"))
    chosen = next((path for path in candidates if path.exists()), None)
    if chosen is None:
        raise SystemExit("no SSH public key found in ~/.ssh")
    return chosen.read_text().strip()
|
||||
|
||||
|
||||
def up(host, name=None, mem_mib=DEFAULT_MEM_MIB, vcpus=DEFAULT_VCPUS):
    """Create and boot one integration VM, then wait until it is reachable.

    Args:
        host: profile/host label; used to derive the VM name and recorded in the
            ``current`` marker file.
        name: explicit libvirt domain name; generated via vm_name(host) when omitted.
        mem_mib: guest memory in MiB.
        vcpus: guest vCPU count.

    Returns:
        (name, ip) of the running VM.

    Raises:
        SystemExit: if free memory is below MIN_FREE_MIB, an integration VM is
            already running, or the VM never gets an address / SSH.
    """
    free = free_mib(pathlib.Path("/proc/meminfo").read_text())
    if free < MIN_FREE_MIB:
        raise SystemExit(f"refusing to start: only {free} MiB free (< {MIN_FREE_MIB})")
    running = sh(["virsh", "list", "--name"], capture=True).stdout.split()
    if any(n.startswith(NAME_PREFIX) for n in running):
        raise SystemExit("an integration VM is already running (one at a time); "
                         "run `integration-vm prune` first")
    name = name or vm_name(host)
    img = ensure_image()
    net_ensure()
    RUN_DIR.mkdir(parents=True, exist_ok=True)
    # VM disk/seed/console must live where the SYSTEM hypervisor (libvirt-qemu) can reach
    # them — NOT under the repo/home (qemu cannot traverse /home/claude). CACHE_DIR is
    # group-libvirt + world-traversable (created by the integration_test role).
    overlay = CACHE_DIR / f"{name}.qcow2"
    sh(["qemu-img", "create", "-f", "qcow2", "-F", "qcow2", "-b", str(img), str(overlay)])
    (RUN_DIR / "user-data").write_text(render_user_data(_ssh_pubkey(), "ansible"))
    # cloud-init rejects underscores in local-hostname (causes init-local to skip
    # writing the network config → VM never gets a DHCP lease). Sanitize VM name
    # for use as hostname without affecting disk paths or virsh domain names.
    (RUN_DIR / "meta-data").write_text(render_meta_data(f"iid-{name}", name.replace("_", "-")))
    seed = CACHE_DIR / f"{name}-seed.img"
    # Force DHCP on the VM NIC — don't rely on the genericcloud image's network fallback.
    # Use explicit renderer + interface name to avoid a netplan 1.1.2 generation issue:
    # `match.name: en*` with a named key (e.g. `primary`) produces a .network file that
    # networkd loads but never DHCPs (no DHCP4 messages, just IPv6LL). Using the real
    # interface name `enp1s0` (all virtio NICs in these KVM VMs are named enp1s0) and
    # `renderer: networkd` bypasses the bug.
    # The nesting below is significant YAML: enp1s0 is a child of ethernets and dhcp4
    # is a child of enp1s0.
    (RUN_DIR / "network-config").write_text(
        'version: 2\n'
        'renderer: networkd\n'
        'ethernets:\n'
        '  enp1s0:\n'
        '    dhcp4: true\n')
    sh(["cloud-localds", "--network-config", str(RUN_DIR / "network-config"),
        str(seed), str(RUN_DIR / "user-data"), str(RUN_DIR / "meta-data")])
    console = CACHE_DIR / f"{name}-console.log"
    # virt-install has a `#!/usr/bin/env python3` shebang; the Makefile prepends .venv/bin to
    # PATH (so the venv's ansible tools resolve), which would hijack virt-install into the
    # isolated venv — it lacks system PyGObject (`gi`) and crashes. Strip the venv from PATH
    # for this system tool so its shebang finds /usr/bin/python3 (which has gi). Ansible is
    # invoked via its absolute .venv path elsewhere, so it is unaffected.
    sys_path = ":".join(p for p in os.environ.get("PATH", "").split(":")
                        if "/.venv/bin" not in p)
    sh(["virt-install", "--name", name, "--memory", str(mem_mib), "--vcpus", str(vcpus),
        "--boot", "uefi",  # genericcloud triple-faults on legacy BIOS handoff; UEFI boots
        "--import",
        "--disk", f"path={overlay},format=qcow2",
        "--disk", f"path={seed},device=cdrom",
        "--network", f"network={NET_NAME}",
        "--osinfo", "debian13",
        "--graphics", "none",
        "--serial", f"file,path={console}",
        "--noautoconsole"],
       env=dict(os.environ, PATH=sys_path))
    ip = wait_for_ip(name)
    wait_for_ssh(ip, "ansible")
    # Block until cloud-init finishes (incl. apt-get update) so apply sees a ready system.
    sh(["ssh", "-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null",
        f"ansible@{ip}", "sudo cloud-init status --wait"], check=False)
    # Marker file read by the other subcommands: name, ip, host on separate lines.
    (RUN_DIR / "current").write_text(f"{name}\n{ip}\n{host}\n")
    print(f"VM {name} up at {ip}")
    return name, ip
|
||||
|
||||
|
||||
def wait_for_ip(name, timeout=120):
    """Poll libvirt until the domain `name` reports an IPv4 address and return it.

    Raises:
        SystemExit: if no address appears within `timeout` seconds.
    """
    # Try --source lease first (fastest when leaseshelper works), then fall back to
    # --source arp (reads the host neighbour/ARP table — no privileged helper needed,
    # populated once the VM sends traffic). Both sources produce identical output that
    # parse_lease_ip handles, so this removes the leaseshelper/suid dependency.
    deadline = time.time() + timeout
    while time.time() < deadline:
        for source in ("lease", "arp"):
            probe = sh(["virsh", "domifaddr", name, "--source", source],
                       check=False, capture=True)
            address = parse_lease_ip(probe.stdout)
            if address:
                return address
        time.sleep(4)
    raise SystemExit(f"timed out waiting for {name} to get a DHCP lease — "
                     "VM left defined; run `integration-vm prune` to remove it")
|
||||
|
||||
|
||||
def wait_for_ssh(ip, user, timeout=180):
    """Poll until `ssh user@ip true` succeeds.

    Raises:
        SystemExit: if SSH is still unreachable after `timeout` seconds.
    """
    probe_cmd = [
        "ssh",
        "-o", "StrictHostKeyChecking=no",
        "-o", "UserKnownHostsFile=/dev/null",
        "-o", "ConnectTimeout=5",
        f"{user}@{ip}",
        "true",
    ]
    deadline = time.time() + timeout
    while time.time() < deadline:
        attempt = sh(probe_cmd, check=False, capture=True)
        if attempt.returncode == 0:
            return
        time.sleep(5)
    raise SystemExit(f"timed out waiting for SSH to {ip} — "
                     "VM left defined; run `integration-vm prune` to remove it")
|
||||
|
||||
|
||||
def _read_current():
    """Read the ``current`` marker file and return (name, ip, host).

    Raises:
        SystemExit: if the marker is missing (no VM has been brought up) or does
            not contain the three expected lines.
    """
    marker = RUN_DIR / "current"
    try:
        txt = marker.read_text().splitlines()
    except FileNotFoundError:
        # Fail with guidance, like the rest of the driver, instead of a traceback.
        raise SystemExit(f"no current integration VM ({marker} not found); "
                         "run `integration-vm up` first") from None
    if len(txt) < 3:
        raise SystemExit(f"{marker} is malformed (expected name, ip and host lines); "
                         "run `integration-vm prune` to reset")
    return txt[0], txt[1], txt[2]  # name, ip, host
|
||||
|
||||
|
||||
def write_run_inventory(name, ip, groups):
    """Write the transient inventory and (re)point its group_vars symlink.

    ``hosts.yml`` is rendered into RUN_DIR, and ``RUN_DIR/group_vars`` is made a
    symlink to the production group_vars directory.

    Raises:
        SystemExit: if ``group_vars`` exists in RUN_DIR but is not a symlink.
    """
    RUN_DIR.mkdir(parents=True, exist_ok=True)
    inventory_text = render_run_hosts(name, ip, "ansible", groups)
    (RUN_DIR / "hosts.yml").write_text(inventory_text)

    link = RUN_DIR / "group_vars"
    if link.is_symlink():
        # Replace a symlink left by a previous run.
        link.unlink()
    else:
        if link.exists():
            raise SystemExit(f"{link} exists and is not a symlink; remove it manually")
    link.symlink_to(REPO_ROOT / "inventories" / "production" / "group_vars")
|
||||
|
||||
|
||||
def apply(host, certs):
    """Run every playbook step of `host`'s profile against the current VM.

    Args:
        host: profile name (resolved via profile_path).
        certs: certificate tier; its vars file is passed last via ``-e @``.
    """
    name, ip, _ = _read_current()
    profile = json.loads(profile_path(host).read_text())
    write_run_inventory(name, ip, profile["groups"])

    # Extra-vars files: the profile's overlays first, then the cert-tier file.
    var_files = [INTEG_DIR / rel for rel in profile.get("extra_vars_files", [])]
    var_files.append(cert_file(certs))
    extra_args = [token for vf in var_files for token in ("-e", f"@{vf}")]

    for step in profile["applies"]:
        argv = [".venv/bin/ansible-playbook", "-i", str(RUN_DIR / "hosts.yml"),
                f"playbooks/{step['playbook']}", "--limit", name]
        tags = step.get("tags")
        if tags:
            argv += ["--tags", ",".join(tags)]
        sh(argv + extra_args, cwd=str(REPO_ROOT))
    print(f"applied {host} profile to {name}")
|
||||
|
||||
|
||||
def _boot_id(ip, user):
    """Return the guest's kernel boot_id read over SSH, or None if SSH fails."""
    remote = "cat /proc/sys/kernel/random/boot_id"
    result = sh(
        ["ssh",
         "-o", "StrictHostKeyChecking=no",
         "-o", "UserKnownHostsFile=/dev/null",
         "-o", "ConnectTimeout=5",
         f"{user}@{ip}", remote],
        check=False, capture=True)
    if result.returncode != 0:
        return None
    return result.stdout.strip()
|
||||
|
||||
|
||||
def wait_for_reboot(ip, user, before_boot_id, timeout=240):
    """Confirm a REAL reboot: SSH back up AND boot_id changed (not the pre-reboot sshd).

    Raises:
        SystemExit: if no different boot_id is observed within `timeout` seconds.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        current = _boot_id(ip, user)
        rebooted = bool(current) and current != before_boot_id
        if rebooted:
            return
        time.sleep(5)
    raise SystemExit(f"timed out waiting for {ip} to reboot (boot_id unchanged) — "
                     "VM left defined; run `integration-vm prune` to remove it")
|
||||
|
||||
|
||||
def reboot_vm():
    """Reboot the current VM via virsh and wait until its boot_id has changed."""
    name, ip, _ = _read_current()
    login = "ansible"
    boot_id_before = _boot_id(ip, login)
    sh(["virsh", "reboot", name])
    wait_for_reboot(ip, login, boot_id_before)
    print(f"{name} rebooted (boot_id changed), SSH back at {ip}")
|
||||
|
||||
|
||||
def run_assert(host, certs):
    """Run tests/integration/verify.yml against the current VM.

    On failure, diagnostics are collected before exiting.

    Raises:
        SystemExit: if the verify playbook exits non-zero.
    """
    name, ip, _ = _read_current()
    profile = json.loads(profile_path(host).read_text())
    write_run_inventory(name, ip, profile["groups"])

    # Same extra-vars layering as apply(): profile overlays, then the cert tier.
    var_files = [INTEG_DIR / rel for rel in profile.get("extra_vars_files", [])]
    var_files.append(cert_file(certs))
    extra_args = [token for vf in var_files for token in ("-e", f"@{vf}")]

    argv = [".venv/bin/ansible-playbook", "-i", str(RUN_DIR / "hosts.yml"),
            "tests/integration/verify.yml", "--limit", name]
    outcome = sh(argv + extra_args, cwd=str(REPO_ROOT), check=False)
    if outcome.returncode == 0:
        print(f"VERIFY PASSED for {name}")
        return
    dump_diagnostics(name, ip)
    raise SystemExit(f"VERIFY FAILED for {name} — diagnostics in {DIAG_ROOT}")
|
||||
|
||||
|
||||
def dump_diagnostics(name, ip):
    """Collect a diagnostics bundle for VM `name` into DIAG_ROOT/<name>.

    Each remote command's stdout+stderr goes to ``<label>.txt``; the serial
    console log, if present, is copied to ``console.log``.
    """
    bundle = DIAG_ROOT / name
    bundle.mkdir(parents=True, exist_ok=True)

    remote_commands = {
        "nft": "nft list ruleset",
        "docker": "docker ps -a",
        "ss": "ss -tlnp",
        "journal": "journalctl -b --no-pager",
        "critical-chain": "systemd-analyze critical-chain",
    }
    ssh_prefix = ["ssh", "-o", "StrictHostKeyChecking=no",
                  "-o", "UserKnownHostsFile=/dev/null",
                  f"ansible@{ip}"]
    for label, remote in remote_commands.items():
        result = sh(ssh_prefix + ["sudo " + remote], check=False, capture=True)
        combined = (result.stdout or "") + (result.stderr or "")
        (bundle / f"{label}.txt").write_text(combined)

    serial_log = CACHE_DIR / f"{name}-console.log"
    if serial_log.exists():
        # The serial log is root:0600 (libvirt-created); read it via sudo (ADR-015: the
        # claude worker has sudo) and write a worker-owned copy into the bundle.
        dumped = sh(["sudo", "cat", str(serial_log)], check=False, capture=True)
        (bundle / "console.log").write_text(dumped.stdout or "")
    print(f"diagnostics written to {bundle}", file=sys.stderr)
|
||||
|
||||
|
||||
def _destroy(name):
    """Best-effort teardown of domain `name` and its files.

    Stops and undefines the domain (failures ignored), then deletes every file
    in RUN_DIR and CACHE_DIR whose name starts with `name`.
    """
    for argv in (["virsh", "destroy", name],
                 ["virsh", "undefine", name, "--nvram"]):
        sh(argv, check=False)
    leftovers = (path
                 for directory in (RUN_DIR, CACHE_DIR)
                 for path in directory.glob(f"{name}*"))
    for path in leftovers:
        path.unlink(missing_ok=True)
|
||||
|
||||
|
||||
def down(host=None, keep=False):
    """Destroy the VM recorded in the ``current`` marker, unless `keep` is set.

    `host` is accepted for CLI symmetry and is not used. Does nothing when no
    marker file exists.
    """
    if keep:
        print("--keep: leaving the VM running for inspection")
        return
    marker = RUN_DIR / "current"
    if not marker.exists():
        return
    name = marker.read_text().splitlines()[0]
    _destroy(name)
    marker.unlink(missing_ok=True)
    print(f"destroyed {name}")
|
||||
|
||||
|
||||
def prune():
    """Destroy every libvirt domain (running or not) whose name starts with NAME_PREFIX.

    Also removes the ``current`` marker file.
    """
    domains = sh(["virsh", "list", "--all", "--name"], capture=True).stdout.split()
    for domain in (d for d in domains if d.startswith(NAME_PREFIX)):
        _destroy(domain)
        print(f"pruned {domain}")
    (RUN_DIR / "current").unlink(missing_ok=True)
|
||||
|
||||
|
||||
def console():
    """Print the current VM's serial console log (read via sudo), or say it is absent."""
    name = (RUN_DIR / "current").read_text().splitlines()[0]
    log = CACHE_DIR / f"{name}-console.log"
    if not log.exists():
        print(f"no console log at {log}")
        return
    dumped = sh(["sudo", "cat", str(log)], check=False, capture=True)
    print(dumped.stdout or "")
|
||||
|
||||
|
||||
def cycle(host, certs, keep=False, no_reboot=False):
    """Full run: up, apply, optional reboot, verify, then tear down on success.

    On any failure the VM is left up for inspection and the error propagates.
    On success the VM is destroyed unless `keep` is set.
    """
    succeeded = False
    try:
        up(host)
        apply(host, certs)
        if not no_reboot:
            reboot_vm()
        run_assert(host, certs)
        succeeded = True
    finally:
        if not succeeded:
            print("FAILED — VM left up for inspection; `integration-vm prune` to clean.",
                  file=sys.stderr)
        elif not keep:
            down(host)
|
||||
|
||||
|
||||
# Subcommand name -> handler taking the parsed argparse namespace. The return value
# becomes the process exit status, so "up" discards its (name, ip) tuple and
# yields None.
DISPATCH = dict(
    [
        ("up", lambda a: up(a.host) and None),
        ("apply", lambda a: apply(a.host, a.certs)),
        ("reboot", lambda a: reboot_vm()),
        ("assert", lambda a: run_assert(a.host, a.certs)),
        ("down", lambda a: down(a.host, a.keep)),
        ("console", lambda a: console()),
        ("prune", lambda a: prune()),
        ("cycle", lambda a: cycle(a.host, a.certs, a.keep, a.no_reboot)),
    ]
)
|
||||
|
||||
|
||||
def main(argv=None):
    """Parse the command line and run the selected subcommand.

    Every subcommand except ``prune`` takes --host (required), --certs, --keep
    and --no-reboot. Returns whatever the subcommand handler returns.
    """
    parser = argparse.ArgumentParser(prog="integration-vm", description=__doc__)
    commands = parser.add_subparsers(dest="cmd", required=True)
    for verb in ("up", "apply", "reboot", "assert", "cycle", "down", "console"):
        verb_parser = commands.add_parser(verb)
        verb_parser.add_argument("--host", required=True)
        verb_parser.add_argument("--certs", choices=VALID_TIERS, default="internal")
        verb_parser.add_argument("--keep", action="store_true")
        verb_parser.add_argument("--no-reboot", action="store_true")
    commands.add_parser("prune")
    parsed = parser.parse_args(argv)
    handler = DISPATCH[parsed.cmd]
    return handler(parsed)
|
||||
|
||||
|
||||
if __name__ == "__main__": # pragma: no cover
|
||||
sys.exit(main())
|
||||
|
|
@ -1,32 +0,0 @@
|
|||
#!/usr/bin/env bash
#
# Log the local Docker daemon into the Forgejo container registry using a token stored in
# the Ansible vault — so registry pushes (make caddy-image-push / molecule-image-push) are
# agent-completable non-interactively, like every other vault-backed action.
# (2026-06-17 kaizen, docs/FRICTION.md: the push half silently needed an interactive
# `docker login`; the creds weren't in the vault, so an agent couldn't complete a push.)
#
# Reads vault.forgejo.registry_token from the vault (rbw must be unlocked) and pipes it to
# `docker login --password-stdin`. The token never lands on argv or on disk and is never
# echoed (no `set -x`). Binaries/paths are overridable via env so the Makefile can pass the
# venv ansible-vault/python; defaults work when run from the repo root with the venv present.
#
set -euo pipefail

# Env-overridable settings; each default applies when the variable is unset or empty.
: "${ANSIBLE_VAULT:=.venv/bin/ansible-vault}"
: "${PYTHON:=.venv/bin/python}"
: "${VAULT:=inventories/production/group_vars/all/vault.yml}"
: "${REGISTRY_HOST:=forgejo.nyumbani.baobab.band}"
: "${REGISTRY_USER:=sjat}"

# Extracts vault.forgejo.registry_token from the decrypted YAML on stdin; prints an
# empty string (no trailing newline) when any level of the path is missing.
extract_token_py='import sys, yaml; d = yaml.safe_load(sys.stdin) or {}; print((((d.get("vault") or {}).get("forgejo") or {}).get("registry_token")) or "", end="")'

token="$("$ANSIBLE_VAULT" view "$VAULT" | "$PYTHON" -c "$extract_token_py")"

if [[ -z "$token" || "$token" == "CHANGEME" ]]; then
  {
    printf '%s\n' "registry-login: vault.forgejo.registry_token is unset or still CHANGEME."
    printf '%s\n' " Mint a Forgejo token (Settings -> Applications -> Generate Token, with package"
    printf '%s\n' " read+write scope, user $REGISTRY_USER) and set it via: make edit-vault"
  } >&2
  exit 1
fi

# Token goes over stdin only — never on argv.
printf '%s' "$token" | docker login "$REGISTRY_HOST" -u "$REGISTRY_USER" --password-stdin
|
||||
|
|
@ -41,42 +41,6 @@ LIST_ITEM_RE = re.compile(r"^\s*(\d+\.|[-*+])\s+(.*)")
|
|||
DEFER_REF_RE = re.compile(r"ADR-(\d{3})\D{0,40}?deferred\D{0,12}?(\d+)", re.I)
|
||||
RESOLVE_WORD_RE = re.compile(r"\b(?:resolv\w*|decid\w*|address\w*|complet\w*|done)\b", re.I)
|
||||
|
||||
# Rename-incomplete detection: an ADR announces a rename/supersession of a named
|
||||
# term (Old → New); verify the OLD name no longer lingers in the design-doc set.
|
||||
# (The structural cousin of stale-deferred — see docs/FRICTION.md, ADR-024.)
|
||||
# A "specific" name is a backticked token or a capitalised proper-noun/identifier;
|
||||
# common connective words are rejected so they can't be mistaken for a tool name.
|
||||
# Shape of a "specific" name: a backticked token, or a capitalised identifier of at
# least three characters. The leading [A-Z] is only meaningful when the pattern is
# matched case-SENSITIVELY, so the trigger regexes below must not use a global re.I.
_NAME = r"(?:`[^`]+`|[A-Z][A-Za-z0-9_+.-]{2,})"
RENAME_STOPWORDS = {
    "was", "were", "the", "this", "that", "with", "from", "into", "and", "but",
    "for", "are", "has", "had", "been", "now", "not", "all", "any", "use", "used",
    "via", "per", "its", "our", "one", "two", "old", "new", "phase", "step",
    "adr", "read", "name", "term", "tool", "prose", "roadmap",
}
# Trigger forms — each captures (old, new) as raw name tokens; the connective words
# are case-insensitive but the names must still satisfy _NAME (specific tokens).
# Case-insensitivity is therefore scoped to the connectives with (?i:...) rather than
# applied to the whole pattern, which would let [A-Z] match lowercase words.
RENAME_ASSERT_RES = (
    # renamed X to Y
    re.compile(rf"(?i:renamed)\s+(?:(?i:from)\s+)?({_NAME})\s+(?i:to)\s+({_NAME})"),
    # replaced X with Y
    re.compile(rf"(?i:replac)\w*\s+({_NAME})\s+(?i:with)\s+({_NAME})"),
    # superseded X with/by Y
    re.compile(rf"(?i:supersed)\w*\s+({_NAME})\s+(?i:with|by)\s+({_NAME})"),
    # X ... (is/are/was/were/been) updated to read Y
    re.compile(rf"({_NAME})\b.{{0,40}}?\b(?i:is|are|was|were|been)?\s*"
               rf"(?i:updated\s+to\s+read)\s+[\"']?({_NAME})"),
    # X → Y / X -> Y on a line that also carries a rename/supersede/update cue
    re.compile(rf"({_NAME})\s*(?:->|→)\s*({_NAME})"),
)
|
||||
RENAME_ARROW_RES = (RENAME_ASSERT_RES[-1],) # arrow forms need a cue word on the line
|
||||
RENAME_CUE_RE = re.compile(r"\b(?:renam\w*|replac\w*|supersed\w*|updated|rename)\b", re.I)
|
||||
# Historical / negation cues — a lingering OLD name on such a line is legitimate
|
||||
# history, not a missed ripple edit, so it is skipped.
|
||||
RENAME_HIST_RE = re.compile(
|
||||
r"\b(?:was|were|formerly|previously|no longer|instead of|rather than|reject\w*|"
|
||||
r"reconsider\w*|supersed\w*|deprecat\w*|legacy|history|heritage|V4|"
|
||||
r"actually ran|used to)\b", re.I)
|
||||
|
||||
# ADR-structure check (ADR-023): numbered ADRs must carry the four mandatory
|
||||
# sections and a parseable Status line. Presence only — section ORDER is a
|
||||
# template-demonstrated convention, not machine-enforced.
|
||||
|
|
@ -178,84 +142,6 @@ def adr_structure_findings(adr_files):
|
|||
return out
|
||||
|
||||
|
||||
def _clean_name(tok):
|
||||
"""Strip backticks/quotes from a captured name token. Return the bare name, or
|
||||
None if it is not a 'specific' token (empty, multi-word, or a stopword)."""
|
||||
s = tok.strip().strip("`\"'").strip()
|
||||
s = s.rstrip(".,;:!?)") # trailing sentence punctuation is not part of the name
|
||||
if not s or " " in s:
|
||||
return None
|
||||
if s.lower() in RENAME_STOPWORDS:
|
||||
return None
|
||||
# An ADR reference (ADR-017) is a document pointer, never the renamed *term* — a
|
||||
# sentence like "the ADR-017 prose ... is updated to read Caddy" must not parse
|
||||
# ADR-017 as the old name. Reject it so such lines skip (precision >> recall).
|
||||
if re.fullmatch(r"ADR-\d{3}", s):
|
||||
return None
|
||||
# Must be backtick-able identifier or a capitalised proper noun (the _NAME shape
|
||||
# already enforced this on capture; this is the after-stripping re-check).
|
||||
if not re.fullmatch(r"[A-Za-z0-9_+.-]{3,}", s):
|
||||
return None
|
||||
return s
|
||||
|
||||
|
||||
def _rename_assertion(line):
    """Extract a tight Old→New rename assertion from one ADR line.

    Returns (old, new) as cleaned specific names, or None. The trigger patterns
    are tried in order and the first one yielding two distinct valid names wins.
    Conservative by design: precision >> recall.
    """
    for pattern in RENAME_ASSERT_RES:
        hit = pattern.search(line)
        if hit is None:
            continue
        # Arrow form only counts when the line also carries a rename/supersede cue.
        is_arrow = pattern in RENAME_ARROW_RES
        if is_arrow and not RENAME_CUE_RE.search(line):
            continue
        old, new = (_clean_name(group) for group in hit.group(1, 2))
        if old and new and old != new:
            return old, new
    return None
|
||||
|
||||
|
||||
def rename_incomplete_findings(adr_files, extra_docs):
    """Flag design-doc lines where a renamed term's OLD name still lingers.

    Args:
        adr_files: {rel_path: [lines]} for docs/decisions/*.md (the numbered ADRs
            make the assertions).
        extra_docs: {rel_path: [lines]} for CAPABILITIES.md / ROADMAP.md.

    Returns:
        A list of finding dicts (check, severity, path, line, detail). When a
        numbered ADR announces a rename 'Old' -> 'New', any design-doc line where
        'Old' still appears as a whole token is flagged, skipping the announcing
        ADR, lines that also name 'New', and lines carrying a historical/negation cue.
    """
    out = []
    # The design-doc set we search: all decisions/*.md plus the two extra docs.
    doc_set = dict(adr_files)
    doc_set.update(extra_docs)
    # Collect assertions only from numbered ADRs (NNN-*.md).
    assertions = []  # (adr_num, announcer_path, old, new)
    for rpath, lines in sorted(adr_files.items()):
        base = os.path.basename(rpath)
        if not ADR_FILE_RE.match(base):
            continue
        adr_num = base[:3]
        for line in lines:
            parsed = _rename_assertion(line)
            if parsed:
                assertions.append((adr_num, rpath, parsed[0], parsed[1]))
    for adr_num, announcer, old, new in assertions:
        # Case-sensitive whole-token match. Lookarounds are used instead of \b because
        # a cleaned name may start or end with a non-word character (e.g. `--flag`,
        # `C++`), where \b can never match; for names edged by word characters the
        # two forms are equivalent.
        old_re = re.compile(r"(?<!\w)" + re.escape(old) + r"(?!\w)")
        for rpath, lines in sorted(doc_set.items()):
            if rpath == announcer:  # the ADR that made the claim is exempt
                continue
            for i, raw in enumerate(lines, 1):
                if not old_re.search(raw):
                    continue
                if new in raw:  # rename is being explained on this line
                    continue
                if RENAME_HIST_RE.search(raw):  # legitimate history / negation
                    continue
                out.append({"check": "rename-incomplete", "severity": "medium",
                            "path": rpath, "line": i,
                            "detail": f"ADR-{adr_num} announced rename '{old}' -> "
                                      f"'{new}' but '{old}' still appears here; confirm the "
                                      "ripple edit landed or soften the ADR claim"})
    return out
|
||||
|
||||
|
||||
def walk_files():
|
||||
for dirpath, dirnames, filenames in os.walk(ROOT):
|
||||
dirnames[:] = [d for d in dirnames if d not in PRUNE]
|
||||
|
|
@ -306,11 +192,8 @@ def scan():
|
|||
findings = []
|
||||
adrs = adr_numbers()
|
||||
adr_files = {} # docs/decisions/*.md → lines, for deferred-section parsing
|
||||
extra_docs = {} # CAPABILITIES.md / ROADMAP.md → lines, for rename-incomplete
|
||||
defer_refs = [] # repo-wide "resolves ADR-NNN deferred #K" references
|
||||
decisions_dir = os.path.join("docs", "decisions")
|
||||
rename_extra = {os.path.join("docs", "CAPABILITIES.md"),
|
||||
os.path.join("docs", "ROADMAP.md")}
|
||||
for path in walk_files():
|
||||
rpath = rel(path)
|
||||
if rpath.startswith(SKIP_PREFIX):
|
||||
|
|
@ -340,8 +223,6 @@ def scan():
|
|||
|
||||
if rpath.startswith(decisions_dir) and rpath.endswith(".md"):
|
||||
adr_files[rpath] = lines
|
||||
if rpath in rename_extra:
|
||||
extra_docs[rpath] = lines
|
||||
|
||||
for i, line in enumerate(lines, 1):
|
||||
for m in DEFER_REF_RE.finditer(line):
|
||||
|
|
@ -380,7 +261,6 @@ def scan():
|
|||
"line": i, "detail": f"references '{ref}' which does not exist"})
|
||||
findings.extend(deferred_findings(adr_files, defer_refs))
|
||||
findings.extend(adr_structure_findings(adr_files))
|
||||
findings.extend(rename_incomplete_findings(adr_files, extra_docs))
|
||||
return findings
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ module "askari" {
|
|||
location = "hel1" # Helsinki
|
||||
image = "debian-13"
|
||||
ansible_ssh_pubkey = var.ansible_ssh_pubkey
|
||||
ssh_admin_cidrs = ["91.226.145.80/32"] # TEMP (incident recovery 2026-06-17): re-open WAN :22 to ubongo only; re-close once the firewall/Docker + boot-race issues are fixed
|
||||
ssh_admin_cidrs = var.ssh_admin_cidrs
|
||||
public_web = true # Caddy 80/443 + NetBird 3478 (M4)
|
||||
labels = {
|
||||
env = "offsite"
|
||||
|
|
|
|||
|
|
@ -26,17 +26,12 @@ resource "hcloud_ssh_key" "ansible" {
|
|||
resource "hcloud_firewall" "this" {
|
||||
name = "${var.name}-fw"
|
||||
|
||||
# SSH from the control node only — and only when admin CIDRs are set. An empty
|
||||
# ssh_admin_cidrs removes the WAN :22 rule entirely (mesh-only SSH; reach the host over
|
||||
# wt0, break-glass = Hetzner console). Mesh-hardening 1/3.
|
||||
dynamic "rule" {
|
||||
for_each = length(var.ssh_admin_cidrs) > 0 ? [1] : []
|
||||
content {
|
||||
direction = "in"
|
||||
protocol = "tcp"
|
||||
port = "22"
|
||||
source_ips = var.ssh_admin_cidrs
|
||||
}
|
||||
# SSH from the control node only.
|
||||
rule {
|
||||
direction = "in"
|
||||
protocol = "tcp"
|
||||
port = "22"
|
||||
source_ips = var.ssh_admin_cidrs
|
||||
}
|
||||
|
||||
# Public web (Caddy 80/443) + NetBird STUN/TURN (3478/udp) — only when public_web
|
||||
|
|
|
|||
|
|
@ -24,9 +24,8 @@ variable "ansible_ssh_pubkey" {
|
|||
}
|
||||
|
||||
variable "ssh_admin_cidrs" {
|
||||
description = "Source CIDRs allowed to reach SSH over the WAN. Empty = no WAN SSH rule (mesh-only)."
|
||||
description = "Source CIDRs allowed to reach SSH (e.g. ubongo's address/32)"
|
||||
type = list(string)
|
||||
default = []
|
||||
}
|
||||
|
||||
variable "public_web" {
|
||||
|
|
|
|||
|
|
@ -1,2 +0,0 @@
|
|||
---
|
||||
reverse_proxy__tls_internal: true
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
---
|
||||
# On-demand only. Records an accepted risk (ADR-025 / accepted-risks.md): the prod
|
||||
# Gandi PAT reaches an ephemeral VM and transient TXT records land in the real wingu.me.
|
||||
reverse_proxy__tls_internal: false
|
||||
reverse_proxy__acme_dns_provider: gandi
|
||||
reverse_proxy__acme_ca: ""
|
||||
|
|
@ -1,4 +0,0 @@
|
|||
---
|
||||
reverse_proxy__tls_internal: false
|
||||
reverse_proxy__acme_dns_provider: gandi
|
||||
reverse_proxy__acme_ca: "https://acme-staging-v02.api.letsencrypt.org/directory"
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
---
|
||||
# Integration-test overlay for the "askari" profile (ADR-025). Passed via `-e @`.
|
||||
# Reproduces the 2026-06-17 incident: apply base's nftables default-deny to a Docker host.
|
||||
integration_profile: askari
|
||||
base__firewall_apply: true
|
||||
# Keep a break-glass: sshd stays on all interfaces (never wt0-only in a throwaway VM).
|
||||
base__ssh_listen_mesh_only: false
|
||||
# The VM is isolated; it must never touch the real mesh.
|
||||
base__mesh_enabled: false
|
||||
# Allow SSH from the VM's libvirt-NAT gateway (where the driver/ansible connects from),
|
||||
# so base's default-deny firewall + the reboot don't lock out the harness. By source IP,
|
||||
# so it's interface-independent. Overrides askari's real control addr for the test only.
|
||||
base__firewall_control_addr: "192.168.150.1"
|
||||
|
|
@ -1,17 +0,0 @@
|
|||
---
|
||||
# Integration overlay (ADR-025) — the askari mesh-hardening REDESIGN (2026-06-19).
|
||||
# Validates INPUT-only default-deny on a Docker host: input policy drop, forward policy
|
||||
# accept (Docker-safe), SSH via the admin-addr break-glass, reboot-survivable.
|
||||
integration_profile: askari_inputonly
|
||||
base__firewall_apply: true
|
||||
base__firewall_input_only: true
|
||||
# No sshd ListenAddress change — never wt0-only in a throwaway VM.
|
||||
base__ssh_listen_mesh_only: false
|
||||
# Isolated VM: never touch the real mesh.
|
||||
base__mesh_enabled: false
|
||||
# The non-mesh SSH break-glass = the admin-addr path the real design uses. Point it at the
|
||||
# VM's libvirt-NAT gateway (where the harness connects from), by source IP so it is
|
||||
# interface-independent and the default-deny + reboot don't lock out the driver. This
|
||||
# mirrors askari's real base__firewall_admin_addrs (ubongo's WAN) in the test topology.
|
||||
base__firewall_admin_addrs:
|
||||
- 192.168.150.1
|
||||
|
|
@ -1,18 +0,0 @@
|
|||
---
|
||||
# Integration-test overlay for the "ubongo" profile (ADR-025). Passed via `-e @`.
|
||||
# Exercises mesh-hardening 2/3: base's INPUT-only default-deny on the control node — input
|
||||
# chain default-deny, forward chain left permissive (Docker/libvirt-NAT safe), no sshd
|
||||
# ListenAddress change (so no boot-race).
|
||||
integration_profile: ubongo
|
||||
base__firewall_apply: true
|
||||
base__firewall_input_only: true # forward chain renders `policy accept`
|
||||
base__firewall_admin_addrs:
|
||||
- "192.168.150.98" # two representative LAN sources — exercises the
|
||||
- "192.168.150.99" # admin-addr loop with a multi-entry list (like ubongo)
|
||||
# Never wt0-only; never touch the real mesh from a throwaway VM.
|
||||
base__ssh_listen_mesh_only: false
|
||||
base__mesh_enabled: false
|
||||
# Allow SSH from the libvirt-NAT gateway (where the driver/ansible connect from) so the
|
||||
# default-deny apply + the reboot don't lock out the harness. By source IP (interface-
|
||||
# independent). This is the harness's lifeline; the admin-addr above is only exercised.
|
||||
base__firewall_control_addr: "192.168.150.1"
|
||||
|
|
@ -1,10 +0,0 @@
|
|||
{
|
||||
"groups": ["offsite_hosts"],
|
||||
"applies": [
|
||||
{"playbook": "site.yml", "tags": ["base"]},
|
||||
{"playbook": "offsite.yml", "tags": ["docker_host", "reverse_proxy"]}
|
||||
],
|
||||
"extra_vars_files": ["overrides/askari.yml"],
|
||||
"mem_mib": 3072,
|
||||
"vcpus": 2
|
||||
}
|
||||
|
|
@ -1,10 +0,0 @@
|
|||
{
|
||||
"groups": ["offsite_hosts"],
|
||||
"applies": [
|
||||
{"playbook": "site.yml", "tags": ["base"]},
|
||||
{"playbook": "offsite.yml", "tags": ["docker_host", "reverse_proxy"]}
|
||||
],
|
||||
"extra_vars_files": ["overrides/askari_inputonly.yml"],
|
||||
"mem_mib": 3072,
|
||||
"vcpus": 2
|
||||
}
|
||||
|
|
@ -1,9 +0,0 @@
|
|||
{
|
||||
"groups": ["control"],
|
||||
"applies": [
|
||||
{"playbook": "site.yml", "tags": ["base"]}
|
||||
],
|
||||
"extra_vars_files": ["overrides/ubongo.yml"],
|
||||
"mem_mib": 2048,
|
||||
"vcpus": 2
|
||||
}
|
||||
|
|
@ -1,129 +0,0 @@
|
|||
---
|
||||
# Integration verify (ADR-025). Outcome-based, profile-aware: the active profile is named by
|
||||
# `integration_profile` (set in each profile's overlay). Each profile asserts its own success
|
||||
# criteria; an unknown/unset profile fails loudly (never a silent pass).
|
||||
- name: Verify the rebooted host
|
||||
hosts: all
|
||||
become: true
|
||||
gather_facts: false
|
||||
tasks:
|
||||
- name: A known integration_profile must be set (no silent pass)
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- integration_profile is defined
|
||||
- integration_profile in ['askari', 'askari_inputonly', 'ubongo']
|
||||
fail_msg: "integration_profile must be set in the profile overlay (askari|askari_inputonly|ubongo)"
|
||||
|
||||
# ── askari profile — Docker host: published-port forwarding survives the reboot ──
|
||||
# The load-bearing check probes the VM's published :80 FROM the controller (ubongo) — if
|
||||
# base's forward-drop killed DNAT, this times out (the FRICTION 2026-06-17 #1 bug).
|
||||
- name: (askari) Gather service facts
|
||||
when: integration_profile == 'askari'
|
||||
ansible.builtin.service_facts:
|
||||
|
||||
- name: (askari) Docker daemon is active
|
||||
when: integration_profile == 'askari'
|
||||
ansible.builtin.assert:
|
||||
that: "ansible_facts.services['docker.service'].state == 'running'"
|
||||
fail_msg: "docker.service is not running"
|
||||
|
||||
- name: (askari) Forward chain permits container traffic (drop-in loaded)
|
||||
when: integration_profile == 'askari'
|
||||
ansible.builtin.command: nft list chain inet filter forward
|
||||
register: _fwd
|
||||
changed_when: false
|
||||
|
||||
- name: (askari) Assert container forwarding is allowed (not pure drop)
|
||||
when: integration_profile == 'askari'
|
||||
ansible.builtin.assert:
|
||||
that: "'accept' in _fwd.stdout"
|
||||
fail_msg: >-
|
||||
forward chain is pure drop — container forwarding will die on reboot
|
||||
(FRICTION 2026-06-17 #1). docker_host container-forward drop-in missing.
|
||||
|
||||
- name: (askari) Published port answers from the controller (DNAT + forward alive)
|
||||
when: integration_profile == 'askari'
|
||||
delegate_to: localhost
|
||||
become: false
|
||||
ansible.builtin.uri:
|
||||
# Probe :80 (plain HTTP) — any answer proves the published-port DNAT + forward path
|
||||
# is alive. Don't follow caddy's HTTP->HTTPS redirect (its `tls internal` has no
|
||||
# cert for a bare-IP HTTPS request); the 308 itself proves the path works.
|
||||
url: "http://{{ ansible_host }}/"
|
||||
follow_redirects: none
|
||||
status_code: [200, 301, 308, 404, 502, 503]
|
||||
timeout: 10
|
||||
register: _probe
|
||||
retries: 5
|
||||
delay: 6
|
||||
until: _probe is succeeded
|
||||
|
||||
# ── ubongo profile — control node: INPUT-only default-deny survives the reboot ──
|
||||
# SSH reachability across the reboot is proven by the harness itself (it re-SSHes and
|
||||
# checks boot_id changed before this verify runs). Here we assert the ruleset shape.
|
||||
- name: (ubongo) Read the live nftables ruleset
|
||||
when: integration_profile == 'ubongo'
|
||||
ansible.builtin.command: nft list ruleset
|
||||
register: _nft
|
||||
changed_when: false
|
||||
|
||||
- name: (ubongo) INPUT default-deny, forward permissive, lifeline + admin-addr allow
|
||||
when: integration_profile == 'ubongo'
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
# live `nft list ruleset` prints the SYMBOLIC priority (`filter` = 0), unlike the
|
||||
# rendered /etc/nftables.conf (`priority 0`) that the Molecule scenario asserts against.
|
||||
- "'hook input priority filter; policy drop;' in _nft.stdout"
|
||||
- "'hook forward priority filter; policy accept;' in _nft.stdout"
|
||||
# the ssh-from-control lifeline (base__firewall_control_addr) — the reconnect path
|
||||
- "'ip saddr 192.168.150.1 tcp dport 22 accept' in _nft.stdout"
|
||||
- "'ip saddr 192.168.150.98 tcp dport 22 accept' in _nft.stdout"
|
||||
- "'ip saddr 192.168.150.99 tcp dport 22 accept' in _nft.stdout"
|
||||
fail_msg: >-
|
||||
ubongo profile: expected input policy drop, forward policy accept (input-only),
|
||||
the ssh-from-control lifeline (192.168.150.1), and both admin-addr
|
||||
(192.168.150.98/99) SSH allows in the live ruleset.
|
||||
|
||||
# ── askari_inputonly profile — the mesh-hardening REDESIGN (2026-06-19) ──
|
||||
# INPUT-only default-deny on a Docker host: input policy drop, forward policy ACCEPT
|
||||
# (Docker-safe), SSH via the admin-addr break-glass, published-port DNAT survives reboot.
|
||||
- name: (askari_inputonly) Read the live nftables ruleset
|
||||
when: integration_profile == 'askari_inputonly'
|
||||
ansible.builtin.command: nft list ruleset
|
||||
register: _nft_io
|
||||
changed_when: false
|
||||
|
||||
- name: (askari_inputonly) INPUT default-deny, forward permissive, admin-addr break-glass
|
||||
when: integration_profile == 'askari_inputonly'
|
||||
ansible.builtin.assert:
|
||||
that:
|
||||
- "'hook input priority filter; policy drop;' in _nft_io.stdout"
|
||||
- "'hook forward priority filter; policy accept;' in _nft_io.stdout"
|
||||
- "'ip saddr 192.168.150.1 tcp dport 22 accept' in _nft_io.stdout"
|
||||
fail_msg: >-
|
||||
askari_inputonly: expected input policy drop, forward policy accept (input-only),
|
||||
and the admin-addr break-glass (192.168.150.1) SSH allow in the live ruleset.
|
||||
|
||||
- name: (askari_inputonly) Gather service facts
|
||||
when: integration_profile == 'askari_inputonly'
|
||||
ansible.builtin.service_facts:
|
||||
|
||||
- name: (askari_inputonly) Docker daemon is active
|
||||
when: integration_profile == 'askari_inputonly'
|
||||
ansible.builtin.assert:
|
||||
that: "ansible_facts.services['docker.service'].state == 'running'"
|
||||
fail_msg: "docker.service is not running"
|
||||
|
||||
- name: (askari_inputonly) Published port answers from the controller (DNAT + forward alive)
|
||||
when: integration_profile == 'askari_inputonly'
|
||||
delegate_to: localhost
|
||||
become: false
|
||||
ansible.builtin.uri:
|
||||
url: "http://{{ ansible_host }}/"
|
||||
follow_redirects: none
|
||||
status_code: [200, 301, 308, 404, 502, 503]
|
||||
timeout: 10
|
||||
register: _probe_io
|
||||
retries: 5
|
||||
delay: 6
|
||||
until: _probe_io is succeeded
|
||||
|
|
@ -97,12 +97,3 @@ def test_ingress_missing_port_raises():
|
|||
cat = {"svc": {"host": "docker01", "ingress": [{"from": "lan"}]}}
|
||||
with pytest.raises(ValueError):
|
||||
fr.resolve_firewall_rules(cat, ZONES, "docker01", HOSTVARS, GROUPS)
|
||||
|
||||
|
||||
def test_public_zone_resolves_to_anywhere():
|
||||
catalog = {"web": {"host": "askari",
|
||||
"ingress": [{"from": "public", "port": 443, "proto": "tcp"}]}}
|
||||
zones = {"public": "0.0.0.0/0"}
|
||||
rules = fr.resolve_firewall_rules(catalog, zones, "askari",
|
||||
{"askari": {"ansible_host": "100.99.226.39"}}, {})
|
||||
assert rules == [{"proto": "tcp", "port": 443, "sources": ["0.0.0.0/0"]}]
|
||||
|
|
|
|||
|
|
@ -123,8 +123,5 @@ def test_nudge_line_overdue_on_age():
|
|||
def test_load_signals_reads_real_friction_file():
|
||||
path = os.path.join(os.path.dirname(__file__), "..", "docs", "FRICTION.md")
|
||||
sigs = fs.load_signals(path, TODAY)
|
||||
# May legitimately be empty right after a /kaizen pass consumes every open signal —
|
||||
# an empty Open-signals section is the goal state, not a failure. Assert the function
|
||||
# parses the real file into well-formed signals (validity holds vacuously when empty).
|
||||
assert isinstance(sigs, list)
|
||||
assert len(sigs) >= 1
|
||||
assert all(s["tag"] in {"friction", "gotcha", "recurring", "unused"} for s in sigs)
|
||||
|
|
|
|||
|
|
@ -1,106 +0,0 @@
|
|||
import importlib.util
|
||||
import pathlib
|
||||
import types
|
||||
import pytest
|
||||
|
||||
_PATH = pathlib.Path(__file__).resolve().parent.parent / "scripts" / "integration-vm.py"
|
||||
_spec = importlib.util.spec_from_file_location("integration_vm", _PATH)
|
||||
ivm = importlib.util.module_from_spec(_spec)
|
||||
_spec.loader.exec_module(ivm)
|
||||
|
||||
|
||||
def test_valid_tiers():
|
||||
assert ivm.VALID_TIERS == ("internal", "le-staging", "le-prod-wildcard")
|
||||
|
||||
|
||||
def test_vm_name_prefix_and_suffix():
|
||||
assert ivm.vm_name("askari", "ab12cd34") == "boma-it-askari-ab12cd34"
|
||||
|
||||
def test_vm_name_generates_suffix():
|
||||
n = ivm.vm_name("askari")
|
||||
assert n.startswith("boma-it-askari-") and len(n.split("-")[-1]) == 8
|
||||
|
||||
def test_free_mib_parses_memavailable():
|
||||
sample = "MemTotal: 16331156 kB\nMemAvailable: 8388608 kB\n"
|
||||
assert ivm.free_mib(sample) == 8192
|
||||
|
||||
def test_parse_lease_ip_extracts_ipv4():
|
||||
out = (" Name MAC address Protocol Address\n"
|
||||
"-------------------------------------------------------------------\n"
|
||||
" vnet0 52:54:00:aa:bb:cc ipv4 192.168.150.42/24\n")
|
||||
assert ivm.parse_lease_ip(out) == "192.168.150.42"
|
||||
|
||||
def test_parse_lease_ip_none_when_absent():
|
||||
assert ivm.parse_lease_ip("no leases\n") is None
|
||||
|
||||
def test_parse_lease_ip_format_is_source_agnostic():
|
||||
# virsh domifaddr --source arp output format is identical to --source lease;
|
||||
# this test only proves the regex is format-agnostic (both sources produce the
|
||||
# same table). The behavioral arp-fallback in wait_for_ip is covered by
|
||||
# test_wait_for_ip_falls_back_to_arp below.
|
||||
out = (" Name MAC address Protocol Address\n"
|
||||
"-------------------------------------------------------------------\n"
|
||||
" vnet0 52:54:00:de:ad:be ipv4 192.168.150.73/24\n")
|
||||
assert ivm.parse_lease_ip(out) == "192.168.150.73"
|
||||
|
||||
|
||||
def test_wait_for_ip_falls_back_to_arp(monkeypatch):
|
||||
# wait_for_ip polls virsh domifaddr with --source lease first, then --source arp.
|
||||
# Simulate lease returning empty (no DHCP lease yet) and arp returning a real address.
|
||||
arp_out = (" Name MAC address Protocol Address\n"
|
||||
"-------------------------------------------------------------------\n"
|
||||
" vnet0 52:54:00:aa:bb:cc ipv4 192.168.150.142/24\n")
|
||||
|
||||
def fake_sh(cmd, **kwargs):
|
||||
if "arp" in cmd:
|
||||
return types.SimpleNamespace(stdout=arp_out)
|
||||
return types.SimpleNamespace(stdout="")
|
||||
|
||||
monkeypatch.setattr(ivm, "sh", fake_sh)
|
||||
monkeypatch.setattr(ivm.time, "sleep", lambda _: None)
|
||||
assert ivm.wait_for_ip("dummy") == "192.168.150.142"
|
||||
|
||||
|
||||
def test_meta_data_has_instance_and_hostname():
|
||||
md = ivm.render_meta_data("iid-askari-x", "boma-it-askari-x")
|
||||
assert "instance-id: iid-askari-x" in md
|
||||
assert "local-hostname: boma-it-askari-x" in md
|
||||
|
||||
def test_user_data_injects_key_and_ansible_user():
|
||||
ud = ivm.render_user_data("ssh-ed25519 AAAA... claude@ubongo", "ansible")
|
||||
assert ud.startswith("#cloud-config")
|
||||
assert "name: ansible" in ud
|
||||
assert "ssh-ed25519 AAAA... claude@ubongo" in ud
|
||||
assert "NOPASSWD:ALL" in ud
|
||||
|
||||
|
||||
def test_cert_file_valid_tier():
|
||||
p = ivm.cert_file("le-staging")
|
||||
assert p.name == "le-staging.yml" and p.parent.name == "certs"
|
||||
|
||||
def test_cert_file_rejects_bad_tier():
|
||||
with pytest.raises(ValueError):
|
||||
ivm.cert_file("bogus")
|
||||
|
||||
def test_render_run_hosts_single_host_in_groups():
|
||||
out = ivm.render_run_hosts("boma-it-askari-x", "192.168.150.42",
|
||||
"ansible", ["offsite_hosts"])
|
||||
assert "offsite_hosts:" in out
|
||||
assert "boma-it-askari-x:" in out
|
||||
assert "ansible_host: 192.168.150.42" in out
|
||||
assert "ansible_user: ansible" in out
|
||||
assert "askari:" not in out.replace("boma-it-askari-x:", "")
|
||||
|
||||
def test_free_mib_returns_zero_when_absent():
|
||||
assert ivm.free_mib("MemTotal: 16384 kB\n") == 0
|
||||
|
||||
def test_render_run_hosts_multiple_groups():
|
||||
out = ivm.render_run_hosts("boma-it-x-1", "192.168.150.5", "ansible",
|
||||
["offsite_hosts", "docker_hosts"])
|
||||
assert "offsite_hosts:" in out
|
||||
assert "docker_hosts:" in out
|
||||
|
||||
def test_render_run_hosts_dedups_groups():
|
||||
out = ivm.render_run_hosts("boma-it-x-1", "192.168.150.5", "ansible",
|
||||
["docker_hosts", "docker_hosts"])
|
||||
assert out.count("docker_hosts:") == 1
|
||||
|
|
@ -57,99 +57,3 @@ def test_non_numbered_file_is_skipped():
|
|||
bare = ["# ADR template\n", "\n", "## Status\n", "\n", "<!-- hint -->\n"]
|
||||
out = _checks(rs.adr_structure_findings({"docs/decisions/adr-template.md": bare}))
|
||||
assert out == []
|
||||
|
||||
|
||||
# --- rename-incomplete -------------------------------------------------------
|
||||
|
||||
def _renames(findings):
|
||||
return [f for f in findings if f["check"] == "rename-incomplete"]
|
||||
|
||||
|
||||
def test_rename_incomplete_flags_lingering_old_name():
|
||||
# ADR announces `Foo` -> `Bar`; another decisions file still says Foo present-tense.
|
||||
announcer = {"docs/decisions/050-rename.md": [
|
||||
"## Decision\n", "We renamed `Foo` to `Bar` across the design docs.\n"]}
|
||||
other = {} # extra_docs (CAPABILITIES/ROADMAP) — none here
|
||||
lingering = {"docs/decisions/030-other.md": [
|
||||
"The Foo proxy renders config from the catalog.\n"]}
|
||||
announcer.update(lingering)
|
||||
out = _renames(rs.rename_incomplete_findings(announcer, other))
|
||||
assert len(out) == 1
|
||||
assert out[0]["path"] == "docs/decisions/030-other.md"
|
||||
assert out[0]["line"] == 1
|
||||
assert out[0]["severity"] == "medium"
|
||||
assert "Foo" in out[0]["detail"] and "Bar" in out[0]["detail"]
|
||||
|
||||
|
||||
def test_rename_incomplete_clean_rename_has_no_findings():
|
||||
# The rename announced, and no other doc still mentions Foo.
|
||||
adr_files = {
|
||||
"docs/decisions/050-rename.md": [
|
||||
"## Decision\n", "We renamed `Foo` to `Bar` across the design docs.\n"],
|
||||
"docs/decisions/030-other.md": [
|
||||
"The Bar proxy renders config from the catalog.\n"],
|
||||
}
|
||||
out = _renames(rs.rename_incomplete_findings(adr_files, {}))
|
||||
assert out == []
|
||||
|
||||
|
||||
def test_rename_incomplete_skips_historical_cue_line():
|
||||
# Foo lingers only on a line carrying a historical/negation cue → no finding.
|
||||
adr_files = {
|
||||
"docs/decisions/050-rename.md": [
|
||||
"## Decision\n", "We renamed `Foo` to `Bar` across the design docs.\n"],
|
||||
"docs/decisions/030-other.md": [
|
||||
"Foo was rejected; we run Bar now.\n",
|
||||
"The history of Foo informs the choice.\n"],
|
||||
}
|
||||
out = _renames(rs.rename_incomplete_findings(adr_files, {}))
|
||||
assert out == []
|
||||
|
||||
|
||||
def test_rename_incomplete_skips_announcing_adr_itself():
|
||||
# The announcing ADR mentions Foo (it has to) — must not flag itself.
|
||||
adr_files = {
|
||||
"docs/decisions/050-rename.md": [
|
||||
"## Decision\n",
|
||||
"We renamed `Foo` to `Bar`.\n",
|
||||
"Operators who configured Foo should switch their habits.\n"],
|
||||
}
|
||||
out = _renames(rs.rename_incomplete_findings(adr_files, {}))
|
||||
assert out == []
|
||||
|
||||
|
||||
def test_rename_incomplete_skips_line_naming_new_term():
|
||||
# A line that mentions both Foo and Bar is explaining the rename → skipped.
|
||||
adr_files = {
|
||||
"docs/decisions/050-rename.md": [
|
||||
"## Decision\n", "We renamed `Foo` to `Bar`.\n"],
|
||||
"docs/decisions/030-other.md": [
|
||||
"Foo is being phased out for Bar in this paragraph.\n"],
|
||||
}
|
||||
out = _renames(rs.rename_incomplete_findings(adr_files, {}))
|
||||
assert out == []
|
||||
|
||||
|
||||
def test_rename_incomplete_searches_extra_docs():
|
||||
# A lingering OLD name in CAPABILITIES.md (an extra_docs file) is flagged.
|
||||
adr_files = {"docs/decisions/050-rename.md": [
|
||||
"## Decision\n", "We renamed `Foo` to `Bar`.\n"]}
|
||||
extra = {"docs/CAPABILITIES.md": ["The Foo proxy is what we deploy.\n"]}
|
||||
out = _renames(rs.rename_incomplete_findings(adr_files, extra))
|
||||
assert len(out) == 1
|
||||
assert out[0]["path"] == "docs/CAPABILITIES.md"
|
||||
|
||||
|
||||
def test_rename_incomplete_ignores_ambiguous_adr_pointer_assertion():
|
||||
# "the ADR-017 prose ... is updated to read Caddy" must NOT parse ADR-017 as the
|
||||
# old name (it is a doc pointer). With ADR-017 rejected, no assertion → no finding,
|
||||
# even though 'ADR-017' appears in many other docs.
|
||||
adr_files = {
|
||||
"docs/decisions/024-reverse-proxy.md": [
|
||||
"## Consequences\n",
|
||||
'- ADR-017 prose that mentioned Traefik is updated to read "Caddy".\n'],
|
||||
"docs/decisions/008-testing.md": [
|
||||
"Level 4 UI verification follows ADR-017.\n"],
|
||||
}
|
||||
out = _renames(rs.rename_incomplete_findings(adr_files, {}))
|
||||
assert out == []
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue