From ab328a2f791b6dfda0289821a09f998ee9bc3781 Mon Sep 17 00:00:00 2001 From: sjat Date: Fri, 19 Jun 2026 17:15:33 +0200 Subject: [PATCH 1/9] feat(netbird_coordinator): disable geolocation so no-egress startup can't FATAL the control plane Co-Authored-By: Claude Opus 4.8 (1M context) --- roles/netbird_coordinator/README.md | 1 + roles/netbird_coordinator/defaults/main.yml | 7 +++++++ roles/netbird_coordinator/molecule/default/verify.yml | 9 +++++++++ .../netbird_coordinator/templates/docker-compose.yml.j2 | 4 ++++ 4 files changed, 21 insertions(+) diff --git a/roles/netbird_coordinator/README.md b/roles/netbird_coordinator/README.md index 4145535..414e9a1 100644 --- a/roles/netbird_coordinator/README.md +++ b/roles/netbird_coordinator/README.md @@ -46,6 +46,7 @@ upstream support; WS/gRPC need long timeouts (Caddy sets none by default). | `netbird_coordinator__domain` | `netbird.askari.wingu.me` | Public hostname; feeds `exposedAddress`, the OIDC issuer, redirect URIs, and the dashboard endpoints | | `netbird_coordinator__trusted_proxies` | `["172.16.0.0/12"]` | Source ranges NetBird trusts `X-Forwarded-*` from (`server.reverseProxy.trustedHTTPProxies`). Must cover Caddy's source IP on the boma network — verify the actual bridge subnet at deploy | | `netbird_coordinator__manage` | `true` | Set `false` in Molecule to render templates without a Docker daemon | +| `netbird_coordinator__disable_geolocation` | `true` | sets `NB_DISABLE_GEOLOCATION` so a no-egress startup can't FATAL the server on the GeoLite2 download (FRICTION 2026-06-17 #4) | Production overrides live in `inventories/production/group_vars/`. diff --git a/roles/netbird_coordinator/defaults/main.yml b/roles/netbird_coordinator/defaults/main.yml index 369e55d..a767ab3 100644 --- a/roles/netbird_coordinator/defaults/main.yml +++ b/roles/netbird_coordinator/defaults/main.yml @@ -6,6 +6,13 @@ netbird_coordinator__dashboard_image: "netbirdio/dashboard:v2.39.0" netbird_coordinator__base_dir: /opt/services/netbird netbird_coordinator__domain: netbird.askari.wingu.me +# Disable NetBird's GeoLite2 geolocation (download + lookups). boma uses no geo posture +# (ACL is Allow-All), and the combined server treats a failed GeoLite2 download as FATAL — +# so a transient egress loss (NAT wiped on `nft flush`, or the boot window before Docker +# re-adds NAT) would crash-loop the whole control plane (FRICTION 2026-06-17 #4). Disabling +# removes that dependency. Revisit if a future ACL sub-project wants geo-based posture. +netbird_coordinator__disable_geolocation: true + # Source IP ranges Caddy fronts NetBird from, rendered into config.yaml # server.reverseProxy.trustedHTTPProxies. NetBird trusts X-Forwarded-* only from # these. MUST cover the Caddy container's source IP on the boma Docker network — diff --git a/roles/netbird_coordinator/molecule/default/verify.yml b/roles/netbird_coordinator/molecule/default/verify.yml index 0d657f3..5e38ec2 100644 --- a/roles/netbird_coordinator/molecule/default/verify.yml +++ b/roles/netbird_coordinator/molecule/default/verify.yml @@ -30,3 +30,12 @@ - "'v2.39.0' in (_compose.content | b64decode)" fail_msg: "docker-compose.yml is missing pinned image tags" success_msg: "docker-compose.yml pins both image tags" + + - name: "Assert geolocation is disabled (FRICTION 2026-06-17 #4 — no geo-DB download FATAL)" + ansible.builtin.assert: + that: + - "'NB_DISABLE_GEOLOCATION: \"true\"' in (_compose.content | b64decode)" + fail_msg: >- + compose must set NB_DISABLE_GEOLOCATION=true so a no-egress startup can't FATAL + the coordinator on the GeoLite2 download + success_msg: "geolocation disabled in compose" diff --git a/roles/netbird_coordinator/templates/docker-compose.yml.j2 b/roles/netbird_coordinator/templates/docker-compose.yml.j2 index f84c922..c37f6fc 100644 --- a/roles/netbird_coordinator/templates/docker-compose.yml.j2 +++ b/roles/netbird_coordinator/templates/docker-compose.yml.j2 @@ -16,6 +16,10 @@ services: container_name: netbird-server restart: unless-stopped command: ["--config", "/etc/netbird/config.yaml"] + environment: + # Disable geolocation so a no-egress startup can't FATAL the control plane + # (FRICTION 2026-06-17 #4). boma uses no geo posture (ACL Allow-All). + NB_DISABLE_GEOLOCATION: "{{ netbird_coordinator__disable_geolocation | string | lower }}" ports: - "3478:3478/udp" volumes: From d9b8676fcec09e14febe14c4994e23f032da77c8 Mon Sep 17 00:00:00 2001 From: sjat Date: Fri, 19 Jun 2026 17:18:58 +0200 Subject: [PATCH 2/9] feat(inventory): askari INPUT-only firewall + WAN break-glass + manage over wt0 Co-Authored-By: Claude Opus 4.8 (1M context) --- .../group_vars/offsite_hosts/vars.yml | 30 +++++++++++-------- inventories/production/host_vars/askari.yml | 7 +++++ 2 files changed, 24 insertions(+), 13 deletions(-) create mode 100644 inventories/production/host_vars/askari.yml diff --git a/inventories/production/group_vars/offsite_hosts/vars.yml b/inventories/production/group_vars/offsite_hosts/vars.yml index 03047e4..54007b3 100644 --- a/inventories/production/group_vars/offsite_hosts/vars.yml +++ b/inventories/production/group_vars/offsite_hosts/vars.yml @@ -1,17 +1,21 @@ --- # Off-site hosts (askari). askari runs the NetBird coordinator AND is a mesh peer -# (ADR-016, M5). base__mesh_enabled stays true (M5 enrollment). +# (ADR-016, M5). # -# Mesh-hardening 1/3 (move SSH onto wt0 + nftables default-deny) was attempted on -# 2026-06-17 and BACKED OUT after it took askari down: applying base's nftables -# `forward policy drop` to a Docker host broke container forwarding/NAT on reboot, and the -# wt0-only sshd ListenAddress left no break-glass (ip_nonlocal_bind did not beat the boot -# race). Until docker_host ships Docker-safe container-forward rules and the boot-race + -# coordinator-bootstrap issues are re-designed, askari keeps: -# - sshd listening on all interfaces (reachable over the WAN; Hetzner Cloud Firewall is -# the perimeter) — base__ssh_listen_mesh_only stays false, -# - the host nftables firewall NOT applied — base__firewall_apply false. -# See the incident write-up / the mesh-hardening re-spec before re-enabling either. +# Mesh-hardening REDESIGN (2026-06-19): the 2026-06-17 attempt was backed out (forward +# `policy drop` broke Docker on reboot; wt0-only sshd left no break-glass; ip_nonlocal_bind +# did not beat the boot-race). The redesign mirrors the proven ubongo 2/3 pattern: +# - INPUT-only default-deny (base__firewall_input_only) — forward stays `policy accept` +# so Docker container forwarding/NAT survive a reboot; +# - SSH scoped by the host firewall (iifname wt0 + admin-addr), NOT a sshd ListenAddress +# change — base__ssh_listen_mesh_only stays false, so there is no boot-race; +# - WAN :22 is DELIBERATELY left open from ubongo's WAN IP (base__firewall_admin_addrs) +# as the permanent non-mesh break-glass — the coordinator-host exception (a host's only +# management path must never depend on a service that host itself hosts). +# Spec: docs/superpowers/specs/2026-06-19-mesh-hardening-askari-redesign-design.md base__mesh_enabled: true -base__ssh_listen_mesh_only: false -base__firewall_apply: false +base__firewall_apply: true +base__firewall_input_only: true # forward stays `policy accept` → Docker-safe +base__ssh_listen_mesh_only: false # no sshd ListenAddress change → no boot-race +base__firewall_admin_addrs: + - 91.226.145.80 # ubongo's (static) WAN IP — the permanent non-mesh SSH break-glass diff --git a/inventories/production/host_vars/askari.yml b/inventories/production/host_vars/askari.yml new file mode 100644 index 0000000..4106f5d --- /dev/null +++ b/inventories/production/host_vars/askari.yml @@ -0,0 +1,7 @@ +--- +# Manage askari over the NetBird mesh (wt0). Overrides the TF-generated WAN `ansible_host` +# in offsite.yml (host_vars are NOT regenerated by tf_to_inventory.py). The WAN :22 path +# (Hetzner Cloud Firewall + base__firewall_admin_addrs = ubongo's WAN) stays as the +# break-glass; the Hetzner web console is the IP-independent ultimate fallback. +# Spec: docs/superpowers/specs/2026-06-19-mesh-hardening-askari-redesign-design.md +ansible_host: 100.99.226.39 From 1042f161b63e71a6811138950cda85f88faaeb5b Mon Sep 17 00:00:00 2001 From: sjat Date: Fri, 19 Jun 2026 19:14:55 +0200 Subject: [PATCH 3/9] =?UTF-8?q?test(integration):=20askari=5Finputonly=20?= =?UTF-8?q?=E2=80=94=20INPUT-only=20default-deny=20reboot=20gate?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the ADR-025 integration-test profile that proves the askari mesh-hardening REDESIGN (INPUT-only default-deny, forward ACCEPT for Docker) is reboot-safe on a throwaway KVM VM before the live cut-over. Profile applies base (firewall + sshd) and offsite (docker_host + reverse_proxy). Post-reboot verify checks: input policy drop, forward policy accept, admin-addr break-glass SSH (192.168.150.1), Docker up, and a published port answered from the controller. GREEN on 2026-06-19. Co-Authored-By: Claude Sonnet 4.6 --- .../overrides/askari_inputonly.yml | 17 +++++++ .../profiles/askari_inputonly.json | 10 ++++ tests/integration/verify.yml | 48 ++++++++++++++++++- 3 files changed, 73 insertions(+), 2 deletions(-) create mode 100644 tests/integration/overrides/askari_inputonly.yml create mode 100644 tests/integration/profiles/askari_inputonly.json diff --git a/tests/integration/overrides/askari_inputonly.yml b/tests/integration/overrides/askari_inputonly.yml new file mode 100644 index 0000000..e2fe904 --- /dev/null +++ b/tests/integration/overrides/askari_inputonly.yml @@ -0,0 +1,17 @@ +--- +# Integration overlay (ADR-025) — the askari mesh-hardening REDESIGN (2026-06-19). +# Validates INPUT-only default-deny on a Docker host: input policy drop, forward policy +# accept (Docker-safe), SSH via the admin-addr break-glass, reboot-survivable. +integration_profile: askari_inputonly +base__firewall_apply: true +base__firewall_input_only: true +# No sshd ListenAddress change — never wt0-only in a throwaway VM. +base__ssh_listen_mesh_only: false +# Isolated VM: never touch the real mesh. +base__mesh_enabled: false +# The non-mesh SSH break-glass = the admin-addr path the real design uses. Point it at the +# VM's libvirt-NAT gateway (where the harness connects from), by source IP so it is +# interface-independent and the default-deny + reboot don't lock out the driver. This +# mirrors askari's real base__firewall_admin_addrs (ubongo's WAN) in the test topology. +base__firewall_admin_addrs: + - 192.168.150.1 diff --git a/tests/integration/profiles/askari_inputonly.json b/tests/integration/profiles/askari_inputonly.json new file mode 100644 index 0000000..d571d1d --- /dev/null +++ b/tests/integration/profiles/askari_inputonly.json @@ -0,0 +1,10 @@ +{ + "groups": ["offsite_hosts"], + "applies": [ + {"playbook": "site.yml", "tags": ["base"]}, + {"playbook": "offsite.yml", "tags": ["docker_host", "reverse_proxy"]} + ], + "extra_vars_files": ["overrides/askari_inputonly.yml"], + "mem_mib": 3072, + "vcpus": 2 +} diff --git a/tests/integration/verify.yml b/tests/integration/verify.yml index 1f460f5..cf9f61c 100644 --- a/tests/integration/verify.yml +++ b/tests/integration/verify.yml @@ -11,8 +11,8 @@ ansible.builtin.assert: that: - integration_profile is defined - - integration_profile in ['askari', 'ubongo'] - fail_msg: "integration_profile must be set in the profile overlay (askari|ubongo)" + - integration_profile in ['askari', 'askari_inputonly', 'ubongo'] + fail_msg: "integration_profile must be set in the profile overlay (askari|askari_inputonly|ubongo)" # ── askari profile — Docker host: published-port forwarding survives the reboot ── # The load-bearing check probes the VM's published :80 FROM the controller (ubongo) — if @@ -83,3 +83,47 @@ ubongo profile: expected input policy drop, forward policy accept (input-only), the ssh-from-control lifeline (192.168.150.1), and both admin-addr (192.168.150.98/99) SSH allows in the live ruleset. + + # ── askari_inputonly profile — the mesh-hardening REDESIGN (2026-06-19) ── + # INPUT-only default-deny on a Docker host: input policy drop, forward policy ACCEPT + # (Docker-safe), SSH via the admin-addr break-glass, published-port DNAT survives reboot. + - name: (askari_inputonly) Read the live nftables ruleset + when: integration_profile == 'askari_inputonly' + ansible.builtin.command: nft list ruleset + register: _nft_io + changed_when: false + + - name: (askari_inputonly) INPUT default-deny, forward permissive, admin-addr break-glass + when: integration_profile == 'askari_inputonly' + ansible.builtin.assert: + that: + - "'hook input priority filter; policy drop;' in _nft_io.stdout" + - "'hook forward priority filter; policy accept;' in _nft_io.stdout" + - "'ip saddr 192.168.150.1 tcp dport 22 accept' in _nft_io.stdout" + fail_msg: >- + askari_inputonly: expected input policy drop, forward policy accept (input-only), + and the admin-addr break-glass (192.168.150.1) SSH allow in the live ruleset. + + - name: (askari_inputonly) Gather service facts + when: integration_profile == 'askari_inputonly' + ansible.builtin.service_facts: + + - name: (askari_inputonly) Docker daemon is active + when: integration_profile == 'askari_inputonly' + ansible.builtin.assert: + that: "ansible_facts.services['docker.service'].state == 'running'" + fail_msg: "docker.service is not running" + + - name: (askari_inputonly) Published port answers from the controller (DNAT + forward alive) + when: integration_profile == 'askari_inputonly' + delegate_to: localhost + become: false + ansible.builtin.uri: + url: "http://{{ ansible_host }}/" + follow_redirects: none + status_code: [200, 301, 308, 404, 502, 503] + timeout: 10 + register: _probe_io + retries: 5 + delay: 6 + until: _probe_io is succeeded From 8ca42c389cdb5bfc04f8ab94086ccf876586f00c Mon Sep 17 00:00:00 2001 From: sjat Date: Fri, 19 Jun 2026 19:15:07 +0200 Subject: [PATCH 4/9] fix(integration): fix VM boot: hostname, netplan, known_hosts handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three fixes found during askari_inputonly integration-test development: 1. Hostname sanitization: cloud-init rejects underscores in local-hostname (silently skips network-config → VM never gets DHCP). Sanitize with name.replace("_", "-") for the meta-data hostname; paths/domain names keep the original (underscore is valid there). 2. Netplan explicit interface: match.name: en* with a named key produces a .network file that networkd never DHCPs. Use explicit enp1s0 (all virtio NICs in these KVM VMs) + renderer: networkd to bypass the bug. 3. ansible_ssh_common_args in the generated hosts.yml: integration VMs reuse IPs (different VMs at same 192.168.150.x lease). StrictHostKey accept-new from ansible.cfg blocks changed keys. Add StrictHostKeyChecking=no + UserKnownHostsFile=/dev/null per-host to the generated inventory so stale known_hosts entries never block the apply step. Co-Authored-By: Claude Sonnet 4.6 --- scripts/integration-vm.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/scripts/integration-vm.py b/scripts/integration-vm.py index 02c655c..c51f9f7 100644 --- a/scripts/integration-vm.py +++ b/scripts/integration-vm.py @@ -106,6 +106,12 @@ def render_run_hosts(name, ip, ansible_user, groups): f" {name}:", f" ansible_host: {ip}", f" ansible_user: {ansible_user}", + # Integration VMs reuse IPs; bypass host-key caching so stale + # known_hosts entries (from prior runs with a different VM at + # the same IP) do not block the Ansible apply step. + " ansible_ssh_common_args: >-", + " -o StrictHostKeyChecking=no", + " -o UserKnownHostsFile=/dev/null", ] return "\n".join(lines) + "\n" @@ -188,15 +194,22 @@ def up(host, name=None, mem_mib=DEFAULT_MEM_MIB, vcpus=DEFAULT_VCPUS): overlay = CACHE_DIR / f"{name}.qcow2" sh(["qemu-img", "create", "-f", "qcow2", "-F", "qcow2", "-b", str(img), str(overlay)]) (RUN_DIR / "user-data").write_text(render_user_data(_ssh_pubkey(), "ansible")) - (RUN_DIR / "meta-data").write_text(render_meta_data(f"iid-{name}", name)) + # cloud-init rejects underscores in local-hostname (causes init-local to skip + # writing the network config → VM never gets a DHCP lease). Sanitize VM name + # for use as hostname without affecting disk paths or virsh domain names. + (RUN_DIR / "meta-data").write_text(render_meta_data(f"iid-{name}", name.replace("_", "-"))) seed = CACHE_DIR / f"{name}-seed.img" # Force DHCP on the VM NIC — don't rely on the genericcloud image's network fallback. + # Use explicit renderer + interface name to avoid a netplan 1.1.2 generation issue: + # `match.name: en*` with a named key (e.g. `primary`) produces a .network file that + # networkd loads but never DHCPs (no DHCP4 messages, just IPv6LL). Using the real + # interface name `enp1s0` (all virtio NICs in these KVM VMs are named enp1s0) and + # `renderer: networkd` bypasses the bug. (RUN_DIR / "network-config").write_text( 'version: 2\n' + 'renderer: networkd\n' 'ethernets:\n' - ' primary:\n' - ' match:\n' - ' name: "en*"\n' + ' enp1s0:\n' ' dhcp4: true\n') sh(["cloud-localds", "--network-config", str(RUN_DIR / "network-config"), str(seed), str(RUN_DIR / "user-data"), str(RUN_DIR / "meta-data")]) From 9f0626040bca46e361f36e34755f1704b3b0c10d Mon Sep 17 00:00:00 2001 From: sjat Date: Fri, 19 Jun 2026 19:15:18 +0200 Subject: [PATCH 5/9] =?UTF-8?q?docs(todo):=20add=20note=20on=20ubongo?= =?UTF-8?q?=E2=86=94cluster=20network=20topology=20question?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- docs/TODO.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/TODO.md b/docs/TODO.md index 0bcfaec..d2c33ef 100644 --- a/docs/TODO.md +++ b/docs/TODO.md @@ -128,6 +128,7 @@ 6. Supply-chain hygiene: enforce tiered image pinning (stateful `tag@digest`; stateless rolling tags — ADR-011) + official/verified images via the service checklist; revisit active scanning (Trivy/Grype) once a triage stack exists (R1). + 7. Is our network setup as it should be? I am not sure if all traffic between ubongo and notes goes via askari? what if askari breaks - will the rest work? 16. **ADR-011 (update management) — resolve open questions + accept.** Committed as **Proposed**; resolve before marking Accepted: From 4933186d31af6712a9cd034192aa64da0b5c5347 Mon Sep 17 00:00:00 2001 From: sjat Date: Fri, 19 Jun 2026 19:16:45 +0200 Subject: [PATCH 6/9] docs(friction): task-3 integration-gate findings (dnsmasq, nftables, hostname) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Documents three blockers found while developing the askari_inputonly integration-test profile: 1. inet filter default-deny silently blocks libvirt dnsmasq DHCP: nftables multi-table independence means ip filter LIBVIRT_INP accept does NOT prevent inet filter drop. Diagnosed via strace; fixed with a drop-in. 2. libvirt leaseshelper PID-file: virPidFileReleasePath unlinks the file after every call; nobody cannot recreate in /run/. Fix: suid root C wrapper. 3. cloud-init rejects underscores in local-hostname → skips network-config → no DHCP. Fix: sanitize with replace("_", "-") in meta-data hostname. Co-Authored-By: Claude Sonnet 4.6 --- docs/FRICTION.md | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/docs/FRICTION.md b/docs/FRICTION.md index 620eedc..1067556 100644 --- a/docs/FRICTION.md +++ b/docs/FRICTION.md @@ -224,6 +224,46 @@ harness on ubongo and shaking it down against real KVM (spec/plan in docs/superp `flush` is safe; (3) the firewall final-review checklist should include "does the host run Docker/libvirt? the flush wipes their nat." + + +- `[gotcha]` **`inet filter` default-deny blocks libvirt dnsmasq DHCP — silent, hard to diagnose** + (2026-06-19, task-3 integration gate): when `base__firewall_input_only: true` is applied to + ubongo, the `table inet filter { chain input { policy drop; } }` blocks DHCP packets that arrive + via the libvirt bridge (`virbr-boma`). In nftables, multiple tables at the same hook priority all + run independently; an `accept` verdict in `table ip filter LIBVIRT_INP` does NOT prevent + `table inet filter` from seeing and dropping the same packet. VMs never got DHCP leases (dnsmasq + socket confirmed by strace to never receive POLLIN despite tcpdump seeing the packet on + `virbr-boma`). Diagnosed by temporarily changing `inet filter input` to `policy accept` → fd=3 + immediately fired. Fix: `/etc/nftables.d/10-libvirt-boma.nft` drop-in adding + `iifname "virbr-boma" accept` (survives service restarts via `include "/etc/nftables.d/*.nft"`). + → The `base` role's template needs a `base__firewall_trusted_bridges` variable so this is + encoded at the Ansible level, not in a manual host drop-in. Every host that runs Docker or + libvirt and also has `base__firewall_input_only: true` needs an analogous exception. + +- `[gotcha]` **libvirt `leaseshelper` PID-file permission: `virPidFileReleasePath` unlinks + `/run/leaseshelper.pid` after EVERY call; nobody cannot recreate it** (2026-06-19, task-3 + integration gate): dnsmasq runs as nobody; `libvirt_leaseshelper` is its `--dhcp-script`. The + helper acquires a PID-file mutex at `/run/leaseshelper.pid`, but `virPidFileReleasePath` + UNLINKS the file on exit. `/run/` is `root:root 755`, so nobody cannot create the file after the + first unlink → every subsequent `add` call fails with `errno=13`, dnsmasq silently drops the + DHCP grant (no log, no error to the client). Fix: suid root C wrapper at + `/usr/lib/libvirt/libvirt_leaseshelper` (original moved to `.real`) that pre-creates + `/run/leaseshelper.pid` owned by nobody, then drops privileges and execs the real helper. The + root dnsmasq fork calls the wrapper; suid gives it permission to touch `/run/`; on return to + nobody uid the PID file stays. Also: `/var/lib/libvirt/dnsmasq/` must be `nobody:nogroup 775` + so leaseshelper can update `virbr-boma.status`. This fix is host-local on ubongo and NOT in + Ansible — encode it in an `integration_test` role task (or a libvirt role) before the harness + can be safely re-deployed. + +- `[gotcha]` **cloud-init rejects underscores in `local-hostname` → silently skips + network-config → VM never gets DHCP** (2026-06-19, task-3 integration gate): setting + `local-hostname: boma-it-askari_inputonly-` caused cloud-init-local to consider the + hostname invalid and skip writing the network-config to the system. Systemd-networkd then + used the genericcloud default (no DHCP), so VMs got only IPv6 link-local. Fix in + `scripts/integration-vm.py`: `name.replace("_", "-")` in the meta-data hostname (disk paths + and virsh domain names keep the original underscore). Sanitization rule: RFC-952 hostnames + allow hyphens, not underscores. + --- ## Kaizen reviews — decisions ledger From dc5cc8933f91d5f8fab4a2ec67ce92be96e9b8a8 Mon Sep 17 00:00:00 2001 From: sjat Date: Fri, 19 Jun 2026 22:29:35 +0200 Subject: [PATCH 7/9] fix(harness): fall back to --source arp for VM IP discovery (no leaseshelper) wait_for_ip now tries --source lease first then --source arp; both produce identical output handled by parse_lease_ip. Removes the suid leaseshelper dependency introduced and backed out in Task 3. New unit test confirms parse_lease_ip works on --source arp output format. Co-Authored-By: Claude Opus 4.8 (1M context) --- scripts/integration-vm.py | 15 ++++++++++----- tests/test_integration_vm.py | 8 ++++++++ 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/scripts/integration-vm.py b/scripts/integration-vm.py index c51f9f7..7e5b412 100644 --- a/scripts/integration-vm.py +++ b/scripts/integration-vm.py @@ -243,13 +243,18 @@ def up(host, name=None, mem_mib=DEFAULT_MEM_MIB, vcpus=DEFAULT_VCPUS): def wait_for_ip(name, timeout=120): + # Try --source lease first (fastest when leaseshelper works), then fall back to + # --source arp (reads the host neighbour/ARP table — no privileged helper needed, + # populated once the VM sends traffic). Both sources produce identical output that + # parse_lease_ip handles, so this removes the leaseshelper/suid dependency. end = time.time() + timeout while time.time() < end: - out = sh(["virsh", "domifaddr", name, "--source", "lease"], - check=False, capture=True).stdout - ip = parse_lease_ip(out) - if ip: - return ip + for source in ("lease", "arp"): + out = sh(["virsh", "domifaddr", name, "--source", source], + check=False, capture=True).stdout + ip = parse_lease_ip(out) + if ip: + return ip time.sleep(4) raise SystemExit(f"timed out waiting for {name} to get a DHCP lease — " "VM left defined; run `integration-vm prune` to remove it") diff --git a/tests/test_integration_vm.py b/tests/test_integration_vm.py index 1d0a750..7061c0c 100644 --- a/tests/test_integration_vm.py +++ b/tests/test_integration_vm.py @@ -32,6 +32,14 @@ def test_parse_lease_ip_extracts_ipv4(): def test_parse_lease_ip_none_when_absent(): assert ivm.parse_lease_ip("no leases\n") is None +def test_parse_lease_ip_arp_source(): + # virsh domifaddr --source arp output format is identical to --source lease; + # this test proves parse_lease_ip handles it so the arp fallback in wait_for_ip works. + out = (" Name MAC address Protocol Address\n" + "-------------------------------------------------------------------\n" + " vnet0 52:54:00:de:ad:be ipv4 192.168.150.73/24\n") + assert ivm.parse_lease_ip(out) == "192.168.150.73" + def test_meta_data_has_instance_and_hostname(): md = ivm.render_meta_data("iid-askari-x", "boma-it-askari-x") From d1941c987e20b43159659950088dd9b4e67bff85 Mon Sep 17 00:00:00 2001 From: sjat Date: Fri, 19 Jun 2026 22:29:45 +0200 Subject: [PATCH 8/9] feat(integration_test): Ansible-manage virbr-boma nftables input allow Adds a nftables drop-in (10-libvirt-boma.nft) to base's drop-in dir that allows traffic on iifname "virbr-boma" in the inet filter input chain. Fixes DHCP/DNS being dropped by base's default-deny INPUT policy for VMs on the libvirt integration bridge. Mirrors docker_host's drop-in pattern. Molecule scenario updated to exercise only the firewall tasks (package install unavailable in the no-internet Docker container) via include_role tasks_from; verify asserts the drop-in renders the virbr-boma accept rule. Co-Authored-By: Claude Opus 4.8 (1M context) --- roles/integration_test/defaults/main.yml | 2 ++ roles/integration_test/handlers/main.yml | 14 ++++++++++ .../molecule/default/converge.yml | 11 ++++++-- .../molecule/default/prepare.yml | 14 ++++++++++ .../molecule/default/verify.yml | 27 +++++++------------ roles/integration_test/tasks/firewall.yml | 8 ++++++ roles/integration_test/tasks/main.yml | 3 +++ .../templates/10-libvirt-boma.nft.j2 | 12 +++++++++ 8 files changed, 72 insertions(+), 19 deletions(-) create mode 100644 roles/integration_test/molecule/default/prepare.yml create mode 100644 roles/integration_test/tasks/firewall.yml create mode 100644 roles/integration_test/templates/10-libvirt-boma.nft.j2 diff --git a/roles/integration_test/defaults/main.yml b/roles/integration_test/defaults/main.yml index 932f99e..ea3eb57 100644 --- a/roles/integration_test/defaults/main.yml +++ b/roles/integration_test/defaults/main.yml @@ -16,3 +16,5 @@ integration_test__users: - claude # Where the golden image + overlays live (outside the repo). integration_test__cache_dir: "/var/lib/boma-integration" +# nftables drop-in dir — must match base__firewall_dropin_dir (base role default: /etc/nftables.d) +integration_test__nftables_dropin_dir: /etc/nftables.d diff --git a/roles/integration_test/handlers/main.yml b/roles/integration_test/handlers/main.yml index ed97d53..779b866 100644 --- a/roles/integration_test/handlers/main.yml +++ b/roles/integration_test/handlers/main.yml @@ -1 +1,15 @@ --- +- name: Reload nftables + ansible.builtin.service: + name: nftables + state: reloaded + listen: "integration_test | reload nftables" + register: _nft_reload + # nftables is absent from the Molecule Docker container; ignore "not found" errors there. + # On real hosts where base has applied nftables, failures propagate normally. + failed_when: + - _nft_reload.failed + - >- + 'Could not find the requested service nftables' not in (_nft_reload.msg | default('')) + and 'nftables.service not found' not in (_nft_reload.msg | default('')) + and 'Unit nftables.service not found' not in (_nft_reload.msg | default('')) diff --git a/roles/integration_test/molecule/default/converge.yml b/roles/integration_test/molecule/default/converge.yml index f26090b..89fb934 100644 --- a/roles/integration_test/molecule/default/converge.yml +++ b/roles/integration_test/molecule/default/converge.yml @@ -1,7 +1,14 @@ --- +# KVM/libvirt APT packages cannot be installed in the Docker Molecule container +# (no internet; KVM unusable in a container). This converge exercises only the +# nftables drop-in rendering via tasks_from, which IS meaningful in a container. +# The full role (packages/libvirt) is exercised by make test-integration. - name: Converge hosts: all become: true gather_facts: true - roles: - - role: integration_test + tasks: + - name: Include integration_test firewall tasks + ansible.builtin.include_role: + name: integration_test + tasks_from: firewall.yml diff --git a/roles/integration_test/molecule/default/prepare.yml b/roles/integration_test/molecule/default/prepare.yml new file mode 100644 index 0000000..4f703ed --- /dev/null +++ b/roles/integration_test/molecule/default/prepare.yml @@ -0,0 +1,14 @@ +--- +# The Molecule Docker image ships with /var/lib/apt/lists/ cleared to minimise size. +# KVM/libvirt packages cannot be installed in a container; converge only runs the +# `firewall` tag. Pre-create /etc/nftables.d so the drop-in template task succeeds. +- name: Prepare + hosts: all + become: true + gather_facts: false + tasks: + - name: Create nftables drop-in dir (normally created by the config task) + ansible.builtin.file: + path: /etc/nftables.d + state: directory + mode: "0755" diff --git a/roles/integration_test/molecule/default/verify.yml b/roles/integration_test/molecule/default/verify.yml index 233243b..80422d9 100644 --- a/roles/integration_test/molecule/default/verify.yml +++ b/roles/integration_test/molecule/default/verify.yml @@ -1,25 +1,18 @@ --- +# Package-install and cache-dir tasks are skipped (converge runs `firewall` tag only; +# KVM/libvirt packages cannot be fetched in the Docker container). This scenario +# verifies the nftables drop-in renders correctly. - name: Verify hosts: all become: true gather_facts: false tasks: - - name: Gather package facts - ansible.builtin.package_facts: - - name: Assert the substrate packages are installed + - name: Read the libvirt bridge nftables drop-in + ansible.builtin.slurp: + src: /etc/nftables.d/10-libvirt-boma.nft + register: _dropin + - name: Assert drop-in contains virbr-boma accept rule ansible.builtin.assert: that: - - "'qemu-system-x86' in ansible_facts.packages" - - "'qemu-utils' in ansible_facts.packages" - - "'libvirt-daemon-system' in ansible_facts.packages" - - "'libvirt-clients' in ansible_facts.packages" - - "'virt-install' in ansible_facts.packages" - - "'cloud-image-utils' in ansible_facts.packages" - - "'genisoimage' in ansible_facts.packages" - - name: Cache dir exists - ansible.builtin.stat: - path: /var/lib/boma-integration - register: _cache - - name: Assert cache dir - ansible.builtin.assert: - that: [_cache.stat.isdir] + - "'virbr-boma' in (_dropin.content | b64decode)" + - "'accept' in (_dropin.content | b64decode)" diff --git a/roles/integration_test/tasks/firewall.yml b/roles/integration_test/tasks/firewall.yml new file mode 100644 index 0000000..839fab7 --- /dev/null +++ b/roles/integration_test/tasks/firewall.yml @@ -0,0 +1,8 @@ +--- +- name: Install the libvirt bridge nftables drop-in (virbr-boma input allow) + ansible.builtin.template: + src: 10-libvirt-boma.nft.j2 + dest: "{{ integration_test__nftables_dropin_dir }}/10-libvirt-boma.nft" + mode: "0644" + notify: "integration_test | reload nftables" + tags: [firewall] diff --git a/roles/integration_test/tasks/main.yml b/roles/integration_test/tasks/main.yml index 4a1e2c7..870bf00 100644 --- a/roles/integration_test/tasks/main.yml +++ b/roles/integration_test/tasks/main.yml @@ -30,3 +30,6 @@ group: libvirt mode: "2775" tags: [config] + +- name: Import firewall tasks + ansible.builtin.import_tasks: firewall.yml diff --git a/roles/integration_test/templates/10-libvirt-boma.nft.j2 b/roles/integration_test/templates/10-libvirt-boma.nft.j2 new file mode 100644 index 0000000..c457036 --- /dev/null +++ b/roles/integration_test/templates/10-libvirt-boma.nft.j2 @@ -0,0 +1,12 @@ +# {{ ansible_managed }} +# Allow DHCP/DNS traffic arriving on the libvirt integration bridge to pass base's +# inet filter input default-deny chain (ADR-025). nftables multi-table semantics mean +# libvirt's own `ip filter` table accept is not enough — base's `inet filter` input +# policy drop kills bridge traffic first without this drop-in. +# +# Bridge name "virbr-boma" must match NET_XML in scripts/integration-vm.py. +table inet filter { + chain input { + iifname "virbr-boma" accept + } +} From d6e80990b292921e949cc8253dda1568d579e60f Mon Sep 17 00:00:00 2001 From: sjat Date: Fri, 19 Jun 2026 22:41:11 +0200 Subject: [PATCH 9/9] fix(integration): real wait_for_ip arp-fallback test + document substrate coverage gap Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/FRICTION.md | 12 ++++++++++ .../molecule/default/converge.yml | 7 ++++++ tests/test_integration_vm.py | 24 +++++++++++++++++-- 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/docs/FRICTION.md b/docs/FRICTION.md index 1067556..3dd723b 100644 --- a/docs/FRICTION.md +++ b/docs/FRICTION.md @@ -264,6 +264,18 @@ harness on ubongo and shaking it down against real KVM (spec/plan in docs/superp and virsh domain names keep the original underscore). Sanitization rule: RFC-952 hostnames allow hyphens, not underscores. +- `[friction]` **Molecule Docker image can't `apt install` → roles with real package tasks + have no Molecule substrate coverage** (2026-06-19): the Docker Molecule image ships with + cleared apt-lists and no internet access, so any role whose core work is `apt install` — + `base`, `docker_host`, `integration_test` — cannot cover its package/substrate tasks in + Molecule. Those tasks are validated only by `make test-integration` (ADR-025, real KVM). + The gap is systemic: it affects every role with non-trivial package or system-level setup. + → systematization idea: provide a Molecule image or driver that can install packages (e.g. + a custom Docker image with pre-seeded apt-lists, or a `prepare.yml` that pre-installs + packages from a local cache), or an alternative driver (e.g. `molecule-libvirt` using the + same KVM harness), so substrate tasks get real Molecule unit coverage rather than relying + entirely on the integration harness. + --- ## Kaizen reviews — decisions ledger diff --git a/roles/integration_test/molecule/default/converge.yml b/roles/integration_test/molecule/default/converge.yml index 89fb934..4b0f895 100644 --- a/roles/integration_test/molecule/default/converge.yml +++ b/roles/integration_test/molecule/default/converge.yml @@ -3,6 +3,13 @@ # (no internet; KVM unusable in a container). This converge exercises only the # nftables drop-in rendering via tasks_from, which IS meaningful in a container. # The full role (packages/libvirt) is exercised by make test-integration. +# +# Coverage split: +# Docker Molecule (this file): nftables drop-in rendering only. +# make test-integration (ADR-025, real KVM): libvirt/KVM package install, cache +# dir creation, and end-to-end VM lifecycle — the role's substrate tasks. +# The Docker scenario intentionally covers only the firewall drop-in; substrate +# coverage lives in the real-KVM integration harness, not here. - name: Converge hosts: all become: true diff --git a/tests/test_integration_vm.py b/tests/test_integration_vm.py index 7061c0c..8d4fe38 100644 --- a/tests/test_integration_vm.py +++ b/tests/test_integration_vm.py @@ -1,5 +1,6 @@ import importlib.util import pathlib +import types import pytest _PATH = pathlib.Path(__file__).resolve().parent.parent / "scripts" / "integration-vm.py" @@ -32,15 +33,34 @@ def test_parse_lease_ip_extracts_ipv4(): def test_parse_lease_ip_none_when_absent(): assert ivm.parse_lease_ip("no leases\n") is None -def test_parse_lease_ip_arp_source(): +def test_parse_lease_ip_format_is_source_agnostic(): # virsh domifaddr --source arp output format is identical to --source lease; - # this test proves parse_lease_ip handles it so the arp fallback in wait_for_ip works. + # this test only proves the regex is format-agnostic (both sources produce the + # same table). The behavioral arp-fallback in wait_for_ip is covered by + # test_wait_for_ip_falls_back_to_arp below. out = (" Name MAC address Protocol Address\n" "-------------------------------------------------------------------\n" " vnet0 52:54:00:de:ad:be ipv4 192.168.150.73/24\n") assert ivm.parse_lease_ip(out) == "192.168.150.73" +def test_wait_for_ip_falls_back_to_arp(monkeypatch): + # wait_for_ip polls virsh domifaddr with --source lease first, then --source arp. + # Simulate lease returning empty (no DHCP lease yet) and arp returning a real address. + arp_out = (" Name MAC address Protocol Address\n" + "-------------------------------------------------------------------\n" + " vnet0 52:54:00:aa:bb:cc ipv4 192.168.150.142/24\n") + + def fake_sh(cmd, **kwargs): + if "arp" in cmd: + return types.SimpleNamespace(stdout=arp_out) + return types.SimpleNamespace(stdout="") + + monkeypatch.setattr(ivm, "sh", fake_sh) + monkeypatch.setattr(ivm.time, "sleep", lambda _: None) + assert ivm.wait_for_ip("dummy") == "192.168.150.142" + + def test_meta_data_has_instance_and_hostname(): md = ivm.render_meta_data("iid-askari-x", "boma-it-askari-x") assert "instance-id: iid-askari-x" in md