feat(base): pin the NetBird coordinator FQDN in /etc/hosts (mesh DNS-resilience)

Adds base__mesh_coordinator_pin (default empty = no-op). When it is set and base__mesh_enabled
is true, a lineinfile task writes "<ip> <fqdn>" to /etc/hosts so a managed mesh host survives a
local-DNS hiccup (the 2026-06-18 incident class). The FQDN is derived from base__mesh_management_url
via regex_replace (no community.general dependency). The task is gated on base__mesh_enabled | bool
and on a non-empty pin; the coordinator host (askari/offsite_hosts) stays exempt. The production pin
is wired for ubongo (77.42.120.136). Also includes a Molecule dns_servers fix (Docker/NetBird DNS
incompatibility).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
sjat 2026-06-20 11:22:40 +02:00
parent 0286c78f36
commit f83d68d7a0
6 changed files with 43 additions and 0 deletions

View file

@ -27,6 +27,12 @@ base__mesh_enabled: true
# ssh-from-control self-path (base__firewall_control_addr, group_vars/all = 10.20.10.151), or # ssh-from-control self-path (base__firewall_control_addr, group_vars/all = 10.20.10.151), or
# mamba on the LAN. Break-glass: the physical console. (base__firewall_apply defaults true.) # mamba on the LAN. Break-glass: the physical console. (base__firewall_apply defaults true.)
base__firewall_input_only: true base__firewall_input_only: true
# DNS-resilience (ADR-016 availability / R8): askari's stable WAN IP. The coordinator FQDN is
# pinned to this address in /etc/hosts, so a local-DNS hiccup (the 2026-06-18 incident class)
# cannot strand ubongo's mesh. askari (offsite_hosts) is exempt from the pin — it reaches the
# coordinator locally.
base__mesh_coordinator_pin: "77.42.120.136"
base__firewall_admin_addrs: base__firewall_admin_addrs:
- "10.20.10.50" # mamba over the LAN (NetBird off). Raw DHCP lease — revisit with an - "10.20.10.50" # mamba over the LAN (NetBird off). Raw DHCP lease — revisit with an
# OPNsense reservation when OPNsense-as-code lands; backstopped by wt0. # OPNsense reservation when OPNsense-as-code lands; backstopped by wt0.

View file

@ -51,3 +51,9 @@ base__mesh_manage: true
base__mesh_management_url: "https://netbird.askari.wingu.me" base__mesh_management_url: "https://netbird.askari.wingu.me"
base__mesh_setup_key: "{{ vault.netbird.setup_key }}" base__mesh_setup_key: "{{ vault.netbird.setup_key }}"
base__mesh_version: "0.72.4" # match the coordinator; exact apt pin confirmed on-host at deploy base__mesh_version: "0.72.4" # match the coordinator; exact apt pin confirmed on-host at deploy
# DNS-resilience (ADR-016 availability / accepted-risk R8).
# Empty (the default) = no pin. Set it to the coordinator's stable IP to have the coordinator
# FQDN (derived from base__mesh_management_url) pinned in /etc/hosts, so a managed mesh host
# survives a local-DNS hiccup (the 2026-06-18 incident class).
# Leave it empty on the coordinator host itself (askari/offsite_hosts) — that host is exempt.
base__mesh_coordinator_pin: ""

View file

@ -13,6 +13,7 @@
base__mesh_enabled: true base__mesh_enabled: true
base__mesh_manage: false base__mesh_manage: false
base__mesh_setup_key: "dummy-molecule-key" base__mesh_setup_key: "dummy-molecule-key"
# Fixture IP (TEST-NET-3); the FQDN taken from base__mesh_management_url is pinned to it.
base__mesh_coordinator_pin: "203.0.113.9"
base__ssh_listen_mesh_only: true base__ssh_listen_mesh_only: true
base__ssh_listen_addr: "100.99.0.1" # fixture mesh IP (no wt0 in the container) base__ssh_listen_addr: "100.99.0.1" # fixture mesh IP (no wt0 in the container)
firewall_zones: firewall_zones:

View file

@ -24,6 +24,11 @@ platforms:
# prepare.yml. This entry ensures the value exists in the container's netns at startup. # prepare.yml. This entry ensures the value exists in the container's netns at startup.
sysctls: sysctls:
net.ipv4.ip_nonlocal_bind: "0" net.ipv4.ip_nonlocal_bind: "0"
# On ubongo, /etc/resolv.conf points to the NetBird mesh DNS (100.99.x.x). Docker containers
# have no wt0 and cannot reach it, so override the container's resolver with a public one;
# this lets the apt tasks in prepare.yml update the cache and install packages.
dns_servers:
  - "8.8.8.8"
provisioner: provisioner:
name: ansible name: ansible

View file

@ -103,3 +103,14 @@
- _nb.rc != 0 - _nb.rc != 0
fail_msg: "netbird must not be installed when base__mesh_manage is false" fail_msg: "netbird must not be installed when base__mesh_manage is false"
success_msg: "mesh concern is a clean no-op under manage=false" success_msg: "mesh concern is a clean no-op under manage=false"
# Fetch the target's /etc/hosts into _etchosts; the content arrives base64-encoded and is
# decoded by the assertion that checks the coordinator pin.
- name: Read /etc/hosts (coordinator pin)
  register: _etchosts
  ansible.builtin.slurp:
    src: /etc/hosts
# Verify that the role wrote the "<fixture-ip> <coordinator-fqdn>" entry to /etc/hosts.
# Reads the base64 content registered as _etchosts by the preceding slurp task.
- name: Assert the coordinator FQDN is pinned to the fixture IP (DNS-resilience / R8)
  ansible.builtin.assert:
    that:
      # Match a whole line, not a substring of the file: a substring test would also pass on
      # a commented-out entry or on a longer hostname that merely starts with the FQDN.
      - "'203.0.113.9 netbird.askari.wingu.me' in (_etchosts.content | b64decode).splitlines()"
    fail_msg: "base__mesh_coordinator_pin did not render the /etc/hosts coordinator pin"
    success_msg: "coordinator FQDN pinned in /etc/hosts"

View file

@ -64,3 +64,17 @@
- "'Management: Connected' not in (_netbird_status.stdout | default(''))" - "'Management: Connected' not in (_netbird_status.stdout | default(''))"
no_log: true # setup key is on the argv no_log: true # setup key is on the argv
tags: [mesh] tags: [mesh]
# Write "<coordinator-ip> <coordinator-fqdn>" to /etc/hosts so the coordinator name stays
# resolvable during a local-DNS outage.
# Runs only when the mesh is enabled, a pin IP is configured, and a hostname could be derived
# from base__mesh_management_url; otherwise the task is skipped and /etc/hosts is untouched.
- name: Pin the NetBird coordinator FQDN in /etc/hosts (DNS-resilience, ADR-016 availability / R8)
  ansible.builtin.lineinfile:
    path: /etc/hosts
    # Match an existing entry whose last field is the coordinator FQDN, so a changed pin IP
    # rewrites that line in place instead of appending a second entry.
    regexp: '\s{{ _coordinator_fqdn | regex_escape }}$'
    line: "{{ base__mesh_coordinator_pin | trim }} {{ _coordinator_fqdn }}"
    state: present
    unsafe_writes: true  # /etc/hosts is a bind mount in Docker; atomic rename is impossible
  vars:
    # Host part of the management URL: drop the scheme, then everything from the first ':'
    # or '/' onwards (port and path).
    _coordinator_fqdn: "{{ base__mesh_management_url | regex_replace('^https?://', '') | regex_replace('[:/].*', '') }}"
  when:
    - base__mesh_enabled | bool
    # Treat null and whitespace-only pins like the documented empty default (no pin).
    - base__mesh_coordinator_pin | default('', true) | trim | length > 0
    # An empty FQDN would reduce the regexp to '\s$', which matches any line ending in
    # whitespace and would overwrite an unrelated /etc/hosts entry.
    - _coordinator_fqdn | length > 0
  tags: [mesh]