diff --git a/inventories/production/group_vars/control/vars.yml b/inventories/production/group_vars/control/vars.yml
index edc7a1d..018c6e6 100644
--- a/inventories/production/group_vars/control/vars.yml
+++ b/inventories/production/group_vars/control/vars.yml
@@ -27,6 +27,12 @@ base__mesh_enabled: true
 # ssh-from-control self-path (base__firewall_control_addr, group_vars/all = 10.20.10.151), or
 # mamba on the LAN. Break-glass: the physical console. (base__firewall_apply defaults true.)
 base__firewall_input_only: true
+
+# DNS-resilience (ADR-016 availability / R8): pin the coordinator FQDN to askari's stable WAN
+# IP in /etc/hosts so a local-DNS hiccup (the 2026-06-18 incident class) can't strand ubongo's
+# mesh. askari (offsite_hosts) is exempt — it reaches the coordinator locally.
+base__mesh_coordinator_pin: "77.42.120.136"
+
 base__firewall_admin_addrs:
   - "10.20.10.50" # mamba over the LAN (NetBird off). Raw DHCP lease — revisit with an
   # OPNsense reservation when OPNsense-as-code lands; backstopped by wt0.
diff --git a/roles/base/defaults/main.yml b/roles/base/defaults/main.yml
index 774e911..cb259e7 100644
--- a/roles/base/defaults/main.yml
+++ b/roles/base/defaults/main.yml
@@ -51,3 +51,9 @@ base__mesh_manage: true
 base__mesh_management_url: "https://netbird.askari.wingu.me"
 base__mesh_setup_key: "{{ vault.netbird.setup_key }}"
 base__mesh_version: "0.72.4" # match the coordinator; exact apt pin confirmed on-host at deploy
+
+# DNS-resilience (ADR-016 availability / accepted-risk R8): when set to the coordinator's
+# stable IP, pin the coordinator FQDN (derived from base__mesh_management_url) in /etc/hosts
+# so a managed mesh host survives a local-DNS hiccup (the 2026-06-18 incident class). Empty
+# = no pin. The coordinator host itself (askari/offsite_hosts) is exempt — leave it empty.
+base__mesh_coordinator_pin: ""
diff --git a/roles/base/molecule/default/converge.yml b/roles/base/molecule/default/converge.yml
index 6ab934d..395918d 100644
--- a/roles/base/molecule/default/converge.yml
+++ b/roles/base/molecule/default/converge.yml
@@ -13,6 +13,7 @@
     base__mesh_enabled: true
     base__mesh_manage: false
     base__mesh_setup_key: "dummy-molecule-key"
+    base__mesh_coordinator_pin: "203.0.113.9" # fixture IP (TEST-NET-3); pins FQDN from base__mesh_management_url
     base__ssh_listen_mesh_only: true
     base__ssh_listen_addr: "100.99.0.1" # fixture mesh IP (no wt0 in the container)
     firewall_zones:
diff --git a/roles/base/molecule/default/molecule.yml b/roles/base/molecule/default/molecule.yml
index 4c17329..a9fb4ca 100644
--- a/roles/base/molecule/default/molecule.yml
+++ b/roles/base/molecule/default/molecule.yml
@@ -24,6 +24,11 @@ platforms:
     # prepare.yml. This entry ensures the value exists in the container's netns at startup.
     sysctls:
       net.ipv4.ip_nonlocal_bind: "0"
+    # ubongo's /etc/resolv.conf points to the NetBird mesh DNS (100.99.x.x), which Docker
+    # containers can't reach (no wt0). Override to a public resolver so prepare.yml apt tasks
+    # can update the cache and install packages.
+    dns_servers:
+      - 8.8.8.8
 
 provisioner:
   name: ansible
diff --git a/roles/base/molecule/default/verify.yml b/roles/base/molecule/default/verify.yml
index d3a7741..51962fb 100644
--- a/roles/base/molecule/default/verify.yml
+++ b/roles/base/molecule/default/verify.yml
@@ -103,3 +103,14 @@
           - _nb.rc != 0
         fail_msg: "netbird must not be installed when base__mesh_manage is false"
         success_msg: "mesh concern is a clean no-op under manage=false"
+
+    - name: Read /etc/hosts (coordinator pin)
+      ansible.builtin.slurp:
+        src: /etc/hosts
+      register: _etchosts
+    - name: Assert the coordinator FQDN is pinned to the fixture IP (DNS-resilience / R8)
+      ansible.builtin.assert:
+        that:
+          - "'203.0.113.9 netbird.askari.wingu.me' in (_etchosts.content | b64decode)"
+        fail_msg: "base__mesh_coordinator_pin did not render the /etc/hosts coordinator pin"
+        success_msg: "coordinator FQDN pinned in /etc/hosts"
diff --git a/roles/base/tasks/mesh.yml b/roles/base/tasks/mesh.yml
index 5226043..83786ef 100644
--- a/roles/base/tasks/mesh.yml
+++ b/roles/base/tasks/mesh.yml
@@ -64,3 +64,18 @@
     - "'Management: Connected' not in (_netbird_status.stdout | default(''))"
   no_log: true # setup key is on the argv
   tags: [mesh]
+
+- name: Pin the NetBird coordinator FQDN in /etc/hosts (DNS-resilience, ADR-016 availability / R8)
+  ansible.builtin.lineinfile:
+    path: /etc/hosts
+    regexp: '\s{{ _coordinator_fqdn | regex_escape }}$'
+    line: "{{ base__mesh_coordinator_pin }} {{ _coordinator_fqdn }}"
+    state: present
+    unsafe_writes: true # fallback for Docker, where /etc/hosts is a bind mount and the atomic rename fails
+  vars:
+    _coordinator_fqdn: "{{ base__mesh_management_url | regex_replace('^https?://', '') | regex_replace('[:/].*', '') }}"
+  when:
+    - base__mesh_enabled | bool
+    - base__mesh_coordinator_pin | default('', true) | length > 0
+    - _coordinator_fqdn | length > 0 # an empty FQDN would reduce the regexp to '\s$' and hit unrelated lines
+  tags: [mesh]