From b10a33f43921760797f034f54a0d2ab251871b92 Mon Sep 17 00:00:00 2001 From: sjat Date: Fri, 19 Jun 2026 09:37:06 +0200 Subject: [PATCH 1/9] feat(base): input-only forward policy + admin-addr SSH allow base__firewall_input_only renders the forward chain policy accept (host-local INPUT filtering only) for hosts that forward container/NAT traffic; defaults false so real service hosts keep the forward default-deny. base__firewall_admin_addrs adds operator-workstation LAN sources to the SSH allow-list alongside wt0 + ssh-from-control. Molecule locks the secure default + the admin rule. Mesh-hardening 2/3 (ADR-020/021). Co-Authored-By: Claude Opus 4.8 (1M context) --- roles/base/defaults/main.yml | 8 ++++++++ roles/base/molecule/default/converge.yml | 2 ++ roles/base/molecule/default/verify.yml | 14 ++++++++++++++ roles/base/templates/nftables.conf.j2 | 5 ++++- 4 files changed, 28 insertions(+), 1 deletion(-) diff --git a/roles/base/defaults/main.yml b/roles/base/defaults/main.yml index 301dee7..774e911 100644 --- a/roles/base/defaults/main.yml +++ b/roles/base/defaults/main.yml @@ -11,6 +11,14 @@ base__firewall_rollback_timeout: 45 # seconds before the auto-revert fires on a base__firewall_confirm_timeout: 20 # seconds to re-establish a fresh connection post-apply base__firewall_dropin_dir: /etc/nftables.d base__firewall_apply: true # set false to render+validate without applying (CI/Molecule) +base__firewall_input_only: false # true → the forward chain is `policy accept` (host-local + # INPUT filtering only). For hosts that forward/route + # container or NAT traffic (the control node's Docker + + # libvirt-NAT) where a forward default-deny would break + # them. Real service hosts keep this false (forward drop). +base__firewall_admin_addrs: [] # extra LAN source IPs allowed to SSH, besides wt0 + + # ssh-from-control. For an operator workstation reaching + # the host over the LAN (no mesh). Key-gated. (ADR-021) # SSH hardening + fail2ban (ADR-002) — `hardening` concern. 
base__ssh_password_authentication: "no" diff --git a/roles/base/molecule/default/converge.yml b/roles/base/molecule/default/converge.yml index 88afbae..6ab934d 100644 --- a/roles/base/molecule/default/converge.yml +++ b/roles/base/molecule/default/converge.yml @@ -6,6 +6,8 @@ vars: base__firewall_apply: false base__firewall_control_addr: 10.10.0.99 # test control-node LAN address + base__firewall_admin_addrs: + - "10.30.0.77" # fixture: an operator-workstation LAN source (admin-addr SSH allow) # Exercise the mesh concern's include path with the live actions gated off, so it # runs hermetically (no coordinator/key needed) and must be a clean no-op. base__mesh_enabled: true diff --git a/roles/base/molecule/default/verify.yml b/roles/base/molecule/default/verify.yml index 2557f69..d3a7741 100644 --- a/roles/base/molecule/default/verify.yml +++ b/roles/base/molecule/default/verify.yml @@ -51,6 +51,20 @@ - "'include \"/etc/nftables.d/*.nft\"' in nft" fail_msg: "missing drop-in include hook" + - name: Assert the forward chain defaults to policy drop (input_only off) + ansible.builtin.assert: + that: + - "'hook forward priority 0; policy drop;' in nft" + fail_msg: >- + forward chain must default to policy drop when base__firewall_input_only is + false (container isolation stays the norm on real service hosts) + + - name: Assert the admin-addr SSH allow rule (operator workstation on the LAN) + ansible.builtin.assert: + that: + - "'ip saddr 10.30.0.77 tcp dport 22 accept' in nft" + fail_msg: "missing admin-addr SSH allow rule from base__firewall_admin_addrs" + - name: Syntax-check the rendered ruleset (no apply) ansible.builtin.command: nft -c -f /etc/nftables.conf changed_when: false diff --git a/roles/base/templates/nftables.conf.j2 b/roles/base/templates/nftables.conf.j2 index b85ff86..ce33b53 100644 --- a/roles/base/templates/nftables.conf.j2 +++ b/roles/base/templates/nftables.conf.j2 @@ -12,13 +12,16 @@ table inet filter { {% if base__firewall_control_addr %} ip saddr 
{{ base__firewall_control_addr }} tcp dport {{ base__firewall_ssh_port }} accept {% endif %} +{% for addr in base__firewall_admin_addrs %} + ip saddr {{ addr }} tcp dport {{ base__firewall_ssh_port }} accept +{% endfor %} ip protocol icmp accept ip6 nexthdr ipv6-icmp accept {% for r in base__firewall_resolved %} ip saddr { {{ r.sources | join(', ') }} } {{ r.proto }} dport {{ r.port }} accept {% endfor %} } - chain forward { type filter hook forward priority 0; policy drop; } + chain forward { type filter hook forward priority 0; policy {{ 'accept' if base__firewall_input_only | bool else 'drop' }}; } chain output { type filter hook output priority 0; policy accept; } } From b3e14decb499da109447963cf4a05b28b9efd359 Mon Sep 17 00:00:00 2001 From: sjat Date: Fri, 19 Jun 2026 09:42:49 +0200 Subject: [PATCH 2/9] feat(inventory): ubongo gets INPUT-only host firewall + mamba LAN SSH Enables base__firewall_input_only on the control group (forward chain stays permissive so Docker egress + the integration-test libvirt NAT survive) and allows the operator workstations' LAN IPs (mamba 10.20.10.50 + 10.20.10.17; raw leases, backstopped by wt0). Mesh-hardening 2/3. Co-Authored-By: Claude Opus 4.8 (1M context) --- inventories/production/group_vars/control/vars.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/inventories/production/group_vars/control/vars.yml b/inventories/production/group_vars/control/vars.yml index 6f06074..edc7a1d 100644 --- a/inventories/production/group_vars/control/vars.yml +++ b/inventories/production/group_vars/control/vars.yml @@ -19,3 +19,15 @@ base__ai_worker_user: claude # Enrollment only; the host firewall default-deny stays deferred (the mesh-hardening # follow-on), so this brings up wt0 without changing SSH exposure. 
base__mesh_enabled: true + +# Mesh-hardening 2/3 (2026-06-19, ADR-020/021): apply base's host firewall to ubongo as +# INPUT-only default-deny — harden the inbound surface, leave the forward chain permissive so +# Docker egress + the libvirt-NAT integration harness keep working. sshd is unchanged +# (nftables scopes inbound), so there is no boot-race. Reach ubongo over wt0 (mesh), the +# ssh-from-control self-path (base__firewall_control_addr, group_vars/all = 10.20.10.151), or +# mamba on the LAN. Break-glass: the physical console. (base__firewall_apply defaults true.) +base__firewall_input_only: true +base__firewall_admin_addrs: + - "10.20.10.50" # mamba over the LAN (NetBird off). Raw DHCP lease — revisit with an + # OPNsense reservation when OPNsense-as-code lands; backstopped by wt0. + - "10.20.10.17" # 2nd operator workstation (MAC bc:0f:f3:c8:4a:8a). Raw lease — ditto. From 6ac5afaf6771663e6e1aa23b55814ae7a2064a9f Mon Sep 17 00:00:00 2001 From: sjat Date: Fri, 19 Jun 2026 09:47:03 +0200 Subject: [PATCH 3/9] test(integration): add the 'be ubongo' profile (input-only default-deny) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A control-group VM that applies base with INPUT-only default-deny (forward policy accept; admin-addr SSH allow). verify.yml is now profile-aware via an integration_profile marker — the askari Docker/DNAT block is gated, and a ubongo block asserts input drop + forward accept + the admin-addr rule. Enables `make test-integration HOST=ubongo`. Mesh-hardening 2/3 (ADR-025). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/integration/overrides/askari.yml | 1 + tests/integration/overrides/ubongo.yml | 18 +++++++++ tests/integration/profiles/ubongo.json | 9 +++++ tests/integration/verify.yml | 55 ++++++++++++++++++++++---- 4 files changed, 75 insertions(+), 8 deletions(-) create mode 100644 tests/integration/overrides/ubongo.yml create mode 100644 tests/integration/profiles/ubongo.json diff --git a/tests/integration/overrides/askari.yml b/tests/integration/overrides/askari.yml index c0d08b3..1b6637e 100644 --- a/tests/integration/overrides/askari.yml +++ b/tests/integration/overrides/askari.yml @@ -1,6 +1,7 @@ --- # Integration-test overlay for the "askari" profile (ADR-025). Passed via `-e @`. # Reproduces the 2026-06-17 incident: apply base's nftables default-deny to a Docker host. +integration_profile: askari base__firewall_apply: true # Keep a break-glass: sshd stays on all interfaces (never wt0-only in a throwaway VM). base__ssh_listen_mesh_only: false diff --git a/tests/integration/overrides/ubongo.yml b/tests/integration/overrides/ubongo.yml new file mode 100644 index 0000000..7d1f948 --- /dev/null +++ b/tests/integration/overrides/ubongo.yml @@ -0,0 +1,18 @@ +--- +# Integration-test overlay for the "ubongo" profile (ADR-025). Passed via `-e @`. +# Exercises mesh-hardening 2/3: base's INPUT-only default-deny on the control node — input +# chain default-deny, forward chain left permissive (Docker/libvirt-NAT safe), no sshd +# ListenAddress change (so no boot-race). +integration_profile: ubongo +base__firewall_apply: true +base__firewall_input_only: true # forward chain renders `policy accept` +base__firewall_admin_addrs: + - "192.168.150.98" # two representative LAN sources — exercises the + - "192.168.150.99" # admin-addr loop with a multi-entry list (like ubongo) +# Never wt0-only; never touch the real mesh from a throwaway VM. 
+base__ssh_listen_mesh_only: false +base__mesh_enabled: false +# Allow SSH from the libvirt-NAT gateway (where the driver/ansible connect from) so the +# default-deny apply + the reboot don't lock out the harness. By source IP (interface- +# independent). This is the harness's lifeline; the admin-addr above is only exercised. +base__firewall_control_addr: "192.168.150.1" diff --git a/tests/integration/profiles/ubongo.json b/tests/integration/profiles/ubongo.json new file mode 100644 index 0000000..2d647e1 --- /dev/null +++ b/tests/integration/profiles/ubongo.json @@ -0,0 +1,9 @@ +{ + "groups": ["control"], + "applies": [ + {"playbook": "site.yml", "tags": ["base"]} + ], + "extra_vars_files": ["overrides/ubongo.yml"], + "mem_mib": 2048, + "vcpus": 2 +} diff --git a/tests/integration/verify.yml b/tests/integration/verify.yml index e6c99b8..129b908 100644 --- a/tests/integration/verify.yml +++ b/tests/integration/verify.yml @@ -1,33 +1,48 @@ --- -# Integration verify (ADR-025). Outcome-based: proves Docker forwarding survives the -# reboot. The load-bearing check probes the VM's published :80 FROM the controller -# (ubongo) — if base's forward-drop killed DNAT, this times out (the FRICTION #1 bug). +# Integration verify (ADR-025). Outcome-based, profile-aware: the active profile is named by +# `integration_profile` (set in each profile's overlay). Each profile asserts its own success +# criteria; an unknown/unset profile fails loudly (never a silent pass). 
- name: Verify the rebooted host hosts: all become: true gather_facts: false tasks: - - name: Gather service facts + - name: A known integration_profile must be set (no silent pass) + ansible.builtin.assert: + that: + - integration_profile is defined + - integration_profile in ['askari', 'ubongo'] + fail_msg: "integration_profile must be set in the profile overlay (askari|ubongo)" + + # ── askari profile — Docker host: published-port forwarding survives the reboot ── + # The load-bearing check probes the VM's published :80 FROM the controller (ubongo) — if + # base's forward-drop killed DNAT, this times out (the FRICTION 2026-06-17 #1 bug). + - name: (askari) Gather service facts + when: integration_profile == 'askari' ansible.builtin.service_facts: - - name: Docker daemon is active + - name: (askari) Docker daemon is active + when: integration_profile == 'askari' ansible.builtin.assert: that: "ansible_facts.services['docker.service'].state == 'running'" fail_msg: "docker.service is not running" - - name: Forward chain permits container traffic (drop-in loaded) + - name: (askari) Forward chain permits container traffic (drop-in loaded) + when: integration_profile == 'askari' ansible.builtin.command: nft list chain inet filter forward register: _fwd changed_when: false - - name: Assert container forwarding is allowed (not pure drop) + - name: (askari) Assert container forwarding is allowed (not pure drop) + when: integration_profile == 'askari' ansible.builtin.assert: that: "'accept' in _fwd.stdout" fail_msg: >- forward chain is pure drop — container forwarding will die on reboot (FRICTION 2026-06-17 #1). docker_host container-forward drop-in missing. 
- - name: Published port answers from the controller (DNAT + forward alive) + - name: (askari) Published port answers from the controller (DNAT + forward alive) + when: integration_profile == 'askari' delegate_to: localhost become: false ansible.builtin.uri: @@ -42,3 +57,27 @@ retries: 5 delay: 6 until: _probe is succeeded + + # ── ubongo profile — control node: INPUT-only default-deny survives the reboot ── + # SSH reachability across the reboot is proven by the harness itself (it re-SSHes and + # checks boot_id changed before this verify runs). Here we assert the ruleset shape. + - name: (ubongo) Read the live nftables ruleset + when: integration_profile == 'ubongo' + ansible.builtin.command: nft list ruleset + register: _nft + changed_when: false + + - name: (ubongo) INPUT default-deny, forward permissive, lifeline + admin-addr allow + when: integration_profile == 'ubongo' + ansible.builtin.assert: + that: + - "'hook input priority 0; policy drop;' in _nft.stdout" + - "'hook forward priority 0; policy accept;' in _nft.stdout" + # the ssh-from-control lifeline (base__firewall_control_addr) — the reconnect path + - "'ip saddr 192.168.150.1 tcp dport 22 accept' in _nft.stdout" + - "'ip saddr 192.168.150.98 tcp dport 22 accept' in _nft.stdout" + - "'ip saddr 192.168.150.99 tcp dport 22 accept' in _nft.stdout" + fail_msg: >- + ubongo profile: expected input policy drop, forward policy accept (input-only), + the ssh-from-control lifeline (192.168.150.1), and both admin-addr + (192.168.150.98/99) SSH allows in the live ruleset. From 26bb7e442d483e7c336ddf6c68f63ad340be50c3 Mon Sep 17 00:00:00 2001 From: sjat Date: Fri, 19 Jun 2026 10:32:09 +0200 Subject: [PATCH 4/9] fix(integration): pin system python for virt-install (venv PATH hijack) The Makefile prepends .venv/bin to PATH (so the venv's ansible tools resolve), but virt-install's `#!/usr/bin/env python3` shebang then resolved to the isolated venv, which lacks system PyGObject (gi) -> ModuleNotFoundError. 
Strip .venv/bin from PATH for the virt-install call so its shebang finds /usr/bin/python3 (which has gi); ansible runs via its absolute .venv path and is unaffected. Surfaced running `make test-integration HOST=ubongo`. Co-Authored-By: Claude Opus 4.8 (1M context) --- scripts/integration-vm.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/scripts/integration-vm.py b/scripts/integration-vm.py index b5ec90e..02c655c 100644 --- a/scripts/integration-vm.py +++ b/scripts/integration-vm.py @@ -201,6 +201,13 @@ def up(host, name=None, mem_mib=DEFAULT_MEM_MIB, vcpus=DEFAULT_VCPUS): sh(["cloud-localds", "--network-config", str(RUN_DIR / "network-config"), str(seed), str(RUN_DIR / "user-data"), str(RUN_DIR / "meta-data")]) console = CACHE_DIR / f"{name}-console.log" + # virt-install has a `#!/usr/bin/env python3` shebang; the Makefile prepends .venv/bin to + # PATH (so the venv's ansible tools resolve), which would hijack virt-install into the + # isolated venv — it lacks system PyGObject (`gi`) and crashes. Strip the venv from PATH + # for this system tool so its shebang finds /usr/bin/python3 (which has gi). Ansible is + # invoked via its absolute .venv path elsewhere, so it is unaffected. + sys_path = ":".join(p for p in os.environ.get("PATH", "").split(":") + if "/.venv/bin" not in p) sh(["virt-install", "--name", name, "--memory", str(mem_mib), "--vcpus", str(vcpus), "--boot", "uefi", # genericcloud triple-faults on legacy BIOS handoff; UEFI boots "--import", @@ -210,7 +217,8 @@ def up(host, name=None, mem_mib=DEFAULT_MEM_MIB, vcpus=DEFAULT_VCPUS): "--osinfo", "debian13", "--graphics", "none", "--serial", f"file,path={console}", - "--noautoconsole"]) + "--noautoconsole"], + env=dict(os.environ, PATH=sys_path)) ip = wait_for_ip(name) wait_for_ssh(ip, "ansible") # Block until cloud-init finishes (incl. apt-get update) so apply sees a ready system. 
From 468f8c3a924e0e87c0ddd5a1e766d4b32563c554 Mon Sep 17 00:00:00 2001 From: sjat Date: Fri, 19 Jun 2026 10:32:09 +0200 Subject: [PATCH 5/9] fix(integration): match live nft `priority filter` in the ubongo verify `nft list ruleset` prints the symbolic chain priority (`filter` = 0); the ubongo profile asserted `priority 0` (the rendered-file format the Molecule scenario checks), so the live-ruleset assertion failed even though the firewall was correct. Assert `priority filter` for the input/forward policy lines. Caught by the harness GREEN gate (`make test-integration HOST=ubongo`). Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/integration/verify.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/integration/verify.yml b/tests/integration/verify.yml index 129b908..1f460f5 100644 --- a/tests/integration/verify.yml +++ b/tests/integration/verify.yml @@ -71,8 +71,10 @@ when: integration_profile == 'ubongo' ansible.builtin.assert: that: - - "'hook input priority 0; policy drop;' in _nft.stdout" - - "'hook forward priority 0; policy accept;' in _nft.stdout" + # live `nft list ruleset` prints the SYMBOLIC priority (`filter` = 0), unlike the + # rendered /etc/nftables.conf (`priority 0`) that the Molecule scenario asserts against. 
+ - "'hook input priority filter; policy drop;' in _nft.stdout" + - "'hook forward priority filter; policy accept;' in _nft.stdout" # the ssh-from-control lifeline (base__firewall_control_addr) — the reconnect path - "'ip saddr 192.168.150.1 tcp dport 22 accept' in _nft.stdout" - "'ip saddr 192.168.150.98 tcp dport 22 accept' in _nft.stdout" From 8d8c86fa39781fd54090f0e324dc316f8d4148d8 Mon Sep 17 00:00:00 2001 From: sjat Date: Fri, 19 Jun 2026 10:32:09 +0200 Subject: [PATCH 6/9] docs(friction): VM-testing standard + libvirt stale-session gotcha Two signals from running the ubongo harness gate: (1) the operator wants a standard pre-authorising isolated VM integration tests on ubongo so the agent doesn't ask each time; (2) a stale agent session (shell predating the integration_test libvirt-group grant) carries stale process groups, so the harness's qemu-img/file writes are denied -> run via 'sg libvirt -c ...'; self-heal idea noted. Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/FRICTION.md | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/docs/FRICTION.md b/docs/FRICTION.md index 994ba59..fdcc2ac 100644 --- a/docs/FRICTION.md +++ b/docs/FRICTION.md @@ -158,6 +158,34 @@ harness on ubongo and shaking it down against real KVM (spec/plan in docs/superp reservations** (`10.20.10.17` = MAC `bc:0f:f3:c8:4a:8a`; mamba's MAC TBD) and allow the reserved IPs. Spec: `docs/superpowers/specs/2026-06-19-mesh-hardening-ubongo-default-deny-design.md`. 
+- `[gotcha]` **`make test-integration` on ubongo fails (`qemu-img` "Permission denied") when + the agent session predates the `libvirt` group grant** (2026-06-19): the `integration_test` + role adds `claude` to `libvirt`+`kvm` and makes the cache dir `/var/lib/boma-integration` + `root:libvirt 2775` — correct — but a `claude` session whose shell started *before* that + grant carries a stale process group set (`id` → `claude,docker` only, no `libvirt`), so + `qemu-img create` of the VM overlay into the group-owned dir is denied. `virsh`/`virt-install` + still work (they reach system libvirtd via polkit/socket, and the real KVM runs server-side + as `libvirt-qemu`), so ONLY claude's own file-writes break. Unblock without restarting the + session: **`sg libvirt -c 'make test-integration HOST=<host>'`** (claude needs only `libvirt` + for the dir; `kvm` is server-side; note `sg` adds one group, not the full set). → self-heal + in `scripts/integration-vm.py`: if the `libvirt` gid is absent from `os.getgroups()`, re-exec + under `sg libvirt` (or have the Makefile target do it), so a stale-session agent never hits + this opaque symptom. New agent sessions pick the groups up on login, so it's a stale-session + transient — but high-confusion, worth self-healing. + +- `[friction]` **No standard for when the agent may run local-VM integration tests on ubongo + without asking** (2026-06-19): `make test-integration HOST=<host>` spins an ISOLATED throwaway + KVM VM (its own libvirt NAT; never touches the real host's firewall/network; guards: + one-VM-at-a-time + a 4 GiB free-RAM floor + auto-destroy on success), so it is safe and + self-contained — yet the agent paused for a go-ahead before running it (mesh-hardening 2/3, + Task 4). The operator wants a STANDARD that pre-authorises VM-testing on ubongo so the agent + just runs it. → decide + record the rule: e.g.
a `.claude/settings.json` permission allow for + `make test-integration*` / `scripts/integration-vm.py` (and the `sg libvirt -c '…'` form per + the gotcha above), plus a CLAUDE.md line distinguishing the pre-authorised isolated VM tests + from the genuinely-gated live steps (`make deploy` to real hosts, host reboots, cutovers — + still need a go-ahead). Ties to the `test-risky-infra-before-live-deploy` + + `dont-reask-settled-defaults` memories + ADR-025. + --- ## Kaizen reviews — decisions ledger From 180af46879a4acc315ce28a96009c8516b813bea Mon Sep 17 00:00:00 2001 From: sjat Date: Fri, 19 Jun 2026 10:40:29 +0200 Subject: [PATCH 7/9] docs(friction): log the Molecule input_only-accept coverage gap Final-review finding: the default Molecule scenario only renders the forward drop (input_only off) branch; the accept branch is covered by the integration harness only. Tracked for a kaizen decision (2nd scenario vs accept the split). Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/FRICTION.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docs/FRICTION.md b/docs/FRICTION.md index fdcc2ac..2ce85fa 100644 --- a/docs/FRICTION.md +++ b/docs/FRICTION.md @@ -186,6 +186,17 @@ harness on ubongo and shaking it down against real KVM (spec/plan in docs/superp still need a go-ahead). Ties to the `test-risky-infra-before-live-deploy` + `dont-reask-settled-defaults` memories + ADR-025. +- `[gotcha]` **Molecule covers only the `input_only`-OFF (forward drop) branch of the base + firewall** (2026-06-19): mesh-hardening 2/3 added `base__firewall_input_only` (forward policy + drop↔accept). The `default` Molecule scenario renders ONE fixture, set to the secure default + (drop) — so the fast `make test ROLE=base` gate locks the drop default (security-critical for + service hosts) but does NOT exercise the `=true` → forward-`accept` rendering; only `make + test-integration HOST=ubongo` does (passed GREEN). 
An in-converge re-render can't cheaply + cover it (role defaults aren't in scope outside the role run). → decide in kaizen: a second + Molecule scenario (`molecule/input-only/`) asserting forward `policy accept`, vs accepting the + integration-only coverage. Final-review finding; not a cutover blocker (the accept branch is a + literal, and a var-name break would fail the drop branch too → caught). + --- ## Kaizen reviews — decisions ledger From a881185c73cca5c631dc2cb11e1240b0bb934f79 Mon Sep 17 00:00:00 2001 From: sjat Date: Fri, 19 Jun 2026 15:16:21 +0200 Subject: [PATCH 8/9] docs(friction): base firewall flush wipes Docker nat (cutover finding) Applying base's nftables (even INPUT-only/forward-accept) to a Docker host flushes Docker's ip nat -> container egress breaks until 'systemctl restart docker'. Found on the ubongo mesh-hardening 2/3 live cutover; the Docker-less test VM couldn't surface it. Self-heals on reboot (dockerd re-adds nat; forward=accept doesn't block). Runbook/docker_host follow-ups noted. Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/FRICTION.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/docs/FRICTION.md b/docs/FRICTION.md index 2ce85fa..f555f04 100644 --- a/docs/FRICTION.md +++ b/docs/FRICTION.md @@ -197,6 +197,23 @@ harness on ubongo and shaking it down against real KVM (spec/plan in docs/superp integration-only coverage. Final-review finding; not a cutover blocker (the accept branch is a literal, and a var-name break would fail the drop branch too → caught). +- `[gotcha]` **Applying base's firewall to a Docker host flushes Docker's nat → container + egress dies until `restart docker`** (2026-06-19, mesh-hardening 2/3 live cutover): base's + `nftables.conf.j2` starts with `flush ruleset`, which wipes ALL tables incl. Docker's + `ip nat`/`ip filter` (+ libvirt's). 
On ubongo I chose INPUT-only so `forward` stays `accept` + — yet the apply STILL broke CONTAINER egress: `docker pull` worked (dockerd uses HOST egress) + but a container `ping` FAILED — the masquerade (SNAT) was gone, so replies couldn't return. + `forward accept` permits forwarding but can't replace the missing nat. The spec's "input-only + keeps Docker egress working" was therefore **incomplete**, and the local-VM harness couldn't + catch it (the test VM runs no Docker). Fix on the live host: `systemctl restart docker` + re-adds its `ip nat`/`ip filter` (egress restored; coexists fine with base's `inet filter`). + On REBOOT it self-heals (dockerd re-adds nat on boot; `forward accept` doesn't block — unlike + the 2026-06-17 `forward drop` incident). → (1) any cutover/runbook applying base firewall to a + Docker host MUST `restart docker` + check container egress after the apply; (2) the pending + `docker_host` nftables integration should own re-adding/persisting Docker's rules so base's + `flush` is safe; (3) the firewall final-review checklist should include "does the host run + Docker/libvirt? the flush wipes their nat." + --- ## Kaizen reviews — decisions ledger From fa2c4c6368360339ec9525f726601a1b1ee3d1ec Mon Sep 17 00:00:00 2001 From: sjat Date: Fri, 19 Jun 2026 15:34:20 +0200 Subject: [PATCH 9/9] =?UTF-8?q?docs(status):=20mesh-hardening=202/3=20?= =?UTF-8?q?=E2=80=94=20ubongo=20INPUT-only=20default-deny=20applied?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit base firewall applied + live-verified on ubongo (INPUT-only default-deny; base__firewall_input_only). Records the Docker-nat-flush caveat (needs a restart docker on a Docker host), the claude self-SSH grant, and reboot-validation-pending. ROADMAP: sub-project 2 done; remaining = NetBird ACL + askari redesign. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- STATUS.md | 8 ++++---- docs/ROADMAP.md | 23 +++++++++++++---------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/STATUS.md b/STATUS.md index bed6cfd..b8c9ed5 100644 --- a/STATUS.md +++ b/STATUS.md @@ -5,7 +5,7 @@ This repo is partly aspirational: the ADRs in `docs/decisions/` describe the truth. **Before relying on a role, provider, or pipeline existing, check here.** If something is listed as "designed, not built", do not assume it works. -_Last reviewed: 2026-06-18._ +_Last reviewed: 2026-06-19._ ## Real and working today @@ -30,7 +30,7 @@ _Last reviewed: 2026-06-18._ | `roles/dev_env/` — interactive developer environment | **Built + applied.** zsh + oh-my-zsh + oh-my-posh, tmux + TPM plugins, neovim; dotfiles deployed via GNU stow (re-derived from V4/fisi per ADR-013). Node.js from a pinned upstream tarball (not Debian's npm). Lint + Molecule (idempotent) green. **Applied to `ubongo`** for users `sjat` + `claude` (verified: zsh login shells, stow-symlinked `.zshrc`/`.tmux.conf` + nvim config, oh-my-zsh, tmux plugins; nvim v0.12.2, oh-my-posh 29.0.1). Run via `playbooks/workstation.yml` against the `control` group (no dedicated `workstations` group yet). | | `make check` / `make deploy PLAYBOOK=` | **Works.** First end-to-end run (applying `dev_env`) surfaced + fixed latent bugs: Makefile `PLAYBOOK` var collision (binary path vs playbook-name arg) meant the targets never ran; `ansible.cfg` referenced uninstalled community.general callbacks (now built-in `default` + `ansible.posix.profile_tasks`); `acl` package added so Ansible can `become_user` an unprivileged user. The make targets now function — though `site`/`base`/`docker_host` content is still incomplete (see below). 
| | `roles/public_dns/` + `playbooks/dns.yml` | **Built + applied.** Manages wingu.me at Gandi LiveDNS as code (`community.general.gandi_livedns`, PAT from `vault.gandi.pat`); record data, anti-spoof baseline (SPF `-all` + DMARC reject), and the Gandi-defaults purge are defined + unit-tested (`tests/test_public_dns.py`). **Applied to wingu.me (2026-06-14):** purged Gandi's 13 seeded defaults; zone now holds only the SPF + DMARC TXT records; idempotent re-run clean. No null-MX (Gandi rejects `0 .`) — the MX is removed, so no MX + no apex A = no mail. M1 of the roadmap. | -| `ubongo` — physical control / AI-worker host (ADR-015) | **Built (partial).** Debian 13.5 on a Lenovo M70q (i3-10100T, 16 GB, 256 GB SSD; no disk encryption — accepted risk). Full toolchain installed + pinned to `fisi` (Docker 29.5.3, rbw 1.15.0, Claude Code 2.1.173, ansible-core 2.17.14 + molecule via `make setup`/`make collections`). Repo cloned under a dedicated `claude` user (docker + libvirt groups, **`NOPASSWD:ALL` sudo** — ADR-015 amended 2026-06-18; operator `sjat` uses password-required sudo via `sudo` group; the former `sjat-ansible` NOPASSWD drop-in removed 2026-06-18). Vault works via rbw (offline-cache decryption verified). SSH key-only (password + root login disabled). In the production inventory `control` group at 10.20.10.151. **`dev_env` now applied here** (zsh/tmux/nvim for `sjat` + `claude`, via `playbooks/workstation.yml`). Managed as the operator account `sjat` (`group_vars/control` sets `ansible_user: sjat`), not the `ansible` service user `group_vars/all` assumes — ubongo has no bootstrapped `ansible` user. **NetBird mesh-enrolled (M5, 2026-06-17):** `wt0` up at `100.99.146.14` via the `base` `mesh` concern. 
**Pending:** full `base` hardening (only `firewall` exists, NOT applied here — default-deny is the deferred mesh-hardening step now that `wt0` exists); proper `ansible`-user bootstrap (currently managed as `sjat`); OPNsense DHCP reservation for 10.20.10.151 (MAC `88:a4:c2:e0:ee:da`); Terraform state backup (now relevant — the offsite tfstate exists). | +| `ubongo` — physical control / AI-worker host (ADR-015) | **Built (partial).** Debian 13.5 on a Lenovo M70q (i3-10100T, 16 GB, 256 GB SSD; no disk encryption — accepted risk). Full toolchain installed + pinned to `fisi` (Docker 29.5.3, rbw 1.15.0, Claude Code 2.1.173, ansible-core 2.17.14 + molecule via `make setup`/`make collections`). Repo cloned under a dedicated `claude` user (docker + libvirt groups, **`NOPASSWD:ALL` sudo** — ADR-015 amended 2026-06-18; operator `sjat` uses password-required sudo via `sudo` group; the former `sjat-ansible` NOPASSWD drop-in removed 2026-06-18). Vault works via rbw (offline-cache decryption verified). SSH key-only (password + root login disabled). In the production inventory `control` group at 10.20.10.151. **`dev_env` now applied here** (zsh/tmux/nvim for `sjat` + `claude`, via `playbooks/workstation.yml`). Managed as the operator account `sjat` (`group_vars/control` sets `ansible_user: sjat`), not the `ansible` service user `group_vars/all` assumes — ubongo has no bootstrapped `ansible` user. **NetBird mesh-enrolled (M5, 2026-06-17):** `wt0` up at `100.99.146.14` via the `base` `mesh` concern. **`base` firewall applied (mesh-hardening 2/3, 2026-06-19):** INPUT-only default-deny — input locked to `wt0` + ssh-from-control (`10.20.10.151`) + workstations (`10.20.10.50` mamba, `10.20.10.17`); forward `accept` (Docker/libvirt-NAT safe). Live-verified (SSH self-path + Docker egress, after a post-apply `restart docker` — base's flush wipes Docker nat, FRICTION); **real-host reboot validation pending** (low-risk — lockout-safe via the permanent console). 
`claude` now self-SSHes (ad-hoc `authorized_keys` grant so the agent can run SSH-based deploys with the auto-rollback safety; fold into the control-node bootstrap). **Pending:** full `base` hardening (auditd/CIS); proper `ansible`-user bootstrap (currently managed as `sjat`); OPNsense DHCP reservations (10.20.10.151 MAC `88:a4:c2:e0:ee:da` + the `.50`/`.17` workstation leases); Terraform state backup (now relevant — the offsite tfstate exists). | | `askari` — off-site Hetzner VPS (ADR-007/016, M2) | **Built + applied.** Provisioned by Terraform (`environments/offsite`, `hetznercloud/hcloud`) as **cx23 / hel1 / Debian 13.5** (CAX11/ARM was out of stock EU-wide on 2026-06-14 → cx23 is same-spec x86, cheaper). cloud-init created the `ansible` user + passwordless sudo; a TF-managed Hetzner Cloud Firewall allows SSH only from ubongo's WAN (`91.226.145.80`). Reachable from ubongo (`ansible offsite_hosts -m ping` ✓), in the `offsite_hosts` inventory (generated `offsite.yml`), published at `askari.wingu.me` → `77.42.120.136`. **SSH-hardened + fail2ban (M3).** **Docker + Caddy reverse proxy (M4a):** `docker_host` + `reverse_proxy` (vanilla Caddy, HTTP-01) applied; `https://test.askari.wingu.me` serves a valid Let's Encrypt cert ✓ (firewall opens 80/443/3478). **NetBird coordinator (M4b):** `netbird_coordinator` deployed — dashboard live at `https://netbird.askari.wingu.me` (valid LE cert), management API behind embedded Dex (401 unauth), STUN on 3478/udp. **NetBird peer (M5, 2026-06-17):** also enrolled as a mesh agent (`base` `mesh` concern) — `wt0` at `100.99.226.39`, Management+Signal Connected; the agent coexists with the coordinator. **Pending:** host firewall + moving askari's SSH onto `wt0` (deferred mesh-hardening; the Hetzner Cloud Firewall is its perimeter until then), offsite tfstate backup (ADR-022). | | `roles/docker_host/` (Docker engine) + `roles/reverse_proxy/` (Caddy, ADR-024) | **Built + applied** (askari, M4a). 
`docker_host` installs Docker CE + compose; `reverse_proxy` is boma's standard Caddy proxy (HTTP-01 for public hosts; routes from `reverse_proxy__routes`). **DNS-01 for mesh/LAN-only services is now built + proven (2026-06-15):** custom `caddy-gandi` image (`.docker/caddy-gandi/`, `make caddy-image`, pinned caddy-dns/gandi v1.1.0 → Bearer PAT), enabled per-instance via `reverse_proxy__acme_dns_provider: gandi` + `reverse_proxy__image`. Verified end-to-end — a real wildcard cert issued via LE **staging** + Gandi DNS-01 with `vault.gandi.pat`. M4a's deferral (version skew + Hetzner-IP build) is closed; image **pending registry push** (`make caddy-image-push` needs `docker login`). The `reverse_proxy` Caddyfile is bind-mounted as a **directory** (`./caddy` → `/etc/caddy`) so atomic re-renders are visible in-container and `caddy reload` actually applies new routes (a single-file mount pinned the stale inode). | | `roles/netbird_coordinator/` — NetBird control plane (ADR-016, M4b) | **Built + applied (askari, 2026-06-16). boma's FIRST real service role.** Self-hosted NetBird **v0.72.4**: a single combined `netbird-server` container (management + signal + relay + STUN + **embedded Dex IdP** at `/oauth2`) + `dashboard:v2.39.0`, on the shared `boma` network behind the M4a Caddy via gRPC-h2c + WebSocket + path routing (`reverse_proxy__routes` gained a raw-`caddy` route type). Secrets `vault.netbird.{auth_secret,datastore_key}` (self-generated). Carries the full service-role file set (SECURITY/VERIFY/ACCESS/BACKUP) — **first stateful role** (`backup__state: true`; encrypted SQLite at `/var/lib/netbird`, off-site backup pending `fisi`/ADR-022). **Verified live:** dashboard 200 + valid LE cert, `/api` 401 (auth-gated, routes OK), STUN up. **Not yet configured:** first-boot `/setup` admin + peer enrolment = M5. 
| @@ -39,7 +39,7 @@ _Last reviewed: 2026-06-18._ | Thing | State | |---|---| -| `roles/base/` | **Partially built.** Concerns built: `firewall` (nftables: catalog-driven default-deny + east-west allowlist + auto-rollback apply; ADR-020) and **`hardening`** (M3: sshd drop-in key-only + `PermitRootLogin no`, fail2ban sshd jail 5/1h; ADR-002) — both pytest/Molecule-tested. The **`hardening`** concern is **applied to askari** (`make deploy PLAYBOOK=site LIMIT=askari TAGS=hardening`). The `firewall` concern is built but **not yet applied** to any host (mesh-gated to avoid lockout — M5). Not built: auditd, packages, users (Phase 2 / TODO 15). | +| `roles/base/` | **Partially built.** Concerns built: `firewall` (nftables: catalog-driven default-deny + east-west allowlist + auto-rollback apply; ADR-020) and **`hardening`** (M3: sshd drop-in key-only + `PermitRootLogin no`, fail2ban sshd jail 5/1h; ADR-002) — both pytest/Molecule-tested. The **`hardening`** concern is **applied to askari** (`make deploy PLAYBOOK=site LIMIT=askari TAGS=hardening`). The `firewall` concern is **applied to ubongo** (mesh-hardening 2/3, 2026-06-19): INPUT-only default-deny via the new `base__firewall_input_only` knob (input default-deny + `wt0`/ssh-from-control/`base__firewall_admin_addrs` allow-list; forward left `accept` so Docker/libvirt-NAT survive). **Caveat:** base's `flush ruleset` wipes a Docker host's nat, so applying to a Docker host needs a follow-up `restart docker` (FRICTION) — hence still **not** applied to askari pending `docker_host`'s nftables integration. Not built: auditd, packages, users (Phase 2 / TODO 15). 
| | `inventories/*/hosts.yml` | Structured stubs with empty host maps (`hosts: {}`); regenerated by `make tf-inventory` once Terraform has hosts | | `inventories/production/group_vars/{docker_hosts,proxmox_hosts}/` | Empty dirs | @@ -50,7 +50,7 @@ daemon hardening + `nftables.d` container rules, ADR-004/ADR-020 — is still pe A `make deploy PLAYBOOK=site` run now applies real content — `base` (its `firewall` + `hardening` concerns) plus a functional `docker_host` (Docker engine) on docker hosts — but in practice it is still limited: the production cluster has no docker hosts yet, and -`base`'s `firewall` concern is mesh-gated until M5, so a full cluster `site` run does not +`base`'s `firewall` concern is now applied to `ubongo` (control) but not yet to cluster docker hosts (none exist), so a full cluster `site` run does not yet exist. (The `make check`/`deploy` machinery itself works — first proven by applying `dev_env` via `playbooks/workstation.yml`, then `base`/`docker_host`/`reverse_proxy` on askari.) diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md index 0266a68..23d3048 100644 --- a/docs/ROADMAP.md +++ b/docs/ROADMAP.md @@ -13,7 +13,7 @@ as ordering changes, or as new milestones appear. Each milestone gets its own spec → plan → implementation cycle (`docs/superpowers/specs/` then `…/plans/`) when it comes up; this file stays high-level. 
-_Last updated: 2026-06-17._ +_Last updated: 2026-06-19._ --- @@ -206,14 +206,17 @@ Canonical dependency order: ## Next step -**Phase 1 is complete (M1–M5).** The next build is the **mesh-hardening follow-on** -(deferred from M5, now safe because the `wt0` mesh path exists): +**Phase 1 complete (M1–M5); mesh-hardening 2/3 (ubongo default-deny) DONE (2026-06-19)** — +INPUT-only nftables default-deny applied + live-verified on `ubongo` (`base__firewall_input_only`; +spec/plan `docs/superpowers/{specs,plans}/2026-06-19-mesh-hardening-ubongo-default-deny*`; +real-host reboot validation pending, low-risk — lockout-safe via the permanent console). +Remaining mesh-hardening sub-projects, each with its own spec → plan → implementation cycle: -1. apply `base`'s nftables **default-deny** to `ubongo` + set `base__firewall_control_addr` - (ADR-021 `ssh-from-control`, built/dormant) — lockout-risky on the control node itself, - so it relies on the firewall's auto-rollback; -2. tighten the NetBird ACL **off Allow-All** to scoped policies; -3. move `askari`'s SSH onto `wt0`, retiring the Hetzner-firewall WAN allow. +1. ~~`ubongo` nftables default-deny + `ssh-from-control`~~ → **DONE (2026-06-19).** +2. tighten the NetBird ACL **off Allow-All** to scoped policies (the mechanism is an open question — + no headless API path). +3. **redesign** `askari`'s SSH → `wt0` (the 2026-06-17 attempt was backed out; the redesign + must resolve the boot race, the coordinator-bootstrap chicken-and-egg problem, and the Docker nat flush + that `flush ruleset` causes on a Docker host). -Needs its own spec → plan → implementation cycle. **Then** the Procurement gate -(`/capacity-review` → buy Proxmox hardware) opens Phase 2. +**Then** the Procurement gate (`/capacity-review` → buy Proxmox hardware) opens Phase 2.