From 847d9885e23fcaea8a81ebd708ee7c85ec1b93bf Mon Sep 17 00:00:00 2001 From: sjat Date: Wed, 17 Jun 2026 22:16:17 +0200 Subject: [PATCH] revert: back out mesh-hardening 1/3 on askari after it broke the Docker host MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Incident 2026-06-17: applying base's nftables default-deny (forward policy drop) to askari — a Docker host — broke container forwarding/NAT on reboot, and the wt0-only sshd ListenAddress left no break-glass (ip_nonlocal_bind did NOT beat the boot race). Recovery: disable nftables + restart docker (restore the wiped NAT masquerade) + force-recreate the coordinator (it FATAL-looped unable to download its GeoLite2 DB with no egress) -> mesh re-formed. Back out the enablement so a future deploy can't re-break askari: - offsite_hosts: base__ssh_listen_mesh_only=false, base__firewall_apply=false - remove host_vars/askari.yml (manage over the WAN again, not wt0) - tf/offsite: re-open WAN :22 to ubongo only (break-glass; already applied) askari now: sshd on all interfaces (Ansible-managed), nftables disabled, WAN :22 open -> stable + reboot-survivable. The base feature code (sshd ListenAddress option, firewall public zone) stays; it's just not enabled on Docker hosts. Mesh-hardening 1/3 to be re-spec'd before any retry. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../group_vars/offsite_hosts/vars.yml | 19 ++++++++++++++----- inventories/production/host_vars/askari.yml | 6 ------ terraform/environments/offsite/main.tf | 2 +- 3 files changed, 15 insertions(+), 12 deletions(-) delete mode 100644 inventories/production/host_vars/askari.yml diff --git a/inventories/production/group_vars/offsite_hosts/vars.yml b/inventories/production/group_vars/offsite_hosts/vars.yml index df8724f..03047e4 100644 --- a/inventories/production/group_vars/offsite_hosts/vars.yml +++ b/inventories/production/group_vars/offsite_hosts/vars.yml @@ -1,8 +1,17 @@ --- # Off-site hosts (askari). askari runs the NetBird coordinator AND is a mesh peer -# (ADR-016, M5). Mesh-hardening 1/3 (2026-06-17): SSH is moved onto wt0 — sshd binds the -# mesh IP only (base__ssh_listen_mesh_only) and the base nftables default-deny applies -# (base__firewall_apply defaults true; SSH allowed on wt0 via base__firewall_mgmt_interface, -# public services via the catalog). base__mesh_enabled stays true (precondition from M5). +# (ADR-016, M5). base__mesh_enabled stays true (M5 enrollment). +# +# Mesh-hardening 1/3 (move SSH onto wt0 + nftables default-deny) was attempted on +# 2026-06-17 and BACKED OUT after it took askari down: applying base's nftables +# `forward policy drop` to a Docker host broke container forwarding/NAT on reboot, and the +# wt0-only sshd ListenAddress left no break-glass (ip_nonlocal_bind did not beat the boot +# race). Until docker_host ships Docker-safe container-forward rules and the boot-race + +# coordinator-bootstrap issues are re-designed, askari keeps: +# - sshd listening on all interfaces (reachable over the WAN; Hetzner Cloud Firewall is +# the perimeter) — base__ssh_listen_mesh_only stays false, +# - the host nftables firewall NOT applied — base__firewall_apply false. +# See the incident write-up / the mesh-hardening re-spec before re-enabling either. base__mesh_enabled: true -base__ssh_listen_mesh_only: true +base__ssh_listen_mesh_only: false +base__firewall_apply: false diff --git a/inventories/production/host_vars/askari.yml b/inventories/production/host_vars/askari.yml deleted file mode 100644 index 9d9f7ac..0000000 --- a/inventories/production/host_vars/askari.yml +++ /dev/null @@ -1,6 +0,0 @@ ---- -# Manage askari over the NetBird mesh (wt0), not its WAN IP. This OVERRIDES the -# TF-generated inventories/production/offsite.yml (ansible_host = 77.42.120.136); host_vars -# outrank the generated inventory and are NOT touched by `make tf-inventory-offsite`. -# Mesh-hardening 1/3 — once SSH is wt0-only, the WAN IP is no longer reachable for SSH. -ansible_host: 100.99.226.39 # askari's wt0 address (NetBird, M5) diff --git a/terraform/environments/offsite/main.tf b/terraform/environments/offsite/main.tf index 494c84d..b8c08b0 100644 --- a/terraform/environments/offsite/main.tf +++ b/terraform/environments/offsite/main.tf @@ -11,7 +11,7 @@ module "askari" { location = "hel1" # Helsinki image = "debian-13" ansible_ssh_pubkey = var.ansible_ssh_pubkey - ssh_admin_cidrs = [] # mesh-only: SSH is reached over wt0; WAN :22 retired (mesh-hardening 1/3) + ssh_admin_cidrs = ["91.226.145.80/32"] # TEMP (incident recovery 2026-06-17): re-open WAN :22 to ubongo only; re-close once the firewall/Docker + boot-race issues are fixed public_web = true # Caddy 80/443 + NetBird 3478 (M4) labels = { env = "offsite"