diff --git a/.ansible-lint b/.ansible-lint index 147ab8c..f4dea75 100644 --- a/.ansible-lint +++ b/.ansible-lint @@ -6,6 +6,7 @@ exclude_paths: - .venv/ - .collections/ - .scaffold/ + - tests/integration/.run/ # transient harness run dir (gitignored, generated) - "**/vault.yml" # ansible-vault encrypted — not lintable YAML # Warn only (don't fail) on these rules during initial setup diff --git a/.gitignore b/.gitignore index f83b0fc..b948d86 100644 --- a/.gitignore +++ b/.gitignore @@ -34,3 +34,6 @@ terraform/**/terraform.tfvars # Service-UI verification screenshots (kept locally on ubongo, not committed — ADR-017) .verify-runs/ + +# Integration-test transient run dir (ADR-025); diagnostics live under ~/integration-runs +tests/integration/.run/ diff --git a/.yamllint b/.yamllint index b47f9ca..2a116d7 100644 --- a/.yamllint +++ b/.yamllint @@ -24,4 +24,5 @@ ignore: | .venv/ .collections/ .scaffold/ + tests/integration/.run/ **/vault.yml diff --git a/CLAUDE.md b/CLAUDE.md index 2e662c9..01334a7 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -43,6 +43,8 @@ Full design rationale: `docs/decisions/` | Terraform plan | `make tf-plan [TF_ENV=staging]` | | Terraform apply | `make tf-apply [TF_ENV=staging]` | | Regenerate Ansible inventory | `make tf-inventory TF_ENV=` | +| Integration-test a host on a local VM | `make test-integration HOST= [CERTS=…]` | +| Clean up integration test VMs | `make test-integration-clean` | **Always `tf-plan` before `tf-apply`. Always `check` before `deploy`. 
Never skip lint.** @@ -256,6 +258,8 @@ Single-contributor, trunk-based (no merge requests / approval gates): | Backup & disaster recovery | `docs/decisions/022-backup.md` | | ADR structure & lifecycle | `docs/decisions/023-adr-structure.md` | | Reverse proxy (Caddy) | `docs/decisions/024-reverse-proxy.md` | +| Local VM integration testing (ADR-025) | `docs/decisions/025-local-vm-integration-testing.md` | +| Integration testing runbook | `docs/runbooks/integration-testing.md` | | Adding a new role | `docs/runbooks/new-role.md` | | Adding a new host | `docs/runbooks/new-host.md` | | Enrolling a NetBird client (laptop/phone) | `docs/runbooks/netbird-client.md` | diff --git a/Makefile b/Makefile index a23efff..0b3bf0c 100644 --- a/Makefile +++ b/Makefile @@ -39,7 +39,8 @@ endif .DEFAULT_GOAL := help -.PHONY: help setup collections lint test test-all check deploy encrypt decrypt \ +.PHONY: help setup collections lint test test-all test-integration test-integration-clean \ + check deploy encrypt decrypt \ edit-vault check-vault new-role \ tf-init tf-plan tf-apply tf-output tf-inventory tf-inventory-offsite \ molecule-image molecule-image-push caddy-image caddy-image-push registry-login @@ -53,6 +54,8 @@ help: @echo " make lint Run yamllint + ansible-lint" @echo " make test ROLE= Run Molecule tests for a role" @echo " make test-all Run Molecule tests for all roles" + @echo " make test-integration HOST= [CERTS=internal|le-staging] [KEEP=1] Run ADR-025 integration cycle against a VM" + @echo " make test-integration-clean Prune stale integration-test VM snapshots" @echo " make check PLAYBOOK= [LIMIT=] [TAGS=] Dry-run a playbook (check mode)" @echo " make deploy PLAYBOOK= [LIMIT=] [TAGS=] Run a playbook against production" @echo " make edit-vault [VAULT=] Edit the vault in nvim (auto re-encrypts + checks)" @@ -109,6 +112,16 @@ test-all: cd $$role && PATH="$(CURDIR)/$(VENV)/bin:$$PATH" molecule test; cd ../..; \ done +test-integration: +ifndef HOST + $(error HOST is required: 
make test-integration HOST= [CERTS=internal|le-staging] [KEEP=1]) +endif + PATH="$(CURDIR)/$(VENV)/bin:$$PATH" $(PYTHON) scripts/integration-vm.py cycle \ + --host $(HOST) $(if $(CERTS),--certs $(CERTS)) $(if $(KEEP),--keep) + +test-integration-clean: + PATH="$(CURDIR)/$(VENV)/bin:$$PATH" $(PYTHON) scripts/integration-vm.py prune + # ── Playbook execution ──────────────────────────────────────────────────────── check: diff --git a/STATUS.md b/STATUS.md index 9d51c8d..bed6cfd 100644 --- a/STATUS.md +++ b/STATUS.md @@ -5,7 +5,7 @@ This repo is partly aspirational: the ADRs in `docs/decisions/` describe the truth. **Before relying on a role, provider, or pipeline existing, check here.** If something is listed as "designed, not built", do not assume it works. -_Last reviewed: 2026-06-14._ +_Last reviewed: 2026-06-18._ ## Real and working today @@ -30,7 +30,7 @@ _Last reviewed: 2026-06-14._ | `roles/dev_env/` — interactive developer environment | **Built + applied.** zsh + oh-my-zsh + oh-my-posh, tmux + TPM plugins, neovim; dotfiles deployed via GNU stow (re-derived from V4/fisi per ADR-013). Node.js from a pinned upstream tarball (not Debian's npm). Lint + Molecule (idempotent) green. **Applied to `ubongo`** for users `sjat` + `claude` (verified: zsh login shells, stow-symlinked `.zshrc`/`.tmux.conf` + nvim config, oh-my-zsh, tmux plugins; nvim v0.12.2, oh-my-posh 29.0.1). Run via `playbooks/workstation.yml` against the `control` group (no dedicated `workstations` group yet). | | `make check` / `make deploy PLAYBOOK=` | **Works.** First end-to-end run (applying `dev_env`) surfaced + fixed latent bugs: Makefile `PLAYBOOK` var collision (binary path vs playbook-name arg) meant the targets never ran; `ansible.cfg` referenced uninstalled community.general callbacks (now built-in `default` + `ansible.posix.profile_tasks`); `acl` package added so Ansible can `become_user` an unprivileged user. 
The make targets now function — though `site`/`base`/`docker_host` content is still incomplete (see below). | | `roles/public_dns/` + `playbooks/dns.yml` | **Built + applied.** Manages wingu.me at Gandi LiveDNS as code (`community.general.gandi_livedns`, PAT from `vault.gandi.pat`); record data, anti-spoof baseline (SPF `-all` + DMARC reject), and the Gandi-defaults purge are defined + unit-tested (`tests/test_public_dns.py`). **Applied to wingu.me (2026-06-14):** purged Gandi's 13 seeded defaults; zone now holds only the SPF + DMARC TXT records; idempotent re-run clean. No null-MX (Gandi rejects `0 .`) — the MX is removed, so no MX + no apex A = no mail. M1 of the roadmap. | -| `ubongo` — physical control / AI-worker host (ADR-015) | **Built (partial).** Debian 13.5 on a Lenovo M70q (i3-10100T, 16 GB, 256 GB SSD; no disk encryption — accepted risk). Full toolchain installed + pinned to `fisi` (Docker 29.5.3, rbw 1.15.0, Claude Code 2.1.173, ansible-core 2.17.14 + molecule via `make setup`/`make collections`). Repo cloned under a dedicated `claude` user (docker group, no sudo). Vault works via rbw (offline-cache decryption verified). SSH key-only (password + root login disabled). In the production inventory `control` group at 10.20.10.151. **`dev_env` now applied here** (zsh/tmux/nvim for `sjat` + `claude`, via `playbooks/workstation.yml`). Managed as the operator account `sjat` (`group_vars/control` sets `ansible_user: sjat`), not the `ansible` service user `group_vars/all` assumes — ubongo has no bootstrapped `ansible` user. **NetBird mesh-enrolled (M5, 2026-06-17):** `wt0` up at `100.99.146.14` via the `base` `mesh` concern; agent management now works because `claude`'s SSH key was added to `sjat`'s `authorized_keys` and `sjat` was granted `NOPASSWD` sudo (`/etc/sudoers.d/sjat-ansible`) — the interim until the proper `ansible`-user bootstrap. 
**Pending:** full `base` hardening (only `firewall` exists, NOT applied here — default-deny is the deferred mesh-hardening step now that `wt0` exists); proper `ansible`-user bootstrap (currently managed as `sjat`); OPNsense DHCP reservation for 10.20.10.151 (MAC `88:a4:c2:e0:ee:da`); Terraform state backup (now relevant — the offsite tfstate exists). | +| `ubongo` — physical control / AI-worker host (ADR-015) | **Built (partial).** Debian 13.5 on a Lenovo M70q (i3-10100T, 16 GB, 256 GB SSD; no disk encryption — accepted risk). Full toolchain installed + pinned to `fisi` (Docker 29.5.3, rbw 1.15.0, Claude Code 2.1.173, ansible-core 2.17.14 + molecule via `make setup`/`make collections`). Repo cloned under a dedicated `claude` user (docker + libvirt groups, **`NOPASSWD:ALL` sudo** — ADR-015 amended 2026-06-18; operator `sjat` uses password-required sudo via `sudo` group; the former `sjat-ansible` NOPASSWD drop-in removed 2026-06-18). Vault works via rbw (offline-cache decryption verified). SSH key-only (password + root login disabled). In the production inventory `control` group at 10.20.10.151. **`dev_env` now applied here** (zsh/tmux/nvim for `sjat` + `claude`, via `playbooks/workstation.yml`). Managed as the operator account `sjat` (`group_vars/control` sets `ansible_user: sjat`), not the `ansible` service user `group_vars/all` assumes — ubongo has no bootstrapped `ansible` user. **NetBird mesh-enrolled (M5, 2026-06-17):** `wt0` up at `100.99.146.14` via the `base` `mesh` concern. **Pending:** full `base` hardening (only `firewall` exists, NOT applied here — default-deny is the deferred mesh-hardening step now that `wt0` exists); proper `ansible`-user bootstrap (currently managed as `sjat`); OPNsense DHCP reservation for 10.20.10.151 (MAC `88:a4:c2:e0:ee:da`); Terraform state backup (now relevant — the offsite tfstate exists). 
| | `askari` — off-site Hetzner VPS (ADR-007/016, M2) | **Built + applied.** Provisioned by Terraform (`environments/offsite`, `hetznercloud/hcloud`) as **cx23 / hel1 / Debian 13.5** (CAX11/ARM was out of stock EU-wide on 2026-06-14 → cx23 is same-spec x86, cheaper). cloud-init created the `ansible` user + passwordless sudo; a TF-managed Hetzner Cloud Firewall allows SSH only from ubongo's WAN (`91.226.145.80`). Reachable from ubongo (`ansible offsite_hosts -m ping` ✓), in the `offsite_hosts` inventory (generated `offsite.yml`), published at `askari.wingu.me` → `77.42.120.136`. **SSH-hardened + fail2ban (M3).** **Docker + Caddy reverse proxy (M4a):** `docker_host` + `reverse_proxy` (vanilla Caddy, HTTP-01) applied; `https://test.askari.wingu.me` serves a valid Let's Encrypt cert ✓ (firewall opens 80/443/3478). **NetBird coordinator (M4b):** `netbird_coordinator` deployed — dashboard live at `https://netbird.askari.wingu.me` (valid LE cert), management API behind embedded Dex (401 unauth), STUN on 3478/udp. **NetBird peer (M5, 2026-06-17):** also enrolled as a mesh agent (`base` `mesh` concern) — `wt0` at `100.99.226.39`, Management+Signal Connected; the agent coexists with the coordinator. **Pending:** host firewall + moving askari's SSH onto `wt0` (deferred mesh-hardening; the Hetzner Cloud Firewall is its perimeter until then), offsite tfstate backup (ADR-022). | | `roles/docker_host/` (Docker engine) + `roles/reverse_proxy/` (Caddy, ADR-024) | **Built + applied** (askari, M4a). `docker_host` installs Docker CE + compose; `reverse_proxy` is boma's standard Caddy proxy (HTTP-01 for public hosts; routes from `reverse_proxy__routes`). **DNS-01 for mesh/LAN-only services is now built + proven (2026-06-15):** custom `caddy-gandi` image (`.docker/caddy-gandi/`, `make caddy-image`, pinned caddy-dns/gandi v1.1.0 → Bearer PAT), enabled per-instance via `reverse_proxy__acme_dns_provider: gandi` + `reverse_proxy__image`. 
Verified end-to-end — a real wildcard cert issued via LE **staging** + Gandi DNS-01 with `vault.gandi.pat`. M4a's deferral (version skew + Hetzner-IP build) is closed; image **pending registry push** (`make caddy-image-push` needs `docker login`). The `reverse_proxy` Caddyfile is bind-mounted as a **directory** (`./caddy` → `/etc/caddy`) so atomic re-renders are visible in-container and `caddy reload` actually applies new routes (a single-file mount pinned the stale inode). | | `roles/netbird_coordinator/` — NetBird control plane (ADR-016, M4b) | **Built + applied (askari, 2026-06-16). boma's FIRST real service role.** Self-hosted NetBird **v0.72.4**: a single combined `netbird-server` container (management + signal + relay + STUN + **embedded Dex IdP** at `/oauth2`) + `dashboard:v2.39.0`, on the shared `boma` network behind the M4a Caddy via gRPC-h2c + WebSocket + path routing (`reverse_proxy__routes` gained a raw-`caddy` route type). Secrets `vault.netbird.{auth_secret,datastore_key}` (self-generated). Carries the full service-role file set (SECURITY/VERIFY/ACCESS/BACKUP) — **first stateful role** (`backup__state: true`; encrypted SQLite at `/var/lib/netbird`, off-site backup pending `fisi`/ADR-022). **Verified live:** dashboard 200 + valid LE cert, `/api` 401 (auth-gated, routes OK), STUN up. **Not yet configured:** first-boot `/setup` admin + peer enrolment = M5. | @@ -81,6 +81,18 @@ askari.) | Backup `backup` role + `backup_hosts` group | ADR-022 | Does not exist. Pull node (`fisi`), restic repo, rclone→pCloud, USB air-gap — Plan 2. | | Per-service `backup__*` contract + `BACKUP.md` | ADR-022 | Convention defined; inert until service roles exist to declare against. | +## Integration test harness (ADR-025) + +| Thing | State | +|---|---| +| `roles/integration_test/` | **Built** — installs/enables libvirt+QEMU+virtinst on `control` group hosts; adds `sjat`/`claude` to `libvirt` group; creates image-cache dir. 
Lint clean; applied live to ubongo (substrate installed); molecule scenario present, not run in the build env. | +| `scripts/integration-vm.py` | **Built** — stdlib-only lifecycle driver over `virsh`/`virt-install`/`cloud-localds`: `up / apply / reboot / assert / cycle / down / prune / console`. Lazily ensures the golden Debian-13 genericcloud image. pytest clean (transient-inventory generation, var/overlay merge, `--certs` mapping, DHCP-lease parsing, resource-guard math). | +| `tests/integration/` (profile, verify, overrides) | **Built** — "be askari" profile + var overlay + `verify.yml` outcome assertions (Docker active, forward-chain accepts present, published-port DNAT alive). Validated end-to-end by the RED→GREEN acceptance run. | +| `make test-integration` / `make test-integration-clean` | **Built** — wired into `Makefile`. | +| ADR-025 | **Accepted (2026-06-18)** — decision recorded, approach A, cert tiers, safety invariants, UEFI boot requirement, and claude-sudo dependency documented. | +| **RED/GREEN acceptance (ubongo live pass)** | **PASSED (2026-06-18).** A throwaway KVM VM on ubongo reproduced the 2026-06-17 incident (base nftables forward default-deny kills Docker forwarding on reboot) = RED. Applying the `docker_host` container-forward drop-in and rebooting survived = GREEN. Nine shakedown findings captured in `docs/FRICTION.md`; key learnings (UEFI boot, claude sudo) recorded in ADR-025. `docs/TODO.md` item 2.4 closed. | +| `le-staging` cert validation | **Pending** — wired in v1 but not yet exercised on a real VM (separate from the RED/GREEN acceptance gate). | + ## Keeping this honest Update this file whenever you build, stub, or remove something. It is the first diff --git a/docs/FRICTION.md b/docs/FRICTION.md index b447f9b..888ef73 100644 --- a/docs/FRICTION.md +++ b/docs/FRICTION.md @@ -90,6 +90,62 @@ a WAN-SSH break-glass. Spec/plan: docs/superpowers/{specs,plans}/2026-06-17-mesh open**, and only retire the break-glass once recovery (incl. 
a reboot) is proven. Generalises beyond this milestone — a candidate line in the new-host / hardening runbooks. + + +- `[gotcha]` **Debian 13 genericcloud boot-loops under legacy BIOS/SeaBIOS** (2026-06-18): + `virt-install --import` of the genericcloud qcow2 with the default (SeaBIOS) firmware + triple-faults at the real-mode kernel handoff — GRUB loops, no "Decompressing Linux", no + DHCP lease. The symptom (no network) pointed away from the cause (firmware). → boot test + VMs via **UEFI** (`virt-install --boot uefi`; OVMF→efistub). + +- `[friction]` **The no-sudo `claude` model blocked diagnosing a failed VM** (2026-06-18): + under ADR-015 `claude` had no sudo, so when the VM wouldn't network there was no way to + introspect it (serial logs are `root:0600`, libguestfs not installed, mounting needs + root). Diagnosis was fully blocked until the operator granted `claude` sudo. → DECISION: + `claude` gets `NOPASSWD:ALL` (reverses ADR-015's "no local sudo"); compensating control + is auditd/Loki attribution (already in ADR-015). Amend ADR-015/ADR-021 + accepted-risks; + codify the sudoers drop-in in Ansible. + +- `[gotcha]` **Non-root `virsh`/`virt-install` default to `qemu:///session`** (2026-06-18): + the substrate (NAT net, /dev/kvm) lives on `qemu:///system`. → pin + `LIBVIRT_DEFAULT_URI=qemu:///system` in the driver. + +- `[gotcha]` **`qemu:///system` (libvirt-qemu) can't traverse `/home`** (2026-06-18): VM + disk/seed/console under the repo/home failed "Permission denied (search permissions for + /home/claude)". → put per-VM artifacts in a system-readable dir (`/var/lib/boma-integration`, + group libvirt); the inventory (read by ansible as the user) can stay in the repo. 
+ +- `[gotcha]` **`ansible-playbook -i /` parses sibling non-inventory files as INI** + (2026-06-18): pointing `-i` at a run-dir holding a state file + qcow2s made the directory + inventory loader parse the state file as INI → phantom hosts INCLUDING the real `askari` + (with its real vars), breaking the single-host isolation invariant. → point `-i` at the + single `hosts.yml`. Caught by the holistic cross-file review BEFORE any hardware run. + +- `[gotcha]` **Jinja `{%- -%}` + ansible `trim_blocks=True` double-strip newlines** + (2026-06-18): a template edit used `{%- -%}`, reviewed by rendering with RAW jinja2 + (trim_blocks=False) which looked fine; ansible (trim_blocks=True) then collapsed the + rendered Caddyfile onto single lines → caddy crash-looped on invalid config. → verify + templates with ansible's whitespace (trim_blocks=True), not raw jinja2; prefer plain + `{% %}` at column 0 (the repo's existing style). + +- `[gotcha]` **Fresh cloud images have empty apt lists** (2026-06-18): `apt install + nftables` failed "No package matching 'nftables' is available" on a fresh genericcloud + VM whose cloud-init had `package_update: false`. → `package_update: true` AND block on + `cloud-init status --wait` before applying. + +- `[gotcha]` **base's default-deny firewall drops SSH to a NAT'd VM unless the gateway is + allowed** (2026-06-18): the driver reaches the VM via the libvirt-NAT gateway + (192.168.150.1). `ct established,related accept` saves the in-flight apply connection, + but a fresh post-reboot SSH is dropped without an explicit allow. → test overlay sets + `base__firewall_control_addr` to the NAT gateway. + +- `[recurring]` **Real-hardware shakedown and static review each caught what the other + couldn't** (2026-06-18): the qemu-URI, storage-path, UEFI, apt-list, and caddy-render + bugs ALL surfaced only on a live KVM run; the phantom-host inventory bug surfaced only in + the holistic cross-file review. 
→ for infra this novel, budget for BOTH an adversarial + cross-file review AND a real-hardware run; neither alone would have shipped it working. + --- ## Kaizen reviews — decisions ledger diff --git a/docs/TODO.md b/docs/TODO.md index 4f0456c..0bcfaec 100644 --- a/docs/TODO.md +++ b/docs/TODO.md @@ -17,19 +17,7 @@ calls, curl pulls of web products, log reviews. Headless browsing → ADR-017 (`/verify-service`); the API/curl/log-review siblings remain open. 3. ~~Standard for test users + manual-test instructions.~~ → ADR-017. - 4. **Local VM integration testing on ubongo (pre-deploy).** Molecule (containers, - one converge, no reboot, no real Docker/firewall interaction) structurally - **cannot** catch reboot-survivability, host-firewall × Docker, or boot-order bugs — - exactly the class that caused the 2026-06-17 mesh-hardening incident (`base`'s - nftables `forward policy drop` broke the askari Docker host on reboot; - `ip_nonlocal_bind` didn't beat the sshd boot-race). Build a way for the agent to - spin up throwaway VMs **locally on ubongo** (libvirt/QEMU? Proxmox-on-ubongo?) that - mirror a target host (real Docker, a real reboot, the real role apply) and validate - risky infra changes there **before** deploying to a live host. This is the concrete - build of ADR-008's Level 2/3 (staging/integration) testing — deferred for lack of - hosts, but ubongo can host it. Decide the virtualisation approach + how the agent - drives it (provision → snapshot/reset → run the playbook → reboot → assert). Ties to - 3.10 (testing approach as it matures) and the 2026-06-17 FRICTION signals. + 4. ~~Local VM integration testing on ubongo.~~ → ADR-025 / `make test-integration` (built + RED→GREEN validated 2026-06-18). 3. **Building services** 1. ~~Decide how to manage logs.~~ → ADR-018. 
diff --git a/docs/decisions/008-testing.md b/docs/decisions/008-testing.md index c2c5d22..647b3ba 100644 --- a/docs/decisions/008-testing.md +++ b/docs/decisions/008-testing.md @@ -154,6 +154,7 @@ Level 2 (staging) or Level 3 (external). This is a conscious, documented decisio | Capability | Reason not testable in Molecule | |---|---| | `nftables` rule loading | Requires `nf_tables` kernel module; not available in Docker | +| **Reboot-survivability / host-firewall × Docker interaction / boot-ordering** | **Requires a real kernel reboot — the class that caused the 2026-06-17 mesh-hardening incident. Now covered by local VM integration testing (ADR-025).** | | NetBird mesh data plane (`wt0` WireGuard interface) | Requires the `wireguard` kernel module; Molecule checks only that the agent is installed/configured (ADR-016) | | `unattended-upgrades` behaviour | Installs correctly; actual upgrade behaviour requires a real apt environment | | DHCP behaviour (OPNsense) | OPNsense is managed by Ansible but not testable in a container | @@ -165,6 +166,11 @@ For the above, Molecule tests only what it can: that the relevant packages are installed, that configuration files render correctly, and that services are enabled. Behavioural correctness is confirmed on staging. +**ADR-025 is the concrete build of Level 2/3** — local VM integration testing on +ubongo (libvirt/KVM, throwaway overlay VMs, stdlib-only driver). It specifically +targets the reboot-survivability / host-firewall × Docker / boot-ordering class that +Molecule structurally cannot reach. See `docs/decisions/025-local-vm-integration-testing.md`. + --- ### CI pipeline diff --git a/docs/decisions/015-control-host.md b/docs/decisions/015-control-host.md index 13c1b5f..78e0da5 100644 --- a/docs/decisions/015-control-host.md +++ b/docs/decisions/015-control-host.md @@ -2,7 +2,10 @@ ## Status -Accepted (2026-06-05) +Accepted (2026-06-05). 
**Amended 2026-06-18:** the `claude` AI-worker account now has +`NOPASSWD:ALL` sudo on `ubongo` — reversing the original "no local sudo" sub-decision. +The amendment is recorded in §Access & security below; rationale and accepted risk are +in ADR-021 and `docs/security/accepted-risks.md` (R7). ## Context @@ -43,8 +46,12 @@ points at this physical box. This *strengthens* the ADR-009 control-node excepti it is genuinely outside Terraform's world, not a VM pretending to be the exception. Every other host stays a Terraform-managed VM exactly as designed. -`ubongo` runs **plain Debian 13** (the `base` role applies). It is not a hypervisor -and runs no `docker_host` services. +`ubongo` runs **plain Debian 13** (the `base` role applies). It is not a production +hypervisor and runs no `docker_host` services. It does run **ephemeral KVM test VMs** +as part of its local-test-runner role (ADR-025 — local VM integration testing): one +throwaway VM at a time (~3 GiB RAM), against ~13 GiB free of the 16 GiB sized here. +This is not a production workload — it is the concrete implementation of ADR-008 Level +2/3, and the resource guard enforces one-at-a-time to stay within the RAM ceiling. ### Hardware target @@ -84,12 +91,38 @@ Manual, on bare metal: only** — key-only, with password auth and root login disabled — until the NetBird mesh (ADR-016) is stood up. - **AI-worker identity:** `ubongo` runs the AI worker under a dedicated, - password-locked `claude` user (in the `docker` group for Molecule; **no local sudo** — - boma deploys reach the fleet over SSH as the `ansible` user, not via local root). It is - reached via `sudo -iu claude` or its own SSH key. The rationale is **attribution + - revocation, not containment**: auditd/Loki (ADR-018) can separate human from agent - actions, and the account/key can be revoked without touching the operator's access. - (ADR-021 left the on-`ubongo` agent identity unspecified; this records it.) 
+ password-locked `claude` user (in the `docker` and `libvirt` groups; **`NOPASSWD:ALL` + sudo** via a repo-managed drop-in — see amendment below). It is reached via `sudo -iu + claude` or its own SSH key. The rationale is **attribution + revocation, not + containment**: auditd/Loki (ADR-018) can separate human from agent actions, and the + account/key can be revoked without touching the operator's access. (ADR-021 left the + on-`ubongo` agent identity unspecified; this records it.) + + **Amendment (2026-06-18) — `claude` now has `NOPASSWD:ALL` sudo.** + > **Superseded by [ADR-025](025-local-vm-integration-testing.md)** (per ADR-023 §4): the + > "no local sudo" sub-decision is reversed. The shakedown that necessitated it is ADR-025; + > the resulting two-account access model is ADR-021; the accepted risk is R7. + + During the + integration-testing harness shakedown, the original "no local sudo" sub-decision was + reversed. No-sudo blocked the AI-worker from diagnosing a failed VM: `virsh`, + `virt-install`, `cloud-localds`, `journalctl`, `nft` — nearly all low-level + diagnostic commands — require root. The AI-worker must autonomously spin up, + inspect, and tear down test VMs without operator hand-holding; that is the harness's + core value proposition. Compensating controls make the risk acceptable: + + 1. `claude`'s password is **locked** (no interactive login, no `su claude` without the + operator's own credentials) — `NOPASSWD` sudo is the *only* sudo path. + 2. `auditd` + Loki attribution (ADR-018) separates human from agent root actions. + 3. The drop-in is **repo-managed** via `base__ai_worker_user` — revocable in one commit + and one deploy. + 4. Single-operator homelab: everything in git, off-machine backups (ADR-022). + + The operator (`sjat`) uses **password-required sudo** via the `sudo` group; their + former `NOPASSWD` drop-in was removed 2026-06-18 as redundant once `claude` had sudo + (least-privilege cleanup). 
The accepted risk is registered as R7 in + `docs/security/accepted-risks.md`. ADR-021 records the resulting sudo model for both + accounts. - **Disk encryption:** `ubongo`'s SSD is **not encrypted at rest** — the SanDisk X600 is TCG-Opal-capable but Opal is unused. This is an accepted risk recorded in `docs/security/accepted-risks.md` (control-node disk not encrypted at rest), diff --git a/docs/decisions/021-operational-access.md b/docs/decisions/021-operational-access.md index 6b12f3e..4e76b1a 100644 --- a/docs/decisions/021-operational-access.md +++ b/docs/decisions/021-operational-access.md @@ -3,7 +3,9 @@ ## Status Accepted (2026-06-09). Resolves TODO 7.2 (what to set up on hosts given direct access -will be rare) and TODO 3.2 (the service admin-API access question). +will be rare) and TODO 3.2 (the service admin-API access question). **Amended +2026-06-18:** the on-`ubongo` sudo model for the two local accounts is now settled +(see §Sudo model on `ubongo` below). **Doctrine ADR.** It pins the operational-access doctrine, the declarative `access__*` data model, the rendered `ACCESS.md` record, and the `/check-access` verifier. It does @@ -163,6 +165,36 @@ exists and `/check-access` is green (or a deviation is recorded in `accepted-ris No scaffold change — same manual-copy-plus-review pattern the sibling records (`SECURITY.md`/`VERIFY.md`) use. +### Sudo model on `ubongo` (amendment 2026-06-18) + +The original ADR left on-`ubongo` local sudo unspecified. The integration-testing +harness shakedown settled it: + +| Account | Role | Sudo | +|---|---|---| +| `claude` | Automated AI-worker | `NOPASSWD:ALL` via repo-managed drop-in (`base__ai_worker_user`) | +| `sjat` | Human operator | Password-required sudo via the `sudo` group | + +**Rationale for `claude NOPASSWD`.** No-sudo blocked the AI-worker from diagnosing a +failed test VM: `virsh`, `virt-install`, `cloud-localds`, `nft`, `journalctl` — +almost every low-level diagnostic tool — require root. 
The harness's core value is +autonomous spin-up → apply → reboot → assert → diagnose; that loop collapses without +local root access. + +**Compensating controls (R7 in `docs/security/accepted-risks.md`):** +- `claude`'s password is locked — `NOPASSWD` is the account's *only* sudo path; no + interactive login is possible. +- `auditd` + Loki attribution (ADR-018) separates human from agent root actions in the + audit trail. +- The drop-in is repo-managed and revocable in one commit + one deploy. +- Single-operator homelab; everything in git; off-machine backups (ADR-022). + +**`sjat` NOPASSWD removed.** The operator's former `NOPASSWD` drop-in +(`/etc/sudoers.d/sjat-ansible`, added as an interim measure during M5 NetBird +enrolment) was removed 2026-06-18. It was redundant once `claude` held sudo, and its +removal restores least-privilege for the human operator. `sjat` retains full sudo +capability via the `sudo` group (password required). + ## Consequences - Every host and service has at least one documented, verifiable way in — and a verifier diff --git a/docs/decisions/025-local-vm-integration-testing.md b/docs/decisions/025-local-vm-integration-testing.md new file mode 100644 index 0000000..fd58b52 --- /dev/null +++ b/docs/decisions/025-local-vm-integration-testing.md @@ -0,0 +1,180 @@ +# ADR-025 — Local VM integration testing on ubongo + +## Status + +Accepted (2026-06-18). Implements ADR-008 Level 2/3 (deferred for lack of hosts; now +viable on ubongo). **RED→GREEN acceptance PASSED on real hardware (2026-06-18):** a +throwaway KVM VM on ubongo reproduced the 2026-06-17 incident (base's nftables forward +default-deny kills Docker forwarding on reboot) — RED — and survived the reboot once +the `docker_host` container-forward drop-in was applied — GREEN. Two shakedown +learnings added below. 
+ +## Context + +Molecule (ADR-008 Level 1) tests each role in a single Docker container: one +`converge`, no real kernel netfilter, no real Docker daemon in the loop, and **no +reboot**. That structurally cannot catch an entire class of bug — reboot-survivability, +host-firewall × Docker interaction, and boot-ordering — which is exactly the class +that caused the **2026-06-17 mesh-hardening incident**. + +During that incident, `base`'s nftables `forward { policy drop; }` killed the askari +Docker host **on reboot**: nftables loaded its default-deny before Docker, breaking +published-port DNAT and inter-container forwarding. Public services and the mesh went +down. It had worked right after `make deploy`, when Docker's runtime rules still +coexisted. `ip_nonlocal_bind` also failed to beat the sshd boot-race, leaving the mesh +listener absent at boot. Recovery required the Hetzner console and a WAN-SSH +break-glass. Molecule had passed. + +ADR-008's Level 2/3 was deferred "for lack of hosts." ubongo breaks that deferral: + +> verified: ubongo KVM capability · Bash (2026-06-18 session) · `/dev/kvm` present + +> accessible (kvm group), Intel VT-x (`vmx`) enabled, 8 vCPU (i3-10100T), ~13 GiB RAM +> free of 16, ~198 GiB disk free; libvirt/QEMU/Vagrant **not yet installed** · +> 2026-06-18. + +## Decision + +### 1. Virtualisation approach: libvirt/KVM directly (Approach A) + +A golden Debian-13 genericcloud qcow2 is cached locally on ubongo. Each run boots an +ephemeral qcow2 **overlay** backed by it (the golden image is never mutated), seeded +via cloud-init NoCloud, driven by a **stdlib-only** Python driver (`scripts/ +integration-vm.py`) over `virsh` / `virt-install` / `cloud-localds`. No `libvirt- +python` dependency — the driver stays portable and the role stays lean. + +### 2. Fidelity envelope + +The bugs are **post-boot**, not in the provisioning path. 
A lightweight local hypervisor +is sufficient: real OS, real kernel netfilter, real Docker daemon, real published-port +DNAT, a **real reboot**, and the coordinator running inside the VM (so the VM forms its +own one-node mesh, reproducing the circular bootstrap). The Proxmox provisioning chrome +is not mirrored. + +### 3. Scope: one throwaway VM at a time, instantiated from real inventory + +The first profile is **"be askari"** — a single box running Docker host + NetBird +coordinator + mesh peer, mirroring the host whose incident motivates this work. The +mechanism is generic: swap the profile to "be" any inventory host. Multi-VM topologies +are a deferred extension. + +### 4. Acceptance: self-validating against the real failure + +The harness is accepted when it can, on a local VM: + +1. Apply `base` (firewall on, no `docker_host` container-forward drop-in) to a Docker + host, reboot, and observe the **2026-06-17 breakage** (Docker forwarding dead, + services down). The post-reboot assertions must FAIL here; if they pass (no breakage observed), the harness is not faithful. +2. Apply the `docker_host` container-forward fix, re-run, and **survive the reboot**. + +### 5. Tiered cert fidelity via a `--certs` knob + +DNS-01 is what makes real certs possible without public inbound (validation is +out-of-band via a Gandi TXT record; the VM needs only outbound to ACME + Gandi, which +the isolated NAT network provides): + +| Tier | Description | Default? | +|---|---|---| +| `internal` | Caddy `tls internal` — zero deps, instant. For incident repro and runs where certs are not under test. | Yes | +| `le-staging` | Real DNS-01 ACME against Let's Encrypt **staging** — real caddy-gandi path, real cert files/renewal, untrusted root, effectively no rate limits. | Built in v1; use when testing the ACME/cert path. | +| `le-prod-wildcard` | A real trusted `*.test.wingu.me` wildcard, **issued once, persisted on ubongo, reused** across runs. | On-demand only. Accepted risk recorded as R6 in `docs/security/accepted-risks.md`.
| + +A deliberate "no-egress" failure scenario (reproducing FRICTION 2026-06-17 #4 — +`netbird-server` FATAL-loops on GeoLite2 download when egress is lost) forces +`internal`, since ACME requires egress. + +### 6. The toolchain is Ansible-managed + +A new non-service role (`integration_test`, `control` group) installs and enables +libvirt + QEMU + virtinst reproducibly. The driver manages the golden image lazily on +first run (keeping the role lean; no fiddly download/refresh logic in Ansible). The +repo owns ubongo's state. + +### 7. Stubs live in an overlay file, never in the real inventory + +Transient inventory entries for the test VM are generated at runtime as a single-host +file. Stubs (cert tier, in-VM coordinator endpoint, VM connection details) live in +`tests/integration/overrides/.yml` — an explicit, reviewable overlay. The real +inventory is never touched, so `make tf-inventory` and "don't edit inventory directly" +stay intact. + +## Consequences + +- **Reconciles ADR-015:** ubongo runs ephemeral KVM test VMs as part of its + local-test-runner role — it is still not a production hypervisor. A default VM + (~2 vCPU / 3 GiB / 20 GiB thin overlay) against ~13 GiB free is comfortable; the + driver enforces **one integration VM at a time** (resource guard, name-prefix + `boma-it-*`) and refuses to start below a free-RAM threshold. +- **Operationalises the standing rule:** "firewall/sshd/boot changes must be tested on + a real VM with a real reboot before they touch a live host" (FRICTION 2026-06-17 #6) + becomes a concrete, runnable step documented in `docs/runbooks/integration-testing.md`. +- **Accepted risk R6:** `le-prod-wildcard` runs pass the production Gandi PAT + (`vault.gandi.pat`) to an ephemeral local VM and write transient `_acme-challenge` + TXT records into the real `wingu.me` zone. Scope: on-demand only; `internal` is the + default tier, and `le-staging` covers routine ACME-path tests.
Compensating controls: ephemeral VM, isolated NAT network, TXT records + auto-removed by Caddy after validation. +- **Three safety invariants** make the test tool itself safe: + 1. The transient inventory contains only the test VM — no real host is ever in scope. + 2. "Be askari" points NetBird at the in-VM coordinator — the VM forms its own one-node + mesh; it never enrols in the real mesh. + 3. Test VMs sit on an isolated libvirt NAT network — outbound NAT for ACME/image pulls + only, not reachable to the LAN (`10.20.x`) or the real mesh. +- **Diagnostics on failure** (catching a bug is the point): failure keeps the VM and + dumps `nft list ruleset`, `docker ps`, `ss -tlnp`, `journalctl -b`, + `systemd-analyze critical-chain`. `make test-integration-clean` reaps all `boma-it-*` + orphans. Diagnostics land in gitignored `~/integration-runs/-/`. +- **Future pinch:** concurrency with the Level-4 Chromium/Playwright stack (ADR-017) + competes for ubongo RAM. The resource guard is the v1 answer — one integration VM at a + time; don't run alongside a heavy Level-4 session. Revisit at `/capacity-review`. + +## Scope + +**In scope:** reboot-survivability, host-firewall × Docker interaction, boot-ordering, +cert/ACME paths, mesh bootstrap on one box. + +**Out of scope (v1):** multi-VM mini-cluster (inter-host mesh dataplane); CI gate +(this is an interactive, agent-driven pre-deploy check; CI stays lint + Molecule per +ADR-008/010); the Proxmox provisioning path (the bugs live in the boot/kernel/Docker +layer, not provisioning). + +## What was ruled out + +| Option | Reason | +|---|---| +| **Proxmox VE nested on ubongo** | Highest fidelity including the provisioning step, but heavy (nested virt, RAM), in tension with ADR-015, and the incident bugs do not live in provisioning. 
| +| **Vagrant + vagrant-libvirt** | Mature lifecycle/snapshots, but adds the Ruby/Vagrant ecosystem + a fragile plugin; boxes drift from the real Debian cloud image; the reboot→assert sequence still needs custom logic. | +| **terraform-provider-libvirt** | Declarative and reuses TF, but poor at the imperative apply→reboot→re-apply test sequence; adds throwaway state; blurs ADR-006's "TF owns *production* VM existence on Proxmox" boundary. | + +## Verified facts (ADR-014) + +- verified: ubongo KVM capability · Bash · `/dev/kvm` present + accessible (kvm group), + Intel VT-x (`vmx`) enabled, 8 vCPU (i3-10100T), ~13 GiB RAM free of 16, ~198 GiB + disk free · 2026-06-18. + +## Shakedown learnings (2026-06-18 live run) + +Two findings from the RED→GREEN acceptance run that affect anyone operating the harness: + +1. **Boot firmware: UEFI required.** The Debian 13 genericcloud image triple-faults + under legacy BIOS/SeaBIOS and does not reach the kernel. Boot the VM with UEFI + (`virt-install --boot uefi`; `ovmf` package). The driver does this by default; note + it here so the requirement is findable. + +2. **`claude` sudo is load-bearing.** VM management (`virsh`, `virt-install`, + `cloud-localds`) and offline diagnostics (`nft list ruleset`, `journalctl -b`, + `systemd-analyze critical-chain`) all require root. The harness assumes the + AI-worker has `NOPASSWD:ALL` sudo on `ubongo` — settled as the ADR-015 amendment + (2026-06-18) and registered as R7 in `docs/security/accepted-risks.md`. A `claude` + account without sudo will block the harness at the first `virsh` call. + +The nine full shakedown findings (including the UEFI boot-loop) are in +`docs/FRICTION.md`. + +## Related + +- ADR-006 — Terraform owns production VM existence (boundary this ADR respects). +- ADR-008 — Testing methodology (Levels 1–4); this ADR is the concrete build of Level 2/3. +- ADR-015 — Control host (ubongo); this ADR reconciles "not a hypervisor" with ephemeral test VMs. 
**Supersedes** ADR-015's "no local sudo" sub-decision for the AI-worker — the shakedown necessitated `claude` NOPASSWD sudo (ADR-023 §4; access model in ADR-021, risk R7). +- ADR-016 — Mesh VPN; the "be askari" profile includes the coordinator role. +- ADR-020 — Firewall strategy; firewall × Docker interaction is what this harness tests. +- ADR-021 — Operational access; sudo model for `claude` and `sjat` on `ubongo`. +- ADR-024 — Reverse proxy (Caddy); cert tiers exercise the DNS-01 ACME path. diff --git a/docs/hardware/reference.md b/docs/hardware/reference.md index 7252050..92c0255 100644 --- a/docs/hardware/reference.md +++ b/docs/hardware/reference.md @@ -25,7 +25,7 @@ - **Storage:** 256 GB SanDisk X600 SATA 2.5" SSD (model SD9TB8W256G1001; TCG Opal-capable, Opal unused — no disk encryption) - **NICs:** wired GbE, interface eno1, MAC 88:a4:c2:e0:ee:da - **BIOS:** Lenovo M2WKT5AA (2023-06-20) -- **Notes:** always-on; control plane + AI-worker (dedicated `claude` user) + local test runner (Molecule/Docker) per ADR-015; not a Proxmox guest; remote access currently LAN SSH only (mesh deferred) +- **Notes:** always-on; control plane + AI-worker (dedicated `claude` user) + local test runner (Molecule/Docker) per ADR-015; not a Proxmox guest; remote access currently LAN SSH only (mesh deferred). Also runs **one ephemeral KVM integration test VM** (~3 GiB RAM) at a time per ADR-025 — the resource guard enforces one-at-a-time; do not run a test-integration cycle alongside a heavy Level-4 browser session (Chromium/Playwright). 
### fisi (backup node — outside the cluster; provisional) - **Model / form factor:** HP Elite 600 G9 (tower) diff --git a/docs/runbooks/integration-testing.md b/docs/runbooks/integration-testing.md new file mode 100644 index 0000000..38a2f09 --- /dev/null +++ b/docs/runbooks/integration-testing.md @@ -0,0 +1,229 @@ +# Runbook — Local VM integration testing + +## When to use this + +Run a local VM integration test before deploying any change that touches: + +- **nftables / firewall rules** (the `firewall` concern of `base`) +- **sshd configuration** (listener address, port, key types, `base` hardening) +- **boot ordering or kernel parameters** (systemd units, sysctl) +- **Docker host networking** (`docker_host` DNAT rules, published-port forwarding, `daemon.json`) + +These are the change classes that Molecule (ADR-008 Level 1) cannot catch: they require +a real kernel reboot to surface. This harness is the concrete tool for ADR-008 Level 2/3 +(see ADR-025) and directly operationalises two standing rules: + +- **"Test risky infra before live deploy"** (standing rule, ubongo memory) — firewall/sshd/boot changes must be tested on a real VM with a real reboot before touching a live host. +- **FRICTION 2026-06-17 #6 — validate reboot-recovery before retiring the break-glass** — the lesson crystallised from the mesh-hardening incident: confirm the host recovers from reboot *while you still have the break-glass open*, not after. + +You do not need this runbook for pure-config changes (template rendering, package lists, user management) — Molecule covers those. + +--- + +## First-deploy (one-time setup) + +The `integration_test` role installs libvirt + QEMU + virtinst on ubongo and adds the +operator accounts (`sjat`, `claude`) to the `libvirt` and `kvm` groups. + +```bash +make deploy PLAYBOOK=site LIMIT=ubongo TAGS=integration_test +``` + +**Re-login after this run** — group membership changes do not take effect in the current +session. 
The driver (`scripts/integration-vm.py`) requires both `libvirt` and `kvm` +group membership to create and manage VMs. + +The golden Debian-13 genericcloud qcow2 image is downloaded lazily on the first run +(one-time cost, ~500 MB); subsequent runs reuse the cached image. + +--- + +## Running a cycle + +### Makefile interface (recommended) + +```bash +# Full cycle (provision → apply → reboot → assert → teardown on pass) +make test-integration HOST=askari + +# With a specific cert tier +make test-integration HOST=askari CERTS=le-staging + +# Keep the VM alive after the run (for manual inspection) +make test-integration HOST=askari KEEP=1 + +# Destroy all orphan integration VMs (name-prefix boma-it-*) +make test-integration-clean +``` + +`HOST` is a hostname from the production inventory (the profile `tests/integration/ +profiles/.json` must exist — see Adding a new profile below). `CERTS` defaults +to `internal`. + +### Lower-level driver + +The driver (`scripts/integration-vm.py`) exposes individual lifecycle steps for manual +or scripted use: + +| Sub-command | What it does | +|---|---| +| `up` | Ensure golden image → create ephemeral overlay → cloud-init seed → boot | +| `apply` | Run the site playbook against the transient inventory (real apply) | +| `reboot` | `virsh reboot` + wait for a verified reboot (boot-id change) — the step Molecule cannot do | +| `assert` | Run `tests/integration/verify.yml` (outcome assertions) | +| `cycle` | `up` → `apply` → `reboot` → `assert` → `down` (default: destroy on pass) | +| `down` | Destroy the VM + overlay | +| `prune` | Destroy all `boma-it-*` VMs + overlays (orphan cleanup) | +| `console` | Print the VM's captured serial-console log | + +```bash +# Example: step through manually +python3 scripts/integration-vm.py up --host askari +python3 scripts/integration-vm.py apply --host askari +python3 scripts/integration-vm.py reboot --host askari +python3 scripts/integration-vm.py assert --host askari +python3 
scripts/integration-vm.py down --host askari +``` + +--- + +## Cert tiers + +| Tier | Flag | Use when | +|---|---|---| +| `internal` | `CERTS=internal` (default) | Incident repro, firewall/sshd/boot changes where certs are not under test. Zero deps, instant. | +| `le-staging` | `CERTS=le-staging` | Testing the Caddy DNS-01 ACME path, cert renewal logic, or the `caddy-gandi` plugin. Real cert files, untrusted root, effectively no rate limits. Requires `vault.gandi.pat`. | +| `le-prod-wildcard` | `CERTS=le-prod-wildcard` | Verifying TLS behaviour with a real trusted cert. On-demand only — accepted risk R6 (`docs/security/accepted-risks.md`): the production Gandi PAT reaches an ephemeral VM and transient TXT records are written into the real `wingu.me` zone. | + +> A deliberate "no-egress" scenario (reproducing FRICTION 2026-06-17 #4 — the +> `netbird-server` GeoLite2 FATAL-loop when NAT masquerade is wiped) **must** use +> `CERTS=internal`: the egress loss is the fault being simulated, and ACME requires egress. + +--- + +## Diagnostics and inspecting a failed VM + +### Where diagnostics land + +Diagnostics from every run are captured in: + +``` +~/integration-runs/-/ +``` + +This directory is gitignored. On a failed assert step, the driver dumps: + +- `nft list ruleset` — the live nftables state at failure +- `docker ps -a` — container states +- `ss -tlnp` — listening sockets +- `journalctl -b` — full boot log +- `systemd-analyze critical-chain` — boot timing +- Serial console capture (on boot/SSH failure — the automated equivalent of the Hetzner + console, addressing FRICTION 2026-06-17 #5) + +The agent reads these directly from `~/integration-runs/` — no manual download needed. + +### Inspecting a kept or failed VM + +When a run fails or when `KEEP=1` is passed, the VM is left running. 
Connect to it: + +```bash +# Serial console (no SSH needed — useful when SSH is the fault) +python3 scripts/integration-vm.py console --host askari +# or directly: +virsh console boma-it-askari +# Exit with Ctrl-] + +# SSH (as the ansible user, IP from virsh) +virsh domifaddr boma-it-askari --source lease +ssh ansible@ + +# List all integration VMs +virsh list --all | grep boma-it- +``` + +### Cleanup + +```bash +# Destroy a specific VM +python3 scripts/integration-vm.py down --host askari + +# Reap all orphans +make test-integration-clean +# or: +python3 scripts/integration-vm.py prune +``` + +--- + +## Safety invariants + +These make the test tool itself safe — the harness cannot reach or modify production: + +1. **Single-host transient inventory** — the playbook apply runs against a generated + single-host inventory (`ansible_host=`). No real host is ever in scope. +2. **In-VM coordinator only** — "be askari" points NetBird at the coordinator running + inside the VM itself (localhost endpoint). The VM forms its own one-node mesh; it + never enrols in the real NetBird mesh. +3. **Isolated NAT network** — test VMs sit on a dedicated libvirt NAT network. + Outbound NAT provides ACME/image-pull access, but the VM is not reachable from + the LAN (`10.20.x`) or the real mesh. + +--- + +## Resource constraints + +The default VM profile is ~2 vCPU / 3 GiB RAM / 20 GiB thin-provisioned overlay. The +driver enforces **one integration VM at a time** (refusing to start if another +`boma-it-*` VM is already running) and refuses to start below the free-RAM threshold +(~13 GiB available on ubongo at baseline, per ADR-025). + +**Do not run a test-integration cycle alongside a Level-4 browser session** +(Chromium/Playwright, ADR-017) — both compete for ubongo RAM. The resource guard is the +enforcement mechanism, not a suggestion. + +--- + +## Adding a new profile + +To make the harness "be" a different host: + +1. 
Create `tests/integration/profiles/.json` — specifies which roles to apply + and base VM sizing for that host. +2. Create `tests/integration/overrides/.yml` — the explicit stub overlay: + cert tier, in-VM coordinator endpoint (if the host runs the coordinator), + `ansible_host` placeholder, and any other variables that must differ from the real + inventory (e.g. public DNS → local resolution, geo-DB disable for coordinator). +3. Add assertions to `tests/integration/verify.yml` (or extend an existing task with a + `when: inventory_hostname == ''` guard) for any host-specific outcomes. +4. Run `make test-integration HOST=` to validate the new profile. + +All stubs must be explicit in the overlay — the real inventory is never edited. + +--- + +## Reproducing the 2026-06-17 incident + +The acceptance test for the harness (ADR-025) deliberately reproduces the incident: + +1. Run with today's `base` (firewall on, no `docker_host` container-forward drop-in): + ```bash + make test-integration HOST=askari CERTS=internal + ``` + The assert step **must FAIL** after reboot (Docker forwarding dead, published ports + unreachable). If it passes, the harness is not faithful. + +2. Implement the `docker_host` container-forward rules (FRICTION 2026-06-17 #1 fix) and + re-run. The assert step **must PASS** across the reboot. + +This round-trip proves: (a) the harness faithfully reproduces the incident, and (b) the +fix survives a real reboot. 
+ +--- + +## Related + +- ADR-025 — decision record for this harness (approach, cert tiers, safety invariants) +- ADR-008 — testing methodology; this is Level 2/3 +- `docs/security/accepted-risks.md` R6 — `le-prod-wildcard` accepted risk +- `docs/FRICTION.md` — 2026-06-17 signals that motivated this runbook diff --git a/docs/runbooks/new-host.md b/docs/runbooks/new-host.md index 2ea3db1..d0d5962 100644 --- a/docs/runbooks/new-host.md +++ b/docs/runbooks/new-host.md @@ -109,6 +109,13 @@ make check PLAYBOOK=site # Should report no changes ``` +> **Pre-flight before lockout-risky changes (firewall / sshd / boot):** before applying +> any change that touches nftables rules, SSH configuration, or boot ordering, run +> `make test-integration HOST=` and confirm reboot-recovery on the local VM +> **while the break-glass (Proxmox console / Hetzner console) is still open**. Do not +> retire the break-glass until the integration test passes. See +> `docs/runbooks/integration-testing.md` and ADR-025. + --- ## Part E — Control node (`ubongo`, manual exception) diff --git a/docs/runbooks/new-role.md b/docs/runbooks/new-role.md index 714e1fe..5788977 100644 --- a/docs/runbooks/new-role.md +++ b/docs/runbooks/new-role.md @@ -114,7 +114,20 @@ reason and gets no `BACKUP.md`. Once the backup node exists, `/check-backup +``` + +See `docs/runbooks/integration-testing.md` and ADR-025. + +### 14. Commit ```bash git checkout -b role/ diff --git a/docs/security/accepted-risks.md b/docs/security/accepted-risks.md index 82e3e4e..0801afa 100644 --- a/docs/security/accepted-risks.md +++ b/docs/security/accepted-risks.md @@ -18,8 +18,10 @@ revisit (trigger). 
| R3 | **Self-hosted mesh control plane is a public target on `askari`** — the NetBird coordinator (ADR-016) exposes a management API + dashboard (TCP 80/443) and STUN (UDP 3478) on `askari`'s public IP; the management API controls the whole mesh (NetBird v0.72.4 embeds STUN in the combined server — no separate Coturn) | Self-hosting means **no third-party trust** and an off-site control plane that survives a homelab outage (boma's sovereignty ethos). Residual surface is on `askari` (already a public VPS) and is mitigated: TLS + embedded-IdP login, source-IP restriction where practical, `base` hardening, version-pinned NetBird (ADR-011) patched on boma's cadence | A coordinator compromise or unpatched NetBird CVE; the management plane is reachable without auth/IP-limits; the operational burden makes a hosted coordinator worth reconsidering | | R4 | **No cryptographic WORM for logs** — shipped logs are append-only via Loki's push API and copied off-site to `askari` (ADR-018), but the stored chunks are not object-locked/immutable; a root-on-`askari` attacker could edit history | Append-only push + off-site copy already defeats the realistic threat (a host attacker covering tracks survives even full-cluster compromise). True WORM (object-lock) is forensic-grade cost for boma's opportunistic threat model (R1) | Threat model shifts toward targeted/forensic; a regulatory/evidentiary need appears; `askari` itself is assessed as a likely target | | R5 | **No disk encryption on `ubongo`** — the control node's SSD (SanDisk X600 256 GB, TCG-Opal-capable but Opal unused) is unencrypted at rest, so it holds recovery-critical secrets in plaintext: the Ansible Vault password's `rbw` local cache and (future) Terraform state. 
Physical theft of the box would expose them | `ubongo` is always-on in a physically controlled location; compensating controls are a **BIOS supervisor password** and **disabled external/USB + PXE boot** (an attacker cannot trivially boot another OS to read the disk), and the offline-recoverable design means the irreducible root secret (Vaultwarden master password) is never stored on the box anyway. Full-disk encryption was weighed against the always-on/unattended-reboot requirement (LUKS+TPM auto-unlock or passphrase) and deferred for simplicity at this trust level | `ubongo` is relocated to a less-trusted physical location; the box starts holding additional high-value secrets; or a reinstall onto LUKS (TPM-sealed) is undertaken | +| R6 | **`le-prod-wildcard` integration runs** — when `CERTS=le-prod-wildcard` is passed to `make test-integration`, the production Gandi PAT (`vault.gandi.pat`) is passed to an ephemeral local test VM via the var overlay, and transient `_acme-challenge` TXT records are written into the real `wingu.me` DNS zone to satisfy the Let's Encrypt DNS-01 challenge. A compromised or long-lived test VM could exfiltrate the PAT; the real zone is briefly (seconds) modified | Scope is **on-demand only** — `internal` is the default cert tier (`CERTS=le-staging` for routine ACME-path tests); `le-prod-wildcard` is an explicit opt-in. Compensating controls: the VM is ephemeral and destroyed on success; it sits on an isolated libvirt NAT network (no LAN/mesh access); TXT records are auto-removed by Caddy immediately after validation; the PAT is not persisted inside the VM after the run.
ADR-025 documents the cert-tier design and the three isolation invariants | The PAT is exfiltrated from a test VM; the `wingu.me` zone shows unexpected records; a `CERTS=le-prod-wildcard` run must be audited or the tier must be revoked | +| R7 | **`claude` AI-worker has `NOPASSWD:ALL` sudo on `ubongo`** — the automated AI-worker account can execute any command as root on the control node without a password prompt. A compromised or misbehaving agent session could make arbitrary root-level changes to ubongo | The account is **password-locked** (no interactive `claude` login; `NOPASSWD` sudo is the account's only escalation path, so there is no "su to claude + sudo" attack). `auditd` + Loki attribution (ADR-018) logs every `sudo` invocation with the originating user. The drop-in (`/etc/sudoers.d/claude-ai-worker`) is repo-managed via `base__ai_worker_user` — revocable in one commit + one deploy. Single-operator homelab; all changes in git; off-machine backups (ADR-022). Full rationale: ADR-015 amendment (2026-06-18) + ADR-021 §Sudo model. | The AI-worker executes a destructive action that cannot be rolled back via git; the account key is compromised; the threat model shifts toward targeted remote attackers | -_Last reviewed: 2026-06-11. The prior gaps (full CIS hardening, SELinux/AppArmor, +_Last reviewed: 2026-06-18. The prior gaps (full CIS hardening, SELinux/AppArmor, IDS) were re-challenged and **adopted rather than accepted**: CIS Debian L1+L2 + CIS Docker, AppArmor (enforce), AIDE file-integrity, and Suricata network IDS are now part of the security strategy (ADR-002). 
See STATUS.md / `docs/TODO.md` for build diff --git a/docs/superpowers/plans/2026-06-18-local-vm-integration-testing.md b/docs/superpowers/plans/2026-06-18-local-vm-integration-testing.md new file mode 100644 index 0000000..2b799c4 --- /dev/null +++ b/docs/superpowers/plans/2026-06-18-local-vm-integration-testing.md @@ -0,0 +1,1179 @@ +# Local VM Integration Testing Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Give the agent a `make test-integration HOST=` loop that boots a throwaway KVM VM on ubongo mirroring a real host, applies the real playbooks, performs a **real reboot**, and asserts outcomes — catching the reboot/firewall/Docker class Molecule cannot (the 2026-06-17 incident). + +**Architecture:** A non-service `integration_test` role installs the libvirt/QEMU substrate on ubongo. A stdlib-only driver `scripts/integration-vm.py` orchestrates the lifecycle over `virsh`/`virt-install`/`cloud-localds` (golden Debian-13 image → ephemeral qcow2 overlay → cloud-init seed → boot → apply real playbooks via a single-host transient inventory → reboot → verify playbook → teardown). Stubs and cert-tiers are passed as Ansible `-e @file` extra-vars so the real inventory is never edited and the driver never parses YAML. + +**Tech Stack:** Debian 13 (trixie), libvirt 11.3 / `virt-install` 5.0.0 / QEMU-KVM, cloud-init NoCloud (`cloud-image-utils` 0.33), Ansible, Caddy v2 (DNS-01 via the existing `caddy-gandi` image), Python 3 stdlib, pytest, Molecule (Docker). + +**Verified facts (ADR-014, 2026-06-18):** +- Image: `https://cloud.debian.org/images/cloud/trixie/latest/debian-13-genericcloud-amd64.qcow2` + `SHA512SUMS` alongside. Ships cloud-init; **no qemu-guest-agent** → get IP via `virsh domifaddr --source lease`. 
+- Seed: `cloud-localds seed.img user-data [meta-data]` (`cloud-image-utils`). Label `cidata`. +- `virt-install --import --disk path=...,format=qcow2 --disk path=seed.img,device=cdrom --network network= --osinfo debian13 --graphics none --serial file,path= --noautoconsole` (package `virt-install`; `virtinst` is a transitional shim). +- Isolated NAT net via `virsh net-define/net-start/net-autostart` (own bridge+subnet, ``). +- Caddy: `acme_ca https://acme-staging-v02.api.letsencrypt.org/directory` (global), `tls internal` (self-signed), `tls { dns gandi {env.GANDI_BEARER_TOKEN} }` (DNS-01; module already compiled into the boma `caddy-gandi` image). LE staging limits are effectively unlimited; use staging for routine cert tests. + +**Repo facts this plan extends:** +- `roles/base/templates/nftables.conf.j2:21` — `chain forward { ... policy drop; }`; line 26 `include "{{ base__firewall_dropin_dir }}/*.nft"`; `base__firewall_dropin_dir: /etc/nftables.d`. **The drop-in include already exists** — `docker_host` just needs to ship a `.nft` file. +- `base__firewall_apply` gates application (`roles/base/tasks/firewall.yml:32-35`). +- `roles/docker_host/` installs Docker only; **no container-forward rules** (the green-half fix). +- `roles/reverse_proxy/templates/Caddyfile.j2` — global `acme_dns gandi {env.GANDI_BEARER_TOKEN}` when `reverse_proxy__acme_dns_provider == 'gandi'`; per-site blocks; Gandi PAT via `vault.gandi.pat` → `env.j2` `GANDI_BEARER_TOKEN`. **No `acme_ca` or `tls internal` knob yet** (this plan adds them). +- askari: `inventories/production/offsite.yml` (`ansible_host: 77.42.120.136`, group `offsite_hosts`); `group_vars/offsite_hosts/vars.yml` (`base__firewall_apply: false`, `base__ssh_listen_mesh_only: false`); routes in `group_vars/all/reverse_proxy.yml`. +- `playbooks/site.yml` (base→all, docker_host→docker_hosts) + `playbooks/offsite.yml` (docker_host→reverse_proxy→netbird_coordinator on offsite_hosts). 
+- Makefile vars: `VENV PLAYBOOK_BIN INVENTORY VAULT_ARGS ROLE PLAYBOOK LIMIT TAGS`. pytest in `tests/test_*.py` (no conftest/pytest.ini; importlib-load of hyphenated scripts, see `tests/test_firewall_rules.py:1-13`). Tag vocabulary `tests/tags.yml`; `scripts/check-tags.py` run by `make lint`. +- None of `roles/integration_test/`, `scripts/integration-vm.py`, `tests/integration/` exist. + +--- + +## File Structure + +**Create:** +- `roles/integration_test/` — substrate role (defaults, tasks, handlers, meta, README, molecule/default/{molecule,converge,verify}.yml). Installs libvirt/QEMU/virt-install/cloud-image-utils; enables `libvirtd`; adds `sjat`/`claude` to `libvirt`+`kvm` groups; creates the image cache dir. +- `scripts/integration-vm.py` — stdlib-only driver. Pure helpers + impure orchestration + argparse CLI. +- `tests/test_integration_vm.py` — pytest for the driver's pure helpers. +- `tests/integration/profiles/askari.json` — driver-side profile metadata (groups, playbook+tags list, extra-vars files, mem/vcpu). +- `tests/integration/overrides/askari.yml` — Ansible stub extra-vars (firewall on, ssh break-glass). +- `tests/integration/certs/{internal,le-staging,le-prod-wildcard}.yml` — cert-tier extra-vars. +- `tests/integration/verify.yml` — outcome-based verify playbook. +- `tests/integration/README.md` — how the harness works. +- `docs/decisions/025-local-vm-integration-testing.md` — ADR. +- `docs/runbooks/integration-testing.md` — operator/agent runbook. + +**Modify:** +- `roles/reverse_proxy/defaults/main.yml` + `templates/Caddyfile.j2` — add `reverse_proxy__tls_internal` + `reverse_proxy__acme_ca` knobs. +- `roles/docker_host/defaults/main.yml` + `tasks/main.yml` + new `templates/10-docker-forward.nft.j2` — the container-forward drop-in (green-half). +- `Makefile` — `test-integration`, `test-integration-clean` targets. +- `.gitignore` — `tests/integration/.run/`, `/integration-runs/` is under $HOME (already outside repo). 
+- `docs/decisions/008-testing.md`, `015-control-host.md`; `docs/security/accepted-risks.md`; `CLAUDE.md`; `STATUS.md`; `docs/TODO.md`; `docs/hardware/reference.md` — pointers/entries. + +**Milestones:** RED (Task 15: harness reproduces the incident) → GREEN (Task 16: docker_host fix survives reboot) → le-staging cert tier (Task 17) → governance/docs (Tasks 18-20). + +--- + +## Phase A — Substrate role + +### Task 1: `integration_test` role (libvirt/QEMU substrate) + +**Files:** +- Create: `roles/integration_test/{defaults,tasks,handlers,meta}/main.yml`, `roles/integration_test/README.md`, `roles/integration_test/molecule/default/{molecule,converge,verify}.yml` + +- [ ] **Step 1: Scaffold** + +Run: `make new-role NAME=integration_test` +Expected: `Role integration_test scaffolded at roles/integration_test/` + +- [ ] **Step 2: defaults/main.yml** + +```yaml +--- +# integration_test — installs the local KVM/libvirt substrate on the control node +# (ubongo) so the agent can run throwaway VM integration tests (ADR-025). Non-service +# role; applied to the `control` group. Not a production hypervisor (ADR-015). +integration_test__packages: + - qemu-system-x86 # KVM + - qemu-utils # qemu-img (overlays) + - libvirt-daemon-system + - libvirt-clients # virsh + - virt-install # virt-install (trixie: the real pkg; `virtinst` is transitional) + - cloud-image-utils # cloud-localds (NoCloud seed) + - genisoimage # cloud-localds fallback +# Users granted libvirt/kvm access (run VMs without sudo). +integration_test__users: + - sjat + - claude +# Where the golden image + overlays live (outside the repo). 
+integration_test__cache_dir: "/var/lib/boma-integration" +``` + +- [ ] **Step 3: tasks/main.yml** + +```yaml +--- +- name: Install the KVM/libvirt substrate + ansible.builtin.apt: + name: "{{ integration_test__packages }}" + state: present + update_cache: true + tags: [packages] + +- name: Enable and start libvirtd + ansible.builtin.systemd: + name: libvirtd + enabled: true + state: started + tags: [config] + +- name: Grant users libvirt + kvm access + ansible.builtin.user: + name: "{{ item }}" + groups: [libvirt, kvm] + append: true + loop: "{{ integration_test__users }}" + tags: [users] + +- name: Create the integration cache dir + ansible.builtin.file: + path: "{{ integration_test__cache_dir }}" + state: directory + owner: root + group: libvirt + mode: "2775" + tags: [config] +``` + +- [ ] **Step 4: meta/main.yml** (mirror `roles/dev_env/meta/main.yml`: author `sjat`, Debian/trixie, `min_ansible_version: "2.17"`, `dependencies: []`, description naming ADR-025). **handlers/main.yml** stays `---` (no handlers). **README.md**: purpose, that it targets the `control` group, links ADR-025/ADR-015. + +- [ ] **Step 5: molecule/default/molecule.yml** — copy `roles/dev_env/molecule/default/molecule.yml` verbatim (same Debian-13 systemd image). 
+ +- [ ] **Step 6: molecule/default/converge.yml** + +```yaml +--- +- name: Converge + hosts: all + become: true + gather_facts: true + roles: + - role: integration_test +``` + +- [ ] **Step 7: molecule/default/verify.yml** (assert only what the install tasks produce — NOT that `libvirtd` is active, because KVM cannot run inside the Docker-based Molecule container) + +```yaml +--- +- name: Verify + hosts: all + become: true + gather_facts: false + tasks: + - name: Gather package facts + ansible.builtin.package_facts: + - name: Assert the substrate packages are installed + ansible.builtin.assert: + that: + - "'libvirt-clients' in ansible_facts.packages" + - "'virt-install' in ansible_facts.packages" + - "'cloud-image-utils' in ansible_facts.packages" + - "'qemu-system-x86' in ansible_facts.packages" + - name: Cache dir exists + ansible.builtin.stat: + path: /var/lib/boma-integration + register: _cache + - name: Assert cache dir + ansible.builtin.assert: + that: [_cache.stat.isdir] +``` + +- [ ] **Step 8: Add the role to the control-node play.** Edit `playbooks/workstation.yml` (the control-node playbook that applies `dev_env`) to also import `integration_test` for `control`. Confirm the exact play first: + +Run: `grep -n "dev_env\|hosts:\|control" playbooks/workstation.yml` +Then add under the same `control` play's roles: +```yaml + - role: integration_test + tags: [integration_test] +``` + +- [ ] **Step 9: Lint + Molecule** + +Run: `make lint` +Expected: clean (new role-name tag `integration_test` auto-accepted by check-tags; concern tags `packages`/`config`/`users` are in `tests/tags.yml`). +Run: `make test ROLE=integration_test` +Expected: converge + idempotence + verify PASS. 
+ +- [ ] **Step 10: Commit** + +```bash +git add roles/integration_test playbooks/workstation.yml +git commit -m "feat(integration_test): KVM/libvirt substrate role on the control node" +``` + +--- + +## Phase B — Driver: pure helpers (TDD) + +### Task 2: Driver skeleton + constants + CLI dispatch + +**Files:** +- Create: `scripts/integration-vm.py` +- Test: `tests/test_integration_vm.py` + +- [ ] **Step 1: Write the failing test** (`tests/test_integration_vm.py`) + +```python +import importlib.util +import pathlib + +_PATH = pathlib.Path(__file__).resolve().parent.parent / "scripts" / "integration-vm.py" +_spec = importlib.util.spec_from_file_location("integration_vm", _PATH) +ivm = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(ivm) + + +def test_valid_tiers(): + assert ivm.VALID_TIERS == ("internal", "le-staging", "le-prod-wildcard") +``` + +- [ ] **Step 2: Run it — fails (file missing)** + +Run: `.venv/bin/pytest tests/test_integration_vm.py -q` +Expected: FAIL (cannot load `scripts/integration-vm.py`). + +- [ ] **Step 3: Create the skeleton** (`scripts/integration-vm.py`) + +```python +#!/usr/bin/env python3 +"""boma local-VM integration test harness driver (ADR-025). + +Stdlib-only by convention (TODO-14): never imports a YAML library. The transient +inventory is emitted via string templates; stubs/cert-tiers reach Ansible as +`-e @` extra-vars; profile metadata is JSON. Talks to libvirt via `virsh`. 
+""" +import argparse +import hashlib +import json +import os +import pathlib +import re +import shutil +import subprocess +import sys +import time +import urllib.request +import uuid + +REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent +CACHE_DIR = pathlib.Path(os.environ.get("BOMA_IT_CACHE", "/var/lib/boma-integration")) +IMAGE_URL = "https://cloud.debian.org/images/cloud/trixie/latest/debian-13-genericcloud-amd64.qcow2" +SHA_URL = "https://cloud.debian.org/images/cloud/trixie/latest/SHA512SUMS" +IMAGE_NAME = "debian-13-genericcloud-amd64.qcow2" +NET_NAME = "boma-it" +NET_XML = """<network> + <name>boma-it</name> + <forward mode="nat"/> + <bridge name="virbr-bomait" stp="on" delay="0"/> + <ip address="192.168.150.1" netmask="255.255.255.0"> + <dhcp><range start="192.168.150.10" end="192.168.150.254"/></dhcp> + </ip> +</network> +""" +NAME_PREFIX = "boma-it-" +RUN_DIR = REPO_ROOT / "tests" / "integration" / ".run" +DIAG_ROOT = pathlib.Path.home() / "integration-runs" +PROFILE_DIR = REPO_ROOT / "tests" / "integration" / "profiles" +INTEG_DIR = REPO_ROOT / "tests" / "integration" +CERT_DIR = REPO_ROOT / "tests" / "integration" / "certs" +DEFAULT_MEM_MIB = 3072 +DEFAULT_VCPUS = 2 +MIN_FREE_MIB = 4096 +VALID_TIERS = ("internal", "le-staging", "le-prod-wildcard") + + +def main(argv=None): + p = argparse.ArgumentParser(prog="integration-vm", description=__doc__) + sub = p.add_subparsers(dest="cmd", required=True) + for c in ("up", "apply", "reboot", "assert", "cycle", "down", "console"): + sp = sub.add_parser(c) + sp.add_argument("--host", required=True) + sp.add_argument("--certs", choices=VALID_TIERS, default="internal") + sp.add_argument("--keep", action="store_true") + sp.add_argument("--no-reboot", action="store_true") + sub.add_parser("prune") + args = p.parse_args(argv) + return DISPATCH[args.cmd](args) + + +if __name__ == "__main__": # pragma: no cover + sys.exit(main()) +``` + +(Define `DISPATCH = {...}` after the command functions in later tasks; for now add a temporary `DISPATCH = {}` above `main` so import succeeds.) + +- [ ] **Step 4: Run — passes** + +Run: `.venv/bin/pytest tests/test_integration_vm.py -q` +Expected: PASS. 
+ +- [ ] **Step 5: Commit** + +```bash +git add scripts/integration-vm.py tests/test_integration_vm.py +git commit -m "feat(integration-vm): driver skeleton + CLI dispatch" +``` + +### Task 3: `vm_name`, `free_mib`, `parse_lease_ip` (TDD) + +**Files:** Modify `scripts/integration-vm.py`, `tests/test_integration_vm.py` + +- [ ] **Step 1: Write failing tests** + +```python +def test_vm_name_prefix_and_suffix(): + assert ivm.vm_name("askari", "ab12cd34") == "boma-it-askari-ab12cd34" + +def test_vm_name_generates_suffix(): + n = ivm.vm_name("askari") + assert n.startswith("boma-it-askari-") and len(n.split("-")[-1]) == 8 + +def test_free_mib_parses_memavailable(): + sample = "MemTotal: 16331156 kB\nMemAvailable: 8388608 kB\n" + assert ivm.free_mib(sample) == 8192 + +def test_parse_lease_ip_extracts_ipv4(): + out = (" Name MAC address Protocol Address\n" + "-------------------------------------------------------------------\n" + " vnet0 52:54:00:aa:bb:cc ipv4 192.168.150.42/24\n") + assert ivm.parse_lease_ip(out) == "192.168.150.42" + +def test_parse_lease_ip_none_when_absent(): + assert ivm.parse_lease_ip("no leases\n") is None +``` + +- [ ] **Step 2: Run — fail.** `.venv/bin/pytest tests/test_integration_vm.py -q` → FAIL (no attrs). + +- [ ] **Step 3: Implement** (add to `scripts/integration-vm.py`) + +```python +def vm_name(host, suffix=None): + suffix = suffix or uuid.uuid4().hex[:8] + return f"{NAME_PREFIX}{host}-{suffix}" + + +def free_mib(meminfo_text): + m = re.search(r"^MemAvailable:\s+(\d+)\s+kB", meminfo_text, re.MULTILINE) + return int(m.group(1)) // 1024 if m else 0 + + +def parse_lease_ip(domifaddr_output): + m = re.search(r"ipv4\s+(\d+\.\d+\.\d+\.\d+)", domifaddr_output) + return m.group(1) if m else None +``` + +- [ ] **Step 4: Run — pass.** `.venv/bin/pytest tests/test_integration_vm.py -q` → PASS. 
+ +- [ ] **Step 5: Commit.** `git commit -am "feat(integration-vm): vm naming, RAM guard, lease IP parsing"` + +### Task 4: cloud-init `render_meta_data` / `render_user_data` (TDD) + +**Files:** Modify driver + tests + +- [ ] **Step 1: Write failing tests** + +```python +def test_meta_data_has_instance_and_hostname(): + md = ivm.render_meta_data("iid-askari-x", "boma-it-askari-x") + assert "instance-id: iid-askari-x" in md + assert "local-hostname: boma-it-askari-x" in md + +def test_user_data_injects_key_and_ansible_user(): + ud = ivm.render_user_data("ssh-ed25519 AAAA... claude@ubongo", "ansible") + assert ud.startswith("#cloud-config") + assert "name: ansible" in ud + assert "ssh-ed25519 AAAA... claude@ubongo" in ud + assert "NOPASSWD:ALL" in ud +``` + +- [ ] **Step 2: Run — fail.** + +- [ ] **Step 3: Implement** + +```python +def render_meta_data(instance_id, hostname): + return f"instance-id: {instance_id}\nlocal-hostname: {hostname}\n" + + +def render_user_data(ssh_pubkey, ansible_user): + return ( + "#cloud-config\n" + "users:\n" + f" - name: {ansible_user}\n" + " sudo: 'ALL=(ALL) NOPASSWD:ALL'\n" + " shell: /bin/bash\n" + " ssh_authorized_keys:\n" + f" - {ssh_pubkey}\n" + "ssh_pwauth: false\n" + "package_update: false\n" + ) +``` + +- [ ] **Step 4: Run — pass.** + +- [ ] **Step 5: Commit.** `git commit -am "feat(integration-vm): cloud-init user-data/meta-data rendering"` + +### Task 5: `cert_file`, `profile_path`, `render_run_hosts` (TDD) + +**Files:** Modify driver + tests + +- [ ] **Step 1: Write failing tests** + +```python +def test_cert_file_valid_tier(): + p = ivm.cert_file("le-staging") + assert p.name == "le-staging.yml" and p.parent.name == "certs" + +def test_cert_file_rejects_bad_tier(): + import pytest + with pytest.raises(ValueError): + ivm.cert_file("bogus") + +def test_render_run_hosts_single_host_in_groups(): + out = ivm.render_run_hosts("boma-it-askari-x", "192.168.150.42", + "ansible", ["offsite_hosts"]) + assert "offsite_hosts:" in out + 
assert "boma-it-askari-x:" in out + assert "ansible_host: 192.168.150.42" in out + assert "ansible_user: ansible" in out + # invariant: the real askari host must NOT appear + assert "askari:" not in out.replace("boma-it-askari-x:", "") +``` + +- [ ] **Step 2: Run — fail.** + +- [ ] **Step 3: Implement** + +```python +def cert_file(tier): + if tier not in VALID_TIERS: + raise ValueError(f"unknown cert tier: {tier}") + return CERT_DIR / f"{tier}.yml" + + +def profile_path(host): + return PROFILE_DIR / f"{host}.json" + + +def render_run_hosts(name, ip, ansible_user, groups): + lines = [ + "# Generated by scripts/integration-vm.py — transient, gitignored. Do not edit.", + "# Single test host ONLY (safety invariant: no real host is ever in scope).", + "all:", + " children:", + ] + for g in groups: + lines += [ + f" {g}:", + " hosts:", + f" {name}:", + f" ansible_host: {ip}", + f" ansible_user: {ansible_user}", + ] + return "\n".join(lines) + "\n" +``` + +- [ ] **Step 4: Run — pass.** + +- [ ] **Step 5: Commit.** `git commit -am "feat(integration-vm): cert-tier + profile + transient inventory rendering"` + +--- + +## Phase C — Driver: orchestration (impure) + +### Task 6: `sh` helper + `ensure_image` + +**Files:** Modify driver + +- [ ] **Step 1: Implement the subprocess helper + image fetch** + +```python +def sh(cmd, check=True, capture=False, **kw): + """Run a command (list form). 
Logs the command to stderr.""" + print("+ " + " ".join(str(c) for c in cmd), file=sys.stderr) + return subprocess.run(cmd, check=check, + capture_output=capture, text=True, **kw) + + +def _expected_sha(sha_text, filename): + for line in sha_text.splitlines(): + parts = line.split() + if len(parts) == 2 and parts[1].lstrip("*") == filename: + return parts[0] + return None + + +def ensure_image(): + CACHE_DIR.mkdir(parents=True, exist_ok=True) + img = CACHE_DIR / IMAGE_NAME + if img.exists(): + return img + print(f"Downloading {IMAGE_URL} ...", file=sys.stderr) + tmp = img.with_suffix(".part") + urllib.request.urlretrieve(IMAGE_URL, tmp) + sha_text = urllib.request.urlopen(SHA_URL).read().decode() + want = _expected_sha(sha_text, IMAGE_NAME) + if not want: + tmp.unlink(missing_ok=True) + raise SystemExit(f"checksum for {IMAGE_NAME} not found at {SHA_URL}") + h = hashlib.sha512() + with open(tmp, "rb") as fh: + for chunk in iter(lambda: fh.read(1 << 20), b""): + h.update(chunk) + if h.hexdigest() != want: + tmp.unlink(missing_ok=True) + raise SystemExit("golden image SHA512 mismatch — refusing to use it") + tmp.rename(img) + return img +``` + +- [ ] **Step 2: Manual verification** + +The CLI is not wired up until Task 10 (the `prune` subcommand and `DISPATCH` land there), so `.venv/bin/python scripts/integration-vm.py prune` cannot be run yet; for now, test `ensure_image` directly: +```bash +.venv/bin/python -c "import importlib.util,pathlib; \ +s=importlib.util.spec_from_file_location('ivm','scripts/integration-vm.py'); \ +m=importlib.util.module_from_spec(s); s.loader.exec_module(m); print(m.ensure_image())" +``` +Expected: downloads to `/var/lib/boma-integration/debian-13-genericcloud-amd64.qcow2`, SHA512 verified, prints the path. (Requires Task 1's role applied so the cache dir is group-writable, or run with sudo once.) 
+ +- [ ] **Step 3: Commit.** `git commit -am "feat(integration-vm): golden image fetch + SHA512 verification"` + +### Task 7: `net_ensure`, `up` (boot a VM) + +**Files:** Modify driver + +- [ ] **Step 1: Implement** + +```python +def net_ensure(): + r = sh(["virsh", "net-info", NET_NAME], check=False, capture=True) + if r.returncode != 0: + xml = RUN_DIR / "net.xml" + RUN_DIR.mkdir(parents=True, exist_ok=True) + xml.write_text(NET_XML) + sh(["virsh", "net-define", str(xml)]) + sh(["virsh", "net-autostart", NET_NAME]) + active = sh(["virsh", "net-info", NET_NAME], capture=True).stdout + if "Active: yes" not in active: + sh(["virsh", "net-start", NET_NAME]) + + +def _ssh_pubkey(): + for cand in ("id_ed25519.pub", "id_rsa.pub"): + p = pathlib.Path.home() / ".ssh" / cand + if p.exists(): + return p.read_text().strip() + raise SystemExit("no SSH public key found in ~/.ssh") + + +def up(host, name=None, mem_mib=DEFAULT_MEM_MIB, vcpus=DEFAULT_VCPUS): + free = free_mib(pathlib.Path("/proc/meminfo").read_text()) + if free < MIN_FREE_MIB: + raise SystemExit(f"refusing to start: only {free} MiB free (< {MIN_FREE_MIB})") + running = sh(["virsh", "list", "--name"], capture=True).stdout.split() + if any(n.startswith(NAME_PREFIX) for n in running): + raise SystemExit("an integration VM is already running (one at a time); " + "run `integration-vm prune` first") + name = name or vm_name(host) + img = ensure_image() + net_ensure() + RUN_DIR.mkdir(parents=True, exist_ok=True) + overlay = RUN_DIR / f"{name}.qcow2" + sh(["qemu-img", "create", "-f", "qcow2", "-F", "qcow2", "-b", str(img), str(overlay)]) + (RUN_DIR / "user-data").write_text(render_user_data(_ssh_pubkey(), "ansible")) + (RUN_DIR / "meta-data").write_text(render_meta_data(f"iid-{name}", name)) + seed = RUN_DIR / f"{name}-seed.img" + sh(["cloud-localds", str(seed), str(RUN_DIR / "user-data"), str(RUN_DIR / "meta-data")]) + DIAG_ROOT.mkdir(parents=True, exist_ok=True) + console = DIAG_ROOT / f"{name}-console.log" + 
sh(["virt-install", "--name", name, "--memory", str(mem_mib), "--vcpus", str(vcpus), + "--import", + "--disk", f"path={overlay},format=qcow2", + "--disk", f"path={seed},device=cdrom", + "--network", f"network={NET_NAME}", + "--osinfo", "debian13", + "--graphics", "none", + "--serial", f"file,path={console}", + "--noautoconsole"]) + ip = wait_for_ip(name) + wait_for_ssh(ip, "ansible") + (RUN_DIR / "current").write_text(f"{name}\n{ip}\n{host}\n") + print(f"VM {name} up at {ip}") + return name, ip + + +def wait_for_ip(name, timeout=120): + end = time.time() + timeout + while time.time() < end: + out = sh(["virsh", "domifaddr", name, "--source", "lease"], + check=False, capture=True).stdout + ip = parse_lease_ip(out) + if ip: + return ip + time.sleep(4) + raise SystemExit(f"timed out waiting for {name} to get a DHCP lease") + + +def wait_for_ssh(ip, user, timeout=180): + end = time.time() + timeout + while time.time() < end: + r = sh(["ssh", "-o", "StrictHostKeyChecking=no", + "-o", "UserKnownHostsFile=/dev/null", "-o", "ConnectTimeout=5", + f"{user}@{ip}", "true"], check=False, capture=True) + if r.returncode == 0: + return + time.sleep(5) + raise SystemExit(f"timed out waiting for SSH to {ip}") +``` + +- [ ] **Step 2: Manual smoke (real KVM — requires Task 1 applied to ubongo)** + +```bash +.venv/bin/python scripts/integration-vm.py up --host askari # via DISPATCH once Task 10 lands +``` +Expected: golden image present, `boma-it` net active, overlay + seed created, VM boots, prints `VM boma-it-askari-<suffix> up at 192.168.150.x`. SSH in: `ssh ansible@<ip>` works. 
+ +- [ ] **Step 3: Commit.** `git commit -am "feat(integration-vm): network + VM boot (overlay, cloud-init seed, virt-install import)"` + +### Task 8: `write_run_inventory`, `apply` + +**Files:** Modify driver + +- [ ] **Step 1: Implement** + +```python +def _read_current(): + txt = (RUN_DIR / "current").read_text().splitlines() + return txt[0], txt[1], txt[2] # name, ip, host + + +def write_run_inventory(name, ip, groups): + RUN_DIR.mkdir(parents=True, exist_ok=True) + (RUN_DIR / "hosts.yml").write_text( + render_run_hosts(name, ip, "ansible", groups)) + link = RUN_DIR / "group_vars" + target = REPO_ROOT / "inventories" / "production" / "group_vars" + if link.is_symlink() or link.exists(): + if link.is_symlink(): + link.unlink() + if not link.exists(): + link.symlink_to(target) + + +def apply(host, certs): + name, ip, _ = _read_current() + prof = json.loads(profile_path(host).read_text()) + write_run_inventory(name, ip, prof["groups"]) + extra = [] + for f in prof.get("extra_vars_files", []): + extra += ["-e", f"@{INTEG_DIR / f}"] + extra += ["-e", f"@{cert_file(certs)}"] + for step in prof["applies"]: + cmd = [".venv/bin/ansible-playbook", "-i", str(RUN_DIR) + "/", + f"playbooks/{step['playbook']}", "--limit", name] + if step.get("tags"): + cmd += ["--tags", ",".join(step["tags"])] + cmd += extra + sh(cmd, cwd=str(REPO_ROOT)) + print(f"applied {host} profile to {name}") +``` + +- [ ] **Step 2: Manual verification** — deferred to the Task 15 RED run (needs the profile/overlay/cert files from Phase D). Lint passes regardless. 
+ +- [ ] **Step 3: Commit.** `git commit -am "feat(integration-vm): transient inventory + real-playbook apply"` + +### Task 9: `reboot_vm`, `run_assert`, `dump_diagnostics` + +**Files:** Modify driver + +- [ ] **Step 1: Implement** + +```python +def reboot_vm(): + name, ip, _ = _read_current() + sh(["virsh", "reboot", name]) + time.sleep(5) + wait_for_ssh(ip, "ansible") + print(f"{name} rebooted, SSH back at {ip}") + + +def run_assert(host, certs): + name, ip, _ = _read_current() + prof = json.loads(profile_path(host).read_text()) + write_run_inventory(name, ip, prof["groups"]) + extra = [] + for f in prof.get("extra_vars_files", []): + extra += ["-e", f"@{INTEG_DIR / f}"] + extra += ["-e", f"@{cert_file(certs)}"] + cmd = [".venv/bin/ansible-playbook", "-i", str(RUN_DIR) + "/", + "tests/integration/verify.yml", "--limit", name] + extra + r = sh(cmd, cwd=str(REPO_ROOT), check=False) + if r.returncode != 0: + dump_diagnostics(name, ip) + raise SystemExit(f"VERIFY FAILED for {name} — diagnostics in {DIAG_ROOT}") + print(f"VERIFY PASSED for {name}") + + +def dump_diagnostics(name, ip): + d = DIAG_ROOT / name + d.mkdir(parents=True, exist_ok=True) + for label, cmd in [ + ("nft", "nft list ruleset"), + ("docker", "docker ps -a"), + ("ss", "ss -tlnp"), + ("journal", "journalctl -b --no-pager"), + ("critical-chain", "systemd-analyze critical-chain"), + ]: + r = sh(["ssh", "-o", "StrictHostKeyChecking=no", + "-o", "UserKnownHostsFile=/dev/null", + f"ansible@{ip}", "sudo " + cmd], check=False, capture=True) + (d / f"{label}.txt").write_text((r.stdout or "") + (r.stderr or "")) + console = DIAG_ROOT / f"{name}-console.log" + if console.exists(): + shutil.copy(console, d / "console.log") + print(f"diagnostics written to {d}", file=sys.stderr) +``` + +- [ ] **Step 2: Commit.** `git commit -am "feat(integration-vm): reboot, verify run, failure diagnostics"` + +### Task 10: `down`, `prune`, `console`, `cycle` + `DISPATCH` + +**Files:** Modify driver + +- [ ] **Step 1: Implement** 
+ +```python +def _destroy(name): + sh(["virsh", "destroy", name], check=False) + sh(["virsh", "undefine", name, "--nvram"], check=False) + for f in RUN_DIR.glob(f"{name}*"): + f.unlink(missing_ok=True) + + +def down(host=None, keep=False): + if keep: + print("--keep: leaving the VM running for inspection") + return + cur = RUN_DIR / "current" + if cur.exists(): + name = cur.read_text().splitlines()[0] + _destroy(name) + cur.unlink(missing_ok=True) + print(f"destroyed {name}") + + +def prune(): + running = sh(["virsh", "list", "--all", "--name"], capture=True).stdout.split() + for n in running: + if n.startswith(NAME_PREFIX): + _destroy(n) + print(f"pruned {n}") + (RUN_DIR / "current").unlink(missing_ok=True) + + +def console(): + name = (RUN_DIR / "current").read_text().splitlines()[0] + log = DIAG_ROOT / f"{name}-console.log" + print(log.read_text() if log.exists() else f"no console log at {log}") + + +def cycle(host, certs, keep=False, no_reboot=False): + try: + up(host) + apply(host, certs) + if not no_reboot: + reboot_vm() + run_assert(host, certs) + finally: + # On success destroy; on failure (SystemExit) keep for inspection unless --keep flips it. + if not keep: + down(host) +``` + +Wire the dispatch (replace the temporary `DISPATCH = {}`): +```python +DISPATCH = { + "up": lambda a: (up(a.host), None)[1], + "apply": lambda a: apply(a.host, a.certs), + "reboot": lambda a: reboot_vm(), + "assert": lambda a: run_assert(a.host, a.certs), + "down": lambda a: down(a.host, a.keep), + "console": lambda a: console(), + "prune": lambda a: prune(), + "cycle": lambda a: cycle(a.host, a.certs, a.keep, a.no_reboot), +} +``` +Fix `cycle`'s teardown semantics: on **failure** keep the VM (so it can be inspected); on **success** destroy. 
Implement by catching success explicitly: +```python +def cycle(host, certs, keep=False, no_reboot=False): + ok = False + try: + up(host); apply(host, certs) + if not no_reboot: + reboot_vm() + run_assert(host, certs) + ok = True + finally: + if ok and not keep: + down(host) + elif not ok: + print("FAILED — VM left up for inspection; `integration-vm prune` to clean.", + file=sys.stderr) +``` + +- [ ] **Step 2: Run unit tests + lint.** `.venv/bin/pytest tests/test_integration_vm.py -q` PASS; `make lint` clean. + +- [ ] **Step 3: Commit.** `git commit -am "feat(integration-vm): teardown, prune, console, full cycle + dispatch"` + +--- + +## Phase D — Profile, cert `internal` tier, verify playbook + +### Task 11: reverse_proxy `tls internal` + `acme_ca` knobs + +**Files:** Modify `roles/reverse_proxy/defaults/main.yml`, `roles/reverse_proxy/templates/Caddyfile.j2` + +- [ ] **Step 1: defaults** — append: + +```yaml +# Integration-test / staging cert knobs (ADR-025). Default off = production behaviour. +reverse_proxy__tls_internal: false # true => every site uses Caddy's self-signed CA +reverse_proxy__acme_ca: "" # set to the LE staging directory URL to use staging +``` + +- [ ] **Step 2: Caddyfile.j2** — in the global options block (after the `email` line), add: + +```jinja +{% if reverse_proxy__acme_ca %} + acme_ca {{ reverse_proxy__acme_ca }} +{% endif %} +``` + +In each site block (inside `{{ r['host'] }} {`), add as the first directive: + +```jinja +{% if reverse_proxy__tls_internal %} + tls internal +{% endif %} +``` + +- [ ] **Step 3: Molecule regression** — confirm `reverse_proxy` still renders. If the role has a Molecule scenario, run `make test ROLE=reverse_proxy`; else `make lint`. +Expected: clean; default-off means production output is byte-identical (the `{% if %}` blocks emit nothing). 
+ +- [ ] **Step 4: Commit.** `git commit -am "feat(reverse_proxy): tls-internal + acme_ca knobs for integration/staging (ADR-025)"` + +### Task 12: askari profile + overlay + cert-tier files + +**Files:** Create `tests/integration/profiles/askari.json`, `tests/integration/overrides/askari.yml`, `tests/integration/certs/{internal,le-staging,le-prod-wildcard}.yml` + +- [ ] **Step 1: `profiles/askari.json`** + +```json +{ + "groups": ["offsite_hosts"], + "applies": [ + {"playbook": "site.yml", "tags": ["base"]}, + {"playbook": "offsite.yml", "tags": ["docker_host", "reverse_proxy"]} + ], + "extra_vars_files": ["overrides/askari.yml"], + "mem_mib": 3072, + "vcpus": 2 +} +``` + +(`netbird_coordinator` is intentionally omitted from v1 `applies` — Caddy's published :443 gives the DNAT that reproduces FRICTION #1. Coordinator fidelity (#3/#4) is a follow-on, Task 21.) + +- [ ] **Step 2: `overrides/askari.yml`** (Ansible extra-vars; highest precedence — never edits real inventory) + +```yaml +--- +# Integration-test overlay for the "askari" profile (ADR-025). Passed via `-e @`. +# Reproduces the 2026-06-17 incident: apply base's nftables default-deny to a Docker host. +base__firewall_apply: true +# Keep a break-glass: sshd stays on all interfaces (never wt0-only in a throwaway VM). +base__ssh_listen_mesh_only: false +# The VM is isolated; it must never touch the real mesh. +base__mesh_enabled: false +``` + +- [ ] **Step 3: cert-tier files** + +`certs/internal.yml`: +```yaml +--- +reverse_proxy__tls_internal: true +``` +`certs/le-staging.yml`: +```yaml +--- +reverse_proxy__tls_internal: false +reverse_proxy__acme_dns_provider: gandi +reverse_proxy__acme_ca: "https://acme-staging-v02.api.letsencrypt.org/directory" +``` +`certs/le-prod-wildcard.yml`: +```yaml +--- +# On-demand only. Records an accepted risk (ADR-025 / accepted-risks.md): the prod +# Gandi PAT reaches an ephemeral VM and transient TXT records land in the real wingu.me. 
+reverse_proxy__tls_internal: false +reverse_proxy__acme_dns_provider: gandi +reverse_proxy__acme_ca: "" +``` + +- [ ] **Step 4: Commit.** `git commit -am "feat(integration): askari profile, stub overlay, cert-tier files"` + +### Task 13: verify playbook + +**Files:** Create `tests/integration/verify.yml` + +- [ ] **Step 1: Write it** + +```yaml +--- +# Integration verify (ADR-025). Outcome-based: proves Docker forwarding survives the +# reboot. The load-bearing check probes the VM's published :443 FROM the controller +# (ubongo) — if base's forward-drop killed DNAT, this times out (the FRICTION #1 bug). +- name: Verify the rebooted host + hosts: all + become: true + gather_facts: false + tasks: + - name: Docker daemon is active + ansible.builtin.command: systemctl is-active docker + changed_when: false + + - name: Forward chain permits container traffic (drop-in loaded) + ansible.builtin.command: nft list chain inet filter forward + register: _fwd + changed_when: false + + - name: Assert container forwarding is allowed (not pure drop) + ansible.builtin.assert: + that: "'accept' in _fwd.stdout" + fail_msg: >- + forward chain is pure drop — container forwarding will die on reboot + (FRICTION 2026-06-17 #1). docker_host container-forward drop-in missing. + + - name: Published HTTPS port answers from the controller (DNAT + forward alive) + delegate_to: localhost + become: false + ansible.builtin.uri: + url: "https://{{ ansible_host }}/" + validate_certs: false + status_code: [200, 308, 404, 502, 503] + timeout: 10 + register: _probe + retries: 5 + delay: 6 + until: _probe is succeeded +``` + +- [ ] **Step 2: Lint.** `make lint` — clean (file is under `tests/`, not `playbooks/`, but keep tags valid; this play uses none, which is fine). 
+ +- [ ] **Step 3: Commit.** `git commit -am "feat(integration): outcome-based verify playbook (DNAT-survives-reboot)"` + +--- + +## Phase E — Makefile + RED milestone + +### Task 14: Makefile targets + .gitignore + +**Files:** Modify `Makefile`, `.gitignore` + +- [ ] **Step 1: Makefile** — add after the `test-all` target: + +```makefile +test-integration: +ifndef HOST + $(error HOST is required: make test-integration HOST= [CERTS=internal|le-staging] [KEEP=1]) +endif + PATH="$(CURDIR)/$(VENV)/bin:$$PATH" $(PYTHON) scripts/integration-vm.py cycle \ + --host $(HOST) $(if $(CERTS),--certs $(CERTS)) $(if $(KEEP),--keep) + +test-integration-clean: + PATH="$(CURDIR)/$(VENV)/bin:$$PATH" $(PYTHON) scripts/integration-vm.py prune +``` + +Add both to `.PHONY` and the `help` block (match the existing style). + +- [ ] **Step 2: .gitignore** — add: + +``` +# Integration-test transient run dir (ADR-025); diagnostics live under ~/integration-runs +tests/integration/.run/ +``` + +- [ ] **Step 3: Commit.** `git commit -am "feat(make): test-integration / test-integration-clean targets"` + +### Task 15: RED milestone — reproduce the incident + +**Files:** none (a validation run); record the outcome. + +- [ ] **Step 1: Pre-flight** — confirm `rbw unlocked` (the apply decrypts `group_vars/all/vault.yml`); confirm Task 1's role is applied to ubongo (`virsh version` works, you're in the `libvirt` group — may need a re-login). + +- [ ] **Step 2: Run the cycle on TODAY's base (no docker_host fix yet)** + +Run: `make test-integration HOST=askari` +Expected: VM boots → base (firewall on) + docker_host + reverse_proxy apply → **reboot** → verify **FAILS** at "Assert container forwarding is allowed" and/or the :443 probe times out. Diagnostics appear under `~/integration-runs/boma-it-askari-/` (nft shows `forward { policy drop }` with no accepts; the published port is dead). 
+ +- [ ] **Step 3: Confirm the failure is the RIGHT one** — read `~/integration-runs/<vm-name>/nft.txt`: the `inet filter forward` chain is pure `policy drop`. This is the faithful reproduction of FRICTION #1. **If verify PASSES here, the harness is not faithful — stop and investigate** (e.g. Docker re-added its own accepts, or the firewall didn't apply). + +- [ ] **Step 4: Clean up.** `make test-integration-clean` + +- [ ] **Step 5: Record** — append a `[gotcha]`/milestone note to `docs/FRICTION.md` Open signals: "ADR-025 harness reproduced the 2026-06-17 firewall×Docker×reboot bug on a local VM (RED). Diagnostics: nft forward pure-drop, :443 DNAT dead post-reboot." Commit: +```bash +git commit -am "test(integration): RED — harness reproduces the 2026-06-17 incident" +``` + +--- + +## Phase F — GREEN milestone (docker_host fix) + +### Task 16: docker_host container-forward drop-in + +**Files:** Modify `roles/docker_host/defaults/main.yml`, `roles/docker_host/tasks/main.yml`; Create `roles/docker_host/templates/10-docker-forward.nft.j2` + +- [ ] **Step 1: defaults** — append: + +```yaml +# Container-forward nftables drop-in (FRICTION 2026-06-17 #1 / ADR-025). base's inet +# filter forward chain is `policy drop`; a drop verdict there is final, so Docker's own +# ip-filter accepts can't save forwarded container traffic. We append accepts to base's +# forward chain via base's /etc/nftables.d/*.nft include. Only meaningful on hosts where +# base__firewall_apply is true. +docker_host__forward_dropin: true +``` + +- [ ] **Step 2: template `templates/10-docker-forward.nft.j2`** + +```jinja +# {{ ansible_managed }} +# Allow container forwarding through base's default-deny forward chain (ADR-025). 
+table inet filter { + chain forward { + ct state established,related accept + iifname "docker0" accept + oifname "docker0" accept + iifname "br-*" accept + oifname "br-*" accept + } +} +``` + +- [ ] **Step 3: tasks/main.yml** — append (after Docker install): + +```yaml +- name: Install the container-forward nftables drop-in + ansible.builtin.template: + src: 10-docker-forward.nft.j2 + dest: "{{ base__firewall_dropin_dir }}/10-docker-forward.nft" + mode: "0644" + when: docker_host__forward_dropin | bool + notify: reload nftables + tags: [firewall] +``` + +Confirm the handler name base exposes: +Run: `grep -rn "listen:\|reload nftables\|nftables" roles/base/handlers/main.yml` +Use base's actual handler `listen:` topic; if none fits, add a `docker_host` handler that runs `nft -f /etc/nftables.conf` (the same reload base uses). Show the handler you add in `roles/docker_host/handlers/main.yml`: +```yaml +--- +- name: reload nftables + ansible.builtin.command: nft -f /etc/nftables.conf + listen: reload nftables +``` + +- [ ] **Step 4: GREEN run** + +Run: `make test-integration HOST=askari` +Expected: apply (now includes the drop-in) → reboot → verify **PASSES** (forward chain has `accept` rules; :443 answers from ubongo). This is the red→green proof. + +If it still fails, read diagnostics and iterate the `.nft` rules (e.g. Docker's compose bridges, or a NAT/masquerade gap) — **this is exactly what the harness is for**. Keep iterating Step 2 until verify passes. + +- [ ] **Step 5: Idempotence + lint + Molecule.** `make lint`; `make test ROLE=docker_host` (add a Molecule assertion that the drop-in file renders if the role has a scenario). 
+ +- [ ] **Step 6: Commit.** `git commit -am "fix(docker_host): container-forward nftables drop-in survives reboot (FRICTION #1, ADR-025)"` + +--- + +## Phase G — le-staging cert tier + +### Task 17: validate `--certs le-staging` + +**Files:** none new (exercises Task 11/12); may tweak `overrides/askari.yml` if DNS-01 names need adjusting. + +- [ ] **Step 1: Pre-flight** — `rbw unlocked` (the run needs `vault.gandi.pat` for DNS-01). The VM needs outbound egress (the `boma-it` NAT net provides it). + +- [ ] **Step 2: Run with the staging cert tier** + +Run: `make test-integration HOST=askari CERTS=le-staging` +Expected: same apply, but Caddy now uses DNS-01 against LE **staging** (untrusted root) for the profile's route hostnames (under `wingu.me`, whose DNS lives at Gandi). Verify still passes (the :443 probe uses `validate_certs: false`). + +- [ ] **Step 3: Confirm a real staging cert issued** — `make test-integration HOST=askari CERTS=le-staging KEEP=1`, then: +```bash +NAME=$(.venv/bin/python -c "print(open('tests/integration/.run/current').read().split()[0])") +IP=$(sed -n 2p tests/integration/.run/current) +ssh ansible@$IP "sudo docker exec caddy ls /data/caddy/certificates" # adjust to the caddy data path +``` +Expected: a cert dir under an `acme-staging-v02...` issuer path (proves the DNS-01 staging path works end to end). Then `make test-integration-clean`. + +- [ ] **Step 4: Commit** (only if `overrides`/`certs` needed tweaks): `git commit -am "test(integration): validate le-staging DNS-01 cert path"` + +--- + +## Phase H — Governance & docs + +### Task 18: ADR-025 + +**Files:** Create `docs/decisions/025-local-vm-integration-testing.md` + +- [ ] **Step 1: Write the ADR** — use `docs/decisions/adr-template.md`. Content (no placeholders — write these in full): + - **Status:** Accepted (2026-06-18). 
+ - **Context:** Molecule (Level 1) can't catch reboot/firewall/Docker/boot-order bugs; the 2026-06-17 incident; ADR-008 Level 2/3 was deferred for lack of hosts but ubongo can host local KVM (verified `/dev/kvm` + VT-x). + - **Decision:** libvirt/KVM (Approach A), one throwaway VM at a time from real inventory ("be askari"), stdlib driver over `virsh`, tiered certs (`internal` default, `le-staging` built, `le-prod-wildcard` on-demand), Ansible-managed substrate role, stubs via `-e @` overlays. + - **Alternatives rejected:** Proxmox-nested (heavy, ADR-015 tension, bugs aren't in provisioning); Vagrant (Ruby/plugin footprint, box drift); terraform-provider-libvirt (poor at imperative reboot loop, blurs ADR-006). + - **Consequences:** new RAM load on ubongo (resource guard + one-at-a-time); reconciles ADR-015; accepted risk for `le-prod-wildcard`. Cross-reference ADR-008/015/006/024/016/020. + +- [ ] **Step 2: Commit.** `git commit -am "docs(adr): ADR-025 local VM integration testing"` + +### Task 19: pointers + entries + +**Files:** Modify `docs/decisions/008-testing.md`, `docs/decisions/015-control-host.md`, `docs/security/accepted-risks.md`, `CLAUDE.md`, `STATUS.md`, `docs/TODO.md`, `docs/hardware/reference.md` + +- [ ] **Step 1: ADR-008** — in the "what Molecule does NOT test" section, add a line: reboot-survivability / host-firewall×Docker / boot-order are now covered by **local VM integration testing (ADR-025)**; add ADR-025 to the Level 2/3 description as its concrete build. + +- [ ] **Step 2: ADR-015** — one line: ubongo runs **ephemeral KVM test VMs** as part of its local-test-runner role (ADR-025) — still not a production hypervisor; note the test-VM RAM load against the 16 GiB sizing. + +- [ ] **Step 3: accepted-risks.md** — add an entry: *le-prod-wildcard integration runs* — the production Gandi PAT (`vault.gandi.pat`) reaches an ephemeral local VM and transient `_acme-challenge` TXT records are written into the real `wingu.me` zone. 
Scope: on-demand only; staging is the default. Compensating: ephemeral VM, NAT-isolated, TXT auto-removed by Caddy. Owner/date. + +- [ ] **Step 4: CLAUDE.md** — add to the key-commands table: +``` +| Integration-test a host on a local VM | `make test-integration HOST=<host> [CERTS=…]` | +| Clean up integration test VMs | `make test-integration-clean` | +``` + +- [ ] **Step 5: STATUS.md** — add `roles/integration_test/` + `scripts/integration-vm.py` to "Built + working"; note the RED→GREEN acceptance passed. + +- [ ] **Step 6: TODO.md** — collapse item 2.4 to a one-line pointer: "→ ADR-025 / `make test-integration` (built 2026-06-18)." (Do NOT renumber other items.) + +- [ ] **Step 7: hardware/reference.md** — add a note to ubongo's row/workloads: one integration VM (~3 GiB) at a time; don't run alongside a heavy Level-4 browser session. + +- [ ] **Step 8: Commit.** `git commit -am "docs: wire ADR-025 into testing/control-host/risks/status/todo/capacity"` + +### Task 20: runbook + +**Files:** Create `docs/runbooks/integration-testing.md` + +- [ ] **Step 1: Write it** — sections: when to use it (firewall/sshd/boot/Docker changes, operationalises the standing "test risky infra before live deploy" rule + FRICTION #6 "validate reboot-recovery before retiring break-glass"); commands (`cycle`/`up`/`apply`/`reboot`/`assert`/`down`/`prune`/`console`, `--certs`, `--keep`); where diagnostics land (`~/integration-runs/`); how to inspect a kept failed VM (`virsh console`, ssh); the safety invariants; adding a new profile (a `profiles/<host>.json` + `overrides/<host>.yml`); the cert tiers and when to use each. + +- [ ] **Step 2: Add a pre-flight line** to `docs/runbooks/new-host.md` and the hardening runbook: before a lockout-risky change, `make test-integration HOST=<host>` and confirm reboot-recovery while the break-glass is still open.
+ +- [ ] **Step 3: Commit.** `git commit -am "docs(runbook): integration-testing runbook + pre-flight cross-links"` + +--- + +## Deferred (out of v1 scope — track in TODO/FRICTION, not this plan) + +- **Task 21 (follow-on): coordinator fidelity** — add `netbird_coordinator` to the askari profile's `applies` + the geo-DB stub var (needs reading `roles/netbird_coordinator/`), so signals #3 (mesh-bootstrap circularity) and #4 (egress FATAL-loop) reproduce. v1 gate is #1 only. +- **`le-prod-wildcard` issuance/persistence** — issue `*.test.wingu.me` once, persist on ubongo, mount into the VM. Wired (cert file exists) but unused until needed. +- **Multi-VM mini-staging** — inter-host mesh/dataplane. +- **Snapshot/`reset`** — post-apply libvirt snapshot for fast re-runs without re-applying base roles. + +--- + +## Self-Review + +**Spec coverage:** Approach A → Tasks 6-10. Substrate role → Task 1. Single-VM "be askari" → Tasks 12/15. Acceptance red→green → Tasks 15/16. Tiered certs (`internal`+`le-staging` built, `le-prod-wildcard` wired) → Tasks 11/12/17. Ansible-managed substrate → Task 1. Stubs in overlay (not inventory) → Task 12 (`-e @`). Safety invariants → Task 5 (single-host inv) + Task 12 (`mesh_enabled: false`) + Task 7 (isolated NAT). Resource guard / one-at-a-time → Task 7. Diagnostics → Task 9. Governance (ADR-025, ADR-008/015 pointers, accepted-risks, CLAUDE.md, runbook, STATUS, TODO, capacity) → Tasks 18-20. **Gap closed:** coordinator (#3/#4) explicitly deferred to Task 21 with the v1 gate stated as #1 — matches the spec's "minimum credible v1 is the red half" scoping. + +**Placeholder scan:** none — `_destroy`'s `--nvram` and the caddy data path in Task 17 Step 3 carry "adjust to actual" notes (verification actions, not placeholders). The base nftables handler name is a confirm-then-use step (Task 16 Step 3), not a guess. 
+ +**Type/name consistency:** `vm_name/free_mib/parse_lease_ip/render_meta_data/render_user_data/cert_file/profile_path/render_run_hosts` (pure, Tasks 3-5) ↔ used by `up/apply/run_assert` (Tasks 7-9). `RUN_DIR/current` written by `up` (Task 7), read by `_read_current` (Task 8). `DISPATCH` keys ↔ argparse subcommands (Task 2/10). Profile JSON keys (`groups`/`applies`/`extra_vars_files`/`mem_mib`/`vcpus`) ↔ `apply` (Task 8) + `askari.json` (Task 12). Cert files ↔ `cert_file` (Task 5) + Task 12. `base__firewall_dropin_dir` ↔ Task 16 template dest. diff --git a/docs/superpowers/specs/2026-06-18-local-vm-integration-testing-design.md b/docs/superpowers/specs/2026-06-18-local-vm-integration-testing-design.md new file mode 100644 index 0000000..d1edc09 --- /dev/null +++ b/docs/superpowers/specs/2026-06-18-local-vm-integration-testing-design.md @@ -0,0 +1,267 @@ +# Local VM integration testing on ubongo (design) + +**Status:** Designed, not built. Resolves `docs/TODO.md` item 2.4 (Local VM integration +testing on ubongo, pre-deploy). +**Date:** 2026-06-18. +**Implements:** the concrete build of ADR-008 Level 2/3 (staging/integration), deferred +for lack of hosts but hostable on ubongo. To be recorded as **ADR-025**. + +## Context + +Molecule (ADR-008 Level 1) tests each role in a single Docker container: one `converge`, +no real kernel netfilter, no real Docker daemon in the loop, and **no reboot**. That +structurally cannot catch an entire class of bug — reboot-survivability, host-firewall × +Docker interaction, and boot-ordering — which is exactly the class that caused the +**2026-06-17 mesh-hardening incident**: + +- `base`'s nftables `forward { policy drop; }` killed the askari Docker host **on reboot** + (nftables loaded its default-deny *before* Docker, breaking published-port DNAT and + inter-container forwarding → public services + the mesh went down). It had worked right + after `make deploy`, when Docker's runtime rules still coexisted. (FRICTION 2026-06-17 #1.) 
+- `ip_nonlocal_bind` did **not** beat the sshd boot-race; sshd bound to the `wt0` mesh IP + had no listener at boot. (FRICTION #2.) +- The coordinator host could not bootstrap the mesh it itself hosts. (FRICTION #3.) +- NetBird `netbird-server` FATAL-loops on the GeoLite2 download when egress is lost — and + egress was lost when `nft flush` wiped Docker's NAT masquerade. (FRICTION #4.) + +Recovery needed the Hetzner console + a WAN-SSH break-glass. The lesson, already crystallised +as a standing rule: *firewall/sshd/boot changes must be tested on a real VM with a real +reboot before they touch a live host, and a non-mesh break-glass must be kept.* + +This spec defines a way for the agent to spin up **throwaway KVM VMs locally on ubongo** +that mirror a target host (real Docker, a real reboot, the real role apply) and validate +risky infra changes **before** a live deploy. ubongo can host this today: + +> verified: ubongo KVM capability · Bash (this session) · `/dev/kvm` present + accessible +> (kvm group), Intel VT-x (`vmx`) enabled, 8 vCPU (i3-10100T), ~13 GiB RAM free of 16, ~198 +> GiB disk free; libvirt/QEMU/Vagrant **not yet installed** · 2026-06-18. + +## Goals + +- Reproduce the 2026-06-17 bug class locally: real OS boot, real Docker, real netfilter, + the real role apply, a **real reboot**, then outcome assertions. +- Let the agent drive the full loop autonomously: provision → apply → reboot → assert → + teardown, with diagnostics captured on failure. +- Mirror a *real* host from inventory (first profile: "be askari"), so the apply is + faithful, not synthetic. +- Be the concrete tool that operationalises the standing "test risky infra before live + deploy" rule. + +## Non-goals (v1) + +- Not a production hypervisor on ubongo (reconciles ADR-015 — see Governance). +- Not nested Proxmox; the provisioning *chrome* (template clone / Terraform) is **not** + mirrored — every incident bug lives in the boot/kernel/Docker layer, not provisioning. 
+- Not a multi-VM mini-cluster; one VM at a time. (All six 2026-06-17 signals occurred on a + single host that was Docker host + coordinator + mesh peer.) Multi-VM is a later extension. +- Not a CI gate; this is an interactive, agent-driven pre-deploy check on ubongo (CI stays + lint + Molecule per ADR-008/010). + +## Decisions (from the 2026-06-18 brainstorm) + +1. **Virtualisation approach: libvirt/KVM directly (Approach A).** A golden Debian-13 + genericcloud qcow2 cached locally; each run boots an ephemeral qcow2 overlay backed by + it, seeded via cloud-init NoCloud, driven by a **stdlib-only** Python script over + `virsh` (no `libvirt-python` dependency). Chosen over Vagrant+vagrant-libvirt (Ruby/plugin + footprint, box drift from the real cloud image) and terraform-provider-libvirt (poor at + the imperative apply→reboot→re-apply sequence, throwaway state, blurs ADR-006's prod-VM + boundary). Lightest footprint on a 15 GiB control node; full control of the reboot step; + the same Debian cloud image real hosts boot. + +2. **Fidelity envelope: real OS/Docker/netfilter/reboot, not the Proxmox provisioning + path.** A lightweight local hypervisor is enough because the bugs are post-boot. + +3. **Scope: one throwaway VM at a time, instantiated from a real host's inventory.** First + profile: **"be askari"** (Docker host + NetBird coordinator + mesh peer on one box). The + mechanism is generic — later "be" any host by swapping which inventory host it mirrors. + +4. **Acceptance is self-validating against the real failure.** Done = the harness, on a + local VM, applies `base` (firewall on) to a Docker host, reboots, and **observes the + 2026-06-17 breakage** (Docker forwarding dead / services down); then, with the + `docker_host` container-forward drop-in in place, the same run **survives the reboot**. + If step 1 passes, the harness is not faithful. + +5. 
**Tiered cert fidelity via a `--certs` knob** (DNS-01 is what makes real certs possible + with no public inbound — validation is out-of-band via a Gandi TXT record; the VM needs + only outbound to ACME + Gandi, which the NAT net provides): + - `internal` (default) — Caddy `tls internal`, zero deps, instant; for the incident repro + and runs where certs aren't under test. + - `le-staging` — real DNS-01 ACME against Let's Encrypt **staging**: real caddy-gandi + path, real cert files/renewal, untrusted root, effectively no rate limits. **Built in v1.** + - `le-prod-wildcard` — a real trusted `*.test.wingu.me` wildcard, **issued once, + persisted on ubongo, reused** across runs. Wired in v1 but **on-demand only**; its + accepted risk is recorded when used (prod Gandi credential reaching an ephemeral VM; + transient TXT in the real `wingu.me` zone). A deliberate "no-egress" failure scenario + (to reproduce FRICTION #4) forces `internal`, since ACME needs egress. + +6. **The toolchain is Ansible-managed**, not hand-installed: a new non-service role + (`integration_test`, `control` group) installs/enables libvirt+QEMU reproducibly. The + repo owns ubongo's state. The driver manages *images* lazily on first run (keeps the role + lean; avoids fiddly download/refresh logic in Ansible). + +7. **Stubs live in an overlay file, never in the real inventory** — so `make tf-inventory` + and "don't edit inventory directly" stay intact, and every stub is explicit and reviewable. + +8. **A new ADR-025** records this decision (approach + alternatives + cert tiers); ADR-008 + gains a pointer and redirects its "what Molecule does NOT test" gaps here. + +## Architecture — five isolated components + +| # | Component | Purpose | Location | +|---|-----------|---------|----------| +| 1 | **`integration_test` role** (non-service, `control` group) | Install/enable libvirt+QEMU+virtinst, add `sjat`/`claude` to `libvirt` group, create the image-cache dir, drop the driver. 
Idempotent, Molecule-tested. | `roles/integration_test/` | +| 2 | **`integration-vm.py` driver** | Stdlib-only lifecycle over `virsh`: `up / apply / reboot / assert / cycle / reset / down / prune / console`. Lazily ensures the golden image (download + checksum). | `scripts/integration-vm.py` | +| 3 | **Profiles + var overlays** | Make a VM "become" a host: pull that host's real group_vars/host_vars + layer a small explicit overlay (cert tier, in-VM coordinator endpoint, VM connection). | `tests/integration/overrides/.yml` | +| 4 | **Verify playbook** | Outcome-based post-reboot assertions (Docker up, published-port DNAT alive, `nft` sane, service responds, `wt0` up), reusing Molecule's `verify.yml` philosophy. | `tests/integration/verify.yml` | +| 5 | **Makefile target** | `make test-integration HOST= [CERTS=...] [KEEP=1]` → `cycle`; `make test-integration-clean` → `prune`. Documented in CLAUDE.md's command table. | `Makefile` | + +## Lifecycle / data flow + +`make test-integration HOST=askari` drives: + +``` + 1. ensure golden image Debian-13 genericcloud qcow2, cached + checksum-verified + 2. ephemeral overlay qcow2 backed by golden (throwaway; never mutate golden) + 3. cloud-init NoCloud seed hostname + ansible user + ubongo's SSH key + NIC + 4. virt-install --import boot on an isolated libvirt NAT net (DHCP IP + outbound NAT) + 5. wait for SSH IP via `virsh domifaddr --source lease` (guest-agent optional) + 6. transient inventory askari's real vars + ansible_host= + stub overlay + 7. ansible-playbook site THE REAL APPLY (base + docker_host + reverse_proxy + coordinator) + 8. [snapshot post-apply] optional reset point for fast re-runs + 9. virsh reboot ──────────┐ ← the step Molecule structurally cannot do +10. wait for SSH ┘ +11. ansible-playbook verify outcome assertions; THIS is where the incident surfaces +12. 
report + teardown pass/fail; on fail keep VM + dump diagnostics; else destroy overlay +``` + +Steps 1–7 build a real Docker daemon with real published-port DNAT to break; step 9 is a +real kernel reboot, so nftables loads default-deny before Docker exactly as on askari. + +## Fidelity boundary & cert tiers + +**Faithful where the bug lives:** real kernel, real netfilter, real Docker with +published-port DNAT, the real role apply, a real reboot, and the coordinator running *inside +the VM* so the VM is its own mesh peer — reproducing the circular mesh-bootstrap (FRICTION #3) +on one box. + +**Stubbed where it needs the public internet** (explicit, in the overlay): LE certs via the +`--certs` knob (Decision 5); public DNS (`askari.wingu.me`) → local resolution; NetBird +geo-DB → pre-seeded or requirement disabled (which is *also* the FRICTION #4 fix, so the +harness can test both the FATAL-loop and its remedy). + +## Acceptance test (self-validating) + +1. Run the cycle on **today's** `base` (firewall on, no `docker_host` container-forward + drop-in) → **step 11 must FAIL after reboot** (Docker forwarding dead, services down). +2. Implement the `docker_host` container-forward rules (the pending fix STATUS.md names) → + re-run → **step 11 must PASS across the reboot.** + +**Scope boundary:** the *harness* is this plan's deliverable. The `docker_host` +container-forward fix is a separate work item (FRICTION 2026-06-17 #1). v1's acceptance +deliberately spans both, because a credible harness must demonstrate **both** a true-negative +(red on the broken state) and a true-positive (green on the fixed state) — otherwise we have +only ever watched the assert go red. The plan decides sequencing: build the small +`docker_host` drop-in as the green-half of acceptance, or consume it if built separately +first. Minimum credible v1 is the red half (faithful reproduction); full acceptance is red→green. 
+ +This one round-trip proves the harness reproduces the incident, the fix works, and the loop +can be trusted for the next risky change before it touches a live host. + +## Robustness, isolation & teardown + +**Failure leaves evidence** (catching a bug is the point): + +| Step fails | Behaviour | +|---|---| +| Golden image (1) | Fail fast, clear message; image cached (one-time cost) | +| Boot / first SSH (4–5) | **Capture serial console to a log file**, fail with its tail — the automated equivalent of the Hetzner console (ties to TODO 10.8) | +| Apply (7) | Keep VM, surface Ansible output, dump diagnostics | +| **No SSH after reboot (9–10)** | The classic incident signature; FAIL, keep VM, capture console — the harness *succeeding* | +| Assert (11) | FAIL, keep VM, dump post-mortem: `nft list ruleset`, `docker ps`, `ss -tlnp`, `journalctl -b`, `systemd-analyze critical-chain`; exit non-zero | + +Diagnostics land in gitignored `~/integration-runs/<name>-<timestamp>/` (same pattern as ADR-017's +screenshot dir; the agent reads them directly). + +**Three safety invariants** (these make the testing tool itself safe): +1. **The transient inventory contains only the test VM** — no real host is ever in scope; + the apply is `--limit`ed to the VM. +2. **"Be askari" points NetBird at the in-VM coordinator (localhost)** — the VM forms its + own one-node mesh; it never enrolls in the real mesh. +3. **Test VMs sit on an isolated libvirt NAT net** — outbound NAT for ACME/image pulls, but + not reachable to the LAN (`10.20.x`) or the real mesh. + +**Resource guard** (ubongo's 15 GiB ceiling, ADR-015/012): default VM ≈ 2 vCPU / 3 GiB / 20 +GiB thin overlay; the driver refuses to start below a free-RAM threshold and enforces **one +integration VM at a time** (name-prefix `boma-it-*`). **Teardown:** success destroys domain + +overlay; failure keeps them and prints how to inspect; `make test-integration-clean` reaps +all `boma-it-*` orphans.
An optional post-apply **snapshot** lets `reset` re-run +reboot+assert without re-applying (fast iteration on a fix). + +## Testing the tester + +- **pytest** on the driver's pure logic: transient-inventory generation, var/overlay merge, + `--certs`→overlay mapping, DHCP-lease parsing, resource-guard math (mock `virsh`). Joins + boma's existing pytest suite. +- **Molecule** (Docker) on the `integration_test` role: asserts libvirt/qemu/virtinst + installed, `libvirtd` enabled, users in `libvirt` group, driver present. (Cannot run + KVM-in-Docker — the documented Molecule limitation.) +- **End-to-end self-test = the acceptance test above**, run manually on first build and + recorded in the runbook. + +## Governance & documentation touch-points + +- **ADR-025 "Local VM integration testing"** — decision, approach A, rejected alternatives + (Proxmox-nested / Vagrant / TF-libvirt), cert tiers. +- **ADR-008** — pointer to ADR-025; redirect its "what Molecule does NOT test" gaps + (nftables loading, mesh dataplane) to this level. +- **ADR-015** — one-line reconciliation: "not a hypervisor" → runs *ephemeral KVM test VMs* + as part of its local-test-runner role (still not a production hypervisor); note the + test-VM RAM load. +- **`docs/security/accepted-risks.md`** — the `le-prod-wildcard` risk (prod Gandi credential + → ephemeral VM; transient TXT in real `wingu.me`). +- **CLAUDE.md** command table + **`docs/runbooks/integration-testing.md`** (run a cycle, + cert knobs, where diagnostics land, inspecting a kept failed VM, pruning) + **STATUS.md** + entry. The runbook's pre-flight line operationalises FRICTION #6 (*validate + reboot-recovery before retiring the break-glass*). + +## Capacity + +One VM (~3 GiB) against ~13 GiB free is comfortable. The only future pinch is concurrency +with the Level-4 Chromium/Playwright stack (ADR-017) — handled by the resource guard + +"one at a time." Add a note to `docs/hardware/reference.md`; revisit at `/capacity-review`. 
+ +## Alternatives considered + +- **Proxmox VE nested on ubongo** — highest fidelity incl. the provisioning step, but heavy + (nested virt, RAM), in tension with ADR-015, and the incident bugs don't live in + provisioning. Rejected. +- **Vagrant + vagrant-libvirt** — mature lifecycle/snapshots, but adds the Ruby/Vagrant + ecosystem + a fragile plugin, boxes drift from the real Debian cloud image, and the + reboot→assert sequence still needs custom logic. Rejected. +- **terraform-provider-libvirt** — declarative and reuses TF, but poor at the imperative + apply→reboot→re-apply test sequence, adds throwaway state, and blurs ADR-006's + "TF owns *production* VM existence on Proxmox" boundary. Rejected. + +## Open questions / deferred + +- **Multi-VM mini-staging** (inter-host mesh/dataplane) — design the driver + NAT net so a + topology is an additive extension; out of scope for v1. +- **Interplay with the Level-4 browser stack** — both want ubongo RAM; the resource guard is + the v1 answer, revisit when Level 4 is built. +- **Snapshot strategy depth** — v1 ships clone-and-destroy + an optional post-apply snapshot; + richer snapshot trees deferred. + +## Knowledge to verify at plan stage (ADR-014) + +These are from memory / unverified and must be confirmed against version-matched docs before +the plan asserts them: + +- Exact `virt-install --import` flags and the cloud-init **NoCloud** seed format on the + Debian-13 libvirt stack. +- Whether the Debian-13 genericcloud image ships `qemu-guest-agent` (IP can come from the + DHCP lease regardless — guest-agent is an optimisation, not a requirement). +- Let's Encrypt **rate limits** (prod vs staging) — to confirm "issue the wildcard once, + reuse" stays within limits. +- The `caddy-dns/gandi` DNS-01 configuration and pinned version already used by + `reverse_proxy`, and whether the Gandi LiveDNS API key can be scoped to `test.wingu.me`. +- libvirt default vs a dedicated isolated NAT network on Debian-13 (`virsh net-*`). 
diff --git a/inventories/production/group_vars/control/vars.yml b/inventories/production/group_vars/control/vars.yml index b4e64ee..6f06074 100644 --- a/inventories/production/group_vars/control/vars.yml +++ b/inventories/production/group_vars/control/vars.yml @@ -12,6 +12,9 @@ dev_env__users: # group only. ansible_user: sjat +# ubongo's AI-worker; passwordless sudo for the claude user (ADR-015 amended). +base__ai_worker_user: claude + # ubongo is a NetBird mesh peer (ADR-016, M5) — enrol the agent via base's `mesh` concern. # Enrollment only; the host firewall default-deny stays deferred (the mesh-hardening # follow-on), so this brings up wt0 without changing SSH exposure. diff --git a/playbooks/workstation.yml b/playbooks/workstation.yml index 071b7e0..0d0d6d2 100644 --- a/playbooks/workstation.yml +++ b/playbooks/workstation.yml @@ -8,3 +8,5 @@ roles: - role: dev_env tags: [dev_env] + - role: integration_test + tags: [integration_test] diff --git a/roles/base/defaults/main.yml b/roles/base/defaults/main.yml index 6073a1a..301dee7 100644 --- a/roles/base/defaults/main.yml +++ b/roles/base/defaults/main.yml @@ -29,6 +29,11 @@ base__ssh_authorised_keys: [] base__ssh_listen_mesh_only: false base__ssh_listen_addr: "" +# The automation/AI-worker user granted passwordless sudo (ADR-015 amended / ADR-021). +# Empty = no AI-worker sudo. Set per-group (e.g. group_vars/control: claude). The user's +# password should be locked so NOPASSWD is its only sudo path; actions are auditd-attributed. +base__ai_worker_user: "" + # NetBird mesh agent enrollment (ADR-016). Opt-in: default off so applying `base` to a # host not on the mesh is a no-op for this concern. 
The live actions (apt install over # the network, `netbird up` against the coordinator) are additionally gated by diff --git a/roles/base/tasks/main.yml b/roles/base/tasks/main.yml index b022586..b40e305 100644 --- a/roles/base/tasks/main.yml +++ b/roles/base/tasks/main.yml @@ -23,6 +23,13 @@ tags: [hardening] tags: [hardening] +- name: AI-worker operational access (sudoers drop-in) + ansible.builtin.include_tasks: + file: operational_access.yml + apply: + tags: [users] + tags: [users] + - name: NetBird mesh enrollment ansible.builtin.include_tasks: file: mesh.yml diff --git a/roles/base/tasks/operational_access.yml b/roles/base/tasks/operational_access.yml new file mode 100644 index 0000000..3a27a18 --- /dev/null +++ b/roles/base/tasks/operational_access.yml @@ -0,0 +1,11 @@ +--- +- name: Grant the AI-worker user passwordless sudo (ADR-015 amended / ADR-021) + ansible.builtin.copy: + content: "{{ base__ai_worker_user }} ALL=(ALL) NOPASSWD:ALL\n" + dest: "/etc/sudoers.d/{{ base__ai_worker_user }}-ai-worker" + owner: root + group: root + mode: "0440" + validate: "visudo -cf %s" + when: base__ai_worker_user | length > 0 + tags: [users] diff --git a/roles/docker_host/defaults/main.yml b/roles/docker_host/defaults/main.yml index 0e22ea5..50a84a6 100644 --- a/roles/docker_host/defaults/main.yml +++ b/roles/docker_host/defaults/main.yml @@ -1,8 +1,16 @@ --- -# Docker engine install (ADR-004). Cluster-specific daemon hardening + nftables.d -# integration are deferred to when the cluster + host firewall exist. +# Docker engine install (ADR-004). Cluster-specific daemon hardening is deferred to when +# the cluster exists. docker_host__packages: - docker-ce - docker-ce-cli - containerd.io - docker-compose-plugin + +# Container-forward nftables drop-in (FRICTION 2026-06-17 #1 / ADR-025). base's inet-filter +# forward chain is `policy drop`; on a Docker host that kills published-port DNAT + inter- +# container forwarding ON REBOOT (nftables loads default-deny before dockerd). 
This drop-in +# (loaded via base's /etc/nftables.d/*.nft include) appends the accepts so a rebooted Docker +# host keeps forwarding. Only meaningful where base__firewall_apply is true. +docker_host__forward_dropin: true +docker_host__nftables_dropin_dir: /etc/nftables.d # must match base__firewall_dropin_dir diff --git a/roles/docker_host/tasks/main.yml b/roles/docker_host/tasks/main.yml index 44f04f5..a91d333 100644 --- a/roles/docker_host/tasks/main.yml +++ b/roles/docker_host/tasks/main.yml @@ -37,3 +37,22 @@ state: present update_cache: true tags: [packages] + +- name: Ensure the nftables drop-in dir exists (for the container-forward rules) + ansible.builtin.file: + path: "{{ docker_host__nftables_dropin_dir }}" + state: directory + mode: "0755" + when: docker_host__forward_dropin | bool + tags: [firewall] + +- name: Install the container-forward nftables drop-in (reboot-safe Docker forwarding) + ansible.builtin.template: + src: 10-docker-forward.nft.j2 + dest: "{{ docker_host__nftables_dropin_dir }}/10-docker-forward.nft" + mode: "0644" + when: docker_host__forward_dropin | bool + # Not reloaded here: a running host already forwards via Docker's runtime rules, so the + # drop-in only needs to protect the NEXT boot (loaded by nftables.service). Reloading nft + # now would flush Docker's NAT (FRICTION 2026-06-17 #4); the boot loads it cleanly. + tags: [firewall] diff --git a/roles/docker_host/templates/10-docker-forward.nft.j2 b/roles/docker_host/templates/10-docker-forward.nft.j2 new file mode 100644 index 0000000..9190cea --- /dev/null +++ b/roles/docker_host/templates/10-docker-forward.nft.j2 @@ -0,0 +1,14 @@ +# {{ ansible_managed }} +# Allow container forwarding through base's default-deny forward chain (ADR-025 / FRICTION +# 2026-06-17 #1). 
Appended to base's `table inet filter` / `chain forward` via the +# /etc/nftables.d/*.nft include, and loaded by nftables.service at boot — exactly when the +# bug bit (default-deny forward loading before dockerd on reboot). +table inet filter { + chain forward { + ct state established,related accept + iifname "docker0" accept + oifname "docker0" accept + iifname "br-*" accept + oifname "br-*" accept + } +} diff --git a/roles/integration_test/README.md b/roles/integration_test/README.md new file mode 100644 index 0000000..22609e2 --- /dev/null +++ b/roles/integration_test/README.md @@ -0,0 +1,35 @@ +# integration_test + +Installs the KVM/libvirt substrate on the control node (`ubongo`) so the agent +can boot throwaway Debian VMs for local integration testing (ADR-025). + +This is a **non-service** role — no SECURITY/VERIFY/ACCESS/BACKUP files are +required. It does **not** make ubongo a production hypervisor; it only provides +the tooling needed to spin up short-lived test VMs (see ADR-015). + +## Target group + +`control` (i.e. `ubongo`) + +## What it does + +1. Installs QEMU/KVM, libvirt daemon + clients, `virt-install`, and + cloud-image tools (`cloud-image-utils`, `genisoimage`). +2. Enables and starts `libvirtd`. +3. Adds the configured users (`sjat`, `claude`) to the `libvirt` and `kvm` + groups so VMs can be managed without `sudo`. +4. Creates `/var/lib/boma-integration` (owned `root:libvirt`, mode `2775`) as + the cache directory for golden images and overlays. 
+ +## Defaults + +| Variable | Default | Purpose | +|-------------------------------|-------------------------------|----------------------------------| +| `integration_test__packages` | see `defaults/main.yml` | APT packages to install | +| `integration_test__users` | `[sjat, claude]` | Users granted libvirt/kvm access | +| `integration_test__cache_dir` | `/var/lib/boma-integration` | Image/overlay cache directory | + +## Related decisions + +- [ADR-025](../../docs/decisions/025-local-vm-integration-testing.md) — local VM integration testing +- [ADR-015](../../docs/decisions/015-control-host.md) — control host scope (ubongo is not a hypervisor) diff --git a/roles/integration_test/defaults/main.yml b/roles/integration_test/defaults/main.yml new file mode 100644 index 0000000..932f99e --- /dev/null +++ b/roles/integration_test/defaults/main.yml @@ -0,0 +1,18 @@ +--- +# integration_test — installs the local KVM/libvirt substrate on the control node +# (ubongo) so the agent can run throwaway VM integration tests (ADR-025). Non-service +# role; applied to the `control` group. Not a production hypervisor (ADR-015). +integration_test__packages: + - qemu-system-x86 # KVM + - qemu-utils # qemu-img (overlays) + - libvirt-daemon-system + - libvirt-clients # virsh + - virt-install # virt-install (trixie: the real pkg; `virtinst` is transitional) + - cloud-image-utils # cloud-localds (NoCloud seed) + - genisoimage # cloud-localds fallback +# Users granted libvirt/kvm access (run VMs without sudo). +integration_test__users: + - sjat + - claude +# Where the golden image + overlays live (outside the repo). 
+integration_test__cache_dir: "/var/lib/boma-integration" diff --git a/roles/integration_test/handlers/main.yml b/roles/integration_test/handlers/main.yml new file mode 100644 index 0000000..ed97d53 --- /dev/null +++ b/roles/integration_test/handlers/main.yml @@ -0,0 +1 @@ +--- diff --git a/roles/integration_test/meta/main.yml b/roles/integration_test/meta/main.yml new file mode 100644 index 0000000..ee87dcf --- /dev/null +++ b/roles/integration_test/meta/main.yml @@ -0,0 +1,14 @@ +--- +galaxy_info: + author: sjat + description: >- + Installs the KVM/libvirt substrate on the control node (ubongo) to enable + local VM integration testing (ADR-025). Non-service role; not a production + hypervisor (ADR-015). + license: MIT + min_ansible_version: "2.17" + platforms: + - name: Debian + versions: + - trixie +dependencies: [] diff --git a/roles/integration_test/molecule/default/converge.yml b/roles/integration_test/molecule/default/converge.yml new file mode 100644 index 0000000..f26090b --- /dev/null +++ b/roles/integration_test/molecule/default/converge.yml @@ -0,0 +1,7 @@ +--- +- name: Converge + hosts: all + become: true + gather_facts: true + roles: + - role: integration_test diff --git a/roles/integration_test/molecule/default/molecule.yml b/roles/integration_test/molecule/default/molecule.yml new file mode 100644 index 0000000..b23d8da --- /dev/null +++ b/roles/integration_test/molecule/default/molecule.yml @@ -0,0 +1,31 @@ +--- +dependency: + name: galaxy + options: + requirements-file: ../../requirements.yml + +driver: + name: docker + +platforms: + - name: instance + # Project-owned image built from .docker/molecule-debian13/Dockerfile + # and hosted in the Forgejo container registry. 
+ # Build/push with: make molecule-image / make molecule-image-push + image: forgejo.nyumbani.baobab.band/sjat/molecule-debian13:latest + pre_build_image: true + privileged: true # required for systemd + cgroupns_mode: host + volumes: + - /sys/fs/cgroup:/sys/fs/cgroup:rw + command: /lib/systemd/systemd + +provisioner: + name: ansible + inventory: + host_vars: + instance: + ansible_user: root + +verifier: + name: ansible diff --git a/roles/integration_test/molecule/default/verify.yml b/roles/integration_test/molecule/default/verify.yml new file mode 100644 index 0000000..233243b --- /dev/null +++ b/roles/integration_test/molecule/default/verify.yml @@ -0,0 +1,25 @@ +--- +- name: Verify + hosts: all + become: true + gather_facts: false + tasks: + - name: Gather package facts + ansible.builtin.package_facts: + - name: Assert the substrate packages are installed + ansible.builtin.assert: + that: + - "'qemu-system-x86' in ansible_facts.packages" + - "'qemu-utils' in ansible_facts.packages" + - "'libvirt-daemon-system' in ansible_facts.packages" + - "'libvirt-clients' in ansible_facts.packages" + - "'virt-install' in ansible_facts.packages" + - "'cloud-image-utils' in ansible_facts.packages" + - "'genisoimage' in ansible_facts.packages" + - name: Cache dir exists + ansible.builtin.stat: + path: /var/lib/boma-integration + register: _cache + - name: Assert cache dir + ansible.builtin.assert: + that: [_cache.stat.isdir] diff --git a/roles/integration_test/tasks/main.yml b/roles/integration_test/tasks/main.yml new file mode 100644 index 0000000..4a1e2c7 --- /dev/null +++ b/roles/integration_test/tasks/main.yml @@ -0,0 +1,32 @@ +--- +- name: Install the KVM/libvirt substrate + ansible.builtin.apt: + name: "{{ integration_test__packages }}" + state: present + update_cache: true + cache_valid_time: 3600 + tags: [packages] + +- name: Enable and start libvirtd + ansible.builtin.systemd: + name: libvirtd + enabled: true + state: started + tags: [config] + +- name: Grant users 
libvirt + kvm access + ansible.builtin.user: + name: "{{ item }}" + groups: [libvirt, kvm] + append: true + loop: "{{ integration_test__users }}" + tags: [users] + +- name: Create the integration cache dir + ansible.builtin.file: + path: "{{ integration_test__cache_dir }}" + state: directory + owner: root + group: libvirt + mode: "2775" + tags: [config] diff --git a/roles/reverse_proxy/defaults/main.yml b/roles/reverse_proxy/defaults/main.yml index 46249a9..fb6b92f 100644 --- a/roles/reverse_proxy/defaults/main.yml +++ b/roles/reverse_proxy/defaults/main.yml @@ -35,3 +35,7 @@ access__api: # noqa: var-naming[no-role-prefix] # DNS-01; no manual steps). Residual risk: Let's Encrypt rate limits on rapid re-issuance. backup__service: reverse_proxy # noqa: var-naming[no-role-prefix] backup__state: false # noqa: var-naming[no-role-prefix] + +# Integration-test / staging cert knobs (ADR-025). Default off = production behaviour. +reverse_proxy__tls_internal: false # true => every site uses Caddy's self-signed CA +reverse_proxy__acme_ca: "" # set to the LE staging directory URL to use staging diff --git a/roles/reverse_proxy/templates/Caddyfile.j2 b/roles/reverse_proxy/templates/Caddyfile.j2 index 0c0a280..c9da8f5 100644 --- a/roles/reverse_proxy/templates/Caddyfile.j2 +++ b/roles/reverse_proxy/templates/Caddyfile.j2 @@ -1,6 +1,9 @@ # {{ ansible_managed }} { email {{ reverse_proxy__acme_email }} +{% if reverse_proxy__acme_ca %} + acme_ca {{ reverse_proxy__acme_ca }} +{% endif %} {% if reverse_proxy__acme_dns_provider == 'gandi' %} # ACME DNS-01 via Gandi (mesh/LAN-only hosts, incl. wildcard certs). Token is the # Gandi PAT, injected from the env file as a Bearer token (ADR-024). 
Needs the custom @@ -10,6 +13,9 @@ } {% for r in reverse_proxy__routes %} {{ r['host'] }} { +{% if reverse_proxy__tls_internal %} + tls internal +{% endif %} {% if r['caddy'] is defined %} {{ r['caddy'] | trim | indent(2, first=true) }} {% elif r['upstream'] is defined %} diff --git a/scripts/integration-vm.py b/scripts/integration-vm.py new file mode 100644 index 0000000..b5ec90e --- /dev/null +++ b/scripts/integration-vm.py @@ -0,0 +1,436 @@ +#!/usr/bin/env python3 +"""boma local-VM integration test harness driver (ADR-025). + +Stdlib-only by convention (TODO-14): never imports a YAML library. The transient +inventory is emitted via string templates; stubs/cert-tiers reach Ansible as +`-e @` extra-vars; profile metadata is JSON. Talks to libvirt via `virsh`. +""" +import argparse +import hashlib +import json +import os +import pathlib +import re +import subprocess +import sys +import time +import urllib.request +import uuid + +REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent +CACHE_DIR = pathlib.Path(os.environ.get("BOMA_IT_CACHE", "/var/lib/boma-integration")) +IMAGE_URL = "https://cloud.debian.org/images/cloud/trixie/latest/debian-13-genericcloud-amd64.qcow2" +SHA_URL = "https://cloud.debian.org/images/cloud/trixie/latest/SHA512SUMS" +IMAGE_NAME = "debian-13-genericcloud-amd64.qcow2" +NET_NAME = "boma-it" +NET_XML = """ + boma-it + + + + + + +""" +NAME_PREFIX = "boma-it-" +RUN_DIR = REPO_ROOT / "tests" / "integration" / ".run" +DIAG_ROOT = pathlib.Path.home() / "integration-runs" +PROFILE_DIR = REPO_ROOT / "tests" / "integration" / "profiles" +INTEG_DIR = REPO_ROOT / "tests" / "integration" +CERT_DIR = REPO_ROOT / "tests" / "integration" / "certs" +DEFAULT_MEM_MIB = 3072 +DEFAULT_VCPUS = 2 +MIN_FREE_MIB = 4096 +VALID_TIERS = ("internal", "le-staging", "le-prod-wildcard") + +# Target the SYSTEM libvirtd — where the substrate, /dev/kvm, and the NAT network live. +# Without this, a non-root caller's bare virsh/virt-install default to qemu:///session. 
os.environ.setdefault("LIBVIRT_DEFAULT_URI", "qemu:///system")


def vm_name(host, suffix=None):
    """Return the libvirt domain name ``<NAME_PREFIX><host>-<suffix>``.

    When *suffix* is falsy, the first 8 hex characters of a random UUID are used.
    """
    suffix = suffix or uuid.uuid4().hex[:8]
    return f"{NAME_PREFIX}{host}-{suffix}"


def free_mib(meminfo_text):
    """Return the ``MemAvailable`` value of /proc/meminfo-style text in MiB (0 if absent)."""
    m = re.search(r"^MemAvailable:\s+(\d+)\s+kB", meminfo_text, re.MULTILINE)
    return int(m.group(1)) // 1024 if m else 0


def parse_lease_ip(domifaddr_output):
    """Return the first IPv4 address in ``virsh domifaddr`` output, or None."""
    m = re.search(r"ipv4\s+(\d+\.\d+\.\d+\.\d+)", domifaddr_output)
    return m.group(1) if m else None


def render_meta_data(instance_id, hostname):
    """Render the NoCloud ``meta-data`` document (instance-id + local-hostname)."""
    return f"instance-id: {instance_id}\nlocal-hostname: {hostname}\n"


def render_user_data(ssh_pubkey, ansible_user):
    """Render the ``#cloud-config`` user-data: one passwordless-sudo user with one SSH key."""
    # NOTE(review): the leading spaces inside these literals were reconstructed (the
    # reviewed text had its whitespace collapsed) — confirm against the real file.
    return (
        "#cloud-config\n"
        "users:\n"
        f"  - name: {ansible_user}\n"
        "    sudo: 'ALL=(ALL) NOPASSWD:ALL'\n"
        "    shell: /bin/bash\n"
        "    ssh_authorized_keys:\n"
        f"      - {ssh_pubkey}\n"
        "ssh_pwauth: false\n"
        "package_update: true\n"
    )


def cert_file(tier):
    """Return the extra-vars file for a cert tier; raise ValueError for an unknown tier."""
    if tier not in VALID_TIERS:
        raise ValueError(f"unknown cert tier: {tier}")
    return CERT_DIR / f"{tier}.yml"


def profile_path(host):
    """Return the path of the JSON profile for *host*."""
    return PROFILE_DIR / f"{host}.json"


def render_run_hosts(name, ip, ansible_user, groups):
    """Render the transient YAML inventory: the single test VM in each of *groups*.

    Duplicate group names are emitted once, in first-seen order.
    """
    # NOTE(review): YAML nesting depth inside these literals was reconstructed (the
    # reviewed text had its whitespace collapsed) — confirm against the real file.
    lines = [
        "---",
        "# Generated by scripts/integration-vm.py — transient, gitignored. Do not edit.",
        "# Single test host ONLY (safety invariant: no real host is ever in scope).",
        "all:",
        "  children:",
    ]
    for g in dict.fromkeys(groups):
        lines += [
            f"    {g}:",
            "      hosts:",
            f"        {name}:",
            f"          ansible_host: {ip}",
            f"          ansible_user: {ansible_user}",
        ]
    return "\n".join(lines) + "\n"


def sh(cmd, check=True, capture=False, **kw):
    """Run a command (list form). Logs the command to stderr."""
    print("+ " + " ".join(str(c) for c in cmd), file=sys.stderr)
    return subprocess.run(cmd, check=check,
                          capture_output=capture, text=True, **kw)


def _ssh_argv(user, ip, remote_cmd, connect_timeout=None):
    """Build the ssh argv used for every connection to the throwaway VM.

    Host-key checking is disabled and no known_hosts file is written. A
    ``ConnectTimeout`` option is added only when *connect_timeout* is given.
    """
    argv = ["ssh", "-o", "StrictHostKeyChecking=no",
            "-o", "UserKnownHostsFile=/dev/null"]
    if connect_timeout is not None:
        argv += ["-o", f"ConnectTimeout={connect_timeout}"]
    return argv + [f"{user}@{ip}", remote_cmd]


def _expected_sha(sha_text, filename):
    """Return the checksum listed for *filename* in SHA512SUMS-style text, or None."""
    for line in sha_text.splitlines():
        parts = line.split()
        if len(parts) == 2 and parts[1].lstrip("*") == filename:
            return parts[0]
    return None


def ensure_image():
    """Return the cached golden image path, downloading and SHA512-verifying it if absent.

    Raises SystemExit when the checksum is not listed or does not match; the partial
    download is removed in both cases.
    """
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    img = CACHE_DIR / IMAGE_NAME
    if img.exists():
        return img
    print(f"Downloading {IMAGE_URL} ...", file=sys.stderr)
    tmp = img.with_suffix(".part")
    urllib.request.urlretrieve(IMAGE_URL, tmp)
    # Close the HTTP response deterministically instead of leaving it to the GC.
    with urllib.request.urlopen(SHA_URL) as resp:
        sha_text = resp.read().decode()
    want = _expected_sha(sha_text, IMAGE_NAME)
    if not want:
        tmp.unlink(missing_ok=True)
        raise SystemExit(f"checksum for {IMAGE_NAME} not found at {SHA_URL}")
    h = hashlib.sha512()
    with open(tmp, "rb") as fh:
        for chunk in iter(lambda: fh.read(1 << 20), b""):
            h.update(chunk)
    if h.hexdigest() != want:
        tmp.unlink(missing_ok=True)
        raise SystemExit("golden image SHA512 mismatch — refusing to use it")
    tmp.rename(img)
    return img


def net_ensure():
    """Define (if unknown to libvirt) and start (if inactive) the test network."""
    r = sh(["virsh", "net-info", NET_NAME], check=False, capture=True)
    if r.returncode != 0:
        xml = RUN_DIR / "net.xml"
        RUN_DIR.mkdir(parents=True, exist_ok=True)
        xml.write_text(NET_XML)
        sh(["virsh", "net-define", str(xml)])
        # NOTE(review): assumed to sit inside this branch; the reviewed text had lost
        # its indentation — confirm against the real file.
        sh(["virsh", "net-autostart", NET_NAME])
    active = sh(["virsh", "net-info", NET_NAME], capture=True).stdout
    if not re.search(r"Active:\s+yes", active):
        sh(["virsh", "net-start", NET_NAME])


def _ssh_pubkey():
    """Return the caller's first available public key (ed25519, then rsa) from ~/.ssh."""
    for cand in ("id_ed25519.pub", "id_rsa.pub"):
        p = pathlib.Path.home() / ".ssh" / cand
        if p.exists():
            return p.read_text().strip()
    raise SystemExit("no SSH public key found in ~/.ssh")


def _profile_resources(host):
    """Return ``(mem_mib, vcpus)`` for *host*.

    Values come from the host's JSON profile when it exists and declares them,
    otherwise from DEFAULT_MEM_MIB / DEFAULT_VCPUS.
    """
    path = profile_path(host)
    prof = json.loads(path.read_text()) if path.exists() else {}
    return (int(prof.get("mem_mib", DEFAULT_MEM_MIB)),
            int(prof.get("vcpus", DEFAULT_VCPUS)))


def up(host, name=None, mem_mib=None, vcpus=None):
    """Boot a fresh throwaway VM for *host* and wait until it is reachable over SSH.

    Args:
        host: profile/host name the VM stands in for.
        name: libvirt domain name; generated with vm_name() when omitted.
        mem_mib: VM memory in MiB; when None, taken from the host profile's
            ``mem_mib`` or DEFAULT_MEM_MIB.
        vcpus: VM vCPU count; when None, taken from the host profile's ``vcpus``
            or DEFAULT_VCPUS.

    Returns:
        ``(name, ip)`` of the running VM. Also records name/ip/host in RUN_DIR/current.

    Raises:
        SystemExit: too little free memory, another integration VM already running,
            or a timeout waiting for the DHCP lease / SSH.
    """
    free = free_mib(pathlib.Path("/proc/meminfo").read_text())
    if free < MIN_FREE_MIB:
        raise SystemExit(f"refusing to start: only {free} MiB free (< {MIN_FREE_MIB})")
    running = sh(["virsh", "list", "--name"], capture=True).stdout.split()
    if any(n.startswith(NAME_PREFIX) for n in running):
        raise SystemExit("an integration VM is already running (one at a time); "
                         "run `integration-vm prune` first")
    # Honour the profile's declared sizing unless the caller overrides it explicitly.
    if mem_mib is None or vcpus is None:
        prof_mem, prof_vcpus = _profile_resources(host)
        mem_mib = prof_mem if mem_mib is None else mem_mib
        vcpus = prof_vcpus if vcpus is None else vcpus
    name = name or vm_name(host)
    img = ensure_image()
    net_ensure()
    RUN_DIR.mkdir(parents=True, exist_ok=True)
    # VM disk/seed/console must live where the SYSTEM hypervisor (libvirt-qemu) can reach
    # them — NOT under the repo/home (qemu cannot traverse /home/claude). CACHE_DIR is
    # group-libvirt + world-traversable (created by the integration_test role).
    overlay = CACHE_DIR / f"{name}.qcow2"
    sh(["qemu-img", "create", "-f", "qcow2", "-F", "qcow2", "-b", str(img), str(overlay)])
    (RUN_DIR / "user-data").write_text(render_user_data(_ssh_pubkey(), "ansible"))
    (RUN_DIR / "meta-data").write_text(render_meta_data(f"iid-{name}", name))
    seed = CACHE_DIR / f"{name}-seed.img"
    # Force DHCP on the VM NIC — don't rely on the genericcloud image's network fallback.
    # NOTE(review): YAML indentation inside these literals was reconstructed — confirm.
    (RUN_DIR / "network-config").write_text(
        'version: 2\n'
        'ethernets:\n'
        '  primary:\n'
        '    match:\n'
        '      name: "en*"\n'
        '    dhcp4: true\n')
    sh(["cloud-localds", "--network-config", str(RUN_DIR / "network-config"),
        str(seed), str(RUN_DIR / "user-data"), str(RUN_DIR / "meta-data")])
    console_log = CACHE_DIR / f"{name}-console.log"
    sh(["virt-install", "--name", name, "--memory", str(mem_mib), "--vcpus", str(vcpus),
        "--boot", "uefi",  # genericcloud triple-faults on legacy BIOS handoff; UEFI boots
        "--import",
        "--disk", f"path={overlay},format=qcow2",
        "--disk", f"path={seed},device=cdrom",
        "--network", f"network={NET_NAME}",
        "--osinfo", "debian13",
        "--graphics", "none",
        "--serial", f"file,path={console_log}",
        "--noautoconsole"])
    ip = wait_for_ip(name)
    wait_for_ssh(ip, "ansible")
    # Block until cloud-init finishes (incl. apt-get update) so apply sees a ready system.
    # Best-effort: a non-zero exit here is deliberately not fatal.
    sh(_ssh_argv("ansible", ip, "sudo cloud-init status --wait"), check=False)
    (RUN_DIR / "current").write_text(f"{name}\n{ip}\n{host}\n")
    print(f"VM {name} up at {ip}")
    return name, ip


def wait_for_ip(name, timeout=120):
    """Poll ``virsh domifaddr`` every 4 s until the VM has a DHCP lease; return its IPv4.

    Raises SystemExit after *timeout* seconds.
    """
    end = time.time() + timeout
    while time.time() < end:
        out = sh(["virsh", "domifaddr", name, "--source", "lease"],
                 check=False, capture=True).stdout
        ip = parse_lease_ip(out)
        if ip:
            return ip
        time.sleep(4)
    raise SystemExit(f"timed out waiting for {name} to get a DHCP lease — "
                     "VM left defined; run `integration-vm prune` to remove it")


def wait_for_ssh(ip, user, timeout=180):
    """Poll every 5 s until ``ssh user@ip true`` succeeds; SystemExit after *timeout* s."""
    end = time.time() + timeout
    while time.time() < end:
        r = sh(_ssh_argv(user, ip, "true", connect_timeout=5),
               check=False, capture=True)
        if r.returncode == 0:
            return
        time.sleep(5)
    raise SystemExit(f"timed out waiting for SSH to {ip} — "
                     "VM left defined; run `integration-vm prune` to remove it")


def _read_current():
    """Return ``(name, ip, host)`` of the VM recorded by up() in RUN_DIR/current.

    Raises SystemExit with an actionable message when no VM is recorded or the
    record is incomplete, instead of surfacing a raw traceback.
    """
    cur = RUN_DIR / "current"
    try:
        fields = cur.read_text().splitlines()
    except FileNotFoundError:
        raise SystemExit(f"no current VM recorded at {cur}; "
                         "run `integration-vm up` first") from None
    if len(fields) < 3:
        raise SystemExit(f"{cur} is incomplete (expected name, ip and host lines); "
                         "run `integration-vm prune` and start over")
    return fields[0], fields[1], fields[2]  # name, ip, host


def write_run_inventory(name, ip, groups):
    """Write the transient inventory and (re)link its group_vars to production's.

    Raises SystemExit if RUN_DIR/group_vars exists and is not a symlink.
    """
    RUN_DIR.mkdir(parents=True, exist_ok=True)
    (RUN_DIR / "hosts.yml").write_text(
        render_run_hosts(name, ip, "ansible", groups))
    link = RUN_DIR / "group_vars"
    target = REPO_ROOT / "inventories" / "production" / "group_vars"
    if link.is_symlink():
        link.unlink()
    elif link.exists():
        raise SystemExit(f"{link} exists and is not a symlink; remove it manually")
    link.symlink_to(target)


def _extra_vars(prof, certs):
    """Return the ``-e @file`` arguments: the profile's overlays, then the cert tier."""
    extra = []
    for f in prof.get("extra_vars_files", []):
        extra += ["-e", f"@{INTEG_DIR / f}"]
    extra += ["-e", f"@{cert_file(certs)}"]
    return extra


def apply(host, certs):
    """Run each playbook step of *host*'s profile against the current VM only."""
    name, ip, _ = _read_current()
    prof = json.loads(profile_path(host).read_text())
    write_run_inventory(name, ip, prof["groups"])
    extra = _extra_vars(prof, certs)
    for step in prof["applies"]:
        cmd = [".venv/bin/ansible-playbook", "-i", str(RUN_DIR / "hosts.yml"),
               f"playbooks/{step['playbook']}", "--limit", name]
        if step.get("tags"):
            cmd += ["--tags", ",".join(step["tags"])]
        cmd += extra
        sh(cmd, cwd=str(REPO_ROOT))
    print(f"applied {host} profile to {name}")


def _boot_id(ip, user):
    """Return the VM's kernel boot_id read over SSH, or None if the SSH command fails."""
    r = sh(_ssh_argv(user, ip, "cat /proc/sys/kernel/random/boot_id", connect_timeout=5),
           check=False, capture=True)
    return r.stdout.strip() if r.returncode == 0 else None


def wait_for_reboot(ip, user, before_boot_id, timeout=240):
    """Confirm a REAL reboot: SSH back up AND boot_id changed (not the pre-reboot sshd).

    Raises SystemExit when *before_boot_id* is empty/None (nothing to compare against,
    so any readable boot_id would falsely count as "changed") or on timeout.
    """
    if not before_boot_id:
        raise SystemExit(f"cannot confirm a reboot of {ip}: "
                         "no pre-reboot boot_id to compare against")
    end = time.time() + timeout
    while time.time() < end:
        bid = _boot_id(ip, user)
        if bid and bid != before_boot_id:
            return
        time.sleep(5)
    raise SystemExit(f"timed out waiting for {ip} to reboot (boot_id unchanged) — "
                     "VM left defined; run `integration-vm prune` to remove it")


def reboot_vm():
    """Reboot the current VM via ``virsh reboot`` and wait for a changed boot_id.

    Refuses (SystemExit) before issuing the reboot when the current boot_id cannot
    be read, because the reboot could then not be verified.
    """
    name, ip, _ = _read_current()
    before = _boot_id(ip, "ansible")
    if not before:
        raise SystemExit(f"cannot read boot_id from {ip} before rebooting {name} — "
                         "refusing to reboot (it could not be verified)")
    sh(["virsh", "reboot", name])
    wait_for_reboot(ip, "ansible", before)
    print(f"{name} rebooted (boot_id changed), SSH back at {ip}")


def run_assert(host, certs):
    """Run tests/integration/verify.yml against the current VM.

    On failure, collects diagnostics and raises SystemExit.
    """
    name, ip, _ = _read_current()
    prof = json.loads(profile_path(host).read_text())
    write_run_inventory(name, ip, prof["groups"])
    cmd = [".venv/bin/ansible-playbook", "-i", str(RUN_DIR / "hosts.yml"),
           "tests/integration/verify.yml", "--limit", name] + _extra_vars(prof, certs)
    r = sh(cmd, cwd=str(REPO_ROOT), check=False)
    if r.returncode != 0:
        dump_diagnostics(name, ip)
        raise SystemExit(f"VERIFY FAILED for {name} — diagnostics in {DIAG_ROOT}")
    print(f"VERIFY PASSED for {name}")


def dump_diagnostics(name, ip):
    """Save firewall/docker/socket/journal/boot-chain output and the serial log.

    Everything is written under DIAG_ROOT/<name>; each remote command is best-effort.
    """
    d = DIAG_ROOT / name
    d.mkdir(parents=True, exist_ok=True)
    for label, cmd in [
        ("nft", "nft list ruleset"),
        ("docker", "docker ps -a"),
        ("ss", "ss -tlnp"),
        ("journal", "journalctl -b --no-pager"),
        ("critical-chain", "systemd-analyze critical-chain"),
    ]:
        r = sh(_ssh_argv("ansible", ip, "sudo " + cmd), check=False, capture=True)
        (d / f"{label}.txt").write_text((r.stdout or "") + (r.stderr or ""))
    console_log = CACHE_DIR / f"{name}-console.log"
    if console_log.exists():
        # The serial log is root:0600 (libvirt-created); read it via sudo (ADR-015: the
        # claude worker has sudo) and write a worker-owned copy into the bundle.
        r = sh(["sudo", "cat", str(console_log)], check=False, capture=True)
        (d / "console.log").write_text(r.stdout or "")
    print(f"diagnostics written to {d}", file=sys.stderr)


def _destroy(name):
    """Force off and undefine the domain, then delete its files in RUN_DIR and CACHE_DIR."""
    sh(["virsh", "destroy", name], check=False)
    sh(["virsh", "undefine", name, "--nvram"], check=False)
    for base in (RUN_DIR, CACHE_DIR):
        for f in base.glob(f"{name}*"):
            f.unlink(missing_ok=True)


def down(host=None, keep=False):
    """Destroy the VM recorded in RUN_DIR/current, unless *keep* is set.

    *host* is accepted for CLI symmetry and is not used.
    """
    if keep:
        print("--keep: leaving the VM running for inspection")
        return
    cur = RUN_DIR / "current"
    if cur.exists():
        name = cur.read_text().splitlines()[0]
        _destroy(name)
        cur.unlink(missing_ok=True)
        print(f"destroyed {name}")


def prune():
    """Destroy every libvirt domain (running or not) whose name starts with NAME_PREFIX."""
    domains = sh(["virsh", "list", "--all", "--name"], capture=True).stdout.split()
    for n in domains:
        if n.startswith(NAME_PREFIX):
            _destroy(n)
            print(f"pruned {n}")
    (RUN_DIR / "current").unlink(missing_ok=True)


def console():
    """Print the current VM's serial console log (read via sudo), if one exists."""
    name = _read_current()[0]
    log = CACHE_DIR / f"{name}-console.log"
    if log.exists():
        print(sh(["sudo", "cat", str(log)], check=False, capture=True).stdout or "")
    else:
        print(f"no console log at {log}")


def cycle(host, certs, keep=False, no_reboot=False):
    """Full run: up -> apply -> (reboot) -> assert, then tear down on success.

    On any failure the VM is deliberately left up for inspection.
    """
    ok = False
    try:
        up(host)
        apply(host, certs)
        if not no_reboot:
            reboot_vm()
        run_assert(host, certs)
        ok = True
    finally:
        if ok and not keep:
            down(host)
        elif not ok:
            print("FAILED — VM left up for inspection; `integration-vm prune` to clean.",
                  file=sys.stderr)


# Sub-command name -> handler taking the parsed argparse namespace.
DISPATCH = {
    "up": lambda a: (up(a.host), None)[1],
    "apply": lambda a: apply(a.host, a.certs),
    "reboot": lambda a: reboot_vm(),
    "assert": lambda a: run_assert(a.host, a.certs),
    "down": lambda a: down(a.host, a.keep),
    "console": lambda a: console(),
    "prune": lambda a: prune(),
    "cycle": lambda a: cycle(a.host, a.certs, a.keep, a.no_reboot),
}


def main(argv=None):
    """Parse the command line and dispatch to the chosen sub-command handler."""
    p = argparse.ArgumentParser(prog="integration-vm", description=__doc__)
    sub = p.add_subparsers(dest="cmd", required=True)
    for c in ("up", "apply", "reboot", "assert", "cycle", "down", "console"):
        sp = sub.add_parser(c)
        sp.add_argument("--host", required=True)
        sp.add_argument("--certs", choices=VALID_TIERS, default="internal")
        sp.add_argument("--keep", action="store_true")
        sp.add_argument("--no-reboot", action="store_true")
    sub.add_parser("prune")
    args = p.parse_args(argv)
    return DISPATCH[args.cmd](args)


if __name__ == "__main__":  # pragma: no cover
    sys.exit(main())

# ---------------------------------------------------------------------------------------
# Remainder of the original line, preserved verbatim. It is NOT part of the Python script:
# the reviewed text is a collapsed diff, and the same line carried the start of the next
# two files' hunks.
#
#   diff --git a/tests/integration/certs/internal.yml b/tests/integration/certs/internal.yml
#   new file mode 100644
#   index 0000000..ba95157
#   --- /dev/null
#   +++ b/tests/integration/certs/internal.yml
#   @@ -0,0 +1,2 @@
#   +---
#   +reverse_proxy__tls_internal: true
#   diff --git a/tests/integration/certs/le-prod-wildcard.yml b/tests/integration/certs/le-prod-wildcard.yml
#   new file mode 100644
#   index 0000000..88bc1d9
#   --- /dev/null
#   +++ b/tests/integration/certs/le-prod-wildcard.yml
#   @@ -0,0 +1,6 @@
#   +---
#   +# On-demand only. Records an accepted risk (ADR-025 / accepted-risks.md): the prod
#   +# Gandi PAT reaches an ephemeral VM and transient TXT records land in the real wingu.me.
# ---------------------------------------------------------------------------------------
+reverse_proxy__tls_internal: false +reverse_proxy__acme_dns_provider: gandi +reverse_proxy__acme_ca: "" diff --git a/tests/integration/certs/le-staging.yml b/tests/integration/certs/le-staging.yml new file mode 100644 index 0000000..12bcdae --- /dev/null +++ b/tests/integration/certs/le-staging.yml @@ -0,0 +1,4 @@ +--- +reverse_proxy__tls_internal: false +reverse_proxy__acme_dns_provider: gandi +reverse_proxy__acme_ca: "https://acme-staging-v02.api.letsencrypt.org/directory" diff --git a/tests/integration/overrides/askari.yml b/tests/integration/overrides/askari.yml new file mode 100644 index 0000000..c0d08b3 --- /dev/null +++ b/tests/integration/overrides/askari.yml @@ -0,0 +1,12 @@ +--- +# Integration-test overlay for the "askari" profile (ADR-025). Passed via `-e @`. +# Reproduces the 2026-06-17 incident: apply base's nftables default-deny to a Docker host. +base__firewall_apply: true +# Keep a break-glass: sshd stays on all interfaces (never wt0-only in a throwaway VM). +base__ssh_listen_mesh_only: false +# The VM is isolated; it must never touch the real mesh. +base__mesh_enabled: false +# Allow SSH from the VM's libvirt-NAT gateway (where the driver/ansible connects from), +# so base's default-deny firewall + the reboot don't lock out the harness. By source IP, +# so it's interface-independent. Overrides askari's real control addr for the test only. 
+base__firewall_control_addr: "192.168.150.1" diff --git a/tests/integration/profiles/askari.json b/tests/integration/profiles/askari.json new file mode 100644 index 0000000..bab1dd8 --- /dev/null +++ b/tests/integration/profiles/askari.json @@ -0,0 +1,10 @@ +{ + "groups": ["offsite_hosts"], + "applies": [ + {"playbook": "site.yml", "tags": ["base"]}, + {"playbook": "offsite.yml", "tags": ["docker_host", "reverse_proxy"]} + ], + "extra_vars_files": ["overrides/askari.yml"], + "mem_mib": 3072, + "vcpus": 2 +} diff --git a/tests/integration/verify.yml b/tests/integration/verify.yml new file mode 100644 index 0000000..e6c99b8 --- /dev/null +++ b/tests/integration/verify.yml @@ -0,0 +1,44 @@ +--- +# Integration verify (ADR-025). Outcome-based: proves Docker forwarding survives the +# reboot. The load-bearing check probes the VM's published :80 FROM the controller +# (ubongo) — if base's forward-drop killed DNAT, this times out (the FRICTION #1 bug). +- name: Verify the rebooted host + hosts: all + become: true + gather_facts: false + tasks: + - name: Gather service facts + ansible.builtin.service_facts: + + - name: Docker daemon is active + ansible.builtin.assert: + that: "ansible_facts.services['docker.service'].state == 'running'" + fail_msg: "docker.service is not running" + + - name: Forward chain permits container traffic (drop-in loaded) + ansible.builtin.command: nft list chain inet filter forward + register: _fwd + changed_when: false + + - name: Assert container forwarding is allowed (not pure drop) + ansible.builtin.assert: + that: "'accept' in _fwd.stdout" + fail_msg: >- + forward chain is pure drop — container forwarding will die on reboot + (FRICTION 2026-06-17 #1). docker_host container-forward drop-in missing. + + - name: Published port answers from the controller (DNAT + forward alive) + delegate_to: localhost + become: false + ansible.builtin.uri: + # Probe :80 (plain HTTP) — any answer proves the published-port DNAT + forward path + # is alive. 
Don't follow caddy's HTTP->HTTPS redirect (its `tls internal` has no + # cert for a bare-IP HTTPS request); the 308 itself proves the path works. + url: "http://{{ ansible_host }}/" + follow_redirects: none + status_code: [200, 301, 308, 404, 502, 503] + timeout: 10 + register: _probe + retries: 5 + delay: 6 + until: _probe is succeeded diff --git a/tests/test_integration_vm.py b/tests/test_integration_vm.py new file mode 100644 index 0000000..1d0a750 --- /dev/null +++ b/tests/test_integration_vm.py @@ -0,0 +1,78 @@ +import importlib.util +import pathlib +import pytest + +_PATH = pathlib.Path(__file__).resolve().parent.parent / "scripts" / "integration-vm.py" +_spec = importlib.util.spec_from_file_location("integration_vm", _PATH) +ivm = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(ivm) + + +def test_valid_tiers(): + assert ivm.VALID_TIERS == ("internal", "le-staging", "le-prod-wildcard") + + +def test_vm_name_prefix_and_suffix(): + assert ivm.vm_name("askari", "ab12cd34") == "boma-it-askari-ab12cd34" + +def test_vm_name_generates_suffix(): + n = ivm.vm_name("askari") + assert n.startswith("boma-it-askari-") and len(n.split("-")[-1]) == 8 + +def test_free_mib_parses_memavailable(): + sample = "MemTotal: 16331156 kB\nMemAvailable: 8388608 kB\n" + assert ivm.free_mib(sample) == 8192 + +def test_parse_lease_ip_extracts_ipv4(): + out = (" Name MAC address Protocol Address\n" + "-------------------------------------------------------------------\n" + " vnet0 52:54:00:aa:bb:cc ipv4 192.168.150.42/24\n") + assert ivm.parse_lease_ip(out) == "192.168.150.42" + +def test_parse_lease_ip_none_when_absent(): + assert ivm.parse_lease_ip("no leases\n") is None + + +def test_meta_data_has_instance_and_hostname(): + md = ivm.render_meta_data("iid-askari-x", "boma-it-askari-x") + assert "instance-id: iid-askari-x" in md + assert "local-hostname: boma-it-askari-x" in md + +def test_user_data_injects_key_and_ansible_user(): + ud = ivm.render_user_data("ssh-ed25519 
AAAA... claude@ubongo", "ansible") + assert ud.startswith("#cloud-config") + assert "name: ansible" in ud + assert "ssh-ed25519 AAAA... claude@ubongo" in ud + assert "NOPASSWD:ALL" in ud + + +def test_cert_file_valid_tier(): + p = ivm.cert_file("le-staging") + assert p.name == "le-staging.yml" and p.parent.name == "certs" + +def test_cert_file_rejects_bad_tier(): + with pytest.raises(ValueError): + ivm.cert_file("bogus") + +def test_render_run_hosts_single_host_in_groups(): + out = ivm.render_run_hosts("boma-it-askari-x", "192.168.150.42", + "ansible", ["offsite_hosts"]) + assert "offsite_hosts:" in out + assert "boma-it-askari-x:" in out + assert "ansible_host: 192.168.150.42" in out + assert "ansible_user: ansible" in out + assert "askari:" not in out.replace("boma-it-askari-x:", "") + +def test_free_mib_returns_zero_when_absent(): + assert ivm.free_mib("MemTotal: 16384 kB\n") == 0 + +def test_render_run_hosts_multiple_groups(): + out = ivm.render_run_hosts("boma-it-x-1", "192.168.150.5", "ansible", + ["offsite_hosts", "docker_hosts"]) + assert "offsite_hosts:" in out + assert "docker_hosts:" in out + +def test_render_run_hosts_dedups_groups(): + out = ivm.render_run_hosts("boma-it-x-1", "192.168.150.5", "ansible", + ["docker_hosts", "docker_hosts"]) + assert out.count("docker_hosts:") == 1