From 8ca42c389cdb5bfc04f8ab94086ccf876586f00c Mon Sep 17 00:00:00 2001 From: sjat Date: Fri, 19 Jun 2026 19:15:07 +0200 Subject: [PATCH] fix(integration): fix VM boot: hostname, netplan, known_hosts handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three fixes found during askari_inputonly integration-test development: 1. Hostname sanitization: cloud-init rejects underscores in local-hostname (silently skips network-config → VM never gets DHCP). Sanitize with name.replace("_", "-") for the meta-data hostname; paths/domain names keep the original (underscore is valid there). 2. Netplan explicit interface: match.name: en* with a named key produces a .network file that networkd never DHCPs. Use the explicit interface name enp1s0 (the name of every virtio NIC in these KVM VMs) + renderer: networkd to bypass the bug. 3. ansible_ssh_common_args in the generated hosts.yml: integration VMs reuse IPs (different VMs at same 192.168.150.x lease). StrictHostKeyChecking=accept-new from ansible.cfg rejects hosts whose key has changed. Add StrictHostKeyChecking=no + UserKnownHostsFile=/dev/null per-host to the generated inventory so stale known_hosts entries never block the apply step. Co-Authored-By: Claude Sonnet 4.6 --- scripts/integration-vm.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/scripts/integration-vm.py b/scripts/integration-vm.py index 02c655c..c51f9f7 100644 --- a/scripts/integration-vm.py +++ b/scripts/integration-vm.py @@ -106,6 +106,12 @@ def render_run_hosts(name, ip, ansible_user, groups): f" {name}:", f" ansible_host: {ip}", f" ansible_user: {ansible_user}", + # Integration VMs reuse IPs; bypass host-key caching so stale + # known_hosts entries (from prior runs with a different VM at + # the same IP) do not block the Ansible apply step. 
+ " ansible_ssh_common_args: >-", + " -o StrictHostKeyChecking=no", + " -o UserKnownHostsFile=/dev/null", ] return "\n".join(lines) + "\n" @@ -188,15 +194,22 @@ def up(host, name=None, mem_mib=DEFAULT_MEM_MIB, vcpus=DEFAULT_VCPUS): overlay = CACHE_DIR / f"{name}.qcow2" sh(["qemu-img", "create", "-f", "qcow2", "-F", "qcow2", "-b", str(img), str(overlay)]) (RUN_DIR / "user-data").write_text(render_user_data(_ssh_pubkey(), "ansible")) - (RUN_DIR / "meta-data").write_text(render_meta_data(f"iid-{name}", name)) + # cloud-init rejects underscores in local-hostname (causes init-local to skip + # writing the network config → VM never gets a DHCP lease). Sanitize VM name + # for use as hostname without affecting disk paths or virsh domain names. + (RUN_DIR / "meta-data").write_text(render_meta_data(f"iid-{name}", name.replace("_", "-"))) seed = CACHE_DIR / f"{name}-seed.img" # Force DHCP on the VM NIC — don't rely on the genericcloud image's network fallback. + # Use explicit renderer + interface name to avoid a netplan 1.1.2 generation issue: + # `match.name: en*` with a named key (e.g. `primary`) produces a .network file that + # networkd loads but never DHCPs (no DHCP4 messages, just IPv6LL). Using the real + # interface name `enp1s0` (all virtio NICs in these KVM VMs are named enp1s0) and + # `renderer: networkd` bypasses the bug. (RUN_DIR / "network-config").write_text( 'version: 2\n' + 'renderer: networkd\n' 'ethernets:\n' - ' primary:\n' - ' match:\n' - ' name: "en*"\n' + ' enp1s0:\n' ' dhcp4: true\n') sh(["cloud-localds", "--network-config", str(RUN_DIR / "network-config"), str(seed), str(RUN_DIR / "user-data"), str(RUN_DIR / "meta-data")])