From 4fb4cf99c37b6767ec41d0068a3fc4c5c27b0eea Mon Sep 17 00:00:00 2001 From: sjat Date: Thu, 18 Jun 2026 12:28:06 +0200 Subject: [PATCH] fix(integration-vm): boot-id-verified reboot + actionable timeouts + inventory guard (review) Co-Authored-By: Claude Sonnet 4.6 --- scripts/integration-vm.py | 44 +++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/scripts/integration-vm.py b/scripts/integration-vm.py index bc879ee..191cf60 100644 --- a/scripts/integration-vm.py +++ b/scripts/integration-vm.py @@ -153,7 +153,7 @@ def net_ensure(): sh(["virsh", "net-define", str(xml)]) sh(["virsh", "net-autostart", NET_NAME]) active = sh(["virsh", "net-info", NET_NAME], capture=True).stdout - if "Active: yes" not in active: + if not re.search(r"Active:\s+yes", active): sh(["virsh", "net-start", NET_NAME]) @@ -210,7 +210,8 @@ def wait_for_ip(name, timeout=120): if ip: return ip time.sleep(4) - raise SystemExit(f"timed out waiting for {name} to get a DHCP lease") + raise SystemExit(f"timed out waiting for {name} to get a DHCP lease — " + "VM left defined; run `integration-vm prune` to remove it") def wait_for_ssh(ip, user, timeout=180): @@ -222,7 +223,8 @@ def wait_for_ssh(ip, user, timeout=180): if r.returncode == 0: return time.sleep(5) - raise SystemExit(f"timed out waiting for SSH to {ip}") + raise SystemExit(f"timed out waiting for SSH to {ip} — " + "VM left defined; run `integration-vm prune` to remove it") def _read_current(): @@ -236,11 +238,11 @@ def write_run_inventory(name, ip, groups): render_run_hosts(name, ip, "ansible", groups)) link = RUN_DIR / "group_vars" target = REPO_ROOT / "inventories" / "production" / "group_vars" - if link.is_symlink() or link.exists(): - if link.is_symlink(): - link.unlink() - if not link.exists(): - link.symlink_to(target) + if link.is_symlink(): + link.unlink() + elif link.exists(): + raise SystemExit(f"{link} exists and is not a symlink; remove it manually") + link.symlink_to(target) def apply(host, certs): @@ -261,12 +263,32 @@ def apply(host, certs): print(f"applied {host} profile to {name}") +def _boot_id(ip, user): + r = sh(["ssh", "-o", "StrictHostKeyChecking=no", + "-o", "UserKnownHostsFile=/dev/null", "-o", "ConnectTimeout=5", + f"{user}@{ip}", "cat /proc/sys/kernel/random/boot_id"], + check=False, capture=True) + return r.stdout.strip() if r.returncode == 0 else None + + +def wait_for_reboot(ip, user, before_boot_id, timeout=240): + """Confirm a REAL reboot: SSH back up AND boot_id changed (not the pre-reboot sshd).""" + end = time.time() + timeout + while time.time() < end: + bid = _boot_id(ip, user) + if bid and bid != before_boot_id: + return + time.sleep(5) + raise SystemExit(f"timed out waiting for {ip} to reboot (boot_id unchanged) — " + "VM left defined; run `integration-vm prune` to remove it") + + def reboot_vm(): name, ip, _ = _read_current() + before = _boot_id(ip, "ansible") sh(["virsh", "reboot", name]) - time.sleep(5) - wait_for_ssh(ip, "ansible") - print(f"{name} rebooted, SSH back at {ip}") + wait_for_reboot(ip, "ansible", before) + print(f"{name} rebooted (boot_id changed), SSH back at {ip}") def run_assert(host, certs):