fix(integration-vm): boot-id-verified reboot + actionable timeouts + inventory guard (review)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
sjat 2026-06-18 12:28:06 +02:00
parent 68abd67ce6
commit 4fb4cf99c3

View file

@ -153,7 +153,7 @@ def net_ensure():
sh(["virsh", "net-define", str(xml)])
sh(["virsh", "net-autostart", NET_NAME])
active = sh(["virsh", "net-info", NET_NAME], capture=True).stdout
if "Active: yes" not in active:
if not re.search(r"Active:\s+yes", active):
sh(["virsh", "net-start", NET_NAME])
@ -210,7 +210,8 @@ def wait_for_ip(name, timeout=120):
if ip:
return ip
time.sleep(4)
raise SystemExit(f"timed out waiting for {name} to get a DHCP lease")
raise SystemExit(f"timed out waiting for {name} to get a DHCP lease — "
"VM left defined; run `integration-vm prune` to remove it")
def wait_for_ssh(ip, user, timeout=180):
@ -222,7 +223,8 @@ def wait_for_ssh(ip, user, timeout=180):
if r.returncode == 0:
return
time.sleep(5)
raise SystemExit(f"timed out waiting for SSH to {ip}")
# Adjacent string literals are concatenated verbatim, so the first piece must
# end with an explicit separator; otherwise the IP runs straight into "VM left".
raise SystemExit(f"timed out waiting for SSH to {ip} — "
                 "VM left defined; run `integration-vm prune` to remove it")
def _read_current():
@ -236,11 +238,11 @@ def write_run_inventory(name, ip, groups):
render_run_hosts(name, ip, "ansible", groups))
link = RUN_DIR / "group_vars"
target = REPO_ROOT / "inventories" / "production" / "group_vars"
if link.is_symlink() or link.exists():
if link.is_symlink():
link.unlink()
if not link.exists():
link.symlink_to(target)
if link.is_symlink():
link.unlink()
elif link.exists():
raise SystemExit(f"{link} exists and is not a symlink; remove it manually")
link.symlink_to(target)
def apply(host, certs):
@ -261,12 +263,32 @@ def apply(host, certs):
print(f"applied {host} profile to {name}")
def _boot_id(ip, user):
    """Return the kernel boot_id reported by ``user@ip``, or None on failure.

    Runs ``cat /proc/sys/kernel/random/boot_id`` over SSH with strict host-key
    checking turned off, the known-hosts file pointed at /dev/null and a
    5-second connect timeout. A non-zero exit status yields None; otherwise
    the command's stdout is returned with surrounding whitespace stripped
    (which may be an empty string if the command printed nothing).
    """
    ssh_cmd = [
        "ssh",
        "-o", "StrictHostKeyChecking=no",
        "-o", "UserKnownHostsFile=/dev/null",
        "-o", "ConnectTimeout=5",
        f"{user}@{ip}",
        "cat /proc/sys/kernel/random/boot_id",
    ]
    # check=False: the exit status is inspected here rather than left to sh().
    result = sh(ssh_cmd, check=False, capture=True)
    if result.returncode != 0:
        return None
    return result.stdout.strip()
def wait_for_reboot(ip, user, before_boot_id, timeout=240):
    """Block until ``ip`` answers over SSH with a boot_id other than the old one.

    Polls ``_boot_id(ip, user)`` every 5 seconds for up to ``timeout`` seconds.
    A reading only counts when it is non-empty AND differs from
    ``before_boot_id``, so a still-running pre-reboot sshd is not mistaken for
    a completed reboot.

    Note: if ``before_boot_id`` is None, any non-empty reading satisfies the
    comparison, so the first successful SSH probe ends the wait.

    Raises:
        SystemExit: when no differing boot_id is seen before the deadline.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        current = _boot_id(ip, user)
        if not current or current == before_boot_id:
            # Host unreachable, or still on the old boot: retry after a pause.
            time.sleep(5)
            continue
        return
    raise SystemExit(f"timed out waiting for {ip} to reboot (boot_id unchanged) — "
                     "VM left defined; run `integration-vm prune` to remove it")
def reboot_vm():
    """Reboot the current integration VM and confirm that it really rebooted.

    Reads the current VM's name and IP via ``_read_current()``, records the
    guest's boot_id over SSH as user ``ansible``, asks libvirt to reboot the
    domain (``virsh reboot``), then waits until SSH reports a different
    boot_id before printing the success message.

    Raises:
        SystemExit: if the pre-reboot boot_id cannot be read, or (from
            ``wait_for_reboot``) if no new boot_id appears before its timeout.
    """
    name, ip, _ = _read_current()
    before = _boot_id(ip, "ansible")
    if not before:
        # Without a baseline, any boot_id read afterwards would count as
        # "changed", so the check would pass even if the VM never rebooted.
        raise SystemExit(f"could not read boot_id from {ip} before reboot — "
                         "VM left defined; run `integration-vm prune` to remove it")
    sh(["virsh", "reboot", name])
    # Short pause before the first probe; presumably gives the guest time to
    # start shutting down. The boot_id comparison is what guards correctness.
    time.sleep(5)
    wait_for_reboot(ip, "ansible", before)
    print(f"{name} rebooted (boot_id changed), SSH back at {ip}")
def run_assert(host, certs):