fix(integration-vm): boot-id-verified reboot + actionable timeouts + inventory guard (review)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
68abd67ce6
commit
4fb4cf99c3
1 changed file with 33 additions and 11 deletions
|
|
@ -153,7 +153,7 @@ def net_ensure():
|
|||
sh(["virsh", "net-define", str(xml)])
|
||||
sh(["virsh", "net-autostart", NET_NAME])
|
||||
active = sh(["virsh", "net-info", NET_NAME], capture=True).stdout
|
||||
if "Active: yes" not in active:
|
||||
if not re.search(r"Active:\s+yes", active):
|
||||
sh(["virsh", "net-start", NET_NAME])
|
||||
|
||||
|
||||
|
|
@ -210,7 +210,8 @@ def wait_for_ip(name, timeout=120):
|
|||
if ip:
|
||||
return ip
|
||||
time.sleep(4)
|
||||
raise SystemExit(f"timed out waiting for {name} to get a DHCP lease")
|
||||
raise SystemExit(f"timed out waiting for {name} to get a DHCP lease — "
|
||||
"VM left defined; run `integration-vm prune` to remove it")
|
||||
|
||||
|
||||
def wait_for_ssh(ip, user, timeout=180):
|
||||
|
|
@ -222,7 +223,8 @@ def wait_for_ssh(ip, user, timeout=180):
|
|||
if r.returncode == 0:
|
||||
return
|
||||
time.sleep(5)
|
||||
raise SystemExit(f"timed out waiting for SSH to {ip}")
|
||||
raise SystemExit(f"timed out waiting for SSH to {ip} — "
|
||||
"VM left defined; run `integration-vm prune` to remove it")
|
||||
|
||||
|
||||
def _read_current():
|
||||
|
|
@ -236,11 +238,11 @@ def write_run_inventory(name, ip, groups):
|
|||
render_run_hosts(name, ip, "ansible", groups))
|
||||
link = RUN_DIR / "group_vars"
|
||||
target = REPO_ROOT / "inventories" / "production" / "group_vars"
|
||||
if link.is_symlink() or link.exists():
|
||||
if link.is_symlink():
|
||||
link.unlink()
|
||||
if not link.exists():
|
||||
link.symlink_to(target)
|
||||
if link.is_symlink():
|
||||
link.unlink()
|
||||
elif link.exists():
|
||||
raise SystemExit(f"{link} exists and is not a symlink; remove it manually")
|
||||
link.symlink_to(target)
|
||||
|
||||
|
||||
def apply(host, certs):
|
||||
|
|
@ -261,12 +263,32 @@ def apply(host, certs):
|
|||
print(f"applied {host} profile to {name}")
|
||||
|
||||
|
||||
def _boot_id(ip, user):
    """Read the guest's current boot_id over SSH.

    Runs ``cat /proc/sys/kernel/random/boot_id`` as ``user@ip`` through the
    file's ``sh`` helper, with host-key checking disabled, no known_hosts
    file, and a 5-second connect timeout.

    Returns the command's stripped stdout when ssh exits with status 0,
    otherwise None. Callers should test the result for truthiness: a
    zero-status run with empty output yields an empty string, not None.
    """
    ssh_options = [
        "-o", "StrictHostKeyChecking=no",
        "-o", "UserKnownHostsFile=/dev/null",
        "-o", "ConnectTimeout=5",
    ]
    argv = ["ssh", *ssh_options, f"{user}@{ip}",
            "cat /proc/sys/kernel/random/boot_id"]
    # check=False: a failing ssh is an expected outcome here (guest down or
    # mid-reboot) and is reported to the caller as None rather than raised.
    result = sh(argv, check=False, capture=True)
    if result.returncode != 0:
        return None
    return result.stdout.strip()
|
||||
|
||||
|
||||
def wait_for_reboot(ip, user, before_boot_id, timeout=240):
    """Confirm a REAL reboot: SSH back up AND boot_id changed (not the pre-reboot sshd).

    Polls ``_boot_id(ip, user)`` every 5 seconds and returns as soon as it
    yields a non-empty value different from ``before_boot_id``.

    Args:
        ip: Address of the guest to poll.
        user: SSH login used for the boot_id probe.
        before_boot_id: boot_id captured before the reboot was requested.
            NOTE: if this is None, the first readable boot_id compares as
            "different" and is accepted, so pass a real baseline.
        timeout: Maximum number of seconds to keep polling.

    Raises:
        SystemExit: if no changed boot_id was observed within ``timeout``.
            The message says whether the last probe still saw the old
            boot_id or got no SSH answer at all.
    """
    # Monotonic clock: a wall-clock step (NTP sync, manual change) must not
    # stretch or cut short the timeout.
    deadline = time.monotonic() + timeout
    last_seen = None
    while time.monotonic() < deadline:
        last_seen = _boot_id(ip, user)
        if last_seen and last_seen != before_boot_id:
            return
        time.sleep(5)
    # Distinguish "guest answered but never rebooted" from "guest never came
    # back", since the two call for different follow-up.
    reason = "boot_id unchanged" if last_seen else "no SSH response"
    raise SystemExit(f"timed out waiting for {ip} to reboot ({reason}) — "
                     "VM left defined; run `integration-vm prune` to remove it")
|
||||
|
||||
|
||||
def reboot_vm():
    """Reboot the current VM via virsh and verify the reboot by boot_id.

    Takes the VM name and IP from ``_read_current()`` (its third value is
    unused here), records the guest's boot_id as ``ansible``, asks virsh to
    reboot the domain, then waits until the guest reports a different
    boot_id before printing a confirmation.

    Raises:
        SystemExit: if the pre-reboot boot_id cannot be read, or (from
            ``wait_for_reboot``) if no changed boot_id shows up in time.
    """
    name, ip, _ = _read_current()
    before = _boot_id(ip, "ansible")
    if not before:
        # Without a baseline, any boot_id read afterwards compares as
        # "changed", so a still-running pre-reboot sshd would pass as a
        # verified reboot. Fail before touching the VM instead.
        raise SystemExit(f"could not read boot_id from {ip} before reboot — "
                         f"is SSH to {name} working? not rebooting, because "
                         "the reboot could not be verified")
    sh(["virsh", "reboot", name])
    wait_for_reboot(ip, "ansible", before)
    print(f"{name} rebooted (boot_id changed), SSH back at {ip}")
|
||||
|
||||
|
||||
def run_assert(host, certs):
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue