fix(integration-vm): boot-id-verified reboot + actionable timeouts + inventory guard (review)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
68abd67ce6
commit
4fb4cf99c3
1 changed file with 33 additions and 11 deletions
|
|
@ -153,7 +153,7 @@ def net_ensure():
|
||||||
sh(["virsh", "net-define", str(xml)])
|
sh(["virsh", "net-define", str(xml)])
|
||||||
sh(["virsh", "net-autostart", NET_NAME])
|
sh(["virsh", "net-autostart", NET_NAME])
|
||||||
active = sh(["virsh", "net-info", NET_NAME], capture=True).stdout
|
active = sh(["virsh", "net-info", NET_NAME], capture=True).stdout
|
||||||
if "Active: yes" not in active:
|
if not re.search(r"Active:\s+yes", active):
|
||||||
sh(["virsh", "net-start", NET_NAME])
|
sh(["virsh", "net-start", NET_NAME])
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -210,7 +210,8 @@ def wait_for_ip(name, timeout=120):
|
||||||
if ip:
|
if ip:
|
||||||
return ip
|
return ip
|
||||||
time.sleep(4)
|
time.sleep(4)
|
||||||
raise SystemExit(f"timed out waiting for {name} to get a DHCP lease")
|
raise SystemExit(f"timed out waiting for {name} to get a DHCP lease — "
|
||||||
|
"VM left defined; run `integration-vm prune` to remove it")
|
||||||
|
|
||||||
|
|
||||||
def wait_for_ssh(ip, user, timeout=180):
|
def wait_for_ssh(ip, user, timeout=180):
|
||||||
|
|
@ -222,7 +223,8 @@ def wait_for_ssh(ip, user, timeout=180):
|
||||||
if r.returncode == 0:
|
if r.returncode == 0:
|
||||||
return
|
return
|
||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
raise SystemExit(f"timed out waiting for SSH to {ip}")
|
raise SystemExit(f"timed out waiting for SSH to {ip} — "
|
||||||
|
"VM left defined; run `integration-vm prune` to remove it")
|
||||||
|
|
||||||
|
|
||||||
def _read_current():
|
def _read_current():
|
||||||
|
|
@ -236,11 +238,11 @@ def write_run_inventory(name, ip, groups):
|
||||||
render_run_hosts(name, ip, "ansible", groups))
|
render_run_hosts(name, ip, "ansible", groups))
|
||||||
link = RUN_DIR / "group_vars"
|
link = RUN_DIR / "group_vars"
|
||||||
target = REPO_ROOT / "inventories" / "production" / "group_vars"
|
target = REPO_ROOT / "inventories" / "production" / "group_vars"
|
||||||
if link.is_symlink() or link.exists():
|
if link.is_symlink():
|
||||||
if link.is_symlink():
|
link.unlink()
|
||||||
link.unlink()
|
elif link.exists():
|
||||||
if not link.exists():
|
raise SystemExit(f"{link} exists and is not a symlink; remove it manually")
|
||||||
link.symlink_to(target)
|
link.symlink_to(target)
|
||||||
|
|
||||||
|
|
||||||
def apply(host, certs):
|
def apply(host, certs):
|
||||||
|
|
@ -261,12 +263,32 @@ def apply(host, certs):
|
||||||
print(f"applied {host} profile to {name}")
|
print(f"applied {host} profile to {name}")
|
||||||
|
|
||||||
|
|
||||||
|
def _boot_id(ip, user):
    """Fetch the guest kernel's boot_id over SSH.

    Runs ``cat /proc/sys/kernel/random/boot_id`` as ``user@ip`` with host-key
    checking turned off, no known_hosts file, and a 5-second connect timeout.

    Returns:
        The whitespace-stripped stdout of the remote command when ssh exits
        with status 0, otherwise ``None``.
    """
    ssh_argv = [
        "ssh",
        "-o", "StrictHostKeyChecking=no",
        "-o", "UserKnownHostsFile=/dev/null",
        "-o", "ConnectTimeout=5",
        f"{user}@{ip}",
        "cat /proc/sys/kernel/random/boot_id",
    ]
    # NOTE(review): check=False presumably keeps sh() from aborting on a
    # non-zero exit so an unreachable host can map to None — confirm in sh().
    result = sh(ssh_argv, check=False, capture=True)
    if result.returncode != 0:
        return None
    return result.stdout.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def wait_for_reboot(ip, user, before_boot_id, timeout=240):
    """Block until the host reports a boot_id different from the pre-reboot one.

    An answering sshd alone is not enough: the pre-reboot sshd may still be
    accepting connections, so success requires that a boot_id can be read
    AND that it differs from ``before_boot_id``.

    Args:
        ip: address of the VM to poll.
        user: SSH login used to read the boot_id.
        before_boot_id: boot_id captured before the reboot was requested.
            If this is falsy, the first boot_id read counts as "changed".
        timeout: seconds to keep polling before giving up.

    Raises:
        SystemExit: if no differing boot_id is observed before the deadline.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        current = _boot_id(ip, user)
        rebooted = bool(current) and current != before_boot_id
        if rebooted:
            return
        # Pause between polls; each failed probe also waits on ssh's own timeout.
        time.sleep(5)
    raise SystemExit(f"timed out waiting for {ip} to reboot (boot_id unchanged) — "
                     "VM left defined; run `integration-vm prune` to remove it")
|
||||||
|
|
||||||
|
|
||||||
def reboot_vm():
    """Reboot the current VM through virsh and confirm the reboot really happened.

    The guest's boot_id is captured before the reboot request and compared
    with the value read afterwards, so a still-running pre-reboot sshd cannot
    be mistaken for a completed reboot.

    Raises:
        SystemExit: if the pre-reboot boot_id cannot be read (nothing to
            compare against), or — via wait_for_reboot — if the boot_id has
            not changed before the timeout.
    """
    name, ip, _ = _read_current()
    before = _boot_id(ip, "ansible")
    if not before:
        # With no baseline, wait_for_reboot would accept the first boot_id it
        # reads (possibly from the pre-reboot system) and this function would
        # report a verified reboot that was never verified. Fail before
        # touching the VM.
        raise SystemExit(
            f"cannot read boot_id from {ip} before reboot (is SSH reachable?) — "
            f"refusing to reboot {name} because the reboot could not be verified")
    sh(["virsh", "reboot", name])
    wait_for_reboot(ip, "ansible", before)
    print(f"{name} rebooted (boot_id changed), SSH back at {ip}")
|
|
||||||
|
|
||||||
|
|
||||||
def run_assert(host, certs):
|
def run_assert(host, certs):
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue