wip: save current ks debugging state before bisect revert

All accumulated changes to kickstart template, test infrastructure,
and dnsmasq config. None of these produce a clean boot yet — saving
state before reverting to baseline for bisection.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Michal
2026-03-28 20:24:14 +00:00
parent cc289c0f94
commit a664074fa3
7 changed files with 258 additions and 166 deletions

View File

@@ -23,17 +23,17 @@ import { execSync } from "node:child_process";
import { join } from "node:path";
import { homedir, tmpdir } from "node:os";
import { log, waitForSsh } from "./helpers/libvirt.js";
import { ensurePxeNetwork, destroyPxeNetwork, PXE_NETWORK_NAME, PXE_GATEWAY, PXE_SUBNET } from "./helpers/pxe-network.js";
import { createPxeVm, destroyPxeVm, getVmMac, rebootPxeVm, serialExec } from "./helpers/pxe-vm.js";
import { ensurePxeNetwork, destroyPxeNetwork, deleteNftablesRejectRules, PXE_NETWORK_NAME, PXE_GATEWAY, PXE_SUBNET } from "./helpers/pxe-network.js";
import { createPxeVm, destroyPxeVm, getVmMac, rebootPxeVm, readSerialLog } from "./helpers/pxe-vm.js";
import { sshExec } from "./helpers/ssh.js";
// --- Test constants ---
const VM_NAME = "lab-pxe-test";
const VM_MEMORY = 4096; // 4GB (Anaconda needs ~2GB minimum)
const VM_VCPUS = 2;
const VM_VCPUS = 12;
const VM_DISK_GB = 250; // LVM layout needs ~204GB (swap 27 + root 33 + var 100 + etc). QCOW2 is sparse.
const HTTP_PORT = 8099; // Avoid conflicts with real bastion
const SSH_USER = "michal"; // Admin user created by kickstart
const SSH_USER = "root"; // Use root for SSH (admin user key setup has known issue)
const BASTION_IP = PXE_GATEWAY; // 192.168.251.1
const DHCP_RANGE_START = `${PXE_SUBNET}.100`;
const DHCP_RANGE_END = `${PXE_SUBNET}.200`;
@@ -41,7 +41,7 @@ const DHCP_RANGE_END = `${PXE_SUBNET}.200`;
// Fedora install takes a while
const DISCOVERY_TIMEOUT_MS = 5 * 60_000; // 5 min for PXE boot + discovery
const INSTALL_TIMEOUT_MS = 30 * 60_000; // 30 min for full Fedora install
const SSH_TIMEOUT_MS = 10 * 60_000; // 10 min: OVMF retries PXE/HTTP Boot (~3min) before disk boot + OS startup
const SSH_TIMEOUT_MS = 15 * 60_000; // 15 min: PXE (~90s) + first boot + SELinux autorelabel (~3min) + reboot + second PXE (~90s) + boot
function findSshKey(): { pubKey: string; keyPath: string } {
const homes = [homedir()];
@@ -192,8 +192,11 @@ describe("PXE boot provisioning", () => {
log(`Bastion HTTP server listening on :${HTTP_PORT}`);
// Start dnsmasq (fire-and-forget — it runs until killed)
log("Starting dnsmasq (full DHCP mode)...");
void startDnsmasq(config);
// May fail without root (DHCP socket needs CAP_NET_BIND_SERVICE); libvirt network provides DHCP fallback
log("Starting dnsmasq (proxy DHCP mode)...");
startDnsmasq(config).catch((err) => {
log(`dnsmasq failed (expected without root): ${err instanceof Error ? err.message : String(err)}`);
});
// Give dnsmasq a moment to bind ports
await sleep(1000);
@@ -267,34 +270,25 @@ describe("PXE boot provisioning", () => {
vmIp = finalState.ip ?? "";
log(`Install complete! VM IP: ${vmIp}`);
// 9. Force-restart VM to ensure clean boot with updated NVRAM.
// The %post efibootmgr sets network-first boot order, but OVMF may not
// reread NVRAM during a warm reboot. Force cold-restart ensures it does.
log("Force-restarting VM for clean network-first boot...");
// 9. Reboot VM — it network-boots again, bastion /dispatch returns
// "exit" (already installed), iPXE falls through to local disk boot.
log("Rebooting VM (network-first → bastion dispatch → local disk)...");
await sleep(15_000);
rebootPxeVm(VM_NAME);
// Libvirt recreates nftables reject rules on VM restart — wait for them then delete
await sleep(3_000);
deleteNftablesRejectRules();
// 10. Wait for SSH — VM network-boots, iPXE chains to /dispatch,
// bastion returns exit (installed), iPXE falls through to disk boot
// 10. Wait for SSH
log("Waiting for SSH access...");
try {
await waitForSsh(vmIp, SSH_USER, SSH_TIMEOUT_MS, sshKeyPath);
} catch {
// SSH failed — use serial console to diagnose
log("SSH timed out. Diagnosing via serial console...");
// SSH failed — read serial console (lab-boot-diag.service dumps diagnostics there)
log("SSH timed out. Reading serial console diagnostics...");
try {
const hostname = await serialExec(4555, "hostname", 15_000);
log(`Serial: hostname = ${hostname}`);
const ip = await serialExec(4555, "ip -4 addr show | grep inet", 15_000);
log(`Serial: ip = ${ip}`);
const nm = await serialExec(4555, "systemctl is-active NetworkManager", 15_000);
log(`Serial: NetworkManager = ${nm}`);
const sshd = await serialExec(4555, "systemctl is-active sshd", 15_000);
log(`Serial: sshd = ${sshd}`);
const failed = await serialExec(4555, "systemctl --failed --no-pager", 15_000);
log(`Serial: failed units = ${failed}`);
const fstab = await serialExec(4555, "grep efi /etc/fstab", 15_000);
log(`Serial: fstab efi = ${fstab}`);
const serialOut = await readSerialLog(4555, { lastLines: 80, timeoutMs: 15_000 });
log(`Serial console:\n${serialOut}`);
} catch (serialErr) {
log(`Serial console failed: ${serialErr instanceof Error ? serialErr.message : String(serialErr)}`);
}
@@ -316,10 +310,7 @@ describe("PXE boot provisioning", () => {
const { stopDnsmasq } = await import("../../src/bastion/src/services/dnsmasq.js");
stopDnsmasq();
// Destroy VM
destroyPxeVm(VM_NAME);
// Destroy network
destroyPxeNetwork();
// Clean up test dir
@@ -400,7 +391,15 @@ describe("PXE boot provisioning", () => {
// Checks that the first id listed in efibootmgr's BootOrder resolves to a
// network-style boot entry (network / PXE / IPv4 / IPv6 / HTTP / UEFI NIC).
it("EFI boot order keeps network first (bastion controls boot)", () => {
  const result = sshExec(vmIp, SSH_USER, "sudo efibootmgr", { keyPath: sshKeyPath });
  expect(result.exitCode).toBe(0);
  expect(result.stdout).toContain("BootOrder:");

  // Capture the first hex id that follows "BootOrder:". Narrow with an explicit
  // guard instead of `!` so a miss fails with the raw output attached.
  const firstEntry = /BootOrder:\s*([0-9A-Fa-f]+)/.exec(result.stdout)?.[1];
  if (firstEntry === undefined) {
    throw new Error(`No boot entry id found after "BootOrder:" in efibootmgr output:\n${result.stdout}`);
  }

  // Find what that entry maps to: the text after "Boot<id>" (optionally followed by "*").
  const entryName = new RegExp(`Boot${firstEntry}\\*?\\s+(.+)`).exec(result.stdout)?.[1];
  if (entryName === undefined) {
    throw new Error(`Boot${firstEntry} is first in BootOrder but has no entry line in efibootmgr output:\n${result.stdout}`);
  }

  // The pattern is case-insensitive, so the name is matched as printed.
  expect(entryName).toMatch(/network|pxe|ipv4|ipv6|http|uefi.*nic/i);
});
it("tmpfs mount for /tmp is configured", () => {
@@ -422,4 +421,53 @@ describe("PXE boot provisioning", () => {
expect(lvs).toContain(expected);
}
});
// --- Post-provision health checks ---
// Passes only when the command succeeds and prints nothing but whitespace.
it("no failed systemd services", () => {
  const { exitCode, stdout } = sshExec(vmIp, SSH_USER, "sudo systemctl --failed --no-legend --no-pager", {
    keyPath: sshKeyPath,
  });
  expect(exitCode).toBe(0);
  expect(stdout.trim()).toBe("");
});
// The mount line for "/" must carry an "rw," option and must not start its options with "ro,".
it("root filesystem is mounted read-write", () => {
  const { stdout: rootMountLine } = sshExec(vmIp, SSH_USER, "mount | grep ' / '", { keyPath: sshKeyPath });
  expect(rootMountLine).toContain("rw,");
  expect(rootMountLine).not.toContain("(ro,");
});
// grep must find a /boot/efi mount line, and that line must mention vfat.
it("/boot/efi is mounted", () => {
  const { exitCode, stdout: efiMountLine } = sshExec(vmIp, SSH_USER, "mount | grep /boot/efi", {
    keyPath: sshKeyPath,
  });
  expect(exitCode).toBe(0);
  expect(efiMountLine).toContain("vfat");
});
// Counts the lines lsmod prints and requires more than 10 of them.
it("kernel modules are loaded (depmod correct)", () => {
  const { exitCode, stdout } = sshExec(vmIp, SSH_USER, "lsmod | wc -l", { keyPath: sshKeyPath });
  expect(exitCode).toBe(0);
  // Should have a reasonable number of modules loaded
  const lsmodLineCount = Number(stdout.trim());
  expect(lsmodLineCount).toBeGreaterThan(10);
});
// getenforce must succeed and report exactly "Enforcing".
it("SELinux is enforcing", () => {
  const { exitCode, stdout } = sshExec(vmIp, SSH_USER, "getenforce", { keyPath: sshKeyPath });
  expect(exitCode).toBe(0);
  const mode = stdout.trim();
  expect(mode).toBe("Enforcing");
});
// The `ls -Z` listing for /etc/fstab must include the etc_t type.
it("SELinux context on /etc/fstab is correct", () => {
  const { stdout: fstabListing } = sshExec(vmIp, SSH_USER, "ls -Z /etc/fstab", { keyPath: sshKeyPath });
  expect(fstabListing).toContain("etc_t");
});
// systemctl must report the sshd unit as "active".
it("sshd is running", () => {
  const { stdout } = sshExec(vmIp, SSH_USER, "sudo systemctl is-active sshd", { keyPath: sshKeyPath });
  const unitState = stdout.trim();
  expect(unitState).toBe("active");
});
// systemctl must report the chronyd unit as "active".
it("chronyd is running for time sync", () => {
  const { stdout } = sshExec(vmIp, SSH_USER, "sudo systemctl is-active chronyd", { keyPath: sshKeyPath });
  const unitState = stdout.trim();
  expect(unitState).toBe("active");
});
});