fix: PXE boot debugging — bisect root cause, syslog logging, serial console #3
@@ -447,6 +447,9 @@ systemctl mask firewalld || true
|
||||
# -- Enable chronyd for time sync --
|
||||
systemctl enable chronyd || true`}
|
||||
|
||||
# -- Serial console (for debugging — auto-login as root on ttyS0) --
|
||||
systemctl enable serial-getty@ttyS0.service || true
|
||||
|
||||
# -- Boot order: restore network first (Anaconda sets disk first, we undo it) --
|
||||
# Network boot must stay first so the bastion intercepts every reboot. It returns
|
||||
# exit (local disk) for installed machines, or install for reinstalls.
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
// Create a blank UEFI VM for PXE boot testing.
|
||||
// Unlike cloud image VMs, these have an empty disk and boot from network.
|
||||
// Each VM gets a serial console on a TCP port for debugging without network/SSH.
|
||||
|
||||
import { execSync, spawnSync, type SpawnSyncReturns } from "node:child_process";
|
||||
import { existsSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import { createConnection } from "node:net";
|
||||
import { log } from "./libvirt.js";
|
||||
|
||||
const IMAGE_DIR = "/var/lib/libvirt/images";
|
||||
@@ -68,6 +70,9 @@ export function createPxeVm(config: PxeVmConfig): void {
|
||||
"--wait=0",
|
||||
// Graphics for debugging (VNC, connect with virt-viewer if needed)
|
||||
"--graphics=vnc,listen=127.0.0.1",
|
||||
// Serial console via TCP — allows exec without network/SSH
|
||||
// Connect: socat - TCP:127.0.0.1:4555
|
||||
"--serial=tcp,host=127.0.0.1:4555,mode=bind,protocol=telnet",
|
||||
];
|
||||
|
||||
if (arch === "aarch64") {
|
||||
@@ -76,7 +81,7 @@ export function createPxeVm(config: PxeVmConfig): void {
|
||||
|
||||
log(`Running: virt-install --name=${config.name} --boot=uefi,network ...`);
|
||||
run(virtInstallArgs.join(" "), { timeout: 30_000 });
|
||||
log(`PXE VM ${config.name} created and booting from network`);
|
||||
log(`PXE VM ${config.name} created (serial: telnet 127.0.0.1 4555)`);
|
||||
}
|
||||
|
||||
/** Destroy a PXE VM and clean up its disk. */
|
||||
@@ -169,6 +174,8 @@ export function createIsoVm(config: IsoVmConfig): void {
|
||||
"--noautoconsole",
|
||||
"--wait=0",
|
||||
"--graphics=vnc,listen=127.0.0.1",
|
||||
// Serial console via TCP (port 4556 to avoid conflict with PXE VM)
|
||||
"--serial=tcp,host=127.0.0.1:4556,mode=bind,protocol=telnet",
|
||||
];
|
||||
|
||||
if (arch === "aarch64") {
|
||||
@@ -177,5 +184,72 @@ export function createIsoVm(config: IsoVmConfig): void {
|
||||
|
||||
log(`Running: virt-install --name=${config.name} --boot=uefi,cdrom ...`);
|
||||
run(virtInstallArgs.join(" "), { timeout: 60_000 });
|
||||
log(`ISO boot VM ${config.name} created and booting from ISO`);
|
||||
log(`ISO boot VM ${config.name} created (serial: telnet 127.0.0.1 4556)`);
|
||||
}
|
||||
|
||||
/**
 * Execute a command on a VM via its serial console and return the command's output.
 *
 * Connects to the TCP port on 127.0.0.1 where the VM's serial console is exposed,
 * answers a `login:` prompt with `root` if one appears, waits for a shell prompt
 * (`# ` or `$ `), then sends `command` followed by an `echo` of a unique end marker.
 * Everything printed between the echoed command line and the marker is returned.
 * Works even when the VM has no network/SSH.
 *
 * @param port - TCP port on 127.0.0.1 where the serial console is listening.
 * @param command - Shell command to run; it is sent verbatim, followed by `; echo <marker>`.
 * @param timeoutMs - Overall deadline (connect + login + command) in milliseconds.
 * @returns The command output with CRLF normalised to LF and surrounding whitespace trimmed.
 * @throws Error `Serial exec timeout after …ms` when the marker is not seen in time, or
 *   `Serial connection failed: …` when the socket errors or closes before completion.
 */
export async function serialExec(
  port: number,
  command: string,
  timeoutMs = 10_000,
): Promise<string> {
  return new Promise((resolve, reject) => {
    // The marker is sent as two adjacent quoted shell words ("head""tail"). The shell
    // joins them when echo runs, but the tty's echo of the typed command line never
    // contains the contiguous marker, so only real output can terminate the read.
    const nonce = `${Date.now()}_${Math.random().toString(36).slice(2)}`;
    const markerHead = "__SERIAL_";
    const markerTail = `END_${nonce}__`;
    const marker = markerHead + markerTail;

    const sock = createConnection({ host: "127.0.0.1", port });
    // Let the stream decoder handle multi-byte characters split across chunks.
    sock.setEncoding("utf8");

    let buffer = "";
    let sentCommand = false;
    let settled = false;
    let nudge: ReturnType<typeof setTimeout> | undefined;

    // Single exit point: settle the promise once and release every resource.
    const settle = (outcome: Error | string): void => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      if (nudge !== undefined) clearTimeout(nudge);
      sock.destroy();
      if (typeof outcome === "string") resolve(outcome);
      else reject(outcome);
    };

    const timer = setTimeout(() => {
      settle(new Error(`Serial exec timeout after ${timeoutMs}ms`));
    }, timeoutMs);

    sock.on("connect", () => {
      // If the console is idle, poke it with a newline so it prints a fresh prompt.
      // Skipped once the command is on the wire so we never inject input into it.
      nudge = setTimeout(() => {
        if (!sentCommand) sock.write("\r\n");
      }, 500);
    });

    sock.on("data", (chunk: Buffer | string) => {
      buffer += chunk.toString();

      if (!sentCommand) {
        // A getty login prompt — but not the "Last login: ..." banner printed after login.
        if (/(?<!Last )login:/.test(buffer)) {
          sock.write("root\r\n");
          buffer = ""; // wait for the shell prompt that follows the login
          return;
        }
        // At shell prompt — send command with marker
        if (buffer.includes("# ") || buffer.includes("$ ")) {
          sentCommand = true;
          buffer = "";
          sock.write(`${command}; echo "${markerHead}""${markerTail}"\r\n`);
        }
        return;
      }

      const markerIdx = buffer.indexOf(marker);
      if (markerIdx === -1) return;

      const lines = buffer.substring(0, markerIdx).replace(/\r\n/g, "\n").split("\n");
      // Drop everything up to and including the echoed command line (identified by the
      // marker tail it carries). If the tty did not echo, nothing is dropped.
      const echoIdx = lines.findIndex((line) => line.includes(markerTail));
      settle(lines.slice(echoIdx + 1).join("\n").trim());
    });

    sock.on("error", (err: Error) => {
      settle(new Error(`Serial connection failed: ${err.message}`));
    });

    sock.on("close", () => {
      // No-op when already settled; otherwise fail fast instead of waiting for the timeout.
      settle(new Error("Serial connection failed: connection closed before the command completed"));
    });
  });
}
|
||||
|
||||
@@ -24,7 +24,7 @@ import { join } from "node:path";
|
||||
import { homedir, tmpdir } from "node:os";
|
||||
import { log, waitForSsh } from "./helpers/libvirt.js";
|
||||
import { ensurePxeNetwork, destroyPxeNetwork, PXE_NETWORK_NAME, PXE_GATEWAY, PXE_SUBNET } from "./helpers/pxe-network.js";
|
||||
import { createPxeVm, destroyPxeVm, getVmMac, rebootPxeVm } from "./helpers/pxe-vm.js";
|
||||
import { createPxeVm, destroyPxeVm, getVmMac, rebootPxeVm, serialExec } from "./helpers/pxe-vm.js";
|
||||
import { sshExec } from "./helpers/ssh.js";
|
||||
|
||||
// --- Test constants ---
|
||||
@@ -277,7 +277,29 @@ describe("PXE boot provisioning", () => {
|
||||
// 10. Wait for SSH — VM network-boots, iPXE chains to /dispatch,
|
||||
// bastion returns exit (installed), iPXE falls through to disk boot
|
||||
log("Waiting for SSH access...");
|
||||
await waitForSsh(vmIp, SSH_USER, SSH_TIMEOUT_MS, sshKeyPath);
|
||||
try {
|
||||
await waitForSsh(vmIp, SSH_USER, SSH_TIMEOUT_MS, sshKeyPath);
|
||||
} catch {
|
||||
// SSH failed — use serial console to diagnose
|
||||
log("SSH timed out. Diagnosing via serial console...");
|
||||
try {
|
||||
const hostname = await serialExec(4555, "hostname", 15_000);
|
||||
log(`Serial: hostname = ${hostname}`);
|
||||
const ip = await serialExec(4555, "ip -4 addr show | grep inet", 15_000);
|
||||
log(`Serial: ip = ${ip}`);
|
||||
const nm = await serialExec(4555, "systemctl is-active NetworkManager", 15_000);
|
||||
log(`Serial: NetworkManager = ${nm}`);
|
||||
const sshd = await serialExec(4555, "systemctl is-active sshd", 15_000);
|
||||
log(`Serial: sshd = ${sshd}`);
|
||||
const failed = await serialExec(4555, "systemctl --failed --no-pager", 15_000);
|
||||
log(`Serial: failed units = ${failed}`);
|
||||
const fstab = await serialExec(4555, "grep efi /etc/fstab", 15_000);
|
||||
log(`Serial: fstab efi = ${fstab}`);
|
||||
} catch (serialErr) {
|
||||
log(`Serial console failed: ${serialErr instanceof Error ? serialErr.message : String(serialErr)}`);
|
||||
}
|
||||
throw new Error(`SSH not available on ${vmIp} — check serial console diagnostics above`);
|
||||
}
|
||||
|
||||
log("PXE provision test setup complete.");
|
||||
}, DISCOVERY_TIMEOUT_MS + INSTALL_TIMEOUT_MS + SSH_TIMEOUT_MS + 120_000); // total timeout
|
||||
|
||||
Reference in New Issue
Block a user