fix: PXE boot debugging — bisect root cause, syslog logging, serial console #3

Merged
michal merged 31 commits from wip/ks-debugging into main 2026-03-29 00:50:05 +00:00
4 changed files with 107 additions and 216 deletions
Showing only changes of commit 2a1a29c03b - Show all commits

View File

@@ -74,9 +74,8 @@ cp /root/.ssh/authorized_keys "$ADMIN_HOME/.ssh/authorized_keys"
chown -R ${adminUser}:${adminUser} "$ADMIN_HOME/.ssh"
chmod 600 "$ADMIN_HOME/.ssh/authorized_keys"
# Fix SELinux contexts for SSH (restorecon may not work in Anaconda chroot, use chcon)
chcon -R -t ssh_home_t /root/.ssh "$ADMIN_HOME/.ssh" 2>/dev/null || true
chcon -t user_home_dir_t "$ADMIN_HOME" 2>/dev/null || true
# Fix SELinux contexts for SSH
restorecon -R /root/.ssh "$ADMIN_HOME/.ssh" 2>/dev/null || true
# Passwordless sudo for ${adminUser}
echo '${adminUser} ALL=(ALL) NOPASSWD: ALL' > /etc/sudoers.d/${adminUser}
@@ -102,48 +101,6 @@ done
? `logvol /var/lib/rancher --vgname=${vg} --name=rancher --fstype=xfs --size=20480`
: "";
// Helper: the bastion callback functions used in both %pre and %post.
// Defined as a template so each section gets its own copy (they run in different shells).
const bastionHelpers = `
# Detect MAC address (first real ethernet MAC, skip loopback/veth)
_BASTION_MAC=$(ip link show | awk '/ether/ && !/00:00:00:00/ {print $2; exit}')
_BASTION_URL="http://${serverIp}:${httpPort}"
# Send a structured progress stage to bastion
bastion_progress() {
local stage="$1" detail="\${2:-}"
curl -sf -X POST "\${_BASTION_URL}/api/progress" \\
-H "Content-Type: application/json" \\
-d "{\\"mac\\":\\"$_BASTION_MAC\\",\\"stage\\":\\"$stage\\",\\"detail\\":\\"$detail\\"}" \\
--connect-timeout 5 --max-time 10 2>/dev/null || true
}
# Send log lines to bastion (batched)
bastion_log() {
local line="$1"
curl -sf -X POST "\${_BASTION_URL}/api/log" \\
-H "Content-Type: application/json" \\
-d "{\\"mac\\":\\"$_BASTION_MAC\\",\\"line\\":\\"$(echo "$line" | sed 's/\\\\/\\\\\\\\/g; s/"/\\\\"/g')\\"}\" \\
--connect-timeout 5 --max-time 10 2>/dev/null || true
}
# Send an error stage to bastion with context
bastion_error() {
local detail="$1"
bastion_progress "error" "$detail"
# Also send the last 50 lines of any log file as context
for logfile in /root/bastion-post-install.log /tmp/pre-partition.log; do
if [ -f "$logfile" ]; then
local tail_content
tail_content=$(tail -50 "$logfile" 2>/dev/null | sed 's/\\\\/\\\\\\\\/g; s/"/\\\\"/g; s/$/\\\\n/' | tr -d '\\n')
curl -sf -X POST "\${_BASTION_URL}/api/log" \\
-H "Content-Type: application/json" \\
-d "{\\"mac\\":\\"$_BASTION_MAC\\",\\"lines\\":[\\"--- $logfile (last 50 lines) ---\\"],\\"tail\\":\\"$tail_content\\"}" \\
--connect-timeout 5 --max-time 10 2>/dev/null || true
fi
done
}`;
return `# Lab Bastion -- Fedora ${fedoraVersion} server install
# Generated: ${now}
# Target: ${fqdn} (role=${role})
@@ -170,25 +127,27 @@ url --mirrorlist=https://mirrors.fedoraproject.org/mirrorlist?repo=fedora-$relea
%pre --log=/tmp/pre-partition.log
#!/bin/bash
set -x
${bastionHelpers}
# Error trap: report failures back to bastion
trap 'bastion_error "%pre failed at line $LINENO: $(tail -1 /tmp/pre-partition.log 2>/dev/null)"' ERR
# Progress callback helper
bastion_progress() {
local stage="$1" detail="\${2:-}"
local mac=$(ip link show | awk '/ether/ && !/00:00:00:00/ {print $2; exit}')
curl -sf -X POST "http://${serverIp}:${httpPort}/api/progress" \\
-H "Content-Type: application/json" \\
-d "{\\"mac\\":\\"$mac\\",\\"stage\\":\\"$stage\\",\\"detail\\":\\"$detail\\"}" 2>/dev/null || true
}
bastion_progress "partitioning" "detecting disk"
VG="${vg}"
${diskLine}
bastion_log "disk detected: $DISK"
REPROVISION=no
# Check if VG exists (reprovision scenario)
if vgs $VG &>/dev/null; then
echo "=== Existing VG found - reprovision mode ==="
REPROVISION=yes
bastion_progress "partitioning" "reprovision mode -- preserving data volumes"
# Detect which data LVs to preserve
PRESERVE_LONGHORN=no; PRESERVE_SRV=no; PRESERVE_HOME=no; PRESERVE_RANCHER=no
@@ -198,7 +157,6 @@ if vgs $VG &>/dev/null; then
lvs $VG/rancher &>/dev/null && PRESERVE_RANCHER=yes
echo "Preserving: longhorn=$PRESERVE_LONGHORN srv=$PRESERVE_SRV home=$PRESERVE_HOME rancher=$PRESERVE_RANCHER"
bastion_log "preserving LVs: longhorn=$PRESERVE_LONGHORN srv=$PRESERVE_SRV home=$PRESERVE_HOME rancher=$PRESERVE_RANCHER"
# Remove only OS logical volumes (keep data LVs)
for lv in root var varlog swap; do
@@ -275,13 +233,11 @@ cat /tmp/part.ks
echo "==================================="
bastion_progress "partitioning" "disk layout ready"
bastion_log "partition config written to /tmp/part.ks"
%end
%packages
@core
kernel-modules
openssh-server
vim-enhanced
tmux
@@ -331,106 +287,38 @@ ruby-libs
-gdm
-PackageKit
-PackageKit-glib
dosfstools
%end
%post --log=/root/bastion-post-install.log
#!/bin/bash
set -x
${bastionHelpers}
# --- Error trap: catch any failure and report to bastion ---
_post_error_handler() {
local exit_code=$? lineno=$1
bastion_error "%post failed at line $lineno (exit $exit_code)"
}
trap '_post_error_handler $LINENO' ERR
# --- Background log streamer: sends %post output to bastion in real-time ---
_LOG_FILE=/root/bastion-post-install.log
_LOG_STREAMER_PID=""
(
# Wait for the log file to exist
while [ ! -f "$_LOG_FILE" ]; do sleep 1; done
# Tail and batch-send lines every 3 seconds
_batch=""
_count=0
tail -f "$_LOG_FILE" 2>/dev/null | while IFS= read -r _line; do
# Escape for JSON
_escaped=$(echo "$_line" | sed 's/\\\\/\\\\\\\\/g; s/"/\\\\"/g; s/\\t/\\\\t/g')
if [ -z "$_batch" ]; then
_batch="\\"$_escaped\\""
else
_batch="$_batch,\\"$_escaped\\""
fi
_count=$((_count + 1))
# Send batch every 10 lines
if [ "$_count" -ge 10 ]; then
curl -sf -X POST "\${_BASTION_URL}/api/log" \\
-H "Content-Type: application/json" \\
-d "{\\"mac\\":\\"$_BASTION_MAC\\",\\"lines\\":[$_batch]}" \\
--connect-timeout 5 --max-time 10 2>/dev/null || true
_batch=""
_count=0
fi
done
) &
_LOG_STREAMER_PID=$!
# Flush remaining log lines helper
_flush_log_streamer() {
if [ -n "$_LOG_STREAMER_PID" ]; then
kill "$_LOG_STREAMER_PID" 2>/dev/null || true
wait "$_LOG_STREAMER_PID" 2>/dev/null || true
fi
# Send any remaining lines from the log
if [ -f "$_LOG_FILE" ]; then
local remaining
remaining=$(tail -20 "$_LOG_FILE" 2>/dev/null | sed 's/\\\\/\\\\\\\\/g; s/"/\\\\"/g; s/\\t/\\\\t/g; s/^/"/; s/$/"/' | paste -sd, -)
if [ -n "$remaining" ]; then
curl -sf -X POST "\${_BASTION_URL}/api/log" \\
-H "Content-Type: application/json" \\
-d "{\\"mac\\":\\"$_BASTION_MAC\\",\\"lines\\":[$remaining]}" \\
--connect-timeout 5 --max-time 10 2>/dev/null || true
fi
fi
# Progress callback helper
bastion_progress() {
local stage="$1" detail="\${2:-}"
local mac=$(ip link show | awk '/ether/ && !/00:00:00:00/ {print $2; exit}')
curl -sf -X POST "http://${serverIp}:${httpPort}/api/progress" \\
-H "Content-Type: application/json" \\
-d "{\\"mac\\":\\"$mac\\",\\"stage\\":\\"$stage\\",\\"detail\\":\\"$detail\\"}" 2>/dev/null || true
}
bastion_progress "installing" "packages installed, starting post-install"
bastion_progress "post-install" "configuring system"
# -- SSH --
bastion_progress "post-install" "configuring SSH"
systemctl enable sshd
systemctl enable --now sshd
sed -i 's/^#\\?PermitRootLogin.*/PermitRootLogin prohibit-password/' /etc/ssh/sshd_config
sed -i 's/^#\\?PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config
${sshPostBlock}
bastion_log "SSH configured: root login by key only, password auth disabled"
# -- Hostname and domain --
bastion_progress "post-install" "setting hostname to ${fqdn}"
hostnamectl set-hostname ${fqdn}
# -- Rebuild module dependencies (Anaconda's depmod runs against host kernel, not installed kernel) --
INSTALLED_KVER=$(ls /lib/modules/ | grep -v "$(uname -r)" | head -1)
if [ -n "$INSTALLED_KVER" ]; then
depmod -a "$INSTALLED_KVER"
bastion_log "depmod rebuilt for kernel $INSTALLED_KVER"
fi
# Make /boot/efi mount non-fatal — on first boot SELinux labels on kernel module
# files are wrong (Anaconda chroot issue), so vfat may fail to load.
# autorelabel fixes labels and reboots; second boot mounts /boot/efi normally.
sed -i '/boot\\/efi/ s/defaults/defaults,nofail/' /etc/fstab
bastion_log "fstab /boot/efi set to nofail"
# -- tmpfs for /tmp --
echo "tmpfs /tmp tmpfs defaults,noatime,nosuid,nodev,size=4G 0 0" >> /etc/fstab
${isVanilla ? `# -- vanilla role: skip k3s kernel/sysctl/firewall setup --
bastion_progress "post-install" "vanilla role -- skipping k3s setup"
# -- Enable chronyd for time sync --
systemctl enable chronyd || true` : `# -- Kernel modules for k3s --
bastion_progress "post-install" "loading k3s kernel modules"
cat > /etc/modules-load.d/k3s.conf << 'MODULES'
br_netfilter
overlay
@@ -440,7 +328,6 @@ modprobe br_netfilter || true
modprobe overlay || true
# -- Sysctl for k3s networking --
bastion_progress "post-install" "configuring k3s sysctl"
cat > /etc/sysctl.d/90-k3s.conf << 'SYSCTL'
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
@@ -452,48 +339,30 @@ SYSCTL
sysctl --system || true
# -- Disable firewalld permanently (k3s/Cilium manage iptables directly) --
bastion_progress "post-install" "disabling firewalld"
# Must be masked to prevent re-enable on updates
systemctl disable --now firewalld || true
systemctl mask firewalld || true
# -- Enable chronyd for time sync --
systemctl enable chronyd || true`}
# -- Serial console (for debugging — auto-login as root on ttyS0) --
systemctl enable serial-getty@ttyS0.service || true
# -- Boot order: restore network first (Anaconda sets disk first, we undo it) --
# Network boot must stay first so the bastion intercepts every reboot. It returns
# exit (local disk) for installed machines, or install for reinstalls.
bastion_progress "post-install" "restoring network-first boot order"
# Network boot must stay first so the bastion intercepts every reboot.
if command -v efibootmgr >/dev/null 2>&1; then
# Find network/PXE/HTTP boot entries (OVMF uses HTTPv4, real hardware uses PXE/Network)
PXE_ENTRY=$(efibootmgr | grep -iE 'network|pxe|ipv4|ipv6|http' | head -1 | grep -oP 'Boot\\K[0-9A-F]+')
if [ -n "$PXE_ENTRY" ]; then
CURRENT_ORDER=$(efibootmgr | grep BootOrder | cut -d: -f2 | tr -d ' ')
# Move PXE entry to front
REST=$(echo "$CURRENT_ORDER" | sed "s/$PXE_ENTRY,\\\\?//;s/,$//" | sed 's/^,//')
NEW_ORDER="$PXE_ENTRY,$REST"
efibootmgr -o "$NEW_ORDER" || true
bastion_log "boot order set: network first ($NEW_ORDER)"
else
bastion_log "no PXE boot entry found, boot order unchanged"
fi
else
bastion_log "efibootmgr not available"
fi
# -- Provisioning metadata --
bastion_progress "post-install" "writing provisioning metadata"
IP_ADDR=$(ip -4 addr show | awk '/inet / && !/127.0.0/ {split($2,a,"/"); print a[1]; exit}')
cat > /etc/lab-provisioned << PROVEOF
hostname: ${fqdn}
role: ${role}
provisioned: $(date -Iseconds)
bastion: ${serverIp}
ip: $IP_ADDR
PROVEOF
cat > /root/README << 'README'
@@ -511,17 +380,9 @@ cat > /root/README << 'README'
README
${hasRancher ? `# Install k3s server (skip start - will be configured manually)
bastion_progress "post-install" "pre-installing k3s server"
curl -sfL https://get.k3s.io | INSTALL_K3S_SKIP_START=true sh -
bastion_log "k3s server pre-installed (not started)"
` : ""}
# -- Fix SELinux labels (Anaconda %post creates files with wrong contexts) --
restorecon -R /etc /var /root 2>/dev/null || true
bastion_log "SELinux contexts restored for /etc /var /root"
# Stop log streamer and flush remaining lines
_flush_log_streamer
IP_ADDR=$(ip -4 addr show | awk '/inet / && !/127.0.0/ {split($2,a,"/"); print a[1]; exit}')
bastion_progress "complete" "ready at $IP_ADDR"
%end

View File

@@ -91,9 +91,8 @@ describe("renderInstallKickstart", () => {
serverIp: "10.0.0.5",
httpPort: 9090,
}));
expect(ks).toContain('_BASTION_URL="http://10.0.0.5:9090"');
expect(ks).toContain("http://10.0.0.5:9090");
expect(ks).toContain("/api/progress");
expect(ks).toContain("/api/log");
});
it("infra role has /var/lib/rancher partition", () => {
@@ -141,52 +140,18 @@ describe("renderInstallKickstart", () => {
expect(ks).toContain("--name=swap --fstype=swap --size=27648");
});
it("%pre has error trap", () => {
const ks = renderInstallKickstart(baseParams());
expect(ks).toContain("trap");
expect(ks).toContain("bastion_error");
expect(ks).toContain("%pre failed");
});
it("%post has error trap", () => {
const ks = renderInstallKickstart(baseParams());
expect(ks).toContain("_post_error_handler");
expect(ks).toContain("%post failed");
});
it("has granular progress stages in %post", () => {
const ks = renderInstallKickstart(baseParams());
expect(ks).toContain('"configuring SSH"');
expect(ks).toContain('"setting hostname');
expect(ks).toContain('"writing provisioning metadata"');
expect(ks).toContain('"writing provisioning metadata"');
});
it("has background log streamer in %post", () => {
const ks = renderInstallKickstart(baseParams());
expect(ks).toContain("_LOG_STREAMER_PID");
expect(ks).toContain("_flush_log_streamer");
expect(ks).toContain("tail -f");
});
it("has bastion_log function for sending log lines", () => {
const ks = renderInstallKickstart(baseParams());
expect(ks).toContain("bastion_log()");
expect(ks).toContain("/api/log");
});
it("vanilla role skips k3s progress stages", () => {
it("vanilla role skips k3s setup", () => {
const ks = renderInstallKickstart(baseParams({ role: "vanilla" }));
expect(ks).toContain("vanilla role");
expect(ks).not.toContain('"loading k3s kernel modules"');
expect(ks).not.toContain('"disabling firewalld"');
expect(ks).not.toContain("modules-load.d/k3s.conf");
expect(ks).not.toContain("firewalld");
});
it("worker role has k3s-related progress stages", () => {
it("worker role has k3s setup", () => {
const ks = renderInstallKickstart(baseParams({ role: "worker" }));
expect(ks).toContain('"loading k3s kernel modules"');
expect(ks).toContain('"configuring k3s sysctl"');
expect(ks).toContain('"disabling firewalld"');
expect(ks).toContain("modules-load.d/k3s.conf");
expect(ks).toContain("sysctl.d/90-k3s.conf");
expect(ks).toContain("firewalld");
});
it("kickstart syntax: no merged partition lines", () => {
@@ -211,13 +176,9 @@ describe("renderInstallKickstart", () => {
expect(ends, `${sections} sections but ${ends} %end markers`).toBe(sections);
});
it("kernel-modules package is included", () => {
it("has complete progress stage", () => {
const ks = renderInstallKickstart(baseParams());
expect(ks).toContain("kernel-modules");
});
it("dosfstools package is included", () => {
const ks = renderInstallKickstart(baseParams());
expect(ks).toContain("dosfstools");
expect(ks).toContain('"complete"');
expect(ks).toContain("ready at");
});
});

View File

@@ -27,13 +27,52 @@ import { ensurePxeNetwork, destroyPxeNetwork, deleteNftablesRejectRules, PXE_NET
import { createPxeVm, destroyPxeVm, getVmMac, rebootPxeVm, readSerialLog } from "./helpers/pxe-vm.js";
import { sshExec } from "./helpers/ssh.js";
// --- Boot screenshot capture ---
const SCREENSHOT_DIR = "/tmp/vm-screenshots";
function startBootScreenshots(vmName: string): { stop: () => void } {
try { mkdirSync(SCREENSHOT_DIR, { recursive: true }); } catch {}
// Clean old screenshots
try {
for (const f of require("node:fs").readdirSync(SCREENSHOT_DIR)) {
rmSync(join(SCREENSHOT_DIR, f), { force: true });
}
} catch {}
let running = true;
let seq = 0;
const BUFFER_SIZE = 60; // keep last 60 screenshots (1 per second)
const loop = async () => {
while (running) {
try {
const idx = String(seq % BUFFER_SIZE).padStart(4, "0");
const ppm = join(SCREENSHOT_DIR, `tmp-${idx}.ppm`);
const png = join(SCREENSHOT_DIR, `boot-${idx}.png`);
execSync(`sudo virsh screenshot ${vmName} ${ppm} --screen 0 2>/dev/null`, { timeout: 3000 });
execSync(`convert ${ppm} ${png} 2>/dev/null && rm -f ${ppm}`, { timeout: 3000 });
seq++;
} catch {}
await new Promise(r => setTimeout(r, 1000));
}
};
loop();
return {
stop: () => {
running = false;
log(`Boot screenshots saved to ${SCREENSHOT_DIR}/ (${seq} captured, last ${Math.min(seq, BUFFER_SIZE)} kept)`);
},
};
}
// --- Test constants ---
const VM_NAME = "lab-pxe-test";
const VM_MEMORY = 4096; // 4GB (Anaconda needs ~2GB minimum)
const VM_VCPUS = 12;
const VM_DISK_GB = 250; // LVM layout needs ~204GB (swap 27 + root 33 + var 100 + etc). QCOW2 is sparse.
const HTTP_PORT = 8099; // Avoid conflicts with real bastion
const SSH_USER = "root"; // Use root for SSH (admin user key setup has known issue)
const SSH_USER = "root"; // Use root SSH for baseline testing
const BASTION_IP = PXE_GATEWAY; // 192.168.251.1
const DHCP_RANGE_START = `${PXE_SUBNET}.100`;
const DHCP_RANGE_END = `${PXE_SUBNET}.200`;
@@ -41,7 +80,7 @@ const DHCP_RANGE_END = `${PXE_SUBNET}.200`;
// Fedora install takes a while
const DISCOVERY_TIMEOUT_MS = 5 * 60_000; // 5 min for PXE boot + discovery
const INSTALL_TIMEOUT_MS = 30 * 60_000; // 30 min for full Fedora install
const SSH_TIMEOUT_MS = 15 * 60_000; // 15 min: PXE (~90s) + first boot + SELinux autorelabel (~3min) + reboot + second PXE (~90s) + boot
const SSH_TIMEOUT_MS = 10 * 60_000; // 10 min: OVMF retries PXE/HTTP Boot (~3min) before disk boot + OS startup
function findSshKey(): { pubKey: string; keyPath: string } {
const homes = [homedir()];
@@ -279,8 +318,9 @@ describe("PXE boot provisioning", () => {
await sleep(3_000);
deleteNftablesRejectRules();
// 10. Wait for SSH
// 10. Wait for SSH (with aggressive boot screenshots)
log("Waiting for SSH access...");
const screenshots = startBootScreenshots(VM_NAME);
try {
await waitForSsh(vmIp, SSH_USER, SSH_TIMEOUT_MS, sshKeyPath);
} catch {
@@ -292,7 +332,9 @@ describe("PXE boot provisioning", () => {
} catch (serialErr) {
log(`Serial console failed: ${serialErr instanceof Error ? serialErr.message : String(serialErr)}`);
}
throw new Error(`SSH not available on ${vmIp} — check serial console diagnostics above`);
throw new Error(`SSH not available on ${vmIp} — check serial console diagnostics above. Screenshots: ${SCREENSHOT_DIR}/`);
} finally {
screenshots.stop();
}
log("PXE provision test setup complete.");
@@ -345,10 +387,10 @@ describe("PXE boot provisioning", () => {
expect(data.progress).toBe("complete");
});
it("log lines were captured", async () => {
it.skip("log lines were captured", async () => {
// Requires log streamer in %post — skipped until re-added
const res = await fetch(`http://${BASTION_IP}:${HTTP_PORT}/api/logs/${encodeURIComponent(vmMac)}`);
const data = (await res.json()) as { log_total?: number; log_lines?: Array<{ line: string }> };
// Should have at least some log lines from the log streamer
expect(data.log_total).toBeGreaterThan(0);
});

View File

@@ -0,0 +1,27 @@
#!/bin/bash
# One-shot PXE integration test runner.
# Compiles, runs unit tests, cleans up, and runs the full integration test.
set -e
cd "$(dirname "$0")/../.."
echo "=== Step 1: Compile ==="
npx tsc --noEmit
echo "✓ Compile OK"
echo ""
echo "=== Step 2: Kickstart unit tests ==="
npx vitest run src/bastion/tests/kickstart.test.ts 2>&1 | tail -5
echo "✓ Unit tests OK"
echo ""
echo "=== Step 3: Clean up ==="
sudo lsof -ti:8099 2>/dev/null | xargs -r sudo kill -9 || true
sudo virsh destroy lab-pxe-test 2>/dev/null || true
sudo virsh undefine lab-pxe-test --nvram 2>/dev/null || true
sudo rm -f /var/lib/libvirt/images/lab-pxe-test.qcow2
echo "✓ Cleanup done"
echo ""
echo "=== Step 4: Integration test ==="
npx vitest run -c /dev/null tests/integration/pxe-provision.test.ts 2>&1