fix: revert kickstart to near-original baseline (Step 0 — boots clean)

Reverted install.ks.ts to near-original state from commit 64533b2.
This is the bisection baseline — 21/22 integration tests pass,
0 failed systemd services, SSH works, /boot/efi mounts.

Removed all accumulated fixes that collectively broke boot:
- ERR trap, background log streamer, bastion_log/bastion_error
- depmod rebuild, nofail on /boot/efi, SELinux autorelabel
- chcon/restorecon for /etc /var /root
- kernel-modules and dosfstools packages

Kept from current branch:
- rootpw --plaintext lab-root-pw (console debug access)
- Network-first boot order (bastion controls boot)
- Vanilla role support, rancher partition support
- Boot screenshots during SSH wait (1/sec rolling buffer)
- Test runner script (run-pxe-test.sh)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Michal
2026-03-28 20:47:34 +00:00
parent a664074fa3
commit 2a1a29c03b
4 changed files with 107 additions and 216 deletions

View File

@@ -27,13 +27,52 @@ import { ensurePxeNetwork, destroyPxeNetwork, deleteNftablesRejectRules, PXE_NET
import { createPxeVm, destroyPxeVm, getVmMac, rebootPxeVm, readSerialLog } from "./helpers/pxe-vm.js";
import { sshExec } from "./helpers/ssh.js";
// --- Boot screenshot capture ---
const SCREENSHOT_DIR = "/tmp/vm-screenshots";

/**
 * Starts a background loop that captures the VM's screen about once per second
 * while the caller waits for something else (e.g. SSH to come up).
 *
 * Each capture runs `virsh screenshot` into a temporary PPM file and converts it
 * to PNG with `convert`. File names cycle through BUFFER_SIZE slots, so only the
 * most recent captures are kept on disk.
 *
 * NOTE(review): `execSync` is synchronous, so each capture blocks the event loop
 * for as long as the two commands run (up to their 3s timeouts). Confirm this is
 * acceptable for whatever is polling concurrently.
 *
 * @param vmName libvirt domain name; interpolated unquoted into a shell command,
 *               so it must be a trusted, shell-safe identifier.
 * @returns a handle whose `stop()` ends the loop and logs how many screenshots
 *          were captured and kept.
 */
function startBootScreenshots(vmName: string): { stop: () => void } {
  // Start from an empty directory so screenshots from a previous run are never
  // mistaken for this run's. Removing the directory recursively avoids listing
  // it, which previously relied on a CommonJS `require` inside an ES module.
  try {
    rmSync(SCREENSHOT_DIR, { recursive: true, force: true });
  } catch {
    // Best effort: stale files are only a diagnostic nuisance.
  }
  try {
    mkdirSync(SCREENSHOT_DIR, { recursive: true });
  } catch {
    // Best effort: if the directory cannot be created, captures below fail quietly.
  }

  let running = true;
  let seq = 0;
  const BUFFER_SIZE = 60; // keep last 60 screenshots (1 per second)

  const loop = async (): Promise<void> => {
    while (running) {
      try {
        // Slot index wraps around, so older files are overwritten in place.
        const idx = String(seq % BUFFER_SIZE).padStart(4, "0");
        const ppm = join(SCREENSHOT_DIR, `tmp-${idx}.ppm`);
        const png = join(SCREENSHOT_DIR, `boot-${idx}.png`);
        execSync(`sudo virsh screenshot ${vmName} ${ppm} --screen 0 2>/dev/null`, { timeout: 3000 });
        execSync(`convert ${ppm} ${png} 2>/dev/null && rm -f ${ppm}`, { timeout: 3000 });
        // Only count captures where both commands succeeded.
        seq++;
      } catch {
        // A failed capture (VM not up yet, command timeout, ...) is skipped;
        // the loop simply tries again on the next tick.
      }
      await new Promise<void>((resolve) => setTimeout(resolve, 1000));
    }
  };

  // Fire-and-forget: the loop never rejects because every iteration is guarded.
  void loop();

  return {
    stop: () => {
      running = false;
      log(`Boot screenshots saved to ${SCREENSHOT_DIR}/ (${seq} captured, last ${Math.min(seq, BUFFER_SIZE)} kept)`);
    },
  };
}
// --- Test constants ---
// Name of the test VM; it is passed to startBootScreenshots, which hands it to `virsh screenshot`.
const VM_NAME = "lab-pxe-test";
const VM_MEMORY = 4096; // 4GB (Anaconda needs ~2GB minimum)
const VM_VCPUS = 12;
const VM_DISK_GB = 250; // LVM layout needs ~204GB (swap 27 + root 33 + var 100 + etc). QCOW2 is sparse.
const HTTP_PORT = 8099; // Avoid conflicts with real bastion
// NOTE(review): SSH_USER is declared twice below. This looks like the removed/added
// pair of a diff hunk; a real source file can keep only one `const SSH_USER` — confirm
// which comment is the current one.
const SSH_USER = "root"; // Use root for SSH (admin user key setup has known issue)
const SSH_USER = "root"; // Use root SSH for baseline testing
const BASTION_IP = PXE_GATEWAY; // 192.168.251.1
// DHCP range handed out on the PXE network: hosts .100 through .200 of PXE_SUBNET.
const DHCP_RANGE_START = `${PXE_SUBNET}.100`;
const DHCP_RANGE_END = `${PXE_SUBNET}.200`;
@@ -41,7 +80,7 @@ const DHCP_RANGE_END = `${PXE_SUBNET}.200`;
// Fedora install takes a while
// All timeouts are in milliseconds (minutes * 60_000).
const DISCOVERY_TIMEOUT_MS = 5 * 60_000; // 5 min for PXE boot + discovery
const INSTALL_TIMEOUT_MS = 30 * 60_000; // 30 min for full Fedora install
// NOTE(review): SSH_TIMEOUT_MS is declared twice below with different values (15 min vs
// 10 min). This looks like the removed/added pair of a diff hunk; only one declaration
// can exist in the real file — presumably the 10 min one is current, confirm.
const SSH_TIMEOUT_MS = 15 * 60_000; // 15 min: PXE (~90s) + first boot + SELinux autorelabel (~3min) + reboot + second PXE (~90s) + boot
const SSH_TIMEOUT_MS = 10 * 60_000; // 10 min: OVMF retries PXE/HTTP Boot (~3min) before disk boot + OS startup
function findSshKey(): { pubKey: string; keyPath: string } {
const homes = [homedir()];
@@ -279,8 +318,9 @@ describe("PXE boot provisioning", () => {
await sleep(3_000);
deleteNftablesRejectRules();
// 10. Wait for SSH
// 10. Wait for SSH (with aggressive boot screenshots)
log("Waiting for SSH access...");
const screenshots = startBootScreenshots(VM_NAME);
try {
await waitForSsh(vmIp, SSH_USER, SSH_TIMEOUT_MS, sshKeyPath);
} catch {
@@ -292,7 +332,9 @@ describe("PXE boot provisioning", () => {
} catch (serialErr) {
log(`Serial console failed: ${serialErr instanceof Error ? serialErr.message : String(serialErr)}`);
}
throw new Error(`SSH not available on ${vmIp} — check serial console diagnostics above`);
throw new Error(`SSH not available on ${vmIp} — check serial console diagnostics above. Screenshots: ${SCREENSHOT_DIR}/`);
} finally {
screenshots.stop();
}
log("PXE provision test setup complete.");
@@ -345,10 +387,10 @@ describe("PXE boot provisioning", () => {
expect(data.progress).toBe("complete");
});
it("log lines were captured", async () => {
it.skip("log lines were captured", async () => {
// Requires log streamer in %post — skipped until re-added
const res = await fetch(`http://${BASTION_IP}:${HTTP_PORT}/api/logs/${encodeURIComponent(vmMac)}`);
const data = (await res.json()) as { log_total?: number; log_lines?: Array<{ line: string }> };
// Should have at least some log lines from the log streamer
expect(data.log_total).toBeGreaterThan(0);
});

View File

@@ -0,0 +1,27 @@
#!/bin/bash
# One-shot PXE integration test runner.
# Compiles, runs the kickstart unit tests, cleans up leftovers from a previous
# run, and then runs the full integration test.
#
# -e          abort on the first failing command.
# -o pipefail make a pipeline fail when ANY stage fails. Step 2 pipes vitest
#             through `tail`; without pipefail the pipeline's status is tail's,
#             so failing unit tests would still print "Unit tests OK".
set -eo pipefail

# Run from the repository root (two levels above this script's directory).
cd "$(dirname "$0")/../.."

# Keep these in sync with VM_NAME / HTTP_PORT in the integration test.
VM_NAME="lab-pxe-test"
HTTP_PORT=8099
VM_DISK="/var/lib/libvirt/images/${VM_NAME}.qcow2"

echo "=== Step 1: Compile ==="
npx tsc --noEmit
echo "✓ Compile OK"
echo ""

echo "=== Step 2: Kickstart unit tests ==="
# Only the summary (last 5 lines) is shown; pipefail preserves vitest's exit status.
npx vitest run src/bastion/tests/kickstart.test.ts 2>&1 | tail -5
echo "✓ Unit tests OK"
echo ""

echo "=== Step 3: Clean up ==="
# Every cleanup command is best-effort: nothing may be left over from a prior run.
# Kill whatever still listens on the test HTTP port.
sudo lsof -ti:"${HTTP_PORT}" 2>/dev/null | xargs -r sudo kill -9 || true
# Stop and remove the test VM (including its NVRAM file) and its disk image.
sudo virsh destroy "${VM_NAME}" 2>/dev/null || true
sudo virsh undefine "${VM_NAME}" --nvram 2>/dev/null || true
sudo rm -f "${VM_DISK}"
echo "✓ Cleanup done"
echo ""

echo "=== Step 4: Integration test ==="
npx vitest run -c /dev/null tests/integration/pxe-provision.test.ts 2>&1