feat: PXE debug boot mode for rescue/diagnostics
Some checks failed
CI/CD / lint (pull_request) Failing after 11s
CI/CD / test (pull_request) Failing after 9s
CI/CD / typecheck (pull_request) Failing after 22s
CI/CD / build (pull_request) Has been skipped
CI/CD / publish-rpm (pull_request) Has been skipped
CI/CD / publish-deb (pull_request) Has been skipped

New `labctl provision debug <target>` command that PXE boots a machine
into Fedora rescue mode (inst.rescue) for live debugging. The debug flag
is cleared as soon as it has been served once, so the next reboot returns
to the normal boot path.

Adds debug state to BastionState, dispatch routing, API endpoints,
labd command routing, and CLI with rescue workflow guide.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Michal
2026-03-29 22:25:44 +01:00
parent 6c6d5763c4
commit e87edfcfbd
18 changed files with 368 additions and 59 deletions

View File

@@ -266,6 +266,21 @@ export async function startBastion(overrides: Partial<BastionConfig> = {}): Prom
return { status: "ok", data: { mac: msg.mac, hostname: msg.hostname } };
});
// Handle "command-debug" from labd: flag a machine so that its next PXE boot
// is served the debug/rescue script. Replies with the normalized MAC and the
// label that was stored for it.
labdConn.onCommand("command-debug", async (msg) => {
  if (msg.type !== "command-debug") throw new Error("unexpected");
  // Normalize like the HTTP entry points do (lower-case, ':' separators).
  // /dispatch looks the entry up under that spelling, so a hyphenated key
  // would leave a debug flag that is never matched.
  const mac = msg.mac.toLowerCase().replace(/-/g, ":");
  const currentState = state.load();
  // Best available label: installed hostname, queued hostname, discovered
  // product string, and finally the MAC itself.
  const hostname =
    currentState.installed[mac]?.hostname ??
    currentState.install_queue[mac]?.hostname ??
    currentState.discovered[mac]?.product ??
    mac;
  state.update((s) => {
    s.debug[mac] = { hostname, queued_at: new Date().toISOString() };
  });
  return { status: "ok", data: { mac, hostname } };
});
labdConn.onCommand("command-forget", async (msg) => {
if (msg.type !== "command-forget") throw new Error("unexpected");
const mac = msg.mac.toLowerCase();
@@ -273,6 +288,7 @@ export async function startBastion(overrides: Partial<BastionConfig> = {}): Prom
delete s.discovered[mac];
delete s.install_queue[mac];
delete s.installed[mac];
delete s.debug[mac];
});
return { status: "ok", data: { mac } };
});

View File

@@ -189,6 +189,31 @@ export function registerApiRoutes(
return reply.send({ status: "ok", lines: allLines.length });
});
// Queue debug/rescue mode for a machine.
// Body: { mac } - any case, ':' or '-' separators.
// Replies 400 when `mac` is missing, blank, or not a string; otherwise stores a
// debug entry under the normalized MAC and replies { status, mac, hostname }.
app.post<{
  Body: { mac?: string };
}>("/api/debug", async (request, reply) => {
  // The Body type above is compile-time only and no schema is attached to this
  // route here, so `mac` may arrive as a number, array, or object. Check the
  // type before calling string methods so bad input is a 400, not a thrown TypeError.
  const rawMac: unknown = request.body?.mac;
  const mac = typeof rawMac === "string"
    ? rawMac.trim().toLowerCase().replace(/-/g, ":")
    : "";
  if (mac === "") {
    return reply.status(400).send({ error: "mac is required" });
  }
  // Look up a label: installed hostname, queued hostname, discovered product, else the MAC.
  const currentState = state.load();
  const hostname =
    currentState.installed[mac]?.hostname ??
    currentState.install_queue[mac]?.hostname ??
    currentState.discovered[mac]?.product ??
    mac;
  state.update((s) => {
    s.debug[mac] = { hostname, queued_at: new Date().toISOString() };
  });
  logger.info(`DEBUG QUEUED: ${mac} -> ${hostname}`);
  return reply.send({ status: "ok", mac, hostname });
});
// Delete a machine from all state
app.delete<{
Params: { mac: string };
@@ -213,6 +238,10 @@ export function registerApiRoutes(
delete s.installed[mac];
found = true;
}
if (s.debug[mac] !== undefined) {
delete s.debug[mac];
found = true;
}
});
if (!found) {

View File

@@ -10,9 +10,11 @@ import type { StateManager } from "../services/state.js";
import {
renderDiscoverIpxe,
renderInstallIpxe,
renderDebugIpxe,
renderLocalBootIpxe,
} from "../templates/boot.ipxe.js";
import { renderUbuntuInstallIpxe } from "../templates/ubuntu-boot.ipxe.js";
import { renderDebugKickstart } from "../templates/debug.ks.js";
import { logger } from "../services/logger.js";
export function registerDispatchRoutes(
@@ -20,10 +22,34 @@ export function registerDispatchRoutes(
config: BastionConfig,
state: StateManager,
): void {
// Debug/rescue kickstart endpoint. The response is plain text rendered from the
// configured SSH keys (none when the config has no `sshKeys`); the `mac` query
// parameter is accepted but not read.
app.get<{ Querystring: { mac?: string } }>("/debug.ks", async (_request, reply) => {
  const sshKeys = config.sshKeys ?? [];
  const kickstartText = renderDebugKickstart({ sshKeys });
  reply.type("text/plain");
  return reply.send(kickstartText);
});
app.get<{ Querystring: { mac?: string } }>("/dispatch", async (request, reply) => {
const mac = (request.query.mac ?? "").toLowerCase().replace(/-/g, ":");
const currentState = state.load();
// Debug mode takes highest priority — auto-clear after serving once
const debugEntry = currentState.debug[mac];
if (debugEntry) {
const hostname = debugEntry.hostname ?? "debug";
logger.info(`DEBUG BOOT: ${mac} -> ${hostname} (rescue mode)`);
state.update((s) => { delete s.debug[mac]; });
const script = renderDebugIpxe({
mac,
hostname,
serverIp: config.serverIp,
httpPort: config.httpPort,
fedoraMirror: config.fedoraMirror,
});
return reply.type("text/plain").send(script);
}
const queueEntry = currentState.install_queue[mac];
if (queueEntry) {
const hostname = queueEntry.hostname ?? "lab-node";

View File

@@ -11,6 +11,7 @@ const EMPTY_STATE: BastionState = {
discovered: {},
install_queue: {},
installed: {},
debug: {},
};
export type StateChangeListener = (state: BastionState) => void;
@@ -33,6 +34,7 @@ export class StateManager {
discovered: parsed.discovered ?? {},
install_queue: parsed.install_queue ?? {},
installed: parsed.installed ?? {},
debug: parsed.debug ?? {},
};
} catch {
return { ...EMPTY_STATE };

View File

@@ -75,6 +75,33 @@ boot
`;
}
/**
 * Build the iPXE script used for debug/rescue boots.
 *
 * The script prints a banner naming the target, then loads `vmlinuz` and
 * `initrd.img` from the bastion HTTP server. The kernel command line carries
 * `inst.rescue inst.text inst.sshd`, a kickstart URL pointing at
 * `/debug.ks?mac=<mac>` on the same server, and `inst.stage2` set to the
 * Fedora mirror.
 *
 * @param params.mac MAC address shown in the banner and appended to the kickstart URL.
 * @param params.hostname Label shown in the banner.
 * @param params.serverIp Host part of the bastion HTTP URLs.
 * @param params.httpPort Port part of the bastion HTTP URLs.
 * @param params.fedoraMirror Value passed verbatim as `inst.stage2`.
 * @returns The full script text, one command per line, ending with a newline.
 */
export function renderDebugIpxe(params: {
  mac: string;
  hostname: string;
  serverIp: string;
  httpPort: number;
  fedoraMirror: string;
}): string {
  const { mac, hostname, serverIp, httpPort, fedoraMirror } = params;
  const baseUrl = `http://${serverIp}:${httpPort}`;
  const rule = "echo =============================================";
  const kernelArgs = [
    "inst.rescue",
    "inst.text",
    "inst.sshd",
    `inst.ks=${baseUrl}/debug.ks?mac=${mac}`,
    `inst.stage2=${fedoraMirror}`,
  ].join(" ");
  const scriptLines = [
    "#!ipxe",
    "echo",
    rule,
    "echo Lab PXE Bastion - DEBUG/RESCUE MODE",
    `echo Target: ${hostname}`,
    `echo MAC: ${mac}`,
    rule,
    "echo",
    `kernel ${baseUrl}/vmlinuz ${kernelArgs}`,
    `initrd ${baseUrl}/initrd.img`,
    "boot",
  ];
  return `${scriptLines.join("\n")}\n`;
}
/**
* iPXE script for already-installed machines -- exits to boot from local disk.
*/

View File

@@ -0,0 +1,25 @@
// Debug/rescue kickstart template.
// Minimal: sets SSH access and network for Anaconda rescue mode.
// No disk operations, no packages, no %post.
/** Inputs for the debug/rescue kickstart. */
export interface DebugKickstartParams {
  /** Public SSH keys to authorize for root. Blank entries are ignored. */
  sshKeys: string[];
}

/**
 * Render the minimal kickstart served to machines booting in debug/rescue mode.
 *
 * The output sets language, keyboard and DHCP networking, a root `sshpw` line,
 * and one `sshkey --username=root "<key>"` line per configured key. Keys are
 * trimmed first (a key read from a `.pub` file usually ends with a newline,
 * which would otherwise split the quoted directive across two lines) and
 * blank entries are dropped. With no usable keys the key section is an empty line.
 *
 * @param params Kickstart inputs; see {@link DebugKickstartParams}.
 * @returns The kickstart text, ending with a newline.
 */
export function renderDebugKickstart(params: DebugKickstartParams): string {
  // NOTE(review): the root password is a fixed plaintext literal baked into the
  // template - consider making it configurable.
  const sshpw = "sshpw --username=root --plaintext lab-root-pw";
  // NOTE(review): the kickstart reference describes `sshkey` as acting on the
  // installed system and `sshpw` as controlling installer-environment accounts;
  // confirm these keys actually reach the rescue environment's sshd.
  const sshkeyLines = params.sshKeys
    .map((key) => key.trim())
    .filter((key) => key !== "")
    .map((key) => `sshkey --username=root "${key}"`)
    .join("\n");
  return `# Lab Bastion -- Debug/Rescue Kickstart
# Minimal: only SSH + network for Anaconda rescue mode
lang en_US.UTF-8
keyboard uk
network --bootproto=dhcp --activate
${sshpw}
${sshkeyLines}
`;
}

View File

@@ -322,39 +322,20 @@ bastion_progress() {
-d "{\\"mac\\":\\"$mac\\",\\"stage\\":\\"$stage\\",\\"detail\\":\\"$detail\\"}" 2>/dev/null || true
}
# Send log lines to bastion
bastion_log() {
local line="$1"
local mac=$(ip link show | awk '/ether/ && !/00:00:00:00/ {print $2; exit}')
curl -sf -X POST "http://${serverIp}:${httpPort}/api/log" \\
-H "Content-Type: application/json" \\
-d "{\\"mac\\":\\"$mac\\",\\"line\\":\\"$(echo "$line" | sed 's/\\\\/\\\\\\\\/g; s/"/\\\\"/g')\\"}\" \\
--connect-timeout 5 --max-time 10 2>/dev/null || true
}
# Send an error stage to bastion
bastion_error() {
local detail="$1"
bastion_progress "error" "$detail"
}
# --- Error trap: catch any failure and report to bastion ---
_post_error_handler() {
local exit_code=$? lineno=$1
bastion_error "%post failed at line $lineno (exit $exit_code)"
}
trap '_post_error_handler $LINENO' ERR
bastion_progress "post-install" "configuring system"
# -- SSH --
systemctl enable --now sshd
# Note: only 'enable', not '--now' — systemd is not running in the Anaconda chroot
systemctl enable sshd || true
sed -i 's/^#\\?PermitRootLogin.*/PermitRootLogin prohibit-password/' /etc/ssh/sshd_config
sed -i 's/^#\\?PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config
${sshPostBlock}
# -- Hostname and domain --
hostnamectl set-hostname ${fqdn}
bastion_progress "post-install" "1-ssh done"
# -- Hostname and domain (write directly, hostnamectl needs D-Bus) --
echo "${fqdn}" > /etc/hostname
# -- tmpfs for /tmp --
echo "tmpfs /tmp tmpfs defaults,noatime,nosuid,nodev,size=4G 0 0" >> /etc/fstab
@@ -392,12 +373,15 @@ SYSCTL
sysctl --system || true
# -- Disable firewalld permanently (k3s/Cilium manage iptables directly) --
systemctl disable --now firewalld || true
# Note: no '--now' — systemd is not running in the Anaconda chroot
systemctl disable firewalld || true
systemctl mask firewalld || true
# -- Enable chronyd for time sync --
systemctl enable chronyd || true`}
bastion_progress "post-install" "2-system done"
# -- Boot order: restore network first (Anaconda sets disk first, we undo it) --
# Network boot must stay first so the bastion intercepts every reboot.
if command -v efibootmgr >/dev/null 2>&1; then
@@ -410,6 +394,8 @@ if command -v efibootmgr >/dev/null 2>&1; then
fi
fi
bastion_progress "post-install" "3-bootorder done"
# -- Provisioning metadata --
cat > /etc/lab-provisioned << PROVEOF
hostname: ${fqdn}
@@ -435,6 +421,8 @@ README
${hasRancher ? `# Install k3s server (skip start - will be configured manually)
curl -sfL https://get.k3s.io | INSTALL_K3S_SKIP_START=true sh -
` : ""}
bastion_progress "post-install" "4-metadata done"
IP_ADDR=$(ip -4 addr show | awk '/inet / && !/127.0.0/ {split($2,a,"/"); print a[1]; exit}')
bastion_progress "complete" "ready at $IP_ADDR"

View File

@@ -26,6 +26,7 @@ describe("StateManager", () => {
discovered: {},
install_queue: {},
installed: {},
debug: {},
});
});
@@ -39,6 +40,7 @@ describe("StateManager", () => {
discovered: {},
install_queue: {},
installed: {},
debug: {},
});
});

View File

@@ -94,6 +94,10 @@ export class LabdClient {
return this.request("POST", "/api/machines/install", { body: opts });
}
/**
 * Ask labd to queue debug/rescue mode for a machine.
 * Issues `POST /api/machines/debug` with `{ mac }` as the request body.
 *
 * @param mac MAC address of the target machine.
 * @returns The reply from labd: a status plus either `data` (mac and hostname) or `error`.
 */
async debugMachine(mac: string): Promise<{ status: string; data?: { mac: string; hostname: string }; error?: string }> {
  const payload = { mac };
  return this.request("POST", "/api/machines/debug", { body: payload });
}
/**
 * Ask labd to forget a machine.
 * Issues `DELETE /api/machines/<mac>` with the MAC URI-encoded into the path.
 *
 * @param mac MAC address of the machine to forget.
 * @returns The reply from labd, carrying a status string.
 */
async forgetMachine(mac: string): Promise<{ status: string }> {
  const encodedMac = encodeURIComponent(mac);
  return this.request("DELETE", `/api/machines/${encodedMac}`);
}

View File

@@ -0,0 +1,153 @@
// CLI command: provision debug
// Queue a machine for debug/rescue PXE boot and optionally SSH reboot into PXE.
import { execFileSync } from "node:child_process";
import { existsSync } from "node:fs";
import { homedir } from "node:os";
import { join } from "node:path";
import { Command } from "commander";
import type { BastionState } from "@lab/shared";
import { getLabdClient } from "../api/config.js";
/**
 * Own-property lookup on a string-keyed table.
 * A plain `table[key]` would also find inherited members such as `constructor`
 * or `__proto__`, which must never be mistaken for a machine entry.
 */
function ownEntry<T>(table: Record<string, T>, key: string): T | undefined {
  return Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined;
}

/**
 * Resolve a target (hostname, MAC, or IP) to {mac, hostname, ip} from state.
 *
 * Lookup order:
 *  1. MAC (any case, ':' or '-' separators) in `installed`, then `discovered`,
 *     then `install_queue`. Only installed machines carry an IP; the other two
 *     yield `ip: ""`, and a discovered machine uses its MAC as hostname.
 *  2. Hostname of an installed machine, compared case-insensitively, either in
 *     full or as the leading label(s) before a dot.
 *  3. IP of an installed machine (exact string match).
 *
 * @param target User-supplied hostname, MAC, or IP.
 * @param state Machine state to search.
 * @returns The resolved machine, or `null` when `target` is blank or nothing matches.
 */
function resolveTarget(
  target: string,
  state: BastionState,
): { mac: string; hostname: string; ip: string } | null {
  // A blank target must not match a machine whose ip happens to be "".
  if (target.trim() === "") {
    return null;
  }

  const normalized = target.toLowerCase().replace(/-/g, ":");

  const installed = ownEntry(state.installed, normalized);
  if (installed) {
    return { mac: normalized, hostname: installed.hostname, ip: installed.ip };
  }
  if (ownEntry(state.discovered, normalized)) {
    return { mac: normalized, hostname: normalized, ip: "" };
  }
  const queued = ownEntry(state.install_queue, normalized);
  if (queued) {
    return { mac: normalized, hostname: queued.hostname, ip: "" };
  }

  const installedEntries = Object.entries(state.installed);

  // Hostname matches take precedence over IP matches across all machines.
  const wantedName = target.toLowerCase();
  for (const [mac, info] of installedEntries) {
    const name = info.hostname.toLowerCase();
    if (name === wantedName || name.startsWith(wantedName + ".")) {
      return { mac, hostname: info.hostname, ip: info.ip };
    }
  }

  for (const [mac, info] of installedEntries) {
    if (info.ip === target) {
      return { mac, hostname: info.hostname, ip: info.ip };
    }
  }

  return null;
}
/**
 * Register the `debug <target>` subcommand on `parent`.
 *
 * The action resolves `target` against the machine state returned by labd,
 * asks labd to queue debug mode for the resolved MAC, and then - when the
 * machine has a known IP and the invoking user is not root - tries to reboot
 * it over SSH with a PXE firmware entry selected for the next boot. It always
 * finishes by printing a rescue-shell guide.
 *
 * Failing to reach labd, to resolve the target, or to queue the request prints
 * an error and exits the process with status 1.
 *
 * @param parent Commander command that receives the subcommand.
 */
export function registerDebugCommand(parent: Command): void {
parent
.command("debug <target>")
.description("PXE boot into Fedora rescue mode for debugging (target: hostname, MAC, or IP)")
.showHelpAfterError(true)
.action(async (target: string) => {
const client = getLabdClient();
// Fetch the machine state from labd; the target is resolved against it below.
let state: BastionState;
try {
state = await client.getMachines();
} catch (err) {
console.error(`Cannot reach labd: ${err instanceof Error ? err.message : String(err)}`);
process.exit(1);
}
const resolved = resolveTarget(target, state);
if (!resolved) {
console.error(`Cannot find machine: ${target}`);
console.error("Provide a hostname, MAC, or IP of a known machine.");
console.error("Run 'labctl provision list' to see available machines.");
process.exit(1);
}
const { mac, hostname, ip } = resolved;
console.log(`Queuing debug mode for ${hostname} (${mac})...`);
// Both a reply carrying `error` and a thrown request failure abort the command.
try {
const result = await client.debugMachine(mac);
if (result.error) {
console.error(`Failed: ${result.error}`);
process.exit(1);
}
} catch (err) {
console.error(`Failed to queue debug: ${err instanceof Error ? err.message : String(err)}`);
process.exit(1);
}
// Try an SSH reboot into PXE. Skipped when the target has no known IP
// (resolveTarget returns "" for machines that are not installed).
if (ip !== "") {
// Prefer the user behind sudo, then $USER.
const adminUser = process.env["SUDO_USER"] ?? process.env["USER"] ?? "";
// A root identity is treated as "no user", which skips the SSH attempt.
const effectiveUser = adminUser === "root" ? "" : adminUser;
if (effectiveUser !== "") {
console.log(`\nAttempting SSH reboot into PXE (${effectiveUser}@${ip})...`);
// Under sudo, look for keys in /home/<SUDO_USER> instead of the current
// process's home directory.
const sudoUser = process.env["SUDO_USER"];
const realHome = sudoUser !== undefined ? join("/home", sudoUser) : homedir();
const keyPaths = [
join(realHome, ".ssh", "id_ed25519"),
join(realHome, ".ssh", "id_rsa"),
join(realHome, ".ssh", "id_ecdsa"),
];
// First existing key file wins; when none exists no -i flag is passed to ssh.
const sshKey = keyPaths.find(k => existsSync(k));
// The last argument is the remote command: take the first efibootmgr line
// matching pxe/network/ipv4, set that entry as BootNext and reboot; when no
// such line is found, reboot anyway.
const sshArgs = [
"-o", "StrictHostKeyChecking=no",
"-o", "ConnectTimeout=10",
...(sshKey !== undefined ? ["-i", sshKey] : []),
`${effectiveUser}@${ip}`,
'PXE_ENTRY=$(sudo efibootmgr | grep -iE "pxe|network|ipv4" | head -1 | grep -oP "Boot\\K[0-9A-F]+"); if [ -n "$PXE_ENTRY" ]; then sudo efibootmgr --bootnext "$PXE_ENTRY" && echo "PXE set as next boot" && sudo reboot; else echo "No PXE boot entry found, rebooting anyway..." && sudo reboot; fi',
];
try {
execFileSync("ssh", sshArgs, { stdio: "inherit" });
} catch {
// Best-effort: every ssh failure is ignored here. The SSH connection
// closing during reboot is expected.
}
}
}
console.log(`
Debug mode queued for ${hostname} (${mac}).
Reboot the machine to enter Fedora rescue mode.
Once in rescue shell:
# Activate LVM
vgchange -ay labvg
# Mount root + other volumes
mkdir -p /mnt/sysroot
mount /dev/labvg/root /mnt/sysroot
cat /mnt/sysroot/etc/fstab # check what else to mount
mount /dev/labvg/var /mnt/sysroot/var
mount /dev/labvg/home /mnt/sysroot/home
# Boot the installed system in a container
/mnt/sysroot/usr/bin/systemd-nspawn -D /mnt/sysroot --boot
# Or just chroot for quick fixes
mount --bind /dev /mnt/sysroot/dev
mount --bind /proc /mnt/sysroot/proc
mount --bind /sys /mnt/sysroot/sys
chroot /mnt/sysroot
# Check initramfs size
ls -lh /mnt/sysroot/boot/initramfs-*.img
# Rebuild initramfs without amdgpu
chroot /mnt/sysroot
echo 'omit_drivers+=" amdgpu "' > /etc/dracut.conf.d/omit-amdgpu.conf
dracut -f --regenerate-all
`);
});
}

View File

@@ -14,6 +14,7 @@ import { registerStatusCommand } from "./commands/status.js";
import { registerInstallCommand } from "./commands/install.js";
import { registerListCommand } from "./commands/list.js";
import { registerReprovisionCommand } from "./commands/reprovision.js";
import { registerDebugCommand } from "./commands/debug.js";
import { registerForgetCommand } from "./commands/forget.js";
import { registerLogsCommand } from "./commands/logs.js";
import { registerMakeIsoCommand } from "./commands/makeiso.js";
@@ -95,6 +96,7 @@ export function createProgram(): Command {
registerListCommand(provisionCmd);
registerInstallCommand(provisionCmd);
registerReprovisionCommand(provisionCmd);
registerDebugCommand(provisionCmd);
registerForgetCommand(provisionCmd);
registerLogsCommand(provisionCmd);
registerMakeIsoCommand(provisionCmd);

View File

@@ -172,6 +172,40 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void
}
});
// Queue debug/rescue mode - route to correct bastion by MAC.
// Body: { mac } - any case, ':' or '-' separators.
// Target selection: the bastion that knows the MAC; otherwise the only
// connected bastion; otherwise 503 (none connected) or 404 (several connected,
// none knows the MAC). A missing, blank, or non-string `mac` is a 400.
app.post<{
  Body: { mac?: string };
}>("/api/machines/debug", async (request, reply) => {
  // The Body type is compile-time only, so check the runtime type before
  // calling string methods; a non-string value would otherwise throw here,
  // outside the try block below.
  const rawMac: unknown = request.body?.mac;
  const mac = typeof rawMac === "string"
    ? rawMac.trim().toLowerCase().replace(/-/g, ":")
    : "";
  if (!mac) {
    return reply.code(400).send({ error: "mac is required" });
  }

  let bastionId: string;
  const owner = bastionRegistry.findBastionByMac(mac);
  if (owner) {
    bastionId = owner.bastionId;
  } else {
    const all = bastionRegistry.getAll();
    if (all.length === 0) {
      return reply.code(503).send({ error: "No bastions connected" });
    }
    // With exactly one bastion connected there is no ambiguity, so use it
    // even though it has not reported this MAC.
    const sole = all.length === 1 ? all[0] : undefined;
    if (!sole) {
      return reply.code(404).send({ error: `MAC ${mac} not found on any bastion` });
    }
    bastionId = sole.bastionId;
  }

  try {
    const result = await sendCommand(bastionId, { type: "command-debug", mac });
    return reply.code(result.status === "ok" ? 200 : 500).send(result);
  } catch (err) {
    return reply.code(500).send({ error: err instanceof Error ? err.message : String(err) });
  }
});
// Forget machine
app.delete<{ Params: { mac: string } }>("/api/machines/:mac", async (request, reply) => {
const mac = request.params.mac.toLowerCase().replace(/-/g, ":");

View File

@@ -3,7 +3,7 @@
import { EventEmitter } from "node:events";
import type { WebSocket } from "ws";
import type { BastionState, HardwareInfo, InstallConfig, InstalledInfo } from "@lab/shared";
import type { BastionState, HardwareInfo, InstallConfig, InstalledInfo, DebugConfig } from "@lab/shared";
export interface ConnectedBastion {
bastionId: string;
@@ -20,6 +20,7 @@ export interface AggregatedState {
discovered: Record<string, HardwareInfo>;
install_queue: Record<string, InstallConfig>;
installed: Record<string, InstalledInfo>;
debug: Record<string, DebugConfig>;
}
export class BastionRegistry extends EventEmitter {
@@ -86,6 +87,7 @@ export class BastionRegistry extends EventEmitter {
discovered: {},
install_queue: {},
installed: {},
debug: {},
};
for (const bastion of this.bastions.values()) {
@@ -98,6 +100,9 @@ export class BastionRegistry extends EventEmitter {
for (const [mac, info] of Object.entries(bastion.state.installed)) {
result.installed[mac] = { ...info, bastionId: bastion.bastionId };
}
for (const [mac, dbg] of Object.entries(bastion.state.debug ?? {})) {
result.debug[mac] = { ...dbg };
}
}
return result;

View File

@@ -5,6 +5,7 @@ export type {
HardwareInfo,
InstallConfig,
InstalledInfo,
DebugConfig,
BastionState,
BastionConfig,
} from "./types/index.js";

View File

@@ -5,6 +5,7 @@ export type {
HardwareInfo,
InstallConfig,
InstalledInfo,
DebugConfig,
BastionState,
} from "./state.js";

View File

@@ -98,8 +98,14 @@ export interface InstalledInfo {
bastionId?: string; // set when aggregated through labd
}
/** A pending debug/rescue boot request for one machine. */
export interface DebugConfig {
// Label for the machine. The handlers that create these entries use the
// installed hostname, else the queued hostname, else the discovered product
// string, else the MAC itself.
hostname: string;
// When the request was queued; written with `new Date().toISOString()`.
queued_at: string;
}
/**
 * Bastion machine state. Every table is keyed by MAC address; lookups in this
 * codebase use the lower-case, colon-separated spelling.
 */
export interface BastionState {
// Hardware info per machine.
discovered: Record<string, HardwareInfo>;
// Install configuration per machine.
install_queue: Record<string, InstallConfig>;
// Installed-machine info per machine.
installed: Record<string, InstalledInfo>;
// Pending debug/rescue requests; /dispatch deletes an entry once it has served it.
debug: Record<string, DebugConfig>;
}