diff --git a/bastion/completions/labctl.bash b/bastion/completions/labctl.bash index f7d4743..b27c835 100644 --- a/bastion/completions/labctl.bash +++ b/bastion/completions/labctl.bash @@ -29,43 +29,46 @@ _labctl() { COMPREPLY=($(compgen -W "--dir -h --help" -- "$cur")) return ;; "init bastion standalone status") - COMPREPLY=($(compgen -W "--dir --port -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "-h --help" -- "$cur")) return ;; "init bastion standalone") COMPREPLY=($(compgen -W "start stop status -h --help" -- "$cur")) return ;; "app labcontroller deploy") - COMPREPLY=($(compgen -W "--user --port --crdb-replicas -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "--user --crdb-replicas -h --help" -- "$cur")) return ;; "app labcontroller status") - COMPREPLY=($(compgen -W "--user --port -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "--user -h --help" -- "$cur")) return ;; "app k3s install") - COMPREPLY=($(compgen -W "--role --user --port --k3s-server --k3s-token -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "--role --user --k3s-server --k3s-token -h --help" -- "$cur")) return ;; "app k3s health") - COMPREPLY=($(compgen -W "--user --port -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "--user -h --help" -- "$cur")) return ;; "app k3s list") - COMPREPLY=($(compgen -W "--user --port -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "--user -h --help" -- "$cur")) return ;; "init bastion") COMPREPLY=($(compgen -W "standalone -h --help" -- "$cur")) return ;; "provision list") - COMPREPLY=($(compgen -W "--port -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "-h --help" -- "$cur")) return ;; "provision install") - COMPREPLY=($(compgen -W "--role --os --disk --port -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "--role --os --disk -h --help" -- "$cur")) return ;; "provision reprovision") - COMPREPLY=($(compgen -W "--role --os --disk --port -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "--role --os --disk -h --help" -- "$cur")) return ;; "provision forget") - COMPREPLY=($(compgen -W "--port -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "-h --help" -- "$cur")) return ;; "provision logs") - COMPREPLY=($(compgen -W "-f --follow --port -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "-h --help" -- "$cur")) + return ;; + "provision makeiso") + COMPREPLY=($(compgen -W "--arch --local --out -h --help" -- "$cur")) return ;; "config list") COMPREPLY=($(compgen -W "-h --help" -- "$cur")) @@ -92,7 +95,7 @@ _labctl() { COMPREPLY=($(compgen -W "bastion -h --help" -- "$cur")) return ;; "provision") - COMPREPLY=($(compgen -W "list install reprovision forget logs -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "list install reprovision forget logs makeiso -h --help" -- "$cur")) return ;; "config") COMPREPLY=($(compgen -W "list get set path -h --help" -- "$cur")) diff --git a/bastion/completions/labctl.fish b/bastion/completions/labctl.fish index 832ad8e..6736142 100644 --- a/bastion/completions/labctl.fish +++ b/bastion/completions/labctl.fish @@ -118,38 +118,28 @@ complete -c labctl -n "__labctl_in_cmd init bastion standalone start" -l foregro # init bastion standalone stop options complete -c labctl -n "__labctl_in_cmd init bastion standalone stop" -l dir -d 'Bastion data directory' -x -# init bastion standalone status options -complete -c labctl -n "__labctl_in_cmd init bastion standalone status" -l dir -d 'Bastion data directory' -x -complete -c labctl -n "__labctl_in_cmd init bastion standalone status" -l port -d 'Bastion HTTP port' -x - # provision subcommands complete -c labctl -n "__labctl_using_cmd provision" -a list -d 'List all known machines' complete -c labctl -n "__labctl_using_cmd provision" -a install -d 'Queue a discovered machine for OS installation' complete -c labctl -n "__labctl_using_cmd provision" -a reprovision -d 'Queue install + SSH reboot into PXE (target: hostname, MAC, or IP)' complete -c labctl -n "__labctl_using_cmd provision" -a forget -d 'Remove a machine from bastion state' complete -c labctl -n "__labctl_using_cmd provision" -a logs -d 'Show provisioning logs for a machine (hostname, MAC, or IP)' - -# provision list options -complete -c labctl -n "__labctl_in_cmd provision list" -l port -d 'Bastion HTTP port' -x +complete -c labctl -n "__labctl_using_cmd provision" -a makeiso -d 'Generate a UEFI-bootable iPXE ISO for network provisioning' # provision install options complete -c labctl -n "__labctl_in_cmd provision install" -l role -d 'Machine role (see below)' -xa 'vanilla worker infra labcontroller' complete -c labctl -n "__labctl_in_cmd provision install" -l os -d 'Operating system' -xa 'fedora-43 ubuntu-26.04' complete -c labctl -n "__labctl_in_cmd provision install" -l disk -d 'Target disk device (auto-detect if omitted)' -x -complete -c labctl -n "__labctl_in_cmd provision install" -l port -d 'Bastion HTTP port' -x # provision reprovision options complete -c labctl -n "__labctl_in_cmd provision reprovision" -l role -d 'Machine role (see below)' -xa 'vanilla worker infra labcontroller' complete -c labctl -n "__labctl_in_cmd provision reprovision" -l os -d 'Operating system' -xa 'fedora-43 ubuntu-26.04' complete -c labctl -n "__labctl_in_cmd provision reprovision" -l disk -d 'Target disk device (auto-detect if omitted)' -x -complete -c labctl -n "__labctl_in_cmd provision reprovision" -l port -d 'Bastion HTTP port' -x -# provision forget options -complete -c labctl -n "__labctl_in_cmd provision forget" -l port -d 'Bastion HTTP port' -x - -# provision logs options -complete -c labctl -n "__labctl_in_cmd provision logs" -s f -l follow -d 'Follow logs in real-time (SSE stream)' -complete -c labctl -n "__labctl_in_cmd provision logs" -l port -d 'Bastion HTTP port' -x +# provision makeiso options +complete -c labctl -n "__labctl_in_cmd provision makeiso" -l arch -d 'Target architecture(s)' -xa 'x86_64 aarch64' +complete -c labctl -n "__labctl_in_cmd provision makeiso" -l local -d 'Build ISO locally instead of using bastion-hosted URL' +complete -c labctl -n "__labctl_in_cmd provision makeiso" -l out -d 'Output path for local ISO build' -x # config subcommands complete -c labctl -n "__labctl_using_cmd config" -a list -d 'Show all configuration values' @@ -173,12 +163,10 @@ complete -c labctl -n "__labctl_using_cmd app labcontroller" -a status -d 'Check # app labcontroller deploy options complete -c labctl -n "__labctl_in_cmd app labcontroller deploy" -l user -d 'SSH user' -x -complete -c labctl -n "__labctl_in_cmd app labcontroller deploy" -l port -d 'Bastion HTTP port' -x complete -c labctl -n "__labctl_in_cmd app labcontroller deploy" -l crdb-replicas -d 'CockroachDB replicas' -x # app labcontroller status options complete -c labctl -n "__labctl_in_cmd app labcontroller status" -l user -d 'SSH user' -x -complete -c labctl -n "__labctl_in_cmd app labcontroller status" -l port -d 'Bastion HTTP port' -x # app k3s subcommands complete -c labctl -n "__labctl_using_cmd app k3s" -a install -d 'Install k3s on a target machine (hostname, IP, or MAC)' @@ -188,15 +176,12 @@ complete -c labctl -n "__labctl_using_cmd app k3s" -a list -d 'List installed ma # app k3s install options complete -c labctl -n "__labctl_in_cmd app k3s install" -l role -d 'k3s role: infra (server) or worker (agent)' -x complete -c labctl -n "__labctl_in_cmd app k3s install" -l user -d 'SSH user' -x -complete -c labctl -n "__labctl_in_cmd app k3s install" -l port -d 'Bastion HTTP port (for resolving target)' -x complete -c labctl -n "__labctl_in_cmd app k3s install" -l k3s-server -d 'k3s server URL (required for worker role)' -x complete -c labctl -n "__labctl_in_cmd app k3s install" -l k3s-token -d 'k3s join token (required for worker role)' -x # app k3s health options complete -c labctl -n "__labctl_in_cmd app k3s health" -l user -d 'SSH user' -x -complete -c labctl -n "__labctl_in_cmd app k3s health" -l port -d 'Bastion HTTP port' -x # app k3s list options complete -c labctl -n "__labctl_in_cmd app k3s list" -l user -d 'SSH user' -x -complete -c labctl -n "__labctl_in_cmd app k3s list" -l port -d 'Bastion HTTP port' -x diff --git a/bastion/src/bastion/src/main.ts b/bastion/src/bastion/src/main.ts index 6b2a621..03a49bf 100644 --- a/bastion/src/bastion/src/main.ts +++ b/bastion/src/bastion/src/main.ts @@ -266,6 +266,21 @@ export async function startBastion(overrides: Partial = {}): Prom return { status: "ok", data: { mac: msg.mac, hostname: msg.hostname } }; }); + labdConn.onCommand("command-debug", async (msg) => { + if (msg.type !== "command-debug") throw new Error("unexpected"); + const mac = msg.mac.toLowerCase(); + const currentState = state.load(); + const hostname = + currentState.installed[mac]?.hostname ?? + currentState.install_queue[mac]?.hostname ?? + currentState.discovered[mac]?.product ?? + mac; + state.update((s) => { + s.debug[mac] = { hostname, queued_at: new Date().toISOString() }; + }); + return { status: "ok", data: { mac, hostname } }; + }); + labdConn.onCommand("command-forget", async (msg) => { if (msg.type !== "command-forget") throw new Error("unexpected"); const mac = msg.mac.toLowerCase(); @@ -273,6 +288,7 @@ export async function startBastion(overrides: Partial = {}): Prom delete s.discovered[mac]; delete s.install_queue[mac]; delete s.installed[mac]; + delete s.debug[mac]; }); return { status: "ok", data: { mac } }; }); diff --git a/bastion/src/bastion/src/routes/api.ts b/bastion/src/bastion/src/routes/api.ts index 96e1e7f..75a821a 100644 --- a/bastion/src/bastion/src/routes/api.ts +++ b/bastion/src/bastion/src/routes/api.ts @@ -189,6 +189,31 @@ export function registerApiRoutes( return reply.send({ status: "ok", lines: allLines.length }); }); + // Queue debug/rescue mode for a machine + app.post<{ + Body: { mac?: string }; + }>("/api/debug", async (request, reply) => { + const mac = (request.body?.mac ?? "").toLowerCase().replace(/-/g, ":"); + if (mac === "") { + return reply.status(400).send({ error: "mac is required" }); + } + + // Look up hostname from installed or discovered state + const currentState = state.load(); + const hostname = + currentState.installed[mac]?.hostname ?? + currentState.install_queue[mac]?.hostname ?? + currentState.discovered[mac]?.product ?? + mac; + + state.update((s) => { + s.debug[mac] = { hostname, queued_at: new Date().toISOString() }; + }); + + logger.info(`DEBUG QUEUED: ${mac} -> ${hostname}`); + return reply.send({ status: "ok", mac, hostname }); + }); + // Delete a machine from all state app.delete<{ Params: { mac: string }; @@ -213,6 +238,10 @@ export function registerApiRoutes( delete s.installed[mac]; found = true; } + if (s.debug[mac] !== undefined) { + delete s.debug[mac]; + found = true; + } }); if (!found) { diff --git a/bastion/src/bastion/src/routes/dispatch.ts b/bastion/src/bastion/src/routes/dispatch.ts index 54221fc..c9df55c 100644 --- a/bastion/src/bastion/src/routes/dispatch.ts +++ b/bastion/src/bastion/src/routes/dispatch.ts @@ -10,9 +10,11 @@ import type { StateManager } from "../services/state.js"; import { renderDiscoverIpxe, renderInstallIpxe, + renderDebugIpxe, renderLocalBootIpxe, } from "../templates/boot.ipxe.js"; import { renderUbuntuInstallIpxe } from "../templates/ubuntu-boot.ipxe.js"; +import { renderDebugKickstart } from "../templates/debug.ks.js"; import { logger } from "../services/logger.js"; export function registerDispatchRoutes( @@ -20,10 +22,34 @@ export function registerDispatchRoutes( config: BastionConfig, state: StateManager, ): void { + // Serve debug/rescue kickstart (minimal: SSH keys + network) + app.get<{ Querystring: { mac?: string } }>("/debug.ks", async (_request, reply) => { + const ks = renderDebugKickstart({ sshKeys: config.sshKeys ?? [] }); + return reply.type("text/plain").send(ks); + }); + app.get<{ Querystring: { mac?: string } }>("/dispatch", async (request, reply) => { const mac = (request.query.mac ?? "").toLowerCase().replace(/-/g, ":"); const currentState = state.load(); + // Debug mode takes highest priority — auto-clear after serving once + const debugEntry = currentState.debug[mac]; + if (debugEntry) { + const hostname = debugEntry.hostname ?? "debug"; + logger.info(`DEBUG BOOT: ${mac} -> ${hostname} (rescue mode)`); + + state.update((s) => { delete s.debug[mac]; }); + + const script = renderDebugIpxe({ + mac, + hostname, + serverIp: config.serverIp, + httpPort: config.httpPort, + fedoraMirror: config.fedoraMirror, + }); + return reply.type("text/plain").send(script); + } + const queueEntry = currentState.install_queue[mac]; if (queueEntry) { const hostname = queueEntry.hostname ?? "lab-node"; diff --git a/bastion/src/bastion/src/services/state.ts b/bastion/src/bastion/src/services/state.ts index ea90218..68cb6a7 100644 --- a/bastion/src/bastion/src/services/state.ts +++ b/bastion/src/bastion/src/services/state.ts @@ -11,6 +11,7 @@ const EMPTY_STATE: BastionState = { discovered: {}, install_queue: {}, installed: {}, + debug: {}, }; export type StateChangeListener = (state: BastionState) => void; @@ -33,6 +34,7 @@ export class StateManager { discovered: parsed.discovered ?? {}, install_queue: parsed.install_queue ?? {}, installed: parsed.installed ?? {}, + debug: parsed.debug ?? {}, }; } catch { return { ...EMPTY_STATE }; diff --git a/bastion/src/bastion/src/templates/boot.ipxe.ts b/bastion/src/bastion/src/templates/boot.ipxe.ts index d2fc3b6..826633f 100644 --- a/bastion/src/bastion/src/templates/boot.ipxe.ts +++ b/bastion/src/bastion/src/templates/boot.ipxe.ts @@ -75,6 +75,33 @@ boot `; } +/** + * iPXE script for debug/rescue mode -- boots Fedora installer in rescue mode. + * Provides a shell with LVM tools, network, and SSH for inspecting installed systems. + */ +export function renderDebugIpxe(params: { + mac: string; + hostname: string; + serverIp: string; + httpPort: number; + fedoraMirror: string; +}): string { + return `#!ipxe + +echo +echo ============================================= +echo Lab PXE Bastion - DEBUG/RESCUE MODE +echo Target: ${params.hostname} +echo MAC: ${params.mac} +echo ============================================= +echo + +kernel http://${params.serverIp}:${params.httpPort}/vmlinuz inst.rescue inst.text inst.sshd inst.ks=http://${params.serverIp}:${params.httpPort}/debug.ks?mac=${params.mac} inst.stage2=${params.fedoraMirror} +initrd http://${params.serverIp}:${params.httpPort}/initrd.img +boot +`; +} + /** * iPXE script for already-installed machines -- exits to boot from local disk. */ diff --git a/bastion/src/bastion/src/templates/debug.ks.ts b/bastion/src/bastion/src/templates/debug.ks.ts new file mode 100644 index 0000000..270fa34 --- /dev/null +++ b/bastion/src/bastion/src/templates/debug.ks.ts @@ -0,0 +1,25 @@ +// Debug/rescue kickstart template. +// Minimal: sets SSH access and network for Anaconda rescue mode. +// No disk operations, no packages, no %post. + +export interface DebugKickstartParams { + sshKeys: string[]; +} + +export function renderDebugKickstart(params: DebugKickstartParams): string { + const sshpw = "sshpw --username=root --plaintext lab-root-pw"; + const sshkeyLine = params.sshKeys.length > 0 + ? `sshkey --username=root "${params.sshKeys[0]}"` + : ""; + + return `# Lab Bastion -- Debug/Rescue Kickstart +# Minimal: only SSH + network for Anaconda rescue mode + +lang en_US.UTF-8 +keyboard uk +network --bootproto=dhcp --activate + +${sshpw} +${sshkeyLine} +`; +} diff --git a/bastion/src/bastion/src/templates/install.ks.ts b/bastion/src/bastion/src/templates/install.ks.ts index ea1a035..cf5ef73 100644 --- a/bastion/src/bastion/src/templates/install.ks.ts +++ b/bastion/src/bastion/src/templates/install.ks.ts @@ -322,39 +322,20 @@ bastion_progress() { -d "{\\"mac\\":\\"$mac\\",\\"stage\\":\\"$stage\\",\\"detail\\":\\"$detail\\"}" 2>/dev/null || true } -# Send log lines to bastion -bastion_log() { - local line="$1" - local mac=$(ip link show | awk '/ether/ && !/00:00:00:00/ {print $2; exit}') - curl -sf -X POST "http://${serverIp}:${httpPort}/api/log" \\ - -H "Content-Type: application/json" \\ - -d "{\\"mac\\":\\"$mac\\",\\"line\\":\\"$(echo "$line" | sed 's/\\\\/\\\\\\\\/g; s/"/\\\\"/g')\\"}\" \\ - --connect-timeout 5 --max-time 10 2>/dev/null || true -} - -# Send an error stage to bastion -bastion_error() { - local detail="$1" - bastion_progress "error" "$detail" -} - -# --- Error trap: catch any failure and report to bastion --- -_post_error_handler() { - local exit_code=$? lineno=$1 - bastion_error "%post failed at line $lineno (exit $exit_code)" -} -trap '_post_error_handler $LINENO' ERR bastion_progress "post-install" "configuring system" # -- SSH -- -systemctl enable --now sshd +# Note: only 'enable', not '--now' — systemd is not running in the Anaconda chroot +systemctl enable sshd || true sed -i 's/^#\\?PermitRootLogin.*/PermitRootLogin prohibit-password/' /etc/ssh/sshd_config sed -i 's/^#\\?PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config ${sshPostBlock} -# -- Hostname and domain -- -hostnamectl set-hostname ${fqdn} +bastion_progress "post-install" "1-ssh done" + +# -- Hostname and domain (write directly, hostnamectl needs D-Bus) -- +echo "${fqdn}" > /etc/hostname # -- tmpfs for /tmp -- echo "tmpfs /tmp tmpfs defaults,noatime,nosuid,nodev,size=4G 0 0" >> /etc/fstab @@ -392,12 +373,15 @@ SYSCTL sysctl --system || true # -- Disable firewalld permanently (k3s/Cilium manage iptables directly) -- -systemctl disable --now firewalld || true +# Note: no '--now' — systemd is not running in the Anaconda chroot +systemctl disable firewalld || true systemctl mask firewalld || true # -- Enable chronyd for time sync -- systemctl enable chronyd || true`} +bastion_progress "post-install" "2-system done" + # -- Boot order: restore network first (Anaconda sets disk first, we undo it) -- # Network boot must stay first so the bastion intercepts every reboot. if command -v efibootmgr >/dev/null 2>&1; then @@ -410,6 +394,8 @@ if command -v efibootmgr >/dev/null 2>&1; then fi fi +bastion_progress "post-install" "3-bootorder done" + # -- Provisioning metadata -- cat > /etc/lab-provisioned << PROVEOF hostname: ${fqdn} @@ -435,6 +421,8 @@ README ${hasRancher ? `# Install k3s server (skip start - will be configured manually) curl -sfL https://get.k3s.io | INSTALL_K3S_SKIP_START=true sh - ` : ""} +bastion_progress "post-install" "4-metadata done" + IP_ADDR=$(ip -4 addr show | awk '/inet / && !/127.0.0/ {split($2,a,"/"); print a[1]; exit}') bastion_progress "complete" "ready at $IP_ADDR" diff --git a/bastion/src/bastion/tests/state.test.ts b/bastion/src/bastion/tests/state.test.ts index 494b479..2b509b5 100644 --- a/bastion/src/bastion/tests/state.test.ts +++ b/bastion/src/bastion/tests/state.test.ts @@ -26,6 +26,7 @@ describe("StateManager", () => { discovered: {}, install_queue: {}, installed: {}, + debug: {}, }); }); @@ -39,6 +40,7 @@ describe("StateManager", () => { discovered: {}, install_queue: {}, installed: {}, + debug: {}, }); }); diff --git a/bastion/src/cli/src/api/client.ts b/bastion/src/cli/src/api/client.ts index c68f0e9..5ec68cf 100644 --- a/bastion/src/cli/src/api/client.ts +++ b/bastion/src/cli/src/api/client.ts @@ -94,6 +94,10 @@ export class LabdClient { return this.request("POST", "/api/machines/install", { body: opts }); } + async debugMachine(mac: string): Promise<{ status: string; data?: { mac: string; hostname: string }; error?: string }> { + return this.request("POST", "/api/machines/debug", { body: { mac } }); + } + async forgetMachine(mac: string): Promise<{ status: string }> { return this.request("DELETE", `/api/machines/${encodeURIComponent(mac)}`); } diff --git a/bastion/src/cli/src/commands/debug.ts b/bastion/src/cli/src/commands/debug.ts new file mode 100644 index 0000000..78b3f6c --- /dev/null +++ b/bastion/src/cli/src/commands/debug.ts @@ -0,0 +1,153 @@ +// CLI command: provision debug +// Queue a machine for debug/rescue PXE boot and optionally SSH reboot into PXE. + +import { execFileSync } from "node:child_process"; +import { existsSync } from "node:fs"; +import { homedir } from "node:os"; +import { join } from "node:path"; +import { Command } from "commander"; +import type { BastionState } from "@lab/shared"; +import { getLabdClient } from "../api/config.js"; + +/** Resolve a target (hostname, MAC, or IP) to {mac, hostname, ip} from state. */ +function resolveTarget( + target: string, + state: BastionState, +): { mac: string; hostname: string; ip: string } | null { + const normalized = target.toLowerCase().replace(/-/g, ":"); + + if (state.installed[normalized]) { + const info = state.installed[normalized]; + return { mac: normalized, hostname: info.hostname, ip: info.ip }; + } + + if (state.discovered[normalized]) { + return { mac: normalized, hostname: normalized, ip: "" }; + } + + if (state.install_queue[normalized]) { + return { mac: normalized, hostname: state.install_queue[normalized].hostname, ip: "" }; + } + + for (const [mac, info] of Object.entries(state.installed)) { + if (info.hostname === target || info.hostname.startsWith(target + ".")) { + return { mac, hostname: info.hostname, ip: info.ip }; + } + } + + for (const [mac, info] of Object.entries(state.installed)) { + if (info.ip === target) { + return { mac, hostname: info.hostname, ip: info.ip }; + } + } + + return null; +} + +export function registerDebugCommand(parent: Command): void { + parent + .command("debug ") + .description("PXE boot into Fedora rescue mode for debugging (target: hostname, MAC, or IP)") + .showHelpAfterError(true) + .action(async (target: string) => { + const client = getLabdClient(); + + // Resolve target from labd aggregated state + let state: BastionState; + try { + state = await client.getMachines(); + } catch (err) { + console.error(`Cannot reach labd: ${err instanceof Error ? err.message : String(err)}`); + process.exit(1); + } + + const resolved = resolveTarget(target, state); + if (!resolved) { + console.error(`Cannot find machine: ${target}`); + console.error("Provide a hostname, MAC, or IP of a known machine."); + console.error("Run 'labctl provision list' to see available machines."); + process.exit(1); + } + + const { mac, hostname, ip } = resolved; + console.log(`Queuing debug mode for ${hostname} (${mac})...`); + + try { + const result = await client.debugMachine(mac); + if (result.error) { + console.error(`Failed: ${result.error}`); + process.exit(1); + } + } catch (err) { + console.error(`Failed to queue debug: ${err instanceof Error ? err.message : String(err)}`); + process.exit(1); + } + + // Try SSH reboot into PXE + if (ip !== "") { + const adminUser = process.env["SUDO_USER"] ?? process.env["USER"] ?? ""; + const effectiveUser = adminUser === "root" ? "" : adminUser; + + if (effectiveUser !== "") { + console.log(`\nAttempting SSH reboot into PXE (${effectiveUser}@${ip})...`); + + const sudoUser = process.env["SUDO_USER"]; + const realHome = sudoUser !== undefined ? join("/home", sudoUser) : homedir(); + const keyPaths = [ + join(realHome, ".ssh", "id_ed25519"), + join(realHome, ".ssh", "id_rsa"), + join(realHome, ".ssh", "id_ecdsa"), + ]; + const sshKey = keyPaths.find(k => existsSync(k)); + + const sshArgs = [ + "-o", "StrictHostKeyChecking=no", + "-o", "ConnectTimeout=10", + ...(sshKey !== undefined ? ["-i", sshKey] : []), + `${effectiveUser}@${ip}`, + 'PXE_ENTRY=$(sudo efibootmgr | grep -iE "pxe|network|ipv4" | head -1 | grep -oP "Boot\\K[0-9A-F]+"); if [ -n "$PXE_ENTRY" ]; then sudo efibootmgr --bootnext "$PXE_ENTRY" && echo "PXE set as next boot" && sudo reboot; else echo "No PXE boot entry found, rebooting anyway..." && sudo reboot; fi', + ]; + + try { + execFileSync("ssh", sshArgs, { stdio: "inherit" }); + } catch { + // SSH connection closing during reboot is expected + } + } + } + + console.log(` +Debug mode queued for ${hostname} (${mac}). +Reboot the machine to enter Fedora rescue mode. + +Once in rescue shell: + + # Activate LVM + vgchange -ay labvg + + # Mount root + other volumes + mkdir -p /mnt/sysroot + mount /dev/labvg/root /mnt/sysroot + cat /mnt/sysroot/etc/fstab # check what else to mount + mount /dev/labvg/var /mnt/sysroot/var + mount /dev/labvg/home /mnt/sysroot/home + + # Boot the installed system in a container + /mnt/sysroot/usr/bin/systemd-nspawn -D /mnt/sysroot --boot + + # Or just chroot for quick fixes + mount --bind /dev /mnt/sysroot/dev + mount --bind /proc /mnt/sysroot/proc + mount --bind /sys /mnt/sysroot/sys + chroot /mnt/sysroot + + # Check initramfs size + ls -lh /mnt/sysroot/boot/initramfs-*.img + + # Rebuild initramfs without amdgpu + chroot /mnt/sysroot + echo 'omit_drivers+=" amdgpu "' > /etc/dracut.conf.d/omit-amdgpu.conf + dracut -f --regenerate-all +`); + }); +} diff --git a/bastion/src/cli/src/index.ts b/bastion/src/cli/src/index.ts index 0584ec5..00d0df9 100644 --- a/bastion/src/cli/src/index.ts +++ b/bastion/src/cli/src/index.ts @@ -14,6 +14,7 @@ import { registerStatusCommand } from "./commands/status.js"; import { registerInstallCommand } from "./commands/install.js"; import { registerListCommand } from "./commands/list.js"; import { registerReprovisionCommand } from "./commands/reprovision.js"; +import { registerDebugCommand } from "./commands/debug.js"; import { registerForgetCommand } from "./commands/forget.js"; import { registerLogsCommand } from "./commands/logs.js"; import { registerMakeIsoCommand } from "./commands/makeiso.js"; @@ -95,6 +96,7 @@ export function createProgram(): Command { registerListCommand(provisionCmd); registerInstallCommand(provisionCmd); registerReprovisionCommand(provisionCmd); + registerDebugCommand(provisionCmd); registerForgetCommand(provisionCmd); registerLogsCommand(provisionCmd); registerMakeIsoCommand(provisionCmd); diff --git a/bastion/src/labd/src/routes/bastions.ts b/bastion/src/labd/src/routes/bastions.ts index a1c0af8..9372dae 100644 --- a/bastion/src/labd/src/routes/bastions.ts +++ b/bastion/src/labd/src/routes/bastions.ts @@ -172,6 +172,40 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void } }); + // Queue debug/rescue mode — route to correct bastion by MAC + app.post<{ + Body: { mac?: string }; + }>("/api/machines/debug", async (request, reply) => { + const mac = (request.body?.mac ?? "").toLowerCase().replace(/-/g, ":"); + if (!mac) { + return reply.code(400).send({ error: "mac is required" }); + } + + const bastion = bastionRegistry.findBastionByMac(mac); + if (!bastion) { + const all = bastionRegistry.getAll(); + if (all.length === 0) { + return reply.code(503).send({ error: "No bastions connected" }); + } + if (all.length === 1) { + try { + const result = await sendCommand(all[0]!.bastionId, { type: "command-debug", mac }); + return reply.code(result.status === "ok" ? 200 : 500).send(result); + } catch (err) { + return reply.code(500).send({ error: err instanceof Error ? err.message : String(err) }); + } + } + return reply.code(404).send({ error: `MAC ${mac} not found on any bastion` }); + } + + try { + const result = await sendCommand(bastion.bastionId, { type: "command-debug", mac }); + return reply.code(result.status === "ok" ? 200 : 500).send(result); + } catch (err) { + return reply.code(500).send({ error: err instanceof Error ? err.message : String(err) }); + } + }); + // Forget machine app.delete<{ Params: { mac: string } }>("/api/machines/:mac", async (request, reply) => { const mac = request.params.mac.toLowerCase().replace(/-/g, ":"); diff --git a/bastion/src/labd/src/services/bastion-registry.ts b/bastion/src/labd/src/services/bastion-registry.ts index 15d0570..dba23a4 100644 --- a/bastion/src/labd/src/services/bastion-registry.ts +++ b/bastion/src/labd/src/services/bastion-registry.ts @@ -3,7 +3,7 @@ import { EventEmitter } from "node:events"; import type { WebSocket } from "ws"; -import type { BastionState, HardwareInfo, InstallConfig, InstalledInfo } from "@lab/shared"; +import type { BastionState, HardwareInfo, InstallConfig, InstalledInfo, DebugConfig } from "@lab/shared"; export interface ConnectedBastion { bastionId: string; @@ -20,6 +20,7 @@ export interface AggregatedState { discovered: Record; install_queue: Record; installed: Record; + debug: Record; } export class BastionRegistry extends EventEmitter { @@ -86,6 +87,7 @@ export class BastionRegistry extends EventEmitter { discovered: {}, install_queue: {}, installed: {}, + debug: {}, }; for (const bastion of this.bastions.values()) { @@ -98,6 +100,9 @@ export class BastionRegistry extends EventEmitter { for (const [mac, info] of Object.entries(bastion.state.installed)) { result.installed[mac] = { ...info, bastionId: bastion.bastionId }; } + for (const [mac, dbg] of Object.entries(bastion.state.debug ?? {})) { + result.debug[mac] = { ...dbg }; + } } return result; diff --git a/bastion/src/shared/src/index.ts b/bastion/src/shared/src/index.ts index 7179a6d..443edbc 100644 --- a/bastion/src/shared/src/index.ts +++ b/bastion/src/shared/src/index.ts @@ -5,6 +5,7 @@ export type { HardwareInfo, InstallConfig, InstalledInfo, + DebugConfig, BastionState, BastionConfig, } from "./types/index.js"; diff --git a/bastion/src/shared/src/types/index.ts b/bastion/src/shared/src/types/index.ts index 8ff20ed..510ee31 100644 --- a/bastion/src/shared/src/types/index.ts +++ b/bastion/src/shared/src/types/index.ts @@ -5,6 +5,7 @@ export type { HardwareInfo, InstallConfig, InstalledInfo, + DebugConfig, BastionState, } from "./state.js"; diff --git a/bastion/src/shared/src/types/state.ts b/bastion/src/shared/src/types/state.ts index 9be3d21..382d7d5 100644 --- a/bastion/src/shared/src/types/state.ts +++ b/bastion/src/shared/src/types/state.ts @@ -98,8 +98,14 @@ export interface InstalledInfo { bastionId?: string; // set when aggregated through labd } +export interface DebugConfig { + hostname: string; + queued_at: string; +} + export interface BastionState { discovered: Record; install_queue: Record; installed: Record; + debug: Record; }