feat: PXE debug boot mode for rescue/diagnostics #4
@@ -29,43 +29,46 @@ _labctl() {
|
||||
COMPREPLY=($(compgen -W "--dir -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"init bastion standalone status")
|
||||
COMPREPLY=($(compgen -W "--dir --port -h --help" -- "$cur"))
|
||||
COMPREPLY=($(compgen -W "-h --help" -- "$cur"))
|
||||
return ;;
|
||||
"init bastion standalone")
|
||||
COMPREPLY=($(compgen -W "start stop status -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"app labcontroller deploy")
|
||||
COMPREPLY=($(compgen -W "--user --port --crdb-replicas -h --help" -- "$cur"))
|
||||
COMPREPLY=($(compgen -W "--user --crdb-replicas -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"app labcontroller status")
|
||||
COMPREPLY=($(compgen -W "--user --port -h --help" -- "$cur"))
|
||||
COMPREPLY=($(compgen -W "--user -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"app k3s install")
|
||||
COMPREPLY=($(compgen -W "--role --user --port --k3s-server --k3s-token -h --help" -- "$cur"))
|
||||
COMPREPLY=($(compgen -W "--role --user --k3s-server --k3s-token -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"app k3s health")
|
||||
COMPREPLY=($(compgen -W "--user --port -h --help" -- "$cur"))
|
||||
COMPREPLY=($(compgen -W "--user -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"app k3s list")
|
||||
COMPREPLY=($(compgen -W "--user --port -h --help" -- "$cur"))
|
||||
COMPREPLY=($(compgen -W "--user -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"init bastion")
|
||||
COMPREPLY=($(compgen -W "standalone -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"provision list")
|
||||
COMPREPLY=($(compgen -W "--port -h --help" -- "$cur"))
|
||||
COMPREPLY=($(compgen -W "-h --help" -- "$cur"))
|
||||
return ;;
|
||||
"provision install")
|
||||
COMPREPLY=($(compgen -W "--role --os --disk --port -h --help" -- "$cur"))
|
||||
COMPREPLY=($(compgen -W "--role --os --disk -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"provision reprovision")
|
||||
COMPREPLY=($(compgen -W "--role --os --disk --port -h --help" -- "$cur"))
|
||||
COMPREPLY=($(compgen -W "--role --os --disk -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"provision forget")
|
||||
COMPREPLY=($(compgen -W "--port -h --help" -- "$cur"))
|
||||
COMPREPLY=($(compgen -W "-h --help" -- "$cur"))
|
||||
return ;;
|
||||
"provision logs")
|
||||
COMPREPLY=($(compgen -W "-f --follow --port -h --help" -- "$cur"))
|
||||
COMPREPLY=($(compgen -W "-h --help" -- "$cur"))
|
||||
return ;;
|
||||
"provision makeiso")
|
||||
COMPREPLY=($(compgen -W "--arch --local --out -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"config list")
|
||||
COMPREPLY=($(compgen -W "-h --help" -- "$cur"))
|
||||
@@ -92,7 +95,7 @@ _labctl() {
|
||||
COMPREPLY=($(compgen -W "bastion -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"provision")
|
||||
COMPREPLY=($(compgen -W "list install reprovision forget logs -h --help" -- "$cur"))
|
||||
COMPREPLY=($(compgen -W "list install reprovision forget logs makeiso -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"config")
|
||||
COMPREPLY=($(compgen -W "list get set path -h --help" -- "$cur"))
|
||||
|
||||
@@ -118,38 +118,28 @@ complete -c labctl -n "__labctl_in_cmd init bastion standalone start" -l foregro
|
||||
# init bastion standalone stop options
|
||||
complete -c labctl -n "__labctl_in_cmd init bastion standalone stop" -l dir -d 'Bastion data directory' -x
|
||||
|
||||
# init bastion standalone status options
|
||||
complete -c labctl -n "__labctl_in_cmd init bastion standalone status" -l dir -d 'Bastion data directory' -x
|
||||
complete -c labctl -n "__labctl_in_cmd init bastion standalone status" -l port -d 'Bastion HTTP port' -x
|
||||
|
||||
# provision subcommands
|
||||
complete -c labctl -n "__labctl_using_cmd provision" -a list -d 'List all known machines'
|
||||
complete -c labctl -n "__labctl_using_cmd provision" -a install -d 'Queue a discovered machine for OS installation'
|
||||
complete -c labctl -n "__labctl_using_cmd provision" -a reprovision -d 'Queue install + SSH reboot into PXE (target: hostname, MAC, or IP)'
|
||||
complete -c labctl -n "__labctl_using_cmd provision" -a forget -d 'Remove a machine from bastion state'
|
||||
complete -c labctl -n "__labctl_using_cmd provision" -a logs -d 'Show provisioning logs for a machine (hostname, MAC, or IP)'
|
||||
|
||||
# provision list options
|
||||
complete -c labctl -n "__labctl_in_cmd provision list" -l port -d 'Bastion HTTP port' -x
|
||||
complete -c labctl -n "__labctl_using_cmd provision" -a makeiso -d 'Generate a UEFI-bootable iPXE ISO for network provisioning'
|
||||
|
||||
# provision install options
|
||||
complete -c labctl -n "__labctl_in_cmd provision install" -l role -d 'Machine role (see below)' -xa 'vanilla worker infra labcontroller'
|
||||
complete -c labctl -n "__labctl_in_cmd provision install" -l os -d 'Operating system' -xa 'fedora-43 ubuntu-26.04'
|
||||
complete -c labctl -n "__labctl_in_cmd provision install" -l disk -d 'Target disk device (auto-detect if omitted)' -x
|
||||
complete -c labctl -n "__labctl_in_cmd provision install" -l port -d 'Bastion HTTP port' -x
|
||||
|
||||
# provision reprovision options
|
||||
complete -c labctl -n "__labctl_in_cmd provision reprovision" -l role -d 'Machine role (see below)' -xa 'vanilla worker infra labcontroller'
|
||||
complete -c labctl -n "__labctl_in_cmd provision reprovision" -l os -d 'Operating system' -xa 'fedora-43 ubuntu-26.04'
|
||||
complete -c labctl -n "__labctl_in_cmd provision reprovision" -l disk -d 'Target disk device (auto-detect if omitted)' -x
|
||||
complete -c labctl -n "__labctl_in_cmd provision reprovision" -l port -d 'Bastion HTTP port' -x
|
||||
|
||||
# provision forget options
|
||||
complete -c labctl -n "__labctl_in_cmd provision forget" -l port -d 'Bastion HTTP port' -x
|
||||
|
||||
# provision logs options
|
||||
complete -c labctl -n "__labctl_in_cmd provision logs" -s f -l follow -d 'Follow logs in real-time (SSE stream)'
|
||||
complete -c labctl -n "__labctl_in_cmd provision logs" -l port -d 'Bastion HTTP port' -x
|
||||
# provision makeiso options
|
||||
complete -c labctl -n "__labctl_in_cmd provision makeiso" -l arch -d 'Target architecture(s)' -xa 'x86_64 aarch64'
|
||||
complete -c labctl -n "__labctl_in_cmd provision makeiso" -l local -d 'Build ISO locally instead of using bastion-hosted URL'
|
||||
complete -c labctl -n "__labctl_in_cmd provision makeiso" -l out -d 'Output path for local ISO build' -x
|
||||
|
||||
# config subcommands
|
||||
complete -c labctl -n "__labctl_using_cmd config" -a list -d 'Show all configuration values'
|
||||
@@ -173,12 +163,10 @@ complete -c labctl -n "__labctl_using_cmd app labcontroller" -a status -d 'Check
|
||||
|
||||
# app labcontroller deploy options
|
||||
complete -c labctl -n "__labctl_in_cmd app labcontroller deploy" -l user -d 'SSH user' -x
|
||||
complete -c labctl -n "__labctl_in_cmd app labcontroller deploy" -l port -d 'Bastion HTTP port' -x
|
||||
complete -c labctl -n "__labctl_in_cmd app labcontroller deploy" -l crdb-replicas -d 'CockroachDB replicas' -x
|
||||
|
||||
# app labcontroller status options
|
||||
complete -c labctl -n "__labctl_in_cmd app labcontroller status" -l user -d 'SSH user' -x
|
||||
complete -c labctl -n "__labctl_in_cmd app labcontroller status" -l port -d 'Bastion HTTP port' -x
|
||||
|
||||
# app k3s subcommands
|
||||
complete -c labctl -n "__labctl_using_cmd app k3s" -a install -d 'Install k3s on a target machine (hostname, IP, or MAC)'
|
||||
@@ -188,15 +176,12 @@ complete -c labctl -n "__labctl_using_cmd app k3s" -a list -d 'List installed ma
|
||||
# app k3s install options
|
||||
complete -c labctl -n "__labctl_in_cmd app k3s install" -l role -d 'k3s role: infra (server) or worker (agent)' -x
|
||||
complete -c labctl -n "__labctl_in_cmd app k3s install" -l user -d 'SSH user' -x
|
||||
complete -c labctl -n "__labctl_in_cmd app k3s install" -l port -d 'Bastion HTTP port (for resolving target)' -x
|
||||
complete -c labctl -n "__labctl_in_cmd app k3s install" -l k3s-server -d 'k3s server URL (required for worker role)' -x
|
||||
complete -c labctl -n "__labctl_in_cmd app k3s install" -l k3s-token -d 'k3s join token (required for worker role)' -x
|
||||
|
||||
# app k3s health options
|
||||
complete -c labctl -n "__labctl_in_cmd app k3s health" -l user -d 'SSH user' -x
|
||||
complete -c labctl -n "__labctl_in_cmd app k3s health" -l port -d 'Bastion HTTP port' -x
|
||||
|
||||
# app k3s list options
|
||||
complete -c labctl -n "__labctl_in_cmd app k3s list" -l user -d 'SSH user' -x
|
||||
complete -c labctl -n "__labctl_in_cmd app k3s list" -l port -d 'Bastion HTTP port' -x
|
||||
|
||||
|
||||
@@ -266,6 +266,21 @@ export async function startBastion(overrides: Partial<BastionConfig> = {}): Prom
|
||||
return { status: "ok", data: { mac: msg.mac, hostname: msg.hostname } };
|
||||
});
|
||||
|
||||
// Handle a "command-debug" message from the labd connection: record a
// debug/rescue entry for the machine so its next /dispatch request is
// answered with the rescue iPXE script.
labdConn.onCommand("command-debug", async (msg) => {
  // Narrows the message union; any other type here means mis-routing.
  if (msg.type !== "command-debug") throw new Error("unexpected");

  // Normalise exactly like /dispatch does (lower-case, dashes -> colons).
  // Dispatch looks entries up under that form, so a dash-separated MAC
  // stored verbatim would never be found.
  const mac = msg.mac.toLowerCase().replace(/-/g, ":");

  // Best available label: installed hostname, then queued hostname, then
  // the discovered product string, and finally the MAC itself.
  const snapshot = state.load();
  const hostname =
    snapshot.installed[mac]?.hostname ??
    snapshot.install_queue[mac]?.hostname ??
    snapshot.discovered[mac]?.product ??
    mac;

  state.update((s) => {
    s.debug[mac] = { hostname, queued_at: new Date().toISOString() };
  });

  return { status: "ok", data: { mac, hostname } };
});
|
||||
|
||||
labdConn.onCommand("command-forget", async (msg) => {
|
||||
if (msg.type !== "command-forget") throw new Error("unexpected");
|
||||
const mac = msg.mac.toLowerCase();
|
||||
@@ -273,6 +288,7 @@ export async function startBastion(overrides: Partial<BastionConfig> = {}): Prom
|
||||
delete s.discovered[mac];
|
||||
delete s.install_queue[mac];
|
||||
delete s.installed[mac];
|
||||
delete s.debug[mac];
|
||||
});
|
||||
return { status: "ok", data: { mac } };
|
||||
});
|
||||
|
||||
@@ -189,6 +189,31 @@ export function registerApiRoutes(
|
||||
return reply.send({ status: "ok", lines: allLines.length });
|
||||
});
|
||||
|
||||
// Queue debug/rescue mode for a machine.
// Body: { mac }. Responds 400 when `mac` is missing, empty, or not a string;
// otherwise stores a debug entry keyed by the normalised MAC and echoes
// { status, mac, hostname }.
app.post<{
  Body: { mac?: string };
}>("/api/debug", async (request, reply) => {
  // The Body type is not enforced at runtime, so a client may send a number
  // or object here. Check before calling string methods so that case is a
  // 400 rather than an unhandled TypeError.
  const rawMac: unknown = request.body?.mac;
  if (typeof rawMac !== "string" || rawMac === "") {
    return reply.status(400).send({ error: "mac is required" });
  }
  const mac = rawMac.toLowerCase().replace(/-/g, ":");

  // Look up a label from installed, queued, or discovered state; fall back
  // to the MAC when the machine is unknown.
  const currentState = state.load();
  const hostname =
    currentState.installed[mac]?.hostname ??
    currentState.install_queue[mac]?.hostname ??
    currentState.discovered[mac]?.product ??
    mac;

  state.update((s) => {
    s.debug[mac] = { hostname, queued_at: new Date().toISOString() };
  });

  logger.info(`DEBUG QUEUED: ${mac} -> ${hostname}`);
  return reply.send({ status: "ok", mac, hostname });
});
|
||||
|
||||
// Delete a machine from all state
|
||||
app.delete<{
|
||||
Params: { mac: string };
|
||||
@@ -213,6 +238,10 @@ export function registerApiRoutes(
|
||||
delete s.installed[mac];
|
||||
found = true;
|
||||
}
|
||||
if (s.debug[mac] !== undefined) {
|
||||
delete s.debug[mac];
|
||||
found = true;
|
||||
}
|
||||
});
|
||||
|
||||
if (!found) {
|
||||
|
||||
@@ -10,9 +10,11 @@ import type { StateManager } from "../services/state.js";
|
||||
import {
|
||||
renderDiscoverIpxe,
|
||||
renderInstallIpxe,
|
||||
renderDebugIpxe,
|
||||
renderLocalBootIpxe,
|
||||
} from "../templates/boot.ipxe.js";
|
||||
import { renderUbuntuInstallIpxe } from "../templates/ubuntu-boot.ipxe.js";
|
||||
import { renderDebugKickstart } from "../templates/debug.ks.js";
|
||||
import { logger } from "../services/logger.js";
|
||||
|
||||
export function registerDispatchRoutes(
|
||||
@@ -20,10 +22,34 @@ export function registerDispatchRoutes(
|
||||
config: BastionConfig,
|
||||
state: StateManager,
|
||||
): void {
|
||||
// Debug/rescue kickstart endpoint. The rendered file depends only on the
// configured SSH keys; the `mac` query parameter is accepted but not read.
app.get<{ Querystring: { mac?: string } }>("/debug.ks", async (_request, reply) => {
  const sshKeys = config.sshKeys ?? [];
  const kickstart = renderDebugKickstart({ sshKeys });
  reply.type("text/plain");
  return reply.send(kickstart);
});
|
||||
|
||||
app.get<{ Querystring: { mac?: string } }>("/dispatch", async (request, reply) => {
|
||||
const mac = (request.query.mac ?? "").toLowerCase().replace(/-/g, ":");
|
||||
const currentState = state.load();
|
||||
|
||||
// Debug mode takes highest priority — auto-clear after serving once
|
||||
const debugEntry = currentState.debug[mac];
|
||||
if (debugEntry) {
|
||||
const hostname = debugEntry.hostname ?? "debug";
|
||||
logger.info(`DEBUG BOOT: ${mac} -> ${hostname} (rescue mode)`);
|
||||
|
||||
state.update((s) => { delete s.debug[mac]; });
|
||||
|
||||
const script = renderDebugIpxe({
|
||||
mac,
|
||||
hostname,
|
||||
serverIp: config.serverIp,
|
||||
httpPort: config.httpPort,
|
||||
fedoraMirror: config.fedoraMirror,
|
||||
});
|
||||
return reply.type("text/plain").send(script);
|
||||
}
|
||||
|
||||
const queueEntry = currentState.install_queue[mac];
|
||||
if (queueEntry) {
|
||||
const hostname = queueEntry.hostname ?? "lab-node";
|
||||
|
||||
@@ -11,6 +11,7 @@ const EMPTY_STATE: BastionState = {
|
||||
discovered: {},
|
||||
install_queue: {},
|
||||
installed: {},
|
||||
debug: {},
|
||||
};
|
||||
|
||||
export type StateChangeListener = (state: BastionState) => void;
|
||||
@@ -33,6 +34,7 @@ export class StateManager {
|
||||
discovered: parsed.discovered ?? {},
|
||||
install_queue: parsed.install_queue ?? {},
|
||||
installed: parsed.installed ?? {},
|
||||
debug: parsed.debug ?? {},
|
||||
};
|
||||
} catch {
|
||||
return { ...EMPTY_STATE };
|
||||
|
||||
@@ -75,6 +75,33 @@ boot
|
||||
`;
|
||||
}
|
||||
|
||||
/**
 * Build the iPXE script served to a machine queued for debug/rescue mode.
 *
 * The script prints a banner naming the target, then loads `vmlinuz` and
 * `initrd.img` from the bastion HTTP server with the `inst.rescue`,
 * `inst.text` and `inst.sshd` kernel arguments, pointing `inst.ks` at the
 * bastion's `/debug.ks` and `inst.stage2` at the configured Fedora mirror.
 *
 * @param params.mac - MAC shown in the banner and passed to `/debug.ks`.
 * @param params.hostname - Label shown in the banner.
 * @param params.serverIp - Bastion HTTP host.
 * @param params.httpPort - Bastion HTTP port.
 * @param params.fedoraMirror - Value passed verbatim as `inst.stage2`.
 * @returns The complete script text, terminated by a newline.
 */
export function renderDebugIpxe(params: {
  mac: string;
  hostname: string;
  serverIp: string;
  httpPort: number;
  fedoraMirror: string;
}): string {
  const { mac, hostname, serverIp, httpPort, fedoraMirror } = params;
  const base = `http://${serverIp}:${httpPort}`;
  const rule = "echo =============================================";

  const kernelArgs = [
    "inst.rescue",
    "inst.text",
    "inst.sshd",
    `inst.ks=${base}/debug.ks?mac=${mac}`,
    `inst.stage2=${fedoraMirror}`,
  ].join(" ");

  const rows = [
    "#!ipxe",
    "",
    "echo",
    rule,
    "echo Lab PXE Bastion - DEBUG/RESCUE MODE",
    `echo Target: ${hostname}`,
    `echo MAC: ${mac}`,
    rule,
    "echo",
    "",
    `kernel ${base}/vmlinuz ${kernelArgs}`,
    `initrd ${base}/initrd.img`,
    "boot",
    "", // keeps the trailing newline after `boot`
  ];
  return rows.join("\n");
}
|
||||
|
||||
/**
|
||||
* iPXE script for already-installed machines -- exits to boot from local disk.
|
||||
*/
|
||||
|
||||
25
bastion/src/bastion/src/templates/debug.ks.ts
Normal file
25
bastion/src/bastion/src/templates/debug.ks.ts
Normal file
@@ -0,0 +1,25 @@
|
||||
// Debug/rescue kickstart template.
|
||||
// Minimal: sets SSH access and network for Anaconda rescue mode.
|
||||
// No disk operations, no packages, no %post.
|
||||
|
||||
/** Inputs for the debug/rescue kickstart template. */
export interface DebugKickstartParams {
  /** Public SSH keys to authorise for root; one `sshkey` line is emitted per usable key. */
  sshKeys: string[];
}

/**
 * Render the minimal kickstart used for debug/rescue boots: language,
 * keyboard, DHCP networking, a root `sshpw` line and one `sshkey` line per
 * configured key.
 *
 * Every configured key is emitted (previously only the first one was, so
 * other operators' keys were silently dropped). Keys are trimmed and blank
 * entries skipped, because a trailing newline or an empty string would
 * otherwise produce a malformed quoted line. Output for zero keys or one
 * clean key is unchanged.
 *
 * @param params - See {@link DebugKickstartParams}.
 * @returns Kickstart text terminated by a newline.
 */
export function renderDebugKickstart(params: DebugKickstartParams): string {
  // NOTE(review): the root password is a hard-coded plaintext value that is
  // served over plain HTTP -- confirm this is acceptable for the lab network.
  const sshpw = "sshpw --username=root --plaintext lab-root-pw";

  // NOTE(review): per the kickstart reference, `sshkey` appears to target the
  // installed system, while `sshpw --sshkey` configures the installer
  // environment; confirm which one rescue mode needs. Repeating `sshkey` for
  // the same user may also produce a pykickstart warning -- verify.
  const sshkeyLines = params.sshKeys
    .map((key) => key.trim())
    .filter((key) => key !== "")
    .map((key) => `sshkey --username=root "${key}"`)
    .join("\n");

  return `# Lab Bastion -- Debug/Rescue Kickstart
# Minimal: only SSH + network for Anaconda rescue mode

lang en_US.UTF-8
keyboard uk
network --bootproto=dhcp --activate

${sshpw}
${sshkeyLines}
`;
}
|
||||
@@ -322,39 +322,20 @@ bastion_progress() {
|
||||
-d "{\\"mac\\":\\"$mac\\",\\"stage\\":\\"$stage\\",\\"detail\\":\\"$detail\\"}" 2>/dev/null || true
|
||||
}
|
||||
|
||||
# Send log lines to bastion
|
||||
bastion_log() {
|
||||
local line="$1"
|
||||
local mac=$(ip link show | awk '/ether/ && !/00:00:00:00/ {print $2; exit}')
|
||||
curl -sf -X POST "http://${serverIp}:${httpPort}/api/log" \\
|
||||
-H "Content-Type: application/json" \\
|
||||
-d "{\\"mac\\":\\"$mac\\",\\"line\\":\\"$(echo "$line" | sed 's/\\\\/\\\\\\\\/g; s/"/\\\\"/g')\\"}\" \\
|
||||
--connect-timeout 5 --max-time 10 2>/dev/null || true
|
||||
}
|
||||
|
||||
# Send an error stage to bastion
|
||||
bastion_error() {
|
||||
local detail="$1"
|
||||
bastion_progress "error" "$detail"
|
||||
}
|
||||
|
||||
# --- Error trap: catch any failure and report to bastion ---
|
||||
_post_error_handler() {
|
||||
local exit_code=$? lineno=$1
|
||||
bastion_error "%post failed at line $lineno (exit $exit_code)"
|
||||
}
|
||||
trap '_post_error_handler $LINENO' ERR
|
||||
|
||||
bastion_progress "post-install" "configuring system"
|
||||
|
||||
# -- SSH --
|
||||
systemctl enable --now sshd
|
||||
# Note: only 'enable', not '--now' — systemd is not running in the Anaconda chroot
|
||||
systemctl enable sshd || true
|
||||
sed -i 's/^#\\?PermitRootLogin.*/PermitRootLogin prohibit-password/' /etc/ssh/sshd_config
|
||||
sed -i 's/^#\\?PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config
|
||||
${sshPostBlock}
|
||||
|
||||
# -- Hostname and domain --
|
||||
hostnamectl set-hostname ${fqdn}
|
||||
bastion_progress "post-install" "1-ssh done"
|
||||
|
||||
# -- Hostname and domain (write directly, hostnamectl needs D-Bus) --
|
||||
echo "${fqdn}" > /etc/hostname
|
||||
|
||||
# -- tmpfs for /tmp --
|
||||
echo "tmpfs /tmp tmpfs defaults,noatime,nosuid,nodev,size=4G 0 0" >> /etc/fstab
|
||||
@@ -392,12 +373,15 @@ SYSCTL
|
||||
sysctl --system || true
|
||||
|
||||
# -- Disable firewalld permanently (k3s/Cilium manage iptables directly) --
|
||||
systemctl disable --now firewalld || true
|
||||
# Note: no '--now' — systemd is not running in the Anaconda chroot
|
||||
systemctl disable firewalld || true
|
||||
systemctl mask firewalld || true
|
||||
|
||||
# -- Enable chronyd for time sync --
|
||||
systemctl enable chronyd || true`}
|
||||
|
||||
bastion_progress "post-install" "2-system done"
|
||||
|
||||
# -- Boot order: restore network first (Anaconda sets disk first, we undo it) --
|
||||
# Network boot must stay first so the bastion intercepts every reboot.
|
||||
if command -v efibootmgr >/dev/null 2>&1; then
|
||||
@@ -410,6 +394,8 @@ if command -v efibootmgr >/dev/null 2>&1; then
|
||||
fi
|
||||
fi
|
||||
|
||||
bastion_progress "post-install" "3-bootorder done"
|
||||
|
||||
# -- Provisioning metadata --
|
||||
cat > /etc/lab-provisioned << PROVEOF
|
||||
hostname: ${fqdn}
|
||||
@@ -435,6 +421,8 @@ README
|
||||
${hasRancher ? `# Install k3s server (skip start - will be configured manually)
|
||||
curl -sfL https://get.k3s.io | INSTALL_K3S_SKIP_START=true sh -
|
||||
` : ""}
|
||||
bastion_progress "post-install" "4-metadata done"
|
||||
|
||||
IP_ADDR=$(ip -4 addr show | awk '/inet / && !/127.0.0/ {split($2,a,"/"); print a[1]; exit}')
|
||||
bastion_progress "complete" "ready at $IP_ADDR"
|
||||
|
||||
|
||||
@@ -26,6 +26,7 @@ describe("StateManager", () => {
|
||||
discovered: {},
|
||||
install_queue: {},
|
||||
installed: {},
|
||||
debug: {},
|
||||
});
|
||||
});
|
||||
|
||||
@@ -39,6 +40,7 @@ describe("StateManager", () => {
|
||||
discovered: {},
|
||||
install_queue: {},
|
||||
installed: {},
|
||||
debug: {},
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
@@ -94,6 +94,10 @@ export class LabdClient {
|
||||
return this.request("POST", "/api/machines/install", { body: opts });
|
||||
}
|
||||
|
||||
/**
 * Ask labd to queue a debug/rescue boot for a machine.
 *
 * Sends `{ mac }` as the body of `POST /api/machines/debug`.
 *
 * @param mac - MAC address of the target machine.
 * @returns The response object: `status`, plus `data` or `error` when present.
 */
async debugMachine(mac: string): Promise<{ status: string; data?: { mac: string; hostname: string }; error?: string }> {
  const payload = { mac };
  return this.request("POST", "/api/machines/debug", { body: payload });
}
|
||||
|
||||
/**
 * Ask labd to forget a machine.
 *
 * Issues `DELETE /api/machines/<mac>` with the MAC URL-encoded into the path.
 *
 * @param mac - MAC address of the machine to remove.
 * @returns The response object carrying `status`.
 */
async forgetMachine(mac: string): Promise<{ status: string }> {
  const path = `/api/machines/${encodeURIComponent(mac)}`;
  return this.request("DELETE", path);
}
|
||||
|
||||
153
bastion/src/cli/src/commands/debug.ts
Normal file
153
bastion/src/cli/src/commands/debug.ts
Normal file
@@ -0,0 +1,153 @@
|
||||
// CLI command: provision debug
|
||||
// Queue a machine for debug/rescue PXE boot and optionally SSH reboot into PXE.
|
||||
|
||||
import { execFileSync } from "node:child_process";
|
||||
import { existsSync } from "node:fs";
|
||||
import { homedir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { Command } from "commander";
|
||||
import type { BastionState } from "@lab/shared";
|
||||
import { getLabdClient } from "../api/config.js";
|
||||
|
||||
/**
 * Resolve a target (hostname, MAC, or IP) to `{ mac, hostname, ip }` from state.
 *
 * Lookup order:
 *  1. MAC (case-insensitive, `-` or `:` separated) in `installed`, then
 *     `discovered`, then `install_queue`;
 *  2. installed hostname, matching either the full name or its first label(s)
 *     (`node1` matches `node1.lab.local`), case-insensitively;
 *  3. installed IP address (exact match).
 *
 * Discovered and queued machines have no known IP here, so `ip` is `""` for
 * them; discovered machines use the MAC as their hostname.
 *
 * @param target - User-supplied identifier.
 * @param state - Machine state to search.
 * @returns The resolved machine, or `null` when nothing matches (including
 *          for an empty target).
 */
function resolveTarget(
  target: string,
  state: BastionState,
): { mac: string; hostname: string; ip: string } | null {
  // An empty target must never match; otherwise `info.ip === ""` could pick
  // an arbitrary installed machine that has no IP recorded.
  if (target.trim() === "") {
    return null;
  }

  // Own-property lookup: plain bracket access would also see inherited keys,
  // so a target such as "constructor" or "__proto__" resolved to a bogus entry.
  const own = <T>(table: Record<string, T>, key: string): T | undefined =>
    Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined;

  const normalized = target.toLowerCase().replace(/-/g, ":");

  const installed = own(state.installed, normalized);
  if (installed) {
    return { mac: normalized, hostname: installed.hostname, ip: installed.ip };
  }

  if (own(state.discovered, normalized)) {
    return { mac: normalized, hostname: normalized, ip: "" };
  }

  const queued = own(state.install_queue, normalized);
  if (queued) {
    return { mac: normalized, hostname: queued.hostname, ip: "" };
  }

  // Hostnames are compared case-insensitively. All hostnames are tried
  // before any IP so the original precedence is kept.
  const wanted = target.toLowerCase();
  for (const [mac, info] of Object.entries(state.installed)) {
    const host = info.hostname.toLowerCase();
    if (host === wanted || host.startsWith(wanted + ".")) {
      return { mac, hostname: info.hostname, ip: info.ip };
    }
  }

  for (const [mac, info] of Object.entries(state.installed)) {
    if (info.ip === target) {
      return { mac, hostname: info.hostname, ip: info.ip };
    }
  }

  return null;
}
|
||||
|
||||
/**
 * Pick the first existing private key among id_ed25519, id_rsa and id_ecdsa
 * in the invoking user's ~/.ssh. Under sudo the home is taken to be
 * /home/$SUDO_USER so the real user's keys are used rather than root's.
 */
function findSshIdentity(): string | undefined {
  const sudoUser = process.env["SUDO_USER"];
  const realHome = sudoUser !== undefined ? join("/home", sudoUser) : homedir();
  return ["id_ed25519", "id_rsa", "id_ecdsa"]
    .map((name) => join(realHome, ".ssh", name))
    .find((candidate) => existsSync(candidate));
}

/**
 * SSH to the machine, set the first PXE/network/IPv4 EFI entry as BootNext
 * when one exists, and reboot. Best effort: a non-zero ssh exit is ignored.
 */
function rebootIntoPxe(user: string, ip: string): void {
  const identity = findSshIdentity();
  const sshArgs = [
    "-o", "StrictHostKeyChecking=no",
    "-o", "ConnectTimeout=10",
    ...(identity !== undefined ? ["-i", identity] : []),
    `${user}@${ip}`,
    'PXE_ENTRY=$(sudo efibootmgr | grep -iE "pxe|network|ipv4" | head -1 | grep -oP "Boot\\K[0-9A-F]+"); if [ -n "$PXE_ENTRY" ]; then sudo efibootmgr --bootnext "$PXE_ENTRY" && echo "PXE set as next boot" && sudo reboot; else echo "No PXE boot entry found, rebooting anyway..." && sudo reboot; fi',
  ];

  try {
    execFileSync("ssh", sshArgs, { stdio: "inherit" });
  } catch (err: unknown) {
    // A non-zero exit is expected (the connection drops during reboot) and
    // stays silent. A spawn failure carries a string `code` such as ENOENT
    // and means ssh never ran, so tell the operator instead of hiding it.
    const code = err instanceof Error ? (err as NodeJS.ErrnoException).code : undefined;
    if (typeof code === "string") {
      console.error(`Could not run ssh (${code}); reboot the machine manually.`);
    }
  }
}

/** Print the post-queue summary and the rescue-shell cheat sheet. */
function printRescueHelp(hostname: string, mac: string): void {
  console.log(`
Debug mode queued for ${hostname} (${mac}).
Reboot the machine to enter Fedora rescue mode.

Once in rescue shell:

# Activate LVM
vgchange -ay labvg

# Mount root + other volumes
mkdir -p /mnt/sysroot
mount /dev/labvg/root /mnt/sysroot
cat /mnt/sysroot/etc/fstab # check what else to mount
mount /dev/labvg/var /mnt/sysroot/var
mount /dev/labvg/home /mnt/sysroot/home

# Boot the installed system in a container
/mnt/sysroot/usr/bin/systemd-nspawn -D /mnt/sysroot --boot

# Or just chroot for quick fixes
mount --bind /dev /mnt/sysroot/dev
mount --bind /proc /mnt/sysroot/proc
mount --bind /sys /mnt/sysroot/sys
chroot /mnt/sysroot

# Check initramfs size
ls -lh /mnt/sysroot/boot/initramfs-*.img

# Rebuild initramfs without amdgpu
chroot /mnt/sysroot
echo 'omit_drivers+=" amdgpu "' > /etc/dracut.conf.d/omit-amdgpu.conf
dracut -f --regenerate-all
`);
}

/**
 * Register `debug <target>` on the given parent command.
 *
 * The action resolves the target against the machine state fetched from
 * labd, asks labd to queue debug mode for its MAC, and -- when an IP is known
 * and the invoking user is not root -- tries an SSH reboot into PXE. It then
 * prints rescue-shell instructions. Any failure before the SSH step exits
 * the process with status 1.
 *
 * @param parent - Command under which `debug` is registered.
 */
export function registerDebugCommand(parent: Command): void {
  parent
    .command("debug <target>")
    .description("PXE boot into Fedora rescue mode for debugging (target: hostname, MAC, or IP)")
    .showHelpAfterError(true)
    .action(async (target: string) => {
      const client = getLabdClient();

      // Resolve target from labd aggregated state
      let state: BastionState;
      try {
        state = await client.getMachines();
      } catch (err) {
        console.error(`Cannot reach labd: ${err instanceof Error ? err.message : String(err)}`);
        process.exit(1);
      }

      const resolved = resolveTarget(target, state);
      if (!resolved) {
        console.error(`Cannot find machine: ${target}`);
        console.error("Provide a hostname, MAC, or IP of a known machine.");
        console.error("Run 'labctl provision list' to see available machines.");
        process.exit(1);
      }

      const { mac, hostname, ip } = resolved;
      console.log(`Queuing debug mode for ${hostname} (${mac})...`);

      try {
        const result = await client.debugMachine(mac);
        // Treat any non-"ok" status as a failure too, so a rejected request
        // that carries no `error` text is not reported as queued.
        if (result.error || result.status !== "ok") {
          console.error(`Failed: ${result.error || `status ${result.status}`}`);
          process.exit(1);
        }
      } catch (err) {
        console.error(`Failed to queue debug: ${err instanceof Error ? err.message : String(err)}`);
        process.exit(1);
      }

      // Try SSH reboot into PXE. Skipped when no IP is known or when the only
      // identifiable user is root.
      if (ip !== "") {
        const adminUser = process.env["SUDO_USER"] ?? process.env["USER"] ?? "";
        const effectiveUser = adminUser === "root" ? "" : adminUser;

        if (effectiveUser !== "") {
          console.log(`\nAttempting SSH reboot into PXE (${effectiveUser}@${ip})...`);
          rebootIntoPxe(effectiveUser, ip);
        }
      }

      printRescueHelp(hostname, mac);
    });
}
|
||||
@@ -14,6 +14,7 @@ import { registerStatusCommand } from "./commands/status.js";
|
||||
import { registerInstallCommand } from "./commands/install.js";
|
||||
import { registerListCommand } from "./commands/list.js";
|
||||
import { registerReprovisionCommand } from "./commands/reprovision.js";
|
||||
import { registerDebugCommand } from "./commands/debug.js";
|
||||
import { registerForgetCommand } from "./commands/forget.js";
|
||||
import { registerLogsCommand } from "./commands/logs.js";
|
||||
import { registerMakeIsoCommand } from "./commands/makeiso.js";
|
||||
@@ -95,6 +96,7 @@ export function createProgram(): Command {
|
||||
registerListCommand(provisionCmd);
|
||||
registerInstallCommand(provisionCmd);
|
||||
registerReprovisionCommand(provisionCmd);
|
||||
registerDebugCommand(provisionCmd);
|
||||
registerForgetCommand(provisionCmd);
|
||||
registerLogsCommand(provisionCmd);
|
||||
registerMakeIsoCommand(provisionCmd);
|
||||
|
||||
@@ -172,6 +172,40 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void
|
||||
}
|
||||
});
|
||||
|
||||
// Queue debug/rescue mode — route to the correct bastion by MAC.
//
// Responses:
//   400  `mac` missing, empty, or not a string
//   503  no bastion is connected
//   404  MAC unknown and more than one bastion is connected (ambiguous)
//   200  bastion answered with status "ok"
//   500  bastion answered with another status, or sending the command threw
app.post<{
  Body: { mac?: string };
}>("/api/machines/debug", async (request, reply) => {
  // The Body type is not enforced at runtime; check before calling string
  // methods so a non-string `mac` is a 400 instead of an unhandled TypeError.
  const rawMac: unknown = request.body?.mac;
  if (typeof rawMac !== "string" || rawMac === "") {
    return reply.code(400).send({ error: "mac is required" });
  }
  const mac = rawMac.toLowerCase().replace(/-/g, ":");

  // Prefer the bastion that already knows this MAC. With exactly one bastion
  // connected, fall back to it even for an unknown MAC.
  let bastionId = bastionRegistry.findBastionByMac(mac)?.bastionId;
  if (bastionId === undefined) {
    const [only, ...others] = bastionRegistry.getAll();
    if (only === undefined) {
      return reply.code(503).send({ error: "No bastions connected" });
    }
    if (others.length > 0) {
      return reply.code(404).send({ error: `MAC ${mac} not found on any bastion` });
    }
    bastionId = only.bastionId;
  }

  try {
    const result = await sendCommand(bastionId, { type: "command-debug", mac });
    return reply.code(result.status === "ok" ? 200 : 500).send(result);
  } catch (err) {
    return reply.code(500).send({ error: err instanceof Error ? err.message : String(err) });
  }
});
|
||||
|
||||
// Forget machine
|
||||
app.delete<{ Params: { mac: string } }>("/api/machines/:mac", async (request, reply) => {
|
||||
const mac = request.params.mac.toLowerCase().replace(/-/g, ":");
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
import { EventEmitter } from "node:events";
|
||||
import type { WebSocket } from "ws";
|
||||
import type { BastionState, HardwareInfo, InstallConfig, InstalledInfo } from "@lab/shared";
|
||||
import type { BastionState, HardwareInfo, InstallConfig, InstalledInfo, DebugConfig } from "@lab/shared";
|
||||
|
||||
export interface ConnectedBastion {
|
||||
bastionId: string;
|
||||
@@ -20,6 +20,7 @@ export interface AggregatedState {
|
||||
discovered: Record<string, HardwareInfo>;
|
||||
install_queue: Record<string, InstallConfig>;
|
||||
installed: Record<string, InstalledInfo>;
|
||||
debug: Record<string, DebugConfig>;
|
||||
}
|
||||
|
||||
export class BastionRegistry extends EventEmitter {
|
||||
@@ -86,6 +87,7 @@ export class BastionRegistry extends EventEmitter {
|
||||
discovered: {},
|
||||
install_queue: {},
|
||||
installed: {},
|
||||
debug: {},
|
||||
};
|
||||
|
||||
for (const bastion of this.bastions.values()) {
|
||||
@@ -98,6 +100,9 @@ export class BastionRegistry extends EventEmitter {
|
||||
for (const [mac, info] of Object.entries(bastion.state.installed)) {
|
||||
result.installed[mac] = { ...info, bastionId: bastion.bastionId };
|
||||
}
|
||||
for (const [mac, dbg] of Object.entries(bastion.state.debug ?? {})) {
|
||||
result.debug[mac] = { ...dbg };
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
|
||||
@@ -5,6 +5,7 @@ export type {
|
||||
HardwareInfo,
|
||||
InstallConfig,
|
||||
InstalledInfo,
|
||||
DebugConfig,
|
||||
BastionState,
|
||||
BastionConfig,
|
||||
} from "./types/index.js";
|
||||
|
||||
@@ -5,6 +5,7 @@ export type {
|
||||
HardwareInfo,
|
||||
InstallConfig,
|
||||
InstalledInfo,
|
||||
DebugConfig,
|
||||
BastionState,
|
||||
} from "./state.js";
|
||||
|
||||
|
||||
@@ -98,8 +98,14 @@ export interface InstalledInfo {
|
||||
bastionId?: string; // set when aggregated through labd
|
||||
}
|
||||
|
||||
/** A pending debug/rescue boot request for one machine. */
export interface DebugConfig {
  /**
   * Display label for the machine: its installed or queued hostname, else
   * the discovered product string, else the MAC address.
   */
  hostname: string;
  /** When the request was queued, as an ISO 8601 timestamp. */
  queued_at: string;
}
|
||||
|
||||
/** Persistent bastion state; every map is keyed by MAC address. */
export interface BastionState {
  /** Hardware information for discovered machines. */
  discovered: Record<string, HardwareInfo>;
  /** Machines queued for OS installation. */
  install_queue: Record<string, InstallConfig>;
  /** Machines that have been installed. */
  installed: Record<string, InstalledInfo>;
  /** Pending debug/rescue boot requests. */
  debug: Record<string, DebugConfig>;
}
|
||||
|
||||
Reference in New Issue
Block a user