1 Commits

Author SHA1 Message Date
Michal
816736793d feat: debug --sshd flag, auto SSH + nc listener + IP callback
Some checks failed
CI/CD / lint (pull_request) Failing after 22s
CI/CD / typecheck (pull_request) Failing after 22s
CI/CD / test (pull_request) Failing after 23s
CI/CD / build (pull_request) Has been skipped
CI/CD / publish-rpm (pull_request) Has been skipped
CI/CD / publish-deb (pull_request) Has been skipped
When using `labctl provision debug <target> --sshd`, the rescue
kickstart generates SSH host keys, starts sshd (root password: debug) and
an nc shell listener (port 2323), and reports the machine's IP back to the
bastion via the /api/progress callback. Fully self-contained; no mounted
filesystem is needed.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-29 23:53:19 +01:00
12 changed files with 190 additions and 18 deletions

View File

@@ -61,6 +61,9 @@ _labctl() {
"provision reprovision") "provision reprovision")
COMPREPLY=($(compgen -W "--role --os --disk -h --help" -- "$cur")) COMPREPLY=($(compgen -W "--role --os --disk -h --help" -- "$cur"))
return ;; return ;;
"provision debug")
COMPREPLY=($(compgen -W "-h --help" -- "$cur"))
return ;;
"provision forget") "provision forget")
COMPREPLY=($(compgen -W "-h --help" -- "$cur")) COMPREPLY=($(compgen -W "-h --help" -- "$cur"))
return ;; return ;;
@@ -95,7 +98,7 @@ _labctl() {
COMPREPLY=($(compgen -W "bastion -h --help" -- "$cur")) COMPREPLY=($(compgen -W "bastion -h --help" -- "$cur"))
return ;; return ;;
"provision") "provision")
COMPREPLY=($(compgen -W "list install reprovision forget logs makeiso -h --help" -- "$cur")) COMPREPLY=($(compgen -W "list install reprovision debug forget logs makeiso -h --help" -- "$cur"))
return ;; return ;;
"config") "config")
COMPREPLY=($(compgen -W "list get set path -h --help" -- "$cur")) COMPREPLY=($(compgen -W "list get set path -h --help" -- "$cur"))

View File

@@ -122,6 +122,7 @@ complete -c labctl -n "__labctl_in_cmd init bastion standalone stop" -l dir -d '
complete -c labctl -n "__labctl_using_cmd provision" -a list -d 'List all known machines' complete -c labctl -n "__labctl_using_cmd provision" -a list -d 'List all known machines'
complete -c labctl -n "__labctl_using_cmd provision" -a install -d 'Queue a discovered machine for OS installation' complete -c labctl -n "__labctl_using_cmd provision" -a install -d 'Queue a discovered machine for OS installation'
complete -c labctl -n "__labctl_using_cmd provision" -a reprovision -d 'Queue install + SSH reboot into PXE (target: hostname, MAC, or IP)' complete -c labctl -n "__labctl_using_cmd provision" -a reprovision -d 'Queue install + SSH reboot into PXE (target: hostname, MAC, or IP)'
complete -c labctl -n "__labctl_using_cmd provision" -a debug -d 'PXE boot into Fedora rescue mode for debugging (target: hostname, MAC, or IP)'
complete -c labctl -n "__labctl_using_cmd provision" -a forget -d 'Remove a machine from bastion state' complete -c labctl -n "__labctl_using_cmd provision" -a forget -d 'Remove a machine from bastion state'
complete -c labctl -n "__labctl_using_cmd provision" -a logs -d 'Show provisioning logs for a machine (hostname, MAC, or IP)' complete -c labctl -n "__labctl_using_cmd provision" -a logs -d 'Show provisioning logs for a machine (hostname, MAC, or IP)'
complete -c labctl -n "__labctl_using_cmd provision" -a makeiso -d 'Generate a UEFI-bootable iPXE ISO for network provisioning' complete -c labctl -n "__labctl_using_cmd provision" -a makeiso -d 'Generate a UEFI-bootable iPXE ISO for network provisioning'

View File

@@ -0,0 +1,103 @@
# Kickstart Reference — Lessons Learned
This documents pitfalls discovered during PXE boot testing. Read before modifying
the kickstart template (`src/bastion/src/templates/install.ks.ts`).
## Package requirements
### `kernel-modules` is mandatory
`@core` only installs `kernel-modules-core`, which lacks common modules like `vfat`,
`zram`, and many network/filesystem drivers. Without `kernel-modules`:
- `/boot/efi` (FAT32) cannot mount → `systemd-remount-fs` fails → **root stays
read-only** → sshd-keygen can't write host keys → SSH unreachable
- `zram-generator` fails → can trigger emergency mode
**Always include `kernel-modules` in %packages.** This matches what the real
labmaster (192.168.8.11) has installed.
The regression was introduced in commit `fac14b6`, which removed `@server-product`
(that group pulled in `kernel-modules` via `fedora-release-server`).
### `dosfstools` is needed
Provides `mkfs.vfat` and ensures FAT filesystem support is available. The real
labmaster has it installed.
### Verify against the real machine
Before changing the package list, SSH to the labmaster and compare:
```bash
ssh 192.168.8.11 "rpm -q <package>"
```
## Anaconda %post execution order
This is critical and not well documented:
1. `%pre` scripts run
2. Disk partitioning and formatting
3. Package installation
4. **Anaconda writes system config (fstab, hostname, etc.)**
5. `%post` scripts run (in chroot of installed system)
6. `%post --nochroot` scripts run
7. **Anaconda MAY overwrite fstab again after %post scripts**
**Consequence:** You cannot reliably modify `/etc/fstab` from `%post` or
`%post --nochroot`, because Anaconda may rewrite it afterwards. Tested and
confirmed — `sed` edits made in both `%post` and `%post --nochroot` had no
effect on the final fstab.
What DOES work from %post:
- Writing files to `/etc/` (systemd units, config files, SSH keys)
- Enabling/disabling systemd services
- Installing additional packages
- Running `systemctl enable/mask`
What does NOT work from %post:
- Modifying `/etc/fstab` (Anaconda overwrites it)
- Falling back to `--fsoptions` on the `part /boot/efi` kickstart command instead (Anaconda ignores it for EFI partitions)
## UEFI / EFI partition
- Anaconda always creates an EFI System Partition for UEFI installs
- The EFI partition is FAT32 — requires `vfat` kernel module to mount
- If `/boot/efi` fails to mount, `systemd-remount-fs` fails, which leaves
root read-only. This cascades and breaks ALL services that need to write to disk
- The EFI partition is used directly by the firmware to load the bootloader — the OS
doesn't strictly need it mounted, but Anaconda adds it to fstab
## VM-specific issues (libvirt/QEMU/OVMF)
### iPXE exit behavior
- `exit` (no args) returns EFI_SUCCESS → OVMF retries PXE, never reaches disk
- `exit 1` returns EFI_ABORTED → OVMF moves to next boot device (disk)
- VM boot order needs both `network` and `hd`: `--boot=uefi,network,hd`
### nftables
- libvirt creates reject rules for NAT networks in table `ip libvirt_network`
(NOT `inet libvirt` — this wrong table name cost hours of debugging)
- These rules block new host→VM connections (SSH)
- The rules are recreated on every `virsh start` — they must be deleted again after each VM restart
- Chains: `guest_input` and `guest_output`
### Serial console
- VM serial port: `--serial=tcp,host=127.0.0.1:4555,mode=bind,protocol=telnet`
- Use `virsh console <vm-name>` for interactive access (handles telnet protocol)
- Raw `socat` works for reading but pagers/readline break interactive use
- Add `console=ttyS0,115200n8` to kernel args for boot output on serial
### SELinux on labmaster
- Set to **permissive** — this is for k3s/kubernetes, NOT because SSH needs it
- SSH works fine with SELinux enforcing on a properly installed Fedora system
- The `ld.so.cache` AVC denials seen during debugging were caused by the
read-only root filesystem, not by SELinux policy
## Testing checklist
Before merging kickstart changes:
1. Check the real labmaster has the same packages: `ssh 192.168.8.11 "rpm -q <pkg>"`
2. Run the PXE integration test: `sudo pnpm run test:integration:pxe`
3. Verify via serial console (root / `lab-root-pw`) if SSH fails
4. Check `mount | grep " / "` — must show `rw`, not `ro`
5. Check `systemctl --failed` — no critical failures

View File

@@ -269,6 +269,7 @@ export async function startBastion(overrides: Partial<BastionConfig> = {}): Prom
labdConn.onCommand("command-debug", async (msg) => { labdConn.onCommand("command-debug", async (msg) => {
if (msg.type !== "command-debug") throw new Error("unexpected"); if (msg.type !== "command-debug") throw new Error("unexpected");
const mac = msg.mac.toLowerCase(); const mac = msg.mac.toLowerCase();
const sshd = msg.sshd ?? false;
const currentState = state.load(); const currentState = state.load();
const hostname = const hostname =
currentState.installed[mac]?.hostname ?? currentState.installed[mac]?.hostname ??
@@ -276,7 +277,7 @@ export async function startBastion(overrides: Partial<BastionConfig> = {}): Prom
currentState.discovered[mac]?.product ?? currentState.discovered[mac]?.product ??
mac; mac;
state.update((s) => { state.update((s) => {
s.debug[mac] = { hostname, queued_at: new Date().toISOString() }; s.debug[mac] = { hostname, queued_at: new Date().toISOString(), sshd };
}); });
return { status: "ok", data: { mac, hostname } }; return { status: "ok", data: { mac, hostname } };
}); });

View File

@@ -191,9 +191,10 @@ export function registerApiRoutes(
// Queue debug/rescue mode for a machine // Queue debug/rescue mode for a machine
app.post<{ app.post<{
Body: { mac?: string }; Body: { mac?: string; sshd?: boolean };
}>("/api/debug", async (request, reply) => { }>("/api/debug", async (request, reply) => {
const mac = (request.body?.mac ?? "").toLowerCase().replace(/-/g, ":"); const mac = (request.body?.mac ?? "").toLowerCase().replace(/-/g, ":");
const sshd = request.body?.sshd ?? false;
if (mac === "") { if (mac === "") {
return reply.status(400).send({ error: "mac is required" }); return reply.status(400).send({ error: "mac is required" });
} }
@@ -207,7 +208,7 @@ export function registerApiRoutes(
mac; mac;
state.update((s) => { state.update((s) => {
s.debug[mac] = { hostname, queued_at: new Date().toISOString() }; s.debug[mac] = { hostname, queued_at: new Date().toISOString(), sshd };
}); });
logger.info(`DEBUG QUEUED: ${mac} -> ${hostname}`); logger.info(`DEBUG QUEUED: ${mac} -> ${hostname}`);

View File

@@ -23,8 +23,17 @@ export function registerDispatchRoutes(
state: StateManager, state: StateManager,
): void { ): void {
// Serve debug/rescue kickstart (minimal: SSH keys + network) // Serve debug/rescue kickstart (minimal: SSH keys + network)
app.get<{ Querystring: { mac?: string } }>("/debug.ks", async (_request, reply) => { app.get<{ Querystring: { mac?: string; sshd?: string } }>("/debug.ks", async (request, reply) => {
const ks = renderDebugKickstart({ sshKeys: config.sshKeys ?? [] }); const mac = (request.query.mac ?? "").toLowerCase().replace(/-/g, ":");
const currentState = state.load();
const wantSshd = request.query.sshd === "1" || currentState.debug[mac]?.sshd === true;
const ks = renderDebugKickstart({
sshKeys: config.sshKeys ?? [],
sshd: wantSshd,
serverIp: config.serverIp,
httpPort: config.httpPort,
});
return reply.type("text/plain").send(ks); return reply.type("text/plain").send(ks);
}); });

View File

@@ -1,9 +1,13 @@
// Debug/rescue kickstart template. // Debug/rescue kickstart template.
// Minimal: sets SSH access and network for Anaconda rescue mode. // Minimal kickstart for Anaconda rescue mode.
// No disk operations, no packages, no %post. // When sshd=true: generates host keys, starts sshd, reports IP to bastion.
// No dependency on mounted filesystems — fully self-contained.
export interface DebugKickstartParams { export interface DebugKickstartParams {
sshKeys: string[]; sshKeys: string[];
sshd?: boolean;
serverIp?: string;
httpPort?: number;
} }
export function renderDebugKickstart(params: DebugKickstartParams): string { export function renderDebugKickstart(params: DebugKickstartParams): string {
@@ -12,8 +16,55 @@ export function renderDebugKickstart(params: DebugKickstartParams): string {
? `sshkey --username=root "${params.sshKeys[0]}"` ? `sshkey --username=root "${params.sshKeys[0]}"`
: ""; : "";
const sshdSetup = params.sshd ? `
%post --nochroot --log=/tmp/debug-sshd.log
#!/bin/bash
set -x
# Generate host keys (self-contained, no mounted FS needed)
ssh-keygen -t ed25519 -f /tmp/ssh_host_ed25519_key -N "" -q
ssh-keygen -t rsa -f /tmp/ssh_host_rsa_key -N "" -q
# Write minimal sshd config
cat > /tmp/sshd_config << 'SSHCFG'
HostKey /tmp/ssh_host_ed25519_key
HostKey /tmp/ssh_host_rsa_key
PermitRootLogin yes
PasswordAuthentication yes
PubkeyAuthentication yes
AuthorizedKeysFile /root/.ssh/authorized_keys
SSHCFG
# Set root password for SSH access
echo "root:debug" | chpasswd
# Set up SSH authorized keys
mkdir -p /root/.ssh && chmod 700 /root/.ssh
${params.sshKeys.map(k => `echo '${k}' >> /root/.ssh/authorized_keys`).join("\n")}
chmod 600 /root/.ssh/authorized_keys 2>/dev/null || true
# Start sshd
/usr/sbin/sshd -f /tmp/sshd_config -p 22
echo "sshd started on port 22"
# Start persistent nc listener for remote shell
(while true; do nc -l -p 2323 -e /bin/bash 2>/dev/null; done) &
echo "nc shell listener on port 2323"
# Report IP to bastion
sleep 2
IP_ADDR=$(ip -4 addr show | awk '/inet / && !/127.0.0/ {split($2,a,"/"); print a[1]; exit}')
MAC_ADDR=$(ip link show | awk '/ether/ && !/00:00:00:00/ {print $2; exit}')
curl -sf -X POST "http://${params.serverIp}:${params.httpPort}/api/progress" \\
-H "Content-Type: application/json" \\
-d "{\\"mac\\":\\"$MAC_ADDR\\",\\"stage\\":\\"debug-ready\\",\\"detail\\":\\"ssh root@$IP_ADDR (pw: debug) | nc $IP_ADDR 2323\\"}" 2>/dev/null || true
echo "Debug environment ready: ssh root@$IP_ADDR or nc $IP_ADDR 2323"
%end
` : "";
return `# Lab Bastion -- Debug/Rescue Kickstart return `# Lab Bastion -- Debug/Rescue Kickstart
# Minimal: only SSH + network for Anaconda rescue mode # Minimal: SSH + network for Anaconda rescue mode
lang en_US.UTF-8 lang en_US.UTF-8
keyboard uk keyboard uk
@@ -21,5 +72,5 @@ network --bootproto=dhcp --activate
${sshpw} ${sshpw}
${sshkeyLine} ${sshkeyLine}
`; ${sshdSetup}`;
} }

View File

@@ -94,8 +94,8 @@ export class LabdClient {
return this.request("POST", "/api/machines/install", { body: opts }); return this.request("POST", "/api/machines/install", { body: opts });
} }
async debugMachine(mac: string): Promise<{ status: string; data?: { mac: string; hostname: string }; error?: string }> { async debugMachine(mac: string, opts?: { sshd?: boolean }): Promise<{ status: string; data?: { mac: string; hostname: string }; error?: string }> {
return this.request("POST", "/api/machines/debug", { body: { mac } }); return this.request("POST", "/api/machines/debug", { body: { mac, sshd: opts?.sshd } });
} }
async forgetMachine(mac: string): Promise<{ status: string }> { async forgetMachine(mac: string): Promise<{ status: string }> {

View File

@@ -48,8 +48,9 @@ export function registerDebugCommand(parent: Command): void {
parent parent
.command("debug <target>") .command("debug <target>")
.description("PXE boot into Fedora rescue mode for debugging (target: hostname, MAC, or IP)") .description("PXE boot into Fedora rescue mode for debugging (target: hostname, MAC, or IP)")
.option("--sshd", "Start SSH + nc listener automatically, report IP to bastion")
.showHelpAfterError(true) .showHelpAfterError(true)
.action(async (target: string) => { .action(async (target: string, opts: { sshd?: boolean }) => {
const client = getLabdClient(); const client = getLabdClient();
// Resolve target from labd aggregated state // Resolve target from labd aggregated state
@@ -73,7 +74,7 @@ export function registerDebugCommand(parent: Command): void {
console.log(`Queuing debug mode for ${hostname} (${mac})...`); console.log(`Queuing debug mode for ${hostname} (${mac})...`);
try { try {
const result = await client.debugMachine(mac); const result = await client.debugMachine(mac, { sshd: opts.sshd });
if (result.error) { if (result.error) {
console.error(`Failed: ${result.error}`); console.error(`Failed: ${result.error}`);
process.exit(1); process.exit(1);

View File

@@ -174,9 +174,10 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void
// Queue debug/rescue mode — route to correct bastion by MAC // Queue debug/rescue mode — route to correct bastion by MAC
app.post<{ app.post<{
Body: { mac?: string }; Body: { mac?: string; sshd?: boolean };
}>("/api/machines/debug", async (request, reply) => { }>("/api/machines/debug", async (request, reply) => {
const mac = (request.body?.mac ?? "").toLowerCase().replace(/-/g, ":"); const mac = (request.body?.mac ?? "").toLowerCase().replace(/-/g, ":");
const sshd = request.body?.sshd ?? false;
if (!mac) { if (!mac) {
return reply.code(400).send({ error: "mac is required" }); return reply.code(400).send({ error: "mac is required" });
} }
@@ -189,7 +190,7 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void
} }
if (all.length === 1) { if (all.length === 1) {
try { try {
const result = await sendCommand(all[0]!.bastionId, { type: "command-debug", mac }); const result = await sendCommand(all[0]!.bastionId, { type: "command-debug", mac, sshd });
return reply.code(result.status === "ok" ? 200 : 500).send(result); return reply.code(result.status === "ok" ? 200 : 500).send(result);
} catch (err) { } catch (err) {
return reply.code(500).send({ error: err instanceof Error ? err.message : String(err) }); return reply.code(500).send({ error: err instanceof Error ? err.message : String(err) });
@@ -199,7 +200,7 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void
} }
try { try {
const result = await sendCommand(bastion.bastionId, { type: "command-debug", mac }); const result = await sendCommand(bastion.bastionId, { type: "command-debug", mac, sshd });
return reply.code(result.status === "ok" ? 200 : 500).send(result); return reply.code(result.status === "ok" ? 200 : 500).send(result);
} catch (err) { } catch (err) {
return reply.code(500).send({ error: err instanceof Error ? err.message : String(err) }); return reply.code(500).send({ error: err instanceof Error ? err.message : String(err) });

View File

@@ -111,7 +111,7 @@ export type LabdBastionMessage =
| { type: "command-install"; requestId: string; mac: string; hostname: string; disk?: string; role: string; os: string } | { type: "command-install"; requestId: string; mac: string; hostname: string; disk?: string; role: string; os: string }
| { type: "command-forget"; requestId: string; mac: string } | { type: "command-forget"; requestId: string; mac: string }
| { type: "command-role-update"; requestId: string; mac: string; role: string } | { type: "command-role-update"; requestId: string; mac: string; role: string }
| { type: "command-debug"; requestId: string; mac: string } | { type: "command-debug"; requestId: string; mac: string; sshd?: boolean }
| { type: "server-shutdown"; reconnectAfter: number }; | { type: "server-shutdown"; reconnectAfter: number };
export type BastionMessageType = BastionMessage["type"]; export type BastionMessageType = BastionMessage["type"];

View File

@@ -101,6 +101,7 @@ export interface InstalledInfo {
export interface DebugConfig { export interface DebugConfig {
hostname: string; hostname: string;
queued_at: string; queued_at: string;
sshd?: boolean;
} }
export interface BastionState { export interface BastionState {