diff --git a/bastion/completions/labctl.bash b/bastion/completions/labctl.bash index b27c835..21f615e 100644 --- a/bastion/completions/labctl.bash +++ b/bastion/completions/labctl.bash @@ -61,6 +61,9 @@ _labctl() { "provision reprovision") COMPREPLY=($(compgen -W "--role --os --disk -h --help" -- "$cur")) return ;; + "provision debug") + COMPREPLY=($(compgen -W "-h --help" -- "$cur")) + return ;; "provision forget") COMPREPLY=($(compgen -W "-h --help" -- "$cur")) return ;; @@ -95,7 +98,7 @@ _labctl() { COMPREPLY=($(compgen -W "bastion -h --help" -- "$cur")) return ;; "provision") - COMPREPLY=($(compgen -W "list install reprovision forget logs makeiso -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "list install reprovision debug forget logs makeiso -h --help" -- "$cur")) return ;; "config") COMPREPLY=($(compgen -W "list get set path -h --help" -- "$cur")) diff --git a/bastion/completions/labctl.fish b/bastion/completions/labctl.fish index 6736142..a1d4aab 100644 --- a/bastion/completions/labctl.fish +++ b/bastion/completions/labctl.fish @@ -122,6 +122,7 @@ complete -c labctl -n "__labctl_in_cmd init bastion standalone stop" -l dir -d ' complete -c labctl -n "__labctl_using_cmd provision" -a list -d 'List all known machines' complete -c labctl -n "__labctl_using_cmd provision" -a install -d 'Queue a discovered machine for OS installation' complete -c labctl -n "__labctl_using_cmd provision" -a reprovision -d 'Queue install + SSH reboot into PXE (target: hostname, MAC, or IP)' +complete -c labctl -n "__labctl_using_cmd provision" -a debug -d 'PXE boot into Fedora rescue mode for debugging (target: hostname, MAC, or IP)' complete -c labctl -n "__labctl_using_cmd provision" -a forget -d 'Remove a machine from bastion state' complete -c labctl -n "__labctl_using_cmd provision" -a logs -d 'Show provisioning logs for a machine (hostname, MAC, or IP)' complete -c labctl -n "__labctl_using_cmd provision" -a makeiso -d 'Generate a UEFI-bootable iPXE ISO for network 
provisioning' diff --git a/bastion/docs/kickstart-reference.md b/bastion/docs/kickstart-reference.md new file mode 100644 index 0000000..2bf687c --- /dev/null +++ b/bastion/docs/kickstart-reference.md @@ -0,0 +1,103 @@ +# Kickstart Reference — Lessons Learned + +This documents pitfalls discovered during PXE boot testing. Read before modifying +the kickstart template (`src/bastion/src/templates/install.ks.ts`). + +## Package requirements + +### `kernel-modules` is mandatory + +`@core` only installs `kernel-modules-core`, which lacks common modules like `vfat`, +`zram`, and many network/filesystem drivers. Without `kernel-modules`: + +- `/boot/efi` (FAT32) cannot mount → `systemd-remount-fs` fails → **root stays + read-only** → sshd-keygen can't write host keys → SSH unreachable +- `zram-generator` fails → can trigger emergency mode + +**Always include `kernel-modules` in %packages.** This matches what the real +labmaster (192.168.8.11) has installed. + +Regression introduced in commit `fac14b6` which removed `@server-product` +(that group pulled in `kernel-modules` via `fedora-release-server`). + +### `dosfstools` is needed + +Provides `mkfs.vfat` and ensures FAT filesystem support is available. The real +labmaster has it installed. + +### Verify against the real machine + +Before changing the package list, SSH to the labmaster and compare: +```bash +ssh 192.168.8.11 "rpm -q <package>" +``` + +## Anaconda %post execution order + +This is critical and not well documented: + +1. `%pre` scripts run +2. Disk partitioning and formatting +3. Package installation +4. **Anaconda writes system config (fstab, hostname, etc.)** +5. `%post` scripts run (in chroot of installed system) +6. `%post --nochroot` scripts run +7. **Anaconda MAY overwrite fstab again after %post scripts** + +**Consequence:** You cannot reliably modify `/etc/fstab` from `%post` or +`%post --nochroot`. Anaconda overwrites it. 
Tested and confirmed — both +`sed` in %post and %post --nochroot had no effect on the final fstab. + +What DOES work from %post: +- Writing files to `/etc/` (systemd units, config files, SSH keys) +- Enabling/disabling systemd services +- Installing additional packages +- Running `systemctl enable/mask` + +What does NOT work from %post: +- Modifying `/etc/fstab` (Anaconda overwrites it) +- `--fsoptions` on `part /boot/efi` (Anaconda ignores it for EFI partitions) + +## UEFI / EFI partition + +- Anaconda always creates an EFI System Partition for UEFI installs +- The EFI partition is FAT32 — requires `vfat` kernel module to mount +- If `/boot/efi` fails to mount, `systemd-remount-fs` fails, which leaves + root as read-only. This cascades to break ALL services that need to write +- The EFI partition is used by firmware directly for bootloader — the OS + doesn't strictly need it mounted, but Anaconda adds it to fstab + +## VM-specific issues (libvirt/QEMU/OVMF) + +### iPXE exit behavior +- `exit` (no args) returns EFI_SUCCESS → OVMF retries PXE, never reaches disk +- `exit 1` returns EFI_ABORTED → OVMF moves to next boot device (disk) +- VM boot order needs both `network` and `hd`: `--boot=uefi,network,hd` + +### nftables +- libvirt creates reject rules for NAT networks in table `ip libvirt_network` + (NOT `inet libvirt` — this wrong table name cost hours of debugging) +- These rules block new host→VM connections (SSH) +- Rules are recreated on every `virsh start` — must delete after each VM restart +- Chains: `guest_input` and `guest_output` + +### Serial console +- VM serial port: `--serial=tcp,host=127.0.0.1:4555,mode=bind,protocol=telnet` +- Use `virsh console <domain>` for interactive access (handles telnet protocol) +- Raw `socat` works for reading but pagers/readline break interactive use +- Add `console=ttyS0,115200n8` to kernel args for boot output on serial + +### SELinux on labmaster +- Set to **permissive** — this is for k3s/kubernetes, NOT because SSH needs it +- 
SSH works fine with SELinux enforcing on a properly installed Fedora system +- The `ld.so.cache` AVC denials seen during debugging were caused by the + read-only root filesystem, not by SELinux policy + +## Testing checklist + +Before merging kickstart changes: +1. Check the real labmaster has the same packages: `ssh 192.168.8.11 "rpm -q <package>"` +2. Run the PXE integration test: `sudo pnpm run test:integration:pxe` +3. Verify via serial console (root / `lab-root-pw`) if SSH fails +4. Check `mount | grep " / "` — must show `rw`, not `ro` +5. Check `systemctl --failed` — no critical failures diff --git a/bastion/src/bastion/src/main.ts b/bastion/src/bastion/src/main.ts index 03a49bf..fe3a3ac 100644 --- a/bastion/src/bastion/src/main.ts +++ b/bastion/src/bastion/src/main.ts @@ -269,6 +269,7 @@ export async function startBastion(overrides: Partial = {}): Prom labdConn.onCommand("command-debug", async (msg) => { if (msg.type !== "command-debug") throw new Error("unexpected"); const mac = msg.mac.toLowerCase(); + const sshd = msg.sshd ?? false; const currentState = state.load(); const hostname = currentState.installed[mac]?.hostname ?? @@ -276,7 +277,7 @@ export async function startBastion(overrides: Partial = {}): Prom currentState.discovered[mac]?.product ?? mac; state.update((s) => { - s.debug[mac] = { hostname, queued_at: new Date().toISOString() }; + s.debug[mac] = { hostname, queued_at: new Date().toISOString(), sshd }; }); return { status: "ok", data: { mac, hostname } }; }); diff --git a/bastion/src/bastion/src/routes/api.ts b/bastion/src/bastion/src/routes/api.ts index 75a821a..5718357 100644 --- a/bastion/src/bastion/src/routes/api.ts +++ b/bastion/src/bastion/src/routes/api.ts @@ -191,9 +191,10 @@ export function registerApiRoutes( // Queue debug/rescue mode for a machine app.post<{ - Body: { mac?: string }; + Body: { mac?: string; sshd?: boolean }; }>("/api/debug", async (request, reply) => { const mac = (request.body?.mac ?? 
"").toLowerCase().replace(/-/g, ":"); + const sshd = request.body?.sshd ?? false; if (mac === "") { return reply.status(400).send({ error: "mac is required" }); } @@ -207,7 +208,7 @@ export function registerApiRoutes( mac; state.update((s) => { - s.debug[mac] = { hostname, queued_at: new Date().toISOString() }; + s.debug[mac] = { hostname, queued_at: new Date().toISOString(), sshd }; }); logger.info(`DEBUG QUEUED: ${mac} -> ${hostname}`); diff --git a/bastion/src/bastion/src/routes/dispatch.ts b/bastion/src/bastion/src/routes/dispatch.ts index c9df55c..5361d0f 100644 --- a/bastion/src/bastion/src/routes/dispatch.ts +++ b/bastion/src/bastion/src/routes/dispatch.ts @@ -23,8 +23,17 @@ export function registerDispatchRoutes( state: StateManager, ): void { // Serve debug/rescue kickstart (minimal: SSH keys + network) - app.get<{ Querystring: { mac?: string } }>("/debug.ks", async (_request, reply) => { - const ks = renderDebugKickstart({ sshKeys: config.sshKeys ?? [] }); + app.get<{ Querystring: { mac?: string; sshd?: string } }>("/debug.ks", async (request, reply) => { + const mac = (request.query.mac ?? "").toLowerCase().replace(/-/g, ":"); + const currentState = state.load(); + const wantSshd = request.query.sshd === "1" || currentState.debug[mac]?.sshd === true; + + const ks = renderDebugKickstart({ + sshKeys: config.sshKeys ?? [], + sshd: wantSshd, + serverIp: config.serverIp, + httpPort: config.httpPort, + }); return reply.type("text/plain").send(ks); }); diff --git a/bastion/src/bastion/src/templates/debug.ks.ts b/bastion/src/bastion/src/templates/debug.ks.ts index 270fa34..27b0b4b 100644 --- a/bastion/src/bastion/src/templates/debug.ks.ts +++ b/bastion/src/bastion/src/templates/debug.ks.ts @@ -1,9 +1,13 @@ // Debug/rescue kickstart template. -// Minimal: sets SSH access and network for Anaconda rescue mode. -// No disk operations, no packages, no %post. +// Minimal kickstart for Anaconda rescue mode. 
+// When sshd=true: generates host keys, starts sshd, reports IP to bastion. +// No dependency on mounted filesystems — fully self-contained. export interface DebugKickstartParams { sshKeys: string[]; + sshd?: boolean; + serverIp?: string; + httpPort?: number; } export function renderDebugKickstart(params: DebugKickstartParams): string { @@ -12,8 +16,55 @@ export function renderDebugKickstart(params: DebugKickstartParams): string { ? `sshkey --username=root "${params.sshKeys[0]}"` : ""; + const sshdSetup = params.sshd ? ` +%post --nochroot --log=/tmp/debug-sshd.log +#!/bin/bash +set -x + +# Generate host keys (self-contained, no mounted FS needed) +ssh-keygen -t ed25519 -f /tmp/ssh_host_ed25519_key -N "" -q +ssh-keygen -t rsa -f /tmp/ssh_host_rsa_key -N "" -q + +# Write minimal sshd config +cat > /tmp/sshd_config << 'SSHCFG' +HostKey /tmp/ssh_host_ed25519_key +HostKey /tmp/ssh_host_rsa_key +PermitRootLogin yes +PasswordAuthentication yes +PubkeyAuthentication yes +AuthorizedKeysFile /root/.ssh/authorized_keys +SSHCFG + +# Set root password for SSH access +echo "root:debug" | chpasswd + +# Set up SSH authorized keys +mkdir -p /root/.ssh && chmod 700 /root/.ssh +${params.sshKeys.map(k => `echo '${k}' >> /root/.ssh/authorized_keys`).join("\n")} +chmod 600 /root/.ssh/authorized_keys 2>/dev/null || true + +# Start sshd +/usr/sbin/sshd -f /tmp/sshd_config -p 22 +echo "sshd started on port 22" + +# Start persistent nc listener for remote shell +(while true; do nc -l -p 2323 -e /bin/bash 2>/dev/null; done) & +echo "nc shell listener on port 2323" + +# Report IP to bastion +sleep 2 +IP_ADDR=$(ip -4 addr show | awk '/inet / && !/127.0.0/ {split($2,a,"/"); print a[1]; exit}') +MAC_ADDR=$(ip link show | awk '/ether/ && !/00:00:00:00/ {print $2; exit}') +curl -sf -X POST "http://${params.serverIp}:${params.httpPort}/api/progress" \\ + -H "Content-Type: application/json" \\ + -d "{\\"mac\\":\\"$MAC_ADDR\\",\\"stage\\":\\"debug-ready\\",\\"detail\\":\\"ssh root@$IP_ADDR (pw: 
debug) | nc $IP_ADDR 2323\\"}" 2>/dev/null || true + +echo "Debug environment ready: ssh root@$IP_ADDR or nc $IP_ADDR 2323" +%end +` : ""; + return `# Lab Bastion -- Debug/Rescue Kickstart -# Minimal: only SSH + network for Anaconda rescue mode +# Minimal: SSH + network for Anaconda rescue mode lang en_US.UTF-8 keyboard uk @@ -21,5 +72,5 @@ network --bootproto=dhcp --activate ${sshpw} ${sshkeyLine} -`; +${sshdSetup}`; } diff --git a/bastion/src/cli/src/api/client.ts b/bastion/src/cli/src/api/client.ts index 5ec68cf..c7bfaa0 100644 --- a/bastion/src/cli/src/api/client.ts +++ b/bastion/src/cli/src/api/client.ts @@ -94,8 +94,8 @@ export class LabdClient { return this.request("POST", "/api/machines/install", { body: opts }); } - async debugMachine(mac: string): Promise<{ status: string; data?: { mac: string; hostname: string }; error?: string }> { - return this.request("POST", "/api/machines/debug", { body: { mac } }); + async debugMachine(mac: string, opts?: { sshd?: boolean }): Promise<{ status: string; data?: { mac: string; hostname: string }; error?: string }> { + return this.request("POST", "/api/machines/debug", { body: { mac, sshd: opts?.sshd } }); } async forgetMachine(mac: string): Promise<{ status: string }> { diff --git a/bastion/src/cli/src/commands/debug.ts b/bastion/src/cli/src/commands/debug.ts index 78b3f6c..49e2847 100644 --- a/bastion/src/cli/src/commands/debug.ts +++ b/bastion/src/cli/src/commands/debug.ts @@ -48,8 +48,9 @@ export function registerDebugCommand(parent: Command): void { parent .command("debug ") .description("PXE boot into Fedora rescue mode for debugging (target: hostname, MAC, or IP)") + .option("--sshd", "Start SSH + nc listener automatically, report IP to bastion") .showHelpAfterError(true) - .action(async (target: string) => { + .action(async (target: string, opts: { sshd?: boolean }) => { const client = getLabdClient(); // Resolve target from labd aggregated state @@ -73,7 +74,7 @@ export function registerDebugCommand(parent: 
Command): void { console.log(`Queuing debug mode for ${hostname} (${mac})...`); try { - const result = await client.debugMachine(mac); + const result = await client.debugMachine(mac, { sshd: opts.sshd === true }); if (result.error) { console.error(`Failed: ${result.error}`); process.exit(1); diff --git a/bastion/src/labd/src/routes/bastions.ts b/bastion/src/labd/src/routes/bastions.ts index 9372dae..ea694cc 100644 --- a/bastion/src/labd/src/routes/bastions.ts +++ b/bastion/src/labd/src/routes/bastions.ts @@ -174,9 +174,10 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void // Queue debug/rescue mode — route to correct bastion by MAC app.post<{ - Body: { mac?: string }; + Body: { mac?: string; sshd?: boolean }; }>("/api/machines/debug", async (request, reply) => { const mac = (request.body?.mac ?? "").toLowerCase().replace(/-/g, ":"); + const sshd = request.body?.sshd ?? false; if (!mac) { return reply.code(400).send({ error: "mac is required" }); } @@ -189,7 +190,7 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void } if (all.length === 1) { try { - const result = await sendCommand(all[0]!.bastionId, { type: "command-debug", mac }); + const result = await sendCommand(all[0]!.bastionId, { type: "command-debug", mac, sshd }); return reply.code(result.status === "ok" ? 200 : 500).send(result); } catch (err) { return reply.code(500).send({ error: err instanceof Error ? err.message : String(err) }); @@ -199,7 +200,7 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void } try { - const result = await sendCommand(bastion.bastionId, { type: "command-debug", mac }); + const result = await sendCommand(bastion.bastionId, { type: "command-debug", mac, sshd }); return reply.code(result.status === "ok" ? 200 : 500).send(result); } catch (err) { return reply.code(500).send({ error: err instanceof Error ? 
err.message : String(err) }); diff --git a/bastion/src/shared/src/protocol/index.ts b/bastion/src/shared/src/protocol/index.ts index 231d84d..88dffbd 100644 --- a/bastion/src/shared/src/protocol/index.ts +++ b/bastion/src/shared/src/protocol/index.ts @@ -111,7 +111,7 @@ export type LabdBastionMessage = | { type: "command-install"; requestId: string; mac: string; hostname: string; disk?: string; role: string; os: string } | { type: "command-forget"; requestId: string; mac: string } | { type: "command-role-update"; requestId: string; mac: string; role: string } - | { type: "command-debug"; requestId: string; mac: string } + | { type: "command-debug"; requestId: string; mac: string; sshd?: boolean } | { type: "server-shutdown"; reconnectAfter: number }; export type BastionMessageType = BastionMessage["type"]; diff --git a/bastion/src/shared/src/types/state.ts b/bastion/src/shared/src/types/state.ts index 382d7d5..a569cfa 100644 --- a/bastion/src/shared/src/types/state.ts +++ b/bastion/src/shared/src/types/state.ts @@ -101,6 +101,7 @@ export interface InstalledInfo { export interface DebugConfig { hostname: string; queued_at: string; + sshd?: boolean; } export interface BastionState {