diff --git a/bastion/completions/labctl.bash b/bastion/completions/labctl.bash index f7d4743..86a58fc 100644 --- a/bastion/completions/labctl.bash +++ b/bastion/completions/labctl.bash @@ -29,43 +29,49 @@ _labctl() { COMPREPLY=($(compgen -W "--dir -h --help" -- "$cur")) return ;; "init bastion standalone status") - COMPREPLY=($(compgen -W "--dir --port -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "-h --help" -- "$cur")) return ;; "init bastion standalone") COMPREPLY=($(compgen -W "start stop status -h --help" -- "$cur")) return ;; "app labcontroller deploy") - COMPREPLY=($(compgen -W "--user --port --crdb-replicas -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "--user --crdb-replicas -h --help" -- "$cur")) return ;; "app labcontroller status") - COMPREPLY=($(compgen -W "--user --port -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "--user -h --help" -- "$cur")) return ;; "app k3s install") - COMPREPLY=($(compgen -W "--role --user --port --k3s-server --k3s-token -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "--role --user --k3s-server --k3s-token -h --help" -- "$cur")) return ;; "app k3s health") - COMPREPLY=($(compgen -W "--user --port -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "--user -h --help" -- "$cur")) return ;; "app k3s list") - COMPREPLY=($(compgen -W "--user --port -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "--user -h --help" -- "$cur")) return ;; "init bastion") COMPREPLY=($(compgen -W "standalone -h --help" -- "$cur")) return ;; "provision list") - COMPREPLY=($(compgen -W "--port -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "-h --help" -- "$cur")) return ;; "provision install") - COMPREPLY=($(compgen -W "--role --os --disk --port -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "--role --os --disk -h --help" -- "$cur")) return ;; "provision reprovision") - COMPREPLY=($(compgen -W "--role --os --disk --port -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "--role --os --disk -h --help" -- "$cur")) + return ;; + "provision debug") 
+ COMPREPLY=($(compgen -W "--pxe-boot -h --help" -- "$cur")) return ;; "provision forget") - COMPREPLY=($(compgen -W "--port -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "-h --help" -- "$cur")) return ;; "provision logs") - COMPREPLY=($(compgen -W "-f --follow --port -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "-f --follow -h --help" -- "$cur")) + return ;; + "provision makeiso") + COMPREPLY=($(compgen -W "--arch --local --out -h --help" -- "$cur")) return ;; "config list") COMPREPLY=($(compgen -W "-h --help" -- "$cur")) @@ -92,7 +98,7 @@ _labctl() { COMPREPLY=($(compgen -W "bastion -h --help" -- "$cur")) return ;; "provision") - COMPREPLY=($(compgen -W "list install reprovision forget logs -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "list install reprovision debug forget logs makeiso -h --help" -- "$cur")) return ;; "config") COMPREPLY=($(compgen -W "list get set path -h --help" -- "$cur")) diff --git a/bastion/completions/labctl.fish b/bastion/completions/labctl.fish index 832ad8e..a63ae32 100644 --- a/bastion/completions/labctl.fish +++ b/bastion/completions/labctl.fish @@ -118,38 +118,35 @@ complete -c labctl -n "__labctl_in_cmd init bastion standalone start" -l foregro # init bastion standalone stop options complete -c labctl -n "__labctl_in_cmd init bastion standalone stop" -l dir -d 'Bastion data directory' -x -# init bastion standalone status options -complete -c labctl -n "__labctl_in_cmd init bastion standalone status" -l dir -d 'Bastion data directory' -x -complete -c labctl -n "__labctl_in_cmd init bastion standalone status" -l port -d 'Bastion HTTP port' -x - # provision subcommands complete -c labctl -n "__labctl_using_cmd provision" -a list -d 'List all known machines' complete -c labctl -n "__labctl_using_cmd provision" -a install -d 'Queue a discovered machine for OS installation' complete -c labctl -n "__labctl_using_cmd provision" -a reprovision -d 'Queue install + SSH reboot into PXE (target: hostname, MAC, or IP)' +complete -c 
labctl -n "__labctl_using_cmd provision" -a debug -d 'PXE boot into Fedora rescue mode for debugging (target: hostname, MAC, or IP)' complete -c labctl -n "__labctl_using_cmd provision" -a forget -d 'Remove a machine from bastion state' complete -c labctl -n "__labctl_using_cmd provision" -a logs -d 'Show provisioning logs for a machine (hostname, MAC, or IP)' - -# provision list options -complete -c labctl -n "__labctl_in_cmd provision list" -l port -d 'Bastion HTTP port' -x +complete -c labctl -n "__labctl_using_cmd provision" -a makeiso -d 'Generate a UEFI-bootable iPXE ISO for network provisioning' # provision install options complete -c labctl -n "__labctl_in_cmd provision install" -l role -d 'Machine role (see below)' -xa 'vanilla worker infra labcontroller' complete -c labctl -n "__labctl_in_cmd provision install" -l os -d 'Operating system' -xa 'fedora-43 ubuntu-26.04' complete -c labctl -n "__labctl_in_cmd provision install" -l disk -d 'Target disk device (auto-detect if omitted)' -x -complete -c labctl -n "__labctl_in_cmd provision install" -l port -d 'Bastion HTTP port' -x # provision reprovision options complete -c labctl -n "__labctl_in_cmd provision reprovision" -l role -d 'Machine role (see below)' -xa 'vanilla worker infra labcontroller' complete -c labctl -n "__labctl_in_cmd provision reprovision" -l os -d 'Operating system' -xa 'fedora-43 ubuntu-26.04' complete -c labctl -n "__labctl_in_cmd provision reprovision" -l disk -d 'Target disk device (auto-detect if omitted)' -x -complete -c labctl -n "__labctl_in_cmd provision reprovision" -l port -d 'Bastion HTTP port' -x -# provision forget options -complete -c labctl -n "__labctl_in_cmd provision forget" -l port -d 'Bastion HTTP port' -x +# provision debug options +complete -c labctl -n "__labctl_in_cmd provision debug" -l pxe-boot -d 'Boot installed system via PXE (kernel+initrd from network, root from NVMe)' # provision logs options -complete -c labctl -n "__labctl_in_cmd provision logs" -s f -l 
follow -d 'Follow logs in real-time (SSE stream)' -complete -c labctl -n "__labctl_in_cmd provision logs" -l port -d 'Bastion HTTP port' -x +complete -c labctl -n "__labctl_in_cmd provision logs" -s f -l follow -d 'Follow log output in real-time' + +# provision makeiso options +complete -c labctl -n "__labctl_in_cmd provision makeiso" -l arch -d 'Target architecture(s)' -xa 'x86_64 aarch64' +complete -c labctl -n "__labctl_in_cmd provision makeiso" -l local -d 'Build ISO locally instead of using bastion-hosted URL' +complete -c labctl -n "__labctl_in_cmd provision makeiso" -l out -d 'Output path for local ISO build' -x # config subcommands complete -c labctl -n "__labctl_using_cmd config" -a list -d 'Show all configuration values' @@ -173,12 +170,10 @@ complete -c labctl -n "__labctl_using_cmd app labcontroller" -a status -d 'Check # app labcontroller deploy options complete -c labctl -n "__labctl_in_cmd app labcontroller deploy" -l user -d 'SSH user' -x -complete -c labctl -n "__labctl_in_cmd app labcontroller deploy" -l port -d 'Bastion HTTP port' -x complete -c labctl -n "__labctl_in_cmd app labcontroller deploy" -l crdb-replicas -d 'CockroachDB replicas' -x # app labcontroller status options complete -c labctl -n "__labctl_in_cmd app labcontroller status" -l user -d 'SSH user' -x -complete -c labctl -n "__labctl_in_cmd app labcontroller status" -l port -d 'Bastion HTTP port' -x # app k3s subcommands complete -c labctl -n "__labctl_using_cmd app k3s" -a install -d 'Install k3s on a target machine (hostname, IP, or MAC)' @@ -188,15 +183,12 @@ complete -c labctl -n "__labctl_using_cmd app k3s" -a list -d 'List installed ma # app k3s install options complete -c labctl -n "__labctl_in_cmd app k3s install" -l role -d 'k3s role: infra (server) or worker (agent)' -x complete -c labctl -n "__labctl_in_cmd app k3s install" -l user -d 'SSH user' -x -complete -c labctl -n "__labctl_in_cmd app k3s install" -l port -d 'Bastion HTTP port (for resolving target)' -x complete -c 
labctl -n "__labctl_in_cmd app k3s install" -l k3s-server -d 'k3s server URL (required for worker role)' -x complete -c labctl -n "__labctl_in_cmd app k3s install" -l k3s-token -d 'k3s join token (required for worker role)' -x # app k3s health options complete -c labctl -n "__labctl_in_cmd app k3s health" -l user -d 'SSH user' -x -complete -c labctl -n "__labctl_in_cmd app k3s health" -l port -d 'Bastion HTTP port' -x # app k3s list options complete -c labctl -n "__labctl_in_cmd app k3s list" -l user -d 'SSH user' -x -complete -c labctl -n "__labctl_in_cmd app k3s list" -l port -d 'Bastion HTTP port' -x diff --git a/bastion/docs/kickstart-reference.md b/bastion/docs/kickstart-reference.md new file mode 100644 index 0000000..2bf687c --- /dev/null +++ b/bastion/docs/kickstart-reference.md @@ -0,0 +1,103 @@ +# Kickstart Reference — Lessons Learned + +This documents pitfalls discovered during PXE boot testing. Read before modifying +the kickstart template (`src/bastion/src/templates/install.ks.ts`). + +## Package requirements + +### `kernel-modules` is mandatory + +`@core` only installs `kernel-modules-core`, which lacks common modules like `vfat`, +`zram`, and many network/filesystem drivers. Without `kernel-modules`: + +- `/boot/efi` (FAT32) cannot mount → `systemd-remount-fs` fails → **root stays + read-only** → sshd-keygen can't write host keys → SSH unreachable +- `zram-generator` fails → can trigger emergency mode + +**Always include `kernel-modules` in %packages.** This matches what the real +labmaster (192.168.8.11) has installed. + +Regression introduced in commit `fac14b6` which removed `@server-product` +(that group pulled in `kernel-modules` via `fedora-release-server`). + +### `dosfstools` is needed + +Provides `mkfs.vfat` and ensures FAT filesystem support is available. The real +labmaster has it installed. 
+ +### Verify against the real machine + +Before changing the package list, SSH to the labmaster and compare: +```bash +ssh 192.168.8.11 "rpm -q <package>" +``` + +## Anaconda %post execution order + +This is critical and not well documented: + +1. `%pre` scripts run +2. Disk partitioning and formatting +3. Package installation +4. **Anaconda writes system config (fstab, hostname, etc.)** +5. `%post` scripts run (in chroot of installed system) +6. `%post --nochroot` scripts run +7. **Anaconda MAY overwrite fstab again after %post scripts** + +**Consequence:** You cannot reliably modify `/etc/fstab` from `%post` or +`%post --nochroot`. Anaconda overwrites it. Tested and confirmed — both +`sed` in %post and %post --nochroot had no effect on the final fstab. + +What DOES work from %post: +- Writing files to `/etc/` (systemd units, config files, SSH keys) +- Enabling/disabling systemd services +- Installing additional packages +- Running `systemctl enable/mask` + +What does NOT work from %post: +- Modifying `/etc/fstab` (Anaconda overwrites it) +- `--fsoptions` on `part /boot/efi` (Anaconda ignores it for EFI partitions) + +## UEFI / EFI partition + +- Anaconda always creates an EFI System Partition for UEFI installs +- The EFI partition is FAT32 — requires `vfat` kernel module to mount +- If `/boot/efi` fails to mount, `systemd-remount-fs` fails, which leaves + root as read-only. 
This cascades to break ALL services that need to write +- The EFI partition is used by firmware directly for bootloader — the OS + doesn't strictly need it mounted, but Anaconda adds it to fstab + +## VM-specific issues (libvirt/QEMU/OVMF) + +### iPXE exit behavior +- `exit` (no args) returns EFI_SUCCESS → OVMF retries PXE, never reaches disk +- `exit 1` returns EFI_ABORTED → OVMF moves to next boot device (disk) +- VM boot order needs both `network` and `hd`: `--boot=uefi,network,hd` + +### nftables +- libvirt creates reject rules for NAT networks in table `ip libvirt_network` + (NOT `inet libvirt` — this wrong table name cost hours of debugging) +- These rules block new host→VM connections (SSH) +- Rules are recreated on every `virsh start` — must delete after each VM restart +- Chains: `guest_input` and `guest_output` + +### Serial console +- VM serial port: `--serial=tcp,host=127.0.0.1:4555,mode=bind,protocol=telnet` +- Use `virsh console <domain>` for interactive access (handles telnet protocol) +- Raw `socat` works for reading but pagers/readline break interactive use +- Add `console=ttyS0,115200n8` to kernel args for boot output on serial + +### SELinux on labmaster +- Set to **permissive** — this is for k3s/kubernetes, NOT because SSH needs it +- SSH works fine with SELinux enforcing on a properly installed Fedora system +- The `ld.so.cache` AVC denials seen during debugging were caused by the + read-only root filesystem, not by SELinux policy + +## Testing checklist + +Before merging kickstart changes: +1. Check the real labmaster has the same packages: `ssh 192.168.8.11 "rpm -q <package>"` +2. Run the PXE integration test: `sudo pnpm run test:integration:pxe` +3. Verify via serial console (root / `lab-root-pw`) if SSH fails +4. Check `mount | grep " / "` — must show `rw`, not `ro` +5. 
Check `systemctl --failed` — no critical failures diff --git a/bastion/scripts/deploy.sh b/bastion/scripts/deploy.sh new file mode 100644 index 0000000..86b6f26 --- /dev/null +++ b/bastion/scripts/deploy.sh @@ -0,0 +1,78 @@ +#!/bin/bash +# Deploy bastion + labd to k3s cluster and install labctl locally. +# Usage: ./scripts/deploy.sh [bastion|labd|labctl|all] +# +# Builds container images with existing build scripts, pushes to Gitea +# registry, restarts k3s pods, and builds/installs labctl RPM. +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" + +cd "$PROJECT_DIR" + +# Load .env if present +if [ -f .env ]; then + set -a; source .env; set +a +fi + +deploy_bastion() { + echo "=== Building & pushing bastion image ===" + bash scripts/build-bastion.sh --push latest + echo "" + echo "=== Restarting bastion pod ===" + kubectl rollout restart deployment/bastion -n lab-infra + kubectl rollout status deployment/bastion -n lab-infra --timeout=180s + echo "✓ Bastion deployed" +} + +deploy_labd() { + echo "=== Building & pushing labd image ===" + bash scripts/build-labd.sh --push latest + echo "" + echo "=== Restarting labd pod ===" + kubectl rollout restart deployment/labd -n lab-system + kubectl rollout status deployment/labd -n lab-system --timeout=180s + echo "✓ Labd deployed" +} + +# Build the labctl RPM and install it with rpm; if no x86_64 RPM is found in +# dist/, fall back to a /usr/local/bin/labctl wrapper that execs the built CLI. +deploy_labctl() { + echo "=== Building labctl RPM ===" + bash scripts/build-rpm.sh + echo "" + echo "=== Installing labctl ===" + # ls exits non-zero when nothing matches; with 'set -euo pipefail' that would + # abort the script right here, so tolerate it and let the -n check below decide. + RPM_FILE=$(ls dist/labctl-*.x86_64.rpm 2>/dev/null | head -1) || true + if [ -n "$RPM_FILE" ]; then + sudo rpm -U --force "$RPM_FILE" + echo "✓ labctl installed: $(labctl --version 2>/dev/null || echo 'installed')" + else + echo "WARNING: No RPM found, falling back to direct install" + pnpm build + sudo install -m 755 <(echo '#!/bin/bash'; echo "exec node $PROJECT_DIR/src/cli/dist/index.js \"\$@\"") /usr/local/bin/labctl + echo "✓ labctl installed (dev mode)" + fi +} + +case "${1:-all}" in + bastion) deploy_bastion 
;; + labd) deploy_labd ;; + labctl) deploy_labctl ;; + all) + deploy_bastion + echo "" + deploy_labd + echo "" + deploy_labctl + ;; + *) + echo "Usage: $0 [bastion|labd|labctl|all]" + exit 1 + ;; +esac + +echo "" +echo "=== Deploy complete ===" diff --git a/bastion/src/bastion/src/main.ts b/bastion/src/bastion/src/main.ts index 6b2a621..8c6d066 100644 --- a/bastion/src/bastion/src/main.ts +++ b/bastion/src/bastion/src/main.ts @@ -257,7 +257,7 @@ export async function startBastion(overrides: Partial = {}): Prom state.update((s) => { s.install_queue[msg.mac] = { hostname: msg.hostname, - disk: msg.disk ?? "/dev/sda", + disk: msg.disk ?? "", role: msg.role as import("@lab/shared").Role, os: msg.os as import("@lab/shared").OsId, queued_at: new Date().toISOString(), @@ -266,6 +266,22 @@ export async function startBastion(overrides: Partial = {}): Prom return { status: "ok", data: { mac: msg.mac, hostname: msg.hostname } }; }); + labdConn.onCommand("command-debug", async (msg) => { + if (msg.type !== "command-debug") throw new Error("unexpected"); + const mac = msg.mac.toLowerCase(); + const pxeBoot = msg.pxeBoot ?? false; + const currentState = state.load(); + const hostname = + currentState.installed[mac]?.hostname ?? + currentState.install_queue[mac]?.hostname ?? + currentState.discovered[mac]?.product ?? 
+ mac; + state.update((s) => { + s.debug[mac] = { hostname, queued_at: new Date().toISOString(), pxeBoot }; + }); + return { status: "ok", data: { mac, hostname } }; + }); + labdConn.onCommand("command-forget", async (msg) => { if (msg.type !== "command-forget") throw new Error("unexpected"); const mac = msg.mac.toLowerCase(); @@ -273,6 +289,7 @@ export async function startBastion(overrides: Partial = {}): Prom delete s.discovered[mac]; delete s.install_queue[mac]; delete s.installed[mac]; + delete s.debug[mac]; }); return { status: "ok", data: { mac } }; }); diff --git a/bastion/src/bastion/src/routes/api.ts b/bastion/src/bastion/src/routes/api.ts index 96e1e7f..b178b43 100644 --- a/bastion/src/bastion/src/routes/api.ts +++ b/bastion/src/bastion/src/routes/api.ts @@ -13,11 +13,13 @@ import { triggerPostProvisionK3s } from "../services/post-provision.js"; import { progressBus } from "../services/progress-events.js"; import type { ProgressEvent } from "../services/progress-events.js"; import type { InstallLogBuffer } from "../services/install-log.js"; +import type { SyslogListener } from "../services/syslog-listener.js"; export function registerApiRoutes( app: FastifyInstance, state: StateManager, installLog: InstallLogBuffer, + syslog: SyslogListener, ): void { // List all machines app.get("/api/machines", async (_request, reply) => { @@ -84,6 +86,11 @@ export function registerApiRoutes( const { mac: rawMac, stage, detail } = request.body ?? {}; const mac = (rawMac ?? "unknown").toLowerCase(); const stageName = stage ?? "unknown"; + + // Register IP → MAC for syslog routing + if (mac !== "unknown") { + syslog.registerIp(request.ip, mac); + } const detailStr = detail ?? 
""; const GREEN = "\x1b[0;32m"; @@ -189,6 +196,32 @@ export function registerApiRoutes( return reply.send({ status: "ok", lines: allLines.length }); }); + // Queue debug/rescue mode for a machine + app.post<{ + Body: { mac?: string; pxeBoot?: boolean }; + }>("/api/debug", async (request, reply) => { + const mac = (request.body?.mac ?? "").toLowerCase().replace(/-/g, ":"); + const pxeBoot = request.body?.pxeBoot ?? false; + if (mac === "") { + return reply.status(400).send({ error: "mac is required" }); + } + + // Look up hostname from installed or discovered state + const currentState = state.load(); + const hostname = + currentState.installed[mac]?.hostname ?? + currentState.install_queue[mac]?.hostname ?? + currentState.discovered[mac]?.product ?? + mac; + + state.update((s) => { + s.debug[mac] = { hostname, queued_at: new Date().toISOString(), pxeBoot }; + }); + + logger.info(`DEBUG QUEUED: ${mac} -> ${hostname}`); + return reply.send({ status: "ok", mac, hostname }); + }); + // Delete a machine from all state app.delete<{ Params: { mac: string }; @@ -213,6 +246,10 @@ export function registerApiRoutes( delete s.installed[mac]; found = true; } + if (s.debug[mac] !== undefined) { + delete s.debug[mac]; + found = true; + } }); if (!found) { diff --git a/bastion/src/bastion/src/routes/dispatch.ts b/bastion/src/bastion/src/routes/dispatch.ts index 54221fc..0ecc1c4 100644 --- a/bastion/src/bastion/src/routes/dispatch.ts +++ b/bastion/src/bastion/src/routes/dispatch.ts @@ -10,9 +10,12 @@ import type { StateManager } from "../services/state.js"; import { renderDiscoverIpxe, renderInstallIpxe, + renderDebugIpxe, + renderPxeBootDebugIpxe, renderLocalBootIpxe, } from "../templates/boot.ipxe.js"; import { renderUbuntuInstallIpxe } from "../templates/ubuntu-boot.ipxe.js"; +import { renderDebugKickstart } from "../templates/debug.ks.js"; import { logger } from "../services/logger.js"; export function registerDispatchRoutes( @@ -20,10 +23,76 @@ export function 
registerDispatchRoutes( config: BastionConfig, state: StateManager, ): void { + // Serve debug/rescue kickstart (minimal: SSH keys + network for inst.sshd) + app.get<{ Querystring: { mac?: string } }>("/debug.ks", async (_request, reply) => { + const ks = renderDebugKickstart({ + sshKeys: config.sshKeys ?? [], + serverIp: config.serverIp, + httpPort: config.httpPort, + }); + return reply.type("text/plain").send(ks); + }); + + // Shell script for manual debug setup (nc listener + IP reporting) + // Usage from rescue shell: curl http://bastion:port/debug-setup.sh | bash + app.get("/debug-setup.sh", async (_request, reply) => { + const script = `#!/bin/bash +# Lab Bastion debug setup — run from rescue shell +set -x + +IP_ADDR=$(ip -4 addr show | awk '/inet / && !/127.0.0/ {split($2,a,"/"); print a[1]; exit}') +MAC_ADDR=$(ip link show | awk '/ether/ && !/00:00:00:00/ {print $2; exit}') + +# Start persistent nc listener for remote shell +(while true; do nc -l -p 2323 -e /bin/bash 2>/dev/null; done) & +echo "nc shell listener on port 2323" + +# Report IP to bastion +curl -sf -X POST "http://${config.serverIp}:${config.httpPort}/api/progress" \\ + -H "Content-Type: application/json" \\ + -d "{\\"mac\\":\\"$MAC_ADDR\\",\\"stage\\":\\"debug-ready\\",\\"detail\\":\\"nc $IP_ADDR 2323\\"}" 2>/dev/null || true + +echo "" +echo "=== Debug environment ready ===" +echo " nc $IP_ADDR 2323 (remote shell)" +echo " ssh root@$IP_ADDR (password: debug)" +echo "===============================" +`; + return reply.type("text/plain").send(script); + }); + app.get<{ Querystring: { mac?: string } }>("/dispatch", async (request, reply) => { const mac = (request.query.mac ?? "").toLowerCase().replace(/-/g, ":"); const currentState = state.load(); + // Debug mode takes highest priority — auto-clear after serving once + const debugEntry = currentState.debug[mac]; + if (debugEntry) { + const hostname = debugEntry.hostname ?? 
"debug"; + state.update((s) => { delete s.debug[mac]; }); + + let script: string; + if (debugEntry.pxeBoot) { + logger.info(`PXE BOOT DEBUG: ${mac} -> ${hostname} (kernel+initrd from PXE, root from NVMe)`); + script = renderPxeBootDebugIpxe({ + mac, + hostname, + serverIp: config.serverIp, + httpPort: config.httpPort, + }); + } else { + logger.info(`DEBUG BOOT: ${mac} -> ${hostname} (rescue mode)`); + script = renderDebugIpxe({ + mac, + hostname, + serverIp: config.serverIp, + httpPort: config.httpPort, + fedoraMirror: config.fedoraMirror, + }); + } + return reply.type("text/plain").send(script); + } + const queueEntry = currentState.install_queue[mac]; if (queueEntry) { const hostname = queueEntry.hostname ?? "lab-node"; diff --git a/bastion/src/bastion/src/routes/kickstart.ts b/bastion/src/bastion/src/routes/kickstart.ts index bce0e04..49ca90a 100644 --- a/bastion/src/bastion/src/routes/kickstart.ts +++ b/bastion/src/bastion/src/routes/kickstart.ts @@ -5,6 +5,7 @@ import type { FastifyInstance } from "fastify"; import type { BastionConfig } from "@lab/shared"; import type { StateManager } from "../services/state.js"; +import type { SyslogListener } from "../services/syslog-listener.js"; import { generateInstallKickstart, generateDiscoverKickstart } from "../services/kickstart-generator.js"; import { renderUbuntuAutoinstall, renderUbuntuMetaData, type UbuntuAutoinstallParams } from "../templates/ubuntu-autoinstall.js"; @@ -12,6 +13,7 @@ export function registerKickstartRoutes( app: FastifyInstance, config: BastionConfig, state: StateManager, + syslog: SyslogListener, ): void { // Per-MAC install kickstart app.get<{ Querystring: { mac?: string } }>("/ks", async (request, reply) => { @@ -19,6 +21,11 @@ export function registerKickstartRoutes( const currentState = state.load(); const queueEntry = currentState.install_queue[mac]; + // Register IP → MAC so syslog listener can route Anaconda logs + if (mac) { + syslog.registerIp(request.ip, mac); + } + const ks = 
generateInstallKickstart(config, { hostname: queueEntry?.hostname ?? "lab-node", disk: queueEntry?.disk ?? "", diff --git a/bastion/src/bastion/src/server.ts b/bastion/src/bastion/src/server.ts index 8bdaf6d..9a2979a 100644 --- a/bastion/src/bastion/src/server.ts +++ b/bastion/src/bastion/src/server.ts @@ -43,8 +43,8 @@ export function createApp(config: BastionConfig): { app: ReturnType void; @@ -33,6 +34,7 @@ export class StateManager { discovered: parsed.discovered ?? {}, install_queue: parsed.install_queue ?? {}, installed: parsed.installed ?? {}, + debug: parsed.debug ?? {}, }; } catch { return { ...EMPTY_STATE }; diff --git a/bastion/src/bastion/src/services/syslog-listener.ts b/bastion/src/bastion/src/services/syslog-listener.ts index 07c384e..e5b1d5a 100644 --- a/bastion/src/bastion/src/services/syslog-listener.ts +++ b/bastion/src/bastion/src/services/syslog-listener.ts @@ -18,7 +18,7 @@ function parseSyslogLine(raw: string): { program: string; message: string } { // Try to extract program and message after the timestamp + hostname // RFC 3164: "Mon DD HH:MM:SS HOSTNAME PROGRAM[PID]: MESSAGE" const match = noPri.match(/^\w+\s+\d+\s+[\d:]+\s+\S+\s+(\S+?)(?:\[\d+\])?:\s*(.*)/); - if (match) { + if (match?.[1] && match[2] !== undefined) { return { program: match[1], message: match[2] }; } // Fallback: just return the whole line @@ -30,6 +30,8 @@ export class SyslogListener { private port: number; private installLog: InstallLogBuffer; private state: StateManager; + /** Explicit IP → MAC mapping registered from kickstart/progress requests. */ + private ipToMac = new Map(); constructor(port: number, installLog: InstallLogBuffer, state: StateManager) { this.port = port; @@ -37,14 +39,21 @@ export class SyslogListener { this.state = state; } - /** Resolve a source IP to a MAC address using the install queue. */ + /** Register an IP → MAC mapping (called when we learn a machine's IP). 
*/ + registerIp(ip: string, mac: string): void { + this.ipToMac.set(ip, mac.toLowerCase()); + } + + /** Resolve a source IP to a MAC address. */ private resolveIpToMac(ip: string): string | null { + // Check explicit mapping first (most reliable) + const explicit = this.ipToMac.get(ip); + if (explicit) return explicit; + const currentState = this.state.load(); // Check install queue — machines being installed have an IP from DHCP for (const [mac, entry] of Object.entries(currentState.install_queue)) { - // The progress callback sends IP in "complete" detail, but during install - // we need to match by what we know. Check if any progress mentions this IP. if (entry.progress_detail?.includes(ip)) return mac; } diff --git a/bastion/src/bastion/src/templates/boot.ipxe.ts b/bastion/src/bastion/src/templates/boot.ipxe.ts index 72f329f..95f36d2 100644 --- a/bastion/src/bastion/src/templates/boot.ipxe.ts +++ b/bastion/src/bastion/src/templates/boot.ipxe.ts @@ -42,7 +42,7 @@ echo Collecting hardware info... 
echo ============================================= echo -kernel http://${params.serverIp}:${params.httpPort}/vmlinuz inst.ks=http://${params.serverIp}:${params.httpPort}/discover.ks inst.stage2=${params.fedoraMirror} inst.text console=ttyS0,115200n8 console=tty0 +kernel http://${params.serverIp}:${params.httpPort}/vmlinuz inst.ks=http://${params.serverIp}:${params.httpPort}/discover.ks inst.stage2=${params.fedoraMirror} inst.text nomodeset initrd http://${params.serverIp}:${params.httpPort}/initrd.img boot `; @@ -69,7 +69,62 @@ echo MAC: ${params.mac} echo ============================================= echo -kernel http://${params.serverIp}:${params.httpPort}/vmlinuz inst.ks=http://${params.serverIp}:${params.httpPort}/ks?mac=${params.mac} inst.repo=${params.fedoraMirror} inst.text console=ttyS0,115200n8 console=tty0 +kernel http://${params.serverIp}:${params.httpPort}/vmlinuz inst.ks=http://${params.serverIp}:${params.httpPort}/ks?mac=${params.mac} inst.repo=${params.fedoraMirror} inst.text nomodeset +initrd http://${params.serverIp}:${params.httpPort}/initrd.img +boot +`; +} + +/** + * iPXE script for debug/rescue mode -- boots Fedora installer in rescue mode. + * Provides a shell with LVM tools, network, and SSH for inspecting installed systems. 
+ */ +export function renderDebugIpxe(params: { + mac: string; + hostname: string; + serverIp: string; + httpPort: number; + fedoraMirror: string; +}): string { + return `#!ipxe + +echo +echo ============================================= +echo Lab PXE Bastion - DEBUG/RESCUE MODE +echo Target: ${params.hostname} +echo MAC: ${params.mac} +echo ============================================= +echo + +kernel http://${params.serverIp}:${params.httpPort}/vmlinuz inst.rescue inst.text inst.sshd inst.ks=http://${params.serverIp}:${params.httpPort}/debug.ks?mac=${params.mac} inst.stage2=${params.fedoraMirror} +initrd http://${params.serverIp}:${params.httpPort}/initrd.img +boot +`; +} + +/** + * iPXE script for PXE-boot debug mode -- boots the installed system's root + * filesystem using the bastion's PXE kernel+initrd instead of local GRUB. + * Workaround for UEFI firmware bugs that make local disk boot slow. + */ +export function renderPxeBootDebugIpxe(params: { + mac: string; + hostname: string; + serverIp: string; + httpPort: number; +}): string { + return `#!ipxe + +echo +echo ============================================= +echo Lab PXE Bastion - PXE BOOT (debug) +echo Target: ${params.hostname} +echo MAC: ${params.mac} +echo Kernel+initrd from PXE, root from NVMe +echo ============================================= +echo + +kernel http://${params.serverIp}:${params.httpPort}/vmlinuz root=/dev/mapper/labvg-root ro rd.lvm.lv=labvg/root rd.lvm.lv=labvg/swap console=tty0 initrd http://${params.serverIp}:${params.httpPort}/initrd.img boot `; diff --git a/bastion/src/bastion/src/templates/debug.ks.ts b/bastion/src/bastion/src/templates/debug.ks.ts new file mode 100644 index 0000000..29a7e35 --- /dev/null +++ b/bastion/src/bastion/src/templates/debug.ks.ts @@ -0,0 +1,33 @@ +// Debug/rescue kickstart template. +// Minimal kickstart for Anaconda rescue mode. +// +// SSH access: Anaconda's inst.sshd starts sshd automatically. 
+// The sshpw directive sets the password, sshkey adds authorized keys. +// %pre/%post do NOT run in rescue mode — don't put setup code there. + +export interface DebugKickstartParams { + sshKeys: string[]; + serverIp?: string; + httpPort?: number; +} + +export function renderDebugKickstart(params: DebugKickstartParams): string { + const sshkeyLine = params.sshKeys.length > 0 + ? `sshkey --username=root "${params.sshKeys[0]}"` + : ""; + + return `# Lab Bastion -- Debug/Rescue Kickstart +# Minimal: SSH + network for Anaconda rescue mode +# +# SSH is started by Anaconda (inst.sshd kernel param). +# Password: debug | SSH keys from bastion config. +# %pre/%post do NOT run in rescue mode. + +lang en_US.UTF-8 +keyboard uk +network --bootproto=dhcp --activate + +sshpw --username=root --plaintext debug +${sshkeyLine} +`; +} diff --git a/bastion/src/bastion/src/templates/install.ks.ts b/bastion/src/bastion/src/templates/install.ks.ts index 019bbb8..94af999 100644 --- a/bastion/src/bastion/src/templates/install.ks.ts +++ b/bastion/src/bastion/src/templates/install.ks.ts @@ -88,8 +88,23 @@ chmod 440 /etc/sudoers.d/${adminUser}`; const diskLine = disk ? 
`DISK="${disk}"` : `DISK="" -for d in /dev/nvme0n1 /dev/sda /dev/vda; do - [ -b "$d" ] && { DISK="$(basename $d)"; break; } +# Wait up to 10s for NVMe/SCSI disks to appear (they init async in initrd) +for _wait in $(seq 1 10); do + for d in /dev/nvme0n1 /dev/nvme1n1 /dev/sda /dev/sdb /dev/vda; do + [ -b "$d" ] || continue + _bname=$(basename "$d") + # Skip removable disks (USB, CD-ROM, JetKVM virtual media) + [ -f "/sys/block/$_bname/removable" ] && [ "$(cat /sys/block/$_bname/removable)" = "1" ] && continue + # Skip USB-attached disks (JetKVM virtual media shows as SCSI over USB) + _transport=$(readlink -f /sys/block/$_bname/device 2>/dev/null || echo "") + echo "$_transport" | grep -q "usb" && continue + # Skip disks smaller than 20GB (likely USB sticks) + _size=$(cat /sys/block/$_bname/size 2>/dev/null || echo 0) + [ "$_size" -lt 41943040 ] && continue + DISK="$_bname" + break 2 + done + sleep 1 done [ -z "$DISK" ] && { echo "ERROR: no disk found"; exit 1; }`; @@ -119,7 +134,7 @@ network --bootproto=dhcp --activate --hostname=${fqdn} ${auth} ${userDirective} -bootloader --append="console=tty0 console=ttyS0,115200n8" +bootloader --append="console=tty0" logging --host=${serverIp} --port=${syslogPort} @@ -306,56 +321,27 @@ bastion_progress() { -d "{\\"mac\\":\\"$mac\\",\\"stage\\":\\"$stage\\",\\"detail\\":\\"$detail\\"}" 2>/dev/null || true } -# Send log lines to bastion -bastion_log() { - local line="$1" - local mac=$(ip link show | awk '/ether/ && !/00:00:00:00/ {print $2; exit}') - curl -sf -X POST "http://${serverIp}:${httpPort}/api/log" \\ - -H "Content-Type: application/json" \\ - -d "{\\"mac\\":\\"$mac\\",\\"line\\":\\"$(echo "$line" | sed 's/\\\\/\\\\\\\\/g; s/"/\\\\"/g')\\"}\" \\ - --connect-timeout 5 --max-time 10 2>/dev/null || true -} - -# Send an error stage to bastion -bastion_error() { - local detail="$1" - bastion_progress "error" "$detail" -} - -# --- Error trap: catch any failure and report to bastion --- -_post_error_handler() { - local 
exit_code=$? lineno=$1 - bastion_error "%post failed at line $lineno (exit $exit_code)" -} -trap '_post_error_handler $LINENO' ERR bastion_progress "post-install" "configuring system" # -- SSH -- -systemctl enable --now sshd +# Note: only 'enable', not '--now' — systemd is not running in the Anaconda chroot +systemctl enable sshd || true sed -i 's/^#\\?PermitRootLogin.*/PermitRootLogin prohibit-password/' /etc/ssh/sshd_config sed -i 's/^#\\?PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config ${sshPostBlock} -# -- Hostname and domain -- -hostnamectl set-hostname ${fqdn} +bastion_progress "post-install" "1-ssh done" + +# -- Hostname and domain (write directly, hostnamectl needs D-Bus) -- +echo "${fqdn}" > /etc/hostname # -- tmpfs for /tmp -- echo "tmpfs /tmp tmpfs defaults,noatime,nosuid,nodev,size=4G 0 0" >> /etc/fstab ${isVanilla ? `# -- vanilla role: skip k3s kernel/sysctl/firewall setup -- # -- Enable chronyd for time sync -- -systemctl enable chronyd || true - -# -- Serial console (for debugging — auto-login as root on ttyS0) -- -# AWS EC2 compatible: ttyS0 @ 115200n8 -systemctl enable serial-getty@ttyS0.service || true - -# -- Forward all system logs to serial console -- -cat > /etc/rsyslog.d/serial-console.conf << 'RSYSLOG' -*.* /dev/ttyS0 -RSYSLOG -systemctl enable rsyslog || true` : `# -- Kernel modules for k3s -- +systemctl enable chronyd || true` : `# -- Kernel modules for k3s -- cat > /etc/modules-load.d/k3s.conf << 'MODULES' br_netfilter overlay @@ -376,12 +362,15 @@ SYSCTL sysctl --system || true # -- Disable firewalld permanently (k3s/Cilium manage iptables directly) -- -systemctl disable --now firewalld || true +# Note: no '--now' — systemd is not running in the Anaconda chroot +systemctl disable firewalld || true systemctl mask firewalld || true # -- Enable chronyd for time sync -- systemctl enable chronyd || true`} +bastion_progress "post-install" "2-system done" + # -- Boot order: restore network first (Anaconda sets disk 
first, we undo it) -- # Network boot must stay first so the bastion intercepts every reboot. if command -v efibootmgr >/dev/null 2>&1; then @@ -394,6 +383,11 @@ if command -v efibootmgr >/dev/null 2>&1; then fi fi +bastion_progress "post-install" "3-bootorder done" + +# -- Enable SysRq magic keys (for emergency reboot via Alt+SysRq+REISUB) -- +echo "kernel.sysrq=1" > /etc/sysctl.d/90-sysrq.conf + # -- Provisioning metadata -- cat > /etc/lab-provisioned << PROVEOF hostname: ${fqdn} @@ -419,6 +413,8 @@ README ${hasRancher ? `# Install k3s server (skip start - will be configured manually) curl -sfL https://get.k3s.io | INSTALL_K3S_SKIP_START=true sh - ` : ""} +bastion_progress "post-install" "4-metadata done" + IP_ADDR=$(ip -4 addr show | awk '/inet / && !/127.0.0/ {split($2,a,"/"); print a[1]; exit}') bastion_progress "complete" "ready at $IP_ADDR" diff --git a/bastion/src/bastion/tests/dispatch.test.ts b/bastion/src/bastion/tests/dispatch.test.ts index 0b9572b..3d07ac4 100644 --- a/bastion/src/bastion/tests/dispatch.test.ts +++ b/bastion/src/bastion/tests/dispatch.test.ts @@ -28,6 +28,7 @@ function createTestConfig(testDir: string): BastionConfig { gateway: "10.0.0.1", sshKeys: ["ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAITEST test@test"], adminUser: "testadmin", + syslogPort: 15514, skipDnsmasq: true, skipArtifacts: true, fedoraMirror: "https://download.fedoraproject.org/pub/fedora/linux/releases/43/Everything/x86_64/os", diff --git a/bastion/src/bastion/tests/kickstart.test.ts b/bastion/src/bastion/tests/kickstart.test.ts index 2629877..771f5d1 100644 --- a/bastion/src/bastion/tests/kickstart.test.ts +++ b/bastion/src/bastion/tests/kickstart.test.ts @@ -206,10 +206,8 @@ describe("renderInstallKickstart", () => { } }); - it("forwards system logs to serial console", () => { + it("does not include serial console (causes 30s boot timeout on hardware without UART)", () => { const ks = renderInstallKickstart(baseParams({ role: "vanilla" })); - 
expect(ks).toContain("serial-console.conf"); - expect(ks).toContain("/dev/ttyS0"); - expect(ks).toContain("rsyslog"); + expect(ks).not.toContain("ttyS0"); }); }); diff --git a/bastion/src/bastion/tests/state.test.ts b/bastion/src/bastion/tests/state.test.ts index 494b479..2b509b5 100644 --- a/bastion/src/bastion/tests/state.test.ts +++ b/bastion/src/bastion/tests/state.test.ts @@ -26,6 +26,7 @@ describe("StateManager", () => { discovered: {}, install_queue: {}, installed: {}, + debug: {}, }); }); @@ -39,6 +40,7 @@ describe("StateManager", () => { discovered: {}, install_queue: {}, installed: {}, + debug: {}, }); }); diff --git a/bastion/src/bastion/tests/syslog-listener.test.ts b/bastion/src/bastion/tests/syslog-listener.test.ts new file mode 100644 index 0000000..2ece0d5 --- /dev/null +++ b/bastion/src/bastion/tests/syslog-listener.test.ts @@ -0,0 +1,121 @@ +import { describe, it, expect, beforeEach, afterEach } from "vitest"; +import { createSocket } from "node:dgram"; +import { mkdtempSync, rmSync } from "node:fs"; +import { join } from "node:path"; +import { tmpdir } from "node:os"; +import { SyslogListener } from "../src/services/syslog-listener.js"; +import { InstallLogBuffer } from "../src/services/install-log.js"; +import { StateManager } from "../src/services/state.js"; + +function sendUdpSyslog(port: number, message: string): Promise { + return new Promise((resolve, reject) => { + const client = createSocket("udp4"); + const buf = Buffer.from(message); + client.send(buf, 0, buf.length, port, "127.0.0.1", (err) => { + client.close(); + if (err) reject(err); + else resolve(); + }); + }); +} + +describe("SyslogListener", () => { + let tmpDir: string; + let state: StateManager; + let installLog: InstallLogBuffer; + let syslog: SyslogListener; + const PORT = 15514; // use non-privileged port for testing + + beforeEach(() => { + tmpDir = mkdtempSync(join(tmpdir(), "syslog-test-")); + state = new StateManager(join(tmpDir, "state.json")); + state.init(); + 
installLog = new InstallLogBuffer(tmpDir); + syslog = new SyslogListener(PORT, installLog, state); + syslog.start(); + }); + + afterEach(() => { + syslog.stop(); + rmSync(tmpDir, { recursive: true, force: true }); + }); + + it("receives and stores syslog messages for registered IP", async () => { + const mac = "aa:bb:cc:dd:ee:ff"; + // Queue a machine so hostname can be resolved + state.update((s) => { + s.install_queue[mac] = { + hostname: "testnode", + disk: "/dev/sda", + role: "worker", + os: "fedora-43", + queued_at: new Date().toISOString(), + }; + }); + + // Register IP → MAC mapping + syslog.registerIp("127.0.0.1", mac); + + // Send a syslog message (RFC 3164 format) + await sendUdpSyslog(PORT, "<13>Mar 30 01:30:00 localhost anaconda[1234]: Installing package vim-enhanced"); + + // Wait for UDP delivery + await new Promise((r) => setTimeout(r, 200)); + + const lines = installLog.getLines(mac); + expect(lines.length).toBeGreaterThan(0); + expect(lines[0]!.line).toContain("anaconda"); + expect(lines[0]!.line).toContain("Installing package vim-enhanced"); + }); + + it("ignores messages from unknown IPs", async () => { + // Don't register any IP mapping + await sendUdpSyslog(PORT, "<13>Mar 30 01:30:00 localhost anaconda[1234]: test message"); + await new Promise((r) => setTimeout(r, 200)); + + // No MAC to check, but the listener should not crash + // and no logs should be stored for any MAC + expect(installLog.lineCount("unknown")).toBe(0); + }); + + it("resolves IP from installed machines state", async () => { + const mac = "11:22:33:44:55:66"; + state.update((s) => { + s.installed[mac] = { + hostname: "installed-node", + role: "worker", + ip: "127.0.0.1", + installed_at: new Date().toISOString(), + }; + }); + + await sendUdpSyslog(PORT, "<14>Mar 30 02:00:00 installed-node sshd[5678]: Accepted publickey for root"); + await new Promise((r) => setTimeout(r, 200)); + + const lines = installLog.getLines(mac); + expect(lines.length).toBeGreaterThan(0); + 
expect(lines[0]!.line).toContain("sshd"); + }); + + it("parses various syslog formats", async () => { + const mac = "aa:bb:cc:dd:ee:ff"; + syslog.registerIp("127.0.0.1", mac); + state.update((s) => { + s.install_queue[mac] = { + hostname: "testnode", + disk: "/dev/sda", + role: "worker", + os: "fedora-43", + queued_at: new Date().toISOString(), + }; + }); + + // Message without PID + await sendUdpSyslog(PORT, "<13>Mar 30 01:30:00 localhost kernel: NVMe device ready"); + await new Promise((r) => setTimeout(r, 200)); + + const lines = installLog.getLines(mac); + expect(lines.length).toBeGreaterThan(0); + expect(lines[0]!.line).toContain("kernel"); + }); +}); diff --git a/bastion/src/cli/src/api/client.ts b/bastion/src/cli/src/api/client.ts index c68f0e9..241c848 100644 --- a/bastion/src/cli/src/api/client.ts +++ b/bastion/src/cli/src/api/client.ts @@ -94,6 +94,10 @@ export class LabdClient { return this.request("POST", "/api/machines/install", { body: opts }); } + async debugMachine(mac: string, opts?: { pxeBoot?: boolean }): Promise<{ status: string; data?: { mac: string; hostname: string }; error?: string }> { + return this.request("POST", "/api/machines/debug", { body: { mac, pxeBoot: opts?.pxeBoot } }); + } + async forgetMachine(mac: string): Promise<{ status: string }> { return this.request("DELETE", `/api/machines/${encodeURIComponent(mac)}`); } diff --git a/bastion/src/cli/src/commands/debug.ts b/bastion/src/cli/src/commands/debug.ts new file mode 100644 index 0000000..2bb4f24 --- /dev/null +++ b/bastion/src/cli/src/commands/debug.ts @@ -0,0 +1,155 @@ +// CLI command: provision debug +// Queue a machine for debug/rescue PXE boot and optionally SSH reboot into PXE. 
+ +import { execFileSync } from "node:child_process"; +import { existsSync } from "node:fs"; +import { homedir } from "node:os"; +import { join } from "node:path"; +import { Command } from "commander"; +import type { BastionState } from "@lab/shared"; +import { getLabdClient } from "../api/config.js"; + +/** Resolve a target (hostname, MAC, or IP) to {mac, hostname, ip} from state. */ +function resolveTarget( + target: string, + state: BastionState, +): { mac: string; hostname: string; ip: string } | null { + const normalized = target.toLowerCase().replace(/-/g, ":"); + + if (state.installed[normalized]) { + const info = state.installed[normalized]; + return { mac: normalized, hostname: info.hostname, ip: info.ip }; + } + + if (state.discovered[normalized]) { + return { mac: normalized, hostname: normalized, ip: "" }; + } + + if (state.install_queue[normalized]) { + return { mac: normalized, hostname: state.install_queue[normalized].hostname, ip: "" }; + } + + for (const [mac, info] of Object.entries(state.installed)) { + if (info.hostname === target || info.hostname.startsWith(target + ".")) { + return { mac, hostname: info.hostname, ip: info.ip }; + } + } + + for (const [mac, info] of Object.entries(state.installed)) { + if (info.ip === target) { + return { mac, hostname: info.hostname, ip: info.ip }; + } + } + + return null; +} + +export function registerDebugCommand(parent: Command): void { + parent + .command("debug ") + .description("PXE boot into Fedora rescue mode for debugging (target: hostname, MAC, or IP)") + .option("--pxe-boot", "Boot installed system via PXE (kernel+initrd from network, root from NVMe)") + .showHelpAfterError(true) + .action(async (target: string, opts: { pxeBoot?: boolean }) => { + const client = getLabdClient(); + + // Resolve target from labd aggregated state + let state: BastionState; + try { + state = await client.getMachines(); + } catch (err) { + console.error(`Cannot reach labd: ${err instanceof Error ? 
err.message : String(err)}`); + process.exit(1); + } + + const resolved = resolveTarget(target, state); + if (!resolved) { + console.error(`Cannot find machine: ${target}`); + console.error("Provide a hostname, MAC, or IP of a known machine."); + console.error("Run 'labctl provision list' to see available machines."); + process.exit(1); + } + + const { mac, hostname, ip } = resolved; + console.log(`Queuing debug mode for ${hostname} (${mac})...`); + + try { + const result = await client.debugMachine(mac, { pxeBoot: opts.pxeBoot === true }); + if (result.error) { + console.error(`Failed: ${result.error}`); + process.exit(1); + } + } catch (err) { + console.error(`Failed to queue debug: ${err instanceof Error ? err.message : String(err)}`); + process.exit(1); + } + + // Try SSH reboot into PXE + if (ip !== "") { + const adminUser = process.env["SUDO_USER"] ?? process.env["USER"] ?? ""; + const effectiveUser = adminUser === "root" ? "" : adminUser; + + if (effectiveUser !== "") { + console.log(`\nAttempting SSH reboot into PXE (${effectiveUser}@${ip})...`); + + const sudoUser = process.env["SUDO_USER"]; + const realHome = sudoUser !== undefined ? join("/home", sudoUser) : homedir(); + const keyPaths = [ + join(realHome, ".ssh", "id_ed25519"), + join(realHome, ".ssh", "id_rsa"), + join(realHome, ".ssh", "id_ecdsa"), + ]; + const sshKey = keyPaths.find(k => existsSync(k)); + + const sshArgs = [ + "-o", "StrictHostKeyChecking=no", + "-o", "ConnectTimeout=10", + ...(sshKey !== undefined ? ["-i", sshKey] : []), + `${effectiveUser}@${ip}`, + 'PXE_ENTRY=$(sudo efibootmgr | grep -iE "pxe|network|ipv4" | head -1 | grep -oP "Boot\\K[0-9A-F]+"); if [ -n "$PXE_ENTRY" ]; then sudo efibootmgr --bootnext "$PXE_ENTRY" && echo "PXE set as next boot" && sudo reboot; else echo "No PXE boot entry found, rebooting anyway..." 
&& sudo reboot; fi', + ]; + + try { + execFileSync("ssh", sshArgs, { stdio: "inherit" }); + } catch { + // SSH connection closing during reboot is expected + } + } + } + + // Determine bastion URL from labd config for the setup script URL + const bastionUrl = process.env["LABD_URL"] + ? process.env["LABD_URL"].replace(/\/ws\/bastion$/, "").replace(/^wss?:/, "http:") + : "http://:8080"; + + console.log(` +Debug mode queued for ${hostname} (${mac}). +Reboot the machine to enter Fedora rescue mode. + +SSH access (started by Anaconda): + ssh root@ (password: debug) + +For nc remote shell, run from rescue shell: + curl ${bastionUrl}/debug-setup.sh | bash + +Once in rescue shell: + + # Activate LVM and mount installed system + vgchange -ay + mkdir -p /mnt/sysroot + mount /dev//root /mnt/sysroot + cat /mnt/sysroot/etc/fstab + mount /dev//var /mnt/sysroot/var + mount /dev//home /mnt/sysroot/home + + # Boot installed system in a container + /mnt/sysroot/usr/bin/systemd-nspawn -D /mnt/sysroot --boot + + # Or chroot for quick fixes + mount --bind /dev /mnt/sysroot/dev + mount --bind /proc /mnt/sysroot/proc + mount --bind /sys /mnt/sysroot/sys + chroot /mnt/sysroot +`); + }); +} diff --git a/bastion/src/cli/src/commands/logs.ts b/bastion/src/cli/src/commands/logs.ts index 85a59c1..48630a6 100644 --- a/bastion/src/cli/src/commands/logs.ts +++ b/bastion/src/cli/src/commands/logs.ts @@ -39,19 +39,25 @@ export function registerLogsCommand(parent: Command): void { parent .command("logs ") .description("Show provisioning logs for a machine (hostname, MAC, or IP)") - .action(async (target: string) => { + .option("-f, --follow", "Follow log output in real-time") + .action(async (target: string, opts: { follow?: boolean }) => { const mac = await resolveToMac(target); + const BOLD = "\x1b[1m"; + const GREEN = "\x1b[32m"; + const YELLOW = "\x1b[33m"; + const RED = "\x1b[31m"; + const DIM = "\x1b[2m"; + const RESET = "\x1b[0m"; + + if (opts.follow) { + await followLogs(mac, { BOLD, GREEN, 
YELLOW, RED, DIM, RESET }); + return; + } + try { const data = await getLabdClient().getMachineLogs(mac); - const BOLD = "\x1b[1m"; - const GREEN = "\x1b[32m"; - const YELLOW = "\x1b[33m"; - const RED = "\x1b[31m"; - const DIM = "\x1b[2m"; - const RESET = "\x1b[0m"; - console.log(`${BOLD}${data["hostname"]}${RESET} (${mac})`); console.log(` Status: ${data["status"] === "installed" ? GREEN : YELLOW}${data["status"]}${RESET}`); console.log(` Role: ${data["role"]}`); @@ -83,3 +89,58 @@ export function registerLogsCommand(parent: Command): void { } }); } + +/** Follow logs by polling labd. */ +async function followLogs( + mac: string, + colors: { BOLD: string; GREEN: string; YELLOW: string; RED: string; DIM: string; RESET: string }, +): Promise { + const { BOLD, GREEN, YELLOW, RED, DIM, RESET } = colors; + const client = getLabdClient(); + + console.log(`${DIM}Following logs for ${mac} (Ctrl+C to stop)${RESET}`); + console.log(""); + + let lastStageCount = 0; + let lastStatus = ""; + + while (true) { + try { + const data = await client.getMachineLogs(mac); + const status = String(data["status"] ?? ""); + const log = data["log"] as Array<{ stage: string; detail: string; timestamp: string }> | undefined; + + // Print header once or on status change + if (status !== lastStatus) { + const hostname = String(data["hostname"] ?? mac); + const statusColor = status === "installed" ? GREEN : YELLOW; + console.log(` ${BOLD}${hostname}${RESET} ${statusColor}${status}${RESET}`); + lastStatus = status; + } + + // Print new stages + if (log && log.length > lastStageCount) { + for (let i = lastStageCount; i < log.length; i++) { + const entry = log[i]!; + const time = entry.timestamp.slice(11, 19); + const color = entry.stage === "complete" ? GREEN : entry.stage === "error" ? RED : YELLOW; + const detail = entry.detail ? 
` ${DIM}-- ${entry.detail}${RESET}` : ""; + console.log(` ${DIM}${time}${RESET} ${color}${entry.stage}${RESET}${detail}`); + } + lastStageCount = log.length; + } + + // Done + if (status === "installed") { + const ip = data["ip"] ?? ""; + console.log(""); + console.log(` ${GREEN}${BOLD}Install complete!${RESET}${ip ? ` ${DIM}ssh lab@${ip}${RESET}` : ""}`); + process.exit(0); + } + } catch { + // Machine may not be in logs yet (still queued) + } + + await new Promise((r) => setTimeout(r, 5000)); + } +} diff --git a/bastion/src/cli/src/index.ts b/bastion/src/cli/src/index.ts index 0584ec5..00d0df9 100644 --- a/bastion/src/cli/src/index.ts +++ b/bastion/src/cli/src/index.ts @@ -14,6 +14,7 @@ import { registerStatusCommand } from "./commands/status.js"; import { registerInstallCommand } from "./commands/install.js"; import { registerListCommand } from "./commands/list.js"; import { registerReprovisionCommand } from "./commands/reprovision.js"; +import { registerDebugCommand } from "./commands/debug.js"; import { registerForgetCommand } from "./commands/forget.js"; import { registerLogsCommand } from "./commands/logs.js"; import { registerMakeIsoCommand } from "./commands/makeiso.js"; @@ -95,6 +96,7 @@ export function createProgram(): Command { registerListCommand(provisionCmd); registerInstallCommand(provisionCmd); registerReprovisionCommand(provisionCmd); + registerDebugCommand(provisionCmd); registerForgetCommand(provisionCmd); registerLogsCommand(provisionCmd); registerMakeIsoCommand(provisionCmd); diff --git a/bastion/src/labd/src/main.ts b/bastion/src/labd/src/main.ts index 17110d9..1c365ef 100644 --- a/bastion/src/labd/src/main.ts +++ b/bastion/src/labd/src/main.ts @@ -34,6 +34,7 @@ async function main(): Promise { server: { findMany: () => dbError(), findUnique: () => dbError(), + upsert: () => dbError(), }, joinToken: { findUnique: () => dbError(), diff --git a/bastion/src/labd/src/routes/bastions.ts b/bastion/src/labd/src/routes/bastions.ts index 
8ed15ec..9c8e181 100644 --- a/bastion/src/labd/src/routes/bastions.ts +++ b/bastion/src/labd/src/routes/bastions.ts @@ -80,9 +80,54 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void }); }); - // Aggregated machines from all connected bastions + // Aggregated machines from all connected bastions + DB fallback app.get("/api/machines", async () => { - return bastionRegistry.getAggregatedState(); + const live = bastionRegistry.getAggregatedState(); + + // Merge DB records for machines not currently in any bastion's live state + try { + const dbServers = (await db.server.findMany({})) as Array<{ + mac: string | null; hostname: string; role: string; ip: string | null; + status: string; labels: Record; + }>; + for (const s of dbServers) { + if (!s.mac) continue; + const mac = s.mac.toLowerCase(); + // Only add from DB if not already in live state + if (!(mac in live.discovered) && !(mac in live.install_queue) && !(mac in live.installed)) { + if (s.status === "discovered") { + live.discovered[mac] = { + mac, + product: String(s.labels?.product ?? "unknown"), + board: "unknown", + serial: "unknown", + manufacturer: String(s.labels?.manufacturer ?? "unknown"), + cpu_model: String(s.labels?.cpu ?? "unknown"), + cpu_cores: Number(s.labels?.cores ?? 0), + memory_gb: Number(s.labels?.memory_gb ?? 0), + arch: String(s.labels?.arch ?? "unknown"), + disks: [], + nics: [], + first_seen: "", + last_seen: "", + bastionId: "db", + }; + } else if (s.status === "online" || s.status === "offline") { + live.installed[mac] = { + hostname: s.hostname, + role: s.role, + ip: s.ip ?? 
"", + installed_at: "", + bastionId: "db", + }; + } + } + } + } catch { + // DB unavailable — return live state only + } + + return live; }); // Queue install — route to correct bastion by MAC @@ -106,7 +151,7 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void try { const result = await sendCommand(all[0]!.bastionId, { type: "command-install", - mac, hostname, disk: disk ?? "/dev/sda", role: role ?? "infra", os: os ?? "fedora-43", + mac, hostname, disk: disk ?? "", role: role ?? "infra", os: os ?? "fedora-43", }); return reply.code(result.status === "ok" ? 200 : 500).send(result); } catch (err) { @@ -119,7 +164,7 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void try { const result = await sendCommand(bastion.bastionId, { type: "command-install", - mac, hostname, disk: disk ?? "/dev/sda", role: role ?? "infra", os: os ?? "fedora-43", + mac, hostname, disk: disk ?? "", role: role ?? "infra", os: os ?? "fedora-43", }); return reply.code(result.status === "ok" ? 200 : 500).send(result); } catch (err) { @@ -127,6 +172,41 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void } }); + // Queue debug/rescue mode — route to correct bastion by MAC + app.post<{ + Body: { mac?: string; pxeBoot?: boolean }; + }>("/api/machines/debug", async (request, reply) => { + const mac = (request.body?.mac ?? "").toLowerCase().replace(/-/g, ":"); + const pxeBoot = request.body?.pxeBoot ?? false; + if (!mac) { + return reply.code(400).send({ error: "mac is required" }); + } + + const bastion = bastionRegistry.findBastionByMac(mac); + if (!bastion) { + const all = bastionRegistry.getAll(); + if (all.length === 0) { + return reply.code(503).send({ error: "No bastions connected" }); + } + if (all.length === 1) { + try { + const result = await sendCommand(all[0]!.bastionId, { type: "command-debug", mac, pxeBoot }); + return reply.code(result.status === "ok" ? 
200 : 500).send(result); + } catch (err) { + return reply.code(500).send({ error: err instanceof Error ? err.message : String(err) }); + } + } + return reply.code(404).send({ error: `MAC ${mac} not found on any bastion` }); + } + + try { + const result = await sendCommand(bastion.bastionId, { type: "command-debug", mac, pxeBoot }); + return reply.code(result.status === "ok" ? 200 : 500).send(result); + } catch (err) { + return reply.code(500).send({ error: err instanceof Error ? err.message : String(err) }); + } + }); + // Forget machine app.delete<{ Params: { mac: string } }>("/api/machines/:mac", async (request, reply) => { const mac = request.params.mac.toLowerCase().replace(/-/g, ":"); diff --git a/bastion/src/labd/src/server.ts b/bastion/src/labd/src/server.ts index b1bd8f4..b4ac77a 100644 --- a/bastion/src/labd/src/server.ts +++ b/bastion/src/labd/src/server.ts @@ -19,6 +19,7 @@ export interface DbClient { server: { findMany: (...args: unknown[]) => Promise; findUnique: (...args: unknown[]) => Promise; + upsert: (...args: unknown[]) => Promise; }; joinToken: { findUnique: (...args: unknown[]) => Promise; @@ -139,7 +140,7 @@ export async function createApp(_config: LabdConfig, db: DbClient): Promise<{ socket, connectedAt: new Date(), lastHeartbeat: new Date(), - state: { discovered: {}, install_queue: {}, installed: {} }, + state: { discovered: {}, install_queue: {}, installed: {}, debug: {} }, }); socket.send(JSON.stringify({ type: "bastion-enrolled", bastionId: record.id })); @@ -175,6 +176,52 @@ export async function createApp(_config: LabdConfig, db: DbClient): Promise<{ if (bastionId) { bastionRegistry.updateState(bastionId, msg.state); logger.info(`Bastion ${bastionId.slice(0, 8)} state sync: ${Object.keys(msg.state.discovered).length} discovered, ${Object.keys(msg.state.installed).length} installed`); + + // Persist machines to DB + void (async () => { + try { + // Upsert discovered machines + for (const [mac, hw] of 
Object.entries(msg.state.discovered)) { + await db.server.upsert({ + where: { mac }, + create: { + hostname: hw.product ?? mac, + mac, + role: "unknown", + status: "discovered", + labels: { cpu: hw.cpu_model, cores: hw.cpu_cores, memory_gb: hw.memory_gb, arch: hw.arch, product: hw.product, manufacturer: hw.manufacturer }, + }, + update: { + status: "discovered", + lastHeartbeat: new Date(), + labels: { cpu: hw.cpu_model, cores: hw.cpu_cores, memory_gb: hw.memory_gb, arch: hw.arch, product: hw.product, manufacturer: hw.manufacturer }, + }, + }); + } + // Upsert installed machines + for (const [mac, info] of Object.entries(msg.state.installed)) { + await db.server.upsert({ + where: { mac }, + create: { + hostname: info.hostname, + mac, + role: info.role ?? "worker", + ip: info.ip, + status: "online", + }, + update: { + hostname: info.hostname, + role: info.role ?? "worker", + ip: info.ip, + status: "online", + lastHeartbeat: new Date(), + }, + }); + } + } catch (err) { + logger.warn(`Failed to persist machines to DB: ${err instanceof Error ? 
err.message : String(err)}`); + } + })(); } break; } diff --git a/bastion/src/labd/src/services/bastion-registry.ts b/bastion/src/labd/src/services/bastion-registry.ts index 15d0570..dba23a4 100644 --- a/bastion/src/labd/src/services/bastion-registry.ts +++ b/bastion/src/labd/src/services/bastion-registry.ts @@ -3,7 +3,7 @@ import { EventEmitter } from "node:events"; import type { WebSocket } from "ws"; -import type { BastionState, HardwareInfo, InstallConfig, InstalledInfo } from "@lab/shared"; +import type { BastionState, HardwareInfo, InstallConfig, InstalledInfo, DebugConfig } from "@lab/shared"; export interface ConnectedBastion { bastionId: string; @@ -20,6 +20,7 @@ export interface AggregatedState { discovered: Record; install_queue: Record; installed: Record; + debug: Record; } export class BastionRegistry extends EventEmitter { @@ -86,6 +87,7 @@ export class BastionRegistry extends EventEmitter { discovered: {}, install_queue: {}, installed: {}, + debug: {}, }; for (const bastion of this.bastions.values()) { @@ -98,6 +100,9 @@ export class BastionRegistry extends EventEmitter { for (const [mac, info] of Object.entries(bastion.state.installed)) { result.installed[mac] = { ...info, bastionId: bastion.bastionId }; } + for (const [mac, dbg] of Object.entries(bastion.state.debug ?? 
{})) { + result.debug[mac] = { ...dbg }; + } } return result; diff --git a/bastion/src/shared/src/index.ts b/bastion/src/shared/src/index.ts index 7179a6d..443edbc 100644 --- a/bastion/src/shared/src/index.ts +++ b/bastion/src/shared/src/index.ts @@ -5,6 +5,7 @@ export type { HardwareInfo, InstallConfig, InstalledInfo, + DebugConfig, BastionState, BastionConfig, } from "./types/index.js"; diff --git a/bastion/src/shared/src/protocol/index.ts b/bastion/src/shared/src/protocol/index.ts index 6670c54..e2bdd1c 100644 --- a/bastion/src/shared/src/protocol/index.ts +++ b/bastion/src/shared/src/protocol/index.ts @@ -111,6 +111,7 @@ export type LabdBastionMessage = | { type: "command-install"; requestId: string; mac: string; hostname: string; disk?: string; role: string; os: string } | { type: "command-forget"; requestId: string; mac: string } | { type: "command-role-update"; requestId: string; mac: string; role: string } + | { type: "command-debug"; requestId: string; mac: string; pxeBoot?: boolean } | { type: "server-shutdown"; reconnectAfter: number }; export type BastionMessageType = BastionMessage["type"]; @@ -125,7 +126,7 @@ const BASTION_MESSAGE_TYPES = new Set([ const LABD_BASTION_MESSAGE_TYPES = new Set([ "bastion-enrolled", "bastion-heartbeat-ack", "command-install", - "command-forget", "command-role-update", "server-shutdown", + "command-forget", "command-role-update", "command-debug", "server-shutdown", ]); export function isBastionMessage(msg: unknown): msg is BastionMessage { diff --git a/bastion/src/shared/src/types/index.ts b/bastion/src/shared/src/types/index.ts index 8ff20ed..510ee31 100644 --- a/bastion/src/shared/src/types/index.ts +++ b/bastion/src/shared/src/types/index.ts @@ -5,6 +5,7 @@ export type { HardwareInfo, InstallConfig, InstalledInfo, + DebugConfig, BastionState, } from "./state.js"; diff --git a/bastion/src/shared/src/types/state.ts b/bastion/src/shared/src/types/state.ts index 9be3d21..689f09a 100644 --- 
a/bastion/src/shared/src/types/state.ts +++ b/bastion/src/shared/src/types/state.ts @@ -98,8 +98,15 @@ export interface InstalledInfo { bastionId?: string; // set when aggregated through labd } +export interface DebugConfig { + hostname: string; + queued_at: string; + pxeBoot?: boolean; +} + export interface BastionState { discovered: Record; install_queue: Record; installed: Record; + debug: Record; } diff --git a/bastion/tests/integration/helpers/jetkvm.sh b/bastion/tests/integration/helpers/jetkvm.sh new file mode 100755 index 0000000..e9ecbb3 --- /dev/null +++ b/bastion/tests/integration/helpers/jetkvm.sh @@ -0,0 +1,82 @@ +#!/bin/bash +# JetKVM helper — authenticate and interact with JetKVM device. +# Usage: +# jetkvm.sh status — check device status +# jetkvm.sh reboot — reboot the target machine via ATX +# jetkvm.sh poweron — power on via ATX short press +# jetkvm.sh poweroff — power off via ATX long press +# +# Environment: +# JETKVM_HOST — JetKVM IP (default: 192.168.3.10) +# JETKVM_PASS — device password + +set -euo pipefail + +HOST="${JETKVM_HOST:-192.168.3.10}" +PASS="${JETKVM_PASS:-}" + +if [ -z "$PASS" ]; then + echo "ERROR: JETKVM_PASS not set" >&2 + exit 1 +fi + +BASE="http://$HOST" + +# Authenticate and get token +login() { + local resp + resp=$(curl -s -X POST "$BASE/auth/login-local" \ + -H "Content-Type: application/json" \ + -d "{\"password\":\"$PASS\"}" 2>&1) + + local token + token=$(echo "$resp" | grep -oP '"token"\s*:\s*"[^"]*"' | head -1 | grep -oP '"[^"]*"$' | tr -d '"') + + if [ -z "$token" ]; then + echo "ERROR: Login failed: $resp" >&2 + exit 1 + fi + echo "$token" +} + +# Make authenticated request +api() { + local method="$1" path="$2" body="${3:-}" + local token + token=$(login) + + if [ -n "$body" ]; then + curl -s -X "$method" "$BASE$path" \ + -H "Authorization: Bearer $token" \ + -H "Content-Type: application/json" \ + -d "$body" + else + curl -s -X "$method" "$BASE$path" \ + -H "Authorization: Bearer $token" + fi +} + +case 
"${1:-status}" in + status) + curl -s "$BASE/device/status" 2>&1 + ;; + device) + api GET /device + ;; + reboot) + echo "Sending ATX reset..." + api POST /device/atx/reset + ;; + poweron) + echo "Sending ATX short power press..." + api POST /device/atx/power-short + ;; + poweroff) + echo "Sending ATX long power press..." + api POST /device/atx/power-long + ;; + *) + echo "Usage: $0 {status|device|reboot|poweron|poweroff}" + exit 1 + ;; +esac diff --git a/bastion/tests/integration/pxe-provision.test.ts b/bastion/tests/integration/pxe-provision.test.ts index 2d1f20b..9e5deb7 100644 --- a/bastion/tests/integration/pxe-provision.test.ts +++ b/bastion/tests/integration/pxe-provision.test.ts @@ -224,11 +224,12 @@ describe("PXE boot provisioning", () => { // Generate dnsmasq config generateDnsmasqConf(config); - // Start HTTP server - const { app, state } = createApp(config); + // Start HTTP server + syslog listener + const { app, state, syslog } = createApp(config); bastionApp = app; await app.listen({ port: config.httpPort, host: "0.0.0.0" }); - log(`Bastion HTTP server listening on :${HTTP_PORT}`); + syslog.start(); + log(`Bastion HTTP server listening on :${HTTP_PORT}, syslog on UDP :${config.syslogPort}`); // Start dnsmasq (fire-and-forget — it runs until killed) // May fail without root (DHCP socket needs CAP_NET_BIND_SERVICE); libvirt network provides DHCP fallback @@ -387,8 +388,8 @@ describe("PXE boot provisioning", () => { expect(data.progress).toBe("complete"); }); - it.skip("log lines were captured", async () => { - // Requires log streamer in %post — skipped until re-added + it("syslog install logs were captured", async () => { + // Anaconda forwards logs via syslog (logging --host directive in kickstart) const res = await fetch(`http://${BASTION_IP}:${HTTP_PORT}/api/logs/${encodeURIComponent(vmMac)}`); const data = (await res.json()) as { log_total?: number; log_lines?: Array<{ line: string }> }; expect(data.log_total).toBeGreaterThan(0);