From aae03d9877abb29a71f37ef05db2c8be1f121c9d Mon Sep 17 00:00:00 2001 From: Michal Date: Sun, 29 Mar 2026 00:58:00 +0000 Subject: [PATCH 01/16] fix: syslog parser TS strict null check, deploy script Co-Authored-By: Claude Opus 4.6 (1M context) --- bastion/scripts/deploy.sh | 74 +++++++++++++++++++ .../bastion/src/services/syslog-listener.ts | 2 +- 2 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 bastion/scripts/deploy.sh diff --git a/bastion/scripts/deploy.sh b/bastion/scripts/deploy.sh new file mode 100644 index 0000000..86b6f26 --- /dev/null +++ b/bastion/scripts/deploy.sh @@ -0,0 +1,74 @@ +#!/bin/bash +# Deploy bastion + labd to k3s cluster and install labctl locally. +# Usage: ./scripts/deploy.sh [bastion|labd|labctl|all] +# +# Builds container images with existing build scripts, pushes to Gitea +# registry, restarts k3s pods, and builds/installs labctl RPM. +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" + +cd "$PROJECT_DIR" + +# Load .env if present +if [ -f .env ]; then + set -a; source .env; set +a +fi + +deploy_bastion() { + echo "=== Building & pushing bastion image ===" + bash scripts/build-bastion.sh --push latest + echo "" + echo "=== Restarting bastion pod ===" + kubectl rollout restart deployment/bastion -n lab-infra + kubectl rollout status deployment/bastion -n lab-infra --timeout=180s + echo "✓ Bastion deployed" +} + +deploy_labd() { + echo "=== Building & pushing labd image ===" + bash scripts/build-labd.sh --push latest + echo "" + echo "=== Restarting labd pod ===" + kubectl rollout restart deployment/labd -n lab-system + kubectl rollout status deployment/labd -n lab-system --timeout=180s + echo "✓ Labd deployed" +} + +deploy_labctl() { + echo "=== Building labctl RPM ===" + bash scripts/build-rpm.sh + echo "" + echo "=== Installing labctl ===" + RPM_FILE=$(ls dist/labctl-*.x86_64.rpm 2>/dev/null | head -1) + if [ -n "$RPM_FILE" ]; then + sudo rpm -U --force 
"$RPM_FILE" + echo "✓ labctl installed: $(labctl --version 2>/dev/null || echo 'installed')" + else + echo "WARNING: No RPM found, falling back to direct install" + pnpm build + sudo install -m 755 <(echo '#!/bin/bash'; echo "exec node $PROJECT_DIR/src/cli/dist/index.js \"\$@\"") /usr/local/bin/labctl + echo "✓ labctl installed (dev mode)" + fi +} + +case "${1:-all}" in + bastion) deploy_bastion ;; + labd) deploy_labd ;; + labctl) deploy_labctl ;; + all) + deploy_bastion + echo "" + deploy_labd + echo "" + deploy_labctl + ;; + *) + echo "Usage: $0 [bastion|labd|labctl|all]" + exit 1 + ;; +esac + +echo "" +echo "=== Deploy complete ===" diff --git a/bastion/src/bastion/src/services/syslog-listener.ts b/bastion/src/bastion/src/services/syslog-listener.ts index 07c384e..1022d6c 100644 --- a/bastion/src/bastion/src/services/syslog-listener.ts +++ b/bastion/src/bastion/src/services/syslog-listener.ts @@ -18,7 +18,7 @@ function parseSyslogLine(raw: string): { program: string; message: string } { // Try to extract program and message after the timestamp + hostname // RFC 3164: "Mon DD HH:MM:SS HOSTNAME PROGRAM[PID]: MESSAGE" const match = noPri.match(/^\w+\s+\d+\s+[\d:]+\s+\S+\s+(\S+?)(?:\[\d+\])?:\s*(.*)/); - if (match) { + if (match?.[1] && match[2] !== undefined) { return { program: match[1], message: match[2] }; } // Fallback: just return the whole line From 0c1e18cee1bebe3eebb012d6b2a416efdfb257bc Mon Sep 17 00:00:00 2001 From: Michal Date: Sun, 29 Mar 2026 02:34:26 +0100 Subject: [PATCH 02/16] feat: persist machine state to CockroachDB on bastion-state-sync When bastion syncs state, labd now upserts discovered and installed machines into the Server table. /api/machines merges live bastion state with DB records, so machines survive pod restarts. Discovered machines get status=discovered with hardware labels. Installed machines get status=online with hostname, role, IP. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- bastion/src/labd/src/main.ts | 1 + bastion/src/labd/src/routes/bastions.ts | 49 ++++++++++++++++++++++++- bastion/src/labd/src/server.ts | 47 ++++++++++++++++++++++++ 3 files changed, 95 insertions(+), 2 deletions(-) diff --git a/bastion/src/labd/src/main.ts b/bastion/src/labd/src/main.ts index 17110d9..1c365ef 100644 --- a/bastion/src/labd/src/main.ts +++ b/bastion/src/labd/src/main.ts @@ -34,6 +34,7 @@ async function main(): Promise { server: { findMany: () => dbError(), findUnique: () => dbError(), + upsert: () => dbError(), }, joinToken: { findUnique: () => dbError(), diff --git a/bastion/src/labd/src/routes/bastions.ts b/bastion/src/labd/src/routes/bastions.ts index 8ed15ec..a1c0af8 100644 --- a/bastion/src/labd/src/routes/bastions.ts +++ b/bastion/src/labd/src/routes/bastions.ts @@ -80,9 +80,54 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void }); }); - // Aggregated machines from all connected bastions + // Aggregated machines from all connected bastions + DB fallback app.get("/api/machines", async () => { - return bastionRegistry.getAggregatedState(); + const live = bastionRegistry.getAggregatedState(); + + // Merge DB records for machines not currently in any bastion's live state + try { + const dbServers = (await db.server.findMany({})) as Array<{ + mac: string | null; hostname: string; role: string; ip: string | null; + status: string; labels: Record; + }>; + for (const s of dbServers) { + if (!s.mac) continue; + const mac = s.mac.toLowerCase(); + // Only add from DB if not already in live state + if (!(mac in live.discovered) && !(mac in live.install_queue) && !(mac in live.installed)) { + if (s.status === "discovered") { + live.discovered[mac] = { + mac, + product: String(s.labels?.product ?? "unknown"), + board: "unknown", + serial: "unknown", + manufacturer: String(s.labels?.manufacturer ?? "unknown"), + cpu_model: String(s.labels?.cpu ?? 
"unknown"), + cpu_cores: Number(s.labels?.cores ?? 0), + memory_gb: Number(s.labels?.memory_gb ?? 0), + arch: String(s.labels?.arch ?? "unknown"), + disks: [], + nics: [], + first_seen: "", + last_seen: "", + bastionId: "db", + }; + } else if (s.status === "online" || s.status === "offline") { + live.installed[mac] = { + hostname: s.hostname, + role: s.role, + ip: s.ip ?? "", + installed_at: "", + bastionId: "db", + }; + } + } + } + } catch { + // DB unavailable — return live state only + } + + return live; }); // Queue install — route to correct bastion by MAC diff --git a/bastion/src/labd/src/server.ts b/bastion/src/labd/src/server.ts index b1bd8f4..4881962 100644 --- a/bastion/src/labd/src/server.ts +++ b/bastion/src/labd/src/server.ts @@ -19,6 +19,7 @@ export interface DbClient { server: { findMany: (...args: unknown[]) => Promise; findUnique: (...args: unknown[]) => Promise; + upsert: (...args: unknown[]) => Promise; }; joinToken: { findUnique: (...args: unknown[]) => Promise; @@ -175,6 +176,52 @@ export async function createApp(_config: LabdConfig, db: DbClient): Promise<{ if (bastionId) { bastionRegistry.updateState(bastionId, msg.state); logger.info(`Bastion ${bastionId.slice(0, 8)} state sync: ${Object.keys(msg.state.discovered).length} discovered, ${Object.keys(msg.state.installed).length} installed`); + + // Persist machines to DB + void (async () => { + try { + // Upsert discovered machines + for (const [mac, hw] of Object.entries(msg.state.discovered)) { + await db.server.upsert({ + where: { mac }, + create: { + hostname: hw.product ?? 
mac, + mac, + role: "unknown", + status: "discovered", + labels: { cpu: hw.cpu_model, cores: hw.cpu_cores, memory_gb: hw.memory_gb, arch: hw.arch, product: hw.product, manufacturer: hw.manufacturer }, + }, + update: { + status: "discovered", + lastHeartbeat: new Date(), + labels: { cpu: hw.cpu_model, cores: hw.cpu_cores, memory_gb: hw.memory_gb, arch: hw.arch, product: hw.product, manufacturer: hw.manufacturer }, + }, + }); + } + // Upsert installed machines + for (const [mac, info] of Object.entries(msg.state.installed)) { + await db.server.upsert({ + where: { mac }, + create: { + hostname: info.hostname, + mac, + role: info.role ?? "worker", + ip: info.ip, + status: "online", + }, + update: { + hostname: info.hostname, + role: info.role ?? "worker", + ip: info.ip, + status: "online", + lastHeartbeat: new Date(), + }, + }); + } + } catch (err) { + logger.warn(`Failed to persist machines to DB: ${err instanceof Error ? err.message : String(err)}`); + } + })(); } break; } From a14fd04947ff0ced0f2340aaca17ca4f16e3ebcb Mon Sep 17 00:00:00 2001 From: Michal Date: Sun, 29 Mar 2026 03:01:21 +0100 Subject: [PATCH 03/16] fix: add nomodeset to iPXE kernel args (amdgpu hangs on SER9MAX) Radeon 780M GPU driver initialization hangs during Anaconda boot on SER9MAX. nomodeset disables kernel modesetting so the installer doesn't try to initialize the GPU. Co-Authored-By: Claude Opus 4.6 (1M context) --- bastion/src/bastion/src/templates/boot.ipxe.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bastion/src/bastion/src/templates/boot.ipxe.ts b/bastion/src/bastion/src/templates/boot.ipxe.ts index 72f329f..bfe1d57 100644 --- a/bastion/src/bastion/src/templates/boot.ipxe.ts +++ b/bastion/src/bastion/src/templates/boot.ipxe.ts @@ -42,7 +42,7 @@ echo Collecting hardware info... 
echo ============================================= echo -kernel http://${params.serverIp}:${params.httpPort}/vmlinuz inst.ks=http://${params.serverIp}:${params.httpPort}/discover.ks inst.stage2=${params.fedoraMirror} inst.text console=ttyS0,115200n8 console=tty0 +kernel http://${params.serverIp}:${params.httpPort}/vmlinuz inst.ks=http://${params.serverIp}:${params.httpPort}/discover.ks inst.stage2=${params.fedoraMirror} inst.text console=ttyS0,115200n8 console=tty0 nomodeset initrd http://${params.serverIp}:${params.httpPort}/initrd.img boot `; @@ -69,7 +69,7 @@ echo MAC: ${params.mac} echo ============================================= echo -kernel http://${params.serverIp}:${params.httpPort}/vmlinuz inst.ks=http://${params.serverIp}:${params.httpPort}/ks?mac=${params.mac} inst.repo=${params.fedoraMirror} inst.text console=ttyS0,115200n8 console=tty0 +kernel http://${params.serverIp}:${params.httpPort}/vmlinuz inst.ks=http://${params.serverIp}:${params.httpPort}/ks?mac=${params.mac} inst.repo=${params.fedoraMirror} inst.text console=ttyS0,115200n8 console=tty0 nomodeset initrd http://${params.serverIp}:${params.httpPort}/initrd.img boot `; From 5b04d3162bf87c13f7afa11b30825b8f157715bb Mon Sep 17 00:00:00 2001 From: Michal Date: Sun, 29 Mar 2026 11:07:48 +0100 Subject: [PATCH 04/16] fix: disable logging --host (UDP not exposed), add nomodeset + JetKVM helper - logging --host blocks Anaconda when syslog UDP port not reachable - nomodeset prevents amdgpu hang on SER9MAX (Radeon 780M) - JetKVM helper script for device control (status, reboot, power) Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/bastion/src/templates/install.ks.ts | 3 +- bastion/tests/integration/helpers/jetkvm.sh | 82 +++++++++++++++++++ 2 files changed, 84 insertions(+), 1 deletion(-) create mode 100755 bastion/tests/integration/helpers/jetkvm.sh diff --git a/bastion/src/bastion/src/templates/install.ks.ts b/bastion/src/bastion/src/templates/install.ks.ts index 019bbb8..af2d94a 100644 --- 
a/bastion/src/bastion/src/templates/install.ks.ts +++ b/bastion/src/bastion/src/templates/install.ks.ts @@ -121,7 +121,8 @@ ${userDirective} bootloader --append="console=tty0 console=ttyS0,115200n8" -logging --host=${serverIp} --port=${syslogPort} +# logging --host=${serverIp} --port=${syslogPort} +# Disabled: syslog UDP port needs to be exposed in k3s service/hostPort first url --mirrorlist=https://mirrors.fedoraproject.org/mirrorlist?repo=fedora-$releasever&arch=$basearch diff --git a/bastion/tests/integration/helpers/jetkvm.sh b/bastion/tests/integration/helpers/jetkvm.sh new file mode 100755 index 0000000..e9ecbb3 --- /dev/null +++ b/bastion/tests/integration/helpers/jetkvm.sh @@ -0,0 +1,82 @@ +#!/bin/bash +# JetKVM helper — authenticate and interact with JetKVM device. +# Usage: +# jetkvm.sh status — check device status +# jetkvm.sh reboot — reboot the target machine via ATX +# jetkvm.sh poweron — power on via ATX short press +# jetkvm.sh poweroff — power off via ATX long press +# +# Environment: +# JETKVM_HOST — JetKVM IP (default: 192.168.3.10) +# JETKVM_PASS — device password + +set -euo pipefail + +HOST="${JETKVM_HOST:-192.168.3.10}" +PASS="${JETKVM_PASS:-}" + +if [ -z "$PASS" ]; then + echo "ERROR: JETKVM_PASS not set" >&2 + exit 1 +fi + +BASE="http://$HOST" + +# Authenticate and get token +login() { + local resp + resp=$(curl -s -X POST "$BASE/auth/login-local" \ + -H "Content-Type: application/json" \ + -d "{\"password\":\"$PASS\"}" 2>&1) + + local token + token=$(echo "$resp" | grep -oP '"token"\s*:\s*"[^"]*"' | head -1 | grep -oP '"[^"]*"$' | tr -d '"') + + if [ -z "$token" ]; then + echo "ERROR: Login failed: $resp" >&2 + exit 1 + fi + echo "$token" +} + +# Make authenticated request +api() { + local method="$1" path="$2" body="${3:-}" + local token + token=$(login) + + if [ -n "$body" ]; then + curl -s -X "$method" "$BASE$path" \ + -H "Authorization: Bearer $token" \ + -H "Content-Type: application/json" \ + -d "$body" + else + curl -s -X "$method" 
"$BASE$path" \ + -H "Authorization: Bearer $token" + fi +} + +case "${1:-status}" in + status) + curl -s "$BASE/device/status" 2>&1 + ;; + device) + api GET /device + ;; + reboot) + echo "Sending ATX reset..." + api POST /device/atx/reset + ;; + poweron) + echo "Sending ATX short power press..." + api POST /device/atx/power-short + ;; + poweroff) + echo "Sending ATX long power press..." + api POST /device/atx/power-long + ;; + *) + echo "Usage: $0 {status|device|reboot|poweron|poweroff}" + exit 1 + ;; +esac From e3523d642c376aa016dd12fd73fa68950ba6818b Mon Sep 17 00:00:00 2001 From: Michal Date: Sun, 29 Mar 2026 12:32:02 +0100 Subject: [PATCH 05/16] fix: remove serial console from iPXE kernel args (may hang on SER9MAX) ttyS0 console output on iPXE kernel line may cause kernel hang on hardware without physical serial port. Removed from both discover and install iPXE scripts. Serial console stays in bootloader config for the installed system only. Co-Authored-By: Claude Opus 4.6 (1M context) --- bastion/src/bastion/src/templates/boot.ipxe.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bastion/src/bastion/src/templates/boot.ipxe.ts b/bastion/src/bastion/src/templates/boot.ipxe.ts index bfe1d57..d2fc3b6 100644 --- a/bastion/src/bastion/src/templates/boot.ipxe.ts +++ b/bastion/src/bastion/src/templates/boot.ipxe.ts @@ -42,7 +42,7 @@ echo Collecting hardware info... 
echo ============================================= echo -kernel http://${params.serverIp}:${params.httpPort}/vmlinuz inst.ks=http://${params.serverIp}:${params.httpPort}/discover.ks inst.stage2=${params.fedoraMirror} inst.text console=ttyS0,115200n8 console=tty0 nomodeset +kernel http://${params.serverIp}:${params.httpPort}/vmlinuz inst.ks=http://${params.serverIp}:${params.httpPort}/discover.ks inst.stage2=${params.fedoraMirror} inst.text nomodeset initrd http://${params.serverIp}:${params.httpPort}/initrd.img boot `; @@ -69,7 +69,7 @@ echo MAC: ${params.mac} echo ============================================= echo -kernel http://${params.serverIp}:${params.httpPort}/vmlinuz inst.ks=http://${params.serverIp}:${params.httpPort}/ks?mac=${params.mac} inst.repo=${params.fedoraMirror} inst.text console=ttyS0,115200n8 console=tty0 nomodeset +kernel http://${params.serverIp}:${params.httpPort}/vmlinuz inst.ks=http://${params.serverIp}:${params.httpPort}/ks?mac=${params.mac} inst.repo=${params.fedoraMirror} inst.text nomodeset initrd http://${params.serverIp}:${params.httpPort}/initrd.img boot `; From a7a6ad80983f25b03e044b446bcd6e3d5cae1dc1 Mon Sep 17 00:00:00 2001 From: Michal Date: Sun, 29 Mar 2026 12:38:41 +0100 Subject: [PATCH 06/16] fix: skip removable/USB disks in %pre, wait for NVMe init JetKVM virtual media appears as /dev/sda before NVMe initializes. Now: wait up to 10s for disks, skip removable disks and anything under 20GB. Fixes "ignoredisk: sda does not exist" on SER9MAX. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- bastion/src/bastion/src/templates/install.ks.ts | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/bastion/src/bastion/src/templates/install.ks.ts b/bastion/src/bastion/src/templates/install.ks.ts index af2d94a..a989309 100644 --- a/bastion/src/bastion/src/templates/install.ks.ts +++ b/bastion/src/bastion/src/templates/install.ks.ts @@ -88,8 +88,20 @@ chmod 440 /etc/sudoers.d/${adminUser}`; const diskLine = disk ? `DISK="${disk}"` : `DISK="" -for d in /dev/nvme0n1 /dev/sda /dev/vda; do - [ -b "$d" ] && { DISK="$(basename $d)"; break; } +# Wait up to 10s for NVMe/SCSI disks to appear (they init async in initrd) +for _wait in $(seq 1 10); do + for d in /dev/nvme0n1 /dev/sda /dev/vda; do + [ -b "$d" ] || continue + # Skip removable disks (USB, CD-ROM, JetKVM virtual media) + _bname=$(basename "$d") + [ -f "/sys/block/$_bname/removable" ] && [ "$(cat /sys/block/$_bname/removable)" = "1" ] && continue + # Skip disks smaller than 20GB (likely USB sticks) + _size=$(cat /sys/block/$_bname/size 2>/dev/null || echo 0) + [ "$_size" -lt 41943040 ] && continue + DISK="$_bname" + break 2 + done + sleep 1 done [ -z "$DISK" ] && { echo "ERROR: no disk found"; exit 1; }`; From 6c6d5763c47145ac934e1c2e9c21dccb84303b62 Mon Sep 17 00:00:00 2001 From: Michal Date: Sun, 29 Mar 2026 12:51:44 +0100 Subject: [PATCH 07/16] fix: skip USB-attached disks in %pre (JetKVM virtual media is SCSI-over-USB) Check sysfs device path for 'usb' to skip JetKVM virtual media which appears as /dev/sda but is not a real install target. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- bastion/src/bastion/src/templates/install.ks.ts | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bastion/src/bastion/src/templates/install.ks.ts b/bastion/src/bastion/src/templates/install.ks.ts index a989309..ea1a035 100644 --- a/bastion/src/bastion/src/templates/install.ks.ts +++ b/bastion/src/bastion/src/templates/install.ks.ts @@ -90,11 +90,14 @@ chmod 440 /etc/sudoers.d/${adminUser}`; : `DISK="" # Wait up to 10s for NVMe/SCSI disks to appear (they init async in initrd) for _wait in $(seq 1 10); do - for d in /dev/nvme0n1 /dev/sda /dev/vda; do + for d in /dev/nvme0n1 /dev/nvme1n1 /dev/sda /dev/sdb /dev/vda; do [ -b "$d" ] || continue - # Skip removable disks (USB, CD-ROM, JetKVM virtual media) _bname=$(basename "$d") + # Skip removable disks (USB, CD-ROM, JetKVM virtual media) [ -f "/sys/block/$_bname/removable" ] && [ "$(cat /sys/block/$_bname/removable)" = "1" ] && continue + # Skip USB-attached disks (JetKVM virtual media shows as SCSI over USB) + _transport=$(readlink -f /sys/block/$_bname/device 2>/dev/null || echo "") + echo "$_transport" | grep -q "usb" && continue # Skip disks smaller than 20GB (likely USB sticks) _size=$(cat /sys/block/$_bname/size 2>/dev/null || echo 0) [ "$_size" -lt 41943040 ] && continue From e87edfcfbda2591d5c53be63862089c9c1caad42 Mon Sep 17 00:00:00 2001 From: Michal Date: Sun, 29 Mar 2026 22:25:44 +0100 Subject: [PATCH 08/16] feat: PXE debug boot mode for rescue/diagnostics New `labctl provision debug ` command that PXE boots a machine into Fedora rescue mode (inst.rescue) for live debugging. Auto-clears after one boot so next reboot returns to normal. Adds debug state to BastionState, dispatch routing, API endpoints, labd command routing, and CLI with rescue workflow guide. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- bastion/completions/labctl.bash | 27 ++-- bastion/completions/labctl.fish | 25 +-- bastion/src/bastion/src/main.ts | 16 ++ bastion/src/bastion/src/routes/api.ts | 29 ++++ bastion/src/bastion/src/routes/dispatch.ts | 26 +++ bastion/src/bastion/src/services/state.ts | 2 + .../src/bastion/src/templates/boot.ipxe.ts | 27 ++++ bastion/src/bastion/src/templates/debug.ks.ts | 25 +++ .../src/bastion/src/templates/install.ks.ts | 40 ++--- bastion/src/bastion/tests/state.test.ts | 2 + bastion/src/cli/src/api/client.ts | 4 + bastion/src/cli/src/commands/debug.ts | 153 ++++++++++++++++++ bastion/src/cli/src/index.ts | 2 + bastion/src/labd/src/routes/bastions.ts | 34 ++++ .../src/labd/src/services/bastion-registry.ts | 7 +- bastion/src/shared/src/index.ts | 1 + bastion/src/shared/src/types/index.ts | 1 + bastion/src/shared/src/types/state.ts | 6 + 18 files changed, 368 insertions(+), 59 deletions(-) create mode 100644 bastion/src/bastion/src/templates/debug.ks.ts create mode 100644 bastion/src/cli/src/commands/debug.ts diff --git a/bastion/completions/labctl.bash b/bastion/completions/labctl.bash index f7d4743..b27c835 100644 --- a/bastion/completions/labctl.bash +++ b/bastion/completions/labctl.bash @@ -29,43 +29,46 @@ _labctl() { COMPREPLY=($(compgen -W "--dir -h --help" -- "$cur")) return ;; "init bastion standalone status") - COMPREPLY=($(compgen -W "--dir --port -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "-h --help" -- "$cur")) return ;; "init bastion standalone") COMPREPLY=($(compgen -W "start stop status -h --help" -- "$cur")) return ;; "app labcontroller deploy") - COMPREPLY=($(compgen -W "--user --port --crdb-replicas -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "--user --crdb-replicas -h --help" -- "$cur")) return ;; "app labcontroller status") - COMPREPLY=($(compgen -W "--user --port -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "--user -h --help" -- "$cur")) return ;; "app k3s install") - 
COMPREPLY=($(compgen -W "--role --user --port --k3s-server --k3s-token -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "--role --user --k3s-server --k3s-token -h --help" -- "$cur")) return ;; "app k3s health") - COMPREPLY=($(compgen -W "--user --port -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "--user -h --help" -- "$cur")) return ;; "app k3s list") - COMPREPLY=($(compgen -W "--user --port -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "--user -h --help" -- "$cur")) return ;; "init bastion") COMPREPLY=($(compgen -W "standalone -h --help" -- "$cur")) return ;; "provision list") - COMPREPLY=($(compgen -W "--port -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "-h --help" -- "$cur")) return ;; "provision install") - COMPREPLY=($(compgen -W "--role --os --disk --port -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "--role --os --disk -h --help" -- "$cur")) return ;; "provision reprovision") - COMPREPLY=($(compgen -W "--role --os --disk --port -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "--role --os --disk -h --help" -- "$cur")) return ;; "provision forget") - COMPREPLY=($(compgen -W "--port -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "-h --help" -- "$cur")) return ;; "provision logs") - COMPREPLY=($(compgen -W "-f --follow --port -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "-h --help" -- "$cur")) + return ;; + "provision makeiso") + COMPREPLY=($(compgen -W "--arch --local --out -h --help" -- "$cur")) return ;; "config list") COMPREPLY=($(compgen -W "-h --help" -- "$cur")) @@ -92,7 +95,7 @@ _labctl() { COMPREPLY=($(compgen -W "bastion -h --help" -- "$cur")) return ;; "provision") - COMPREPLY=($(compgen -W "list install reprovision forget logs -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "list install reprovision forget logs makeiso -h --help" -- "$cur")) return ;; "config") COMPREPLY=($(compgen -W "list get set path -h --help" -- "$cur")) diff --git a/bastion/completions/labctl.fish b/bastion/completions/labctl.fish index 832ad8e..6736142 
100644 --- a/bastion/completions/labctl.fish +++ b/bastion/completions/labctl.fish @@ -118,38 +118,28 @@ complete -c labctl -n "__labctl_in_cmd init bastion standalone start" -l foregro # init bastion standalone stop options complete -c labctl -n "__labctl_in_cmd init bastion standalone stop" -l dir -d 'Bastion data directory' -x -# init bastion standalone status options -complete -c labctl -n "__labctl_in_cmd init bastion standalone status" -l dir -d 'Bastion data directory' -x -complete -c labctl -n "__labctl_in_cmd init bastion standalone status" -l port -d 'Bastion HTTP port' -x - # provision subcommands complete -c labctl -n "__labctl_using_cmd provision" -a list -d 'List all known machines' complete -c labctl -n "__labctl_using_cmd provision" -a install -d 'Queue a discovered machine for OS installation' complete -c labctl -n "__labctl_using_cmd provision" -a reprovision -d 'Queue install + SSH reboot into PXE (target: hostname, MAC, or IP)' complete -c labctl -n "__labctl_using_cmd provision" -a forget -d 'Remove a machine from bastion state' complete -c labctl -n "__labctl_using_cmd provision" -a logs -d 'Show provisioning logs for a machine (hostname, MAC, or IP)' - -# provision list options -complete -c labctl -n "__labctl_in_cmd provision list" -l port -d 'Bastion HTTP port' -x +complete -c labctl -n "__labctl_using_cmd provision" -a makeiso -d 'Generate a UEFI-bootable iPXE ISO for network provisioning' # provision install options complete -c labctl -n "__labctl_in_cmd provision install" -l role -d 'Machine role (see below)' -xa 'vanilla worker infra labcontroller' complete -c labctl -n "__labctl_in_cmd provision install" -l os -d 'Operating system' -xa 'fedora-43 ubuntu-26.04' complete -c labctl -n "__labctl_in_cmd provision install" -l disk -d 'Target disk device (auto-detect if omitted)' -x -complete -c labctl -n "__labctl_in_cmd provision install" -l port -d 'Bastion HTTP port' -x # provision reprovision options complete -c labctl -n 
"__labctl_in_cmd provision reprovision" -l role -d 'Machine role (see below)' -xa 'vanilla worker infra labcontroller' complete -c labctl -n "__labctl_in_cmd provision reprovision" -l os -d 'Operating system' -xa 'fedora-43 ubuntu-26.04' complete -c labctl -n "__labctl_in_cmd provision reprovision" -l disk -d 'Target disk device (auto-detect if omitted)' -x -complete -c labctl -n "__labctl_in_cmd provision reprovision" -l port -d 'Bastion HTTP port' -x -# provision forget options -complete -c labctl -n "__labctl_in_cmd provision forget" -l port -d 'Bastion HTTP port' -x - -# provision logs options -complete -c labctl -n "__labctl_in_cmd provision logs" -s f -l follow -d 'Follow logs in real-time (SSE stream)' -complete -c labctl -n "__labctl_in_cmd provision logs" -l port -d 'Bastion HTTP port' -x +# provision makeiso options +complete -c labctl -n "__labctl_in_cmd provision makeiso" -l arch -d 'Target architecture(s)' -xa 'x86_64 aarch64' +complete -c labctl -n "__labctl_in_cmd provision makeiso" -l local -d 'Build ISO locally instead of using bastion-hosted URL' +complete -c labctl -n "__labctl_in_cmd provision makeiso" -l out -d 'Output path for local ISO build' -x # config subcommands complete -c labctl -n "__labctl_using_cmd config" -a list -d 'Show all configuration values' @@ -173,12 +163,10 @@ complete -c labctl -n "__labctl_using_cmd app labcontroller" -a status -d 'Check # app labcontroller deploy options complete -c labctl -n "__labctl_in_cmd app labcontroller deploy" -l user -d 'SSH user' -x -complete -c labctl -n "__labctl_in_cmd app labcontroller deploy" -l port -d 'Bastion HTTP port' -x complete -c labctl -n "__labctl_in_cmd app labcontroller deploy" -l crdb-replicas -d 'CockroachDB replicas' -x # app labcontroller status options complete -c labctl -n "__labctl_in_cmd app labcontroller status" -l user -d 'SSH user' -x -complete -c labctl -n "__labctl_in_cmd app labcontroller status" -l port -d 'Bastion HTTP port' -x # app k3s subcommands complete -c 
labctl -n "__labctl_using_cmd app k3s" -a install -d 'Install k3s on a target machine (hostname, IP, or MAC)' @@ -188,15 +176,12 @@ complete -c labctl -n "__labctl_using_cmd app k3s" -a list -d 'List installed ma # app k3s install options complete -c labctl -n "__labctl_in_cmd app k3s install" -l role -d 'k3s role: infra (server) or worker (agent)' -x complete -c labctl -n "__labctl_in_cmd app k3s install" -l user -d 'SSH user' -x -complete -c labctl -n "__labctl_in_cmd app k3s install" -l port -d 'Bastion HTTP port (for resolving target)' -x complete -c labctl -n "__labctl_in_cmd app k3s install" -l k3s-server -d 'k3s server URL (required for worker role)' -x complete -c labctl -n "__labctl_in_cmd app k3s install" -l k3s-token -d 'k3s join token (required for worker role)' -x # app k3s health options complete -c labctl -n "__labctl_in_cmd app k3s health" -l user -d 'SSH user' -x -complete -c labctl -n "__labctl_in_cmd app k3s health" -l port -d 'Bastion HTTP port' -x # app k3s list options complete -c labctl -n "__labctl_in_cmd app k3s list" -l user -d 'SSH user' -x -complete -c labctl -n "__labctl_in_cmd app k3s list" -l port -d 'Bastion HTTP port' -x diff --git a/bastion/src/bastion/src/main.ts b/bastion/src/bastion/src/main.ts index 6b2a621..03a49bf 100644 --- a/bastion/src/bastion/src/main.ts +++ b/bastion/src/bastion/src/main.ts @@ -266,6 +266,21 @@ export async function startBastion(overrides: Partial = {}): Prom return { status: "ok", data: { mac: msg.mac, hostname: msg.hostname } }; }); + labdConn.onCommand("command-debug", async (msg) => { + if (msg.type !== "command-debug") throw new Error("unexpected"); + const mac = msg.mac.toLowerCase(); + const currentState = state.load(); + const hostname = + currentState.installed[mac]?.hostname ?? + currentState.install_queue[mac]?.hostname ?? + currentState.discovered[mac]?.product ?? 
+ mac; + state.update((s) => { + s.debug[mac] = { hostname, queued_at: new Date().toISOString() }; + }); + return { status: "ok", data: { mac, hostname } }; + }); + labdConn.onCommand("command-forget", async (msg) => { if (msg.type !== "command-forget") throw new Error("unexpected"); const mac = msg.mac.toLowerCase(); @@ -273,6 +288,7 @@ export async function startBastion(overrides: Partial = {}): Prom delete s.discovered[mac]; delete s.install_queue[mac]; delete s.installed[mac]; + delete s.debug[mac]; }); return { status: "ok", data: { mac } }; }); diff --git a/bastion/src/bastion/src/routes/api.ts b/bastion/src/bastion/src/routes/api.ts index 96e1e7f..75a821a 100644 --- a/bastion/src/bastion/src/routes/api.ts +++ b/bastion/src/bastion/src/routes/api.ts @@ -189,6 +189,31 @@ export function registerApiRoutes( return reply.send({ status: "ok", lines: allLines.length }); }); + // Queue debug/rescue mode for a machine + app.post<{ + Body: { mac?: string }; + }>("/api/debug", async (request, reply) => { + const mac = (request.body?.mac ?? "").toLowerCase().replace(/-/g, ":"); + if (mac === "") { + return reply.status(400).send({ error: "mac is required" }); + } + + // Look up hostname from installed or discovered state + const currentState = state.load(); + const hostname = + currentState.installed[mac]?.hostname ?? + currentState.install_queue[mac]?.hostname ?? + currentState.discovered[mac]?.product ?? 
+ mac; + + state.update((s) => { + s.debug[mac] = { hostname, queued_at: new Date().toISOString() }; + }); + + logger.info(`DEBUG QUEUED: ${mac} -> ${hostname}`); + return reply.send({ status: "ok", mac, hostname }); + }); + // Delete a machine from all state app.delete<{ Params: { mac: string }; @@ -213,6 +238,10 @@ export function registerApiRoutes( delete s.installed[mac]; found = true; } + if (s.debug[mac] !== undefined) { + delete s.debug[mac]; + found = true; + } }); if (!found) { diff --git a/bastion/src/bastion/src/routes/dispatch.ts b/bastion/src/bastion/src/routes/dispatch.ts index 54221fc..c9df55c 100644 --- a/bastion/src/bastion/src/routes/dispatch.ts +++ b/bastion/src/bastion/src/routes/dispatch.ts @@ -10,9 +10,11 @@ import type { StateManager } from "../services/state.js"; import { renderDiscoverIpxe, renderInstallIpxe, + renderDebugIpxe, renderLocalBootIpxe, } from "../templates/boot.ipxe.js"; import { renderUbuntuInstallIpxe } from "../templates/ubuntu-boot.ipxe.js"; +import { renderDebugKickstart } from "../templates/debug.ks.js"; import { logger } from "../services/logger.js"; export function registerDispatchRoutes( @@ -20,10 +22,34 @@ export function registerDispatchRoutes( config: BastionConfig, state: StateManager, ): void { + // Serve debug/rescue kickstart (minimal: SSH keys + network) + app.get<{ Querystring: { mac?: string } }>("/debug.ks", async (_request, reply) => { + const ks = renderDebugKickstart({ sshKeys: config.sshKeys ?? [] }); + return reply.type("text/plain").send(ks); + }); + app.get<{ Querystring: { mac?: string } }>("/dispatch", async (request, reply) => { const mac = (request.query.mac ?? "").toLowerCase().replace(/-/g, ":"); const currentState = state.load(); + // Debug mode takes highest priority — auto-clear after serving once + const debugEntry = currentState.debug[mac]; + if (debugEntry) { + const hostname = debugEntry.hostname ?? 
"debug"; + logger.info(`DEBUG BOOT: ${mac} -> ${hostname} (rescue mode)`); + + state.update((s) => { delete s.debug[mac]; }); + + const script = renderDebugIpxe({ + mac, + hostname, + serverIp: config.serverIp, + httpPort: config.httpPort, + fedoraMirror: config.fedoraMirror, + }); + return reply.type("text/plain").send(script); + } + const queueEntry = currentState.install_queue[mac]; if (queueEntry) { const hostname = queueEntry.hostname ?? "lab-node"; diff --git a/bastion/src/bastion/src/services/state.ts b/bastion/src/bastion/src/services/state.ts index ea90218..68cb6a7 100644 --- a/bastion/src/bastion/src/services/state.ts +++ b/bastion/src/bastion/src/services/state.ts @@ -11,6 +11,7 @@ const EMPTY_STATE: BastionState = { discovered: {}, install_queue: {}, installed: {}, + debug: {}, }; export type StateChangeListener = (state: BastionState) => void; @@ -33,6 +34,7 @@ export class StateManager { discovered: parsed.discovered ?? {}, install_queue: parsed.install_queue ?? {}, installed: parsed.installed ?? {}, + debug: parsed.debug ?? {}, }; } catch { return { ...EMPTY_STATE }; diff --git a/bastion/src/bastion/src/templates/boot.ipxe.ts b/bastion/src/bastion/src/templates/boot.ipxe.ts index d2fc3b6..826633f 100644 --- a/bastion/src/bastion/src/templates/boot.ipxe.ts +++ b/bastion/src/bastion/src/templates/boot.ipxe.ts @@ -75,6 +75,33 @@ boot `; } +/** + * iPXE script for debug/rescue mode -- boots Fedora installer in rescue mode. + * Provides a shell with LVM tools, network, and SSH for inspecting installed systems. 
+ */ +export function renderDebugIpxe(params: { + mac: string; + hostname: string; + serverIp: string; + httpPort: number; + fedoraMirror: string; +}): string { + return `#!ipxe + +echo +echo ============================================= +echo Lab PXE Bastion - DEBUG/RESCUE MODE +echo Target: ${params.hostname} +echo MAC: ${params.mac} +echo ============================================= +echo + +kernel http://${params.serverIp}:${params.httpPort}/vmlinuz inst.rescue inst.text inst.sshd inst.ks=http://${params.serverIp}:${params.httpPort}/debug.ks?mac=${params.mac} inst.stage2=${params.fedoraMirror} +initrd http://${params.serverIp}:${params.httpPort}/initrd.img +boot +`; +} + /** * iPXE script for already-installed machines -- exits to boot from local disk. */ diff --git a/bastion/src/bastion/src/templates/debug.ks.ts b/bastion/src/bastion/src/templates/debug.ks.ts new file mode 100644 index 0000000..270fa34 --- /dev/null +++ b/bastion/src/bastion/src/templates/debug.ks.ts @@ -0,0 +1,25 @@ +// Debug/rescue kickstart template. +// Minimal: sets SSH access and network for Anaconda rescue mode. +// No disk operations, no packages, no %post. + +export interface DebugKickstartParams { + sshKeys: string[]; +} + +export function renderDebugKickstart(params: DebugKickstartParams): string { + const sshpw = "sshpw --username=root --plaintext lab-root-pw"; + const sshkeyLine = params.sshKeys.length > 0 + ? 
`sshkey --username=root "${params.sshKeys[0]}"` + : ""; + + return `# Lab Bastion -- Debug/Rescue Kickstart +# Minimal: only SSH + network for Anaconda rescue mode + +lang en_US.UTF-8 +keyboard uk +network --bootproto=dhcp --activate + +${sshpw} +${sshkeyLine} +`; +} diff --git a/bastion/src/bastion/src/templates/install.ks.ts b/bastion/src/bastion/src/templates/install.ks.ts index ea1a035..cf5ef73 100644 --- a/bastion/src/bastion/src/templates/install.ks.ts +++ b/bastion/src/bastion/src/templates/install.ks.ts @@ -322,39 +322,20 @@ bastion_progress() { -d "{\\"mac\\":\\"$mac\\",\\"stage\\":\\"$stage\\",\\"detail\\":\\"$detail\\"}" 2>/dev/null || true } -# Send log lines to bastion -bastion_log() { - local line="$1" - local mac=$(ip link show | awk '/ether/ && !/00:00:00:00/ {print $2; exit}') - curl -sf -X POST "http://${serverIp}:${httpPort}/api/log" \\ - -H "Content-Type: application/json" \\ - -d "{\\"mac\\":\\"$mac\\",\\"line\\":\\"$(echo "$line" | sed 's/\\\\/\\\\\\\\/g; s/"/\\\\"/g')\\"}\" \\ - --connect-timeout 5 --max-time 10 2>/dev/null || true -} - -# Send an error stage to bastion -bastion_error() { - local detail="$1" - bastion_progress "error" "$detail" -} - -# --- Error trap: catch any failure and report to bastion --- -_post_error_handler() { - local exit_code=$? 
lineno=$1 - bastion_error "%post failed at line $lineno (exit $exit_code)" -} -trap '_post_error_handler $LINENO' ERR bastion_progress "post-install" "configuring system" # -- SSH -- -systemctl enable --now sshd +# Note: only 'enable', not '--now' — systemd is not running in the Anaconda chroot +systemctl enable sshd || true sed -i 's/^#\\?PermitRootLogin.*/PermitRootLogin prohibit-password/' /etc/ssh/sshd_config sed -i 's/^#\\?PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config ${sshPostBlock} -# -- Hostname and domain -- -hostnamectl set-hostname ${fqdn} +bastion_progress "post-install" "1-ssh done" + +# -- Hostname and domain (write directly, hostnamectl needs D-Bus) -- +echo "${fqdn}" > /etc/hostname # -- tmpfs for /tmp -- echo "tmpfs /tmp tmpfs defaults,noatime,nosuid,nodev,size=4G 0 0" >> /etc/fstab @@ -392,12 +373,15 @@ SYSCTL sysctl --system || true # -- Disable firewalld permanently (k3s/Cilium manage iptables directly) -- -systemctl disable --now firewalld || true +# Note: no '--now' — systemd is not running in the Anaconda chroot +systemctl disable firewalld || true systemctl mask firewalld || true # -- Enable chronyd for time sync -- systemctl enable chronyd || true`} +bastion_progress "post-install" "2-system done" + # -- Boot order: restore network first (Anaconda sets disk first, we undo it) -- # Network boot must stay first so the bastion intercepts every reboot. if command -v efibootmgr >/dev/null 2>&1; then @@ -410,6 +394,8 @@ if command -v efibootmgr >/dev/null 2>&1; then fi fi +bastion_progress "post-install" "3-bootorder done" + # -- Provisioning metadata -- cat > /etc/lab-provisioned << PROVEOF hostname: ${fqdn} @@ -435,6 +421,8 @@ README ${hasRancher ? 
`# Install k3s server (skip start - will be configured manually) curl -sfL https://get.k3s.io | INSTALL_K3S_SKIP_START=true sh - ` : ""} +bastion_progress "post-install" "4-metadata done" + IP_ADDR=$(ip -4 addr show | awk '/inet / && !/127.0.0/ {split($2,a,"/"); print a[1]; exit}') bastion_progress "complete" "ready at $IP_ADDR" diff --git a/bastion/src/bastion/tests/state.test.ts b/bastion/src/bastion/tests/state.test.ts index 494b479..2b509b5 100644 --- a/bastion/src/bastion/tests/state.test.ts +++ b/bastion/src/bastion/tests/state.test.ts @@ -26,6 +26,7 @@ describe("StateManager", () => { discovered: {}, install_queue: {}, installed: {}, + debug: {}, }); }); @@ -39,6 +40,7 @@ describe("StateManager", () => { discovered: {}, install_queue: {}, installed: {}, + debug: {}, }); }); diff --git a/bastion/src/cli/src/api/client.ts b/bastion/src/cli/src/api/client.ts index c68f0e9..5ec68cf 100644 --- a/bastion/src/cli/src/api/client.ts +++ b/bastion/src/cli/src/api/client.ts @@ -94,6 +94,10 @@ export class LabdClient { return this.request("POST", "/api/machines/install", { body: opts }); } + async debugMachine(mac: string): Promise<{ status: string; data?: { mac: string; hostname: string }; error?: string }> { + return this.request("POST", "/api/machines/debug", { body: { mac } }); + } + async forgetMachine(mac: string): Promise<{ status: string }> { return this.request("DELETE", `/api/machines/${encodeURIComponent(mac)}`); } diff --git a/bastion/src/cli/src/commands/debug.ts b/bastion/src/cli/src/commands/debug.ts new file mode 100644 index 0000000..78b3f6c --- /dev/null +++ b/bastion/src/cli/src/commands/debug.ts @@ -0,0 +1,153 @@ +// CLI command: provision debug +// Queue a machine for debug/rescue PXE boot and optionally SSH reboot into PXE. 
+ +import { execFileSync } from "node:child_process"; +import { existsSync } from "node:fs"; +import { homedir } from "node:os"; +import { join } from "node:path"; +import { Command } from "commander"; +import type { BastionState } from "@lab/shared"; +import { getLabdClient } from "../api/config.js"; + +/** Resolve a target (hostname, MAC, or IP) to {mac, hostname, ip} from state. */ +function resolveTarget( + target: string, + state: BastionState, +): { mac: string; hostname: string; ip: string } | null { + const normalized = target.toLowerCase().replace(/-/g, ":"); + + if (state.installed[normalized]) { + const info = state.installed[normalized]; + return { mac: normalized, hostname: info.hostname, ip: info.ip }; + } + + if (state.discovered[normalized]) { + return { mac: normalized, hostname: normalized, ip: "" }; + } + + if (state.install_queue[normalized]) { + return { mac: normalized, hostname: state.install_queue[normalized].hostname, ip: "" }; + } + + for (const [mac, info] of Object.entries(state.installed)) { + if (info.hostname === target || info.hostname.startsWith(target + ".")) { + return { mac, hostname: info.hostname, ip: info.ip }; + } + } + + for (const [mac, info] of Object.entries(state.installed)) { + if (info.ip === target) { + return { mac, hostname: info.hostname, ip: info.ip }; + } + } + + return null; +} + +export function registerDebugCommand(parent: Command): void { + parent + .command("debug ") + .description("PXE boot into Fedora rescue mode for debugging (target: hostname, MAC, or IP)") + .showHelpAfterError(true) + .action(async (target: string) => { + const client = getLabdClient(); + + // Resolve target from labd aggregated state + let state: BastionState; + try { + state = await client.getMachines(); + } catch (err) { + console.error(`Cannot reach labd: ${err instanceof Error ? 
err.message : String(err)}`); + process.exit(1); + } + + const resolved = resolveTarget(target, state); + if (!resolved) { + console.error(`Cannot find machine: ${target}`); + console.error("Provide a hostname, MAC, or IP of a known machine."); + console.error("Run 'labctl provision list' to see available machines."); + process.exit(1); + } + + const { mac, hostname, ip } = resolved; + console.log(`Queuing debug mode for ${hostname} (${mac})...`); + + try { + const result = await client.debugMachine(mac); + if (result.error) { + console.error(`Failed: ${result.error}`); + process.exit(1); + } + } catch (err) { + console.error(`Failed to queue debug: ${err instanceof Error ? err.message : String(err)}`); + process.exit(1); + } + + // Try SSH reboot into PXE + if (ip !== "") { + const adminUser = process.env["SUDO_USER"] ?? process.env["USER"] ?? ""; + const effectiveUser = adminUser === "root" ? "" : adminUser; + + if (effectiveUser !== "") { + console.log(`\nAttempting SSH reboot into PXE (${effectiveUser}@${ip})...`); + + const sudoUser = process.env["SUDO_USER"]; + const realHome = sudoUser !== undefined ? join("/home", sudoUser) : homedir(); + const keyPaths = [ + join(realHome, ".ssh", "id_ed25519"), + join(realHome, ".ssh", "id_rsa"), + join(realHome, ".ssh", "id_ecdsa"), + ]; + const sshKey = keyPaths.find(k => existsSync(k)); + + const sshArgs = [ + "-o", "StrictHostKeyChecking=no", + "-o", "ConnectTimeout=10", + ...(sshKey !== undefined ? ["-i", sshKey] : []), + `${effectiveUser}@${ip}`, + 'PXE_ENTRY=$(sudo efibootmgr | grep -iE "pxe|network|ipv4" | head -1 | grep -oP "Boot\\K[0-9A-F]+"); if [ -n "$PXE_ENTRY" ]; then sudo efibootmgr --bootnext "$PXE_ENTRY" && echo "PXE set as next boot" && sudo reboot; else echo "No PXE boot entry found, rebooting anyway..." 
&& sudo reboot; fi', + ]; + + try { + execFileSync("ssh", sshArgs, { stdio: "inherit" }); + } catch { + // SSH connection closing during reboot is expected + } + } + } + + console.log(` +Debug mode queued for ${hostname} (${mac}). +Reboot the machine to enter Fedora rescue mode. + +Once in rescue shell: + + # Activate LVM + vgchange -ay labvg + + # Mount root + other volumes + mkdir -p /mnt/sysroot + mount /dev/labvg/root /mnt/sysroot + cat /mnt/sysroot/etc/fstab # check what else to mount + mount /dev/labvg/var /mnt/sysroot/var + mount /dev/labvg/home /mnt/sysroot/home + + # Boot the installed system in a container + /mnt/sysroot/usr/bin/systemd-nspawn -D /mnt/sysroot --boot + + # Or just chroot for quick fixes + mount --bind /dev /mnt/sysroot/dev + mount --bind /proc /mnt/sysroot/proc + mount --bind /sys /mnt/sysroot/sys + chroot /mnt/sysroot + + # Check initramfs size + ls -lh /mnt/sysroot/boot/initramfs-*.img + + # Rebuild initramfs without amdgpu + chroot /mnt/sysroot + echo 'omit_drivers+=" amdgpu "' > /etc/dracut.conf.d/omit-amdgpu.conf + dracut -f --regenerate-all +`); + }); +} diff --git a/bastion/src/cli/src/index.ts b/bastion/src/cli/src/index.ts index 0584ec5..00d0df9 100644 --- a/bastion/src/cli/src/index.ts +++ b/bastion/src/cli/src/index.ts @@ -14,6 +14,7 @@ import { registerStatusCommand } from "./commands/status.js"; import { registerInstallCommand } from "./commands/install.js"; import { registerListCommand } from "./commands/list.js"; import { registerReprovisionCommand } from "./commands/reprovision.js"; +import { registerDebugCommand } from "./commands/debug.js"; import { registerForgetCommand } from "./commands/forget.js"; import { registerLogsCommand } from "./commands/logs.js"; import { registerMakeIsoCommand } from "./commands/makeiso.js"; @@ -95,6 +96,7 @@ export function createProgram(): Command { registerListCommand(provisionCmd); registerInstallCommand(provisionCmd); registerReprovisionCommand(provisionCmd); + 
registerDebugCommand(provisionCmd); registerForgetCommand(provisionCmd); registerLogsCommand(provisionCmd); registerMakeIsoCommand(provisionCmd); diff --git a/bastion/src/labd/src/routes/bastions.ts b/bastion/src/labd/src/routes/bastions.ts index a1c0af8..9372dae 100644 --- a/bastion/src/labd/src/routes/bastions.ts +++ b/bastion/src/labd/src/routes/bastions.ts @@ -172,6 +172,40 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void } }); + // Queue debug/rescue mode — route to correct bastion by MAC + app.post<{ + Body: { mac?: string }; + }>("/api/machines/debug", async (request, reply) => { + const mac = (request.body?.mac ?? "").toLowerCase().replace(/-/g, ":"); + if (!mac) { + return reply.code(400).send({ error: "mac is required" }); + } + + const bastion = bastionRegistry.findBastionByMac(mac); + if (!bastion) { + const all = bastionRegistry.getAll(); + if (all.length === 0) { + return reply.code(503).send({ error: "No bastions connected" }); + } + if (all.length === 1) { + try { + const result = await sendCommand(all[0]!.bastionId, { type: "command-debug", mac }); + return reply.code(result.status === "ok" ? 200 : 500).send(result); + } catch (err) { + return reply.code(500).send({ error: err instanceof Error ? err.message : String(err) }); + } + } + return reply.code(404).send({ error: `MAC ${mac} not found on any bastion` }); + } + + try { + const result = await sendCommand(bastion.bastionId, { type: "command-debug", mac }); + return reply.code(result.status === "ok" ? 200 : 500).send(result); + } catch (err) { + return reply.code(500).send({ error: err instanceof Error ? 
err.message : String(err) }); + } + }); + // Forget machine app.delete<{ Params: { mac: string } }>("/api/machines/:mac", async (request, reply) => { const mac = request.params.mac.toLowerCase().replace(/-/g, ":"); diff --git a/bastion/src/labd/src/services/bastion-registry.ts b/bastion/src/labd/src/services/bastion-registry.ts index 15d0570..dba23a4 100644 --- a/bastion/src/labd/src/services/bastion-registry.ts +++ b/bastion/src/labd/src/services/bastion-registry.ts @@ -3,7 +3,7 @@ import { EventEmitter } from "node:events"; import type { WebSocket } from "ws"; -import type { BastionState, HardwareInfo, InstallConfig, InstalledInfo } from "@lab/shared"; +import type { BastionState, HardwareInfo, InstallConfig, InstalledInfo, DebugConfig } from "@lab/shared"; export interface ConnectedBastion { bastionId: string; @@ -20,6 +20,7 @@ export interface AggregatedState { discovered: Record; install_queue: Record; installed: Record; + debug: Record; } export class BastionRegistry extends EventEmitter { @@ -86,6 +87,7 @@ export class BastionRegistry extends EventEmitter { discovered: {}, install_queue: {}, installed: {}, + debug: {}, }; for (const bastion of this.bastions.values()) { @@ -98,6 +100,9 @@ export class BastionRegistry extends EventEmitter { for (const [mac, info] of Object.entries(bastion.state.installed)) { result.installed[mac] = { ...info, bastionId: bastion.bastionId }; } + for (const [mac, dbg] of Object.entries(bastion.state.debug ?? 
{})) { + result.debug[mac] = { ...dbg }; + } } return result; diff --git a/bastion/src/shared/src/index.ts b/bastion/src/shared/src/index.ts index 7179a6d..443edbc 100644 --- a/bastion/src/shared/src/index.ts +++ b/bastion/src/shared/src/index.ts @@ -5,6 +5,7 @@ export type { HardwareInfo, InstallConfig, InstalledInfo, + DebugConfig, BastionState, BastionConfig, } from "./types/index.js"; diff --git a/bastion/src/shared/src/types/index.ts b/bastion/src/shared/src/types/index.ts index 8ff20ed..510ee31 100644 --- a/bastion/src/shared/src/types/index.ts +++ b/bastion/src/shared/src/types/index.ts @@ -5,6 +5,7 @@ export type { HardwareInfo, InstallConfig, InstalledInfo, + DebugConfig, BastionState, } from "./state.js"; diff --git a/bastion/src/shared/src/types/state.ts b/bastion/src/shared/src/types/state.ts index 9be3d21..382d7d5 100644 --- a/bastion/src/shared/src/types/state.ts +++ b/bastion/src/shared/src/types/state.ts @@ -98,8 +98,14 @@ export interface InstalledInfo { bastionId?: string; // set when aggregated through labd } +export interface DebugConfig { + hostname: string; + queued_at: string; +} + export interface BastionState { discovered: Record; install_queue: Record; installed: Record; + debug: Record; } From 52150fd95590d2dcc73bf49f4909be946b08b270 Mon Sep 17 00:00:00 2001 From: Michal Date: Sun, 29 Mar 2026 22:42:52 +0100 Subject: [PATCH 09/16] fix: add command-debug to LabdBastionMessage protocol types Co-Authored-By: Claude Opus 4.6 (1M context) --- bastion/src/shared/src/protocol/index.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bastion/src/shared/src/protocol/index.ts b/bastion/src/shared/src/protocol/index.ts index 6670c54..231d84d 100644 --- a/bastion/src/shared/src/protocol/index.ts +++ b/bastion/src/shared/src/protocol/index.ts @@ -111,6 +111,7 @@ export type LabdBastionMessage = | { type: "command-install"; requestId: string; mac: string; hostname: string; disk?: string; role: string; os: string } | { type: 
"command-forget"; requestId: string; mac: string } | { type: "command-role-update"; requestId: string; mac: string; role: string } + | { type: "command-debug"; requestId: string; mac: string } | { type: "server-shutdown"; reconnectAfter: number }; export type BastionMessageType = BastionMessage["type"]; @@ -125,7 +126,7 @@ const BASTION_MESSAGE_TYPES = new Set([ const LABD_BASTION_MESSAGE_TYPES = new Set([ "bastion-enrolled", "bastion-heartbeat-ack", "command-install", - "command-forget", "command-role-update", "server-shutdown", + "command-forget", "command-role-update", "command-debug", "server-shutdown", ]); export function isBastionMessage(msg: unknown): msg is BastionMessage { From 82ca93f4d78dd8f03bee9e1a822741e358bfce82 Mon Sep 17 00:00:00 2001 From: Michal Date: Sun, 29 Mar 2026 22:54:02 +0100 Subject: [PATCH 10/16] fix: add debug field to inline BastionState in labd server Co-Authored-By: Claude Opus 4.6 (1M context) --- bastion/src/labd/src/server.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bastion/src/labd/src/server.ts b/bastion/src/labd/src/server.ts index 4881962..b4ac77a 100644 --- a/bastion/src/labd/src/server.ts +++ b/bastion/src/labd/src/server.ts @@ -140,7 +140,7 @@ export async function createApp(_config: LabdConfig, db: DbClient): Promise<{ socket, connectedAt: new Date(), lastHeartbeat: new Date(), - state: { discovered: {}, install_queue: {}, installed: {} }, + state: { discovered: {}, install_queue: {}, installed: {}, debug: {} }, }); socket.send(JSON.stringify({ type: "bastion-enrolled", bastionId: record.id })); From d7a59665ad492b93a770981152f0907ef8927dde Mon Sep 17 00:00:00 2001 From: Michal Date: Sun, 29 Mar 2026 23:01:16 +0100 Subject: [PATCH 11/16] fix: route command-debug through bastion WebSocket handler Co-Authored-By: Claude Opus 4.6 (1M context) --- bastion/src/bastion/src/services/labd-connection.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/bastion/src/bastion/src/services/labd-connection.ts 
b/bastion/src/bastion/src/services/labd-connection.ts index bfcd574..7fcd954 100644 --- a/bastion/src/bastion/src/services/labd-connection.ts +++ b/bastion/src/bastion/src/services/labd-connection.ts @@ -164,6 +164,7 @@ export class BastionConnection { case "command-install": case "command-forget": case "command-role-update": + case "command-debug": void this.handleCommand(msg); break; } From 3835fefba15544b101a7f43dc0f1fb25033f982f Mon Sep 17 00:00:00 2001 From: Michal Date: Sun, 29 Mar 2026 23:53:19 +0100 Subject: [PATCH 12/16] feat: debug --sshd flag, auto SSH + nc listener + IP callback When using `labctl provision debug --sshd`, the rescue kickstart generates host keys, starts sshd (pw: debug) and nc listener (port 2323), and reports the IP back to bastion via /api/progress callback. Fully self-contained, no mounted FS needed. Co-Authored-By: Claude Opus 4.6 (1M context) --- bastion/completions/labctl.bash | 5 +- bastion/completions/labctl.fish | 1 + bastion/docs/kickstart-reference.md | 103 ++++++++++++++++++ bastion/src/bastion/src/main.ts | 3 +- bastion/src/bastion/src/routes/api.ts | 5 +- bastion/src/bastion/src/routes/dispatch.ts | 13 ++- bastion/src/bastion/src/templates/debug.ks.ts | 59 +++++++++- bastion/src/cli/src/api/client.ts | 4 +- bastion/src/cli/src/commands/debug.ts | 5 +- bastion/src/labd/src/routes/bastions.ts | 7 +- bastion/src/shared/src/protocol/index.ts | 2 +- bastion/src/shared/src/types/state.ts | 1 + 12 files changed, 190 insertions(+), 18 deletions(-) create mode 100644 bastion/docs/kickstart-reference.md diff --git a/bastion/completions/labctl.bash b/bastion/completions/labctl.bash index b27c835..21f615e 100644 --- a/bastion/completions/labctl.bash +++ b/bastion/completions/labctl.bash @@ -61,6 +61,9 @@ _labctl() { "provision reprovision") COMPREPLY=($(compgen -W "--role --os --disk -h --help" -- "$cur")) return ;; + "provision debug") + COMPREPLY=($(compgen -W "-h --help" -- "$cur")) + return ;; "provision forget") 
COMPREPLY=($(compgen -W "-h --help" -- "$cur")) return ;; @@ -95,7 +98,7 @@ _labctl() { COMPREPLY=($(compgen -W "bastion -h --help" -- "$cur")) return ;; "provision") - COMPREPLY=($(compgen -W "list install reprovision forget logs makeiso -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "list install reprovision debug forget logs makeiso -h --help" -- "$cur")) return ;; "config") COMPREPLY=($(compgen -W "list get set path -h --help" -- "$cur")) diff --git a/bastion/completions/labctl.fish b/bastion/completions/labctl.fish index 6736142..a1d4aab 100644 --- a/bastion/completions/labctl.fish +++ b/bastion/completions/labctl.fish @@ -122,6 +122,7 @@ complete -c labctl -n "__labctl_in_cmd init bastion standalone stop" -l dir -d ' complete -c labctl -n "__labctl_using_cmd provision" -a list -d 'List all known machines' complete -c labctl -n "__labctl_using_cmd provision" -a install -d 'Queue a discovered machine for OS installation' complete -c labctl -n "__labctl_using_cmd provision" -a reprovision -d 'Queue install + SSH reboot into PXE (target: hostname, MAC, or IP)' +complete -c labctl -n "__labctl_using_cmd provision" -a debug -d 'PXE boot into Fedora rescue mode for debugging (target: hostname, MAC, or IP)' complete -c labctl -n "__labctl_using_cmd provision" -a forget -d 'Remove a machine from bastion state' complete -c labctl -n "__labctl_using_cmd provision" -a logs -d 'Show provisioning logs for a machine (hostname, MAC, or IP)' complete -c labctl -n "__labctl_using_cmd provision" -a makeiso -d 'Generate a UEFI-bootable iPXE ISO for network provisioning' diff --git a/bastion/docs/kickstart-reference.md b/bastion/docs/kickstart-reference.md new file mode 100644 index 0000000..2bf687c --- /dev/null +++ b/bastion/docs/kickstart-reference.md @@ -0,0 +1,103 @@ +# Kickstart Reference — Lessons Learned + +This documents pitfalls discovered during PXE boot testing. Read before modifying +the kickstart template (`src/bastion/src/templates/install.ks.ts`). 
+ +## Package requirements + +### `kernel-modules` is mandatory + +`@core` only installs `kernel-modules-core`, which lacks common modules like `vfat`, +`zram`, and many network/filesystem drivers. Without `kernel-modules`: + +- `/boot/efi` (FAT32) cannot mount → `systemd-remount-fs` fails → **root stays + read-only** → sshd-keygen can't write host keys → SSH unreachable +- `zram-generator` fails → can trigger emergency mode + +**Always include `kernel-modules` in %packages.** This matches what the real +labmaster (192.168.8.11) has installed. + +Regression introduced in commit `fac14b6` which removed `@server-product` +(that group pulled in `kernel-modules` via `fedora-release-server`). + +### `dosfstools` is needed + +Provides `mkfs.vfat` and ensures FAT filesystem support is available. The real +labmaster has it installed. + +### Verify against the real machine + +Before changing the package list, SSH to the labmaster and compare: +```bash +ssh 192.168.8.11 "rpm -q <package>" +``` + +## Anaconda %post execution order + +This is critical and not well documented: + +1. `%pre` scripts run +2. Disk partitioning and formatting +3. Package installation +4. **Anaconda writes system config (fstab, hostname, etc.)** +5. `%post` scripts run (in chroot of installed system) +6. `%post --nochroot` scripts run +7. **Anaconda MAY overwrite fstab again after %post scripts** + +**Consequence:** You cannot reliably modify `/etc/fstab` from `%post` or +`%post --nochroot`. Anaconda overwrites it. Tested and confirmed — both +`sed` in %post and %post --nochroot had no effect on the final fstab. 
+ +What DOES work from %post: +- Writing files to `/etc/` (systemd units, config files, SSH keys) +- Enabling/disabling systemd services +- Installing additional packages +- Running `systemctl enable/mask` + +What does NOT work from %post: +- Modifying `/etc/fstab` (Anaconda overwrites it) +- `--fsoptions` on `part /boot/efi` (Anaconda ignores it for EFI partitions) + +## UEFI / EFI partition + +- Anaconda always creates an EFI System Partition for UEFI installs +- The EFI partition is FAT32 — requires `vfat` kernel module to mount +- If `/boot/efi` fails to mount, `systemd-remount-fs` fails, which leaves + root as read-only. This cascades to break ALL services that need to write +- The EFI partition is used by firmware directly for bootloader — the OS + doesn't strictly need it mounted, but Anaconda adds it to fstab + +## VM-specific issues (libvirt/QEMU/OVMF) + +### iPXE exit behavior +- `exit` (no args) returns EFI_SUCCESS → OVMF retries PXE, never reaches disk +- `exit 1` returns EFI_ABORTED → OVMF moves to next boot device (disk) +- VM boot order needs both `network` and `hd`: `--boot=uefi,network,hd` + +### nftables +- libvirt creates reject rules for NAT networks in table `ip libvirt_network` + (NOT `inet libvirt` — this wrong table name cost hours of debugging) +- These rules block new host→VM connections (SSH) +- Rules are recreated on every `virsh start` — must delete after each VM restart +- Chains: `guest_input` and `guest_output` + +### Serial console +- VM serial port: `--serial=tcp,host=127.0.0.1:4555,mode=bind,protocol=telnet` +- Use `virsh console <domain>` for interactive access (handles telnet protocol) +- Raw `socat` works for reading but pagers/readline break interactive use +- Add `console=ttyS0,115200n8` to kernel args for boot output on serial + +### SELinux on labmaster +- Set to **permissive** — this is for k3s/kubernetes, NOT because SSH needs it +- SSH works fine with SELinux enforcing on a properly installed Fedora system +- The `ld.so.cache` 
AVC denials seen during debugging were caused by the + read-only root filesystem, not by SELinux policy + +## Testing checklist + +Before merging kickstart changes: +1. Check the real labmaster has the same packages: `ssh 192.168.8.11 "rpm -q <package>"` +2. Run the PXE integration test: `sudo pnpm run test:integration:pxe` +3. Verify via serial console (root / `lab-root-pw`) if SSH fails +4. Check `mount | grep " / "` — must show `rw`, not `ro` +5. Check `systemctl --failed` — no critical failures diff --git a/bastion/src/bastion/src/main.ts b/bastion/src/bastion/src/main.ts index 03a49bf..fe3a3ac 100644 --- a/bastion/src/bastion/src/main.ts +++ b/bastion/src/bastion/src/main.ts @@ -269,6 +269,7 @@ export async function startBastion(overrides: Partial = {}): Prom labdConn.onCommand("command-debug", async (msg) => { if (msg.type !== "command-debug") throw new Error("unexpected"); const mac = msg.mac.toLowerCase(); + const sshd = msg.sshd ?? false; const currentState = state.load(); const hostname = currentState.installed[mac]?.hostname ?? @@ -276,7 +277,7 @@ export async function startBastion(overrides: Partial = {}): Prom currentState.discovered[mac]?.product ?? mac; state.update((s) => { - s.debug[mac] = { hostname, queued_at: new Date().toISOString() }; + s.debug[mac] = { hostname, queued_at: new Date().toISOString(), sshd }; }); return { status: "ok", data: { mac, hostname } }; }); diff --git a/bastion/src/bastion/src/routes/api.ts b/bastion/src/bastion/src/routes/api.ts index 75a821a..5718357 100644 --- a/bastion/src/bastion/src/routes/api.ts +++ b/bastion/src/bastion/src/routes/api.ts @@ -191,9 +191,10 @@ export function registerApiRoutes( // Queue debug/rescue mode for a machine app.post<{ - Body: { mac?: string }; + Body: { mac?: string; sshd?: boolean }; }>("/api/debug", async (request, reply) => { const mac = (request.body?.mac ?? "").toLowerCase().replace(/-/g, ":"); + const sshd = request.body?.sshd ?? 
false; if (mac === "") { return reply.status(400).send({ error: "mac is required" }); } @@ -207,7 +208,7 @@ export function registerApiRoutes( mac; state.update((s) => { - s.debug[mac] = { hostname, queued_at: new Date().toISOString() }; + s.debug[mac] = { hostname, queued_at: new Date().toISOString(), sshd }; }); logger.info(`DEBUG QUEUED: ${mac} -> ${hostname}`); diff --git a/bastion/src/bastion/src/routes/dispatch.ts b/bastion/src/bastion/src/routes/dispatch.ts index c9df55c..5361d0f 100644 --- a/bastion/src/bastion/src/routes/dispatch.ts +++ b/bastion/src/bastion/src/routes/dispatch.ts @@ -23,8 +23,17 @@ export function registerDispatchRoutes( state: StateManager, ): void { // Serve debug/rescue kickstart (minimal: SSH keys + network) - app.get<{ Querystring: { mac?: string } }>("/debug.ks", async (_request, reply) => { - const ks = renderDebugKickstart({ sshKeys: config.sshKeys ?? [] }); + app.get<{ Querystring: { mac?: string; sshd?: string } }>("/debug.ks", async (request, reply) => { + const mac = (request.query.mac ?? "").toLowerCase().replace(/-/g, ":"); + const currentState = state.load(); + const wantSshd = request.query.sshd === "1" || currentState.debug[mac]?.sshd === true; + + const ks = renderDebugKickstart({ + sshKeys: config.sshKeys ?? [], + sshd: wantSshd, + serverIp: config.serverIp, + httpPort: config.httpPort, + }); return reply.type("text/plain").send(ks); }); diff --git a/bastion/src/bastion/src/templates/debug.ks.ts b/bastion/src/bastion/src/templates/debug.ks.ts index 270fa34..27b0b4b 100644 --- a/bastion/src/bastion/src/templates/debug.ks.ts +++ b/bastion/src/bastion/src/templates/debug.ks.ts @@ -1,9 +1,13 @@ // Debug/rescue kickstart template. -// Minimal: sets SSH access and network for Anaconda rescue mode. -// No disk operations, no packages, no %post. +// Minimal kickstart for Anaconda rescue mode. +// When sshd=true: generates host keys, starts sshd, reports IP to bastion. 
+// No dependency on mounted filesystems — fully self-contained. export interface DebugKickstartParams { sshKeys: string[]; + sshd?: boolean; + serverIp?: string; + httpPort?: number; } export function renderDebugKickstart(params: DebugKickstartParams): string { @@ -12,8 +16,55 @@ export function renderDebugKickstart(params: DebugKickstartParams): string { ? `sshkey --username=root "${params.sshKeys[0]}"` : ""; + const sshdSetup = params.sshd ? ` +%post --nochroot --log=/tmp/debug-sshd.log +#!/bin/bash +set -x + +# Generate host keys (self-contained, no mounted FS needed) +ssh-keygen -t ed25519 -f /tmp/ssh_host_ed25519_key -N "" -q +ssh-keygen -t rsa -f /tmp/ssh_host_rsa_key -N "" -q + +# Write minimal sshd config +cat > /tmp/sshd_config << 'SSHCFG' +HostKey /tmp/ssh_host_ed25519_key +HostKey /tmp/ssh_host_rsa_key +PermitRootLogin yes +PasswordAuthentication yes +PubkeyAuthentication yes +AuthorizedKeysFile /root/.ssh/authorized_keys +SSHCFG + +# Set root password for SSH access +echo "root:debug" | chpasswd + +# Set up SSH authorized keys +mkdir -p /root/.ssh && chmod 700 /root/.ssh +${params.sshKeys.map(k => `echo '${k}' >> /root/.ssh/authorized_keys`).join("\n")} +chmod 600 /root/.ssh/authorized_keys 2>/dev/null || true + +# Start sshd +/usr/sbin/sshd -f /tmp/sshd_config -p 22 +echo "sshd started on port 22" + +# Start persistent nc listener for remote shell +(while true; do nc -l -p 2323 -e /bin/bash 2>/dev/null; done) & +echo "nc shell listener on port 2323" + +# Report IP to bastion +sleep 2 +IP_ADDR=$(ip -4 addr show | awk '/inet / && !/127.0.0/ {split($2,a,"/"); print a[1]; exit}') +MAC_ADDR=$(ip link show | awk '/ether/ && !/00:00:00:00/ {print $2; exit}') +curl -sf -X POST "http://${params.serverIp}:${params.httpPort}/api/progress" \\ + -H "Content-Type: application/json" \\ + -d "{\\"mac\\":\\"$MAC_ADDR\\",\\"stage\\":\\"debug-ready\\",\\"detail\\":\\"ssh root@$IP_ADDR (pw: debug) | nc $IP_ADDR 2323\\"}" 2>/dev/null || true + +echo "Debug environment 
ready: ssh root@$IP_ADDR or nc $IP_ADDR 2323" +%end +` : ""; + return `# Lab Bastion -- Debug/Rescue Kickstart -# Minimal: only SSH + network for Anaconda rescue mode +# Minimal: SSH + network for Anaconda rescue mode lang en_US.UTF-8 keyboard uk @@ -21,5 +72,5 @@ network --bootproto=dhcp --activate ${sshpw} ${sshkeyLine} -`; +${sshdSetup}`; } diff --git a/bastion/src/cli/src/api/client.ts b/bastion/src/cli/src/api/client.ts index 5ec68cf..c7bfaa0 100644 --- a/bastion/src/cli/src/api/client.ts +++ b/bastion/src/cli/src/api/client.ts @@ -94,8 +94,8 @@ export class LabdClient { return this.request("POST", "/api/machines/install", { body: opts }); } - async debugMachine(mac: string): Promise<{ status: string; data?: { mac: string; hostname: string }; error?: string }> { - return this.request("POST", "/api/machines/debug", { body: { mac } }); + async debugMachine(mac: string, opts?: { sshd?: boolean }): Promise<{ status: string; data?: { mac: string; hostname: string }; error?: string }> { + return this.request("POST", "/api/machines/debug", { body: { mac, sshd: opts?.sshd } }); } async forgetMachine(mac: string): Promise<{ status: string }> { diff --git a/bastion/src/cli/src/commands/debug.ts b/bastion/src/cli/src/commands/debug.ts index 78b3f6c..49e2847 100644 --- a/bastion/src/cli/src/commands/debug.ts +++ b/bastion/src/cli/src/commands/debug.ts @@ -48,8 +48,9 @@ export function registerDebugCommand(parent: Command): void { parent .command("debug ") .description("PXE boot into Fedora rescue mode for debugging (target: hostname, MAC, or IP)") + .option("--sshd", "Start SSH + nc listener automatically, report IP to bastion") .showHelpAfterError(true) - .action(async (target: string) => { + .action(async (target: string, opts: { sshd?: boolean }) => { const client = getLabdClient(); // Resolve target from labd aggregated state @@ -73,7 +74,7 @@ export function registerDebugCommand(parent: Command): void { console.log(`Queuing debug mode for ${hostname} (${mac})...`); 
try { - const result = await client.debugMachine(mac); + const result = await client.debugMachine(mac, { sshd: opts.sshd === true }); if (result.error) { console.error(`Failed: ${result.error}`); process.exit(1); diff --git a/bastion/src/labd/src/routes/bastions.ts b/bastion/src/labd/src/routes/bastions.ts index 9372dae..ea694cc 100644 --- a/bastion/src/labd/src/routes/bastions.ts +++ b/bastion/src/labd/src/routes/bastions.ts @@ -174,9 +174,10 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void // Queue debug/rescue mode — route to correct bastion by MAC app.post<{ - Body: { mac?: string }; + Body: { mac?: string; sshd?: boolean }; }>("/api/machines/debug", async (request, reply) => { const mac = (request.body?.mac ?? "").toLowerCase().replace(/-/g, ":"); + const sshd = request.body?.sshd ?? false; if (!mac) { return reply.code(400).send({ error: "mac is required" }); } @@ -189,7 +190,7 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void } if (all.length === 1) { try { - const result = await sendCommand(all[0]!.bastionId, { type: "command-debug", mac }); + const result = await sendCommand(all[0]!.bastionId, { type: "command-debug", mac, sshd }); return reply.code(result.status === "ok" ? 200 : 500).send(result); } catch (err) { return reply.code(500).send({ error: err instanceof Error ? err.message : String(err) }); @@ -199,7 +200,7 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void } try { - const result = await sendCommand(bastion.bastionId, { type: "command-debug", mac }); + const result = await sendCommand(bastion.bastionId, { type: "command-debug", mac, sshd }); return reply.code(result.status === "ok" ? 200 : 500).send(result); } catch (err) { return reply.code(500).send({ error: err instanceof Error ? 
err.message : String(err) }); diff --git a/bastion/src/shared/src/protocol/index.ts b/bastion/src/shared/src/protocol/index.ts index 231d84d..88dffbd 100644 --- a/bastion/src/shared/src/protocol/index.ts +++ b/bastion/src/shared/src/protocol/index.ts @@ -111,7 +111,7 @@ export type LabdBastionMessage = | { type: "command-install"; requestId: string; mac: string; hostname: string; disk?: string; role: string; os: string } | { type: "command-forget"; requestId: string; mac: string } | { type: "command-role-update"; requestId: string; mac: string; role: string } - | { type: "command-debug"; requestId: string; mac: string } + | { type: "command-debug"; requestId: string; mac: string; sshd?: boolean } | { type: "server-shutdown"; reconnectAfter: number }; export type BastionMessageType = BastionMessage["type"]; diff --git a/bastion/src/shared/src/types/state.ts b/bastion/src/shared/src/types/state.ts index 382d7d5..a569cfa 100644 --- a/bastion/src/shared/src/types/state.ts +++ b/bastion/src/shared/src/types/state.ts @@ -101,6 +101,7 @@ export interface InstalledInfo { export interface DebugConfig { hostname: string; queued_at: string; + sshd?: boolean; } export interface BastionState { From 92c65b46725a407ad3d6942e7fe9a5a0e358b4ca Mon Sep 17 00:00:00 2001 From: Michal Date: Sun, 29 Mar 2026 23:59:38 +0100 Subject: [PATCH 13/16] fix: generic rescue instructions in debug command output Co-Authored-By: Claude Opus 4.6 (1M context) --- bastion/src/cli/src/commands/debug.ts | 33 ++++++++++++--------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/bastion/src/cli/src/commands/debug.ts b/bastion/src/cli/src/commands/debug.ts index 49e2847..aa3ccd7 100644 --- a/bastion/src/cli/src/commands/debug.ts +++ b/bastion/src/cli/src/commands/debug.ts @@ -117,38 +117,33 @@ export function registerDebugCommand(parent: Command): void { } } + const sshdNote = opts.sshd + ? `\nSSH + nc listener will start automatically. Watch bastion logs for the IP callback. 
+ Password: debug | nc 2323 for raw shell\n` + : ""; + console.log(` Debug mode queued for ${hostname} (${mac}). Reboot the machine to enter Fedora rescue mode. - +${sshdNote} Once in rescue shell: - # Activate LVM - vgchange -ay labvg - - # Mount root + other volumes + # Activate LVM and mount installed system + vgchange -ay mkdir -p /mnt/sysroot - mount /dev/labvg/root /mnt/sysroot - cat /mnt/sysroot/etc/fstab # check what else to mount - mount /dev/labvg/var /mnt/sysroot/var - mount /dev/labvg/home /mnt/sysroot/home + mount /dev//root /mnt/sysroot + cat /mnt/sysroot/etc/fstab + mount /dev//var /mnt/sysroot/var + mount /dev//home /mnt/sysroot/home - # Boot the installed system in a container + # Boot installed system in a container /mnt/sysroot/usr/bin/systemd-nspawn -D /mnt/sysroot --boot - # Or just chroot for quick fixes + # Or chroot for quick fixes mount --bind /dev /mnt/sysroot/dev mount --bind /proc /mnt/sysroot/proc mount --bind /sys /mnt/sysroot/sys chroot /mnt/sysroot - - # Check initramfs size - ls -lh /mnt/sysroot/boot/initramfs-*.img - - # Rebuild initramfs without amdgpu - chroot /mnt/sysroot - echo 'omit_drivers+=" amdgpu "' > /etc/dracut.conf.d/omit-amdgpu.conf - dracut -f --regenerate-all `); }); } From 8da947a1c3606a786c879860c806c1b28f76fc5f Mon Sep 17 00:00:00 2001 From: Michal Date: Mon, 30 Mar 2026 00:25:19 +0100 Subject: [PATCH 14/16] fix: use %pre instead of %post for debug --sshd (rescue mode skips %post) Co-Authored-By: Claude Opus 4.6 (1M context) --- bastion/completions/labctl.bash | 2 +- bastion/completions/labctl.fish | 3 +++ bastion/src/bastion/src/templates/debug.ks.ts | 11 ++++++++--- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/bastion/completions/labctl.bash b/bastion/completions/labctl.bash index 21f615e..9f7b49b 100644 --- a/bastion/completions/labctl.bash +++ b/bastion/completions/labctl.bash @@ -62,7 +62,7 @@ _labctl() { COMPREPLY=($(compgen -W "--role --os --disk -h --help" -- "$cur")) return ;; "provision 
debug") - COMPREPLY=($(compgen -W "-h --help" -- "$cur")) + COMPREPLY=($(compgen -W "--sshd -h --help" -- "$cur")) return ;; "provision forget") COMPREPLY=($(compgen -W "-h --help" -- "$cur")) diff --git a/bastion/completions/labctl.fish b/bastion/completions/labctl.fish index a1d4aab..3a14103 100644 --- a/bastion/completions/labctl.fish +++ b/bastion/completions/labctl.fish @@ -137,6 +137,9 @@ complete -c labctl -n "__labctl_in_cmd provision reprovision" -l role -d 'Machin complete -c labctl -n "__labctl_in_cmd provision reprovision" -l os -d 'Operating system' -xa 'fedora-43 ubuntu-26.04' complete -c labctl -n "__labctl_in_cmd provision reprovision" -l disk -d 'Target disk device (auto-detect if omitted)' -x +# provision debug options +complete -c labctl -n "__labctl_in_cmd provision debug" -l sshd -d 'Start SSH + nc listener automatically, report IP to bastion' + # provision makeiso options complete -c labctl -n "__labctl_in_cmd provision makeiso" -l arch -d 'Target architecture(s)' -xa 'x86_64 aarch64' complete -c labctl -n "__labctl_in_cmd provision makeiso" -l local -d 'Build ISO locally instead of using bastion-hosted URL' diff --git a/bastion/src/bastion/src/templates/debug.ks.ts b/bastion/src/bastion/src/templates/debug.ks.ts index 27b0b4b..083cc19 100644 --- a/bastion/src/bastion/src/templates/debug.ks.ts +++ b/bastion/src/bastion/src/templates/debug.ks.ts @@ -17,10 +17,17 @@ export function renderDebugKickstart(params: DebugKickstartParams): string { : ""; const sshdSetup = params.sshd ? 
` -%post --nochroot --log=/tmp/debug-sshd.log +%pre --log=/tmp/debug-sshd.log #!/bin/bash set -x +# Wait for network to come up +for i in $(seq 1 30); do + IP_ADDR=$(ip -4 addr show | awk '/inet / && !/127.0.0/ {split($2,a,"/"); print a[1]; exit}') + [ -n "$IP_ADDR" ] && break + sleep 1 +done + # Generate host keys (self-contained, no mounted FS needed) ssh-keygen -t ed25519 -f /tmp/ssh_host_ed25519_key -N "" -q ssh-keygen -t rsa -f /tmp/ssh_host_rsa_key -N "" -q @@ -52,8 +59,6 @@ echo "sshd started on port 22" echo "nc shell listener on port 2323" # Report IP to bastion -sleep 2 -IP_ADDR=$(ip -4 addr show | awk '/inet / && !/127.0.0/ {split($2,a,"/"); print a[1]; exit}') MAC_ADDR=$(ip link show | awk '/ether/ && !/00:00:00:00/ {print $2; exit}') curl -sf -X POST "http://${params.serverIp}:${params.httpPort}/api/progress" \\ -H "Content-Type: application/json" \\ From a4a48409301dcead2daa409450a1c23d112e06f2 Mon Sep 17 00:00:00 2001 From: Michal Date: Mon, 30 Mar 2026 00:49:44 +0100 Subject: [PATCH 15/16] feat: debug --pxe-boot flag, boot installed system via PXE Loads kernel+initrd from bastion HTTP server, mounts root from local NVMe. Workaround for UEFI firmware bugs that make local disk boot 100x slower. One-time use, auto-clears after boot. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- bastion/src/bastion/src/main.ts | 3 +- bastion/src/bastion/src/routes/api.ts | 5 ++-- bastion/src/bastion/src/routes/dispatch.ts | 29 +++++++++++++------ .../src/bastion/src/templates/boot.ipxe.ts | 28 ++++++++++++++++++ bastion/src/cli/src/api/client.ts | 4 +-- bastion/src/cli/src/commands/debug.ts | 5 ++-- bastion/src/labd/src/routes/bastions.ts | 7 +++-- bastion/src/shared/src/protocol/index.ts | 2 +- bastion/src/shared/src/types/state.ts | 1 + 9 files changed, 64 insertions(+), 20 deletions(-) diff --git a/bastion/src/bastion/src/main.ts b/bastion/src/bastion/src/main.ts index fe3a3ac..289551e 100644 --- a/bastion/src/bastion/src/main.ts +++ b/bastion/src/bastion/src/main.ts @@ -270,6 +270,7 @@ export async function startBastion(overrides: Partial = {}): Prom if (msg.type !== "command-debug") throw new Error("unexpected"); const mac = msg.mac.toLowerCase(); const sshd = msg.sshd ?? false; + const pxeBoot = msg.pxeBoot ?? false; const currentState = state.load(); const hostname = currentState.installed[mac]?.hostname ?? @@ -277,7 +278,7 @@ export async function startBastion(overrides: Partial = {}): Prom currentState.discovered[mac]?.product ?? mac; state.update((s) => { - s.debug[mac] = { hostname, queued_at: new Date().toISOString(), sshd }; + s.debug[mac] = { hostname, queued_at: new Date().toISOString(), sshd, pxeBoot }; }); return { status: "ok", data: { mac, hostname } }; }); diff --git a/bastion/src/bastion/src/routes/api.ts b/bastion/src/bastion/src/routes/api.ts index 5718357..5b9fe83 100644 --- a/bastion/src/bastion/src/routes/api.ts +++ b/bastion/src/bastion/src/routes/api.ts @@ -191,10 +191,11 @@ export function registerApiRoutes( // Queue debug/rescue mode for a machine app.post<{ - Body: { mac?: string; sshd?: boolean }; + Body: { mac?: string; sshd?: boolean; pxeBoot?: boolean }; }>("/api/debug", async (request, reply) => { const mac = (request.body?.mac ?? 
"").toLowerCase().replace(/-/g, ":"); const sshd = request.body?.sshd ?? false; + const pxeBoot = request.body?.pxeBoot ?? false; if (mac === "") { return reply.status(400).send({ error: "mac is required" }); } @@ -208,7 +209,7 @@ export function registerApiRoutes( mac; state.update((s) => { - s.debug[mac] = { hostname, queued_at: new Date().toISOString(), sshd }; + s.debug[mac] = { hostname, queued_at: new Date().toISOString(), sshd, pxeBoot }; }); logger.info(`DEBUG QUEUED: ${mac} -> ${hostname}`); diff --git a/bastion/src/bastion/src/routes/dispatch.ts b/bastion/src/bastion/src/routes/dispatch.ts index 5361d0f..1954d4d 100644 --- a/bastion/src/bastion/src/routes/dispatch.ts +++ b/bastion/src/bastion/src/routes/dispatch.ts @@ -11,6 +11,7 @@ import { renderDiscoverIpxe, renderInstallIpxe, renderDebugIpxe, + renderPxeBootDebugIpxe, renderLocalBootIpxe, } from "../templates/boot.ipxe.js"; import { renderUbuntuInstallIpxe } from "../templates/ubuntu-boot.ipxe.js"; @@ -45,17 +46,27 @@ export function registerDispatchRoutes( const debugEntry = currentState.debug[mac]; if (debugEntry) { const hostname = debugEntry.hostname ?? 
"debug"; - logger.info(`DEBUG BOOT: ${mac} -> ${hostname} (rescue mode)`); - state.update((s) => { delete s.debug[mac]; }); - const script = renderDebugIpxe({ - mac, - hostname, - serverIp: config.serverIp, - httpPort: config.httpPort, - fedoraMirror: config.fedoraMirror, - }); + let script: string; + if (debugEntry.pxeBoot) { + logger.info(`PXE BOOT DEBUG: ${mac} -> ${hostname} (kernel+initrd from PXE, root from NVMe)`); + script = renderPxeBootDebugIpxe({ + mac, + hostname, + serverIp: config.serverIp, + httpPort: config.httpPort, + }); + } else { + logger.info(`DEBUG BOOT: ${mac} -> ${hostname} (rescue mode)`); + script = renderDebugIpxe({ + mac, + hostname, + serverIp: config.serverIp, + httpPort: config.httpPort, + fedoraMirror: config.fedoraMirror, + }); + } return reply.type("text/plain").send(script); } diff --git a/bastion/src/bastion/src/templates/boot.ipxe.ts b/bastion/src/bastion/src/templates/boot.ipxe.ts index 826633f..f56e815 100644 --- a/bastion/src/bastion/src/templates/boot.ipxe.ts +++ b/bastion/src/bastion/src/templates/boot.ipxe.ts @@ -102,6 +102,34 @@ boot `; } +/** + * iPXE script for PXE-boot debug mode -- boots the installed system's root + * filesystem using the bastion's PXE kernel+initrd instead of local GRUB. + * Workaround for UEFI firmware bugs that make local disk boot slow. 
+ */ +export function renderPxeBootDebugIpxe(params: { + mac: string; + hostname: string; + serverIp: string; + httpPort: number; +}): string { + return `#!ipxe + +echo +echo ============================================= +echo Lab PXE Bastion - PXE BOOT (debug) +echo Target: ${params.hostname} +echo MAC: ${params.mac} +echo Kernel+initrd from PXE, root from NVMe +echo ============================================= +echo + +kernel http://${params.serverIp}:${params.httpPort}/vmlinuz root=/dev/mapper/labvg-root ro rd.lvm.lv=labvg/root rd.lvm.lv=labvg/swap console=tty0 console=ttyS0,115200n8 modprobe.blacklist=amdgpu +initrd http://${params.serverIp}:${params.httpPort}/initrd.img +boot +`; +} + /** * iPXE script for already-installed machines -- exits to boot from local disk. */ diff --git a/bastion/src/cli/src/api/client.ts b/bastion/src/cli/src/api/client.ts index c7bfaa0..75380a4 100644 --- a/bastion/src/cli/src/api/client.ts +++ b/bastion/src/cli/src/api/client.ts @@ -94,8 +94,8 @@ export class LabdClient { return this.request("POST", "/api/machines/install", { body: opts }); } - async debugMachine(mac: string, opts?: { sshd?: boolean }): Promise<{ status: string; data?: { mac: string; hostname: string }; error?: string }> { - return this.request("POST", "/api/machines/debug", { body: { mac, sshd: opts?.sshd } }); + async debugMachine(mac: string, opts?: { sshd?: boolean; pxeBoot?: boolean }): Promise<{ status: string; data?: { mac: string; hostname: string }; error?: string }> { + return this.request("POST", "/api/machines/debug", { body: { mac, sshd: opts?.sshd, pxeBoot: opts?.pxeBoot } }); } async forgetMachine(mac: string): Promise<{ status: string }> { diff --git a/bastion/src/cli/src/commands/debug.ts b/bastion/src/cli/src/commands/debug.ts index aa3ccd7..45a13a7 100644 --- a/bastion/src/cli/src/commands/debug.ts +++ b/bastion/src/cli/src/commands/debug.ts @@ -49,8 +49,9 @@ export function registerDebugCommand(parent: Command): void { .command("debug ") 
.description("PXE boot into Fedora rescue mode for debugging (target: hostname, MAC, or IP)") .option("--sshd", "Start SSH + nc listener automatically, report IP to bastion") + .option("--pxe-boot", "Boot installed system via PXE (kernel+initrd from network, root from NVMe)") .showHelpAfterError(true) - .action(async (target: string, opts: { sshd?: boolean }) => { + .action(async (target: string, opts: { sshd?: boolean; pxeBoot?: boolean }) => { const client = getLabdClient(); // Resolve target from labd aggregated state @@ -74,7 +75,7 @@ export function registerDebugCommand(parent: Command): void { console.log(`Queuing debug mode for ${hostname} (${mac})...`); try { - const result = await client.debugMachine(mac, { sshd: opts.sshd === true }); + const result = await client.debugMachine(mac, { sshd: opts.sshd === true, pxeBoot: opts.pxeBoot === true }); if (result.error) { console.error(`Failed: ${result.error}`); process.exit(1); diff --git a/bastion/src/labd/src/routes/bastions.ts b/bastion/src/labd/src/routes/bastions.ts index ea694cc..218805e 100644 --- a/bastion/src/labd/src/routes/bastions.ts +++ b/bastion/src/labd/src/routes/bastions.ts @@ -174,10 +174,11 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void // Queue debug/rescue mode — route to correct bastion by MAC app.post<{ - Body: { mac?: string; sshd?: boolean }; + Body: { mac?: string; sshd?: boolean; pxeBoot?: boolean }; }>("/api/machines/debug", async (request, reply) => { const mac = (request.body?.mac ?? "").toLowerCase().replace(/-/g, ":"); const sshd = request.body?.sshd ?? false; + const pxeBoot = request.body?.pxeBoot ?? 
false; if (!mac) { return reply.code(400).send({ error: "mac is required" }); } @@ -190,7 +191,7 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void } if (all.length === 1) { try { - const result = await sendCommand(all[0]!.bastionId, { type: "command-debug", mac, sshd }); + const result = await sendCommand(all[0]!.bastionId, { type: "command-debug", mac, sshd, pxeBoot }); return reply.code(result.status === "ok" ? 200 : 500).send(result); } catch (err) { return reply.code(500).send({ error: err instanceof Error ? err.message : String(err) }); @@ -200,7 +201,7 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void } try { - const result = await sendCommand(bastion.bastionId, { type: "command-debug", mac, sshd }); + const result = await sendCommand(bastion.bastionId, { type: "command-debug", mac, sshd, pxeBoot }); return reply.code(result.status === "ok" ? 200 : 500).send(result); } catch (err) { return reply.code(500).send({ error: err instanceof Error ? 
err.message : String(err) }); diff --git a/bastion/src/shared/src/protocol/index.ts b/bastion/src/shared/src/protocol/index.ts index 88dffbd..3b5054e 100644 --- a/bastion/src/shared/src/protocol/index.ts +++ b/bastion/src/shared/src/protocol/index.ts @@ -111,7 +111,7 @@ export type LabdBastionMessage = | { type: "command-install"; requestId: string; mac: string; hostname: string; disk?: string; role: string; os: string } | { type: "command-forget"; requestId: string; mac: string } | { type: "command-role-update"; requestId: string; mac: string; role: string } - | { type: "command-debug"; requestId: string; mac: string; sshd?: boolean } + | { type: "command-debug"; requestId: string; mac: string; sshd?: boolean; pxeBoot?: boolean } | { type: "server-shutdown"; reconnectAfter: number }; export type BastionMessageType = BastionMessage["type"]; diff --git a/bastion/src/shared/src/types/state.ts b/bastion/src/shared/src/types/state.ts index a569cfa..eaadf1f 100644 --- a/bastion/src/shared/src/types/state.ts +++ b/bastion/src/shared/src/types/state.ts @@ -102,6 +102,7 @@ export interface DebugConfig { hostname: string; queued_at: string; sshd?: boolean; + pxeBoot?: boolean; } export interface BastionState { From 0a4916d3c9ce9984efbd735ed4be6a09b7d92a07 Mon Sep 17 00:00:00 2001 From: Michal Date: Mon, 30 Mar 2026 03:58:51 +0100 Subject: [PATCH 16/16] fix: remove serial console (root cause of 30s boot delay), enable syslog logging, disk auto-detect MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause found: console=ttyS0,115200n8 causes 30-second timeout at every systemd boot phase on hardware without a physical serial UART. Each phase transition blocks waiting for the non-existent UART. 
Changes: - Remove console=ttyS0 from kickstart bootloader args and %post setup - Enable Anaconda syslog forwarding (logging --host --port) for install visibility - Improve syslog IP→MAC resolution (register from kickstart fetch + progress) - Fix disk auto-detect: default to empty string (not /dev/sda) for NVMe support - Enable SysRq magic keys (kernel.sysrq=1) for emergency reboot via JetKVM - Simplify debug command: remove --sshd flag (inst.sshd always available), add /debug-setup.sh HTTP endpoint for nc listener setup - Add labctl provision logs -f (follow mode with polling) - Add syslog listener unit tests - Enable syslog log capture test in integration suite Co-Authored-By: Claude Opus 4.6 (1M context) --- bastion/completions/labctl.bash | 4 +- bastion/completions/labctl.fish | 5 +- bastion/src/bastion/src/main.ts | 5 +- bastion/src/bastion/src/routes/api.ts | 12 +- bastion/src/bastion/src/routes/dispatch.ts | 37 +++++- bastion/src/bastion/src/routes/kickstart.ts | 7 + bastion/src/bastion/src/server.ts | 4 +- .../bastion/src/services/syslog-listener.ts | 15 ++- .../src/bastion/src/templates/boot.ipxe.ts | 2 +- bastion/src/bastion/src/templates/debug.ks.ts | 68 ++-------- .../src/bastion/src/templates/install.ks.ts | 20 +-- bastion/src/bastion/tests/dispatch.test.ts | 1 + bastion/src/bastion/tests/kickstart.test.ts | 6 +- .../src/bastion/tests/syslog-listener.test.ts | 121 ++++++++++++++++++ bastion/src/cli/src/api/client.ts | 4 +- bastion/src/cli/src/commands/debug.ts | 21 +-- bastion/src/cli/src/commands/logs.ts | 77 +++++++++-- bastion/src/labd/src/routes/bastions.ts | 11 +- bastion/src/shared/src/protocol/index.ts | 2 +- bastion/src/shared/src/types/state.ts | 1 - .../tests/integration/pxe-provision.test.ts | 11 +- 21 files changed, 305 insertions(+), 129 deletions(-) create mode 100644 bastion/src/bastion/tests/syslog-listener.test.ts diff --git a/bastion/completions/labctl.bash b/bastion/completions/labctl.bash index 9f7b49b..86a58fc 100644 --- 
a/bastion/completions/labctl.bash +++ b/bastion/completions/labctl.bash @@ -62,13 +62,13 @@ _labctl() { COMPREPLY=($(compgen -W "--role --os --disk -h --help" -- "$cur")) return ;; "provision debug") - COMPREPLY=($(compgen -W "--sshd -h --help" -- "$cur")) + COMPREPLY=($(compgen -W "--pxe-boot -h --help" -- "$cur")) return ;; "provision forget") COMPREPLY=($(compgen -W "-h --help" -- "$cur")) return ;; "provision logs") - COMPREPLY=($(compgen -W "-h --help" -- "$cur")) + COMPREPLY=($(compgen -W "-f --follow -h --help" -- "$cur")) return ;; "provision makeiso") COMPREPLY=($(compgen -W "--arch --local --out -h --help" -- "$cur")) diff --git a/bastion/completions/labctl.fish b/bastion/completions/labctl.fish index 3a14103..a63ae32 100644 --- a/bastion/completions/labctl.fish +++ b/bastion/completions/labctl.fish @@ -138,7 +138,10 @@ complete -c labctl -n "__labctl_in_cmd provision reprovision" -l os -d 'Operatin complete -c labctl -n "__labctl_in_cmd provision reprovision" -l disk -d 'Target disk device (auto-detect if omitted)' -x # provision debug options -complete -c labctl -n "__labctl_in_cmd provision debug" -l sshd -d 'Start SSH + nc listener automatically, report IP to bastion' +complete -c labctl -n "__labctl_in_cmd provision debug" -l pxe-boot -d 'Boot installed system via PXE (kernel+initrd from network, root from NVMe)' + +# provision logs options +complete -c labctl -n "__labctl_in_cmd provision logs" -s f -l follow -d 'Follow log output in real-time' # provision makeiso options complete -c labctl -n "__labctl_in_cmd provision makeiso" -l arch -d 'Target architecture(s)' -xa 'x86_64 aarch64' diff --git a/bastion/src/bastion/src/main.ts b/bastion/src/bastion/src/main.ts index 289551e..8c6d066 100644 --- a/bastion/src/bastion/src/main.ts +++ b/bastion/src/bastion/src/main.ts @@ -257,7 +257,7 @@ export async function startBastion(overrides: Partial = {}): Prom state.update((s) => { s.install_queue[msg.mac] = { hostname: msg.hostname, - disk: msg.disk ?? 
"/dev/sda", + disk: msg.disk ?? "", role: msg.role as import("@lab/shared").Role, os: msg.os as import("@lab/shared").OsId, queued_at: new Date().toISOString(), @@ -269,7 +269,6 @@ export async function startBastion(overrides: Partial = {}): Prom labdConn.onCommand("command-debug", async (msg) => { if (msg.type !== "command-debug") throw new Error("unexpected"); const mac = msg.mac.toLowerCase(); - const sshd = msg.sshd ?? false; const pxeBoot = msg.pxeBoot ?? false; const currentState = state.load(); const hostname = @@ -278,7 +277,7 @@ export async function startBastion(overrides: Partial = {}): Prom currentState.discovered[mac]?.product ?? mac; state.update((s) => { - s.debug[mac] = { hostname, queued_at: new Date().toISOString(), sshd, pxeBoot }; + s.debug[mac] = { hostname, queued_at: new Date().toISOString(), pxeBoot }; }); return { status: "ok", data: { mac, hostname } }; }); diff --git a/bastion/src/bastion/src/routes/api.ts b/bastion/src/bastion/src/routes/api.ts index 5b9fe83..b178b43 100644 --- a/bastion/src/bastion/src/routes/api.ts +++ b/bastion/src/bastion/src/routes/api.ts @@ -13,11 +13,13 @@ import { triggerPostProvisionK3s } from "../services/post-provision.js"; import { progressBus } from "../services/progress-events.js"; import type { ProgressEvent } from "../services/progress-events.js"; import type { InstallLogBuffer } from "../services/install-log.js"; +import type { SyslogListener } from "../services/syslog-listener.js"; export function registerApiRoutes( app: FastifyInstance, state: StateManager, installLog: InstallLogBuffer, + syslog: SyslogListener, ): void { // List all machines app.get("/api/machines", async (_request, reply) => { @@ -84,6 +86,11 @@ export function registerApiRoutes( const { mac: rawMac, stage, detail } = request.body ?? {}; const mac = (rawMac ?? "unknown").toLowerCase(); const stageName = stage ?? 
"unknown"; + + // Register IP → MAC for syslog routing + if (mac !== "unknown") { + syslog.registerIp(request.ip, mac); + } const detailStr = detail ?? ""; const GREEN = "\x1b[0;32m"; @@ -191,10 +198,9 @@ export function registerApiRoutes( // Queue debug/rescue mode for a machine app.post<{ - Body: { mac?: string; sshd?: boolean; pxeBoot?: boolean }; + Body: { mac?: string; pxeBoot?: boolean }; }>("/api/debug", async (request, reply) => { const mac = (request.body?.mac ?? "").toLowerCase().replace(/-/g, ":"); - const sshd = request.body?.sshd ?? false; const pxeBoot = request.body?.pxeBoot ?? false; if (mac === "") { return reply.status(400).send({ error: "mac is required" }); @@ -209,7 +215,7 @@ export function registerApiRoutes( mac; state.update((s) => { - s.debug[mac] = { hostname, queued_at: new Date().toISOString(), sshd, pxeBoot }; + s.debug[mac] = { hostname, queued_at: new Date().toISOString(), pxeBoot }; }); logger.info(`DEBUG QUEUED: ${mac} -> ${hostname}`); diff --git a/bastion/src/bastion/src/routes/dispatch.ts b/bastion/src/bastion/src/routes/dispatch.ts index 1954d4d..0ecc1c4 100644 --- a/bastion/src/bastion/src/routes/dispatch.ts +++ b/bastion/src/bastion/src/routes/dispatch.ts @@ -23,21 +23,44 @@ export function registerDispatchRoutes( config: BastionConfig, state: StateManager, ): void { - // Serve debug/rescue kickstart (minimal: SSH keys + network) - app.get<{ Querystring: { mac?: string; sshd?: string } }>("/debug.ks", async (request, reply) => { - const mac = (request.query.mac ?? "").toLowerCase().replace(/-/g, ":"); - const currentState = state.load(); - const wantSshd = request.query.sshd === "1" || currentState.debug[mac]?.sshd === true; - + // Serve debug/rescue kickstart (minimal: SSH keys + network for inst.sshd) + app.get<{ Querystring: { mac?: string } }>("/debug.ks", async (_request, reply) => { const ks = renderDebugKickstart({ sshKeys: config.sshKeys ?? 
[], - sshd: wantSshd, serverIp: config.serverIp, httpPort: config.httpPort, }); return reply.type("text/plain").send(ks); }); + // Shell script for manual debug setup (nc listener + IP reporting) + // Usage from rescue shell: curl http://bastion:port/debug-setup.sh | bash + app.get("/debug-setup.sh", async (_request, reply) => { + const script = `#!/bin/bash +# Lab Bastion debug setup — run from rescue shell +set -x + +IP_ADDR=$(ip -4 addr show | awk '/inet / && !/127.0.0/ {split($2,a,"/"); print a[1]; exit}') +MAC_ADDR=$(ip link show | awk '/ether/ && !/00:00:00:00/ {print $2; exit}') + +# Start persistent nc listener for remote shell +(while true; do nc -l -p 2323 -e /bin/bash 2>/dev/null; done) & +echo "nc shell listener on port 2323" + +# Report IP to bastion +curl -sf -X POST "http://${config.serverIp}:${config.httpPort}/api/progress" \\ + -H "Content-Type: application/json" \\ + -d "{\\"mac\\":\\"$MAC_ADDR\\",\\"stage\\":\\"debug-ready\\",\\"detail\\":\\"nc $IP_ADDR 2323\\"}" 2>/dev/null || true + +echo "" +echo "=== Debug environment ready ===" +echo " nc $IP_ADDR 2323 (remote shell)" +echo " ssh root@$IP_ADDR (password: debug)" +echo "===============================" +`; + return reply.type("text/plain").send(script); + }); + app.get<{ Querystring: { mac?: string } }>("/dispatch", async (request, reply) => { const mac = (request.query.mac ?? 
"").toLowerCase().replace(/-/g, ":"); const currentState = state.load(); diff --git a/bastion/src/bastion/src/routes/kickstart.ts b/bastion/src/bastion/src/routes/kickstart.ts index bce0e04..49ca90a 100644 --- a/bastion/src/bastion/src/routes/kickstart.ts +++ b/bastion/src/bastion/src/routes/kickstart.ts @@ -5,6 +5,7 @@ import type { FastifyInstance } from "fastify"; import type { BastionConfig } from "@lab/shared"; import type { StateManager } from "../services/state.js"; +import type { SyslogListener } from "../services/syslog-listener.js"; import { generateInstallKickstart, generateDiscoverKickstart } from "../services/kickstart-generator.js"; import { renderUbuntuAutoinstall, renderUbuntuMetaData, type UbuntuAutoinstallParams } from "../templates/ubuntu-autoinstall.js"; @@ -12,6 +13,7 @@ export function registerKickstartRoutes( app: FastifyInstance, config: BastionConfig, state: StateManager, + syslog: SyslogListener, ): void { // Per-MAC install kickstart app.get<{ Querystring: { mac?: string } }>("/ks", async (request, reply) => { @@ -19,6 +21,11 @@ export function registerKickstartRoutes( const currentState = state.load(); const queueEntry = currentState.install_queue[mac]; + // Register IP → MAC so syslog listener can route Anaconda logs + if (mac) { + syslog.registerIp(request.ip, mac); + } + const ks = generateInstallKickstart(config, { hostname: queueEntry?.hostname ?? "lab-node", disk: queueEntry?.disk ?? "", diff --git a/bastion/src/bastion/src/server.ts b/bastion/src/bastion/src/server.ts index 8bdaf6d..9a2979a 100644 --- a/bastion/src/bastion/src/server.ts +++ b/bastion/src/bastion/src/server.ts @@ -43,8 +43,8 @@ export function createApp(config: BastionConfig): { app: ReturnType(); constructor(port: number, installLog: InstallLogBuffer, state: StateManager) { this.port = port; @@ -37,14 +39,21 @@ export class SyslogListener { this.state = state; } - /** Resolve a source IP to a MAC address using the install queue. 
*/ + /** Register an IP → MAC mapping (called when we learn a machine's IP). */ + registerIp(ip: string, mac: string): void { + this.ipToMac.set(ip, mac.toLowerCase()); + } + + /** Resolve a source IP to a MAC address. */ private resolveIpToMac(ip: string): string | null { + // Check explicit mapping first (most reliable) + const explicit = this.ipToMac.get(ip); + if (explicit) return explicit; + const currentState = this.state.load(); // Check install queue — machines being installed have an IP from DHCP for (const [mac, entry] of Object.entries(currentState.install_queue)) { - // The progress callback sends IP in "complete" detail, but during install - // we need to match by what we know. Check if any progress mentions this IP. if (entry.progress_detail?.includes(ip)) return mac; } diff --git a/bastion/src/bastion/src/templates/boot.ipxe.ts b/bastion/src/bastion/src/templates/boot.ipxe.ts index f56e815..95f36d2 100644 --- a/bastion/src/bastion/src/templates/boot.ipxe.ts +++ b/bastion/src/bastion/src/templates/boot.ipxe.ts @@ -124,7 +124,7 @@ echo Kernel+initrd from PXE, root from NVMe echo ============================================= echo -kernel http://${params.serverIp}:${params.httpPort}/vmlinuz root=/dev/mapper/labvg-root ro rd.lvm.lv=labvg/root rd.lvm.lv=labvg/swap console=tty0 console=ttyS0,115200n8 modprobe.blacklist=amdgpu +kernel http://${params.serverIp}:${params.httpPort}/vmlinuz root=/dev/mapper/labvg-root ro rd.lvm.lv=labvg/root rd.lvm.lv=labvg/swap console=tty0 initrd http://${params.serverIp}:${params.httpPort}/initrd.img boot `; diff --git a/bastion/src/bastion/src/templates/debug.ks.ts b/bastion/src/bastion/src/templates/debug.ks.ts index 083cc19..29a7e35 100644 --- a/bastion/src/bastion/src/templates/debug.ks.ts +++ b/bastion/src/bastion/src/templates/debug.ks.ts @@ -1,81 +1,33 @@ // Debug/rescue kickstart template. // Minimal kickstart for Anaconda rescue mode. -// When sshd=true: generates host keys, starts sshd, reports IP to bastion. 
-// No dependency on mounted filesystems — fully self-contained. +// +// SSH access: Anaconda's inst.sshd starts sshd automatically. +// The sshpw directive sets the password, sshkey adds authorized keys. +// %pre/%post do NOT run in rescue mode — don't put setup code there. export interface DebugKickstartParams { sshKeys: string[]; - sshd?: boolean; serverIp?: string; httpPort?: number; } export function renderDebugKickstart(params: DebugKickstartParams): string { - const sshpw = "sshpw --username=root --plaintext lab-root-pw"; const sshkeyLine = params.sshKeys.length > 0 ? `sshkey --username=root "${params.sshKeys[0]}"` : ""; - const sshdSetup = params.sshd ? ` -%pre --log=/tmp/debug-sshd.log -#!/bin/bash -set -x - -# Wait for network to come up -for i in $(seq 1 30); do - IP_ADDR=$(ip -4 addr show | awk '/inet / && !/127.0.0/ {split($2,a,"/"); print a[1]; exit}') - [ -n "$IP_ADDR" ] && break - sleep 1 -done - -# Generate host keys (self-contained, no mounted FS needed) -ssh-keygen -t ed25519 -f /tmp/ssh_host_ed25519_key -N "" -q -ssh-keygen -t rsa -f /tmp/ssh_host_rsa_key -N "" -q - -# Write minimal sshd config -cat > /tmp/sshd_config << 'SSHCFG' -HostKey /tmp/ssh_host_ed25519_key -HostKey /tmp/ssh_host_rsa_key -PermitRootLogin yes -PasswordAuthentication yes -PubkeyAuthentication yes -AuthorizedKeysFile /root/.ssh/authorized_keys -SSHCFG - -# Set root password for SSH access -echo "root:debug" | chpasswd - -# Set up SSH authorized keys -mkdir -p /root/.ssh && chmod 700 /root/.ssh -${params.sshKeys.map(k => `echo '${k}' >> /root/.ssh/authorized_keys`).join("\n")} -chmod 600 /root/.ssh/authorized_keys 2>/dev/null || true - -# Start sshd -/usr/sbin/sshd -f /tmp/sshd_config -p 22 -echo "sshd started on port 22" - -# Start persistent nc listener for remote shell -(while true; do nc -l -p 2323 -e /bin/bash 2>/dev/null; done) & -echo "nc shell listener on port 2323" - -# Report IP to bastion -MAC_ADDR=$(ip link show | awk '/ether/ && !/00:00:00:00/ {print $2; exit}') 
-curl -sf -X POST "http://${params.serverIp}:${params.httpPort}/api/progress" \\ - -H "Content-Type: application/json" \\ - -d "{\\"mac\\":\\"$MAC_ADDR\\",\\"stage\\":\\"debug-ready\\",\\"detail\\":\\"ssh root@$IP_ADDR (pw: debug) | nc $IP_ADDR 2323\\"}" 2>/dev/null || true - -echo "Debug environment ready: ssh root@$IP_ADDR or nc $IP_ADDR 2323" -%end -` : ""; - return `# Lab Bastion -- Debug/Rescue Kickstart # Minimal: SSH + network for Anaconda rescue mode +# +# SSH is started by Anaconda (inst.sshd kernel param). +# Password: debug | SSH keys from bastion config. +# %pre/%post do NOT run in rescue mode. lang en_US.UTF-8 keyboard uk network --bootproto=dhcp --activate -${sshpw} +sshpw --username=root --plaintext debug ${sshkeyLine} -${sshdSetup}`; +`; } diff --git a/bastion/src/bastion/src/templates/install.ks.ts b/bastion/src/bastion/src/templates/install.ks.ts index cf5ef73..94af999 100644 --- a/bastion/src/bastion/src/templates/install.ks.ts +++ b/bastion/src/bastion/src/templates/install.ks.ts @@ -134,10 +134,9 @@ network --bootproto=dhcp --activate --hostname=${fqdn} ${auth} ${userDirective} -bootloader --append="console=tty0 console=ttyS0,115200n8" +bootloader --append="console=tty0" -# logging --host=${serverIp} --port=${syslogPort} -# Disabled: syslog UDP port needs to be exposed in k3s service/hostPort first +logging --host=${serverIp} --port=${syslogPort} url --mirrorlist=https://mirrors.fedoraproject.org/mirrorlist?repo=fedora-$releasever&arch=$basearch @@ -342,17 +341,7 @@ echo "tmpfs /tmp tmpfs defaults,noatime,nosuid,nodev,size=4G 0 0" >> /etc/fstab ${isVanilla ? 
`# -- vanilla role: skip k3s kernel/sysctl/firewall setup -- # -- Enable chronyd for time sync -- -systemctl enable chronyd || true - -# -- Serial console (for debugging — auto-login as root on ttyS0) -- -# AWS EC2 compatible: ttyS0 @ 115200n8 -systemctl enable serial-getty@ttyS0.service || true - -# -- Forward all system logs to serial console -- -cat > /etc/rsyslog.d/serial-console.conf << 'RSYSLOG' -*.* /dev/ttyS0 -RSYSLOG -systemctl enable rsyslog || true` : `# -- Kernel modules for k3s -- +systemctl enable chronyd || true` : `# -- Kernel modules for k3s -- cat > /etc/modules-load.d/k3s.conf << 'MODULES' br_netfilter overlay @@ -396,6 +385,9 @@ fi bastion_progress "post-install" "3-bootorder done" +# -- Enable SysRq magic keys (for emergency reboot via Alt+SysRq+REISUB) -- +echo "kernel.sysrq=1" > /etc/sysctl.d/90-sysrq.conf + # -- Provisioning metadata -- cat > /etc/lab-provisioned << PROVEOF hostname: ${fqdn} diff --git a/bastion/src/bastion/tests/dispatch.test.ts b/bastion/src/bastion/tests/dispatch.test.ts index 0b9572b..3d07ac4 100644 --- a/bastion/src/bastion/tests/dispatch.test.ts +++ b/bastion/src/bastion/tests/dispatch.test.ts @@ -28,6 +28,7 @@ function createTestConfig(testDir: string): BastionConfig { gateway: "10.0.0.1", sshKeys: ["ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAITEST test@test"], adminUser: "testadmin", + syslogPort: 15514, skipDnsmasq: true, skipArtifacts: true, fedoraMirror: "https://download.fedoraproject.org/pub/fedora/linux/releases/43/Everything/x86_64/os", diff --git a/bastion/src/bastion/tests/kickstart.test.ts b/bastion/src/bastion/tests/kickstart.test.ts index 2629877..771f5d1 100644 --- a/bastion/src/bastion/tests/kickstart.test.ts +++ b/bastion/src/bastion/tests/kickstart.test.ts @@ -206,10 +206,8 @@ describe("renderInstallKickstart", () => { } }); - it("forwards system logs to serial console", () => { + it("does not include serial console (causes 30s boot timeout on hardware without UART)", () => { const ks = 
renderInstallKickstart(baseParams({ role: "vanilla" })); - expect(ks).toContain("serial-console.conf"); - expect(ks).toContain("/dev/ttyS0"); - expect(ks).toContain("rsyslog"); + expect(ks).not.toContain("ttyS0"); }); }); diff --git a/bastion/src/bastion/tests/syslog-listener.test.ts b/bastion/src/bastion/tests/syslog-listener.test.ts new file mode 100644 index 0000000..2ece0d5 --- /dev/null +++ b/bastion/src/bastion/tests/syslog-listener.test.ts @@ -0,0 +1,121 @@ +import { describe, it, expect, beforeEach, afterEach } from "vitest"; +import { createSocket } from "node:dgram"; +import { mkdtempSync, rmSync } from "node:fs"; +import { join } from "node:path"; +import { tmpdir } from "node:os"; +import { SyslogListener } from "../src/services/syslog-listener.js"; +import { InstallLogBuffer } from "../src/services/install-log.js"; +import { StateManager } from "../src/services/state.js"; + +function sendUdpSyslog(port: number, message: string): Promise { + return new Promise((resolve, reject) => { + const client = createSocket("udp4"); + const buf = Buffer.from(message); + client.send(buf, 0, buf.length, port, "127.0.0.1", (err) => { + client.close(); + if (err) reject(err); + else resolve(); + }); + }); +} + +describe("SyslogListener", () => { + let tmpDir: string; + let state: StateManager; + let installLog: InstallLogBuffer; + let syslog: SyslogListener; + const PORT = 15514; // use non-privileged port for testing + + beforeEach(() => { + tmpDir = mkdtempSync(join(tmpdir(), "syslog-test-")); + state = new StateManager(join(tmpDir, "state.json")); + state.init(); + installLog = new InstallLogBuffer(tmpDir); + syslog = new SyslogListener(PORT, installLog, state); + syslog.start(); + }); + + afterEach(() => { + syslog.stop(); + rmSync(tmpDir, { recursive: true, force: true }); + }); + + it("receives and stores syslog messages for registered IP", async () => { + const mac = "aa:bb:cc:dd:ee:ff"; + // Queue a machine so hostname can be resolved + state.update((s) => { + 
s.install_queue[mac] = { + hostname: "testnode", + disk: "/dev/sda", + role: "worker", + os: "fedora-43", + queued_at: new Date().toISOString(), + }; + }); + + // Register IP → MAC mapping + syslog.registerIp("127.0.0.1", mac); + + // Send a syslog message (RFC 3164 format) + await sendUdpSyslog(PORT, "<13>Mar 30 01:30:00 localhost anaconda[1234]: Installing package vim-enhanced"); + + // Wait for UDP delivery + await new Promise((r) => setTimeout(r, 200)); + + const lines = installLog.getLines(mac); + expect(lines.length).toBeGreaterThan(0); + expect(lines[0]!.line).toContain("anaconda"); + expect(lines[0]!.line).toContain("Installing package vim-enhanced"); + }); + + it("ignores messages from unknown IPs", async () => { + // Don't register any IP mapping + await sendUdpSyslog(PORT, "<13>Mar 30 01:30:00 localhost anaconda[1234]: test message"); + await new Promise((r) => setTimeout(r, 200)); + + // No MAC to check, but the listener should not crash + // and no logs should be stored for any MAC + expect(installLog.lineCount("unknown")).toBe(0); + }); + + it("resolves IP from installed machines state", async () => { + const mac = "11:22:33:44:55:66"; + state.update((s) => { + s.installed[mac] = { + hostname: "installed-node", + role: "worker", + ip: "127.0.0.1", + installed_at: new Date().toISOString(), + }; + }); + + await sendUdpSyslog(PORT, "<14>Mar 30 02:00:00 installed-node sshd[5678]: Accepted publickey for root"); + await new Promise((r) => setTimeout(r, 200)); + + const lines = installLog.getLines(mac); + expect(lines.length).toBeGreaterThan(0); + expect(lines[0]!.line).toContain("sshd"); + }); + + it("parses various syslog formats", async () => { + const mac = "aa:bb:cc:dd:ee:ff"; + syslog.registerIp("127.0.0.1", mac); + state.update((s) => { + s.install_queue[mac] = { + hostname: "testnode", + disk: "/dev/sda", + role: "worker", + os: "fedora-43", + queued_at: new Date().toISOString(), + }; + }); + + // Message without PID + await sendUdpSyslog(PORT, 
"<13>Mar 30 01:30:00 localhost kernel: NVMe device ready"); + await new Promise((r) => setTimeout(r, 200)); + + const lines = installLog.getLines(mac); + expect(lines.length).toBeGreaterThan(0); + expect(lines[0]!.line).toContain("kernel"); + }); +}); diff --git a/bastion/src/cli/src/api/client.ts b/bastion/src/cli/src/api/client.ts index 75380a4..241c848 100644 --- a/bastion/src/cli/src/api/client.ts +++ b/bastion/src/cli/src/api/client.ts @@ -94,8 +94,8 @@ export class LabdClient { return this.request("POST", "/api/machines/install", { body: opts }); } - async debugMachine(mac: string, opts?: { sshd?: boolean; pxeBoot?: boolean }): Promise<{ status: string; data?: { mac: string; hostname: string }; error?: string }> { - return this.request("POST", "/api/machines/debug", { body: { mac, sshd: opts?.sshd, pxeBoot: opts?.pxeBoot } }); + async debugMachine(mac: string, opts?: { pxeBoot?: boolean }): Promise<{ status: string; data?: { mac: string; hostname: string }; error?: string }> { + return this.request("POST", "/api/machines/debug", { body: { mac, pxeBoot: opts?.pxeBoot } }); } async forgetMachine(mac: string): Promise<{ status: string }> { diff --git a/bastion/src/cli/src/commands/debug.ts b/bastion/src/cli/src/commands/debug.ts index 45a13a7..2bb4f24 100644 --- a/bastion/src/cli/src/commands/debug.ts +++ b/bastion/src/cli/src/commands/debug.ts @@ -48,10 +48,9 @@ export function registerDebugCommand(parent: Command): void { parent .command("debug ") .description("PXE boot into Fedora rescue mode for debugging (target: hostname, MAC, or IP)") - .option("--sshd", "Start SSH + nc listener automatically, report IP to bastion") .option("--pxe-boot", "Boot installed system via PXE (kernel+initrd from network, root from NVMe)") .showHelpAfterError(true) - .action(async (target: string, opts: { sshd?: boolean; pxeBoot?: boolean }) => { + .action(async (target: string, opts: { pxeBoot?: boolean }) => { const client = getLabdClient(); // Resolve target from labd 
aggregated state @@ -75,7 +74,7 @@ export function registerDebugCommand(parent: Command): void { console.log(`Queuing debug mode for ${hostname} (${mac})...`); try { - const result = await client.debugMachine(mac, { sshd: opts.sshd === true, pxeBoot: opts.pxeBoot === true }); + const result = await client.debugMachine(mac, { pxeBoot: opts.pxeBoot === true }); if (result.error) { console.error(`Failed: ${result.error}`); process.exit(1); @@ -118,15 +117,21 @@ export function registerDebugCommand(parent: Command): void { } } - const sshdNote = opts.sshd - ? `\nSSH + nc listener will start automatically. Watch bastion logs for the IP callback. - Password: debug | nc 2323 for raw shell\n` - : ""; + // Determine bastion URL from labd config for the setup script URL + const bastionUrl = process.env["LABD_URL"] + ? process.env["LABD_URL"].replace(/\/ws\/bastion$/, "").replace(/^wss?:/, "http:") + : "http://:8080"; console.log(` Debug mode queued for ${hostname} (${mac}). Reboot the machine to enter Fedora rescue mode. 
-${sshdNote} + +SSH access (started by Anaconda): + ssh root@ (password: debug) + +For nc remote shell, run from rescue shell: + curl ${bastionUrl}/debug-setup.sh | bash + Once in rescue shell: # Activate LVM and mount installed system diff --git a/bastion/src/cli/src/commands/logs.ts b/bastion/src/cli/src/commands/logs.ts index 85a59c1..48630a6 100644 --- a/bastion/src/cli/src/commands/logs.ts +++ b/bastion/src/cli/src/commands/logs.ts @@ -39,19 +39,25 @@ export function registerLogsCommand(parent: Command): void { parent .command("logs ") .description("Show provisioning logs for a machine (hostname, MAC, or IP)") - .action(async (target: string) => { + .option("-f, --follow", "Follow log output in real-time") + .action(async (target: string, opts: { follow?: boolean }) => { const mac = await resolveToMac(target); + const BOLD = "\x1b[1m"; + const GREEN = "\x1b[32m"; + const YELLOW = "\x1b[33m"; + const RED = "\x1b[31m"; + const DIM = "\x1b[2m"; + const RESET = "\x1b[0m"; + + if (opts.follow) { + await followLogs(mac, { BOLD, GREEN, YELLOW, RED, DIM, RESET }); + return; + } + try { const data = await getLabdClient().getMachineLogs(mac); - const BOLD = "\x1b[1m"; - const GREEN = "\x1b[32m"; - const YELLOW = "\x1b[33m"; - const RED = "\x1b[31m"; - const DIM = "\x1b[2m"; - const RESET = "\x1b[0m"; - console.log(`${BOLD}${data["hostname"]}${RESET} (${mac})`); console.log(` Status: ${data["status"] === "installed" ? GREEN : YELLOW}${data["status"]}${RESET}`); console.log(` Role: ${data["role"]}`); @@ -83,3 +89,58 @@ export function registerLogsCommand(parent: Command): void { } }); } + +/** Follow logs by polling labd. 
*/ +async function followLogs( + mac: string, + colors: { BOLD: string; GREEN: string; YELLOW: string; RED: string; DIM: string; RESET: string }, +): Promise { + const { BOLD, GREEN, YELLOW, RED, DIM, RESET } = colors; + const client = getLabdClient(); + + console.log(`${DIM}Following logs for ${mac} (Ctrl+C to stop)${RESET}`); + console.log(""); + + let lastStageCount = 0; + let lastStatus = ""; + + while (true) { + try { + const data = await client.getMachineLogs(mac); + const status = String(data["status"] ?? ""); + const log = data["log"] as Array<{ stage: string; detail: string; timestamp: string }> | undefined; + + // Print header once or on status change + if (status !== lastStatus) { + const hostname = String(data["hostname"] ?? mac); + const statusColor = status === "installed" ? GREEN : YELLOW; + console.log(` ${BOLD}${hostname}${RESET} ${statusColor}${status}${RESET}`); + lastStatus = status; + } + + // Print new stages + if (log && log.length > lastStageCount) { + for (let i = lastStageCount; i < log.length; i++) { + const entry = log[i]!; + const time = entry.timestamp.slice(11, 19); + const color = entry.stage === "complete" ? GREEN : entry.stage === "error" ? RED : YELLOW; + const detail = entry.detail ? ` ${DIM}-- ${entry.detail}${RESET}` : ""; + console.log(` ${DIM}${time}${RESET} ${color}${entry.stage}${RESET}${detail}`); + } + lastStageCount = log.length; + } + + // Done + if (status === "installed") { + const ip = data["ip"] ?? ""; + console.log(""); + console.log(` ${GREEN}${BOLD}Install complete!${RESET}${ip ? 
` ${DIM}ssh lab@${ip}${RESET}` : ""}`); + process.exit(0); + } + } catch { + // Machine may not be in logs yet (still queued) + } + + await new Promise((r) => setTimeout(r, 5000)); + } +} diff --git a/bastion/src/labd/src/routes/bastions.ts b/bastion/src/labd/src/routes/bastions.ts index 218805e..9c8e181 100644 --- a/bastion/src/labd/src/routes/bastions.ts +++ b/bastion/src/labd/src/routes/bastions.ts @@ -151,7 +151,7 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void try { const result = await sendCommand(all[0]!.bastionId, { type: "command-install", - mac, hostname, disk: disk ?? "/dev/sda", role: role ?? "infra", os: os ?? "fedora-43", + mac, hostname, disk: disk ?? "", role: role ?? "infra", os: os ?? "fedora-43", }); return reply.code(result.status === "ok" ? 200 : 500).send(result); } catch (err) { @@ -164,7 +164,7 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void try { const result = await sendCommand(bastion.bastionId, { type: "command-install", - mac, hostname, disk: disk ?? "/dev/sda", role: role ?? "infra", os: os ?? "fedora-43", + mac, hostname, disk: disk ?? "", role: role ?? "infra", os: os ?? "fedora-43", }); return reply.code(result.status === "ok" ? 200 : 500).send(result); } catch (err) { @@ -174,10 +174,9 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void // Queue debug/rescue mode — route to correct bastion by MAC app.post<{ - Body: { mac?: string; sshd?: boolean; pxeBoot?: boolean }; + Body: { mac?: string; pxeBoot?: boolean }; }>("/api/machines/debug", async (request, reply) => { const mac = (request.body?.mac ?? "").toLowerCase().replace(/-/g, ":"); - const sshd = request.body?.sshd ?? false; const pxeBoot = request.body?.pxeBoot ?? 
false; if (!mac) { return reply.code(400).send({ error: "mac is required" }); @@ -191,7 +190,7 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void } if (all.length === 1) { try { - const result = await sendCommand(all[0]!.bastionId, { type: "command-debug", mac, sshd, pxeBoot }); + const result = await sendCommand(all[0]!.bastionId, { type: "command-debug", mac, pxeBoot }); return reply.code(result.status === "ok" ? 200 : 500).send(result); } catch (err) { return reply.code(500).send({ error: err instanceof Error ? err.message : String(err) }); @@ -201,7 +200,7 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void } try { - const result = await sendCommand(bastion.bastionId, { type: "command-debug", mac, sshd, pxeBoot }); + const result = await sendCommand(bastion.bastionId, { type: "command-debug", mac, pxeBoot }); return reply.code(result.status === "ok" ? 200 : 500).send(result); } catch (err) { return reply.code(500).send({ error: err instanceof Error ? 
err.message : String(err) }); diff --git a/bastion/src/shared/src/protocol/index.ts b/bastion/src/shared/src/protocol/index.ts index 3b5054e..e2bdd1c 100644 --- a/bastion/src/shared/src/protocol/index.ts +++ b/bastion/src/shared/src/protocol/index.ts @@ -111,7 +111,7 @@ export type LabdBastionMessage = | { type: "command-install"; requestId: string; mac: string; hostname: string; disk?: string; role: string; os: string } | { type: "command-forget"; requestId: string; mac: string } | { type: "command-role-update"; requestId: string; mac: string; role: string } - | { type: "command-debug"; requestId: string; mac: string; sshd?: boolean; pxeBoot?: boolean } + | { type: "command-debug"; requestId: string; mac: string; pxeBoot?: boolean } | { type: "server-shutdown"; reconnectAfter: number }; export type BastionMessageType = BastionMessage["type"]; diff --git a/bastion/src/shared/src/types/state.ts b/bastion/src/shared/src/types/state.ts index eaadf1f..689f09a 100644 --- a/bastion/src/shared/src/types/state.ts +++ b/bastion/src/shared/src/types/state.ts @@ -101,7 +101,6 @@ export interface InstalledInfo { export interface DebugConfig { hostname: string; queued_at: string; - sshd?: boolean; pxeBoot?: boolean; } diff --git a/bastion/tests/integration/pxe-provision.test.ts b/bastion/tests/integration/pxe-provision.test.ts index 2d1f20b..9e5deb7 100644 --- a/bastion/tests/integration/pxe-provision.test.ts +++ b/bastion/tests/integration/pxe-provision.test.ts @@ -224,11 +224,12 @@ describe("PXE boot provisioning", () => { // Generate dnsmasq config generateDnsmasqConf(config); - // Start HTTP server - const { app, state } = createApp(config); + // Start HTTP server + syslog listener + const { app, state, syslog } = createApp(config); bastionApp = app; await app.listen({ port: config.httpPort, host: "0.0.0.0" }); - log(`Bastion HTTP server listening on :${HTTP_PORT}`); + syslog.start(); + log(`Bastion HTTP server listening on :${HTTP_PORT}, syslog on UDP 
:${config.syslogPort}`); // Start dnsmasq (fire-and-forget — it runs until killed) // May fail without root (DHCP socket needs CAP_NET_BIND_SERVICE); libvirt network provides DHCP fallback @@ -387,8 +388,8 @@ describe("PXE boot provisioning", () => { expect(data.progress).toBe("complete"); }); - it.skip("log lines were captured", async () => { - // Requires log streamer in %post — skipped until re-added + it("syslog install logs were captured", async () => { + // Anaconda forwards logs via syslog (logging --host directive in kickstart) const res = await fetch(`http://${BASTION_IP}:${HTTP_PORT}/api/logs/${encodeURIComponent(vmMac)}`); const data = (await res.json()) as { log_total?: number; log_lines?: Array<{ line: string }> }; expect(data.log_total).toBeGreaterThan(0);