Compare commits
9 Commits
816736793d
...
docs/pxe-b
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
87c1a34232 | ||
| 84afe7d5e4 | |||
|
|
0a4916d3c9 | ||
|
|
a4a4840930 | ||
|
|
8da947a1c3 | ||
|
|
92c65b4672 | ||
|
|
3835fefba1 | ||
| d4e9101bb6 | |||
| 014e8a6e72 |
@@ -61,11 +61,14 @@ _labctl() {
|
|||||||
"provision reprovision")
|
"provision reprovision")
|
||||||
COMPREPLY=($(compgen -W "--role --os --disk -h --help" -- "$cur"))
|
COMPREPLY=($(compgen -W "--role --os --disk -h --help" -- "$cur"))
|
||||||
return ;;
|
return ;;
|
||||||
|
"provision debug")
|
||||||
|
COMPREPLY=($(compgen -W "--pxe-boot -h --help" -- "$cur"))
|
||||||
|
return ;;
|
||||||
"provision forget")
|
"provision forget")
|
||||||
COMPREPLY=($(compgen -W "-h --help" -- "$cur"))
|
COMPREPLY=($(compgen -W "-h --help" -- "$cur"))
|
||||||
return ;;
|
return ;;
|
||||||
"provision logs")
|
"provision logs")
|
||||||
COMPREPLY=($(compgen -W "-h --help" -- "$cur"))
|
COMPREPLY=($(compgen -W "-f --follow -h --help" -- "$cur"))
|
||||||
return ;;
|
return ;;
|
||||||
"provision makeiso")
|
"provision makeiso")
|
||||||
COMPREPLY=($(compgen -W "--arch --local --out -h --help" -- "$cur"))
|
COMPREPLY=($(compgen -W "--arch --local --out -h --help" -- "$cur"))
|
||||||
@@ -95,7 +98,7 @@ _labctl() {
|
|||||||
COMPREPLY=($(compgen -W "bastion -h --help" -- "$cur"))
|
COMPREPLY=($(compgen -W "bastion -h --help" -- "$cur"))
|
||||||
return ;;
|
return ;;
|
||||||
"provision")
|
"provision")
|
||||||
COMPREPLY=($(compgen -W "list install reprovision forget logs makeiso -h --help" -- "$cur"))
|
COMPREPLY=($(compgen -W "list install reprovision debug forget logs makeiso -h --help" -- "$cur"))
|
||||||
return ;;
|
return ;;
|
||||||
"config")
|
"config")
|
||||||
COMPREPLY=($(compgen -W "list get set path -h --help" -- "$cur"))
|
COMPREPLY=($(compgen -W "list get set path -h --help" -- "$cur"))
|
||||||
|
|||||||
@@ -122,6 +122,7 @@ complete -c labctl -n "__labctl_in_cmd init bastion standalone stop" -l dir -d '
|
|||||||
complete -c labctl -n "__labctl_using_cmd provision" -a list -d 'List all known machines'
|
complete -c labctl -n "__labctl_using_cmd provision" -a list -d 'List all known machines'
|
||||||
complete -c labctl -n "__labctl_using_cmd provision" -a install -d 'Queue a discovered machine for OS installation'
|
complete -c labctl -n "__labctl_using_cmd provision" -a install -d 'Queue a discovered machine for OS installation'
|
||||||
complete -c labctl -n "__labctl_using_cmd provision" -a reprovision -d 'Queue install + SSH reboot into PXE (target: hostname, MAC, or IP)'
|
complete -c labctl -n "__labctl_using_cmd provision" -a reprovision -d 'Queue install + SSH reboot into PXE (target: hostname, MAC, or IP)'
|
||||||
|
complete -c labctl -n "__labctl_using_cmd provision" -a debug -d 'PXE boot into Fedora rescue mode for debugging (target: hostname, MAC, or IP)'
|
||||||
complete -c labctl -n "__labctl_using_cmd provision" -a forget -d 'Remove a machine from bastion state'
|
complete -c labctl -n "__labctl_using_cmd provision" -a forget -d 'Remove a machine from bastion state'
|
||||||
complete -c labctl -n "__labctl_using_cmd provision" -a logs -d 'Show provisioning logs for a machine (hostname, MAC, or IP)'
|
complete -c labctl -n "__labctl_using_cmd provision" -a logs -d 'Show provisioning logs for a machine (hostname, MAC, or IP)'
|
||||||
complete -c labctl -n "__labctl_using_cmd provision" -a makeiso -d 'Generate a UEFI-bootable iPXE ISO for network provisioning'
|
complete -c labctl -n "__labctl_using_cmd provision" -a makeiso -d 'Generate a UEFI-bootable iPXE ISO for network provisioning'
|
||||||
@@ -136,6 +137,12 @@ complete -c labctl -n "__labctl_in_cmd provision reprovision" -l role -d 'Machin
|
|||||||
complete -c labctl -n "__labctl_in_cmd provision reprovision" -l os -d 'Operating system' -xa 'fedora-43 ubuntu-26.04'
|
complete -c labctl -n "__labctl_in_cmd provision reprovision" -l os -d 'Operating system' -xa 'fedora-43 ubuntu-26.04'
|
||||||
complete -c labctl -n "__labctl_in_cmd provision reprovision" -l disk -d 'Target disk device (auto-detect if omitted)' -x
|
complete -c labctl -n "__labctl_in_cmd provision reprovision" -l disk -d 'Target disk device (auto-detect if omitted)' -x
|
||||||
|
|
||||||
|
# provision debug options
|
||||||
|
complete -c labctl -n "__labctl_in_cmd provision debug" -l pxe-boot -d 'Boot installed system via PXE (kernel+initrd from network, root from NVMe)'
|
||||||
|
|
||||||
|
# provision logs options
|
||||||
|
complete -c labctl -n "__labctl_in_cmd provision logs" -s f -l follow -d 'Follow log output in real-time'
|
||||||
|
|
||||||
# provision makeiso options
|
# provision makeiso options
|
||||||
complete -c labctl -n "__labctl_in_cmd provision makeiso" -l arch -d 'Target architecture(s)' -xa 'x86_64 aarch64'
|
complete -c labctl -n "__labctl_in_cmd provision makeiso" -l arch -d 'Target architecture(s)' -xa 'x86_64 aarch64'
|
||||||
complete -c labctl -n "__labctl_in_cmd provision makeiso" -l local -d 'Build ISO locally instead of using bastion-hosted URL'
|
complete -c labctl -n "__labctl_in_cmd provision makeiso" -l local -d 'Build ISO locally instead of using bastion-hosted URL'
|
||||||
|
|||||||
103
bastion/docs/kickstart-reference.md
Normal file
103
bastion/docs/kickstart-reference.md
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
# Kickstart Reference — Lessons Learned
|
||||||
|
|
||||||
|
This documents pitfalls discovered during PXE boot testing. Read before modifying
|
||||||
|
the kickstart template (`src/bastion/src/templates/install.ks.ts`).
|
||||||
|
|
||||||
|
## Package requirements
|
||||||
|
|
||||||
|
### `kernel-modules` is mandatory
|
||||||
|
|
||||||
|
`@core` only installs `kernel-modules-core`, which lacks common modules like `vfat`,
|
||||||
|
`zram`, and many network/filesystem drivers. Without `kernel-modules`:
|
||||||
|
|
||||||
|
- `/boot/efi` (FAT32) cannot mount → `systemd-remount-fs` fails → **root stays
|
||||||
|
read-only** → sshd-keygen can't write host keys → SSH unreachable
|
||||||
|
- `zram-generator` fails → can trigger emergency mode
|
||||||
|
|
||||||
|
**Always include `kernel-modules` in %packages.** This matches what the real
|
||||||
|
labmaster (192.168.8.11) has installed.
|
||||||
|
|
||||||
|
This regression was introduced in commit `fac14b6`, which removed `@server-product`
|
||||||
|
(that group pulled in `kernel-modules` via `fedora-release-server`).
|
||||||
|
|
||||||
|
### `dosfstools` is needed
|
||||||
|
|
||||||
|
Provides `mkfs.vfat` and ensures FAT filesystem support is available. The real
|
||||||
|
labmaster has it installed.
|
||||||
|
|
||||||
|
### Verify against the real machine
|
||||||
|
|
||||||
|
Before changing the package list, SSH to the labmaster and compare:
|
||||||
|
```bash
|
||||||
|
ssh 192.168.8.11 "rpm -q <package>"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Anaconda %post execution order
|
||||||
|
|
||||||
|
This is critical and not well documented:
|
||||||
|
|
||||||
|
1. `%pre` scripts run
|
||||||
|
2. Disk partitioning and formatting
|
||||||
|
3. Package installation
|
||||||
|
4. **Anaconda writes system config (fstab, hostname, etc.)**
|
||||||
|
5. `%post` scripts run (in chroot of installed system)
|
||||||
|
6. `%post --nochroot` scripts run
|
||||||
|
7. **Anaconda MAY overwrite fstab again after %post scripts**
|
||||||
|
|
||||||
|
**Consequence:** You cannot reliably modify `/etc/fstab` from `%post` or
|
||||||
|
`%post --nochroot`. Anaconda overwrites it. Tested and confirmed — both
|
||||||
|
`sed` in %post and %post --nochroot had no effect on the final fstab.
|
||||||
|
|
||||||
|
What DOES work from %post:
|
||||||
|
- Writing files to `/etc/` (systemd units, config files, SSH keys)
|
||||||
|
- Enabling/disabling systemd services
|
||||||
|
- Installing additional packages
|
||||||
|
- Running `systemctl enable/mask`
|
||||||
|
|
||||||
|
What does NOT work from %post:
|
||||||
|
- Modifying `/etc/fstab` (Anaconda overwrites it)
|
||||||
|
- `--fsoptions` on `part /boot/efi` (a kickstart command option rather than a %post step, listed here because it also fails: Anaconda ignores it for EFI partitions)
|
||||||
|
|
||||||
|
## UEFI / EFI partition
|
||||||
|
|
||||||
|
- Anaconda always creates an EFI System Partition for UEFI installs
|
||||||
|
- The EFI partition is FAT32 — requires `vfat` kernel module to mount
|
||||||
|
- If `/boot/efi` fails to mount, `systemd-remount-fs` fails, which leaves
|
||||||
|
root as read-only. This cascades to break ALL services that need to write to disk (e.g. sshd-keygen cannot write host keys)
|
||||||
|
- The EFI partition is used by firmware directly for bootloader — the OS
|
||||||
|
doesn't strictly need it mounted, but Anaconda adds it to fstab
|
||||||
|
|
||||||
|
## VM-specific issues (libvirt/QEMU/OVMF)
|
||||||
|
|
||||||
|
### iPXE exit behavior
|
||||||
|
- `exit` (no args) returns EFI_SUCCESS → OVMF retries PXE, never reaches disk
|
||||||
|
- `exit 1` returns EFI_ABORTED → OVMF moves to next boot device (disk)
|
||||||
|
- VM boot order needs both `network` and `hd`: `--boot=uefi,network,hd`
|
||||||
|
|
||||||
|
### nftables
|
||||||
|
- libvirt creates reject rules for NAT networks in table `ip libvirt_network`
|
||||||
|
(NOT `inet libvirt` — this wrong table name cost hours of debugging)
|
||||||
|
- These rules block new host→VM connections (SSH)
|
||||||
|
- Rules are recreated on every `virsh start` — must delete after each VM restart
|
||||||
|
- Chains: `guest_input` and `guest_output`
|
||||||
|
|
||||||
|
### Serial console
|
||||||
|
- VM serial port: `--serial=tcp,host=127.0.0.1:4555,mode=bind,protocol=telnet`
|
||||||
|
- Use `virsh console <vm-name>` for interactive access (handles telnet protocol)
|
||||||
|
- Raw `socat` works for reading but pagers/readline break interactive use
|
||||||
|
- Add `console=ttyS0,115200n8` to kernel args for boot output on serial (VMs only — on hardware without a physical UART this stalls every boot phase by ~30 seconds)
|
||||||
|
|
||||||
|
### SELinux on labmaster
|
||||||
|
- Set to **permissive** — this is for k3s/kubernetes, NOT because SSH needs it
|
||||||
|
- SSH works fine with SELinux enforcing on a properly installed Fedora system
|
||||||
|
- The `ld.so.cache` AVC denials seen during debugging were caused by the
|
||||||
|
read-only root filesystem, not by SELinux policy
|
||||||
|
|
||||||
|
## Testing checklist
|
||||||
|
|
||||||
|
Before merging kickstart changes:
|
||||||
|
1. Check the real labmaster has the same packages: `ssh 192.168.8.11 "rpm -q <pkg>"`
|
||||||
|
2. Run the PXE integration test: `sudo pnpm run test:integration:pxe`
|
||||||
|
3. Verify via serial console (root / `lab-root-pw`) if SSH fails
|
||||||
|
4. Check `mount | grep " / "` — must show `rw`, not `ro`
|
||||||
|
5. Check `systemctl --failed` — no critical failures
|
||||||
91
bastion/docs/pxe-boot-debugging-2026-03-30.md
Normal file
91
bastion/docs/pxe-boot-debugging-2026-03-30.md
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
# PXE Boot Debugging Session — 2026-03-30
|
||||||
|
|
||||||
|
## Problem
|
||||||
|
Beelink SER Mini Pro (AMD Ryzen 7 255, Radeon 780M, 64GB DDR5, 1TB NVMe) boots Fedora 43 100x slower than normal after PXE kickstart install. Every systemd boot phase takes ~30 seconds. The Anaconda installer/rescue mode boots fast on the same hardware.
|
||||||
|
|
||||||
|
## Root Cause
|
||||||
|
**`console=ttyS0,115200n8` in kernel cmdline** — added via kickstart `bootloader --append` during install.
|
||||||
|
|
||||||
|
This mini PC has **no physical serial UART**. When systemd writes to ttyS0, each log write blocks for ~30 seconds waiting for the non-existent UART hardware. Since systemd logs at every phase transition, the total boot time was 10+ minutes.
|
||||||
|
|
||||||
|
The Anaconda installer was unaffected because it uses a different init flow that doesn't go through the same systemd phase transitions.
|
||||||
|
|
||||||
|
## How We Found It
|
||||||
|
Hours of systematic elimination:
|
||||||
|
|
||||||
|
| What we tried | Result | Ruled out |
|
||||||
|
|---|---|---|
|
||||||
|
| `modprobe.blacklist=amdgpu` | No change | GPU driver |
|
||||||
|
| `amd_iommu=off` | No change | IOMMU |
|
||||||
|
| Rebuild initramfs without plymouth/drm/fips | No change | Initramfs bloat |
|
||||||
|
| systemd-boot instead of GRUB | Still slow | Bootloader |
|
||||||
|
| PXE-boot kernel+initrd (skip local GRUB entirely) | Still slow | Local bootloader/firmware |
|
||||||
|
| Disable TPM in BIOS | No change | TPM |
|
||||||
|
| Remove `resume=` + resume dracut module | No change | Hibernate resume |
|
||||||
|
| Manual LVM activation in rescue shell | **Fast** | NVMe/LVM themselves |
|
||||||
|
| Remove `console=ttyS0,115200n8` from GRUB | **FAST BOOT** | **This was it** |
|
||||||
|
|
||||||
|
The key breakthrough was noticing that the timestamps showed **exactly 30-second gaps** between boot phases — a timeout pattern, not general slowness. The next step was realising that the serial console had been added during install and that booting had never been tested without it.
|
||||||
|
|
||||||
|
## What Was Fixed (PR #4, merged)
|
||||||
|
|
||||||
|
### 1. Removed serial console from kickstart
|
||||||
|
- Removed `console=ttyS0,115200n8` from `bootloader --append`
|
||||||
|
- Removed `serial-getty@ttyS0.service` enablement
|
||||||
|
- Removed rsyslog serial forwarding
|
||||||
|
|
||||||
|
### 2. Enabled Anaconda syslog forwarding
|
||||||
|
- Uncommented `logging --host --port` directive in kickstart
|
||||||
|
- Bastion's SyslogListener was already built — just needed IP→MAC resolution improvement
|
||||||
|
- Added `registerIp()` calls from kickstart fetch and progress callbacks
|
||||||
|
- Added syslog listener unit tests
|
||||||
|
|
||||||
|
### 3. Fixed disk auto-detection
|
||||||
|
- Default disk changed from `/dev/sda` to `""` (auto-detect) in labd route and bastion command handler
|
||||||
|
- The kickstart `%pre` auto-detect logic probes nvme0n1, sda, sdb, vda in order
|
||||||
|
- Without this fix, NVMe-only machines (like the SER Mini Pro) fail immediately
|
||||||
|
|
||||||
|
### 4. SysRq magic keys
|
||||||
|
- Added `kernel.sysrq=1` sysctl to kickstart `%post`
|
||||||
|
- Enables Alt+SysRq+REISUB via JetKVM for emergency reboot of stuck machines
|
||||||
|
|
||||||
|
### 5. Simplified debug command
|
||||||
|
- Removed `--sshd` flag (SSH always available via `inst.sshd` + `sshpw` in rescue mode)
|
||||||
|
- Added `/debug-setup.sh` HTTP endpoint for nc listener setup from rescue shell
|
||||||
|
- Cleaned up `sshd` field from DebugConfig, protocol types, all routes
|
||||||
|
|
||||||
|
### 6. Added `labctl provision logs -f`
|
||||||
|
- Follow mode with 5-second polling for real-time install monitoring
|
||||||
|
|
||||||
|
## What Works
|
||||||
|
|
||||||
|
- **PXE discovery → install → boot** — full flow works end-to-end
|
||||||
|
- **Anaconda syslog forwarding** — install logs stream to bastion
|
||||||
|
- **Progress callbacks** — stage-by-stage install tracking via curl
|
||||||
|
- **Auto disk detection** — works for NVMe and SATA
|
||||||
|
- **Debug rescue mode** — `labctl provision debug <target>` boots Anaconda rescue with `inst.sshd` + `sshpw` set (SSH login itself is not yet verified end-to-end — see Known Issues)
|
||||||
|
- **Network-first boot order** — bastion controls every reboot via efibootmgr
|
||||||
|
- **SysRq keys** — emergency reboot via JetKVM keyboard
|
||||||
|
|
||||||
|
## What Doesn't Work / Known Issues
|
||||||
|
|
||||||
|
- **SSH in rescue mode (formerly the `--sshd` flag)** — Anaconda rescue mode skips both `%pre` and `%post` kickstart sections. `inst.sshd` + `sshpw` should provide SSH access, but this hasn't been verified end-to-end yet. The `/debug-setup.sh` curl workaround exists for nc.
|
||||||
|
- **arm64 container build** — iPXE cross-compilation fails on arm64 (GCC flag incompatibility). Workaround: build with `--platforms linux/amd64` only.
|
||||||
|
- **Integration test SSH timeout** — VM boots fine but SSH times out due to libvirt nftables reject rules after VM restart. Test infrastructure issue, not a code bug.
|
||||||
|
|
||||||
|
## What Was Skipped / Left To Do
|
||||||
|
|
||||||
|
1. **Syslog UDP port in k3s** — works because bastion uses `hostNetwork: true`, but should be documented properly
|
||||||
|
2. **Background log streamer** — the old `tail -f` approach broke Anaconda filesystem sync. Replaced with syslog forwarding. If more granular %post logging is needed, a synchronous log push at end of %post would be safe.
|
||||||
|
3. **Per-machine hardware overrides** — turned out not to be needed (serial console was the only "special" setting, and removing it is universal)
|
||||||
|
4. **Ubuntu autoinstall disk default** — `ubuntu-autoinstall.ts` still has `disk || "/dev/sda"` fallback (line 38), should be changed to auto-detect
|
||||||
|
5. **Verify `inst.sshd` works in rescue mode** — test SSH with password "debug" next time debug mode is used
|
||||||
|
6. **Re-enable TPM in BIOS** — it was disabled during debugging and should be restored (the user plans to do this by resetting the BIOS to factory defaults)
|
||||||
|
|
||||||
|
## Key Learnings
|
||||||
|
|
||||||
|
1. **`console=ttyS0` on hardware without UART = 30s timeout per boot phase.** Never add serial console to kernel cmdline unless the hardware has a verified physical UART.
|
||||||
|
2. **Exactly-N-second gaps in boot logs = timeout, not slowness.** Look for the timeout source, not performance issues.
|
||||||
|
3. **Systematic elimination works.** Removing features one at a time found the root cause. But it took hours because the serial console was added early and seemed harmless.
|
||||||
|
4. **Anaconda rescue mode is limited.** It skips `%pre` and `%post`, so you can't automate setup via kickstart. Use `inst.sshd` + `sshpw` for SSH, and serve helper scripts via HTTP for everything else.
|
||||||
|
5. **Default disk paths break NVMe machines.** Always default to auto-detect (empty string) rather than `/dev/sda`.
|
||||||
@@ -257,7 +257,7 @@ export async function startBastion(overrides: Partial<BastionConfig> = {}): Prom
|
|||||||
state.update((s) => {
|
state.update((s) => {
|
||||||
s.install_queue[msg.mac] = {
|
s.install_queue[msg.mac] = {
|
||||||
hostname: msg.hostname,
|
hostname: msg.hostname,
|
||||||
disk: msg.disk ?? "/dev/sda",
|
disk: msg.disk ?? "",
|
||||||
role: msg.role as import("@lab/shared").Role,
|
role: msg.role as import("@lab/shared").Role,
|
||||||
os: msg.os as import("@lab/shared").OsId,
|
os: msg.os as import("@lab/shared").OsId,
|
||||||
queued_at: new Date().toISOString(),
|
queued_at: new Date().toISOString(),
|
||||||
@@ -269,6 +269,7 @@ export async function startBastion(overrides: Partial<BastionConfig> = {}): Prom
|
|||||||
labdConn.onCommand("command-debug", async (msg) => {
|
labdConn.onCommand("command-debug", async (msg) => {
|
||||||
if (msg.type !== "command-debug") throw new Error("unexpected");
|
if (msg.type !== "command-debug") throw new Error("unexpected");
|
||||||
const mac = msg.mac.toLowerCase();
|
const mac = msg.mac.toLowerCase();
|
||||||
|
const pxeBoot = msg.pxeBoot ?? false;
|
||||||
const currentState = state.load();
|
const currentState = state.load();
|
||||||
const hostname =
|
const hostname =
|
||||||
currentState.installed[mac]?.hostname ??
|
currentState.installed[mac]?.hostname ??
|
||||||
@@ -276,7 +277,7 @@ export async function startBastion(overrides: Partial<BastionConfig> = {}): Prom
|
|||||||
currentState.discovered[mac]?.product ??
|
currentState.discovered[mac]?.product ??
|
||||||
mac;
|
mac;
|
||||||
state.update((s) => {
|
state.update((s) => {
|
||||||
s.debug[mac] = { hostname, queued_at: new Date().toISOString() };
|
s.debug[mac] = { hostname, queued_at: new Date().toISOString(), pxeBoot };
|
||||||
});
|
});
|
||||||
return { status: "ok", data: { mac, hostname } };
|
return { status: "ok", data: { mac, hostname } };
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -13,11 +13,13 @@ import { triggerPostProvisionK3s } from "../services/post-provision.js";
|
|||||||
import { progressBus } from "../services/progress-events.js";
|
import { progressBus } from "../services/progress-events.js";
|
||||||
import type { ProgressEvent } from "../services/progress-events.js";
|
import type { ProgressEvent } from "../services/progress-events.js";
|
||||||
import type { InstallLogBuffer } from "../services/install-log.js";
|
import type { InstallLogBuffer } from "../services/install-log.js";
|
||||||
|
import type { SyslogListener } from "../services/syslog-listener.js";
|
||||||
|
|
||||||
export function registerApiRoutes(
|
export function registerApiRoutes(
|
||||||
app: FastifyInstance,
|
app: FastifyInstance,
|
||||||
state: StateManager,
|
state: StateManager,
|
||||||
installLog: InstallLogBuffer,
|
installLog: InstallLogBuffer,
|
||||||
|
syslog: SyslogListener,
|
||||||
): void {
|
): void {
|
||||||
// List all machines
|
// List all machines
|
||||||
app.get("/api/machines", async (_request, reply) => {
|
app.get("/api/machines", async (_request, reply) => {
|
||||||
@@ -84,6 +86,11 @@ export function registerApiRoutes(
|
|||||||
const { mac: rawMac, stage, detail } = request.body ?? {};
|
const { mac: rawMac, stage, detail } = request.body ?? {};
|
||||||
const mac = (rawMac ?? "unknown").toLowerCase();
|
const mac = (rawMac ?? "unknown").toLowerCase();
|
||||||
const stageName = stage ?? "unknown";
|
const stageName = stage ?? "unknown";
|
||||||
|
|
||||||
|
// Register IP → MAC for syslog routing
|
||||||
|
if (mac !== "unknown") {
|
||||||
|
syslog.registerIp(request.ip, mac);
|
||||||
|
}
|
||||||
const detailStr = detail ?? "";
|
const detailStr = detail ?? "";
|
||||||
|
|
||||||
const GREEN = "\x1b[0;32m";
|
const GREEN = "\x1b[0;32m";
|
||||||
@@ -191,9 +198,10 @@ export function registerApiRoutes(
|
|||||||
|
|
||||||
// Queue debug/rescue mode for a machine
|
// Queue debug/rescue mode for a machine
|
||||||
app.post<{
|
app.post<{
|
||||||
Body: { mac?: string };
|
Body: { mac?: string; pxeBoot?: boolean };
|
||||||
}>("/api/debug", async (request, reply) => {
|
}>("/api/debug", async (request, reply) => {
|
||||||
const mac = (request.body?.mac ?? "").toLowerCase().replace(/-/g, ":");
|
const mac = (request.body?.mac ?? "").toLowerCase().replace(/-/g, ":");
|
||||||
|
const pxeBoot = request.body?.pxeBoot ?? false;
|
||||||
if (mac === "") {
|
if (mac === "") {
|
||||||
return reply.status(400).send({ error: "mac is required" });
|
return reply.status(400).send({ error: "mac is required" });
|
||||||
}
|
}
|
||||||
@@ -207,7 +215,7 @@ export function registerApiRoutes(
|
|||||||
mac;
|
mac;
|
||||||
|
|
||||||
state.update((s) => {
|
state.update((s) => {
|
||||||
s.debug[mac] = { hostname, queued_at: new Date().toISOString() };
|
s.debug[mac] = { hostname, queued_at: new Date().toISOString(), pxeBoot };
|
||||||
});
|
});
|
||||||
|
|
||||||
logger.info(`DEBUG QUEUED: ${mac} -> ${hostname}`);
|
logger.info(`DEBUG QUEUED: ${mac} -> ${hostname}`);
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ import {
|
|||||||
renderDiscoverIpxe,
|
renderDiscoverIpxe,
|
||||||
renderInstallIpxe,
|
renderInstallIpxe,
|
||||||
renderDebugIpxe,
|
renderDebugIpxe,
|
||||||
|
renderPxeBootDebugIpxe,
|
||||||
renderLocalBootIpxe,
|
renderLocalBootIpxe,
|
||||||
} from "../templates/boot.ipxe.js";
|
} from "../templates/boot.ipxe.js";
|
||||||
import { renderUbuntuInstallIpxe } from "../templates/ubuntu-boot.ipxe.js";
|
import { renderUbuntuInstallIpxe } from "../templates/ubuntu-boot.ipxe.js";
|
||||||
@@ -22,12 +23,44 @@ export function registerDispatchRoutes(
|
|||||||
config: BastionConfig,
|
config: BastionConfig,
|
||||||
state: StateManager,
|
state: StateManager,
|
||||||
): void {
|
): void {
|
||||||
// Serve debug/rescue kickstart (minimal: SSH keys + network)
|
// Serve debug/rescue kickstart (minimal: SSH keys + network for inst.sshd)
|
||||||
app.get<{ Querystring: { mac?: string } }>("/debug.ks", async (_request, reply) => {
|
app.get<{ Querystring: { mac?: string } }>("/debug.ks", async (_request, reply) => {
|
||||||
const ks = renderDebugKickstart({ sshKeys: config.sshKeys ?? [] });
|
const ks = renderDebugKickstart({
|
||||||
|
sshKeys: config.sshKeys ?? [],
|
||||||
|
serverIp: config.serverIp,
|
||||||
|
httpPort: config.httpPort,
|
||||||
|
});
|
||||||
return reply.type("text/plain").send(ks);
|
return reply.type("text/plain").send(ks);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Shell script for manual debug setup (nc listener + IP reporting)
|
||||||
|
// Usage from rescue shell: curl http://bastion:port/debug-setup.sh | bash
|
||||||
|
app.get("/debug-setup.sh", async (_request, reply) => {
|
||||||
|
const script = `#!/bin/bash
|
||||||
|
# Lab Bastion debug setup — run from rescue shell
|
||||||
|
set -x
|
||||||
|
|
||||||
|
IP_ADDR=$(ip -4 addr show | awk '/inet / && !/127.0.0/ {split($2,a,"/"); print a[1]; exit}')
|
||||||
|
MAC_ADDR=$(ip link show | awk '/ether/ && !/00:00:00:00/ {print $2; exit}')
|
||||||
|
|
||||||
|
# Start persistent nc listener for remote shell
|
||||||
|
(while true; do nc -l -p 2323 -e /bin/bash 2>/dev/null; done) &
|
||||||
|
echo "nc shell listener on port 2323"
|
||||||
|
|
||||||
|
# Report IP to bastion
|
||||||
|
curl -sf -X POST "http://${config.serverIp}:${config.httpPort}/api/progress" \\
|
||||||
|
-H "Content-Type: application/json" \\
|
||||||
|
-d "{\\"mac\\":\\"$MAC_ADDR\\",\\"stage\\":\\"debug-ready\\",\\"detail\\":\\"nc $IP_ADDR 2323\\"}" 2>/dev/null || true
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "=== Debug environment ready ==="
|
||||||
|
echo " nc $IP_ADDR 2323 (remote shell)"
|
||||||
|
echo " ssh root@$IP_ADDR (password: debug)"
|
||||||
|
echo "==============================="
|
||||||
|
`;
|
||||||
|
return reply.type("text/plain").send(script);
|
||||||
|
});
|
||||||
|
|
||||||
app.get<{ Querystring: { mac?: string } }>("/dispatch", async (request, reply) => {
|
app.get<{ Querystring: { mac?: string } }>("/dispatch", async (request, reply) => {
|
||||||
const mac = (request.query.mac ?? "").toLowerCase().replace(/-/g, ":");
|
const mac = (request.query.mac ?? "").toLowerCase().replace(/-/g, ":");
|
||||||
const currentState = state.load();
|
const currentState = state.load();
|
||||||
@@ -36,17 +69,27 @@ export function registerDispatchRoutes(
|
|||||||
const debugEntry = currentState.debug[mac];
|
const debugEntry = currentState.debug[mac];
|
||||||
if (debugEntry) {
|
if (debugEntry) {
|
||||||
const hostname = debugEntry.hostname ?? "debug";
|
const hostname = debugEntry.hostname ?? "debug";
|
||||||
logger.info(`DEBUG BOOT: ${mac} -> ${hostname} (rescue mode)`);
|
|
||||||
|
|
||||||
state.update((s) => { delete s.debug[mac]; });
|
state.update((s) => { delete s.debug[mac]; });
|
||||||
|
|
||||||
const script = renderDebugIpxe({
|
let script: string;
|
||||||
mac,
|
if (debugEntry.pxeBoot) {
|
||||||
hostname,
|
logger.info(`PXE BOOT DEBUG: ${mac} -> ${hostname} (kernel+initrd from PXE, root from NVMe)`);
|
||||||
serverIp: config.serverIp,
|
script = renderPxeBootDebugIpxe({
|
||||||
httpPort: config.httpPort,
|
mac,
|
||||||
fedoraMirror: config.fedoraMirror,
|
hostname,
|
||||||
});
|
serverIp: config.serverIp,
|
||||||
|
httpPort: config.httpPort,
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
logger.info(`DEBUG BOOT: ${mac} -> ${hostname} (rescue mode)`);
|
||||||
|
script = renderDebugIpxe({
|
||||||
|
mac,
|
||||||
|
hostname,
|
||||||
|
serverIp: config.serverIp,
|
||||||
|
httpPort: config.httpPort,
|
||||||
|
fedoraMirror: config.fedoraMirror,
|
||||||
|
});
|
||||||
|
}
|
||||||
return reply.type("text/plain").send(script);
|
return reply.type("text/plain").send(script);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -5,6 +5,7 @@
|
|||||||
import type { FastifyInstance } from "fastify";
|
import type { FastifyInstance } from "fastify";
|
||||||
import type { BastionConfig } from "@lab/shared";
|
import type { BastionConfig } from "@lab/shared";
|
||||||
import type { StateManager } from "../services/state.js";
|
import type { StateManager } from "../services/state.js";
|
||||||
|
import type { SyslogListener } from "../services/syslog-listener.js";
|
||||||
import { generateInstallKickstart, generateDiscoverKickstart } from "../services/kickstart-generator.js";
|
import { generateInstallKickstart, generateDiscoverKickstart } from "../services/kickstart-generator.js";
|
||||||
import { renderUbuntuAutoinstall, renderUbuntuMetaData, type UbuntuAutoinstallParams } from "../templates/ubuntu-autoinstall.js";
|
import { renderUbuntuAutoinstall, renderUbuntuMetaData, type UbuntuAutoinstallParams } from "../templates/ubuntu-autoinstall.js";
|
||||||
|
|
||||||
@@ -12,6 +13,7 @@ export function registerKickstartRoutes(
|
|||||||
app: FastifyInstance,
|
app: FastifyInstance,
|
||||||
config: BastionConfig,
|
config: BastionConfig,
|
||||||
state: StateManager,
|
state: StateManager,
|
||||||
|
syslog: SyslogListener,
|
||||||
): void {
|
): void {
|
||||||
// Per-MAC install kickstart
|
// Per-MAC install kickstart
|
||||||
app.get<{ Querystring: { mac?: string } }>("/ks", async (request, reply) => {
|
app.get<{ Querystring: { mac?: string } }>("/ks", async (request, reply) => {
|
||||||
@@ -19,6 +21,11 @@ export function registerKickstartRoutes(
|
|||||||
const currentState = state.load();
|
const currentState = state.load();
|
||||||
const queueEntry = currentState.install_queue[mac];
|
const queueEntry = currentState.install_queue[mac];
|
||||||
|
|
||||||
|
// Register IP → MAC so syslog listener can route Anaconda logs
|
||||||
|
if (mac) {
|
||||||
|
syslog.registerIp(request.ip, mac);
|
||||||
|
}
|
||||||
|
|
||||||
const ks = generateInstallKickstart(config, {
|
const ks = generateInstallKickstart(config, {
|
||||||
hostname: queueEntry?.hostname ?? "lab-node",
|
hostname: queueEntry?.hostname ?? "lab-node",
|
||||||
disk: queueEntry?.disk ?? "",
|
disk: queueEntry?.disk ?? "",
|
||||||
|
|||||||
@@ -43,8 +43,8 @@ export function createApp(config: BastionConfig): { app: ReturnType<typeof Fasti
|
|||||||
|
|
||||||
// Register route handlers
|
// Register route handlers
|
||||||
registerDispatchRoutes(app, config, state);
|
registerDispatchRoutes(app, config, state);
|
||||||
registerKickstartRoutes(app, config, state);
|
registerKickstartRoutes(app, config, state, syslog);
|
||||||
registerApiRoutes(app, state, installLog);
|
registerApiRoutes(app, state, installLog, syslog);
|
||||||
// boot.iso is generated at startup and served as a static file from httpDir
|
// boot.iso is generated at startup and served as a static file from httpDir
|
||||||
// (static serving supports HTTP Range requests, required by JetKVM streaming)
|
// (static serving supports HTTP Range requests, required by JetKVM streaming)
|
||||||
|
|
||||||
|
|||||||
@@ -30,6 +30,8 @@ export class SyslogListener {
|
|||||||
private port: number;
|
private port: number;
|
||||||
private installLog: InstallLogBuffer;
|
private installLog: InstallLogBuffer;
|
||||||
private state: StateManager;
|
private state: StateManager;
|
||||||
|
/** Explicit IP → MAC mapping registered from kickstart/progress requests. */
|
||||||
|
private ipToMac = new Map<string, string>();
|
||||||
|
|
||||||
constructor(port: number, installLog: InstallLogBuffer, state: StateManager) {
|
constructor(port: number, installLog: InstallLogBuffer, state: StateManager) {
|
||||||
this.port = port;
|
this.port = port;
|
||||||
@@ -37,14 +39,21 @@ export class SyslogListener {
|
|||||||
this.state = state;
|
this.state = state;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Resolve a source IP to a MAC address using the install queue. */
|
/** Register an IP → MAC mapping (called when we learn a machine's IP). */
|
||||||
|
registerIp(ip: string, mac: string): void {
|
||||||
|
this.ipToMac.set(ip, mac.toLowerCase());
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Resolve a source IP to a MAC address. */
|
||||||
private resolveIpToMac(ip: string): string | null {
|
private resolveIpToMac(ip: string): string | null {
|
||||||
|
// Check explicit mapping first (most reliable)
|
||||||
|
const explicit = this.ipToMac.get(ip);
|
||||||
|
if (explicit) return explicit;
|
||||||
|
|
||||||
const currentState = this.state.load();
|
const currentState = this.state.load();
|
||||||
|
|
||||||
// Check install queue — machines being installed have an IP from DHCP
|
// Check install queue — machines being installed have an IP from DHCP
|
||||||
for (const [mac, entry] of Object.entries(currentState.install_queue)) {
|
for (const [mac, entry] of Object.entries(currentState.install_queue)) {
|
||||||
// The progress callback sends IP in "complete" detail, but during install
|
|
||||||
// we need to match by what we know. Check if any progress mentions this IP.
|
|
||||||
if (entry.progress_detail?.includes(ip)) return mac;
|
if (entry.progress_detail?.includes(ip)) return mac;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -102,6 +102,34 @@ boot
|
|||||||
`;
|
`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Build the iPXE script for PXE-boot debug mode.
 *
 * Boots the installed system's root filesystem using the kernel and initrd
 * served by the bastion over HTTP instead of going through local GRUB.
 * Workaround for UEFI firmware bugs that make local disk boot slow.
 *
 * @param params.mac - MAC address shown in the boot banner.
 * @param params.hostname - Hostname shown in the boot banner.
 * @param params.serverIp - Address of the bastion serving `/vmlinuz` and `/initrd.img`.
 * @param params.httpPort - HTTP port of the bastion.
 * @param params.vgName - LVM volume group that holds the `root` and `swap`
 *   logical volumes. Optional; defaults to `labvg`.
 * @returns The iPXE script text, beginning with the `#!ipxe` shebang line.
 */
export function renderPxeBootDebugIpxe(params: {
  mac: string;
  hostname: string;
  serverIp: string;
  httpPort: number;
  vgName?: string;
}): string {
  const vgName = params.vgName ?? "labvg";
  // device-mapper node names double every hyphen inside a VG/LV name
  // (VG "my-vg", LV "root" -> /dev/mapper/my--vg-root).
  const mapperVgName = vgName.replace(/-/g, "--");
  const baseUrl = `http://${params.serverIp}:${params.httpPort}`;

  const kernelArgs = [
    // When iPXE runs under UEFI the kernel's EFI stub looks the initrd up by
    // name, so the name of the image loaded by the `initrd` command below is
    // passed explicitly. The argument is conventionally included for BIOS
    // boots as well.
    "initrd=initrd.img",
    `root=/dev/mapper/${mapperVgName}-root`,
    "ro",
    `rd.lvm.lv=${vgName}/root`,
    `rd.lvm.lv=${vgName}/swap`,
    "console=tty0",
  ].join(" ");

  return `#!ipxe

echo
echo =============================================
echo Lab PXE Bastion - PXE BOOT (debug)
echo Target: ${params.hostname}
echo MAC: ${params.mac}
echo Kernel+initrd from PXE, root from NVMe
echo =============================================
echo

kernel ${baseUrl}/vmlinuz ${kernelArgs}
initrd ${baseUrl}/initrd.img
boot
`;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* iPXE script for already-installed machines -- exits to boot from local disk.
|
* iPXE script for already-installed machines -- exits to boot from local disk.
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -1,25 +1,33 @@
|
|||||||
// Debug/rescue kickstart template.
|
// Debug/rescue kickstart template.
|
||||||
// Minimal: sets SSH access and network for Anaconda rescue mode.
|
// Minimal kickstart for Anaconda rescue mode.
|
||||||
// No disk operations, no packages, no %post.
|
//
|
||||||
|
// SSH access: Anaconda's inst.sshd starts sshd automatically.
|
||||||
|
// The sshpw directive sets the password, sshkey adds authorized keys.
|
||||||
|
// %pre/%post do NOT run in rescue mode — don't put setup code there.
|
||||||
|
|
||||||
export interface DebugKickstartParams {
|
export interface DebugKickstartParams {
|
||||||
sshKeys: string[];
|
sshKeys: string[];
|
||||||
|
serverIp?: string;
|
||||||
|
httpPort?: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
export function renderDebugKickstart(params: DebugKickstartParams): string {
|
export function renderDebugKickstart(params: DebugKickstartParams): string {
|
||||||
const sshpw = "sshpw --username=root --plaintext lab-root-pw";
|
|
||||||
const sshkeyLine = params.sshKeys.length > 0
|
const sshkeyLine = params.sshKeys.length > 0
|
||||||
? `sshkey --username=root "${params.sshKeys[0]}"`
|
? `sshkey --username=root "${params.sshKeys[0]}"`
|
||||||
: "";
|
: "";
|
||||||
|
|
||||||
return `# Lab Bastion -- Debug/Rescue Kickstart
|
return `# Lab Bastion -- Debug/Rescue Kickstart
|
||||||
# Minimal: only SSH + network for Anaconda rescue mode
|
# Minimal: SSH + network for Anaconda rescue mode
|
||||||
|
#
|
||||||
|
# SSH is started by Anaconda (inst.sshd kernel param).
|
||||||
|
# Password: debug | SSH keys from bastion config.
|
||||||
|
# %pre/%post do NOT run in rescue mode.
|
||||||
|
|
||||||
lang en_US.UTF-8
|
lang en_US.UTF-8
|
||||||
keyboard uk
|
keyboard uk
|
||||||
network --bootproto=dhcp --activate
|
network --bootproto=dhcp --activate
|
||||||
|
|
||||||
${sshpw}
|
sshpw --username=root --plaintext debug
|
||||||
${sshkeyLine}
|
${sshkeyLine}
|
||||||
`;
|
`;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -134,10 +134,9 @@ network --bootproto=dhcp --activate --hostname=${fqdn}
|
|||||||
${auth}
|
${auth}
|
||||||
${userDirective}
|
${userDirective}
|
||||||
|
|
||||||
bootloader --append="console=tty0 console=ttyS0,115200n8"
|
bootloader --append="console=tty0"
|
||||||
|
|
||||||
# logging --host=${serverIp} --port=${syslogPort}
|
logging --host=${serverIp} --port=${syslogPort}
|
||||||
# Disabled: syslog UDP port needs to be exposed in k3s service/hostPort first
|
|
||||||
|
|
||||||
url --mirrorlist=https://mirrors.fedoraproject.org/mirrorlist?repo=fedora-$releasever&arch=$basearch
|
url --mirrorlist=https://mirrors.fedoraproject.org/mirrorlist?repo=fedora-$releasever&arch=$basearch
|
||||||
|
|
||||||
@@ -342,17 +341,7 @@ echo "tmpfs /tmp tmpfs defaults,noatime,nosuid,nodev,size=4G 0 0" >> /etc/fstab
|
|||||||
|
|
||||||
${isVanilla ? `# -- vanilla role: skip k3s kernel/sysctl/firewall setup --
|
${isVanilla ? `# -- vanilla role: skip k3s kernel/sysctl/firewall setup --
|
||||||
# -- Enable chronyd for time sync --
|
# -- Enable chronyd for time sync --
|
||||||
systemctl enable chronyd || true
|
systemctl enable chronyd || true` : `# -- Kernel modules for k3s --
|
||||||
|
|
||||||
# -- Serial console (for debugging — auto-login as root on ttyS0) --
|
|
||||||
# AWS EC2 compatible: ttyS0 @ 115200n8
|
|
||||||
systemctl enable serial-getty@ttyS0.service || true
|
|
||||||
|
|
||||||
# -- Forward all system logs to serial console --
|
|
||||||
cat > /etc/rsyslog.d/serial-console.conf << 'RSYSLOG'
|
|
||||||
*.* /dev/ttyS0
|
|
||||||
RSYSLOG
|
|
||||||
systemctl enable rsyslog || true` : `# -- Kernel modules for k3s --
|
|
||||||
cat > /etc/modules-load.d/k3s.conf << 'MODULES'
|
cat > /etc/modules-load.d/k3s.conf << 'MODULES'
|
||||||
br_netfilter
|
br_netfilter
|
||||||
overlay
|
overlay
|
||||||
@@ -396,6 +385,9 @@ fi
|
|||||||
|
|
||||||
bastion_progress "post-install" "3-bootorder done"
|
bastion_progress "post-install" "3-bootorder done"
|
||||||
|
|
||||||
|
# -- Enable SysRq magic keys (for emergency reboot via Alt+SysRq+REISUB) --
|
||||||
|
echo "kernel.sysrq=1" > /etc/sysctl.d/90-sysrq.conf
|
||||||
|
|
||||||
# -- Provisioning metadata --
|
# -- Provisioning metadata --
|
||||||
cat > /etc/lab-provisioned << PROVEOF
|
cat > /etc/lab-provisioned << PROVEOF
|
||||||
hostname: ${fqdn}
|
hostname: ${fqdn}
|
||||||
|
|||||||
@@ -28,6 +28,7 @@ function createTestConfig(testDir: string): BastionConfig {
|
|||||||
gateway: "10.0.0.1",
|
gateway: "10.0.0.1",
|
||||||
sshKeys: ["ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAITEST test@test"],
|
sshKeys: ["ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAITEST test@test"],
|
||||||
adminUser: "testadmin",
|
adminUser: "testadmin",
|
||||||
|
syslogPort: 15514,
|
||||||
skipDnsmasq: true,
|
skipDnsmasq: true,
|
||||||
skipArtifacts: true,
|
skipArtifacts: true,
|
||||||
fedoraMirror: "https://download.fedoraproject.org/pub/fedora/linux/releases/43/Everything/x86_64/os",
|
fedoraMirror: "https://download.fedoraproject.org/pub/fedora/linux/releases/43/Everything/x86_64/os",
|
||||||
|
|||||||
@@ -206,10 +206,8 @@ describe("renderInstallKickstart", () => {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
it("forwards system logs to serial console", () => {
|
it("does not include serial console (causes 30s boot timeout on hardware without UART)", () => {
|
||||||
const ks = renderInstallKickstart(baseParams({ role: "vanilla" }));
|
const ks = renderInstallKickstart(baseParams({ role: "vanilla" }));
|
||||||
expect(ks).toContain("serial-console.conf");
|
expect(ks).not.toContain("ttyS0");
|
||||||
expect(ks).toContain("/dev/ttyS0");
|
|
||||||
expect(ks).toContain("rsyslog");
|
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
121
bastion/src/bastion/tests/syslog-listener.test.ts
Normal file
121
bastion/src/bastion/tests/syslog-listener.test.ts
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
import { describe, it, expect, beforeEach, afterEach } from "vitest";
|
||||||
|
import { createSocket } from "node:dgram";
|
||||||
|
import { mkdtempSync, rmSync } from "node:fs";
|
||||||
|
import { join } from "node:path";
|
||||||
|
import { tmpdir } from "node:os";
|
||||||
|
import { SyslogListener } from "../src/services/syslog-listener.js";
|
||||||
|
import { InstallLogBuffer } from "../src/services/install-log.js";
|
||||||
|
import { StateManager } from "../src/services/state.js";
|
||||||
|
|
||||||
|
/**
 * Send one UDP datagram containing `message` to 127.0.0.1:`port`.
 *
 * The socket is closed as soon as the send callback fires; the returned
 * promise rejects with the send error, if any, and resolves otherwise.
 */
function sendUdpSyslog(port: number, message: string): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    const socket = createSocket("udp4");
    const payload = Buffer.from(message);

    socket.send(payload, 0, payload.length, port, "127.0.0.1", (sendError) => {
      socket.close();
      if (sendError) {
        reject(sendError);
        return;
      }
      resolve();
    });
  });
}
|
||||||
|
|
||||||
|
describe("SyslogListener", () => {
|
||||||
|
let tmpDir: string;
|
||||||
|
let state: StateManager;
|
||||||
|
let installLog: InstallLogBuffer;
|
||||||
|
let syslog: SyslogListener;
|
||||||
|
const PORT = 15514; // use non-privileged port for testing
|
||||||
|
|
||||||
|
beforeEach(() => {
|
||||||
|
tmpDir = mkdtempSync(join(tmpdir(), "syslog-test-"));
|
||||||
|
state = new StateManager(join(tmpDir, "state.json"));
|
||||||
|
state.init();
|
||||||
|
installLog = new InstallLogBuffer(tmpDir);
|
||||||
|
syslog = new SyslogListener(PORT, installLog, state);
|
||||||
|
syslog.start();
|
||||||
|
});
|
||||||
|
|
||||||
|
afterEach(() => {
|
||||||
|
syslog.stop();
|
||||||
|
rmSync(tmpDir, { recursive: true, force: true });
|
||||||
|
});
|
||||||
|
|
||||||
|
it("receives and stores syslog messages for registered IP", async () => {
|
||||||
|
const mac = "aa:bb:cc:dd:ee:ff";
|
||||||
|
// Queue a machine so hostname can be resolved
|
||||||
|
state.update((s) => {
|
||||||
|
s.install_queue[mac] = {
|
||||||
|
hostname: "testnode",
|
||||||
|
disk: "/dev/sda",
|
||||||
|
role: "worker",
|
||||||
|
os: "fedora-43",
|
||||||
|
queued_at: new Date().toISOString(),
|
||||||
|
};
|
||||||
|
});
|
||||||
|
|
||||||
|
// Register IP → MAC mapping
|
||||||
|
syslog.registerIp("127.0.0.1", mac);
|
||||||
|
|
||||||
|
// Send a syslog message (RFC 3164 format)
|
||||||
|
await sendUdpSyslog(PORT, "<13>Mar 30 01:30:00 localhost anaconda[1234]: Installing package vim-enhanced");
|
||||||
|
|
||||||
|
// Wait for UDP delivery
|
||||||
|
await new Promise((r) => setTimeout(r, 200));
|
||||||
|
|
||||||
|
const lines = installLog.getLines(mac);
|
||||||
|
expect(lines.length).toBeGreaterThan(0);
|
||||||
|
expect(lines[0]!.line).toContain("anaconda");
|
||||||
|
expect(lines[0]!.line).toContain("Installing package vim-enhanced");
|
||||||
|
});
|
||||||
|
|
||||||
|
it("ignores messages from unknown IPs", async () => {
|
||||||
|
// Don't register any IP mapping
|
||||||
|
await sendUdpSyslog(PORT, "<13>Mar 30 01:30:00 localhost anaconda[1234]: test message");
|
||||||
|
await new Promise((r) => setTimeout(r, 200));
|
||||||
|
|
||||||
|
// No MAC to check, but the listener should not crash
|
||||||
|
// and no logs should be stored for any MAC
|
||||||
|
expect(installLog.lineCount("unknown")).toBe(0);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("resolves IP from installed machines state", async () => {
|
||||||
|
const mac = "11:22:33:44:55:66";
|
||||||
|
state.update((s) => {
|
||||||
|
s.installed[mac] = {
|
||||||
|
hostname: "installed-node",
|
||||||
|
role: "worker",
|
||||||
|
ip: "127.0.0.1",
|
||||||
|
installed_at: new Date().toISOString(),
|
||||||
|
};
|
||||||
|
});
|
||||||
|
|
||||||
|
await sendUdpSyslog(PORT, "<14>Mar 30 02:00:00 installed-node sshd[5678]: Accepted publickey for root");
|
||||||
|
await new Promise((r) => setTimeout(r, 200));
|
||||||
|
|
||||||
|
const lines = installLog.getLines(mac);
|
||||||
|
expect(lines.length).toBeGreaterThan(0);
|
||||||
|
expect(lines[0]!.line).toContain("sshd");
|
||||||
|
});
|
||||||
|
|
||||||
|
it("parses various syslog formats", async () => {
|
||||||
|
const mac = "aa:bb:cc:dd:ee:ff";
|
||||||
|
syslog.registerIp("127.0.0.1", mac);
|
||||||
|
state.update((s) => {
|
||||||
|
s.install_queue[mac] = {
|
||||||
|
hostname: "testnode",
|
||||||
|
disk: "/dev/sda",
|
||||||
|
role: "worker",
|
||||||
|
os: "fedora-43",
|
||||||
|
queued_at: new Date().toISOString(),
|
||||||
|
};
|
||||||
|
});
|
||||||
|
|
||||||
|
// Message without PID
|
||||||
|
await sendUdpSyslog(PORT, "<13>Mar 30 01:30:00 localhost kernel: NVMe device ready");
|
||||||
|
await new Promise((r) => setTimeout(r, 200));
|
||||||
|
|
||||||
|
const lines = installLog.getLines(mac);
|
||||||
|
expect(lines.length).toBeGreaterThan(0);
|
||||||
|
expect(lines[0]!.line).toContain("kernel");
|
||||||
|
});
|
||||||
|
});
|
||||||
@@ -94,8 +94,8 @@ export class LabdClient {
|
|||||||
return this.request("POST", "/api/machines/install", { body: opts });
|
return this.request("POST", "/api/machines/install", { body: opts });
|
||||||
}
|
}
|
||||||
|
|
||||||
async debugMachine(mac: string): Promise<{ status: string; data?: { mac: string; hostname: string }; error?: string }> {
|
async debugMachine(mac: string, opts?: { pxeBoot?: boolean }): Promise<{ status: string; data?: { mac: string; hostname: string }; error?: string }> {
|
||||||
return this.request("POST", "/api/machines/debug", { body: { mac } });
|
return this.request("POST", "/api/machines/debug", { body: { mac, pxeBoot: opts?.pxeBoot } });
|
||||||
}
|
}
|
||||||
|
|
||||||
async forgetMachine(mac: string): Promise<{ status: string }> {
|
async forgetMachine(mac: string): Promise<{ status: string }> {
|
||||||
|
|||||||
@@ -48,8 +48,9 @@ export function registerDebugCommand(parent: Command): void {
|
|||||||
parent
|
parent
|
||||||
.command("debug <target>")
|
.command("debug <target>")
|
||||||
.description("PXE boot into Fedora rescue mode for debugging (target: hostname, MAC, or IP)")
|
.description("PXE boot into Fedora rescue mode for debugging (target: hostname, MAC, or IP)")
|
||||||
|
.option("--pxe-boot", "Boot installed system via PXE (kernel+initrd from network, root from NVMe)")
|
||||||
.showHelpAfterError(true)
|
.showHelpAfterError(true)
|
||||||
.action(async (target: string) => {
|
.action(async (target: string, opts: { pxeBoot?: boolean }) => {
|
||||||
const client = getLabdClient();
|
const client = getLabdClient();
|
||||||
|
|
||||||
// Resolve target from labd aggregated state
|
// Resolve target from labd aggregated state
|
||||||
@@ -73,7 +74,7 @@ export function registerDebugCommand(parent: Command): void {
|
|||||||
console.log(`Queuing debug mode for ${hostname} (${mac})...`);
|
console.log(`Queuing debug mode for ${hostname} (${mac})...`);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const result = await client.debugMachine(mac);
|
const result = await client.debugMachine(mac, { pxeBoot: opts.pxeBoot === true });
|
||||||
if (result.error) {
|
if (result.error) {
|
||||||
console.error(`Failed: ${result.error}`);
|
console.error(`Failed: ${result.error}`);
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
@@ -116,38 +117,39 @@ export function registerDebugCommand(parent: Command): void {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Determine bastion URL from labd config for the setup script URL
|
||||||
|
const bastionUrl = process.env["LABD_URL"]
|
||||||
|
? process.env["LABD_URL"].replace(/\/ws\/bastion$/, "").replace(/^wss?:/, "http:")
|
||||||
|
: "http://<bastion-ip>:8080";
|
||||||
|
|
||||||
console.log(`
|
console.log(`
|
||||||
Debug mode queued for ${hostname} (${mac}).
|
Debug mode queued for ${hostname} (${mac}).
|
||||||
Reboot the machine to enter Fedora rescue mode.
|
Reboot the machine to enter Fedora rescue mode.
|
||||||
|
|
||||||
|
SSH access (started by Anaconda):
|
||||||
|
ssh root@<ip> (password: debug)
|
||||||
|
|
||||||
|
For nc remote shell, run from rescue shell:
|
||||||
|
curl ${bastionUrl}/debug-setup.sh | bash
|
||||||
|
|
||||||
Once in rescue shell:
|
Once in rescue shell:
|
||||||
|
|
||||||
# Activate LVM
|
# Activate LVM and mount installed system
|
||||||
vgchange -ay labvg
|
vgchange -ay
|
||||||
|
|
||||||
# Mount root + other volumes
|
|
||||||
mkdir -p /mnt/sysroot
|
mkdir -p /mnt/sysroot
|
||||||
mount /dev/labvg/root /mnt/sysroot
|
mount /dev/<vg>/root /mnt/sysroot
|
||||||
cat /mnt/sysroot/etc/fstab # check what else to mount
|
cat /mnt/sysroot/etc/fstab
|
||||||
mount /dev/labvg/var /mnt/sysroot/var
|
mount /dev/<vg>/var /mnt/sysroot/var
|
||||||
mount /dev/labvg/home /mnt/sysroot/home
|
mount /dev/<vg>/home /mnt/sysroot/home
|
||||||
|
|
||||||
# Boot the installed system in a container
|
# Boot installed system in a container
|
||||||
/mnt/sysroot/usr/bin/systemd-nspawn -D /mnt/sysroot --boot
|
/mnt/sysroot/usr/bin/systemd-nspawn -D /mnt/sysroot --boot
|
||||||
|
|
||||||
# Or just chroot for quick fixes
|
# Or chroot for quick fixes
|
||||||
mount --bind /dev /mnt/sysroot/dev
|
mount --bind /dev /mnt/sysroot/dev
|
||||||
mount --bind /proc /mnt/sysroot/proc
|
mount --bind /proc /mnt/sysroot/proc
|
||||||
mount --bind /sys /mnt/sysroot/sys
|
mount --bind /sys /mnt/sysroot/sys
|
||||||
chroot /mnt/sysroot
|
chroot /mnt/sysroot
|
||||||
|
|
||||||
# Check initramfs size
|
|
||||||
ls -lh /mnt/sysroot/boot/initramfs-*.img
|
|
||||||
|
|
||||||
# Rebuild initramfs without amdgpu
|
|
||||||
chroot /mnt/sysroot
|
|
||||||
echo 'omit_drivers+=" amdgpu "' > /etc/dracut.conf.d/omit-amdgpu.conf
|
|
||||||
dracut -f --regenerate-all
|
|
||||||
`);
|
`);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -39,19 +39,25 @@ export function registerLogsCommand(parent: Command): void {
|
|||||||
parent
|
parent
|
||||||
.command("logs <target>")
|
.command("logs <target>")
|
||||||
.description("Show provisioning logs for a machine (hostname, MAC, or IP)")
|
.description("Show provisioning logs for a machine (hostname, MAC, or IP)")
|
||||||
.action(async (target: string) => {
|
.option("-f, --follow", "Follow log output in real-time")
|
||||||
|
.action(async (target: string, opts: { follow?: boolean }) => {
|
||||||
const mac = await resolveToMac(target);
|
const mac = await resolveToMac(target);
|
||||||
|
|
||||||
|
const BOLD = "\x1b[1m";
|
||||||
|
const GREEN = "\x1b[32m";
|
||||||
|
const YELLOW = "\x1b[33m";
|
||||||
|
const RED = "\x1b[31m";
|
||||||
|
const DIM = "\x1b[2m";
|
||||||
|
const RESET = "\x1b[0m";
|
||||||
|
|
||||||
|
if (opts.follow) {
|
||||||
|
await followLogs(mac, { BOLD, GREEN, YELLOW, RED, DIM, RESET });
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const data = await getLabdClient().getMachineLogs(mac);
|
const data = await getLabdClient().getMachineLogs(mac);
|
||||||
|
|
||||||
const BOLD = "\x1b[1m";
|
|
||||||
const GREEN = "\x1b[32m";
|
|
||||||
const YELLOW = "\x1b[33m";
|
|
||||||
const RED = "\x1b[31m";
|
|
||||||
const DIM = "\x1b[2m";
|
|
||||||
const RESET = "\x1b[0m";
|
|
||||||
|
|
||||||
console.log(`${BOLD}${data["hostname"]}${RESET} (${mac})`);
|
console.log(`${BOLD}${data["hostname"]}${RESET} (${mac})`);
|
||||||
console.log(` Status: ${data["status"] === "installed" ? GREEN : YELLOW}${data["status"]}${RESET}`);
|
console.log(` Status: ${data["status"] === "installed" ? GREEN : YELLOW}${data["status"]}${RESET}`);
|
||||||
console.log(` Role: ${data["role"]}`);
|
console.log(` Role: ${data["role"]}`);
|
||||||
@@ -83,3 +89,58 @@ export function registerLogsCommand(parent: Command): void {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Follow a machine's provisioning log by polling labd.
 *
 * Prints a header line whenever the reported status changes, then every log
 * stage that has not been printed yet. Polls every five seconds until the
 * status becomes "installed", at which point a completion line is printed and
 * the process exits with code 0. Otherwise it runs until interrupted.
 *
 * @param mac - MAC address of the machine whose logs are followed.
 * @param colors - ANSI escape sequences used to colour the output.
 */
async function followLogs(
  mac: string,
  colors: { BOLD: string; GREEN: string; YELLOW: string; RED: string; DIM: string; RESET: string },
): Promise<void> {
  const { BOLD, GREEN, YELLOW, RED, DIM, RESET } = colors;
  const client = getLabdClient();
  const pollIntervalMs = 5000;

  console.log(`${DIM}Following logs for ${mac} (Ctrl+C to stop)${RESET}`);
  console.log("");

  let lastStageCount = 0;
  let lastStatus = "";
  let lastPollError = "";

  while (true) {
    try {
      const data = await client.getMachineLogs(mac);
      lastPollError = "";

      const status = String(data["status"] ?? "");
      const log = data["log"] as Array<{ stage: string; detail: string; timestamp: string }> | undefined;
      const entries = Array.isArray(log) ? log : [];

      // Print header once or on status change
      if (status !== lastStatus) {
        const hostname = String(data["hostname"] ?? mac);
        const statusColor = status === "installed" ? GREEN : YELLOW;
        console.log(`  ${BOLD}${hostname}${RESET}  ${statusColor}${status}${RESET}`);
        lastStatus = status;
      }

      // A shorter log than last time means it was restarted (e.g. the machine
      // was re-queued), so start printing from the beginning again instead of
      // silently dropping the new stages.
      if (entries.length < lastStageCount) {
        lastStageCount = 0;
      }

      // Print new stages
      for (const entry of entries.slice(lastStageCount)) {
        // Guard against entries without a timestamp so one malformed entry
        // cannot make every subsequent poll throw.
        const time = String(entry.timestamp ?? "").slice(11, 19);
        const color = entry.stage === "complete" ? GREEN : entry.stage === "error" ? RED : YELLOW;
        const detail = entry.detail ? ` ${DIM}-- ${entry.detail}${RESET}` : "";
        console.log(`  ${DIM}${time}${RESET}  ${color}${entry.stage}${RESET}${detail}`);
      }
      lastStageCount = entries.length;

      // Done
      if (status === "installed") {
        const ip = String(data["ip"] ?? "");
        console.log("");
        console.log(`  ${GREEN}${BOLD}Install complete!${RESET}${ip ? `  ${DIM}ssh lab@${ip}${RESET}` : ""}`);
        process.exit(0);
      }
    } catch (err: unknown) {
      // The machine may not be in the logs yet (still queued), so keep
      // polling -- but report each distinct failure once so an unreachable
      // labd is not indistinguishable from "nothing to show yet".
      const message = err instanceof Error ? err.message : String(err);
      if (message !== lastPollError) {
        console.error(`${DIM}Waiting for logs: ${message}${RESET}`);
        lastPollError = message;
      }
    }

    await new Promise((r) => setTimeout(r, pollIntervalMs));
  }
}
|
||||||
|
|||||||
@@ -151,7 +151,7 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void
|
|||||||
try {
|
try {
|
||||||
const result = await sendCommand(all[0]!.bastionId, {
|
const result = await sendCommand(all[0]!.bastionId, {
|
||||||
type: "command-install",
|
type: "command-install",
|
||||||
mac, hostname, disk: disk ?? "/dev/sda", role: role ?? "infra", os: os ?? "fedora-43",
|
mac, hostname, disk: disk ?? "", role: role ?? "infra", os: os ?? "fedora-43",
|
||||||
});
|
});
|
||||||
return reply.code(result.status === "ok" ? 200 : 500).send(result);
|
return reply.code(result.status === "ok" ? 200 : 500).send(result);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
@@ -164,7 +164,7 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void
|
|||||||
try {
|
try {
|
||||||
const result = await sendCommand(bastion.bastionId, {
|
const result = await sendCommand(bastion.bastionId, {
|
||||||
type: "command-install",
|
type: "command-install",
|
||||||
mac, hostname, disk: disk ?? "/dev/sda", role: role ?? "infra", os: os ?? "fedora-43",
|
mac, hostname, disk: disk ?? "", role: role ?? "infra", os: os ?? "fedora-43",
|
||||||
});
|
});
|
||||||
return reply.code(result.status === "ok" ? 200 : 500).send(result);
|
return reply.code(result.status === "ok" ? 200 : 500).send(result);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
@@ -174,9 +174,10 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void
|
|||||||
|
|
||||||
// Queue debug/rescue mode — route to correct bastion by MAC
|
// Queue debug/rescue mode — route to correct bastion by MAC
|
||||||
app.post<{
|
app.post<{
|
||||||
Body: { mac?: string };
|
Body: { mac?: string; pxeBoot?: boolean };
|
||||||
}>("/api/machines/debug", async (request, reply) => {
|
}>("/api/machines/debug", async (request, reply) => {
|
||||||
const mac = (request.body?.mac ?? "").toLowerCase().replace(/-/g, ":");
|
const mac = (request.body?.mac ?? "").toLowerCase().replace(/-/g, ":");
|
||||||
|
const pxeBoot = request.body?.pxeBoot ?? false;
|
||||||
if (!mac) {
|
if (!mac) {
|
||||||
return reply.code(400).send({ error: "mac is required" });
|
return reply.code(400).send({ error: "mac is required" });
|
||||||
}
|
}
|
||||||
@@ -189,7 +190,7 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void
|
|||||||
}
|
}
|
||||||
if (all.length === 1) {
|
if (all.length === 1) {
|
||||||
try {
|
try {
|
||||||
const result = await sendCommand(all[0]!.bastionId, { type: "command-debug", mac });
|
const result = await sendCommand(all[0]!.bastionId, { type: "command-debug", mac, pxeBoot });
|
||||||
return reply.code(result.status === "ok" ? 200 : 500).send(result);
|
return reply.code(result.status === "ok" ? 200 : 500).send(result);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
return reply.code(500).send({ error: err instanceof Error ? err.message : String(err) });
|
return reply.code(500).send({ error: err instanceof Error ? err.message : String(err) });
|
||||||
@@ -199,7 +200,7 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const result = await sendCommand(bastion.bastionId, { type: "command-debug", mac });
|
const result = await sendCommand(bastion.bastionId, { type: "command-debug", mac, pxeBoot });
|
||||||
return reply.code(result.status === "ok" ? 200 : 500).send(result);
|
return reply.code(result.status === "ok" ? 200 : 500).send(result);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
return reply.code(500).send({ error: err instanceof Error ? err.message : String(err) });
|
return reply.code(500).send({ error: err instanceof Error ? err.message : String(err) });
|
||||||
|
|||||||
@@ -111,7 +111,7 @@ export type LabdBastionMessage =
|
|||||||
| { type: "command-install"; requestId: string; mac: string; hostname: string; disk?: string; role: string; os: string }
|
| { type: "command-install"; requestId: string; mac: string; hostname: string; disk?: string; role: string; os: string }
|
||||||
| { type: "command-forget"; requestId: string; mac: string }
|
| { type: "command-forget"; requestId: string; mac: string }
|
||||||
| { type: "command-role-update"; requestId: string; mac: string; role: string }
|
| { type: "command-role-update"; requestId: string; mac: string; role: string }
|
||||||
| { type: "command-debug"; requestId: string; mac: string }
|
| { type: "command-debug"; requestId: string; mac: string; pxeBoot?: boolean }
|
||||||
| { type: "server-shutdown"; reconnectAfter: number };
|
| { type: "server-shutdown"; reconnectAfter: number };
|
||||||
|
|
||||||
export type BastionMessageType = BastionMessage["type"];
|
export type BastionMessageType = BastionMessage["type"];
|
||||||
|
|||||||
@@ -101,6 +101,7 @@ export interface InstalledInfo {
|
|||||||
export interface DebugConfig {
|
export interface DebugConfig {
|
||||||
hostname: string;
|
hostname: string;
|
||||||
queued_at: string;
|
queued_at: string;
|
||||||
|
pxeBoot?: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface BastionState {
|
export interface BastionState {
|
||||||
|
|||||||
@@ -224,11 +224,12 @@ describe("PXE boot provisioning", () => {
|
|||||||
// Generate dnsmasq config
|
// Generate dnsmasq config
|
||||||
generateDnsmasqConf(config);
|
generateDnsmasqConf(config);
|
||||||
|
|
||||||
// Start HTTP server
|
// Start HTTP server + syslog listener
|
||||||
const { app, state } = createApp(config);
|
const { app, state, syslog } = createApp(config);
|
||||||
bastionApp = app;
|
bastionApp = app;
|
||||||
await app.listen({ port: config.httpPort, host: "0.0.0.0" });
|
await app.listen({ port: config.httpPort, host: "0.0.0.0" });
|
||||||
log(`Bastion HTTP server listening on :${HTTP_PORT}`);
|
syslog.start();
|
||||||
|
log(`Bastion HTTP server listening on :${HTTP_PORT}, syslog on UDP :${config.syslogPort}`);
|
||||||
|
|
||||||
// Start dnsmasq (fire-and-forget — it runs until killed)
|
// Start dnsmasq (fire-and-forget — it runs until killed)
|
||||||
// May fail without root (DHCP socket needs CAP_NET_BIND_SERVICE); libvirt network provides DHCP fallback
|
// May fail without root (DHCP socket needs CAP_NET_BIND_SERVICE); libvirt network provides DHCP fallback
|
||||||
@@ -387,8 +388,8 @@ describe("PXE boot provisioning", () => {
|
|||||||
expect(data.progress).toBe("complete");
|
expect(data.progress).toBe("complete");
|
||||||
});
|
});
|
||||||
|
|
||||||
it.skip("log lines were captured", async () => {
|
it("syslog install logs were captured", async () => {
|
||||||
// Requires log streamer in %post — skipped until re-added
|
// Anaconda forwards logs via syslog (logging --host directive in kickstart)
|
||||||
const res = await fetch(`http://${BASTION_IP}:${HTTP_PORT}/api/logs/${encodeURIComponent(vmMac)}`);
|
const res = await fetch(`http://${BASTION_IP}:${HTTP_PORT}/api/logs/${encodeURIComponent(vmMac)}`);
|
||||||
const data = (await res.json()) as { log_total?: number; log_lines?: Array<{ line: string }> };
|
const data = (await res.json()) as { log_total?: number; log_lines?: Array<{ line: string }> };
|
||||||
expect(data.log_total).toBeGreaterThan(0);
|
expect(data.log_total).toBeGreaterThan(0);
|
||||||
|
|||||||
Reference in New Issue
Block a user