Compare commits
11 Commits
816736793d
...
docs/archi
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d7a25066bd | ||
| a0f6161533 | |||
|
|
87c1a34232 | ||
| 84afe7d5e4 | |||
|
|
0a4916d3c9 | ||
|
|
a4a4840930 | ||
|
|
8da947a1c3 | ||
|
|
92c65b4672 | ||
|
|
3835fefba1 | ||
| d4e9101bb6 | |||
| 014e8a6e72 |
@@ -62,13 +62,13 @@ _labctl() {
|
|||||||
COMPREPLY=($(compgen -W "--role --os --disk -h --help" -- "$cur"))
|
COMPREPLY=($(compgen -W "--role --os --disk -h --help" -- "$cur"))
|
||||||
return ;;
|
return ;;
|
||||||
"provision debug")
|
"provision debug")
|
||||||
COMPREPLY=($(compgen -W "-h --help" -- "$cur"))
|
COMPREPLY=($(compgen -W "--pxe-boot -h --help" -- "$cur"))
|
||||||
return ;;
|
return ;;
|
||||||
"provision forget")
|
"provision forget")
|
||||||
COMPREPLY=($(compgen -W "-h --help" -- "$cur"))
|
COMPREPLY=($(compgen -W "-h --help" -- "$cur"))
|
||||||
return ;;
|
return ;;
|
||||||
"provision logs")
|
"provision logs")
|
||||||
COMPREPLY=($(compgen -W "-h --help" -- "$cur"))
|
COMPREPLY=($(compgen -W "-f --follow -h --help" -- "$cur"))
|
||||||
return ;;
|
return ;;
|
||||||
"provision makeiso")
|
"provision makeiso")
|
||||||
COMPREPLY=($(compgen -W "--arch --local --out -h --help" -- "$cur"))
|
COMPREPLY=($(compgen -W "--arch --local --out -h --help" -- "$cur"))
|
||||||
|
|||||||
@@ -137,6 +137,12 @@ complete -c labctl -n "__labctl_in_cmd provision reprovision" -l role -d 'Machin
|
|||||||
complete -c labctl -n "__labctl_in_cmd provision reprovision" -l os -d 'Operating system' -xa 'fedora-43 ubuntu-26.04'
|
complete -c labctl -n "__labctl_in_cmd provision reprovision" -l os -d 'Operating system' -xa 'fedora-43 ubuntu-26.04'
|
||||||
complete -c labctl -n "__labctl_in_cmd provision reprovision" -l disk -d 'Target disk device (auto-detect if omitted)' -x
|
complete -c labctl -n "__labctl_in_cmd provision reprovision" -l disk -d 'Target disk device (auto-detect if omitted)' -x
|
||||||
|
|
||||||
|
# provision debug options
|
||||||
|
complete -c labctl -n "__labctl_in_cmd provision debug" -l pxe-boot -d 'Boot installed system via PXE (kernel+initrd from network, root from NVMe)'
|
||||||
|
|
||||||
|
# provision logs options
|
||||||
|
complete -c labctl -n "__labctl_in_cmd provision logs" -s f -l follow -d 'Follow log output in real-time'
|
||||||
|
|
||||||
# provision makeiso options
|
# provision makeiso options
|
||||||
complete -c labctl -n "__labctl_in_cmd provision makeiso" -l arch -d 'Target architecture(s)' -xa 'x86_64 aarch64'
|
complete -c labctl -n "__labctl_in_cmd provision makeiso" -l arch -d 'Target architecture(s)' -xa 'x86_64 aarch64'
|
||||||
complete -c labctl -n "__labctl_in_cmd provision makeiso" -l local -d 'Build ISO locally instead of using bastion-hosted URL'
|
complete -c labctl -n "__labctl_in_cmd provision makeiso" -l local -d 'Build ISO locally instead of using bastion-hosted URL'
|
||||||
|
|||||||
431
bastion/docs/ARCHITECTURE.md
Normal file
431
bastion/docs/ARCHITECTURE.md
Normal file
@@ -0,0 +1,431 @@
|
|||||||
|
# Lab Platform Architecture
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
A bare-metal and hybrid cloud infrastructure platform for automated machine provisioning, Kubernetes cluster management, and fleet operations. The platform discovers hardware via PXE boot, installs operating systems unattended, deploys k3s clusters, and provides centralized management through a CLI and API.
|
||||||
|
|
||||||
|
**Components:**
|
||||||
|
- **bastion** -- PXE boot server (DHCP/TFTP/HTTP) for machine discovery and OS installation
|
||||||
|
- **labd** -- Master daemon for multi-bastion aggregation, persistent state, agent management
|
||||||
|
- **labctl** -- CLI tool for operators (kubectl-style interface)
|
||||||
|
- **lab-agent** -- Daemon on provisioned servers for remote execution and monitoring
|
||||||
|
- **modules** -- Declarative configuration system (k3s, labcontroller)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
labctl (CLI)
|
||||||
|
|
|
||||||
|
labd (master daemon)
|
||||||
|
/ | \
|
||||||
|
bastion1 bastion2 ... (PXE provisioning)
|
||||||
|
/ \ |
|
||||||
|
[machines] [machines] (bare metal)
|
||||||
|
| |
|
||||||
|
lab-agent lab-agent (remote exec)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Communication Patterns
|
||||||
|
|
||||||
|
| Path | Protocol | Auth |
|
||||||
|
|------|----------|------|
|
||||||
|
| labctl -> labd | HTTP/HTTPS | mTLS cert (future: token) |
|
||||||
|
| bastion -> labd | WebSocket | Join token enrollment |
|
||||||
|
| lab-agent -> labd | WebSocket | mTLS certificate |
|
||||||
|
| machine -> bastion | HTTP | None (local network) |
|
||||||
|
| Anaconda -> bastion | HTTP + UDP syslog | None (install-time) |
|
||||||
|
| labctl -> bastion | HTTP | None (standalone mode) |
|
||||||
|
|
||||||
|
### Standalone vs Centralized
|
||||||
|
|
||||||
|
The bastion can operate in two modes:
|
||||||
|
|
||||||
|
1. **Standalone** -- single bastion, state in local JSON file, CLI talks directly to bastion HTTP API
|
||||||
|
2. **Centralized** -- bastion registers with labd via WebSocket, state aggregated in CockroachDB, CLI talks to labd which routes commands to the correct bastion
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Machine Lifecycle
|
||||||
|
|
||||||
|
```
|
||||||
|
PXE boot
|
||||||
|
|
|
||||||
|
+--------v--------+
|
||||||
|
| DISCOVERED | Hardware inventory collected
|
||||||
|
+---------+-------+
|
||||||
|
|
|
||||||
|
labctl provision install
|
||||||
|
|
|
||||||
|
+---------v-------+
|
||||||
|
| INSTALL_QUEUE | Waiting for next PXE boot
|
||||||
|
+---------+-------+
|
||||||
|
|
|
||||||
|
PXE boot (Anaconda)
|
||||||
|
|
|
||||||
|
+---------v-------+
|
||||||
|
| INSTALLING | Progress: partitioning -> packages -> post-install
|
||||||
|
+---------+-------+
|
||||||
|
|
|
||||||
|
+---------v-------+
|
||||||
|
| INSTALLED | OS ready, SSH accessible
|
||||||
|
+---------+-------+
|
||||||
|
|
|
||||||
|
labctl app k3s install
|
||||||
|
|
|
||||||
|
+---------v-------+
|
||||||
|
| K3S RUNNING | Kubernetes node operational
|
||||||
|
+--------+--------+
|
||||||
|
|
|
||||||
|
labctl provision reprovision
|
||||||
|
|
|
||||||
|
(back to INSTALL_QUEUE)
|
||||||
|
```
|
||||||
|
|
||||||
|
Side paths:
|
||||||
|
- **DEBUG** -- `labctl provision debug` boots Anaconda rescue mode for diagnostics
|
||||||
|
- **FORGET** -- `labctl provision forget` removes machine from all state
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Packages
|
||||||
|
|
||||||
|
### Monorepo Structure
|
||||||
|
|
||||||
|
TypeScript ESM monorepo with pnpm workspaces. Six packages:
|
||||||
|
|
||||||
|
| Package | Role | Key Tech |
|
||||||
|
|---------|------|----------|
|
||||||
|
| `@lab/shared` | Types, protocol, constants | - |
|
||||||
|
| `@lab/bastion` | PXE server | Fastify, dnsmasq |
|
||||||
|
| `@lab/cli` | CLI binary | Commander.js |
|
||||||
|
| `@lab/labd` | Master daemon | Fastify, Prisma, CockroachDB |
|
||||||
|
| `@lab/agent` | Server agent | WebSocket |
|
||||||
|
| `@lab/modules` | Config modules | SSH, k8s-client |
|
||||||
|
|
||||||
|
### @lab/shared
|
||||||
|
|
||||||
|
Core type system shared by all packages.
|
||||||
|
|
||||||
|
**State Model:**
|
||||||
|
```typescript
|
||||||
|
interface BastionState {
|
||||||
|
discovered: Record<MAC, HardwareInfo>
|
||||||
|
install_queue: Record<MAC, InstallConfig>
|
||||||
|
installed: Record<MAC, InstalledInfo>
|
||||||
|
debug: Record<MAC, DebugConfig>
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Roles:**
|
||||||
|
- `vanilla` -- OS only, no k3s, no cluster services
|
||||||
|
- `worker` -- k3s agent + Longhorn storage (joins existing cluster)
|
||||||
|
- `infra` -- k3s server + etcd (control plane node)
|
||||||
|
- `labcontroller` -- infra + bastion + labd + CockroachDB (self-sufficient)
|
||||||
|
|
||||||
|
**OS Support:**
|
||||||
|
- `fedora-43` -- Anaconda kickstart installer
|
||||||
|
- `ubuntu-26.04` -- cloud-init autoinstall
|
||||||
|
|
||||||
|
**Protocol:** Discriminated union message types for WebSocket communication between agents, bastions, and labd. Type guards and parsers for runtime validation.
|
||||||
|
|
||||||
|
### @lab/bastion
|
||||||
|
|
||||||
|
PXE boot server that handles the physical provisioning lifecycle.
|
||||||
|
|
||||||
|
**Services:**
|
||||||
|
- `StateManager` -- JSON file persistence with immutable update pattern
|
||||||
|
- `SyslogListener` -- UDP syslog receiver (port 5514) for Anaconda install logs
|
||||||
|
- `InstallLogBuffer` -- In-memory ring buffer + disk persistence per machine
|
||||||
|
- `BastionConnection` -- WebSocket client to labd for centralized mode
|
||||||
|
- dnsmasq management (spawn, config generation, proxy/full DHCP)
|
||||||
|
- Network auto-detection (interface, IP, subnet, gateway)
|
||||||
|
- ISO builder (xorriso + mtools for non-PXE machines)
|
||||||
|
|
||||||
|
**HTTP Routes:**
|
||||||
|
|
||||||
|
| Endpoint | Purpose |
|
||||||
|
|----------|---------|
|
||||||
|
| `GET /dispatch?mac=` | Dynamic iPXE script (discover/install/debug/local-boot) |
|
||||||
|
| `GET /ks?mac=` | Per-machine Anaconda kickstart |
|
||||||
|
| `GET /debug.ks` | Rescue mode kickstart |
|
||||||
|
| `GET /debug-setup.sh` | nc listener setup script for rescue shell |
|
||||||
|
| `GET /discover.ks` | Hardware discovery kickstart |
|
||||||
|
| `POST /api/discover` | Hardware inventory report |
|
||||||
|
| `POST /api/install` | Queue machine for install |
|
||||||
|
| `POST /api/progress` | Install progress callback |
|
||||||
|
| `POST /api/log` | Raw log line ingestion |
|
||||||
|
| `POST /api/debug` | Queue debug/rescue mode |
|
||||||
|
| `GET /api/machines` | List all machines |
|
||||||
|
| `GET /api/logs/:mac` | Install logs + progress |
|
||||||
|
| `GET /api/logs/:mac/follow` | SSE stream of progress events |
|
||||||
|
| `DELETE /api/machines/:mac` | Forget machine |
|
||||||
|
|
||||||
|
**Templates:**
|
||||||
|
- `boot.ipxe.ts` -- iPXE scripts for each boot mode (discover, install, debug, pxe-boot-debug, local-boot)
|
||||||
|
- `install.ks.ts` -- Full Fedora kickstart with LVM, SSH, k3s prereqs, progress callbacks, SysRq keys
|
||||||
|
- `debug.ks.ts` -- Minimal rescue kickstart (SSH via inst.sshd)
|
||||||
|
- `ubuntu-autoinstall.ts` -- cloud-init for Ubuntu
|
||||||
|
- `dnsmasq.conf.ts` -- DHCP/TFTP configuration
|
||||||
|
|
||||||
|
**Boot Dispatch Logic:**
|
||||||
|
```
|
||||||
|
1. debug[mac]? -> renderDebugIpxe (auto-clear after serving)
|
||||||
|
2. install_queue[mac]? -> renderInstallIpxe
|
||||||
|
3. installed[mac]? -> renderLocalBootIpxe (exit to disk)
|
||||||
|
4. unknown -> renderDiscoverIpxe
|
||||||
|
```
|
||||||
|
|
||||||
|
### @lab/labd
|
||||||
|
|
||||||
|
Central management daemon. Aggregates multiple bastions, stores persistent state in CockroachDB, relays commands, manages agent fleet.
|
||||||
|
|
||||||
|
**Database (Prisma + CockroachDB):**
|
||||||
|
- `Server` -- hostname, MAC, IP, role, status, cloud, environment, labels
|
||||||
|
- `Bastion` -- hostname, network, serverIp, lastHeartbeat
|
||||||
|
- `Agent` -- certificate, enrollment, heartbeat
|
||||||
|
- `Cluster` -- name, cloud, environment, kubeconfig (encrypted)
|
||||||
|
- `User` / `Role` / `Permission` -- RBAC (action:cloud:env:server matrix)
|
||||||
|
- `JoinToken` -- one-time/reusable enrollment tokens
|
||||||
|
- `AuditLog` -- action, resource, result, timestamp
|
||||||
|
|
||||||
|
**Key Services:**
|
||||||
|
- `BastionRegistry` -- in-memory registry of connected bastions, state aggregation, MAC-to-bastion routing
|
||||||
|
- `AgentRegistry` -- connected agents, heartbeat tracking
|
||||||
|
- `MessageRouter` -- command relay between CLI/agents and bastions
|
||||||
|
|
||||||
|
**Command Routing:**
|
||||||
|
```
|
||||||
|
CLI: labctl provision install <mac> <hostname>
|
||||||
|
-> POST /api/machines/install
|
||||||
|
-> labd finds bastion that knows this MAC
|
||||||
|
-> WebSocket: {type: "command-install", mac, hostname, disk, role}
|
||||||
|
-> bastion updates install_queue
|
||||||
|
-> WebSocket: {type: "command-response", status: "ok"}
|
||||||
|
-> HTTP response to CLI
|
||||||
|
```
|
||||||
|
|
||||||
|
### @lab/cli (labctl)
|
||||||
|
|
||||||
|
Operator CLI. Commander.js binary, distributed as RPM/DEB or standalone bun-compiled executable.
|
||||||
|
|
||||||
|
**Command Groups:**
|
||||||
|
|
||||||
|
```
|
||||||
|
labctl init bastion standalone start|stop|status
|
||||||
|
labctl provision list|install|reprovision|forget|debug|logs|makeiso
|
||||||
|
labctl app k3s install|health|list
|
||||||
|
labctl config list|get|set|path
|
||||||
|
labctl login
|
||||||
|
labctl doctor
|
||||||
|
labctl roles
|
||||||
|
```
|
||||||
|
|
||||||
|
**Key Features:**
|
||||||
|
- Target resolution: hostname, MAC, or IP -> machine lookup
|
||||||
|
- SSH reboot into PXE for reprovision/debug (efibootmgr --bootnext)
|
||||||
|
- Follow mode: `labctl provision logs <target> -f` (5s polling)
|
||||||
|
- Shell completions: bash, fish
|
||||||
|
|
||||||
|
### @lab/modules
|
||||||
|
|
||||||
|
Declarative configuration modules with three-phase lifecycle: install -> configure -> health.
|
||||||
|
|
||||||
|
**k3s Module:**
|
||||||
|
- 5 operation groups: host-prep, networking, k3s-server, k3s-agent, hardening
|
||||||
|
- 15+ individual operations: kernel modules, sysctl, firewall, Cilium CNI, SELinux, audit policy, pod security, cert checks
|
||||||
|
- Health checks: service running, node ready, API health, pod status, Cilium status, secrets encryption
|
||||||
|
- SSH execution backend with progress callbacks
|
||||||
|
|
||||||
|
### @lab/agent
|
||||||
|
|
||||||
|
Daemon on provisioned servers. WebSocket to labd for:
|
||||||
|
- Heartbeat (hostname, uptime, CPU/mem usage)
|
||||||
|
- Command execution (with stdout/stderr streaming)
|
||||||
|
- Log streaming (journalctl relay)
|
||||||
|
- mTLS certificate enrollment and rotation
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Disk Layout
|
||||||
|
|
||||||
|
### LVM Partitioning (labvg)
|
||||||
|
|
||||||
|
All roles share a common LVM layout. The kickstart `%pre` auto-detects the install disk (NVMe preferred, then SATA, skipping USB/removable).
|
||||||
|
|
||||||
|
| Volume | Size | FS | Reprovision |
|
||||||
|
|--------|------|-----|-------------|
|
||||||
|
| `/boot/efi` | 600 MB | vfat | Reused |
|
||||||
|
| `/boot` | 3 GB | ext4 | Reused |
|
||||||
|
| `swap` | 27 GB | swap | Recreated |
|
||||||
|
| `/` (root) | 33 GB | xfs | Recreated |
|
||||||
|
| `/var` | 100 GB | xfs | Recreated |
|
||||||
|
| `/var/log` | 10 GB | xfs | Recreated |
|
||||||
|
| `/home` | 10 GB | xfs | **Preserved** |
|
||||||
|
| `/srv` | 20 GB | xfs | **Preserved** |
|
||||||
|
| `/var/lib/longhorn` | remaining | xfs | **Preserved** (worker) |
|
||||||
|
| `/var/lib/rancher` | 20 GB | xfs | **Preserved** (infra) |
|
||||||
|
| `/tmp` | 4 GB | tmpfs | - |
|
||||||
|
|
||||||
|
Reprovision detection: if `labvg` VG exists, reuse EFI/boot partitions and preserve data volumes.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Kickstart Features
|
||||||
|
|
||||||
|
The Fedora kickstart template (`install.ks.ts`) includes:
|
||||||
|
|
||||||
|
- **Dynamic disk detection** -- `%pre` probes NVMe/SATA/virtio, skips USB/removable, supports both fresh install and reprovision
|
||||||
|
- **Progress callbacks** -- `curl -sf` sends `POST /api/progress` to the bastion at each stage (partitioning, post-install substeps, complete)
|
||||||
|
- **Anaconda syslog forwarding** -- `logging --host --port` streams real-time install logs to bastion
|
||||||
|
- **SSH hardening** -- key-only auth, root login via pubkey only, admin user with passwordless sudo
|
||||||
|
- **Network-first boot order** -- `efibootmgr` reorders boot entries so PXE is always first (bastion controls every reboot)
|
||||||
|
- **SysRq magic keys** -- `kernel.sysrq=1` for emergency reboot via KVM keyboard
|
||||||
|
- **Role-specific setup:**
|
||||||
|
- `vanilla`: chronyd only
|
||||||
|
- `worker`/`infra`: kernel modules (br_netfilter, overlay), sysctl (ip_forward, inotify), firewalld disabled, k3s binary installed
|
||||||
|
- `infra`: k3s server binary pre-installed
|
||||||
|
|
||||||
|
**What is NOT in the kickstart:**
|
||||||
|
- `console=ttyS0` -- causes 30s-per-step boot timeout on hardware without physical serial UART (discovered 2026-03-30, see docs/pxe-boot-debugging-2026-03-30.md)
|
||||||
|
- Background log streamer (`tail -f`) -- prevents Anaconda from syncing filesystem, causes %post writes to not persist
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Deployment
|
||||||
|
|
||||||
|
### Container Images
|
||||||
|
|
||||||
|
**bastion** (`Dockerfile.bastion`):
|
||||||
|
- Base: Fedora 43 (needs dnsmasq, iPXE)
|
||||||
|
- Multi-stage: Alpine build -> Fedora runtime
|
||||||
|
- iPXE rebuilt from source (SNP driver for EFI)
|
||||||
|
- hostNetwork in k8s (DHCP needs raw sockets)
|
||||||
|
- Capabilities: NET_ADMIN, NET_RAW
|
||||||
|
|
||||||
|
**labd** (`Dockerfile.labd`):
|
||||||
|
- Base: Alpine (minimal)
|
||||||
|
- Multi-stage build with Prisma client generation
|
||||||
|
- Runs as non-root `node` user
|
||||||
|
|
||||||
|
### Kubernetes (k3s)
|
||||||
|
|
||||||
|
```
|
||||||
|
Namespace: lab-infra
|
||||||
|
Deployment: bastion (hostNetwork, PVC for /data, host SSH keys)
|
||||||
|
ConfigMap: bastion-config (env vars)
|
||||||
|
Secret: bastion-join-token
|
||||||
|
PVC: bastion-state (local-path)
|
||||||
|
|
||||||
|
Namespace: lab-system
|
||||||
|
Deployment: labd
|
||||||
|
Service: labd (NodePort 30100)
|
||||||
|
StatefulSet: cockroachdb-0
|
||||||
|
```
|
||||||
|
|
||||||
|
### CLI Distribution
|
||||||
|
|
||||||
|
Built with `nfpm` as RPM/DEB. Includes:
|
||||||
|
- `/usr/bin/labctl` (bun-compiled standalone binary)
|
||||||
|
- `/usr/share/bash-completion/completions/labctl`
|
||||||
|
- `/usr/share/fish/vendor_completions.d/labctl.fish`
|
||||||
|
|
||||||
|
Config: `~/.labctl/config.yaml` with `labdUrl`, output format, default cloud/environment.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Build & Release
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Development
|
||||||
|
pnpm install && pnpm build # Compile all packages
|
||||||
|
pnpm test:run # Unit tests (vitest)
|
||||||
|
npx tsc --noEmit # Type check
|
||||||
|
|
||||||
|
# Deploy
|
||||||
|
bash scripts/deploy.sh all # Build containers + RPM, push, restart pods
|
||||||
|
bash scripts/deploy.sh bastion # Just bastion
|
||||||
|
bash scripts/deploy.sh labd # Just labd
|
||||||
|
bash scripts/deploy.sh labctl # Just CLI (local RPM install)
|
||||||
|
|
||||||
|
# Container builds
|
||||||
|
bash scripts/build-bastion.sh --platforms linux/amd64 --push latest
|
||||||
|
bash scripts/build-labd.sh --platforms linux/amd64 --push latest
|
||||||
|
bash scripts/build-rpm.sh # RPM + DEB packages
|
||||||
|
|
||||||
|
# Integration tests (require libvirt, sudo)
|
||||||
|
sudo tests/integration/run-pxe-test.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
Registry: `mysources.co.uk` (Gitea at 10.0.0.194:3012)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Testing
|
||||||
|
|
||||||
|
### Unit Tests
|
||||||
|
- Kickstart rendering (ksvalidator syntax check, partition layout, role-specific sections)
|
||||||
|
- State management (load, save, update, debug field)
|
||||||
|
- Dispatch routing (correct iPXE script for each machine state)
|
||||||
|
- Syslog listener (UDP receive, IP->MAC resolution, RFC 3164 parsing)
|
||||||
|
|
||||||
|
### Integration Tests (libvirt VMs)
|
||||||
|
- **pxe-provision.test.ts** -- Full end-to-end: create VM -> PXE discovery -> queue install -> Anaconda install -> SSH verification -> systemd health -> SELinux enforcing -> boot order check
|
||||||
|
- **iso-provision.test.ts** -- ISO boot for non-PXE machines
|
||||||
|
- **k3s-single-node.test.ts** -- Post-provision k3s installation and health
|
||||||
|
- VM screenshot capture during boot for debugging
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Security
|
||||||
|
|
||||||
|
- **mTLS** for agent-labd communication (certificate enrollment via join tokens)
|
||||||
|
- **SSH key-only auth** on provisioned machines (no password auth)
|
||||||
|
- **SELinux enforcing** verified in integration tests
|
||||||
|
- **RBAC** (planned): action:cloud:environment:server permission matrix
|
||||||
|
- **Audit logging** (planned): every mutation tracked in CockroachDB
|
||||||
|
- **Network-first boot order** prevents machines from booting without bastion approval
|
||||||
|
- **SysRq keys** enabled for emergency reboot without SSH access
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Known Issues & Lessons Learned
|
||||||
|
|
||||||
|
### Serial Console Boot Delay (2026-03-30)
|
||||||
|
`console=ttyS0,115200n8` in kernel cmdline causes 30-second timeout at every systemd boot phase on hardware without a physical serial UART. Root cause: systemd blocks writing to non-existent UART. Fix: removed from kickstart entirely.
|
||||||
|
|
||||||
|
### Anaconda %post Log Streamer
|
||||||
|
Background `tail -f` in kickstart `%post` prevents Anaconda from syncing the filesystem. All file writes in %post appear to succeed but are lost on reboot. Fix: removed background log streamer, replaced with Anaconda's built-in `logging --host --port` syslog forwarding.
|
||||||
|
|
||||||
|
### Disk Auto-Detection
|
||||||
|
Hardcoded `/dev/sda` default broke NVMe-only machines. Fix: default to empty string (auto-detect) which triggers the `%pre` disk probe logic.
|
||||||
|
|
||||||
|
### Anaconda Rescue Mode Limitations
|
||||||
|
`%pre` and `%post` sections do not execute in `inst.rescue` mode. SSH in rescue mode is provided by Anaconda's `inst.sshd` kernel parameter + `sshpw` kickstart directive. To set up an nc listener, run `curl bastion:8080/debug-setup.sh | bash` manually from the rescue shell.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Planned Work (Taskmaster)
|
||||||
|
|
||||||
|
13 tasks in queue, all pending:
|
||||||
|
|
||||||
|
1. **#72** Expand Prisma schema with resource relationships (Network, ServerNic, ServerDisk, ClusterMember)
|
||||||
|
2. **#73** State persistence service (bastion state -> CockroachDB)
|
||||||
|
3. **#74** State loading from labd on bastion startup
|
||||||
|
4. **#75** Fix bastion --dir env var default
|
||||||
|
5. **#76** Resource type registry with aliases (kubectl-style)
|
||||||
|
6. **#77** `labctl get <resource>` command
|
||||||
|
7. **#78** `labctl describe <resource>` command
|
||||||
|
8. **#79** `labctl create/delete` commands
|
||||||
|
9. **#80** Refactor provision commands to kubectl-style
|
||||||
|
10. **#81** Server and resource API endpoints in labd
|
||||||
|
11. **#82** RBAC permission checks in CLI
|
||||||
|
12. **#83** Audit logging for resource operations
|
||||||
|
13. **#84** Update CLI entry point and help text
|
||||||
|
|
||||||
|
Additional items not in taskmaster:
|
||||||
|
- Ubuntu autoinstall disk auto-detect (still defaults to /dev/sda)
|
||||||
|
- Verify `inst.sshd` works end-to-end in rescue mode
|
||||||
|
- k3s cluster join vs new cluster distinction in `labctl app k3s install`
|
||||||
|
- arm64 container build (iPXE cross-compilation broken)
|
||||||
91
bastion/docs/pxe-boot-debugging-2026-03-30.md
Normal file
91
bastion/docs/pxe-boot-debugging-2026-03-30.md
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
# PXE Boot Debugging Session — 2026-03-30
|
||||||
|
|
||||||
|
## Problem
|
||||||
|
Beelink SER Mini Pro (AMD Ryzen 7 255, Radeon 780M, 64GB DDR5, 1TB NVMe) boots Fedora 43 100x slower than normal after PXE kickstart install. Every systemd boot phase takes ~30 seconds. The Anaconda installer/rescue mode boots fast on the same hardware.
|
||||||
|
|
||||||
|
## Root Cause
|
||||||
|
**`console=ttyS0,115200n8` in kernel cmdline** — added via kickstart `bootloader --append` during install.
|
||||||
|
|
||||||
|
This mini PC has **no physical serial UART**. When systemd writes to ttyS0, each log write blocks for ~30 seconds waiting for the non-existent UART hardware. Since systemd logs at every phase transition, the total boot time was 10+ minutes.
|
||||||
|
|
||||||
|
The Anaconda installer was unaffected because it uses a different init flow that doesn't go through the same systemd phase transitions.
|
||||||
|
|
||||||
|
## How We Found It
|
||||||
|
Hours of systematic elimination:
|
||||||
|
|
||||||
|
| What we tried | Result | Ruled out |
|
||||||
|
|---|---|---|
|
||||||
|
| `modprobe.blacklist=amdgpu` | No change | GPU driver |
|
||||||
|
| `amd_iommu=off` | No change | IOMMU |
|
||||||
|
| Rebuild initramfs without plymouth/drm/fips | No change | Initramfs bloat |
|
||||||
|
| systemd-boot instead of GRUB | Still slow | Bootloader |
|
||||||
|
| PXE-boot kernel+initrd (skip local GRUB entirely) | Still slow | Local bootloader/firmware |
|
||||||
|
| Disable TPM in BIOS | No change | TPM |
|
||||||
|
| Remove `resume=` + resume dracut module | No change | Hibernate resume |
|
||||||
|
| Manual LVM activation in rescue shell | **Fast** | NVMe/LVM themselves |
|
||||||
|
| Remove `console=ttyS0,115200n8` from GRUB | **FAST BOOT** | **This was it** |
|
||||||
|
|
||||||
|
The key breakthrough was noticing the timestamps showed **exactly 30-second gaps** between boot phases — a timeout pattern, not general slowness. Then realising the serial console was added during install and had never been tested without it.
|
||||||
|
|
||||||
|
## What Was Fixed (PR #4, merged)
|
||||||
|
|
||||||
|
### 1. Removed serial console from kickstart
|
||||||
|
- Removed `console=ttyS0,115200n8` from `bootloader --append`
|
||||||
|
- Removed `serial-getty@ttyS0.service` enablement
|
||||||
|
- Removed rsyslog serial forwarding
|
||||||
|
|
||||||
|
### 2. Enabled Anaconda syslog forwarding
|
||||||
|
- Uncommented `logging --host --port` directive in kickstart
|
||||||
|
- Bastion's SyslogListener was already built — just needed IP→MAC resolution improvement
|
||||||
|
- Added `registerIp()` calls from kickstart fetch and progress callbacks
|
||||||
|
- Added syslog listener unit tests
|
||||||
|
|
||||||
|
### 3. Fixed disk auto-detection
|
||||||
|
- Default disk changed from `/dev/sda` to `""` (auto-detect) in labd route and bastion command handler
|
||||||
|
- The kickstart `%pre` auto-detect logic probes nvme0n1, sda, sdb, vda in order
|
||||||
|
- Without this fix, NVMe-only machines (like the SER Mini Pro) fail immediately
|
||||||
|
|
||||||
|
### 4. SysRq magic keys
|
||||||
|
- Added `kernel.sysrq=1` sysctl to kickstart `%post`
|
||||||
|
- Enables Alt+SysRq+REISUB via JetKVM for emergency reboot of stuck machines
|
||||||
|
|
||||||
|
### 5. Simplified debug command
|
||||||
|
- Removed `--sshd` flag (SSH always available via `inst.sshd` + `sshpw` in rescue mode)
|
||||||
|
- Added `/debug-setup.sh` HTTP endpoint for nc listener setup from rescue shell
|
||||||
|
- Cleaned up `sshd` field from DebugConfig, protocol types, all routes
|
||||||
|
|
||||||
|
### 6. Added `labctl provision logs -f`
|
||||||
|
- Follow mode with 5-second polling for real-time install monitoring
|
||||||
|
|
||||||
|
## What Works
|
||||||
|
|
||||||
|
- **PXE discovery → install → boot** — full flow works end-to-end
|
||||||
|
- **Anaconda syslog forwarding** — install logs stream to bastion
|
||||||
|
- **Progress callbacks** — stage-by-stage install tracking via curl
|
||||||
|
- **Auto disk detection** — works for NVMe and SATA
|
||||||
|
- **Debug rescue mode** — `labctl provision debug <target>` boots Anaconda rescue mode (SSH access via `inst.sshd` + `sshpw` is configured but not yet verified end-to-end)
|
||||||
|
- **Network-first boot order** — bastion controls every reboot via efibootmgr
|
||||||
|
- **SysRq keys** — emergency reboot via JetKVM keyboard
|
||||||
|
|
||||||
|
## What Doesn't Work / Known Issues
|
||||||
|
|
||||||
|
- **SSH in rescue mode (unverified)** — Anaconda rescue mode skips both `%pre` and `%post` kickstart sections, so SSH cannot be set up from kickstart scripts. `inst.sshd` + `sshpw` should provide SSH access, but this hasn't been verified end-to-end yet. The `/debug-setup.sh` curl workaround exists for setting up an nc listener.
|
||||||
|
- **arm64 container build** — iPXE cross-compilation fails on arm64 (GCC flag incompatibility). Workaround: build with `--platforms linux/amd64` only.
|
||||||
|
- **Integration test SSH timeout** — VM boots fine but SSH times out due to libvirt nftables reject rules after VM restart. Test infrastructure issue, not a code bug.
|
||||||
|
|
||||||
|
## What Was Skipped / Left To Do
|
||||||
|
|
||||||
|
1. **Syslog UDP port in k3s** — works because bastion uses `hostNetwork: true`, but should be documented properly
|
||||||
|
2. **Background log streamer** — the old `tail -f` approach broke Anaconda filesystem sync. Replaced with syslog forwarding. If more granular %post logging is needed, a synchronous log push at end of %post would be safe.
|
||||||
|
3. **Per-machine hardware overrides** — turned out not to be needed (serial console was the only "special" setting, and removing it is universal)
|
||||||
|
4. **Ubuntu autoinstall disk default** — `ubuntu-autoinstall.ts` still has a `disk || "/dev/sda"` fallback (line 38); it should be changed to auto-detect
|
||||||
|
5. **Verify `inst.sshd` works in rescue mode** — test SSH with password "debug" next time debug mode is used
|
||||||
|
6. **Re-enable TPM in BIOS** — it was disabled during debugging and should be turned back on (the user plans to do this by resetting the BIOS to factory defaults)
|
||||||
|
|
||||||
|
## Key Learnings
|
||||||
|
|
||||||
|
1. **`console=ttyS0` on hardware without UART = 30s timeout per boot phase.** Never add serial console to kernel cmdline unless the hardware has a verified physical UART.
|
||||||
|
2. **Exactly-N-second gaps in boot logs = timeout, not slowness.** Look for the timeout source, not performance issues.
|
||||||
|
3. **Systematic elimination works.** Removing one suspect at a time found the root cause, but it took hours because the serial console was added early and seemed harmless.
|
||||||
|
4. **Anaconda rescue mode is limited.** It skips `%pre` and `%post`, so you can't automate setup via kickstart. Use `inst.sshd` + `sshpw` for SSH, and serve helper scripts via HTTP for everything else.
|
||||||
|
5. **Default disk paths break NVMe machines.** Always default to auto-detect (empty string) rather than `/dev/sda`.
|
||||||
@@ -257,7 +257,7 @@ export async function startBastion(overrides: Partial<BastionConfig> = {}): Prom
|
|||||||
state.update((s) => {
|
state.update((s) => {
|
||||||
s.install_queue[msg.mac] = {
|
s.install_queue[msg.mac] = {
|
||||||
hostname: msg.hostname,
|
hostname: msg.hostname,
|
||||||
disk: msg.disk ?? "/dev/sda",
|
disk: msg.disk ?? "",
|
||||||
role: msg.role as import("@lab/shared").Role,
|
role: msg.role as import("@lab/shared").Role,
|
||||||
os: msg.os as import("@lab/shared").OsId,
|
os: msg.os as import("@lab/shared").OsId,
|
||||||
queued_at: new Date().toISOString(),
|
queued_at: new Date().toISOString(),
|
||||||
@@ -269,7 +269,7 @@ export async function startBastion(overrides: Partial<BastionConfig> = {}): Prom
|
|||||||
labdConn.onCommand("command-debug", async (msg) => {
|
labdConn.onCommand("command-debug", async (msg) => {
|
||||||
if (msg.type !== "command-debug") throw new Error("unexpected");
|
if (msg.type !== "command-debug") throw new Error("unexpected");
|
||||||
const mac = msg.mac.toLowerCase();
|
const mac = msg.mac.toLowerCase();
|
||||||
const sshd = msg.sshd ?? false;
|
const pxeBoot = msg.pxeBoot ?? false;
|
||||||
const currentState = state.load();
|
const currentState = state.load();
|
||||||
const hostname =
|
const hostname =
|
||||||
currentState.installed[mac]?.hostname ??
|
currentState.installed[mac]?.hostname ??
|
||||||
@@ -277,7 +277,7 @@ export async function startBastion(overrides: Partial<BastionConfig> = {}): Prom
|
|||||||
currentState.discovered[mac]?.product ??
|
currentState.discovered[mac]?.product ??
|
||||||
mac;
|
mac;
|
||||||
state.update((s) => {
|
state.update((s) => {
|
||||||
s.debug[mac] = { hostname, queued_at: new Date().toISOString(), sshd };
|
s.debug[mac] = { hostname, queued_at: new Date().toISOString(), pxeBoot };
|
||||||
});
|
});
|
||||||
return { status: "ok", data: { mac, hostname } };
|
return { status: "ok", data: { mac, hostname } };
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -13,11 +13,13 @@ import { triggerPostProvisionK3s } from "../services/post-provision.js";
|
|||||||
import { progressBus } from "../services/progress-events.js";
|
import { progressBus } from "../services/progress-events.js";
|
||||||
import type { ProgressEvent } from "../services/progress-events.js";
|
import type { ProgressEvent } from "../services/progress-events.js";
|
||||||
import type { InstallLogBuffer } from "../services/install-log.js";
|
import type { InstallLogBuffer } from "../services/install-log.js";
|
||||||
|
import type { SyslogListener } from "../services/syslog-listener.js";
|
||||||
|
|
||||||
export function registerApiRoutes(
|
export function registerApiRoutes(
|
||||||
app: FastifyInstance,
|
app: FastifyInstance,
|
||||||
state: StateManager,
|
state: StateManager,
|
||||||
installLog: InstallLogBuffer,
|
installLog: InstallLogBuffer,
|
||||||
|
syslog: SyslogListener,
|
||||||
): void {
|
): void {
|
||||||
// List all machines
|
// List all machines
|
||||||
app.get("/api/machines", async (_request, reply) => {
|
app.get("/api/machines", async (_request, reply) => {
|
||||||
@@ -84,6 +86,11 @@ export function registerApiRoutes(
|
|||||||
const { mac: rawMac, stage, detail } = request.body ?? {};
|
const { mac: rawMac, stage, detail } = request.body ?? {};
|
||||||
const mac = (rawMac ?? "unknown").toLowerCase();
|
const mac = (rawMac ?? "unknown").toLowerCase();
|
||||||
const stageName = stage ?? "unknown";
|
const stageName = stage ?? "unknown";
|
||||||
|
|
||||||
|
// Register IP → MAC for syslog routing
|
||||||
|
if (mac !== "unknown") {
|
||||||
|
syslog.registerIp(request.ip, mac);
|
||||||
|
}
|
||||||
const detailStr = detail ?? "";
|
const detailStr = detail ?? "";
|
||||||
|
|
||||||
const GREEN = "\x1b[0;32m";
|
const GREEN = "\x1b[0;32m";
|
||||||
@@ -191,10 +198,10 @@ export function registerApiRoutes(
|
|||||||
|
|
||||||
// Queue debug/rescue mode for a machine
|
// Queue debug/rescue mode for a machine
|
||||||
app.post<{
|
app.post<{
|
||||||
Body: { mac?: string; sshd?: boolean };
|
Body: { mac?: string; pxeBoot?: boolean };
|
||||||
}>("/api/debug", async (request, reply) => {
|
}>("/api/debug", async (request, reply) => {
|
||||||
const mac = (request.body?.mac ?? "").toLowerCase().replace(/-/g, ":");
|
const mac = (request.body?.mac ?? "").toLowerCase().replace(/-/g, ":");
|
||||||
const sshd = request.body?.sshd ?? false;
|
const pxeBoot = request.body?.pxeBoot ?? false;
|
||||||
if (mac === "") {
|
if (mac === "") {
|
||||||
return reply.status(400).send({ error: "mac is required" });
|
return reply.status(400).send({ error: "mac is required" });
|
||||||
}
|
}
|
||||||
@@ -208,7 +215,7 @@ export function registerApiRoutes(
|
|||||||
mac;
|
mac;
|
||||||
|
|
||||||
state.update((s) => {
|
state.update((s) => {
|
||||||
s.debug[mac] = { hostname, queued_at: new Date().toISOString(), sshd };
|
s.debug[mac] = { hostname, queued_at: new Date().toISOString(), pxeBoot };
|
||||||
});
|
});
|
||||||
|
|
||||||
logger.info(`DEBUG QUEUED: ${mac} -> ${hostname}`);
|
logger.info(`DEBUG QUEUED: ${mac} -> ${hostname}`);
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ import {
|
|||||||
renderDiscoverIpxe,
|
renderDiscoverIpxe,
|
||||||
renderInstallIpxe,
|
renderInstallIpxe,
|
||||||
renderDebugIpxe,
|
renderDebugIpxe,
|
||||||
|
renderPxeBootDebugIpxe,
|
||||||
renderLocalBootIpxe,
|
renderLocalBootIpxe,
|
||||||
} from "../templates/boot.ipxe.js";
|
} from "../templates/boot.ipxe.js";
|
||||||
import { renderUbuntuInstallIpxe } from "../templates/ubuntu-boot.ipxe.js";
|
import { renderUbuntuInstallIpxe } from "../templates/ubuntu-boot.ipxe.js";
|
||||||
@@ -22,21 +23,44 @@ export function registerDispatchRoutes(
|
|||||||
config: BastionConfig,
|
config: BastionConfig,
|
||||||
state: StateManager,
|
state: StateManager,
|
||||||
): void {
|
): void {
|
||||||
// Serve debug/rescue kickstart (minimal: SSH keys + network)
|
// Serve debug/rescue kickstart (minimal: SSH keys + network for inst.sshd)
|
||||||
app.get<{ Querystring: { mac?: string; sshd?: string } }>("/debug.ks", async (request, reply) => {
|
app.get<{ Querystring: { mac?: string } }>("/debug.ks", async (_request, reply) => {
|
||||||
const mac = (request.query.mac ?? "").toLowerCase().replace(/-/g, ":");
|
|
||||||
const currentState = state.load();
|
|
||||||
const wantSshd = request.query.sshd === "1" || currentState.debug[mac]?.sshd === true;
|
|
||||||
|
|
||||||
const ks = renderDebugKickstart({
|
const ks = renderDebugKickstart({
|
||||||
sshKeys: config.sshKeys ?? [],
|
sshKeys: config.sshKeys ?? [],
|
||||||
sshd: wantSshd,
|
|
||||||
serverIp: config.serverIp,
|
serverIp: config.serverIp,
|
||||||
httpPort: config.httpPort,
|
httpPort: config.httpPort,
|
||||||
});
|
});
|
||||||
return reply.type("text/plain").send(ks);
|
return reply.type("text/plain").send(ks);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Shell script for manual debug setup (nc listener + IP reporting)
|
||||||
|
// Usage from rescue shell: curl http://bastion:port/debug-setup.sh | bash
|
||||||
|
app.get("/debug-setup.sh", async (_request, reply) => {
|
||||||
|
const script = `#!/bin/bash
|
||||||
|
# Lab Bastion debug setup — run from rescue shell
|
||||||
|
set -x
|
||||||
|
|
||||||
|
IP_ADDR=$(ip -4 addr show | awk '/inet / && !/127.0.0/ {split($2,a,"/"); print a[1]; exit}')
|
||||||
|
MAC_ADDR=$(ip link show | awk '/ether/ && !/00:00:00:00/ {print $2; exit}')
|
||||||
|
|
||||||
|
# Start persistent nc listener for remote shell
|
||||||
|
(while true; do nc -l -p 2323 -e /bin/bash 2>/dev/null; done) &
|
||||||
|
echo "nc shell listener on port 2323"
|
||||||
|
|
||||||
|
# Report IP to bastion
|
||||||
|
curl -sf -X POST "http://${config.serverIp}:${config.httpPort}/api/progress" \\
|
||||||
|
-H "Content-Type: application/json" \\
|
||||||
|
-d "{\\"mac\\":\\"$MAC_ADDR\\",\\"stage\\":\\"debug-ready\\",\\"detail\\":\\"nc $IP_ADDR 2323\\"}" 2>/dev/null || true
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "=== Debug environment ready ==="
|
||||||
|
echo " nc $IP_ADDR 2323 (remote shell)"
|
||||||
|
echo " ssh root@$IP_ADDR (password: debug)"
|
||||||
|
echo "==============================="
|
||||||
|
`;
|
||||||
|
return reply.type("text/plain").send(script);
|
||||||
|
});
|
||||||
|
|
||||||
app.get<{ Querystring: { mac?: string } }>("/dispatch", async (request, reply) => {
|
app.get<{ Querystring: { mac?: string } }>("/dispatch", async (request, reply) => {
|
||||||
const mac = (request.query.mac ?? "").toLowerCase().replace(/-/g, ":");
|
const mac = (request.query.mac ?? "").toLowerCase().replace(/-/g, ":");
|
||||||
const currentState = state.load();
|
const currentState = state.load();
|
||||||
@@ -45,17 +69,27 @@ export function registerDispatchRoutes(
|
|||||||
const debugEntry = currentState.debug[mac];
|
const debugEntry = currentState.debug[mac];
|
||||||
if (debugEntry) {
|
if (debugEntry) {
|
||||||
const hostname = debugEntry.hostname ?? "debug";
|
const hostname = debugEntry.hostname ?? "debug";
|
||||||
logger.info(`DEBUG BOOT: ${mac} -> ${hostname} (rescue mode)`);
|
|
||||||
|
|
||||||
state.update((s) => { delete s.debug[mac]; });
|
state.update((s) => { delete s.debug[mac]; });
|
||||||
|
|
||||||
const script = renderDebugIpxe({
|
let script: string;
|
||||||
mac,
|
if (debugEntry.pxeBoot) {
|
||||||
hostname,
|
logger.info(`PXE BOOT DEBUG: ${mac} -> ${hostname} (kernel+initrd from PXE, root from NVMe)`);
|
||||||
serverIp: config.serverIp,
|
script = renderPxeBootDebugIpxe({
|
||||||
httpPort: config.httpPort,
|
mac,
|
||||||
fedoraMirror: config.fedoraMirror,
|
hostname,
|
||||||
});
|
serverIp: config.serverIp,
|
||||||
|
httpPort: config.httpPort,
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
logger.info(`DEBUG BOOT: ${mac} -> ${hostname} (rescue mode)`);
|
||||||
|
script = renderDebugIpxe({
|
||||||
|
mac,
|
||||||
|
hostname,
|
||||||
|
serverIp: config.serverIp,
|
||||||
|
httpPort: config.httpPort,
|
||||||
|
fedoraMirror: config.fedoraMirror,
|
||||||
|
});
|
||||||
|
}
|
||||||
return reply.type("text/plain").send(script);
|
return reply.type("text/plain").send(script);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -5,6 +5,7 @@
|
|||||||
import type { FastifyInstance } from "fastify";
|
import type { FastifyInstance } from "fastify";
|
||||||
import type { BastionConfig } from "@lab/shared";
|
import type { BastionConfig } from "@lab/shared";
|
||||||
import type { StateManager } from "../services/state.js";
|
import type { StateManager } from "../services/state.js";
|
||||||
|
import type { SyslogListener } from "../services/syslog-listener.js";
|
||||||
import { generateInstallKickstart, generateDiscoverKickstart } from "../services/kickstart-generator.js";
|
import { generateInstallKickstart, generateDiscoverKickstart } from "../services/kickstart-generator.js";
|
||||||
import { renderUbuntuAutoinstall, renderUbuntuMetaData, type UbuntuAutoinstallParams } from "../templates/ubuntu-autoinstall.js";
|
import { renderUbuntuAutoinstall, renderUbuntuMetaData, type UbuntuAutoinstallParams } from "../templates/ubuntu-autoinstall.js";
|
||||||
|
|
||||||
@@ -12,6 +13,7 @@ export function registerKickstartRoutes(
|
|||||||
app: FastifyInstance,
|
app: FastifyInstance,
|
||||||
config: BastionConfig,
|
config: BastionConfig,
|
||||||
state: StateManager,
|
state: StateManager,
|
||||||
|
syslog: SyslogListener,
|
||||||
): void {
|
): void {
|
||||||
// Per-MAC install kickstart
|
// Per-MAC install kickstart
|
||||||
app.get<{ Querystring: { mac?: string } }>("/ks", async (request, reply) => {
|
app.get<{ Querystring: { mac?: string } }>("/ks", async (request, reply) => {
|
||||||
@@ -19,6 +21,11 @@ export function registerKickstartRoutes(
|
|||||||
const currentState = state.load();
|
const currentState = state.load();
|
||||||
const queueEntry = currentState.install_queue[mac];
|
const queueEntry = currentState.install_queue[mac];
|
||||||
|
|
||||||
|
// Register IP → MAC so syslog listener can route Anaconda logs
|
||||||
|
if (mac) {
|
||||||
|
syslog.registerIp(request.ip, mac);
|
||||||
|
}
|
||||||
|
|
||||||
const ks = generateInstallKickstart(config, {
|
const ks = generateInstallKickstart(config, {
|
||||||
hostname: queueEntry?.hostname ?? "lab-node",
|
hostname: queueEntry?.hostname ?? "lab-node",
|
||||||
disk: queueEntry?.disk ?? "",
|
disk: queueEntry?.disk ?? "",
|
||||||
|
|||||||
@@ -43,8 +43,8 @@ export function createApp(config: BastionConfig): { app: ReturnType<typeof Fasti
|
|||||||
|
|
||||||
// Register route handlers
|
// Register route handlers
|
||||||
registerDispatchRoutes(app, config, state);
|
registerDispatchRoutes(app, config, state);
|
||||||
registerKickstartRoutes(app, config, state);
|
registerKickstartRoutes(app, config, state, syslog);
|
||||||
registerApiRoutes(app, state, installLog);
|
registerApiRoutes(app, state, installLog, syslog);
|
||||||
// boot.iso is generated at startup and served as a static file from httpDir
|
// boot.iso is generated at startup and served as a static file from httpDir
|
||||||
// (static serving supports HTTP Range requests, required by JetKVM streaming)
|
// (static serving supports HTTP Range requests, required by JetKVM streaming)
|
||||||
|
|
||||||
|
|||||||
@@ -30,6 +30,8 @@ export class SyslogListener {
|
|||||||
private port: number;
|
private port: number;
|
||||||
private installLog: InstallLogBuffer;
|
private installLog: InstallLogBuffer;
|
||||||
private state: StateManager;
|
private state: StateManager;
|
||||||
|
/** Explicit IP → MAC mapping registered from kickstart/progress requests. */
|
||||||
|
private ipToMac = new Map<string, string>();
|
||||||
|
|
||||||
constructor(port: number, installLog: InstallLogBuffer, state: StateManager) {
|
constructor(port: number, installLog: InstallLogBuffer, state: StateManager) {
|
||||||
this.port = port;
|
this.port = port;
|
||||||
@@ -37,14 +39,21 @@ export class SyslogListener {
|
|||||||
this.state = state;
|
this.state = state;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Resolve a source IP to a MAC address using the install queue. */
|
/** Register an IP → MAC mapping (called when we learn a machine's IP). */
|
||||||
|
registerIp(ip: string, mac: string): void {
|
||||||
|
this.ipToMac.set(ip, mac.toLowerCase());
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Resolve a source IP to a MAC address. */
|
||||||
private resolveIpToMac(ip: string): string | null {
|
private resolveIpToMac(ip: string): string | null {
|
||||||
|
// Check explicit mapping first (most reliable)
|
||||||
|
const explicit = this.ipToMac.get(ip);
|
||||||
|
if (explicit) return explicit;
|
||||||
|
|
||||||
const currentState = this.state.load();
|
const currentState = this.state.load();
|
||||||
|
|
||||||
// Check install queue — machines being installed have an IP from DHCP
|
// Check install queue — machines being installed have an IP from DHCP
|
||||||
for (const [mac, entry] of Object.entries(currentState.install_queue)) {
|
for (const [mac, entry] of Object.entries(currentState.install_queue)) {
|
||||||
// The progress callback sends IP in "complete" detail, but during install
|
|
||||||
// we need to match by what we know. Check if any progress mentions this IP.
|
|
||||||
if (entry.progress_detail?.includes(ip)) return mac;
|
if (entry.progress_detail?.includes(ip)) return mac;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -102,6 +102,34 @@ boot
|
|||||||
`;
|
`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* iPXE script for PXE-boot debug mode -- boots the installed system's root
|
||||||
|
* filesystem using the bastion's PXE kernel+initrd instead of local GRUB.
|
||||||
|
* Workaround for UEFI firmware bugs that make local disk boot slow.
|
||||||
|
*/
|
||||||
|
export function renderPxeBootDebugIpxe(params: {
|
||||||
|
mac: string;
|
||||||
|
hostname: string;
|
||||||
|
serverIp: string;
|
||||||
|
httpPort: number;
|
||||||
|
}): string {
|
||||||
|
return `#!ipxe
|
||||||
|
|
||||||
|
echo
|
||||||
|
echo =============================================
|
||||||
|
echo Lab PXE Bastion - PXE BOOT (debug)
|
||||||
|
echo Target: ${params.hostname}
|
||||||
|
echo MAC: ${params.mac}
|
||||||
|
echo Kernel+initrd from PXE, root from NVMe
|
||||||
|
echo =============================================
|
||||||
|
echo
|
||||||
|
|
||||||
|
kernel http://${params.serverIp}:${params.httpPort}/vmlinuz root=/dev/mapper/labvg-root ro rd.lvm.lv=labvg/root rd.lvm.lv=labvg/swap console=tty0
|
||||||
|
initrd http://${params.serverIp}:${params.httpPort}/initrd.img
|
||||||
|
boot
|
||||||
|
`;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* iPXE script for already-installed machines -- exits to boot from local disk.
|
* iPXE script for already-installed machines -- exits to boot from local disk.
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -1,76 +1,33 @@
|
|||||||
// Debug/rescue kickstart template.
|
// Debug/rescue kickstart template.
|
||||||
// Minimal kickstart for Anaconda rescue mode.
|
// Minimal kickstart for Anaconda rescue mode.
|
||||||
// When sshd=true: generates host keys, starts sshd, reports IP to bastion.
|
//
|
||||||
// No dependency on mounted filesystems — fully self-contained.
|
// SSH access: Anaconda's inst.sshd starts sshd automatically.
|
||||||
|
// The sshpw directive sets the password, sshkey adds authorized keys.
|
||||||
|
// %pre/%post do NOT run in rescue mode — don't put setup code there.
|
||||||
|
|
||||||
export interface DebugKickstartParams {
|
export interface DebugKickstartParams {
|
||||||
sshKeys: string[];
|
sshKeys: string[];
|
||||||
sshd?: boolean;
|
|
||||||
serverIp?: string;
|
serverIp?: string;
|
||||||
httpPort?: number;
|
httpPort?: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
export function renderDebugKickstart(params: DebugKickstartParams): string {
|
export function renderDebugKickstart(params: DebugKickstartParams): string {
|
||||||
const sshpw = "sshpw --username=root --plaintext lab-root-pw";
|
|
||||||
const sshkeyLine = params.sshKeys.length > 0
|
const sshkeyLine = params.sshKeys.length > 0
|
||||||
? `sshkey --username=root "${params.sshKeys[0]}"`
|
? `sshkey --username=root "${params.sshKeys[0]}"`
|
||||||
: "";
|
: "";
|
||||||
|
|
||||||
const sshdSetup = params.sshd ? `
|
|
||||||
%post --nochroot --log=/tmp/debug-sshd.log
|
|
||||||
#!/bin/bash
|
|
||||||
set -x
|
|
||||||
|
|
||||||
# Generate host keys (self-contained, no mounted FS needed)
|
|
||||||
ssh-keygen -t ed25519 -f /tmp/ssh_host_ed25519_key -N "" -q
|
|
||||||
ssh-keygen -t rsa -f /tmp/ssh_host_rsa_key -N "" -q
|
|
||||||
|
|
||||||
# Write minimal sshd config
|
|
||||||
cat > /tmp/sshd_config << 'SSHCFG'
|
|
||||||
HostKey /tmp/ssh_host_ed25519_key
|
|
||||||
HostKey /tmp/ssh_host_rsa_key
|
|
||||||
PermitRootLogin yes
|
|
||||||
PasswordAuthentication yes
|
|
||||||
PubkeyAuthentication yes
|
|
||||||
AuthorizedKeysFile /root/.ssh/authorized_keys
|
|
||||||
SSHCFG
|
|
||||||
|
|
||||||
# Set root password for SSH access
|
|
||||||
echo "root:debug" | chpasswd
|
|
||||||
|
|
||||||
# Set up SSH authorized keys
|
|
||||||
mkdir -p /root/.ssh && chmod 700 /root/.ssh
|
|
||||||
${params.sshKeys.map(k => `echo '${k}' >> /root/.ssh/authorized_keys`).join("\n")}
|
|
||||||
chmod 600 /root/.ssh/authorized_keys 2>/dev/null || true
|
|
||||||
|
|
||||||
# Start sshd
|
|
||||||
/usr/sbin/sshd -f /tmp/sshd_config -p 22
|
|
||||||
echo "sshd started on port 22"
|
|
||||||
|
|
||||||
# Start persistent nc listener for remote shell
|
|
||||||
(while true; do nc -l -p 2323 -e /bin/bash 2>/dev/null; done) &
|
|
||||||
echo "nc shell listener on port 2323"
|
|
||||||
|
|
||||||
# Report IP to bastion
|
|
||||||
sleep 2
|
|
||||||
IP_ADDR=$(ip -4 addr show | awk '/inet / && !/127.0.0/ {split($2,a,"/"); print a[1]; exit}')
|
|
||||||
MAC_ADDR=$(ip link show | awk '/ether/ && !/00:00:00:00/ {print $2; exit}')
|
|
||||||
curl -sf -X POST "http://${params.serverIp}:${params.httpPort}/api/progress" \\
|
|
||||||
-H "Content-Type: application/json" \\
|
|
||||||
-d "{\\"mac\\":\\"$MAC_ADDR\\",\\"stage\\":\\"debug-ready\\",\\"detail\\":\\"ssh root@$IP_ADDR (pw: debug) | nc $IP_ADDR 2323\\"}" 2>/dev/null || true
|
|
||||||
|
|
||||||
echo "Debug environment ready: ssh root@$IP_ADDR or nc $IP_ADDR 2323"
|
|
||||||
%end
|
|
||||||
` : "";
|
|
||||||
|
|
||||||
return `# Lab Bastion -- Debug/Rescue Kickstart
|
return `# Lab Bastion -- Debug/Rescue Kickstart
|
||||||
# Minimal: SSH + network for Anaconda rescue mode
|
# Minimal: SSH + network for Anaconda rescue mode
|
||||||
|
#
|
||||||
|
# SSH is started by Anaconda (inst.sshd kernel param).
|
||||||
|
# Password: debug | SSH keys from bastion config.
|
||||||
|
# %pre/%post do NOT run in rescue mode.
|
||||||
|
|
||||||
lang en_US.UTF-8
|
lang en_US.UTF-8
|
||||||
keyboard uk
|
keyboard uk
|
||||||
network --bootproto=dhcp --activate
|
network --bootproto=dhcp --activate
|
||||||
|
|
||||||
${sshpw}
|
sshpw --username=root --plaintext debug
|
||||||
${sshkeyLine}
|
${sshkeyLine}
|
||||||
${sshdSetup}`;
|
`;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -134,10 +134,9 @@ network --bootproto=dhcp --activate --hostname=${fqdn}
|
|||||||
${auth}
|
${auth}
|
||||||
${userDirective}
|
${userDirective}
|
||||||
|
|
||||||
bootloader --append="console=tty0 console=ttyS0,115200n8"
|
bootloader --append="console=tty0"
|
||||||
|
|
||||||
# logging --host=${serverIp} --port=${syslogPort}
|
logging --host=${serverIp} --port=${syslogPort}
|
||||||
# Disabled: syslog UDP port needs to be exposed in k3s service/hostPort first
|
|
||||||
|
|
||||||
url --mirrorlist=https://mirrors.fedoraproject.org/mirrorlist?repo=fedora-$releasever&arch=$basearch
|
url --mirrorlist=https://mirrors.fedoraproject.org/mirrorlist?repo=fedora-$releasever&arch=$basearch
|
||||||
|
|
||||||
@@ -342,17 +341,7 @@ echo "tmpfs /tmp tmpfs defaults,noatime,nosuid,nodev,size=4G 0 0" >> /etc/fstab
|
|||||||
|
|
||||||
${isVanilla ? `# -- vanilla role: skip k3s kernel/sysctl/firewall setup --
|
${isVanilla ? `# -- vanilla role: skip k3s kernel/sysctl/firewall setup --
|
||||||
# -- Enable chronyd for time sync --
|
# -- Enable chronyd for time sync --
|
||||||
systemctl enable chronyd || true
|
systemctl enable chronyd || true` : `# -- Kernel modules for k3s --
|
||||||
|
|
||||||
# -- Serial console (for debugging — auto-login as root on ttyS0) --
|
|
||||||
# AWS EC2 compatible: ttyS0 @ 115200n8
|
|
||||||
systemctl enable serial-getty@ttyS0.service || true
|
|
||||||
|
|
||||||
# -- Forward all system logs to serial console --
|
|
||||||
cat > /etc/rsyslog.d/serial-console.conf << 'RSYSLOG'
|
|
||||||
*.* /dev/ttyS0
|
|
||||||
RSYSLOG
|
|
||||||
systemctl enable rsyslog || true` : `# -- Kernel modules for k3s --
|
|
||||||
cat > /etc/modules-load.d/k3s.conf << 'MODULES'
|
cat > /etc/modules-load.d/k3s.conf << 'MODULES'
|
||||||
br_netfilter
|
br_netfilter
|
||||||
overlay
|
overlay
|
||||||
@@ -396,6 +385,9 @@ fi
|
|||||||
|
|
||||||
bastion_progress "post-install" "3-bootorder done"
|
bastion_progress "post-install" "3-bootorder done"
|
||||||
|
|
||||||
|
# -- Enable SysRq magic keys (for emergency reboot via Alt+SysRq+REISUB) --
|
||||||
|
echo "kernel.sysrq=1" > /etc/sysctl.d/90-sysrq.conf
|
||||||
|
|
||||||
# -- Provisioning metadata --
|
# -- Provisioning metadata --
|
||||||
cat > /etc/lab-provisioned << PROVEOF
|
cat > /etc/lab-provisioned << PROVEOF
|
||||||
hostname: ${fqdn}
|
hostname: ${fqdn}
|
||||||
|
|||||||
@@ -28,6 +28,7 @@ function createTestConfig(testDir: string): BastionConfig {
|
|||||||
gateway: "10.0.0.1",
|
gateway: "10.0.0.1",
|
||||||
sshKeys: ["ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAITEST test@test"],
|
sshKeys: ["ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAITEST test@test"],
|
||||||
adminUser: "testadmin",
|
adminUser: "testadmin",
|
||||||
|
syslogPort: 15514,
|
||||||
skipDnsmasq: true,
|
skipDnsmasq: true,
|
||||||
skipArtifacts: true,
|
skipArtifacts: true,
|
||||||
fedoraMirror: "https://download.fedoraproject.org/pub/fedora/linux/releases/43/Everything/x86_64/os",
|
fedoraMirror: "https://download.fedoraproject.org/pub/fedora/linux/releases/43/Everything/x86_64/os",
|
||||||
|
|||||||
@@ -206,10 +206,8 @@ describe("renderInstallKickstart", () => {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
it("forwards system logs to serial console", () => {
|
it("does not include serial console (causes 30s boot timeout on hardware without UART)", () => {
|
||||||
const ks = renderInstallKickstart(baseParams({ role: "vanilla" }));
|
const ks = renderInstallKickstart(baseParams({ role: "vanilla" }));
|
||||||
expect(ks).toContain("serial-console.conf");
|
expect(ks).not.toContain("ttyS0");
|
||||||
expect(ks).toContain("/dev/ttyS0");
|
|
||||||
expect(ks).toContain("rsyslog");
|
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
121
bastion/src/bastion/tests/syslog-listener.test.ts
Normal file
121
bastion/src/bastion/tests/syslog-listener.test.ts
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
import { describe, it, expect, beforeEach, afterEach } from "vitest";
|
||||||
|
import { createSocket } from "node:dgram";
|
||||||
|
import { mkdtempSync, rmSync } from "node:fs";
|
||||||
|
import { join } from "node:path";
|
||||||
|
import { tmpdir } from "node:os";
|
||||||
|
import { SyslogListener } from "../src/services/syslog-listener.js";
|
||||||
|
import { InstallLogBuffer } from "../src/services/install-log.js";
|
||||||
|
import { StateManager } from "../src/services/state.js";
|
||||||
|
|
||||||
|
function sendUdpSyslog(port: number, message: string): Promise<void> {
|
||||||
|
return new Promise((resolve, reject) => {
|
||||||
|
const client = createSocket("udp4");
|
||||||
|
const buf = Buffer.from(message);
|
||||||
|
client.send(buf, 0, buf.length, port, "127.0.0.1", (err) => {
|
||||||
|
client.close();
|
||||||
|
if (err) reject(err);
|
||||||
|
else resolve();
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
describe("SyslogListener", () => {
|
||||||
|
let tmpDir: string;
|
||||||
|
let state: StateManager;
|
||||||
|
let installLog: InstallLogBuffer;
|
||||||
|
let syslog: SyslogListener;
|
||||||
|
const PORT = 15514; // use non-privileged port for testing
|
||||||
|
|
||||||
|
beforeEach(() => {
|
||||||
|
tmpDir = mkdtempSync(join(tmpdir(), "syslog-test-"));
|
||||||
|
state = new StateManager(join(tmpDir, "state.json"));
|
||||||
|
state.init();
|
||||||
|
installLog = new InstallLogBuffer(tmpDir);
|
||||||
|
syslog = new SyslogListener(PORT, installLog, state);
|
||||||
|
syslog.start();
|
||||||
|
});
|
||||||
|
|
||||||
|
afterEach(() => {
|
||||||
|
syslog.stop();
|
||||||
|
rmSync(tmpDir, { recursive: true, force: true });
|
||||||
|
});
|
||||||
|
|
||||||
|
it("receives and stores syslog messages for registered IP", async () => {
|
||||||
|
const mac = "aa:bb:cc:dd:ee:ff";
|
||||||
|
// Queue a machine so hostname can be resolved
|
||||||
|
state.update((s) => {
|
||||||
|
s.install_queue[mac] = {
|
||||||
|
hostname: "testnode",
|
||||||
|
disk: "/dev/sda",
|
||||||
|
role: "worker",
|
||||||
|
os: "fedora-43",
|
||||||
|
queued_at: new Date().toISOString(),
|
||||||
|
};
|
||||||
|
});
|
||||||
|
|
||||||
|
// Register IP → MAC mapping
|
||||||
|
syslog.registerIp("127.0.0.1", mac);
|
||||||
|
|
||||||
|
// Send a syslog message (RFC 3164 format)
|
||||||
|
await sendUdpSyslog(PORT, "<13>Mar 30 01:30:00 localhost anaconda[1234]: Installing package vim-enhanced");
|
||||||
|
|
||||||
|
// Wait for UDP delivery
|
||||||
|
await new Promise((r) => setTimeout(r, 200));
|
||||||
|
|
||||||
|
const lines = installLog.getLines(mac);
|
||||||
|
expect(lines.length).toBeGreaterThan(0);
|
||||||
|
expect(lines[0]!.line).toContain("anaconda");
|
||||||
|
expect(lines[0]!.line).toContain("Installing package vim-enhanced");
|
||||||
|
});
|
||||||
|
|
||||||
|
it("ignores messages from unknown IPs", async () => {
|
||||||
|
// Don't register any IP mapping
|
||||||
|
await sendUdpSyslog(PORT, "<13>Mar 30 01:30:00 localhost anaconda[1234]: test message");
|
||||||
|
await new Promise((r) => setTimeout(r, 200));
|
||||||
|
|
||||||
|
// No MAC to check, but the listener should not crash
|
||||||
|
// and no logs should be stored for any MAC
|
||||||
|
expect(installLog.lineCount("unknown")).toBe(0);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("resolves IP from installed machines state", async () => {
|
||||||
|
const mac = "11:22:33:44:55:66";
|
||||||
|
state.update((s) => {
|
||||||
|
s.installed[mac] = {
|
||||||
|
hostname: "installed-node",
|
||||||
|
role: "worker",
|
||||||
|
ip: "127.0.0.1",
|
||||||
|
installed_at: new Date().toISOString(),
|
||||||
|
};
|
||||||
|
});
|
||||||
|
|
||||||
|
await sendUdpSyslog(PORT, "<14>Mar 30 02:00:00 installed-node sshd[5678]: Accepted publickey for root");
|
||||||
|
await new Promise((r) => setTimeout(r, 200));
|
||||||
|
|
||||||
|
const lines = installLog.getLines(mac);
|
||||||
|
expect(lines.length).toBeGreaterThan(0);
|
||||||
|
expect(lines[0]!.line).toContain("sshd");
|
||||||
|
});
|
||||||
|
|
||||||
|
it("parses various syslog formats", async () => {
|
||||||
|
const mac = "aa:bb:cc:dd:ee:ff";
|
||||||
|
syslog.registerIp("127.0.0.1", mac);
|
||||||
|
state.update((s) => {
|
||||||
|
s.install_queue[mac] = {
|
||||||
|
hostname: "testnode",
|
||||||
|
disk: "/dev/sda",
|
||||||
|
role: "worker",
|
||||||
|
os: "fedora-43",
|
||||||
|
queued_at: new Date().toISOString(),
|
||||||
|
};
|
||||||
|
});
|
||||||
|
|
||||||
|
// Message without PID
|
||||||
|
await sendUdpSyslog(PORT, "<13>Mar 30 01:30:00 localhost kernel: NVMe device ready");
|
||||||
|
await new Promise((r) => setTimeout(r, 200));
|
||||||
|
|
||||||
|
const lines = installLog.getLines(mac);
|
||||||
|
expect(lines.length).toBeGreaterThan(0);
|
||||||
|
expect(lines[0]!.line).toContain("kernel");
|
||||||
|
});
|
||||||
|
});
|
||||||
@@ -94,8 +94,8 @@ export class LabdClient {
|
|||||||
return this.request("POST", "/api/machines/install", { body: opts });
|
return this.request("POST", "/api/machines/install", { body: opts });
|
||||||
}
|
}
|
||||||
|
|
||||||
async debugMachine(mac: string, opts?: { sshd?: boolean }): Promise<{ status: string; data?: { mac: string; hostname: string }; error?: string }> {
|
async debugMachine(mac: string, opts?: { pxeBoot?: boolean }): Promise<{ status: string; data?: { mac: string; hostname: string }; error?: string }> {
|
||||||
return this.request("POST", "/api/machines/debug", { body: { mac, sshd: opts?.sshd } });
|
return this.request("POST", "/api/machines/debug", { body: { mac, pxeBoot: opts?.pxeBoot } });
|
||||||
}
|
}
|
||||||
|
|
||||||
async forgetMachine(mac: string): Promise<{ status: string }> {
|
async forgetMachine(mac: string): Promise<{ status: string }> {
|
||||||
|
|||||||
@@ -48,9 +48,9 @@ export function registerDebugCommand(parent: Command): void {
|
|||||||
parent
|
parent
|
||||||
.command("debug <target>")
|
.command("debug <target>")
|
||||||
.description("PXE boot into Fedora rescue mode for debugging (target: hostname, MAC, or IP)")
|
.description("PXE boot into Fedora rescue mode for debugging (target: hostname, MAC, or IP)")
|
||||||
.option("--sshd", "Start SSH + nc listener automatically, report IP to bastion")
|
.option("--pxe-boot", "Boot installed system via PXE (kernel+initrd from network, root from NVMe)")
|
||||||
.showHelpAfterError(true)
|
.showHelpAfterError(true)
|
||||||
.action(async (target: string, opts: { sshd?: boolean }) => {
|
.action(async (target: string, opts: { pxeBoot?: boolean }) => {
|
||||||
const client = getLabdClient();
|
const client = getLabdClient();
|
||||||
|
|
||||||
// Resolve target from labd aggregated state
|
// Resolve target from labd aggregated state
|
||||||
@@ -74,7 +74,7 @@ export function registerDebugCommand(parent: Command): void {
|
|||||||
console.log(`Queuing debug mode for ${hostname} (${mac})...`);
|
console.log(`Queuing debug mode for ${hostname} (${mac})...`);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const result = await client.debugMachine(mac, { sshd: opts.sshd });
|
const result = await client.debugMachine(mac, { pxeBoot: opts.pxeBoot === true });
|
||||||
if (result.error) {
|
if (result.error) {
|
||||||
console.error(`Failed: ${result.error}`);
|
console.error(`Failed: ${result.error}`);
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
@@ -117,38 +117,39 @@ export function registerDebugCommand(parent: Command): void {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Determine bastion URL from labd config for the setup script URL
|
||||||
|
const bastionUrl = process.env["LABD_URL"]
|
||||||
|
? process.env["LABD_URL"].replace(/\/ws\/bastion$/, "").replace(/^wss?:/, "http:")
|
||||||
|
: "http://<bastion-ip>:8080";
|
||||||
|
|
||||||
console.log(`
|
console.log(`
|
||||||
Debug mode queued for ${hostname} (${mac}).
|
Debug mode queued for ${hostname} (${mac}).
|
||||||
Reboot the machine to enter Fedora rescue mode.
|
Reboot the machine to enter Fedora rescue mode.
|
||||||
|
|
||||||
|
SSH access (started by Anaconda):
|
||||||
|
ssh root@<ip> (password: debug)
|
||||||
|
|
||||||
|
For nc remote shell, run from rescue shell:
|
||||||
|
curl ${bastionUrl}/debug-setup.sh | bash
|
||||||
|
|
||||||
Once in rescue shell:
|
Once in rescue shell:
|
||||||
|
|
||||||
# Activate LVM
|
# Activate LVM and mount installed system
|
||||||
vgchange -ay labvg
|
vgchange -ay
|
||||||
|
|
||||||
# Mount root + other volumes
|
|
||||||
mkdir -p /mnt/sysroot
|
mkdir -p /mnt/sysroot
|
||||||
mount /dev/labvg/root /mnt/sysroot
|
mount /dev/<vg>/root /mnt/sysroot
|
||||||
cat /mnt/sysroot/etc/fstab # check what else to mount
|
cat /mnt/sysroot/etc/fstab
|
||||||
mount /dev/labvg/var /mnt/sysroot/var
|
mount /dev/<vg>/var /mnt/sysroot/var
|
||||||
mount /dev/labvg/home /mnt/sysroot/home
|
mount /dev/<vg>/home /mnt/sysroot/home
|
||||||
|
|
||||||
# Boot the installed system in a container
|
# Boot installed system in a container
|
||||||
/mnt/sysroot/usr/bin/systemd-nspawn -D /mnt/sysroot --boot
|
/mnt/sysroot/usr/bin/systemd-nspawn -D /mnt/sysroot --boot
|
||||||
|
|
||||||
# Or just chroot for quick fixes
|
# Or chroot for quick fixes
|
||||||
mount --bind /dev /mnt/sysroot/dev
|
mount --bind /dev /mnt/sysroot/dev
|
||||||
mount --bind /proc /mnt/sysroot/proc
|
mount --bind /proc /mnt/sysroot/proc
|
||||||
mount --bind /sys /mnt/sysroot/sys
|
mount --bind /sys /mnt/sysroot/sys
|
||||||
chroot /mnt/sysroot
|
chroot /mnt/sysroot
|
||||||
|
|
||||||
# Check initramfs size
|
|
||||||
ls -lh /mnt/sysroot/boot/initramfs-*.img
|
|
||||||
|
|
||||||
# Rebuild initramfs without amdgpu
|
|
||||||
chroot /mnt/sysroot
|
|
||||||
echo 'omit_drivers+=" amdgpu "' > /etc/dracut.conf.d/omit-amdgpu.conf
|
|
||||||
dracut -f --regenerate-all
|
|
||||||
`);
|
`);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -39,19 +39,25 @@ export function registerLogsCommand(parent: Command): void {
|
|||||||
parent
|
parent
|
||||||
.command("logs <target>")
|
.command("logs <target>")
|
||||||
.description("Show provisioning logs for a machine (hostname, MAC, or IP)")
|
.description("Show provisioning logs for a machine (hostname, MAC, or IP)")
|
||||||
.action(async (target: string) => {
|
.option("-f, --follow", "Follow log output in real-time")
|
||||||
|
.action(async (target: string, opts: { follow?: boolean }) => {
|
||||||
const mac = await resolveToMac(target);
|
const mac = await resolveToMac(target);
|
||||||
|
|
||||||
|
const BOLD = "\x1b[1m";
|
||||||
|
const GREEN = "\x1b[32m";
|
||||||
|
const YELLOW = "\x1b[33m";
|
||||||
|
const RED = "\x1b[31m";
|
||||||
|
const DIM = "\x1b[2m";
|
||||||
|
const RESET = "\x1b[0m";
|
||||||
|
|
||||||
|
if (opts.follow) {
|
||||||
|
await followLogs(mac, { BOLD, GREEN, YELLOW, RED, DIM, RESET });
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const data = await getLabdClient().getMachineLogs(mac);
|
const data = await getLabdClient().getMachineLogs(mac);
|
||||||
|
|
||||||
const BOLD = "\x1b[1m";
|
|
||||||
const GREEN = "\x1b[32m";
|
|
||||||
const YELLOW = "\x1b[33m";
|
|
||||||
const RED = "\x1b[31m";
|
|
||||||
const DIM = "\x1b[2m";
|
|
||||||
const RESET = "\x1b[0m";
|
|
||||||
|
|
||||||
console.log(`${BOLD}${data["hostname"]}${RESET} (${mac})`);
|
console.log(`${BOLD}${data["hostname"]}${RESET} (${mac})`);
|
||||||
console.log(` Status: ${data["status"] === "installed" ? GREEN : YELLOW}${data["status"]}${RESET}`);
|
console.log(` Status: ${data["status"] === "installed" ? GREEN : YELLOW}${data["status"]}${RESET}`);
|
||||||
console.log(` Role: ${data["role"]}`);
|
console.log(` Role: ${data["role"]}`);
|
||||||
@@ -83,3 +89,58 @@ export function registerLogsCommand(parent: Command): void {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Follow logs by polling labd. */
|
||||||
|
async function followLogs(
|
||||||
|
mac: string,
|
||||||
|
colors: { BOLD: string; GREEN: string; YELLOW: string; RED: string; DIM: string; RESET: string },
|
||||||
|
): Promise<void> {
|
||||||
|
const { BOLD, GREEN, YELLOW, RED, DIM, RESET } = colors;
|
||||||
|
const client = getLabdClient();
|
||||||
|
|
||||||
|
console.log(`${DIM}Following logs for ${mac} (Ctrl+C to stop)${RESET}`);
|
||||||
|
console.log("");
|
||||||
|
|
||||||
|
let lastStageCount = 0;
|
||||||
|
let lastStatus = "";
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
try {
|
||||||
|
const data = await client.getMachineLogs(mac);
|
||||||
|
const status = String(data["status"] ?? "");
|
||||||
|
const log = data["log"] as Array<{ stage: string; detail: string; timestamp: string }> | undefined;
|
||||||
|
|
||||||
|
// Print header once or on status change
|
||||||
|
if (status !== lastStatus) {
|
||||||
|
const hostname = String(data["hostname"] ?? mac);
|
||||||
|
const statusColor = status === "installed" ? GREEN : YELLOW;
|
||||||
|
console.log(` ${BOLD}${hostname}${RESET} ${statusColor}${status}${RESET}`);
|
||||||
|
lastStatus = status;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Print new stages
|
||||||
|
if (log && log.length > lastStageCount) {
|
||||||
|
for (let i = lastStageCount; i < log.length; i++) {
|
||||||
|
const entry = log[i]!;
|
||||||
|
const time = entry.timestamp.slice(11, 19);
|
||||||
|
const color = entry.stage === "complete" ? GREEN : entry.stage === "error" ? RED : YELLOW;
|
||||||
|
const detail = entry.detail ? ` ${DIM}-- ${entry.detail}${RESET}` : "";
|
||||||
|
console.log(` ${DIM}${time}${RESET} ${color}${entry.stage}${RESET}${detail}`);
|
||||||
|
}
|
||||||
|
lastStageCount = log.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Done
|
||||||
|
if (status === "installed") {
|
||||||
|
const ip = data["ip"] ?? "";
|
||||||
|
console.log("");
|
||||||
|
console.log(` ${GREEN}${BOLD}Install complete!${RESET}${ip ? ` ${DIM}ssh lab@${ip}${RESET}` : ""}`);
|
||||||
|
process.exit(0);
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// Machine may not be in logs yet (still queued)
|
||||||
|
}
|
||||||
|
|
||||||
|
await new Promise((r) => setTimeout(r, 5000));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -151,7 +151,7 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void
|
|||||||
try {
|
try {
|
||||||
const result = await sendCommand(all[0]!.bastionId, {
|
const result = await sendCommand(all[0]!.bastionId, {
|
||||||
type: "command-install",
|
type: "command-install",
|
||||||
mac, hostname, disk: disk ?? "/dev/sda", role: role ?? "infra", os: os ?? "fedora-43",
|
mac, hostname, disk: disk ?? "", role: role ?? "infra", os: os ?? "fedora-43",
|
||||||
});
|
});
|
||||||
return reply.code(result.status === "ok" ? 200 : 500).send(result);
|
return reply.code(result.status === "ok" ? 200 : 500).send(result);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
@@ -164,7 +164,7 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void
|
|||||||
try {
|
try {
|
||||||
const result = await sendCommand(bastion.bastionId, {
|
const result = await sendCommand(bastion.bastionId, {
|
||||||
type: "command-install",
|
type: "command-install",
|
||||||
mac, hostname, disk: disk ?? "/dev/sda", role: role ?? "infra", os: os ?? "fedora-43",
|
mac, hostname, disk: disk ?? "", role: role ?? "infra", os: os ?? "fedora-43",
|
||||||
});
|
});
|
||||||
return reply.code(result.status === "ok" ? 200 : 500).send(result);
|
return reply.code(result.status === "ok" ? 200 : 500).send(result);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
@@ -174,10 +174,10 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void
|
|||||||
|
|
||||||
// Queue debug/rescue mode — route to correct bastion by MAC
|
// Queue debug/rescue mode — route to correct bastion by MAC
|
||||||
app.post<{
|
app.post<{
|
||||||
Body: { mac?: string; sshd?: boolean };
|
Body: { mac?: string; pxeBoot?: boolean };
|
||||||
}>("/api/machines/debug", async (request, reply) => {
|
}>("/api/machines/debug", async (request, reply) => {
|
||||||
const mac = (request.body?.mac ?? "").toLowerCase().replace(/-/g, ":");
|
const mac = (request.body?.mac ?? "").toLowerCase().replace(/-/g, ":");
|
||||||
const sshd = request.body?.sshd ?? false;
|
const pxeBoot = request.body?.pxeBoot ?? false;
|
||||||
if (!mac) {
|
if (!mac) {
|
||||||
return reply.code(400).send({ error: "mac is required" });
|
return reply.code(400).send({ error: "mac is required" });
|
||||||
}
|
}
|
||||||
@@ -190,7 +190,7 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void
|
|||||||
}
|
}
|
||||||
if (all.length === 1) {
|
if (all.length === 1) {
|
||||||
try {
|
try {
|
||||||
const result = await sendCommand(all[0]!.bastionId, { type: "command-debug", mac, sshd });
|
const result = await sendCommand(all[0]!.bastionId, { type: "command-debug", mac, pxeBoot });
|
||||||
return reply.code(result.status === "ok" ? 200 : 500).send(result);
|
return reply.code(result.status === "ok" ? 200 : 500).send(result);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
return reply.code(500).send({ error: err instanceof Error ? err.message : String(err) });
|
return reply.code(500).send({ error: err instanceof Error ? err.message : String(err) });
|
||||||
@@ -200,7 +200,7 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const result = await sendCommand(bastion.bastionId, { type: "command-debug", mac, sshd });
|
const result = await sendCommand(bastion.bastionId, { type: "command-debug", mac, pxeBoot });
|
||||||
return reply.code(result.status === "ok" ? 200 : 500).send(result);
|
return reply.code(result.status === "ok" ? 200 : 500).send(result);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
return reply.code(500).send({ error: err instanceof Error ? err.message : String(err) });
|
return reply.code(500).send({ error: err instanceof Error ? err.message : String(err) });
|
||||||
|
|||||||
@@ -111,7 +111,7 @@ export type LabdBastionMessage =
|
|||||||
| { type: "command-install"; requestId: string; mac: string; hostname: string; disk?: string; role: string; os: string }
|
| { type: "command-install"; requestId: string; mac: string; hostname: string; disk?: string; role: string; os: string }
|
||||||
| { type: "command-forget"; requestId: string; mac: string }
|
| { type: "command-forget"; requestId: string; mac: string }
|
||||||
| { type: "command-role-update"; requestId: string; mac: string; role: string }
|
| { type: "command-role-update"; requestId: string; mac: string; role: string }
|
||||||
| { type: "command-debug"; requestId: string; mac: string; sshd?: boolean }
|
| { type: "command-debug"; requestId: string; mac: string; pxeBoot?: boolean }
|
||||||
| { type: "server-shutdown"; reconnectAfter: number };
|
| { type: "server-shutdown"; reconnectAfter: number };
|
||||||
|
|
||||||
export type BastionMessageType = BastionMessage["type"];
|
export type BastionMessageType = BastionMessage["type"];
|
||||||
|
|||||||
@@ -101,7 +101,7 @@ export interface InstalledInfo {
|
|||||||
export interface DebugConfig {
|
export interface DebugConfig {
|
||||||
hostname: string;
|
hostname: string;
|
||||||
queued_at: string;
|
queued_at: string;
|
||||||
sshd?: boolean;
|
pxeBoot?: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface BastionState {
|
export interface BastionState {
|
||||||
|
|||||||
@@ -224,11 +224,12 @@ describe("PXE boot provisioning", () => {
|
|||||||
// Generate dnsmasq config
|
// Generate dnsmasq config
|
||||||
generateDnsmasqConf(config);
|
generateDnsmasqConf(config);
|
||||||
|
|
||||||
// Start HTTP server
|
// Start HTTP server + syslog listener
|
||||||
const { app, state } = createApp(config);
|
const { app, state, syslog } = createApp(config);
|
||||||
bastionApp = app;
|
bastionApp = app;
|
||||||
await app.listen({ port: config.httpPort, host: "0.0.0.0" });
|
await app.listen({ port: config.httpPort, host: "0.0.0.0" });
|
||||||
log(`Bastion HTTP server listening on :${HTTP_PORT}`);
|
syslog.start();
|
||||||
|
log(`Bastion HTTP server listening on :${HTTP_PORT}, syslog on UDP :${config.syslogPort}`);
|
||||||
|
|
||||||
// Start dnsmasq (fire-and-forget — it runs until killed)
|
// Start dnsmasq (fire-and-forget — it runs until killed)
|
||||||
// May fail without root (DHCP socket needs CAP_NET_BIND_SERVICE); libvirt network provides DHCP fallback
|
// May fail without root (DHCP socket needs CAP_NET_BIND_SERVICE); libvirt network provides DHCP fallback
|
||||||
@@ -387,8 +388,8 @@ describe("PXE boot provisioning", () => {
|
|||||||
expect(data.progress).toBe("complete");
|
expect(data.progress).toBe("complete");
|
||||||
});
|
});
|
||||||
|
|
||||||
it.skip("log lines were captured", async () => {
|
it("syslog install logs were captured", async () => {
|
||||||
// Requires log streamer in %post — skipped until re-added
|
// Anaconda forwards logs via syslog (logging --host directive in kickstart)
|
||||||
const res = await fetch(`http://${BASTION_IP}:${HTTP_PORT}/api/logs/${encodeURIComponent(vmMac)}`);
|
const res = await fetch(`http://${BASTION_IP}:${HTTP_PORT}/api/logs/${encodeURIComponent(vmMac)}`);
|
||||||
const data = (await res.json()) as { log_total?: number; log_lines?: Array<{ line: string }> };
|
const data = (await res.json()) as { log_total?: number; log_lines?: Array<{ line: string }> };
|
||||||
expect(data.log_total).toBeGreaterThan(0);
|
expect(data.log_total).toBeGreaterThan(0);
|
||||||
|
|||||||
Reference in New Issue
Block a user