Compare commits
34 Commits
fix/pxe-bo
...
feat/regis
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
49d747db98 | ||
| 8635da08a6 | |||
|
|
6a5f23c0f5 | ||
| 63cc033e3e | |||
|
|
d7a25066bd | ||
| a0f6161533 | |||
|
|
87c1a34232 | ||
| 84afe7d5e4 | |||
|
|
0a4916d3c9 | ||
|
|
a4a4840930 | ||
|
|
8da947a1c3 | ||
|
|
92c65b4672 | ||
|
|
3835fefba1 | ||
|
|
d7a59665ad | ||
|
|
82ca93f4d7 | ||
|
|
52150fd955 | ||
|
|
e87edfcfbd | ||
|
|
6c6d5763c4 | ||
|
|
a7a6ad8098 | ||
|
|
e3523d642c | ||
|
|
5b04d3162b | ||
|
|
a14fd04947 | ||
|
|
0c1e18cee1 | ||
|
|
aae03d9877 | ||
| d4e9101bb6 | |||
|
|
84f1a7b133 | ||
|
|
c0fb1310cb | ||
|
|
48b2230665 | ||
|
|
3dc1317301 | ||
|
|
cac7514014 | ||
|
|
25a2beccff | ||
|
|
2a1a29c03b | ||
|
|
a664074fa3 | ||
| 014e8a6e72 |
4
.gitignore
vendored
4
.gitignore
vendored
@@ -23,3 +23,7 @@ node_modules/
|
||||
|
||||
# OS specific
|
||||
.DS_Store
|
||||
|
||||
# Task files
|
||||
# tasks.json
|
||||
# tasks/
|
||||
|
||||
@@ -29,43 +29,55 @@ _labctl() {
|
||||
COMPREPLY=($(compgen -W "--dir -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"init bastion standalone status")
|
||||
COMPREPLY=($(compgen -W "--dir --port -h --help" -- "$cur"))
|
||||
COMPREPLY=($(compgen -W "-h --help" -- "$cur"))
|
||||
return ;;
|
||||
"init bastion standalone")
|
||||
COMPREPLY=($(compgen -W "start stop status -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"app labcontroller deploy")
|
||||
COMPREPLY=($(compgen -W "--user --port --crdb-replicas -h --help" -- "$cur"))
|
||||
COMPREPLY=($(compgen -W "--user --crdb-replicas -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"app labcontroller status")
|
||||
COMPREPLY=($(compgen -W "--user --port -h --help" -- "$cur"))
|
||||
COMPREPLY=($(compgen -W "--user -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"app k3s install")
|
||||
COMPREPLY=($(compgen -W "--role --user --port --k3s-server --k3s-token -h --help" -- "$cur"))
|
||||
COMPREPLY=($(compgen -W "--role --user --k3s-server --k3s-token -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"app k3s health")
|
||||
COMPREPLY=($(compgen -W "--user --port -h --help" -- "$cur"))
|
||||
COMPREPLY=($(compgen -W "--user -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"app k3s list")
|
||||
COMPREPLY=($(compgen -W "--user --port -h --help" -- "$cur"))
|
||||
COMPREPLY=($(compgen -W "--user -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"app k3s kubeconfig")
|
||||
COMPREPLY=($(compgen -W "--user --context --print -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"init bastion")
|
||||
COMPREPLY=($(compgen -W "standalone -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"provision list")
|
||||
COMPREPLY=($(compgen -W "--port -h --help" -- "$cur"))
|
||||
COMPREPLY=($(compgen -W "-h --help" -- "$cur"))
|
||||
return ;;
|
||||
"provision install")
|
||||
COMPREPLY=($(compgen -W "--role --os --disk --port -h --help" -- "$cur"))
|
||||
COMPREPLY=($(compgen -W "--role --os --disk -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"provision reprovision")
|
||||
COMPREPLY=($(compgen -W "--role --os --disk --port -h --help" -- "$cur"))
|
||||
COMPREPLY=($(compgen -W "--role --os --disk -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"provision debug")
|
||||
COMPREPLY=($(compgen -W "--pxe-boot -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"provision forget")
|
||||
COMPREPLY=($(compgen -W "--port -h --help" -- "$cur"))
|
||||
COMPREPLY=($(compgen -W "-h --help" -- "$cur"))
|
||||
return ;;
|
||||
"provision register")
|
||||
COMPREPLY=($(compgen -W "--role --ip -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"provision logs")
|
||||
COMPREPLY=($(compgen -W "-f --follow --port -h --help" -- "$cur"))
|
||||
COMPREPLY=($(compgen -W "-f --follow -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"provision makeiso")
|
||||
COMPREPLY=($(compgen -W "--arch --local --out -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"config list")
|
||||
COMPREPLY=($(compgen -W "-h --help" -- "$cur"))
|
||||
@@ -83,7 +95,7 @@ _labctl() {
|
||||
COMPREPLY=($(compgen -W "deploy status -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"app k3s")
|
||||
COMPREPLY=($(compgen -W "install health list -h --help" -- "$cur"))
|
||||
COMPREPLY=($(compgen -W "install health list kubeconfig -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"version")
|
||||
COMPREPLY=($(compgen -W "-h --help" -- "$cur"))
|
||||
@@ -92,7 +104,7 @@ _labctl() {
|
||||
COMPREPLY=($(compgen -W "bastion -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"provision")
|
||||
COMPREPLY=($(compgen -W "list install reprovision forget logs -h --help" -- "$cur"))
|
||||
COMPREPLY=($(compgen -W "list install reprovision debug forget register logs makeiso -h --help" -- "$cur"))
|
||||
return ;;
|
||||
"config")
|
||||
COMPREPLY=($(compgen -W "list get set path -h --help" -- "$cur"))
|
||||
|
||||
@@ -118,38 +118,40 @@ complete -c labctl -n "__labctl_in_cmd init bastion standalone start" -l foregro
|
||||
# init bastion standalone stop options
|
||||
complete -c labctl -n "__labctl_in_cmd init bastion standalone stop" -l dir -d 'Bastion data directory' -x
|
||||
|
||||
# init bastion standalone status options
|
||||
complete -c labctl -n "__labctl_in_cmd init bastion standalone status" -l dir -d 'Bastion data directory' -x
|
||||
complete -c labctl -n "__labctl_in_cmd init bastion standalone status" -l port -d 'Bastion HTTP port' -x
|
||||
|
||||
# provision subcommands
|
||||
complete -c labctl -n "__labctl_using_cmd provision" -a list -d 'List all known machines'
|
||||
complete -c labctl -n "__labctl_using_cmd provision" -a install -d 'Queue a discovered machine for OS installation'
|
||||
complete -c labctl -n "__labctl_using_cmd provision" -a reprovision -d 'Queue install + SSH reboot into PXE (target: hostname, MAC, or IP)'
|
||||
complete -c labctl -n "__labctl_using_cmd provision" -a debug -d 'PXE boot into Fedora rescue mode for debugging (target: hostname, MAC, or IP)'
|
||||
complete -c labctl -n "__labctl_using_cmd provision" -a forget -d 'Remove a machine from bastion state'
|
||||
complete -c labctl -n "__labctl_using_cmd provision" -a register -d 'Register an already-installed machine (e.g. after state loss)'
|
||||
complete -c labctl -n "__labctl_using_cmd provision" -a logs -d 'Show provisioning logs for a machine (hostname, MAC, or IP)'
|
||||
|
||||
# provision list options
|
||||
complete -c labctl -n "__labctl_in_cmd provision list" -l port -d 'Bastion HTTP port' -x
|
||||
complete -c labctl -n "__labctl_using_cmd provision" -a makeiso -d 'Generate a UEFI-bootable iPXE ISO for network provisioning'
|
||||
|
||||
# provision install options
|
||||
complete -c labctl -n "__labctl_in_cmd provision install" -l role -d 'Machine role (see below)' -xa 'vanilla worker infra labcontroller'
|
||||
complete -c labctl -n "__labctl_in_cmd provision install" -l os -d 'Operating system' -xa 'fedora-43 ubuntu-26.04'
|
||||
complete -c labctl -n "__labctl_in_cmd provision install" -l disk -d 'Target disk device (auto-detect if omitted)' -x
|
||||
complete -c labctl -n "__labctl_in_cmd provision install" -l port -d 'Bastion HTTP port' -x
|
||||
|
||||
# provision reprovision options
|
||||
complete -c labctl -n "__labctl_in_cmd provision reprovision" -l role -d 'Machine role (see below)' -xa 'vanilla worker infra labcontroller'
|
||||
complete -c labctl -n "__labctl_in_cmd provision reprovision" -l os -d 'Operating system' -xa 'fedora-43 ubuntu-26.04'
|
||||
complete -c labctl -n "__labctl_in_cmd provision reprovision" -l disk -d 'Target disk device (auto-detect if omitted)' -x
|
||||
complete -c labctl -n "__labctl_in_cmd provision reprovision" -l port -d 'Bastion HTTP port' -x
|
||||
|
||||
# provision forget options
|
||||
complete -c labctl -n "__labctl_in_cmd provision forget" -l port -d 'Bastion HTTP port' -x
|
||||
# provision debug options
|
||||
complete -c labctl -n "__labctl_in_cmd provision debug" -l pxe-boot -d 'Boot installed system via PXE (kernel+initrd from network, root from NVMe)'
|
||||
|
||||
# provision register options
|
||||
complete -c labctl -n "__labctl_in_cmd provision register" -l role -d 'Machine role' -xa 'vanilla worker infra labcontroller'
|
||||
complete -c labctl -n "__labctl_in_cmd provision register" -l ip -d 'Machine IP address' -x
|
||||
|
||||
# provision logs options
|
||||
complete -c labctl -n "__labctl_in_cmd provision logs" -s f -l follow -d 'Follow logs in real-time (SSE stream)'
|
||||
complete -c labctl -n "__labctl_in_cmd provision logs" -l port -d 'Bastion HTTP port' -x
|
||||
complete -c labctl -n "__labctl_in_cmd provision logs" -s f -l follow -d 'Follow log output in real-time'
|
||||
|
||||
# provision makeiso options
|
||||
complete -c labctl -n "__labctl_in_cmd provision makeiso" -l arch -d 'Target architecture(s)' -xa 'x86_64 aarch64'
|
||||
complete -c labctl -n "__labctl_in_cmd provision makeiso" -l local -d 'Build ISO locally instead of using bastion-hosted URL'
|
||||
complete -c labctl -n "__labctl_in_cmd provision makeiso" -l out -d 'Output path for local ISO build' -x
|
||||
|
||||
# config subcommands
|
||||
complete -c labctl -n "__labctl_using_cmd config" -a list -d 'Show all configuration values'
|
||||
@@ -173,30 +175,31 @@ complete -c labctl -n "__labctl_using_cmd app labcontroller" -a status -d 'Check
|
||||
|
||||
# app labcontroller deploy options
|
||||
complete -c labctl -n "__labctl_in_cmd app labcontroller deploy" -l user -d 'SSH user' -x
|
||||
complete -c labctl -n "__labctl_in_cmd app labcontroller deploy" -l port -d 'Bastion HTTP port' -x
|
||||
complete -c labctl -n "__labctl_in_cmd app labcontroller deploy" -l crdb-replicas -d 'CockroachDB replicas' -x
|
||||
|
||||
# app labcontroller status options
|
||||
complete -c labctl -n "__labctl_in_cmd app labcontroller status" -l user -d 'SSH user' -x
|
||||
complete -c labctl -n "__labctl_in_cmd app labcontroller status" -l port -d 'Bastion HTTP port' -x
|
||||
|
||||
# app k3s subcommands
|
||||
complete -c labctl -n "__labctl_using_cmd app k3s" -a install -d 'Install k3s on a target machine (hostname, IP, or MAC)'
|
||||
complete -c labctl -n "__labctl_using_cmd app k3s" -a health -d 'Check k3s health (all hosts if no target given)'
|
||||
complete -c labctl -n "__labctl_using_cmd app k3s" -a list -d 'List installed machines and their k3s status'
|
||||
complete -c labctl -n "__labctl_using_cmd app k3s" -a kubeconfig -d 'Fetch kubeconfig from a target and merge into ~/.kube/config'
|
||||
|
||||
# app k3s install options
|
||||
complete -c labctl -n "__labctl_in_cmd app k3s install" -l role -d 'k3s role: infra (server) or worker (agent)' -x
|
||||
complete -c labctl -n "__labctl_in_cmd app k3s install" -l user -d 'SSH user' -x
|
||||
complete -c labctl -n "__labctl_in_cmd app k3s install" -l port -d 'Bastion HTTP port (for resolving target)' -x
|
||||
complete -c labctl -n "__labctl_in_cmd app k3s install" -l k3s-server -d 'k3s server URL (required for worker role)' -x
|
||||
complete -c labctl -n "__labctl_in_cmd app k3s install" -l k3s-token -d 'k3s join token (required for worker role)' -x
|
||||
|
||||
# app k3s health options
|
||||
complete -c labctl -n "__labctl_in_cmd app k3s health" -l user -d 'SSH user' -x
|
||||
complete -c labctl -n "__labctl_in_cmd app k3s health" -l port -d 'Bastion HTTP port' -x
|
||||
|
||||
# app k3s list options
|
||||
complete -c labctl -n "__labctl_in_cmd app k3s list" -l user -d 'SSH user' -x
|
||||
complete -c labctl -n "__labctl_in_cmd app k3s list" -l port -d 'Bastion HTTP port' -x
|
||||
|
||||
# app k3s kubeconfig options
|
||||
complete -c labctl -n "__labctl_in_cmd app k3s kubeconfig" -l user -d 'SSH user' -x
|
||||
complete -c labctl -n "__labctl_in_cmd app k3s kubeconfig" -l context -d 'Context name (defaults to hostname)' -x
|
||||
complete -c labctl -n "__labctl_in_cmd app k3s kubeconfig" -l print -d 'Print kubeconfig to stdout instead of merging'
|
||||
|
||||
|
||||
431
bastion/docs/ARCHITECTURE.md
Normal file
431
bastion/docs/ARCHITECTURE.md
Normal file
@@ -0,0 +1,431 @@
|
||||
# Lab Platform Architecture
|
||||
|
||||
## Overview
|
||||
|
||||
A bare-metal and hybrid cloud infrastructure platform for automated machine provisioning, Kubernetes cluster management, and fleet operations. The platform discovers hardware via PXE boot, installs operating systems unattended, deploys k3s clusters, and provides centralized management through a CLI and API.
|
||||
|
||||
**Components:**
|
||||
- **bastion** -- PXE boot server (DHCP/TFTP/HTTP) for machine discovery and OS installation
|
||||
- **labd** -- Master daemon for multi-bastion aggregation, persistent state, agent management
|
||||
- **labctl** -- CLI tool for operators (kubectl-style interface)
|
||||
- **lab-agent** -- Daemon on provisioned servers for remote execution and monitoring
|
||||
- **modules** -- Declarative configuration system (k3s, labcontroller)
|
||||
|
||||
---
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
labctl (CLI)
|
||||
|
|
||||
labd (master daemon)
|
||||
/ | \
|
||||
bastion1 bastion2 ... (PXE provisioning)
|
||||
/ \ |
|
||||
[machines] [machines] (bare metal)
|
||||
| |
|
||||
lab-agent lab-agent (remote exec)
|
||||
```
|
||||
|
||||
### Communication Patterns
|
||||
|
||||
| Path | Protocol | Auth |
|
||||
|------|----------|------|
|
||||
| labctl -> labd | HTTP/HTTPS | mTLS cert (future: token) |
|
||||
| bastion -> labd | WebSocket | Join token enrollment |
|
||||
| lab-agent -> labd | WebSocket | mTLS certificate |
|
||||
| machine -> bastion | HTTP | None (local network) |
|
||||
| Anaconda -> bastion | HTTP + UDP syslog | None (install-time) |
|
||||
| labctl -> bastion | HTTP | None (standalone mode) |
|
||||
|
||||
### Standalone vs Centralized
|
||||
|
||||
The bastion can operate in two modes:
|
||||
|
||||
1. **Standalone** -- single bastion, state in local JSON file, CLI talks directly to bastion HTTP API
|
||||
2. **Centralized** -- bastion registers with labd via WebSocket, state aggregated in CockroachDB, CLI talks to labd which routes commands to the correct bastion
|
||||
|
||||
---
|
||||
|
||||
## Machine Lifecycle
|
||||
|
||||
```
|
||||
PXE boot
|
||||
|
|
||||
+--------v--------+
|
||||
| DISCOVERED | Hardware inventory collected
|
||||
+---------+-------+
|
||||
|
|
||||
labctl provision install
|
||||
|
|
||||
+---------v-------+
|
||||
| INSTALL_QUEUE | Waiting for next PXE boot
|
||||
+---------+-------+
|
||||
|
|
||||
PXE boot (Anaconda)
|
||||
|
|
||||
+---------v-------+
|
||||
| INSTALLING | Progress: partitioning -> packages -> post-install
|
||||
+---------+-------+
|
||||
|
|
||||
+---------v-------+
|
||||
| INSTALLED | OS ready, SSH accessible
|
||||
+---------+-------+
|
||||
|
|
||||
labctl app k3s install
|
||||
|
|
||||
+---------v-------+
|
||||
| K3S RUNNING | Kubernetes node operational
|
||||
+--------+--------+
|
||||
|
|
||||
labctl provision reprovision
|
||||
|
|
||||
(back to INSTALL_QUEUE)
|
||||
```
|
||||
|
||||
Side paths:
|
||||
- **DEBUG** -- `labctl provision debug` boots Anaconda rescue mode for diagnostics
|
||||
- **FORGET** -- `labctl provision forget` removes machine from all state
|
||||
|
||||
---
|
||||
|
||||
## Packages
|
||||
|
||||
### Monorepo Structure
|
||||
|
||||
TypeScript ESM monorepo with pnpm workspaces. Six packages:
|
||||
|
||||
| Package | Role | Key Tech |
|
||||
|---------|------|----------|
|
||||
| `@lab/shared` | Types, protocol, constants | - |
|
||||
| `@lab/bastion` | PXE server | Fastify, dnsmasq |
|
||||
| `@lab/cli` | CLI binary | Commander.js |
|
||||
| `@lab/labd` | Master daemon | Fastify, Prisma, CockroachDB |
|
||||
| `@lab/agent` | Server agent | WebSocket |
|
||||
| `@lab/modules` | Config modules | SSH, k8s-client |
|
||||
|
||||
### @lab/shared
|
||||
|
||||
Core type system shared by all packages.
|
||||
|
||||
**State Model:**
|
||||
```typescript
|
||||
interface BastionState {
|
||||
discovered: Record<MAC, HardwareInfo>
|
||||
install_queue: Record<MAC, InstallConfig>
|
||||
installed: Record<MAC, InstalledInfo>
|
||||
debug: Record<MAC, DebugConfig>
|
||||
}
|
||||
```
|
||||
|
||||
**Roles:**
|
||||
- `vanilla` -- OS only, no k3s, no cluster services
|
||||
- `worker` -- k3s agent + Longhorn storage (joins existing cluster)
|
||||
- `infra` -- k3s server + etcd (control plane node)
|
||||
- `labcontroller` -- infra + bastion + labd + CockroachDB (self-sufficient)
|
||||
|
||||
**OS Support:**
|
||||
- `fedora-43` -- Anaconda kickstart installer
|
||||
- `ubuntu-26.04` -- cloud-init autoinstall
|
||||
|
||||
**Protocol:** Discriminated union message types for WebSocket communication between agents, bastions, and labd. Type guards and parsers for runtime validation.
|
||||
|
||||
### @lab/bastion
|
||||
|
||||
PXE boot server that handles the physical provisioning lifecycle.
|
||||
|
||||
**Services:**
|
||||
- `StateManager` -- JSON file persistence with immutable update pattern
|
||||
- `SyslogListener` -- UDP syslog receiver (port 5514) for Anaconda install logs
|
||||
- `InstallLogBuffer` -- In-memory ring buffer + disk persistence per machine
|
||||
- `BastionConnection` -- WebSocket client to labd for centralized mode
|
||||
- dnsmasq management (spawn, config generation, proxy/full DHCP)
|
||||
- Network auto-detection (interface, IP, subnet, gateway)
|
||||
- ISO builder (xorriso + mtools for non-PXE machines)
|
||||
|
||||
**HTTP Routes:**
|
||||
|
||||
| Endpoint | Purpose |
|
||||
|----------|---------|
|
||||
| `GET /dispatch?mac=` | Dynamic iPXE script (discover/install/debug/local-boot) |
|
||||
| `GET /ks?mac=` | Per-machine Anaconda kickstart |
|
||||
| `GET /debug.ks` | Rescue mode kickstart |
|
||||
| `GET /debug-setup.sh` | nc listener setup script for rescue shell |
|
||||
| `GET /discover.ks` | Hardware discovery kickstart |
|
||||
| `POST /api/discover` | Hardware inventory report |
|
||||
| `POST /api/install` | Queue machine for install |
|
||||
| `POST /api/progress` | Install progress callback |
|
||||
| `POST /api/log` | Raw log line ingestion |
|
||||
| `POST /api/debug` | Queue debug/rescue mode |
|
||||
| `GET /api/machines` | List all machines |
|
||||
| `GET /api/logs/:mac` | Install logs + progress |
|
||||
| `GET /api/logs/:mac/follow` | SSE stream of progress events |
|
||||
| `DELETE /api/machines/:mac` | Forget machine |
|
||||
|
||||
**Templates:**
|
||||
- `boot.ipxe.ts` -- iPXE scripts for each boot mode (discover, install, debug, pxe-boot-debug, local-boot)
|
||||
- `install.ks.ts` -- Full Fedora kickstart with LVM, SSH, k3s prereqs, progress callbacks, SysRq keys
|
||||
- `debug.ks.ts` -- Minimal rescue kickstart (SSH via inst.sshd)
|
||||
- `ubuntu-autoinstall.ts` -- cloud-init for Ubuntu
|
||||
- `dnsmasq.conf.ts` -- DHCP/TFTP configuration
|
||||
|
||||
**Boot Dispatch Logic:**
|
||||
```
|
||||
1. debug[mac]? -> renderDebugIpxe (auto-clear after serving)
|
||||
2. install_queue[mac]? -> renderInstallIpxe
|
||||
3. installed[mac]? -> renderLocalBootIpxe (exit to disk)
|
||||
4. unknown -> renderDiscoverIpxe
|
||||
```
|
||||
|
||||
### @lab/labd
|
||||
|
||||
Central management daemon. Aggregates multiple bastions, stores persistent state in CockroachDB, relays commands, manages agent fleet.
|
||||
|
||||
**Database (Prisma + CockroachDB):**
|
||||
- `Server` -- hostname, MAC, IP, role, status, cloud, environment, labels
|
||||
- `Bastion` -- hostname, network, serverIp, lastHeartbeat
|
||||
- `Agent` -- certificate, enrollment, heartbeat
|
||||
- `Cluster` -- name, cloud, environment, kubeconfig (encrypted)
|
||||
- `User` / `Role` / `Permission` -- RBAC (action:cloud:env:server matrix)
|
||||
- `JoinToken` -- one-time/reusable enrollment tokens
|
||||
- `AuditLog` -- action, resource, result, timestamp
|
||||
|
||||
**Key Services:**
|
||||
- `BastionRegistry` -- in-memory registry of connected bastions, state aggregation, MAC-to-bastion routing
|
||||
- `AgentRegistry` -- connected agents, heartbeat tracking
|
||||
- `MessageRouter` -- command relay between CLI/agents and bastions
|
||||
|
||||
**Command Routing:**
|
||||
```
|
||||
CLI: labctl provision install <mac> <hostname>
|
||||
-> POST /api/machines/install
|
||||
-> labd finds bastion that knows this MAC
|
||||
-> WebSocket: {type: "command-install", mac, hostname, disk, role}
|
||||
-> bastion updates install_queue
|
||||
-> WebSocket: {type: "command-response", status: "ok"}
|
||||
-> HTTP response to CLI
|
||||
```
|
||||
|
||||
### @lab/cli (labctl)
|
||||
|
||||
Operator CLI. Commander.js binary, distributed as RPM/DEB or standalone bun-compiled executable.
|
||||
|
||||
**Command Groups:**
|
||||
|
||||
```
|
||||
labctl init bastion standalone start|stop|status
|
||||
labctl provision list|install|reprovision|forget|register|debug|logs|makeiso
|
||||
labctl app k3s install|health|list|kubeconfig
|
||||
labctl config list|get|set|path
|
||||
labctl login
|
||||
labctl doctor
|
||||
labctl roles
|
||||
```
|
||||
|
||||
**Key Features:**
|
||||
- Target resolution: hostname, MAC, or IP -> machine lookup
|
||||
- SSH reboot into PXE for reprovision/debug (efibootmgr --bootnext)
|
||||
- Follow mode: `labctl provision logs <target> -f` (5s polling)
|
||||
- Shell completions: bash, fish
|
||||
|
||||
### @lab/modules
|
||||
|
||||
Declarative configuration modules with three-phase lifecycle: install -> configure -> health.
|
||||
|
||||
**k3s Module:**
|
||||
- 5 operation groups: host-prep, networking, k3s-server, k3s-agent, hardening
|
||||
- 15+ individual operations: kernel modules, sysctl, firewall, Cilium CNI, SELinux, audit policy, pod security, cert checks
|
||||
- Health checks: service running, node ready, API health, pod status, Cilium status, secrets encryption
|
||||
- SSH execution backend with progress callbacks
|
||||
|
||||
### @lab/agent
|
||||
|
||||
Daemon that runs on provisioned servers. It connects to labd over WebSocket for:
|
||||
- Heartbeat (hostname, uptime, CPU/mem usage)
|
||||
- Command execution (with stdout/stderr streaming)
|
||||
- Log streaming (journalctl relay)
|
||||
- mTLS certificate enrollment and rotation
|
||||
|
||||
---
|
||||
|
||||
## Disk Layout
|
||||
|
||||
### LVM Partitioning (labvg)
|
||||
|
||||
All roles share a common LVM layout. The kickstart `%pre` auto-detects the install disk (NVMe preferred, then SATA, skipping USB/removable).
|
||||
|
||||
| Volume | Size | FS | Reprovision |
|
||||
|--------|------|-----|-------------|
|
||||
| `/boot/efi` | 600 MB | vfat | Reused |
|
||||
| `/boot` | 3 GB | ext4 | Reused |
|
||||
| `swap` | 27 GB | swap | Recreated |
|
||||
| `/` (root) | 33 GB | xfs | Recreated |
|
||||
| `/var` | 100 GB | xfs | Recreated |
|
||||
| `/var/log` | 10 GB | xfs | Recreated |
|
||||
| `/home` | 10 GB | xfs | **Preserved** |
|
||||
| `/srv` | 20 GB | xfs | **Preserved** |
|
||||
| `/var/lib/longhorn` | remaining | xfs | **Preserved** (worker) |
|
||||
| `/var/lib/rancher` | 20 GB | xfs | **Preserved** (infra) |
|
||||
| `/tmp` | 4 GB | tmpfs | - |
|
||||
|
||||
Reprovision detection: if `labvg` VG exists, reuse EFI/boot partitions and preserve data volumes.
|
||||
|
||||
---
|
||||
|
||||
## Kickstart Features
|
||||
|
||||
The Fedora kickstart template (`install.ks.ts`) includes:
|
||||
|
||||
- **Dynamic disk detection** -- `%pre` probes NVMe/SATA/virtio, skips USB/removable, supports both fresh install and reprovision
|
||||
- **Progress callbacks** -- a `curl -sf` POST to `/api/progress` at each stage (partitioning, post-install substeps, complete)
|
||||
- **Anaconda syslog forwarding** -- `logging --host --port` streams real-time install logs to bastion
|
||||
- **SSH hardening** -- key-only auth, root login via pubkey only, admin user with passwordless sudo
|
||||
- **Network-first boot order** -- `efibootmgr` reorders boot entries so PXE is always first (bastion controls every reboot)
|
||||
- **SysRq magic keys** -- `kernel.sysrq=1` for emergency reboot via KVM keyboard
|
||||
- **Role-specific setup:**
|
||||
- `vanilla`: chronyd only
|
||||
- `worker`/`infra`: kernel modules (br_netfilter, overlay), sysctl (ip_forward, inotify), firewalld disabled, k3s binary installed
|
||||
- `infra`: k3s server binary pre-installed
|
||||
|
||||
**What is NOT in the kickstart:**
|
||||
- `console=ttyS0` -- causes 30s-per-step boot timeout on hardware without physical serial UART (discovered 2026-03-30, see docs/pxe-boot-debugging-2026-03-30.md)
|
||||
- Background log streamer (`tail -f`) -- prevents Anaconda from syncing filesystem, causes %post writes to not persist
|
||||
|
||||
---
|
||||
|
||||
## Deployment
|
||||
|
||||
### Container Images
|
||||
|
||||
**bastion** (`Dockerfile.bastion`):
|
||||
- Base: Fedora 43 (needs dnsmasq, iPXE)
|
||||
- Multi-stage: Alpine build -> Fedora runtime
|
||||
- iPXE rebuilt from source (SNP driver for EFI)
|
||||
- hostNetwork in k8s (DHCP needs raw sockets)
|
||||
- Capabilities: NET_ADMIN, NET_RAW
|
||||
|
||||
**labd** (`Dockerfile.labd`):
|
||||
- Base: Alpine (minimal)
|
||||
- Multi-stage build with Prisma client generation
|
||||
- Runs as non-root `node` user
|
||||
|
||||
### Kubernetes (k3s)
|
||||
|
||||
```
|
||||
Namespace: lab-infra
|
||||
Deployment: bastion (hostNetwork, PVC for /data, host SSH keys)
|
||||
ConfigMap: bastion-config (env vars)
|
||||
Secret: bastion-join-token
|
||||
PVC: bastion-state (local-path)
|
||||
|
||||
Namespace: lab-system
|
||||
Deployment: labd
|
||||
Service: labd (NodePort 30100)
|
||||
StatefulSet: cockroachdb-0
|
||||
```
|
||||
|
||||
### CLI Distribution
|
||||
|
||||
Built with `nfpm` as RPM/DEB. Includes:
|
||||
- `/usr/bin/labctl` (bun-compiled standalone binary)
|
||||
- `/usr/share/bash-completion/completions/labctl`
|
||||
- `/usr/share/fish/vendor_completions.d/labctl.fish`
|
||||
|
||||
Config: `~/.labctl/config.yaml` with `labdUrl`, output format, default cloud/environment.
|
||||
|
||||
---
|
||||
|
||||
## Build & Release
|
||||
|
||||
```bash
|
||||
# Development
|
||||
pnpm install && pnpm build # Compile all packages
|
||||
pnpm test:run # Unit tests (vitest)
|
||||
npx tsc --noEmit # Type check
|
||||
|
||||
# Deploy
|
||||
bash scripts/deploy.sh all # Build containers + RPM, push, restart pods
|
||||
bash scripts/deploy.sh bastion # Just bastion
|
||||
bash scripts/deploy.sh labd # Just labd
|
||||
bash scripts/deploy.sh labctl # Just CLI (local RPM install)
|
||||
|
||||
# Container builds
|
||||
bash scripts/build-bastion.sh --platforms linux/amd64 --push latest
|
||||
bash scripts/build-labd.sh --platforms linux/amd64 --push latest
|
||||
bash scripts/build-rpm.sh # RPM + DEB packages
|
||||
|
||||
# Integration tests (require libvirt, sudo)
|
||||
sudo tests/integration/run-pxe-test.sh
|
||||
```
|
||||
|
||||
Registry: `mysources.co.uk` (Gitea at 10.0.0.194:3012)
|
||||
|
||||
---
|
||||
|
||||
## Testing
|
||||
|
||||
### Unit Tests
|
||||
- Kickstart rendering (ksvalidator syntax check, partition layout, role-specific sections)
|
||||
- State management (load, save, update, debug field)
|
||||
- Dispatch routing (correct iPXE script for each machine state)
|
||||
- Syslog listener (UDP receive, IP->MAC resolution, RFC 3164 parsing)
|
||||
|
||||
### Integration Tests (libvirt VMs)
|
||||
- **pxe-provision.test.ts** -- Full end-to-end: create VM -> PXE discovery -> queue install -> Anaconda install -> SSH verification -> systemd health -> SELinux enforcing -> boot order check
|
||||
- **iso-provision.test.ts** -- ISO boot for non-PXE machines
|
||||
- **k3s-single-node.test.ts** -- Post-provision k3s installation and health
|
||||
- VM screenshot capture during boot for debugging
|
||||
|
||||
---
|
||||
|
||||
## Security
|
||||
|
||||
- **mTLS** for agent-labd communication (certificate enrollment via join tokens)
|
||||
- **SSH key-only auth** on provisioned machines (no password auth)
|
||||
- **SELinux enforcing** verified in integration tests
|
||||
- **RBAC** (planned): action:cloud:environment:server permission matrix
|
||||
- **Audit logging** (planned): every mutation tracked in CockroachDB
|
||||
- **Network-first boot order** prevents machines from booting without bastion approval
|
||||
- **SysRq keys** enabled for emergency reboot without SSH access
|
||||
|
||||
---
|
||||
|
||||
## Known Issues & Lessons Learned
|
||||
|
||||
### Serial Console Boot Delay (2026-03-30)
|
||||
`console=ttyS0,115200n8` in the kernel cmdline causes a 30-second timeout at every systemd boot phase on hardware without a physical serial UART. Root cause: systemd blocks while writing to the non-existent UART. Fix: removed from the kickstart entirely.
|
||||
|
||||
### Anaconda %post Log Streamer
|
||||
Background `tail -f` in kickstart `%post` prevents Anaconda from syncing the filesystem. All file writes in %post appear to succeed but are lost on reboot. Fix: removed background log streamer, replaced with Anaconda's built-in `logging --host --port` syslog forwarding.
|
||||
|
||||
### Disk Auto-Detection
|
||||
Hardcoded `/dev/sda` default broke NVMe-only machines. Fix: default to empty string (auto-detect) which triggers the `%pre` disk probe logic.
|
||||
|
||||
### Anaconda Rescue Mode Limitations
|
||||
`%pre` and `%post` sections do not execute in `inst.rescue` mode. SSH in rescue mode is provided by Anaconda's `inst.sshd` kernel parameter + `sshpw` kickstart directive. The nc listener must be set up manually with `curl bastion:8080/debug-setup.sh | bash`.
|
||||
|
||||
---
|
||||
|
||||
## Planned Work (Taskmaster)
|
||||
|
||||
13 tasks in queue, all pending:
|
||||
|
||||
1. **#72** Expand Prisma schema with resource relationships (Network, ServerNic, ServerDisk, ClusterMember)
|
||||
2. **#73** State persistence service (bastion state -> CockroachDB)
|
||||
3. **#74** State loading from labd on bastion startup
|
||||
4. **#75** Fix bastion --dir env var default
|
||||
5. **#76** Resource type registry with aliases (kubectl-style)
|
||||
6. **#77** `labctl get <resource>` command
|
||||
7. **#78** `labctl describe <resource>` command
|
||||
8. **#79** `labctl create/delete` commands
|
||||
9. **#80** Refactor provision commands to kubectl-style
|
||||
10. **#81** Server and resource API endpoints in labd
|
||||
11. **#82** RBAC permission checks in CLI
|
||||
12. **#83** Audit logging for resource operations
|
||||
13. **#84** Update CLI entry point and help text
|
||||
|
||||
Additional items not in taskmaster:
|
||||
- Ubuntu autoinstall disk auto-detect (still defaults to /dev/sda)
|
||||
- Verify `inst.sshd` works end-to-end in rescue mode
|
||||
- k3s cluster join vs new cluster distinction in `labctl app k3s install`
|
||||
- arm64 container build (iPXE cross-compilation broken)
|
||||
103
bastion/docs/kickstart-reference.md
Normal file
103
bastion/docs/kickstart-reference.md
Normal file
@@ -0,0 +1,103 @@
|
||||
# Kickstart Reference — Lessons Learned
|
||||
|
||||
This documents pitfalls discovered during PXE boot testing. Read before modifying
|
||||
the kickstart template (`src/bastion/src/templates/install.ks.ts`).
|
||||
|
||||
## Package requirements
|
||||
|
||||
### `kernel-modules` is mandatory
|
||||
|
||||
`@core` only installs `kernel-modules-core`, which lacks common modules like `vfat`,
|
||||
`zram`, and many network/filesystem drivers. Without `kernel-modules`:
|
||||
|
||||
- `/boot/efi` (FAT32) cannot mount → `systemd-remount-fs` fails → **root stays
|
||||
read-only** → sshd-keygen can't write host keys → SSH unreachable
|
||||
- `zram-generator` fails → can trigger emergency mode
|
||||
|
||||
**Always include `kernel-modules` in %packages.** This matches what the real
|
||||
labmaster (192.168.8.11) has installed.
|
||||
|
||||
Regression introduced in commit `fac14b6` which removed `@server-product`
|
||||
(that group pulled in `kernel-modules` via `fedora-release-server`).
|
||||
|
||||
### `dosfstools` is needed
|
||||
|
||||
Provides `mkfs.vfat` and ensures FAT filesystem support is available. The real
|
||||
labmaster has it installed.
|
||||
|
||||
### Verify against the real machine
|
||||
|
||||
Before changing the package list, SSH to the labmaster and compare:
|
||||
```bash
|
||||
ssh 192.168.8.11 "rpm -q <package>"
|
||||
```
|
||||
|
||||
## Anaconda %post execution order
|
||||
|
||||
This is critical and not well documented:
|
||||
|
||||
1. `%pre` scripts run
|
||||
2. Disk partitioning and formatting
|
||||
3. Package installation
|
||||
4. **Anaconda writes system config (fstab, hostname, etc.)**
|
||||
5. `%post` scripts run (in chroot of installed system)
|
||||
6. `%post --nochroot` scripts run
|
||||
7. **Anaconda MAY overwrite fstab again after %post scripts**
|
||||
|
||||
**Consequence:** You cannot reliably modify `/etc/fstab` from `%post` or
|
||||
`%post --nochroot`. Anaconda overwrites it. Tested and confirmed — both
|
||||
`sed` in %post and %post --nochroot had no effect on the final fstab.
|
||||
|
||||
What DOES work from %post:
|
||||
- Writing files to `/etc/` (systemd units, config files, SSH keys)
|
||||
- Enabling/disabling systemd services
|
||||
- Installing additional packages
|
||||
- Running `systemctl enable/mask`
|
||||
|
||||
What does NOT work from %post:
|
||||
- Modifying `/etc/fstab` (Anaconda overwrites it)
|
||||
- `--fsoptions` on `part /boot/efi` (Anaconda ignores it for EFI partitions)
|
||||
|
||||
## UEFI / EFI partition
|
||||
|
||||
- Anaconda always creates an EFI System Partition for UEFI installs
|
||||
- The EFI partition is FAT32 — requires `vfat` kernel module to mount
|
||||
- If `/boot/efi` fails to mount, `systemd-remount-fs` fails, which leaves
|
||||
root as read-only. This cascades to break ALL services that need to write to disk
|
||||
- The EFI partition is used by firmware directly for bootloader — the OS
|
||||
doesn't strictly need it mounted, but Anaconda adds it to fstab
|
||||
|
||||
## VM-specific issues (libvirt/QEMU/OVMF)
|
||||
|
||||
### iPXE exit behavior
|
||||
- `exit` (no args) returns EFI_SUCCESS → OVMF retries PXE, never reaches disk
|
||||
- `exit 1` returns EFI_ABORTED → OVMF moves to next boot device (disk)
|
||||
- VM boot order needs both `network` and `hd`: `--boot=uefi,network,hd`
|
||||
|
||||
### nftables
|
||||
- libvirt creates reject rules for NAT networks in table `ip libvirt_network`
|
||||
(NOT `inet libvirt` — this wrong table name cost hours of debugging)
|
||||
- These rules block new host→VM connections (SSH)
|
||||
- Rules are recreated on every `virsh start` — must delete after each VM restart
|
||||
- Chains: `guest_input` and `guest_output`
|
||||
|
||||
### Serial console
|
||||
- VM serial port: `--serial=tcp,host=127.0.0.1:4555,mode=bind,protocol=telnet`
|
||||
- Use `virsh console <vm-name>` for interactive access (handles telnet protocol)
|
||||
- Raw `socat` works for reading but pagers/readline break interactive use
|
||||
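- Add `console=ttyS0,115200n8` to kernel args for boot output on serial (VMs only — on physical hardware without a serial UART this makes every boot phase stall for ~30 seconds)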
|
||||
|
||||
### SELinux on labmaster
|
||||
- Set to **permissive** — this is for k3s/kubernetes, NOT because SSH needs it
|
||||
- SSH works fine with SELinux enforcing on a properly installed Fedora system
|
||||
- The `ld.so.cache` AVC denials seen during debugging were caused by the
|
||||
read-only root filesystem, not by SELinux policy
|
||||
|
||||
## Testing checklist
|
||||
|
||||
Before merging kickstart changes:
|
||||
1. Check the real labmaster has the same packages: `ssh 192.168.8.11 "rpm -q <pkg>"`
|
||||
2. Run the PXE integration test: `sudo pnpm run test:integration:pxe`
|
||||
3. Verify via serial console (root / `lab-root-pw`) if SSH fails
|
||||
4. Check `mount | grep " / "` — must show `rw`, not `ro`
|
||||
5. Check `systemctl --failed` — no critical failures
|
||||
91
bastion/docs/pxe-boot-debugging-2026-03-30.md
Normal file
91
bastion/docs/pxe-boot-debugging-2026-03-30.md
Normal file
@@ -0,0 +1,91 @@
|
||||
# PXE Boot Debugging Session — 2026-03-30
|
||||
|
||||
## Problem
|
||||
Beelink SER Mini Pro (AMD Ryzen 7 255, Radeon 780M, 64GB DDR5, 1TB NVMe) boots Fedora 43 100x slower than normal after PXE kickstart install. Every systemd boot phase takes ~30 seconds. The Anaconda installer/rescue mode boots fast on the same hardware.
|
||||
|
||||
## Root Cause
|
||||
**`console=ttyS0,115200n8` in kernel cmdline** — added via kickstart `bootloader --append` during install.
|
||||
|
||||
This mini PC has **no physical serial UART**. When systemd writes to ttyS0, each log write blocks for ~30 seconds waiting for the non-existent UART hardware. Since systemd logs at every phase transition, the total boot time was 10+ minutes.
|
||||
|
||||
The Anaconda installer was unaffected because it uses a different init flow that doesn't go through the same systemd phase transitions.
|
||||
|
||||
## How We Found It
|
||||
Hours of systematic elimination:
|
||||
|
||||
| What we tried | Result | Ruled out |
|
||||
|---|---|---|
|
||||
| `modprobe.blacklist=amdgpu` | No change | GPU driver |
|
||||
| `amd_iommu=off` | No change | IOMMU |
|
||||
| Rebuild initramfs without plymouth/drm/fips | No change | Initramfs bloat |
|
||||
| systemd-boot instead of GRUB | Still slow | Bootloader |
|
||||
| PXE-boot kernel+initrd (skip local GRUB entirely) | Still slow | Local bootloader/firmware |
|
||||
| Disable TPM in BIOS | No change | TPM |
|
||||
| Remove `resume=` + resume dracut module | No change | Hibernate resume |
|
||||
| Manual LVM activation in rescue shell | **Fast** | NVMe/LVM themselves |
|
||||
| Remove `console=ttyS0,115200n8` from GRUB | **FAST BOOT** | **This was it** |
|
||||
|
||||
The key breakthrough was noticing the timestamps showed **exactly 30-second gaps** between boot phases — a timeout pattern, not general slowness. The next step was realising that the serial console had been added during install and that booting without it had never been tested.
|
||||
|
||||
## What Was Fixed (PR #4, merged)
|
||||
|
||||
### 1. Removed serial console from kickstart
|
||||
- Removed `console=ttyS0,115200n8` from `bootloader --append`
|
||||
- Removed `serial-getty@ttyS0.service` enablement
|
||||
- Removed rsyslog serial forwarding
|
||||
|
||||
### 2. Enabled Anaconda syslog forwarding
|
||||
- Uncommented `logging --host --port` directive in kickstart
|
||||
- Bastion's SyslogListener was already built — just needed IP→MAC resolution improvement
|
||||
- Added `registerIp()` calls from kickstart fetch and progress callbacks
|
||||
- Added syslog listener unit tests
|
||||
|
||||
### 3. Fixed disk auto-detection
|
||||
- Default disk changed from `/dev/sda` to `""` (auto-detect) in labd route and bastion command handler
|
||||
- The kickstart `%pre` auto-detect logic probes nvme0n1, sda, sdb, vda in order
|
||||
- Without this fix, NVMe-only machines (like the SER Mini Pro) fail immediately
|
||||
|
||||
### 4. SysRq magic keys
|
||||
- Added `kernel.sysrq=1` sysctl to kickstart `%post`
|
||||
- Enables Alt+SysRq+REISUB via JetKVM for emergency reboot of stuck machines
|
||||
|
||||
### 5. Simplified debug command
|
||||
- Removed `--sshd` flag (SSH always available via `inst.sshd` + `sshpw` in rescue mode)
|
||||
- Added `/debug-setup.sh` HTTP endpoint for nc listener setup from rescue shell
|
||||
- Cleaned up `sshd` field from DebugConfig, protocol types, all routes
|
||||
|
||||
### 6. Added `labctl provision logs -f`
|
||||
- Follow mode with 5-second polling for real-time install monitoring
|
||||
|
||||
## What Works
|
||||
|
||||
- **PXE discovery → install → boot** — full flow works end-to-end
|
||||
- **Anaconda syslog forwarding** — install logs stream to bastion
|
||||
- **Progress callbacks** — stage-by-stage install tracking via curl
|
||||
- **Auto disk detection** — works for NVMe and SATA
|
||||
- **Debug rescue mode** — `labctl provision debug <target>` boots Anaconda rescue with SSH
|
||||
- **Network-first boot order** — bastion controls every reboot via efibootmgr
|
||||
- **SysRq keys** — emergency reboot via JetKVM keyboard
|
||||
|
||||
## What Doesn't Work / Known Issues
|
||||
|
||||
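- **SSH in rescue mode (unverified)** — Anaconda rescue mode skips both the `%pre` and `%post` kickstart sections, so SSH cannot be set up from kickstart scripts. `inst.sshd` + `sshpw` should provide SSH access instead, but this hasn't been verified end-to-end yet. Until then, the `/debug-setup.sh` script (fetched with curl from the rescue shell) provides an nc listener as a workaround.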
|
||||
- **arm64 container build** — iPXE cross-compilation fails on arm64 (GCC flag incompatibility). Workaround: build with `--platforms linux/amd64` only.
|
||||
- **Integration test SSH timeout** — VM boots fine but SSH times out due to libvirt nftables reject rules after VM restart. Test infrastructure issue, not a code bug.
|
||||
|
||||
## What Was Skipped / Left To Do
|
||||
|
||||
1. **Syslog UDP port in k3s** — works because bastion uses `hostNetwork: true`, but should be documented properly
|
||||
2. **Background log streamer** — the old `tail -f` approach broke Anaconda filesystem sync. Replaced with syslog forwarding. If more granular %post logging is needed, a synchronous log push at end of %post would be safe.
|
||||
3. **Per-machine hardware overrides** — turned out not to be needed (serial console was the only "special" setting, and removing it is universal)
|
||||
4. **Ubuntu autoinstall disk default** — `ubuntu-autoinstall.ts` still has `disk || "/dev/sda"` fallback (line 38), should be changed to auto-detect
|
||||
5. **Verify `inst.sshd` works in rescue mode** — test SSH with password "debug" next time debug mode is used
|
||||
6. **Re-enable TPM in BIOS** — was disabled during debugging and should be re-enabled (the user plans to do this by resetting the BIOS to factory defaults)
|
||||
|
||||
## Key Learnings
|
||||
|
||||
1. **`console=ttyS0` on hardware without UART = 30s timeout per boot phase.** Never add serial console to kernel cmdline unless the hardware has a verified physical UART.
|
||||
2. **Exactly-N-second gaps in boot logs = timeout, not slowness.** Look for the timeout source, not performance issues.
|
||||
3. **The bisection approach works.** Systematically removing features one at a time found the root cause. But it took hours because the serial console was added early and seemed harmless.
|
||||
4. **Anaconda rescue mode is limited.** It skips `%pre` and `%post`, so you can't automate setup via kickstart. Use `inst.sshd` + `sshpw` for SSH, and serve helper scripts via HTTP for everything else.
|
||||
5. **Default disk paths break NVMe machines.** Always default to auto-detect (empty string) rather than `/dev/sda`.
|
||||
74
bastion/scripts/deploy.sh
Normal file
74
bastion/scripts/deploy.sh
Normal file
@@ -0,0 +1,74 @@
|
||||
#!/bin/bash
|
||||
# Deploy bastion + labd to k3s cluster and install labctl locally.
|
||||
# Usage: ./scripts/deploy.sh [bastion|labd|labctl|all]
|
||||
#
|
||||
# Builds container images with existing build scripts, pushes to Gitea
|
||||
# registry, restarts k3s pods, and builds/installs labctl RPM.
|
||||
# Strict mode: exit on error, on use of an unset variable, and on a failure
# anywhere in a pipeline.
set -o errexit -o nounset -o pipefail

# Resolve the directory holding this script, then its parent (the project
# root), and run everything else from the project root.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
cd "$PROJECT_DIR"

# Load .env if present. allexport is switched on only while sourcing so that
# every variable assigned in .env is exported to child processes.
if [ -f .env ]; then
  set -o allexport
  source .env
  set +o allexport
fi
|
||||
|
||||
# Run scripts/build-bastion.sh with `--push latest`, then restart the bastion
# deployment in the lab-infra namespace and wait up to 180s for the rollout.
deploy_bastion() {
  local namespace="lab-infra"
  local workload="deployment/bastion"

  echo "=== Building & pushing bastion image ==="
  bash scripts/build-bastion.sh --push latest

  echo
  echo "=== Restarting bastion pod ==="
  kubectl rollout restart "$workload" -n "$namespace"
  kubectl rollout status "$workload" -n "$namespace" --timeout=180s

  echo "✓ Bastion deployed"
}
|
||||
|
||||
# Run scripts/build-labd.sh with `--push latest`, then restart the labd
# deployment in the lab-system namespace and wait up to 180s for the rollout.
deploy_labd() {
  local namespace="lab-system"
  local workload="deployment/labd"

  echo "=== Building & pushing labd image ==="
  bash scripts/build-labd.sh --push latest

  echo
  echo "=== Restarting labd pod ==="
  kubectl rollout restart "$workload" -n "$namespace"
  kubectl rollout status "$workload" -n "$namespace" --timeout=180s

  echo "✓ Labd deployed"
}
|
||||
|
||||
# Build the labctl RPM with scripts/build-rpm.sh and install it locally.
#
# If no dist/labctl-*.x86_64.rpm exists after the build, fall back to a
# "dev mode" install: run `pnpm build` and write a small wrapper script to
# /usr/local/bin/labctl that executes the compiled CLI with node.
#
# Sets the global RPM_FILE to the RPM that was installed (empty when the
# fallback path was taken). Requires sudo for both install paths.
deploy_labctl() {
  echo "=== Building labctl RPM ==="
  bash scripts/build-rpm.sh
  echo ""
  echo "=== Installing labctl ==="

  # Pick the first matching RPM with a glob loop instead of `ls | head -1`.
  # With errexit and pipefail enabled (as this script does), a failing `ls`
  # inside the command substitution aborts the whole script when nothing
  # matches, which made the fallback branch below unreachable.
  RPM_FILE=""
  local candidate
  for candidate in dist/labctl-*.x86_64.rpm; do
    # An unmatched glob stays as the literal pattern, so check it exists.
    if [ -e "$candidate" ]; then
      RPM_FILE="$candidate"
      break
    fi
  done

  if [ -n "$RPM_FILE" ]; then
    sudo rpm -U --force "$RPM_FILE"
    echo "✓ labctl installed: $(labctl --version 2>/dev/null || echo 'installed')"
  else
    echo "WARNING: No RPM found, falling back to direct install"
    pnpm build
    # Feed the wrapper through stdin rather than a <(...) process
    # substitution: sudo closes file descriptors above 2 before running the
    # command, so the /dev/fd/N path would be unreadable by `install`.
    # %q shell-quotes the path so a project directory containing spaces or
    # other special characters still yields a valid wrapper.
    printf '#!/bin/bash\nexec node %q "$@"\n' "$PROJECT_DIR/src/cli/dist/index.js" \
      | sudo install -m 755 /dev/stdin /usr/local/bin/labctl
    echo "✓ labctl installed (dev mode)"
  fi
}
|
||||
|
||||
# Dispatch on the first argument; with no argument, deploy everything.
case "${1:-all}" in
  bastion | labd | labctl)
    # The argument is exactly the suffix of the matching deploy_* function.
    "deploy_$1"
    ;;
  all)
    deploy_bastion
    echo
    deploy_labd
    echo
    deploy_labctl
    ;;
  *)
    echo "Usage: $0 [bastion|labd|labctl|all]"
    exit 1
    ;;
esac

echo
echo "=== Deploy complete ==="
|
||||
@@ -14,6 +14,8 @@ export function loadConfig(overrides: Partial<BastionConfig> = {}): BastionConfi
|
||||
const dhcpRangeStart = overrides.dhcpRangeStart ?? process.env["DHCP_RANGE_START"] ?? "";
|
||||
const dhcpRangeEnd = overrides.dhcpRangeEnd ?? process.env["DHCP_RANGE_END"] ?? "";
|
||||
|
||||
const syslogPort = overrides.syslogPort ?? parseInt(process.env["SYSLOG_PORT"] ?? "5514", 10);
|
||||
|
||||
const ubuntuVersion = overrides.ubuntuVersion ?? process.env["UBUNTU_VERSION"] ?? "26.04";
|
||||
const ubuntuMirror = overrides.ubuntuMirror ?? process.env["UBUNTU_MIRROR"]
|
||||
?? `https://releases.ubuntu.com/${ubuntuVersion}`;
|
||||
@@ -43,6 +45,7 @@ export function loadConfig(overrides: Partial<BastionConfig> = {}): BastionConfi
|
||||
gateway: overrides.gateway ?? "",
|
||||
sshKeys: overrides.sshKeys ?? [],
|
||||
adminUser: overrides.adminUser ?? "",
|
||||
syslogPort,
|
||||
skipDnsmasq: overrides.skipDnsmasq,
|
||||
skipArtifacts: overrides.skipArtifacts,
|
||||
labdUrl: overrides.labdUrl ?? process.env["LABD_URL"],
|
||||
|
||||
@@ -220,10 +220,11 @@ export async function startBastion(overrides: Partial<BastionConfig> = {}): Prom
|
||||
openFirewall(config);
|
||||
}
|
||||
|
||||
// Start HTTP server
|
||||
const { app, state } = createApp(config);
|
||||
// Start HTTP server + syslog listener
|
||||
const { app, state, syslog } = createApp(config);
|
||||
await app.listen({ port: config.httpPort, host: "0.0.0.0" });
|
||||
logger.info(`HTTP server listening on :${config.httpPort}`);
|
||||
syslog.start();
|
||||
|
||||
// Start dnsmasq (unless skipped)
|
||||
if (config.skipDnsmasq !== true) {
|
||||
@@ -256,7 +257,7 @@ export async function startBastion(overrides: Partial<BastionConfig> = {}): Prom
|
||||
state.update((s) => {
|
||||
s.install_queue[msg.mac] = {
|
||||
hostname: msg.hostname,
|
||||
disk: msg.disk ?? "/dev/sda",
|
||||
disk: msg.disk ?? "",
|
||||
role: msg.role as import("@lab/shared").Role,
|
||||
os: msg.os as import("@lab/shared").OsId,
|
||||
queued_at: new Date().toISOString(),
|
||||
@@ -265,6 +266,22 @@ export async function startBastion(overrides: Partial<BastionConfig> = {}): Prom
|
||||
return { status: "ok", data: { mac: msg.mac, hostname: msg.hostname } };
|
||||
});
|
||||
|
||||
labdConn.onCommand("command-debug", async (msg) => {
|
||||
if (msg.type !== "command-debug") throw new Error("unexpected");
|
||||
const mac = msg.mac.toLowerCase();
|
||||
const pxeBoot = msg.pxeBoot ?? false;
|
||||
const currentState = state.load();
|
||||
const hostname =
|
||||
currentState.installed[mac]?.hostname ??
|
||||
currentState.install_queue[mac]?.hostname ??
|
||||
currentState.discovered[mac]?.product ??
|
||||
mac;
|
||||
state.update((s) => {
|
||||
s.debug[mac] = { hostname, queued_at: new Date().toISOString(), pxeBoot };
|
||||
});
|
||||
return { status: "ok", data: { mac, hostname } };
|
||||
});
|
||||
|
||||
labdConn.onCommand("command-forget", async (msg) => {
|
||||
if (msg.type !== "command-forget") throw new Error("unexpected");
|
||||
const mac = msg.mac.toLowerCase();
|
||||
@@ -272,10 +289,26 @@ export async function startBastion(overrides: Partial<BastionConfig> = {}): Prom
|
||||
delete s.discovered[mac];
|
||||
delete s.install_queue[mac];
|
||||
delete s.installed[mac];
|
||||
delete s.debug[mac];
|
||||
});
|
||||
return { status: "ok", data: { mac } };
|
||||
});
|
||||
|
||||
labdConn.onCommand("command-register", async (msg) => {
|
||||
if (msg.type !== "command-register") throw new Error("unexpected");
|
||||
const mac = msg.mac.toLowerCase();
|
||||
state.update((s) => {
|
||||
s.installed[mac] = {
|
||||
hostname: msg.hostname,
|
||||
role: msg.role,
|
||||
ip: msg.ip,
|
||||
installed_at: new Date().toISOString(),
|
||||
};
|
||||
});
|
||||
logger.info(`MACHINE REGISTERED: ${mac} -> ${msg.hostname} (${msg.role}) ip=${msg.ip}`);
|
||||
return { status: "ok", data: { mac, hostname: msg.hostname } };
|
||||
});
|
||||
|
||||
labdConn.onCommand("command-role-update", async (msg) => {
|
||||
if (msg.type !== "command-role-update") throw new Error("unexpected");
|
||||
const mac = msg.mac.toLowerCase();
|
||||
@@ -310,6 +343,7 @@ export async function startBastion(overrides: Partial<BastionConfig> = {}): Prom
|
||||
// Graceful shutdown
|
||||
const shutdown = async (): Promise<void> => {
|
||||
logger.info("Shutting down...");
|
||||
syslog.stop();
|
||||
if (labdConn) labdConn.close();
|
||||
if (config.skipDnsmasq !== true) stopDnsmasq();
|
||||
closeFirewall(config);
|
||||
|
||||
@@ -13,11 +13,13 @@ import { triggerPostProvisionK3s } from "../services/post-provision.js";
|
||||
import { progressBus } from "../services/progress-events.js";
|
||||
import type { ProgressEvent } from "../services/progress-events.js";
|
||||
import type { InstallLogBuffer } from "../services/install-log.js";
|
||||
import type { SyslogListener } from "../services/syslog-listener.js";
|
||||
|
||||
export function registerApiRoutes(
|
||||
app: FastifyInstance,
|
||||
state: StateManager,
|
||||
installLog: InstallLogBuffer,
|
||||
syslog: SyslogListener,
|
||||
): void {
|
||||
// List all machines
|
||||
app.get("/api/machines", async (_request, reply) => {
|
||||
@@ -84,6 +86,11 @@ export function registerApiRoutes(
|
||||
const { mac: rawMac, stage, detail } = request.body ?? {};
|
||||
const mac = (rawMac ?? "unknown").toLowerCase();
|
||||
const stageName = stage ?? "unknown";
|
||||
|
||||
// Register IP → MAC for syslog routing
|
||||
if (mac !== "unknown") {
|
||||
syslog.registerIp(request.ip, mac);
|
||||
}
|
||||
const detailStr = detail ?? "";
|
||||
|
||||
const GREEN = "\x1b[0;32m";
|
||||
@@ -189,6 +196,32 @@ export function registerApiRoutes(
|
||||
return reply.send({ status: "ok", lines: allLines.length });
|
||||
});
|
||||
|
||||
// Queue debug/rescue mode for a machine
|
||||
app.post<{
|
||||
Body: { mac?: string; pxeBoot?: boolean };
|
||||
}>("/api/debug", async (request, reply) => {
|
||||
const mac = (request.body?.mac ?? "").toLowerCase().replace(/-/g, ":");
|
||||
const pxeBoot = request.body?.pxeBoot ?? false;
|
||||
if (mac === "") {
|
||||
return reply.status(400).send({ error: "mac is required" });
|
||||
}
|
||||
|
||||
// Look up hostname from installed or discovered state
|
||||
const currentState = state.load();
|
||||
const hostname =
|
||||
currentState.installed[mac]?.hostname ??
|
||||
currentState.install_queue[mac]?.hostname ??
|
||||
currentState.discovered[mac]?.product ??
|
||||
mac;
|
||||
|
||||
state.update((s) => {
|
||||
s.debug[mac] = { hostname, queued_at: new Date().toISOString(), pxeBoot };
|
||||
});
|
||||
|
||||
logger.info(`DEBUG QUEUED: ${mac} -> ${hostname}`);
|
||||
return reply.send({ status: "ok", mac, hostname });
|
||||
});
|
||||
|
||||
// Delete a machine from all state
|
||||
app.delete<{
|
||||
Params: { mac: string };
|
||||
@@ -213,6 +246,10 @@ export function registerApiRoutes(
|
||||
delete s.installed[mac];
|
||||
found = true;
|
||||
}
|
||||
if (s.debug[mac] !== undefined) {
|
||||
delete s.debug[mac];
|
||||
found = true;
|
||||
}
|
||||
});
|
||||
|
||||
if (!found) {
|
||||
@@ -278,6 +315,50 @@ export function registerApiRoutes(
|
||||
return reply.send({ status: "ok", mac, new: isNew });
|
||||
});
|
||||
|
||||
// Register an already-installed machine (e.g. re-add after state loss)
|
||||
app.post<{
|
||||
Body: {
|
||||
mac?: string;
|
||||
hostname?: string;
|
||||
role?: string;
|
||||
ip?: string;
|
||||
};
|
||||
}>("/api/register", async (request, reply) => {
|
||||
const { mac: rawMac, hostname, role, ip } = request.body ?? {};
|
||||
const mac = (rawMac ?? "").toLowerCase().replace(/-/g, ":");
|
||||
|
||||
if (mac === "") {
|
||||
return reply.status(400).send({ error: "mac is required" });
|
||||
}
|
||||
if (!hostname) {
|
||||
return reply.status(400).send({ error: "hostname is required" });
|
||||
}
|
||||
|
||||
const validRole = role ?? "worker";
|
||||
if (!(SUPPORTED_ROLES as readonly string[]).includes(validRole)) {
|
||||
return reply.status(400).send({ error: `invalid role: '${validRole}'. Supported: ${SUPPORTED_ROLES.join(", ")}` });
|
||||
}
|
||||
|
||||
state.update((s) => {
|
||||
s.installed[mac] = {
|
||||
hostname,
|
||||
role: validRole,
|
||||
ip: ip ?? "",
|
||||
installed_at: new Date().toISOString(),
|
||||
};
|
||||
});
|
||||
|
||||
logger.info(`MACHINE REGISTERED: ${mac} -> hostname=${hostname} role=${validRole} ip=${ip ?? ""}`);
|
||||
|
||||
return reply.send({
|
||||
status: "registered",
|
||||
mac,
|
||||
hostname,
|
||||
role: validRole,
|
||||
ip: ip ?? "",
|
||||
});
|
||||
});
|
||||
|
||||
// Update a machine's role (e.g. promote infra -> labcontroller)
|
||||
app.post<{
|
||||
Body: {
|
||||
|
||||
@@ -10,9 +10,12 @@ import type { StateManager } from "../services/state.js";
|
||||
import {
|
||||
renderDiscoverIpxe,
|
||||
renderInstallIpxe,
|
||||
renderDebugIpxe,
|
||||
renderPxeBootDebugIpxe,
|
||||
renderLocalBootIpxe,
|
||||
} from "../templates/boot.ipxe.js";
|
||||
import { renderUbuntuInstallIpxe } from "../templates/ubuntu-boot.ipxe.js";
|
||||
import { renderDebugKickstart } from "../templates/debug.ks.js";
|
||||
import { logger } from "../services/logger.js";
|
||||
|
||||
export function registerDispatchRoutes(
|
||||
@@ -20,10 +23,76 @@ export function registerDispatchRoutes(
|
||||
config: BastionConfig,
|
||||
state: StateManager,
|
||||
): void {
|
||||
// Serve debug/rescue kickstart (minimal: SSH keys + network for inst.sshd)
|
||||
app.get<{ Querystring: { mac?: string } }>("/debug.ks", async (_request, reply) => {
|
||||
const ks = renderDebugKickstart({
|
||||
sshKeys: config.sshKeys ?? [],
|
||||
serverIp: config.serverIp,
|
||||
httpPort: config.httpPort,
|
||||
});
|
||||
return reply.type("text/plain").send(ks);
|
||||
});
|
||||
|
||||
// Shell script for manual debug setup (nc listener + IP reporting)
|
||||
// Usage from rescue shell: curl http://bastion:port/debug-setup.sh | bash
|
||||
app.get("/debug-setup.sh", async (_request, reply) => {
|
||||
const script = `#!/bin/bash
|
||||
# Lab Bastion debug setup — run from rescue shell
|
||||
set -x
|
||||
|
||||
IP_ADDR=$(ip -4 addr show | awk '/inet / && !/127.0.0/ {split($2,a,"/"); print a[1]; exit}')
|
||||
MAC_ADDR=$(ip link show | awk '/ether/ && !/00:00:00:00/ {print $2; exit}')
|
||||
|
||||
# Start persistent nc listener for remote shell
|
||||
(while true; do nc -l -p 2323 -e /bin/bash 2>/dev/null; done) &
|
||||
echo "nc shell listener on port 2323"
|
||||
|
||||
# Report IP to bastion
|
||||
curl -sf -X POST "http://${config.serverIp}:${config.httpPort}/api/progress" \\
|
||||
-H "Content-Type: application/json" \\
|
||||
-d "{\\"mac\\":\\"$MAC_ADDR\\",\\"stage\\":\\"debug-ready\\",\\"detail\\":\\"nc $IP_ADDR 2323\\"}" 2>/dev/null || true
|
||||
|
||||
echo ""
|
||||
echo "=== Debug environment ready ==="
|
||||
echo " nc $IP_ADDR 2323 (remote shell)"
|
||||
echo " ssh root@$IP_ADDR (password: debug)"
|
||||
echo "==============================="
|
||||
`;
|
||||
return reply.type("text/plain").send(script);
|
||||
});
|
||||
|
||||
app.get<{ Querystring: { mac?: string } }>("/dispatch", async (request, reply) => {
|
||||
const mac = (request.query.mac ?? "").toLowerCase().replace(/-/g, ":");
|
||||
const currentState = state.load();
|
||||
|
||||
// Debug mode takes highest priority — auto-clear after serving once
|
||||
const debugEntry = currentState.debug[mac];
|
||||
if (debugEntry) {
|
||||
const hostname = debugEntry.hostname ?? "debug";
|
||||
state.update((s) => { delete s.debug[mac]; });
|
||||
|
||||
let script: string;
|
||||
if (debugEntry.pxeBoot) {
|
||||
logger.info(`PXE BOOT DEBUG: ${mac} -> ${hostname} (kernel+initrd from PXE, root from NVMe)`);
|
||||
script = renderPxeBootDebugIpxe({
|
||||
mac,
|
||||
hostname,
|
||||
serverIp: config.serverIp,
|
||||
httpPort: config.httpPort,
|
||||
});
|
||||
} else {
|
||||
logger.info(`DEBUG BOOT: ${mac} -> ${hostname} (rescue mode)`);
|
||||
script = renderDebugIpxe({
|
||||
mac,
|
||||
hostname,
|
||||
serverIp: config.serverIp,
|
||||
httpPort: config.httpPort,
|
||||
fedoraMirror: config.fedoraMirror,
|
||||
});
|
||||
}
|
||||
return reply.type("text/plain").send(script);
|
||||
}
|
||||
|
||||
const queueEntry = currentState.install_queue[mac];
|
||||
if (queueEntry) {
|
||||
const hostname = queueEntry.hostname ?? "lab-node";
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
import type { FastifyInstance } from "fastify";
|
||||
import type { BastionConfig } from "@lab/shared";
|
||||
import type { StateManager } from "../services/state.js";
|
||||
import type { SyslogListener } from "../services/syslog-listener.js";
|
||||
import { generateInstallKickstart, generateDiscoverKickstart } from "../services/kickstart-generator.js";
|
||||
import { renderUbuntuAutoinstall, renderUbuntuMetaData, type UbuntuAutoinstallParams } from "../templates/ubuntu-autoinstall.js";
|
||||
|
||||
@@ -12,6 +13,7 @@ export function registerKickstartRoutes(
|
||||
app: FastifyInstance,
|
||||
config: BastionConfig,
|
||||
state: StateManager,
|
||||
syslog: SyslogListener,
|
||||
): void {
|
||||
// Per-MAC install kickstart
|
||||
app.get<{ Querystring: { mac?: string } }>("/ks", async (request, reply) => {
|
||||
@@ -19,6 +21,11 @@ export function registerKickstartRoutes(
|
||||
const currentState = state.load();
|
||||
const queueEntry = currentState.install_queue[mac];
|
||||
|
||||
// Register IP → MAC so syslog listener can route Anaconda logs
|
||||
if (mac) {
|
||||
syslog.registerIp(request.ip, mac);
|
||||
}
|
||||
|
||||
const ks = generateInstallKickstart(config, {
|
||||
hostname: queueEntry?.hostname ?? "lab-node",
|
||||
disk: queueEntry?.disk ?? "",
|
||||
|
||||
@@ -6,13 +6,14 @@ import { mkdirSync, existsSync } from "node:fs";
|
||||
import type { BastionConfig } from "@lab/shared";
|
||||
import { StateManager } from "./services/state.js";
|
||||
import { InstallLogBuffer } from "./services/install-log.js";
|
||||
import { SyslogListener } from "./services/syslog-listener.js";
|
||||
import { logger } from "./services/logger.js";
|
||||
import { registerDispatchRoutes } from "./routes/dispatch.js";
|
||||
import { registerKickstartRoutes } from "./routes/kickstart.js";
|
||||
import { registerApiRoutes } from "./routes/api.js";
|
||||
|
||||
|
||||
export function createApp(config: BastionConfig): { app: ReturnType<typeof Fastify>; state: StateManager; installLog: InstallLogBuffer } {
|
||||
export function createApp(config: BastionConfig): { app: ReturnType<typeof Fastify>; state: StateManager; installLog: InstallLogBuffer; syslog: SyslogListener } {
|
||||
const app = Fastify({
|
||||
logger: false, // We use winston instead
|
||||
});
|
||||
@@ -21,6 +22,7 @@ export function createApp(config: BastionConfig): { app: ReturnType<typeof Fasti
|
||||
state.init();
|
||||
|
||||
const installLog = new InstallLogBuffer(config.bastionDir);
|
||||
const syslog = new SyslogListener(config.syslogPort, installLog, state);
|
||||
|
||||
// Serve static files (vmlinuz, initrd.img, iPXE binaries) from the HTTP directory
|
||||
mkdirSync(config.httpDir, { recursive: true });
|
||||
@@ -41,8 +43,8 @@ export function createApp(config: BastionConfig): { app: ReturnType<typeof Fasti
|
||||
|
||||
// Register route handlers
|
||||
registerDispatchRoutes(app, config, state);
|
||||
registerKickstartRoutes(app, config, state);
|
||||
registerApiRoutes(app, state, installLog);
|
||||
registerKickstartRoutes(app, config, state, syslog);
|
||||
registerApiRoutes(app, state, installLog, syslog);
|
||||
// boot.iso is generated at startup and served as a static file from httpDir
|
||||
// (static serving supports HTTP Range requests, required by JetKVM streaming)
|
||||
|
||||
@@ -51,7 +53,7 @@ export function createApp(config: BastionConfig): { app: ReturnType<typeof Fasti
|
||||
logger.info(`HTTP: ${request.ip} ${request.method} ${request.url}`);
|
||||
});
|
||||
|
||||
return { app, state, installLog };
|
||||
return { app, state, installLog, syslog };
|
||||
}
|
||||
|
||||
export async function startServer(config: BastionConfig): Promise<void> {
|
||||
|
||||
@@ -36,6 +36,7 @@ export function generateInstallKickstart(
|
||||
locale: config.locale,
|
||||
serverIp: config.serverIp,
|
||||
httpPort: config.httpPort,
|
||||
syslogPort: config.syslogPort,
|
||||
sshKeys: config.sshKeys,
|
||||
adminUser: config.adminUser,
|
||||
};
|
||||
|
||||
@@ -164,6 +164,8 @@ export class BastionConnection {
|
||||
case "command-install":
|
||||
case "command-forget":
|
||||
case "command-role-update":
|
||||
case "command-debug":
|
||||
case "command-register":
|
||||
void this.handleCommand(msg);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -11,6 +11,7 @@ const EMPTY_STATE: BastionState = {
|
||||
discovered: {},
|
||||
install_queue: {},
|
||||
installed: {},
|
||||
debug: {},
|
||||
};
|
||||
|
||||
export type StateChangeListener = (state: BastionState) => void;
|
||||
@@ -33,6 +34,7 @@ export class StateManager {
|
||||
discovered: parsed.discovered ?? {},
|
||||
install_queue: parsed.install_queue ?? {},
|
||||
installed: parsed.installed ?? {},
|
||||
debug: parsed.debug ?? {},
|
||||
};
|
||||
} catch {
|
||||
return { ...EMPTY_STATE };
|
||||
|
||||
108
bastion/src/bastion/src/services/syslog-listener.ts
Normal file
108
bastion/src/bastion/src/services/syslog-listener.ts
Normal file
@@ -0,0 +1,108 @@
|
||||
// UDP syslog listener for receiving Anaconda install logs.
|
||||
// Anaconda's `logging --host` sends RFC 3164 syslog over UDP.
|
||||
// We parse the messages and route them to InstallLogBuffer.
|
||||
|
||||
import { createSocket, type Socket } from "node:dgram";
|
||||
import type { InstallLogBuffer } from "./install-log.js";
|
||||
import type { StateManager } from "./state.js";
|
||||
import { logger } from "./logger.js";
|
||||
|
||||
/**
 * Parse a BSD syslog (RFC 3164) message into its program tag and message text.
 *
 * Accepted layout: `<PRI>TIMESTAMP [HOSTNAME] APP[PID]: MESSAGE`.
 * The HOSTNAME field is optional, so both of these parse to program "anaconda":
 *   <13>Mar 28 19:32:01 anaconda[1234]: some message
 *   <13>Mar 28 19:32:01 localhost anaconda[1234]: some message
 *
 * @param raw - One syslog line, with or without the leading `<PRI>` field.
 * @returns The program name (without the `[PID]` suffix) and the message body.
 *   When the line does not match the layout above, `program` is "unknown" and
 *   `message` is the whole line minus the priority prefix, trimmed.
 */
function parseSyslogLine(raw: string): { program: string; message: string } {
  // Strip priority: <NN>
  const noPri = raw.replace(/^<\d+>/, "");

  // "Mon DD HH:MM:SS [HOSTNAME] PROGRAM[PID]: MESSAGE"
  // The optional hostname token must not end in ":"; otherwise a host-less
  // "PROGRAM[PID]:" tag would be consumed as the hostname and the first
  // "word:" inside the message would be reported as the program.
  const match = noPri.match(
    /^\w+\s+\d+\s+[\d:]+\s+(?:\S*[^\s:]\s+)?(\S+?)(?:\[\d+\])?:\s*(.*)/,
  );
  if (match?.[1] && match[2] !== undefined) {
    return { program: match[1], message: match[2] };
  }

  // Fallback: just return the whole line
  return { program: "unknown", message: noPri.trim() };
}
|
||||
|
||||
/**
 * True when `text` mentions `ip` as a complete IPv4 address rather than as a
 * fragment of a longer one (e.g. "10.0.0.1" must not match "10.0.0.12").
 */
function mentionsIp(text: string, ip: string): boolean {
  if (ip === "") return false;
  const escaped = ip.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // Reject a match that is preceded by a digit/dot or followed by a digit
  // (or by ".<digit>"); a sentence-ending "." after the address is fine.
  return new RegExp(`(?<![\\d.])${escaped}(?!\\d|\\.\\d)`).test(text);
}

/**
 * UDP listener that receives syslog datagrams, attributes each one to a
 * machine (MAC address) via its source IP, and appends the parsed line to
 * the install log buffer for that machine.
 */
export class SyslogListener {
  private socket: Socket | null = null;
  private port: number;
  private installLog: InstallLogBuffer;
  private state: StateManager;
  /** Explicit IP → MAC mapping registered from kickstart/progress requests. */
  private ipToMac = new Map<string, string>();

  /**
   * @param port - UDP port to bind when `start()` is called.
   * @param installLog - Buffer that receives the parsed log lines.
   * @param state - State manager used to map source IPs to machines.
   */
  constructor(port: number, installLog: InstallLogBuffer, state: StateManager) {
    this.port = port;
    this.installLog = installLog;
    this.state = state;
  }

  /** Register an IP → MAC mapping (called when we learn a machine's IP). */
  registerIp(ip: string, mac: string): void {
    this.ipToMac.set(ip, mac.toLowerCase());
  }

  /**
   * Resolve a source IP to a MAC address.
   *
   * Lookup order: explicit `registerIp` mapping, then install-queue entries
   * whose `progress_detail` mentions the IP, then installed machines whose
   * recorded `ip` equals it. Returns null when nothing matches.
   */
  private resolveIpToMac(ip: string): string | null {
    // Check explicit mapping first (most reliable)
    const explicit = this.ipToMac.get(ip);
    if (explicit) return explicit;

    const currentState = this.state.load();

    // Check install queue — machines being installed have an IP from DHCP.
    // Match the address as a whole token so 10.0.0.1 is not confused with
    // 10.0.0.12 appearing in another machine's progress detail.
    for (const [mac, entry] of Object.entries(currentState.install_queue)) {
      const detail = entry.progress_detail;
      if (typeof detail === "string" && mentionsIp(detail, ip)) return mac;
    }

    // Check installed machines
    for (const [mac, info] of Object.entries(currentState.installed)) {
      if (info.ip === ip) return mac;
    }

    return null;
  }

  /** Resolve a MAC to the hostname from install queue or installed state. */
  private resolveHostname(mac: string): string {
    const s = this.state.load();
    return s.install_queue[mac]?.hostname ?? s.installed[mac]?.hostname ?? mac;
  }

  /**
   * Create the UDP socket, attach handlers, and bind on all IPv4 interfaces.
   * Calling it again while a socket is already open is a no-op, so the
   * existing socket is never orphaned.
   */
  start(): void {
    if (this.socket) return;

    const socket = createSocket("udp4");
    this.socket = socket;

    socket.on("message", (msg, rinfo) => {
      const raw = msg.toString("utf-8").trim();
      if (!raw) return;

      const { program, message } = parseSyslogLine(raw);
      const mac = this.resolveIpToMac(rinfo.address);

      // Datagrams whose source IP cannot be mapped to a machine are dropped:
      // there is no MAC to file them under in the install log buffer.
      if (!mac) return;

      const hostname = this.resolveHostname(mac);
      const line = program !== "unknown" ? `[${program}] ${message}` : message;
      this.installLog.append(mac, [line], hostname);
    });

    socket.on("error", (err) => {
      logger.error(`Syslog listener error: ${err.message}`);
    });

    socket.bind(this.port, "0.0.0.0", () => {
      logger.info(`Syslog listener on UDP :${this.port}`);
    });
  }

  /** Close the socket if one is open; safe to call repeatedly. */
  stop(): void {
    if (this.socket) {
      this.socket.close();
      this.socket = null;
    }
  }
}
|
||||
@@ -42,7 +42,7 @@ echo Collecting hardware info...
|
||||
echo =============================================
|
||||
echo
|
||||
|
||||
kernel http://${params.serverIp}:${params.httpPort}/vmlinuz inst.ks=http://${params.serverIp}:${params.httpPort}/discover.ks inst.stage2=${params.fedoraMirror} inst.text
|
||||
kernel http://${params.serverIp}:${params.httpPort}/vmlinuz inst.ks=http://${params.serverIp}:${params.httpPort}/discover.ks inst.stage2=${params.fedoraMirror} inst.text nomodeset
|
||||
initrd http://${params.serverIp}:${params.httpPort}/initrd.img
|
||||
boot
|
||||
`;
|
||||
@@ -69,7 +69,62 @@ echo MAC: ${params.mac}
|
||||
echo =============================================
|
||||
echo
|
||||
|
||||
kernel http://${params.serverIp}:${params.httpPort}/vmlinuz inst.ks=http://${params.serverIp}:${params.httpPort}/ks?mac=${params.mac} inst.repo=${params.fedoraMirror} inst.text
|
||||
kernel http://${params.serverIp}:${params.httpPort}/vmlinuz inst.ks=http://${params.serverIp}:${params.httpPort}/ks?mac=${params.mac} inst.repo=${params.fedoraMirror} inst.text nomodeset
|
||||
initrd http://${params.serverIp}:${params.httpPort}/initrd.img
|
||||
boot
|
||||
`;
|
||||
}
|
||||
|
||||
/**
 * Build the iPXE script that boots a machine into the Fedora installer's
 * rescue mode (`inst.rescue`) with SSH enabled (`inst.sshd`), pointing the
 * installer at the bastion's `debug.ks` kickstart for the given MAC.
 *
 * @param params - Target identity (`mac`, `hostname`), the bastion HTTP
 *   endpoint (`serverIp`, `httpPort`) and the `fedoraMirror` used for stage2.
 * @returns The complete iPXE script text, terminated by a newline.
 */
export function renderDebugIpxe(params: {
  mac: string;
  hostname: string;
  serverIp: string;
  httpPort: number;
  fedoraMirror: string;
}): string {
  const { mac, hostname, serverIp, httpPort, fedoraMirror } = params;
  const bastionUrl = `http://${serverIp}:${httpPort}`;
  const banner = "echo =============================================";

  const kernelArgs = [
    "inst.rescue",
    "inst.text",
    "inst.sshd",
    `inst.ks=${bastionUrl}/debug.ks?mac=${mac}`,
    `inst.stage2=${fedoraMirror}`,
  ].join(" ");

  const scriptLines = [
    "#!ipxe",
    "",
    "echo",
    banner,
    "echo Lab PXE Bastion - DEBUG/RESCUE MODE",
    `echo Target: ${hostname}`,
    `echo MAC: ${mac}`,
    banner,
    "echo",
    "",
    `kernel ${bastionUrl}/vmlinuz ${kernelArgs}`,
    `initrd ${bastionUrl}/initrd.img`,
    "boot",
    "", // trailing newline
  ];
  return scriptLines.join("\n");
}
|
||||
|
||||
/**
|
||||
* iPXE script for PXE-boot debug mode -- boots the installed system's root
|
||||
* filesystem using the bastion's PXE kernel+initrd instead of local GRUB.
|
||||
* Workaround for UEFI firmware bugs that make local disk boot slow.
|
||||
*/
|
||||
export function renderPxeBootDebugIpxe(params: {
|
||||
mac: string;
|
||||
hostname: string;
|
||||
serverIp: string;
|
||||
httpPort: number;
|
||||
}): string {
|
||||
return `#!ipxe
|
||||
|
||||
echo
|
||||
echo =============================================
|
||||
echo Lab PXE Bastion - PXE BOOT (debug)
|
||||
echo Target: ${params.hostname}
|
||||
echo MAC: ${params.mac}
|
||||
echo Kernel+initrd from PXE, root from NVMe
|
||||
echo =============================================
|
||||
echo
|
||||
|
||||
kernel http://${params.serverIp}:${params.httpPort}/vmlinuz root=/dev/mapper/labvg-root ro rd.lvm.lv=labvg/root rd.lvm.lv=labvg/swap console=tty0
|
||||
initrd http://${params.serverIp}:${params.httpPort}/initrd.img
|
||||
boot
|
||||
`;
|
||||
@@ -88,6 +143,6 @@ echo Already installed, booting from local disk
|
||||
echo =============================================
|
||||
echo
|
||||
sleep 3
|
||||
exit
|
||||
exit 1
|
||||
`;
|
||||
}
|
||||
|
||||
33
bastion/src/bastion/src/templates/debug.ks.ts
Normal file
33
bastion/src/bastion/src/templates/debug.ks.ts
Normal file
@@ -0,0 +1,33 @@
|
||||
// Debug/rescue kickstart template.
|
||||
// Minimal kickstart for Anaconda rescue mode.
|
||||
//
|
||||
// SSH access: Anaconda's inst.sshd starts sshd automatically.
|
||||
// The sshpw directive sets the password, sshkey adds authorized keys.
|
||||
// %pre/%post do NOT run in rescue mode — don't put setup code there.
|
||||
|
||||
/** Inputs for the debug/rescue kickstart. */
export interface DebugKickstartParams {
  /** Public keys to authorise for root; one `sshkey` directive is emitted per key. */
  sshKeys: string[];
  /** Accepted for parity with other templates; not used by this template. */
  serverIp?: string;
  /** Accepted for parity with other templates; not used by this template. */
  httpPort?: number;
}

/**
 * Render the minimal kickstart served for debug/rescue boots.
 *
 * Sets language, keyboard and DHCP networking, a root SSH password, and an
 * `sshkey` directive for every key in `params.sshKeys` (none when the list
 * is empty).
 *
 * @param params - See {@link DebugKickstartParams}.
 * @returns The kickstart file contents.
 */
export function renderDebugKickstart(params: DebugKickstartParams): string {
  // Authorise every configured key, not just the first one; kickstart
  // accepts repeated sshkey directives for the same user.
  const sshkeyLines = params.sshKeys
    .map((key) => `sshkey --username=root "${key}"`)
    .join("\n");

  // NOTE(review): the root password below is a fixed plaintext value;
  // confirm rescue-mode hosts are only reachable from a trusted network.
  return `# Lab Bastion -- Debug/Rescue Kickstart
# Minimal: SSH + network for Anaconda rescue mode
#
# SSH is started by Anaconda (inst.sshd kernel param).
# Password: debug | SSH keys from bastion config.
# %pre/%post do NOT run in rescue mode.

lang en_US.UTF-8
keyboard uk
network --bootproto=dhcp --activate

sshpw --username=root --plaintext debug
${sshkeyLines}
`;
}
|
||||
@@ -88,6 +88,9 @@ pxe-service=tag:!ipxe,ARM64_EFI,"PXE Boot",ipxe-arm64.efi` : `# Full DHCP mode -
|
||||
# Discovery protocol which some UEFI implementations don't support). The dhcp-boot
|
||||
# directives above provide the boot filename directly in the DHCP offer.`}
|
||||
|
||||
# Lease file in bastion directory (avoid default /var/lib/dnsmasq which needs root)
|
||||
dhcp-leasefile=${config.bastionDir}/dnsmasq.leases
|
||||
|
||||
# Verbose logging
|
||||
log-dhcp
|
||||
`;
|
||||
|
||||
@@ -14,6 +14,7 @@ export interface InstallKickstartParams {
|
||||
locale: string;
|
||||
serverIp: string;
|
||||
httpPort: number;
|
||||
syslogPort: number;
|
||||
sshKeys: string[];
|
||||
adminUser: string;
|
||||
}
|
||||
@@ -29,6 +30,7 @@ export function renderInstallKickstart(params: InstallKickstartParams): string {
|
||||
locale,
|
||||
serverIp,
|
||||
httpPort,
|
||||
syslogPort,
|
||||
sshKeys,
|
||||
adminUser,
|
||||
} = params;
|
||||
@@ -41,9 +43,10 @@ export function renderInstallKickstart(params: InstallKickstartParams): string {
|
||||
const isVanilla = role === "vanilla";
|
||||
|
||||
// -- Auth section --
|
||||
// Always set a root password (for serial console debugging) + SSH keys
|
||||
const auth = sshKeys.length > 0
|
||||
? `rootpw --lock\nsshkey --username=root "${sshKeys[0]}"`
|
||||
: "rootpw --plaintext changeme";
|
||||
? `rootpw --plaintext lab-root-pw\nsshkey --username=root "${sshKeys[0]}"`
|
||||
: "rootpw --plaintext lab-root-pw";
|
||||
|
||||
// -- Admin user directive --
|
||||
const userDirective = adminUser
|
||||
@@ -85,8 +88,23 @@ chmod 440 /etc/sudoers.d/${adminUser}`;
|
||||
const diskLine = disk
|
||||
? `DISK="${disk}"`
|
||||
: `DISK=""
|
||||
for d in /dev/nvme0n1 /dev/sda /dev/vda; do
|
||||
[ -b "$d" ] && { DISK="$(basename $d)"; break; }
|
||||
# Wait up to 10s for NVMe/SCSI disks to appear (they init async in initrd)
|
||||
for _wait in $(seq 1 10); do
|
||||
for d in /dev/nvme0n1 /dev/nvme1n1 /dev/sda /dev/sdb /dev/vda; do
|
||||
[ -b "$d" ] || continue
|
||||
_bname=$(basename "$d")
|
||||
# Skip removable disks (USB, CD-ROM, JetKVM virtual media)
|
||||
[ -f "/sys/block/$_bname/removable" ] && [ "$(cat /sys/block/$_bname/removable)" = "1" ] && continue
|
||||
# Skip USB-attached disks (JetKVM virtual media shows as SCSI over USB)
|
||||
_transport=$(readlink -f /sys/block/$_bname/device 2>/dev/null || echo "")
|
||||
echo "$_transport" | grep -q "usb" && continue
|
||||
# Skip disks smaller than 20GB (likely USB sticks)
|
||||
_size=$(cat /sys/block/$_bname/size 2>/dev/null || echo 0)
|
||||
[ "$_size" -lt 41943040 ] && continue
|
||||
DISK="$_bname"
|
||||
break 2
|
||||
done
|
||||
sleep 1
|
||||
done
|
||||
[ -z "$DISK" ] && { echo "ERROR: no disk found"; exit 1; }`;
|
||||
|
||||
@@ -100,48 +118,6 @@ done
|
||||
? `logvol /var/lib/rancher --vgname=${vg} --name=rancher --fstype=xfs --size=20480`
|
||||
: "";
|
||||
|
||||
// Helper: the bastion callback functions used in both %pre and %post.
|
||||
// Defined as a template so each section gets its own copy (they run in different shells).
|
||||
const bastionHelpers = `
|
||||
# Detect MAC address (first real ethernet MAC, skip loopback/veth)
|
||||
_BASTION_MAC=$(ip link show | awk '/ether/ && !/00:00:00:00/ {print $2; exit}')
|
||||
_BASTION_URL="http://${serverIp}:${httpPort}"
|
||||
|
||||
# Send a structured progress stage to bastion
|
||||
bastion_progress() {
|
||||
local stage="$1" detail="\${2:-}"
|
||||
curl -sf -X POST "\${_BASTION_URL}/api/progress" \\
|
||||
-H "Content-Type: application/json" \\
|
||||
-d "{\\"mac\\":\\"$_BASTION_MAC\\",\\"stage\\":\\"$stage\\",\\"detail\\":\\"$detail\\"}" \\
|
||||
--connect-timeout 5 --max-time 10 2>/dev/null || true
|
||||
}
|
||||
|
||||
# Send log lines to bastion (batched)
|
||||
bastion_log() {
|
||||
local line="$1"
|
||||
curl -sf -X POST "\${_BASTION_URL}/api/log" \\
|
||||
-H "Content-Type: application/json" \\
|
||||
-d "{\\"mac\\":\\"$_BASTION_MAC\\",\\"line\\":\\"$(echo "$line" | sed 's/\\\\/\\\\\\\\/g; s/"/\\\\"/g')\\"}\" \\
|
||||
--connect-timeout 5 --max-time 10 2>/dev/null || true
|
||||
}
|
||||
|
||||
# Send an error stage to bastion with context
|
||||
bastion_error() {
|
||||
local detail="$1"
|
||||
bastion_progress "error" "$detail"
|
||||
# Also send the last 50 lines of any log file as context
|
||||
for logfile in /root/bastion-post-install.log /tmp/pre-partition.log; do
|
||||
if [ -f "$logfile" ]; then
|
||||
local tail_content
|
||||
tail_content=$(tail -50 "$logfile" 2>/dev/null | sed 's/\\\\/\\\\\\\\/g; s/"/\\\\"/g; s/$/\\\\n/' | tr -d '\\n')
|
||||
curl -sf -X POST "\${_BASTION_URL}/api/log" \\
|
||||
-H "Content-Type: application/json" \\
|
||||
-d "{\\"mac\\":\\"$_BASTION_MAC\\",\\"lines\\":[\\"--- $logfile (last 50 lines) ---\\"],\\"tail\\":\\"$tail_content\\"}" \\
|
||||
--connect-timeout 5 --max-time 10 2>/dev/null || true
|
||||
fi
|
||||
done
|
||||
}`;
|
||||
|
||||
return `# Lab Bastion -- Fedora ${fedoraVersion} server install
|
||||
# Generated: ${now}
|
||||
# Target: ${fqdn} (role=${role})
|
||||
@@ -158,7 +134,9 @@ network --bootproto=dhcp --activate --hostname=${fqdn}
|
||||
${auth}
|
||||
${userDirective}
|
||||
|
||||
bootloader --append="console=tty0 console=ttyS0,115200n8"
|
||||
bootloader --append="console=tty0"
|
||||
|
||||
logging --host=${serverIp} --port=${syslogPort}
|
||||
|
||||
url --mirrorlist=https://mirrors.fedoraproject.org/mirrorlist?repo=fedora-$releasever&arch=$basearch
|
||||
|
||||
@@ -168,25 +146,27 @@ url --mirrorlist=https://mirrors.fedoraproject.org/mirrorlist?repo=fedora-$relea
|
||||
%pre --log=/tmp/pre-partition.log
|
||||
#!/bin/bash
|
||||
set -x
|
||||
${bastionHelpers}
|
||||
|
||||
# Error trap: report failures back to bastion
|
||||
trap 'bastion_error "%pre failed at line $LINENO: $(tail -1 /tmp/pre-partition.log 2>/dev/null)"' ERR
|
||||
# Progress callback helper
|
||||
bastion_progress() {
|
||||
local stage="$1" detail="\${2:-}"
|
||||
local mac=$(ip link show | awk '/ether/ && !/00:00:00:00/ {print $2; exit}')
|
||||
curl -sf -X POST "http://${serverIp}:${httpPort}/api/progress" \\
|
||||
-H "Content-Type: application/json" \\
|
||||
-d "{\\"mac\\":\\"$mac\\",\\"stage\\":\\"$stage\\",\\"detail\\":\\"$detail\\"}" 2>/dev/null || true
|
||||
}
|
||||
|
||||
bastion_progress "partitioning" "detecting disk"
|
||||
|
||||
VG="${vg}"
|
||||
${diskLine}
|
||||
|
||||
bastion_log "disk detected: $DISK"
|
||||
|
||||
REPROVISION=no
|
||||
|
||||
# Check if VG exists (reprovision scenario)
|
||||
if vgs $VG &>/dev/null; then
|
||||
echo "=== Existing VG found - reprovision mode ==="
|
||||
REPROVISION=yes
|
||||
bastion_progress "partitioning" "reprovision mode -- preserving data volumes"
|
||||
|
||||
# Detect which data LVs to preserve
|
||||
PRESERVE_LONGHORN=no; PRESERVE_SRV=no; PRESERVE_HOME=no; PRESERVE_RANCHER=no
|
||||
@@ -196,7 +176,6 @@ if vgs $VG &>/dev/null; then
|
||||
lvs $VG/rancher &>/dev/null && PRESERVE_RANCHER=yes
|
||||
|
||||
echo "Preserving: longhorn=$PRESERVE_LONGHORN srv=$PRESERVE_SRV home=$PRESERVE_HOME rancher=$PRESERVE_RANCHER"
|
||||
bastion_log "preserving LVs: longhorn=$PRESERVE_LONGHORN srv=$PRESERVE_SRV home=$PRESERVE_HOME rancher=$PRESERVE_RANCHER"
|
||||
|
||||
# Remove only OS logical volumes (keep data LVs)
|
||||
for lv in root var varlog swap; do
|
||||
@@ -273,7 +252,6 @@ cat /tmp/part.ks
|
||||
echo "==================================="
|
||||
|
||||
bastion_progress "partitioning" "disk layout ready"
|
||||
bastion_log "partition config written to /tmp/part.ks"
|
||||
|
||||
%end
|
||||
|
||||
@@ -333,91 +311,37 @@ ruby-libs
|
||||
%post --log=/root/bastion-post-install.log
|
||||
#!/bin/bash
|
||||
set -x
|
||||
${bastionHelpers}
|
||||
|
||||
# --- Error trap: catch any failure and report to bastion ---
|
||||
_post_error_handler() {
|
||||
local exit_code=$? lineno=$1
|
||||
bastion_error "%post failed at line $lineno (exit $exit_code)"
|
||||
}
|
||||
trap '_post_error_handler $LINENO' ERR
|
||||
|
||||
# --- Background log streamer: sends %post output to bastion in real-time ---
|
||||
_LOG_FILE=/root/bastion-post-install.log
|
||||
_LOG_STREAMER_PID=""
|
||||
(
|
||||
# Wait for the log file to exist
|
||||
while [ ! -f "$_LOG_FILE" ]; do sleep 1; done
|
||||
# Tail and batch-send lines every 3 seconds
|
||||
_batch=""
|
||||
_count=0
|
||||
tail -f "$_LOG_FILE" 2>/dev/null | while IFS= read -r _line; do
|
||||
# Escape for JSON
|
||||
_escaped=$(echo "$_line" | sed 's/\\\\/\\\\\\\\/g; s/"/\\\\"/g; s/\\t/\\\\t/g')
|
||||
if [ -z "$_batch" ]; then
|
||||
_batch="\\"$_escaped\\""
|
||||
else
|
||||
_batch="$_batch,\\"$_escaped\\""
|
||||
fi
|
||||
_count=$((_count + 1))
|
||||
# Send batch every 10 lines
|
||||
if [ "$_count" -ge 10 ]; then
|
||||
curl -sf -X POST "\${_BASTION_URL}/api/log" \\
|
||||
-H "Content-Type: application/json" \\
|
||||
-d "{\\"mac\\":\\"$_BASTION_MAC\\",\\"lines\\":[$_batch]}" \\
|
||||
--connect-timeout 5 --max-time 10 2>/dev/null || true
|
||||
_batch=""
|
||||
_count=0
|
||||
fi
|
||||
done
|
||||
) &
|
||||
_LOG_STREAMER_PID=$!
|
||||
|
||||
# Flush remaining log lines helper
|
||||
_flush_log_streamer() {
|
||||
if [ -n "$_LOG_STREAMER_PID" ]; then
|
||||
kill "$_LOG_STREAMER_PID" 2>/dev/null || true
|
||||
wait "$_LOG_STREAMER_PID" 2>/dev/null || true
|
||||
fi
|
||||
# Send any remaining lines from the log
|
||||
if [ -f "$_LOG_FILE" ]; then
|
||||
local remaining
|
||||
remaining=$(tail -20 "$_LOG_FILE" 2>/dev/null | sed 's/\\\\/\\\\\\\\/g; s/"/\\\\"/g; s/\\t/\\\\t/g; s/^/"/; s/$/"/' | paste -sd, -)
|
||||
if [ -n "$remaining" ]; then
|
||||
curl -sf -X POST "\${_BASTION_URL}/api/log" \\
|
||||
-H "Content-Type: application/json" \\
|
||||
-d "{\\"mac\\":\\"$_BASTION_MAC\\",\\"lines\\":[$remaining]}" \\
|
||||
--connect-timeout 5 --max-time 10 2>/dev/null || true
|
||||
fi
|
||||
fi
|
||||
# Progress callback helper
|
||||
bastion_progress() {
|
||||
local stage="$1" detail="\${2:-}"
|
||||
local mac=$(ip link show | awk '/ether/ && !/00:00:00:00/ {print $2; exit}')
|
||||
curl -sf -X POST "http://${serverIp}:${httpPort}/api/progress" \\
|
||||
-H "Content-Type: application/json" \\
|
||||
-d "{\\"mac\\":\\"$mac\\",\\"stage\\":\\"$stage\\",\\"detail\\":\\"$detail\\"}" 2>/dev/null || true
|
||||
}
|
||||
|
||||
bastion_progress "installing" "packages installed, starting post-install"
|
||||
|
||||
bastion_progress "post-install" "configuring system"
|
||||
|
||||
# -- SSH --
|
||||
bastion_progress "post-install" "configuring SSH"
|
||||
systemctl enable --now sshd
|
||||
# Note: only 'enable', not '--now' — systemd is not running in the Anaconda chroot
|
||||
systemctl enable sshd || true
|
||||
sed -i 's/^#\\?PermitRootLogin.*/PermitRootLogin prohibit-password/' /etc/ssh/sshd_config
|
||||
sed -i 's/^#\\?PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config
|
||||
${sshPostBlock}
|
||||
bastion_log "SSH configured: root login by key only, password auth disabled"
|
||||
|
||||
# -- Hostname and domain --
|
||||
bastion_progress "post-install" "setting hostname to ${fqdn}"
|
||||
hostnamectl set-hostname ${fqdn}
|
||||
bastion_progress "post-install" "1-ssh done"
|
||||
|
||||
# -- Hostname and domain (write directly, hostnamectl needs D-Bus) --
|
||||
echo "${fqdn}" > /etc/hostname
|
||||
|
||||
# -- tmpfs for /tmp --
|
||||
echo "tmpfs /tmp tmpfs defaults,noatime,nosuid,nodev,size=4G 0 0" >> /etc/fstab
|
||||
|
||||
# Make /boot/efi mount non-fatal (prevents emergency mode if EFI partition isn't found)
|
||||
sed -i '/boot\\/efi/ s/defaults/defaults,nofail/' /etc/fstab
|
||||
bastion_log "fstab /boot/efi set to nofail"
|
||||
|
||||
${isVanilla ? `# -- vanilla role: skip k3s kernel/sysctl/firewall setup --
|
||||
bastion_progress "post-install" "vanilla role -- skipping k3s setup"
|
||||
# -- Enable chronyd for time sync --
|
||||
systemctl enable chronyd || true` : `# -- Kernel modules for k3s --
|
||||
bastion_progress "post-install" "loading k3s kernel modules"
|
||||
cat > /etc/modules-load.d/k3s.conf << 'MODULES'
|
||||
br_netfilter
|
||||
overlay
|
||||
@@ -427,7 +351,6 @@ modprobe br_netfilter || true
|
||||
modprobe overlay || true
|
||||
|
||||
# -- Sysctl for k3s networking --
|
||||
bastion_progress "post-install" "configuring k3s sysctl"
|
||||
cat > /etc/sysctl.d/90-k3s.conf << 'SYSCTL'
|
||||
net.bridge.bridge-nf-call-iptables = 1
|
||||
net.bridge.bridge-nf-call-ip6tables = 1
|
||||
@@ -439,48 +362,38 @@ SYSCTL
|
||||
sysctl --system || true
|
||||
|
||||
# -- Disable firewalld permanently (k3s/Cilium manage iptables directly) --
|
||||
bastion_progress "post-install" "disabling firewalld"
|
||||
# Must be masked to prevent re-enable on updates
|
||||
systemctl disable --now firewalld || true
|
||||
# Note: no '--now' — systemd is not running in the Anaconda chroot
|
||||
systemctl disable firewalld || true
|
||||
systemctl mask firewalld || true
|
||||
|
||||
# -- Enable chronyd for time sync --
|
||||
systemctl enable chronyd || true`}
|
||||
|
||||
# -- Serial console (for debugging — auto-login as root on ttyS0) --
|
||||
systemctl enable serial-getty@ttyS0.service || true
|
||||
bastion_progress "post-install" "2-system done"
|
||||
|
||||
# -- Boot order: restore network first (Anaconda sets disk first, we undo it) --
|
||||
# Network boot must stay first so the bastion intercepts every reboot. It returns
|
||||
# exit (local disk) for installed machines, or install for reinstalls.
|
||||
bastion_progress "post-install" "restoring network-first boot order"
|
||||
# Network boot must stay first so the bastion intercepts every reboot.
|
||||
if command -v efibootmgr >/dev/null 2>&1; then
|
||||
# Find network/PXE/HTTP boot entries (OVMF uses HTTPv4, real hardware uses PXE/Network)
|
||||
PXE_ENTRY=$(efibootmgr | grep -iE 'network|pxe|ipv4|ipv6|http' | head -1 | grep -oP 'Boot\\K[0-9A-F]+')
|
||||
if [ -n "$PXE_ENTRY" ]; then
|
||||
CURRENT_ORDER=$(efibootmgr | grep BootOrder | cut -d: -f2 | tr -d ' ')
|
||||
# Move PXE entry to front
|
||||
REST=$(echo "$CURRENT_ORDER" | sed "s/$PXE_ENTRY,\\\\?//;s/,$//" | sed 's/^,//')
|
||||
NEW_ORDER="$PXE_ENTRY,$REST"
|
||||
efibootmgr -o "$NEW_ORDER" || true
|
||||
bastion_log "boot order set: network first ($NEW_ORDER)"
|
||||
else
|
||||
bastion_log "no PXE boot entry found, boot order unchanged"
|
||||
fi
|
||||
else
|
||||
bastion_log "efibootmgr not available"
|
||||
fi
|
||||
|
||||
# -- Provisioning metadata --
|
||||
bastion_progress "post-install" "writing provisioning metadata"
|
||||
IP_ADDR=$(ip -4 addr show | awk '/inet / && !/127.0.0/ {split($2,a,"/"); print a[1]; exit}')
|
||||
bastion_progress "post-install" "3-bootorder done"
|
||||
|
||||
# -- Enable SysRq magic keys (for emergency reboot via Alt+SysRq+REISUB) --
|
||||
echo "kernel.sysrq=1" > /etc/sysctl.d/90-sysrq.conf
|
||||
|
||||
# -- Provisioning metadata --
|
||||
cat > /etc/lab-provisioned << PROVEOF
|
||||
hostname: ${fqdn}
|
||||
role: ${role}
|
||||
provisioned: $(date -Iseconds)
|
||||
bastion: ${serverIp}
|
||||
ip: $IP_ADDR
|
||||
PROVEOF
|
||||
|
||||
cat > /root/README << 'README'
|
||||
@@ -498,13 +411,11 @@ cat > /root/README << 'README'
|
||||
README
|
||||
|
||||
${hasRancher ? `# Install k3s server (skip start - will be configured manually)
|
||||
bastion_progress "post-install" "pre-installing k3s server"
|
||||
curl -sfL https://get.k3s.io | INSTALL_K3S_SKIP_START=true sh -
|
||||
bastion_log "k3s server pre-installed (not started)"
|
||||
` : ""}
|
||||
# Stop log streamer and flush remaining lines
|
||||
_flush_log_streamer
|
||||
bastion_progress "post-install" "4-metadata done"
|
||||
|
||||
IP_ADDR=$(ip -4 addr show | awk '/inet / && !/127.0.0/ {split($2,a,"/"); print a[1]; exit}')
|
||||
bastion_progress "complete" "ready at $IP_ADDR"
|
||||
|
||||
%end
|
||||
|
||||
@@ -28,6 +28,7 @@ function createTestConfig(testDir: string): BastionConfig {
|
||||
gateway: "10.0.0.1",
|
||||
sshKeys: ["ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAITEST test@test"],
|
||||
adminUser: "testadmin",
|
||||
syslogPort: 15514,
|
||||
skipDnsmasq: true,
|
||||
skipArtifacts: true,
|
||||
fedoraMirror: "https://download.fedoraproject.org/pub/fedora/linux/releases/43/Everything/x86_64/os",
|
||||
|
||||
@@ -12,6 +12,7 @@ function baseParams(overrides: Partial<InstallKickstartParams> = {}): InstallKic
|
||||
locale: "en_GB.UTF-8",
|
||||
serverIp: "192.168.1.100",
|
||||
httpPort: 8080,
|
||||
syslogPort: 5514,
|
||||
sshKeys: [
|
||||
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAITEST1 user1@host",
|
||||
"ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQTEST2 user2@host",
|
||||
@@ -91,9 +92,8 @@ describe("renderInstallKickstart", () => {
|
||||
serverIp: "10.0.0.5",
|
||||
httpPort: 9090,
|
||||
}));
|
||||
expect(ks).toContain('_BASTION_URL="http://10.0.0.5:9090"');
|
||||
expect(ks).toContain("http://10.0.0.5:9090");
|
||||
expect(ks).toContain("/api/progress");
|
||||
expect(ks).toContain("/api/log");
|
||||
});
|
||||
|
||||
it("infra role has /var/lib/rancher partition", () => {
|
||||
@@ -141,51 +141,73 @@ describe("renderInstallKickstart", () => {
|
||||
expect(ks).toContain("--name=swap --fstype=swap --size=27648");
|
||||
});
|
||||
|
||||
it("%pre has error trap", () => {
|
||||
const ks = renderInstallKickstart(baseParams());
|
||||
expect(ks).toContain("trap");
|
||||
expect(ks).toContain("bastion_error");
|
||||
expect(ks).toContain("%pre failed");
|
||||
});
|
||||
|
||||
it("%post has error trap", () => {
|
||||
const ks = renderInstallKickstart(baseParams());
|
||||
expect(ks).toContain("_post_error_handler");
|
||||
expect(ks).toContain("%post failed");
|
||||
});
|
||||
|
||||
it("has granular progress stages in %post", () => {
|
||||
const ks = renderInstallKickstart(baseParams());
|
||||
expect(ks).toContain('"configuring SSH"');
|
||||
expect(ks).toContain('"setting hostname');
|
||||
expect(ks).toContain('"writing provisioning metadata"');
|
||||
expect(ks).toContain('"writing provisioning metadata"');
|
||||
});
|
||||
|
||||
it("has background log streamer in %post", () => {
|
||||
const ks = renderInstallKickstart(baseParams());
|
||||
expect(ks).toContain("_LOG_STREAMER_PID");
|
||||
expect(ks).toContain("_flush_log_streamer");
|
||||
expect(ks).toContain("tail -f");
|
||||
});
|
||||
|
||||
it("has bastion_log function for sending log lines", () => {
|
||||
const ks = renderInstallKickstart(baseParams());
|
||||
expect(ks).toContain("bastion_log()");
|
||||
expect(ks).toContain("/api/log");
|
||||
});
|
||||
|
||||
it("vanilla role skips k3s progress stages", () => {
|
||||
it("vanilla role skips k3s setup", () => {
|
||||
const ks = renderInstallKickstart(baseParams({ role: "vanilla" }));
|
||||
expect(ks).toContain("vanilla role");
|
||||
expect(ks).not.toContain('"loading k3s kernel modules"');
|
||||
expect(ks).not.toContain('"disabling firewalld"');
|
||||
expect(ks).not.toContain("modules-load.d/k3s.conf");
|
||||
expect(ks).not.toContain("firewalld");
|
||||
});
|
||||
|
||||
it("worker role has k3s-related progress stages", () => {
|
||||
it("worker role has k3s setup", () => {
|
||||
const ks = renderInstallKickstart(baseParams({ role: "worker" }));
|
||||
expect(ks).toContain('"loading k3s kernel modules"');
|
||||
expect(ks).toContain('"configuring k3s sysctl"');
|
||||
expect(ks).toContain('"disabling firewalld"');
|
||||
expect(ks).toContain("modules-load.d/k3s.conf");
|
||||
expect(ks).toContain("sysctl.d/90-k3s.conf");
|
||||
expect(ks).toContain("firewalld");
|
||||
});
|
||||
|
||||
it("kickstart syntax: no merged partition lines", () => {
|
||||
for (const role of ["vanilla", "worker", "infra"] as const) {
|
||||
const ks = renderInstallKickstart(baseParams({ role }));
|
||||
const lines = ks.split("\n");
|
||||
for (let i = 0; i < lines.length; i++) {
|
||||
const l = lines[i].trim();
|
||||
if (l.startsWith("part ")) {
|
||||
const partCount = (l.match(/\bpart\b/g) || []).length;
|
||||
expect(partCount, `line ${i + 1} has ${partCount} 'part' commands (role=${role}): ${l}`).toBe(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
it("kickstart syntax: each section-opening has a %end", () => {
|
||||
const ks = renderInstallKickstart(baseParams());
|
||||
// Only match section openers at start of line
|
||||
const sections = (ks.match(/^%(?:pre|post|packages)\b/gm) || []).length;
|
||||
const ends = (ks.match(/^%end$/gm) || []).length;
|
||||
expect(ends, `${sections} sections but ${ends} %end markers`).toBe(sections);
|
||||
});
|
||||
|
||||
it("has complete progress stage", () => {
|
||||
const ks = renderInstallKickstart(baseParams());
|
||||
expect(ks).toContain('"complete"');
|
||||
expect(ks).toContain("ready at");
|
||||
});
|
||||
|
||||
it("sends install logs to bastion via syslog", () => {
|
||||
const ks = renderInstallKickstart(baseParams({ syslogPort: 5514 }));
|
||||
expect(ks).toContain("logging --host=192.168.1.100 --port=5514");
|
||||
});
|
||||
|
||||
it("passes ksvalidator syntax check", () => {
|
||||
for (const role of ["vanilla", "worker", "infra"] as const) {
|
||||
const ks = renderInstallKickstart(baseParams({ role }));
|
||||
const { execSync } = require("node:child_process");
|
||||
const { writeFileSync, unlinkSync } = require("node:fs");
|
||||
const tmp = `/tmp/ks-test-${role}.ks`;
|
||||
writeFileSync(tmp, ks);
|
||||
try {
|
||||
execSync(`ksvalidator -v F43 ${tmp}`, { encoding: "utf-8" });
|
||||
} catch (err: unknown) {
|
||||
const msg = err instanceof Error ? (err as { stderr?: string }).stderr ?? err.message : String(err);
|
||||
throw new Error(`ksvalidator failed for role=${role}: ${msg}`);
|
||||
} finally {
|
||||
try { unlinkSync(tmp); } catch {}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
it("does not include serial console (causes 30s boot timeout on hardware without UART)", () => {
|
||||
const ks = renderInstallKickstart(baseParams({ role: "vanilla" }));
|
||||
expect(ks).not.toContain("ttyS0");
|
||||
});
|
||||
});
|
||||
|
||||
@@ -26,6 +26,7 @@ describe("StateManager", () => {
|
||||
discovered: {},
|
||||
install_queue: {},
|
||||
installed: {},
|
||||
debug: {},
|
||||
});
|
||||
});
|
||||
|
||||
@@ -39,6 +40,7 @@ describe("StateManager", () => {
|
||||
discovered: {},
|
||||
install_queue: {},
|
||||
installed: {},
|
||||
debug: {},
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
121
bastion/src/bastion/tests/syslog-listener.test.ts
Normal file
121
bastion/src/bastion/tests/syslog-listener.test.ts
Normal file
@@ -0,0 +1,121 @@
|
||||
import { describe, it, expect, beforeEach, afterEach } from "vitest";
|
||||
import { createSocket } from "node:dgram";
|
||||
import { mkdtempSync, rmSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import { tmpdir } from "node:os";
|
||||
import { SyslogListener } from "../src/services/syslog-listener.js";
|
||||
import { InstallLogBuffer } from "../src/services/install-log.js";
|
||||
import { StateManager } from "../src/services/state.js";
|
||||
|
||||
/**
 * Send `message` as a single UDP datagram to 127.0.0.1:`port`.
 * The sending socket is closed once the send callback fires; the promise
 * rejects with the send error if there is one and resolves otherwise.
 */
function sendUdpSyslog(port: number, message: string): Promise<void> {
  return new Promise<void>((done, fail) => {
    const sender = createSocket("udp4");
    const payload = Buffer.from(message);

    const onSent = (sendError: Error | null): void => {
      sender.close();
      if (sendError) {
        fail(sendError);
        return;
      }
      done();
    };

    sender.send(payload, 0, payload.length, port, "127.0.0.1", onSent);
  });
}
|
||||
|
||||
describe("SyslogListener", () => {
|
||||
let tmpDir: string;
|
||||
let state: StateManager;
|
||||
let installLog: InstallLogBuffer;
|
||||
let syslog: SyslogListener;
|
||||
const PORT = 15514; // use non-privileged port for testing
|
||||
|
||||
beforeEach(() => {
|
||||
tmpDir = mkdtempSync(join(tmpdir(), "syslog-test-"));
|
||||
state = new StateManager(join(tmpDir, "state.json"));
|
||||
state.init();
|
||||
installLog = new InstallLogBuffer(tmpDir);
|
||||
syslog = new SyslogListener(PORT, installLog, state);
|
||||
syslog.start();
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
syslog.stop();
|
||||
rmSync(tmpDir, { recursive: true, force: true });
|
||||
});
|
||||
|
||||
it("receives and stores syslog messages for registered IP", async () => {
|
||||
const mac = "aa:bb:cc:dd:ee:ff";
|
||||
// Queue a machine so hostname can be resolved
|
||||
state.update((s) => {
|
||||
s.install_queue[mac] = {
|
||||
hostname: "testnode",
|
||||
disk: "/dev/sda",
|
||||
role: "worker",
|
||||
os: "fedora-43",
|
||||
queued_at: new Date().toISOString(),
|
||||
};
|
||||
});
|
||||
|
||||
// Register IP → MAC mapping
|
||||
syslog.registerIp("127.0.0.1", mac);
|
||||
|
||||
// Send a syslog message (RFC 3164 format)
|
||||
await sendUdpSyslog(PORT, "<13>Mar 30 01:30:00 localhost anaconda[1234]: Installing package vim-enhanced");
|
||||
|
||||
// Wait for UDP delivery
|
||||
await new Promise((r) => setTimeout(r, 200));
|
||||
|
||||
const lines = installLog.getLines(mac);
|
||||
expect(lines.length).toBeGreaterThan(0);
|
||||
expect(lines[0]!.line).toContain("anaconda");
|
||||
expect(lines[0]!.line).toContain("Installing package vim-enhanced");
|
||||
});
|
||||
|
||||
it("ignores messages from unknown IPs", async () => {
|
||||
// Don't register any IP mapping
|
||||
await sendUdpSyslog(PORT, "<13>Mar 30 01:30:00 localhost anaconda[1234]: test message");
|
||||
await new Promise((r) => setTimeout(r, 200));
|
||||
|
||||
// No MAC to check, but the listener should not crash
|
||||
// and no logs should be stored for any MAC
|
||||
expect(installLog.lineCount("unknown")).toBe(0);
|
||||
});
|
||||
|
||||
it("resolves IP from installed machines state", async () => {
|
||||
const mac = "11:22:33:44:55:66";
|
||||
state.update((s) => {
|
||||
s.installed[mac] = {
|
||||
hostname: "installed-node",
|
||||
role: "worker",
|
||||
ip: "127.0.0.1",
|
||||
installed_at: new Date().toISOString(),
|
||||
};
|
||||
});
|
||||
|
||||
await sendUdpSyslog(PORT, "<14>Mar 30 02:00:00 installed-node sshd[5678]: Accepted publickey for root");
|
||||
await new Promise((r) => setTimeout(r, 200));
|
||||
|
||||
const lines = installLog.getLines(mac);
|
||||
expect(lines.length).toBeGreaterThan(0);
|
||||
expect(lines[0]!.line).toContain("sshd");
|
||||
});
|
||||
|
||||
it("parses various syslog formats", async () => {
|
||||
const mac = "aa:bb:cc:dd:ee:ff";
|
||||
syslog.registerIp("127.0.0.1", mac);
|
||||
state.update((s) => {
|
||||
s.install_queue[mac] = {
|
||||
hostname: "testnode",
|
||||
disk: "/dev/sda",
|
||||
role: "worker",
|
||||
os: "fedora-43",
|
||||
queued_at: new Date().toISOString(),
|
||||
};
|
||||
});
|
||||
|
||||
// Message without PID
|
||||
await sendUdpSyslog(PORT, "<13>Mar 30 01:30:00 localhost kernel: NVMe device ready");
|
||||
await new Promise((r) => setTimeout(r, 200));
|
||||
|
||||
const lines = installLog.getLines(mac);
|
||||
expect(lines.length).toBeGreaterThan(0);
|
||||
expect(lines[0]!.line).toContain("kernel");
|
||||
});
|
||||
});
|
||||
@@ -94,6 +94,16 @@ export class LabdClient {
|
||||
return this.request("POST", "/api/machines/install", { body: opts });
|
||||
}
|
||||
|
||||
async registerMachine(opts: {
|
||||
mac: string; hostname: string; role?: string; ip?: string;
|
||||
}): Promise<{ status: string; data?: unknown; error?: string }> {
|
||||
return this.request("POST", "/api/machines/register", { body: opts });
|
||||
}
|
||||
|
||||
async debugMachine(mac: string, opts?: { pxeBoot?: boolean }): Promise<{ status: string; data?: { mac: string; hostname: string }; error?: string }> {
|
||||
return this.request("POST", "/api/machines/debug", { body: { mac, pxeBoot: opts?.pxeBoot } });
|
||||
}
|
||||
|
||||
async forgetMachine(mac: string): Promise<{ status: string }> {
|
||||
return this.request("DELETE", `/api/machines/${encodeURIComponent(mac)}`);
|
||||
}
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
// CLI command: labctl app k3s install/health <target>
|
||||
// Install or check k3s on a target machine via SSH.
|
||||
|
||||
import { existsSync } from "node:fs";
|
||||
import { existsSync, writeFileSync, mkdirSync } from "node:fs";
|
||||
import { homedir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { execSync } from "node:child_process";
|
||||
import type { Command } from "commander";
|
||||
import type { BastionState } from "@lab/shared";
|
||||
import { K3sModule, sshExec } from "@lab/modules";
|
||||
@@ -400,4 +401,88 @@ export function registerAppCommand(program: Command): void {
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
// `labctl app k3s kubeconfig <target>`: fetch /etc/rancher/k3s/k3s.yaml from the
// target over SSH, rewrite the server address and the "default" names, then either
// print the result (--print) or merge it into ~/.kube/config and switch to the context.
k3sCmd
  .command("kubeconfig <target>")
  .description("Fetch kubeconfig from a target and merge into ~/.kube/config")
  .option("--user <user>", "SSH user", "root")
  .option("--context <name>", "Context name (defaults to hostname)")
  .option("--print", "Print kubeconfig to stdout instead of merging")
  .action(async (target: string, opts: {
    user: string;
    context?: string;
    print?: boolean;
  }) => {
    const state = await fetchState();
    const resolved = resolveTarget(target, state);

    if (!resolved) {
      console.error(`Cannot resolve target: ${target}`);
      console.error("Provide an IP address, hostname, or MAC of an installed machine.");
      process.exit(1);
    }

    const sshKey = findSshKey();

    // Fetch kubeconfig via SSH
    let raw: string;
    try {
      const result = await sshExec(resolved.ip, opts.user, "cat /etc/rancher/k3s/k3s.yaml", {
        ...(sshKey ? { keyPath: sshKey } : {}),
        timeoutMs: 10_000,
      });
      raw = result.stdout;
    } catch (err) {
      console.error(`Failed to fetch kubeconfig: ${err instanceof Error ? err.message : String(err)}`);
      process.exit(1);
    }

    const contextName = opts.context ?? resolved.hostname;

    // Rewrite: replace 127.0.0.1 with actual IP, rename cluster/user/context.
    // Function replacers are used so that "$&", "$1", ... inside the context name or
    // IP are inserted literally instead of being expanded as replacement patterns.
    const rewritten = raw
      .replace(/server:\s*https:\/\/127\.0\.0\.1:/, () => `server: https://${resolved.ip}:`)
      .replace(/name:\s*default/g, () => `name: ${contextName}`)
      .replace(/cluster:\s*default/g, () => `cluster: ${contextName}`)
      .replace(/user:\s*default/g, () => `user: ${contextName}`)
      .replace(/current-context:\s*default/, () => `current-context: ${contextName}`);

    if (opts.print) {
      process.stdout.write(rewritten);
      return;
    }

    // Merge into ~/.kube/config using kubectl
    const kubeDir = join(homedir(), ".kube");
    mkdirSync(kubeDir, { recursive: true });
    const mainConfig = join(kubeDir, "config");
    // The context name comes from user input or machine state; keep path separators
    // and other special characters out of the temp-file name.
    const safeFileTag = contextName.replace(/[^A-Za-z0-9._-]/g, "_");
    const tmpFile = join(kubeDir, `.labctl-${safeFileTag}.tmp`);
    // execSync runs through a shell, so the context name must be quoted as a single word.
    const shellQuote = (word: string): string => `'${word.replace(/'/g, `'\\''`)}'`;

    writeFileSync(tmpFile, rewritten, { mode: 0o600 });

    try {
      if (existsSync(mainConfig)) {
        // kubectl keeps the FIRST definition of each named entry when merging, so the
        // freshly fetched file is listed first: re-running this command after a
        // reinstall then replaces stale certificates instead of silently keeping them.
        // KUBECONFIG is passed through `env` rather than interpolated into the command.
        const merged = execSync("kubectl config view --flatten", {
          encoding: "utf-8",
          env: { ...process.env, KUBECONFIG: `${tmpFile}:${mainConfig}` },
        });
        writeFileSync(mainConfig, merged, { mode: 0o600 });
      } else {
        writeFileSync(mainConfig, rewritten, { mode: 0o600 });
      }

      // Set current context in the file that was just written, even if the caller
      // has KUBECONFIG pointing somewhere else.
      execSync(`kubectl config use-context ${shellQuote(contextName)}`, {
        stdio: "pipe",
        env: { ...process.env, KUBECONFIG: mainConfig },
      });

      console.log(`Merged kubeconfig for ${contextName} (${resolved.ip})`);
      console.log(`Context set to: ${contextName}`);
      console.log(`\nSwitch contexts: kubectl config use-context <name>`);
    } catch (err) {
      console.error(`Failed to merge kubeconfig: ${err instanceof Error ? err.message : String(err)}`);
      console.error(`Standalone config saved at: ${tmpFile}`);
      // process.exit() terminates immediately, so the cleanup below is skipped and
      // the standalone file named above is left in place for the user.
      process.exit(1);
    } finally {
      try { const { unlinkSync } = await import("node:fs"); unlinkSync(tmpFile); } catch { /* ignore */ }
    }
  });
|
||||
}
|
||||
|
||||
156
bastion/src/cli/src/commands/debug.ts
Normal file
156
bastion/src/cli/src/commands/debug.ts
Normal file
@@ -0,0 +1,156 @@
|
||||
// CLI command: provision debug
|
||||
// Queue a machine for debug/rescue PXE boot and optionally SSH reboot into PXE.
|
||||
|
||||
import { execFileSync } from "node:child_process";
|
||||
import { existsSync } from "node:fs";
|
||||
import { homedir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { Command } from "commander";
|
||||
import type { BastionState } from "@lab/shared";
|
||||
import { getLabdClient } from "../api/config.js";
|
||||
|
||||
/**
 * Resolve a target (hostname, MAC, or IP) to {mac, hostname, ip} from state.
 *
 * Lookup order:
 *  1. MAC (case-insensitive, "-" accepted as separator) in installed, discovered,
 *     then install_queue. Discovered machines have no hostname yet, so the MAC is
 *     returned as the hostname; discovered and queued machines have an empty ip.
 *  2. Installed hostname, either exact or as the short name of an FQDN.
 *  3. Installed IP address.
 *
 * @param target - Hostname, MAC address, or IP as typed by the user.
 * @param state - Aggregated machine state.
 * @returns The resolved machine, or null when nothing matches.
 */
function resolveTarget(
  target: string,
  state: BastionState,
): { mac: string; hostname: string; ip: string } | null {
  // An empty target would otherwise match the first installed machine whose IP is
  // unknown (stored as ""), which is never what the caller meant.
  if (target === "") {
    return null;
  }

  // Own-property lookup: a plain `table[key]` also finds inherited members such as
  // "constructor", which would be reported as a machine.
  function ownEntry<T>(table: Record<string, T>, key: string): T | undefined {
    return Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined;
  }

  const normalized = target.toLowerCase().replace(/-/g, ":");

  const installedInfo = ownEntry(state.installed, normalized);
  if (installedInfo) {
    return { mac: normalized, hostname: installedInfo.hostname, ip: installedInfo.ip };
  }

  if (ownEntry(state.discovered, normalized)) {
    return { mac: normalized, hostname: normalized, ip: "" };
  }

  const queuedInfo = ownEntry(state.install_queue, normalized);
  if (queuedInfo) {
    return { mac: normalized, hostname: queuedInfo.hostname, ip: "" };
  }

  for (const [mac, info] of Object.entries(state.installed)) {
    if (info.hostname === target || info.hostname.startsWith(target + ".")) {
      return { mac, hostname: info.hostname, ip: info.ip };
    }
  }

  for (const [mac, info] of Object.entries(state.installed)) {
    if (info.ip === target) {
      return { mac, hostname: info.hostname, ip: info.ip };
    }
  }

  return null;
}
|
||||
|
||||
export function registerDebugCommand(parent: Command): void {
|
||||
parent
|
||||
.command("debug <target>")
|
||||
.description("PXE boot into Fedora rescue mode for debugging (target: hostname, MAC, or IP)")
|
||||
.option("--pxe-boot", "Boot installed system via PXE (kernel+initrd from network, root from NVMe)")
|
||||
.showHelpAfterError(true)
|
||||
.action(async (target: string, opts: { pxeBoot?: boolean }) => {
|
||||
const client = getLabdClient();
|
||||
|
||||
// Resolve target from labd aggregated state
|
||||
let state: BastionState;
|
||||
try {
|
||||
state = await client.getMachines();
|
||||
} catch (err) {
|
||||
console.error(`Cannot reach labd: ${err instanceof Error ? err.message : String(err)}`);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const resolved = resolveTarget(target, state);
|
||||
if (!resolved) {
|
||||
console.error(`Cannot find machine: ${target}`);
|
||||
console.error("Provide a hostname, MAC, or IP of a known machine.");
|
||||
console.error("Run 'labctl provision list' to see available machines.");
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const { mac, hostname, ip } = resolved;
|
||||
console.log(`Queuing debug mode for ${hostname} (${mac})...`);
|
||||
|
||||
try {
|
||||
const result = await client.debugMachine(mac, { pxeBoot: opts.pxeBoot === true });
|
||||
if (result.error) {
|
||||
console.error(`Failed: ${result.error}`);
|
||||
process.exit(1);
|
||||
}
|
||||
} catch (err) {
|
||||
console.error(`Failed to queue debug: ${err instanceof Error ? err.message : String(err)}`);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
// Try SSH reboot into PXE
|
||||
if (ip !== "") {
|
||||
const adminUser = process.env["SUDO_USER"] ?? process.env["USER"] ?? "";
|
||||
const effectiveUser = adminUser === "root" ? "" : adminUser;
|
||||
|
||||
if (effectiveUser !== "") {
|
||||
console.log(`\nAttempting SSH reboot into PXE (${effectiveUser}@${ip})...`);
|
||||
|
||||
const sudoUser = process.env["SUDO_USER"];
|
||||
const realHome = sudoUser !== undefined ? join("/home", sudoUser) : homedir();
|
||||
const keyPaths = [
|
||||
join(realHome, ".ssh", "id_ed25519"),
|
||||
join(realHome, ".ssh", "id_rsa"),
|
||||
join(realHome, ".ssh", "id_ecdsa"),
|
||||
];
|
||||
const sshKey = keyPaths.find(k => existsSync(k));
|
||||
|
||||
const sshArgs = [
|
||||
"-o", "StrictHostKeyChecking=no",
|
||||
"-o", "UserKnownHostsFile=/dev/null",
|
||||
"-o", "ConnectTimeout=10",
|
||||
...(sshKey !== undefined ? ["-i", sshKey] : []),
|
||||
`${effectiveUser}@${ip}`,
|
||||
'PXE_ENTRY=$(sudo efibootmgr | grep -iE "pxe|network|ipv4" | head -1 | grep -oP "Boot\\K[0-9A-F]+"); if [ -n "$PXE_ENTRY" ]; then sudo efibootmgr --bootnext "$PXE_ENTRY" && echo "PXE set as next boot" && sudo reboot; else echo "No PXE boot entry found, rebooting anyway..." && sudo reboot; fi',
|
||||
];
|
||||
|
||||
try {
|
||||
execFileSync("ssh", sshArgs, { stdio: "inherit" });
|
||||
} catch {
|
||||
// SSH connection closing during reboot is expected
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Determine bastion URL from labd config for the setup script URL
|
||||
const bastionUrl = process.env["LABD_URL"]
|
||||
? process.env["LABD_URL"].replace(/\/ws\/bastion$/, "").replace(/^wss?:/, "http:")
|
||||
: "http://<bastion-ip>:8080";
|
||||
|
||||
console.log(`
|
||||
Debug mode queued for ${hostname} (${mac}).
|
||||
Reboot the machine to enter Fedora rescue mode.
|
||||
|
||||
SSH access (started by Anaconda):
|
||||
ssh root@<ip> (password: debug)
|
||||
|
||||
For nc remote shell, run from rescue shell:
|
||||
curl ${bastionUrl}/debug-setup.sh | bash
|
||||
|
||||
Once in rescue shell:
|
||||
|
||||
# Activate LVM and mount installed system
|
||||
vgchange -ay
|
||||
mkdir -p /mnt/sysroot
|
||||
mount /dev/<vg>/root /mnt/sysroot
|
||||
cat /mnt/sysroot/etc/fstab
|
||||
mount /dev/<vg>/var /mnt/sysroot/var
|
||||
mount /dev/<vg>/home /mnt/sysroot/home
|
||||
|
||||
# Boot installed system in a container
|
||||
/mnt/sysroot/usr/bin/systemd-nspawn -D /mnt/sysroot --boot
|
||||
|
||||
# Or chroot for quick fixes
|
||||
mount --bind /dev /mnt/sysroot/dev
|
||||
mount --bind /proc /mnt/sysroot/proc
|
||||
mount --bind /sys /mnt/sysroot/sys
|
||||
chroot /mnt/sysroot
|
||||
`);
|
||||
});
|
||||
}
|
||||
@@ -39,19 +39,25 @@ export function registerLogsCommand(parent: Command): void {
|
||||
parent
|
||||
.command("logs <target>")
|
||||
.description("Show provisioning logs for a machine (hostname, MAC, or IP)")
|
||||
.action(async (target: string) => {
|
||||
.option("-f, --follow", "Follow log output in real-time")
|
||||
.action(async (target: string, opts: { follow?: boolean }) => {
|
||||
const mac = await resolveToMac(target);
|
||||
|
||||
const BOLD = "\x1b[1m";
|
||||
const GREEN = "\x1b[32m";
|
||||
const YELLOW = "\x1b[33m";
|
||||
const RED = "\x1b[31m";
|
||||
const DIM = "\x1b[2m";
|
||||
const RESET = "\x1b[0m";
|
||||
|
||||
if (opts.follow) {
|
||||
await followLogs(mac, { BOLD, GREEN, YELLOW, RED, DIM, RESET });
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const data = await getLabdClient().getMachineLogs(mac);
|
||||
|
||||
const BOLD = "\x1b[1m";
|
||||
const GREEN = "\x1b[32m";
|
||||
const YELLOW = "\x1b[33m";
|
||||
const RED = "\x1b[31m";
|
||||
const DIM = "\x1b[2m";
|
||||
const RESET = "\x1b[0m";
|
||||
|
||||
console.log(`${BOLD}${data["hostname"]}${RESET} (${mac})`);
|
||||
console.log(` Status: ${data["status"] === "installed" ? GREEN : YELLOW}${data["status"]}${RESET}`);
|
||||
console.log(` Role: ${data["role"]}`);
|
||||
@@ -83,3 +89,64 @@ export function registerLogsCommand(parent: Command): void {
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Follow logs by polling labd.
 *
 * Every 5 seconds the machine's log record is fetched. A header line is printed
 * whenever the reported status changes, and each stage entry not printed yet is
 * written once. The process exits with code 0 when the status becomes "installed"
 * after a "queued" or "installing" status has been observed; otherwise the loop
 * runs until the user interrupts it.
 *
 * @param mac - MAC address of the machine to follow.
 * @param colors - ANSI escape sequences used to decorate the output.
 */
async function followLogs(
  mac: string,
  colors: { BOLD: string; GREEN: string; YELLOW: string; RED: string; DIM: string; RESET: string },
): Promise<void> {
  const { BOLD, GREEN, YELLOW, RED, DIM, RESET } = colors;
  const client = getLabdClient();

  console.log(`${DIM}Following logs for ${mac} (Ctrl+C to stop)${RESET}`);
  console.log("");

  let lastStageCount = 0;
  let lastStatus = "";
  let sawInstalling = false;

  while (true) {
    try {
      const data = await client.getMachineLogs(mac);
      const status = String(data["status"] ?? "");
      const log = data["log"] as Array<{ stage: string; detail: string; timestamp: string }> | undefined;

      // Print header once or on status change
      if (status !== lastStatus) {
        const hostname = String(data["hostname"] ?? mac);
        const statusColor = status === "installed" ? GREEN : YELLOW;
        console.log(` ${BOLD}${hostname}${RESET} ${statusColor}${status}${RESET}`);
        lastStatus = status;
      }

      if (status === "installing" || status === "queued") {
        sawInstalling = true;
      }

      // Print new stages
      if (log) {
        // A log shorter than what was already printed means the stage list was
        // restarted (e.g. the machine was queued again). Start over, otherwise the
        // new stages stay hidden until the new log outgrows the old one.
        if (log.length < lastStageCount) {
          lastStageCount = 0;
        }
        for (let i = lastStageCount; i < log.length; i++) {
          const entry = log[i]!;
          // Characters 11..19 of an ISO-8601 timestamp are the HH:MM:SS part.
          const time = entry.timestamp.slice(11, 19);
          const color = entry.stage === "complete" ? GREEN : entry.stage === "error" ? RED : YELLOW;
          const detail = entry.detail ? ` ${DIM}-- ${entry.detail}${RESET}` : "";
          console.log(` ${DIM}${time}${RESET} ${color}${entry.stage}${RESET}${detail}`);
        }
        lastStageCount = log.length;
      }

      // Only exit on "installed" if we actually saw the install happen
      // (avoids exiting immediately when following a reprovision that hasn't started yet)
      if (status === "installed" && sawInstalling) {
        const ip = data["ip"] ?? "";
        console.log("");
        console.log(` ${GREEN}${BOLD}Install complete!${RESET}${ip ? ` ${DIM}ssh lab@${ip}${RESET}` : ""}`);
        process.exit(0);
      }
    } catch {
      // Machine may not be in logs yet (still queued)
    }

    await new Promise((r) => setTimeout(r, 5000));
  }
}
|
||||
|
||||
37
bastion/src/cli/src/commands/register.ts
Normal file
37
bastion/src/cli/src/commands/register.ts
Normal file
@@ -0,0 +1,37 @@
|
||||
// CLI command: provision register
|
||||
// Register an already-installed machine that is missing from bastion state.
|
||||
|
||||
import { Command, Option } from "commander";
|
||||
import { SUPPORTED_ROLES } from "@lab/shared";
|
||||
import { getLabdClient } from "../api/config.js";
|
||||
|
||||
/**
 * Attach the `register <mac> <hostname>` subcommand to `parent`.
 *
 * The subcommand asks labd to record a machine that is already installed but is
 * missing from bastion state. On failure (an `error` field in the response or a
 * thrown exception) it prints "Failed: ..." and exits with code 1; on success it
 * prints a one-line summary.
 *
 * @param parent - Commander command that receives the subcommand.
 */
export function registerRegisterCommand(parent: Command): void {
  const roleOption = new Option("--role <role>", "Machine role")
    .choices([...SUPPORTED_ROLES])
    .default("worker");

  const runRegister = async (
    mac: string,
    hostname: string,
    opts: { role: string; ip?: string },
  ): Promise<void> => {
    try {
      const client = getLabdClient();

      // Only send `ip` when one was actually supplied.
      const payload: { mac: string; hostname: string; role: string; ip?: string } = {
        mac,
        hostname,
        role: opts.role,
      };
      if (opts.ip) {
        payload.ip = opts.ip;
      }

      const result = await client.registerMachine(payload);
      if (result.error) {
        console.error(`Failed: ${result.error}`);
        process.exit(1);
      }

      const ipSuffix = opts.ip ? `, ip=${opts.ip}` : "";
      console.log(`Registered ${mac} as ${hostname} (role=${opts.role}${ipSuffix})`);
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      console.error(`Failed: ${reason}`);
      process.exit(1);
    }
  };

  parent
    .command("register <mac> <hostname>")
    .description("Register an already-installed machine (e.g. after state loss)")
    .addOption(roleOption)
    .option("--ip <address>", "Machine IP address")
    .action(runRegister);
}
|
||||
@@ -144,6 +144,7 @@ export function registerReprovisionCommand(parent: Command): void {
|
||||
|
||||
const sshArgs = [
|
||||
"-o", "StrictHostKeyChecking=no",
|
||||
"-o", "UserKnownHostsFile=/dev/null",
|
||||
"-o", "ConnectTimeout=10",
|
||||
...(sshKey !== undefined ? ["-i", sshKey] : []),
|
||||
`${effectiveUser}@${ip}`,
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
// CLI entry point for lab-bastion.
|
||||
// Commands:
|
||||
// init bastion standalone start/stop/status
|
||||
// provision list/install/reprovision/forget
|
||||
// provision list/install/reprovision/forget/register
|
||||
|
||||
import { fileURLToPath } from "node:url";
|
||||
import { Command, Option } from "commander";
|
||||
@@ -14,7 +14,9 @@ import { registerStatusCommand } from "./commands/status.js";
|
||||
import { registerInstallCommand } from "./commands/install.js";
|
||||
import { registerListCommand } from "./commands/list.js";
|
||||
import { registerReprovisionCommand } from "./commands/reprovision.js";
|
||||
import { registerDebugCommand } from "./commands/debug.js";
|
||||
import { registerForgetCommand } from "./commands/forget.js";
|
||||
import { registerRegisterCommand } from "./commands/register.js";
|
||||
import { registerLogsCommand } from "./commands/logs.js";
|
||||
import { registerMakeIsoCommand } from "./commands/makeiso.js";
|
||||
import { registerConfigCommand } from "./commands/config.js";
|
||||
@@ -95,7 +97,9 @@ export function createProgram(): Command {
|
||||
registerListCommand(provisionCmd);
|
||||
registerInstallCommand(provisionCmd);
|
||||
registerReprovisionCommand(provisionCmd);
|
||||
registerDebugCommand(provisionCmd);
|
||||
registerForgetCommand(provisionCmd);
|
||||
registerRegisterCommand(provisionCmd);
|
||||
registerLogsCommand(provisionCmd);
|
||||
registerMakeIsoCommand(provisionCmd);
|
||||
|
||||
|
||||
@@ -34,6 +34,7 @@ async function main(): Promise<void> {
|
||||
server: {
|
||||
findMany: () => dbError(),
|
||||
findUnique: () => dbError(),
|
||||
upsert: () => dbError(),
|
||||
},
|
||||
joinToken: {
|
||||
findUnique: () => dbError(),
|
||||
|
||||
@@ -80,9 +80,54 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void
|
||||
});
|
||||
});
|
||||
|
||||
// Aggregated machines from all connected bastions
|
||||
// Aggregated machines from all connected bastions + DB fallback
|
||||
app.get("/api/machines", async () => {
|
||||
return bastionRegistry.getAggregatedState();
|
||||
const live = bastionRegistry.getAggregatedState();
|
||||
|
||||
// Merge DB records for machines not currently in any bastion's live state
|
||||
try {
|
||||
const dbServers = (await db.server.findMany({})) as Array<{
|
||||
mac: string | null; hostname: string; role: string; ip: string | null;
|
||||
status: string; labels: Record<string, unknown>;
|
||||
}>;
|
||||
for (const s of dbServers) {
|
||||
if (!s.mac) continue;
|
||||
const mac = s.mac.toLowerCase();
|
||||
// Only add from DB if not already in live state
|
||||
if (!(mac in live.discovered) && !(mac in live.install_queue) && !(mac in live.installed)) {
|
||||
if (s.status === "discovered") {
|
||||
live.discovered[mac] = {
|
||||
mac,
|
||||
product: String(s.labels?.product ?? "unknown"),
|
||||
board: "unknown",
|
||||
serial: "unknown",
|
||||
manufacturer: String(s.labels?.manufacturer ?? "unknown"),
|
||||
cpu_model: String(s.labels?.cpu ?? "unknown"),
|
||||
cpu_cores: Number(s.labels?.cores ?? 0),
|
||||
memory_gb: Number(s.labels?.memory_gb ?? 0),
|
||||
arch: String(s.labels?.arch ?? "unknown"),
|
||||
disks: [],
|
||||
nics: [],
|
||||
first_seen: "",
|
||||
last_seen: "",
|
||||
bastionId: "db",
|
||||
};
|
||||
} else if (s.status === "online" || s.status === "offline") {
|
||||
live.installed[mac] = {
|
||||
hostname: s.hostname,
|
||||
role: s.role,
|
||||
ip: s.ip ?? "",
|
||||
installed_at: "",
|
||||
bastionId: "db",
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch {
|
||||
// DB unavailable — return live state only
|
||||
}
|
||||
|
||||
return live;
|
||||
});
|
||||
|
||||
// Queue install — route to correct bastion by MAC
|
||||
@@ -106,7 +151,7 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void
|
||||
try {
|
||||
const result = await sendCommand(all[0]!.bastionId, {
|
||||
type: "command-install",
|
||||
mac, hostname, disk: disk ?? "/dev/sda", role: role ?? "infra", os: os ?? "fedora-43",
|
||||
mac, hostname, disk: disk ?? "", role: role ?? "infra", os: os ?? "fedora-43",
|
||||
});
|
||||
return reply.code(result.status === "ok" ? 200 : 500).send(result);
|
||||
} catch (err) {
|
||||
@@ -119,7 +164,7 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void
|
||||
try {
|
||||
const result = await sendCommand(bastion.bastionId, {
|
||||
type: "command-install",
|
||||
mac, hostname, disk: disk ?? "/dev/sda", role: role ?? "infra", os: os ?? "fedora-43",
|
||||
mac, hostname, disk: disk ?? "", role: role ?? "infra", os: os ?? "fedora-43",
|
||||
});
|
||||
return reply.code(result.status === "ok" ? 200 : 500).send(result);
|
||||
} catch (err) {
|
||||
@@ -127,6 +172,78 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void
|
||||
}
|
||||
});
|
||||
|
||||
// Register an already-installed machine — route to correct bastion (or single bastion)
|
||||
app.post<{
|
||||
Body: { mac?: string; hostname?: string; role?: string; ip?: string };
|
||||
}>("/api/machines/register", async (request, reply) => {
|
||||
const { mac, hostname, role, ip } = request.body ?? {};
|
||||
if (!mac || !hostname) {
|
||||
return reply.code(400).send({ error: "mac and hostname are required" });
|
||||
}
|
||||
|
||||
const normalized = mac.toLowerCase().replace(/-/g, ":");
|
||||
|
||||
// Find bastion that knows this MAC, or use single connected bastion
|
||||
const bastion = bastionRegistry.findBastionByMac(normalized);
|
||||
const target = bastion ?? (bastionRegistry.getAll().length === 1 ? bastionRegistry.getAll()[0] : null);
|
||||
|
||||
if (!target) {
|
||||
const all = bastionRegistry.getAll();
|
||||
if (all.length === 0) {
|
||||
return reply.code(503).send({ error: "No bastions connected" });
|
||||
}
|
||||
return reply.code(404).send({ error: `MAC ${normalized} not found on any bastion and multiple bastions connected` });
|
||||
}
|
||||
|
||||
try {
|
||||
const result = await sendCommand(target.bastionId, {
|
||||
type: "command-register",
|
||||
mac: normalized,
|
||||
hostname,
|
||||
role: role ?? "worker",
|
||||
ip: ip ?? "",
|
||||
});
|
||||
return reply.code(result.status === "ok" ? 200 : 500).send(result);
|
||||
} catch (err) {
|
||||
return reply.code(500).send({ error: err instanceof Error ? err.message : String(err) });
|
||||
}
|
||||
});
|
||||
|
||||
// Queue debug/rescue mode — route to correct bastion by MAC
|
||||
app.post<{
|
||||
Body: { mac?: string; pxeBoot?: boolean };
|
||||
}>("/api/machines/debug", async (request, reply) => {
|
||||
const mac = (request.body?.mac ?? "").toLowerCase().replace(/-/g, ":");
|
||||
const pxeBoot = request.body?.pxeBoot ?? false;
|
||||
if (!mac) {
|
||||
return reply.code(400).send({ error: "mac is required" });
|
||||
}
|
||||
|
||||
const bastion = bastionRegistry.findBastionByMac(mac);
|
||||
if (!bastion) {
|
||||
const all = bastionRegistry.getAll();
|
||||
if (all.length === 0) {
|
||||
return reply.code(503).send({ error: "No bastions connected" });
|
||||
}
|
||||
if (all.length === 1) {
|
||||
try {
|
||||
const result = await sendCommand(all[0]!.bastionId, { type: "command-debug", mac, pxeBoot });
|
||||
return reply.code(result.status === "ok" ? 200 : 500).send(result);
|
||||
} catch (err) {
|
||||
return reply.code(500).send({ error: err instanceof Error ? err.message : String(err) });
|
||||
}
|
||||
}
|
||||
return reply.code(404).send({ error: `MAC ${mac} not found on any bastion` });
|
||||
}
|
||||
|
||||
try {
|
||||
const result = await sendCommand(bastion.bastionId, { type: "command-debug", mac, pxeBoot });
|
||||
return reply.code(result.status === "ok" ? 200 : 500).send(result);
|
||||
} catch (err) {
|
||||
return reply.code(500).send({ error: err instanceof Error ? err.message : String(err) });
|
||||
}
|
||||
});
|
||||
|
||||
// Forget machine
|
||||
app.delete<{ Params: { mac: string } }>("/api/machines/:mac", async (request, reply) => {
|
||||
const mac = request.params.mac.toLowerCase().replace(/-/g, ":");
|
||||
@@ -177,17 +294,7 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void
|
||||
const queued = bastion.state.install_queue[mac];
|
||||
const installed = bastion.state.installed[mac];
|
||||
|
||||
if (installed) {
|
||||
return {
|
||||
mac,
|
||||
hostname: installed.hostname,
|
||||
status: "installed",
|
||||
role: installed.role,
|
||||
ip: installed.ip,
|
||||
installed_at: installed.installed_at,
|
||||
};
|
||||
}
|
||||
|
||||
// Active install takes priority over old installed state (reprovision case)
|
||||
if (queued) {
|
||||
return {
|
||||
mac,
|
||||
@@ -202,6 +309,17 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void
|
||||
};
|
||||
}
|
||||
|
||||
if (installed) {
|
||||
return {
|
||||
mac,
|
||||
hostname: installed.hostname,
|
||||
status: "installed",
|
||||
role: installed.role,
|
||||
ip: installed.ip,
|
||||
installed_at: installed.installed_at,
|
||||
};
|
||||
}
|
||||
|
||||
return reply.code(404).send({ error: `MAC ${mac} not found in install queue or installed` });
|
||||
});
|
||||
}
|
||||
|
||||
@@ -19,6 +19,7 @@ export interface DbClient {
|
||||
server: {
|
||||
findMany: (...args: unknown[]) => Promise<unknown[]>;
|
||||
findUnique: (...args: unknown[]) => Promise<unknown>;
|
||||
upsert: (...args: unknown[]) => Promise<unknown>;
|
||||
};
|
||||
joinToken: {
|
||||
findUnique: (...args: unknown[]) => Promise<unknown>;
|
||||
@@ -139,7 +140,7 @@ export async function createApp(_config: LabdConfig, db: DbClient): Promise<{
|
||||
socket,
|
||||
connectedAt: new Date(),
|
||||
lastHeartbeat: new Date(),
|
||||
state: { discovered: {}, install_queue: {}, installed: {} },
|
||||
state: { discovered: {}, install_queue: {}, installed: {}, debug: {} },
|
||||
});
|
||||
|
||||
socket.send(JSON.stringify({ type: "bastion-enrolled", bastionId: record.id }));
|
||||
@@ -175,6 +176,52 @@ export async function createApp(_config: LabdConfig, db: DbClient): Promise<{
|
||||
if (bastionId) {
|
||||
bastionRegistry.updateState(bastionId, msg.state);
|
||||
logger.info(`Bastion ${bastionId.slice(0, 8)} state sync: ${Object.keys(msg.state.discovered).length} discovered, ${Object.keys(msg.state.installed).length} installed`);
|
||||
|
||||
// Persist machines to DB
|
||||
void (async () => {
|
||||
try {
|
||||
// Upsert discovered machines
|
||||
for (const [mac, hw] of Object.entries(msg.state.discovered)) {
|
||||
await db.server.upsert({
|
||||
where: { mac },
|
||||
create: {
|
||||
hostname: hw.product ?? mac,
|
||||
mac,
|
||||
role: "unknown",
|
||||
status: "discovered",
|
||||
labels: { cpu: hw.cpu_model, cores: hw.cpu_cores, memory_gb: hw.memory_gb, arch: hw.arch, product: hw.product, manufacturer: hw.manufacturer },
|
||||
},
|
||||
update: {
|
||||
status: "discovered",
|
||||
lastHeartbeat: new Date(),
|
||||
labels: { cpu: hw.cpu_model, cores: hw.cpu_cores, memory_gb: hw.memory_gb, arch: hw.arch, product: hw.product, manufacturer: hw.manufacturer },
|
||||
},
|
||||
});
|
||||
}
|
||||
// Upsert installed machines
|
||||
for (const [mac, info] of Object.entries(msg.state.installed)) {
|
||||
await db.server.upsert({
|
||||
where: { mac },
|
||||
create: {
|
||||
hostname: info.hostname,
|
||||
mac,
|
||||
role: info.role ?? "worker",
|
||||
ip: info.ip,
|
||||
status: "online",
|
||||
},
|
||||
update: {
|
||||
hostname: info.hostname,
|
||||
role: info.role ?? "worker",
|
||||
ip: info.ip,
|
||||
status: "online",
|
||||
lastHeartbeat: new Date(),
|
||||
},
|
||||
});
|
||||
}
|
||||
} catch (err) {
|
||||
logger.warn(`Failed to persist machines to DB: ${err instanceof Error ? err.message : String(err)}`);
|
||||
}
|
||||
})();
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
import { EventEmitter } from "node:events";
|
||||
import type { WebSocket } from "ws";
|
||||
import type { BastionState, HardwareInfo, InstallConfig, InstalledInfo } from "@lab/shared";
|
||||
import type { BastionState, HardwareInfo, InstallConfig, InstalledInfo, DebugConfig } from "@lab/shared";
|
||||
|
||||
export interface ConnectedBastion {
|
||||
bastionId: string;
|
||||
@@ -20,6 +20,7 @@ export interface AggregatedState {
|
||||
discovered: Record<string, HardwareInfo>;
|
||||
install_queue: Record<string, InstallConfig>;
|
||||
installed: Record<string, InstalledInfo>;
|
||||
debug: Record<string, DebugConfig>;
|
||||
}
|
||||
|
||||
export class BastionRegistry extends EventEmitter {
|
||||
@@ -86,6 +87,7 @@ export class BastionRegistry extends EventEmitter {
|
||||
discovered: {},
|
||||
install_queue: {},
|
||||
installed: {},
|
||||
debug: {},
|
||||
};
|
||||
|
||||
for (const bastion of this.bastions.values()) {
|
||||
@@ -98,6 +100,9 @@ export class BastionRegistry extends EventEmitter {
|
||||
for (const [mac, info] of Object.entries(bastion.state.installed)) {
|
||||
result.installed[mac] = { ...info, bastionId: bastion.bastionId };
|
||||
}
|
||||
for (const [mac, dbg] of Object.entries(bastion.state.debug ?? {})) {
|
||||
result.debug[mac] = { ...dbg };
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
|
||||
@@ -5,6 +5,7 @@ export type {
|
||||
HardwareInfo,
|
||||
InstallConfig,
|
||||
InstalledInfo,
|
||||
DebugConfig,
|
||||
BastionState,
|
||||
BastionConfig,
|
||||
} from "./types/index.js";
|
||||
|
||||
@@ -100,6 +100,7 @@ export type BastionMessage =
|
||||
| { type: "bastion-heartbeat"; bastionId: string; uptime: number; machineCount: number }
|
||||
| { type: "bastion-state-sync"; bastionId: string; state: import("../types/state.js").BastionState }
|
||||
| { type: "bastion-progress"; bastionId: string; mac: string; stage: string; detail: string; timestamp: string }
|
||||
| { type: "bastion-install-log"; bastionId: string; mac: string; hostname: string; provisionerType: import("../types/state.js").ProvisionStackType; sessionId: string; lines: string[]; timestamp: string }
|
||||
| { type: "command-response"; requestId: string; status: "ok" | "error"; data?: unknown; error?: string };
|
||||
|
||||
// --- labd -> Bastion messages ---
|
||||
@@ -110,6 +111,8 @@ export type LabdBastionMessage =
|
||||
| { type: "command-install"; requestId: string; mac: string; hostname: string; disk?: string; role: string; os: string }
|
||||
| { type: "command-forget"; requestId: string; mac: string }
|
||||
| { type: "command-role-update"; requestId: string; mac: string; role: string }
|
||||
| { type: "command-debug"; requestId: string; mac: string; pxeBoot?: boolean }
|
||||
| { type: "command-register"; requestId: string; mac: string; hostname: string; role: string; ip: string }
|
||||
| { type: "server-shutdown"; reconnectAfter: number };
|
||||
|
||||
export type BastionMessageType = BastionMessage["type"];
|
||||
@@ -119,12 +122,12 @@ export type LabdBastionMessageType = LabdBastionMessage["type"];
|
||||
|
||||
const BASTION_MESSAGE_TYPES = new Set<string>([
|
||||
"bastion-enroll", "bastion-heartbeat", "bastion-state-sync",
|
||||
"bastion-progress", "command-response",
|
||||
"bastion-progress", "bastion-install-log", "command-response",
|
||||
]);
|
||||
|
||||
const LABD_BASTION_MESSAGE_TYPES = new Set<string>([
|
||||
"bastion-enrolled", "bastion-heartbeat-ack", "command-install",
|
||||
"command-forget", "command-role-update", "server-shutdown",
|
||||
"command-forget", "command-role-update", "command-debug", "command-register", "server-shutdown",
|
||||
]);
|
||||
|
||||
export function isBastionMessage(msg: unknown): msg is BastionMessage {
|
||||
|
||||
@@ -14,6 +14,8 @@ export interface BastionConfig {
|
||||
// Ubuntu support
|
||||
ubuntuVersion: string;
|
||||
ubuntuMirror: string;
|
||||
// Syslog listener for install logs (Anaconda logging --host)
|
||||
syslogPort: number;
|
||||
// Flags
|
||||
skipDnsmasq?: boolean | undefined;
|
||||
skipArtifacts?: boolean | undefined;
|
||||
|
||||
@@ -5,6 +5,7 @@ export type {
|
||||
HardwareInfo,
|
||||
InstallConfig,
|
||||
InstalledInfo,
|
||||
DebugConfig,
|
||||
BastionState,
|
||||
} from "./state.js";
|
||||
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
// State types for discovered machines, install queue, and installed machines.
|
||||
|
||||
export type ProvisionStackType = "dhcpproxy" | "iso" | "cloud-init";
|
||||
|
||||
export type OsId = "fedora-43" | "ubuntu-26.04";
|
||||
export type Arch = "x86_64" | "aarch64";
|
||||
|
||||
@@ -96,8 +98,15 @@ export interface InstalledInfo {
|
||||
bastionId?: string; // set when aggregated through labd
|
||||
}
|
||||
|
||||
export interface DebugConfig {
|
||||
hostname: string;
|
||||
queued_at: string;
|
||||
pxeBoot?: boolean;
|
||||
}
|
||||
|
||||
export interface BastionState {
|
||||
discovered: Record<string, HardwareInfo>;
|
||||
install_queue: Record<string, InstallConfig>;
|
||||
installed: Record<string, InstalledInfo>;
|
||||
debug: Record<string, DebugConfig>;
|
||||
}
|
||||
|
||||
82
bastion/tests/integration/helpers/jetkvm.sh
Executable file
82
bastion/tests/integration/helpers/jetkvm.sh
Executable file
@@ -0,0 +1,82 @@
|
||||
#!/bin/bash
|
||||
# JetKVM helper — authenticate and interact with JetKVM device.
|
||||
# Usage:
|
||||
# jetkvm.sh status — check device status
# jetkvm.sh device — fetch /device through the authenticated API
|
||||
# jetkvm.sh reboot — reboot the target machine via ATX
|
||||
# jetkvm.sh poweron — power on via ATX short press
|
||||
# jetkvm.sh poweroff — power off via ATX long press
|
||||
#
|
||||
# Environment:
|
||||
# JETKVM_HOST — JetKVM IP (default: 192.168.3.10)
|
||||
# JETKVM_PASS — device password
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
HOST="${JETKVM_HOST:-192.168.3.10}"
|
||||
PASS="${JETKVM_PASS:-}"
|
||||
|
||||
if [ -z "$PASS" ]; then
|
||||
echo "ERROR: JETKVM_PASS not set" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
BASE="http://$HOST"
|
||||
|
||||
# Authenticate and get token
|
||||
# Authenticate against the JetKVM local-login endpoint and print the token.
#
# Reads:   BASE (device base URL) and PASS (device password).
# Prints:  the value of the first "token" field found in the response (stdout).
# Exits:   status 1 with a message on stderr when no token can be extracted.
login() {
    # Escape backslashes first, then double quotes, so that any password
    # still produces a syntactically valid JSON string.
    local escaped
    escaped=${PASS//\\/\\\\}
    escaped=${escaped//\"/\\\"}

    local resp
    resp=$(curl -s -X POST "$BASE/auth/login-local" \
        -H "Content-Type: application/json" \
        -d "{\"password\":\"$escaped\"}" 2>&1)

    # Pull the first `"token": "<value>"` pair out of the response, then keep
    # only the quoted value and strip the quotes.
    local token
    token=$(echo "$resp" | grep -oP '"token"\s*:\s*"[^"]*"' | head -1 | grep -oP '"[^"]*"$' | tr -d '"')

    if [ -z "$token" ]; then
        echo "ERROR: Login failed: $resp" >&2
        exit 1
    fi
    echo "$token"
}
|
||||
|
||||
# Make authenticated request
|
||||
# Issue an authenticated request to the JetKVM HTTP API.
#   $1  HTTP method
#   $2  path appended to BASE
#   $3  optional request body, sent as application/json
# Calls login() on every invocation to obtain a bearer token, then prints
# whatever curl returns.
api() {
    local verb="$1"
    local route="$2"
    local payload="${3:-}"

    local bearer
    bearer=$(login)

    # Arguments common to both request shapes.
    local -a curl_args=(-s -X "$verb" "$BASE$route" -H "Authorization: Bearer $bearer")

    # Only attach a content type and body when the caller supplied one.
    if [ -n "$payload" ]; then
        curl_args+=(-H "Content-Type: application/json" -d "$payload")
    fi

    curl "${curl_args[@]}"
}
|
||||
|
||||
# Dispatch on the requested subcommand; with no argument, run "status".
case "${1:-status}" in
    status)
        # Unauthenticated probe; curl's stderr is folded into stdout.
        curl -s "$BASE/device/status" 2>&1
        ;;
    device)
        api GET /device
        ;;
    reboot)
        echo "Sending ATX reset..."
        api POST /device/atx/reset
        ;;
    poweron)
        echo "Sending ATX short power press..."
        api POST /device/atx/power-short
        ;;
    poweroff)
        echo "Sending ATX long power press..."
        api POST /device/atx/power-long
        ;;
    *)
        # Unknown subcommand: report on stderr like the script's other errors.
        echo "Usage: $0 {status|device|reboot|poweron|poweroff}" >&2
        exit 1
        ;;
esac
|
||||
@@ -40,50 +40,50 @@ export function ensurePxeNetwork(): void {
|
||||
|
||||
if (result.status === 0 && result.stdout.includes("Active: yes")) {
|
||||
log(`Network ${PXE_NETWORK_NAME} already active`);
|
||||
return;
|
||||
} else {
|
||||
// Destroy existing if present but inactive
|
||||
if (result.status === 0) {
|
||||
virsh("net-destroy", PXE_NETWORK_NAME);
|
||||
virsh("net-undefine", PXE_NETWORK_NAME);
|
||||
}
|
||||
|
||||
const xmlPath = "/tmp/lab-pxe-test-network.xml";
|
||||
writeFileSync(xmlPath, NETWORK_XML);
|
||||
|
||||
log(`Creating PXE libvirt network: ${PXE_NETWORK_NAME} (${PXE_SUBNET}.0/24, no DHCP)`);
|
||||
run(`virsh net-define "${xmlPath}"`);
|
||||
run(`virsh net-start "${PXE_NETWORK_NAME}"`);
|
||||
|
||||
try { unlinkSync(xmlPath); } catch { /* ignore */ }
|
||||
|
||||
log(`Network ${PXE_NETWORK_NAME} created and active`);
|
||||
}
|
||||
|
||||
// Destroy existing if present but inactive
|
||||
if (result.status === 0) {
|
||||
virsh("net-destroy", PXE_NETWORK_NAME);
|
||||
virsh("net-undefine", PXE_NETWORK_NAME);
|
||||
}
|
||||
// Libvirt adds nftables reject rules for NAT networks that block host→VM SSH.
|
||||
// Delete them now and after every VM reboot (libvirt recreates them).
|
||||
deleteNftablesRejectRules();
|
||||
}
|
||||
|
||||
const xmlPath = "/tmp/lab-pxe-test-network.xml";
|
||||
writeFileSync(xmlPath, NETWORK_XML);
|
||||
|
||||
log(`Creating PXE libvirt network: ${PXE_NETWORK_NAME} (${PXE_SUBNET}.0/24, no DHCP)`);
|
||||
run(`virsh net-define "${xmlPath}"`);
|
||||
run(`virsh net-start "${PXE_NETWORK_NAME}"`);
|
||||
|
||||
try { unlinkSync(xmlPath); } catch { /* ignore */ }
|
||||
|
||||
// Libvirt creates nftables rules that reject traffic on the bridge.
|
||||
// DHCP works (dnsmasq uses raw sockets) but TFTP/HTTP from VM->host gets blocked.
|
||||
// Delete the reject rules so VM traffic can reach the bastion.
|
||||
try {
|
||||
// Delete the reject rules that libvirt added for our bridge.
|
||||
// We find and delete each rule by its handle number.
|
||||
const deleteRejectRules = (chain: string): void => {
|
||||
const output = run(`nft -a list chain inet libvirt ${chain} 2>/dev/null || true`);
|
||||
const lines = output.split("\n");
|
||||
for (const line of lines) {
|
||||
if (line.includes(PXE_BRIDGE) && line.includes("reject")) {
|
||||
const handleMatch = line.match(/# handle (\d+)/);
|
||||
if (handleMatch) {
|
||||
run(`nft delete rule inet libvirt ${chain} handle ${handleMatch[1]}`);
|
||||
/** Delete libvirt's nftables reject rules for our bridge so host→VM traffic works.
|
||||
* Must be called after every VM start/restart — libvirt recreates them. */
|
||||
export function deleteNftablesRejectRules(): void {
|
||||
// libvirt uses "ip libvirt_network" table (not "inet libvirt")
|
||||
const tables = ["ip libvirt_network", "ip6 libvirt_network", "inet libvirt"];
|
||||
for (const table of tables) {
|
||||
try {
|
||||
for (const chain of ["guest_input", "guest_output"]) {
|
||||
const output = run(`nft -a list chain ${table} ${chain} 2>/dev/null || true`);
|
||||
for (const line of output.split("\n")) {
|
||||
if (line.includes(PXE_BRIDGE) && line.includes("reject")) {
|
||||
const handleMatch = line.match(/# handle (\d+)/);
|
||||
if (handleMatch) {
|
||||
run(`nft delete rule ${table} ${chain} handle ${handleMatch[1]}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
deleteRejectRules("guest_input");
|
||||
deleteRejectRules("guest_output");
|
||||
log(`Removed nftables reject rules for ${PXE_BRIDGE}`);
|
||||
} catch {
|
||||
log(`Could not update nftables rules (may need manual firewall config)`);
|
||||
} catch { /* table may not exist */ }
|
||||
}
|
||||
|
||||
log(`Network ${PXE_NETWORK_NAME} created and active`);
|
||||
}
|
||||
|
||||
/** Destroy the PXE test network. */
|
||||
|
||||
@@ -63,7 +63,7 @@ export function createPxeVm(config: PxeVmConfig): void {
|
||||
`--disk=path=${diskPath},format=qcow2,bus=virtio`,
|
||||
`--network=network=${config.network},model=virtio`,
|
||||
// UEFI firmware — required for PXE boot in modern mode
|
||||
`--boot=uefi,network`,
|
||||
`--boot=uefi,network,hd`,
|
||||
// No OS to install — PXE provides everything
|
||||
"--os-variant=generic",
|
||||
"--noautoconsole",
|
||||
@@ -113,29 +113,54 @@ export function rebootPxeVm(name: string): void {
|
||||
log(`PXE VM ${name} restarted`);
|
||||
}
|
||||
|
||||
/** Change VM boot order to disk first (skip PXE on next boot). */
|
||||
export function setBootDisk(name: string): void {
|
||||
log(`Setting ${name} boot order to disk first`);
|
||||
virsh("destroy", name);
|
||||
spawnSync("sleep", ["2"]);
|
||||
// Get current XML, replace boot dev='network' with boot dev='hd'
|
||||
// This preserves UEFI loader/nvram settings (virt-xml --boot hd can break them)
|
||||
const dumpXml = virsh("dumpxml", name);
|
||||
if (dumpXml.status !== 0) throw new Error("Failed to dump VM XML");
|
||||
let xml = dumpXml.stdout;
|
||||
// Replace any <boot dev='...' /> entries with hd
|
||||
xml = xml.replace(/<boot dev='[^']*'\/>/g, "<boot dev='hd'/>");
|
||||
// If no boot dev entry, add one before </os>
|
||||
if (!xml.includes("<boot dev=")) {
|
||||
xml = xml.replace("</os>", " <boot dev='hd'/>\n </os>");
|
||||
}
|
||||
const xmlPath = `/tmp/${name}-bootfix.xml`;
|
||||
const { writeFileSync: writeFs, unlinkSync: unlinkFs } = require("node:fs") as typeof import("node:fs");
|
||||
writeFs(xmlPath, xml);
|
||||
run(`virsh define "${xmlPath}"`);
|
||||
try { unlinkFs(xmlPath); } catch { /* ignore */ }
|
||||
virsh("start", name);
|
||||
log(`${name} restarted with disk boot (UEFI preserved)`);
|
||||
/**
 * Read raw output from the VM's serial console (telnet TCP port).
 * Returns the last N lines. Useful for diagnostics when SSH isn't available.
 *
 * Never rejects: on a socket error the collected text is returned prefixed
 * with "(connection error) "; on timeout whatever was read so far is returned.
 *
 * @param port            TCP port on 127.0.0.1 to connect to.
 * @param opts.lastLines  Number of trailing non-empty lines to keep (default 50).
 * @param opts.timeoutMs  Upper bound on how long to keep reading (default 10 000).
 * @returns The kept lines joined with "\n", with trailing whitespace trimmed.
 */
export async function readSerialLog(
  port: number,
  opts: { lastLines?: number; timeoutMs?: number } = {},
): Promise<string> {
  const { lastLines = 50, timeoutMs = 10_000 } = opts;

  const raw = await new Promise<string>((resolve) => {
    const sock = createConnection({ host: "127.0.0.1", port });
    let buf = "";

    // Settle and cancel both timers, so neither can fire against a socket
    // that is already closed, nor keep the event loop alive afterwards.
    const finish = (text: string): void => {
      clearTimeout(deadline);
      clearTimeout(nudge);
      resolve(text);
    };

    const deadline = setTimeout(() => { sock.destroy(); finish(buf); }, timeoutMs);
    // Send a newline to trigger any buffered output / prompt
    const nudge = setTimeout(() => { if (!sock.destroyed) sock.write("\r\n"); }, 500);

    sock.on("data", (d: Buffer) => { buf += d.toString(); });
    sock.on("error", () => finish(`(connection error) ${buf}`));
    sock.on("close", () => finish(buf));
  });

  const lines = raw.split("\n").map((l) => l.trimEnd()).filter(Boolean);
  // slice(-0) is slice(0) and would return every line, so handle <= 0 explicitly.
  return lastLines > 0 ? lines.slice(-lastLines).join("\n") : "";
}
|
||||
|
||||
/**
 * Execute a command on the VM's serial console via socat.
 * Requires auto-login root shell on the serial port.
 *
 * @param port       TCP port on 127.0.0.1 that socat connects to.
 * @param command    Command line typed into the console.
 * @param timeoutMs  Passed to socat as its -T value (rounded up to seconds);
 *                   the child process itself is killed 5 s later.
 * @returns The text between the echoed command line and the end marker, or
 *          "(no marker) <last 500 chars of output>" when the marker never
 *          appears (including when socat cannot be run or cannot connect).
 */
export function serialExec(
  port: number,
  command: string,
  timeoutMs = 15_000,
): string {
  const marker = `__END_${Date.now()}__`;

  // Type the marker as two adjacent quoted halves. The remote shell joins them
  // when it runs `echo`, but the typed line itself never contains the joined
  // marker, so if the console echoes input (as the parsing below assumes) the
  // indexOf() can only match the command's real output.
  const half = Math.floor(marker.length / 2);
  const input = `\r\n${command}; echo '${marker.slice(0, half)}''${marker.slice(half)}'\r\n`;

  // Hand the text to socat on stdin directly. Wrapping it in
  // `bash -c "echo -e '...'"` cannot work: `\'` does not escape a quote inside
  // a single-quoted shell string, and the payload always contains quotes.
  const result = spawnSync(
    "socat",
    [`-T${Math.ceil(timeoutMs / 1000)}`, "-", `TCP:127.0.0.1:${port}`],
    { input, encoding: "utf-8", stdio: ["pipe", "pipe", "ignore"], timeout: timeoutMs + 5000 },
  );

  // stdout is null when the process could not be spawned at all.
  const output = result.stdout ?? "";
  const markerIdx = output.indexOf(marker);
  if (markerIdx < 0) return `(no marker) ${output.slice(-500)}`;

  // Get lines between command echo and marker
  const lines = output.substring(0, markerIdx).split("\n");
  // Skip everything up to and including the command echo line
  const cmdIdx = lines.findIndex((l) => l.includes(command.substring(0, 20)));
  return lines.slice(cmdIdx >= 0 ? cmdIdx + 1 : 1).join("\n").trim();
}
|
||||
|
||||
export interface IsoVmConfig {
|
||||
@@ -187,69 +212,3 @@ export function createIsoVm(config: IsoVmConfig): void {
|
||||
log(`ISO boot VM ${config.name} created (serial: telnet 127.0.0.1 4556)`);
|
||||
}
|
||||
|
||||
/**
|
||||
* Execute a command on a VM via its serial console (telnet).
|
||||
* Works even when the VM has no network/SSH.
|
||||
* Returns the output after the command's echo.
|
||||
*/
|
||||
export async function serialExec(
|
||||
port: number,
|
||||
command: string,
|
||||
timeoutMs = 10_000,
|
||||
): Promise<string> {
|
||||
return new Promise((resolve, reject) => {
|
||||
const timer = setTimeout(() => {
|
||||
sock.destroy();
|
||||
reject(new Error(`Serial exec timeout after ${timeoutMs}ms`));
|
||||
}, timeoutMs);
|
||||
|
||||
const sock = createConnection({ host: "127.0.0.1", port });
|
||||
let buffer = "";
|
||||
let sentCommand = false;
|
||||
// Random marker to delimit command output
|
||||
const marker = `__SERIAL_END_${Date.now()}__`;
|
||||
|
||||
sock.on("connect", () => {
|
||||
// Wait for login prompt or shell prompt, then send command
|
||||
setTimeout(() => {
|
||||
// Send a newline first to get a prompt
|
||||
sock.write("\r\n");
|
||||
}, 500);
|
||||
});
|
||||
|
||||
sock.on("data", (data: Buffer) => {
|
||||
buffer += data.toString();
|
||||
|
||||
if (!sentCommand && (buffer.includes("login:") || buffer.includes("# ") || buffer.includes("$ "))) {
|
||||
if (buffer.includes("login:")) {
|
||||
// Auto-login as root
|
||||
sock.write("root\r\n");
|
||||
sentCommand = false; // wait for shell prompt after login
|
||||
buffer = "";
|
||||
return;
|
||||
}
|
||||
// At shell prompt — send command with marker
|
||||
sentCommand = true;
|
||||
buffer = "";
|
||||
sock.write(`${command}; echo "${marker}"\r\n`);
|
||||
}
|
||||
|
||||
if (sentCommand && buffer.includes(marker)) {
|
||||
clearTimeout(timer);
|
||||
// Extract output between command echo and marker
|
||||
const markerIdx = buffer.indexOf(marker);
|
||||
const output = buffer.substring(0, markerIdx).trim();
|
||||
// Remove the command echo (first line)
|
||||
const lines = output.split("\n");
|
||||
const result = lines.slice(1).join("\n").trim();
|
||||
sock.destroy();
|
||||
resolve(result);
|
||||
}
|
||||
});
|
||||
|
||||
sock.on("error", (err) => {
|
||||
clearTimeout(timer);
|
||||
reject(new Error(`Serial connection failed: ${err.message}`));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
33
bastion/tests/integration/helpers/vm-screenshot.sh
Executable file
33
bastion/tests/integration/helpers/vm-screenshot.sh
Executable file
@@ -0,0 +1,33 @@
|
||||
#!/bin/bash
# Capture a screenshot of a libvirt VM and convert to PNG for viewing.
# Usage: vm-screenshot.sh [VM_NAME] [OUTPUT_PATH]
#
# Prints the path of the written image on stdout. Exits 1 (message on stderr)
# when the VM cannot be queried, the screenshot fails, or conversion fails.
VM_NAME="${1:-lab-pxe-test}"
OUTPUT="${2:-/tmp/vm-screenshot.png}"
PPM="/tmp/vm-screenshot-$$.ppm"

# Remove the intermediate PPM on every exit path, not only on success.
trap 'rm -f "$PPM"' EXIT

# Report a failed conversion instead of printing a path to a missing/partial file.
conversion_failed() {
    echo "ERROR: could not convert screenshot to $OUTPUT" >&2
    exit 1
}

if ! sudo virsh domstate "$VM_NAME" &>/dev/null; then
    echo "ERROR: VM '$VM_NAME' not found or not running" >&2
    exit 1
fi

sudo virsh screenshot "$VM_NAME" "$PPM" --screen 0 2>/dev/null
if [ ! -f "$PPM" ]; then
    echo "ERROR: screenshot failed" >&2
    exit 1
fi

# Convert to PNG (ppm -> png) with the first available tool.
if command -v convert &>/dev/null; then
    convert "$PPM" "$OUTPUT" || conversion_failed
elif command -v ffmpeg &>/dev/null; then
    ffmpeg -y -i "$PPM" "$OUTPUT" 2>/dev/null || conversion_failed
elif command -v pnmtopng &>/dev/null; then
    pnmtopng "$PPM" > "$OUTPUT" || conversion_failed
else
    # fallback: just copy the PPM (Read tool can handle it)
    OUTPUT="${OUTPUT%.png}.ppm"
    cp "$PPM" "$OUTPUT" || conversion_failed
fi

echo "$OUTPUT"
|
||||
@@ -23,17 +23,56 @@ import { execSync } from "node:child_process";
|
||||
import { join } from "node:path";
|
||||
import { homedir, tmpdir } from "node:os";
|
||||
import { log, waitForSsh } from "./helpers/libvirt.js";
|
||||
import { ensurePxeNetwork, destroyPxeNetwork, PXE_NETWORK_NAME, PXE_GATEWAY, PXE_SUBNET } from "./helpers/pxe-network.js";
|
||||
import { createPxeVm, destroyPxeVm, getVmMac, rebootPxeVm, serialExec } from "./helpers/pxe-vm.js";
|
||||
import { ensurePxeNetwork, destroyPxeNetwork, deleteNftablesRejectRules, PXE_NETWORK_NAME, PXE_GATEWAY, PXE_SUBNET } from "./helpers/pxe-network.js";
|
||||
import { createPxeVm, destroyPxeVm, getVmMac, rebootPxeVm, readSerialLog } from "./helpers/pxe-vm.js";
|
||||
import { sshExec } from "./helpers/ssh.js";
|
||||
|
||||
// --- Boot screenshot capture ---
|
||||
const SCREENSHOT_DIR = "/tmp/vm-screenshots";
|
||||
|
||||
/**
 * Start capturing screenshots of a libvirt VM roughly once per second into
 * SCREENSHOT_DIR, reusing 60 numbered slots so only the most recent captures
 * remain on disk.
 *
 * The directory is created if needed and emptied first (both best-effort).
 * Each capture shells out to `virsh screenshot` and then `convert`; a failed
 * capture is ignored and retried on the next tick.
 *
 * @param vmName libvirt domain name, interpolated into the virsh command line.
 * @returns a handle whose stop() ends the loop and logs how many were captured.
 */
function startBootScreenshots(vmName: string): { stop: () => void } {
  const RING_SLOTS = 60; // keep last 60 screenshots (1 per second)

  try {
    mkdirSync(SCREENSHOT_DIR, { recursive: true });
  } catch {}

  // Clean old screenshots
  try {
    const stale: string[] = require("node:fs").readdirSync(SCREENSHOT_DIR);
    for (const entry of stale) {
      rmSync(join(SCREENSHOT_DIR, entry), { force: true });
    }
  } catch {}

  let active = true;
  let captured = 0;

  // Take one screenshot into the current slot; throws if either command fails.
  const captureOnce = (): void => {
    const slot = String(captured % RING_SLOTS).padStart(4, "0");
    const ppmPath = join(SCREENSHOT_DIR, `tmp-${slot}.ppm`);
    const pngPath = join(SCREENSHOT_DIR, `boot-${slot}.png`);
    execSync(`sudo virsh screenshot ${vmName} ${ppmPath} --screen 0 2>/dev/null`, { timeout: 3000 });
    execSync(`convert ${ppmPath} ${pngPath} 2>/dev/null && rm -f ${ppmPath}`, { timeout: 3000 });
    captured += 1;
  };

  const pause = (ms: number): Promise<void> =>
    new Promise((resolve) => { setTimeout(resolve, ms); });

  const pump = async (): Promise<void> => {
    while (active) {
      try {
        captureOnce();
      } catch {}
      await pause(1000);
    }
  };
  // Fire and forget: the loop runs in the background until stop() is called.
  pump();

  const stop = (): void => {
    active = false;
    log(`Boot screenshots saved to ${SCREENSHOT_DIR}/ (${captured} captured, last ${Math.min(captured, RING_SLOTS)} kept)`);
  };

  return { stop };
}
|
||||
|
||||
// --- Test constants ---
|
||||
const VM_NAME = "lab-pxe-test";
|
||||
const VM_MEMORY = 4096; // 4GB (Anaconda needs ~2GB minimum)
|
||||
const VM_VCPUS = 2;
|
||||
const VM_VCPUS = 12;
|
||||
const VM_DISK_GB = 250; // LVM layout needs ~204GB (swap 27 + root 33 + var 100 + etc). QCOW2 is sparse.
|
||||
const HTTP_PORT = 8099; // Avoid conflicts with real bastion
|
||||
const SSH_USER = "michal"; // Admin user created by kickstart
|
||||
const SSH_USER = "lab"; // Admin user created by kickstart
|
||||
const BASTION_IP = PXE_GATEWAY; // 192.168.251.1
|
||||
const DHCP_RANGE_START = `${PXE_SUBNET}.100`;
|
||||
const DHCP_RANGE_END = `${PXE_SUBNET}.200`;
|
||||
@@ -185,15 +224,19 @@ describe("PXE boot provisioning", () => {
|
||||
// Generate dnsmasq config
|
||||
generateDnsmasqConf(config);
|
||||
|
||||
// Start HTTP server
|
||||
const { app, state } = createApp(config);
|
||||
// Start HTTP server + syslog listener
|
||||
const { app, state, syslog } = createApp(config);
|
||||
bastionApp = app;
|
||||
await app.listen({ port: config.httpPort, host: "0.0.0.0" });
|
||||
log(`Bastion HTTP server listening on :${HTTP_PORT}`);
|
||||
syslog.start();
|
||||
log(`Bastion HTTP server listening on :${HTTP_PORT}, syslog on UDP :${config.syslogPort}`);
|
||||
|
||||
// Start dnsmasq (fire-and-forget — it runs until killed)
|
||||
log("Starting dnsmasq (full DHCP mode)...");
|
||||
void startDnsmasq(config);
|
||||
// May fail without root (DHCP socket needs CAP_NET_BIND_SERVICE); libvirt network provides DHCP fallback
|
||||
log("Starting dnsmasq (proxy DHCP mode)...");
|
||||
startDnsmasq(config).catch((err) => {
|
||||
log(`dnsmasq failed (expected without root): ${err instanceof Error ? err.message : String(err)}`);
|
||||
});
|
||||
// Give dnsmasq a moment to bind ports
|
||||
await sleep(1000);
|
||||
|
||||
@@ -267,38 +310,32 @@ describe("PXE boot provisioning", () => {
|
||||
vmIp = finalState.ip ?? "";
|
||||
log(`Install complete! VM IP: ${vmIp}`);
|
||||
|
||||
// 9. Force-restart VM to ensure clean boot with updated NVRAM.
|
||||
// The %post efibootmgr sets network-first boot order, but OVMF may not
|
||||
// reread NVRAM during a warm reboot. Force cold-restart ensures it does.
|
||||
log("Force-restarting VM for clean network-first boot...");
|
||||
// 9. Reboot VM — it network-boots again, bastion /dispatch returns
|
||||
// "exit" (already installed), iPXE falls through to local disk boot.
|
||||
log("Rebooting VM (network-first → bastion dispatch → local disk)...");
|
||||
await sleep(15_000);
|
||||
rebootPxeVm(VM_NAME);
|
||||
// Libvirt recreates nftables reject rules on VM restart — wait for them then delete
|
||||
await sleep(3_000);
|
||||
deleteNftablesRejectRules();
|
||||
|
||||
// 10. Wait for SSH — VM network-boots, iPXE chains to /dispatch,
|
||||
// bastion returns exit (installed), iPXE falls through to disk boot
|
||||
// 10. Wait for SSH (with aggressive boot screenshots)
|
||||
log("Waiting for SSH access...");
|
||||
const screenshots = startBootScreenshots(VM_NAME);
|
||||
try {
|
||||
await waitForSsh(vmIp, SSH_USER, SSH_TIMEOUT_MS, sshKeyPath);
|
||||
} catch {
|
||||
// SSH failed — use serial console to diagnose
|
||||
log("SSH timed out. Diagnosing via serial console...");
|
||||
// SSH failed — read serial console (lab-boot-diag.service dumps diagnostics there)
|
||||
log("SSH timed out. Reading serial console diagnostics...");
|
||||
try {
|
||||
const hostname = await serialExec(4555, "hostname", 15_000);
|
||||
log(`Serial: hostname = ${hostname}`);
|
||||
const ip = await serialExec(4555, "ip -4 addr show | grep inet", 15_000);
|
||||
log(`Serial: ip = ${ip}`);
|
||||
const nm = await serialExec(4555, "systemctl is-active NetworkManager", 15_000);
|
||||
log(`Serial: NetworkManager = ${nm}`);
|
||||
const sshd = await serialExec(4555, "systemctl is-active sshd", 15_000);
|
||||
log(`Serial: sshd = ${sshd}`);
|
||||
const failed = await serialExec(4555, "systemctl --failed --no-pager", 15_000);
|
||||
log(`Serial: failed units = ${failed}`);
|
||||
const fstab = await serialExec(4555, "grep efi /etc/fstab", 15_000);
|
||||
log(`Serial: fstab efi = ${fstab}`);
|
||||
const serialOut = await readSerialLog(4555, { lastLines: 80, timeoutMs: 15_000 });
|
||||
log(`Serial console:\n${serialOut}`);
|
||||
} catch (serialErr) {
|
||||
log(`Serial console failed: ${serialErr instanceof Error ? serialErr.message : String(serialErr)}`);
|
||||
}
|
||||
throw new Error(`SSH not available on ${vmIp} — check serial console diagnostics above`);
|
||||
throw new Error(`SSH not available on ${vmIp} — check serial console diagnostics above. Screenshots: ${SCREENSHOT_DIR}/`);
|
||||
} finally {
|
||||
screenshots.stop();
|
||||
}
|
||||
|
||||
log("PXE provision test setup complete.");
|
||||
@@ -316,10 +353,7 @@ describe("PXE boot provisioning", () => {
|
||||
const { stopDnsmasq } = await import("../../src/bastion/src/services/dnsmasq.js");
|
||||
stopDnsmasq();
|
||||
|
||||
// Destroy VM
|
||||
destroyPxeVm(VM_NAME);
|
||||
|
||||
// Destroy network
|
||||
destroyPxeNetwork();
|
||||
|
||||
// Clean up test dir
|
||||
@@ -354,10 +388,10 @@ describe("PXE boot provisioning", () => {
|
||||
expect(data.progress).toBe("complete");
|
||||
});
|
||||
|
||||
it("log lines were captured", async () => {
|
||||
it("syslog install logs were captured", async () => {
|
||||
// Anaconda forwards logs via syslog (logging --host directive in kickstart)
|
||||
const res = await fetch(`http://${BASTION_IP}:${HTTP_PORT}/api/logs/${encodeURIComponent(vmMac)}`);
|
||||
const data = (await res.json()) as { log_total?: number; log_lines?: Array<{ line: string }> };
|
||||
// Should have at least some log lines from the log streamer
|
||||
expect(data.log_total).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
@@ -400,7 +434,15 @@ describe("PXE boot provisioning", () => {
|
||||
it("EFI boot order keeps network first (bastion controls boot)", () => {
|
||||
const result = sshExec(vmIp, SSH_USER, "sudo efibootmgr", { keyPath: sshKeyPath });
|
||||
expect(result.exitCode).toBe(0);
|
||||
expect(result.stdout).toContain("BootOrder:");
|
||||
// The first entry in BootOrder should be a network/PXE/HTTP boot entry
|
||||
const orderMatch = result.stdout.match(/BootOrder:\s*([0-9A-Fa-f]+)/);
|
||||
expect(orderMatch).toBeTruthy();
|
||||
const firstEntry = orderMatch![1];
|
||||
// Find what that entry maps to — should be network-related
|
||||
const entryLine = result.stdout.match(new RegExp(`Boot${firstEntry}\\*?\\s+(.+)`));
|
||||
expect(entryLine).toBeTruthy();
|
||||
const entryName = entryLine![1].toLowerCase();
|
||||
expect(entryName).toMatch(/network|pxe|ipv4|ipv6|http|uefi.*nic/i);
|
||||
});
|
||||
|
||||
it("tmpfs mount for /tmp is configured", () => {
|
||||
@@ -422,4 +464,53 @@ describe("PXE boot provisioning", () => {
|
||||
expect(lvs).toContain(expected);
|
||||
}
|
||||
});
|
||||
|
||||
// --- Post-provision health checks ---
|
||||
|
||||
it("no failed systemd services", () => {
|
||||
const result = sshExec(vmIp, SSH_USER, "sudo systemctl --failed --no-legend --no-pager", { keyPath: sshKeyPath });
|
||||
expect(result.exitCode).toBe(0);
|
||||
const failed = result.stdout.trim();
|
||||
expect(failed).toBe("");
|
||||
});
|
||||
|
||||
it("root filesystem is mounted read-write", () => {
|
||||
const result = sshExec(vmIp, SSH_USER, "mount | grep ' / '", { keyPath: sshKeyPath });
|
||||
expect(result.stdout).toContain("rw,");
|
||||
expect(result.stdout).not.toContain("(ro,");
|
||||
});
|
||||
|
||||
it("/boot/efi is mounted", () => {
|
||||
const result = sshExec(vmIp, SSH_USER, "mount | grep /boot/efi", { keyPath: sshKeyPath });
|
||||
expect(result.exitCode).toBe(0);
|
||||
expect(result.stdout).toContain("vfat");
|
||||
});
|
||||
|
||||
it("kernel modules are loaded (depmod correct)", () => {
|
||||
const result = sshExec(vmIp, SSH_USER, "lsmod | wc -l", { keyPath: sshKeyPath });
|
||||
expect(result.exitCode).toBe(0);
|
||||
// Should have a reasonable number of modules loaded
|
||||
expect(Number(result.stdout.trim())).toBeGreaterThan(10);
|
||||
});
|
||||
|
||||
it("SELinux is enforcing", () => {
|
||||
const result = sshExec(vmIp, SSH_USER, "getenforce", { keyPath: sshKeyPath });
|
||||
expect(result.exitCode).toBe(0);
|
||||
expect(result.stdout.trim()).toBe("Enforcing");
|
||||
});
|
||||
|
||||
it("SELinux context on /etc/fstab is correct", () => {
|
||||
const result = sshExec(vmIp, SSH_USER, "ls -Z /etc/fstab", { keyPath: sshKeyPath });
|
||||
expect(result.stdout).toContain("etc_t");
|
||||
});
|
||||
|
||||
it("sshd is running", () => {
|
||||
const result = sshExec(vmIp, SSH_USER, "sudo systemctl is-active sshd", { keyPath: sshKeyPath });
|
||||
expect(result.stdout.trim()).toBe("active");
|
||||
});
|
||||
|
||||
it("chronyd is running for time sync", () => {
|
||||
const result = sshExec(vmIp, SSH_USER, "sudo systemctl is-active chronyd", { keyPath: sshKeyPath });
|
||||
expect(result.stdout.trim()).toBe("active");
|
||||
});
|
||||
});
|
||||
|
||||
27
bastion/tests/integration/run-pxe-test.sh
Executable file
27
bastion/tests/integration/run-pxe-test.sh
Executable file
@@ -0,0 +1,27 @@
|
||||
#!/bin/bash
# One-shot PXE integration test runner.
# Compiles, runs unit tests, cleans up, and runs the full integration test.
#
# pipefail is required: step 2 pipes vitest through `tail`, and without it the
# pipeline's status is tail's, so failing unit tests would not stop the run.
set -eo pipefail

cd "$(dirname "$0")/../.."

echo "=== Step 1: Compile ==="
npx tsc --noEmit
echo "✓ Compile OK"

echo ""
echo "=== Step 2: Kickstart unit tests ==="
npx vitest run src/bastion/tests/kickstart.test.ts 2>&1 | tail -5
echo "✓ Unit tests OK"

echo ""
echo "=== Step 3: Clean up ==="
# Best-effort cleanup of leftovers from a previous run; failures are ignored.
sudo lsof -ti:8099 2>/dev/null | xargs -r sudo kill -9 || true
sudo virsh destroy lab-pxe-test 2>/dev/null || true
sudo virsh undefine lab-pxe-test --nvram 2>/dev/null || true
sudo rm -f /var/lib/libvirt/images/lab-pxe-test.qcow2
echo "✓ Cleanup done"

echo ""
echo "=== Step 4: Integration test ==="
npx vitest run -c /dev/null tests/integration/pxe-provision.test.ts 2>&1
|
||||
9
bastion/vitest.integration.config.ts
Normal file
9
bastion/vitest.integration.config.ts
Normal file
@@ -0,0 +1,9 @@
|
||||
import { defineConfig } from 'vitest/config';
|
||||
|
||||
export default defineConfig({
|
||||
test: {
|
||||
globals: true,
|
||||
include: ['tests/integration/**/*.test.ts'],
|
||||
testTimeout: 600000,
|
||||
},
|
||||
});
|
||||
Reference in New Issue
Block a user