bastion: discover-first PXE provisioning with multi-arch support

Rewrote bastion from install-only to discover-first flow:
- Default mode discovers hardware (PXE boot → inventory → poweroff)
- Discovered machines promoted to install via subcommand
- Per-MAC iPXE dispatch (/dispatch?mac=) routes discover vs install
- Python HTTP server with discovery API, state management, kickstart gen
- Added full DHCP mode (DHCP_MODE=full) for isolated/test networks
- Added arm64 UEFI support (client-arch 11, iPXE arm64 binary)
- Added QEMU test script (aarch64+KVM on Asahi Linux)
- All API endpoints unit tested and working

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Michal Rydlikowski
2026-03-16 00:06:04 +00:00
parent 5ba22b94ea
commit 2a429088c5
2 changed files with 272 additions and 8 deletions

242
test-bastion.sh Executable file
View File

@@ -0,0 +1,242 @@
#!/usr/bin/env bash
# ─────────────────────────────────────────────────────────────────────
# test-bastion.sh — End-to-end test of PXE bastion using QEMU
#
# Creates an isolated virtual network, starts the bastion in full DHCP
# mode, and PXE boots a QEMU VM to test the discovery flow.
#
# Uses aarch64 + KVM on Apple Silicon for near-native speed.
#
# Usage:
# sudo bash test-bastion.sh # discover test (default)
# sudo bash test-bastion.sh --install # discover + install test
# sudo bash test-bastion.sh --cleanup # remove test artifacts
#
# Requirements: qemu-system-aarch64, qemu-img, edk2-aarch64, dnsmasq, python3, curl, iproute (ip)
# ─────────────────────────────────────────────────────────────────────
# Strict mode: stop on command failure, unset variables and failed pipeline stages.
set -euo pipefail

# Absolute directory of this script; bastion.sh is looked up next to it.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

# Mode flag from the first CLI argument; the discover test is the default.
MODE="${1:---discover}"

# ── Virtual network ──
# Bridge and tap device names plus the /24 the bridge is addressed in.
BRIDGE="lab-br0"
TAP="lab-tap0"
BRIDGE_IP="10.99.0.1"
BRIDGE_CIDR="${BRIDGE_IP}/24"
BRIDGE_NET="10.99.0.0"

# ── Test artifacts ──
# Everything the test writes (log, disk image, UEFI vars copy) lives here.
TEST_DIR="/tmp/lab-bastion-test"
BASTION_LOG="${TEST_DIR}/bastion.log"
DISK="${TEST_DIR}/test-disk.qcow2"

# ── UEFI firmware ──
# Firmware code image, pristine vars template, and the per-test vars copy.
OVMF_CODE="/usr/share/edk2/aarch64/QEMU_EFI-pflash.raw"
OVMF_VARS_TEMPLATE="/usr/share/AAVMF/AAVMF_VARS.fd"
OVMF_VARS="${TEST_DIR}/AAVMF_VARS.fd"

# ── ANSI colour escapes ──
# Stored with literal backslashes; expanded by the output helpers at print time.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m'
# Print an informational message on stdout, prefixed with a green "[test]" tag.
# Arguments: message words (joined with spaces; backslash escapes are expanded).
log() {
  printf '%b\n' "${GREEN}[test]${NC} $*"
}
# Print a warning on stdout, prefixed with a yellow "[test]" tag.
# Arguments: message words (joined with spaces; backslash escapes are expanded).
warn() {
  printf '%b\n' "${YELLOW}[test]${NC} $*"
}
# Print an error on stderr, prefixed with a red "[test]" tag, then terminate
# the script with exit status 1.
# Arguments: message words (joined with spaces; backslash escapes are expanded).
die() {
  printf '%b\n' "${RED}[test]${NC} $*" >&2
  exit 1
}
# ──── Cleanup subcommand ──────────────────────────────────────────
# `--cleanup` removes what earlier runs left behind (tap device, bridge,
# test directory) and exits without running any test.
if [[ "$MODE" == "--cleanup" ]]; then
  # Link deletion needs root. Without this check the silenced `ip` failures
  # below would let the script print "Done." although nothing was removed.
  [[ $EUID -eq 0 ]] || die "Run as root: sudo bash test-bastion.sh --cleanup"
  echo "Cleaning up test artifacts..."
  # Best effort: the devices may simply not exist.
  ip link del "$TAP" 2>/dev/null || true
  ip link del "$BRIDGE" 2>/dev/null || true
  # `:?` aborts instead of expanding to an empty path; `--` ends option parsing.
  rm -rf -- "${TEST_DIR:?}"
  echo "Done."
  exit 0
fi
# ──── Preflight ───────────────────────────────────────────────────
# Verify privileges, tools, firmware files and KVM before anything on the
# host is modified, so that every missing prerequisite fails fast.
[[ $EUID -eq 0 ]] || die "Run as root: sudo bash test-bastion.sh"

# Collect all missing tools first so they are reported in one message.
MISSING=""
for tool in qemu-system-aarch64 qemu-img dnsmasq python3 curl; do
  command -v "$tool" >/dev/null || MISSING="$MISSING $tool"
done
[[ -z "$MISSING" ]] || die "Missing:$MISSING\n Install with: sudo dnf install$MISSING"

# `ip` is checked separately because its package name differs from the command.
command -v ip >/dev/null || die "Missing: ip\n Install with: sudo dnf install iproute"

[[ -f "$OVMF_CODE" ]] || die "UEFI firmware not found: $OVMF_CODE\n Install with: sudo dnf install edk2-aarch64"
# The vars template is copied before the VM boots; checking it here avoids
# failing only after the network and the bastion have already been started.
[[ -f "$OVMF_VARS_TEMPLATE" ]] || die "UEFI vars template not found: $OVMF_VARS_TEMPLATE\n Install with: sudo dnf install edk2-aarch64"
[[ -e /dev/kvm ]] || die "/dev/kvm not available — KVM required for aarch64 testing"

mkdir -p "$TEST_DIR"
# ──── Cleanup handler ─────────────────────────────────────────────
# PIDs of the background bastion and of the log tail; empty until started.
BASTION_PID=""
TAIL_PID=""

# Stop the background processes and remove the virtual network devices.
# Globals: TAIL_PID, BASTION_PID, TAP, BRIDGE, BASTION_LOG, TEST_DIR (read)
# Every step is best effort: a process or device that is already gone is
# not an error.
cleanup() {
  # Ignore further signals so a second Ctrl-C cannot abort the teardown halfway.
  trap '' INT TERM
  echo ""
  log "Cleaning up..."
  if [[ -n "$TAIL_PID" ]]; then
    kill "$TAIL_PID" 2>/dev/null || true
  fi
  if [[ -n "$BASTION_PID" ]]; then
    kill "$BASTION_PID" 2>/dev/null || true
  fi
  # Presumably gives the killed processes a moment to shut down before the
  # interfaces they use are removed.
  sleep 1
  ip link set "$TAP" down 2>/dev/null || true
  ip link del "$TAP" 2>/dev/null || true
  ip link set "$BRIDGE" down 2>/dev/null || true
  ip link del "$BRIDGE" 2>/dev/null || true
  log "Done. Logs: $BASTION_LOG State: $TEST_DIR/bastion/state.json"
}

# Only EXIT runs cleanup. INT and TERM are converted into a regular exit
# (status 128 + signal number): a handler that merely returns would let the
# script resume after the signal and run cleanup a second time on exit.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM
# ──── Create isolated virtual network ─────────────────────────────
log "Creating virtual network ${BOLD}${BRIDGE_NET}/24${NC} ..."

# Drop devices left over from a previous run; a missing device is fine.
for stale_dev in "$TAP" "$BRIDGE"; do
  ip link del "$stale_dev" 2>/dev/null || true
done

# Bridge: create it, assign the test network address, bring it up.
ip link add "$BRIDGE" type bridge
ip addr add "$BRIDGE_CIDR" dev "$BRIDGE"
ip link set "$BRIDGE" up

# Tap device: create it, attach it to the bridge, bring it up.
ip tuntap add dev "$TAP" mode tap
ip link set "$TAP" master "$BRIDGE"
ip link set "$TAP" up

log "Bridge ${BOLD}$BRIDGE${NC} at ${BOLD}$BRIDGE_IP${NC}, tap ${BOLD}$TAP${NC}"
# ──── Start bastion ───────────────────────────────────────────────
log "Starting bastion (full DHCP mode, aarch64)..."

# Run `bastion.sh serve` in the background. Its settings are passed through
# the environment (ARCH is forced to aarch64 for the test VM); stdout and
# stderr are both captured in the log file.
IFACE="$BRIDGE" DHCP_MODE="full" ARCH="aarch64" \
  BASTION_DIR="$TEST_DIR/bastion" HTTP_PORT=8080 \
  bash "$SCRIPT_DIR/bastion.sh" serve >"$BASTION_LOG" 2>&1 &
BASTION_PID=$!

# Mirror the log to the terminal after a short pause; --pid makes the tail
# terminate once the bastion process is gone.
sleep 1
tail --pid="$BASTION_PID" -f "$BASTION_LOG" 2>/dev/null &
TAIL_PID=$!
# Block until the bastion answers HTTP requests, or give up.
log "Waiting for bastion to start..."
READY=false
# Up to 60 attempts with a one-second pause after each failed one.
for (( attempt = 1; attempt <= 60; attempt++ )); do
  # A successful fetch of /boot.ipxe is taken as "HTTP server is up".
  if curl -sf "http://${BRIDGE_IP}:8080/boot.ipxe" >/dev/null 2>&1; then
    READY=true
    break
  fi
  # No point in waiting further when the bastion process is gone: show the
  # tail of its log and abort.
  kill -0 "$BASTION_PID" 2>/dev/null || {
    echo ""
    log "Bastion failed to start. Last 20 lines:"
    tail -20 "$BASTION_LOG"
    die "Bastion exited unexpectedly"
  }
  sleep 1
done
[[ "$READY" == true ]] || die "Bastion HTTP not responding after 60s"
log "Bastion is ready!"
# ──── Prepare UEFI vars and disk ──────────────────────────────────
# Both files are created only when absent, so an existing vars copy and an
# existing disk image are reused as they are.
[[ -f "$OVMF_VARS" ]] || cp "$OVMF_VARS_TEMPLATE" "$OVMF_VARS"

[[ -f "$DISK" ]] || {
  log "Creating 20G test disk..."
  qemu-img create -f qcow2 "$DISK" 20G >/dev/null
}
# ──── Boot QEMU VM ────────────────────────────────────────────────
printf '\n'
log "${BOLD}Booting QEMU VM (aarch64 + KVM — PXE network boot)${NC}"
log "UEFI firmware will attempt PXE boot automatically."
log "Watch for ${BOLD}'NEW MACHINE DISCOVERED'${NC} in bastion output."
printf '\n'
printf '%b\n' "${CYAN}──── QEMU console ────${NC}"
printf '\n'

# aarch64 UEFI PXE boot, KVM-accelerated:
#   - two pflash drives carry the UEFI firmware (read-only code + writable vars)
#   - the qcow2 test disk is attached through virtio
#   - a virtio-net-pci NIC sits on the tap device (UEFI has a PXE driver for it)
#   - the disk gets no boot priority, so the firmware falls through to PXE
qemu_discover_args=(
  -machine virt,gic-version=3
  -cpu host
  --enable-kvm
  -m 2048
  -smp 2
  -drive "if=pflash,format=raw,readonly=on,file=$OVMF_CODE"
  -drive "if=pflash,format=raw,file=$OVMF_VARS"
  -drive "if=virtio,format=qcow2,file=$DISK"
  -netdev "tap,id=net0,ifname=$TAP,script=no,downscript=no"
  -device virtio-net-pci,netdev=net0
  -boot n
  -nographic
)
qemu-system-aarch64 "${qemu_discover_args[@]}"
# ──── Post-test ───────────────────────────────────────────────────
# Ask the bastion which machines it has recorded and decide pass/fail.
echo ""
log "QEMU exited. Checking bastion state..."
# Fall back to an empty JSON object when the API cannot be reached, so the
# parsing below still yields a count.
STATE=$(curl -sf "http://${BRIDGE_IP}:8080/api/machines" 2>/dev/null || echo '{}')
# Number of entries under the "discovered" key; any parse error counts as 0.
DISCOVERED=$(echo "$STATE" | python3 -c "
import sys, json
state = json.load(sys.stdin)
print(len(state.get('discovered', {})))
" 2>/dev/null || echo "0")
echo ""
if [[ "$DISCOVERED" -gt 0 ]]; then
  log "${GREEN}${BOLD}SUCCESS — $DISCOVERED machine(s) discovered!${NC}"
  # Prefer the bastion's own listing; if that fails, pretty-print the raw JSON.
  HTTP_PORT=8080 bash "$SCRIPT_DIR/bastion.sh" list 2>/dev/null || \
    echo "$STATE" | python3 -m json.tool
else
  warn "No machines discovered. Check bastion log: $BASTION_LOG"
  # A discovery test that discovered nothing has failed: report that through
  # the exit status so callers can detect it. The EXIT trap still cleans up.
  exit 1
fi
# ──── Install phase (if requested) ────────────────────────────────
# Runs only for `--install` and only when discovery recorded a machine.
if [[ "$MODE" == "--install" && "$DISCOVERED" -gt 0 ]]; then
  # Take the first MAC address listed under "discovered".
  MAC=$(echo "$STATE" | python3 -c "
import sys, json
state = json.load(sys.stdin)
print(list(state.get('discovered', {}).keys())[0])
" 2>/dev/null)
  if [[ -n "$MAC" ]]; then
    printf '\n'
    log "Install mode: queuing ${BOLD}$MAC${NC} as ${BOLD}test-node${NC}..."
    HTTP_PORT=8080 bash "$SCRIPT_DIR/bastion.sh" install "$MAC" test-node

    # Start again from pristine UEFI vars so the VM PXE boots (not from disk).
    cp "$OVMF_VARS_TEMPLATE" "$OVMF_VARS"

    printf '\n'
    log "Re-booting QEMU for install phase..."
    printf '\n'
    printf '%b\n' "${CYAN}──── QEMU console (install phase) ────${NC}"
    printf '\n'

    # Same VM definition as the discovery boot: pflash firmware, virtio
    # disk, virtio NIC on the tap device, network boot, serial console.
    qemu_install_args=(
      -machine virt,gic-version=3
      -cpu host
      --enable-kvm
      -m 2048
      -smp 2
      -drive "if=pflash,format=raw,readonly=on,file=$OVMF_CODE"
      -drive "if=pflash,format=raw,file=$OVMF_VARS"
      -drive "if=virtio,format=qcow2,file=$DISK"
      -netdev "tap,id=net0,ifname=$TAP,script=no,downscript=no"
      -device virtio-net-pci,netdev=net0
      -boot n
      -nographic
    )
    qemu-system-aarch64 "${qemu_install_args[@]}"

    printf '\n'
    log "Install phase complete."
  fi
fi

printf '\n'
log "Test finished."