Files
lab/bastion/src/shared/src/protocol/index.ts

175 lines
6.7 KiB
TypeScript
Raw Normal View History

feat: install logging, error trapping, PXE/ISO integration tests Kickstart installs on real hardware failed silently — no error reporting, only 3 progress callbacks, zero log streaming. This overhaul makes every install fully observable. Kickstart improvements: - Error trapping in %pre and %post (trap ERR sends failure details to bastion) - 12+ granular progress stages (was 3): SSH, hostname, k3s prep, EFI boot, metadata - Background log streamer: tails %post output and batch-sends to /api/log - bastion_log() function for explicit log lines from kickstart scripts Bastion API: - POST /api/log — receives raw log lines from kickstart (single or batch) - InstallLogBuffer — per-MAC ring buffer (2000 lines) + file persistence - GET /api/logs/:mac — now returns log_lines + log_total alongside stages - SSE /api/logs/:mac/follow — uses named events (event: stage vs event: log) - Progress events forwarded to labd via bastion-progress WebSocket message - Post-provision k3s logs routed through progressBus (was console-only) dnsmasq fixes found during VM testing: - HTTP Boot filename: ipxe-real.efi → ipxe.efi (leftover from old 2-stage approach) - pxe-service directives: only in proxy mode (breaks OVMF PXE in full mode) - PXEClient vendor class echo for UEFI firmware compatibility Integration tests: - PXE boot test: blank UEFI VM → dnsmasq → HTTP Boot → iPXE → bastion → install - ISO boot test: blank VM boots from bastion-generated ISO → same flow - Shared helpers: pxe-network (no DHCP, nftables fix), pxe-vm (UEFI + ISO boot) - test-provision.sh: runs both PXE + ISO tests with prerequisite checks - 250GB sparse QCOW2 disk (LVM layout needs ~204GB) 201 unit tests passing (11 new). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 22:26:33 +00:00
// Protocol types for agent-labd WebSocket communication.
import { randomUUID } from "node:crypto";
// --- Agent -> labd messages ---
/**
 * Messages sent from an agent to labd.
 *
 * The union is discriminated on the `type` field, so switching on
 * `msg.type` narrows to the matching variant.
 */
export type AgentMessage =
  | {
      type: "heartbeat";
      hostname: string;
      uptime: number;
      version: string;
      memUsage: number;
      cpuUsage: number;
    }
  | {
      type: "exec-stdout";
      requestId: string;
      data: string;
    }
  | {
      type: "exec-stderr";
      requestId: string;
      data: string;
    }
  | {
      type: "exec-exit";
      requestId: string;
      exitCode: number;
    }
  | {
      type: "log-line";
      requestId: string;
      line: string;
    }
  | {
      type: "log-end";
      requestId: string;
    }
  | {
      type: "enrollment-request";
      joinToken: string;
      hostname: string;
      csr: string;
    }
  | {
      type: "rotation-request";
      currentFingerprint: string;
      newCsr: string;
    };
// --- labd -> Agent messages ---
/**
 * Messages sent from labd to an agent.
 *
 * The union is discriminated on the `type` field.
 */
export type ServerMessage =
  | {
      type: "exec";
      requestId: string;
      command: string;
      args: string[];
      timeout: number;
      tty: boolean;
    }
  | {
      type: "exec-stdin";
      requestId: string;
      data: string;
    }
  | {
      type: "exec-signal";
      requestId: string;
      signal: "SIGTERM" | "SIGKILL" | "SIGINT";
    }
  | {
      type: "log-subscribe";
      requestId: string;
      options: JournalOptions;
    }
  | {
      type: "log-unsubscribe";
      requestId: string;
    }
  | {
      type: "enrollment-response";
      status: "success" | "error";
      certificatePem?: string;
      error?: string;
    }
  | {
      type: "heartbeat-ack";
      serverTime: string;
    }
  | {
      type: "server-shutdown";
      reconnectAfter: number;
    };
// --- Supporting types ---
/**
 * Options carried in the `options` field of a `log-subscribe` message.
 *
 * Every field is optional.
 * NOTE(review): the field names look like journalctl-style filters; confirm
 * the exact meaning of each against the agent code that consumes them.
 */
export interface JournalOptions {
  follow?: boolean;
  lines?: number;
  unit?: string;
  since?: string;
  priority?: string;
  kernel?: boolean;
  file?: string;
}
// --- Message types for discriminated union access ---
/** Union of every `type` tag that appears in `AgentMessage`. */
export type AgentMessageType = AgentMessage["type"];
/** Union of every `type` tag that appears in `ServerMessage`. */
export type ServerMessageType = ServerMessage["type"];
// --- Type guards ---
/** Runtime lookup table of the `type` tags accepted as agent messages. */
const AGENT_MESSAGE_TYPES = new Set<string>([
  "heartbeat",
  "exec-stdout",
  "exec-stderr",
  "exec-exit",
  "log-line",
  "log-end",
  "enrollment-request",
  "rotation-request",
]);
/** Runtime lookup table of the `type` tags accepted as server messages. */
const SERVER_MESSAGE_TYPES = new Set<string>([
  "exec",
  "exec-stdin",
  "exec-signal",
  "log-subscribe",
  "log-unsubscribe",
  "enrollment-response",
  "heartbeat-ack",
  "server-shutdown",
]);
/**
 * Type guard for agent messages.
 *
 * Only the `type` tag is inspected: the value must be a non-null object
 * whose `type` is a string listed in `AGENT_MESSAGE_TYPES`. The remaining
 * fields of the variant are not validated.
 *
 * @param msg - Any value, typically the result of `JSON.parse`.
 * @returns `true` when `msg` carries a known agent message tag.
 */
export function isAgentMessage(msg: unknown): msg is AgentMessage {
  if (typeof msg !== "object" || msg === null) {
    return false;
  }
  if (!("type" in msg)) {
    return false;
  }
  const tag = (msg as { type: unknown }).type;
  return typeof tag === "string" && AGENT_MESSAGE_TYPES.has(tag);
}
/**
 * Type guard for server messages.
 *
 * Only the `type` tag is inspected: the value must be a non-null object
 * whose `type` is a string listed in `SERVER_MESSAGE_TYPES`. The remaining
 * fields of the variant are not validated.
 *
 * @param msg - Any value, typically the result of `JSON.parse`.
 * @returns `true` when `msg` carries a known server message tag.
 */
export function isServerMessage(msg: unknown): msg is ServerMessage {
  if (typeof msg !== "object" || msg === null) {
    return false;
  }
  if (!("type" in msg)) {
    return false;
  }
  const tag = (msg as { type: unknown }).type;
  return typeof tag === "string" && SERVER_MESSAGE_TYPES.has(tag);
}
// --- Parsing utilities ---
/**
 * Parses a JSON string and checks that it carries a known agent message tag.
 *
 * @param data - Raw JSON text.
 * @returns The parsed value, typed as an `AgentMessage`.
 * @throws SyntaxError if `data` is not valid JSON (raised by `JSON.parse`).
 * @throws Error if the parsed value is not recognised by `isAgentMessage`.
 */
export function parseAgentMessage(data: string): AgentMessage {
  const msg: unknown = JSON.parse(data);
  if (!isAgentMessage(msg)) {
    // JSON.parse("null") yields null; optional chaining keeps the error path
    // from throwing a TypeError while building the message.
    const tag = (msg as { type?: string } | null)?.type ?? "unknown";
    throw new Error(`Invalid agent message: ${tag}`);
  }
  return msg;
}
/**
 * Parses a JSON string and checks that it carries a known server message tag.
 *
 * @param data - Raw JSON text.
 * @returns The parsed value, typed as a `ServerMessage`.
 * @throws SyntaxError if `data` is not valid JSON (raised by `JSON.parse`).
 * @throws Error if the parsed value is not recognised by `isServerMessage`.
 */
export function parseServerMessage(data: string): ServerMessage {
  const msg: unknown = JSON.parse(data);
  if (!isServerMessage(msg)) {
    // JSON.parse("null") yields null; optional chaining keeps the error path
    // from throwing a TypeError while building the message.
    const tag = (msg as { type?: string } | null)?.type ?? "unknown";
    throw new Error(`Invalid server message: ${tag}`);
  }
  return msg;
}
// --- Bastion -> labd messages ---
/**
 * Messages sent from a bastion to labd.
 *
 * The union is discriminated on the `type` field.
 */
export type BastionMessage =
  | {
      type: "bastion-enroll";
      token: string;
      hostname: string;
      network: string;
      serverIp: string;
    }
  | {
      type: "bastion-heartbeat";
      bastionId: string;
      uptime: number;
      machineCount: number;
    }
  | {
      type: "bastion-state-sync";
      bastionId: string;
      state: import("../types/state.js").BastionState;
    }
  | {
      type: "bastion-progress";
      bastionId: string;
      mac: string;
      stage: string;
      detail: string;
      timestamp: string;
    }
  | {
      type: "bastion-install-log";
      bastionId: string;
      mac: string;
      hostname: string;
      provisionerType: import("../types/state.js").ProvisionStackType;
      sessionId: string;
      lines: string[];
      timestamp: string;
    }
  | {
      type: "command-response";
      requestId: string;
      status: "ok" | "error";
      data?: unknown;
      error?: string;
    };
// --- labd -> Bastion messages ---
/**
 * Messages sent from labd to a bastion.
 *
 * The union is discriminated on the `type` field. Every `command-*` variant
 * carries a `requestId`.
 */
export type LabdBastionMessage =
  | {
      type: "bastion-enrolled";
      bastionId: string;
    }
  | {
      type: "bastion-heartbeat-ack";
      serverTime: string;
    }
  | {
      type: "command-install";
      requestId: string;
      mac: string;
      hostname: string;
      disk?: string;
      role: string;
      os: string;
    }
  | {
      type: "command-forget";
      requestId: string;
      mac: string;
    }
  | {
      type: "command-role-update";
      requestId: string;
      mac: string;
      role: string;
    }
  | {
      type: "command-debug";
      requestId: string;
      mac: string;
      pxeBoot?: boolean;
    }
  | {
      type: "command-register";
      requestId: string;
      mac: string;
      hostname: string;
      role: string;
      ip: string;
    }
  | {
      type: "command-discover";
      requestId: string;
      mac: string;
      product?: string;
      board?: string;
      serial?: string;
      manufacturer?: string;
      cpu_model?: string;
      cpu_cores?: number;
      memory_gb?: number;
      arch?: string;
      disks?: Array<{ name: string; size_gb: number; model: string }>;
      nics?: Array<{ name: string; mac: string; state: string }>;
    }
  | {
      type: "server-shutdown";
      reconnectAfter: number;
    };
/** Union of every `type` tag that appears in `BastionMessage`. */
export type BastionMessageType = BastionMessage["type"];
/** Union of every `type` tag that appears in `LabdBastionMessage`. */
export type LabdBastionMessageType = LabdBastionMessage["type"];
// --- Bastion type guards ---
/** Runtime lookup table of the `type` tags accepted as bastion messages. */
const BASTION_MESSAGE_TYPES = new Set<string>([
  "bastion-enroll",
  "bastion-heartbeat",
  "bastion-state-sync",
  "bastion-progress",
  "bastion-install-log",
  "command-response",
]);
/** Runtime lookup table of the `type` tags accepted as labd-to-bastion messages. */
const LABD_BASTION_MESSAGE_TYPES = new Set<string>([
  "bastion-enrolled",
  "bastion-heartbeat-ack",
  "command-install",
  "command-forget",
  "command-role-update",
  "command-debug",
  "command-register",
  "command-discover",
  "server-shutdown",
]);
/**
 * Type guard for bastion messages.
 *
 * Only the `type` tag is inspected: the value must be a non-null object
 * whose `type` is a string listed in `BASTION_MESSAGE_TYPES`. The remaining
 * fields of the variant are not validated.
 *
 * @param msg - Any value, typically the result of `JSON.parse`.
 * @returns `true` when `msg` carries a known bastion message tag.
 */
export function isBastionMessage(msg: unknown): msg is BastionMessage {
  if (typeof msg !== "object" || msg === null) {
    return false;
  }
  if (!("type" in msg)) {
    return false;
  }
  const tag = (msg as { type: unknown }).type;
  return typeof tag === "string" && BASTION_MESSAGE_TYPES.has(tag);
}
/**
 * Type guard for labd-to-bastion messages.
 *
 * Only the `type` tag is inspected: the value must be a non-null object
 * whose `type` is a string listed in `LABD_BASTION_MESSAGE_TYPES`. The
 * remaining fields of the variant are not validated.
 *
 * @param msg - Any value, typically the result of `JSON.parse`.
 * @returns `true` when `msg` carries a known labd-to-bastion message tag.
 */
export function isLabdBastionMessage(msg: unknown): msg is LabdBastionMessage {
  if (typeof msg !== "object" || msg === null) {
    return false;
  }
  if (!("type" in msg)) {
    return false;
  }
  const tag = (msg as { type: unknown }).type;
  return typeof tag === "string" && LABD_BASTION_MESSAGE_TYPES.has(tag);
}
/**
 * Parses a JSON string and checks that it carries a known bastion message tag.
 *
 * @param data - Raw JSON text.
 * @returns The parsed value, typed as a `BastionMessage`.
 * @throws SyntaxError if `data` is not valid JSON (raised by `JSON.parse`).
 * @throws Error if the parsed value is not recognised by `isBastionMessage`.
 */
export function parseBastionMessage(data: string): BastionMessage {
  const msg: unknown = JSON.parse(data);
  if (!isBastionMessage(msg)) {
    // JSON.parse("null") yields null; optional chaining keeps the error path
    // from throwing a TypeError while building the message.
    const tag = (msg as { type?: string } | null)?.type ?? "unknown";
    throw new Error(`Invalid bastion message: ${tag}`);
  }
  return msg;
}
/**
 * Parses a JSON string and checks that it carries a known labd-to-bastion
 * message tag.
 *
 * @param data - Raw JSON text.
 * @returns The parsed value, typed as a `LabdBastionMessage`.
 * @throws SyntaxError if `data` is not valid JSON (raised by `JSON.parse`).
 * @throws Error if the parsed value is not recognised by `isLabdBastionMessage`.
 */
export function parseLabdBastionMessage(data: string): LabdBastionMessage {
  const msg: unknown = JSON.parse(data);
  if (!isLabdBastionMessage(msg)) {
    // JSON.parse("null") yields null; optional chaining keeps the error path
    // from throwing a TypeError while building the message.
    const tag = (msg as { type?: string } | null)?.type ?? "unknown";
    throw new Error(`Invalid labd-bastion message: ${tag}`);
  }
  return msg;
}
// --- Request ID utility ---
/**
 * Produces a new request identifier.
 *
 * @returns The string returned by `randomUUID` from `node:crypto`.
 */
export function generateRequestId(): string {
  const requestId: string = randomUUID();
  return requestId;
}