6 Commits

Author SHA1 Message Date
Michal
aea28b5a0f fix: Cilium multi-node support — auto-detect NIC, k3s agent API port, worker label
Some checks failed
CI/CD / typecheck (pull_request) Failing after 10s
CI/CD / lint (pull_request) Failing after 22s
CI/CD / test (pull_request) Failing after 7m8s
CI/CD / build (pull_request) Has been skipped
CI/CD / publish-rpm (pull_request) Has been skipped
CI/CD / publish-deb (pull_request) Has been skipped
- Remove hardcoded devices/directRoutingDevice from Cilium install (let
  Cilium auto-detect per node — needed for heterogeneous NICs like eno1 vs enP7s7)
- Set k8sServiceHost=127.0.0.1 k8sServicePort=6444 so Cilium init
  containers can reach the API via k3s agent's local LB proxy
- Add node-role.kubernetes.io/worker label to agent config

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-31 01:35:51 +01:00
f3f0ea48e7 Merge pull request 'feat: provision register + k3s kubeconfig' (#8) from feat/register-and-kubeconfig into main
Some checks failed
CI/CD / lint (push) Failing after 10s
CI/CD / test (push) Failing after 10s
CI/CD / typecheck (push) Failing after 21s
CI/CD / build (push) Has been skipped
CI/CD / publish-rpm (push) Has been skipped
CI/CD / publish-deb (push) Has been skipped
2026-03-31 00:16:06 +00:00
Michal
49d747db98 feat: provision register command and k3s kubeconfig merge
Some checks failed
CI/CD / lint (pull_request) Failing after 11s
CI/CD / test (pull_request) Failing after 11s
CI/CD / typecheck (pull_request) Failing after 22s
CI/CD / build (pull_request) Has been skipped
CI/CD / publish-rpm (pull_request) Has been skipped
CI/CD / publish-deb (pull_request) Has been skipped
Add `labctl provision register` to re-add machines to installed state
without reprovisioning (e.g. after bastion state loss). Full stack:
protocol type, bastion API + WS handler, labd route, CLI command.

Add `labctl app k3s kubeconfig <target>` to fetch kubeconfig from a
k3s node via SSH, rewrite server URL, and merge into ~/.kube/config.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-31 01:15:31 +01:00
8635da08a6 Merge pull request 'fix: reprovision workflow bugs' (#7) from fix/reprovision-bugs into main
Some checks failed
CI/CD / typecheck (push) Failing after 10s
CI/CD / test (push) Failing after 10s
CI/CD / lint (push) Failing after 23s
CI/CD / build (push) Has been skipped
CI/CD / publish-rpm (push) Has been skipped
CI/CD / publish-deb (push) Has been skipped
Reviewed-on: #7
2026-03-30 22:44:44 +00:00
Michal
6a5f23c0f5 fix: reprovision workflow bugs — SSH host key warnings, log following, status priority
Some checks failed
CI/CD / lint (pull_request) Failing after 10s
CI/CD / test (pull_request) Failing after 10s
CI/CD / typecheck (pull_request) Failing after 23s
CI/CD / build (pull_request) Has been skipped
CI/CD / publish-rpm (pull_request) Has been skipped
CI/CD / publish-deb (pull_request) Has been skipped
- Add UserKnownHostsFile=/dev/null to SSH in debug and reprovision commands
- Track install state in log follower so it doesn't exit prematurely on "installed"
- Reorder bastion status check to prioritize active queue over stale installed state
- Update .gitignore with task file entries

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-30 22:59:45 +01:00
63cc033e3e Merge pull request 'docs: comprehensive architecture document' (#6) from docs/architecture into main
Some checks failed
CI/CD / typecheck (push) Failing after 10s
CI/CD / test (push) Failing after 11s
CI/CD / lint (push) Failing after 24s
CI/CD / build (push) Has been skipped
CI/CD / publish-rpm (push) Has been skipped
CI/CD / publish-deb (push) Has been skipped
2026-03-30 16:31:41 +00:00
17 changed files with 282 additions and 28 deletions

4
.gitignore vendored
View File

@@ -23,3 +23,7 @@ node_modules/
# OS specific
.DS_Store
# Task files
# tasks.json
# tasks/

View File

@@ -49,6 +49,9 @@ _labctl() {
"app k3s list")
COMPREPLY=($(compgen -W "--user -h --help" -- "$cur"))
return ;;
"app k3s kubeconfig")
COMPREPLY=($(compgen -W "--user --context --print -h --help" -- "$cur"))
return ;;
"init bastion")
COMPREPLY=($(compgen -W "standalone -h --help" -- "$cur"))
return ;;
@@ -67,6 +70,9 @@ _labctl() {
"provision forget")
COMPREPLY=($(compgen -W "-h --help" -- "$cur"))
return ;;
"provision register")
COMPREPLY=($(compgen -W "--role --ip -h --help" -- "$cur"))
return ;;
"provision logs")
COMPREPLY=($(compgen -W "-f --follow -h --help" -- "$cur"))
return ;;
@@ -89,7 +95,7 @@ _labctl() {
COMPREPLY=($(compgen -W "deploy status -h --help" -- "$cur"))
return ;;
"app k3s")
COMPREPLY=($(compgen -W "install health list -h --help" -- "$cur"))
COMPREPLY=($(compgen -W "install health list kubeconfig -h --help" -- "$cur"))
return ;;
"version")
COMPREPLY=($(compgen -W "-h --help" -- "$cur"))
@@ -98,7 +104,7 @@ _labctl() {
COMPREPLY=($(compgen -W "bastion -h --help" -- "$cur"))
return ;;
"provision")
COMPREPLY=($(compgen -W "list install reprovision debug forget logs makeiso -h --help" -- "$cur"))
COMPREPLY=($(compgen -W "list install reprovision debug forget register logs makeiso -h --help" -- "$cur"))
return ;;
"config")
COMPREPLY=($(compgen -W "list get set path -h --help" -- "$cur"))

View File

@@ -124,6 +124,7 @@ complete -c labctl -n "__labctl_using_cmd provision" -a install -d 'Queue a disc
complete -c labctl -n "__labctl_using_cmd provision" -a reprovision -d 'Queue install + SSH reboot into PXE (target: hostname, MAC, or IP)'
complete -c labctl -n "__labctl_using_cmd provision" -a debug -d 'PXE boot into Fedora rescue mode for debugging (target: hostname, MAC, or IP)'
complete -c labctl -n "__labctl_using_cmd provision" -a forget -d 'Remove a machine from bastion state'
complete -c labctl -n "__labctl_using_cmd provision" -a register -d 'Register an already-installed machine (e.g. after state loss)'
complete -c labctl -n "__labctl_using_cmd provision" -a logs -d 'Show provisioning logs for a machine (hostname, MAC, or IP)'
complete -c labctl -n "__labctl_using_cmd provision" -a makeiso -d 'Generate a UEFI-bootable iPXE ISO for network provisioning'
@@ -140,6 +141,10 @@ complete -c labctl -n "__labctl_in_cmd provision reprovision" -l disk -d 'Target
# provision debug options
complete -c labctl -n "__labctl_in_cmd provision debug" -l pxe-boot -d 'Boot installed system via PXE (kernel+initrd from network, root from NVMe)'
# provision register options
complete -c labctl -n "__labctl_in_cmd provision register" -l role -d 'Machine role' -xa 'vanilla worker infra labcontroller'
complete -c labctl -n "__labctl_in_cmd provision register" -l ip -d 'Machine IP address' -x
# provision logs options
complete -c labctl -n "__labctl_in_cmd provision logs" -s f -l follow -d 'Follow log output in real-time'
@@ -179,6 +184,7 @@ complete -c labctl -n "__labctl_in_cmd app labcontroller status" -l user -d 'SSH
complete -c labctl -n "__labctl_using_cmd app k3s" -a install -d 'Install k3s on a target machine (hostname, IP, or MAC)'
complete -c labctl -n "__labctl_using_cmd app k3s" -a health -d 'Check k3s health (all hosts if no target given)'
complete -c labctl -n "__labctl_using_cmd app k3s" -a list -d 'List installed machines and their k3s status'
complete -c labctl -n "__labctl_using_cmd app k3s" -a kubeconfig -d 'Fetch kubeconfig from a target and merge into ~/.kube/config'
# app k3s install options
complete -c labctl -n "__labctl_in_cmd app k3s install" -l role -d 'k3s role: infra (server) or worker (agent)' -x
@@ -192,3 +198,8 @@ complete -c labctl -n "__labctl_in_cmd app k3s health" -l user -d 'SSH user' -x
# app k3s list options
complete -c labctl -n "__labctl_in_cmd app k3s list" -l user -d 'SSH user' -x
# app k3s kubeconfig options
complete -c labctl -n "__labctl_in_cmd app k3s kubeconfig" -l user -d 'SSH user' -x
complete -c labctl -n "__labctl_in_cmd app k3s kubeconfig" -l context -d 'Context name (defaults to hostname)' -x
complete -c labctl -n "__labctl_in_cmd app k3s kubeconfig" -l print -d 'Print kubeconfig to stdout instead of merging'

View File

@@ -294,6 +294,21 @@ export async function startBastion(overrides: Partial<BastionConfig> = {}): Prom
return { status: "ok", data: { mac } };
});
// Handle labd's "command-register": record an already-installed machine in
// bastion state under its lower-cased MAC and acknowledge with mac + hostname.
labdConn.onCommand("command-register", async (msg) => {
  if (msg.type !== "command-register") throw new Error("unexpected");

  const { hostname, role, ip } = msg;
  const mac = msg.mac.toLowerCase();

  state.update((s) => {
    // Overwrites any existing entry for this MAC; the timestamp is the
    // registration time, not the original install time.
    const installedAt = new Date().toISOString();
    s.installed[mac] = { hostname, role, ip, installed_at: installedAt };
  });

  logger.info(`MACHINE REGISTERED: ${mac} -> ${hostname} (${role}) ip=${ip}`);
  return { status: "ok", data: { mac, hostname } };
});
labdConn.onCommand("command-role-update", async (msg) => {
if (msg.type !== "command-role-update") throw new Error("unexpected");
const mac = msg.mac.toLowerCase();

View File

@@ -315,6 +315,50 @@ export function registerApiRoutes(
return reply.send({ status: "ok", mac, new: isNew });
});
// Register an already-installed machine (e.g. re-add after state loss).
// Body: { mac, hostname, role?, ip? }. `role` falls back to "worker" and `ip` to "".
// Replies 400 when mac or hostname is missing, or when the role is not in SUPPORTED_ROLES.
app.post<{
  Body: {
    mac?: string;
    hostname?: string;
    role?: string;
    ip?: string;
  };
}>("/api/register", async (request, reply) => {
  const { mac: macInput, hostname, role: requestedRole, ip: requestedIp } = request.body ?? {};

  // Normalise to lower-case, colon-separated form before using it as the state key.
  const mac = (macInput ?? "").toLowerCase().replace(/-/g, ":");
  if (mac === "") {
    return reply.status(400).send({ error: "mac is required" });
  }
  if (!hostname) {
    return reply.status(400).send({ error: "hostname is required" });
  }

  const validRole = requestedRole ?? "worker";
  const knownRoles: readonly string[] = SUPPORTED_ROLES;
  if (!knownRoles.includes(validRole)) {
    return reply.status(400).send({ error: `invalid role: '${validRole}'. Supported: ${SUPPORTED_ROLES.join(", ")}` });
  }

  const address = requestedIp ?? "";

  state.update((s) => {
    const installedAt = new Date().toISOString();
    s.installed[mac] = { hostname, role: validRole, ip: address, installed_at: installedAt };
  });

  logger.info(`MACHINE REGISTERED: ${mac} -> hostname=${hostname} role=${validRole} ip=${address}`);

  return reply.send({ status: "registered", mac, hostname, role: validRole, ip: address });
});
// Update a machine's role (e.g. promote infra -> labcontroller)
app.post<{
Body: {

View File

@@ -165,6 +165,7 @@ export class BastionConnection {
case "command-forget":
case "command-role-update":
case "command-debug":
case "command-register":
void this.handleCommand(msg);
break;
}

View File

@@ -94,6 +94,12 @@ export class LabdClient {
return this.request("POST", "/api/machines/install", { body: opts });
}
/**
 * Ask labd to register an already-installed machine.
 *
 * POSTs `opts` as the request body to `/api/machines/register`.
 *
 * @param opts - Machine identity (`mac`, `hostname`) plus optional `role` and `ip`.
 * @returns The response produced by `this.request` for that call.
 */
async registerMachine(opts: {
  mac: string;
  hostname: string;
  role?: string;
  ip?: string;
}): Promise<{ status: string; data?: unknown; error?: string }> {
  const payload = { body: opts };
  return this.request("POST", "/api/machines/register", payload);
}
async debugMachine(mac: string, opts?: { pxeBoot?: boolean }): Promise<{ status: string; data?: { mac: string; hostname: string }; error?: string }> {
return this.request("POST", "/api/machines/debug", { body: { mac, pxeBoot: opts?.pxeBoot } });
}

View File

@@ -1,9 +1,10 @@
// CLI command: labctl app k3s install/health/kubeconfig <target>
// Install or check k3s on a target machine, or fetch its kubeconfig, via SSH.
import { existsSync } from "node:fs";
import { existsSync, writeFileSync, mkdirSync } from "node:fs";
import { homedir } from "node:os";
import { join } from "node:path";
import { execSync } from "node:child_process";
import type { Command } from "commander";
import type { BastionState } from "@lab/shared";
import { K3sModule, sshExec } from "@lab/modules";
@@ -400,4 +401,88 @@ export function registerAppCommand(program: Command): void {
);
}
});
// `labctl app k3s kubeconfig <target>`
// Fetches /etc/rancher/k3s/k3s.yaml from the target over SSH, points the server
// URL at the target's IP, renames the "default" cluster/user/context to the
// chosen context name, then either prints the result (--print) or merges it
// into ~/.kube/config and switches the current context to it.
k3sCmd
  .command("kubeconfig <target>")
  .description("Fetch kubeconfig from a target and merge into ~/.kube/config")
  .option("--user <user>", "SSH user", "root")
  .option("--context <name>", "Context name (defaults to hostname)")
  .option("--print", "Print kubeconfig to stdout instead of merging")
  .action(async (target: string, opts: {
    user: string;
    context?: string;
    print?: boolean;
  }) => {
    const state = await fetchState();
    const resolved = resolveTarget(target, state);
    if (!resolved) {
      console.error(`Cannot resolve target: ${target}`);
      console.error("Provide an IP address, hostname, or MAC of an installed machine.");
      process.exit(1);
    }

    const sshKey = findSshKey();

    // Fetch kubeconfig via SSH
    let raw: string;
    try {
      const result = await sshExec(resolved.ip, opts.user, "cat /etc/rancher/k3s/k3s.yaml", {
        ...(sshKey ? { keyPath: sshKey } : {}),
        timeoutMs: 10_000,
      });
      raw = result.stdout;
    } catch (err) {
      console.error(`Failed to fetch kubeconfig: ${err instanceof Error ? err.message : String(err)}`);
      process.exit(1);
    }

    // Refuse to continue with an empty result: it would otherwise be printed as
    // a "kubeconfig" or, when no ~/.kube/config exists yet, written out as one.
    if (raw.trim() === "") {
      console.error(`Failed to fetch kubeconfig: /etc/rancher/k3s/k3s.yaml on ${resolved.ip} is empty or unreadable`);
      process.exit(1);
    }

    const contextName = opts.context ?? resolved.hostname;
    const serverPrefix = `server: https://${resolved.ip}:`;

    // Rewrite: replace 127.0.0.1 with actual IP, rename cluster/user/context.
    // Function replacers are used so "$" sequences in the context name or IP
    // are inserted literally instead of being treated as replacement patterns.
    const rewritten = raw
      .replace(/server:\s*https:\/\/127\.0\.0\.1:/, () => serverPrefix)
      .replace(/name:\s*default/g, () => `name: ${contextName}`)
      .replace(/cluster:\s*default/g, () => `cluster: ${contextName}`)
      .replace(/user:\s*default/g, () => `user: ${contextName}`)
      .replace(/current-context:\s*default/, () => `current-context: ${contextName}`);

    if (opts.print) {
      process.stdout.write(rewritten);
      return;
    }

    // Merge into ~/.kube/config using kubectl
    const kubeDir = join(homedir(), ".kube");
    mkdirSync(kubeDir, { recursive: true });
    const mainConfig = join(kubeDir, "config");
    // The context name is user-controlled; keep it from escaping kubeDir.
    const safeName = contextName.replace(/[^A-Za-z0-9._-]/g, "_");
    const tmpFile = join(kubeDir, `.labctl-${safeName}.tmp`);

    writeFileSync(tmpFile, rewritten, { mode: 0o600 });

    try {
      // Run kubectl without a shell so the context name and paths are passed
      // as plain arguments (no quoting or injection issues).
      const { execFileSync } = await import("node:child_process");

      if (existsSync(mainConfig)) {
        // kubectl keeps the first file's value on conflicts, so the freshly
        // fetched file goes first: re-running after a reprovision then replaces
        // stale credentials for the same context name.
        const merged = execFileSync("kubectl", ["config", "view", "--flatten"], {
          encoding: "utf-8",
          env: { ...process.env, KUBECONFIG: `${tmpFile}:${mainConfig}` },
        });
        writeFileSync(mainConfig, merged, { mode: 0o600 });
      } else {
        writeFileSync(mainConfig, rewritten, { mode: 0o600 });
      }

      // Set current context in the file we just wrote, regardless of any
      // KUBECONFIG value in the caller's environment.
      execFileSync(
        "kubectl",
        ["config", "use-context", contextName, "--kubeconfig", mainConfig],
        { stdio: "pipe" },
      );

      console.log(`Merged kubeconfig for ${contextName} (${resolved.ip})`);
      console.log(`Context set to: ${contextName}`);
      console.log(`\nSwitch contexts: kubectl config use-context <name>`);
    } catch (err) {
      console.error(`Failed to merge kubeconfig: ${err instanceof Error ? err.message : String(err)}`);
      console.error(`Standalone config saved at: ${tmpFile}`);
      // process.exit() ends the process here, so the cleanup below is skipped
      // and the standalone file is left behind as the message says.
      process.exit(1);
    } finally {
      try { const { unlinkSync } = await import("node:fs"); unlinkSync(tmpFile); } catch { /* ignore */ }
    }
  });
}

View File

@@ -103,6 +103,7 @@ export function registerDebugCommand(parent: Command): void {
const sshArgs = [
"-o", "StrictHostKeyChecking=no",
"-o", "UserKnownHostsFile=/dev/null",
"-o", "ConnectTimeout=10",
...(sshKey !== undefined ? ["-i", sshKey] : []),
`${effectiveUser}@${ip}`,

View File

@@ -103,6 +103,7 @@ async function followLogs(
let lastStageCount = 0;
let lastStatus = "";
let sawInstalling = false;
while (true) {
try {
@@ -118,6 +119,10 @@ async function followLogs(
lastStatus = status;
}
if (status === "installing" || status === "queued") {
sawInstalling = true;
}
// Print new stages
if (log && log.length > lastStageCount) {
for (let i = lastStageCount; i < log.length; i++) {
@@ -130,8 +135,9 @@ async function followLogs(
lastStageCount = log.length;
}
// Done
if (status === "installed") {
// Only exit on "installed" if we actually saw the install happen
// (avoids exiting immediately when following a reprovision that hasn't started yet)
if (status === "installed" && sawInstalling) {
const ip = data["ip"] ?? "";
console.log("");
console.log(` ${GREEN}${BOLD}Install complete!${RESET}${ip ? ` ${DIM}ssh lab@${ip}${RESET}` : ""}`);

View File

@@ -0,0 +1,37 @@
// CLI command: provision register
// Register an already-installed machine that is missing from bastion state.
import { Command, Option } from "commander";
import { SUPPORTED_ROLES } from "@lab/shared";
import { getLabdClient } from "../api/config.js";
/**
 * Attach `register <mac> <hostname>` to the given parent command.
 *
 * The action sends the machine to labd via `registerMachine`. It prints a
 * confirmation on success, or prints the failure and exits with code 1.
 *
 * @param parent - Command the `register` subcommand is added to.
 */
export function registerRegisterCommand(parent: Command): void {
  const roleOption = new Option("--role <role>", "Machine role")
    .choices([...SUPPORTED_ROLES])
    .default("worker");

  const run = async (mac: string, hostname: string, opts: {
    role: string;
    ip?: string;
  }): Promise<void> => {
    try {
      // Only include `ip` when one was actually supplied.
      const payload: { mac: string; hostname: string; role: string; ip?: string } = {
        mac,
        hostname,
        role: opts.role,
      };
      if (opts.ip) {
        payload.ip = opts.ip;
      }

      const result = await getLabdClient().registerMachine(payload);
      if (result.error) {
        console.error(`Failed: ${result.error}`);
        process.exit(1);
      }

      const ipSuffix = opts.ip ? `, ip=${opts.ip}` : "";
      console.log(`Registered ${mac} as ${hostname} (role=${opts.role}${ipSuffix})`);
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      console.error(`Failed: ${reason}`);
      process.exit(1);
    }
  };

  parent
    .command("register <mac> <hostname>")
    .description("Register an already-installed machine (e.g. after state loss)")
    .addOption(roleOption)
    .option("--ip <address>", "Machine IP address")
    .action(run);
}

View File

@@ -144,6 +144,7 @@ export function registerReprovisionCommand(parent: Command): void {
const sshArgs = [
"-o", "StrictHostKeyChecking=no",
"-o", "UserKnownHostsFile=/dev/null",
"-o", "ConnectTimeout=10",
...(sshKey !== undefined ? ["-i", sshKey] : []),
`${effectiveUser}@${ip}`,

View File

@@ -2,7 +2,7 @@
// CLI entry point for lab-bastion.
// Commands:
// init bastion standalone start/stop/status
// provision list/install/reprovision/forget
// provision list/install/reprovision/forget/register
import { fileURLToPath } from "node:url";
import { Command, Option } from "commander";
@@ -16,6 +16,7 @@ import { registerListCommand } from "./commands/list.js";
import { registerReprovisionCommand } from "./commands/reprovision.js";
import { registerDebugCommand } from "./commands/debug.js";
import { registerForgetCommand } from "./commands/forget.js";
import { registerRegisterCommand } from "./commands/register.js";
import { registerLogsCommand } from "./commands/logs.js";
import { registerMakeIsoCommand } from "./commands/makeiso.js";
import { registerConfigCommand } from "./commands/config.js";
@@ -98,6 +99,7 @@ export function createProgram(): Command {
registerReprovisionCommand(provisionCmd);
registerDebugCommand(provisionCmd);
registerForgetCommand(provisionCmd);
registerRegisterCommand(provisionCmd);
registerLogsCommand(provisionCmd);
registerMakeIsoCommand(provisionCmd);

View File

@@ -172,6 +172,43 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void
}
});
// Register an already-installed machine — route to correct bastion (or single bastion).
// Replies 400 without mac/hostname, 503 with no bastion connected, 404 when the MAC is
// unknown and more than one bastion is connected, otherwise relays the bastion's result.
// NOTE(review): `role` is forwarded as-is here, while the bastion's own HTTP
// /api/register checks it against SUPPORTED_ROLES — confirm whether this path should too.
app.post<{
  Body: { mac?: string; hostname?: string; role?: string; ip?: string };
}>("/api/machines/register", async (request, reply) => {
  const { mac, hostname, role, ip } = request.body ?? {};
  if (!mac || !hostname) {
    return reply.code(400).send({ error: "mac and hostname are required" });
  }

  // Lower-case, colon-separated form is what the bastions are keyed by.
  const normalized = mac.toLowerCase().replace(/-/g, ":");

  // Prefer the bastion that already knows this MAC; fall back to the only connected one.
  const target =
    bastionRegistry.findBastionByMac(normalized) ??
    (bastionRegistry.getAll().length === 1 ? bastionRegistry.getAll()[0] : null);

  if (!target) {
    return bastionRegistry.getAll().length === 0
      ? reply.code(503).send({ error: "No bastions connected" })
      : reply.code(404).send({ error: `MAC ${normalized} not found on any bastion and multiple bastions connected` });
  }

  try {
    const result = await sendCommand(target.bastionId, {
      type: "command-register",
      mac: normalized,
      hostname,
      role: role ?? "worker",
      ip: ip ?? "",
    });
    const httpCode = result.status === "ok" ? 200 : 500;
    return reply.code(httpCode).send(result);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    return reply.code(500).send({ error: reason });
  }
});
// Queue debug/rescue mode — route to correct bastion by MAC
app.post<{
Body: { mac?: string; pxeBoot?: boolean };
@@ -257,17 +294,7 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void
const queued = bastion.state.install_queue[mac];
const installed = bastion.state.installed[mac];
if (installed) {
return {
mac,
hostname: installed.hostname,
status: "installed",
role: installed.role,
ip: installed.ip,
installed_at: installed.installed_at,
};
}
// Active install takes priority over old installed state (reprovision case)
if (queued) {
return {
mac,
@@ -282,6 +309,17 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void
};
}
if (installed) {
return {
mac,
hostname: installed.hostname,
status: "installed",
role: installed.role,
ip: installed.ip,
installed_at: installed.installed_at,
};
}
return reply.code(404).send({ error: `MAC ${mac} not found in install queue or installed` });
});
}

View File

@@ -35,21 +35,15 @@ export const installCilium: Operation = async (ctx): Promise<OperationResult> =>
}
details.push(`Installed cilium CLI ${version} (${cliArch})`);
// Detect default network device (avoid tailscale/wireguard)
const devResult = await ctx.ssh.exec(
"ip -4 route show default | awk '{print $5}' | head -1",
sshOpts(ctx),
);
const defaultDev = devResult.stdout.trim();
details.push(`Network device: ${defaultDev}`);
// Install Cilium
// - No hardcoded devices: Cilium auto-detects per node (heterogeneous NICs like eno1 vs enP7s7)
// - k8sServiceHost/Port: k3s agents proxy the API on 127.0.0.1:6444 (not 6443)
const installResult = await ctx.ssh.exec(
`KUBECONFIG=/etc/rancher/k3s/k3s.yaml cilium install \
--set kubeProxyReplacement=true \
--set ipam.mode=kubernetes \
--set devices="${defaultDev}" \
--set nodePort.directRoutingDevice="${defaultDev}"`,
--set k8sServiceHost=127.0.0.1 \
--set k8sServicePort=6444`,
{ timeoutMs: 300_000 },
);
if (installResult.exitCode !== 0) {

View File

@@ -42,6 +42,8 @@ ${tlsSans.map((s) => ` - "${s}"`).join("\n")}
function generateAgentConfig(): string {
return `protect-kernel-defaults: true
node-label:
- "node-role.kubernetes.io/worker=true"
kubelet-arg:
- "protect-kernel-defaults=true"
- "streaming-connection-idle-timeout=5m"

View File

@@ -112,6 +112,7 @@ export type LabdBastionMessage =
| { type: "command-forget"; requestId: string; mac: string }
| { type: "command-role-update"; requestId: string; mac: string; role: string }
| { type: "command-debug"; requestId: string; mac: string; pxeBoot?: boolean }
| { type: "command-register"; requestId: string; mac: string; hostname: string; role: string; ip: string }
| { type: "server-shutdown"; reconnectAfter: number };
export type BastionMessageType = BastionMessage["type"];
@@ -126,7 +127,7 @@ const BASTION_MESSAGE_TYPES = new Set<string>([
const LABD_BASTION_MESSAGE_TYPES = new Set<string>([
"bastion-enrolled", "bastion-heartbeat-ack", "command-install",
"command-forget", "command-role-update", "command-debug", "server-shutdown",
"command-forget", "command-role-update", "command-debug", "command-register", "server-shutdown",
]);
export function isBastionMessage(msg: unknown): msg is BastionMessage {