feat(cli): honor BASTION_DIR env var as default for --dir

bastion serve/stop default for --dir was hardcoded to /tmp/lab-bastion. Now reads BASTION_DIR from env if set, so a deployed bastion daemon can run from a persistent directory without callers having to pass --dir on every invocation. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
build(labd): include @lab/core in the Dockerfile build chain
2026-05-05 22:09:24 +01:00 · 2026-05-05 22:09:24 +01:00 · 2026-05-05 22:09:24 +01:00 · 2026-05-05 22:06:34 +01:00 · 2026-05-05 20:29:51 +00:00 · 2026-05-05 21:29:16 +01:00
35 changed files with 3975 additions and 148 deletions
--- a/bastion/Dockerfile.labd
+++ b/bastion/Dockerfile.labd
@@ -11,6 +11,7 @@ WORKDIR /app
 # Copy workspace config and package manifests first (layer cache)
 COPY pnpm-workspace.yaml pnpm-lock.yaml package.json tsconfig.base.json tsconfig.json ./
 COPY src/shared/package.json src/shared/tsconfig.json src/shared/
+COPY src/core/package.json   src/core/tsconfig.json   src/core/
 COPY src/labd/package.json   src/labd/tsconfig.json   src/labd/

 # Install all dependencies (dev included -- needed for build)
@@ -22,10 +23,13 @@ RUN pnpm --filter @lab/labd exec prisma generate

 # Copy source code
 COPY src/shared/src/ src/shared/src/
+COPY src/core/src/   src/core/src/
 COPY src/labd/src/   src/labd/src/

-# Build TypeScript (shared first via project references)
-RUN pnpm --filter @lab/shared build && pnpm --filter @lab/labd build
+# Build TypeScript (shared + core before labd via project references)
+RUN pnpm --filter @lab/shared build \
+ && pnpm --filter @lab/core build \
+ && pnpm --filter @lab/labd build

 # Hoist the generated Prisma client so stage 2 can COPY it from a stable path
 RUN mkdir -p /app/_prisma && \
@@ -41,6 +45,7 @@ WORKDIR /app
 # Copy workspace config and package manifests
 COPY pnpm-workspace.yaml pnpm-lock.yaml package.json ./
 COPY src/shared/package.json src/shared/
+COPY src/core/package.json   src/core/
 COPY src/labd/package.json   src/labd/

 # Install production dependencies only
@@ -48,6 +53,7 @@ RUN pnpm install --frozen-lockfile --prod 2>/dev/null || pnpm install --prod

 # Copy built output from builder
 COPY --from=builder /app/src/shared/dist/ src/shared/dist/
+COPY --from=builder /app/src/core/dist/   src/core/dist/
 COPY --from=builder /app/src/labd/dist/   src/labd/dist/

 # Copy Prisma schema + generated client into pnpm store location
--- a/bastion/pnpm-lock.yaml
+++ b/bastion/pnpm-lock.yaml
--- a/bastion/src/cli/src/commands/serve.ts
+++ b/bastion/src/cli/src/commands/serve.ts
@@ -11,7 +11,7 @@ export function registerStartCommand(parent: Command): void {
    .command("start")
    .description("Start the bastion server (HTTP + dnsmasq PXE)")
    .option("--port <port>", "HTTP port", "8080")
-    .option("--dir <dir>", "Bastion data directory", "/tmp/lab-bastion")
+    .option("--dir <dir>", "Bastion data directory", process.env["BASTION_DIR"] ?? "/tmp/lab-bastion")
    .option("--domain <domain>", "Internal domain for hostnames", "ad.itaz.eu")
    .option("--dhcp-mode <mode>", "DHCP mode: proxy or full", "proxy")
    .option("--fedora <version>", "Fedora version", "43")
--- a/bastion/src/cli/src/commands/stop.ts
+++ b/bastion/src/cli/src/commands/stop.ts
@@ -8,7 +8,7 @@ export function registerStopCommand(parent: Command): void {
  parent
    .command("stop")
    .description("Stop a running bastion server")
-    .option("--dir <dir>", "Bastion data directory", "/tmp/lab-bastion")
+    .option("--dir <dir>", "Bastion data directory", process.env["BASTION_DIR"] ?? "/tmp/lab-bastion")
    .action((opts: { dir: string }) => {
      const pidFile = `${opts.dir}/bastion.pid`;

--- a/bastion/src/core/package.json
+++ b/bastion/src/core/package.json
@@ -0,0 +1,23 @@
+{
+  "name": "@lab/core",
+  "version": "0.1.0",
+  "private": true,
+  "type": "module",
+  "main": "./dist/index.js",
+  "types": "./dist/index.d.ts",
+  "exports": {
+    ".": {
+      "import": "./dist/index.js",
+      "types": "./dist/index.d.ts"
+    }
+  },
+  "scripts": {
+    "build": "tsc --build",
+    "clean": "rimraf dist",
+    "test": "vitest",
+    "test:run": "vitest run"
+  },
+  "dependencies": {
+    "@pulumi/pulumi": "^3.0.0"
+  }
+}
--- a/bastion/src/core/src/audit.ts
+++ b/bastion/src/core/src/audit.ts
@@ -0,0 +1,75 @@
+// Audit event types for the labctl platform.
+// Every mutation is tracked with correlation IDs for causal chains.
+
+export type AuditEventKind =
+  | "resource_created"
+  | "resource_updated"
+  | "resource_deleted"
+  | "resource_state_change"
+  | "plan_generated"
+  | "apply_started"
+  | "apply_step"
+  | "apply_completed"
+  | "driver_translate"
+  | "driver_execute"
+  | "driver_error"
+  | "fleet_discovery"
+  | "fleet_classification"
+  | "fleet_approval"
+  | "fleet_auto_approve"
+  | "pipeline_started"
+  | "pipeline_step_started"
+  | "pipeline_step_completed"
+  | "pipeline_completed"
+  | "deploy_started"
+  | "deploy_completed"
+  | "deploy_failed"
+  | "drift_detected"
+  | "drift_corrected"
+  | "sync_triggered"
+  | "sync_completed"
+  | "auth_login"
+  | "auth_logout"
+  | "auth_bootstrap"
+  | "rbac_decision"
+  | "impersonation"
+  | "server_started"
+  | "controller_started"
+  | "agent_connected"
+  | "agent_disconnected"
+  | "bastion_registered";
+
+export type AuditSource =
+  | "cli"
+  | "labd"
+  | "agent"
+  | "driver"
+  | "fleet-controller"
+  | "sync-controller";
+
+export type AuditResult = "success" | "failure" | "denied" | "skipped";
+
+export interface AuditEvent {
+  id: string;
+  timestamp: Date;
+  eventKind: AuditEventKind;
+  source: AuditSource;
+  verified: boolean;
+
+  userId?: string;
+  userName?: string;
+  sessionId?: string;
+  environmentName?: string;
+  accountName?: string;
+
+  resourceKind?: string;
+  resourceName?: string;
+
+  correlationId: string;
+  parentEventId?: string;
+
+  details: Record<string, unknown>;
+  result: AuditResult;
+  error?: string;
+  durationMs?: number;
+}
--- a/bastion/src/core/src/auth.ts
+++ b/bastion/src/core/src/auth.ts
@@ -0,0 +1,50 @@
+// Auth types for the labctl platform.
+// Bearer token auth for CLI/SDK. mTLS stays for agent/bastion.
+
+export type UserRole = "USER" | "ADMIN";
+
+export interface User {
+  id: string;
+  email: string;
+  name?: string;
+  role: UserRole;
+  createdAt: Date;
+}
+
+export interface Session {
+  id: string;
+  userId: string;
+  token: string;
+  expiresAt: Date;
+  createdAt: Date;
+}
+
+export interface Group {
+  id: string;
+  name: string;
+  description?: string;
+}
+
+export type SubjectKind = "User" | "Group" | "ServiceAccount";
+
+export interface RoleBinding {
+  role: "view" | "edit" | "create" | "delete" | "run" | "admin";
+  resource: string;
+  name?: string;
+  environment?: string;
+  action?: string;
+}
+
+export interface RbacSubject {
+  kind: SubjectKind;
+  name: string;
+}
+
+export interface RbacDefinition {
+  id: string;
+  name: string;
+  subjects: RbacSubject[];
+  roleBindings: RoleBinding[];
+  createdAt: Date;
+  updatedAt: Date;
+}
--- a/bastion/src/core/src/environment.ts
+++ b/bastion/src/core/src/environment.ts
@@ -0,0 +1,24 @@
+// Environment and Account types.
+// An Environment is a logical boundary (production, staging, dev).
+// An Account is a configured driver instance with credentials.
+
+export interface Environment {
+  id: string;
+  name: string;
+  status: "active" | "archived";
+  createdAt: Date;
+}
+
+export interface Account {
+  id: string;
+  name: string;
+  driver: string;
+  config: Record<string, unknown>;
+  createdAt: Date;
+}
+
+export interface Binding {
+  id: string;
+  environmentId: string;
+  accountId: string;
+}
--- a/bastion/src/core/src/index.ts
+++ b/bastion/src/core/src/index.ts
@@ -0,0 +1,9 @@
+// @lab/core — foundation types for the labctl platform.
+// Phase 1 stub: resource types, auth types, audit types, Output<T>.
+// Phase 5 adds: CompositeResource, evaluator integration, full SDK.
+
+export * from "./resource.js";
+export * from "./environment.js";
+export * from "./audit.js";
+export * from "./auth.js";
+export { Output, output, all, interpolate, secret } from "./output.js";
--- a/bastion/src/core/src/output.ts
+++ b/bastion/src/core/src/output.ts
@@ -0,0 +1,5 @@
+// Re-export Pulumi's Output<T> type for use across the platform.
+// Cloud drivers use this for future values (endpoints, IPs, kubeconfigs).
+// Phase 1: type re-export only. Phase 5 adds full evaluator integration.
+
+export { Output, output, all, interpolate, secret } from "@pulumi/pulumi";
--- a/bastion/src/core/src/resource.ts
+++ b/bastion/src/core/src/resource.ts
@@ -0,0 +1,83 @@
+// Core resource types for the labctl platform.
+// Every managed thing (Server, Database, App, Cluster) is a Resource.
+
+export type ResourceOrigin = "file" | "cli" | "fleet" | "imported";
+export type ResourceManagedBy = "gitops" | "manual" | "auto";
+
+export type ResourceStatus =
+  | "pending"
+  | "creating"
+  | "ready"
+  | "updating"
+  | "deleting"
+  | "error"
+  | "unknown";
+
+export interface ResourceMetadata {
+  kind: string;
+  name: string;
+  environmentId: string;
+  accountId: string;
+  origin: ResourceOrigin;
+  managedBy: ResourceManagedBy;
+  sourceRef?: string;
+}
+
+export interface ResourceState {
+  status: ResourceStatus;
+  message?: string;
+  lastReconciled?: Date;
+  platformRef?: string;
+}
+
+export interface Resource<TSpec = Record<string, unknown>> {
+  id: string;
+  metadata: ResourceMetadata;
+  desiredSpec: TSpec;
+  actualSpec?: TSpec;
+  state: ResourceState;
+  createdAt: Date;
+  updatedAt: Date;
+}
+
+// Well-known resource kinds. Drivers register additional kinds.
+export const RESOURCE_KINDS = {
+  SERVER: "server",
+  DATABASE: "database",
+  CACHE: "cache",
+  CLUSTER: "cluster",
+  APP: "app",
+  SERVICE: "service",
+  CRONJOB: "cronjob",
+  NETWORK: "network",
+  LOADBALANCER: "loadbalancer",
+  DNSZONE: "dnszone",
+  CERTIFICATE: "certificate",
+  OBJECTSTORE: "objectstore",
+  QUEUE: "queue",
+  SECRET: "secret",
+  FLEET: "fleet",
+} as const;
+
+export type ResourceKind = (typeof RESOURCE_KINDS)[keyof typeof RESOURCE_KINDS];
+
+// Resource aliases for CLI (kubectl-style shortnames)
+export const RESOURCE_ALIASES: Record<string, string> = {
+  srv: "server",
+  db: "database",
+  cl: "cluster",
+  svc: "service",
+  cj: "cronjob",
+  lb: "loadbalancer",
+  dns: "dnszone",
+  cert: "certificate",
+  os: "objectstore",
+  mq: "queue",
+  sec: "secret",
+  fl: "fleet",
+};
+
+export function resolveResourceKind(input: string): string {
+  const lower = input.toLowerCase();
+  return RESOURCE_ALIASES[lower] ?? lower;
+}
--- a/bastion/src/core/tsconfig.json
+++ b/bastion/src/core/tsconfig.json
@@ -0,0 +1,8 @@
+{
+  "extends": "../../tsconfig.base.json",
+  "compilerOptions": {
+    "rootDir": "src",
+    "outDir": "dist"
+  },
+  "include": ["src/**/*.ts"]
+}
--- a/bastion/src/labd/package.json
+++ b/bastion/src/labd/package.json
@@ -26,8 +26,10 @@
  "dependencies": {
    "@fastify/rate-limit": "^10.3.0",
    "@fastify/websocket": "^11.0.2",
+    "@lab/core": "workspace:^",
    "@lab/shared": "workspace:*",
    "@prisma/client": "^6.9.0",
+    "bcryptjs": "^3.0.3",
    "fastify": "^5.3.3",
    "winston": "^3.17.0",
    "ws": "^8.19.0",
@@ -37,6 +39,7 @@
    "seed": "tsx prisma/seed.ts"
  },
  "devDependencies": {
+    "@types/bcryptjs": "^3.0.0",
    "@types/node": "^22.14.1",
    "@types/ws": "^8.18.1",
    "prisma": "^6.9.0",
--- a/bastion/src/labd/prisma/schema.prisma
+++ b/bastion/src/labd/prisma/schema.prisma
@@ -7,6 +7,225 @@ datasource db {
  url      = env("DATABASE_URL")
 }

+// ── Auth (mcpctl pattern: email/password + bearer token sessions) ──
+
+model User {
+  id        String   @id @default(cuid())
+  email     String   @unique
+  password  String   // bcrypt
+  name      String?
+  role      UserRole @default(USER)
+  createdAt DateTime @default(now())
+  updatedAt DateTime @updatedAt
+
+  sessions  Session[]
+  auditLogs AuditEvent[]
+  groups    GroupMember[]
+}
+
+enum UserRole {
+  USER
+  ADMIN
+}
+
+model Session {
+  id        String   @id @default(cuid())
+  userId    String
+  user      User     @relation(fields: [userId], references: [id], onDelete: Cascade)
+  token     String   @unique
+  expiresAt DateTime
+  createdAt DateTime @default(now())
+
+  @@index([userId])
+  @@index([token])
+}
+
+model Group {
+  id          String        @id @default(cuid())
+  name        String        @unique
+  description String?
+  createdAt   DateTime      @default(now())
+  members     GroupMember[]
+}
+
+model GroupMember {
+  id      String @id @default(cuid())
+  groupId String
+  group   Group  @relation(fields: [groupId], references: [id], onDelete: Cascade)
+  userId  String
+  user    User   @relation(fields: [userId], references: [id], onDelete: Cascade)
+
+  @@unique([groupId, userId])
+}
+
+model ServiceAccount {
+  id        String   @id @default(cuid())
+  name      String   @unique
+  token     String   @unique
+  createdAt DateTime @default(now())
+}
+
+// ── RBAC (mcpctl pattern: named definitions with JSON subjects/bindings) ──
+
+model RbacDefinition {
+  id           String   @id @default(cuid())
+  name         String   @unique
+  subjects     Json     // [{kind: "User"|"Group"|"ServiceAccount", name: string}]
+  roleBindings Json     // [{role, resource, name?, environment?, action?}]
+  createdAt    DateTime @default(now())
+  updatedAt    DateTime @updatedAt
+}
+
+// ── Audit (mcpctl pattern: fire-and-forget with correlation IDs) ──
+
+model AuditEvent {
+  id              String   @id @default(cuid())
+  timestamp       DateTime @default(now())
+  eventKind       String
+  source          String   // cli | labd | agent | driver | fleet-controller | sync-controller
+  verified        Boolean  @default(false)
+
+  userId          String?
+  user            User?    @relation(fields: [userId], references: [id])
+  userName        String?
+  sessionId       String?
+  environmentName String?
+  accountName     String?
+
+  resourceKind    String?
+  resourceName    String?
+
+  correlationId   String
+  parentEventId   String?
+
+  details         Json     @default("{}")
+  result          String   // success | failure | denied | skipped
+  error           String?
+  durationMs      Int?
+
+  @@index([correlationId])
+  @@index([eventKind, timestamp])
+  @@index([environmentName, timestamp])
+  @@index([resourceKind, resourceName])
+  @@index([userId, timestamp])
+}
+
+// ── Core infrastructure ──
+
+model Environment {
+  id        String   @id @default(cuid())
+  name      String   @unique
+  status    String   @default("active") // active | archived
+  createdAt DateTime @default(now())
+  updatedAt DateTime @updatedAt
+
+  bindings  Binding[]
+  resources Resource[]
+}
+
+model Account {
+  id        String   @id @default(cuid())
+  name      String   @unique
+  driver    String   // baremetal-pxe | aws | gcp | kubernetes | ovh
+  config    Json     @default("{}")
+  // Credentials stored in Infisical, referenced by secretPath
+  secretPath String?
+  createdAt DateTime @default(now())
+  updatedAt DateTime @updatedAt
+
+  bindings  Binding[]
+  resources Resource[]
+}
+
+model Binding {
+  id            String      @id @default(cuid())
+  environmentId String
+  environment   Environment @relation(fields: [environmentId], references: [id], onDelete: Cascade)
+  accountId     String
+  account       Account     @relation(fields: [accountId], references: [id], onDelete: Cascade)
+
+  @@unique([environmentId, accountId])
+}
+
+model Resource {
+  id             String       @id @default(cuid())
+  kind           String
+  name           String
+  environmentId  String
+  environment    Environment  @relation(fields: [environmentId], references: [id])
+  accountId      String
+  account        Account      @relation(fields: [accountId], references: [id])
+  origin         String       @default("cli")    // file | cli | fleet | imported
+  managedBy      String       @default("manual")  // gitops | manual | auto
+  sourceRef      String?
+  desiredSpec    Json         @default("{}")
+  actualSpec     Json?
+  platformRef    String?
+  status         String       @default("pending") // pending | creating | ready | updating | deleting | error
+  statusMessage  String?
+  lastReconciled DateTime?
+  createdAt      DateTime     @default(now())
+  updatedAt      DateTime     @updatedAt
+
+  @@unique([kind, name, environmentId])
+  @@index([environmentId])
+  @@index([accountId])
+  @@index([kind, status])
+}
+
+model Secret {
+  id        String   @id @default(cuid())
+  name      String   @unique
+  // Encrypted data — application-layer encryption as fallback if Infisical unavailable
+  data      Json     @default("{}")
+  version   Int      @default(1)
+  createdAt DateTime @default(now())
+  updatedAt DateTime @updatedAt
+}
+
+// ── Fleet ──
+
+model Fleet {
+  id               String        @id @default(cuid())
+  name             String
+  environmentId    String
+  accountId        String
+  selector         Json          // fact-matching rules
+  onboardPipeline  Json          // step definitions
+  offboardPipeline Json?
+  approvalConfig   Json?
+  status           String        @default("active")
+  createdAt        DateTime      @default(now())
+  updatedAt        DateTime      @updatedAt
+
+  members          FleetMember[]
+}
+
+model FleetMember {
+  id       String   @id @default(cuid())
+  fleetId  String
+  fleet    Fleet    @relation(fields: [fleetId], references: [id], onDelete: Cascade)
+  serverId String
+  status   String   // discovered | pending | onboarding | active | offboarding | removed
+  joinedAt DateTime @default(now())
+
+  @@index([fleetId])
+}
+
+// ── Git sources (for sync controller) ──
+
+model GitSource {
+  id       String    @id @default(cuid())
+  name     String    @unique
+  repo     String
+  branch   String    @default("main")
+  path     String    @default("environments/")
+  lastSync DateTime?
+  createdAt DateTime @default(now())
+}
+
+// ── Existing v1.0 models (kept for bastion/agent compatibility) ──
+
 model Server {
  id            String    @id @default(uuid())
  hostname      String    @unique
@@ -17,13 +236,12 @@ model Server {
  labels        Json      @default("{}")
  ip            String?
  agentVersion  String?
-  status        String   @default("unknown") // unknown, online, offline, provisioning
+  status        String    @default("unknown")
  lastHeartbeat DateTime?
  createdAt     DateTime  @default(now())
  updatedAt     DateTime  @updatedAt

  agent     Agent?
-  auditLogs     AuditLog[]
 }

 model Agent {
@@ -33,112 +251,29 @@ model Agent {
  certificatePem String?
  enrolledAt     DateTime @default(now())
  lastSeen       DateTime?
+  facts          Json?    // hardware facts reported by agent

  @@index([serverId])
 }

-model User {
-  id              String   @id @default(uuid())
-  username        String   @unique
-  displayName     String?
-  certFingerprint String?  @unique
-  createdAt       DateTime @default(now())
-  updatedAt       DateTime @updatedAt
-
-  roleBindings UserRole[]
-  auditLogs    AuditLog[]
-}
-
-model Role {
-  id          String   @id @default(uuid())
-  name        String   @unique
-  description String?
-  createdAt   DateTime @default(now())
-
-  permissions  Permission[]
-  userBindings UserRole[]
-}
-
-model Permission {
-  id          String @id @default(uuid())
-  roleId      String
-  role        Role   @relation(fields: [roleId], references: [id], onDelete: Cascade)
-  type        String @default("allow") // allow or deny
-  action      String // read, exec, apply, destroy, manage, admin, kubectl, *
-  cloud       String @default("*")
-  environment String @default("*")
-  server      String @default("*")
-
-  @@index([roleId])
-}
-
-model UserRole {
-  id     String @id @default(uuid())
-  userId String
-  user   User   @relation(fields: [userId], references: [id], onDelete: Cascade)
-  roleId String
-  role   Role   @relation(fields: [roleId], references: [id], onDelete: Cascade)
-
-  @@unique([userId, roleId])
-  @@index([userId])
-  @@index([roleId])
-}
-
 model JoinToken {
  id        String    @id @default(uuid())
  token     String    @unique
-  type      String    @default("one-time") // one-time or reusable
+  type      String    @default("one-time")
  label     String?
-  usedBy    String? // server hostname that used it
+  usedBy    String?
  usedAt    DateTime?
  revokedAt DateTime?
  createdAt DateTime  @default(now())
  expiresAt DateTime?
 }

-model AuditLog {
-  id           String   @id @default(uuid())
-  userId       String?
-  user         User?    @relation(fields: [userId], references: [id])
-  serverId     String?
-  server       Server?  @relation(fields: [serverId], references: [id])
-  sessionId    String?
-  action       String // exec, kubectl, apply, login, rbac-denied, etc.
-  resourceType String?  // server, cluster, role, app, etc.
-  resourceName String?
-  args         String? // sanitized command args
-  result       String   @default("success") // success, denied, error
-  durationMs   Int?
-  sourceIp     String?
-  timestamp    DateTime @default(now())
-
-  @@index([userId])
-  @@index([serverId])
-  @@index([sessionId])
-  @@index([timestamp])
-  @@index([action])
-}
-
-model PulumiRun {
-  id          String    @id @default(uuid())
-  userId      String
-  stackName   String
-  action      String // up, preview, destroy
-  status      String    @default("pending") // pending, running, succeeded, failed
-  output      String?
-  startedAt   DateTime  @default(now())
-  completedAt DateTime?
-
-  @@index([userId])
-  @@index([stackName])
-}
-
 model Bastion {
  id            String    @id @default(uuid())
  hostname      String    @unique
  network       String
  serverIp      String
-  status        String    @default("offline") // online, offline
+  status        String    @default("offline")
  lastHeartbeat DateTime?
  createdAt     DateTime  @default(now())
  updatedAt     DateTime  @updatedAt
@@ -149,7 +284,7 @@ model Cluster {
  name          String   @unique
  cloud         String   @default("baremetal")
  environment   String   @default("default")
-  kubeconfigEnc String? // encrypted kubeconfig
+  kubeconfigEnc String?
  labels        Json     @default("{}")
  createdAt     DateTime @default(now())
  updatedAt     DateTime @updatedAt
--- a/bastion/src/labd/src/middleware/bearer-auth.ts
+++ b/bastion/src/labd/src/middleware/bearer-auth.ts
@@ -0,0 +1,65 @@
+// Bearer token auth middleware for Fastify.
+// Validates Authorization header, resolves user identity, attaches to request.
+
+import type { FastifyRequest, FastifyReply } from "fastify";
+import type { AuthService } from "../services/auth.js";
+
+declare module "fastify" {
+  interface FastifyRequest {
+    userId?: string;
+    userEmail?: string;
+    userRole?: string;
+  }
+}
+
+// Paths that don't require authentication
+const PUBLIC_PATHS = new Set([
+  "/health",
+  "/api/auth/login",
+  "/ws/bastion",
+  "/ws/agent",
+  "/api/auth/enroll",
+]);
+
+export function createBearerAuthMiddleware(authService: AuthService) {
+  return async function bearerAuth(
+    request: FastifyRequest,
+    reply: FastifyReply,
+  ): Promise<void> {
+    // Skip auth for public paths
+    if (PUBLIC_PATHS.has(request.url.split("?")[0] ?? "")) {
+      return;
+    }
+
+    // Skip auth for WebSocket upgrade requests (handled by their own auth)
+    if (request.headers.upgrade === "websocket") {
+      return;
+    }
+
+    const authHeader = request.headers.authorization;
+    if (!authHeader) {
+      void reply.code(401).send({ error: "Authorization header required" });
+      return;
+    }
+
+    if (!authHeader.startsWith("Bearer ")) {
+      void reply.code(401).send({ error: "Invalid authorization format, expected: Bearer <token>" });
+      return;
+    }
+
+    const token = authHeader.slice(7);
+    if (token.length === 0) {
+      void reply.code(401).send({ error: "Empty bearer token" });
+      return;
+    }
+
+    try {
+      const identity = await authService.validateToken(token);
+      request.userId = identity.userId;
+      request.userEmail = identity.email;
+      request.userRole = identity.role;
+    } catch {
+      void reply.code(401).send({ error: "Invalid or expired token. Run: labctl login" });
+    }
+  };
+}
--- a/bastion/src/labd/src/routes/bastions.ts
+++ b/bastion/src/labd/src/routes/bastions.ts
@@ -84,7 +84,6 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void
  app.get("/api/machines", async () => {
    const live = bastionRegistry.getAggregatedState();

-    // Merge DB records for machines not currently in any bastion's live state
    try {
      const dbServers = (await db.server.findMany({})) as Array<{
        mac: string | null; hostname: string; role: string; ip: string | null;
@@ -93,9 +92,49 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void
      for (const s of dbServers) {
        if (!s.mac) continue;
        const mac = s.mac.toLowerCase();
-        // Only add from DB if not already in live state
+
+        // DB knows this machine has been installed at some point if it has a real
+        // hostname+role (not just product-name-as-hostname and role="unknown").
+        // Status alone is unreliable: a rediscovery can re-set it without erasing the
+        // install identity. If the bastion restarted and lost its installed map, the
+        // machine will only show up in live.discovered — promote it here so the CLI
+        // still sees hostname/role/IP.
+        const dbKnowsInstalled =
+          s.role !== "unknown" && s.role !== "" &&
+          s.hostname !== "" && s.hostname !== s.mac;
+
+        if (dbKnowsInstalled && !(mac in live.installed) && !(mac in live.install_queue)) {
+          const hw = live.discovered[mac];
+          live.installed[mac] = {
+            hostname: s.hostname,
+            role: s.role,
+            ip: s.ip ?? "",
+            installed_at: "",
+            bastionId: hw?.bastionId ?? "db",
+            ...(hw ? {
+              product: hw.product,
+              manufacturer: hw.manufacturer,
+              cpu_model: hw.cpu_model,
+              cpu_cores: hw.cpu_cores,
+              memory_gb: hw.memory_gb,
+              arch: hw.arch,
+            } : {}),
+          };
+          delete live.discovered[mac];
+          continue;
+        }
+
+        // Unknown-to-live MAC: fall back to whatever the DB says.
        if (!(mac in live.discovered) && !(mac in live.install_queue) && !(mac in live.installed)) {
-          if (s.status === "discovered") {
+          if (s.status === "online" || s.status === "offline") {
+            live.installed[mac] = {
+              hostname: s.hostname,
+              role: s.role,
+              ip: s.ip ?? "",
+              installed_at: "",
+              bastionId: "db",
+            };
+          } else {
            live.discovered[mac] = {
              mac,
              product: String(s.labels?.product ?? "unknown"),
@@ -112,14 +151,6 @@ export function registerBastionRoutes(app: FastifyInstance, db: DbClient): void
              last_seen: "",
              bastionId: "db",
            };
-          } else if (s.status === "online" || s.status === "offline") {
-            live.installed[mac] = {
-              hostname: s.hostname,
-              role: s.role,
-              ip: s.ip ?? "",
-              installed_at: "",
-              bastionId: "db",
-            };
          }
        }
      }
--- a/bastion/src/labd/src/routes/environments.ts
+++ b/bastion/src/labd/src/routes/environments.ts
@@ -0,0 +1,191 @@
+// Environment and Account management routes.
+// GET/POST /api/environments       — list/create environments
+// GET/POST /api/accounts           — list/create accounts
+// POST     /api/accounts/bind      — bind account to environment
+// GET      /api/bindings           — list bindings
+
+import type { FastifyInstance } from "fastify";
+import type { PrismaClient, Prisma } from "@prisma/client";
+import type { RbacService } from "../services/rbac.js";
+import type { AuditService } from "../services/audit.js";
+
+export function registerEnvironmentRoutes(
+  app: FastifyInstance,
+  db: PrismaClient,
+  rbacService: RbacService,
+  auditService: AuditService,
+): void {
+  // List environments
+  app.get("/api/environments", async (_request, reply) => {
+    const envs = await db.environment.findMany({ orderBy: { name: "asc" } });
+    return reply.send(envs);
+  });
+
+  // Create environment
+  app.post<{
+    Body: { name?: string };
+  }>("/api/environments", async (request, reply) => {
+    const { name } = request.body ?? {};
+    if (!name) {
+      return reply.code(400).send({ error: "name is required" });
+    }
+
+    const rbac = await rbacService.check({
+      userId: request.userId!,
+      userEmail: request.userEmail!,
+      userRole: request.userRole!,
+      action: "admin",
+      resource: "environments",
+    });
+    if (!rbac.allowed) {
+      return reply.code(403).send({ error: rbac.reason });
+    }
+
+    try {
+      const env = await db.environment.create({ data: { name } });
+      auditService.emit({
+        eventKind: "resource_created",
+        source: "labd",
+        verified: true,
+        userId: request.userId ?? null,
+        resourceKind: "environment",
+        resourceName: name,
+        result: "success",
+      });
+      return reply.code(201).send(env);
+    } catch (err) {
+      if (err instanceof Error && err.message.includes("Unique constraint")) {
+        return reply.code(409).send({ error: `Environment '${name}' already exists` });
+      }
+      throw err;
+    }
+  });
+
+  // List accounts
+  app.get("/api/accounts", async (_request, reply) => {
+    const accounts = await db.account.findMany({
+      orderBy: { name: "asc" },
+      select: { id: true, name: true, driver: true, config: true, createdAt: true, updatedAt: true },
+    });
+    return reply.send(accounts);
+  });
+
+  // Create account
+  app.post<{
+    Body: { name?: string; driver?: string; config?: Record<string, unknown> };
+  }>("/api/accounts", async (request, reply) => {
+    const { name, driver, config } = request.body ?? {};
+    if (!name || !driver) {
+      return reply.code(400).send({ error: "name and driver are required" });
+    }
+
+    const rbac = await rbacService.check({
+      userId: request.userId!,
+      userEmail: request.userEmail!,
+      userRole: request.userRole!,
+      action: "admin",
+      resource: "accounts",
+    });
+    if (!rbac.allowed) {
+      return reply.code(403).send({ error: rbac.reason });
+    }
+
+    try {
+      const account = await db.account.create({
+        data: { name, driver, config: (config ?? {}) as Prisma.InputJsonValue },
+      });
+      auditService.emit({
+        eventKind: "resource_created",
+        source: "labd",
+        verified: true,
+        userId: request.userId ?? null,
+        resourceKind: "account",
+        resourceName: name,
+        result: "success",
+        details: { driver },
+      });
+      return reply.code(201).send(account);
+    } catch (err) {
+      if (err instanceof Error && err.message.includes("Unique constraint")) {
+        return reply.code(409).send({ error: `Account '${name}' already exists` });
+      }
+      throw err;
+    }
+  });
+
+  // Bind account to environment
+  app.post<{
+    Body: { environmentId?: string; accountId?: string };
+  }>("/api/accounts/bind", async (request, reply) => {
+    const { environmentId, accountId } = request.body ?? {};
+    if (!environmentId || !accountId) {
+      return reply.code(400).send({ error: "environmentId and accountId are required" });
+    }
+
+    const rbac = await rbacService.check({
+      userId: request.userId!,
+      userEmail: request.userEmail!,
+      userRole: request.userRole!,
+      action: "admin",
+      resource: "accounts",
+    });
+    if (!rbac.allowed) {
+      return reply.code(403).send({ error: rbac.reason });
+    }
+
+    try {
+      const binding = await db.binding.create({
+        data: { environmentId, accountId },
+      });
+      return reply.code(201).send(binding);
+    } catch (err) {
+      if (err instanceof Error && err.message.includes("Unique constraint")) {
+        return reply.code(409).send({ error: "This account is already bound to this environment" });
+      }
+      throw err;
+    }
+  });
+
+  // List bindings
+  app.get("/api/bindings", async (_request, reply) => {
+    const bindings = await db.binding.findMany({
+      include: { environment: true, account: true },
+    });
+    return reply.send(bindings);
+  });
+
+  // Audit event query
+  app.get<{
+    Querystring: {
+      last?: string;
+      kind?: string;
+      env?: string;
+      correlation?: string;
+      limit?: string;
+    };
+  }>("/api/events", async (request, reply) => {
+    const { last, kind, env, correlation, limit } = request.query as { last?: string; kind?: string; env?: string; correlation?: string; limit?: string };
+
+    const where: Record<string, unknown> = {};
+
+    if (last) {
+      const match = last.match(/^(\d+)(h|d|m)$/);
+      if (match) {
+        const [, num, unit] = match;
+        const ms = { h: 3_600_000, d: 86_400_000, m: 60_000 }[unit!]!;
+        where.timestamp = { gte: new Date(Date.now() - parseInt(num!) * ms) };
+      }
+    }
+    if (kind) where.eventKind = kind;
+    if (env) where.environmentName = env;
+    if (correlation) where.correlationId = correlation;
+
+    const events = await db.auditEvent.findMany({
+      where,
+      orderBy: { timestamp: "desc" },
+      take: Math.min(parseInt(limit ?? "100"), 500),
+    });
+
+    return reply.send(events);
+  });
+}
--- a/bastion/src/labd/src/routes/resources.ts
+++ b/bastion/src/labd/src/routes/resources.ts
@@ -0,0 +1,196 @@
+// Resource CRUD routes with RBAC enforcement.
+// GET    /api/resources          — list (filtered by RBAC scope)
+// GET    /api/resources/:id      — get
+// POST   /api/resources          — create
+// PUT    /api/resources/:id      — update
+// DELETE /api/resources/:id      — delete (marks as deleting)
+
+import type { FastifyInstance } from "fastify";
+import type { ResourceStore, CreateResourceInput } from "../services/resource-store.js";
+import type { RbacService } from "../services/rbac.js";
+import type { AuditService } from "../services/audit.js";
+import { resolveResourceKind } from "@lab/core";
+
+export function registerResourceRoutes(
+  app: FastifyInstance,
+  resourceStore: ResourceStore,
+  rbacService: RbacService,
+  auditService: AuditService,
+): void {
+  // List resources (filtered by kind, environment, status)
+  app.get<{
+    Querystring: { kind?: string; environment?: string; status?: string };
+  }>("/api/resources", async (request, reply) => {
+    const rbac = await rbacService.check({
+      userId: request.userId!,
+      userEmail: request.userEmail!,
+      userRole: request.userRole!,
+      action: "view",
+      resource: request.query.kind ? resolveResourceKind(request.query.kind) : undefined,
+    });
+
+    if (!rbac.allowed) {
+      return reply.code(403).send({ error: rbac.reason });
+    }
+
+    const resources = await resourceStore.list({
+      kind: request.query.kind ? resolveResourceKind(request.query.kind) : undefined,
+      environmentId: request.query.environment,
+      status: request.query.status,
+    });
+
+    return reply.send(resources);
+  });
+
+  // Get single resource
+  app.get<{
+    Params: { id: string };
+  }>("/api/resources/:id", async (request, reply) => {
+    const resource = await resourceStore.get(request.params.id);
+    if (!resource) {
+      return reply.code(404).send({ error: "Resource not found" });
+    }
+
+    const rbac = await rbacService.check({
+      userId: request.userId!,
+      userEmail: request.userEmail!,
+      userRole: request.userRole!,
+      action: "view",
+      resource: resource.kind,
+      name: resource.name,
+    });
+
+    if (!rbac.allowed) {
+      return reply.code(403).send({ error: rbac.reason });
+    }
+
+    return reply.send(resource);
+  });
+
+  // Create resource
+  app.post<{
+    Body: CreateResourceInput;
+  }>("/api/resources", async (request, reply) => {
+    const input = request.body;
+    if (!input?.kind || !input?.name || !input?.environmentId || !input?.accountId) {
+      return reply.code(400).send({ error: "kind, name, environmentId, and accountId are required" });
+    }
+
+    const kind = resolveResourceKind(input.kind);
+
+    const rbac = await rbacService.check({
+      userId: request.userId!,
+      userEmail: request.userEmail!,
+      userRole: request.userRole!,
+      action: "create",
+      resource: kind,
+    });
+
+    if (!rbac.allowed) {
+      return reply.code(403).send({ error: rbac.reason });
+    }
+
+    const correlationId = auditService.createCorrelation();
+
+    try {
+      const resource = await resourceStore.create({ ...input, kind });
+
+      auditService.emit({
+        eventKind: "resource_created",
+        source: "labd",
+        verified: true,
+        userId: request.userId ?? null,
+        userName: request.userEmail ?? null,
+        resourceKind: kind,
+        resourceName: input.name,
+        correlationId,
+        result: "success",
+      });
+
+      return reply.code(201).send(resource);
+    } catch (err) {
+      // Prisma unique constraint violation
+      if (err instanceof Error && err.message.includes("Unique constraint")) {
+        return reply.code(409).send({ error: `Resource ${kind}/${input.name} already exists in this environment` });
+      }
+      throw err;
+    }
+  });
+
+  // Update resource
+  app.put<{
+    Params: { id: string };
+    Body: { desiredSpec?: Record<string, unknown>; status?: string };
+  }>("/api/resources/:id", async (request, reply) => {
+    const resource = await resourceStore.get(request.params.id);
+    if (!resource) {
+      return reply.code(404).send({ error: "Resource not found" });
+    }
+
+    const rbac = await rbacService.check({
+      userId: request.userId!,
+      userEmail: request.userEmail!,
+      userRole: request.userRole!,
+      action: "edit",
+      resource: resource.kind,
+      name: resource.name,
+    });
+
+    if (!rbac.allowed) {
+      return reply.code(403).send({ error: rbac.reason });
+    }
+
+    const updated = await resourceStore.update(request.params.id, request.body);
+
+    auditService.emit({
+      eventKind: "resource_updated",
+      source: "labd",
+      verified: true,
+      userId: request.userId ?? null,
+      userName: request.userEmail ?? null,
+      resourceKind: resource.kind,
+      resourceName: resource.name,
+      result: "success",
+    });
+
+    return reply.send(updated);
+  });
+
+  // Delete resource (marks as deleting)
+  app.delete<{
+    Params: { id: string };
+  }>("/api/resources/:id", async (request, reply) => {
+    const resource = await resourceStore.get(request.params.id);
+    if (!resource) {
+      return reply.code(404).send({ error: "Resource not found" });
+    }
+
+    const rbac = await rbacService.check({
+      userId: request.userId!,
+      userEmail: request.userEmail!,
+      userRole: request.userRole!,
+      action: "delete",
+      resource: resource.kind,
+      name: resource.name,
+    });
+
+    if (!rbac.allowed) {
+      return reply.code(403).send({ error: rbac.reason });
+    }
+
+    await resourceStore.delete(request.params.id);
+
+    auditService.emit({
+      eventKind: "resource_deleted",
+      source: "labd",
+      verified: true,
+      userId: request.userId ?? null,
+      userName: request.userEmail ?? null,
+      resourceKind: resource.kind,
+      resourceName: resource.name,
+      result: "success",
+    });
+
+    return reply.send({ status: "deleting", id: request.params.id });
+  });
+}
--- a/bastion/src/labd/src/routes/v2-auth.ts
+++ b/bastion/src/labd/src/routes/v2-auth.ts
@@ -0,0 +1,81 @@
+// v2 Auth routes: bearer token login/logout.
+// POST /api/auth/login  — email + password → session token
+// POST /api/auth/logout — revoke session
+
+import type { FastifyInstance } from "fastify";
+import type { AuthService } from "../services/auth.js";
+import type { AuditService } from "../services/audit.js";
+import { AuthError } from "../services/auth.js";
+
+export function registerV2AuthRoutes(
+  app: FastifyInstance,
+  authService: AuthService,
+  auditService: AuditService,
+): void {
+  app.post<{
+    Body: { email?: string; password?: string };
+  }>("/api/auth/login", async (request, reply) => {
+    const { email, password } = request.body ?? {};
+
+    if (!email || !password) {
+      return reply.code(400).send({ error: "email and password are required" });
+    }
+
+    try {
+      const result = await authService.login(email, password);
+
+      auditService.emit({
+        eventKind: result.isBootstrap ? "auth_bootstrap" : "auth_login",
+        source: "labd",
+        verified: true,
+        userId: result.userId,
+        userName: email,
+        result: "success",
+        details: { isBootstrap: result.isBootstrap },
+      });
+
+      return reply.send({
+        token: result.token,
+        expiresAt: result.expiresAt.toISOString(),
+        isBootstrap: result.isBootstrap,
+      });
+    } catch (err) {
+      if (err instanceof AuthError) {
+        auditService.emit({
+          eventKind: "auth_login",
+          source: "labd",
+          verified: true,
+          userName: email,
+          result: "failure",
+          error: err.message,
+        });
+        return reply.code(401).send({ error: err.message });
+      }
+      return reply.code(500).send({ error: "Login failed" });
+    }
+  });
+
+  app.post("/api/auth/logout", async (request, reply) => {
+    const token = request.headers.authorization?.slice(7);
+    if (!token) {
+      return reply.code(400).send({ error: "Authorization header required" });
+    }
+
+    try {
+      await authService.logout(token);
+      auditService.emit({
+        eventKind: "auth_logout",
+        source: "labd",
+        verified: true,
+        userId: request.userId ?? null,
+        result: "success",
+      });
+      return reply.send({ status: "logged_out" });
+    } catch (err) {
+      if (err instanceof AuthError) {
+        return reply.code(400).send({ error: err.message });
+      }
+      return reply.code(500).send({ error: "Logout failed" });
+    }
+  });
+}
--- a/bastion/src/labd/src/server.ts
+++ b/bastion/src/labd/src/server.ts
@@ -192,7 +192,9 @@ export async function createApp(_config: LabdConfig, db: DbClient): Promise<{
                          labels: { cpu: hw.cpu_model, cores: hw.cpu_cores, memory_gb: hw.memory_gb, arch: hw.arch, product: hw.product, manufacturer: hw.manufacturer },
                        },
                        update: {
-                          status: "discovered",
+                          // Leave status alone — a previously "online"/"offline" record
+                          // must not be downgraded to "discovered" just because the bastion
+                          // restarted and re-discovered the MAC via DHCP/PXE.
                          lastHeartbeat: new Date(),
                          labels: { cpu: hw.cpu_model, cores: hw.cpu_cores, memory_gb: hw.memory_gb, arch: hw.arch, product: hw.product, manufacturer: hw.manufacturer },
                        },
--- a/bastion/src/labd/src/services/audit.ts
+++ b/bastion/src/labd/src/services/audit.ts
@@ -0,0 +1,100 @@
+// Audit service: fire-and-forget event collection with batching.
+// Batches 50 events or flushes every 5 seconds, whichever comes first.
+// Failures never block the operation being audited.
+
+import { randomBytes } from "node:crypto";
+import type { PrismaClient, Prisma } from "@prisma/client";
+import { logger } from "./logger.js";
+
+const BATCH_SIZE = 50;
+const FLUSH_INTERVAL_MS = 5_000;
+
+export interface AuditEventInput {
+  eventKind: string;
+  source: string;
+  verified?: boolean;
+  userId?: string | null;
+  userName?: string | null;
+  sessionId?: string | null;
+  environmentName?: string | null;
+  accountName?: string | null;
+  resourceKind?: string | null;
+  resourceName?: string | null;
+  correlationId?: string | null;
+  parentEventId?: string | null;
+  details?: Record<string, unknown>;
+  result: string;
+  error?: string | null;
+  durationMs?: number | null;
+}
+
+export class AuditService {
+  private batch: AuditEventInput[] = [];
+  private timer: ReturnType<typeof setInterval> | null = null;
+
+  constructor(private readonly db: PrismaClient) {}
+
+  start(): void {
+    this.timer = setInterval(() => {
+      void this.flush();
+    }, FLUSH_INTERVAL_MS);
+  }
+
+  stop(): void {
+    if (this.timer) {
+      clearInterval(this.timer);
+      this.timer = null;
+    }
+    void this.flush();
+  }
+
+  emit(event: AuditEventInput): void {
+    // Generate correlation ID if not provided
+    if (!event.correlationId) {
+      event.correlationId = `corr_${randomBytes(8).toString("hex")}`;
+    }
+
+    this.batch.push(event);
+
+    if (this.batch.length >= BATCH_SIZE) {
+      void this.flush();
+    }
+  }
+
+  /** Create a correlation context for a chain of related events. */
+  createCorrelation(): string {
+    return `corr_${randomBytes(8).toString("hex")}`;
+  }
+
+  private async flush(): Promise<void> {
+    if (this.batch.length === 0) return;
+
+    const events = this.batch.splice(0);
+    try {
+      await this.db.auditEvent.createMany({
+        data: events.map((e) => ({
+          eventKind: e.eventKind,
+          source: e.source,
+          verified: e.verified ?? false,
+          userId: e.userId ?? null,
+          userName: e.userName ?? null,
+          sessionId: e.sessionId ?? null,
+          environmentName: e.environmentName ?? null,
+          accountName: e.accountName ?? null,
+          resourceKind: e.resourceKind ?? null,
+          resourceName: e.resourceName ?? null,
+          correlationId: e.correlationId ?? `corr_${randomBytes(8).toString("hex")}`,
+          parentEventId: e.parentEventId ?? null,
+          details: (e.details ?? {}) as Prisma.InputJsonValue,
+          result: e.result,
+          error: e.error ?? null,
+          durationMs: e.durationMs ?? null,
+        })),
+      });
+      logger.info(`AUDIT: flushed ${events.length} events`);
+    } catch (err) {
+      // Fire-and-forget: audit failures never block operations
+      logger.warn(`AUDIT: failed to flush ${events.length} events: ${err instanceof Error ? err.message : String(err)}`);
+    }
+  }
+}
--- a/bastion/src/labd/src/services/auth.ts
+++ b/bastion/src/labd/src/services/auth.ts
@@ -0,0 +1,119 @@
+// Auth service: bearer token authentication with bootstrap flow.
+// First login creates the admin user. Subsequent logins return session tokens.
+
+import { randomBytes } from "node:crypto";
+import bcrypt from "bcryptjs";
+import type { PrismaClient } from "@prisma/client";
+import { logger } from "./logger.js";
+
+const SESSION_EXPIRY_DAYS = 30;
+const BCRYPT_ROUNDS = 12;
+
+export interface LoginResult {
+  token: string;
+  expiresAt: Date;
+  userId: string;
+  isBootstrap: boolean;
+}
+
+export class AuthService {
+  constructor(private readonly db: PrismaClient) {}
+
+  async login(email: string, password: string): Promise<LoginResult> {
+    const userCount = await this.db.user.count();
+
+    // Bootstrap: first login creates admin user
+    if (userCount === 0) {
+      return this.bootstrap(email, password);
+    }
+
+    const user = await this.db.user.findUnique({ where: { email } });
+    if (!user) {
+      // Same error for unknown user and wrong password (no enumeration)
+      throw new AuthError("Invalid email or password");
+    }
+
+    const valid = await bcrypt.compare(password, user.password);
+    if (!valid) {
+      throw new AuthError("Invalid email or password");
+    }
+
+    const session = await this.createSession(user.id);
+    logger.info(`AUTH LOGIN: ${email} (${user.id.slice(0, 8)}...)`);
+
+    return {
+      token: session.token,
+      expiresAt: session.expiresAt,
+      userId: user.id,
+      isBootstrap: false,
+    };
+  }
+
+  async logout(token: string): Promise<void> {
+    const session = await this.db.session.findUnique({ where: { token } });
+    if (!session) {
+      throw new AuthError("Invalid session");
+    }
+    await this.db.session.delete({ where: { id: session.id } });
+    logger.info(`AUTH LOGOUT: session ${session.id.slice(0, 8)}...`);
+  }
+
+  async validateToken(token: string): Promise<{ userId: string; email: string; role: string }> {
+    const session = await this.db.session.findUnique({
+      where: { token },
+      include: { user: true },
+    });
+
+    if (!session) {
+      throw new AuthError("Invalid token");
+    }
+    if (session.expiresAt < new Date()) {
+      await this.db.session.delete({ where: { id: session.id } });
+      throw new AuthError("Token expired");
+    }
+
+    return {
+      userId: session.user.id,
+      email: session.user.email,
+      role: session.user.role,
+    };
+  }
+
+  private async bootstrap(email: string, password: string): Promise<LoginResult> {
+    const hashed = await bcrypt.hash(password, BCRYPT_ROUNDS);
+    const user = await this.db.user.create({
+      data: {
+        email,
+        password: hashed,
+        role: "ADMIN",
+        name: email.split("@")[0] ?? null,
+      },
+    });
+
+    const session = await this.createSession(user.id);
+    logger.info(`AUTH BOOTSTRAP: created admin user ${email} (${user.id.slice(0, 8)}...)`);
+
+    return {
+      token: session.token,
+      expiresAt: session.expiresAt,
+      userId: user.id,
+      isBootstrap: true,
+    };
+  }
+
+  private async createSession(userId: string) {
+    const token = randomBytes(32).toString("hex");
+    const expiresAt = new Date(Date.now() + SESSION_EXPIRY_DAYS * 24 * 60 * 60 * 1000);
+
+    return this.db.session.create({
+      data: { userId, token, expiresAt },
+    });
+  }
+}
+
+export class AuthError extends Error {
+  constructor(message: string) {
+    super(message);
+    this.name = "AuthError";
+  }
+}
--- a/bastion/src/labd/src/services/rbac.ts
+++ b/bastion/src/labd/src/services/rbac.ts
@@ -0,0 +1,123 @@
+// RBAC service: environment-scoped permission checks.
+// Uses named RbacDefinition records with JSON subjects and roleBindings.
+//
+// Resolution flow:
+//   1. Find all RbacDefinitions where subjects match the current user/groups
+//   2. Collect all roleBindings from matching definitions
+//   3. Check if any binding grants the requested action on the requested resource
+
+import type { PrismaClient } from "@prisma/client";
+import { logger } from "./logger.js";
+
+export interface RbacCheck {
+  userId: string;
+  userEmail: string;
+  userRole: string;
+  action: string;       // "view" | "edit" | "create" | "delete" | "run" | "admin"
+  resource?: string | undefined;    // "servers" | "databases" | "clusters" | "*"
+  name?: string | undefined;        // specific resource name
+  environment?: string | undefined; // specific environment name
+}
+
+export interface RbacResult {
+  allowed: boolean;
+  reason: string;
+  matchedDefinition?: string;
+}
+
+interface StoredSubject {
+  kind: string;
+  name: string;
+}
+
+interface StoredBinding {
+  role: string;
+  resource?: string;
+  name?: string;
+  environment?: string;
+  action?: string;
+}
+
+export class RbacService {
+  constructor(private readonly db: PrismaClient) {}
+
+  async check(req: RbacCheck): Promise<RbacResult> {
+    // Admin users bypass RBAC
+    if (req.userRole === "ADMIN") {
+      return { allowed: true, reason: "admin role" };
+    }
+
+    // Collect user's group memberships
+    const memberships = await this.db.groupMember.findMany({
+      where: { userId: req.userId },
+      include: { group: true },
+    });
+    const groupNames = memberships.map((m) => m.group.name);
+
+    // Find all RBAC definitions
+    const definitions = await this.db.rbacDefinition.findMany();
+
+    for (const def of definitions) {
+      const subjects = def.subjects as unknown as StoredSubject[];
+      const bindings = def.roleBindings as unknown as StoredBinding[];
+
+      // Check if this definition's subjects match the user
+      const subjectMatch = subjects.some((s) => {
+        if (s.kind === "User" && s.name === req.userEmail) return true;
+        if (s.kind === "Group" && groupNames.includes(s.name)) return true;
+        return false;
+      });
+
+      if (!subjectMatch) continue;
+
+      // Check if any binding grants the requested permission
+      for (const binding of bindings) {
+        if (this.bindingMatches(binding, req)) {
+          logger.info(`RBAC ALLOW: ${req.userEmail} ${req.action} ${req.resource ?? "*"}${req.name ? `/${req.name}` : ""} via ${def.name}`);
+          return {
+            allowed: true,
+            reason: `granted by ${def.name}`,
+            matchedDefinition: def.name,
+          };
+        }
+      }
+    }
+
+    logger.info(`RBAC DENY: ${req.userEmail} ${req.action} ${req.resource ?? "*"}${req.name ? `/${req.name}` : ""}`);
+    return {
+      allowed: false,
+      reason: `no matching role binding for ${req.action} on ${req.resource ?? "*"}`,
+    };
+  }
+
+  private bindingMatches(binding: StoredBinding, req: RbacCheck): boolean {
+    // Check role grants the action
+    if (!this.roleGrantsAction(binding.role, req.action)) return false;
+
+    // Check resource scope
+    if (binding.resource && binding.resource !== "*" && binding.resource !== req.resource) return false;
+
+    // Check name scope
+    if (binding.name && binding.name !== req.name) return false;
+
+    // Check environment scope
+    if (binding.environment && binding.environment !== req.environment) return false;
+
+    // Check operation scope (for "run" role with specific actions)
+    if (binding.action && binding.action !== "*" && binding.action !== req.action) return false;
+
+    return true;
+  }
+
+  private roleGrantsAction(role: string, action: string): boolean {
+    const grants: Record<string, string[]> = {
+      admin: ["view", "edit", "create", "delete", "run", "admin"],
+      edit: ["view", "edit", "create", "delete"],
+      create: ["create"],
+      delete: ["delete"],
+      view: ["view"],
+      run: ["run"],
+    };
+    return grants[role]?.includes(action) ?? false;
+  }
+}
--- a/bastion/src/labd/src/services/resource-store.ts
+++ b/bastion/src/labd/src/services/resource-store.ts
@@ -0,0 +1,108 @@
+// Resource store: CRUD for generic resources with origin/managedBy tracking.
+// All mutations go through this service so RBAC and audit are applied consistently.
+
+import type { PrismaClient, Resource as PrismaResource, Prisma } from "@prisma/client";
+import { logger } from "./logger.js";
+
+export interface CreateResourceInput {
+  kind: string;
+  name: string;
+  environmentId: string;
+  accountId: string;
+  origin?: string;
+  managedBy?: string;
+  sourceRef?: string;
+  desiredSpec: Record<string, unknown>;
+}
+
+export interface UpdateResourceInput {
+  desiredSpec?: Record<string, unknown>;
+  status?: string;
+  statusMessage?: string;
+  actualSpec?: Record<string, unknown>;
+  platformRef?: string;
+}
+
+export interface ListResourcesFilter {
+  kind?: string | undefined;
+  environmentId?: string | undefined;
+  accountId?: string | undefined;
+  status?: string | undefined;
+}
+
+export class ResourceStore {
+  constructor(private readonly db: PrismaClient) {}
+
+  async create(input: CreateResourceInput): Promise<PrismaResource> {
+    const resource = await this.db.resource.create({
+      data: {
+        kind: input.kind,
+        name: input.name,
+        environmentId: input.environmentId,
+        accountId: input.accountId,
+        origin: input.origin ?? "cli",
+        managedBy: input.managedBy ?? "manual",
+        sourceRef: input.sourceRef ?? null,
+        desiredSpec: input.desiredSpec as Prisma.InputJsonValue,
+        status: "pending",
+      },
+    });
+
+    logger.info(`RESOURCE CREATED: ${input.kind}/${input.name} in env ${input.environmentId.slice(0, 8)}...`);
+    return resource;
+  }
+
+  async get(id: string): Promise<PrismaResource | null> {
+    return this.db.resource.findUnique({ where: { id } });
+  }
+
+  async getByKindNameEnv(kind: string, name: string, environmentId: string): Promise<PrismaResource | null> {
+    return this.db.resource.findUnique({
+      where: { kind_name_environmentId: { kind, name, environmentId } },
+    });
+  }
+
+  async list(filter: ListResourcesFilter = {}): Promise<PrismaResource[]> {
+    return this.db.resource.findMany({
+      where: {
+        ...(filter.kind ? { kind: filter.kind } : {}),
+        ...(filter.environmentId ? { environmentId: filter.environmentId } : {}),
+        ...(filter.accountId ? { accountId: filter.accountId } : {}),
+        ...(filter.status ? { status: filter.status } : {}),
+      },
+      orderBy: { createdAt: "desc" },
+    });
+  }
+
+  async update(id: string, input: UpdateResourceInput): Promise<PrismaResource> {
+    const data: Prisma.ResourceUpdateInput = {};
+    if (input.desiredSpec !== undefined) data.desiredSpec = input.desiredSpec as Prisma.InputJsonValue;
+    if (input.status !== undefined) data.status = input.status;
+    if (input.statusMessage !== undefined) data.statusMessage = input.statusMessage;
+    if (input.actualSpec !== undefined) data.actualSpec = input.actualSpec as Prisma.InputJsonValue;
+    if (input.platformRef !== undefined) data.platformRef = input.platformRef;
+    if (input.status === "ready") data.lastReconciled = new Date();
+
+    const resource = await this.db.resource.update({ where: { id }, data });
+
+    logger.info(`RESOURCE UPDATED: ${resource.kind}/${resource.name} -> ${input.status ?? "spec change"}`);
+    return resource;
+  }
+
+  async delete(id: string): Promise<void> {
+    const resource = await this.db.resource.findUnique({ where: { id } });
+    if (!resource) return;
+
+    // Mark as deleting first (driver handles actual deletion)
+    await this.db.resource.update({
+      where: { id },
+      data: { status: "deleting" },
+    });
+
+    logger.info(`RESOURCE DELETING: ${resource.kind}/${resource.name}`);
+  }
+
+  async hardDelete(id: string): Promise<void> {
+    await this.db.resource.delete({ where: { id } });
+  }
+}
--- a/bastion/src/labd/tests/bastions-machines.test.ts
+++ b/bastion/src/labd/tests/bastions-machines.test.ts
@@ -0,0 +1,144 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import Fastify from "fastify";
+import { registerBastionRoutes } from "../src/routes/bastions.js";
+import { bastionRegistry } from "../src/services/bastion-registry.js";
+import type { DbClient } from "../src/server.js";
+import type { BastionState } from "@lab/shared";
+
+function createMockDb(servers: unknown[] = []): DbClient {
+  return {
+    $queryRaw: vi.fn().mockResolvedValue([{ "?column?": 1 }]),
+    server: {
+      findMany: vi.fn().mockResolvedValue(servers),
+      findUnique: vi.fn().mockResolvedValue(null),
+      upsert: vi.fn().mockResolvedValue({}),
+    },
+    joinToken: {
+      findUnique: vi.fn().mockResolvedValue(null),
+      findMany: vi.fn().mockResolvedValue([]),
+      create: vi.fn().mockResolvedValue({ id: "t" }),
+      update: vi.fn().mockResolvedValue({}),
+    },
+    bastion: {
+      upsert: vi.fn().mockResolvedValue({}),
+      findMany: vi.fn().mockResolvedValue([]),
+      findUnique: vi.fn().mockResolvedValue(null),
+      update: vi.fn().mockResolvedValue({}),
+    },
+  };
+}
+
+function registerFakeBastion(bastionId: string, state: BastionState): void {
+  bastionRegistry.register({
+    bastionId,
+    hostname: "fake",
+    network: "192.168.8.0/24",
+    serverIp: "192.168.8.11",
+    // socket is referenced only on commands, not during aggregation
+    socket: { on: () => undefined, off: () => undefined, send: () => undefined, close: () => undefined } as never,
+    connectedAt: new Date(),
+    lastHeartbeat: new Date(),
+    state,
+  });
+}
+
+describe("GET /api/machines aggregation", () => {
+  beforeEach(() => {
+    for (const b of bastionRegistry.getAll()) bastionRegistry.unregister(b.bastionId);
+  });
+
+  it("promotes a live-discovered MAC to installed when the DB has a real hostname+role for it", async () => {
+    // Simulates the worker0-k8s0 bug: bastion restarted, lost its installed map,
+    // rediscovered the machine via DHCP/PXE. DB still has hostname=worker0-k8s0,
+    // role=infra, ip=192.168.8.23. Without the fix, the CLI sees a "discovered"
+    // row with no hostname/role/IP. With the fix, the row is promoted to
+    // "installed" with full identity preserved.
+    const mac = "78:55:36:08:28:fb";
+    registerFakeBastion("b1", {
+      discovered: {
+        [mac]: {
+          mac, product: "SER", board: "SER", serial: "x", manufacturer: "AZW",
+          cpu_model: "AMD Ryzen 7 255", cpu_cores: 16, memory_gb: 58, arch: "x86_64",
+          disks: [], nics: [], first_seen: "", last_seen: "",
+        },
+      },
+      install_queue: {},
+      installed: {},
+      debug: {},
+    });
+
+    const app = Fastify({ logger: false });
+    const db = createMockDb([
+      { mac, hostname: "worker0-k8s0", role: "infra", ip: "192.168.8.23", status: "discovered", labels: {} },
+    ]);
+    registerBastionRoutes(app, db);
+
+    const res = await app.inject({ method: "GET", url: "/api/machines" });
+    expect(res.statusCode).toBe(200);
+    const body = JSON.parse(res.body);
+
+    expect(body.discovered[mac]).toBeUndefined();
+    expect(body.installed[mac]).toMatchObject({
+      hostname: "worker0-k8s0",
+      role: "infra",
+      ip: "192.168.8.23",
+      cpu_model: "AMD Ryzen 7 255",
+      cpu_cores: 16,
+      memory_gb: 58,
+    });
+
+    await app.close();
+  });
+
+  it("leaves a fresh-discovery MAC in discovered when DB only has a discovery-shaped record", async () => {
+    const mac = "aa:bb:cc:dd:ee:ff";
+    registerFakeBastion("b1", {
+      discovered: {
+        [mac]: {
+          mac, product: "SER", board: "SER", serial: "x", manufacturer: "AZW",
+          cpu_model: "AMD Ryzen 7", cpu_cores: 8, memory_gb: 32, arch: "x86_64",
+          disks: [], nics: [], first_seen: "", last_seen: "",
+        },
+      },
+      install_queue: {},
+      installed: {},
+      debug: {},
+    });
+
+    const app = Fastify({ logger: false });
+    // Matches what labd writes on first discovery: hostname=product, role="unknown"
+    const db = createMockDb([
+      { mac, hostname: "SER", role: "unknown", ip: null, status: "discovered", labels: {} },
+    ]);
+    registerBastionRoutes(app, db);
+
+    const res = await app.inject({ method: "GET", url: "/api/machines" });
+    const body = JSON.parse(res.body);
+
+    expect(body.discovered[mac]).toBeDefined();
+    expect(body.installed[mac]).toBeUndefined();
+
+    await app.close();
+  });
+
+  it("falls back to DB for MACs not in any live bucket", async () => {
+    const mac = "11:22:33:44:55:66";
+    // No bastions connected
+    const app = Fastify({ logger: false });
+    const db = createMockDb([
+      { mac, hostname: "worker1-k8s0", role: "infra", ip: "192.168.8.13", status: "online", labels: {} },
+    ]);
+    registerBastionRoutes(app, db);
+
+    const res = await app.inject({ method: "GET", url: "/api/machines" });
+    const body = JSON.parse(res.body);
+
+    expect(body.installed[mac]).toMatchObject({
+      hostname: "worker1-k8s0",
+      role: "infra",
+      ip: "192.168.8.13",
+    });
+
+    await app.close();
+  });
+});
--- a/bastion/src/modules/modules/k3s/src/groups/hardening.ts
+++ b/bastion/src/modules/modules/k3s/src/groups/hardening.ts
@@ -1,19 +1,21 @@
-// Hardening: Pod Security Standards, certificate check, log rotation.
+// Hardening: Pod Security Standards, certificate check, journald cap, storage.

 import type { OperationContext, OperationResult, OperationGroup } from "../types.js";
 import { runSequential } from "../utils.js";
 import { applyPodSecurityStandards } from "../operations/pod-security.js";
 import { checkCertExpiry } from "../operations/cert-check.js";
 import { configureLogRotation } from "../operations/log-rotation.js";
+import { configureJournaldLimits } from "../operations/journald-limits.js";
 import { configureLonghornDisk } from "../operations/longhorn-disk.js";

 export const hardeningGroup: OperationGroup = {
  name: "hardening",
-  description: "Pod security, certificate check, log rotation, storage",
+  description: "Pod security, certificate check, journald cap, storage",
  operations: [
    { name: "Apply Pod Security Standards", fn: applyPodSecurityStandards },
    { name: "Check certificate expiry", fn: checkCertExpiry },
-    { name: "Configure log rotation", fn: configureLogRotation },
+    { name: "Decommission file-based audit logs", fn: configureLogRotation },
+    { name: "Configure journald disk cap", fn: configureJournaldLimits },
    { name: "Configure Longhorn disk", fn: configureLonghornDisk },
  ],
 };
--- a/bastion/src/modules/modules/k3s/src/install.ts
+++ b/bastion/src/modules/modules/k3s/src/install.ts
@@ -76,7 +76,6 @@ sed -i 's/^SELINUX=enforcing/SELINUX=permissive/' /etc/selinux/config 2>/dev/nul
 # ── 5b. Create k3s config directory ──
 echo "[5/10] Writing k3s server configuration..."
 mkdir -p /etc/rancher/k3s
-mkdir -p /var/log/kubernetes

 cat > /etc/rancher/k3s/config.yaml << 'K3S_CONFIG'
 # k3s server configuration — CIS hardened
@@ -91,13 +90,10 @@ disable:
  - servicelb
  - traefik

-# API server hardening
+# API server hardening (audit-log-path=- routes audit to journald via stdout)
 kube-apiserver-arg:
  - "anonymous-auth=false"
-  - "audit-log-path=/var/log/kubernetes/audit.log"
-  - "audit-log-maxage=30"
-  - "audit-log-maxbackup=10"
-  - "audit-log-maxsize=100"
+  - "audit-log-path=-"
  - "audit-policy-file=/etc/rancher/k3s/audit-policy.yaml"
  - "enable-admission-plugins=NodeRestriction,PodSecurity"
  - "request-timeout=300s"
--- a/bastion/src/modules/modules/k3s/src/operations/etcd-recover.ts
+++ b/bastion/src/modules/modules/k3s/src/operations/etcd-recover.ts
@@ -0,0 +1,194 @@
+// Recover a broken etcd member by removing it from the cluster, wiping its
+// local state, and restarting k3s so it rejoins as a fresh member.
+//
+// Use case: a node panics on startup with
+//   "tocommit(N+1) is out of range [lastIndex(N)]. Was the raft log corrupted,
+//    truncated, or lost?"
+// This means the local raft WAL is missing the last entry the leader thinks
+// the follower acknowledged (lost write, unclean shutdown, etc). The fix is
+// always the same and well-documented; this codifies it so we don't fumble
+// the procedure under pressure.
+//
+// Preconditions:
+//   - At least one healthy peer is reachable so the cluster has quorum after
+//     we remove the broken member. (For a 3-node cluster: 2 healthy. For a
+//     5-node: 3 healthy.) If quorum would be lost, this function refuses.
+//   - SSH access to both the broken node and a healthy peer.
+//   - etcdctl available on the healthy peer (k3s does not bundle it; the
+//     procedure installs it on demand on Fedora).
+
+import type { SshClient } from "../types.js";
+
+const ETCD_TLS = {
+  ca: "/var/lib/rancher/k3s/server/tls/etcd/server-ca.crt",
+  cert: "/var/lib/rancher/k3s/server/tls/etcd/server-client.crt",
+  key: "/var/lib/rancher/k3s/server/tls/etcd/server-client.key",
+} as const;
+
+const SSH_TIMEOUT = 60_000;
+
+export interface RecoverEtcdMemberOptions {
+  /** SSH client for the broken node (the one panicking). */
+  broken: SshClient;
+  /** SSH client for any healthy server peer in the same cluster. */
+  peer: SshClient;
+  /** Hostname (k8s node name) of the broken node. Used to find its etcd member id. */
+  brokenHostname: string;
+  /** Logger for progress output. */
+  log?: (msg: string) => void;
+}
+
+export interface RecoverEtcdMemberResult {
+  success: boolean;
+  changed: boolean;
+  message: string;
+  /** New etcd member id assigned after rejoin (when known). */
+  newMemberId?: string;
+  /** Old etcd member id that was removed. */
+  removedMemberId?: string;
+  error?: string;
+}
+
+function etcdctl(subcmd: string): string {
+  return [
+    "ETCDCTL_API=3 etcdctl",
+    `--cacert=${ETCD_TLS.ca}`,
+    `--cert=${ETCD_TLS.cert}`,
+    `--key=${ETCD_TLS.key}`,
+    "--endpoints=https://127.0.0.1:2379",
+    "--command-timeout=10s",
+    subcmd,
+  ].join(" ");
+}
+
+async function ensureEtcdctl(peer: SshClient): Promise<void> {
+  const probe = await peer.exec("command -v etcdctl 2>/dev/null", { timeoutMs: 5_000 });
+  if (probe.exitCode === 0 && probe.stdout.trim()) return;
+  // Best-effort install on Fedora. If the host isn't dnf-based, surface the
+  // error to the caller via the next etcdctl invocation.
+  await peer.exec("dnf install -y etcd 2>&1", { timeoutMs: 120_000 });
+}
+
+async function getMemberList(peer: SshClient): Promise<Array<{ id: string; name: string }>> {
+  const result = await peer.exec(etcdctl("member list"), { timeoutMs: SSH_TIMEOUT });
+  if (result.exitCode !== 0) {
+    throw new Error(`etcdctl member list failed: ${result.stderr || result.stdout}`);
+  }
+  // Format: <hex-id>, started, <name>, <peer-urls>, <client-urls>, <isLearner>
+  return result.stdout
+    .split("\n")
+    .map((line) => line.trim())
+    .filter(Boolean)
+    .map((line) => {
+      const [id, , name] = line.split(",").map((p) => p.trim());
+      return { id: id ?? "", name: name ?? "" };
+    })
+    .filter((m) => m.id);
+}
+
+export async function recoverEtcdMember(
+  opts: RecoverEtcdMemberOptions,
+): Promise<RecoverEtcdMemberResult> {
+  const log = opts.log ?? (() => {});
+
+  try {
+    log(`Looking up etcd member id for ${opts.brokenHostname} via peer...`);
+    await ensureEtcdctl(opts.peer);
+
+    const members = await getMemberList(opts.peer);
+    if (members.length < 3) {
+      return {
+        success: false,
+        changed: false,
+        message: "Refusing to remove a member from a cluster with <3 members (quorum would be lost)",
+        error: `member count = ${members.length}`,
+      };
+    }
+
+    // Member names are <hostname>-<random-suffix>; match by hostname prefix.
+    const broken = members.find((m) => m.name.startsWith(opts.brokenHostname));
+    if (!broken) {
+      return {
+        success: false,
+        changed: false,
+        message: `No etcd member found matching hostname ${opts.brokenHostname}`,
+        error: `members: ${members.map((m) => m.name).join(", ")}`,
+      };
+    }
+    log(`Broken member: ${broken.id} (${broken.name})`);
+
+    log("Step 1/4: stopping k3s on broken node");
+    await opts.broken.exec("systemctl stop k3s 2>&1", { timeoutMs: SSH_TIMEOUT });
+
+    log("Step 2/4: removing broken etcd member from cluster");
+    const remove = await opts.peer.exec(
+      etcdctl(`member remove ${broken.id}`),
+      { timeoutMs: SSH_TIMEOUT },
+    );
+    if (remove.exitCode !== 0) {
+      return {
+        success: false,
+        changed: false,
+        message: "etcdctl member remove failed",
+        error: remove.stderr || remove.stdout,
+        removedMemberId: broken.id,
+      };
+    }
+
+    log("Step 3/4: archiving corrupt etcd state and stale TLS/cred dirs on broken node");
+    const ts = Math.floor(Date.now() / 1000);
+    await opts.broken.exec(
+      [
+        `mv /var/lib/rancher/k3s/server/db /var/lib/rancher/k3s/server/db.corrupt-${ts} 2>/dev/null || true`,
+        "rm -rf /var/lib/rancher/k3s/server/tls /var/lib/rancher/k3s/server/cred",
+      ].join(" && "),
+      { timeoutMs: SSH_TIMEOUT },
+    );
+
+    log("Step 4/4: starting k3s on broken node — it will rejoin");
+    await opts.broken.exec("systemctl start k3s 2>&1", { timeoutMs: SSH_TIMEOUT });
+
+    // Poll for rejoin. The new member-id is what the cluster assigns on join.
+    let newMemberId: string | undefined;
+    for (let i = 0; i < 60; i++) {
+      await new Promise((r) => setTimeout(r, 5_000));
+      try {
+        const after = await getMemberList(opts.peer);
+        const rejoined = after.find(
+          (m) => m.name.startsWith(opts.brokenHostname) && m.id !== broken.id,
+        );
+        if (rejoined) {
+          newMemberId = rejoined.id;
+          break;
+        }
+      } catch {
+        // peer may briefly be unreachable mid-rejoin — keep polling
+      }
+    }
+
+    if (!newMemberId) {
+      return {
+        success: false,
+        changed: true,
+        message: "k3s started but new member did not appear in cluster within 5 minutes",
+        removedMemberId: broken.id,
+      };
+    }
+
+    log(`Rejoined as ${newMemberId}`);
+    return {
+      success: true,
+      changed: true,
+      message: `Recovered: removed ${broken.id}, rejoined as ${newMemberId}`,
+      removedMemberId: broken.id,
+      newMemberId,
+    };
+  } catch (err) {
+    return {
+      success: false,
+      changed: false,
+      message: "Recovery failed",
+      error: err instanceof Error ? err.message : String(err),
+    };
+  }
+}
--- a/bastion/src/modules/modules/k3s/src/operations/index.ts
+++ b/bastion/src/modules/modules/k3s/src/operations/index.ts
@@ -11,7 +11,13 @@ export { installK3sBinary } from "./k3s-install.js";
 export { installCilium } from "./cilium.js";
 export { fixCoreDnsUpstream } from "./dns-fix.js";
 export { configureLogRotation } from "./log-rotation.js";
+export { configureJournaldLimits } from "./journald-limits.js";
 export { applyDefaultNetworkPolicies } from "./network-policy.js";
 export { applyPodSecurityStandards } from "./pod-security.js";
 export { checkCertExpiry } from "./cert-check.js";
 export { configureLonghornDisk } from "./longhorn-disk.js";
+export { recoverEtcdMember } from "./etcd-recover.js";
+export type {
+  RecoverEtcdMemberOptions,
+  RecoverEtcdMemberResult,
+} from "./etcd-recover.js";
--- a/bastion/src/modules/modules/k3s/src/operations/journald-limits.ts
+++ b/bastion/src/modules/modules/k3s/src/operations/journald-limits.ts
@@ -0,0 +1,33 @@
+// Cap journald disk usage so audit logs (which now flow through journald via
+// kube-apiserver's stdout) cannot fill /var/log. Default journald uses up to
+// 10% of the filesystem, capped at 4 GB. In a /var/log of ~10 GB shared with
+// other services, that's still room for audit volume to evict useful logs.
+// 2 GB / 200 MB-per-file is a comfortable middle.
+
+import type { Operation, OperationResult } from "../types.js";
+import { sshOpts, writeRemoteFile } from "../utils.js";
+
+const DROPIN_CONTENT = `[Journal]
+SystemMaxUse=2G
+SystemKeepFree=1G
+SystemMaxFileSize=200M
+`;
+
+const DROPIN_PATH = "/etc/systemd/journald.conf.d/10-k3s-audit-cap.conf";
+
+export const configureJournaldLimits: Operation = async (ctx): Promise<OperationResult> => {
+  const changed = await writeRemoteFile(ctx, DROPIN_PATH, DROPIN_CONTENT);
+  if (changed) {
+    // Reload journald so the new limit applies without a reboot.
+    await ctx.ssh.exec(
+      "systemctl kill --signal=SIGUSR2 systemd-journald 2>/dev/null; " +
+      "systemctl restart systemd-journald 2>&1 || true",
+      sshOpts(ctx),
+    );
+  }
+  return {
+    success: true,
+    changed,
+    message: changed ? "journald limits configured (2 GB cap)" : "journald limits already configured",
+  };
+};
--- a/bastion/src/modules/modules/k3s/src/operations/k3s-config.ts
+++ b/bastion/src/modules/modules/k3s/src/operations/k3s-config.ts
@@ -13,6 +13,12 @@ function generateServerConfig(config: K3sConfig): string {
  const clusterLines = isJoining
    ? `server: "${config.k3sServerUrl}"\ntoken: "${config.k3sToken}"`
    : "cluster-init: true";
+  // audit-log-path=- routes audit events to k3s.service's stdout, which systemd
+  // forwards to journald. journald enforces its own size caps (see
+  // configureJournaldLimits) so audit volume cannot fill the disk. File-based
+  // audit logs led to /var/log/kubernetes growing to 7+ GB because apiserver's
+  // own rotation produced files that any logrotate glob would double-rotate
+  // and never expire.
  return `# k3s server configuration — CIS hardened, etcd HA
 ${clusterLines}
 protect-kernel-defaults: true
@@ -30,10 +36,7 @@ node-label:

 kube-apiserver-arg:
  - "anonymous-auth=false"
-  - "audit-log-path=/var/log/kubernetes/audit.log"
-  - "audit-log-maxage=30"
-  - "audit-log-maxbackup=10"
-  - "audit-log-maxsize=100"
+  - "audit-log-path=-"
  - "audit-policy-file=/etc/rancher/k3s/audit-policy.yaml"
  - "enable-admission-plugins=NodeRestriction,PodSecurity"
  - "request-timeout=300s"
@@ -61,7 +64,7 @@ kubelet-arg:
 }

 export const writeK3sConfig: Operation = async (ctx): Promise<OperationResult> => {
-  await ctx.ssh.exec("mkdir -p /etc/rancher/k3s /var/log/kubernetes", sshOpts(ctx));
+  await ctx.ssh.exec("mkdir -p /etc/rancher/k3s", sshOpts(ctx));

  const content = isServerRole(ctx.config.role)
    ? generateServerConfig(ctx.config)
--- a/bastion/src/modules/modules/k3s/src/operations/log-rotation.ts
+++ b/bastion/src/modules/modules/k3s/src/operations/log-rotation.ts
@@ -1,25 +1,44 @@
-// Configure log rotation for k3s.
+// Decommission file-based k8s audit logging in favor of journald.
+//
+// Earlier versions wrote audit events to /var/log/kubernetes/audit.log and
+// rotated them with a logrotate rule. Two failure modes followed: kube-apiserver
+// rotated internally (audit-{ts}.log), the *.log glob in logrotate
+// double-rotated those (-{date}), and the resulting filename matched no
+// retention policy, so the directory grew unbounded (we observed 7+ GB).
+//
+// k3s now sets audit-log-path=- so audit goes to stdout → journald, which
+// enforces SystemMaxUse caps. This operation removes the obsolete logrotate
+// rule and reaps any audit files left behind by the old setup. Idempotent: on
+// fresh installs everything is already absent and the operation is a no-op.

 import type { Operation, OperationResult } from "../types.js";
-import { writeRemoteFile } from "../utils.js";
+import { sshOpts } from "../utils.js";

-const LOGROTATE_CONFIG = `/var/log/kubernetes/*.log {
-  daily
-  rotate 14
-  compress
-  delaycompress
-  missingok
-  notifempty
-  copytruncate
-  maxsize 100M
-}`;
+const REMOVE_LOGROTATE = "rm -f /etc/logrotate.d/k3s";
+
+// Bounded by a max-depth and explicit name pattern so we never reach outside
+// the deprecated audit-log directory.
+const REAP_OLD_AUDIT_FILES =
+  "find /var/log/kubernetes -maxdepth 1 -type f " +
+  "\\( -name 'audit*.log*' -o -name 'audit-*.log' \\) " +
+  "-delete 2>/dev/null; " +
+  "rmdir /var/log/kubernetes 2>/dev/null; true";

 export const configureLogRotation: Operation = async (ctx): Promise<OperationResult> => {
-  const changed = await writeRemoteFile(ctx, "/etc/logrotate.d/k3s", LOGROTATE_CONFIG);
+  const before = await ctx.ssh.exec(
+    "test -e /etc/logrotate.d/k3s -o -d /var/log/kubernetes && echo present || echo absent",
+    sshOpts(ctx),
+  );
+  const wasPresent = before.stdout.trim() === "present";
+
+  await ctx.ssh.exec(REMOVE_LOGROTATE, sshOpts(ctx));
+  await ctx.ssh.exec(REAP_OLD_AUDIT_FILES, sshOpts(ctx));

  return {
    success: true,
-    changed,
-    message: changed ? "Log rotation configured" : "Log rotation already configured",
+    changed: wasPresent,
+    message: wasPresent
+      ? "Removed legacy file-based audit logging (now via journald)"
+      : "No legacy audit log artifacts present",
  };
 };
--- a/bastion/src/modules/modules/k3s/tests/install.test.ts
+++ b/bastion/src/modules/modules/k3s/tests/install.test.ts
@@ -71,9 +71,14 @@ describe("k3s install script — server role", () => {
    expect(script).toContain("enable-admission-plugins=NodeRestriction,PodSecurity");
  });

-  it("configures audit logging", () => {
-    expect(script).toContain("audit-log-path=/var/log/kubernetes/audit.log");
-    expect(script).toContain("audit-log-maxage=30");
+  it("configures audit logging via journald (stdout)", () => {
+    expect(script).toContain("audit-log-path=-");
+    // file-based fields and the now-obsolete log directory must be gone
+    expect(script).not.toContain("/var/log/kubernetes/audit.log");
+    expect(script).not.toContain("audit-log-maxage");
+    expect(script).not.toContain("audit-log-maxbackup");
+    expect(script).not.toContain("audit-log-maxsize");
+    expect(script).not.toContain("mkdir -p /var/log/kubernetes");
  });

  it("cleans stale flannel vxlan before Cilium install", () => {
--- a/bastion/src/modules/modules/k3s/tests/operations.test.ts
+++ b/bastion/src/modules/modules/k3s/tests/operations.test.ts
@@ -348,3 +348,143 @@ describe("applyPodSecurityStandards", () => {
    expectCommand(ctx.ssh, "pod-security.kubernetes.io/audit=restricted");
  });
 });
+
+// --- Audit Logging Decommission (file-based → journald) ---
+
+import { configureLogRotation } from "../src/operations/log-rotation.js";
+import { configureJournaldLimits } from "../src/operations/journald-limits.js";
+
+describe("configureLogRotation (decommission file-based audit logs)", () => {
+  it("removes the legacy logrotate rule and reaps obsolete audit files", async () => {
+    const ctx = mockCtx();
+    ctx.ssh.exec.mockResolvedValueOnce(stdout("present")); // probe: legacy artifacts exist
+    ctx.ssh.exec.mockResolvedValue(OK);
+
+    const result = await configureLogRotation(ctx);
+
+    expect(result.success).toBe(true);
+    expect(result.changed).toBe(true);
+    expectCommand(ctx.ssh, "rm -f /etc/logrotate.d/k3s");
+    expectCommand(ctx.ssh, /find \/var\/log\/kubernetes.*audit.*-delete/);
+    expectCommand(ctx.ssh, "rmdir /var/log/kubernetes");
+  });
+
+  it("is a no-op when nothing legacy is present", async () => {
+    const ctx = mockCtx();
+    ctx.ssh.exec.mockResolvedValueOnce(stdout("absent"));
+    ctx.ssh.exec.mockResolvedValue(OK);
+
+    const result = await configureLogRotation(ctx);
+    expect(result.success).toBe(true);
+    expect(result.changed).toBe(false);
+  });
+});
+
+describe("configureJournaldLimits", () => {
+  it("writes a 2 GB SystemMaxUse drop-in and reloads journald when changed", async () => {
+    const ctx = mockCtx();
+    ctx.ssh.exec.mockResolvedValueOnce(stdout("__LABCTL_NOT_FOUND__")); // no existing drop-in
+    ctx.ssh.exec.mockResolvedValue(OK);
+
+    const result = await configureJournaldLimits(ctx);
+
+    expect(result.success).toBe(true);
+    expect(result.changed).toBe(true);
+    const writeCall = ctx.ssh.exec.mock.calls.find((c) => {
+      const cmd = c[0] as string;
+      return cmd.includes("10-k3s-audit-cap.conf") && cmd.includes("LABCTL_EOF");
+    });
+    expect(writeCall).toBeTruthy();
+    const written = writeCall?.[0] as string;
+    expect(written).toContain("SystemMaxUse=2G");
+    expect(written).toContain("SystemKeepFree=1G");
+    expectCommand(ctx.ssh, "systemctl restart systemd-journald");
+  });
+
+  it("does not restart journald when the drop-in is already correct", async () => {
+    const ctx = mockCtx();
+    const existing =
+      "[Journal]\nSystemMaxUse=2G\nSystemKeepFree=1G\nSystemMaxFileSize=200M\n";
+    ctx.ssh.exec.mockResolvedValueOnce(stdout(existing));
+    ctx.ssh.exec.mockResolvedValue(OK);
+
+    const result = await configureJournaldLimits(ctx);
+
+    expect(result.success).toBe(true);
+    expect(result.changed).toBe(false);
+    expectNoCommand(ctx.ssh, "systemctl restart systemd-journald");
+  });
+});
+
+// --- Etcd Recovery ---
+
+import { recoverEtcdMember } from "../src/operations/etcd-recover.js";
+import { mockSsh } from "./helpers.js";
+
+describe("recoverEtcdMember", () => {
+  it("refuses to operate when cluster is below 3 members (quorum risk)", async () => {
+    const broken = mockSsh();
+    const peer = mockSsh();
+    peer.exec.mockResolvedValueOnce(stdout("/usr/bin/etcdctl")); // etcdctl present
+    peer.exec.mockResolvedValueOnce(stdout(
+      "111, started, host-a-aaa, https://10.0.0.1:2380, https://10.0.0.1:2379, false\n" +
+      "222, started, host-b-bbb, https://10.0.0.2:2380, https://10.0.0.2:2379, false",
+    ));
+
+    const result = await recoverEtcdMember({ broken, peer, brokenHostname: "host-b" });
+
+    expect(result.success).toBe(false);
+    expect(result.message).toMatch(/quorum/i);
+    // Critically: must NOT have stopped k3s or removed anything
+    expect(broken.exec).not.toHaveBeenCalledWith(expect.stringContaining("systemctl stop k3s"), expect.anything());
+  });
+
+  it("performs full procedure when quorum is preserved", async () => {
+    const broken = mockSsh();
+    const peer = mockSsh();
+    // ensureEtcdctl: present
+    peer.exec.mockResolvedValueOnce(stdout("/usr/bin/etcdctl"));
+    // member list (3 members, target = host-b)
+    peer.exec.mockResolvedValueOnce(stdout(
+      "111, started, host-a-aaa, https://10.0.0.1:2380, https://10.0.0.1:2379, false\n" +
+      "222, started, host-b-bbb, https://10.0.0.2:2380, https://10.0.0.2:2379, false\n" +
+      "333, started, host-c-ccc, https://10.0.0.3:2380, https://10.0.0.3:2379, false",
+    ));
+    // member remove
+    peer.exec.mockResolvedValueOnce(stdout("Member 222 removed"));
+    // post-rejoin member list — new id 444 for host-b
+    peer.exec.mockResolvedValueOnce(stdout(
+      "111, started, host-a-aaa, https://10.0.0.1:2380, https://10.0.0.1:2379, false\n" +
+      "333, started, host-c-ccc, https://10.0.0.3:2380, https://10.0.0.3:2379, false\n" +
+      "444, started, host-b-zzz, https://10.0.0.2:2380, https://10.0.0.2:2379, false",
+    ));
+
+    const result = await recoverEtcdMember({ broken, peer, brokenHostname: "host-b" });
+
+    expect(result.success).toBe(true);
+    expect(result.removedMemberId).toBe("222");
+    expect(result.newMemberId).toBe("444");
+    expectCommand(broken,"systemctl stop k3s");
+    expectCommand(peer,"member remove 222");
+    expectCommand(broken,/db\.corrupt-/);
+    expectCommand(broken,/rm -rf .*\/server\/tls/);
+    expectCommand(broken,"systemctl start k3s");
+  });
+
+  it("fails clearly when no member matches the broken hostname", async () => {
+    const broken = mockSsh();
+    const peer = mockSsh();
+    peer.exec.mockResolvedValueOnce(stdout("/usr/bin/etcdctl"));
+    peer.exec.mockResolvedValueOnce(stdout(
+      "111, started, host-a-aaa, https://10.0.0.1:2380, https://10.0.0.1:2379, false\n" +
+      "222, started, host-b-bbb, https://10.0.0.2:2380, https://10.0.0.2:2379, false\n" +
+      "333, started, host-c-ccc, https://10.0.0.3:2380, https://10.0.0.3:2379, false",
+    ));
+
+    const result = await recoverEtcdMember({ broken, peer, brokenHostname: "host-d" });
+
+    expect(result.success).toBe(false);
+    expect(result.message).toMatch(/No etcd member found/);
+    expect(broken.exec).not.toHaveBeenCalledWith(expect.stringContaining("systemctl stop k3s"), expect.anything());
+  });
+});
--- a/bastion/tsconfig.json
+++ b/bastion/tsconfig.json
@@ -1,6 +1,7 @@
 {
  "files": [],
  "references": [
+    { "path": "src/core" },
    { "path": "src/shared" },
    { "path": "src/bastion" },
    { "path": "src/cli" },
Author	SHA1	Message	Date
Michal	98b0ccc6c9	feat(cli): honor BASTION_DIR env var as default for --dir Some checks failed CI/CD / typecheck (pull_request) Failing after 21s Details CI/CD / test (pull_request) Failing after 22s Details CI/CD / lint (pull_request) Failing after 7m2s Details CI/CD / build (pull_request) Has been skipped Details CI/CD / publish-rpm (pull_request) Has been skipped Details CI/CD / publish-deb (pull_request) Has been skipped Details bastion serve/stop default for --dir was hardcoded to /tmp/lab-bastion. Now reads BASTION_DIR from env if set, so a deployed bastion daemon can run from a persistent directory without callers having to pass --dir on every invocation. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-05-05 22:09:24 +01:00
Michal	37a3b51e57	build(labd): include @lab/core in the Dockerfile build chain The v2.0 Phase 1 commit (`04faa07`) introduced the @lab/core package but the labd Dockerfile still only copied @lab/shared and @lab/labd, so the container build would fail to resolve @lab/core imports. Both stages updated: - Builder: copy @lab/core package.json/tsconfig + src, add it to the build order between @lab/shared and @lab/labd. - Runtime: copy @lab/core dist and package.json into the final image. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-05-05 22:09:24 +01:00
Michal	d6e1f3c74d	fix(labd): preserve machine identity across bastion restarts The worker0-k8s0 bug: when labd restarts, the in-memory installed map is lost. The next DHCP/PXE re-discovery for that MAC ran an upsert that wrote status="discovered", silently downgrading the DB record from "online" or "offline" and erasing the machine's known hostname/role identity from the CLI view. - server.ts: drop status="discovered" from the upsert update branch so re-discovery cannot downgrade an installed record. - routes/bastions.ts (/api/machines): when the DB knows a real hostname+role for a MAC currently only in live.discovered, promote it back to live.installed so the CLI sees the right state. Also reordered the live-vs-DB fallback so DB online/offline maps to live.installed and the discovered branch is the else. - tests: 3 new vitest cases covering promotion, fresh-discovery, and unknown-MAC fallback. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-05-05 22:09:24 +01:00
Michal	52e831b8c1	Merge branch 'main' into feat/v2-phase1-foundation	2026-05-05 22:06:34 +01:00
michal	f5af24699a	Merge pull request 'fix(k3s): audit logs via journald + etcd recovery' (#13 ) from fix/k3s-audit-via-journald into main Some checks failed CI/CD / typecheck (push) Failing after 11s Details CI/CD / test (push) Failing after 9s Details CI/CD / lint (push) Failing after 21s Details CI/CD / build (push) Has been skipped Details CI/CD / publish-rpm (push) Has been skipped Details CI/CD / publish-deb (push) Has been skipped Details	2026-05-05 20:29:51 +00:00
Michal	dd92147341	fix(k3s): route audit logs through journald, codify etcd member recovery Some checks failed CI/CD / typecheck (pull_request) Failing after 13s Details CI/CD / lint (pull_request) Failing after 23s Details CI/CD / test (pull_request) Failing after 10s Details CI/CD / build (pull_request) Has been skipped Details CI/CD / publish-rpm (pull_request) Has been skipped Details CI/CD / publish-deb (pull_request) Has been skipped Details Two changes prompted by today's etcd raft panic on worker1-k8s0 (tocommit out of range, lost-write on follower) and the cascading disk pressure that surfaced underneath it. Audit logs to journald - kube-apiserver now uses audit-log-path=- so audit events flow to k3s.service stdout and into journald instead of growing files in /var/log/kubernetes. The previous setup combined apiserver's internal rotation with a logrotate *.log glob that double-rotated the rotated files into permanent orphans (observed: 7+ GB). - New journald-limits operation writes a SystemMaxUse=2G drop-in so audit volume cannot fill /var/log even under bursty load. - log-rotation operation repurposed to decommission the obsolete logrotate rule and reap leftover audit files. Idempotent: no-op on fresh installs. Etcd member recovery - New recoverEtcdMember(broken, peer, hostname) codifies the documented k3s recovery: stop k3s, etcdctl member remove, wipe /var/lib/rancher/k3s/server/{db,tls,cred}, restart, poll for rejoin. Refuses to operate when cluster size < 3 to preserve quorum. Tests - 7 new unit tests covering both decommission paths and the recovery procedure (54 total, all green). - install.test.ts asserts the file-based audit args are gone. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-05-05 21:29:16 +01:00
Michal	04faa079e2	feat: v2.0 Phase 1 foundation — @lab/core, auth, RBAC, audit, resource store New packages: - @lab/core: Resource types, Output<T> (Pulumi), audit event types, auth types, environment/account types, resource kind registry New Prisma schema (mcpctl pattern): - User (email/password/bcrypt), Session (bearer tokens), Group, GroupMember - ServiceAccount, RbacDefinition (JSON subjects + roleBindings) - AuditEvent (correlation IDs, causal chains, fire-and-forget batching) - Environment, Account (driver config, Infisical secret path), Binding - Resource (generic, kind/name/env unique, origin/managedBy tracking) - Secret, Fleet, FleetMember, GitSource - Keeps v1.0 models: Server, Agent, Bastion, Cluster, JoinToken New services: - AuthService: bearer token login, bootstrap (first login creates admin), session management with 30-day expiry - RbacService: environment-scoped permission checks, group membership, role hierarchy (admin > edit > view) - AuditService: fire-and-forget event collection, batch 50 / flush 5s, correlation IDs for causal chains - ResourceStore: CRUD with origin/managedBy, RBAC-enforced routes New routes: - POST /api/auth/login, POST /api/auth/logout (bearer token auth) - GET/POST/PUT/DELETE /api/resources (RBAC-enforced CRUD) - GET/POST /api/environments, GET/POST /api/accounts - POST /api/accounts/bind, GET /api/bindings - GET /api/events (audit query with --last, --kind, --env, --correlation) New middleware: - Bearer token auth (validates Authorization header, resolves user identity) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-04-02 01:42:28 +01:00
michal	95c99cb4d5	Merge pull request 'docs: CLAUDE.md routing rules + TODOS.md from v2.0 review' (#12 ) from feat/recheck-and-fixes into main Some checks failed CI/CD / lint (push) Failing after 12s Details CI/CD / typecheck (push) Failing after 22s Details CI/CD / test (push) Failing after 12s Details CI/CD / build (push) Has been skipped Details CI/CD / publish-rpm (push) Has been skipped Details CI/CD / publish-deb (push) Has been skipped Details Reviewed-on: #12	2026-04-02 00:31:44 +00:00