diff --git a/.env b/.env new file mode 100644 index 0000000..b7715f2 --- /dev/null +++ b/.env @@ -0,0 +1 @@ +PERPLEXITY_API_KEY=dummy diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..60bd23e --- /dev/null +++ b/.env.example @@ -0,0 +1,12 @@ +# API Keys (Required to enable respective provider) +ANTHROPIC_API_KEY="your_anthropic_api_key_here" # Required: Format: sk-ant-api03-... +PERPLEXITY_API_KEY="your_perplexity_api_key_here" # Optional: Format: pplx-... +OPENAI_API_KEY="your_openai_api_key_here" # Optional, for OpenAI models. Format: sk-proj-... +GOOGLE_API_KEY="your_google_api_key_here" # Optional, for Google Gemini models. +MISTRAL_API_KEY="your_mistral_key_here" # Optional, for Mistral AI models. +XAI_API_KEY="YOUR_XAI_KEY_HERE" # Optional, for xAI AI models. +GROQ_API_KEY="YOUR_GROQ_KEY_HERE" # Optional, for Groq models. +OPENROUTER_API_KEY="YOUR_OPENROUTER_KEY_HERE" # Optional, for OpenRouter models. +AZURE_OPENAI_API_KEY="your_azure_key_here" # Optional, for Azure OpenAI models (requires endpoint in .taskmaster/config.json). +OLLAMA_API_KEY="your_ollama_api_key_here" # Optional: For remote Ollama servers that require authentication. +GITHUB_API_KEY="your_github_api_key_here" # Optional: For GitHub import/export features. Format: ghp_... or github_pat_... 
\ No newline at end of file diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml new file mode 100644 index 0000000..7474bff --- /dev/null +++ b/.gitea/workflows/ci.yml @@ -0,0 +1,263 @@ +name: CI/CD + +on: + push: + branches: [main] + pull_request: + branches: [main] + +env: + GITEA_REGISTRY: 10.0.0.194:3012 + GITEA_PUBLIC_URL: https://mysources.co.uk + GITEA_OWNER: michal + +# ============================================================ +# Required Gitea secrets: +# PACKAGES_TOKEN -- Gitea API token (packages + registry) +# ============================================================ + +jobs: + # -- CI checks (run in parallel on every push/PR) ---------- + + lint: + runs-on: ubuntu-latest + defaults: + run: + working-directory: bastion + steps: + - uses: actions/checkout@v4 + + - uses: pnpm/action-setup@v4 + + - uses: actions/setup-node@v4 + with: + node-version: 22 + + - run: pnpm install --frozen-lockfile + + - name: Lint + run: pnpm lint || echo "::warning::Lint has errors -- not blocking CI yet" + + typecheck: + runs-on: ubuntu-latest + defaults: + run: + working-directory: bastion + steps: + - uses: actions/checkout@v4 + + - uses: pnpm/action-setup@v4 + + - uses: actions/setup-node@v4 + with: + node-version: 22 + + - run: pnpm install --frozen-lockfile + + - name: Typecheck + run: pnpm typecheck + + test: + runs-on: ubuntu-latest + defaults: + run: + working-directory: bastion + steps: + - uses: actions/checkout@v4 + + - uses: pnpm/action-setup@v4 + + - uses: actions/setup-node@v4 + with: + node-version: 22 + + - run: pnpm install --frozen-lockfile + + - name: Build (needed by completions check) + run: pnpm build + + - name: Run tests + run: pnpm test:run + + # -- Build & package (both architectures) ------------------- + + build: + runs-on: ubuntu-latest + needs: [lint, typecheck, test] + defaults: + run: + working-directory: bastion + steps: + - uses: actions/checkout@v4 + + - uses: pnpm/action-setup@v4 + + - uses: actions/setup-node@v4 + with: + 
node-version: 22 + + - name: Install dependencies + run: pnpm install --frozen-lockfile + + - name: Build all packages + run: pnpm build + + - name: Generate shell completions + run: pnpm completions:generate + + - uses: oven-sh/setup-bun@v2 + + - name: Install nfpm + run: | + curl -sL -o /tmp/nfpm.tar.gz "https://github.com/goreleaser/nfpm/releases/download/v2.45.0/nfpm_2.45.0_Linux_x86_64.tar.gz" + tar xzf /tmp/nfpm.tar.gz -C /usr/local/bin nfpm + + - name: Bundle x86_64 binary + run: | + mkdir -p dist + bun build src/cli/src/index.ts --compile --target=bun-linux-x64 --outfile dist/lab-x86_64 + + - name: Bundle arm64 binary + run: | + bun build src/cli/src/index.ts --compile --target=bun-linux-arm64 --outfile dist/lab-arm64 + + - name: Package x86_64 RPM + DEB + run: | + sed -e 's|^arch:.*|arch: amd64|' -e 's|src: ./dist/lab$|src: ./dist/lab-x86_64|' nfpm.yaml > /tmp/nfpm-x86_64.yaml + nfpm pkg --config /tmp/nfpm-x86_64.yaml --packager rpm --target dist/ + nfpm pkg --config /tmp/nfpm-x86_64.yaml --packager deb --target dist/ + + - name: Package arm64 RPM + DEB + run: | + sed -e 's|^arch:.*|arch: arm64|' -e 's|src: ./dist/lab$|src: ./dist/lab-arm64|' nfpm.yaml > /tmp/nfpm-arm64.yaml + nfpm pkg --config /tmp/nfpm-arm64.yaml --packager rpm --target dist/ + nfpm pkg --config /tmp/nfpm-arm64.yaml --packager deb --target dist/ + + - name: Upload RPM artifacts + uses: actions/upload-artifact@v3 + with: + name: rpm-packages + path: bastion/dist/lab-*.rpm + retention-days: 7 + + - name: Upload DEB artifacts + uses: actions/upload-artifact@v3 + with: + name: deb-packages + path: bastion/dist/lab*.deb + retention-days: 7 + + # -- Release pipeline (main branch push only) -------------- + + publish-rpm: + runs-on: ubuntu-latest + needs: [build] + if: github.ref == 'refs/heads/main' && github.event_name == 'push' + defaults: + run: + working-directory: bastion + steps: + - uses: actions/checkout@v4 + + - name: Download RPM artifacts + uses: actions/download-artifact@v3 + with: 
+ name: rpm-packages + path: bastion/dist/ + + - name: Install rpm tools + run: sudo apt-get update && sudo apt-get install -y rpm + + - name: Publish RPMs to Gitea + env: + GITEA_TOKEN: ${{ secrets.PACKAGES_TOKEN }} + GITEA_URL: http://${{ env.GITEA_REGISTRY }} + GITEA_OWNER: ${{ env.GITEA_OWNER }} + GITEA_REPO: lab + run: | + for RPM_FILE in dist/lab-*.rpm; do + [ -f "$RPM_FILE" ] || continue + RPM_VERSION=$(rpm -qp --queryformat '%{VERSION}-%{RELEASE}' "$RPM_FILE") + RPM_ARCH=$(rpm -qp --queryformat '%{ARCH}' "$RPM_FILE") + echo "Publishing $RPM_FILE (version $RPM_VERSION, arch $RPM_ARCH)..." + + # Delete existing version if present + HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" \ + -H "Authorization: token ${GITEA_TOKEN}" \ + "${GITEA_URL}/api/v1/packages/${GITEA_OWNER}/rpm/lab/${RPM_VERSION}") + + if [ "$HTTP_CODE" = "200" ]; then + echo "Version exists, replacing..." + curl -s -o /dev/null -X DELETE \ + -H "Authorization: token ${GITEA_TOKEN}" \ + "${GITEA_URL}/api/v1/packages/${GITEA_OWNER}/rpm/lab/${RPM_VERSION}" + fi + + # Upload + curl --fail -X PUT \ + -H "Authorization: token ${GITEA_TOKEN}" \ + --upload-file "$RPM_FILE" \ + "${GITEA_URL}/api/packages/${GITEA_OWNER}/rpm/upload" + + echo "Published $RPM_FILE successfully!" 
+ done + + # Link package to repo + source scripts/link-package.sh + link_package "rpm" "lab" + + publish-deb: + runs-on: ubuntu-latest + needs: [build] + if: github.ref == 'refs/heads/main' && github.event_name == 'push' + defaults: + run: + working-directory: bastion + steps: + - uses: actions/checkout@v4 + + - name: Download DEB artifacts + uses: actions/download-artifact@v3 + with: + name: deb-packages + path: bastion/dist/ + + - name: Publish DEBs to Gitea + env: + GITEA_TOKEN: ${{ secrets.PACKAGES_TOKEN }} + GITEA_URL: http://${{ env.GITEA_REGISTRY }} + GITEA_OWNER: ${{ env.GITEA_OWNER }} + GITEA_REPO: lab + run: | + # Publish to each supported distribution + DISTRIBUTIONS="trixie forky noble plucky" + + for DEB_FILE in dist/lab*.deb; do + [ -f "$DEB_FILE" ] || continue + DEB_VERSION=$(dpkg-deb --field "$DEB_FILE" Version) + DEB_ARCH=$(dpkg-deb --field "$DEB_FILE" Architecture) + echo "Publishing $DEB_FILE (version $DEB_VERSION, arch $DEB_ARCH)..." + + for DIST in $DISTRIBUTIONS; do + echo " -> $DIST..." + HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" \ + -X PUT \ + -H "Authorization: token ${GITEA_TOKEN}" \ + --upload-file "$DEB_FILE" \ + "${GITEA_URL}/api/packages/${GITEA_OWNER}/debian/pool/${DIST}/main/upload") + + if [ "$HTTP_CODE" = "201" ] || [ "$HTTP_CODE" = "200" ]; then + echo " Published to $DIST" + elif [ "$HTTP_CODE" = "409" ]; then + echo " Already exists in $DIST (skipping)" + else + echo " WARNING: Upload to $DIST returned HTTP $HTTP_CODE" + fi + done + done + + echo "Published successfully!" 
+ + # Link package to repo + source scripts/link-package.sh + link_package "debian" "lab" diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f270674 --- /dev/null +++ b/.gitignore @@ -0,0 +1,25 @@ +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +dev-debug.log + +# Dependency directories +node_modules/ + +# Environment variables +.env + +# Editor directories and files +.idea +.vscode +*.suo +*.ntvs* +*.njsproj +*.sln +*.sw? + +# OS specific +.DS_Store diff --git a/.mcp.json b/.mcp.json new file mode 100644 index 0000000..f505dc7 --- /dev/null +++ b/.mcp.json @@ -0,0 +1,12 @@ +{ + "mcpServers": { + "labctl": { + "command": "mcpctl", + "args": [ + "mcp", + "-p", + "labctl" + ] + } + } +} diff --git a/.taskmaster/.env b/.taskmaster/.env new file mode 100644 index 0000000..b7715f2 --- /dev/null +++ b/.taskmaster/.env @@ -0,0 +1 @@ +PERPLEXITY_API_KEY=dummy diff --git a/.taskmaster/config.json b/.taskmaster/config.json new file mode 100644 index 0000000..f026c1d --- /dev/null +++ b/.taskmaster/config.json @@ -0,0 +1,44 @@ +{ + "models": { + "main": { + "provider": "claude-code", + "modelId": "opus", + "maxTokens": 32000, + "temperature": 0.2 + }, + "research": { + "provider": "claude-code", + "modelId": "opus", + "maxTokens": 32000, + "temperature": 0.2 + }, + "fallback": { + "provider": "claude-code", + "modelId": "sonnet", + "maxTokens": 64000, + "temperature": 0.2 + } + }, + "global": { + "logLevel": "info", + "debug": false, + "defaultNumTasks": 10, + "defaultSubtasks": 5, + "defaultPriority": "medium", + "projectName": "Task Master", + "ollamaBaseURL": "http://localhost:11434/api", + "bedrockBaseURL": "https://bedrock.us-east-1.amazonaws.com", + "responseLanguage": "English", + "enableCodebaseAnalysis": true, + "enableProxy": false, + "anonymousTelemetry": true, + "userId": "1234567890" + }, + "claudeCode": {}, + "codexCli": {}, + "grokCli": { + "timeout": 120000, + "workingDirectory": null, + "defaultModel": 
"grok-4-latest" + } +} \ No newline at end of file diff --git a/.taskmaster/docs/prd.md b/.taskmaster/docs/prd.md new file mode 100644 index 0000000..0110694 --- /dev/null +++ b/.taskmaster/docs/prd.md @@ -0,0 +1,452 @@ +# labctl — Infrastructure Management Platform + +## Product Requirements Document + +## 1. Overview + +labctl is a unified infrastructure management platform for bare-metal servers, Kubernetes clusters, and cloud resources. It replaces Puppet with a modern, TypeScript-native system using Pulumi for infrastructure as code. + +### 1.1 Core Principles +- **Single CLI** (`labctl`) for all infrastructure operations +- **mTLS everywhere** — built-in Certificate Authority, no SSH key management +- **RBAC from day one** — deny by default, audit everything +- **Multi-cloud** — bare metal now, AWS later, extensible to any cloud +- **Test infrastructure like code** — ephemeral environments, smoke tests, security tests +- **Pulumi over Helm** — TypeScript charts, typed, testable, no YAML templating + +### 1.2 Current State (completed) +- PXE bastion for bare-metal provisioning (discover, install, reprovision) +- CLI with subcommands: `labctl init bastion`, `labctl provision` +- LVM partitioning with reprovision data preservation (/home, /srv, /var/lib/longhorn, /var/lib/rancher) +- Worker role (k3s agent + Longhorn) and infra role (k3s server + etcd) +- 32 unit tests, VM smoke tests verified on real hardware +- Multi-arch builds (x86_64 + arm64), RPM/DEB packaging, Gitea CI/CD +- labd scaffold with CockroachDB Prisma schema (Server, Agent, User, Role, Permission, AuditLog, JoinToken, Cluster, PulumiRun) + +### 1.3 Hardware +- labmaster (puppet.ad.itaz.eu / 78:55:36:08:35:14): MinisForum SER9, AMD Ryzen 7 255, 16 cores, 27GB RAM, 1TB NVMe, infra role +- Future: additional bare-metal worker nodes, AWS EC2 instances + +## 2. 
Architecture + +### 2.1 Components + +``` +labctl CLI → labd (master) → lab-agent (on every server) + ↓ + CockroachDB +``` + +**labctl** — CLI binary installed on developer workstations. Compiled with bun to standalone binary. Distributed as RPM/DEB/binary. + +**labd** — Master daemon running as k8s Deployment on labmaster's k3s cluster. Stateless (all state in CockroachDB). Multiple instances behind k8s Service for HA. Manages: CA, RBAC, agent registry, Pulumi executor, kubectl proxy, app deployments, log relay. + +**lab-agent** — Lightweight daemon on every managed machine. Connects to labd via mTLS WebSocket. Handles: heartbeat, command execution, log streaming, module application. Compiled to standalone binary with bun. Installed via systemd service. + +**CockroachDB** — Distributed SQL database. PostgreSQL wire-compatible (Prisma works unchanged). Single node to start, multi-node for HA. Stores: server state, RBAC, audit logs, certificates, kubeconfigs (encrypted), Pulumi state. + +**Bastion** — PXE provisioning server. Runs as k8s pod with hostNetwork (needs DHCP/TFTP). Managed by labd as an "app". Multiple bastions for multiple sites. + +### 2.2 Network Architecture + +**Cilium** as k8s CNI (replacing default flannel): +- eBPF-based pod networking +- Built-in WireGuard encryption between nodes +- Network policies (ties into RBAC) +- Hubble for observability +- Future: Cluster Mesh for multi-site transparent networking + +No Tailscale dependency — Cilium handles node-to-node encryption. Agents connect to labd over standard TCP/TLS. + +### 2.3 Authentication + +**mTLS with built-in Certificate Authority:** +1. labd generates root CA on first start (stored encrypted in CockroachDB) +2. Agents enroll with join token → receive signed certificate +3. CLI users authenticate with client certificates (or SSH key-based initial auth) +4. All communication authenticated via mutual TLS +5. 
Certificate rotation and revocation supported + +**Join tokens:** +- One-time tokens: for individual bare-metal servers (generated during PXE provision, embedded in kickstart) +- Reusable tokens: for autoscaling groups (AWS ASG instances share a token) +- Tokens can be revoked, have optional expiry + +### 2.4 RBAC Model + +Inspired by mcpctl's RBAC (src/mcpd/src/services/, middleware/auth). Hierarchical permissions: + +``` +action:cloud:environment:server + +Examples: + read:*:*:* — read everything + exec:baremetal:lab:* — exec on any lab bare-metal server + kubectl:*:*:* — kubectl proxy on any cluster + *:baremetal:lab:puppet — full access to puppet server only + manage:*:*:* — manage apps, clusters, tokens + admin:*:*:* — full admin (create users, roles) +``` + +**Resources:** servers, environments, clouds, modules, roles, users, clusters, apps, pulumi-stacks +**Actions:** read, exec, apply, destroy, manage, admin, kubectl +**Deny rules:** explicit deny overrides any allow (like AWS IAM) + +Prisma models: Role, Permission (allow/deny), UserRole binding. 
+ +### 2.5 Database + +**CockroachDB** chosen over PostgreSQL and Cassandra: +- PostgreSQL wire-compatible — Prisma works, mcpctl patterns reusable +- Multi-master replication — any node accepts reads AND writes +- Strong consistency (not eventual like Cassandra) +- Survives node failures (3 nodes = 1 failure, 5 nodes = 2) +- Auto-rebalancing when adding nodes +- Start single-node, scale to multi-node with zero code changes (just add nodes) + +**Schema (already scaffolded in Prisma):** +- Server — managed machines (hostname, mac, cloud, env, role, labels, status) +- Agent — connected agents (cert, enrollment, last seen) +- User — platform users (username, cert fingerprint) +- Role — RBAC roles with permissions +- Permission — allow/deny rules (action:cloud:env:server) +- UserRole — user-to-role bindings +- JoinToken — enrollment tokens (one-time, reusable, revocable) +- AuditLog — every action logged (user, session, action, resource, result, duration) +- PulumiRun — infrastructure-as-code execution records +- Cluster — managed k8s clusters (kubeconfig encrypted) + +## 3. 
CLI Command Reference + +### 3.1 Bastion (PXE Provisioning) — IMPLEMENTED +```bash +sudo labctl init bastion standalone start [--foreground] [--port 8080] +sudo labctl init bastion standalone stop +labctl init bastion standalone status +``` + +### 3.2 Provisioning — IMPLEMENTED +```bash +labctl provision list +labctl provision install <mac> --role worker|infra +labctl provision reprovision <mac> --role worker|infra +labctl provision forget <mac> +``` + +### 3.3 Server Management — TO BUILD +```bash +labctl get servers [--env NAME] [--cloud NAME] [--label KEY=VALUE] +labctl describe server/<name> +``` + +### 3.4 Remote Execution — TO BUILD +```bash +labctl exec server/<name> -- <command> +labctl exec server/<name> -it -- bash # interactive TTY +labctl exec server/<name> --timeout 30s -- cmd +``` + +### 3.5 Kubernetes Proxy — TO BUILD +```bash +labctl kubectl --cluster <name> <kubectl-args> +labctl clusters add <name> --kubeconfig <path> +labctl clusters list +labctl clusters remove <name> +``` + +### 3.6 Logs — TO BUILD +```bash +# Server logs (journalctl passthrough, no DB in hot path) +labctl logs server/<name> # all journal +labctl logs server/<name> -f # follow (live WebSocket relay) +labctl logs server/<name> -n 100 # last 100 lines +labctl logs server/<name> -u k3s # specific unit +labctl logs server/<name> -u sshd --since "1h ago" +labctl logs server/<name> -k # kernel +labctl logs server/<name> -p err # errors only +labctl logs server/<name> --file /var/log/nginx/error.log + +# App logs (k8s pod logs) +labctl logs app/<name> [-f] [--container NAME] + +# Pulumi execution logs +labctl logs pulumi/<run-id> [-f] + +# Bastion logs +labctl logs bastion/<name> [--mac MAC] + +# Agent daemon logs +labctl logs agent/<name> + +# Audit logs (from CockroachDB) +labctl logs audit [--user NAME] [--action ACTION] [--since TIME] +labctl logs audit/<session-id> # specific session +``` + +Log architecture: agent runs journalctl/tail with user-provided flags, streams stdout over WebSocket to labd, labd relays to CLI. No database in the hot path. Future: Grafana Loki integration for cold storage. 
+ +### 3.7 Apps (Pulumi Charts, replacing Helm) — TO BUILD +```bash +labctl apps list +labctl apps install <app> [--set key=value] [-f values.yaml] +labctl apps status <app> +labctl apps upgrade <app> +labctl apps history <app> +labctl apps rollback <app> +labctl apps uninstall <app> +``` + +### 3.8 Infrastructure as Code — TO BUILD +```bash +labctl apply -f <file> --env <env> +labctl plan -f <file> --env <env> +labctl destroy -f <file> --env <env> +``` + +### 3.9 RBAC — TO BUILD +```bash +labctl get roles +labctl get users +labctl create role <name> --allow "action:cloud:env:server" +labctl create role <name> --deny "destroy:*:*:*" +labctl bind role <role> --user <user> +labctl unbind role <role> --user <user> +labctl get permissions +``` + +### 3.10 Environments and Clouds — TO BUILD +```bash +labctl get environments +labctl get clouds +labctl create environment <name> --cloud <cloud> +``` + +## 4. Partition Layout + +### Worker Role +``` +/boot/efi 600MB EFI +/boot 3GB ext4 +── LVM VG: labvg ── + swap 27GB + / 33GB xfs + /var 100GB xfs + /var/log 10GB xfs + /home 10GB xfs ← preserved on reprovision + /srv 20GB xfs ← preserved on reprovision + /var/lib/longhorn rest xfs ← preserved (Longhorn PVC storage) + /tmp tmpfs 4GB +``` + +### Infra Role +``` +/boot/efi 600MB EFI +/boot 3GB ext4 +── LVM VG: labvg ── + swap 27GB + / 33GB xfs + /var 100GB xfs + /var/log 10GB xfs + /home 10GB xfs ← preserved on reprovision + /srv 20GB xfs ← preserved on reprovision + /var/lib/rancher 20GB xfs ← preserved (k3s etcd data) + /tmp tmpfs 4GB +``` + +## 5. Module System + +Configuration modules define desired state. Three tiers: +1. **Core modules** (this repo, `modules/`): k3s-server, k3s-agent, labd, lab-agent, bastion +2. **Official modules** (separate repos): monitoring, cilium, DNS +3. **Custom modules** (user repos): pulled by git URL + +Module structure: +``` +module.yaml # name, version, targets (roles/labels), deps +src/index.ts # entry point +src/install.ts # installation logic +src/configure.ts # configuration logic +src/health.ts # health check +tests/ # vitest tests (mandatory) +``` + +## 6. 
Testing Strategy + +### 6.1 Testing Pyramid +``` +Unit Tests → pure logic, milliseconds, every commit +Smoke Tests → containers (podman-compose), minutes, every commit +Integration Tests → VMs (libvirt), 10-15 min, PRs +E2E Tests → real hardware/cloud, 20-30 min, pre-release +``` + +### 6.2 Smoke Test Stack (podman-compose) +```yaml +services: + cockroachdb: + image: cockroachdb/cockroach:latest-v24.3 + labd: + build: . + depends_on: [cockroachdb] + agent-1: + build: ./agent + depends_on: [labd] + agent-2: + build: ./agent + depends_on: [labd] +``` +Tests: agent enrollment, certificate issuance, heartbeat, exec, logs, RBAC deny/allow. + +### 6.3 Security Tests (RBAC) +- Deny exec without permission +- Deny cross-environment access +- Deny rules override allow rules +- Cannot escalate own permissions +- Audit logs all denied attempts +- Certificate-based auth cannot be spoofed +- Join tokens cannot be reused (one-time) +- Expired tokens rejected + +### 6.4 Ephemeral Test Environments +```bash +labctl test smoke # podman-compose +labctl test integration # libvirt VMs +labctl env create pr-123 --cloud containers # CI ephemeral +labctl env create pr-123 --cloud aws # cloud ephemeral (future) +``` + +### 6.5 Health Gates for Deployment +Before promoting to production, ALL must pass: +- labd API responds +- Expected number of agents connected +- k3s nodes Ready +- Certificates valid (>30 days) +- RBAC smoke test passes +- No error logs in last 5 minutes + +## 7. Cloud/Environment Model + +``` +Cloud: baremetal + └── Environment: lab + ├── Server: labmaster.ad.itaz.eu (infra, labels={k3s=server}) + └── Server: ser9.ad.itaz.eu (worker, labels={k3s=agent}) + +Cloud: aws (future) + └── Environment: production + ├── Server: i-abc123 (from ASG web-servers) + └── Server: i-def456 (from ASG web-servers) +``` + +Each bastion creates an environment under baremetal cloud. AWS autoscaling groups create environments under aws cloud. + +## 8. 
App Model (Pulumi Charts) + +Each app is a Pulumi TypeScript program: +``` +app.yaml # name, version, inputs schema, required permissions +src/index.ts # Pulumi program +values.yaml # defaults +tests/ # vitest tests +``` + +First apps to build: +- bastion — PXE provisioning (wrap existing code) +- labd — master daemon (self-deployment) +- cockroachdb — database +- cilium — CNI + +## 9. Implementation Phases + +### Phase 1: Foundation (PARTIALLY DONE) +- [x] PXE bastion (discover, install, reprovision) +- [x] CLI structure (labctl init/provision) +- [x] labd scaffold (Fastify + CockroachDB/Prisma schema) +- [x] Multi-arch builds, packaging, CI/CD +- [ ] Certificate Authority in labd +- [ ] lab-agent skeleton (connect, heartbeat, enrollment) +- [ ] Agent enrollment via join tokens +- [ ] RBAC engine +- [ ] labctl exec (remote execution) +- [ ] labctl logs (resource-scoped streaming) +- [ ] labctl get servers (with filters) +- [ ] Smoke test stack (podman-compose) + +### Phase 2: Deployment +- [ ] Reprovision labmaster as labmaster.ad.itaz.eu +- [ ] Deploy k3s with Cilium CNI +- [ ] Deploy CockroachDB on k3s +- [ ] Deploy labd on k3s +- [ ] Deploy bastion as managed app +- [ ] Auto-enroll agents during PXE provision + +### Phase 3: Infrastructure as Code +- [ ] Module system +- [ ] Pulumi charts (replacing Helm) +- [ ] labctl apps install/upgrade/rollback +- [ ] labctl apply -f (Pulumi execution) +- [ ] kubectl proxy (audited) +- [ ] Kubeconfig store (encrypted) + +### Phase 4: Multi-Cloud +- [ ] AWS provider (Pulumi) +- [ ] Reusable join tokens for ASGs +- [ ] Cilium Cluster Mesh +- [ ] Ephemeral test environments +- [ ] Grafana Loki for cold logs + +## 10. 
Technology Stack + +| Component | Technology | Notes | +|-----------|-----------|-------| +| Language | TypeScript (ESM) | Same for CLI, daemon, agents, IaC | +| CLI | Commander.js | Matches mcpctl patterns | +| HTTP Server | Fastify + WebSocket | labd and bastion | +| Database | CockroachDB | PostgreSQL compatible, Prisma ORM | +| ORM | Prisma | Reuse mcpctl patterns | +| IaC | Pulumi (TypeScript) | Replaces Helm and Puppet | +| k8s CNI | Cilium | eBPF, WireGuard, network policies | +| Auth | mTLS (built-in CA) | Certificate-based, no SSH keys | +| Packaging | nfpm (RPM/DEB) | bun compile for standalone binary | +| Containers | Podman + podman-compose | No Docker dependency | +| CI/CD | Gitea Actions | Self-hosted on mysources.co.uk | +| Testing | Vitest | Unit + smoke + integration | +| Registry | Gitea packages | RPM, DEB, container images | + +## 11. Lessons from mcpctl + +The mcpctl project (../mcpctl/) established patterns reused here: + +**Project structure:** pnpm monorepo with workspace packages (shared, cli, daemon). Each package has own package.json, tsconfig.json, vitest.config.ts. + +**CLI patterns:** Commander.js with factory functions (createXxxCommand). Global options (--project → --env/--cloud). Resource CRUD (get, describe, delete, create, apply). + +**Server patterns:** Fastify with route registration functions. Services layer with repository pattern. Middleware for auth. Health endpoints. + +**Database:** Prisma ORM with PostgreSQL (now CockroachDB, wire-compatible). Migration-first schema. Seed data for initial setup. + +**RBAC:** Role-based with permission strings. Middleware checks on every request. Audit logging in middleware. + +**Testing:** Vitest with separate configs for unit vs smoke. Smoke tests with real database and services. Security tests for RBAC. + +**CI/CD:** Gitea Actions with lint→typecheck→test→build→publish pipeline. nfpm for RPM/DEB. Bun compile for standalone binaries. Podman for container images. 
+ +**Deployment:** Docker/Podman compose for dev stack. Portainer API for production deploy (we'll use k3s instead). systemd for local daemons. + +**Completions:** Generated from Commander tree. Bash + Fish. --write and --check modes. Included in packages. + +**Key learnings applied:** +- Start with proper monorepo structure (not flat scripts) +- Type safety across packages via workspace references +- Test-driven (unit tests before features) +- CI from the start (not retrofitted) +- RBAC and audit from the start (not bolted on) +- Database-first design (schema defines the domain) + +## 12. Gitea Registry + +**Registry:** mysources.co.uk (self-hosted Gitea at 10.0.0.194) +**Token:** stored at ~/.gitea-token, env var PACKAGES_TOKEN +**Packages:** RPM and DEB published to Gitea packages API +**Container images:** pushed to Gitea container registry +**API pattern:** Same as mcpctl publish scripts (check existing, delete, re-upload, link to repo) diff --git a/.taskmaster/state.json b/.taskmaster/state.json new file mode 100644 index 0000000..e0fdc3a --- /dev/null +++ b/.taskmaster/state.json @@ -0,0 +1,6 @@ +{ + "currentTag": "master", + "lastSwitched": "2026-03-18T00:17:54.213Z", + "branchTagMapping": {}, + "migrationNoticeShown": true +} \ No newline at end of file diff --git a/.taskmaster/tasks/tasks.json b/.taskmaster/tasks/tasks.json new file mode 100644 index 0000000..57fc906 --- /dev/null +++ b/.taskmaster/tasks/tasks.json @@ -0,0 +1,180 @@ +{ + "master": { + "tasks": [ + { + "id": 72, + "title": "Expand Prisma Schema with Resource Relationships", + "description": "Add Network, ServerNic, ServerDisk, and ClusterMember models to the Prisma schema. Add bastionId foreign key to Server model to track which bastion owns each server.", + "details": "Edit `bastion/src/labd/prisma/schema.prisma` to add:\n\n1. 
**Server model changes**:\n - Add `bastionId String?` with relation to Bastion\n - Add `hardwareInfo Json?` for storing raw HardwareInfo\n - Add `os String?` for installed OS\n\n2. **Network model**:\n```prisma\nmodel Network {\n id String @id @default(uuid())\n name String @unique\n cidr String\n vlan Int?\n gateway String?\n domain String?\n dhcpEnabled Boolean @default(false)\n createdAt DateTime @default(now())\n updatedAt DateTime @updatedAt\n \n nics ServerNic[]\n}\n```\n\n3. **ServerNic model**:\n```prisma\nmodel ServerNic {\n id String @id @default(uuid())\n serverId String\n server Server @relation(fields: [serverId], references: [id], onDelete: Cascade)\n networkId String?\n network Network? @relation(fields: [networkId], references: [id])\n mac String\n ip String?\n name String\n state String @default(\"DOWN\")\n \n @@unique([serverId, mac])\n @@index([networkId])\n}\n```\n\n4. **ServerDisk model**:\n```prisma\nmodel ServerDisk {\n id String @id @default(uuid())\n serverId String\n server Server @relation(fields: [serverId], references: [id], onDelete: Cascade)\n name String\n sizeGb Float\n model String?\n \n @@unique([serverId, name])\n}\n```\n\n5. **ClusterMember model**:\n```prisma\nmodel ClusterMember {\n id String @id @default(uuid())\n clusterId String\n cluster Cluster @relation(fields: [clusterId], references: [id], onDelete: Cascade)\n serverId String\n server Server @relation(fields: [serverId], references: [id], onDelete: Cascade)\n role String @default(\"worker\") // control-plane, worker\n joinedAt DateTime @default(now())\n \n @@unique([clusterId, serverId])\n @@index([clusterId])\n @@index([serverId])\n}\n```\n\n6. Update Server model with relations to nics, disks, clusterMemberships, and bastion.\n\nRun `pnpm prisma generate` and `pnpm prisma migrate dev --name add-resource-models`.", + "testStrategy": "1. Run `pnpm prisma validate` to verify schema syntax\n2. Run `pnpm prisma generate` to confirm client generation\n3. 
Create migration and verify it applies cleanly to local CockroachDB\n4. Write unit tests that create/read/delete each new model\n5. Verify cascade deletes work (deleting Server removes its NICs and Disks)", + "priority": "high", + "dependencies": [], + "status": "pending", + "subtasks": [] + }, + { + "id": 73, + "title": "Implement State Persistence Service in labd", + "description": "Create a new service in labd that persists bastion state syncs to the Server table in CockroachDB. When bastion-state-sync messages arrive, upsert machines into Server with their hardware info, status, and ownership.", + "details": "Create `bastion/src/labd/src/services/state-persistence.ts`:\n\n```typescript\nimport type { PrismaClient } from \"@prisma/client\";\nimport type { BastionState, HardwareInfo, InstallConfig, InstalledInfo } from \"@lab/shared\";\nimport { logger } from \"./logger.js\";\n\nexport class StatePersistence {\n constructor(private readonly db: PrismaClient) {}\n\n async syncBastionState(bastionId: string, state: BastionState): Promise<void> {\n // Process discovered machines\n for (const [mac, hw] of Object.entries(state.discovered)) {\n await this.upsertDiscoveredServer(bastionId, mac, hw);\n }\n \n // Process queued machines (update status to provisioning)\n for (const [mac, cfg] of Object.entries(state.install_queue)) {\n await this.upsertQueuedServer(bastionId, mac, cfg);\n }\n \n // Process installed machines\n for (const [mac, info] of Object.entries(state.installed)) {\n await this.upsertInstalledServer(bastionId, mac, info);\n }\n }\n\n private async upsertDiscoveredServer(bastionId: string, mac: string, hw: HardwareInfo): Promise<void> {\n const normalized = mac.toLowerCase();\n \n await this.db.server.upsert({\n where: { mac: normalized },\n create: {\n hostname: `unknown-${normalized.replace(/:/g, \"\").slice(-6)}`,\n mac: normalized,\n bastionId,\n status: \"discovered\",\n hardwareInfo: hw as any,\n labels: {\n arch: hw.arch,\n cpu_model: hw.cpu_model,\n 
cpu_cores: hw.cpu_cores,\n memory_gb: hw.memory_gb,\n },\n },\n update: {\n bastionId,\n status: \"discovered\", // FIXME: intended only when not already provisioning/installed, but this update is unconditional\n hardwareInfo: hw as any,\n },\n });\n \n // Sync NICs and Disks\n await this.syncServerHardware(normalized, hw);\n }\n \n private async syncServerHardware(mac: string, hw: HardwareInfo): Promise<void> {\n const server = await this.db.server.findUnique({ where: { mac } });\n if (!server) return;\n \n // Upsert NICs\n for (const nic of hw.nics) {\n await this.db.serverNic.upsert({\n where: { serverId_mac: { serverId: server.id, mac: nic.mac.toLowerCase() } },\n create: { serverId: server.id, mac: nic.mac.toLowerCase(), name: nic.name, state: nic.state },\n update: { name: nic.name, state: nic.state },\n });\n }\n \n // Upsert Disks\n for (const disk of hw.disks) {\n await this.db.serverDisk.upsert({\n where: { serverId_name: { serverId: server.id, name: disk.name } },\n create: { serverId: server.id, name: disk.name, sizeGb: disk.size_gb, model: disk.model },\n update: { sizeGb: disk.size_gb, model: disk.model },\n });\n }\n }\n \n // Similar methods for upsertQueuedServer and upsertInstalledServer...\n}\n```\n\nIntegrate into `server.ts` WebSocket handler by calling `statePersistence.syncBastionState()` when `bastion-state-sync` messages arrive.", + "testStrategy": "1. Unit test StatePersistence with mocked PrismaClient\n2. Integration test: simulate bastion-state-sync message, verify Server rows created\n3. Test idempotency: send same state twice, verify no duplicates\n4. Test status transitions: discovered -> provisioning -> installed\n5. Verify hardware info (NICs, Disks) is correctly persisted", + "priority": "high", + "dependencies": [ + 72 + ], + "status": "pending", + "subtasks": [] + }, + { + "id": 74, + "title": "Add State Loading from labd on Bastion Startup", + "description": "Modify bastion startup to request its persisted state from labd before using the local JSON cache. 
This ensures bastions restore their state after pod restarts.", + "details": "1. Add new labd API endpoint `GET /api/bastions/:id/state` that returns the aggregated state for a specific bastion from the Server table:\n\n```typescript\n// bastion/src/labd/src/routes/bastions.ts\napp.get<{ Params: { id: string } }>(\"/api/bastions/:id/state\", async (request, reply) => {\n const { id } = request.params;\n \n const servers = await db.server.findMany({\n where: { bastionId: id },\n include: { nics: true, disks: true },\n });\n \n // Transform back to BastionState format\n const state: BastionState = { discovered: {}, install_queue: {}, installed: {} };\n for (const server of servers) {\n const mac = server.mac;\n if (!mac) continue;\n \n switch (server.status) {\n case \"discovered\":\n state.discovered[mac] = transformToHardwareInfo(server);\n break;\n case \"provisioning\":\n state.install_queue[mac] = transformToInstallConfig(server);\n break;\n case \"installed\":\n state.installed[mac] = transformToInstalledInfo(server);\n break;\n }\n }\n \n return reply.send(state);\n});\n```\n\n2. Modify `BastionConnection.connect()` in `labd-connection.ts` to fetch state after enrollment:\n\n```typescript\nprivate async loadRemoteState(): Promise<BastionState | null> {\n if (!this.bastionId || !this.config.labdUrl) return null;\n try {\n const resp = await fetch(`${this.config.labdUrl}/api/bastions/${this.bastionId}/state`);\n if (resp.ok) return await resp.json();\n } catch { /* fall back to local */ }\n return null;\n}\n```\n\n3. In bastion `main.ts`, after establishing labd connection, merge remote state with local state (remote takes precedence for installed machines, local wins for in-progress installs).", + "testStrategy": "1. Integration test: start bastion, let it persist state, restart bastion, verify state restored\n2. Test merge logic: local has in-progress install, remote has discovered - verify install preserved\n3. 
Test offline mode: labd unavailable, bastion falls back to local JSON\n4. Test fresh start: no local state, no remote state - bastion starts with empty state", + "priority": "high", + "dependencies": [ + 73 + ], + "status": "pending", + "subtasks": [] + }, + { + "id": 75, + "title": "Fix Bastion --dir Environment Variable Default", + "description": "Fix the bug where CLI's --dir default overrides the BASTION_DIR environment variable. The CLI option should use the env var as its default.", + "details": "Edit `bastion/src/cli/src/commands/serve.ts`:\n\n```typescript\n// Before (line 14):\n.option(\"--dir <path>\", \"Bastion data directory\", \"/tmp/lab-bastion\")\n\n// After:\n.option(\n \"--dir <path>\",\n \"Bastion data directory\",\n process.env[\"BASTION_DIR\"] ?? \"/tmp/lab-bastion\"\n)\n```\n\nThis ensures:\n1. If `BASTION_DIR` env var is set (e.g., in k8s deployment), it's used as default\n2. Explicit `--dir` flag still overrides both\n3. Falls back to `/tmp/lab-bastion` if neither is set\n\nAlso update the k8s deployment manifest `bastion/deploy/k3s/deployment.yaml` to ensure `BASTION_DIR=/data` is properly set.", + "testStrategy": "1. Unit test: verify option default reads from process.env\n2. Integration test: set BASTION_DIR, run labctl without --dir, verify correct dir used\n3. Integration test: set BASTION_DIR, run labctl with --dir /custom, verify /custom used\n4. Test no env var: verify default /tmp/lab-bastion used", + "priority": "high", + "dependencies": [], + "status": "pending", + "subtasks": [] + }, + { + "id": 76, + "title": "Create Resource Type Registry with Aliases", + "description": "Create a centralized resource type registry that maps resource names, plurals, and short aliases to canonical types. 
This enables kubectl-style resource resolution.", + "details": "Create `bastion/src/cli/src/utils/resources.ts`:\n\n```typescript\nexport interface ResourceDefinition {\n kind: string; // Canonical type: \"Server\", \"Cluster\", etc.\n singular: string; // \"server\"\n plural: string; // \"servers\"\n aliases: string[]; // [\"srv\"]\n apiPath: string; // \"/api/servers\"\n columns: TableColumn[]; // Default columns for 'get' output\n wideColumns?: TableColumn[]; // Extra columns for -o wide\n}\n\nconst RESOURCE_DEFINITIONS: ResourceDefinition[] = [\n {\n kind: \"Server\",\n singular: \"server\",\n plural: \"servers\",\n aliases: [\"srv\"],\n apiPath: \"/api/servers\",\n columns: serverColumns,\n wideColumns: serverWideColumns,\n },\n {\n kind: \"Cluster\",\n singular: \"cluster\",\n plural: \"clusters\",\n aliases: [],\n apiPath: \"/api/clusters\",\n columns: clusterColumns,\n },\n {\n kind: \"Network\",\n singular: \"network\",\n plural: \"networks\",\n aliases: [\"net\"],\n apiPath: \"/api/networks\",\n columns: networkColumns,\n },\n // ... bastion, role, user, token, audit\n];\n\nconst aliasMap = new Map<string, ResourceDefinition>();\nfor (const def of RESOURCE_DEFINITIONS) {\n aliasMap.set(def.singular, def);\n aliasMap.set(def.plural, def);\n for (const alias of def.aliases) {\n aliasMap.set(alias, def);\n }\n}\n\nexport function resolveResourceType(input: string): ResourceDefinition {\n const normalized = input.toLowerCase();\n const def = aliasMap.get(normalized);\n if (!def) {\n const valid = RESOURCE_DEFINITIONS.map(d => d.plural).join(\", \");\n throw new Error(`Unknown resource type \"${input}\". Valid types: ${valid}`);\n }\n return def;\n}\n\nexport function resolveResourceIdentifier(input: string): {\n type: ResourceDefinition;\n name?: string;\n} {\n // Handle \"server/labmaster\" or just \"servers\"\n const parts = input.split(\"/\");\n const type = resolveResourceType(parts[0]);\n const name = parts.length > 1 ? 
parts.slice(1).join(\"/\") : undefined;\n return { type, name };\n}\n```\n\nUpdate `bastion/src/cli/src/utils/resource.ts` to use the new registry.", + "testStrategy": "1. Unit test resolveResourceType with all aliases: server, servers, srv -> Server\n2. Test unknown resource type throws descriptive error\n3. Test case insensitivity: SERVER, Server, server all resolve correctly\n4. Test resolveResourceIdentifier parses \"server/labmaster\" correctly", + "priority": "high", + "dependencies": [], + "status": "pending", + "subtasks": [] + }, + { + "id": 77, + "title": "Implement 'labctl get' Command", + "description": "Create the core 'labctl get [name]' command that lists resources with filtering and output format support. This is the foundation of the kubectl-style CLI.", + "details": "Create `bastion/src/cli/src/commands/get.ts`:\n\n```typescript\nimport { Command } from \"commander\";\nimport { resolveResourceType, type ResourceDefinition } from \"../utils/resources.js\";\nimport { getLabdClient } from \"../api/config.js\";\nimport { formatOutput, type TableColumn } from \"../utils/table.js\";\n\nexport function registerGetCommand(program: Command): void {\n program\n .command(\"get [name]\")\n .description(\"List resources or get a specific resource by name\")\n .option(\"--status \", \"Filter by status\")\n .option(\"--role \", \"Filter by role (servers only)\")\n .option(\"--cloud \", \"Filter by cloud\")\n .option(\"--env \", \"Filter by environment\")\n .option(\"-l, --label