From 866f6abc88a788ab6da836afd29b394da25c2c54 Mon Sep 17 00:00:00 2001 From: Michal Date: Mon, 27 Apr 2026 14:28:43 +0100 Subject: [PATCH] feat: virtual-LLM smoke test + docs (v1 Stage 6) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Final stage of v1. Smoke (mcplocal/tests/smoke/virtual-llm.smoke.test.ts): - Spins an in-process LlmProvider that returns canned content. - Runs the registrar against the live mcpd in fulldeploy. - Asserts: row appears with kind=virtual / status=active, infer through /api/v1/llms//infer comes back through the SSE relay with the provider's content + finish_reason, and a 503 appears immediately after registrar.stop() (publisher offline). - Times out / cleanup paths idempotent so re-runs against the same cluster don't litter rows. The 90-s heartbeat-stale flip and 4-h GC are unit-tested — too slow for smoke. Docs: - New docs/virtual-llms.md: when to use this vs creating a regular Llm row, how to opt-in via publish: true, the lifecycle table, the inference-relay sequence, the v1 streaming caveat, the v2-v5 roadmap, and the full /api/v1/llms/_provider-* surface. - agents.md cross-links virtual-llms.md alongside personalities/chat. - README's Agents section gains a "Virtual LLMs" subsection. Workspace suite: 2043/2043 (smoke files run separately). v1 closes. Stage roadmap (each its own future PR): v2 wake-on-demand · v3 virtual agents · v4 LB pool · v5 task queue Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 26 +++ docs/agents.md | 4 + docs/virtual-llms.md | 171 ++++++++++++++ .../tests/smoke/virtual-llm.smoke.test.ts | 209 ++++++++++++++++++ 4 files changed, 410 insertions(+) create mode 100644 docs/virtual-llms.md create mode 100644 src/mcplocal/tests/smoke/virtual-llm.smoke.test.ts diff --git a/README.md b/README.md index a6b1cb9..e12537e 100644 --- a/README.md +++ b/README.md @@ -571,6 +571,32 @@ For binding prompts to personalities and the API surface, see prompt editing — paste a session token (`mcpctl auth login`) or PAT to log in. +### Virtual LLMs + +A user's local LLM (`vllm-local`, Ollama, …) can publish itself into +mcpd's `Llm` registry so anyone authorized sees it under `mcpctl get llm` +and can chat with it via `mcpctl chat-llm `. Inference is relayed +through the publishing mcplocal's SSE control channel — mcpd never holds +the local URL or API key. + +```fish +# In ~/.mcpctl/config.json, opt the provider in with `publish: true`: +# { "name": "vllm-local", "type": "openai", "model": "...", "publish": true } +systemctl --user restart mcplocal + +mcpctl get llm +# NAME KIND STATUS TYPE MODEL TIER ID +# qwen3-thinking public active openai qwen3-thinking fast ... +# vllm-local virtual active openai Qwen/Qwen2.5-7B-Instruct-AWQ fast ... + +mcpctl chat-llm vllm-local +> hello? +``` + +Lifecycle: 30 s heartbeats, 90 s heartbeat-stale → inactive, 4 h +inactive → auto-deleted. A reconnecting mcplocal adopts the same row +via a sticky `providerSessionId`. Full design: [docs/virtual-llms.md](docs/virtual-llms.md). + ## Commands ```bash diff --git a/docs/agents.md b/docs/agents.md index e86df3d..b14688f 100644 --- a/docs/agents.md +++ b/docs/agents.md @@ -201,4 +201,8 @@ mcpctl chat reviewer - [personalities.md](./personalities.md) — named overlays of prompts on top of an agent. Same agent, different prompt bundles, picked per-turn via `--personality ` or `agent.defaultPersonality`. +- [virtual-llms.md](./virtual-llms.md) — local LLMs (e.g. 
`vllm-local`) + publishing themselves into `mcpctl get llm` so anyone can chat with + them via `mcpctl chat-llm `. Inference is relayed through the + publishing mcplocal — mcpd never holds the local URL or key. - [chat.md](./chat.md) — `mcpctl chat` flow and LiteLLM-style flags. diff --git a/docs/virtual-llms.md b/docs/virtual-llms.md new file mode 100644 index 0000000..008746b --- /dev/null +++ b/docs/virtual-llms.md @@ -0,0 +1,171 @@ +# Virtual LLMs + +A **virtual LLM** is an `Llm` row in mcpd that's *registered by an mcplocal +client* rather than created by hand with `mcpctl create llm`. Inference for +a virtual LLM is relayed back through the publishing mcplocal's SSE control +channel — **mcpd never needs to know the local URL or hold its API key**. + +When the publishing mcplocal goes away (or the user shuts down their +laptop) the row decays: `active → inactive` after 90 s without a +heartbeat, then deleted after 4 h of inactivity. A reconnecting mcplocal +adopts the same row using a sticky `providerSessionId` it persisted at +first publish. + +## When to use this + +- **Local model on a developer laptop** that you want everyone on the + team to be able to chat with via `mcpctl chat-llm `. The model + doesn't need to be reachable from mcpd's k8s pods — only the user's + mcplocal does (which is already the case because mcplocal pulls + projects from mcpd over HTTPS). +- **Hibernating models** that wake on demand (v2 — see "Roadmap"). +- **Pool of identical models** distributed across user laptops, eligible + for load balancing (v4). + +If your model is reachable from mcpd's k8s pods over LAN/VPN, you don't +need a virtual LLM — just `mcpctl create llm --type openai --url …` +and you're done. + +## Publishing a local provider + +mcplocal's local config (`~/.mcpctl/config.json`) gains a `publish: true` +opt-in per provider: + +```json +{ + "llm": { + "providers": [ + { + "name": "vllm-local", + "type": "openai", + "model": "Qwen/Qwen2.5-7B-Instruct-AWQ", + "url": "http://127.0.0.1:8000/v1", + "tier": "fast", + "publish": true + } + ] + } +} +``` + +Restart mcplocal: + +```fish +systemctl --user restart mcplocal +``` + +The registrar: +1. Reads `~/.mcpctl/credentials` for `mcpdUrl` + bearer token. +2. POSTs to `/api/v1/llms/_provider-register` with the publishable set. +3. Persists the returned `providerSessionId` to + `~/.mcpctl/provider-session` so the next restart adopts the same + mcpd row. +4. Opens the SSE channel at `/api/v1/llms/_provider-stream`. +5. Heartbeats every 30 s. +6. Listens for `event: task` frames and runs them against the local + `LlmProvider`. + +If `~/.mcpctl/credentials` doesn't exist (e.g. you haven't run +`mcpctl auth login`), the registrar logs a warning and skips — +publishing is a best-effort feature, not a boot blocker. + +## Verifying + +```fish +$ mcpctl get llm +NAME KIND STATUS TYPE MODEL TIER KEY ID +qwen3-thinking public active openai qwen3-thinking fast secret://litellm-key/API_KEY cmofx8y7u… +vllm-local virtual active openai Qwen/Qwen2.5-7B-Instruct-AWQ fast - cmoxz12ab… + +$ mcpctl chat-llm vllm-local +───────────────────────────────────────────────────────── +LLM: vllm-local openai → Qwen/Qwen2.5-7B-Instruct-AWQ +Kind: virtual Status: active +───────────────────────────────────────────────────────── +> hello? +Hi! … +``` + +You can also chat with public LLMs the same way: + +```fish +$ mcpctl chat-llm qwen3-thinking +``` + +The CLI doesn't care about `kind` — mcpd's `/api/v1/llms//infer` +route branches on it server-side. 
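+
+If you need to call the route programmatically, a minimal non-streaming
+sketch is below. It mirrors what the smoke test in this change does; the
+request/response shapes (`messages` in, OpenAI-style `choices` / `usage`
+out), the `~/.mcpctl/credentials` token file format, and the `MCPD_URL`
+placeholder are assumptions lifted from that test, not a separate client
+API.
+
+```ts
+// Sketch only: direct POST to the infer route, bearer token read from
+// ~/.mcpctl/credentials ({ "token": "..." } — assumed format, per the smoke test).
+import { readFileSync } from 'node:fs';
+import { join } from 'node:path';
+import { homedir } from 'node:os';
+
+const MCPD_URL = process.env.MCPD_URL ?? 'https://mcpd.example.com'; // placeholder
+const { token } = JSON.parse(
+  readFileSync(join(homedir(), '.mcpctl', 'credentials'), 'utf-8'),
+) as { token: string };
+
+const res = await fetch(`${MCPD_URL}/api/v1/llms/vllm-local/infer`, {
+  method: 'POST',
+  headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${token}` },
+  body: JSON.stringify({ messages: [{ role: 'user', content: 'hello?' }] }),
+});
+
+if (res.status === 503) {
+  // Publisher offline / row inactive — nothing for mcpd to relay the task to.
+  console.error(await res.text());
+} else {
+  const body = (await res.json()) as {
+    choices: Array<{ message: { content: string }; finish_reason: string }>;
+  };
+  console.log(body.choices[0].message.content);
+}
+```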
+ +## Lifecycle in detail + +| State | What it means | +|----------------|-----------------------------------------------------------------------| +| `active` | Heartbeat received within the last 90 s and the SSE channel is open. | +| `inactive` | Either the SSE closed or the heartbeat watchdog tripped. Inference returns 503. | +| `hibernating` | Reserved for v2 (wake-on-demand). v1 never writes this state. | + +Two timers on mcpd run the GC sweep: + +- **90 s** without a heartbeat → flip `active` → `inactive`. +- **4 h** in `inactive` → delete the row entirely. + +A reconnecting mcplocal with the same `providerSessionId` revives every +inactive row it owns; it only orphans rows that fell past the 4-h cutoff. + +## Inference relay + +When mcpd receives `POST /api/v1/llms//infer`: + +1. Look up the row, see `kind=virtual` + `status=active`. +2. Find the open SSE session for that `providerSessionId`. Missing + session → 503. +3. Push a `{ kind: "infer", taskId, llmName, request, streaming }` + task frame onto the SSE. +4. mcplocal pulls, calls `LlmProvider.complete(...)`, and POSTs the + result back to `/api/v1/llms/_provider-task//result`: + - non-streaming: `{ status: 200, body: }` + - streaming: per-chunk `{ chunk: { data, done? } }` + - failure: `{ error: "..." }` +5. mcpd forwards the result/chunks out to the original caller. + +**v1 caveat — streaming granularity**: `LlmProvider.complete()` returns +a finalized `CompletionResult`, not a token stream. Streaming requests +therefore arrive at the caller as a single delta + `[DONE]`. Real +per-token streaming is a v2 concern. + +## Roadmap (later stages) + +- **v2 — Wake-on-demand**: Secret-stored "wake recipe" so mcpd can ask + mcplocal to start a hibernating backend before sending inference. +- **v3 — Virtual agents**: mcplocal publishes its local agent configs + (model + system prompt + sampling defaults) into mcpd's `Agent` table. +- **v4 — LB pool by model**: agents can target a model name instead of + a specific Llm; mcpd picks the healthiest pool member per request. +- **v5 — Task queue**: persisted requests for hibernating/saturated + pools. Workers pull tasks of their model when they come online. + +## API surface (v1) + +``` +POST /api/v1/llms/_provider-register → returns { providerSessionId, llms[] } +GET /api/v1/llms/_provider-stream → SSE channel; require x-mcpctl-provider-session header +POST /api/v1/llms/_provider-heartbeat → { providerSessionId } +POST /api/v1/llms/_provider-task/:id/result + → one of: + { error: "msg" } + { chunk: { data, done? } } + { status, body } + +GET /api/v1/llms → list (now includes kind, status, lastHeartbeatAt, inactiveSince) +POST /api/v1/llms//infer → routes through the SSE relay +DELETE /api/v1/llms/ → delete unconditionally (also runs GC's job) +``` + +RBAC piggybacks on `view/edit/create:llms` — no new resource. Publishing +a virtual LLM is morally a `create:llms` operation. + +## See also + +- [agents.md](./agents.md) — what an Agent is and how it pins to an LLM. +- [chat.md](./chat.md) — `mcpctl chat ` (full agent flow). +- The CLI: `mcpctl chat-llm ` (this doc) is the stateless + counterpart for raw LLM chat. diff --git a/src/mcplocal/tests/smoke/virtual-llm.smoke.test.ts b/src/mcplocal/tests/smoke/virtual-llm.smoke.test.ts new file mode 100644 index 0000000..1586a1a --- /dev/null +++ b/src/mcplocal/tests/smoke/virtual-llm.smoke.test.ts @@ -0,0 +1,209 @@ +/** + * Smoke tests: virtual-LLM register → infer relay → cleanup against a live + * mcpd. 
Uses an in-process LlmProvider (returns canned content) so we + * exercise the SSE control plane and the kind=virtual infer branch + * without depending on a real upstream model. + * + * The 90-s heartbeat-stale flip and 4-h auto-deletion are covered by unit + * tests (mcpd virtual-llm-service.test.ts); waiting > 90 s in smoke would + * triple the suite duration. + */ +import { describe, it, expect, beforeAll, afterAll } from 'vitest'; +import http from 'node:http'; +import https from 'node:https'; +import { mkdtempSync, rmSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { + VirtualLlmRegistrar, + type RegistrarPublishedProvider, +} from '../../src/providers/registrar.js'; +import type { LlmProvider, CompletionResult } from '../../src/providers/types.js'; + +const MCPD_URL = process.env.MCPD_URL ?? 'https://mcpctl.ad.itaz.eu'; +const SUFFIX = Date.now().toString(36); +const PROVIDER_NAME = `smoke-virtual-${SUFFIX}`; + +function makeFakeProvider(name: string, content: string): LlmProvider { + return { + name, + async complete(): Promise { + return { + content, + toolCalls: [], + usage: { promptTokens: 1, completionTokens: 4, totalTokens: 5 }, + finishReason: 'stop', + }; + }, + async listModels() { return []; }, + async isAvailable() { return true; }, + }; +} + +function healthz(url: string, timeoutMs = 5000): Promise { + return new Promise((resolve) => { + const parsed = new URL(`${url.replace(/\/$/, '')}/healthz`); + const driver = parsed.protocol === 'https:' ? https : http; + const req = driver.get( + { + hostname: parsed.hostname, + port: parsed.port || (parsed.protocol === 'https:' ? 443 : 80), + path: parsed.pathname, + timeout: timeoutMs, + }, + (res) => { resolve((res.statusCode ?? 500) < 500); res.resume(); }, + ); + req.on('error', () => resolve(false)); + req.on('timeout', () => { req.destroy(); resolve(false); }); + }); +} + +function readToken(): string | null { + try { + const home = process.env.HOME ?? ''; + const path = `${home}/.mcpctl/credentials`; + // eslint-disable-next-line @typescript-eslint/no-require-imports + const fs = require('node:fs') as typeof import('node:fs'); + if (!fs.existsSync(path)) return null; + const raw = fs.readFileSync(path, 'utf-8'); + const parsed = JSON.parse(raw) as { token?: string }; + return parsed.token ?? null; + } catch { + return null; + } +} + +interface HttpResponse { status: number; body: string } + +function httpRequest(method: string, urlStr: string, body: unknown): Promise { + return new Promise((resolve, reject) => { + const tokenRaw = readToken(); + const parsed = new URL(urlStr); + const driver = parsed.protocol === 'https:' ? https : http; + const headers: Record = { + Accept: 'application/json', + ...(body !== undefined ? { 'Content-Type': 'application/json' } : {}), + ...(tokenRaw !== null ? { Authorization: `Bearer ${tokenRaw}` } : {}), + }; + const req = driver.request({ + hostname: parsed.hostname, + port: parsed.port || (parsed.protocol === 'https:' ? 443 : 80), + path: parsed.pathname + parsed.search, + method, + headers, + timeout: 30_000, + }, (res) => { + const chunks: Buffer[] = []; + res.on('data', (c: Buffer) => chunks.push(c)); + res.on('end', () => { + resolve({ status: res.statusCode ?? 
0, body: Buffer.concat(chunks).toString('utf-8') }); + }); + }); + req.on('error', reject); + req.on('timeout', () => { req.destroy(); reject(new Error(`httpRequest timeout: ${method} ${urlStr}`)); }); + if (body !== undefined) req.write(JSON.stringify(body)); + req.end(); + }); +} + +let mcpdUp = false; +let registrar: VirtualLlmRegistrar | null = null; +let tempDir: string; + +describe('virtual-LLM smoke', () => { + beforeAll(async () => { + mcpdUp = await healthz(MCPD_URL); + if (!mcpdUp) { + // eslint-disable-next-line no-console + console.warn(`\n ○ virtual-llm smoke: skipped — ${MCPD_URL}/healthz unreachable.\n`); + return; + } + if (readToken() === null) { + mcpdUp = false; + // eslint-disable-next-line no-console + console.warn('\n ○ virtual-llm smoke: skipped — no ~/.mcpctl/credentials.\n'); + return; + } + tempDir = mkdtempSync(join(tmpdir(), 'mcpctl-virtual-llm-smoke-')); + }, 20_000); + + afterAll(async () => { + if (registrar !== null) registrar.stop(); + if (tempDir !== undefined) rmSync(tempDir, { recursive: true, force: true }); + // Best-effort cleanup of the row in case the disconnect didn't finish + // before mcpd's heartbeat watchdog ticks. Idempotent. + if (mcpdUp) { + const list = await httpRequest('GET', `${MCPD_URL}/api/v1/llms`, undefined); + if (list.status === 200) { + const rows = JSON.parse(list.body) as Array<{ id: string; name: string }>; + const row = rows.find((r) => r.name === PROVIDER_NAME); + if (row !== undefined) { + await httpRequest('DELETE', `${MCPD_URL}/api/v1/llms/${row.id}`, undefined); + } + } + } + }); + + it('registrar publishes the provider and mcpd lists it as kind=virtual / status=active', async () => { + if (!mcpdUp) return; + const token = readToken(); + if (token === null) return; + const published: RegistrarPublishedProvider[] = [ + { provider: makeFakeProvider(PROVIDER_NAME, 'hi from smoke'), type: 'openai', model: 'fake-smoke', tier: 'fast' }, + ]; + registrar = new VirtualLlmRegistrar({ + mcpdUrl: MCPD_URL, + token, + publishedProviders: published, + sessionFilePath: join(tempDir, 'session'), + log: { info: () => {}, warn: () => {}, error: () => {} }, + heartbeatIntervalMs: 60_000, + }); + await registrar.start(); + expect(registrar.getSessionId()).not.toBeNull(); + // Give the SSE handshake + register a moment to settle on mcpd's side. 
+ await new Promise((r) => setTimeout(r, 400)); + + const res = await httpRequest('GET', `${MCPD_URL}/api/v1/llms`, undefined); + expect(res.status).toBe(200); + const rows = JSON.parse(res.body) as Array<{ name: string; kind: string; status: string; type: string; model: string }>; + const row = rows.find((r) => r.name === PROVIDER_NAME); + expect(row, `${PROVIDER_NAME} must be present`).toBeDefined(); + expect(row!.kind).toBe('virtual'); + expect(row!.status).toBe('active'); + expect(row!.type).toBe('openai'); + expect(row!.model).toBe('fake-smoke'); + }, 30_000); + + it('mcpd routes /api/v1/llms//infer back through the SSE relay to the fake provider', async () => { + if (!mcpdUp) return; + const res = await httpRequest('POST', `${MCPD_URL}/api/v1/llms/${PROVIDER_NAME}/infer`, { + messages: [{ role: 'user', content: 'say something' }], + }); + expect(res.status).toBe(200); + const body = JSON.parse(res.body) as { + choices?: Array<{ message?: { content?: string }; finish_reason?: string }>; + usage?: { total_tokens?: number }; + }; + expect(body.choices?.[0]?.message?.content).toBe('hi from smoke'); + expect(body.choices?.[0]?.finish_reason).toBe('stop'); + expect(body.usage?.total_tokens).toBe(5); + }, 30_000); + + it('returns 503 with a clear error when the publisher disconnects mid-session', async () => { + if (!mcpdUp) return; + if (registrar !== null) { + registrar.stop(); + registrar = null; + } + // Immediately after stop(), the SSE socket closes and mcpd's + // unbindSession flips the row to inactive. Inference should 503. + await new Promise((r) => setTimeout(r, 300)); + + const res = await httpRequest('POST', `${MCPD_URL}/api/v1/llms/${PROVIDER_NAME}/infer`, { + messages: [{ role: 'user', content: 'still there?' }], + }); + expect(res.status).toBe(503); + expect(res.body).toMatch(/publisher offline|inactive/); + }, 30_000); +});