diff --git a/docs/virtual-llms.md b/docs/virtual-llms.md index 008746b..0a5629c 100644 --- a/docs/virtual-llms.md +++ b/docs/virtual-llms.md @@ -97,11 +97,11 @@ route branches on it server-side. ## Lifecycle in detail -| State | What it means | -|----------------|-----------------------------------------------------------------------| -| `active` | Heartbeat received within the last 90 s and the SSE channel is open. | +| State | What it means | +|----------------|---------------------------------------------------------------------------------| +| `active` | Heartbeat received within the last 90 s and the SSE channel is open. | | `inactive` | Either the SSE closed or the heartbeat watchdog tripped. Inference returns 503. | -| `hibernating` | Reserved for v2 (wake-on-demand). v1 never writes this state. | +| `hibernating` | Publisher is online but the backend is asleep; the next inference triggers a `wake` task before relaying. | Two timers on mcpd run the GC sweep: @@ -132,10 +132,75 @@ a finalized `CompletionResult`, not a token stream. Streaming requests therefore arrive at the caller as a single delta + `[DONE]`. Real per-token streaming is a v2 concern. +## Wake-on-demand (v2) + +A provider whose backend hibernates (a vLLM instance that suspends +when idle, an Ollama daemon that exits when nothing's connected, …) +can declare a **wake recipe** in mcplocal config. When that provider's +`isAvailable()` returns false at registrar startup, the row is +published as `status=hibernating`. The next inference request that +hits the row triggers the recipe and waits for the backend to come up +before relaying. + +Two recipe types: + +```jsonc +// HTTP — POST to a "wake controller" that starts the backend out of band. +{ + "name": "vllm-local", + "type": "openai", + "model": "...", + "publish": true, + "wake": { + "type": "http", + "url": "http://10.0.0.50:9090/wake/vllm", + "method": "POST", + "headers": { "Authorization": "Bearer ..." 
}, + "maxWaitSeconds": 60 + } +} +``` + +```jsonc +// command — spawn a local process (systemd, wakeonlan, custom script). +{ + "name": "vllm-local", + "type": "openai", + "model": "...", + "publish": true, + "wake": { + "type": "command", + "command": "/usr/local/bin/start-vllm", + "args": ["--profile", "qwen3"], + "maxWaitSeconds": 120 + } +} +``` + +How a request flows when the row is `hibernating`: + +``` +client → mcpd POST /api/v1/llms/<name>/infer + mcpd: status === hibernating → push wake task on SSE + mcplocal: receive wake task → run recipe → poll isAvailable() + → heartbeat each tick → POST { ok: true } back + mcpd: flip row → active, push the original infer task + mcplocal: run inference → POST result back +mcpd → client (forwards the inference result) +``` + +Concurrent infers for the same hibernating Llm share a single wake +task — only the first request triggers the recipe; later ones await +the same in-flight wake promise. After the wake settles, every queued +infer dispatches in order. + +If the recipe fails (HTTP non-2xx, command exits non-zero, or the +provider doesn't come up within `maxWaitSeconds`), every queued infer +is rejected with a clear error and the row stays `hibernating` — +the next request gets a fresh wake attempt. + ## Roadmap (later stages) -- **v2 — Wake-on-demand**: Secret-stored "wake recipe" so mcpd can ask - mcplocal to start a hibernating backend before sending inference. - **v3 — Virtual agents**: mcplocal publishes its local agent configs (model + system prompt + sampling defaults) into mcpd's `Agent` table. 
- **v4 — LB pool by model**: agents can target a model name instead of diff --git a/src/mcplocal/tests/smoke/virtual-llm.smoke.test.ts b/src/mcplocal/tests/smoke/virtual-llm.smoke.test.ts index 1586a1a..c332b9b 100644 --- a/src/mcplocal/tests/smoke/virtual-llm.smoke.test.ts +++ b/src/mcplocal/tests/smoke/virtual-llm.smoke.test.ts @@ -207,3 +207,137 @@ describe('virtual-LLM smoke', () => { expect(res.body).toMatch(/publisher offline|inactive/); }, 30_000); }); + +// ── v2: hibernating + wake-on-demand ── + +const HIBERNATING_NAME = `smoke-virtual-hib-${SUFFIX}`; +let hibernatingRegistrar: VirtualLlmRegistrar | null = null; + +/** + * Provider that's "asleep" until \`wakeFn()\` is called. Used to drive + * the wake-on-demand smoke without standing up an actual sleep/wake + * controller — we flip the bool from inside the wake recipe. + */ +function makeSleepingProvider(name: string, content: string): { + provider: LlmProvider; + wakeFn: () => void; + wakeCount: () => number; +} { + let awake = false; + let count = 0; + const provider: LlmProvider = { + name, + async complete(): Promise<CompletionResult> { + if (!awake) throw new Error('provider not awake'); + return { + content, + toolCalls: [], + usage: { promptTokens: 1, completionTokens: 4, totalTokens: 5 }, + finishReason: 'stop', + }; + }, + async listModels() { return []; }, + async isAvailable() { return awake; }, + }; + return { + provider, + wakeFn: () => { awake = true; count += 1; }, + wakeCount: () => count, + }; +} + +describe('virtual-LLM smoke — wake-on-demand', () => { + let wakeServerUrl: string; + let wakeServer: http.Server; + let wakeFn: (() => void) | null = null; + + beforeAll(async () => { + if (!mcpdUp) return; + // Tiny in-process "wake controller" — receives the http wake recipe + // POST and flips the local provider's `awake` bool. 
+ await new Promise<void>((resolve) => { + wakeServer = http.createServer((req, res) => { + if (req.url === '/wake' && wakeFn !== null) { + wakeFn(); + res.writeHead(200); + res.end('woken'); + return; + } + res.writeHead(404); + res.end(); + }); + wakeServer.listen(0, '127.0.0.1', () => { + const addr = wakeServer.address(); + if (addr === null || typeof addr === 'string') throw new Error('listen failed'); + wakeServerUrl = `http://127.0.0.1:${String(addr.port)}/wake`; + resolve(); + }); + }); + }); + + afterAll(async () => { + if (hibernatingRegistrar !== null) hibernatingRegistrar.stop(); + if (wakeServer) await new Promise<void>((r) => wakeServer.close(() => r())); + if (mcpdUp) { + const list = await httpRequest('GET', `${MCPD_URL}/api/v1/llms`, undefined); + if (list.status === 200) { + const rows = JSON.parse(list.body) as Array<{ id: string; name: string }>; + const row = rows.find((r) => r.name === HIBERNATING_NAME); + if (row !== undefined) { + await httpRequest('DELETE', `${MCPD_URL}/api/v1/llms/${row.id}`, undefined); + } + } + } + }); + + it('publishes a sleeping provider as kind=virtual / status=hibernating', async () => { + if (!mcpdUp) return; + const token = readToken(); + if (token === null) return; + const sleeping = makeSleepingProvider(HIBERNATING_NAME, 'awake now'); + wakeFn = sleeping.wakeFn; + + const published: RegistrarPublishedProvider[] = [{ + provider: sleeping.provider, + type: 'openai', + model: 'fake-hibernating', + tier: 'fast', + wake: { type: 'http', url: wakeServerUrl, method: 'POST', maxWaitSeconds: 5 }, + }]; + hibernatingRegistrar = new VirtualLlmRegistrar({ + mcpdUrl: MCPD_URL, + token, + publishedProviders: published, + sessionFilePath: join(tempDir, 'hib-session'), + log: { info: () => {}, warn: () => {}, error: () => {} }, + heartbeatIntervalMs: 60_000, + }); + await hibernatingRegistrar.start(); + await new Promise((r) => setTimeout(r, 400)); + + const res = await httpRequest('GET', `${MCPD_URL}/api/v1/llms`, undefined); + 
expect(res.status).toBe(200); + const rows = JSON.parse(res.body) as Array<{ name: string; kind: string; status: string }>; + const row = rows.find((r) => r.name === HIBERNATING_NAME); + expect(row, `${HIBERNATING_NAME} must be present`).toBeDefined(); + expect(row!.kind).toBe('virtual'); + expect(row!.status).toBe('hibernating'); + }, 30_000); + + it('first inference triggers the wake recipe and then completes', async () => { + if (!mcpdUp) return; + // wakeFn was set in the previous test; it flips the provider's + // `awake` bool when the wake POST lands. + const res = await httpRequest('POST', `${MCPD_URL}/api/v1/llms/${HIBERNATING_NAME}/infer`, { + messages: [{ role: 'user', content: 'wake then say hello' }], + }); + expect(res.status).toBe(200); + const body = JSON.parse(res.body) as { choices?: Array<{ message?: { content?: string } }> }; + expect(body.choices?.[0]?.message?.content).toBe('awake now'); + + // After the wake, the row should now be active. + const list = await httpRequest('GET', `${MCPD_URL}/api/v1/llms`, undefined); + const rows = JSON.parse(list.body) as Array<{ name: string; status: string }>; + expect(rows.find((r) => r.name === HIBERNATING_NAME)?.status).toBe('active'); + }, 30_000); +});