feat(mcpd): wake-before-infer for hibernating virtual LLMs (v2 Stage 2)
Second half of v2. mcpd now dispatches a `wake` task on the SSE
control channel when an inference request hits a row whose
status=hibernating, waits for the publisher to confirm readiness,
then proceeds with the infer task. Concurrent infers for the same
hibernating Llm share a single wake task — `wakeInFlight` map
dedupes by Llm name.
State machine in enqueueInferTask:
active → push infer task immediately (existing path).
inactive → 503, publisher offline (existing path).
hibernating → ensureAwake() → push infer task (new in v2).
ensureAwake/runWake (private):
- Allocates a fresh taskId on the existing PendingTask plumbing.
- Pushes `{ kind: "wake", taskId, llmName }` on the SSE handle.
- Awaits the publisher's result POST. On 2xx, flips the row to
active + bumps lastHeartbeatAt, so all queued + future infers
hit the active path. On non-2xx or service.failTask, the row
stays hibernating (next request retries).
Tests: 4 new in virtual-llm-service.test.ts cover happy path
(wake → infer in order), concurrent dedup (3 parallel infers, 1
wake task), wake failure surfaces to all queued infers and leaves
the row hibernating, inactive ≠ hibernating (still rejects with 503,
no wake attempt). 22/22 service tests, 2050/2050 workspace.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -112,6 +112,12 @@ export interface PendingTaskRef {
|
||||
export class VirtualLlmService implements IVirtualLlmService {
|
||||
private readonly sessions = new Map<string, VirtualSessionHandle>();
|
||||
private readonly tasksById = new Map<string, PendingTask>();
|
||||
/**
|
||||
* Dedupe concurrent wake requests for the same Llm. The first request
|
||||
* starts the wake; later requests for the same name await the same
|
||||
* promise. Cleared as soon as the wake settles (success or failure).
|
||||
*/
|
||||
private readonly wakeInFlight = new Map<string, Promise<void>>();
|
||||
|
||||
constructor(private readonly repo: ILlmRepository) {}
|
||||
|
||||
@@ -230,9 +236,9 @@ export class VirtualLlmService implements IVirtualLlmService {
|
||||
{ statusCode: 500 },
|
||||
);
|
||||
}
|
||||
if (llm.status !== 'active') {
|
||||
if (llm.status === 'inactive') {
|
||||
throw Object.assign(
|
||||
new Error(`Virtual Llm '${llmName}' is ${llm.status}; publisher offline`),
|
||||
new Error(`Virtual Llm '${llmName}' is inactive; publisher offline`),
|
||||
{ statusCode: 503 },
|
||||
);
|
||||
}
|
||||
@@ -244,6 +250,16 @@ export class VirtualLlmService implements IVirtualLlmService {
|
||||
);
|
||||
}
|
||||
|
||||
// ── Wake-on-demand (v2) ──
|
||||
// Status=hibernating means the publisher told us at register time
|
||||
// (or via a later status update) that the backend is asleep. Fire a
|
||||
// wake task and wait for the publisher to confirm readiness before
|
||||
// dispatching the actual inference. Concurrent infers for the same
|
||||
// Llm share a single wake promise.
|
||||
if (llm.status === 'hibernating') {
|
||||
await this.ensureAwake(llm.id, llm.name, llm.providerSessionId, handle);
|
||||
}
|
||||
|
||||
const taskId = randomUUID();
|
||||
const chunkSubscribers = new Set<(chunk: { data: string; done?: boolean }) => void>();
|
||||
|
||||
@@ -285,6 +301,77 @@ export class VirtualLlmService implements IVirtualLlmService {
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Drive the publisher to wake the backend. Concurrent callers for the
|
||||
* same Llm name share the in-flight promise — we only ever ask the
|
||||
* publisher once. Throws on timeout or recipe failure; on success the
|
||||
* row is flipped to active and subsequent infer calls proceed.
|
||||
*/
|
||||
private async ensureAwake(
|
||||
llmId: string,
|
||||
llmName: string,
|
||||
sessionId: string,
|
||||
handle: VirtualSessionHandle,
|
||||
): Promise<void> {
|
||||
const existing = this.wakeInFlight.get(llmName);
|
||||
if (existing !== undefined) {
|
||||
await existing;
|
||||
return;
|
||||
}
|
||||
const promise = this.runWake(llmId, llmName, sessionId, handle);
|
||||
this.wakeInFlight.set(llmName, promise);
|
||||
try {
|
||||
await promise;
|
||||
} finally {
|
||||
this.wakeInFlight.delete(llmName);
|
||||
}
|
||||
}
|
||||
|
||||
private async runWake(
|
||||
llmId: string,
|
||||
llmName: string,
|
||||
sessionId: string,
|
||||
handle: VirtualSessionHandle,
|
||||
): Promise<void> {
|
||||
const taskId = randomUUID();
|
||||
let resolveDone!: () => void;
|
||||
let rejectDone!: (err: Error) => void;
|
||||
const done = new Promise<void>((resolve, reject) => {
|
||||
resolveDone = resolve;
|
||||
rejectDone = reject;
|
||||
});
|
||||
|
||||
const pending: PendingTask = {
|
||||
taskId,
|
||||
sessionId,
|
||||
llmName,
|
||||
streaming: false,
|
||||
// Wake tasks return { ok: true } on success or never resolve at
|
||||
// all if the publisher dies; the rejectNonStreaming path covers
|
||||
// the disconnect-mid-wake case via unbindSession.
|
||||
resolveNonStreaming: (_body, status) => {
|
||||
if (status >= 200 && status < 300) resolveDone();
|
||||
else rejectDone(new Error(`wake task returned status ${String(status)}`));
|
||||
},
|
||||
rejectNonStreaming: rejectDone,
|
||||
pushChunk: null,
|
||||
};
|
||||
this.tasksById.set(taskId, pending);
|
||||
|
||||
handle.pushTask({ kind: 'wake', taskId, llmName });
|
||||
|
||||
await done;
|
||||
|
||||
// Flip the row to active so subsequent infer calls go through the
|
||||
// normal active path. The publisher's own heartbeat will keep the
|
||||
// row alive from this point.
|
||||
await this.repo.update(llmId, {
|
||||
status: 'active',
|
||||
lastHeartbeatAt: new Date(),
|
||||
inactiveSince: null,
|
||||
});
|
||||
}
|
||||
|
||||
completeTask(taskId: string, result: { status: number; body: unknown }): boolean {
|
||||
const t = this.tasksById.get(taskId);
|
||||
if (t === undefined) return false;
|
||||
|
||||
Reference in New Issue
Block a user