fix: MCP proxy resilience — discovery cache, default liveness probes
Some checks failed
CI/CD / lint (push) Successful in 52s
CI/CD / typecheck (push) Successful in 1m51s
CI/CD / test (push) Successful in 1m1s
CI/CD / smoke (push) Failing after 3m21s
CI/CD / build (push) Successful in 4m9s
CI/CD / publish (push) Has been skipped

Adds a per-server tools/list cache in McpRouter (positive + negative TTL)
so a slow or dead upstream only stalls the first discovery call, not every
subsequent client request. Invalidated on upstream add/remove.

Health probes now apply a default liveness spec (tools/list via the real
production path) to any RUNNING instance without an explicit healthCheck,
so synthetic and real failures converge on the same signal.

Includes supporting updates in mcpd-client, discovery, upstream/mcpd,
seeder, and fulldeploy/release scripts.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Michal
2026-04-17 00:48:57 +01:00
parent c968d76e00
commit 3149ea3ae7
15 changed files with 499 additions and 32 deletions

View File

@@ -1,4 +1,5 @@
import type { McpdClient } from './http/mcpd-client.js';
import { DISCOVERY_TIMEOUT_MS } from './http/mcpd-client.js';
import type { McpRouter } from './router.js';
import { McpdUpstream } from './upstream/mcpd.js';
@@ -96,6 +97,10 @@ export async function fetchProjectLlmConfig(
function syncUpstreams(router: McpRouter, mcpdClient: McpdClient, servers: McpdServer[]): string[] {
const registered: string[] = [];
// Discovery-class calls (`*\/list`) go through a short-timeout client so a single
// unreachable upstream cannot stall session init for the full tool-call window.
const discoveryClient = mcpdClient.withTimeout(DISCOVERY_TIMEOUT_MS);
// Remove stale upstreams
const currentNames = new Set(router.getUpstreamNames());
const serverNames = new Set(servers.map((s) => s.name));
@@ -108,7 +113,7 @@ function syncUpstreams(router: McpRouter, mcpdClient: McpdClient, servers: McpdS
// Add/update upstreams for each server
for (const server of servers) {
if (!currentNames.has(server.name)) {
const upstream = new McpdUpstream(server.id, server.name, mcpdClient, server.description);
const upstream = new McpdUpstream(server.id, server.name, mcpdClient, server.description, discoveryClient);
router.addUpstream(upstream);
}
registered.push(server.name);

View File

@@ -21,7 +21,14 @@ export class ConnectionError extends Error {
}
/** Default timeout for mcpd requests (ms). Prevents indefinite hangs on slow upstream tool calls. */
const DEFAULT_TIMEOUT_MS = 30_000;
export const DEFAULT_TIMEOUT_MS = 30_000;
/**
* Discovery-class operations (tools/list, resources/list, prompts/list) should not share
* the full tool-call timeout budget — a single dead upstream would stall session init for
* the entire window. Override via `MCPLOCAL_DISCOVERY_TIMEOUT_MS`.
*/
export const DISCOVERY_TIMEOUT_MS = Number(process.env['MCPLOCAL_DISCOVERY_TIMEOUT_MS']) || 8_000;
export class McpdClient {
private readonly baseUrl: string;
@@ -45,6 +52,14 @@ export class McpdClient {
return new McpdClient(this.baseUrl, this.token, { ...this.extraHeaders, ...headers }, this.timeoutMs);
}
/**
* Create a new client with a different per-request timeout. Used by mcplocal's
* discovery path to avoid sharing the slow tool-call budget.
*/
withTimeout(timeoutMs: number): McpdClient {
return new McpdClient(this.baseUrl, this.token, { ...this.extraHeaders }, timeoutMs);
}
async get<T>(path: string): Promise<T> {
return this.request<T>('GET', path);
}

View File

@@ -18,6 +18,10 @@ export interface RouteContext {
correlationId?: string;
}
type ListCacheEntry =
| { kind: 'ok'; result: unknown; fetchedAt: number }
| { kind: 'err'; message: string; fetchedAt: number };
/**
* Routes MCP requests to the appropriate upstream server.
*
@@ -64,6 +68,13 @@ export class McpRouter {
private plugin: ProxyModelPlugin | null = null;
private pluginContexts = new Map<string, PluginContextImpl>();
// Per-server discovery cache. Keyed `${serverName}:${method}`. Prevents every client
// `tools/list` from re-hitting slow/dead upstreams and absorbs negative results so one
// dead server only stalls the first POST, not every subsequent one.
private listCache = new Map<string, ListCacheEntry>();
private readonly LIST_CACHE_POSITIVE_TTL_MS = 30_000;
private readonly LIST_CACHE_NEGATIVE_TTL_MS = 30_000;
/** Optional callback for traffic inspection — called after each upstream call completes. */
onUpstreamCall: ((info: { upstream: string; method?: string; request: unknown; response: unknown; durationMs: number; correlationId?: string }) => void) | null = null;
@@ -202,6 +213,7 @@ export class McpRouter {
addUpstream(connection: UpstreamConnection): void {
this.upstreams.set(connection.name, connection);
this.invalidateListCache(connection.name);
if (this.notificationHandler && connection.onNotification) {
const serverName = connection.name;
const handler = this.notificationHandler;
@@ -219,6 +231,7 @@ export class McpRouter {
removeUpstream(name: string): void {
this.upstreams.delete(name);
this.invalidateListCache(name);
for (const map of [this.toolToServer, this.resourceToServer, this.promptToServer]) {
for (const [key, server] of map) {
if (server === name) {
@@ -228,6 +241,26 @@ export class McpRouter {
}
}
/** Evict every cached discovery result belonging to `serverName` (invoked on upstream add / remove). */
private invalidateListCache(serverName: string): void {
  const keyPrefix = `${serverName}:`;
  // Snapshot matching keys first, then delete — avoids mutating the Map mid-iteration.
  const stale = [...this.listCache.keys()].filter((key) => key.startsWith(keyPrefix));
  for (const key of stale) {
    this.listCache.delete(key);
  }
}
/**
 * Look up the cached discovery result for `${serverName}:${method}`.
 * Returns null when nothing is cached or the entry has aged past its TTL;
 * positive and negative entries age out on separate budgets.
 */
private getListCacheEntry(serverName: string, method: string): ListCacheEntry | null {
  const cached = this.listCache.get(`${serverName}:${method}`);
  if (cached === undefined) {
    return null;
  }
  const budget = cached.kind === 'ok'
    ? this.LIST_CACHE_POSITIVE_TTL_MS
    : this.LIST_CACHE_NEGATIVE_TTL_MS;
  const age = Date.now() - cached.fetchedAt;
  return age >= budget ? null : cached;
}
/** Record a discovery result (positive or negative) under the `${serverName}:${method}` cache key. */
private setListCacheEntry(serverName: string, method: string, entry: ListCacheEntry): void {
  const key = `${serverName}:${method}`;
  this.listCache.set(key, entry);
}
setNotificationHandler(handler: (notification: JsonRpcNotification) => void): void {
this.notificationHandler = handler;
// Wire to all existing upstreams
@@ -248,14 +281,24 @@ export class McpRouter {
/**
* Discover tools from all upstreams by calling tools/list on each.
* Per-server results are cached (positive + negative) to absorb slow upstreams
* and prevent repeated 30s timeouts on every client `tools/list`.
*/
async discoverTools(correlationId?: string): Promise<Array<{ name: string; description?: string; inputSchema?: unknown }>> {
const allTools: Array<{ name: string; description?: string; inputSchema?: unknown }> = [];
const started = Date.now();
let cachedCount = 0;
let freshCount = 0;
const failed: string[] = [];
// Discover tools from all servers in parallel so one slow server doesn't block the rest
const entries = [...this.upstreams.entries()];
const results = await Promise.allSettled(
entries.map(async ([serverName, upstream]) => {
const cached = this.getListCacheEntry(serverName, 'tools/list');
if (cached) {
return { serverName, upstream, cached };
}
const req = {
jsonrpc: '2.0' as const,
id: `discover-tools-${serverName}`,
@@ -279,11 +322,34 @@ export class McpRouter {
console.warn(`[discoverTools] ${(result.reason as Error).message ?? 'unknown error'}`);
continue;
}
const { serverName, upstream, response } = result.value;
const { serverName, upstream } = result.value;
if (response.error) {
console.warn(`[discoverTools] ${serverName}: ${(response.error as { message?: string }).message ?? 'unknown error'}`);
} else if (response.result && typeof response.result === 'object' && 'tools' in response.result) {
let response: JsonRpcResponse | null = null;
if ('cached' in result.value) {
const cached = result.value.cached;
if (cached.kind === 'err') {
cachedCount++;
failed.push(serverName);
continue;
}
response = { jsonrpc: '2.0', id: `cached-${serverName}`, result: cached.result };
cachedCount++;
} else {
response = result.value.response;
freshCount++;
if (response.error) {
const message = (response.error as { message?: string }).message ?? 'unknown error';
this.setListCacheEntry(serverName, 'tools/list', { kind: 'err', message, fetchedAt: Date.now() });
console.warn(`[discoverTools] ${serverName}: ${message}`);
failed.push(serverName);
continue;
}
if (response.result !== undefined) {
this.setListCacheEntry(serverName, 'tools/list', { kind: 'ok', result: response.result, fetchedAt: Date.now() });
}
}
if (response.result && typeof response.result === 'object' && 'tools' in response.result) {
const tools = (response.result as { tools: Array<{ name: string; description?: string; inputSchema?: unknown }> }).tools;
for (const tool of tools) {
const namespacedName = `${serverName}/${tool.name}`;
@@ -304,11 +370,19 @@ export class McpRouter {
}
}
if (entries.length > 0) {
const elapsed = Date.now() - started;
const project = this.projectName ? ` project=${this.projectName}` : '';
const failedStr = failed.length > 0 ? ` failed=[${failed.join(',')}]` : '';
console.info(`[discoverTools]${project} fresh=${freshCount} cached=${cachedCount}${failedStr} elapsed=${elapsed}ms`);
}
return allTools;
}
/**
* Discover resources from all upstreams by calling resources/list on each.
* Shares the per-server list cache with `discoverTools`.
*/
async discoverResources(correlationId?: string): Promise<Array<{ uri: string; name?: string; description?: string; mimeType?: string }>> {
const allResources: Array<{ uri: string; name?: string; description?: string; mimeType?: string }> = [];
@@ -317,6 +391,8 @@ export class McpRouter {
const entries = [...this.upstreams.entries()];
const results = await Promise.allSettled(
entries.map(async ([serverName, upstream]) => {
const cached = this.getListCacheEntry(serverName, 'resources/list');
if (cached) return { serverName, cached };
const req = {
jsonrpc: '2.0' as const,
id: `discover-resources-${serverName}`,
@@ -337,7 +413,24 @@ export class McpRouter {
for (const result of results) {
if (result.status === 'rejected') continue;
const { serverName, response } = result.value;
const { serverName } = result.value;
let response: JsonRpcResponse | null = null;
if ('cached' in result.value) {
const cached = result.value.cached;
if (cached.kind === 'err') continue;
response = { jsonrpc: '2.0', id: `cached-${serverName}`, result: cached.result };
} else {
response = result.value.response;
if (response.error) {
const message = (response.error as { message?: string }).message ?? 'unknown error';
this.setListCacheEntry(serverName, 'resources/list', { kind: 'err', message, fetchedAt: Date.now() });
continue;
}
if (response.result !== undefined) {
this.setListCacheEntry(serverName, 'resources/list', { kind: 'ok', result: response.result, fetchedAt: Date.now() });
}
}
if (response.result && typeof response.result === 'object' && 'resources' in response.result) {
const resources = (response.result as { resources: Array<{ uri: string; name?: string; description?: string; mimeType?: string }> }).resources;

View File

@@ -12,6 +12,9 @@ interface McpdProxyResponse {
error?: { code: number; message: string; data?: unknown };
}
/** Discovery-class methods routed through the short-timeout client when one is provided. */
const LIST_METHOD_SUFFIX = '/list';
/**
* An upstream that routes MCP requests through mcpd's /api/v1/mcp/proxy endpoint.
* mcpd holds the credentials and manages the actual MCP server connections.
@@ -26,6 +29,8 @@ export class McpdUpstream implements UpstreamConnection {
serverName: string,
private mcpdClient: McpdClient,
serverDescription?: string,
/** Short-timeout client used for `*\/list` methods; falls back to mcpdClient when absent. */
private discoveryClient?: McpdClient,
) {
this.name = serverName;
if (serverDescription !== undefined) this.description = serverDescription;
@@ -46,8 +51,12 @@ export class McpdUpstream implements UpstreamConnection {
params: request.params,
};
const client = request.method.endsWith(LIST_METHOD_SUFFIX) && this.discoveryClient
? this.discoveryClient
: this.mcpdClient;
try {
const result = await this.mcpdClient.post<McpdProxyResponse>('/api/v1/mcp/proxy', proxyRequest);
const result = await client.post<McpdProxyResponse>('/api/v1/mcp/proxy', proxyRequest);
if (result.error) {
return { jsonrpc: '2.0', id: request.id, error: result.error };
}

View File

@@ -3,7 +3,7 @@ import { refreshUpstreams } from '../src/discovery.js';
import { McpRouter } from '../src/router.js';
function mockMcpdClient(servers: Array<{ id: string; name: string; transport: string }>) {
return {
const client = {
baseUrl: 'http://test:3100',
token: 'test-token',
get: vi.fn(async () => servers),
@@ -11,7 +11,10 @@ function mockMcpdClient(servers: Array<{ id: string; name: string; transport: st
put: vi.fn(),
delete: vi.fn(),
forward: vi.fn(),
withTimeout: vi.fn(() => client),
withHeaders: vi.fn(() => client),
};
return client;
}
describe('refreshUpstreams', () => {

View File

@@ -107,4 +107,38 @@ describe('McpdUpstream', () => {
const response = await upstream.send(request);
expect(response.error).toEqual({ code: -32601, message: 'Tool not found' });
});
// All three `*/list` discovery methods must go out on the short-timeout discovery
// client when one is supplied, leaving the main (tool-call-budget) client untouched.
it('routes */list methods through discoveryClient when provided', async () => {
const mainClient = mockMcpdClient();
// Keyed responses: the mock resolves `${serverId}:${method}` — presumably; confirm against mockMcpdClient.
const discoveryClient = mockMcpdClient(new Map([
['srv-1:tools/list', { result: { tools: [] } }],
['srv-1:resources/list', { result: { resources: [] } }],
['srv-1:prompts/list', { result: { prompts: [] } }],
]));
const upstream = new McpdUpstream('srv-1', 'slack', mainClient as any, undefined, discoveryClient as any);
await upstream.send({ jsonrpc: '2.0', id: '1', method: 'tools/list' });
await upstream.send({ jsonrpc: '2.0', id: '2', method: 'resources/list' });
await upstream.send({ jsonrpc: '2.0', id: '3', method: 'prompts/list' });
// One proxy POST per list call on the discovery client; main client never used.
expect(discoveryClient.post).toHaveBeenCalledTimes(3);
expect(mainClient.post).not.toHaveBeenCalled();
});
// The discovery client is for `*/list` only — a tool invocation must keep the
// full tool-call timeout budget of the main client even when both clients exist.
it('routes tools/call through mainClient even when discoveryClient is set', async () => {
const mainClient = mockMcpdClient(new Map([
['srv-1:tools/call', { result: { ok: true } }],
]));
const discoveryClient = mockMcpdClient();
const upstream = new McpdUpstream('srv-1', 'slack', mainClient as any, undefined, discoveryClient as any);
await upstream.send({
jsonrpc: '2.0', id: '1', method: 'tools/call',
params: { name: 'noop', arguments: {} },
});
// Exactly one proxy POST, and it went to the main client.
expect(mainClient.post).toHaveBeenCalledTimes(1);
expect(discoveryClient.post).not.toHaveBeenCalled();
});
});

View File

@@ -3,7 +3,7 @@ import { refreshProjectUpstreams } from '../src/discovery.js';
import { McpRouter } from '../src/router.js';
function mockMcpdClient(servers: Array<{ id: string; name: string; transport: string }>) {
return {
const client = {
baseUrl: 'http://test:3100',
token: 'test-token',
get: vi.fn(async () => servers),
@@ -11,7 +11,10 @@ function mockMcpdClient(servers: Array<{ id: string; name: string; transport: st
put: vi.fn(),
delete: vi.fn(),
forward: vi.fn(async () => ({ status: 200, body: servers })),
withTimeout: vi.fn(() => client),
withHeaders: vi.fn(() => client),
};
return client;
}
describe('refreshProjectUpstreams', () => {

View File

@@ -0,0 +1,137 @@
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
import { McpRouter } from '../src/router.js';
import type { UpstreamConnection, JsonRpcRequest, JsonRpcResponse } from '../src/types.js';
/**
 * Build a fake UpstreamConnection whose `send` is a vi.fn so tests can assert call counts.
 * With `opts.err` set, every request fails with -32603; otherwise tools/list and
 * resources/list return the configured fixtures and any other method gets -32601.
 */
function mockUpstream(name: string, opts: { tools?: Array<{ name: string }>; resources?: Array<{ uri: string }>; err?: string } = {}): UpstreamConnection {
  const respond = async (req: JsonRpcRequest): Promise<JsonRpcResponse> => {
    if (opts.err) {
      return { jsonrpc: '2.0', id: req.id, error: { code: -32603, message: opts.err } };
    }
    switch (req.method) {
      case 'tools/list':
        return { jsonrpc: '2.0', id: req.id, result: { tools: opts.tools ?? [] } };
      case 'resources/list':
        return { jsonrpc: '2.0', id: req.id, result: { resources: opts.resources ?? [] } };
      default:
        return { jsonrpc: '2.0', id: req.id, error: { code: -32601, message: 'not handled' } };
    }
  };
  return {
    name,
    isAlive: () => true,
    close: async () => {},
    send: vi.fn(respond),
  } as UpstreamConnection;
}
// Exercises McpRouter's per-server discovery cache: positive/negative entries,
// TTL expiry (tests advance 31s, so the TTLs are assumed ~30s — confirm against router),
// and invalidation on addUpstream/removeUpstream. Fake timers make TTL expiry deterministic.
describe('McpRouter discovery cache', () => {
let router: McpRouter;
beforeEach(() => {
router = new McpRouter();
// Freeze the clock so Date.now()-based TTL math is fully controlled by the test.
vi.useFakeTimers();
vi.setSystemTime(new Date('2026-04-15T12:00:00Z'));
});
afterEach(() => {
vi.useRealTimers();
});
// Positive cache: second discovery within the TTL never touches the upstream.
it('serves tools/list from cache on the second call within TTL', async () => {
const upstream = mockUpstream('slack', { tools: [{ name: 'search' }] });
router.addUpstream(upstream);
await router.discoverTools();
await router.discoverTools();
expect(upstream.send).toHaveBeenCalledTimes(1);
});
// After the positive TTL elapses the router must go back to the upstream.
it('re-fetches after positive TTL expires', async () => {
const upstream = mockUpstream('slack', { tools: [{ name: 'search' }] });
router.addUpstream(upstream);
await router.discoverTools();
vi.advanceTimersByTime(31_000);
await router.discoverTools();
expect(upstream.send).toHaveBeenCalledTimes(2);
});
// Negative cache: a failing upstream is hit once, then the error is served from cache.
it('negative cache prevents repeated calls to a failing upstream', async () => {
const upstream = mockUpstream('broken', { err: 'mcpd proxy error: timeout' });
router.addUpstream(upstream);
await router.discoverTools();
await router.discoverTools();
await router.discoverTools();
expect(upstream.send).toHaveBeenCalledTimes(1);
});
// Negative entries also age out, giving the upstream a chance to recover.
it('negative cache expires after negative TTL', async () => {
const upstream = mockUpstream('broken', { err: 'mcpd proxy error: timeout' });
router.addUpstream(upstream);
await router.discoverTools();
vi.advanceTimersByTime(31_000);
await router.discoverTools();
expect(upstream.send).toHaveBeenCalledTimes(2);
});
// addUpstream under the same name must invalidate the old entry so new tools appear immediately.
it('re-registering a server invalidates its cache entry', async () => {
const upstream1 = mockUpstream('slack', { tools: [{ name: 'v1' }] });
router.addUpstream(upstream1);
await router.discoverTools();
expect(upstream1.send).toHaveBeenCalledTimes(1);
const upstream2 = mockUpstream('slack', { tools: [{ name: 'v2' }] });
router.addUpstream(upstream2);
const tools = await router.discoverTools();
expect(upstream2.send).toHaveBeenCalledTimes(1);
expect(tools.map((t) => t.name)).toEqual(['slack/v2']);
});
// removeUpstream must also clear cache entries, so a later re-add starts fresh.
it('removeUpstream clears cache so follow-up add re-fetches', async () => {
const upstream1 = mockUpstream('slack', { tools: [{ name: 'v1' }] });
router.addUpstream(upstream1);
await router.discoverTools();
router.removeUpstream('slack');
const upstream2 = mockUpstream('slack', { tools: [{ name: 'v2' }] });
router.addUpstream(upstream2);
await router.discoverTools();
expect(upstream2.send).toHaveBeenCalledTimes(1);
});
// Cache isolation: one dead server must not evict or block another server's cached result.
it('one dead server does not block cached results for others', async () => {
const broken = mockUpstream('broken', { err: 'timeout' });
const healthy = mockUpstream('healthy', { tools: [{ name: 'ping' }] });
router.addUpstream(broken);
router.addUpstream(healthy);
const first = await router.discoverTools();
expect(first.map((t) => t.name)).toEqual(['healthy/ping']);
// Second call: both come from cache.
const second = await router.discoverTools();
expect(second.map((t) => t.name)).toEqual(['healthy/ping']);
expect(broken.send).toHaveBeenCalledTimes(1);
expect(healthy.send).toHaveBeenCalledTimes(1);
});
// tools/list and resources/list are cached under distinct keys for the same server.
it('discoverResources uses its own cache key independent of tools/list', async () => {
const upstream = mockUpstream('docs', { tools: [{ name: 'search' }], resources: [{ uri: 'doc://1' }] });
router.addUpstream(upstream);
await router.discoverTools();
await router.discoverResources();
await router.discoverTools();
await router.discoverResources();
// Each method cached separately → exactly one call per method.
expect(upstream.send).toHaveBeenCalledTimes(2);
});
});