fix: MCP proxy resilience — discovery cache, default liveness probes
Some checks failed
Adds a per-server tools/list cache in McpRouter (positive + negative TTL) so a slow or dead upstream only stalls the first discovery call, not every subsequent client request. The cache is invalidated on upstream add/remove.

Health probes now apply a default liveness spec (tools/list via the real production path) to any RUNNING instance without an explicit healthCheck, so synthetic and real failures converge on the same signal.

Includes supporting updates in mcpd-client, discovery, upstream/mcpd, seeder, and fulldeploy/release scripts.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
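For context, a minimal TypeScript sketch of the two mechanisms the message describes. The DEFAULT_HEALTH_CHECK values (no tool, intervalSeconds 30, failureThreshold 3) match the assertions in the test diff below; the discovery-cache shape, field names, and TTL handling are illustrative assumptions, not the actual McpRouter implementation.

// Sketch only. DEFAULT_HEALTH_CHECK values come from the test assertions below;
// the discovery-cache names and shape are illustrative, not the real McpRouter code.

interface HealthCheckSpec {
  tool?: string;             // unset => plain liveness via tools/list, no tool invocation
  intervalSeconds: number;
  failureThreshold: number;
}

// Applied to any RUNNING instance whose server has no explicit healthCheck.
const DEFAULT_HEALTH_CHECK: HealthCheckSpec = {
  intervalSeconds: 30,
  failureThreshold: 3,
};

// Per-server tools/list cache entry (illustrative). A successful discovery is
// reused for a positive TTL, a failure for a shorter negative TTL, so a dead
// upstream only stalls the first discovery call.
interface DiscoveryCacheEntry {
  tools?: unknown[];         // present on a positive (successful) entry
  error?: string;            // present on a negative (failed) entry
  expiresAt: number;         // Date.now() + positiveTtlMs or negativeTtlMs
}

const discoveryCache = new Map<string, DiscoveryCacheEntry>(); // keyed by serverId

// Dropped whenever an upstream is added or removed for that server.
function invalidateDiscovery(serverId: string): void {
  discoveryCache.delete(serverId);
}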
@@ -1,8 +1,9 @@
 import { describe, it, expect, vi, beforeEach } from 'vitest';
-import { HealthProbeRunner } from '../../src/services/health-probe.service.js';
+import { HealthProbeRunner, DEFAULT_HEALTH_CHECK } from '../../src/services/health-probe.service.js';
 import type { HealthCheckSpec } from '../../src/services/health-probe.service.js';
 import type { IMcpInstanceRepository, IMcpServerRepository } from '../../src/repositories/interfaces.js';
-import type { McpOrchestrator, ExecResult } from '../../src/services/orchestrator.js';
+import type { McpOrchestrator } from '../../src/services/orchestrator.js';
+import type { McpProxyService, McpProxyResponse } from '../../src/services/mcp-proxy-service.js';
 import type { McpInstance, McpServer } from '@prisma/client';
 
 function makeInstance(overrides: Partial<McpInstance> = {}): McpInstance {
@@ -87,20 +88,30 @@ function mockOrchestrator(): McpOrchestrator {
   };
 }
 
+function mockMcpProxyService(): McpProxyService {
+  return {
+    execute: vi.fn(async (): Promise<McpProxyResponse> => ({ jsonrpc: '2.0', id: 1, result: { tools: [] } })),
+    closeAll: vi.fn(),
+    removeClient: vi.fn(),
+  } as unknown as McpProxyService;
+}
+
 describe('HealthProbeRunner', () => {
   let instanceRepo: IMcpInstanceRepository;
   let serverRepo: IMcpServerRepository;
   let orchestrator: McpOrchestrator;
+  let mcpProxyService: McpProxyService;
   let runner: HealthProbeRunner;
 
   beforeEach(() => {
     instanceRepo = mockInstanceRepo();
     serverRepo = mockServerRepo();
     orchestrator = mockOrchestrator();
-    runner = new HealthProbeRunner(instanceRepo, serverRepo, orchestrator);
+    mcpProxyService = mockMcpProxyService();
+    runner = new HealthProbeRunner(instanceRepo, serverRepo, orchestrator, undefined, mcpProxyService);
   });
 
-  it('skips instances without healthCheck config', async () => {
+  it('applies default liveness probe when server has no healthCheck config', async () => {
     const instance = makeInstance();
     const server = makeServer({ healthCheck: null });
 
@@ -109,8 +120,67 @@ describe('HealthProbeRunner', () => {
 
     await runner.tick();
 
+    // No exec fallback — liveness goes through mcpProxyService
     expect(orchestrator.execInContainer).not.toHaveBeenCalled();
-    expect(instanceRepo.updateStatus).not.toHaveBeenCalled();
+    expect(mcpProxyService.execute).toHaveBeenCalledWith({ serverId: 'srv-1', method: 'tools/list' });
+    expect(instanceRepo.updateStatus).toHaveBeenCalledWith(
+      'inst-1',
+      'RUNNING',
+      expect.objectContaining({ healthStatus: 'healthy' }),
+    );
   });
 
+  it('default liveness probe marks unhealthy when tools/list returns JSON-RPC error', async () => {
+    const instance = makeInstance();
+    const server = makeServer({
+      healthCheck: { intervalSeconds: 0, failureThreshold: 1 } as unknown as McpServer['healthCheck'],
+    });
+
+    vi.mocked(instanceRepo.findAll).mockResolvedValue([instance]);
+    vi.mocked(serverRepo.findById).mockResolvedValue(server);
+    vi.mocked(mcpProxyService.execute).mockResolvedValue({
+      jsonrpc: '2.0',
+      id: 1,
+      error: { code: -32603, message: 'Cannot connect to upstream' },
+    });
+
+    await runner.tick();
+
+    expect(instanceRepo.updateStatus).toHaveBeenCalledWith(
+      'inst-1',
+      'RUNNING',
+      expect.objectContaining({
+        healthStatus: 'unhealthy',
+        events: expect.arrayContaining([
+          expect.objectContaining({ type: 'Warning', message: expect.stringContaining('Cannot connect to upstream') }),
+        ]),
+      }),
+    );
+  });
+
+  it('default liveness probe marks unhealthy when mcpProxyService throws', async () => {
+    const instance = makeInstance();
+    const server = makeServer({
+      healthCheck: { intervalSeconds: 0, failureThreshold: 1 } as unknown as McpServer['healthCheck'],
+    });
+
+    vi.mocked(instanceRepo.findAll).mockResolvedValue([instance]);
+    vi.mocked(serverRepo.findById).mockResolvedValue(server);
+    vi.mocked(mcpProxyService.execute).mockRejectedValue(new Error('no running instance'));
+
+    await runner.tick();
+
+    expect(instanceRepo.updateStatus).toHaveBeenCalledWith(
+      'inst-1',
+      'RUNNING',
+      expect.objectContaining({ healthStatus: 'unhealthy' }),
+    );
+  });
+
+  it('DEFAULT_HEALTH_CHECK has no tool set so it acts as liveness', () => {
+    expect(DEFAULT_HEALTH_CHECK.tool).toBeUndefined();
+    expect(DEFAULT_HEALTH_CHECK.intervalSeconds).toBe(30);
+    expect(DEFAULT_HEALTH_CHECK.failureThreshold).toBe(3);
+  });
+
   it('skips non-RUNNING instances', async () => {