fix: MCP proxy resilience — discovery cache, default liveness probes
Some checks failed
CI/CD / lint (push) Successful in 52s
CI/CD / typecheck (push) Successful in 1m51s
CI/CD / test (push) Successful in 1m1s
CI/CD / smoke (push) Failing after 3m21s
CI/CD / build (push) Successful in 4m9s
CI/CD / publish (push) Has been skipped

Adds a per-server tools/list cache in McpRouter (positive + negative TTL)
so a slow or dead upstream only stalls the first discovery call, not every
subsequent client request. Invalidated on upstream add/remove.

Health probes now apply a default liveness spec (tools/list via the real
production path) to any RUNNING instance without an explicit healthCheck,
so synthetic and real failures converge on the same signal.

Includes supporting updates in mcpd-client, discovery, upstream/mcpd,
seeder, and fulldeploy/release scripts.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Michal
2026-04-17 00:48:57 +01:00
parent c968d76e00
commit 3149ea3ae7
15 changed files with 499 additions and 32 deletions

View File

@@ -1,8 +1,9 @@
import { describe, it, expect, vi, beforeEach } from 'vitest';
import { HealthProbeRunner } from '../../src/services/health-probe.service.js';
import { HealthProbeRunner, DEFAULT_HEALTH_CHECK } from '../../src/services/health-probe.service.js';
import type { HealthCheckSpec } from '../../src/services/health-probe.service.js';
import type { IMcpInstanceRepository, IMcpServerRepository } from '../../src/repositories/interfaces.js';
import type { McpOrchestrator, ExecResult } from '../../src/services/orchestrator.js';
import type { McpOrchestrator } from '../../src/services/orchestrator.js';
import type { McpProxyService, McpProxyResponse } from '../../src/services/mcp-proxy-service.js';
import type { McpInstance, McpServer } from '@prisma/client';
function makeInstance(overrides: Partial<McpInstance> = {}): McpInstance {
@@ -87,20 +88,30 @@ function mockOrchestrator(): McpOrchestrator {
};
}
function mockMcpProxyService(): McpProxyService {
// Stub of the proxy service used by the health-probe runner:
// `execute` resolves to a successful empty tools/list response;
// `closeAll` / `removeClient` are no-op spies.
const stub = {
execute: vi.fn(async (): Promise<McpProxyResponse> => ({ jsonrpc: '2.0', id: 1, result: { tools: [] } })),
closeAll: vi.fn(),
removeClient: vi.fn(),
};
// Only the three methods exercised by these tests are stubbed, hence the cast.
return stub as unknown as McpProxyService;
}
describe('HealthProbeRunner', () => {
let instanceRepo: IMcpInstanceRepository;
let serverRepo: IMcpServerRepository;
let orchestrator: McpOrchestrator;
let mcpProxyService: McpProxyService;
let runner: HealthProbeRunner;
// Fresh mocks per test so call counts and mocked return values never leak
// between cases.
beforeEach(() => {
instanceRepo = mockInstanceRepo();
serverRepo = mockServerRepo();
orchestrator = mockOrchestrator();
// NOTE(review): this 3-arg construction appears to be the pre-change line
// left over from the diff rendering — the runner is reassigned below with
// the proxy-service wiring; confirm only one construction exists in the file.
runner = new HealthProbeRunner(instanceRepo, serverRepo, orchestrator);
mcpProxyService = mockMcpProxyService();
// 4th argument presumably a logger/clock left as undefined — TODO confirm
// against the HealthProbeRunner constructor signature.
runner = new HealthProbeRunner(instanceRepo, serverRepo, orchestrator, undefined, mcpProxyService);
});
it('skips instances without healthCheck config', async () => {
it('applies default liveness probe when server has no healthCheck config', async () => {
const instance = makeInstance();
const server = makeServer({ healthCheck: null });
@@ -109,8 +120,67 @@ describe('HealthProbeRunner', () => {
await runner.tick();
// No exec fallback — liveness goes through mcpProxyService
expect(orchestrator.execInContainer).not.toHaveBeenCalled();
expect(instanceRepo.updateStatus).not.toHaveBeenCalled();
expect(mcpProxyService.execute).toHaveBeenCalledWith({ serverId: 'srv-1', method: 'tools/list' });
expect(instanceRepo.updateStatus).toHaveBeenCalledWith(
'inst-1',
'RUNNING',
expect.objectContaining({ healthStatus: 'healthy' }),
);
});
it('default liveness probe marks unhealthy when tools/list returns JSON-RPC error', async () => {
// Arrange: server config has thresholds but no `tool`, so the runner
// takes the default tools/list liveness path for this RUNNING instance.
const server = makeServer({
healthCheck: { intervalSeconds: 0, failureThreshold: 1 } as unknown as McpServer['healthCheck'],
});
const instance = makeInstance();
vi.mocked(serverRepo.findById).mockResolvedValue(server);
vi.mocked(instanceRepo.findAll).mockResolvedValue([instance]);
// Upstream replies with a JSON-RPC error object rather than rejecting.
vi.mocked(mcpProxyService.execute).mockResolvedValue({
jsonrpc: '2.0',
id: 1,
error: { code: -32603, message: 'Cannot connect to upstream' },
});

await runner.tick();

// The failure must be recorded and the upstream error message surfaced
// as a Warning event on the status update.
expect(instanceRepo.updateStatus).toHaveBeenCalledWith(
'inst-1',
'RUNNING',
expect.objectContaining({
healthStatus: 'unhealthy',
events: expect.arrayContaining([
expect.objectContaining({ type: 'Warning', message: expect.stringContaining('Cannot connect to upstream') }),
]),
}),
);
});
it('default liveness probe marks unhealthy when mcpProxyService throws', async () => {
const instance = makeInstance();
const server = makeServer({
healthCheck: { intervalSeconds: 0, failureThreshold: 1 } as unknown as McpServer['healthCheck'],
});
vi.mocked(instanceRepo.findAll).mockResolvedValue([instance]);
vi.mocked(serverRepo.findById).mockResolvedValue(server);
vi.mocked(mcpProxyService.execute).mockRejectedValue(new Error('no running instance'));
await runner.tick();
expect(instanceRepo.updateStatus).toHaveBeenCalledWith(
'inst-1',
'RUNNING',
expect.objectContaining({ healthStatus: 'unhealthy' }),
);
});
it('DEFAULT_HEALTH_CHECK has no tool set so it acts as liveness', () => {
// With no `tool` the probe exercises tools/list (pure liveness) rather
// than invoking a specific tool; pin the default cadence and threshold.
const { tool, intervalSeconds, failureThreshold } = DEFAULT_HEALTH_CHECK;
expect(tool).toBeUndefined();
expect(intervalSeconds).toBe(30);
expect(failureThreshold).toBe(3);
});
it('skips non-RUNNING instances', async () => {