src/mcpd/tests/services/health-probe.test.ts

import { describe, it, expect, vi, beforeEach } from 'vitest';
import { HealthProbeRunner, DEFAULT_HEALTH_CHECK } from '../../src/services/health-probe.service.js';
import type { HealthCheckSpec } from '../../src/services/health-probe.service.js';
import type { IMcpInstanceRepository, IMcpServerRepository } from '../../src/repositories/interfaces.js';
import type { McpOrchestrator } from '../../src/services/orchestrator.js';
import type { McpProxyService, McpProxyResponse } from '../../src/services/mcp-proxy-service.js';
import type { McpInstance, McpServer } from '@prisma/client';

function makeInstance(overrides: Partial<McpInstance> = {}): McpInstance {
  return {
    id: 'inst-1',
    serverId: 'srv-1',
    status: 'RUNNING',
    containerId: 'container-abc',
    port: null,
    healthStatus: null,
    lastHealthCheck: null,
    events: [],
    metadata: {},
    version: 1,
    createdAt: new Date(),
    updatedAt: new Date(),
    ...overrides,
  } as McpInstance;
}

function makeServer(overrides: Partial<McpServer> = {}): McpServer {
  return {
    id: 'srv-1',
    name: 'my-grafana',
    transport: 'STDIO',
    packageName: '@leval/mcp-grafana',
    dockerImage: null,
    externalUrl: null,
    containerPort: null,
    repositoryUrl: null,
    description: null,
    command: null,
    env: [],
    replicas: 1,
    projectId: null,
    healthCheck: {
      tool: 'list_datasources',
      arguments: {},
      intervalSeconds: 60,
      timeoutSeconds: 10,
      failureThreshold: 3,
    },
    version: 1,
    createdAt: new Date(),
    updatedAt: new Date(),
    ...overrides,
  } as McpServer;
}

function mockInstanceRepo(): IMcpInstanceRepository {
  return {
    findAll: vi.fn(async () => []),
    findById: vi.fn(async () => null),
    findByContainerId: vi.fn(async () => null),
    create: vi.fn(async (data) => makeInstance(data)),
    updateStatus: vi.fn(async (id, status, fields) => makeInstance({ id, status, ...fields })),
    delete: vi.fn(async () => {}),
  };
}

function mockServerRepo(): IMcpServerRepository {
  return {
    findAll: vi.fn(async () => []),
    findById: vi.fn(async () => null),
    findByName: vi.fn(async () => null),
    create: vi.fn(async () => makeServer()),
    update: vi.fn(async () => makeServer()),
    delete: vi.fn(async () => {}),
  };
}

function mockOrchestrator(): McpOrchestrator {
  return {
    pullImage: vi.fn(async () => {}),
    createContainer: vi.fn(async () => ({ containerId: 'c1', name: 'test', state: 'running' as const, createdAt: new Date() })),
    stopContainer: vi.fn(async () => {}),
    removeContainer: vi.fn(async () => {}),
    inspectContainer: vi.fn(async () => ({ containerId: 'c1', name: 'test', state: 'running' as const, createdAt: new Date() })),
    getContainerLogs: vi.fn(async () => ({ stdout: '', stderr: '' })),
    execInContainer: vi.fn(async () => ({ exitCode: 0, stdout: 'OK', stderr: '' })),
    ping: vi.fn(async () => true),
  };
}

function mockMcpProxyService(): McpProxyService {
  return {
    execute: vi.fn(async (): Promise<McpProxyResponse> => ({ jsonrpc: '2.0', id: 1, result: { tools: [] } })),
    closeAll: vi.fn(),
    removeClient: vi.fn(),
  } as unknown as McpProxyService;
}

describe('HealthProbeRunner', () => {
  let instanceRepo: IMcpInstanceRepository;
  let serverRepo: IMcpServerRepository;
  let orchestrator: McpOrchestrator;
  let mcpProxyService: McpProxyService;
  let runner: HealthProbeRunner;

  beforeEach(() => {
    instanceRepo = mockInstanceRepo();
    serverRepo = mockServerRepo();
    orchestrator = mockOrchestrator();
    mcpProxyService = mockMcpProxyService();
    runner = new HealthProbeRunner(instanceRepo, serverRepo, orchestrator, undefined, mcpProxyService);
  });

  it('applies default liveness probe when server has no healthCheck config', async () => {
    const instance = makeInstance();
    const server = makeServer({ healthCheck: null });

    vi.mocked(instanceRepo.findAll).mockResolvedValue([instance]);
    vi.mocked(serverRepo.findById).mockResolvedValue(server);

    await runner.tick();

    // No exec fallback — liveness goes through mcpProxyService
    expect(orchestrator.execInContainer).not.toHaveBeenCalled();
    expect(mcpProxyService.execute).toHaveBeenCalledWith({ serverId: 'srv-1', method: 'tools/list' });
    expect(instanceRepo.updateStatus).toHaveBeenCalledWith(
      'inst-1',
      'RUNNING',
      expect.objectContaining({ healthStatus: 'healthy' }),
    );
  });

  it('default liveness probe marks unhealthy when tools/list returns JSON-RPC error', async () => {
    const instance = makeInstance();
    const server = makeServer({
      healthCheck: { intervalSeconds: 0, failureThreshold: 1 } as unknown as McpServer['healthCheck'],
    });

    vi.mocked(instanceRepo.findAll).mockResolvedValue([instance]);
    vi.mocked(serverRepo.findById).mockResolvedValue(server);
    vi.mocked(mcpProxyService.execute).mockResolvedValue({
      jsonrpc: '2.0',
      id: 1,
      error: { code: -32603, message: 'Cannot connect to upstream' },
    });

    await runner.tick();

    expect(instanceRepo.updateStatus).toHaveBeenCalledWith(
      'inst-1',
      'RUNNING',
      expect.objectContaining({
        healthStatus: 'unhealthy',
        events: expect.arrayContaining([
          expect.objectContaining({ type: 'Warning', message: expect.stringContaining('Cannot connect to upstream') }),
        ]),
      }),
    );
  });

  it('default liveness probe marks unhealthy when mcpProxyService throws', async () => {
    const instance = makeInstance();
    const server = makeServer({
      healthCheck: { intervalSeconds: 0, failureThreshold: 1 } as unknown as McpServer['healthCheck'],
    });

    vi.mocked(instanceRepo.findAll).mockResolvedValue([instance]);
    vi.mocked(serverRepo.findById).mockResolvedValue(server);
    vi.mocked(mcpProxyService.execute).mockRejectedValue(new Error('no running instance'));

    await runner.tick();

    expect(instanceRepo.updateStatus).toHaveBeenCalledWith(
      'inst-1',
      'RUNNING',
      expect.objectContaining({ healthStatus: 'unhealthy' }),
    );
  });

  it('DEFAULT_HEALTH_CHECK has no tool set so it acts as liveness', () => {
    expect(DEFAULT_HEALTH_CHECK.tool).toBeUndefined();
    expect(DEFAULT_HEALTH_CHECK.intervalSeconds).toBe(30);
    expect(DEFAULT_HEALTH_CHECK.failureThreshold).toBe(3);
  });

  it('skips non-RUNNING instances', async () => {
    const instance = makeInstance({ status: 'ERROR' });
    vi.mocked(instanceRepo.findAll).mockResolvedValue([instance]);

    await runner.tick();

    expect(serverRepo.findById).not.toHaveBeenCalled();
  });

  it('probes STDIO instance via mcpProxyService and marks healthy on success', async () => {
    const instance = makeInstance();
    const server = makeServer();

    vi.mocked(instanceRepo.findAll).mockResolvedValue([instance]);
    vi.mocked(serverRepo.findById).mockResolvedValue(server);
    vi.mocked(mcpProxyService.execute).mockResolvedValue({
      jsonrpc: '2.0', id: 1,
      result: { content: [{ type: 'text', text: 'ok' }] },
    });

    await runner.tick();

    // STDIO readiness now goes through the proxy (the live container),
    // not via docker-exec into a synthetic spawn — see comment on
    // probeReadinessViaProxy for why.
    expect(orchestrator.execInContainer).not.toHaveBeenCalled();
    expect(mcpProxyService.execute).toHaveBeenCalledWith({
      serverId: 'srv-1',
      method: 'tools/call',
      params: { name: 'list_datasources', arguments: {} },
    });

    expect(instanceRepo.updateStatus).toHaveBeenCalledWith(
      'inst-1',
      'RUNNING',
      expect.objectContaining({
        healthStatus: 'healthy',
        lastHealthCheck: expect.any(Date),
        events: expect.arrayContaining([
          expect.objectContaining({ type: 'Normal', message: expect.stringContaining('passed') }),
        ]),
      }),
    );
  });

  it('marks unhealthy when proxy returns a JSON-RPC error (e.g. broken-secret auth failure)', async () => {
    const instance = makeInstance();
    const server = makeServer({
      healthCheck: { tool: 'get_me', intervalSeconds: 0, failureThreshold: 1 } as McpServer['healthCheck'],
    });

    vi.mocked(instanceRepo.findAll).mockResolvedValue([instance]);
    vi.mocked(serverRepo.findById).mockResolvedValue(server);
    vi.mocked(mcpProxyService.execute).mockResolvedValue({
      jsonrpc: '2.0', id: 1,
      error: { code: -32603, message: 'token is required' },
    });

    await runner.tick();

    expect(instanceRepo.updateStatus).toHaveBeenCalledWith(
      'inst-1',
      'RUNNING',
      expect.objectContaining({
        healthStatus: 'unhealthy',
        events: expect.arrayContaining([
          expect.objectContaining({ type: 'Warning', message: expect.stringContaining('token is required') }),
        ]),
      }),
    );
  });

  it('marks unhealthy when proxy returns a tool-level error in result.isError', async () => {
    const instance = makeInstance();
    const server = makeServer({
      healthCheck: { tool: 'get_me', intervalSeconds: 0, failureThreshold: 1 } as McpServer['healthCheck'],
    });

    vi.mocked(instanceRepo.findAll).mockResolvedValue([instance]);
    vi.mocked(serverRepo.findById).mockResolvedValue(server);
    vi.mocked(mcpProxyService.execute).mockResolvedValue({
      jsonrpc: '2.0', id: 1,
      result: { isError: true, content: [{ type: 'text', text: 'auth failed: token is required' }] },
    });

    await runner.tick();

    const events = vi.mocked(instanceRepo.updateStatus).mock.calls[0]?.[2]?.events as Array<{ message: string }> | undefined;
    expect(events?.[events.length - 1]?.message).toContain('auth failed');
    expect(instanceRepo.updateStatus).toHaveBeenCalledWith(
      'inst-1',
      'RUNNING',
      expect.objectContaining({ healthStatus: 'unhealthy' }),
    );
  });

  it('marks unhealthy after failureThreshold consecutive failures', async () => {
    const instance = makeInstance();
    const healthCheck: HealthCheckSpec = {
      tool: 'list_datasources',
      arguments: {},
      intervalSeconds: 0, // always due
      failureThreshold: 2,
    };
    const server = makeServer({ healthCheck: healthCheck as unknown as undefined });

    vi.mocked(instanceRepo.findAll).mockResolvedValue([instance]);
    vi.mocked(serverRepo.findById).mockResolvedValue(server);
    vi.mocked(mcpProxyService.execute).mockResolvedValue({
      jsonrpc: '2.0', id: 1,
      error: { code: -32603, message: 'connection refused' },
    });

    // First failure → degraded
    await runner.tick();
    expect(instanceRepo.updateStatus).toHaveBeenCalledWith(
      'inst-1',
      'RUNNING',
      expect.objectContaining({ healthStatus: 'degraded' }),
    );

    // Second failure → unhealthy (threshold = 2)
    await runner.tick();
    expect(instanceRepo.updateStatus).toHaveBeenCalledWith(
      'inst-1',
      'RUNNING',
      expect.objectContaining({ healthStatus: 'unhealthy' }),
    );
  });

  it('resets failure count on success', async () => {
    const instance = makeInstance();
    const healthCheck: HealthCheckSpec = {
      tool: 'list_datasources',
      arguments: {},
      intervalSeconds: 0,
      failureThreshold: 3,
    };
    const server = makeServer({ healthCheck: healthCheck as unknown as undefined });

    vi.mocked(instanceRepo.findAll).mockResolvedValue([instance]);
    vi.mocked(serverRepo.findById).mockResolvedValue(server);

    // Two failures
    vi.mocked(mcpProxyService.execute).mockResolvedValue({
      jsonrpc: '2.0', id: 1, error: { code: -32603, message: 'fail' },
    });
    await runner.tick();
    await runner.tick();

    // Then success — should reset to healthy
    vi.mocked(mcpProxyService.execute).mockResolvedValue({
      jsonrpc: '2.0', id: 1, result: {},
    });
    await runner.tick();

    const lastCall = vi.mocked(instanceRepo.updateStatus).mock.calls.at(-1);
    expect(lastCall?.[2]).toEqual(expect.objectContaining({ healthStatus: 'healthy' }));
  });

  it('handles probe timeout as failure', async () => {
    const instance = makeInstance();
    const server = makeServer({
      healthCheck: { tool: 'list_datasources', intervalSeconds: 0, timeoutSeconds: 0.05, failureThreshold: 3 } as unknown as McpServer['healthCheck'],
    });

    vi.mocked(instanceRepo.findAll).mockResolvedValue([instance]);
    vi.mocked(serverRepo.findById).mockResolvedValue(server);
    // Hang forever — the probe's internal deadline should fire instead.
    vi.mocked(mcpProxyService.execute).mockImplementation(() => new Promise(() => { /* never resolves */ }));

    await runner.tick();

    expect(instanceRepo.updateStatus).toHaveBeenCalledWith(
      'inst-1',
      'RUNNING',
      expect.objectContaining({
        healthStatus: 'degraded',
        events: expect.arrayContaining([
          expect.objectContaining({ type: 'Warning', message: expect.stringContaining('timed out') }),
        ]),
      }),
    );
  });

  it('appends events without losing history', async () => {
    const existingEvents = [
      { timestamp: '2025-01-01T00:00:00Z', type: 'Normal', message: 'old event' },
    ];
    const instance = makeInstance({ events: existingEvents });
    const server = makeServer({
      healthCheck: { tool: 'test', intervalSeconds: 0 } as McpServer['healthCheck'],
    });

    vi.mocked(instanceRepo.findAll).mockResolvedValue([instance]);
    vi.mocked(serverRepo.findById).mockResolvedValue(server);
    vi.mocked(mcpProxyService.execute).mockResolvedValue({
      jsonrpc: '2.0', id: 1, result: {},
    });

    await runner.tick();

    const events = vi.mocked(instanceRepo.updateStatus).mock.calls[0]?.[2]?.events as unknown[];
    expect(events).toHaveLength(2);
    expect((events[0] as { message: string }).message).toBe('old event');
    expect((events[1] as { message: string }).message).toContain('passed');
  });

  it('respects interval — skips probing if not due', async () => {
    const instance = makeInstance();
    const server = makeServer({
      healthCheck: { tool: 'test', intervalSeconds: 300 } as McpServer['healthCheck'],
    });

    vi.mocked(instanceRepo.findAll).mockResolvedValue([instance]);
    vi.mocked(serverRepo.findById).mockResolvedValue(server);
    vi.mocked(mcpProxyService.execute).mockResolvedValue({
      jsonrpc: '2.0', id: 1, result: {},
    });

    // First tick: should probe
    await runner.tick();
    expect(mcpProxyService.execute).toHaveBeenCalledTimes(1);

    // Second tick immediately: should skip (300s interval not elapsed)
    await runner.tick();
    expect(mcpProxyService.execute).toHaveBeenCalledTimes(1);
  });

  it('cleans up probe states for removed instances', async () => {
    const instance = makeInstance();
    const server = makeServer({
      healthCheck: { tool: 'test', intervalSeconds: 0 } as McpServer['healthCheck'],
    });

    vi.mocked(instanceRepo.findAll).mockResolvedValue([instance]);
    vi.mocked(serverRepo.findById).mockResolvedValue(server);
    vi.mocked(mcpProxyService.execute).mockResolvedValue({
      jsonrpc: '2.0', id: 1, result: {},
    });

    await runner.tick();
    expect(mcpProxyService.execute).toHaveBeenCalledTimes(1);

    // Instance removed
    vi.mocked(instanceRepo.findAll).mockResolvedValue([]);
    await runner.tick();

    // Re-add same instance — should probe again (state was cleaned)
    vi.mocked(instanceRepo.findAll).mockResolvedValue([instance]);
    await runner.tick();
    expect(mcpProxyService.execute).toHaveBeenCalledTimes(2);
  });

  it('skips STDIO instances without containerId', async () => {
    const instance = makeInstance({ containerId: null });
    const server = makeServer();

    // containerId is null, but status is RUNNING — shouldn't be probed
    vi.mocked(instanceRepo.findAll).mockResolvedValue([instance]);

    await runner.tick();
    expect(serverRepo.findById).not.toHaveBeenCalled();
  });

  it('probeInstance returns result directly', async () => {
    const instance = makeInstance();
    const server = makeServer();
    const healthCheck: HealthCheckSpec = {
      tool: 'list_datasources',
      arguments: {},
    };

    vi.mocked(mcpProxyService.execute).mockResolvedValue({
      jsonrpc: '2.0', id: 1, result: {},
    });

    const result = await runner.probeInstance(instance, server, healthCheck);
    expect(result.healthy).toBe(true);
    expect(result.latencyMs).toBeGreaterThanOrEqual(0);
    expect(result.message).toBe('ok');
  });

  it('surfaces upstream JSON-RPC error message verbatim', async () => {
    const instance = makeInstance();
    const server = makeServer();
    const healthCheck: HealthCheckSpec = { tool: 'list_datasources', arguments: {} };

    vi.mocked(mcpProxyService.execute).mockResolvedValue({
      jsonrpc: '2.0', id: 1,
      error: { code: -32603, message: 'ECONNREFUSED 10.0.0.1:3000' },
    });

    const result = await runner.probeInstance(instance, server, healthCheck);
    expect(result.healthy).toBe(false);
    expect(result.message).toBe('ECONNREFUSED 10.0.0.1:3000');
  });
});