feat: add health monitoring with metrics collection and REST API
MetricsCollector tracks per-instance request counts, error rates, latency, and uptime. HealthAggregator computes system-wide health status. REST endpoints at /api/v1/health/overview, /api/v1/health/instances/:id, and /api/v1/metrics. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
304
src/mcpd/tests/health-monitoring.test.ts
Normal file
304
src/mcpd/tests/health-monitoring.test.ts
Normal file
@@ -0,0 +1,304 @@
|
||||
import { describe, it, expect, vi, beforeEach } from 'vitest';
|
||||
import Fastify from 'fastify';
|
||||
import { MetricsCollector } from '../src/services/metrics-collector.js';
|
||||
import { HealthAggregator } from '../src/services/health-aggregator.js';
|
||||
import type { McpOrchestrator } from '../src/services/orchestrator.js';
|
||||
import { registerHealthMonitoringRoutes } from '../src/routes/health-monitoring.js';
|
||||
|
||||
/**
 * Builds a stubbed `McpOrchestrator` whose methods are all `vi.fn` spies.
 *
 * @param available - value that the stubbed `ping()` resolves to
 * @returns an orchestrator double: create/inspect resolve to a fixed running
 *   container, log retrieval resolves to empty output, and the remaining
 *   methods resolve with no value
 */
function mockOrchestrator(available = true): McpOrchestrator {
  // Produces a new record (with its own timestamp) on every stub invocation.
  const runningContainer = () => ({
    containerId: 'c1',
    name: 'test',
    state: 'running' as const,
    createdAt: new Date(),
  });

  const orchestrator: McpOrchestrator = {
    ping: vi.fn(async () => available),
    pullImage: vi.fn(async () => {}),
    createContainer: vi.fn(async () => runningContainer()),
    stopContainer: vi.fn(async () => {}),
    removeContainer: vi.fn(async () => {}),
    inspectContainer: vi.fn(async () => runningContainer()),
    getContainerLogs: vi.fn(async () => ({ stdout: '', stderr: '' })),
  };
  return orchestrator;
}
|
||||
|
||||
// Unit tests for MetricsCollector: registration, request/error recording,
// derived values (error rate, average latency) and lifecycle operations.
describe('MetricsCollector', () => {
  let collector: MetricsCollector;

  // Each test starts from an empty collector.
  beforeEach(() => {
    collector = new MetricsCollector();
  });

  it('registers and retrieves instance metrics', () => {
    collector.register('inst-1');

    const snapshot = collector.getMetrics('inst-1');

    // A freshly registered instance has zeroed counters and 'unknown' status.
    expect(snapshot).toBeDefined();
    expect(snapshot!.instanceId).toBe('inst-1');
    expect(snapshot!.requestCount).toBe(0);
    expect(snapshot!.errorCount).toBe(0);
    expect(snapshot!.status).toBe('unknown');
  });

  it('returns undefined for unregistered instance', () => {
    const missing = collector.getMetrics('nonexistent');
    expect(missing).toBeUndefined();
  });

  it('records requests and increments count', () => {
    collector.register('inst-1');
    collector.recordRequest('inst-1', 50);
    collector.recordRequest('inst-1', 100);

    const snapshot = collector.getMetrics('inst-1')!;

    expect(snapshot.requestCount).toBe(2);
    expect(snapshot.lastRequestAt).not.toBeNull();
    // Latency samples are kept in the order they were recorded.
    expect(snapshot.latencyMs).toEqual([50, 100]);
  });

  it('records errors', () => {
    collector.register('inst-1');
    collector.recordError('inst-1');
    collector.recordError('inst-1');

    const { errorCount } = collector.getMetrics('inst-1')!;
    expect(errorCount).toBe(2);
  });

  it('auto-registers on recordRequest for unknown instance', () => {
    // No explicit register() call: recording alone must create the entry.
    collector.recordRequest('new-inst');

    expect(collector.getMetrics('new-inst')).toBeDefined();
    expect(collector.getMetrics('new-inst')!.requestCount).toBe(1);
  });

  it('calculates error rate', () => {
    collector.register('inst-1');
    collector.recordRequest('inst-1');
    collector.recordRequest('inst-1');
    collector.recordError('inst-1');

    // 1 error over 2 requests.
    const rate = collector.getErrorRate('inst-1');
    expect(rate).toBe(0.5);
  });

  it('returns 0 error rate when no requests', () => {
    collector.register('inst-1');

    const rate = collector.getErrorRate('inst-1');
    expect(rate).toBe(0);
  });

  it('calculates average latency', () => {
    collector.register('inst-1');
    collector.recordRequest('inst-1', 100);
    collector.recordRequest('inst-1', 200);
    collector.recordRequest('inst-1', 300);

    const average = collector.getAverageLatency('inst-1');
    expect(average).toBe(200);
  });

  it('returns 0 average latency when no samples', () => {
    collector.register('inst-1');

    const average = collector.getAverageLatency('inst-1');
    expect(average).toBe(0);
  });

  it('updates status', () => {
    collector.register('inst-1');
    collector.updateStatus('inst-1', 'healthy');

    const { status } = collector.getMetrics('inst-1')!;
    expect(status).toBe('healthy');
  });

  it('getAllMetrics returns all registered instances', () => {
    collector.register('a');
    collector.register('b');
    collector.register('c');

    const everything = collector.getAllMetrics();

    expect(everything).toHaveLength(3);
    // Sorted before comparing so the test does not depend on iteration order.
    const ids = everything.map((entry) => entry.instanceId).sort();
    expect(ids).toEqual(['a', 'b', 'c']);
  });

  it('unregister removes instance', () => {
    collector.register('inst-1');
    collector.unregister('inst-1');

    expect(collector.getMetrics('inst-1')).toBeUndefined();
    expect(collector.getAllMetrics()).toHaveLength(0);
  });

  it('reset clears all metrics', () => {
    collector.register('a');
    collector.register('b');

    collector.reset();

    expect(collector.getAllMetrics()).toHaveLength(0);
  });

  it('computes uptime from registration time', () => {
    collector.register('inst-1');

    const snapshot = collector.getMetrics('inst-1')!;

    // Only non-negativity is checked; the exact value depends on the clock.
    expect(snapshot.uptime).toBeGreaterThanOrEqual(0);
  });
});
|
||||
|
||||
// Unit tests for HealthAggregator: overall status derivation from instance
// statuses and runtime availability, plus per-instance health lookups.
describe('HealthAggregator', () => {
  let collector: MetricsCollector;
  let orchestrator: McpOrchestrator;
  let aggregator: HealthAggregator;

  // Default fixture: empty collector and an orchestrator whose ping() succeeds.
  beforeEach(() => {
    collector = new MetricsCollector();
    orchestrator = mockOrchestrator(true);
    aggregator = new HealthAggregator(collector, orchestrator);
  });

  it('reports healthy when all instances healthy', async () => {
    collector.register('a');
    collector.register('b');
    collector.updateStatus('a', 'healthy');
    collector.updateStatus('b', 'healthy');

    const summary = await aggregator.getOverview();

    expect(summary.overallStatus).toBe('healthy');
    expect(summary.totalInstances).toBe(2);
    expect(summary.healthyCount).toBe(2);
    expect(summary.unhealthyCount).toBe(0);
    expect(summary.runtimeAvailable).toBe(true);
  });

  it('reports degraded when some instances unhealthy', async () => {
    collector.register('a');
    collector.register('b');
    collector.updateStatus('a', 'healthy');
    collector.updateStatus('b', 'unhealthy');

    const summary = await aggregator.getOverview();

    expect(summary.overallStatus).toBe('degraded');
    expect(summary.unhealthyCount).toBe(1);
  });

  it('reports degraded when some instances unknown', async () => {
    collector.register('a');
    collector.updateStatus('a', 'healthy');
    // 'b' never gets a status update, so it keeps the initial 'unknown'.
    collector.register('b');

    const summary = await aggregator.getOverview();

    expect(summary.overallStatus).toBe('degraded');
    expect(summary.unknownCount).toBe(1);
  });

  it('reports unhealthy when all instances unhealthy', async () => {
    collector.register('a');
    collector.updateStatus('a', 'unhealthy');

    const summary = await aggregator.getOverview();

    expect(summary.overallStatus).toBe('unhealthy');
  });

  it('reports unhealthy when runtime unavailable', async () => {
    // Swap in an orchestrator whose ping() resolves to false.
    const downOrchestrator = mockOrchestrator(false);
    const downAggregator = new HealthAggregator(collector, downOrchestrator);

    const summary = await downAggregator.getOverview();

    expect(summary.overallStatus).toBe('unhealthy');
    expect(summary.runtimeAvailable).toBe(false);
  });

  it('reports healthy with no instances and runtime available', async () => {
    const summary = await aggregator.getOverview();

    expect(summary.overallStatus).toBe('healthy');
    expect(summary.totalInstances).toBe(0);
  });

  it('computes aggregate error rate', async () => {
    collector.register('a');
    collector.recordRequest('a');
    collector.recordRequest('a');
    collector.recordError('a');

    // 1 error over 2 requests across the single registered instance.
    const summary = await aggregator.getOverview();

    expect(summary.aggregateErrorRate).toBe(0.5);
  });

  it('getInstanceHealth returns instance details', () => {
    collector.register('inst-1');
    collector.updateStatus('inst-1', 'healthy');
    collector.recordRequest('inst-1', 150);
    collector.recordError('inst-1');

    const details = aggregator.getInstanceHealth('inst-1');

    expect(details).toBeDefined();
    expect(details!.instanceId).toBe('inst-1');
    expect(details!.status).toBe('healthy');
    expect(details!.requestCount).toBe(1);
    expect(details!.errorCount).toBe(1);
    // One error against one request.
    expect(details!.errorRate).toBe(1);
    expect(details!.averageLatencyMs).toBe(150);
  });

  it('getInstanceHealth returns undefined for unknown instance', () => {
    const details = aggregator.getInstanceHealth('nonexistent');
    expect(details).toBeUndefined();
  });
});
|
||||
|
||||
// Route-level tests for the health monitoring REST API, exercised through
// Fastify's in-process `inject()` (no network listener is started).
describe('Health Monitoring Routes', () => {
  let collector: MetricsCollector;

  // Each test seeds its own collector before the app is built around it.
  beforeEach(() => {
    collector = new MetricsCollector();
  });

  /**
   * Creates a Fastify app with the health monitoring routes registered
   * against the current `collector`.
   *
   * @param runtimeAvailable - value the mocked orchestrator's `ping()` resolves to
   * @returns the configured Fastify instance; the caller is responsible for closing it
   */
  async function buildApp(runtimeAvailable = true) {
    const orchestrator = mockOrchestrator(runtimeAvailable);
    const aggregator = new HealthAggregator(collector, orchestrator);
    const app = Fastify();
    registerHealthMonitoringRoutes(app, {
      healthAggregator: aggregator,
      metricsCollector: collector,
    });
    return app;
  }

  /**
   * Issues a GET request against a freshly built app and always closes the
   * app afterwards, so no Fastify instance outlives its test.
   *
   * @param url - path to request
   * @param runtimeAvailable - forwarded to `buildApp`
   * @returns the injected response, still readable after the app is closed
   */
  async function get(url: string, runtimeAvailable = true) {
    const app = await buildApp(runtimeAvailable);
    try {
      return await app.inject({ method: 'GET', url });
    } finally {
      // Runs even when inject() rejects, so a failing test cannot leak the app.
      await app.close();
    }
  }

  it('GET /api/v1/health/overview returns system health', async () => {
    collector.register('a');
    collector.updateStatus('a', 'healthy');

    const res = await get('/api/v1/health/overview');

    expect(res.statusCode).toBe(200);
    const body = res.json();
    expect(body.overallStatus).toBe('healthy');
    expect(body.totalInstances).toBe(1);
    expect(body.runtimeAvailable).toBe(true);
    expect(body.timestamp).toBeDefined();
  });

  it('GET /api/v1/health/instances/:id returns instance health', async () => {
    collector.register('inst-1');
    collector.updateStatus('inst-1', 'healthy');
    collector.recordRequest('inst-1', 42);

    const res = await get('/api/v1/health/instances/inst-1');

    expect(res.statusCode).toBe(200);
    const body = res.json();
    expect(body.instanceId).toBe('inst-1');
    expect(body.status).toBe('healthy');
    expect(body.requestCount).toBe(1);
  });

  it('GET /api/v1/health/instances/:id returns 404 for unknown', async () => {
    const res = await get('/api/v1/health/instances/nonexistent');

    expect(res.statusCode).toBe(404);
    expect(res.json().error).toContain('not found');
  });

  it('GET /api/v1/metrics returns all instance metrics', async () => {
    collector.register('a');
    collector.register('b');
    collector.recordRequest('a');

    const res = await get('/api/v1/metrics');

    expect(res.statusCode).toBe(200);
    const body = res.json();
    expect(body.instances).toHaveLength(2);
    expect(body.timestamp).toBeDefined();
  });

  it('GET /api/v1/metrics returns empty with no instances', async () => {
    const res = await get('/api/v1/metrics');

    expect(res.statusCode).toBe(200);
    expect(res.json().instances).toEqual([]);
  });
});
|
||||
Reference in New Issue
Block a user