feat: add health monitoring with metrics collection and REST API
Some checks are pending
CI / lint (push) Waiting to run
CI / typecheck (push) Waiting to run
CI / test (push) Waiting to run
CI / build (push) Blocked by required conditions

MetricsCollector tracks per-instance request counts, error rates, latency,
and uptime. HealthAggregator computes system-wide health status. REST
endpoints: /api/v1/health/overview, /api/v1/health/instances/:id, and
/api/v1/metrics.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Michal
2026-02-21 05:34:20 +00:00
parent 9e660140b3
commit 9a67e51307
6 changed files with 551 additions and 0 deletions

View File

@@ -0,0 +1,304 @@
import { describe, it, expect, vi, beforeEach } from 'vitest';
import Fastify from 'fastify';
import { MetricsCollector } from '../src/services/metrics-collector.js';
import { HealthAggregator } from '../src/services/health-aggregator.js';
import type { McpOrchestrator } from '../src/services/orchestrator.js';
import { registerHealthMonitoringRoutes } from '../src/routes/health-monitoring.js';
/**
 * Builds an McpOrchestrator test double in which every method is a vitest mock.
 *
 * @param available - Value the mocked `ping` resolves to; defaults to `true`.
 * @returns An orchestrator stub. `createContainer` and `inspectContainer`
 *   resolve to a running container with id 'c1' and name 'test'; the log
 *   call resolves to empty stdout/stderr; the remaining calls resolve to nothing.
 */
function mockOrchestrator(available = true): McpOrchestrator {
  // Factory rather than a shared constant: each invocation yields a new
  // object with its own `createdAt` timestamp.
  const runningContainer = () => ({
    containerId: 'c1',
    name: 'test',
    state: 'running' as const,
    createdAt: new Date(),
  });

  return {
    ping: vi.fn(async () => available),
    pullImage: vi.fn(async () => {}),
    createContainer: vi.fn(async () => runningContainer()),
    stopContainer: vi.fn(async () => {}),
    removeContainer: vi.fn(async () => {}),
    inspectContainer: vi.fn(async () => runningContainer()),
    getContainerLogs: vi.fn(async () => ({ stdout: '', stderr: '' })),
  };
}
/**
 * Unit tests for MetricsCollector: registration and lookup, request/error
 * recording, derived values (error rate, average latency), status updates,
 * and removal via unregister/reset.
 */
describe('MetricsCollector', () => {
  const INSTANCE = 'inst-1';
  let metrics: MetricsCollector;

  // Each case starts from a brand-new collector so no state carries over.
  beforeEach(() => {
    metrics = new MetricsCollector();
  });

  it('registers and retrieves instance metrics', () => {
    metrics.register(INSTANCE);

    const snapshot = metrics.getMetrics(INSTANCE);

    // toBeDefined() guards the optional-chained reads that follow.
    expect(snapshot).toBeDefined();
    expect(snapshot?.instanceId).toBe('inst-1');
    expect(snapshot?.requestCount).toBe(0);
    expect(snapshot?.errorCount).toBe(0);
    expect(snapshot?.status).toBe('unknown');
  });

  it('returns undefined for unregistered instance', () => {
    const missing = metrics.getMetrics('nonexistent');

    expect(missing).toBeUndefined();
  });

  it('records requests and increments count', () => {
    metrics.register(INSTANCE);
    metrics.recordRequest(INSTANCE, 50);
    metrics.recordRequest(INSTANCE, 100);

    const snapshot = metrics.getMetrics(INSTANCE)!;

    expect(snapshot.requestCount).toBe(2);
    expect(snapshot.lastRequestAt).not.toBeNull();
    // Latency samples are expected in recording order.
    expect(snapshot.latencyMs).toEqual([50, 100]);
  });

  it('records errors', () => {
    metrics.register(INSTANCE);
    metrics.recordError(INSTANCE);
    metrics.recordError(INSTANCE);

    expect(metrics.getMetrics(INSTANCE)!.errorCount).toBe(2);
  });

  it('auto-registers on recordRequest for unknown instance', () => {
    // No register() call: recording alone must create the entry.
    metrics.recordRequest('new-inst');

    expect(metrics.getMetrics('new-inst')).toBeDefined();
    expect(metrics.getMetrics('new-inst')!.requestCount).toBe(1);
  });

  it('calculates error rate', () => {
    metrics.register(INSTANCE);
    metrics.recordRequest(INSTANCE);
    metrics.recordRequest(INSTANCE);
    metrics.recordError(INSTANCE);

    // One error over two requests.
    const rate = metrics.getErrorRate(INSTANCE);

    expect(rate).toBe(0.5);
  });

  it('returns 0 error rate when no requests', () => {
    metrics.register(INSTANCE);

    const rate = metrics.getErrorRate(INSTANCE);

    expect(rate).toBe(0);
  });

  it('calculates average latency', () => {
    metrics.register(INSTANCE);
    for (const latency of [100, 200, 300]) {
      metrics.recordRequest(INSTANCE, latency);
    }

    const average = metrics.getAverageLatency(INSTANCE);

    expect(average).toBe(200);
  });

  it('returns 0 average latency when no samples', () => {
    metrics.register(INSTANCE);

    const average = metrics.getAverageLatency(INSTANCE);

    expect(average).toBe(0);
  });

  it('updates status', () => {
    metrics.register(INSTANCE);
    metrics.updateStatus(INSTANCE, 'healthy');

    expect(metrics.getMetrics(INSTANCE)!.status).toBe('healthy');
  });

  it('getAllMetrics returns all registered instances', () => {
    for (const id of ['a', 'b', 'c']) {
      metrics.register(id);
    }

    const everything = metrics.getAllMetrics();
    // Sorted before comparing so the check does not depend on return order.
    const ids = everything.map(({ instanceId }) => instanceId).sort();

    expect(everything).toHaveLength(3);
    expect(ids).toEqual(['a', 'b', 'c']);
  });

  it('unregister removes instance', () => {
    metrics.register(INSTANCE);
    metrics.unregister(INSTANCE);

    expect(metrics.getMetrics(INSTANCE)).toBeUndefined();
    expect(metrics.getAllMetrics()).toHaveLength(0);
  });

  it('reset clears all metrics', () => {
    metrics.register('a');
    metrics.register('b');

    metrics.reset();

    expect(metrics.getAllMetrics()).toHaveLength(0);
  });

  it('computes uptime from registration time', () => {
    metrics.register(INSTANCE);

    const { uptime } = metrics.getMetrics(INSTANCE)!;

    // Only non-negativity is asserted; the exact value depends on timing.
    expect(uptime).toBeGreaterThanOrEqual(0);
  });
});
/**
 * Unit tests for HealthAggregator: how the overall status is derived from
 * per-instance statuses and runtime availability, the aggregate error rate,
 * and per-instance health lookups.
 */
describe('HealthAggregator', () => {
  let metrics: MetricsCollector;
  let runtime: McpOrchestrator;
  let health: HealthAggregator;

  // Default fixture: empty collector and an orchestrator whose ping succeeds.
  beforeEach(() => {
    metrics = new MetricsCollector();
    runtime = mockOrchestrator(true);
    health = new HealthAggregator(metrics, runtime);
  });

  it('reports healthy when all instances healthy', async () => {
    metrics.register('a');
    metrics.register('b');
    metrics.updateStatus('a', 'healthy');
    metrics.updateStatus('b', 'healthy');

    const summary = await health.getOverview();

    expect(summary.overallStatus).toBe('healthy');
    expect(summary.totalInstances).toBe(2);
    expect(summary.healthyCount).toBe(2);
    expect(summary.unhealthyCount).toBe(0);
    expect(summary.runtimeAvailable).toBe(true);
  });

  it('reports degraded when some instances unhealthy', async () => {
    metrics.register('a');
    metrics.register('b');
    metrics.updateStatus('a', 'healthy');
    metrics.updateStatus('b', 'unhealthy');

    const { overallStatus, unhealthyCount } = await health.getOverview();

    expect(overallStatus).toBe('degraded');
    expect(unhealthyCount).toBe(1);
  });

  it('reports degraded when some instances unknown', async () => {
    metrics.register('a');
    metrics.updateStatus('a', 'healthy');
    // 'b' never receives a status update, so it stays 'unknown'.
    metrics.register('b');

    const { overallStatus, unknownCount } = await health.getOverview();

    expect(overallStatus).toBe('degraded');
    expect(unknownCount).toBe(1);
  });

  it('reports unhealthy when all instances unhealthy', async () => {
    metrics.register('a');
    metrics.updateStatus('a', 'unhealthy');

    const { overallStatus } = await health.getOverview();

    expect(overallStatus).toBe('unhealthy');
  });

  it('reports unhealthy when runtime unavailable', async () => {
    // Swap in an orchestrator whose ping resolves to false.
    runtime = mockOrchestrator(false);
    health = new HealthAggregator(metrics, runtime);

    const { overallStatus, runtimeAvailable } = await health.getOverview();

    expect(overallStatus).toBe('unhealthy');
    expect(runtimeAvailable).toBe(false);
  });

  it('reports healthy with no instances and runtime available', async () => {
    const { overallStatus, totalInstances } = await health.getOverview();

    expect(overallStatus).toBe('healthy');
    expect(totalInstances).toBe(0);
  });

  it('computes aggregate error rate', async () => {
    metrics.register('a');
    metrics.recordRequest('a');
    metrics.recordRequest('a');
    metrics.recordError('a');

    const { aggregateErrorRate } = await health.getOverview();

    // One error over two requests.
    expect(aggregateErrorRate).toBe(0.5);
  });

  it('getInstanceHealth returns instance details', () => {
    metrics.register('inst-1');
    metrics.updateStatus('inst-1', 'healthy');
    metrics.recordRequest('inst-1', 150);
    metrics.recordError('inst-1');

    const details = health.getInstanceHealth('inst-1');

    // toBeDefined() guards the optional-chained reads that follow.
    expect(details).toBeDefined();
    expect(details?.instanceId).toBe('inst-1');
    expect(details?.status).toBe('healthy');
    expect(details?.requestCount).toBe(1);
    expect(details?.errorCount).toBe(1);
    expect(details?.errorRate).toBe(1);
    expect(details?.averageLatencyMs).toBe(150);
  });

  it('getInstanceHealth returns undefined for unknown instance', () => {
    const details = health.getInstanceHealth('nonexistent');

    expect(details).toBeUndefined();
  });
});
/**
 * Route-level tests for the health monitoring endpoints. Each case seeds the
 * collector, builds a Fastify app with the routes registered, and issues an
 * injected GET request against it.
 */
describe('Health Monitoring Routes', () => {
  let metrics: MetricsCollector;

  beforeEach(() => {
    metrics = new MetricsCollector();
  });

  /**
   * Creates a Fastify app with the health monitoring routes wired to the
   * current collector and a mocked orchestrator.
   *
   * @param runtimeAvailable - Forwarded to mockOrchestrator; defaults to true.
   */
  async function buildApp(runtimeAvailable = true) {
    const healthAggregator = new HealthAggregator(
      metrics,
      mockOrchestrator(runtimeAvailable),
    );
    const app = Fastify();
    registerHealthMonitoringRoutes(app, {
      healthAggregator,
      metricsCollector: metrics,
    });
    return app;
  }

  // Builds a default app and performs an injected GET against `url`.
  // Must be called after the collector has been seeded for the case.
  const get = async (url: string) => {
    const app = await buildApp();
    const response = await app.inject({ method: 'GET', url });
    return response;
  };

  it('GET /api/v1/health/overview returns system health', async () => {
    metrics.register('a');
    metrics.updateStatus('a', 'healthy');

    const response = await get('/api/v1/health/overview');

    expect(response.statusCode).toBe(200);
    const payload = response.json();
    expect(payload.overallStatus).toBe('healthy');
    expect(payload.totalInstances).toBe(1);
    expect(payload.runtimeAvailable).toBe(true);
    expect(payload.timestamp).toBeDefined();
  });

  it('GET /api/v1/health/instances/:id returns instance health', async () => {
    metrics.register('inst-1');
    metrics.updateStatus('inst-1', 'healthy');
    metrics.recordRequest('inst-1', 42);

    const response = await get('/api/v1/health/instances/inst-1');

    expect(response.statusCode).toBe(200);
    const payload = response.json();
    expect(payload.instanceId).toBe('inst-1');
    expect(payload.status).toBe('healthy');
    expect(payload.requestCount).toBe(1);
  });

  it('GET /api/v1/health/instances/:id returns 404 for unknown', async () => {
    // Nothing registered, so the id cannot be resolved.
    const response = await get('/api/v1/health/instances/nonexistent');

    expect(response.statusCode).toBe(404);
    expect(response.json().error).toContain('not found');
  });

  it('GET /api/v1/metrics returns all instance metrics', async () => {
    metrics.register('a');
    metrics.register('b');
    metrics.recordRequest('a');

    const response = await get('/api/v1/metrics');

    expect(response.statusCode).toBe(200);
    const payload = response.json();
    expect(payload.instances).toHaveLength(2);
    expect(payload.timestamp).toBeDefined();
  });

  it('GET /api/v1/metrics returns empty with no instances', async () => {
    const response = await get('/api/v1/metrics');

    expect(response.statusCode).toBe(200);
    expect(response.json().instances).toEqual([]);
  });
});