feat: add health monitoring with metrics collection and REST API
Some checks are pending
CI / lint (push) Waiting to run
CI / typecheck (push) Waiting to run
CI / test (push) Waiting to run
CI / build (push) Blocked by required conditions

MetricsCollector tracks per-instance request counts, error rates, latency,
and uptime. HealthAggregator computes system-wide health status. REST
endpoints at /api/v1/health/overview, /api/v1/health/instances/:id, and
/api/v1/metrics.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Michal
2026-02-21 05:34:20 +00:00
parent 9e660140b3
commit 9a67e51307
6 changed files with 551 additions and 0 deletions

View File

@@ -0,0 +1,39 @@
import type { FastifyInstance } from 'fastify';
import type { HealthAggregator } from '../services/health-aggregator.js';
import type { MetricsCollector } from '../services/metrics-collector.js';
/**
 * Services the health-monitoring routes read from when answering requests.
 */
export interface HealthMonitoringDeps {
/** Source of the system overview and the per-instance health lookups. */
healthAggregator: HealthAggregator;
/** Source of the per-instance metric snapshots listed by the metrics endpoint. */
metricsCollector: MetricsCollector;
}
/**
 * Registers the read-only health and metrics endpoints on the given app.
 *
 * Routes, in registration order:
 * - `GET /api/v1/health/overview`      — whatever the aggregator's overview returns.
 * - `GET /api/v1/health/instances/:id` — one instance's health, or a 404 payload
 *   when the aggregator has no entry for that id.
 * - `GET /api/v1/metrics`              — a summary row per instance plus a timestamp.
 *
 * @param app  Application to attach the routes to.
 * @param deps Services the handlers query on every request.
 */
export function registerHealthMonitoringRoutes(app: FastifyInstance, deps: HealthMonitoringDeps): void {
  app.get('/api/v1/health/overview', async () => deps.healthAggregator.getOverview());

  app.get<{ Params: { id: string } }>('/api/v1/health/instances/:id', async (request, reply) => {
    const { id } = request.params;
    const instanceHealth = deps.healthAggregator.getInstanceHealth(id);
    if (instanceHealth) {
      return instanceHealth;
    }
    // Unknown instance: set the status code and echo it in the body.
    reply.code(404);
    return { error: `Instance '${id}' not found`, statusCode: 404 };
  });

  app.get('/api/v1/metrics', async () => {
    const snapshots = deps.metricsCollector.getAllMetrics();
    const instances = snapshots.map(
      ({ instanceId, status, uptime, requestCount, errorCount, lastRequestAt }) => {
        // An instance without requests reports a rate of 0 rather than dividing by zero.
        const errorRate = requestCount > 0 ? errorCount / requestCount : 0;
        return {
          instanceId,
          status,
          uptime,
          requestCount,
          errorCount,
          errorRate,
          lastRequestAt: lastRequestAt?.toISOString() ?? null,
        };
      },
    );
    return { instances, timestamp: new Date().toISOString() };
  });
}

View File

@@ -5,3 +5,5 @@ export { registerMcpProfileRoutes } from './mcp-profiles.js';
export { registerProjectRoutes } from './projects.js';
export { registerInstanceRoutes } from './instances.js';
export { registerAuditLogRoutes } from './audit-logs.js';
export { registerHealthMonitoringRoutes } from './health-monitoring.js';
export type { HealthMonitoringDeps } from './health-monitoring.js';

View File

@@ -0,0 +1,99 @@
import type { MetricsCollector, InstanceMetrics } from './metrics-collector.js';
import type { McpOrchestrator } from './orchestrator.js';
/**
 * System-wide health summary produced by `HealthAggregator.getOverview()`.
 */
export interface SystemHealth {
/**
 * `'unhealthy'` when the runtime is unavailable or every instance is unhealthy;
 * `'degraded'` when at least one instance is unhealthy or unknown; otherwise `'healthy'`.
 */
overallStatus: 'healthy' | 'degraded' | 'unhealthy';
/** Number of instances the metrics collector reported. */
totalInstances: number;
/** Instances whose status is `'healthy'`. */
healthyCount: number;
/** Instances whose status is `'unhealthy'`. */
unhealthyCount: number;
/** Instances whose status is neither `'healthy'` nor `'unhealthy'`. */
unknownCount: number;
/** Result of the orchestrator ping; `false` when the ping rejects. */
runtimeAvailable: boolean;
/** Total errors divided by total requests across all instances; 0 when there are no requests. */
aggregateErrorRate: number;
/** Mean of the instances' `uptime` values; 0 when there are no instances. */
averageUptime: number;
/** ISO-8601 time at which the overview was built. */
timestamp: string;
}
/**
 * Health details for one instance, as returned by `HealthAggregator.getInstanceHealth()`.
 */
export interface InstanceHealth {
/** Identifier of the instance these figures belong to. */
instanceId: string;
/** Status copied from the instance's metrics. */
status: InstanceMetrics['status'];
/** Uptime copied from the instance's metrics. */
uptime: number;
/** Requests recorded for the instance. */
requestCount: number;
/** Errors recorded for the instance. */
errorCount: number;
/** Error rate as reported by the metrics collector for this instance. */
errorRate: number;
/** Average latency as reported by the metrics collector for this instance. */
averageLatencyMs: number;
/** ISO-8601 time of the last recorded request, or `null` if there is none. */
lastRequestAt: string | null;
}
/**
 * Combines per-instance metrics with a runtime ping into health reports.
 */
export class HealthAggregator {
  /**
   * @param metricsCollector Source of per-instance metric snapshots.
   * @param orchestrator     Runtime whose `ping()` decides `runtimeAvailable`.
   */
  constructor(
    private readonly metricsCollector: MetricsCollector,
    private readonly orchestrator: McpOrchestrator,
  ) {}

  /**
   * Builds the system-wide health summary.
   *
   * Status rules:
   * - `'unhealthy'` when the runtime ping fails or every instance is unhealthy;
   * - `'degraded'` when at least one instance is unhealthy or unknown;
   * - `'healthy'` otherwise, including when there are no instances.
   *
   * @returns The summary; a failing ping is reported in the result and never
   *          makes this method reject.
   */
  async getOverview(): Promise<SystemHealth> {
    const runtimeAvailable = await this.isRuntimeAvailable();
    const allMetrics = this.metricsCollector.getAllMetrics();

    let healthyCount = 0;
    let unhealthyCount = 0;
    let unknownCount = 0;
    let totalErrors = 0;
    let totalRequests = 0;
    let totalUptime = 0;

    for (const m of allMetrics) {
      switch (m.status) {
        case 'healthy':
          healthyCount++;
          break;
        case 'unhealthy':
          unhealthyCount++;
          break;
        default:
          // Anything that is not explicitly healthy/unhealthy counts as unknown.
          unknownCount++;
      }
      totalErrors += m.errorCount;
      totalRequests += m.requestCount;
      totalUptime += m.uptime;
    }

    const totalInstances = allMetrics.length;
    // Guard both ratios against division by zero.
    const aggregateErrorRate = totalRequests > 0 ? totalErrors / totalRequests : 0;
    const averageUptime = totalInstances > 0 ? totalUptime / totalInstances : 0;

    let overallStatus: SystemHealth['overallStatus'] = 'healthy';
    if (!runtimeAvailable || (totalInstances > 0 && unhealthyCount === totalInstances)) {
      overallStatus = 'unhealthy';
    } else if (unhealthyCount > 0 || unknownCount > 0) {
      overallStatus = 'degraded';
    }

    return {
      overallStatus,
      totalInstances,
      healthyCount,
      unhealthyCount,
      unknownCount,
      runtimeAvailable,
      aggregateErrorRate,
      averageUptime,
      timestamp: new Date().toISOString(),
    };
  }

  /**
   * Returns the health details of a single instance.
   *
   * @param instanceId Identifier to look up in the metrics collector.
   * @returns The instance's health, or `undefined` when the collector has no
   *          metrics for that id.
   */
  getInstanceHealth(instanceId: string): InstanceHealth | undefined {
    const m = this.metricsCollector.getMetrics(instanceId);
    if (!m) return undefined;
    return {
      instanceId: m.instanceId,
      status: m.status,
      uptime: m.uptime,
      requestCount: m.requestCount,
      errorCount: m.errorCount,
      errorRate: this.metricsCollector.getErrorRate(instanceId),
      averageLatencyMs: this.metricsCollector.getAverageLatency(instanceId),
      lastRequestAt: m.lastRequestAt?.toISOString() ?? null,
    };
  }

  /**
   * Pings the orchestrator and treats any failure as "runtime unavailable".
   *
   * The ping is invoked inside `try` so that an implementation which throws
   * synchronously is handled the same way as one that returns a rejected promise.
   */
  private async isRuntimeAvailable(): Promise<boolean> {
    try {
      return await this.orchestrator.ping();
    } catch {
      // Deliberate best effort: the overview reports the outage instead of failing.
      return false;
    }
  }
}

View File

@@ -11,3 +11,7 @@ export { AuditLogService } from './audit-log.service.js';
export type { AuditLogQueryParams } from './audit-log.service.js';
export { KubernetesOrchestrator } from './k8s/index.js';
export type { K8sClientConfig } from './k8s/index.js';
export { MetricsCollector } from './metrics-collector.js';
export type { InstanceMetrics } from './metrics-collector.js';
export { HealthAggregator } from './health-aggregator.js';
export type { SystemHealth, InstanceHealth } from './health-aggregator.js';

View File

@@ -0,0 +1,103 @@
/**
 * Raw counters and samples tracked for a single instance by `MetricsCollector`.
 */
export interface InstanceMetrics {
/** Identifier the metrics were registered under. */
instanceId: string;
/** Last status set for the instance; `'unknown'` until a status is set. */
status: 'healthy' | 'unhealthy' | 'unknown';
/** Seconds since the instance was registered, computed when the metrics are read. */
uptime: number;
/** Number of recorded requests. */
requestCount: number;
/** Number of recorded errors (counted independently of requests). */
errorCount: number;
/** Time of the most recently recorded request, or `null` if none was recorded. */
lastRequestAt: Date | null;
/** Recent latency samples in milliseconds, oldest first; the collector caps the length. */
latencyMs: number[];
}
/**
 * In-memory store of per-instance request, error, latency and uptime metrics.
 *
 * Instances are created explicitly with `register()` or implicitly by the
 * first `recordRequest()`, `recordError()` or `updateStatus()` call for an id.
 * Read methods return copies, so callers cannot mutate the stored state.
 */
export class MetricsCollector {
  private readonly metrics = new Map<string, InstanceMetrics>();
  private readonly startTimes = new Map<string, Date>();
  private readonly maxLatencySamples: number;

  /**
   * @param maxLatencySamples Maximum number of latency samples kept per
   *   instance; older samples are discarded first. Defaults to 1000.
   * @throws RangeError when `maxLatencySamples` is not a non-negative integer.
   */
  constructor(maxLatencySamples = 1000) {
    if (!Number.isInteger(maxLatencySamples) || maxLatencySamples < 0) {
      throw new RangeError(
        `maxLatencySamples must be a non-negative integer, got ${maxLatencySamples}`,
      );
    }
    this.maxLatencySamples = maxLatencySamples;
  }

  /**
   * Starts tracking an instance. Registering an id that is already tracked
   * is a no-op: its counters and start time are kept.
   */
  register(instanceId: string): void {
    this.ensureMetrics(instanceId);
  }

  /** Stops tracking an instance and discards its metrics. */
  unregister(instanceId: string): void {
    this.metrics.delete(instanceId);
    this.startTimes.delete(instanceId);
  }

  /**
   * Counts one request for the instance and stamps `lastRequestAt`.
   *
   * @param instanceId Instance the request belongs to (registered on demand).
   * @param latencyMs  Optional latency sample. Non-finite values (NaN,
   *   ±Infinity) are not stored because a single one would corrupt the
   *   average; the request itself is still counted.
   */
  recordRequest(instanceId: string, latencyMs?: number): void {
    const m = this.ensureMetrics(instanceId);
    m.requestCount++;
    m.lastRequestAt = new Date();
    if (latencyMs === undefined || !Number.isFinite(latencyMs)) return;

    m.latencyMs.push(latencyMs);
    // Keep only the most recent samples, trimming in place from the front.
    const excess = m.latencyMs.length - this.maxLatencySamples;
    if (excess > 0) {
      m.latencyMs.splice(0, excess);
    }
  }

  /** Counts one error for the instance (registered on demand). */
  recordError(instanceId: string): void {
    this.ensureMetrics(instanceId).errorCount++;
  }

  /** Sets the instance's status (registered on demand). */
  updateStatus(instanceId: string, status: InstanceMetrics['status']): void {
    this.ensureMetrics(instanceId).status = status;
  }

  /**
   * Returns a snapshot of one instance's metrics.
   *
   * `uptime` is computed at call time as the seconds elapsed since
   * registration. The snapshot, including its `latencyMs` array, is a copy.
   *
   * @returns The snapshot, or `undefined` when the id is not tracked.
   */
  getMetrics(instanceId: string): InstanceMetrics | undefined {
    const m = this.metrics.get(instanceId);
    if (!m) return undefined;
    const start = this.startTimes.get(instanceId);
    const uptime = start ? (Date.now() - start.getTime()) / 1000 : m.uptime;
    return { ...m, uptime, latencyMs: [...m.latencyMs] };
  }

  /** Returns a snapshot for every tracked instance, in registration order. */
  getAllMetrics(): InstanceMetrics[] {
    const snapshots: InstanceMetrics[] = [];
    for (const id of this.metrics.keys()) {
      const snapshot = this.getMetrics(id);
      if (snapshot) snapshots.push(snapshot);
    }
    return snapshots;
  }

  /**
   * Returns `errorCount / requestCount` for the instance, or 0 when the
   * instance is not tracked or has no recorded requests.
   */
  getErrorRate(instanceId: string): number {
    const m = this.metrics.get(instanceId);
    if (!m || m.requestCount === 0) return 0;
    return m.errorCount / m.requestCount;
  }

  /**
   * Returns the mean of the retained latency samples in milliseconds, or 0
   * when the instance is not tracked or has no samples.
   */
  getAverageLatency(instanceId: string): number {
    const m = this.metrics.get(instanceId);
    if (!m || m.latencyMs.length === 0) return 0;
    const sum = m.latencyMs.reduce((a, b) => a + b, 0);
    return sum / m.latencyMs.length;
  }

  /** Discards the metrics of every instance. */
  reset(): void {
    this.metrics.clear();
    this.startTimes.clear();
  }

  /** Returns the stored record for the id, creating it with zeroed counters if needed. */
  private ensureMetrics(instanceId: string): InstanceMetrics {
    const existing = this.metrics.get(instanceId);
    if (existing) return existing;

    const created: InstanceMetrics = {
      instanceId,
      status: 'unknown',
      uptime: 0,
      requestCount: 0,
      errorCount: 0,
      lastRequestAt: null,
      latencyMs: [],
    };
    this.metrics.set(instanceId, created);
    this.startTimes.set(instanceId, new Date());
    return created;
  }
}

View File

@@ -0,0 +1,304 @@
import { describe, it, expect, vi, beforeEach } from 'vitest';
import Fastify from 'fastify';
import { MetricsCollector } from '../src/services/metrics-collector.js';
import { HealthAggregator } from '../src/services/health-aggregator.js';
import type { McpOrchestrator } from '../src/services/orchestrator.js';
import { registerHealthMonitoringRoutes } from '../src/routes/health-monitoring.js';
/**
 * Builds an orchestrator test double in which every method is a vitest mock.
 *
 * @param available Value that `ping()` resolves to; defaults to `true`.
 */
function mockOrchestrator(available = true): McpOrchestrator {
  // Container description shared by the create/inspect mocks; built per call
  // so each result carries its own `createdAt`.
  const describeContainer = () => ({
    containerId: 'c1',
    name: 'test',
    state: 'running' as const,
    createdAt: new Date(),
  });
  const resolveVoid = async () => {};

  return {
    ping: vi.fn(async () => available),
    pullImage: vi.fn(resolveVoid),
    createContainer: vi.fn(async () => describeContainer()),
    stopContainer: vi.fn(resolveVoid),
    removeContainer: vi.fn(resolveVoid),
    inspectContainer: vi.fn(async () => describeContainer()),
    getContainerLogs: vi.fn(async () => ({ stdout: '', stderr: '' })),
  };
}
// Unit tests for MetricsCollector: registration, counters, derived rates,
// status updates and removal. Each test starts from a fresh collector.
describe('MetricsCollector', () => {
let collector: MetricsCollector;
beforeEach(() => {
collector = new MetricsCollector();
});
// A freshly registered instance has zeroed counters and an 'unknown' status.
it('registers and retrieves instance metrics', () => {
collector.register('inst-1');
const m = collector.getMetrics('inst-1');
expect(m).toBeDefined();
expect(m!.instanceId).toBe('inst-1');
expect(m!.requestCount).toBe(0);
expect(m!.errorCount).toBe(0);
expect(m!.status).toBe('unknown');
});
it('returns undefined for unregistered instance', () => {
expect(collector.getMetrics('nonexistent')).toBeUndefined();
});
// Latency samples are kept in the order they were recorded.
it('records requests and increments count', () => {
collector.register('inst-1');
collector.recordRequest('inst-1', 50);
collector.recordRequest('inst-1', 100);
const m = collector.getMetrics('inst-1')!;
expect(m.requestCount).toBe(2);
expect(m.lastRequestAt).not.toBeNull();
expect(m.latencyMs).toEqual([50, 100]);
});
it('records errors', () => {
collector.register('inst-1');
collector.recordError('inst-1');
collector.recordError('inst-1');
expect(collector.getMetrics('inst-1')!.errorCount).toBe(2);
});
// No register() call here: recording a request creates the instance.
it('auto-registers on recordRequest for unknown instance', () => {
collector.recordRequest('new-inst');
expect(collector.getMetrics('new-inst')).toBeDefined();
expect(collector.getMetrics('new-inst')!.requestCount).toBe(1);
});
// Two requests and one error give a rate of 1/2.
it('calculates error rate', () => {
collector.register('inst-1');
collector.recordRequest('inst-1');
collector.recordRequest('inst-1');
collector.recordError('inst-1');
expect(collector.getErrorRate('inst-1')).toBe(0.5);
});
it('returns 0 error rate when no requests', () => {
collector.register('inst-1');
expect(collector.getErrorRate('inst-1')).toBe(0);
});
// (100 + 200 + 300) / 3 = 200.
it('calculates average latency', () => {
collector.register('inst-1');
collector.recordRequest('inst-1', 100);
collector.recordRequest('inst-1', 200);
collector.recordRequest('inst-1', 300);
expect(collector.getAverageLatency('inst-1')).toBe(200);
});
it('returns 0 average latency when no samples', () => {
collector.register('inst-1');
expect(collector.getAverageLatency('inst-1')).toBe(0);
});
it('updates status', () => {
collector.register('inst-1');
collector.updateStatus('inst-1', 'healthy');
expect(collector.getMetrics('inst-1')!.status).toBe('healthy');
});
// Ids are sorted before comparing so the test does not depend on return order.
it('getAllMetrics returns all registered instances', () => {
collector.register('a');
collector.register('b');
collector.register('c');
const all = collector.getAllMetrics();
expect(all).toHaveLength(3);
expect(all.map((m) => m.instanceId).sort()).toEqual(['a', 'b', 'c']);
});
it('unregister removes instance', () => {
collector.register('inst-1');
collector.unregister('inst-1');
expect(collector.getMetrics('inst-1')).toBeUndefined();
expect(collector.getAllMetrics()).toHaveLength(0);
});
it('reset clears all metrics', () => {
collector.register('a');
collector.register('b');
collector.reset();
expect(collector.getAllMetrics()).toHaveLength(0);
});
// Only checks that uptime is non-negative; no time is advanced in this test.
it('computes uptime from registration time', () => {
collector.register('inst-1');
const m = collector.getMetrics('inst-1')!;
expect(m.uptime).toBeGreaterThanOrEqual(0);
});
});
// Unit tests for HealthAggregator: overall status rules, aggregate error rate
// and per-instance lookups. Each test starts with a fresh collector and an
// orchestrator mock whose ping resolves to true.
describe('HealthAggregator', () => {
let collector: MetricsCollector;
let orchestrator: McpOrchestrator;
let aggregator: HealthAggregator;
beforeEach(() => {
collector = new MetricsCollector();
orchestrator = mockOrchestrator(true);
aggregator = new HealthAggregator(collector, orchestrator);
});
it('reports healthy when all instances healthy', async () => {
collector.register('a');
collector.register('b');
collector.updateStatus('a', 'healthy');
collector.updateStatus('b', 'healthy');
const overview = await aggregator.getOverview();
expect(overview.overallStatus).toBe('healthy');
expect(overview.totalInstances).toBe(2);
expect(overview.healthyCount).toBe(2);
expect(overview.unhealthyCount).toBe(0);
expect(overview.runtimeAvailable).toBe(true);
});
// A mix of healthy and unhealthy instances is 'degraded', not 'unhealthy'.
it('reports degraded when some instances unhealthy', async () => {
collector.register('a');
collector.register('b');
collector.updateStatus('a', 'healthy');
collector.updateStatus('b', 'unhealthy');
const overview = await aggregator.getOverview();
expect(overview.overallStatus).toBe('degraded');
expect(overview.unhealthyCount).toBe(1);
});
it('reports degraded when some instances unknown', async () => {
collector.register('a');
collector.updateStatus('a', 'healthy');
collector.register('b'); // status remains 'unknown'
const overview = await aggregator.getOverview();
expect(overview.overallStatus).toBe('degraded');
expect(overview.unknownCount).toBe(1);
});
it('reports unhealthy when all instances unhealthy', async () => {
collector.register('a');
collector.updateStatus('a', 'unhealthy');
const overview = await aggregator.getOverview();
expect(overview.overallStatus).toBe('unhealthy');
});
// Replaces the default mock with one whose ping resolves to false.
it('reports unhealthy when runtime unavailable', async () => {
orchestrator = mockOrchestrator(false);
aggregator = new HealthAggregator(collector, orchestrator);
const overview = await aggregator.getOverview();
expect(overview.overallStatus).toBe('unhealthy');
expect(overview.runtimeAvailable).toBe(false);
});
// With nothing registered the system still counts as healthy.
it('reports healthy with no instances and runtime available', async () => {
const overview = await aggregator.getOverview();
expect(overview.overallStatus).toBe('healthy');
expect(overview.totalInstances).toBe(0);
});
// Two requests and one error on the only instance give a rate of 1/2.
it('computes aggregate error rate', async () => {
collector.register('a');
collector.recordRequest('a');
collector.recordRequest('a');
collector.recordError('a');
const overview = await aggregator.getOverview();
expect(overview.aggregateErrorRate).toBe(0.5);
});
// One request plus one error yields an error rate of 1: errors are counted
// separately from requests.
it('getInstanceHealth returns instance details', () => {
collector.register('inst-1');
collector.updateStatus('inst-1', 'healthy');
collector.recordRequest('inst-1', 150);
collector.recordError('inst-1');
const health = aggregator.getInstanceHealth('inst-1');
expect(health).toBeDefined();
expect(health!.instanceId).toBe('inst-1');
expect(health!.status).toBe('healthy');
expect(health!.requestCount).toBe(1);
expect(health!.errorCount).toBe(1);
expect(health!.errorRate).toBe(1);
expect(health!.averageLatencyMs).toBe(150);
});
it('getInstanceHealth returns undefined for unknown instance', () => {
expect(aggregator.getInstanceHealth('nonexistent')).toBeUndefined();
});
});
// Route-level tests: the endpoints are registered on a real Fastify instance
// and exercised through app.inject(), without opening a network socket.
describe('Health Monitoring Routes', () => {
let collector: MetricsCollector;
beforeEach(() => {
collector = new MetricsCollector();
});
// Builds an app wired to the shared collector and a mock orchestrator whose
// ping resolves to `runtimeAvailable`.
async function buildApp(runtimeAvailable = true) {
const orchestrator = mockOrchestrator(runtimeAvailable);
const aggregator = new HealthAggregator(collector, orchestrator);
const app = Fastify();
registerHealthMonitoringRoutes(app, {
healthAggregator: aggregator,
metricsCollector: collector,
});
return app;
}
it('GET /api/v1/health/overview returns system health', async () => {
collector.register('a');
collector.updateStatus('a', 'healthy');
const app = await buildApp();
const res = await app.inject({ method: 'GET', url: '/api/v1/health/overview' });
expect(res.statusCode).toBe(200);
const body = res.json();
expect(body.overallStatus).toBe('healthy');
expect(body.totalInstances).toBe(1);
expect(body.runtimeAvailable).toBe(true);
expect(body.timestamp).toBeDefined();
});
it('GET /api/v1/health/instances/:id returns instance health', async () => {
collector.register('inst-1');
collector.updateStatus('inst-1', 'healthy');
collector.recordRequest('inst-1', 42);
const app = await buildApp();
const res = await app.inject({ method: 'GET', url: '/api/v1/health/instances/inst-1' });
expect(res.statusCode).toBe(200);
const body = res.json();
expect(body.instanceId).toBe('inst-1');
expect(body.status).toBe('healthy');
expect(body.requestCount).toBe(1);
});
// Nothing is registered, so the lookup misses and the route answers 404.
it('GET /api/v1/health/instances/:id returns 404 for unknown', async () => {
const app = await buildApp();
const res = await app.inject({ method: 'GET', url: '/api/v1/health/instances/nonexistent' });
expect(res.statusCode).toBe(404);
expect(res.json().error).toContain('not found');
});
it('GET /api/v1/metrics returns all instance metrics', async () => {
collector.register('a');
collector.register('b');
collector.recordRequest('a');
const app = await buildApp();
const res = await app.inject({ method: 'GET', url: '/api/v1/metrics' });
expect(res.statusCode).toBe(200);
const body = res.json();
expect(body.instances).toHaveLength(2);
expect(body.timestamp).toBeDefined();
});
it('GET /api/v1/metrics returns empty with no instances', async () => {
const app = await buildApp();
const res = await app.inject({ method: 'GET', url: '/api/v1/metrics' });
expect(res.statusCode).toBe(200);
expect(res.json().instances).toEqual([]);
});
});