From 9a67e5130720b27fad5576a78437712b9524c891 Mon Sep 17 00:00:00 2001
From: Michal
Date: Sat, 21 Feb 2026 05:34:20 +0000
Subject: [PATCH] feat: add health monitoring with metrics collection and REST API

MetricsCollector tracks per-instance request counts, error rates, latency,
and uptime. HealthAggregator computes system-wide health status. REST
endpoints at /api/v1/health/overview, /health/instances/:id, /metrics.

Co-Authored-By: Claude Opus 4.6
---
 src/mcpd/src/routes/health-monitoring.ts   |  39 +++
 src/mcpd/src/routes/index.ts               |   2 +
 src/mcpd/src/services/health-aggregator.ts |  99 +++++++
 src/mcpd/src/services/index.ts             |   4 +
 src/mcpd/src/services/metrics-collector.ts | 103 +++++++
 src/mcpd/tests/health-monitoring.test.ts   | 304 +++++++++++++++++++++
 6 files changed, 551 insertions(+)
 create mode 100644 src/mcpd/src/routes/health-monitoring.ts
 create mode 100644 src/mcpd/src/services/health-aggregator.ts
 create mode 100644 src/mcpd/src/services/metrics-collector.ts
 create mode 100644 src/mcpd/tests/health-monitoring.test.ts

diff --git a/src/mcpd/src/routes/health-monitoring.ts b/src/mcpd/src/routes/health-monitoring.ts
new file mode 100644
index 0000000..30ded01
--- /dev/null
+++ b/src/mcpd/src/routes/health-monitoring.ts
@@ -0,0 +1,39 @@
+import type { FastifyInstance } from 'fastify';
+import type { HealthAggregator } from '../services/health-aggregator.js';
+import type { MetricsCollector } from '../services/metrics-collector.js';
+
+export interface HealthMonitoringDeps {
+  healthAggregator: HealthAggregator;
+  metricsCollector: MetricsCollector;
+}
+
+export function registerHealthMonitoringRoutes(app: FastifyInstance, deps: HealthMonitoringDeps): void {
+  app.get('/api/v1/health/overview', async () => {
+    return deps.healthAggregator.getOverview();
+  });
+
+  app.get<{ Params: { id: string } }>('/api/v1/health/instances/:id', async (request, reply) => {
+    const health = deps.healthAggregator.getInstanceHealth(request.params.id);
+    if (!health) {
+      reply.code(404);
+      return { error: `Instance '${request.params.id}' not found`, statusCode: 404 };
+    }
+    return health;
+  });
+
+  app.get('/api/v1/metrics', async () => {
+    const allMetrics = deps.metricsCollector.getAllMetrics();
+    return {
+      instances: allMetrics.map((m) => ({
+        instanceId: m.instanceId,
+        status: m.status,
+        uptime: m.uptime,
+        requestCount: m.requestCount,
+        errorCount: m.errorCount,
+        errorRate: m.requestCount > 0 ? m.errorCount / m.requestCount : 0,
+        lastRequestAt: m.lastRequestAt?.toISOString() ?? null,
+      })),
+      timestamp: new Date().toISOString(),
+    };
+  });
+}
diff --git a/src/mcpd/src/routes/index.ts b/src/mcpd/src/routes/index.ts
index 26ed752..b402c09 100644
--- a/src/mcpd/src/routes/index.ts
+++ b/src/mcpd/src/routes/index.ts
@@ -5,3 +5,5 @@ export { registerMcpProfileRoutes } from './mcp-profiles.js';
 export { registerProjectRoutes } from './projects.js';
 export { registerInstanceRoutes } from './instances.js';
 export { registerAuditLogRoutes } from './audit-logs.js';
+export { registerHealthMonitoringRoutes } from './health-monitoring.js';
+export type { HealthMonitoringDeps } from './health-monitoring.js';
diff --git a/src/mcpd/src/services/health-aggregator.ts b/src/mcpd/src/services/health-aggregator.ts
new file mode 100644
index 0000000..ca6d502
--- /dev/null
+++ b/src/mcpd/src/services/health-aggregator.ts
@@ -0,0 +1,99 @@
+import type { MetricsCollector, InstanceMetrics } from './metrics-collector.js';
+import type { McpOrchestrator } from './orchestrator.js';
+
+export interface SystemHealth {
+  overallStatus: 'healthy' | 'degraded' | 'unhealthy';
+  totalInstances: number;
+  healthyCount: number;
+  unhealthyCount: number;
+  unknownCount: number;
+  runtimeAvailable: boolean;
+  aggregateErrorRate: number;
+  averageUptime: number;
+  timestamp: string;
+}
+
+export interface InstanceHealth {
+  instanceId: string;
+  status: InstanceMetrics['status'];
+  uptime: number;
+  requestCount: number;
+  errorCount: number;
+  errorRate: number;
+  averageLatencyMs: number;
+  lastRequestAt: string | null;
+}
+
+export class HealthAggregator {
+  constructor(
+    private metricsCollector: MetricsCollector,
+    private orchestrator: McpOrchestrator,
+  ) {}
+
+  async getOverview(): Promise<SystemHealth> {
+    const runtimeAvailable = await this.orchestrator.ping().catch(() => false);
+    const allMetrics = this.metricsCollector.getAllMetrics();
+
+    let healthyCount = 0;
+    let unhealthyCount = 0;
+    let unknownCount = 0;
+    let totalErrors = 0;
+    let totalRequests = 0;
+    let totalUptime = 0;
+
+    for (const m of allMetrics) {
+      switch (m.status) {
+        case 'healthy':
+          healthyCount++;
+          break;
+        case 'unhealthy':
+          unhealthyCount++;
+          break;
+        default:
+          unknownCount++;
+      }
+      totalErrors += m.errorCount;
+      totalRequests += m.requestCount;
+      totalUptime += m.uptime;
+    }
+
+    const totalInstances = allMetrics.length;
+    const aggregateErrorRate = totalRequests > 0 ? totalErrors / totalRequests : 0;
+    const averageUptime = totalInstances > 0 ? totalUptime / totalInstances : 0;
+
+    let overallStatus: SystemHealth['overallStatus'] = 'healthy';
+    if (!runtimeAvailable || (totalInstances > 0 && unhealthyCount === totalInstances)) {
+      overallStatus = 'unhealthy';
+    } else if (unhealthyCount > 0 || unknownCount > 0) {
+      overallStatus = 'degraded';
+    }
+
+    return {
+      overallStatus,
+      totalInstances,
+      healthyCount,
+      unhealthyCount,
+      unknownCount,
+      runtimeAvailable,
+      aggregateErrorRate,
+      averageUptime,
+      timestamp: new Date().toISOString(),
+    };
+  }
+
+  getInstanceHealth(instanceId: string): InstanceHealth | undefined {
+    const m = this.metricsCollector.getMetrics(instanceId);
+    if (!m) return undefined;
+
+    return {
+      instanceId: m.instanceId,
+      status: m.status,
+      uptime: m.uptime,
+      requestCount: m.requestCount,
+      errorCount: m.errorCount,
+      errorRate: this.metricsCollector.getErrorRate(instanceId),
+      averageLatencyMs: this.metricsCollector.getAverageLatency(instanceId),
+      lastRequestAt: m.lastRequestAt?.toISOString() ?? null,
+    };
+  }
+}
diff --git a/src/mcpd/src/services/index.ts b/src/mcpd/src/services/index.ts
index 2bf0cd2..4933389 100644
--- a/src/mcpd/src/services/index.ts
+++ b/src/mcpd/src/services/index.ts
@@ -11,3 +11,7 @@ export { AuditLogService } from './audit-log.service.js';
 export type { AuditLogQueryParams } from './audit-log.service.js';
 export { KubernetesOrchestrator } from './k8s/index.js';
 export type { K8sClientConfig } from './k8s/index.js';
+export { MetricsCollector } from './metrics-collector.js';
+export type { InstanceMetrics } from './metrics-collector.js';
+export { HealthAggregator } from './health-aggregator.js';
+export type { SystemHealth, InstanceHealth } from './health-aggregator.js';
diff --git a/src/mcpd/src/services/metrics-collector.ts b/src/mcpd/src/services/metrics-collector.ts
new file mode 100644
index 0000000..47f1f6a
--- /dev/null
+++ b/src/mcpd/src/services/metrics-collector.ts
@@ -0,0 +1,103 @@
+export interface InstanceMetrics {
+  instanceId: string;
+  status: 'healthy' | 'unhealthy' | 'unknown';
+  uptime: number;
+  requestCount: number;
+  errorCount: number;
+  lastRequestAt: Date | null;
+  latencyMs: number[];
+}
+
+export class MetricsCollector {
+  private metrics = new Map<string, InstanceMetrics>();
+  private startTimes = new Map<string, Date>();
+
+  register(instanceId: string): void {
+    if (!this.metrics.has(instanceId)) {
+      this.metrics.set(instanceId, {
+        instanceId,
+        status: 'unknown',
+        uptime: 0,
+        requestCount: 0,
+        errorCount: 0,
+        lastRequestAt: null,
+        latencyMs: [],
+      });
+      this.startTimes.set(instanceId, new Date());
+    }
+  }
+
+  unregister(instanceId: string): void {
+    this.metrics.delete(instanceId);
+    this.startTimes.delete(instanceId);
+  }
+
+  recordRequest(instanceId: string, latencyMs?: number): void {
+    const m = this.ensureMetrics(instanceId);
+    m.requestCount++;
+    m.lastRequestAt = new Date();
+    if (latencyMs !== undefined) {
+      m.latencyMs.push(latencyMs);
+      // Keep only last 1000 latency samples
+      if (m.latencyMs.length > 1000) {
+        m.latencyMs = m.latencyMs.slice(-1000);
+      }
+    }
+  }
+
+  recordError(instanceId: string): void {
+    const m = this.ensureMetrics(instanceId);
+    m.errorCount++;
+  }
+
+  updateStatus(instanceId: string, status: InstanceMetrics['status']): void {
+    const m = this.ensureMetrics(instanceId);
+    m.status = status;
+  }
+
+  getMetrics(instanceId: string): InstanceMetrics | undefined {
+    const m = this.metrics.get(instanceId);
+    if (!m) return undefined;
+
+    // Compute uptime from start time
+    const start = this.startTimes.get(instanceId);
+    if (start) {
+      m.uptime = (Date.now() - start.getTime()) / 1000;
+    }
+
+    return { ...m, latencyMs: [...m.latencyMs] };
+  }
+
+  getAllMetrics(): InstanceMetrics[] {
+    return [...this.metrics.keys()]
+      .map((id) => this.getMetrics(id))
+      .filter((m): m is InstanceMetrics => m !== undefined);
+  }
+
+  getErrorRate(instanceId: string): number {
+    const m = this.metrics.get(instanceId);
+    if (!m || m.requestCount === 0) return 0;
+    return m.errorCount / m.requestCount;
+  }
+
+  getAverageLatency(instanceId: string): number {
+    const m = this.metrics.get(instanceId);
+    if (!m || m.latencyMs.length === 0) return 0;
+    const sum = m.latencyMs.reduce((a, b) => a + b, 0);
+    return sum / m.latencyMs.length;
+  }
+
+  reset(): void {
+    this.metrics.clear();
+    this.startTimes.clear();
+  }
+
+  private ensureMetrics(instanceId: string): InstanceMetrics {
+    let m = this.metrics.get(instanceId);
+    if (!m) {
+      this.register(instanceId);
+      m = this.metrics.get(instanceId)!;
+    }
+    return m;
+  }
+}
diff --git a/src/mcpd/tests/health-monitoring.test.ts b/src/mcpd/tests/health-monitoring.test.ts
new file mode 100644
index 0000000..fd107b3
--- /dev/null
+++ b/src/mcpd/tests/health-monitoring.test.ts
@@ -0,0 +1,304 @@
+import { describe, it, expect, vi, beforeEach } from 'vitest';
+import Fastify from 'fastify';
+import { MetricsCollector } from '../src/services/metrics-collector.js';
+import { HealthAggregator } from '../src/services/health-aggregator.js';
+import type { McpOrchestrator } from '../src/services/orchestrator.js';
+import { registerHealthMonitoringRoutes } from '../src/routes/health-monitoring.js';
+
+function mockOrchestrator(available = true): McpOrchestrator {
+  return {
+    ping: vi.fn(async () => available),
+    pullImage: vi.fn(async () => {}),
+    createContainer: vi.fn(async () => ({
+      containerId: 'c1', name: 'test', state: 'running' as const, createdAt: new Date(),
+    })),
+    stopContainer: vi.fn(async () => {}),
+    removeContainer: vi.fn(async () => {}),
+    inspectContainer: vi.fn(async () => ({
+      containerId: 'c1', name: 'test', state: 'running' as const, createdAt: new Date(),
+    })),
+    getContainerLogs: vi.fn(async () => ({ stdout: '', stderr: '' })),
+  };
+}
+
+describe('MetricsCollector', () => {
+  let collector: MetricsCollector;
+
+  beforeEach(() => {
+    collector = new MetricsCollector();
+  });
+
+  it('registers and retrieves instance metrics', () => {
+    collector.register('inst-1');
+    const m = collector.getMetrics('inst-1');
+    expect(m).toBeDefined();
+    expect(m!.instanceId).toBe('inst-1');
+    expect(m!.requestCount).toBe(0);
+    expect(m!.errorCount).toBe(0);
+    expect(m!.status).toBe('unknown');
+  });
+
+  it('returns undefined for unregistered instance', () => {
+    expect(collector.getMetrics('nonexistent')).toBeUndefined();
+  });
+
+  it('records requests and increments count', () => {
+    collector.register('inst-1');
+    collector.recordRequest('inst-1', 50);
+    collector.recordRequest('inst-1', 100);
+    const m = collector.getMetrics('inst-1')!;
+    expect(m.requestCount).toBe(2);
+    expect(m.lastRequestAt).not.toBeNull();
+    expect(m.latencyMs).toEqual([50, 100]);
+  });
+
+  it('records errors', () => {
+    collector.register('inst-1');
+    collector.recordError('inst-1');
+    collector.recordError('inst-1');
+    expect(collector.getMetrics('inst-1')!.errorCount).toBe(2);
+  });
+
+  it('auto-registers on recordRequest for unknown instance', () => {
+    collector.recordRequest('new-inst');
+    expect(collector.getMetrics('new-inst')).toBeDefined();
+    expect(collector.getMetrics('new-inst')!.requestCount).toBe(1);
+  });
+
+  it('calculates error rate', () => {
+    collector.register('inst-1');
+    collector.recordRequest('inst-1');
+    collector.recordRequest('inst-1');
+    collector.recordError('inst-1');
+    expect(collector.getErrorRate('inst-1')).toBe(0.5);
+  });
+
+  it('returns 0 error rate when no requests', () => {
+    collector.register('inst-1');
+    expect(collector.getErrorRate('inst-1')).toBe(0);
+  });
+
+  it('calculates average latency', () => {
+    collector.register('inst-1');
+    collector.recordRequest('inst-1', 100);
+    collector.recordRequest('inst-1', 200);
+    collector.recordRequest('inst-1', 300);
+    expect(collector.getAverageLatency('inst-1')).toBe(200);
+  });
+
+  it('returns 0 average latency when no samples', () => {
+    collector.register('inst-1');
+    expect(collector.getAverageLatency('inst-1')).toBe(0);
+  });
+
+  it('updates status', () => {
+    collector.register('inst-1');
+    collector.updateStatus('inst-1', 'healthy');
+    expect(collector.getMetrics('inst-1')!.status).toBe('healthy');
+  });
+
+  it('getAllMetrics returns all registered instances', () => {
+    collector.register('a');
+    collector.register('b');
+    collector.register('c');
+    const all = collector.getAllMetrics();
+    expect(all).toHaveLength(3);
+    expect(all.map((m) => m.instanceId).sort()).toEqual(['a', 'b', 'c']);
+  });
+
+  it('unregister removes instance', () => {
+    collector.register('inst-1');
+    collector.unregister('inst-1');
+    expect(collector.getMetrics('inst-1')).toBeUndefined();
+    expect(collector.getAllMetrics()).toHaveLength(0);
+  });
+
+  it('reset clears all metrics', () => {
+    collector.register('a');
+    collector.register('b');
+    collector.reset();
+    expect(collector.getAllMetrics()).toHaveLength(0);
+  });
+
+  it('computes uptime from registration time', () => {
+    collector.register('inst-1');
+    const m = collector.getMetrics('inst-1')!;
+    expect(m.uptime).toBeGreaterThanOrEqual(0);
+  });
+});
+
+describe('HealthAggregator', () => {
+  let collector: MetricsCollector;
+  let orchestrator: McpOrchestrator;
+  let aggregator: HealthAggregator;
+
+  beforeEach(() => {
+    collector = new MetricsCollector();
+    orchestrator = mockOrchestrator(true);
+    aggregator = new HealthAggregator(collector, orchestrator);
+  });
+
+  it('reports healthy when all instances healthy', async () => {
+    collector.register('a');
+    collector.register('b');
+    collector.updateStatus('a', 'healthy');
+    collector.updateStatus('b', 'healthy');
+
+    const overview = await aggregator.getOverview();
+    expect(overview.overallStatus).toBe('healthy');
+    expect(overview.totalInstances).toBe(2);
+    expect(overview.healthyCount).toBe(2);
+    expect(overview.unhealthyCount).toBe(0);
+    expect(overview.runtimeAvailable).toBe(true);
+  });
+
+  it('reports degraded when some instances unhealthy', async () => {
+    collector.register('a');
+    collector.register('b');
+    collector.updateStatus('a', 'healthy');
+    collector.updateStatus('b', 'unhealthy');
+
+    const overview = await aggregator.getOverview();
+    expect(overview.overallStatus).toBe('degraded');
+    expect(overview.unhealthyCount).toBe(1);
+  });
+
+  it('reports degraded when some instances unknown', async () => {
+    collector.register('a');
+    collector.updateStatus('a', 'healthy');
+    collector.register('b'); // status remains 'unknown'
+
+    const overview = await aggregator.getOverview();
+    expect(overview.overallStatus).toBe('degraded');
+    expect(overview.unknownCount).toBe(1);
+  });
+
+  it('reports unhealthy when all instances unhealthy', async () => {
+    collector.register('a');
+    collector.updateStatus('a', 'unhealthy');
+
+    const overview = await aggregator.getOverview();
+    expect(overview.overallStatus).toBe('unhealthy');
+  });
+
+  it('reports unhealthy when runtime unavailable', async () => {
+    orchestrator = mockOrchestrator(false);
+    aggregator = new HealthAggregator(collector, orchestrator);
+
+    const overview = await aggregator.getOverview();
+    expect(overview.overallStatus).toBe('unhealthy');
+    expect(overview.runtimeAvailable).toBe(false);
+  });
+
+  it('reports healthy with no instances and runtime available', async () => {
+    const overview = await aggregator.getOverview();
+    expect(overview.overallStatus).toBe('healthy');
+    expect(overview.totalInstances).toBe(0);
+  });
+
+  it('computes aggregate error rate', async () => {
+    collector.register('a');
+    collector.recordRequest('a');
+    collector.recordRequest('a');
+    collector.recordError('a');
+
+    const overview = await aggregator.getOverview();
+    expect(overview.aggregateErrorRate).toBe(0.5);
+  });
+
+  it('getInstanceHealth returns instance details', () => {
+    collector.register('inst-1');
+    collector.updateStatus('inst-1', 'healthy');
+    collector.recordRequest('inst-1', 150);
+    collector.recordError('inst-1');
+
+    const health = aggregator.getInstanceHealth('inst-1');
+    expect(health).toBeDefined();
+    expect(health!.instanceId).toBe('inst-1');
+    expect(health!.status).toBe('healthy');
+    expect(health!.requestCount).toBe(1);
+    expect(health!.errorCount).toBe(1);
+    expect(health!.errorRate).toBe(1);
+    expect(health!.averageLatencyMs).toBe(150);
+  });
+
+  it('getInstanceHealth returns undefined for unknown instance', () => {
+    expect(aggregator.getInstanceHealth('nonexistent')).toBeUndefined();
+  });
+});
+
+describe('Health Monitoring Routes', () => {
+  let collector: MetricsCollector;
+
+  beforeEach(() => {
+    collector = new MetricsCollector();
+  });
+
+  async function buildApp(runtimeAvailable = true) {
+    const orchestrator = mockOrchestrator(runtimeAvailable);
+    const aggregator = new HealthAggregator(collector, orchestrator);
+    const app = Fastify();
+    registerHealthMonitoringRoutes(app, {
+      healthAggregator: aggregator,
+      metricsCollector: collector,
+    });
+    return app;
+  }
+
+  it('GET /api/v1/health/overview returns system health', async () => {
+    collector.register('a');
+    collector.updateStatus('a', 'healthy');
+    const app = await buildApp();
+
+    const res = await app.inject({ method: 'GET', url: '/api/v1/health/overview' });
+    expect(res.statusCode).toBe(200);
+    const body = res.json();
+    expect(body.overallStatus).toBe('healthy');
+    expect(body.totalInstances).toBe(1);
+    expect(body.runtimeAvailable).toBe(true);
+    expect(body.timestamp).toBeDefined();
+  });
+
+  it('GET /api/v1/health/instances/:id returns instance health', async () => {
+    collector.register('inst-1');
+    collector.updateStatus('inst-1', 'healthy');
+    collector.recordRequest('inst-1', 42);
+    const app = await buildApp();
+
+    const res = await app.inject({ method: 'GET', url: '/api/v1/health/instances/inst-1' });
+    expect(res.statusCode).toBe(200);
+    const body = res.json();
+    expect(body.instanceId).toBe('inst-1');
+    expect(body.status).toBe('healthy');
+    expect(body.requestCount).toBe(1);
+  });
+
+  it('GET /api/v1/health/instances/:id returns 404 for unknown', async () => {
+    const app = await buildApp();
+
+    const res = await app.inject({ method: 'GET', url: '/api/v1/health/instances/nonexistent' });
+    expect(res.statusCode).toBe(404);
+    expect(res.json().error).toContain('not found');
+  });
+
+  it('GET /api/v1/metrics returns all instance metrics', async () => {
+    collector.register('a');
+    collector.register('b');
+    collector.recordRequest('a');
+    const app = await buildApp();
+
+    const res = await app.inject({ method: 'GET', url: '/api/v1/metrics' });
+    expect(res.statusCode).toBe(200);
+    const body = res.json();
+    expect(body.instances).toHaveLength(2);
+    expect(body.timestamp).toBeDefined();
+  });
+
+  it('GET /api/v1/metrics returns empty with no instances', async () => {
+    const app = await buildApp();
+
+    const res = await app.inject({ method: 'GET', url: '/api/v1/metrics' });
+    expect(res.statusCode).toBe(200);
+    expect(res.json().instances).toEqual([]);
+  });
+});