From d07d4d11dddd640b0080151249f4126f8094f2d1 Mon Sep 17 00:00:00 2001 From: Michal Date: Mon, 23 Feb 2026 00:18:28 +0000 Subject: [PATCH] feat: container liveness sync + node-runner slim base - Add syncStatus() to InstanceService: detects crashed/stopped containers, marks them ERROR with last log line as context - Reconcile now syncs container status first (detect dead before counting) - Add 30s periodic sync loop in main.ts - Switch node-runner from alpine to slim (Debian) for npm compatibility (fixes home-assistant-mcp-server binary not found on Alpine) Co-Authored-By: Claude Opus 4.6 --- deploy/Dockerfile.node-runner | 3 +- src/mcpd/src/main.ts | 15 +++++++++- src/mcpd/src/services/instance.service.ts | 36 +++++++++++++++++++++++ 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/deploy/Dockerfile.node-runner b/deploy/Dockerfile.node-runner index 91be2fb..b5d4a40 100644 --- a/deploy/Dockerfile.node-runner +++ b/deploy/Dockerfile.node-runner @@ -1,7 +1,8 @@ # Base container for npm-based MCP servers (STDIO transport). # mcpd uses this image to run `npx -y ` when a server # has packageName but no dockerImage. -FROM node:20-alpine +# Using slim (Debian) instead of alpine for better npm package compatibility. +FROM node:20-slim WORKDIR /mcp diff --git a/src/mcpd/src/main.ts b/src/mcpd/src/main.ts index 629910e..e407a96 100644 --- a/src/mcpd/src/main.ts +++ b/src/mcpd/src/main.ts @@ -134,9 +134,22 @@ async function main(): Promise { await app.listen({ port: config.port, host: config.host }); app.log.info(`mcpd listening on ${config.host}:${config.port}`); + // Periodic container liveness sync — detect crashed containers + const SYNC_INTERVAL_MS = 30_000; // 30s + const syncTimer = setInterval(async () => { + try { + await instanceService.syncStatus(); + } catch (err) { + app.log.error({ err }, 'Container status sync failed'); + } + }, SYNC_INTERVAL_MS); + // Graceful shutdown setupGracefulShutdown(app, { - disconnectDb: () => prisma.$disconnect(), + disconnectDb: async () => { + clearInterval(syncTimer); + await prisma.$disconnect(); + }, }); } diff --git a/src/mcpd/src/services/instance.service.ts b/src/mcpd/src/services/instance.service.ts index 1e55d42..0d28b11 100644 --- a/src/mcpd/src/services/instance.service.ts +++ b/src/mcpd/src/services/instance.service.ts @@ -36,8 +36,41 @@ export class InstanceService { return instance; } + /** + * Sync instance statuses with actual container state. + * Detects crashed/stopped containers and marks them ERROR. + */ + async syncStatus(): Promise { + const instances = await this.instanceRepo.findAll(); + for (const inst of instances) { + if ((inst.status === 'RUNNING' || inst.status === 'STARTING') && inst.containerId) { + try { + const info = await this.orchestrator.inspectContainer(inst.containerId); + if (info.state === 'stopped' || info.state === 'error') { + // Container died — get last logs for error context + let errorMsg = `Container ${info.state}`; + try { + const logs = await this.orchestrator.getContainerLogs(inst.containerId, { tail: 5 }); + const lastLog = (logs.stdout || logs.stderr).trim().split('\n').pop(); + if (lastLog) errorMsg = lastLog; + } catch { /* best-effort */ } + await this.instanceRepo.updateStatus(inst.id, 'ERROR', { + metadata: { error: errorMsg }, + }); + } + } catch { + // Container gone entirely + await this.instanceRepo.updateStatus(inst.id, 'ERROR', { + metadata: { error: 'Container not found' }, + }); + } + } + } + } + /** * Reconcile instances for a server to match desired replica count. + * - Syncs container statuses first (detect crashed containers) * - If fewer running instances than replicas: start new ones * - If more running instances than replicas: remove excess (oldest first) */ @@ -45,6 +78,9 @@ export class InstanceService { const server = await this.serverRepo.findById(serverId); if (!server) throw new NotFoundError(`McpServer '${serverId}' not found`); + // Sync container statuses before counting active instances + await this.syncStatus(); + const instances = await this.instanceRepo.findAll(serverId); const active = instances.filter((i) => i.status === 'RUNNING' || i.status === 'STARTING'); const desired = server.replicas;