Merge pull request 'feat: container liveness sync + node-runner slim base' (#15) from feat/container-liveness-sync into main

This commit is contained in:
2026-02-23 00:18:41 +00:00
3 changed files with 52 additions and 2 deletions

View File

@@ -1,7 +1,8 @@
# Base container for npm-based MCP servers (STDIO transport). # Base container for npm-based MCP servers (STDIO transport).
# mcpd uses this image to run `npx -y <packageName>` when a server # mcpd uses this image to run `npx -y <packageName>` when a server
# has packageName but no dockerImage. # has packageName but no dockerImage.
FROM node:20-alpine # Using slim (Debian) instead of alpine for better npm package compatibility.
FROM node:20-slim
WORKDIR /mcp WORKDIR /mcp

View File

@@ -134,9 +134,22 @@ async function main(): Promise<void> {
await app.listen({ port: config.port, host: config.host }); await app.listen({ port: config.port, host: config.host });
app.log.info(`mcpd listening on ${config.host}:${config.port}`); app.log.info(`mcpd listening on ${config.host}:${config.port}`);
// Periodic container liveness sync — detect crashed containers
const SYNC_INTERVAL_MS = 30_000; // 30s
const syncTimer = setInterval(async () => {
try {
await instanceService.syncStatus();
} catch (err) {
app.log.error({ err }, 'Container status sync failed');
}
}, SYNC_INTERVAL_MS);
// Graceful shutdown // Graceful shutdown
setupGracefulShutdown(app, { setupGracefulShutdown(app, {
disconnectDb: () => prisma.$disconnect(), disconnectDb: async () => {
clearInterval(syncTimer);
await prisma.$disconnect();
},
}); });
} }

View File

@@ -36,8 +36,41 @@ export class InstanceService {
return instance; return instance;
} }
/**
* Sync instance statuses with actual container state.
* Detects crashed/stopped containers and marks them ERROR.
*/
async syncStatus(): Promise<void> {
const instances = await this.instanceRepo.findAll();
for (const inst of instances) {
if ((inst.status === 'RUNNING' || inst.status === 'STARTING') && inst.containerId) {
try {
const info = await this.orchestrator.inspectContainer(inst.containerId);
if (info.state === 'stopped' || info.state === 'error') {
// Container died — get last logs for error context
let errorMsg = `Container ${info.state}`;
try {
const logs = await this.orchestrator.getContainerLogs(inst.containerId, { tail: 5 });
const lastLog = (logs.stdout || logs.stderr).trim().split('\n').pop();
if (lastLog) errorMsg = lastLog;
} catch { /* best-effort */ }
await this.instanceRepo.updateStatus(inst.id, 'ERROR', {
metadata: { error: errorMsg },
});
}
} catch {
// Container gone entirely
await this.instanceRepo.updateStatus(inst.id, 'ERROR', {
metadata: { error: 'Container not found' },
});
}
}
}
}
/** /**
* Reconcile instances for a server to match desired replica count. * Reconcile instances for a server to match desired replica count.
* - Syncs container statuses first (detect crashed containers)
* - If fewer running instances than replicas: start new ones * - If fewer running instances than replicas: start new ones
* - If more running instances than replicas: remove excess (oldest first) * - If more running instances than replicas: remove excess (oldest first)
*/ */
@@ -45,6 +78,9 @@ export class InstanceService {
const server = await this.serverRepo.findById(serverId); const server = await this.serverRepo.findById(serverId);
if (!server) throw new NotFoundError(`McpServer '${serverId}' not found`); if (!server) throw new NotFoundError(`McpServer '${serverId}' not found`);
// Sync container statuses before counting active instances
await this.syncStatus();
const instances = await this.instanceRepo.findAll(serverId); const instances = await this.instanceRepo.findAll(serverId);
const active = instances.filter((i) => i.status === 'RUNNING' || i.status === 'STARTING'); const active = instances.filter((i) => i.status === 'RUNNING' || i.status === 'STARTING');
const desired = server.replicas; const desired = server.replicas;