Merge pull request 'feat: container liveness sync + node-runner slim base' (#15) from feat/container-liveness-sync into main
This commit was merged in pull request #15.
This commit is contained in:
@@ -1,7 +1,8 @@
|
||||
# Base container for npm-based MCP servers (STDIO transport).
|
||||
# mcpd uses this image to run `npx -y <packageName>` when a server
|
||||
# has packageName but no dockerImage.
|
||||
FROM node:20-alpine
|
||||
# Using slim (Debian) instead of alpine for better npm package compatibility.
|
||||
FROM node:20-slim
|
||||
|
||||
WORKDIR /mcp
|
||||
|
||||
|
||||
@@ -134,9 +134,22 @@ async function main(): Promise<void> {
|
||||
await app.listen({ port: config.port, host: config.host });
|
||||
app.log.info(`mcpd listening on ${config.host}:${config.port}`);
|
||||
|
||||
// Periodic container liveness sync — detect crashed containers
|
||||
const SYNC_INTERVAL_MS = 30_000; // 30s
|
||||
const syncTimer = setInterval(async () => {
|
||||
try {
|
||||
await instanceService.syncStatus();
|
||||
} catch (err) {
|
||||
app.log.error({ err }, 'Container status sync failed');
|
||||
}
|
||||
}, SYNC_INTERVAL_MS);
|
||||
|
||||
// Graceful shutdown
|
||||
setupGracefulShutdown(app, {
|
||||
disconnectDb: () => prisma.$disconnect(),
|
||||
disconnectDb: async () => {
|
||||
clearInterval(syncTimer);
|
||||
await prisma.$disconnect();
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -36,8 +36,41 @@ export class InstanceService {
|
||||
return instance;
|
||||
}
|
||||
|
||||
/**
 * Sync instance statuses with actual container state.
 *
 * Every instance recorded as RUNNING or STARTING that has a containerId is
 * inspected through the orchestrator. The instance is marked ERROR when:
 *   - the container reports state `stopped` or `error` (the last log line,
 *     if one can be fetched, is stored as the error message), or
 *   - the inspect call itself throws (stored as 'Container not found').
 *
 * Failures of `instanceRepo.updateStatus` are NOT swallowed: they propagate
 * to the caller instead of being misreported as a missing container.
 */
async syncStatus(): Promise<void> {
  const instances = await this.instanceRepo.findAll();

  for (const inst of instances) {
    const isActive = inst.status === 'RUNNING' || inst.status === 'STARTING';
    if (!isActive || !inst.containerId) continue;

    // Only the inspect call is guarded: a failure here is what "container
    // gone" means. Keeping the status write outside this try prevents a
    // persistence error from being relabelled as 'Container not found'.
    let info;
    try {
      info = await this.orchestrator.inspectContainer(inst.containerId);
    } catch {
      // Container gone entirely
      await this.instanceRepo.updateStatus(inst.id, 'ERROR', {
        metadata: { error: 'Container not found' },
      });
      continue;
    }

    if (info.state !== 'stopped' && info.state !== 'error') continue;

    // Container died — get last logs for error context
    let errorMsg = `Container ${info.state}`;
    try {
      const logs = await this.orchestrator.getContainerLogs(inst.containerId, { tail: 5 });
      const lastLog = (logs.stdout || logs.stderr).trim().split('\n').pop();
      if (lastLog) errorMsg = lastLog;
    } catch {
      /* best-effort: fall back to the generic state message */
    }

    await this.instanceRepo.updateStatus(inst.id, 'ERROR', {
      metadata: { error: errorMsg },
    });
  }
}
|
||||
|
||||
/**
|
||||
* Reconcile instances for a server to match desired replica count.
|
||||
* - Syncs container statuses first (detect crashed containers)
|
||||
* - If fewer running instances than replicas: start new ones
|
||||
* - If more running instances than replicas: remove excess (oldest first)
|
||||
*/
|
||||
@@ -45,6 +78,9 @@ export class InstanceService {
|
||||
const server = await this.serverRepo.findById(serverId);
|
||||
if (!server) throw new NotFoundError(`McpServer '${serverId}' not found`);
|
||||
|
||||
// Sync container statuses before counting active instances
|
||||
await this.syncStatus();
|
||||
|
||||
const instances = await this.instanceRepo.findAll(serverId);
|
||||
const active = instances.filter((i) => i.status === 'RUNNING' || i.status === 'STARTING');
|
||||
const desired = server.replicas;
|
||||
|
||||
Reference in New Issue
Block a user