Merge pull request 'feat: container liveness sync + node-runner slim base' (#15) from feat/container-liveness-sync into main
This commit is contained in:
@@ -1,7 +1,8 @@
|
|||||||
# Base container for npm-based MCP servers (STDIO transport).
|
# Base container for npm-based MCP servers (STDIO transport).
|
||||||
# mcpd uses this image to run `npx -y <packageName>` when a server
|
# mcpd uses this image to run `npx -y <packageName>` when a server
|
||||||
# has packageName but no dockerImage.
|
# has packageName but no dockerImage.
|
||||||
FROM node:20-alpine
|
# Using slim (Debian) instead of alpine for better npm package compatibility.
|
||||||
|
FROM node:20-slim
|
||||||
|
|
||||||
WORKDIR /mcp
|
WORKDIR /mcp
|
||||||
|
|
||||||
|
|||||||
@@ -134,9 +134,22 @@ async function main(): Promise<void> {
|
|||||||
await app.listen({ port: config.port, host: config.host });
|
await app.listen({ port: config.port, host: config.host });
|
||||||
app.log.info(`mcpd listening on ${config.host}:${config.port}`);
|
app.log.info(`mcpd listening on ${config.host}:${config.port}`);
|
||||||
|
|
||||||
|
// Periodic container liveness sync — detect crashed containers
|
||||||
|
const SYNC_INTERVAL_MS = 30_000; // 30s
|
||||||
|
const syncTimer = setInterval(async () => {
|
||||||
|
try {
|
||||||
|
await instanceService.syncStatus();
|
||||||
|
} catch (err) {
|
||||||
|
app.log.error({ err }, 'Container status sync failed');
|
||||||
|
}
|
||||||
|
}, SYNC_INTERVAL_MS);
|
||||||
|
|
||||||
// Graceful shutdown
|
// Graceful shutdown
|
||||||
setupGracefulShutdown(app, {
|
setupGracefulShutdown(app, {
|
||||||
disconnectDb: () => prisma.$disconnect(),
|
disconnectDb: async () => {
|
||||||
|
clearInterval(syncTimer);
|
||||||
|
await prisma.$disconnect();
|
||||||
|
},
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -36,8 +36,41 @@ export class InstanceService {
|
|||||||
return instance;
|
return instance;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Sync instance statuses with actual container state.
 *
 * Every instance recorded as RUNNING or STARTING that has a container id is
 * inspected through the orchestrator. When the container reports `stopped`
 * or `error`, or when inspection itself fails, the instance is marked ERROR
 * and a short message is stored under `metadata.error`.
 *
 * Instances are processed sequentially. A failure of `updateStatus` is not
 * swallowed: it rejects the returned promise.
 *
 * NOTE(review): any inspection failure is reported as 'Container not found',
 * which presumably also covers transient orchestrator errors — confirm
 * whether those should be told apart from a container that is really gone.
 */
async syncStatus(): Promise<void> {
  const instances = await this.instanceRepo.findAll();

  for (const inst of instances) {
    const isActive = inst.status === 'RUNNING' || inst.status === 'STARTING';
    if (!isActive || !inst.containerId) continue;

    // Stays undefined while the container looks healthy.
    let errorMsg: string | undefined;

    try {
      const info = await this.orchestrator.inspectContainer(inst.containerId);
      if (info.state === 'stopped' || info.state === 'error') {
        // Container died — get last logs for error context
        errorMsg = `Container ${info.state}`;
        try {
          const logs = await this.orchestrator.getContainerLogs(inst.containerId, { tail: 5 });
          // Use the first stream that has visible content, so whitespace-only
          // stdout no longer hides what was written to stderr.
          const lastLog = [logs.stdout, logs.stderr]
            .map((stream) => (stream ?? '').trim())
            .find((text) => text !== '')
            ?.split('\n')
            .pop();
          if (lastLog) errorMsg = lastLog;
        } catch { /* best-effort */ }
      }
    } catch {
      // Container gone entirely
      errorMsg = 'Container not found';
    }

    // The write happens outside the try above so that a persistence failure
    // is not mislabelled as a missing container and retried with that message.
    if (errorMsg !== undefined) {
      await this.instanceRepo.updateStatus(inst.id, 'ERROR', {
        metadata: { error: errorMsg },
      });
    }
  }
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Reconcile instances for a server to match desired replica count.
|
* Reconcile instances for a server to match desired replica count.
|
||||||
|
* - Syncs container statuses first (detect crashed containers)
|
||||||
* - If fewer running instances than replicas: start new ones
|
* - If fewer running instances than replicas: start new ones
|
||||||
* - If more running instances than replicas: remove excess (oldest first)
|
* - If more running instances than replicas: remove excess (oldest first)
|
||||||
*/
|
*/
|
||||||
@@ -45,6 +78,9 @@ export class InstanceService {
|
|||||||
const server = await this.serverRepo.findById(serverId);
|
const server = await this.serverRepo.findById(serverId);
|
||||||
if (!server) throw new NotFoundError(`McpServer '${serverId}' not found`);
|
if (!server) throw new NotFoundError(`McpServer '${serverId}' not found`);
|
||||||
|
|
||||||
|
// Sync container statuses before counting active instances
|
||||||
|
await this.syncStatus();
|
||||||
|
|
||||||
const instances = await this.instanceRepo.findAll(serverId);
|
const instances = await this.instanceRepo.findAll(serverId);
|
||||||
const active = instances.filter((i) => i.status === 'RUNNING' || i.status === 'STARTING');
|
const active = instances.filter((i) => i.status === 'RUNNING' || i.status === 'STARTING');
|
||||||
const desired = server.replicas;
|
const desired = server.replicas;
|
||||||
|
|||||||
Reference in New Issue
Block a user