feat: container liveness sync + node-runner slim base

- Add syncStatus() to InstanceService: detects crashed/stopped containers,
  marks them ERROR with last log line as context
- Reconcile now syncs container status first (detect dead before counting)
- Add 30s periodic sync loop in main.ts
- Switch node-runner from alpine to slim (Debian) for npm compatibility
  (fixes home-assistant-mcp-server binary not found on Alpine)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Author: Michal
Date:   2026-02-23 00:18:28 +00:00
Commit: 3c489cbecb (parent b85c70bae0)
3 changed files with 52 additions and 2 deletions


@@ -1,7 +1,8 @@
 # Base container for npm-based MCP servers (STDIO transport).
 # mcpd uses this image to run `npx -y <packageName>` when a server
 # has packageName but no dockerImage.
-FROM node:20-alpine
+# Using slim (Debian) instead of alpine for better npm package compatibility.
+FROM node:20-slim
 
 WORKDIR /mcp
 

@@ -134,9 +134,22 @@ async function main(): Promise<void> {
   await app.listen({ port: config.port, host: config.host });
   app.log.info(`mcpd listening on ${config.host}:${config.port}`);
 
+  // Periodic container liveness sync — detect crashed containers
+  const SYNC_INTERVAL_MS = 30_000; // 30s
+  const syncTimer = setInterval(async () => {
+    try {
+      await instanceService.syncStatus();
+    } catch (err) {
+      app.log.error({ err }, 'Container status sync failed');
+    }
+  }, SYNC_INTERVAL_MS);
+
   // Graceful shutdown
   setupGracefulShutdown(app, {
-    disconnectDb: () => prisma.$disconnect(),
+    disconnectDb: async () => {
+      clearInterval(syncTimer);
+      await prisma.$disconnect();
+    },
   });
 }
 
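A note on the loop above (commentary, not part of the commit): `setInterval` fires on schedule even if the previous `syncStatus()` pass has not finished, so a pass slower than 30s can overlap the next one. If that ever becomes a problem, a chained `setTimeout` avoids it; a sketch under that assumption:

// Non-overlapping variant: each pass schedules the next only after it finishes.
function startSyncLoop(run: () => Promise<void>, intervalMs: number): () => void {
  let timer: ReturnType<typeof setTimeout>;
  const tick = async (): Promise<void> => {
    try {
      await run(); // e.g. () => instanceService.syncStatus()
    } catch {
      // swallowed here; run() is expected to log its own failures
    }
    timer = setTimeout(tick, intervalMs);
  };
  timer = setTimeout(tick, intervalMs);
  return () => clearTimeout(timer); // stop hook, e.g. for graceful shutdown
}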


@@ -36,8 +36,41 @@ export class InstanceService {
     return instance;
   }
 
+  /**
+   * Sync instance statuses with actual container state.
+   * Detects crashed/stopped containers and marks them ERROR.
+   */
+  async syncStatus(): Promise<void> {
+    const instances = await this.instanceRepo.findAll();
+    for (const inst of instances) {
+      if ((inst.status === 'RUNNING' || inst.status === 'STARTING') && inst.containerId) {
+        try {
+          const info = await this.orchestrator.inspectContainer(inst.containerId);
+          if (info.state === 'stopped' || info.state === 'error') {
+            // Container died — get last logs for error context
+            let errorMsg = `Container ${info.state}`;
+            try {
+              const logs = await this.orchestrator.getContainerLogs(inst.containerId, { tail: 5 });
+              const lastLog = (logs.stdout || logs.stderr).trim().split('\n').pop();
+              if (lastLog) errorMsg = lastLog;
+            } catch { /* best-effort */ }
+            await this.instanceRepo.updateStatus(inst.id, 'ERROR', {
+              metadata: { error: errorMsg },
+            });
+          }
+        } catch {
+          // Container gone entirely
+          await this.instanceRepo.updateStatus(inst.id, 'ERROR', {
+            metadata: { error: 'Container not found' },
+          });
+        }
+      }
+    }
+  }
+
   /**
    * Reconcile instances for a server to match desired replica count.
+   * - Syncs container statuses first (detect crashed containers)
    * - If fewer running instances than replicas: start new ones
    * - If more running instances than replicas: remove excess (oldest first)
    */
@@ -45,6 +78,9 @@ export class InstanceService {
     const server = await this.serverRepo.findById(serverId);
     if (!server) throw new NotFoundError(`McpServer '${serverId}' not found`);
 
+    // Sync container statuses before counting active instances
+    await this.syncStatus();
+
     const instances = await this.instanceRepo.findAll(serverId);
     const active = instances.filter((i) => i.status === 'RUNNING' || i.status === 'STARTING');
     const desired = server.replicas;
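For readers without the rest of the codebase: this is the orchestrator surface that syncStatus() leans on, reconstructed from the calls above. Only the two method names and the fields actually used in the diff come from the source; the rest (the 'running' variant, the exact option and return types) is guesswork:

interface ContainerInfo {
  // 'running' is assumed; only 'stopped' and 'error' are checked in syncStatus()
  state: 'running' | 'stopped' | 'error';
}

interface ContainerLogs {
  stdout: string;
  stderr: string;
}

interface Orchestrator {
  inspectContainer(containerId: string): Promise<ContainerInfo>;
  getContainerLogs(containerId: string, opts: { tail: number }): Promise<ContainerLogs>;
}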