Merge pull request 'feat: container liveness sync + node-runner slim base' (#15) from feat/container-liveness-sync into main

2026-02-23 00:18:41 +00:00
parent fa58c1b5ed d07d4d11dd
commit d38b5aac60
3 changed files with 52 additions and 2 deletions
--- a/deploy/Dockerfile.node-runner
+++ b/deploy/Dockerfile.node-runner
@@ -1,7 +1,8 @@
 # Base container for npm-based MCP servers (STDIO transport).
 # mcpd uses this image to run `npx -y <packageName>` when a server
 # has packageName but no dockerImage.
-FROM node:20-alpine
+# Using slim (Debian) instead of alpine for better npm package compatibility.
+FROM node:20-slim

 WORKDIR /mcp

--- a/src/mcpd/src/main.ts
+++ b/src/mcpd/src/main.ts
@@ -134,9 +134,22 @@ async function main(): Promise<void> {
  await app.listen({ port: config.port, host: config.host });
  app.log.info(`mcpd listening on ${config.host}:${config.port}`);

+  // Periodic container liveness sync — detect crashed containers
+  const SYNC_INTERVAL_MS = 30_000; // 30s
+  const syncTimer = setInterval(async () => {
+    try {
+      await instanceService.syncStatus();
+    } catch (err) {
+      app.log.error({ err }, 'Container status sync failed');
+    }
+  }, SYNC_INTERVAL_MS);
+
  // Graceful shutdown
  setupGracefulShutdown(app, {
-    disconnectDb: () => prisma.$disconnect(),
+    disconnectDb: async () => {
+      clearInterval(syncTimer);
+      await prisma.$disconnect();
+    },
  });
 }

--- a/src/mcpd/src/services/instance.service.ts
+++ b/src/mcpd/src/services/instance.service.ts
@@ -36,8 +36,41 @@ export class InstanceService {
    return instance;
  }

+  /**
+   * Sync instance statuses with actual container state.
+   * RUNNING/STARTING instances whose container is stopped, errored, or cannot be
+   * inspected are marked ERROR, with the last log line as context when available.
+   * Only the inspect call is guarded: a failed status write propagates to the
+   * caller instead of being recorded as 'Container not found' and retried.
+   */
+  async syncStatus(): Promise<void> {
+    const instances = await this.instanceRepo.findAll();
+    for (const inst of instances) {
+      const { containerId } = inst;
+      if ((inst.status !== 'RUNNING' && inst.status !== 'STARTING') || !containerId) continue;
+
+      let state: string | null = null; // stays null when inspect throws (container gone)
+      try {
+        ({ state } = await this.orchestrator.inspectContainer(containerId));
+      } catch { /* container gone entirely */ }
+      if (state !== null && state !== 'stopped' && state !== 'error') continue;
+
+      let errorMsg = state === null ? 'Container not found' : `Container ${state}`;
+      if (state !== null) {
+        // Container died — get last logs for error context
+        try {
+          const logs = await this.orchestrator.getContainerLogs(containerId, { tail: 5 });
+          const lastLog = (logs.stdout || logs.stderr).trim().split('\n').pop();
+          if (lastLog) errorMsg = lastLog;
+        } catch { /* best-effort */ }
+      }
+      await this.instanceRepo.updateStatus(inst.id, 'ERROR', { metadata: { error: errorMsg } });
+    }
+  }
+
  /**
   * Reconcile instances for a server to match desired replica count.
+   * - Syncs container statuses first (detect crashed containers)
   * - If fewer running instances than replicas: start new ones
   * - If more running instances than replicas: remove excess (oldest first)
   */
@@ -45,6 +78,9 @@ export class InstanceService {
    const server = await this.serverRepo.findById(serverId);
    if (!server) throw new NotFoundError(`McpServer '${serverId}' not found`);

+    // Sync container statuses before counting active instances
+    await this.syncStatus();
+
    const instances = await this.instanceRepo.findAll(serverId);
    const active = instances.filter((i) => i.status === 'RUNNING' || i.status === 'STARTING');
    const desired = server.replicas;