From 1f0be8a5c13541994e05d51cef15cb4107a253fa Mon Sep 17 00:00:00 2001 From: Michal Date: Sat, 25 Apr 2026 23:53:19 +0100 Subject: [PATCH] fix(agents): close gaps from /gstack-review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P1 — thread reads now enforce ownership ======================================== chat.service.ts / routes/agent-chat.ts GET /api/v1/threads/:id/messages was previously RBAC-mapped to view:agents (no resourceName scope) with the route comment promising "service-level owner check enforces fine-grained access" — but the service didn't actually check. Any caller with view:agents could read another user's thread by guessing/learning the threadId. CUIDs are hard to brute-force but they leak: SSE `final` chunks, agents-plugin `_meta.threadId`, and several response bodies surface them. Now ChatService.listMessages(threadId, ownerId) loads the thread, returns 404 (not 403, to avoid id-enumeration via differential status codes) if ownerId doesn't match. Regression test in chat-service.test.ts covers Alice/Bob isolation + nonexistent-thread same-shape 404. P2 — AgentChatRequestSchema strict mode ======================================== validation/agent.schema.ts `.merge()` does NOT inherit `.strict()` from AgentChatParamsSchema. Typo'd fields (e.g. `temprature`) silently fell through and the agent silently used the default — debuggable only by reading the LLM call payload. Re-applied `.strict()` on the merged schema. P2 — per-agent maxIterations override + clamp ============================================== chat.service.ts Loop cap was a hard-coded module constant (12), wrong for both research-style agents (need higher) and cheap-probe agents (could opt lower). Now reads `agent.extras.maxIterations`, clamps 1..50, falls back to 12 default. The clamp is the soft-DoS guard: a hostile agent definition with `maxIterations:1000000` can't burn unbounded LLM calls per request. Both chat() and chatStream() use ctx.maxIterations now. Regression test covers low-cap override (rejects with `exceeded 2`) and hostile-value clamp (rejects with `exceeded 50`). P3 — SSE write to closed socket ================================ routes/agent-chat.ts When the upstream adapter throws after some chunks were already written AND the client disconnected, the catch block tried to flush more chunks to a closed socket. Without an `on('error')` handler Node emits unhandled error events; once Pino is wired to alerts this'd page on every disconnect-mid-stream. writeSseChunk now checks `reply.raw.destroyed || writableEnded` before write. P3 — BACKEND_TOKEN_DEAD preserves original stack ================================================= services/secret-backend-rotator.service.ts When wrapping mintRoleToken/lookupSelf failures as BACKEND_TOKEN_DEAD, the new Error() discarded the original throw — hard to tell whether the inner failure was a network blip vs an OpenBao API mismatch vs DNS. Now uses `new Error(msg, { cause: err })` so the inner stack survives. P3 — .gitignore .claude/scheduled_tasks.lock ============================================= This persisted state file was leaking into every `git status`. Tests ===== mcpd 761/761 (+2 regression tests). mcplocal 715/715. cli 430/430. Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 3 + src/mcpd/src/routes/agent-chat.ts | 8 +- src/mcpd/src/services/chat.service.ts | 41 +++++++-- .../secret-backend-rotator.service.ts | 6 +- src/mcpd/src/validation/agent.schema.ts | 6 +- src/mcpd/tests/chat-service.test.ts | 87 +++++++++++++++++++ 6 files changed, 143 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index e1267fc..4cf4127 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,6 @@ logs.sh mcpctl-backup.json a.yaml test-mcp.sh + +# Claude Code local state +.claude/scheduled_tasks.lock diff --git a/src/mcpd/src/routes/agent-chat.ts b/src/mcpd/src/routes/agent-chat.ts index ff154cf..0a83579 100644 --- a/src/mcpd/src/routes/agent-chat.ts +++ b/src/mcpd/src/routes/agent-chat.ts @@ -126,8 +126,9 @@ export function registerAgentChatRoutes( app.get<{ Params: { id: string } }>( '/api/v1/threads/:id/messages', async (request, reply) => { + const ownerId = request.userId ?? 'system'; try { - return await chat.listMessages(request.params.id); + return await chat.listMessages(request.params.id, ownerId); } catch (err) { if (err instanceof NotFoundError) { reply.code(404); @@ -140,5 +141,10 @@ export function registerAgentChatRoutes( } function writeSseChunk(reply: FastifyReply, data: string): void { + // Guard against writing to a destroyed/closed socket — happens when the + // client disconnects mid-stream and we still try to flush an error payload + // from the catch path. Without the guard, Node emits an unhandled 'error' + // on the response and the noise pollutes logs. + if (reply.raw.destroyed || reply.raw.writableEnded) return; reply.raw.write(`data: ${data}\n\n`); } diff --git a/src/mcpd/src/services/chat.service.ts b/src/mcpd/src/services/chat.service.ts index 80ba953..5c9bf7c 100644 --- a/src/mcpd/src/services/chat.service.ts +++ b/src/mcpd/src/services/chat.service.ts @@ -35,7 +35,27 @@ import type { AgentChatParams } from '../validation/agent.schema.js'; import { NotFoundError } from './mcp-server.service.js'; export const TOOL_NAME_SEPARATOR = '__'; +/** Default tool-loop cap. Per-agent override via `agent.extras.maxIterations`, clamped to MIN..MAX. */ export const MAX_ITERATIONS = 12; +const MIN_ITERATIONS_CAP = 1; +const MAX_ITERATIONS_CAP = 50; + +/** + * Resolve the loop cap for this turn: + * agent.extras.maxIterations → clamp(MIN_ITERATIONS_CAP, MAX_ITERATIONS_CAP) → + * fallback to MAX_ITERATIONS default. + * + * The clamp is the soft-DoS guard: a hostile agent definition can't pick a + * thousand-iteration cap, even with `create:agents` permission. + */ +function resolveMaxIterations(extras: Record | null | undefined): number { + const raw = extras?.['maxIterations']; + if (typeof raw !== 'number' || !Number.isFinite(raw)) return MAX_ITERATIONS; + const truncated = Math.trunc(raw); + if (truncated < MIN_ITERATIONS_CAP) return MIN_ITERATIONS_CAP; + if (truncated > MAX_ITERATIONS_CAP) return MAX_ITERATIONS_CAP; + return truncated; +} /** Project-scoped tool surface the chat loop calls into. Stub-friendly. */ export interface ChatTool { @@ -110,7 +130,16 @@ export class ChatService { return rows.map((r) => ({ id: r.id, title: r.title, lastTurnAt: r.lastTurnAt, createdAt: r.createdAt })); } - async listMessages(threadId: string): Promise { + async listMessages(threadId: string, ownerId: string): Promise { + // Owner check guards `view:agents` from leaking another user's thread by + // ID. Thread IDs are CUIDs (hard to guess) but they leak through SSE + // `final` chunks, the agents-plugin tool _meta, and several response + // bodies, so id-knowledge is not a security boundary on its own. Return + // 404 (not 403) on mismatch to avoid id-enumeration via differential + // status codes. + const thread = await this.chatRepo.findThread(threadId); + if (thread === null) throw new NotFoundError(`Thread not found: ${threadId}`); + if (thread.ownerId !== ownerId) throw new NotFoundError(`Thread not found: ${threadId}`); return this.chatRepo.listMessages(threadId); } @@ -120,7 +149,7 @@ export class ChatService { let assistantFinal = ''; let lastTurnIndex = ctx.startingTurnIndex; try { - for (let i = 0; i < MAX_ITERATIONS; i += 1) { + for (let i = 0; i < ctx.maxIterations; i += 1) { const adapter = this.adapters.get(ctx.llmType); const result = await adapter.infer({ body: this.buildBody(ctx), @@ -179,7 +208,7 @@ export class ChatService { await this.chatRepo.touchThread(ctx.threadId); return { threadId: ctx.threadId, assistant: assistantFinal, turnIndex: lastTurnIndex }; } - throw new Error(`Chat loop exceeded ${String(MAX_ITERATIONS)} iterations without a terminal turn`); + throw new Error(`Chat loop exceeded ${String(ctx.maxIterations)} iterations without a terminal turn`); } catch (err) { await this.chatRepo.markPendingAsError(ctx.threadId); throw err; @@ -190,7 +219,7 @@ export class ChatService { async *chatStream(args: ChatRequestArgs): AsyncGenerator { const ctx = await this.prepareContext(args); try { - for (let i = 0; i < MAX_ITERATIONS; i += 1) { + for (let i = 0; i < ctx.maxIterations; i += 1) { const adapter = this.adapters.get(ctx.llmType); const accumulated: { content: string; toolCalls: Array<{ id: string; name: string; argumentsJson: string }> } = { content: '', @@ -285,7 +314,7 @@ export class ChatService { yield { type: 'final', threadId: ctx.threadId, turnIndex: finalMsg.turnIndex }; return; } - throw new Error(`Chat loop exceeded ${String(MAX_ITERATIONS)} iterations without a terminal turn`); + throw new Error(`Chat loop exceeded ${String(ctx.maxIterations)} iterations without a terminal turn`); } catch (err) { await this.chatRepo.markPendingAsError(ctx.threadId); yield { type: 'error', message: (err as Error).message }; @@ -306,6 +335,7 @@ export class ChatService { toolList: ChatTool[]; projectId: string | null; startingTurnIndex: number; + maxIterations: number; }> { const agent = await this.agents.getByName(args.agentName); const llm = await this.llms.getByName(agent.llm.name); @@ -370,6 +400,7 @@ export class ChatService { toolList: filteredTools, projectId, startingTurnIndex, + maxIterations: resolveMaxIterations(agent.extras), }; } diff --git a/src/mcpd/src/services/secret-backend-rotator.service.ts b/src/mcpd/src/services/secret-backend-rotator.service.ts index f041cad..0a8ed11 100644 --- a/src/mcpd/src/services/secret-backend-rotator.service.ts +++ b/src/mcpd/src/services/secret-backend-rotator.service.ts @@ -136,7 +136,11 @@ export class SecretBackendRotator { `This is unrecoverable from inside mcpd — likely cause: OpenBao was re-initialized and all old tokens are invalid. ` + `Remediation: mint a fresh token under role '${cfg.rotation.tokenRole}' using a working OpenBao admin token, ` + `then \`mcpctl create secret ${cfg.tokenSecretRef.name} --data ${cfg.tokenSecretRef.key}= --force\`. ` + - `Original error: ${msg}`) + `Original error: ${msg}`, + // Preserve the original stack trace via Error.cause so the inner + // failure (network vs OpenBao API vs DNS) is recoverable from the + // wrapped throw. + { cause: err }) : err; const wrappedMsg = wrapped instanceof Error ? wrapped.message : String(wrapped); await this.recordError(backendId, meta, wrappedMsg); diff --git a/src/mcpd/src/validation/agent.schema.ts b/src/mcpd/src/validation/agent.schema.ts index 651bd39..27c4de7 100644 --- a/src/mcpd/src/validation/agent.schema.ts +++ b/src/mcpd/src/validation/agent.schema.ts @@ -90,6 +90,10 @@ export const UpdateAgentSchema = z.object({ }); /** Body schema for `POST /api/v1/agents/:name/chat`. */ +// `.merge()` does NOT inherit `.strict()` from `AgentChatParamsSchema`, so we +// re-apply it on the merged schema. Without this, typo'd request fields (e.g. +// `temprature` instead of `temperature`) silently fall through and the agent +// uses the default — debuggable only by reading the LLM call payload. export const AgentChatRequestSchema = AgentChatParamsSchema.merge( z.object({ threadId: z.string().min(1).optional(), @@ -105,7 +109,7 @@ export const AgentChatRequestSchema = AgentChatParamsSchema.merge( .optional(), stream: z.boolean().optional(), }), -).refine((v) => v.message !== undefined || (v.messages?.length ?? 0) > 0, { +).strict().refine((v) => v.message !== undefined || (v.messages?.length ?? 0) > 0, { message: 'Either `message` or `messages` is required', }); diff --git a/src/mcpd/tests/chat-service.test.ts b/src/mcpd/tests/chat-service.test.ts index 2b6170b..80fd58c 100644 --- a/src/mcpd/tests/chat-service.test.ts +++ b/src/mcpd/tests/chat-service.test.ts @@ -410,4 +410,91 @@ describe('ChatService', () => { expect(ctx.body.tools).toHaveLength(1); expect(ctx.body.tools?.[0]?.function.name).toBe(`s1${TOOL_NAME_SEPARATOR}a`); }); + + // Regression: per-agent maxIterations override + clamp. + // Found by /gstack-review on 2026-04-25. + // Without the clamp, a hostile agent definition with `extras.maxIterations:1000000` + // could spin the loop into a near-infinite tool-call burn. + it('per-agent extras.maxIterations clamps below default and refuses absurd values', async () => { + const chatRepo = mockChatRepo(); + const tools = mockTools({ + listTools: vi.fn(async () => [{ + name: `g${TOOL_NAME_SEPARATOR}t`, description: '', parameters: {}, + }]), + callTool: vi.fn(async () => ({})), + }); + // Agent with maxIterations=2 — only 2 tool-call rounds allowed before bail. + const agentsLowCap = { + getByName: vi.fn(async () => ({ + id: 'agent-low', name: 'low', description: '', systemPrompt: '', + llm: { id: 'llm-1', name: 'qwen3-thinking' }, + project: { id: 'proj-1', name: 'mcpctl-dev' }, + proxyModelName: null, defaultParams: {}, + extras: { maxIterations: 2 }, + ownerId: 'owner-1', version: 1, createdAt: NOW, updatedAt: NOW, + })), + } as unknown as AgentService; + const adapter = scriptedAdapter([toolCall(`g${TOOL_NAME_SEPARATOR}t`, {})]); + const svc = new ChatService( + agentsLowCap, mockLlms(), adapterRegistry(adapter), + chatRepo, mockPromptRepo(), tools, + ); + await expect(svc.chat({ + agentName: 'low', userMessage: 'spin', ownerId: 'owner-1', + })).rejects.toThrow(/exceeded 2 iterations/); + + // Hostile agent with maxIterations=1000000 — must clamp to 50, not iterate forever. + const agentsHostile = { + getByName: vi.fn(async () => ({ + id: 'agent-bad', name: 'bad', description: '', systemPrompt: '', + llm: { id: 'llm-1', name: 'qwen3-thinking' }, + project: { id: 'proj-1', name: 'mcpctl-dev' }, + proxyModelName: null, defaultParams: {}, + extras: { maxIterations: 1_000_000 }, + ownerId: 'owner-1', version: 1, createdAt: NOW, updatedAt: NOW, + })), + } as unknown as AgentService; + const adapter2 = scriptedAdapter([toolCall(`g${TOOL_NAME_SEPARATOR}t`, {})]); + const chatRepo2 = mockChatRepo(); + const svc2 = new ChatService( + agentsHostile, mockLlms(), adapterRegistry(adapter2), + chatRepo2, mockPromptRepo(), tools, + ); + await expect(svc2.chat({ + agentName: 'bad', userMessage: 'spin', ownerId: 'owner-1', + })).rejects.toThrow(/exceeded 50 iterations/); + }); + + // Regression: thread message reads must enforce ownership. + // Found by /gstack-review on 2026-04-25. + // Without this, any caller with `view:agents` could read another user's thread + // by guessing/learning the threadId (CUIDs leak through SSE chunks + tool _meta). + it('listMessages refuses a thread owned by another user (404, not 403, to avoid id-enumeration)', async () => { + const chatRepo = mockChatRepo(); + // Pre-seed a thread owned by 'alice' + await chatRepo.createThread({ agentId: 'agent-x', ownerId: 'alice' }); + const aliceThread = chatRepo._threads[0]!; + await chatRepo.appendMessage({ + threadId: aliceThread.id, + role: 'user', + content: 'private to alice', + }); + + const svc = new ChatService( + mockAgents(), mockLlms(), adapterRegistry(scriptedAdapter([chatCompletion('ok')])), + chatRepo, mockPromptRepo(), mockTools(), + ); + + // Bob requests Alice's thread by id — must 404. + await expect(svc.listMessages(aliceThread.id, 'bob')) + .rejects.toThrow(/not found/i); + + // Alice gets her own messages. + const aliceMessages = await svc.listMessages(aliceThread.id, 'alice'); + expect(aliceMessages.map((m) => m.content)).toEqual(['private to alice']); + + // Genuinely missing thread — same 404 shape (no oracle leak). + await expect(svc.listMessages('cnonexistent000000000000000', 'alice')) + .rejects.toThrow(/not found/i); + }); });