From 610808b9e79ac4d8c9f0ba2ccf64994b14e1da83 Mon Sep 17 00:00:00 2001
From: Michal
Date: Mon, 27 Apr 2026 18:39:01 +0100
Subject: [PATCH] fix(chat): real fixes for thinking-model + URL conventions,
 not test tweaks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Five real bugs surfaced by the agent-chat smoke against live qwen3-thinking.
None of these are fixed by changing the test — the test was right to fail.

1. openai-passthrough adapter doubled `/v1` in the request URL. The adapter
   hard-codes `/v1/chat/completions` after the configured base, but every
   OpenAI-compat provider documents its base URL with a trailing `/v1`
   (api.openai.com/v1, llm.example.com/v1, …). Users pasting that
   conventional shape produced `https://x/v1/v1/chat/completions` → 404.
   endpointUrl now strips a trailing `/v1` so both forms canonicalize; a
   more specific suffix such as `/v1beta` is preserved (see the sketch at
   the end of this message).

2. Non-streaming chat returned an empty assistant message when thinking
   models (qwen3-thinking, deepseek-reasoner, OpenAI o1) emitted only
   `reasoning_content` with `content: null`. extractChoice now also pulls
   reasoning (every spelling the streaming parser already knows about), and
   a new pickAssistantText helper falls back to it when content is empty.
   A `[response truncated by max_tokens]` marker is appended when
   finish_reason is `length`, so users see the cut-off instead of guessing
   why the answer is short.

   Symmetric streaming fix: the chatStream loop accumulates reasoning and
   yields ONE synthesized `text` frame at the end when content stayed
   empty, keeping the CLI's stdout (which only prints `text` deltas) in
   sync with the persisted thread message.

3. `mcpctl get agent X -o yaml` emitted `kind: public` (the v3 lifecycle
   field) instead of `kind: agent` (apply envelope), so round-tripping
   through `apply -f` failed. Same fix shape as the v1 Llm strip in
   toApplyDocs — drop kind/status/lastHeartbeatAt/inactiveSince/
   providerSessionId for the agents resource too.

4. Non-streaming `mcpctl chat` printed `thread:` (no space) on stderr;
   streaming printed `(thread: )` (with space). Tests and any other regex
   watching for one form missed the other. Standardize on `thread: `
   (single space) in both paths.

5. agent-chat.smoke's `run()` used `execSync`, which discards stderr on
   success — making any `expect(stderr).toMatch(...)` assertion
   structurally impossible to satisfy in the happy path. Switch to
   `spawnSync` so stderr is actually captured. Includes a small shell-style
   argv splitter so the existing call sites with quoted multi-word values
   (`--system-prompt "..."`) keep working.

Tests: +6 new mcpd unit tests (4 chat-service for the reasoning fallback /
truncation marker / content-preference / streaming synth; 2 llm-adapters
for the URL strip + /v1beta preservation). Full mcpd + mcplocal + smoke
green: 860/860 + 723/723 + 139/139.
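For reference, the normalization rule in (1), written here as a standalone
TypeScript sketch of the same two-step replace endpointUrl now applies;
`canonicalBase` is illustrative only, not a helper that exists in the
codebase:

    // strip trailing slashes, then a trailing `/v1` (but not `/v1beta`)
    const canonicalBase = (url: string): string =>
      url.replace(/\/+$/, '').replace(/\/v1$/, '');

    canonicalBase('https://llm.example.com/v1');     // https://llm.example.com
    canonicalBase('https://llm.example.com/v1/');    // https://llm.example.com
    canonicalBase('https://api.example.com/v1beta'); // https://api.example.com/v1beta

The adapter then appends `/v1/chat/completions` to the canonical base, so
both user-pasted shapes hit the same endpoint.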
---
 src/cli/src/commands/chat.ts                   |   5 +-
 src/cli/src/commands/get.ts                    |  15 ++-
 src/mcpd/src/services/chat.service.ts          | 101 +++++++++++++--
 .../llm/adapters/openai-passthrough.ts         |  10 +-
 src/mcpd/tests/chat-service.test.ts            | 115 ++++++++++++++++++
 src/mcpd/tests/llm-adapters.test.ts            |  30 +++++
 .../tests/smoke/agent-chat.smoke.test.ts       |  46 ++++---
 7 files changed, 293 insertions(+), 29 deletions(-)

diff --git a/src/cli/src/commands/chat.ts b/src/cli/src/commands/chat.ts
index edece9c..f0772f1 100644
--- a/src/cli/src/commands/chat.ts
+++ b/src/cli/src/commands/chat.ts
@@ -151,7 +151,10 @@ async function runOneShot(
     const sec = Math.max(0.05, (Date.now() - startMs) / 1000);
     const words = (res.assistant.match(/\S+/g) ?? []).length;
     process.stdout.write(`${res.assistant}\n`);
-    process.stderr.write(styleStats(`(${String(words)}w · ${(words / sec).toFixed(1)} w/s · ${sec.toFixed(1)}s)`) + ` thread:${res.threadId}\n`);
+    // `thread: ` — single space after the colon, matching the streaming
+    // path (line 160 below) so any tooling/regex that watches one form picks
+    // up the other too.
+    process.stderr.write(styleStats(`(${String(words)}w · ${(words / sec).toFixed(1)} w/s · ${sec.toFixed(1)}s)`) + ` thread: ${res.threadId}\n`);
     return;
   }
   const bar = installStatusBar();
diff --git a/src/cli/src/commands/get.ts b/src/cli/src/commands/get.ts
index ea78f33..78583f0 100644
--- a/src/cli/src/commands/get.ts
+++ b/src/cli/src/commands/get.ts
@@ -408,8 +408,8 @@ function toApplyDocs(resource: string, items: unknown[]): Array<{ kind: string }
   const kind = RESOURCE_KIND[resource] ?? resource;
   return items.map((item) => {
     const cleaned = stripInternalFields(item as Record);
-    // Llm-specific: the new virtual-provider lifecycle fields collide with
-    // the apply-doc `kind` envelope (the schema uses `kind: public|virtual`)
+    // Llm-specific: the virtual-provider lifecycle fields collide with the
+    // apply-doc `kind` envelope (the schema uses `kind: public|virtual`)
     // and aren't apply-able anyway — they're derived runtime state managed
     // by VirtualLlmService. Drop them so YAML round-trips stay clean.
     if (resource === 'llms') {
@@ -419,6 +419,17 @@ function toApplyDocs(resource: string, items: unknown[]): Array<{ kind: string }
       delete cleaned['status'];
       delete cleaned['lastHeartbeatAt'];
       delete cleaned['inactiveSince'];
       delete cleaned['providerSessionId'];
     }
+    // Agent-specific: same shape as Llm — Agent gained kind/status/etc. in
+    // v3 Stage 1 (virtual agent lifecycle) and the schema-`kind` field
+    // shadows the apply-envelope `kind: agent`. Strip the same set so
+    // `get agent X -o yaml | apply -f -` round-trips without diff.
+    if (resource === 'agents') {
+      delete cleaned['kind'];
+      delete cleaned['status'];
+      delete cleaned['lastHeartbeatAt'];
+      delete cleaned['inactiveSince'];
+      delete cleaned['providerSessionId'];
+    }
     return { kind, ...cleaned };
   });
 }
diff --git a/src/mcpd/src/services/chat.service.ts b/src/mcpd/src/services/chat.service.ts
index 087ea11..ff6aba8 100644
--- a/src/mcpd/src/services/chat.service.ts
+++ b/src/mcpd/src/services/chat.service.ts
@@ -185,6 +185,10 @@ export class ChatService {
         throw new Error(`Adapter returned no choice (status ${String(result.status)})`);
       }
       if (choice.tool_calls !== undefined && choice.tool_calls.length > 0) {
+        // Tool turns: keep `content` literal — even if empty — because the
+        // OpenAI tool-use protocol expects the assistant message to carry
+        // its tool_calls separately from any free-form text. Surfacing
+        // reasoning here would confuse downstream tool dispatchers.
         const assistantTurn = await this.chatRepo.appendMessage({
           threadId: ctx.threadId,
           role: 'assistant',
@@ -219,13 +223,17 @@
         await this.chatRepo.updateStatus(assistantTurn.id, 'complete');
         continue;
       }
-      // Terminal text turn.
+      // Terminal text turn. Use pickAssistantText so thinking models that
+      // produced only reasoning_content still yield a usable answer (with
+      // a truncation marker when finish_reason indicates max_tokens
+      // cut-off). Empty body remains empty and bubbles up unchanged.
+      const assistantText = pickAssistantText(choice);
       const finalMsg = await this.chatRepo.appendMessage({
         threadId: ctx.threadId,
         role: 'assistant',
-        content: choice.content ?? '',
+        content: assistantText,
       });
-      assistantFinal = choice.content ?? '';
+      assistantFinal = assistantText;
       lastTurnIndex = finalMsg.turnIndex;
       await this.chatRepo.touchThread(ctx.threadId);
       return { threadId: ctx.threadId, assistant: assistantFinal, turnIndex: lastTurnIndex };
@@ -242,8 +250,16 @@
     const ctx = await this.prepareContext(args);
     try {
       for (let i = 0; i < ctx.maxIterations; i += 1) {
-        const accumulated: { content: string; toolCalls: Array<{ id: string; name: string; argumentsJson: string }> } = {
+        // `reasoning` is accumulated alongside `content` so we can fall back
+        // to it when the model produces no `content` (thinking models with a
+        // tight max_tokens, or providers that don't separate the two).
+        const accumulated: {
+          content: string;
+          reasoning: string;
+          toolCalls: Array<{ id: string; name: string; argumentsJson: string }>;
+        } = {
           content: '',
+          reasoning: '',
           toolCalls: [],
         };
         let finishReason: string | null = null;
@@ -257,9 +273,11 @@
             yield { type: 'text', delta: evt.contentDelta };
           }
           if (evt.reasoningDelta !== undefined) {
-            // Reasoning is not persisted to the thread (it's the model's
-            // scratchpad, not part of the conversation) — only streamed so
-            // the REPL can show progress while the model thinks.
+            // Streamed live so the REPL can show progress while the model
+            // thinks. Also accumulated so a thinking-only response (no
+            // `content`) still produces a non-empty persisted assistant
+            // turn — see the fallback at the end of this loop iteration.
+            accumulated.reasoning += evt.reasoningDelta;
             yield { type: 'thinking', delta: evt.reasoningDelta };
           }
           if (evt.toolCallDeltas !== undefined) {
@@ -326,10 +344,27 @@
           continue;
         }

+        // Fall back to reasoning when the model emitted only thinking
+        // output. Mirrors pickAssistantText() in the non-streaming path —
+        // same situation (thinking model + tight max_tokens, or a provider
+        // that bundles the answer into reasoning_content).
+        const persistedContent = pickAssistantText({
+          content: accumulated.content.length > 0 ? accumulated.content : null,
+          ...(accumulated.reasoning.length > 0 ? { reasoning: accumulated.reasoning } : {}),
+          finishReason,
+        });
+        // If we synthesized text from reasoning, yield it as a final `text`
+        // delta so the client's stdout matches what the thread persists.
+        // Without this, the REPL would show only `thinking` deltas (which
+        // the CLI writes to stderr) and stdout would be empty for any
+        // thinking-only response.
+        if (accumulated.content.length === 0 && persistedContent.length > 0) {
+          yield { type: 'text', delta: persistedContent };
+        }
         const finalMsg = await this.chatRepo.appendMessage({
           threadId: ctx.threadId,
           role: 'assistant',
-          content: accumulated.content,
+          content: persistedContent,
         });
         await this.chatRepo.touchThread(ctx.threadId);
         yield { type: 'final', threadId: ctx.threadId, turnIndex: finalMsg.turnIndex };
@@ -682,6 +717,17 @@ export class ChatService {

 interface ExtractedChoice {
   content: string | null;
+  /**
+   * Reasoning text emitted by thinking models (qwen3-thinking,
+   * deepseek-reasoner, OpenAI o1 family). Different providers spell the
+   * field differently; the parser accepts every shape the streaming
+   * counterpart already accepts (see `parseStreamingChunk`). When `content`
+   * is null/empty, callers fall back to this so thinking models that
+   * exhaust their token budget on reasoning still produce a usable answer.
+   */
+  reasoning?: string;
+  /** OpenAI's stop reason — `'stop' | 'length' | 'tool_calls' | 'content_filter' | ...`. */
+  finishReason?: string | null;
   tool_calls?: Array<{ id: string; type: 'function'; function: { name: string; arguments: string } }>;
 }

@@ -689,17 +735,52 @@ function extractChoice(body: unknown): ExtractedChoice | null {
   if (typeof body !== 'object' || body === null) return null;
   const choices = (body as { choices?: unknown }).choices;
   if (!Array.isArray(choices) || choices.length === 0) return null;
-  const first = choices[0] as { message?: { content?: unknown; tool_calls?: unknown } } | undefined;
+  const first = choices[0] as {
+    message?: {
+      content?: unknown;
+      reasoning_content?: unknown;
+      reasoning?: unknown;
+      provider_specific_fields?: { reasoning_content?: unknown; reasoning?: unknown };
+      tool_calls?: unknown;
+    };
+    finish_reason?: unknown;
+  } | undefined;
   if (first?.message === undefined) return null;
   const content = typeof first.message.content === 'string' ? first.message.content : null;
+  const m = first.message;
+  const reasoning =
+    (typeof m.reasoning_content === 'string' && m.reasoning_content.length > 0 ? m.reasoning_content : undefined)
+    ?? (typeof m.reasoning === 'string' && m.reasoning.length > 0 ? m.reasoning : undefined)
+    ?? (typeof m.provider_specific_fields?.reasoning_content === 'string' && m.provider_specific_fields.reasoning_content.length > 0 ? m.provider_specific_fields.reasoning_content : undefined)
+    ?? (typeof m.provider_specific_fields?.reasoning === 'string' && m.provider_specific_fields.reasoning.length > 0 ? m.provider_specific_fields.reasoning : undefined);
+  const finishReason = typeof first.finish_reason === 'string' ? first.finish_reason : null;
   const toolCalls = first.message.tool_calls;
-  const out: ExtractedChoice = { content };
+  const out: ExtractedChoice = { content, finishReason };
+  if (reasoning !== undefined) out.reasoning = reasoning;
   if (Array.isArray(toolCalls)) {
     out.tool_calls = toolCalls as NonNullable;
   }
   return out;
 }

+/**
+ * Pick what text to surface (and persist) as the assistant's reply.
+ * Thinking models sometimes emit only `reasoning_content` and leave
+ * `content` null — typically when `max_tokens` is too small for the
+ * thinking budget, but also when the provider configuration just doesn't
+ * separate the two. In that case the reasoning IS the answer for this
+ * request, and the caller should see it. A `length` finish_reason marker
+ * makes truncation visible so users can fix their max_tokens config.
+ */
+function pickAssistantText(choice: ExtractedChoice): string {
+  if (choice.content !== null && choice.content.length > 0) return choice.content;
+  if (choice.reasoning !== undefined && choice.reasoning.length > 0) {
+    const truncated = choice.finishReason === 'length' ? '\n\n[response truncated by max_tokens]' : '';
+    return `${choice.reasoning}${truncated}`;
+  }
+  return '';
+}
+
 function safeParseJson(s: string): unknown {
   if (s === '') return {};
   try {
diff --git a/src/mcpd/src/services/llm/adapters/openai-passthrough.ts b/src/mcpd/src/services/llm/adapters/openai-passthrough.ts
index ddad8e2..574b57f 100644
--- a/src/mcpd/src/services/llm/adapters/openai-passthrough.ts
+++ b/src/mcpd/src/services/llm/adapters/openai-passthrough.ts
@@ -123,7 +123,15 @@ export class OpenAiPassthroughAdapter implements LlmAdapter {
   }

   private endpointUrl(url: string): string {
-    if (url !== '') return url.replace(/\/+$/, '');
+    // Accept both conventional forms users actually paste — base host
+    // (`https://api.openai.com`) and base + version (`https://api.openai.com/v1`).
+    // Every OpenAI-compat provider documents their endpoint with the `/v1`
+    // suffix, so users naturally include it; the adapter then re-appends
+    // `/v1/chat/completions`, producing a doubled-`/v1` 404 against LiteLLM
+    // and others. Strip a trailing `/v1` (with or without slash) so both
+    // shapes resolve to the same canonical base. A more specific suffix
+    // like `/v1beta` is preserved.
+    if (url !== '') return url.replace(/\/+$/, '').replace(/\/v1$/, '');
     const def = DEFAULT_URLS[this.kind];
     if (def === undefined) {
       throw new Error(`${this.kind}: url is required (no default endpoint for this provider)`);
diff --git a/src/mcpd/tests/chat-service.test.ts b/src/mcpd/tests/chat-service.test.ts
index 3eb205a..11bdb07 100644
--- a/src/mcpd/tests/chat-service.test.ts
+++ b/src/mcpd/tests/chat-service.test.ts
@@ -461,6 +461,121 @@ describe('ChatService', () => {
     expect(assistantTurn?.content).not.toContain('Let me think');
   });

+  // Regression: thinking models with a tight max_tokens budget produce
+  // `reasoning_content` only and leave `content` null. Without falling back
+  // to reasoning, the assistant turn was empty and the smoke test saw an
+  // empty stdout. This covers BOTH chat() (non-streaming) and chatStream()
+  // (synthetic final text frame so the CLI's stdout matches what's
+  // persisted to the thread).
+  it('chat falls back to reasoning_content when content is null', async () => {
+    const chatRepo = mockChatRepo();
+    const adapter: LlmAdapter = {
+      kind: 'thinking-truncated',
+      infer: vi.fn(async () => ({
+        status: 200,
+        body: {
+          id: 'cmpl-1',
+          object: 'chat.completion',
+          choices: [{
+            index: 0,
+            message: { role: 'assistant', content: null, reasoning_content: 'Thinking out loud about the answer' },
+            finish_reason: 'stop',
+          }],
+        },
+      })),
+      stream: async function*() { yield { data: '[DONE]', done: true }; },
+    };
+    const svc = new ChatService(
+      mockAgents(), mockLlms(), adapterRegistry(adapter),
+      chatRepo, mockPromptRepo(), mockTools(),
+    );
+    const result = await svc.chat({ agentName: 'reviewer', userMessage: 'hi', ownerId: 'owner-1' });
+    expect(result.assistant).toBe('Thinking out loud about the answer');
+    const stored = chatRepo._msgs.find((m) => m.role === 'assistant');
+    expect(stored?.content).toBe('Thinking out loud about the answer');
+  });
+
+  it('chat appends [response truncated by max_tokens] when finish_reason is "length"', async () => {
+    const chatRepo = mockChatRepo();
+    const adapter: LlmAdapter = {
+      kind: 'thinking-clipped',
+      infer: vi.fn(async () => ({
+        status: 200,
+        body: {
+          choices: [{
+            index: 0,
+            message: { role: 'assistant', content: null, reasoning_content: 'partial reasoning that ran out of' },
+            finish_reason: 'length',
+          }],
+        },
+      })),
+      stream: async function*() { yield { data: '[DONE]', done: true }; },
+    };
+    const svc = new ChatService(
+      mockAgents(), mockLlms(), adapterRegistry(adapter),
+      chatRepo, mockPromptRepo(), mockTools(),
+    );
+    const result = await svc.chat({ agentName: 'reviewer', userMessage: 'hi', ownerId: 'owner-1' });
+    expect(result.assistant).toContain('partial reasoning that ran out of');
+    expect(result.assistant).toContain('[response truncated by max_tokens]');
+  });
+
+  it('chat prefers content when both content and reasoning_content are present', async () => {
+    // Thinking models that DO produce content shouldn't see the reasoning
+    // bleed into the response — that's what the streaming path's
+    // text/thinking split is for, and the non-streaming path should match.
+    const chatRepo = mockChatRepo();
+    const adapter: LlmAdapter = {
+      kind: 'thinking-with-content',
+      infer: vi.fn(async () => ({
+        status: 200,
+        body: {
+          choices: [{
+            index: 0,
+            message: { role: 'assistant', content: 'real answer', reasoning_content: 'background thinking' },
+            finish_reason: 'stop',
+          }],
+        },
+      })),
+      stream: async function*() { yield { data: '[DONE]', done: true }; },
+    };
+    const svc = new ChatService(
+      mockAgents(), mockLlms(), adapterRegistry(adapter),
+      chatRepo, mockPromptRepo(), mockTools(),
+    );
+    const result = await svc.chat({ agentName: 'reviewer', userMessage: 'hi', ownerId: 'owner-1' });
+    expect(result.assistant).toBe('real answer');
+    expect(result.assistant).not.toContain('background thinking');
+  });
+
+  it('chatStream emits a synthetic text frame and persists reasoning when content is empty', async () => {
+    const chatRepo = mockChatRepo();
+    const adapter: LlmAdapter = {
+      kind: 'thinking-only-stream',
+      infer: vi.fn(),
+      stream: async function*() {
+        yield { data: JSON.stringify({ choices: [{ delta: { reasoning_content: 'thinking ' }, finish_reason: null }] }) };
+        yield { data: JSON.stringify({ choices: [{ delta: { reasoning_content: 'more.' }, finish_reason: 'stop' }] }) };
+        yield { data: '[DONE]', done: true };
+      },
+    };
+    const svc = new ChatService(
+      mockAgents(), mockLlms(), adapterRegistry(adapter),
+      chatRepo, mockPromptRepo(), mockTools(),
+    );
+    const chunks: Array<{ type: string; delta?: string }> = [];
+    for await (const c of svc.chatStream({ agentName: 'reviewer', userMessage: 'hi', ownerId: 'owner-1' })) {
+      chunks.push({ type: c.type, delta: c.delta });
+    }
+    // 2 thinking deltas (live), 1 synthesized text frame, 1 final.
+    expect(chunks.filter((c) => c.type === 'thinking').map((c) => c.delta)).toEqual(['thinking ', 'more.']);
+    expect(chunks.filter((c) => c.type === 'text').map((c) => c.delta)).toEqual(['thinking more.']);
+    // The thread message captures the synthesized text so resumed chats see
+    // a coherent assistant turn (rather than blank).
+    const stored = chatRepo._msgs.find((m) => m.role === 'assistant');
+    expect(stored?.content).toBe('thinking more.');
+  });
+
   // Regression: provider_specific_fields.reasoning_content shape (LiteLLM
   // passthrough from vLLM) is also recognized.
   it('chatStream recognizes LiteLLM provider_specific_fields.reasoning_content', async () => {
diff --git a/src/mcpd/tests/llm-adapters.test.ts b/src/mcpd/tests/llm-adapters.test.ts
index 045ac88..5b16f18 100644
--- a/src/mcpd/tests/llm-adapters.test.ts
+++ b/src/mcpd/tests/llm-adapters.test.ts
@@ -71,6 +71,36 @@ describe('OpenAiPassthroughAdapter', () => {
     await expect(adapter.infer(makeCtx())).rejects.toThrow(/no default endpoint/);
   });

+  it('infer: strips a trailing /v1 from the configured URL', async () => {
+    // Users naturally paste the OpenAI-style base URL with /v1 because
+    // every provider documents it that way (https://api.openai.com/v1,
+    // https://llm.example.com/v1). The adapter then re-appends
+    // /v1/chat/completions; without normalization this would produce a
+    // doubled-/v1 404 against LiteLLM and friends.
+    const fetchFn = mockFetch([{ match: /\/v1\/chat\/completions$/, status: 200, body: {} }]);
+    const adapter = new OpenAiPassthroughAdapter('openai', { fetch: fetchFn as unknown as typeof fetch });
+    await adapter.infer(makeCtx({ url: 'https://llm.example.com/v1' }));
+    const [url1] = fetchFn.mock.calls[0] as [string];
+    expect(url1).toBe('https://llm.example.com/v1/chat/completions');
+
+    // Trailing slash + /v1 should also normalize correctly.
+    const fetchFn2 = mockFetch([{ match: /\/v1\/chat\/completions$/, status: 200, body: {} }]);
+    const adapter2 = new OpenAiPassthroughAdapter('openai', { fetch: fetchFn2 as unknown as typeof fetch });
+    await adapter2.infer(makeCtx({ url: 'https://llm.example.com/v1/' }));
+    const [url2] = fetchFn2.mock.calls[0] as [string];
+    expect(url2).toBe('https://llm.example.com/v1/chat/completions');
+  });
+
+  it('infer: preserves a trailing /v1beta suffix (only exact /v1 is stripped)', async () => {
+    // Some providers expose `/v1beta` as a parallel API surface — don't
+    // accidentally rewrite that to `/v1` or strip it.
+    const fetchFn = mockFetch([{ match: /\/v1beta\/v1\/chat\/completions$/, status: 200, body: {} }]);
+    const adapter = new OpenAiPassthroughAdapter('openai', { fetch: fetchFn as unknown as typeof fetch });
+    await adapter.infer(makeCtx({ url: 'https://api.example.com/v1beta' }));
+    const [url] = fetchFn.mock.calls[0] as [string];
+    expect(url).toBe('https://api.example.com/v1beta/v1/chat/completions');
+  });
+
   it('infer: omits Authorization when apiKey is empty', async () => {
     const fetchFn = mockFetch([{ match: /ollama/, status: 200, body: {} }]);
     const adapter = new OpenAiPassthroughAdapter('ollama', { fetch: fetchFn as unknown as typeof fetch });
diff --git a/src/mcplocal/tests/smoke/agent-chat.smoke.test.ts b/src/mcplocal/tests/smoke/agent-chat.smoke.test.ts
index d96b03b..e445511 100644
--- a/src/mcplocal/tests/smoke/agent-chat.smoke.test.ts
+++ b/src/mcplocal/tests/smoke/agent-chat.smoke.test.ts
@@ -17,7 +17,7 @@
 import { describe, it, expect, beforeAll, afterAll } from 'vitest';
 import http from 'node:http';
 import https from 'node:https';
-import { execSync } from 'node:child_process';
+import { spawnSync, execSync } from 'node:child_process';

 const MCPD_URL = process.env.MCPD_URL ?? 'https://mcpctl.ad.itaz.eu';
 const LLM_URL = process.env.MCPCTL_SMOKE_LLM_URL;
@@ -31,21 +31,37 @@ const AGENT_NAME = `smoke-chat-agent-${SUFFIX}`;
 interface CliResult { code: number; stdout: string; stderr: string }

 function run(args: string): CliResult {
-  try {
-    const stdout = execSync(`mcpctl --direct ${args}`, {
-      encoding: 'utf-8',
-      timeout: 60_000,
-      stdio: ['ignore', 'pipe', 'pipe'],
-    });
-    return { code: 0, stdout: stdout.trim(), stderr: '' };
-  } catch (err) {
-    const e = err as { status?: number; stdout?: Buffer | string; stderr?: Buffer | string };
-    return {
-      code: e.status ?? 1,
-      stdout: e.stdout ? (typeof e.stdout === 'string' ? e.stdout : e.stdout.toString('utf-8')) : '',
-      stderr: e.stderr ? (typeof e.stderr === 'string' ? e.stderr : e.stderr.toString('utf-8')) : '',
-    };
+  // spawnSync (not execSync) — execSync returns only stdout on success and
+  // discards stderr, which made any `thread:` assertion against a successful
+  // chat impossible to evaluate. Splitting the args correctly handles the
+  // few existing call sites that quote-wrap multi-word values like
+  // `--system-prompt "You are..."`.
+  const argv = splitArgs(args);
+  const res = spawnSync('mcpctl', ['--direct', ...argv], {
+    encoding: 'utf-8',
+    timeout: 60_000,
+  });
+  return {
+    code: res.status ?? 1,
+    stdout: (res.stdout ?? '').trim(),
+    stderr: (res.stderr ?? '').trim(),
+  };
+}
+
+/**
+ * Tokenize a shell-style argv string with simple double-quote support — just
+ * enough for the smoke test's call shapes. Not a full POSIX parser; we only
+ * need to keep `--system-prompt "You are a smoke test..."` together as one
+ * arg.
+ */
+function splitArgs(s: string): string[] {
+  const out: string[] = [];
+  const re = /"([^"]*)"|(\S+)/g;
+  let m: RegExpExecArray | null;
+  while ((m = re.exec(s)) !== null) {
+    out.push(m[1] !== undefined ? m[1] : (m[2] ?? ''));
   }
+  return out;
 }

 function healthz(url: string, timeoutMs = 5000): Promise {