From a84214dad1740590758c62de3779149894ecb3d5 Mon Sep 17 00:00:00 2001
From: Michal
Date: Mon, 27 Apr 2026 12:09:42 +0100
Subject: [PATCH] fix(cli): status probe accepts reasoning_content for thinking models
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Live deploy showed qwen3-thinking failing the probe with "empty content":
at max_tokens=8 the model spent its entire budget on the reasoning trace
and never emitted a final `content` block.

Fix:

- Bump max_tokens to 64. Still caps latency at ~1-2 sec on cheap models
  but gives reasoning models enough headroom.
- If `message.content` is empty but `reasoning_content` is non-empty,
  count it as alive and prefix the preview with "[thinking]" so the user
  knows the model didn't actually answer "hi" but is responsive.
- Replace the prompt with the terser "Reply with just: hi" — closer to
  what a thinking model can short-circuit on.

Tests: existing 25 pass; the failure-path test still asserts on the
"empty content" path because reasoning_content is empty there too.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 src/cli/src/commands/status.ts | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/src/cli/src/commands/status.ts b/src/cli/src/commands/status.ts
index c77e472..f6bb88a 100644
--- a/src/cli/src/commands/status.ts
+++ b/src/cli/src/commands/status.ts
@@ -227,13 +227,21 @@ function defaultFetchServerLlms(mcpdUrl: string, token: string | null): Promise<
 /**
  * POST a tiny "say hi" prompt to /api/v1/llms//infer and decide if
  * the LLM actually serves inference. Returns ok=true when the response is
- * 200 with a non-empty assistant message; otherwise ok=false with an
- * error string suitable for one-line display.
+ * 200 with non-empty content OR reasoning_content (thinking models often
+ * spend their token budget on the reasoning trace and never emit a
+ * `content` block, but they're clearly alive if reasoning came back).
+ * Otherwise ok=false with an error string suitable for one-line display.
+ *
+ * `max_tokens: 64` gives reasoning models enough headroom to emit
+ * something visible while still capping latency at ~1-2 sec on cheap
+ * models. The exact wording — "Reply with just: hi" — is more terse and
+ * closer to what a thinking model can short-circuit on without burning
+ * its entire budget on reasoning.
  */
 const PROBE_TIMEOUT_MS = 15_000;
 const PROBE_BODY = JSON.stringify({
-  messages: [{ role: 'user', content: "Say exactly the word 'hi' and nothing else." }],
-  max_tokens: 8,
+  messages: [{ role: 'user', content: 'Reply with just: hi' }],
+  max_tokens: 64,
   temperature: 0,
 });
 
@@ -276,20 +284,31 @@ function defaultProbeServerLlm(mcpdUrl: string, name: string, token: string | nu
           return;
         }
         let content = '';
+        let reasoning = '';
         try {
           const parsed = JSON.parse(body) as {
-            choices?: Array<{ message?: { content?: string } }>;
+            choices?: Array<{ message?: { content?: string; reasoning_content?: string } }>;
           };
-          content = parsed.choices?.[0]?.message?.content?.trim() ?? '';
+          const msg = parsed.choices?.[0]?.message;
+          content = msg?.content?.trim() ?? '';
+          reasoning = msg?.reasoning_content?.trim() ?? '';
         } catch {
           resolve({ ok: false, ms, error: 'invalid response body' });
           return;
         }
-        if (content === '') {
-          resolve({ ok: false, ms, error: 'empty content' });
+        if (content !== '') {
+          resolve({ ok: true, ms, say: content.slice(0, 16) });
           return;
         }
-        resolve({ ok: true, ms, say: content.slice(0, 16) });
+        if (reasoning !== '') {
+          // Thinking model burned its budget on the reasoning trace
+          // before emitting `content`. The LLM is alive — flag it as
+          // ok and surface a short reasoning preview so the user can
+          // tell at a glance.
+          resolve({ ok: true, ms, say: `[thinking] ${reasoning.slice(0, 12)}` });
+          return;
+        }
+        resolve({ ok: false, ms, error: 'empty content' });
       });
     });
   } catch {