diff --git a/src/cli/src/commands/status.ts b/src/cli/src/commands/status.ts
index c77e472..f6bb88a 100644
--- a/src/cli/src/commands/status.ts
+++ b/src/cli/src/commands/status.ts
@@ -227,13 +227,21 @@ function defaultFetchServerLlms(mcpdUrl: string, token: string | null): Promise<
 /**
  * POST a tiny "say hi" prompt to /api/v1/llms//infer and decide if
  * the LLM actually serves inference. Returns ok=true when the response is
- * 200 with a non-empty assistant message; otherwise ok=false with an
- * error string suitable for one-line display.
+ * 200 with non-empty content OR reasoning_content (thinking models often
+ * spend their token budget on the reasoning trace and never emit a
+ * `content` block, but they're clearly alive if reasoning came back).
+ * Otherwise ok=false with an error string suitable for one-line display.
+ *
+ * `max_tokens: 64` gives reasoning models enough headroom to emit
+ * something visible while still capping latency at ~1-2 sec on cheap
+ * models. The exact wording — "Reply with just: hi" — is more terse and
+ * closer to what a thinking model can short-circuit on without burning
+ * its entire budget on reasoning.
  */
 const PROBE_TIMEOUT_MS = 15_000;
 const PROBE_BODY = JSON.stringify({
-  messages: [{ role: 'user', content: "Say exactly the word 'hi' and nothing else." }],
-  max_tokens: 8,
+  messages: [{ role: 'user', content: 'Reply with just: hi' }],
+  max_tokens: 64,
   temperature: 0,
 });
 
@@ -276,20 +284,31 @@ function defaultProbeServerLlm(mcpdUrl: string, name: string, token: string | nu
         return;
       }
       let content = '';
+      let reasoning = '';
       try {
         const parsed = JSON.parse(body) as {
-          choices?: Array<{ message?: { content?: string } }>;
+          choices?: Array<{ message?: { content?: string; reasoning_content?: string } }>;
         };
-        content = parsed.choices?.[0]?.message?.content?.trim() ?? '';
+        const msg = parsed.choices?.[0]?.message;
+        content = msg?.content?.trim() ?? '';
+        reasoning = msg?.reasoning_content?.trim() ?? '';
       } catch {
         resolve({ ok: false, ms, error: 'invalid response body' });
         return;
       }
-      if (content === '') {
-        resolve({ ok: false, ms, error: 'empty content' });
+      if (content !== '') {
+        resolve({ ok: true, ms, say: content.slice(0, 16) });
         return;
       }
-      resolve({ ok: true, ms, say: content.slice(0, 16) });
+      if (reasoning !== '') {
+        // Thinking model burned its budget on the reasoning trace
+        // before emitting `content`. The LLM is alive — flag it as
+        // ok and surface a short reasoning preview so the user can
+        // tell at a glance.
+        resolve({ ok: true, ms, say: `[thinking] ${reasoning.slice(0, 12)}` });
+        return;
+      }
+      resolve({ ok: false, ms, error: 'empty content' });
     });
   });
 } catch {