Compare commits

2 Commits: feat/statu...fix/status

| Author | SHA1 | Date |
|---|---|---|
| | a84214dad1 | |
| | 54e56f7b71 | |
@@ -227,13 +227,21 @@ function defaultFetchServerLlms(mcpdUrl: string, token: string | null): Promise<
 /**
  * POST a tiny "say hi" prompt to /api/v1/llms/<name>/infer and decide if
  * the LLM actually serves inference. Returns ok=true when the response is
- * 200 with a non-empty assistant message; otherwise ok=false with an
- * error string suitable for one-line display.
+ * 200 with non-empty content OR reasoning_content (thinking models often
+ * spend their token budget on the reasoning trace and never emit a
+ * `content` block, but they're clearly alive if reasoning came back).
+ * Otherwise ok=false with an error string suitable for one-line display.
+ *
+ * `max_tokens: 64` gives reasoning models enough headroom to emit
+ * something visible while still capping latency at ~1-2 sec on cheap
+ * models. The exact wording — "Reply with just: hi" — is more terse and
+ * closer to what a thinking model can short-circuit on without burning
+ * its entire budget on reasoning.
  */
 const PROBE_TIMEOUT_MS = 15_000;
 const PROBE_BODY = JSON.stringify({
-  messages: [{ role: 'user', content: "Say exactly the word 'hi' and nothing else." }],
-  max_tokens: 8,
+  messages: [{ role: 'user', content: 'Reply with just: hi' }],
+  max_tokens: 64,
   temperature: 0,
 });
 
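To make the request side of this probe concrete, below is a minimal sketch of how these constants could be used to issue the call. This is illustrative, not code from this repository: the helper name `probeOnce`, the use of the global `fetch`, and the `Authorization: Bearer` header are all assumptions; only the endpoint path, `PROBE_BODY`, and `PROBE_TIMEOUT_MS` come from the diff above.

```ts
// Hypothetical helper (not from this repo): POST the probe body to the
// infer endpoint and return the raw response text. Assumes a fetch-capable
// runtime (Node 18+ or a browser).
async function probeOnce(mcpdUrl: string, name: string, token: string | null): Promise<string> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), PROBE_TIMEOUT_MS);
  try {
    const res = await fetch(`${mcpdUrl}/api/v1/llms/${encodeURIComponent(name)}/infer`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        // Assumption: the nullable token threaded through the function
        // signatures above is sent as a Bearer credential when present.
        ...(token ? { Authorization: `Bearer ${token}` } : {}),
      },
      body: PROBE_BODY,
      signal: controller.signal, // enforce the 15 s probe timeout
    });
    return await res.text(); // status/body interpretation happens downstream
  } finally {
    clearTimeout(timer);
  }
}
```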
@@ -276,20 +284,31 @@ function defaultProbeServerLlm(mcpdUrl: string, name: string, token: string | nu
           return;
         }
         let content = '';
+        let reasoning = '';
         try {
           const parsed = JSON.parse(body) as {
-            choices?: Array<{ message?: { content?: string } }>;
+            choices?: Array<{ message?: { content?: string; reasoning_content?: string } }>;
           };
-          content = parsed.choices?.[0]?.message?.content?.trim() ?? '';
+          const msg = parsed.choices?.[0]?.message;
+          content = msg?.content?.trim() ?? '';
+          reasoning = msg?.reasoning_content?.trim() ?? '';
         } catch {
           resolve({ ok: false, ms, error: 'invalid response body' });
           return;
         }
-        if (content === '') {
-          resolve({ ok: false, ms, error: 'empty content' });
+        if (content !== '') {
+          resolve({ ok: true, ms, say: content.slice(0, 16) });
           return;
         }
-        resolve({ ok: true, ms, say: content.slice(0, 16) });
+        if (reasoning !== '') {
+          // Thinking model burned its budget on the reasoning trace
+          // before emitting `content`. The LLM is alive — flag it as
+          // ok and surface a short reasoning preview so the user can
+          // tell at a glance.
+          resolve({ ok: true, ms, say: `[thinking] ${reasoning.slice(0, 12)}` });
+          return;
+        }
+        resolve({ ok: false, ms, error: 'empty content' });
       });
     });
   } catch {
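The new branch order in this hunk (non-empty `content` wins, a reasoning-only reply still counts as alive, and only a fully empty message is an error) can be factored into a pure function, which makes it straightforward to unit-test against canned response bodies. A sketch under assumptions: `classifyProbe` is a hypothetical name, and the `ProbeResult` shape is inferred from the `resolve(...)` call sites in the diff.

```ts
type ProbeResult =
  | { ok: true; ms: number; say: string }
  | { ok: false; ms: number; error: string };

// Mirrors the decision logic of the new handler: content first, then a
// reasoning-only fallback with a "[thinking]" preview, then the error case.
function classifyProbe(body: string, ms: number): ProbeResult {
  let content = '';
  let reasoning = '';
  try {
    const parsed = JSON.parse(body) as {
      choices?: Array<{ message?: { content?: string; reasoning_content?: string } }>;
    };
    const msg = parsed.choices?.[0]?.message;
    content = msg?.content?.trim() ?? '';
    reasoning = msg?.reasoning_content?.trim() ?? '';
  } catch {
    return { ok: false, ms, error: 'invalid response body' };
  }
  if (content !== '') return { ok: true, ms, say: content.slice(0, 16) };
  if (reasoning !== '') return { ok: true, ms, say: `[thinking] ${reasoning.slice(0, 12)}` };
  return { ok: false, ms, error: 'empty content' };
}
```

For example, a body of `{"choices":[{"message":{"reasoning_content":"Okay, the user wants..."}}]}` classifies as ok with `say` set to `[thinking] Okay, the us`.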