feat(chat): surface reasoning_content as thinking chunks; fix --no-stream timeout
Reasoning models (qwen3-thinking, deepseek-reasoner, OpenAI o1 family) emit
their scratchpad as `delta.reasoning_content` (or `delta.reasoning`,
or `delta.provider_specific_fields.reasoning_content` when LiteLLM passes
through from vLLM) — separate from `delta.content`. Before this commit
mcpd's parseStreamingChunk only watched `content`, so the model's 30-90s
reasoning phase looked like dead air to the REPL: streaming connection
open, no chunks, no progress. Caught during the agents-feature shakedown
when qwen3-thinking sat silent for 90s on a docmost__list_pages call.
mcpd
====
chat.service.ts
- parseStreamingChunk extracts a `reasoningDelta` from the chunk body,
accepting all four spellings (reasoning_content / reasoning /
provider_specific_fields.{reasoning_content,reasoning}). Future
providers can add their own field names by extending the
fallback chain.
- chatStream yields `{ type: 'thinking', delta }` chunks as reasoning
arrives, alongside the existing `{ type: 'text', delta }` for content.
- Reasoning is intentionally NOT persisted to the thread. It's the
model's scratchpad, not part of the conversation. Subsequent turns
don't see it.
- Adds 'thinking' to the ChatStreamChunk.type union.
CLI
===
chat.ts
- streamOnce handles 'thinking' chunks: writes them dim+italic to
stderr (ANSI 2;3m) so the model's reasoning visually flows like a
quote block while the final answer streams to stdout. Plain text
when stderr isn't a TTY (pipe to file → no escape codes leak).
- chatRequestNonStream replaces the shared ApiClient.post() for the
--no-stream path. ApiClient defaults to a 10s timeout, way too tight
for any chat that calls a tool: LLM round + tool dispatch + LLM
summary easily exceeds 10s. The new helper uses the same 600s timeout
the streaming path has been using all along.
Tests:
chat-service.test.ts (+2):
- reasoning_content deltas surface as `thinking` chunks (not text);
reasoning is NOT persisted to the assistant turn's content.
- LiteLLM's provider_specific_fields.reasoning_content shape parses
identically to the vendor-native shape.
mcpd 777/777, cli 430/430.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -141,10 +141,7 @@ async function runOneShot(
|
||||
if (stream === false) {
|
||||
const body: Record<string, unknown> = { message, ...overrides };
|
||||
if (threadId !== undefined) body.threadId = threadId;
|
||||
const res = await deps.client.post<{ assistant: string; threadId: string; turnIndex: number }>(
|
||||
`/api/v1/agents/${encodeURIComponent(agent)}/chat`,
|
||||
body,
|
||||
);
|
||||
const res = await chatRequestNonStream(deps, agent, body);
|
||||
process.stdout.write(`${res.assistant}\n`);
|
||||
process.stderr.write(`(thread: ${res.threadId})\n`);
|
||||
return;
|
||||
@@ -188,10 +185,7 @@ async function runRepl(
|
||||
if (stream === false) {
|
||||
const body: Record<string, unknown> = { message: line, ...overrides };
|
||||
if (threadId !== undefined) body.threadId = threadId;
|
||||
const res = await deps.client.post<{ assistant: string; threadId: string }>(
|
||||
`/api/v1/agents/${encodeURIComponent(agent)}/chat`,
|
||||
body,
|
||||
);
|
||||
const res = await chatRequestNonStream(deps, agent, body);
|
||||
threadId = res.threadId;
|
||||
process.stdout.write(`${res.assistant}\n`);
|
||||
} else {
|
||||
@@ -306,6 +300,60 @@ function applySetCommand(o: Overrides, key: string, valueRaw: string): void {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Non-streaming POST to the chat endpoint. Uses the SAME 10-minute timeout
|
||||
* as the streaming path — `deps.client.post` (the shared ApiClient) defaults
|
||||
* to 10s, which is too tight for any chat that calls a tool. Returns the
|
||||
* parsed JSON body on 2xx, throws on 4xx/5xx with the response body.
|
||||
*/
|
||||
async function chatRequestNonStream(
|
||||
deps: ChatCommandDeps,
|
||||
agent: string,
|
||||
body: Record<string, unknown>,
|
||||
): Promise<{ assistant: string; threadId: string; turnIndex: number }> {
|
||||
const url = new URL(`${deps.baseUrl}/api/v1/agents/${encodeURIComponent(agent)}/chat`);
|
||||
const payload = JSON.stringify(body);
|
||||
return new Promise((resolve, reject) => {
|
||||
const driver = url.protocol === 'https:' ? https : http;
|
||||
const req = driver.request({
|
||||
hostname: url.hostname,
|
||||
port: url.port || (url.protocol === 'https:' ? 443 : 80),
|
||||
path: url.pathname + url.search,
|
||||
method: 'POST',
|
||||
timeout: STREAM_TIMEOUT_MS,
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
Accept: 'application/json',
|
||||
...(deps.token !== undefined ? { Authorization: `Bearer ${deps.token}` } : {}),
|
||||
},
|
||||
}, (res) => {
|
||||
const status = res.statusCode ?? 0;
|
||||
const chunks: Buffer[] = [];
|
||||
res.on('data', (c: Buffer) => chunks.push(c));
|
||||
res.on('end', () => {
|
||||
const raw = Buffer.concat(chunks).toString('utf-8');
|
||||
if (status >= 400) {
|
||||
reject(new Error(`HTTP ${String(status)}: ${raw}`));
|
||||
return;
|
||||
}
|
||||
try {
|
||||
resolve(JSON.parse(raw) as { assistant: string; threadId: string; turnIndex: number });
|
||||
} catch (err) {
|
||||
reject(new Error(`malformed response: ${(err as Error).message}`));
|
||||
}
|
||||
});
|
||||
res.on('error', reject);
|
||||
});
|
||||
req.on('error', reject);
|
||||
req.on('timeout', () => {
|
||||
req.destroy();
|
||||
reject(new Error('chat request timed out'));
|
||||
});
|
||||
req.write(payload);
|
||||
req.end();
|
||||
});
|
||||
}
|
||||
|
||||
/** Stream a single chat call. Returns the resolved threadId. */
|
||||
async function streamOnce(
|
||||
deps: ChatCommandDeps,
|
||||
@@ -356,6 +404,13 @@ async function streamOnce(
|
||||
case 'text':
|
||||
if (typeof evt.delta === 'string') process.stdout.write(evt.delta);
|
||||
break;
|
||||
case 'thinking':
|
||||
// Reasoning models (qwen3-thinking, deepseek-reasoner, o1
|
||||
// family) emit this for tens of seconds before producing
|
||||
// any content delta. Show it dim+italic on stderr so the
|
||||
// final answer (stdout) stays clean for grepping/redirect.
|
||||
if (typeof evt.delta === 'string') process.stderr.write(styleThinking(evt.delta));
|
||||
break;
|
||||
case 'tool_call':
|
||||
process.stderr.write(`\n[tool_call: ${evt.toolName ?? ''}]\n`);
|
||||
break;
|
||||
@@ -389,7 +444,7 @@ async function streamOnce(
|
||||
}
|
||||
|
||||
interface ChatStreamFrame {
|
||||
type: 'text' | 'tool_call' | 'tool_result' | 'final' | 'error';
|
||||
type: 'text' | 'thinking' | 'tool_call' | 'tool_result' | 'final' | 'error';
|
||||
delta?: string;
|
||||
toolName?: string;
|
||||
ok?: boolean;
|
||||
@@ -398,6 +453,16 @@ interface ChatStreamFrame {
|
||||
message?: string;
|
||||
}
|
||||
|
||||
// ANSI codes for the reasoning sidebar. Dim + italic visually separates
|
||||
// reasoning ("the model is thinking") from final assistant content. We only
|
||||
// emit the codes when stderr is a TTY — piping to a file should stay clean.
|
||||
const ANSI_DIM_ITALIC = '\x1b[2;3m';
|
||||
const ANSI_RESET = '\x1b[0m';
|
||||
const STDERR_IS_TTY = process.stderr.isTTY === true;
|
||||
function styleThinking(s: string): string {
|
||||
return STDERR_IS_TTY ? `${ANSI_DIM_ITALIC}${s}${ANSI_RESET}` : s;
|
||||
}
|
||||
|
||||
function collect(value: string, prev: string[]): string[] {
|
||||
return [...prev, value];
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user