feat(chat): live tokens/sec ticker + final stats footer
While streaming, the REPL now shows a live word/sec counter on a status
line one row below the cursor — refreshes every 250ms via ANSI cursor
save+restore so it floats with the content as the response grows.
After each response, a dim stats footer prints on stderr:
(47w · 12.3 w/s · 3.9s | thinking 234w · 38.0 w/s · 6.2s)
The ticker is stderr-only and only emits when stderr is a TTY — pipes
to a file stay clean for grepping/redirect. Words are whitespace-
separated tokens (good enough across English/code/Markdown without a
tokenizer dependency; CJK under-counts but the rate is still
directional).
Both phases tracked separately:
- thinking: reasoning_content from qwen3-thinking / deepseek-reasoner
/ o1, where the model's scratchpad is the long part
- content: the actual assistant answer
Final stats also added to the --no-stream path: total HTTP duration
and word count, since we don't get per-token timing there.
CLI suite still 430/430.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -141,9 +141,12 @@ async function runOneShot(
|
||||
if (stream === false) {
|
||||
const body: Record<string, unknown> = { message, ...overrides };
|
||||
if (threadId !== undefined) body.threadId = threadId;
|
||||
const startMs = Date.now();
|
||||
const res = await chatRequestNonStream(deps, agent, body);
|
||||
const sec = Math.max(0.05, (Date.now() - startMs) / 1000);
|
||||
const words = (res.assistant.match(/\S+/g) ?? []).length;
|
||||
process.stdout.write(`${res.assistant}\n`);
|
||||
process.stderr.write(`(thread: ${res.threadId})\n`);
|
||||
process.stderr.write(styleStats(`(${String(words)}w · ${(words / sec).toFixed(1)} w/s · ${sec.toFixed(1)}s)`) + ` thread:${res.threadId}\n`);
|
||||
return;
|
||||
}
|
||||
const finalThread = await streamOnce(deps, agent, message, threadId, overrides);
|
||||
@@ -365,6 +368,39 @@ async function streamOnce(
|
||||
const url = new URL(`${deps.baseUrl}/api/v1/agents/${encodeURIComponent(agent)}/chat`);
|
||||
const body = JSON.stringify({ message, threadId, stream: true, ...overrides });
|
||||
|
||||
// Per-response counters. Updated on every text/thinking delta, surfaced
|
||||
// via the live ticker (stderr) and the final stats footer.
|
||||
const stats = { thinking: newPhase(), content: newPhase() };
|
||||
|
||||
// Live ticker: every TICK_MS, draws a stats line on a ledger one row below
|
||||
// the current cursor using ANSI save/restore. The ledger floats with the
|
||||
// content as it grows (terminal scrolls take the saved position with them
|
||||
// on modern emulators). Disabled when stderr isn't a TTY (pipes stay clean).
|
||||
const TICK_MS = 250;
|
||||
let tickerTimer: NodeJS.Timeout | null = null;
|
||||
let tickerActive = false;
|
||||
function drawTicker(): void {
|
||||
if (!STDERR_IS_TTY) return;
|
||||
const text = formatStats(stats, true);
|
||||
if (text === '') return;
|
||||
// \x1b[s = save cursor, \n = down one (scrolls if at bottom),
|
||||
// \x1b[K = clear line, write ticker, \x1b[u = restore.
|
||||
process.stderr.write(`\x1b[s\n\x1b[K${styleStats(text)}\x1b[u`);
|
||||
tickerActive = true;
|
||||
}
|
||||
function clearTicker(): void {
|
||||
if (!STDERR_IS_TTY || !tickerActive) return;
|
||||
process.stderr.write('\x1b[s\n\x1b[K\x1b[u');
|
||||
tickerActive = false;
|
||||
}
|
||||
function stopTicker(): void {
|
||||
if (tickerTimer !== null) {
|
||||
clearInterval(tickerTimer);
|
||||
tickerTimer = null;
|
||||
}
|
||||
clearTicker();
|
||||
}
|
||||
|
||||
return new Promise<string>((resolve, reject) => {
|
||||
const driver = url.protocol === 'https:' ? https : http;
|
||||
const req = driver.request({
|
||||
@@ -402,14 +438,26 @@ async function streamOnce(
|
||||
const evt = JSON.parse(data) as ChatStreamFrame;
|
||||
switch (evt.type) {
|
||||
case 'text':
|
||||
if (typeof evt.delta === 'string') process.stdout.write(evt.delta);
|
||||
if (typeof evt.delta === 'string') {
|
||||
recordDelta(stats.content, evt.delta);
|
||||
process.stdout.write(evt.delta);
|
||||
if (tickerTimer === null && STDERR_IS_TTY) {
|
||||
tickerTimer = setInterval(drawTicker, TICK_MS);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 'thinking':
|
||||
// Reasoning models (qwen3-thinking, deepseek-reasoner, o1
|
||||
// family) emit this for tens of seconds before producing
|
||||
// any content delta. Show it dim+italic on stderr so the
|
||||
// final answer (stdout) stays clean for grepping/redirect.
|
||||
if (typeof evt.delta === 'string') process.stderr.write(styleThinking(evt.delta));
|
||||
if (typeof evt.delta === 'string') {
|
||||
recordDelta(stats.thinking, evt.delta);
|
||||
process.stderr.write(styleThinking(evt.delta));
|
||||
if (tickerTimer === null && STDERR_IS_TTY) {
|
||||
tickerTimer = setInterval(drawTicker, TICK_MS);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 'tool_call':
|
||||
process.stderr.write(`\n[tool_call: ${evt.toolName ?? ''}]\n`);
|
||||
@@ -430,11 +478,21 @@ async function streamOnce(
|
||||
}
|
||||
}
|
||||
});
|
||||
res.on('end', () => resolve(resolvedThread));
|
||||
res.on('error', reject);
|
||||
res.on('end', () => {
|
||||
stopTicker();
|
||||
const final = formatStats(stats, false);
|
||||
if (final !== '' && STDERR_IS_TTY) {
|
||||
process.stderr.write(`\n${styleStats(`(${final})`)}`);
|
||||
} else if (final !== '') {
|
||||
process.stderr.write(`\n(${final})`);
|
||||
}
|
||||
resolve(resolvedThread);
|
||||
});
|
||||
req.on('error', reject);
|
||||
res.on('error', (err) => { stopTicker(); reject(err); });
|
||||
});
|
||||
req.on('error', (err) => { stopTicker(); reject(err); });
|
||||
req.on('timeout', () => {
|
||||
stopTicker();
|
||||
req.destroy();
|
||||
reject(new Error('chat stream timed out'));
|
||||
});
|
||||
@@ -457,11 +515,48 @@ interface ChatStreamFrame {
|
||||
// reasoning ("the model is thinking") from final assistant content. We only
|
||||
// emit the codes when stderr is a TTY — piping to a file should stay clean.
|
||||
const ANSI_DIM_ITALIC = '\x1b[2;3m';
|
||||
const ANSI_DIM = '\x1b[2m';
|
||||
const ANSI_RESET = '\x1b[0m';
|
||||
const STDERR_IS_TTY = process.stderr.isTTY === true;
|
||||
function styleThinking(s: string): string {
|
||||
return STDERR_IS_TTY ? `${ANSI_DIM_ITALIC}${s}${ANSI_RESET}` : s;
|
||||
}
|
||||
function styleStats(s: string): string {
|
||||
return STDERR_IS_TTY ? `${ANSI_DIM}${s}${ANSI_RESET}` : s;
|
||||
}
|
||||
|
||||
interface PhaseStats {
|
||||
words: number;
|
||||
firstMs: number;
|
||||
lastMs: number;
|
||||
}
|
||||
function newPhase(): PhaseStats { return { words: 0, firstMs: 0, lastMs: 0 }; }
|
||||
function recordDelta(p: PhaseStats, delta: string): void {
|
||||
const now = Date.now();
|
||||
if (p.firstMs === 0) p.firstMs = now;
|
||||
p.lastMs = now;
|
||||
// Whitespace-separated tokens. Good enough across languages without a
|
||||
// tokenizer dependency. CJK languages will under-count, but for English/
|
||||
// code/Markdown (the common case) this matches user expectations.
|
||||
const matches = delta.match(/\S+/g);
|
||||
if (matches !== null) p.words += matches.length;
|
||||
}
|
||||
function formatPhase(label: string, p: PhaseStats): string | null {
|
||||
if (p.words === 0) return null;
|
||||
const sec = Math.max(0.05, (p.lastMs - p.firstMs) / 1000);
|
||||
const rate = p.words / sec;
|
||||
return `${label}${String(p.words)}w · ${rate.toFixed(1)} w/s · ${sec.toFixed(1)}s`;
|
||||
}
|
||||
function formatStats(s: { thinking: PhaseStats; content: PhaseStats }, partial: boolean): string {
|
||||
const parts: string[] = [];
|
||||
const c = formatPhase('', s.content);
|
||||
if (c !== null) parts.push(c);
|
||||
const t = formatPhase('thinking ', s.thinking);
|
||||
if (t !== null) parts.push(t);
|
||||
if (parts.length === 0) return '';
|
||||
const prefix = partial ? '⏵ ' : '';
|
||||
return `${prefix}${parts.join(' | ')}`;
|
||||
}
|
||||
|
||||
function collect(value: string, prev: string[]): string[] {
|
||||
return [...prev, value];
|
||||
|
||||
Reference in New Issue
Block a user