feat(chat): live tokens/sec ticker + final stats footer
While streaming, the REPL now shows a live word/sec counter on a status
line one row below the cursor — refreshes every 250ms via ANSI cursor
save+restore so it floats with the content as the response grows.
After each response, a dim stats footer prints on stderr:
(47w · 12.3 w/s · 3.9s | thinking 234w · 38.0 w/s · 6.2s)
The ticker is stderr-only and only emits when stderr is a TTY — pipes
to a file stay clean for grepping/redirect. Words are whitespace-
separated tokens (good enough across English/code/Markdown without a
tokenizer dependency; CJK under-counts but the rate is still
directional).
Both phases tracked separately:
- thinking: reasoning_content from qwen3-thinking / deepseek-reasoner
/ o1, where the model's scratchpad is the long part
- content: the actual assistant answer
Final stats also added to the --no-stream path: total HTTP duration
and word count, since we don't get per-token timing there.
CLI suite still 430/430.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -141,9 +141,12 @@ async function runOneShot(
|
||||
if (stream === false) {
|
||||
const body: Record<string, unknown> = { message, ...overrides };
|
||||
if (threadId !== undefined) body.threadId = threadId;
|
||||
const startMs = Date.now();
|
||||
const res = await chatRequestNonStream(deps, agent, body);
|
||||
const sec = Math.max(0.05, (Date.now() - startMs) / 1000);
|
||||
const words = (res.assistant.match(/\S+/g) ?? []).length;
|
||||
process.stdout.write(`${res.assistant}\n`);
|
||||
process.stderr.write(`(thread: ${res.threadId})\n`);
|
||||
process.stderr.write(styleStats(`(${String(words)}w · ${(words / sec).toFixed(1)} w/s · ${sec.toFixed(1)}s)`) + ` thread:${res.threadId}\n`);
|
||||
return;
|
||||
}
|
||||
const finalThread = await streamOnce(deps, agent, message, threadId, overrides);
|
||||
@@ -365,6 +368,39 @@ async function streamOnce(
|
||||
const url = new URL(`${deps.baseUrl}/api/v1/agents/${encodeURIComponent(agent)}/chat`);
|
||||
const body = JSON.stringify({ message, threadId, stream: true, ...overrides });
|
||||
|
||||
// Per-response counters. Updated on every text/thinking delta, surfaced
|
||||
// via the live ticker (stderr) and the final stats footer.
|
||||
const stats = { thinking: newPhase(), content: newPhase() };
|
||||
|
||||
// Live ticker: every TICK_MS, draws a stats line on a ledger one row below
|
||||
// the current cursor using ANSI save/restore. The ledger floats with the
|
||||
// content as it grows (terminal scrolls take the saved position with them
|
||||
// on modern emulators). Disabled when stderr isn't a TTY (pipes stay clean).
|
||||
const TICK_MS = 250;
|
||||
let tickerTimer: NodeJS.Timeout | null = null;
|
||||
let tickerActive = false;
|
||||
function drawTicker(): void {
|
||||
if (!STDERR_IS_TTY) return;
|
||||
const text = formatStats(stats, true);
|
||||
if (text === '') return;
|
||||
// \x1b[s = save cursor, \n = down one (scrolls if at bottom),
|
||||
// \x1b[K = clear line, write ticker, \x1b[u = restore.
|
||||
process.stderr.write(`\x1b[s\n\x1b[K${styleStats(text)}\x1b[u`);
|
||||
tickerActive = true;
|
||||
}
|
||||
function clearTicker(): void {
|
||||
if (!STDERR_IS_TTY || !tickerActive) return;
|
||||
process.stderr.write('\x1b[s\n\x1b[K\x1b[u');
|
||||
tickerActive = false;
|
||||
}
|
||||
function stopTicker(): void {
|
||||
if (tickerTimer !== null) {
|
||||
clearInterval(tickerTimer);
|
||||
tickerTimer = null;
|
||||
}
|
||||
clearTicker();
|
||||
}
|
||||
|
||||
return new Promise<string>((resolve, reject) => {
|
||||
const driver = url.protocol === 'https:' ? https : http;
|
||||
const req = driver.request({
|
||||
@@ -402,14 +438,26 @@ async function streamOnce(
|
||||
const evt = JSON.parse(data) as ChatStreamFrame;
|
||||
switch (evt.type) {
|
||||
case 'text':
|
||||
if (typeof evt.delta === 'string') process.stdout.write(evt.delta);
|
||||
if (typeof evt.delta === 'string') {
|
||||
recordDelta(stats.content, evt.delta);
|
||||
process.stdout.write(evt.delta);
|
||||
if (tickerTimer === null && STDERR_IS_TTY) {
|
||||
tickerTimer = setInterval(drawTicker, TICK_MS);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 'thinking':
|
||||
// Reasoning models (qwen3-thinking, deepseek-reasoner, o1
|
||||
// family) emit this for tens of seconds before producing
|
||||
// any content delta. Show it dim+italic on stderr so the
|
||||
// final answer (stdout) stays clean for grepping/redirect.
|
||||
if (typeof evt.delta === 'string') process.stderr.write(styleThinking(evt.delta));
|
||||
if (typeof evt.delta === 'string') {
|
||||
recordDelta(stats.thinking, evt.delta);
|
||||
process.stderr.write(styleThinking(evt.delta));
|
||||
if (tickerTimer === null && STDERR_IS_TTY) {
|
||||
tickerTimer = setInterval(drawTicker, TICK_MS);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 'tool_call':
|
||||
process.stderr.write(`\n[tool_call: ${evt.toolName ?? ''}]\n`);
|
||||
@@ -430,11 +478,21 @@ async function streamOnce(
|
||||
}
|
||||
}
|
||||
});
|
||||
res.on('end', () => resolve(resolvedThread));
|
||||
res.on('error', reject);
|
||||
res.on('end', () => {
|
||||
stopTicker();
|
||||
const final = formatStats(stats, false);
|
||||
if (final !== '' && STDERR_IS_TTY) {
|
||||
process.stderr.write(`\n${styleStats(`(${final})`)}`);
|
||||
} else if (final !== '') {
|
||||
process.stderr.write(`\n(${final})`);
|
||||
}
|
||||
resolve(resolvedThread);
|
||||
});
|
||||
req.on('error', reject);
|
||||
res.on('error', (err) => { stopTicker(); reject(err); });
|
||||
});
|
||||
req.on('error', (err) => { stopTicker(); reject(err); });
|
||||
req.on('timeout', () => {
|
||||
stopTicker();
|
||||
req.destroy();
|
||||
reject(new Error('chat stream timed out'));
|
||||
});
|
||||
@@ -457,11 +515,48 @@ interface ChatStreamFrame {
|
||||
// reasoning ("the model is thinking") from final assistant content. We only
|
||||
// emit the codes when stderr is a TTY — piping to a file should stay clean.
|
||||
const ANSI_DIM_ITALIC = '\x1b[2;3m';
|
||||
const ANSI_DIM = '\x1b[2m';
|
||||
const ANSI_RESET = '\x1b[0m';
|
||||
const STDERR_IS_TTY = process.stderr.isTTY === true;
|
||||
function styleThinking(s: string): string {
|
||||
return STDERR_IS_TTY ? `${ANSI_DIM_ITALIC}${s}${ANSI_RESET}` : s;
|
||||
}
|
||||
function styleStats(s: string): string {
|
||||
return STDERR_IS_TTY ? `${ANSI_DIM}${s}${ANSI_RESET}` : s;
|
||||
}
|
||||
|
||||
interface PhaseStats {
|
||||
words: number;
|
||||
firstMs: number;
|
||||
lastMs: number;
|
||||
}
|
||||
function newPhase(): PhaseStats { return { words: 0, firstMs: 0, lastMs: 0 }; }
|
||||
function recordDelta(p: PhaseStats, delta: string): void {
|
||||
const now = Date.now();
|
||||
if (p.firstMs === 0) p.firstMs = now;
|
||||
p.lastMs = now;
|
||||
// Whitespace-separated tokens. Good enough across languages without a
|
||||
// tokenizer dependency. CJK languages will under-count, but for English/
|
||||
// code/Markdown (the common case) this matches user expectations.
|
||||
const matches = delta.match(/\S+/g);
|
||||
if (matches !== null) p.words += matches.length;
|
||||
}
|
||||
function formatPhase(label: string, p: PhaseStats): string | null {
|
||||
if (p.words === 0) return null;
|
||||
const sec = Math.max(0.05, (p.lastMs - p.firstMs) / 1000);
|
||||
const rate = p.words / sec;
|
||||
return `${label}${String(p.words)}w · ${rate.toFixed(1)} w/s · ${sec.toFixed(1)}s`;
|
||||
}
|
||||
function formatStats(s: { thinking: PhaseStats; content: PhaseStats }, partial: boolean): string {
|
||||
const parts: string[] = [];
|
||||
const c = formatPhase('', s.content);
|
||||
if (c !== null) parts.push(c);
|
||||
const t = formatPhase('thinking ', s.thinking);
|
||||
if (t !== null) parts.push(t);
|
||||
if (parts.length === 0) return '';
|
||||
const prefix = partial ? '⏵ ' : '';
|
||||
return `${prefix}${parts.join(' | ')}`;
|
||||
}
|
||||
|
||||
function collect(value: string, prev: string[]): string[] {
|
||||
return [...prev, value];
|
||||
|
||||
Reference in New Issue
Block a user