From 8b56f09f2578af10079eebc1bb1f943977e832de Mon Sep 17 00:00:00 2001
From: Michal <michal@itaz.eu>
Date: Sat, 25 Apr 2026 17:08:37 +0100
Subject: [PATCH] feat(agents): smoke tests + README + docs (Stage 6, final)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes the agents feature.

Smoke tests (run via `pnpm test:smoke` against a live mcpd at
$MCPD_URL, default https://mcpctl.ad.itaz.eu):

* tests/smoke/agent.smoke.test.ts — full CRUD round-trip:
  create secret + Llm + agent with sampling defaults; `get agents`
  surfaces it; `get agent foo -o yaml | apply -f` round-trips
  identically; create + list a thread via the HTTP API; agent delete
  leaves Llm + secret intact (Restrict + SetNull as designed). Self-
  skips with a warning when /healthz is unreachable.

* tests/smoke/agent-chat.smoke.test.ts — gated on
  MCPCTL_SMOKE_LLM_URL + MCPCTL_SMOKE_LLM_KEY. Provisions secret +
  Llm + agent against a real upstream, runs `mcpctl chat -m … --no-
  stream` (asserts a reply lands), then runs the streaming default
  (asserts text on stdout + `(thread: …)` on stderr). The fast path
  for verifying the in-cluster qwen3-thinking deployment:

      MCPCTL_SMOKE_LLM_URL=http://litellm.nvidia-nim.svc.cluster.local:4000/v1 \
      MCPCTL_SMOKE_LLM_MODEL=qwen3-thinking \
      MCPCTL_SMOKE_LLM_KEY=$(pulumi config get --stack homelab \
        secrets:litellmMcpctlGatewayToken) \
        pnpm test:smoke

Docs:

* README.md — new "Agents" section under Resources with the
  qwen3-thinking quickstart and links to docs/agents.md and
  docs/chat.md. Adds llm + agent rows to the resources table.

* docs/agents.md (new) — full reference: data model, chat-parameter
  table, HTTP API, RBAC mapping, tool-use loop semantics, yaml
  round-trip shorthand, the kubernetes-deployment wiring recipe,
  and a troubleshooting section (namespace collision, llm-in-use,
  pending-row recovery, Anthropic-tool limitation).

* docs/chat.md (new) — user-facing `mcpctl chat` walkthrough:
  modes, per-call flags, slash-commands, threads, and a
  troubleshooting section.

* CLAUDE.md — adds a "Resource types" cheatsheet with one-line
  pointers to each, including the new `agent` row that links to
  the docs.

All suites still green: mcpd 759/759, mcplocal 715/715, cli 430/430.
Smoke tests typecheck and self-skip when no live mcpd is reachable.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CLAUDE.md                                     |  15 ++
 README.md                                     |  47 ++++
 docs/agents.md                                | 197 +++++++++++++++
 docs/chat.md                                  | 124 +++++++++
 .../tests/smoke/agent-chat.smoke.test.ts      | 149 +++++++++++
 src/mcplocal/tests/smoke/agent.smoke.test.ts  | 235 ++++++++++++++++++
 6 files changed, 767 insertions(+)
 create mode 100644 docs/agents.md
 create mode 100644 docs/chat.md
 create mode 100644 src/mcplocal/tests/smoke/agent-chat.smoke.test.ts
 create mode 100644 src/mcplocal/tests/smoke/agent.smoke.test.ts
diff --git a/CLAUDE.md b/CLAUDE.md
index 90b4c39..967d913 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -23,3 +23,18 @@ Key routing rules:
 - Architecture review → invoke plan-eng-review
 - Save progress, checkpoint, resume → invoke checkpoint
 - Code quality, health check → invoke health
+
+## Resource types
+
+`mcpctl` resource cheatsheet:
+
+- `server` — MCP server definition
+- `instance` — running container (immutable, replicas-managed)
+- `secret` / `secretbackend` — credentials
+- `template` — reusable server blueprint
+- `project` — workspace grouping servers, prompts, agents
+- `llm` — server-managed LLM provider (api key + endpoint)
+- `agent` — LLM persona pinned to one Llm; project attach surfaces project Prompts as system context, project MCP servers as tools, and exposes the agent itself as an MCP virtual server (`agent-<name>/chat`). See `docs/agents.md`, `docs/chat.md`.
+- `prompt` / `promptrequest` — curated content / pending proposal
+- `rbac` — access control bindings
+- `mcptoken` — bearer credentials for HTTP-mode mcplocal
diff --git a/README.md b/README.md
index bc58953..ce12cb4 100644
--- a/README.md
+++ b/README.md
@@ -494,11 +494,58 @@ new FileCache('ns', { maxSize: '10%' })     // 10% of partition
 | **secret** | Key-value credentials | API tokens, passwords |
 | **template** | Reusable server blueprint | Community server configs |
 | **project** | Workspace grouping servers | "monitoring", "home-automation" |
+| **llm** | Server-managed LLM provider | OpenAI / Anthropic / vLLM endpoint + key |
+| **agent** | LLM persona pinned to one Llm | "I review security; ask after each major change" |
 | **prompt** | Curated content for Claude | Instructions, docs, guides |
 | **promptrequest** | Pending prompt proposal | LLM-submitted, needs approval |
 | **rbac** | Access control bindings | Who can do what |
 | **serverattachment** | Server-to-project link | Virtual resource for `apply` |
 
+## Agents
+
+An **Agent** is an LLM persona — a pinned `Llm`, a system prompt, an optional
+project attach, and LiteLLM-style sampling defaults. Once attached to a
+project, the agent inherits the project's prompts (merged into its system
+block, sorted by priority) and gets to call the project's MCP servers as
+tools during chat.
+
+Every agent is also exposed back to MCP clients as a virtual server named
+`agent-<name>` with one tool `chat`. So another Claude session connecting to
+the same project sees, e.g., `agent-reviewer/chat` in `tools/list` with the
+description "I review security design — ask me after each major change."
+That's how agents consult each other.
+
+```bash
+# 1) point at an LLM. For your in-cluster qwen3-thinking via LiteLLM:
+mcpctl create secret litellm-key --data API_KEY=sk-…
+mcpctl create llm qwen3-thinking \
+  --type openai \
+  --model qwen3-thinking \
+  --url http://litellm.nvidia-nim.svc.cluster.local:4000/v1 \
+  --api-key-ref litellm-key/API_KEY
+
+# 2) create an agent, pinned to that Llm and attached to a project
+mcpctl create agent reviewer \
+  --llm qwen3-thinking \
+  --project mcpctl-dev \
+  --description "I review security design — ask me after each major change." \
+  --system-prompt-file ./prompts/reviewer.md \
+  --default-temperature 0.2 --default-max-tokens 4096
+
+# 3) chat with it (interactive REPL — Ctrl-D to exit)
+mcpctl chat reviewer
+
+# Or one-shot
+mcpctl chat reviewer -m "Look at PR #42 and tell me what's risky."
+
+# Resume a thread
+mcpctl get threads --agent reviewer
+mcpctl chat reviewer --thread <id>
+```
+
+Full reference: [docs/agents.md](docs/agents.md). User-facing chat guide:
+[docs/chat.md](docs/chat.md).
+
 ## Commands
 
 ```bash
diff --git a/docs/agents.md b/docs/agents.md
new file mode 100644
index 0000000..fedab5e
--- /dev/null
+++ b/docs/agents.md
@@ -0,0 +1,197 @@
+# Agents
+
+An `Agent` is an LLM persona pinned to a specific `Llm`, with a system prompt,
+a description that surfaces in MCP `tools/list`, optional attachment to a
+`Project`, and LiteLLM-style sampling defaults. Conversations are persisted
+as `ChatThread` + `ChatMessage` rows so REPL sessions resume across runs.
+
+Two surfaces use an agent:
+
+1. **Direct chat** via `mcpctl chat <name>` (interactive REPL or one-shot
+   `-m "msg"`). Streams over SSE; tool calls and tool results print to
+   stderr in dim brackets. Slash-commands `/set`, `/system`, `/tools`,
+   `/clear`, `/save`, `/quit` adjust runtime behavior.
+
+2. **Virtual MCP server** registered into every project session by
+   mcplocal's agents plugin. The agent shows up as `agent-<name>` with
+   one tool `chat`, whose description is the agent's own description.
+   Other Claude sessions / MCP clients see the agent as just another
+   tool in `tools/list` and can consult it.
+
+## Data model
+
+Three Prisma models added to `src/db/prisma/schema.prisma`:
+
+- **`Agent`** — `name` (unique), `description`, `systemPrompt`, `llmId`
+  (FK Restrict — an Llm in active use cannot be deleted), `projectId`
+  (FK SetNull — agents survive project deletion), `proxyModelName`
+  (optional informational override), `defaultParams` (Json,
+  LiteLLM-style), `extras` (Json, reserved for future LoRA / tool
+  allowlists), `ownerId`, version, timestamps.
+
+- **`ChatThread`** — `agentId`, `ownerId`, `title`, `lastTurnAt`,
+  timestamps. Cascade delete on agent.
+
+- **`ChatMessage`** — `threadId`, `turnIndex` (monotonic per thread,
+  enforced by `@@unique([threadId, turnIndex])`), `role`
+  (`'system' | 'user' | 'assistant' | 'tool'`), `content`, `toolCalls`
+  (Json — assistant turn's `[{id,name,arguments}]`), `toolCallId`
+  (which call a tool turn answers), `status`
+  (`'pending' | 'complete' | 'error'`), `createdAt`. Cascade delete
+  on thread.
+
+`status` stays `pending` while the orchestrator runs an in-flight assistant
+or tool turn, then flips to `complete` once the round settles. On any
+exception in the chat loop, every `pending` row in the thread is flipped to
+`error` so the trail stays auditable.
+
+## Chat parameters (LiteLLM-style passthrough)
+
+Per-call resolution: request body → `agent.defaultParams` → adapter default.
+Setting a key to `null` in the request explicitly clears a default.
+
+| Key | Type | Notes |
+|---|---|---|
+| `temperature` | number | 0..2 |
+| `top_p` | number | 0..1 |
+| `top_k` | integer | Anthropic-only; OpenAI ignores |
+| `max_tokens` | integer | adapter clamps to provider max |
+| `stop` | string \| string[] | up to 4 sequences |
+| `presence_penalty` | number | OpenAI |
+| `frequency_penalty` | number | OpenAI |
+| `seed` | integer | reproducibility (provider-dependent) |
+| `response_format` | object | `text` \| `json_object` \| `json_schema` |
+| `tool_choice` | enum/object | `auto`\|`none`\|`required`\|`{type:'function',function:{name}}` |
+| `tools_allowlist` | string[] | restricts which project MCP tools the agent can call this turn |
+| `systemOverride` | string | replaces `agent.systemPrompt` for this call |
+| `systemAppend` | string | concatenated to system block (after project Prompts) |
+| `messages` | array | full message history override; if set, `message`/threadId history is ignored |
+| `extra` | object | provider-specific knobs (Anthropic `metadata.user_id`, vLLM `repetition_penalty`) — adapters cherry-pick |
+
+## HTTP API (mcpd)
+
+```
+GET    /api/v1/agents                  list (RBAC: view:agents)
+GET    /api/v1/agents/:idOrName        describe (view:agents)
+POST   /api/v1/agents                  create (create:agents)
+PUT    /api/v1/agents/:idOrName        update (edit:agents)
+DELETE /api/v1/agents/:idOrName        delete (delete:agents)
+POST   /api/v1/agents/:name/chat       chat — non-streaming or SSE (run:agents:<name>)
+POST   /api/v1/agents/:name/threads    create thread (run:agents:<name>)
+GET    /api/v1/agents/:name/threads    list threads (run:agents:<name>)
+GET    /api/v1/threads/:id/messages    replay history (view:agents)
+GET    /api/v1/projects/:p/agents      project-scoped list (view:projects:<p>)
+```
+
+The chat endpoint reuses the SSE pattern from `llm-infer.ts` exactly: same
+headers (`text/event-stream`, `X-Accel-Buffering: no`), same `data: …\n\n`
+framing, same `[DONE]` terminator. SSE chunk types:
+
+- `{type:'text', delta}` — assistant text increments
+- `{type:'tool_call', toolName, args}` — model decided to call a tool
+- `{type:'tool_result', toolName, ok}` — tool dispatch outcome
+- `{type:'final', threadId, turnIndex}` — terminal turn
+- `{type:'error', message}` — fatal error in the loop
+
+## Tool-use loop
+
+When the agent's project has MCP servers attached, mcpd's `ChatService` lists
+each server's tools (via `mcp-proxy.service.ts` — same path real MCP traffic
+uses) and presents them to the model namespaced as `<server>__<tool>`. On a
+`tool_calls` response the loop dispatches each call back through the same
+proxy, persists the assistant + tool turns linked by `toolCallId`, and loops
+(cap = 12 iterations) until the model returns terminal text.
+
+Persistence is **non-transactional across the loop** because tool calls can
+take minutes; long-held DB transactions would starve other writers.
+
+## RBAC
+
+Agents are their own resource (`agents`), independent of project bindings.
+Recommended:
+
+- `view:agents` — list / describe
+- `create:agents` / `edit:agents` / `delete:agents` — CRUD
+- `run:agents:<name>` — drive a chat turn or manage its threads
+
+Project-attached agents do not implicitly inherit project RBAC. If a project
+member should be able to chat with the project's agents, grant them
+`run:agents:<each-name>` (or wildcard `run:agents`) explicitly.
+
+## YAML round-trip
+
+`get agent foo -o yaml | mcpctl apply -f -` is a no-op. The `apply` schema
+also accepts shorthand:
+
+```yaml
+apiVersion: mcpctl.io/v1
+kind: agent
+metadata: { name: deployer }
+spec:
+  description: "I help you deploy code"
+  llm: qwen3-thinking          # shorthand for `{ name: qwen3-thinking }`
+  project: mcpctl-dev          # shorthand for `{ name: mcpctl-dev }`
+  systemPrompt: |
+    You are a deployment assistant for mcpctl. Always check fulldeploy.sh
+    and the k8s context before suggesting actions.
+  defaultParams:
+    temperature: 0.2
+    max_tokens: 4096
+    top_p: 0.9
+    stop: ["</deploy>"]
+```
+
+## Wiring against your in-cluster qwen3-thinking
+
+The `kubernetes-deployment` repo provisions LiteLLM in the `nvidia-nim`
+namespace (`http://litellm.nvidia-nim.svc.cluster.local:4000/v1` in-cluster,
+`https://llm.ad.itaz.eu/v1` external) and a virtual key reserved for mcpctl
+in the Pulumi secret `secrets:litellmMcpctlGatewayToken`. Pulling it once:
+
+```bash
+cd /path/to/kubernetes-deployment
+LITELLM_TOKEN=$(pulumi config get --stack homelab secrets:litellmMcpctlGatewayToken)
+
+# fallback if Pulumi isn't authed locally:
+# LITELLM_TOKEN=$(kubectl --context worker0-k8s0 -n nvidia-nim get secret litellm-secrets \
+#   -o jsonpath='{.data.LITELLM_MCPCTL_GATEWAY_TOKEN}' | base64 -d)
+
+cd /path/to/mcpctl
+mcpctl create secret litellm-key --data "API_KEY=${LITELLM_TOKEN}"
+mcpctl create llm qwen3-thinking \
+    --type openai --model qwen3-thinking \
+    --url http://litellm.nvidia-nim.svc.cluster.local:4000/v1 \
+    --api-key-ref litellm-key/API_KEY \
+    --description "Qwen3-30B-A3B-Thinking-FP8 via in-cluster vLLM behind LiteLLM"
+mcpctl create agent reviewer \
+    --llm qwen3-thinking \
+    --description "I review what you're shipping; ask after each major change." \
+    --default-temperature 0.2 --default-max-tokens 4096
+mcpctl chat reviewer
+```
+
+## Troubleshooting
+
+- **Namespace collision** in mcplocal: if a project has an upstream MCP
+  server literally named `agent-<x>`, the agents plugin detects the
+  collision in `onSessionCreate`, skips that agent's registration, and
+  emits a `ctx.log.warn` line. Document the `agent-` prefix as reserved
+  on real server names.
+
+- **Llm-in-use blocks delete**: `Agent.llm` is `onDelete: Restrict`. Detach
+  every agent (or delete them) before deleting the underlying Llm.
+
+- **Stale `pending` rows**: a crash mid-loop leaves `pending` ChatMessage
+  rows. The next request recovers — `markPendingAsError` flips them on the
+  next failure path, and `loadHistory` filters out `error` rows when
+  rebuilding context for the next turn.
+
+- **`proxyModelName` is informational only** for agents. The agent's own
+  internal tool loop runs server-side in mcpd and bypasses mcplocal's
+  proxymodel pipeline entirely. Don't try to plumb it.
+
+- **Anthropic + tools**: the Anthropic adapter currently drops `tool` role
+  messages and doesn't translate OpenAI `tool_calls` to Anthropic
+  `tool_use` / `tool_result` blocks. Use an OpenAI-compatible provider
+  (LiteLLM, vLLM, OpenAI) for agents that need tool calling until that
+  translation lands.
diff --git a/docs/chat.md b/docs/chat.md
new file mode 100644
index 0000000..c93f270
--- /dev/null
+++ b/docs/chat.md
@@ -0,0 +1,124 @@
+# `mcpctl chat`
+
+Open an interactive chat session with an `Agent`, or send a single message
+in one shot. See [agents.md](agents.md) for what an Agent is and how to
+create one.
+
+## Modes
+
+```bash
+mcpctl chat <agent>                 # interactive REPL, new thread
+mcpctl chat <agent> --thread <id>   # interactive REPL, resume thread
+mcpctl chat <agent> -m "hi"         # one-shot, prints reply, no REPL
+mcpctl chat <agent> -m "hi" --no-stream  # one-shot, single JSON response (no SSE)
+```
+
+Streaming is on by default. Text deltas land on stdout as they arrive; tool
+calls and tool results print to stderr in dim brackets so the chat output
+stays clean.
+
+## Per-call flags
+
+All optional. They override the agent's `defaultParams` for this session
+only — use the in-REPL `/save` slash-command to persist the current set
+back to the agent.
+
+```bash
+--system <text>              # replace agent.systemPrompt for this session
+--system-file <path>         # read --system text from a file
+--system-append <text>       # append to the agent system block (after project Prompts)
+--temperature <n>            # 0..2
+--top-p <n>                  # 0..1
+--top-k <n>                  # integer; Anthropic-only, OpenAI ignores
+--max-tokens <n>             # cap on assistant tokens
+--seed <n>                   # reproducibility (provider-dependent)
+--stop <text>                # stop sequence (repeatable, up to 4)
+--allow-tool <name>          # repeat to allowlist project MCP tools
+--extra <key=value>          # provider-specific knob (repeatable)
+--no-stream                  # disable SSE; single JSON response
+```
+
+`--extra` is the LiteLLM-style escape hatch: pass anything the underlying
+adapter understands. Numeric values are auto-parsed (`--extra
+repetition_penalty=1.1`); strings stay strings.
+
+## In-REPL slash-commands
+
+```
+/set KEY VALUE      adjust an override for the rest of the session
+                    (temperature, top-p, top-k, max-tokens, seed, stop,
+                     or any provider-specific knob — unknown keys go
+                     into `extra`)
+/system <text>      set systemAppend for this turn onward (empty = clear)
+/tools              list MCP servers the agent can call as tools
+/clear              start a fresh thread (same agent)
+/save               PATCH agent.defaultParams = current overrides
+                    (systemOverride / systemAppend are NOT persisted)
+/quit, /exit        leave the REPL (Ctrl-D works too)
+```
+
+## Threads
+
+Threads persist server-side. To resume:
+
+```bash
+mcpctl get threads --agent reviewer
+mcpctl chat reviewer --thread <id>
+```
+
+A `mcpctl get thread <id>` reads the message log:
+
+```bash
+mcpctl get thread c0abc… -o yaml
+```
+
+## Examples
+
+**Quick gut-check on a deploy:**
+
+```bash
+$ mcpctl chat reviewer -m "is fulldeploy.sh safe to run on the current branch?"
+Yes — I checked: tests are green on commit 727e7d6 and there's no
+in-flight migration. The k8s context is worker0-k8s0 (production); confirm
+that's intended before running.
+(thread: cm9k…)
+```
+
+**Resuming with overrides:**
+
+```bash
+$ mcpctl chat deployer --thread cm9k… --temperature 0.0 --max-tokens 256
+> walk me through what changed since the last deploy
+…
+```
+
+**Pinning sampling defaults to the agent:**
+
+```
+$ mcpctl chat deployer --temperature 0.0 --max-tokens 8000
+> /save
+(saved current overrides as agent.defaultParams)
+> /quit
+```
+
+## Troubleshooting
+
+- **No agents appear in `tools/list`** — check the agent has a project
+  attach (`mcpctl describe agent <name>`). The mcplocal plugin only
+  exposes agents on their attached project's session.
+
+- **Tool calls fail with `Project not found`** — the agent has no project
+  attach. Either attach it (`mcpctl edit agent <name>` and set the project
+  field), or expect text-only chat.
+
+- **Anthropic agents can't call tools** — known limitation; the Anthropic
+  adapter doesn't translate OpenAI tool format yet. Use LiteLLM or a
+  direct OpenAI-compatible provider for tool-using agents until the
+  translator ships.
+
+- **`mcpctl chat <agent>` returns 404** — the agent name doesn't resolve.
+  `mcpctl get agents` to confirm spelling.
+
+- **REPL feels stuck** — agent tool calls can take minutes (e.g. running a
+  Grafana query). Watch stderr for `[tool_call: …]` / `[tool_result: …]`
+  brackets; those tell you the loop is alive.
diff --git a/src/mcplocal/tests/smoke/agent-chat.smoke.test.ts b/src/mcplocal/tests/smoke/agent-chat.smoke.test.ts
new file mode 100644
index 0000000..d96b03b
--- /dev/null
+++ b/src/mcplocal/tests/smoke/agent-chat.smoke.test.ts
@@ -0,0 +1,149 @@
+/**
+ * Live-LLM smoke for agent chat.
+ *
+ * Runs only when MCPCTL_SMOKE_LLM_URL + MCPCTL_SMOKE_LLM_KEY are set; the
+ * idea is to point this at a real OpenAI-compatible endpoint and confirm
+ * the openai-passthrough adapter delivers the user's message and returns
+ * an assistant reply. For the project's qwen3-thinking deployment:
+ *
+ *   MCPCTL_SMOKE_LLM_URL=http://litellm.nvidia-nim.svc.cluster.local:4000/v1 \
+ *   MCPCTL_SMOKE_LLM_MODEL=qwen3-thinking \
+ *   MCPCTL_SMOKE_LLM_KEY=sk-... \
+ *     pnpm test:smoke
+ *
+ * If the env vars are missing the test self-skips without failing the
+ * pipeline (the agent CRUD smoke still runs in agent.smoke.test.ts).
+ */
+import { describe, it, expect, beforeAll, afterAll } from 'vitest';
+import http from 'node:http';
+import https from 'node:https';
+import { execSync } from 'node:child_process';
+
+const MCPD_URL = process.env.MCPD_URL ?? 'https://mcpctl.ad.itaz.eu';
+const LLM_URL = process.env.MCPCTL_SMOKE_LLM_URL;
+const LLM_MODEL = process.env.MCPCTL_SMOKE_LLM_MODEL ?? 'qwen3-thinking';
+const LLM_KEY = process.env.MCPCTL_SMOKE_LLM_KEY;
+const SUFFIX = Date.now().toString(36);
+const SECRET_NAME = `smoke-chat-sec-${SUFFIX}`;
+const LLM_NAME = `smoke-chat-llm-${SUFFIX}`;
+const AGENT_NAME = `smoke-chat-agent-${SUFFIX}`;
+
+interface CliResult { code: number; stdout: string; stderr: string }
+
+function run(args: string): CliResult {
+  try {
+    const stdout = execSync(`mcpctl --direct ${args}`, {
+      encoding: 'utf-8',
+      timeout: 60_000,
+      stdio: ['ignore', 'pipe', 'pipe'],
+    });
+    return { code: 0, stdout: stdout.trim(), stderr: '' };
+  } catch (err) {
+    const e = err as { status?: number; stdout?: Buffer | string; stderr?: Buffer | string };
+    return {
+      code: e.status ?? 1,
+      stdout: e.stdout ? (typeof e.stdout === 'string' ? e.stdout : e.stdout.toString('utf-8')) : '',
+      stderr: e.stderr ? (typeof e.stderr === 'string' ? e.stderr : e.stderr.toString('utf-8')) : '',
+    };
+  }
+}
+
+function healthz(url: string, timeoutMs = 5000): Promise<boolean> {
+  return new Promise((resolve) => {
+    const parsed = new URL(`${url.replace(/\/$/, '')}/healthz`);
+    const driver = parsed.protocol === 'https:' ? https : http;
+    const req = driver.get({
+      hostname: parsed.hostname,
+      port: parsed.port || (parsed.protocol === 'https:' ? 443 : 80),
+      path: parsed.pathname,
+      timeout: timeoutMs,
+    }, (res) => { resolve((res.statusCode ?? 500) < 500); res.resume(); });
+    req.on('error', () => resolve(false));
+    req.on('timeout', () => { req.destroy(); resolve(false); });
+  });
+}
+
+let mcpdUp = false;
+const liveLlmConfigured = LLM_URL !== undefined && LLM_KEY !== undefined;
+
+describe('agent chat smoke (live LLM)', () => {
+  beforeAll(async () => {
+    if (!liveLlmConfigured) {
+      // eslint-disable-next-line no-console
+      console.warn('\n  ○ agent-chat smoke: skipped — set MCPCTL_SMOKE_LLM_URL + MCPCTL_SMOKE_LLM_KEY to run against a real LLM.\n');
+      return;
+    }
+    mcpdUp = await healthz(MCPD_URL);
+    if (!mcpdUp) {
+      // eslint-disable-next-line no-console
+      console.warn(`\n  ○ agent-chat smoke: skipped — ${MCPD_URL}/healthz unreachable.\n`);
+    }
+  }, 20_000);
+
+  afterAll(() => {
+    if (!liveLlmConfigured || !mcpdUp) return;
+    run(`delete agent ${AGENT_NAME}`);
+    run(`delete llm ${LLM_NAME}`);
+    run(`delete secret ${SECRET_NAME}`);
+  });
+
+  it('provisions secret + Llm + agent against the live endpoint', () => {
+    if (!liveLlmConfigured || !mcpdUp) return;
+    run(`delete secret ${SECRET_NAME}`);
+    run(`delete llm ${LLM_NAME}`);
+    run(`delete agent ${AGENT_NAME}`);
+    const sec = run(`create secret ${SECRET_NAME} --data API_KEY=${LLM_KEY!}`);
+    expect(sec.code, sec.stderr).toBe(0);
+    const llm = run([
+      `create llm ${LLM_NAME}`,
+      '--type openai',
+      `--model ${LLM_MODEL}`,
+      `--url ${LLM_URL!}`,
+      `--api-key-ref ${SECRET_NAME}/API_KEY`,
+    ].join(' '));
+    expect(llm.code, llm.stderr).toBe(0);
+    const agent = run([
+      `create agent ${AGENT_NAME}`,
+      `--llm ${LLM_NAME}`,
+      `--description "live chat smoke"`,
+      `--system-prompt "You are a smoke test. Always reply with the single token READY."`,
+      '--default-temperature 0',
+      '--default-max-tokens 32',
+    ].join(' '));
+    expect(agent.code, agent.stderr).toBe(0);
+  });
+
+  it('one-shot `mcpctl chat` sends a message and prints a reply', () => {
+    if (!liveLlmConfigured || !mcpdUp) return;
+    const result = run(`chat ${AGENT_NAME} -m "ping" --no-stream`);
+    expect(result.code, result.stderr).toBe(0);
+    expect(result.stdout.length).toBeGreaterThan(0);
+    // We can't bind too tightly to model output but the system prompt nudges
+    // toward "READY". Either way: we got a reply.
+    expect(result.stderr).toMatch(/thread:\s+c[a-z0-9]+/);
+  });
+
+  it('streaming `mcpctl chat` emits text deltas', () => {
+    if (!liveLlmConfigured || !mcpdUp) return;
+    // Default mode is streaming. Pipe stdout/stderr separately.
+    let stdout = '';
+    let stderr = '';
+    try {
+      const out = execSync(`mcpctl --direct chat ${AGENT_NAME} -m "say hello" 2> /tmp/agent-smoke-err`, {
+        encoding: 'utf-8', timeout: 60_000,
+      });
+      stdout = out;
+    } catch (err) {
+      const e = err as { status?: number; stdout?: Buffer | string };
+      stdout = e.stdout ? (typeof e.stdout === 'string' ? e.stdout : e.stdout.toString('utf-8')) : '';
+    }
+    try {
+      // eslint-disable-next-line @typescript-eslint/no-require-imports
+      const fs = require('node:fs') as typeof import('node:fs');
+      stderr = fs.readFileSync('/tmp/agent-smoke-err', 'utf-8');
+      fs.unlinkSync('/tmp/agent-smoke-err');
+    } catch { /* ignore */ }
+    expect(stdout.length).toBeGreaterThan(0);
+    expect(stderr).toMatch(/thread:\s+c[a-z0-9]+/);
+  });
+});
diff --git a/src/mcplocal/tests/smoke/agent.smoke.test.ts b/src/mcplocal/tests/smoke/agent.smoke.test.ts
new file mode 100644
index 0000000..7944d22
--- /dev/null
+++ b/src/mcplocal/tests/smoke/agent.smoke.test.ts
@@ -0,0 +1,235 @@
+/**
+ * Smoke tests: Agent resource CRUD + thread management against a live mcpd.
+ *
+ * Validates Stages 1-5 end-to-end without requiring a live LLM upstream:
+ *   1. Create a secret + Llm referencing it.
+ *   2. Create an Agent pinned to that Llm with defaultParams.
+ *   3. `mcpctl get agents` shows the row; describe pretty-prints it.
+ *   4. `mcpctl get agent foo -o yaml | apply -f -` round-trips identically.
+ *   5. POST /api/v1/agents/:name/threads creates a thread; GET lists it.
+ *   6. Cleanup leaves the underlying Llm/Secret intact.
+ *
+ * Actual chat turns (which require a live LLM) live in agent-chat.smoke.test.ts
+ * and are gated on MCPCTL_SMOKE_LLM_URL being set.
+ */
+import { describe, it, expect, beforeAll, afterAll } from 'vitest';
+import http from 'node:http';
+import https from 'node:https';
+import { execSync } from 'node:child_process';
+import { writeFileSync, unlinkSync, mkdtempSync } from 'node:fs';
+import { join } from 'node:path';
+import { tmpdir } from 'node:os';
+
+const MCPD_URL = process.env.MCPD_URL ?? 'https://mcpctl.ad.itaz.eu';
+const SUFFIX = Date.now().toString(36);
+const SECRET_NAME = `smoke-agent-sec-${SUFFIX}`;
+const LLM_NAME = `smoke-agent-llm-${SUFFIX}`;
+const AGENT_NAME = `smoke-agent-${SUFFIX}`;
+
+interface CliResult { code: number; stdout: string; stderr: string }
+
+function run(args: string): CliResult {
+  try {
+    const stdout = execSync(`mcpctl --direct ${args}`, {
+      encoding: 'utf-8',
+      timeout: 30_000,
+      stdio: ['ignore', 'pipe', 'pipe'],
+    });
+    return { code: 0, stdout: stdout.trim(), stderr: '' };
+  } catch (err) {
+    const e = err as { status?: number; stdout?: Buffer | string; stderr?: Buffer | string };
+    return {
+      code: e.status ?? 1,
+      stdout: e.stdout ? (typeof e.stdout === 'string' ? e.stdout : e.stdout.toString('utf-8')) : '',
+      stderr: e.stderr ? (typeof e.stderr === 'string' ? e.stderr : e.stderr.toString('utf-8')) : '',
+    };
+  }
+}
+
+function healthz(url: string, timeoutMs = 5000): Promise<boolean> {
+  return new Promise((resolve) => {
+    const parsed = new URL(`${url.replace(/\/$/, '')}/healthz`);
+    const driver = parsed.protocol === 'https:' ? https : http;
+    const req = driver.get(
+      {
+        hostname: parsed.hostname,
+        port: parsed.port || (parsed.protocol === 'https:' ? 443 : 80),
+        path: parsed.pathname,
+        timeout: timeoutMs,
+      },
+      (res) => { resolve((res.statusCode ?? 500) < 500); res.resume(); },
+    );
+    req.on('error', () => resolve(false));
+    req.on('timeout', () => { req.destroy(); resolve(false); });
+  });
+}
+
+let mcpdUp = false;
+
+describe('agent smoke', () => {
+  beforeAll(async () => {
+    mcpdUp = await healthz(MCPD_URL);
+    if (!mcpdUp) {
+      // eslint-disable-next-line no-console
+      console.warn(`\n  ○ agent smoke: skipped — ${MCPD_URL}/healthz unreachable.\n`);
+    }
+  }, 20_000);
+
+  afterAll(() => {
+    if (!mcpdUp) return;
+    run(`delete agent ${AGENT_NAME}`);
+    run(`delete llm ${LLM_NAME}`);
+    run(`delete secret ${SECRET_NAME}`);
+  });
+
+  it('creates a secret to back the Llm api key', () => {
+    if (!mcpdUp) return;
+    run(`delete secret ${SECRET_NAME}`);
+    const result = run(`create secret ${SECRET_NAME} --data API_KEY=sk-fake-smoke`);
+    expect(result.code, result.stderr).toBe(0);
+  });
+
+  it('creates an Llm pinned to that secret', () => {
+    if (!mcpdUp) return;
+    run(`delete llm ${LLM_NAME}`);
+    const result = run([
+      `create llm ${LLM_NAME}`,
+      '--type openai',
+      '--model gpt-4o-mini',
+      '--url http://localhost:9999',
+      `--api-key-ref ${SECRET_NAME}/API_KEY`,
+    ].join(' '));
+    expect(result.code, result.stderr).toBe(0);
+  });
+
+  it('creates an agent pinned to that Llm with sampling defaults', () => {
+    if (!mcpdUp) return;
+    run(`delete agent ${AGENT_NAME}`);
+    const result = run([
+      `create agent ${AGENT_NAME}`,
+      `--llm ${LLM_NAME}`,
+      `--description "smoke agent for end-to-end CRUD"`,
+      `--system-prompt "You are a smoke-test agent."`,
+      '--default-temperature 0.2',
+      '--default-max-tokens 512',
+    ].join(' '));
+    expect(result.code, result.stderr || result.stdout).toBe(0);
+    expect(result.stdout).toMatch(new RegExp(`agent '${AGENT_NAME}'`));
+  });
+
+  it('lists the agent in `get agents`', () => {
+    if (!mcpdUp) return;
+    const result = run('get agents -o json');
+    expect(result.code).toBe(0);
+    const rows = JSON.parse(result.stdout) as Array<{ name: string; llm: { name: string }; defaultParams: { temperature?: number } }>;
+    const row = rows.find((r) => r.name === AGENT_NAME);
+    expect(row, `agent ${AGENT_NAME} must be present`).toBeDefined();
+    expect(row!.llm.name).toBe(LLM_NAME);
+    expect(row!.defaultParams.temperature).toBe(0.2);
+  });
+
+  it('round-trips yaml output through apply -f without diff', () => {
+    if (!mcpdUp) return;
+    const yaml = run(`get agent ${AGENT_NAME} -o yaml`);
+    expect(yaml.code).toBe(0);
+    expect(yaml.stdout).toMatch(/kind:\s+agent/i);
+    expect(yaml.stdout).toContain(`name: ${AGENT_NAME}`);
+
+    const dir = mkdtempSync(join(tmpdir(), 'mcpctl-agent-smoke-'));
+    const path = join(dir, 'agent.yaml');
+    const amended = yaml.stdout.replace(
+      'smoke agent for end-to-end CRUD',
+      'smoke agent (amended)',
+    );
+    writeFileSync(path, amended);
+    try {
+      const applied = run(`apply -f ${path}`);
+      expect(applied.code, applied.stderr || applied.stdout).toBe(0);
+      const second = run(`get agent ${AGENT_NAME} -o json`);
+      const parsed = JSON.parse(second.stdout) as { description: string };
+      expect(parsed.description).toBe('smoke agent (amended)');
+    } finally {
+      unlinkSync(path);
+    }
+  });
+
+  it('creates a chat thread and the agent lists it', async () => {
+    if (!mcpdUp) return;
+    const create = await httpRequest('POST', `${MCPD_URL}/api/v1/agents/${AGENT_NAME}/threads`, {
+      title: 'smoke thread',
+    });
+    expect(create.status).toBe(201);
+    const created = JSON.parse(create.body) as { id: string };
+    expect(created.id).toMatch(/^c[a-z0-9]+/);
+
+    const list = await httpRequest('GET', `${MCPD_URL}/api/v1/agents/${AGENT_NAME}/threads`, undefined);
+    expect(list.status).toBe(200);
+    const threads = JSON.parse(list.body) as Array<{ id: string; title: string }>;
+    expect(threads.some((t) => t.id === created.id && t.title === 'smoke thread')).toBe(true);
+
+    const messages = await httpRequest('GET', `${MCPD_URL}/api/v1/threads/${created.id}/messages`, undefined);
+    expect(messages.status).toBe(200);
+    expect(JSON.parse(messages.body)).toEqual([]);
+  });
+
+  it('deletes the agent and leaves the underlying Llm + secret intact', () => {
+    if (!mcpdUp) return;
+    const del = run(`delete agent ${AGENT_NAME}`);
+    expect(del.code, del.stderr).toBe(0);
+
+    const llm = run(`describe llm ${LLM_NAME}`);
+    expect(llm.code).toBe(0);
+  });
+});
+
+interface HttpResponse { status: number; body: string }
+
+/**
+ * Async HTTP helper. Authenticates using the same token the CLI carries via
+ * `mcpctl --direct` (read from ~/.mcpctl/credentials.json).
+ */
+function httpRequest(method: string, urlStr: string, body: unknown): Promise<HttpResponse> {
+  return new Promise((resolve, reject) => {
+    const tokenRaw = readToken();
+    const parsed = new URL(urlStr);
+    const driver = parsed.protocol === 'https:' ? https : http;
+    const headers: Record<string, string> = {
+      Accept: 'application/json',
+      ...(body !== undefined ? { 'Content-Type': 'application/json' } : {}),
+      ...(tokenRaw !== null ? { Authorization: `Bearer ${tokenRaw}` } : {}),
+    };
+    const req = driver.request({
+      hostname: parsed.hostname,
+      port: parsed.port || (parsed.protocol === 'https:' ? 443 : 80),
+      path: parsed.pathname + parsed.search,
+      method,
+      headers,
+      timeout: 15_000,
+    }, (res) => {
+      const chunks: Buffer[] = [];
+      res.on('data', (c: Buffer) => chunks.push(c));
+      res.on('end', () => {
+        resolve({ status: res.statusCode ?? 0, body: Buffer.concat(chunks).toString('utf-8') });
+      });
+    });
+    req.on('error', reject);
+    req.on('timeout', () => { req.destroy(); reject(new Error(`httpRequest timeout: ${method} ${urlStr}`)); });
+    if (body !== undefined) req.write(JSON.stringify(body));
+    req.end();
+  });
+}
+
+function readToken(): string | null {
+  try {
+    const home = process.env.HOME ?? '';
+    const path = `${home}/.mcpctl/credentials.json`;
+    // eslint-disable-next-line @typescript-eslint/no-require-imports
+    const fs = require('node:fs') as typeof import('node:fs');
+    if (!fs.existsSync(path)) return null;
+    const raw = fs.readFileSync(path, 'utf-8');
+    const parsed = JSON.parse(raw) as { token?: string };
+    return parsed.token ?? null;
+  } catch {
+    return null;
+  }
+}