Files
claude-code/packages/@ant/computer-use-mcp/src/tools.ts
unraid c17edcb12e feat: Computer Use — Windows 跨平台支持 + GUI 无障碍增强 + Python Bridge
三平台 Computer Use (macOS + Windows + Linux),Windows 专项增强。

- MCP server: toolCalls/tools/executor/mcpServer 等 12 文件完整实现
- 平台抽象层: platforms/{win32,darwin,linux}.ts
- 跨平台 executor: executorCrossPlatform.ts
- CHICAGO_MCP + VOICE_MODE feature flags 启用

- windowMessage.ts: SendMessageW (WM_CHAR Unicode + 剪贴板粘贴)
- windowBorder.ts: 4 叠加窗口边框 (30fps 跟踪)
- uiAutomation.ts: UI Automation 元素树/点击/写值
- accessibilitySnapshot.ts: 无障碍快照 → 模型感知 GUI
- bridge.py + bridgeClient.ts: Python 长驻进程 (替代 per-call PS)

- window_management: min/max/restore/close/focus (Win32 API)
- click_element / type_into_element: 按名称操作 (无需坐标)
- 截图自动附带 Accessibility Snapshot

- 17 种方法, stdin/stdout JSON 通信
- 窗口枚举 1.5ms vs PS 500ms, 截图 360ms vs PS 800ms
- 依赖: mss + Pillow + pywinauto
2026-04-05 15:47:20 +08:00

1054 lines
42 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* MCP tool schemas for the computer-use server. Mirrors
* claude-for-chrome-mcp/src/browserTools.ts in shape (plain `Tool`-shaped
* object literals, no zod).
*
* Coordinate descriptions are baked in at tool-list build time from the
* `chicago_coordinate_mode` gate. The model sees exactly ONE coordinate
* convention in the param descriptions and never learns the other exists.
* The host (`serverDef.ts`) reads the same frozen gate value for
* `scaleCoord` — both must agree or clicks land in the wrong space.
*/
import type { Tool } from "@modelcontextprotocol/sdk/types.js";
import type { CoordinateMode } from "./types.js";
// See packages/desktop/computer-use-mcp/COORDINATES.md before touching any
// model-facing coordinate text. Chrome's browserTools.ts:143 is the reference
// phrasing — "pixels from the left edge", no geometry, no number to do math with.
/**
 * Model-facing wording for the x and y coordinate parameters, keyed by
 * coordinate mode. `buildComputerUseTools` selects one entry and splices it
 * into the tool parameter descriptions.
 *
 * The `normalized_0_100` range is written with an en dash ("0.0–100.0");
 * without the separator the text reads as the single number "0.0100.0".
 */
const COORD_DESC: Record<CoordinateMode, { x: string; y: string }> = {
  pixels: {
    x: "Horizontal pixel position read directly from the most recent screenshot image, measured from the left edge. The server handles all scaling.",
    y: "Vertical pixel position read directly from the most recent screenshot image, measured from the top edge. The server handles all scaling.",
  },
  normalized_0_100: {
    x: "Horizontal position as a percentage of screen width, 0.0–100.0 (0 = left edge, 100 = right edge).",
    y: "Vertical position as a percentage of screen height, 0.0–100.0 (0 = top edge, 100 = bottom edge).",
  },
};
/**
 * Sentence spliced into the descriptions of the input tools (clicks, typing,
 * key presses, scrolling, ...) to state the frontmost-application gate.
 */
const FRONTMOST_GATE_DESC =
  "The frontmost application must be in the session allowlist at the time of this call, " +
  "or this tool returns an error and does nothing.";
/**
 * Item schema for the `actions` array in `computer_batch`, `teach_step`, and
 * `teach_batch`. All three dispatch through the same `dispatchAction` path
 * with the same validation — keep this enum in sync with `BATCHABLE_ACTIONS`
 * in toolCalls.ts.
 *
 * Only `action` is required; which of the remaining fields are meaningful
 * depends on the chosen action (see each field's description).
 */
const BATCH_ACTION_ITEM_SCHEMA = {
  type: "object",
  properties: {
    action: {
      type: "string",
      enum: [
        "key",
        "type",
        "mouse_move",
        "left_click",
        "left_click_drag",
        "right_click",
        "middle_click",
        "double_click",
        "triple_click",
        "scroll",
        "hold_key",
        "screenshot",
        "cursor_position",
        "left_mouse_down",
        "left_mouse_up",
        "wait",
      ],
      description: "The action to perform.",
    },
    coordinate: {
      type: "array",
      items: { type: "number" },
      minItems: 2,
      maxItems: 2,
      description:
        "(x, y) for click/mouse_move/scroll/left_click_drag end point.",
    },
    start_coordinate: {
      type: "array",
      items: { type: "number" },
      minItems: 2,
      maxItems: 2,
      description:
        "(x, y) drag start — left_click_drag only. Omit to drag from current cursor.",
    },
    text: {
      type: "string",
      description:
        "For type: the text. For key/hold_key: the chord string. For click/scroll: modifier keys to hold.",
    },
    scroll_direction: {
      type: "string",
      enum: ["up", "down", "left", "right"],
    },
    scroll_amount: { type: "integer", minimum: 0, maximum: 100 },
    duration: {
      type: "number",
      // Range separator restored: the text previously read "(0100)".
      description: "Seconds (0–100). For hold_key/wait.",
    },
    repeat: {
      type: "integer",
      minimum: 1,
      maximum: 100,
      description: "For key: repeat count.",
    },
  },
  required: ["action"],
};
/**
 * Build the tool list. Parameterized by capabilities and coordinate mode so
 * descriptions are honest and unambiguous (plan §1 — "Unfiltered + honest").
 *
 * `coordinateMode` MUST match what the host passes to `scaleCoord` at
 * tool-call time. Both should read the same frozen-at-load gate constant.
 *
 * `installedAppNames` — optional pre-sanitized list of app display names to
 * enumerate in the `request_access` description. The caller is responsible
 * for sanitization (length cap, character allowlist, sort, count cap) —
 * this function just splices the list into the description verbatim. Omit
 * to fall back to the generic "display names or bundle IDs" wording.
 *
 * @param caps - Host capabilities: `screenshotFiltering` selects the
 *   screenshot description, `platform` gates the Windows-only tools, and
 *   `teachMode` appends the teach-mode tools.
 * @param coordinateMode - Key into `COORD_DESC` selecting the coordinate wording.
 * @param installedAppNames - Optional app names appended to the `apps`
 *   parameter descriptions.
 * @returns The tool definitions, in a fixed order: cross-platform tools up to
 *   `open_application`, then the Windows-only tools (win32 only), then the
 *   remaining cross-platform tools, then the teach tools (teach mode only).
 */
export function buildComputerUseTools(
  caps: {
    screenshotFiltering: "native" | "none";
    platform: "darwin" | "win32" | "linux";
    /** Include request_teach_access + teach_step. Read once at server construction. */
    teachMode?: boolean;
  },
  coordinateMode: CoordinateMode,
  installedAppNames?: string[],
): Tool[] {
  const coord = COORD_DESC[coordinateMode];

  // Shared hint suffix for BOTH request_access and request_teach_access —
  // they use the same resolveRequestedApps path, so the model should get
  // the same enumeration for both.
  const installedAppsHint =
    installedAppNames && installedAppNames.length > 0
      ? ` Available applications on this machine: ${installedAppNames.join(", ")}.`
      : "";

  // `[x, y]` tuple — param shape for all click/move/scroll tools.
  // NOTE(review): only `coord.x` is interpolated into the tuple description;
  // `coord.y` is not referenced anywhere in this function — confirm intentional.
  const coordinateTuple = {
    type: "array",
    items: { type: "number" },
    minItems: 2,
    maxItems: 2,
    description: `(x, y): ${coord.x}`,
  };

  // Modifier hold during click. Shared across all 5 click variants.
  const clickModifierText = {
    type: "string",
    description:
      'Modifier keys to hold during the click (e.g. "shift", "ctrl+shift"). Supports the same syntax as the key tool.',
  };

  // The screenshot description states honestly whether the platform filters
  // non-allowlisted apps out of the captured image.
  const screenshotDesc =
    caps.screenshotFiltering === "native"
      ? "Take a screenshot of the primary display. Applications not in the session allowlist are excluded at the compositor level — only granted apps and the desktop are visible."
      : "Take a screenshot of the primary display. On this platform, screenshots are NOT filtered — all open windows are visible. Input actions targeting apps not in the session allowlist are rejected.";

  return [
    {
      name: "request_access",
      description:
        "Request user permission to control a set of applications for this session. Must be called before any other tool in this server. " +
        "The user sees a single dialog listing all requested apps and either allows the whole set or denies it. " +
        "Call this again mid-session to add more apps; previously granted apps remain granted. " +
        "Returns the granted apps, denied apps, and screenshot filtering capability.",
      inputSchema: {
        type: "object" as const,
        properties: {
          apps: {
            type: "array",
            items: { type: "string" },
            description:
              "Application display names (e.g. \"Slack\", \"Calendar\") or bundle identifiers (e.g. \"com.tinyspeck.slackmacgap\"). Display names are resolved case-insensitively against installed apps." +
              installedAppsHint,
          },
          reason: {
            type: "string",
            description:
              "One-sentence explanation shown to the user in the approval dialog. Explain the task, not the mechanism.",
          },
          clipboardRead: {
            type: "boolean",
            description:
              "Also request permission to read the user's clipboard (separate checkbox in the dialog).",
          },
          clipboardWrite: {
            type: "boolean",
            description:
              "Also request permission to write the user's clipboard. When granted, multi-line `type` calls use the clipboard fast path.",
          },
          systemKeyCombos: {
            type: "boolean",
            description:
              "Also request permission to send system-level key combos (quit app, switch app, lock screen). Without this, those specific combos are blocked.",
          },
        },
        required: ["apps", "reason"],
      },
    },
    {
      name: "screenshot",
      description:
        screenshotDesc +
        " Returns an error if the allowlist is empty. The returned image is what subsequent click coordinates are relative to.",
      inputSchema: {
        type: "object" as const,
        properties: {
          save_to_disk: {
            type: "boolean",
            description:
              "Save the image to disk so it can be attached to a message for the user. Returns the saved path in the tool result. Only set this when you intend to share the image — screenshots you're just looking at don't need saving.",
          },
        },
        required: [],
      },
    },
    {
      name: "zoom",
      description:
        "Take a higher-resolution screenshot of a specific region of the last full-screen screenshot. Use this liberally to inspect small text, button labels, or fine UI details that are hard to read in the downsampled full-screen image. " +
        "IMPORTANT: Coordinates in subsequent click calls always refer to the full-screen screenshot, never the zoomed image. This tool is read-only for inspecting detail.",
      inputSchema: {
        type: "object" as const,
        properties: {
          region: {
            type: "array",
            items: { type: "integer" },
            minItems: 4,
            maxItems: 4,
            description:
              "(x0, y0, x1, y1): Rectangle to zoom into, in the coordinate space of the most recent full-screen screenshot. x0,y0 = top-left, x1,y1 = bottom-right.",
          },
          save_to_disk: {
            type: "boolean",
            description:
              "Save the image to disk so it can be attached to a message for the user. Returns the saved path in the tool result. Only set this when you intend to share the image.",
          },
        },
        required: ["region"],
      },
    },
    {
      name: "left_click",
      description: `Left-click at the given coordinates. ${FRONTMOST_GATE_DESC}`,
      inputSchema: {
        type: "object" as const,
        properties: {
          coordinate: coordinateTuple,
          text: clickModifierText,
        },
        required: ["coordinate"],
      },
    },
    {
      name: "double_click",
      description: `Double-click at the given coordinates. Selects a word in most text editors. ${FRONTMOST_GATE_DESC}`,
      inputSchema: {
        type: "object" as const,
        properties: {
          coordinate: coordinateTuple,
          text: clickModifierText,
        },
        required: ["coordinate"],
      },
    },
    {
      name: "triple_click",
      description: `Triple-click at the given coordinates. Selects a line in most text editors. ${FRONTMOST_GATE_DESC}`,
      inputSchema: {
        type: "object" as const,
        properties: {
          coordinate: coordinateTuple,
          text: clickModifierText,
        },
        required: ["coordinate"],
      },
    },
    {
      name: "right_click",
      description: `Right-click at the given coordinates. Opens a context menu in most applications. ${FRONTMOST_GATE_DESC}`,
      inputSchema: {
        type: "object" as const,
        properties: {
          coordinate: coordinateTuple,
          text: clickModifierText,
        },
        required: ["coordinate"],
      },
    },
    {
      name: "middle_click",
      description: `Middle-click (scroll-wheel click) at the given coordinates. ${FRONTMOST_GATE_DESC}`,
      inputSchema: {
        type: "object" as const,
        properties: {
          coordinate: coordinateTuple,
          text: clickModifierText,
        },
        required: ["coordinate"],
      },
    },
    {
      name: "type",
      description: `Type text into whatever currently has keyboard focus. ${FRONTMOST_GATE_DESC} Newlines are supported. For keyboard shortcuts use \`key\` instead.`,
      inputSchema: {
        type: "object" as const,
        properties: {
          text: { type: "string", description: "Text to type." },
        },
        required: ["text"],
      },
    },
    {
      name: "key",
      description:
        `Press a key or key combination (e.g. "return", "escape", "cmd+a", "ctrl+shift+tab"). ${FRONTMOST_GATE_DESC} ` +
        "System-level combos (quit app, switch app, lock screen) require the `systemKeyCombos` grant — without it they return an error. All other combos work.",
      inputSchema: {
        type: "object" as const,
        properties: {
          text: {
            type: "string",
            description: 'Modifiers joined with "+", e.g. "cmd+shift+a".',
          },
          repeat: {
            type: "integer",
            minimum: 1,
            maximum: 100,
            description: "Number of times to repeat the key press. Default is 1.",
          },
        },
        required: ["text"],
      },
    },
    {
      name: "scroll",
      description: `Scroll at the given coordinates. ${FRONTMOST_GATE_DESC}`,
      inputSchema: {
        type: "object" as const,
        properties: {
          coordinate: coordinateTuple,
          scroll_direction: {
            type: "string",
            enum: ["up", "down", "left", "right"],
            description: "Direction to scroll.",
          },
          scroll_amount: {
            type: "integer",
            minimum: 0,
            maximum: 100,
            description: "Number of scroll ticks.",
          },
        },
        required: ["coordinate", "scroll_direction", "scroll_amount"],
      },
    },
    {
      name: "left_click_drag",
      description: `Press, move to target, and release. ${FRONTMOST_GATE_DESC}`,
      inputSchema: {
        type: "object" as const,
        properties: {
          coordinate: {
            ...coordinateTuple,
            description: `(x, y) end point: ${coord.x}`,
          },
          start_coordinate: {
            ...coordinateTuple,
            description: `(x, y) start point. If omitted, drags from the current cursor position. ${coord.x}`,
          },
        },
        required: ["coordinate"],
      },
    },
    {
      name: "mouse_move",
      description: `Move the mouse cursor without clicking. Useful for triggering hover states. ${FRONTMOST_GATE_DESC}`,
      inputSchema: {
        type: "object" as const,
        properties: {
          coordinate: coordinateTuple,
        },
        required: ["coordinate"],
      },
    },
    {
      name: "open_application",
      description:
        "Bring an application to the front, launching it if necessary. The target application must already be in the session allowlist — call request_access first.",
      inputSchema: {
        type: "object" as const,
        properties: {
          app: {
            type: "string",
            description:
              "Display name (e.g. \"Slack\") or bundle identifier (e.g. \"com.tinyspeck.slackmacgap\").",
          },
        },
        required: ["app"],
      },
    },
    // Windows-only tools keep their position between open_application and
    // switch_display; other platforms contribute nothing here.
    ...(caps.platform === "win32" ? buildWin32Tools() : []),
    {
      name: "switch_display",
      description:
        "Switch which monitor subsequent screenshots capture. Use this when the " +
        "application you need is on a different monitor than the one shown. " +
        "The screenshot tool tells you which monitor it captured and lists " +
        "other attached monitors by name — pass one of those names here. " +
        "After switching, call screenshot to see the new monitor. " +
        'Pass "auto" to return to automatic monitor selection.',
      inputSchema: {
        type: "object" as const,
        properties: {
          display: {
            type: "string",
            description:
              'Monitor name from the screenshot note (e.g. "Built-in Retina Display", ' +
              '"LG UltraFine"), or "auto" to re-enable automatic selection.',
          },
        },
        required: ["display"],
      },
    },
    {
      name: "list_granted_applications",
      description:
        "List the applications currently in the session allowlist, plus the active grant flags and coordinate mode. No side effects.",
      inputSchema: {
        type: "object" as const,
        properties: {},
        required: [],
      },
    },
    {
      name: "read_clipboard",
      description:
        "Read the current clipboard contents as text. Requires the `clipboardRead` grant.",
      inputSchema: {
        type: "object" as const,
        properties: {},
        required: [],
      },
    },
    {
      name: "write_clipboard",
      description:
        "Write text to the clipboard. Requires the `clipboardWrite` grant.",
      inputSchema: {
        type: "object" as const,
        properties: {
          text: { type: "string" },
        },
        required: ["text"],
      },
    },
    {
      name: "wait",
      description: "Wait for a specified duration.",
      inputSchema: {
        type: "object" as const,
        properties: {
          duration: {
            type: "number",
            // Range separator restored: the text previously read "(0100)".
            description: "Duration in seconds (0–100).",
          },
        },
        required: ["duration"],
      },
    },
    {
      name: "cursor_position",
      description:
        "Get the current mouse cursor position. Returns image-pixel coordinates relative to the most recent screenshot, or logical points if no screenshot has been taken.",
      inputSchema: {
        type: "object" as const,
        properties: {},
        required: [],
      },
    },
    {
      name: "hold_key",
      description:
        `Press and hold a key or key combination for the specified duration, then release. ${FRONTMOST_GATE_DESC} ` +
        "System-level combos require the `systemKeyCombos` grant.",
      inputSchema: {
        type: "object" as const,
        properties: {
          text: {
            type: "string",
            description: 'Key or chord to hold, e.g. "space", "shift+down".',
          },
          duration: {
            type: "number",
            // Range separator restored: the text previously read "(0100)".
            description: "Duration in seconds (0–100).",
          },
        },
        required: ["text", "duration"],
      },
    },
    {
      name: "left_mouse_down",
      description:
        `Press the left mouse button at the current cursor position and leave it held. ${FRONTMOST_GATE_DESC} ` +
        "Use mouse_move first to position the cursor. Call left_mouse_up to release. Errors if the button is already held.",
      inputSchema: {
        type: "object" as const,
        properties: {},
        required: [],
      },
    },
    {
      name: "left_mouse_up",
      description:
        `Release the left mouse button at the current cursor position. ${FRONTMOST_GATE_DESC} ` +
        "Pairs with left_mouse_down. Safe to call even if the button is not currently held.",
      inputSchema: {
        type: "object" as const,
        properties: {},
        required: [],
      },
    },
    {
      name: "computer_batch",
      description:
        "Execute a sequence of actions in ONE tool call. Each individual tool call requires a model→API round trip (seconds); " +
        "batching a predictable sequence eliminates all but one. Use this whenever you can predict the outcome of several actions ahead — " +
        "e.g. click a field, type into it, press Return. Actions execute sequentially and stop on the first error. " +
        `${FRONTMOST_GATE_DESC} The frontmost check runs before EACH action inside the batch — if an action opens a non-allowed app, the next action's gate fires and the batch stops there. ` +
        "Mid-batch screenshot actions are allowed for inspection but coordinates in subsequent clicks always refer to the PRE-BATCH full-screen screenshot.",
      inputSchema: {
        type: "object" as const,
        properties: {
          actions: {
            type: "array",
            minItems: 1,
            items: BATCH_ACTION_ITEM_SCHEMA,
            description:
              'List of actions. Example: [{"action":"left_click","coordinate":[100,200]},{"action":"type","text":"hello"},{"action":"key","text":"Return"}]',
          },
        },
        required: ["actions"],
      },
    },
    ...(caps.teachMode ? buildTeachTools(coord, installedAppsHint) : []),
  ];
}

/**
 * Windows-only tools, in the order they appear in the tool list:
 * window_management, click_element, type_into_element, open_terminal,
 * bind_window, activate_window, prompt_respond, status_indicator,
 * virtual_keyboard, virtual_mouse, mouse_wheel.
 *
 * Split out of `buildComputerUseTools` (same pattern as `buildTeachTools`) so
 * the main list stays readable and the literals are checked against `Tool`
 * through the return type rather than per-element assertions. None of the
 * schemas depend on the coordinate mode or the capability flags.
 */
function buildWin32Tools(): Tool[] {
  return [
    // Window management — Win32 API targeted at bound HWND, no global shortcuts.
    // Only available on Windows when a window is bound via open_application.
    {
      name: "window_management",
      description:
        "Manage the bound application window via Win32 API calls (ShowWindow, SetWindowPos, SendMessage). " +
        "All operations target the bound HWND directly — NO global shortcuts (Win+Down, Alt+F4, etc.). " +
        "The window must have been opened via open_application first. " +
        "Actions: minimize (hide to taskbar), maximize (fill screen), restore (undo min/max), " +
        "close (graceful WM_CLOSE), focus (bring to front), move_offscreen (move to -32000,-32000 for background operation). " +
        "Use move_resize to reposition or resize the window to specific coordinates.",
      inputSchema: {
        type: "object" as const,
        properties: {
          action: {
            type: "string",
            enum: ["minimize", "maximize", "restore", "close", "focus", "move_offscreen", "move_resize", "get_rect"],
            description:
              "minimize: ShowWindow(SW_MINIMIZE). " +
              "maximize: ShowWindow(SW_MAXIMIZE). " +
              "restore: ShowWindow(SW_RESTORE) — undo minimize or maximize. " +
              "close: SendMessage(WM_CLOSE) — graceful close. " +
              "focus: SetForegroundWindow + BringWindowToTop. " +
              "move_offscreen: SetWindowPos(-32000,-32000) — keeps window usable by SendMessage/PrintWindow but invisible. " +
              "move_resize: SetWindowPos to specific x,y,width,height. " +
              "get_rect: GetWindowRect — returns current position and size.",
          },
          x: { type: "integer", description: "X position for move_resize." },
          y: { type: "integer", description: "Y position for move_resize." },
          width: { type: "integer", description: "Width for move_resize." },
          height: { type: "integer", description: "Height for move_resize." },
        },
        required: ["action"],
      },
    },
    {
      name: "click_element",
      description:
        "Click a GUI element by its accessible name, role, or automationId — no pixel coordinates needed. " +
        "Uses Windows UI Automation to find the element and InvokePattern to click it. " +
        "Prefer this over left_click when the element name is visible in the accessibility snapshot. " +
        "Falls back to BoundingRect center-click if InvokePattern is not supported.",
      inputSchema: {
        type: "object" as const,
        properties: {
          name: {
            type: "string",
            description: "Accessible name of the element (e.g. \"Save\", \"File\", \"Search...\"). Case-insensitive partial match.",
          },
          role: {
            type: "string",
            description: "Control type (e.g. \"Button\", \"MenuItem\", \"Edit\", \"Link\"). Optional — narrows the search.",
          },
          automationId: {
            type: "string",
            description: "Exact automationId from the accessibility snapshot. Most precise selector.",
          },
        },
        required: [],
      },
    },
    {
      name: "type_into_element",
      description:
        "Type text into a named GUI element using Windows UI Automation ValuePattern. " +
        "Finds the element by name/role/automationId, then sets its value directly — " +
        "no need to click first or use pixel coordinates. Works on Edit, ComboBox, and other value-holding controls.",
      inputSchema: {
        type: "object" as const,
        properties: {
          name: { type: "string", description: "Accessible name of the target element." },
          role: { type: "string", description: "Control type (optional, e.g. \"Edit\")." },
          automationId: { type: "string", description: "Exact automationId." },
          text: { type: "string", description: "Text to type/set into the element." },
        },
        required: ["text"],
      },
    },
    {
      name: "open_terminal",
      description:
        "Open a new terminal window and launch an AI agent CLI. " +
        "This is a workflow tool that automates: open terminal → type startup command → press Enter → wait → verify. " +
        "Supported agents: claude (runs 'claude'), codex (runs 'codex'), gemini (runs 'gemini'), " +
        "or any custom command. After launching, the tool binds to the new terminal window " +
        "and takes a screenshot to verify the agent started successfully. " +
        "Use this when the user says: 'open Claude Code', 'start a Codex terminal', 'launch Gemini', etc.",
      inputSchema: {
        type: "object" as const,
        properties: {
          agent: {
            type: "string",
            enum: ["claude", "codex", "gemini", "custom"],
            description:
              "Which agent to launch. " +
              "claude: runs 'claude' command. " +
              "codex: runs 'codex' command. " +
              "gemini: runs 'gemini' command. " +
              "custom: runs the command specified in 'command' parameter.",
          },
          command: {
            type: "string",
            description: "Custom command to run in the terminal. Only used when agent='custom'. Example: 'python app.py'",
          },
          terminal: {
            type: "string",
            enum: ["wt", "powershell", "cmd"],
            description: "Which terminal to open. Default: 'wt' (Windows Terminal). 'powershell' for PowerShell window, 'cmd' for Command Prompt.",
          },
          working_directory: {
            type: "string",
            description: "Working directory for the terminal. If omitted, uses current directory.",
          },
        },
        required: ["agent"],
      },
    },
    {
      name: "bind_window",
      description:
        "Bind to a specific window for all subsequent operations (screenshot, click, type, etc.). " +
        "Once bound, screenshots capture only that window via PrintWindow, and all input goes through SendMessageW — " +
        "no cursor movement, no focus steal, no interference with the user's desktop. " +
        "Actions: bind (by title, hwnd, or pid), unbind (release binding), status (show current binding), list (show all visible windows). " +
        "Use 'list' first to see available windows, then 'bind' with a title or hwnd. " +
        "open_application auto-binds the launched window, but use this tool to bind to already-running windows or switch between windows.",
      inputSchema: {
        type: "object" as const,
        properties: {
          action: {
            type: "string",
            enum: ["bind", "unbind", "status", "list"],
            description:
              "bind: Bind to a window (specify title, hwnd, or pid). " +
              "unbind: Release the current binding, return to full-screen mode. " +
              "status: Show the currently bound window (hwnd, title, rect). " +
              "list: List all visible windows with hwnd, pid, and title.",
          },
          title: {
            type: "string",
            description: "Window title to search for (partial match, case-insensitive). For 'bind' action.",
          },
          hwnd: {
            type: "string",
            description: "Exact window handle from 'list' output. For 'bind' action.",
          },
          pid: {
            type: "integer",
            description: "Process ID to find window for. For 'bind' action.",
          },
        },
        required: ["action"],
      },
    },
    {
      name: "activate_window",
      description:
        "Activate the bound window: bring it to foreground, click to ensure keyboard focus, " +
        "and optionally send an initial key sequence. Use this before any input operations to guarantee " +
        "the window is ready to receive keyboard/mouse events. " +
        "Combines SetForegroundWindow + BringWindowToTop + SendMessage(WM_LBUTTONDOWN) in one call.",
      inputSchema: {
        type: "object" as const,
        properties: {
          click_x: { type: "integer", description: "X coordinate to click after activation (client-area). If omitted, clicks center of window." },
          click_y: { type: "integer", description: "Y coordinate to click after activation (client-area). If omitted, clicks center of window." },
        },
        required: [],
      },
    },
    {
      name: "prompt_respond",
      description:
        "Handle interactive CLI/terminal prompts (Yes/No, selection menus, confirmations). " +
        "Sends a sequence of key events to the bound window to navigate and confirm a prompt. " +
        "This is a convenience wrapper around bound-window keyboard input for common prompt flows. " +
        "Typical flows: " +
        "1) Yes/No prompt → send 'y' or 'n' + Enter. " +
        "2) Arrow-key selection menu → send arrow_down/arrow_up N times + Enter. " +
        "3) Text input prompt → type the response + Enter. " +
        "After responding, take a screenshot to verify the result.",
      inputSchema: {
        type: "object" as const,
        properties: {
          response_type: {
            type: "string",
            enum: ["yes", "no", "enter", "escape", "select", "type"],
            description:
              "yes: send 'y' + Enter. " +
              "no: send 'n' + Enter. " +
              "enter: send Enter only. " +
              "escape: send Escape (cancel). " +
              "select: use arrow keys to navigate to an option, then Enter. Requires 'arrow_count'. " +
              "type: type custom text then Enter. Requires 'text'.",
          },
          arrow_direction: {
            type: "string",
            enum: ["up", "down"],
            description: "Arrow key direction for 'select' type. Default: 'down'.",
          },
          arrow_count: {
            type: "integer",
            description: "Number of arrow key presses for 'select' type. Default: 1.",
            minimum: 0,
            maximum: 50,
          },
          text: {
            type: "string",
            description: "Text to type for 'type' response_type.",
          },
        },
        required: ["response_type"],
      },
    },
    {
      name: "status_indicator",
      description:
        "Control the visual status indicator overlay on the bound window. " +
        "The indicator is a small floating label at the bottom of the window that shows what Computer Use is doing. " +
        "It auto-shows during click/type/key/scroll operations, but you can also send custom messages. " +
        "Actions: show (display a custom message), hide (dismiss), status (check if active).",
      inputSchema: {
        type: "object" as const,
        properties: {
          action: {
            type: "string",
            enum: ["show", "hide", "status"],
            description: "show: display a custom message on the indicator. hide: dismiss the indicator. status: check if indicator is active.",
          },
          message: {
            type: "string",
            description: "Custom message to display (for 'show' action). Supports emoji. Auto-fades after 2 seconds.",
          },
        },
        required: ["action"],
      },
    },
    {
      name: "virtual_keyboard",
      description:
        "Send keyboard input directly to the bound window via SendMessageW — independent of the physical keyboard. " +
        "The user can keep typing on their own keyboard without interference. " +
        "Supports: single keys, key combinations (Ctrl+S, Alt+F4), text input, and hold-key operations. " +
        "All input targets the bound HWND only — no global keyboard events.",
      inputSchema: {
        type: "object" as const,
        properties: {
          action: {
            type: "string",
            enum: ["type", "combo", "press", "release", "hold"],
            description:
              "type: Send text string via WM_CHAR (Unicode, supports Chinese/emoji). " +
              "combo: Send a key combination like ctrl+s, alt+f4, ctrl+shift+a (press all, release in reverse). " +
              "press: Press a key down and hold it (pair with 'release'). " +
              "release: Release a previously pressed key. " +
              "hold: Press key(s) for a duration then release.",
          },
          text: {
            type: "string",
            description: "For 'type': the text to input. For 'combo': key combination string (e.g. 'ctrl+s', 'alt+tab', 'ctrl+shift+a'). For 'press'/'release': single key name (e.g. 'shift', 'ctrl', 'a').",
          },
          duration: {
            type: "number",
            description: "For 'hold': seconds to hold the key(s) before releasing. Default: 1.",
          },
          repeat: {
            type: "integer",
            description: "Number of times to repeat the action. Default: 1.",
            minimum: 1,
            maximum: 100,
          },
        },
        required: ["action", "text"],
      },
    },
    {
      name: "virtual_mouse",
      description:
        "Control a virtual mouse on the bound window via SendMessageW — independent of the physical mouse. " +
        "The user's real cursor stays free. All operations target the bound HWND only.",
      inputSchema: {
        type: "object" as const,
        properties: {
          action: {
            type: "string",
            enum: ["click", "double_click", "right_click", "move", "drag", "down", "up"],
            description:
              "click: left-click at coordinate. " +
              "double_click: double left-click. " +
              "right_click: right-click. " +
              "move: move virtual cursor (visual only, no click). " +
              "drag: press at start, move to end, release. Requires coordinate (end) and start_coordinate. " +
              "down: press left button at coordinate (hold). " +
              "up: release left button at coordinate.",
          },
          coordinate: {
            type: "array",
            items: { type: "number" },
            minItems: 2,
            maxItems: 2,
            description: "(x, y) client-area coordinate on the bound window.",
          },
          start_coordinate: {
            type: "array",
            items: { type: "number" },
            minItems: 2,
            maxItems: 2,
            description: "(x, y) start point for drag. If omitted, drags from current virtual cursor position.",
          },
        },
        required: ["action", "coordinate"],
      },
    },
    {
      name: "mouse_wheel",
      description:
        "Scroll inside the bound window using mouse wheel (WM_MOUSEWHEEL / WM_MOUSEHWHEEL). " +
        "Unlike the generic 'scroll' tool which uses WM_VSCROLL (only works on scrollbar controls), " +
        "mouse_wheel simulates the physical mouse wheel and works on Excel spreadsheets, web pages, " +
        "code editors, PDF viewers, and any modern UI. " +
        "Specify the click point within the window where the scroll should occur — " +
        "this determines which panel/pane/element receives the scroll.",
      inputSchema: {
        type: "object" as const,
        properties: {
          coordinate: {
            type: "array",
            items: { type: "number" },
            minItems: 2,
            maxItems: 2,
            description: "(x, y) client-area coordinate where the scroll should occur. Determines which element receives the scroll.",
          },
          delta: {
            type: "integer",
            description: "Scroll amount in 'clicks'. Positive = scroll up, negative = scroll down. Each click = 3 lines typically. Use -3 to -5 for page-like scrolling.",
          },
          direction: {
            type: "string",
            enum: ["vertical", "horizontal"],
            description: "Scroll direction. Default: 'vertical'. Use 'horizontal' for side-scrolling (e.g. wide Excel sheets, timeline views).",
          },
        },
        required: ["coordinate", "delta"],
      },
    },
  ];
}
/**
 * Teach-mode tool definitions: `request_teach_access`, `teach_step`, and
 * `teach_batch`, returned in that order.
 *
 * Kept as a separate factory so the caller can splice the result into its
 * tool list with one spread expression.
 *
 * @param coord - Coordinate wording for the active coordinate mode; `coord.x`
 *   is spliced into the `anchor` description so it matches the click tools.
 * @param installedAppsHint - Suffix appended to the `apps` description so
 *   `request_teach_access` enumerates the same apps as `request_access`
 *   (same resolution path → same hint). May be the empty string.
 */
function buildTeachTools(
  coord: { x: string; y: string },
  installedAppsHint: string,
): Tool[] {
  // Property map used twice: as the top-level properties of teach_step and
  // as the per-item properties of teach_batch.steps. It interpolates `coord`,
  // which is why it is built here rather than at module scope.
  const stepFields = {
    explanation: {
      type: "string",
      description:
        "Tooltip body text. Explain what the user is looking at and why it matters. " +
        "This is the ONLY place the user sees your words — be complete but concise.",
    },
    next_preview: {
      type: "string",
      description:
        "One line describing exactly what will happen when the user clicks Next. " +
        'Example: "Next: I\'ll click Create Bucket and type the name." ' +
        "Shown below the explanation in a smaller font.",
    },
    anchor: {
      type: "array",
      items: { type: "number" },
      minItems: 2,
      maxItems: 2,
      description:
        `(x, y) — where the tooltip arrow points. ${coord.x} ` +
        "Omit to center the tooltip with no arrow (for general-context steps).",
    },
    actions: {
      type: "array",
      // No minItems: an empty list is a valid "read this, click Next" step.
      items: BATCH_ACTION_ITEM_SCHEMA,
      description:
        "Actions to execute when the user clicks Next. Same item schema as computer_batch.actions. " +
        "Empty array is valid for purely explanatory steps. Actions run sequentially and stop on first error.",
    },
  } as const;

  // Permission request that starts a guided tour.
  const requestTeachAccessTool: Tool = {
    name: "request_teach_access",
    description:
      "Request permission to guide the user through a task step-by-step with on-screen tooltips. " +
      "Use this INSTEAD OF request_access when the user wants to LEARN how to do something " +
      '(phrases like "teach me", "walk me through", "show me how", "help me learn"). ' +
      "On approval the main Claude window hides and a fullscreen tooltip overlay appears. " +
      "You then call teach_step repeatedly; each call shows one tooltip and waits for the user to click Next. " +
      "Same app-allowlist semantics as request_access, but no clipboard/system-key flags. " +
      "Teach mode ends automatically when your turn ends.",
    inputSchema: {
      type: "object" as const,
      properties: {
        apps: {
          type: "array",
          items: { type: "string" },
          description:
            'Application display names (e.g. "Slack", "Calendar") or bundle identifiers. Resolved case-insensitively against installed apps.' +
            installedAppsHint,
        },
        reason: {
          type: "string",
          description:
            'What you will be teaching. Shown in the approval dialog as "Claude wants to guide you through {reason}". Keep it short and task-focused.',
        },
      },
      required: ["apps", "reason"],
    },
  };

  // A single tooltip plus the actions to run when the user clicks Next.
  const teachStepTool: Tool = {
    name: "teach_step",
    description:
      "Show one guided-tour tooltip and wait for the user to click Next. On Next, execute the actions, " +
      "take a fresh screenshot, and return both — you do NOT need a separate screenshot call between steps. " +
      "The returned image shows the state after your actions ran; anchor the next teach_step against it. " +
      "IMPORTANT — the user only sees the tooltip during teach mode. Put ALL narration in `explanation`. " +
      "Text you emit outside teach_step calls is NOT visible until teach mode ends. " +
      "Pack as many actions as possible into each step's `actions` array — the user waits through " +
      "the whole round trip between clicks, so one step that fills a form beats five steps that fill one field each. " +
      "Returns {exited:true} if the user clicks Exit — do not call teach_step again after that. " +
      "Take an initial screenshot before your FIRST teach_step to anchor it.",
    inputSchema: {
      type: "object" as const,
      properties: stepFields,
      required: ["explanation", "next_preview", "actions"],
    },
  };

  // Several tooltips queued in one call; each item has the teach_step shape.
  const teachBatchTool: Tool = {
    name: "teach_batch",
    description:
      "Queue multiple teach steps in one tool call. Parallels computer_batch: " +
      "N steps → one model↔API round trip instead of N. Each step still shows a tooltip " +
      "and waits for the user's Next click, but YOU aren't waiting for a round trip between steps. " +
      "You can call teach_batch multiple times in one tour — treat each batch as one predictable " +
      "SEGMENT (typically: all the steps on one page). The returned screenshot shows the state " +
      "after the batch's final actions; anchor the NEXT teach_batch against it. " +
      "WITHIN a batch, all anchors and click coordinates refer to the PRE-BATCH screenshot " +
      "(same invariant as computer_batch) — for steps 2+ in a batch, either omit anchor " +
      "(centered tooltip) or target elements you know won't have moved. " +
      "Good pattern: batch 5 tooltips on page A (last step navigates) → read returned screenshot → " +
      "batch 3 tooltips on page B → done. " +
      "Returns {exited:true, stepsCompleted:N} if the user clicks Exit — do NOT call again after that; " +
      "{stepsCompleted, stepFailed, ...} if an action errors mid-batch; " +
      "otherwise {stepsCompleted, results:[...]} plus a final screenshot. " +
      "Fall back to individual teach_step calls when you need to react to each intermediate screenshot.",
    inputSchema: {
      type: "object" as const,
      properties: {
        steps: {
          type: "array",
          minItems: 1,
          items: {
            type: "object",
            properties: stepFields,
            required: ["explanation", "next_preview", "actions"],
          },
          description:
            "Ordered steps. Validated upfront — a typo in step 5 errors before any tooltip shows.",
        },
      },
      required: ["steps"],
    },
  };

  return [requestTeachAccessTool, teachStepTool, teachBatchTool];
}