feat: Computer Use — Windows 跨平台支持 + GUI 无障碍增强 + Python Bridge

三平台 Computer Use (macOS + Windows + Linux),Windows 专项增强。

- MCP server: toolCalls/tools/executor/mcpServer 等 12 文件完整实现
- 平台抽象层: platforms/{win32,darwin,linux}.ts
- 跨平台 executor: executorCrossPlatform.ts
- CHICAGO_MCP + VOICE_MODE feature flags 启用

- windowMessage.ts: SendMessageW (WM_CHAR Unicode + 剪贴板粘贴)
- windowBorder.ts: 4 叠加窗口边框 (30fps 跟踪)
- uiAutomation.ts: UI Automation 元素树/点击/写值
- accessibilitySnapshot.ts: 无障碍快照 → 模型感知 GUI
- bridge.py + bridgeClient.ts: Python 长驻进程 (替代 per-call PS)

- window_management: min/max/restore/close/focus (Win32 API)
- click_element / type_into_element: 按名称操作 (无需坐标)
- 截图自动附带 Accessibility Snapshot

- 17 种方法, stdin/stdout JSON 通信
- 窗口枚举 1.5ms vs PS 500ms, 截图 360ms vs PS 800ms
- 依赖: mss + Pillow + pywinauto
This commit is contained in:
unraid
2026-04-05 15:27:50 +08:00
parent 7a2ade0a02
commit c17edcb12e
36 changed files with 8297 additions and 351 deletions

View File

@@ -118,7 +118,7 @@ const BATCH_ACTION_ITEM_SCHEMA = {
export function buildComputerUseTools(
caps: {
screenshotFiltering: "native" | "none";
platform: "darwin" | "win32";
platform: "darwin" | "win32" | "linux";
/** Include request_teach_access + teach_step. Read once at server construction. */
teachMode?: boolean;
},
@@ -414,6 +414,353 @@ export function buildComputerUseTools(
},
},
// Window management — Win32 API targeted at bound HWND, no global shortcuts.
// Only available on Windows when a window is bound via open_application.
...(caps.platform === 'win32' ? [{
name: "window_management",
description:
"Manage the bound application window via Win32 API calls (ShowWindow, SetWindowPos, SendMessage). " +
"All operations target the bound HWND directly — NO global shortcuts (Win+Down, Alt+F4, etc.). " +
"The window must have been opened via open_application first. " +
"Actions: minimize (hide to taskbar), maximize (fill screen), restore (undo min/max), " +
"close (graceful WM_CLOSE), focus (bring to front), move_offscreen (move to -32000,-32000 for background operation). " +
"Use move_resize to reposition or resize the window to specific coordinates.",
inputSchema: {
type: "object" as const,
properties: {
action: {
type: "string",
enum: ["minimize", "maximize", "restore", "close", "focus", "move_offscreen", "move_resize", "get_rect"],
description:
"minimize: ShowWindow(SW_MINIMIZE). " +
"maximize: ShowWindow(SW_MAXIMIZE). " +
"restore: ShowWindow(SW_RESTORE) — undo minimize or maximize. " +
"close: SendMessage(WM_CLOSE) — graceful close. " +
"focus: SetForegroundWindow + BringWindowToTop. " +
"move_offscreen: SetWindowPos(-32000,-32000) — keeps window usable by SendMessage/PrintWindow but invisible. " +
"move_resize: SetWindowPos to specific x,y,width,height. " +
"get_rect: GetWindowRect — returns current position and size.",
},
x: { type: "integer", description: "X position for move_resize." },
y: { type: "integer", description: "Y position for move_resize." },
width: { type: "integer", description: "Width for move_resize." },
height: { type: "integer", description: "Height for move_resize." },
},
required: ["action"],
},
} as Tool,
{
name: "click_element",
description:
"Click a GUI element by its accessible name, role, or automationId — no pixel coordinates needed. " +
"Uses Windows UI Automation to find the element and InvokePattern to click it. " +
"Prefer this over left_click when the element name is visible in the accessibility snapshot. " +
"Falls back to BoundingRect center-click if InvokePattern is not supported.",
inputSchema: {
type: "object" as const,
properties: {
name: {
type: "string",
description: "Accessible name of the element (e.g. \"Save\", \"File\", \"Search...\"). Case-insensitive partial match.",
},
role: {
type: "string",
description: "Control type (e.g. \"Button\", \"MenuItem\", \"Edit\", \"Link\"). Optional — narrows the search.",
},
automationId: {
type: "string",
description: "Exact automationId from the accessibility snapshot. Most precise selector.",
},
},
required: [],
},
} as Tool,
{
name: "type_into_element",
description:
"Type text into a named GUI element using Windows UI Automation ValuePattern. " +
"Finds the element by name/role/automationId, then sets its value directly — " +
"no need to click first or use pixel coordinates. Works on Edit, ComboBox, and other value-holding controls.",
inputSchema: {
type: "object" as const,
properties: {
name: { type: "string", description: "Accessible name of the target element." },
role: { type: "string", description: "Control type (optional, e.g. \"Edit\")." },
automationId: { type: "string", description: "Exact automationId." },
text: { type: "string", description: "Text to type/set into the element." },
},
required: ["text"],
},
} as Tool,
{
name: "open_terminal",
description:
"Open a new terminal window and launch an AI agent CLI. " +
"This is a workflow tool that automates: open terminal → type startup command → press Enter → wait → verify. " +
"Supported agents: claude (runs 'claude'), codex (runs 'codex'), gemini (runs 'gemini'), " +
"or any custom command. After launching, the tool binds to the new terminal window " +
"and takes a screenshot to verify the agent started successfully. " +
"Use this when the user says: 'open Claude Code', 'start a Codex terminal', 'launch Gemini', etc.",
inputSchema: {
type: "object" as const,
properties: {
agent: {
type: "string",
enum: ["claude", "codex", "gemini", "custom"],
description:
"Which agent to launch. " +
"claude: runs 'claude' command. " +
"codex: runs 'codex' command. " +
"gemini: runs 'gemini' command. " +
"custom: runs the command specified in 'command' parameter.",
},
command: {
type: "string",
description: "Custom command to run in the terminal. Only used when agent='custom'. Example: 'python app.py'",
},
terminal: {
type: "string",
enum: ["wt", "powershell", "cmd"],
description: "Which terminal to open. Default: 'wt' (Windows Terminal). 'powershell' for PowerShell window, 'cmd' for Command Prompt.",
},
working_directory: {
type: "string",
description: "Working directory for the terminal. If omitted, uses current directory.",
},
},
required: ["agent"],
},
} as Tool,
{
name: "bind_window",
description:
"Bind to a specific window for all subsequent operations (screenshot, click, type, etc.). " +
"Once bound, screenshots capture only that window via PrintWindow, and all input goes through SendMessageW — " +
"no cursor movement, no focus steal, no interference with the user's desktop. " +
"Actions: bind (by title, hwnd, or pid), unbind (release binding), status (show current binding), list (show all visible windows). " +
"Use 'list' first to see available windows, then 'bind' with a title or hwnd. " +
"open_application auto-binds the launched window, but use this tool to bind to already-running windows or switch between windows.",
inputSchema: {
type: "object" as const,
properties: {
action: {
type: "string",
enum: ["bind", "unbind", "status", "list"],
description:
"bind: Bind to a window (specify title, hwnd, or pid). " +
"unbind: Release the current binding, return to full-screen mode. " +
"status: Show the currently bound window (hwnd, title, rect). " +
"list: List all visible windows with hwnd, pid, and title.",
},
title: {
type: "string",
description: "Window title to search for (partial match, case-insensitive). For 'bind' action.",
},
hwnd: {
type: "string",
description: "Exact window handle from 'list' output. For 'bind' action.",
},
pid: {
type: "integer",
description: "Process ID to find window for. For 'bind' action.",
},
},
required: ["action"],
},
} as Tool,
{
name: "activate_window",
description:
"Activate the bound window: bring it to foreground, click to ensure keyboard focus, " +
"and optionally send an initial key sequence. Use this before any input operations to guarantee " +
"the window is ready to receive keyboard/mouse events. " +
"Combines SetForegroundWindow + BringWindowToTop + SendMessage(WM_LBUTTONDOWN) in one call.",
inputSchema: {
type: "object" as const,
properties: {
click_x: { type: "integer", description: "X coordinate to click after activation (client-area). If omitted, clicks center of window." },
click_y: { type: "integer", description: "Y coordinate to click after activation (client-area). If omitted, clicks center of window." },
},
required: [],
},
} as Tool,
{
name: "prompt_respond",
description:
"Handle interactive CLI/terminal prompts (Yes/No, selection menus, confirmations). " +
"Sends a sequence of key events to the bound window to navigate and confirm a prompt. " +
"This is a convenience wrapper around bound-window keyboard input for common prompt flows. " +
"Typical flows: " +
"1) Yes/No prompt → send 'y' or 'n' + Enter. " +
"2) Arrow-key selection menu → send arrow_down/arrow_up N times + Enter. " +
"3) Text input prompt → type the response + Enter. " +
"After responding, take a screenshot to verify the result.",
inputSchema: {
type: "object" as const,
properties: {
response_type: {
type: "string",
enum: ["yes", "no", "enter", "escape", "select", "type"],
description:
"yes: send 'y' + Enter. " +
"no: send 'n' + Enter. " +
"enter: send Enter only. " +
"escape: send Escape (cancel). " +
"select: use arrow keys to navigate to an option, then Enter. Requires 'arrow_count'. " +
"type: type custom text then Enter. Requires 'text'.",
},
arrow_direction: {
type: "string",
enum: ["up", "down"],
description: "Arrow key direction for 'select' type. Default: 'down'.",
},
arrow_count: {
type: "integer",
description: "Number of arrow key presses for 'select' type. Default: 1.",
minimum: 0,
maximum: 50,
},
text: {
type: "string",
description: "Text to type for 'type' response_type.",
},
},
required: ["response_type"],
},
} as Tool,
{
name: "status_indicator",
description:
"Control the visual status indicator overlay on the bound window. " +
"The indicator is a small floating label at the bottom of the window that shows what Computer Use is doing. " +
"It auto-shows during click/type/key/scroll operations, but you can also send custom messages. " +
"Actions: show (display a custom message), hide (dismiss), status (check if active).",
inputSchema: {
type: "object" as const,
properties: {
action: {
type: "string",
enum: ["show", "hide", "status"],
description: "show: display a custom message on the indicator. hide: dismiss the indicator. status: check if indicator is active.",
},
message: {
type: "string",
description: "Custom message to display (for 'show' action). Supports emoji. Auto-fades after 2 seconds.",
},
},
required: ["action"],
},
} as Tool,
{
name: "virtual_keyboard",
description:
"Send keyboard input directly to the bound window via SendMessageW — independent of the physical keyboard. " +
"The user can keep typing on their own keyboard without interference. " +
"Supports: single keys, key combinations (Ctrl+S, Alt+F4), text input, and hold-key operations. " +
"All input targets the bound HWND only — no global keyboard events.",
inputSchema: {
type: "object" as const,
properties: {
action: {
type: "string",
enum: ["type", "combo", "press", "release", "hold"],
description:
"type: Send text string via WM_CHAR (Unicode, supports Chinese/emoji). " +
"combo: Send a key combination like ctrl+s, alt+f4, ctrl+shift+a (press all, release in reverse). " +
"press: Press a key down and hold it (pair with 'release'). " +
"release: Release a previously pressed key. " +
"hold: Press key(s) for a duration then release.",
},
text: {
type: "string",
description: "For 'type': the text to input. For 'combo': key combination string (e.g. 'ctrl+s', 'alt+tab', 'ctrl+shift+a'). For 'press'/'release': single key name (e.g. 'shift', 'ctrl', 'a').",
},
duration: {
type: "number",
description: "For 'hold': seconds to hold the key(s) before releasing. Default: 1.",
},
repeat: {
type: "integer",
description: "Number of times to repeat the action. Default: 1.",
minimum: 1,
maximum: 100,
},
},
required: ["action", "text"],
},
} as Tool,
{
name: "virtual_mouse",
description:
"Control a virtual mouse on the bound window via SendMessageW — independent of the physical mouse. " +
"The user's real cursor stays free. All operations target the bound HWND only.",
inputSchema: {
type: "object" as const,
properties: {
action: {
type: "string",
enum: ["click", "double_click", "right_click", "move", "drag", "down", "up"],
description:
"click: left-click at coordinate. " +
"double_click: double left-click. " +
"right_click: right-click. " +
"move: move virtual cursor (visual only, no click). " +
"drag: press at start, move to end, release. Requires coordinate (end) and start_coordinate. " +
"down: press left button at coordinate (hold). " +
"up: release left button at coordinate.",
},
coordinate: {
type: "array",
items: { type: "number" },
minItems: 2,
maxItems: 2,
description: "(x, y) client-area coordinate on the bound window.",
},
start_coordinate: {
type: "array",
items: { type: "number" },
minItems: 2,
maxItems: 2,
description: "(x, y) start point for drag. If omitted, drags from current virtual cursor position.",
},
},
required: ["action", "coordinate"],
},
} as Tool,
{
name: "mouse_wheel",
description:
"Scroll inside the bound window using mouse wheel (WM_MOUSEWHEEL / WM_MOUSEHWHEEL). " +
"Unlike the generic 'scroll' tool which uses WM_VSCROLL (only works on scrollbar controls), " +
"mouse_wheel simulates the physical mouse wheel and works on Excel spreadsheets, web pages, " +
"code editors, PDF viewers, and any modern UI. " +
"Specify the click point within the window where the scroll should occur — " +
"this determines which panel/pane/element receives the scroll.",
inputSchema: {
type: "object" as const,
properties: {
coordinate: {
type: "array",
items: { type: "number" },
minItems: 2,
maxItems: 2,
description: "(x, y) client-area coordinate where the scroll should occur. Determines which element receives the scroll.",
},
delta: {
type: "integer",
description: "Scroll amount in 'clicks'. Positive = scroll up, negative = scroll down. Each click = 3 lines typically. Use -3 to -5 for page-like scrolling.",
},
direction: {
type: "string",
enum: ["vertical", "horizontal"],
description: "Scroll direction. Default: 'vertical'. Use 'horizontal' for side-scrolling (e.g. wide Excel sheets, timeline views).",
},
},
required: ["coordinate", "delta"],
},
} as Tool,
] : []),
{
name: "switch_display",
description: