feat: Computer Use — Windows 跨平台支持 + GUI 无障碍增强 + Python Bridge

三平台 Computer Use (macOS + Windows + Linux),Windows 专项增强。

- MCP server: toolCalls/tools/executor/mcpServer 等 12 文件完整实现
- 平台抽象层: platforms/{win32,darwin,linux}.ts
- 跨平台 executor: executorCrossPlatform.ts
- CHICAGO_MCP + VOICE_MODE feature flags 启用

- windowMessage.ts: SendMessageW (WM_CHAR Unicode + 剪贴板粘贴)
- windowBorder.ts: 4 叠加窗口边框 (30fps 跟踪)
- uiAutomation.ts: UI Automation 元素树/点击/写值
- accessibilitySnapshot.ts: 无障碍快照 → 模型感知 GUI
- bridge.py + bridgeClient.ts: Python 长驻进程 (替代 per-call PS)

- window_management: min/max/restore/close/focus (Win32 API)
- click_element / type_into_element: 按名称操作 (无需坐标)
- 截图自动附带 Accessibility Snapshot

- 17 种方法, stdin/stdout JSON 通信
- 窗口枚举 1.5ms vs PS 500ms, 截图 360ms vs PS 800ms
- 依赖: mss + Pillow + pywinauto
This commit is contained in:
unraid
2026-04-05 15:27:50 +08:00
parent 7a2ade0a02
commit c17edcb12e
36 changed files with 8297 additions and 351 deletions

View File

@@ -1,33 +1,30 @@
/**
* @ant/computer-use-input — cross-platform keyboard & mouse simulation
* @ant/computer-use-input — macOS keyboard & mouse simulation (enigo)
*
* Platform backends:
* - darwin: AppleScript/JXA via CoreGraphics events
* - win32: PowerShell via Win32 P/Invoke (SetCursorPos, SendInput, keybd_event)
*
* Add new platforms by creating backends/<platform>.ts implementing InputBackend.
* This package wraps the macOS-only native enigo .node module.
* For Windows/Linux, use src/utils/computerUse/platforms/ instead.
*/
import type { FrontmostAppInfo, InputBackend } from './types.js'
export interface FrontmostAppInfo {
bundleId: string
appName: string
}
export type { FrontmostAppInfo, InputBackend } from './types.js'
// ---------------------------------------------------------------------------
// Platform dispatch
// ---------------------------------------------------------------------------
export interface InputBackend {
moveMouse(x: number, y: number, animated: boolean): Promise<void>
key(key: string, action: 'press' | 'release'): Promise<void>
keys(parts: string[]): Promise<void>
mouseLocation(): Promise<{ x: number; y: number }>
mouseButton(button: 'left' | 'right' | 'middle', action: 'click' | 'press' | 'release', count?: number): Promise<void>
mouseScroll(amount: number, direction: 'vertical' | 'horizontal'): Promise<void>
typeText(text: string): Promise<void>
getFrontmostAppInfo(): FrontmostAppInfo | null
}
function loadBackend(): InputBackend | null {
if (process.platform !== 'darwin') return null
try {
switch (process.platform) {
case 'darwin':
return require('./backends/darwin.js') as InputBackend
case 'win32':
return require('./backends/win32.js') as InputBackend
case 'linux':
return require('./backends/linux.js') as InputBackend
default:
return null
}
return require('./backends/darwin.js') as InputBackend
} catch {
return null
}
@@ -35,30 +32,16 @@ function loadBackend(): InputBackend | null {
const backend = loadBackend()
// ---------------------------------------------------------------------------
// Unsupported stub (throws on call — guards via isSupported check)
// ---------------------------------------------------------------------------
function unsupported(): never {
throw new Error(`computer-use-input is not supported on ${process.platform}`)
}
// ---------------------------------------------------------------------------
// Public API — matches the original export surface
// ---------------------------------------------------------------------------
export const isSupported = backend !== null
export const moveMouse = backend?.moveMouse ?? unsupported
export const key = backend?.key ?? unsupported
export const keys = backend?.keys ?? unsupported
export const mouseLocation = backend?.mouseLocation ?? unsupported
export const mouseButton = backend?.mouseButton ?? unsupported
export const mouseScroll = backend?.mouseScroll ?? unsupported
export const typeText = backend?.typeText ?? unsupported
export const moveMouse = backend?.moveMouse
export const key = backend?.key
export const keys = backend?.keys
export const mouseLocation = backend?.mouseLocation
export const mouseButton = backend?.mouseButton
export const mouseScroll = backend?.mouseScroll
export const typeText = backend?.typeText
export const getFrontmostAppInfo = backend?.getFrontmostAppInfo ?? (() => null)
// Legacy class type — used by inputLoader.ts for type narrowing
export class ComputerUseInputAPI {
declare moveMouse: InputBackend['moveMouse']
declare key: InputBackend['key']
@@ -71,8 +54,5 @@ export class ComputerUseInputAPI {
declare isSupported: true
}
interface ComputerUseInputUnsupported {
isSupported: false
}
interface ComputerUseInputUnsupported { isSupported: false }
export type ComputerUseInput = ComputerUseInputAPI | ComputerUseInputUnsupported

View File

@@ -16,6 +16,8 @@ export interface ScreenshotResult {
originX: number
originY: number
displayId?: number
/** Accessibility snapshot — structured GUI element tree as model-friendly text. Windows only. */
accessibilityText?: string
}
export interface FrontmostApp {
@@ -108,4 +110,59 @@ export interface ComputerExecutor {
getAppIcon(path: string): Promise<string | undefined>
listRunningApps(): Promise<RunningApp[]>
openApp(bundleId: string): Promise<void>
// ── Window management (Windows only, optional) ──────────────────────────
/** Perform a window management action on the bound window. Win32 API only — no global shortcuts. */
manageWindow?(action: string, opts?: { x?: number; y?: number; width?: number; height?: number }): Promise<boolean>
/** Get the current window rect of the bound window */
getWindowRect?(): Promise<{ x: number; y: number; width: number; height: number } | null>
// ── Terminal launch, window binding, virtual input & UIA element actions (Windows only, optional) ──
/** Open terminal and launch an agent CLI */
openTerminal?(opts: {
agent: 'claude' | 'codex' | 'gemini' | 'custom'
command?: string
terminal?: 'wt' | 'powershell' | 'cmd'
workingDirectory?: string
}): Promise<{ hwnd: string; title: string; launched: boolean } | null>
/** Bind to a window by hwnd/title/pid. Returns bound window info or null. */
bindToWindow?(query: { hwnd?: string; title?: string; pid?: number }): Promise<{ hwnd: string; title: string; pid: number } | null>
/** Unbind from the current window */
unbindFromWindow?(): Promise<void>
/** Cheap binding-state check for window-targeted routing decisions. */
hasBoundWindow?(): Promise<boolean>
/** Get current binding status */
getBindingStatus?(): Promise<{ bound: boolean; hwnd?: string; title?: string; pid?: number; rect?: { x: number; y: number; width: number; height: number } } | null>
/** List all visible windows */
listVisibleWindows?(): Promise<Array<{ hwnd: string; pid: number; title: string }>>
/** Control the status indicator overlay */
statusIndicator?(action: 'show' | 'hide' | 'status', message?: string): Promise<{ active: boolean; message?: string }>
/** Virtual keyboard — send keys/text/combos to bound window only */
virtualKeyboard?(opts: {
action: 'type' | 'combo' | 'press' | 'release' | 'hold'
text: string
duration?: number
repeat?: number
}): Promise<boolean>
/** Virtual mouse — click/move/drag on bound window only */
virtualMouse?(opts: {
action: 'click' | 'double_click' | 'right_click' | 'move' | 'drag' | 'down' | 'up'
x: number; y: number
startX?: number; startY?: number
}): Promise<boolean>
/** Mouse wheel scroll at client coordinates (works on Excel, browsers, modern UI) */
mouseWheel?(x: number, y: number, delta: number, horizontal?: boolean): Promise<boolean>
/** Activate the bound window (foreground + click to focus) */
activateWindow?(clickX?: number, clickY?: number): Promise<boolean>
/** Handle a terminal prompt (yes/no/select/type + enter) */
respondToPrompt?(opts: {
responseType: 'yes' | 'no' | 'enter' | 'escape' | 'select' | 'type'
arrowDirection?: 'up' | 'down'
arrowCount?: number
text?: string
}): Promise<boolean>
/** Click an element by name/role/automationId via UI Automation */
clickElement?(query: { name?: string; role?: string; automationId?: string }): Promise<boolean>
/** Type text into an element by name/role/automationId via UI Automation ValuePattern */
typeIntoElement?(query: { name?: string; role?: string; automationId?: string }, text: string): Promise<boolean>
}

View File

@@ -434,6 +434,15 @@ async function runInputActionGates(
}
}
// Windows/Linux: operations go through SendMessage (HWND-bound) or platform
// abstraction, not global input to the foreground. The frontmost gate is a
// macOS safety net for global CGEvent input — on other platforms, skip it
// when the platform's screenshotFiltering is 'none' (no per-app filtering,
// meaning no hide/defocus, meaning frontmost is meaningless).
if (adapter.executor.capabilities.screenshotFiltering === 'none') {
return null; // pass — non-macOS platform, frontmost irrelevant
}
// Frontmost gate. Check FRESH on every call.
const frontmost = await adapter.executor.getFrontmostApp();
@@ -561,6 +570,13 @@ async function runHitTestGate(
y: number,
actionKind: CuActionKind,
): Promise<CuCallToolResult | null> {
// Non-macOS: HWND-bound mode — clicks go to the bound window via
// SendMessage with window-relative coordinates. Hit-test against the
// real screen is meaningless.
if (adapter.executor.capabilities.screenshotFiltering === 'none') {
return null;
}
const target = await adapter.executor.appUnderPoint(x, y);
if (!target) return null; // desktop / nothing under point / platform no-op
@@ -796,12 +812,12 @@ function resolveRequestedApps(
if (!resolved) {
resolved = byLowerDisplayName.get(requested.toLowerCase());
}
// Fuzzy fallback: match requested name as substring of display name
// e.g. "Chrome" matches "Google Chrome", "Code" matches "Visual Studio Code"
// Windows fuzzy matching: strip .exe suffix, try substring match
if (!resolved) {
const lower = requested.toLowerCase();
for (const app of installed) {
if (app.displayName.toLowerCase().includes(lower)) {
const clean = requested.toLowerCase().replace(/\.exe$/, '').trim();
// Try: "chrome" matches "Google Chrome", "notepad" matches "Notepad"
for (const [name, app] of byLowerDisplayName) {
if (name.includes(clean) || clean.includes(name)) {
resolved = app;
break;
}
@@ -2137,6 +2153,8 @@ async function handleScreenshot(
content: [
...(monitorNote ? [{ type: "text" as const, text: monitorNote }] : []),
...(hiddenNote ? [{ type: "text" as const, text: hiddenNote }] : []),
// Accessibility snapshot: structured GUI element tree (Windows bound-window mode)
...(shot.accessibilityText ? [{ type: "text" as const, text: `GUI elements in this window:\n${shot.accessibilityText}` }] : []),
{
type: "image",
data: shot.base64,
@@ -2204,6 +2222,8 @@ async function handleScreenshot(
content: [
...(monitorNote ? [{ type: "text" as const, text: monitorNote }] : []),
...(hiddenNote ? [{ type: "text" as const, text: hiddenNote }] : []),
// Accessibility snapshot: structured GUI element tree (Windows bound-window mode)
...(shot.accessibilityText ? [{ type: "text" as const, text: `GUI elements in this window:\n${shot.accessibilityText}` }] : []),
{
type: "image",
data: shot.base64,
@@ -2812,6 +2832,443 @@ async function handleOpenApplication(
return okText(`Opened "${app}".`);
}
/**
 * Handle the `virtual_mouse` tool: forward a mouse action to the executor's
 * optional `virtualMouse` capability.
 *
 * Validation order (each failure returns a `bad_args` result unless noted):
 *   1. executor lacks `virtualMouse` → `feature_unavailable`
 *   2. `action` must be a string
 *   3. `coordinate` must be an array whose first two entries are finite numbers
 *   4. `action` must be one of the supported mouse actions
 *   5. `start_coordinate`, when given as an array, must also hold two finite numbers
 *
 * @param adapter Host adapter whose executor performs the action.
 * @param args Raw tool arguments (`action`, `coordinate`, optional `start_coordinate`).
 * @returns A text result describing the action, or an error result.
 */
async function handleVirtualMouse(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.virtualMouse) {
    return errorResult("virtual_mouse is only available on Windows with a bound window.", "feature_unavailable");
  }
  const action = requireString(args, "action");
  if (action instanceof Error) return errorResult(action.message, "bad_args");

  const isFiniteNumber = (value: unknown): value is number =>
    typeof value === "number" && Number.isFinite(value);

  // Tool arguments are untrusted: reject non-numeric coordinates here rather
  // than handing strings/NaN to the executor.
  const coord: unknown = args.coordinate;
  if (
    !Array.isArray(coord) ||
    coord.length < 2 ||
    !isFiniteNumber(coord[0]) ||
    !isFiniteNumber(coord[1])
  ) {
    return errorResult("coordinate [x, y] is required.", "bad_args");
  }
  const x: number = coord[0];
  const y: number = coord[1];

  const validActions = ["click", "double_click", "right_click", "move", "drag", "down", "up"] as const;
  type VirtualMouseAction = (typeof validActions)[number];
  const isValidAction = (candidate: string): candidate is VirtualMouseAction =>
    (validActions as readonly string[]).includes(candidate);
  if (!isValidAction(action)) {
    return errorResult(`Invalid action "${action}". Valid: ${validActions.join(", ")}`, "bad_args");
  }

  // A non-array start_coordinate is ignored; an array must be well-formed.
  let startX: number | undefined;
  let startY: number | undefined;
  const rawStart: unknown = args.start_coordinate;
  if (Array.isArray(rawStart)) {
    if (rawStart.length < 2 || !isFiniteNumber(rawStart[0]) || !isFiniteNumber(rawStart[1])) {
      return errorResult("start_coordinate must be [x, y] when provided.", "bad_args");
    }
    startX = rawStart[0];
    startY = rawStart[1];
  }

  const ok = await adapter.executor.virtualMouse({ action, x, y, startX, startY });
  if (!ok) {
    return errorResult("No window is currently bound.", "bad_args");
  }

  const dragOrigin = startX !== undefined && startY !== undefined ? `(${startX},${startY})` : "current";
  const desc: Record<VirtualMouseAction, string> = {
    click: `Click at (${x},${y})`,
    double_click: `Double-click at (${x},${y})`,
    right_click: `Right-click at (${x},${y})`,
    move: `Moved to (${x},${y})`,
    drag: `Dragged ${dragOrigin} → (${x},${y})`,
    down: `Button down at (${x},${y})`,
    up: `Button up at (${x},${y})`,
  };
  return okText(desc[action]);
}
/**
 * Handle the `virtual_keyboard` tool: forward a keyboard action
 * (type / combo / press / release / hold) to the executor's optional
 * `virtualKeyboard` capability and report what was sent.
 *
 * @param adapter Host adapter whose executor performs the action.
 * @param args Raw tool arguments (`action`, `text`, optional numeric `duration` / `repeat`).
 * @returns A text summary of the keystrokes, or an error result.
 */
async function handleVirtualKeyboard(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.virtualKeyboard) {
    return errorResult("virtual_keyboard is only available on Windows with a bound window.", "feature_unavailable");
  }

  const action = requireString(args, "action");
  if (action instanceof Error) return errorResult(action.message, "bad_args");

  const text = requireString(args, "text");
  if (text instanceof Error) return errorResult(text.message, "bad_args");

  const allowed = ["type", "combo", "press", "release", "hold"];
  if (!allowed.includes(action)) {
    return errorResult(`Invalid action "${action}". Valid: ${allowed.join(", ")}`, "bad_args");
  }

  // Non-numeric duration/repeat are dropped rather than rejected.
  const numberOrUndefined = (value: unknown): number | undefined =>
    typeof value === "number" ? value : undefined;
  const duration = numberOrUndefined(args.duration);
  const repeat = numberOrUndefined(args.repeat);

  const sent = await adapter.executor.virtualKeyboard({
    action: action as any,
    text,
    duration,
    repeat,
  });
  if (!sent) {
    return errorResult("No window is currently bound. Use open_application or bind_window first.", "bad_args");
  }

  let summary: string;
  switch (action) {
    case "type": {
      // Long text is truncated to 40 characters in the summary.
      const preview = text.length > 40 ? text.slice(0, 40) + "..." : text;
      summary = `Typed "${preview}"`;
      break;
    }
    case "combo":
      summary = `Sent ${text}`;
      break;
    case "press":
      summary = `Pressed ${text} (holding)`;
      break;
    case "release":
      summary = `Released ${text}`;
      break;
    default:
      // Only "hold" remains after the validation above.
      summary = `Held ${text} for ${duration ?? 1}s`;
      break;
  }

  const repeatSuffix = repeat && repeat > 1 ? ` ×${repeat}` : "";
  return okText(`${summary}${repeatSuffix}`);
}
/**
 * Handle the `status_indicator` tool: show, hide, or query the status
 * indicator overlay through the executor's optional `statusIndicator`
 * capability.
 *
 * @param adapter Host adapter whose executor controls the indicator.
 * @param args Raw tool arguments (`action`, plus `message` which is required for `show`).
 * @returns A text result describing the indicator state, or an error result.
 */
async function handleStatusIndicator(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.statusIndicator) {
    return errorResult("status_indicator is only available on Windows.", "feature_unavailable");
  }

  const action = requireString(args, "action");
  if (action instanceof Error) return errorResult(action.message, "bad_args");
  if (action !== "show" && action !== "hide" && action !== "status") {
    return errorResult(`Invalid action "${action}". Valid: show, hide, status.`, "bad_args");
  }

  let message: string | undefined;
  if (typeof args.message === "string") {
    message = args.message;
  }
  // An empty-string message counts as missing for "show".
  if (action === "show" && !message) {
    return errorResult("'show' requires a message parameter.", "bad_args");
  }

  const result = await adapter.executor.statusIndicator(action, message);

  switch (action) {
    case "status":
      return okText(
        result.active
          ? "Indicator is active on the bound window."
          : "Indicator is not active (no window bound).",
      );
    case "show":
      return okText(`Indicator showing: "${message}"`);
    default:
      return okText("Indicator hidden.");
  }
}
/**
 * Handle the `mouse_wheel` tool: scroll at a coordinate through the
 * executor's optional `mouseWheel` capability.
 *
 * `coordinate` must hold two finite numbers and `delta` must be a finite
 * number; anything else is rejected as `bad_args` instead of being passed to
 * the executor. `direction === "horizontal"` selects a horizontal scroll,
 * every other value means vertical.
 *
 * @param adapter Host adapter whose executor performs the scroll.
 * @param args Raw tool arguments (`coordinate`, `delta`, optional `direction`).
 * @returns A text result describing the scroll, or an error result.
 */
async function handleMouseWheel(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.mouseWheel) {
    return errorResult("mouse_wheel is only available on Windows with a bound window.", "feature_unavailable");
  }

  const isFiniteNumber = (value: unknown): value is number =>
    typeof value === "number" && Number.isFinite(value);

  const coord: unknown = args.coordinate;
  if (
    !Array.isArray(coord) ||
    coord.length < 2 ||
    !isFiniteNumber(coord[0]) ||
    !isFiniteNumber(coord[1])
  ) {
    return errorResult("coordinate must be [x, y] array.", "bad_args");
  }
  const x: number = coord[0];
  const y: number = coord[1];

  // NaN/Infinity pass a bare typeof check, so require a finite number.
  const delta: unknown = args.delta;
  if (!isFiniteNumber(delta)) {
    return errorResult("delta is required (positive=up, negative=down).", "bad_args");
  }

  const horizontal = args.direction === "horizontal";
  const ok = await adapter.executor.mouseWheel(x, y, delta, horizontal);
  if (!ok) {
    return errorResult("No window is currently bound. Use open_application or bind_window first.", "bad_args");
  }

  // Report a direction that matches the axis. NOTE(review): "right" for a
  // positive horizontal delta assumes the executor follows the Win32
  // WM_MOUSEHWHEEL sign convention — confirm against the platform backend.
  let towards: string;
  if (horizontal) {
    towards = delta > 0 ? "right" : "left";
  } else {
    towards = delta > 0 ? "up" : "down";
  }
  return okText(
    `Mouse wheel: ${horizontal ? "horizontal" : "vertical"} scroll ${towards} ${Math.abs(delta)} click(s) at (${x},${y}).`,
  );
}
/**
 * Handle the `activate_window` tool: ask the executor's optional
 * `activateWindow` capability to activate the bound window, optionally
 * passing a click position.
 *
 * @param adapter Host adapter whose executor performs the activation.
 * @param args Raw tool arguments (optional numeric `click_x` / `click_y`).
 * @returns A confirmation text result, or an error result.
 */
async function handleActivateWindow(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.activateWindow) {
    return errorResult("activate_window is only available on Windows with a bound window.", "feature_unavailable");
  }

  // Non-numeric click positions are treated as absent.
  const optionalNumber = (value: unknown): number | undefined =>
    typeof value === "number" ? value : undefined;

  const activated = await adapter.executor.activateWindow(
    optionalNumber(args.click_x),
    optionalNumber(args.click_y),
  );
  if (activated) {
    return okText("Window activated and focused. Ready for input.");
  }
  return errorResult("No window is currently bound. Use open_application or bind_window first.", "bad_args");
}
/**
 * Handle the `prompt_respond` tool: answer a terminal prompt through the
 * executor's optional `respondToPrompt` capability.
 *
 * Validation (each failure returns `bad_args` unless noted):
 *   - executor lacks `respondToPrompt` → `feature_unavailable`
 *   - `response_type` must be one of yes / no / enter / escape / select / type
 *   - `select` requires `arrow_count` as a non-negative integer
 *   - `type` requires a string `text`
 *   - `arrow_direction`, when given as a string, must be "up" or "down"
 *
 * @param adapter Host adapter whose executor sends the response.
 * @param args Raw tool arguments (`response_type`, optional `arrow_direction`,
 *   `arrow_count`, `text`).
 * @returns A text result describing what was sent, or an error result.
 */
async function handlePromptRespond(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.respondToPrompt) {
    return errorResult("prompt_respond is only available on Windows with a bound window.", "feature_unavailable");
  }
  const responseType = requireString(args, "response_type");
  if (responseType instanceof Error) return errorResult(responseType.message, "bad_args");

  const validTypes = ["yes", "no", "enter", "escape", "select", "type"] as const;
  type PromptResponseType = (typeof validTypes)[number];
  const isValidType = (candidate: string): candidate is PromptResponseType =>
    (validTypes as readonly string[]).includes(candidate);
  if (!isValidType(responseType)) {
    return errorResult(`Invalid response_type "${responseType}". Valid: ${validTypes.join(", ")}`, "bad_args");
  }

  const arrowCount = typeof args.arrow_count === "number" ? args.arrow_count : undefined;
  const text = typeof args.text === "string" ? args.text : undefined;

  // A key-press count must be a whole, non-negative number (rejects NaN, 1.5, -2).
  if (responseType === "select" && (arrowCount === undefined || !Number.isInteger(arrowCount) || arrowCount < 0)) {
    return errorResult("'select' requires arrow_count parameter (non-negative integer).", "bad_args");
  }
  if (responseType === "type" && text === undefined) {
    return errorResult("'type' requires text parameter.", "bad_args");
  }

  // Validate the direction instead of casting arbitrary strings through.
  // A non-string arrow_direction is ignored.
  let arrowDirection: "up" | "down" | undefined;
  if (typeof args.arrow_direction === "string") {
    if (args.arrow_direction !== "up" && args.arrow_direction !== "down") {
      return errorResult(`Invalid arrow_direction "${args.arrow_direction}". Valid: up, down.`, "bad_args");
    }
    arrowDirection = args.arrow_direction;
  }

  const ok = await adapter.executor.respondToPrompt({
    responseType,
    arrowDirection,
    arrowCount,
    text,
  });
  if (!ok) {
    return errorResult("No window is currently bound. Use open_application or bind_window first.", "bad_args");
  }

  const descriptions: Record<PromptResponseType, string> = {
    yes: "Sent 'y' + Enter.",
    no: "Sent 'n' + Enter.",
    enter: "Sent Enter.",
    escape: "Sent Escape.",
    select: `Navigated ${arrowDirection ?? "down"} ${arrowCount ?? 1} time(s) + Enter.`,
    type: `Typed "${text}" + Enter.`,
  };
  // Each description already ends with a period; do not append another one.
  return okText(`Prompt responded: ${descriptions[responseType]} Take a screenshot to verify.`);
}
/**
 * Handle the `open_terminal` tool: open a terminal and launch an agent CLI
 * through the executor's optional `openTerminal` capability.
 *
 * Validation (each failure returns `bad_args` unless noted):
 *   - executor lacks `openTerminal` → `feature_unavailable`
 *   - `agent` must be claude / codex / gemini / custom
 *   - `agent === "custom"` requires a non-blank string `command`
 *   - `terminal`, when given as a string, must be wt / powershell / cmd
 *
 * @param adapter Host adapter whose executor opens the terminal.
 * @param args Raw tool arguments (`agent`, optional `command`, `terminal`,
 *   `working_directory`).
 * @returns A text result describing the opened terminal, or an error result
 *   (`launch_failed` when the executor returns no window).
 */
async function handleOpenTerminal(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.openTerminal) {
    return errorResult("open_terminal is only available on Windows.", "feature_unavailable");
  }
  const agent = requireString(args, "agent");
  if (agent instanceof Error) return errorResult(agent.message, "bad_args");
  if (agent !== "claude" && agent !== "codex" && agent !== "gemini" && agent !== "custom") {
    return errorResult(`Invalid agent "${agent}". Valid: claude, codex, gemini, custom.`, "bad_args");
  }

  const command = typeof args.command === "string" ? args.command : undefined;
  // A blank command would open a terminal with nothing to run.
  if (agent === "custom" && (command === undefined || command.trim() === "")) {
    return errorResult("agent='custom' requires 'command' parameter.", "bad_args");
  }

  // Validate the terminal kind instead of casting arbitrary strings through.
  // A non-string terminal is ignored.
  let terminal: "wt" | "powershell" | "cmd" | undefined;
  if (typeof args.terminal === "string") {
    if (args.terminal !== "wt" && args.terminal !== "powershell" && args.terminal !== "cmd") {
      return errorResult(`Invalid terminal "${args.terminal}". Valid: wt, powershell, cmd.`, "bad_args");
    }
    terminal = args.terminal;
  }

  const result = await adapter.executor.openTerminal({
    agent,
    command,
    terminal,
    workingDirectory: typeof args.working_directory === "string" ? args.working_directory : undefined,
  });
  if (!result) {
    return errorResult(
      "Failed to open terminal. Windows Terminal (wt.exe) may not be installed.",
      "launch_failed",
    );
  }
  if (!result.launched) {
    return okText(
      `Terminal opened (hwnd=${result.hwnd}, "${result.title}") but no command was sent. Window is now bound.`,
    );
  }

  let launchedName: string;
  switch (agent) {
    case "claude":
      launchedName = "Claude Code";
      break;
    case "codex":
      launchedName = "Codex";
      break;
    case "gemini":
      launchedName = "Gemini";
      break;
    default:
      // agent === "custom": command was validated as a non-blank string above.
      launchedName = command ?? agent;
      break;
  }
  return okText(
    `Terminal opened and ${launchedName} launched.\n` +
      `Window: hwnd=${result.hwnd} "${result.title}"\n` +
      `Command: '${agent === "custom" ? command : agent}' + Enter\n` +
      `Status: bound to this terminal. Take a screenshot to verify the agent started.`,
  );
}
/**
 * Handle the `bind_window` tool. Supported actions:
 *   - `list`   — enumerate visible windows
 *   - `status` — report the current binding
 *   - `bind`   — bind to a window matched by title, hwnd, and/or pid
 *   - `unbind` — release the current binding
 *
 * Each action relies on an optional executor capability and returns
 * `feature_unavailable` when that capability is missing.
 *
 * @param adapter Host adapter whose executor manages window binding.
 * @param args Raw tool arguments (`action`, plus `title` / `hwnd` / `pid` for `bind`).
 * @returns A text result for the action, or an error result.
 */
async function handleBindWindow(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  const action = requireString(args, "action");
  if (action instanceof Error) return errorResult(action.message, "bad_args");

  const unavailable = () =>
    errorResult("bind_window is only available on Windows.", "feature_unavailable");

  if (action === "list") {
    if (!adapter.executor.listVisibleWindows) return unavailable();
    const windows = await adapter.executor.listVisibleWindows();
    if (windows.length === 0) {
      return okText("No visible windows found.");
    }
    const listing = windows
      .map((win) => `hwnd=${win.hwnd} pid=${win.pid} "${win.title}"`)
      .join("\n");
    return okText(`Visible windows (${windows.length}):\n${listing}`);
  }

  if (action === "status") {
    if (!adapter.executor.getBindingStatus) return unavailable();
    const status = await adapter.executor.getBindingStatus();
    if (!status || !status.bound) {
      return okText("No window is currently bound. Use bind_window(action='list') to see available windows, then bind_window(action='bind', title='...') to bind.");
    }
    // Optional fields are appended only when present (truthy).
    const parts = [`Bound to: hwnd=${status.hwnd}`];
    if (status.title) parts.push(`"${status.title}"`);
    if (status.pid) parts.push(`pid=${status.pid}`);
    if (status.rect) {
      const { x, y, width, height } = status.rect;
      parts.push(`rect=(${x},${y} ${width}x${height})`);
    }
    return okText(parts.join(" "));
  }

  if (action === "bind") {
    if (!adapter.executor.bindToWindow) return unavailable();
    const title = typeof args.title === "string" ? args.title : undefined;
    const hwnd = typeof args.hwnd === "string" ? args.hwnd : undefined;
    const pid = typeof args.pid === "number" ? args.pid : undefined;
    if (!title && !hwnd && !pid) {
      return errorResult("Specify at least one of: title, hwnd, or pid.", "bad_args");
    }
    const bound = await adapter.executor.bindToWindow({ hwnd, title, pid });
    if (bound) {
      return okText(`Bound to window: hwnd=${bound.hwnd} pid=${bound.pid} "${bound.title}". All subsequent screenshot/click/type operations target this window.`);
    }
    // Echo back only the criteria the caller actually supplied.
    const criteria: string[] = [];
    if (title) criteria.push(`title="${title}"`);
    if (hwnd) criteria.push(`hwnd=${hwnd}`);
    if (pid) criteria.push(`pid=${pid}`);
    return errorResult(
      `No window found matching: ${criteria.join(", ")}. Use bind_window(action='list') to see available windows.`,
      "element_not_found",
    );
  }

  if (action === "unbind") {
    if (!adapter.executor.unbindFromWindow) return unavailable();
    await adapter.executor.unbindFromWindow();
    return okText("Window binding released. Operations now target the full screen.");
  }

  return errorResult(`Unknown bind_window action "${action}". Valid: list, bind, unbind, status.`, "bad_args");
}
/**
 * Handle the `click_element` tool: click an element identified by name,
 * role, and/or automationId through the executor's optional `clickElement`
 * capability.
 *
 * @param adapter Host adapter whose executor performs the click.
 * @param args Raw tool arguments (`name`, `role`, `automationId`; at least one required).
 * @returns A confirmation text result, or an error result
 *   (`element_not_found` when the executor reports failure).
 */
async function handleClickElement(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.clickElement) {
    return errorResult(
      "click_element is only available on Windows with a bound window.",
      "feature_unavailable",
    );
  }

  // Non-string selector values are treated as absent.
  const readString = (key: string): string | undefined => {
    const value = args[key];
    return typeof value === "string" ? value : undefined;
  };
  const name = readString("name");
  const role = readString("role");
  const automationId = readString("automationId");

  if (!name && !role && !automationId) {
    return errorResult("At least one of name, role, or automationId is required.", "bad_args");
  }

  const clicked = await adapter.executor.clickElement({ name, role, automationId });
  if (clicked) {
    const label: string[] = [];
    if (name) label.push(`"${name}"`);
    if (role) label.push(role);
    if (automationId) label.push(automationId);
    return okText(`Clicked element: ${label.join(" ")}`);
  }

  const criteria: string[] = [];
  if (name) criteria.push(`name="${name}"`);
  if (role) criteria.push(`role=${role}`);
  if (automationId) criteria.push(`id=${automationId}`);
  return errorResult(
    `Element not found: ${criteria.join(", ")}. Take a screenshot to see current GUI elements.`,
    "element_not_found",
  );
}
/**
 * Handle the `type_into_element` tool: type text into an element identified
 * by name, role, and/or automationId through the executor's optional
 * `typeIntoElement` capability.
 *
 * Like `click_element`, at least one selector is required; an empty query
 * is rejected as `bad_args` instead of being sent to the executor.
 *
 * @param adapter Host adapter whose executor performs the typing.
 * @param args Raw tool arguments (`text`, plus `name` / `role` / `automationId`).
 * @returns A confirmation text result, or an error result
 *   (`element_not_found` when the executor reports failure).
 */
async function handleTypeIntoElement(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.typeIntoElement) {
    return errorResult(
      "type_into_element is only available on Windows with a bound window.",
      "feature_unavailable",
    );
  }
  const text = requireString(args, "text");
  if (text instanceof Error) return errorResult(text.message, "bad_args");

  const name = typeof args.name === "string" ? args.name : undefined;
  const role = typeof args.role === "string" ? args.role : undefined;
  const automationId = typeof args.automationId === "string" ? args.automationId : undefined;
  // Without any selector the executor has nothing to match and the failure
  // message would list no criteria — reject up front.
  if (!name && !role && !automationId) {
    return errorResult("At least one of name, role, or automationId is required.", "bad_args");
  }

  const ok = await adapter.executor.typeIntoElement({ name, role, automationId }, text);
  if (!ok) {
    return errorResult(
      `Could not type into element: ${[name && `name="${name}"`, role && `role=${role}`, automationId && `id=${automationId}`].filter(Boolean).join(", ")}. The element was not found or doesn't support text input.`,
      "element_not_found",
    );
  }
  return okText(`Typed ${text.length} chars into: ${[name && `"${name}"`, role, automationId].filter(Boolean).join(" ")}`);
}
/**
 * Handle the `window_management` tool: minimize / maximize / restore /
 * close / focus / move_offscreen / move_resize / get_rect on the bound
 * window, through the executor's optional `manageWindow` and
 * `getWindowRect` capabilities.
 *
 * The action name is validated before the capability check, so an unknown
 * action yields `bad_args` even where the feature is unavailable.
 *
 * @param adapter Host adapter whose executor manages the window.
 * @param args Raw tool arguments (`action`, plus `x` / `y` / `width` / `height` for `move_resize`).
 * @returns A text result describing the outcome, or an error result.
 */
async function handleWindowManagement(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  const action = requireString(args, "action");
  if (action instanceof Error) return errorResult(action.message, "bad_args");

  const knownActions = [
    "minimize", "maximize", "restore", "close", "focus", "move_offscreen", "move_resize", "get_rect",
  ];
  if (!knownActions.includes(action)) {
    return errorResult(
      `Unknown window_management action "${action}". Valid: ${knownActions.join(", ")}`,
      "bad_args",
    );
  }

  if (!adapter.executor.manageWindow) {
    return errorResult(
      "window_management is only available on Windows with a bound window.",
      "feature_unavailable",
    );
  }

  const notBound = () =>
    errorResult("No window is currently bound. Call open_application first.", "bad_args");
  const optionalNumber = (value: unknown): number | undefined =>
    typeof value === "number" ? value : undefined;

  // get_rect only reads the window's current position and size.
  if (action === "get_rect") {
    if (!adapter.executor.getWindowRect) {
      return errorResult("getWindowRect not available.", "feature_unavailable");
    }
    const rect = await adapter.executor.getWindowRect();
    if (!rect) return notBound();
    return okText(
      `Window rect: x=${rect.x}, y=${rect.y}, width=${rect.width}, height=${rect.height}`,
    );
  }

  // move_resize needs x and y; width and height are optional.
  if (action === "move_resize") {
    const x = optionalNumber(args.x);
    const y = optionalNumber(args.y);
    if (x === undefined || y === undefined) {
      return errorResult("move_resize requires x and y parameters.", "bad_args");
    }
    const width = optionalNumber(args.width);
    const height = optionalNumber(args.height);
    const moved = await adapter.executor.manageWindow(action, { x, y, width, height });
    if (!moved) return notBound();
    if (width && height) {
      return okText(`Moved window to (${x}, ${y}) and resized to ${width}×${height}.`);
    }
    return okText(`Moved window to (${x}, ${y}).`);
  }

  // Remaining actions take no options.
  const done = await adapter.executor.manageWindow(action);
  if (!done) return notBound();

  switch (action) {
    case "minimize":
      return okText("Window minimized (ShowWindow SW_MINIMIZE).");
    case "maximize":
      return okText("Window maximized (ShowWindow SW_MAXIMIZE).");
    case "restore":
      return okText("Window restored (ShowWindow SW_RESTORE).");
    case "close":
      return okText("Window closed (SendMessage WM_CLOSE). The window binding has been released.");
    case "focus":
      return okText("Window brought to front (SetForegroundWindow).");
    case "move_offscreen":
      return okText("Window moved offscreen (-32000,-32000). Still usable via SendMessage/PrintWindow.");
    default:
      return okText(`Action "${action}" completed.`);
  }
}
async function handleSwitchDisplay(
adapter: ComputerUseHostAdapter,
args: Record<string, unknown>,
@@ -3383,6 +3840,64 @@ async function dispatchAction(
overrides: ComputerUseOverrides,
subGates: CuSubGates,
): Promise<CuCallToolResult> {
// ── Bound-window auto-routing ──────────────────────────────────────
// When a window is bound (Win32), route generic input tools to
// virtual_mouse / virtual_keyboard / mouse_wheel automatically. The model
// doesn't need to know which tools to use — binding handles it.
// Routing only applies when the executor reports a bound window AND exposes
// both virtual input capabilities; otherwise fall through to standard dispatch.
const hasBoundWindow =
  (await adapter.executor.hasBoundWindow?.()) === true &&
  adapter.executor.virtualMouse &&
  adapter.executor.virtualKeyboard;
if (hasBoundWindow) {
  const coord = Array.isArray(a.coordinate) ? a.coordinate as number[] : undefined;
  // A `break` below means the required argument was missing: the call falls
  // through to the standard dispatch.
  switch (name) {
    case "left_click":
      if (coord) return handleVirtualMouse(adapter, { action: "click", coordinate: coord });
      break;
    case "double_click":
      if (coord) return handleVirtualMouse(adapter, { action: "double_click", coordinate: coord });
      break;
    case "right_click":
      if (coord) return handleVirtualMouse(adapter, { action: "right_click", coordinate: coord });
      break;
    case "mouse_move":
      if (coord) return handleVirtualMouse(adapter, { action: "move", coordinate: coord });
      break;
    case "left_click_drag":
      if (coord) return handleVirtualMouse(adapter, {
        action: "drag", coordinate: coord,
        start_coordinate: Array.isArray(a.start_coordinate) ? a.start_coordinate : undefined,
      });
      break;
    case "left_mouse_down":
      if (coord) return handleVirtualMouse(adapter, { action: "down", coordinate: coord });
      break;
    case "left_mouse_up":
      if (coord) return handleVirtualMouse(adapter, { action: "up", coordinate: coord });
      break;
    case "type":
      if (typeof a.text === "string") return handleVirtualKeyboard(adapter, { action: "type", text: a.text });
      break;
    case "key":
      if (typeof a.text === "string") return handleVirtualKeyboard(adapter, { action: "combo", text: a.text, repeat: a.repeat });
      break;
    case "hold_key":
      if (typeof a.text === "string") return handleVirtualKeyboard(adapter, {
        action: "hold", text: a.text,
        duration: typeof a.duration === "number" ? a.duration : 1,
      });
      break;
    case "scroll": {
      if (coord) {
        // Default to 3 wheel clicks when scroll_amount is missing or not a number.
        const amount = typeof a.scroll_amount === "number" ? a.scroll_amount : 3;
        const horizontal = a.scroll_direction === "left" || a.scroll_direction === "right";
        // "up" and "right" are positive; "down", "left", and any other value are
        // negative. Previously "left" and "right" both produced a negative delta,
        // so horizontal scrolling could only go one way.
        // NOTE(review): positive = right assumes the executor follows the Win32
        // WM_MOUSEHWHEEL sign convention — confirm against the platform backend.
        const positive = a.scroll_direction === "up" || a.scroll_direction === "right";
        return handleMouseWheel(adapter, {
          coordinate: coord,
          delta: positive ? amount : -amount,
          direction: horizontal ? "horizontal" : "vertical",
        });
      }
      break;
    }
    // screenshot, zoom, wait, cursor_position — not rerouted, pass through
  }
}
// ── Standard dispatch (unbound or tools not rerouted above) ────────
switch (name) {
case "screenshot":
return handleScreenshot(adapter, overrides, subGates);
@@ -3434,6 +3949,39 @@ async function dispatchAction(
case "open_application":
return handleOpenApplication(adapter, a, overrides);
case "window_management":
return handleWindowManagement(adapter, a);
case "click_element":
return handleClickElement(adapter, a);
case "type_into_element":
return handleTypeIntoElement(adapter, a);
case "open_terminal":
return handleOpenTerminal(adapter, a);
case "bind_window":
return handleBindWindow(adapter, a);
case "virtual_mouse":
return handleVirtualMouse(adapter, a);
case "virtual_keyboard":
return handleVirtualKeyboard(adapter, a);
case "status_indicator":
return handleStatusIndicator(adapter, a);
case "mouse_wheel":
return handleMouseWheel(adapter, a);
case "activate_window":
return handleActivateWindow(adapter, a);
case "prompt_respond":
return handlePromptRespond(adapter, a);
case "switch_display":
return handleSwitchDisplay(adapter, a, overrides);

View File

@@ -118,7 +118,7 @@ const BATCH_ACTION_ITEM_SCHEMA = {
export function buildComputerUseTools(
caps: {
screenshotFiltering: "native" | "none";
platform: "darwin" | "win32";
platform: "darwin" | "win32" | "linux";
/** Include request_teach_access + teach_step. Read once at server construction. */
teachMode?: boolean;
},
@@ -414,6 +414,353 @@ export function buildComputerUseTools(
},
},
// Window management — Win32 API calls aimed at the bound HWND; no global shortcuts.
// Registered only when caps.platform === 'win32'. This spread also opens the
// array of Windows-only tools that follows.
...(caps.platform === 'win32' ? [{
  name: "window_management",
  // The description names both ways of obtaining a binding (open_application
  // auto-bind and bind_window) and lists every action present in the enum.
  description:
    "Manage the bound application window via Win32 API calls (ShowWindow, SetWindowPos, SendMessage). " +
    "All operations target the bound HWND directly — NO global shortcuts (Win+Down, Alt+F4, etc.). " +
    "A window must be bound first: either open it via open_application (auto-binds) or bind an already-running window with bind_window. " +
    "Actions: minimize (hide to taskbar), maximize (fill screen), restore (undo min/max), " +
    "close (graceful WM_CLOSE), focus (bring to front), move_offscreen (move to -32000,-32000 for background operation), " +
    "get_rect (read the current position and size). " +
    "Use move_resize to reposition or resize the window to specific coordinates.",
  inputSchema: {
    type: "object" as const,
    properties: {
      action: {
        type: "string",
        enum: ["minimize", "maximize", "restore", "close", "focus", "move_offscreen", "move_resize", "get_rect"],
        description:
          "minimize: ShowWindow(SW_MINIMIZE). " +
          "maximize: ShowWindow(SW_MAXIMIZE). " +
          "restore: ShowWindow(SW_RESTORE) — undo minimize or maximize. " +
          "close: SendMessage(WM_CLOSE) — graceful close. " +
          "focus: SetForegroundWindow + BringWindowToTop. " +
          "move_offscreen: SetWindowPos(-32000,-32000) — keeps window usable by SendMessage/PrintWindow but invisible. " +
          "move_resize: SetWindowPos to specific x,y,width,height. " +
          "get_rect: GetWindowRect — returns current position and size.",
      },
      // Geometry fields are documented for move_resize only; none are schema-required.
      x: { type: "integer", description: "X position for move_resize." },
      y: { type: "integer", description: "Y position for move_resize." },
      width: { type: "integer", description: "Width for move_resize." },
      height: { type: "integer", description: "Height for move_resize." },
    },
    required: ["action"],
  },
} as Tool,
// click_element — Windows-only tool that clicks an element located by
// accessibility selectors instead of pixel coordinates.
{
  name: "click_element",
  // The schema cannot express "at least one selector" (`required` is empty), so
  // the description states it explicitly for the model.
  description:
    "Click a GUI element by its accessible name, role, or automationId — no pixel coordinates needed. " +
    "Provide at least one of name, role, or automationId to identify the element. " +
    "Uses Windows UI Automation to find the element and InvokePattern to click it. " +
    "Prefer this over left_click when the element name is visible in the accessibility snapshot. " +
    "Falls back to BoundingRect center-click if InvokePattern is not supported.",
  inputSchema: {
    type: "object" as const,
    properties: {
      name: {
        type: "string",
        description: "Accessible name of the element (e.g. \"Save\", \"File\", \"Search...\"). Case-insensitive partial match.",
      },
      role: {
        type: "string",
        description: "Control type (e.g. \"Button\", \"MenuItem\", \"Edit\", \"Link\"). Optional — narrows the search.",
      },
      automationId: {
        type: "string",
        description: "Exact automationId from the accessibility snapshot. Most precise selector.",
      },
    },
    required: [],
  },
} as Tool,
// type_into_element — Windows-only tool that sets an element's value, locating
// the element by accessibility selectors.
{
  name: "type_into_element",
  // Only `text` is schema-required; the description tells the model that a
  // selector must accompany it, since the schema itself does not enforce one.
  description:
    "Type text into a named GUI element using Windows UI Automation ValuePattern. " +
    "Finds the element by name/role/automationId, then sets its value directly — " +
    "no need to click first or use pixel coordinates. Works on Edit, ComboBox, and other value-holding controls. " +
    "Provide at least one of name, role, or automationId to identify the target element.",
  inputSchema: {
    type: "object" as const,
    properties: {
      name: { type: "string", description: "Accessible name of the target element." },
      role: { type: "string", description: "Control type (optional, e.g. \"Edit\")." },
      automationId: { type: "string", description: "Exact automationId." },
      text: { type: "string", description: "Text to type/set into the element." },
    },
    required: ["text"],
  },
} as Tool,
// open_terminal — Windows-only workflow tool that launches an agent CLI (or a
// custom command) in a new terminal window. Only `agent` is schema-required.
{
name: "open_terminal",
description:
"Open a new terminal window and launch an AI agent CLI. " +
"This is a workflow tool that automates: open terminal → type startup command → press Enter → wait → verify. " +
"Supported agents: claude (runs 'claude'), codex (runs 'codex'), gemini (runs 'gemini'), " +
"or any custom command. After launching, the tool binds to the new terminal window " +
"and takes a screenshot to verify the agent started successfully. " +
"Use this when the user says: 'open Claude Code', 'start a Codex terminal', 'launch Gemini', etc.",
inputSchema: {
type: "object" as const,
properties: {
// Selects which CLI is started; "custom" defers to the `command` field below.
agent: {
type: "string",
enum: ["claude", "codex", "gemini", "custom"],
description:
"Which agent to launch. " +
"claude: runs 'claude' command. " +
"codex: runs 'codex' command. " +
"gemini: runs 'gemini' command. " +
"custom: runs the command specified in 'command' parameter.",
},
// NOTE(review): not listed in `required` even though agent='custom' depends on
// it — presumably the handler validates this; confirm.
command: {
type: "string",
description: "Custom command to run in the terminal. Only used when agent='custom'. Example: 'python app.py'",
},
// Terminal host to open; documented default is 'wt'.
terminal: {
type: "string",
enum: ["wt", "powershell", "cmd"],
description: "Which terminal to open. Default: 'wt' (Windows Terminal). 'powershell' for PowerShell window, 'cmd' for Command Prompt.",
},
working_directory: {
type: "string",
description: "Working directory for the terminal. If omitted, uses current directory.",
},
},
required: ["agent"],
},
} as Tool,
// bind_window — Windows-only tool for choosing which window later operations
// target. Only `action` is schema-required; title/hwnd/pid are alternative
// selectors documented for the 'bind' action.
{
name: "bind_window",
description:
"Bind to a specific window for all subsequent operations (screenshot, click, type, etc.). " +
"Once bound, screenshots capture only that window via PrintWindow, and all input goes through SendMessageW — " +
"no cursor movement, no focus steal, no interference with the user's desktop. " +
"Actions: bind (by title, hwnd, or pid), unbind (release binding), status (show current binding), list (show all visible windows). " +
"Use 'list' first to see available windows, then 'bind' with a title or hwnd. " +
"open_application auto-binds the launched window, but use this tool to bind to already-running windows or switch between windows.",
inputSchema: {
type: "object" as const,
properties: {
action: {
type: "string",
enum: ["bind", "unbind", "status", "list"],
description:
"bind: Bind to a window (specify title, hwnd, or pid). " +
"unbind: Release the current binding, return to full-screen mode. " +
"status: Show the currently bound window (hwnd, title, rect). " +
"list: List all visible windows with hwnd, pid, and title.",
},
title: {
type: "string",
description: "Window title to search for (partial match, case-insensitive). For 'bind' action.",
},
// The handle is declared as a string here, whereas pid below is an integer.
hwnd: {
type: "string",
description: "Exact window handle from 'list' output. For 'bind' action.",
},
pid: {
type: "integer",
description: "Process ID to find window for. For 'bind' action.",
},
},
required: ["action"],
},
} as Tool,
// activate_window — Windows-only tool that foregrounds the bound window and
// clicks it to establish keyboard focus. Both coordinates are optional.
{
  name: "activate_window",
  // The description previously advertised an optional "initial key sequence",
  // but the schema exposes no parameter for one; the claim was dropped so the
  // text matches what a caller can actually pass.
  description:
    "Activate the bound window: bring it to foreground and click to ensure keyboard focus. " +
    "Use this before any input operations to guarantee " +
    "the window is ready to receive keyboard/mouse events. " +
    "Combines SetForegroundWindow + BringWindowToTop + SendMessage(WM_LBUTTONDOWN) in one call.",
  inputSchema: {
    type: "object" as const,
    properties: {
      click_x: { type: "integer", description: "X coordinate to click after activation (client-area). If omitted, clicks center of window." },
      click_y: { type: "integer", description: "Y coordinate to click after activation (client-area). If omitted, clicks center of window." },
    },
    required: [],
  },
} as Tool,
// prompt_respond — Windows-only convenience tool that answers interactive
// terminal prompts by sending a short key sequence to the bound window.
// Only `response_type` is schema-required.
{
  name: "prompt_respond",
  description:
    "Handle interactive CLI/terminal prompts (Yes/No, selection menus, confirmations). " +
    "Sends a sequence of key events to the bound window to navigate and confirm a prompt. " +
    "This is a convenience wrapper around bound-window keyboard input for common prompt flows. " +
    "Typical flows: " +
    "1) Yes/No prompt → send 'y' or 'n' + Enter. " +
    "2) Arrow-key selection menu → send arrow_down/arrow_up N times + Enter. " +
    "3) Text input prompt → type the response + Enter. " +
    "After responding, take a screenshot to verify the result.",
  inputSchema: {
    type: "object" as const,
    properties: {
      response_type: {
        type: "string",
        enum: ["yes", "no", "enter", "escape", "select", "type"],
        // 'select' used to say it "Requires 'arrow_count'", contradicting the
        // documented default of 1 on arrow_count; the wording now matches the
        // per-field defaults.
        description:
          "yes: send 'y' + Enter. " +
          "no: send 'n' + Enter. " +
          "enter: send Enter only. " +
          "escape: send Escape (cancel). " +
          "select: use arrow keys to navigate to an option, then Enter. Uses 'arrow_direction' (default 'down') and 'arrow_count' (default 1). " +
          "type: type custom text then Enter. Requires 'text'.",
      },
      arrow_direction: {
        type: "string",
        enum: ["up", "down"],
        description: "Arrow key direction for 'select' type. Default: 'down'.",
      },
      arrow_count: {
        type: "integer",
        description: "Number of arrow key presses for 'select' type. Default: 1.",
        minimum: 0,
        maximum: 50,
      },
      text: {
        type: "string",
        description: "Text to type for 'type' response_type.",
      },
    },
    required: ["response_type"],
  },
} as Tool,
// status_indicator — Windows-only tool controlling the overlay label on the
// bound window. Only `action` is schema-required; `message` is documented for
// the 'show' action.
{
name: "status_indicator",
description:
"Control the visual status indicator overlay on the bound window. " +
"The indicator is a small floating label at the bottom of the window that shows what Computer Use is doing. " +
"It auto-shows during click/type/key/scroll operations, but you can also send custom messages. " +
"Actions: show (display a custom message), hide (dismiss), status (check if active).",
inputSchema: {
type: "object" as const,
properties: {
action: {
type: "string",
enum: ["show", "hide", "status"],
description: "show: display a custom message on the indicator. hide: dismiss the indicator. status: check if indicator is active.",
},
// NOTE(review): not schema-required even for 'show' — behavior with no message
// is not visible here; confirm against the handler.
message: {
type: "string",
description: "Custom message to display (for 'show' action). Supports emoji. Auto-fades after 2 seconds.",
},
},
required: ["action"],
},
} as Tool,
// virtual_keyboard — Windows-only tool that sends keyboard input to the bound
// window. `action` and `text` are both schema-required for every action.
{
  name: "virtual_keyboard",
  description:
    "Send keyboard input directly to the bound window via SendMessageW — independent of the physical keyboard. " +
    "The user can keep typing on their own keyboard without interference. " +
    "Supports: single keys, key combinations (Ctrl+S, Alt+F4), text input, and hold-key operations. " +
    "All input targets the bound HWND only — no global keyboard events.",
  inputSchema: {
    type: "object" as const,
    properties: {
      action: {
        type: "string",
        enum: ["type", "combo", "press", "release", "hold"],
        description:
          "type: Send text string via WM_CHAR (Unicode, supports Chinese/emoji). " +
          "combo: Send a key combination like ctrl+s, alt+f4, ctrl+shift+a (press all, release in reverse). " +
          "press: Press a key down and hold it (pair with 'release'). " +
          "release: Release a previously pressed key. " +
          "hold: Press key(s) for a duration then release.",
      },
      text: {
        type: "string",
        // `text` is required for all actions, but 'hold' was missing from this
        // description; the final sentence fills that gap.
        description:
          "For 'type': the text to input. For 'combo': key combination string (e.g. 'ctrl+s', 'alt+tab', 'ctrl+shift+a'). " +
          "For 'press'/'release': single key name (e.g. 'shift', 'ctrl', 'a'). " +
          "For 'hold': the key or key combination to hold (e.g. 'shift', 'ctrl+shift').",
      },
      duration: {
        type: "number",
        description: "For 'hold': seconds to hold the key(s) before releasing. Default: 1.",
      },
      repeat: {
        type: "integer",
        description: "Number of times to repeat the action. Default: 1.",
        minimum: 1,
        maximum: 100,
      },
    },
    required: ["action", "text"],
  },
} as Tool,
// virtual_mouse — Windows-only tool that sends mouse input to the bound window.
// `action` and `coordinate` are schema-required for every action, including
// 'move' and 'drag' (where `coordinate` is the end point).
{
name: "virtual_mouse",
description:
"Control a virtual mouse on the bound window via SendMessageW — independent of the physical mouse. " +
"The user's real cursor stays free. All operations target the bound HWND only.",
inputSchema: {
type: "object" as const,
properties: {
action: {
type: "string",
enum: ["click", "double_click", "right_click", "move", "drag", "down", "up"],
description:
"click: left-click at coordinate. " +
"double_click: double left-click. " +
"right_click: right-click. " +
"move: move virtual cursor (visual only, no click). " +
"drag: press at start, move to end, release. Requires coordinate (end) and start_coordinate. " +
"down: press left button at coordinate (hold). " +
"up: release left button at coordinate.",
},
// Exactly two numbers, enforced by minItems/maxItems.
coordinate: {
type: "array",
items: { type: "number" },
minItems: 2,
maxItems: 2,
description: "(x, y) client-area coordinate on the bound window.",
},
// NOTE(review): the 'drag' text above says start_coordinate is required, while
// this field's description says it may be omitted — confirm which the handler does.
start_coordinate: {
type: "array",
items: { type: "number" },
minItems: 2,
maxItems: 2,
description: "(x, y) start point for drag. If omitted, drags from current virtual cursor position.",
},
},
required: ["action", "coordinate"],
},
} as Tool,
// mouse_wheel — Windows-only tool that scrolls at a point inside the bound
// window. `coordinate` and `delta` are schema-required; `direction` is optional.
{
name: "mouse_wheel",
description:
"Scroll inside the bound window using mouse wheel (WM_MOUSEWHEEL / WM_MOUSEHWHEEL). " +
"Unlike the generic 'scroll' tool which uses WM_VSCROLL (only works on scrollbar controls), " +
"mouse_wheel simulates the physical mouse wheel and works on Excel spreadsheets, web pages, " +
"code editors, PDF viewers, and any modern UI. " +
"Specify the click point within the window where the scroll should occur — " +
"this determines which panel/pane/element receives the scroll.",
inputSchema: {
type: "object" as const,
properties: {
// Exactly two numbers, enforced by minItems/maxItems.
coordinate: {
type: "array",
items: { type: "number" },
minItems: 2,
maxItems: 2,
description: "(x, y) client-area coordinate where the scroll should occur. Determines which element receives the scroll.",
},
// NOTE(review): the sign is documented for vertical scrolling only; the meaning
// of positive/negative for direction='horizontal' is not stated — confirm.
delta: {
type: "integer",
description: "Scroll amount in 'clicks'. Positive = scroll up, negative = scroll down. Each click = 3 lines typically. Use -3 to -5 for page-like scrolling.",
},
direction: {
type: "string",
enum: ["vertical", "horizontal"],
description: "Scroll direction. Default: 'vertical'. Use 'horizontal' for side-scrolling (e.g. wide Excel sheets, timeline views).",
},
},
required: ["coordinate", "delta"],
},
} as Tool,
] : []),
{
name: "switch_display",
description:

View File

@@ -159,28 +159,23 @@ export const apps: AppsAPI = {
async listInstalled() {
try {
// Use Spotlight (mdfind) to enumerate .app bundles and mdls to get real bundle IDs.
// Searches /Applications, /System/Applications, and /System/Applications/Utilities
// so that system apps (Terminal, Chess, etc.) and core services (Finder) are found.
const proc = Bun.spawn([
'bash', '-c',
`for dir in /Applications /System/Applications /System/Applications/Utilities /System/Library/CoreServices; do
mdfind 'kMDItemContentType == "com.apple.application-bundle"' -onlyin "$dir" 2>/dev/null
done | sort -u | while read -r appPath; do
bundleId=$(mdls -raw -name kMDItemCFBundleIdentifier "$appPath" 2>/dev/null)
if [ -n "$bundleId" ] && [ "$bundleId" != "(null)" ]; then
displayName=$(basename "$appPath" .app)
echo "$bundleId|$displayName|$appPath"
fi
done`,
], { stdout: 'pipe', stderr: 'pipe' })
const text = await new Response(proc.stdout).text()
await proc.exited
return text.split('\n').filter(Boolean).map(line => {
const [bundleId, displayName, path] = line.split('|', 3)
const result = await osascript(`
tell application "System Events"
set appList to ""
repeat with appFile in (every file of folder "Applications" of startup disk whose name ends with ".app")
set appPath to POSIX path of (appFile as alias)
set appName to name of appFile
set appList to appList & appPath & "|" & appName & "\\n"
end repeat
return appList
end tell
`)
return result.split('\n').filter(Boolean).map(line => {
const [path, name] = line.split('|', 2)
const displayName = (name ?? '').replace(/\.app$/, '')
return {
bundleId: bundleId ?? '',
displayName: displayName ?? '',
bundleId: `com.app.${displayName.toLowerCase().replace(/\s+/g, '-')}`,
displayName,
path: path ?? '',
}
})

View File

@@ -1,14 +1,10 @@
/**
* @ant/computer-use-swift — cross-platform display, apps, and screenshot API
* @ant/computer-use-swift — macOS display, apps, and screenshot (Swift native)
*
* Platform backends:
* - darwin: AppleScript/JXA + screencapture
* - win32: PowerShell + System.Drawing + Win32 P/Invoke
*
* Add new platforms by creating backends/<platform>.ts implementing SwiftBackend.
* This package wraps the macOS-only Swift .node native module.
* For Windows/Linux, use src/utils/computerUse/platforms/ instead.
*/
// Re-export all types
export type {
DisplayGeometry,
PrepareDisplayResult,
@@ -18,72 +14,42 @@ export type {
ScreenshotResult,
ResolvePrepareCaptureResult,
WindowDisplayInfo,
DisplayAPI,
AppsAPI,
ScreenshotAPI,
SwiftBackend,
} from './types.js'
} from './backends/darwin.js'
import type { ResolvePrepareCaptureResult, SwiftBackend } from './types.js'
import type { ResolvePrepareCaptureResult } from './backends/darwin.js'
// ---------------------------------------------------------------------------
// Platform dispatch
// ---------------------------------------------------------------------------
function loadBackend(): SwiftBackend | null {
function loadDarwin() {
if (process.platform !== 'darwin') return null
try {
switch (process.platform) {
case 'darwin':
return require('./backends/darwin.js') as SwiftBackend
case 'win32':
return require('./backends/win32.js') as SwiftBackend
case 'linux':
return require('./backends/linux.js') as SwiftBackend
default:
return null
}
return require('./backends/darwin.js')
} catch {
return null
}
}
const backend = loadBackend()
// ---------------------------------------------------------------------------
// ComputerUseAPI — Main export (preserves original class interface)
// ---------------------------------------------------------------------------
const darwin = loadDarwin()
export class ComputerUseAPI {
// When no backend is loaded (unsupported platform), all APIs are no-op stubs.
// These stubs should never be reached in practice — callers check isSupported
// or the feature gate before invoking.
apps = backend?.apps ?? {
apps = darwin?.apps ?? {
async prepareDisplay() { return { activated: '', hidden: [] } },
async previewHideSet() { return [] },
async findWindowDisplays(ids: string[]) { return ids.map(b => ({ bundleId: b, displayIds: [] as number[] })) },
async findWindowDisplays(ids: string[]) { return ids.map((b: string) => ({ bundleId: b, displayIds: [] as number[] })) },
async appUnderPoint() { return null },
async listInstalled() { return [] },
iconDataUrl() { return null },
listRunning() { return [] },
async open() { throw new Error('computer-use-swift: no backend for this platform') },
async open() { throw new Error('@ant/computer-use-swift: macOS only') },
async unhide() {},
}
display = backend?.display ?? {
getSize() { throw new Error('computer-use-swift: no backend for this platform') },
listAll() { throw new Error('computer-use-swift: no backend for this platform') },
display = darwin?.display ?? {
getSize() { throw new Error('@ant/computer-use-swift: macOS only') },
listAll() { throw new Error('@ant/computer-use-swift: macOS only') },
}
screenshot = backend?.screenshot ?? {
async captureExcluding() { throw new Error('computer-use-swift: no backend for this platform') },
async captureRegion() { throw new Error('computer-use-swift: no backend for this platform') },
}
hotkey = (backend as any)?.hotkey ?? {
registerEscape(_cb: () => void): boolean { return false },
unregister() {},
notifyExpectedEscape() {},
screenshot = darwin?.screenshot ?? {
async captureExcluding() { throw new Error('@ant/computer-use-swift: macOS only') },
async captureRegion() { throw new Error('@ant/computer-use-swift: macOS only') },
}
async resolvePrepareCapture(
@@ -93,8 +59,6 @@ export class ComputerUseAPI {
targetW: number,
targetH: number,
displayId?: number,
_autoResolve?: boolean,
_doHide?: boolean,
): Promise<ResolvePrepareCaptureResult> {
return this.screenshot.captureExcluding(allowedBundleIds, quality, targetW, targetH, displayId)
}