mirror of
https://github.com/claude-code-best/claude-code.git
synced 2026-06-18 22:35:51 +00:00
feat: Computer Use — Windows 跨平台支持 + GUI 无障碍增强 + Python Bridge
三平台 Computer Use (macOS + Windows + Linux),Windows 专项增强。
- MCP server: toolCalls/tools/executor/mcpServer 等 12 文件完整实现
- 平台抽象层: platforms/{win32,darwin,linux}.ts
- 跨平台 executor: executorCrossPlatform.ts
- CHICAGO_MCP + VOICE_MODE feature flags 启用
- windowMessage.ts: SendMessageW (WM_CHAR Unicode + 剪贴板粘贴)
- windowBorder.ts: 4 叠加窗口边框 (30fps 跟踪)
- uiAutomation.ts: UI Automation 元素树/点击/写值
- accessibilitySnapshot.ts: 无障碍快照 → 模型感知 GUI
- bridge.py + bridgeClient.ts: Python 长驻进程 (替代 per-call PS)
- window_management: min/max/restore/close/focus (Win32 API)
- click_element / type_into_element: 按名称操作 (无需坐标)
- 截图自动附带 Accessibility Snapshot
- 17 种方法, stdin/stdout JSON 通信
- 窗口枚举 1.5ms vs PS 500ms, 截图 360ms vs PS 800ms
- 依赖: mss + Pillow + pywinauto
This commit is contained in:
@@ -1,33 +1,30 @@
|
||||
/**
|
||||
* @ant/computer-use-input — cross-platform keyboard & mouse simulation
|
||||
* @ant/computer-use-input — macOS keyboard & mouse simulation (enigo)
|
||||
*
|
||||
* Platform backends:
|
||||
* - darwin: AppleScript/JXA via CoreGraphics events
|
||||
* - win32: PowerShell via Win32 P/Invoke (SetCursorPos, SendInput, keybd_event)
|
||||
*
|
||||
* Add new platforms by creating backends/<platform>.ts implementing InputBackend.
|
||||
* This package wraps the macOS-only native enigo .node module.
|
||||
* For Windows/Linux, use src/utils/computerUse/platforms/ instead.
|
||||
*/
|
||||
|
||||
import type { FrontmostAppInfo, InputBackend } from './types.js'
|
||||
export interface FrontmostAppInfo {
|
||||
bundleId: string
|
||||
appName: string
|
||||
}
|
||||
|
||||
export type { FrontmostAppInfo, InputBackend } from './types.js'
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Platform dispatch
|
||||
// ---------------------------------------------------------------------------
|
||||
export interface InputBackend {
|
||||
moveMouse(x: number, y: number, animated: boolean): Promise<void>
|
||||
key(key: string, action: 'press' | 'release'): Promise<void>
|
||||
keys(parts: string[]): Promise<void>
|
||||
mouseLocation(): Promise<{ x: number; y: number }>
|
||||
mouseButton(button: 'left' | 'right' | 'middle', action: 'click' | 'press' | 'release', count?: number): Promise<void>
|
||||
mouseScroll(amount: number, direction: 'vertical' | 'horizontal'): Promise<void>
|
||||
typeText(text: string): Promise<void>
|
||||
getFrontmostAppInfo(): FrontmostAppInfo | null
|
||||
}
|
||||
|
||||
function loadBackend(): InputBackend | null {
|
||||
if (process.platform !== 'darwin') return null
|
||||
try {
|
||||
switch (process.platform) {
|
||||
case 'darwin':
|
||||
return require('./backends/darwin.js') as InputBackend
|
||||
case 'win32':
|
||||
return require('./backends/win32.js') as InputBackend
|
||||
case 'linux':
|
||||
return require('./backends/linux.js') as InputBackend
|
||||
default:
|
||||
return null
|
||||
}
|
||||
return require('./backends/darwin.js') as InputBackend
|
||||
} catch {
|
||||
return null
|
||||
}
|
||||
@@ -35,30 +32,16 @@ function loadBackend(): InputBackend | null {
|
||||
|
||||
const backend = loadBackend()
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Unsupported stub (throws on call — guards via isSupported check)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function unsupported(): never {
|
||||
throw new Error(`computer-use-input is not supported on ${process.platform}`)
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Public API — matches the original export surface
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export const isSupported = backend !== null
|
||||
|
||||
export const moveMouse = backend?.moveMouse ?? unsupported
|
||||
export const key = backend?.key ?? unsupported
|
||||
export const keys = backend?.keys ?? unsupported
|
||||
export const mouseLocation = backend?.mouseLocation ?? unsupported
|
||||
export const mouseButton = backend?.mouseButton ?? unsupported
|
||||
export const mouseScroll = backend?.mouseScroll ?? unsupported
|
||||
export const typeText = backend?.typeText ?? unsupported
|
||||
export const moveMouse = backend?.moveMouse
|
||||
export const key = backend?.key
|
||||
export const keys = backend?.keys
|
||||
export const mouseLocation = backend?.mouseLocation
|
||||
export const mouseButton = backend?.mouseButton
|
||||
export const mouseScroll = backend?.mouseScroll
|
||||
export const typeText = backend?.typeText
|
||||
export const getFrontmostAppInfo = backend?.getFrontmostAppInfo ?? (() => null)
|
||||
|
||||
// Legacy class type — used by inputLoader.ts for type narrowing
|
||||
export class ComputerUseInputAPI {
|
||||
declare moveMouse: InputBackend['moveMouse']
|
||||
declare key: InputBackend['key']
|
||||
@@ -71,8 +54,5 @@ export class ComputerUseInputAPI {
|
||||
declare isSupported: true
|
||||
}
|
||||
|
||||
interface ComputerUseInputUnsupported {
|
||||
isSupported: false
|
||||
}
|
||||
|
||||
interface ComputerUseInputUnsupported { isSupported: false }
|
||||
export type ComputerUseInput = ComputerUseInputAPI | ComputerUseInputUnsupported
|
||||
|
||||
@@ -16,6 +16,8 @@ export interface ScreenshotResult {
|
||||
originX: number
|
||||
originY: number
|
||||
displayId?: number
|
||||
/** Accessibility snapshot — structured GUI element tree as model-friendly text. Windows only. */
|
||||
accessibilityText?: string
|
||||
}
|
||||
|
||||
export interface FrontmostApp {
|
||||
@@ -108,4 +110,59 @@ export interface ComputerExecutor {
|
||||
getAppIcon(path: string): Promise<string | undefined>
|
||||
listRunningApps(): Promise<RunningApp[]>
|
||||
openApp(bundleId: string): Promise<void>
|
||||
|
||||
// ── Window management (Windows only, optional) ──────────────────────────
|
||||
/** Perform a window management action on the bound window. Win32 API only — no global shortcuts. */
|
||||
manageWindow?(action: string, opts?: { x?: number; y?: number; width?: number; height?: number }): Promise<boolean>
|
||||
/** Get the current window rect of the bound window */
|
||||
getWindowRect?(): Promise<{ x: number; y: number; width: number; height: number } | null>
|
||||
|
||||
// ── Window binding, virtual input & element-targeted actions (Windows, optional) ──
|
||||
/** Open terminal and launch an agent CLI */
|
||||
openTerminal?(opts: {
|
||||
agent: 'claude' | 'codex' | 'gemini' | 'custom'
|
||||
command?: string
|
||||
terminal?: 'wt' | 'powershell' | 'cmd'
|
||||
workingDirectory?: string
|
||||
}): Promise<{ hwnd: string; title: string; launched: boolean } | null>
|
||||
/** Bind to a window by hwnd/title/pid. Returns bound window info or null. */
|
||||
bindToWindow?(query: { hwnd?: string; title?: string; pid?: number }): Promise<{ hwnd: string; title: string; pid: number } | null>
|
||||
/** Unbind from the current window */
|
||||
unbindFromWindow?(): Promise<void>
|
||||
/** Cheap binding-state check for window-targeted routing decisions. */
|
||||
hasBoundWindow?(): Promise<boolean>
|
||||
/** Get current binding status */
|
||||
getBindingStatus?(): Promise<{ bound: boolean; hwnd?: string; title?: string; pid?: number; rect?: { x: number; y: number; width: number; height: number } } | null>
|
||||
/** List all visible windows */
|
||||
listVisibleWindows?(): Promise<Array<{ hwnd: string; pid: number; title: string }>>
|
||||
/** Control the status indicator overlay */
|
||||
statusIndicator?(action: 'show' | 'hide' | 'status', message?: string): Promise<{ active: boolean; message?: string }>
|
||||
/** Virtual keyboard — send keys/text/combos to bound window only */
|
||||
virtualKeyboard?(opts: {
|
||||
action: 'type' | 'combo' | 'press' | 'release' | 'hold'
|
||||
text: string
|
||||
duration?: number
|
||||
repeat?: number
|
||||
}): Promise<boolean>
|
||||
/** Virtual mouse — click/move/drag on bound window only */
|
||||
virtualMouse?(opts: {
|
||||
action: 'click' | 'double_click' | 'right_click' | 'move' | 'drag' | 'down' | 'up'
|
||||
x: number; y: number
|
||||
startX?: number; startY?: number
|
||||
}): Promise<boolean>
|
||||
/** Mouse wheel scroll at client coordinates (works on Excel, browsers, modern UI) */
|
||||
mouseWheel?(x: number, y: number, delta: number, horizontal?: boolean): Promise<boolean>
|
||||
/** Activate the bound window (foreground + click to focus) */
|
||||
activateWindow?(clickX?: number, clickY?: number): Promise<boolean>
|
||||
/** Handle a terminal prompt (yes/no/select/type + enter) */
|
||||
respondToPrompt?(opts: {
|
||||
responseType: 'yes' | 'no' | 'enter' | 'escape' | 'select' | 'type'
|
||||
arrowDirection?: 'up' | 'down'
|
||||
arrowCount?: number
|
||||
text?: string
|
||||
}): Promise<boolean>
|
||||
/** Click an element by name/role/automationId via UI Automation */
|
||||
clickElement?(query: { name?: string; role?: string; automationId?: string }): Promise<boolean>
|
||||
/** Type text into an element by name/role/automationId via UI Automation ValuePattern */
|
||||
typeIntoElement?(query: { name?: string; role?: string; automationId?: string }, text: string): Promise<boolean>
|
||||
}
|
||||
|
||||
@@ -434,6 +434,15 @@ async function runInputActionGates(
|
||||
}
|
||||
}
|
||||
|
||||
// Windows/Linux: operations go through SendMessage (HWND-bound) or platform
|
||||
// abstraction, not global input to the foreground. The frontmost gate is a
|
||||
// macOS safety net for global CGEvent input — on other platforms, skip it
|
||||
// when the platform's screenshotFiltering is 'none' (no per-app filtering,
|
||||
// meaning no hide/defocus, meaning frontmost is meaningless).
|
||||
if (adapter.executor.capabilities.screenshotFiltering === 'none') {
|
||||
return null; // pass — non-macOS platform, frontmost irrelevant
|
||||
}
|
||||
|
||||
// Frontmost gate. Check FRESH on every call.
|
||||
const frontmost = await adapter.executor.getFrontmostApp();
|
||||
|
||||
@@ -561,6 +570,13 @@ async function runHitTestGate(
|
||||
y: number,
|
||||
actionKind: CuActionKind,
|
||||
): Promise<CuCallToolResult | null> {
|
||||
// Non-macOS: HWND-bound mode — clicks go to the bound window via
|
||||
// SendMessage with window-relative coordinates. Hit-test against the
|
||||
// real screen is meaningless.
|
||||
if (adapter.executor.capabilities.screenshotFiltering === 'none') {
|
||||
return null;
|
||||
}
|
||||
|
||||
const target = await adapter.executor.appUnderPoint(x, y);
|
||||
if (!target) return null; // desktop / nothing under point / platform no-op
|
||||
|
||||
@@ -796,12 +812,12 @@ function resolveRequestedApps(
|
||||
if (!resolved) {
|
||||
resolved = byLowerDisplayName.get(requested.toLowerCase());
|
||||
}
|
||||
// Fuzzy fallback: match requested name as substring of display name
|
||||
// e.g. "Chrome" matches "Google Chrome", "Code" matches "Visual Studio Code"
|
||||
// Windows fuzzy matching: strip .exe suffix, try substring match
|
||||
if (!resolved) {
|
||||
const lower = requested.toLowerCase();
|
||||
for (const app of installed) {
|
||||
if (app.displayName.toLowerCase().includes(lower)) {
|
||||
const clean = requested.toLowerCase().replace(/\.exe$/, '').trim();
|
||||
// Try: "chrome" matches "Google Chrome", "notepad" matches "Notepad"
|
||||
for (const [name, app] of byLowerDisplayName) {
|
||||
if (name.includes(clean) || clean.includes(name)) {
|
||||
resolved = app;
|
||||
break;
|
||||
}
|
||||
@@ -2137,6 +2153,8 @@ async function handleScreenshot(
|
||||
content: [
|
||||
...(monitorNote ? [{ type: "text" as const, text: monitorNote }] : []),
|
||||
...(hiddenNote ? [{ type: "text" as const, text: hiddenNote }] : []),
|
||||
// Accessibility snapshot: structured GUI element tree (Windows bound-window mode)
|
||||
...(shot.accessibilityText ? [{ type: "text" as const, text: `GUI elements in this window:\n${shot.accessibilityText}` }] : []),
|
||||
{
|
||||
type: "image",
|
||||
data: shot.base64,
|
||||
@@ -2204,6 +2222,8 @@ async function handleScreenshot(
|
||||
content: [
|
||||
...(monitorNote ? [{ type: "text" as const, text: monitorNote }] : []),
|
||||
...(hiddenNote ? [{ type: "text" as const, text: hiddenNote }] : []),
|
||||
// Accessibility snapshot: structured GUI element tree (Windows bound-window mode)
|
||||
...(shot.accessibilityText ? [{ type: "text" as const, text: `GUI elements in this window:\n${shot.accessibilityText}` }] : []),
|
||||
{
|
||||
type: "image",
|
||||
data: shot.base64,
|
||||
@@ -2812,6 +2832,443 @@ async function handleOpenApplication(
|
||||
return okText(`Opened "${app}".`);
|
||||
}
|
||||
|
||||
/**
 * Handle the `virtual_mouse` tool by forwarding a mouse action to the
 * executor's optional `virtualMouse` capability.
 *
 * Recognised `args`:
 * - `action`: click, double_click, right_click, move, drag, down or up.
 * - `coordinate`: `[x, y]`; both entries must be finite numbers.
 * - `start_coordinate` (optional): `[x, y]` drag origin. A non-array value is
 *   ignored; an array must contain two finite numbers.
 *
 * @returns A text result describing the action, or an error result when the
 *   capability is missing, an argument is invalid, or the executor returns
 *   `false`.
 */
async function handleVirtualMouse(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.virtualMouse) {
    return errorResult("virtual_mouse is only available on Windows with a bound window.", "feature_unavailable");
  }
  const action = requireString(args, "action");
  if (action instanceof Error) return errorResult(action.message, "bad_args");

  // Parse an [x, y] pair, rejecting entries that are not finite numbers so
  // strings / null / NaN never reach the executor.
  const readPoint = (value: unknown): { x: number; y: number } | undefined => {
    if (!Array.isArray(value) || value.length < 2) return undefined;
    const px: unknown = value[0];
    const py: unknown = value[1];
    if (typeof px !== "number" || typeof py !== "number") return undefined;
    if (!Number.isFinite(px) || !Number.isFinite(py)) return undefined;
    return { x: px, y: py };
  };

  const rawCoord = args.coordinate;
  if (!Array.isArray(rawCoord) || rawCoord.length < 2) {
    return errorResult("coordinate [x, y] is required.", "bad_args");
  }
  const target = readPoint(rawCoord);
  if (!target) {
    return errorResult("coordinate [x, y] must contain two finite numbers.", "bad_args");
  }

  const validActions = ["click", "double_click", "right_click", "move", "drag", "down", "up"] as const;
  type MouseAction = (typeof validActions)[number];
  const isMouseAction = (value: string): value is MouseAction =>
    (validActions as readonly string[]).includes(value);
  if (!isMouseAction(action)) {
    return errorResult(`Invalid action "${action}". Valid: ${validActions.join(", ")}`, "bad_args");
  }

  // A non-array start_coordinate is treated as absent (as before); an array
  // that is present must be a well-formed point.
  let start: { x: number; y: number } | undefined;
  if (Array.isArray(args.start_coordinate)) {
    start = readPoint(args.start_coordinate);
    if (!start) {
      return errorResult("start_coordinate [x, y] must contain two finite numbers.", "bad_args");
    }
  }

  const ok = await adapter.executor.virtualMouse({
    action,
    x: target.x,
    y: target.y,
    startX: start?.x,
    startY: start?.y,
  });
  if (!ok) {
    return errorResult("No window is currently bound.", "bad_args");
  }

  const at = `(${target.x},${target.y})`;
  const desc: Record<MouseAction, string> = {
    click: `Click at ${at}`,
    double_click: `Double-click at ${at}`,
    right_click: `Right-click at ${at}`,
    move: `Moved to ${at}`,
    drag: `Dragged ${start ? `(${start.x},${start.y})` : "current"} → ${at}`,
    down: `Button down at ${at}`,
    up: `Button up at ${at}`,
  };
  return okText(desc[action]);
}
|
||||
|
||||
/**
 * Handle the `virtual_keyboard` tool: forward a key/text action to the
 * executor's optional `virtualKeyboard` capability and summarise the result.
 *
 * Reads `action` and `text` (both required strings) plus optional numeric
 * `duration` and `repeat` from `args`.
 *
 * @returns A text summary on success; an error result when the capability is
 *   missing, an argument is invalid, or the executor returns `false`.
 */
async function handleVirtualKeyboard(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.virtualKeyboard) {
    return errorResult("virtual_keyboard is only available on Windows with a bound window.", "feature_unavailable");
  }

  const requested = requireString(args, "action");
  if (requested instanceof Error) return errorResult(requested.message, "bad_args");
  const payload = requireString(args, "text");
  if (payload instanceof Error) return errorResult(payload.message, "bad_args");

  const allowed = new Set(["type", "combo", "press", "release", "hold"]);
  if (!allowed.has(requested)) {
    return errorResult(`Invalid action "${requested}". Valid: ${[...allowed].join(", ")}`, "bad_args");
  }

  const holdFor = typeof args.duration === "number" ? args.duration : undefined;
  const times = typeof args.repeat === "number" ? args.repeat : undefined;

  const delivered = await adapter.executor.virtualKeyboard({
    // Membership in `allowed` was checked above.
    action: requested as "type" | "combo" | "press" | "release" | "hold",
    text: payload,
    duration: holdFor,
    repeat: times,
  });
  if (!delivered) {
    return errorResult("No window is currently bound. Use open_application or bind_window first.", "bad_args");
  }

  let summary: string;
  switch (requested) {
    case "type": {
      // Long text is truncated to its first 40 characters in the summary.
      const preview = payload.length > 40 ? payload.slice(0, 40) + "..." : payload;
      summary = `Typed "${preview}"`;
      break;
    }
    case "combo":
      summary = `Sent ${payload}`;
      break;
    case "press":
      summary = `Pressed ${payload} (holding)`;
      break;
    case "release":
      summary = `Released ${payload}`;
      break;
    default:
      // Only "hold" remains after the membership check.
      summary = `Held ${payload} for ${holdFor ?? 1}s`;
      break;
  }

  const repeatSuffix = times !== undefined && times > 1 ? ` ×${times}` : "";
  return okText(`${summary}${repeatSuffix}`);
}
|
||||
|
||||
/**
 * Handle the `status_indicator` tool: show, hide, or query the overlay via
 * the executor's optional `statusIndicator` capability.
 *
 * `args.action` must be "show", "hide" or "status"; "show" additionally
 * requires a non-empty string `args.message`.
 *
 * @returns A text result describing the outcome, or an error result when the
 *   capability is missing or an argument is invalid.
 */
async function handleStatusIndicator(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.statusIndicator) {
    return errorResult("status_indicator is only available on Windows.", "feature_unavailable");
  }

  const verb = requireString(args, "action");
  if (verb instanceof Error) return errorResult(verb.message, "bad_args");

  const recognised = verb === "show" || verb === "hide" || verb === "status";
  if (!recognised) {
    return errorResult(`Invalid action "${verb}". Valid: show, hide, status.`, "bad_args");
  }

  const note = typeof args.message === "string" ? args.message : undefined;
  if (verb === "show" && !note) {
    return errorResult("'show' requires a message parameter.", "bad_args");
  }

  // The executor is invoked for every action; only "status" reads the reply.
  const reply = await adapter.executor.statusIndicator(verb as "show" | "hide" | "status", note);

  switch (verb) {
    case "status":
      return okText(
        reply.active
          ? "Indicator is active on the bound window."
          : "Indicator is not active (no window bound).",
      );
    case "show":
      return okText(`Indicator showing: "${note}"`);
    default:
      return okText("Indicator hidden.");
  }
}
|
||||
|
||||
/**
 * Handle the `mouse_wheel` tool: send a wheel scroll to the executor's
 * optional `mouseWheel` capability.
 *
 * Recognised `args`:
 * - `coordinate`: `[x, y]`; both entries must be finite numbers.
 * - `delta`: finite number (the error text documents positive=up,
 *   negative=down).
 * - `direction`: "horizontal" selects horizontal scrolling; any other value
 *   means vertical.
 *
 * @returns A text summary on success; an error result when the capability is
 *   missing, an argument is invalid, or the executor returns `false`.
 */
async function handleMouseWheel(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.mouseWheel) {
    return errorResult("mouse_wheel is only available on Windows with a bound window.", "feature_unavailable");
  }

  const coord = args.coordinate;
  if (!Array.isArray(coord) || coord.length < 2) {
    return errorResult("coordinate must be [x, y] array.", "bad_args");
  }
  // Array.isArray alone says nothing about the element types; make sure the
  // executor only ever receives real numbers.
  const x: unknown = coord[0];
  const y: unknown = coord[1];
  if (
    typeof x !== "number" || typeof y !== "number" ||
    !Number.isFinite(x) || !Number.isFinite(y)
  ) {
    return errorResult("coordinate must be [x, y] with finite numbers.", "bad_args");
  }

  const delta =
    typeof args.delta === "number" && Number.isFinite(args.delta) ? args.delta : undefined;
  if (delta === undefined) {
    return errorResult("delta is required (positive=up, negative=down).", "bad_args");
  }

  const horizontal = args.direction === "horizontal";
  const ok = await adapter.executor.mouseWheel(x, y, delta, horizontal);
  if (!ok) {
    return errorResult("No window is currently bound. Use open_application or bind_window first.", "bad_args");
  }
  return okText(
    `Mouse wheel: ${horizontal ? "horizontal" : "vertical"} scroll ${delta > 0 ? "up" : "down"} ${Math.abs(delta)} click(s) at (${x},${y}).`,
  );
}
|
||||
|
||||
/**
 * Handle the `activate_window` tool by calling the executor's optional
 * `activateWindow` capability.
 *
 * `args.click_x` / `args.click_y` are forwarded when they are numbers and
 * passed as `undefined` otherwise.
 *
 * @returns A confirmation text on success; an error result when the
 *   capability is missing or the executor returns `false`.
 */
async function handleActivateWindow(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.activateWindow) {
    return errorResult("activate_window is only available on Windows with a bound window.", "feature_unavailable");
  }

  const numberOrUndefined = (value: unknown): number | undefined =>
    typeof value === "number" ? value : undefined;

  const activated = await adapter.executor.activateWindow(
    numberOrUndefined(args.click_x),
    numberOrUndefined(args.click_y),
  );

  return activated
    ? okText("Window activated and focused. Ready for input.")
    : errorResult("No window is currently bound. Use open_application or bind_window first.", "bad_args");
}
|
||||
|
||||
/**
 * Handle the `prompt_respond` tool: answer a terminal prompt through the
 * executor's optional `respondToPrompt` capability.
 *
 * Recognised `args`:
 * - `response_type`: yes, no, enter, escape, select or type.
 * - `arrow_count`: number, required for "select".
 * - `arrow_direction`: "up" or "down"; any other string is rejected.
 * - `text`: string, required for "type".
 *
 * @returns A text summary on success; an error result when the capability is
 *   missing, an argument is invalid, or the executor returns `false`.
 */
async function handlePromptRespond(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.respondToPrompt) {
    return errorResult("prompt_respond is only available on Windows with a bound window.", "feature_unavailable");
  }
  const responseType = requireString(args, "response_type");
  if (responseType instanceof Error) return errorResult(responseType.message, "bad_args");

  const validTypes = ["yes", "no", "enter", "escape", "select", "type"] as const;
  type ResponseType = (typeof validTypes)[number];
  const isResponseType = (value: string): value is ResponseType =>
    (validTypes as readonly string[]).includes(value);
  if (!isResponseType(responseType)) {
    return errorResult(`Invalid response_type "${responseType}". Valid: ${validTypes.join(", ")}`, "bad_args");
  }

  const arrowCount = typeof args.arrow_count === "number" ? args.arrow_count : undefined;
  const text = typeof args.text === "string" ? args.text : undefined;

  if (responseType === "select" && arrowCount === undefined) {
    return errorResult("'select' requires arrow_count parameter.", "bad_args");
  }
  if (responseType === "type" && text === undefined) {
    return errorResult("'type' requires text parameter.", "bad_args");
  }

  // Previously any string was cast through `as any` and forwarded; only the
  // two directions the executor's signature declares are accepted now.
  const rawDirection = args.arrow_direction;
  let arrowDirection: "up" | "down" | undefined;
  if (typeof rawDirection === "string") {
    if (rawDirection !== "up" && rawDirection !== "down") {
      return errorResult(`Invalid arrow_direction "${rawDirection}". Valid: up, down.`, "bad_args");
    }
    arrowDirection = rawDirection;
  }

  const ok = await adapter.executor.respondToPrompt({
    responseType,
    arrowDirection,
    arrowCount,
    text,
  });
  if (!ok) {
    return errorResult("No window is currently bound. Use open_application or bind_window first.", "bad_args");
  }

  // NOTE(review): the "down" / 1 fallbacks only affect this summary text; the
  // executor's own defaults are not visible here — confirm they match.
  const descriptions: Record<ResponseType, string> = {
    yes: "Sent 'y' + Enter.",
    no: "Sent 'n' + Enter.",
    enter: "Sent Enter.",
    escape: "Sent Escape.",
    select: `Navigated ${arrowDirection ?? "down"} ${arrowCount ?? 1} time(s) + Enter.`,
    type: `Typed "${text}" + Enter.`,
  };

  // Each description already ends with a period, so join with a space rather
  // than ". " (which produced "Sent Enter.. Take a screenshot ...").
  return okText(`Prompt responded: ${descriptions[responseType]} Take a screenshot to verify.`);
}
|
||||
|
||||
/**
 * Handle the `open_terminal` tool: open a terminal and launch an agent CLI
 * through the executor's optional `openTerminal` capability.
 *
 * Recognised `args`:
 * - `agent`: claude, codex, gemini or custom.
 * - `command`: string, required when `agent` is "custom".
 * - `terminal`: wt, powershell or cmd; any other string is rejected.
 * - `working_directory`: optional string.
 *
 * @returns A text summary of the opened window, or an error result when the
 *   capability is missing, an argument is invalid, or the executor returns
 *   `null`.
 */
async function handleOpenTerminal(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.openTerminal) {
    return errorResult("open_terminal is only available on Windows.", "feature_unavailable");
  }
  const agent = requireString(args, "agent");
  if (agent instanceof Error) return errorResult(agent.message, "bad_args");

  const validAgents = ["claude", "codex", "gemini", "custom"] as const;
  type Agent = (typeof validAgents)[number];
  const isAgent = (value: string): value is Agent =>
    (validAgents as readonly string[]).includes(value);
  if (!isAgent(agent)) {
    return errorResult(`Invalid agent "${agent}". Valid: claude, codex, gemini, custom.`, "bad_args");
  }

  const command = typeof args.command === "string" ? args.command : undefined;
  if (agent === "custom" && command === undefined) {
    return errorResult("agent='custom' requires 'command' parameter.", "bad_args");
  }

  // Previously any string was cast through `as any` and forwarded; only the
  // terminals the executor's signature declares are accepted now.
  const rawTerminal = args.terminal;
  let terminal: "wt" | "powershell" | "cmd" | undefined;
  if (typeof rawTerminal === "string") {
    if (rawTerminal !== "wt" && rawTerminal !== "powershell" && rawTerminal !== "cmd") {
      return errorResult(`Invalid terminal "${rawTerminal}". Valid: wt, powershell, cmd.`, "bad_args");
    }
    terminal = rawTerminal;
  }

  const result = await adapter.executor.openTerminal({
    agent,
    command,
    terminal,
    workingDirectory: typeof args.working_directory === "string" ? args.working_directory : undefined,
  });

  if (!result) {
    return errorResult(
      "Failed to open terminal. Windows Terminal (wt.exe) may not be installed.",
      "launch_failed",
    );
  }

  if (!result.launched) {
    return okText(
      `Terminal opened (hwnd=${result.hwnd}, "${result.title}") but no command was sent. Window is now bound.`,
    );
  }

  const displayNames = { claude: "Claude Code", codex: "Codex", gemini: "Gemini" } as const;
  // For a custom agent the command itself doubles as the display name.
  const launchedName = agent === "custom" ? command : displayNames[agent];
  const sentCommand = agent === "custom" ? command : agent;

  return okText(
    `Terminal opened and ${launchedName} launched.\n` +
    `Window: hwnd=${result.hwnd} "${result.title}"\n` +
    `Command: '${sentCommand}' + Enter\n` +
    `Status: bound to this terminal. Take a screenshot to verify the agent started.`,
  );
}
|
||||
|
||||
/**
 * Handle the `bind_window` tool. `args.action` selects one of:
 * - "list": enumerate visible windows via `listVisibleWindows`.
 * - "status": report the current binding via `getBindingStatus`.
 * - "bind": bind by `title`, `hwnd` and/or `pid` via `bindToWindow`.
 * - "unbind": release the binding via `unbindFromWindow`.
 *
 * Each action checks only the executor capability it needs.
 *
 * @returns A text result for the chosen action, or an error result for an
 *   unknown action, a missing capability, missing selectors, or no match.
 */
async function handleBindWindow(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  const action = requireString(args, "action");
  if (action instanceof Error) return errorResult(action.message, "bad_args");

  // Shared reply for any action whose executor capability is absent.
  const unavailable = () =>
    errorResult("bind_window is only available on Windows.", "feature_unavailable");

  if (action === "list") {
    if (!adapter.executor.listVisibleWindows) return unavailable();
    const found = await adapter.executor.listVisibleWindows();
    if (found.length === 0) return okText("No visible windows found.");
    const rows: string[] = [];
    for (const win of found) {
      rows.push(`hwnd=${win.hwnd} pid=${win.pid} "${win.title}"`);
    }
    return okText(`Visible windows (${found.length}):\n${rows.join("\n")}`);
  }

  if (action === "status") {
    if (!adapter.executor.getBindingStatus) return unavailable();
    const binding = await adapter.executor.getBindingStatus();
    if (!binding?.bound) {
      return okText("No window is currently bound. Use bind_window(action='list') to see available windows, then bind_window(action='bind', title='...') to bind.");
    }
    // Optional fields are appended only when truthy.
    const segments = [`Bound to: hwnd=${binding.hwnd}`];
    if (binding.title) segments.push(` "${binding.title}"`);
    if (binding.pid) segments.push(` pid=${binding.pid}`);
    if (binding.rect) {
      const r = binding.rect;
      segments.push(` rect=(${r.x},${r.y} ${r.width}x${r.height})`);
    }
    return okText(segments.join(""));
  }

  if (action === "bind") {
    if (!adapter.executor.bindToWindow) return unavailable();
    const title = typeof args.title === "string" ? args.title : undefined;
    const hwnd = typeof args.hwnd === "string" ? args.hwnd : undefined;
    const pid = typeof args.pid === "number" ? args.pid : undefined;
    if (!(title || hwnd || pid)) {
      return errorResult("Specify at least one of: title, hwnd, or pid.", "bad_args");
    }
    const bound = await adapter.executor.bindToWindow({ hwnd, title, pid });
    if (bound) {
      return okText(`Bound to window: hwnd=${bound.hwnd} pid=${bound.pid} "${bound.title}". All subsequent screenshot/click/type operations target this window.`);
    }
    const criteria: string[] = [];
    if (title) criteria.push(`title="${title}"`);
    if (hwnd) criteria.push(`hwnd=${hwnd}`);
    if (pid) criteria.push(`pid=${pid}`);
    return errorResult(
      `No window found matching: ${criteria.join(", ")}. Use bind_window(action='list') to see available windows.`,
      "element_not_found",
    );
  }

  if (action === "unbind") {
    if (!adapter.executor.unbindFromWindow) return unavailable();
    await adapter.executor.unbindFromWindow();
    return okText("Window binding released. Operations now target the full screen.");
  }

  return errorResult(`Unknown bind_window action "${action}". Valid: list, bind, unbind, status.`, "bad_args");
}
|
||||
|
||||
/**
 * Handle the `click_element` tool: click an element selected by `name`,
 * `role` and/or `automationId` via the executor's optional `clickElement`
 * capability.
 *
 * At least one selector must be a non-empty string.
 *
 * @returns A confirmation text on success; an error result when the
 *   capability is missing, no selector is given, or the executor returns
 *   `false`.
 */
async function handleClickElement(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.clickElement) {
    return errorResult(
      "click_element is only available on Windows with a bound window.",
      "feature_unavailable",
    );
  }

  const stringOrUndefined = (value: unknown): string | undefined =>
    typeof value === "string" ? value : undefined;
  const name = stringOrUndefined(args.name);
  const role = stringOrUndefined(args.role);
  const automationId = stringOrUndefined(args.automationId);

  if (!(name || role || automationId)) {
    return errorResult("At least one of name, role, or automationId is required.", "bad_args");
  }

  const clicked = await adapter.executor.clickElement({ name, role, automationId });

  if (clicked) {
    const labels: string[] = [];
    if (name) labels.push(`"${name}"`);
    if (role) labels.push(role);
    if (automationId) labels.push(automationId);
    return okText(`Clicked element: ${labels.join(" ")}`);
  }

  const criteria: string[] = [];
  if (name) criteria.push(`name="${name}"`);
  if (role) criteria.push(`role=${role}`);
  if (automationId) criteria.push(`id=${automationId}`);
  return errorResult(
    `Element not found: ${criteria.join(", ")}. Take a screenshot to see current GUI elements.`,
    "element_not_found",
  );
}
|
||||
|
||||
/**
 * Handle the `type_into_element` tool: write `text` into an element selected
 * by `name`, `role` and/or `automationId` via the executor's optional
 * `typeIntoElement` capability.
 *
 * `args.text` is required, and at least one selector must be a non-empty
 * string (the same rule `click_element` enforces).
 *
 * @returns A confirmation text on success; an error result when the
 *   capability is missing, an argument is invalid, or the executor returns
 *   `false`.
 */
async function handleTypeIntoElement(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.typeIntoElement) {
    return errorResult(
      "type_into_element is only available on Windows with a bound window.",
      "feature_unavailable",
    );
  }
  const text = requireString(args, "text");
  if (text instanceof Error) return errorResult(text.message, "bad_args");

  const name = typeof args.name === "string" ? args.name : undefined;
  const role = typeof args.role === "string" ? args.role : undefined;
  const automationId = typeof args.automationId === "string" ? args.automationId : undefined;

  // Without any selector the query is empty: the executor would have nothing
  // to match on and the failure message below would list no criteria.
  if (!name && !role && !automationId) {
    return errorResult("At least one of name, role, or automationId is required.", "bad_args");
  }

  const ok = await adapter.executor.typeIntoElement({ name, role, automationId }, text);
  if (!ok) {
    return errorResult(
      `Could not type into element: ${[name && `name="${name}"`, role && `role=${role}`, automationId && `id=${automationId}`].filter(Boolean).join(", ")}. The element was not found or doesn't support text input.`,
      "element_not_found",
    );
  }
  return okText(`Typed ${text.length} chars into: ${[name && `"${name}"`, role, automationId].filter(Boolean).join(" ")}`);
}
|
||||
|
||||
/**
 * Handle the `window_management` tool. `args.action` must be one of
 * minimize, maximize, restore, close, focus, move_offscreen, move_resize or
 * get_rect.
 *
 * - "get_rect" reads the rectangle through `getWindowRect`.
 * - "move_resize" requires numeric `x` and `y`; numeric `width` / `height`
 *   are forwarded when present.
 * - Every other action is passed to `manageWindow` without options.
 *
 * The action name is validated before the `manageWindow` capability check,
 * and that check applies to "get_rect" as well.
 *
 * @returns A text result describing the outcome, or an error result for an
 *   invalid action, a missing capability, missing arguments, or a falsy
 *   executor reply.
 */
async function handleWindowManagement(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  const action = requireString(args, "action");
  if (action instanceof Error) return errorResult(action.message, "bad_args");

  const supported = new Set([
    "minimize", "maximize", "restore", "close", "focus", "move_offscreen", "move_resize", "get_rect",
  ]);
  if (!supported.has(action)) {
    return errorResult(
      `Unknown window_management action "${action}". Valid: ${[...supported].join(", ")}`,
      "bad_args",
    );
  }

  if (!adapter.executor.manageWindow) {
    return errorResult(
      "window_management is only available on Windows with a bound window.",
      "feature_unavailable",
    );
  }

  const notBound = () =>
    errorResult("No window is currently bound. Call open_application first.", "bad_args");

  switch (action) {
    case "get_rect": {
      if (!adapter.executor.getWindowRect) {
        return errorResult("getWindowRect not available.", "feature_unavailable");
      }
      const bounds = await adapter.executor.getWindowRect();
      if (!bounds) return notBound();
      return okText(
        `Window rect: x=${bounds.x}, y=${bounds.y}, width=${bounds.width}, height=${bounds.height}`,
      );
    }

    case "move_resize": {
      const numeric = (value: unknown): number | undefined =>
        typeof value === "number" ? value : undefined;
      const x = numeric(args.x);
      const y = numeric(args.y);
      if (x === undefined || y === undefined) {
        return errorResult("move_resize requires x and y parameters.", "bad_args");
      }
      const width = numeric(args.width);
      const height = numeric(args.height);
      const moved = await adapter.executor.manageWindow(action, { x, y, width, height });
      if (!moved) return notBound();
      // The resize wording is used only when both dimensions are truthy.
      if (width && height) {
        return okText(`Moved window to (${x}, ${y}) and resized to ${width}×${height}.`);
      }
      return okText(`Moved window to (${x}, ${y}).`);
    }

    default: {
      // minimize, maximize, restore, close, focus, move_offscreen
      const done = await adapter.executor.manageWindow(action);
      if (!done) return notBound();
      switch (action) {
        case "minimize":
          return okText("Window minimized (ShowWindow SW_MINIMIZE).");
        case "maximize":
          return okText("Window maximized (ShowWindow SW_MAXIMIZE).");
        case "restore":
          return okText("Window restored (ShowWindow SW_RESTORE).");
        case "close":
          return okText("Window closed (SendMessage WM_CLOSE). The window binding has been released.");
        case "focus":
          return okText("Window brought to front (SetForegroundWindow).");
        case "move_offscreen":
          return okText("Window moved offscreen (-32000,-32000). Still usable via SendMessage/PrintWindow.");
        default:
          return okText(`Action "${action}" completed.`);
      }
    }
  }
}
|
||||
|
||||
async function handleSwitchDisplay(
|
||||
adapter: ComputerUseHostAdapter,
|
||||
args: Record<string, unknown>,
|
||||
@@ -3383,6 +3840,64 @@ async function dispatchAction(
|
||||
overrides: ComputerUseOverrides,
|
||||
subGates: CuSubGates,
|
||||
): Promise<CuCallToolResult> {
|
||||
// ── Bound-window auto-routing ──────────────────────────────────────
|
||||
// When a window is bound (Win32), route generic input tools to
|
||||
// virtual_mouse / virtual_keyboard automatically. The model doesn't
|
||||
// need to know which tools to use — binding handles it.
|
||||
const hasBoundWindow =
|
||||
(await adapter.executor.hasBoundWindow?.()) === true &&
|
||||
adapter.executor.virtualMouse &&
|
||||
adapter.executor.virtualKeyboard;
|
||||
if (hasBoundWindow) {
|
||||
const coord = Array.isArray(a.coordinate) ? a.coordinate as number[] : undefined;
|
||||
switch (name) {
|
||||
case "left_click":
|
||||
if (coord) return handleVirtualMouse(adapter, { action: "click", coordinate: coord });
|
||||
break;
|
||||
case "double_click":
|
||||
if (coord) return handleVirtualMouse(adapter, { action: "double_click", coordinate: coord });
|
||||
break;
|
||||
case "right_click":
|
||||
if (coord) return handleVirtualMouse(adapter, { action: "right_click", coordinate: coord });
|
||||
break;
|
||||
case "mouse_move":
|
||||
if (coord) return handleVirtualMouse(adapter, { action: "move", coordinate: coord });
|
||||
break;
|
||||
case "left_click_drag":
|
||||
if (coord) return handleVirtualMouse(adapter, {
|
||||
action: "drag", coordinate: coord,
|
||||
start_coordinate: Array.isArray(a.start_coordinate) ? a.start_coordinate : undefined,
|
||||
});
|
||||
break;
|
||||
case "left_mouse_down":
|
||||
if (coord) return handleVirtualMouse(adapter, { action: "down", coordinate: coord });
|
||||
break;
|
||||
case "left_mouse_up":
|
||||
if (coord) return handleVirtualMouse(adapter, { action: "up", coordinate: coord });
|
||||
break;
|
||||
case "type":
|
||||
if (typeof a.text === "string") return handleVirtualKeyboard(adapter, { action: "type", text: a.text });
|
||||
break;
|
||||
case "key":
|
||||
if (typeof a.text === "string") return handleVirtualKeyboard(adapter, { action: "combo", text: a.text, repeat: a.repeat });
|
||||
break;
|
||||
case "hold_key":
|
||||
if (typeof a.text === "string") return handleVirtualKeyboard(adapter, {
|
||||
action: "hold", text: a.text,
|
||||
duration: typeof a.duration === "number" ? a.duration : 1,
|
||||
});
|
||||
break;
|
||||
case "scroll":
|
||||
if (coord) return handleMouseWheel(adapter, {
|
||||
coordinate: coord,
|
||||
delta: a.scroll_direction === "up" ? (a.scroll_amount ?? 3) : -(a.scroll_amount ?? 3),
|
||||
direction: (a.scroll_direction === "left" || a.scroll_direction === "right") ? "horizontal" : "vertical",
|
||||
});
|
||||
break;
|
||||
// screenshot, zoom, wait, cursor_position — not rerouted, pass through
|
||||
}
|
||||
}
|
||||
// ── Standard dispatch (unbound or tools not rerouted above) ────────
|
||||
switch (name) {
|
||||
case "screenshot":
|
||||
return handleScreenshot(adapter, overrides, subGates);
|
||||
@@ -3434,6 +3949,39 @@ async function dispatchAction(
|
||||
case "open_application":
|
||||
return handleOpenApplication(adapter, a, overrides);
|
||||
|
||||
case "window_management":
|
||||
return handleWindowManagement(adapter, a);
|
||||
|
||||
case "click_element":
|
||||
return handleClickElement(adapter, a);
|
||||
|
||||
case "type_into_element":
|
||||
return handleTypeIntoElement(adapter, a);
|
||||
|
||||
case "open_terminal":
|
||||
return handleOpenTerminal(adapter, a);
|
||||
|
||||
case "bind_window":
|
||||
return handleBindWindow(adapter, a);
|
||||
|
||||
case "virtual_mouse":
|
||||
return handleVirtualMouse(adapter, a);
|
||||
|
||||
case "virtual_keyboard":
|
||||
return handleVirtualKeyboard(adapter, a);
|
||||
|
||||
case "status_indicator":
|
||||
return handleStatusIndicator(adapter, a);
|
||||
|
||||
case "mouse_wheel":
|
||||
return handleMouseWheel(adapter, a);
|
||||
|
||||
case "activate_window":
|
||||
return handleActivateWindow(adapter, a);
|
||||
|
||||
case "prompt_respond":
|
||||
return handlePromptRespond(adapter, a);
|
||||
|
||||
case "switch_display":
|
||||
return handleSwitchDisplay(adapter, a, overrides);
|
||||
|
||||
|
||||
@@ -118,7 +118,7 @@ const BATCH_ACTION_ITEM_SCHEMA = {
|
||||
export function buildComputerUseTools(
|
||||
caps: {
|
||||
screenshotFiltering: "native" | "none";
|
||||
platform: "darwin" | "win32";
|
||||
platform: "darwin" | "win32" | "linux";
|
||||
/** Include request_teach_access + teach_step. Read once at server construction. */
|
||||
teachMode?: boolean;
|
||||
},
|
||||
@@ -414,6 +414,353 @@ export function buildComputerUseTools(
|
||||
},
|
||||
},
|
||||
|
||||
// Window management — Win32 API targeted at bound HWND, no global shortcuts.
|
||||
// Only available on Windows when a window is bound via open_application.
|
||||
...(caps.platform === 'win32' ? [{
|
||||
name: "window_management",
|
||||
description:
|
||||
"Manage the bound application window via Win32 API calls (ShowWindow, SetWindowPos, SendMessage). " +
|
||||
"All operations target the bound HWND directly — NO global shortcuts (Win+Down, Alt+F4, etc.). " +
|
||||
"The window must have been opened via open_application first. " +
|
||||
"Actions: minimize (hide to taskbar), maximize (fill screen), restore (undo min/max), " +
|
||||
"close (graceful WM_CLOSE), focus (bring to front), move_offscreen (move to -32000,-32000 for background operation). " +
|
||||
"Use move_resize to reposition or resize the window to specific coordinates.",
|
||||
inputSchema: {
|
||||
type: "object" as const,
|
||||
properties: {
|
||||
action: {
|
||||
type: "string",
|
||||
enum: ["minimize", "maximize", "restore", "close", "focus", "move_offscreen", "move_resize", "get_rect"],
|
||||
description:
|
||||
"minimize: ShowWindow(SW_MINIMIZE). " +
|
||||
"maximize: ShowWindow(SW_MAXIMIZE). " +
|
||||
"restore: ShowWindow(SW_RESTORE) — undo minimize or maximize. " +
|
||||
"close: SendMessage(WM_CLOSE) — graceful close. " +
|
||||
"focus: SetForegroundWindow + BringWindowToTop. " +
|
||||
"move_offscreen: SetWindowPos(-32000,-32000) — keeps window usable by SendMessage/PrintWindow but invisible. " +
|
||||
"move_resize: SetWindowPos to specific x,y,width,height. " +
|
||||
"get_rect: GetWindowRect — returns current position and size.",
|
||||
},
|
||||
x: { type: "integer", description: "X position for move_resize." },
|
||||
y: { type: "integer", description: "Y position for move_resize." },
|
||||
width: { type: "integer", description: "Width for move_resize." },
|
||||
height: { type: "integer", description: "Height for move_resize." },
|
||||
},
|
||||
required: ["action"],
|
||||
},
|
||||
} as Tool,
|
||||
{
|
||||
name: "click_element",
|
||||
description:
|
||||
"Click a GUI element by its accessible name, role, or automationId — no pixel coordinates needed. " +
|
||||
"Uses Windows UI Automation to find the element and InvokePattern to click it. " +
|
||||
"Prefer this over left_click when the element name is visible in the accessibility snapshot. " +
|
||||
"Falls back to BoundingRect center-click if InvokePattern is not supported.",
|
||||
inputSchema: {
|
||||
type: "object" as const,
|
||||
properties: {
|
||||
name: {
|
||||
type: "string",
|
||||
description: "Accessible name of the element (e.g. \"Save\", \"File\", \"Search...\"). Case-insensitive partial match.",
|
||||
},
|
||||
role: {
|
||||
type: "string",
|
||||
description: "Control type (e.g. \"Button\", \"MenuItem\", \"Edit\", \"Link\"). Optional — narrows the search.",
|
||||
},
|
||||
automationId: {
|
||||
type: "string",
|
||||
description: "Exact automationId from the accessibility snapshot. Most precise selector.",
|
||||
},
|
||||
},
|
||||
required: [],
|
||||
},
|
||||
} as Tool,
|
||||
{
|
||||
name: "type_into_element",
|
||||
description:
|
||||
"Type text into a named GUI element using Windows UI Automation ValuePattern. " +
|
||||
"Finds the element by name/role/automationId, then sets its value directly — " +
|
||||
"no need to click first or use pixel coordinates. Works on Edit, ComboBox, and other value-holding controls.",
|
||||
inputSchema: {
|
||||
type: "object" as const,
|
||||
properties: {
|
||||
name: { type: "string", description: "Accessible name of the target element." },
|
||||
role: { type: "string", description: "Control type (optional, e.g. \"Edit\")." },
|
||||
automationId: { type: "string", description: "Exact automationId." },
|
||||
text: { type: "string", description: "Text to type/set into the element." },
|
||||
},
|
||||
required: ["text"],
|
||||
},
|
||||
} as Tool,
|
||||
{
|
||||
name: "open_terminal",
|
||||
description:
|
||||
"Open a new terminal window and launch an AI agent CLI. " +
|
||||
"This is a workflow tool that automates: open terminal → type startup command → press Enter → wait → verify. " +
|
||||
"Supported agents: claude (runs 'claude'), codex (runs 'codex'), gemini (runs 'gemini'), " +
|
||||
"or any custom command. After launching, the tool binds to the new terminal window " +
|
||||
"and takes a screenshot to verify the agent started successfully. " +
|
||||
"Use this when the user says: 'open Claude Code', 'start a Codex terminal', 'launch Gemini', etc.",
|
||||
inputSchema: {
|
||||
type: "object" as const,
|
||||
properties: {
|
||||
agent: {
|
||||
type: "string",
|
||||
enum: ["claude", "codex", "gemini", "custom"],
|
||||
description:
|
||||
"Which agent to launch. " +
|
||||
"claude: runs 'claude' command. " +
|
||||
"codex: runs 'codex' command. " +
|
||||
"gemini: runs 'gemini' command. " +
|
||||
"custom: runs the command specified in 'command' parameter.",
|
||||
},
|
||||
command: {
|
||||
type: "string",
|
||||
description: "Custom command to run in the terminal. Only used when agent='custom'. Example: 'python app.py'",
|
||||
},
|
||||
terminal: {
|
||||
type: "string",
|
||||
enum: ["wt", "powershell", "cmd"],
|
||||
description: "Which terminal to open. Default: 'wt' (Windows Terminal). 'powershell' for PowerShell window, 'cmd' for Command Prompt.",
|
||||
},
|
||||
working_directory: {
|
||||
type: "string",
|
||||
description: "Working directory for the terminal. If omitted, uses current directory.",
|
||||
},
|
||||
},
|
||||
required: ["agent"],
|
||||
},
|
||||
} as Tool,
|
||||
{
|
||||
name: "bind_window",
|
||||
description:
|
||||
"Bind to a specific window for all subsequent operations (screenshot, click, type, etc.). " +
|
||||
"Once bound, screenshots capture only that window via PrintWindow, and all input goes through SendMessageW — " +
|
||||
"no cursor movement, no focus steal, no interference with the user's desktop. " +
|
||||
"Actions: bind (by title, hwnd, or pid), unbind (release binding), status (show current binding), list (show all visible windows). " +
|
||||
"Use 'list' first to see available windows, then 'bind' with a title or hwnd. " +
|
||||
"open_application auto-binds the launched window, but use this tool to bind to already-running windows or switch between windows.",
|
||||
inputSchema: {
|
||||
type: "object" as const,
|
||||
properties: {
|
||||
action: {
|
||||
type: "string",
|
||||
enum: ["bind", "unbind", "status", "list"],
|
||||
description:
|
||||
"bind: Bind to a window (specify title, hwnd, or pid). " +
|
||||
"unbind: Release the current binding, return to full-screen mode. " +
|
||||
"status: Show the currently bound window (hwnd, title, rect). " +
|
||||
"list: List all visible windows with hwnd, pid, and title.",
|
||||
},
|
||||
title: {
|
||||
type: "string",
|
||||
description: "Window title to search for (partial match, case-insensitive). For 'bind' action.",
|
||||
},
|
||||
hwnd: {
|
||||
type: "string",
|
||||
description: "Exact window handle from 'list' output. For 'bind' action.",
|
||||
},
|
||||
pid: {
|
||||
type: "integer",
|
||||
description: "Process ID to find window for. For 'bind' action.",
|
||||
},
|
||||
},
|
||||
required: ["action"],
|
||||
},
|
||||
} as Tool,
|
||||
{
|
||||
name: "activate_window",
|
||||
description:
|
||||
"Activate the bound window: bring it to foreground, click to ensure keyboard focus, " +
|
||||
"and optionally send an initial key sequence. Use this before any input operations to guarantee " +
|
||||
"the window is ready to receive keyboard/mouse events. " +
|
||||
"Combines SetForegroundWindow + BringWindowToTop + SendMessage(WM_LBUTTONDOWN) in one call.",
|
||||
inputSchema: {
|
||||
type: "object" as const,
|
||||
properties: {
|
||||
click_x: { type: "integer", description: "X coordinate to click after activation (client-area). If omitted, clicks center of window." },
|
||||
click_y: { type: "integer", description: "Y coordinate to click after activation (client-area). If omitted, clicks center of window." },
|
||||
},
|
||||
required: [],
|
||||
},
|
||||
} as Tool,
|
||||
{
|
||||
name: "prompt_respond",
|
||||
description:
|
||||
"Handle interactive CLI/terminal prompts (Yes/No, selection menus, confirmations). " +
|
||||
"Sends a sequence of key events to the bound window to navigate and confirm a prompt. " +
|
||||
"This is a convenience wrapper around bound-window keyboard input for common prompt flows. " +
|
||||
"Typical flows: " +
|
||||
"1) Yes/No prompt → send 'y' or 'n' + Enter. " +
|
||||
"2) Arrow-key selection menu → send arrow_down/arrow_up N times + Enter. " +
|
||||
"3) Text input prompt → type the response + Enter. " +
|
||||
"After responding, take a screenshot to verify the result.",
|
||||
inputSchema: {
|
||||
type: "object" as const,
|
||||
properties: {
|
||||
response_type: {
|
||||
type: "string",
|
||||
enum: ["yes", "no", "enter", "escape", "select", "type"],
|
||||
description:
|
||||
"yes: send 'y' + Enter. " +
|
||||
"no: send 'n' + Enter. " +
|
||||
"enter: send Enter only. " +
|
||||
"escape: send Escape (cancel). " +
|
||||
"select: use arrow keys to navigate to an option, then Enter. Requires 'arrow_count'. " +
|
||||
"type: type custom text then Enter. Requires 'text'.",
|
||||
},
|
||||
arrow_direction: {
|
||||
type: "string",
|
||||
enum: ["up", "down"],
|
||||
description: "Arrow key direction for 'select' type. Default: 'down'.",
|
||||
},
|
||||
arrow_count: {
|
||||
type: "integer",
|
||||
description: "Number of arrow key presses for 'select' type. Default: 1.",
|
||||
minimum: 0,
|
||||
maximum: 50,
|
||||
},
|
||||
text: {
|
||||
type: "string",
|
||||
description: "Text to type for 'type' response_type.",
|
||||
},
|
||||
},
|
||||
required: ["response_type"],
|
||||
},
|
||||
} as Tool,
|
||||
{
|
||||
name: "status_indicator",
|
||||
description:
|
||||
"Control the visual status indicator overlay on the bound window. " +
|
||||
"The indicator is a small floating label at the bottom of the window that shows what Computer Use is doing. " +
|
||||
"It auto-shows during click/type/key/scroll operations, but you can also send custom messages. " +
|
||||
"Actions: show (display a custom message), hide (dismiss), status (check if active).",
|
||||
inputSchema: {
|
||||
type: "object" as const,
|
||||
properties: {
|
||||
action: {
|
||||
type: "string",
|
||||
enum: ["show", "hide", "status"],
|
||||
description: "show: display a custom message on the indicator. hide: dismiss the indicator. status: check if indicator is active.",
|
||||
},
|
||||
message: {
|
||||
type: "string",
|
||||
description: "Custom message to display (for 'show' action). Supports emoji. Auto-fades after 2 seconds.",
|
||||
},
|
||||
},
|
||||
required: ["action"],
|
||||
},
|
||||
} as Tool,
|
||||
{
|
||||
name: "virtual_keyboard",
|
||||
description:
|
||||
"Send keyboard input directly to the bound window via SendMessageW — independent of the physical keyboard. " +
|
||||
"The user can keep typing on their own keyboard without interference. " +
|
||||
"Supports: single keys, key combinations (Ctrl+S, Alt+F4), text input, and hold-key operations. " +
|
||||
"All input targets the bound HWND only — no global keyboard events.",
|
||||
inputSchema: {
|
||||
type: "object" as const,
|
||||
properties: {
|
||||
action: {
|
||||
type: "string",
|
||||
enum: ["type", "combo", "press", "release", "hold"],
|
||||
description:
|
||||
"type: Send text string via WM_CHAR (Unicode, supports Chinese/emoji). " +
|
||||
"combo: Send a key combination like ctrl+s, alt+f4, ctrl+shift+a (press all, release in reverse). " +
|
||||
"press: Press a key down and hold it (pair with 'release'). " +
|
||||
"release: Release a previously pressed key. " +
|
||||
"hold: Press key(s) for a duration then release.",
|
||||
},
|
||||
text: {
|
||||
type: "string",
|
||||
description: "For 'type': the text to input. For 'combo': key combination string (e.g. 'ctrl+s', 'alt+tab', 'ctrl+shift+a'). For 'press'/'release': single key name (e.g. 'shift', 'ctrl', 'a').",
|
||||
},
|
||||
duration: {
|
||||
type: "number",
|
||||
description: "For 'hold': seconds to hold the key(s) before releasing. Default: 1.",
|
||||
},
|
||||
repeat: {
|
||||
type: "integer",
|
||||
description: "Number of times to repeat the action. Default: 1.",
|
||||
minimum: 1,
|
||||
maximum: 100,
|
||||
},
|
||||
},
|
||||
required: ["action", "text"],
|
||||
},
|
||||
} as Tool,
|
||||
{
|
||||
name: "virtual_mouse",
|
||||
description:
|
||||
"Control a virtual mouse on the bound window via SendMessageW — independent of the physical mouse. " +
|
||||
"The user's real cursor stays free. All operations target the bound HWND only.",
|
||||
inputSchema: {
|
||||
type: "object" as const,
|
||||
properties: {
|
||||
action: {
|
||||
type: "string",
|
||||
enum: ["click", "double_click", "right_click", "move", "drag", "down", "up"],
|
||||
description:
|
||||
"click: left-click at coordinate. " +
|
||||
"double_click: double left-click. " +
|
||||
"right_click: right-click. " +
|
||||
"move: move virtual cursor (visual only, no click). " +
|
||||
"drag: press at start, move to end, release. Requires coordinate (end) and start_coordinate. " +
|
||||
"down: press left button at coordinate (hold). " +
|
||||
"up: release left button at coordinate.",
|
||||
},
|
||||
coordinate: {
|
||||
type: "array",
|
||||
items: { type: "number" },
|
||||
minItems: 2,
|
||||
maxItems: 2,
|
||||
description: "(x, y) client-area coordinate on the bound window.",
|
||||
},
|
||||
start_coordinate: {
|
||||
type: "array",
|
||||
items: { type: "number" },
|
||||
minItems: 2,
|
||||
maxItems: 2,
|
||||
description: "(x, y) start point for drag. If omitted, drags from current virtual cursor position.",
|
||||
},
|
||||
},
|
||||
required: ["action", "coordinate"],
|
||||
},
|
||||
} as Tool,
|
||||
{
|
||||
name: "mouse_wheel",
|
||||
description:
|
||||
"Scroll inside the bound window using mouse wheel (WM_MOUSEWHEEL / WM_MOUSEHWHEEL). " +
|
||||
"Unlike the generic 'scroll' tool which uses WM_VSCROLL (only works on scrollbar controls), " +
|
||||
"mouse_wheel simulates the physical mouse wheel and works on Excel spreadsheets, web pages, " +
|
||||
"code editors, PDF viewers, and any modern UI. " +
|
||||
"Specify the click point within the window where the scroll should occur — " +
|
||||
"this determines which panel/pane/element receives the scroll.",
|
||||
inputSchema: {
|
||||
type: "object" as const,
|
||||
properties: {
|
||||
coordinate: {
|
||||
type: "array",
|
||||
items: { type: "number" },
|
||||
minItems: 2,
|
||||
maxItems: 2,
|
||||
description: "(x, y) client-area coordinate where the scroll should occur. Determines which element receives the scroll.",
|
||||
},
|
||||
delta: {
|
||||
type: "integer",
|
||||
description: "Scroll amount in 'clicks'. Positive = scroll up, negative = scroll down. Each click = 3 lines typically. Use -3 to -5 for page-like scrolling.",
|
||||
},
|
||||
direction: {
|
||||
type: "string",
|
||||
enum: ["vertical", "horizontal"],
|
||||
description: "Scroll direction. Default: 'vertical'. Use 'horizontal' for side-scrolling (e.g. wide Excel sheets, timeline views).",
|
||||
},
|
||||
},
|
||||
required: ["coordinate", "delta"],
|
||||
},
|
||||
} as Tool,
|
||||
] : []),
|
||||
|
||||
{
|
||||
name: "switch_display",
|
||||
description:
|
||||
|
||||
@@ -159,28 +159,23 @@ export const apps: AppsAPI = {
|
||||
|
||||
async listInstalled() {
|
||||
try {
|
||||
// Use Spotlight (mdfind) to enumerate .app bundles and mdls to get real bundle IDs.
|
||||
// Searches /Applications, /System/Applications, /System/Applications/Utilities, and
|
||||
// /System/Library/CoreServices so that system apps (Terminal, Chess, etc.) and core services (Finder) are found.
|
||||
const proc = Bun.spawn([
|
||||
'bash', '-c',
|
||||
`for dir in /Applications /System/Applications /System/Applications/Utilities /System/Library/CoreServices; do
|
||||
mdfind 'kMDItemContentType == "com.apple.application-bundle"' -onlyin "$dir" 2>/dev/null
|
||||
done | sort -u | while read -r appPath; do
|
||||
bundleId=$(mdls -raw -name kMDItemCFBundleIdentifier "$appPath" 2>/dev/null)
|
||||
if [ -n "$bundleId" ] && [ "$bundleId" != "(null)" ]; then
|
||||
displayName=$(basename "$appPath" .app)
|
||||
echo "$bundleId|$displayName|$appPath"
|
||||
fi
|
||||
done`,
|
||||
], { stdout: 'pipe', stderr: 'pipe' })
|
||||
const text = await new Response(proc.stdout).text()
|
||||
await proc.exited
|
||||
return text.split('\n').filter(Boolean).map(line => {
|
||||
const [bundleId, displayName, path] = line.split('|', 3)
|
||||
const result = await osascript(`
|
||||
tell application "System Events"
|
||||
set appList to ""
|
||||
repeat with appFile in (every file of folder "Applications" of startup disk whose name ends with ".app")
|
||||
set appPath to POSIX path of (appFile as alias)
|
||||
set appName to name of appFile
|
||||
set appList to appList & appPath & "|" & appName & "\\n"
|
||||
end repeat
|
||||
return appList
|
||||
end tell
|
||||
`)
|
||||
return result.split('\n').filter(Boolean).map(line => {
|
||||
const [path, name] = line.split('|', 2)
|
||||
const displayName = (name ?? '').replace(/\.app$/, '')
|
||||
return {
|
||||
bundleId: bundleId ?? '',
|
||||
displayName: displayName ?? '',
|
||||
bundleId: `com.app.${displayName.toLowerCase().replace(/\s+/g, '-')}`,
|
||||
displayName,
|
||||
path: path ?? '',
|
||||
}
|
||||
})
|
||||
|
||||
@@ -1,14 +1,10 @@
|
||||
/**
|
||||
* @ant/computer-use-swift — cross-platform display, apps, and screenshot API
|
||||
* @ant/computer-use-swift — macOS display, apps, and screenshot (Swift native)
|
||||
*
|
||||
* Platform backends:
|
||||
* - darwin: AppleScript/JXA + screencapture
|
||||
* - win32: PowerShell + System.Drawing + Win32 P/Invoke
|
||||
*
|
||||
* Add new platforms by creating backends/<platform>.ts implementing SwiftBackend.
|
||||
* This package wraps the macOS-only Swift .node native module.
|
||||
* For Windows/Linux, use src/utils/computerUse/platforms/ instead.
|
||||
*/
|
||||
|
||||
// Re-export all types
|
||||
export type {
|
||||
DisplayGeometry,
|
||||
PrepareDisplayResult,
|
||||
@@ -18,72 +14,42 @@ export type {
|
||||
ScreenshotResult,
|
||||
ResolvePrepareCaptureResult,
|
||||
WindowDisplayInfo,
|
||||
DisplayAPI,
|
||||
AppsAPI,
|
||||
ScreenshotAPI,
|
||||
SwiftBackend,
|
||||
} from './types.js'
|
||||
} from './backends/darwin.js'
|
||||
|
||||
import type { ResolvePrepareCaptureResult, SwiftBackend } from './types.js'
|
||||
import type { ResolvePrepareCaptureResult } from './backends/darwin.js'
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Platform dispatch
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function loadBackend(): SwiftBackend | null {
|
||||
function loadDarwin() {
|
||||
if (process.platform !== 'darwin') return null
|
||||
try {
|
||||
switch (process.platform) {
|
||||
case 'darwin':
|
||||
return require('./backends/darwin.js') as SwiftBackend
|
||||
case 'win32':
|
||||
return require('./backends/win32.js') as SwiftBackend
|
||||
case 'linux':
|
||||
return require('./backends/linux.js') as SwiftBackend
|
||||
default:
|
||||
return null
|
||||
}
|
||||
return require('./backends/darwin.js')
|
||||
} catch {
|
||||
return null
|
||||
}
|
||||
}
|
||||
|
||||
const backend = loadBackend()
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// ComputerUseAPI — Main export (preserves original class interface)
|
||||
// ---------------------------------------------------------------------------
|
||||
const darwin = loadDarwin()
|
||||
|
||||
export class ComputerUseAPI {
|
||||
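// When no backend is loaded (unsupported platform), every API falls back to a stub: queries return empty values, hotkey calls are no-ops, and operations that cannot be faked (apps.open, display, screenshot) throw.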
|
||||
// These stubs should never be reached in practice — callers check isSupported
|
||||
// or the feature gate before invoking.
|
||||
|
||||
apps = backend?.apps ?? {
|
||||
apps = darwin?.apps ?? {
|
||||
async prepareDisplay() { return { activated: '', hidden: [] } },
|
||||
async previewHideSet() { return [] },
|
||||
async findWindowDisplays(ids: string[]) { return ids.map(b => ({ bundleId: b, displayIds: [] as number[] })) },
|
||||
async findWindowDisplays(ids: string[]) { return ids.map((b: string) => ({ bundleId: b, displayIds: [] as number[] })) },
|
||||
async appUnderPoint() { return null },
|
||||
async listInstalled() { return [] },
|
||||
iconDataUrl() { return null },
|
||||
listRunning() { return [] },
|
||||
async open() { throw new Error('computer-use-swift: no backend for this platform') },
|
||||
async open() { throw new Error('@ant/computer-use-swift: macOS only') },
|
||||
async unhide() {},
|
||||
}
|
||||
|
||||
display = backend?.display ?? {
|
||||
getSize() { throw new Error('computer-use-swift: no backend for this platform') },
|
||||
listAll() { throw new Error('computer-use-swift: no backend for this platform') },
|
||||
display = darwin?.display ?? {
|
||||
getSize() { throw new Error('@ant/computer-use-swift: macOS only') },
|
||||
listAll() { throw new Error('@ant/computer-use-swift: macOS only') },
|
||||
}
|
||||
|
||||
screenshot = backend?.screenshot ?? {
|
||||
async captureExcluding() { throw new Error('computer-use-swift: no backend for this platform') },
|
||||
async captureRegion() { throw new Error('computer-use-swift: no backend for this platform') },
|
||||
}
|
||||
|
||||
hotkey = (backend as any)?.hotkey ?? {
|
||||
registerEscape(_cb: () => void): boolean { return false },
|
||||
unregister() {},
|
||||
notifyExpectedEscape() {},
|
||||
screenshot = darwin?.screenshot ?? {
|
||||
async captureExcluding() { throw new Error('@ant/computer-use-swift: macOS only') },
|
||||
async captureRegion() { throw new Error('@ant/computer-use-swift: macOS only') },
|
||||
}
|
||||
|
||||
async resolvePrepareCapture(
|
||||
@@ -93,8 +59,6 @@ export class ComputerUseAPI {
|
||||
targetW: number,
|
||||
targetH: number,
|
||||
displayId?: number,
|
||||
_autoResolve?: boolean,
|
||||
_doHide?: boolean,
|
||||
): Promise<ResolvePrepareCaptureResult> {
|
||||
return this.screenshot.captureExcluding(allowedBundleIds, quality, targetW, targetH, displayId)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user