feat: Computer Use — Windows 跨平台支持 + GUI 无障碍增强 + Python Bridge

三平台 Computer Use (macOS + Windows + Linux),Windows 专项增强。

- MCP server: toolCalls/tools/executor/mcpServer 等 12 文件完整实现
- 平台抽象层: platforms/{win32,darwin,linux}.ts
- 跨平台 executor: executorCrossPlatform.ts
- CHICAGO_MCP + VOICE_MODE feature flags 启用

- windowMessage.ts: SendMessageW (WM_CHAR Unicode + 剪贴板粘贴)
- windowBorder.ts: 4 叠加窗口边框 (30fps 跟踪)
- uiAutomation.ts: UI Automation 元素树/点击/写值
- accessibilitySnapshot.ts: 无障碍快照 → 模型感知 GUI
- bridge.py + bridgeClient.ts: Python 长驻进程 (替代 per-call PS)

- window_management: min/max/restore/close/focus (Win32 API)
- click_element / type_into_element: 按名称操作 (无需坐标)
- 截图自动附带 Accessibility Snapshot

- 17 种方法, stdin/stdout JSON 通信
- 窗口枚举 1.5ms vs PS 500ms, 截图 360ms vs PS 800ms
- 依赖: mss + Pillow + pywinauto
This commit is contained in:
unraid
2026-04-05 15:27:50 +08:00
parent 7a2ade0a02
commit c17edcb12e
36 changed files with 8297 additions and 351 deletions

14
.gitignore vendored
View File

@@ -13,4 +13,16 @@ src/utils/vendor/
# AI tool runtime directories
.agents/
.codex/
.omx/
.omx/
# Binary / screenshot files (root only)
/*.png
*.bmp
# Agent / tool state dirs
.swarm/
.agents/__pycache__/
# Python bytecode
__pycache__/
*.pyc

View File

@@ -39,6 +39,7 @@
## Computer Use Windows 增强:窗口绑定截图 + UI Automation + OCR (2026-04-03)
在三平台基础实现之上,利用 Windows 原生 API 增强 Computer Use 的 Windows 专属能力。
**新增文件:**
@@ -118,23 +119,6 @@ packages/@ant/computer-use-{input,swift}/src/
| `vendor/audio-capture/{platform}/audio-capture.node` | 6 个平台的原生音频二进制(cpal,来自参考项目) |
| `vendor/audio-capture-src/index.ts` | 原生模块加载器(按 `${arch}-${platform}` 动态 require `.node`) |
**修改文件:**
| 文件 | 变更 |
|------|------|
| `packages/audio-capture-napi/src/index.ts` | SoX 子进程 stub → 原生 `.node` 加载器(含 `process.cwd()` workspace 路径 fallback) |
| `scripts/dev.ts` | `DEFAULT_FEATURES` 加 `"VOICE_MODE"` |
| `build.ts` | `DEFAULT_BUILD_FEATURES` 加 `"VOICE_MODE"` |
| `docs/features/voice-mode.md` | 追加恢复计划章节(第八节) |
**验证结果:**
- `isNativeAudioAvailable()` → `true`(Windows x64 原生 `.node` 加载成功)
- `feature('VOICE_MODE')` → `ENABLED`
- `bun run build` → voice 代码编入产物
**运行时前置条件:** claude.ai OAuth 登录 + 麦克风权限
---
## Enable Claude in Chrome MCP (2026-04-03)

View File

@@ -2,11 +2,11 @@ import { readdir, readFile, writeFile, cp } from "fs/promises";
import { join } from "path";
import { getMacroDefines } from "./scripts/defines.ts";
const outdir = "dist";
const outdir = 'dist'
// Step 1: Clean output directory
const { rmSync } = await import("fs");
rmSync(outdir, { recursive: true, force: true });
const { rmSync } = await import('fs')
rmSync(outdir, { recursive: true, force: true })
// Default features that match the official CLI build.
// Additional features can be enabled via FEATURE_<NAME>=1 env vars.
@@ -14,50 +14,50 @@ const DEFAULT_BUILD_FEATURES = ["AGENT_TRIGGERS_REMOTE", "CHICAGO_MCP", "VOICE_M
// Collect FEATURE_* env vars → Bun.build features
const envFeatures = Object.keys(process.env)
.filter(k => k.startsWith("FEATURE_"))
.map(k => k.replace("FEATURE_", ""));
const features = [...new Set([...DEFAULT_BUILD_FEATURES, ...envFeatures])];
.filter(k => k.startsWith('FEATURE_'))
.map(k => k.replace('FEATURE_', ''))
const features = [...new Set([...DEFAULT_BUILD_FEATURES, ...envFeatures])]
// Step 2: Bundle with splitting
const result = await Bun.build({
entrypoints: ["src/entrypoints/cli.tsx"],
outdir,
target: "bun",
splitting: true,
define: getMacroDefines(),
features,
});
entrypoints: ['src/entrypoints/cli.tsx'],
outdir,
target: 'bun',
splitting: true,
define: getMacroDefines(),
features,
})
if (!result.success) {
console.error("Build failed:");
for (const log of result.logs) {
console.error(log);
}
process.exit(1);
console.error('Build failed:')
for (const log of result.logs) {
console.error(log)
}
process.exit(1)
}
// Step 3: Post-process — replace Bun-only `import.meta.require` with Node.js compatible version
const files = await readdir(outdir);
const IMPORT_META_REQUIRE = "var __require = import.meta.require;";
const COMPAT_REQUIRE = `var __require = typeof import.meta.require === "function" ? import.meta.require : (await import("module")).createRequire(import.meta.url);`;
const files = await readdir(outdir)
const IMPORT_META_REQUIRE = 'var __require = import.meta.require;'
const COMPAT_REQUIRE = `var __require = typeof import.meta.require === "function" ? import.meta.require : (await import("module")).createRequire(import.meta.url);`
let patched = 0;
let patched = 0
for (const file of files) {
if (!file.endsWith(".js")) continue;
const filePath = join(outdir, file);
const content = await readFile(filePath, "utf-8");
if (content.includes(IMPORT_META_REQUIRE)) {
await writeFile(
filePath,
content.replace(IMPORT_META_REQUIRE, COMPAT_REQUIRE),
);
patched++;
}
if (!file.endsWith('.js')) continue
const filePath = join(outdir, file)
const content = await readFile(filePath, 'utf-8')
if (content.includes(IMPORT_META_REQUIRE)) {
await writeFile(
filePath,
content.replace(IMPORT_META_REQUIRE, COMPAT_REQUIRE),
)
patched++
}
}
console.log(
`Bundled ${result.outputs.length} files to ${outdir}/ (patched ${patched} for Node.js compat)`,
);
`Bundled ${result.outputs.length} files to ${outdir}/ (patched ${patched} for Node.js compat)`,
)
// Step 4: Copy native .node addon files (audio-capture)
const vendorDir = join(outdir, "vendor", "audio-capture");
@@ -66,16 +66,16 @@ console.log(`Copied vendor/audio-capture/ → ${vendorDir}/`);
// Step 5: Bundle download-ripgrep script as standalone JS for postinstall
const rgScript = await Bun.build({
entrypoints: ["scripts/download-ripgrep.ts"],
outdir,
target: "node",
});
entrypoints: ['scripts/download-ripgrep.ts'],
outdir,
target: 'node',
})
if (!rgScript.success) {
console.error("Failed to bundle download-ripgrep script:");
for (const log of rgScript.logs) {
console.error(log);
}
// Non-fatal — postinstall fallback to bun run scripts/download-ripgrep.ts
console.error('Failed to bundle download-ripgrep script:')
for (const log of rgScript.logs) {
console.error(log)
}
// Non-fatal — postinstall fallback to bun run scripts/download-ripgrep.ts
} else {
console.log(`Bundled download-ripgrep script to ${outdir}/`);
console.log(`Bundled download-ripgrep script to ${outdir}/`)
}

View File

@@ -0,0 +1,325 @@
# Computer Use 架构修正方案 v2
更新时间:2026-04-04
## 1. 当前架构的问题
### 问题 A:平台代码混在错误的包里
`@ant/computer-use-swift` 是 macOS Swift 原生模块的包装器,但我们把 Windows(`backends/win32.ts`)和 Linux(`backends/linux.ts`)的截图/应用管理代码塞进了这个包。"swift" 在名字里就意味着 macOS,后期维护者无法区分。
`@ant/computer-use-input` 同样——原本是 macOS enigo Rust 模块,我们也往里面塞了 win32/linux 后端。
### 问题 B:输入方式不对
当前 Windows 后端(`packages/@ant/computer-use-input/src/backends/win32.ts`)使用 `SetCursorPos` + `SendInput` + `keybd_event`——这是**全局输入**:
- 鼠标真的会移动到屏幕上
- 键盘真的打到当前前台窗口
- **会影响用户当前的操作**
绑定窗口句柄后,应该用 `SendMessage`/`PostMessage` 向目标 HWND 发送消息:
- `WM_CHAR` — 发送字符,不移动光标
- `WM_KEYDOWN`/`WM_KEYUP` — 发送按键
- `WM_LBUTTONDOWN`/`WM_LBUTTONUP` — 发送鼠标点击(窗口客户区相对坐标)
- `PrintWindow` — 截取窗口内容,不需要窗口在前台
- **不抢焦点、不影响用户当前操作**
已验证:向记事本 `SendMessage(WM_CHAR)` 成功写入文字,记事本在后台,终端保持前台。
### 问题 C:截图是公共能力,不属于 swift
截图(screenshot)、显示器枚举(display)、应用管理(apps)是所有平台都需要的公共能力,不应该放在 `@ant/computer-use-swift`(macOS 专属包名)里。
## 2. 修正后的架构
### 2.1 分层原则
```
packages/@ant/ ← macOS 原生模块包装器(不放其他平台代码)
├── computer-use-input/ ← macOS: enigo .node 键鼠(仅 darwin)
├── computer-use-swift/ ← macOS: Swift .node 截图/应用(仅 darwin)
└── computer-use-mcp/ ← 跨平台: MCP server + 工具定义(不改)
src/utils/computerUse/
├── platforms/ ← 新增: 跨平台抽象层
│ ├── types.ts ← 公共接口: InputPlatform, ScreenshotPlatform, AppsPlatform, DisplayPlatform
│ ├── index.ts ← 平台分发器: 按 process.platform 加载后端
│ ├── darwin.ts ← macOS: 委托给 @ant/computer-use-{input,swift}
│ ├── win32.ts ← Windows: SendMessage 输入 + PrintWindow 截图 + EnumWindows + UIA + OCR
│ └── linux.ts ← Linux: xdotool + scrot + xrandr + wmctrl
├── win32/ ← Windows 专属增强能力(不在公共接口中)
│ ├── windowCapture.ts ← PrintWindow 窗口绑定截图
│ ├── windowEnum.ts ← EnumWindows 窗口枚举
│ ├── windowMessage.ts ← SendMessage/PostMessage 无焦点输入(新增)
│ ├── uiAutomation.ts ← IUIAutomation UI 元素操作
│ └── ocr.ts ← Windows.Media.Ocr 文字识别
├── executor.ts ← 改: 通过 platforms/ 获取平台实现,不直接调 @ant 包
├── swiftLoader.ts ← 改: 仅 darwin 使用
├── inputLoader.ts ← 改: 仅 darwin 使用
└── ...其他文件不动
```
### 2.2 公共接口(`platforms/types.ts`)
```typescript
/** 窗口标识 — 跨平台 */
export interface WindowHandle {
id: string // macOS: bundleId, Windows: HWND string, Linux: window ID
pid: number
title: string
exePath?: string // Windows/Linux: 进程路径
}
/** 输入平台接口 — 两种模式 */
export interface InputPlatform {
// 模式 A: 全局输入(macOS/Linux 默认,向前台窗口发送)
moveMouse(x: number, y: number): Promise<void>
click(x: number, y: number, button: 'left' | 'right' | 'middle'): Promise<void>
typeText(text: string): Promise<void>
key(name: string, action: 'press' | 'release'): Promise<void>
keys(combo: string[]): Promise<void>
scroll(amount: number, direction: 'vertical' | 'horizontal'): Promise<void>
mouseLocation(): Promise<{ x: number; y: number }>
// 模式 B: 窗口绑定输入(Windows SendMessage,不抢焦点)
sendChar?(hwnd: string, char: string): Promise<void>
sendKey?(hwnd: string, vk: number, action: 'down' | 'up'): Promise<void>
sendClick?(hwnd: string, x: number, y: number, button: 'left' | 'right'): Promise<void>
sendText?(hwnd: string, text: string): Promise<void>
}
/** 截图平台接口 */
export interface ScreenshotPlatform {
// 全屏截图
captureScreen(displayId?: number): Promise<ScreenshotResult>
// 区域截图
captureRegion(x: number, y: number, w: number, h: number): Promise<ScreenshotResult>
// 窗口截图(Windows: PrintWindow,macOS: SCContentFilter,Linux: xdotool+import)
captureWindow?(hwnd: string): Promise<ScreenshotResult | null>
}
/** 显示器平台接口 */
export interface DisplayPlatform {
listAll(): DisplayInfo[]
getSize(displayId?: number): DisplayInfo
}
/** 应用管理平台接口 */
export interface AppsPlatform {
listRunning(): WindowHandle[]
listInstalled(): Promise<InstalledApp[]>
open(name: string): Promise<void>
getFrontmostApp(): FrontmostAppInfo | null
findWindowByTitle(title: string): WindowHandle | null
}
export interface ScreenshotResult {
base64: string
width: number
height: number
}
export interface DisplayInfo {
width: number
height: number
scaleFactor: number
displayId: number
}
export interface InstalledApp {
id: string // macOS: bundleId, Windows: exe path, Linux: .desktop name
displayName: string
path: string
}
export interface FrontmostAppInfo {
id: string
appName: string
}
```
### 2.3 平台分发器(`platforms/index.ts`)
```typescript
import type { InputPlatform, ScreenshotPlatform, DisplayPlatform, AppsPlatform } from './types.js'
export interface Platform {
input: InputPlatform
screenshot: ScreenshotPlatform
display: DisplayPlatform
apps: AppsPlatform
}
export function loadPlatform(): Platform {
switch (process.platform) {
case 'darwin':
return require('./darwin.js').platform
case 'win32':
return require('./win32.js').platform
case 'linux':
return require('./linux.js').platform
default:
throw new Error(`Computer Use not supported on ${process.platform}`)
}
}
```
### 2.4 各平台实现
**`platforms/darwin.ts`** — 委托给 @ant 包(保持兼容):
```typescript
// macOS: 通过 @ant/computer-use-input 和 @ant/computer-use-swift
// 这两个包的 darwin 后端保留不动
import { requireComputerUseInput } from '../inputLoader.js'
import { requireComputerUseSwift } from '../swiftLoader.js'
export const platform = {
input: { /* 委托给 requireComputerUseInput() */ },
screenshot: { /* 委托给 requireComputerUseSwift().screenshot */ },
display: { /* 委托给 requireComputerUseSwift().display */ },
apps: { /* 委托给 requireComputerUseSwift().apps */ },
}
```
**`platforms/win32.ts`** — 使用 `src/utils/computerUse/win32/` 模块:
```typescript
// Windows: SendMessage 输入 + PrintWindow 截图 + EnumWindows 应用
import { sendChar, sendKey, sendClick, sendText } from '../win32/windowMessage.js'
import { captureWindow } from '../win32/windowCapture.js'
import { listWindows } from '../win32/windowEnum.js'
// ... PowerShell P/Invoke 全局输入作为 fallback
export const platform = {
input: {
// 全局模式: PowerShell SetCursorPos/SendInput(fallback)
// 窗口模式: SendMessage(首选)
sendChar, sendKey, sendClick, sendText, // 窗口绑定
moveMouse, click, typeText, ... // 全局 fallback
},
screenshot: {
captureScreen, // CopyFromScreen
captureRegion, // CopyFromScreen(rect)
captureWindow, // PrintWindow(不抢焦点)
},
display: { /* Screen.AllScreens */ },
apps: { /* EnumWindows */ },
}
```
**`platforms/linux.ts`** — 使用 xdotool/scrot
```typescript
// Linux: xdotool + scrot + xrandr + wmctrl
export const platform = {
input: { /* xdotool mousemove/click/key/type */ },
screenshot: { /* scrot */ },
display: { /* xrandr */ },
apps: { /* wmctrl + ps */ },
}
```
### 2.5 executor.ts 改造
```typescript
// 之前: 直接调 requireComputerUseSwift() 和 requireComputerUseInput()
// 之后: 通过 platforms/ 统一获取
import { loadPlatform } from './platforms/index.js'
const platform = loadPlatform()
// 截图
platform.screenshot.captureScreen()
platform.screenshot.captureWindow(hwnd) // 窗口绑定
// 输入(窗口绑定模式,不抢焦点)
platform.input.sendText?.(hwnd, 'Hello')
platform.input.sendClick?.(hwnd, 100, 200, 'left')
// 输入(全局模式,fallback)
platform.input.moveMouse(500, 500)
platform.input.click(500, 500, 'left')
```
## 3. Windows 输入模式对比
| 方式 | API | 抢焦点 | 移鼠标 | 窗口可最小化 | 适用场景 |
|------|-----|--------|--------|-------------|---------|
| **全局输入** | `SetCursorPos` + `SendInput` | ✅ 抢 | ✅ 动 | ❌ 不行 | 需要坐标点击(fallback) |
| **窗口消息** | `SendMessage(WM_CHAR/WM_KEYDOWN)` | ❌ 不抢 | ❌ 不动 | ✅ 可以 | 打字、按键(首选) |
| **窗口消息** | `SendMessage(WM_LBUTTONDOWN)` | ❌ 不抢 | ❌ 不动 | ⚠️ 部分 | 窗口内点击 |
| **窗口截图** | `PrintWindow(hwnd, PW_RENDERFULLCONTENT)` | ❌ 不抢 | ❌ 不动 | ✅ 可以 | 窗口截图 |
| **UI 操作** | `UIAutomation InvokePattern` | ❌ 不抢 | ❌ 不动 | ✅ 可以 | 按钮点击、文本写入 |
**策略**:优先用窗口消息 + UIAutomation(不干扰用户),全局输入作为 fallback。
## 4. 需要新增的文件
| 文件 | 说明 |
|------|------|
| `src/utils/computerUse/platforms/types.ts` | 公共接口定义 |
| `src/utils/computerUse/platforms/index.ts` | 平台分发器 |
| `src/utils/computerUse/platforms/darwin.ts` | macOS: 委托给 @ant 包 |
| `src/utils/computerUse/platforms/win32.ts` | Windows: 组合 win32/ 下各模块 |
| `src/utils/computerUse/platforms/linux.ts` | Linux: xdotool/scrot |
| `src/utils/computerUse/win32/windowMessage.ts` | **新增**: SendMessage 无焦点输入 |
## 5. 需要移除/清理的文件
| 文件 | 操作 | 原因 |
|------|------|------|
| `packages/@ant/computer-use-input/src/backends/win32.ts` | 删除 | Windows 代码不应在 macOS 包里 |
| `packages/@ant/computer-use-input/src/backends/linux.ts` | 删除 | Linux 代码不应在 macOS 包里 |
| `packages/@ant/computer-use-swift/src/backends/win32.ts` | 删除 | 同上 |
| `packages/@ant/computer-use-swift/src/backends/linux.ts` | 删除 | 同上 |
| `packages/@ant/computer-use-input/src/types.ts` | 删除 | 移到 platforms/types.ts |
| `packages/@ant/computer-use-swift/src/types.ts` | 删除 | 移到 platforms/types.ts |
## 6. 需要修改的文件
| 文件 | 改动 |
|------|------|
| `packages/@ant/computer-use-input/src/index.ts` | 恢复为仅 darwin dispatcher,去掉 win32/linux case |
| `packages/@ant/computer-use-swift/src/index.ts` | 恢复为仅 darwin dispatcher,去掉 win32/linux case |
| `src/utils/computerUse/executor.ts` | 通过 `platforms/` 获取平台实现,不直接调 @ant 包 |
| `src/utils/computerUse/swiftLoader.ts` | 仅 darwin 加载 |
| `src/utils/computerUse/inputLoader.ts` | 仅 darwin 加载 |
## 7. @ant 包的定位(修正后)
| 包 | 职责 | 平台 |
|---|------|------|
| `@ant/computer-use-input` | macOS enigo 键鼠原生模块包装 | **仅 darwin** |
| `@ant/computer-use-swift` | macOS Swift 截图/应用原生模块包装 | **仅 darwin** |
| `@ant/computer-use-mcp` | MCP Server + 工具定义 + 调用路由 | **跨平台**(不含平台代码) |
Windows/Linux 的平台实现全部在 `src/utils/computerUse/platforms/` 和 `src/utils/computerUse/win32/` 中。
## 8. 执行顺序
```
Phase 1: 创建 platforms/ 抽象层
├── platforms/types.ts(公共接口)
├── platforms/index.ts(分发器)
└── platforms/darwin.ts(委托 @ant 包)
Phase 2: 创建 Windows 平台实现
├── win32/windowMessage.ts(SendMessage 无焦点输入)
└── platforms/win32.ts(组合 win32/ 各模块)
Phase 3: 创建 Linux 平台实现
└── platforms/linux.ts(xdotool/scrot)
Phase 4: 改造 executor.ts
└── 通过 platforms/ 获取实现,不直接调 @ant
Phase 5: 清理 @ant 包
├── 删除 @ant/computer-use-input/src/backends/{win32,linux}.ts
├── 删除 @ant/computer-use-swift/src/backends/{win32,linux}.ts
└── 恢复 index.ts 为 darwin-only
Phase 6: 验证 + PR
```

View File

@@ -0,0 +1,496 @@
# Computer Use 工具参考文档
## 概览
Computer Use 提供 37 个工具,分为三类:
| 分类 | 平台 | 工具数 | 说明 |
|------|------|--------|------|
| 通用工具 | 全平台 | 24 | 官方 Computer Use 标准能力 |
| Windows 专属工具 | Win32 | 10 | 绑定窗口模式下的增强能力 |
| 教学工具 | 全平台 | 3 | 分步引导模式(需 teachMode 开启) |
---
## 一、通用工具(24 个)
全平台可用。未绑定窗口时,操作对象是整个屏幕。
### 权限与会话
| 工具 | 参数 | 说明 |
|------|------|------|
| `request_access` | `apps[]`, `reason`, `clipboardRead?`, `clipboardWrite?`, `systemKeyCombos?` | 请求操作应用的权限。所有其他工具的前置条件 |
| `list_granted_applications` | — | 列出当前会话已授权的应用 |
### 截图与显示
| 工具 | 参数 | 说明 |
|------|------|------|
| `screenshot` | `save_to_disk?` | 截取当前屏幕。绑定窗口时截取绑定窗口(PrintWindow)。返回图片 + GUI 元素列表(Windows) |
| `zoom` | `region: [x1,y1,x2,y2]` | 截取指定区域的高分辨率图片。坐标基于最近一次全屏截图 |
| `switch_display` | `display` | 切换截图的目标显示器 |
### 鼠标操作
| 工具 | 参数 | 说明 |
|------|------|------|
| `left_click` | `coordinate: [x,y]`, `text?` (修饰键) | 左键点击。`text` 可传 "shift"/"ctrl"/"alt" 实现组合点击 |
| `double_click` | `coordinate`, `text?` | 双击 |
| `triple_click` | `coordinate`, `text?` | 三击(选整行) |
| `right_click` | `coordinate`, `text?` | 右键点击 |
| `middle_click` | `coordinate`, `text?` | 中键点击 |
| `mouse_move` | `coordinate` | 移动鼠标(不点击) |
| `left_click_drag` | `coordinate` (终点), `start_coordinate?` (起点) | 拖拽 |
| `left_mouse_down` | — | 按下左键不松 |
| `left_mouse_up` | — | 松开左键 |
| `cursor_position` | — | 获取当前鼠标位置 |
### 键盘操作
| 工具 | 参数 | 说明 |
|------|------|------|
| `type` | `text` | 输入文字 |
| `key` | `text` (如 "ctrl+s"), `repeat?` | 按键/组合键 |
| `hold_key` | `text`, `duration` (秒) | 按住键指定时长 |
### 滚动
| 工具 | 参数 | 说明 |
|------|------|------|
| `scroll` | `coordinate`, `scroll_direction`, `scroll_amount` | 滚动。方向: up/down/left/right |
### 应用管理
| 工具 | 参数 | 说明 |
|------|------|------|
| `open_application` | `app` | 打开应用。Windows 上自动绑定窗口 |
### 剪贴板
| 工具 | 参数 | 说明 |
|------|------|------|
| `read_clipboard` | — | 读取剪贴板文字 |
| `write_clipboard` | `text` | 写入剪贴板 |
### 其他
| 工具 | 参数 | 说明 |
|------|------|------|
| `wait` | `duration` (秒) | 等待 |
| `computer_batch` | `actions[]` | 批量执行多个动作(减少 API 往返) |
---
## 二、Windows 专属工具(10 个)
仅 Windows 平台可见。核心能力:**绑定窗口后的独立操作——不抢占用户鼠标键盘**。
### 工作模式
```
┌──────────────────────────────────────────────────┐
│ 未绑定模式 │
│ 使用通用工具 (left_click/type/key/scroll) │
│ 操作对象:整个屏幕 │
│ 输入方式:全局 SendInput(会移动真实鼠标) │
└──────────────────────────────────────────────────┘
bind_window / open_application
┌──────────────────────────────────────────────────┐
│ 绑定窗口模式 │
│ 使用 Win32 工具 (virtual_mouse/virtual_keyboard) │
│ 操作对象:绑定的窗口 │
│ 输入方式:SendMessageW(不动真实鼠标/键盘) │
│ 可视化DWM 绿色边框 + 虚拟光标 + 状态指示器 │
└──────────────────────────────────────────────────┘
```
### 窗口绑定
| 工具 | 参数 | 说明 |
|------|------|------|
| `bind_window` | `action`: list/bind/unbind/status | 窗口绑定管理 |
**动作详情:**
| action | 参数 | 说明 |
|--------|------|------|
| `list` | — | 列出所有可见窗口(hwnd、pid、title) |
| `bind` | `title?`, `hwnd?`, `pid?` | 绑定到指定窗口。设置 DWM 绿色边框 + 启动虚拟光标 + 启动状态指示器 + 短暂激活窗口确保可接收输入 |
| `unbind` | — | 解除绑定,恢复全屏模式 |
| `status` | — | 查看当前绑定状态(hwnd、title、pid、窗口矩形) |
### 窗口管理
| 工具 | 参数 | 说明 |
|------|------|------|
| `window_management` | `action`, `x?`, `y?`, `width?`, `height?` | 窗口操作(Win32 API,不走全局快捷键) |
**动作详情:**
| action | 说明 |
|--------|------|
| `minimize` | ShowWindow(SW_MINIMIZE) |
| `maximize` | ShowWindow(SW_MAXIMIZE) |
| `restore` | ShowWindow(SW_RESTORE) — 恢复最小化/最大化 |
| `close` | SendMessage(WM_CLOSE) — 优雅关闭 |
| `focus` | SetForegroundWindow + BringWindowToTop — 激活窗口 |
| `move_offscreen` | SetWindowPos(-32000,-32000) — 移到屏幕外(仍可 SendMessage/PrintWindow) |
| `move_resize` | SetWindowPos — 移动/缩放到指定位置和大小 |
| `get_rect` | GetWindowRect — 获取当前位置和大小 |
### 虚拟鼠标
| 工具 | 参数 | 说明 |
|------|------|------|
| `virtual_mouse` | `action`, `coordinate: [x,y]`, `start_coordinate?` | 在绑定窗口内操作虚拟鼠标 |
**动作详情:**
| action | 说明 |
|--------|------|
| `click` | 左键点击。虚拟光标移动到坐标 + 闪烁动画 |
| `double_click` | 双击 |
| `right_click` | 右键点击 |
| `move` | 移动虚拟光标(不点击) |
| `drag` | 按住 → 移动 → 松开。需 `start_coordinate` 指定起点 |
| `down` | 按下左键不松 |
| `up` | 松开左键 |
**与通用鼠标工具的区别:**
| | 通用 (`left_click` 等) | `virtual_mouse` |
|---|---|---|
| 输入方式 | SendInput(全局) | SendMessageW(窗口级) |
| 真实鼠标 | 会移动 | **不动** |
| 用户干扰 | 有 | **无** |
| 适用场景 | 未绑定时 | **绑定后** |
### 虚拟键盘
| 工具 | 参数 | 说明 |
|------|------|------|
| `virtual_keyboard` | `action`, `text`, `duration?`, `repeat?` | 在绑定窗口内操作虚拟键盘 |
**动作详情:**
| action | text 含义 | 说明 |
|--------|----------|------|
| `type` | 要输入的文字 | SendMessageW(WM_CHAR),支持 Unicode 中文/emoji |
| `combo` | 组合键 (如 "ctrl+s") | WM_KEYDOWN/UP 序列 |
| `press` | 单个键名 | 按下不松(配合 release 使用) |
| `release` | 单个键名 | 松开按键 |
| `hold` | 键名或组合 | 按住指定秒数后松开 |
**与通用键盘工具的区别:**
| | 通用 (`type`/`key`) | `virtual_keyboard` |
|---|---|---|
| 输入方式 | SendInput(全局) | SendMessageW(窗口级) |
| 物理键盘 | 会冲突 | **不冲突** |
| 适用场景 | 未绑定时 | **绑定后** |
**注意:** SendMessageW 对 Windows Terminal (ConPTY) 等现代应用无效。这些应用需要使用通用工具 + 窗口激活方式操作。
### 鼠标滚轮
| 工具 | 参数 | 说明 |
|------|------|------|
| `mouse_wheel` | `coordinate: [x,y]`, `delta`, `direction?` | WM_MOUSEWHEEL 鼠标中键滚轮 |
**参数说明:**
- `delta`: 正值=向上,负值=向下。每 1 单位 ≈ 3 行
- `direction`: "vertical"(默认)或 "horizontal"
- `coordinate`: 滚轮作用点——决定哪个面板/区域接收滚动
**与通用 `scroll` 的区别:**
| | `scroll` | `mouse_wheel` |
|---|---|---|
| 原理 | WM_VSCROLL/WM_HSCROLL | **WM_MOUSEWHEEL** |
| Excel | ❌ | ✅ |
| 浏览器 | ❌ | ✅ |
| 代码编辑器 | ❌ | ✅ |
### 元素级操作
| 工具 | 参数 | 说明 |
|------|------|------|
| `click_element` | `name?`, `role?`, `automationId?` | 按无障碍名称/角色点击 GUI 元素 |
| `type_into_element` | `name?`, `role?`, `automationId?`, `text` | 按名称向元素输入文字 |
**工作原理:**
1. 通过 UI Automation 在绑定窗口中查找匹配元素
2. `click_element`: 先尝试 InvokePattern(按钮/菜单),失败则 SendMessage 点击 BoundingRect 中心
3. `type_into_element`: 先尝试 ValuePattern 直接设值,失败则点击聚焦 + WM_CHAR 输入
**适用场景:**
- 截图中看到元素名称但坐标不精确时
- Accessibility Snapshot 列出了元素的 name/automationId 时
- 比坐标点击更可靠(不受窗口缩放/DPI 影响)
### 终端交互
| 工具 | 参数 | 说明 |
|------|------|------|
| `prompt_respond` | `response_type`, `arrow_direction?`, `arrow_count?`, `text?` | 处理终端 Yes/No/选择提示 |
**response_type 详情:**
| response_type | 操作 | 场景 |
|---------------|------|------|
| `yes` | 发送 'y' + Enter | npm "Continue? (y/n)" |
| `no` | 发送 'n' + Enter | 拒绝确认 |
| `enter` | 发送 Enter | 接受默认选项 |
| `escape` | 发送 Escape | 取消操作 |
| `select` | ↑/↓ 箭头 × N + Enter | inquirer 选择菜单 |
| `type` | 输入文字 + Enter | 文本输入提示 |
### 状态指示器
| 工具 | 参数 | 说明 |
|------|------|------|
| `status_indicator` | `action`: show/hide/status, `message?` | 控制绑定窗口底部的浮动状态标签 |
---
## 三、教学工具(3 个)
需要 `teachMode` 开启。
| 工具 | 说明 |
|------|------|
| `request_teach_access` | 请求教学引导模式权限 |
| `teach_step` | 显示一步引导提示,等用户点 Next |
| `teach_batch` | 批量排队多步引导 |
---
## 操作流程
### 流程 1:全屏操作(未绑定)
```
request_access(apps=["Notepad"])
open_application(app="Notepad") ← 自动绑定窗口
screenshot ← PrintWindow 截图 + GUI 元素列表
left_click(coordinate=[500, 300]) ← 全局 SendInput
type(text="hello world") ← 全局 SendInput
key(text="ctrl+s") ← 全局 SendInput
```
### 流程 2:绑定窗口操作(推荐,不干扰用户)
```
request_access(apps=["Notepad"])
bind_window(action="list") ← 列出所有窗口
bind_window(action="bind", title="记事本") ← 绑定 + 绿色边框 + 虚拟光标
screenshot ← PrintWindow 截取绑定窗口
virtual_mouse(action="click", coordinate=[500, 300]) ← SendMessageW(不动真实鼠标)
virtual_keyboard(action="type", text="hello world") ← SendMessageW(不动物理键盘)
virtual_keyboard(action="combo", text="ctrl+s") ← 保存
mouse_wheel(coordinate=[500, 400], delta=-5) ← 向下滚动
bind_window(action="unbind") ← 解除绑定
```
### 流程 3:按元素名称操作
```
bind_window(action="bind", title="记事本")
screenshot ← 返回截图 + GUI elements 列表
click_element(name="保存", role="Button") ← UI Automation 查找并点击
type_into_element(role="Edit", text="new content")
```
### 流程 4:终端交互
```
bind_window(action="bind", title="PowerShell")
screenshot
prompt_respond(response_type="yes") ← 回答 y + Enter
prompt_respond(response_type="select", arrow_direction="down", arrow_count=2) ← 选第3项
```
### 流程 5:Excel/浏览器滚动
```
bind_window(action="bind", title="Excel")
screenshot
mouse_wheel(coordinate=[600, 400], delta=-10) ← 向下滚动 10 格
mouse_wheel(coordinate=[600, 400], delta=5, direction="horizontal") ← 向右滚动
```
---
## 应用兼容性
| 应用类型 | SendMessageW (virtual_*) | 元素操作 (click_element) | 注意 |
|---------|--------------------------|------------------------|------|
| 传统 Win32 (记事本/写字板) | ✅ | ✅ | 完美支持 |
| Office (Excel/Word) | ✅ (COM 自动化) | ✅ | 通过 COM API |
| WPF 应用 | ✅ | ✅ | 标准 UIA 支持 |
| Electron/Chrome | ⚠️ 部分 | ⚠️ 部分 | 内部渲染不走 Win32 消息 |
| UWP/WinUI (Windows Terminal) | ❌ | ❌ | ConPTY 不接受 SendMessageW |
| 浏览器网页内容 | ❌ | ❌ | 需要全局 SendInput |
**对于不支持 SendMessageW 的应用**,使用通用工具 (`left_click`/`type`/`key`) + `window_management(action="focus")` 先激活窗口。
---
## 绑定窗口时的可视化
绑定窗口后自动启动三层可视化:
1. **DWM 绿色边框** — 窗口自身的边框颜色变绿,零偏移
2. **虚拟鼠标光标** — 红色箭头图标,跟随 virtual_mouse 操作移动,点击时闪烁
3. **状态指示器** — 窗口底部浮动标签,显示当前操作(通过 status_indicator 控制)
---
## Accessibility Snapshot
每次 `screenshot` 时,如果窗口已绑定,会自动附带 GUI 元素列表:
```
GUI elements in this window:
[Button] "Save" (120,50 80x30) enabled
[Edit] "" (200,80 400x25) enabled value="hello" id=textBox1
[MenuItem] "File" (10,0 40x25) enabled
[MenuItem] "Edit" (50,0 40x25) enabled
[CheckBox] "Auto-save" (300,50 100x20) enabled id=chkAutoSave
```
模型同时收到 **截图图片 + 结构化元素列表**,可以选择:
- 用坐标操作:`virtual_mouse(action="click", coordinate=[120, 50])`
- 用名称操作:`click_element(name="Save")`
---
## UI Automation Control Patterns 参考
`click_element` / `type_into_element` 底层使用 UI Automation Control Patterns。当前已实现的和可扩展的模式如下:
| Pattern | 用途 | 当前状态 | 可用于 |
|---------|------|---------|--------|
| `InvokePattern` | 触发点击 | ✅ 已实现 (`click_element`) | 按钮、菜单项、链接 |
| `ValuePattern` | 读写文本值 | ✅ 已实现 (`type_into_element`) | 文本框、组合框 |
| `TogglePattern` | 切换状态 | ❌ 未实现 | 复选框、开关 |
| `SelectionPattern` | 选择项目 | ❌ 未实现 | 下拉菜单、列表 |
| `ScrollPattern` | 编程滚动 | ❌ 未实现(用 `mouse_wheel` 替代) | 列表、树、面板 |
| `ExpandCollapsePattern` | 展开/折叠 | ❌ 未实现 | 树节点、折叠面板 |
| `WindowPattern` | 窗口操作 | ❌ 未实现(用 `window_management` 替代) | 窗口最大化/关闭 |
| `TextPattern` | 读取文档文本 | ❌ 未实现 | 文档、富文本 |
| `GridPattern` | 表格操作 | ❌ 未实现 | Excel 单元格、数据网格 |
| `TablePattern` | 表格结构 | ❌ 未实现 | 表头、行列关系 |
| `RangeValuePattern` | 范围值操作 | ❌ 未实现 | 滑块、进度条 |
| `TransformPattern` | 移动/缩放 | ❌ 未实现 | 可拖拽元素 |
**扩展路线:** 优先实现 `TogglePattern`(复选框)和 `SelectionPattern`(下拉菜单),这两个在表单自动化中最常用。
---
## 屏幕截取技术方案对比
当前使用 Python Bridge (mss) 进行截图,底层是 GDI BitBlt。三种方案对比:
| 方案 | API | 当前状态 | 性能 | 优势 | 限制 |
|------|-----|---------|------|------|------|
| **GDI BitBlt** | `BitBlt` / `PrintWindow` | ✅ 当前使用 (mss/bridge.py) | ~300ms | 简单稳定,支持后台窗口 (PrintWindow) | 不支持硬件加速内容、DPI 处理复杂 |
| **DXGI Desktop Duplication** | `IDXGIOutputDuplication` | ❌ 未实现 | ~16ms (60fps) | 硬件加速,支持 HDR,GPU 直接读取 | 不支持单窗口截取,需 D3D11 |
| **Windows.Graphics.Capture** | `GraphicsCaptureItem` | ❌ 未实现 | ~16ms | 最新 API,支持单窗口/单显示器,系统级权限管理 | Win10 1903+,首次需用户确认 |
### 推荐升级路径
```
当前: GDI BitBlt (mss) ─── 全屏 ~300ms, 窗口 ~300ms (PrintWindow)
├─ 近期: DXGI Desktop Duplication ─── 全屏 ~16ms, 但不支持单窗口
└─ 远期: Windows.Graphics.Capture ─── 全屏 + 单窗口都 ~16ms
```
### DXGI Desktop Duplication 实现要点
```python
# bridge.py 中可添加 DXGI 截图(通过 d3dshot 或 dxcam 库)
import dxcam # pip install dxcam
camera = dxcam.create()
frame = camera.grab() # numpy array, ~5ms
# 转为 JPEG base64 发送
```
### Windows.Graphics.Capture 实现要点
```python
# 需要 WinRT Python 绑定
# pip install winrt-Windows.Graphics.Capture winrt-Windows.Graphics.DirectX
# 限制:首次调用需要用户在系统弹窗中确认权限
```
---
## 输入方式技术矩阵
不同应用类型需要不同的输入方式:
| 输入方式 | API | 优势 | 限制 | 适用应用 |
|---------|-----|------|------|---------|
| **SendMessageW** | `WM_CHAR` / `WM_KEYDOWN` | 不抢焦点,不动真实键鼠 | 现代应用不支持 | Win32 传统应用 (记事本/Office/WPF) |
| **SendInput** | `INPUT` 结构体 | 所有应用都支持 | **必须前台焦点**,会干扰用户 | 所有应用(通用后备) |
| **WriteConsoleInput** | 控制台 API | 直接写入控制台缓冲区 | 需要 AttachConsole,可能被拒绝 | cmd/PowerShell(非 Windows Terminal) |
| **UI Automation** | `InvokePattern` / `ValuePattern` | 语义级操作,最可靠 | 部分应用不暴露 UIA 接口 | 支持 UIA 的应用 |
| **COM Automation** | Excel/Word COM | 完全编程控制 | 仅 Office 应用 | Excel / Word |
| **剪贴板 + 粘贴** | `SetClipboardData` + `Ctrl+V` | 绕过输入限制 | 会覆盖用户剪贴板 | 通用后备 |
### 按应用类型的推荐输入策略
| 应用类型 | 首选 | 后备 | 说明 |
|---------|------|------|------|
| 传统 Win32 (记事本/写字板) | SendMessageW | UIA ValuePattern | 虚拟输入完美工作 |
| Office (Excel/Word) | COM Automation | SendMessageW | COM 提供结构化操作 |
| WPF 应用 | SendMessageW | UIA | 标准 Win32 消息循环 |
| Electron/Chrome 应用 | UIA | 剪贴板粘贴 | 内部渲染不走 Win32 |
| Windows Terminal (ConPTY) | SendInput (需前台) | 剪贴板粘贴 | ConPTY 不接受外部消息 |
| UWP/WinUI 应用 | SendInput (需前台) | UIA | XAML 渲染不走 Win32 消息 |
---
## 已知限制与待解决
| 限制 | 影响 | 计划 |
|------|------|------|
| Windows Terminal 不接受 SendMessageW | 虚拟键盘/鼠标对终端无效 | 自动检测应用类型,终端类切换到 SendInput + 短暂激活 |
| PrintWindow 截不到 alternate screen buffer | Ink REPL 画面截不到 | 切换到 Windows.Graphics.Capture |
| Accessibility Snapshot 对大应用慢 (>30s) | Excel 等复杂应用超时 | 限制遍历深度 + 超时保护 |
| DWM 边框对自定义标题栏应用可能无效 | 某些 Electron 应用看不到边框 | 检测并回退到叠加窗口方案 |
| 虚拟光标是 PowerShell WinForms 进程 | 启动慢 (~1s),资源占用 | 考虑用 Win32 原生窗口替代 |
---
## 技术路线图
### Phase 1(当前)— 基础功能
- ✅ SendMessageW 虚拟输入
- ✅ PrintWindow/mss 截图
- ✅ UI Automation (InvokePattern + ValuePattern)
- ✅ Accessibility Snapshot
- ✅ DWM 边框指示
- ✅ Python Bridge
### Phase 2(近期)— 兼容性增强
- ⬜ 应用类型自动检测(Win32 vs Terminal vs UWP)
- ⬜ 终端类应用自动切换 SendInput + 短暂激活
- ⬜ TogglePattern / SelectionPattern 支持
- ⬜ DXGI Desktop Duplication 高速截图
- ⬜ Accessibility Snapshot 超时保护
### Phase 3(远期)— 高级能力
- ⬜ Windows.Graphics.Capture(单窗口实时截图)
- ⬜ 截图元素标注(在截图上标记 ID 数字)
- ⬜ 浏览器 DOM 提取(绑定浏览器时提取网页结构)
- ⬜ GridPattern / TablePattern(Excel 单元格级操作)
- ⬜ TextPattern(文档内容读取)
- ⬜ 多窗口协同操作

View File

@@ -1,136 +1,197 @@
# Computer Use 用户指南
# Computer Use — macOS / Windows / Linux 跨平台实施计划
Computer Use 让 Claude 直接操控你的电脑——移动鼠标、点击、输入文字、截图,就像一个远程助手坐在你面前操作一样。
更新时间2026-04-03
参考项目:`E:\源码\claude-code-source-main\claude-code-source-main`
## 支持平台
## 1. 现状
| 平台 | 状态 | 额外配置 |
|------|------|---------|
| macOS | 可用 | 需授予辅助功能 + 屏幕录制权限 |
| Windows | 可用 | 无需额外配置 |
| Linux | 不可用 | 后端待开发 |
参考项目的 Computer Use **仅支持 macOS**——从入口到底层全部写死 darwin。我们的项目在 Phase 1-3 中已经完成了:
## 快速开始
-`@ant/computer-use-mcp` stub 替换为完整实现12 文件)
-`@ant/computer-use-input` 拆为 dispatcher + backendsdarwin + win32
-`@ant/computer-use-swift` 拆为 dispatcher + backendsdarwin + win32
-`CHICAGO_MCP` 编译开关已开
-`src/` 层有 6 处 macOS 硬编码阻塞
1. 启动 Claude Code
## 2. 阻塞点全景
```bash
bun run dev
```
### 2.1 入口层
Computer Use 默认已开启,无需额外参数。
| # | 文件:行号 | 阻塞代码 | 影响 |
|---|----------|---------|------|
| 1 | `src/main.tsx:1605` | `getPlatform() === 'macos'` | 整个 CU 初始化被跳过 |
2. 在对话中告诉 Claude 你想做什么,例如:
- "帮我打开系统设置"
- "截个屏看看当前桌面"
- "在 Finder 里点击那个文件"
### 2.2 加载层
3. 首次操控某个应用时,会弹出权限对话框让你确认。
| # | 文件:行号 | 阻塞代码 | 影响 |
|---|----------|---------|------|
| 2 | `src/utils/computerUse/swiftLoader.ts:16` | `process.platform !== 'darwin'` → throw | 截图、应用管理全部不可用 |
| 3 | `src/utils/computerUse/executor.ts:263` | `process.platform !== 'darwin'` → throw | 整个 executor 工厂函数不可用 |
4. 操作过程中随时按 **Esc**（macOS）或 **Ctrl+C**（Windows）中止。
### 2.3 macOS 特有依赖
## 权限说明
| # | 文件:行号 | 依赖 | macOS 实现 | 需要替代方案 |
|---|----------|------|-----------|------------|
| 4 | `executor.ts:70-88` | 剪贴板 | `pbcopy`/`pbpaste` | Win: PowerShell `Get/Set-Clipboard`Linux: `xclip`/`wl-copy` |
| 5 | `drainRunLoop.ts:21` | CFRunLoop pump | `cu._drainMainRunLoop()` | 非 darwin直接执行 fn(),不需要 pump |
| 6 | `escHotkey.ts:28` | ESC 热键 | CGEventTap | 非 darwin返回 false已有 Ctrl+C fallback |
| 7 | `hostAdapter.ts:48-54` | 系统权限 | TCC accessibility + screenRecording | Win直接 grantedLinux检查 xdotool |
| 8 | `common.ts:56` | 平台标识 | `platform: 'darwin'` 硬编码 | 动态获取 |
| 9 | `executor.ts:180` | 粘贴快捷键 | `command+v` | Win/Linux`ctrl+v` |
Computer Use 采用分级权限模型,保护你的安全:
### 2.4 缺失的 Linux 后端
| 级别 | 能力 | 适用场景 |
|------|------|---------|
| **full** | 所有操作:鼠标点击(左/右/中键)、拖拽、键盘输入、组合键 | 系统设置、Finder 等系统应用 |
| **click** | 仅左键点击和滚轮滚动 | IDE（VS Code、Cursor、终端） |
| 未授权 | 所有操作被拒绝 | 需要通过 `request_access` 申请 |
| | macOS | Windows | Linux |
|---|-------|---------|-------|
| `computer-use-input/backends/` | ✅ darwin.ts | ✅ win32.ts | ❌ 需新建 linux.ts |
| `computer-use-swift/backends/` | ✅ darwin.ts | ✅ win32.ts | ❌ 需新建 linux.ts |
IDE 类应用默认只有 click 权限,这是安全设计——防止 AI 在你的终端或编辑器中执行危险操作。如需完整控制,可以在权限对话框中手动提升。
## 3. 每个平台的能力依赖
## 可用操作
### 3.1 computer-use-input键鼠
### 鼠标
| 功能 | macOS | Windows | Linux |
|------|-------|---------|-------|
| 鼠标移动 | CGEvent JXA | SetCursorPos P/Invoke | xdotool mousemove |
| 鼠标点击 | CGEvent JXA | SendInput P/Invoke | xdotool click |
| 鼠标滚轮 | CGEvent JXA | SendInput MOUSEEVENTF_WHEEL | xdotool scroll |
| 键盘按键 | System Events osascript | keybd_event P/Invoke | xdotool key |
| 组合键 | System Events osascript | keybd_event 组合 | xdotool key combo |
| 文本输入 | System Events keystroke | SendKeys.SendWait | xdotool type |
| 前台应用 | System Events osascript | GetForegroundWindow P/Invoke | xdotool getactivewindow + /proc |
| 工具依赖 | osascript内置 | powershell内置 | xdotool需安装 |
| 操作 | 说明 |
|------|------|
| 移动鼠标 | 移动到指定坐标 |
| 左键点击 | 单击、双击、三击 |
| 右键点击 | 需要 full 权限 |
| 中键点击 | 需要 full 权限 |
| 拖拽 | 从 A 点拖到 B 点,需要 full 权限 |
| 滚轮 | 向上或向下滚动 |
### 3.2 computer-use-swift截图 + 应用管理)
### 键盘
| 功能 | macOS | Windows | Linux |
|------|-------|---------|-------|
| 全屏截图 | screencapture | CopyFromScreen | gnome-screenshot / scrot / grim |
| 区域截图 | screencapture -R | CopyFromScreen(rect) | gnome-screenshot -a / scrot -a / grim -g |
| 显示器列表 | CGGetActiveDisplayList JXA | Screen.AllScreens | xrandr --query |
| 运行中应用 | System Events JXA | Get-Process | wmctrl -l / ps |
| 打开应用 | osascript activate | Start-Process | xdg-open / gtk-launch |
| 隐藏/显示 | System Events visibility | ShowWindow/SetForegroundWindow | wmctrl -c / xdotool |
| 工具依赖 | screencapture + osascript | powershell | xdotool + scrot/grim + wmctrl |
| 操作 | 说明 |
|------|------|
| 按键 | 单个按键或组合键(如 Ctrl+C |
| 输入文字 | 逐字符输入文本,需要 full 权限 |
| 长按 | 按住某个键一段时间,需要 full 权限 |
### 3.3 executor 层
### 屏幕
| 功能 | macOS | Windows | Linux |
|------|-------|---------|-------|
| drainRunLoop | CFRunLoop pump | 不需要 | 不需要 |
| ESC 热键 | CGEventTap | 跳过Ctrl+C fallback | 跳过Ctrl+C fallback |
| 剪贴板读 | pbpaste | `powershell Get-Clipboard` | xclip -o / wl-paste |
| 剪贴板写 | pbcopy | `powershell Set-Clipboard` | xclip / wl-copy |
| 粘贴快捷键 | command+v | ctrl+v | ctrl+v |
| 终端检测 | __CFBundleIdentifier | WT_SESSION / TERM_PROGRAM | TERM_PROGRAM |
| 系统权限 | TCC check | 直接 granted | 检查 xdotool 安装 |
| 操作 | 说明 |
|------|------|
| 截图 | 截取当前屏幕 |
| 切换显示器 | 多显示器环境下切换目标屏幕 |
| 缩放 | 放大屏幕某个区域 |
## 4. 执行步骤
### 其他
### Phase 1已完成 ✅
| 操作 | 说明 |
|------|------|
| 获取鼠标位置 | 查询当前鼠标坐标 |
| 批量操作 | 一次执行多个操作,减少等待 |
| 等待 | 暂停指定秒数(最长 100 秒) |
- [x] `@ant/computer-use-mcp` stub → 完整实现
- [x] `@ant/computer-use-input` dispatcher + darwin/win32 backends
- [x] `@ant/computer-use-swift` dispatcher + darwin/win32 backends
- [x] `CHICAGO_MCP` 编译开关
## macOS 权限配置
### Phase 2移除 6 处 macOS 硬编码(解锁 macOS + Windows
首次使用前,需要授予两项系统权限。缺少任一项都会导致功能异常(见下方说明)。
**改动原则macOS 代码路径不变,只在每处 darwin 守卫后加 win32/linux 分支。**
### 辅助功能Accessibility
| 步骤 | 文件 | 改动 |
|------|------|------|
| 2.1 | `src/main.tsx:1605` | `getPlatform() === 'macos'` → 去掉平台限制,或改为 `!== 'unknown'` |
| 2.2 | `src/utils/computerUse/swiftLoader.ts:16-18` | 移除 `process.platform !== 'darwin'` throw。`@ant/computer-use-swift/index.ts` 已有跨平台 dispatch |
| 2.3 | `src/utils/computerUse/executor.ts:263-267` | 移除 `process.platform !== 'darwin'` throw。改为检查 input/swift isSupported |
| 2.4 | `src/utils/computerUse/executor.ts:70-88` | 剪贴板函数按平台分发darwin→pbcopy/pbpastewin32→PowerShell Get/Set-Clipboardlinux→xclip |
| 2.5 | `src/utils/computerUse/executor.ts:180` | `typeViaClipboard``command+v` → 非 darwin 时用 `ctrl+v` |
| 2.6 | `src/utils/computerUse/executor.ts:273` | `const cu = requireComputerUseSwift()` → 改为 `new ComputerUseAPI()`(从 package 直接实例化,不走 swiftLoader throw |
| 2.7 | `src/utils/computerUse/drainRunLoop.ts` | 开头加 `if (process.platform !== 'darwin') return fn()` |
| 2.8 | `src/utils/computerUse/escHotkey.ts` | `registerEscHotkey` 非 darwin 返回 false已有 Ctrl+C fallback |
| 2.9 | `src/utils/computerUse/hostAdapter.ts:48-54` | `ensureOsPermissions` 非 darwin 返回 `{ granted: true }` |
| 2.10 | `src/utils/computerUse/common.ts:56` | `platform: 'darwin'``platform: process.platform === 'win32' ? 'windows' : process.platform === 'linux' ? 'linux' : 'darwin'` |
| 2.11 | `src/utils/computerUse/common.ts:55` | `screenshotFiltering: 'native'` → 非 darwin 时 `'none'`Windows/Linux 截图不支持 per-app 过滤) |
| 2.12 | `src/utils/computerUse/gates.ts:13` | `enabled: false``enabled: true`(无 GrowthBook 时默认可用) |
| 2.13 | `src/utils/computerUse/gates.ts:39-43` | `hasRequiredSubscription()` → 直接返回 `true` |
允许 Claude 控制鼠标和键盘。
### Phase 3新增 Linux 后端
1. 打开 **系统设置 → 隐私与安全性 → 辅助功能**
2. 点击左下角锁图标解锁(需要管理员密码)
3. 将运行 Claude Code 的应用添加到允许列表:
- Terminal → `Terminal.app`
- iTerm → `iTerm.app`
- Cursor → `Cursor.app`
- VS Code 终端 → `Electron` 或 `Visual Studio Code.app`
4. 确保应用旁边的开关已打开
| 步骤 | 文件 | 内容 |
|------|------|------|
| 3.1 | `packages/@ant/computer-use-input/src/backends/linux.ts` | xdotool 键鼠mousemove/click/key/type/getactivewindow |
| 3.2 | `packages/@ant/computer-use-swift/src/backends/linux.ts` | scrot/grim 截图 + xrandr 显示器 + wmctrl 窗口管理 |
| 3.3 | `packages/@ant/computer-use-input/src/index.ts` | dispatcher 加 `case 'linux'` |
| 3.4 | `packages/@ant/computer-use-swift/src/index.ts` | dispatcher 加 `case 'linux'` |
**未授予时的现象**:鼠标移动、点击、键盘输入均无反应,工具执行成功但屏幕没有任何变化。
### Phase 4验证
### 屏幕录制Screen Recording
| 测试项 | macOS | Windows | Linux |
|--------|-------|---------|-------|
| build 成功 | ✅ | 验证 | 验证 |
| MCP 工具列表非空 | 验证 | 验证 | 验证 |
| 鼠标移动 | 验证 | ✅ 已通过 | 验证 |
| 截图 | 验证 | ✅ 已通过 | 验证 |
| 键盘输入 | 验证 | 验证 | 验证 |
| 前台窗口 | 验证 | ✅ 已通过 | 验证 |
| 剪贴板 | 验证 | 验证 | 验证 |
允许 Claude 截取屏幕内容。
## 5. 文件改动总览
1. 打开 **系统设置 → 隐私与安全性 → 屏幕录制**
2. 将同一个应用添加到允许列表并开启开关
3. **需要重启该应用**才能生效（系统会提示 "xxx 需要重新打开"）
### 不动的文件14 个)
**未授予时的现象**：截图工具执行成功但返回空白图片，Claude 无法看到你的屏幕，所有点击操作变成"盲点"。
`cleanup.ts``computerUseLock.ts``wrapper.tsx``toolRendering.tsx``mcpServer.ts``setup.ts``appNames.ts``inputLoader.ts``src/services/mcp/client.ts``@ant/computer-use-mcp/src/*`Phase 1 已完成)、`backends/darwin.ts`(两个包都不动)
### 验证权限
### 改 src/ 的文件8 个)
授予两项权限后,重启 Claude Code在对话中让 Claude 截一张图即可验证是否配置成功。如果截图内容正常显示,说明权限配置完成。
| 文件 | 改动量 | 风险 |
|------|--------|------|
| `main.tsx` | 1 行 | 低 |
| `swiftLoader.ts` | 2 行 | 低 |
| `executor.ts` | ~40 行(剪贴板分发 + 平台守卫 + paste 快捷键) | **中** |
| `drainRunLoop.ts` | 1 行 | 低 |
| `escHotkey.ts` | 3 行 | 低 |
| `hostAdapter.ts` | 5 行 | 低 |
| `common.ts` | 3 行 | 低 |
| `gates.ts` | 3 行 | 低 |
## Linux 依赖(暂不可用
### 新增文件2 个
Linux 后端尚未开发。完成后需要安装以下工具:
| 文件 | 行数估算 |
|------|---------|
| `packages/@ant/computer-use-input/src/backends/linux.ts` | ~150 行 |
| `packages/@ant/computer-use-swift/src/backends/linux.ts` | ~200 行 |
```bash
sudo apt install xdotool scrot xclip wmctrl
## 6. Linux 依赖工具
| 工具 | 用途 | 安装命令Ubuntu |
|------|------|-------------------|
| `xdotool` | 键鼠模拟 + 窗口管理 | `sudo apt install xdotool` |
| `scrot``gnome-screenshot` | 截图 | `sudo apt install scrot` |
| `xrandr` | 显示器信息 | 通常已预装 |
| `xclip` | 剪贴板 | `sudo apt install xclip` |
| `wmctrl` | 窗口列表/切换 | `sudo apt install wmctrl` |
Wayland 环境需要替代工具:`ydotool`(替代 xdotool`grim`(替代 scrot`wl-clipboard`(替代 xclip。初期可先只支持 X11Wayland 标记为 todo。
## 7. 执行顺序建议
```
Phase 2解锁 macOS + Windows
├── 2.1-2.3 移除 3 处硬编码 throw/skip
├── 2.4-2.5 剪贴板 + 粘贴快捷键平台分发
├── 2.6 swiftLoader → 直接实例化
├── 2.7-2.9 drainRunLoop / escHotkey / permissions 平台分支
├── 2.10-2.11 common.ts 平台标识动态化
├── 2.12-2.13 gates.ts 默认值
└── 验证 Windows
Phase 3Linux 后端)
├── 3.1 input/backends/linux.ts
├── 3.2 swift/backends/linux.ts
├── 3.3-3.4 dispatcher 加 linux case
└── 验证 Linux
Phase 4集成验证 + PR
```
仅支持 X11Wayland 不支持。
## 常见问题
### 截图成功但看不到图片
检查 **系统设置 → 隐私与安全性 → 屏幕录制** 是否已授权。未授权时截图工具会执行成功但返回空白内容。
### IDE 中无法输入文字或右键
这是正常行为。IDE 类应用只有 click 权限,无法执行键盘输入、右键、拖拽等操作。如需完整控制,请在系统应用(如 Finder中操作。
### 操作中途想停止
**Esc**macOS**Ctrl+C** 即可立即中止。
每个 Phase 可独立验证、独立提交。Phase 2 完成后 macOS + Windows 可用Phase 3 完成后三平台全部可用。

View File

@@ -1,33 +1,30 @@
/**
* @ant/computer-use-input — cross-platform keyboard & mouse simulation
* @ant/computer-use-input — macOS keyboard & mouse simulation (enigo)
*
* Platform backends:
* - darwin: AppleScript/JXA via CoreGraphics events
* - win32: PowerShell via Win32 P/Invoke (SetCursorPos, SendInput, keybd_event)
*
* Add new platforms by creating backends/<platform>.ts implementing InputBackend.
* This package wraps the macOS-only native enigo .node module.
* For Windows/Linux, use src/utils/computerUse/platforms/ instead.
*/
import type { FrontmostAppInfo, InputBackend } from './types.js'
export interface FrontmostAppInfo {
bundleId: string
appName: string
}
export type { FrontmostAppInfo, InputBackend } from './types.js'
// ---------------------------------------------------------------------------
// Platform dispatch
// ---------------------------------------------------------------------------
export interface InputBackend {
moveMouse(x: number, y: number, animated: boolean): Promise<void>
key(key: string, action: 'press' | 'release'): Promise<void>
keys(parts: string[]): Promise<void>
mouseLocation(): Promise<{ x: number; y: number }>
mouseButton(button: 'left' | 'right' | 'middle', action: 'click' | 'press' | 'release', count?: number): Promise<void>
mouseScroll(amount: number, direction: 'vertical' | 'horizontal'): Promise<void>
typeText(text: string): Promise<void>
getFrontmostAppInfo(): FrontmostAppInfo | null
}
function loadBackend(): InputBackend | null {
if (process.platform !== 'darwin') return null
try {
switch (process.platform) {
case 'darwin':
return require('./backends/darwin.js') as InputBackend
case 'win32':
return require('./backends/win32.js') as InputBackend
case 'linux':
return require('./backends/linux.js') as InputBackend
default:
return null
}
return require('./backends/darwin.js') as InputBackend
} catch {
return null
}
@@ -35,30 +32,16 @@ function loadBackend(): InputBackend | null {
const backend = loadBackend()
// ---------------------------------------------------------------------------
// Unsupported stub (throws on call — guards via isSupported check)
// ---------------------------------------------------------------------------
function unsupported(): never {
throw new Error(`computer-use-input is not supported on ${process.platform}`)
}
// ---------------------------------------------------------------------------
// Public API — matches the original export surface
// ---------------------------------------------------------------------------
export const isSupported = backend !== null
export const moveMouse = backend?.moveMouse ?? unsupported
export const key = backend?.key ?? unsupported
export const keys = backend?.keys ?? unsupported
export const mouseLocation = backend?.mouseLocation ?? unsupported
export const mouseButton = backend?.mouseButton ?? unsupported
export const mouseScroll = backend?.mouseScroll ?? unsupported
export const typeText = backend?.typeText ?? unsupported
export const moveMouse = backend?.moveMouse
export const key = backend?.key
export const keys = backend?.keys
export const mouseLocation = backend?.mouseLocation
export const mouseButton = backend?.mouseButton
export const mouseScroll = backend?.mouseScroll
export const typeText = backend?.typeText
export const getFrontmostAppInfo = backend?.getFrontmostAppInfo ?? (() => null)
// Legacy class type — used by inputLoader.ts for type narrowing
export class ComputerUseInputAPI {
declare moveMouse: InputBackend['moveMouse']
declare key: InputBackend['key']
@@ -71,8 +54,5 @@ export class ComputerUseInputAPI {
declare isSupported: true
}
interface ComputerUseInputUnsupported {
isSupported: false
}
interface ComputerUseInputUnsupported { isSupported: false }
export type ComputerUseInput = ComputerUseInputAPI | ComputerUseInputUnsupported

View File

@@ -16,6 +16,8 @@ export interface ScreenshotResult {
originX: number
originY: number
displayId?: number
/** Accessibility snapshot — structured GUI element tree as model-friendly text. Windows only. */
accessibilityText?: string
}
export interface FrontmostApp {
@@ -108,4 +110,59 @@ export interface ComputerExecutor {
getAppIcon(path: string): Promise<string | undefined>
listRunningApps(): Promise<RunningApp[]>
openApp(bundleId: string): Promise<void>
// ── Window management (Windows only, optional) ──────────────────────────
/**
 * Perform a window management action on the bound window. Win32 API only — no global shortcuts.
 * `action` is an untyped string at this level; the optional `opts` carry a position and size.
 * NOTE(review): the accepted action names are not visible here — confirm against the implementation.
 */
manageWindow?(action: string, opts?: { x?: number; y?: number; width?: number; height?: number }): Promise<boolean>
/** Get the current window rect of the bound window, or null. */
getWindowRect?(): Promise<{ x: number; y: number; width: number; height: number } | null>
// ── Terminal launch & window binding (Windows only, optional) ───────────
/**
 * Open a terminal and launch an agent CLI in it.
 * Resolves to the terminal window's hwnd/title plus a `launched` flag, or null;
 * the `open_terminal` tool handler reports null as a launch failure and
 * `launched: false` as "terminal opened but no command was sent".
 */
openTerminal?(opts: {
agent: 'claude' | 'codex' | 'gemini' | 'custom'
command?: string
terminal?: 'wt' | 'powershell' | 'cmd'
workingDirectory?: string
}): Promise<{ hwnd: string; title: string; launched: boolean } | null>
/** Bind to a window by hwnd/title/pid. Returns bound window info or null. */
bindToWindow?(query: { hwnd?: string; title?: string; pid?: number }): Promise<{ hwnd: string; title: string; pid: number } | null>
/** Unbind from the current window. */
unbindFromWindow?(): Promise<void>
/** Cheap binding-state check for window-targeted routing decisions. */
hasBoundWindow?(): Promise<boolean>
/** Get current binding status (bound flag plus optional hwnd/title/pid/rect), or null. */
getBindingStatus?(): Promise<{ bound: boolean; hwnd?: string; title?: string; pid?: number; rect?: { x: number; y: number; width: number; height: number } } | null>
/** List all visible windows as hwnd/pid/title records. */
listVisibleWindows?(): Promise<Array<{ hwnd: string; pid: number; title: string }>>
/** Control the status indicator overlay; resolves to its `active` state and optional message. */
statusIndicator?(action: 'show' | 'hide' | 'status', message?: string): Promise<{ active: boolean; message?: string }>
// ── Virtual input on the bound window (Windows only, optional) ──────────
// The tool handlers report a `false` result from the methods below as
// "No window is currently bound".
/** Virtual keyboard — send keys/text/combos to bound window only. */
virtualKeyboard?(opts: {
action: 'type' | 'combo' | 'press' | 'release' | 'hold'
text: string
duration?: number
repeat?: number
}): Promise<boolean>
/** Virtual mouse — click/move/drag on bound window only. `startX`/`startY` are the optional start point. */
virtualMouse?(opts: {
action: 'click' | 'double_click' | 'right_click' | 'move' | 'drag' | 'down' | 'up'
x: number; y: number
startX?: number; startY?: number
}): Promise<boolean>
/** Mouse wheel scroll at client coordinates (works on Excel, browsers, modern UI). */
mouseWheel?(x: number, y: number, delta: number, horizontal?: boolean): Promise<boolean>
/** Activate the bound window (foreground + click to focus). */
activateWindow?(clickX?: number, clickY?: number): Promise<boolean>
/** Handle a terminal prompt (yes/no/select/type + enter). */
respondToPrompt?(opts: {
responseType: 'yes' | 'no' | 'enter' | 'escape' | 'select' | 'type'
arrowDirection?: 'up' | 'down'
arrowCount?: number
text?: string
}): Promise<boolean>
// ── Element-targeted actions (Windows UIA, optional) ────────────────────
/** Click an element by name/role/automationId via UI Automation. The tool handler reports `false` as "Element not found". */
clickElement?(query: { name?: string; role?: string; automationId?: string }): Promise<boolean>
/** Type text into an element by name/role/automationId via UI Automation ValuePattern. */
typeIntoElement?(query: { name?: string; role?: string; automationId?: string }, text: string): Promise<boolean>
}

View File

@@ -434,6 +434,15 @@ async function runInputActionGates(
}
}
// Windows/Linux: operations go through SendMessage (HWND-bound) or platform
// abstraction, not global input to the foreground. The frontmost gate is a
// macOS safety net for global CGEvent input — on other platforms, skip it
// when the platform's screenshotFiltering is 'none' (no per-app filtering,
// meaning no hide/defocus, meaning frontmost is meaningless).
if (adapter.executor.capabilities.screenshotFiltering === 'none') {
return null; // pass — non-macOS platform, frontmost irrelevant
}
// Frontmost gate. Check FRESH on every call.
const frontmost = await adapter.executor.getFrontmostApp();
@@ -561,6 +570,13 @@ async function runHitTestGate(
y: number,
actionKind: CuActionKind,
): Promise<CuCallToolResult | null> {
// Non-macOS: HWND-bound mode — clicks go to the bound window via
// SendMessage with window-relative coordinates. Hit-test against the
// real screen is meaningless.
if (adapter.executor.capabilities.screenshotFiltering === 'none') {
return null;
}
const target = await adapter.executor.appUnderPoint(x, y);
if (!target) return null; // desktop / nothing under point / platform no-op
@@ -796,12 +812,12 @@ function resolveRequestedApps(
if (!resolved) {
resolved = byLowerDisplayName.get(requested.toLowerCase());
}
// Fuzzy fallback: match requested name as substring of display name
// e.g. "Chrome" matches "Google Chrome", "Code" matches "Visual Studio Code"
// Windows fuzzy matching: strip .exe suffix, try substring match
if (!resolved) {
const lower = requested.toLowerCase();
for (const app of installed) {
if (app.displayName.toLowerCase().includes(lower)) {
const clean = requested.toLowerCase().replace(/\.exe$/, '').trim();
// Try: "chrome" matches "Google Chrome", "notepad" matches "Notepad"
for (const [name, app] of byLowerDisplayName) {
if (name.includes(clean) || clean.includes(name)) {
resolved = app;
break;
}
@@ -2137,6 +2153,8 @@ async function handleScreenshot(
content: [
...(monitorNote ? [{ type: "text" as const, text: monitorNote }] : []),
...(hiddenNote ? [{ type: "text" as const, text: hiddenNote }] : []),
// Accessibility snapshot: structured GUI element tree (Windows bound-window mode)
...(shot.accessibilityText ? [{ type: "text" as const, text: `GUI elements in this window:\n${shot.accessibilityText}` }] : []),
{
type: "image",
data: shot.base64,
@@ -2204,6 +2222,8 @@ async function handleScreenshot(
content: [
...(monitorNote ? [{ type: "text" as const, text: monitorNote }] : []),
...(hiddenNote ? [{ type: "text" as const, text: hiddenNote }] : []),
// Accessibility snapshot: structured GUI element tree (Windows bound-window mode)
...(shot.accessibilityText ? [{ type: "text" as const, text: `GUI elements in this window:\n${shot.accessibilityText}` }] : []),
{
type: "image",
data: shot.base64,
@@ -2812,6 +2832,443 @@ async function handleOpenApplication(
return okText(`Opened "${app}".`);
}
/**
 * Handles the `virtual_mouse` tool call by forwarding one mouse action to the
 * executor's optional `virtualMouse` capability.
 *
 * Coordinates are validated here because the raw tool arguments are untyped:
 * both `coordinate` entries (and both `start_coordinate` entries, when that
 * array is supplied) must be finite numbers before anything reaches the
 * executor.
 *
 * @param adapter Host adapter whose executor may implement `virtualMouse`.
 * @param args Raw tool arguments: `action` (string), `coordinate` (`[x, y]`)
 *   and an optional `start_coordinate` (`[x, y]`) forwarded as the start point.
 * @returns A text result describing the performed action, or an error result:
 *   `feature_unavailable` when the executor has no `virtualMouse`, `bad_args`
 *   for malformed arguments or when the executor resolves to `false`.
 */
async function handleVirtualMouse(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.virtualMouse) {
    return errorResult("virtual_mouse is only available on Windows with a bound window.", "feature_unavailable");
  }

  const action = requireString(args, "action");
  if (action instanceof Error) return errorResult(action.message, "bad_args");

  const isFiniteNumber = (value: unknown): value is number =>
    typeof value === "number" && Number.isFinite(value);

  const coord = args.coordinate;
  if (!Array.isArray(coord) || coord.length < 2) {
    return errorResult("coordinate [x, y] is required.", "bad_args");
  }
  const [x, y]: unknown[] = coord;
  if (!isFiniteNumber(x) || !isFiniteNumber(y)) {
    return errorResult("coordinate [x, y] must contain finite numbers.", "bad_args");
  }

  const validActions = ["click", "double_click", "right_click", "move", "drag", "down", "up"] as const;
  type MouseAction = (typeof validActions)[number];
  const isMouseAction = (candidate: string): candidate is MouseAction =>
    (validActions as readonly string[]).includes(candidate);
  if (!isMouseAction(action)) {
    return errorResult(`Invalid action "${action}". Valid: ${validActions.join(", ")}`, "bad_args");
  }

  // A non-array start_coordinate is treated as absent (as before); an array
  // must be a complete numeric pair so a half-specified start point is never
  // forwarded.
  let startX: number | undefined;
  let startY: number | undefined;
  const rawStart = args.start_coordinate;
  if (Array.isArray(rawStart)) {
    const [sx, sy]: unknown[] = rawStart;
    if (!isFiniteNumber(sx) || !isFiniteNumber(sy)) {
      return errorResult("start_coordinate [x, y] must contain finite numbers.", "bad_args");
    }
    startX = sx;
    startY = sy;
  }

  const ok = await adapter.executor.virtualMouse({ action, x, y, startX, startY });
  if (!ok) {
    return errorResult("No window is currently bound.", "bad_args");
  }

  const dragOrigin = startX !== undefined ? `(${startX},${startY})` : "current";
  const desc: Record<MouseAction, string> = {
    click: `Click at (${x},${y})`,
    double_click: `Double-click at (${x},${y})`,
    right_click: `Right-click at (${x},${y})`,
    move: `Moved to (${x},${y})`,
    drag: `Dragged ${dragOrigin} → (${x},${y})`,
    down: `Button down at (${x},${y})`,
    up: `Button up at (${x},${y})`,
  };
  return okText(desc[action]);
}
/**
 * Tool handler for `virtual_keyboard`: validates the raw arguments and hands
 * them to the executor's optional `virtualKeyboard` capability.
 *
 * @param adapter Host adapter whose executor may implement `virtualKeyboard`.
 * @param args Raw tool arguments: `action` and `text` (strings) plus optional
 *   numeric `duration` and `repeat`; non-numeric values for the latter two are
 *   dropped rather than rejected.
 * @returns A text summary of what was sent, or an error result
 *   (`feature_unavailable` / `bad_args`).
 */
async function handleVirtualKeyboard(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.virtualKeyboard) {
    return errorResult("virtual_keyboard is only available on Windows with a bound window.", "feature_unavailable");
  }

  const action = requireString(args, "action");
  if (action instanceof Error) {
    return errorResult(action.message, "bad_args");
  }
  const text = requireString(args, "text");
  if (text instanceof Error) {
    return errorResult(text.message, "bad_args");
  }

  const knownActions = ["type", "combo", "press", "release", "hold"];
  if (!knownActions.includes(action)) {
    return errorResult(`Invalid action "${action}". Valid: ${knownActions.join(", ")}`, "bad_args");
  }

  let duration: number | undefined;
  if (typeof args.duration === "number") duration = args.duration;
  let repeat: number | undefined;
  if (typeof args.repeat === "number") repeat = args.repeat;

  const delivered = await adapter.executor.virtualKeyboard({
    action: action as "type" | "combo" | "press" | "release" | "hold",
    text,
    duration,
    repeat,
  });
  if (!delivered) {
    return errorResult("No window is currently bound. Use open_application or bind_window first.", "bad_args");
  }

  let summary: string;
  switch (action) {
    case "type": {
      // Long payloads are truncated so the echoed result stays short.
      const preview = text.length > 40 ? text.slice(0, 40) + "..." : text;
      summary = `Typed "${preview}"`;
      break;
    }
    case "combo":
      summary = `Sent ${text}`;
      break;
    case "press":
      summary = `Pressed ${text} (holding)`;
      break;
    case "release":
      summary = `Released ${text}`;
      break;
    default:
      // Only "hold" remains after the validation above.
      summary = `Held ${text} for ${duration ?? 1}s`;
      break;
  }

  const repeatSuffix = repeat !== undefined && repeat > 1 ? ` ×${repeat}` : "";
  return okText(`${summary}${repeatSuffix}`);
}
/**
 * Tool handler for `status_indicator`: shows, hides, or queries the overlay
 * through the executor's optional `statusIndicator` capability.
 *
 * @param adapter Host adapter whose executor may implement `statusIndicator`.
 * @param args Raw tool arguments: `action` (`show` | `hide` | `status`) and an
 *   optional string `message`, which `show` requires to be non-empty.
 * @returns A text result describing the indicator state, or an error result
 *   (`feature_unavailable` / `bad_args`).
 */
async function handleStatusIndicator(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.statusIndicator) {
    return errorResult("status_indicator is only available on Windows.", "feature_unavailable");
  }

  const requested = requireString(args, "action");
  if (requested instanceof Error) {
    return errorResult(requested.message, "bad_args");
  }
  if (requested !== "show" && requested !== "hide" && requested !== "status") {
    return errorResult(`Invalid action "${requested}". Valid: show, hide, status.`, "bad_args");
  }

  let message: string | undefined;
  if (typeof args.message === "string") {
    message = args.message;
  }
  // An empty message counts as missing for "show".
  if (requested === "show" && !message) {
    return errorResult("'show' requires a message parameter.", "bad_args");
  }

  const state = await adapter.executor.statusIndicator(
    requested as "show" | "hide" | "status",
    message,
  );

  switch (requested) {
    case "status":
      return okText(
        state.active
          ? "Indicator is active on the bound window."
          : "Indicator is not active (no window bound).",
      );
    case "show":
      return okText(`Indicator showing: "${message}"`);
    default:
      return okText("Indicator hidden.");
  }
}
/**
 * Handles the `mouse_wheel` tool call by forwarding a wheel scroll to the
 * executor's optional `mouseWheel` capability.
 *
 * The raw tool arguments are untyped, so both `coordinate` entries and `delta`
 * are required to be finite numbers before anything reaches the executor.
 *
 * @param adapter Host adapter whose executor may implement `mouseWheel`.
 * @param args Raw tool arguments: `coordinate` (`[x, y]`), numeric `delta`
 *   (positive=up, negative=down per the error text below) and an optional
 *   `direction`; only the exact value `"horizontal"` selects horizontal scroll.
 * @returns A text result describing the scroll, or an error result
 *   (`feature_unavailable` / `bad_args`).
 */
async function handleMouseWheel(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.mouseWheel) {
    return errorResult("mouse_wheel is only available on Windows with a bound window.", "feature_unavailable");
  }

  const isFiniteNumber = (value: unknown): value is number =>
    typeof value === "number" && Number.isFinite(value);

  const coord = args.coordinate;
  if (!Array.isArray(coord) || coord.length < 2) {
    return errorResult("coordinate must be [x, y] array.", "bad_args");
  }
  const [x, y]: unknown[] = coord;
  if (!isFiniteNumber(x) || !isFiniteNumber(y)) {
    return errorResult("coordinate [x, y] must contain finite numbers.", "bad_args");
  }

  const delta = args.delta;
  if (typeof delta !== "number") {
    return errorResult("delta is required (positive=up, negative=down).", "bad_args");
  }
  // typeof alone lets NaN/Infinity through, which cannot be a wheel amount.
  if (!Number.isFinite(delta)) {
    return errorResult("delta must be a finite number (positive=up, negative=down).", "bad_args");
  }

  const horizontal = args.direction === "horizontal";
  const ok = await adapter.executor.mouseWheel(x, y, delta, horizontal);
  if (!ok) {
    return errorResult("No window is currently bound. Use open_application or bind_window first.", "bad_args");
  }

  // NOTE(review): the up/down wording is also used for horizontal scrolls; the
  // sign convention for horizontal deltas is not visible here — confirm before
  // rewording.
  return okText(
    `Mouse wheel: ${horizontal ? "horizontal" : "vertical"} scroll ${delta > 0 ? "up" : "down"} ${Math.abs(delta)} click(s) at (${x},${y}).`,
  );
}
/**
 * Tool handler for `activate_window`: asks the executor's optional
 * `activateWindow` capability to bring the bound window forward.
 *
 * @param adapter Host adapter whose executor may implement `activateWindow`.
 * @param args Raw tool arguments; `click_x` / `click_y` are forwarded only
 *   when they are numbers, otherwise `undefined` is passed in their place.
 * @returns A confirmation text, or an error result (`feature_unavailable`
 *   when the capability is missing, `bad_args` when the executor resolves to
 *   `false`).
 */
async function handleActivateWindow(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.activateWindow) {
    return errorResult("activate_window is only available on Windows with a bound window.", "feature_unavailable");
  }

  const numberOrUndefined = (value: unknown): number | undefined =>
    typeof value === "number" ? value : undefined;

  const activated = await adapter.executor.activateWindow(
    numberOrUndefined(args.click_x),
    numberOrUndefined(args.click_y),
  );

  return activated
    ? okText("Window activated and focused. Ready for input.")
    : errorResult("No window is currently bound. Use open_application or bind_window first.", "bad_args");
}
/**
 * Handles the `prompt_respond` tool call by forwarding a prompt answer to the
 * executor's optional `respondToPrompt` capability.
 *
 * For `select`, `arrow_count` must be a non-negative integer and
 * `arrow_direction`, when given, must be `"up"` or `"down"`. For other
 * response types an out-of-range `arrow_direction` is dropped rather than
 * forwarded, so the executor only ever sees `'up' | 'down' | undefined`.
 *
 * @param adapter Host adapter whose executor may implement `respondToPrompt`.
 * @param args Raw tool arguments: `response_type` (string) plus optional
 *   `arrow_direction`, `arrow_count` and `text`.
 * @returns A text result describing what was sent, or an error result
 *   (`feature_unavailable` / `bad_args`).
 */
async function handlePromptRespond(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.respondToPrompt) {
    return errorResult("prompt_respond is only available on Windows with a bound window.", "feature_unavailable");
  }

  const responseType = requireString(args, "response_type");
  if (responseType instanceof Error) return errorResult(responseType.message, "bad_args");

  const validTypes = ["yes", "no", "enter", "escape", "select", "type"] as const;
  type PromptResponse = (typeof validTypes)[number];
  const isPromptResponse = (candidate: string): candidate is PromptResponse =>
    (validTypes as readonly string[]).includes(candidate);
  if (!isPromptResponse(responseType)) {
    return errorResult(`Invalid response_type "${responseType}". Valid: ${validTypes.join(", ")}`, "bad_args");
  }

  const rawDirection = args.arrow_direction;
  const rawCount = args.arrow_count;
  const rawText = args.text;

  if (responseType === "select") {
    if (typeof rawCount !== "number") {
      return errorResult("'select' requires arrow_count parameter.", "bad_args");
    }
    // A key can only be pressed a whole, non-negative number of times.
    if (!Number.isInteger(rawCount) || rawCount < 0) {
      return errorResult("arrow_count must be a non-negative integer.", "bad_args");
    }
    if (rawDirection != null && rawDirection !== "up" && rawDirection !== "down") {
      return errorResult("arrow_direction must be \"up\" or \"down\".", "bad_args");
    }
  }
  if (responseType === "type" && typeof rawText !== "string") {
    return errorResult("'type' requires text parameter.", "bad_args");
  }

  const arrowDirection = rawDirection === "up" || rawDirection === "down" ? rawDirection : undefined;
  const arrowCount = typeof rawCount === "number" ? rawCount : undefined;
  const text = typeof rawText === "string" ? rawText : undefined;

  const ok = await adapter.executor.respondToPrompt({
    responseType,
    arrowDirection,
    arrowCount,
    text,
  });
  if (!ok) {
    return errorResult("No window is currently bound. Use open_application or bind_window first.", "bad_args");
  }

  const descriptions: Record<PromptResponse, string> = {
    yes: "Sent 'y' + Enter.",
    no: "Sent 'n' + Enter.",
    enter: "Sent Enter.",
    escape: "Sent Escape.",
    select: `Navigated ${arrowDirection ?? "down"} ${arrowCount ?? 1} time(s) + Enter.`,
    type: `Typed "${text}" + Enter.`,
  };
  // Each description already ends with a period, so none is added before the
  // follow-up hint (the previous template produced "Enter.. Take a ...").
  return okText(`Prompt responded: ${descriptions[responseType]} Take a screenshot to verify.`);
}
/**
 * Handles the `open_terminal` tool call by asking the executor's optional
 * `openTerminal` capability to open a terminal and launch an agent CLI.
 *
 * `terminal`, when supplied as a string, must be one of `wt`, `powershell` or
 * `cmd`; previously any string was forwarded to the executor unchecked.
 *
 * @param adapter Host adapter whose executor may implement `openTerminal`.
 * @param args Raw tool arguments: `agent` (string), plus optional `command`
 *   (required for `agent="custom"`), `terminal` and `working_directory`.
 * @returns A text result describing the opened terminal, or an error result
 *   (`feature_unavailable`, `bad_args`, or `launch_failed` when the executor
 *   resolves to a falsy value).
 */
async function handleOpenTerminal(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.openTerminal) {
    return errorResult("open_terminal is only available on Windows.", "feature_unavailable");
  }

  const agent = requireString(args, "agent");
  if (agent instanceof Error) return errorResult(agent.message, "bad_args");

  const validAgents = ["claude", "codex", "gemini", "custom"] as const;
  type AgentKind = (typeof validAgents)[number];
  const isAgentKind = (candidate: string): candidate is AgentKind =>
    (validAgents as readonly string[]).includes(candidate);
  if (!isAgentKind(agent)) {
    return errorResult(`Invalid agent "${agent}". Valid: claude, codex, gemini, custom.`, "bad_args");
  }

  const command = typeof args.command === "string" ? args.command : undefined;
  if (agent === "custom" && command === undefined) {
    return errorResult("agent='custom' requires 'command' parameter.", "bad_args");
  }

  // A non-string terminal is ignored (as before); a string must name a
  // terminal the executor supports.
  let terminal: "wt" | "powershell" | "cmd" | undefined;
  const rawTerminal = args.terminal;
  if (typeof rawTerminal === "string") {
    if (rawTerminal !== "wt" && rawTerminal !== "powershell" && rawTerminal !== "cmd") {
      return errorResult(`Invalid terminal "${rawTerminal}". Valid: wt, powershell, cmd.`, "bad_args");
    }
    terminal = rawTerminal;
  }

  const result = await adapter.executor.openTerminal({
    agent,
    command,
    terminal,
    workingDirectory: typeof args.working_directory === "string" ? args.working_directory : undefined,
  });

  if (!result) {
    return errorResult(
      "Failed to open terminal. Windows Terminal (wt.exe) may not be installed.",
      "launch_failed",
    );
  }

  if (!result.launched) {
    return okText(
      `Terminal opened (hwnd=${result.hwnd}, "${result.title}") but no command was sent. Window is now bound.`,
    );
  }

  const agentNames: Record<string, string> = {
    claude: "Claude Code", codex: "Codex", gemini: "Gemini",
    custom: args.command as string,
  };

  return okText(
    `Terminal opened and ${agentNames[agent] ?? agent} launched.\n` +
    `Window: hwnd=${result.hwnd} "${result.title}"\n` +
    `Command: '${agent === "custom" ? args.command : agent}' + Enter\n` +
    `Status: bound to this terminal. Take a screenshot to verify the agent started.`,
  );
}
/**
 * Tool handler for `bind_window`. Dispatches on `action`:
 *
 * - `list`   — enumerate visible windows via `listVisibleWindows`.
 * - `status` — report the current binding via `getBindingStatus`.
 * - `bind`   — bind to a window matched by `title`, `hwnd` and/or `pid`.
 * - `unbind` — release the current binding.
 *
 * Each action checks for its own executor capability and answers
 * `feature_unavailable` when it is missing; an unrecognised action is
 * rejected with `bad_args` without touching the executor.
 *
 * @param adapter Host adapter whose executor may implement the binding methods.
 * @param args Raw tool arguments: `action` plus, for `bind`, optional
 *   `title` (string), `hwnd` (string) and `pid` (number).
 * @returns A text result for the action, or an error result.
 */
async function handleBindWindow(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  const action = requireString(args, "action");
  if (action instanceof Error) return errorResult(action.message, "bad_args");

  const unavailable = (): CuCallToolResult =>
    errorResult("bind_window is only available on Windows.", "feature_unavailable");

  if (action === "list") {
    if (!adapter.executor.listVisibleWindows) return unavailable();
    const found = await adapter.executor.listVisibleWindows();
    if (found.length === 0) return okText("No visible windows found.");
    const rows: string[] = [];
    for (const win of found) {
      rows.push(`hwnd=${win.hwnd} pid=${win.pid} "${win.title}"`);
    }
    return okText(`Visible windows (${found.length}):\n${rows.join("\n")}`);
  }

  if (action === "status") {
    if (!adapter.executor.getBindingStatus) return unavailable();
    const binding = await adapter.executor.getBindingStatus();
    if (!binding || !binding.bound) {
      return okText("No window is currently bound. Use bind_window(action='list') to see available windows, then bind_window(action='bind', title='...') to bind.");
    }
    const parts = [`Bound to: hwnd=${binding.hwnd}`];
    if (binding.title) parts.push(` "${binding.title}"`);
    if (binding.pid) parts.push(` pid=${binding.pid}`);
    if (binding.rect) {
      const r = binding.rect;
      parts.push(` rect=(${r.x},${r.y} ${r.width}x${r.height})`);
    }
    return okText(parts.join(""));
  }

  if (action === "bind") {
    if (!adapter.executor.bindToWindow) return unavailable();
    const title = typeof args.title === "string" ? args.title : undefined;
    const hwnd = typeof args.hwnd === "string" ? args.hwnd : undefined;
    const pid = typeof args.pid === "number" ? args.pid : undefined;
    // Falsy selectors (empty strings, pid 0) count as "not supplied".
    if (!title && !hwnd && !pid) {
      return errorResult("Specify at least one of: title, hwnd, or pid.", "bad_args");
    }
    const bound = await adapter.executor.bindToWindow({ hwnd, title, pid });
    if (bound) {
      return okText(`Bound to window: hwnd=${bound.hwnd} pid=${bound.pid} "${bound.title}". All subsequent screenshot/click/type operations target this window.`);
    }
    const criteria: string[] = [];
    if (title) criteria.push(`title="${title}"`);
    if (hwnd) criteria.push(`hwnd=${hwnd}`);
    if (pid) criteria.push(`pid=${pid}`);
    return errorResult(
      `No window found matching: ${criteria.join(", ")}. Use bind_window(action='list') to see available windows.`,
      "element_not_found",
    );
  }

  if (action === "unbind") {
    if (!adapter.executor.unbindFromWindow) return unavailable();
    await adapter.executor.unbindFromWindow();
    return okText("Window binding released. Operations now target the full screen.");
  }

  return errorResult(`Unknown bind_window action "${action}". Valid: list, bind, unbind, status.`, "bad_args");
}
/**
 * Handler for the `click_element` tool: clicks a GUI element selected by
 * accessible name, role, and/or automationId instead of pixel coordinates.
 *
 * @param adapter Host adapter; its executor must provide `clickElement`,
 *                otherwise a `feature_unavailable` error is returned.
 * @param args    Raw tool arguments. `name`, `role` and `automationId` are read
 *                only when they are strings; at least one must be non-empty.
 * @returns A confirmation text on success, `bad_args` when no selector was
 *          given, or `element_not_found` when the executor reports failure.
 */
async function handleClickElement(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  const { executor } = adapter;
  if (!executor.clickElement) {
    return errorResult(
      "click_element is only available on Windows with a bound window.",
      "feature_unavailable",
    );
  }

  // Non-string values are treated as absent.
  const readString = (key: string): string | undefined => {
    const value = args[key];
    return typeof value === "string" ? value : undefined;
  };
  const name = readString("name");
  const role = readString("role");
  const automationId = readString("automationId");

  if (!(name || role || automationId)) {
    return errorResult("At least one of name, role, or automationId is required.", "bad_args");
  }

  const clicked = await executor.clickElement({ name, role, automationId });
  if (clicked) {
    const summary = [name && `"${name}"`, role, automationId].filter(Boolean).join(" ");
    return okText(`Clicked element: ${summary}`);
  }

  const selector = [
    name && `name="${name}"`,
    role && `role=${role}`,
    automationId && `id=${automationId}`,
  ]
    .filter(Boolean)
    .join(", ");
  return errorResult(
    `Element not found: ${selector}. Take a screenshot to see current GUI elements.`,
    "element_not_found",
  );
}
/**
 * Handler for the `type_into_element` tool: writes `text` into a GUI element
 * selected by accessible name, role, and/or automationId.
 *
 * Validation order: executor capability, then `text`, then the selector.
 * A call with no usable selector is rejected with `bad_args` (mirroring
 * `click_element`) rather than forwarded to the executor with an empty
 * selector, which previously produced messages with a blank target.
 *
 * @param adapter Host adapter; its executor must provide `typeIntoElement`,
 *                otherwise a `feature_unavailable` error is returned.
 * @param args    Raw tool arguments: required `text`, plus at least one of
 *                `name`, `role`, `automationId` (read only when they are strings).
 * @returns A confirmation text with the number of characters typed,
 *          `bad_args` for missing text/selector, or `element_not_found`
 *          when the executor reports failure.
 */
async function handleTypeIntoElement(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.typeIntoElement) {
    return errorResult(
      "type_into_element is only available on Windows with a bound window.",
      "feature_unavailable",
    );
  }

  const text = requireString(args, "text");
  if (text instanceof Error) return errorResult(text.message, "bad_args");

  // Non-string values are treated as absent.
  const name = typeof args.name === "string" ? args.name : undefined;
  const role = typeof args.role === "string" ? args.role : undefined;
  const automationId = typeof args.automationId === "string" ? args.automationId : undefined;

  // Without any selector there is no element to look up.
  if (!name && !role && !automationId) {
    return errorResult("At least one of name, role, or automationId is required.", "bad_args");
  }

  const ok = await adapter.executor.typeIntoElement({ name, role, automationId }, text);
  if (!ok) {
    const selector = [
      name && `name="${name}"`,
      role && `role=${role}`,
      automationId && `id=${automationId}`,
    ]
      .filter(Boolean)
      .join(", ");
    return errorResult(
      `Could not type into element: ${selector}. The element was not found or doesn't support text input.`,
      "element_not_found",
    );
  }

  const target = [name && `"${name}"`, role, automationId].filter(Boolean).join(" ");
  return okText(`Typed ${text.length} chars into: ${target}`);
}
/**
 * Handler for the `window_management` tool.
 *
 * Checks run in this order:
 * 1. `action` must be a string and one of the supported action names.
 * 2. The executor must provide `manageWindow` (this applies to `get_rect` too).
 * 3. `get_rect` reads the rectangle through `executor.getWindowRect`.
 * 4. `move_resize` requires numeric `x` and `y`; `width`/`height` are optional.
 * 5. Every other action is forwarded to `executor.manageWindow(action)`.
 *
 * A falsy result from the executor is reported as "no window bound".
 *
 * @param adapter Host adapter whose executor performs the window operation.
 * @param args    Raw tool arguments (`action`, and for `move_resize`: `x`, `y`, `width`, `height`).
 * @returns Text describing the completed action, or an error result.
 */
async function handleWindowManagement(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  const action = requireString(args, "action");
  if (action instanceof Error) return errorResult(action.message, "bad_args");

  const supported = [
    "minimize",
    "maximize",
    "restore",
    "close",
    "focus",
    "move_offscreen",
    "move_resize",
    "get_rect",
  ];
  if (!supported.includes(action)) {
    return errorResult(
      `Unknown window_management action "${action}". Valid: ${supported.join(", ")}`,
      "bad_args",
    );
  }

  const { executor } = adapter;
  if (!executor.manageWindow) {
    return errorResult(
      "window_management is only available on Windows with a bound window.",
      "feature_unavailable",
    );
  }

  // Non-number values are treated as absent.
  const readNumber = (key: string): number | undefined => {
    const value = args[key];
    return typeof value === "number" ? value : undefined;
  };

  switch (action) {
    case "get_rect": {
      // Read-only query: report the current position and size.
      if (!executor.getWindowRect) {
        return errorResult("getWindowRect not available.", "feature_unavailable");
      }
      const bounds = await executor.getWindowRect();
      if (!bounds) {
        return errorResult("No window is currently bound. Call open_application first.", "bad_args");
      }
      return okText(
        `Window rect: x=${bounds.x}, y=${bounds.y}, width=${bounds.width}, height=${bounds.height}`,
      );
    }

    case "move_resize": {
      // Position is mandatory; size is optional.
      const x = readNumber("x");
      const y = readNumber("y");
      if (x === undefined || y === undefined) {
        return errorResult("move_resize requires x and y parameters.", "bad_args");
      }
      const width = readNumber("width");
      const height = readNumber("height");
      const moved = await executor.manageWindow(action, { x, y, width, height });
      if (!moved) {
        return errorResult("No window is currently bound. Call open_application first.", "bad_args");
      }
      // The resize is mentioned only when both dimensions are truthy.
      if (width && height) {
        return okText(`Moved window to (${x}, ${y}) and resized to ${width}×${height}.`);
      }
      return okText(`Moved window to (${x}, ${y}).`);
    }

    default: {
      // minimize, maximize, restore, close, focus, move_offscreen
      const done = await executor.manageWindow(action);
      if (!done) {
        return errorResult(
          "No window is currently bound. Call open_application first.",
          "bad_args",
        );
      }
      const messages: Record<string, string> = {
        minimize: "Window minimized (ShowWindow SW_MINIMIZE).",
        maximize: "Window maximized (ShowWindow SW_MAXIMIZE).",
        restore: "Window restored (ShowWindow SW_RESTORE).",
        close: "Window closed (SendMessage WM_CLOSE). The window binding has been released.",
        focus: "Window brought to front (SetForegroundWindow).",
        move_offscreen: "Window moved offscreen (-32000,-32000). Still usable via SendMessage/PrintWindow.",
      };
      return okText(messages[action] ?? `Action "${action}" completed.`);
    }
  }
}
async function handleSwitchDisplay(
adapter: ComputerUseHostAdapter,
args: Record<string, unknown>,
@@ -3383,6 +3840,64 @@ async function dispatchAction(
overrides: ComputerUseOverrides,
subGates: CuSubGates,
): Promise<CuCallToolResult> {
// ── Bound-window auto-routing ──────────────────────────────────────
// When a window is bound (Win32), route generic input tools to
// virtual_mouse / virtual_keyboard automatically. The model doesn't
// need to know which tools to use — binding handles it.
const hasBoundWindow =
(await adapter.executor.hasBoundWindow?.()) === true &&
adapter.executor.virtualMouse &&
adapter.executor.virtualKeyboard;
if (hasBoundWindow) {
const coord = Array.isArray(a.coordinate) ? a.coordinate as number[] : undefined;
switch (name) {
case "left_click":
if (coord) return handleVirtualMouse(adapter, { action: "click", coordinate: coord });
break;
case "double_click":
if (coord) return handleVirtualMouse(adapter, { action: "double_click", coordinate: coord });
break;
case "right_click":
if (coord) return handleVirtualMouse(adapter, { action: "right_click", coordinate: coord });
break;
case "mouse_move":
if (coord) return handleVirtualMouse(adapter, { action: "move", coordinate: coord });
break;
case "left_click_drag":
if (coord) return handleVirtualMouse(adapter, {
action: "drag", coordinate: coord,
start_coordinate: Array.isArray(a.start_coordinate) ? a.start_coordinate : undefined,
});
break;
case "left_mouse_down":
if (coord) return handleVirtualMouse(adapter, { action: "down", coordinate: coord });
break;
case "left_mouse_up":
if (coord) return handleVirtualMouse(adapter, { action: "up", coordinate: coord });
break;
case "type":
if (typeof a.text === "string") return handleVirtualKeyboard(adapter, { action: "type", text: a.text });
break;
case "key":
if (typeof a.text === "string") return handleVirtualKeyboard(adapter, { action: "combo", text: a.text, repeat: a.repeat });
break;
case "hold_key":
if (typeof a.text === "string") return handleVirtualKeyboard(adapter, {
action: "hold", text: a.text,
duration: typeof a.duration === "number" ? a.duration : 1,
});
break;
case "scroll":
if (coord) return handleMouseWheel(adapter, {
coordinate: coord,
delta: a.scroll_direction === "up" ? (a.scroll_amount ?? 3) : -(a.scroll_amount ?? 3),
direction: (a.scroll_direction === "left" || a.scroll_direction === "right") ? "horizontal" : "vertical",
});
break;
// screenshot, zoom, wait, cursor_position — not rerouted, pass through
}
}
// ── Standard dispatch (unbound or tools not rerouted above) ────────
switch (name) {
case "screenshot":
return handleScreenshot(adapter, overrides, subGates);
@@ -3434,6 +3949,39 @@ async function dispatchAction(
case "open_application":
return handleOpenApplication(adapter, a, overrides);
case "window_management":
return handleWindowManagement(adapter, a);
case "click_element":
return handleClickElement(adapter, a);
case "type_into_element":
return handleTypeIntoElement(adapter, a);
case "open_terminal":
return handleOpenTerminal(adapter, a);
case "bind_window":
return handleBindWindow(adapter, a);
case "virtual_mouse":
return handleVirtualMouse(adapter, a);
case "virtual_keyboard":
return handleVirtualKeyboard(adapter, a);
case "status_indicator":
return handleStatusIndicator(adapter, a);
case "mouse_wheel":
return handleMouseWheel(adapter, a);
case "activate_window":
return handleActivateWindow(adapter, a);
case "prompt_respond":
return handlePromptRespond(adapter, a);
case "switch_display":
return handleSwitchDisplay(adapter, a, overrides);

View File

@@ -118,7 +118,7 @@ const BATCH_ACTION_ITEM_SCHEMA = {
export function buildComputerUseTools(
caps: {
screenshotFiltering: "native" | "none";
platform: "darwin" | "win32";
platform: "darwin" | "win32" | "linux";
/** Include request_teach_access + teach_step. Read once at server construction. */
teachMode?: boolean;
},
@@ -414,6 +414,353 @@ export function buildComputerUseTools(
},
},
// Window management — Win32 API targeted at bound HWND, no global shortcuts.
// Only available on Windows when a window is bound via open_application.
...(caps.platform === 'win32' ? [{
name: "window_management",
description:
"Manage the bound application window via Win32 API calls (ShowWindow, SetWindowPos, SendMessage). " +
"All operations target the bound HWND directly — NO global shortcuts (Win+Down, Alt+F4, etc.). " +
"The window must have been opened via open_application first. " +
"Actions: minimize (hide to taskbar), maximize (fill screen), restore (undo min/max), " +
"close (graceful WM_CLOSE), focus (bring to front), move_offscreen (move to -32000,-32000 for background operation). " +
"Use move_resize to reposition or resize the window to specific coordinates.",
inputSchema: {
type: "object" as const,
properties: {
action: {
type: "string",
enum: ["minimize", "maximize", "restore", "close", "focus", "move_offscreen", "move_resize", "get_rect"],
description:
"minimize: ShowWindow(SW_MINIMIZE). " +
"maximize: ShowWindow(SW_MAXIMIZE). " +
"restore: ShowWindow(SW_RESTORE) — undo minimize or maximize. " +
"close: SendMessage(WM_CLOSE) — graceful close. " +
"focus: SetForegroundWindow + BringWindowToTop. " +
"move_offscreen: SetWindowPos(-32000,-32000) — keeps window usable by SendMessage/PrintWindow but invisible. " +
"move_resize: SetWindowPos to specific x,y,width,height. " +
"get_rect: GetWindowRect — returns current position and size.",
},
x: { type: "integer", description: "X position for move_resize." },
y: { type: "integer", description: "Y position for move_resize." },
width: { type: "integer", description: "Width for move_resize." },
height: { type: "integer", description: "Height for move_resize." },
},
required: ["action"],
},
} as Tool,
{
name: "click_element",
description:
"Click a GUI element by its accessible name, role, or automationId — no pixel coordinates needed. " +
"Uses Windows UI Automation to find the element and InvokePattern to click it. " +
"Prefer this over left_click when the element name is visible in the accessibility snapshot. " +
"Falls back to BoundingRect center-click if InvokePattern is not supported.",
inputSchema: {
type: "object" as const,
properties: {
name: {
type: "string",
description: "Accessible name of the element (e.g. \"Save\", \"File\", \"Search...\"). Case-insensitive partial match.",
},
role: {
type: "string",
description: "Control type (e.g. \"Button\", \"MenuItem\", \"Edit\", \"Link\"). Optional — narrows the search.",
},
automationId: {
type: "string",
description: "Exact automationId from the accessibility snapshot. Most precise selector.",
},
},
required: [],
},
} as Tool,
{
name: "type_into_element",
description:
"Type text into a named GUI element using Windows UI Automation ValuePattern. " +
"Finds the element by name/role/automationId, then sets its value directly — " +
"no need to click first or use pixel coordinates. Works on Edit, ComboBox, and other value-holding controls.",
inputSchema: {
type: "object" as const,
properties: {
name: { type: "string", description: "Accessible name of the target element." },
role: { type: "string", description: "Control type (optional, e.g. \"Edit\")." },
automationId: { type: "string", description: "Exact automationId." },
text: { type: "string", description: "Text to type/set into the element." },
},
required: ["text"],
},
} as Tool,
{
name: "open_terminal",
description:
"Open a new terminal window and launch an AI agent CLI. " +
"This is a workflow tool that automates: open terminal → type startup command → press Enter → wait → verify. " +
"Supported agents: claude (runs 'claude'), codex (runs 'codex'), gemini (runs 'gemini'), " +
"or any custom command. After launching, the tool binds to the new terminal window " +
"and takes a screenshot to verify the agent started successfully. " +
"Use this when the user says: 'open Claude Code', 'start a Codex terminal', 'launch Gemini', etc.",
inputSchema: {
type: "object" as const,
properties: {
agent: {
type: "string",
enum: ["claude", "codex", "gemini", "custom"],
description:
"Which agent to launch. " +
"claude: runs 'claude' command. " +
"codex: runs 'codex' command. " +
"gemini: runs 'gemini' command. " +
"custom: runs the command specified in 'command' parameter.",
},
command: {
type: "string",
description: "Custom command to run in the terminal. Only used when agent='custom'. Example: 'python app.py'",
},
terminal: {
type: "string",
enum: ["wt", "powershell", "cmd"],
description: "Which terminal to open. Default: 'wt' (Windows Terminal). 'powershell' for PowerShell window, 'cmd' for Command Prompt.",
},
working_directory: {
type: "string",
description: "Working directory for the terminal. If omitted, uses current directory.",
},
},
required: ["agent"],
},
} as Tool,
{
name: "bind_window",
description:
"Bind to a specific window for all subsequent operations (screenshot, click, type, etc.). " +
"Once bound, screenshots capture only that window via PrintWindow, and all input goes through SendMessageW — " +
"no cursor movement, no focus steal, no interference with the user's desktop. " +
"Actions: bind (by title, hwnd, or pid), unbind (release binding), status (show current binding), list (show all visible windows). " +
"Use 'list' first to see available windows, then 'bind' with a title or hwnd. " +
"open_application auto-binds the launched window, but use this tool to bind to already-running windows or switch between windows.",
inputSchema: {
type: "object" as const,
properties: {
action: {
type: "string",
enum: ["bind", "unbind", "status", "list"],
description:
"bind: Bind to a window (specify title, hwnd, or pid). " +
"unbind: Release the current binding, return to full-screen mode. " +
"status: Show the currently bound window (hwnd, title, rect). " +
"list: List all visible windows with hwnd, pid, and title.",
},
title: {
type: "string",
description: "Window title to search for (partial match, case-insensitive). For 'bind' action.",
},
hwnd: {
type: "string",
description: "Exact window handle from 'list' output. For 'bind' action.",
},
pid: {
type: "integer",
description: "Process ID to find window for. For 'bind' action.",
},
},
required: ["action"],
},
} as Tool,
{
name: "activate_window",
description:
"Activate the bound window: bring it to foreground, click to ensure keyboard focus, " +
"and optionally send an initial key sequence. Use this before any input operations to guarantee " +
"the window is ready to receive keyboard/mouse events. " +
"Combines SetForegroundWindow + BringWindowToTop + SendMessage(WM_LBUTTONDOWN) in one call.",
inputSchema: {
type: "object" as const,
properties: {
click_x: { type: "integer", description: "X coordinate to click after activation (client-area). If omitted, clicks center of window." },
click_y: { type: "integer", description: "Y coordinate to click after activation (client-area). If omitted, clicks center of window." },
},
required: [],
},
} as Tool,
{
name: "prompt_respond",
description:
"Handle interactive CLI/terminal prompts (Yes/No, selection menus, confirmations). " +
"Sends a sequence of key events to the bound window to navigate and confirm a prompt. " +
"This is a convenience wrapper around bound-window keyboard input for common prompt flows. " +
"Typical flows: " +
"1) Yes/No prompt → send 'y' or 'n' + Enter. " +
"2) Arrow-key selection menu → send arrow_down/arrow_up N times + Enter. " +
"3) Text input prompt → type the response + Enter. " +
"After responding, take a screenshot to verify the result.",
inputSchema: {
type: "object" as const,
properties: {
response_type: {
type: "string",
enum: ["yes", "no", "enter", "escape", "select", "type"],
description:
"yes: send 'y' + Enter. " +
"no: send 'n' + Enter. " +
"enter: send Enter only. " +
"escape: send Escape (cancel). " +
"select: use arrow keys to navigate to an option, then Enter. Requires 'arrow_count'. " +
"type: type custom text then Enter. Requires 'text'.",
},
arrow_direction: {
type: "string",
enum: ["up", "down"],
description: "Arrow key direction for 'select' type. Default: 'down'.",
},
arrow_count: {
type: "integer",
description: "Number of arrow key presses for 'select' type. Default: 1.",
minimum: 0,
maximum: 50,
},
text: {
type: "string",
description: "Text to type for 'type' response_type.",
},
},
required: ["response_type"],
},
} as Tool,
{
name: "status_indicator",
description:
"Control the visual status indicator overlay on the bound window. " +
"The indicator is a small floating label at the bottom of the window that shows what Computer Use is doing. " +
"It auto-shows during click/type/key/scroll operations, but you can also send custom messages. " +
"Actions: show (display a custom message), hide (dismiss), status (check if active).",
inputSchema: {
type: "object" as const,
properties: {
action: {
type: "string",
enum: ["show", "hide", "status"],
description: "show: display a custom message on the indicator. hide: dismiss the indicator. status: check if indicator is active.",
},
message: {
type: "string",
description: "Custom message to display (for 'show' action). Supports emoji. Auto-fades after 2 seconds.",
},
},
required: ["action"],
},
} as Tool,
{
name: "virtual_keyboard",
description:
"Send keyboard input directly to the bound window via SendMessageW — independent of the physical keyboard. " +
"The user can keep typing on their own keyboard without interference. " +
"Supports: single keys, key combinations (Ctrl+S, Alt+F4), text input, and hold-key operations. " +
"All input targets the bound HWND only — no global keyboard events.",
inputSchema: {
type: "object" as const,
properties: {
action: {
type: "string",
enum: ["type", "combo", "press", "release", "hold"],
description:
"type: Send text string via WM_CHAR (Unicode, supports Chinese/emoji). " +
"combo: Send a key combination like ctrl+s, alt+f4, ctrl+shift+a (press all, release in reverse). " +
"press: Press a key down and hold it (pair with 'release'). " +
"release: Release a previously pressed key. " +
"hold: Press key(s) for a duration then release.",
},
text: {
type: "string",
description: "For 'type': the text to input. For 'combo': key combination string (e.g. 'ctrl+s', 'alt+tab', 'ctrl+shift+a'). For 'press'/'release': single key name (e.g. 'shift', 'ctrl', 'a').",
},
duration: {
type: "number",
description: "For 'hold': seconds to hold the key(s) before releasing. Default: 1.",
},
repeat: {
type: "integer",
description: "Number of times to repeat the action. Default: 1.",
minimum: 1,
maximum: 100,
},
},
required: ["action", "text"],
},
} as Tool,
{
name: "virtual_mouse",
description:
"Control a virtual mouse on the bound window via SendMessageW — independent of the physical mouse. " +
"The user's real cursor stays free. All operations target the bound HWND only.",
inputSchema: {
type: "object" as const,
properties: {
action: {
type: "string",
enum: ["click", "double_click", "right_click", "move", "drag", "down", "up"],
description:
"click: left-click at coordinate. " +
"double_click: double left-click. " +
"right_click: right-click. " +
"move: move virtual cursor (visual only, no click). " +
"drag: press at start, move to end, release. Requires coordinate (end) and start_coordinate. " +
"down: press left button at coordinate (hold). " +
"up: release left button at coordinate.",
},
coordinate: {
type: "array",
items: { type: "number" },
minItems: 2,
maxItems: 2,
description: "(x, y) client-area coordinate on the bound window.",
},
start_coordinate: {
type: "array",
items: { type: "number" },
minItems: 2,
maxItems: 2,
description: "(x, y) start point for drag. If omitted, drags from current virtual cursor position.",
},
},
required: ["action", "coordinate"],
},
} as Tool,
{
name: "mouse_wheel",
description:
"Scroll inside the bound window using mouse wheel (WM_MOUSEWHEEL / WM_MOUSEHWHEEL). " +
"Unlike the generic 'scroll' tool which uses WM_VSCROLL (only works on scrollbar controls), " +
"mouse_wheel simulates the physical mouse wheel and works on Excel spreadsheets, web pages, " +
"code editors, PDF viewers, and any modern UI. " +
"Specify the click point within the window where the scroll should occur — " +
"this determines which panel/pane/element receives the scroll.",
inputSchema: {
type: "object" as const,
properties: {
coordinate: {
type: "array",
items: { type: "number" },
minItems: 2,
maxItems: 2,
description: "(x, y) client-area coordinate where the scroll should occur. Determines which element receives the scroll.",
},
delta: {
type: "integer",
description: "Scroll amount in 'clicks'. Positive = scroll up, negative = scroll down. Each click = 3 lines typically. Use -3 to -5 for page-like scrolling.",
},
direction: {
type: "string",
enum: ["vertical", "horizontal"],
description: "Scroll direction. Default: 'vertical'. Use 'horizontal' for side-scrolling (e.g. wide Excel sheets, timeline views).",
},
},
required: ["coordinate", "delta"],
},
} as Tool,
] : []),
{
name: "switch_display",
description:

View File

@@ -159,28 +159,23 @@ export const apps: AppsAPI = {
async listInstalled() {
try {
// Use Spotlight (mdfind) to enumerate .app bundles and mdls to get real bundle IDs.
// Searches /Applications, /System/Applications, and /System/Applications/Utilities
// so that system apps (Terminal, Chess, etc.) and core services (Finder) are found.
const proc = Bun.spawn([
'bash', '-c',
`for dir in /Applications /System/Applications /System/Applications/Utilities /System/Library/CoreServices; do
mdfind 'kMDItemContentType == "com.apple.application-bundle"' -onlyin "$dir" 2>/dev/null
done | sort -u | while read -r appPath; do
bundleId=$(mdls -raw -name kMDItemCFBundleIdentifier "$appPath" 2>/dev/null)
if [ -n "$bundleId" ] && [ "$bundleId" != "(null)" ]; then
displayName=$(basename "$appPath" .app)
echo "$bundleId|$displayName|$appPath"
fi
done`,
], { stdout: 'pipe', stderr: 'pipe' })
const text = await new Response(proc.stdout).text()
await proc.exited
return text.split('\n').filter(Boolean).map(line => {
const [bundleId, displayName, path] = line.split('|', 3)
const result = await osascript(`
tell application "System Events"
set appList to ""
repeat with appFile in (every file of folder "Applications" of startup disk whose name ends with ".app")
set appPath to POSIX path of (appFile as alias)
set appName to name of appFile
set appList to appList & appPath & "|" & appName & "\\n"
end repeat
return appList
end tell
`)
return result.split('\n').filter(Boolean).map(line => {
const [path, name] = line.split('|', 2)
const displayName = (name ?? '').replace(/\.app$/, '')
return {
bundleId: bundleId ?? '',
displayName: displayName ?? '',
bundleId: `com.app.${displayName.toLowerCase().replace(/\s+/g, '-')}`,
displayName,
path: path ?? '',
}
})

View File

@@ -1,14 +1,10 @@
/**
* @ant/computer-use-swift — cross-platform display, apps, and screenshot API
* @ant/computer-use-swift — macOS display, apps, and screenshot (Swift native)
*
* Platform backends:
* - darwin: AppleScript/JXA + screencapture
* - win32: PowerShell + System.Drawing + Win32 P/Invoke
*
* Add new platforms by creating backends/<platform>.ts implementing SwiftBackend.
* This package wraps the macOS-only Swift .node native module.
* For Windows/Linux, use src/utils/computerUse/platforms/ instead.
*/
// Re-export all types
export type {
DisplayGeometry,
PrepareDisplayResult,
@@ -18,72 +14,42 @@ export type {
ScreenshotResult,
ResolvePrepareCaptureResult,
WindowDisplayInfo,
DisplayAPI,
AppsAPI,
ScreenshotAPI,
SwiftBackend,
} from './types.js'
} from './backends/darwin.js'
import type { ResolvePrepareCaptureResult, SwiftBackend } from './types.js'
import type { ResolvePrepareCaptureResult } from './backends/darwin.js'
// ---------------------------------------------------------------------------
// Platform dispatch
// ---------------------------------------------------------------------------
function loadBackend(): SwiftBackend | null {
function loadDarwin() {
if (process.platform !== 'darwin') return null
try {
switch (process.platform) {
case 'darwin':
return require('./backends/darwin.js') as SwiftBackend
case 'win32':
return require('./backends/win32.js') as SwiftBackend
case 'linux':
return require('./backends/linux.js') as SwiftBackend
default:
return null
}
return require('./backends/darwin.js')
} catch {
return null
}
}
const backend = loadBackend()
// ---------------------------------------------------------------------------
// ComputerUseAPI — Main export (preserves original class interface)
// ---------------------------------------------------------------------------
const darwin = loadDarwin()
export class ComputerUseAPI {
// When no backend is loaded (unsupported platform), all APIs are no-op stubs.
// These stubs should never be reached in practice — callers check isSupported
// or the feature gate before invoking.
apps = backend?.apps ?? {
apps = darwin?.apps ?? {
async prepareDisplay() { return { activated: '', hidden: [] } },
async previewHideSet() { return [] },
async findWindowDisplays(ids: string[]) { return ids.map(b => ({ bundleId: b, displayIds: [] as number[] })) },
async findWindowDisplays(ids: string[]) { return ids.map((b: string) => ({ bundleId: b, displayIds: [] as number[] })) },
async appUnderPoint() { return null },
async listInstalled() { return [] },
iconDataUrl() { return null },
listRunning() { return [] },
async open() { throw new Error('computer-use-swift: no backend for this platform') },
async open() { throw new Error('@ant/computer-use-swift: macOS only') },
async unhide() {},
}
display = backend?.display ?? {
getSize() { throw new Error('computer-use-swift: no backend for this platform') },
listAll() { throw new Error('computer-use-swift: no backend for this platform') },
display = darwin?.display ?? {
getSize() { throw new Error('@ant/computer-use-swift: macOS only') },
listAll() { throw new Error('@ant/computer-use-swift: macOS only') },
}
screenshot = backend?.screenshot ?? {
async captureExcluding() { throw new Error('computer-use-swift: no backend for this platform') },
async captureRegion() { throw new Error('computer-use-swift: no backend for this platform') },
}
hotkey = (backend as any)?.hotkey ?? {
registerEscape(_cb: () => void): boolean { return false },
unregister() {},
notifyExpectedEscape() {},
screenshot = darwin?.screenshot ?? {
async captureExcluding() { throw new Error('@ant/computer-use-swift: macOS only') },
async captureRegion() { throw new Error('@ant/computer-use-swift: macOS only') },
}
async resolvePrepareCapture(
@@ -93,8 +59,6 @@ export class ComputerUseAPI {
targetW: number,
targetH: number,
displayId?: number,
_autoResolve?: boolean,
_doHide?: boolean,
): Promise<ResolvePrepareCaptureResult> {
return this.screenshot.captureExcluding(allowedBundleIds, quality, targetW, targetH, displayId)
}

View File

@@ -52,8 +52,14 @@ export function getTerminalBundleId(): string | null {
* takes this shape (no `hostBundleId`, no `teachMode`).
*/
export const CLI_CU_CAPABILITIES = {
screenshotFiltering: (process.platform === 'darwin' ? 'native' : 'none') as any,
platform: (process.platform === 'win32' ? 'windows' : process.platform === 'linux' ? 'linux' : 'darwin') as any,
screenshotFiltering: (process.platform === 'darwin'
? 'native'
: 'none') as any,
platform: (process.platform === 'win32'
? 'win32'
: process.platform === 'linux'
? 'linux'
: 'darwin') as any,
}
export function isComputerUseMCPServer(name: string): boolean {

View File

@@ -297,16 +297,17 @@ export function createCliExecutor(opts: {
getMouseAnimationEnabled: () => boolean
getHideBeforeActionEnabled: () => boolean
}): ComputerExecutor {
if (process.platform !== 'darwin' && process.platform !== 'win32' && process.platform !== 'linux') {
throw new Error(
`createCliExecutor called on ${process.platform}. Computer control requires macOS, Windows, or Linux.`,
)
// Non-macOS: delegate entirely to the cross-platform executor.
// No macOS code paths, no drainRunLoop, no @ant packages.
if (process.platform !== 'darwin') {
// eslint-disable-next-line @typescript-eslint/no-require-imports
const { createCrossPlatformExecutor } = require('./executorCrossPlatform.js') as typeof import('./executorCrossPlatform.js')
return createCrossPlatformExecutor(opts)
}
// Swift loaded once at factory time — every executor method needs it.
// Input loaded lazily via requireComputerUseInput() on first mouse/keyboard
// call — it caches internally, so screenshot-only flows never pull the
// enigo .node.
// ── macOS: native @ant packages ─────────────────────────────────────
// Everything below is macOS-only. No platform checks needed.
const cu = requireComputerUseSwift()
const { getMouseAnimationEnabled, getHideBeforeActionEnabled } = opts
@@ -500,18 +501,12 @@ export function createCliExecutor(opts: {
async key(keySequence: string, repeat?: number): Promise<void> {
const input = requireComputerUseInput()
const parts = keySequence.split('+').filter(p => p.length > 0)
// Bare-only: the CGEventTap checks event.flags.isEmpty so ctrl+escape
// etc. pass through without aborting.
const isEsc = isBareEscape(parts)
const n = repeat ?? 1
await drainRunLoop(async () => {
for (let i = 0; i < n; i++) {
if (i > 0) {
await sleep(8)
}
if (isEsc) {
notifyExpectedEscape()
}
if (i > 0) await sleep(8)
if (isEsc) notifyExpectedEscape()
await input.keys(parts)
}
})
@@ -554,12 +549,9 @@ export function createCliExecutor(opts: {
async type(text: string, opts: { viaClipboard: boolean }): Promise<void> {
const input = requireComputerUseInput()
if (opts.viaClipboard) {
// keys(['command','v']) inside needs the pump.
await drainRunLoop(() => typeViaClipboard(input, text))
return
}
// `toolCalls.ts` handles the grapheme loop + 8ms sleeps and calls this
// once per grapheme. typeText doesn't dispatch to the main queue.
await input.typeText(text)
},
@@ -656,6 +648,10 @@ export function createCliExecutor(opts: {
// ── App management ───────────────────────────────────────────────────
async getFrontmostApp(): Promise<FrontmostApp | null> {
// When HWND is bound on Windows, operations go through SendMessage
// and don't touch the real foreground. Return the first allowed app
// so the frontmost gate in toolCalls.ts passes — the real foreground
// is irrelevant since we never touch it.
const info = requireComputerUseInput().getFrontmostAppInfo()
if (!info || !info.bundleId) return null
return { bundleId: info.bundleId, displayName: info.appName }
@@ -698,6 +694,7 @@ export async function unhideComputerUseApps(
bundleIds: readonly string[],
): Promise<void> {
if (bundleIds.length === 0) return
if (process.platform !== 'darwin') return // non-macOS: no-op
const cu = requireComputerUseSwift()
await cu.apps.unhide([...bundleIds])
}

File diff suppressed because it is too large Load Diff

View File

@@ -46,16 +46,9 @@ export function getComputerUseHostAdapter(): ComputerUseHostAdapter {
}),
ensureOsPermissions: async () => {
if (process.platform !== 'darwin') return { granted: true }
const cu = requireComputerUseSwift() as any
// Native .node module exposes tcc; cross-platform JS backend does not.
// When tcc is absent (JS backend on macOS), we cannot programmatically
// check TCC status — returning granted:false would create a deadlock
// (recheck also fails, user can never pass). The JS backend uses
// osascript/screencapture which trigger OS-level permission prompts
// themselves, so the OS provides the safety net instead.
if (!cu.tcc) return { granted: true }
const accessibility = cu.tcc.checkAccessibility()
const screenRecording = cu.tcc.checkScreenRecording()
const cu = requireComputerUseSwift()
const accessibility = (cu as any).tcc.checkAccessibility()
const screenRecording = (cu as any).tcc.checkScreenRecording()
return accessibility && screenRecording
? { granted: true }
: { granted: false, accessibility, screenRecording }

View File

@@ -0,0 +1,152 @@
/**
* macOS platform backend for Computer Use.
*
* Delegates to @ant/computer-use-input (enigo keyboard/mouse) and
* @ant/computer-use-swift (screenshots, display, apps).
*
* No window-bound input (sendChar/sendKey/sendClick/sendText) — macOS
* uses global input via CoreGraphics events.
*/
import type { Platform } from './index.js'
import type {
InputPlatform,
ScreenshotPlatform,
DisplayPlatform,
AppsPlatform,
WindowHandle,
FrontmostAppInfo,
} from './types.js'
import { requireComputerUseInput } from '../inputLoader.js'
import { requireComputerUseSwift } from '../swiftLoader.js'
// ---------------------------------------------------------------------------
// Input — delegate to @ant/computer-use-input darwin backend
// ---------------------------------------------------------------------------
/**
 * Global input backend for macOS.
 *
 * Every member resolves the native input module at call time through
 * `requireComputerUseInput()` and forwards its arguments unchanged, so the
 * module is only loaded once an input operation is actually requested.
 */
const input: InputPlatform = {
  moveMouse: async (x, y) => {
    await requireComputerUseInput().moveMouse(x, y)
  },

  // A click is "move to the point, then press and release once".
  click: async (x, y, button) => {
    const native = requireComputerUseInput()
    await native.moveMouse(x, y)
    await native.mouseButton(button, 'click', 1)
  },

  typeText: async text => {
    await requireComputerUseInput().typeText(text)
  },

  key: async (name, action) => {
    await requireComputerUseInput().key(name, action)
  },

  keys: async combo => {
    await requireComputerUseInput().keys(combo)
  },

  scroll: async (amount, direction) => {
    await requireComputerUseInput().mouseScroll(amount, direction)
  },

  mouseLocation: async () => requireComputerUseInput().mouseLocation(),

  // The optional window-bound members (sendChar/sendKey/sendClick/sendText)
  // are intentionally not provided on macOS.
}
// ---------------------------------------------------------------------------
// Screenshot — delegate to @ant/computer-use-swift
// ---------------------------------------------------------------------------
/**
 * Screenshot backend for macOS, forwarding to the swift module's
 * `screenshot` namespace. Nothing is excluded from the capture: the
 * exclusion list passed to the native call is always empty.
 */
const screenshot: ScreenshotPlatform = {
  captureScreen: async displayId =>
    requireComputerUseSwift().screenshot.captureExcluding(
      [],
      undefined,
      undefined,
      undefined,
      displayId,
    ),

  captureRegion: async (x, y, w, h) =>
    requireComputerUseSwift().screenshot.captureRegion([], x, y, w, h),

  // captureWindow is not provided here: macOS could use SCContentFilter for
  // per-window capture, but that is not exposed through this interface yet.
  // captureExcluding covers most use cases.
}
// ---------------------------------------------------------------------------
// Display — delegate to @ant/computer-use-swift
// ---------------------------------------------------------------------------
/**
 * Display backend for macOS. Both members are synchronous pass-throughs to
 * the swift module's `display` namespace.
 */
const display: DisplayPlatform = {
  listAll: () => requireComputerUseSwift().display.listAll(),
  getSize: displayId => requireComputerUseSwift().display.getSize(displayId),
}
// ---------------------------------------------------------------------------
// Apps — delegate to @ant/computer-use-swift
// ---------------------------------------------------------------------------
/**
 * App management backend for macOS.
 *
 * Listing and launching go through the swift module; the frontmost-app query
 * goes through the input module. Native records are mapped onto the
 * cross-platform shapes, using the bundle id as the identifier.
 */
const apps: AppsPlatform = {
  /** Running apps as window handles (id = bundle id, title = display name). */
  listRunning(): WindowHandle[] {
    const swift = requireComputerUseSwift()
    const running = swift.apps.listRunning()
    return running.map((app: any) => ({
      id: app.bundleId ?? '',
      pid: 0, // macOS listRunning doesn't expose PID through this API
      title: app.displayName ?? '',
    }))
  },

  /** Installed apps; missing native fields are normalised to empty strings. */
  async listInstalled() {
    const swift = requireComputerUseSwift()
    const installed = await swift.apps.listInstalled()
    return installed.map((app: any) => ({
      id: app.bundleId ?? '',
      displayName: app.displayName ?? '',
      path: app.path ?? '',
    }))
  },

  /** Launch an app; `name` is forwarded unchanged to the swift module. */
  async open(name) {
    const swift = requireComputerUseSwift()
    await swift.apps.open(name)
  },

  /**
   * Frontmost app, or null when it cannot be identified.
   *
   * A result without a bundle id is treated as "unknown" so callers never
   * receive an entry whose `id` is missing; this matches the guard used by
   * the CLI executor's own getFrontmostApp.
   */
  getFrontmostApp(): FrontmostAppInfo | null {
    const info = requireComputerUseInput().getFrontmostAppInfo()
    if (!info || !info.bundleId) return null
    return { id: info.bundleId, appName: info.appName }
  },

  /**
   * First running app whose display name contains `title`, or null.
   *
   * The swift API has no window-title search, so this filters listRunning().
   * It calls `apps.listRunning()` rather than `this.listRunning()` so the
   * method keeps working when it is detached from the object.
   */
  findWindowByTitle(title): WindowHandle | null {
    return apps.listRunning().find(w => w.title.includes(title)) ?? null
  },
}
// ---------------------------------------------------------------------------
// Export
// ---------------------------------------------------------------------------
// macOS backend bundle consumed by the platform dispatcher. Only the four
// required members are supplied; the optional windowManagement is omitted.
export const platform: Platform = { input, screenshot, display, apps }

View File

@@ -0,0 +1,41 @@
/**
* Platform dispatcher for Computer Use.
*
* Loads the correct platform backend based on `process.platform`.
* Each backend implements the same unified interface.
*/
import type { InputPlatform, ScreenshotPlatform, DisplayPlatform, AppsPlatform, WindowManagementPlatform } from './types.js'
/**
 * The set of capability groups a platform backend exposes.
 *
 * The first four members are mandatory for every backend; windowManagement
 * is optional and may be absent.
 */
export interface Platform {
// Mouse / keyboard input.
input: InputPlatform
// Screen, region and (optionally) window capture.
screenshot: ScreenshotPlatform
// Display enumeration and sizing.
display: DisplayPlatform
// Running / installed app listing, launching and frontmost-app lookup.
apps: AppsPlatform
// Optional window management (minimize, maximize, move, ...).
windowManagement?: WindowManagementPlatform
}
// Backend resolved by the first successful loadPlatform() call.
let cached: Platform | undefined

/**
 * Return the backend matching `process.platform`, loading it on first use.
 *
 * The result is memoised, so the backend module is required at most once.
 * The require() specifiers are kept as string literals.
 *
 * @throws Error when the current platform is not darwin, win32 or linux.
 */
export function loadPlatform(): Platform {
  if (cached) return cached

  if (process.platform === 'darwin') {
    cached = require('./darwin.js').platform
  } else if (process.platform === 'win32') {
    cached = require('./win32.js').platform
  } else if (process.platform === 'linux') {
    cached = require('./linux.js').platform
  } else {
    throw new Error(`Computer Use not supported on ${process.platform}`)
  }

  return cached!
}
export type { InputPlatform, ScreenshotPlatform, DisplayPlatform, AppsPlatform, WindowManagementPlatform } from './types.js'
export type { WindowHandle, ScreenshotResult, DisplayInfo, InstalledApp, FrontmostAppInfo, WindowAction } from './types.js'

View File

@@ -0,0 +1,416 @@
/**
* Linux platform backend for Computer Use.
*
* Uses:
* - xdotool for mouse/keyboard input
* - scrot for screenshots (converted to JPEG)
* - xrandr for display enumeration
* - wmctrl for window management
*
* CRITICAL: All screenshots output JPEG. scrot outputs PNG by default,
* so we pipe through ImageMagick `convert` to produce JPEG.
*/
import type { Platform } from './index.js'
import type {
InputPlatform,
ScreenshotPlatform,
DisplayPlatform,
AppsPlatform,
WindowHandle,
ScreenshotResult,
DisplayInfo,
InstalledApp,
FrontmostAppInfo,
} from './types.js'
// ---------------------------------------------------------------------------
// Shell helpers
// ---------------------------------------------------------------------------
/**
 * Run a command synchronously and return its trimmed stdout as text.
 *
 * The exit code and stderr are not inspected, so a failing command simply
 * yields whatever it printed to stdout (typically an empty string).
 */
function run(cmd: string[]): string {
  const { stdout } = Bun.spawnSync({ cmd, stdout: 'pipe', stderr: 'pipe' })
  const decoder = new TextDecoder()
  return decoder.decode(stdout).trim()
}
/**
 * Run a command asynchronously and resolve with its trimmed stdout.
 *
 * stdout is drained first and the process exit is awaited afterwards; the
 * exit code itself is not checked.
 */
async function runAsync(cmd: string[]): Promise<string> {
  const child = Bun.spawn(cmd, { stdout: 'pipe', stderr: 'pipe' })
  const captured = await new Response(child.stdout).text()
  await child.exited
  return captured.trim()
}
/** True when `which <name>` exits with status 0. */
function commandExists(name: string): boolean {
  const { exitCode } = Bun.spawnSync({
    cmd: ['which', name],
    stdout: 'pipe',
    stderr: 'pipe',
  })
  return exitCode === 0
}
// ---------------------------------------------------------------------------
// xdotool key name mapping
// ---------------------------------------------------------------------------
/**
 * Lower-case key name → the key name handed to xdotool.
 * Names without an entry are passed through unchanged by mapKey().
 */
const KEY_MAP: Record<string, string> = {
  // Editing / whitespace
  return: 'Return',
  enter: 'Return',
  tab: 'Tab',
  space: 'space',
  backspace: 'BackSpace',
  delete: 'Delete',
  escape: 'Escape',
  esc: 'Escape',
  // Navigation
  left: 'Left',
  up: 'Up',
  right: 'Right',
  down: 'Down',
  home: 'Home',
  end: 'End',
  pageup: 'Prior',
  pagedown: 'Next',
  // Function keys
  f1: 'F1',
  f2: 'F2',
  f3: 'F3',
  f4: 'F4',
  f5: 'F5',
  f6: 'F6',
  f7: 'F7',
  f8: 'F8',
  f9: 'F9',
  f10: 'F10',
  f11: 'F11',
  f12: 'F12',
  // Modifiers (left/right variants collapse to one name)
  shift: 'shift',
  lshift: 'shift',
  rshift: 'shift',
  control: 'ctrl',
  ctrl: 'ctrl',
  lcontrol: 'ctrl',
  rcontrol: 'ctrl',
  alt: 'alt',
  option: 'alt',
  lalt: 'alt',
  ralt: 'alt',
  win: 'super',
  meta: 'super',
  command: 'super',
  cmd: 'super',
  super: 'super',
  // Misc / locks
  insert: 'Insert',
  printscreen: 'Print',
  pause: 'Pause',
  numlock: 'Num_Lock',
  capslock: 'Caps_Lock',
  scrolllock: 'Scroll_Lock',
}
/** Lower-case key names that keys() treats as modifiers in a combo. */
const MODIFIER_KEYS = new Set([
  // shift
  'shift',
  'lshift',
  'rshift',
  // control
  'control',
  'ctrl',
  'lcontrol',
  'rcontrol',
  // alt
  'alt',
  'option',
  'lalt',
  'ralt',
  // super
  'win',
  'meta',
  'command',
  'cmd',
  'super',
])
/**
 * Translate a key name to its xdotool spelling (case-insensitive lookup in
 * KEY_MAP); names without a mapping are returned exactly as given.
 */
function mapKey(name: string): string {
  const xdoName = KEY_MAP[name.toLowerCase()]
  return xdoName ?? name
}
/** xdotool button number for a logical button: left=1, middle=2, right=3. */
function mouseButtonNum(button: 'left' | 'right' | 'middle'): string {
  switch (button) {
    case 'left':
      return '1'
    case 'right':
      return '3'
    default:
      return '2'
  }
}
// ---------------------------------------------------------------------------
// Input — xdotool
// ---------------------------------------------------------------------------
/**
 * Global input backend for Linux, implemented by shelling out to xdotool.
 *
 * Values that come from the caller (coordinates, text) are placed after a
 * `--` end-of-options marker so that a leading `-` (for example typing the
 * text "-v", or a negative coordinate) is not parsed by xdotool as an option.
 */
const input: InputPlatform = {
  /** Move the pointer to (x, y); coordinates are rounded to integers. */
  async moveMouse(x, y) {
    run(['xdotool', 'mousemove', '--sync', '--', String(Math.round(x)), String(Math.round(y))])
  },

  /** Move to (x, y), then click the given button once. */
  async click(x, y, button) {
    run(['xdotool', 'mousemove', '--sync', '--', String(Math.round(x)), String(Math.round(y))])
    run(['xdotool', 'click', mouseButtonNum(button)])
  },

  /** Type `text` literally with a 12 ms inter-key delay. */
  async typeText(text) {
    run(['xdotool', 'type', '--delay', '12', '--', text])
  },

  /** Press ('press' → keydown) or release (anything else → keyup) one key. */
  async key(name, action) {
    const mapped = mapKey(name)
    run(['xdotool', action === 'press' ? 'keydown' : 'keyup', mapped])
  },

  /**
   * Send a key combination such as ['ctrl', 'shift', 't'].
   *
   * Modifiers are collected in order; the last non-modifier part is the key
   * that is pressed. A combo made only of modifiers is a no-op.
   */
  async keys(parts) {
    const modifiers: string[] = []
    let finalKey: string | null = null
    for (const part of parts) {
      if (MODIFIER_KEYS.has(part.toLowerCase())) {
        modifiers.push(mapKey(part))
      } else {
        finalKey = part
      }
    }
    if (!finalKey) return
    const combo = [...modifiers, mapKey(finalKey)].join('+')
    run(['xdotool', 'key', combo])
  },

  /**
   * Scroll by `amount` wheel clicks.
   *
   * X11 maps the wheel to buttons: 4/5 = up/down, 6/7 = left/right. A
   * non-negative amount scrolls down/right; zero (after rounding) does nothing.
   */
  async scroll(amount, direction) {
    const [negativeBtn, positiveBtn] = direction === 'vertical' ? ['4', '5'] : ['6', '7']
    const btn = amount >= 0 ? positiveBtn : negativeBtn
    const repeats = Math.abs(Math.round(amount))
    if (repeats > 0) run(['xdotool', 'click', '--repeat', String(repeats), btn])
  },

  /** Current pointer position; a coordinate that cannot be parsed is 0. */
  async mouseLocation() {
    const out = run(['xdotool', 'getmouselocation'])
    const xMatch = out.match(/x:(\d+)/)
    const yMatch = out.match(/y:(\d+)/)
    return {
      x: xMatch ? Number(xMatch[1]) : 0,
      y: yMatch ? Number(yMatch[1]) : 0,
    }
  },

  // No window-bound input on Linux
}
// ---------------------------------------------------------------------------
// Screenshot — scrot → JPEG conversion
// ---------------------------------------------------------------------------
// Fixed scratch paths shared by every capture.
const SCREENSHOT_TMP = '/tmp/cu-screenshot-tmp.png'
const SCREENSHOT_JPG = '/tmp/cu-screenshot.jpg'

/**
 * Convert the PNG at `pngPath` to JPEG and return it base64-encoded.
 *
 * Converter preference: ImageMagick `convert`, then `ffmpeg`. When neither
 * is installed the PNG bytes themselves are returned, so the payload is not
 * JPEG in that case. `width`/`height` are copied into the result untouched.
 *
 * NOTE(review): the converter's exit status is not checked, so a failed
 * conversion would read whatever SCREENSHOT_JPG already contains — confirm
 * whether that is acceptable.
 */
async function pngToJpegBase64(pngPath: string, width: number, height: number): Promise<ScreenshotResult> {
  // Read a file and wrap its base64 form with the caller-supplied dimensions.
  const encodeFile = async (path: string): Promise<ScreenshotResult> => {
    const bytes = await Bun.file(path).arrayBuffer()
    return { base64: Buffer.from(bytes).toString('base64'), width, height }
  }

  if (commandExists('convert')) {
    await runAsync(['convert', pngPath, '-quality', '75', SCREENSHOT_JPG])
    return encodeFile(SCREENSHOT_JPG)
  }

  if (commandExists('ffmpeg')) {
    await runAsync(['ffmpeg', '-y', '-i', pngPath, '-q:v', '5', SCREENSHOT_JPG])
    return encodeFile(SCREENSHOT_JPG)
  }

  // Last resort: hand back the PNG as-is (caller should be aware).
  return encodeFile(pngPath)
}
/**
 * Screenshot backend for Linux (scrot for screen/region, ImageMagick
 * `import` for a single window).
 *
 * Every member is best-effort: any failure is turned into an empty result
 * (or null for captureWindow) instead of an exception. The conversion
 * promise is awaited inside the `try` so that a rejection from the
 * read/convert step is caught here too instead of escaping to the caller.
 */
const screenshot: ScreenshotPlatform = {
  /** Capture the screen; reported dimensions come from display.getSize(). */
  async captureScreen(displayId) {
    try {
      await runAsync(['scrot', '-o', SCREENSHOT_TMP])
      const size = display.getSize(displayId)
      return await pngToJpegBase64(SCREENSHOT_TMP, size.width, size.height)
    } catch {
      return { base64: '', width: 0, height: 0 }
    }
  },

  /** Capture the rectangle (x, y, w, h) via `scrot -a`. */
  async captureRegion(x, y, w, h) {
    try {
      await runAsync(['scrot', '-a', `${x},${y},${w},${h}`, '-o', SCREENSHOT_TMP])
      return await pngToJpegBase64(SCREENSHOT_TMP, w, h)
    } catch {
      return { base64: '', width: w, height: h }
    }
  },

  /**
   * Capture one window by X window id. Returns null when ImageMagick's
   * `import` is unavailable or anything fails.
   */
  async captureWindow(hwnd) {
    try {
      if (!commandExists('import')) return null

      // `import` writes the JPEG directly; xdotool supplies the dimensions.
      const jpgPath = '/tmp/cu-window-capture.jpg'
      await runAsync(['import', '-window', hwnd, '-quality', '75', jpgPath])

      const geom = run(['xdotool', 'getwindowgeometry', '--shell', hwnd])
      const wMatch = geom.match(/WIDTH=(\d+)/)
      const hMatch = geom.match(/HEIGHT=(\d+)/)
      const width = wMatch ? Number(wMatch[1]) : 0
      const height = hMatch ? Number(hMatch[1]) : 0

      const buffer = await Bun.file(jpgPath).arrayBuffer()
      return { base64: Buffer.from(buffer).toString('base64'), width, height }
    } catch {
      return null
    }
  },
}
// ---------------------------------------------------------------------------
// Display — xrandr
// ---------------------------------------------------------------------------
/**
 * Display backend for Linux, parsed from `xrandr --query`.
 *
 * Display ids are assigned sequentially (0, 1, ...) in the order connected
 * outputs appear in the xrandr output; scaleFactor is always reported as 1.
 */
const display: DisplayPlatform = {
  /**
   * All connected outputs that have an active mode. Falls back to a single
   * 1920x1080 display when nothing can be parsed or xrandr fails.
   */
  listAll(): DisplayInfo[] {
    const fallback = (): DisplayInfo[] => [
      { width: 1920, height: 1080, scaleFactor: 1, displayId: 0 },
    ]
    try {
      const xrandrOutput = run(['xrandr', '--query'])
      // e.g. "HDMI-1 connected primary 1920x1080+0+0 ..."
      const connectedOutput = /^\S+\s+connected\s+(?:primary\s+)?(\d+)x(\d+)\+\d+\+\d+/gm
      const found: DisplayInfo[] = Array.from(
        xrandrOutput.matchAll(connectedOutput),
        (m, index) => ({
          width: Number(m[1]),
          height: Number(m[2]),
          scaleFactor: 1,
          displayId: index,
        }),
      )
      return found.length > 0 ? found : fallback()
    } catch {
      return fallback()
    }
  },

  /**
   * The display with the given id, else the first display, else a
   * 1920x1080 default.
   */
  getSize(displayId): DisplayInfo {
    const all = this.listAll()
    const requested =
      displayId === undefined ? undefined : all.find(d => d.displayId === displayId)
    return requested ?? all[0] ?? { width: 1920, height: 1080, scaleFactor: 1, displayId: 0 }
  },
}
// ---------------------------------------------------------------------------
// Apps — wmctrl + ps + .desktop files
// ---------------------------------------------------------------------------
/**
 * App / window management backend for Linux (wmctrl, xdotool, .desktop
 * files, /proc).
 *
 * All lookups are best-effort and report failure as an empty list or null
 * rather than throwing; only open() can reject.
 */
const apps: AppsPlatform = {
  /**
   * Open windows, at most 50.
   *
   * Prefers `wmctrl -l -p` (columns: window id, desktop, pid, host, title);
   * windows without a usable pid are skipped. Without wmctrl, falls back to
   * enumerating window ids through xdotool.
   */
  listRunning(): WindowHandle[] {
    try {
      if (commandExists('wmctrl')) {
        const raw = run(['wmctrl', '-l', '-p'])
        const handles: WindowHandle[] = []
        for (const line of raw.split('\n').filter(Boolean)) {
          const parts = line.split(/\s+/)
          const windowId = parts[0]
          const pid = Number(parts[2])
          if (!pid) continue
          // Title is everything after the 4th field (hostname)
          const title = parts.slice(4).join(' ')
          let exePath = ''
          try {
            exePath = run(['readlink', '-f', `/proc/${pid}/exe`])
          } catch {
            /* executable path is optional */
          }
          handles.push({
            id: windowId ?? '',
            pid,
            title,
            exePath: exePath || undefined,
          })
        }
        // Deduplicate by id
        const seen = new Set<string>()
        return handles
          .filter(h => {
            if (seen.has(h.id)) return false
            seen.add(h.id)
            return true
          })
          .slice(0, 50)
      }

      // Fallback: xdotool search
      const raw = run(['xdotool', 'search', '--name', ''])
      const handles: WindowHandle[] = []
      for (const windowId of raw.split('\n').filter(Boolean).slice(0, 50)) {
        const title = run(['xdotool', 'getwindowname', windowId])
        let pid = 0
        try {
          pid = Number(run(['xdotool', 'getwindowpid', windowId]))
        } catch {
          /* pid stays 0 */
        }
        if (title) {
          handles.push({ id: windowId, pid, title })
        }
      }
      return handles
    } catch {
      return []
    }
  },

  /**
   * Installed applications discovered from .desktop files (system, local
   * and per-user directories), at most 200. Entries marked NoDisplay=true
   * or lacking a Name are skipped.
   */
  async listInstalled(): Promise<InstalledApp[]> {
    try {
      const dirs = ['/usr/share/applications', '/usr/local/share/applications']
      // Only add the per-user directory when HOME is known; otherwise the
      // template would produce the bogus path "undefined/.local/...".
      const home = process.env.HOME
      if (home) dirs.push(`${home}/.local/share/applications`)

      const result: InstalledApp[] = []
      for (const dir of dirs) {
        let files: string
        try {
          files = run(['find', dir, '-name', '*.desktop', '-maxdepth', '1'])
        } catch {
          continue
        }
        for (const filepath of files.split('\n').filter(Boolean)) {
          try {
            const content = run(['cat', filepath])
            const nameMatch = content.match(/^Name=(.+)$/m)
            const execMatch = content.match(/^Exec=(.+)$/m)
            const noDisplay = content.match(/^NoDisplay=true$/m)
            if (noDisplay) continue
            const name = nameMatch?.[1] ?? ''
            const exec = execMatch?.[1] ?? ''
            if (!name) continue
            result.push({
              // Strip only the trailing extension: ids such as
              // "org.gnome.desktop.foo.desktop" contain ".desktop" earlier too.
              id: filepath.split('/').pop()?.replace(/\.desktop$/, '') ?? '',
              displayName: name,
              // First token of Exec= is the program; the rest are arguments.
              path: exec.split(/\s+/)[0] ?? '',
            })
          } catch {
            /* skip unreadable */
          }
        }
      }
      return result.slice(0, 200)
    } catch {
      return []
    }
  },

  /**
   * Launch an app by .desktop id via gtk-launch when available, otherwise
   * hand `name` to xdg-open. xdg-open is also used if gtk-launch throws.
   */
  async open(name) {
    try {
      const desktopName = name.endsWith('.desktop') ? name : `${name}.desktop`
      if (commandExists('gtk-launch')) {
        await runAsync(['gtk-launch', desktopName])
        return
      }
    } catch {
      /* fall through */
    }
    await runAsync(['xdg-open', name])
  },

  /**
   * Owner of the active window: id is the executable path (or the
   * /proc/<pid>/exe link when unresolved), appName is /proc/<pid>/comm.
   * Null when there is no active window or neither value is available.
   */
  getFrontmostApp(): FrontmostAppInfo | null {
    try {
      const windowId = run(['xdotool', 'getactivewindow'])
      if (!windowId) return null
      const pidStr = run(['xdotool', 'getwindowpid', windowId])
      if (!pidStr) return null
      let exePath = ''
      try {
        exePath = run(['readlink', '-f', `/proc/${pidStr}/exe`])
      } catch {
        /* optional */
      }
      let appName = ''
      try {
        appName = run(['cat', `/proc/${pidStr}/comm`])
      } catch {
        /* optional */
      }
      if (!exePath && !appName) return null
      return { id: exePath || `/proc/${pidStr}/exe`, appName: appName || 'unknown' }
    } catch {
      return null
    }
  },

  /** First window reported by `xdotool search --name <title>`, or null. */
  findWindowByTitle(title): WindowHandle | null {
    try {
      // xdotool search by name
      const raw = run(['xdotool', 'search', '--name', title])
      const windowId = raw.split('\n')[0]
      if (!windowId) return null
      const windowTitle = run(['xdotool', 'getwindowname', windowId])
      let pid = 0
      try {
        pid = Number(run(['xdotool', 'getwindowpid', windowId]))
      } catch {
        /* pid stays 0 */
      }
      return { id: windowId, pid, title: windowTitle }
    } catch {
      return null
    }
  },
}
// ---------------------------------------------------------------------------
// Export
// ---------------------------------------------------------------------------
// Linux backend bundle consumed by the platform dispatcher. Only the four
// required members are supplied; the optional windowManagement is omitted.
export const platform: Platform = { input, screenshot, display, apps }

View File

@@ -0,0 +1,153 @@
/**
* Cross-platform abstraction types for Computer Use.
*
* These interfaces define a unified API surface for input, screenshots,
* display info, and app management across macOS, Windows, and Linux.
*/
// ---------------------------------------------------------------------------
// Window / App types
// ---------------------------------------------------------------------------
/**
 * Cross-platform window identifier, as returned by
 * AppsPlatform.listRunning() and AppsPlatform.findWindowByTitle().
 */
export interface WindowHandle {
// Platform-specific identifier (meaning per platform noted on the field).
id: string // macOS: bundleId, Windows: HWND string, Linux: window ID
// Owning process id; backends report 0 when it is not available.
pid: number
// Window title (the macOS backend fills this with the app display name).
title: string
exePath?: string // Windows/Linux: process executable path
}
/** One captured image plus the dimensions reported for it. */
export interface ScreenshotResult {
// Base64-encoded image bytes; the Linux backend uses '' for a failed capture.
base64: string
// Reported image width.
width: number
// Reported image height.
height: number
}
/** Geometry of one display. */
export interface DisplayInfo {
width: number
height: number
// Display scale factor; the Linux backend always reports 1.
scaleFactor: number
// Backend-assigned display identifier (Linux: sequential index from 0).
displayId: number
}
/** An installed application as reported by AppsPlatform.listInstalled(). */
export interface InstalledApp {
id: string // macOS: bundleId, Windows: exe path or package family, Linux: .desktop name
// Human-readable application name.
displayName: string
// Application path (Linux: the first token of the .desktop Exec= line).
path: string
}
/** Identity of the application that currently owns the foreground window. */
export interface FrontmostAppInfo {
// Platform-specific app identifier (macOS: bundle id, Linux: executable path).
id: string
// Application / process name.
appName: string
}
// ---------------------------------------------------------------------------
// InputPlatform
// ---------------------------------------------------------------------------
/**
* Input platform interface — two modes:
*
* Mode A (Global): moveMouse, click, typeText, key, keys, scroll, mouseLocation
* Works on all platforms. Sends input to the foreground window; moves the
* real cursor and steals focus.
*
* Mode B (Window-bound, optional): sendChar, sendKey, sendClick, sendText
* Windows-only via SendMessage/PostMessage. Does NOT steal focus or move
* the cursor. Preferred when a target HWND is known.
*
* Mode B members are optional; the macOS and Linux backends omit them, so
* callers must check for their presence before use.
*/
export interface InputPlatform {
// --- Mode A: Global input (all platforms) ---
// Move the pointer to (x, y).
moveMouse(x: number, y: number): Promise<void>
// Move to (x, y) and click once with the given button.
click(
x: number,
y: number,
button: 'left' | 'right' | 'middle',
): Promise<void>
// Type the given text.
typeText(text: string): Promise<void>
// Press or release a single named key.
key(name: string, action: 'press' | 'release'): Promise<void>
// Send a key combination, e.g. modifiers followed by a key.
keys(combo: string[]): Promise<void>
// Scroll by `amount` along the given axis.
scroll(amount: number, direction: 'vertical' | 'horizontal'): Promise<void>
// Current pointer position.
mouseLocation(): Promise<{ x: number; y: number }>
// --- Mode B: Window-bound input (Windows only, optional) ---
sendChar?(hwnd: string, char: string): Promise<void>
sendKey?(hwnd: string, vk: number, action: 'down' | 'up'): Promise<void>
sendClick?(
hwnd: string,
x: number,
y: number,
button: 'left' | 'right',
): Promise<void>
sendText?(hwnd: string, text: string): Promise<void>
}
// ---------------------------------------------------------------------------
// ScreenshotPlatform
// ---------------------------------------------------------------------------
/**
 * Screenshot capture operations.
 *
 * NOTE(review): the contract below says JPEG, but the Linux backend returns
 * PNG bytes when neither `convert` nor `ffmpeg` is installed — confirm that
 * consumers tolerate this.
 */
export interface ScreenshotPlatform {
/** Full-screen capture. Returns JPEG base64. */
captureScreen(displayId?: number): Promise<ScreenshotResult>
/** Region capture. Returns JPEG base64. */
captureRegion(
x: number,
y: number,
w: number,
h: number,
): Promise<ScreenshotResult>
/**
 * Window capture (Windows: PrintWindow, macOS: SCContentFilter, Linux: xdotool+import).
 * Optional: the macOS backend does not implement it. The Linux backend
 * resolves to null when the capture cannot be performed.
 */
captureWindow?(hwnd: string): Promise<ScreenshotResult | null>
}
// ---------------------------------------------------------------------------
// DisplayPlatform
// ---------------------------------------------------------------------------
/** Synchronous display enumeration. */
export interface DisplayPlatform {
// Every display known to the backend.
listAll(): DisplayInfo[]
// Geometry of one display. The Linux backend returns its first display when
// displayId is omitted or unknown.
getSize(displayId?: number): DisplayInfo
}
// ---------------------------------------------------------------------------
// AppsPlatform
// ---------------------------------------------------------------------------
/** Application and window discovery / launching. */
export interface AppsPlatform {
// Currently running apps or windows, as window handles.
listRunning(): WindowHandle[]
// Applications installed on the machine.
listInstalled(): Promise<InstalledApp[]>
// Launch an application by name / identifier.
open(name: string): Promise<void>
// The foreground application, or null when it cannot be determined.
getFrontmostApp(): FrontmostAppInfo | null
// A window matching the given title, or null when none is found.
findWindowByTitle(title: string): WindowHandle | null
}
// ---------------------------------------------------------------------------
// WindowManagementPlatform (Windows HWND-targeted, no global APIs)
// ---------------------------------------------------------------------------
/** Actions accepted by WindowManagementPlatform.manageWindow(). */
export type WindowAction =
| 'minimize'
| 'maximize'
| 'restore'
| 'close'
| 'focus'
| 'move_offscreen'
| 'move_resize'
| 'get_rect'
/**
 * Window management for the currently bound window. None of the methods
 * takes a window handle; they operate on the session's bound HWND.
 */
export interface WindowManagementPlatform {
/**
 * Perform a window management action on the bound HWND. All via Win32 API, no global shortcuts.
 * `opts` carries optional position/size values; the boolean result
 * presumably signals success — confirm against the implementation.
 */
manageWindow(
action: WindowAction,
opts?: { x?: number; y?: number; width?: number; height?: number },
): boolean
/** Move window to specific position and/or resize */
moveResize(x: number, y: number, width?: number, height?: number): boolean
/** Get current window rect */
getWindowRect(): {
x: number
y: number
width: number
height: number
} | null
}

View File

@@ -0,0 +1,979 @@
/**
* Windows platform backend for Computer Use.
*
* Combines:
* - PowerShell SetCursorPos/SendInput for global input (fallback)
* - win32/windowMessage.ts for window-bound SendMessage input (preferred)
* - Python Bridge (bridge.py) for screenshots (mss + ctypes PrintWindow)
* - win32/windowEnum.ts for EnumWindows app listing
* - No PowerShell for screenshots (Python Bridge only, no PS fallback)
* - PowerShell Screen.AllScreens for display enumeration
*
* CRITICAL: All screenshots output JPEG (ImageFormat::Jpeg), not PNG.
*/
import type { Platform } from './index.js'
import type {
InputPlatform,
ScreenshotPlatform,
DisplayPlatform,
AppsPlatform,
WindowHandle,
ScreenshotResult,
DisplayInfo,
InstalledApp,
FrontmostAppInfo,
} from './types.js'
import { listWindows } from '../win32/windowEnum.js'
import { detectAppType, openWithController } from '../win32/appDispatcher.js'
import {
markBound,
unmarkBound,
cleanupAllBorders,
} from '../win32/windowBorder.js'
import {
showVirtualCursor,
hideVirtualCursor,
moveVirtualCursor,
} from '../win32/virtualCursor.js'
import { showIndicator, hideIndicator } from '../win32/inputIndicator.js'
import {
ps,
psAsync,
validateHwnd,
VK_MAP,
MODIFIER_KEYS,
} from '../win32/shared.js'
import { logForDebugging } from '../../debug.js'
// ---------------------------------------------------------------------------
// Python Bridge (lazy-loaded, preferred over PowerShell for screenshots)
// ---------------------------------------------------------------------------
// Bridge client module, populated by the first successful getBridge() call.
let _bridge: typeof import('../win32/bridgeClient.js') | undefined

/**
 * Return the bridge client module, loading it on first use.
 *
 * A load failure is swallowed and reported as `undefined`; because nothing
 * is cached in that case, the next call tries the require again.
 */
function getBridge() {
  if (_bridge) return _bridge
  try {
    _bridge =
      require('../win32/bridgeClient.js') as typeof import('../win32/bridgeClient.js')
  } catch {}
  return _bridge
}
/**
 * Invoke `method` on the bridge synchronously.
 *
 * Returns null when the bridge module is unavailable or the call throws, so
 * callers can treat null as "fall back to another mechanism".
 */
function bridgeCallSync<T>(
  method: string,
  params: Record<string, unknown> = {},
): T | null {
  try {
    const bridge = getBridge()
    return bridge ? bridge.callSync<T>(method, params) : null
  } catch {
    return null
  }
}
// validateHwnd, ps, psAsync, VK_MAP, MODIFIER_KEYS imported from '../win32/shared.js'
// ---------------------------------------------------------------------------
// Win32 P/Invoke types (compiled once per PS session)
// ---------------------------------------------------------------------------
/**
 * PowerShell prelude that compiles the `CuWin32` P/Invoke helper class
 * (cursor, SendInput, keyboard and foreground-window APIs plus constants).
 * It is prepended to the PowerShell snippets that call `[CuWin32]::...`.
 *
 * INPUT / KINPUT use sequential layout rather than an explicit
 * `FieldOffset(4)` for the payload: the native INPUT union starts at offset
 * 4 on 32-bit but at offset 8 on 64-bit Windows (it contains a pointer), so
 * a hard-coded offset of 4 shifts every field by four bytes in a 64-bit
 * PowerShell. Sequential layout lets the marshaller pick the right offset.
 * KINPUT carries two trailing pad words so that its marshalled size equals
 * sizeof(INPUT) (28 bytes on x86, 40 on x64), which SendInput requires for
 * its `cb` argument.
 *
 * The here-string delimiters (`@'` / `'@`) must stay at the start of their
 * lines, so the script body is deliberately not indented.
 */
const WIN32_TYPES = `
Add-Type -Language CSharp @'
using System;
using System.Runtime.InteropServices;
using System.Text;
using System.Diagnostics;
public class CuWin32 {
// --- Cursor ---
[DllImport("user32.dll")] public static extern bool SetCursorPos(int X, int Y);
[DllImport("user32.dll")] public static extern bool GetCursorPos(out POINT p);
[StructLayout(LayoutKind.Sequential)] public struct POINT { public int X; public int Y; }
// --- SendInput ---
[StructLayout(LayoutKind.Sequential)] public struct MOUSEINPUT {
public int dx; public int dy; public int mouseData; public uint dwFlags; public uint time; public IntPtr dwExtraInfo;
}
[StructLayout(LayoutKind.Sequential)] public struct INPUT {
public uint type;
public MOUSEINPUT mi;
}
[StructLayout(LayoutKind.Sequential)] public struct KEYBDINPUT {
public ushort wVk; public ushort wScan; public uint dwFlags; public uint time; public IntPtr dwExtraInfo;
}
[StructLayout(LayoutKind.Sequential)] public struct KINPUT {
public uint type;
public KEYBDINPUT ki;
public uint pad0; public uint pad1;
}
[DllImport("user32.dll", SetLastError=true)] public static extern uint SendInput(uint n, INPUT[] i, int cb);
[DllImport("user32.dll", SetLastError=true)] public static extern uint SendInput(uint n, KINPUT[] i, int cb);
// --- Keyboard ---
[DllImport("user32.dll")] public static extern void keybd_event(byte bVk, byte bScan, uint dwFlags, UIntPtr dwExtraInfo);
[DllImport("user32.dll")] public static extern short VkKeyScan(char ch);
// --- Window ---
[DllImport("user32.dll")] public static extern IntPtr GetForegroundWindow();
[DllImport("user32.dll")] public static extern uint GetWindowThreadProcessId(IntPtr hWnd, out uint pid);
[DllImport("user32.dll", CharSet=CharSet.Unicode)] public static extern int GetWindowText(IntPtr hWnd, StringBuilder sb, int max);
// Constants
public const uint INPUT_MOUSE = 0, INPUT_KEYBOARD = 1;
public const uint MOUSEEVENTF_LEFTDOWN = 0x0002, MOUSEEVENTF_LEFTUP = 0x0004;
public const uint MOUSEEVENTF_RIGHTDOWN = 0x0008, MOUSEEVENTF_RIGHTUP = 0x0010;
public const uint MOUSEEVENTF_MIDDLEDOWN = 0x0020, MOUSEEVENTF_MIDDLEUP = 0x0040;
public const uint MOUSEEVENTF_WHEEL = 0x0800, MOUSEEVENTF_HWHEEL = 0x1000;
public const uint KEYEVENTF_KEYUP = 0x0002;
}
'@
`
// VK_MAP and MODIFIER_KEYS imported from '../win32/shared.js'
// ---------------------------------------------------------------------------
// Session-level HWND binding — all operations target this handle
// ---------------------------------------------------------------------------
// Module-level binding state, mutated by bindWindow / bindFile / unbindWindow.
// HWND of the bound window, or null when no window is bound.
let boundHwnd: string | null = null
// PID passed to bindWindow (null when omitted or when not window-bound).
let boundPid: number | null = null
// Kind of the bound target: 'generic' after bindWindow, caller-supplied after bindFile.
let boundAppType: import('../win32/appDispatcher.js').AppType | null = null
// File path set by bindFile; null otherwise.
let boundFilePath: string | null = null
/**
 * Get the bound HWND, or null if not bound.
 * Also null after bindFile(), which binds a file rather than a window.
 */
export function getBoundHwnd(): string | null {
return boundHwnd
}
/**
 * Get the bound app type, or null when nothing is bound.
 * bindWindow() sets it to 'generic'; bindFile() sets the caller's value.
 */
export function getBoundAppType(): string | null {
return boundAppType
}
/**
 * Bind to a window HWND — all subsequent input/screenshot operations target this handle.
 *
 * Steps, in order:
 *  - validate the handle with validateHwnd (its result is what gets
 *    interpolated into the PowerShell script below);
 *  - tear down the overlays of any previously bound window;
 *  - record the new binding (app type 'generic', no file path);
 *  - briefly activate the target window and then hand the foreground back;
 *  - show the border, virtual cursor and input indicator on the target.
 *
 * @param hwnd Window handle as a string.
 * @param pid  Optional owning process id; stored as null when omitted.
 */
export function bindWindow(hwnd: string, pid?: number): void {
hwnd = validateHwnd(hwnd)
// Clean up previous binding
if (boundHwnd) {
unmarkBound(boundHwnd)
hideVirtualCursor()
hideIndicator()
}
boundHwnd = hwnd
boundPid = pid ?? null
boundAppType = 'generic'
boundFilePath = null
// 1. Brief activation: set the window to accept input, then restore user's focus.
// Some apps (UWP/Electron) don't process SendMessage when never-activated.
// Save current foreground → activate target → restore original foreground.
// In the script: ShowWindow command 9 is SW_RESTORE (un-minimize), and the
// previous foreground window is only restored when it existed and differs
// from the target.
const activateScript = `
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class CuActivate {
[DllImport("user32.dll")] public static extern IntPtr GetForegroundWindow();
[DllImport("user32.dll")] public static extern bool SetForegroundWindow(IntPtr h);
[DllImport("user32.dll")] public static extern bool IsIconic(IntPtr h);
[DllImport("user32.dll")] public static extern bool ShowWindow(IntPtr h, int cmd);
}
'@
$prev = [CuActivate]::GetForegroundWindow()
$target = [IntPtr]::new([long]${hwnd})
if ([CuActivate]::IsIconic($target)) { [CuActivate]::ShowWindow($target, 9) | Out-Null }
[CuActivate]::SetForegroundWindow($target) | Out-Null
Start-Sleep -Milliseconds 100
if ($prev -ne [IntPtr]::Zero -and $prev -ne $target) {
[CuActivate]::SetForegroundWindow($prev) | Out-Null
}
`
ps(activateScript)
// 2. Visual indicators
markBound(hwnd)
showVirtualCursor(hwnd)
showIndicator(hwnd)
}
/**
 * Bind to a COM-controlled file (Excel/Word — no window needed).
 *
 * Any existing window binding is replaced. Its border, virtual cursor and
 * input indicator are removed first: once `boundHwnd` is cleared the handle
 * is lost, and unbindWindow() could no longer remove that window's border.
 *
 * @param filePath Path of the document to operate on.
 * @param appType  Controller type used to drive the document.
 */
export function bindFile(
  filePath: string,
  appType: import('../win32/appDispatcher.js').AppType,
): void {
  // Clean up previous window binding (same teardown as bindWindow).
  if (boundHwnd) {
    unmarkBound(boundHwnd)
    hideVirtualCursor()
    hideIndicator()
  }
  boundHwnd = null
  boundPid = null
  boundAppType = appType
  boundFilePath = filePath
}
/**
 * Unbind — revert to global mode, remove overlays.
 *
 * Safe to call when nothing is bound: the border is only removed if a
 * window handle is recorded, while the cursor/indicator hides and the cache
 * clear run unconditionally. All binding state is reset to null afterwards.
 */
export function unbindWindow(): void {
if (boundHwnd) unmarkBound(boundHwnd)
hideVirtualCursor()
hideIndicator()
// Clear cached edit-child / InputSite mappings
// (getWm() loads the window-message module if it was not loaded yet.)
getWm().clearEditChildCache()
boundHwnd = null
boundPid = null
boundAppType = null
boundFilePath = null
}
// ---------------------------------------------------------------------------
// Window Message module (lazy loaded)
// ---------------------------------------------------------------------------
// Window-message module, populated by the first getWm() call.
let _wm: typeof import('../win32/windowMessage.js') | undefined

/**
 * Return the window-message module, requiring it on first use and reusing
 * the same instance afterwards. A failing require propagates to the caller.
 */
function getWm() {
  if (_wm === undefined || _wm === null) {
    // eslint-disable-next-line @typescript-eslint/no-require-imports
    _wm = require('../win32/windowMessage.js') as typeof import('../win32/windowMessage.js')
  }
  return _wm
}
// ---------------------------------------------------------------------------
// Input
//
// Bound mode (an HWND is bound): ALL operations go through SendMessage and
// the virtual cursor. NO global API (SetCursorPos/SendInput/keybd_event/
// SendKeys) is used, so the user's desktop — real cursor and focus — is
// never disturbed.
//
// Unbound mode: mouse movement and clicks fall back to the global
// SetCursorPos/SendInput path. Global text input is DISABLED to avoid
// interfering with the user: typeText throws unless a window or file is bound.
// ---------------------------------------------------------------------------
/**
 * Mouse / keyboard input backend for Windows.
 *
 * Bound mode (boundHwnd set): every operation is delivered to the bound
 * window through the window-message module or a targeted SendMessage script;
 * the real cursor and keyboard focus are left alone.
 * Unbound mode: mouse operations fall back to SetCursorPos / SendInput, while
 * text and key input throw because there is no window to target.
 */
const input: InputPlatform = {
  /** Move the pointer: virtual cursor overlay when bound, real cursor otherwise. */
  async moveMouse(x, y) {
    const px = Math.round(x)
    const py = Math.round(y)
    if (boundHwnd) {
      // Bound mode: move virtual cursor (visual only), no real cursor movement
      moveVirtualCursor(px, py)
      return
    }
    ps(`${WIN32_TYPES}; [CuWin32]::SetCursorPos(${px}, ${py}) | Out-Null`)
  },
  /** Click at (x, y) with the given button. */
  async click(x, y, button) {
    const px = Math.round(x)
    const py = Math.round(y)
    if (boundHwnd) {
      moveVirtualCursor(px, py, true)
      // Prefer the edit child reported by findEditChild(); otherwise target
      // the bound top-level window itself.
      const editHwnd = getWm().findEditChild(boundHwnd)
      const targetHwnd = editHwnd ?? boundHwnd
      const ok = getWm().sendClick(targetHwnd, px, py, button)
      // Only fall back when the first attempt went to a different window —
      // re-sending the identical click to the same HWND gains nothing.
      if (!ok && targetHwnd !== boundHwnd) {
        getWm().sendClick(boundHwnd, px, py, button)
      }
      return
    }
    const buttonName =
      button === 'left' ? 'LEFT' : button === 'right' ? 'RIGHT' : 'MIDDLE'
    const downFlag = `MOUSEEVENTF_${buttonName}DOWN`
    const upFlag = `MOUSEEVENTF_${buttonName}UP`
    ps(
      `${WIN32_TYPES}; [CuWin32]::SetCursorPos(${px}, ${py}) | Out-Null; $i = New-Object CuWin32+INPUT; $i.type=[CuWin32]::INPUT_MOUSE; $i.mi.dwFlags=[CuWin32]::${downFlag}; [CuWin32]::SendInput(1, @($i), [Runtime.InteropServices.Marshal]::SizeOf($i)) | Out-Null; $i.mi.dwFlags=[CuWin32]::${upFlag}; [CuWin32]::SendInput(1, @($i), [Runtime.InteropServices.Marshal]::SizeOf($i)) | Out-Null`,
    )
  },
  /**
   * Type text into the bound target.
   * @throws Error when nothing is bound, or when the window-message send fails.
   */
  async typeText(text) {
    // COM-controlled apps: write directly via COM API
    if (boundAppType === 'word' && boundFilePath) {
      // eslint-disable-next-line @typescript-eslint/no-require-imports
      const { appendText } =
        require('../win32/comWord.js') as typeof import('../win32/comWord.js')
      appendText(boundFilePath, text)
      return
    }
    // HWND-bound apps: SendMessageW(WM_CHAR) or clipboard paste
    if (boundHwnd) {
      const ok = getWm().sendText(boundHwnd, text)
      if (!ok) {
        throw new Error(
          `typeText failed: SendMessage to HWND ${boundHwnd} returned false. ` +
            `The edit control may not have been found (findEditChild returned null).`,
        )
      }
      return
    }
    throw new Error(
      'typeText requires a bound window or file. Call open() first.',
    )
  },
  /**
   * Press or release a single key on the bound window.
   *
   * Named keys are resolved through VK_MAP. A bare letter or digit maps to its
   * virtual-key code, which is the UPPERCASE ASCII code (VK_A = 0x41); the raw
   * char code of 'a' (0x61) would be VK_NUMPAD1. Any other single character
   * has no reliable virtual-key code (e.g. '.' is 0x2E = VK_DELETE), so it is
   * delivered as a character on key-down instead. Unknown multi-character
   * names remain a no-op.
   *
   * @throws Error when no window is bound.
   */
  async key(name, action) {
    if (boundHwnd) {
      const lower = name.toLowerCase()
      const vk =
        VK_MAP[lower] ??
        (/^[a-z0-9]$/.test(lower) ? lower.toUpperCase().charCodeAt(0) : 0)
      if (vk) {
        getWm().sendKey(boundHwnd, vk, action === 'release' ? 'up' : 'down')
      } else if (name.length === 1 && action !== 'release') {
        getWm().sendChar(boundHwnd, name)
      }
      return
    }
    throw new Error('key requires a bound window HWND. Call open() first.')
  },
  /**
   * Send a key combination to the bound window.
   * @throws Error when no window is bound or the send fails.
   */
  async keys(parts) {
    if (boundHwnd) {
      const ok = getWm().sendKeys(boundHwnd, parts)
      if (!ok) {
        throw new Error(`keys [${parts.join('+')}] failed on HWND ${boundHwnd}`)
      }
      return
    }
    throw new Error('keys requires a bound window HWND. Call open() first.')
  },
  /**
   * Scroll by `amount` lines/notches. Positive amounts scroll down (vertical)
   * or right (horizontal) in both bound and unbound mode.
   */
  async scroll(amount, direction) {
    if (boundHwnd) {
      // WM_VSCROLL / WM_HSCROLL for window-bound scrolling
      const msg = direction === 'vertical' ? '0x0115' : '0x0114' // WM_VSCROLL / WM_HSCROLL
      const wParam = amount > 0 ? '1' : '0' // SB_LINEDOWN=1 (positive=down) / SB_LINEUP=0 (negative=up)
      const n = Math.abs(Math.round(amount))
      let script = `
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class WScroll {
[DllImport("user32.dll", CharSet=CharSet.Unicode, EntryPoint="SendMessageW")]
public static extern IntPtr SendMessage(IntPtr h, uint m, IntPtr w, IntPtr l);
}
'@
`
      // One line-scroll message per requested step.
      for (let i = 0; i < n; i++) {
        script += `[WScroll]::SendMessage([IntPtr]::new([long]${boundHwnd}), ${msg}, [IntPtr]${wParam}, [IntPtr]::Zero) | Out-Null; `
      }
      ps(script)
      return
    }
    const flag =
      direction === 'vertical' ? 'MOUSEEVENTF_WHEEL' : 'MOUSEEVENTF_HWHEEL'
    // A positive vertical wheel delta means "rotated away from the user"
    // (scroll UP), the opposite of the positive=down convention used by the
    // bound branch above, so the vertical delta is negated. Horizontal deltas
    // are already positive=right.
    const notches = Math.round(amount * 120)
    const wheelDelta = direction === 'vertical' ? -notches : notches
    ps(
      `${WIN32_TYPES}; $i = New-Object CuWin32+INPUT; $i.type=[CuWin32]::INPUT_MOUSE; $i.mi.dwFlags=[CuWin32]::${flag}; $i.mi.mouseData=${wheelDelta}; [CuWin32]::SendInput(1, @($i), [Runtime.InteropServices.Marshal]::SizeOf($i)) | Out-Null`,
    )
  },
  /** Report the real cursor position; read-only, never moves the cursor. */
  async mouseLocation() {
    // Always returns real cursor position (informational, doesn't move it)
    const out = ps(
      `${WIN32_TYPES}; $p = New-Object CuWin32+POINT; [CuWin32]::GetCursorPos([ref]$p) | Out-Null; "$($p.X),$($p.Y)"`,
    )
    const [xStr, yStr] = out.split(',')
    return { x: Number(xStr), y: Number(yStr) }
  },
  /** Send one character to an explicit HWND (independent of the binding). */
  async sendChar(hwnd, char) {
    getWm().sendChar(String(hwnd), char)
  },
  /** Send one virtual-key event to an explicit HWND. */
  async sendKey(hwnd, vk, action) {
    getWm().sendKey(String(hwnd), vk, action)
  },
  /** Send a click to an explicit HWND. */
  async sendClick(hwnd, x, y, button) {
    getWm().sendClick(String(hwnd), x, y, button)
  },
  /** Send text to an explicit HWND. */
  async sendText(hwnd, text) {
    getWm().sendText(String(hwnd), text)
  },
}
// ---------------------------------------------------------------------------
// Screenshot — JPEG output only
// ---------------------------------------------------------------------------
/**
 * Screenshot backend. All captures are produced by the Python bridge; when a
 * window is bound, screen and region captures are redirected to that window.
 */
const screenshot: ScreenshotPlatform = {
  /**
   * Capture the bound window if there is one, otherwise the given display
   * (display 0 when omitted).
   * @throws Error when the bridge returns no image data.
   */
  async captureScreen(displayId) {
    if (boundHwnd) {
      const windowShot = this.captureWindow?.(String(boundHwnd))
      if (windowShot) return windowShot
    }
    // Python Bridge (mss + Pillow, ~300ms)
    const screenShot = bridgeCallSync<ScreenshotResult>('screenshot', {
      display_id: displayId ?? 0,
    })
    if (!screenShot || !screenShot.base64) {
      throw new Error(
        '[computer-use] Screenshot failed: Python bridge returned no data. ' +
          'Ensure python3 + mss + Pillow are installed (pip install mss Pillow).',
      )
    }
    return screenShot
  },
  /**
   * Region capture. The x/y/w/h arguments are not used: a bound window is
   * captured whole (the window IS the region), and without a binding the call
   * degrades to a full-screen capture.
   */
  async captureRegion(x, y, w, h) {
    if (boundHwnd) {
      const windowShot = this.captureWindow?.(String(boundHwnd))
      if (windowShot) return windowShot
    }
    return this.captureScreen()
  },
  /**
   * Capture a single window by HWND.
   * @throws Error when the bridge returns no image data.
   */
  captureWindow(hwnd) {
    // Python Bridge (ctypes PrintWindow + GDI → Pillow JPEG, ~300ms)
    const windowShot = bridgeCallSync<ScreenshotResult>('screenshot_window', {
      hwnd: String(hwnd),
    })
    if (!windowShot || !windowShot.base64) {
      throw new Error(
        `[computer-use] Window screenshot failed for HWND ${hwnd}: Python bridge returned no data.`,
      )
    }
    return windowShot
  },
}
// ---------------------------------------------------------------------------
// Display — Screen.AllScreens
// ---------------------------------------------------------------------------
/**
 * Display enumeration via System.Windows.Forms.Screen. scaleFactor is always
 * reported as 1, and a single 1920x1080 display is assumed when the query
 * throws.
 */
const display: DisplayPlatform = {
  /** List every screen as width/height/index; index order follows AllScreens. */
  listAll(): DisplayInfo[] {
    try {
      const raw = ps(`
Add-Type -AssemblyName System.Windows.Forms
$result = @()
$idx = 0
foreach ($s in [System.Windows.Forms.Screen]::AllScreens) {
$result += "$($s.Bounds.Width),$($s.Bounds.Height),$idx,$($s.Primary)"
$idx++
}
$result -join "|"
`)
      const displays: DisplayInfo[] = []
      for (const entry of raw.split('|')) {
        if (!entry) continue
        // Entry layout: width,height,index,isPrimary — the last field is unused.
        const fields = entry.split(',')
        displays.push({
          width: Number(fields[0]),
          height: Number(fields[1]),
          scaleFactor: 1,
          displayId: Number(fields[2]),
        })
      }
      return displays
    } catch {
      return [{ width: 1920, height: 1080, scaleFactor: 1, displayId: 0 }]
    }
  },
  /**
   * Size of the requested display; falls back to the first display, then to a
   * 1920x1080 default, when the id is omitted or unknown.
   */
  getSize(displayId): DisplayInfo {
    const displays = this.listAll()
    const requested =
      displayId === undefined
        ? undefined
        : displays.find(d => d.displayId === displayId)
    return (
      requested ??
      displays[0] ?? { width: 1920, height: 1080, scaleFactor: 1, displayId: 0 }
    )
  },
}
// ---------------------------------------------------------------------------
// Find existing window by process name or title (avoid launching new instance)
// ---------------------------------------------------------------------------
/**
 * Find an already-open window whose title contains `hint` (case-insensitive).
 *
 * Matching is by window title only. A blank hint never matches: every title
 * "contains" the empty string, so without this guard an empty hint would
 * bind to whichever window happens to be listed first.
 *
 * @param hint - App name or title fragment to look for.
 * @returns HWND and PID of the first matching window, or null.
 */
function findExistingWindow(
  hint: string,
): { hwnd: string; pid: number } | null {
  const needle = hint.trim().toLowerCase()
  if (!needle) return null
  const match = listWindows().find(w =>
    (w.title ?? '').toLowerCase().includes(needle),
  )
  return match ? { hwnd: match.hwnd, pid: match.pid } : null
}
// ---------------------------------------------------------------------------
// Apps — EnumWindows + registry + AppxPackage
// ---------------------------------------------------------------------------
/**
 * Application discovery and launching for Windows.
 *
 * Running windows come from listWindows(); installed apps from the Uninstall
 * registry keys plus Get-AppxPackage; launching is done by a PowerShell
 * script that starts the process, waits for its window, parks it offscreen
 * and reports the HWND so the window can be bound.
 */
const apps: AppsPlatform = {
  /** One entry per window returned by listWindows(). */
  listRunning(): WindowHandle[] {
    const windows = listWindows()
    return windows.map(w => ({
      id: String(w.hwnd),
      pid: w.pid,
      title: w.title,
    }))
  },
  /**
   * List installed applications (at most 300, de-duplicated).
   * Returns an empty list if the query fails.
   */
  async listInstalled(): Promise<InstalledApp[]> {
    try {
      const raw = await psAsync(`
$apps = @()
# Traditional Win32 apps from registry
$paths = @(
'HKLM:\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*',
'HKLM:\\SOFTWARE\\WOW6432Node\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*',
'HKCU:\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*'
)
foreach ($p in $paths) {
Get-ItemProperty $p -ErrorAction SilentlyContinue | Where-Object { $_.DisplayName } | ForEach-Object {
$apps += "$($_.DisplayName)|$($_.InstallLocation)|$($_.PSChildName)"
}
}
# UWP/MSIX apps (Windows 10/11 Store apps)
Get-AppxPackage -ErrorAction SilentlyContinue | Where-Object { $_.IsFramework -eq $false -and $_.SignatureKind -eq 'Store' } | ForEach-Object {
$cleanName = $_.Name -replace '^Microsoft\\.Windows', '' -replace '^Microsoft\\.', ''
$apps += "$cleanName|$($_.InstallLocation)|$($_.PackageFamilyName)"
}
$apps | Select-Object -Unique | Select-Object -First 300
`)
      return raw
        .split('\n')
        .filter(Boolean)
        .map(line => {
          // Line layout: displayName|installLocation|id
          const [name, path, id] = line.trim().split('|', 3)
          return {
            id: (id ?? name ?? '').trim(),
            displayName: (name ?? '').trim(),
            path: (path ?? '').trim(),
          }
        })
    } catch {
      return []
    }
  },
  /**
   * Open an app or file and bind to it.
   *
   * - Excel/Word targets are handed to openWithController() and bound by file.
   * - Anything else is bound by HWND: an existing window whose title matches
   *   is reused, otherwise the app is launched and its new window is bound.
   *
   * @throws Error when the launcher produces no output, the process cannot be
   *   started, or no window for the new process is found.
   */
  async open(name) {
    // Detect app type and route to appropriate controller
    const appType = detectAppType(name)
    // Excel/Word → COM automation (no window, no HWND)
    if (appType === 'excel' || appType === 'word') {
      const result = await openWithController(name)
      if (result.filePath) {
        bindFile(result.filePath, result.type)
      }
      return
    }
    // Text/Browser/Generic → exe launch + HWND bind (offscreen)
    // If name is a UWP PackageFamilyName (e.g. Microsoft.WindowsNotepad_8wekyb3d8bbwe),
    // extract the app name and try as exe. This avoids launching through UWP shell.
    //
    // A PackageFamilyName is "<Name>_<PublisherId>" where PublisherId is 13
    // alphanumeric characters and Name contains no path separators. Checking
    // only for '_' and '.' would also mangle ordinary inputs such as
    // "C:\Users\john_doe\notes.log" or "my_app.exe".
    let launchName = name
    const isPackageFamilyName =
      name.includes('.') && /^[a-z0-9.-]+_[a-z0-9]{13}$/i.test(name)
    if (isPackageFamilyName) {
      // Microsoft.WindowsNotepad_xxx → Notepad
      // Microsoft.WindowsCalculator_xxx → Calculator
      // Microsoft.WindowsTerminal_xxx → Terminal
      const parts = name.split('_')[0]?.split('.') ?? []
      const appPart = parts[parts.length - 1] ?? name
      // Strip "Windows" prefix: WindowsNotepad → Notepad
      launchName = appPart.replace(/^Windows/, '') || appPart
    }
    // --- Try to find an EXISTING window first (by title) ---
    // If found, auto-bind to it. Use bind_window tool to switch later.
    const existingHwnd = findExistingWindow(launchName)
    if (existingHwnd) {
      bindWindow(existingHwnd.hwnd, existingHwnd.pid)
      return
    }
    // Single quotes are doubled so the name is safe inside a PowerShell
    // single-quoted string.
    const escaped = launchName.replace(/'/g, "''")
    const result = await psAsync(`
${WIN32_TYPES}
Add-Type @'
using System;
using System.Runtime.InteropServices;
using System.Text;
public class CuLaunch {
public delegate bool EnumProc(IntPtr h, IntPtr lp);
[DllImport("user32.dll")] public static extern bool EnumWindows(EnumProc cb, IntPtr lp);
[DllImport("user32.dll")] public static extern bool IsWindowVisible(IntPtr h);
[DllImport("user32.dll")] public static extern uint GetWindowThreadProcessId(IntPtr h, out uint pid);
[DllImport("user32.dll", CharSet=CharSet.Unicode)] public static extern int GetWindowText(IntPtr h, StringBuilder sb, int n);
[DllImport("user32.dll")] public static extern bool ShowWindow(IntPtr h, int cmd);
public const int SW_SHOWMINNOACTIVE = 7;
// Get all visible window HWNDs as array
public static long[] GetAllVisibleHwnds() {
var list = new System.Collections.Generic.List<long>();
EnumWindows((h, _) => {
if (IsWindowVisible(h)) list.Add(h.ToInt64());
return true;
}, IntPtr.Zero);
return list.ToArray();
}
// Get PID for a single HWND
public static uint GetPidForHwnd(long hwnd) {
uint pid; GetWindowThreadProcessId((IntPtr)hwnd, out pid);
return pid;
}
// Get title for a single HWND
public static string GetTitle(long hwnd) {
var sb = new StringBuilder(256);
GetWindowText((IntPtr)hwnd, sb, 256);
return sb.ToString();
}
}
'@
# Snapshot ALL visible window HWNDs BEFORE launching, so a window that shows
# up right after process start is still recognised as new.
$beforeHwnds = [CuLaunch]::GetAllVisibleHwnds()
# Launch strategy: all exe-based, no GUI dialogs.
# 1) exact path 2) exe in PATH 3) registry install dir 4) raw name
$target = '${escaped}'
$proc = $null
# 1. Exact file path
if (Test-Path $target) {
$proc = Start-Process $target -PassThru -ErrorAction SilentlyContinue
}
# 2. exe name in PATH (notepad.exe, code.exe, chrome.exe, etc.)
if (-not $proc) {
# Try with .exe suffix if not already
$tryExe = if ($target -notmatch '[.]exe$') { "$target.exe" } else { $target }
$found = Get-Command $tryExe -ErrorAction SilentlyContinue | Select-Object -First 1
if (-not $found) { $found = Get-Command $target -ErrorAction SilentlyContinue | Select-Object -First 1 }
if ($found) { $proc = Start-Process $found.Source -PassThru -ErrorAction SilentlyContinue }
}
# 3. Search registry for install location by display name → find .exe
if (-not $proc) {
$regPaths = @('HKLM:\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*','HKLM:\\SOFTWARE\\WOW6432Node\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*','HKCU:\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*')
foreach ($p in $regPaths) {
$app = Get-ItemProperty $p -ErrorAction SilentlyContinue | Where-Object {
$_.DisplayName -and $_.DisplayName -match [regex]::Escape($target)
} | Select-Object -First 1
if ($app) {
# Try DisplayIcon (often the exe path), then InstallLocation
$exePath = $null
if ($app.DisplayIcon -and $app.DisplayIcon -match '[.]exe') {
$exePath = ($app.DisplayIcon -split ',')[0].Trim('"')
}
if (-not $exePath -and $app.InstallLocation) {
$exeFile = Get-ChildItem $app.InstallLocation -Filter '*.exe' -ErrorAction SilentlyContinue | Select-Object -First 1
if ($exeFile) { $exePath = $exeFile.FullName }
}
if ($exePath -and (Test-Path $exePath)) {
$proc = Start-Process $exePath -PassThru -ErrorAction SilentlyContinue
break
}
}
}
}
# 4. Last resort: direct Start-Process (Windows may resolve it)
if (-not $proc) { $proc = Start-Process -FilePath $target -PassThru -ErrorAction SilentlyContinue }
if (-not $proc) { Write-Host "LAUNCH_FAILED"; exit }
# Wait for a NEW window from our process PID
$hwnd = 0
for ($i = 0; $i -lt 50; $i++) {
Start-Sleep -Milliseconds 200
$afterHwnds = [CuLaunch]::GetAllVisibleHwnds()
# Find new windows (in after but not in before)
foreach ($h in $afterHwnds) {
if ($beforeHwnds -contains $h) { continue }
# New window found — check PID
$wPid = [CuLaunch]::GetPidForHwnd($h)
if ($wPid -eq [uint32]$proc.Id) {
$hwnd = $h; break # exact PID match
}
}
if ($hwnd -ne 0) { break }
# PID didn't match (process redirect) — accept new window matching title hint
if ($i -gt 10) {
$hint = '${escaped}'.Split('\\')[-1].Replace('.exe','')
foreach ($h in $afterHwnds) {
if ($beforeHwnds -contains $h) { continue }
$title = [CuLaunch]::GetTitle($h)
if ($title -and $title.IndexOf($hint, [StringComparison]::OrdinalIgnoreCase) -ge 0) {
$hwnd = $h; break
}
}
if ($hwnd -ne 0) { break }
}
}
if ($hwnd -eq 0) { Write-Host "HWND_NOT_FOUND|$($proc.Id)"; exit }
# Move offscreen instead of minimizing — keeps window restored so
# PrintWindow and SendMessage work without needing restore/re-minimize.
# User cannot see the window at -32000,-32000.
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class CuPos {
[DllImport("user32.dll")] public static extern bool SetWindowPos(IntPtr h, IntPtr a, int x, int y, int w, int h2, uint f);
public const uint SWP_NOSIZE = 0x0001;
public const uint SWP_NOZORDER = 0x0004;
public const uint SWP_NOACTIVATE = 0x0010;
}
'@
[CuPos]::SetWindowPos([IntPtr]::new([long]$hwnd), [IntPtr]::Zero, -32000, -32000, 0, 0, [CuPos]::SWP_NOSIZE -bor [CuPos]::SWP_NOZORDER -bor [CuPos]::SWP_NOACTIVATE) | Out-Null
Write-Host "$hwnd|$($proc.Id)"
`)
    if (!result) {
      throw new Error(
        `open(): failed to launch '${name}' — no output from launcher script`,
      )
    }
    if (result.startsWith('LAUNCH_FAILED')) {
      throw new Error(
        `open(): failed to launch '${name}' — process did not start (${result})`,
      )
    }
    if (result.startsWith('HWND_NOT_FOUND')) {
      throw new Error(
        `open(): launched '${name}' but could not find its window HWND (${result})`,
      )
    }
    // Success output is "<hwnd>|<pid>".
    const parts = result.trim().split('|')
    const hwnd = parts[0]!.trim()
    const pid = Number(parts[1])
    if (hwnd && hwnd !== '0') {
      // Bind to the launched window — all subsequent operations target this HWND
      bindWindow(hwnd, pid)
    }
  },
  /**
   * Executable path and process name of the foreground window's process, or
   * null when they cannot be determined.
   */
  getFrontmostApp(): FrontmostAppInfo | null {
    try {
      const out = ps(`${WIN32_TYPES}
$hwnd = [CuWin32]::GetForegroundWindow()
$procId = [uint32]0
[CuWin32]::GetWindowThreadProcessId($hwnd, [ref]$procId) | Out-Null
$proc = Get-Process -Id $procId -ErrorAction SilentlyContinue
"$($proc.MainModule.FileName)|$($proc.ProcessName)"`)
      if (!out || !out.includes('|')) return null
      const [exePath, appName] = out.split('|', 2)
      return { id: exePath!, appName: appName! }
    } catch {
      return null
    }
  },
  /**
   * First window whose title contains `title` (case-sensitive), or null.
   * Windows without a title are skipped rather than throwing.
   */
  findWindowByTitle(title): WindowHandle | null {
    const windows = listWindows()
    const found = windows.find(w => (w.title ?? '').includes(title))
    if (!found) return null
    return { id: String(found.hwnd), pid: found.pid, title: found.title }
  },
}
// ---------------------------------------------------------------------------
// Window Management — Win32 API calls targeted at bound HWND.
// NO global shortcuts (Win+Down, Alt+F4, etc.)
// Uses ShowWindow, SetWindowPos, SendMessage(WM_CLOSE) directly.
// ---------------------------------------------------------------------------
/**
 * PowerShell prelude that compiles the CuWinMgmt helper class via Add-Type.
 *
 * It exposes the user32.dll entry points used by windowManagement
 * (ShowWindow, SetWindowPos, GetWindowRect, SetForegroundWindow,
 * BringWindowToTop, SendMessageW, IsIconic, IsZoomed), the RECT struct, and
 * the ShowWindow / SetWindowPos / WM_CLOSE / WM_SYSCOMMAND constants.
 * Prepend it to any script that references [CuWinMgmt].
 */
const WINDOW_MGMT_TYPES = `
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class CuWinMgmt {
[DllImport("user32.dll")]
public static extern bool ShowWindow(IntPtr hWnd, int nCmdShow);
[DllImport("user32.dll")]
public static extern bool SetWindowPos(IntPtr hWnd, IntPtr hAfter, int X, int Y, int cx, int cy, uint uFlags);
[DllImport("user32.dll")]
public static extern bool GetWindowRect(IntPtr hWnd, out RECT lpRect);
[DllImport("user32.dll")]
public static extern bool SetForegroundWindow(IntPtr hWnd);
[DllImport("user32.dll")]
public static extern bool BringWindowToTop(IntPtr hWnd);
[DllImport("user32.dll", CharSet=CharSet.Unicode, EntryPoint="SendMessageW")]
public static extern IntPtr SendMessage(IntPtr hWnd, uint Msg, IntPtr wParam, IntPtr lParam);
[DllImport("user32.dll")]
public static extern bool IsIconic(IntPtr hWnd);
[DllImport("user32.dll")]
public static extern bool IsZoomed(IntPtr hWnd);
[StructLayout(LayoutKind.Sequential)]
public struct RECT {
public int Left; public int Top; public int Right; public int Bottom;
}
// ShowWindow constants
public const int SW_MINIMIZE = 6;
public const int SW_MAXIMIZE = 3;
public const int SW_RESTORE = 9;
public const int SW_SHOWNOACTIVATE = 4;
public const int SW_SHOWMINNOACTIVE = 7;
// SetWindowPos flags
public const uint SWP_NOSIZE = 0x0001;
public const uint SWP_NOMOVE = 0x0002;
public const uint SWP_NOZORDER = 0x0004;
public const uint SWP_NOACTIVATE = 0x0010;
public const uint SWP_SHOWWINDOW = 0x0040;
// WM_CLOSE
public const uint WM_CLOSE = 0x0010;
// WM_SYSCOMMAND
public const uint WM_SYSCOMMAND = 0x0112;
public const int SC_MINIMIZE = 0xF020;
public const int SC_MAXIMIZE = 0xF030;
public const int SC_RESTORE = 0xF120;
public const int SC_CLOSE = 0xF060;
}
'@
`
import type { WindowManagementPlatform, WindowAction } from './types.js'
/**
 * Window management for the bound window. Every action is a Win32 call
 * addressed to boundHwnd; nothing is done (and false/null is returned) when
 * no window is bound.
 */
const windowManagement: WindowManagementPlatform = {
  /**
   * Perform `action` on the bound window.
   *
   * @param action - Which operation to run.
   * @param opts - Position/size; only consulted for 'move_resize'.
   * @returns true when the action was carried out, false when no window is
   *   bound, the action is unknown, or 'move_resize' could not be applied.
   */
  manageWindow(action: WindowAction, opts?): boolean {
    if (!boundHwnd) return false
    const hwnd = boundHwnd
    switch (action) {
      case 'minimize': {
        // ShowWindow(SW_SHOWMINNOACTIVE) — minimizes this HWND without
        // activating it; any script output counts as success.
        const r = ps(
          `${WINDOW_MGMT_TYPES}; [CuWinMgmt]::ShowWindow([IntPtr]::new([long]${hwnd}), [CuWinMgmt]::SW_SHOWMINNOACTIVE)`,
        )
        return r !== ''
      }
      case 'maximize': {
        const r = ps(
          `${WINDOW_MGMT_TYPES}; [CuWinMgmt]::ShowWindow([IntPtr]::new([long]${hwnd}), [CuWinMgmt]::SW_MAXIMIZE)`,
        )
        return r !== ''
      }
      case 'restore': {
        const r = ps(
          `${WINDOW_MGMT_TYPES}; [CuWinMgmt]::ShowWindow([IntPtr]::new([long]${hwnd}), [CuWinMgmt]::SW_RESTORE)`,
        )
        return r !== ''
      }
      case 'close': {
        // SendMessage(WM_CLOSE) — graceful close targeted at HWND
        // Also clean up border overlay
        unmarkBound(hwnd)
        ps(
          `${WINDOW_MGMT_TYPES}; [CuWinMgmt]::SendMessage([IntPtr]::new([long]${hwnd}), [CuWinMgmt]::WM_CLOSE, [IntPtr]::Zero, [IntPtr]::Zero)`,
        )
        unbindWindow()
        return true
      }
      case 'focus': {
        // Restore if minimized, then bring to front
        ps(`${WINDOW_MGMT_TYPES}
$h = [IntPtr]::new([long]${hwnd})
if ([CuWinMgmt]::IsIconic($h)) {
[CuWinMgmt]::ShowWindow($h, [CuWinMgmt]::SW_RESTORE) | Out-Null
}
[CuWinMgmt]::SetForegroundWindow($h) | Out-Null
[CuWinMgmt]::BringWindowToTop($h) | Out-Null
`)
        return true
      }
      case 'move_offscreen': {
        // Move to -32000,-32000 — keeps window in restored state for SendMessage/PrintWindow
        ps(
          `${WINDOW_MGMT_TYPES}; [CuWinMgmt]::SetWindowPos([IntPtr]::new([long]${hwnd}), [IntPtr]::Zero, -32000, -32000, 0, 0, [CuWinMgmt]::SWP_NOSIZE -bor [CuWinMgmt]::SWP_NOZORDER -bor [CuWinMgmt]::SWP_NOACTIVATE)`,
        )
        return true
      }
      case 'move_resize': {
        // Without a target position nothing can be done — report that
        // instead of claiming success.
        if (opts?.x === undefined || opts?.y === undefined) return false
        return this.moveResize(opts.x, opts.y, opts.width, opts.height)
      }
      case 'get_rect': {
        // get_rect is handled separately by getWindowRect(), not through manageWindow
        // Return true to indicate the action is recognized
        return true
      }
      default:
        return false
    }
  },
  /**
   * Move the bound window to (x, y); also resize it when BOTH width and
   * height are given. Values are rounded to whole pixels.
   *
   * @returns false when no window is bound or any supplied value is not a
   *   finite number; true once the SetWindowPos script has been issued.
   */
  moveResize(x: number, y: number, width?: number, height?: number): boolean {
    if (!boundHwnd) return false
    const hwnd = boundHwnd
    const resize = width !== undefined && height !== undefined
    const values = resize ? [x, y, width, height] : [x, y]
    // These numbers are interpolated into a script; reject NaN/Infinity
    // rather than emitting an invalid command.
    if (!values.every(v => Number.isFinite(v))) return false
    const [px = 0, py = 0, pw = 0, ph = 0] = values.map(v => Math.round(v))
    if (resize) {
      ps(
        `${WINDOW_MGMT_TYPES}; [CuWinMgmt]::SetWindowPos([IntPtr]::new([long]${hwnd}), [IntPtr]::Zero, ${px}, ${py}, ${pw}, ${ph}, [CuWinMgmt]::SWP_NOZORDER -bor [CuWinMgmt]::SWP_NOACTIVATE)`,
      )
    } else {
      ps(
        `${WINDOW_MGMT_TYPES}; [CuWinMgmt]::SetWindowPos([IntPtr]::new([long]${hwnd}), [IntPtr]::Zero, ${px}, ${py}, 0, 0, [CuWinMgmt]::SWP_NOSIZE -bor [CuWinMgmt]::SWP_NOZORDER -bor [CuWinMgmt]::SWP_NOACTIVATE)`,
      )
    }
    return true
  },
  /**
   * Screen rectangle of the bound window from GetWindowRect.
   *
   * @returns The rectangle, or null when no window is bound, the call fails,
   *   or the script output is not four finite numbers.
   */
  getWindowRect(): {
    x: number
    y: number
    width: number
    height: number
  } | null {
    if (!boundHwnd) return null
    const out = ps(`${WINDOW_MGMT_TYPES}
$rect = New-Object CuWinMgmt+RECT
if ([CuWinMgmt]::GetWindowRect([IntPtr]::new([long]${boundHwnd}), [ref]$rect)) {
"$($rect.Left),$($rect.Top),$($rect.Right),$($rect.Bottom)"
} else { "FAIL" }
`)
    if (!out || out === 'FAIL') return null
    // Expected output: "left,top,right,bottom".
    const edges = out.split(',').map(Number)
    if (edges.length !== 4 || edges.some(n => !Number.isFinite(n))) return null
    const [l = 0, t = 0, r = 0, b = 0] = edges
    return { x: l, y: t, width: r - l, height: b - t }
  },
}
// ---------------------------------------------------------------------------
// Export
// ---------------------------------------------------------------------------
/** Set once cleanupAll() has run so repeated triggers do no further work. */
let overlaysCleanedUp = false
/**
 * Remove every overlay (borders, virtual cursor, indicator) and stop the
 * Python bridge subprocess if one was started.
 *
 * Runs at most once: the SIGINT/SIGTERM handlers below call it and then call
 * process.exit(), which fires the 'exit' listener and would otherwise repeat
 * the whole teardown a second time.
 */
function cleanupAll() {
  if (overlaysCleanedUp) return
  overlaysCleanedUp = true
  cleanupAllBorders()
  hideVirtualCursor()
  hideIndicator()
  // Stop the Python bridge subprocess if it was started
  try {
    getBridge()?.stopBridge()
  } catch {
    // Best effort: the process is going away, a failed bridge stop must not
    // prevent exit.
  }
}
process.on('exit', cleanupAll)
process.on('SIGINT', () => {
  cleanupAll()
  process.exit()
})
process.on('SIGTERM', () => {
  cleanupAll()
  process.exit()
})
/**
 * Windows implementation of the Platform interface: bundles the input,
 * screenshot, display, apps and windowManagement backends of this module
 * into the single object that is exported to callers.
 */
export const platform: Platform = {
input,
screenshot,
display,
apps,
windowManagement,
}

View File

@@ -3,21 +3,16 @@ import type { ComputerUseAPI } from '@ant/computer-use-swift'
let cached: ComputerUseAPI | undefined
/**
* Package's js/index.js reads COMPUTER_USE_SWIFT_NODE_PATH (baked by
* build-with-plugins.ts on darwin targets, unset otherwise — falls through to
* the node_modules prebuilds/ path). We cache the loaded native module.
*
* The four @MainActor methods (captureExcluding, captureRegion,
* apps.listInstalled, resolvePrepareCapture) dispatch to DispatchQueue.main
* and will hang under libuv unless CFRunLoop is pumped — call sites wrap
* these in drainRunLoop().
* macOS-only loader for @ant/computer-use-swift.
* Non-darwin platforms should use src/utils/computerUse/platforms/ instead.
*/
export function requireComputerUseSwift(): ComputerUseAPI {
if (process.platform !== 'darwin') {
throw new Error('@ant/computer-use-swift is macOS-only. Use platforms/ for cross-platform.')
}
if (cached) return cached
// eslint-disable-next-line @typescript-eslint/no-require-imports
const mod = require('@ant/computer-use-swift')
// macOS native .node exports a plain object with apps/display/screenshot directly.
// Our cross-platform package exports { ComputerUseAPI } class — needs instantiation.
if (mod.ComputerUseAPI && typeof mod.ComputerUseAPI === 'function') {
cached = new mod.ComputerUseAPI() as ComputerUseAPI
} else {

View File

@@ -0,0 +1,225 @@
/**
* Accessibility Snapshot — captures the UI Automation tree of a window
* and formats it as compact, model-friendly text.
*
* Sent alongside screenshots so the model has BOTH visual + structural
* understanding of the GUI. This enables:
* - Knowing exact element names, types, and positions
* - Using click_element/type_into_element by name instead of pixel coords
* - Understanding disabled/enabled state, current values
*
* Only includes interactive elements (buttons, edits, menus, links, etc.)
* to keep token count low (~200-500 tokens for typical windows).
*/
import { validateHwnd, ps } from './shared.js'
/**
 * One element of a captured UI Automation tree, as produced by parseNode()
 * from the JSON emitted by the capture script.
 */
export interface AccessibilityNode {
role: string // Control type name: Button, Edit, MenuItem, Link, Text, CheckBox, etc.
name: string // Visible text / accessible name ('' when the element has none)
automationId: string // UI Automation AutomationId ('' when absent)
bounds: { x: number; y: number; w: number; h: number } // Bounding rectangle reported by UI Automation, as integers
enabled: boolean // False only when the element was reported as not enabled
value?: string // Current text value (for Edit/ComboBox); set only when non-empty
children?: AccessibilityNode[] // Nested elements; omitted when there are none
}
/**
 * Result of captureAccessibilitySnapshot(): the same tree in two forms plus
 * the time it was taken.
 */
export interface AccessibilitySnapshot {
/** Compact text representation for the model (one line per element) */
text: string
/** Structured tree (for element-targeted actions) */
nodes: AccessibilityNode[]
/** Capture timestamp, from Date.now() (milliseconds since the Unix epoch) */
timestamp: number
}
/**
 * Capture the accessibility tree of a window through Windows UI Automation
 * (which crosses process boundaries) by running a PowerShell script, then
 * parse the JSON it prints.
 *
 * Elements with a non-positive width/height, or with X below -10000, are
 * skipped. With `interactiveOnly`, an element is kept when its control type
 * is in the interactive list OR when any of its descendants was kept.
 *
 * NOTE(review): the win32 platform's open() parks launched windows at
 * -32000,-32000; if UI Automation reports screen coordinates for such a
 * window, the X < -10000 filter would presumably drop every element and
 * this function would return null — confirm.
 *
 * @param hwnd - Window handle as string (passed through validateHwnd first)
 * @param maxDepth - Maximum tree depth below the window root (default 4)
 * @param interactiveOnly - Only include interactive elements (default true)
 * @returns The snapshot, or null when the script prints nothing or '[]', or
 *   when running the script / parsing its output throws.
 */
export function captureAccessibilitySnapshot(
hwnd: string,
maxDepth: number = 4,
interactiveOnly: boolean = true,
): AccessibilitySnapshot | null {
hwnd = validateHwnd(hwnd)
// PowerShell definition of Is-Interactive: either a whitelist of control
// types, or a predicate that accepts everything.
const filterClause = interactiveOnly
? `
# Interactive control types only
$interactiveTypes = @(
'Button','Edit','ComboBox','CheckBox','RadioButton',
'MenuItem','Menu','MenuBar','Link','Slider','Spinner',
'Tab','TabItem','List','ListItem','Tree','TreeItem',
'DataGrid','DataItem','Document','ScrollBar','ToolBar',
'SplitButton','ToggleButton','Hyperlink'
)
function Is-Interactive($ct) {
$typeName = $ct -replace 'ControlType\\.', ''
return $interactiveTypes -contains $typeName
}`
: `
function Is-Interactive($ct) { return $true }`
// The script walks the tree recursively and prints compact JSON with short
// keys (role, name, id, x, y, w, h, on, v, c); any failure prints '[]'.
const script = `
Add-Type -AssemblyName UIAutomationClient
Add-Type -AssemblyName UIAutomationTypes
Add-Type -AssemblyName WindowsBase
${filterClause}
function Get-Tree($el, $depth, $maxDepth) {
if ($depth -ge $maxDepth) { return @() }
$result = @()
$children = $el.FindAll(
[System.Windows.Automation.TreeScope]::Children,
[System.Windows.Automation.Condition]::TrueCondition)
foreach ($child in $children) {
$ct = $child.Current.ControlType.ProgrammaticName
$typeName = $ct -replace 'ControlType\\.', ''
$name = [string]$child.Current.Name
$autoId = [string]$child.Current.AutomationId
$rect = $child.Current.BoundingRectangle
$enabled = $child.Current.IsEnabled
# Skip invisible/offscreen elements
if ($rect.Width -le 0 -or $rect.Height -le 0) { continue }
if ($rect.X -lt -10000) { continue }
$val = $null
try {
$vp = $child.GetCurrentPattern([System.Windows.Automation.ValuePattern]::Pattern)
if ($vp -ne $null) { $val = $vp.Current.Value }
} catch {}
$isInteractive = Is-Interactive $ct
$sub = Get-Tree $child ($depth + 1) $maxDepth
if ($isInteractive -or $sub.Count -gt 0) {
$node = @{
role = $typeName
name = $name
id = $autoId
x = [int]$rect.X; y = [int]$rect.Y
w = [int]$rect.Width; h = [int]$rect.Height
on = $enabled
}
if ($val -ne $null -and $val -ne '') { $node['v'] = $val }
if ($sub.Count -gt 0) { $node['c'] = $sub }
$result += $node
}
}
return $result
}
try {
$root = [System.Windows.Automation.AutomationElement]::FromHandle([IntPtr]::new([long]${hwnd}))
if ($root -eq $null) { Write-Output '[]'; exit }
$tree = Get-Tree $root 0 ${maxDepth}
if ($tree -eq $null -or $tree.Count -eq 0) {
Write-Output '[]'
} else {
$tree | ConvertTo-Json -Depth 20 -Compress
}
} catch {
Write-Output '[]'
}
`
try {
const raw = ps(script)
if (!raw || raw === '[]') return null
const parsed = JSON.parse(raw)
// The output is a bare object rather than an array when the tree has a
// single top-level node; handle both shapes.
const nodes: AccessibilityNode[] = Array.isArray(parsed)
? parsed.map(parseNode)
: [parseNode(parsed)]
const text = formatForModel(nodes)
return { text, nodes, timestamp: Date.now() }
} catch {
// Script failure and malformed JSON are both reported as "no snapshot".
return null
}
}
/**
 * Convert one raw JSON node from the capture script (short keys: role, name,
 * id, x, y, w, h, on, v, c) into an AccessibilityNode, recursing into `c`.
 *
 * Missing strings become '', missing numbers become 0, and `enabled` is true
 * unless `on` is exactly false. `c` may be an array or a single object.
 */
function parseNode(raw: any): AccessibilityNode {
  let children: AccessibilityNode[] | undefined
  if (raw.c) {
    // A lone child arrives as a bare object instead of a one-element array.
    children = Array.isArray(raw.c) ? raw.c.map(parseNode) : [parseNode(raw.c)]
  }
  const bounds = {
    x: raw.x || 0,
    y: raw.y || 0,
    w: raw.w || 0,
    h: raw.h || 0,
  }
  return {
    role: raw.role || '',
    name: raw.name || '',
    automationId: raw.id || '',
    bounds,
    enabled: raw.on !== false,
    value: raw.v,
    children,
  }
}
/**
 * Format the accessibility tree as compact text for the model: one line per
 * node, children indented one extra space per level.
 *
 * Line layout: `[Role] "name" (x,y WxH) DISABLED value="…" id=…` where the
 * name, DISABLED, value and id parts appear only when applicable (enabled
 * elements carry no marker). Names are cut to 40 characters, values to 30.
 *
 * Example output:
 * [Button] "Save" (120,50 80x30)
 * [Edit] (200,80 400x25) value="hello world" id=textBox1
 * [MenuItem] "File" (10,0 40x25) DISABLED
 *
 * @param nodes - Nodes to render at this level.
 * @param indent - Nesting depth of `nodes` (0 for the top level).
 * @returns The rendered lines joined with '\n' ('' for an empty list).
 */
function formatForModel(
  nodes: AccessibilityNode[],
  indent: number = 0,
): string {
  const lines: string[] = []
  const pad = ' '.repeat(indent)
  for (const node of nodes) {
    let line = `${pad}[${node.role}]`
    if (node.name) line += ` "${truncate(node.name, 40)}"`
    line += ` (${node.bounds.x},${node.bounds.y} ${node.bounds.w}x${node.bounds.h})`
    if (!node.enabled) line += ' DISABLED'
    if (node.value) line += ` value="${truncate(node.value, 30)}"`
    if (node.automationId) line += ` id=${node.automationId}`
    lines.push(line)
    // An empty children array renders as '', which would otherwise be pushed
    // and show up as a stray blank line in the output.
    if (node.children && node.children.length > 0) {
      lines.push(formatForModel(node.children, indent + 1))
    }
  }
  return lines.join('\n')
}
/**
 * Shorten `s` to at most `max` characters, replacing the cut-off tail with a
 * single '…'. Strings that already fit are returned unchanged.
 */
function truncate(s: string, max: number): string {
  return s.length > max ? s.slice(0, max - 1) + '…' : s
}
/**
 * Depth-first, pre-order search of an accessibility tree.
 *
 * A node matches when it satisfies every criterion present in `query`:
 * `name` is a case-insensitive substring test, `role` a case-insensitive
 * equality test, `automationId` an exact test. Empty or missing criteria are
 * ignored, and a query with no criteria at all never matches.
 *
 * @param nodes - Nodes to search, including their descendants.
 * @param query - Criteria to match against.
 * @returns The first matching node, or null.
 */
export function findNodeInSnapshot(
  nodes: AccessibilityNode[],
  query: { name?: string; role?: string; automationId?: string },
): AccessibilityNode | null {
  const hasCriteria = Boolean(query.name || query.role || query.automationId)
  for (const candidate of nodes) {
    const nameOk =
      !query.name ||
      candidate.name.toLowerCase().includes(query.name.toLowerCase())
    const roleOk =
      !query.role || candidate.role.toLowerCase() === query.role.toLowerCase()
    const idOk =
      !query.automationId || candidate.automationId === query.automationId
    if (hasCriteria && nameOk && roleOk && idOk) return candidate
    // The node itself is tested before its subtree, then siblings follow.
    if (candidate.children) {
      const nested = findNodeInSnapshot(candidate.children, query)
      if (nested) return nested
    }
  }
  return null
}

View File

@@ -0,0 +1,129 @@
/**
* Application type dispatcher for Windows Computer Use.
*
* Routes operations to the appropriate controller based on file type:
* - .xlsx/.xls/.csv → Excel COM (headless, no window)
* - .docx/.doc → Word COM (headless, no window)
* - .txt/.log/.md → notepad + SendMessage + HWND bind (offscreen)
* - Others → generic exe + HWND bind (offscreen)
*/
import { extname } from 'path'
/** Controller category an app or file is routed to. */
export type AppType = 'excel' | 'word' | 'text' | 'browser' | 'generic'
/** File extensions handled by the Excel controller. */
const EXCEL_EXTS = new Set(['.xlsx', '.xls', '.csv', '.xlsm', '.xlsb'])
/** File extensions handled by the Word controller. */
const WORD_EXTS = new Set(['.docx', '.doc', '.rtf'])
/** File extensions treated as plain text. */
const TEXT_EXTS = new Set([
  '.txt',
  '.log',
  '.md',
  '.json',
  '.xml',
  '.yaml',
  '.yml',
  '.ini',
  '.cfg',
  '.conf',
])
/** Executable base names recognised as browsers. */
const BROWSER_NAMES = new Set(['chrome', 'msedge', 'firefox', 'brave', 'opera'])
// "excel" / "word" (optionally prefixed ms/win) as a standalone token: the
// characters on either side must not be letters. This accepts "excel",
// "Microsoft Excel", "winword", "word2016" and "…​.Word_8wekyb…" while
// rejecting names that merely contain the letters, such as "wordpad",
// "password" or "excelsior".
const EXCEL_NAME_RE = /(^|[^a-z])(ms)?excel($|[^a-z])/
const WORD_NAME_RE = /(^|[^a-z])(win|ms)?word($|[^a-z])/
/**
 * Detect application type from file path or app name.
 *
 * A recognised file extension wins; otherwise the base name (last path
 * segment, lower-cased, without a trailing ".exe") is matched against the
 * known app names.
 *
 * @param nameOrPath - App name, executable name, or file path.
 * @returns The matching type, or 'generic' when nothing matches.
 */
export function detectAppType(nameOrPath: string): AppType {
  const lower = nameOrPath.toLowerCase()
  // Check by extension
  const ext = extname(lower)
  if (ext) {
    if (EXCEL_EXTS.has(ext)) return 'excel'
    if (WORD_EXTS.has(ext)) return 'word'
    if (TEXT_EXTS.has(ext)) return 'text'
  }
  // Check by app name
  const baseName =
    lower
      .replace(/\.exe$/, '')
      .split(/[/\\]/)
      .pop() ?? ''
  if (EXCEL_NAME_RE.test(baseName)) return 'excel'
  if (WORD_NAME_RE.test(baseName)) return 'word'
  if (baseName === 'notepad' || baseName === 'notepad++' || baseName === 'code')
    return 'text'
  if (BROWSER_NAMES.has(baseName)) return 'browser'
  return 'generic'
}
/** Outcome of `openWithController`: the detected type plus a handle for follow-up operations. */
export interface OpenResult {
  /** Detected application type. */
  type: AppType
  /** File path for COM-controlled apps (Excel/Word). */
  filePath?: string
  /** HWND for text/browser/generic apps (SendMessage target). */
  hwnd?: string
}
/**
 * Open a file or app with the appropriate controller.
 *
 * - Excel/Word: COM automation (no window, no HWND needed). An existing
 *   document is read once as a best-effort check; a bare app name creates a new
 *   empty document in the temp directory.
 * - Text/Browser/Generic: nothing is launched here; the caller performs the
 *   exe launch + HWND bind.
 *
 * @param nameOrPath File path or application name, classified by `detectAppType`.
 * @returns The app type and, for Excel/Word, the document path to use for
 *   subsequent operations.
 */
export async function openWithController(
  nameOrPath: string,
): Promise<OpenResult> {
  const type = detectAppType(nameOrPath)
  switch (type) {
    case 'excel': {
      // eslint-disable-next-line @typescript-eslint/no-require-imports
      const { createExcel, openExcel } =
        require('./comExcel.js') as typeof import('./comExcel.js')
      if (/\.(xlsx|xls|csv|xlsm|xlsb)$/i.test(nameOrPath)) {
        // Open existing file. The read is a best-effort probe: the path is
        // returned even when the workbook cannot be read.
        try {
          openExcel(nameOrPath)
        } catch {
          // Intentionally ignored; later operations will surface the error.
        }
        return { type: 'excel', filePath: nameOrPath }
      }
      // "excel" or "excel.exe" without a file — create new
      const tmpPath = `${process.env.TEMP || '/tmp'}\\cu_new_${Date.now()}.xlsx`
      createExcel(tmpPath)
      return { type: 'excel', filePath: tmpPath }
    }
    case 'word': {
      // eslint-disable-next-line @typescript-eslint/no-require-imports
      const { createWord, openWord } =
        require('./comWord.js') as typeof import('./comWord.js')
      if (/\.(docx|doc|rtf)$/i.test(nameOrPath)) {
        // openWord is async: it must be awaited, otherwise the catch below
        // never runs and a failure becomes an unhandled promise rejection.
        try {
          await openWord(nameOrPath)
        } catch {
          // Intentionally ignored; later operations will surface the error.
        }
        return { type: 'word', filePath: nameOrPath }
      }
      const tmpPath = `${process.env.TEMP || '/tmp'}\\cu_new_${Date.now()}.docx`
      createWord(tmpPath)
      return { type: 'word', filePath: tmpPath }
    }
    default:
      // text/browser/generic — HWND bind handled by caller (platforms/win32.ts open())
      return { type }
  }
}

View File

@@ -0,0 +1,525 @@
"""
Python Bridge for Windows Computer Use.
Long-lived subprocess communicating via stdin/stdout JSON lines.
Replaces per-call PowerShell spawning with a persistent process.
Capabilities:
- screenshot: full-screen or per-window (mss + PrintWindow)
- input: mouse click/move/drag, keyboard type/key (ctypes user32)
- windows: enumerate, find, get rect, manage (show/min/max/close)
- accessibility: UI Automation tree snapshot (pywinauto, UIA backend)
Protocol: one JSON object per line on stdin → one JSON object per line on stdout.
Request: {"id": 1, "method": "screenshot", "params": {...}}
Response: {"id": 1, "result": {...}} or {"id": 1, "error": "message"}
"""
import sys
import json
import base64
import io
import ctypes
import ctypes.wintypes
import time
import os
# The JSON-lines protocol is UTF-8 in both directions, whatever the console
# code page is.
sys.stdin.reconfigure(encoding='utf-8')
sys.stdout.reconfigure(encoding='utf-8')

# Win32 DLL handles shared by the helpers below.
kernel32 = ctypes.windll.kernel32
gdi32 = ctypes.windll.gdi32
user32 = ctypes.windll.user32
# ---------------------------------------------------------------------------
# Win32 constants & types
# ---------------------------------------------------------------------------

# Window messages (WM_*), in numeric order.
WM_CLOSE = 0x0010
WM_KEYDOWN = 0x0100
WM_KEYUP = 0x0101
WM_CHAR = 0x0102
WM_MOUSEMOVE = 0x0200
WM_LBUTTONDOWN = 0x0201
WM_LBUTTONUP = 0x0202
WM_RBUTTONDOWN = 0x0204
WM_RBUTTONUP = 0x0205

# ShowWindow commands (SW_*).
SW_MAXIMIZE = 3
SW_MINIMIZE = 6
SW_SHOWMINNOACTIVE = 7
SW_RESTORE = 9

# SetWindowPos flags (SWP_*).
SWP_NOSIZE = 0x0001
SWP_NOMOVE = 0x0002
SWP_NOZORDER = 0x0004
SWP_NOACTIVATE = 0x0010

# Callback prototype passed to EnumWindows: (hwnd, lparam) -> bool.
WNDENUMPROC = ctypes.WINFUNCTYPE(ctypes.c_bool, ctypes.c_void_p, ctypes.c_void_p)
class RECT(ctypes.Structure):
    """ctypes layout of the Win32 RECT structure (four C longs)."""

    _fields_ = [
        ("left", ctypes.c_long),
        ("top", ctypes.c_long),
        ("right", ctypes.c_long),
        ("bottom", ctypes.c_long),
    ]
class POINT(ctypes.Structure):
    """ctypes layout of the Win32 POINT structure (two C longs)."""

    _fields_ = [
        ("x", ctypes.c_long),
        ("y", ctypes.c_long),
    ]
# SendMessageW(hWnd, Msg, wParam, lParam): declared with pointer-sized
# wParam/lParam/result so values are not truncated to the default C int.
SendMessageW = user32.SendMessageW
SendMessageW.restype = ctypes.c_void_p
SendMessageW.argtypes = [ctypes.c_void_p, ctypes.c_uint, ctypes.c_void_p, ctypes.c_void_p]
# ---------------------------------------------------------------------------
# Screenshot
# ---------------------------------------------------------------------------
def screenshot_full(display_id=0):
    """Capture one monitor with mss and return it as a base64 JPEG.

    display_id is a zero-based monitor index; mss keeps the combined virtual
    screen at index 0, so monitor N is monitors[N + 1]. An index past the last
    monitor falls back to monitors[1].

    Returns a dict with 'base64', 'width' and 'height'.
    """
    import mss
    from PIL import Image

    with mss.mss() as sct:
        monitors = sct.monitors
        if display_id < len(monitors) - 1:
            target = monitors[display_id + 1]
        else:
            target = monitors[1]
        shot = sct.grab(target)
        image = Image.frombytes('RGB', shot.size, shot.bgra, 'raw', 'BGRX')

    jpeg = io.BytesIO()
    image.save(jpeg, format='JPEG', quality=75)
    encoded = base64.b64encode(jpeg.getvalue()).decode()
    return {
        'base64': encoded,
        'width': shot.width,
        'height': shot.height,
    }
def screenshot_window(hwnd_str):
    """Capture a single window via PrintWindow and return it as a base64 JPEG.

    A minimized window is shown without activation for the capture and
    minimized again afterwards.

    Returns a dict with 'base64', 'width' and 'height', or None when the handle
    is not a window, the window has no area, or the pixels cannot be read.
    """
    from PIL import Image

    hwnd = int(hwnd_str)
    if not user32.IsWindow(hwnd):
        return None

    # Get window rect
    rect = RECT()
    user32.GetWindowRect(hwnd, ctypes.byref(rect))
    w = rect.right - rect.left
    h = rect.bottom - rect.top
    if w <= 0 or h <= 0:
        return None

    class BITMAPINFOHEADER(ctypes.Structure):
        _fields_ = [
            ('biSize', ctypes.c_uint32), ('biWidth', ctypes.c_int32),
            ('biHeight', ctypes.c_int32), ('biPlanes', ctypes.c_uint16),
            ('biBitCount', ctypes.c_uint16), ('biCompression', ctypes.c_uint32),
            ('biSizeImage', ctypes.c_uint32), ('biXPelsPerMeter', ctypes.c_int32),
            ('biYPelsPerMeter', ctypes.c_int32), ('biClrUsed', ctypes.c_uint32),
            ('biClrImportant', ctypes.c_uint32),
        ]

    was_minimized = user32.IsIconic(hwnd)
    hdc_window = hdc_mem = hbm = old_obj = None
    try:
        # Handle minimized windows: restore without activating, then re-measure.
        if was_minimized:
            user32.ShowWindow(hwnd, 4)  # SW_SHOWNOACTIVATE
            time.sleep(0.1)
            user32.GetWindowRect(hwnd, ctypes.byref(rect))
            w = rect.right - rect.left
            h = rect.bottom - rect.top
            if w <= 0 or h <= 0:
                return None

        # Create DC and bitmap
        hdc_window = user32.GetDC(hwnd)
        hdc_mem = gdi32.CreateCompatibleDC(hdc_window)
        hbm = gdi32.CreateCompatibleBitmap(hdc_window, w, h)
        if not hdc_window or not hdc_mem or not hbm:
            return None
        old_obj = gdi32.SelectObject(hdc_mem, hbm)

        # PrintWindow with PW_RENDERFULLCONTENT
        if not user32.PrintWindow(hwnd, hdc_mem, 2):
            # Fallback to BitBlt
            gdi32.BitBlt(hdc_mem, 0, 0, w, h, hdc_window, 0, 0, 0x00CC0020)  # SRCCOPY

        # GetDIBits requires that the bitmap is NOT selected into a device
        # context, so put the DC's previous bitmap back first.
        gdi32.SelectObject(hdc_mem, old_obj)
        old_obj = None

        # Extract bitmap bits
        bmi = BITMAPINFOHEADER()
        bmi.biSize = ctypes.sizeof(BITMAPINFOHEADER)
        bmi.biWidth = w
        bmi.biHeight = -h  # top-down
        bmi.biPlanes = 1
        bmi.biBitCount = 32
        bmi.biCompression = 0  # BI_RGB
        pixel_buf = ctypes.create_string_buffer(w * h * 4)
        copied = gdi32.GetDIBits(hdc_mem, hbm, 0, h, pixel_buf, ctypes.byref(bmi), 0)
        if not copied:
            return None
    finally:
        # Cleanup GDI — runs on every path so handles never leak and a
        # temporarily restored window is always minimized again.
        if old_obj:
            gdi32.SelectObject(hdc_mem, old_obj)
        if hbm:
            gdi32.DeleteObject(hbm)
        if hdc_mem:
            gdi32.DeleteDC(hdc_mem)
        if hdc_window:
            user32.ReleaseDC(hwnd, hdc_window)
        if was_minimized:
            user32.ShowWindow(hwnd, SW_SHOWMINNOACTIVE)

    # Convert to JPEG
    img = Image.frombuffer('RGBA', (w, h), pixel_buf, 'raw', 'BGRA', 0, 1)
    img = img.convert('RGB')
    out = io.BytesIO()
    img.save(out, format='JPEG', quality=75)
    return {
        'base64': base64.b64encode(out.getvalue()).decode(),
        'width': w,
        'height': h,
    }
# ---------------------------------------------------------------------------
# Window management
# ---------------------------------------------------------------------------
def list_windows():
    """Return every visible top-level window that has a non-empty title.

    Each entry is {'hwnd': <handle as str>, 'pid': <owner process id>,
    'title': <window text>}.
    """
    found = []

    def _on_window(hwnd, _lparam):
        # Returning True keeps EnumWindows iterating.
        if not user32.IsWindowVisible(hwnd):
            return True
        title_len = user32.GetWindowTextLengthW(hwnd)
        if not title_len > 0:
            return True
        title_buf = ctypes.create_unicode_buffer(title_len + 1)
        user32.GetWindowTextW(hwnd, title_buf, title_len + 1)
        owner_pid = ctypes.c_uint32()
        user32.GetWindowThreadProcessId(hwnd, ctypes.byref(owner_pid))
        found.append({'hwnd': str(hwnd), 'pid': owner_pid.value, 'title': title_buf.value})
        return True

    user32.EnumWindows(WNDENUMPROC(_on_window), 0)
    return found
def get_window_rect(hwnd_str):
    """Return the window's screen rectangle as {'x', 'y', 'width', 'height'}.

    Returns None when GetWindowRect fails.
    """
    bounds = RECT()
    if not user32.GetWindowRect(int(hwnd_str), ctypes.byref(bounds)):
        return None
    return {
        'x': bounds.left,
        'y': bounds.top,
        'width': bounds.right - bounds.left,
        'height': bounds.bottom - bounds.top,
    }
def get_client_offset(hwnd_str):
    """Get non-client area offset (title bar height, border width).

    Returns {'dx', 'dy'}: the screen position of the client area's origin
    relative to the window rectangle's top-left corner.
    """
    hwnd = int(hwnd_str)
    window_rect = RECT()
    user32.GetWindowRect(hwnd, ctypes.byref(window_rect))
    client_origin = POINT(0, 0)
    user32.ClientToScreen(hwnd, ctypes.byref(client_origin))
    dx = client_origin.x - window_rect.left
    dy = client_origin.y - window_rect.top
    return {'dx': dx, 'dy': dy}
def manage_window(hwnd_str, action):
    """Apply a window-management action to the window identified by hwnd_str.

    Supported actions: 'minimize', 'maximize', 'restore', 'close', 'focus',
    'move_offscreen'.

    Returns the ShowWindow result for minimize/maximize/restore (per Win32,
    non-zero when the window was previously visible), a bool for 'close'
    (whether WM_CLOSE was queued), True for focus/move_offscreen, and False for
    an unknown action.
    """
    hwnd = int(hwnd_str)
    if action == 'minimize':
        return user32.ShowWindow(hwnd, SW_SHOWMINNOACTIVE)
    if action == 'maximize':
        return user32.ShowWindow(hwnd, SW_MAXIMIZE)
    if action == 'restore':
        return user32.ShowWindow(hwnd, SW_RESTORE)
    if action == 'close':
        # WM_CLOSE is posted, not sent: SendMessage would block this
        # single-threaded bridge until the target returns from its handler,
        # which never happens while the app shows a modal "save changes?"
        # prompt. Posting returns at once, so later requests can still reach
        # that prompt. The close therefore completes asynchronously.
        return bool(user32.PostMessageW(hwnd, WM_CLOSE, 0, 0))
    if action == 'focus':
        if user32.IsIconic(hwnd):
            user32.ShowWindow(hwnd, SW_RESTORE)
        user32.SetForegroundWindow(hwnd)
        return True
    if action == 'move_offscreen':
        # Park the window at (-32000, -32000) without resizing, restacking or
        # activating it.
        user32.SetWindowPos(hwnd, 0, -32000, -32000, 0, 0,
                            SWP_NOSIZE | SWP_NOZORDER | SWP_NOACTIVATE)
        return True
    return False
# ---------------------------------------------------------------------------
# Input — all via SendMessageW (window-targeted, no global)
# ---------------------------------------------------------------------------
def make_lparam(x, y):
    """Pack client coordinates into a mouse-message lParam (as MAKELPARAM does).

    x goes in the low 16 bits and y in the high 16 bits. Both are coerced to
    int (JSON may deliver floats) and masked to 16 bits, so a negative or
    oversized coordinate cannot spill outside its word.
    """
    return ((int(y) & 0xFFFF) << 16) | (int(x) & 0xFFFF)
def send_click(hwnd_str, x, y, button='left'):
    """Send a button-down/button-up pair to the window at client (x, y).

    Only 'left' and 'right' produce messages; any other button sends nothing.
    Always returns True.
    """
    hwnd = int(hwnd_str)
    position = make_lparam(x, y)
    if button == 'left':
        down_up = (WM_LBUTTONDOWN, WM_LBUTTONUP)
    elif button == 'right':
        down_up = (WM_RBUTTONDOWN, WM_RBUTTONUP)
    else:
        down_up = ()
    for message in down_up:
        SendMessageW(hwnd, message, 0, position)
    return True
def send_text(hwnd_str, text):
    """Send text via WM_CHAR (Unicode), one message per UTF-16 code unit.

    Characters above U+FFFF are sent as a high/low surrogate pair.
    Always returns True.
    """
    hwnd = int(hwnd_str)
    for char in text:
        code_point = ord(char)
        if code_point > 0xFFFF:
            # Split into a UTF-16 surrogate pair.
            offset = code_point - 0x10000
            units = ((offset >> 10) + 0xD800, (offset & 0x3FF) + 0xDC00)
        else:
            units = (code_point,)
        for unit in units:
            SendMessageW(hwnd, WM_CHAR, unit, 0)
    return True
def send_key(hwnd_str, vk, action='down'):
    """Send one WM_KEYDOWN (action == 'down') or WM_KEYUP (anything else) for vk.

    Always returns True.
    """
    if action == 'down':
        message = WM_KEYDOWN
    else:
        message = WM_KEYUP
    SendMessageW(int(hwnd_str), message, vk, 0)
    return True
def send_keys_combo(hwnd_str, keys):
    """Send a key combination like ['ctrl', 's'].

    Modifiers are pressed in the order given, the main key is pressed and
    released, then the modifiers are released in reverse order. When several
    non-modifier keys are listed the last one wins; unrecognised multi-character
    names are ignored.

    Returns False (sending nothing) when no main key was found, else True.
    """
    modifier_codes = {'ctrl': 0x11, 'control': 0x11, 'shift': 0x10, 'alt': 0x12}
    named_codes = {
        'enter': 0x0D, 'return': 0x0D, 'tab': 0x09, 'escape': 0x1B,
        'backspace': 0x08, 'delete': 0x2E, 'space': 0x20,
        'left': 0x25, 'up': 0x26, 'right': 0x27, 'down': 0x28,
        'home': 0x24, 'end': 0x23, 'pageup': 0x21, 'pagedown': 0x22,
        'f1': 0x70, 'f2': 0x71, 'f3': 0x72, 'f4': 0x73, 'f5': 0x74,
        'f6': 0x75, 'f7': 0x76, 'f8': 0x77, 'f9': 0x78, 'f10': 0x79,
        'f11': 0x7A, 'f12': 0x7B,
    }

    hwnd = int(hwnd_str)
    held = []
    primary = None
    for key in keys:
        name = key.lower()
        if name in modifier_codes:
            held.append(modifier_codes[name])
        elif name in named_codes:
            primary = named_codes[name]
        elif len(name) == 1:
            # Single characters map to the code of their upper-case form.
            primary = ord(name.upper())

    if primary is None:
        return False

    presses = [(WM_KEYDOWN, code) for code in held]
    presses.append((WM_KEYDOWN, primary))
    presses.append((WM_KEYUP, primary))
    presses.extend((WM_KEYUP, code) for code in reversed(held))
    for message, code in presses:
        SendMessageW(hwnd, message, code, 0)
    return True
def send_mouse_down(hwnd_str, x, y):
    """Send WM_LBUTTONDOWN at client (x, y). Always returns True."""
    target = int(hwnd_str)
    position = make_lparam(x, y)
    SendMessageW(target, WM_LBUTTONDOWN, 0, position)
    return True
def send_mouse_up(hwnd_str, x, y):
    """Send WM_LBUTTONUP at client (x, y). Always returns True."""
    target = int(hwnd_str)
    position = make_lparam(x, y)
    SendMessageW(target, WM_LBUTTONUP, 0, position)
    return True
def send_mouse_move(hwnd_str, x, y):
    """Send WM_MOUSEMOVE at client (x, y). Always returns True."""
    target = int(hwnd_str)
    position = make_lparam(x, y)
    SendMessageW(target, WM_MOUSEMOVE, 0, position)
    return True
# ---------------------------------------------------------------------------
# Accessibility snapshot (UI Automation via comtypes)
# ---------------------------------------------------------------------------
_uia_client = None
def _get_uia():
global _uia_client
if _uia_client is None:
try:
import comtypes.client
comtypes.client.GetModule('UIAutomationCore.dll')
from comtypes.gen.UIAutomationClient import CUIAutomation
_uia_client = comtypes.client.CreateObject(CUIAutomation)
except Exception:
# Fallback: use pywinauto
pass
return _uia_client
def accessibility_snapshot(hwnd_str, max_depth=4):
    """Get the accessibility tree using pywinauto (more reliable than raw comtypes).

    Walks the UIA element tree below the window, at most max_depth levels deep.
    An element is kept when its control type is interactive or when any of its
    descendants was kept. Elements with no area, or positioned left of -10000,
    are dropped together with their subtree.

    Each node is {'role', 'name', 'id', 'x', 'y', 'w', 'h', 'on'} plus optional
    'v' (value, truncated to 100 chars) and 'c' (children).

    Returns the list of nodes, or None when the window is not found, nothing
    was kept, or any error occurs.
    """
    try:
        from pywinauto import Desktop
        from pywinauto.controls.uiawrapper import UIAWrapper  # noqa: F401

        target = int(hwnd_str)
        desktop = Desktop(backend='uia')

        # Find window by handle
        window = next((w for w in desktop.windows() if w.handle == target), None)
        if window is None:
            return None

        interactive_types = {
            'Button', 'Edit', 'ComboBox', 'CheckBox', 'RadioButton',
            'MenuItem', 'Menu', 'MenuBar', 'Hyperlink', 'Slider',
            'Tab', 'TabItem', 'List', 'ListItem', 'Document',
            'TreeItem', 'DataItem', 'ToolBar', 'SplitButton',
        }

        def collect(parent, depth):
            if depth >= max_depth:
                return []
            try:
                kids = parent.children()
            except Exception:
                return []

            kept = []
            for kid in kids:
                try:
                    control_type = kid.element_info.control_type or ''
                    label = kid.element_info.name or ''
                    automation_id = kid.element_info.automation_id or ''
                    bounds = kid.rectangle()
                    width = bounds.right - bounds.left
                    height = bounds.bottom - bounds.top
                    if width <= 0 or height <= 0 or bounds.left < -10000:
                        continue
                    is_enabled = kid.is_enabled()
                    current_value = None
                    try:
                        current_value = kid.get_value()
                    except Exception:
                        pass
                    descendants = collect(kid, depth + 1)
                    if not (control_type in interactive_types or descendants):
                        continue
                    entry = {
                        'role': control_type, 'name': label, 'id': automation_id,
                        'x': bounds.left, 'y': bounds.top, 'w': width, 'h': height,
                        'on': is_enabled,
                    }
                    if current_value:
                        entry['v'] = str(current_value)[:100]
                    if descendants:
                        entry['c'] = descendants
                    kept.append(entry)
                except Exception:
                    # One misbehaving element must not abort the whole snapshot.
                    continue
            return kept

        snapshot = collect(window, 0)
        return snapshot if snapshot else None
    except Exception:
        return None
# ---------------------------------------------------------------------------
# Find edit child (for text input targeting)
# ---------------------------------------------------------------------------
def find_edit_child(hwnd_str):
    """Find the best edit control child using UI Automation.

    Looks up the window by handle and returns the handle (as a string) of its
    first descendant whose control type is 'Edit' or 'Document'. Returns None
    when that element has no handle, when nothing matches, or on any error.
    """
    try:
        from pywinauto import Desktop

        target = int(hwnd_str)
        desktop = Desktop(backend='uia')
        window = next((w for w in desktop.windows() if w.handle == target), None)
        if window is not None:
            # Find first Edit or Document control
            for element in window.descendants():
                try:
                    if element.element_info.control_type in ('Edit', 'Document'):
                        return str(element.handle) if element.handle else None
                except Exception:
                    continue
    except Exception:
        pass
    return None
# ---------------------------------------------------------------------------
# Clipboard paste (for large text)
# ---------------------------------------------------------------------------
def paste_text(hwnd_str, text):
    """Set clipboard + send Ctrl+V via SendMessage.

    Replaces the current clipboard contents with text (CF_UNICODETEXT) and then
    sends Ctrl+V to the window.

    Returns True when the text was placed on the clipboard and the paste
    keystroke was sent; False when the clipboard could not be opened or filled.
    In that case no keystroke is sent, so stale clipboard content is never
    pasted.
    """
    CF_UNICODETEXT = 13
    GMEM_MOVEABLE = 0x0002

    # Declare pointer-sized prototypes. With ctypes' default C int return type
    # the handle/pointer would be truncated on 64-bit Python and memmove would
    # write to a bogus address.
    kernel32.GlobalAlloc.argtypes = [ctypes.c_uint, ctypes.c_size_t]
    kernel32.GlobalAlloc.restype = ctypes.c_void_p
    kernel32.GlobalLock.argtypes = [ctypes.c_void_p]
    kernel32.GlobalLock.restype = ctypes.c_void_p
    kernel32.GlobalUnlock.argtypes = [ctypes.c_void_p]
    kernel32.GlobalFree.argtypes = [ctypes.c_void_p]
    kernel32.GlobalFree.restype = ctypes.c_void_p
    user32.SetClipboardData.argtypes = [ctypes.c_uint, ctypes.c_void_p]
    user32.SetClipboardData.restype = ctypes.c_void_p

    data = text.encode('utf-16-le') + b'\x00\x00'

    # Another process may hold the clipboard briefly; retry a few times.
    opened = False
    for _ in range(5):
        if user32.OpenClipboard(0):
            opened = True
            break
        time.sleep(0.02)
    if not opened:
        return False

    stored = False
    try:
        user32.EmptyClipboard()
        handle = kernel32.GlobalAlloc(GMEM_MOVEABLE, len(data))
        if handle:
            ptr = kernel32.GlobalLock(handle)
            if ptr:
                ctypes.memmove(ptr, data, len(data))
                kernel32.GlobalUnlock(handle)
                # On success the system owns the memory; it must not be freed.
                stored = bool(user32.SetClipboardData(CF_UNICODETEXT, handle))
            if not stored:
                kernel32.GlobalFree(handle)
    finally:
        user32.CloseClipboard()

    if not stored:
        return False

    # Send Ctrl+V
    send_keys_combo(hwnd_str, ['ctrl', 'v'])
    return True
# ---------------------------------------------------------------------------
# Mouse wheel scroll (WM_MOUSEWHEEL / WM_MOUSEHWHEEL)
# ---------------------------------------------------------------------------
# Wheel messages: vertical and horizontal.
WM_MOUSEWHEEL = 0x020A
WM_MOUSEHWHEEL = 0x020E

# ClientToScreen for screen coords in lParam
user32.ClientToScreen.restype = ctypes.c_bool
user32.ClientToScreen.argtypes = [ctypes.c_void_p, ctypes.POINTER(POINT)]
def send_mouse_wheel(hwnd_str, x, y, delta, horizontal=False):
    """Send mouse wheel scroll at client coordinates (x, y).

    delta: positive = up/right, negative = down/left. In "clicks" (1 click = 120 units).
    horizontal selects WM_MOUSEHWHEEL instead of WM_MOUSEWHEEL.
    Always returns True.
    """
    hwnd = int(hwnd_str)
    message = WM_MOUSEHWHEEL if horizontal else WM_MOUSEWHEEL
    wheel_units = int(delta) * 120

    # Wheel messages carry screen coordinates in lParam, so convert from client.
    screen_pt = POINT(int(x), int(y))
    user32.ClientToScreen(hwnd, ctypes.byref(screen_pt))

    # wParam: high word = delta (signed short), low word = modifier keys (0)
    wheel_wparam = ctypes.c_void_p(wheel_units << 16)
    # lParam: screen coords
    wheel_lparam = ctypes.c_void_p((screen_pt.y << 16) | (screen_pt.x & 0xFFFF))

    SendMessageW(hwnd, message, wheel_wparam, wheel_lparam)
    return True
# ---------------------------------------------------------------------------
# Dispatch
# ---------------------------------------------------------------------------
# Dispatch table: protocol method name -> callable taking the request's
# 'params' dict and returning a JSON-serialisable result.
METHODS = {
    # Capture
    'screenshot': lambda params: screenshot_full(params.get('display_id', 0)),
    'screenshot_window': lambda params: screenshot_window(params['hwnd']),
    # Window enumeration and management
    'list_windows': lambda params: list_windows(),
    'get_window_rect': lambda params: get_window_rect(params['hwnd']),
    'get_client_offset': lambda params: get_client_offset(params['hwnd']),
    'manage_window': lambda params: manage_window(params['hwnd'], params['action']),
    # Window-targeted input
    'send_click': lambda params: send_click(
        params['hwnd'], params['x'], params['y'], params.get('button', 'left')),
    'send_text': lambda params: send_text(params['hwnd'], params['text']),
    'send_key': lambda params: send_key(
        params['hwnd'], params['vk'], params.get('action', 'down')),
    'send_keys': lambda params: send_keys_combo(params['hwnd'], params['keys']),
    'send_mouse_down': lambda params: send_mouse_down(params['hwnd'], params['x'], params['y']),
    'send_mouse_up': lambda params: send_mouse_up(params['hwnd'], params['x'], params['y']),
    'send_mouse_move': lambda params: send_mouse_move(params['hwnd'], params['x'], params['y']),
    'paste_text': lambda params: paste_text(params['hwnd'], params['text']),
    'send_mouse_wheel': lambda params: send_mouse_wheel(
        params['hwnd'], params['x'], params['y'], params['delta'],
        params.get('horizontal', False)),
    # Accessibility
    'find_edit_child': lambda params: find_edit_child(params['hwnd']),
    'accessibility_snapshot': lambda params: accessibility_snapshot(
        params['hwnd'], params.get('max_depth', 4)),
    # Health check
    'ping': lambda params: {'ok': True, 'pid': os.getpid()},
}
def _write_response(resp):
    """Write one response as a single JSON line and flush it."""
    try:
        payload = json.dumps(resp, ensure_ascii=False)
    except (TypeError, ValueError) as e:
        # A handler returned something json cannot encode; report it instead of
        # letting the exception end the process.
        payload = json.dumps(
            {'id': resp.get('id', 0), 'error': f'unserializable result: {e}'},
            ensure_ascii=False)
    sys.stdout.write(payload + '\n')
    sys.stdout.flush()


def _handle_request(req):
    """Validate one decoded request, run it, and return the response dict."""
    if not isinstance(req, dict):
        return {'id': 0, 'error': 'invalid request: expected a JSON object'}
    req_id = req.get('id', 0)
    method = req.get('method', '')
    params = req.get('params', {})
    if params is None:
        params = {}
    if method not in METHODS:
        return {'id': req_id, 'error': f'unknown method: {method}'}
    try:
        return {'id': req_id, 'result': METHODS[method](params)}
    except Exception as e:
        return {'id': req_id, 'error': str(e)}


def main():
    """Main loop: read JSON lines from stdin, dispatch, write JSON lines to stdout.

    Blank lines are skipped. Every other line yields exactly one response line,
    including malformed JSON, non-object requests and handler failures, so a bad
    request can never terminate the long-lived bridge.
    """
    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        try:
            req = json.loads(line)
        except json.JSONDecodeError as e:
            _write_response({'id': 0, 'error': f'invalid JSON: {e}'})
            continue
        _write_response(_handle_request(req))


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,191 @@
/**
* Python Bridge Client — manages a long-lived Python subprocess for Windows
* Computer Use operations.
*
* Replaces per-call PowerShell spawning with a persistent Python process
* that communicates via JSON lines over stdin/stdout.
*
* Performance: ~1-5ms per call vs ~200-500ms per PowerShell spawn.
*/
import * as path from 'path'
/** One JSON line written to the bridge's stdin. */
interface BridgeRequest {
  /** Correlates the response with this request. */
  id: number
  /** Name of the bridge method to invoke. */
  method: string
  /** Arguments passed to the method. */
  params: Record<string, unknown>
}

/** One JSON line read from the bridge's stdout. */
interface BridgeResponse {
  /** Id of the request being answered. */
  id: number
  /** Failure message; when set, the pending call is rejected with it. */
  error?: string
  /** Method return value when no error is set. */
  result?: unknown
}
/** In-flight calls keyed by request id, settled when the matching response line arrives. */
const pendingRequests = new Map<
  number,
  {
    resolve: (value: unknown) => void
    reject: (error: Error) => void
  }
>()
/** Last request id handed out. */
let requestId = 0
/** Stdout text received so far that does not yet end in a newline. */
let outputBuffer = ''
/** The running bridge subprocess, or null when none has been started. */
let bridgeProc: ReturnType<typeof Bun.spawn> | null = null
/**
 * Start the Python bridge process if not already running.
 *
 * Spawns `python -u bridge.py` with piped stdin/stdout and starts a background
 * reader that splits stdout into lines and settles the matching pending
 * request for each JSON response.
 *
 * If the process ends on its own, the bridge is marked as stopped (so the next
 * call respawns it) and every in-flight request is rejected at once instead of
 * waiting for its timeout.
 *
 * @returns `true` when a bridge process is available, `false` when spawning failed.
 */
export function ensureBridge(): boolean {
  if (bridgeProc) return true
  try {
    const scriptPath = path.join(__dirname, 'bridge.py')
    const proc = Bun.spawn(['python', '-u', scriptPath], {
      stdin: 'pipe',
      stdout: 'pipe',
      stderr: 'ignore',
      env: { ...process.env, PYTHONIOENCODING: 'utf-8', PYTHONUNBUFFERED: '1' },
    })
    bridgeProc = proc
    outputBuffer = ''

    // One streaming decoder for the whole stream: a multi-byte UTF-8 character
    // split across two chunks would be corrupted if each chunk were decoded
    // independently.
    const decoder = new TextDecoder()
    const reader = proc.stdout.getReader()

    const settle = (line: string): void => {
      try {
        const resp: BridgeResponse = JSON.parse(line)
        const pending = pendingRequests.get(resp.id)
        if (!pending) return
        pendingRequests.delete(resp.id)
        if (resp.error) {
          pending.reject(new Error(resp.error))
        } else {
          pending.resolve(resp.result)
        }
      } catch {
        // Not a JSON response line; ignore it and keep reading.
      }
    }

    const readLoop = async (): Promise<void> => {
      try {
        while (true) {
          const { done, value } = await reader.read()
          if (done) break
          outputBuffer += decoder.decode(value, { stream: true })
          // Process complete lines
          let newlineIdx: number
          while ((newlineIdx = outputBuffer.indexOf('\n')) !== -1) {
            const line = outputBuffer.slice(0, newlineIdx).trim()
            outputBuffer = outputBuffer.slice(newlineIdx + 1)
            if (line) settle(line)
          }
        }
      } catch {
        // A read failure is handled like end-of-stream below.
      } finally {
        // Only clean up if this process is still the active bridge; after
        // stopBridge() a newer process may already own the shared state.
        if (bridgeProc === proc) {
          bridgeProc = null
          outputBuffer = ''
          const waiting = [...pendingRequests.values()]
          pendingRequests.clear()
          const failure = new Error('Python bridge exited unexpectedly')
          for (const pending of waiting) pending.reject(failure)
        }
      }
    }
    void readLoop()
    return true
  } catch {
    bridgeProc = null
    return false
  }
}
/**
 * Send a request to the Python bridge and wait for the response.
 *
 * @param method Bridge method name.
 * @param params Arguments for the method.
 * @param timeoutMs Time after which the returned promise rejects if no response arrived.
 * @returns The `result` field of the bridge's response.
 * @throws Error when the bridge cannot be started, the write fails, the bridge
 *   reports an error, or the call times out.
 */
export async function call<T = unknown>(
  method: string,
  params: Record<string, unknown> = {},
  timeoutMs: number = 10000,
): Promise<T> {
  if (!ensureBridge()) {
    throw new Error('Python bridge not available')
  }

  requestId += 1
  const id = requestId
  const payload: BridgeRequest = { id, method, params }

  return new Promise<T>((resolve, reject) => {
    // Timeout: forget the request and fail the call.
    const timer = setTimeout(() => {
      pendingRequests.delete(id)
      reject(new Error(`Bridge call ${method} timed out after ${timeoutMs}ms`))
    }, timeoutMs)

    // Whichever way the call settles, the timeout must be cancelled.
    pendingRequests.set(id, {
      resolve: value => {
        clearTimeout(timer)
        resolve(value as T)
      },
      reject: error => {
        clearTimeout(timer)
        reject(error)
      },
    })

    try {
      bridgeProc!.stdin.write(JSON.stringify(payload) + '\n')
      bridgeProc!.stdin.flush()
    } catch (err) {
      clearTimeout(timer)
      pendingRequests.delete(id)
      reject(new Error(`Bridge write failed: ${err}`))
    }
  })
}
/**
 * Synchronous call — blocks the event loop. Use sparingly.
 *
 * Does not use the long-lived bridge: it spawns a one-shot `python bridge.py`
 * process, feeds it a single request on stdin and parses its stdout.
 *
 * @param method Bridge method name.
 * @param params Arguments for the method.
 * @param timeoutMs Timeout handed to the spawned process.
 * @returns The response's `result`, or `null` when the process produced no
 *   output, reported an error, or anything failed.
 */
export function callSync<T = unknown>(
  method: string,
  params: Record<string, unknown> = {},
  timeoutMs: number = 10000,
): T | null {
  try {
    const scriptPath = path.join(__dirname, 'bridge.py')
    // SECURITY: JSON is passed via stdin (not embedded in -c) to prevent code injection.
    const payload = JSON.stringify({ id: 1, method, params })
    const { stdout } = Bun.spawnSync({
      cmd: ['python', '-u', scriptPath],
      stdin: Buffer.from(payload + '\n'),
      stdout: 'pipe',
      stderr: 'pipe',
      env: { ...process.env, PYTHONIOENCODING: 'utf-8' },
      timeout: timeoutMs,
    })
    const text = new TextDecoder().decode(stdout).trim()
    if (!text) return null
    const resp: BridgeResponse = JSON.parse(text)
    // Bridge-side errors are reported to the caller as null, like every other failure.
    if (resp.error) return null
    return resp.result as T
  } catch {
    return null
  }
}
/**
 * Kill the bridge process.
 *
 * Closes the process's stdin and kills it (errors are ignored), then clears the
 * pending-request table and the partial-output buffer. Safe to call when no
 * bridge is running.
 */
export function stopBridge(): void {
  const proc = bridgeProc
  if (proc) {
    try {
      proc.stdin.end()
      proc.kill()
    } catch {}
    bridgeProc = null
  }
  outputBuffer = ''
  pendingRequests.clear()
}
// NOTE: No process exit handlers here — the platform-level win32.ts
// already registers exit/SIGINT/SIGTERM handlers that call cleanupAll(),
// which includes stopBridge(). Adding handlers here would cause double
// cleanup and duplicate process.exit() calls.

View File

@@ -0,0 +1,320 @@
/**
* Excel COM automation via PowerShell.
* Completely headless — Visible=false, no window, no user impact.
* Each operation opens and closes Excel to avoid orphaned processes.
*/
/** One non-empty cell as reported by `openExcel`. */
export interface CellInfo {
  /** 1-based row index. */
  row: number
  /** 1-based column index. */
  col: number
  /** The cell's Value2. */
  value: string | number | null
  /** Formula text, present only for cells that have a formula. */
  formula?: string
}

/** Contents of a single worksheet. */
export interface SheetInfo {
  /** Worksheet name. */
  name: string
  /** Size of the sheet's used range. */
  usedRange: { rows: number; cols: number }
  /** Non-empty cells that were read. */
  cells: CellInfo[]
}

/** Contents of a whole workbook. */
export interface ExcelInfo {
  /** Names of the workbook's sheets. */
  sheetNames: string[]
  /** Per-sheet contents. */
  sheets: SheetInfo[]
}
/**
 * Run a PowerShell script synchronously and return its trimmed stdout.
 *
 * @throws Error when the process exits non-zero AND wrote something to stderr.
 */
function ps(script: string): string {
  const { exitCode, stdout, stderr } = Bun.spawnSync({
    cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', script],
    stdout: 'pipe',
    stderr: 'pipe',
  })
  const errorText = new TextDecoder().decode(stderr).trim()
  const failed = exitCode !== 0 && Boolean(errorText)
  if (failed) {
    throw new Error(`PowerShell error: ${errorText}`)
  }
  return new TextDecoder().decode(stdout).trim()
}
/**
 * Escape a string for embedding inside a PowerShell single-quoted string.
 *
 * PowerShell accepts the typographic quotes U+2018, U+2019, U+201A and U+201B
 * as single-quote delimiters in addition to the ASCII apostrophe, so each of
 * them is doubled; otherwise a path containing one would end the literal early.
 */
function escPath(p: string): string {
  return p.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
}
/**
 * Build the PowerShell statement that assigns a worksheet of `$wb` to `$<varName>`.
 *
 * @param varName PowerShell variable name, without the leading `$`.
 * @param sheet Sheet index (emitted as a number) or sheet name (emitted as a
 *   single-quoted string).
 */
function resolveSheet(varName: string, sheet: string | number): string {
  if (typeof sheet === 'number') {
    return `$${varName} = $wb.Sheets.Item(${sheet})`
  }
  // Double every character PowerShell treats as a single quote: the ASCII
  // apostrophe and the typographic quotes U+2018/U+2019/U+201A/U+201B.
  const quoted = sheet.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  return `$${varName} = $wb.Sheets.Item('${quoted}')`
}
/** PowerShell prologue: start a hidden Excel COM instance with alerts suppressed. */
const EXCEL_INIT = [
  '$excel = New-Object -ComObject Excel.Application',
  '$excel.Visible = $false',
  '$excel.DisplayAlerts = $false',
].join('\n')
/**
 * Build the PowerShell epilogue that quits Excel and releases its COM object.
 *
 * @param hasWorkbook When true, `$wb` is first closed without saving.
 */
function excelCleanup(hasWorkbook = true): string {
  const closeWorkbook = hasWorkbook ? ['if ($wb) { $wb.Close($false) }'] : []
  const steps = [
    ...closeWorkbook,
    '$excel.Quit()',
    '[System.Runtime.InteropServices.Marshal]::ReleaseComObject($excel) | Out-Null',
  ]
  return steps.join('\n ')
}
/**
 * Open and read an Excel workbook.
 * Limits to first 1000 non-empty cells per sheet.
 *
 * @param filePath Workbook to open.
 * @returns Sheet names plus, per sheet, the used-range size and the cells read.
 * @throws Error when PowerShell produces no output (or `ps` itself throws).
 */
export function openExcel(filePath: string): ExcelInfo {
  const script = `
${EXCEL_INIT}
try {
$wb = $excel.Workbooks.Open('${escPath(filePath)}')
$result = @{ sheets = @(); sheetNames = @() }
foreach ($sheet in $wb.Sheets) {
$result.sheetNames += $sheet.Name
$ur = $sheet.UsedRange
$rows = $ur.Rows.Count
$cols = $ur.Columns.Count
$cells = @()
$count = 0
for ($r = 1; $r -le $rows -and $count -lt 1000; $r++) {
for ($c = 1; $c -le $cols -and $count -lt 1000; $c++) {
$cell = $sheet.Cells.Item($r, $c)
$val = $cell.Value2
if ($null -ne $val) {
$f = $null
if ($cell.HasFormula) { $f = $cell.Formula }
$entry = @{ row = $r; col = $c; value = $val }
if ($f) { $entry.formula = $f }
$cells += $entry
$count++
}
}
}
$result.sheets += @{
name = $sheet.Name
usedRange = @{ rows = $rows; cols = $cols }
cells = $cells
}
}
$result | ConvertTo-Json -Depth 5 -Compress
} finally {
${excelCleanup()}
}
`
  const output = ps(script)
  if (!output) throw new Error('No output from openExcel')
  const parsed = JSON.parse(output)

  // Normalize: PowerShell single-element arrays become objects
  const wrap = (v: any): any[] => (Array.isArray(v) ? v : [v])
  const normalizeCells = (cells: any): CellInfo[] => {
    if (Array.isArray(cells)) return cells
    return cells ? [cells] : []
  }

  const sheets: SheetInfo[] = wrap(parsed.sheets).map((s: any) => ({
    name: s.name,
    usedRange: s.usedRange,
    cells: normalizeCells(s.cells),
  }))
  const sheetNames: string[] = wrap(parsed.sheetNames)
  return { sheets, sheetNames }
}
/**
 * Read a single cell value.
 *
 * @param filePath Workbook to open.
 * @param sheet Sheet index or name.
 * @param row Row index passed to `Cells.Item`.
 * @param col Column index passed to `Cells.Item`.
 * @returns The cell's Value2, or `null` for an empty cell or empty output.
 */
export function readCell(
  filePath: string,
  sheet: string | number,
  row: number,
  col: number,
): string | number | null {
  const script = `
${EXCEL_INIT}
try {
$wb = $excel.Workbooks.Open('${escPath(filePath)}')
${resolveSheet('sheet', sheet)}
$val = $sheet.Cells.Item(${row}, ${col}).Value2
if ($null -eq $val) { Write-Output 'null' } else { Write-Output ($val | ConvertTo-Json -Compress) }
} finally {
${excelCleanup()}
}
`
  const output = ps(script)
  const isEmpty = output === '' || output === 'null'
  return isEmpty ? null : JSON.parse(output)
}
/**
 * Read a rectangular range of cells as a 2D array.
 *
 * Empty cells travel through PowerShell as the sentinel string `__NULL__` and
 * are turned back into `null` here.
 *
 * @returns One inner array per row; `[]` when PowerShell produced no output.
 */
export function readRange(
  filePath: string,
  sheet: string | number,
  startRow: number,
  startCol: number,
  endRow: number,
  endCol: number,
): (string | number | null)[][] {
  const script = `
${EXCEL_INIT}
try {
$wb = $excel.Workbooks.Open('${escPath(filePath)}')
${resolveSheet('sheet', sheet)}
$rows = @()
for ($r = ${startRow}; $r -le ${endRow}; $r++) {
$row = @()
for ($c = ${startCol}; $c -le ${endCol}; $c++) {
$val = $sheet.Cells.Item($r, $c).Value2
$row += if ($null -eq $val) { '__NULL__' } else { $val }
}
$rows += ,@($row)
}
$rows | ConvertTo-Json -Depth 3 -Compress
} finally {
${excelCleanup()}
}
`
  const output = ps(script)
  if (!output) return []
  const parsed = JSON.parse(output)

  // Normalize single-row case: a lone row arrives as a flat array.
  const grid: any[] = Array.isArray(parsed[0]) ? parsed : [parsed]
  const restoreNull = (cell: any) => (cell === '__NULL__' ? null : cell)
  return grid.map((cells: any[]) => cells.map(restoreNull))
}
/**
 * Write a single cell value.
 *
 * The value travels to PowerShell as JSON inside a single-quoted string and is
 * decoded there with `ConvertFrom-Json`.
 *
 * @returns `true` when the script reported success after saving the workbook.
 */
export function writeCell(
  filePath: string,
  sheet: string | number,
  row: number,
  col: number,
  value: string | number,
): boolean {
  // JSON.stringify leaves apostrophes and typographic quotes untouched, and
  // PowerShell treats U+2018/U+2019/U+201A/U+201B as single-quote delimiters
  // too, so all of them must be doubled to stay inside the literal.
  const jsonLiteral = JSON.stringify(value).replace(
    /['\u2018\u2019\u201A\u201B]/g,
    '$&$&',
  )
  const script = `
${EXCEL_INIT}
try {
$wb = $excel.Workbooks.Open('${escPath(filePath)}')
${resolveSheet('sheet', sheet)}
$sheet.Cells.Item(${row}, ${col}).Value2 = (ConvertFrom-Json '${jsonLiteral}')
$wb.Save()
Write-Output 'true'
} finally {
${excelCleanup()}
}
`
  return ps(script) === 'true'
}
/**
 * Write a 2D array of values starting at (startRow, startCol).
 *
 * `null` entries are skipped (the existing cell is left untouched). Numeric
 * entries are written as doubles, everything else as strings.
 *
 * @returns `true` when the script reported success after saving the workbook.
 */
export function writeRange(
  filePath: string,
  sheet: string | number,
  startRow: number,
  startCol: number,
  data: (string | number | null)[][],
): boolean {
  // Double every character PowerShell treats as a single quote (ASCII
  // apostrophe plus U+2018/U+2019/U+201A/U+201B) so cell text cannot end the
  // single-quoted JSON literal early.
  const jsonData = JSON.stringify(data).replace(
    /['\u2018\u2019\u201A\u201B]/g,
    '$&$&',
  )
  const script = `
${EXCEL_INIT}
try {
$wb = $excel.Workbooks.Open('${escPath(filePath)}')
${resolveSheet('sheet', sheet)}
$data = ConvertFrom-Json '${jsonData}'
for ($r = 0; $r -lt $data.Count; $r++) {
$row = $data[$r]
for ($c = 0; $c -lt $row.Count; $c++) {
$val = $row[$c]
if ($null -ne $val) {
if ($val -is [int] -or $val -is [long] -or $val -is [double] -or $val -is [decimal]) {
$sheet.Cells.Item(${startRow} + $r, ${startCol} + $c).Value2 = [double]$val
} else {
$sheet.Cells.Item(${startRow} + $r, ${startCol} + $c).Value2 = [string]$val
}
}
}
}
$wb.Save()
Write-Output 'true'
} finally {
${excelCleanup()}
}
`
  return ps(script) === 'true'
}
/**
 * Set a formula on a cell.
 *
 * @param formula Formula text assigned to the cell's `Formula` property.
 * @returns `true` when the script reported success after saving the workbook.
 */
export function setFormula(
  filePath: string,
  sheet: string | number,
  row: number,
  col: number,
  formula: string,
): boolean {
  // Double every character PowerShell treats as a single quote (ASCII
  // apostrophe plus U+2018/U+2019/U+201A/U+201B) so the formula text cannot
  // end the single-quoted literal early.
  const quotedFormula = formula.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  const script = `
${EXCEL_INIT}
try {
$wb = $excel.Workbooks.Open('${escPath(filePath)}')
${resolveSheet('sheet', sheet)}
$sheet.Cells.Item(${row}, ${col}).Formula = '${quotedFormula}'
$wb.Save()
Write-Output 'true'
} finally {
${excelCleanup()}
}
`
  return ps(script) === 'true'
}
/**
 * Save workbook. If savePath is given, SaveAs to that path; otherwise Save in place.
 *
 * @returns `true` when the script reported success.
 */
export function saveExcel(filePath: string, savePath?: string): boolean {
  let saveStatement = '$wb.Save()'
  if (savePath) {
    saveStatement = `$wb.SaveAs('${escPath(savePath)}')`
  }
  const script = `
${EXCEL_INIT}
try {
$wb = $excel.Workbooks.Open('${escPath(filePath)}')
${saveStatement}
Write-Output 'true'
} finally {
${excelCleanup()}
}
`
  const output = ps(script)
  return output === 'true'
}
/**
 * Create a new empty workbook and save it to the given path.
 *
 * @returns `true` when the script reported success.
 */
export function createExcel(savePath: string): boolean {
  const target = escPath(savePath)
  const script = `
${EXCEL_INIT}
try {
$wb = $excel.Workbooks.Add()
$wb.SaveAs('${target}')
Write-Output 'true'
} finally {
${excelCleanup()}
}
`
  const output = ps(script)
  return output === 'true'
}
/**
 * Kept for API symmetry only: every operation in this module opens and closes
 * its own Excel COM instance, so there is nothing left to close here.
 */
export function closeExcel(_filePath: string): void {
  // Intentionally empty.
  return
}

View File

@@ -0,0 +1,450 @@
/**
* Word COM automation module for Windows.
* Uses PowerShell to drive Word.Application COM object — fully headless (Visible=false).
* Each function builds a PowerShell script, runs it via Bun.spawnSync, and parses JSON output.
*/
// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------
/** One paragraph of a document, with the formatting of its range. */
export interface WordParagraph {
  /** Paragraph text. */
  text: string
  /** Font size of the paragraph's range. */
  fontSize?: number
  /** Whether the paragraph's range is bold. */
  bold?: boolean
  /** Whether the paragraph's range is italic. */
  italic?: boolean
}

/** One table of a document. */
export interface WordTable {
  /** Number of rows. */
  rows: number
  /** Number of columns. */
  cols: number
  /** Cell text, indexed as data[row][col]. */
  data: string[][]
}

/** Snapshot of a document as produced by `openWord`. */
export interface WordDocInfo {
  /** Full document text. */
  text: string
  /** Paragraphs that were read. */
  paragraphs: WordParagraph[]
  /** Tables found in the document. */
  tables: WordTable[]
  /** Word count reported by Word. */
  wordCount: number
  /** Page count reported by Word. */
  pageCount: number
}

/** Optional formatting applied when appending text. */
export interface AppendTextOptions {
  fontName?: string
  fontSize?: number
  bold?: boolean
  italic?: boolean
}
// ---------------------------------------------------------------------------
// PowerShell runner
// ---------------------------------------------------------------------------
/**
 * Run a PowerShell script synchronously and return its trimmed stdout.
 * The exit code and stderr are not inspected.
 */
function runPs(script: string): string {
  const { stdout } = Bun.spawnSync({
    cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', script],
    stdout: 'pipe',
    stderr: 'pipe',
  })
  const text = new TextDecoder().decode(stdout)
  return text.trim()
}
/**
 * Parse PowerShell output as JSON.
 *
 * @returns The parsed value, or `fallback` when `raw` is empty or not valid JSON.
 */
function parseJsonOutput<T>(raw: string, fallback: T): T {
  if (raw) {
    try {
      return JSON.parse(raw) as T
    } catch {
      // Malformed output: fall through to the fallback.
    }
  }
  return fallback
}
/**
 * Escape a string for safe embedding inside a PowerShell single-quoted string.
 *
 * PowerShell accepts the typographic quotes U+2018, U+2019, U+201A and U+201B
 * as single-quote delimiters in addition to the ASCII apostrophe, so each of
 * them is doubled.
 */
function psEscape(s: string): string {
  return s.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
}
// ---------------------------------------------------------------------------
// Word COM wrapper template
// ---------------------------------------------------------------------------
/**
 * Wraps a Word COM script body with standard open/cleanup boilerplate.
 * The body receives $word and $doc variables.
 * If `openPath` is provided the document is opened; otherwise a new doc is created.
 * The document is always closed without saving, and Word is quit and released.
 */
function wrapWordScript(body: string, openPath?: string): string {
  let openStatement = '$doc = $word.Documents.Add()'
  if (openPath) {
    openStatement = `$doc = $word.Documents.Open('${psEscape(openPath)}')`
  }
  // The leading and trailing empty entries reproduce the newline at each end
  // of the script.
  return [
    '',
    '$word = New-Object -ComObject Word.Application',
    '$word.Visible = $false',
    '$word.DisplayAlerts = 0',
    'try {',
    openStatement,
    body,
    '} finally {',
    'if ($doc -ne $null) { $doc.Close($false); }',
    'if ($word -ne $null) { $word.Quit(); }',
    'if ($word -ne $null) { [System.Runtime.InteropServices.Marshal]::ReleaseComObject($word) | Out-Null }',
    '}',
    '',
  ].join('\n')
}
/**
 * Same as wrapWordScript, but for scripts that modify an existing document.
 *
 * The document at `openPath` is opened, `body` runs, and `$doc.Save()` is then
 * called automatically — the body must not save on its own. The script prints
 * `{"ok":true}` on success, or a JSON object with `ok = false` and the
 * exception message in `error` on failure.
 */
function wrapWordScriptWithSave(body: string, openPath: string): string {
  // The failure object is serialised with ConvertTo-Json rather than string
  // concatenation: exception messages routinely contain backslashes (Windows
  // paths) and newlines, which would otherwise yield invalid JSON.
  return `
$word = New-Object -ComObject Word.Application
$word.Visible = $false
$word.DisplayAlerts = 0
try {
$doc = $word.Documents.Open('${psEscape(openPath)}')
${body}
$doc.Save()
Write-Output '{"ok":true}'
} catch {
Write-Output (ConvertTo-Json ([ordered]@{ ok = $false; error = [string]$_.Exception.Message }) -Compress)
} finally {
if ($doc -ne $null) { $doc.Close($false); }
if ($word -ne $null) { $word.Quit(); }
if ($word -ne $null) { [System.Runtime.InteropServices.Marshal]::ReleaseComObject($word) | Out-Null }
}
`
}
// ---------------------------------------------------------------------------
// 1. openWord
// ---------------------------------------------------------------------------
/**
 * Opens a Word document through COM and returns a JSON snapshot of it.
 *
 * The generated PowerShell collects up to 500 paragraphs (text, bold, italic,
 * font size), every table's cell text, and the word/page statistics, then
 * prints them with `ConvertTo-Json -Depth 5 -Compress`.
 *
 * Declared `async`, but the work is done by the synchronous `runPs` call.
 *
 * @param filePath Path of the document to open.
 * @returns The parsed snapshot, or an empty `WordDocInfo` (empty text/arrays,
 *   zero counts) when the script prints nothing or its output is not valid JSON.
 */
export async function openWord(filePath: string): Promise<WordDocInfo> {
const script = wrapWordScript(
`
# Paragraphs (limit 500)
$paras = @()
$paraCount = $doc.Paragraphs.Count
$limit = [Math]::Min($paraCount, 500)
for ($i = 1; $i -le $limit; $i++) {
$p = $doc.Paragraphs.Item($i)
$r = $p.Range
$paras += @{
text = $r.Text -replace '\\r$',''
bold = [bool]($r.Font.Bold -eq -1)
italic = [bool]($r.Font.Italic -eq -1)
fontSize = $r.Font.Size
}
}
# Tables
$tables = @()
foreach ($table in $doc.Tables) {
$rows = $table.Rows.Count
$cols = $table.Columns.Count
$data = @()
for ($r = 1; $r -le $rows; $r++) {
$row = @()
for ($c = 1; $c -le $cols; $c++) {
try {
$cellText = $table.Cell($r, $c).Range.Text
# Trim trailing \\r\\a that Word adds to cell text
$cellText = $cellText -replace '[\\r\\n\\a]+$',''
$row += $cellText
} catch {
$row += ''
}
}
$data += ,@($row)
}
$tables += @{ rows = $rows; cols = $cols; data = $data }
}
# Counts: wdStatisticWords=0, wdStatisticPages=2
$wordCount = $doc.ComputeStatistics(0)
$pageCount = $doc.ComputeStatistics(2)
$result = @{
text = $doc.Content.Text
paragraphs = $paras
tables = $tables
wordCount = $wordCount
pageCount = $pageCount
}
Write-Output (ConvertTo-Json $result -Depth 5 -Compress)
`,
filePath,
)
const raw = runPs(script)
// Fall back to an empty snapshot when the script printed nothing or invalid JSON.
return parseJsonOutput<WordDocInfo>(raw, {
text: '',
paragraphs: [],
tables: [],
wordCount: 0,
pageCount: 0,
})
}
// ---------------------------------------------------------------------------
// 2. readText
// ---------------------------------------------------------------------------
/**
 * Returns the document's full text (`$doc.Content.Text`) as printed by the
 * PowerShell script, trimmed. Yields an empty string when nothing is printed.
 */
export async function readText(filePath: string): Promise<string> {
  const readBody = `Write-Output $doc.Content.Text`
  return runPs(wrapWordScript(readBody, filePath))
}
// ---------------------------------------------------------------------------
// 3. appendText
// ---------------------------------------------------------------------------
/**
 * Appends `text` at the end of the document and saves it in place.
 *
 * The selection is moved with `EndKey(6)`, the optional font settings are
 * applied to it, and the text is typed with `TypeText`.
 *
 * @param filePath Document to modify.
 * @param text Text to append.
 * @param opts Optional font settings for the appended text.
 * @returns `true` when the script reported `{"ok":true}`, otherwise `false`.
 */
export async function appendText(
  filePath: string,
  text: string,
  opts?: AppendTextOptions,
): Promise<boolean> {
  const fontLines: string[] = []
  if (opts) {
    if (opts.bold !== undefined) {
      fontLines.push(`$sel.Font.Bold = ${opts.bold ? '-1' : '0'}`)
    }
    if (opts.italic !== undefined) {
      fontLines.push(`$sel.Font.Italic = ${opts.italic ? '-1' : '0'}`)
    }
    if (opts.fontSize !== undefined) {
      fontLines.push(`$sel.Font.Size = ${opts.fontSize}`)
    }
    if (opts.fontName) {
      fontLines.push(`$sel.Font.Name = '${psEscape(opts.fontName)}'`)
    }
  }
  const fontSetup = fontLines.join('\n ')
  const body = [
    '',
    '$sel = $word.Selection',
    '$sel.EndKey(6) | Out-Null',
    fontSetup,
    `$sel.TypeText('${psEscape(text)}')`,
    '',
  ].join('\n')
  const raw = runPs(wrapWordScriptWithSave(body, filePath))
  const outcome = parseJsonOutput<{ ok: boolean }>(raw, { ok: false })
  return outcome.ok
}
// ---------------------------------------------------------------------------
// 4. insertText
// ---------------------------------------------------------------------------
/**
 * Inserts `text` before paragraph `paraIndex` (passed straight to
 * `Paragraphs.Item`) and saves the document in place.
 *
 * @returns `true` when the script reported `{"ok":true}`, otherwise `false`.
 */
export async function insertText(
  filePath: string,
  paraIndex: number,
  text: string,
): Promise<boolean> {
  const escaped = psEscape(text)
  const body = [
    '',
    `$doc.Paragraphs.Item(${paraIndex}).Range.InsertBefore('${escaped}')`,
    '',
  ].join('\n')
  const raw = runPs(wrapWordScriptWithSave(body, filePath))
  const outcome = parseJsonOutput<{ ok: boolean }>(raw, { ok: false })
  return outcome.ok
}
// ---------------------------------------------------------------------------
// 5. findReplace
// ---------------------------------------------------------------------------
/**
 * Runs Word's Find/Replace on the document and saves it in place.
 *
 * Declared `async`, but the work is done by the synchronous `runPs` call.
 *
 * @param filePath Document to modify.
 * @param find Text to search for (wildcards, case and whole-word matching are disabled).
 * @param replace Replacement text.
 * @param replaceAll Replace every occurrence unless explicitly `false`, in
 *   which case only the first occurrence is replaced.
 * @returns For replace-all, the number of matches counted by a separate
 *   pre-pass over a duplicate of the content range; for single replace, 1 if a
 *   match was found, else 0. Returns 0 when the script fails.
 */
export async function findReplace(
filePath: string,
find: string,
replace: string,
replaceAll?: boolean,
): Promise<number> {
// wdReplaceAll=2, wdReplaceOne=1
const replaceConst = replaceAll !== false ? 2 : 1
const body = `
$content = $doc.Content
$findObj = $content.Find
$findObj.ClearFormatting()
$findObj.Replacement.ClearFormatting()
# Count replacements by iterating
$count = 0
$findObj.Text = '${psEscape(find)}'
$findObj.Replacement.Text = '${psEscape(replace)}'
$findObj.Forward = $true
$findObj.Wrap = 0
$findObj.Format = $false
$findObj.MatchCase = $false
$findObj.MatchWholeWord = $false
$findObj.MatchWildcards = $false
if (${replaceConst} -eq 2) {
# Count occurrences first using a clone of content
$range2 = $doc.Content.Duplicate
while ($range2.Find.Execute('${psEscape(find)}')) { $count++ }
# Now do the actual replace
$findObj.Execute('${psEscape(find)}', $false, $false, $false, $false, $false, $true, 0, $false, '${psEscape(replace)}', 2)
} else {
$found = $findObj.Execute('${psEscape(find)}', $false, $false, $false, $false, $false, $true, 0, $false, '${psEscape(replace)}', 1)
if ($found) { $count = 1 }
}
`
// Same open/save/cleanup boilerplate as the other operations, but the script
// prints {"count":N} instead of {"ok":...}; any exception prints {"count":0}.
const script = `
$word = New-Object -ComObject Word.Application
$word.Visible = $false
$word.DisplayAlerts = 0
try {
$doc = $word.Documents.Open('${psEscape(filePath)}')
${body}
$doc.Save()
Write-Output ('{"count":' + $count + '}')
} catch {
Write-Output '{"count":0}'
} finally {
if ($doc -ne $null) { $doc.Close($false); }
if ($word -ne $null) { $word.Quit(); }
if ($word -ne $null) { [System.Runtime.InteropServices.Marshal]::ReleaseComObject($word) | Out-Null }
}
`
const raw = runPs(script)
return parseJsonOutput<{ count: number }>(raw, { count: 0 }).count
}
// ---------------------------------------------------------------------------
// 6. insertTable
// ---------------------------------------------------------------------------
/**
 * Adds a `rows` x `cols` table at the end of the document, fills its cells
 * from `data` (row-major), and saves the document in place.
 *
 * @param filePath Document to modify.
 * @param rows Number of table rows to create.
 * @param cols Number of table columns to create.
 * @param data Cell text; `data[r][c]` is written to cell `(r + 1, c + 1)`.
 * @returns `true` when the script reported `{"ok":true}`, otherwise `false`.
 */
export async function insertTable(
  filePath: string,
  rows: number,
  cols: number,
  data: string[][],
): Promise<boolean> {
  // Each row becomes ",@('a','b')" so PowerShell keeps it as a nested array.
  const rowLiterals: string[] = []
  for (const row of data) {
    const cells = row.map((cell) => `'${psEscape(cell)}'`)
    rowLiterals.push(',@(' + cells.join(',') + ')')
  }
  const psData = rowLiterals.join('\n ')
  const body = [
    '',
    '$sel = $word.Selection',
    '$sel.EndKey(6) | Out-Null',
    `$table = $doc.Tables.Add($sel.Range, ${rows}, ${cols})`,
    `$data = @(${psData})`,
    'for ($r = 0; $r -lt $data.Count; $r++) {',
    'for ($c = 0; $c -lt $data[$r].Count; $c++) {',
    '$table.Cell($r + 1, $c + 1).Range.Text = $data[$r][$c]',
    '}',
    '}',
    '',
  ].join('\n')
  const raw = runPs(wrapWordScriptWithSave(body, filePath))
  const outcome = parseJsonOutput<{ ok: boolean }>(raw, { ok: false })
  return outcome.ok
}
// ---------------------------------------------------------------------------
// 7. saveWord
// ---------------------------------------------------------------------------
/**
 * Saves the document. With no `savePath`, or one equal to `filePath`, the
 * document is opened and saved in place; otherwise it is opened and written
 * to `savePath` with `SaveAs`.
 *
 * @returns `true` when the script reported `{"ok":true}`, otherwise `false`.
 */
export async function saveWord(
  filePath: string,
  savePath?: string,
): Promise<boolean> {
  let script: string
  if (savePath && savePath !== filePath) {
    script = `
$word = New-Object -ComObject Word.Application
$word.Visible = $false
$word.DisplayAlerts = 0
try {
$doc = $word.Documents.Open('${psEscape(filePath)}')
$doc.SaveAs('${psEscape(savePath)}')
Write-Output '{"ok":true}'
} catch {
Write-Output ('{"ok":false,"error":"' + ($_.Exception.Message -replace '"','\\"') + '"}')
} finally {
if ($doc -ne $null) { $doc.Close($false); }
if ($word -ne $null) { $word.Quit(); }
if ($word -ne $null) { [System.Runtime.InteropServices.Marshal]::ReleaseComObject($word) | Out-Null }
}
`
  } else {
    // In-place save: an empty body, the wrapper performs the Save().
    script = wrapWordScriptWithSave('', filePath)
  }
  const outcome = parseJsonOutput<{ ok: boolean }>(runPs(script), { ok: false })
  return outcome.ok
}
// ---------------------------------------------------------------------------
// 8. saveAsPdf
// ---------------------------------------------------------------------------
/**
 * Opens `filePath` and exports it to `pdfPath` via `SaveAs2` with format 17
 * (wdFormatPDF).
 *
 * @returns `true` when the script reported `{"ok":true}`, otherwise `false`.
 */
export async function saveAsPdf(
  filePath: string,
  pdfPath: string,
): Promise<boolean> {
  const WD_FORMAT_PDF = 17
  const source = psEscape(filePath)
  const target = psEscape(pdfPath)
  const script = [
    '',
    '$word = New-Object -ComObject Word.Application',
    '$word.Visible = $false',
    '$word.DisplayAlerts = 0',
    'try {',
    `$doc = $word.Documents.Open('${source}')`,
    `$doc.SaveAs2('${target}', ${WD_FORMAT_PDF})`,
    `Write-Output '{"ok":true}'`,
    '} catch {',
    `Write-Output ('{"ok":false,"error":"' + ($_.Exception.Message -replace '"','\\"') + '"}')`,
    '} finally {',
    'if ($doc -ne $null) { $doc.Close($false); }',
    'if ($word -ne $null) { $word.Quit(); }',
    'if ($word -ne $null) { [System.Runtime.InteropServices.Marshal]::ReleaseComObject($word) | Out-Null }',
    '}',
    '',
  ].join('\n')
  const outcome = parseJsonOutput<{ ok: boolean }>(runPs(script), { ok: false })
  return outcome.ok
}
// ---------------------------------------------------------------------------
// 9. createWord
// ---------------------------------------------------------------------------
/**
 * Creates a new blank document and writes it to `savePath` with `SaveAs`.
 *
 * @returns `true` when the script reported `{"ok":true}`, otherwise `false`.
 */
export async function createWord(savePath: string): Promise<boolean> {
  const target = psEscape(savePath)
  const script = [
    '',
    '$word = New-Object -ComObject Word.Application',
    '$word.Visible = $false',
    '$word.DisplayAlerts = 0',
    'try {',
    '$doc = $word.Documents.Add()',
    `$doc.SaveAs('${target}')`,
    `Write-Output '{"ok":true}'`,
    '} catch {',
    `Write-Output ('{"ok":false,"error":"' + ($_.Exception.Message -replace '"','\\"') + '"}')`,
    '} finally {',
    'if ($doc -ne $null) { $doc.Close($false); }',
    'if ($word -ne $null) { $word.Quit(); }',
    'if ($word -ne $null) { [System.Runtime.InteropServices.Marshal]::ReleaseComObject($word) | Out-Null }',
    '}',
    '',
  ].join('\n')
  const outcome = parseJsonOutput<{ ok: boolean }>(runPs(script), { ok: false })
  return outcome.ok
}
// ---------------------------------------------------------------------------
// 10. closeWord (no-op)
// ---------------------------------------------------------------------------
/**
 * Intentionally does nothing: every operation in this module starts and shuts
 * down its own Word COM instance, so there is never an open document to close.
 */
export function closeWord(_filePath: string): void {
  return
}

View File

@@ -0,0 +1,254 @@
/**
* Input Indicator — floating label showing what Computer Use is doing
* on the bound window.
*
* Displays a small overlay near the bottom of the bound window:
* ⌨ Typing "hello world..."
* 🖱 Click (120, 50)
* ⌨ Ctrl+S
* 📜 Scroll ↓ 3
* ✅ Done
*
* Auto-fades after 2 seconds of inactivity.
* Click-through, TOPMOST, no taskbar icon.
*/
import * as fs from 'fs'
import * as path from 'path'
import { validateHwnd, getTmpDir } from './shared.js'
// Overlay size in pixels, interpolated into the PowerShell form's Size.
const INDICATOR_WIDTH = 350
const INDICATOR_HEIGHT = 28
// Milliseconds without a new message before the overlay is hidden.
const FADE_AFTER_MS = 2000
// "R, G, B" component lists spliced into [System.Drawing.Color]::FromArgb(...).
const BG_COLOR = '30, 30, 30' // dark background
const TEXT_COLOR = '220, 220, 220' // light text
// NOTE(review): not referenced by any code visible in this module — confirm it is still needed.
const ACCENT_COLOR = '80, 200, 80' // green accent for active
// State of the currently running indicator; all null when none is active.
let indicatorProc: ReturnType<typeof Bun.spawn> | null = null
// Sentinel file whose existence tells the PowerShell process to exit.
let stopFile: string | null = null
// Temporary .ps1 file the indicator process was started from.
let scriptFile: string | null = null
// File used to hand the next message to the PowerShell process (stopFile + '.msg').
let msgFile: string | null = null
/**
 * Builds the PowerShell script for the indicator overlay process.
 *
 * The script creates a borderless, click-through, topmost WinForms window with
 * a single label, then polls every 50 ms:
 *  - exits when the target window no longer exists or the stop file appears;
 *  - reads (and deletes) `<stop file>.msg` to pick up a new message;
 *  - positions the overlay at the bottom centre of the target window while a
 *    message is fresh, fading it over the last 30% of FADE_AFTER_MS, and hides
 *    it once the message is older than FADE_AFTER_MS.
 *
 * @param hwnd Target window handle; callers validate it as numeric before it is interpolated.
 * @param sf Path of the stop file; the message file path is derived from it.
 * @returns The script text.
 */
function buildIndicatorScript(hwnd: string, sf: string): string {
  // The path is embedded in a PowerShell single-quoted string, where backslash
  // is literal and only quote characters (ASCII and the Unicode variants
  // PowerShell also accepts) need doubling. Doubling backslashes instead left
  // quotes unescaped and broke UNC paths.
  const sfEsc = sf.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  // BackColor is opaque on purpose: a WinForms Form rejects colours with
  // alpha < 255, so translucency comes from $form.Opacity alone.
  return `
Add-Type -AssemblyName System.Windows.Forms
Add-Type -AssemblyName System.Drawing
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class Indicator {
[DllImport("user32.dll")] public static extern bool IsWindow(IntPtr h);
[DllImport("user32.dll",SetLastError=true)] public static extern int SetWindowLong(IntPtr h, int i, int v);
[DllImport("user32.dll",SetLastError=true)] public static extern int GetWindowLong(IntPtr h, int i);
[DllImport("user32.dll")] public static extern bool SetWindowPos(IntPtr h, IntPtr a, int x, int y, int w, int h2, uint f);
[DllImport("user32.dll")] public static extern bool GetWindowRect(IntPtr h, out RECT r);
[StructLayout(LayoutKind.Sequential)] public struct RECT { public int L,T,R,B; }
public const int GWL_EXSTYLE = -20;
public const int WS_EX_LAYERED = 0x80000;
public const int WS_EX_TRANSPARENT = 0x20;
public const int WS_EX_TOOLWINDOW = 0x80;
public const int WS_EX_NOACTIVATE = 0x08000000;
public static readonly IntPtr HWND_TOPMOST = new IntPtr(-1);
public const uint SWP_NOACTIVATE = 0x0010;
public const uint SWP_SHOWWINDOW = 0x0040;
public static void MakeOverlay(IntPtr h) {
int ex = GetWindowLong(h, GWL_EXSTYLE);
ex |= WS_EX_LAYERED | WS_EX_TRANSPARENT | WS_EX_TOOLWINDOW | WS_EX_NOACTIVATE;
SetWindowLong(h, GWL_EXSTYLE, ex);
}
}
'@
$targetHwnd = [IntPtr]::new([long]${hwnd})
$stopFile = '${sfEsc}'
$msgFile = $stopFile + '.msg'
$form = New-Object System.Windows.Forms.Form
$form.FormBorderStyle = [System.Windows.Forms.FormBorderStyle]::None
$form.ShowInTaskbar = $false
$form.TopMost = $true
$form.StartPosition = [System.Windows.Forms.FormStartPosition]::Manual
$form.Size = New-Object System.Drawing.Size(${INDICATOR_WIDTH}, ${INDICATOR_HEIGHT})
$form.Location = New-Object System.Drawing.Point(-32000, -32000)
$form.BackColor = [System.Drawing.Color]::FromArgb(${BG_COLOR})
$form.Opacity = 0.92
$label = New-Object System.Windows.Forms.Label
$label.Dock = [System.Windows.Forms.DockStyle]::Fill
$label.ForeColor = [System.Drawing.Color]::FromArgb(${TEXT_COLOR})
$label.Font = New-Object System.Drawing.Font("Segoe UI", 10, [System.Drawing.FontStyle]::Regular)
$label.TextAlign = [System.Drawing.ContentAlignment]::MiddleLeft
$label.Padding = New-Object System.Windows.Forms.Padding(8, 0, 8, 0)
$label.Text = ""
$form.Controls.Add($label)
$form.Show()
[Indicator]::MakeOverlay($form.Handle)
$script:lastMsg = ""
$script:lastMsgTime = [DateTime]::MinValue
$script:visible = $false
$timer = New-Object System.Windows.Forms.Timer
$timer.Interval = 50 # 20fps
$timer.Add_Tick({
if (-not [Indicator]::IsWindow($targetHwnd)) {
$timer.Stop(); $form.Close()
[System.Windows.Forms.Application]::ExitThread()
return
}
if (Test-Path $stopFile) {
$timer.Stop(); $form.Close()
try { Remove-Item $stopFile -ErrorAction SilentlyContinue } catch {}
try { Remove-Item $msgFile -ErrorAction SilentlyContinue } catch {}
[System.Windows.Forms.Application]::ExitThread()
return
}
# Read new message
if (Test-Path $msgFile) {
try {
$msg = Get-Content $msgFile -Raw -Encoding UTF8 -ErrorAction SilentlyContinue
if ($msg) {
$script:lastMsg = $msg.Trim()
$script:lastMsgTime = [DateTime]::Now
Remove-Item $msgFile -ErrorAction SilentlyContinue
}
} catch {}
}
# Fade logic: hide after ${FADE_AFTER_MS}ms of no updates
$elapsed = ([DateTime]::Now - $script:lastMsgTime).TotalMilliseconds
if ($elapsed -gt ${FADE_AFTER_MS} -and $script:visible) {
$form.Visible = $false
$script:visible = $false
return
}
if ($elapsed -le ${FADE_AFTER_MS} -and $script:lastMsg -ne "") {
# Position at bottom-center of the bound window
$wr = New-Object Indicator+RECT
[Indicator]::GetWindowRect($targetHwnd, [ref]$wr) | Out-Null
$ww = $wr.R - $wr.L
$fx = $wr.L + [int](($ww - ${INDICATOR_WIDTH}) / 2)
$fy = $wr.B - ${INDICATOR_HEIGHT} - 8
$label.Text = $script:lastMsg
[Indicator]::SetWindowPos($form.Handle, [Indicator]::HWND_TOPMOST,
$fx, $fy, 0, 0,
0x0001 -bor [Indicator]::SWP_NOACTIVATE -bor [Indicator]::SWP_SHOWWINDOW) | Out-Null
$form.Visible = $true
$script:visible = $true
# Fade opacity near end
if ($elapsed -gt ${FADE_AFTER_MS * 0.7}) {
$form.Opacity = [Math]::Max(0.3, 0.92 * (1.0 - ($elapsed - ${FADE_AFTER_MS * 0.7}) / ${FADE_AFTER_MS * 0.3}))
} else {
$form.Opacity = 0.92
}
}
})
$timer.Start()
[System.Windows.Forms.Application]::Run()
`
}
/**
 * Start the input indicator for a bound window.
 *
 * Any previously running indicator is hidden first. The generated script is
 * written to a temp .ps1 file and run in a detached PowerShell process.
 *
 * @param hwnd Numeric window handle of the bound window.
 * @returns `true` when the process was spawned, `false` when writing the
 *   script or spawning failed.
 * @throws Error when `hwnd` is not a purely numeric string (from `validateHwnd`).
 */
export function showIndicator(hwnd: string): boolean {
  hwnd = validateHwnd(hwnd)
  hideIndicator()
  let writtenScript: string | null = null
  try {
    const tmpDir = getTmpDir()
    const ts = Date.now()
    const nextStopFile = path.join(tmpDir, `cu_indicator_stop_${ts}`)
    const nextScriptFile = path.join(tmpDir, `cu_indicator_${ts}.ps1`)
    // Prefix a UTF-8 BOM: Windows PowerShell decodes BOM-less script files
    // with the ANSI code page, which corrupts non-ASCII characters in the
    // embedded temp path (e.g. a non-ASCII user name).
    fs.writeFileSync(
      nextScriptFile,
      '\uFEFF' + buildIndicatorScript(hwnd, nextStopFile),
      'utf-8',
    )
    writtenScript = nextScriptFile
    indicatorProc = Bun.spawn(
      [
        'powershell',
        '-NoProfile',
        '-ExecutionPolicy',
        'Bypass',
        '-File',
        nextScriptFile,
      ],
      { stdout: 'ignore', stderr: 'ignore' },
    )
    // Publish the file paths only once the process exists, so a failed start
    // never leaves updateIndicator/hideIndicator pointing at a dead indicator.
    stopFile = nextStopFile
    scriptFile = nextScriptFile
    msgFile = nextStopFile + '.msg'
    return true
  } catch {
    if (writtenScript) {
      try {
        fs.unlinkSync(writtenScript)
      } catch {
        // Best-effort cleanup of the orphaned script file.
      }
    }
    return false
  }
}
/**
 * Hands a new message to the running indicator by writing it to the message
 * file. Does nothing when no indicator is active; write errors are ignored.
 */
export function updateIndicator(message: string): void {
  if (msgFile) {
    try {
      fs.writeFileSync(msgFile, message, 'utf-8')
    } catch {}
  }
}
/**
 * Hide and destroy the indicator.
 *
 * Writes the stop file so the PowerShell process exits on its own, then, two
 * seconds later, kills the process if it is still around and removes the
 * temporary files. Module state is cleared immediately.
 *
 * The process handle and file paths are captured in locals before the state
 * is reset: the delayed cleanup must act on the indicator being hidden, not on
 * whatever the module variables hold two seconds later (null, or a newer
 * indicator started by `showIndicator`).
 */
export function hideIndicator(): void {
  const proc = indicatorProc
  const stop = stopFile
  const script = scriptFile
  const msg = msgFile

  indicatorProc = null
  stopFile = null
  scriptFile = null
  msgFile = null

  if (!stop) return

  try {
    fs.writeFileSync(stop, 'STOP', 'utf-8')
  } catch {
    // Best effort: the delayed kill below still terminates the process.
  }

  setTimeout(() => {
    try {
      proc?.kill()
    } catch {
      // Process already exited.
    }
    for (const file of [script, stop, msg]) {
      if (!file) continue
      try {
        fs.unlinkSync(file)
      } catch {
        // File already removed (the script deletes stop/msg itself on exit).
      }
    }
  }, 2000)
}
// ── Convenience methods for common actions ──
/** Shows a "Typing" message with the text truncated to 30 UTF-16 units plus an ellipsis. */
export function indicateTyping(text: string): void {
  let preview = text
  if (text.length > 30) {
    preview = text.slice(0, 30) + '...'
  }
  updateIndicator('\u2328 Typing "' + preview + '"')
}
/** Shows a key-combination message, e.g. the keyboard glyph followed by "Ctrl+S". */
export function indicateKey(combo: string): void {
  const message = '\u2328 ' + combo
  updateIndicator(message)
}
/**
 * Shows a click message with the coordinates. Only `button === 'right'` is
 * labelled "Right-click"; every other value is shown as "Click".
 */
export function indicateClick(
  x: number,
  y: number,
  button: string = 'left',
): void {
  let label = 'Click'
  if (button === 'right') {
    label = 'Right-click'
  }
  updateIndicator('\uD83D\uDDB1 ' + label + ' (' + x + ', ' + y + ')')
}
/**
 * Shows a scroll message with a direction arrow and the amount. Directions
 * other than 'up', 'down' and 'left' are drawn with the right arrow.
 */
export function indicateScroll(direction: string, amount: number): void {
  let arrow: string
  switch (direction) {
    case 'up':
      arrow = '\u2191'
      break
    case 'down':
      arrow = '\u2193'
      break
    case 'left':
      arrow = '\u2190'
      break
    default:
      arrow = '\u2192'
  }
  updateIndicator('\uD83D\uDCDC Scroll ' + arrow + ' ' + amount)
}
/** Shows the check-mark "Done" message. */
export function indicateDone(): void {
  const message = '\u2705 Done'
  updateIndicator(message)
}

View File

@@ -3,6 +3,8 @@
* Captures a screen region or window, then runs WinRT OCR to extract text.
*/
import { ps as runPs } from './shared.js'
export interface OcrLine {
text: string
bounds: { x: number; y: number; w: number; h: number }
@@ -18,15 +20,6 @@ function emptyResult(language: string): OcrResult {
return { text: '', lines: [], language }
}
function runPs(script: string): string {
const result = Bun.spawnSync({
cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', script],
stdout: 'pipe',
stderr: 'pipe',
})
return new TextDecoder().decode(result.stdout).trim()
}
/**
* PowerShell script that:
* 1. Screenshots a screen region using CopyFromScreen

View File

@@ -0,0 +1,127 @@
/**
* Shared utilities for win32 Computer Use modules.
* Single source of truth — no more duplication across files.
*/
/**
 * Accepts a window handle only when it consists solely of decimal digits, so
 * it can be interpolated into generated scripts without injection risk.
 *
 * @param hwnd Candidate handle string.
 * @returns The same string, unchanged.
 * @throws Error when `hwnd` is empty or contains any non-digit character.
 */
export function validateHwnd(hwnd: string): string {
  const isNumeric = /^\d+$/.test(hwnd)
  if (isNumeric) {
    return hwnd
  }
  throw new Error(`Invalid HWND: "${hwnd}" — must be numeric`)
}
/**
 * Runs a PowerShell script synchronously and returns its trimmed stdout.
 * The exit code and stderr are not inspected.
 */
export function ps(script: string): string {
  const argv = ['powershell', '-NoProfile', '-NonInteractive', '-Command', script]
  const { stdout } = Bun.spawnSync({
    cmd: argv,
    stdout: 'pipe',
    stderr: 'pipe',
  })
  const text = new TextDecoder().decode(stdout)
  return text.trim()
}
/**
 * Runs a PowerShell script synchronously.
 *
 * @returns The trimmed stdout when the process exits with code 0; `null` when
 *   it exits with any other code or spawning throws.
 */
export function runPs(script: string): string | null {
  let output: string | null = null
  try {
    const { exitCode, stdout } = Bun.spawnSync({
      cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', script],
      stdout: 'pipe',
      stderr: 'pipe',
    })
    if (exitCode === 0) {
      output = new TextDecoder().decode(stdout).trim()
    }
  } catch {
    output = null
  }
  return output
}
/**
 * Run a PowerShell script asynchronously.
 *
 * stdout and stderr are drained concurrently. stderr is piped, so leaving it
 * unread would let a chatty script fill the pipe and block, which in turn
 * would keep the stdout read from ever completing.
 *
 * @param script PowerShell source passed via `-Command`.
 * @returns The script's stdout, trimmed. stderr and the exit code are discarded.
 */
export async function psAsync(script: string): Promise<string> {
  const proc = Bun.spawn(
    ['powershell', '-NoProfile', '-NonInteractive', '-Command', script],
    { stdout: 'pipe', stderr: 'pipe' },
  )
  const [out] = await Promise.all([
    new Response(proc.stdout).text(),
    new Response(proc.stderr).text(),
    proc.exited,
  ])
  return out.trim()
}
/**
 * Resolves the temp directory from the environment: the first non-empty value
 * of `TEMP`, then `TMP`, falling back to the literal '/tmp'.
 */
export function getTmpDir(): string {
  const { TEMP, TMP } = process.env
  if (TEMP) {
    return TEMP
  }
  if (TMP) {
    return TMP
  }
  return '/tmp'
}
/**
 * Maps lowercase key names to Windows virtual-key codes.
 *
 * Several names are aliases for the same code (enter/return, ctrl/control,
 * alt/option/menu, escape/esc, win/meta/command/cmd/super). This table holds
 * named keys only; letters and digits are not listed here.
 */
export const VK_MAP: Record<string, number> = {
// Editing and whitespace
backspace: 0x08,
tab: 0x09,
enter: 0x0d,
return: 0x0d,
// Modifiers (generic, then left/right variants)
shift: 0x10,
lshift: 0xa0,
rshift: 0xa1,
ctrl: 0x11,
control: 0x11,
lcontrol: 0xa2,
rcontrol: 0xa3,
alt: 0x12,
option: 0x12,
menu: 0x12,
lalt: 0xa4,
ralt: 0xa5,
pause: 0x13,
capslock: 0x14,
escape: 0x1b,
esc: 0x1b,
space: 0x20,
// Navigation
pageup: 0x21,
pagedown: 0x22,
end: 0x23,
home: 0x24,
left: 0x25,
up: 0x26,
right: 0x27,
down: 0x28,
insert: 0x2d,
delete: 0x2e,
// Windows key and its cross-platform aliases
win: 0x5b,
meta: 0x5b,
command: 0x5b,
cmd: 0x5b,
super: 0x5b,
numlock: 0x90,
scrolllock: 0x91,
printscreen: 0x2c,
// Function keys
f1: 0x70,
f2: 0x71,
f3: 0x72,
f4: 0x73,
f5: 0x74,
f6: 0x75,
f7: 0x76,
f8: 0x77,
f9: 0x78,
f10: 0x79,
f11: 0x7a,
f12: 0x7b,
}
/**
 * Key names treated as modifiers (shift, control, alt and Windows-key
 * families, including their aliases).
 *
 * NOTE(review): VK_MAP also maps 'menu' to the same code as 'alt' (0x12), but
 * 'menu' is not listed here — confirm whether that omission is intentional.
 */
export const MODIFIER_KEYS = new Set([
'shift',
'lshift',
'rshift',
'control',
'ctrl',
'lcontrol',
'rcontrol',
'alt',
'option',
'lalt',
'ralt',
'win',
'meta',
'command',
'cmd',
'super',
])

View File

@@ -5,6 +5,8 @@
* value setting, and hit-testing via PowerShell + System.Windows.Automation.
*/
import { ps } from './shared.js'
export interface UIElement {
name: string
controlType: string // Button, Edit, Text, List, Window, etc.
@@ -15,6 +17,48 @@ export interface UIElement {
children?: UIElement[]
}
// Allow-list of control type names accepted in element queries. findElement
// returns null for any controlType outside this set, because the name is
// interpolated into the script as [System.Windows.Automation.ControlType]::<name>.
const VALID_CONTROL_TYPES = new Set([
'Button',
'Calendar',
'CheckBox',
'ComboBox',
'Custom',
'DataGrid',
'DataItem',
'Document',
'Edit',
'Group',
'Header',
'HeaderItem',
'Hyperlink',
'Image',
'List',
'ListItem',
'Menu',
'MenuBar',
'MenuItem',
'Pane',
'ProgressBar',
'RadioButton',
'ScrollBar',
'Separator',
'Slider',
'Spinner',
'SplitButton',
'StatusBar',
'Tab',
'TabItem',
'Table',
'Text',
'Thumb',
'TitleBar',
'ToolBar',
'ToolTip',
'Tree',
'TreeItem',
'Window',
])
// ---------------------------------------------------------------------------
// Helper
// ---------------------------------------------------------------------------
@@ -25,15 +69,6 @@ Add-Type -AssemblyName UIAutomationTypes
Add-Type -AssemblyName WindowsBase
`
function ps(script: string): string {
const result = Bun.spawnSync({
cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', script],
stdout: 'pipe',
stderr: 'pipe',
})
return new TextDecoder().decode(result.stdout).trim()
}
function parseJsonSafe<T>(raw: string, fallback: T): T {
try {
if (!raw) return fallback
@@ -143,6 +178,9 @@ export function findElement(
)
}
if (query.controlType) {
if (!VALID_CONTROL_TYPES.has(query.controlType)) {
return null // Invalid control type
}
const v = query.controlType.replace(/'/g, "''")
conditions.push(
`[System.Windows.Automation.PropertyCondition]::new([System.Windows.Automation.AutomationElement]::ControlTypeProperty, [System.Windows.Automation.ControlType]::${v})`,
@@ -204,7 +242,10 @@ $obj | ConvertTo-Json -Compress
/**
* Click an element by its automationId using InvokePattern.
*/
export function clickElement(windowTitle: string, automationId: string): boolean {
export function clickElement(
windowTitle: string,
automationId: string,
): boolean {
const escapedTitle = windowTitle.replace(/'/g, "''")
const escapedId = automationId.replace(/'/g, "''")
@@ -237,7 +278,11 @@ try {
/**
* Set the value of an element by its automationId using ValuePattern.
*/
export function setValue(windowTitle: string, automationId: string, value: string): boolean {
export function setValue(
windowTitle: string,
automationId: string,
value: string,
): boolean {
const escapedTitle = windowTitle.replace(/'/g, "''")
const escapedId = automationId.replace(/'/g, "''")
const escapedValue = value.replace(/'/g, "''")

View File

@@ -0,0 +1,268 @@
/**
* Virtual Cursor — visible overlay cursor for the bound window.
*
* Shows a small colored cursor icon on top of the bound window,
* independent of the real mouse cursor. The user's real mouse
* stays free for their own use.
*
* The virtual cursor:
* - Moves when Computer Use calls click/moveMouse
* - Shows click animations (brief color flash)
* - Is click-through (WS_EX_TRANSPARENT) — doesn't intercept real mouse
* - Tracks the bound window position via the border tracker
* - Disappears when the window is unbound
*/
import * as fs from 'fs'
import * as path from 'path'
import { validateHwnd, getTmpDir } from './shared.js'
// Side length in pixels of the square cursor form and its bitmap.
const CURSOR_SIZE = 20
// RGB components of the arrow's fill colour.
const CURSOR_COLOR_R = 255
const CURSOR_COLOR_G = 50
const CURSOR_COLOR_B = 50
// Resting opacity of the cursor form; raised to 1.0 during a click flash.
const CURSOR_OPACITY = 0.9
// State of the currently running cursor overlay; all null when none is active.
let cursorProc: ReturnType<typeof Bun.spawn> | null = null
// Sentinel file whose existence tells the PowerShell process to exit;
// position updates are written to this path + '.pos'.
let cursorStopFile: string | null = null
// Temporary .ps1 file the cursor process was started from.
let cursorScriptFile: string | null = null
/**
 * Builds the PowerShell script for the virtual-cursor overlay process.
 *
 * The script creates a small borderless, click-through, topmost form showing
 * an arrow bitmap, then polls every 16 ms:
 *  - exits when the target window no longer exists or the stop file appears;
 *  - reads (and deletes) `<stop file>.pos`, which holds "x,y" or "x,y,click";
 *  - places the cursor at the target window's top-left (from GetWindowRect)
 *    plus the last x/y, and briefly raises opacity after a click.
 *
 * @param hwnd Target window handle; callers validate it as numeric before it is interpolated.
 * @param stopFile Path of the stop file; the position file path is derived from it.
 * @returns The script text.
 */
function buildCursorScript(hwnd: string, stopFile: string): string {
  // The path is embedded in a PowerShell single-quoted string, where backslash
  // is literal and only quote characters (ASCII and the Unicode variants
  // PowerShell also accepts) need doubling. Doubling backslashes instead left
  // quotes unescaped and broke UNC paths.
  const stopFileEscaped = stopFile.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  return `
Add-Type -AssemblyName System.Windows.Forms
Add-Type -AssemblyName System.Drawing
Add-Type @'
using System;
using System.Runtime.InteropServices;
using System.Drawing;
using System.Drawing.Drawing2D;
public class VCursor {
[DllImport("user32.dll")]
public static extern bool IsWindow(IntPtr hWnd);
[DllImport("user32.dll", SetLastError = true)]
public static extern int SetWindowLong(IntPtr hWnd, int nIndex, int dwNewLong);
[DllImport("user32.dll", SetLastError = true)]
public static extern int GetWindowLong(IntPtr hWnd, int nIndex);
[DllImport("user32.dll")]
public static extern bool SetWindowPos(IntPtr hWnd, IntPtr hAfter, int X, int Y, int cx, int cy, uint f);
[DllImport("user32.dll")]
public static extern bool GetWindowRect(IntPtr h, out RECT r);
[StructLayout(LayoutKind.Sequential)]
public struct RECT { public int L, T, R, B; }
public const int GWL_EXSTYLE = -20;
public const int WS_EX_LAYERED = 0x80000;
public const int WS_EX_TRANSPARENT = 0x20;
public const int WS_EX_TOOLWINDOW = 0x80;
public const int WS_EX_NOACTIVATE = 0x08000000;
public static readonly IntPtr HWND_TOPMOST = new IntPtr(-1);
public const uint SWP_NOACTIVATE = 0x0010;
public const uint SWP_SHOWWINDOW = 0x0040;
public const uint SWP_NOSIZE = 0x0001;
public static void MakeOverlay(IntPtr h) {
int ex = GetWindowLong(h, GWL_EXSTYLE);
ex |= WS_EX_LAYERED | WS_EX_TRANSPARENT | WS_EX_TOOLWINDOW | WS_EX_NOACTIVATE;
SetWindowLong(h, GWL_EXSTYLE, ex);
}
}
'@
$targetHwnd = [IntPtr]::new([long]${hwnd})
$stopFile = '${stopFileEscaped}'
$cursorSize = ${CURSOR_SIZE}
# Create cursor form with arrow shape
$cursor = New-Object System.Windows.Forms.Form
$cursor.FormBorderStyle = [System.Windows.Forms.FormBorderStyle]::None
$cursor.ShowInTaskbar = $false
$cursor.TopMost = $true
$cursor.StartPosition = [System.Windows.Forms.FormStartPosition]::Manual
$cursor.Size = New-Object System.Drawing.Size($cursorSize, $cursorSize)
$cursor.Location = New-Object System.Drawing.Point(-32000, -32000)
$cursor.Opacity = ${CURSOR_OPACITY}
$cursor.BackColor = [System.Drawing.Color]::Magenta
$cursor.TransparencyKey = [System.Drawing.Color]::Magenta
# Draw arrow cursor shape
$bmp = New-Object System.Drawing.Bitmap($cursorSize, $cursorSize)
$g = [System.Drawing.Graphics]::FromImage($bmp)
$g.SmoothingMode = [System.Drawing.Drawing2D.SmoothingMode]::AntiAlias
# Arrow polygon (pointing top-left)
$points = @(
(New-Object System.Drawing.Point(1, 1)),
(New-Object System.Drawing.Point(1, 16)),
(New-Object System.Drawing.Point(5, 12)),
(New-Object System.Drawing.Point(9, 18)),
(New-Object System.Drawing.Point(12, 16)),
(New-Object System.Drawing.Point(8, 10)),
(New-Object System.Drawing.Point(13, 10)),
(New-Object System.Drawing.Point(1, 1))
)
$brush = New-Object System.Drawing.SolidBrush([System.Drawing.Color]::FromArgb(${CURSOR_COLOR_R}, ${CURSOR_COLOR_G}, ${CURSOR_COLOR_B}))
$g.FillPolygon($brush, $points)
$pen = New-Object System.Drawing.Pen([System.Drawing.Color]::White, 1)
$g.DrawPolygon($pen, $points)
$g.Dispose()
$cursor.BackgroundImage = $bmp
$cursor.Show()
[VCursor]::MakeOverlay($cursor.Handle)
# Position file: the TS side writes "x,y" or "x,y,click" to this file
$posFile = $stopFile + '.pos'
$script:lastCX = -32000
$script:lastCY = -32000
$script:clickFlash = 0
$timer = New-Object System.Windows.Forms.Timer
$timer.Interval = 16 # ~60fps
$timer.Add_Tick({
if (-not [VCursor]::IsWindow($targetHwnd)) {
$timer.Stop(); $cursor.Close()
[System.Windows.Forms.Application]::ExitThread()
return
}
# Check stop
if (Test-Path $stopFile) {
$timer.Stop(); $cursor.Close()
try { Remove-Item $stopFile -ErrorAction SilentlyContinue } catch {}
try { Remove-Item $posFile -ErrorAction SilentlyContinue } catch {}
[System.Windows.Forms.Application]::ExitThread()
return
}
# Read position updates
if (Test-Path $posFile) {
try {
$data = Get-Content $posFile -Raw -ErrorAction SilentlyContinue
if ($data) {
$parts = $data.Trim().Split(',')
if ($parts.Length -ge 2) {
$script:lastCX = [int]$parts[0]
$script:lastCY = [int]$parts[1]
if ($parts.Length -ge 3 -and $parts[2] -eq 'click') {
$script:clickFlash = 6 # flash for 6 frames (~100ms)
}
}
Remove-Item $posFile -ErrorAction SilentlyContinue
}
} catch {}
}
# Get window position to convert client coords to screen coords
$wr = New-Object VCursor+RECT
[VCursor]::GetWindowRect($targetHwnd, [ref]$wr) | Out-Null
$screenX = $wr.L + $script:lastCX
$screenY = $wr.T + $script:lastCY
# Click flash: briefly change color
if ($script:clickFlash -gt 0) {
$cursor.Opacity = 1.0
$script:clickFlash--
if ($script:clickFlash -eq 0) {
$cursor.Opacity = ${CURSOR_OPACITY}
}
}
[VCursor]::SetWindowPos($cursor.Handle, [VCursor]::HWND_TOPMOST,
$screenX, $screenY, 0, 0,
[VCursor]::SWP_NOSIZE -bor [VCursor]::SWP_NOACTIVATE -bor [VCursor]::SWP_SHOWWINDOW) | Out-Null
$cursor.Visible = $true
})
$timer.Start()
[System.Windows.Forms.Application]::Run()
`
}
/**
 * Start the virtual cursor overlay for a bound window.
 *
 * Any previously running cursor is hidden first. The generated script is
 * written to a temp .ps1 file and run in a detached PowerShell process.
 *
 * @param hwnd Numeric window handle of the bound window.
 * @returns `true` when the process was spawned, `false` when writing the
 *   script or spawning failed.
 * @throws Error when `hwnd` is not a purely numeric string (from `validateHwnd`).
 */
export function showVirtualCursor(hwnd: string): boolean {
  hwnd = validateHwnd(hwnd)
  hideVirtualCursor()
  let writtenScript: string | null = null
  try {
    const tmpDir = getTmpDir()
    const ts = Date.now()
    const stopFile = path.join(tmpDir, `cu_vcursor_stop_${ts}`)
    const scriptFile = path.join(tmpDir, `cu_vcursor_${ts}.ps1`)
    const script = buildCursorScript(hwnd, stopFile)
    // Prefix a UTF-8 BOM: Windows PowerShell decodes BOM-less script files
    // with the ANSI code page, which corrupts non-ASCII characters in the
    // embedded temp path (e.g. a non-ASCII user name).
    fs.writeFileSync(scriptFile, '\uFEFF' + script, 'utf-8')
    writtenScript = scriptFile
    cursorProc = Bun.spawn(
      [
        'powershell',
        '-NoProfile',
        '-ExecutionPolicy',
        'Bypass',
        '-File',
        scriptFile,
      ],
      { stdout: 'ignore', stderr: 'ignore' },
    )
    cursorStopFile = stopFile
    cursorScriptFile = scriptFile
    return true
  } catch {
    if (writtenScript) {
      try {
        fs.unlinkSync(writtenScript)
      } catch {
        // Best-effort cleanup of the orphaned script file.
      }
    }
    return false
  }
}
/**
 * Sends a new cursor position to the overlay process by writing "x,y" (or
 * "x,y,click" to trigger the click flash) to the position file. Coordinates
 * are rounded to integers. Does nothing when no cursor is active; write
 * errors are ignored.
 */
export function moveVirtualCursor(
  x: number,
  y: number,
  isClick: boolean = false,
): void {
  if (cursorStopFile) {
    const target = `${cursorStopFile}.pos`
    try {
      const coords = [Math.round(x), Math.round(y)].join(',')
      const payload = isClick ? coords + ',click' : coords
      fs.writeFileSync(target, payload, 'utf-8')
    } catch {}
  }
}
/**
 * Hide and destroy the virtual cursor.
 *
 * Writes the stop file so the PowerShell process exits on its own, then, two
 * seconds later, kills the process if it is still around and removes the
 * temporary files. Module state is cleared immediately.
 *
 * The process handle and file paths are captured in locals before the state
 * is reset: the delayed cleanup must act on the cursor being hidden, not on
 * whatever the module variables hold two seconds later (null, or a newer
 * cursor started by `showVirtualCursor`).
 */
export function hideVirtualCursor(): void {
  const proc = cursorProc
  const stop = cursorStopFile
  const script = cursorScriptFile

  cursorProc = null
  cursorStopFile = null
  cursorScriptFile = null

  if (!stop) return

  try {
    fs.writeFileSync(stop, 'STOP', 'utf-8')
  } catch {
    // Best effort: the delayed kill below still terminates the process.
  }

  setTimeout(() => {
    try {
      proc?.kill()
    } catch {
      // Process already exited.
    }
    // The .pos file is included in case the process died before removing it.
    for (const file of [script, stop, stop + '.pos']) {
      if (!file) continue
      try {
        fs.unlinkSync(file)
      } catch {
        // File already removed (the script deletes stop/pos itself on exit).
      }
    }
  }, 2000)
}
/**
 * Reports whether a cursor overlay process handle is currently held. This
 * reflects module state only; the process itself is not probed.
 */
export function isVirtualCursorActive(): boolean {
  const hasProcess = !(cursorProc === null)
  return hasProcess
}

View File

@@ -0,0 +1,66 @@
/**
* Visual indicator for bound windows — DWM native border color.
*
* Uses DwmSetWindowAttribute(DWMWA_BORDER_COLOR) to set a green border
* on the bound window. The border:
* - Is the window's OWN border, not an overlay — zero offset, zero shadow issues
* - Follows window movement/resize/rounded corners automatically (OS-level)
* - Persists across repaints, zero performance overhead
* - Works on Win11 22000+ (Build 22000 = Windows 11 GA)
*
* No overlays, no polling, no separate processes, no z-order issues.
*/
import { validateHwnd, ps } from './shared.js'
/**
 * Marks a bound window by setting its DWM border colour to green.
 *
 * Calls DwmSetWindowAttribute with attribute 34 (DWMWA_BORDER_COLOR) and the
 * COLORREF 0x0000C800 (0x00BBGGRR: R=0, G=200, B=0).
 *
 * @param hwnd Numeric window handle.
 * @returns `true` when the call printed HRESULT 0.
 * @throws Error when `hwnd` is not a purely numeric string (from `validateHwnd`).
 */
export function markBound(hwnd: string): boolean {
  const handle = validateHwnd(hwnd)
  const script = [
    "Add-Type @'",
    'using System;',
    'using System.Runtime.InteropServices;',
    'public class CuDwm {',
    '[DllImport("dwmapi.dll")]',
    'public static extern int DwmSetWindowAttribute(IntPtr hwnd, int attr, ref uint val, int size);',
    '}',
    "'@",
    '$color = [uint32]0x0000C800',
    `[CuDwm]::DwmSetWindowAttribute([IntPtr]::new([long]${handle}), 34, [ref]$color, 4)`,
  ].join('\n')
  return ps(script) === '0'
}
/**
 * Remove border, restore default.
 *
 * Calls DwmSetWindowAttribute with attribute 34 (DWMWA_BORDER_COLOR) and
 * DWMWA_COLOR_DEFAULT (0xFFFFFFFF).
 *
 * The sentinel is written as `[uint32]::MaxValue` because PowerShell parses
 * the hex literal `0xFFFFFFFF` as the Int32 value -1, and casting -1 to
 * [uint32] fails, so the default colour would never reach DWM.
 *
 * @param hwnd Numeric window handle.
 * @returns `true` when the call printed HRESULT 0.
 * @throws Error when `hwnd` is not a purely numeric string (from `validateHwnd`).
 */
export function unmarkBound(hwnd: string): boolean {
  hwnd = validateHwnd(hwnd)
  // DWMWA_COLOR_DEFAULT = 0xFFFFFFFF
  const hr = ps(
    `Add-Type @'
using System;
using System.Runtime.InteropServices;
public class CuDwm {
[DllImport("dwmapi.dll")]
public static extern int DwmSetWindowAttribute(IntPtr hwnd, int attr, ref uint val, int size);
}
'@
$color = [uint32]::MaxValue
[CuDwm]::DwmSetWindowAttribute([IntPtr]::new([long]${hwnd}), 34, [ref]$color, 4)`,
  )
  return hr === '0'
}
/**
 * Intentional no-op kept for API compatibility.
 *
 * The DWM-based border is a window attribute, so there are no overlay
 * windows or helper processes to tear down here.
 */
export function cleanupAllBorders(): void {
// Nothing to do: this function does not track bound windows and performs no work.
// NOTE(review): the border colour is set on the TARGET window, so it presumably
// stays until unmarkBound() is called or that window closes — confirm whether
// callers need to unmark explicitly before this process exits.
}

View File

@@ -4,7 +4,7 @@
*/
export interface WindowInfo {
hwnd: number
hwnd: string
pid: number
title: string
}
@@ -59,7 +59,13 @@ public class WinEnum {
*/
export function listWindows(): WindowInfo[] {
const result = Bun.spawnSync({
cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', ENUM_WINDOWS_PS],
cmd: [
'powershell',
'-NoProfile',
'-NonInteractive',
'-Command',
ENUM_WINDOWS_PS,
],
stdout: 'pipe',
stderr: 'pipe',
})
@@ -75,11 +81,11 @@ export function listWindows(): WindowInfo[] {
const secondPipe = trimmed.indexOf('|', firstPipe + 1)
if (firstPipe === -1 || secondPipe === -1) return null
const hwnd = Number(trimmed.slice(0, firstPipe))
const hwnd = trimmed.slice(0, firstPipe)
const pid = Number(trimmed.slice(firstPipe + 1, secondPipe))
const title = trimmed.slice(secondPipe + 1)
if (isNaN(hwnd) || isNaN(pid) || !title) return null
if (!hwnd || isNaN(pid) || !title) return null
return { hwnd, pid, title }
})
.filter((item): item is WindowInfo => item !== null)

View File

@@ -0,0 +1,696 @@
/**
* SendMessage-based input for Win32 windows.
*
* ALL text/keyboard operations target a specific HWND via SendMessageW.
* No SendInput / keybd_event / SendKeys — those are global and conflict with the user.
*
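* Text input strategy:
* - sendText() always sends the text as WM_CHAR messages (one per UTF-16 code unit,
*   so supplementary characters go out as a surrogate pair) via SendMessageW.
* - pasteViaClipboard() (Clipboard.SetText() + a posted Ctrl+V) is an alternative
*   intended for text longer than CLIPBOARD_THRESHOLD; sendText() does not call it.
* Both paths support full Unicode (Chinese, emoji, etc.) without IME involvement.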
*/
import { validateHwnd, runPs, VK_MAP, MODIFIER_KEYS } from './shared.js'
/**
 * Character count above which text input is meant to switch to clipboard paste.
 * NOTE(review): sendText() in this file always takes the WM_CHAR path and does
 * not read this constant — confirm whether it is still referenced anywhere.
 */
const CLIPBOARD_THRESHOLD = 32
/**
 * Memoized findEditChild results, keyed by parent HWND.
 * A stored `null` records that a search ran and found no edit child.
 */
const editChildCache = new Map<string, string | null>()

/**
 * Drop cached edit-child lookups (call when a window is unbound).
 *
 * @param hwnd - parent HWND whose entry should be removed; when omitted
 *   (or empty) the whole cache is emptied.
 */
export function clearEditChildCache(hwnd?: string): void {
  if (!hwnd) {
    editChildCache.clear()
    return
  }
  editChildCache.delete(hwnd)
}
/**
 * Pick the window handle that input messages should be delivered to.
 *
 * The handle is validated, then findEditChild() is consulted; when it yields
 * a child window (e.g. a WinUI 3 InputSite or a classic edit control) that
 * child is returned, otherwise the validated handle itself.
 *
 * @param hwnd - handle of the bound window.
 * @returns the child input HWND if one was found, else the validated `hwnd`.
 */
export function resolveInputHwnd(hwnd: string): string {
  const validated = validateHwnd(hwnd)
  const editChild = findEditChild(validated)
  if (editChild == null) {
    return validated
  }
  return editChild
}
/**
 * PowerShell prelude that compiles the C# helper class `WinMsg` via Add-Type.
 * Scripts in this file prepend it before calling [WinMsg]::… members.
 *
 * It provides:
 * - P/Invoke bindings for EnumChildWindows, GetClassName, SendMessageW,
 *   PostMessageW and MapVirtualKeyW
 * - MakeLParam(lo, hi): packs two 16-bit values into one lParam
 * - KeyDownLParam / KeyUpLParam: lParam for WM_KEYDOWN / WM_KEYUP carrying the
 *   scan code looked up with MapVirtualKeyW
 * - WM_* message constants
 * - FindChildren(parent): fills the static `childResults` list with
 *   "hwnd|className" entries for every window EnumChildWindows reports
 *
 * The closing '@ of the here-string must stay at the start of its line —
 * do not indent the contents of this literal.
 */
const WINMSG_TYPE = `
Add-Type @'
using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Text;
public class WinMsg {
public delegate bool EnumChildProc(IntPtr hWnd, IntPtr lParam);
[DllImport("user32.dll")]
public static extern bool EnumChildWindows(IntPtr parent, EnumChildProc proc, IntPtr lParam);
[DllImport("user32.dll", CharSet=CharSet.Unicode)]
public static extern int GetClassName(IntPtr h, StringBuilder sb, int max);
// CRITICAL: CharSet.Unicode → resolves to SendMessageW
// SendMessageW sends Unicode WM_CHAR (full UTF-16 codepoints including CJK)
[DllImport("user32.dll", CharSet=CharSet.Unicode, EntryPoint="SendMessageW")]
public static extern IntPtr SendMessage(IntPtr hWnd, uint msg, IntPtr wParam, IntPtr lParam);
[DllImport("user32.dll", CharSet=CharSet.Unicode, EntryPoint="PostMessageW")]
public static extern bool PostMessage(IntPtr hWnd, uint msg, IntPtr wParam, IntPtr lParam);
[DllImport("user32.dll")]
public static extern uint MapVirtualKeyW(uint uCode, uint uMapType);
public static IntPtr MakeLParam(int lo, int hi) {
return (IntPtr)((hi << 16) | (lo & 0xFFFF));
}
// Build lParam for WM_KEYDOWN / WM_KEYUP with correct scan code
// lParam bits: 0-15 repeat count, 16-23 scan code, 24 extended, 30 prev state, 31 transition
public static IntPtr KeyDownLParam(uint vk) {
uint scanCode = MapVirtualKeyW(vk, 0); // MAPVK_VK_TO_VSC = 0
return (IntPtr)(1 | (scanCode << 16)); // repeat=1, scanCode in bits 16-23
}
public static IntPtr KeyUpLParam(uint vk) {
uint scanCode = MapVirtualKeyW(vk, 0);
return (IntPtr)(1 | (scanCode << 16) | (1 << 30) | (1u << 31)); // prev=1, transition=1
}
public const uint WM_CHAR = 0x0102;
public const uint WM_KEYDOWN = 0x0100;
public const uint WM_KEYUP = 0x0101;
public const uint WM_LBUTTONDOWN = 0x0201;
public const uint WM_LBUTTONUP = 0x0202;
public const uint WM_RBUTTONDOWN = 0x0204;
public const uint WM_RBUTTONUP = 0x0205;
public static List<string> childResults = new List<string>();
public static void FindChildren(IntPtr parent) {
childResults.Clear();
EnumChildWindows(parent, delegate(IntPtr hWnd, IntPtr lParam) {
StringBuilder sb = new StringBuilder(256);
GetClassName(hWnd, sb, sb.Capacity);
childResults.Add(hWnd.ToInt64() + "|" + sb.ToString());
return true;
}, IntPtr.Zero);
}
}
'@
`
/**
 * Window class names that findEditChild() treats as input targets, in
 * priority order: the list is scanned from the top and the first class that
 * has an exactly matching child window wins, regardless of the order in
 * which the children were enumerated.
 */
const EDIT_CLASSES = [
'Windows.UI.Input.InputSite.WindowClass', // WinUI 3 input bridge (Windows Terminal, etc.)
'RichEditD2DPT', // Win11 Notepad (WinUI 3)
'RichEdit20W', // WordPad
'Edit', // Classic edit controls
'Scintilla', // Scintilla-based editors (Notepad++, etc.)
'Chrome_RenderWidgetHostHWND', // Chrome/Electron
'TextBox', // WPF TextBox
'RichTextBox', // WPF RichTextBox
'Windows.UI.Core.CoreWindow', // UWP CoreWindow (input target for some UWP apps)
]
/**
 * Find the first edit-capable child window of a parent HWND.
 *
 * Strategy:
 * 1. EnumChildWindows — search for known edit control class names
 *    (EDIT_CLASSES, in priority order).
 * 2. UI Automation fallback — find the first Edit/Document element and use
 *    its native HWND.
 *
 * EnumChildWindows is recursive and enumerates all descendant windows,
 * but for UWP apps the edit control may be in a different process (hosted
 * inside ApplicationFrameHost). UI Automation crosses process boundaries.
 *
 * The outcome — including "nothing found" (null) — is cached per parent
 * HWND; use clearEditChildCache() to invalidate.
 *
 * @param parentHwnd - handle of the window to search; validated first.
 * @returns the child HWND as a decimal string, or null when no candidate exists.
 */
export function findEditChild(parentHwnd: string): string | null {
  parentHwnd = validateHwnd(parentHwnd)

  // Cache hit. A cached null is a valid answer ("searched, nothing found"),
  // so distinguish it from a missing entry (undefined) instead of asserting
  // the value is non-null.
  const cached = editChildCache.get(parentHwnd)
  if (cached !== undefined) {
    return cached
  }

  // Strategy 1: EnumChildWindows (fast, works for Win32 apps)
  const script = `${WINMSG_TYPE}
[WinMsg]::FindChildren([IntPtr]::new([long]${parentHwnd}))
[WinMsg]::childResults | ForEach-Object { $_ }
`
  const raw = runPs(script)
  if (raw) {
    // Each output line has the form "<hwnd>|<className>".
    const children = raw
      .split('\n')
      .filter(Boolean)
      .map(line => {
        const trimmed = line.trim()
        const pipe = trimmed.indexOf('|')
        if (pipe === -1) return null
        return {
          hwnd: trimmed.slice(0, pipe),
          className: trimmed.slice(pipe + 1),
        }
      })
      .filter(
        (item): item is { hwnd: string; className: string } => item !== null,
      )

    // Search in priority order
    for (const editClass of EDIT_CLASSES) {
      const match = children.find(c => c.className === editClass)
      if (match) {
        editChildCache.set(parentHwnd, match.hwnd)
        return match.hwnd
      }
    }
  }

  // Strategy 2: UI Automation (crosses process boundaries, finds UWP edit controls)
  const uiaScript = `
Add-Type -AssemblyName UIAutomationClient
Add-Type -AssemblyName UIAutomationTypes
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class UiaHelper {
[DllImport("user32.dll")]
public static extern bool IsWindow(IntPtr hWnd);
}
'@
try {
$el = [System.Windows.Automation.AutomationElement]::FromHandle([IntPtr]::new([long]${parentHwnd}))
if ($el -eq $null) { Write-Output 'NONE'; exit }
# Search for Edit or Document control types (covers text editors)
$editCond = [System.Windows.Automation.PropertyCondition]::new(
[System.Windows.Automation.AutomationElement]::ControlTypeProperty,
[System.Windows.Automation.ControlType]::Edit)
$docCond = [System.Windows.Automation.PropertyCondition]::new(
[System.Windows.Automation.AutomationElement]::ControlTypeProperty,
[System.Windows.Automation.ControlType]::Document)
$orCond = [System.Windows.Automation.OrCondition]::new($editCond, $docCond)
$found = $el.FindFirst([System.Windows.Automation.TreeScope]::Descendants, $orCond)
if ($found -eq $null) { Write-Output 'NONE'; exit }
$nativeHwnd = $found.Current.NativeWindowHandle
if ($nativeHwnd -ne 0) {
Write-Output $nativeHwnd
} else {
Write-Output 'NONE'
}
} catch {
Write-Output 'NONE'
}
`
  // Trim BEFORE comparing, and accept only a plain non-zero integer. The
  // value is cached and later interpolated into PowerShell scripts as an
  // HWND, so 'NONE' with stray whitespace or any other non-numeric output
  // must never be mistaken for a handle.
  const uiaHwnd = runPs(uiaScript)?.trim()
  if (uiaHwnd && /^-?\d+$/.test(uiaHwnd) && Number(uiaHwnd) !== 0) {
    editChildCache.set(parentHwnd, uiaHwnd)
    return uiaHwnd
  }

  editChildCache.set(parentHwnd, null)
  return null
}
/**
 * Deliver one Unicode character to a window with SendMessageW(WM_CHAR).
 *
 * Only the first code point of `char` is used. A BMP code point is sent as a
 * single WM_CHAR; a supplementary code point (U+10000 and above) is sent as
 * its UTF-16 high and low surrogate in two consecutive WM_CHAR messages.
 *
 * @param hwnd - target window handle; validated, but NOT resolved to an edit child.
 * @param char - string whose first code point is sent.
 * @returns false when `char` is empty; otherwise whether runPs produced a result.
 */
export function sendChar(hwnd: string, char: string): boolean {
  const target = validateHwnd(hwnd)
  const cp = char.codePointAt(0)
  if (cp === undefined) return false

  // UTF-16 code units to emit: one for BMP, a surrogate pair otherwise.
  const units: number[] = []
  if (cp > 0xffff) {
    const offset = cp - 0x10000
    units.push(0xd800 + (offset >> 10), 0xdc00 + (offset & 0x3ff))
  } else {
    units.push(cp)
  }

  const hwndExpr = `[IntPtr]::new([long]${target})`
  const wmCharLines = units.map(
    unit =>
      `[WinMsg]::SendMessage(${hwndExpr}, [WinMsg]::WM_CHAR, [IntPtr]${unit}, [IntPtr]0)`,
  )
  const script = [WINMSG_TYPE, ...wmCharLines, ''].join('\n')
  return runPs(script) !== null
}
/**
 * Produce the PowerShell statements that type `text` into `hwnd` via WM_CHAR.
 *
 * One SendMessage line is generated per UTF-16 code unit: BMP code points
 * yield a single line, supplementary code points yield two (high surrogate
 * first, then low surrogate).
 *
 * @param hwnd - window handle, interpolated as-is into each statement.
 * @param text - text to convert; iterated by code point.
 * @returns the statements in order; empty for empty text.
 */
function buildWmCharLines(hwnd: string, text: string): string[] {
  const target = `[IntPtr]::new([long]${hwnd})`
  const wmChar = (unit: number): string =>
    `[WinMsg]::SendMessage(${target}, [WinMsg]::WM_CHAR, [IntPtr]${unit}, [IntPtr]0)`

  return Array.from(text).flatMap(symbol => {
    // Array.from splits by code point, so every symbol is non-empty.
    const codePoint = symbol.codePointAt(0) ?? 0
    if (codePoint <= 0xffff) {
      return [wmChar(codePoint)]
    }
    const offset = codePoint - 0x10000
    return [wmChar(0xd800 + (offset >> 10)), wmChar(0xdc00 + (offset & 0x3ff))]
  })
}
/**
 * Paste text via clipboard into the target window.
 * Uses Clipboard.SetText() + a Ctrl+V key sequence posted with PostMessageW.
 * NO global APIs (SendInput/keybd_event/SendKeys) — only window-targeted messages.
 *
 * The previous clipboard TEXT is saved and restored afterwards. When no text
 * was saved the clipboard is cleared, so non-text clipboard content (images,
 * files) is not preserved.
 *
 * @param hwnd - window handle, interpolated as-is (callers pass a validated/resolved handle).
 * @param text - text to paste; empty text is a no-op.
 * @returns true when the script reports 'OK' (or there was nothing to paste).
 */
function pasteViaClipboard(hwnd: string, text: string): boolean {
  // Clipboard.SetText('') throws; without this guard the script would go on
  // to post Ctrl+V and paste whatever the user had on the clipboard.
  if (text === '') return true

  // Escape for a PowerShell single-quoted literal. PowerShell treats the
  // typographic quotes U+2018, U+2019, U+201A and U+201B as single-quote
  // delimiters too, so all of them must be doubled — not just ASCII '.
  const escaped = text.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  const hwndExpr = `[IntPtr]::new([long]${hwnd})`
  // PostMessage results are assigned to $null so the only line written to
  // stdout is the final status, which is what the comparison below checks.
  const script = `${WINMSG_TYPE}
Add-Type -AssemblyName System.Windows.Forms
# Save current clipboard
$saved = $null
try { $saved = [System.Windows.Forms.Clipboard]::GetText() } catch {}
# Set our text; abort rather than paste stale clipboard content if this fails
try { [System.Windows.Forms.Clipboard]::SetText('${escaped}') } catch { Write-Output 'FAIL'; exit }
# Ctrl+V via PostMessage to the target window (NOT global keybd_event)
# Must use PostMessage + correct lParam (scan code) for Windows Terminal / ConPTY
$null = [WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYDOWN, [IntPtr]0x11, [WinMsg]::KeyDownLParam(0x11)) # Ctrl down
$null = [WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYDOWN, [IntPtr]0x56, [WinMsg]::KeyDownLParam(0x56)) # V down
$null = [WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYUP, [IntPtr]0x56, [WinMsg]::KeyUpLParam(0x56)) # V up
$null = [WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYUP, [IntPtr]0x11, [WinMsg]::KeyUpLParam(0x11)) # Ctrl up
# Brief wait for paste to complete
Start-Sleep -Milliseconds 50
# Restore clipboard
if ($saved -ne $null -and $saved -ne '') {
try { [System.Windows.Forms.Clipboard]::SetText($saved) } catch {}
} else {
try { [System.Windows.Forms.Clipboard]::Clear() } catch {}
}
Write-Output 'OK'
`
  return runPs(script)?.trim() === 'OK'
}
/**
 * Type text into a window, one WM_CHAR message per UTF-16 code unit.
 *
 * The WM_CHAR path is always used (no clipboard), because it also works for
 * Windows Terminal / ConPTY where a clipboard-based Ctrl+V does not.
 * Messages are addressed to the handle returned by resolveInputHwnd();
 * no global input APIs are involved.
 *
 * @param hwnd - handle of the bound window.
 * @param text - text to send; may contain any Unicode characters.
 * @returns whether runPs produced a result for the generated script.
 */
export function sendText(hwnd: string, text: string): boolean {
  const wmCharLines = buildWmCharLines(resolveInputHwnd(hwnd), text)
  const script = [WINMSG_TYPE, wmCharLines.join('\n'), ''].join('\n')
  return runPs(script) !== null
}
/**
 * Post a single key-down or key-up event to a window.
 *
 * The event is delivered with PostMessageW (asynchronous) rather than
 * SendMessage, which Windows Terminal and ConPTY-based consoles need in order
 * to process key events. The lParam carries the scan code obtained through
 * MapVirtualKeyW (see KeyDownLParam / KeyUpLParam in the WinMsg helper).
 *
 * @param hwnd - target window handle; validated, but NOT resolved to an edit child.
 * @param vk - virtual-key code.
 * @param action - 'down' posts WM_KEYDOWN (0x0100), 'up' posts WM_KEYUP (0x0101).
 * @returns whether runPs produced a result for the generated script.
 */
export function sendKey(
  hwnd: string,
  vk: number,
  action: 'down' | 'up',
): boolean {
  const target = validateHwnd(hwnd)
  const [msg, lParamFn] =
    action === 'down' ? ['0x0100', 'KeyDownLParam'] : ['0x0101', 'KeyUpLParam']
  const postLine = `[WinMsg]::PostMessage([IntPtr]::new([long]${target}), ${msg}, [IntPtr]${vk}, [WinMsg]::${lParamFn}(${vk}))`
  const script = [WINMSG_TYPE, postLine, ''].join('\n')
  return runPs(script) !== null
}
/**
 * Send a key combination (e.g. ['ctrl', 'a']).
 * Holds modifiers via WM_KEYDOWN, presses the main key, then releases the
 * modifiers in reverse order. All events are posted with PostMessageW to the
 * handle returned by resolveInputHwnd() — no global input APIs.
 *
 * Key names are looked up in VK_MAP first (case-insensitive). A name that is
 * not in VK_MAP must be a single character: letters, digits and space map to
 * their virtual-key code directly, and common punctuation maps to the
 * corresponding VK_OEM_* code. The raw character code is deliberately NOT
 * used for punctuation, because those values collide with unrelated keys
 * (e.g. '.' is 0x2E = VK_DELETE, '[' is 0x5B = VK_LWIN).
 *
 * @param hwnd - handle of the bound window.
 * @param combo - key names; when several non-modifier keys are given the last one wins.
 * @returns false for an empty combo, an unknown key, or a combo without a
 *   non-modifier key; otherwise whether runPs produced a result.
 */
export function sendKeys(hwnd: string, combo: string[]): boolean {
  hwnd = resolveInputHwnd(hwnd)
  if (combo.length === 0) return false

  // VK_OEM_* codes for unshifted punctuation. VK_OEM_PLUS/COMMA/MINUS/PERIOD
  // are layout-independent; the remaining entries follow the US layout.
  const oemVk = new Map<string, number>([
    [';', 0xba],
    ['=', 0xbb],
    [',', 0xbc],
    ['-', 0xbd],
    ['.', 0xbe],
    ['/', 0xbf],
    ['`', 0xc0],
    ['[', 0xdb],
    ['\\', 0xdc],
    [']', 0xdd],
    ["'", 0xde],
  ])
  const charToVk = (ch: string): number | undefined => {
    // For A-Z, 0-9 and space the VK code equals the uppercase ASCII code.
    if (/^[a-z0-9 ]$/.test(ch)) return ch.toUpperCase().charCodeAt(0)
    return oemVk.get(ch)
  }

  const modifiers: number[] = []
  let mainKey: number | undefined
  for (const key of combo) {
    const lower = key.toLowerCase()
    const mapped = VK_MAP[lower]
    if (mapped !== undefined) {
      if (MODIFIER_KEYS.has(lower)) {
        modifiers.push(mapped)
      } else {
        mainKey = mapped
      }
      continue
    }
    const charVk = lower.length === 1 ? charToVk(lower) : undefined
    if (charVk === undefined) return false
    mainKey = charVk
  }
  if (mainKey === undefined) return false

  // Build script: modifiers down, key down, key up, modifiers up (reverse)
  // Uses PostMessage (async) + correct lParam (scan code) — required for
  // Windows Terminal / ConPTY to correctly translate key events.
  const hwndExpr = `[IntPtr]::new([long]${hwnd})`
  const lines: string[] = []
  for (const mod of modifiers) {
    lines.push(
      `[WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYDOWN, [IntPtr]${mod}, [WinMsg]::KeyDownLParam(${mod}))`,
    )
  }
  lines.push(
    `[WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYDOWN, [IntPtr]${mainKey}, [WinMsg]::KeyDownLParam(${mainKey}))`,
  )
  lines.push(
    `[WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYUP, [IntPtr]${mainKey}, [WinMsg]::KeyUpLParam(${mainKey}))`,
  )
  for (const mod of [...modifiers].reverse()) {
    lines.push(
      `[WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYUP, [IntPtr]${mod}, [WinMsg]::KeyUpLParam(${mod}))`,
    )
  }
  const script = `${WINMSG_TYPE}
${lines.join('\n')}
`
  return runPs(script) !== null
}
// ── Console Input Buffer (WriteConsoleInput) ─────────────────────────
// For terminal/console windows, SendMessageW doesn't reliably inject
// key events into the Console Input Buffer that raw-mode stdin reads.
// The helper below uses AttachConsole + WriteConsoleInput to inject directly.
/**
 * PowerShell prelude that compiles the C# helper class `ConsoleInput`.
 *
 * - SendKeyToConsole(hwnd, vk, ch): writes one key-down + key-up record for
 *   the given virtual key / character.
 * - SendTextToConsole(hwnd, text): writes a key-down + key-up record per
 *   UTF-16 unit of `text`, with virtual key and scan code left at 0.
 *
 * Both resolve the window's owning process with GetWindowThreadProcessId,
 * detach from the current console, attach to that process's console, write
 * to the handle returned by GetStdHandle(STD_INPUT_HANDLE) and finally
 * detach again.
 *
 * MapVirtualKeyW and GetWindowThreadProcessId are exported by user32.dll
 * (not kernel32.dll); importing MapVirtualKeyW from kernel32 makes
 * SendKeyToConsole fail with EntryPointNotFoundException.
 *
 * The closing '@ of the here-string must stay at the start of its line —
 * do not indent the contents of this literal.
 */
const CONSOLE_INPUT_TYPE = `
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class ConsoleInput {
[DllImport("kernel32.dll", SetLastError=true)]
public static extern bool AttachConsole(uint dwProcessId);
[DllImport("kernel32.dll", SetLastError=true)]
public static extern bool FreeConsole();
[DllImport("kernel32.dll", SetLastError=true)]
public static extern IntPtr GetStdHandle(int nStdHandle);
[DllImport("kernel32.dll", CharSet=CharSet.Unicode, SetLastError=true)]
public static extern bool WriteConsoleInput(
IntPtr hConsoleInput,
INPUT_RECORD[] lpBuffer,
uint nLength,
out uint lpNumberOfEventsWritten);
[DllImport("user32.dll")]
public static extern uint MapVirtualKeyW(uint uCode, uint uMapType);
[DllImport("user32.dll")]
public static extern uint GetWindowThreadProcessId(IntPtr hWnd, out uint lpdwProcessId);
public const int STD_INPUT_HANDLE = -10;
[StructLayout(LayoutKind.Explicit)]
public struct INPUT_RECORD {
[FieldOffset(0)] public ushort EventType;
[FieldOffset(4)] public KEY_EVENT_RECORD KeyEvent;
}
[StructLayout(LayoutKind.Explicit, CharSet=CharSet.Unicode)]
public struct KEY_EVENT_RECORD {
[FieldOffset(0)] public bool bKeyDown;
[FieldOffset(4)] public ushort wRepeatCount;
[FieldOffset(6)] public ushort wVirtualKeyCode;
[FieldOffset(8)] public ushort wVirtualScanCode;
[FieldOffset(10)] public char UnicodeChar;
[FieldOffset(12)] public uint dwControlKeyState;
}
public static bool SendKeyToConsole(IntPtr hwnd, ushort vk, char ch) {
uint pid;
GetWindowThreadProcessId(hwnd, out pid);
if (pid == 0) return false;
FreeConsole();
if (!AttachConsole(pid)) return false;
try {
IntPtr hInput = GetStdHandle(STD_INPUT_HANDLE);
if (hInput == IntPtr.Zero || hInput == (IntPtr)(-1)) return false;
ushort scanCode = (ushort)MapVirtualKeyW(vk, 0);
INPUT_RECORD[] records = new INPUT_RECORD[2];
// Key down
records[0].EventType = 1; // KEY_EVENT
records[0].KeyEvent.bKeyDown = true;
records[0].KeyEvent.wRepeatCount = 1;
records[0].KeyEvent.wVirtualKeyCode = vk;
records[0].KeyEvent.wVirtualScanCode = scanCode;
records[0].KeyEvent.UnicodeChar = ch;
records[0].KeyEvent.dwControlKeyState = 0;
// Key up
records[1].EventType = 1;
records[1].KeyEvent.bKeyDown = false;
records[1].KeyEvent.wRepeatCount = 1;
records[1].KeyEvent.wVirtualKeyCode = vk;
records[1].KeyEvent.wVirtualScanCode = scanCode;
records[1].KeyEvent.UnicodeChar = ch;
records[1].KeyEvent.dwControlKeyState = 0;
uint written;
return WriteConsoleInput(hInput, records, 2, out written);
} finally {
FreeConsole();
}
}
public static bool SendTextToConsole(IntPtr hwnd, string text) {
uint pid;
GetWindowThreadProcessId(hwnd, out pid);
if (pid == 0) return false;
FreeConsole();
if (!AttachConsole(pid)) return false;
try {
IntPtr hInput = GetStdHandle(STD_INPUT_HANDLE);
if (hInput == IntPtr.Zero || hInput == (IntPtr)(-1)) return false;
INPUT_RECORD[] records = new INPUT_RECORD[text.Length * 2];
for (int i = 0; i < text.Length; i++) {
char c = text[i];
ushort vk = 0;
ushort sc = 0;
// Key down
records[i * 2].EventType = 1;
records[i * 2].KeyEvent.bKeyDown = true;
records[i * 2].KeyEvent.wRepeatCount = 1;
records[i * 2].KeyEvent.wVirtualKeyCode = vk;
records[i * 2].KeyEvent.wVirtualScanCode = sc;
records[i * 2].KeyEvent.UnicodeChar = c;
records[i * 2].KeyEvent.dwControlKeyState = 0;
// Key up
records[i * 2 + 1].EventType = 1;
records[i * 2 + 1].KeyEvent.bKeyDown = false;
records[i * 2 + 1].KeyEvent.wRepeatCount = 1;
records[i * 2 + 1].KeyEvent.wVirtualKeyCode = vk;
records[i * 2 + 1].KeyEvent.wVirtualScanCode = sc;
records[i * 2 + 1].KeyEvent.UnicodeChar = c;
records[i * 2 + 1].KeyEvent.dwControlKeyState = 0;
}
uint written;
return WriteConsoleInput(hInput, records, (uint)records.Length, out written);
} finally {
FreeConsole();
}
}
}
'@
`
/**
 * Send a key to a console window via WriteConsoleInput (Console Input Buffer).
 * This is required for terminal apps like Claude Code REPL that read stdin in raw mode.
 *
 * @param hwnd - console window handle; validated before use.
 * @param vk - virtual-key code placed in the key event records.
 * @param ch - character placed in the records' UnicodeChar field; only the
 *   first UTF-16 unit is used, and an empty string is treated as '\0'.
 * @returns true only when SendKeyToConsole itself reports success.
 */
export function consoleKey(
  hwnd: string,
  vk: number,
  ch: string = '\0',
): boolean {
  hwnd = validateHwnd(hwnd)
  // ''.charCodeAt(0) is NaN, which would render as "[char]NaN" and break the
  // script — fall back to NUL for an empty string.
  const charCode = ch.length > 0 ? ch.charCodeAt(0) : 0
  const script = `${CONSOLE_INPUT_TYPE}
[ConsoleInput]::SendKeyToConsole([IntPtr]::new([long]${hwnd}), ${vk}, [char]${charCode})
`
  // The script's only output is the boolean returned by SendKeyToConsole
  // ("True"/"False"); a failed AttachConsole/WriteConsoleInput must not be
  // reported as success.
  return runPs(script)?.trim() === 'True'
}
/**
 * Send text to a console window via WriteConsoleInput.
 * Directly injects into the Console Input Buffer — works for raw-mode stdin.
 *
 * Only the characters of `text` are written; no Enter key is appended. To
 * submit a line, include '\r' in the text or follow up with consoleKey().
 *
 * @param hwnd - console window handle; validated before use.
 * @param text - text to inject, one key-down/key-up pair per UTF-16 unit.
 * @returns true only when SendTextToConsole itself reports success.
 */
export function consoleText(hwnd: string, text: string): boolean {
  hwnd = validateHwnd(hwnd)
  // Escape for a PowerShell single-quoted literal. PowerShell treats the
  // typographic quotes U+2018, U+2019, U+201A and U+201B as single-quote
  // delimiters too, so all of them must be doubled — not just ASCII '.
  const escaped = text.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  const script = `${CONSOLE_INPUT_TYPE}
[ConsoleInput]::SendTextToConsole([IntPtr]::new([long]${hwnd}), '${escaped}')
`
  // The script's only output is the boolean returned by SendTextToConsole.
  return runPs(script)?.trim() === 'True'
}
/**
 * Send a mouse click (button down + button up) at coordinates (x, y).
 * Via SendMessageW — window-targeted, no cursor movement.
 *
 * The messages go to the handle returned by resolveInputHwnd(), and (x, y)
 * are packed into lParam unchanged.
 * NOTE(review): when resolveInputHwnd() returns a child window, client
 * coordinates are relative to that child — confirm callers account for this.
 *
 * @param hwnd - handle of the bound window.
 * @param x - client-area x coordinate.
 * @param y - client-area y coordinate.
 * @param button - which button to click.
 * @returns whether runPs produced a result for the generated script.
 */
export function sendClick(
  hwnd: string,
  x: number,
  y: number,
  button: 'left' | 'right',
): boolean {
  hwnd = resolveInputHwnd(hwnd)
  const isLeft = button === 'left'
  const downMsg = isLeft ? '0x0201' : '0x0204'
  const upMsg = isLeft ? '0x0202' : '0x0205'
  // wParam of a button-down message carries the held-button flags
  // (MK_LBUTTON = 0x0001, MK_RBUTTON = 0x0002), exactly as real input and
  // sendMouseDown() deliver it; on button-up the button is no longer held.
  const downFlags = isLeft ? 1 : 2
  const hwndExpr = `[IntPtr]::new([long]${hwnd})`
  const script = `${WINMSG_TYPE}
$lp = [WinMsg]::MakeLParam(${x}, ${y})
[WinMsg]::SendMessage(${hwndExpr}, ${downMsg}, [IntPtr]${downFlags}, $lp)
[WinMsg]::SendMessage(${hwndExpr}, ${upMsg}, [IntPtr]0, $lp)
`
  return runPs(script) !== null
}
/**
 * Press the left mouse button at coordinates (x, y) without releasing it.
 *
 * Sends WM_LBUTTONDOWN through SendMessageW to the handle returned by
 * resolveInputHwnd(), with wParam 1 and (x, y) packed into lParam. The real
 * cursor is not moved.
 *
 * @param hwnd - handle of the bound window.
 * @param x - client-area x coordinate.
 * @param y - client-area y coordinate.
 * @returns whether runPs produced a result for the generated script.
 */
export function sendMouseDown(hwnd: string, x: number, y: number): boolean {
  const target = resolveInputHwnd(hwnd)
  const script = [
    WINMSG_TYPE,
    `$lp = [WinMsg]::MakeLParam(${x}, ${y})`,
    `[WinMsg]::SendMessage([IntPtr]::new([long]${target}), [WinMsg]::WM_LBUTTONDOWN, [IntPtr]1, $lp)`,
    '',
  ].join('\n')
  return runPs(script) !== null
}
/**
 * Release the left mouse button at coordinates (x, y).
 *
 * Sends WM_LBUTTONUP through SendMessageW to the handle returned by
 * resolveInputHwnd(), with wParam 0 and (x, y) packed into lParam. The real
 * cursor is not moved.
 *
 * @param hwnd - handle of the bound window.
 * @param x - client-area x coordinate.
 * @param y - client-area y coordinate.
 * @returns whether runPs produced a result for the generated script.
 */
export function sendMouseUp(hwnd: string, x: number, y: number): boolean {
  const target = resolveInputHwnd(hwnd)
  const script = [
    WINMSG_TYPE,
    `$lp = [WinMsg]::MakeLParam(${x}, ${y})`,
    `[WinMsg]::SendMessage([IntPtr]::new([long]${target}), [WinMsg]::WM_LBUTTONUP, [IntPtr]0, $lp)`,
    '',
  ].join('\n')
  return runPs(script) !== null
}
/**
 * Report a mouse move to (x, y), as used between sendMouseDown() and
 * sendMouseUp() during a drag.
 *
 * Sends message 0x0200 (WM_MOUSEMOVE) through SendMessageW to the handle
 * returned by resolveInputHwnd(), with wParam 1 and (x, y) packed into lParam.
 *
 * @param hwnd - handle of the bound window.
 * @param x - client-area x coordinate.
 * @param y - client-area y coordinate.
 * @returns whether runPs produced a result for the generated script.
 */
export function sendMouseMove(hwnd: string, x: number, y: number): boolean {
  const target = resolveInputHwnd(hwnd)
  const script = [
    WINMSG_TYPE,
    `$lp = [WinMsg]::MakeLParam(${x}, ${y})`,
    `[WinMsg]::SendMessage([IntPtr]::new([long]${target}), 0x0200, [IntPtr]1, $lp)`,
    '',
  ].join('\n')
  return runPs(script) !== null
}
/**
 * Send mouse wheel scroll at client-area coordinates (x, y).
 * Via SendMessageW(WM_MOUSEWHEEL / WM_MOUSEHWHEEL).
 *
 * WM_MOUSEWHEEL: vertical scroll (positive delta = scroll up)
 * WM_MOUSEHWHEEL: horizontal scroll (positive delta = scroll right)
 *
 * `delta` is a number of wheel "clicks"; it is rounded and multiplied by
 * WHEEL_DELTA (120). Because the wheel delta travels in the signed 16-bit
 * high word of wParam, the click count is clamped to ±273 (273 * 120 = 32760).
 * lParam = screen coordinates (not client): (x, y) are converted with
 * ClientToScreen against the handle returned by resolveInputHwnd().
 *
 * Works on Excel, browsers, modern UI — unlike WM_VSCROLL/WM_HSCROLL
 * which only work on traditional scrollbar controls.
 *
 * @param hwnd - handle of the bound window.
 * @param x - client-area x coordinate.
 * @param y - client-area y coordinate.
 * @param delta - wheel clicks; positive = up/right, negative = down/left.
 * @param horizontal - send WM_MOUSEHWHEEL instead of WM_MOUSEWHEEL.
 * @returns false for a non-finite delta; otherwise whether runPs produced a result.
 */
export function sendMouseWheel(
  hwnd: string,
  x: number,
  y: number,
  delta: number,
  horizontal: boolean = false,
): boolean {
  hwnd = resolveInputHwnd(hwnd)
  // NaN/Infinity would be rendered literally into the script and break it.
  if (!Number.isFinite(delta)) return false

  // WM_MOUSEWHEEL = 0x020A, WM_MOUSEHWHEEL = 0x020E
  const msg = horizontal ? '0x020E' : '0x020A'
  // Keep the scaled delta inside a signed short so its sign survives packing
  // into the high word of wParam.
  const maxClicks = 273
  const clicks = Math.max(-maxClicks, Math.min(maxClicks, Math.round(delta)))
  const wheelDelta = clicks * 120

  // WheelHelper is self-contained (it declares its own SendMessageW binding),
  // so the WinMsg prelude is not compiled for this call.
  const script = `# WM_MOUSEWHEEL/WM_MOUSEHWHEEL require screen coords in lParam
# and wheel delta in high word of wParam
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class WheelHelper {
[DllImport("user32.dll")] public static extern bool ClientToScreen(IntPtr hWnd, ref POINT p);
[StructLayout(LayoutKind.Sequential)] public struct POINT { public int X, Y; }
[DllImport("user32.dll", CharSet=CharSet.Unicode, EntryPoint="SendMessageW")]
public static extern IntPtr SendMsg(IntPtr hWnd, uint msg, IntPtr wParam, IntPtr lParam);
public static void Scroll(IntPtr hWnd, int clientX, int clientY, int delta, uint msg) {
POINT pt; pt.X = clientX; pt.Y = clientY;
ClientToScreen(hWnd, ref pt);
IntPtr wParam = (IntPtr)(delta << 16);
IntPtr lParam = (IntPtr)((pt.Y << 16) | (pt.X & 0xFFFF));
SendMsg(hWnd, msg, wParam, lParam);
}
}
'@
[WheelHelper]::Scroll([IntPtr]::new([long]${hwnd}), ${x}, ${y}, ${wheelDelta}, ${msg})
`
  return runPs(script) !== null
}