mirror of
https://github.com/claude-code-best/claude-code.git
synced 2026-06-15 12:55:51 +00:00
feat: Computer Use — Windows 跨平台支持 + GUI 无障碍增强 + Python Bridge
三平台 Computer Use (macOS + Windows + Linux),Windows 专项增强。
- MCP server: toolCalls/tools/executor/mcpServer 等 12 文件完整实现
- 平台抽象层: platforms/{win32,darwin,linux}.ts
- 跨平台 executor: executorCrossPlatform.ts
- CHICAGO_MCP + VOICE_MODE feature flags 启用
- windowMessage.ts: SendMessageW (WM_CHAR Unicode + 剪贴板粘贴)
- windowBorder.ts: 4 叠加窗口边框 (30fps 跟踪)
- uiAutomation.ts: UI Automation 元素树/点击/写值
- accessibilitySnapshot.ts: 无障碍快照 → 模型感知 GUI
- bridge.py + bridgeClient.ts: Python 长驻进程 (替代 per-call PS)
- window_management: min/max/restore/close/focus (Win32 API)
- click_element / type_into_element: 按名称操作 (无需坐标)
- 截图自动附带 Accessibility Snapshot
- 17 种方法, stdin/stdout JSON 通信
- 窗口枚举 1.5ms vs PS 500ms, 截图 360ms vs PS 800ms
- 依赖: mss + Pillow + pywinauto
This commit is contained in:
14
.gitignore
vendored
14
.gitignore
vendored
@@ -13,4 +13,16 @@ src/utils/vendor/
|
|||||||
# AI tool runtime directories
|
# AI tool runtime directories
|
||||||
.agents/
|
.agents/
|
||||||
.codex/
|
.codex/
|
||||||
.omx/
|
.omx/
|
||||||
|
|
||||||
|
# Binary / screenshot files (root only)
|
||||||
|
/*.png
|
||||||
|
*.bmp
|
||||||
|
|
||||||
|
# Agent / tool state dirs
|
||||||
|
.swarm/
|
||||||
|
.agents/__pycache__/
|
||||||
|
|
||||||
|
# Python bytecode
|
||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
|
|||||||
18
DEV-LOG.md
18
DEV-LOG.md
@@ -39,6 +39,7 @@
|
|||||||
|
|
||||||
## Computer Use Windows 增强:窗口绑定截图 + UI Automation + OCR (2026-04-03)
|
## Computer Use Windows 增强:窗口绑定截图 + UI Automation + OCR (2026-04-03)
|
||||||
|
|
||||||
|
|
||||||
在三平台基础实现之上,利用 Windows 原生 API 增强 Computer Use 的 Windows 专属能力。
|
在三平台基础实现之上,利用 Windows 原生 API 增强 Computer Use 的 Windows 专属能力。
|
||||||
|
|
||||||
**新增文件:**
|
**新增文件:**
|
||||||
@@ -118,23 +119,6 @@ packages/@ant/computer-use-{input,swift}/src/
|
|||||||
| `vendor/audio-capture/{platform}/audio-capture.node` | 6 个平台的原生音频二进制(cpal,来自参考项目) |
|
| `vendor/audio-capture/{platform}/audio-capture.node` | 6 个平台的原生音频二进制(cpal,来自参考项目) |
|
||||||
| `vendor/audio-capture-src/index.ts` | 原生模块加载器(按 `${arch}-${platform}` 动态 require `.node`) |
|
| `vendor/audio-capture-src/index.ts` | 原生模块加载器(按 `${arch}-${platform}` 动态 require `.node`) |
|
||||||
|
|
||||||
**修改文件:**
|
|
||||||
|
|
||||||
| 文件 | 变更 |
|
|
||||||
|------|------|
|
|
||||||
| `packages/audio-capture-napi/src/index.ts` | SoX 子进程 stub → 原生 `.node` 加载器(含 `process.cwd()` workspace 路径 fallback) |
|
|
||||||
| `scripts/dev.ts` | `DEFAULT_FEATURES` 加 `"VOICE_MODE"` |
|
|
||||||
| `build.ts` | `DEFAULT_BUILD_FEATURES` 加 `"VOICE_MODE"` |
|
|
||||||
| `docs/features/voice-mode.md` | 追加恢复计划章节(第八节) |
|
|
||||||
|
|
||||||
**验证结果:**
|
|
||||||
|
|
||||||
- `isNativeAudioAvailable()` → `true`(Windows x64 原生 `.node` 加载成功)
|
|
||||||
- `feature('VOICE_MODE')` → `ENABLED`
|
|
||||||
- `bun run build` → voice 代码编入产物
|
|
||||||
|
|
||||||
**运行时前置条件:** claude.ai OAuth 登录 + 麦克风权限
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Enable Claude in Chrome MCP (2026-04-03)
|
## Enable Claude in Chrome MCP (2026-04-03)
|
||||||
|
|||||||
88
build.ts
88
build.ts
@@ -2,11 +2,11 @@ import { readdir, readFile, writeFile, cp } from "fs/promises";
|
|||||||
import { join } from "path";
|
import { join } from "path";
|
||||||
import { getMacroDefines } from "./scripts/defines.ts";
|
import { getMacroDefines } from "./scripts/defines.ts";
|
||||||
|
|
||||||
const outdir = "dist";
|
const outdir = 'dist'
|
||||||
|
|
||||||
// Step 1: Clean output directory
|
// Step 1: Clean output directory
|
||||||
const { rmSync } = await import("fs");
|
const { rmSync } = await import('fs')
|
||||||
rmSync(outdir, { recursive: true, force: true });
|
rmSync(outdir, { recursive: true, force: true })
|
||||||
|
|
||||||
// Default features that match the official CLI build.
|
// Default features that match the official CLI build.
|
||||||
// Additional features can be enabled via FEATURE_<NAME>=1 env vars.
|
// Additional features can be enabled via FEATURE_<NAME>=1 env vars.
|
||||||
@@ -14,50 +14,50 @@ const DEFAULT_BUILD_FEATURES = ["AGENT_TRIGGERS_REMOTE", "CHICAGO_MCP", "VOICE_M
|
|||||||
|
|
||||||
// Collect FEATURE_* env vars → Bun.build features
|
// Collect FEATURE_* env vars → Bun.build features
|
||||||
const envFeatures = Object.keys(process.env)
|
const envFeatures = Object.keys(process.env)
|
||||||
.filter(k => k.startsWith("FEATURE_"))
|
.filter(k => k.startsWith('FEATURE_'))
|
||||||
.map(k => k.replace("FEATURE_", ""));
|
.map(k => k.replace('FEATURE_', ''))
|
||||||
const features = [...new Set([...DEFAULT_BUILD_FEATURES, ...envFeatures])];
|
const features = [...new Set([...DEFAULT_BUILD_FEATURES, ...envFeatures])]
|
||||||
|
|
||||||
// Step 2: Bundle with splitting
|
// Step 2: Bundle with splitting
|
||||||
const result = await Bun.build({
|
const result = await Bun.build({
|
||||||
entrypoints: ["src/entrypoints/cli.tsx"],
|
entrypoints: ['src/entrypoints/cli.tsx'],
|
||||||
outdir,
|
outdir,
|
||||||
target: "bun",
|
target: 'bun',
|
||||||
splitting: true,
|
splitting: true,
|
||||||
define: getMacroDefines(),
|
define: getMacroDefines(),
|
||||||
features,
|
features,
|
||||||
});
|
})
|
||||||
|
|
||||||
if (!result.success) {
|
if (!result.success) {
|
||||||
console.error("Build failed:");
|
console.error('Build failed:')
|
||||||
for (const log of result.logs) {
|
for (const log of result.logs) {
|
||||||
console.error(log);
|
console.error(log)
|
||||||
}
|
}
|
||||||
process.exit(1);
|
process.exit(1)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Step 3: Post-process — replace Bun-only `import.meta.require` with Node.js compatible version
|
// Step 3: Post-process — replace Bun-only `import.meta.require` with Node.js compatible version
|
||||||
const files = await readdir(outdir);
|
const files = await readdir(outdir)
|
||||||
const IMPORT_META_REQUIRE = "var __require = import.meta.require;";
|
const IMPORT_META_REQUIRE = 'var __require = import.meta.require;'
|
||||||
const COMPAT_REQUIRE = `var __require = typeof import.meta.require === "function" ? import.meta.require : (await import("module")).createRequire(import.meta.url);`;
|
const COMPAT_REQUIRE = `var __require = typeof import.meta.require === "function" ? import.meta.require : (await import("module")).createRequire(import.meta.url);`
|
||||||
|
|
||||||
let patched = 0;
|
let patched = 0
|
||||||
for (const file of files) {
|
for (const file of files) {
|
||||||
if (!file.endsWith(".js")) continue;
|
if (!file.endsWith('.js')) continue
|
||||||
const filePath = join(outdir, file);
|
const filePath = join(outdir, file)
|
||||||
const content = await readFile(filePath, "utf-8");
|
const content = await readFile(filePath, 'utf-8')
|
||||||
if (content.includes(IMPORT_META_REQUIRE)) {
|
if (content.includes(IMPORT_META_REQUIRE)) {
|
||||||
await writeFile(
|
await writeFile(
|
||||||
filePath,
|
filePath,
|
||||||
content.replace(IMPORT_META_REQUIRE, COMPAT_REQUIRE),
|
content.replace(IMPORT_META_REQUIRE, COMPAT_REQUIRE),
|
||||||
);
|
)
|
||||||
patched++;
|
patched++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log(
|
console.log(
|
||||||
`Bundled ${result.outputs.length} files to ${outdir}/ (patched ${patched} for Node.js compat)`,
|
`Bundled ${result.outputs.length} files to ${outdir}/ (patched ${patched} for Node.js compat)`,
|
||||||
);
|
)
|
||||||
|
|
||||||
// Step 4: Copy native .node addon files (audio-capture)
|
// Step 4: Copy native .node addon files (audio-capture)
|
||||||
const vendorDir = join(outdir, "vendor", "audio-capture");
|
const vendorDir = join(outdir, "vendor", "audio-capture");
|
||||||
@@ -66,16 +66,16 @@ console.log(`Copied vendor/audio-capture/ → ${vendorDir}/`);
|
|||||||
|
|
||||||
// Step 5: Bundle download-ripgrep script as standalone JS for postinstall
|
// Step 5: Bundle download-ripgrep script as standalone JS for postinstall
|
||||||
const rgScript = await Bun.build({
|
const rgScript = await Bun.build({
|
||||||
entrypoints: ["scripts/download-ripgrep.ts"],
|
entrypoints: ['scripts/download-ripgrep.ts'],
|
||||||
outdir,
|
outdir,
|
||||||
target: "node",
|
target: 'node',
|
||||||
});
|
})
|
||||||
if (!rgScript.success) {
|
if (!rgScript.success) {
|
||||||
console.error("Failed to bundle download-ripgrep script:");
|
console.error('Failed to bundle download-ripgrep script:')
|
||||||
for (const log of rgScript.logs) {
|
for (const log of rgScript.logs) {
|
||||||
console.error(log);
|
console.error(log)
|
||||||
}
|
}
|
||||||
// Non-fatal — postinstall fallback to bun run scripts/download-ripgrep.ts
|
// Non-fatal — postinstall fallback to bun run scripts/download-ripgrep.ts
|
||||||
} else {
|
} else {
|
||||||
console.log(`Bundled download-ripgrep script to ${outdir}/`);
|
console.log(`Bundled download-ripgrep script to ${outdir}/`)
|
||||||
}
|
}
|
||||||
|
|||||||
325
docs/features/computer-use-architecture-v2.md
Normal file
325
docs/features/computer-use-architecture-v2.md
Normal file
@@ -0,0 +1,325 @@
|
|||||||
|
# Computer Use 架构修正方案 v2
|
||||||
|
|
||||||
|
更新时间:2026-04-04
|
||||||
|
|
||||||
|
## 1. 当前架构的问题
|
||||||
|
|
||||||
|
### 问题 A:平台代码混在错误的包里
|
||||||
|
|
||||||
|
`@ant/computer-use-swift` 是 macOS Swift 原生模块的包装器,但我们把 Windows(`backends/win32.ts`)和 Linux(`backends/linux.ts`)的截图/应用管理代码塞进了这个包。"swift" 在名字里就意味着 macOS,后期维护者无法区分。
|
||||||
|
|
||||||
|
`@ant/computer-use-input` 同样——原本是 macOS enigo Rust 模块,我们也往里面塞了 win32/linux 后端。
|
||||||
|
|
||||||
|
### 问题 B:输入方式不对
|
||||||
|
|
||||||
|
当前 Windows 后端(`packages/@ant/computer-use-input/src/backends/win32.ts`)使用 `SetCursorPos` + `SendInput` + `keybd_event`——这是**全局输入**:
|
||||||
|
|
||||||
|
- 鼠标真的会移动到屏幕上
|
||||||
|
- 键盘真的打到当前前台窗口
|
||||||
|
- **会影响用户当前的操作**
|
||||||
|
|
||||||
|
绑定窗口句柄后,应该用 `SendMessage`/`PostMessage` 向目标 HWND 发送消息来完成输入,并用 `PrintWindow` 完成截图:
|
||||||
|
|
||||||
|
- `WM_CHAR` — 发送字符,不移动光标
|
||||||
|
- `WM_KEYDOWN`/`WM_KEYUP` — 发送按键
|
||||||
|
- `WM_LBUTTONDOWN`/`WM_LBUTTONUP` — 发送鼠标点击(窗口客户区相对坐标)
|
||||||
|
- `PrintWindow` — 截取窗口内容,不需要窗口在前台
|
||||||
|
- **不抢焦点、不影响用户当前操作**
|
||||||
|
|
||||||
|
已验证:向记事本 `SendMessage(WM_CHAR)` 成功写入文字,记事本在后台,终端保持前台。
|
||||||
|
|
||||||
|
### 问题 C:截图是公共能力,不属于 swift
|
||||||
|
|
||||||
|
截图(screenshot)、显示器枚举(display)、应用管理(apps)是所有平台都需要的公共能力,不应该放在 `@ant/computer-use-swift`(macOS 专属包名)里。
|
||||||
|
|
||||||
|
## 2. 修正后的架构
|
||||||
|
|
||||||
|
### 2.1 分层原则
|
||||||
|
|
||||||
|
```
|
||||||
|
packages/@ant/ ← macOS 原生模块包装器 + 跨平台 MCP server(不放 Windows/Linux 平台实现代码)
|
||||||
|
├── computer-use-input/ ← macOS: enigo .node 键鼠(仅 darwin)
|
||||||
|
├── computer-use-swift/ ← macOS: Swift .node 截图/应用(仅 darwin)
|
||||||
|
└── computer-use-mcp/ ← 跨平台: MCP server + 工具定义(不改)
|
||||||
|
|
||||||
|
src/utils/computerUse/
|
||||||
|
├── platforms/ ← 新增: 跨平台抽象层
|
||||||
|
│ ├── types.ts ← 公共接口: InputPlatform, ScreenshotPlatform, AppsPlatform, DisplayPlatform
|
||||||
|
│ ├── index.ts ← 平台分发器: 按 process.platform 加载后端
|
||||||
|
│ ├── darwin.ts ← macOS: 委托给 @ant/computer-use-{input,swift}
|
||||||
|
│ ├── win32.ts ← Windows: SendMessage 输入 + PrintWindow 截图 + EnumWindows + UIA + OCR
|
||||||
|
│ └── linux.ts ← Linux: xdotool + scrot + xrandr + wmctrl
|
||||||
|
│
|
||||||
|
├── win32/ ← Windows 专属增强能力(不在公共接口中)
|
||||||
|
│ ├── windowCapture.ts ← PrintWindow 窗口绑定截图
|
||||||
|
│ ├── windowEnum.ts ← EnumWindows 窗口枚举
|
||||||
|
│ ├── windowMessage.ts ← SendMessage/PostMessage 无焦点输入(新增)
|
||||||
|
│ ├── uiAutomation.ts ← IUIAutomation UI 元素操作
|
||||||
|
│ └── ocr.ts ← Windows.Media.Ocr 文字识别
|
||||||
|
│
|
||||||
|
├── executor.ts ← 改: 通过 platforms/ 获取平台实现,不直接调 @ant 包
|
||||||
|
├── swiftLoader.ts ← 改: 仅 darwin 使用
|
||||||
|
├── inputLoader.ts ← 改: 仅 darwin 使用
|
||||||
|
└── ...其他文件不动
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2.2 公共接口(`platforms/types.ts`)
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
/** 窗口标识 — 跨平台 */
|
||||||
|
export interface WindowHandle {
|
||||||
|
id: string // macOS: bundleId, Windows: HWND string, Linux: window ID
|
||||||
|
pid: number
|
||||||
|
title: string
|
||||||
|
exePath?: string // Windows/Linux: 进程路径
|
||||||
|
}
|
||||||
|
|
||||||
|
/** 输入平台接口 — 两种模式 */
|
||||||
|
export interface InputPlatform {
|
||||||
|
// 模式 A: 全局输入(macOS/Linux 默认,向前台窗口发送)
|
||||||
|
moveMouse(x: number, y: number): Promise<void>
|
||||||
|
click(x: number, y: number, button: 'left' | 'right' | 'middle'): Promise<void>
|
||||||
|
typeText(text: string): Promise<void>
|
||||||
|
key(name: string, action: 'press' | 'release'): Promise<void>
|
||||||
|
keys(combo: string[]): Promise<void>
|
||||||
|
scroll(amount: number, direction: 'vertical' | 'horizontal'): Promise<void>
|
||||||
|
mouseLocation(): Promise<{ x: number; y: number }>
|
||||||
|
|
||||||
|
// 模式 B: 窗口绑定输入(Windows SendMessage,不抢焦点)
|
||||||
|
sendChar?(hwnd: string, char: string): Promise<void>
|
||||||
|
sendKey?(hwnd: string, vk: number, action: 'down' | 'up'): Promise<void>
|
||||||
|
sendClick?(hwnd: string, x: number, y: number, button: 'left' | 'right'): Promise<void>
|
||||||
|
sendText?(hwnd: string, text: string): Promise<void>
|
||||||
|
}
|
||||||
|
|
||||||
|
/** 截图平台接口 */
|
||||||
|
export interface ScreenshotPlatform {
|
||||||
|
// 全屏截图
|
||||||
|
captureScreen(displayId?: number): Promise<ScreenshotResult>
|
||||||
|
// 区域截图
|
||||||
|
captureRegion(x: number, y: number, w: number, h: number): Promise<ScreenshotResult>
|
||||||
|
// 窗口截图(Windows: PrintWindow,macOS: SCContentFilter,Linux: xdotool+import)
|
||||||
|
captureWindow?(hwnd: string): Promise<ScreenshotResult | null>
|
||||||
|
}
|
||||||
|
|
||||||
|
/** 显示器平台接口 */
|
||||||
|
export interface DisplayPlatform {
|
||||||
|
listAll(): DisplayInfo[]
|
||||||
|
getSize(displayId?: number): DisplayInfo
|
||||||
|
}
|
||||||
|
|
||||||
|
/** 应用管理平台接口 */
|
||||||
|
export interface AppsPlatform {
|
||||||
|
listRunning(): WindowHandle[]
|
||||||
|
listInstalled(): Promise<InstalledApp[]>
|
||||||
|
open(name: string): Promise<void>
|
||||||
|
getFrontmostApp(): FrontmostAppInfo | null
|
||||||
|
findWindowByTitle(title: string): WindowHandle | null
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface ScreenshotResult {
|
||||||
|
base64: string
|
||||||
|
width: number
|
||||||
|
height: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface DisplayInfo {
|
||||||
|
width: number
|
||||||
|
height: number
|
||||||
|
scaleFactor: number
|
||||||
|
displayId: number
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface InstalledApp {
|
||||||
|
id: string // macOS: bundleId, Windows: exe path, Linux: .desktop name
|
||||||
|
displayName: string
|
||||||
|
path: string
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface FrontmostAppInfo {
|
||||||
|
id: string
|
||||||
|
appName: string
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2.3 平台分发器(`platforms/index.ts`)
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
import type { InputPlatform, ScreenshotPlatform, DisplayPlatform, AppsPlatform } from './types.js'
|
||||||
|
|
||||||
|
export interface Platform {
|
||||||
|
input: InputPlatform
|
||||||
|
screenshot: ScreenshotPlatform
|
||||||
|
display: DisplayPlatform
|
||||||
|
apps: AppsPlatform
|
||||||
|
}
|
||||||
|
|
||||||
|
export function loadPlatform(): Platform {
|
||||||
|
switch (process.platform) {
|
||||||
|
case 'darwin':
|
||||||
|
return require('./darwin.js').platform
|
||||||
|
case 'win32':
|
||||||
|
return require('./win32.js').platform
|
||||||
|
case 'linux':
|
||||||
|
return require('./linux.js').platform
|
||||||
|
default:
|
||||||
|
throw new Error(`Computer Use not supported on ${process.platform}`)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2.4 各平台实现
|
||||||
|
|
||||||
|
**`platforms/darwin.ts`** — 委托给 @ant 包(保持兼容):
|
||||||
|
```typescript
|
||||||
|
// macOS: 通过 @ant/computer-use-input 和 @ant/computer-use-swift
|
||||||
|
// 这两个包的 darwin 后端保留不动
|
||||||
|
import { requireComputerUseInput } from '../inputLoader.js'
|
||||||
|
import { requireComputerUseSwift } from '../swiftLoader.js'
|
||||||
|
|
||||||
|
export const platform = {
|
||||||
|
input: { /* 委托给 requireComputerUseInput() */ },
|
||||||
|
screenshot: { /* 委托给 requireComputerUseSwift().screenshot */ },
|
||||||
|
display: { /* 委托给 requireComputerUseSwift().display */ },
|
||||||
|
apps: { /* 委托给 requireComputerUseSwift().apps */ },
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**`platforms/win32.ts`** — 使用 `src/utils/computerUse/win32/` 模块:
|
||||||
|
```typescript
|
||||||
|
// Windows: SendMessage 输入 + PrintWindow 截图 + EnumWindows 应用
|
||||||
|
import { sendChar, sendKey, sendClick, sendText } from '../win32/windowMessage.js'
|
||||||
|
import { captureWindow } from '../win32/windowCapture.js'
|
||||||
|
import { listWindows } from '../win32/windowEnum.js'
|
||||||
|
// ... PowerShell P/Invoke 全局输入作为 fallback
|
||||||
|
|
||||||
|
export const platform = {
|
||||||
|
input: {
|
||||||
|
// 全局模式: PowerShell SetCursorPos/SendInput(fallback)
|
||||||
|
// 窗口模式: SendMessage(首选)
|
||||||
|
sendChar, sendKey, sendClick, sendText, // 窗口绑定
|
||||||
|
moveMouse, click, typeText, ... // 全局 fallback
|
||||||
|
},
|
||||||
|
screenshot: {
|
||||||
|
captureScreen, // CopyFromScreen
|
||||||
|
captureRegion, // CopyFromScreen(rect)
|
||||||
|
captureWindow, // PrintWindow(不抢焦点)
|
||||||
|
},
|
||||||
|
display: { /* Screen.AllScreens */ },
|
||||||
|
apps: { /* EnumWindows */ },
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**`platforms/linux.ts`** — 使用 xdotool/scrot:
|
||||||
|
```typescript
|
||||||
|
// Linux: xdotool + scrot + xrandr + wmctrl
|
||||||
|
export const platform = {
|
||||||
|
input: { /* xdotool mousemove/click/key/type */ },
|
||||||
|
screenshot: { /* scrot */ },
|
||||||
|
display: { /* xrandr */ },
|
||||||
|
apps: { /* wmctrl + ps */ },
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2.5 executor.ts 改造
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
// 之前: 直接调 requireComputerUseSwift() 和 requireComputerUseInput()
|
||||||
|
// 之后: 通过 platforms/ 统一获取
|
||||||
|
|
||||||
|
import { loadPlatform } from './platforms/index.js'
|
||||||
|
|
||||||
|
const platform = loadPlatform()
|
||||||
|
|
||||||
|
// 截图
|
||||||
|
platform.screenshot.captureScreen()
|
||||||
|
platform.screenshot.captureWindow?.(hwnd) // 窗口绑定(captureWindow 是可选方法,需用 ?. 调用)
|
||||||
|
|
||||||
|
// 输入(窗口绑定模式,不抢焦点)
|
||||||
|
platform.input.sendText?.(hwnd, 'Hello')
|
||||||
|
platform.input.sendClick?.(hwnd, 100, 200, 'left')
|
||||||
|
|
||||||
|
// 输入(全局模式,fallback)
|
||||||
|
platform.input.moveMouse(500, 500)
|
||||||
|
platform.input.click(500, 500, 'left')
|
||||||
|
```
|
||||||
|
|
||||||
|
## 3. Windows 输入模式对比
|
||||||
|
|
||||||
|
| 方式 | API | 抢焦点 | 移鼠标 | 窗口可最小化 | 适用场景 |
|
||||||
|
|------|-----|--------|--------|-------------|---------|
|
||||||
|
| **全局输入** | `SetCursorPos` + `SendInput` | ✅ 抢 | ✅ 动 | ❌ 不行 | 需要坐标点击(fallback) |
|
||||||
|
| **窗口消息** | `SendMessage(WM_CHAR/WM_KEYDOWN)` | ❌ 不抢 | ❌ 不动 | ✅ 可以 | 打字、按键(首选) |
|
||||||
|
| **窗口消息** | `SendMessage(WM_LBUTTONDOWN)` | ❌ 不抢 | ❌ 不动 | ⚠️ 部分 | 窗口内点击 |
|
||||||
|
| **窗口截图** | `PrintWindow(hwnd, PW_RENDERFULLCONTENT)` | ❌ 不抢 | ❌ 不动 | ✅ 可以 | 窗口截图 |
|
||||||
|
| **UI 操作** | `UIAutomation InvokePattern` | ❌ 不抢 | ❌ 不动 | ✅ 可以 | 按钮点击、文本写入 |
|
||||||
|
|
||||||
|
**策略**:优先用窗口消息 + UIAutomation(不干扰用户),全局输入作为 fallback。
|
||||||
|
|
||||||
|
## 4. 需要新增的文件
|
||||||
|
|
||||||
|
| 文件 | 说明 |
|
||||||
|
|------|------|
|
||||||
|
| `src/utils/computerUse/platforms/types.ts` | 公共接口定义 |
|
||||||
|
| `src/utils/computerUse/platforms/index.ts` | 平台分发器 |
|
||||||
|
| `src/utils/computerUse/platforms/darwin.ts` | macOS: 委托给 @ant 包 |
|
||||||
|
| `src/utils/computerUse/platforms/win32.ts` | Windows: 组合 win32/ 下各模块 |
|
||||||
|
| `src/utils/computerUse/platforms/linux.ts` | Linux: xdotool/scrot |
|
||||||
|
| `src/utils/computerUse/win32/windowMessage.ts` | **新增**: SendMessage 无焦点输入 |
|
||||||
|
|
||||||
|
## 5. 需要移除/清理的文件
|
||||||
|
|
||||||
|
| 文件 | 操作 | 原因 |
|
||||||
|
|------|------|------|
|
||||||
|
| `packages/@ant/computer-use-input/src/backends/win32.ts` | 删除 | Windows 代码不应在 macOS 包里 |
|
||||||
|
| `packages/@ant/computer-use-input/src/backends/linux.ts` | 删除 | Linux 代码不应在 macOS 包里 |
|
||||||
|
| `packages/@ant/computer-use-swift/src/backends/win32.ts` | 删除 | 同上 |
|
||||||
|
| `packages/@ant/computer-use-swift/src/backends/linux.ts` | 删除 | 同上 |
|
||||||
|
| `packages/@ant/computer-use-input/src/types.ts` | 删除 | 移到 platforms/types.ts |
|
||||||
|
| `packages/@ant/computer-use-swift/src/types.ts` | 删除 | 移到 platforms/types.ts |
|
||||||
|
|
||||||
|
## 6. 需要修改的文件
|
||||||
|
|
||||||
|
| 文件 | 改动 |
|
||||||
|
|------|------|
|
||||||
|
| `packages/@ant/computer-use-input/src/index.ts` | 恢复为仅 darwin dispatcher(去掉 win32/linux case) |
|
||||||
|
| `packages/@ant/computer-use-swift/src/index.ts` | 恢复为仅 darwin dispatcher(去掉 win32/linux case) |
|
||||||
|
| `src/utils/computerUse/executor.ts` | 通过 `platforms/` 获取平台实现,不直接调 @ant 包 |
|
||||||
|
| `src/utils/computerUse/swiftLoader.ts` | 仅 darwin 加载 |
|
||||||
|
| `src/utils/computerUse/inputLoader.ts` | 仅 darwin 加载 |
|
||||||
|
|
||||||
|
## 7. @ant 包的定位(修正后)
|
||||||
|
|
||||||
|
| 包 | 职责 | 平台 |
|
||||||
|
|---|------|------|
|
||||||
|
| `@ant/computer-use-input` | macOS enigo 键鼠原生模块包装 | **仅 darwin** |
|
||||||
|
| `@ant/computer-use-swift` | macOS Swift 截图/应用原生模块包装 | **仅 darwin** |
|
||||||
|
| `@ant/computer-use-mcp` | MCP Server + 工具定义 + 调用路由 | **跨平台**(不含平台代码) |
|
||||||
|
|
||||||
|
Windows/Linux 的平台实现全部在 `src/utils/computerUse/platforms/` 和 `src/utils/computerUse/win32/` 中。
|
||||||
|
|
||||||
|
## 8. 执行顺序
|
||||||
|
|
||||||
|
```
|
||||||
|
Phase 1: 创建 platforms/ 抽象层
|
||||||
|
├── platforms/types.ts(公共接口)
|
||||||
|
├── platforms/index.ts(分发器)
|
||||||
|
└── platforms/darwin.ts(委托 @ant 包)
|
||||||
|
|
||||||
|
Phase 2: 创建 Windows 平台实现
|
||||||
|
├── win32/windowMessage.ts(SendMessage 无焦点输入)
|
||||||
|
└── platforms/win32.ts(组合 win32/ 各模块)
|
||||||
|
|
||||||
|
Phase 3: 创建 Linux 平台实现
|
||||||
|
└── platforms/linux.ts(xdotool/scrot)
|
||||||
|
|
||||||
|
Phase 4: 改造 executor.ts
|
||||||
|
└── 通过 platforms/ 获取实现,不直接调 @ant
|
||||||
|
|
||||||
|
Phase 5: 清理 @ant 包
|
||||||
|
├── 删除 @ant/computer-use-input/src/backends/{win32,linux}.ts
|
||||||
|
├── 删除 @ant/computer-use-swift/src/backends/{win32,linux}.ts
|
||||||
|
└── 恢复 index.ts 为 darwin-only
|
||||||
|
|
||||||
|
Phase 6: 验证 + PR
|
||||||
|
```
|
||||||
496
docs/features/computer-use-tools-reference.md
Normal file
496
docs/features/computer-use-tools-reference.md
Normal file
@@ -0,0 +1,496 @@
|
|||||||
|
# Computer Use 工具参考文档
|
||||||
|
|
||||||
|
## 概览
|
||||||
|
|
||||||
|
Computer Use 提供 37 个工具,分为三类:
|
||||||
|
|
||||||
|
| 分类 | 平台 | 工具数 | 说明 |
|
||||||
|
|------|------|--------|------|
|
||||||
|
| 通用工具 | 全平台 | 24 | 官方 Computer Use 标准能力 |
|
||||||
|
| Windows 专属工具 | Win32 | 10 | 绑定窗口模式下的增强能力 |
|
||||||
|
| 教学工具 | 全平台 | 3 | 分步引导模式(需 teachMode 开启) |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 一、通用工具(24 个)
|
||||||
|
|
||||||
|
全平台可用。未绑定窗口时,操作对象是整个屏幕。
|
||||||
|
|
||||||
|
### 权限与会话
|
||||||
|
|
||||||
|
| 工具 | 参数 | 说明 |
|
||||||
|
|------|------|------|
|
||||||
|
| `request_access` | `apps[]`, `reason`, `clipboardRead?`, `clipboardWrite?`, `systemKeyCombos?` | 请求操作应用的权限。所有其他工具的前置条件 |
|
||||||
|
| `list_granted_applications` | — | 列出当前会话已授权的应用 |
|
||||||
|
|
||||||
|
### 截图与显示
|
||||||
|
|
||||||
|
| 工具 | 参数 | 说明 |
|
||||||
|
|------|------|------|
|
||||||
|
| `screenshot` | `save_to_disk?` | 截取当前屏幕。绑定窗口时截取绑定窗口(PrintWindow)。返回图片 + GUI 元素列表(Windows) |
|
||||||
|
| `zoom` | `region: [x1,y1,x2,y2]` | 截取指定区域的高分辨率图片。坐标基于最近一次全屏截图 |
|
||||||
|
| `switch_display` | `display` | 切换截图的目标显示器 |
|
||||||
|
|
||||||
|
### 鼠标操作
|
||||||
|
|
||||||
|
| 工具 | 参数 | 说明 |
|
||||||
|
|------|------|------|
|
||||||
|
| `left_click` | `coordinate: [x,y]`, `text?` (修饰键) | 左键点击。`text` 可传 "shift"/"ctrl"/"alt" 实现组合点击 |
|
||||||
|
| `double_click` | `coordinate`, `text?` | 双击 |
|
||||||
|
| `triple_click` | `coordinate`, `text?` | 三击(选整行) |
|
||||||
|
| `right_click` | `coordinate`, `text?` | 右键点击 |
|
||||||
|
| `middle_click` | `coordinate`, `text?` | 中键点击 |
|
||||||
|
| `mouse_move` | `coordinate` | 移动鼠标(不点击) |
|
||||||
|
| `left_click_drag` | `coordinate` (终点), `start_coordinate?` (起点) | 拖拽 |
|
||||||
|
| `left_mouse_down` | — | 按下左键不松 |
|
||||||
|
| `left_mouse_up` | — | 松开左键 |
|
||||||
|
| `cursor_position` | — | 获取当前鼠标位置 |
|
||||||
|
|
||||||
|
### 键盘操作
|
||||||
|
|
||||||
|
| 工具 | 参数 | 说明 |
|
||||||
|
|------|------|------|
|
||||||
|
| `type` | `text` | 输入文字 |
|
||||||
|
| `key` | `text` (如 "ctrl+s"), `repeat?` | 按键/组合键 |
|
||||||
|
| `hold_key` | `text`, `duration` (秒) | 按住键指定时长 |
|
||||||
|
|
||||||
|
### 滚动
|
||||||
|
|
||||||
|
| 工具 | 参数 | 说明 |
|
||||||
|
|------|------|------|
|
||||||
|
| `scroll` | `coordinate`, `scroll_direction`, `scroll_amount` | 滚动。方向: up/down/left/right |
|
||||||
|
|
||||||
|
### 应用管理
|
||||||
|
|
||||||
|
| 工具 | 参数 | 说明 |
|
||||||
|
|------|------|------|
|
||||||
|
| `open_application` | `app` | 打开应用。Windows 上自动绑定窗口 |
|
||||||
|
|
||||||
|
### 剪贴板
|
||||||
|
|
||||||
|
| 工具 | 参数 | 说明 |
|
||||||
|
|------|------|------|
|
||||||
|
| `read_clipboard` | — | 读取剪贴板文字 |
|
||||||
|
| `write_clipboard` | `text` | 写入剪贴板 |
|
||||||
|
|
||||||
|
### 其他
|
||||||
|
|
||||||
|
| 工具 | 参数 | 说明 |
|
||||||
|
|------|------|------|
|
||||||
|
| `wait` | `duration` (秒) | 等待 |
|
||||||
|
| `computer_batch` | `actions[]` | 批量执行多个动作(减少 API 往返) |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 二、Windows 专属工具(10 个)
|
||||||
|
|
||||||
|
仅 Windows 平台可见。核心能力:**绑定窗口后的独立操作——不抢占用户鼠标键盘**。
|
||||||
|
|
||||||
|
### 工作模式
|
||||||
|
|
||||||
|
```
|
||||||
|
┌──────────────────────────────────────────────────┐
|
||||||
|
│ 未绑定模式 │
|
||||||
|
│ 使用通用工具 (left_click/type/key/scroll) │
|
||||||
|
│ 操作对象:整个屏幕 │
|
||||||
|
│ 输入方式:全局 SendInput(会移动真实鼠标) │
|
||||||
|
└──────────────────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
bind_window / open_application
|
||||||
|
▼
|
||||||
|
┌──────────────────────────────────────────────────┐
|
||||||
|
│ 绑定窗口模式 │
|
||||||
|
│ 使用 Win32 工具 (virtual_mouse/virtual_keyboard) │
|
||||||
|
│ 操作对象:绑定的窗口 │
|
||||||
|
│ 输入方式:SendMessageW(不动真实鼠标/键盘) │
|
||||||
|
│ 可视化:DWM 绿色边框 + 虚拟光标 + 状态指示器 │
|
||||||
|
└──────────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
### 窗口绑定
|
||||||
|
|
||||||
|
| 工具 | 参数 | 说明 |
|
||||||
|
|------|------|------|
|
||||||
|
| `bind_window` | `action`: list/bind/unbind/status | 窗口绑定管理 |
|
||||||
|
|
||||||
|
**动作详情:**
|
||||||
|
|
||||||
|
| action | 参数 | 说明 |
|
||||||
|
|--------|------|------|
|
||||||
|
| `list` | — | 列出所有可见窗口(hwnd、pid、title) |
|
||||||
|
| `bind` | `title?`, `hwnd?`, `pid?` | 绑定到指定窗口。设置 DWM 绿色边框 + 启动虚拟光标 + 启动状态指示器 + 短暂激活窗口确保可接收输入 |
|
||||||
|
| `unbind` | — | 解除绑定,恢复全屏模式 |
|
||||||
|
| `status` | — | 查看当前绑定状态(hwnd、title、pid、窗口矩形) |
|
||||||
|
|
||||||
|
### 窗口管理
|
||||||
|
|
||||||
|
| 工具 | 参数 | 说明 |
|
||||||
|
|------|------|------|
|
||||||
|
| `window_management` | `action`, `x?`, `y?`, `width?`, `height?` | 窗口操作(Win32 API,不走全局快捷键) |
|
||||||
|
|
||||||
|
**动作详情:**
|
||||||
|
|
||||||
|
| action | 说明 |
|
||||||
|
|--------|------|
|
||||||
|
| `minimize` | ShowWindow(SW_MINIMIZE) |
|
||||||
|
| `maximize` | ShowWindow(SW_MAXIMIZE) |
|
||||||
|
| `restore` | ShowWindow(SW_RESTORE) — 恢复最小化/最大化 |
|
||||||
|
| `close` | SendMessage(WM_CLOSE) — 优雅关闭 |
|
||||||
|
| `focus` | SetForegroundWindow + BringWindowToTop — 激活窗口 |
|
||||||
|
| `move_offscreen` | SetWindowPos(-32000,-32000) — 移到屏幕外(仍可 SendMessage/PrintWindow) |
|
||||||
|
| `move_resize` | SetWindowPos — 移动/缩放到指定位置和大小 |
|
||||||
|
| `get_rect` | GetWindowRect — 获取当前位置和大小 |
|
||||||
|
|
||||||
|
### 虚拟鼠标
|
||||||
|
|
||||||
|
| 工具 | 参数 | 说明 |
|
||||||
|
|------|------|------|
|
||||||
|
| `virtual_mouse` | `action`, `coordinate: [x,y]`, `start_coordinate?` | 在绑定窗口内操作虚拟鼠标 |
|
||||||
|
|
||||||
|
**动作详情:**
|
||||||
|
|
||||||
|
| action | 说明 |
|
||||||
|
|--------|------|
|
||||||
|
| `click` | 左键点击。虚拟光标移动到坐标 + 闪烁动画 |
|
||||||
|
| `double_click` | 双击 |
|
||||||
|
| `right_click` | 右键点击 |
|
||||||
|
| `move` | 移动虚拟光标(不点击) |
|
||||||
|
| `drag` | 按住 → 移动 → 松开。需 `start_coordinate` 指定起点 |
|
||||||
|
| `down` | 按下左键不松 |
|
||||||
|
| `up` | 松开左键 |
|
||||||
|
|
||||||
|
**与通用鼠标工具的区别:**
|
||||||
|
|
||||||
|
| | 通用 (`left_click` 等) | `virtual_mouse` |
|
||||||
|
|---|---|---|
|
||||||
|
| 输入方式 | SendInput(全局) | SendMessageW(窗口级) |
|
||||||
|
| 真实鼠标 | 会移动 | **不动** |
|
||||||
|
| 用户干扰 | 有 | **无** |
|
||||||
|
| 适用场景 | 未绑定时 | **绑定后** |
|
||||||
|
|
||||||
|
### 虚拟键盘
|
||||||
|
|
||||||
|
| 工具 | 参数 | 说明 |
|
||||||
|
|------|------|------|
|
||||||
|
| `virtual_keyboard` | `action`, `text`, `duration?`, `repeat?` | 在绑定窗口内操作虚拟键盘 |
|
||||||
|
|
||||||
|
**动作详情:**
|
||||||
|
|
||||||
|
| action | text 含义 | 说明 |
|
||||||
|
|--------|----------|------|
|
||||||
|
| `type` | 要输入的文字 | SendMessageW(WM_CHAR),支持 Unicode 中文/emoji |
|
||||||
|
| `combo` | 组合键 (如 "ctrl+s") | WM_KEYDOWN/UP 序列 |
|
||||||
|
| `press` | 单个键名 | 按下不松(配合 release 使用) |
|
||||||
|
| `release` | 单个键名 | 松开按键 |
|
||||||
|
| `hold` | 键名或组合 | 按住指定秒数后松开 |
|
||||||
|
|
||||||
|
**与通用键盘工具的区别:**
|
||||||
|
|
||||||
|
| | 通用 (`type`/`key`) | `virtual_keyboard` |
|
||||||
|
|---|---|---|
|
||||||
|
| 输入方式 | SendInput(全局) | SendMessageW(窗口级) |
|
||||||
|
| 物理键盘 | 会冲突 | **不冲突** |
|
||||||
|
| 适用场景 | 未绑定时 | **绑定后** |
|
||||||
|
|
||||||
|
**注意:** SendMessageW 对 Windows Terminal (ConPTY) 等现代应用无效。这些应用需要使用通用工具 + 窗口激活方式操作。
|
||||||
|
|
||||||
|
### 鼠标滚轮
|
||||||
|
|
||||||
|
| 工具 | 参数 | 说明 |
|
||||||
|
|------|------|------|
|
||||||
|
| `mouse_wheel` | `coordinate: [x,y]`, `delta`, `direction?` | WM_MOUSEWHEEL 鼠标中键滚轮 |
|
||||||
|
|
||||||
|
**参数说明:**
|
||||||
|
- `delta`: 垂直方向时正值=向上,负值=向下;水平方向(`direction` 为 "horizontal")时正值=向右,负值=向左。每 1 单位 ≈ 3 行
|
||||||
|
- `direction`: "vertical"(默认)或 "horizontal"
|
||||||
|
- `coordinate`: 滚轮作用点——决定哪个面板/区域接收滚动
|
||||||
|
|
||||||
|
**与通用 `scroll` 的区别:**
|
||||||
|
|
||||||
|
| | `scroll` | `mouse_wheel` |
|
||||||
|
|---|---|---|
|
||||||
|
| 原理 | WM_VSCROLL/WM_HSCROLL | **WM_MOUSEWHEEL** |
|
||||||
|
| Excel | ❌ | ✅ |
|
||||||
|
| 浏览器 | ❌ | ✅ |
|
||||||
|
| 代码编辑器 | ❌ | ✅ |
|
||||||
|
|
||||||
|
### 元素级操作
|
||||||
|
|
||||||
|
| 工具 | 参数 | 说明 |
|
||||||
|
|------|------|------|
|
||||||
|
| `click_element` | `name?`, `role?`, `automationId?` | 按无障碍名称/角色点击 GUI 元素 |
|
||||||
|
| `type_into_element` | `name?`, `role?`, `automationId?`, `text` | 按名称向元素输入文字 |
|
||||||
|
|
||||||
|
**工作原理:**
|
||||||
|
1. 通过 UI Automation 在绑定窗口中查找匹配元素
|
||||||
|
2. `click_element`: 先尝试 InvokePattern(按钮/菜单),失败则 SendMessage 点击 BoundingRect 中心
|
||||||
|
3. `type_into_element`: 先尝试 ValuePattern 直接设值,失败则点击聚焦 + WM_CHAR 输入
|
||||||
|
|
||||||
|
**适用场景:**
|
||||||
|
- 截图中看到元素名称但坐标不精确时
|
||||||
|
- Accessibility Snapshot 列出了元素的 name/automationId 时
|
||||||
|
- 比坐标点击更可靠(不受窗口缩放/DPI 影响)
|
||||||
|
|
||||||
|
### 终端交互
|
||||||
|
|
||||||
|
| 工具 | 参数 | 说明 |
|
||||||
|
|------|------|------|
|
||||||
|
| `prompt_respond` | `response_type`, `arrow_direction?`, `arrow_count?`, `text?` | 处理终端 Yes/No/选择提示 |
|
||||||
|
|
||||||
|
**response_type 详情:**
|
||||||
|
|
||||||
|
| response_type | 操作 | 场景 |
|
||||||
|
|---------------|------|------|
|
||||||
|
| `yes` | 发送 'y' + Enter | npm "Continue? (y/n)" |
|
||||||
|
| `no` | 发送 'n' + Enter | 拒绝确认 |
|
||||||
|
| `enter` | 发送 Enter | 接受默认选项 |
|
||||||
|
| `escape` | 发送 Escape | 取消操作 |
|
||||||
|
| `select` | ↑/↓ 箭头 × N + Enter | inquirer 选择菜单 |
|
||||||
|
| `type` | 输入文字 + Enter | 文本输入提示 |
|
||||||
|
|
||||||
|
### 状态指示器
|
||||||
|
|
||||||
|
| 工具 | 参数 | 说明 |
|
||||||
|
|------|------|------|
|
||||||
|
| `status_indicator` | `action`: show/hide/status, `message?` | 控制绑定窗口底部的浮动状态标签 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 三、教学工具(3 个)
|
||||||
|
|
||||||
|
需要 `teachMode` 开启。
|
||||||
|
|
||||||
|
| 工具 | 说明 |
|
||||||
|
|------|------|
|
||||||
|
| `request_teach_access` | 请求教学引导模式权限 |
|
||||||
|
| `teach_step` | 显示一步引导提示,等用户点 Next |
|
||||||
|
| `teach_batch` | 批量排队多步引导 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 操作流程
|
||||||
|
|
||||||
|
### 流程 1:全屏操作(未绑定)
|
||||||
|
|
||||||
|
```
|
||||||
|
request_access(apps=["Notepad"])
|
||||||
|
open_application(app="Notepad") ← 自动绑定窗口
|
||||||
|
screenshot ← PrintWindow 截图 + GUI 元素列表
|
||||||
|
left_click(coordinate=[500, 300]) ← 全局 SendInput
|
||||||
|
type(text="hello world") ← 全局 SendInput
|
||||||
|
key(text="ctrl+s") ← 全局 SendInput
|
||||||
|
```
|
||||||
|
|
||||||
|
### 流程 2:绑定窗口操作(推荐,不干扰用户)
|
||||||
|
|
||||||
|
```
|
||||||
|
request_access(apps=["Notepad"])
|
||||||
|
bind_window(action="list") ← 列出所有窗口
|
||||||
|
bind_window(action="bind", title="记事本") ← 绑定 + 绿色边框 + 虚拟光标
|
||||||
|
screenshot ← PrintWindow 截取绑定窗口
|
||||||
|
virtual_mouse(action="click", coordinate=[500, 300]) ← SendMessageW,不动真实鼠标
|
||||||
|
virtual_keyboard(action="type", text="hello world") ← SendMessageW,不动物理键盘
|
||||||
|
virtual_keyboard(action="combo", text="ctrl+s") ← 保存
|
||||||
|
mouse_wheel(coordinate=[500, 400], delta=-5) ← 向下滚动
|
||||||
|
bind_window(action="unbind") ← 解除绑定
|
||||||
|
```
|
||||||
|
|
||||||
|
### 流程 3:按元素名称操作
|
||||||
|
|
||||||
|
```
|
||||||
|
bind_window(action="bind", title="记事本")
|
||||||
|
screenshot ← 返回截图 + GUI elements 列表
|
||||||
|
click_element(name="保存", role="Button") ← UI Automation 查找并点击
|
||||||
|
type_into_element(role="Edit", text="new content")
|
||||||
|
```
|
||||||
|
|
||||||
|
### 流程 4:终端交互
|
||||||
|
|
||||||
|
```
|
||||||
|
bind_window(action="bind", title="PowerShell")
|
||||||
|
screenshot
|
||||||
|
prompt_respond(response_type="yes") ← 回答 y + Enter
|
||||||
|
prompt_respond(response_type="select", arrow_direction="down", arrow_count=2) ← 选第3项
|
||||||
|
```
|
||||||
|
|
||||||
|
### 流程 5:Excel/浏览器滚动
|
||||||
|
|
||||||
|
```
|
||||||
|
bind_window(action="bind", title="Excel")
|
||||||
|
screenshot
|
||||||
|
mouse_wheel(coordinate=[600, 400], delta=-10) ← 向下滚动 10 格
|
||||||
|
mouse_wheel(coordinate=[600, 400], delta=5, direction="horizontal") ← 向右滚动
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 应用兼容性
|
||||||
|
|
||||||
|
| 应用类型 | SendMessageW (virtual_*) | 元素操作 (click_element) | 注意 |
|
||||||
|
|---------|--------------------------|------------------------|------|
|
||||||
|
| 传统 Win32 (记事本/写字板) | ✅ | ✅ | 完美支持 |
|
||||||
|
| Office (Excel/Word) | ✅ (COM 自动化) | ✅ | 通过 COM API |
|
||||||
|
| WPF 应用 | ✅ | ✅ | 标准 UIA 支持 |
|
||||||
|
| Electron/Chrome | ⚠️ 部分 | ⚠️ 部分 | 内部渲染不走 Win32 消息 |
|
||||||
|
| UWP/WinUI (Windows Terminal) | ❌ | ❌ | ConPTY 不接受 SendMessageW |
|
||||||
|
| 浏览器网页内容 | ❌ | ❌ | 需要全局 SendInput |
|
||||||
|
|
||||||
|
**对于不支持 SendMessageW 的应用**,使用通用工具 (`left_click`/`type`/`key`) + `window_management(action="focus")` 先激活窗口。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 绑定窗口时的可视化
|
||||||
|
|
||||||
|
绑定窗口后自动启动三层可视化:
|
||||||
|
|
||||||
|
1. **DWM 绿色边框** — 窗口自身的边框颜色变绿,零偏移
|
||||||
|
2. **虚拟鼠标光标** — 红色箭头图标,跟随 virtual_mouse 操作移动,点击时闪烁
|
||||||
|
3. **状态指示器** — 窗口底部浮动标签,显示当前操作(通过 status_indicator 控制)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Accessibility Snapshot
|
||||||
|
|
||||||
|
每次 `screenshot` 时,如果窗口已绑定,会自动附带 GUI 元素列表:
|
||||||
|
|
||||||
|
```
|
||||||
|
GUI elements in this window:
|
||||||
|
[Button] "Save" (120,50 80x30) enabled
|
||||||
|
[Edit] "" (200,80 400x25) enabled value="hello" id=textBox1
|
||||||
|
[MenuItem] "File" (10,0 40x25) enabled
|
||||||
|
[MenuItem] "Edit" (50,0 40x25) enabled
|
||||||
|
[CheckBox] "Auto-save" (300,50 100x20) enabled id=chkAutoSave
|
||||||
|
```
|
||||||
|
|
||||||
|
模型同时收到 **截图图片 + 结构化元素列表**,可以选择:
|
||||||
|
- 用坐标操作:`virtual_mouse(action="click", coordinate=[160, 65])`(取元素矩形 `(120,50 80x30)` 的中心点,而不是左上角)
|
||||||
|
- 用名称操作:`click_element(name="Save")`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## UI Automation Control Patterns 参考
|
||||||
|
|
||||||
|
`click_element` / `type_into_element` 底层使用 UI Automation Control Patterns。当前已实现的和可扩展的:
|
||||||
|
|
||||||
|
| Pattern | 用途 | 当前状态 | 可用于 |
|
||||||
|
|---------|------|---------|--------|
|
||||||
|
| `InvokePattern` | 触发点击 | ✅ 已实现 (`click_element`) | 按钮、菜单项、链接 |
|
||||||
|
| `ValuePattern` | 读写文本值 | ✅ 已实现 (`type_into_element`) | 文本框、组合框 |
|
||||||
|
| `TogglePattern` | 切换状态 | ❌ 未实现 | 复选框、开关 |
|
||||||
|
| `SelectionPattern` | 选择项目 | ❌ 未实现 | 下拉菜单、列表 |
|
||||||
|
| `ScrollPattern` | 编程滚动 | ❌ 未实现(用 `mouse_wheel` 替代) | 列表、树、面板 |
|
||||||
|
| `ExpandCollapsePattern` | 展开/折叠 | ❌ 未实现 | 树节点、折叠面板 |
|
||||||
|
| `WindowPattern` | 窗口操作 | ❌ 未实现(用 `window_management` 替代) | 窗口最大化/关闭 |
|
||||||
|
| `TextPattern` | 读取文档文本 | ❌ 未实现 | 文档、富文本 |
|
||||||
|
| `GridPattern` | 表格操作 | ❌ 未实现 | Excel 单元格、数据网格 |
|
||||||
|
| `TablePattern` | 表格结构 | ❌ 未实现 | 表头、行列关系 |
|
||||||
|
| `RangeValuePattern` | 范围值操作 | ❌ 未实现 | 滑块、进度条 |
|
||||||
|
| `TransformPattern` | 移动/缩放 | ❌ 未实现 | 可拖拽元素 |
|
||||||
|
|
||||||
|
**扩展路线:** 优先实现 `TogglePattern`(复选框)和 `SelectionPattern`(下拉菜单),这两个在表单自动化中最常用。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 屏幕截取技术方案对比
|
||||||
|
|
||||||
|
当前使用 Python Bridge (mss) 进行截图,底层是 GDI BitBlt。三种方案对比:
|
||||||
|
|
||||||
|
| 方案 | API | 当前状态 | 性能 | 优势 | 限制 |
|
||||||
|
|------|-----|---------|------|------|------|
|
||||||
|
| **GDI BitBlt** | `BitBlt` / `PrintWindow` | ✅ 当前使用 (mss/bridge.py) | ~300ms | 简单稳定,支持后台窗口 (PrintWindow) | 不支持硬件加速内容、DPI 处理复杂 |
|
||||||
|
| **DXGI Desktop Duplication** | `IDXGIOutputDuplication` | ❌ 未实现 | ~16ms (60fps) | 硬件加速,支持 HDR,GPU 直接读取 | 不支持单窗口截取,需 D3D11 |
|
||||||
|
| **Windows.Graphics.Capture** | `GraphicsCaptureItem` | ❌ 未实现 | ~16ms | 最新 API,支持单窗口/单显示器,系统级权限管理 | Win10 1903+,首次需用户确认 |
|
||||||
|
|
||||||
|
### 推荐升级路径
|
||||||
|
|
||||||
|
```
|
||||||
|
当前: GDI BitBlt (mss) ─── 全屏 ~300ms, 窗口 ~300ms (PrintWindow)
|
||||||
|
│
|
||||||
|
├─ 近期: DXGI Desktop Duplication ─── 全屏 ~16ms, 但不支持单窗口
|
||||||
|
│
|
||||||
|
└─ 远期: Windows.Graphics.Capture ─── 全屏 + 单窗口都 ~16ms
|
||||||
|
```
|
||||||
|
|
||||||
|
### DXGI Desktop Duplication 实现要点
|
||||||
|
|
||||||
|
```python
|
||||||
|
# bridge.py 中可添加 DXGI 截图(通过 d3dshot 或 dxcam 库)
|
||||||
|
import dxcam # pip install dxcam
|
||||||
|
|
||||||
|
camera = dxcam.create()
|
||||||
|
frame = camera.grab() # numpy array, ~5ms
|
||||||
|
# 转为 JPEG base64 发送
|
||||||
|
```
|
||||||
|
|
||||||
|
### Windows.Graphics.Capture 实现要点
|
||||||
|
|
||||||
|
```python
|
||||||
|
# 需要 WinRT Python 绑定
|
||||||
|
# pip install winrt-Windows.Graphics.Capture winrt-Windows.Graphics.DirectX
|
||||||
|
# 限制:首次调用需要用户在系统弹窗中确认权限
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 输入方式技术矩阵
|
||||||
|
|
||||||
|
不同应用类型需要不同的输入方式:
|
||||||
|
|
||||||
|
| 输入方式 | API | 优势 | 限制 | 适用应用 |
|
||||||
|
|---------|-----|------|------|---------|
|
||||||
|
| **SendMessageW** | `WM_CHAR` / `WM_KEYDOWN` | 不抢焦点,不动真实键鼠 | 现代应用不支持 | Win32 传统应用 (记事本/Office/WPF) |
|
||||||
|
| **SendInput** | `INPUT` 结构体 | 所有应用都支持 | **必须前台焦点**,会干扰用户 | 所有应用(通用后备) |
|
||||||
|
| **WriteConsoleInput** | 控制台 API | 直接写入控制台缓冲区 | 需要 AttachConsole(可能被拒绝) | cmd/PowerShell(非 Windows Terminal) |
|
||||||
|
| **UI Automation** | `InvokePattern` / `ValuePattern` | 语义级操作,最可靠 | 部分应用不暴露 UIA 接口 | 支持 UIA 的应用 |
|
||||||
|
| **COM Automation** | Excel/Word COM | 完全编程控制 | 仅 Office 应用 | Excel / Word |
|
||||||
|
| **剪贴板 + 粘贴** | `SetClipboardData` + `Ctrl+V` | 绕过输入限制 | 会覆盖用户剪贴板 | 通用后备 |
|
||||||
|
|
||||||
|
### 按应用类型的推荐输入策略
|
||||||
|
|
||||||
|
| 应用类型 | 首选 | 后备 | 说明 |
|
||||||
|
|---------|------|------|------|
|
||||||
|
| 传统 Win32 (记事本/写字板) | SendMessageW | UIA ValuePattern | 虚拟输入完美工作 |
|
||||||
|
| Office (Excel/Word) | COM Automation | SendMessageW | COM 提供结构化操作 |
|
||||||
|
| WPF 应用 | SendMessageW | UIA | 标准 Win32 消息循环 |
|
||||||
|
| Electron/Chrome 应用 | UIA | 剪贴板粘贴 | 内部渲染不走 Win32 |
|
||||||
|
| Windows Terminal (ConPTY) | SendInput (需前台) | 剪贴板粘贴 | ConPTY 不接受外部消息 |
|
||||||
|
| UWP/WinUI 应用 | SendInput (需前台) | UIA | XAML 渲染不走 Win32 消息 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 已知限制与待解决
|
||||||
|
|
||||||
|
| 限制 | 影响 | 计划 |
|
||||||
|
|------|------|------|
|
||||||
|
| Windows Terminal 不接受 SendMessageW | 虚拟键盘/鼠标对终端无效 | 自动检测应用类型,终端类切换到 SendInput + 短暂激活 |
|
||||||
|
| PrintWindow 截不到 alternate screen buffer | Ink REPL 画面截不到 | 切换到 Windows.Graphics.Capture |
|
||||||
|
| Accessibility Snapshot 对大应用慢 (>30s) | Excel 等复杂应用超时 | 限制遍历深度 + 超时保护 |
|
||||||
|
| DWM 边框对自定义标题栏应用可能无效 | 某些 Electron 应用看不到边框 | 检测并回退到叠加窗口方案 |
|
||||||
|
| 虚拟光标是 PowerShell WinForms 进程 | 启动慢 (~1s),资源占用 | 考虑用 Win32 原生窗口替代 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 技术路线图
|
||||||
|
|
||||||
|
### Phase 1(当前)— 基础功能
|
||||||
|
- ✅ SendMessageW 虚拟输入
|
||||||
|
- ✅ PrintWindow/mss 截图
|
||||||
|
- ✅ UI Automation (InvokePattern + ValuePattern)
|
||||||
|
- ✅ Accessibility Snapshot
|
||||||
|
- ✅ DWM 边框指示
|
||||||
|
- ✅ Python Bridge
|
||||||
|
|
||||||
|
### Phase 2(近期)— 兼容性增强
|
||||||
|
- ⬜ 应用类型自动检测(Win32 vs Terminal vs UWP)
|
||||||
|
- ⬜ 终端类应用自动切换 SendInput + 短暂激活
|
||||||
|
- ⬜ TogglePattern / SelectionPattern 支持
|
||||||
|
- ⬜ DXGI Desktop Duplication 高速截图
|
||||||
|
- ⬜ Accessibility Snapshot 超时保护
|
||||||
|
|
||||||
|
### Phase 3(远期)— 高级能力
|
||||||
|
- ⬜ Windows.Graphics.Capture(单窗口实时截图)
|
||||||
|
- ⬜ 截图元素标注(在截图上标记 ID 数字)
|
||||||
|
- ⬜ 浏览器 DOM 提取(绑定浏览器时提取网页结构)
|
||||||
|
- ⬜ GridPattern / TablePattern(Excel 单元格级操作)
|
||||||
|
- ⬜ TextPattern(文档内容读取)
|
||||||
|
- ⬜ 多窗口协同操作
|
||||||
@@ -1,136 +1,197 @@
|
|||||||
# Computer Use 用户指南
|
# Computer Use — macOS / Windows / Linux 跨平台实施计划
|
||||||
|
|
||||||
Computer Use 让 Claude 直接操控你的电脑——移动鼠标、点击、输入文字、截图,就像一个远程助手坐在你面前操作一样。
|
更新时间:2026-04-03
|
||||||
|
参考项目:`E:\源码\claude-code-source-main\claude-code-source-main`
|
||||||
|
|
||||||
## 支持平台
|
## 1. 现状
|
||||||
|
|
||||||
| 平台 | 状态 | 额外配置 |
|
参考项目的 Computer Use **仅支持 macOS**——从入口到底层全部写死 darwin。我们的项目在 Phase 1-3 中已经完成了:
|
||||||
|------|------|---------|
|
|
||||||
| macOS | 可用 | 需授予辅助功能 + 屏幕录制权限 |
|
|
||||||
| Windows | 可用 | 无需额外配置 |
|
|
||||||
| Linux | 不可用 | 后端待开发 |
|
|
||||||
|
|
||||||
## 快速开始
|
- ✅ `@ant/computer-use-mcp` stub 替换为完整实现(12 文件)
|
||||||
|
- ✅ `@ant/computer-use-input` 拆为 dispatcher + backends(darwin + win32)
|
||||||
|
- ✅ `@ant/computer-use-swift` 拆为 dispatcher + backends(darwin + win32)
|
||||||
|
- ✅ `CHICAGO_MCP` 编译开关已开
|
||||||
|
- ❌ `src/` 层有 6 处 macOS 硬编码阻塞
|
||||||
|
|
||||||
1. 启动 Claude Code:
|
## 2. 阻塞点全景
|
||||||
|
|
||||||
```bash
|
### 2.1 入口层
|
||||||
bun run dev
|
|
||||||
```
|
|
||||||
|
|
||||||
Computer Use 默认已开启,无需额外参数。
|
| # | 文件:行号 | 阻塞代码 | 影响 |
|
||||||
|
|---|----------|---------|------|
|
||||||
|
| 1 | `src/main.tsx:1605` | `getPlatform() === 'macos'` | 整个 CU 初始化被跳过 |
|
||||||
|
|
||||||
2. 在对话中告诉 Claude 你想做什么,例如:
|
### 2.2 加载层
|
||||||
- "帮我打开系统设置"
|
|
||||||
- "截个屏看看当前桌面"
|
|
||||||
- "在 Finder 里点击那个文件"
|
|
||||||
|
|
||||||
3. 首次操控某个应用时,会弹出权限对话框让你确认。
|
| # | 文件:行号 | 阻塞代码 | 影响 |
|
||||||
|
|---|----------|---------|------|
|
||||||
|
| 2 | `src/utils/computerUse/swiftLoader.ts:16` | `process.platform !== 'darwin'` → throw | 截图、应用管理全部不可用 |
|
||||||
|
| 3 | `src/utils/computerUse/executor.ts:263` | `process.platform !== 'darwin'` → throw | 整个 executor 工厂函数不可用 |
|
||||||
|
|
||||||
4. 操作过程中随时按 **Esc**(macOS)或 **Ctrl+C**(Windows)中止。
|
### 2.3 macOS 特有依赖
|
||||||
|
|
||||||
## 权限说明
|
| # | 文件:行号 | 依赖 | macOS 实现 | 需要替代方案 |
|
||||||
|
|---|----------|------|-----------|------------|
|
||||||
|
| 4 | `executor.ts:70-88` | 剪贴板 | `pbcopy`/`pbpaste` | Win: PowerShell `Get/Set-Clipboard`;Linux: `xclip`/`wl-copy` |
|
||||||
|
| 5 | `drainRunLoop.ts:21` | CFRunLoop pump | `cu._drainMainRunLoop()` | 非 darwin:直接执行 fn(),不需要 pump |
|
||||||
|
| 6 | `escHotkey.ts:28` | ESC 热键 | CGEventTap | 非 darwin:返回 false(已有 Ctrl+C fallback) |
|
||||||
|
| 7 | `hostAdapter.ts:48-54` | 系统权限 | TCC accessibility + screenRecording | Win:直接 granted;Linux:检查 xdotool |
|
||||||
|
| 8 | `common.ts:56` | 平台标识 | `platform: 'darwin'` 硬编码 | 动态获取 |
|
||||||
|
| 9 | `executor.ts:180` | 粘贴快捷键 | `command+v` | Win/Linux:`ctrl+v` |
|
||||||
|
|
||||||
Computer Use 采用分级权限模型,保护你的安全:
|
### 2.4 缺失的 Linux 后端
|
||||||
|
|
||||||
| 级别 | 能力 | 适用场景 |
|
| 包 | macOS | Windows | Linux |
|
||||||
|------|------|---------|
|
|---|-------|---------|-------|
|
||||||
| **full** | 所有操作:鼠标点击(左/右/中键)、拖拽、键盘输入、组合键 | 系统设置、Finder 等系统应用 |
|
| `computer-use-input/backends/` | ✅ darwin.ts | ✅ win32.ts | ❌ 需新建 linux.ts |
|
||||||
| **click** | 仅左键点击和滚轮滚动 | IDE(VS Code、Cursor)、终端 |
|
| `computer-use-swift/backends/` | ✅ darwin.ts | ✅ win32.ts | ❌ 需新建 linux.ts |
|
||||||
| 未授权 | 所有操作被拒绝 | 需要通过 `request_access` 申请 |
|
|
||||||
|
|
||||||
IDE 类应用默认只有 click 权限,这是安全设计——防止 AI 在你的终端或编辑器中执行危险操作。如需完整控制,可以在权限对话框中手动提升。
|
## 3. 每个平台的能力依赖
|
||||||
|
|
||||||
## 可用操作
|
### 3.1 computer-use-input(键鼠)
|
||||||
|
|
||||||
### 鼠标
|
| 功能 | macOS | Windows | Linux |
|
||||||
|
|------|-------|---------|-------|
|
||||||
|
| 鼠标移动 | CGEvent JXA | SetCursorPos P/Invoke | xdotool mousemove |
|
||||||
|
| 鼠标点击 | CGEvent JXA | SendInput P/Invoke | xdotool click |
|
||||||
|
| 鼠标滚轮 | CGEvent JXA | SendInput MOUSEEVENTF_WHEEL | xdotool click 4/5(滚轮对应鼠标按钮 4=上、5=下) |
|
||||||
|
| 键盘按键 | System Events osascript | keybd_event P/Invoke | xdotool key |
|
||||||
|
| 组合键 | System Events osascript | keybd_event 组合 | xdotool key combo |
|
||||||
|
| 文本输入 | System Events keystroke | SendKeys.SendWait | xdotool type |
|
||||||
|
| 前台应用 | System Events osascript | GetForegroundWindow P/Invoke | xdotool getactivewindow + /proc |
|
||||||
|
| 工具依赖 | osascript(内置) | powershell(内置) | xdotool(需安装) |
|
||||||
|
|
||||||
| 操作 | 说明 |
|
### 3.2 computer-use-swift(截图 + 应用管理)
|
||||||
|------|------|
|
|
||||||
| 移动鼠标 | 移动到指定坐标 |
|
|
||||||
| 左键点击 | 单击、双击、三击 |
|
|
||||||
| 右键点击 | 需要 full 权限 |
|
|
||||||
| 中键点击 | 需要 full 权限 |
|
|
||||||
| 拖拽 | 从 A 点拖到 B 点,需要 full 权限 |
|
|
||||||
| 滚轮 | 向上或向下滚动 |
|
|
||||||
|
|
||||||
### 键盘
|
| 功能 | macOS | Windows | Linux |
|
||||||
|
|------|-------|---------|-------|
|
||||||
|
| 全屏截图 | screencapture | CopyFromScreen | gnome-screenshot / scrot / grim |
|
||||||
|
| 区域截图 | screencapture -R | CopyFromScreen(rect) | gnome-screenshot -a / scrot -a / grim -g |
|
||||||
|
| 显示器列表 | CGGetActiveDisplayList JXA | Screen.AllScreens | xrandr --query |
|
||||||
|
| 运行中应用 | System Events JXA | Get-Process | wmctrl -l / ps |
|
||||||
|
| 打开应用 | osascript activate | Start-Process | xdg-open / gtk-launch |
|
||||||
|
| 隐藏/显示 | System Events visibility | ShowWindow/SetForegroundWindow | xdotool windowminimize / windowactivate(注意:`wmctrl -c` 是关闭窗口,不是隐藏) |
|
||||||
|
| 工具依赖 | screencapture + osascript | powershell | xdotool + scrot/grim + wmctrl |
|
||||||
|
|
||||||
| 操作 | 说明 |
|
### 3.3 executor 层
|
||||||
|------|------|
|
|
||||||
| 按键 | 单个按键或组合键(如 Ctrl+C) |
|
|
||||||
| 输入文字 | 逐字符输入文本,需要 full 权限 |
|
|
||||||
| 长按 | 按住某个键一段时间,需要 full 权限 |
|
|
||||||
|
|
||||||
### 屏幕
|
| 功能 | macOS | Windows | Linux |
|
||||||
|
|------|-------|---------|-------|
|
||||||
|
| drainRunLoop | CFRunLoop pump | 不需要 | 不需要 |
|
||||||
|
| ESC 热键 | CGEventTap | 跳过(Ctrl+C fallback) | 跳过(Ctrl+C fallback) |
|
||||||
|
| 剪贴板读 | pbpaste | `powershell Get-Clipboard` | xclip -o / wl-paste |
|
||||||
|
| 剪贴板写 | pbcopy | `powershell Set-Clipboard` | xclip / wl-copy |
|
||||||
|
| 粘贴快捷键 | command+v | ctrl+v | ctrl+v |
|
||||||
|
| 终端检测 | __CFBundleIdentifier | WT_SESSION / TERM_PROGRAM | TERM_PROGRAM |
|
||||||
|
| 系统权限 | TCC check | 直接 granted | 检查 xdotool 安装 |
|
||||||
|
|
||||||
| 操作 | 说明 |
|
## 4. 执行步骤
|
||||||
|------|------|
|
|
||||||
| 截图 | 截取当前屏幕 |
|
|
||||||
| 切换显示器 | 多显示器环境下切换目标屏幕 |
|
|
||||||
| 缩放 | 放大屏幕某个区域 |
|
|
||||||
|
|
||||||
### 其他
|
### Phase 1:已完成 ✅
|
||||||
|
|
||||||
| 操作 | 说明 |
|
- [x] `@ant/computer-use-mcp` stub → 完整实现
|
||||||
|------|------|
|
- [x] `@ant/computer-use-input` dispatcher + darwin/win32 backends
|
||||||
| 获取鼠标位置 | 查询当前鼠标坐标 |
|
- [x] `@ant/computer-use-swift` dispatcher + darwin/win32 backends
|
||||||
| 批量操作 | 一次执行多个操作,减少等待 |
|
- [x] `CHICAGO_MCP` 编译开关
|
||||||
| 等待 | 暂停指定秒数(最长 100 秒) |
|
|
||||||
|
|
||||||
## macOS 权限配置
|
### Phase 2:移除 6 处 macOS 硬编码(解锁 macOS + Windows)
|
||||||
|
|
||||||
首次使用前,需要授予两项系统权限。缺少任一项都会导致功能异常(见下方说明)。
|
**改动原则:macOS 代码路径不变,只在每处 darwin 守卫后加 win32/linux 分支。**
|
||||||
|
|
||||||
### 辅助功能(Accessibility)
|
| 步骤 | 文件 | 改动 |
|
||||||
|
|------|------|------|
|
||||||
|
| 2.1 | `src/main.tsx:1605` | `getPlatform() === 'macos'` → 去掉平台限制,或改为 `!== 'unknown'` |
|
||||||
|
| 2.2 | `src/utils/computerUse/swiftLoader.ts:16-18` | 移除 `process.platform !== 'darwin'` throw。`@ant/computer-use-swift/index.ts` 已有跨平台 dispatch |
|
||||||
|
| 2.3 | `src/utils/computerUse/executor.ts:263-267` | 移除 `process.platform !== 'darwin'` throw。改为检查 input/swift isSupported |
|
||||||
|
| 2.4 | `src/utils/computerUse/executor.ts:70-88` | 剪贴板函数按平台分发:darwin→pbcopy/pbpaste,win32→PowerShell Get/Set-Clipboard,linux→xclip |
|
||||||
|
| 2.5 | `src/utils/computerUse/executor.ts:180` | `typeViaClipboard` 中 `command+v` → 非 darwin 时用 `ctrl+v` |
|
||||||
|
| 2.6 | `src/utils/computerUse/executor.ts:273` | `const cu = requireComputerUseSwift()` → 改为 `new ComputerUseAPI()`(从 package 直接实例化,不走 swiftLoader throw) |
|
||||||
|
| 2.7 | `src/utils/computerUse/drainRunLoop.ts` | 开头加 `if (process.platform !== 'darwin') return fn()` |
|
||||||
|
| 2.8 | `src/utils/computerUse/escHotkey.ts` | `registerEscHotkey` 非 darwin 返回 false(已有 Ctrl+C fallback) |
|
||||||
|
| 2.9 | `src/utils/computerUse/hostAdapter.ts:48-54` | `ensureOsPermissions` 非 darwin 返回 `{ granted: true }` |
|
||||||
|
| 2.10 | `src/utils/computerUse/common.ts:56` | `platform: 'darwin'` → `platform: process.platform === 'win32' ? 'windows' : process.platform === 'linux' ? 'linux' : 'darwin'` |
|
||||||
|
| 2.11 | `src/utils/computerUse/common.ts:55` | `screenshotFiltering: 'native'` → 非 darwin 时 `'none'`(Windows/Linux 截图不支持 per-app 过滤) |
|
||||||
|
| 2.12 | `src/utils/computerUse/gates.ts:13` | `enabled: false` → `enabled: true`(无 GrowthBook 时默认可用) |
|
||||||
|
| 2.13 | `src/utils/computerUse/gates.ts:39-43` | `hasRequiredSubscription()` → 直接返回 `true` |
|
||||||
|
|
||||||
允许 Claude 控制鼠标和键盘。
|
### Phase 3:新增 Linux 后端
|
||||||
|
|
||||||
1. 打开 **系统设置 → 隐私与安全性 → 辅助功能**
|
| 步骤 | 文件 | 内容 |
|
||||||
2. 点击左下角锁图标解锁(需要管理员密码)
|
|------|------|------|
|
||||||
3. 将运行 Claude Code 的应用添加到允许列表:
|
| 3.1 | `packages/@ant/computer-use-input/src/backends/linux.ts` | xdotool 键鼠(mousemove/click/key/type/getactivewindow) |
|
||||||
- Terminal → `Terminal.app`
|
| 3.2 | `packages/@ant/computer-use-swift/src/backends/linux.ts` | scrot/grim 截图 + xrandr 显示器 + wmctrl 窗口管理 |
|
||||||
- iTerm → `iTerm.app`
|
| 3.3 | `packages/@ant/computer-use-input/src/index.ts` | dispatcher 加 `case 'linux'` |
|
||||||
- Cursor → `Cursor.app`
|
| 3.4 | `packages/@ant/computer-use-swift/src/index.ts` | dispatcher 加 `case 'linux'` |
|
||||||
- VS Code 终端 → `Electron` 或 `Visual Studio Code.app`
|
|
||||||
4. 确保应用旁边的开关已打开
|
|
||||||
|
|
||||||
**未授予时的现象**:鼠标移动、点击、键盘输入均无反应,工具执行成功但屏幕没有任何变化。
|
### Phase 4:验证
|
||||||
|
|
||||||
### 屏幕录制(Screen Recording)
|
| 测试项 | macOS | Windows | Linux |
|
||||||
|
|--------|-------|---------|-------|
|
||||||
|
| build 成功 | ✅ | 验证 | 验证 |
|
||||||
|
| MCP 工具列表非空 | 验证 | 验证 | 验证 |
|
||||||
|
| 鼠标移动 | 验证 | ✅ 已通过 | 验证 |
|
||||||
|
| 截图 | 验证 | ✅ 已通过 | 验证 |
|
||||||
|
| 键盘输入 | 验证 | 验证 | 验证 |
|
||||||
|
| 前台窗口 | 验证 | ✅ 已通过 | 验证 |
|
||||||
|
| 剪贴板 | 验证 | 验证 | 验证 |
|
||||||
|
|
||||||
允许 Claude 截取屏幕内容。
|
## 5. 文件改动总览
|
||||||
|
|
||||||
1. 打开 **系统设置 → 隐私与安全性 → 屏幕录制**
|
### 不动的文件(14 个)
|
||||||
2. 将同一个应用添加到允许列表并开启开关
|
|
||||||
3. **需要重启该应用**才能生效(系统会提示 "xxx 需要重新打开")
|
|
||||||
|
|
||||||
**未授予时的现象**:截图工具执行成功但返回空白图片,Claude 无法看到你的屏幕,所有点击操作变成"盲点"。
|
`cleanup.ts`、`computerUseLock.ts`、`wrapper.tsx`、`toolRendering.tsx`、`mcpServer.ts`、`setup.ts`、`appNames.ts`、`inputLoader.ts`、`src/services/mcp/client.ts`、`@ant/computer-use-mcp/src/*`(Phase 1 已完成)、`backends/darwin.ts`(两个包都不动)
|
||||||
|
|
||||||
### 验证权限
|
### 改 src/ 的文件(8 个)
|
||||||
|
|
||||||
授予两项权限后,重启 Claude Code,在对话中让 Claude 截一张图即可验证是否配置成功。如果截图内容正常显示,说明权限配置完成。
|
| 文件 | 改动量 | 风险 |
|
||||||
|
|------|--------|------|
|
||||||
|
| `main.tsx` | 1 行 | 低 |
|
||||||
|
| `swiftLoader.ts` | 2 行 | 低 |
|
||||||
|
| `executor.ts` | ~40 行(剪贴板分发 + 平台守卫 + paste 快捷键) | **中** |
|
||||||
|
| `drainRunLoop.ts` | 1 行 | 低 |
|
||||||
|
| `escHotkey.ts` | 3 行 | 低 |
|
||||||
|
| `hostAdapter.ts` | 5 行 | 低 |
|
||||||
|
| `common.ts` | 3 行 | 低 |
|
||||||
|
| `gates.ts` | 3 行 | 低 |
|
||||||
|
|
||||||
## Linux 依赖(暂不可用)
|
### 新增文件(2 个)
|
||||||
|
|
||||||
Linux 后端尚未开发。完成后需要安装以下工具:
|
| 文件 | 行数估算 |
|
||||||
|
|------|---------|
|
||||||
|
| `packages/@ant/computer-use-input/src/backends/linux.ts` | ~150 行 |
|
||||||
|
| `packages/@ant/computer-use-swift/src/backends/linux.ts` | ~200 行 |
|
||||||
|
|
||||||
```bash
|
## 6. Linux 依赖工具
|
||||||
sudo apt install xdotool scrot xclip wmctrl
|
|
||||||
|
| 工具 | 用途 | 安装命令(Ubuntu) |
|
||||||
|
|------|------|-------------------|
|
||||||
|
| `xdotool` | 键鼠模拟 + 窗口管理 | `sudo apt install xdotool` |
|
||||||
|
| `scrot` 或 `gnome-screenshot` | 截图 | `sudo apt install scrot` |
|
||||||
|
| `xrandr` | 显示器信息 | 通常已预装 |
|
||||||
|
| `xclip` | 剪贴板 | `sudo apt install xclip` |
|
||||||
|
| `wmctrl` | 窗口列表/切换 | `sudo apt install wmctrl` |
|
||||||
|
|
||||||
|
Wayland 环境需要替代工具:`ydotool`(替代 xdotool)、`grim`(替代 scrot)、`wl-clipboard`(替代 xclip)。初期可先只支持 X11,Wayland 标记为 todo。
|
||||||
|
|
||||||
|
## 7. 执行顺序建议
|
||||||
|
|
||||||
|
```
|
||||||
|
Phase 2(解锁 macOS + Windows)
|
||||||
|
├── 2.1-2.3 移除 3 处硬编码 throw/skip
|
||||||
|
├── 2.4-2.5 剪贴板 + 粘贴快捷键平台分发
|
||||||
|
├── 2.6 swiftLoader → 直接实例化
|
||||||
|
├── 2.7-2.9 drainRunLoop / escHotkey / permissions 平台分支
|
||||||
|
├── 2.10-2.11 common.ts 平台标识动态化
|
||||||
|
├── 2.12-2.13 gates.ts 默认值
|
||||||
|
└── 验证 Windows
|
||||||
|
|
||||||
|
Phase 3(Linux 后端)
|
||||||
|
├── 3.1 input/backends/linux.ts
|
||||||
|
├── 3.2 swift/backends/linux.ts
|
||||||
|
├── 3.3-3.4 dispatcher 加 linux case
|
||||||
|
└── 验证 Linux
|
||||||
|
|
||||||
|
Phase 4(集成验证 + PR)
|
||||||
```
|
```
|
||||||
|
|
||||||
仅支持 X11,Wayland 不支持。
|
每个 Phase 可独立验证、独立提交。Phase 2 完成后 macOS + Windows 可用,Phase 3 完成后三平台全部可用。
|
||||||
|
|
||||||
## 常见问题
|
|
||||||
|
|
||||||
### 截图成功但看不到图片
|
|
||||||
|
|
||||||
检查 **系统设置 → 隐私与安全性 → 屏幕录制** 是否已授权。未授权时截图工具会执行成功但返回空白内容。
|
|
||||||
|
|
||||||
### IDE 中无法输入文字或右键
|
|
||||||
|
|
||||||
这是正常行为。IDE 类应用只有 click 权限,无法执行键盘输入、右键、拖拽等操作。如需完整控制,请在系统应用(如 Finder)中操作。
|
|
||||||
|
|
||||||
### 操作中途想停止
|
|
||||||
|
|
||||||
按 **Esc**(macOS)或 **Ctrl+C** 即可立即中止。
|
|
||||||
|
|||||||
@@ -1,33 +1,30 @@
|
|||||||
/**
|
/**
|
||||||
* @ant/computer-use-input — cross-platform keyboard & mouse simulation
|
* @ant/computer-use-input — macOS keyboard & mouse simulation (enigo)
|
||||||
*
|
*
|
||||||
* Platform backends:
|
* This package wraps the macOS-only native enigo .node module.
|
||||||
* - darwin: AppleScript/JXA via CoreGraphics events
|
* For Windows/Linux, use src/utils/computerUse/platforms/ instead.
|
||||||
* - win32: PowerShell via Win32 P/Invoke (SetCursorPos, SendInput, keybd_event)
|
|
||||||
*
|
|
||||||
* Add new platforms by creating backends/<platform>.ts implementing InputBackend.
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import type { FrontmostAppInfo, InputBackend } from './types.js'
|
export interface FrontmostAppInfo {
|
||||||
|
bundleId: string
|
||||||
|
appName: string
|
||||||
|
}
|
||||||
|
|
||||||
export type { FrontmostAppInfo, InputBackend } from './types.js'
|
export interface InputBackend {
|
||||||
|
moveMouse(x: number, y: number, animated: boolean): Promise<void>
|
||||||
// ---------------------------------------------------------------------------
|
key(key: string, action: 'press' | 'release'): Promise<void>
|
||||||
// Platform dispatch
|
keys(parts: string[]): Promise<void>
|
||||||
// ---------------------------------------------------------------------------
|
mouseLocation(): Promise<{ x: number; y: number }>
|
||||||
|
mouseButton(button: 'left' | 'right' | 'middle', action: 'click' | 'press' | 'release', count?: number): Promise<void>
|
||||||
|
mouseScroll(amount: number, direction: 'vertical' | 'horizontal'): Promise<void>
|
||||||
|
typeText(text: string): Promise<void>
|
||||||
|
getFrontmostAppInfo(): FrontmostAppInfo | null
|
||||||
|
}
|
||||||
|
|
||||||
function loadBackend(): InputBackend | null {
|
function loadBackend(): InputBackend | null {
|
||||||
|
if (process.platform !== 'darwin') return null
|
||||||
try {
|
try {
|
||||||
switch (process.platform) {
|
return require('./backends/darwin.js') as InputBackend
|
||||||
case 'darwin':
|
|
||||||
return require('./backends/darwin.js') as InputBackend
|
|
||||||
case 'win32':
|
|
||||||
return require('./backends/win32.js') as InputBackend
|
|
||||||
case 'linux':
|
|
||||||
return require('./backends/linux.js') as InputBackend
|
|
||||||
default:
|
|
||||||
return null
|
|
||||||
}
|
|
||||||
} catch {
|
} catch {
|
||||||
return null
|
return null
|
||||||
}
|
}
|
||||||
@@ -35,30 +32,16 @@ function loadBackend(): InputBackend | null {
|
|||||||
|
|
||||||
const backend = loadBackend()
|
const backend = loadBackend()
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// Unsupported stub (throws on call — guards via isSupported check)
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
function unsupported(): never {
|
|
||||||
throw new Error(`computer-use-input is not supported on ${process.platform}`)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// Public API — matches the original export surface
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
export const isSupported = backend !== null
|
export const isSupported = backend !== null
|
||||||
|
export const moveMouse = backend?.moveMouse
|
||||||
export const moveMouse = backend?.moveMouse ?? unsupported
|
export const key = backend?.key
|
||||||
export const key = backend?.key ?? unsupported
|
export const keys = backend?.keys
|
||||||
export const keys = backend?.keys ?? unsupported
|
export const mouseLocation = backend?.mouseLocation
|
||||||
export const mouseLocation = backend?.mouseLocation ?? unsupported
|
export const mouseButton = backend?.mouseButton
|
||||||
export const mouseButton = backend?.mouseButton ?? unsupported
|
export const mouseScroll = backend?.mouseScroll
|
||||||
export const mouseScroll = backend?.mouseScroll ?? unsupported
|
export const typeText = backend?.typeText
|
||||||
export const typeText = backend?.typeText ?? unsupported
|
|
||||||
export const getFrontmostAppInfo = backend?.getFrontmostAppInfo ?? (() => null)
|
export const getFrontmostAppInfo = backend?.getFrontmostAppInfo ?? (() => null)
|
||||||
|
|
||||||
// Legacy class type — used by inputLoader.ts for type narrowing
|
|
||||||
export class ComputerUseInputAPI {
|
export class ComputerUseInputAPI {
|
||||||
declare moveMouse: InputBackend['moveMouse']
|
declare moveMouse: InputBackend['moveMouse']
|
||||||
declare key: InputBackend['key']
|
declare key: InputBackend['key']
|
||||||
@@ -71,8 +54,5 @@ export class ComputerUseInputAPI {
|
|||||||
declare isSupported: true
|
declare isSupported: true
|
||||||
}
|
}
|
||||||
|
|
||||||
interface ComputerUseInputUnsupported {
|
interface ComputerUseInputUnsupported { isSupported: false }
|
||||||
isSupported: false
|
|
||||||
}
|
|
||||||
|
|
||||||
export type ComputerUseInput = ComputerUseInputAPI | ComputerUseInputUnsupported
|
export type ComputerUseInput = ComputerUseInputAPI | ComputerUseInputUnsupported
|
||||||
|
|||||||
@@ -16,6 +16,8 @@ export interface ScreenshotResult {
|
|||||||
originX: number
|
originX: number
|
||||||
originY: number
|
originY: number
|
||||||
displayId?: number
|
displayId?: number
|
||||||
|
/** Accessibility snapshot — structured GUI element tree as model-friendly text. Windows only. */
|
||||||
|
accessibilityText?: string
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface FrontmostApp {
|
export interface FrontmostApp {
|
||||||
@@ -108,4 +110,59 @@ export interface ComputerExecutor {
|
|||||||
getAppIcon(path: string): Promise<string | undefined>
|
getAppIcon(path: string): Promise<string | undefined>
|
||||||
listRunningApps(): Promise<RunningApp[]>
|
listRunningApps(): Promise<RunningApp[]>
|
||||||
openApp(bundleId: string): Promise<void>
|
openApp(bundleId: string): Promise<void>
|
||||||
|
|
||||||
|
// ── Window management (Windows only, optional) ──────────────────────────
|
||||||
|
/** Perform a window management action on the bound window. Win32 API only — no global shortcuts. */
|
||||||
|
manageWindow?(action: string, opts?: { x?: number; y?: number; width?: number; height?: number }): Promise<boolean>
|
||||||
|
/** Get the current window rect of the bound window */
|
||||||
|
getWindowRect?(): Promise<{ x: number; y: number; width: number; height: number } | null>
|
||||||
|
|
||||||
|
// ── Terminal launch, window binding, virtual input & element-targeted actions (Windows, optional) ──
|
||||||
|
/** Open terminal and launch an agent CLI */
|
||||||
|
openTerminal?(opts: {
|
||||||
|
agent: 'claude' | 'codex' | 'gemini' | 'custom'
|
||||||
|
command?: string
|
||||||
|
terminal?: 'wt' | 'powershell' | 'cmd'
|
||||||
|
workingDirectory?: string
|
||||||
|
}): Promise<{ hwnd: string; title: string; launched: boolean } | null>
|
||||||
|
/** Bind to a window by hwnd/title/pid. Returns bound window info or null. */
|
||||||
|
bindToWindow?(query: { hwnd?: string; title?: string; pid?: number }): Promise<{ hwnd: string; title: string; pid: number } | null>
|
||||||
|
/** Unbind from the current window */
|
||||||
|
unbindFromWindow?(): Promise<void>
|
||||||
|
/** Cheap binding-state check for window-targeted routing decisions. */
|
||||||
|
hasBoundWindow?(): Promise<boolean>
|
||||||
|
/** Get current binding status */
|
||||||
|
getBindingStatus?(): Promise<{ bound: boolean; hwnd?: string; title?: string; pid?: number; rect?: { x: number; y: number; width: number; height: number } } | null>
|
||||||
|
/** List all visible windows */
|
||||||
|
listVisibleWindows?(): Promise<Array<{ hwnd: string; pid: number; title: string }>>
|
||||||
|
/** Control the status indicator overlay */
|
||||||
|
statusIndicator?(action: 'show' | 'hide' | 'status', message?: string): Promise<{ active: boolean; message?: string }>
|
||||||
|
/** Virtual keyboard — send keys/text/combos to bound window only */
|
||||||
|
virtualKeyboard?(opts: {
|
||||||
|
action: 'type' | 'combo' | 'press' | 'release' | 'hold'
|
||||||
|
text: string
|
||||||
|
duration?: number
|
||||||
|
repeat?: number
|
||||||
|
}): Promise<boolean>
|
||||||
|
/** Virtual mouse — click/move/drag on bound window only */
|
||||||
|
virtualMouse?(opts: {
|
||||||
|
action: 'click' | 'double_click' | 'right_click' | 'move' | 'drag' | 'down' | 'up'
|
||||||
|
x: number; y: number
|
||||||
|
startX?: number; startY?: number
|
||||||
|
}): Promise<boolean>
|
||||||
|
/** Mouse wheel scroll at client coordinates (works on Excel, browsers, modern UI) */
|
||||||
|
mouseWheel?(x: number, y: number, delta: number, horizontal?: boolean): Promise<boolean>
|
||||||
|
/** Activate the bound window (foreground + click to focus) */
|
||||||
|
activateWindow?(clickX?: number, clickY?: number): Promise<boolean>
|
||||||
|
/** Handle a terminal prompt (yes/no/select/type + enter) */
|
||||||
|
respondToPrompt?(opts: {
|
||||||
|
responseType: 'yes' | 'no' | 'enter' | 'escape' | 'select' | 'type'
|
||||||
|
arrowDirection?: 'up' | 'down'
|
||||||
|
arrowCount?: number
|
||||||
|
text?: string
|
||||||
|
}): Promise<boolean>
|
||||||
|
/** Click an element by name/role/automationId via UI Automation */
|
||||||
|
clickElement?(query: { name?: string; role?: string; automationId?: string }): Promise<boolean>
|
||||||
|
/** Type text into an element by name/role/automationId via UI Automation ValuePattern */
|
||||||
|
typeIntoElement?(query: { name?: string; role?: string; automationId?: string }, text: string): Promise<boolean>
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -434,6 +434,15 @@ async function runInputActionGates(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Windows/Linux: operations go through SendMessage (HWND-bound) or platform
|
||||||
|
// abstraction, not global input to the foreground. The frontmost gate is a
|
||||||
|
// macOS safety net for global CGEvent input — on other platforms, skip it
|
||||||
|
// when the platform's screenshotFiltering is 'none' (no per-app filtering,
|
||||||
|
// meaning no hide/defocus, meaning frontmost is meaningless).
|
||||||
|
if (adapter.executor.capabilities.screenshotFiltering === 'none') {
|
||||||
|
return null; // pass — non-macOS platform, frontmost irrelevant
|
||||||
|
}
|
||||||
|
|
||||||
// Frontmost gate. Check FRESH on every call.
|
// Frontmost gate. Check FRESH on every call.
|
||||||
const frontmost = await adapter.executor.getFrontmostApp();
|
const frontmost = await adapter.executor.getFrontmostApp();
|
||||||
|
|
||||||
@@ -561,6 +570,13 @@ async function runHitTestGate(
|
|||||||
y: number,
|
y: number,
|
||||||
actionKind: CuActionKind,
|
actionKind: CuActionKind,
|
||||||
): Promise<CuCallToolResult | null> {
|
): Promise<CuCallToolResult | null> {
|
||||||
|
// Non-macOS: HWND-bound mode — clicks go to the bound window via
|
||||||
|
// SendMessage with window-relative coordinates. Hit-test against the
|
||||||
|
// real screen is meaningless.
|
||||||
|
if (adapter.executor.capabilities.screenshotFiltering === 'none') {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
const target = await adapter.executor.appUnderPoint(x, y);
|
const target = await adapter.executor.appUnderPoint(x, y);
|
||||||
if (!target) return null; // desktop / nothing under point / platform no-op
|
if (!target) return null; // desktop / nothing under point / platform no-op
|
||||||
|
|
||||||
@@ -796,12 +812,12 @@ function resolveRequestedApps(
|
|||||||
if (!resolved) {
|
if (!resolved) {
|
||||||
resolved = byLowerDisplayName.get(requested.toLowerCase());
|
resolved = byLowerDisplayName.get(requested.toLowerCase());
|
||||||
}
|
}
|
||||||
// Fuzzy fallback: match requested name as substring of display name
|
// Windows fuzzy matching: strip .exe suffix, try substring match
|
||||||
// e.g. "Chrome" matches "Google Chrome", "Code" matches "Visual Studio Code"
|
|
||||||
if (!resolved) {
|
if (!resolved) {
|
||||||
const lower = requested.toLowerCase();
|
const clean = requested.toLowerCase().replace(/\.exe$/, '').trim();
|
||||||
for (const app of installed) {
|
// Try: "chrome" matches "Google Chrome", "notepad" matches "Notepad"
|
||||||
if (app.displayName.toLowerCase().includes(lower)) {
|
for (const [name, app] of byLowerDisplayName) {
|
||||||
|
if (name.includes(clean) || clean.includes(name)) {
|
||||||
resolved = app;
|
resolved = app;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -2137,6 +2153,8 @@ async function handleScreenshot(
|
|||||||
content: [
|
content: [
|
||||||
...(monitorNote ? [{ type: "text" as const, text: monitorNote }] : []),
|
...(monitorNote ? [{ type: "text" as const, text: monitorNote }] : []),
|
||||||
...(hiddenNote ? [{ type: "text" as const, text: hiddenNote }] : []),
|
...(hiddenNote ? [{ type: "text" as const, text: hiddenNote }] : []),
|
||||||
|
// Accessibility snapshot: structured GUI element tree (Windows bound-window mode)
|
||||||
|
...(shot.accessibilityText ? [{ type: "text" as const, text: `GUI elements in this window:\n${shot.accessibilityText}` }] : []),
|
||||||
{
|
{
|
||||||
type: "image",
|
type: "image",
|
||||||
data: shot.base64,
|
data: shot.base64,
|
||||||
@@ -2204,6 +2222,8 @@ async function handleScreenshot(
|
|||||||
content: [
|
content: [
|
||||||
...(monitorNote ? [{ type: "text" as const, text: monitorNote }] : []),
|
...(monitorNote ? [{ type: "text" as const, text: monitorNote }] : []),
|
||||||
...(hiddenNote ? [{ type: "text" as const, text: hiddenNote }] : []),
|
...(hiddenNote ? [{ type: "text" as const, text: hiddenNote }] : []),
|
||||||
|
// Accessibility snapshot: structured GUI element tree (Windows bound-window mode)
|
||||||
|
...(shot.accessibilityText ? [{ type: "text" as const, text: `GUI elements in this window:\n${shot.accessibilityText}` }] : []),
|
||||||
{
|
{
|
||||||
type: "image",
|
type: "image",
|
||||||
data: shot.base64,
|
data: shot.base64,
|
||||||
@@ -2812,6 +2832,443 @@ async function handleOpenApplication(
|
|||||||
return okText(`Opened "${app}".`);
|
return okText(`Opened "${app}".`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Handle the `virtual_mouse` tool: forward one mouse action to the executor's
 * optional `virtualMouse` method.
 *
 * Arguments read from `args`:
 * - `action` — click | double_click | right_click | move | drag | down | up.
 * - `coordinate` — `[x, y]`; both entries must be finite numbers.
 * - `start_coordinate` — optional `[x, y]`, forwarded as `startX`/`startY`.
 *
 * @returns `feature_unavailable` when the executor has no `virtualMouse`,
 *   `bad_args` for malformed input or when the executor returns a falsy
 *   result, otherwise a one-line description of the action.
 */
async function handleVirtualMouse(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  const MOUSE_ACTIONS = ["click", "double_click", "right_click", "move", "drag", "down", "up"] as const;
  type MouseAction = (typeof MOUSE_ACTIONS)[number];
  const isMouseAction = (value: string): value is MouseAction =>
    (MOUSE_ACTIONS as readonly string[]).includes(value);
  const isFiniteNumber = (value: unknown): value is number =>
    typeof value === "number" && Number.isFinite(value);

  if (!adapter.executor.virtualMouse) {
    return errorResult("virtual_mouse is only available on Windows with a bound window.", "feature_unavailable");
  }

  const action = requireString(args, "action");
  if (action instanceof Error) return errorResult(action.message, "bad_args");

  const rawCoord: unknown = args.coordinate;
  if (!Array.isArray(rawCoord) || rawCoord.length < 2) {
    return errorResult("coordinate [x, y] is required.", "bad_args");
  }
  // The tool arguments are untyped: reject strings / NaN / Infinity here
  // instead of handing them to the executor as coordinates.
  const x: unknown = rawCoord[0];
  const y: unknown = rawCoord[1];
  if (!isFiniteNumber(x) || !isFiniteNumber(y)) {
    return errorResult("coordinate must contain two finite numbers [x, y].", "bad_args");
  }

  if (!isMouseAction(action)) {
    return errorResult(`Invalid action "${action}". Valid: ${MOUSE_ACTIONS.join(", ")}`, "bad_args");
  }

  // A non-array start_coordinate is ignored (as before); an array must be a
  // well-formed point.
  let start: [number, number] | undefined;
  const rawStart: unknown = args.start_coordinate;
  if (Array.isArray(rawStart)) {
    const sx: unknown = rawStart[0];
    const sy: unknown = rawStart[1];
    if (!isFiniteNumber(sx) || !isFiniteNumber(sy)) {
      return errorResult("start_coordinate must contain two finite numbers [x, y].", "bad_args");
    }
    start = [sx, sy];
  }

  const ok = await adapter.executor.virtualMouse({
    action,
    x,
    y,
    startX: start?.[0],
    startY: start?.[1],
  });
  if (!ok) {
    return errorResult("No window is currently bound.", "bad_args");
  }

  const at = `(${x},${y})`;
  const descriptions: Record<MouseAction, string> = {
    click: `Click at ${at}`,
    double_click: `Double-click at ${at}`,
    right_click: `Right-click at ${at}`,
    move: `Moved to ${at}`,
    drag: `Dragged ${start ? `(${start[0]},${start[1]})` : "current"} → ${at}`,
    down: `Button down at ${at}`,
    up: `Button up at ${at}`,
  };
  return okText(descriptions[action]);
}
|
||||||
|
|
||||||
|
/**
 * Tool handler for `virtual_keyboard`: passes one keyboard action to the
 * executor's optional `virtualKeyboard` method and reports what was sent.
 *
 * Reads `action` and `text` (required strings) plus optional numeric
 * `duration` and `repeat` from `args`. A falsy executor result is reported as
 * "no window bound".
 */
async function handleVirtualKeyboard(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  const KEY_ACTIONS = ["type", "combo", "press", "release", "hold"] as const;
  type KeyAction = (typeof KEY_ACTIONS)[number];
  const isKeyAction = (value: string): value is KeyAction =>
    (KEY_ACTIONS as readonly string[]).includes(value);

  if (!adapter.executor.virtualKeyboard) {
    return errorResult("virtual_keyboard is only available on Windows with a bound window.", "feature_unavailable");
  }

  const action = requireString(args, "action");
  if (action instanceof Error) {
    return errorResult(action.message, "bad_args");
  }
  const text = requireString(args, "text");
  if (text instanceof Error) {
    return errorResult(text.message, "bad_args");
  }
  if (!isKeyAction(action)) {
    return errorResult(`Invalid action "${action}". Valid: ${KEY_ACTIONS.join(", ")}`, "bad_args");
  }

  // Optional numeric knobs: anything that is not a number is dropped.
  let duration: number | undefined;
  if (typeof args.duration === "number") duration = args.duration;
  let repeat: number | undefined;
  if (typeof args.repeat === "number") repeat = args.repeat;

  const sent = await adapter.executor.virtualKeyboard({ action, text, duration, repeat });
  if (!sent) {
    return errorResult("No window is currently bound. Use open_application or bind_window first.", "bad_args");
  }

  let summary: string;
  switch (action) {
    case "type": {
      // Long payloads are abbreviated in the echo.
      const preview = text.length > 40 ? text.slice(0, 40) + "..." : text;
      summary = `Typed "${preview}"`;
      break;
    }
    case "combo":
      summary = `Sent ${text}`;
      break;
    case "press":
      summary = `Pressed ${text} (holding)`;
      break;
    case "release":
      summary = `Released ${text}`;
      break;
    case "hold":
      summary = `Held ${text} for ${duration ?? 1}s`;
      break;
  }

  const repeatSuffix = repeat !== undefined && repeat > 1 ? ` ×${repeat}` : "";
  return okText(`${summary}${repeatSuffix}`);
}
|
||||||
|
|
||||||
|
/**
 * Tool handler for `status_indicator`: shows, hides, or queries the status
 * overlay through the executor's optional `statusIndicator` method.
 *
 * `action` must be show | hide | status; `show` additionally needs a
 * non-empty string `message`.
 */
async function handleStatusIndicator(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.statusIndicator) {
    return errorResult("status_indicator is only available on Windows.", "feature_unavailable");
  }

  const action = requireString(args, "action");
  if (action instanceof Error) {
    return errorResult(action.message, "bad_args");
  }
  if (action !== "show" && action !== "hide" && action !== "status") {
    return errorResult(`Invalid action "${action}". Valid: show, hide, status.`, "bad_args");
  }

  let message: string | undefined;
  if (typeof args.message === "string") {
    message = args.message;
  }
  if (action === "show" && !message) {
    return errorResult("'show' requires a message parameter.", "bad_args");
  }

  const result = await adapter.executor.statusIndicator(action, message);

  switch (action) {
    case "status":
      return okText(
        result.active
          ? "Indicator is active on the bound window."
          : "Indicator is not active (no window bound).",
      );
    case "show":
      return okText(`Indicator showing: "${message}"`);
    default:
      return okText("Indicator hidden.");
  }
}
|
||||||
|
|
||||||
|
/**
 * Handle the `mouse_wheel` tool: send a wheel scroll to the executor's
 * optional `mouseWheel` method.
 *
 * Arguments read from `args`:
 * - `coordinate` — `[x, y]`; both entries must be finite numbers.
 * - `delta` — required finite number (positive=up, negative=down).
 * - `direction` — `"horizontal"` selects horizontal scrolling; any other
 *   value means vertical.
 *
 * @returns `feature_unavailable` when the executor has no `mouseWheel`,
 *   `bad_args` for malformed input or a falsy executor result, otherwise a
 *   text summary of the scroll.
 */
async function handleMouseWheel(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  const isFiniteNumber = (value: unknown): value is number =>
    typeof value === "number" && Number.isFinite(value);

  if (!adapter.executor.mouseWheel) {
    return errorResult("mouse_wheel is only available on Windows with a bound window.", "feature_unavailable");
  }

  const rawCoord: unknown = args.coordinate;
  if (!Array.isArray(rawCoord) || rawCoord.length < 2) {
    return errorResult("coordinate must be [x, y] array.", "bad_args");
  }
  // Array.isArray says nothing about the element types; check them before
  // passing them on as coordinates.
  const x: unknown = rawCoord[0];
  const y: unknown = rawCoord[1];
  if (!isFiniteNumber(x) || !isFiniteNumber(y)) {
    return errorResult("coordinate must contain two finite numbers [x, y].", "bad_args");
  }

  // typeof NaN === "number", so require a finite value explicitly.
  const delta = isFiniteNumber(args.delta) ? args.delta : undefined;
  if (delta === undefined) {
    return errorResult("delta is required (positive=up, negative=down).", "bad_args");
  }

  const horizontal = args.direction === "horizontal";
  const ok = await adapter.executor.mouseWheel(x, y, delta, horizontal);
  if (!ok) {
    return errorResult("No window is currently bound. Use open_application or bind_window first.", "bad_args");
  }

  // NOTE(review): the summary says "up"/"down" even for horizontal scrolls;
  // the executor's sign convention for horizontal deltas is not visible here,
  // so the wording is left unchanged — confirm and adjust if needed.
  const axis = horizontal ? "horizontal" : "vertical";
  const sense = delta > 0 ? "up" : "down";
  return okText(
    `Mouse wheel: ${axis} scroll ${sense} ${Math.abs(delta)} click(s) at (${x},${y}).`,
  );
}
|
||||||
|
|
||||||
|
/**
 * Tool handler for `activate_window`: asks the executor's optional
 * `activateWindow` method to activate the bound window, forwarding the
 * optional numeric `click_x` / `click_y` arguments.
 */
async function handleActivateWindow(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.activateWindow) {
    return errorResult("activate_window is only available on Windows with a bound window.", "feature_unavailable");
  }

  // Non-numeric click coordinates are treated as "not provided".
  const numberOrUndefined = (value: unknown): number | undefined =>
    typeof value === "number" ? value : undefined;

  const activated = await adapter.executor.activateWindow(
    numberOrUndefined(args.click_x),
    numberOrUndefined(args.click_y),
  );

  return activated
    ? okText("Window activated and focused. Ready for input.")
    : errorResult("No window is currently bound. Use open_application or bind_window first.", "bad_args");
}
|
||||||
|
|
||||||
|
/**
 * Handle the `prompt_respond` tool: answer a terminal prompt through the
 * executor's optional `respondToPrompt` method.
 *
 * Arguments read from `args`:
 * - `response_type` — yes | no | enter | escape | select | type.
 * - `arrow_count` — number, required for `select`.
 * - `arrow_direction` — optional, must be `"up"` or `"down"` when a string.
 * - `text` — string, required for `type`.
 *
 * @returns `feature_unavailable` when the executor has no `respondToPrompt`,
 *   `bad_args` for malformed input or a falsy executor result, otherwise a
 *   text summary of the keys sent.
 */
async function handlePromptRespond(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  const RESPONSE_TYPES = ["yes", "no", "enter", "escape", "select", "type"] as const;
  type ResponseType = (typeof RESPONSE_TYPES)[number];
  const isResponseType = (value: string): value is ResponseType =>
    (RESPONSE_TYPES as readonly string[]).includes(value);

  if (!adapter.executor.respondToPrompt) {
    return errorResult("prompt_respond is only available on Windows with a bound window.", "feature_unavailable");
  }

  const responseType = requireString(args, "response_type");
  if (responseType instanceof Error) return errorResult(responseType.message, "bad_args");
  if (!isResponseType(responseType)) {
    return errorResult(`Invalid response_type "${responseType}". Valid: ${RESPONSE_TYPES.join(", ")}`, "bad_args");
  }

  const arrowCount = typeof args.arrow_count === "number" ? args.arrow_count : undefined;
  const text = typeof args.text === "string" ? args.text : undefined;
  if (responseType === "select" && arrowCount === undefined) {
    return errorResult("'select' requires arrow_count parameter.", "bad_args");
  }
  if (responseType === "type" && text === undefined) {
    return errorResult("'type' requires text parameter.", "bad_args");
  }

  // Only "up"/"down" are valid directions; previously any string was cast
  // and forwarded to the executor unchecked.
  let arrowDirection: "up" | "down" | undefined;
  const rawDirection: unknown = args.arrow_direction;
  if (rawDirection === "up" || rawDirection === "down") {
    arrowDirection = rawDirection;
  } else if (typeof rawDirection === "string") {
    return errorResult(`Invalid arrow_direction "${rawDirection}". Valid: up, down`, "bad_args");
  }

  const ok = await adapter.executor.respondToPrompt({
    responseType,
    arrowDirection,
    arrowCount,
    text,
  });
  if (!ok) {
    return errorResult("No window is currently bound. Use open_application or bind_window first.", "bad_args");
  }

  const descriptions: Record<ResponseType, string> = {
    yes: "Sent 'y' + Enter.",
    no: "Sent 'n' + Enter.",
    enter: "Sent Enter.",
    escape: "Sent Escape.",
    select: `Navigated ${arrowDirection ?? "down"} ${arrowCount ?? 1} time(s) + Enter.`,
    type: `Typed "${text}" + Enter.`,
  };

  // Each description already ends with a period, so none is added before the
  // follow-up sentence (the old text rendered as "Sent Enter.. Take a ...").
  return okText(`Prompt responded: ${descriptions[responseType]} Take a screenshot to verify.`);
}
|
||||||
|
|
||||||
|
/**
 * Tool handler for `open_terminal`: opens a terminal through the executor's
 * optional `openTerminal` method and reports the resulting window.
 *
 * `agent` must be claude | codex | gemini | custom; `custom` additionally
 * requires a string `command`. Optional string arguments `terminal` and
 * `working_directory` are forwarded as given.
 */
async function handleOpenTerminal(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.openTerminal) {
    return errorResult("open_terminal is only available on Windows.", "feature_unavailable");
  }

  const agent = requireString(args, "agent");
  if (agent instanceof Error) {
    return errorResult(agent.message, "bad_args");
  }

  const knownAgent =
    agent === "claude" || agent === "codex" || agent === "gemini" || agent === "custom";
  if (!knownAgent) {
    return errorResult(`Invalid agent "${agent}". Valid: claude, codex, gemini, custom.`, "bad_args");
  }

  // Pick a string-valued argument, or undefined when absent / not a string.
  const stringArg = (key: string): string | undefined => {
    const value = args[key];
    return typeof value === "string" ? value : undefined;
  };

  const command = stringArg("command");
  if (agent === "custom" && command === undefined) {
    return errorResult("agent='custom' requires 'command' parameter.", "bad_args");
  }

  // The option types of openTerminal are declared outside this block, so the
  // casts on `agent` and `terminal` are kept as in the original call.
  const opened = await adapter.executor.openTerminal({
    agent: agent as any,
    command,
    terminal: stringArg("terminal") as any,
    workingDirectory: stringArg("working_directory"),
  });

  if (!opened) {
    return errorResult(
      "Failed to open terminal. Windows Terminal (wt.exe) may not be installed.",
      "launch_failed",
    );
  }
  if (!opened.launched) {
    return okText(
      `Terminal opened (hwnd=${opened.hwnd}, "${opened.title}") but no command was sent. Window is now bound.`,
    );
  }

  let launchedName: string;
  switch (agent) {
    case "claude":
      launchedName = "Claude Code";
      break;
    case "codex":
      launchedName = "Codex";
      break;
    case "gemini":
      launchedName = "Gemini";
      break;
    default:
      // agent === "custom": the command itself is the display name.
      launchedName = command ?? agent;
  }
  const sentCommand = agent === "custom" ? command : agent;

  return okText(
    [
      `Terminal opened and ${launchedName} launched.`,
      `Window: hwnd=${opened.hwnd} "${opened.title}"`,
      `Command: '${sentCommand}' + Enter`,
      `Status: bound to this terminal. Take a screenshot to verify the agent started.`,
    ].join("\n"),
  );
}
|
||||||
|
|
||||||
|
/**
 * Tool handler for `bind_window`. Dispatches on the required `action` string:
 * - `list`   — enumerate windows via the executor's `listVisibleWindows`.
 * - `status` — describe the current binding via `getBindingStatus`.
 * - `bind`   — bind by `title`, `hwnd` and/or `pid` via `bindToWindow`.
 * - `unbind` — release the binding via `unbindFromWindow`.
 *
 * Each action answers `feature_unavailable` when the executor does not
 * implement the method it needs; unknown actions answer `bad_args`.
 */
async function handleBindWindow(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  const action = requireString(args, "action");
  if (action instanceof Error) {
    return errorResult(action.message, "bad_args");
  }

  if (action === "list") {
    if (!adapter.executor.listVisibleWindows) {
      return errorResult("bind_window is only available on Windows.", "feature_unavailable");
    }
    const found = await adapter.executor.listVisibleWindows();
    if (found.length === 0) {
      return okText("No visible windows found.");
    }
    const rows: string[] = [];
    for (const win of found) {
      rows.push(`hwnd=${win.hwnd} pid=${win.pid} "${win.title}"`);
    }
    return okText(`Visible windows (${found.length}):\n${rows.join("\n")}`);
  }

  if (action === "status") {
    if (!adapter.executor.getBindingStatus) {
      return errorResult("bind_window is only available on Windows.", "feature_unavailable");
    }
    const binding = await adapter.executor.getBindingStatus();
    if (!binding || !binding.bound) {
      return okText("No window is currently bound. Use bind_window(action='list') to see available windows, then bind_window(action='bind', title='...') to bind.");
    }
    // Optional details are appended only when present (truthy).
    const pieces = [`Bound to: hwnd=${binding.hwnd}`];
    if (binding.title) pieces.push(` "${binding.title}"`);
    if (binding.pid) pieces.push(` pid=${binding.pid}`);
    if (binding.rect) {
      pieces.push(` rect=(${binding.rect.x},${binding.rect.y} ${binding.rect.width}x${binding.rect.height})`);
    }
    return okText(pieces.join(""));
  }

  if (action === "bind") {
    if (!adapter.executor.bindToWindow) {
      return errorResult("bind_window is only available on Windows.", "feature_unavailable");
    }
    const title = typeof args.title === "string" ? args.title : undefined;
    const hwnd = typeof args.hwnd === "string" ? args.hwnd : undefined;
    const pid = typeof args.pid === "number" ? args.pid : undefined;
    if (!(title || hwnd || pid)) {
      return errorResult("Specify at least one of: title, hwnd, or pid.", "bad_args");
    }
    const bound = await adapter.executor.bindToWindow({ hwnd, title, pid });
    if (!bound) {
      const criteria: string[] = [];
      if (title) criteria.push(`title="${title}"`);
      if (hwnd) criteria.push(`hwnd=${hwnd}`);
      if (pid) criteria.push(`pid=${pid}`);
      return errorResult(
        `No window found matching: ${criteria.join(", ")}. Use bind_window(action='list') to see available windows.`,
        "element_not_found",
      );
    }
    return okText(`Bound to window: hwnd=${bound.hwnd} pid=${bound.pid} "${bound.title}". All subsequent screenshot/click/type operations target this window.`);
  }

  if (action === "unbind") {
    if (!adapter.executor.unbindFromWindow) {
      return errorResult("bind_window is only available on Windows.", "feature_unavailable");
    }
    await adapter.executor.unbindFromWindow();
    return okText("Window binding released. Operations now target the full screen.");
  }

  return errorResult(`Unknown bind_window action "${action}". Valid: list, bind, unbind, status.`, "bad_args");
}
|
||||||
|
|
||||||
|
/**
 * Tool handler for `click_element`: clicks an element selected by `name`,
 * `role` and/or `automationId` through the executor's optional
 * `clickElement` method. At least one selector must be a non-empty string.
 */
async function handleClickElement(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.clickElement) {
    return errorResult(
      "click_element is only available on Windows with a bound window.",
      "feature_unavailable",
    );
  }

  // Selector fields: kept only when supplied as strings.
  const pickString = (key: string): string | undefined => {
    const value = args[key];
    return typeof value === "string" ? value : undefined;
  };
  const name = pickString("name");
  const role = pickString("role");
  const automationId = pickString("automationId");

  if (!(name || role || automationId)) {
    return errorResult("At least one of name, role, or automationId is required.", "bad_args");
  }

  const clicked = await adapter.executor.clickElement({ name, role, automationId });

  if (clicked) {
    const label: string[] = [];
    if (name) label.push(`"${name}"`);
    if (role) label.push(role);
    if (automationId) label.push(automationId);
    return okText(`Clicked element: ${label.join(" ")}`);
  }

  const criteria: string[] = [];
  if (name) criteria.push(`name="${name}"`);
  if (role) criteria.push(`role=${role}`);
  if (automationId) criteria.push(`id=${automationId}`);
  return errorResult(
    `Element not found: ${criteria.join(", ")}. Take a screenshot to see current GUI elements.`,
    "element_not_found",
  );
}
|
||||||
|
|
||||||
|
/**
 * Handle the `type_into_element` tool: write `text` into an element selected
 * by `name`, `role` and/or `automationId` through the executor's optional
 * `typeIntoElement` method.
 *
 * @returns `feature_unavailable` when the executor has no `typeIntoElement`,
 *   `bad_args` when `text` is missing or no selector is given,
 *   `element_not_found` when the executor returns a falsy result, otherwise a
 *   summary with the number of characters typed.
 */
async function handleTypeIntoElement(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  if (!adapter.executor.typeIntoElement) {
    return errorResult(
      "type_into_element is only available on Windows with a bound window.",
      "feature_unavailable",
    );
  }

  const text = requireString(args, "text");
  if (text instanceof Error) return errorResult(text.message, "bad_args");

  const name = typeof args.name === "string" ? args.name : undefined;
  const role = typeof args.role === "string" ? args.role : undefined;
  const automationId = typeof args.automationId === "string" ? args.automationId : undefined;

  // Same guard as click_element: never hand the executor an empty selector,
  // which would leave the target element unspecified.
  if (!name && !role && !automationId) {
    return errorResult("At least one of name, role, or automationId is required.", "bad_args");
  }

  const ok = await adapter.executor.typeIntoElement({ name, role, automationId }, text);
  if (!ok) {
    const criteria = [name && `name="${name}"`, role && `role=${role}`, automationId && `id=${automationId}`]
      .filter(Boolean)
      .join(", ");
    return errorResult(
      `Could not type into element: ${criteria}. The element was not found or doesn't support text input.`,
      "element_not_found",
    );
  }

  const target = [name && `"${name}"`, role, automationId].filter(Boolean).join(" ");
  return okText(`Typed ${text.length} chars into: ${target}`);
}
|
||||||
|
|
||||||
|
/**
 * Tool handler for `window_management`. Validates `action`, then:
 * - `get_rect`    — reads the rectangle via the executor's `getWindowRect`.
 * - `move_resize` — calls `manageWindow` with numeric `x`, `y` (required)
 *                   and `width`, `height` (optional).
 * - anything else — calls `manageWindow(action)` with no options.
 *
 * A falsy executor result is reported as "no window bound".
 */
async function handleWindowManagement(
  adapter: ComputerUseHostAdapter,
  args: Record<string, unknown>,
): Promise<CuCallToolResult> {
  const action = requireString(args, "action");
  if (action instanceof Error) {
    return errorResult(action.message, "bad_args");
  }

  const knownActions = [
    "minimize", "maximize", "restore", "close", "focus", "move_offscreen", "move_resize", "get_rect",
  ];
  if (!knownActions.includes(action)) {
    return errorResult(
      `Unknown window_management action "${action}". Valid: ${knownActions.join(", ")}`,
      "bad_args",
    );
  }

  if (!adapter.executor.manageWindow) {
    return errorResult(
      "window_management is only available on Windows with a bound window.",
      "feature_unavailable",
    );
  }

  // Read a numeric argument; anything that is not a number counts as absent.
  const numberArg = (key: string): number | undefined => {
    const value = args[key];
    return typeof value === "number" ? value : undefined;
  };

  if (action === "get_rect") {
    // Pure query: report position and size of the bound window.
    if (!adapter.executor.getWindowRect) {
      return errorResult("getWindowRect not available.", "feature_unavailable");
    }
    const rect = await adapter.executor.getWindowRect();
    if (!rect) {
      return errorResult("No window is currently bound. Call open_application first.", "bad_args");
    }
    return okText(
      `Window rect: x=${rect.x}, y=${rect.y}, width=${rect.width}, height=${rect.height}`,
    );
  }

  if (action === "move_resize") {
    const x = numberArg("x");
    const y = numberArg("y");
    if (x === undefined || y === undefined) {
      return errorResult("move_resize requires x and y parameters.", "bad_args");
    }
    const width = numberArg("width");
    const height = numberArg("height");
    const moved = await adapter.executor.manageWindow(action, { x, y, width, height });
    if (!moved) {
      return errorResult("No window is currently bound. Call open_application first.", "bad_args");
    }
    if (width && height) {
      return okText(`Moved window to (${x}, ${y}) and resized to ${width}×${height}.`);
    }
    return okText(`Moved window to (${x}, ${y}).`);
  }

  // Remaining actions take no options: minimize, maximize, restore, close,
  // focus, move_offscreen.
  const done = await adapter.executor.manageWindow(action);
  if (!done) {
    return errorResult(
      "No window is currently bound. Call open_application first.",
      "bad_args",
    );
  }

  switch (action) {
    case "minimize":
      return okText("Window minimized (ShowWindow SW_MINIMIZE).");
    case "maximize":
      return okText("Window maximized (ShowWindow SW_MAXIMIZE).");
    case "restore":
      return okText("Window restored (ShowWindow SW_RESTORE).");
    case "close":
      return okText("Window closed (SendMessage WM_CLOSE). The window binding has been released.");
    case "focus":
      return okText("Window brought to front (SetForegroundWindow).");
    case "move_offscreen":
      return okText("Window moved offscreen (-32000,-32000). Still usable via SendMessage/PrintWindow.");
    default:
      return okText(`Action "${action}" completed.`);
  }
}
|
||||||
|
|
||||||
async function handleSwitchDisplay(
|
async function handleSwitchDisplay(
|
||||||
adapter: ComputerUseHostAdapter,
|
adapter: ComputerUseHostAdapter,
|
||||||
args: Record<string, unknown>,
|
args: Record<string, unknown>,
|
||||||
@@ -3383,6 +3840,64 @@ async function dispatchAction(
|
|||||||
overrides: ComputerUseOverrides,
|
overrides: ComputerUseOverrides,
|
||||||
subGates: CuSubGates,
|
subGates: CuSubGates,
|
||||||
): Promise<CuCallToolResult> {
|
): Promise<CuCallToolResult> {
|
||||||
|
// ── Bound-window auto-routing ──────────────────────────────────────
|
||||||
|
// When a window is bound (Win32), route generic input tools to
|
||||||
|
// virtual_mouse / virtual_keyboard automatically. The model doesn't
|
||||||
|
// need to know which tools to use — binding handles it.
|
||||||
|
const hasBoundWindow =
|
||||||
|
(await adapter.executor.hasBoundWindow?.()) === true &&
|
||||||
|
adapter.executor.virtualMouse &&
|
||||||
|
adapter.executor.virtualKeyboard;
|
||||||
|
if (hasBoundWindow) {
|
||||||
|
const coord = Array.isArray(a.coordinate) ? a.coordinate as number[] : undefined;
|
||||||
|
switch (name) {
|
||||||
|
case "left_click":
|
||||||
|
if (coord) return handleVirtualMouse(adapter, { action: "click", coordinate: coord });
|
||||||
|
break;
|
||||||
|
case "double_click":
|
||||||
|
if (coord) return handleVirtualMouse(adapter, { action: "double_click", coordinate: coord });
|
||||||
|
break;
|
||||||
|
case "right_click":
|
||||||
|
if (coord) return handleVirtualMouse(adapter, { action: "right_click", coordinate: coord });
|
||||||
|
break;
|
||||||
|
case "mouse_move":
|
||||||
|
if (coord) return handleVirtualMouse(adapter, { action: "move", coordinate: coord });
|
||||||
|
break;
|
||||||
|
case "left_click_drag":
|
||||||
|
if (coord) return handleVirtualMouse(adapter, {
|
||||||
|
action: "drag", coordinate: coord,
|
||||||
|
start_coordinate: Array.isArray(a.start_coordinate) ? a.start_coordinate : undefined,
|
||||||
|
});
|
||||||
|
break;
|
||||||
|
case "left_mouse_down":
|
||||||
|
if (coord) return handleVirtualMouse(adapter, { action: "down", coordinate: coord });
|
||||||
|
break;
|
||||||
|
case "left_mouse_up":
|
||||||
|
if (coord) return handleVirtualMouse(adapter, { action: "up", coordinate: coord });
|
||||||
|
break;
|
||||||
|
case "type":
|
||||||
|
if (typeof a.text === "string") return handleVirtualKeyboard(adapter, { action: "type", text: a.text });
|
||||||
|
break;
|
||||||
|
case "key":
|
||||||
|
if (typeof a.text === "string") return handleVirtualKeyboard(adapter, { action: "combo", text: a.text, repeat: a.repeat });
|
||||||
|
break;
|
||||||
|
case "hold_key":
|
||||||
|
if (typeof a.text === "string") return handleVirtualKeyboard(adapter, {
|
||||||
|
action: "hold", text: a.text,
|
||||||
|
duration: typeof a.duration === "number" ? a.duration : 1,
|
||||||
|
});
|
||||||
|
break;
|
||||||
|
case "scroll":
|
||||||
|
if (coord) return handleMouseWheel(adapter, {
|
||||||
|
coordinate: coord,
|
||||||
|
delta: a.scroll_direction === "up" ? (a.scroll_amount ?? 3) : -(a.scroll_amount ?? 3),
|
||||||
|
direction: (a.scroll_direction === "left" || a.scroll_direction === "right") ? "horizontal" : "vertical",
|
||||||
|
});
|
||||||
|
break;
|
||||||
|
// screenshot, zoom, wait, cursor_position — not rerouted, pass through
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// ── Standard dispatch (unbound or tools not rerouted above) ────────
|
||||||
switch (name) {
|
switch (name) {
|
||||||
case "screenshot":
|
case "screenshot":
|
||||||
return handleScreenshot(adapter, overrides, subGates);
|
return handleScreenshot(adapter, overrides, subGates);
|
||||||
@@ -3434,6 +3949,39 @@ async function dispatchAction(
|
|||||||
case "open_application":
|
case "open_application":
|
||||||
return handleOpenApplication(adapter, a, overrides);
|
return handleOpenApplication(adapter, a, overrides);
|
||||||
|
|
||||||
|
case "window_management":
|
||||||
|
return handleWindowManagement(adapter, a);
|
||||||
|
|
||||||
|
case "click_element":
|
||||||
|
return handleClickElement(adapter, a);
|
||||||
|
|
||||||
|
case "type_into_element":
|
||||||
|
return handleTypeIntoElement(adapter, a);
|
||||||
|
|
||||||
|
case "open_terminal":
|
||||||
|
return handleOpenTerminal(adapter, a);
|
||||||
|
|
||||||
|
case "bind_window":
|
||||||
|
return handleBindWindow(adapter, a);
|
||||||
|
|
||||||
|
case "virtual_mouse":
|
||||||
|
return handleVirtualMouse(adapter, a);
|
||||||
|
|
||||||
|
case "virtual_keyboard":
|
||||||
|
return handleVirtualKeyboard(adapter, a);
|
||||||
|
|
||||||
|
case "status_indicator":
|
||||||
|
return handleStatusIndicator(adapter, a);
|
||||||
|
|
||||||
|
case "mouse_wheel":
|
||||||
|
return handleMouseWheel(adapter, a);
|
||||||
|
|
||||||
|
case "activate_window":
|
||||||
|
return handleActivateWindow(adapter, a);
|
||||||
|
|
||||||
|
case "prompt_respond":
|
||||||
|
return handlePromptRespond(adapter, a);
|
||||||
|
|
||||||
case "switch_display":
|
case "switch_display":
|
||||||
return handleSwitchDisplay(adapter, a, overrides);
|
return handleSwitchDisplay(adapter, a, overrides);
|
||||||
|
|
||||||
|
|||||||
@@ -118,7 +118,7 @@ const BATCH_ACTION_ITEM_SCHEMA = {
|
|||||||
export function buildComputerUseTools(
|
export function buildComputerUseTools(
|
||||||
caps: {
|
caps: {
|
||||||
screenshotFiltering: "native" | "none";
|
screenshotFiltering: "native" | "none";
|
||||||
platform: "darwin" | "win32";
|
platform: "darwin" | "win32" | "linux";
|
||||||
/** Include request_teach_access + teach_step. Read once at server construction. */
|
/** Include request_teach_access + teach_step. Read once at server construction. */
|
||||||
teachMode?: boolean;
|
teachMode?: boolean;
|
||||||
},
|
},
|
||||||
@@ -414,6 +414,353 @@ export function buildComputerUseTools(
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
|
||||||
|
// Window management — Win32 API targeted at bound HWND, no global shortcuts.
|
||||||
|
// Only available on Windows when a window is bound via open_application.
|
||||||
|
...(caps.platform === 'win32' ? [{
|
||||||
|
name: "window_management",
|
||||||
|
description:
|
||||||
|
"Manage the bound application window via Win32 API calls (ShowWindow, SetWindowPos, SendMessage). " +
|
||||||
|
"All operations target the bound HWND directly — NO global shortcuts (Win+Down, Alt+F4, etc.). " +
|
||||||
|
"The window must have been opened via open_application first. " +
|
||||||
|
"Actions: minimize (hide to taskbar), maximize (fill screen), restore (undo min/max), " +
|
||||||
|
"close (graceful WM_CLOSE), focus (bring to front), move_offscreen (move to -32000,-32000 for background operation). " +
|
||||||
|
"Use move_resize to reposition or resize the window to specific coordinates.",
|
||||||
|
inputSchema: {
|
||||||
|
type: "object" as const,
|
||||||
|
properties: {
|
||||||
|
action: {
|
||||||
|
type: "string",
|
||||||
|
enum: ["minimize", "maximize", "restore", "close", "focus", "move_offscreen", "move_resize", "get_rect"],
|
||||||
|
description:
|
||||||
|
"minimize: ShowWindow(SW_MINIMIZE). " +
|
||||||
|
"maximize: ShowWindow(SW_MAXIMIZE). " +
|
||||||
|
"restore: ShowWindow(SW_RESTORE) — undo minimize or maximize. " +
|
||||||
|
"close: SendMessage(WM_CLOSE) — graceful close. " +
|
||||||
|
"focus: SetForegroundWindow + BringWindowToTop. " +
|
||||||
|
"move_offscreen: SetWindowPos(-32000,-32000) — keeps window usable by SendMessage/PrintWindow but invisible. " +
|
||||||
|
"move_resize: SetWindowPos to specific x,y,width,height. " +
|
||||||
|
"get_rect: GetWindowRect — returns current position and size.",
|
||||||
|
},
|
||||||
|
x: { type: "integer", description: "X position for move_resize." },
|
||||||
|
y: { type: "integer", description: "Y position for move_resize." },
|
||||||
|
width: { type: "integer", description: "Width for move_resize." },
|
||||||
|
height: { type: "integer", description: "Height for move_resize." },
|
||||||
|
},
|
||||||
|
required: ["action"],
|
||||||
|
},
|
||||||
|
} as Tool,
|
||||||
|
{
|
||||||
|
name: "click_element",
|
||||||
|
description:
|
||||||
|
"Click a GUI element by its accessible name, role, or automationId — no pixel coordinates needed. " +
|
||||||
|
"Uses Windows UI Automation to find the element and InvokePattern to click it. " +
|
||||||
|
"Prefer this over left_click when the element name is visible in the accessibility snapshot. " +
|
||||||
|
"Falls back to BoundingRect center-click if InvokePattern is not supported.",
|
||||||
|
inputSchema: {
|
||||||
|
type: "object" as const,
|
||||||
|
properties: {
|
||||||
|
name: {
|
||||||
|
type: "string",
|
||||||
|
description: "Accessible name of the element (e.g. \"Save\", \"File\", \"Search...\"). Case-insensitive partial match.",
|
||||||
|
},
|
||||||
|
role: {
|
||||||
|
type: "string",
|
||||||
|
description: "Control type (e.g. \"Button\", \"MenuItem\", \"Edit\", \"Link\"). Optional — narrows the search.",
|
||||||
|
},
|
||||||
|
automationId: {
|
||||||
|
type: "string",
|
||||||
|
description: "Exact automationId from the accessibility snapshot. Most precise selector.",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
required: [],
|
||||||
|
},
|
||||||
|
} as Tool,
|
||||||
|
{
|
||||||
|
name: "type_into_element",
|
||||||
|
description:
|
||||||
|
"Type text into a named GUI element using Windows UI Automation ValuePattern. " +
|
||||||
|
"Finds the element by name/role/automationId, then sets its value directly — " +
|
||||||
|
"no need to click first or use pixel coordinates. Works on Edit, ComboBox, and other value-holding controls.",
|
||||||
|
inputSchema: {
|
||||||
|
type: "object" as const,
|
||||||
|
properties: {
|
||||||
|
name: { type: "string", description: "Accessible name of the target element." },
|
||||||
|
role: { type: "string", description: "Control type (optional, e.g. \"Edit\")." },
|
||||||
|
automationId: { type: "string", description: "Exact automationId." },
|
||||||
|
text: { type: "string", description: "Text to type/set into the element." },
|
||||||
|
},
|
||||||
|
required: ["text"],
|
||||||
|
},
|
||||||
|
} as Tool,
|
||||||
|
{
|
||||||
|
name: "open_terminal",
|
||||||
|
description:
|
||||||
|
"Open a new terminal window and launch an AI agent CLI. " +
|
||||||
|
"This is a workflow tool that automates: open terminal → type startup command → press Enter → wait → verify. " +
|
||||||
|
"Supported agents: claude (runs 'claude'), codex (runs 'codex'), gemini (runs 'gemini'), " +
|
||||||
|
"or any custom command. After launching, the tool binds to the new terminal window " +
|
||||||
|
"and takes a screenshot to verify the agent started successfully. " +
|
||||||
|
"Use this when the user says: 'open Claude Code', 'start a Codex terminal', 'launch Gemini', etc.",
|
||||||
|
inputSchema: {
|
||||||
|
type: "object" as const,
|
||||||
|
properties: {
|
||||||
|
agent: {
|
||||||
|
type: "string",
|
||||||
|
enum: ["claude", "codex", "gemini", "custom"],
|
||||||
|
description:
|
||||||
|
"Which agent to launch. " +
|
||||||
|
"claude: runs 'claude' command. " +
|
||||||
|
"codex: runs 'codex' command. " +
|
||||||
|
"gemini: runs 'gemini' command. " +
|
||||||
|
"custom: runs the command specified in 'command' parameter.",
|
||||||
|
},
|
||||||
|
command: {
|
||||||
|
type: "string",
|
||||||
|
description: "Custom command to run in the terminal. Only used when agent='custom'. Example: 'python app.py'",
|
||||||
|
},
|
||||||
|
terminal: {
|
||||||
|
type: "string",
|
||||||
|
enum: ["wt", "powershell", "cmd"],
|
||||||
|
description: "Which terminal to open. Default: 'wt' (Windows Terminal). 'powershell' for PowerShell window, 'cmd' for Command Prompt.",
|
||||||
|
},
|
||||||
|
working_directory: {
|
||||||
|
type: "string",
|
||||||
|
description: "Working directory for the terminal. If omitted, uses current directory.",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
required: ["agent"],
|
||||||
|
},
|
||||||
|
} as Tool,
|
||||||
|
{
|
||||||
|
name: "bind_window",
|
||||||
|
description:
|
||||||
|
"Bind to a specific window for all subsequent operations (screenshot, click, type, etc.). " +
|
||||||
|
"Once bound, screenshots capture only that window via PrintWindow, and all input goes through SendMessageW — " +
|
||||||
|
"no cursor movement, no focus steal, no interference with the user's desktop. " +
|
||||||
|
"Actions: bind (by title, hwnd, or pid), unbind (release binding), status (show current binding), list (show all visible windows). " +
|
||||||
|
"Use 'list' first to see available windows, then 'bind' with a title or hwnd. " +
|
||||||
|
"open_application auto-binds the launched window, but use this tool to bind to already-running windows or switch between windows.",
|
||||||
|
inputSchema: {
|
||||||
|
type: "object" as const,
|
||||||
|
properties: {
|
||||||
|
action: {
|
||||||
|
type: "string",
|
||||||
|
enum: ["bind", "unbind", "status", "list"],
|
||||||
|
description:
|
||||||
|
"bind: Bind to a window (specify title, hwnd, or pid). " +
|
||||||
|
"unbind: Release the current binding, return to full-screen mode. " +
|
||||||
|
"status: Show the currently bound window (hwnd, title, rect). " +
|
||||||
|
"list: List all visible windows with hwnd, pid, and title.",
|
||||||
|
},
|
||||||
|
title: {
|
||||||
|
type: "string",
|
||||||
|
description: "Window title to search for (partial match, case-insensitive). For 'bind' action.",
|
||||||
|
},
|
||||||
|
hwnd: {
|
||||||
|
type: "string",
|
||||||
|
description: "Exact window handle from 'list' output. For 'bind' action.",
|
||||||
|
},
|
||||||
|
pid: {
|
||||||
|
type: "integer",
|
||||||
|
description: "Process ID to find window for. For 'bind' action.",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
required: ["action"],
|
||||||
|
},
|
||||||
|
} as Tool,
|
||||||
|
{
|
||||||
|
name: "activate_window",
|
||||||
|
description:
|
||||||
|
"Activate the bound window: bring it to foreground, click to ensure keyboard focus, " +
|
||||||
|
"and optionally send an initial key sequence. Use this before any input operations to guarantee " +
|
||||||
|
"the window is ready to receive keyboard/mouse events. " +
|
||||||
|
"Combines SetForegroundWindow + BringWindowToTop + SendMessage(WM_LBUTTONDOWN) in one call.",
|
||||||
|
inputSchema: {
|
||||||
|
type: "object" as const,
|
||||||
|
properties: {
|
||||||
|
click_x: { type: "integer", description: "X coordinate to click after activation (client-area). If omitted, clicks center of window." },
|
||||||
|
click_y: { type: "integer", description: "Y coordinate to click after activation (client-area). If omitted, clicks center of window." },
|
||||||
|
},
|
||||||
|
required: [],
|
||||||
|
},
|
||||||
|
} as Tool,
|
||||||
|
{
|
||||||
|
name: "prompt_respond",
|
||||||
|
description:
|
||||||
|
"Handle interactive CLI/terminal prompts (Yes/No, selection menus, confirmations). " +
|
||||||
|
"Sends a sequence of key events to the bound window to navigate and confirm a prompt. " +
|
||||||
|
"This is a convenience wrapper around bound-window keyboard input for common prompt flows. " +
|
||||||
|
"Typical flows: " +
|
||||||
|
"1) Yes/No prompt → send 'y' or 'n' + Enter. " +
|
||||||
|
"2) Arrow-key selection menu → send arrow_down/arrow_up N times + Enter. " +
|
||||||
|
"3) Text input prompt → type the response + Enter. " +
|
||||||
|
"After responding, take a screenshot to verify the result.",
|
||||||
|
inputSchema: {
|
||||||
|
type: "object" as const,
|
||||||
|
properties: {
|
||||||
|
response_type: {
|
||||||
|
type: "string",
|
||||||
|
enum: ["yes", "no", "enter", "escape", "select", "type"],
|
||||||
|
description:
|
||||||
|
"yes: send 'y' + Enter. " +
|
||||||
|
"no: send 'n' + Enter. " +
|
||||||
|
"enter: send Enter only. " +
|
||||||
|
"escape: send Escape (cancel). " +
|
||||||
|
"select: use arrow keys to navigate to an option, then Enter. Requires 'arrow_count'. " +
|
||||||
|
"type: type custom text then Enter. Requires 'text'.",
|
||||||
|
},
|
||||||
|
arrow_direction: {
|
||||||
|
type: "string",
|
||||||
|
enum: ["up", "down"],
|
||||||
|
description: "Arrow key direction for 'select' type. Default: 'down'.",
|
||||||
|
},
|
||||||
|
arrow_count: {
|
||||||
|
type: "integer",
|
||||||
|
description: "Number of arrow key presses for 'select' type. Default: 1.",
|
||||||
|
minimum: 0,
|
||||||
|
maximum: 50,
|
||||||
|
},
|
||||||
|
text: {
|
||||||
|
type: "string",
|
||||||
|
description: "Text to type for 'type' response_type.",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
required: ["response_type"],
|
||||||
|
},
|
||||||
|
} as Tool,
|
||||||
|
{
|
||||||
|
name: "status_indicator",
|
||||||
|
description:
|
||||||
|
"Control the visual status indicator overlay on the bound window. " +
|
||||||
|
"The indicator is a small floating label at the bottom of the window that shows what Computer Use is doing. " +
|
||||||
|
"It auto-shows during click/type/key/scroll operations, but you can also send custom messages. " +
|
||||||
|
"Actions: show (display a custom message), hide (dismiss), status (check if active).",
|
||||||
|
inputSchema: {
|
||||||
|
type: "object" as const,
|
||||||
|
properties: {
|
||||||
|
action: {
|
||||||
|
type: "string",
|
||||||
|
enum: ["show", "hide", "status"],
|
||||||
|
description: "show: display a custom message on the indicator. hide: dismiss the indicator. status: check if indicator is active.",
|
||||||
|
},
|
||||||
|
message: {
|
||||||
|
type: "string",
|
||||||
|
description: "Custom message to display (for 'show' action). Supports emoji. Auto-fades after 2 seconds.",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
required: ["action"],
|
||||||
|
},
|
||||||
|
} as Tool,
|
||||||
|
{
|
||||||
|
name: "virtual_keyboard",
|
||||||
|
description:
|
||||||
|
"Send keyboard input directly to the bound window via SendMessageW — independent of the physical keyboard. " +
|
||||||
|
"The user can keep typing on their own keyboard without interference. " +
|
||||||
|
"Supports: single keys, key combinations (Ctrl+S, Alt+F4), text input, and hold-key operations. " +
|
||||||
|
"All input targets the bound HWND only — no global keyboard events.",
|
||||||
|
inputSchema: {
|
||||||
|
type: "object" as const,
|
||||||
|
properties: {
|
||||||
|
action: {
|
||||||
|
type: "string",
|
||||||
|
enum: ["type", "combo", "press", "release", "hold"],
|
||||||
|
description:
|
||||||
|
"type: Send text string via WM_CHAR (Unicode, supports Chinese/emoji). " +
|
||||||
|
"combo: Send a key combination like ctrl+s, alt+f4, ctrl+shift+a (press all, release in reverse). " +
|
||||||
|
"press: Press a key down and hold it (pair with 'release'). " +
|
||||||
|
"release: Release a previously pressed key. " +
|
||||||
|
"hold: Press key(s) for a duration then release.",
|
||||||
|
},
|
||||||
|
text: {
|
||||||
|
type: "string",
|
||||||
|
description: "For 'type': the text to input. For 'combo': key combination string (e.g. 'ctrl+s', 'alt+tab', 'ctrl+shift+a'). For 'press'/'release': single key name (e.g. 'shift', 'ctrl', 'a').",
|
||||||
|
},
|
||||||
|
duration: {
|
||||||
|
type: "number",
|
||||||
|
description: "For 'hold': seconds to hold the key(s) before releasing. Default: 1.",
|
||||||
|
},
|
||||||
|
repeat: {
|
||||||
|
type: "integer",
|
||||||
|
description: "Number of times to repeat the action. Default: 1.",
|
||||||
|
minimum: 1,
|
||||||
|
maximum: 100,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
required: ["action", "text"],
|
||||||
|
},
|
||||||
|
} as Tool,
|
||||||
|
{
|
||||||
|
name: "virtual_mouse",
|
||||||
|
description:
|
||||||
|
"Control a virtual mouse on the bound window via SendMessageW — independent of the physical mouse. " +
|
||||||
|
"The user's real cursor stays free. All operations target the bound HWND only.",
|
||||||
|
inputSchema: {
|
||||||
|
type: "object" as const,
|
||||||
|
properties: {
|
||||||
|
action: {
|
||||||
|
type: "string",
|
||||||
|
enum: ["click", "double_click", "right_click", "move", "drag", "down", "up"],
|
||||||
|
description:
|
||||||
|
"click: left-click at coordinate. " +
|
||||||
|
"double_click: double left-click. " +
|
||||||
|
"right_click: right-click. " +
|
||||||
|
"move: move virtual cursor (visual only, no click). " +
|
||||||
|
"drag: press at start, move to end, release. Requires coordinate (end) and start_coordinate. " +
|
||||||
|
"down: press left button at coordinate (hold). " +
|
||||||
|
"up: release left button at coordinate.",
|
||||||
|
},
|
||||||
|
coordinate: {
|
||||||
|
type: "array",
|
||||||
|
items: { type: "number" },
|
||||||
|
minItems: 2,
|
||||||
|
maxItems: 2,
|
||||||
|
description: "(x, y) client-area coordinate on the bound window.",
|
||||||
|
},
|
||||||
|
start_coordinate: {
|
||||||
|
type: "array",
|
||||||
|
items: { type: "number" },
|
||||||
|
minItems: 2,
|
||||||
|
maxItems: 2,
|
||||||
|
description: "(x, y) start point for drag. If omitted, drags from current virtual cursor position.",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
required: ["action", "coordinate"],
|
||||||
|
},
|
||||||
|
} as Tool,
|
||||||
|
{
|
||||||
|
name: "mouse_wheel",
|
||||||
|
description:
|
||||||
|
"Scroll inside the bound window using mouse wheel (WM_MOUSEWHEEL / WM_MOUSEHWHEEL). " +
|
||||||
|
"Unlike the generic 'scroll' tool which uses WM_VSCROLL (only works on scrollbar controls), " +
|
||||||
|
"mouse_wheel simulates the physical mouse wheel and works on Excel spreadsheets, web pages, " +
|
||||||
|
"code editors, PDF viewers, and any modern UI. " +
|
||||||
|
"Specify the click point within the window where the scroll should occur — " +
|
||||||
|
"this determines which panel/pane/element receives the scroll.",
|
||||||
|
inputSchema: {
|
||||||
|
type: "object" as const,
|
||||||
|
properties: {
|
||||||
|
coordinate: {
|
||||||
|
type: "array",
|
||||||
|
items: { type: "number" },
|
||||||
|
minItems: 2,
|
||||||
|
maxItems: 2,
|
||||||
|
description: "(x, y) client-area coordinate where the scroll should occur. Determines which element receives the scroll.",
|
||||||
|
},
|
||||||
|
delta: {
|
||||||
|
type: "integer",
|
||||||
|
description: "Scroll amount in 'clicks'. Positive = scroll up, negative = scroll down. Each click = 3 lines typically. Use -3 to -5 for page-like scrolling.",
|
||||||
|
},
|
||||||
|
direction: {
|
||||||
|
type: "string",
|
||||||
|
enum: ["vertical", "horizontal"],
|
||||||
|
description: "Scroll direction. Default: 'vertical'. Use 'horizontal' for side-scrolling (e.g. wide Excel sheets, timeline views).",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
required: ["coordinate", "delta"],
|
||||||
|
},
|
||||||
|
} as Tool,
|
||||||
|
] : []),
|
||||||
|
|
||||||
{
|
{
|
||||||
name: "switch_display",
|
name: "switch_display",
|
||||||
description:
|
description:
|
||||||
|
|||||||
@@ -159,28 +159,23 @@ export const apps: AppsAPI = {
|
|||||||
|
|
||||||
async listInstalled() {
|
async listInstalled() {
|
||||||
try {
|
try {
|
||||||
// Use Spotlight (mdfind) to enumerate .app bundles and mdls to get real bundle IDs.
|
const result = await osascript(`
|
||||||
// Searches /Applications, /System/Applications, and /System/Applications/Utilities
|
tell application "System Events"
|
||||||
// so that system apps (Terminal, Chess, etc.) and core services (Finder) are found.
|
set appList to ""
|
||||||
const proc = Bun.spawn([
|
repeat with appFile in (every file of folder "Applications" of startup disk whose name ends with ".app")
|
||||||
'bash', '-c',
|
set appPath to POSIX path of (appFile as alias)
|
||||||
`for dir in /Applications /System/Applications /System/Applications/Utilities /System/Library/CoreServices; do
|
set appName to name of appFile
|
||||||
mdfind 'kMDItemContentType == "com.apple.application-bundle"' -onlyin "$dir" 2>/dev/null
|
set appList to appList & appPath & "|" & appName & "\\n"
|
||||||
done | sort -u | while read -r appPath; do
|
end repeat
|
||||||
bundleId=$(mdls -raw -name kMDItemCFBundleIdentifier "$appPath" 2>/dev/null)
|
return appList
|
||||||
if [ -n "$bundleId" ] && [ "$bundleId" != "(null)" ]; then
|
end tell
|
||||||
displayName=$(basename "$appPath" .app)
|
`)
|
||||||
echo "$bundleId|$displayName|$appPath"
|
return result.split('\n').filter(Boolean).map(line => {
|
||||||
fi
|
const [path, name] = line.split('|', 2)
|
||||||
done`,
|
const displayName = (name ?? '').replace(/\.app$/, '')
|
||||||
], { stdout: 'pipe', stderr: 'pipe' })
|
|
||||||
const text = await new Response(proc.stdout).text()
|
|
||||||
await proc.exited
|
|
||||||
return text.split('\n').filter(Boolean).map(line => {
|
|
||||||
const [bundleId, displayName, path] = line.split('|', 3)
|
|
||||||
return {
|
return {
|
||||||
bundleId: bundleId ?? '',
|
bundleId: `com.app.${displayName.toLowerCase().replace(/\s+/g, '-')}`,
|
||||||
displayName: displayName ?? '',
|
displayName,
|
||||||
path: path ?? '',
|
path: path ?? '',
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -1,14 +1,10 @@
|
|||||||
/**
|
/**
|
||||||
* @ant/computer-use-swift — cross-platform display, apps, and screenshot API
|
* @ant/computer-use-swift — macOS display, apps, and screenshot (Swift native)
|
||||||
*
|
*
|
||||||
* Platform backends:
|
* This package wraps the macOS-only Swift .node native module.
|
||||||
* - darwin: AppleScript/JXA + screencapture
|
* For Windows/Linux, use src/utils/computerUse/platforms/ instead.
|
||||||
* - win32: PowerShell + System.Drawing + Win32 P/Invoke
|
|
||||||
*
|
|
||||||
* Add new platforms by creating backends/<platform>.ts implementing SwiftBackend.
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
// Re-export all types
|
|
||||||
export type {
|
export type {
|
||||||
DisplayGeometry,
|
DisplayGeometry,
|
||||||
PrepareDisplayResult,
|
PrepareDisplayResult,
|
||||||
@@ -18,72 +14,42 @@ export type {
|
|||||||
ScreenshotResult,
|
ScreenshotResult,
|
||||||
ResolvePrepareCaptureResult,
|
ResolvePrepareCaptureResult,
|
||||||
WindowDisplayInfo,
|
WindowDisplayInfo,
|
||||||
DisplayAPI,
|
} from './backends/darwin.js'
|
||||||
AppsAPI,
|
|
||||||
ScreenshotAPI,
|
|
||||||
SwiftBackend,
|
|
||||||
} from './types.js'
|
|
||||||
|
|
||||||
import type { ResolvePrepareCaptureResult, SwiftBackend } from './types.js'
|
import type { ResolvePrepareCaptureResult } from './backends/darwin.js'
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
function loadDarwin() {
|
||||||
// Platform dispatch
|
if (process.platform !== 'darwin') return null
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
function loadBackend(): SwiftBackend | null {
|
|
||||||
try {
|
try {
|
||||||
switch (process.platform) {
|
return require('./backends/darwin.js')
|
||||||
case 'darwin':
|
|
||||||
return require('./backends/darwin.js') as SwiftBackend
|
|
||||||
case 'win32':
|
|
||||||
return require('./backends/win32.js') as SwiftBackend
|
|
||||||
case 'linux':
|
|
||||||
return require('./backends/linux.js') as SwiftBackend
|
|
||||||
default:
|
|
||||||
return null
|
|
||||||
}
|
|
||||||
} catch {
|
} catch {
|
||||||
return null
|
return null
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const backend = loadBackend()
|
const darwin = loadDarwin()
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// ComputerUseAPI — Main export (preserves original class interface)
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
export class ComputerUseAPI {
|
export class ComputerUseAPI {
|
||||||
// When no backend is loaded (unsupported platform), all APIs are no-op stubs.
|
apps = darwin?.apps ?? {
|
||||||
// These stubs should never be reached in practice — callers check isSupported
|
|
||||||
// or the feature gate before invoking.
|
|
||||||
|
|
||||||
apps = backend?.apps ?? {
|
|
||||||
async prepareDisplay() { return { activated: '', hidden: [] } },
|
async prepareDisplay() { return { activated: '', hidden: [] } },
|
||||||
async previewHideSet() { return [] },
|
async previewHideSet() { return [] },
|
||||||
async findWindowDisplays(ids: string[]) { return ids.map(b => ({ bundleId: b, displayIds: [] as number[] })) },
|
async findWindowDisplays(ids: string[]) { return ids.map((b: string) => ({ bundleId: b, displayIds: [] as number[] })) },
|
||||||
async appUnderPoint() { return null },
|
async appUnderPoint() { return null },
|
||||||
async listInstalled() { return [] },
|
async listInstalled() { return [] },
|
||||||
iconDataUrl() { return null },
|
iconDataUrl() { return null },
|
||||||
listRunning() { return [] },
|
listRunning() { return [] },
|
||||||
async open() { throw new Error('computer-use-swift: no backend for this platform') },
|
async open() { throw new Error('@ant/computer-use-swift: macOS only') },
|
||||||
async unhide() {},
|
async unhide() {},
|
||||||
}
|
}
|
||||||
|
|
||||||
display = backend?.display ?? {
|
display = darwin?.display ?? {
|
||||||
getSize() { throw new Error('computer-use-swift: no backend for this platform') },
|
getSize() { throw new Error('@ant/computer-use-swift: macOS only') },
|
||||||
listAll() { throw new Error('computer-use-swift: no backend for this platform') },
|
listAll() { throw new Error('@ant/computer-use-swift: macOS only') },
|
||||||
}
|
}
|
||||||
|
|
||||||
screenshot = backend?.screenshot ?? {
|
screenshot = darwin?.screenshot ?? {
|
||||||
async captureExcluding() { throw new Error('computer-use-swift: no backend for this platform') },
|
async captureExcluding() { throw new Error('@ant/computer-use-swift: macOS only') },
|
||||||
async captureRegion() { throw new Error('computer-use-swift: no backend for this platform') },
|
async captureRegion() { throw new Error('@ant/computer-use-swift: macOS only') },
|
||||||
}
|
|
||||||
|
|
||||||
hotkey = (backend as any)?.hotkey ?? {
|
|
||||||
registerEscape(_cb: () => void): boolean { return false },
|
|
||||||
unregister() {},
|
|
||||||
notifyExpectedEscape() {},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async resolvePrepareCapture(
|
async resolvePrepareCapture(
|
||||||
@@ -93,8 +59,6 @@ export class ComputerUseAPI {
|
|||||||
targetW: number,
|
targetW: number,
|
||||||
targetH: number,
|
targetH: number,
|
||||||
displayId?: number,
|
displayId?: number,
|
||||||
_autoResolve?: boolean,
|
|
||||||
_doHide?: boolean,
|
|
||||||
): Promise<ResolvePrepareCaptureResult> {
|
): Promise<ResolvePrepareCaptureResult> {
|
||||||
return this.screenshot.captureExcluding(allowedBundleIds, quality, targetW, targetH, displayId)
|
return this.screenshot.captureExcluding(allowedBundleIds, quality, targetW, targetH, displayId)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -52,8 +52,14 @@ export function getTerminalBundleId(): string | null {
|
|||||||
* takes this shape (no `hostBundleId`, no `teachMode`).
|
* takes this shape (no `hostBundleId`, no `teachMode`).
|
||||||
*/
|
*/
|
||||||
export const CLI_CU_CAPABILITIES = {
|
export const CLI_CU_CAPABILITIES = {
|
||||||
screenshotFiltering: (process.platform === 'darwin' ? 'native' : 'none') as any,
|
screenshotFiltering: (process.platform === 'darwin'
|
||||||
platform: (process.platform === 'win32' ? 'windows' : process.platform === 'linux' ? 'linux' : 'darwin') as any,
|
? 'native'
|
||||||
|
: 'none') as any,
|
||||||
|
platform: (process.platform === 'win32'
|
||||||
|
? 'win32'
|
||||||
|
: process.platform === 'linux'
|
||||||
|
? 'linux'
|
||||||
|
: 'darwin') as any,
|
||||||
}
|
}
|
||||||
|
|
||||||
export function isComputerUseMCPServer(name: string): boolean {
|
export function isComputerUseMCPServer(name: string): boolean {
|
||||||
|
|||||||
@@ -297,16 +297,17 @@ export function createCliExecutor(opts: {
|
|||||||
getMouseAnimationEnabled: () => boolean
|
getMouseAnimationEnabled: () => boolean
|
||||||
getHideBeforeActionEnabled: () => boolean
|
getHideBeforeActionEnabled: () => boolean
|
||||||
}): ComputerExecutor {
|
}): ComputerExecutor {
|
||||||
if (process.platform !== 'darwin' && process.platform !== 'win32' && process.platform !== 'linux') {
|
// Non-macOS: delegate entirely to the cross-platform executor.
|
||||||
throw new Error(
|
// No macOS code paths, no drainRunLoop, no @ant packages.
|
||||||
`createCliExecutor called on ${process.platform}. Computer control requires macOS, Windows, or Linux.`,
|
if (process.platform !== 'darwin') {
|
||||||
)
|
// eslint-disable-next-line @typescript-eslint/no-require-imports
|
||||||
|
const { createCrossPlatformExecutor } = require('./executorCrossPlatform.js') as typeof import('./executorCrossPlatform.js')
|
||||||
|
return createCrossPlatformExecutor(opts)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Swift loaded once at factory time — every executor method needs it.
|
// ── macOS: native @ant packages ─────────────────────────────────────
|
||||||
// Input loaded lazily via requireComputerUseInput() on first mouse/keyboard
|
// Everything below is macOS-only. No platform checks needed.
|
||||||
// call — it caches internally, so screenshot-only flows never pull the
|
|
||||||
// enigo .node.
|
|
||||||
const cu = requireComputerUseSwift()
|
const cu = requireComputerUseSwift()
|
||||||
|
|
||||||
const { getMouseAnimationEnabled, getHideBeforeActionEnabled } = opts
|
const { getMouseAnimationEnabled, getHideBeforeActionEnabled } = opts
|
||||||
@@ -500,18 +501,12 @@ export function createCliExecutor(opts: {
|
|||||||
async key(keySequence: string, repeat?: number): Promise<void> {
|
async key(keySequence: string, repeat?: number): Promise<void> {
|
||||||
const input = requireComputerUseInput()
|
const input = requireComputerUseInput()
|
||||||
const parts = keySequence.split('+').filter(p => p.length > 0)
|
const parts = keySequence.split('+').filter(p => p.length > 0)
|
||||||
// Bare-only: the CGEventTap checks event.flags.isEmpty so ctrl+escape
|
|
||||||
// etc. pass through without aborting.
|
|
||||||
const isEsc = isBareEscape(parts)
|
const isEsc = isBareEscape(parts)
|
||||||
const n = repeat ?? 1
|
const n = repeat ?? 1
|
||||||
await drainRunLoop(async () => {
|
await drainRunLoop(async () => {
|
||||||
for (let i = 0; i < n; i++) {
|
for (let i = 0; i < n; i++) {
|
||||||
if (i > 0) {
|
if (i > 0) await sleep(8)
|
||||||
await sleep(8)
|
if (isEsc) notifyExpectedEscape()
|
||||||
}
|
|
||||||
if (isEsc) {
|
|
||||||
notifyExpectedEscape()
|
|
||||||
}
|
|
||||||
await input.keys(parts)
|
await input.keys(parts)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
@@ -554,12 +549,9 @@ export function createCliExecutor(opts: {
|
|||||||
async type(text: string, opts: { viaClipboard: boolean }): Promise<void> {
|
async type(text: string, opts: { viaClipboard: boolean }): Promise<void> {
|
||||||
const input = requireComputerUseInput()
|
const input = requireComputerUseInput()
|
||||||
if (opts.viaClipboard) {
|
if (opts.viaClipboard) {
|
||||||
// keys(['command','v']) inside needs the pump.
|
|
||||||
await drainRunLoop(() => typeViaClipboard(input, text))
|
await drainRunLoop(() => typeViaClipboard(input, text))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
// `toolCalls.ts` handles the grapheme loop + 8ms sleeps and calls this
|
|
||||||
// once per grapheme. typeText doesn't dispatch to the main queue.
|
|
||||||
await input.typeText(text)
|
await input.typeText(text)
|
||||||
},
|
},
|
||||||
|
|
||||||
@@ -656,6 +648,10 @@ export function createCliExecutor(opts: {
|
|||||||
// ── App management ───────────────────────────────────────────────────
|
// ── App management ───────────────────────────────────────────────────
|
||||||
|
|
||||||
async getFrontmostApp(): Promise<FrontmostApp | null> {
|
async getFrontmostApp(): Promise<FrontmostApp | null> {
|
||||||
|
// When HWND is bound on Windows, operations go through SendMessage
|
||||||
|
// and don't touch the real foreground. Return the first allowed app
|
||||||
|
// so the frontmost gate in toolCalls.ts passes — the real foreground
|
||||||
|
// is irrelevant since we never touch it.
|
||||||
const info = requireComputerUseInput().getFrontmostAppInfo()
|
const info = requireComputerUseInput().getFrontmostAppInfo()
|
||||||
if (!info || !info.bundleId) return null
|
if (!info || !info.bundleId) return null
|
||||||
return { bundleId: info.bundleId, displayName: info.appName }
|
return { bundleId: info.bundleId, displayName: info.appName }
|
||||||
@@ -698,6 +694,7 @@ export async function unhideComputerUseApps(
|
|||||||
bundleIds: readonly string[],
|
bundleIds: readonly string[],
|
||||||
): Promise<void> {
|
): Promise<void> {
|
||||||
if (bundleIds.length === 0) return
|
if (bundleIds.length === 0) return
|
||||||
|
if (process.platform !== 'darwin') return // non-macOS: no-op
|
||||||
const cu = requireComputerUseSwift()
|
const cu = requireComputerUseSwift()
|
||||||
await cu.apps.unhide([...bundleIds])
|
await cu.apps.unhide([...bundleIds])
|
||||||
}
|
}
|
||||||
|
|||||||
1150
src/utils/computerUse/executorCrossPlatform.ts
Normal file
1150
src/utils/computerUse/executorCrossPlatform.ts
Normal file
File diff suppressed because it is too large
Load Diff
@@ -46,16 +46,9 @@ export function getComputerUseHostAdapter(): ComputerUseHostAdapter {
|
|||||||
}),
|
}),
|
||||||
ensureOsPermissions: async () => {
|
ensureOsPermissions: async () => {
|
||||||
if (process.platform !== 'darwin') return { granted: true }
|
if (process.platform !== 'darwin') return { granted: true }
|
||||||
const cu = requireComputerUseSwift() as any
|
const cu = requireComputerUseSwift()
|
||||||
// Native .node module exposes tcc; cross-platform JS backend does not.
|
const accessibility = (cu as any).tcc.checkAccessibility()
|
||||||
// When tcc is absent (JS backend on macOS), we cannot programmatically
|
const screenRecording = (cu as any).tcc.checkScreenRecording()
|
||||||
// check TCC status — returning granted:false would create a deadlock
|
|
||||||
// (recheck also fails, user can never pass). The JS backend uses
|
|
||||||
// osascript/screencapture which trigger OS-level permission prompts
|
|
||||||
// themselves, so the OS provides the safety net instead.
|
|
||||||
if (!cu.tcc) return { granted: true }
|
|
||||||
const accessibility = cu.tcc.checkAccessibility()
|
|
||||||
const screenRecording = cu.tcc.checkScreenRecording()
|
|
||||||
return accessibility && screenRecording
|
return accessibility && screenRecording
|
||||||
? { granted: true }
|
? { granted: true }
|
||||||
: { granted: false, accessibility, screenRecording }
|
: { granted: false, accessibility, screenRecording }
|
||||||
|
|||||||
152
src/utils/computerUse/platforms/darwin.ts
Normal file
152
src/utils/computerUse/platforms/darwin.ts
Normal file
@@ -0,0 +1,152 @@
|
|||||||
|
/**
|
||||||
|
* macOS platform backend for Computer Use.
|
||||||
|
*
|
||||||
|
* Delegates to @ant/computer-use-input (enigo keyboard/mouse) and
|
||||||
|
* @ant/computer-use-swift (screenshots, display, apps).
|
||||||
|
*
|
||||||
|
* No window-bound input (sendChar/sendKey/sendClick/sendText) — macOS
|
||||||
|
* uses global input via CoreGraphics events.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import type { Platform } from './index.js'
|
||||||
|
import type {
|
||||||
|
InputPlatform,
|
||||||
|
ScreenshotPlatform,
|
||||||
|
DisplayPlatform,
|
||||||
|
AppsPlatform,
|
||||||
|
WindowHandle,
|
||||||
|
FrontmostAppInfo,
|
||||||
|
} from './types.js'
|
||||||
|
import { requireComputerUseInput } from '../inputLoader.js'
|
||||||
|
import { requireComputerUseSwift } from '../swiftLoader.js'
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Input — delegate to @ant/computer-use-input darwin backend
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Global input backend for macOS.
 *
 * Every method resolves the native input module through
 * requireComputerUseInput() at call time and forwards its arguments to it.
 * No window-bound methods (sendChar/sendKey/sendClick/sendText) are provided.
 */
const input: InputPlatform = {
  /** Move the pointer to (x, y). */
  async moveMouse(x, y) {
    await requireComputerUseInput().moveMouse(x, y)
  },

  /** Move the pointer to (x, y), then issue a 'click' action with `button`. */
  async click(x, y, button) {
    const native = requireComputerUseInput()
    await native.moveMouse(x, y)
    await native.mouseButton(button, 'click', 1)
  },

  /** Type `text` through the native module. */
  async typeText(text) {
    await requireComputerUseInput().typeText(text)
  },

  /** Press or release a single named key. */
  async key(name, action) {
    await requireComputerUseInput().key(name, action)
  },

  /** Send a key combination. */
  async keys(combo) {
    await requireComputerUseInput().keys(combo)
  },

  /** Scroll by `amount` along `direction`. */
  async scroll(amount, direction) {
    await requireComputerUseInput().mouseScroll(amount, direction)
  },

  /** Report the current pointer position. */
  async mouseLocation() {
    const native = requireComputerUseInput()
    return native.mouseLocation()
  },

  // No window-bound methods on macOS
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Screenshot — delegate to @ant/computer-use-swift
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Screenshot backend for macOS; both methods forward to the swift module's
 * `screenshot` API with an empty exclusion list.
 */
const screenshot: ScreenshotPlatform = {
  /** Capture a display; only `displayId` is supplied, the other optional arguments stay undefined. */
  async captureScreen(displayId) {
    const capture = requireComputerUseSwift().screenshot
    return capture.captureExcluding([], undefined, undefined, undefined, displayId)
  },

  /** Capture the rectangle (x, y, w, h). */
  async captureRegion(x, y, w, h) {
    const capture = requireComputerUseSwift().screenshot
    return capture.captureRegion([], x, y, w, h)
  },

  // captureWindow is intentionally not implemented: macOS could use
  // SCContentFilter for window capture, but it is not exposed through this
  // interface yet — captureExcluding handles most use cases.
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Display — delegate to @ant/computer-use-swift
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/** Display backend for macOS; forwards to the swift module's `display` API. */
const display: DisplayPlatform = {
  /** List every display known to the swift module. */
  listAll() {
    return requireComputerUseSwift().display.listAll()
  },

  /** Look up one display's size; `displayId` is passed through unchanged. */
  getSize(displayId) {
    return requireComputerUseSwift().display.getSize(displayId)
  },
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Apps — delegate to @ant/computer-use-swift
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * App management backend for macOS, backed by the swift module (`apps.*`) and
 * the native input module (frontmost-app lookup).
 */
const apps: AppsPlatform = {
  /**
   * List running apps as WindowHandles. On macOS the handle `id` is the
   * bundle id and `title` is the app's display name.
   */
  listRunning(): WindowHandle[] {
    const swift = requireComputerUseSwift()
    const running = swift.apps.listRunning()
    return running.map((app: any) => ({
      id: app.bundleId ?? '',
      pid: 0, // macOS listRunning doesn't expose PID through this API
      title: app.displayName ?? '',
    }))
  },

  /** List installed apps, normalising missing fields to empty strings. */
  async listInstalled() {
    const swift = requireComputerUseSwift()
    const installed = await swift.apps.listInstalled()
    return installed.map((app: any) => ({
      id: app.bundleId ?? '',
      displayName: app.displayName ?? '',
      path: app.path ?? '',
    }))
  },

  /** Open an app through the swift module. */
  async open(name) {
    const swift = requireComputerUseSwift()
    await swift.apps.open(name)
  },

  /**
   * Return the frontmost app, or null when it cannot be identified.
   *
   * An entry without a bundle id is treated as "unknown" (null) so that
   * `FrontmostAppInfo.id` is always a real string; this mirrors the
   * `!info.bundleId` guard used by the executor's frontmost-app lookup.
   */
  getFrontmostApp(): FrontmostAppInfo | null {
    const info = requireComputerUseInput().getFrontmostAppInfo()
    if (!info || !info.bundleId) return null
    return { id: info.bundleId, appName: info.appName }
  },

  /**
   * Find the first running app whose display name contains `title`.
   *
   * macOS has no direct window-title lookup through the current swift API,
   * so this filters listRunning() (whose `title` is the app display name).
   * The lookup goes through `apps` rather than `this` so the method keeps
   * working when it is detached from the object.
   */
  findWindowByTitle(title): WindowHandle | null {
    const running = apps.listRunning()
    return running.find(handle => handle.title.includes(title)) ?? null
  },
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Export
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
export const platform: Platform = { input, screenshot, display, apps }
|
||||||
41
src/utils/computerUse/platforms/index.ts
Normal file
41
src/utils/computerUse/platforms/index.ts
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
/**
|
||||||
|
* Platform dispatcher for Computer Use.
|
||||||
|
*
|
||||||
|
* Loads the correct platform backend based on `process.platform`.
|
||||||
|
* Each backend implements the same unified interface.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import type { InputPlatform, ScreenshotPlatform, DisplayPlatform, AppsPlatform, WindowManagementPlatform } from './types.js'
|
||||||
|
|
||||||
|
/**
 * The set of backends a platform module exports as `platform`.
 *
 * `windowManagement` is optional: the darwin and linux modules export only
 * `{ input, screenshot, display, apps }`.
 */
export interface Platform {
  /** Mouse/keyboard input. */
  input: InputPlatform
  /** Screen, region and (optionally) window capture. */
  screenshot: ScreenshotPlatform
  /** Display enumeration and size lookup. */
  display: DisplayPlatform
  /** Running/installed app listing, launching and window lookup. */
  apps: AppsPlatform
  /** HWND-targeted window management; absent on backends that do not provide it. */
  windowManagement?: WindowManagementPlatform
}
|
||||||
|
|
||||||
|
/** Backend resolved by the first successful loadPlatform() call. */
let cached: Platform | undefined

/**
 * Return the backend for the current `process.platform`, loading it on first
 * use and reusing the cached instance afterwards.
 *
 * @throws Error when `process.platform` is not darwin, win32 or linux.
 */
export function loadPlatform(): Platform {
  if (cached) return cached

  let loaded: Platform
  switch (process.platform) {
    case 'darwin':
      loaded = require('./darwin.js').platform
      break
    case 'win32':
      loaded = require('./win32.js').platform
      break
    case 'linux':
      loaded = require('./linux.js').platform
      break
    default:
      throw new Error(`Computer Use not supported on ${process.platform}`)
  }

  cached = loaded
  return loaded
}
|
||||||
|
|
||||||
|
export type { InputPlatform, ScreenshotPlatform, DisplayPlatform, AppsPlatform, WindowManagementPlatform } from './types.js'
|
||||||
|
export type { WindowHandle, ScreenshotResult, DisplayInfo, InstalledApp, FrontmostAppInfo, WindowAction } from './types.js'
|
||||||
416
src/utils/computerUse/platforms/linux.ts
Normal file
416
src/utils/computerUse/platforms/linux.ts
Normal file
@@ -0,0 +1,416 @@
|
|||||||
|
/**
|
||||||
|
* Linux platform backend for Computer Use.
|
||||||
|
*
|
||||||
|
* Uses:
|
||||||
|
* - xdotool for mouse/keyboard input
|
||||||
|
* - scrot for screenshots (converted to JPEG)
|
||||||
|
* - xrandr for display enumeration
|
||||||
|
* - wmctrl for window management
|
||||||
|
*
|
||||||
|
* CRITICAL: All screenshots output JPEG. scrot outputs PNG by default,
|
||||||
|
* so we pipe through ImageMagick `convert` to produce JPEG.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import type { Platform } from './index.js'
|
||||||
|
import type {
|
||||||
|
InputPlatform,
|
||||||
|
ScreenshotPlatform,
|
||||||
|
DisplayPlatform,
|
||||||
|
AppsPlatform,
|
||||||
|
WindowHandle,
|
||||||
|
ScreenshotResult,
|
||||||
|
DisplayInfo,
|
||||||
|
InstalledApp,
|
||||||
|
FrontmostAppInfo,
|
||||||
|
} from './types.js'
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Shell helpers
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Run `cmd` synchronously and return its stdout, decoded and trimmed.
 * The exit code and stderr are not inspected.
 */
function run(cmd: string[]): string {
  const { stdout } = Bun.spawnSync({ cmd, stdout: 'pipe', stderr: 'pipe' })
  const decoder = new TextDecoder()
  return decoder.decode(stdout).trim()
}
|
||||||
|
|
||||||
|
/**
 * Spawn `cmd`, collect its stdout, wait for the process to exit and return
 * the trimmed output. The exit code and stderr are not inspected.
 */
async function runAsync(cmd: string[]): Promise<string> {
  const child = Bun.spawn(cmd, { stdout: 'pipe', stderr: 'pipe' })
  const stdoutText = await new Response(child.stdout).text()
  await child.exited
  return stdoutText.trim()
}
|
||||||
|
|
||||||
|
/** True when `which <name>` exits with status 0. */
function commandExists(name: string): boolean {
  const { exitCode } = Bun.spawnSync({
    cmd: ['which', name],
    stdout: 'pipe',
    stderr: 'pipe',
  })
  return exitCode === 0
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// xdotool key name mapping
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/** Lower-cased key name → xdotool key name. */
const KEY_MAP: Record<string, string> = {
  return: 'Return', enter: 'Return', tab: 'Tab', space: 'space',
  backspace: 'BackSpace', delete: 'Delete', escape: 'Escape', esc: 'Escape',
  left: 'Left', up: 'Up', right: 'Right', down: 'Down',
  home: 'Home', end: 'End', pageup: 'Prior', pagedown: 'Next',
  f1: 'F1', f2: 'F2', f3: 'F3', f4: 'F4', f5: 'F5', f6: 'F6',
  f7: 'F7', f8: 'F8', f9: 'F9', f10: 'F10', f11: 'F11', f12: 'F12',
  shift: 'shift', lshift: 'shift', rshift: 'shift',
  control: 'ctrl', ctrl: 'ctrl', lcontrol: 'ctrl', rcontrol: 'ctrl',
  alt: 'alt', option: 'alt', lalt: 'alt', ralt: 'alt',
  win: 'super', meta: 'super', command: 'super', cmd: 'super', super: 'super',
  insert: 'Insert', printscreen: 'Print', pause: 'Pause',
  numlock: 'Num_Lock', capslock: 'Caps_Lock', scrolllock: 'Scroll_Lock',
}

/** Lower-cased names that are treated as modifiers when building a combo. */
const MODIFIER_KEYS = new Set([
  'shift', 'lshift', 'rshift', 'control', 'ctrl', 'lcontrol', 'rcontrol',
  'alt', 'option', 'lalt', 'ralt', 'win', 'meta', 'command', 'cmd', 'super',
])

/**
 * Translate a key name to its xdotool equivalent (case-insensitive).
 * Names without a mapping are returned unchanged.
 *
 * Only own properties of KEY_MAP are consulted: a bare `KEY_MAP[name]`
 * lookup would resolve inherited members such as `constructor` or
 * `__proto__` and hand a non-string back to the caller.
 */
function mapKey(name: string): string {
  const lowered = name.toLowerCase()
  if (!Object.prototype.hasOwnProperty.call(KEY_MAP, lowered)) return name
  return KEY_MAP[lowered] ?? name
}
|
||||||
|
|
||||||
|
/** Map a logical mouse button to the button number passed to `xdotool click`. */
function mouseButtonNum(button: 'left' | 'right' | 'middle'): string {
  switch (button) {
    case 'left':
      return '1'
    case 'right':
      return '3'
    default:
      return '2'
  }
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Input — xdotool
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Global input backend for Linux, implemented by shelling out to xdotool.
 * No window-bound methods are provided.
 */
const input: InputPlatform = {
  /** Move the pointer to the rounded (x, y) and wait for the move to apply (`--sync`). */
  async moveMouse(x, y) {
    run(['xdotool', 'mousemove', '--sync', String(Math.round(x)), String(Math.round(y))])
  },

  /** Move the pointer to the rounded (x, y), then click `button`. */
  async click(x, y, button) {
    run(['xdotool', 'mousemove', '--sync', String(Math.round(x)), String(Math.round(y))])
    run(['xdotool', 'click', mouseButtonNum(button)])
  },

  /**
   * Type `text` with a 12 ms inter-key delay.
   *
   * `--` ends option parsing so text that starts with `-` (for example
   * "--help" or "-1") is typed literally instead of being read as an
   * xdotool option.
   */
  async typeText(text) {
    run(['xdotool', 'type', '--delay', '12', '--', text])
  },

  /** Hold a key down for 'press'; any other action releases it. */
  async key(name, action) {
    const subcommand = action === 'press' ? 'keydown' : 'keyup'
    run(['xdotool', subcommand, mapKey(name)])
  },

  /**
   * Send a key combination such as ['ctrl', 'shift', 't'].
   *
   * Modifier names are collected in order; the last non-modifier entry is the
   * key that is pressed. A combination made only of modifiers is a no-op.
   */
  async keys(parts) {
    const modifiers: string[] = []
    let finalKey: string | null = null

    for (const part of parts) {
      if (MODIFIER_KEYS.has(part.toLowerCase())) {
        modifiers.push(mapKey(part))
      } else {
        finalKey = part
      }
    }
    if (!finalKey) return

    run(['xdotool', 'key', [...modifiers, mapKey(finalKey)].join('+')])
  },

  /**
   * Scroll by clicking the X11 wheel buttons: 4/5 for vertical and 6/7 for
   * horizontal, with non-negative amounts using 5 and 7. The amount is
   * rounded to a whole number of clicks; zero clicks is a no-op.
   */
  async scroll(amount, direction) {
    const clicks = Math.abs(Math.round(amount))
    if (!(clicks > 0)) return

    const forward = amount >= 0
    const button =
      direction === 'vertical' ? (forward ? '5' : '4') : (forward ? '7' : '6')
    run(['xdotool', 'click', '--repeat', String(clicks), button])
  },

  /** Read the pointer position from `xdotool getmouselocation`; unparsed axes default to 0. */
  async mouseLocation() {
    const out = run(['xdotool', 'getmouselocation'])
    const xMatch = out.match(/x:(\d+)/)
    const yMatch = out.match(/y:(\d+)/)
    return {
      x: xMatch ? Number(xMatch[1]) : 0,
      y: yMatch ? Number(yMatch[1]) : 0,
    }
  },

  // No window-bound input on Linux
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Screenshot — scrot → JPEG conversion
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
const SCREENSHOT_TMP = '/tmp/cu-screenshot-tmp.png'
|
||||||
|
const SCREENSHOT_JPG = '/tmp/cu-screenshot.jpg'
|
||||||
|
|
||||||
|
/**
 * Convert the PNG at `pngPath` to JPEG and return it base64-encoded together
 * with the caller-supplied dimensions.
 *
 * Converters are tried in order: ImageMagick `convert`, then `ffmpeg`. If
 * neither is installed, or neither produces output, the PNG bytes themselves
 * are returned (callers should be aware the payload is then not JPEG).
 *
 * The JPEG is written to a fixed path, and runAsync() does not report
 * failures. The previous output is therefore deleted before each attempt and
 * the result is only accepted if a non-empty file was actually produced.
 * Otherwise a failed conversion would silently return the JPEG of an earlier
 * capture.
 *
 * @param pngPath path of the PNG to convert
 * @param width   width to report in the result (not derived from the image)
 * @param height  height to report in the result (not derived from the image)
 */
async function pngToJpegBase64(pngPath: string, width: number, height: number): Promise<ScreenshotResult> {
  const converters: string[][] = [
    ['convert', pngPath, '-quality', '75', SCREENSHOT_JPG],
    ['ffmpeg', '-y', '-i', pngPath, '-q:v', '5', SCREENSHOT_JPG],
  ]

  for (const command of converters) {
    const tool = command[0]
    if (tool === undefined || !commandExists(tool)) continue

    // Drop any stale output so that "file exists" means "this run succeeded".
    await runAsync(['rm', '-f', SCREENSHOT_JPG])
    await runAsync(command)

    const jpeg = Bun.file(SCREENSHOT_JPG)
    if (!(await jpeg.exists())) continue
    const bytes = await jpeg.arrayBuffer()
    if (bytes.byteLength === 0) continue
    return { base64: Buffer.from(bytes).toString('base64'), width, height }
  }

  // Last resort: return PNG base64 (caller should be aware)
  const pngBytes = await Bun.file(pngPath).arrayBuffer()
  return { base64: Buffer.from(pngBytes).toString('base64'), width, height }
}
|
||||||
|
|
||||||
|
/**
 * Screenshot backend for Linux (scrot / ImageMagick `import`).
 *
 * Every method is best-effort: failures are swallowed and reported as an
 * empty result (or null for captureWindow). The conversion promise is
 * awaited inside the `try` on purpose, because a bare
 * `return pngToJpegBase64(...)` would let a rejection escape the `catch`
 * and reach the caller.
 */
const screenshot: ScreenshotPlatform = {
  /**
   * Capture with scrot and report the dimensions of `displayId` (or the
   * first display). Returns an empty 0x0 result on failure.
   */
  async captureScreen(displayId) {
    try {
      await runAsync(['scrot', '-o', SCREENSHOT_TMP])
      const size = display.getSize(displayId)
      return await pngToJpegBase64(SCREENSHOT_TMP, size.width, size.height)
    } catch {
      return { base64: '', width: 0, height: 0 }
    }
  },

  /** Capture the rectangle (x, y, w, h); returns an empty w×h result on failure. */
  async captureRegion(x, y, w, h) {
    try {
      await runAsync(['scrot', '-a', `${x},${y},${w},${h}`, '-o', SCREENSHOT_TMP])
      return await pngToJpegBase64(SCREENSHOT_TMP, w, h)
    } catch {
      return { base64: '', width: w, height: h }
    }
  },

  /**
   * Capture a single window by X11 window id using ImageMagick `import`.
   * Returns null when `import` is not installed or the capture fails.
   * Width/height come from `xdotool getwindowgeometry` and are 0 if it
   * cannot be parsed.
   */
  async captureWindow(hwnd) {
    try {
      if (!commandExists('import')) return null

      const jpgPath = '/tmp/cu-window-capture.jpg'
      await runAsync(['import', '-window', hwnd, '-quality', '75', jpgPath])

      // Get dimensions from xdotool
      const geom = run(['xdotool', 'getwindowgeometry', '--shell', hwnd])
      const wMatch = geom.match(/WIDTH=(\d+)/)
      const hMatch = geom.match(/HEIGHT=(\d+)/)
      const width = wMatch ? Number(wMatch[1]) : 0
      const height = hMatch ? Number(hMatch[1]) : 0

      const buffer = await Bun.file(jpgPath).arrayBuffer()
      return { base64: Buffer.from(buffer).toString('base64'), width, height }
    } catch {
      return null
    }
  },
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Display — xrandr
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/** Display backend for Linux, parsed from `xrandr --query`. */
const display: DisplayPlatform = {
  /**
   * List connected outputs that have an active geometry. `displayId` is the
   * zero-based position in xrandr's output order and `scaleFactor` is always
   * 1. When nothing can be parsed, or xrandr cannot be run, a single
   * 1920x1080 display is reported.
   */
  listAll(): DisplayInfo[] {
    try {
      const xrandrOutput = run(['xrandr', '--query'])
      const connectedPattern = /^\S+\s+connected\s+(?:primary\s+)?(\d+)x(\d+)\+\d+\+\d+/gm

      const found = Array.from(
        xrandrOutput.matchAll(connectedPattern),
        (hit, position): DisplayInfo => ({
          width: Number(hit[1]),
          height: Number(hit[2]),
          scaleFactor: 1,
          displayId: position,
        }),
      )

      return found.length > 0
        ? found
        : [{ width: 1920, height: 1080, scaleFactor: 1, displayId: 0 }]
    } catch {
      return [{ width: 1920, height: 1080, scaleFactor: 1, displayId: 0 }]
    }
  },

  /**
   * Return the display with the given id, falling back to the first display
   * (then to 1920x1080) when the id is omitted or unknown.
   */
  getSize(displayId): DisplayInfo {
    const displays = this.listAll()
    const requested =
      displayId === undefined
        ? undefined
        : displays.find(candidate => candidate.displayId === displayId)
    return requested ?? displays[0] ?? { width: 1920, height: 1080, scaleFactor: 1, displayId: 0 }
  },
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Apps — wmctrl + ps + .desktop files
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * App/window backend for Linux (wmctrl, xdotool, /proc and .desktop files).
 * All methods are best-effort and return an empty/null result on failure.
 */
const apps: AppsPlatform = {
  /**
   * List up to 50 top-level windows.
   *
   * Prefers `wmctrl -l -p`, whose columns are
   * `<window id> <desktop> <pid> <hostname> <title...>`; windows without a
   * PID are skipped and duplicate ids are dropped. Falls back to
   * `xdotool search` when wmctrl is not installed.
   */
  listRunning(): WindowHandle[] {
    try {
      if (commandExists('wmctrl')) {
        const handles: WindowHandle[] = []
        const seen = new Set<string>()

        for (const line of run(['wmctrl', '-l', '-p']).split('\n').filter(Boolean)) {
          // Stop before resolving more executables than will be returned.
          if (handles.length >= 50) break

          const parts = line.split(/\s+/)
          const windowId = parts[0] ?? ''
          const pid = Number(parts[2])
          if (!pid) continue
          if (seen.has(windowId)) continue
          seen.add(windowId)

          // Title is everything after the 4th field (hostname)
          const title = parts.slice(4).join(' ')

          let exePath = ''
          try { exePath = run(['readlink', '-f', `/proc/${pid}/exe`]) } catch {}

          handles.push({
            id: windowId,
            pid,
            title,
            exePath: exePath || undefined,
          })
        }
        return handles
      }

      // Fallback: xdotool search
      const raw = run(['xdotool', 'search', '--name', ''])
      const handles: WindowHandle[] = []
      for (const windowId of raw.split('\n').filter(Boolean).slice(0, 50)) {
        const title = run(['xdotool', 'getwindowname', windowId])
        let pid = 0
        try { pid = Number(run(['xdotool', 'getwindowpid', windowId])) } catch {}
        if (title) {
          handles.push({ id: windowId, pid, title })
        }
      }
      return handles
    } catch {
      return []
    }
  },

  /**
   * List up to 200 installed apps from the standard .desktop directories.
   *
   * Entries with `NoDisplay=true` or without a `Name=` are skipped. `id` is
   * the file name without its `.desktop` suffix and `path` is the first
   * token of the `Exec=` line. The per-user directory is only searched when
   * HOME is set.
   */
  async listInstalled(): Promise<InstalledApp[]> {
    try {
      const home = process.env.HOME
      const dirs = [
        '/usr/share/applications',
        '/usr/local/share/applications',
        ...(home ? [`${home}/.local/share/applications`] : []),
      ]
      const result: InstalledApp[] = []

      for (const dir of dirs) {
        let files: string
        try {
          files = run(['find', dir, '-maxdepth', '1', '-name', '*.desktop'])
        } catch { continue }

        for (const filepath of files.split('\n').filter(Boolean)) {
          try {
            // Read in-process instead of spawning one `cat` per file.
            const content = await Bun.file(filepath).text()
            if (/^NoDisplay=true$/m.test(content)) continue

            const name = content.match(/^Name=(.+)$/m)?.[1] ?? ''
            if (!name) continue
            const exec = content.match(/^Exec=(.+)$/m)?.[1] ?? ''

            result.push({
              // Strip only the trailing extension, not the first ".desktop"
              // that happens to appear in the file name.
              id: filepath.split('/').pop()?.replace(/\.desktop$/, '') ?? '',
              displayName: name,
              path: exec.split(/\s+/)[0] ?? '',
            })
          } catch { /* skip unreadable */ }
        }
      }

      return result.slice(0, 200)
    } catch {
      return []
    }
  },

  /**
   * Launch an app by .desktop name via `gtk-launch`, falling back to
   * `xdg-open <name>` when gtk-launch is missing, cannot be spawned, or
   * exits with a non-zero status.
   */
  async open(name) {
    try {
      const desktopName = name.endsWith('.desktop') ? name : `${name}.desktop`
      if (commandExists('gtk-launch')) {
        // Spawned directly because runAsync() does not expose the exit code,
        // which is what tells us whether the fallback is needed.
        const launcher = Bun.spawn(['gtk-launch', desktopName], {
          stdout: 'ignore',
          stderr: 'ignore',
        })
        if ((await launcher.exited) === 0) return
      }
    } catch { /* fall through */ }
    await runAsync(['xdg-open', name])
  },

  /**
   * Identify the active window's process. `id` is the resolved executable
   * path (or the /proc link when it cannot be resolved) and `appName` is the
   * process's `comm` name.
   */
  getFrontmostApp(): FrontmostAppInfo | null {
    try {
      const windowId = run(['xdotool', 'getactivewindow'])
      if (!windowId) return null

      const pidStr = run(['xdotool', 'getwindowpid', windowId])
      if (!pidStr) return null

      let exePath = ''
      try { exePath = run(['readlink', '-f', `/proc/${pidStr}/exe`]) } catch {}
      let appName = ''
      try { appName = run(['cat', `/proc/${pidStr}/comm`]) } catch {}

      if (!exePath && !appName) return null
      return { id: exePath || `/proc/${pidStr}/exe`, appName: appName || 'unknown' }
    } catch {
      return null
    }
  },

  /**
   * Return the first window reported by `xdotool search --name <title>`, or
   * null when there is none. The title is handed to xdotool unchanged.
   */
  findWindowByTitle(title): WindowHandle | null {
    try {
      // xdotool search by name
      const raw = run(['xdotool', 'search', '--name', title])
      const windowId = raw.split('\n')[0]
      if (!windowId) return null

      const windowTitle = run(['xdotool', 'getwindowname', windowId])
      let pid = 0
      try { pid = Number(run(['xdotool', 'getwindowpid', windowId])) } catch {}

      return { id: windowId, pid, title: windowTitle }
    } catch {
      return null
    }
  },
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Export
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
export const platform: Platform = { input, screenshot, display, apps }
|
||||||
153
src/utils/computerUse/platforms/types.ts
Normal file
153
src/utils/computerUse/platforms/types.ts
Normal file
@@ -0,0 +1,153 @@
|
|||||||
|
/**
|
||||||
|
* Cross-platform abstraction types for Computer Use.
|
||||||
|
*
|
||||||
|
* These interfaces define a unified API surface for input, screenshots,
|
||||||
|
* display info, and app management across macOS, Windows, and Linux.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Window / App types
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Cross-platform window identifier.
 *
 * What a "window" is differs per backend: the macOS backend returns one
 * handle per running app, while the Linux backend returns one per X11 window.
 */
export interface WindowHandle {
  id: string // macOS: bundleId, Windows: HWND string, Linux: window ID
  pid: number // owning process id; the macOS backend reports 0 because its API does not expose it
  title: string // window title; on macOS this is the app's display name
  exePath?: string // Windows/Linux: process executable path
}
|
||||||
|
|
||||||
|
/** An encoded screenshot together with the dimensions reported by the backend. */
export interface ScreenshotResult {
  // Base64-encoded image bytes. Intended to be JPEG; the Linux backend falls
  // back to PNG bytes when no converter is available, and uses '' on failure.
  base64: string
  // Reported width; the Linux backend takes it from the display/request/window
  // geometry rather than from the image itself.
  width: number
  // Reported height, obtained the same way as `width`.
  height: number
}
|
||||||
|
|
||||||
|
/** Size and identity of one display. */
export interface DisplayInfo {
  width: number
  height: number
  // The Linux backend always reports 1.
  scaleFactor: number
  // Backend-defined identifier; on Linux it is the zero-based position in
  // xrandr's output order.
  displayId: number
}
|
||||||
|
|
||||||
|
/** An application that can be launched on this machine. */
export interface InstalledApp {
  id: string // macOS: bundleId, Windows: exe path or package family, Linux: .desktop name
  displayName: string // human-readable name (Linux: the .desktop file's Name= value)
  path: string // macOS: app path from the swift module; Linux: first token of the Exec= line
}
|
||||||
|
|
||||||
|
/** Identity of the app that currently owns the foreground window. */
export interface FrontmostAppInfo {
  // macOS: bundleId; Linux: resolved executable path of the active window's process.
  id: string
  // macOS: app name from the native module; Linux: the process's `comm` name, or 'unknown'.
  appName: string
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// InputPlatform
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Input platform interface — two modes:
 *
 * Mode A (Global): moveMouse, click, typeText, key, keys, scroll, mouseLocation
 *   Works on all platforms. Sends input to the foreground window; moves the
 *   real cursor and steals focus.
 *
 * Mode B (Window-bound, optional): sendChar, sendKey, sendClick, sendText
 *   Windows-only via SendMessage/PostMessage. Does NOT steal focus or move
 *   the cursor. Preferred when a target HWND is known.
 *
 * The macOS and Linux backends implement Mode A only, so callers must check
 * that a Mode B method exists before calling it.
 */
export interface InputPlatform {
  // --- Mode A: Global input (all platforms) ---
  /** Move the pointer to (x, y). */
  moveMouse(x: number, y: number): Promise<void>
  /** Move the pointer to (x, y) and click `button` there. */
  click(
    x: number,
    y: number,
    button: 'left' | 'right' | 'middle',
  ): Promise<void>
  /** Type a string of text. */
  typeText(text: string): Promise<void>
  /** Press ('press') or release ('release') a single named key. */
  key(name: string, action: 'press' | 'release'): Promise<void>
  /** Send a key combination, e.g. modifiers followed by a final key. */
  keys(combo: string[]): Promise<void>
  /** Scroll by `amount` along `direction`; the sign selects the scroll direction. */
  scroll(amount: number, direction: 'vertical' | 'horizontal'): Promise<void>
  /** Current pointer position. */
  mouseLocation(): Promise<{ x: number; y: number }>

  // --- Mode B: Window-bound input (Windows only, optional) ---
  sendChar?(hwnd: string, char: string): Promise<void>
  sendKey?(hwnd: string, vk: number, action: 'down' | 'up'): Promise<void>
  sendClick?(
    hwnd: string,
    x: number,
    y: number,
    button: 'left' | 'right',
  ): Promise<void>
  sendText?(hwnd: string, text: string): Promise<void>
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// ScreenshotPlatform
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/** Screenshot capture. Results carry the image as base64 plus reported dimensions. */
export interface ScreenshotPlatform {
  /** Full-screen capture. Returns JPEG base64. */
  captureScreen(displayId?: number): Promise<ScreenshotResult>
  /** Region capture. Returns JPEG base64. */
  captureRegion(
    x: number,
    y: number,
    w: number,
    h: number,
  ): Promise<ScreenshotResult>
  /**
   * Window capture (Windows: PrintWindow, Linux: ImageMagick `import` plus
   * xdotool for the geometry). Optional: the macOS backend does not
   * implement it. Resolves to null when the capture is unavailable or fails.
   */
  captureWindow?(hwnd: string): Promise<ScreenshotResult | null>
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// DisplayPlatform
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/** Synchronous display enumeration. */
export interface DisplayPlatform {
  /** List all displays known to the backend. */
  listAll(): DisplayInfo[]
  /**
   * Size of one display. With no `displayId` the backend picks a default
   * (the Linux backend uses the first listed display, also for unknown ids).
   */
  getSize(displayId?: number): DisplayInfo
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// AppsPlatform
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/** App and window discovery/launching. */
export interface AppsPlatform {
  /** Running apps/windows as handles (the Linux backend returns at most 50). */
  listRunning(): WindowHandle[]
  /** Installed, launchable apps (the Linux backend returns at most 200). */
  listInstalled(): Promise<InstalledApp[]>
  /** Launch an app by backend-specific name (Linux: .desktop name, with or without the suffix). */
  open(name: string): Promise<void>
  /** The app owning the foreground window, or null when it cannot be determined. */
  getFrontmostApp(): FrontmostAppInfo | null
  /**
   * First window matching `title`, or null. Matching is backend-specific:
   * macOS does a substring match on app display names, Linux delegates to
   * `xdotool search --name`.
   */
  findWindowByTitle(title: string): WindowHandle | null
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// WindowManagementPlatform (Windows HWND-targeted, no global APIs)
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/** Actions accepted by WindowManagementPlatform.manageWindow(). */
export type WindowAction =
  | 'minimize'
  | 'maximize'
  | 'restore'
  | 'close'
  | 'focus'
  | 'move_offscreen'
  | 'move_resize'
  | 'get_rect'
|
||||||
|
|
||||||
|
/**
 * Window management for the currently bound window. None of the methods take
 * a window handle; they act on the HWND the backend is bound to.
 */
export interface WindowManagementPlatform {
  /**
   * Perform a window management action on the bound HWND. All via Win32 API, no global shortcuts.
   * `opts` carries optional geometry (x, y, width, height).
   * NOTE(review): the boolean presumably signals success — confirm against the win32 backend.
   */
  manageWindow(
    action: WindowAction,
    opts?: { x?: number; y?: number; width?: number; height?: number },
  ): boolean
  /** Move window to specific position and/or resize */
  moveResize(x: number, y: number, width?: number, height?: number): boolean
  /** Get current window rect, or null when it is not available */
  getWindowRect(): {
    x: number
    y: number
    width: number
    height: number
  } | null
}
|
||||||
979
src/utils/computerUse/platforms/win32.ts
Normal file
979
src/utils/computerUse/platforms/win32.ts
Normal file
@@ -0,0 +1,979 @@
|
|||||||
|
/**
|
||||||
|
* Windows platform backend for Computer Use.
|
||||||
|
*
|
||||||
|
* Combines:
|
||||||
|
* - PowerShell SetCursorPos/SendInput for global input (fallback)
|
||||||
|
* - win32/windowMessage.ts for window-bound SendMessage input (preferred)
|
||||||
|
* - Python Bridge (bridge.py) for screenshots (mss + ctypes PrintWindow)
|
||||||
|
* - win32/windowEnum.ts for EnumWindows app listing
|
||||||
|
* - No PowerShell for screenshots (Python Bridge only, no PS fallback)
|
||||||
|
* - PowerShell Screen.AllScreens for display enumeration
|
||||||
|
*
|
||||||
|
* CRITICAL: All screenshots output JPEG (ImageFormat::Jpeg), not PNG.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import type { Platform } from './index.js'
|
||||||
|
import type {
|
||||||
|
InputPlatform,
|
||||||
|
ScreenshotPlatform,
|
||||||
|
DisplayPlatform,
|
||||||
|
AppsPlatform,
|
||||||
|
WindowHandle,
|
||||||
|
ScreenshotResult,
|
||||||
|
DisplayInfo,
|
||||||
|
InstalledApp,
|
||||||
|
FrontmostAppInfo,
|
||||||
|
} from './types.js'
|
||||||
|
import { listWindows } from '../win32/windowEnum.js'
|
||||||
|
import { detectAppType, openWithController } from '../win32/appDispatcher.js'
|
||||||
|
import {
|
||||||
|
markBound,
|
||||||
|
unmarkBound,
|
||||||
|
cleanupAllBorders,
|
||||||
|
} from '../win32/windowBorder.js'
|
||||||
|
import {
|
||||||
|
showVirtualCursor,
|
||||||
|
hideVirtualCursor,
|
||||||
|
moveVirtualCursor,
|
||||||
|
} from '../win32/virtualCursor.js'
|
||||||
|
import { showIndicator, hideIndicator } from '../win32/inputIndicator.js'
|
||||||
|
import {
|
||||||
|
ps,
|
||||||
|
psAsync,
|
||||||
|
validateHwnd,
|
||||||
|
VK_MAP,
|
||||||
|
MODIFIER_KEYS,
|
||||||
|
} from '../win32/shared.js'
|
||||||
|
import { logForDebugging } from '../../debug.js'
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Python Bridge (lazy-loaded, preferred over PowerShell for screenshots)
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
let _bridge: typeof import('../win32/bridgeClient.js') | undefined

/**
 * Lazily load the Python bridge client module and cache it.
 *
 * A load failure is logged for debugging and reported as `undefined`, so
 * callers can decide how to react. Because nothing is cached on failure, the
 * load is attempted again on the next call.
 *
 * @returns The bridge client module, or `undefined` when it cannot be loaded.
 */
function getBridge(): typeof import('../win32/bridgeClient.js') | undefined {
  if (_bridge) return _bridge
  try {
    // eslint-disable-next-line @typescript-eslint/no-require-imports
    _bridge =
      require('../win32/bridgeClient.js') as typeof import('../win32/bridgeClient.js')
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err)
    logForDebugging(
      `[computer-use] failed to load Python bridge client: ${reason}`,
    )
  }
  return _bridge
}
|
||||||
|
|
||||||
|
/**
 * Invoke a Python bridge method synchronously.
 *
 * Never throws: if the bridge module is unavailable or the call itself fails,
 * the failure is logged for debugging and `null` is returned. The screenshot
 * callers in this file treat `null` as a hard error (there is no PowerShell
 * fallback for screenshots).
 *
 * @param method - Bridge method name.
 * @param params - Parameters forwarded to the bridge method.
 * @returns The bridge result, or `null` on any failure.
 */
function bridgeCallSync<T>(
  method: string,
  params: Record<string, unknown> = {},
): T | null {
  try {
    const b = getBridge()
    if (!b) return null
    return b.callSync<T>(method, params)
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err)
    logForDebugging(
      `[computer-use] Python bridge call '${method}' failed: ${reason}`,
    )
    return null
  }
}
|
||||||
|
|
||||||
|
// validateHwnd, ps, psAsync, VK_MAP, MODIFIER_KEYS imported from '../win32/shared.js'
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Win32 P/Invoke types (compiled once per PS session)
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * PowerShell prelude that compiles the `CuWin32` P/Invoke helper class used by
 * the global (unbound) input paths and by foreground-window queries.
 *
 * Layout notes for the SendInput structs:
 * - The native INPUT struct is a DWORD `type` followed by a union. On 64-bit
 *   Windows the union starts at offset 8 (pointer alignment), not 4, so the
 *   structs use sequential layout and let the marshaller place the nested
 *   struct at its natural alignment on either architecture.
 * - SendInput rejects calls whose `cbSize` differs from sizeof(INPUT), which
 *   is sized for its largest union member (MOUSEINPUT). KINPUT therefore
 *   carries two trailing padding fields so its size matches INPUT.
 *
 * The here-string terminator (`'@`) must stay at the very start of its line.
 */
const WIN32_TYPES = `
Add-Type -Language CSharp @'
using System;
using System.Runtime.InteropServices;
using System.Text;
using System.Diagnostics;

public class CuWin32 {
// --- Cursor ---
[DllImport("user32.dll")] public static extern bool SetCursorPos(int X, int Y);
[DllImport("user32.dll")] public static extern bool GetCursorPos(out POINT p);
[StructLayout(LayoutKind.Sequential)] public struct POINT { public int X; public int Y; }

// --- SendInput ---
[StructLayout(LayoutKind.Sequential)] public struct MOUSEINPUT {
public int dx; public int dy; public int mouseData; public uint dwFlags; public uint time; public IntPtr dwExtraInfo;
}
// Sequential layout: mi lands at offset 4 on x86 and offset 8 on x64.
[StructLayout(LayoutKind.Sequential)] public struct INPUT {
public uint type;
public MOUSEINPUT mi;
}
[StructLayout(LayoutKind.Sequential)] public struct KEYBDINPUT {
public ushort wVk; public ushort wScan; public uint dwFlags; public uint time; public IntPtr dwExtraInfo;
}
// pad0/pad1 grow KINPUT to sizeof(INPUT) (28 bytes on x86, 40 on x64).
[StructLayout(LayoutKind.Sequential)] public struct KINPUT {
public uint type;
public KEYBDINPUT ki;
public uint pad0; public uint pad1;
}
[DllImport("user32.dll", SetLastError=true)] public static extern uint SendInput(uint n, INPUT[] i, int cb);
[DllImport("user32.dll", SetLastError=true)] public static extern uint SendInput(uint n, KINPUT[] i, int cb);

// --- Keyboard ---
[DllImport("user32.dll")] public static extern void keybd_event(byte bVk, byte bScan, uint dwFlags, UIntPtr dwExtraInfo);
[DllImport("user32.dll")] public static extern short VkKeyScan(char ch);

// --- Window ---
[DllImport("user32.dll")] public static extern IntPtr GetForegroundWindow();
[DllImport("user32.dll")] public static extern uint GetWindowThreadProcessId(IntPtr hWnd, out uint pid);
[DllImport("user32.dll", CharSet=CharSet.Unicode)] public static extern int GetWindowText(IntPtr hWnd, StringBuilder sb, int max);

// Constants
public const uint INPUT_MOUSE = 0, INPUT_KEYBOARD = 1;
public const uint MOUSEEVENTF_LEFTDOWN = 0x0002, MOUSEEVENTF_LEFTUP = 0x0004;
public const uint MOUSEEVENTF_RIGHTDOWN = 0x0008, MOUSEEVENTF_RIGHTUP = 0x0010;
public const uint MOUSEEVENTF_MIDDLEDOWN = 0x0020, MOUSEEVENTF_MIDDLEUP = 0x0040;
public const uint MOUSEEVENTF_WHEEL = 0x0800, MOUSEEVENTF_HWHEEL = 0x1000;
public const uint KEYEVENTF_KEYUP = 0x0002;
}
'@
`
|
||||||
|
|
||||||
|
// VK_MAP and MODIFIER_KEYS imported from '../win32/shared.js'
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Session-level HWND binding — all operations target this handle
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// HWND of the window all input/screenshot calls target; null when unbound.
let boundHwnd: string | null = null
// PID supplied alongside the bound HWND, if any (written by bindWindow).
let boundPid: number | null = null
// App type of the current binding ('generic' for plain window bindings).
let boundAppType: import('../win32/appDispatcher.js').AppType | null = null
// File path for COM-controlled bindings set by bindFile; null otherwise.
let boundFilePath: string | null = null
|
||||||
|
|
||||||
|
/**
 * Get the HWND of the currently bound window.
 *
 * @returns The bound HWND string, or `null` when no window is bound.
 */
export function getBoundHwnd(): string | null {
  return boundHwnd
}
|
||||||
|
|
||||||
|
/**
 * Get the app type of the current binding.
 *
 * @returns The bound app type, or `null` when nothing is bound.
 */
export function getBoundAppType(): string | null {
  return boundAppType
}
|
||||||
|
|
||||||
|
/**
 * Build the PowerShell script that briefly brings `hwnd` to the foreground
 * (restoring it first if minimized) and then gives the foreground back to the
 * window that had it before.
 */
function buildActivationScript(hwnd: string): string {
  return `
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class CuActivate {
[DllImport("user32.dll")] public static extern IntPtr GetForegroundWindow();
[DllImport("user32.dll")] public static extern bool SetForegroundWindow(IntPtr h);
[DllImport("user32.dll")] public static extern bool IsIconic(IntPtr h);
[DllImport("user32.dll")] public static extern bool ShowWindow(IntPtr h, int cmd);
}
'@
$prev = [CuActivate]::GetForegroundWindow()
$target = [IntPtr]::new([long]${hwnd})
if ([CuActivate]::IsIconic($target)) { [CuActivate]::ShowWindow($target, 9) | Out-Null }
[CuActivate]::SetForegroundWindow($target) | Out-Null
Start-Sleep -Milliseconds 100
if ($prev -ne [IntPtr]::Zero -and $prev -ne $target) {
[CuActivate]::SetForegroundWindow($prev) | Out-Null
}
`
}

/**
 * Bind the session to a window. Every subsequent input and screenshot
 * operation targets this HWND until it is rebound or unbound.
 *
 * @param hwnd - Window handle; passed through `validateHwnd` before use.
 * @param pid - Optional owning process id, stored with the binding.
 */
export function bindWindow(hwnd: string, pid?: number): void {
  const handle = validateHwnd(hwnd)

  // Tear down the overlays of whatever window was bound before.
  if (boundHwnd) {
    unmarkBound(boundHwnd)
    hideVirtualCursor()
    hideIndicator()
  }

  boundHwnd = handle
  boundPid = pid ?? null
  boundAppType = 'generic'
  boundFilePath = null

  // Some apps (UWP/Electron) ignore SendMessage input until they have been
  // activated at least once, so activate briefly and then restore the user's
  // previous foreground window.
  ps(buildActivationScript(handle))

  // Visual indicators on the newly bound window.
  markBound(handle)
  showVirtualCursor(handle)
  showIndicator(handle)
}
|
||||||
|
|
||||||
|
/**
 * Bind the session to a COM-controlled file (Excel/Word). No window handle is
 * involved.
 *
 * If a window was bound beforehand, its border, virtual cursor and input
 * indicator are removed first. Otherwise they would be orphaned, because
 * `boundHwnd` is cleared here and could no longer be used to remove them.
 *
 * @param filePath - Path of the document to operate on.
 * @param appType - Controller type for the document.
 */
export function bindFile(
  filePath: string,
  appType: import('../win32/appDispatcher.js').AppType,
): void {
  if (boundHwnd) {
    unmarkBound(boundHwnd)
    hideVirtualCursor()
    hideIndicator()
  }
  boundHwnd = null
  boundPid = null
  boundAppType = appType
  boundFilePath = filePath
}
|
||||||
|
|
||||||
|
/**
 * Drop the current binding: remove all overlays, clear the cached
 * edit-child / InputSite lookups, and reset the session state so later calls
 * run in global (unbound) mode.
 */
export function unbindWindow(): void {
  const previousHwnd = boundHwnd
  if (previousHwnd) {
    unmarkBound(previousHwnd)
  }
  hideVirtualCursor()
  hideIndicator()

  const wm = getWm()
  wm.clearEditChildCache()

  boundFilePath = null
  boundAppType = null
  boundPid = null
  boundHwnd = null
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Window Message module (lazy loaded)
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
let _wm: typeof import('../win32/windowMessage.js') | undefined

/**
 * Return the window-message module, loading it on first use and caching it
 * for later calls.
 */
function getWm() {
  if (_wm === undefined || _wm === null) {
    // eslint-disable-next-line @typescript-eslint/no-require-imports
    const loaded =
      require('../win32/windowMessage.js') as typeof import('../win32/windowMessage.js')
    _wm = loaded
  }
  return _wm
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Input — ALL text/key input goes through SendMessage when HWND is bound.
// Global SetCursorPos/SendInput is used only for mouse actions when no window is bound.
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Input — When HWND is bound, ALL operations go through SendMessage.
|
||||||
|
// NO global API (SetCursorPos/SendInput/keybd_event/SendKeys) is used.
|
||||||
|
// This ensures the user's desktop is never disturbed.
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Input backend.
 *
 * With a bound HWND, everything is delivered through window messages and the
 * real cursor/keyboard are left alone. Without a binding, mouse actions fall
 * back to global SetCursorPos/SendInput, while text and key input throw.
 */
const input: InputPlatform = {
  /** Move the virtual cursor (bound) or the real cursor (unbound). */
  async moveMouse(x, y) {
    const px = Math.round(x)
    const py = Math.round(y)
    if (boundHwnd) {
      // Bound mode: move virtual cursor (visual only), no real cursor movement
      moveVirtualCursor(px, py)
      return
    }
    ps(`${WIN32_TYPES}; [CuWin32]::SetCursorPos(${px}, ${py}) | Out-Null`)
  },

  /** Click at (x, y) with the given mouse button. */
  async click(x, y, button) {
    const px = Math.round(x)
    const py = Math.round(y)
    if (boundHwnd) {
      moveVirtualCursor(px, py, true)
      const wm = getWm()
      // Prefer the window's edit child when one is known; otherwise click the
      // bound window itself.
      // NOTE(review): the same (x, y) is sent to either target — confirm that
      // sendClick translates coordinates for the child window.
      const targetHwnd = wm.findEditChild(boundHwnd) ?? boundHwnd
      const ok = wm.sendClick(targetHwnd, px, py, button)
      // Fall back to the top-level window only if the first attempt went to a
      // different (child) window; repeating the identical click is pointless.
      if (!ok && targetHwnd !== boundHwnd) {
        wm.sendClick(boundHwnd, px, py, button)
      }
      return
    }
    const downFlag =
      button === 'left'
        ? 'MOUSEEVENTF_LEFTDOWN'
        : button === 'right'
          ? 'MOUSEEVENTF_RIGHTDOWN'
          : 'MOUSEEVENTF_MIDDLEDOWN'
    const upFlag =
      button === 'left'
        ? 'MOUSEEVENTF_LEFTUP'
        : button === 'right'
          ? 'MOUSEEVENTF_RIGHTUP'
          : 'MOUSEEVENTF_MIDDLEUP'
    // The MOUSEINPUT is filled in a local ($m) and assigned to $i.mi as a
    // whole: reading a value-type field in PowerShell yields a copy, so
    // writing "$i.mi.dwFlags = ..." may never reach $i.
    ps(
      `${WIN32_TYPES}; [CuWin32]::SetCursorPos(${px}, ${py}) | Out-Null; $m = New-Object CuWin32+MOUSEINPUT; $i = New-Object CuWin32+INPUT; $i.type=[CuWin32]::INPUT_MOUSE; $m.dwFlags=[CuWin32]::${downFlag}; $i.mi=$m; [CuWin32]::SendInput(1, @($i), [Runtime.InteropServices.Marshal]::SizeOf($i)) | Out-Null; $m.dwFlags=[CuWin32]::${upFlag}; $i.mi=$m; [CuWin32]::SendInput(1, @($i), [Runtime.InteropServices.Marshal]::SizeOf($i)) | Out-Null`,
    )
  },

  /**
   * Type text into the bound target.
   *
   * @throws Error when nothing is bound, or when delivery to the bound window
   *   fails.
   */
  async typeText(text) {
    // COM-controlled apps: write directly via COM API
    if (boundAppType === 'word' && boundFilePath) {
      // eslint-disable-next-line @typescript-eslint/no-require-imports
      const { appendText } =
        require('../win32/comWord.js') as typeof import('../win32/comWord.js')
      appendText(boundFilePath, text)
      return
    }
    // HWND-bound apps: SendMessageW(WM_CHAR) or clipboard paste
    if (boundHwnd) {
      const ok = getWm().sendText(boundHwnd, text)
      if (!ok) {
        throw new Error(
          `typeText failed: SendMessage to HWND ${boundHwnd} returned false. ` +
            `The edit control may not have been found (findEditChild returned null).`,
        )
      }
      return
    }
    throw new Error(
      'typeText requires a bound window or file. Call open() first.',
    )
  },

  /**
   * Press or release a single key on the bound window. Names that cannot be
   * resolved to a virtual-key code are ignored.
   *
   * @throws Error when no window is bound.
   */
  async key(name, action) {
    if (!boundHwnd) {
      throw new Error('key requires a bound window HWND. Call open() first.')
    }
    let fallbackVk = 0
    if (name.length === 1) {
      const code = name.charCodeAt(0)
      // Virtual-key codes for letters are the UPPERCASE ASCII codes
      // (0x41-0x5A); a lowercase code such as 'a' (0x61) is VK_NUMPAD1.
      // NOTE(review): other single characters still use their raw char code,
      // which is only a valid VK for digits — punctuation is likely wrong.
      fallbackVk = /^[a-z]$/.test(name) ? code - 32 : code
    }
    const vk = VK_MAP[name.toLowerCase()] ?? fallbackVk
    if (vk) {
      getWm().sendKey(boundHwnd, vk, action === 'release' ? 'up' : 'down')
    }
  },

  /**
   * Send a key combination to the bound window.
   *
   * @throws Error when no window is bound or the combination fails.
   */
  async keys(parts) {
    if (!boundHwnd) {
      throw new Error('keys requires a bound window HWND. Call open() first.')
    }
    const ok = getWm().sendKeys(boundHwnd, parts)
    if (!ok) {
      throw new Error(`keys [${parts.join('+')}] failed on HWND ${boundHwnd}`)
    }
  },

  /**
   * Scroll by `amount` steps vertically or horizontally.
   *
   * NOTE(review): the two paths disagree on the vertical sign. In bound mode
   * a positive amount sends SB_LINEDOWN (scrolls down); in unbound mode a
   * positive wheel delta scrolls up. Confirm the intended convention.
   */
  async scroll(amount, direction) {
    if (boundHwnd) {
      // WM_VSCROLL / WM_HSCROLL for window-bound scrolling
      const msg = direction === 'vertical' ? '0x0115' : '0x0114' // WM_VSCROLL / WM_HSCROLL
      const wParam = amount > 0 ? '1' : '0' // SB_LINEDOWN=1 (positive=down) / SB_LINEUP=0 (negative=up)
      const n = Math.abs(Math.round(amount))
      let script = `
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class WScroll {
[DllImport("user32.dll", CharSet=CharSet.Unicode, EntryPoint="SendMessageW")]
public static extern IntPtr SendMessage(IntPtr h, uint m, IntPtr w, IntPtr l);
}
'@
`
      // One SendMessage per scroll line.
      for (let i = 0; i < n; i++) {
        script += `[WScroll]::SendMessage([IntPtr]::new([long]${boundHwnd}), ${msg}, [IntPtr]${wParam}, [IntPtr]::Zero) | Out-Null; `
      }
      ps(script)
      return
    }
    const flag =
      direction === 'vertical' ? 'MOUSEEVENTF_WHEEL' : 'MOUSEEVENTF_HWHEEL'
    // mouseData is an integer field; 120 is one wheel notch (WHEEL_DELTA).
    const delta = Math.round(amount * 120)
    // MOUSEINPUT is assigned as a whole for the same reason as in click().
    ps(
      `${WIN32_TYPES}; $m = New-Object CuWin32+MOUSEINPUT; $m.dwFlags=[CuWin32]::${flag}; $m.mouseData=${delta}; $i = New-Object CuWin32+INPUT; $i.type=[CuWin32]::INPUT_MOUSE; $i.mi=$m; [CuWin32]::SendInput(1, @($i), [Runtime.InteropServices.Marshal]::SizeOf($i)) | Out-Null`,
    )
  },

  /** Read the real cursor position (informational; never moves it). */
  async mouseLocation() {
    const out = ps(
      `${WIN32_TYPES}; $p = New-Object CuWin32+POINT; [CuWin32]::GetCursorPos([ref]$p) | Out-Null; "$($p.X),$($p.Y)"`,
    )
    const [xStr, yStr] = out.split(',')
    return { x: Number(xStr), y: Number(yStr) }
  },

  /** Send one character to an explicit HWND. */
  async sendChar(hwnd, char) {
    getWm().sendChar(String(hwnd), char)
  },

  /** Send one virtual-key event to an explicit HWND. */
  async sendKey(hwnd, vk, action) {
    getWm().sendKey(String(hwnd), vk, action)
  },

  /** Send a click to an explicit HWND. */
  async sendClick(hwnd, x, y, button) {
    getWm().sendClick(String(hwnd), x, y, button)
  },

  /** Send text to an explicit HWND. */
  async sendText(hwnd, text) {
    getWm().sendText(String(hwnd), text)
  },
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Screenshot — JPEG output only
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Screenshot backend. All captures go through the Python bridge; there is no
 * PowerShell fallback, so a missing or failing bridge surfaces as an Error.
 */
const screenshot: ScreenshotPlatform = {
  /**
   * Capture the bound window when one is bound, otherwise the given display
   * (display 0 when omitted).
   *
   * @throws Error when the bridge returns no image data.
   */
  async captureScreen(displayId) {
    if (boundHwnd) {
      const windowShot = this.captureWindow?.(String(boundHwnd))
      if (windowShot) return windowShot
    }

    // Python Bridge (mss + Pillow, ~300ms)
    const fullShot = bridgeCallSync<ScreenshotResult>('screenshot', {
      display_id: displayId ?? 0,
    })
    if (!fullShot || !fullShot.base64) {
      throw new Error(
        '[computer-use] Screenshot failed: Python bridge returned no data. ' +
          'Ensure python3 + mss + Pillow are installed (pip install mss Pillow).',
      )
    }
    return fullShot
  },

  /**
   * Capture a region. With a bound HWND the window itself is treated as the
   * region (matches macOS behavior).
   *
   * NOTE(review): x/y/w/h are not used — the unbound path returns a
   * full-screen capture rather than a crop.
   */
  async captureRegion(x, y, w, h) {
    if (!boundHwnd) return this.captureScreen()
    const windowShot = this.captureWindow?.(String(boundHwnd))
    return windowShot ? windowShot : this.captureScreen()
  },

  /**
   * Capture one window by HWND through the bridge's `screenshot_window`
   * method.
   *
   * @throws Error when the bridge returns no image data.
   */
  captureWindow(hwnd) {
    // Python Bridge (ctypes PrintWindow + GDI → Pillow JPEG, ~300ms)
    const shot = bridgeCallSync<ScreenshotResult>('screenshot_window', {
      hwnd: String(hwnd),
    })
    if (!shot || !shot.base64) {
      throw new Error(
        `[computer-use] Window screenshot failed for HWND ${hwnd}: Python bridge returned no data.`,
      )
    }
    return shot
  },
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Display — Screen.AllScreens
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Display enumeration backed by `System.Windows.Forms.Screen.AllScreens`.
 */
const display: DisplayPlatform = {
  /**
   * List all displays. The display id is the index in `AllScreens` order.
   * If enumeration throws, a single 1920x1080 display is reported.
   *
   * NOTE(review): scaleFactor is always reported as 1 — DPI scaling is not
   * queried here.
   */
  listAll(): DisplayInfo[] {
    try {
      const raw = ps(`
Add-Type -AssemblyName System.Windows.Forms
$result = @()
$idx = 0
foreach ($s in [System.Windows.Forms.Screen]::AllScreens) {
$result += "$($s.Bounds.Width),$($s.Bounds.Height),$idx,$($s.Primary)"
$idx++
}
$result -join "|"
`)
      const displays: DisplayInfo[] = []
      for (const entry of raw.split('|')) {
        if (!entry) continue
        // Entry format: width,height,index,isPrimary (isPrimary is unused).
        const fields = entry.split(',')
        displays.push({
          width: Number(fields[0]),
          height: Number(fields[1]),
          scaleFactor: 1,
          displayId: Number(fields[2]),
        })
      }
      return displays
    } catch {
      return [{ width: 1920, height: 1080, scaleFactor: 1, displayId: 0 }]
    }
  },

  /**
   * Look up one display by id. Falls back to the first display, and then to
   * a 1920x1080 default, when the id is omitted or unknown.
   */
  getSize(displayId): DisplayInfo {
    const all = this.listAll()
    const match =
      displayId !== undefined
        ? all.find(d => d.displayId === displayId)
        : undefined
    if (match) return match
    return all[0] ?? { width: 1920, height: 1080, scaleFactor: 1, displayId: 0 }
  },
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Find an existing window whose title contains the hint (avoid launching a new instance)
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Find an already-open window whose title contains `hint`
 * (case-insensitive). Only titles are matched.
 *
 * @param hint - Text to look for in window titles.
 * @returns The first matching window's HWND and PID, or `null` when nothing
 *   matches or the hint is blank.
 */
function findExistingWindow(
  hint: string,
): { hwnd: string; pid: number } | null {
  const needle = hint.trim().toLowerCase()
  // A blank hint is a substring of every title and would bind to an
  // arbitrary window.
  if (!needle) return null

  const match = listWindows().find(w =>
    (w.title ?? '').toLowerCase().includes(needle),
  )
  return match ? { hwnd: match.hwnd, pid: match.pid } : null
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Apps — EnumWindows + registry + AppxPackage
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
const apps: AppsPlatform = {
|
||||||
|
listRunning(): WindowHandle[] {
|
||||||
|
const windows = listWindows()
|
||||||
|
return windows.map(w => ({
|
||||||
|
id: String(w.hwnd),
|
||||||
|
pid: w.pid,
|
||||||
|
title: w.title,
|
||||||
|
}))
|
||||||
|
},
|
||||||
|
|
||||||
|
async listInstalled(): Promise<InstalledApp[]> {
|
||||||
|
try {
|
||||||
|
const raw = await psAsync(`
|
||||||
|
$apps = @()
|
||||||
|
|
||||||
|
# Traditional Win32 apps from registry
|
||||||
|
$paths = @(
|
||||||
|
'HKLM:\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*',
|
||||||
|
'HKLM:\\SOFTWARE\\WOW6432Node\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*',
|
||||||
|
'HKCU:\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*'
|
||||||
|
)
|
||||||
|
foreach ($p in $paths) {
|
||||||
|
Get-ItemProperty $p -ErrorAction SilentlyContinue | Where-Object { $_.DisplayName } | ForEach-Object {
|
||||||
|
$apps += "$($_.DisplayName)|$($_.InstallLocation)|$($_.PSChildName)"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# UWP/MSIX apps (Windows 10/11 Store apps)
|
||||||
|
Get-AppxPackage -ErrorAction SilentlyContinue | Where-Object { $_.IsFramework -eq $false -and $_.SignatureKind -eq 'Store' } | ForEach-Object {
|
||||||
|
$cleanName = $_.Name -replace '^Microsoft\\.Windows', '' -replace '^Microsoft\\.', ''
|
||||||
|
$apps += "$cleanName|$($_.InstallLocation)|$($_.PackageFamilyName)"
|
||||||
|
}
|
||||||
|
|
||||||
|
$apps | Select-Object -Unique | Select-Object -First 300
|
||||||
|
`)
|
||||||
|
return raw
|
||||||
|
.split('\n')
|
||||||
|
.filter(Boolean)
|
||||||
|
.map(line => {
|
||||||
|
const [name, path, id] = line.trim().split('|', 3)
|
||||||
|
return {
|
||||||
|
id: (id ?? name ?? '').trim(),
|
||||||
|
displayName: (name ?? '').trim(),
|
||||||
|
path: (path ?? '').trim(),
|
||||||
|
}
|
||||||
|
})
|
||||||
|
} catch {
|
||||||
|
return []
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
async open(name) {
|
||||||
|
// Detect app type and route to appropriate controller
|
||||||
|
const appType = detectAppType(name)
|
||||||
|
|
||||||
|
// Excel/Word → COM automation (no window, no HWND)
|
||||||
|
if (appType === 'excel' || appType === 'word') {
|
||||||
|
const result = await openWithController(name)
|
||||||
|
if (result.filePath) {
|
||||||
|
bindFile(result.filePath, result.type)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Text/Browser/Generic → exe launch + HWND bind (offscreen)
|
||||||
|
// If name is a UWP PackageFamilyName (e.g. Microsoft.WindowsNotepad_8wekyb3d8bbwe),
|
||||||
|
// extract the app name and try as exe. This avoids launching through UWP shell.
|
||||||
|
let launchName = name
|
||||||
|
if (name.includes('_') && name.includes('.')) {
|
||||||
|
// Microsoft.WindowsNotepad_xxx → Notepad
|
||||||
|
// Microsoft.WindowsCalculator_xxx → Calculator
|
||||||
|
// Microsoft.WindowsTerminal_xxx → Terminal
|
||||||
|
const parts = name.split('_')[0]?.split('.') ?? []
|
||||||
|
const appPart = parts[parts.length - 1] ?? name
|
||||||
|
// Strip "Windows" prefix: WindowsNotepad → Notepad
|
||||||
|
launchName = appPart.replace(/^Windows/, '') || appPart
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Try to find an EXISTING window first (by process name or title) ---
|
||||||
|
// If found, auto-bind to it. Use bind_window tool to switch later.
|
||||||
|
const existingHwnd = findExistingWindow(launchName)
|
||||||
|
if (existingHwnd) {
|
||||||
|
bindWindow(existingHwnd.hwnd, existingHwnd.pid)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
const escaped = launchName.replace(/'/g, "''")
|
||||||
|
const result = await psAsync(`
|
||||||
|
${WIN32_TYPES}
|
||||||
|
Add-Type @'
|
||||||
|
using System;
|
||||||
|
using System.Runtime.InteropServices;
|
||||||
|
using System.Text;
|
||||||
|
public class CuLaunch {
|
||||||
|
public delegate bool EnumProc(IntPtr h, IntPtr lp);
|
||||||
|
[DllImport("user32.dll")] public static extern bool EnumWindows(EnumProc cb, IntPtr lp);
|
||||||
|
[DllImport("user32.dll")] public static extern bool IsWindowVisible(IntPtr h);
|
||||||
|
[DllImport("user32.dll")] public static extern uint GetWindowThreadProcessId(IntPtr h, out uint pid);
|
||||||
|
[DllImport("user32.dll", CharSet=CharSet.Unicode)] public static extern int GetWindowText(IntPtr h, StringBuilder sb, int n);
|
||||||
|
[DllImport("user32.dll")] public static extern bool ShowWindow(IntPtr h, int cmd);
|
||||||
|
public const int SW_SHOWMINNOACTIVE = 7;
|
||||||
|
// Get all visible window HWNDs as array
|
||||||
|
public static long[] GetAllVisibleHwnds() {
|
||||||
|
var list = new System.Collections.Generic.List<long>();
|
||||||
|
EnumWindows((h, _) => {
|
||||||
|
if (IsWindowVisible(h)) list.Add(h.ToInt64());
|
||||||
|
return true;
|
||||||
|
}, IntPtr.Zero);
|
||||||
|
return list.ToArray();
|
||||||
|
}
|
||||||
|
// Get PID for a single HWND
|
||||||
|
public static uint GetPidForHwnd(long hwnd) {
|
||||||
|
uint pid; GetWindowThreadProcessId((IntPtr)hwnd, out pid);
|
||||||
|
return pid;
|
||||||
|
}
|
||||||
|
// Get title for a single HWND
|
||||||
|
public static string GetTitle(long hwnd) {
|
||||||
|
var sb = new StringBuilder(256);
|
||||||
|
GetWindowText((IntPtr)hwnd, sb, 256);
|
||||||
|
return sb.ToString();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
'@
|
||||||
|
# Launch strategy: all exe-based, no GUI dialogs.
|
||||||
|
# 1) exact path 2) exe in PATH 3) registry install dir 4) raw name
|
||||||
|
$target = '${escaped}'
|
||||||
|
$proc = $null
|
||||||
|
|
||||||
|
# 1. Exact file path
|
||||||
|
if (Test-Path $target) {
|
||||||
|
$proc = Start-Process $target -PassThru -ErrorAction SilentlyContinue
|
||||||
|
}
|
||||||
|
|
||||||
|
# 2. exe name in PATH (notepad.exe, code.exe, chrome.exe, etc.)
|
||||||
|
if (-not $proc) {
|
||||||
|
# Try with .exe suffix if not already
|
||||||
|
$tryExe = if ($target -notmatch '[.]exe$') { "$target.exe" } else { $target }
|
||||||
|
$found = Get-Command $tryExe -ErrorAction SilentlyContinue | Select-Object -First 1
|
||||||
|
if (-not $found) { $found = Get-Command $target -ErrorAction SilentlyContinue | Select-Object -First 1 }
|
||||||
|
if ($found) { $proc = Start-Process $found.Source -PassThru -ErrorAction SilentlyContinue }
|
||||||
|
}
|
||||||
|
|
||||||
|
# 3. Search registry for install location by display name → find .exe
|
||||||
|
if (-not $proc) {
|
||||||
|
$regPaths = @('HKLM:\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*','HKLM:\\SOFTWARE\\WOW6432Node\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*','HKCU:\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*')
|
||||||
|
foreach ($p in $regPaths) {
|
||||||
|
$app = Get-ItemProperty $p -ErrorAction SilentlyContinue | Where-Object {
|
||||||
|
$_.DisplayName -and $_.DisplayName -match [regex]::Escape($target)
|
||||||
|
} | Select-Object -First 1
|
||||||
|
if ($app) {
|
||||||
|
# Try DisplayIcon (often the exe path), then InstallLocation
|
||||||
|
$exePath = $null
|
||||||
|
if ($app.DisplayIcon -and $app.DisplayIcon -match '[.]exe') {
|
||||||
|
$exePath = ($app.DisplayIcon -split ',')[0].Trim('"')
|
||||||
|
}
|
||||||
|
if (-not $exePath -and $app.InstallLocation) {
|
||||||
|
$exeFile = Get-ChildItem $app.InstallLocation -Filter '*.exe' -ErrorAction SilentlyContinue | Select-Object -First 1
|
||||||
|
if ($exeFile) { $exePath = $exeFile.FullName }
|
||||||
|
}
|
||||||
|
if ($exePath -and (Test-Path $exePath)) {
|
||||||
|
$proc = Start-Process $exePath -PassThru -ErrorAction SilentlyContinue
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# 4. Last resort: direct Start-Process (Windows may resolve it)
|
||||||
|
if (-not $proc) { $proc = Start-Process -FilePath $target -PassThru -ErrorAction SilentlyContinue }
|
||||||
|
|
||||||
|
if (-not $proc) { Write-Host "LAUNCH_FAILED"; exit }
|
||||||
|
|
||||||
|
# Snapshot ALL visible window HWNDs before the new window appears
|
||||||
|
$beforeHwnds = [CuLaunch]::GetAllVisibleHwnds()
|
||||||
|
|
||||||
|
# Wait for a NEW window from our process PID
|
||||||
|
$hwnd = 0
|
||||||
|
for ($i = 0; $i -lt 50; $i++) {
|
||||||
|
Start-Sleep -Milliseconds 200
|
||||||
|
$afterHwnds = [CuLaunch]::GetAllVisibleHwnds()
|
||||||
|
# Find new windows (in after but not in before)
|
||||||
|
foreach ($h in $afterHwnds) {
|
||||||
|
if ($beforeHwnds -contains $h) { continue }
|
||||||
|
# New window found — check PID
|
||||||
|
$wPid = [CuLaunch]::GetPidForHwnd($h)
|
||||||
|
if ($wPid -eq [uint32]$proc.Id) {
|
||||||
|
$hwnd = $h; break # exact PID match
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ($hwnd -ne 0) { break }
|
||||||
|
# PID didn't match (process redirect) — accept new window matching title hint
|
||||||
|
if ($i -gt 10) {
|
||||||
|
$hint = '${escaped}'.Split('\\')[-1].Replace('.exe','')
|
||||||
|
foreach ($h in $afterHwnds) {
|
||||||
|
if ($beforeHwnds -contains $h) { continue }
|
||||||
|
$title = [CuLaunch]::GetTitle($h)
|
||||||
|
if ($title -and $title.IndexOf($hint, [StringComparison]::OrdinalIgnoreCase) -ge 0) {
|
||||||
|
$hwnd = $h; break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ($hwnd -ne 0) { break }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ($hwnd -eq 0) { Write-Host "HWND_NOT_FOUND|$($proc.Id)"; exit }
|
||||||
|
# Move offscreen instead of minimizing — keeps window restored so
|
||||||
|
# PrintWindow and SendMessage work without needing restore/re-minimize.
|
||||||
|
# User cannot see the window at -32000,-32000.
|
||||||
|
Add-Type @'
|
||||||
|
using System;
|
||||||
|
using System.Runtime.InteropServices;
|
||||||
|
public class CuPos {
|
||||||
|
[DllImport("user32.dll")] public static extern bool SetWindowPos(IntPtr h, IntPtr a, int x, int y, int w, int h2, uint f);
|
||||||
|
public const uint SWP_NOSIZE = 0x0001;
|
||||||
|
public const uint SWP_NOZORDER = 0x0004;
|
||||||
|
public const uint SWP_NOACTIVATE = 0x0010;
|
||||||
|
}
|
||||||
|
'@
|
||||||
|
[CuPos]::SetWindowPos([IntPtr]::new([long]$hwnd), [IntPtr]::Zero, -32000, -32000, 0, 0, [CuPos]::SWP_NOSIZE -bor [CuPos]::SWP_NOZORDER -bor [CuPos]::SWP_NOACTIVATE) | Out-Null
|
||||||
|
Write-Host "$hwnd|$($proc.Id)"
|
||||||
|
`)
|
||||||
|
if (!result) {
|
||||||
|
throw new Error(
|
||||||
|
`open(): failed to launch '${name}' — no output from launcher script`,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
if (result.startsWith('LAUNCH_FAILED')) {
|
||||||
|
throw new Error(
|
||||||
|
`open(): failed to launch '${name}' — process did not start (${result})`,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
if (result.startsWith('HWND_NOT_FOUND')) {
|
||||||
|
throw new Error(
|
||||||
|
`open(): launched '${name}' but could not find its window HWND (${result})`,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
const parts = result.trim().split('|')
|
||||||
|
const hwnd = parts[0]!.trim()
|
||||||
|
const pid = Number(parts[1])
|
||||||
|
if (hwnd && hwnd !== '0') {
|
||||||
|
// Bind to the launched window — all subsequent operations target this HWND
|
||||||
|
bindWindow(hwnd, pid)
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
/**
 * Identify the application that owns the current foreground window.
 *
 * Runs a PowerShell snippet that maps the foreground HWND to its owning
 * process and prints `<executable path>|<process name>`.
 *
 * @returns `id` (executable path) and `appName` (process name), or `null`
 *   when the script throws, prints nothing usable, or the owning process
 *   could not be resolved. When the executable path is unavailable but the
 *   process name is known, `id` falls back to the process name so it is
 *   never an empty string.
 */
getFrontmostApp(): FrontmostAppInfo | null {
  try {
    const out = ps(`${WIN32_TYPES}
$hwnd = [CuWin32]::GetForegroundWindow()
$procId = [uint32]0
[CuWin32]::GetWindowThreadProcessId($hwnd, [ref]$procId) | Out-Null
$proc = Get-Process -Id $procId -ErrorAction SilentlyContinue
"$($proc.MainModule.FileName)|$($proc.ProcessName)"`)
    if (!out || !out.includes('|')) return null

    const [exePath = '', appName = ''] = out
      .split('|', 2)
      .map(part => part.trim())

    // A bare "|" means Get-Process found nothing: there is no app to report.
    if (!appName) return null

    return { id: exePath || appName, appName }
  } catch {
    // Best-effort probe: any PowerShell failure is reported as "unknown".
    return null
  }
},
|
||||||
|
|
||||||
|
/**
 * Look up the first enumerated window whose title contains `title`
 * (case-sensitive substring match).
 *
 * @returns A handle built from the window's hwnd, pid and title, or `null`
 *   when no window matches.
 */
findWindowByTitle(title): WindowHandle | null {
  const match = listWindows().find(candidate => candidate.title.includes(title))
  return match
    ? { id: String(match.hwnd), pid: match.pid, title: match.title }
    : null
},
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Window Management — Win32 API calls targeted at bound HWND.
|
||||||
|
// NO global shortcuts (Win+Down, Alt+F4, etc.)
|
||||||
|
// Uses ShowWindow, SetWindowPos, SendMessage(WM_CLOSE) directly.
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * PowerShell prelude that compiles the C# helper class `CuWinMgmt` via
 * `Add-Type`. The class exposes user32.dll P/Invoke entry points
 * (ShowWindow, SetWindowPos, GetWindowRect, SetForegroundWindow,
 * BringWindowToTop, SendMessageW, IsIconic, IsZoomed), the RECT struct and
 * the numeric constants used with them.
 *
 * It is interpolated at the start of every window-management script in this
 * file. The C# source sits in a PowerShell single-quoted here-string, so the
 * closing `'@` must stay at the very start of its line.
 */
const WINDOW_MGMT_TYPES = `
Add-Type @'
using System;
using System.Runtime.InteropServices;

public class CuWinMgmt {
[DllImport("user32.dll")]
public static extern bool ShowWindow(IntPtr hWnd, int nCmdShow);

[DllImport("user32.dll")]
public static extern bool SetWindowPos(IntPtr hWnd, IntPtr hAfter, int X, int Y, int cx, int cy, uint uFlags);

[DllImport("user32.dll")]
public static extern bool GetWindowRect(IntPtr hWnd, out RECT lpRect);

[DllImport("user32.dll")]
public static extern bool SetForegroundWindow(IntPtr hWnd);

[DllImport("user32.dll")]
public static extern bool BringWindowToTop(IntPtr hWnd);

[DllImport("user32.dll", CharSet=CharSet.Unicode, EntryPoint="SendMessageW")]
public static extern IntPtr SendMessage(IntPtr hWnd, uint Msg, IntPtr wParam, IntPtr lParam);

[DllImport("user32.dll")]
public static extern bool IsIconic(IntPtr hWnd);

[DllImport("user32.dll")]
public static extern bool IsZoomed(IntPtr hWnd);

[StructLayout(LayoutKind.Sequential)]
public struct RECT {
public int Left; public int Top; public int Right; public int Bottom;
}

// ShowWindow constants
public const int SW_MINIMIZE = 6;
public const int SW_MAXIMIZE = 3;
public const int SW_RESTORE = 9;
public const int SW_SHOWNOACTIVATE = 4;
public const int SW_SHOWMINNOACTIVE = 7;

// SetWindowPos flags
public const uint SWP_NOSIZE = 0x0001;
public const uint SWP_NOMOVE = 0x0002;
public const uint SWP_NOZORDER = 0x0004;
public const uint SWP_NOACTIVATE = 0x0010;
public const uint SWP_SHOWWINDOW = 0x0040;

// WM_CLOSE
public const uint WM_CLOSE = 0x0010;
// WM_SYSCOMMAND
public const uint WM_SYSCOMMAND = 0x0112;
public const int SC_MINIMIZE = 0xF020;
public const int SC_MAXIMIZE = 0xF030;
public const int SC_RESTORE = 0xF120;
public const int SC_CLOSE = 0xF060;
}
'@
`
|
||||||
|
|
||||||
|
import type { WindowManagementPlatform, WindowAction } from './types.js'
|
||||||
|
|
||||||
|
/**
 * Window-management operations for the currently bound window.
 *
 * Every operation targets `boundHwnd` through user32 calls executed in
 * PowerShell (prefixed with WINDOW_MGMT_TYPES); no global keyboard shortcuts
 * are sent. All methods report failure (`false` / `null`) when no window is
 * bound.
 */
const windowManagement: WindowManagementPlatform = {
  /**
   * Apply `action` to the bound window.
   *
   * @param action - Operation to perform.
   * @param opts - Position/size, only read by `move_resize`.
   * @returns
   *   - minimize / maximize / restore: `true` when PowerShell printed the
   *     ShowWindow result (i.e. the script produced output).
   *   - close / focus / move_offscreen / get_rect: always `true`.
   *   - move_resize: result of `moveResize`, or `false` when `x`/`y` are
   *     missing (nothing was moved).
   *   - unknown action or no bound window: `false`.
   */
  manageWindow(action: WindowAction, opts?): boolean {
    if (!boundHwnd) return false
    const hwnd = boundHwnd
    const handle = `[IntPtr]::new([long]${hwnd})`

    switch (action) {
      case 'minimize': {
        // SW_SHOWMINNOACTIVE minimizes this HWND without activating another
        // window — targeted, not a global shortcut.
        const r = ps(
          `${WINDOW_MGMT_TYPES}; [CuWinMgmt]::ShowWindow(${handle}, [CuWinMgmt]::SW_SHOWMINNOACTIVE)`,
        )
        return r !== ''
      }
      case 'maximize': {
        const r = ps(
          `${WINDOW_MGMT_TYPES}; [CuWinMgmt]::ShowWindow(${handle}, [CuWinMgmt]::SW_MAXIMIZE)`,
        )
        return r !== ''
      }
      case 'restore': {
        const r = ps(
          `${WINDOW_MGMT_TYPES}; [CuWinMgmt]::ShowWindow(${handle}, [CuWinMgmt]::SW_RESTORE)`,
        )
        return r !== ''
      }
      case 'close': {
        // Remove the border overlay first, then ask the window to close
        // gracefully via WM_CLOSE, then drop the binding.
        unmarkBound(hwnd)
        ps(
          `${WINDOW_MGMT_TYPES}; [CuWinMgmt]::SendMessage(${handle}, [CuWinMgmt]::WM_CLOSE, [IntPtr]::Zero, [IntPtr]::Zero)`,
        )
        unbindWindow()
        return true
      }
      case 'focus': {
        // Restore if minimized, then bring to front
        ps(`${WINDOW_MGMT_TYPES}
$h = ${handle}
if ([CuWinMgmt]::IsIconic($h)) {
[CuWinMgmt]::ShowWindow($h, [CuWinMgmt]::SW_RESTORE) | Out-Null
}
[CuWinMgmt]::SetForegroundWindow($h) | Out-Null
[CuWinMgmt]::BringWindowToTop($h) | Out-Null
`)
        return true
      }
      case 'move_offscreen': {
        // Move to -32000,-32000 — keeps window in restored state for SendMessage/PrintWindow
        ps(
          `${WINDOW_MGMT_TYPES}; [CuWinMgmt]::SetWindowPos(${handle}, [IntPtr]::Zero, -32000, -32000, 0, 0, [CuWinMgmt]::SWP_NOSIZE -bor [CuWinMgmt]::SWP_NOZORDER -bor [CuWinMgmt]::SWP_NOACTIVATE)`,
        )
        return true
      }
      case 'move_resize': {
        // Without a target position there is nothing to do; report that
        // instead of claiming success.
        if (opts?.x === undefined || opts?.y === undefined) return false
        // Call through the object rather than `this` so the method keeps
        // working when invoked detached from `windowManagement`.
        return windowManagement.moveResize(
          opts.x,
          opts.y,
          opts.width,
          opts.height,
        )
      }
      case 'get_rect': {
        // get_rect is handled separately by getWindowRect(), not through manageWindow
        // Return true to indicate the action is recognized
        return true
      }
      default:
        return false
    }
  },

  /**
   * Move the bound window to (`x`, `y`); also resize it when BOTH `width`
   * and `height` are given (otherwise the current size is kept).
   *
   * The numbers are interpolated into PowerShell source, so they are
   * validated as finite and rounded to integers first.
   *
   * @returns `false` when no window is bound or a coordinate/size is not a
   *   finite number; `true` once the SetWindowPos script has been run.
   */
  moveResize(x: number, y: number, width?: number, height?: number): boolean {
    if (!boundHwnd) return false
    const hwnd = boundHwnd

    if (!Number.isFinite(x) || !Number.isFinite(y)) return false
    const left = Math.round(x)
    const top = Math.round(y)

    if (width !== undefined && height !== undefined) {
      if (!Number.isFinite(width) || !Number.isFinite(height)) return false
      const cx = Math.round(width)
      const cy = Math.round(height)
      ps(
        `${WINDOW_MGMT_TYPES}; [CuWinMgmt]::SetWindowPos([IntPtr]::new([long]${hwnd}), [IntPtr]::Zero, ${left}, ${top}, ${cx}, ${cy}, [CuWinMgmt]::SWP_NOZORDER -bor [CuWinMgmt]::SWP_NOACTIVATE)`,
      )
    } else {
      ps(
        `${WINDOW_MGMT_TYPES}; [CuWinMgmt]::SetWindowPos([IntPtr]::new([long]${hwnd}), [IntPtr]::Zero, ${left}, ${top}, 0, 0, [CuWinMgmt]::SWP_NOSIZE -bor [CuWinMgmt]::SWP_NOZORDER -bor [CuWinMgmt]::SWP_NOACTIVATE)`,
      )
    }
    return true
  },

  /**
   * Read the bound window's screen rectangle via GetWindowRect.
   *
   * @returns Position and size in screen coordinates, or `null` when no
   *   window is bound, the call fails, or the script output is not four
   *   comma-separated numbers.
   */
  getWindowRect(): {
    x: number
    y: number
    width: number
    height: number
  } | null {
    if (!boundHwnd) return null
    const out = ps(`${WINDOW_MGMT_TYPES}
$rect = New-Object CuWinMgmt+RECT
if ([CuWinMgmt]::GetWindowRect([IntPtr]::new([long]${boundHwnd}), [ref]$rect)) {
"$($rect.Left),$($rect.Top),$($rect.Right),$($rect.Bottom)"
} else { "FAIL" }
`)
    if (!out) return null
    const text = out.trim()
    if (text === '' || text === 'FAIL') return null

    const edges = text.split(',').map(Number)
    if (edges.length !== 4 || !edges.every(n => Number.isFinite(n))) {
      return null
    }
    const [l, t, r, b] = edges as [number, number, number, number]
    return { x: l, y: t, width: r - l, height: b - t }
  },
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Export
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Clean up all overlays on process exit
|
||||||
|
/**
 * Tear down everything this module may have left on screen or running:
 * window borders, the virtual cursor, the indicator, and the Python bridge
 * subprocess (if one was started).
 *
 * Each step is best-effort and isolated, so a failure in one does not skip
 * the others — in particular the bridge is still stopped when an overlay
 * fails to close.
 */
function cleanupAll() {
  const steps: Array<() => void> = [
    () => cleanupAllBorders(),
    () => hideVirtualCursor(),
    () => hideIndicator(),
    // Stop the Python bridge subprocess if it was started
    () => {
      getBridge()?.stopBridge()
    },
  ]
  for (const step of steps) {
    try {
      step()
    } catch {
      // Shutdown path: nothing useful can be done with the error here.
    }
  }
}
|
||||||
|
// Tear down overlays on normal exit, and on SIGINT/SIGTERM clean up
// explicitly before terminating the process.
process.on('exit', cleanupAll)
for (const signal of ['SIGINT', 'SIGTERM'] as const) {
  process.on(signal, () => {
    cleanupAll()
    process.exit()
  })
}
|
||||||
|
|
||||||
|
/** Win32 platform bundle: input, screenshot, display, apps and window management. */
export const platform: Platform = { input, screenshot, display, apps, windowManagement }
|
||||||
@@ -3,21 +3,16 @@ import type { ComputerUseAPI } from '@ant/computer-use-swift'
|
|||||||
let cached: ComputerUseAPI | undefined
|
let cached: ComputerUseAPI | undefined
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Package's js/index.js reads COMPUTER_USE_SWIFT_NODE_PATH (baked by
|
* macOS-only loader for @ant/computer-use-swift.
|
||||||
* build-with-plugins.ts on darwin targets, unset otherwise — falls through to
|
* Non-darwin platforms should use src/utils/computerUse/platforms/ instead.
|
||||||
* the node_modules prebuilds/ path). We cache the loaded native module.
|
|
||||||
*
|
|
||||||
* The four @MainActor methods (captureExcluding, captureRegion,
|
|
||||||
* apps.listInstalled, resolvePrepareCapture) dispatch to DispatchQueue.main
|
|
||||||
* and will hang under libuv unless CFRunLoop is pumped — call sites wrap
|
|
||||||
* these in drainRunLoop().
|
|
||||||
*/
|
*/
|
||||||
export function requireComputerUseSwift(): ComputerUseAPI {
|
export function requireComputerUseSwift(): ComputerUseAPI {
|
||||||
|
if (process.platform !== 'darwin') {
|
||||||
|
throw new Error('@ant/computer-use-swift is macOS-only. Use platforms/ for cross-platform.')
|
||||||
|
}
|
||||||
if (cached) return cached
|
if (cached) return cached
|
||||||
// eslint-disable-next-line @typescript-eslint/no-require-imports
|
// eslint-disable-next-line @typescript-eslint/no-require-imports
|
||||||
const mod = require('@ant/computer-use-swift')
|
const mod = require('@ant/computer-use-swift')
|
||||||
// macOS native .node exports a plain object with apps/display/screenshot directly.
|
|
||||||
// Our cross-platform package exports { ComputerUseAPI } class — needs instantiation.
|
|
||||||
if (mod.ComputerUseAPI && typeof mod.ComputerUseAPI === 'function') {
|
if (mod.ComputerUseAPI && typeof mod.ComputerUseAPI === 'function') {
|
||||||
cached = new mod.ComputerUseAPI() as ComputerUseAPI
|
cached = new mod.ComputerUseAPI() as ComputerUseAPI
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
225
src/utils/computerUse/win32/accessibilitySnapshot.ts
Normal file
225
src/utils/computerUse/win32/accessibilitySnapshot.ts
Normal file
@@ -0,0 +1,225 @@
|
|||||||
|
/**
|
||||||
|
* Accessibility Snapshot — captures the UI Automation tree of a window
|
||||||
|
* and formats it as compact, model-friendly text.
|
||||||
|
*
|
||||||
|
* Sent alongside screenshots so the model has BOTH visual + structural
|
||||||
|
* understanding of the GUI. This enables:
|
||||||
|
* - Knowing exact element names, types, and positions
|
||||||
|
* - Using click_element/type_into_element by name instead of pixel coords
|
||||||
|
* - Understanding disabled/enabled state, current values
|
||||||
|
*
|
||||||
|
* Only includes interactive elements (buttons, edits, menus, links, etc.)
|
||||||
|
* to keep token count low (~200-500 tokens for typical windows).
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { validateHwnd, ps } from './shared.js'
|
||||||
|
|
||||||
|
/** One element of a captured UI Automation tree. */
export interface AccessibilityNode {
  /** Control type, e.g. Button, Edit, MenuItem, Link, Text, CheckBox. */
  role: string
  /** Visible text / accessible name. */
  name: string
  /** UI Automation AutomationId of the element. */
  automationId: string
  /** Bounding rectangle: origin (`x`, `y`) and size (`w`, `h`). */
  bounds: { x: number; y: number; w: number; h: number }
  /** Whether the element is enabled. */
  enabled: boolean
  /** Current text value (for Edit/ComboBox). */
  value?: string
  /** Child elements, when any were captured. */
  children?: AccessibilityNode[]
}
|
||||||
|
|
||||||
|
/** Result of capturing a window's accessibility tree. */
export interface AccessibilitySnapshot {
  /** Compact text rendering of the tree, intended for the model. */
  text: string
  /** Structured tree, used for element-targeted actions. */
  nodes: AccessibilityNode[]
  /** Capture time (`Date.now()` value). */
  timestamp: number
}
|
||||||
|
|
||||||
|
/**
 * Snapshot the UI Automation tree under a window.
 *
 * A PowerShell script walks the tree (children only, level by level) and
 * emits JSON. Elements with an empty bounding rectangle or an X coordinate
 * below -10000 are skipped. A node is kept when it passes the interactive
 * filter or when any of its descendants was kept.
 *
 * @param hwnd - Window handle as string (validated with `validateHwnd`)
 * @param maxDepth - Maximum tree depth (default 4)
 * @param interactiveOnly - Only include interactive elements (default true)
 * @returns The snapshot, or `null` when the script yields no nodes, fails,
 *   or prints something that is not valid JSON.
 */
export function captureAccessibilitySnapshot(
  hwnd: string,
  maxDepth: number = 4,
  interactiveOnly: boolean = true,
): AccessibilitySnapshot | null {
  hwnd = validateHwnd(hwnd)

  // PowerShell definition of Is-Interactive: either a whitelist check or a
  // pass-through that accepts every control type.
  let filterClause: string
  if (interactiveOnly) {
    filterClause = `
# Interactive control types only
$interactiveTypes = @(
  'Button','Edit','ComboBox','CheckBox','RadioButton',
  'MenuItem','Menu','MenuBar','Link','Slider','Spinner',
  'Tab','TabItem','List','ListItem','Tree','TreeItem',
  'DataGrid','DataItem','Document','ScrollBar','ToolBar',
  'SplitButton','ToggleButton','Hyperlink'
)
function Is-Interactive($ct) {
  $typeName = $ct -replace 'ControlType\\.', ''
  return $interactiveTypes -contains $typeName
}`
  } else {
    filterClause = `
function Is-Interactive($ct) { return $true }`
  }

  const script = `
Add-Type -AssemblyName UIAutomationClient
Add-Type -AssemblyName UIAutomationTypes
Add-Type -AssemblyName WindowsBase
${filterClause}

function Get-Tree($el, $depth, $maxDepth) {
  if ($depth -ge $maxDepth) { return @() }
  $result = @()
  $children = $el.FindAll(
    [System.Windows.Automation.TreeScope]::Children,
    [System.Windows.Automation.Condition]::TrueCondition)
  foreach ($child in $children) {
    $ct = $child.Current.ControlType.ProgrammaticName
    $typeName = $ct -replace 'ControlType\\.', ''
    $name = [string]$child.Current.Name
    $autoId = [string]$child.Current.AutomationId
    $rect = $child.Current.BoundingRectangle
    $enabled = $child.Current.IsEnabled

    # Skip invisible/offscreen elements
    if ($rect.Width -le 0 -or $rect.Height -le 0) { continue }
    if ($rect.X -lt -10000) { continue }

    $val = $null
    try {
      $vp = $child.GetCurrentPattern([System.Windows.Automation.ValuePattern]::Pattern)
      if ($vp -ne $null) { $val = $vp.Current.Value }
    } catch {}

    $isInteractive = Is-Interactive $ct
    $sub = Get-Tree $child ($depth + 1) $maxDepth

    if ($isInteractive -or $sub.Count -gt 0) {
      $node = @{
        role = $typeName
        name = $name
        id = $autoId
        x = [int]$rect.X; y = [int]$rect.Y
        w = [int]$rect.Width; h = [int]$rect.Height
        on = $enabled
      }
      if ($val -ne $null -and $val -ne '') { $node['v'] = $val }
      if ($sub.Count -gt 0) { $node['c'] = $sub }
      $result += $node
    }
  }
  return $result
}

try {
  $root = [System.Windows.Automation.AutomationElement]::FromHandle([IntPtr]::new([long]${hwnd}))
  if ($root -eq $null) { Write-Output '[]'; exit }
  $tree = Get-Tree $root 0 ${maxDepth}
  if ($tree -eq $null -or $tree.Count -eq 0) {
    Write-Output '[]'
  } else {
    $tree | ConvertTo-Json -Depth 20 -Compress
  }
} catch {
  Write-Output '[]'
}
`

  try {
    const output = ps(script)
    const isEmpty = !output || output === '[]'
    if (isEmpty) return null

    // ConvertTo-Json prints a bare object (not an array) for a single node.
    const decoded = JSON.parse(output)
    let nodes: AccessibilityNode[]
    if (Array.isArray(decoded)) {
      nodes = decoded.map(parseNode)
    } else {
      nodes = [parseNode(decoded)]
    }

    return { text: formatForModel(nodes), nodes, timestamp: Date.now() }
  } catch {
    return null
  }
}
|
||||||
|
|
||||||
|
/**
 * Convert one raw node of the snapshot script's JSON (short keys `role`,
 * `name`, `id`, `x`, `y`, `w`, `h`, `on`, `v`, `c`) into an AccessibilityNode.
 * Missing strings become '', missing numbers become 0, and a node counts as
 * enabled unless `on` is exactly `false`. A single child object (not wrapped
 * in an array) is accepted as well.
 */
function parseNode(raw: any): AccessibilityNode {
  const { role, name, id, x, y, w, h, on, v, c } = raw

  let children: AccessibilityNode[] | undefined
  if (c) {
    children = Array.isArray(c) ? c.map(parseNode) : [parseNode(c)]
  }

  return {
    role: role || '',
    name: name || '',
    automationId: id || '',
    bounds: { x: x || 0, y: y || 0, w: w || 0, h: h || 0 },
    enabled: on !== false,
    value: v,
    children,
  }
}
|
||||||
|
|
||||||
|
/**
 * Format the accessibility tree as compact text for the model.
 *
 * One line per node: `[role] "name" (x,y WxH)`, followed by ` DISABLED` for
 * disabled elements (enabled ones carry no marker), ` value="…"` and
 * ` id=<automationId>` when present. Children follow their parent, indented
 * one level deeper. Names are cut to 40 characters, values to 30.
 *
 * Example output:
 * [Button] "Save" (120,50 80x30)
 * [Edit] (200,80 400x25) DISABLED value="hello world" id=textBox1
 * [MenuItem] "File" (10,0 40x25)
 *
 * @param nodes - Nodes to render.
 * @param indent - Current nesting level (number of pad characters).
 * @returns The lines joined with `\n`; '' for an empty list.
 */
function formatForModel(
  nodes: AccessibilityNode[],
  indent: number = 0,
): string {
  const lines: string[] = []
  const pad = ' '.repeat(indent)

  for (const node of nodes) {
    let line = `${pad}[${node.role}]`
    if (node.name) line += ` "${truncate(node.name, 40)}"`
    line += ` (${node.bounds.x},${node.bounds.y} ${node.bounds.w}x${node.bounds.h})`
    if (!node.enabled) line += ' DISABLED'
    if (node.value) line += ` value="${truncate(node.value, 30)}"`
    if (node.automationId) line += ` id=${node.automationId}`
    lines.push(line)

    // An empty children array would otherwise contribute an empty string,
    // i.e. a stray blank line in the output.
    if (node.children && node.children.length > 0) {
      lines.push(formatForModel(node.children, indent + 1))
    }
  }

  return lines.join('\n')
}

/**
 * Shorten `s` to at most `max` UTF-16 code units, ending with '…' when
 * anything was cut. The cut never splits a surrogate pair, so the result
 * contains no lone surrogate.
 */
function truncate(s: string, max: number): string {
  if (s.length <= max) return s
  let end = Math.max(0, max - 1)
  if (end > 0) {
    const last = s.charCodeAt(end - 1)
    // High surrogate at the cut point: drop it together with its partner.
    if (last >= 0xd800 && last <= 0xdbff) end -= 1
  }
  return s.slice(0, end) + '…'
}
|
||||||
|
|
||||||
|
/**
 * Depth-first search of a snapshot tree for the first node satisfying every
 * criterion present in `query`:
 *  - `name`: case-insensitive substring of the node name
 *  - `role`: case-insensitive exact match
 *  - `automationId`: exact, case-sensitive match
 *
 * A parent is tested before its children. A query with no criteria never
 * matches.
 *
 * @returns The first matching node, or `null`.
 */
export function findNodeInSnapshot(
  nodes: AccessibilityNode[],
  query: { name?: string; role?: string; automationId?: string },
): AccessibilityNode | null {
  const hasCriteria = Boolean(query.name || query.role || query.automationId)

  for (const candidate of nodes) {
    const nameOk =
      !query.name ||
      candidate.name.toLowerCase().includes(query.name.toLowerCase())
    const roleOk =
      !query.role || candidate.role.toLowerCase() === query.role.toLowerCase()
    const idOk =
      !query.automationId || candidate.automationId === query.automationId

    if (hasCriteria && nameOk && roleOk && idOk) return candidate

    if (candidate.children) {
      const nested = findNodeInSnapshot(candidate.children, query)
      if (nested) return nested
    }
  }
  return null
}
|
||||||
129
src/utils/computerUse/win32/appDispatcher.ts
Normal file
129
src/utils/computerUse/win32/appDispatcher.ts
Normal file
@@ -0,0 +1,129 @@
|
|||||||
|
/**
|
||||||
|
* Application type dispatcher for Windows Computer Use.
|
||||||
|
*
|
||||||
|
* Routes operations to the appropriate controller based on file type:
|
||||||
|
* - .xlsx/.xls/.csv → Excel COM (headless, no window)
|
||||||
|
* - .docx/.doc → Word COM (headless, no window)
|
||||||
|
* - .txt/.log/.md → notepad + SendMessage + HWND bind (offscreen)
|
||||||
|
* - Others → generic exe + HWND bind (offscreen)
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { extname } from 'path'
|
||||||
|
|
||||||
|
/** Controller family used to drive an application. */
export type AppType = 'excel' | 'word' | 'text' | 'browser' | 'generic'

/** File extensions routed to the Excel controller. */
const EXCEL_EXTS = new Set(['.xlsx', '.xls', '.csv', '.xlsm', '.xlsb'])
/** File extensions routed to the Word controller. */
const WORD_EXTS = new Set(['.docx', '.doc', '.rtf'])
/** File extensions treated as plain text. */
const TEXT_EXTS = new Set([
  '.txt',
  '.log',
  '.md',
  '.json',
  '.xml',
  '.yaml',
  '.yml',
  '.ini',
  '.cfg',
  '.conf',
])
/** Executable base names treated as text editors. */
const TEXT_APP_NAMES = new Set(['notepad', 'notepad++', 'code'])
/** Executable base names treated as browsers. */
const BROWSER_NAMES = new Set(['chrome', 'msedge', 'firefox', 'brave', 'opera'])

// "excel" / "word" as a standalone word inside an app name ("microsoft excel",
// "word2016") — but not as part of another word ("excelsior", "wordpad",
// "password"), which a plain substring test would misroute to Office COM.
const EXCEL_NAME = /(?<![a-z])excel(?![a-z])/
const WORD_NAME = /(?<![a-z])word(?![a-z])/

/**
 * Detect application type from file path or app name.
 *
 * The file extension is checked first (case-insensitive); if it is not a
 * known document/text extension, the base name (directories and a trailing
 * `.exe` removed) is matched against known application names.
 *
 * @param nameOrPath - File path, executable path, or bare application name.
 * @returns The detected type; `'generic'` when nothing matches.
 */
export function detectAppType(nameOrPath: string): AppType {
  const lower = nameOrPath.toLowerCase()

  // Check by extension
  const ext = extname(lower)
  if (ext) {
    if (EXCEL_EXTS.has(ext)) return 'excel'
    if (WORD_EXTS.has(ext)) return 'word'
    if (TEXT_EXTS.has(ext)) return 'text'
  }

  // Check by app name
  const baseName =
    lower
      .replace(/\.exe$/, '')
      .split(/[/\\]/)
      .pop() ?? ''
  if (EXCEL_NAME.test(baseName)) return 'excel'
  if (baseName === 'winword' || WORD_NAME.test(baseName)) return 'word'
  if (TEXT_APP_NAMES.has(baseName)) return 'text'
  if (BROWSER_NAMES.has(baseName)) return 'browser'

  return 'generic'
}
|
||||||
|
|
||||||
|
/** Outcome of `openWithController`. */
export interface OpenResult {
  /** Controller family that was selected for the target. */
  type: AppType
  /** Window handle for text/browser/generic apps (the SendMessage target). */
  hwnd?: string
  /** Document path for the COM-controlled apps (Excel/Word). */
  filePath?: string
}
|
||||||
|
|
||||||
|
/**
 * Open a file or app with the controller matching its detected type.
 *
 * - excel / word: handled through the COM helper modules (loaded lazily).
 *   A path ending in a known document extension is opened (a failure to open
 *   is ignored and the same result is returned); anything else creates a new
 *   document under `%TEMP%` (falling back to `/tmp`).
 * - text / browser / generic: nothing is launched here; only the type is
 *   returned and the caller performs the launch + HWND bind.
 *
 * @returns The app type plus, for Excel/Word, the document path to use for
 *   subsequent operations.
 */
export async function openWithController(
  nameOrPath: string,
): Promise<OpenResult> {
  const type = detectAppType(nameOrPath)

  if (type === 'excel') {
    // eslint-disable-next-line @typescript-eslint/no-require-imports
    const { createExcel, openExcel } =
      require('./comExcel.js') as typeof import('./comExcel.js')

    if (nameOrPath.match(/\.(xlsx|xls|csv|xlsm|xlsb)$/i)) {
      // Existing workbook: try to open it; the outcome does not change the result.
      try {
        openExcel(nameOrPath)
      } catch {
        // ignored — the path is returned either way
      }
      return { type: 'excel', filePath: nameOrPath }
    }

    // "excel" or "excel.exe" without a file — create new
    const newWorkbook = `${process.env.TEMP || '/tmp'}\\cu_new_${Date.now()}.xlsx`
    createExcel(newWorkbook)
    return { type: 'excel', filePath: newWorkbook }
  }

  if (type === 'word') {
    // eslint-disable-next-line @typescript-eslint/no-require-imports
    const { createWord, openWord } =
      require('./comWord.js') as typeof import('./comWord.js')

    if (nameOrPath.match(/\.(docx|doc|rtf)$/i)) {
      try {
        openWord(nameOrPath)
      } catch {
        // ignored — the path is returned either way
      }
      return { type: 'word', filePath: nameOrPath }
    }

    const newDocument = `${process.env.TEMP || '/tmp'}\\cu_new_${Date.now()}.docx`
    createWord(newDocument)
    return { type: 'word', filePath: newDocument }
  }

  // text/browser/generic — HWND bind handled by caller (platforms/win32.ts open())
  return { type }
}
|
||||||
525
src/utils/computerUse/win32/bridge.py
Normal file
525
src/utils/computerUse/win32/bridge.py
Normal file
@@ -0,0 +1,525 @@
|
|||||||
|
"""
|
||||||
|
Python Bridge for Windows Computer Use.
|
||||||
|
|
||||||
|
Long-lived subprocess communicating via stdin/stdout JSON lines.
|
||||||
|
Replaces per-call PowerShell spawning with a persistent process.
|
||||||
|
|
||||||
|
Capabilities:
|
||||||
|
- screenshot: full-screen or per-window (mss + PrintWindow)
|
||||||
|
- input: mouse click/move/drag, keyboard type/key (ctypes user32)
|
||||||
|
- windows: enumerate, find, get rect, manage (show/min/max/close)
|
||||||
|
- accessibility: UI Automation tree snapshot (comtypes + UIAutomation)
|
||||||
|
|
||||||
|
Protocol: one JSON object per line on stdin → one JSON object per line on stdout.
|
||||||
|
Request: {"id": 1, "method": "screenshot", "params": {...}}
|
||||||
|
Response: {"id": 1, "result": {...}} or {"id": 1, "error": "message"}
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import base64
|
||||||
|
import io
|
||||||
|
import ctypes
|
||||||
|
import ctypes.wintypes
|
||||||
|
import time
|
||||||
|
import os
|
||||||
|
|
||||||
|
# The JSON-lines protocol runs over stdin/stdout: force UTF-8 on both ends.
sys.stdout.reconfigure(encoding='utf-8'); sys.stdin.reconfigure(encoding='utf-8')

# Win32 DLL handles shared by the functions below.
user32, gdi32, kernel32 = (
    ctypes.windll.user32,
    ctypes.windll.gdi32,
    ctypes.windll.kernel32,
)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Win32 constants & types
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Window messages: keyboard
WM_KEYDOWN = 0x0100
WM_KEYUP = 0x0101
WM_CHAR = 0x0102
# Window messages: lifecycle
WM_CLOSE = 0x0010
# Window messages: mouse
WM_MOUSEMOVE = 0x0200
WM_LBUTTONDOWN = 0x0201
WM_LBUTTONUP = 0x0202
WM_RBUTTONDOWN = 0x0204
WM_RBUTTONUP = 0x0205

# ShowWindow commands
SW_MAXIMIZE = 3
SW_MINIMIZE = 6
SW_SHOWMINNOACTIVE = 7
SW_RESTORE = 9

# SetWindowPos flags
SWP_NOSIZE = 0x0001
SWP_NOMOVE = 0x0002
SWP_NOZORDER = 0x0004
SWP_NOACTIVATE = 0x0010
|
||||||
|
|
||||||
|
# Callback prototype handed to EnumWindows: (hwnd, lparam) -> bool.
WNDENUMPROC = ctypes.WINFUNCTYPE(ctypes.c_bool, ctypes.c_void_p, ctypes.c_void_p)


class RECT(ctypes.Structure):
    """Rectangle given by its left/top/right/bottom edges (four C longs)."""
    _fields_ = [
        ("left", ctypes.c_long),
        ("top", ctypes.c_long),
        ("right", ctypes.c_long),
        ("bottom", ctypes.c_long),
    ]


class POINT(ctypes.Structure):
    """Point given by x/y (two C longs)."""
    _fields_ = [
        ("x", ctypes.c_long),
        ("y", ctypes.c_long),
    ]


# SendMessageW with an explicit prototype: pointer-sized hwnd, wParam, lParam
# and return value.
SendMessageW = user32.SendMessageW
SendMessageW.restype = ctypes.c_void_p
SendMessageW.argtypes = [ctypes.c_void_p, ctypes.c_uint, ctypes.c_void_p, ctypes.c_void_p]
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Screenshot
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
def screenshot_full(display_id=0):
    """Capture one monitor with mss and return it as a base64 JPEG.

    Args:
        display_id: Zero-based monitor index. ``sct.monitors[0]`` is the
            combined virtual screen, so monitor ``display_id`` lives at
            index ``display_id + 1``. Any index outside the available
            monitors (including negative values) falls back to the first
            monitor.

    Returns:
        dict with ``base64`` (JPEG bytes, quality 75, base64 text),
        ``width`` and ``height`` of the captured image.
    """
    import mss
    from PIL import Image
    with mss.mss() as sct:
        monitors = sct.monitors
        # Guard the lower bound too: a negative id would otherwise index from
        # the end of the list (or select the combined virtual screen).
        if 0 <= display_id < len(monitors) - 1:
            monitor = monitors[display_id + 1]
        else:
            monitor = monitors[1]
        shot = sct.grab(monitor)
        img = Image.frombytes('RGB', shot.size, shot.bgra, 'raw', 'BGRX')
        buf = io.BytesIO()
        img.save(buf, format='JPEG', quality=75)
        return {
            'base64': base64.b64encode(buf.getvalue()).decode(),
            'width': shot.width,
            'height': shot.height,
        }
|
||||||
|
|
||||||
|
def screenshot_window(hwnd_str):
    """Capture a single window with PrintWindow and return it as a base64 JPEG.

    A minimized window is temporarily shown without activation for the
    capture and minimized again afterwards. If PrintWindow fails, the
    function falls back to BitBlt from the window DC.

    Args:
        hwnd_str: Window handle as a decimal string (or int).

    Returns:
        dict with ``base64`` (JPEG, quality 75), ``width`` and ``height``,
        or ``None`` when the handle is not a window, the window has no area,
        or the GDI capture fails.
    """
    from PIL import Image
    hwnd = int(hwnd_str)
    if not user32.IsWindow(hwnd):
        return None

    # Get window rect
    rect = RECT()
    user32.GetWindowRect(hwnd, ctypes.byref(rect))
    w = rect.right - rect.left
    h = rect.bottom - rect.top
    if w <= 0 or h <= 0:
        return None

    class BITMAPINFOHEADER(ctypes.Structure):
        _fields_ = [
            ('biSize', ctypes.c_uint32), ('biWidth', ctypes.c_int32),
            ('biHeight', ctypes.c_int32), ('biPlanes', ctypes.c_uint16),
            ('biBitCount', ctypes.c_uint16), ('biCompression', ctypes.c_uint32),
            ('biSizeImage', ctypes.c_uint32), ('biXPelsPerMeter', ctypes.c_int32),
            ('biYPelsPerMeter', ctypes.c_int32), ('biClrUsed', ctypes.c_uint32),
            ('biClrImportant', ctypes.c_uint32),
        ]

    was_minimized = user32.IsIconic(hwnd)
    hdc_window = hdc_mem = hbm = prev_obj = None
    try:
        # Handle minimized windows
        if was_minimized:
            user32.ShowWindow(hwnd, 4)  # SW_SHOWNOACTIVATE
            time.sleep(0.1)
            user32.GetWindowRect(hwnd, ctypes.byref(rect))
            w = rect.right - rect.left
            h = rect.bottom - rect.top
            if w <= 0 or h <= 0:
                return None

        # Create DC and bitmap
        hdc_window = user32.GetDC(hwnd)
        if not hdc_window:
            return None
        hdc_mem = gdi32.CreateCompatibleDC(hdc_window)
        hbm = gdi32.CreateCompatibleBitmap(hdc_window, w, h)
        if not hdc_mem or not hbm:
            return None
        prev_obj = gdi32.SelectObject(hdc_mem, hbm)

        # PrintWindow with PW_RENDERFULLCONTENT
        if not user32.PrintWindow(hwnd, hdc_mem, 2):
            # Fallback to BitBlt
            gdi32.BitBlt(hdc_mem, 0, 0, w, h, hdc_window, 0, 0, 0x00CC0020)  # SRCCOPY

        # GetDIBits requires that the bitmap is NOT selected into a DC, so
        # put the DC's previous object back before reading the pixels.
        gdi32.SelectObject(hdc_mem, prev_obj)
        prev_obj = None

        # Extract bitmap bits
        bmi = BITMAPINFOHEADER()
        bmi.biSize = ctypes.sizeof(BITMAPINFOHEADER)
        bmi.biWidth = w
        bmi.biHeight = -h  # top-down
        bmi.biPlanes = 1
        bmi.biBitCount = 32
        bmi.biCompression = 0  # BI_RGB

        pixel_buf = ctypes.create_string_buffer(w * h * 4)
        copied = gdi32.GetDIBits(hdc_mem, hbm, 0, h, pixel_buf, ctypes.byref(bmi), 0)
        if not copied:
            return None
    finally:
        # Always release GDI resources and put a minimized window back, even
        # when the capture raised or bailed out early.
        if prev_obj:
            gdi32.SelectObject(hdc_mem, prev_obj)
        if hbm:
            gdi32.DeleteObject(hbm)
        if hdc_mem:
            gdi32.DeleteDC(hdc_mem)
        if hdc_window:
            user32.ReleaseDC(hwnd, hdc_window)
        if was_minimized:
            user32.ShowWindow(hwnd, SW_SHOWMINNOACTIVE)

    # Convert to JPEG
    img = Image.frombuffer('RGBA', (w, h), pixel_buf, 'raw', 'BGRA', 0, 1)
    img = img.convert('RGB')
    out = io.BytesIO()
    img.save(out, format='JPEG', quality=75)

    return {
        'base64': base64.b64encode(out.getvalue()).decode(),
        'width': w,
        'height': h,
    }
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Window management
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
def list_windows():
    """Return every visible top-level window that has a non-empty title.

    Each entry is a dict with 'hwnd' (the handle rendered as a string),
    'pid' (id of the owning process) and 'title' (the window caption).
    """
    found = []

    def _on_window(hwnd, _lparam):
        # Always return True so EnumWindows keeps walking the window list.
        if not user32.IsWindowVisible(hwnd):
            return True
        title_len = user32.GetWindowTextLengthW(hwnd)
        if title_len <= 0:
            return True
        title = ctypes.create_unicode_buffer(title_len + 1)
        user32.GetWindowTextW(hwnd, title, title_len + 1)
        owner_pid = ctypes.c_uint32()
        user32.GetWindowThreadProcessId(hwnd, ctypes.byref(owner_pid))
        found.append({'hwnd': str(hwnd), 'pid': owner_pid.value, 'title': title.value})
        return True

    user32.EnumWindows(WNDENUMPROC(_on_window), 0)
    return found
|
||||||
|
|
||||||
|
def get_window_rect(hwnd_str):
    """Return the window's screen rectangle, or None if GetWindowRect fails.

    The result is a dict with 'x', 'y' (top-left corner) and 'width',
    'height' derived from the RECT filled in by GetWindowRect.
    """
    hwnd = int(hwnd_str)
    bounds = RECT()
    if not user32.GetWindowRect(hwnd, ctypes.byref(bounds)):
        return None
    return {
        'x': bounds.left,
        'y': bounds.top,
        'width': bounds.right - bounds.left,
        'height': bounds.bottom - bounds.top,
    }
|
||||||
|
|
||||||
|
def get_client_offset(hwnd_str):
    """Return how far the client-area origin sits from the window's top-left corner.

    'dx' / 'dy' are the screen position of client point (0, 0) minus the
    left / top of the window rectangle, i.e. the border width and the
    title-bar height.
    """
    hwnd = int(hwnd_str)
    window_rect = RECT()
    user32.GetWindowRect(hwnd, ctypes.byref(window_rect))
    client_origin = POINT(0, 0)
    user32.ClientToScreen(hwnd, ctypes.byref(client_origin))
    offset_x = client_origin.x - window_rect.left
    offset_y = client_origin.y - window_rect.top
    return {'dx': offset_x, 'dy': offset_y}
|
||||||
|
|
||||||
|
def manage_window(hwnd_str, action):
    """Apply a window-management action to the window identified by `hwnd_str`.

    Supported actions: 'minimize', 'maximize', 'restore', 'close', 'focus'
    and 'move_offscreen'.

    Returns True when the action was issued, and False when the handle does
    not refer to an existing window or the action name is not recognised.
    """
    hwnd = int(hwnd_str)
    # ShowWindow returns the window's *previous* visibility state, not a
    # success flag, so it must not be used as the result. Validate the
    # handle explicitly instead.
    if not user32.IsWindow(hwnd):
        return False

    show_command = {
        'minimize': SW_SHOWMINNOACTIVE,
        'maximize': SW_MAXIMIZE,
        'restore': SW_RESTORE,
    }.get(action) if isinstance(action, str) else None
    if show_command is not None:
        user32.ShowWindow(hwnd, show_command)
        return True

    if action == 'close':
        SendMessageW(hwnd, WM_CLOSE, 0, 0)
        return True

    if action == 'focus':
        # A minimized window has to be restored before it can take focus.
        if user32.IsIconic(hwnd):
            user32.ShowWindow(hwnd, SW_RESTORE)
        user32.SetForegroundWindow(hwnd)
        return True

    if action == 'move_offscreen':
        # Park the window at (-32000, -32000) without resizing, reordering
        # or activating it.
        user32.SetWindowPos(hwnd, 0, -32000, -32000, 0, 0,
                            SWP_NOSIZE | SWP_NOZORDER | SWP_NOACTIVATE)
        return True

    return False
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Input — all via SendMessageW (window-targeted, no global)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
def make_lparam(x, y):
    """Pack a coordinate pair into a message lParam using the MAKELPARAM layout.

    The low 16 bits carry x and the next 16 bits carry y. Both values are
    truncated to integers and masked to 16 bits, so negative coordinates are
    encoded as two's-complement words and the result is always a
    non-negative 32-bit value.
    """
    low_word = int(x) & 0xFFFF
    high_word = int(y) & 0xFFFF
    return (high_word << 16) | low_word
|
||||||
|
|
||||||
|
def send_click(hwnd_str, x, y, button='left'):
    """Send a button-down / button-up pair to the window at client point (x, y).

    `button` is 'left', 'right' or 'middle'. Returns True after the pair has
    been sent and False for any other button name, so callers are not told
    that a click happened when nothing was sent.
    """
    hwnd = int(hwnd_str)
    # (down message, up message, MK_* flag reported while the button is held)
    buttons = {
        'left': (WM_LBUTTONDOWN, WM_LBUTTONUP, 0x0001),   # MK_LBUTTON
        'right': (WM_RBUTTONDOWN, WM_RBUTTONUP, 0x0002),  # MK_RBUTTON
        'middle': (0x0207, 0x0208, 0x0010),               # WM_MBUTTONDOWN / WM_MBUTTONUP / MK_MBUTTON
    }
    spec = buttons.get(button) if isinstance(button, str) else None
    if spec is None:
        return False

    down_msg, up_msg, held_flag = spec
    lp = make_lparam(x, y)
    # As with real input, the down message reports its own button as
    # pressed in wParam and the up message reports no buttons held.
    SendMessageW(hwnd, down_msg, held_flag, lp)
    SendMessageW(hwnd, up_msg, 0, lp)
    return True
|
||||||
|
|
||||||
|
def send_text(hwnd_str, text):
    """Type `text` into the window by sending one WM_CHAR per UTF-16 code unit.

    Characters above U+FFFF are split into a high/low surrogate pair and
    sent as two consecutive WM_CHAR messages. Always returns True.
    """
    hwnd = int(hwnd_str)
    for char in text:
        code_point = ord(char)
        if code_point > 0xFFFF:
            # Outside the BMP: encode as a UTF-16 surrogate pair.
            offset = code_point - 0x10000
            units = (0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF))
        else:
            units = (code_point,)
        for unit in units:
            SendMessageW(hwnd, WM_CHAR, unit, 0)
    return True
|
||||||
|
|
||||||
|
def send_key(hwnd_str, vk, action='down'):
    """Send one key message for virtual-key code `vk`.

    WM_KEYDOWN is sent when `action` is 'down'; any other value sends
    WM_KEYUP. Always returns True.
    """
    hwnd = int(hwnd_str)
    if action == 'down':
        message = WM_KEYDOWN
    else:
        message = WM_KEYUP
    SendMessageW(hwnd, message, vk, 0)
    return True
|
||||||
|
|
||||||
|
def send_keys_combo(hwnd_str, keys):
    """Send a key combination like ['ctrl', 's'] as WM_KEYDOWN / WM_KEYUP messages.

    Modifiers ('ctrl'/'control', 'shift', 'alt') are pressed in the order
    given, then the main key is pressed and released, then the modifiers are
    released in reverse order. If several non-modifier keys are supplied the
    last one wins; unrecognised multi-character names are ignored.

    Returns False when no main key could be determined (including a single
    character that has no virtual-key mapping on the current keyboard
    layout), True otherwise.
    """
    VK = {
        'ctrl': 0x11, 'control': 0x11, 'shift': 0x10, 'alt': 0x12,
        'enter': 0x0D, 'return': 0x0D, 'tab': 0x09, 'escape': 0x1B,
        'backspace': 0x08, 'delete': 0x2E, 'space': 0x20,
        'left': 0x25, 'up': 0x26, 'right': 0x27, 'down': 0x28,
        'home': 0x24, 'end': 0x23, 'pageup': 0x21, 'pagedown': 0x22,
        'f1': 0x70, 'f2': 0x71, 'f3': 0x72, 'f4': 0x73, 'f5': 0x74,
        'f6': 0x75, 'f7': 0x76, 'f8': 0x77, 'f9': 0x78, 'f10': 0x79,
        'f11': 0x7A, 'f12': 0x7B,
    }
    MODIFIERS = {'ctrl', 'control', 'shift', 'alt'}

    hwnd = int(hwnd_str)
    mods = []
    main_key = None
    for k in keys:
        kl = k.lower()
        if kl in MODIFIERS:
            mods.append(VK[kl])
        elif kl in VK:
            main_key = VK[kl]
        elif len(kl) == 1:
            if kl.isascii() and kl.isalnum():
                # Virtual-key codes for A-Z and 0-9 equal their ASCII codes.
                main_key = ord(kl.upper())
            else:
                # For punctuation, ord() is NOT a virtual-key code
                # (ord('.') == 0x2E is VK_DELETE). Ask the keyboard layout.
                code_unit = ord(kl)
                if code_unit > 0xFFFF:
                    return False
                scan = user32.VkKeyScanW(code_unit) & 0xFFFF
                if scan == 0xFFFF:  # no key produces this character
                    return False
                # Low byte is the virtual key; the shift state in the high
                # byte is left to the caller's explicit modifiers.
                main_key = scan & 0xFF

    if main_key is None:
        return False

    for m in mods:
        SendMessageW(hwnd, WM_KEYDOWN, m, 0)
    SendMessageW(hwnd, WM_KEYDOWN, main_key, 0)
    SendMessageW(hwnd, WM_KEYUP, main_key, 0)
    for m in reversed(mods):
        SendMessageW(hwnd, WM_KEYUP, m, 0)
    return True
|
||||||
|
|
||||||
|
def send_mouse_down(hwnd_str, x, y):
    """Send WM_LBUTTONDOWN to the window at client point (x, y). Always returns True."""
    target = int(hwnd_str)
    position = make_lparam(x, y)
    SendMessageW(target, WM_LBUTTONDOWN, 0, position)
    return True
|
||||||
|
|
||||||
|
def send_mouse_up(hwnd_str, x, y):
    """Send WM_LBUTTONUP to the window at client point (x, y). Always returns True."""
    target = int(hwnd_str)
    position = make_lparam(x, y)
    SendMessageW(target, WM_LBUTTONUP, 0, position)
    return True
|
||||||
|
|
||||||
|
def send_mouse_move(hwnd_str, x, y):
    """Send WM_MOUSEMOVE to the window at client point (x, y). Always returns True."""
    target = int(hwnd_str)
    position = make_lparam(x, y)
    SendMessageW(target, WM_MOUSEMOVE, 0, position)
    return True
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Accessibility snapshot (UI Automation via comtypes)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
_uia_client = None
|
||||||
|
|
||||||
|
def _get_uia():
|
||||||
|
global _uia_client
|
||||||
|
if _uia_client is None:
|
||||||
|
try:
|
||||||
|
import comtypes.client
|
||||||
|
comtypes.client.GetModule('UIAutomationCore.dll')
|
||||||
|
from comtypes.gen.UIAutomationClient import CUIAutomation
|
||||||
|
_uia_client = comtypes.client.CreateObject(CUIAutomation)
|
||||||
|
except Exception:
|
||||||
|
# Fallback: use pywinauto
|
||||||
|
pass
|
||||||
|
return _uia_client
|
||||||
|
|
||||||
|
def accessibility_snapshot(hwnd_str, max_depth=4):
    """Build a pruned accessibility tree for a window via pywinauto's UIA backend.

    Returns a list of node dicts with keys 'role', 'name', 'id', 'x', 'y',
    'w', 'h' and 'on', plus 'v' (value, truncated to 100 characters) and
    'c' (child nodes) when present. An element is kept when its control type
    is in the interactive set or when it has kept descendants; traversal
    stops once `max_depth` levels have been visited.

    Returns None when the window is not found, the resulting tree is empty,
    or anything raises (including pywinauto being unavailable).
    """
    try:
        from pywinauto import Desktop
        from pywinauto.controls.uiawrapper import UIAWrapper

        target = int(hwnd_str)
        # Locate the top-level window whose native handle matches.
        root = next(
            (top for top in Desktop(backend='uia').windows() if top.handle == target),
            None,
        )
        if root is None:
            return None

        interactive_roles = {
            'Button', 'Edit', 'ComboBox', 'CheckBox', 'RadioButton',
            'MenuItem', 'Menu', 'MenuBar', 'Hyperlink', 'Slider',
            'Tab', 'TabItem', 'List', 'ListItem', 'Document',
            'TreeItem', 'DataItem', 'ToolBar', 'SplitButton',
        }

        def collect(parent, level):
            if level >= max_depth:
                return []
            try:
                kids = parent.children()
            except Exception:
                return []
            kept = []
            for kid in kids:
                try:
                    info = kid.element_info
                    role = info.control_type or ''
                    label = info.name or ''
                    automation_id = info.automation_id or ''
                    box = kid.rectangle()
                    width = box.right - box.left
                    height = box.bottom - box.top
                    # Skip empty rectangles and elements parked far off-screen.
                    if width <= 0 or height <= 0 or box.left < -10000:
                        continue
                    is_on = kid.is_enabled()
                    try:
                        current = kid.get_value()
                    except Exception:
                        current = None
                    nested = collect(kid, level + 1)
                    if role not in interactive_roles and not nested:
                        continue
                    entry = {
                        'role': role, 'name': label, 'id': automation_id,
                        'x': box.left, 'y': box.top, 'w': width, 'h': height,
                        'on': is_on,
                    }
                    if current:
                        entry['v'] = str(current)[:100]
                    if nested:
                        entry['c'] = nested
                    kept.append(entry)
                except Exception:
                    # One unreadable element must not abort the whole walk.
                    continue
            return kept

        return collect(root, 0) or None
    except Exception:
        return None
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Find edit child (for text input targeting)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
def find_edit_child(hwnd_str):
    """Find the first Edit or Document descendant of a window via UI Automation.

    Returns that control's native handle as a string. Returns None when the
    window is not found, has no such descendant, the first match has no
    native handle, or anything raises (including pywinauto being
    unavailable).
    """
    try:
        from pywinauto import Desktop

        target = int(hwnd_str)
        top = next(
            (win for win in Desktop(backend='uia').windows() if win.handle == target),
            None,
        )
        if top is None:
            return None
        for element in top.descendants():
            try:
                if element.element_info.control_type not in ('Edit', 'Document'):
                    continue
                native = element.handle
                return str(native) if native else None
            except Exception:
                # Skip elements whose properties cannot be read.
                continue
    except Exception:
        pass
    return None
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Clipboard paste (for large text)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
def paste_text(hwnd_str, text):
    """Put `text` on the clipboard as CF_UNICODETEXT, then send Ctrl+V to the window.

    Returns True when the clipboard was updated and the key combination was
    sent, and False when the clipboard could not be opened or filled. The
    clipboard is always closed again once it has been opened, even if an
    intermediate step fails.
    """
    CF_UNICODETEXT = 13
    GMEM_MOVEABLE = 0x0002

    # Global memory handles and pointers are pointer-sized; declare that so
    # they are not squeezed through ctypes' default C int on 64-bit Windows.
    # NOTE(review): harmless if these prototypes are already declared
    # elsewhere in the module - confirm they agree.
    kernel32.GlobalAlloc.restype = ctypes.c_void_p
    kernel32.GlobalAlloc.argtypes = [ctypes.c_uint, ctypes.c_size_t]
    kernel32.GlobalLock.restype = ctypes.c_void_p
    kernel32.GlobalLock.argtypes = [ctypes.c_void_p]
    kernel32.GlobalUnlock.argtypes = [ctypes.c_void_p]
    kernel32.GlobalFree.argtypes = [ctypes.c_void_p]
    user32.SetClipboardData.restype = ctypes.c_void_p
    user32.SetClipboardData.argtypes = [ctypes.c_uint, ctypes.c_void_p]

    # 'surrogatepass' keeps lone surrogates (possible in JSON input) from
    # raising; the trailing two zero bytes are the UTF-16 terminator.
    data = text.encode('utf-16-le', 'surrogatepass') + b'\x00\x00'

    if not user32.OpenClipboard(0):
        return False
    try:
        user32.EmptyClipboard()
        handle = kernel32.GlobalAlloc(GMEM_MOVEABLE, len(data))
        if not handle:
            return False
        ptr = kernel32.GlobalLock(handle)
        if not ptr:
            kernel32.GlobalFree(handle)
            return False
        ctypes.memmove(ptr, data, len(data))
        kernel32.GlobalUnlock(handle)
        # On success the system owns the memory; free it only on failure.
        if not user32.SetClipboardData(CF_UNICODETEXT, handle):
            kernel32.GlobalFree(handle)
            return False
    finally:
        # An open clipboard blocks every other application, so always close it.
        user32.CloseClipboard()

    send_keys_combo(hwnd_str, ['ctrl', 'v'])
    return True
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Mouse wheel scroll (WM_MOUSEWHEEL / WM_MOUSEHWHEEL)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
WM_MOUSEWHEEL = 0x020A
WM_MOUSEHWHEEL = 0x020E

# ClientToScreen turns client coordinates into the screen coordinates that
# the wheel messages carry in lParam.
user32.ClientToScreen.argtypes = [ctypes.c_void_p, ctypes.POINTER(POINT)]
user32.ClientToScreen.restype = ctypes.c_bool


def send_mouse_wheel(hwnd_str, x, y, delta, horizontal=False):
    """Send a mouse-wheel message for client point (x, y).

    `delta` is a number of wheel clicks (1 click = 120 units): positive
    scrolls up/right, negative scrolls down/left. WM_MOUSEHWHEEL is sent
    when `horizontal` is true, WM_MOUSEWHEEL otherwise. Always returns True.
    """
    hwnd = int(hwnd_str)
    if horizontal:
        message = WM_MOUSEHWHEEL
    else:
        message = WM_MOUSEWHEEL
    wheel_units = 120 * int(delta)

    # lParam carries screen coordinates, so convert from client space first.
    screen_pt = POINT(int(x), int(y))
    user32.ClientToScreen(hwnd, ctypes.byref(screen_pt))

    # wParam: wheel delta in the high word, modifier-key flags (none) in the low word.
    wparam = ctypes.c_void_p(wheel_units << 16)
    lparam = ctypes.c_void_p((screen_pt.y << 16) | (screen_pt.x & 0xFFFF))
    SendMessageW(hwnd, message, wparam, lparam)
    return True
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Dispatch
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Maps each bridge method name to a handler that takes the request's
# `params` dict and returns a JSON-serializable result.
METHODS = {
    # Screenshots
    'screenshot': lambda params: screenshot_full(params.get('display_id', 0)),
    'screenshot_window': lambda params: screenshot_window(params['hwnd']),
    # Window enumeration and management
    'list_windows': lambda params: list_windows(),
    'get_window_rect': lambda params: get_window_rect(params['hwnd']),
    'get_client_offset': lambda params: get_client_offset(params['hwnd']),
    'manage_window': lambda params: manage_window(params['hwnd'], params['action']),
    # Window-targeted input
    'send_click': lambda params: send_click(
        params['hwnd'], params['x'], params['y'], params.get('button', 'left')),
    'send_text': lambda params: send_text(params['hwnd'], params['text']),
    'send_key': lambda params: send_key(
        params['hwnd'], params['vk'], params.get('action', 'down')),
    'send_keys': lambda params: send_keys_combo(params['hwnd'], params['keys']),
    'send_mouse_down': lambda params: send_mouse_down(params['hwnd'], params['x'], params['y']),
    'send_mouse_up': lambda params: send_mouse_up(params['hwnd'], params['x'], params['y']),
    'send_mouse_move': lambda params: send_mouse_move(params['hwnd'], params['x'], params['y']),
    'paste_text': lambda params: paste_text(params['hwnd'], params['text']),
    'send_mouse_wheel': lambda params: send_mouse_wheel(
        params['hwnd'], params['x'], params['y'], params['delta'],
        params.get('horizontal', False)),
    # UI Automation helpers
    'find_edit_child': lambda params: find_edit_child(params['hwnd']),
    'accessibility_snapshot': lambda params: accessibility_snapshot(
        params['hwnd'], params.get('max_depth', 4)),
    # Liveness check
    'ping': lambda params: {'ok': True, 'pid': os.getpid()},
}
|
||||||
|
|
||||||
|
def main():
    """Main loop: read JSON lines from stdin, dispatch, write JSON lines to stdout.

    Every non-blank input line produces exactly one reply line of the form
    {'id': ..., 'result': ...} or {'id': ..., 'error': ...}. Malformed
    input (invalid JSON, or JSON that is not an object) is answered with an
    error carrying id 0 and never terminates the loop, so one bad request
    cannot kill the long-lived bridge process.
    """
    def _reply(payload):
        sys.stdout.write(json.dumps(payload, ensure_ascii=False) + '\n')
        sys.stdout.flush()

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue

        try:
            req = json.loads(line)
        except json.JSONDecodeError as e:
            _reply({'id': 0, 'error': f'invalid JSON: {e}'})
            continue

        # json.loads also accepts arrays, strings and numbers; only an
        # object has the id/method/params fields read below.
        if not isinstance(req, dict):
            _reply({'id': 0, 'error': 'invalid request: expected a JSON object'})
            continue

        req_id = req.get('id', 0)
        method = req.get('method', '')
        params = req.get('params') or {}

        handler = METHODS.get(method) if isinstance(method, str) else None
        if handler is None:
            resp = {'id': req_id, 'error': f'unknown method: {method}'}
        else:
            try:
                resp = {'id': req_id, 'result': handler(params)}
            except Exception as e:
                resp = {'id': req_id, 'error': str(e)}

        try:
            _reply(resp)
        except (TypeError, ValueError) as e:
            # The handler returned something json.dumps cannot encode.
            _reply({'id': req_id, 'error': f'unserializable result: {e}'})


if __name__ == '__main__':
    main()
|
||||||
191
src/utils/computerUse/win32/bridgeClient.ts
Normal file
191
src/utils/computerUse/win32/bridgeClient.ts
Normal file
@@ -0,0 +1,191 @@
|
|||||||
|
/**
|
||||||
|
* Python Bridge Client — manages a long-lived Python subprocess for Windows
|
||||||
|
* Computer Use operations.
|
||||||
|
*
|
||||||
|
* Replaces per-call PowerShell spawning with a persistent Python process
|
||||||
|
* that communicates via JSON lines over stdin/stdout.
|
||||||
|
*
|
||||||
|
* Performance: ~1-5ms per call vs ~200-500ms per PowerShell spawn.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import * as path from 'path'
|
||||||
|
|
||||||
|
/** One JSON-line request written to the bridge process's stdin. */
interface BridgeRequest {
  id: number
  method: string
  params: Record<string, unknown>
}

/** One JSON-line reply read from the bridge process's stdout. */
interface BridgeResponse {
  id: number
  result?: unknown
  error?: string
}

/** Settle callbacks for a request that is still waiting for its reply. */
type PendingCall = {
  resolve: (value: unknown) => void
  reject: (error: Error) => void
}

/** The running bridge subprocess, or null when none has been started. */
let bridgeProc: ReturnType<typeof Bun.spawn> | null = null

/** Last request id handed out; incremented before each call. */
let requestId = 0

/** In-flight requests keyed by request id. */
const pendingRequests = new Map<number, PendingCall>()

/** Stdout text received so far that does not yet end in a newline. */
let outputBuffer = ''
|
||||||
|
|
||||||
|
/**
 * Start the Python bridge process if it is not already running.
 *
 * Spawns `python -u bridge.py` with piped stdin/stdout and starts a
 * background loop that splits stdout into lines, parses each line as a
 * {@link BridgeResponse}, and settles the matching pending request.
 *
 * When the process's stdout ends (the process exited or the read failed),
 * the bridge is marked as stopped and every request still waiting is
 * rejected, so the next call spawns a fresh process instead of writing to a
 * dead one.
 *
 * @returns `true` when a bridge is running or was started, `false` when
 *   spawning threw.
 */
export function ensureBridge(): boolean {
  if (bridgeProc) return true
  try {
    const scriptPath = path.join(__dirname, 'bridge.py')
    const proc = Bun.spawn(['python', '-u', scriptPath], {
      stdin: 'pipe',
      stdout: 'pipe',
      stderr: 'ignore',
      env: { ...process.env, PYTHONIOENCODING: 'utf-8', PYTHONUNBUFFERED: '1' },
    })
    bridgeProc = proc

    // Read stdout lines asynchronously
    const reader = proc.stdout.getReader()
    // One streaming decoder for the whole pipe: a multi-byte UTF-8 sequence
    // split across two chunks must not be decoded as two broken halves.
    const decoder = new TextDecoder()

    const readLoop = async (): Promise<void> => {
      try {
        while (true) {
          const { done, value } = await reader.read()
          // Stop as well if this process was stopped or replaced meanwhile.
          if (done || bridgeProc !== proc) break
          outputBuffer += decoder.decode(value, { stream: true })

          // Process complete lines
          let newlineIdx: number
          while ((newlineIdx = outputBuffer.indexOf('\n')) !== -1) {
            const line = outputBuffer.slice(0, newlineIdx).trim()
            outputBuffer = outputBuffer.slice(newlineIdx + 1)
            if (!line) continue
            try {
              const resp: BridgeResponse = JSON.parse(line)
              const pending = pendingRequests.get(resp.id)
              if (pending) {
                pendingRequests.delete(resp.id)
                if (resp.error) {
                  pending.reject(new Error(resp.error))
                } else {
                  pending.resolve(resp.result)
                }
              }
            } catch {
              // Not a well-formed reply line; skip it and keep reading.
            }
          }
        }
      } catch {
        // A failed read is handled like end-of-stream below.
      } finally {
        // Only clean up if this is still the active bridge; stopBridge()
        // or a newer process owns the shared state otherwise.
        if (bridgeProc === proc) {
          bridgeProc = null
          outputBuffer = ''
          const waiting = Array.from(pendingRequests.values())
          pendingRequests.clear()
          for (const pending of waiting) {
            pending.reject(new Error('Python bridge exited'))
          }
        }
      }
    }
    void readLoop()

    return true
  } catch {
    bridgeProc = null
    return false
  }
}
|
||||||
|
|
||||||
|
/**
 * Send one request to the Python bridge and resolve with its result.
 *
 * The returned promise rejects when the bridge cannot be started, when the
 * bridge replies with an error, when writing the request fails, or when no
 * reply arrives within `timeoutMs` milliseconds.
 */
export async function call<T = unknown>(
  method: string,
  params: Record<string, unknown> = {},
  timeoutMs: number = 10000,
): Promise<T> {
  if (!ensureBridge()) {
    throw new Error('Python bridge not available')
  }

  requestId += 1
  const id = requestId
  const payload: BridgeRequest = { id, method, params }

  return new Promise<T>((resolve, reject) => {
    // Give up on the request if the bridge never answers.
    const timer = setTimeout(() => {
      pendingRequests.delete(id)
      reject(new Error(`Bridge call ${method} timed out after ${timeoutMs}ms`))
    }, timeoutMs)

    // Both settle paths cancel the timeout before handing the outcome on.
    pendingRequests.set(id, {
      resolve: value => {
        clearTimeout(timer)
        ;(resolve as any)(value)
      },
      reject: error => {
        clearTimeout(timer)
        reject(error)
      },
    })

    try {
      bridgeProc!.stdin.write(JSON.stringify(payload) + '\n')
      bridgeProc!.stdin.flush()
    } catch (err) {
      clearTimeout(timer)
      pendingRequests.delete(id)
      reject(new Error(`Bridge write failed: ${err}`))
    }
  })
}
|
||||||
|
|
||||||
|
/**
 * Synchronous call — blocks the event loop. Use sparingly.
 *
 * Does not use the long-lived bridge: it spawns a one-shot
 * `python -u bridge.py` process, feeds it a single request on stdin and
 * parses the reply from its stdout. There is no PowerShell fallback here.
 *
 * @returns The `result` field of the reply, or `null` when the process
 *   produced no output, the reply could not be parsed, the reply carried an
 *   error, or spawning failed or timed out.
 */
export function callSync<T = unknown>(
  method: string,
  params: Record<string, unknown> = {},
  timeoutMs: number = 10000,
): T | null {
  // SECURITY: JSON is passed via stdin (not embedded in -c) to prevent code injection.
  try {
    const scriptPath = path.join(__dirname, 'bridge.py')
    const requestLine = JSON.stringify({ id: 1, method, params }) + '\n'
    const { stdout } = Bun.spawnSync({
      cmd: ['python', '-u', scriptPath],
      stdin: Buffer.from(requestLine),
      stdout: 'pipe',
      stderr: 'pipe',
      env: { ...process.env, PYTHONIOENCODING: 'utf-8' },
      timeout: timeoutMs,
    })

    const replyText = new TextDecoder().decode(stdout).trim()
    if (!replyText) return null

    const resp: BridgeResponse = JSON.parse(replyText)
    // An error reply is reported the same way as every other failure.
    if (resp.error) return null
    return resp.result as T
  } catch {
    return null
  }
}
|
||||||
|
|
||||||
|
/**
 * Kill the bridge process and reset all client state.
 *
 * Requests that are still waiting for a reply are rejected immediately
 * instead of being left to run into their timeouts. Closing stdin and
 * killing the process are attempted independently, so a failure to close
 * stdin does not leave the process alive.
 */
export function stopBridge(): void {
  const proc = bridgeProc
  bridgeProc = null

  if (proc) {
    try {
      proc.stdin.end()
    } catch {
      // stdin may already be closed; still try to kill the process.
    }
    try {
      proc.kill()
    } catch {
      // The process may already have exited.
    }
  }

  const waiting = Array.from(pendingRequests.values())
  pendingRequests.clear()
  outputBuffer = ''
  for (const pending of waiting) {
    pending.reject(new Error('Python bridge stopped'))
  }
}
|
||||||
|
|
||||||
|
// NOTE: No process exit handlers here — the platform-level win32.ts
|
||||||
|
// already registers exit/SIGINT/SIGTERM handlers that call cleanupAll(),
|
||||||
|
// which includes stopBridge(). Adding handlers here would cause double
|
||||||
|
// cleanup and duplicate process.exit() calls.
|
||||||
320
src/utils/computerUse/win32/comExcel.ts
Normal file
320
src/utils/computerUse/win32/comExcel.ts
Normal file
@@ -0,0 +1,320 @@
|
|||||||
|
/**
|
||||||
|
* Excel COM automation via PowerShell.
|
||||||
|
* Completely headless — Visible=false, no window, no user impact.
|
||||||
|
* Each operation opens and closes Excel to avoid orphaned processes.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/** A single worksheet cell as reported by the Excel helpers. */
export interface CellInfo {
  /** Row index passed to / read from `Cells.Item(row, col)`. */
  row: number
  /** Column index passed to / read from `Cells.Item(row, col)`. */
  col: number
  /** The cell's `Value2`. */
  value: string | number | null
  /** The cell's formula text, when one was reported. */
  formula?: string
}

/** One worksheet: its name, used-range size and collected cells. */
export interface SheetInfo {
  name: string
  usedRange: {
    rows: number
    cols: number
  }
  cells: CellInfo[]
}

/** A workbook summary: per-sheet details plus the list of sheet names. */
export interface ExcelInfo {
  sheets: SheetInfo[]
  sheetNames: string[]
}
|
||||||
|
|
||||||
|
/**
 * Run a PowerShell script synchronously and return its trimmed stdout.
 *
 * @throws Error when PowerShell exits with a non-zero code and wrote
 *   something to stderr.
 */
function ps(script: string): string {
  const { stdout, stderr, exitCode } = Bun.spawnSync({
    cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', script],
    stdout: 'pipe',
    stderr: 'pipe',
  })

  const errorText = new TextDecoder().decode(stderr).trim()
  const failed = exitCode !== 0 && Boolean(errorText)
  if (failed) {
    throw new Error(`PowerShell error: ${errorText}`)
  }

  return new TextDecoder().decode(stdout).trim()
}
|
||||||
|
|
||||||
|
/**
 * Escape a string for embedding inside a PowerShell single-quoted literal.
 *
 * PowerShell's tokenizer treats the typographic quotes U+2018, U+2019,
 * U+201A and U+201B as single-quote delimiters as well as the ASCII
 * apostrophe, so each of them is doubled. Escaping only `'` would let a
 * path containing a "smart quote" terminate the literal early.
 */
function escPath(p: string): string {
  return p.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
}
|
||||||
|
|
||||||
|
/**
 * Build the PowerShell statement that assigns a worksheet to `$<varName>`.
 *
 * A numeric `sheet` is passed to `$wb.Sheets.Item(...)` as an index; a
 * string is passed as a single-quoted sheet name. Every character
 * PowerShell accepts as a single-quote delimiter (ASCII `'` and U+2018,
 * U+2019, U+201A, U+201B) is doubled so the name cannot end the literal.
 */
function resolveSheet(varName: string, sheet: string | number): string {
  const selector =
    typeof sheet === 'number'
      ? String(sheet)
      : `'${sheet.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')}'`
  return `$${varName} = $wb.Sheets.Item(${selector})`
}
|
||||||
|
|
||||||
|
/**
 * PowerShell preamble shared by every Excel operation: creates the
 * Excel.Application COM object with its window hidden and alerts disabled.
 */
const EXCEL_INIT = [
  '$excel = New-Object -ComObject Excel.Application',
  '$excel.Visible = $false',
  '$excel.DisplayAlerts = $false',
].join('\n')
|
||||||
|
|
||||||
|
/**
 * Build the PowerShell statements for a `finally` block that shuts Excel down.
 *
 * - The workbook (when `hasWorkbook`) is closed without saving inside its
 *   own try/catch, so a failing close cannot skip quitting Excel.
 * - `$excel` is only used when it was actually created; if `New-Object`
 *   failed, the cleanup must not raise a second error from the `finally`.
 * - Both COM objects are released and a garbage collection is forced.
 *   Unreleased references are what keep an EXCEL.EXE process alive.
 *
 * @param hasWorkbook Whether the surrounding script assigns `$wb`.
 * @returns The statements joined with a newline followed by a space.
 */
function excelCleanup(hasWorkbook = true): string {
  const release = '[System.Runtime.InteropServices.Marshal]::ReleaseComObject'
  const parts: string[] = []
  if (hasWorkbook) {
    parts.push(
      `if ($wb) { try { $wb.Close($false) } catch {}; ${release}($wb) | Out-Null }`,
    )
  }
  parts.push(`if ($excel) { $excel.Quit(); ${release}($excel) | Out-Null }`)
  parts.push('[System.GC]::Collect()')
  parts.push('[System.GC]::WaitForPendingFinalizers()')
  return parts.join('\n ')
}
|
||||||
|
|
||||||
|
/**
 * Open a workbook and read its contents.
 *
 * For every sheet the script records the name, the used-range size and up
 * to 1000 non-empty cells (with their formula when the cell has one).
 *
 * @throws Error when PowerShell produced no output.
 */
export function openExcel(filePath: string): ExcelInfo {
  const script = `
${EXCEL_INIT}
try {
  $wb = $excel.Workbooks.Open('${escPath(filePath)}')
  $result = @{ sheets = @(); sheetNames = @() }
  foreach ($sheet in $wb.Sheets) {
    $result.sheetNames += $sheet.Name
    $ur = $sheet.UsedRange
    $rows = $ur.Rows.Count
    $cols = $ur.Columns.Count
    $cells = @()
    $count = 0
    for ($r = 1; $r -le $rows -and $count -lt 1000; $r++) {
      for ($c = 1; $c -le $cols -and $count -lt 1000; $c++) {
        $cell = $sheet.Cells.Item($r, $c)
        $val = $cell.Value2
        if ($null -ne $val) {
          $f = $null
          if ($cell.HasFormula) { $f = $cell.Formula }
          $entry = @{ row = $r; col = $c; value = $val }
          if ($f) { $entry.formula = $f }
          $cells += $entry
          $count++
        }
      }
    }
    $result.sheets += @{
      name = $sheet.Name
      usedRange = @{ rows = $rows; cols = $cols }
      cells = $cells
    }
  }
  $result | ConvertTo-Json -Depth 5 -Compress
} finally {
  ${excelCleanup()}
}
`
  const output = ps(script)
  if (!output) throw new Error('No output from openExcel')
  const parsed = JSON.parse(output)

  // PowerShell may serialize a one-element collection as a bare value, so
  // wrap anything that is not already an array.
  const rawSheets: any[] = Array.isArray(parsed.sheets) ? parsed.sheets : [parsed.sheets]
  const sheetNames: string[] = Array.isArray(parsed.sheetNames)
    ? parsed.sheetNames
    : [parsed.sheetNames]

  const sheets: SheetInfo[] = rawSheets.map((entry: any) => {
    let cells: CellInfo[] = []
    if (Array.isArray(entry.cells)) {
      cells = entry.cells
    } else if (entry.cells) {
      cells = [entry.cells]
    }
    return { name: entry.name, usedRange: entry.usedRange, cells }
  })

  return { sheets, sheetNames }
}
|
||||||
|
|
||||||
|
/**
 * Read the `Value2` of a single cell.
 *
 * @returns The parsed value, or `null` when the cell is empty or the
 *   script printed nothing.
 */
export function readCell(
  filePath: string,
  sheet: string | number,
  row: number,
  col: number,
): string | number | null {
  const script = `
${EXCEL_INIT}
try {
  $wb = $excel.Workbooks.Open('${escPath(filePath)}')
  ${resolveSheet('sheet', sheet)}
  $val = $sheet.Cells.Item(${row}, ${col}).Value2
  if ($null -eq $val) { Write-Output 'null' } else { Write-Output ($val | ConvertTo-Json -Compress) }
} finally {
  ${excelCleanup()}
}
`
  const output = ps(script)
  const isEmpty = output === '' || output === 'null'
  return isEmpty ? null : JSON.parse(output)
}
|
||||||
|
|
||||||
|
/**
 * Read a rectangular range of cells as a 2D array (rows of values).
 *
 * Empty cells travel through the JSON as the marker string `__NULL__` and
 * are turned back into `null` here.
 *
 * @returns The rows, or an empty array when the script printed nothing.
 */
export function readRange(
  filePath: string,
  sheet: string | number,
  startRow: number,
  startCol: number,
  endRow: number,
  endCol: number,
): (string | number | null)[][] {
  const script = `
${EXCEL_INIT}
try {
  $wb = $excel.Workbooks.Open('${escPath(filePath)}')
  ${resolveSheet('sheet', sheet)}
  $rows = @()
  for ($r = ${startRow}; $r -le ${endRow}; $r++) {
    $row = @()
    for ($c = ${startCol}; $c -le ${endCol}; $c++) {
      $val = $sheet.Cells.Item($r, $c).Value2
      $row += if ($null -eq $val) { '__NULL__' } else { $val }
    }
    $rows += ,@($row)
  }
  $rows | ConvertTo-Json -Depth 3 -Compress
} finally {
  ${excelCleanup()}
}
`
  const output = ps(script)
  if (!output) return []

  const parsed = JSON.parse(output)
  // A single row arrives as a flat array; wrap it so the result is always 2D.
  const grid: any[] = Array.isArray(parsed[0]) ? parsed : [parsed]
  const decodeCell = (cell: any) => (cell === '__NULL__' ? null : cell)
  return grid.map((cells: any[]) => cells.map(decodeCell))
}
|
||||||
|
|
||||||
|
/**
 * Write a single cell value and save the workbook.
 *
 * The value is passed to PowerShell as JSON inside a single-quoted literal.
 * Every character PowerShell accepts as a single-quote delimiter (ASCII `'`
 * and U+2018, U+2019, U+201A, U+201B) is doubled, because `JSON.stringify`
 * leaves those characters unescaped and any of them would otherwise end
 * the literal.
 *
 * @returns `true` when the script reported success.
 */
export function writeCell(
  filePath: string,
  sheet: string | number,
  row: number,
  col: number,
  value: string | number,
): boolean {
  const jsonVal = JSON.stringify(value).replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  const script = `
${EXCEL_INIT}
try {
  $wb = $excel.Workbooks.Open('${escPath(filePath)}')
  ${resolveSheet('sheet', sheet)}
  $sheet.Cells.Item(${row}, ${col}).Value2 = (ConvertFrom-Json '${jsonVal}')
  $wb.Save()
  Write-Output 'true'
} finally {
  ${excelCleanup()}
}
`
  return ps(script) === 'true'
}
|
||||||
|
|
||||||
|
/**
 * Write a 2D array of values starting at (startRow, startCol) and save.
 *
 * `null` entries are skipped, leaving the existing cell untouched. Numeric
 * values are written as doubles and everything else as strings.
 *
 * The data is passed to PowerShell as JSON inside a single-quoted literal.
 * Every character PowerShell accepts as a single-quote delimiter (ASCII `'`
 * and U+2018, U+2019, U+201A, U+201B) is doubled, because `JSON.stringify`
 * leaves those characters unescaped and any of them would otherwise end
 * the literal.
 *
 * @returns `true` when the script reported success.
 */
export function writeRange(
  filePath: string,
  sheet: string | number,
  startRow: number,
  startCol: number,
  data: (string | number | null)[][],
): boolean {
  const jsonData = JSON.stringify(data).replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  const script = `
${EXCEL_INIT}
try {
  $wb = $excel.Workbooks.Open('${escPath(filePath)}')
  ${resolveSheet('sheet', sheet)}
  $data = ConvertFrom-Json '${jsonData}'
  for ($r = 0; $r -lt $data.Count; $r++) {
    $row = $data[$r]
    for ($c = 0; $c -lt $row.Count; $c++) {
      $val = $row[$c]
      if ($null -ne $val) {
        if ($val -is [int] -or $val -is [long] -or $val -is [double] -or $val -is [decimal]) {
          $sheet.Cells.Item(${startRow} + $r, ${startCol} + $c).Value2 = [double]$val
        } else {
          $sheet.Cells.Item(${startRow} + $r, ${startCol} + $c).Value2 = [string]$val
        }
      }
    }
  }
  $wb.Save()
  Write-Output 'true'
} finally {
  ${excelCleanup()}
}
`
  return ps(script) === 'true'
}
|
||||||
|
|
||||||
|
/**
 * Set a formula on a cell and save the workbook.
 *
 * The formula is embedded in a PowerShell single-quoted literal. Every
 * character PowerShell accepts as a single-quote delimiter (ASCII `'` and
 * U+2018, U+2019, U+201A, U+201B) is doubled so the formula text cannot end
 * the literal.
 *
 * @returns `true` when the script reported success.
 */
export function setFormula(
  filePath: string,
  sheet: string | number,
  row: number,
  col: number,
  formula: string,
): boolean {
  const quotedFormula = formula.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  const script = `
${EXCEL_INIT}
try {
  $wb = $excel.Workbooks.Open('${escPath(filePath)}')
  ${resolveSheet('sheet', sheet)}
  $sheet.Cells.Item(${row}, ${col}).Formula = '${quotedFormula}'
  $wb.Save()
  Write-Output 'true'
} finally {
  ${excelCleanup()}
}
`
  return ps(script) === 'true'
}
|
||||||
|
|
||||||
|
/**
 * Open a workbook and save it.
 *
 * With a non-empty `savePath` the workbook is written there via `SaveAs`;
 * otherwise it is saved in place.
 *
 * @returns `true` when the script reported success.
 */
export function saveExcel(filePath: string, savePath?: string): boolean {
  let saveCmd = '$wb.Save()'
  if (savePath) {
    saveCmd = `$wb.SaveAs('${escPath(savePath)}')`
  }

  const script = `
${EXCEL_INIT}
try {
  $wb = $excel.Workbooks.Open('${escPath(filePath)}')
  ${saveCmd}
  Write-Output 'true'
} finally {
  ${excelCleanup()}
}
`
  const output = ps(script)
  return output === 'true'
}
|
||||||
|
|
||||||
|
/**
 * Create a new workbook with `Workbooks.Add()` and save it to `savePath`.
 *
 * @returns `true` when the script reported success.
 */
export function createExcel(savePath: string): boolean {
  const target = escPath(savePath)
  const script = `
${EXCEL_INIT}
try {
  $wb = $excel.Workbooks.Add()
  $wb.SaveAs('${target}')
  Write-Output 'true'
} finally {
  ${excelCleanup()}
}
`
  const output = ps(script)
  return output === 'true'
}
|
||||||
|
|
||||||
|
/**
 * Intentionally does nothing: every Excel helper in this module opens and
 * tears down its own COM instance, so there is no shared handle to close.
 *
 * @param _filePath - Ignored.
 */
export function closeExcel(_filePath: string): void {
  return
}
|
||||||
450
src/utils/computerUse/win32/comWord.ts
Normal file
450
src/utils/computerUse/win32/comWord.ts
Normal file
@@ -0,0 +1,450 @@
|
|||||||
|
/**
|
||||||
|
* Word COM automation module for Windows.
|
||||||
|
* Uses PowerShell to drive Word.Application COM object — fully headless (Visible=false).
|
||||||
|
* Each function builds a PowerShell script, runs it via Bun.spawnSync, and parses JSON output.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Types
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/** One paragraph as reported by `openWord`. */
export interface WordParagraph {
  /** Paragraph text; `openWord` strips a trailing carriage return. */
  text: string
  /** Present when formatting was read; `openWord` sets it from `Font.Bold -eq -1`. */
  bold?: boolean
  /** Present when formatting was read; `openWord` sets it from `Font.Italic -eq -1`. */
  italic?: boolean
  /** Value of the paragraph range's `Font.Size`. */
  fontSize?: number
}
|
||||||
|
|
||||||
|
/** One table as reported by `openWord`. */
export interface WordTable {
  /** `Rows.Count` of the table. */
  rows: number
  /** `Columns.Count` of the table. */
  cols: number
  /** Cell text, one inner array per row. */
  data: string[][]
}
|
||||||
|
|
||||||
|
/** Snapshot of a document returned by `openWord`. */
export interface WordDocInfo {
  /** Full `Content.Text` of the document. */
  text: string
  /** Paragraphs; `openWord` reads at most the first 500. */
  paragraphs: WordParagraph[]
  /** Every table found in the document. */
  tables: WordTable[]
  /** Result of `ComputeStatistics(0)` (wdStatisticWords). */
  wordCount: number
  /** Result of `ComputeStatistics(2)` (wdStatisticPages). */
  pageCount: number
}
|
||||||
|
|
||||||
|
/** Optional font settings applied by `appendText` before the text is typed. */
export interface AppendTextOptions {
  /** When defined, sets `Font.Bold` (-1 for true, 0 for false). */
  bold?: boolean
  /** When defined, sets `Font.Italic` (-1 for true, 0 for false). */
  italic?: boolean
  /** When defined, sets `Font.Size`. */
  fontSize?: number
  /** When non-empty, sets `Font.Name`. */
  fontName?: string
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// PowerShell runner
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Run a PowerShell script synchronously and return its trimmed stdout.
 * The exit code and stderr are not inspected.
 */
function runPs(script: string): string {
  const { stdout } = Bun.spawnSync({
    cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', script],
    stdout: 'pipe',
    stderr: 'pipe',
  })
  const decoder = new TextDecoder()
  return decoder.decode(stdout).trim()
}
|
||||||
|
|
||||||
|
/**
 * Parse script output as JSON.
 *
 * @param raw - Text captured from the script.
 * @param fallback - Returned as-is when `raw` is empty or is not valid JSON.
 * @returns The parsed value (unvalidated, cast to `T`) or `fallback`.
 */
function parseJsonOutput<T>(raw: string, fallback: T): T {
  let parsed: T = fallback
  if (raw) {
    try {
      parsed = JSON.parse(raw) as T
    } catch {
      // Malformed output: keep the fallback.
    }
  }
  return parsed
}
|
||||||
|
|
||||||
|
/**
 * Escape a string for embedding inside a PowerShell single-quoted literal.
 *
 * PowerShell treats the typographic quotes U+2018, U+2019, U+201A and U+201B
 * as single-quote delimiters in addition to the ASCII apostrophe, so all five
 * characters are doubled. Doubling keeps the original character in the
 * resulting string value.
 *
 * @param s - Raw text (path, search string, document text, ...).
 * @returns Text that can be placed between `'` and `'` in a script.
 */
function psEscape(s: string): string {
  return s.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Word COM wrapper template
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Surround a script body with the Word COM start-up and tear-down code.
 *
 * The generated script starts a hidden Word instance, obtains `$doc`, runs
 * `body`, and in `finally` closes the document without saving, quits Word and
 * releases the COM object. Nothing is saved and errors are not caught here.
 *
 * @param body - PowerShell statements that may use `$word` and `$doc`.
 * @param openPath - Document to open; when omitted (or empty) a new document
 *   is added instead.
 */
function wrapWordScript(body: string, openPath?: string): string {
  let acquireDoc = '$doc = $word.Documents.Add()'
  if (openPath) {
    acquireDoc = `$doc = $word.Documents.Open('${psEscape(openPath)}')`
  }

  return `
$word = New-Object -ComObject Word.Application
$word.Visible = $false
$word.DisplayAlerts = 0
try {
  ${acquireDoc}
  ${body}
} finally {
  if ($doc -ne $null) { $doc.Close($false); }
  if ($word -ne $null) { $word.Quit(); }
  if ($word -ne $null) { [System.Runtime.InteropServices.Marshal]::ReleaseComObject($word) | Out-Null }
}
`
}
|
||||||
|
|
||||||
|
/**
 * Variant of the Word wrapper for mutating operations.
 *
 * The generated script opens `openPath`, runs `body`, then calls
 * `$doc.Save()` itself and prints `{"ok":true}`. Any error is caught and
 * reported as `{"ok":false,"error":"..."}`; clean-up is the same as in
 * `wrapWordScript`.
 *
 * @param body - PowerShell statements that may use `$word` and `$doc`.
 * @param openPath - Document to open and save.
 */
function wrapWordScriptWithSave(body: string, openPath: string): string {
  const docPath = psEscape(openPath)
  return `
$word = New-Object -ComObject Word.Application
$word.Visible = $false
$word.DisplayAlerts = 0
try {
  $doc = $word.Documents.Open('${docPath}')
  ${body}
  $doc.Save()
  Write-Output '{"ok":true}'
} catch {
  Write-Output ('{"ok":false,"error":"' + ($_.Exception.Message -replace '"','\\"') + '"}')
} finally {
  if ($doc -ne $null) { $doc.Close($false); }
  if ($word -ne $null) { $word.Quit(); }
  if ($word -ne $null) { [System.Runtime.InteropServices.Marshal]::ReleaseComObject($word) | Out-Null }
}
`
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// 1. openWord
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Read a document's text, paragraphs (first 500), tables and statistics.
 *
 * @param filePath - Document to open.
 * @returns The parsed snapshot, or an empty snapshot when the script produced
 *   no output or output that is not valid JSON.
 */
export async function openWord(filePath: string): Promise<WordDocInfo> {
  const emptyInfo: WordDocInfo = {
    text: '',
    paragraphs: [],
    tables: [],
    wordCount: 0,
    pageCount: 0,
  }

  const collectBody = `
  # Paragraphs (limit 500)
  $paras = @()
  $paraCount = $doc.Paragraphs.Count
  $limit = [Math]::Min($paraCount, 500)
  for ($i = 1; $i -le $limit; $i++) {
    $p = $doc.Paragraphs.Item($i)
    $r = $p.Range
    $paras += @{
      text = $r.Text -replace '\\r$',''
      bold = [bool]($r.Font.Bold -eq -1)
      italic = [bool]($r.Font.Italic -eq -1)
      fontSize = $r.Font.Size
    }
  }

  # Tables
  $tables = @()
  foreach ($table in $doc.Tables) {
    $rows = $table.Rows.Count
    $cols = $table.Columns.Count
    $data = @()
    for ($r = 1; $r -le $rows; $r++) {
      $row = @()
      for ($c = 1; $c -le $cols; $c++) {
        try {
          $cellText = $table.Cell($r, $c).Range.Text
          # Trim trailing \\r\\a that Word adds to cell text
          $cellText = $cellText -replace '[\\r\\n\\a]+$',''
          $row += $cellText
        } catch {
          $row += ''
        }
      }
      $data += ,@($row)
    }
    $tables += @{ rows = $rows; cols = $cols; data = $data }
  }

  # Counts: wdStatisticWords=0, wdStatisticPages=2
  $wordCount = $doc.ComputeStatistics(0)
  $pageCount = $doc.ComputeStatistics(2)

  $result = @{
    text = $doc.Content.Text
    paragraphs = $paras
    tables = $tables
    wordCount = $wordCount
    pageCount = $pageCount
  }
  Write-Output (ConvertTo-Json $result -Depth 5 -Compress)
  `

  const output = runPs(wrapWordScript(collectBody, filePath))
  return parseJsonOutput<WordDocInfo>(output, emptyInfo)
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// 2. readText
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Return the document's `Content.Text` as printed by PowerShell (trimmed).
 *
 * @param filePath - Document to open.
 */
export async function readText(filePath: string): Promise<string> {
  const printContent = `Write-Output $doc.Content.Text`
  const psScript = wrapWordScript(printContent, filePath)
  const output = runPs(psScript)
  return output
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// 3. appendText
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Type text at the end of the document and save it.
 *
 * @param filePath - Document to modify.
 * @param text - Text passed to `Selection.TypeText`.
 * @param opts - Optional font settings applied to the selection first.
 * @returns `true` when the script reported `{"ok":true}`.
 */
export async function appendText(
  filePath: string,
  text: string,
  opts?: AppendTextOptions,
): Promise<boolean> {
  // Collect only the font statements the caller actually asked for.
  const fontStatements: string[] = []
  if (opts) {
    if (opts.bold !== undefined) {
      fontStatements.push(`$sel.Font.Bold = ${opts.bold ? '-1' : '0'}`)
    }
    if (opts.italic !== undefined) {
      fontStatements.push(`$sel.Font.Italic = ${opts.italic ? '-1' : '0'}`)
    }
    if (opts.fontSize !== undefined) {
      fontStatements.push(`$sel.Font.Size = ${opts.fontSize}`)
    }
    if (opts.fontName) {
      fontStatements.push(`$sel.Font.Name = '${psEscape(opts.fontName)}'`)
    }
  }
  const fontSetup = fontStatements.join('\n ')

  // EndKey(6): 6 is wdStory, i.e. move the selection to the end of the document.
  const body = `
  $sel = $word.Selection
  $sel.EndKey(6) | Out-Null
  ${fontSetup}
  $sel.TypeText('${psEscape(text)}')
  `

  const output = runPs(wrapWordScriptWithSave(body, filePath))
  const { ok } = parseJsonOutput<{ ok: boolean }>(output, { ok: false })
  return ok
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// 4. insertText
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Insert text in front of a given paragraph and save the document.
 *
 * @param filePath - Document to modify.
 * @param paraIndex - Index passed to `Paragraphs.Item`.
 * @param text - Text passed to `Range.InsertBefore`.
 * @returns `true` when the script reported `{"ok":true}`.
 */
export async function insertText(
  filePath: string,
  paraIndex: number,
  text: string,
): Promise<boolean> {
  const insertion = psEscape(text)
  const body = `
  $doc.Paragraphs.Item(${paraIndex}).Range.InsertBefore('${insertion}')
  `
  const output = runPs(wrapWordScriptWithSave(body, filePath))
  const { ok } = parseJsonOutput<{ ok: boolean }>(output, { ok: false })
  return ok
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// 5. findReplace
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Find and replace text in a document, then save it.
 *
 * @param filePath - Document to modify.
 * @param find - Text to search for (case-insensitive, no wildcards).
 * @param replace - Replacement text.
 * @param replaceAll - Replace every occurrence unless explicitly `false`,
 *   in which case only the first occurrence is replaced.
 * @returns Number of occurrences counted (replace-all) or 0/1 (replace-one);
 *   0 when the script fails or `find` is empty.
 */
export async function findReplace(
  filePath: string,
  find: string,
  replace: string,
  replaceAll?: boolean,
): Promise<number> {
  // Nothing can be matched by an empty search string, so skip launching Word.
  if (find === '') return 0

  const findLiteral = psEscape(find)
  const replaceLiteral = psEscape(replace)

  // Execute's positional arguments: FindText, MatchCase, MatchWholeWord,
  // MatchWildcards, MatchSoundsLike, MatchAllWordForms, Forward, Wrap, Format,
  // ReplaceWith, Replace (wdReplaceOne=1, wdReplaceAll=2).
  const replaceStep =
    replaceAll !== false
      ? `
  # Count occurrences first using a clone of content
  $range2 = $doc.Content.Duplicate
  while ($range2.Find.Execute('${findLiteral}')) { $count++ }
  # Now do the actual replace. Execute returns a Boolean; it must be discarded
  # or PowerShell prints it to stdout ahead of the JSON result.
  $findObj.Execute('${findLiteral}', $false, $false, $false, $false, $false, $true, 0, $false, '${replaceLiteral}', 2) | Out-Null
`
      : `
  $found = $findObj.Execute('${findLiteral}', $false, $false, $false, $false, $false, $true, 0, $false, '${replaceLiteral}', 1)
  if ($found) { $count = 1 }
`

  const script = `
$word = New-Object -ComObject Word.Application
$word.Visible = $false
$word.DisplayAlerts = 0
try {
  $doc = $word.Documents.Open('${psEscape(filePath)}')

  $content = $doc.Content
  $findObj = $content.Find
  $findObj.ClearFormatting()
  $findObj.Replacement.ClearFormatting()

  $count = 0
  $findObj.Text = '${findLiteral}'
  $findObj.Replacement.Text = '${replaceLiteral}'
  $findObj.Forward = $true
  $findObj.Wrap = 0
  $findObj.Format = $false
  $findObj.MatchCase = $false
  $findObj.MatchWholeWord = $false
  $findObj.MatchWildcards = $false
${replaceStep}
  $doc.Save()
  Write-Output ('{"count":' + $count + '}')
} catch {
  Write-Output '{"count":0}'
} finally {
  if ($doc -ne $null) { $doc.Close($false); }
  if ($word -ne $null) { $word.Quit(); }
  if ($word -ne $null) { [System.Runtime.InteropServices.Marshal]::ReleaseComObject($word) | Out-Null }
}
`

  const raw = runPs(script)
  return parseJsonOutput<{ count: number }>(raw, { count: 0 }).count
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// 6. insertTable
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Append a table at the end of the document, fill it, and save.
 *
 * @param filePath - Document to modify.
 * @param rows - Row count passed to `Tables.Add`.
 * @param cols - Column count passed to `Tables.Add`.
 * @param data - Cell text, one inner array per row; written cell by cell.
 * @returns `true` when the script reported `{"ok":true}`.
 */
export async function insertTable(
  filePath: string,
  rows: number,
  cols: number,
  data: string[][],
): Promise<boolean> {
  // Each row becomes ",@('a','b')": the leading comma keeps the row as one
  // nested array element instead of letting PowerShell flatten it.
  const rowLiterals: string[] = []
  for (const row of data) {
    const cells: string[] = []
    for (const cell of row) {
      cells.push(`'${psEscape(cell)}'`)
    }
    rowLiterals.push(',@(' + cells.join(',') + ')')
  }
  const psData = rowLiterals.join('\n ')

  const body = `
  $sel = $word.Selection
  $sel.EndKey(6) | Out-Null
  $table = $doc.Tables.Add($sel.Range, ${rows}, ${cols})
  $data = @(${psData})
  for ($r = 0; $r -lt $data.Count; $r++) {
    for ($c = 0; $c -lt $data[$r].Count; $c++) {
      $table.Cell($r + 1, $c + 1).Range.Text = $data[$r][$c]
    }
  }
  `

  const output = runPs(wrapWordScriptWithSave(body, filePath))
  const { ok } = parseJsonOutput<{ ok: boolean }>(output, { ok: false })
  return ok
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// 7. saveWord
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Save a document in place, or under a new path.
 *
 * @param filePath - Document to open.
 * @param savePath - Destination; when omitted, empty, or equal to `filePath`
 *   the document is simply re-saved, otherwise `SaveAs` is used.
 * @returns `true` when the script reported `{"ok":true}`.
 */
export async function saveWord(
  filePath: string,
  savePath?: string,
): Promise<boolean> {
  const failed = { ok: false }
  const savesInPlace = !savePath || savePath === filePath

  let script: string
  if (savesInPlace) {
    // The save wrapper already calls $doc.Save(); no extra body is needed.
    script = wrapWordScriptWithSave('', filePath)
  } else {
    const body = `$doc.SaveAs('${psEscape(savePath)}')`
    script = `
$word = New-Object -ComObject Word.Application
$word.Visible = $false
$word.DisplayAlerts = 0
try {
  $doc = $word.Documents.Open('${psEscape(filePath)}')
  ${body}
  Write-Output '{"ok":true}'
} catch {
  Write-Output ('{"ok":false,"error":"' + ($_.Exception.Message -replace '"','\\"') + '"}')
} finally {
  if ($doc -ne $null) { $doc.Close($false); }
  if ($word -ne $null) { $word.Quit(); }
  if ($word -ne $null) { [System.Runtime.InteropServices.Marshal]::ReleaseComObject($word) | Out-Null }
}
`
  }

  const output = runPs(script)
  return parseJsonOutput<{ ok: boolean }>(output, failed).ok
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// 8. saveAsPdf
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Export a document as PDF via `SaveAs2` with format 17 (wdFormatPDF).
 *
 * @param filePath - Document to open.
 * @param pdfPath - Destination PDF path.
 * @returns `true` when the script reported `{"ok":true}`.
 */
export async function saveAsPdf(
  filePath: string,
  pdfPath: string,
): Promise<boolean> {
  const exportStatement = `$doc.SaveAs2('${psEscape(pdfPath)}', 17)`
  const sourcePath = psEscape(filePath)

  const psScript = `
$word = New-Object -ComObject Word.Application
$word.Visible = $false
$word.DisplayAlerts = 0
try {
  $doc = $word.Documents.Open('${sourcePath}')
  ${exportStatement}
  Write-Output '{"ok":true}'
} catch {
  Write-Output ('{"ok":false,"error":"' + ($_.Exception.Message -replace '"','\\"') + '"}')
} finally {
  if ($doc -ne $null) { $doc.Close($false); }
  if ($word -ne $null) { $word.Quit(); }
  if ($word -ne $null) { [System.Runtime.InteropServices.Marshal]::ReleaseComObject($word) | Out-Null }
}
`
  const output = runPs(psScript)
  const { ok } = parseJsonOutput<{ ok: boolean }>(output, { ok: false })
  return ok
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// 9. createWord
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Create an empty document and write it to `savePath` via `SaveAs`.
 *
 * @param savePath - Destination path for the new document.
 * @returns `true` when the script reported `{"ok":true}`.
 */
export async function createWord(savePath: string): Promise<boolean> {
  const destination = psEscape(savePath)
  const psScript = `
$word = New-Object -ComObject Word.Application
$word.Visible = $false
$word.DisplayAlerts = 0
try {
  $doc = $word.Documents.Add()
  $doc.SaveAs('${destination}')
  Write-Output '{"ok":true}'
} catch {
  Write-Output ('{"ok":false,"error":"' + ($_.Exception.Message -replace '"','\\"') + '"}')
} finally {
  if ($doc -ne $null) { $doc.Close($false); }
  if ($word -ne $null) { $word.Quit(); }
  if ($word -ne $null) { [System.Runtime.InteropServices.Marshal]::ReleaseComObject($word) | Out-Null }
}
`
  const output = runPs(psScript)
  const { ok } = parseJsonOutput<{ ok: boolean }>(output, { ok: false })
  return ok
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// 10. closeWord (no-op)
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Intentionally does nothing: every Word helper in this module opens and
 * tears down its own COM instance, so there is no shared handle to close.
 *
 * @param _filePath - Ignored.
 */
export function closeWord(_filePath: string): void {
  return
}
|
||||||
254
src/utils/computerUse/win32/inputIndicator.ts
Normal file
254
src/utils/computerUse/win32/inputIndicator.ts
Normal file
@@ -0,0 +1,254 @@
|
|||||||
|
/**
|
||||||
|
* Input Indicator — floating label showing what Computer Use is doing
|
||||||
|
* on the bound window.
|
||||||
|
*
|
||||||
|
* Displays a small overlay near the bottom of the bound window:
|
||||||
|
* ⌨ Typing "hello world..."
|
||||||
|
* 🖱 Click (120, 50)
|
||||||
|
* ⌨ Ctrl+S
|
||||||
|
* 📜 Scroll ↓ 3
|
||||||
|
* ✅ Done
|
||||||
|
*
|
||||||
|
* Auto-fades after 2 seconds of inactivity.
|
||||||
|
* Click-through, TOPMOST, no taskbar icon.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import * as fs from 'fs'
|
||||||
|
import * as path from 'path'
|
||||||
|
import { validateHwnd, getTmpDir } from './shared.js'
|
||||||
|
|
||||||
|
// Overlay geometry in pixels.
const INDICATOR_WIDTH = 350
const INDICATOR_HEIGHT = 28

// The overlay is hidden once this many milliseconds pass without a new message.
const FADE_AFTER_MS = 2000

// Colours as "R, G, B" argument lists spliced into Color.FromArgb(...) calls.
const BG_COLOR = '30, 30, 30' // dark background
const TEXT_COLOR = '220, 220, 220' // light text
const ACCENT_COLOR = '80, 200, 80' // green accent for active

// State of the currently running indicator (all null when none is running).
let indicatorProc: ReturnType<typeof Bun.spawn> | null = null
let stopFile: string | null = null // creating this file asks the script to exit
let scriptFile: string | null = null // generated .ps1 on disk
let msgFile: string | null = null // message hand-off file polled by the script
|
||||||
|
|
||||||
|
/**
 * Build the PowerShell/WinForms script that renders the indicator overlay.
 *
 * The script creates a borderless, top-most form made click-through via
 * extended window styles, then polls every 50 ms:
 *  - exits when the target window no longer exists or the stop file appears;
 *  - picks up (and deletes) a new message from `<stop file>.msg`;
 *  - positions the form at the bottom-centre of the target window while a
 *    message is fresh, dims it near the end, and hides it after the timeout.
 *
 * @param hwnd - Target window handle; interpolated verbatim, so callers must
 *   pass an already validated numeric string.
 * @param sf - Path of the stop file.
 */
function buildIndicatorScript(hwnd: string, sf: string): string {
  // The path goes into a PowerShell single-quoted literal. Backslashes are
  // literal there (doubling them would corrupt e.g. UNC paths); only the
  // quote characters PowerShell recognises as delimiters need doubling.
  const sfEsc = sf.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')

  // NOTE: the here-string terminator ('@) must stay at the start of its line.
  return `
Add-Type -AssemblyName System.Windows.Forms
Add-Type -AssemblyName System.Drawing
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class Indicator {
  [DllImport("user32.dll")] public static extern bool IsWindow(IntPtr h);
  [DllImport("user32.dll",SetLastError=true)] public static extern int SetWindowLong(IntPtr h, int i, int v);
  [DllImport("user32.dll",SetLastError=true)] public static extern int GetWindowLong(IntPtr h, int i);
  [DllImport("user32.dll")] public static extern bool SetWindowPos(IntPtr h, IntPtr a, int x, int y, int w, int h2, uint f);
  [DllImport("user32.dll")] public static extern bool GetWindowRect(IntPtr h, out RECT r);
  [StructLayout(LayoutKind.Sequential)] public struct RECT { public int L,T,R,B; }
  public const int GWL_EXSTYLE = -20;
  public const int WS_EX_LAYERED = 0x80000;
  public const int WS_EX_TRANSPARENT = 0x20;
  public const int WS_EX_TOOLWINDOW = 0x80;
  public const int WS_EX_NOACTIVATE = 0x08000000;
  public static readonly IntPtr HWND_TOPMOST = new IntPtr(-1);
  public const uint SWP_NOACTIVATE = 0x0010;
  public const uint SWP_SHOWWINDOW = 0x0040;
  public static void MakeOverlay(IntPtr h) {
    int ex = GetWindowLong(h, GWL_EXSTYLE);
    ex |= WS_EX_LAYERED | WS_EX_TRANSPARENT | WS_EX_TOOLWINDOW | WS_EX_NOACTIVATE;
    SetWindowLong(h, GWL_EXSTYLE, ex);
  }
}
'@

$targetHwnd = [IntPtr]::new([long]${hwnd})
$stopFile = '${sfEsc}'
$msgFile = $stopFile + '.msg'

$form = New-Object System.Windows.Forms.Form
$form.FormBorderStyle = [System.Windows.Forms.FormBorderStyle]::None
$form.ShowInTaskbar = $false
$form.TopMost = $true
$form.StartPosition = [System.Windows.Forms.FormStartPosition]::Manual
$form.Size = New-Object System.Drawing.Size(${INDICATOR_WIDTH}, ${INDICATOR_HEIGHT})
$form.Location = New-Object System.Drawing.Point(-32000, -32000)
$form.BackColor = [System.Drawing.Color]::FromArgb(240, ${BG_COLOR})
$form.Opacity = 0.92

$label = New-Object System.Windows.Forms.Label
$label.Dock = [System.Windows.Forms.DockStyle]::Fill
$label.ForeColor = [System.Drawing.Color]::FromArgb(${TEXT_COLOR})
$label.Font = New-Object System.Drawing.Font("Segoe UI", 10, [System.Drawing.FontStyle]::Regular)
$label.TextAlign = [System.Drawing.ContentAlignment]::MiddleLeft
$label.Padding = New-Object System.Windows.Forms.Padding(8, 0, 8, 0)
$label.Text = ""
$form.Controls.Add($label)

$form.Show()
[Indicator]::MakeOverlay($form.Handle)

$script:lastMsg = ""
$script:lastMsgTime = [DateTime]::MinValue
$script:visible = $false

$timer = New-Object System.Windows.Forms.Timer
$timer.Interval = 50 # 20fps

$timer.Add_Tick({
  if (-not [Indicator]::IsWindow($targetHwnd)) {
    $timer.Stop(); $form.Close()
    [System.Windows.Forms.Application]::ExitThread()
    return
  }
  if (Test-Path $stopFile) {
    $timer.Stop(); $form.Close()
    try { Remove-Item $stopFile -ErrorAction SilentlyContinue } catch {}
    try { Remove-Item $msgFile -ErrorAction SilentlyContinue } catch {}
    [System.Windows.Forms.Application]::ExitThread()
    return
  }

  # Read new message
  if (Test-Path $msgFile) {
    try {
      $msg = Get-Content $msgFile -Raw -Encoding UTF8 -ErrorAction SilentlyContinue
      if ($msg) {
        $script:lastMsg = $msg.Trim()
        $script:lastMsgTime = [DateTime]::Now
        Remove-Item $msgFile -ErrorAction SilentlyContinue
      }
    } catch {}
  }

  # Fade logic: hide after ${FADE_AFTER_MS}ms of no updates
  $elapsed = ([DateTime]::Now - $script:lastMsgTime).TotalMilliseconds
  if ($elapsed -gt ${FADE_AFTER_MS} -and $script:visible) {
    $form.Visible = $false
    $script:visible = $false
    return
  }
  if ($elapsed -le ${FADE_AFTER_MS} -and $script:lastMsg -ne "") {
    # Position at bottom-center of the bound window
    $wr = New-Object Indicator+RECT
    [Indicator]::GetWindowRect($targetHwnd, [ref]$wr) | Out-Null
    $ww = $wr.R - $wr.L
    $fx = $wr.L + [int](($ww - ${INDICATOR_WIDTH}) / 2)
    $fy = $wr.B - ${INDICATOR_HEIGHT} - 8
    $label.Text = $script:lastMsg
    [Indicator]::SetWindowPos($form.Handle, [Indicator]::HWND_TOPMOST,
      $fx, $fy, 0, 0,
      0x0001 -bor [Indicator]::SWP_NOACTIVATE -bor [Indicator]::SWP_SHOWWINDOW) | Out-Null
    $form.Visible = $true
    $script:visible = $true
    # Fade opacity near end
    if ($elapsed -gt ${FADE_AFTER_MS * 0.7}) {
      $form.Opacity = [Math]::Max(0.3, 0.92 * (1.0 - ($elapsed - ${FADE_AFTER_MS * 0.7}) / ${FADE_AFTER_MS * 0.3}))
    } else {
      $form.Opacity = 0.92
    }
  }
})

$timer.Start()
[System.Windows.Forms.Application]::Run()
`
}
|
||||||
|
|
||||||
|
/**
 * Start the input indicator for a bound window.
 *
 * Any indicator that is already running is asked to stop first. A fresh
 * script is then written to the temp directory and launched with PowerShell.
 *
 * @param hwnd - Window handle as a numeric string.
 * @returns `true` when the process was spawned, `false` when writing the
 *   script or spawning failed.
 * @throws Error when `hwnd` is not purely numeric (from `validateHwnd`).
 */
export function showIndicator(hwnd: string): boolean {
  hwnd = validateHwnd(hwnd)
  hideIndicator()
  try {
    const tmpDir = getTmpDir()
    const ts = Date.now()
    stopFile = path.join(tmpDir, `cu_indicator_stop_${ts}`)
    scriptFile = path.join(tmpDir, `cu_indicator_${ts}.ps1`)
    msgFile = stopFile + '.msg'
    // Windows PowerShell decodes BOM-less script files with the ANSI code
    // page, which would garble a non-ASCII temp path embedded in the script.
    // A leading U+FEFF makes it read the file as UTF-8.
    fs.writeFileSync(
      scriptFile,
      '\uFEFF' + buildIndicatorScript(hwnd, stopFile),
      'utf-8',
    )
    indicatorProc = Bun.spawn(
      [
        'powershell',
        '-NoProfile',
        '-ExecutionPolicy',
        'Bypass',
        '-File',
        scriptFile,
      ],
      { stdout: 'ignore', stderr: 'ignore' },
    )
    return true
  } catch {
    return false
  }
}
|
||||||
|
|
||||||
|
/**
 * Hand a new message to the running indicator by writing it to the message
 * file. Does nothing when no indicator is active; write errors are ignored
 * on purpose (the overlay is best-effort).
 */
export function updateIndicator(message: string): void {
  const target = msgFile
  if (target) {
    try {
      fs.writeFileSync(target, message, 'utf-8')
    } catch {}
  }
}
|
||||||
|
|
||||||
|
/**
 * Hide and destroy the indicator.
 *
 * Writes the stop file so the script can exit on its own, then, two seconds
 * later, kills the process if it is still around and removes the temporary
 * files. Module state is cleared immediately so a new indicator can start.
 */
export function hideIndicator(): void {
  // Snapshot the current resources BEFORE clearing the module state. The
  // delayed clean-up must act on this indicator only; reading the module
  // variables inside the timer would see null, or the resources of an
  // indicator started in the meantime.
  const proc = indicatorProc
  const stop = stopFile
  const script = scriptFile
  const msg = msgFile

  indicatorProc = null
  stopFile = null
  scriptFile = null
  msgFile = null

  if (!stop) return

  try {
    fs.writeFileSync(stop, 'STOP', 'utf-8')
  } catch {}

  setTimeout(() => {
    try {
      proc?.kill()
    } catch {}
    // Best-effort removal; the script may already have deleted some of these.
    for (const file of [script, stop, msg]) {
      if (!file) continue
      try {
        fs.unlinkSync(file)
      } catch {}
    }
  }, 2000)
}
|
||||||
|
|
||||||
|
// ── Convenience methods for common actions ──
|
||||||
|
|
||||||
|
/** Show a "Typing" message with the text cut to 30 UTF-16 units plus "...". */
export function indicateTyping(text: string): void {
  let preview = text
  if (text.length > 30) {
    preview = text.slice(0, 30) + '...'
  }
  updateIndicator(`\u2328 Typing "${preview}"`)
}
|
||||||
|
|
||||||
|
/** Show a key combination (e.g. "Ctrl+S") next to a keyboard glyph. */
export function indicateKey(combo: string): void {
  const message = `\u2328 ${combo}`
  updateIndicator(message)
}
|
||||||
|
|
||||||
|
/**
 * Show a click message with its coordinates.
 *
 * @param button - Only `'right'` changes the label ("Right-click"); every
 *   other value is shown as "Click".
 */
export function indicateClick(
  x: number,
  y: number,
  button: string = 'left',
): void {
  let label = 'Click'
  if (button === 'right') {
    label = 'Right-click'
  }
  updateIndicator(`\uD83D\uDDB1 ${label} (${x}, ${y})`)
}
|
||||||
|
|
||||||
|
/**
 * Show a scroll message with a direction arrow and the amount.
 *
 * @param direction - `'up'`, `'down'` or `'left'`; any other value is drawn
 *   with the right-pointing arrow.
 */
export function indicateScroll(direction: string, amount: number): void {
  let arrow: string
  switch (direction) {
    case 'up':
      arrow = '\u2191'
      break
    case 'down':
      arrow = '\u2193'
      break
    case 'left':
      arrow = '\u2190'
      break
    default:
      arrow = '\u2192'
  }
  updateIndicator(`\uD83D\uDCDC Scroll ${arrow} ${amount}`)
}
|
||||||
|
|
||||||
|
/** Show the "Done" message. */
export function indicateDone(): void {
  const message = '\u2705 Done'
  updateIndicator(message)
}
|
||||||
@@ -3,6 +3,8 @@
|
|||||||
* Captures a screen region or window, then runs WinRT OCR to extract text.
|
* Captures a screen region or window, then runs WinRT OCR to extract text.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import { ps as runPs } from './shared.js'
|
||||||
|
|
||||||
export interface OcrLine {
|
export interface OcrLine {
|
||||||
text: string
|
text: string
|
||||||
bounds: { x: number; y: number; w: number; h: number }
|
bounds: { x: number; y: number; w: number; h: number }
|
||||||
@@ -18,15 +20,6 @@ function emptyResult(language: string): OcrResult {
|
|||||||
return { text: '', lines: [], language }
|
return { text: '', lines: [], language }
|
||||||
}
|
}
|
||||||
|
|
||||||
function runPs(script: string): string {
|
|
||||||
const result = Bun.spawnSync({
|
|
||||||
cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', script],
|
|
||||||
stdout: 'pipe',
|
|
||||||
stderr: 'pipe',
|
|
||||||
})
|
|
||||||
return new TextDecoder().decode(result.stdout).trim()
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* PowerShell script that:
|
* PowerShell script that:
|
||||||
* 1. Screenshots a screen region using CopyFromScreen
|
* 1. Screenshots a screen region using CopyFromScreen
|
||||||
|
|||||||
127
src/utils/computerUse/win32/shared.ts
Normal file
127
src/utils/computerUse/win32/shared.ts
Normal file
@@ -0,0 +1,127 @@
|
|||||||
|
/**
|
||||||
|
* Shared utilities for win32 Computer Use modules.
|
||||||
|
* Single source of truth — no more duplication across files.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
 * Ensure an HWND is a non-empty string of ASCII digits before it is spliced
 * into a generated script (guards against PowerShell/Python injection).
 *
 * @returns The same string, unchanged.
 * @throws Error when the value contains anything but digits.
 */
export function validateHwnd(hwnd: string): string {
  const digitsOnly = /^\d+$/.test(hwnd)
  if (digitsOnly) {
    return hwnd
  }
  throw new Error(`Invalid HWND: "${hwnd}" — must be numeric`)
}
|
||||||
|
|
||||||
|
/**
 * Run a PowerShell script synchronously and return its trimmed stdout.
 * The exit code and stderr are not inspected.
 */
export function ps(script: string): string {
  const { stdout } = Bun.spawnSync({
    cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', script],
    stdout: 'pipe',
    stderr: 'pipe',
  })
  const decoder = new TextDecoder()
  return decoder.decode(stdout).trim()
}
|
||||||
|
|
||||||
|
/**
 * Run a PowerShell script synchronously.
 *
 * @returns Trimmed stdout, or `null` when the process exits with a non-zero
 *   code or anything throws while spawning/decoding.
 */
export function runPs(script: string): string | null {
  try {
    const { exitCode, stdout } = Bun.spawnSync({
      cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', script],
      stdout: 'pipe',
      stderr: 'pipe',
    })
    return exitCode === 0 ? new TextDecoder().decode(stdout).trim() : null
  } catch {
    return null
  }
}
|
||||||
|
|
||||||
|
/**
 * Run a PowerShell script without blocking: read stdout to the end, wait for
 * the process to exit, and resolve with the trimmed stdout text.
 */
export async function psAsync(script: string): Promise<string> {
  const argv = ['powershell', '-NoProfile', '-NonInteractive', '-Command', script]
  const child = Bun.spawn(argv, { stdout: 'pipe', stderr: 'pipe' })
  const text = await new Response(child.stdout).text()
  await child.exited
  return text.trim()
}
|
||||||
|
|
||||||
|
/**
 * Pick the temp directory from the environment: `TEMP` first, then `TMP`,
 * and `/tmp` when neither is set to a non-empty value.
 */
export function getTmpDir(): string {
  const { TEMP, TMP } = process.env
  if (TEMP) {
    return TEMP
  }
  if (TMP) {
    return TMP
  }
  return '/tmp'
}
|
||||||
|
|
||||||
|
/**
 * Lower-case key name → Windows virtual-key code. Several names are aliases
 * for the same code (e.g. `enter`/`return`, `ctrl`/`control`, `win`/`cmd`).
 */
export const VK_MAP: Record<string, number> = {
  // Editing / whitespace
  backspace: 0x08,
  tab: 0x09,
  enter: 0x0d,
  return: 0x0d,

  // Shift
  shift: 0x10,
  lshift: 0xa0,
  rshift: 0xa1,

  // Control
  ctrl: 0x11,
  control: 0x11,
  lcontrol: 0xa2,
  rcontrol: 0xa3,

  // Alt (VK_MENU)
  alt: 0x12,
  option: 0x12,
  menu: 0x12,
  lalt: 0xa4,
  ralt: 0xa5,

  // Misc
  pause: 0x13,
  capslock: 0x14,
  escape: 0x1b,
  esc: 0x1b,
  space: 0x20,

  // Navigation
  pageup: 0x21,
  pagedown: 0x22,
  end: 0x23,
  home: 0x24,
  left: 0x25,
  up: 0x26,
  right: 0x27,
  down: 0x28,
  insert: 0x2d,
  delete: 0x2e,

  // Windows key and its cross-platform aliases
  win: 0x5b,
  meta: 0x5b,
  command: 0x5b,
  cmd: 0x5b,
  super: 0x5b,

  // Locks / system
  numlock: 0x90,
  scrolllock: 0x91,
  printscreen: 0x2c,

  // Function keys
  f1: 0x70,
  f2: 0x71,
  f3: 0x72,
  f4: 0x73,
  f5: 0x74,
  f6: 0x75,
  f7: 0x76,
  f8: 0x77,
  f9: 0x78,
  f10: 0x79,
  f11: 0x7a,
  f12: 0x7b,
}
|
||||||
|
|
||||||
|
/** Key names (lowercase) that are treated as modifiers rather than regular keys. */
export const MODIFIER_KEYS = new Set<string>(
  [
    ['shift', 'lshift', 'rshift'],
    ['control', 'ctrl', 'lcontrol', 'rcontrol'],
    ['alt', 'option', 'lalt', 'ralt'],
    ['win', 'meta', 'command', 'cmd', 'super'],
  ].flat(),
)
|
||||||
@@ -5,6 +5,8 @@
|
|||||||
* value setting, and hit-testing via PowerShell + System.Windows.Automation.
|
* value setting, and hit-testing via PowerShell + System.Windows.Automation.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import { ps } from './shared.js'
|
||||||
|
|
||||||
export interface UIElement {
|
export interface UIElement {
|
||||||
name: string
|
name: string
|
||||||
controlType: string // Button, Edit, Text, List, Window, etc.
|
controlType: string // Button, Edit, Text, List, Window, etc.
|
||||||
@@ -15,6 +17,48 @@ export interface UIElement {
|
|||||||
children?: UIElement[]
|
children?: UIElement[]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Allow-list of control type names accepted by `findElement`.
 *
 * The name is interpolated after `[System.Windows.Automation.ControlType]::`
 * in a PowerShell script, so anything outside this list is rejected first.
 */
const VALID_CONTROL_TYPES = new Set<string>([
  'Button', 'Calendar', 'CheckBox', 'ComboBox', 'Custom',
  'DataGrid', 'DataItem', 'Document',
  'Edit',
  'Group',
  'Header', 'HeaderItem', 'Hyperlink',
  'Image',
  'List', 'ListItem',
  'Menu', 'MenuBar', 'MenuItem',
  'Pane', 'ProgressBar',
  'RadioButton',
  'ScrollBar', 'Separator', 'Slider', 'Spinner', 'SplitButton', 'StatusBar',
  'Tab', 'TabItem', 'Table', 'Text', 'Thumb', 'TitleBar', 'ToolBar', 'ToolTip',
  'Tree', 'TreeItem',
  'Window',
])
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
// Helper
|
// Helper
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
@@ -25,15 +69,6 @@ Add-Type -AssemblyName UIAutomationTypes
|
|||||||
Add-Type -AssemblyName WindowsBase
|
Add-Type -AssemblyName WindowsBase
|
||||||
`
|
`
|
||||||
|
|
||||||
function ps(script: string): string {
|
|
||||||
const result = Bun.spawnSync({
|
|
||||||
cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', script],
|
|
||||||
stdout: 'pipe',
|
|
||||||
stderr: 'pipe',
|
|
||||||
})
|
|
||||||
return new TextDecoder().decode(result.stdout).trim()
|
|
||||||
}
|
|
||||||
|
|
||||||
function parseJsonSafe<T>(raw: string, fallback: T): T {
|
function parseJsonSafe<T>(raw: string, fallback: T): T {
|
||||||
try {
|
try {
|
||||||
if (!raw) return fallback
|
if (!raw) return fallback
|
||||||
@@ -143,6 +178,9 @@ export function findElement(
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
if (query.controlType) {
|
if (query.controlType) {
|
||||||
|
if (!VALID_CONTROL_TYPES.has(query.controlType)) {
|
||||||
|
return null // Invalid control type
|
||||||
|
}
|
||||||
const v = query.controlType.replace(/'/g, "''")
|
const v = query.controlType.replace(/'/g, "''")
|
||||||
conditions.push(
|
conditions.push(
|
||||||
`[System.Windows.Automation.PropertyCondition]::new([System.Windows.Automation.AutomationElement]::ControlTypeProperty, [System.Windows.Automation.ControlType]::${v})`,
|
`[System.Windows.Automation.PropertyCondition]::new([System.Windows.Automation.AutomationElement]::ControlTypeProperty, [System.Windows.Automation.ControlType]::${v})`,
|
||||||
@@ -204,7 +242,10 @@ $obj | ConvertTo-Json -Compress
|
|||||||
/**
|
/**
|
||||||
* Click an element by its automationId using InvokePattern.
|
* Click an element by its automationId using InvokePattern.
|
||||||
*/
|
*/
|
||||||
export function clickElement(windowTitle: string, automationId: string): boolean {
|
export function clickElement(
|
||||||
|
windowTitle: string,
|
||||||
|
automationId: string,
|
||||||
|
): boolean {
|
||||||
const escapedTitle = windowTitle.replace(/'/g, "''")
|
const escapedTitle = windowTitle.replace(/'/g, "''")
|
||||||
const escapedId = automationId.replace(/'/g, "''")
|
const escapedId = automationId.replace(/'/g, "''")
|
||||||
|
|
||||||
@@ -237,7 +278,11 @@ try {
|
|||||||
/**
|
/**
|
||||||
* Set the value of an element by its automationId using ValuePattern.
|
* Set the value of an element by its automationId using ValuePattern.
|
||||||
*/
|
*/
|
||||||
export function setValue(windowTitle: string, automationId: string, value: string): boolean {
|
export function setValue(
|
||||||
|
windowTitle: string,
|
||||||
|
automationId: string,
|
||||||
|
value: string,
|
||||||
|
): boolean {
|
||||||
const escapedTitle = windowTitle.replace(/'/g, "''")
|
const escapedTitle = windowTitle.replace(/'/g, "''")
|
||||||
const escapedId = automationId.replace(/'/g, "''")
|
const escapedId = automationId.replace(/'/g, "''")
|
||||||
const escapedValue = value.replace(/'/g, "''")
|
const escapedValue = value.replace(/'/g, "''")
|
||||||
|
|||||||
268
src/utils/computerUse/win32/virtualCursor.ts
Normal file
268
src/utils/computerUse/win32/virtualCursor.ts
Normal file
@@ -0,0 +1,268 @@
|
|||||||
|
/**
|
||||||
|
* Virtual Cursor — visible overlay cursor for the bound window.
|
||||||
|
*
|
||||||
|
* Shows a small colored cursor icon on top of the bound window,
|
||||||
|
* independent of the real mouse cursor. The user's real mouse
|
||||||
|
* stays free for their own use.
|
||||||
|
*
|
||||||
|
* The virtual cursor:
|
||||||
|
* - Moves when Computer Use calls click/moveMouse
|
||||||
|
* - Shows click animations (brief opacity flash)
|
||||||
|
* - Is click-through (WS_EX_TRANSPARENT) — doesn't intercept real mouse
|
||||||
|
* - Tracks the bound window position by polling GetWindowRect on every timer tick
|
||||||
|
* - Disappears when the window is unbound
|
||||||
|
*/
|
||||||
|
|
||||||
|
import * as fs from 'fs'
|
||||||
|
import * as path from 'path'
|
||||||
|
import { validateHwnd, getTmpDir } from './shared.js'
|
||||||
|
|
||||||
|
const CURSOR_SIZE = 20
|
||||||
|
const CURSOR_COLOR_R = 255
|
||||||
|
const CURSOR_COLOR_G = 50
|
||||||
|
const CURSOR_COLOR_B = 50
|
||||||
|
const CURSOR_OPACITY = 0.9
|
||||||
|
|
||||||
|
let cursorProc: ReturnType<typeof Bun.spawn> | null = null
|
||||||
|
let cursorStopFile: string | null = null
|
||||||
|
let cursorScriptFile: string | null = null
|
||||||
|
|
||||||
|
/**
 * Build the PowerShell script that renders the virtual cursor overlay.
 *
 * The script creates a borderless, top-most WinForms window showing an arrow
 * bitmap, marks it layered/click-through/no-activate, and runs a 16 ms timer
 * that:
 *   - exits when the target window no longer exists,
 *   - exits (removing the stop and position files) once `stopFile` appears,
 *   - reads `<stopFile>.pos` ("x,y" or "x,y,click"), then deletes it,
 *   - repositions the overlay at the target's GetWindowRect origin plus the
 *     last received offset, raising opacity to 1.0 for 6 ticks after a click.
 *
 * @param hwnd - Target window handle; must already be validated as numeric
 *   because it is interpolated into the script verbatim.
 * @param stopFile - Path of the sentinel file whose existence stops the script.
 * @returns The complete PowerShell source.
 */
function buildCursorScript(hwnd: string, stopFile: string): string {
  // The path lands inside a PowerShell single-quoted literal, where backslash
  // is an ordinary character and the only thing needing escape is the quote
  // itself (written as two quotes). Doubling backslashes was unnecessary and
  // left paths containing an apostrophe able to break out of the literal.
  const stopFileLiteral = stopFile.replace(/'/g, "''")
  // The C# helper below intentionally imports only System and
  // System.Runtime.InteropServices: it uses no System.Drawing types, and no
  // System.Drawing reference is passed to Add-Type, so `using` directives for
  // that namespace risk failing the compile.
  return `
Add-Type -AssemblyName System.Windows.Forms
Add-Type -AssemblyName System.Drawing
Add-Type @'
using System;
using System.Runtime.InteropServices;

public class VCursor {
[DllImport("user32.dll")]
public static extern bool IsWindow(IntPtr hWnd);

[DllImport("user32.dll", SetLastError = true)]
public static extern int SetWindowLong(IntPtr hWnd, int nIndex, int dwNewLong);

[DllImport("user32.dll", SetLastError = true)]
public static extern int GetWindowLong(IntPtr hWnd, int nIndex);

[DllImport("user32.dll")]
public static extern bool SetWindowPos(IntPtr hWnd, IntPtr hAfter, int X, int Y, int cx, int cy, uint f);

[DllImport("user32.dll")]
public static extern bool GetWindowRect(IntPtr h, out RECT r);

[StructLayout(LayoutKind.Sequential)]
public struct RECT { public int L, T, R, B; }

public const int GWL_EXSTYLE = -20;
public const int WS_EX_LAYERED = 0x80000;
public const int WS_EX_TRANSPARENT = 0x20;
public const int WS_EX_TOOLWINDOW = 0x80;
public const int WS_EX_NOACTIVATE = 0x08000000;
public static readonly IntPtr HWND_TOPMOST = new IntPtr(-1);
public const uint SWP_NOACTIVATE = 0x0010;
public const uint SWP_SHOWWINDOW = 0x0040;
public const uint SWP_NOSIZE = 0x0001;

public static void MakeOverlay(IntPtr h) {
int ex = GetWindowLong(h, GWL_EXSTYLE);
ex |= WS_EX_LAYERED | WS_EX_TRANSPARENT | WS_EX_TOOLWINDOW | WS_EX_NOACTIVATE;
SetWindowLong(h, GWL_EXSTYLE, ex);
}
}
'@

$targetHwnd = [IntPtr]::new([long]${hwnd})
$stopFile = '${stopFileLiteral}'
$cursorSize = ${CURSOR_SIZE}

# Create cursor form with arrow shape
$cursor = New-Object System.Windows.Forms.Form
$cursor.FormBorderStyle = [System.Windows.Forms.FormBorderStyle]::None
$cursor.ShowInTaskbar = $false
$cursor.TopMost = $true
$cursor.StartPosition = [System.Windows.Forms.FormStartPosition]::Manual
$cursor.Size = New-Object System.Drawing.Size($cursorSize, $cursorSize)
$cursor.Location = New-Object System.Drawing.Point(-32000, -32000)
$cursor.Opacity = ${CURSOR_OPACITY}
$cursor.BackColor = [System.Drawing.Color]::Magenta
$cursor.TransparencyKey = [System.Drawing.Color]::Magenta

# Draw arrow cursor shape
$bmp = New-Object System.Drawing.Bitmap($cursorSize, $cursorSize)
$g = [System.Drawing.Graphics]::FromImage($bmp)
$g.SmoothingMode = [System.Drawing.Drawing2D.SmoothingMode]::AntiAlias
# Arrow polygon (pointing top-left)
$points = @(
(New-Object System.Drawing.Point(1, 1)),
(New-Object System.Drawing.Point(1, 16)),
(New-Object System.Drawing.Point(5, 12)),
(New-Object System.Drawing.Point(9, 18)),
(New-Object System.Drawing.Point(12, 16)),
(New-Object System.Drawing.Point(8, 10)),
(New-Object System.Drawing.Point(13, 10)),
(New-Object System.Drawing.Point(1, 1))
)
$brush = New-Object System.Drawing.SolidBrush([System.Drawing.Color]::FromArgb(${CURSOR_COLOR_R}, ${CURSOR_COLOR_G}, ${CURSOR_COLOR_B}))
$g.FillPolygon($brush, $points)
$pen = New-Object System.Drawing.Pen([System.Drawing.Color]::White, 1)
$g.DrawPolygon($pen, $points)
$g.Dispose()
$cursor.BackgroundImage = $bmp

$cursor.Show()
[VCursor]::MakeOverlay($cursor.Handle)

# Position file: the TS side writes "x,y" or "x,y,click" to this file
$posFile = $stopFile + '.pos'

$script:lastCX = -32000
$script:lastCY = -32000
$script:clickFlash = 0

$timer = New-Object System.Windows.Forms.Timer
$timer.Interval = 16 # ~60fps

$timer.Add_Tick({
if (-not [VCursor]::IsWindow($targetHwnd)) {
$timer.Stop(); $cursor.Close()
[System.Windows.Forms.Application]::ExitThread()
return
}
# Check stop
if (Test-Path $stopFile) {
$timer.Stop(); $cursor.Close()
try { Remove-Item $stopFile -ErrorAction SilentlyContinue } catch {}
try { Remove-Item $posFile -ErrorAction SilentlyContinue } catch {}
[System.Windows.Forms.Application]::ExitThread()
return
}
# Read position updates
if (Test-Path $posFile) {
try {
$data = Get-Content $posFile -Raw -ErrorAction SilentlyContinue
if ($data) {
$parts = $data.Trim().Split(',')
if ($parts.Length -ge 2) {
$script:lastCX = [int]$parts[0]
$script:lastCY = [int]$parts[1]
if ($parts.Length -ge 3 -and $parts[2] -eq 'click') {
$script:clickFlash = 6 # flash for 6 frames (~100ms)
}
}
Remove-Item $posFile -ErrorAction SilentlyContinue
}
} catch {}
}

# Get window position to convert client coords to screen coords
$wr = New-Object VCursor+RECT
[VCursor]::GetWindowRect($targetHwnd, [ref]$wr) | Out-Null
$screenX = $wr.L + $script:lastCX
$screenY = $wr.T + $script:lastCY

# Click flash: briefly change color
if ($script:clickFlash -gt 0) {
$cursor.Opacity = 1.0
$script:clickFlash--
if ($script:clickFlash -eq 0) {
$cursor.Opacity = ${CURSOR_OPACITY}
}
}

[VCursor]::SetWindowPos($cursor.Handle, [VCursor]::HWND_TOPMOST,
$screenX, $screenY, 0, 0,
[VCursor]::SWP_NOSIZE -bor [VCursor]::SWP_NOACTIVATE -bor [VCursor]::SWP_SHOWWINDOW) | Out-Null
$cursor.Visible = $true
})

$timer.Start()
[System.Windows.Forms.Application]::Run()
`
}
|
||||||
|
|
||||||
|
/**
 * Start the virtual cursor overlay for a bound window.
 *
 * Any overlay that is already running is asked to stop first. The overlay
 * script is written to a timestamped `.ps1` in the temp directory and run in
 * a detached PowerShell process; the stop-file path and script path are
 * remembered so the overlay can later be moved and torn down.
 *
 * @param hwnd - Target window handle as a decimal string.
 * @returns `true` when the overlay process was spawned, `false` when writing
 *   the script or spawning PowerShell failed.
 * @throws Error when `hwnd` is not purely numeric (from `validateHwnd`).
 */
export function showVirtualCursor(hwnd: string): boolean {
  hwnd = validateHwnd(hwnd)
  hideVirtualCursor()

  let scriptFile: string | null = null
  try {
    const tmpDir = getTmpDir()
    const ts = Date.now()
    const stopFile = path.join(tmpDir, `cu_vcursor_stop_${ts}`)
    scriptFile = path.join(tmpDir, `cu_vcursor_${ts}.ps1`)

    // Windows PowerShell decodes a BOM-less script file with the system ANSI
    // code page, which corrupts the embedded stop-file path whenever the temp
    // directory contains non-ASCII characters (e.g. a localized user name).
    // A leading U+FEFF makes the file unambiguously UTF-8.
    const script = '\ufeff' + buildCursorScript(hwnd, stopFile)
    fs.writeFileSync(scriptFile, script, 'utf-8')

    cursorProc = Bun.spawn(
      [
        'powershell',
        '-NoProfile',
        '-ExecutionPolicy',
        'Bypass',
        '-File',
        scriptFile,
      ],
      { stdout: 'ignore', stderr: 'ignore' },
    )
    cursorStopFile = stopFile
    cursorScriptFile = scriptFile
    return true
  } catch {
    // Do not leave an orphaned script behind when the spawn itself failed.
    if (scriptFile !== null) {
      try {
        fs.unlinkSync(scriptFile)
      } catch {}
    }
    return false
  }
}
|
||||||
|
|
||||||
|
/**
 * Ask the overlay to move the virtual cursor.
 *
 * Writes `x,y` (or `x,y,click`) with both coordinates rounded to integers to
 * the `<stopFile>.pos` file that the overlay script polls. Does nothing when
 * no overlay is registered; write failures are ignored (best effort).
 *
 * @param x - Horizontal offset, added by the overlay to the window's left edge.
 * @param y - Vertical offset, added by the overlay to the window's top edge.
 * @param isClick - When true, the overlay also plays its click flash.
 */
export function moveVirtualCursor(
  x: number,
  y: number,
  isClick: boolean = false,
): void {
  if (!cursorStopFile) return

  const fields: Array<number | string> = [Math.round(x), Math.round(y)]
  if (isClick) fields.push('click')

  try {
    fs.writeFileSync(`${cursorStopFile}.pos`, fields.join(','), 'utf-8')
  } catch {}
}
|
||||||
|
|
||||||
|
/**
 * Hide and destroy the virtual cursor.
 *
 * Creates the stop file so the overlay script exits on its own, clears the
 * module state immediately, and two seconds later force-kills the process
 * and removes the temp files as a fallback. No-op when no overlay is
 * registered.
 */
export function hideVirtualCursor(): void {
  // Snapshot the current overlay BEFORE clearing module state. The delayed
  // cleanup must act on *this* overlay: reading the module variables inside
  // the timer would see null (so nothing was ever killed or deleted) or,
  // after a subsequent showVirtualCursor(), the new overlay's process and
  // script — which the timer would then kill and delete.
  const proc = cursorProc
  const stopFile = cursorStopFile
  const scriptFile = cursorScriptFile

  cursorProc = null
  cursorStopFile = null
  cursorScriptFile = null

  if (!stopFile) return

  try {
    fs.writeFileSync(stopFile, 'STOP', 'utf-8')
  } catch {}

  setTimeout(() => {
    try {
      proc?.kill()
    } catch {}
    // The overlay normally removes the stop/position files itself; remove
    // whatever is left in case it had already exited or had to be killed.
    for (const file of [scriptFile, stopFile, `${stopFile}.pos`]) {
      if (!file) continue
      try {
        fs.unlinkSync(file)
      } catch {}
    }
  }, 2000)
}
|
||||||
|
|
||||||
|
/**
 * Report whether an overlay process handle is currently registered.
 *
 * This only reflects the stored handle; it does not check whether the
 * PowerShell process behind it is still running.
 */
export function isVirtualCursorActive(): boolean {
  const hasRegisteredProcess = cursorProc !== null
  return hasRegisteredProcess
}
|
||||||
66
src/utils/computerUse/win32/windowBorder.ts
Normal file
66
src/utils/computerUse/win32/windowBorder.ts
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
/**
|
||||||
|
* Visual indicator for bound windows — DWM native border color.
|
||||||
|
*
|
||||||
|
* Uses DwmSetWindowAttribute(DWMWA_BORDER_COLOR) to set a green border
|
||||||
|
* on the bound window. The border:
|
||||||
|
* - Is the window's OWN border, not an overlay — zero offset, zero shadow issues
|
||||||
|
* - Follows window movement/resize/rounded corners automatically (OS-level)
|
||||||
|
* - Persists across repaints, zero performance overhead
|
||||||
|
* - Works on Win11 22000+ (Build 22000 = Windows 11 GA)
|
||||||
|
*
|
||||||
|
* No overlays, no polling, no separate processes, no z-order issues.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { validateHwnd, ps } from './shared.js'
|
||||||
|
|
||||||
|
/**
 * Paint a green DWM border on the bound window.
 *
 * Calls DwmSetWindowAttribute with attribute 34 (DWMWA_BORDER_COLOR) and the
 * COLORREF 0x0000C800 (layout 0x00BBGGRR, i.e. R=0, G=200, B=0).
 *
 * @param hwnd - Target window handle as a decimal string.
 * @returns `true` when PowerShell printed an HRESULT of `0`.
 * @throws Error when `hwnd` is not purely numeric (from `validateHwnd`).
 */
export function markBound(hwnd: string): boolean {
  const handle = validateHwnd(hwnd)
  const script = `Add-Type @'
using System;
using System.Runtime.InteropServices;
public class CuDwm {
[DllImport("dwmapi.dll")]
public static extern int DwmSetWindowAttribute(IntPtr hwnd, int attr, ref uint val, int size);
}
'@
$color = [uint32]0x0000C800
[CuDwm]::DwmSetWindowAttribute([IntPtr]::new([long]${handle}), 34, [ref]$color, 4)`
  const hresult = ps(script)
  return hresult === '0'
}
|
||||||
|
|
||||||
|
/**
 * Remove the coloured border and restore the system default.
 *
 * Calls DwmSetWindowAttribute with attribute 34 (DWMWA_BORDER_COLOR) and the
 * sentinel DWMWA_COLOR_DEFAULT (0xFFFFFFFF).
 *
 * @param hwnd - Target window handle as a decimal string.
 * @returns `true` when PowerShell printed an HRESULT of `0`.
 * @throws Error when `hwnd` is not purely numeric (from `validateHwnd`).
 */
export function unmarkBound(hwnd: string): boolean {
  hwnd = validateHwnd(hwnd)
  // PowerShell parses the hex literal 0xFFFFFFFF as Int32 -1, and casting -1
  // to [uint32] throws, so the sentinel was never actually passed.
  // [uint32]::MaxValue is the same bit pattern without going through a
  // signed literal.
  const hr = ps(
    `Add-Type @'
using System;
using System.Runtime.InteropServices;
public class CuDwm {
[DllImport("dwmapi.dll")]
public static extern int DwmSetWindowAttribute(IntPtr hwnd, int attr, ref uint val, int size);
}
'@
$color = [uint32]::MaxValue
[CuDwm]::DwmSetWindowAttribute([IntPtr]::new([long]${hwnd}), 34, [ref]$color, 4)`,
  )
  return hr === '0'
}
|
||||||
|
|
||||||
|
/**
 * Tear down border indicators. Intentionally a no-op.
 *
 * The border is applied as a DWM attribute on the target window, so there
 * are no overlay windows or helper processes to kill here.
 */
export function cleanupAllBorders(): void {
  // Nothing to do: this module keeps no processes and no list of bound windows.
  // NOTE(review): the previous comment said the border colour resets
  // automatically when the process exits or the window closes. The attribute
  // is set on the target window — confirm whether callers should invoke
  // unmarkBound() for each bound window before shutdown.
}
|
||||||
@@ -4,7 +4,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
export interface WindowInfo {
|
export interface WindowInfo {
|
||||||
hwnd: number
|
hwnd: string
|
||||||
pid: number
|
pid: number
|
||||||
title: string
|
title: string
|
||||||
}
|
}
|
||||||
@@ -59,7 +59,13 @@ public class WinEnum {
|
|||||||
*/
|
*/
|
||||||
export function listWindows(): WindowInfo[] {
|
export function listWindows(): WindowInfo[] {
|
||||||
const result = Bun.spawnSync({
|
const result = Bun.spawnSync({
|
||||||
cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', ENUM_WINDOWS_PS],
|
cmd: [
|
||||||
|
'powershell',
|
||||||
|
'-NoProfile',
|
||||||
|
'-NonInteractive',
|
||||||
|
'-Command',
|
||||||
|
ENUM_WINDOWS_PS,
|
||||||
|
],
|
||||||
stdout: 'pipe',
|
stdout: 'pipe',
|
||||||
stderr: 'pipe',
|
stderr: 'pipe',
|
||||||
})
|
})
|
||||||
@@ -75,11 +81,11 @@ export function listWindows(): WindowInfo[] {
|
|||||||
const secondPipe = trimmed.indexOf('|', firstPipe + 1)
|
const secondPipe = trimmed.indexOf('|', firstPipe + 1)
|
||||||
if (firstPipe === -1 || secondPipe === -1) return null
|
if (firstPipe === -1 || secondPipe === -1) return null
|
||||||
|
|
||||||
const hwnd = Number(trimmed.slice(0, firstPipe))
|
const hwnd = trimmed.slice(0, firstPipe)
|
||||||
const pid = Number(trimmed.slice(firstPipe + 1, secondPipe))
|
const pid = Number(trimmed.slice(firstPipe + 1, secondPipe))
|
||||||
const title = trimmed.slice(secondPipe + 1)
|
const title = trimmed.slice(secondPipe + 1)
|
||||||
|
|
||||||
if (isNaN(hwnd) || isNaN(pid) || !title) return null
|
if (!hwnd || isNaN(pid) || !title) return null
|
||||||
return { hwnd, pid, title }
|
return { hwnd, pid, title }
|
||||||
})
|
})
|
||||||
.filter((item): item is WindowInfo => item !== null)
|
.filter((item): item is WindowInfo => item !== null)
|
||||||
|
|||||||
696
src/utils/computerUse/win32/windowMessage.ts
Normal file
696
src/utils/computerUse/win32/windowMessage.ts
Normal file
@@ -0,0 +1,696 @@
|
|||||||
|
/**
|
||||||
|
* SendMessage-based input for Win32 windows.
|
||||||
|
*
|
||||||
|
* ALL text/keyboard operations target a specific HWND via SendMessageW.
|
||||||
|
* No SendInput / keybd_event / SendKeys — those are global and conflict with the user.
|
||||||
|
*
|
||||||
|
* Text input strategy:
|
||||||
|
* 1. Short text (≤ CLIPBOARD_THRESHOLD chars): SendMessageW(WM_CHAR) per codepoint
|
||||||
|
* 2. Long text (> threshold): Clipboard.SetText() + SendMessageW(Ctrl+V) paste
|
||||||
|
* Both paths support full Unicode (Chinese, emoji, etc.) without IME involvement.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { validateHwnd, runPs, VK_MAP, MODIFIER_KEYS } from './shared.js'
|
||||||
|
|
||||||
|
/** Character count above which we switch to clipboard paste */
|
||||||
|
const CLIPBOARD_THRESHOLD = 32
|
||||||
|
|
||||||
|
/** Cache findEditChild results — window structure doesn't change while bound */
|
||||||
|
const editChildCache = new Map<string, string | null>()
|
||||||
|
|
||||||
|
/**
 * Drop cached edit-child lookups. Call on unbind.
 *
 * @param hwnd - Parent handle whose entry should be removed. When omitted
 *   (or empty) the whole cache is cleared.
 */
export function clearEditChildCache(hwnd?: string): void {
  if (!hwnd) {
    editChildCache.clear()
    return
  }
  editChildCache.delete(hwnd)
}
|
||||||
|
|
||||||
|
/**
 * Pick the window handle that should receive input messages.
 *
 * @param hwnd - Handle of the bound (top-level) window as a decimal string.
 * @returns The handle found by `findEditChild` for that window, or the
 *   validated `hwnd` itself when no edit-capable child was found.
 * @throws Error when `hwnd` is not purely numeric (from `validateHwnd`).
 */
export function resolveInputHwnd(hwnd: string): string {
  const parent = validateHwnd(hwnd)
  const editChild = findEditChild(parent)
  return editChild ?? parent
}
|
||||||
|
|
||||||
|
/**
 * PowerShell preamble that compiles the `WinMsg` C# helper via Add-Type.
 *
 * The helper declares P/Invoke entry points (EnumChildWindows, GetClassName,
 * SendMessageW, PostMessageW, MapVirtualKeyW), lParam builders for mouse and
 * key messages, the WM_* message constants, and `FindChildren`, which fills
 * the static `childResults` list with one `hwnd|className` string per
 * enumerated child window.
 *
 * Scripts in this file prepend this text before referencing `[WinMsg]`.
 * It is a runtime string: the text between the backticks is PowerShell/C#.
 */
const WINMSG_TYPE = `
Add-Type @'
using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Text;

public class WinMsg {
public delegate bool EnumChildProc(IntPtr hWnd, IntPtr lParam);

[DllImport("user32.dll")]
public static extern bool EnumChildWindows(IntPtr parent, EnumChildProc proc, IntPtr lParam);

[DllImport("user32.dll", CharSet=CharSet.Unicode)]
public static extern int GetClassName(IntPtr h, StringBuilder sb, int max);

// CRITICAL: CharSet.Unicode → resolves to SendMessageW
// SendMessageW sends Unicode WM_CHAR (full UTF-16 codepoints including CJK)
[DllImport("user32.dll", CharSet=CharSet.Unicode, EntryPoint="SendMessageW")]
public static extern IntPtr SendMessage(IntPtr hWnd, uint msg, IntPtr wParam, IntPtr lParam);

[DllImport("user32.dll", CharSet=CharSet.Unicode, EntryPoint="PostMessageW")]
public static extern bool PostMessage(IntPtr hWnd, uint msg, IntPtr wParam, IntPtr lParam);

[DllImport("user32.dll")]
public static extern uint MapVirtualKeyW(uint uCode, uint uMapType);

public static IntPtr MakeLParam(int lo, int hi) {
return (IntPtr)((hi << 16) | (lo & 0xFFFF));
}

// Build lParam for WM_KEYDOWN / WM_KEYUP with correct scan code
// lParam bits: 0-15 repeat count, 16-23 scan code, 24 extended, 30 prev state, 31 transition
public static IntPtr KeyDownLParam(uint vk) {
uint scanCode = MapVirtualKeyW(vk, 0); // MAPVK_VK_TO_VSC = 0
return (IntPtr)(1 | (scanCode << 16)); // repeat=1, scanCode in bits 16-23
}
public static IntPtr KeyUpLParam(uint vk) {
uint scanCode = MapVirtualKeyW(vk, 0);
return (IntPtr)(1 | (scanCode << 16) | (1 << 30) | (1u << 31)); // prev=1, transition=1
}

public const uint WM_CHAR = 0x0102;
public const uint WM_KEYDOWN = 0x0100;
public const uint WM_KEYUP = 0x0101;
public const uint WM_LBUTTONDOWN = 0x0201;
public const uint WM_LBUTTONUP = 0x0202;
public const uint WM_RBUTTONDOWN = 0x0204;
public const uint WM_RBUTTONUP = 0x0205;

public static List<string> childResults = new List<string>();

public static void FindChildren(IntPtr parent) {
childResults.Clear();
EnumChildWindows(parent, delegate(IntPtr hWnd, IntPtr lParam) {
StringBuilder sb = new StringBuilder(256);
GetClassName(hWnd, sb, sb.Capacity);
childResults.Add(hWnd.ToInt64() + "|" + sb.ToString());
return true;
}, IntPtr.Zero);
}
}
'@
`
|
||||||
|
|
||||||
|
/**
 * Window class names treated as text-input targets, highest priority first.
 * `findEditChild` returns the first child whose class matches the earliest
 * entry in this list.
 */
const EDIT_CLASSES: string[] = [
  // WinUI 3 input bridge (Windows Terminal, etc.)
  'Windows.UI.Input.InputSite.WindowClass',
  // Win11 Notepad (WinUI 3)
  'RichEditD2DPT',
  // WordPad
  'RichEdit20W',
  // Classic edit controls
  'Edit',
  // Scintilla-based editors (Notepad++, etc.)
  'Scintilla',
  // Chrome/Electron
  'Chrome_RenderWidgetHostHWND',
  // WPF TextBox
  'TextBox',
  // WPF RichTextBox
  'RichTextBox',
  // UWP CoreWindow (input target for some UWP apps)
  'Windows.UI.Core.CoreWindow',
]
|
||||||
|
|
||||||
|
/**
 * Strategy 1: enumerate child windows and match their class names against
 * EDIT_CLASSES in priority order. Returns null when nothing matches.
 */
function findEditChildByClass(parentHwnd: string): string | null {
  const raw = runPs(`${WINMSG_TYPE}
[WinMsg]::FindChildren([IntPtr]::new([long]${parentHwnd}))
[WinMsg]::childResults | ForEach-Object { $_ }
`)
  if (!raw) return null

  // Each output line is "hwnd|className"; keep only well-formed entries whose
  // handle is purely numeric so a malformed line can never be cached.
  const children: Array<{ hwnd: string; className: string }> = []
  for (const line of raw.split('\n')) {
    const trimmed = line.trim()
    const pipe = trimmed.indexOf('|')
    if (pipe === -1) continue
    const hwnd = trimmed.slice(0, pipe)
    if (!/^\d+$/.test(hwnd)) continue
    children.push({ hwnd, className: trimmed.slice(pipe + 1) })
  }

  for (const editClass of EDIT_CLASSES) {
    const match = children.find(c => c.className === editClass)
    if (match) return match.hwnd
  }
  return null
}

/**
 * Strategy 2: ask UI Automation for the first Edit or Document descendant and
 * return its native window handle. Returns null when none has one.
 */
function findEditChildByUia(parentHwnd: string): string | null {
  // No Add-Type C# helper is compiled here: the script only uses the
  // UIAutomation assemblies, so compiling an unused class was pure overhead.
  const result = runPs(`
Add-Type -AssemblyName UIAutomationClient
Add-Type -AssemblyName UIAutomationTypes
try {
$el = [System.Windows.Automation.AutomationElement]::FromHandle([IntPtr]::new([long]${parentHwnd}))
if ($el -eq $null) { Write-Output 'NONE'; exit }

# Search for Edit or Document control types (covers text editors)
$editCond = [System.Windows.Automation.PropertyCondition]::new(
[System.Windows.Automation.AutomationElement]::ControlTypeProperty,
[System.Windows.Automation.ControlType]::Edit)
$docCond = [System.Windows.Automation.PropertyCondition]::new(
[System.Windows.Automation.AutomationElement]::ControlTypeProperty,
[System.Windows.Automation.ControlType]::Document)
$orCond = [System.Windows.Automation.OrCondition]::new($editCond, $docCond)

$found = $el.FindFirst([System.Windows.Automation.TreeScope]::Descendants, $orCond)
if ($found -eq $null) { Write-Output 'NONE'; exit }

$nativeHwnd = $found.Current.NativeWindowHandle
if ($nativeHwnd -ne 0) {
Write-Output $nativeHwnd
} else {
Write-Output 'NONE'
}
} catch {
Write-Output 'NONE'
}
`)
  const hwnd = result?.trim() ?? ''
  // Accept only a plain non-zero decimal handle; 'NONE', empty or multi-line
  // output all mean "not found".
  return /^\d+$/.test(hwnd) && hwnd !== '0' ? hwnd : null
}

/**
 * Find the first edit-capable child window of a parent HWND.
 *
 * Strategy:
 *   1. EnumChildWindows — look for known edit control class names.
 *   2. UI Automation fallback — first Edit/Document element with a native HWND.
 *
 * The outcome, including "nothing found", is cached per parent handle until
 * `clearEditChildCache` is called.
 *
 * @param parentHwnd - Parent window handle as a decimal string.
 * @returns The child handle as a decimal string, or `null` when none was found.
 * @throws Error when `parentHwnd` is not purely numeric (from `validateHwnd`).
 */
export function findEditChild(parentHwnd: string): string | null {
  parentHwnd = validateHwnd(parentHwnd)

  // Cached values are `string | null`; only `undefined` means "not cached".
  const cached = editChildCache.get(parentHwnd)
  if (cached !== undefined) return cached

  const found =
    findEditChildByClass(parentHwnd) ?? findEditChildByUia(parentHwnd)
  editChildCache.set(parentHwnd, found)
  return found
}
|
||||||
|
|
||||||
|
/**
 * Deliver one Unicode character to a window through SendMessageW(WM_CHAR).
 *
 * Only the first code point of `char` is sent. Code points up to U+FFFF go
 * out as a single WM_CHAR; anything above is split into its UTF-16 high and
 * low surrogate and sent as two consecutive WM_CHAR messages.
 *
 * @param hwnd - Target window handle as a decimal string.
 * @param char - String whose first code point is sent.
 * @returns `false` when `char` is empty or the PowerShell call failed,
 *   `true` otherwise.
 * @throws Error when `hwnd` is not purely numeric (from `validateHwnd`).
 */
export function sendChar(hwnd: string, char: string): boolean {
  hwnd = validateHwnd(hwnd)
  const codePoint = char.codePointAt(0)
  if (codePoint === undefined) return false

  // UTF-16 code units to transmit, in order.
  let units: number[]
  if (codePoint <= 0xffff) {
    units = [codePoint]
  } else {
    const offset = codePoint - 0x10000
    units = [0xd800 + (offset >> 10), 0xdc00 + (offset & 0x3ff)]
  }

  const target = `[IntPtr]::new([long]${hwnd})`
  const calls = units.map(
    unit =>
      `[WinMsg]::SendMessage(${target}, [WinMsg]::WM_CHAR, [IntPtr]${unit}, [IntPtr]0)`,
  )
  return runPs(`${WINMSG_TYPE}\n${calls.join('\n')}\n`) !== null
}
|
||||||
|
|
||||||
|
/**
 * Produce one PowerShell `[WinMsg]::SendMessage(... WM_CHAR ...)` statement per
 * UTF-16 code unit of `text`.
 *
 * Characters above U+FFFF contribute two statements (high surrogate first,
 * then low surrogate); every other character contributes one.
 *
 * @param hwnd - Window handle, interpolated verbatim into `[IntPtr]::new([long]…)`.
 * @param text - Text to translate into WM_CHAR statements.
 * @returns The statements in send order; empty when `text` is empty.
 */
function buildWmCharLines(hwnd: string, text: string): string[] {
  const wmChar = (unit: number): string =>
    `[WinMsg]::SendMessage([IntPtr]::new([long]${hwnd}), [WinMsg]::WM_CHAR, [IntPtr]${unit}, [IntPtr]0)`

  const statements: string[] = []
  let index = 0
  while (index < text.length) {
    // codePointAt is always defined here because index < text.length.
    const cp = text.codePointAt(index) ?? 0
    if (cp > 0xffff) {
      const offset = cp - 0x10000
      statements.push(wmChar(0xd800 + (offset >> 10)))
      statements.push(wmChar(0xdc00 + (offset & 0x3ff)))
      index += 2 // a supplementary character occupies two UTF-16 units
    } else {
      statements.push(wmChar(cp))
      index += 1
    }
  }
  return statements
}
|
||||||
|
|
||||||
|
/**
 * Paste text into the target window through the clipboard.
 *
 * The generated PowerShell script saves the current clipboard text, puts
 * `text` on the clipboard, posts Ctrl+V key messages to the window with
 * [WinMsg]::PostMessage, waits 50 ms, then restores (or clears) the clipboard.
 * No global input APIs (SendInput/keybd_event/SendKeys) are used — only
 * window-targeted messages.
 *
 * @param hwnd - Window handle, interpolated into `[IntPtr]::new([long]…)`.
 *   It is not validated here; callers are expected to pass a vetted handle.
 * @param text - Text to paste. An empty string is a no-op that returns true.
 * @returns true when the script's final output line is `OK`.
 */
function pasteViaClipboard(hwnd: string, text: string): boolean {
  // Clipboard.SetText throws on an empty string; the script would then carry
  // on and paste whatever the user already had on the clipboard.
  if (text === '') return true

  // PowerShell treats U+2018, U+2019, U+201A and U+201B as single-quote
  // delimiters too, so every one of them must be doubled — escaping only the
  // ASCII apostrophe lets text such as "don’t" terminate the literal early.
  const escaped = text.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  const hwndExpr = `[IntPtr]::new([long]${hwnd})`
  const script = `${WINMSG_TYPE}
Add-Type -AssemblyName System.Windows.Forms

# Save current clipboard
$saved = $null
try { $saved = [System.Windows.Forms.Clipboard]::GetText() } catch {}

# Set our text
[System.Windows.Forms.Clipboard]::SetText('${escaped}')

# Ctrl+V via PostMessage to the target window (NOT global keybd_event)
# Must use PostMessage + correct lParam (scan code) for Windows Terminal / ConPTY
[WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYDOWN, [IntPtr]0x11, [WinMsg]::KeyDownLParam(0x11))  # Ctrl down
[WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYDOWN, [IntPtr]0x56, [WinMsg]::KeyDownLParam(0x56))  # V down
[WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYUP, [IntPtr]0x56, [WinMsg]::KeyUpLParam(0x56))  # V up
[WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYUP, [IntPtr]0x11, [WinMsg]::KeyUpLParam(0x11))  # Ctrl up

# Brief wait for paste to complete
Start-Sleep -Milliseconds 50

# Restore clipboard
if ($saved -ne $null -and $saved -ne '') {
try { [System.Windows.Forms.Clipboard]::SetText($saved) } catch {}
} else {
try { [System.Windows.Forms.Clipboard]::Clear() } catch {}
}
Write-Output 'OK'
`
  const output = runPs(script)
  if (typeof output !== 'string') return false
  // NOTE(review): if [WinMsg]::PostMessage returns a value, PowerShell echoes
  // it ahead of 'OK', so only the last output line is compared.
  return /(^|\n)\s*OK$/.test(output.trim())
}
|
||||||
|
|
||||||
|
/**
 * Type text into a window by sending WM_CHAR for each UTF-16 code unit.
 *
 * The handle is first mapped through resolveInputHwnd, then a single
 * PowerShell script containing every SendMessage statement from
 * buildWmCharLines is executed. The clipboard path is never used here; the
 * original author notes that clipboard-based Ctrl+V does not work for
 * Windows Terminal / ConPTY. Window-targeted, no global input APIs.
 *
 * @param hwnd - Window handle to resolve and send to.
 * @param text - Text to send.
 * @returns Whether runPs produced a non-null result.
 */
export function sendText(hwnd: string, text: string): boolean {
  const wmCharStatements = buildWmCharLines(resolveInputHwnd(hwnd), text)
  // Script layout: type definition, one statement per line, trailing newline.
  const script = [WINMSG_TYPE, wmCharStatements.join('\n'), ''].join('\n')
  return runPs(script) !== null
}
|
||||||
|
|
||||||
|
/**
 * Post a single key-down or key-up message to a window.
 *
 * Uses [WinMsg]::PostMessage with message 0x0100 (WM_KEYDOWN) or 0x0101
 * (WM_KEYUP). The lParam comes from [WinMsg]::KeyDownLParam / KeyUpLParam,
 * which are defined in WINMSG_TYPE. Per the original author's note,
 * PostMessage (rather than SendMessage) with a scan-code-bearing lParam is
 * needed for Windows Terminal and ConPTY-based consoles.
 *
 * @param hwnd - Target window handle; passed through validateHwnd.
 * @param vk - Virtual-key code to send.
 * @param action - 'down' for WM_KEYDOWN, 'up' for WM_KEYUP.
 * @returns Whether runPs produced a non-null result.
 */
export function sendKey(
  hwnd: string,
  vk: number,
  action: 'down' | 'up',
): boolean {
  const target = validateHwnd(hwnd)

  let message = '0x0101'
  let lParamHelper = 'KeyUpLParam'
  if (action === 'down') {
    message = '0x0100'
    lParamHelper = 'KeyDownLParam'
  }

  const call = `[WinMsg]::PostMessage([IntPtr]::new([long]${target}), ${message}, [IntPtr]${vk}, [WinMsg]::${lParamHelper}(${vk}))`
  return runPs(`${WINMSG_TYPE}\n${call}\n`) !== null
}
|
||||||
|
|
||||||
|
/**
 * Send a key combination (e.g. ['ctrl', 'a']) to a window.
 *
 * Modifier keys are pressed in the order given, the main key is pressed and
 * released, then the modifiers are released in reverse order. Every event is
 * delivered with [WinMsg]::PostMessage and an lParam from
 * [WinMsg]::KeyDownLParam / KeyUpLParam — no global input APIs. Per the
 * original author's note, PostMessage with a scan-code lParam is required
 * for Windows Terminal / ConPTY to translate key events.
 *
 * Key names are looked up case-insensitively in VK_MAP. A one-character name
 * that is not in VK_MAP is accepted only when a virtual-key code can be
 * derived safely:
 *   - A–Z and 0–9, whose virtual-key codes equal their ASCII codes;
 *   - backspace, tab, carriage return, escape and space, likewise;
 *   - `+ , - .`, mapped to VK_OEM_PLUS / COMMA / MINUS / PERIOD.
 * Any other character would alias an unrelated key if its char code were
 * used as a virtual-key code (for example '.' is 0x2E = VK_DELETE), so it is
 * rejected.
 *
 * @param hwnd - Window handle; mapped through resolveInputHwnd.
 * @param combo - Key names; if several non-modifier keys are given, the last wins.
 * @returns false when `combo` is empty, contains an unsupported key, or has
 *   no non-modifier key; otherwise whether runPs produced a non-null result.
 */
export function sendKeys(hwnd: string, combo: string[]): boolean {
  hwnd = resolveInputHwnd(hwnd)
  if (combo.length === 0) return false

  // Control characters and space whose char code equals the virtual-key code:
  // VK_BACK, VK_TAB, VK_RETURN, VK_ESCAPE, VK_SPACE.
  const asciiEqualsVk = new Set<number>([0x08, 0x09, 0x0d, 0x1b, 0x20])
  // Punctuation keys whose virtual-key code does not depend on the layout.
  const oemVk = new Map<string, number>([
    ['+', 0xbb], // VK_OEM_PLUS
    [',', 0xbc], // VK_OEM_COMMA
    ['-', 0xbd], // VK_OEM_MINUS
    ['.', 0xbe], // VK_OEM_PERIOD
  ])

  const modifiers: number[] = []
  let mainKey: number | undefined

  for (const key of combo) {
    const lower = key.toLowerCase()
    const vk = VK_MAP[lower]
    if (vk !== undefined) {
      if (MODIFIER_KEYS.has(lower)) {
        modifiers.push(vk)
      } else {
        mainKey = vk
      }
    } else if (lower.length === 1) {
      const upper = lower.toUpperCase()
      // toUpperCase can expand one character into several (e.g. 'ß' → 'SS').
      const code = upper.length === 1 ? upper.charCodeAt(0) : -1
      const isDigit = code >= 0x30 && code <= 0x39
      const isLetter = code >= 0x41 && code <= 0x5a
      if (isDigit || isLetter || asciiEqualsVk.has(code)) {
        mainKey = code
      } else {
        const mapped = oemVk.get(lower)
        if (mapped === undefined) return false
        mainKey = mapped
      }
    } else {
      return false
    }
  }

  if (mainKey === undefined) return false

  // Build script: modifiers down, key down, key up, modifiers up (reverse).
  const hwndExpr = `[IntPtr]::new([long]${hwnd})`
  const lines: string[] = []
  for (const mod of modifiers) {
    lines.push(
      `[WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYDOWN, [IntPtr]${mod}, [WinMsg]::KeyDownLParam(${mod}))`,
    )
  }
  lines.push(
    `[WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYDOWN, [IntPtr]${mainKey}, [WinMsg]::KeyDownLParam(${mainKey}))`,
  )
  lines.push(
    `[WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYUP, [IntPtr]${mainKey}, [WinMsg]::KeyUpLParam(${mainKey}))`,
  )
  for (const mod of [...modifiers].reverse()) {
    lines.push(
      `[WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYUP, [IntPtr]${mod}, [WinMsg]::KeyUpLParam(${mod}))`,
    )
  }

  const script = `${WINMSG_TYPE}
${lines.join('\n')}
`
  return runPs(script) !== null
}
|
||||||
|
|
||||||
|
// ── Console Input Buffer (WriteConsoleInput) ─────────────────────────
// For terminal/console windows, SendMessageW doesn't reliably inject
// key events into the Console Input Buffer that raw-mode stdin reads.
//
// CONSOLE_INPUT_TYPE is a PowerShell snippet that uses Add-Type to compile a
// C# helper class, ConsoleInput, which injects key events directly:
//   1. GetWindowThreadProcessId maps the window handle to a process id.
//   2. FreeConsole + AttachConsole(pid) attach to that process's console.
//   3. GetStdHandle(STD_INPUT_HANDLE) obtains the input handle.
//   4. WriteConsoleInput writes KEY_EVENT records; FreeConsole runs in a
//      finally block afterwards.
// SendKeyToConsole writes one key-down/key-up pair for a virtual key (the
// scan code comes from MapVirtualKeyW). SendTextToConsole writes one
// down/up pair per UTF-16 code unit of the text, with virtual-key and scan
// code left at 0. Both return false when the pid lookup, AttachConsole or
// the handle lookup fails.
//
// The template contents are runtime text: the closing '@ of the here-string
// must stay at the start of its line.

const CONSOLE_INPUT_TYPE = `
Add-Type @'
using System;
using System.Runtime.InteropServices;

public class ConsoleInput {
[DllImport("kernel32.dll", SetLastError=true)]
public static extern bool AttachConsole(uint dwProcessId);

[DllImport("kernel32.dll", SetLastError=true)]
public static extern bool FreeConsole();

[DllImport("kernel32.dll", SetLastError=true)]
public static extern IntPtr GetStdHandle(int nStdHandle);

[DllImport("kernel32.dll", CharSet=CharSet.Unicode, SetLastError=true)]
public static extern bool WriteConsoleInput(
IntPtr hConsoleInput,
INPUT_RECORD[] lpBuffer,
uint nLength,
out uint lpNumberOfEventsWritten);

[DllImport("kernel32.dll")]
public static extern uint MapVirtualKeyW(uint uCode, uint uMapType);

[DllImport("user32.dll")]
public static extern uint GetWindowThreadProcessId(IntPtr hWnd, out uint lpdwProcessId);

public const int STD_INPUT_HANDLE = -10;

[StructLayout(LayoutKind.Explicit)]
public struct INPUT_RECORD {
[FieldOffset(0)] public ushort EventType;
[FieldOffset(4)] public KEY_EVENT_RECORD KeyEvent;
}

[StructLayout(LayoutKind.Explicit, CharSet=CharSet.Unicode)]
public struct KEY_EVENT_RECORD {
[FieldOffset(0)] public bool bKeyDown;
[FieldOffset(4)] public ushort wRepeatCount;
[FieldOffset(6)] public ushort wVirtualKeyCode;
[FieldOffset(8)] public ushort wVirtualScanCode;
[FieldOffset(10)] public char UnicodeChar;
[FieldOffset(12)] public uint dwControlKeyState;
}

public static bool SendKeyToConsole(IntPtr hwnd, ushort vk, char ch) {
uint pid;
GetWindowThreadProcessId(hwnd, out pid);
if (pid == 0) return false;

FreeConsole();
if (!AttachConsole(pid)) return false;

try {
IntPtr hInput = GetStdHandle(STD_INPUT_HANDLE);
if (hInput == IntPtr.Zero || hInput == (IntPtr)(-1)) return false;

ushort scanCode = (ushort)MapVirtualKeyW(vk, 0);
INPUT_RECORD[] records = new INPUT_RECORD[2];

// Key down
records[0].EventType = 1; // KEY_EVENT
records[0].KeyEvent.bKeyDown = true;
records[0].KeyEvent.wRepeatCount = 1;
records[0].KeyEvent.wVirtualKeyCode = vk;
records[0].KeyEvent.wVirtualScanCode = scanCode;
records[0].KeyEvent.UnicodeChar = ch;
records[0].KeyEvent.dwControlKeyState = 0;

// Key up
records[1].EventType = 1;
records[1].KeyEvent.bKeyDown = false;
records[1].KeyEvent.wRepeatCount = 1;
records[1].KeyEvent.wVirtualKeyCode = vk;
records[1].KeyEvent.wVirtualScanCode = scanCode;
records[1].KeyEvent.UnicodeChar = ch;
records[1].KeyEvent.dwControlKeyState = 0;

uint written;
return WriteConsoleInput(hInput, records, 2, out written);
} finally {
FreeConsole();
}
}

public static bool SendTextToConsole(IntPtr hwnd, string text) {
uint pid;
GetWindowThreadProcessId(hwnd, out pid);
if (pid == 0) return false;

FreeConsole();
if (!AttachConsole(pid)) return false;

try {
IntPtr hInput = GetStdHandle(STD_INPUT_HANDLE);
if (hInput == IntPtr.Zero || hInput == (IntPtr)(-1)) return false;

INPUT_RECORD[] records = new INPUT_RECORD[text.Length * 2];
for (int i = 0; i < text.Length; i++) {
char c = text[i];
ushort vk = 0;
ushort sc = 0;

// Key down
records[i * 2].EventType = 1;
records[i * 2].KeyEvent.bKeyDown = true;
records[i * 2].KeyEvent.wRepeatCount = 1;
records[i * 2].KeyEvent.wVirtualKeyCode = vk;
records[i * 2].KeyEvent.wVirtualScanCode = sc;
records[i * 2].KeyEvent.UnicodeChar = c;
records[i * 2].KeyEvent.dwControlKeyState = 0;

// Key up
records[i * 2 + 1].EventType = 1;
records[i * 2 + 1].KeyEvent.bKeyDown = false;
records[i * 2 + 1].KeyEvent.wRepeatCount = 1;
records[i * 2 + 1].KeyEvent.wVirtualKeyCode = vk;
records[i * 2 + 1].KeyEvent.wVirtualScanCode = sc;
records[i * 2 + 1].KeyEvent.UnicodeChar = c;
records[i * 2 + 1].KeyEvent.dwControlKeyState = 0;
}

uint written;
return WriteConsoleInput(hInput, records, (uint)records.Length, out written);
} finally {
FreeConsole();
}
}
}
'@
`
|
||||||
|
|
||||||
|
/**
 * Inject one key press (down + up) into a console window's input buffer.
 *
 * Runs [ConsoleInput]::SendKeyToConsole from CONSOLE_INPUT_TYPE, which writes
 * the events with WriteConsoleInput. The original author notes this is needed
 * for terminal apps (such as the Claude Code REPL) that read stdin in raw mode.
 *
 * @param hwnd - Console window handle; passed through validateHwnd.
 * @param vk - Virtual-key code for the event.
 * @param ch - Character to attach to the event; only its first UTF-16 code
 *   unit is used. Defaults to NUL, and an empty string is treated as NUL.
 * @returns true only when the script ran and SendKeyToConsole reported
 *   success (its final output line is `True`).
 */
export function consoleKey(
  hwnd: string,
  vk: number,
  ch: string = '\0',
): boolean {
  hwnd = validateHwnd(hwnd)
  // ''.charCodeAt(0) is NaN, which would render as the invalid cast [char]NaN.
  const charCode = ch.length > 0 ? ch.charCodeAt(0) : 0
  const script = `${CONSOLE_INPUT_TYPE}
[ConsoleInput]::SendKeyToConsole([IntPtr]::new([long]${hwnd}), ${vk}, [char]${charCode})
`
  const output = runPs(script)
  if (typeof output !== 'string') return false
  // SendKeyToConsole returns a bool that PowerShell prints as True/False;
  // a non-null result alone does not mean the key was injected.
  return /(^|\n)\s*True$/.test(output.trim())
}
|
||||||
|
|
||||||
|
/**
 * Inject text into a console window's input buffer via WriteConsoleInput.
 *
 * Runs [ConsoleInput]::SendTextToConsole from CONSOLE_INPUT_TYPE, which writes
 * a key-down/key-up pair per UTF-16 code unit — works for raw-mode stdin
 * according to the original author. No Enter key is appended; callers that
 * need one must include it in `text` or send it separately.
 *
 * @param hwnd - Console window handle; passed through validateHwnd.
 * @param text - Text to inject.
 * @returns true only when the script ran and SendTextToConsole reported
 *   success (its final output line is `True`).
 */
export function consoleText(hwnd: string, text: string): boolean {
  hwnd = validateHwnd(hwnd)
  // PowerShell treats U+2018, U+2019, U+201A and U+201B as single-quote
  // delimiters too, so every one of them must be doubled — escaping only the
  // ASCII apostrophe lets text such as "don’t" terminate the literal early.
  const escaped = text.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  const script = `${CONSOLE_INPUT_TYPE}
[ConsoleInput]::SendTextToConsole([IntPtr]::new([long]${hwnd}), '${escaped}')
`
  const output = runPs(script)
  if (typeof output !== 'string') return false
  // SendTextToConsole returns a bool that PowerShell prints as True/False;
  // a non-null result alone does not mean the text was injected.
  return /(^|\n)\s*True$/.test(output.trim())
}
|
||||||
|
|
||||||
|
/**
 * Send a mouse click at client-area coordinates (x, y) relative to the window.
 *
 * Delivers a button-down followed by a button-up via [WinMsg]::SendMessage —
 * window-targeted, no cursor movement. The button-down message carries the
 * pressed button's own flag in wParam (MK_LBUTTON = 1, MK_RBUTTON = 2), as a
 * real click does and as sendMouseDown does; the button-up carries 0.
 *
 * @param hwnd - Window handle; mapped through resolveInputHwnd.
 * @param x - Client-area X coordinate.
 * @param y - Client-area Y coordinate.
 * @param button - 'left' sends 0x0201/0x0202 (WM_LBUTTONDOWN/UP);
 *   'right' sends 0x0204/0x0205 (WM_RBUTTONDOWN/UP).
 * @returns Whether runPs produced a non-null result.
 */
export function sendClick(
  hwnd: string,
  x: number,
  y: number,
  button: 'left' | 'right',
): boolean {
  hwnd = resolveInputHwnd(hwnd)
  const isLeft = button === 'left'
  const downMsg = isLeft ? '0x0201' : '0x0204'
  const upMsg = isLeft ? '0x0202' : '0x0205'
  // wParam for the down message: the button being pressed is reported as held.
  const downFlags = isLeft ? 1 : 2
  const hwndExpr = `[IntPtr]::new([long]${hwnd})`

  const script = `${WINMSG_TYPE}
$lp = [WinMsg]::MakeLParam(${x}, ${y})
[WinMsg]::SendMessage(${hwndExpr}, ${downMsg}, [IntPtr]${downFlags}, $lp)
[WinMsg]::SendMessage(${hwndExpr}, ${upMsg}, [IntPtr]0, $lp)
`
  return runPs(script) !== null
}
|
||||||
|
|
||||||
|
/**
 * Press the left mouse button at client-area coordinates (x, y).
 *
 * Sends WM_LBUTTONDOWN with wParam 1 through [WinMsg]::SendMessage —
 * window-targeted, the cursor is not moved.
 *
 * @param hwnd - Window handle; mapped through resolveInputHwnd.
 * @param x - Client-area X coordinate.
 * @param y - Client-area Y coordinate.
 * @returns Whether runPs produced a non-null result.
 */
export function sendMouseDown(hwnd: string, x: number, y: number): boolean {
  const target = resolveInputHwnd(hwnd)
  const psLines = [
    WINMSG_TYPE,
    `$lp = [WinMsg]::MakeLParam(${x}, ${y})`,
    `[WinMsg]::SendMessage([IntPtr]::new([long]${target}), [WinMsg]::WM_LBUTTONDOWN, [IntPtr]1, $lp)`,
    '',
  ]
  return runPs(psLines.join('\n')) !== null
}
|
||||||
|
|
||||||
|
/**
 * Release the left mouse button at client-area coordinates (x, y).
 *
 * Sends WM_LBUTTONUP with wParam 0 through [WinMsg]::SendMessage —
 * window-targeted, the cursor is not moved.
 *
 * @param hwnd - Window handle; mapped through resolveInputHwnd.
 * @param x - Client-area X coordinate.
 * @param y - Client-area Y coordinate.
 * @returns Whether runPs produced a non-null result.
 */
export function sendMouseUp(hwnd: string, x: number, y: number): boolean {
  const target = resolveInputHwnd(hwnd)
  const psLines = [
    WINMSG_TYPE,
    `$lp = [WinMsg]::MakeLParam(${x}, ${y})`,
    `[WinMsg]::SendMessage([IntPtr]::new([long]${target}), [WinMsg]::WM_LBUTTONUP, [IntPtr]0, $lp)`,
    '',
  ]
  return runPs(psLines.join('\n')) !== null
}
|
||||||
|
|
||||||
|
/**
 * Report a mouse move at client-area coordinates (x, y) to a window.
 *
 * Sends message 0x0200 (WM_MOUSEMOVE) with wParam 1 through
 * [WinMsg]::SendMessage — window-targeted. Intended for the intermediate
 * steps of a drag.
 *
 * @param hwnd - Window handle; mapped through resolveInputHwnd.
 * @param x - Client-area X coordinate.
 * @param y - Client-area Y coordinate.
 * @returns Whether runPs produced a non-null result.
 */
export function sendMouseMove(hwnd: string, x: number, y: number): boolean {
  const target = resolveInputHwnd(hwnd)
  const psLines = [
    WINMSG_TYPE,
    `$lp = [WinMsg]::MakeLParam(${x}, ${y})`,
    `[WinMsg]::SendMessage([IntPtr]::new([long]${target}), 0x0200, [IntPtr]1, $lp)`,
    '',
  ]
  return runPs(psLines.join('\n')) !== null
}
|
||||||
|
|
||||||
|
/**
 * Send a mouse-wheel scroll at client-area coordinates (x, y).
 *
 * Delivers WM_MOUSEWHEEL (0x020A, vertical; positive = scroll up) or
 * WM_MOUSEHWHEEL (0x020E, horizontal; positive = scroll right) via
 * SendMessageW. The embedded C# helper converts the client coordinates to
 * screen coordinates with ClientToScreen for lParam and packs the wheel
 * amount into the high word of wParam.
 *
 * Per the original author's note this works on Excel, browsers and modern UI,
 * unlike WM_VSCROLL/WM_HSCROLL which only work on traditional scrollbar
 * controls.
 *
 * @param hwnd - Window handle; mapped through resolveInputHwnd.
 * @param x - Client-area X coordinate.
 * @param y - Client-area Y coordinate.
 * @param delta - Number of wheel notches ("clicks"); it is rounded to an
 *   integer and multiplied by WHEEL_DELTA (120). Values beyond ±273 notches
 *   are clamped, because the wheel amount is a signed 16-bit field.
 * @param horizontal - true for WM_MOUSEHWHEEL, false (default) for WM_MOUSEWHEEL.
 * @returns Whether runPs produced a non-null result.
 */
export function sendMouseWheel(
  hwnd: string,
  x: number,
  y: number,
  delta: number,
  horizontal: boolean = false,
): boolean {
  hwnd = resolveInputHwnd(hwnd)
  // WM_MOUSEWHEEL = 0x020A, WM_MOUSEHWHEEL = 0x020E
  const msg = horizontal ? '0x020E' : '0x020A'

  // The helper computes `delta << 16` on a 32-bit int and the receiver reads
  // the high word as a signed short. Anything above 32767 (273 * 120 = 32760
  // is the largest multiple that fits) would overflow and could flip the
  // scroll direction, so the notch count is clamped first.
  const maxNotches = 273
  const notches = Math.max(-maxNotches, Math.min(maxNotches, Math.round(delta)))
  const wheelDelta = notches * 120

  const script = `${WINMSG_TYPE}
# WM_MOUSEWHEEL/WM_MOUSEHWHEEL require screen coords in lParam
# and wheel delta in high word of wParam
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class WheelHelper {
[DllImport("user32.dll")] public static extern bool ClientToScreen(IntPtr hWnd, ref POINT p);
[StructLayout(LayoutKind.Sequential)] public struct POINT { public int X, Y; }

[DllImport("user32.dll", CharSet=CharSet.Unicode, EntryPoint="SendMessageW")]
public static extern IntPtr SendMsg(IntPtr hWnd, uint msg, IntPtr wParam, IntPtr lParam);

public static void Scroll(IntPtr hWnd, int clientX, int clientY, int delta, uint msg) {
POINT pt; pt.X = clientX; pt.Y = clientY;
ClientToScreen(hWnd, ref pt);
IntPtr wParam = (IntPtr)(delta << 16);
IntPtr lParam = (IntPtr)((pt.Y << 16) | (pt.X & 0xFFFF));
SendMsg(hWnd, msg, wParam, lParam);
}
}
'@
[WheelHelper]::Scroll([IntPtr]::new([long]${hwnd}), ${x}, ${y}, ${wheelDelta}, ${msg})
`
  return runPs(script) !== null
}
|
||||||
Reference in New Issue
Block a user